diff options
Diffstat (limited to 'drivers/iommu/intel')
| -rw-r--r-- | drivers/iommu/intel/Kconfig | 103 | ||||
| -rw-r--r-- | drivers/iommu/intel/Makefile | 8 | ||||
| -rw-r--r-- | drivers/iommu/intel/cache.c | 528 | ||||
| -rw-r--r-- | drivers/iommu/intel/debugfs.c | 815 | ||||
| -rw-r--r-- | drivers/iommu/intel/dmar.c | 2403 | ||||
| -rw-r--r-- | drivers/iommu/intel/iommu.c | 4222 | ||||
| -rw-r--r-- | drivers/iommu/intel/iommu.h | 1380 | ||||
| -rw-r--r-- | drivers/iommu/intel/irq_remapping.c | 1605 | ||||
| -rw-r--r-- | drivers/iommu/intel/nested.c | 242 | ||||
| -rw-r--r-- | drivers/iommu/intel/pasid.c | 1152 | ||||
| -rw-r--r-- | drivers/iommu/intel/pasid.h | 326 | ||||
| -rw-r--r-- | drivers/iommu/intel/perf.c | 164 | ||||
| -rw-r--r-- | drivers/iommu/intel/perf.h | 71 | ||||
| -rw-r--r-- | drivers/iommu/intel/perfmon.c | 790 | ||||
| -rw-r--r-- | drivers/iommu/intel/perfmon.h | 64 | ||||
| -rw-r--r-- | drivers/iommu/intel/prq.c | 396 | ||||
| -rw-r--r-- | drivers/iommu/intel/svm.c | 231 | ||||
| -rw-r--r-- | drivers/iommu/intel/trace.c | 14 | ||||
| -rw-r--r-- | drivers/iommu/intel/trace.h | 191 |
19 files changed, 14705 insertions, 0 deletions
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig new file mode 100644 index 000000000000..5471f814e073 --- /dev/null +++ b/drivers/iommu/intel/Kconfig @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Intel IOMMU support +config DMAR_TABLE + bool + +config DMAR_PERF + bool + +config DMAR_DEBUG + bool + +config INTEL_IOMMU + bool "Support for Intel IOMMU using DMA Remapping Devices" + depends on PCI_MSI && ACPI && X86 + select IOMMU_API + select GENERIC_PT + select IOMMU_PT + select IOMMU_PT_X86_64 + select IOMMU_PT_VTDSS + select IOMMU_IOVA + select IOMMU_IOPF + select IOMMUFD_DRIVER if IOMMUFD + select NEED_DMA_MAP_STATE + select DMAR_TABLE + select SWIOTLB + select PCI_ATS + select PCI_PRI + select PCI_PASID + help + DMA remapping (DMAR) devices support enables independent address + translations for Direct Memory Access (DMA) from devices. + These DMA remapping devices are reported via ACPI tables + and include PCI device scope covered by these DMA + remapping devices. + +if INTEL_IOMMU + +config INTEL_IOMMU_DEBUGFS + bool "Export Intel IOMMU internals in Debugfs" + depends on IOMMU_DEBUGFS + select DMAR_PERF + select DMAR_DEBUG + help + !!!WARNING!!! + + DO NOT ENABLE THIS OPTION UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!!! + + Expose Intel IOMMU internals in Debugfs. + + This option is -NOT- intended for production environments, and should + only be enabled for debugging Intel IOMMU. + +config INTEL_IOMMU_SVM + bool "Support for Shared Virtual Memory with Intel IOMMU" + depends on X86_64 + select MMU_NOTIFIER + select IOMMU_SVA + help + Shared Virtual Memory (SVM) provides a facility for devices + to access DMA resources through process address space by + means of a Process Address Space ID (PASID). + +config INTEL_IOMMU_DEFAULT_ON + bool "Enable Intel DMA Remapping Devices by default" + default y + help + Selecting this option will enable a DMAR device at boot time if + one is found. If this option is not selected, DMAR support can + be enabled by passing intel_iommu=on to the kernel. + +config INTEL_IOMMU_FLOPPY_WA + def_bool y + depends on X86 && BLK_DEV_FD + help + Floppy disk drivers are known to bypass DMA API calls + thereby failing to work when IOMMU is enabled. This + workaround will setup a 1:1 mapping for the first + 16MiB to make floppy (an ISA device) work. + +config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON + bool "Enable Intel IOMMU scalable mode by default" + default y + help + Selecting this option will enable by default the scalable mode if + hardware presents the capability. The scalable mode is defined in + VT-d 3.0. The scalable mode capability could be checked by reading + /sys/devices/virtual/iommu/dmar*/intel-iommu/ecap. If this option + is not selected, scalable mode support could also be enabled by + passing intel_iommu=sm_on to the kernel. If not sure, please use + the default value. + +config INTEL_IOMMU_PERF_EVENTS + bool "Intel IOMMU performance events" + default y + depends on INTEL_IOMMU && PERF_EVENTS + help + Selecting this option will enable the performance monitoring + infrastructure in the Intel IOMMU. It collects information about + key events occurring during operation of the remapping hardware, + to aid performance tuning and debug. These are available on modern + processors which support Intel VT-d 4.0 and later. + +endif # INTEL_IOMMU diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile new file mode 100644 index 000000000000..ada651c4a01b --- /dev/null +++ b/drivers/iommu/intel/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-y += iommu.o pasid.o nested.o cache.o prq.o +obj-$(CONFIG_DMAR_TABLE) += dmar.o trace.o +obj-$(CONFIG_DMAR_PERF) += perf.o +obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o +obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o +obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o +obj-$(CONFIG_INTEL_IOMMU_PERF_EVENTS) += perfmon.o diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c new file mode 100644 index 000000000000..265e7290256b --- /dev/null +++ b/drivers/iommu/intel/cache.c @@ -0,0 +1,528 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * cache.c - Intel VT-d cache invalidation + * + * Copyright (C) 2024 Intel Corporation + * + * Author: Lu Baolu <baolu.lu@linux.intel.com> + */ + +#define pr_fmt(fmt) "DMAR: " fmt + +#include <linux/dmar.h> +#include <linux/iommu.h> +#include <linux/memory.h> +#include <linux/pci.h> +#include <linux/spinlock.h> + +#include "iommu.h" +#include "pasid.h" +#include "trace.h" + +/* Check if an existing cache tag can be reused for a new association. */ +static bool cache_tage_match(struct cache_tag *tag, u16 domain_id, + struct intel_iommu *iommu, struct device *dev, + ioasid_t pasid, enum cache_tag_type type) +{ + if (tag->type != type) + return false; + + if (tag->domain_id != domain_id || tag->pasid != pasid) + return false; + + if (type == CACHE_TAG_IOTLB || type == CACHE_TAG_NESTING_IOTLB) + return tag->iommu == iommu; + + if (type == CACHE_TAG_DEVTLB || type == CACHE_TAG_NESTING_DEVTLB) + return tag->dev == dev; + + return false; +} + +/* Assign a cache tag with specified type to domain. */ +int cache_tag_assign(struct dmar_domain *domain, u16 did, struct device *dev, + ioasid_t pasid, enum cache_tag_type type) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct cache_tag *tag, *temp; + struct list_head *prev; + unsigned long flags; + + tag = kzalloc(sizeof(*tag), GFP_KERNEL); + if (!tag) + return -ENOMEM; + + tag->type = type; + tag->iommu = iommu; + tag->domain_id = did; + tag->pasid = pasid; + tag->users = 1; + + if (type == CACHE_TAG_DEVTLB || type == CACHE_TAG_NESTING_DEVTLB) + tag->dev = dev; + else + tag->dev = iommu->iommu.dev; + + spin_lock_irqsave(&domain->cache_lock, flags); + prev = &domain->cache_tags; + list_for_each_entry(temp, &domain->cache_tags, node) { + if (cache_tage_match(temp, did, iommu, dev, pasid, type)) { + temp->users++; + spin_unlock_irqrestore(&domain->cache_lock, flags); + kfree(tag); + trace_cache_tag_assign(temp); + return 0; + } + if (temp->iommu == iommu) + prev = &temp->node; + } + /* + * Link cache tags of same iommu unit together, so corresponding + * flush ops can be batched for iommu unit. + */ + list_add(&tag->node, prev); + + spin_unlock_irqrestore(&domain->cache_lock, flags); + trace_cache_tag_assign(tag); + + return 0; +} + +/* Unassign a cache tag with specified type from domain. */ +static void cache_tag_unassign(struct dmar_domain *domain, u16 did, + struct device *dev, ioasid_t pasid, + enum cache_tag_type type) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct cache_tag *tag; + unsigned long flags; + + spin_lock_irqsave(&domain->cache_lock, flags); + list_for_each_entry(tag, &domain->cache_tags, node) { + if (cache_tage_match(tag, did, iommu, dev, pasid, type)) { + trace_cache_tag_unassign(tag); + if (--tag->users == 0) { + list_del(&tag->node); + kfree(tag); + } + break; + } + } + spin_unlock_irqrestore(&domain->cache_lock, flags); +} + +/* domain->qi_batch will be freed in iommu_free_domain() path. */ +static int domain_qi_batch_alloc(struct dmar_domain *domain) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&domain->cache_lock, flags); + if (domain->qi_batch) + goto out_unlock; + + domain->qi_batch = kzalloc(sizeof(*domain->qi_batch), GFP_ATOMIC); + if (!domain->qi_batch) + ret = -ENOMEM; +out_unlock: + spin_unlock_irqrestore(&domain->cache_lock, flags); + + return ret; +} + +static int __cache_tag_assign_domain(struct dmar_domain *domain, u16 did, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + int ret; + + ret = domain_qi_batch_alloc(domain); + if (ret) + return ret; + + ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_IOTLB); + if (ret || !info->ats_enabled) + return ret; + + ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_DEVTLB); + if (ret) + cache_tag_unassign(domain, did, dev, pasid, CACHE_TAG_IOTLB); + + return ret; +} + +static void __cache_tag_unassign_domain(struct dmar_domain *domain, u16 did, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + cache_tag_unassign(domain, did, dev, pasid, CACHE_TAG_IOTLB); + + if (info->ats_enabled) + cache_tag_unassign(domain, did, dev, pasid, CACHE_TAG_DEVTLB); +} + +static int __cache_tag_assign_parent_domain(struct dmar_domain *domain, u16 did, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + int ret; + + ret = domain_qi_batch_alloc(domain); + if (ret) + return ret; + + ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_NESTING_IOTLB); + if (ret || !info->ats_enabled) + return ret; + + ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_NESTING_DEVTLB); + if (ret) + cache_tag_unassign(domain, did, dev, pasid, CACHE_TAG_NESTING_IOTLB); + + return ret; +} + +static void __cache_tag_unassign_parent_domain(struct dmar_domain *domain, u16 did, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + cache_tag_unassign(domain, did, dev, pasid, CACHE_TAG_NESTING_IOTLB); + + if (info->ats_enabled) + cache_tag_unassign(domain, did, dev, pasid, CACHE_TAG_NESTING_DEVTLB); +} + +static u16 domain_get_id_for_dev(struct dmar_domain *domain, struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + + /* + * The driver assigns different domain IDs for all domains except + * the SVA type. + */ + if (domain->domain.type == IOMMU_DOMAIN_SVA) + return FLPT_DEFAULT_DID; + + return domain_id_iommu(domain, iommu); +} + +/* + * Assign cache tags to a domain when it's associated with a device's + * PASID using a specific domain ID. + * + * On success (return value of 0), cache tags are created and added to the + * domain's cache tag list. On failure (negative return value), an error + * code is returned indicating the reason for the failure. + */ +int cache_tag_assign_domain(struct dmar_domain *domain, + struct device *dev, ioasid_t pasid) +{ + u16 did = domain_get_id_for_dev(domain, dev); + int ret; + + ret = __cache_tag_assign_domain(domain, did, dev, pasid); + if (ret || domain->domain.type != IOMMU_DOMAIN_NESTED) + return ret; + + ret = __cache_tag_assign_parent_domain(domain->s2_domain, did, dev, pasid); + if (ret) + __cache_tag_unassign_domain(domain, did, dev, pasid); + + return ret; +} + +/* + * Remove the cache tags associated with a device's PASID when the domain is + * detached from the device. + * + * The cache tags must be previously assigned to the domain by calling the + * assign interface. + */ +void cache_tag_unassign_domain(struct dmar_domain *domain, + struct device *dev, ioasid_t pasid) +{ + u16 did = domain_get_id_for_dev(domain, dev); + + __cache_tag_unassign_domain(domain, did, dev, pasid); + if (domain->domain.type == IOMMU_DOMAIN_NESTED) + __cache_tag_unassign_parent_domain(domain->s2_domain, did, dev, pasid); +} + +static unsigned long calculate_psi_aligned_address(unsigned long start, + unsigned long end, + unsigned long *_pages, + unsigned long *_mask) +{ + unsigned long pages = aligned_nrpages(start, end - start + 1); + unsigned long aligned_pages = __roundup_pow_of_two(pages); + unsigned long bitmask = aligned_pages - 1; + unsigned long mask = ilog2(aligned_pages); + unsigned long pfn = IOVA_PFN(start); + + /* + * PSI masks the low order bits of the base address. If the + * address isn't aligned to the mask, then compute a mask value + * needed to ensure the target range is flushed. + */ + if (unlikely(bitmask & pfn)) { + unsigned long end_pfn = pfn + pages - 1, shared_bits; + + /* + * Since end_pfn <= pfn + bitmask, the only way bits + * higher than bitmask can differ in pfn and end_pfn is + * by carrying. This means after masking out bitmask, + * high bits starting with the first set bit in + * shared_bits are all equal in both pfn and end_pfn. + */ + shared_bits = ~(pfn ^ end_pfn) & ~bitmask; + mask = shared_bits ? __ffs(shared_bits) : MAX_AGAW_PFN_WIDTH; + aligned_pages = 1UL << mask; + } + + *_pages = aligned_pages; + *_mask = mask; + + return ALIGN_DOWN(start, VTD_PAGE_SIZE << mask); +} + +static void qi_batch_flush_descs(struct intel_iommu *iommu, struct qi_batch *batch) +{ + if (!iommu || !batch->index) + return; + + qi_submit_sync(iommu, batch->descs, batch->index, 0); + + /* Reset the index value and clean the whole batch buffer. */ + memset(batch, 0, sizeof(*batch)); +} + +static void qi_batch_increment_index(struct intel_iommu *iommu, struct qi_batch *batch) +{ + if (++batch->index == QI_MAX_BATCHED_DESC_COUNT) + qi_batch_flush_descs(iommu, batch); +} + +static void qi_batch_add_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type, + struct qi_batch *batch) +{ + qi_desc_iotlb(iommu, did, addr, size_order, type, &batch->descs[batch->index]); + qi_batch_increment_index(iommu, batch); +} + +static void qi_batch_add_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u16 qdep, u64 addr, unsigned int mask, + struct qi_batch *batch) +{ + /* + * According to VT-d spec, software is recommended to not submit any Device-TLB + * invalidation requests while address remapping hardware is disabled. + */ + if (!(iommu->gcmd & DMA_GCMD_TE)) + return; + + qi_desc_dev_iotlb(sid, pfsid, qdep, addr, mask, &batch->descs[batch->index]); + qi_batch_increment_index(iommu, batch); +} + +static void qi_batch_add_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, + u64 addr, unsigned long npages, bool ih, + struct qi_batch *batch) +{ + /* + * npages == -1 means a PASID-selective invalidation, otherwise, + * a positive value for Page-selective-within-PASID invalidation. + * 0 is not a valid input. + */ + if (!npages) + return; + + qi_desc_piotlb(did, pasid, addr, npages, ih, &batch->descs[batch->index]); + qi_batch_increment_index(iommu, batch); +} + +static void qi_batch_add_pasid_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u32 pasid, u16 qdep, u64 addr, + unsigned int size_order, struct qi_batch *batch) +{ + /* + * According to VT-d spec, software is recommended to not submit any + * Device-TLB invalidation requests while address remapping hardware + * is disabled. + */ + if (!(iommu->gcmd & DMA_GCMD_TE)) + return; + + qi_desc_dev_iotlb_pasid(sid, pfsid, pasid, qdep, addr, size_order, + &batch->descs[batch->index]); + qi_batch_increment_index(iommu, batch); +} + +static void cache_tag_flush_iotlb(struct dmar_domain *domain, struct cache_tag *tag, + unsigned long addr, unsigned long pages, + unsigned long mask, int ih) +{ + struct intel_iommu *iommu = tag->iommu; + u64 type = DMA_TLB_PSI_FLUSH; + + if (intel_domain_is_fs_paging(domain)) { + qi_batch_add_piotlb(iommu, tag->domain_id, tag->pasid, addr, + pages, ih, domain->qi_batch); + return; + } + + /* + * Fallback to domain selective flush if no PSI support or the size + * is too big. + */ + if (!cap_pgsel_inv(iommu->cap) || + mask > cap_max_amask_val(iommu->cap) || pages == -1) { + addr = 0; + mask = 0; + ih = 0; + type = DMA_TLB_DSI_FLUSH; + } + + if (ecap_qis(iommu->ecap)) + qi_batch_add_iotlb(iommu, tag->domain_id, addr | ih, mask, type, + domain->qi_batch); + else + __iommu_flush_iotlb(iommu, tag->domain_id, addr | ih, mask, type); +} + +static void cache_tag_flush_devtlb_psi(struct dmar_domain *domain, struct cache_tag *tag, + unsigned long addr, unsigned long mask) +{ + struct intel_iommu *iommu = tag->iommu; + struct device_domain_info *info; + u16 sid; + + info = dev_iommu_priv_get(tag->dev); + sid = PCI_DEVID(info->bus, info->devfn); + + if (tag->pasid == IOMMU_NO_PASID) { + qi_batch_add_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, + addr, mask, domain->qi_batch); + if (info->dtlb_extra_inval) + qi_batch_add_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, + addr, mask, domain->qi_batch); + return; + } + + qi_batch_add_pasid_dev_iotlb(iommu, sid, info->pfsid, tag->pasid, + info->ats_qdep, addr, mask, domain->qi_batch); + if (info->dtlb_extra_inval) + qi_batch_add_pasid_dev_iotlb(iommu, sid, info->pfsid, tag->pasid, + info->ats_qdep, addr, mask, + domain->qi_batch); +} + +/* + * Invalidates a range of IOVA from @start (inclusive) to @end (inclusive) + * when the memory mappings in the target domain have been modified. + */ +void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, + unsigned long end, int ih) +{ + struct intel_iommu *iommu = NULL; + unsigned long pages, mask, addr; + struct cache_tag *tag; + unsigned long flags; + + if (start == 0 && end == ULONG_MAX) { + addr = 0; + pages = -1; + mask = MAX_AGAW_PFN_WIDTH; + } else { + addr = calculate_psi_aligned_address(start, end, &pages, &mask); + } + + spin_lock_irqsave(&domain->cache_lock, flags); + list_for_each_entry(tag, &domain->cache_tags, node) { + if (iommu && iommu != tag->iommu) + qi_batch_flush_descs(iommu, domain->qi_batch); + iommu = tag->iommu; + + switch (tag->type) { + case CACHE_TAG_IOTLB: + case CACHE_TAG_NESTING_IOTLB: + cache_tag_flush_iotlb(domain, tag, addr, pages, mask, ih); + break; + case CACHE_TAG_NESTING_DEVTLB: + /* + * Address translation cache in device side caches the + * result of nested translation. There is no easy way + * to identify the exact set of nested translations + * affected by a change in S2. So just flush the entire + * device cache. + */ + addr = 0; + mask = MAX_AGAW_PFN_WIDTH; + fallthrough; + case CACHE_TAG_DEVTLB: + cache_tag_flush_devtlb_psi(domain, tag, addr, mask); + break; + } + + trace_cache_tag_flush_range(tag, start, end, addr, pages, mask); + } + qi_batch_flush_descs(iommu, domain->qi_batch); + spin_unlock_irqrestore(&domain->cache_lock, flags); +} + +/* + * Invalidates all ranges of IOVA when the memory mappings in the target + * domain have been modified. + */ +void cache_tag_flush_all(struct dmar_domain *domain) +{ + cache_tag_flush_range(domain, 0, ULONG_MAX, 0); +} + +/* + * Invalidate a range of IOVA when new mappings are created in the target + * domain. + * + * - VT-d spec, Section 6.1 Caching Mode: When the CM field is reported as + * Set, any software updates to remapping structures other than first- + * stage mapping requires explicit invalidation of the caches. + * - VT-d spec, Section 6.8 Write Buffer Flushing: For hardware that requires + * write buffer flushing, software must explicitly perform write-buffer + * flushing, if cache invalidation is not required. + */ +void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, + unsigned long end) +{ + struct intel_iommu *iommu = NULL; + unsigned long pages, mask, addr; + struct cache_tag *tag; + unsigned long flags; + + addr = calculate_psi_aligned_address(start, end, &pages, &mask); + + spin_lock_irqsave(&domain->cache_lock, flags); + list_for_each_entry(tag, &domain->cache_tags, node) { + if (iommu && iommu != tag->iommu) + qi_batch_flush_descs(iommu, domain->qi_batch); + iommu = tag->iommu; + + if (!cap_caching_mode(iommu->cap) || + intel_domain_is_fs_paging(domain)) { + iommu_flush_write_buffer(iommu); + continue; + } + + if (tag->type == CACHE_TAG_IOTLB || + tag->type == CACHE_TAG_NESTING_IOTLB) + cache_tag_flush_iotlb(domain, tag, addr, pages, mask, 0); + + trace_cache_tag_flush_range_np(tag, start, end, addr, pages, mask); + } + qi_batch_flush_descs(iommu, domain->qi_batch); + spin_unlock_irqrestore(&domain->cache_lock, flags); +} diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c new file mode 100644 index 000000000000..617fd81a80f0 --- /dev/null +++ b/drivers/iommu/intel/debugfs.c @@ -0,0 +1,815 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright © 2018 Intel Corporation. + * + * Authors: Gayatri Kammela <gayatri.kammela@intel.com> + * Sohil Mehta <sohil.mehta@intel.com> + * Jacob Pan <jacob.jun.pan@linux.intel.com> + * Lu Baolu <baolu.lu@linux.intel.com> + */ + +#include <linux/debugfs.h> +#include <linux/dmar.h> +#include <linux/pci.h> + +#include <asm/irq_remapping.h> + +#include "iommu.h" +#include "pasid.h" +#include "perf.h" + +struct tbl_walk { + u16 bus; + u16 devfn; + u32 pasid; + struct root_entry *rt_entry; + struct context_entry *ctx_entry; + struct pasid_entry *pasid_tbl_entry; +}; + +struct iommu_regset { + int offset; + const char *regs; +}; + +#define DEBUG_BUFFER_SIZE 1024 +static char debug_buf[DEBUG_BUFFER_SIZE]; + +#define IOMMU_REGSET_ENTRY(_reg_) \ + { DMAR_##_reg_##_REG, __stringify(_reg_) } + +static const struct iommu_regset iommu_regs_32[] = { + IOMMU_REGSET_ENTRY(VER), + IOMMU_REGSET_ENTRY(GCMD), + IOMMU_REGSET_ENTRY(GSTS), + IOMMU_REGSET_ENTRY(FSTS), + IOMMU_REGSET_ENTRY(FECTL), + IOMMU_REGSET_ENTRY(FEDATA), + IOMMU_REGSET_ENTRY(FEADDR), + IOMMU_REGSET_ENTRY(FEUADDR), + IOMMU_REGSET_ENTRY(PMEN), + IOMMU_REGSET_ENTRY(PLMBASE), + IOMMU_REGSET_ENTRY(PLMLIMIT), + IOMMU_REGSET_ENTRY(ICS), + IOMMU_REGSET_ENTRY(PRS), + IOMMU_REGSET_ENTRY(PECTL), + IOMMU_REGSET_ENTRY(PEDATA), + IOMMU_REGSET_ENTRY(PEADDR), + IOMMU_REGSET_ENTRY(PEUADDR), +}; + +static const struct iommu_regset iommu_regs_64[] = { + IOMMU_REGSET_ENTRY(CAP), + IOMMU_REGSET_ENTRY(ECAP), + IOMMU_REGSET_ENTRY(RTADDR), + IOMMU_REGSET_ENTRY(PHMBASE), + IOMMU_REGSET_ENTRY(PHMLIMIT), + IOMMU_REGSET_ENTRY(IQH), + IOMMU_REGSET_ENTRY(IQT), + IOMMU_REGSET_ENTRY(IQA), + IOMMU_REGSET_ENTRY(IRTA), + IOMMU_REGSET_ENTRY(PQH), + IOMMU_REGSET_ENTRY(PQT), + IOMMU_REGSET_ENTRY(PQA), + IOMMU_REGSET_ENTRY(MTRRCAP), + IOMMU_REGSET_ENTRY(MTRRDEF), + IOMMU_REGSET_ENTRY(MTRR_FIX64K_00000), + IOMMU_REGSET_ENTRY(MTRR_FIX16K_80000), + IOMMU_REGSET_ENTRY(MTRR_FIX16K_A0000), + IOMMU_REGSET_ENTRY(MTRR_FIX4K_C0000), + IOMMU_REGSET_ENTRY(MTRR_FIX4K_C8000), + IOMMU_REGSET_ENTRY(MTRR_FIX4K_D0000), + IOMMU_REGSET_ENTRY(MTRR_FIX4K_D8000), + IOMMU_REGSET_ENTRY(MTRR_FIX4K_E0000), + IOMMU_REGSET_ENTRY(MTRR_FIX4K_E8000), + IOMMU_REGSET_ENTRY(MTRR_FIX4K_F0000), + IOMMU_REGSET_ENTRY(MTRR_FIX4K_F8000), + IOMMU_REGSET_ENTRY(MTRR_PHYSBASE0), + IOMMU_REGSET_ENTRY(MTRR_PHYSMASK0), + IOMMU_REGSET_ENTRY(MTRR_PHYSBASE1), + IOMMU_REGSET_ENTRY(MTRR_PHYSMASK1), + IOMMU_REGSET_ENTRY(MTRR_PHYSBASE2), + IOMMU_REGSET_ENTRY(MTRR_PHYSMASK2), + IOMMU_REGSET_ENTRY(MTRR_PHYSBASE3), + IOMMU_REGSET_ENTRY(MTRR_PHYSMASK3), + IOMMU_REGSET_ENTRY(MTRR_PHYSBASE4), + IOMMU_REGSET_ENTRY(MTRR_PHYSMASK4), + IOMMU_REGSET_ENTRY(MTRR_PHYSBASE5), + IOMMU_REGSET_ENTRY(MTRR_PHYSMASK5), + IOMMU_REGSET_ENTRY(MTRR_PHYSBASE6), + IOMMU_REGSET_ENTRY(MTRR_PHYSMASK6), + IOMMU_REGSET_ENTRY(MTRR_PHYSBASE7), + IOMMU_REGSET_ENTRY(MTRR_PHYSMASK7), + IOMMU_REGSET_ENTRY(MTRR_PHYSBASE8), + IOMMU_REGSET_ENTRY(MTRR_PHYSMASK8), + IOMMU_REGSET_ENTRY(MTRR_PHYSBASE9), + IOMMU_REGSET_ENTRY(MTRR_PHYSMASK9), +}; + +static struct dentry *intel_iommu_debug; + +static int iommu_regset_show(struct seq_file *m, void *unused) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + unsigned long flag; + int i, ret = 0; + u64 value; + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + if (!drhd->reg_base_addr) { + seq_puts(m, "IOMMU: Invalid base address\n"); + ret = -EINVAL; + goto out; + } + + seq_printf(m, "IOMMU: %s Register Base Address: %llx\n", + iommu->name, drhd->reg_base_addr); + seq_puts(m, "Name\t\t\tOffset\t\tContents\n"); + /* + * Publish the contents of the 64-bit hardware registers + * by adding the offset to the pointer (virtual address). + */ + raw_spin_lock_irqsave(&iommu->register_lock, flag); + for (i = 0 ; i < ARRAY_SIZE(iommu_regs_32); i++) { + value = dmar_readl(iommu->reg + iommu_regs_32[i].offset); + seq_printf(m, "%-16s\t0x%02x\t\t0x%016llx\n", + iommu_regs_32[i].regs, iommu_regs_32[i].offset, + value); + } + for (i = 0 ; i < ARRAY_SIZE(iommu_regs_64); i++) { + value = dmar_readq(iommu->reg + iommu_regs_64[i].offset); + seq_printf(m, "%-16s\t0x%02x\t\t0x%016llx\n", + iommu_regs_64[i].regs, iommu_regs_64[i].offset, + value); + } + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + seq_putc(m, '\n'); + } +out: + rcu_read_unlock(); + + return ret; +} +DEFINE_SHOW_ATTRIBUTE(iommu_regset); + +static inline void print_tbl_walk(struct seq_file *m) +{ + struct tbl_walk *tbl_wlk = m->private; + + seq_printf(m, "%02x:%02x.%x\t0x%016llx:0x%016llx\t0x%016llx:0x%016llx\t", + tbl_wlk->bus, PCI_SLOT(tbl_wlk->devfn), + PCI_FUNC(tbl_wlk->devfn), tbl_wlk->rt_entry->hi, + tbl_wlk->rt_entry->lo, tbl_wlk->ctx_entry->hi, + tbl_wlk->ctx_entry->lo); + + /* + * A legacy mode DMAR doesn't support PASID, hence default it to -1 + * indicating that it's invalid. Also, default all PASID related fields + * to 0. + */ + if (!tbl_wlk->pasid_tbl_entry) + seq_printf(m, "%-6d\t0x%016llx:0x%016llx:0x%016llx\n", -1, + (u64)0, (u64)0, (u64)0); + else + seq_printf(m, "%-6d\t0x%016llx:0x%016llx:0x%016llx\n", + tbl_wlk->pasid, tbl_wlk->pasid_tbl_entry->val[2], + tbl_wlk->pasid_tbl_entry->val[1], + tbl_wlk->pasid_tbl_entry->val[0]); +} + +static void pasid_tbl_walk(struct seq_file *m, struct pasid_entry *tbl_entry, + u16 dir_idx) +{ + struct tbl_walk *tbl_wlk = m->private; + u8 tbl_idx; + + for (tbl_idx = 0; tbl_idx < PASID_TBL_ENTRIES; tbl_idx++) { + if (pasid_pte_is_present(tbl_entry)) { + tbl_wlk->pasid_tbl_entry = tbl_entry; + tbl_wlk->pasid = (dir_idx << PASID_PDE_SHIFT) + tbl_idx; + print_tbl_walk(m); + } + + tbl_entry++; + } +} + +static void pasid_dir_walk(struct seq_file *m, u64 pasid_dir_ptr, + u16 pasid_dir_size) +{ + struct pasid_dir_entry *dir_entry = phys_to_virt(pasid_dir_ptr); + struct pasid_entry *pasid_tbl; + u16 dir_idx; + + for (dir_idx = 0; dir_idx < pasid_dir_size; dir_idx++) { + pasid_tbl = get_pasid_table_from_pde(dir_entry); + if (pasid_tbl) + pasid_tbl_walk(m, pasid_tbl, dir_idx); + + dir_entry++; + } +} + +static void ctx_tbl_walk(struct seq_file *m, struct intel_iommu *iommu, u16 bus) +{ + struct context_entry *context; + u16 devfn, pasid_dir_size; + u64 pasid_dir_ptr; + + for (devfn = 0; devfn < 256; devfn++) { + struct tbl_walk tbl_wlk = {0}; + + /* + * Scalable mode root entry points to upper scalable mode + * context table and lower scalable mode context table. Each + * scalable mode context table has 128 context entries where as + * legacy mode context table has 256 context entries. So in + * scalable mode, the context entries for former 128 devices are + * in the lower scalable mode context table, while the latter + * 128 devices are in the upper scalable mode context table. + * In scalable mode, when devfn > 127, iommu_context_addr() + * automatically refers to upper scalable mode context table and + * hence the caller doesn't have to worry about differences + * between scalable mode and non scalable mode. + */ + context = iommu_context_addr(iommu, bus, devfn, 0); + if (!context) + return; + + if (!context_present(context)) + continue; + + tbl_wlk.bus = bus; + tbl_wlk.devfn = devfn; + tbl_wlk.rt_entry = &iommu->root_entry[bus]; + tbl_wlk.ctx_entry = context; + m->private = &tbl_wlk; + + if (dmar_readq(iommu->reg + DMAR_RTADDR_REG) & DMA_RTADDR_SMT) { + pasid_dir_ptr = context->lo & VTD_PAGE_MASK; + pasid_dir_size = get_pasid_dir_size(context); + pasid_dir_walk(m, pasid_dir_ptr, pasid_dir_size); + continue; + } + + print_tbl_walk(m); + } +} + +static void root_tbl_walk(struct seq_file *m, struct intel_iommu *iommu) +{ + u16 bus; + + spin_lock(&iommu->lock); + seq_printf(m, "IOMMU %s: Root Table Address: 0x%llx\n", iommu->name, + (u64)virt_to_phys(iommu->root_entry)); + seq_puts(m, "B.D.F\tRoot_entry\t\t\t\tContext_entry\t\t\t\tPASID\tPASID_table_entry\n"); + + /* + * No need to check if the root entry is present or not because + * iommu_context_addr() performs the same check before returning + * context entry. + */ + for (bus = 0; bus < 256; bus++) + ctx_tbl_walk(m, iommu, bus); + spin_unlock(&iommu->lock); +} + +static int dmar_translation_struct_show(struct seq_file *m, void *unused) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + u32 sts; + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + sts = dmar_readl(iommu->reg + DMAR_GSTS_REG); + if (!(sts & DMA_GSTS_TES)) { + seq_printf(m, "DMA Remapping is not enabled on %s\n", + iommu->name); + continue; + } + root_tbl_walk(m, iommu); + seq_putc(m, '\n'); + } + rcu_read_unlock(); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(dmar_translation_struct); + +static inline unsigned long level_to_directory_size(int level) +{ + return BIT_ULL(VTD_PAGE_SHIFT + VTD_STRIDE_SHIFT * (level - 1)); +} + +static inline void +dump_page_info(struct seq_file *m, unsigned long iova, u64 *path) +{ + seq_printf(m, "0x%013lx |\t0x%016llx\t0x%016llx\t0x%016llx", + iova >> VTD_PAGE_SHIFT, path[5], path[4], path[3]); + if (path[2]) { + seq_printf(m, "\t0x%016llx", path[2]); + if (path[1]) + seq_printf(m, "\t0x%016llx", path[1]); + } + seq_putc(m, '\n'); +} + +static void pgtable_walk_level(struct seq_file *m, struct dma_pte *pde, + int level, unsigned long start, + u64 *path) +{ + int i; + + if (level > 5 || level < 1) + return; + + for (i = 0; i < BIT_ULL(VTD_STRIDE_SHIFT); + i++, pde++, start += level_to_directory_size(level)) { + if (!dma_pte_present(pde)) + continue; + + path[level] = pde->val; + if (dma_pte_superpage(pde) || level == 1) + dump_page_info(m, start, path); + else + pgtable_walk_level(m, phys_to_virt(dma_pte_addr(pde)), + level - 1, start, path); + path[level] = 0; + } +} + +static int domain_translation_struct_show(struct seq_file *m, + struct device_domain_info *info, + ioasid_t pasid) +{ + bool scalable, found = false; + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + u16 devfn, bus, seg; + + bus = info->bus; + devfn = info->devfn; + seg = info->segment; + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + struct context_entry *context; + u64 pgd, path[6] = { 0 }; + u32 sts, agaw; + + if (seg != iommu->segment) + continue; + + sts = dmar_readl(iommu->reg + DMAR_GSTS_REG); + if (!(sts & DMA_GSTS_TES)) { + seq_printf(m, "DMA Remapping is not enabled on %s\n", + iommu->name); + continue; + } + if (dmar_readq(iommu->reg + DMAR_RTADDR_REG) & DMA_RTADDR_SMT) + scalable = true; + else + scalable = false; + + /* + * The iommu->lock is held across the callback, which will + * block calls to domain_attach/domain_detach. Hence, + * the domain of the device will not change during traversal. + * + * Traversing page table possibly races with the iommu_unmap() + * interface. This could be solved by RCU-freeing the page + * table pages in the iommu_unmap() path. + */ + spin_lock(&iommu->lock); + + context = iommu_context_addr(iommu, bus, devfn, 0); + if (!context || !context_present(context)) + goto iommu_unlock; + + if (scalable) { /* scalable mode */ + struct pasid_entry *pasid_tbl, *pasid_tbl_entry; + struct pasid_dir_entry *dir_tbl, *dir_entry; + u16 dir_idx, tbl_idx, pgtt; + u64 pasid_dir_ptr; + + pasid_dir_ptr = context->lo & VTD_PAGE_MASK; + + /* Dump specified device domain mappings with PASID. */ + dir_idx = pasid >> PASID_PDE_SHIFT; + tbl_idx = pasid & PASID_PTE_MASK; + + dir_tbl = phys_to_virt(pasid_dir_ptr); + dir_entry = &dir_tbl[dir_idx]; + + pasid_tbl = get_pasid_table_from_pde(dir_entry); + if (!pasid_tbl) + goto iommu_unlock; + + pasid_tbl_entry = &pasid_tbl[tbl_idx]; + if (!pasid_pte_is_present(pasid_tbl_entry)) + goto iommu_unlock; + + /* + * According to PASID Granular Translation Type(PGTT), + * get the page table pointer. + */ + pgtt = (u16)(pasid_tbl_entry->val[0] & GENMASK_ULL(8, 6)) >> 6; + agaw = (u8)(pasid_tbl_entry->val[0] & GENMASK_ULL(4, 2)) >> 2; + + switch (pgtt) { + case PASID_ENTRY_PGTT_FL_ONLY: + pgd = pasid_tbl_entry->val[2]; + break; + case PASID_ENTRY_PGTT_SL_ONLY: + case PASID_ENTRY_PGTT_NESTED: + pgd = pasid_tbl_entry->val[0]; + break; + default: + goto iommu_unlock; + } + pgd &= VTD_PAGE_MASK; + } else { /* legacy mode */ + u8 tt = (u8)(context->lo & GENMASK_ULL(3, 2)) >> 2; + + /* + * According to Translation Type(TT), + * get the page table pointer(SSPTPTR). + */ + switch (tt) { + case CONTEXT_TT_MULTI_LEVEL: + case CONTEXT_TT_DEV_IOTLB: + pgd = context->lo & VTD_PAGE_MASK; + agaw = context->hi & 7; + break; + default: + goto iommu_unlock; + } + } + + seq_printf(m, "Device %04x:%02x:%02x.%x ", + iommu->segment, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + + if (scalable) + seq_printf(m, "with pasid %x @0x%llx\n", pasid, pgd); + else + seq_printf(m, "@0x%llx\n", pgd); + + seq_printf(m, "%-17s\t%-18s\t%-18s\t%-18s\t%-18s\t%-s\n", + "IOVA_PFN", "PML5E", "PML4E", "PDPE", "PDE", "PTE"); + pgtable_walk_level(m, phys_to_virt(pgd), agaw + 2, 0, path); + + found = true; +iommu_unlock: + spin_unlock(&iommu->lock); + if (found) + break; + } + rcu_read_unlock(); + + return 0; +} + +static int dev_domain_translation_struct_show(struct seq_file *m, void *unused) +{ + struct device_domain_info *info = (struct device_domain_info *)m->private; + + return domain_translation_struct_show(m, info, IOMMU_NO_PASID); +} +DEFINE_SHOW_ATTRIBUTE(dev_domain_translation_struct); + +static int pasid_domain_translation_struct_show(struct seq_file *m, void *unused) +{ + struct dev_pasid_info *dev_pasid = (struct dev_pasid_info *)m->private; + struct device_domain_info *info = dev_iommu_priv_get(dev_pasid->dev); + + return domain_translation_struct_show(m, info, dev_pasid->pasid); +} +DEFINE_SHOW_ATTRIBUTE(pasid_domain_translation_struct); + +static void invalidation_queue_entry_show(struct seq_file *m, + struct intel_iommu *iommu) +{ + int index, shift = qi_shift(iommu); + struct qi_desc *desc; + int offset; + + if (ecap_smts(iommu->ecap)) + seq_puts(m, "Index\t\tqw0\t\t\tqw1\t\t\tqw2\t\t\tqw3\t\t\tstatus\n"); + else + seq_puts(m, "Index\t\tqw0\t\t\tqw1\t\t\tstatus\n"); + + for (index = 0; index < QI_LENGTH; index++) { + offset = index << shift; + desc = iommu->qi->desc + offset; + if (ecap_smts(iommu->ecap)) + seq_printf(m, "%5d\t%016llx\t%016llx\t%016llx\t%016llx\t%016x\n", + index, desc->qw0, desc->qw1, + desc->qw2, desc->qw3, + iommu->qi->desc_status[index]); + else + seq_printf(m, "%5d\t%016llx\t%016llx\t%016x\n", + index, desc->qw0, desc->qw1, + iommu->qi->desc_status[index]); + } +} + +static int invalidation_queue_show(struct seq_file *m, void *unused) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + unsigned long flags; + struct q_inval *qi; + int shift; + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + qi = iommu->qi; + shift = qi_shift(iommu); + + if (!qi || !ecap_qis(iommu->ecap)) + continue; + + seq_printf(m, "Invalidation queue on IOMMU: %s\n", iommu->name); + + raw_spin_lock_irqsave(&qi->q_lock, flags); + seq_printf(m, " Base: 0x%llx\tHead: %lld\tTail: %lld\n", + (u64)virt_to_phys(qi->desc), + dmar_readq(iommu->reg + DMAR_IQH_REG) >> shift, + dmar_readq(iommu->reg + DMAR_IQT_REG) >> shift); + invalidation_queue_entry_show(m, iommu); + raw_spin_unlock_irqrestore(&qi->q_lock, flags); + seq_putc(m, '\n'); + } + rcu_read_unlock(); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(invalidation_queue); + +#ifdef CONFIG_IRQ_REMAP +static void ir_tbl_remap_entry_show(struct seq_file *m, + struct intel_iommu *iommu) +{ + struct irte *ri_entry; + unsigned long flags; + int idx; + + seq_puts(m, " Entry SrcID DstID Vct IRTE_high\t\tIRTE_low\n"); + + raw_spin_lock_irqsave(&irq_2_ir_lock, flags); + for (idx = 0; idx < INTR_REMAP_TABLE_ENTRIES; idx++) { + ri_entry = &iommu->ir_table->base[idx]; + if (!ri_entry->present || ri_entry->p_pst) + continue; + + seq_printf(m, " %-5d %02x:%02x.%01x %08x %02x %016llx\t%016llx\n", + idx, PCI_BUS_NUM(ri_entry->sid), + PCI_SLOT(ri_entry->sid), PCI_FUNC(ri_entry->sid), + ri_entry->dest_id, ri_entry->vector, + ri_entry->high, ri_entry->low); + } + raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); +} + +static void ir_tbl_posted_entry_show(struct seq_file *m, + struct intel_iommu *iommu) +{ + struct irte *pi_entry; + unsigned long flags; + int idx; + + seq_puts(m, " Entry SrcID PDA_high PDA_low Vct IRTE_high\t\tIRTE_low\n"); + + raw_spin_lock_irqsave(&irq_2_ir_lock, flags); + for (idx = 0; idx < INTR_REMAP_TABLE_ENTRIES; idx++) { + pi_entry = &iommu->ir_table->base[idx]; + if (!pi_entry->present || !pi_entry->p_pst) + continue; + + seq_printf(m, " %-5d %02x:%02x.%01x %08x %08x %02x %016llx\t%016llx\n", + idx, PCI_BUS_NUM(pi_entry->sid), + PCI_SLOT(pi_entry->sid), PCI_FUNC(pi_entry->sid), + pi_entry->pda_h, pi_entry->pda_l << 6, + pi_entry->vector, pi_entry->high, + pi_entry->low); + } + raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); +} + +/* + * For active IOMMUs go through the Interrupt remapping + * table and print valid entries in a table format for + * Remapped and Posted Interrupts. + */ +static int ir_translation_struct_show(struct seq_file *m, void *unused) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + u64 irta; + u32 sts; + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + if (!ecap_ir_support(iommu->ecap)) + continue; + + seq_printf(m, "Remapped Interrupt supported on IOMMU: %s\n", + iommu->name); + + sts = dmar_readl(iommu->reg + DMAR_GSTS_REG); + if (iommu->ir_table && (sts & DMA_GSTS_IRES)) { + irta = virt_to_phys(iommu->ir_table->base); + seq_printf(m, " IR table address:%llx\n", irta); + ir_tbl_remap_entry_show(m, iommu); + } else { + seq_puts(m, "Interrupt Remapping is not enabled\n"); + } + seq_putc(m, '\n'); + } + + seq_puts(m, "****\n\n"); + + for_each_active_iommu(iommu, drhd) { + if (!cap_pi_support(iommu->cap)) + continue; + + seq_printf(m, "Posted Interrupt supported on IOMMU: %s\n", + iommu->name); + + if (iommu->ir_table) { + irta = virt_to_phys(iommu->ir_table->base); + seq_printf(m, " IR table address:%llx\n", irta); + ir_tbl_posted_entry_show(m, iommu); + } else { + seq_puts(m, "Interrupt Remapping is not enabled\n"); + } + seq_putc(m, '\n'); + } + rcu_read_unlock(); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(ir_translation_struct); +#endif + +static void latency_show_one(struct seq_file *m, struct intel_iommu *iommu, + struct dmar_drhd_unit *drhd) +{ + seq_printf(m, "IOMMU: %s Register Base Address: %llx\n", + iommu->name, drhd->reg_base_addr); + + dmar_latency_snapshot(iommu, debug_buf, DEBUG_BUFFER_SIZE); + seq_printf(m, "%s\n", debug_buf); +} + +static int latency_show(struct seq_file *m, void *v) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) + latency_show_one(m, iommu, drhd); + rcu_read_unlock(); + + return 0; +} + +static int dmar_perf_latency_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, latency_show, NULL); +} + +static ssize_t dmar_perf_latency_write(struct file *filp, + const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + int counting; + char buf[64]; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (kstrtoint(buf, 0, &counting)) + return -EINVAL; + + switch (counting) { + case 0: + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + dmar_latency_disable(iommu, DMAR_LATENCY_INV_IOTLB); + dmar_latency_disable(iommu, DMAR_LATENCY_INV_DEVTLB); + dmar_latency_disable(iommu, DMAR_LATENCY_INV_IEC); + } + rcu_read_unlock(); + break; + case 1: + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) + dmar_latency_enable(iommu, DMAR_LATENCY_INV_IOTLB); + rcu_read_unlock(); + break; + case 2: + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) + dmar_latency_enable(iommu, DMAR_LATENCY_INV_DEVTLB); + rcu_read_unlock(); + break; + case 3: + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) + dmar_latency_enable(iommu, DMAR_LATENCY_INV_IEC); + rcu_read_unlock(); + break; + default: + return -EINVAL; + } + + *ppos += cnt; + return cnt; +} + +static const struct file_operations dmar_perf_latency_fops = { + .open = dmar_perf_latency_open, + .write = dmar_perf_latency_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +void __init intel_iommu_debugfs_init(void) +{ + intel_iommu_debug = debugfs_create_dir("intel", iommu_debugfs_dir); + + debugfs_create_file("iommu_regset", 0444, intel_iommu_debug, NULL, + &iommu_regset_fops); + debugfs_create_file("dmar_translation_struct", 0444, intel_iommu_debug, + NULL, &dmar_translation_struct_fops); + debugfs_create_file("invalidation_queue", 0444, intel_iommu_debug, + NULL, &invalidation_queue_fops); +#ifdef CONFIG_IRQ_REMAP + debugfs_create_file("ir_translation_struct", 0444, intel_iommu_debug, + NULL, &ir_translation_struct_fops); +#endif + debugfs_create_file("dmar_perf_latency", 0644, intel_iommu_debug, + NULL, &dmar_perf_latency_fops); +} + +/* + * Create a debugfs directory for each device, and then create a + * debugfs file in this directory for users to dump the page table + * of the default domain. e.g. + * /sys/kernel/debug/iommu/intel/0000:00:01.0/domain_translation_struct + */ +void intel_iommu_debugfs_create_dev(struct device_domain_info *info) +{ + info->debugfs_dentry = debugfs_create_dir(dev_name(info->dev), intel_iommu_debug); + + debugfs_create_file("domain_translation_struct", 0444, info->debugfs_dentry, + info, &dev_domain_translation_struct_fops); +} + +/* Remove the device debugfs directory. */ +void intel_iommu_debugfs_remove_dev(struct device_domain_info *info) +{ + debugfs_remove_recursive(info->debugfs_dentry); +} + +/* + * Create a debugfs directory per pair of {device, pasid}, then create the + * corresponding debugfs file in this directory for users to dump its page + * table. e.g. + * /sys/kernel/debug/iommu/intel/0000:00:01.0/1/domain_translation_struct + * + * The debugfs only dumps the page tables whose mappings are created and + * destroyed by the iommu_map/unmap() interfaces. Check the mapping type + * of the domain before creating debugfs directory. + */ +void intel_iommu_debugfs_create_dev_pasid(struct dev_pasid_info *dev_pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev_pasid->dev); + char dir_name[10]; + + sprintf(dir_name, "%x", dev_pasid->pasid); + dev_pasid->debugfs_dentry = debugfs_create_dir(dir_name, info->debugfs_dentry); + + debugfs_create_file("domain_translation_struct", 0444, dev_pasid->debugfs_dentry, + dev_pasid, &pasid_domain_translation_struct_fops); +} + +/* Remove the device pasid debugfs directory. */ +void intel_iommu_debugfs_remove_dev_pasid(struct dev_pasid_info *dev_pasid) +{ + debugfs_remove_recursive(dev_pasid->debugfs_dentry); +} diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c new file mode 100644 index 000000000000..ec975c73cfe6 --- /dev/null +++ b/drivers/iommu/intel/dmar.c @@ -0,0 +1,2403 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2006, Intel Corporation. + * + * Copyright (C) 2006-2008 Intel Corporation + * Author: Ashok Raj <ashok.raj@intel.com> + * Author: Shaohua Li <shaohua.li@intel.com> + * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> + * + * This file implements early detection/parsing of Remapping Devices + * reported to OS through BIOS via DMA remapping reporting (DMAR) ACPI + * tables. + * + * These routines are used by both DMA-remapping and Interrupt-remapping + */ + +#define pr_fmt(fmt) "DMAR: " fmt + +#include <linux/pci.h> +#include <linux/dmar.h> +#include <linux/iova.h> +#include <linux/timer.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/tboot.h> +#include <linux/dmi.h> +#include <linux/slab.h> +#include <linux/iommu.h> +#include <linux/numa.h> +#include <linux/limits.h> +#include <asm/irq_remapping.h> + +#include "iommu.h" +#include "../irq_remapping.h" +#include "../iommu-pages.h" +#include "perf.h" +#include "trace.h" +#include "perfmon.h" + +typedef int (*dmar_res_handler_t)(struct acpi_dmar_header *, void *); +struct dmar_res_callback { + dmar_res_handler_t cb[ACPI_DMAR_TYPE_RESERVED]; + void *arg[ACPI_DMAR_TYPE_RESERVED]; + bool ignore_unhandled; + bool print_entry; +}; + +/* + * Assumptions: + * 1) The hotplug framework guarentees that DMAR unit will be hot-added + * before IO devices managed by that unit. + * 2) The hotplug framework guarantees that DMAR unit will be hot-removed + * after IO devices managed by that unit. + * 3) Hotplug events are rare. + * + * Locking rules for DMA and interrupt remapping related global data structures: + * 1) Use dmar_global_lock in process context + * 2) Use RCU in interrupt context + */ +DECLARE_RWSEM(dmar_global_lock); +LIST_HEAD(dmar_drhd_units); + +struct acpi_table_header * __initdata dmar_tbl; +static int dmar_dev_scope_status = 1; +static DEFINE_IDA(dmar_seq_ids); + +static int alloc_iommu(struct dmar_drhd_unit *drhd); +static void free_iommu(struct intel_iommu *iommu); + +static void dmar_register_drhd_unit(struct dmar_drhd_unit *drhd) +{ + /* + * add INCLUDE_ALL at the tail, so scan the list will find it at + * the very end. + */ + if (drhd->include_all) + list_add_tail_rcu(&drhd->list, &dmar_drhd_units); + else + list_add_rcu(&drhd->list, &dmar_drhd_units); +} + +void *dmar_alloc_dev_scope(void *start, void *end, int *cnt) +{ + struct acpi_dmar_device_scope *scope; + + *cnt = 0; + while (start < end) { + scope = start; + if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_NAMESPACE || + scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT || + scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE) + (*cnt)++; + else if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_IOAPIC && + scope->entry_type != ACPI_DMAR_SCOPE_TYPE_HPET) { + pr_warn("Unsupported device scope\n"); + } + start += scope->length; + } + if (*cnt == 0) + return NULL; + + return kcalloc(*cnt, sizeof(struct dmar_dev_scope), GFP_KERNEL); +} + +void dmar_free_dev_scope(struct dmar_dev_scope **devices, int *cnt) +{ + int i; + struct device *tmp_dev; + + if (*devices && *cnt) { + for_each_active_dev_scope(*devices, *cnt, i, tmp_dev) + put_device(tmp_dev); + kfree(*devices); + } + + *devices = NULL; + *cnt = 0; +} + +/* Optimize out kzalloc()/kfree() for normal cases */ +static char dmar_pci_notify_info_buf[64]; + +static struct dmar_pci_notify_info * +dmar_alloc_pci_notify_info(struct pci_dev *dev, unsigned long event) +{ + int level = 0; + size_t size; + struct pci_dev *tmp; + struct dmar_pci_notify_info *info; + + /* + * Ignore devices that have a domain number higher than what can + * be looked up in DMAR, e.g. VMD subdevices with domain 0x10000 + */ + if (pci_domain_nr(dev->bus) > U16_MAX) + return NULL; + + /* Only generate path[] for device addition event */ + if (event == BUS_NOTIFY_ADD_DEVICE) + for (tmp = dev; tmp; tmp = tmp->bus->self) + level++; + + size = struct_size(info, path, level); + if (size <= sizeof(dmar_pci_notify_info_buf)) { + info = (struct dmar_pci_notify_info *)dmar_pci_notify_info_buf; + } else { + info = kzalloc(size, GFP_KERNEL); + if (!info) { + if (dmar_dev_scope_status == 0) + dmar_dev_scope_status = -ENOMEM; + return NULL; + } + } + + info->event = event; + info->dev = dev; + info->seg = pci_domain_nr(dev->bus); + info->level = level; + if (event == BUS_NOTIFY_ADD_DEVICE) { + for (tmp = dev; tmp; tmp = tmp->bus->self) { + level--; + info->path[level].bus = tmp->bus->number; + info->path[level].device = PCI_SLOT(tmp->devfn); + info->path[level].function = PCI_FUNC(tmp->devfn); + if (pci_is_root_bus(tmp->bus)) + info->bus = tmp->bus->number; + } + } + + return info; +} + +static inline void dmar_free_pci_notify_info(struct dmar_pci_notify_info *info) +{ + if ((void *)info != dmar_pci_notify_info_buf) + kfree(info); +} + +static bool dmar_match_pci_path(struct dmar_pci_notify_info *info, int bus, + struct acpi_dmar_pci_path *path, int count) +{ + int i; + + if (info->bus != bus) + goto fallback; + if (info->level != count) + goto fallback; + + for (i = 0; i < count; i++) { + if (path[i].device != info->path[i].device || + path[i].function != info->path[i].function) + goto fallback; + } + + return true; + +fallback: + + if (count != 1) + return false; + + i = info->level - 1; + if (bus == info->path[i].bus && + path[0].device == info->path[i].device && + path[0].function == info->path[i].function) { + pr_info(FW_BUG "RMRR entry for device %02x:%02x.%x is broken - applying workaround\n", + bus, path[0].device, path[0].function); + return true; + } + + return false; +} + +/* Return: > 0 if match found, 0 if no match found, < 0 if error happens */ +int dmar_insert_dev_scope(struct dmar_pci_notify_info *info, + void *start, void*end, u16 segment, + struct dmar_dev_scope *devices, + int devices_cnt) +{ + int i, level; + struct device *tmp, *dev = &info->dev->dev; + struct acpi_dmar_device_scope *scope; + struct acpi_dmar_pci_path *path; + + if (segment != info->seg) + return 0; + + for (; start < end; start += scope->length) { + scope = start; + if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_ENDPOINT && + scope->entry_type != ACPI_DMAR_SCOPE_TYPE_BRIDGE) + continue; + + path = (struct acpi_dmar_pci_path *)(scope + 1); + level = (scope->length - sizeof(*scope)) / sizeof(*path); + if (!dmar_match_pci_path(info, scope->bus, path, level)) + continue; + + /* + * We expect devices with endpoint scope to have normal PCI + * headers, and devices with bridge scope to have bridge PCI + * headers. However PCI NTB devices may be listed in the + * DMAR table with bridge scope, even though they have a + * normal PCI header. NTB devices are identified by class + * "BRIDGE_OTHER" (0680h) - we don't declare a socpe mismatch + * for this special case. + */ + if ((scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT && + info->dev->hdr_type != PCI_HEADER_TYPE_NORMAL) || + (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE && + (info->dev->hdr_type == PCI_HEADER_TYPE_NORMAL && + info->dev->class >> 16 != PCI_BASE_CLASS_BRIDGE))) { + pr_warn("Device scope type does not match for %s\n", + pci_name(info->dev)); + return -EINVAL; + } + + for_each_dev_scope(devices, devices_cnt, i, tmp) + if (tmp == NULL) { + devices[i].bus = info->dev->bus->number; + devices[i].devfn = info->dev->devfn; + rcu_assign_pointer(devices[i].dev, + get_device(dev)); + return 1; + } + if (WARN_ON(i >= devices_cnt)) + return -EINVAL; + } + + return 0; +} + +int dmar_remove_dev_scope(struct dmar_pci_notify_info *info, u16 segment, + struct dmar_dev_scope *devices, int count) +{ + int index; + struct device *tmp; + + if (info->seg != segment) + return 0; + + for_each_active_dev_scope(devices, count, index, tmp) + if (tmp == &info->dev->dev) { + RCU_INIT_POINTER(devices[index].dev, NULL); + synchronize_rcu(); + put_device(tmp); + return 1; + } + + return 0; +} + +static int dmar_pci_bus_add_dev(struct dmar_pci_notify_info *info) +{ + int ret = 0; + struct dmar_drhd_unit *dmaru; + struct acpi_dmar_hardware_unit *drhd; + + for_each_drhd_unit(dmaru) { + if (dmaru->include_all) + continue; + + drhd = container_of(dmaru->hdr, + struct acpi_dmar_hardware_unit, header); + ret = dmar_insert_dev_scope(info, (void *)(drhd + 1), + ((void *)drhd) + drhd->header.length, + dmaru->segment, + dmaru->devices, dmaru->devices_cnt); + if (ret) + break; + } + if (ret >= 0) + ret = dmar_iommu_notify_scope_dev(info); + if (ret < 0 && dmar_dev_scope_status == 0) + dmar_dev_scope_status = ret; + + if (ret >= 0) + intel_irq_remap_add_device(info); + + return ret; +} + +static void dmar_pci_bus_del_dev(struct dmar_pci_notify_info *info) +{ + struct dmar_drhd_unit *dmaru; + + for_each_drhd_unit(dmaru) + if (dmar_remove_dev_scope(info, dmaru->segment, + dmaru->devices, dmaru->devices_cnt)) + break; + dmar_iommu_notify_scope_dev(info); +} + +static inline void vf_inherit_msi_domain(struct pci_dev *pdev) +{ + struct pci_dev *physfn = pci_physfn(pdev); + + dev_set_msi_domain(&pdev->dev, dev_get_msi_domain(&physfn->dev)); +} + +static int dmar_pci_bus_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct pci_dev *pdev = to_pci_dev(data); + struct dmar_pci_notify_info *info; + + /* Only care about add/remove events for physical functions. + * For VFs we actually do the lookup based on the corresponding + * PF in device_to_iommu() anyway. */ + if (pdev->is_virtfn) { + /* + * Ensure that the VF device inherits the irq domain of the + * PF device. Ideally the device would inherit the domain + * from the bus, but DMAR can have multiple units per bus + * which makes this impossible. The VF 'bus' could inherit + * from the PF device, but that's yet another x86'sism to + * inflict on everybody else. + */ + if (action == BUS_NOTIFY_ADD_DEVICE) + vf_inherit_msi_domain(pdev); + return NOTIFY_DONE; + } + + if (action != BUS_NOTIFY_ADD_DEVICE && + action != BUS_NOTIFY_REMOVED_DEVICE) + return NOTIFY_DONE; + + info = dmar_alloc_pci_notify_info(pdev, action); + if (!info) + return NOTIFY_DONE; + + down_write(&dmar_global_lock); + if (action == BUS_NOTIFY_ADD_DEVICE) + dmar_pci_bus_add_dev(info); + else if (action == BUS_NOTIFY_REMOVED_DEVICE) + dmar_pci_bus_del_dev(info); + up_write(&dmar_global_lock); + + dmar_free_pci_notify_info(info); + + return NOTIFY_OK; +} + +static struct notifier_block dmar_pci_bus_nb = { + .notifier_call = dmar_pci_bus_notifier, + .priority = 1, +}; + +static struct dmar_drhd_unit * +dmar_find_dmaru(struct acpi_dmar_hardware_unit *drhd) +{ + struct dmar_drhd_unit *dmaru; + + list_for_each_entry_rcu(dmaru, &dmar_drhd_units, list, + dmar_rcu_check()) + if (dmaru->segment == drhd->segment && + dmaru->reg_base_addr == drhd->address) + return dmaru; + + return NULL; +} + +/* + * dmar_parse_one_drhd - parses exactly one DMA remapping hardware definition + * structure which uniquely represent one DMA remapping hardware unit + * present in the platform + */ +static int dmar_parse_one_drhd(struct acpi_dmar_header *header, void *arg) +{ + struct acpi_dmar_hardware_unit *drhd; + struct dmar_drhd_unit *dmaru; + int ret; + + drhd = (struct acpi_dmar_hardware_unit *)header; + dmaru = dmar_find_dmaru(drhd); + if (dmaru) + goto out; + + dmaru = kzalloc(sizeof(*dmaru) + header->length, GFP_KERNEL); + if (!dmaru) + return -ENOMEM; + + /* + * If header is allocated from slab by ACPI _DSM method, we need to + * copy the content because the memory buffer will be freed on return. + */ + dmaru->hdr = (void *)(dmaru + 1); + memcpy(dmaru->hdr, header, header->length); + dmaru->reg_base_addr = drhd->address; + dmaru->segment = drhd->segment; + /* The size of the register set is 2 ^ N 4 KB pages. */ + dmaru->reg_size = 1UL << (drhd->size + 12); + dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */ + dmaru->devices = dmar_alloc_dev_scope((void *)(drhd + 1), + ((void *)drhd) + drhd->header.length, + &dmaru->devices_cnt); + if (dmaru->devices_cnt && dmaru->devices == NULL) { + kfree(dmaru); + return -ENOMEM; + } + + ret = alloc_iommu(dmaru); + if (ret) { + dmar_free_dev_scope(&dmaru->devices, + &dmaru->devices_cnt); + kfree(dmaru); + return ret; + } + dmar_register_drhd_unit(dmaru); + +out: + if (arg) + (*(int *)arg)++; + + return 0; +} + +static void dmar_free_drhd(struct dmar_drhd_unit *dmaru) +{ + if (dmaru->devices && dmaru->devices_cnt) + dmar_free_dev_scope(&dmaru->devices, &dmaru->devices_cnt); + if (dmaru->iommu) + free_iommu(dmaru->iommu); + kfree(dmaru); +} + +static int __init dmar_parse_one_andd(struct acpi_dmar_header *header, + void *arg) +{ + struct acpi_dmar_andd *andd = (void *)header; + + /* Check for NUL termination within the designated length */ + if (strnlen(andd->device_name, header->length - 8) == header->length - 8) { + pr_warn(FW_BUG + "Your BIOS is broken; ANDD object name is not NUL-terminated\n" + "BIOS vendor: %s; Ver: %s; Product Version: %s\n", + dmi_get_system_info(DMI_BIOS_VENDOR), + dmi_get_system_info(DMI_BIOS_VERSION), + dmi_get_system_info(DMI_PRODUCT_VERSION)); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + return -EINVAL; + } + pr_info("ANDD device: %x name: %s\n", andd->device_number, + andd->device_name); + + return 0; +} + +#ifdef CONFIG_ACPI_NUMA +static int dmar_parse_one_rhsa(struct acpi_dmar_header *header, void *arg) +{ + struct acpi_dmar_rhsa *rhsa; + struct dmar_drhd_unit *drhd; + + rhsa = (struct acpi_dmar_rhsa *)header; + for_each_drhd_unit(drhd) { + if (drhd->reg_base_addr == rhsa->base_address) { + int node = pxm_to_node(rhsa->proximity_domain); + + if (node != NUMA_NO_NODE && !node_online(node)) + node = NUMA_NO_NODE; + drhd->iommu->node = node; + return 0; + } + } + pr_warn(FW_BUG + "Your BIOS is broken; RHSA refers to non-existent DMAR unit at %llx\n" + "BIOS vendor: %s; Ver: %s; Product Version: %s\n", + rhsa->base_address, + dmi_get_system_info(DMI_BIOS_VENDOR), + dmi_get_system_info(DMI_BIOS_VERSION), + dmi_get_system_info(DMI_PRODUCT_VERSION)); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + + return 0; +} +#else +#define dmar_parse_one_rhsa dmar_res_noop +#endif + +static void +dmar_table_print_dmar_entry(struct acpi_dmar_header *header) +{ + struct acpi_dmar_hardware_unit *drhd; + struct acpi_dmar_reserved_memory *rmrr; + struct acpi_dmar_atsr *atsr; + struct acpi_dmar_rhsa *rhsa; + struct acpi_dmar_satc *satc; + + switch (header->type) { + case ACPI_DMAR_TYPE_HARDWARE_UNIT: + drhd = container_of(header, struct acpi_dmar_hardware_unit, + header); + pr_info("DRHD base: %#016Lx flags: %#x\n", + (unsigned long long)drhd->address, drhd->flags); + break; + case ACPI_DMAR_TYPE_RESERVED_MEMORY: + rmrr = container_of(header, struct acpi_dmar_reserved_memory, + header); + pr_info("RMRR base: %#016Lx end: %#016Lx\n", + (unsigned long long)rmrr->base_address, + (unsigned long long)rmrr->end_address); + break; + case ACPI_DMAR_TYPE_ROOT_ATS: + atsr = container_of(header, struct acpi_dmar_atsr, header); + pr_info("ATSR flags: %#x\n", atsr->flags); + break; + case ACPI_DMAR_TYPE_HARDWARE_AFFINITY: + rhsa = container_of(header, struct acpi_dmar_rhsa, header); + pr_info("RHSA base: %#016Lx proximity domain: %#x\n", + (unsigned long long)rhsa->base_address, + rhsa->proximity_domain); + break; + case ACPI_DMAR_TYPE_NAMESPACE: + /* We don't print this here because we need to sanity-check + it first. So print it in dmar_parse_one_andd() instead. */ + break; + case ACPI_DMAR_TYPE_SATC: + satc = container_of(header, struct acpi_dmar_satc, header); + pr_info("SATC flags: 0x%x\n", satc->flags); + break; + } +} + +/** + * dmar_table_detect - checks to see if the platform supports DMAR devices + */ +static int __init dmar_table_detect(void) +{ + acpi_status status = AE_OK; + + /* if we could find DMAR table, then there are DMAR devices */ + status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl); + + if (ACPI_SUCCESS(status) && !dmar_tbl) { + pr_warn("Unable to map DMAR\n"); + status = AE_NOT_FOUND; + } + + return ACPI_SUCCESS(status) ? 0 : -ENOENT; +} + +static int dmar_walk_remapping_entries(struct acpi_dmar_header *start, + size_t len, struct dmar_res_callback *cb) +{ + struct acpi_dmar_header *iter, *next; + struct acpi_dmar_header *end = ((void *)start) + len; + + for (iter = start; iter < end; iter = next) { + next = (void *)iter + iter->length; + if (iter->length == 0) { + /* Avoid looping forever on bad ACPI tables */ + pr_debug(FW_BUG "Invalid 0-length structure\n"); + break; + } else if (next > end) { + /* Avoid passing table end */ + pr_warn(FW_BUG "Record passes table end\n"); + return -EINVAL; + } + + if (cb->print_entry) + dmar_table_print_dmar_entry(iter); + + if (iter->type >= ACPI_DMAR_TYPE_RESERVED) { + /* continue for forward compatibility */ + pr_debug("Unknown DMAR structure type %d\n", + iter->type); + } else if (cb->cb[iter->type]) { + int ret; + + ret = cb->cb[iter->type](iter, cb->arg[iter->type]); + if (ret) + return ret; + } else if (!cb->ignore_unhandled) { + pr_warn("No handler for DMAR structure type %d\n", + iter->type); + return -EINVAL; + } + } + + return 0; +} + +static inline int dmar_walk_dmar_table(struct acpi_table_dmar *dmar, + struct dmar_res_callback *cb) +{ + return dmar_walk_remapping_entries((void *)(dmar + 1), + dmar->header.length - sizeof(*dmar), cb); +} + +/** + * parse_dmar_table - parses the DMA reporting table + */ +static int __init +parse_dmar_table(void) +{ + struct acpi_table_dmar *dmar; + int drhd_count = 0; + int ret; + struct dmar_res_callback cb = { + .print_entry = true, + .ignore_unhandled = true, + .arg[ACPI_DMAR_TYPE_HARDWARE_UNIT] = &drhd_count, + .cb[ACPI_DMAR_TYPE_HARDWARE_UNIT] = &dmar_parse_one_drhd, + .cb[ACPI_DMAR_TYPE_RESERVED_MEMORY] = &dmar_parse_one_rmrr, + .cb[ACPI_DMAR_TYPE_ROOT_ATS] = &dmar_parse_one_atsr, + .cb[ACPI_DMAR_TYPE_HARDWARE_AFFINITY] = &dmar_parse_one_rhsa, + .cb[ACPI_DMAR_TYPE_NAMESPACE] = &dmar_parse_one_andd, + .cb[ACPI_DMAR_TYPE_SATC] = &dmar_parse_one_satc, + }; + + /* + * Do it again, earlier dmar_tbl mapping could be mapped with + * fixed map. + */ + dmar_table_detect(); + + /* + * ACPI tables may not be DMA protected by tboot, so use DMAR copy + * SINIT saved in SinitMleData in TXT heap (which is DMA protected) + */ + dmar_tbl = tboot_get_dmar_table(dmar_tbl); + + dmar = (struct acpi_table_dmar *)dmar_tbl; + if (!dmar) + return -ENODEV; + + if (dmar->width < PAGE_SHIFT - 1) { + pr_warn("Invalid DMAR haw\n"); + return -EINVAL; + } + + pr_info("Host address width %d\n", dmar->width + 1); + ret = dmar_walk_dmar_table(dmar, &cb); + if (ret == 0 && drhd_count == 0) + pr_warn(FW_BUG "No DRHD structure found in DMAR table\n"); + + return ret; +} + +static int dmar_pci_device_match(struct dmar_dev_scope devices[], + int cnt, struct pci_dev *dev) +{ + int index; + struct device *tmp; + + while (dev) { + for_each_active_dev_scope(devices, cnt, index, tmp) + if (dev_is_pci(tmp) && dev == to_pci_dev(tmp)) + return 1; + + /* Check our parent */ + dev = dev->bus->self; + } + + return 0; +} + +struct dmar_drhd_unit * +dmar_find_matched_drhd_unit(struct pci_dev *dev) +{ + struct dmar_drhd_unit *dmaru; + struct acpi_dmar_hardware_unit *drhd; + + dev = pci_physfn(dev); + + rcu_read_lock(); + for_each_drhd_unit(dmaru) { + drhd = container_of(dmaru->hdr, + struct acpi_dmar_hardware_unit, + header); + + if (dmaru->include_all && + drhd->segment == pci_domain_nr(dev->bus)) + goto out; + + if (dmar_pci_device_match(dmaru->devices, + dmaru->devices_cnt, dev)) + goto out; + } + dmaru = NULL; +out: + rcu_read_unlock(); + + return dmaru; +} + +static void __init dmar_acpi_insert_dev_scope(u8 device_number, + struct acpi_device *adev) +{ + struct dmar_drhd_unit *dmaru; + struct acpi_dmar_hardware_unit *drhd; + struct acpi_dmar_device_scope *scope; + struct device *tmp; + int i; + struct acpi_dmar_pci_path *path; + + for_each_drhd_unit(dmaru) { + drhd = container_of(dmaru->hdr, + struct acpi_dmar_hardware_unit, + header); + + for (scope = (void *)(drhd + 1); + (unsigned long)scope < ((unsigned long)drhd) + drhd->header.length; + scope = ((void *)scope) + scope->length) { + if (scope->entry_type != ACPI_DMAR_SCOPE_TYPE_NAMESPACE) + continue; + if (scope->enumeration_id != device_number) + continue; + + path = (void *)(scope + 1); + pr_info("ACPI device \"%s\" under DMAR at %llx as %02x:%02x.%d\n", + dev_name(&adev->dev), dmaru->reg_base_addr, + scope->bus, path->device, path->function); + for_each_dev_scope(dmaru->devices, dmaru->devices_cnt, i, tmp) + if (tmp == NULL) { + dmaru->devices[i].bus = scope->bus; + dmaru->devices[i].devfn = PCI_DEVFN(path->device, + path->function); + rcu_assign_pointer(dmaru->devices[i].dev, + get_device(&adev->dev)); + return; + } + BUG_ON(i >= dmaru->devices_cnt); + } + } + pr_warn("No IOMMU scope found for ANDD enumeration ID %d (%s)\n", + device_number, dev_name(&adev->dev)); +} + +static int __init dmar_acpi_dev_scope_init(void) +{ + struct acpi_dmar_andd *andd; + + if (dmar_tbl == NULL) + return -ENODEV; + + for (andd = (void *)dmar_tbl + sizeof(struct acpi_table_dmar); + ((unsigned long)andd) < ((unsigned long)dmar_tbl) + dmar_tbl->length; + andd = ((void *)andd) + andd->header.length) { + if (andd->header.type == ACPI_DMAR_TYPE_NAMESPACE) { + acpi_handle h; + struct acpi_device *adev; + + if (!ACPI_SUCCESS(acpi_get_handle(ACPI_ROOT_OBJECT, + andd->device_name, + &h))) { + pr_err("Failed to find handle for ACPI object %s\n", + andd->device_name); + continue; + } + adev = acpi_fetch_acpi_dev(h); + if (!adev) { + pr_err("Failed to get device for ACPI object %s\n", + andd->device_name); + continue; + } + dmar_acpi_insert_dev_scope(andd->device_number, adev); + } + } + return 0; +} + +int __init dmar_dev_scope_init(void) +{ + struct pci_dev *dev = NULL; + struct dmar_pci_notify_info *info; + + if (dmar_dev_scope_status != 1) + return dmar_dev_scope_status; + + if (list_empty(&dmar_drhd_units)) { + dmar_dev_scope_status = -ENODEV; + } else { + dmar_dev_scope_status = 0; + + dmar_acpi_dev_scope_init(); + + for_each_pci_dev(dev) { + if (dev->is_virtfn) + continue; + + info = dmar_alloc_pci_notify_info(dev, + BUS_NOTIFY_ADD_DEVICE); + if (!info) { + pci_dev_put(dev); + return dmar_dev_scope_status; + } else { + dmar_pci_bus_add_dev(info); + dmar_free_pci_notify_info(info); + } + } + } + + return dmar_dev_scope_status; +} + +void __init dmar_register_bus_notifier(void) +{ + bus_register_notifier(&pci_bus_type, &dmar_pci_bus_nb); +} + + +int __init dmar_table_init(void) +{ + static int dmar_table_initialized; + int ret; + + if (dmar_table_initialized == 0) { + ret = parse_dmar_table(); + if (ret < 0) { + if (ret != -ENODEV) + pr_info("Parse DMAR table failure.\n"); + } else if (list_empty(&dmar_drhd_units)) { + pr_info("No DMAR devices found\n"); + ret = -ENODEV; + } + + if (ret < 0) + dmar_table_initialized = ret; + else + dmar_table_initialized = 1; + } + + return dmar_table_initialized < 0 ? dmar_table_initialized : 0; +} + +static void warn_invalid_dmar(u64 addr, const char *message) +{ + pr_warn_once(FW_BUG + "Your BIOS is broken; DMAR reported at address %llx%s!\n" + "BIOS vendor: %s; Ver: %s; Product Version: %s\n", + addr, message, + dmi_get_system_info(DMI_BIOS_VENDOR), + dmi_get_system_info(DMI_BIOS_VERSION), + dmi_get_system_info(DMI_PRODUCT_VERSION)); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); +} + +static int __ref +dmar_validate_one_drhd(struct acpi_dmar_header *entry, void *arg) +{ + struct acpi_dmar_hardware_unit *drhd; + void __iomem *addr; + u64 cap, ecap; + + drhd = (void *)entry; + if (!drhd->address) { + warn_invalid_dmar(0, ""); + return -EINVAL; + } + + if (arg) + addr = ioremap(drhd->address, VTD_PAGE_SIZE); + else + addr = early_ioremap(drhd->address, VTD_PAGE_SIZE); + if (!addr) { + pr_warn("Can't validate DRHD address: %llx\n", drhd->address); + return -EINVAL; + } + + cap = dmar_readq(addr + DMAR_CAP_REG); + ecap = dmar_readq(addr + DMAR_ECAP_REG); + + if (arg) + iounmap(addr); + else + early_iounmap(addr, VTD_PAGE_SIZE); + + if (cap == (uint64_t)-1 && ecap == (uint64_t)-1) { + warn_invalid_dmar(drhd->address, " returns all ones"); + return -EINVAL; + } + + return 0; +} + +void __init detect_intel_iommu(void) +{ + int ret; + struct dmar_res_callback validate_drhd_cb = { + .cb[ACPI_DMAR_TYPE_HARDWARE_UNIT] = &dmar_validate_one_drhd, + .ignore_unhandled = true, + }; + + down_write(&dmar_global_lock); + ret = dmar_table_detect(); + if (!ret) + ret = dmar_walk_dmar_table((struct acpi_table_dmar *)dmar_tbl, + &validate_drhd_cb); + if (!ret && !no_iommu && !iommu_detected && + (!dmar_disabled || dmar_platform_optin())) { + iommu_detected = 1; + /* Make sure ACS will be enabled */ + pci_request_acs(); + } + + if (!ret) { + x86_init.iommu.iommu_init = intel_iommu_init; + x86_platform.iommu_shutdown = intel_iommu_shutdown; + } + + if (dmar_tbl) { + acpi_put_table(dmar_tbl); + dmar_tbl = NULL; + } + up_write(&dmar_global_lock); +} + +static void unmap_iommu(struct intel_iommu *iommu) +{ + iounmap(iommu->reg); + release_mem_region(iommu->reg_phys, iommu->reg_size); +} + +/** + * map_iommu: map the iommu's registers + * @iommu: the iommu to map + * @drhd: DMA remapping hardware definition structure + * + * Memory map the iommu's registers. Start w/ a single page, and + * possibly expand if that turns out to be insufficent. + */ +static int map_iommu(struct intel_iommu *iommu, struct dmar_drhd_unit *drhd) +{ + u64 phys_addr = drhd->reg_base_addr; + int map_size, err=0; + + iommu->reg_phys = phys_addr; + iommu->reg_size = drhd->reg_size; + + if (!request_mem_region(iommu->reg_phys, iommu->reg_size, iommu->name)) { + pr_err("Can't reserve memory\n"); + err = -EBUSY; + goto out; + } + + iommu->reg = ioremap(iommu->reg_phys, iommu->reg_size); + if (!iommu->reg) { + pr_err("Can't map the region\n"); + err = -ENOMEM; + goto release; + } + + iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG); + iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG); + + if (iommu->cap == (uint64_t)-1 && iommu->ecap == (uint64_t)-1) { + err = -EINVAL; + warn_invalid_dmar(phys_addr, " returns all ones"); + goto unmap; + } + + /* the registers might be more than one page */ + map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap), + cap_max_fault_reg_offset(iommu->cap)); + map_size = VTD_PAGE_ALIGN(map_size); + if (map_size > iommu->reg_size) { + iounmap(iommu->reg); + release_mem_region(iommu->reg_phys, iommu->reg_size); + iommu->reg_size = map_size; + if (!request_mem_region(iommu->reg_phys, iommu->reg_size, + iommu->name)) { + pr_err("Can't reserve memory\n"); + err = -EBUSY; + goto out; + } + iommu->reg = ioremap(iommu->reg_phys, iommu->reg_size); + if (!iommu->reg) { + pr_err("Can't map the region\n"); + err = -ENOMEM; + goto release; + } + } + + if (cap_ecmds(iommu->cap)) { + int i; + + for (i = 0; i < DMA_MAX_NUM_ECMDCAP; i++) { + iommu->ecmdcap[i] = dmar_readq(iommu->reg + DMAR_ECCAP_REG + + i * DMA_ECMD_REG_STEP); + } + } + + err = 0; + goto out; + +unmap: + iounmap(iommu->reg); +release: + release_mem_region(iommu->reg_phys, iommu->reg_size); +out: + return err; +} + +static int alloc_iommu(struct dmar_drhd_unit *drhd) +{ + struct intel_iommu *iommu; + u32 ver, sts; + int agaw = -1; + int msagaw = -1; + int err; + + if (!drhd->reg_base_addr) { + warn_invalid_dmar(0, ""); + return -EINVAL; + } + + iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); + if (!iommu) + return -ENOMEM; + + iommu->seq_id = ida_alloc_range(&dmar_seq_ids, 0, + DMAR_UNITS_SUPPORTED - 1, GFP_KERNEL); + if (iommu->seq_id < 0) { + pr_err("Failed to allocate seq_id\n"); + err = iommu->seq_id; + goto error; + } + snprintf(iommu->name, sizeof(iommu->name), "dmar%d", iommu->seq_id); + + err = map_iommu(iommu, drhd); + if (err) { + pr_err("Failed to map %s\n", iommu->name); + goto error_free_seq_id; + } + + if (!cap_sagaw(iommu->cap) && + (!ecap_smts(iommu->ecap) || ecap_slts(iommu->ecap))) { + pr_info("%s: No supported address widths. Not attempting DMA translation.\n", + iommu->name); + drhd->ignored = 1; + } + + if (!drhd->ignored) { + agaw = iommu_calculate_agaw(iommu); + if (agaw < 0) { + pr_err("Cannot get a valid agaw for iommu (seq_id = %d)\n", + iommu->seq_id); + drhd->ignored = 1; + } + } + if (!drhd->ignored) { + msagaw = iommu_calculate_max_sagaw(iommu); + if (msagaw < 0) { + pr_err("Cannot get a valid max agaw for iommu (seq_id = %d)\n", + iommu->seq_id); + drhd->ignored = 1; + agaw = -1; + } + } + iommu->agaw = agaw; + iommu->msagaw = msagaw; + iommu->segment = drhd->segment; + iommu->device_rbtree = RB_ROOT; + spin_lock_init(&iommu->device_rbtree_lock); + mutex_init(&iommu->iopf_lock); + iommu->node = NUMA_NO_NODE; + spin_lock_init(&iommu->lock); + ida_init(&iommu->domain_ida); + mutex_init(&iommu->did_lock); + + ver = readl(iommu->reg + DMAR_VER_REG); + pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n", + iommu->name, + (unsigned long long)drhd->reg_base_addr, + DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver), + (unsigned long long)iommu->cap, + (unsigned long long)iommu->ecap); + + /* Reflect status in gcmd */ + sts = readl(iommu->reg + DMAR_GSTS_REG); + if (sts & DMA_GSTS_IRES) + iommu->gcmd |= DMA_GCMD_IRE; + if (sts & DMA_GSTS_TES) + iommu->gcmd |= DMA_GCMD_TE; + if (sts & DMA_GSTS_QIES) + iommu->gcmd |= DMA_GCMD_QIE; + + if (alloc_iommu_pmu(iommu)) + pr_debug("Cannot alloc PMU for iommu (seq_id = %d)\n", iommu->seq_id); + + raw_spin_lock_init(&iommu->register_lock); + + /* + * A value of N in PSS field of eCap register indicates hardware + * supports PASID field of N+1 bits. + */ + if (pasid_supported(iommu)) + iommu->iommu.max_pasids = 2UL << ecap_pss(iommu->ecap); + + /* + * This is only for hotplug; at boot time intel_iommu_enabled won't + * be set yet. When intel_iommu_init() runs, it registers the units + * present at boot time, then sets intel_iommu_enabled. + */ + if (intel_iommu_enabled && !drhd->ignored) { + err = iommu_device_sysfs_add(&iommu->iommu, NULL, + intel_iommu_groups, + "%s", iommu->name); + if (err) + goto err_unmap; + + err = iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); + if (err) + goto err_sysfs; + + iommu_pmu_register(iommu); + } + + drhd->iommu = iommu; + iommu->drhd = drhd; + + return 0; + +err_sysfs: + iommu_device_sysfs_remove(&iommu->iommu); +err_unmap: + free_iommu_pmu(iommu); + unmap_iommu(iommu); +error_free_seq_id: + ida_free(&dmar_seq_ids, iommu->seq_id); +error: + kfree(iommu); + return err; +} + +static void free_iommu(struct intel_iommu *iommu) +{ + if (intel_iommu_enabled && !iommu->drhd->ignored) { + iommu_pmu_unregister(iommu); + iommu_device_unregister(&iommu->iommu); + iommu_device_sysfs_remove(&iommu->iommu); + } + + free_iommu_pmu(iommu); + + if (iommu->irq) { + if (iommu->pr_irq) { + free_irq(iommu->pr_irq, iommu); + dmar_free_hwirq(iommu->pr_irq); + iommu->pr_irq = 0; + } + free_irq(iommu->irq, iommu); + dmar_free_hwirq(iommu->irq); + iommu->irq = 0; + } + + if (iommu->qi) { + iommu_free_pages(iommu->qi->desc); + kfree(iommu->qi->desc_status); + kfree(iommu->qi); + } + + if (iommu->reg) + unmap_iommu(iommu); + + ida_destroy(&iommu->domain_ida); + ida_free(&dmar_seq_ids, iommu->seq_id); + kfree(iommu); +} + +/* + * Reclaim all the submitted descriptors which have completed its work. + */ +static inline void reclaim_free_desc(struct q_inval *qi) +{ + while (qi->desc_status[qi->free_tail] == QI_FREE && qi->free_tail != qi->free_head) { + qi->free_tail = (qi->free_tail + 1) % QI_LENGTH; + qi->free_cnt++; + } +} + +static const char *qi_type_string(u8 type) +{ + switch (type) { + case QI_CC_TYPE: + return "Context-cache Invalidation"; + case QI_IOTLB_TYPE: + return "IOTLB Invalidation"; + case QI_DIOTLB_TYPE: + return "Device-TLB Invalidation"; + case QI_IEC_TYPE: + return "Interrupt Entry Cache Invalidation"; + case QI_IWD_TYPE: + return "Invalidation Wait"; + case QI_EIOTLB_TYPE: + return "PASID-based IOTLB Invalidation"; + case QI_PC_TYPE: + return "PASID-cache Invalidation"; + case QI_DEIOTLB_TYPE: + return "PASID-based Device-TLB Invalidation"; + case QI_PGRP_RESP_TYPE: + return "Page Group Response"; + default: + return "UNKNOWN"; + } +} + +static void qi_dump_fault(struct intel_iommu *iommu, u32 fault) +{ + unsigned int head = dmar_readl(iommu->reg + DMAR_IQH_REG); + u64 iqe_err = dmar_readq(iommu->reg + DMAR_IQER_REG); + struct qi_desc *desc = iommu->qi->desc + head; + + if (fault & DMA_FSTS_IQE) + pr_err("VT-d detected Invalidation Queue Error: Reason %llx", + DMAR_IQER_REG_IQEI(iqe_err)); + if (fault & DMA_FSTS_ITE) + pr_err("VT-d detected Invalidation Time-out Error: SID %llx", + DMAR_IQER_REG_ITESID(iqe_err)); + if (fault & DMA_FSTS_ICE) + pr_err("VT-d detected Invalidation Completion Error: SID %llx", + DMAR_IQER_REG_ICESID(iqe_err)); + + pr_err("QI HEAD: %s qw0 = 0x%llx, qw1 = 0x%llx\n", + qi_type_string(desc->qw0 & 0xf), + (unsigned long long)desc->qw0, + (unsigned long long)desc->qw1); + + head = ((head >> qi_shift(iommu)) + QI_LENGTH - 1) % QI_LENGTH; + head <<= qi_shift(iommu); + desc = iommu->qi->desc + head; + + pr_err("QI PRIOR: %s qw0 = 0x%llx, qw1 = 0x%llx\n", + qi_type_string(desc->qw0 & 0xf), + (unsigned long long)desc->qw0, + (unsigned long long)desc->qw1); +} + +static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) +{ + u32 fault; + int head, tail; + struct device *dev; + u64 iqe_err, ite_sid; + struct q_inval *qi = iommu->qi; + int shift = qi_shift(iommu); + + if (qi->desc_status[wait_index] == QI_ABORT) + return -EAGAIN; + + fault = readl(iommu->reg + DMAR_FSTS_REG); + if (fault & (DMA_FSTS_IQE | DMA_FSTS_ITE | DMA_FSTS_ICE)) + qi_dump_fault(iommu, fault); + + /* + * If IQE happens, the head points to the descriptor associated + * with the error. No new descriptors are fetched until the IQE + * is cleared. + */ + if (fault & DMA_FSTS_IQE) { + head = readl(iommu->reg + DMAR_IQH_REG); + if ((head >> shift) == index) { + struct qi_desc *desc = qi->desc + head; + + /* + * desc->qw2 and desc->qw3 are either reserved or + * used by software as private data. We won't print + * out these two qw's for security consideration. + */ + memcpy(desc, qi->desc + (wait_index << shift), + 1 << shift); + writel(DMA_FSTS_IQE, iommu->reg + DMAR_FSTS_REG); + pr_info("Invalidation Queue Error (IQE) cleared\n"); + return -EINVAL; + } + } + + /* + * If ITE happens, all pending wait_desc commands are aborted. + * No new descriptors are fetched until the ITE is cleared. + */ + if (fault & DMA_FSTS_ITE) { + head = readl(iommu->reg + DMAR_IQH_REG); + head = ((head >> shift) - 1 + QI_LENGTH) % QI_LENGTH; + head |= 1; + tail = readl(iommu->reg + DMAR_IQT_REG); + tail = ((tail >> shift) - 1 + QI_LENGTH) % QI_LENGTH; + + /* + * SID field is valid only when the ITE field is Set in FSTS_REG + * see Intel VT-d spec r4.1, section 11.4.9.9 + */ + iqe_err = dmar_readq(iommu->reg + DMAR_IQER_REG); + ite_sid = DMAR_IQER_REG_ITESID(iqe_err); + + writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG); + pr_info("Invalidation Time-out Error (ITE) cleared\n"); + + do { + if (qi->desc_status[head] == QI_IN_USE) + qi->desc_status[head] = QI_ABORT; + head = (head - 2 + QI_LENGTH) % QI_LENGTH; + } while (head != tail); + + /* + * If device was released or isn't present, no need to retry + * the ATS invalidate request anymore. + * + * 0 value of ite_sid means old VT-d device, no ite_sid value. + * see Intel VT-d spec r4.1, section 11.4.9.9 + */ + if (ite_sid) { + dev = device_rbtree_find(iommu, ite_sid); + if (!dev || !dev_is_pci(dev) || + !pci_device_is_present(to_pci_dev(dev))) + return -ETIMEDOUT; + } + if (qi->desc_status[wait_index] == QI_ABORT) + return -EAGAIN; + } + + if (fault & DMA_FSTS_ICE) { + writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG); + pr_info("Invalidation Completion Error (ICE) cleared\n"); + } + + return 0; +} + +/* + * Function to submit invalidation descriptors of all types to the queued + * invalidation interface(QI). Multiple descriptors can be submitted at a + * time, a wait descriptor will be appended to each submission to ensure + * hardware has completed the invalidation before return. Wait descriptors + * can be part of the submission but it will not be polled for completion. + */ +int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, + unsigned int count, unsigned long options) +{ + struct q_inval *qi = iommu->qi; + s64 devtlb_start_ktime = 0; + s64 iotlb_start_ktime = 0; + s64 iec_start_ktime = 0; + struct qi_desc wait_desc; + int wait_index, index; + unsigned long flags; + int offset, shift; + int rc, i; + u64 type; + + if (!qi) + return 0; + + type = desc->qw0 & GENMASK_ULL(3, 0); + + if ((type == QI_IOTLB_TYPE || type == QI_EIOTLB_TYPE) && + dmar_latency_enabled(iommu, DMAR_LATENCY_INV_IOTLB)) + iotlb_start_ktime = ktime_to_ns(ktime_get()); + + if ((type == QI_DIOTLB_TYPE || type == QI_DEIOTLB_TYPE) && + dmar_latency_enabled(iommu, DMAR_LATENCY_INV_DEVTLB)) + devtlb_start_ktime = ktime_to_ns(ktime_get()); + + if (type == QI_IEC_TYPE && + dmar_latency_enabled(iommu, DMAR_LATENCY_INV_IEC)) + iec_start_ktime = ktime_to_ns(ktime_get()); + +restart: + rc = 0; + + raw_spin_lock_irqsave(&qi->q_lock, flags); + /* + * Check if we have enough empty slots in the queue to submit, + * the calculation is based on: + * # of desc + 1 wait desc + 1 space between head and tail + */ + while (qi->free_cnt < count + 2) { + raw_spin_unlock_irqrestore(&qi->q_lock, flags); + cpu_relax(); + raw_spin_lock_irqsave(&qi->q_lock, flags); + } + + index = qi->free_head; + wait_index = (index + count) % QI_LENGTH; + shift = qi_shift(iommu); + + for (i = 0; i < count; i++) { + offset = ((index + i) % QI_LENGTH) << shift; + memcpy(qi->desc + offset, &desc[i], 1 << shift); + qi->desc_status[(index + i) % QI_LENGTH] = QI_IN_USE; + trace_qi_submit(iommu, desc[i].qw0, desc[i].qw1, + desc[i].qw2, desc[i].qw3); + } + qi->desc_status[wait_index] = QI_IN_USE; + + wait_desc.qw0 = QI_IWD_STATUS_DATA(QI_DONE) | + QI_IWD_STATUS_WRITE | QI_IWD_TYPE; + if (options & QI_OPT_WAIT_DRAIN) + wait_desc.qw0 |= QI_IWD_PRQ_DRAIN; + wait_desc.qw1 = virt_to_phys(&qi->desc_status[wait_index]); + wait_desc.qw2 = 0; + wait_desc.qw3 = 0; + + offset = wait_index << shift; + memcpy(qi->desc + offset, &wait_desc, 1 << shift); + + qi->free_head = (qi->free_head + count + 1) % QI_LENGTH; + qi->free_cnt -= count + 1; + + /* + * update the HW tail register indicating the presence of + * new descriptors. + */ + writel(qi->free_head << shift, iommu->reg + DMAR_IQT_REG); + + while (READ_ONCE(qi->desc_status[wait_index]) != QI_DONE) { + /* + * We will leave the interrupts disabled, to prevent interrupt + * context to queue another cmd while a cmd is already submitted + * and waiting for completion on this cpu. This is to avoid + * a deadlock where the interrupt context can wait indefinitely + * for free slots in the queue. + */ + rc = qi_check_fault(iommu, index, wait_index); + if (rc) + break; + + raw_spin_unlock(&qi->q_lock); + cpu_relax(); + raw_spin_lock(&qi->q_lock); + } + + /* + * The reclaim code can free descriptors from multiple submissions + * starting from the tail of the queue. When count == 0, the + * status of the standalone wait descriptor at the tail of the queue + * must be set to QI_FREE to allow the reclaim code to proceed. + * It is also possible that descriptors from one of the previous + * submissions has to be reclaimed by a subsequent submission. + */ + for (i = 0; i <= count; i++) + qi->desc_status[(index + i) % QI_LENGTH] = QI_FREE; + + reclaim_free_desc(qi); + raw_spin_unlock_irqrestore(&qi->q_lock, flags); + + if (rc == -EAGAIN) + goto restart; + + if (iotlb_start_ktime) + dmar_latency_update(iommu, DMAR_LATENCY_INV_IOTLB, + ktime_to_ns(ktime_get()) - iotlb_start_ktime); + + if (devtlb_start_ktime) + dmar_latency_update(iommu, DMAR_LATENCY_INV_DEVTLB, + ktime_to_ns(ktime_get()) - devtlb_start_ktime); + + if (iec_start_ktime) + dmar_latency_update(iommu, DMAR_LATENCY_INV_IEC, + ktime_to_ns(ktime_get()) - iec_start_ktime); + + return rc; +} + +/* + * Flush the global interrupt entry cache. + */ +void qi_global_iec(struct intel_iommu *iommu) +{ + struct qi_desc desc; + + desc.qw0 = QI_IEC_TYPE; + desc.qw1 = 0; + desc.qw2 = 0; + desc.qw3 = 0; + + /* should never fail */ + qi_submit_sync(iommu, &desc, 1, 0); +} + +void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, + u64 type) +{ + struct qi_desc desc; + + desc.qw0 = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did) + | QI_CC_GRAN(type) | QI_CC_TYPE; + desc.qw1 = 0; + desc.qw2 = 0; + desc.qw3 = 0; + + qi_submit_sync(iommu, &desc, 1, 0); +} + +void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type) +{ + struct qi_desc desc; + + qi_desc_iotlb(iommu, did, addr, size_order, type, &desc); + qi_submit_sync(iommu, &desc, 1, 0); +} + +void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u16 qdep, u64 addr, unsigned mask) +{ + struct qi_desc desc; + + /* + * VT-d spec, section 4.3: + * + * Software is recommended to not submit any Device-TLB invalidation + * requests while address remapping hardware is disabled. + */ + if (!(iommu->gcmd & DMA_GCMD_TE)) + return; + + qi_desc_dev_iotlb(sid, pfsid, qdep, addr, mask, &desc); + qi_submit_sync(iommu, &desc, 1, 0); +} + +/* PASID-based IOTLB invalidation */ +void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, + unsigned long npages, bool ih) +{ + struct qi_desc desc = {.qw2 = 0, .qw3 = 0}; + + /* + * npages == -1 means a PASID-selective invalidation, otherwise, + * a positive value for Page-selective-within-PASID invalidation. + * 0 is not a valid input. + */ + if (WARN_ON(!npages)) { + pr_err("Invalid input npages = %ld\n", npages); + return; + } + + qi_desc_piotlb(did, pasid, addr, npages, ih, &desc); + qi_submit_sync(iommu, &desc, 1, 0); +} + +/* PASID-based device IOTLB Invalidate */ +void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u32 pasid, u16 qdep, u64 addr, unsigned int size_order) +{ + struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; + + /* + * VT-d spec, section 4.3: + * + * Software is recommended to not submit any Device-TLB invalidation + * requests while address remapping hardware is disabled. + */ + if (!(iommu->gcmd & DMA_GCMD_TE)) + return; + + qi_desc_dev_iotlb_pasid(sid, pfsid, pasid, + qdep, addr, size_order, + &desc); + qi_submit_sync(iommu, &desc, 1, 0); +} + +void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, + u64 granu, u32 pasid) +{ + struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; + + desc.qw0 = QI_PC_PASID(pasid) | QI_PC_DID(did) | + QI_PC_GRAN(granu) | QI_PC_TYPE; + qi_submit_sync(iommu, &desc, 1, 0); +} + +/* + * Disable Queued Invalidation interface. + */ +void dmar_disable_qi(struct intel_iommu *iommu) +{ + unsigned long flags; + u32 sts; + cycles_t start_time = get_cycles(); + + if (!ecap_qis(iommu->ecap)) + return; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + + sts = readl(iommu->reg + DMAR_GSTS_REG); + if (!(sts & DMA_GSTS_QIES)) + goto end; + + /* + * Give a chance to HW to complete the pending invalidation requests. + */ + while ((readl(iommu->reg + DMAR_IQT_REG) != + readl(iommu->reg + DMAR_IQH_REG)) && + (DMAR_OPERATION_TIMEOUT > (get_cycles() - start_time))) + cpu_relax(); + + iommu->gcmd &= ~DMA_GCMD_QIE; + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, + !(sts & DMA_GSTS_QIES), sts); +end: + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +/* + * Enable queued invalidation. + */ +static void __dmar_enable_qi(struct intel_iommu *iommu) +{ + u32 sts; + unsigned long flags; + struct q_inval *qi = iommu->qi; + u64 val = virt_to_phys(qi->desc); + + qi->free_head = qi->free_tail = 0; + qi->free_cnt = QI_LENGTH; + + /* + * Set DW=1 and QS=1 in IQA_REG when Scalable Mode capability + * is present. + */ + if (ecap_smts(iommu->ecap)) + val |= BIT_ULL(11) | BIT_ULL(0); + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + + /* write zero to the tail reg */ + writel(0, iommu->reg + DMAR_IQT_REG); + + dmar_writeq(iommu->reg + DMAR_IQA_REG, val); + + iommu->gcmd |= DMA_GCMD_QIE; + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_QIES), sts); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +/* + * Enable Queued Invalidation interface. This is a must to support + * interrupt-remapping. Also used by DMA-remapping, which replaces + * register based IOTLB invalidation. + */ +int dmar_enable_qi(struct intel_iommu *iommu) +{ + struct q_inval *qi; + void *desc; + + if (!ecap_qis(iommu->ecap)) + return -ENOENT; + + /* + * queued invalidation is already setup and enabled. + */ + if (iommu->qi) + return 0; + + iommu->qi = kmalloc(sizeof(*qi), GFP_ATOMIC); + if (!iommu->qi) + return -ENOMEM; + + qi = iommu->qi; + + /* + * Need two pages to accommodate 256 descriptors of 256 bits each + * if the remapping hardware supports scalable mode translation. + */ + desc = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, + ecap_smts(iommu->ecap) ? SZ_8K : + SZ_4K); + if (!desc) { + kfree(qi); + iommu->qi = NULL; + return -ENOMEM; + } + + qi->desc = desc; + + qi->desc_status = kcalloc(QI_LENGTH, sizeof(int), GFP_ATOMIC); + if (!qi->desc_status) { + iommu_free_pages(qi->desc); + kfree(qi); + iommu->qi = NULL; + return -ENOMEM; + } + + raw_spin_lock_init(&qi->q_lock); + + __dmar_enable_qi(iommu); + + return 0; +} + +/* iommu interrupt handling. Most stuff are MSI-like. */ + +enum faulttype { + DMA_REMAP, + INTR_REMAP, + UNKNOWN, +}; + +static const char *dma_remap_fault_reasons[] = +{ + "Software", + "Present bit in root entry is clear", + "Present bit in context entry is clear", + "Invalid context entry", + "Access beyond MGAW", + "PTE Write access is not set", + "PTE Read access is not set", + "Next page table ptr is invalid", + "Root table address invalid", + "Context table ptr is invalid", + "non-zero reserved fields in RTP", + "non-zero reserved fields in CTP", + "non-zero reserved fields in PTE", + "PCE for translation request specifies blocking", +}; + +static const char * const dma_remap_sm_fault_reasons[] = { + "SM: Invalid Root Table Address", + "SM: TTM 0 for request with PASID", + "SM: TTM 0 for page group request", + "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x33-0x37 */ + "SM: Error attempting to access Root Entry", + "SM: Present bit in Root Entry is clear", + "SM: Non-zero reserved field set in Root Entry", + "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x3B-0x3F */ + "SM: Error attempting to access Context Entry", + "SM: Present bit in Context Entry is clear", + "SM: Non-zero reserved field set in the Context Entry", + "SM: Invalid Context Entry", + "SM: DTE field in Context Entry is clear", + "SM: PASID Enable field in Context Entry is clear", + "SM: PASID is larger than the max in Context Entry", + "SM: PRE field in Context-Entry is clear", + "SM: RID_PASID field error in Context-Entry", + "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x49-0x4F */ + "SM: Error attempting to access the PASID Directory Entry", + "SM: Present bit in Directory Entry is clear", + "SM: Non-zero reserved field set in PASID Directory Entry", + "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x53-0x57 */ + "SM: Error attempting to access PASID Table Entry", + "SM: Present bit in PASID Table Entry is clear", + "SM: Non-zero reserved field set in PASID Table Entry", + "SM: Invalid Scalable-Mode PASID Table Entry", + "SM: ERE field is clear in PASID Table Entry", + "SM: SRE field is clear in PASID Table Entry", + "Unknown", "Unknown",/* 0x5E-0x5F */ + "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x60-0x67 */ + "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x68-0x6F */ + "SM: Error attempting to access first-level paging entry", + "SM: Present bit in first-level paging entry is clear", + "SM: Non-zero reserved field set in first-level paging entry", + "SM: Error attempting to access FL-PML4 entry", + "SM: First-level entry address beyond MGAW in Nested translation", + "SM: Read permission error in FL-PML4 entry in Nested translation", + "SM: Read permission error in first-level paging entry in Nested translation", + "SM: Write permission error in first-level paging entry in Nested translation", + "SM: Error attempting to access second-level paging entry", + "SM: Read/Write permission error in second-level paging entry", + "SM: Non-zero reserved field set in second-level paging entry", + "SM: Invalid second-level page table pointer", + "SM: A/D bit update needed in second-level entry when set up in no snoop", + "Unknown", "Unknown", "Unknown", /* 0x7D-0x7F */ + "SM: Address in first-level translation is not canonical", + "SM: U/S set 0 for first-level translation with user privilege", + "SM: No execute permission for request with PASID and ER=1", + "SM: Address beyond the DMA hardware max", + "SM: Second-level entry address beyond the max", + "SM: No write permission for Write/AtomicOp request", + "SM: No read permission for Read/AtomicOp request", + "SM: Invalid address-interrupt address", + "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", "Unknown", /* 0x88-0x8F */ + "SM: A/D bit update needed in first-level entry when set up in no snoop", +}; + +static const char *irq_remap_fault_reasons[] = +{ + "Detected reserved fields in the decoded interrupt-remapped request", + "Interrupt index exceeded the interrupt-remapping table size", + "Present field in the IRTE entry is clear", + "Error accessing interrupt-remapping table pointed by IRTA_REG", + "Detected reserved fields in the IRTE entry", + "Blocked a compatibility format interrupt request", + "Blocked an interrupt request due to source-id verification failure", +}; + +static const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type) +{ + if (fault_reason >= 0x20 && (fault_reason - 0x20 < + ARRAY_SIZE(irq_remap_fault_reasons))) { + *fault_type = INTR_REMAP; + return irq_remap_fault_reasons[fault_reason - 0x20]; + } else if (fault_reason >= 0x30 && (fault_reason - 0x30 < + ARRAY_SIZE(dma_remap_sm_fault_reasons))) { + *fault_type = DMA_REMAP; + return dma_remap_sm_fault_reasons[fault_reason - 0x30]; + } else if (fault_reason < ARRAY_SIZE(dma_remap_fault_reasons)) { + *fault_type = DMA_REMAP; + return dma_remap_fault_reasons[fault_reason]; + } else { + *fault_type = UNKNOWN; + return "Unknown"; + } +} + + +static inline int dmar_msi_reg(struct intel_iommu *iommu, int irq) +{ + if (iommu->irq == irq) + return DMAR_FECTL_REG; + else if (iommu->pr_irq == irq) + return DMAR_PECTL_REG; + else if (iommu->perf_irq == irq) + return DMAR_PERFINTRCTL_REG; + else + BUG(); +} + +void dmar_msi_unmask(struct irq_data *data) +{ + struct intel_iommu *iommu = irq_data_get_irq_handler_data(data); + int reg = dmar_msi_reg(iommu, data->irq); + unsigned long flag; + + /* unmask it */ + raw_spin_lock_irqsave(&iommu->register_lock, flag); + writel(0, iommu->reg + reg); + /* Read a reg to force flush the post write */ + readl(iommu->reg + reg); + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +void dmar_msi_mask(struct irq_data *data) +{ + struct intel_iommu *iommu = irq_data_get_irq_handler_data(data); + int reg = dmar_msi_reg(iommu, data->irq); + unsigned long flag; + + /* mask it */ + raw_spin_lock_irqsave(&iommu->register_lock, flag); + writel(DMA_FECTL_IM, iommu->reg + reg); + /* Read a reg to force flush the post write */ + readl(iommu->reg + reg); + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +void dmar_msi_write(int irq, struct msi_msg *msg) +{ + struct intel_iommu *iommu = irq_get_handler_data(irq); + int reg = dmar_msi_reg(iommu, irq); + unsigned long flag; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); + writel(msg->data, iommu->reg + reg + 4); + writel(msg->address_lo, iommu->reg + reg + 8); + writel(msg->address_hi, iommu->reg + reg + 12); + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +static int dmar_fault_do_one(struct intel_iommu *iommu, int type, + u8 fault_reason, u32 pasid, u16 source_id, + unsigned long long addr) +{ + const char *reason; + int fault_type; + + reason = dmar_get_fault_reason(fault_reason, &fault_type); + + if (fault_type == INTR_REMAP) { + pr_err("[INTR-REMAP] Request device [%02x:%02x.%d] fault index 0x%llx [fault reason 0x%02x] %s\n", + source_id >> 8, PCI_SLOT(source_id & 0xFF), + PCI_FUNC(source_id & 0xFF), addr >> 48, + fault_reason, reason); + + return 0; + } + + if (pasid == IOMMU_PASID_INVALID) + pr_err("[%s NO_PASID] Request device [%02x:%02x.%d] fault addr 0x%llx [fault reason 0x%02x] %s\n", + type ? "DMA Read" : "DMA Write", + source_id >> 8, PCI_SLOT(source_id & 0xFF), + PCI_FUNC(source_id & 0xFF), addr, + fault_reason, reason); + else + pr_err("[%s PASID 0x%x] Request device [%02x:%02x.%d] fault addr 0x%llx [fault reason 0x%02x] %s\n", + type ? "DMA Read" : "DMA Write", pasid, + source_id >> 8, PCI_SLOT(source_id & 0xFF), + PCI_FUNC(source_id & 0xFF), addr, + fault_reason, reason); + + dmar_fault_dump_ptes(iommu, source_id, addr, pasid); + + return 0; +} + +#define PRIMARY_FAULT_REG_LEN (16) +irqreturn_t dmar_fault(int irq, void *dev_id) +{ + struct intel_iommu *iommu = dev_id; + int reg, fault_index; + u32 fault_status; + unsigned long flag; + static DEFINE_RATELIMIT_STATE(rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + raw_spin_lock_irqsave(&iommu->register_lock, flag); + fault_status = readl(iommu->reg + DMAR_FSTS_REG); + if (fault_status && __ratelimit(&rs)) + pr_err("DRHD: handling fault status reg %x\n", fault_status); + + /* TBD: ignore advanced fault log currently */ + if (!(fault_status & DMA_FSTS_PPF)) + goto unlock_exit; + + fault_index = dma_fsts_fault_record_index(fault_status); + reg = cap_fault_reg_offset(iommu->cap); + while (1) { + /* Disable printing, simply clear the fault when ratelimited */ + bool ratelimited = !__ratelimit(&rs); + u8 fault_reason; + u16 source_id; + u64 guest_addr; + u32 pasid; + int type; + u32 data; + bool pasid_present; + + /* highest 32 bits */ + data = readl(iommu->reg + reg + + fault_index * PRIMARY_FAULT_REG_LEN + 12); + if (!(data & DMA_FRCD_F)) + break; + + if (!ratelimited) { + fault_reason = dma_frcd_fault_reason(data); + type = dma_frcd_type(data); + + pasid = dma_frcd_pasid_value(data); + data = readl(iommu->reg + reg + + fault_index * PRIMARY_FAULT_REG_LEN + 8); + source_id = dma_frcd_source_id(data); + + pasid_present = dma_frcd_pasid_present(data); + guest_addr = dmar_readq(iommu->reg + reg + + fault_index * PRIMARY_FAULT_REG_LEN); + guest_addr = dma_frcd_page_addr(guest_addr); + } + + /* clear the fault */ + writel(DMA_FRCD_F, iommu->reg + reg + + fault_index * PRIMARY_FAULT_REG_LEN + 12); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + + if (!ratelimited) + /* Using pasid -1 if pasid is not present */ + dmar_fault_do_one(iommu, type, fault_reason, + pasid_present ? pasid : IOMMU_PASID_INVALID, + source_id, guest_addr); + + fault_index++; + if (fault_index >= cap_num_fault_regs(iommu->cap)) + fault_index = 0; + raw_spin_lock_irqsave(&iommu->register_lock, flag); + } + + writel(DMA_FSTS_PFO | DMA_FSTS_PPF | DMA_FSTS_PRO, + iommu->reg + DMAR_FSTS_REG); + +unlock_exit: + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + return IRQ_HANDLED; +} + +int dmar_set_interrupt(struct intel_iommu *iommu) +{ + int irq, ret; + + /* + * Check if the fault interrupt is already initialized. + */ + if (iommu->irq) + return 0; + + irq = dmar_alloc_hwirq(iommu->seq_id, iommu->node, iommu); + if (irq > 0) { + iommu->irq = irq; + } else { + pr_err("No free IRQ vectors\n"); + return -EINVAL; + } + + ret = request_irq(irq, dmar_fault, IRQF_NO_THREAD, iommu->name, iommu); + if (ret) + pr_err("Can't request irq\n"); + return ret; +} + +int enable_drhd_fault_handling(unsigned int cpu) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + + /* + * Enable fault control interrupt. + */ + guard(rwsem_read)(&dmar_global_lock); + for_each_iommu(iommu, drhd) { + u32 fault_status; + int ret; + + if (iommu->irq || iommu->node != cpu_to_node(cpu)) + continue; + + ret = dmar_set_interrupt(iommu); + + if (ret) { + pr_err("DRHD %Lx: failed to enable fault, interrupt, ret %d\n", + (unsigned long long)drhd->reg_base_addr, ret); + return -1; + } + + /* + * Clear any previous faults. + */ + dmar_fault(iommu->irq, iommu); + fault_status = readl(iommu->reg + DMAR_FSTS_REG); + writel(fault_status, iommu->reg + DMAR_FSTS_REG); + } + + return 0; +} + +/* + * Re-enable Queued Invalidation interface. + */ +int dmar_reenable_qi(struct intel_iommu *iommu) +{ + if (!ecap_qis(iommu->ecap)) + return -ENOENT; + + if (!iommu->qi) + return -ENOENT; + + /* + * First disable queued invalidation. + */ + dmar_disable_qi(iommu); + /* + * Then enable queued invalidation again. Since there is no pending + * invalidation requests now, it's safe to re-enable queued + * invalidation. + */ + __dmar_enable_qi(iommu); + + return 0; +} + +/* + * Check interrupt remapping support in DMAR table description. + */ +int __init dmar_ir_support(void) +{ + struct acpi_table_dmar *dmar; + dmar = (struct acpi_table_dmar *)dmar_tbl; + if (!dmar) + return 0; + return dmar->flags & 0x1; +} + +/* Check whether DMAR units are in use */ +static inline bool dmar_in_use(void) +{ + return irq_remapping_enabled || intel_iommu_enabled; +} + +static int __init dmar_free_unused_resources(void) +{ + struct dmar_drhd_unit *dmaru, *dmaru_n; + + if (dmar_in_use()) + return 0; + + if (dmar_dev_scope_status != 1 && !list_empty(&dmar_drhd_units)) + bus_unregister_notifier(&pci_bus_type, &dmar_pci_bus_nb); + + down_write(&dmar_global_lock); + list_for_each_entry_safe(dmaru, dmaru_n, &dmar_drhd_units, list) { + list_del(&dmaru->list); + dmar_free_drhd(dmaru); + } + up_write(&dmar_global_lock); + + return 0; +} + +late_initcall(dmar_free_unused_resources); + +/* + * DMAR Hotplug Support + * For more details, please refer to Intel(R) Virtualization Technology + * for Directed-IO Architecture Specifiction, Rev 2.2, Section 8.8 + * "Remapping Hardware Unit Hot Plug". + */ +static guid_t dmar_hp_guid = + GUID_INIT(0xD8C1A3A6, 0xBE9B, 0x4C9B, + 0x91, 0xBF, 0xC3, 0xCB, 0x81, 0xFC, 0x5D, 0xAF); + +/* + * Currently there's only one revision and BIOS will not check the revision id, + * so use 0 for safety. + */ +#define DMAR_DSM_REV_ID 0 +#define DMAR_DSM_FUNC_DRHD 1 +#define DMAR_DSM_FUNC_ATSR 2 +#define DMAR_DSM_FUNC_RHSA 3 +#define DMAR_DSM_FUNC_SATC 4 + +static inline bool dmar_detect_dsm(acpi_handle handle, int func) +{ + return acpi_check_dsm(handle, &dmar_hp_guid, DMAR_DSM_REV_ID, 1 << func); +} + +static int dmar_walk_dsm_resource(acpi_handle handle, int func, + dmar_res_handler_t handler, void *arg) +{ + int ret = -ENODEV; + union acpi_object *obj; + struct acpi_dmar_header *start; + struct dmar_res_callback callback; + static int res_type[] = { + [DMAR_DSM_FUNC_DRHD] = ACPI_DMAR_TYPE_HARDWARE_UNIT, + [DMAR_DSM_FUNC_ATSR] = ACPI_DMAR_TYPE_ROOT_ATS, + [DMAR_DSM_FUNC_RHSA] = ACPI_DMAR_TYPE_HARDWARE_AFFINITY, + [DMAR_DSM_FUNC_SATC] = ACPI_DMAR_TYPE_SATC, + }; + + if (!dmar_detect_dsm(handle, func)) + return 0; + + obj = acpi_evaluate_dsm_typed(handle, &dmar_hp_guid, DMAR_DSM_REV_ID, + func, NULL, ACPI_TYPE_BUFFER); + if (!obj) + return -ENODEV; + + memset(&callback, 0, sizeof(callback)); + callback.cb[res_type[func]] = handler; + callback.arg[res_type[func]] = arg; + start = (struct acpi_dmar_header *)obj->buffer.pointer; + ret = dmar_walk_remapping_entries(start, obj->buffer.length, &callback); + + ACPI_FREE(obj); + + return ret; +} + +static int dmar_hp_add_drhd(struct acpi_dmar_header *header, void *arg) +{ + int ret; + struct dmar_drhd_unit *dmaru; + + dmaru = dmar_find_dmaru((struct acpi_dmar_hardware_unit *)header); + if (!dmaru) + return -ENODEV; + + ret = dmar_ir_hotplug(dmaru, true); + if (ret == 0) + ret = dmar_iommu_hotplug(dmaru, true); + + return ret; +} + +static int dmar_hp_remove_drhd(struct acpi_dmar_header *header, void *arg) +{ + int i, ret; + struct device *dev; + struct dmar_drhd_unit *dmaru; + + dmaru = dmar_find_dmaru((struct acpi_dmar_hardware_unit *)header); + if (!dmaru) + return 0; + + /* + * All PCI devices managed by this unit should have been destroyed. + */ + if (!dmaru->include_all && dmaru->devices && dmaru->devices_cnt) { + for_each_active_dev_scope(dmaru->devices, + dmaru->devices_cnt, i, dev) + return -EBUSY; + } + + ret = dmar_ir_hotplug(dmaru, false); + if (ret == 0) + ret = dmar_iommu_hotplug(dmaru, false); + + return ret; +} + +static int dmar_hp_release_drhd(struct acpi_dmar_header *header, void *arg) +{ + struct dmar_drhd_unit *dmaru; + + dmaru = dmar_find_dmaru((struct acpi_dmar_hardware_unit *)header); + if (dmaru) { + list_del_rcu(&dmaru->list); + synchronize_rcu(); + dmar_free_drhd(dmaru); + } + + return 0; +} + +static int dmar_hotplug_insert(acpi_handle handle) +{ + int ret; + int drhd_count = 0; + + ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD, + &dmar_validate_one_drhd, (void *)1); + if (ret) + goto out; + + ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD, + &dmar_parse_one_drhd, (void *)&drhd_count); + if (ret == 0 && drhd_count == 0) { + pr_warn(FW_BUG "No DRHD structures in buffer returned by _DSM method\n"); + goto out; + } else if (ret) { + goto release_drhd; + } + + ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_RHSA, + &dmar_parse_one_rhsa, NULL); + if (ret) + goto release_drhd; + + ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_ATSR, + &dmar_parse_one_atsr, NULL); + if (ret) + goto release_atsr; + + ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD, + &dmar_hp_add_drhd, NULL); + if (!ret) + return 0; + + dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD, + &dmar_hp_remove_drhd, NULL); +release_atsr: + dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_ATSR, + &dmar_release_one_atsr, NULL); +release_drhd: + dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD, + &dmar_hp_release_drhd, NULL); +out: + return ret; +} + +static int dmar_hotplug_remove(acpi_handle handle) +{ + int ret; + + ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_ATSR, + &dmar_check_one_atsr, NULL); + if (ret) + return ret; + + ret = dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD, + &dmar_hp_remove_drhd, NULL); + if (ret == 0) { + WARN_ON(dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_ATSR, + &dmar_release_one_atsr, NULL)); + WARN_ON(dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD, + &dmar_hp_release_drhd, NULL)); + } else { + dmar_walk_dsm_resource(handle, DMAR_DSM_FUNC_DRHD, + &dmar_hp_add_drhd, NULL); + } + + return ret; +} + +static acpi_status dmar_get_dsm_handle(acpi_handle handle, u32 lvl, + void *context, void **retval) +{ + acpi_handle *phdl = retval; + + if (dmar_detect_dsm(handle, DMAR_DSM_FUNC_DRHD)) { + *phdl = handle; + return AE_CTRL_TERMINATE; + } + + return AE_OK; +} + +static int dmar_device_hotplug(acpi_handle handle, bool insert) +{ + int ret; + acpi_handle tmp = NULL; + acpi_status status; + + if (!dmar_in_use()) + return 0; + + if (dmar_detect_dsm(handle, DMAR_DSM_FUNC_DRHD)) { + tmp = handle; + } else { + status = acpi_walk_namespace(ACPI_TYPE_DEVICE, handle, + ACPI_UINT32_MAX, + dmar_get_dsm_handle, + NULL, NULL, &tmp); + if (ACPI_FAILURE(status)) { + pr_warn("Failed to locate _DSM method.\n"); + return -ENXIO; + } + } + if (tmp == NULL) + return 0; + + down_write(&dmar_global_lock); + if (insert) + ret = dmar_hotplug_insert(tmp); + else + ret = dmar_hotplug_remove(tmp); + up_write(&dmar_global_lock); + + return ret; +} + +int dmar_device_add(acpi_handle handle) +{ + return dmar_device_hotplug(handle, true); +} + +int dmar_device_remove(acpi_handle handle) +{ + return dmar_device_hotplug(handle, false); +} + +/* + * dmar_platform_optin - Is %DMA_CTRL_PLATFORM_OPT_IN_FLAG set in DMAR table + * + * Returns true if the platform has %DMA_CTRL_PLATFORM_OPT_IN_FLAG set in + * the ACPI DMAR table. This means that the platform boot firmware has made + * sure no device can issue DMA outside of RMRR regions. + */ +bool dmar_platform_optin(void) +{ + struct acpi_table_dmar *dmar; + acpi_status status; + bool ret; + + status = acpi_get_table(ACPI_SIG_DMAR, 0, + (struct acpi_table_header **)&dmar); + if (ACPI_FAILURE(status)) + return false; + + ret = !!(dmar->flags & DMAR_PLATFORM_OPT_IN); + acpi_put_table((struct acpi_table_header *)dmar); + + return ret; +} +EXPORT_SYMBOL_GPL(dmar_platform_optin); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c new file mode 100644 index 000000000000..134302fbcd92 --- /dev/null +++ b/drivers/iommu/intel/iommu.c @@ -0,0 +1,4222 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright © 2006-2014 Intel Corporation. + * + * Authors: David Woodhouse <dwmw2@infradead.org>, + * Ashok Raj <ashok.raj@intel.com>, + * Shaohua Li <shaohua.li@intel.com>, + * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, + * Fenghua Yu <fenghua.yu@intel.com> + * Joerg Roedel <jroedel@suse.de> + */ + +#define pr_fmt(fmt) "DMAR: " fmt +#define dev_fmt(fmt) pr_fmt(fmt) + +#include <linux/crash_dump.h> +#include <linux/dma-direct.h> +#include <linux/dmi.h> +#include <linux/memory.h> +#include <linux/pci.h> +#include <linux/pci-ats.h> +#include <linux/spinlock.h> +#include <linux/syscore_ops.h> +#include <linux/tboot.h> +#include <uapi/linux/iommufd.h> + +#include "iommu.h" +#include "../dma-iommu.h" +#include "../irq_remapping.h" +#include "../iommu-pages.h" +#include "pasid.h" +#include "perfmon.h" + +#define ROOT_SIZE VTD_PAGE_SIZE +#define CONTEXT_SIZE VTD_PAGE_SIZE + +#define IS_GFX_DEVICE(pdev) pci_is_display(pdev) +#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) +#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) +#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) + +#define IOAPIC_RANGE_START (0xfee00000) +#define IOAPIC_RANGE_END (0xfeefffff) +#define IOVA_START_ADDR (0x1000) + +#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 + +static void __init check_tylersburg_isoch(void); +static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable); +static int rwbf_quirk; + +#define rwbf_required(iommu) (rwbf_quirk || cap_rwbf((iommu)->cap)) + +/* + * set to 1 to panic kernel if can't successfully enable VT-d + * (used when kernel is launched w/ TXT) + */ +static int force_on = 0; +static int intel_iommu_tboot_noforce; +static int no_platform_optin; + +#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) + +/* + * Take a root_entry and return the Lower Context Table Pointer (LCTP) + * if marked present. + */ +static phys_addr_t root_entry_lctp(struct root_entry *re) +{ + if (!(re->lo & 1)) + return 0; + + return re->lo & VTD_PAGE_MASK; +} + +/* + * Take a root_entry and return the Upper Context Table Pointer (UCTP) + * if marked present. + */ +static phys_addr_t root_entry_uctp(struct root_entry *re) +{ + if (!(re->hi & 1)) + return 0; + + return re->hi & VTD_PAGE_MASK; +} + +static int device_rid_cmp_key(const void *key, const struct rb_node *node) +{ + struct device_domain_info *info = + rb_entry(node, struct device_domain_info, node); + const u16 *rid_lhs = key; + + if (*rid_lhs < PCI_DEVID(info->bus, info->devfn)) + return -1; + + if (*rid_lhs > PCI_DEVID(info->bus, info->devfn)) + return 1; + + return 0; +} + +static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs) +{ + struct device_domain_info *info = + rb_entry(lhs, struct device_domain_info, node); + u16 key = PCI_DEVID(info->bus, info->devfn); + + return device_rid_cmp_key(&key, rhs); +} + +/* + * Looks up an IOMMU-probed device using its source ID. + * + * Returns the pointer to the device if there is a match. Otherwise, + * returns NULL. + * + * Note that this helper doesn't guarantee that the device won't be + * released by the iommu subsystem after being returned. The caller + * should use its own synchronization mechanism to avoid the device + * being released during its use if its possibly the case. + */ +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid) +{ + struct device_domain_info *info = NULL; + struct rb_node *node; + unsigned long flags; + + spin_lock_irqsave(&iommu->device_rbtree_lock, flags); + node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key); + if (node) + info = rb_entry(node, struct device_domain_info, node); + spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); + + return info ? info->dev : NULL; +} + +static int device_rbtree_insert(struct intel_iommu *iommu, + struct device_domain_info *info) +{ + struct rb_node *curr; + unsigned long flags; + + spin_lock_irqsave(&iommu->device_rbtree_lock, flags); + curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp); + spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); + if (WARN_ON(curr)) + return -EEXIST; + + return 0; +} + +static void device_rbtree_remove(struct device_domain_info *info) +{ + struct intel_iommu *iommu = info->iommu; + unsigned long flags; + + spin_lock_irqsave(&iommu->device_rbtree_lock, flags); + rb_erase(&info->node, &iommu->device_rbtree); + spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); +} + +struct dmar_rmrr_unit { + struct list_head list; /* list of rmrr units */ + struct acpi_dmar_header *hdr; /* ACPI header */ + u64 base_address; /* reserved base address*/ + u64 end_address; /* reserved end address */ + struct dmar_dev_scope *devices; /* target devices */ + int devices_cnt; /* target device count */ +}; + +struct dmar_atsr_unit { + struct list_head list; /* list of ATSR units */ + struct acpi_dmar_header *hdr; /* ACPI header */ + struct dmar_dev_scope *devices; /* target devices */ + int devices_cnt; /* target device count */ + u8 include_all:1; /* include all ports */ +}; + +struct dmar_satc_unit { + struct list_head list; /* list of SATC units */ + struct acpi_dmar_header *hdr; /* ACPI header */ + struct dmar_dev_scope *devices; /* target devices */ + struct intel_iommu *iommu; /* the corresponding iommu */ + int devices_cnt; /* target device count */ + u8 atc_required:1; /* ATS is required */ +}; + +static LIST_HEAD(dmar_atsr_units); +static LIST_HEAD(dmar_rmrr_units); +static LIST_HEAD(dmar_satc_units); + +#define for_each_rmrr_units(rmrr) \ + list_for_each_entry(rmrr, &dmar_rmrr_units, list) + +static void intel_iommu_domain_free(struct iommu_domain *domain); + +int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); +int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); + +int intel_iommu_enabled = 0; +EXPORT_SYMBOL_GPL(intel_iommu_enabled); + +static int intel_iommu_superpage = 1; +static int iommu_identity_mapping; +static int iommu_skip_te_disable; +static int disable_igfx_iommu; + +#define IDENTMAP_AZALIA 4 + +const struct iommu_ops intel_iommu_ops; + +static bool translation_pre_enabled(struct intel_iommu *iommu) +{ + return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); +} + +static void clear_translation_pre_enabled(struct intel_iommu *iommu) +{ + iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; +} + +static void init_translation_status(struct intel_iommu *iommu) +{ + u32 gsts; + + gsts = readl(iommu->reg + DMAR_GSTS_REG); + if (gsts & DMA_GSTS_TES) + iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; +} + +static int __init intel_iommu_setup(char *str) +{ + if (!str) + return -EINVAL; + + while (*str) { + if (!strncmp(str, "on", 2)) { + dmar_disabled = 0; + pr_info("IOMMU enabled\n"); + } else if (!strncmp(str, "off", 3)) { + dmar_disabled = 1; + no_platform_optin = 1; + pr_info("IOMMU disabled\n"); + } else if (!strncmp(str, "igfx_off", 8)) { + disable_igfx_iommu = 1; + pr_info("Disable GFX device mapping\n"); + } else if (!strncmp(str, "forcedac", 8)) { + pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); + iommu_dma_forcedac = true; + } else if (!strncmp(str, "strict", 6)) { + pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); + iommu_set_dma_strict(); + } else if (!strncmp(str, "sp_off", 6)) { + pr_info("Disable supported super page\n"); + intel_iommu_superpage = 0; + } else if (!strncmp(str, "sm_on", 5)) { + pr_info("Enable scalable mode if hardware supports\n"); + intel_iommu_sm = 1; + } else if (!strncmp(str, "sm_off", 6)) { + pr_info("Scalable mode is disallowed\n"); + intel_iommu_sm = 0; + } else if (!strncmp(str, "tboot_noforce", 13)) { + pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); + intel_iommu_tboot_noforce = 1; + } else { + pr_notice("Unknown option - '%s'\n", str); + } + + str += strcspn(str, ","); + while (*str == ',') + str++; + } + + return 1; +} +__setup("intel_iommu=", intel_iommu_setup); + +/* + * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. + * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of + * the returned SAGAW. + */ +static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu) +{ + unsigned long fl_sagaw, sl_sagaw; + + fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0); + sl_sagaw = cap_sagaw(iommu->cap); + + /* Second level only. */ + if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) + return sl_sagaw; + + /* First level only. */ + if (!ecap_slts(iommu->ecap)) + return fl_sagaw; + + return fl_sagaw & sl_sagaw; +} + +static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) +{ + unsigned long sagaw; + int agaw; + + sagaw = __iommu_calculate_sagaw(iommu); + for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) { + if (test_bit(agaw, &sagaw)) + break; + } + + return agaw; +} + +/* + * Calculate max SAGAW for each iommu. + */ +int iommu_calculate_max_sagaw(struct intel_iommu *iommu) +{ + return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); +} + +/* + * calculate agaw for each iommu. + * "SAGAW" may be different across iommus, use a default agaw, and + * get a supported less agaw for iommus that don't support the default agaw. + */ +int iommu_calculate_agaw(struct intel_iommu *iommu) +{ + return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); +} + +static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) +{ + return sm_supported(iommu) ? + ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); +} + +struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, + u8 devfn, int alloc) +{ + struct root_entry *root = &iommu->root_entry[bus]; + struct context_entry *context; + u64 *entry; + + /* + * Except that the caller requested to allocate a new entry, + * returning a copied context entry makes no sense. + */ + if (!alloc && context_copied(iommu, bus, devfn)) + return NULL; + + entry = &root->lo; + if (sm_supported(iommu)) { + if (devfn >= 0x80) { + devfn -= 0x80; + entry = &root->hi; + } + devfn *= 2; + } + if (*entry & 1) + context = phys_to_virt(*entry & VTD_PAGE_MASK); + else { + unsigned long phy_addr; + if (!alloc) + return NULL; + + context = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, + SZ_4K); + if (!context) + return NULL; + + __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); + phy_addr = virt_to_phys((void *)context); + *entry = phy_addr | 1; + __iommu_flush_cache(iommu, entry, sizeof(*entry)); + } + return &context[devfn]; +} + +/** + * is_downstream_to_pci_bridge - test if a device belongs to the PCI + * sub-hierarchy of a candidate PCI-PCI bridge + * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy + * @bridge: the candidate PCI-PCI bridge + * + * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. + */ +static bool +is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) +{ + struct pci_dev *pdev, *pbridge; + + if (!dev_is_pci(dev) || !dev_is_pci(bridge)) + return false; + + pdev = to_pci_dev(dev); + pbridge = to_pci_dev(bridge); + + if (pbridge->subordinate && + pbridge->subordinate->number <= pdev->bus->number && + pbridge->subordinate->busn_res.end >= pdev->bus->number) + return true; + + return false; +} + +static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) +{ + struct dmar_drhd_unit *drhd; + u32 vtbar; + int rc; + + /* We know that this device on this chipset has its own IOMMU. + * If we find it under a different IOMMU, then the BIOS is lying + * to us. Hope that the IOMMU for this device is actually + * disabled, and it needs no translation... + */ + rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); + if (rc) { + /* "can't" happen */ + dev_info(&pdev->dev, "failed to run vt-d quirk\n"); + return false; + } + vtbar &= 0xffff0000; + + /* we know that the this iommu should be at offset 0xa000 from vtbar */ + drhd = dmar_find_matched_drhd_unit(pdev); + if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { + pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + return true; + } + + return false; +} + +static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) +{ + if (!iommu || iommu->drhd->ignored) + return true; + + if (dev_is_pci(dev)) { + struct pci_dev *pdev = to_pci_dev(dev); + + if (pdev->vendor == PCI_VENDOR_ID_INTEL && + pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && + quirk_ioat_snb_local_iommu(pdev)) + return true; + } + + return false; +} + +static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn) +{ + struct dmar_drhd_unit *drhd = NULL; + struct pci_dev *pdev = NULL; + struct intel_iommu *iommu; + struct device *tmp; + u16 segment = 0; + int i; + + if (!dev) + return NULL; + + if (dev_is_pci(dev)) { + struct pci_dev *pf_pdev; + + pdev = pci_real_dma_dev(to_pci_dev(dev)); + + /* VFs aren't listed in scope tables; we need to look up + * the PF instead to find the IOMMU. */ + pf_pdev = pci_physfn(pdev); + dev = &pf_pdev->dev; + segment = pci_domain_nr(pdev->bus); + } else if (has_acpi_companion(dev)) + dev = &ACPI_COMPANION(dev)->dev; + + rcu_read_lock(); + for_each_iommu(iommu, drhd) { + if (pdev && segment != drhd->segment) + continue; + + for_each_active_dev_scope(drhd->devices, + drhd->devices_cnt, i, tmp) { + if (tmp == dev) { + /* For a VF use its original BDF# not that of the PF + * which we used for the IOMMU lookup. Strictly speaking + * we could do this for all PCI devices; we only need to + * get the BDF# from the scope table for ACPI matches. */ + if (pdev && pdev->is_virtfn) + goto got_pdev; + + if (bus && devfn) { + *bus = drhd->devices[i].bus; + *devfn = drhd->devices[i].devfn; + } + goto out; + } + + if (is_downstream_to_pci_bridge(dev, tmp)) + goto got_pdev; + } + + if (pdev && drhd->include_all) { +got_pdev: + if (bus && devfn) { + *bus = pdev->bus->number; + *devfn = pdev->devfn; + } + goto out; + } + } + iommu = NULL; +out: + if (iommu_is_dummy(iommu, dev)) + iommu = NULL; + + rcu_read_unlock(); + + return iommu; +} + +static void free_context_table(struct intel_iommu *iommu) +{ + struct context_entry *context; + int i; + + if (!iommu->root_entry) + return; + + for (i = 0; i < ROOT_ENTRY_NR; i++) { + context = iommu_context_addr(iommu, i, 0, 0); + if (context) + iommu_free_pages(context); + + if (!sm_supported(iommu)) + continue; + + context = iommu_context_addr(iommu, i, 0x80, 0); + if (context) + iommu_free_pages(context); + } + + iommu_free_pages(iommu->root_entry); + iommu->root_entry = NULL; +} + +#ifdef CONFIG_DMAR_DEBUG +static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, + u8 bus, u8 devfn, struct dma_pte *parent, int level) +{ + struct dma_pte *pte; + int offset; + + while (1) { + offset = pfn_level_offset(pfn, level); + pte = &parent[offset]; + + pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); + + if (!dma_pte_present(pte)) { + pr_info("page table not present at level %d\n", level - 1); + break; + } + + if (level == 1 || dma_pte_superpage(pte)) + break; + + parent = phys_to_virt(dma_pte_addr(pte)); + level--; + } +} + +void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, + unsigned long long addr, u32 pasid) +{ + struct pasid_dir_entry *dir, *pde; + struct pasid_entry *entries, *pte; + struct context_entry *ctx_entry; + struct root_entry *rt_entry; + int i, dir_index, index, level; + u8 devfn = source_id & 0xff; + u8 bus = source_id >> 8; + struct dma_pte *pgtable; + + pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); + + /* root entry dump */ + if (!iommu->root_entry) { + pr_info("root table is not present\n"); + return; + } + rt_entry = &iommu->root_entry[bus]; + + if (sm_supported(iommu)) + pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", + rt_entry->hi, rt_entry->lo); + else + pr_info("root entry: 0x%016llx", rt_entry->lo); + + /* context entry dump */ + ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); + if (!ctx_entry) { + pr_info("context table is not present\n"); + return; + } + + pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", + ctx_entry->hi, ctx_entry->lo); + + /* legacy mode does not require PASID entries */ + if (!sm_supported(iommu)) { + if (!context_present(ctx_entry)) { + pr_info("legacy mode page table is not present\n"); + return; + } + level = agaw_to_level(ctx_entry->hi & 7); + pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); + goto pgtable_walk; + } + + if (!context_present(ctx_entry)) { + pr_info("pasid directory table is not present\n"); + return; + } + + /* get the pointer to pasid directory entry */ + dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); + + /* For request-without-pasid, get the pasid from context entry */ + if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID) + pasid = IOMMU_NO_PASID; + + dir_index = pasid >> PASID_PDE_SHIFT; + pde = &dir[dir_index]; + pr_info("pasid dir entry: 0x%016llx\n", pde->val); + + /* get the pointer to the pasid table entry */ + entries = get_pasid_table_from_pde(pde); + if (!entries) { + pr_info("pasid table is not present\n"); + return; + } + index = pasid & PASID_PTE_MASK; + pte = &entries[index]; + for (i = 0; i < ARRAY_SIZE(pte->val); i++) + pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); + + if (!pasid_pte_is_present(pte)) { + pr_info("scalable mode page table is not present\n"); + return; + } + + if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) { + level = pte->val[2] & BIT_ULL(2) ? 5 : 4; + pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK); + } else { + level = agaw_to_level((pte->val[0] >> 2) & 0x7); + pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK); + } + +pgtable_walk: + pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level); +} +#endif + +/* iommu handling */ +static int iommu_alloc_root_entry(struct intel_iommu *iommu) +{ + struct root_entry *root; + + root = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, SZ_4K); + if (!root) { + pr_err("Allocating root entry for %s failed\n", + iommu->name); + return -ENOMEM; + } + + __iommu_flush_cache(iommu, root, ROOT_SIZE); + iommu->root_entry = root; + + return 0; +} + +static void iommu_set_root_entry(struct intel_iommu *iommu) +{ + u64 addr; + u32 sts; + unsigned long flag; + + addr = virt_to_phys(iommu->root_entry); + if (sm_supported(iommu)) + addr |= DMA_RTADDR_SMT; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); + dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); + + writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (sts & DMA_GSTS_RTPS), sts); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + + /* + * Hardware invalidates all DMA remapping hardware translation + * caches as part of SRTP flow. + */ + if (cap_esrtps(iommu->cap)) + return; + + iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); + if (sm_supported(iommu)) + qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); +} + +void iommu_flush_write_buffer(struct intel_iommu *iommu) +{ + u32 val; + unsigned long flag; + + if (!rwbf_quirk && !cap_rwbf(iommu->cap)) + return; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); + writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (!(val & DMA_GSTS_WBFS)), val); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +/* return value determine if we need a write buffer flush */ +static void __iommu_flush_context(struct intel_iommu *iommu, + u16 did, u16 source_id, u8 function_mask, + u64 type) +{ + u64 val = 0; + unsigned long flag; + + switch (type) { + case DMA_CCMD_GLOBAL_INVL: + val = DMA_CCMD_GLOBAL_INVL; + break; + case DMA_CCMD_DOMAIN_INVL: + val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); + break; + case DMA_CCMD_DEVICE_INVL: + val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) + | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); + break; + default: + pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n", + iommu->name, type); + return; + } + val |= DMA_CCMD_ICC; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); + dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, + dmar_readq, (!(val & DMA_CCMD_ICC)), val); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type) +{ + int tlb_offset = ecap_iotlb_offset(iommu->ecap); + u64 val = 0, val_iva = 0; + unsigned long flag; + + switch (type) { + case DMA_TLB_GLOBAL_FLUSH: + /* global flush doesn't need set IVA_REG */ + val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; + break; + case DMA_TLB_DSI_FLUSH: + val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); + break; + case DMA_TLB_PSI_FLUSH: + val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); + /* IH bit is passed in as part of address */ + val_iva = size_order | addr; + break; + default: + pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n", + iommu->name, type); + return; + } + + if (cap_write_drain(iommu->cap)) + val |= DMA_TLB_WRITE_DRAIN; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); + /* Note: Only uses first TLB reg currently */ + if (val_iva) + dmar_writeq(iommu->reg + tlb_offset, val_iva); + dmar_writeq(iommu->reg + tlb_offset + 8, val); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, tlb_offset + 8, + dmar_readq, (!(val & DMA_TLB_IVT)), val); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + + /* check IOTLB invalidation granularity */ + if (DMA_TLB_IAIG(val) == 0) + pr_err("Flush IOTLB failed\n"); + if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) + pr_debug("TLB flush request %Lx, actual %Lx\n", + (unsigned long long)DMA_TLB_IIRG(type), + (unsigned long long)DMA_TLB_IAIG(val)); +} + +static struct device_domain_info * +domain_lookup_dev_info(struct dmar_domain *domain, + struct intel_iommu *iommu, u8 bus, u8 devfn) +{ + struct device_domain_info *info; + unsigned long flags; + + spin_lock_irqsave(&domain->lock, flags); + list_for_each_entry(info, &domain->devices, link) { + if (info->iommu == iommu && info->bus == bus && + info->devfn == devfn) { + spin_unlock_irqrestore(&domain->lock, flags); + return info; + } + } + spin_unlock_irqrestore(&domain->lock, flags); + + return NULL; +} + +/* + * The extra devTLB flush quirk impacts those QAT devices with PCI device + * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() + * check because it applies only to the built-in QAT devices and it doesn't + * grant additional privileges. + */ +#define BUGGY_QAT_DEVID_MASK 0x4940 +static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) +{ + if (pdev->vendor != PCI_VENDOR_ID_INTEL) + return false; + + if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK) + return false; + + return true; +} + +static void iommu_enable_pci_ats(struct device_domain_info *info) +{ + struct pci_dev *pdev; + + if (!info->ats_supported) + return; + + pdev = to_pci_dev(info->dev); + if (!pci_ats_page_aligned(pdev)) + return; + + if (!pci_enable_ats(pdev, VTD_PAGE_SHIFT)) + info->ats_enabled = 1; +} + +static void iommu_disable_pci_ats(struct device_domain_info *info) +{ + if (!info->ats_enabled) + return; + + pci_disable_ats(to_pci_dev(info->dev)); + info->ats_enabled = 0; +} + +static void iommu_enable_pci_pri(struct device_domain_info *info) +{ + struct pci_dev *pdev; + + if (!info->ats_enabled || !info->pri_supported) + return; + + pdev = to_pci_dev(info->dev); + /* PASID is required in PRG Response Message. */ + if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) + return; + + if (pci_reset_pri(pdev)) + return; + + if (!pci_enable_pri(pdev, PRQ_DEPTH)) + info->pri_enabled = 1; +} + +static void iommu_disable_pci_pri(struct device_domain_info *info) +{ + if (!info->pri_enabled) + return; + + if (WARN_ON(info->iopf_refcount)) + iopf_queue_remove_device(info->iommu->iopf_queue, info->dev); + + pci_disable_pri(to_pci_dev(info->dev)); + info->pri_enabled = 0; +} + +static void intel_flush_iotlb_all(struct iommu_domain *domain) +{ + cache_tag_flush_all(to_dmar_domain(domain)); +} + +static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) +{ + u32 pmen; + unsigned long flags; + + if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) + return; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + pmen = readl(iommu->reg + DMAR_PMEN_REG); + pmen &= ~DMA_PMEN_EPM; + writel(pmen, iommu->reg + DMAR_PMEN_REG); + + /* wait for the protected region status bit to clear */ + IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, + readl, !(pmen & DMA_PMEN_PRS), pmen); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +static void iommu_enable_translation(struct intel_iommu *iommu) +{ + u32 sts; + unsigned long flags; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + iommu->gcmd |= DMA_GCMD_TE; + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (sts & DMA_GSTS_TES), sts); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +static void iommu_disable_translation(struct intel_iommu *iommu) +{ + u32 sts; + unsigned long flag; + + if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && + (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) + return; + + raw_spin_lock_irqsave(&iommu->register_lock, flag); + iommu->gcmd &= ~DMA_GCMD_TE; + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + + /* Make sure hardware complete it */ + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (!(sts & DMA_GSTS_TES)), sts); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); +} + +static void disable_dmar_iommu(struct intel_iommu *iommu) +{ + /* + * All iommu domains must have been detached from the devices, + * hence there should be no domain IDs in use. + */ + if (WARN_ON(!ida_is_empty(&iommu->domain_ida))) + return; + + if (iommu->gcmd & DMA_GCMD_TE) + iommu_disable_translation(iommu); +} + +static void free_dmar_iommu(struct intel_iommu *iommu) +{ + if (iommu->copied_tables) { + bitmap_free(iommu->copied_tables); + iommu->copied_tables = NULL; + } + + /* free context mapping */ + free_context_table(iommu); + + if (ecap_prs(iommu->ecap)) + intel_iommu_finish_prq(iommu); +} + +/* + * Check and return whether first level is used by default for + * DMA translation. + */ +static bool first_level_by_default(struct intel_iommu *iommu) +{ + /* Only SL is available in legacy mode */ + if (!sm_supported(iommu)) + return false; + + /* Only level (either FL or SL) is available, just use it */ + if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap)) + return ecap_flts(iommu->ecap); + + return true; +} + +int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) +{ + struct iommu_domain_info *info, *curr; + int num, ret = -ENOSPC; + + if (domain->domain.type == IOMMU_DOMAIN_SVA) + return 0; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + guard(mutex)(&iommu->did_lock); + curr = xa_load(&domain->iommu_array, iommu->seq_id); + if (curr) { + curr->refcnt++; + kfree(info); + return 0; + } + + num = ida_alloc_range(&iommu->domain_ida, IDA_START_DID, + cap_ndoms(iommu->cap) - 1, GFP_KERNEL); + if (num < 0) { + pr_err("%s: No free domain ids\n", iommu->name); + goto err_unlock; + } + + info->refcnt = 1; + info->did = num; + info->iommu = iommu; + curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id, + NULL, info, GFP_KERNEL); + if (curr) { + ret = xa_err(curr) ? : -EBUSY; + goto err_clear; + } + + return 0; + +err_clear: + ida_free(&iommu->domain_ida, info->did); +err_unlock: + kfree(info); + return ret; +} + +void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) +{ + struct iommu_domain_info *info; + + if (domain->domain.type == IOMMU_DOMAIN_SVA) + return; + + guard(mutex)(&iommu->did_lock); + info = xa_load(&domain->iommu_array, iommu->seq_id); + if (--info->refcnt == 0) { + ida_free(&iommu->domain_ida, info->did); + xa_erase(&domain->iommu_array, iommu->seq_id); + kfree(info); + } +} + +/* + * For kdump cases, old valid entries may be cached due to the + * in-flight DMA and copied pgtable, but there is no unmapping + * behaviour for them, thus we need an explicit cache flush for + * the newly-mapped device. For kdump, at this point, the device + * is supposed to finish reset at its driver probe stage, so no + * in-flight DMA will exist, and we don't need to worry anymore + * hereafter. + */ +static void copied_context_tear_down(struct intel_iommu *iommu, + struct context_entry *context, + u8 bus, u8 devfn) +{ + u16 did_old; + + if (!context_copied(iommu, bus, devfn)) + return; + + assert_spin_locked(&iommu->lock); + + did_old = context_domain_id(context); + context_clear_entry(context); + + if (did_old < cap_ndoms(iommu->cap)) { + iommu->flush.flush_context(iommu, did_old, + PCI_DEVID(bus, devfn), + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + iommu->flush.flush_iotlb(iommu, did_old, 0, 0, + DMA_TLB_DSI_FLUSH); + } + + clear_context_copied(iommu, bus, devfn); +} + +/* + * It's a non-present to present mapping. If hardware doesn't cache + * non-present entry we only need to flush the write-buffer. If the + * _does_ cache non-present entries, then it does so in the special + * domain #0, which we have to flush: + */ +static void context_present_cache_flush(struct intel_iommu *iommu, u16 did, + u8 bus, u8 devfn) +{ + if (cap_caching_mode(iommu->cap)) { + iommu->flush.flush_context(iommu, 0, + PCI_DEVID(bus, devfn), + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); + } else { + iommu_flush_write_buffer(iommu); + } +} + +static int domain_context_mapping_one(struct dmar_domain *domain, + struct intel_iommu *iommu, + u8 bus, u8 devfn) +{ + struct device_domain_info *info = + domain_lookup_dev_info(domain, iommu, bus, devfn); + u16 did = domain_id_iommu(domain, iommu); + int translation = CONTEXT_TT_MULTI_LEVEL; + struct pt_iommu_vtdss_hw_info pt_info; + struct context_entry *context; + int ret; + + if (WARN_ON(!intel_domain_is_ss_paging(domain))) + return -EINVAL; + + pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info); + + pr_debug("Set context mapping for %02x:%02x.%d\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + + spin_lock(&iommu->lock); + ret = -ENOMEM; + context = iommu_context_addr(iommu, bus, devfn, 1); + if (!context) + goto out_unlock; + + ret = 0; + if (context_present(context) && !context_copied(iommu, bus, devfn)) + goto out_unlock; + + copied_context_tear_down(iommu, context, bus, devfn); + context_clear_entry(context); + context_set_domain_id(context, did); + + if (info && info->ats_supported) + translation = CONTEXT_TT_DEV_IOTLB; + else + translation = CONTEXT_TT_MULTI_LEVEL; + + context_set_address_root(context, pt_info.ssptptr); + context_set_address_width(context, pt_info.aw); + context_set_translation_type(context, translation); + context_set_fault_enable(context); + context_set_present(context); + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(context, sizeof(*context)); + context_present_cache_flush(iommu, did, bus, devfn); + ret = 0; + +out_unlock: + spin_unlock(&iommu->lock); + + return ret; +} + +static int domain_context_mapping_cb(struct pci_dev *pdev, + u16 alias, void *opaque) +{ + struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev); + struct intel_iommu *iommu = info->iommu; + struct dmar_domain *domain = opaque; + + return domain_context_mapping_one(domain, iommu, + PCI_BUS_NUM(alias), alias & 0xff); +} + +static int +domain_context_mapping(struct dmar_domain *domain, struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + u8 bus = info->bus, devfn = info->devfn; + int ret; + + if (!dev_is_pci(dev)) + return domain_context_mapping_one(domain, iommu, bus, devfn); + + ret = pci_for_each_dma_alias(to_pci_dev(dev), + domain_context_mapping_cb, domain); + if (ret) + return ret; + + iommu_enable_pci_ats(info); + + return 0; +} + +static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) +{ + struct intel_iommu *iommu = info->iommu; + struct context_entry *context; + u16 did; + + spin_lock(&iommu->lock); + context = iommu_context_addr(iommu, bus, devfn, 0); + if (!context) { + spin_unlock(&iommu->lock); + return; + } + + did = context_domain_id(context); + context_clear_entry(context); + __iommu_flush_cache(iommu, context, sizeof(*context)); + spin_unlock(&iommu->lock); + intel_context_flush_no_pasid(info, context, did); +} + +int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev, + ioasid_t pasid, u16 did, phys_addr_t fsptptr, + int flags, struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_first_level(iommu, dev, fsptptr, pasid, + did, flags); + return intel_pasid_replace_first_level(iommu, dev, fsptptr, pasid, did, + iommu_domain_did(old, iommu), + flags); +} + +static int domain_setup_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_second_level(iommu, domain, + dev, pasid); + return intel_pasid_replace_second_level(iommu, domain, dev, + iommu_domain_did(old, iommu), + pasid); +} + +static int domain_setup_passthrough(struct intel_iommu *iommu, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_pass_through(iommu, dev, pasid); + return intel_pasid_replace_pass_through(iommu, dev, + iommu_domain_did(old, iommu), + pasid); +} + +static int domain_setup_first_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, + u32 pasid, struct iommu_domain *old) +{ + struct pt_iommu_x86_64_hw_info pt_info; + unsigned int flags = 0; + + pt_iommu_x86_64_hw_info(&domain->fspt, &pt_info); + if (WARN_ON(pt_info.levels != 4 && pt_info.levels != 5)) + return -EINVAL; + + if (pt_info.levels == 5) + flags |= PASID_FLAG_FL5LP; + + if (domain->force_snooping) + flags |= PASID_FLAG_PAGE_SNOOP; + + if (!(domain->fspt.x86_64_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))) + flags |= PASID_FLAG_PWSNP; + + return __domain_setup_first_level(iommu, dev, pasid, + domain_id_iommu(domain, iommu), + pt_info.gcr3_pt, flags, old); +} + +static int dmar_domain_attach_device(struct dmar_domain *domain, + struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + unsigned long flags; + int ret; + + ret = domain_attach_iommu(domain, iommu); + if (ret) + return ret; + + info->domain = domain; + info->domain_attached = true; + spin_lock_irqsave(&domain->lock, flags); + list_add(&info->link, &domain->devices); + spin_unlock_irqrestore(&domain->lock, flags); + + if (dev_is_real_dma_subdevice(dev)) + return 0; + + if (!sm_supported(iommu)) + ret = domain_context_mapping(domain, dev); + else if (intel_domain_is_fs_paging(domain)) + ret = domain_setup_first_level(iommu, domain, dev, + IOMMU_NO_PASID, NULL); + else if (intel_domain_is_ss_paging(domain)) + ret = domain_setup_second_level(iommu, domain, dev, + IOMMU_NO_PASID, NULL); + else if (WARN_ON(true)) + ret = -EINVAL; + + if (ret) + goto out_block_translation; + + ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID); + if (ret) + goto out_block_translation; + + return 0; + +out_block_translation: + device_block_translation(dev); + return ret; +} + +/** + * device_rmrr_is_relaxable - Test whether the RMRR of this device + * is relaxable (ie. is allowed to be not enforced under some conditions) + * @dev: device handle + * + * We assume that PCI USB devices with RMRRs have them largely + * for historical reasons and that the RMRR space is not actively used post + * boot. This exclusion may change if vendors begin to abuse it. + * + * The same exception is made for graphics devices, with the requirement that + * any use of the RMRR regions will be torn down before assigning the device + * to a guest. + * + * Return: true if the RMRR is relaxable, false otherwise + */ +static bool device_rmrr_is_relaxable(struct device *dev) +{ + struct pci_dev *pdev; + + if (!dev_is_pci(dev)) + return false; + + pdev = to_pci_dev(dev); + if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) + return true; + else + return false; +} + +static int device_def_domain_type(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + + /* + * Hardware does not support the passthrough translation mode. + * Always use a dynamaic mapping domain. + */ + if (!ecap_pass_through(iommu->ecap)) + return IOMMU_DOMAIN_DMA; + + if (dev_is_pci(dev)) { + struct pci_dev *pdev = to_pci_dev(dev); + + if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) + return IOMMU_DOMAIN_IDENTITY; + } + + return 0; +} + +static void intel_iommu_init_qi(struct intel_iommu *iommu) +{ + /* + * Start from the sane iommu hardware state. + * If the queued invalidation is already initialized by us + * (for example, while enabling interrupt-remapping) then + * we got the things already rolling from a sane state. + */ + if (!iommu->qi) { + /* + * Clear any previous faults. + */ + dmar_fault(-1, iommu); + /* + * Disable queued invalidation if supported and already enabled + * before OS handover. + */ + dmar_disable_qi(iommu); + } + + if (dmar_enable_qi(iommu)) { + /* + * Queued Invalidate not enabled, use Register Based Invalidate + */ + iommu->flush.flush_context = __iommu_flush_context; + iommu->flush.flush_iotlb = __iommu_flush_iotlb; + pr_info("%s: Using Register based invalidation\n", + iommu->name); + } else { + iommu->flush.flush_context = qi_flush_context; + iommu->flush.flush_iotlb = qi_flush_iotlb; + pr_info("%s: Using Queued invalidation\n", iommu->name); + } +} + +static int copy_context_table(struct intel_iommu *iommu, + struct root_entry *old_re, + struct context_entry **tbl, + int bus, bool ext) +{ + int tbl_idx, pos = 0, idx, devfn, ret = 0, did; + struct context_entry *new_ce = NULL, ce; + struct context_entry *old_ce = NULL; + struct root_entry re; + phys_addr_t old_ce_phys; + + tbl_idx = ext ? bus * 2 : bus; + memcpy(&re, old_re, sizeof(re)); + + for (devfn = 0; devfn < 256; devfn++) { + /* First calculate the correct index */ + idx = (ext ? devfn * 2 : devfn) % 256; + + if (idx == 0) { + /* First save what we may have and clean up */ + if (new_ce) { + tbl[tbl_idx] = new_ce; + __iommu_flush_cache(iommu, new_ce, + VTD_PAGE_SIZE); + pos = 1; + } + + if (old_ce) + memunmap(old_ce); + + ret = 0; + if (devfn < 0x80) + old_ce_phys = root_entry_lctp(&re); + else + old_ce_phys = root_entry_uctp(&re); + + if (!old_ce_phys) { + if (ext && devfn == 0) { + /* No LCTP, try UCTP */ + devfn = 0x7f; + continue; + } else { + goto out; + } + } + + ret = -ENOMEM; + old_ce = memremap(old_ce_phys, PAGE_SIZE, + MEMREMAP_WB); + if (!old_ce) + goto out; + + new_ce = iommu_alloc_pages_node_sz(iommu->node, + GFP_KERNEL, SZ_4K); + if (!new_ce) + goto out_unmap; + + ret = 0; + } + + /* Now copy the context entry */ + memcpy(&ce, old_ce + idx, sizeof(ce)); + + if (!context_present(&ce)) + continue; + + did = context_domain_id(&ce); + if (did >= 0 && did < cap_ndoms(iommu->cap)) + ida_alloc_range(&iommu->domain_ida, did, did, GFP_KERNEL); + + set_context_copied(iommu, bus, devfn); + new_ce[idx] = ce; + } + + tbl[tbl_idx + pos] = new_ce; + + __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); + +out_unmap: + memunmap(old_ce); + +out: + return ret; +} + +static int copy_translation_tables(struct intel_iommu *iommu) +{ + struct context_entry **ctxt_tbls; + struct root_entry *old_rt; + phys_addr_t old_rt_phys; + int ctxt_table_entries; + u64 rtaddr_reg; + int bus, ret; + bool new_ext, ext; + + rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); + ext = !!(rtaddr_reg & DMA_RTADDR_SMT); + new_ext = !!sm_supported(iommu); + + /* + * The RTT bit can only be changed when translation is disabled, + * but disabling translation means to open a window for data + * corruption. So bail out and don't copy anything if we would + * have to change the bit. + */ + if (new_ext != ext) + return -EINVAL; + + iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL); + if (!iommu->copied_tables) + return -ENOMEM; + + old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; + if (!old_rt_phys) + return -EINVAL; + + old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); + if (!old_rt) + return -ENOMEM; + + /* This is too big for the stack - allocate it from slab */ + ctxt_table_entries = ext ? 512 : 256; + ret = -ENOMEM; + ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); + if (!ctxt_tbls) + goto out_unmap; + + for (bus = 0; bus < 256; bus++) { + ret = copy_context_table(iommu, &old_rt[bus], + ctxt_tbls, bus, ext); + if (ret) { + pr_err("%s: Failed to copy context table for bus %d\n", + iommu->name, bus); + continue; + } + } + + spin_lock(&iommu->lock); + + /* Context tables are copied, now write them to the root_entry table */ + for (bus = 0; bus < 256; bus++) { + int idx = ext ? bus * 2 : bus; + u64 val; + + if (ctxt_tbls[idx]) { + val = virt_to_phys(ctxt_tbls[idx]) | 1; + iommu->root_entry[bus].lo = val; + } + + if (!ext || !ctxt_tbls[idx + 1]) + continue; + + val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; + iommu->root_entry[bus].hi = val; + } + + spin_unlock(&iommu->lock); + + kfree(ctxt_tbls); + + __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); + + ret = 0; + +out_unmap: + memunmap(old_rt); + + return ret; +} + +static int __init init_dmars(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + int ret; + + for_each_iommu(iommu, drhd) { + if (drhd->ignored) { + iommu_disable_translation(iommu); + continue; + } + + /* + * Find the max pasid size of all IOMMU's in the system. + * We need to ensure the system pasid table is no bigger + * than the smallest supported. + */ + if (pasid_supported(iommu)) { + u32 temp = 2 << ecap_pss(iommu->ecap); + + intel_pasid_max_id = min_t(u32, temp, + intel_pasid_max_id); + } + + intel_iommu_init_qi(iommu); + init_translation_status(iommu); + + if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { + iommu_disable_translation(iommu); + clear_translation_pre_enabled(iommu); + pr_warn("Translation was enabled for %s but we are not in kdump mode\n", + iommu->name); + } + + /* + * TBD: + * we could share the same root & context tables + * among all IOMMU's. Need to Split it later. + */ + ret = iommu_alloc_root_entry(iommu); + if (ret) + goto free_iommu; + + if (translation_pre_enabled(iommu)) { + pr_info("Translation already enabled - trying to copy translation structures\n"); + + ret = copy_translation_tables(iommu); + if (ret) { + /* + * We found the IOMMU with translation + * enabled - but failed to copy over the + * old root-entry table. Try to proceed + * by disabling translation now and + * allocating a clean root-entry table. + * This might cause DMAR faults, but + * probably the dump will still succeed. + */ + pr_err("Failed to copy translation tables from previous kernel for %s\n", + iommu->name); + iommu_disable_translation(iommu); + clear_translation_pre_enabled(iommu); + } else { + pr_info("Copied translation tables from previous kernel for %s\n", + iommu->name); + } + } + + intel_svm_check(iommu); + } + + /* + * Now that qi is enabled on all iommus, set the root entry and flush + * caches. This is required on some Intel X58 chipsets, otherwise the + * flush_context function will loop forever and the boot hangs. + */ + for_each_active_iommu(iommu, drhd) { + iommu_flush_write_buffer(iommu); + iommu_set_root_entry(iommu); + } + + check_tylersburg_isoch(); + + /* + * for each drhd + * enable fault log + * global invalidate context cache + * global invalidate iotlb + * enable translation + */ + for_each_iommu(iommu, drhd) { + if (drhd->ignored) { + /* + * we always have to disable PMRs or DMA may fail on + * this device + */ + if (force_on) + iommu_disable_protect_mem_regions(iommu); + continue; + } + + iommu_flush_write_buffer(iommu); + + if (ecap_prs(iommu->ecap)) { + /* + * Call dmar_alloc_hwirq() with dmar_global_lock held, + * could cause possible lock race condition. + */ + up_write(&dmar_global_lock); + ret = intel_iommu_enable_prq(iommu); + down_write(&dmar_global_lock); + if (ret) + goto free_iommu; + } + + ret = dmar_set_interrupt(iommu); + if (ret) + goto free_iommu; + } + + return 0; + +free_iommu: + for_each_active_iommu(iommu, drhd) { + disable_dmar_iommu(iommu); + free_dmar_iommu(iommu); + } + + return ret; +} + +static void __init init_no_remapping_devices(void) +{ + struct dmar_drhd_unit *drhd; + struct device *dev; + int i; + + for_each_drhd_unit(drhd) { + if (!drhd->include_all) { + for_each_active_dev_scope(drhd->devices, + drhd->devices_cnt, i, dev) + break; + /* ignore DMAR unit if no devices exist */ + if (i == drhd->devices_cnt) + drhd->ignored = 1; + } + } + + for_each_active_drhd_unit(drhd) { + if (drhd->include_all) + continue; + + for_each_active_dev_scope(drhd->devices, + drhd->devices_cnt, i, dev) + if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) + break; + if (i < drhd->devices_cnt) + continue; + + /* This IOMMU has *only* gfx devices. Either bypass it or + set the gfx_mapped flag, as appropriate */ + drhd->gfx_dedicated = 1; + if (disable_igfx_iommu) + drhd->ignored = 1; + } +} + +#ifdef CONFIG_SUSPEND +static int init_iommu_hw(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu = NULL; + int ret; + + for_each_active_iommu(iommu, drhd) { + if (iommu->qi) { + ret = dmar_reenable_qi(iommu); + if (ret) + return ret; + } + } + + for_each_iommu(iommu, drhd) { + if (drhd->ignored) { + /* + * we always have to disable PMRs or DMA may fail on + * this device + */ + if (force_on) + iommu_disable_protect_mem_regions(iommu); + continue; + } + + iommu_flush_write_buffer(iommu); + iommu_set_root_entry(iommu); + iommu_enable_translation(iommu); + iommu_disable_protect_mem_regions(iommu); + } + + return 0; +} + +static void iommu_flush_all(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + + for_each_active_iommu(iommu, drhd) { + iommu->flush.flush_context(iommu, 0, 0, 0, + DMA_CCMD_GLOBAL_INVL); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, + DMA_TLB_GLOBAL_FLUSH); + } +} + +static int iommu_suspend(void *data) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu = NULL; + unsigned long flag; + + iommu_flush_all(); + + for_each_active_iommu(iommu, drhd) { + iommu_disable_translation(iommu); + + raw_spin_lock_irqsave(&iommu->register_lock, flag); + + iommu->iommu_state[SR_DMAR_FECTL_REG] = + readl(iommu->reg + DMAR_FECTL_REG); + iommu->iommu_state[SR_DMAR_FEDATA_REG] = + readl(iommu->reg + DMAR_FEDATA_REG); + iommu->iommu_state[SR_DMAR_FEADDR_REG] = + readl(iommu->reg + DMAR_FEADDR_REG); + iommu->iommu_state[SR_DMAR_FEUADDR_REG] = + readl(iommu->reg + DMAR_FEUADDR_REG); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } + return 0; +} + +static void iommu_resume(void *data) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu = NULL; + unsigned long flag; + + if (init_iommu_hw()) { + if (force_on) + panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); + else + WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); + return; + } + + for_each_active_iommu(iommu, drhd) { + + raw_spin_lock_irqsave(&iommu->register_lock, flag); + + writel(iommu->iommu_state[SR_DMAR_FECTL_REG], + iommu->reg + DMAR_FECTL_REG); + writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], + iommu->reg + DMAR_FEDATA_REG); + writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], + iommu->reg + DMAR_FEADDR_REG); + writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], + iommu->reg + DMAR_FEUADDR_REG); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + } +} + +static const struct syscore_ops iommu_syscore_ops = { + .resume = iommu_resume, + .suspend = iommu_suspend, +}; + +static struct syscore iommu_syscore = { + .ops = &iommu_syscore_ops, +}; + +static void __init init_iommu_pm_ops(void) +{ + register_syscore(&iommu_syscore); +} + +#else +static inline void init_iommu_pm_ops(void) {} +#endif /* CONFIG_PM */ + +static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) +{ + if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || + !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || + rmrr->end_address <= rmrr->base_address || + arch_rmrr_sanity_check(rmrr)) + return -EINVAL; + + return 0; +} + +int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) +{ + struct acpi_dmar_reserved_memory *rmrr; + struct dmar_rmrr_unit *rmrru; + + rmrr = (struct acpi_dmar_reserved_memory *)header; + if (rmrr_sanity_check(rmrr)) { + pr_warn(FW_BUG + "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" + "BIOS vendor: %s; Ver: %s; Product Version: %s\n", + rmrr->base_address, rmrr->end_address, + dmi_get_system_info(DMI_BIOS_VENDOR), + dmi_get_system_info(DMI_BIOS_VERSION), + dmi_get_system_info(DMI_PRODUCT_VERSION)); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + } + + rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL); + if (!rmrru) + goto out; + + rmrru->hdr = header; + + rmrru->base_address = rmrr->base_address; + rmrru->end_address = rmrr->end_address; + + rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), + ((void *)rmrr) + rmrr->header.length, + &rmrru->devices_cnt); + if (rmrru->devices_cnt && rmrru->devices == NULL) + goto free_rmrru; + + list_add(&rmrru->list, &dmar_rmrr_units); + + return 0; +free_rmrru: + kfree(rmrru); +out: + return -ENOMEM; +} + +static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) +{ + struct dmar_atsr_unit *atsru; + struct acpi_dmar_atsr *tmp; + + list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, + dmar_rcu_check()) { + tmp = (struct acpi_dmar_atsr *)atsru->hdr; + if (atsr->segment != tmp->segment) + continue; + if (atsr->header.length != tmp->header.length) + continue; + if (memcmp(atsr, tmp, atsr->header.length) == 0) + return atsru; + } + + return NULL; +} + +int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) +{ + struct acpi_dmar_atsr *atsr; + struct dmar_atsr_unit *atsru; + + if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) + return 0; + + atsr = container_of(hdr, struct acpi_dmar_atsr, header); + atsru = dmar_find_atsr(atsr); + if (atsru) + return 0; + + atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); + if (!atsru) + return -ENOMEM; + + /* + * If memory is allocated from slab by ACPI _DSM method, we need to + * copy the memory content because the memory buffer will be freed + * on return. + */ + atsru->hdr = (void *)(atsru + 1); + memcpy(atsru->hdr, hdr, hdr->length); + atsru->include_all = atsr->flags & 0x1; + if (!atsru->include_all) { + atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), + (void *)atsr + atsr->header.length, + &atsru->devices_cnt); + if (atsru->devices_cnt && atsru->devices == NULL) { + kfree(atsru); + return -ENOMEM; + } + } + + list_add_rcu(&atsru->list, &dmar_atsr_units); + + return 0; +} + +static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) +{ + dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); + kfree(atsru); +} + +int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) +{ + struct acpi_dmar_atsr *atsr; + struct dmar_atsr_unit *atsru; + + atsr = container_of(hdr, struct acpi_dmar_atsr, header); + atsru = dmar_find_atsr(atsr); + if (atsru) { + list_del_rcu(&atsru->list); + synchronize_rcu(); + intel_iommu_free_atsr(atsru); + } + + return 0; +} + +int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) +{ + int i; + struct device *dev; + struct acpi_dmar_atsr *atsr; + struct dmar_atsr_unit *atsru; + + atsr = container_of(hdr, struct acpi_dmar_atsr, header); + atsru = dmar_find_atsr(atsr); + if (!atsru) + return 0; + + if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { + for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, + i, dev) + return -EBUSY; + } + + return 0; +} + +static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) +{ + struct dmar_satc_unit *satcu; + struct acpi_dmar_satc *tmp; + + list_for_each_entry_rcu(satcu, &dmar_satc_units, list, + dmar_rcu_check()) { + tmp = (struct acpi_dmar_satc *)satcu->hdr; + if (satc->segment != tmp->segment) + continue; + if (satc->header.length != tmp->header.length) + continue; + if (memcmp(satc, tmp, satc->header.length) == 0) + return satcu; + } + + return NULL; +} + +int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) +{ + struct acpi_dmar_satc *satc; + struct dmar_satc_unit *satcu; + + if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) + return 0; + + satc = container_of(hdr, struct acpi_dmar_satc, header); + satcu = dmar_find_satc(satc); + if (satcu) + return 0; + + satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); + if (!satcu) + return -ENOMEM; + + satcu->hdr = (void *)(satcu + 1); + memcpy(satcu->hdr, hdr, hdr->length); + satcu->atc_required = satc->flags & 0x1; + satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), + (void *)satc + satc->header.length, + &satcu->devices_cnt); + if (satcu->devices_cnt && !satcu->devices) { + kfree(satcu); + return -ENOMEM; + } + list_add_rcu(&satcu->list, &dmar_satc_units); + + return 0; +} + +static int intel_iommu_add(struct dmar_drhd_unit *dmaru) +{ + struct intel_iommu *iommu = dmaru->iommu; + int ret; + + /* + * Disable translation if already enabled prior to OS handover. + */ + if (iommu->gcmd & DMA_GCMD_TE) + iommu_disable_translation(iommu); + + ret = iommu_alloc_root_entry(iommu); + if (ret) + goto out; + + intel_svm_check(iommu); + + if (dmaru->ignored) { + /* + * we always have to disable PMRs or DMA may fail on this device + */ + if (force_on) + iommu_disable_protect_mem_regions(iommu); + return 0; + } + + intel_iommu_init_qi(iommu); + iommu_flush_write_buffer(iommu); + + if (ecap_prs(iommu->ecap)) { + ret = intel_iommu_enable_prq(iommu); + if (ret) + goto disable_iommu; + } + + ret = dmar_set_interrupt(iommu); + if (ret) + goto disable_iommu; + + iommu_set_root_entry(iommu); + iommu_enable_translation(iommu); + + iommu_disable_protect_mem_regions(iommu); + return 0; + +disable_iommu: + disable_dmar_iommu(iommu); +out: + free_dmar_iommu(iommu); + return ret; +} + +int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) +{ + int ret = 0; + struct intel_iommu *iommu = dmaru->iommu; + + if (!intel_iommu_enabled) + return 0; + if (iommu == NULL) + return -EINVAL; + + if (insert) { + ret = intel_iommu_add(dmaru); + } else { + disable_dmar_iommu(iommu); + free_dmar_iommu(iommu); + } + + return ret; +} + +static void intel_iommu_free_dmars(void) +{ + struct dmar_rmrr_unit *rmrru, *rmrr_n; + struct dmar_atsr_unit *atsru, *atsr_n; + struct dmar_satc_unit *satcu, *satc_n; + + list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { + list_del(&rmrru->list); + dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); + kfree(rmrru); + } + + list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { + list_del(&atsru->list); + intel_iommu_free_atsr(atsru); + } + list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { + list_del(&satcu->list); + dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); + kfree(satcu); + } +} + +static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) +{ + struct dmar_satc_unit *satcu; + struct acpi_dmar_satc *satc; + struct device *tmp; + int i; + + rcu_read_lock(); + + list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { + satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); + if (satc->segment != pci_domain_nr(dev->bus)) + continue; + for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) + if (to_pci_dev(tmp) == dev) + goto out; + } + satcu = NULL; +out: + rcu_read_unlock(); + return satcu; +} + +static bool dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) +{ + struct pci_dev *bridge = NULL; + struct dmar_atsr_unit *atsru; + struct dmar_satc_unit *satcu; + struct acpi_dmar_atsr *atsr; + bool supported = true; + struct pci_bus *bus; + struct device *tmp; + int i; + + dev = pci_physfn(dev); + satcu = dmar_find_matched_satc_unit(dev); + if (satcu) + /* + * This device supports ATS as it is in SATC table. + * When IOMMU is in legacy mode, enabling ATS is done + * automatically by HW for the device that requires + * ATS, hence OS should not enable this device ATS + * to avoid duplicated TLB invalidation. + */ + return !(satcu->atc_required && !sm_supported(iommu)); + + for (bus = dev->bus; bus; bus = bus->parent) { + bridge = bus->self; + /* If it's an integrated device, allow ATS */ + if (!bridge) + return true; + /* Connected via non-PCIe: no ATS */ + if (!pci_is_pcie(bridge) || + pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) + return false; + /* If we found the root port, look it up in the ATSR */ + if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) + break; + } + + rcu_read_lock(); + list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { + atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); + if (atsr->segment != pci_domain_nr(dev->bus)) + continue; + + for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) + if (tmp == &bridge->dev) + goto out; + + if (atsru->include_all) + goto out; + } + supported = false; +out: + rcu_read_unlock(); + + return supported; +} + +int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) +{ + int ret; + struct dmar_rmrr_unit *rmrru; + struct dmar_atsr_unit *atsru; + struct dmar_satc_unit *satcu; + struct acpi_dmar_atsr *atsr; + struct acpi_dmar_reserved_memory *rmrr; + struct acpi_dmar_satc *satc; + + if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) + return 0; + + list_for_each_entry(rmrru, &dmar_rmrr_units, list) { + rmrr = container_of(rmrru->hdr, + struct acpi_dmar_reserved_memory, header); + if (info->event == BUS_NOTIFY_ADD_DEVICE) { + ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), + ((void *)rmrr) + rmrr->header.length, + rmrr->segment, rmrru->devices, + rmrru->devices_cnt); + if (ret < 0) + return ret; + } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { + dmar_remove_dev_scope(info, rmrr->segment, + rmrru->devices, rmrru->devices_cnt); + } + } + + list_for_each_entry(atsru, &dmar_atsr_units, list) { + if (atsru->include_all) + continue; + + atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); + if (info->event == BUS_NOTIFY_ADD_DEVICE) { + ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), + (void *)atsr + atsr->header.length, + atsr->segment, atsru->devices, + atsru->devices_cnt); + if (ret > 0) + break; + else if (ret < 0) + return ret; + } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { + if (dmar_remove_dev_scope(info, atsr->segment, + atsru->devices, atsru->devices_cnt)) + break; + } + } + list_for_each_entry(satcu, &dmar_satc_units, list) { + satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); + if (info->event == BUS_NOTIFY_ADD_DEVICE) { + ret = dmar_insert_dev_scope(info, (void *)(satc + 1), + (void *)satc + satc->header.length, + satc->segment, satcu->devices, + satcu->devices_cnt); + if (ret > 0) + break; + else if (ret < 0) + return ret; + } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { + if (dmar_remove_dev_scope(info, satc->segment, + satcu->devices, satcu->devices_cnt)) + break; + } + } + + return 0; +} + +static void intel_disable_iommus(void) +{ + struct intel_iommu *iommu = NULL; + struct dmar_drhd_unit *drhd; + + for_each_iommu(iommu, drhd) + iommu_disable_translation(iommu); +} + +void intel_iommu_shutdown(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu = NULL; + + if (no_iommu || dmar_disabled) + return; + + /* + * All other CPUs were brought down, hotplug interrupts were disabled, + * no lock and RCU checking needed anymore + */ + list_for_each_entry(drhd, &dmar_drhd_units, list) { + iommu = drhd->iommu; + + /* Disable PMRs explicitly here. */ + iommu_disable_protect_mem_regions(iommu); + + /* Make sure the IOMMUs are switched off */ + iommu_disable_translation(iommu); + } +} + +static struct intel_iommu *dev_to_intel_iommu(struct device *dev) +{ + struct iommu_device *iommu_dev = dev_to_iommu_device(dev); + + return container_of(iommu_dev, struct intel_iommu, iommu); +} + +static ssize_t version_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct intel_iommu *iommu = dev_to_intel_iommu(dev); + u32 ver = readl(iommu->reg + DMAR_VER_REG); + return sysfs_emit(buf, "%d:%d\n", + DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); +} +static DEVICE_ATTR_RO(version); + +static ssize_t address_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct intel_iommu *iommu = dev_to_intel_iommu(dev); + return sysfs_emit(buf, "%llx\n", iommu->reg_phys); +} +static DEVICE_ATTR_RO(address); + +static ssize_t cap_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct intel_iommu *iommu = dev_to_intel_iommu(dev); + return sysfs_emit(buf, "%llx\n", iommu->cap); +} +static DEVICE_ATTR_RO(cap); + +static ssize_t ecap_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct intel_iommu *iommu = dev_to_intel_iommu(dev); + return sysfs_emit(buf, "%llx\n", iommu->ecap); +} +static DEVICE_ATTR_RO(ecap); + +static ssize_t domains_supported_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct intel_iommu *iommu = dev_to_intel_iommu(dev); + return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap)); +} +static DEVICE_ATTR_RO(domains_supported); + +static ssize_t domains_used_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct intel_iommu *iommu = dev_to_intel_iommu(dev); + unsigned int count = 0; + int id; + + for (id = 0; id < cap_ndoms(iommu->cap); id++) + if (ida_exists(&iommu->domain_ida, id)) + count++; + + return sysfs_emit(buf, "%d\n", count); +} +static DEVICE_ATTR_RO(domains_used); + +static struct attribute *intel_iommu_attrs[] = { + &dev_attr_version.attr, + &dev_attr_address.attr, + &dev_attr_cap.attr, + &dev_attr_ecap.attr, + &dev_attr_domains_supported.attr, + &dev_attr_domains_used.attr, + NULL, +}; + +static struct attribute_group intel_iommu_group = { + .name = "intel-iommu", + .attrs = intel_iommu_attrs, +}; + +const struct attribute_group *intel_iommu_groups[] = { + &intel_iommu_group, + NULL, +}; + +static bool has_external_pci(void) +{ + struct pci_dev *pdev = NULL; + + for_each_pci_dev(pdev) + if (pdev->external_facing) { + pci_dev_put(pdev); + return true; + } + + return false; +} + +static int __init platform_optin_force_iommu(void) +{ + if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) + return 0; + + if (no_iommu || dmar_disabled) + pr_info("Intel-IOMMU force enabled due to platform opt in\n"); + + /* + * If Intel-IOMMU is disabled by default, we will apply identity + * map for all devices except those marked as being untrusted. + */ + if (dmar_disabled) + iommu_set_default_passthrough(false); + + dmar_disabled = 0; + no_iommu = 0; + + return 1; +} + +static int __init probe_acpi_namespace_devices(void) +{ + struct dmar_drhd_unit *drhd; + /* To avoid a -Wunused-but-set-variable warning. */ + struct intel_iommu *iommu __maybe_unused; + struct device *dev; + int i, ret = 0; + + for_each_active_iommu(iommu, drhd) { + for_each_active_dev_scope(drhd->devices, + drhd->devices_cnt, i, dev) { + struct acpi_device_physical_node *pn; + struct acpi_device *adev; + + if (dev->bus != &acpi_bus_type) + continue; + + up_read(&dmar_global_lock); + adev = to_acpi_device(dev); + mutex_lock(&adev->physical_node_lock); + list_for_each_entry(pn, + &adev->physical_node_list, node) { + ret = iommu_probe_device(pn->dev); + if (ret) + break; + } + mutex_unlock(&adev->physical_node_lock); + down_read(&dmar_global_lock); + + if (ret) + return ret; + } + } + + return 0; +} + +static __init int tboot_force_iommu(void) +{ + if (!tboot_enabled()) + return 0; + + if (no_iommu || dmar_disabled) + pr_warn("Forcing Intel-IOMMU to enabled\n"); + + dmar_disabled = 0; + no_iommu = 0; + + return 1; +} + +int __init intel_iommu_init(void) +{ + int ret = -ENODEV; + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + + /* + * Intel IOMMU is required for a TXT/tboot launch or platform + * opt in, so enforce that. + */ + force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || + platform_optin_force_iommu(); + + down_write(&dmar_global_lock); + if (dmar_table_init()) { + if (force_on) + panic("tboot: Failed to initialize DMAR table\n"); + goto out_free_dmar; + } + + if (dmar_dev_scope_init() < 0) { + if (force_on) + panic("tboot: Failed to initialize DMAR device scope\n"); + goto out_free_dmar; + } + + up_write(&dmar_global_lock); + + /* + * The bus notifier takes the dmar_global_lock, so lockdep will + * complain later when we register it under the lock. + */ + dmar_register_bus_notifier(); + + down_write(&dmar_global_lock); + + if (!no_iommu) + intel_iommu_debugfs_init(); + + if (no_iommu || dmar_disabled) { + /* + * We exit the function here to ensure IOMMU's remapping and + * mempool aren't setup, which means that the IOMMU's PMRs + * won't be disabled via the call to init_dmars(). So disable + * it explicitly here. The PMRs were setup by tboot prior to + * calling SENTER, but the kernel is expected to reset/tear + * down the PMRs. + */ + if (intel_iommu_tboot_noforce) { + for_each_iommu(iommu, drhd) + iommu_disable_protect_mem_regions(iommu); + } + + /* + * Make sure the IOMMUs are switched off, even when we + * boot into a kexec kernel and the previous kernel left + * them enabled + */ + intel_disable_iommus(); + goto out_free_dmar; + } + + if (list_empty(&dmar_rmrr_units)) + pr_info("No RMRR found\n"); + + if (list_empty(&dmar_atsr_units)) + pr_info("No ATSR found\n"); + + if (list_empty(&dmar_satc_units)) + pr_info("No SATC found\n"); + + init_no_remapping_devices(); + + ret = init_dmars(); + if (ret) { + if (force_on) + panic("tboot: Failed to initialize DMARs\n"); + pr_err("Initialization failed\n"); + goto out_free_dmar; + } + up_write(&dmar_global_lock); + + init_iommu_pm_ops(); + + down_read(&dmar_global_lock); + for_each_active_iommu(iommu, drhd) { + /* + * The flush queue implementation does not perform + * page-selective invalidations that are required for efficient + * TLB flushes in virtual environments. The benefit of batching + * is likely to be much lower than the overhead of synchronizing + * the virtual and physical IOMMU page-tables. + */ + if (cap_caching_mode(iommu->cap) && + !first_level_by_default(iommu)) { + pr_info_once("IOMMU batching disallowed due to virtualization\n"); + iommu_set_dma_strict(); + } + iommu_device_sysfs_add(&iommu->iommu, NULL, + intel_iommu_groups, + "%s", iommu->name); + /* + * The iommu device probe is protected by the iommu_probe_device_lock. + * Release the dmar_global_lock before entering the device probe path + * to avoid unnecessary lock order splat. + */ + up_read(&dmar_global_lock); + iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); + down_read(&dmar_global_lock); + + iommu_pmu_register(iommu); + } + + if (probe_acpi_namespace_devices()) + pr_warn("ACPI name space devices didn't probe correctly\n"); + + /* Finally, we enable the DMA remapping hardware. */ + for_each_iommu(iommu, drhd) { + if (!drhd->ignored && !translation_pre_enabled(iommu)) + iommu_enable_translation(iommu); + + iommu_disable_protect_mem_regions(iommu); + } + up_read(&dmar_global_lock); + + pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); + + intel_iommu_enabled = 1; + + return 0; + +out_free_dmar: + intel_iommu_free_dmars(); + up_write(&dmar_global_lock); + return ret; +} + +static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) +{ + struct device_domain_info *info = opaque; + + domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); + return 0; +} + +/* + * NB - intel-iommu lacks any sort of reference counting for the users of + * dependent devices. If multiple endpoints have intersecting dependent + * devices, unbinding the driver from any one of them will possibly leave + * the others unable to operate. + */ +static void domain_context_clear(struct device_domain_info *info) +{ + if (!dev_is_pci(info->dev)) { + domain_context_clear_one(info, info->bus, info->devfn); + return; + } + + pci_for_each_dma_alias(to_pci_dev(info->dev), + &domain_context_clear_one_cb, info); + iommu_disable_pci_ats(info); +} + +/* + * Clear the page table pointer in context or pasid table entries so that + * all DMA requests without PASID from the device are blocked. If the page + * table has been set, clean up the data structures. + */ +void device_block_translation(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + unsigned long flags; + + /* Device in DMA blocking state. Noting to do. */ + if (!info->domain_attached) + return; + + if (info->domain) + cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID); + + if (!dev_is_real_dma_subdevice(dev)) { + if (sm_supported(iommu)) + intel_pasid_tear_down_entry(iommu, dev, + IOMMU_NO_PASID, false); + else + domain_context_clear(info); + } + + /* Device now in DMA blocking state. */ + info->domain_attached = false; + + if (!info->domain) + return; + + spin_lock_irqsave(&info->domain->lock, flags); + list_del(&info->link); + spin_unlock_irqrestore(&info->domain->lock, flags); + + domain_detach_iommu(info->domain, iommu); + info->domain = NULL; +} + +static int blocking_domain_attach_dev(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + iopf_for_domain_remove(info->domain ? &info->domain->domain : NULL, dev); + device_block_translation(dev); + return 0; +} + +static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old); + +static struct iommu_domain blocking_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = blocking_domain_attach_dev, + .set_dev_pasid = blocking_domain_set_dev_pasid, + } +}; + +static struct dmar_domain *paging_domain_alloc(void) +{ + struct dmar_domain *domain; + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&domain->devices); + INIT_LIST_HEAD(&domain->dev_pasids); + INIT_LIST_HEAD(&domain->cache_tags); + spin_lock_init(&domain->lock); + spin_lock_init(&domain->cache_lock); + xa_init(&domain->iommu_array); + INIT_LIST_HEAD(&domain->s1_domains); + spin_lock_init(&domain->s1_lock); + + return domain; +} + +static unsigned int compute_vasz_lg2_fs(struct intel_iommu *iommu, + unsigned int *top_level) +{ + unsigned int mgaw = cap_mgaw(iommu->cap); + + /* + * Spec 3.6 First-Stage Translation: + * + * Software must limit addresses to less than the minimum of MGAW + * and the lower canonical address width implied by FSPM (i.e., + * 47-bit when FSPM is 4-level and 56-bit when FSPM is 5-level). + */ + if (mgaw > 48 && cap_fl5lp_support(iommu->cap)) { + *top_level = 4; + return min(57, mgaw); + } + + /* Four level is always supported */ + *top_level = 3; + return min(48, mgaw); +} + +static struct iommu_domain * +intel_iommu_domain_alloc_first_stage(struct device *dev, + struct intel_iommu *iommu, u32 flags) +{ + struct pt_iommu_x86_64_cfg cfg = {}; + struct dmar_domain *dmar_domain; + int ret; + + if (flags & ~IOMMU_HWPT_ALLOC_PASID) + return ERR_PTR(-EOPNOTSUPP); + + /* Only SL is available in legacy mode */ + if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) + return ERR_PTR(-EOPNOTSUPP); + + dmar_domain = paging_domain_alloc(); + if (IS_ERR(dmar_domain)) + return ERR_CAST(dmar_domain); + + cfg.common.hw_max_vasz_lg2 = + compute_vasz_lg2_fs(iommu, &cfg.top_level); + cfg.common.hw_max_oasz_lg2 = 52; + cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) | + BIT(PT_FEAT_FLUSH_RANGE); + /* First stage always uses scalable mode */ + if (!ecap_smpwc(iommu->ecap)) + cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT); + dmar_domain->iommu.iommu_device = dev; + dmar_domain->iommu.nid = dev_to_node(dev); + dmar_domain->domain.ops = &intel_fs_paging_domain_ops; + /* + * iotlb sync for map is only needed for legacy implementations that + * explicitly require flushing internal write buffers to ensure memory + * coherence. + */ + if (rwbf_required(iommu)) + dmar_domain->iotlb_sync_map = true; + + ret = pt_iommu_x86_64_init(&dmar_domain->fspt, &cfg, GFP_KERNEL); + if (ret) { + kfree(dmar_domain); + return ERR_PTR(ret); + } + + if (!cap_fl1gp_support(iommu->cap)) + dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G; + if (!intel_iommu_superpage) + dmar_domain->domain.pgsize_bitmap = SZ_4K; + + return &dmar_domain->domain; +} + +static unsigned int compute_vasz_lg2_ss(struct intel_iommu *iommu, + unsigned int *top_level) +{ + unsigned int sagaw = cap_sagaw(iommu->cap); + unsigned int mgaw = cap_mgaw(iommu->cap); + + /* + * Find the largest table size that both the mgaw and sagaw support. + * This sets the valid range of IOVA and the top starting level. + * Some HW may only support a 4 or 5 level walk but must limit IOVA to + * 3 levels. + */ + if (mgaw > 48 && sagaw >= BIT(3)) { + *top_level = 4; + return min(57, mgaw); + } else if (mgaw > 39 && sagaw >= BIT(2)) { + *top_level = 3 + ffs(sagaw >> 3); + return min(48, mgaw); + } else if (mgaw > 30 && sagaw >= BIT(1)) { + *top_level = 2 + ffs(sagaw >> 2); + return min(39, mgaw); + } + return 0; +} + +static const struct iommu_dirty_ops intel_second_stage_dirty_ops = { + IOMMU_PT_DIRTY_OPS(vtdss), + .set_dirty_tracking = intel_iommu_set_dirty_tracking, +}; + +static struct iommu_domain * +intel_iommu_domain_alloc_second_stage(struct device *dev, + struct intel_iommu *iommu, u32 flags) +{ + struct pt_iommu_vtdss_cfg cfg = {}; + struct dmar_domain *dmar_domain; + unsigned int sslps; + int ret; + + if (flags & + (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_PASID))) + return ERR_PTR(-EOPNOTSUPP); + + if (((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && + !nested_supported(iommu)) || + ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && + !ssads_supported(iommu))) + return ERR_PTR(-EOPNOTSUPP); + + /* Legacy mode always supports second stage */ + if (sm_supported(iommu) && !ecap_slts(iommu->ecap)) + return ERR_PTR(-EOPNOTSUPP); + + dmar_domain = paging_domain_alloc(); + if (IS_ERR(dmar_domain)) + return ERR_CAST(dmar_domain); + + cfg.common.hw_max_vasz_lg2 = compute_vasz_lg2_ss(iommu, &cfg.top_level); + cfg.common.hw_max_oasz_lg2 = 52; + cfg.common.features = BIT(PT_FEAT_FLUSH_RANGE); + + /* + * Read-only mapping is disallowed on the domain which serves as the + * parent in a nested configuration, due to HW errata + * (ERRATA_772415_SPR17) + */ + if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT) + cfg.common.features |= BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE); + + if (!iommu_paging_structure_coherency(iommu)) + cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT); + dmar_domain->iommu.iommu_device = dev; + dmar_domain->iommu.nid = dev_to_node(dev); + dmar_domain->domain.ops = &intel_ss_paging_domain_ops; + dmar_domain->nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; + + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + dmar_domain->domain.dirty_ops = &intel_second_stage_dirty_ops; + + ret = pt_iommu_vtdss_init(&dmar_domain->sspt, &cfg, GFP_KERNEL); + if (ret) { + kfree(dmar_domain); + return ERR_PTR(ret); + } + + /* Adjust the supported page sizes to HW capability */ + sslps = cap_super_page_val(iommu->cap); + if (!(sslps & BIT(0))) + dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_2M; + if (!(sslps & BIT(1))) + dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G; + if (!intel_iommu_superpage) + dmar_domain->domain.pgsize_bitmap = SZ_4K; + + /* + * Besides the internal write buffer flush, the caching mode used for + * legacy nested translation (which utilizes shadowing page tables) + * also requires iotlb sync on map. + */ + if (rwbf_required(iommu) || cap_caching_mode(iommu->cap)) + dmar_domain->iotlb_sync_map = true; + + return &dmar_domain->domain; +} + +static struct iommu_domain * +intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, + const struct iommu_user_data *user_data) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct iommu_domain *domain; + + if (user_data) + return ERR_PTR(-EOPNOTSUPP); + + /* Prefer first stage if possible by default. */ + domain = intel_iommu_domain_alloc_first_stage(dev, iommu, flags); + if (domain != ERR_PTR(-EOPNOTSUPP)) + return domain; + return intel_iommu_domain_alloc_second_stage(dev, iommu, flags); +} + +static void intel_iommu_domain_free(struct iommu_domain *domain) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + + if (WARN_ON(dmar_domain->nested_parent && + !list_empty(&dmar_domain->s1_domains))) + return; + + if (WARN_ON(!list_empty(&dmar_domain->devices))) + return; + + pt_iommu_deinit(&dmar_domain->iommu); + + kfree(dmar_domain->qi_batch); + kfree(dmar_domain); +} + +static int paging_domain_compatible_first_stage(struct dmar_domain *dmar_domain, + struct intel_iommu *iommu) +{ + if (WARN_ON(dmar_domain->domain.dirty_ops || + dmar_domain->nested_parent)) + return -EINVAL; + + /* Only SL is available in legacy mode */ + if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) + return -EINVAL; + + if (!ecap_smpwc(iommu->ecap) && + !(dmar_domain->fspt.x86_64_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))) + return -EINVAL; + + /* Supports the number of table levels */ + if (!cap_fl5lp_support(iommu->cap) && + dmar_domain->fspt.x86_64_pt.common.max_vasz_lg2 > 48) + return -EINVAL; + + /* Same page size support */ + if (!cap_fl1gp_support(iommu->cap) && + (dmar_domain->domain.pgsize_bitmap & SZ_1G)) + return -EINVAL; + + /* iotlb sync on map requirement */ + if ((rwbf_required(iommu)) && !dmar_domain->iotlb_sync_map) + return -EINVAL; + + return 0; +} + +static int +paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain, + struct intel_iommu *iommu) +{ + unsigned int vasz_lg2 = dmar_domain->sspt.vtdss_pt.common.max_vasz_lg2; + unsigned int sslps = cap_super_page_val(iommu->cap); + struct pt_iommu_vtdss_hw_info pt_info; + + pt_iommu_vtdss_hw_info(&dmar_domain->sspt, &pt_info); + + if (dmar_domain->domain.dirty_ops && !ssads_supported(iommu)) + return -EINVAL; + if (dmar_domain->nested_parent && !nested_supported(iommu)) + return -EINVAL; + + /* Legacy mode always supports second stage */ + if (sm_supported(iommu) && !ecap_slts(iommu->ecap)) + return -EINVAL; + + if (!iommu_paging_structure_coherency(iommu) && + !(dmar_domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))) + return -EINVAL; + + /* Address width falls within the capability */ + if (cap_mgaw(iommu->cap) < vasz_lg2) + return -EINVAL; + + /* Page table level is supported. */ + if (!(cap_sagaw(iommu->cap) & BIT(pt_info.aw))) + return -EINVAL; + + /* Same page size support */ + if (!(sslps & BIT(0)) && (dmar_domain->domain.pgsize_bitmap & SZ_2M)) + return -EINVAL; + if (!(sslps & BIT(1)) && (dmar_domain->domain.pgsize_bitmap & SZ_1G)) + return -EINVAL; + + /* iotlb sync on map requirement */ + if ((rwbf_required(iommu) || cap_caching_mode(iommu->cap)) && + !dmar_domain->iotlb_sync_map) + return -EINVAL; + + /* + * FIXME this is locked wrong, it needs to be under the + * dmar_domain->lock + */ + if ((dmar_domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_VTDSS_FORCE_COHERENCE)) && + !ecap_sc_support(iommu->ecap)) + return -EINVAL; + return 0; +} + +int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + int ret = -EINVAL; + + if (intel_domain_is_fs_paging(dmar_domain)) + ret = paging_domain_compatible_first_stage(dmar_domain, iommu); + else if (intel_domain_is_ss_paging(dmar_domain)) + ret = paging_domain_compatible_second_stage(dmar_domain, iommu); + else if (WARN_ON(true)) + ret = -EINVAL; + if (ret) + return ret; + + if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && + context_copied(iommu, info->bus, info->devfn)) + return intel_pasid_setup_sm_context(dev); + + return 0; +} + +static int intel_iommu_attach_device(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) +{ + int ret; + + device_block_translation(dev); + + ret = paging_domain_compatible(domain, dev); + if (ret) + return ret; + + ret = iopf_for_domain_set(domain, dev); + if (ret) + return ret; + + ret = dmar_domain_attach_device(to_dmar_domain(domain), dev); + if (ret) + iopf_for_domain_remove(domain, dev); + + return ret; +} + +static void intel_iommu_tlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) +{ + cache_tag_flush_range(to_dmar_domain(domain), gather->start, + gather->end, + iommu_pages_list_empty(&gather->freelist)); + iommu_put_pages_list(&gather->freelist); +} + +static bool domain_support_force_snooping(struct dmar_domain *domain) +{ + struct device_domain_info *info; + bool support = true; + + assert_spin_locked(&domain->lock); + list_for_each_entry(info, &domain->devices, link) { + if (!ecap_sc_support(info->iommu->ecap)) { + support = false; + break; + } + } + + return support; +} + +static bool intel_iommu_enforce_cache_coherency_fs(struct iommu_domain *domain) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct device_domain_info *info; + + guard(spinlock_irqsave)(&dmar_domain->lock); + + if (dmar_domain->force_snooping) + return true; + + if (!domain_support_force_snooping(dmar_domain)) + return false; + + dmar_domain->force_snooping = true; + list_for_each_entry(info, &dmar_domain->devices, link) + intel_pasid_setup_page_snoop_control(info->iommu, info->dev, + IOMMU_NO_PASID); + return true; +} + +static bool intel_iommu_enforce_cache_coherency_ss(struct iommu_domain *domain) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + + guard(spinlock_irqsave)(&dmar_domain->lock); + if (!domain_support_force_snooping(dmar_domain)) + return false; + + /* + * Second level page table supports per-PTE snoop control. The + * iommu_map() interface will handle this by setting SNP bit. + */ + dmar_domain->sspt.vtdss_pt.common.features |= + BIT(PT_FEAT_VTDSS_FORCE_COHERENCE); + dmar_domain->force_snooping = true; + return true; +} + +static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + switch (cap) { + case IOMMU_CAP_CACHE_COHERENCY: + case IOMMU_CAP_DEFERRED_FLUSH: + return true; + case IOMMU_CAP_PRE_BOOT_PROTECTION: + return dmar_platform_optin(); + case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: + return ecap_sc_support(info->iommu->ecap); + case IOMMU_CAP_DIRTY_TRACKING: + return ssads_supported(info->iommu); + default: + return false; + } +} + +static struct iommu_device *intel_iommu_probe_device(struct device *dev) +{ + struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; + struct device_domain_info *info; + struct intel_iommu *iommu; + u8 bus, devfn; + int ret; + + iommu = device_lookup_iommu(dev, &bus, &devfn); + if (!iommu || !iommu->iommu.ops) + return ERR_PTR(-ENODEV); + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + if (dev_is_real_dma_subdevice(dev)) { + info->bus = pdev->bus->number; + info->devfn = pdev->devfn; + info->segment = pci_domain_nr(pdev->bus); + } else { + info->bus = bus; + info->devfn = devfn; + info->segment = iommu->segment; + } + + info->dev = dev; + info->iommu = iommu; + if (dev_is_pci(dev)) { + if (ecap_dev_iotlb_support(iommu->ecap) && + pci_ats_supported(pdev) && + dmar_ats_supported(pdev, iommu)) { + info->ats_supported = 1; + info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev); + + /* + * For IOMMU that supports device IOTLB throttling + * (DIT), we assign PFSID to the invalidation desc + * of a VF such that IOMMU HW can gauge queue depth + * at PF level. If DIT is not set, PFSID will be + * treated as reserved, which should be set to 0. + */ + if (ecap_dit(iommu->ecap)) + info->pfsid = pci_dev_id(pci_physfn(pdev)); + info->ats_qdep = pci_ats_queue_depth(pdev); + } + if (sm_supported(iommu)) { + if (pasid_supported(iommu)) { + int features = pci_pasid_features(pdev); + + if (features >= 0) + info->pasid_supported = features | 1; + } + + if (info->ats_supported && ecap_prs(iommu->ecap) && + ecap_pds(iommu->ecap) && pci_pri_supported(pdev)) + info->pri_supported = 1; + } + } + + dev_iommu_priv_set(dev, info); + if (pdev && pci_ats_supported(pdev)) { + pci_prepare_ats(pdev, VTD_PAGE_SHIFT); + ret = device_rbtree_insert(iommu, info); + if (ret) + goto free; + } + + if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { + ret = intel_pasid_alloc_table(dev); + if (ret) { + dev_err(dev, "PASID table allocation failed\n"); + goto clear_rbtree; + } + + if (!context_copied(iommu, info->bus, info->devfn)) { + ret = intel_pasid_setup_sm_context(dev); + if (ret) + goto free_table; + } + } + + intel_iommu_debugfs_create_dev(info); + + return &iommu->iommu; +free_table: + intel_pasid_free_table(dev); +clear_rbtree: + device_rbtree_remove(info); +free: + kfree(info); + + return ERR_PTR(ret); +} + +static void intel_iommu_probe_finalize(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + + /* + * The PCIe spec, in its wisdom, declares that the behaviour of the + * device is undefined if you enable PASID support after ATS support. + * So always enable PASID support on devices which have it, even if + * we can't yet know if we're ever going to use it. + */ + if (info->pasid_supported && + !pci_enable_pasid(to_pci_dev(dev), info->pasid_supported & ~1)) + info->pasid_enabled = 1; + + if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { + iommu_enable_pci_ats(info); + /* Assign a DEVTLB cache tag to the default domain. */ + if (info->ats_enabled && info->domain) { + u16 did = domain_id_iommu(info->domain, iommu); + + if (cache_tag_assign(info->domain, did, dev, + IOMMU_NO_PASID, CACHE_TAG_DEVTLB)) + iommu_disable_pci_ats(info); + } + } + iommu_enable_pci_pri(info); +} + +static void intel_iommu_release_device(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + + iommu_disable_pci_pri(info); + iommu_disable_pci_ats(info); + + if (info->pasid_enabled) { + pci_disable_pasid(to_pci_dev(dev)); + info->pasid_enabled = 0; + } + + mutex_lock(&iommu->iopf_lock); + if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev))) + device_rbtree_remove(info); + mutex_unlock(&iommu->iopf_lock); + + if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && + !context_copied(iommu, info->bus, info->devfn)) + intel_pasid_teardown_sm_context(dev); + + intel_pasid_free_table(dev); + intel_iommu_debugfs_remove_dev(info); + kfree(info); +} + +static void intel_iommu_get_resv_regions(struct device *device, + struct list_head *head) +{ + int prot = DMA_PTE_READ | DMA_PTE_WRITE; + struct iommu_resv_region *reg; + struct dmar_rmrr_unit *rmrr; + struct device *i_dev; + int i; + + rcu_read_lock(); + for_each_rmrr_units(rmrr) { + for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, + i, i_dev) { + struct iommu_resv_region *resv; + enum iommu_resv_type type; + size_t length; + + if (i_dev != device && + !is_downstream_to_pci_bridge(device, i_dev)) + continue; + + length = rmrr->end_address - rmrr->base_address + 1; + + type = device_rmrr_is_relaxable(device) ? + IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; + + resv = iommu_alloc_resv_region(rmrr->base_address, + length, prot, type, + GFP_ATOMIC); + if (!resv) + break; + + list_add_tail(&resv->list, head); + } + } + rcu_read_unlock(); + +#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA + if (dev_is_pci(device)) { + struct pci_dev *pdev = to_pci_dev(device); + + if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { + reg = iommu_alloc_resv_region(0, 1UL << 24, prot, + IOMMU_RESV_DIRECT_RELAXABLE, + GFP_KERNEL); + if (reg) + list_add_tail(®->list, head); + } + } +#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ + + reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, + IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, + 0, IOMMU_RESV_MSI, GFP_KERNEL); + if (!reg) + return; + list_add_tail(®->list, head); +} + +static struct iommu_group *intel_iommu_device_group(struct device *dev) +{ + if (dev_is_pci(dev)) + return pci_device_group(dev); + return generic_device_group(dev); +} + +int intel_iommu_enable_iopf(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + int ret; + + if (!info->pri_enabled) + return -ENODEV; + + /* pri_enabled is protected by the group mutex. */ + iommu_group_mutex_assert(dev); + if (info->iopf_refcount) { + info->iopf_refcount++; + return 0; + } + + ret = iopf_queue_add_device(iommu->iopf_queue, dev); + if (ret) + return ret; + + info->iopf_refcount = 1; + + return 0; +} + +void intel_iommu_disable_iopf(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + + if (WARN_ON(!info->pri_enabled || !info->iopf_refcount)) + return; + + iommu_group_mutex_assert(dev); + if (--info->iopf_refcount) + return; + + iopf_queue_remove_device(iommu->iopf_queue, dev); +} + +static bool intel_iommu_is_attach_deferred(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + return translation_pre_enabled(info->iommu) && !info->domain; +} + +/* + * Check that the device does not live on an external facing PCI port that is + * marked as untrusted. Such devices should not be able to apply quirks and + * thus not be able to bypass the IOMMU restrictions. + */ +static bool risky_device(struct pci_dev *pdev) +{ + if (pdev->untrusted) { + pci_info(pdev, + "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", + pdev->vendor, pdev->device); + pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); + return true; + } + return false; +} + +static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, + unsigned long iova, size_t size) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + + if (dmar_domain->iotlb_sync_map) + cache_tag_flush_range_np(dmar_domain, iova, iova + size - 1); + + return 0; +} + +void domain_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dev_pasid_info *curr, *dev_pasid = NULL; + struct intel_iommu *iommu = info->iommu; + struct dmar_domain *dmar_domain; + unsigned long flags; + + if (!domain) + return; + + /* Identity domain has no meta data for pasid. */ + if (domain->type == IOMMU_DOMAIN_IDENTITY) + return; + + dmar_domain = to_dmar_domain(domain); + spin_lock_irqsave(&dmar_domain->lock, flags); + list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { + if (curr->dev == dev && curr->pasid == pasid) { + list_del(&curr->link_domain); + dev_pasid = curr; + break; + } + } + spin_unlock_irqrestore(&dmar_domain->lock, flags); + + cache_tag_unassign_domain(dmar_domain, dev, pasid); + domain_detach_iommu(dmar_domain, iommu); + if (!WARN_ON_ONCE(!dev_pasid)) { + intel_iommu_debugfs_remove_dev_pasid(dev_pasid); + kfree(dev_pasid); + } +} + +static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + intel_pasid_tear_down_entry(info->iommu, dev, pasid, false); + iopf_for_domain_remove(old, dev); + domain_remove_dev_pasid(old, dev, pasid); + + return 0; +} + +struct dev_pasid_info * +domain_add_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + struct dev_pasid_info *dev_pasid; + unsigned long flags; + int ret; + + dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); + if (!dev_pasid) + return ERR_PTR(-ENOMEM); + + ret = domain_attach_iommu(dmar_domain, iommu); + if (ret) + goto out_free; + + ret = cache_tag_assign_domain(dmar_domain, dev, pasid); + if (ret) + goto out_detach_iommu; + + dev_pasid->dev = dev; + dev_pasid->pasid = pasid; + spin_lock_irqsave(&dmar_domain->lock, flags); + list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); + spin_unlock_irqrestore(&dmar_domain->lock, flags); + + return dev_pasid; +out_detach_iommu: + domain_detach_iommu(dmar_domain, iommu); +out_free: + kfree(dev_pasid); + return ERR_PTR(ret); +} + +static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + struct dev_pasid_info *dev_pasid; + int ret; + + if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING))) + return -EINVAL; + + if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) + return -EOPNOTSUPP; + + if (domain->dirty_ops) + return -EINVAL; + + if (context_copied(iommu, info->bus, info->devfn)) + return -EBUSY; + + ret = paging_domain_compatible(domain, dev); + if (ret) + return ret; + + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); + if (IS_ERR(dev_pasid)) + return PTR_ERR(dev_pasid); + + ret = iopf_for_domain_replace(domain, old, dev); + if (ret) + goto out_remove_dev_pasid; + + if (intel_domain_is_fs_paging(dmar_domain)) + ret = domain_setup_first_level(iommu, dmar_domain, + dev, pasid, old); + else if (intel_domain_is_ss_paging(dmar_domain)) + ret = domain_setup_second_level(iommu, dmar_domain, + dev, pasid, old); + else if (WARN_ON(true)) + ret = -EINVAL; + + if (ret) + goto out_unwind_iopf; + + domain_remove_dev_pasid(old, dev, pasid); + + intel_iommu_debugfs_create_dev_pasid(dev_pasid); + + return 0; + +out_unwind_iopf: + iopf_for_domain_replace(old, domain, dev); +out_remove_dev_pasid: + domain_remove_dev_pasid(domain, dev, pasid); + return ret; +} + +static void *intel_iommu_hw_info(struct device *dev, u32 *length, + enum iommu_hw_info_type *type) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct iommu_hw_info_vtd *vtd; + + if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && + *type != IOMMU_HW_INFO_TYPE_INTEL_VTD) + return ERR_PTR(-EOPNOTSUPP); + + vtd = kzalloc(sizeof(*vtd), GFP_KERNEL); + if (!vtd) + return ERR_PTR(-ENOMEM); + + vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17; + vtd->cap_reg = iommu->cap; + vtd->ecap_reg = iommu->ecap; + *length = sizeof(*vtd); + *type = IOMMU_HW_INFO_TYPE_INTEL_VTD; + return vtd; +} + +/* + * Set dirty tracking for the device list of a domain. The caller must + * hold the domain->lock when calling it. + */ +static int device_set_dirty_tracking(struct list_head *devices, bool enable) +{ + struct device_domain_info *info; + int ret = 0; + + list_for_each_entry(info, devices, link) { + ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev, + IOMMU_NO_PASID, enable); + if (ret) + break; + } + + return ret; +} + +static int parent_domain_set_dirty_tracking(struct dmar_domain *domain, + bool enable) +{ + struct dmar_domain *s1_domain; + unsigned long flags; + int ret; + + spin_lock(&domain->s1_lock); + list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { + spin_lock_irqsave(&s1_domain->lock, flags); + ret = device_set_dirty_tracking(&s1_domain->devices, enable); + spin_unlock_irqrestore(&s1_domain->lock, flags); + if (ret) + goto err_unwind; + } + spin_unlock(&domain->s1_lock); + return 0; + +err_unwind: + list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { + spin_lock_irqsave(&s1_domain->lock, flags); + device_set_dirty_tracking(&s1_domain->devices, + domain->dirty_tracking); + spin_unlock_irqrestore(&s1_domain->lock, flags); + } + spin_unlock(&domain->s1_lock); + return ret; +} + +static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + int ret; + + spin_lock(&dmar_domain->lock); + if (dmar_domain->dirty_tracking == enable) + goto out_unlock; + + ret = device_set_dirty_tracking(&dmar_domain->devices, enable); + if (ret) + goto err_unwind; + + if (dmar_domain->nested_parent) { + ret = parent_domain_set_dirty_tracking(dmar_domain, enable); + if (ret) + goto err_unwind; + } + + dmar_domain->dirty_tracking = enable; +out_unlock: + spin_unlock(&dmar_domain->lock); + + return 0; + +err_unwind: + device_set_dirty_tracking(&dmar_domain->devices, + dmar_domain->dirty_tracking); + spin_unlock(&dmar_domain->lock); + return ret; +} + +static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct context_entry *context; + + spin_lock(&iommu->lock); + context = iommu_context_addr(iommu, bus, devfn, 1); + if (!context) { + spin_unlock(&iommu->lock); + return -ENOMEM; + } + + if (context_present(context) && !context_copied(iommu, bus, devfn)) { + spin_unlock(&iommu->lock); + return 0; + } + + copied_context_tear_down(iommu, context, bus, devfn); + context_clear_entry(context); + context_set_domain_id(context, FLPT_DEFAULT_DID); + + /* + * In pass through mode, AW must be programmed to indicate the largest + * AGAW value supported by hardware. And ASR is ignored by hardware. + */ + context_set_address_width(context, iommu->msagaw); + context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH); + context_set_fault_enable(context); + context_set_present(context); + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(context, sizeof(*context)); + context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn); + spin_unlock(&iommu->lock); + + return 0; +} + +static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data) +{ + struct device *dev = data; + + return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff); +} + +static int device_setup_pass_through(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + if (!dev_is_pci(dev)) + return context_setup_pass_through(dev, info->bus, info->devfn); + + return pci_for_each_dma_alias(to_pci_dev(dev), + context_setup_pass_through_cb, dev); +} + +static int identity_domain_attach_dev(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + int ret; + + device_block_translation(dev); + + if (dev_is_real_dma_subdevice(dev)) + return 0; + + /* + * No PRI support with the global identity domain. No need to enable or + * disable PRI in this path as the iommu has been put in the blocking + * state. + */ + if (sm_supported(iommu)) + ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); + else + ret = device_setup_pass_through(dev); + + if (!ret) + info->domain_attached = true; + + return ret; +} + +static int identity_domain_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + int ret; + + if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) + return -EOPNOTSUPP; + + ret = iopf_for_domain_replace(domain, old, dev); + if (ret) + return ret; + + ret = domain_setup_passthrough(iommu, dev, pasid, old); + if (ret) { + iopf_for_domain_replace(old, domain, dev); + return ret; + } + + domain_remove_dev_pasid(old, dev, pasid); + return 0; +} + +static struct iommu_domain identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = identity_domain_attach_dev, + .set_dev_pasid = identity_domain_set_dev_pasid, + }, +}; + +const struct iommu_domain_ops intel_fs_paging_domain_ops = { + IOMMU_PT_DOMAIN_OPS(x86_64), + .attach_dev = intel_iommu_attach_device, + .set_dev_pasid = intel_iommu_set_dev_pasid, + .iotlb_sync_map = intel_iommu_iotlb_sync_map, + .flush_iotlb_all = intel_flush_iotlb_all, + .iotlb_sync = intel_iommu_tlb_sync, + .free = intel_iommu_domain_free, + .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_fs, +}; + +const struct iommu_domain_ops intel_ss_paging_domain_ops = { + IOMMU_PT_DOMAIN_OPS(vtdss), + .attach_dev = intel_iommu_attach_device, + .set_dev_pasid = intel_iommu_set_dev_pasid, + .iotlb_sync_map = intel_iommu_iotlb_sync_map, + .flush_iotlb_all = intel_flush_iotlb_all, + .iotlb_sync = intel_iommu_tlb_sync, + .free = intel_iommu_domain_free, + .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_ss, +}; + +const struct iommu_ops intel_iommu_ops = { + .blocked_domain = &blocking_domain, + .release_domain = &blocking_domain, + .identity_domain = &identity_domain, + .capable = intel_iommu_capable, + .hw_info = intel_iommu_hw_info, + .domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags, + .domain_alloc_sva = intel_svm_domain_alloc, + .domain_alloc_nested = intel_iommu_domain_alloc_nested, + .probe_device = intel_iommu_probe_device, + .probe_finalize = intel_iommu_probe_finalize, + .release_device = intel_iommu_release_device, + .get_resv_regions = intel_iommu_get_resv_regions, + .device_group = intel_iommu_device_group, + .is_attach_deferred = intel_iommu_is_attach_deferred, + .def_domain_type = device_def_domain_type, + .page_response = intel_iommu_page_response, +}; + +static void quirk_iommu_igfx(struct pci_dev *dev) +{ + if (risky_device(dev)) + return; + + pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); + disable_igfx_iommu = 1; +} + +/* G4x/GM45 integrated gfx dmar support is totally busted. */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); + +/* QM57/QS57 integrated gfx malfunctions with dmar */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_iommu_igfx); + +/* Broadwell igfx malfunctions with dmar */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); + +static void quirk_iommu_rwbf(struct pci_dev *dev) +{ + if (risky_device(dev)) + return; + + /* + * Mobile 4 Series Chipset neglects to set RWBF capability, + * but needs it. Same seems to hold for the desktop versions. + */ + pci_info(dev, "Forcing write-buffer flush capability\n"); + rwbf_quirk = 1; +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); + +#define GGC 0x52 +#define GGC_MEMORY_SIZE_MASK (0xf << 8) +#define GGC_MEMORY_SIZE_NONE (0x0 << 8) +#define GGC_MEMORY_SIZE_1M (0x1 << 8) +#define GGC_MEMORY_SIZE_2M (0x3 << 8) +#define GGC_MEMORY_VT_ENABLED (0x8 << 8) +#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) +#define GGC_MEMORY_SIZE_3M_VT (0xa << 8) +#define GGC_MEMORY_SIZE_4M_VT (0xb << 8) + +static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) +{ + unsigned short ggc; + + if (risky_device(dev)) + return; + + if (pci_read_config_word(dev, GGC, &ggc)) + return; + + if (!(ggc & GGC_MEMORY_VT_ENABLED)) { + pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); + disable_igfx_iommu = 1; + } else if (!disable_igfx_iommu) { + /* we have to ensure the gfx device is idle before we flush */ + pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); + iommu_set_dma_strict(); + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); + +static void quirk_igfx_skip_te_disable(struct pci_dev *dev) +{ + unsigned short ver; + + if (!IS_GFX_DEVICE(dev)) + return; + + ver = (dev->device >> 8) & 0xff; + if (ver != 0x45 && ver != 0x46 && ver != 0x4c && + ver != 0x4e && ver != 0x8a && ver != 0x98 && + ver != 0x9a && ver != 0xa7 && ver != 0x7d) + return; + + if (risky_device(dev)) + return; + + pci_info(dev, "Skip IOMMU disabling for graphics\n"); + iommu_skip_te_disable = 1; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); + +/* On Tylersburg chipsets, some BIOSes have been known to enable the + ISOCH DMAR unit for the Azalia sound device, but not give it any + TLB entries, which causes it to deadlock. Check for that. We do + this in a function called from init_dmars(), instead of in a PCI + quirk, because we don't want to print the obnoxious "BIOS broken" + message if VT-d is actually disabled. +*/ +static void __init check_tylersburg_isoch(void) +{ + struct pci_dev *pdev; + uint32_t vtisochctrl; + + /* If there's no Azalia in the system anyway, forget it. */ + pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); + if (!pdev) + return; + + if (risky_device(pdev)) { + pci_dev_put(pdev); + return; + } + + pci_dev_put(pdev); + + /* System Management Registers. Might be hidden, in which case + we can't do the sanity check. But that's OK, because the + known-broken BIOSes _don't_ actually hide it, so far. */ + pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); + if (!pdev) + return; + + if (risky_device(pdev)) { + pci_dev_put(pdev); + return; + } + + if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { + pci_dev_put(pdev); + return; + } + + pci_dev_put(pdev); + + /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ + if (vtisochctrl & 1) + return; + + /* Drop all bits other than the number of TLB entries */ + vtisochctrl &= 0x1c; + + /* If we have the recommended number of TLB entries (16), fine. */ + if (vtisochctrl == 0x10) + return; + + /* Zero TLB entries? You get to ride the short bus to school. */ + if (!vtisochctrl) { + WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" + "BIOS vendor: %s; Ver: %s; Product Version: %s\n", + dmi_get_system_info(DMI_BIOS_VENDOR), + dmi_get_system_info(DMI_BIOS_VERSION), + dmi_get_system_info(DMI_PRODUCT_VERSION)); + iommu_identity_mapping |= IDENTMAP_AZALIA; + return; + } + + pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", + vtisochctrl); +} + +/* + * Here we deal with a device TLB defect where device may inadvertently issue ATS + * invalidation completion before posted writes initiated with translated address + * that utilized translations matching the invalidation address range, violating + * the invalidation completion ordering. + * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is + * vulnerable to this defect. In other words, any dTLB invalidation initiated not + * under the control of the trusted/privileged host device driver must use this + * quirk. + * Device TLBs are invalidated under the following six conditions: + * 1. Device driver does DMA API unmap IOVA + * 2. Device driver unbind a PASID from a process, sva_unbind_device() + * 3. PASID is torn down, after PASID cache is flushed. e.g. process + * exit_mmap() due to crash + * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where + * VM has to free pages that were unmapped + * 5. Userspace driver unmaps a DMA buffer + * 6. Cache invalidation in vSVA usage (upcoming) + * + * For #1 and #2, device drivers are responsible for stopping DMA traffic + * before unmap/unbind. For #3, iommu driver gets mmu_notifier to + * invalidate TLB the same way as normal user unmap which will use this quirk. + * The dTLB invalidation after PASID cache flush does not need this quirk. + * + * As a reminder, #6 will *NEED* this quirk as we enable nested translation. + */ +void quirk_extra_dev_tlb_flush(struct device_domain_info *info, + unsigned long address, unsigned long mask, + u32 pasid, u16 qdep) +{ + u16 sid; + + if (likely(!info->dtlb_extra_inval)) + return; + + sid = PCI_DEVID(info->bus, info->devfn); + if (pasid == IOMMU_NO_PASID) { + qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, + qdep, address, mask); + } else { + qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid, + pasid, qdep, address, mask); + } +} + +#define ecmd_get_status_code(res) (((res) & 0xff) >> 1) + +/* + * Function to submit a command to the enhanced command interface. The + * valid enhanced command descriptions are defined in Table 47 of the + * VT-d spec. The VT-d hardware implementation may support some but not + * all commands, which can be determined by checking the Enhanced + * Command Capability Register. + * + * Return values: + * - 0: Command successful without any error; + * - Negative: software error value; + * - Nonzero positive: failure status code defined in Table 48. + */ +int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob) +{ + unsigned long flags; + u64 res; + int ret; + + if (!cap_ecmds(iommu->cap)) + return -ENODEV; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + + res = dmar_readq(iommu->reg + DMAR_ECRSP_REG); + if (res & DMA_ECMD_ECRSP_IP) { + ret = -EBUSY; + goto err; + } + + /* + * Unconditionally write the operand B, because + * - There is no side effect if an ecmd doesn't require an + * operand B, but we set the register to some value. + * - It's not invoked in any critical path. The extra MMIO + * write doesn't bring any performance concerns. + */ + dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob); + dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT)); + + IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq, + !(res & DMA_ECMD_ECRSP_IP), res); + + if (res & DMA_ECMD_ECRSP_IP) { + ret = -ETIMEDOUT; + goto err; + } + + ret = ecmd_get_status_code(res); +err: + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + + return ret; +} + +MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h new file mode 100644 index 000000000000..25c5e22096d4 --- /dev/null +++ b/drivers/iommu/intel/iommu.h @@ -0,0 +1,1380 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright © 2006-2015, Intel Corporation. + * + * Authors: Ashok Raj <ashok.raj@intel.com> + * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> + * David Woodhouse <David.Woodhouse@intel.com> + */ + +#ifndef _INTEL_IOMMU_H_ +#define _INTEL_IOMMU_H_ + +#include <linux/types.h> +#include <linux/iova.h> +#include <linux/io.h> +#include <linux/idr.h> +#include <linux/mmu_notifier.h> +#include <linux/list.h> +#include <linux/iommu.h> +#include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/dmar.h> +#include <linux/bitfield.h> +#include <linux/xarray.h> +#include <linux/perf_event.h> +#include <linux/pci.h> +#include <linux/generic_pt/iommu.h> + +#include <asm/iommu.h> +#include <uapi/linux/iommufd.h> + +/* + * VT-d hardware uses 4KiB page size regardless of host page size. + */ +#define VTD_PAGE_SHIFT (12) +#define VTD_PAGE_SIZE (1UL << VTD_PAGE_SHIFT) +#define VTD_PAGE_MASK (((u64)-1) << VTD_PAGE_SHIFT) +#define VTD_PAGE_ALIGN(addr) (((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK) + +#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) + +#define VTD_STRIDE_SHIFT (9) +#define VTD_STRIDE_MASK (((u64)-1) << VTD_STRIDE_SHIFT) + +#define DMA_PTE_READ BIT_ULL(0) +#define DMA_PTE_WRITE BIT_ULL(1) +#define DMA_PTE_LARGE_PAGE BIT_ULL(7) +#define DMA_PTE_SNP BIT_ULL(11) + +#define DMA_FL_PTE_PRESENT BIT_ULL(0) +#define DMA_FL_PTE_US BIT_ULL(2) +#define DMA_FL_PTE_ACCESS BIT_ULL(5) +#define DMA_FL_PTE_DIRTY BIT_ULL(6) + +#define DMA_SL_PTE_DIRTY_BIT 9 +#define DMA_SL_PTE_DIRTY BIT_ULL(DMA_SL_PTE_DIRTY_BIT) + +#define ADDR_WIDTH_5LEVEL (57) +#define ADDR_WIDTH_4LEVEL (48) + +#define CONTEXT_TT_MULTI_LEVEL 0 +#define CONTEXT_TT_DEV_IOTLB 1 +#define CONTEXT_TT_PASS_THROUGH 2 +#define CONTEXT_PASIDE BIT_ULL(3) + +/* + * Intel IOMMU register specification per version 1.0 public spec. + */ +#define DMAR_VER_REG 0x0 /* Arch version supported by this IOMMU */ +#define DMAR_CAP_REG 0x8 /* Hardware supported capabilities */ +#define DMAR_ECAP_REG 0x10 /* Extended capabilities supported */ +#define DMAR_GCMD_REG 0x18 /* Global command register */ +#define DMAR_GSTS_REG 0x1c /* Global status register */ +#define DMAR_RTADDR_REG 0x20 /* Root entry table */ +#define DMAR_CCMD_REG 0x28 /* Context command reg */ +#define DMAR_FSTS_REG 0x34 /* Fault Status register */ +#define DMAR_FECTL_REG 0x38 /* Fault control register */ +#define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data register */ +#define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr register */ +#define DMAR_FEUADDR_REG 0x44 /* Upper address register */ +#define DMAR_PMEN_REG 0x64 /* Enable Protected Memory Region */ +#define DMAR_PLMBASE_REG 0x68 /* PMRR Low addr */ +#define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */ +#define DMAR_PHMBASE_REG 0x70 /* pmrr high base addr */ +#define DMAR_PHMLIMIT_REG 0x78 /* pmrr high limit */ +#define DMAR_IQH_REG 0x80 /* Invalidation queue head register */ +#define DMAR_IQT_REG 0x88 /* Invalidation queue tail register */ +#define DMAR_IQ_SHIFT 4 /* Invalidation queue head/tail shift */ +#define DMAR_IQA_REG 0x90 /* Invalidation queue addr register */ +#define DMAR_ICS_REG 0x9c /* Invalidation complete status register */ +#define DMAR_IQER_REG 0xb0 /* Invalidation queue error record register */ +#define DMAR_IRTA_REG 0xb8 /* Interrupt remapping table addr register */ +#define DMAR_PQH_REG 0xc0 /* Page request queue head register */ +#define DMAR_PQT_REG 0xc8 /* Page request queue tail register */ +#define DMAR_PQA_REG 0xd0 /* Page request queue address register */ +#define DMAR_PRS_REG 0xdc /* Page request status register */ +#define DMAR_PECTL_REG 0xe0 /* Page request event control register */ +#define DMAR_PEDATA_REG 0xe4 /* Page request event interrupt data register */ +#define DMAR_PEADDR_REG 0xe8 /* Page request event interrupt addr register */ +#define DMAR_PEUADDR_REG 0xec /* Page request event Upper address register */ +#define DMAR_MTRRCAP_REG 0x100 /* MTRR capability register */ +#define DMAR_MTRRDEF_REG 0x108 /* MTRR default type register */ +#define DMAR_MTRR_FIX64K_00000_REG 0x120 /* MTRR Fixed range registers */ +#define DMAR_MTRR_FIX16K_80000_REG 0x128 +#define DMAR_MTRR_FIX16K_A0000_REG 0x130 +#define DMAR_MTRR_FIX4K_C0000_REG 0x138 +#define DMAR_MTRR_FIX4K_C8000_REG 0x140 +#define DMAR_MTRR_FIX4K_D0000_REG 0x148 +#define DMAR_MTRR_FIX4K_D8000_REG 0x150 +#define DMAR_MTRR_FIX4K_E0000_REG 0x158 +#define DMAR_MTRR_FIX4K_E8000_REG 0x160 +#define DMAR_MTRR_FIX4K_F0000_REG 0x168 +#define DMAR_MTRR_FIX4K_F8000_REG 0x170 +#define DMAR_MTRR_PHYSBASE0_REG 0x180 /* MTRR Variable range registers */ +#define DMAR_MTRR_PHYSMASK0_REG 0x188 +#define DMAR_MTRR_PHYSBASE1_REG 0x190 +#define DMAR_MTRR_PHYSMASK1_REG 0x198 +#define DMAR_MTRR_PHYSBASE2_REG 0x1a0 +#define DMAR_MTRR_PHYSMASK2_REG 0x1a8 +#define DMAR_MTRR_PHYSBASE3_REG 0x1b0 +#define DMAR_MTRR_PHYSMASK3_REG 0x1b8 +#define DMAR_MTRR_PHYSBASE4_REG 0x1c0 +#define DMAR_MTRR_PHYSMASK4_REG 0x1c8 +#define DMAR_MTRR_PHYSBASE5_REG 0x1d0 +#define DMAR_MTRR_PHYSMASK5_REG 0x1d8 +#define DMAR_MTRR_PHYSBASE6_REG 0x1e0 +#define DMAR_MTRR_PHYSMASK6_REG 0x1e8 +#define DMAR_MTRR_PHYSBASE7_REG 0x1f0 +#define DMAR_MTRR_PHYSMASK7_REG 0x1f8 +#define DMAR_MTRR_PHYSBASE8_REG 0x200 +#define DMAR_MTRR_PHYSMASK8_REG 0x208 +#define DMAR_MTRR_PHYSBASE9_REG 0x210 +#define DMAR_MTRR_PHYSMASK9_REG 0x218 +#define DMAR_PERFCAP_REG 0x300 +#define DMAR_PERFCFGOFF_REG 0x310 +#define DMAR_PERFOVFOFF_REG 0x318 +#define DMAR_PERFCNTROFF_REG 0x31c +#define DMAR_PERFINTRSTS_REG 0x324 +#define DMAR_PERFINTRCTL_REG 0x328 +#define DMAR_PERFEVNTCAP_REG 0x380 +#define DMAR_ECMD_REG 0x400 +#define DMAR_ECEO_REG 0x408 +#define DMAR_ECRSP_REG 0x410 +#define DMAR_ECCAP_REG 0x430 + +#define DMAR_IQER_REG_IQEI(reg) FIELD_GET(GENMASK_ULL(3, 0), reg) +#define DMAR_IQER_REG_ITESID(reg) FIELD_GET(GENMASK_ULL(47, 32), reg) +#define DMAR_IQER_REG_ICESID(reg) FIELD_GET(GENMASK_ULL(63, 48), reg) + +#define OFFSET_STRIDE (9) + +#define dmar_readq(a) readq(a) +#define dmar_writeq(a,v) writeq(v,a) +#define dmar_readl(a) readl(a) +#define dmar_writel(a, v) writel(v, a) + +#define DMAR_VER_MAJOR(v) (((v) & 0xf0) >> 4) +#define DMAR_VER_MINOR(v) ((v) & 0x0f) + +/* + * Decoding Capability Register + */ +#define cap_esrtps(c) (((c) >> 63) & 1) +#define cap_esirtps(c) (((c) >> 62) & 1) +#define cap_ecmds(c) (((c) >> 61) & 1) +#define cap_fl5lp_support(c) (((c) >> 60) & 1) +#define cap_pi_support(c) (((c) >> 59) & 1) +#define cap_fl1gp_support(c) (((c) >> 56) & 1) +#define cap_read_drain(c) (((c) >> 55) & 1) +#define cap_write_drain(c) (((c) >> 54) & 1) +#define cap_max_amask_val(c) (((c) >> 48) & 0x3f) +#define cap_num_fault_regs(c) ((((c) >> 40) & 0xff) + 1) +#define cap_pgsel_inv(c) (((c) >> 39) & 1) + +#define cap_super_page_val(c) (((c) >> 34) & 0xf) + +#define cap_fault_reg_offset(c) ((((c) >> 24) & 0x3ff) * 16) +#define cap_max_fault_reg_offset(c) \ + (cap_fault_reg_offset(c) + cap_num_fault_regs(c) * 16) + +#define cap_zlr(c) (((c) >> 22) & 1) +#define cap_isoch(c) (((c) >> 23) & 1) +#define cap_mgaw(c) ((((c) >> 16) & 0x3f) + 1) +#define cap_sagaw(c) (((c) >> 8) & 0x1f) +#define cap_caching_mode(c) (((c) >> 7) & 1) +#define cap_phmr(c) (((c) >> 6) & 1) +#define cap_plmr(c) (((c) >> 5) & 1) +#define cap_rwbf(c) (((c) >> 4) & 1) +#define cap_afl(c) (((c) >> 3) & 1) +#define cap_ndoms(c) (((unsigned long)1) << (4 + 2 * ((c) & 0x7))) +/* + * Extended Capability Register + */ + +#define ecap_pms(e) (((e) >> 51) & 0x1) +#define ecap_rps(e) (((e) >> 49) & 0x1) +#define ecap_smpwc(e) (((e) >> 48) & 0x1) +#define ecap_flts(e) (((e) >> 47) & 0x1) +#define ecap_slts(e) (((e) >> 46) & 0x1) +#define ecap_slads(e) (((e) >> 45) & 0x1) +#define ecap_smts(e) (((e) >> 43) & 0x1) +#define ecap_dit(e) (((e) >> 41) & 0x1) +#define ecap_pds(e) (((e) >> 42) & 0x1) +#define ecap_pasid(e) (((e) >> 40) & 0x1) +#define ecap_pss(e) (((e) >> 35) & 0x1f) +#define ecap_eafs(e) (((e) >> 34) & 0x1) +#define ecap_nwfs(e) (((e) >> 33) & 0x1) +#define ecap_srs(e) (((e) >> 31) & 0x1) +#define ecap_ers(e) (((e) >> 30) & 0x1) +#define ecap_prs(e) (((e) >> 29) & 0x1) +#define ecap_broken_pasid(e) (((e) >> 28) & 0x1) +#define ecap_dis(e) (((e) >> 27) & 0x1) +#define ecap_nest(e) (((e) >> 26) & 0x1) +#define ecap_mts(e) (((e) >> 25) & 0x1) +#define ecap_iotlb_offset(e) ((((e) >> 8) & 0x3ff) * 16) +#define ecap_max_iotlb_offset(e) (ecap_iotlb_offset(e) + 16) +#define ecap_coherent(e) ((e) & 0x1) +#define ecap_qis(e) ((e) & 0x2) +#define ecap_pass_through(e) (((e) >> 6) & 0x1) +#define ecap_eim_support(e) (((e) >> 4) & 0x1) +#define ecap_ir_support(e) (((e) >> 3) & 0x1) +#define ecap_dev_iotlb_support(e) (((e) >> 2) & 0x1) +#define ecap_max_handle_mask(e) (((e) >> 20) & 0xf) +#define ecap_sc_support(e) (((e) >> 7) & 0x1) /* Snooping Control */ + +/* + * Decoding Perf Capability Register + */ +#define pcap_num_cntr(p) ((p) & 0xffff) +#define pcap_cntr_width(p) (((p) >> 16) & 0x7f) +#define pcap_num_event_group(p) (((p) >> 24) & 0x1f) +#define pcap_filters_mask(p) (((p) >> 32) & 0x1f) +#define pcap_interrupt(p) (((p) >> 50) & 0x1) +/* The counter stride is calculated as 2 ^ (x+10) bytes */ +#define pcap_cntr_stride(p) (1ULL << ((((p) >> 52) & 0x7) + 10)) + +/* + * Decoding Perf Event Capability Register + */ +#define pecap_es(p) ((p) & 0xfffffff) + +/* Virtual command interface capability */ +#define vccap_pasid(v) (((v) & DMA_VCS_PAS)) /* PASID allocation */ + +/* IOTLB_REG */ +#define DMA_TLB_FLUSH_GRANU_OFFSET 60 +#define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60) +#define DMA_TLB_DSI_FLUSH (((u64)2) << 60) +#define DMA_TLB_PSI_FLUSH (((u64)3) << 60) +#define DMA_TLB_IIRG(type) ((type >> 60) & 3) +#define DMA_TLB_IAIG(val) (((val) >> 57) & 3) +#define DMA_TLB_READ_DRAIN (((u64)1) << 49) +#define DMA_TLB_WRITE_DRAIN (((u64)1) << 48) +#define DMA_TLB_DID(id) (((u64)((id) & 0xffff)) << 32) +#define DMA_TLB_IVT (((u64)1) << 63) +#define DMA_TLB_IH_NONLEAF (((u64)1) << 6) +#define DMA_TLB_MAX_SIZE (0x3f) + +/* INVALID_DESC */ +#define DMA_CCMD_INVL_GRANU_OFFSET 61 +#define DMA_ID_TLB_GLOBAL_FLUSH (((u64)1) << 4) +#define DMA_ID_TLB_DSI_FLUSH (((u64)2) << 4) +#define DMA_ID_TLB_PSI_FLUSH (((u64)3) << 4) +#define DMA_ID_TLB_READ_DRAIN (((u64)1) << 7) +#define DMA_ID_TLB_WRITE_DRAIN (((u64)1) << 6) +#define DMA_ID_TLB_DID(id) (((u64)((id & 0xffff) << 16))) +#define DMA_ID_TLB_IH_NONLEAF (((u64)1) << 6) +#define DMA_ID_TLB_ADDR(addr) (addr) +#define DMA_ID_TLB_ADDR_MASK(mask) (mask) + +/* PMEN_REG */ +#define DMA_PMEN_EPM (((u32)1)<<31) +#define DMA_PMEN_PRS (((u32)1)<<0) + +/* GCMD_REG */ +#define DMA_GCMD_TE (((u32)1) << 31) +#define DMA_GCMD_SRTP (((u32)1) << 30) +#define DMA_GCMD_SFL (((u32)1) << 29) +#define DMA_GCMD_EAFL (((u32)1) << 28) +#define DMA_GCMD_WBF (((u32)1) << 27) +#define DMA_GCMD_QIE (((u32)1) << 26) +#define DMA_GCMD_SIRTP (((u32)1) << 24) +#define DMA_GCMD_IRE (((u32) 1) << 25) +#define DMA_GCMD_CFI (((u32) 1) << 23) + +/* GSTS_REG */ +#define DMA_GSTS_TES (((u32)1) << 31) +#define DMA_GSTS_RTPS (((u32)1) << 30) +#define DMA_GSTS_FLS (((u32)1) << 29) +#define DMA_GSTS_AFLS (((u32)1) << 28) +#define DMA_GSTS_WBFS (((u32)1) << 27) +#define DMA_GSTS_QIES (((u32)1) << 26) +#define DMA_GSTS_IRTPS (((u32)1) << 24) +#define DMA_GSTS_IRES (((u32)1) << 25) +#define DMA_GSTS_CFIS (((u32)1) << 23) + +/* DMA_RTADDR_REG */ +#define DMA_RTADDR_SMT (((u64)1) << 10) + +/* CCMD_REG */ +#define DMA_CCMD_ICC (((u64)1) << 63) +#define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61) +#define DMA_CCMD_DOMAIN_INVL (((u64)2) << 61) +#define DMA_CCMD_DEVICE_INVL (((u64)3) << 61) +#define DMA_CCMD_FM(m) (((u64)((m) & 0x3)) << 32) +#define DMA_CCMD_MASK_NOBIT 0 +#define DMA_CCMD_MASK_1BIT 1 +#define DMA_CCMD_MASK_2BIT 2 +#define DMA_CCMD_MASK_3BIT 3 +#define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16) +#define DMA_CCMD_DID(d) ((u64)((d) & 0xffff)) + +/* ECMD_REG */ +#define DMA_MAX_NUM_ECMD 256 +#define DMA_MAX_NUM_ECMDCAP (DMA_MAX_NUM_ECMD / 64) +#define DMA_ECMD_REG_STEP 8 +#define DMA_ECMD_ENABLE 0xf0 +#define DMA_ECMD_DISABLE 0xf1 +#define DMA_ECMD_FREEZE 0xf4 +#define DMA_ECMD_UNFREEZE 0xf5 +#define DMA_ECMD_OA_SHIFT 16 +#define DMA_ECMD_ECRSP_IP 0x1 +#define DMA_ECMD_ECCAP3 3 +#define DMA_ECMD_ECCAP3_ECNTS BIT_ULL(48) +#define DMA_ECMD_ECCAP3_DCNTS BIT_ULL(49) +#define DMA_ECMD_ECCAP3_FCNTS BIT_ULL(52) +#define DMA_ECMD_ECCAP3_UFCNTS BIT_ULL(53) +#define DMA_ECMD_ECCAP3_ESSENTIAL (DMA_ECMD_ECCAP3_ECNTS | \ + DMA_ECMD_ECCAP3_DCNTS | \ + DMA_ECMD_ECCAP3_FCNTS | \ + DMA_ECMD_ECCAP3_UFCNTS) + +/* FECTL_REG */ +#define DMA_FECTL_IM (((u32)1) << 31) + +/* FSTS_REG */ +#define DMA_FSTS_PFO (1 << 0) /* Primary Fault Overflow */ +#define DMA_FSTS_PPF (1 << 1) /* Primary Pending Fault */ +#define DMA_FSTS_IQE (1 << 4) /* Invalidation Queue Error */ +#define DMA_FSTS_ICE (1 << 5) /* Invalidation Completion Error */ +#define DMA_FSTS_ITE (1 << 6) /* Invalidation Time-out Error */ +#define DMA_FSTS_PRO (1 << 7) /* Page Request Overflow */ +#define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff) + +/* FRCD_REG, 32 bits access */ +#define DMA_FRCD_F (((u32)1) << 31) +#define dma_frcd_type(d) ((d >> 30) & 1) +#define dma_frcd_fault_reason(c) (c & 0xff) +#define dma_frcd_source_id(c) (c & 0xffff) +#define dma_frcd_pasid_value(c) (((c) >> 8) & 0xfffff) +#define dma_frcd_pasid_present(c) (((c) >> 31) & 1) +/* low 64 bit */ +#define dma_frcd_page_addr(d) (d & (((u64)-1) << PAGE_SHIFT)) + +/* PRS_REG */ +#define DMA_PRS_PPR ((u32)1) +#define DMA_PRS_PRO ((u32)2) + +#define DMA_VCS_PAS ((u64)1) + +/* PERFINTRSTS_REG */ +#define DMA_PERFINTRSTS_PIS ((u32)1) + +#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \ +do { \ + cycles_t start_time = get_cycles(); \ + while (1) { \ + sts = op(iommu->reg + offset); \ + if (cond) \ + break; \ + if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\ + panic("DMAR hardware is malfunctioning\n"); \ + cpu_relax(); \ + } \ +} while (0) + +#define QI_LENGTH 256 /* queue length */ + +enum { + QI_FREE, + QI_IN_USE, + QI_DONE, + QI_ABORT +}; + +#define QI_CC_TYPE 0x1 +#define QI_IOTLB_TYPE 0x2 +#define QI_DIOTLB_TYPE 0x3 +#define QI_IEC_TYPE 0x4 +#define QI_IWD_TYPE 0x5 +#define QI_EIOTLB_TYPE 0x6 +#define QI_PC_TYPE 0x7 +#define QI_DEIOTLB_TYPE 0x8 +#define QI_PGRP_RESP_TYPE 0x9 +#define QI_PSTRM_RESP_TYPE 0xa + +#define QI_IEC_SELECTIVE (((u64)1) << 4) +#define QI_IEC_IIDEX(idx) (((u64)(idx & 0xffff) << 32)) +#define QI_IEC_IM(m) (((u64)(m & 0x1f) << 27)) + +#define QI_IWD_STATUS_DATA(d) (((u64)d) << 32) +#define QI_IWD_STATUS_WRITE (((u64)1) << 5) +#define QI_IWD_FENCE (((u64)1) << 6) +#define QI_IWD_PRQ_DRAIN (((u64)1) << 7) + +#define QI_IOTLB_DID(did) (((u64)did) << 16) +#define QI_IOTLB_DR(dr) (((u64)dr) << 7) +#define QI_IOTLB_DW(dw) (((u64)dw) << 6) +#define QI_IOTLB_GRAN(gran) (((u64)gran) >> (DMA_TLB_FLUSH_GRANU_OFFSET-4)) +#define QI_IOTLB_ADDR(addr) (((u64)addr) & VTD_PAGE_MASK) +#define QI_IOTLB_IH(ih) (((u64)ih) << 6) +#define QI_IOTLB_AM(am) (((u8)am) & 0x3f) + +#define QI_CC_FM(fm) (((u64)fm) << 48) +#define QI_CC_SID(sid) (((u64)sid) << 32) +#define QI_CC_DID(did) (((u64)did) << 16) +#define QI_CC_GRAN(gran) (((u64)gran) >> (DMA_CCMD_INVL_GRANU_OFFSET-4)) + +#define QI_DEV_IOTLB_SID(sid) ((u64)((sid) & 0xffff) << 32) +#define QI_DEV_IOTLB_QDEP(qdep) (((qdep) & 0x1f) << 16) +#define QI_DEV_IOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) +#define QI_DEV_IOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | \ + ((u64)((pfsid >> 4) & 0xfff) << 52)) +#define QI_DEV_IOTLB_SIZE 1 +#define QI_DEV_IOTLB_MAX_INVS 32 + +#define QI_PC_PASID(pasid) (((u64)pasid) << 32) +#define QI_PC_DID(did) (((u64)did) << 16) +#define QI_PC_GRAN(gran) (((u64)gran) << 4) + +/* PASID cache invalidation granu */ +#define QI_PC_ALL_PASIDS 0 +#define QI_PC_PASID_SEL 1 +#define QI_PC_GLOBAL 3 + +#define QI_EIOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) +#define QI_EIOTLB_IH(ih) (((u64)ih) << 6) +#define QI_EIOTLB_AM(am) (((u64)am) & 0x3f) +#define QI_EIOTLB_PASID(pasid) (((u64)pasid) << 32) +#define QI_EIOTLB_DID(did) (((u64)did) << 16) +#define QI_EIOTLB_GRAN(gran) (((u64)gran) << 4) + +/* QI Dev-IOTLB inv granu */ +#define QI_DEV_IOTLB_GRAN_ALL 1 +#define QI_DEV_IOTLB_GRAN_PASID_SEL 0 + +#define QI_DEV_EIOTLB_ADDR(a) ((u64)(a) & VTD_PAGE_MASK) +#define QI_DEV_EIOTLB_SIZE (((u64)1) << 11) +#define QI_DEV_EIOTLB_PASID(p) ((u64)((p) & 0xfffff) << 32) +#define QI_DEV_EIOTLB_SID(sid) ((u64)((sid) & 0xffff) << 16) +#define QI_DEV_EIOTLB_QDEP(qd) ((u64)((qd) & 0x1f) << 4) +#define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | \ + ((u64)((pfsid >> 4) & 0xfff) << 52)) +#define QI_DEV_EIOTLB_MAX_INVS 32 + +/* Page group response descriptor QW0 */ +#define QI_PGRP_PASID_P(p) (((u64)(p)) << 4) +#define QI_PGRP_RESP_CODE(res) (((u64)(res)) << 12) +#define QI_PGRP_DID(rid) (((u64)(rid)) << 16) +#define QI_PGRP_PASID(pasid) (((u64)(pasid)) << 32) + +/* Page group response descriptor QW1 */ +#define QI_PGRP_IDX(idx) (((u64)(idx)) << 3) + + +#define QI_RESP_SUCCESS 0x0 +#define QI_RESP_INVALID 0x1 +#define QI_RESP_FAILURE 0xf + +#define QI_GRAN_NONG_PASID 2 +#define QI_GRAN_PSI_PASID 3 + +#define qi_shift(iommu) (DMAR_IQ_SHIFT + !!ecap_smts((iommu)->ecap)) + +struct qi_desc { + u64 qw0; + u64 qw1; + u64 qw2; + u64 qw3; +}; + +struct q_inval { + raw_spinlock_t q_lock; + void *desc; /* invalidation queue */ + int *desc_status; /* desc status */ + int free_head; /* first free entry */ + int free_tail; /* last free entry */ + int free_cnt; +}; + +/* Page Request Queue depth */ +#define PRQ_ORDER 4 +#define PRQ_SIZE (SZ_4K << PRQ_ORDER) +#define PRQ_RING_MASK (PRQ_SIZE - 0x20) +#define PRQ_DEPTH (PRQ_SIZE >> 5) + +struct dmar_pci_notify_info; + +#ifdef CONFIG_IRQ_REMAP +#define INTR_REMAP_TABLE_REG_SIZE 0xf +#define INTR_REMAP_TABLE_REG_SIZE_MASK 0xf + +#define INTR_REMAP_TABLE_ENTRIES 65536 + +struct irq_domain; + +struct ir_table { + struct irte *base; + unsigned long *bitmap; +}; + +void intel_irq_remap_add_device(struct dmar_pci_notify_info *info); +#else +static inline void +intel_irq_remap_add_device(struct dmar_pci_notify_info *info) { } +#endif + +struct iommu_flush { + void (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid, + u8 fm, u64 type); + void (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type); +}; + +enum { + SR_DMAR_FECTL_REG, + SR_DMAR_FEDATA_REG, + SR_DMAR_FEADDR_REG, + SR_DMAR_FEUADDR_REG, + MAX_SR_DMAR_REGS +}; + +#define VTD_FLAG_TRANS_PRE_ENABLED (1 << 0) +#define VTD_FLAG_IRQ_REMAP_PRE_ENABLED (1 << 1) +#define VTD_FLAG_SVM_CAPABLE (1 << 2) + +#define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap)) +#define pasid_supported(iommu) (sm_supported(iommu) && \ + ecap_pasid((iommu)->ecap)) +#define ssads_supported(iommu) (sm_supported(iommu) && \ + ecap_slads((iommu)->ecap) && \ + ecap_smpwc(iommu->ecap)) +#define nested_supported(iommu) (sm_supported(iommu) && \ + ecap_nest((iommu)->ecap)) + +struct pasid_entry; +struct pasid_state_entry; +struct page_req_dsc; + +/* + * 0: Present + * 1-11: Reserved + * 12-63: Context Ptr (12 - (haw-1)) + * 64-127: Reserved + */ +struct root_entry { + u64 lo; + u64 hi; +}; + +/* + * low 64 bits: + * 0: present + * 1: fault processing disable + * 2-3: translation type + * 12-63: address space root + * high 64 bits: + * 0-2: address width + * 3-6: aval + * 8-23: domain id + */ +struct context_entry { + u64 lo; + u64 hi; +}; + +struct iommu_domain_info { + struct intel_iommu *iommu; + unsigned int refcnt; /* Refcount of devices per iommu */ + u16 did; /* Domain ids per IOMMU. Use u16 since + * domain ids are 16 bit wide according + * to VT-d spec, section 9.3 */ +}; + +/* + * We start simply by using a fixed size for the batched descriptors. This + * size is currently sufficient for our needs. Future improvements could + * involve dynamically allocating the batch buffer based on actual demand, + * allowing us to adjust the batch size for optimal performance in different + * scenarios. + */ +#define QI_MAX_BATCHED_DESC_COUNT 16 +struct qi_batch { + struct qi_desc descs[QI_MAX_BATCHED_DESC_COUNT]; + unsigned int index; +}; + +struct dmar_domain { + union { + struct iommu_domain domain; + struct pt_iommu iommu; + /* First stage page table */ + struct pt_iommu_x86_64 fspt; + /* Second stage page table */ + struct pt_iommu_vtdss sspt; + }; + + struct xarray iommu_array; /* Attached IOMMU array */ + + u8 force_snooping:1; /* Create PASID entry with snoop control */ + u8 dirty_tracking:1; /* Dirty tracking is enabled */ + u8 nested_parent:1; /* Has other domains nested on it */ + u8 iotlb_sync_map:1; /* Need to flush IOTLB cache or write + * buffer when creating mappings. + */ + + spinlock_t lock; /* Protect device tracking lists */ + struct list_head devices; /* all devices' list */ + struct list_head dev_pasids; /* all attached pasids */ + + spinlock_t cache_lock; /* Protect the cache tag list */ + struct list_head cache_tags; /* Cache tag list */ + struct qi_batch *qi_batch; /* Batched QI descriptors */ + + union { + /* DMA remapping domain */ + struct { + /* Protect the s1_domains list */ + spinlock_t s1_lock; + /* Track s1_domains nested on this domain */ + struct list_head s1_domains; + }; + + /* Nested user domain */ + struct { + /* parent page table which the user domain is nested on */ + struct dmar_domain *s2_domain; + /* page table attributes */ + struct iommu_hwpt_vtd_s1 s1_cfg; + /* link to parent domain siblings */ + struct list_head s2_link; + }; + + /* SVA domain */ + struct { + struct mmu_notifier notifier; + }; + }; +}; +PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, sspt.iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, fspt.iommu, domain); + +/* + * In theory, the VT-d 4.0 spec can support up to 2 ^ 16 counters. + * But in practice, there are only 14 counters for the existing + * platform. Setting the max number of counters to 64 should be good + * enough for a long time. Also, supporting more than 64 counters + * requires more extras, e.g., extra freeze and overflow registers, + * which is not necessary for now. + */ +#define IOMMU_PMU_IDX_MAX 64 + +struct iommu_pmu { + struct intel_iommu *iommu; + u32 num_cntr; /* Number of counters */ + u32 num_eg; /* Number of event group */ + u32 cntr_width; /* Counter width */ + u32 cntr_stride; /* Counter Stride */ + u32 filter; /* Bitmask of filter support */ + void __iomem *base; /* the PerfMon base address */ + void __iomem *cfg_reg; /* counter configuration base address */ + void __iomem *cntr_reg; /* counter 0 address*/ + void __iomem *overflow; /* overflow status register */ + + u64 *evcap; /* Indicates all supported events */ + u32 **cntr_evcap; /* Supported events of each counter. */ + + struct pmu pmu; + DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX); + struct perf_event *event_list[IOMMU_PMU_IDX_MAX]; + unsigned char irq_name[16]; +}; + +#define IOMMU_IRQ_ID_OFFSET_PRQ (DMAR_UNITS_SUPPORTED) +#define IOMMU_IRQ_ID_OFFSET_PERF (2 * DMAR_UNITS_SUPPORTED) + +struct intel_iommu { + void __iomem *reg; /* Pointer to hardware regs, virtual addr */ + u64 reg_phys; /* physical address of hw register set */ + u64 reg_size; /* size of hw register set */ + u64 cap; + u64 ecap; + u64 vccap; + u64 ecmdcap[DMA_MAX_NUM_ECMDCAP]; + u32 gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */ + raw_spinlock_t register_lock; /* protect register handling */ + int seq_id; /* sequence id of the iommu */ + int agaw; /* agaw of this iommu */ + int msagaw; /* max sagaw of this iommu */ + unsigned int irq, pr_irq, perf_irq; + u16 segment; /* PCI segment# */ + unsigned char name[16]; /* Device Name */ + +#ifdef CONFIG_INTEL_IOMMU + /* mutex to protect domain_ida */ + struct mutex did_lock; + struct ida domain_ida; /* domain id allocator */ + unsigned long *copied_tables; /* bitmap of copied tables */ + spinlock_t lock; /* protect context, domain ids */ + struct root_entry *root_entry; /* virtual address */ + + struct iommu_flush flush; +#endif + struct page_req_dsc *prq; + unsigned char prq_name[16]; /* Name for PRQ interrupt */ + unsigned long prq_seq_number; + struct completion prq_complete; + struct iopf_queue *iopf_queue; + unsigned char iopfq_name[16]; + /* Synchronization between fault report and iommu device release. */ + struct mutex iopf_lock; + struct q_inval *qi; /* Queued invalidation info */ + u32 iommu_state[MAX_SR_DMAR_REGS]; /* Store iommu states between suspend and resume.*/ + + /* rb tree for all probed devices */ + struct rb_root device_rbtree; + /* protect the device_rbtree */ + spinlock_t device_rbtree_lock; + +#ifdef CONFIG_IRQ_REMAP + struct ir_table *ir_table; /* Interrupt remapping info */ + struct irq_domain *ir_domain; +#endif + struct iommu_device iommu; /* IOMMU core code handle */ + int node; + u32 flags; /* Software defined flags */ + + struct dmar_drhd_unit *drhd; + void *perf_statistic; + + struct iommu_pmu *pmu; +}; + +/* PCI domain-device relationship */ +struct device_domain_info { + struct list_head link; /* link to domain siblings */ + u32 segment; /* PCI segment number */ + u8 bus; /* PCI bus number */ + u8 devfn; /* PCI devfn number */ + u16 pfsid; /* SRIOV physical function source ID */ + u8 pasid_supported:3; + u8 pasid_enabled:1; + u8 pri_supported:1; + u8 pri_enabled:1; + u8 ats_supported:1; + u8 ats_enabled:1; + u8 dtlb_extra_inval:1; /* Quirk for devices need extra flush */ + u8 domain_attached:1; /* Device has domain attached */ + u8 ats_qdep; + unsigned int iopf_refcount; + struct device *dev; /* it's NULL for PCIe-to-PCI bridge */ + struct intel_iommu *iommu; /* IOMMU used by this device */ + struct dmar_domain *domain; /* pointer to domain */ + struct pasid_table *pasid_table; /* pasid table */ + /* device tracking node(lookup by PCI RID) */ + struct rb_node node; +#ifdef CONFIG_INTEL_IOMMU_DEBUGFS + struct dentry *debugfs_dentry; /* pointer to device directory dentry */ +#endif +}; + +struct dev_pasid_info { + struct list_head link_domain; /* link to domain siblings */ + struct device *dev; + ioasid_t pasid; +#ifdef CONFIG_INTEL_IOMMU_DEBUGFS + struct dentry *debugfs_dentry; /* pointer to pasid directory dentry */ +#endif +}; + +static inline void __iommu_flush_cache( + struct intel_iommu *iommu, void *addr, int size) +{ + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(addr, size); +} + +/* Convert generic struct iommu_domain to private struct dmar_domain */ +static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom) +{ + return container_of(dom, struct dmar_domain, domain); +} + +/* + * Domain ID 0 and 1 are reserved: + * + * If Caching mode is set, then invalid translations are tagged + * with domain-id 0, hence we need to pre-allocate it. We also + * use domain-id 0 as a marker for non-allocated domain-id, so + * make sure it is not used for a real domain. + * + * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid + * entry for first-level or pass-through translation modes should + * be programmed with a domain id different from those used for + * second-level or nested translation. We reserve a domain id for + * this purpose. This domain id is also used for identity domain + * in legacy mode. + */ +#define FLPT_DEFAULT_DID 1 +#define IDA_START_DID 2 + +/* Retrieve the domain ID which has allocated to the domain */ +static inline u16 +domain_id_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) +{ + struct iommu_domain_info *info = + xa_load(&domain->iommu_array, iommu->seq_id); + + return info->did; +} + +static inline u16 +iommu_domain_did(struct iommu_domain *domain, struct intel_iommu *iommu) +{ + if (domain->type == IOMMU_DOMAIN_SVA || + domain->type == IOMMU_DOMAIN_IDENTITY) + return FLPT_DEFAULT_DID; + return domain_id_iommu(to_dmar_domain(domain), iommu); +} + +static inline bool dev_is_real_dma_subdevice(struct device *dev) +{ + return dev && dev_is_pci(dev) && + pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); +} + +/* + * 0: readable + * 1: writable + * 2-6: reserved + * 7: super page + * 8-10: available + * 11: snoop behavior + * 12-63: Host physical address + */ +struct dma_pte { + u64 val; +}; + +static inline u64 dma_pte_addr(struct dma_pte *pte) +{ +#ifdef CONFIG_64BIT + return pte->val & VTD_PAGE_MASK; +#else + /* Must have a full atomic 64-bit read */ + return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK; +#endif +} + +static inline bool dma_pte_present(struct dma_pte *pte) +{ + return (pte->val & 3) != 0; +} + +static inline bool dma_pte_superpage(struct dma_pte *pte) +{ + return (pte->val & DMA_PTE_LARGE_PAGE); +} + +static inline bool context_present(struct context_entry *context) +{ + return (context->lo & 1); +} + +#define LEVEL_STRIDE (9) +#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) +#define MAX_AGAW_WIDTH (64) +#define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT) + +static inline int agaw_to_level(int agaw) +{ + return agaw + 2; +} + +static inline int width_to_agaw(int width) +{ + return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); +} + +static inline unsigned int level_to_offset_bits(int level) +{ + return (level - 1) * LEVEL_STRIDE; +} + +static inline int pfn_level_offset(u64 pfn, int level) +{ + return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; +} + + +static inline void context_set_present(struct context_entry *context) +{ + context->lo |= 1; +} + +static inline void context_set_fault_enable(struct context_entry *context) +{ + context->lo &= (((u64)-1) << 2) | 1; +} + +static inline void context_set_translation_type(struct context_entry *context, + unsigned long value) +{ + context->lo &= (((u64)-1) << 4) | 3; + context->lo |= (value & 3) << 2; +} + +static inline void context_set_address_root(struct context_entry *context, + unsigned long value) +{ + context->lo &= ~VTD_PAGE_MASK; + context->lo |= value & VTD_PAGE_MASK; +} + +static inline void context_set_address_width(struct context_entry *context, + unsigned long value) +{ + context->hi |= value & 7; +} + +static inline void context_set_domain_id(struct context_entry *context, + unsigned long value) +{ + context->hi |= (value & ((1 << 16) - 1)) << 8; +} + +static inline void context_set_pasid(struct context_entry *context) +{ + context->lo |= CONTEXT_PASIDE; +} + +static inline int context_domain_id(struct context_entry *c) +{ + return((c->hi >> 8) & 0xffff); +} + +static inline void context_clear_entry(struct context_entry *context) +{ + context->lo = 0; + context->hi = 0; +} + +#ifdef CONFIG_INTEL_IOMMU +static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) +{ + if (!iommu->copied_tables) + return false; + + return test_bit(((long)bus << 8) | devfn, iommu->copied_tables); +} + +static inline void +set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) +{ + set_bit(((long)bus << 8) | devfn, iommu->copied_tables); +} + +static inline void +clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) +{ + clear_bit(((long)bus << 8) | devfn, iommu->copied_tables); +} +#endif /* CONFIG_INTEL_IOMMU */ + +/* + * Set the RID_PASID field of a scalable mode context entry. The + * IOMMU hardware will use the PASID value set in this field for + * DMA translations of DMA requests without PASID. + */ +static inline void +context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) +{ + context->hi |= pasid & ((1 << 20) - 1); +} + +/* + * Set the DTE(Device-TLB Enable) field of a scalable mode context + * entry. + */ +static inline void context_set_sm_dte(struct context_entry *context) +{ + context->lo |= BIT_ULL(2); +} + +/* + * Set the PRE(Page Request Enable) field of a scalable mode context + * entry. + */ +static inline void context_set_sm_pre(struct context_entry *context) +{ + context->lo |= BIT_ULL(4); +} + +/* + * Clear the PRE(Page Request Enable) field of a scalable mode context + * entry. + */ +static inline void context_clear_sm_pre(struct context_entry *context) +{ + context->lo &= ~BIT_ULL(4); +} + +/* Returns a number of VTD pages, but aligned to MM page size */ +static inline unsigned long aligned_nrpages(unsigned long host_addr, size_t size) +{ + host_addr &= ~PAGE_MASK; + return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; +} + +/* Return a size from number of VTD pages. */ +static inline unsigned long nrpages_to_size(unsigned long npages) +{ + return npages << VTD_PAGE_SHIFT; +} + +static inline void qi_desc_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type, + struct qi_desc *desc) +{ + u8 dw = 0, dr = 0; + int ih = addr & 1; + + if (cap_write_drain(iommu->cap)) + dw = 1; + + if (cap_read_drain(iommu->cap)) + dr = 1; + + desc->qw0 = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw) + | QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE; + desc->qw1 = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih) + | QI_IOTLB_AM(size_order); + desc->qw2 = 0; + desc->qw3 = 0; +} + +static inline void qi_desc_dev_iotlb(u16 sid, u16 pfsid, u16 qdep, u64 addr, + unsigned int mask, struct qi_desc *desc) +{ + if (mask) { + addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1; + desc->qw1 = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE; + } else { + desc->qw1 = QI_DEV_IOTLB_ADDR(addr); + } + + if (qdep >= QI_DEV_IOTLB_MAX_INVS) + qdep = 0; + + desc->qw0 = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) | + QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid); + desc->qw2 = 0; + desc->qw3 = 0; +} + +static inline void qi_desc_piotlb(u16 did, u32 pasid, u64 addr, + unsigned long npages, bool ih, + struct qi_desc *desc) +{ + if (npages == -1) { + desc->qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | + QI_EIOTLB_TYPE; + desc->qw1 = 0; + } else { + int mask = ilog2(__roundup_pow_of_two(npages)); + unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask)); + + if (WARN_ON_ONCE(!IS_ALIGNED(addr, align))) + addr = ALIGN_DOWN(addr, align); + + desc->qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | + QI_EIOTLB_TYPE; + desc->qw1 = QI_EIOTLB_ADDR(addr) | + QI_EIOTLB_IH(ih) | + QI_EIOTLB_AM(mask); + } +} + +static inline void qi_desc_dev_iotlb_pasid(u16 sid, u16 pfsid, u32 pasid, + u16 qdep, u64 addr, + unsigned int size_order, + struct qi_desc *desc) +{ + unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1); + + desc->qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) | + QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE | + QI_DEV_IOTLB_PFSID(pfsid); + + /* + * If S bit is 0, we only flush a single page. If S bit is set, + * The least significant zero bit indicates the invalidation address + * range. VT-d spec 6.5.2.6. + * e.g. address bit 12[0] indicates 8KB, 13[0] indicates 16KB. + * size order = 0 is PAGE_SIZE 4KB + * Max Invs Pending (MIP) is set to 0 for now until we have DIT in + * ECAP. + */ + if (!IS_ALIGNED(addr, VTD_PAGE_SIZE << size_order)) + pr_warn_ratelimited("Invalidate non-aligned address %llx, order %d\n", + addr, size_order); + + /* Take page address */ + desc->qw1 = QI_DEV_EIOTLB_ADDR(addr); + + if (size_order) { + /* + * Existing 0s in address below size_order may be the least + * significant bit, we must set them to 1s to avoid having + * smaller size than desired. + */ + desc->qw1 |= GENMASK_ULL(size_order + VTD_PAGE_SHIFT - 1, + VTD_PAGE_SHIFT); + /* Clear size_order bit to indicate size */ + desc->qw1 &= ~mask; + /* Set the S bit to indicate flushing more than 1 page */ + desc->qw1 |= QI_DEV_EIOTLB_SIZE; + } +} + +/* Convert value to context PASID directory size field coding. */ +#define context_pdts(pds) (((pds) & 0x7) << 9) + +struct dmar_drhd_unit *dmar_find_matched_drhd_unit(struct pci_dev *dev); + +int dmar_enable_qi(struct intel_iommu *iommu); +void dmar_disable_qi(struct intel_iommu *iommu); +int dmar_reenable_qi(struct intel_iommu *iommu); +void qi_global_iec(struct intel_iommu *iommu); + +void qi_flush_context(struct intel_iommu *iommu, u16 did, + u16 sid, u8 fm, u64 type); +void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type); +void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u16 qdep, u64 addr, unsigned mask); + +void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, + unsigned long npages, bool ih); + +void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u32 pasid, u16 qdep, u64 addr, + unsigned int size_order); +void quirk_extra_dev_tlb_flush(struct device_domain_info *info, + unsigned long address, unsigned long pages, + u32 pasid, u16 qdep); +void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu, + u32 pasid); + +int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, + unsigned int count, unsigned long options); + +void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type); +/* + * Options used in qi_submit_sync: + * QI_OPT_WAIT_DRAIN - Wait for PRQ drain completion, spec 6.5.2.8. + */ +#define QI_OPT_WAIT_DRAIN BIT(0) + +int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); +void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); +void device_block_translation(struct device *dev); +int paging_domain_compatible(struct iommu_domain *domain, struct device *dev); + +struct dev_pasid_info * +domain_add_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); +void domain_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); + +int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev, + ioasid_t pasid, u16 did, phys_addr_t fsptptr, + int flags, struct iommu_domain *old); + +int dmar_ir_support(void); + +void iommu_flush_write_buffer(struct intel_iommu *iommu); +struct iommu_domain * +intel_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, + u32 flags, + const struct iommu_user_data *user_data); +struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid); + +enum cache_tag_type { + CACHE_TAG_IOTLB, + CACHE_TAG_DEVTLB, + CACHE_TAG_NESTING_IOTLB, + CACHE_TAG_NESTING_DEVTLB, +}; + +struct cache_tag { + struct list_head node; + enum cache_tag_type type; + struct intel_iommu *iommu; + /* + * The @dev field represents the location of the cache. For IOTLB, it + * resides on the IOMMU hardware. @dev stores the device pointer to + * the IOMMU hardware. For DevTLB, it locates in the PCIe endpoint. + * @dev stores the device pointer to that endpoint. + */ + struct device *dev; + u16 domain_id; + ioasid_t pasid; + unsigned int users; +}; + +int cache_tag_assign(struct dmar_domain *domain, u16 did, struct device *dev, + ioasid_t pasid, enum cache_tag_type type); +int cache_tag_assign_domain(struct dmar_domain *domain, + struct device *dev, ioasid_t pasid); +void cache_tag_unassign_domain(struct dmar_domain *domain, + struct device *dev, ioasid_t pasid); +void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, + unsigned long end, int ih); +void cache_tag_flush_all(struct dmar_domain *domain); +void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, + unsigned long end); + +void intel_context_flush_no_pasid(struct device_domain_info *info, + struct context_entry *context, u16 did); + +int intel_iommu_enable_prq(struct intel_iommu *iommu); +int intel_iommu_finish_prq(struct intel_iommu *iommu); +void intel_iommu_page_response(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg); +void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid); + +int intel_iommu_enable_iopf(struct device *dev); +void intel_iommu_disable_iopf(struct device *dev); + +static inline int iopf_for_domain_set(struct iommu_domain *domain, + struct device *dev) +{ + if (!domain || !domain->iopf_handler) + return 0; + + return intel_iommu_enable_iopf(dev); +} + +static inline void iopf_for_domain_remove(struct iommu_domain *domain, + struct device *dev) +{ + if (!domain || !domain->iopf_handler) + return; + + intel_iommu_disable_iopf(dev); +} + +static inline int iopf_for_domain_replace(struct iommu_domain *new, + struct iommu_domain *old, + struct device *dev) +{ + int ret; + + ret = iopf_for_domain_set(new, dev); + if (ret) + return ret; + + iopf_for_domain_remove(old, dev); + + return 0; +} + +#ifdef CONFIG_INTEL_IOMMU_SVM +void intel_svm_check(struct intel_iommu *iommu); +struct iommu_domain *intel_svm_domain_alloc(struct device *dev, + struct mm_struct *mm); +#else +static inline void intel_svm_check(struct intel_iommu *iommu) {} +static inline struct iommu_domain *intel_svm_domain_alloc(struct device *dev, + struct mm_struct *mm) +{ + return ERR_PTR(-ENODEV); +} +#endif + +#ifdef CONFIG_INTEL_IOMMU_DEBUGFS +void intel_iommu_debugfs_init(void); +void intel_iommu_debugfs_create_dev(struct device_domain_info *info); +void intel_iommu_debugfs_remove_dev(struct device_domain_info *info); +void intel_iommu_debugfs_create_dev_pasid(struct dev_pasid_info *dev_pasid); +void intel_iommu_debugfs_remove_dev_pasid(struct dev_pasid_info *dev_pasid); +#else +static inline void intel_iommu_debugfs_init(void) {} +static inline void intel_iommu_debugfs_create_dev(struct device_domain_info *info) {} +static inline void intel_iommu_debugfs_remove_dev(struct device_domain_info *info) {} +static inline void intel_iommu_debugfs_create_dev_pasid(struct dev_pasid_info *dev_pasid) {} +static inline void intel_iommu_debugfs_remove_dev_pasid(struct dev_pasid_info *dev_pasid) {} +#endif /* CONFIG_INTEL_IOMMU_DEBUGFS */ + +extern const struct attribute_group *intel_iommu_groups[]; +struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, + u8 devfn, int alloc); + +extern const struct iommu_ops intel_iommu_ops; +extern const struct iommu_domain_ops intel_fs_paging_domain_ops; +extern const struct iommu_domain_ops intel_ss_paging_domain_ops; + +static inline bool intel_domain_is_fs_paging(struct dmar_domain *domain) +{ + return domain->domain.ops == &intel_fs_paging_domain_ops; +} + +static inline bool intel_domain_is_ss_paging(struct dmar_domain *domain) +{ + return domain->domain.ops == &intel_ss_paging_domain_ops; +} + +#ifdef CONFIG_INTEL_IOMMU +extern int intel_iommu_sm; +int iommu_calculate_agaw(struct intel_iommu *iommu); +int iommu_calculate_max_sagaw(struct intel_iommu *iommu); +int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob); + +static inline bool ecmd_has_pmu_essential(struct intel_iommu *iommu) +{ + return (iommu->ecmdcap[DMA_ECMD_ECCAP3] & DMA_ECMD_ECCAP3_ESSENTIAL) == + DMA_ECMD_ECCAP3_ESSENTIAL; +} + +extern int dmar_disabled; +extern int intel_iommu_enabled; +#else +static inline int iommu_calculate_agaw(struct intel_iommu *iommu) +{ + return 0; +} +static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu) +{ + return 0; +} +#define dmar_disabled (1) +#define intel_iommu_enabled (0) +#define intel_iommu_sm (0) +#endif + +static inline const char *decode_prq_descriptor(char *str, size_t size, + u64 dw0, u64 dw1, u64 dw2, u64 dw3) +{ + char *buf = str; + int bytes; + + bytes = snprintf(buf, size, + "rid=0x%llx addr=0x%llx %c%c%c%c%c pasid=0x%llx index=0x%llx", + FIELD_GET(GENMASK_ULL(31, 16), dw0), + FIELD_GET(GENMASK_ULL(63, 12), dw1), + dw1 & BIT_ULL(0) ? 'r' : '-', + dw1 & BIT_ULL(1) ? 'w' : '-', + dw0 & BIT_ULL(52) ? 'x' : '-', + dw0 & BIT_ULL(53) ? 'p' : '-', + dw1 & BIT_ULL(2) ? 'l' : '-', + FIELD_GET(GENMASK_ULL(51, 32), dw0), + FIELD_GET(GENMASK_ULL(11, 3), dw1)); + + /* Private Data */ + if (dw0 & BIT_ULL(9)) { + size -= bytes; + buf += bytes; + snprintf(buf, size, " private=0x%llx/0x%llx\n", dw2, dw3); + } + + return str; +} + +#endif diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c new file mode 100644 index 000000000000..4f9b01dc91e8 --- /dev/null +++ b/drivers/iommu/intel/irq_remapping.c @@ -0,0 +1,1605 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) "DMAR-IR: " fmt + +#include <linux/interrupt.h> +#include <linux/dmar.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/jiffies.h> +#include <linux/hpet.h> +#include <linux/pci.h> +#include <linux/irq.h> +#include <linux/irqchip/irq-msi-lib.h> +#include <linux/acpi.h> +#include <linux/irqdomain.h> +#include <linux/crash_dump.h> +#include <asm/io_apic.h> +#include <asm/apic.h> +#include <asm/smp.h> +#include <asm/cpu.h> +#include <asm/irq_remapping.h> +#include <asm/pci-direct.h> +#include <asm/posted_intr.h> + +#include "iommu.h" +#include "../irq_remapping.h" +#include "../iommu-pages.h" + +struct ioapic_scope { + struct intel_iommu *iommu; + unsigned int id; + unsigned int bus; /* PCI bus number */ + unsigned int devfn; /* PCI devfn number */ +}; + +struct hpet_scope { + struct intel_iommu *iommu; + u8 id; + unsigned int bus; + unsigned int devfn; +}; + +struct irq_2_iommu { + struct intel_iommu *iommu; + u16 irte_index; + u16 sub_handle; + u8 irte_mask; + bool posted_msi; + bool posted_vcpu; +}; + +struct intel_ir_data { + struct irq_2_iommu irq_2_iommu; + struct irte irte_entry; + union { + struct msi_msg msi_entry; + }; +}; + +#define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0) +#define IRTE_DEST(dest) ((eim_mode) ? dest : dest << 8) + +static int __read_mostly eim_mode; +static struct ioapic_scope ir_ioapic[MAX_IO_APICS]; +static struct hpet_scope ir_hpet[MAX_HPET_TBS]; + +/* + * Lock ordering: + * ->dmar_global_lock + * ->irq_2_ir_lock + * ->qi->q_lock + * ->iommu->register_lock + * Note: + * intel_irq_remap_ops.{supported,prepare,enable,disable,reenable} are called + * in single-threaded environment with interrupt disabled, so no need to tabke + * the dmar_global_lock. + */ +DEFINE_RAW_SPINLOCK(irq_2_ir_lock); +static const struct irq_domain_ops intel_ir_domain_ops; + +static void iommu_disable_irq_remapping(struct intel_iommu *iommu); +static int __init parse_ioapics_under_ir(void); +static const struct msi_parent_ops dmar_msi_parent_ops; + +static bool ir_pre_enabled(struct intel_iommu *iommu) +{ + return (iommu->flags & VTD_FLAG_IRQ_REMAP_PRE_ENABLED); +} + +static void clear_ir_pre_enabled(struct intel_iommu *iommu) +{ + iommu->flags &= ~VTD_FLAG_IRQ_REMAP_PRE_ENABLED; +} + +static void init_ir_status(struct intel_iommu *iommu) +{ + u32 gsts; + + gsts = readl(iommu->reg + DMAR_GSTS_REG); + if (gsts & DMA_GSTS_IRES) + iommu->flags |= VTD_FLAG_IRQ_REMAP_PRE_ENABLED; +} + +static int alloc_irte(struct intel_iommu *iommu, + struct irq_2_iommu *irq_iommu, u16 count) +{ + struct ir_table *table = iommu->ir_table; + unsigned int mask = 0; + unsigned long flags; + int index; + + if (!count || !irq_iommu) + return -1; + + if (count > 1) { + count = __roundup_pow_of_two(count); + mask = ilog2(count); + } + + if (mask > ecap_max_handle_mask(iommu->ecap)) { + pr_err("Requested mask %x exceeds the max invalidation handle" + " mask value %Lx\n", mask, + ecap_max_handle_mask(iommu->ecap)); + return -1; + } + + raw_spin_lock_irqsave(&irq_2_ir_lock, flags); + index = bitmap_find_free_region(table->bitmap, + INTR_REMAP_TABLE_ENTRIES, mask); + if (index < 0) { + pr_warn("IR%d: can't allocate an IRTE\n", iommu->seq_id); + } else { + irq_iommu->iommu = iommu; + irq_iommu->irte_index = index; + irq_iommu->sub_handle = 0; + irq_iommu->irte_mask = mask; + } + raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); + + return index; +} + +static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask) +{ + struct qi_desc desc; + + desc.qw0 = QI_IEC_IIDEX(index) | QI_IEC_TYPE | QI_IEC_IM(mask) + | QI_IEC_SELECTIVE; + desc.qw1 = 0; + desc.qw2 = 0; + desc.qw3 = 0; + + return qi_submit_sync(iommu, &desc, 1, 0); +} + +static int modify_irte(struct irq_2_iommu *irq_iommu, + struct irte *irte_modified) +{ + struct intel_iommu *iommu; + unsigned long flags; + struct irte *irte; + int rc, index; + + if (!irq_iommu) + return -1; + + raw_spin_lock_irqsave(&irq_2_ir_lock, flags); + + iommu = irq_iommu->iommu; + + index = irq_iommu->irte_index + irq_iommu->sub_handle; + irte = &iommu->ir_table->base[index]; + + if ((irte->pst == 1) || (irte_modified->pst == 1)) { + /* + * We use cmpxchg16 to atomically update the 128-bit IRTE, + * and it cannot be updated by the hardware or other processors + * behind us, so the return value of cmpxchg16 should be the + * same as the old value. + */ + u128 old = irte->irte; + WARN_ON(!try_cmpxchg128(&irte->irte, &old, irte_modified->irte)); + } else { + WRITE_ONCE(irte->low, irte_modified->low); + WRITE_ONCE(irte->high, irte_modified->high); + } + __iommu_flush_cache(iommu, irte, sizeof(*irte)); + + rc = qi_flush_iec(iommu, index, 0); + + raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); + + return rc; +} + +static struct intel_iommu *map_hpet_to_iommu(u8 hpet_id) +{ + int i; + + for (i = 0; i < MAX_HPET_TBS; i++) { + if (ir_hpet[i].id == hpet_id && ir_hpet[i].iommu) + return ir_hpet[i].iommu; + } + return NULL; +} + +static struct intel_iommu *map_ioapic_to_iommu(int apic) +{ + int i; + + for (i = 0; i < MAX_IO_APICS; i++) { + if (ir_ioapic[i].id == apic && ir_ioapic[i].iommu) + return ir_ioapic[i].iommu; + } + return NULL; +} + +static struct irq_domain *map_dev_to_ir(struct pci_dev *dev) +{ + struct dmar_drhd_unit *drhd = dmar_find_matched_drhd_unit(dev); + + return drhd ? drhd->iommu->ir_domain : NULL; +} + +static int clear_entries(struct irq_2_iommu *irq_iommu) +{ + struct irte *start, *entry, *end; + struct intel_iommu *iommu; + int index; + + if (irq_iommu->sub_handle) + return 0; + + iommu = irq_iommu->iommu; + index = irq_iommu->irte_index; + + start = iommu->ir_table->base + index; + end = start + (1 << irq_iommu->irte_mask); + + for (entry = start; entry < end; entry++) { + WRITE_ONCE(entry->low, 0); + WRITE_ONCE(entry->high, 0); + } + bitmap_release_region(iommu->ir_table->bitmap, index, + irq_iommu->irte_mask); + + return qi_flush_iec(iommu, index, irq_iommu->irte_mask); +} + +/* + * source validation type + */ +#define SVT_NO_VERIFY 0x0 /* no verification is required */ +#define SVT_VERIFY_SID_SQ 0x1 /* verify using SID and SQ fields */ +#define SVT_VERIFY_BUS 0x2 /* verify bus of request-id */ + +/* + * source-id qualifier + */ +#define SQ_ALL_16 0x0 /* verify all 16 bits of request-id */ +#define SQ_13_IGNORE_1 0x1 /* verify most significant 13 bits, ignore + * the third least significant bit + */ +#define SQ_13_IGNORE_2 0x2 /* verify most significant 13 bits, ignore + * the second and third least significant bits + */ +#define SQ_13_IGNORE_3 0x3 /* verify most significant 13 bits, ignore + * the least three significant bits + */ + +/* + * set SVT, SQ and SID fields of irte to verify + * source ids of interrupt requests + */ +static void set_irte_sid(struct irte *irte, unsigned int svt, + unsigned int sq, unsigned int sid) +{ + if (disable_sourceid_checking) + svt = SVT_NO_VERIFY; + irte->svt = svt; + irte->sq = sq; + irte->sid = sid; +} + +/* + * Set an IRTE to match only the bus number. Interrupt requests that reference + * this IRTE must have a requester-id whose bus number is between or equal + * to the start_bus and end_bus arguments. + */ +static void set_irte_verify_bus(struct irte *irte, unsigned int start_bus, + unsigned int end_bus) +{ + set_irte_sid(irte, SVT_VERIFY_BUS, SQ_ALL_16, + (start_bus << 8) | end_bus); +} + +static int set_ioapic_sid(struct irte *irte, int apic) +{ + int i; + u16 sid = 0; + + if (!irte) + return -1; + + for (i = 0; i < MAX_IO_APICS; i++) { + if (ir_ioapic[i].iommu && ir_ioapic[i].id == apic) { + sid = PCI_DEVID(ir_ioapic[i].bus, ir_ioapic[i].devfn); + break; + } + } + + if (sid == 0) { + pr_warn("Failed to set source-id of IOAPIC (%d)\n", apic); + return -1; + } + + set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16, sid); + + return 0; +} + +static int set_hpet_sid(struct irte *irte, u8 id) +{ + int i; + u16 sid = 0; + + if (!irte) + return -1; + + for (i = 0; i < MAX_HPET_TBS; i++) { + if (ir_hpet[i].iommu && ir_hpet[i].id == id) { + sid = PCI_DEVID(ir_hpet[i].bus, ir_hpet[i].devfn); + break; + } + } + + if (sid == 0) { + pr_warn("Failed to set source-id of HPET block (%d)\n", id); + return -1; + } + + /* + * Should really use SQ_ALL_16. Some platforms are broken. + * While we figure out the right quirks for these broken platforms, use + * SQ_13_IGNORE_3 for now. + */ + set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_13_IGNORE_3, sid); + + return 0; +} + +struct set_msi_sid_data { + struct pci_dev *pdev; + u16 alias; + int count; + int busmatch_count; +}; + +static int set_msi_sid_cb(struct pci_dev *pdev, u16 alias, void *opaque) +{ + struct set_msi_sid_data *data = opaque; + + if (data->count == 0 || PCI_BUS_NUM(alias) == PCI_BUS_NUM(data->alias)) + data->busmatch_count++; + + data->pdev = pdev; + data->alias = alias; + data->count++; + + return 0; +} + +static int set_msi_sid(struct irte *irte, struct pci_dev *dev) +{ + struct set_msi_sid_data data; + + if (!irte || !dev) + return -1; + + data.count = 0; + data.busmatch_count = 0; + pci_for_each_dma_alias(dev, set_msi_sid_cb, &data); + + /* + * DMA alias provides us with a PCI device and alias. The only case + * where the it will return an alias on a different bus than the + * device is the case of a PCIe-to-PCI bridge, where the alias is for + * the subordinate bus. In this case we can only verify the bus. + * + * If there are multiple aliases, all with the same bus number, + * then all we can do is verify the bus. This is typical in NTB + * hardware which use proxy IDs where the device will generate traffic + * from multiple devfn numbers on the same bus. + * + * If the alias device is on a different bus than our source device + * then we have a topology based alias, use it. + * + * Otherwise, the alias is for a device DMA quirk and we cannot + * assume that MSI uses the same requester ID. Therefore use the + * original device. + */ + if (PCI_BUS_NUM(data.alias) != data.pdev->bus->number) + set_irte_verify_bus(irte, PCI_BUS_NUM(data.alias), + dev->bus->number); + else if (data.count >= 2 && data.busmatch_count == data.count) + set_irte_verify_bus(irte, dev->bus->number, dev->bus->number); + else if (data.pdev->bus->number != dev->bus->number) + set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16, data.alias); + else + set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16, + pci_dev_id(dev)); + + return 0; +} + +static int iommu_load_old_irte(struct intel_iommu *iommu) +{ + struct irte *old_ir_table; + phys_addr_t irt_phys; + unsigned int i; + size_t size; + u64 irta; + + /* Check whether the old ir-table has the same size as ours */ + irta = dmar_readq(iommu->reg + DMAR_IRTA_REG); + if ((irta & INTR_REMAP_TABLE_REG_SIZE_MASK) + != INTR_REMAP_TABLE_REG_SIZE) + return -EINVAL; + + irt_phys = irta & VTD_PAGE_MASK; + size = INTR_REMAP_TABLE_ENTRIES*sizeof(struct irte); + + /* Map the old IR table */ + old_ir_table = memremap(irt_phys, size, MEMREMAP_WB); + if (!old_ir_table) + return -ENOMEM; + + /* Copy data over */ + memcpy(iommu->ir_table->base, old_ir_table, size); + + __iommu_flush_cache(iommu, iommu->ir_table->base, size); + + /* + * Now check the table for used entries and mark those as + * allocated in the bitmap + */ + for (i = 0; i < INTR_REMAP_TABLE_ENTRIES; i++) { + if (iommu->ir_table->base[i].present) + bitmap_set(iommu->ir_table->bitmap, i, 1); + } + + memunmap(old_ir_table); + + return 0; +} + + +static void iommu_set_irq_remapping(struct intel_iommu *iommu, int mode) +{ + unsigned long flags; + u64 addr; + u32 sts; + + addr = virt_to_phys((void *)iommu->ir_table->base); + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + + dmar_writeq(iommu->reg + DMAR_IRTA_REG, + (addr) | IR_X2APIC_MODE(mode) | INTR_REMAP_TABLE_REG_SIZE); + + /* Set interrupt-remapping table pointer */ + writel(iommu->gcmd | DMA_GCMD_SIRTP, iommu->reg + DMAR_GCMD_REG); + + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (sts & DMA_GSTS_IRTPS), sts); + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + + /* + * Global invalidation of interrupt entry cache to make sure the + * hardware uses the new irq remapping table. + */ + if (!cap_esirtps(iommu->cap)) + qi_global_iec(iommu); +} + +static void iommu_enable_irq_remapping(struct intel_iommu *iommu) +{ + unsigned long flags; + u32 sts; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + + /* Enable interrupt-remapping */ + iommu->gcmd |= DMA_GCMD_IRE; + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (sts & DMA_GSTS_IRES), sts); + + /* Block compatibility-format MSIs */ + if (sts & DMA_GSTS_CFIS) { + iommu->gcmd &= ~DMA_GCMD_CFI; + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, !(sts & DMA_GSTS_CFIS), sts); + } + + /* + * With CFI clear in the Global Command register, we should be + * protected from dangerous (i.e. compatibility) interrupts + * regardless of x2apic status. Check just to be sure. + */ + if (sts & DMA_GSTS_CFIS) + WARN(1, KERN_WARNING + "Compatibility-format IRQs enabled despite intr remapping;\n" + "you are vulnerable to IRQ injection.\n"); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +static int intel_setup_irq_remapping(struct intel_iommu *iommu) +{ + struct irq_domain_info info = { + .ops = &intel_ir_domain_ops, + .parent = arch_get_ir_parent_domain(), + .domain_flags = IRQ_DOMAIN_FLAG_ISOLATED_MSI, + .size = INTR_REMAP_TABLE_ENTRIES, + .host_data = iommu, + }; + struct ir_table *ir_table; + unsigned long *bitmap; + void *ir_table_base; + + if (iommu->ir_table) + return 0; + + ir_table = kzalloc(sizeof(struct ir_table), GFP_KERNEL); + if (!ir_table) + return -ENOMEM; + + /* 1MB - maximum possible interrupt remapping table size */ + ir_table_base = + iommu_alloc_pages_node_sz(iommu->node, GFP_KERNEL, SZ_1M); + if (!ir_table_base) { + pr_err("IR%d: failed to allocate 1M of pages\n", iommu->seq_id); + goto out_free_table; + } + + bitmap = bitmap_zalloc(INTR_REMAP_TABLE_ENTRIES, GFP_KERNEL); + if (bitmap == NULL) { + pr_err("IR%d: failed to allocate bitmap\n", iommu->seq_id); + goto out_free_pages; + } + + info.fwnode = irq_domain_alloc_named_id_fwnode("INTEL-IR", iommu->seq_id); + if (!info.fwnode) + goto out_free_bitmap; + + iommu->ir_domain = msi_create_parent_irq_domain(&info, &dmar_msi_parent_ops); + if (!iommu->ir_domain) { + pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id); + goto out_free_fwnode; + } + + ir_table->base = ir_table_base; + ir_table->bitmap = bitmap; + iommu->ir_table = ir_table; + + /* + * If the queued invalidation is already initialized, + * shouldn't disable it. + */ + if (!iommu->qi) { + /* + * Clear previous faults. + */ + dmar_fault(-1, iommu); + dmar_disable_qi(iommu); + + if (dmar_enable_qi(iommu)) { + pr_err("Failed to enable queued invalidation\n"); + goto out_free_ir_domain; + } + } + + init_ir_status(iommu); + + if (ir_pre_enabled(iommu)) { + if (!is_kdump_kernel()) { + pr_info_once("IRQ remapping was enabled on %s but we are not in kdump mode\n", + iommu->name); + clear_ir_pre_enabled(iommu); + iommu_disable_irq_remapping(iommu); + } else if (iommu_load_old_irte(iommu)) + pr_err("Failed to copy IR table for %s from previous kernel\n", + iommu->name); + else + pr_info("Copied IR table for %s from previous kernel\n", + iommu->name); + } + + iommu_set_irq_remapping(iommu, eim_mode); + + return 0; + +out_free_ir_domain: + irq_domain_remove(iommu->ir_domain); + iommu->ir_domain = NULL; +out_free_fwnode: + irq_domain_free_fwnode(info.fwnode); +out_free_bitmap: + bitmap_free(bitmap); +out_free_pages: + iommu_free_pages(ir_table_base); +out_free_table: + kfree(ir_table); + + iommu->ir_table = NULL; + + return -ENOMEM; +} + +static void intel_teardown_irq_remapping(struct intel_iommu *iommu) +{ + struct fwnode_handle *fn; + + if (iommu && iommu->ir_table) { + if (iommu->ir_domain) { + fn = iommu->ir_domain->fwnode; + + irq_domain_remove(iommu->ir_domain); + irq_domain_free_fwnode(fn); + iommu->ir_domain = NULL; + } + iommu_free_pages(iommu->ir_table->base); + bitmap_free(iommu->ir_table->bitmap); + kfree(iommu->ir_table); + iommu->ir_table = NULL; + } +} + +/* + * Disable Interrupt Remapping. + */ +static void iommu_disable_irq_remapping(struct intel_iommu *iommu) +{ + unsigned long flags; + u32 sts; + + if (!ecap_ir_support(iommu->ecap)) + return; + + /* + * global invalidation of interrupt entry cache before disabling + * interrupt-remapping. + */ + if (!cap_esirtps(iommu->cap)) + qi_global_iec(iommu); + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + + sts = readl(iommu->reg + DMAR_GSTS_REG); + if (!(sts & DMA_GSTS_IRES)) + goto end; + + iommu->gcmd &= ~DMA_GCMD_IRE; + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, !(sts & DMA_GSTS_IRES), sts); + +end: + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +static int __init dmar_x2apic_optout(void) +{ + struct acpi_table_dmar *dmar; + dmar = (struct acpi_table_dmar *)dmar_tbl; + if (!dmar || no_x2apic_optout) + return 0; + return dmar->flags & DMAR_X2APIC_OPT_OUT; +} + +static void __init intel_cleanup_irq_remapping(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + + for_each_iommu(iommu, drhd) { + if (ecap_ir_support(iommu->ecap)) { + iommu_disable_irq_remapping(iommu); + intel_teardown_irq_remapping(iommu); + } + } + + if (x2apic_supported()) + pr_warn("Failed to enable irq remapping. You are vulnerable to irq-injection attacks.\n"); +} + +static int __init intel_prepare_irq_remapping(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + int eim = 0; + + if (irq_remap_broken) { + pr_warn("This system BIOS has enabled interrupt remapping\n" + "on a chipset that contains an erratum making that\n" + "feature unstable. To maintain system stability\n" + "interrupt remapping is being disabled. Please\n" + "contact your BIOS vendor for an update\n"); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + return -ENODEV; + } + + if (dmar_table_init() < 0) + return -ENODEV; + + if (!dmar_ir_support()) + return -ENODEV; + + if (parse_ioapics_under_ir()) { + pr_info("Not enabling interrupt remapping\n"); + goto error; + } + + /* First make sure all IOMMUs support IRQ remapping */ + for_each_iommu(iommu, drhd) + if (!ecap_ir_support(iommu->ecap)) + goto error; + + /* Detect remapping mode: lapic or x2apic */ + if (x2apic_supported()) { + eim = !dmar_x2apic_optout(); + if (!eim) { + pr_info("x2apic is disabled because BIOS sets x2apic opt out bit."); + pr_info("Use 'intremap=no_x2apic_optout' to override the BIOS setting.\n"); + } + } + + for_each_iommu(iommu, drhd) { + if (eim && !ecap_eim_support(iommu->ecap)) { + pr_info("%s does not support EIM\n", iommu->name); + eim = 0; + } + } + + eim_mode = eim; + if (eim) + pr_info("Queued invalidation will be enabled to support x2apic and Intr-remapping.\n"); + + /* Do the initializations early */ + for_each_iommu(iommu, drhd) { + if (intel_setup_irq_remapping(iommu)) { + pr_err("Failed to setup irq remapping for %s\n", + iommu->name); + goto error; + } + } + + return 0; + +error: + intel_cleanup_irq_remapping(); + return -ENODEV; +} + +/* + * Set Posted-Interrupts capability. + */ +static inline void set_irq_posting_cap(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + + if (!disable_irq_post) { + /* + * If IRTE is in posted format, the 'pda' field goes across the + * 64-bit boundary, we need use cmpxchg16b to atomically update + * it. We only expose posted-interrupt when X86_FEATURE_CX16 + * is supported. Actually, hardware platforms supporting PI + * should have X86_FEATURE_CX16 support, this has been confirmed + * with Intel hardware guys. + */ + if (boot_cpu_has(X86_FEATURE_CX16)) + intel_irq_remap_ops.capability |= 1 << IRQ_POSTING_CAP; + + for_each_iommu(iommu, drhd) + if (!cap_pi_support(iommu->cap)) { + intel_irq_remap_ops.capability &= + ~(1 << IRQ_POSTING_CAP); + break; + } + } +} + +static int __init intel_enable_irq_remapping(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + bool setup = false; + + /* + * Setup Interrupt-remapping for all the DRHD's now. + */ + for_each_iommu(iommu, drhd) { + if (!ir_pre_enabled(iommu)) + iommu_enable_irq_remapping(iommu); + setup = true; + } + + if (!setup) + goto error; + + irq_remapping_enabled = 1; + + set_irq_posting_cap(); + + pr_info("Enabled IRQ remapping in %s mode\n", eim_mode ? "x2apic" : "xapic"); + + return eim_mode ? IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE; + +error: + intel_cleanup_irq_remapping(); + return -1; +} + +static int ir_parse_one_hpet_scope(struct acpi_dmar_device_scope *scope, + struct intel_iommu *iommu, + struct acpi_dmar_hardware_unit *drhd) +{ + struct acpi_dmar_pci_path *path; + u8 bus; + int count, free = -1; + + bus = scope->bus; + path = (struct acpi_dmar_pci_path *)(scope + 1); + count = (scope->length - sizeof(struct acpi_dmar_device_scope)) + / sizeof(struct acpi_dmar_pci_path); + + while (--count > 0) { + /* + * Access PCI directly due to the PCI + * subsystem isn't initialized yet. + */ + bus = read_pci_config_byte(bus, path->device, path->function, + PCI_SECONDARY_BUS); + path++; + } + + for (count = 0; count < MAX_HPET_TBS; count++) { + if (ir_hpet[count].iommu == iommu && + ir_hpet[count].id == scope->enumeration_id) + return 0; + else if (ir_hpet[count].iommu == NULL && free == -1) + free = count; + } + if (free == -1) { + pr_warn("Exceeded Max HPET blocks\n"); + return -ENOSPC; + } + + ir_hpet[free].iommu = iommu; + ir_hpet[free].id = scope->enumeration_id; + ir_hpet[free].bus = bus; + ir_hpet[free].devfn = PCI_DEVFN(path->device, path->function); + pr_info("HPET id %d under DRHD base 0x%Lx\n", + scope->enumeration_id, drhd->address); + + return 0; +} + +static int ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope, + struct intel_iommu *iommu, + struct acpi_dmar_hardware_unit *drhd) +{ + struct acpi_dmar_pci_path *path; + u8 bus; + int count, free = -1; + + bus = scope->bus; + path = (struct acpi_dmar_pci_path *)(scope + 1); + count = (scope->length - sizeof(struct acpi_dmar_device_scope)) + / sizeof(struct acpi_dmar_pci_path); + + while (--count > 0) { + /* + * Access PCI directly due to the PCI + * subsystem isn't initialized yet. + */ + bus = read_pci_config_byte(bus, path->device, path->function, + PCI_SECONDARY_BUS); + path++; + } + + for (count = 0; count < MAX_IO_APICS; count++) { + if (ir_ioapic[count].iommu == iommu && + ir_ioapic[count].id == scope->enumeration_id) + return 0; + else if (ir_ioapic[count].iommu == NULL && free == -1) + free = count; + } + if (free == -1) { + pr_warn("Exceeded Max IO APICS\n"); + return -ENOSPC; + } + + ir_ioapic[free].bus = bus; + ir_ioapic[free].devfn = PCI_DEVFN(path->device, path->function); + ir_ioapic[free].iommu = iommu; + ir_ioapic[free].id = scope->enumeration_id; + pr_info("IOAPIC id %d under DRHD base 0x%Lx IOMMU %d\n", + scope->enumeration_id, drhd->address, iommu->seq_id); + + return 0; +} + +static int ir_parse_ioapic_hpet_scope(struct acpi_dmar_header *header, + struct intel_iommu *iommu) +{ + int ret = 0; + struct acpi_dmar_hardware_unit *drhd; + struct acpi_dmar_device_scope *scope; + void *start, *end; + + drhd = (struct acpi_dmar_hardware_unit *)header; + start = (void *)(drhd + 1); + end = ((void *)drhd) + header->length; + + while (start < end && ret == 0) { + scope = start; + if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC) + ret = ir_parse_one_ioapic_scope(scope, iommu, drhd); + else if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_HPET) + ret = ir_parse_one_hpet_scope(scope, iommu, drhd); + start += scope->length; + } + + return ret; +} + +static void ir_remove_ioapic_hpet_scope(struct intel_iommu *iommu) +{ + int i; + + for (i = 0; i < MAX_HPET_TBS; i++) + if (ir_hpet[i].iommu == iommu) + ir_hpet[i].iommu = NULL; + + for (i = 0; i < MAX_IO_APICS; i++) + if (ir_ioapic[i].iommu == iommu) + ir_ioapic[i].iommu = NULL; +} + +/* + * Finds the assocaition between IOAPIC's and its Interrupt-remapping + * hardware unit. + */ +static int __init parse_ioapics_under_ir(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + bool ir_supported = false; + int ioapic_idx; + + for_each_iommu(iommu, drhd) { + int ret; + + if (!ecap_ir_support(iommu->ecap)) + continue; + + ret = ir_parse_ioapic_hpet_scope(drhd->hdr, iommu); + if (ret) + return ret; + + ir_supported = true; + } + + if (!ir_supported) + return -ENODEV; + + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) { + int ioapic_id = mpc_ioapic_id(ioapic_idx); + if (!map_ioapic_to_iommu(ioapic_id)) { + pr_err(FW_BUG "ioapic %d has no mapping iommu, " + "interrupt remapping will be disabled\n", + ioapic_id); + return -1; + } + } + + return 0; +} + +static int __init ir_dev_scope_init(void) +{ + int ret; + + if (!irq_remapping_enabled) + return 0; + + down_write(&dmar_global_lock); + ret = dmar_dev_scope_init(); + up_write(&dmar_global_lock); + + return ret; +} +rootfs_initcall(ir_dev_scope_init); + +static void disable_irq_remapping(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu = NULL; + + /* + * Disable Interrupt-remapping for all the DRHD's now. + */ + for_each_iommu(iommu, drhd) { + if (!ecap_ir_support(iommu->ecap)) + continue; + + iommu_disable_irq_remapping(iommu); + } + + /* + * Clear Posted-Interrupts capability. + */ + if (!disable_irq_post) + intel_irq_remap_ops.capability &= ~(1 << IRQ_POSTING_CAP); +} + +static int reenable_irq_remapping(int eim) +{ + struct dmar_drhd_unit *drhd; + bool setup = false; + struct intel_iommu *iommu = NULL; + + for_each_iommu(iommu, drhd) + if (iommu->qi) + dmar_reenable_qi(iommu); + + /* + * Setup Interrupt-remapping for all the DRHD's now. + */ + for_each_iommu(iommu, drhd) { + if (!ecap_ir_support(iommu->ecap)) + continue; + + /* Set up interrupt remapping for iommu.*/ + iommu_set_irq_remapping(iommu, eim); + iommu_enable_irq_remapping(iommu); + setup = true; + } + + if (!setup) + goto error; + + set_irq_posting_cap(); + + return 0; + +error: + /* + * handle error condition gracefully here! + */ + return -1; +} + +/* + * Store the MSI remapping domain pointer in the device if enabled. + * + * This is called from dmar_pci_bus_add_dev() so it works even when DMA + * remapping is disabled. Only update the pointer if the device is not + * already handled by a non default PCI/MSI interrupt domain. This protects + * e.g. VMD devices. + */ +void intel_irq_remap_add_device(struct dmar_pci_notify_info *info) +{ + if (!irq_remapping_enabled || !pci_dev_has_default_msi_parent_domain(info->dev)) + return; + + dev_set_msi_domain(&info->dev->dev, map_dev_to_ir(info->dev)); +} + +static void prepare_irte(struct irte *irte, int vector, unsigned int dest) +{ + memset(irte, 0, sizeof(*irte)); + + irte->present = 1; + irte->dst_mode = apic->dest_mode_logical; + /* + * Trigger mode in the IRTE will always be edge, and for IO-APIC, the + * actual level or edge trigger will be setup in the IO-APIC + * RTE. This will help simplify level triggered irq migration. + * For more details, see the comments (in io_apic.c) explainig IO-APIC + * irq migration in the presence of interrupt-remapping. + */ + irte->trigger_mode = 0; + irte->dlvry_mode = APIC_DELIVERY_MODE_FIXED; + irte->vector = vector; + irte->dest_id = IRTE_DEST(dest); + irte->redir_hint = 1; +} + +static void prepare_irte_posted(struct irte *irte) +{ + memset(irte, 0, sizeof(*irte)); + + irte->present = 1; + irte->p_pst = 1; +} + +struct irq_remap_ops intel_irq_remap_ops = { + .prepare = intel_prepare_irq_remapping, + .enable = intel_enable_irq_remapping, + .disable = disable_irq_remapping, + .reenable = reenable_irq_remapping, + .enable_faulting = enable_drhd_fault_handling, +}; + +#ifdef CONFIG_X86_POSTED_MSI + +static phys_addr_t get_pi_desc_addr(struct irq_data *irqd) +{ + int cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd)); + + if (WARN_ON(cpu >= nr_cpu_ids)) + return 0; + + return __pa(per_cpu_ptr(&posted_msi_pi_desc, cpu)); +} + +static void intel_ir_reconfigure_irte_posted(struct irq_data *irqd) +{ + struct intel_ir_data *ir_data = irqd->chip_data; + struct irte *irte = &ir_data->irte_entry; + struct irte irte_pi; + u64 pid_addr; + + pid_addr = get_pi_desc_addr(irqd); + + if (!pid_addr) { + pr_warn("Failed to setup IRQ %d for posted mode", irqd->irq); + return; + } + + memset(&irte_pi, 0, sizeof(irte_pi)); + + /* The shared IRTE already be set up as posted during alloc_irte */ + dmar_copy_shared_irte(&irte_pi, irte); + + irte_pi.pda_l = (pid_addr >> (32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT); + irte_pi.pda_h = (pid_addr >> 32) & ~(-1UL << PDA_HIGH_BIT); + + modify_irte(&ir_data->irq_2_iommu, &irte_pi); +} + +#else +static inline void intel_ir_reconfigure_irte_posted(struct irq_data *irqd) {} +#endif + +static void __intel_ir_reconfigure_irte(struct irq_data *irqd, bool force_host) +{ + struct intel_ir_data *ir_data = irqd->chip_data; + + /* + * Don't modify IRTEs for IRQs that are being posted to vCPUs if the + * host CPU affinity changes. + */ + if (ir_data->irq_2_iommu.posted_vcpu && !force_host) + return; + + ir_data->irq_2_iommu.posted_vcpu = false; + + if (ir_data->irq_2_iommu.posted_msi) + intel_ir_reconfigure_irte_posted(irqd); + else + modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry); +} + +static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force_host) +{ + struct intel_ir_data *ir_data = irqd->chip_data; + struct irte *irte = &ir_data->irte_entry; + struct irq_cfg *cfg = irqd_cfg(irqd); + + /* + * Atomically updates the IRTE with the new destination, vector + * and flushes the interrupt entry cache. + */ + irte->vector = cfg->vector; + irte->dest_id = IRTE_DEST(cfg->dest_apicid); + + __intel_ir_reconfigure_irte(irqd, force_host); +} + +/* + * Migrate the IO-APIC irq in the presence of intr-remapping. + * + * For both level and edge triggered, irq migration is a simple atomic + * update(of vector and cpu destination) of IRTE and flush the hardware cache. + * + * For level triggered, we eliminate the io-apic RTE modification (with the + * updated vector information), by using a virtual vector (io-apic pin number). + * Real vector that is used for interrupting cpu will be coming from + * the interrupt-remapping table entry. + * + * As the migration is a simple atomic update of IRTE, the same mechanism + * is used to migrate MSI irq's in the presence of interrupt-remapping. + */ +static int +intel_ir_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_data *parent = data->parent_data; + struct irq_cfg *cfg = irqd_cfg(data); + int ret; + + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) + return ret; + + intel_ir_reconfigure_irte(data, false); + /* + * After this point, all the interrupts will start arriving + * at the new destination. So, time to cleanup the previous + * vector allocation. + */ + vector_schedule_cleanup(cfg); + + return IRQ_SET_MASK_OK_DONE; +} + +static void intel_ir_compose_msi_msg(struct irq_data *irq_data, + struct msi_msg *msg) +{ + struct intel_ir_data *ir_data = irq_data->chip_data; + + *msg = ir_data->msi_entry; +} + +static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info) +{ + struct intel_ir_data *ir_data = data->chip_data; + struct intel_iommu_pi_data *pi_data = info; + + /* stop posting interrupts, back to the default mode */ + if (!pi_data) { + __intel_ir_reconfigure_irte(data, true); + } else { + struct irte irte_pi; + + /* + * We are not caching the posted interrupt entry. We + * copy the data from the remapped entry and modify + * the fields which are relevant for posted mode. The + * cached remapped entry is used for switching back to + * remapped mode. + */ + memset(&irte_pi, 0, sizeof(irte_pi)); + dmar_copy_shared_irte(&irte_pi, &ir_data->irte_entry); + + /* Update the posted mode fields */ + irte_pi.p_pst = 1; + irte_pi.p_urgent = 0; + irte_pi.p_vector = pi_data->vector; + irte_pi.pda_l = (pi_data->pi_desc_addr >> + (32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT); + irte_pi.pda_h = (pi_data->pi_desc_addr >> 32) & + ~(-1UL << PDA_HIGH_BIT); + + ir_data->irq_2_iommu.posted_vcpu = true; + modify_irte(&ir_data->irq_2_iommu, &irte_pi); + } + + return 0; +} + +static struct irq_chip intel_ir_chip = { + .name = "INTEL-IR", + .irq_ack = apic_ack_irq, + .irq_set_affinity = intel_ir_set_affinity, + .irq_compose_msi_msg = intel_ir_compose_msi_msg, + .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity, +}; + +/* + * With posted MSIs, the MSI vectors are multiplexed into a single notification + * vector, and only the notification vector is sent to the APIC IRR. Device + * MSIs are then dispatched in a demux loop that harvests the MSIs from the + * CPU's Posted Interrupt Request bitmap. I.e. Posted MSIs never get sent to + * the APIC IRR, and thus do not need an EOI. The notification handler instead + * performs a single EOI after processing the PIR. + * + * Note! Pending SMP/CPU affinity changes, which are per MSI, must still be + * honored, only the APIC EOI is omitted. + * + * For the example below, 3 MSIs are coalesced into one CPU notification. Only + * one apic_eoi() is needed, but each MSI needs to process pending changes to + * its CPU affinity. + * + * __sysvec_posted_msi_notification() + * irq_enter(); + * handle_edge_irq() + * irq_chip_ack_parent() + * irq_move_irq(); // No EOI + * handle_irq_event() + * driver_handler() + * handle_edge_irq() + * irq_chip_ack_parent() + * irq_move_irq(); // No EOI + * handle_irq_event() + * driver_handler() + * handle_edge_irq() + * irq_chip_ack_parent() + * irq_move_irq(); // No EOI + * handle_irq_event() + * driver_handler() + * apic_eoi() + * irq_exit() + * + */ +static struct irq_chip intel_ir_chip_post_msi = { + .name = "INTEL-IR-POST", + .irq_ack = irq_move_irq, + .irq_set_affinity = intel_ir_set_affinity, + .irq_compose_msi_msg = intel_ir_compose_msi_msg, + .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity, +}; + +static void fill_msi_msg(struct msi_msg *msg, u32 index, u32 subhandle) +{ + memset(msg, 0, sizeof(*msg)); + + msg->arch_addr_lo.dmar_base_address = X86_MSI_BASE_ADDRESS_LOW; + msg->arch_addr_lo.dmar_subhandle_valid = true; + msg->arch_addr_lo.dmar_format = true; + msg->arch_addr_lo.dmar_index_0_14 = index & 0x7FFF; + msg->arch_addr_lo.dmar_index_15 = !!(index & 0x8000); + + msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; + + msg->arch_data.dmar_subhandle = subhandle; +} + +static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, + struct irq_cfg *irq_cfg, + struct irq_alloc_info *info, + int index, int sub_handle) +{ + struct irte *irte = &data->irte_entry; + + prepare_irte(irte, irq_cfg->vector, irq_cfg->dest_apicid); + + switch (info->type) { + case X86_IRQ_ALLOC_TYPE_IOAPIC: + /* Set source-id of interrupt request */ + set_ioapic_sid(irte, info->devid); + apic_pr_verbose("IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n", + info->devid, irte->present, irte->fpd, irte->dst_mode, + irte->redir_hint, irte->trigger_mode, irte->dlvry_mode, + irte->avail, irte->vector, irte->dest_id, irte->sid, + irte->sq, irte->svt); + sub_handle = info->ioapic.pin; + break; + case X86_IRQ_ALLOC_TYPE_HPET: + set_hpet_sid(irte, info->devid); + break; + case X86_IRQ_ALLOC_TYPE_PCI_MSI: + case X86_IRQ_ALLOC_TYPE_PCI_MSIX: + if (posted_msi_supported()) { + prepare_irte_posted(irte); + data->irq_2_iommu.posted_msi = 1; + } + + set_msi_sid(irte, + pci_real_dma_dev(msi_desc_to_pci_dev(info->desc))); + break; + default: + BUG_ON(1); + break; + } + fill_msi_msg(&data->msi_entry, index, sub_handle); +} + +static void intel_free_irq_resources(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *irq_data; + struct intel_ir_data *data; + struct irq_2_iommu *irq_iommu; + unsigned long flags; + int i; + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(domain, virq + i); + if (irq_data && irq_data->chip_data) { + data = irq_data->chip_data; + irq_iommu = &data->irq_2_iommu; + raw_spin_lock_irqsave(&irq_2_ir_lock, flags); + clear_entries(irq_iommu); + raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); + irq_domain_reset_irq_data(irq_data); + kfree(data); + } + } +} + +static int intel_irq_remapping_alloc(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs, + void *arg) +{ + struct intel_iommu *iommu = domain->host_data; + struct irq_alloc_info *info = arg; + struct intel_ir_data *data, *ird; + struct irq_data *irq_data; + struct irq_cfg *irq_cfg; + int i, ret, index; + + if (!info || !iommu) + return -EINVAL; + if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI) + return -EINVAL; + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret < 0) + return ret; + + ret = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out_free_parent; + + index = alloc_irte(iommu, &data->irq_2_iommu, nr_irqs); + if (index < 0) { + pr_warn("Failed to allocate IRTE\n"); + kfree(data); + goto out_free_parent; + } + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(domain, virq + i); + irq_cfg = irqd_cfg(irq_data); + if (!irq_data || !irq_cfg) { + if (!i) + kfree(data); + ret = -EINVAL; + goto out_free_data; + } + + if (i > 0) { + ird = kzalloc(sizeof(*ird), GFP_KERNEL); + if (!ird) + goto out_free_data; + /* Initialize the common data */ + ird->irq_2_iommu = data->irq_2_iommu; + ird->irq_2_iommu.sub_handle = i; + } else { + ird = data; + } + + irq_data->hwirq = (index << 16) + i; + irq_data->chip_data = ird; + if (posted_msi_supported() && + ((info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI) || + (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX))) + irq_data->chip = &intel_ir_chip_post_msi; + else + irq_data->chip = &intel_ir_chip; + intel_irq_remapping_prepare_irte(ird, irq_cfg, info, index, i); + } + return 0; + +out_free_data: + intel_free_irq_resources(domain, virq, i); +out_free_parent: + irq_domain_free_irqs_common(domain, virq, nr_irqs); + return ret; +} + +static void intel_irq_remapping_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + intel_free_irq_resources(domain, virq, nr_irqs); + irq_domain_free_irqs_common(domain, virq, nr_irqs); +} + +static int intel_irq_remapping_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool reserve) +{ + intel_ir_reconfigure_irte(irq_data, true); + return 0; +} + +static void intel_irq_remapping_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct intel_ir_data *data = irq_data->chip_data; + struct irte entry; + + WARN_ON_ONCE(data->irq_2_iommu.posted_vcpu); + data->irq_2_iommu.posted_vcpu = false; + + memset(&entry, 0, sizeof(entry)); + modify_irte(&data->irq_2_iommu, &entry); +} + +static int intel_irq_remapping_select(struct irq_domain *d, + struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token) +{ + struct intel_iommu *iommu = NULL; + + if (x86_fwspec_is_ioapic(fwspec)) + iommu = map_ioapic_to_iommu(fwspec->param[0]); + else if (x86_fwspec_is_hpet(fwspec)) + iommu = map_hpet_to_iommu(fwspec->param[0]); + + return iommu && d == iommu->ir_domain; +} + +static const struct irq_domain_ops intel_ir_domain_ops = { + .select = intel_irq_remapping_select, + .alloc = intel_irq_remapping_alloc, + .free = intel_irq_remapping_free, + .activate = intel_irq_remapping_activate, + .deactivate = intel_irq_remapping_deactivate, +}; + +static const struct msi_parent_ops dmar_msi_parent_ops = { + .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | MSI_FLAG_MULTI_PCI_MSI, + .bus_select_token = DOMAIN_BUS_DMAR, + .bus_select_mask = MATCH_PCI_MSI, + .prefix = "IR-", + .init_dev_msi_info = msi_parent_init_dev_msi_info, +}; + +/* + * Support of Interrupt Remapping Unit Hotplug + */ +static int dmar_ir_add(struct dmar_drhd_unit *dmaru, struct intel_iommu *iommu) +{ + int ret; + int eim = x2apic_enabled(); + + if (eim && !ecap_eim_support(iommu->ecap)) { + pr_info("DRHD %Lx: EIM not supported by DRHD, ecap %Lx\n", + iommu->reg_phys, iommu->ecap); + return -ENODEV; + } + + if (ir_parse_ioapic_hpet_scope(dmaru->hdr, iommu)) { + pr_warn("DRHD %Lx: failed to parse managed IOAPIC/HPET\n", + iommu->reg_phys); + return -ENODEV; + } + + /* TODO: check all IOAPICs are covered by IOMMU */ + + /* Setup Interrupt-remapping now. */ + ret = intel_setup_irq_remapping(iommu); + if (ret) { + pr_err("Failed to setup irq remapping for %s\n", + iommu->name); + intel_teardown_irq_remapping(iommu); + ir_remove_ioapic_hpet_scope(iommu); + } else { + iommu_enable_irq_remapping(iommu); + } + + return ret; +} + +int dmar_ir_hotplug(struct dmar_drhd_unit *dmaru, bool insert) +{ + int ret = 0; + struct intel_iommu *iommu = dmaru->iommu; + + if (!irq_remapping_enabled) + return 0; + if (iommu == NULL) + return -EINVAL; + if (!ecap_ir_support(iommu->ecap)) + return 0; + if (irq_remapping_cap(IRQ_POSTING_CAP) && + !cap_pi_support(iommu->cap)) + return -EBUSY; + + if (insert) { + if (!iommu->ir_table) + ret = dmar_ir_add(dmaru, iommu); + } else { + if (iommu->ir_table) { + if (!bitmap_empty(iommu->ir_table->bitmap, + INTR_REMAP_TABLE_ENTRIES)) { + ret = -EBUSY; + } else { + iommu_disable_irq_remapping(iommu); + intel_teardown_irq_remapping(iommu); + ir_remove_ioapic_hpet_scope(iommu); + } + } + } + + return ret; +} diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c new file mode 100644 index 000000000000..a3fb8c193ca6 --- /dev/null +++ b/drivers/iommu/intel/nested.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * nested.c - nested mode translation support + * + * Copyright (C) 2023 Intel Corporation + * + * Author: Lu Baolu <baolu.lu@linux.intel.com> + * Jacob Pan <jacob.jun.pan@linux.intel.com> + * Yi Liu <yi.l.liu@intel.com> + */ + +#define pr_fmt(fmt) "DMAR: " fmt + +#include <linux/iommu.h> +#include <linux/pci.h> +#include <linux/pci-ats.h> + +#include "iommu.h" +#include "pasid.h" + +static int intel_nested_attach_dev(struct iommu_domain *domain, + struct device *dev, struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + unsigned long flags; + int ret = 0; + + device_block_translation(dev); + + /* + * Stage-1 domain cannot work alone, it is nested on a s2_domain. + * The s2_domain will be used in nested translation, hence needs + * to ensure the s2_domain is compatible with this IOMMU. + */ + ret = paging_domain_compatible(&dmar_domain->s2_domain->domain, dev); + if (ret) { + dev_err_ratelimited(dev, "s2 domain is not compatible\n"); + return ret; + } + + ret = domain_attach_iommu(dmar_domain, iommu); + if (ret) { + dev_err_ratelimited(dev, "Failed to attach domain to iommu\n"); + return ret; + } + + ret = cache_tag_assign_domain(dmar_domain, dev, IOMMU_NO_PASID); + if (ret) + goto detach_iommu; + + ret = iopf_for_domain_set(domain, dev); + if (ret) + goto unassign_tag; + + ret = intel_pasid_setup_nested(iommu, dev, + IOMMU_NO_PASID, dmar_domain); + if (ret) + goto disable_iopf; + + info->domain = dmar_domain; + info->domain_attached = true; + spin_lock_irqsave(&dmar_domain->lock, flags); + list_add(&info->link, &dmar_domain->devices); + spin_unlock_irqrestore(&dmar_domain->lock, flags); + + return 0; +disable_iopf: + iopf_for_domain_remove(domain, dev); +unassign_tag: + cache_tag_unassign_domain(dmar_domain, dev, IOMMU_NO_PASID); +detach_iommu: + domain_detach_iommu(dmar_domain, iommu); + + return ret; +} + +static void intel_nested_domain_free(struct iommu_domain *domain) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct dmar_domain *s2_domain = dmar_domain->s2_domain; + + spin_lock(&s2_domain->s1_lock); + list_del(&dmar_domain->s2_link); + spin_unlock(&s2_domain->s1_lock); + kfree(dmar_domain->qi_batch); + kfree(dmar_domain); +} + +static int intel_nested_cache_invalidate_user(struct iommu_domain *domain, + struct iommu_user_data_array *array) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct iommu_hwpt_vtd_s1_invalidate inv_entry; + u32 index, processed = 0; + int ret = 0; + + if (array->type != IOMMU_HWPT_INVALIDATE_DATA_VTD_S1) { + ret = -EINVAL; + goto out; + } + + for (index = 0; index < array->entry_num; index++) { + ret = iommu_copy_struct_from_user_array(&inv_entry, array, + IOMMU_HWPT_INVALIDATE_DATA_VTD_S1, + index, __reserved); + if (ret) + break; + + if ((inv_entry.flags & ~IOMMU_VTD_INV_FLAGS_LEAF) || + inv_entry.__reserved) { + ret = -EOPNOTSUPP; + break; + } + + if (!IS_ALIGNED(inv_entry.addr, VTD_PAGE_SIZE) || + ((inv_entry.npages == U64_MAX) && inv_entry.addr)) { + ret = -EINVAL; + break; + } + + cache_tag_flush_range(dmar_domain, inv_entry.addr, + inv_entry.addr + nrpages_to_size(inv_entry.npages) - 1, + inv_entry.flags & IOMMU_VTD_INV_FLAGS_LEAF); + processed++; + } + +out: + array->entry_num = processed; + return ret; +} + +static int domain_setup_nested(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_nested(iommu, dev, pasid, domain); + return intel_pasid_replace_nested(iommu, dev, pasid, + iommu_domain_did(old, iommu), + domain); +} + +static int intel_nested_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + struct dev_pasid_info *dev_pasid; + int ret; + + if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) + return -EOPNOTSUPP; + + if (context_copied(iommu, info->bus, info->devfn)) + return -EBUSY; + + ret = paging_domain_compatible(&dmar_domain->s2_domain->domain, dev); + if (ret) + return ret; + + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); + if (IS_ERR(dev_pasid)) + return PTR_ERR(dev_pasid); + + ret = iopf_for_domain_replace(domain, old, dev); + if (ret) + goto out_remove_dev_pasid; + + ret = domain_setup_nested(iommu, dmar_domain, dev, pasid, old); + if (ret) + goto out_unwind_iopf; + + domain_remove_dev_pasid(old, dev, pasid); + + return 0; + +out_unwind_iopf: + iopf_for_domain_replace(old, domain, dev); +out_remove_dev_pasid: + domain_remove_dev_pasid(domain, dev, pasid); + return ret; +} + +static const struct iommu_domain_ops intel_nested_domain_ops = { + .attach_dev = intel_nested_attach_dev, + .set_dev_pasid = intel_nested_set_dev_pasid, + .free = intel_nested_domain_free, + .cache_invalidate_user = intel_nested_cache_invalidate_user, +}; + +struct iommu_domain * +intel_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, + u32 flags, + const struct iommu_user_data *user_data) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *s2_domain = to_dmar_domain(parent); + struct intel_iommu *iommu = info->iommu; + struct iommu_hwpt_vtd_s1 vtd; + struct dmar_domain *domain; + int ret; + + if (!nested_supported(iommu) || flags & ~IOMMU_HWPT_ALLOC_PASID) + return ERR_PTR(-EOPNOTSUPP); + + /* Must be nested domain */ + if (user_data->type != IOMMU_HWPT_DATA_VTD_S1) + return ERR_PTR(-EOPNOTSUPP); + if (!intel_domain_is_ss_paging(s2_domain) || !s2_domain->nested_parent) + return ERR_PTR(-EINVAL); + + ret = iommu_copy_struct_from_user(&vtd, user_data, + IOMMU_HWPT_DATA_VTD_S1, __reserved); + if (ret) + return ERR_PTR(ret); + + domain = kzalloc(sizeof(*domain), GFP_KERNEL_ACCOUNT); + if (!domain) + return ERR_PTR(-ENOMEM); + + domain->s2_domain = s2_domain; + domain->s1_cfg = vtd; + domain->domain.ops = &intel_nested_domain_ops; + domain->domain.type = IOMMU_DOMAIN_NESTED; + INIT_LIST_HEAD(&domain->devices); + INIT_LIST_HEAD(&domain->dev_pasids); + INIT_LIST_HEAD(&domain->cache_tags); + spin_lock_init(&domain->lock); + spin_lock_init(&domain->cache_lock); + xa_init(&domain->iommu_array); + + spin_lock(&s2_domain->s1_lock); + list_add(&domain->s2_link, &s2_domain->s1_domains); + spin_unlock(&s2_domain->s1_lock); + + return &domain->domain; +} diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c new file mode 100644 index 000000000000..3e2255057079 --- /dev/null +++ b/drivers/iommu/intel/pasid.c @@ -0,0 +1,1152 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * intel-pasid.c - PASID idr, table and entry manipulation + * + * Copyright (C) 2018 Intel Corporation + * + * Author: Lu Baolu <baolu.lu@linux.intel.com> + */ + +#define pr_fmt(fmt) "DMAR: " fmt + +#include <linux/bitops.h> +#include <linux/cpufeature.h> +#include <linux/dmar.h> +#include <linux/iommu.h> +#include <linux/memory.h> +#include <linux/pci.h> +#include <linux/pci-ats.h> +#include <linux/spinlock.h> + +#include "iommu.h" +#include "pasid.h" +#include "../iommu-pages.h" + +/* + * Intel IOMMU system wide PASID name space: + */ +u32 intel_pasid_max_id = PASID_MAX; + +/* + * Per device pasid table management: + */ + +/* + * Allocate a pasid table for @dev. It should be called in a + * single-thread context. + */ +int intel_pasid_alloc_table(struct device *dev) +{ + struct device_domain_info *info; + struct pasid_table *pasid_table; + struct pasid_dir_entry *dir; + u32 max_pasid = 0; + int order, size; + + might_sleep(); + info = dev_iommu_priv_get(dev); + if (WARN_ON(!info || !dev_is_pci(dev))) + return -ENODEV; + if (WARN_ON(info->pasid_table)) + return -EEXIST; + + pasid_table = kzalloc(sizeof(*pasid_table), GFP_KERNEL); + if (!pasid_table) + return -ENOMEM; + + if (info->pasid_supported) + max_pasid = min_t(u32, pci_max_pasids(to_pci_dev(dev)), + intel_pasid_max_id); + + size = max_pasid >> (PASID_PDE_SHIFT - 3); + order = size ? get_order(size) : 0; + dir = iommu_alloc_pages_node_sz(info->iommu->node, GFP_KERNEL, + 1 << (order + PAGE_SHIFT)); + if (!dir) { + kfree(pasid_table); + return -ENOMEM; + } + + pasid_table->table = dir; + pasid_table->max_pasid = 1 << (order + PAGE_SHIFT + 3); + info->pasid_table = pasid_table; + + if (!ecap_coherent(info->iommu->ecap)) + clflush_cache_range(pasid_table->table, (1 << order) * PAGE_SIZE); + + return 0; +} + +void intel_pasid_free_table(struct device *dev) +{ + struct device_domain_info *info; + struct pasid_table *pasid_table; + struct pasid_dir_entry *dir; + struct pasid_entry *table; + int i, max_pde; + + info = dev_iommu_priv_get(dev); + if (!info || !dev_is_pci(dev) || !info->pasid_table) + return; + + pasid_table = info->pasid_table; + info->pasid_table = NULL; + + /* Free scalable mode PASID directory tables: */ + dir = pasid_table->table; + max_pde = pasid_table->max_pasid >> PASID_PDE_SHIFT; + for (i = 0; i < max_pde; i++) { + table = get_pasid_table_from_pde(&dir[i]); + iommu_free_pages(table); + } + + iommu_free_pages(pasid_table->table); + kfree(pasid_table); +} + +struct pasid_table *intel_pasid_get_table(struct device *dev) +{ + struct device_domain_info *info; + + info = dev_iommu_priv_get(dev); + if (!info) + return NULL; + + return info->pasid_table; +} + +static int intel_pasid_get_dev_max_id(struct device *dev) +{ + struct device_domain_info *info; + + info = dev_iommu_priv_get(dev); + if (!info || !info->pasid_table) + return 0; + + return info->pasid_table->max_pasid; +} + +static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid) +{ + struct device_domain_info *info; + struct pasid_table *pasid_table; + struct pasid_dir_entry *dir; + struct pasid_entry *entries; + int dir_index, index; + + pasid_table = intel_pasid_get_table(dev); + if (WARN_ON(!pasid_table || pasid >= intel_pasid_get_dev_max_id(dev))) + return NULL; + + dir = pasid_table->table; + info = dev_iommu_priv_get(dev); + dir_index = pasid >> PASID_PDE_SHIFT; + index = pasid & PASID_PTE_MASK; + +retry: + entries = get_pasid_table_from_pde(&dir[dir_index]); + if (!entries) { + u64 tmp; + + entries = iommu_alloc_pages_node_sz(info->iommu->node, + GFP_ATOMIC, SZ_4K); + if (!entries) + return NULL; + + /* + * The pasid directory table entry won't be freed after + * allocation. No worry about the race with free and + * clear. However, this entry might be populated by others + * while we are preparing it. Use theirs with a retry. + */ + tmp = 0ULL; + if (!try_cmpxchg64(&dir[dir_index].val, &tmp, + (u64)virt_to_phys(entries) | PASID_PTE_PRESENT)) { + iommu_free_pages(entries); + goto retry; + } + if (!ecap_coherent(info->iommu->ecap)) { + clflush_cache_range(entries, VTD_PAGE_SIZE); + clflush_cache_range(&dir[dir_index].val, sizeof(*dir)); + } + } + + return &entries[index]; +} + +/* + * Interfaces for PASID table entry manipulation: + */ +static void +intel_pasid_clear_entry(struct device *dev, u32 pasid, bool fault_ignore) +{ + struct pasid_entry *pe; + + pe = intel_pasid_get_entry(dev, pasid); + if (WARN_ON(!pe)) + return; + + if (fault_ignore && pasid_pte_is_present(pe)) + pasid_clear_entry_with_fpd(pe); + else + pasid_clear_entry(pe); +} + +static void +pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, + u16 did, u32 pasid) +{ + struct qi_desc desc; + + desc.qw0 = QI_PC_DID(did) | QI_PC_GRAN(QI_PC_PASID_SEL) | + QI_PC_PASID(pasid) | QI_PC_TYPE; + desc.qw1 = 0; + desc.qw2 = 0; + desc.qw3 = 0; + + qi_submit_sync(iommu, &desc, 1, 0); +} + +static void +devtlb_invalidation_with_pasid(struct intel_iommu *iommu, + struct device *dev, u32 pasid) +{ + struct device_domain_info *info; + u16 sid, qdep, pfsid; + + info = dev_iommu_priv_get(dev); + if (!info || !info->ats_enabled) + return; + + if (pci_dev_is_disconnected(to_pci_dev(dev))) + return; + + sid = PCI_DEVID(info->bus, info->devfn); + qdep = info->ats_qdep; + pfsid = info->pfsid; + + /* + * When PASID 0 is used, it indicates RID2PASID(DMA request w/o PASID), + * devTLB flush w/o PASID should be used. For non-zero PASID under + * SVA usage, device could do DMA with multiple PASIDs. It is more + * efficient to flush devTLB specific to the PASID. + */ + if (pasid == IOMMU_NO_PASID) + qi_flush_dev_iotlb(iommu, sid, pfsid, qdep, 0, 64 - VTD_PAGE_SHIFT); + else + qi_flush_dev_iotlb_pasid(iommu, sid, pfsid, pasid, qdep, 0, 64 - VTD_PAGE_SHIFT); +} + +void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, + u32 pasid, bool fault_ignore) +{ + struct pasid_entry *pte; + u16 did, pgtt; + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (WARN_ON(!pte)) { + spin_unlock(&iommu->lock); + return; + } + + if (!pasid_pte_is_present(pte)) { + if (!pasid_pte_is_fault_disabled(pte)) { + WARN_ON(READ_ONCE(pte->val[0]) != 0); + spin_unlock(&iommu->lock); + return; + } + + /* + * When a PASID is used for SVA by a device, it's possible + * that the pasid entry is non-present with the Fault + * Processing Disabled bit set. Clear the pasid entry and + * drain the PRQ for the PASID before return. + */ + pasid_clear_entry(pte); + spin_unlock(&iommu->lock); + intel_iommu_drain_pasid_prq(dev, pasid); + + return; + } + + did = pasid_get_domain_id(pte); + pgtt = pasid_pte_get_pgtt(pte); + intel_pasid_clear_entry(dev, pasid, fault_ignore); + spin_unlock(&iommu->lock); + + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + + if (pgtt == PASID_ENTRY_PGTT_PT || pgtt == PASID_ENTRY_PGTT_FL_ONLY) + qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); + else + iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); + + devtlb_invalidation_with_pasid(iommu, dev, pasid); + if (!fault_ignore) + intel_iommu_drain_pasid_prq(dev, pasid); +} + +/* + * This function flushes cache for a newly setup pasid table entry. + * Caller of it should not modify the in-use pasid table entries. + */ +static void pasid_flush_caches(struct intel_iommu *iommu, + struct pasid_entry *pte, + u32 pasid, u16 did) +{ + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + if (cap_caching_mode(iommu->cap)) { + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); + } else { + iommu_flush_write_buffer(iommu); + } +} + +/* + * This function is supposed to be used after caller updates the fields + * except for the SSADE and P bit of a pasid table entry. It does the + * below: + * - Flush cacheline if needed + * - Flush the caches per Table 28 ”Guidance to Software for Invalidations“ + * of VT-d spec 5.0. + */ +static void intel_pasid_flush_present(struct intel_iommu *iommu, + struct device *dev, + u32 pasid, u16 did, + struct pasid_entry *pte) +{ + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + /* + * VT-d spec 5.0 table28 states guides for cache invalidation: + * + * - PASID-selective-within-Domain PASID-cache invalidation + * - PASID-selective PASID-based IOTLB invalidation + * - If (pasid is RID_PASID) + * - Global Device-TLB invalidation to affected functions + * Else + * - PASID-based Device-TLB invalidation (with S=1 and + * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions + */ + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); + + devtlb_invalidation_with_pasid(iommu, dev, pasid); +} + +/* + * Set up the scalable mode pasid table entry for first only + * translation type. + */ +static void pasid_pte_config_first_level(struct intel_iommu *iommu, + struct pasid_entry *pte, + phys_addr_t fsptptr, u16 did, + int flags) +{ + lockdep_assert_held(&iommu->lock); + + pasid_clear_entry(pte); + + /* Setup the first level page table pointer: */ + pasid_set_flptr(pte, fsptptr); + + if (flags & PASID_FLAG_FL5LP) + pasid_set_flpm(pte, 1); + + if (flags & PASID_FLAG_PAGE_SNOOP) + pasid_set_pgsnp(pte); + + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, iommu->agaw); + pasid_set_page_snoop(pte, flags & PASID_FLAG_PWSNP); + + /* Setup Present and PASID Granular Transfer Type: */ + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); + pasid_set_present(pte); +} + +int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev, + phys_addr_t fsptptr, u32 pasid, u16 did, + int flags) +{ + struct pasid_entry *pte; + + if (!ecap_flts(iommu->ecap)) { + pr_err("No first level translation support on %s\n", + iommu->name); + return -EINVAL; + } + + if ((flags & PASID_FLAG_FL5LP) && !cap_fl5lp_support(iommu->cap)) { + pr_err("No 5-level paging support for first-level on %s\n", + iommu->name); + return -EINVAL; + } + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EBUSY; + } + + pasid_pte_config_first_level(iommu, pte, fsptptr, did, flags); + + spin_unlock(&iommu->lock); + + pasid_flush_caches(iommu, pte, pasid, did); + + return 0; +} + +int intel_pasid_replace_first_level(struct intel_iommu *iommu, + struct device *dev, phys_addr_t fsptptr, + u32 pasid, u16 did, u16 old_did, + int flags) +{ + struct pasid_entry *pte, new_pte; + + if (!ecap_flts(iommu->ecap)) { + pr_err("No first level translation support on %s\n", + iommu->name); + return -EINVAL; + } + + if ((flags & PASID_FLAG_FL5LP) && !cap_fl5lp_support(iommu->cap)) { + pr_err("No 5-level paging support for first-level on %s\n", + iommu->name); + return -EINVAL; + } + + pasid_pte_config_first_level(iommu, &new_pte, fsptptr, did, flags); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; + spin_unlock(&iommu->lock); + + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); + + return 0; +} + +/* + * Set up the scalable mode pasid entry for second only translation type. + */ +static void pasid_pte_config_second_level(struct intel_iommu *iommu, + struct pasid_entry *pte, + struct dmar_domain *domain, u16 did) +{ + struct pt_iommu_vtdss_hw_info pt_info; + + lockdep_assert_held(&iommu->lock); + + pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info); + pasid_clear_entry(pte); + pasid_set_domain_id(pte, did); + pasid_set_slptr(pte, pt_info.ssptptr); + pasid_set_address_width(pte, pt_info.aw); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); + pasid_set_fault_enable(pte); + pasid_set_page_snoop(pte, !(domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))); + if (domain->dirty_tracking) + pasid_set_ssade(pte); + + pasid_set_present(pte); +} + +int intel_pasid_setup_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, u32 pasid) +{ + struct pasid_entry *pte; + u16 did; + + + /* + * If hardware advertises no support for second level + * translation, return directly. + */ + if (!ecap_slts(iommu->ecap)) { + pr_err("No second level translation support on %s\n", + iommu->name); + return -EINVAL; + } + + did = domain_id_iommu(domain, iommu); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EBUSY; + } + + pasid_pte_config_second_level(iommu, pte, domain, did); + spin_unlock(&iommu->lock); + + pasid_flush_caches(iommu, pte, pasid, did); + + return 0; +} + +int intel_pasid_replace_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, u16 old_did, + u32 pasid) +{ + struct pasid_entry *pte, new_pte; + u16 did; + + /* + * If hardware advertises no support for second level + * translation, return directly. + */ + if (!ecap_slts(iommu->ecap)) { + pr_err("No second level translation support on %s\n", + iommu->name); + return -EINVAL; + } + + did = domain_id_iommu(domain, iommu); + + pasid_pte_config_second_level(iommu, &new_pte, domain, did); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; + spin_unlock(&iommu->lock); + + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); + + return 0; +} + +/* + * Set up dirty tracking on a second only or nested translation type. + */ +int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, + struct device *dev, u32 pasid, + bool enabled) +{ + struct pasid_entry *pte; + u16 did, pgtt; + + spin_lock(&iommu->lock); + + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + dev_err_ratelimited( + dev, "Failed to get pasid entry of PASID %d\n", pasid); + return -ENODEV; + } + + did = pasid_get_domain_id(pte); + pgtt = pasid_pte_get_pgtt(pte); + if (pgtt != PASID_ENTRY_PGTT_SL_ONLY && + pgtt != PASID_ENTRY_PGTT_NESTED) { + spin_unlock(&iommu->lock); + dev_err_ratelimited( + dev, + "Dirty tracking not supported on translation type %d\n", + pgtt); + return -EOPNOTSUPP; + } + + if (pasid_get_ssade(pte) == enabled) { + spin_unlock(&iommu->lock); + return 0; + } + + if (enabled) + pasid_set_ssade(pte); + else + pasid_clear_ssade(pte); + spin_unlock(&iommu->lock); + + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + /* + * From VT-d spec table 25 "Guidance to Software for Invalidations": + * + * - PASID-selective-within-Domain PASID-cache invalidation + * If (PGTT=SS or Nested) + * - Domain-selective IOTLB invalidation + * Else + * - PASID-selective PASID-based IOTLB invalidation + * - If (pasid is RID_PASID) + * - Global Device-TLB invalidation to affected functions + * Else + * - PASID-based Device-TLB invalidation (with S=1 and + * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions + */ + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + + iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); + + devtlb_invalidation_with_pasid(iommu, dev, pasid); + + return 0; +} + +/* + * Set up the scalable mode pasid entry for passthrough translation type. + */ +static void pasid_pte_config_pass_through(struct intel_iommu *iommu, + struct pasid_entry *pte, u16 did) +{ + lockdep_assert_held(&iommu->lock); + + pasid_clear_entry(pte); + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, iommu->agaw); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT); + pasid_set_fault_enable(pte); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + pasid_set_present(pte); +} + +int intel_pasid_setup_pass_through(struct intel_iommu *iommu, + struct device *dev, u32 pasid) +{ + u16 did = FLPT_DEFAULT_DID; + struct pasid_entry *pte; + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EBUSY; + } + + pasid_pte_config_pass_through(iommu, pte, did); + spin_unlock(&iommu->lock); + + pasid_flush_caches(iommu, pte, pasid, did); + + return 0; +} + +int intel_pasid_replace_pass_through(struct intel_iommu *iommu, + struct device *dev, u16 old_did, + u32 pasid) +{ + struct pasid_entry *pte, new_pte; + u16 did = FLPT_DEFAULT_DID; + + pasid_pte_config_pass_through(iommu, &new_pte, did); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; + spin_unlock(&iommu->lock); + + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); + + return 0; +} + +/* + * Set the page snoop control for a pasid entry which has been set up. + */ +void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, + struct device *dev, u32 pasid) +{ + struct pasid_entry *pte; + u16 did; + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (WARN_ON(!pte || !pasid_pte_is_present(pte))) { + spin_unlock(&iommu->lock); + return; + } + + pasid_set_pgsnp(pte); + did = pasid_get_domain_id(pte); + spin_unlock(&iommu->lock); + + intel_pasid_flush_present(iommu, dev, pasid, did, pte); +} + +static void pasid_pte_config_nestd(struct intel_iommu *iommu, + struct pasid_entry *pte, + struct iommu_hwpt_vtd_s1 *s1_cfg, + struct dmar_domain *s2_domain, + u16 did) +{ + struct pt_iommu_vtdss_hw_info pt_info; + + lockdep_assert_held(&iommu->lock); + + pt_iommu_vtdss_hw_info(&s2_domain->sspt, &pt_info); + + pasid_clear_entry(pte); + + if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL) + pasid_set_flpm(pte, 1); + + pasid_set_flptr(pte, s1_cfg->pgtbl_addr); + + if (s1_cfg->flags & IOMMU_VTD_S1_SRE) { + pasid_set_sre(pte); + if (s1_cfg->flags & IOMMU_VTD_S1_WPE) + pasid_set_wpe(pte); + } + + if (s1_cfg->flags & IOMMU_VTD_S1_EAFE) + pasid_set_eafe(pte); + + if (s2_domain->force_snooping) + pasid_set_pgsnp(pte); + + pasid_set_slptr(pte, pt_info.ssptptr); + pasid_set_fault_enable(pte); + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, pt_info.aw); + pasid_set_page_snoop(pte, !(s2_domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))); + if (s2_domain->dirty_tracking) + pasid_set_ssade(pte); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); + pasid_set_present(pte); +} + +/** + * intel_pasid_setup_nested() - Set up PASID entry for nested translation. + * @iommu: IOMMU which the device belong to + * @dev: Device to be set up for translation + * @pasid: PASID to be programmed in the device PASID table + * @domain: User stage-1 domain nested on a stage-2 domain + * + * This is used for nested translation. The input domain should be + * nested type and nested on a parent with 'is_nested_parent' flag + * set. + */ +int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, + u32 pasid, struct dmar_domain *domain) +{ + struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; + struct dmar_domain *s2_domain = domain->s2_domain; + u16 did = domain_id_iommu(domain, iommu); + struct pasid_entry *pte; + + /* Address width should match the address width supported by hardware */ + switch (s1_cfg->addr_width) { + case ADDR_WIDTH_4LEVEL: + break; + case ADDR_WIDTH_5LEVEL: + if (!cap_fl5lp_support(iommu->cap)) { + dev_err_ratelimited(dev, + "5-level paging not supported\n"); + return -EINVAL; + } + break; + default: + dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n", + s1_cfg->addr_width); + return -EINVAL; + } + + if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) { + pr_err_ratelimited("No supervisor request support on %s\n", + iommu->name); + return -EINVAL; + } + + if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) { + pr_err_ratelimited("No extended access flag support on %s\n", + iommu->name); + return -EINVAL; + } + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + if (pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EBUSY; + } + + pasid_pte_config_nestd(iommu, pte, s1_cfg, s2_domain, did); + spin_unlock(&iommu->lock); + + pasid_flush_caches(iommu, pte, pasid, did); + + return 0; +} + +int intel_pasid_replace_nested(struct intel_iommu *iommu, + struct device *dev, u32 pasid, + u16 old_did, struct dmar_domain *domain) +{ + struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; + struct dmar_domain *s2_domain = domain->s2_domain; + u16 did = domain_id_iommu(domain, iommu); + struct pasid_entry *pte, new_pte; + + /* Address width should match the address width supported by hardware */ + switch (s1_cfg->addr_width) { + case ADDR_WIDTH_4LEVEL: + break; + case ADDR_WIDTH_5LEVEL: + if (!cap_fl5lp_support(iommu->cap)) { + dev_err_ratelimited(dev, + "5-level paging not supported\n"); + return -EINVAL; + } + break; + default: + dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n", + s1_cfg->addr_width); + return -EINVAL; + } + + if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) { + pr_err_ratelimited("No supervisor request support on %s\n", + iommu->name); + return -EINVAL; + } + + if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) { + pr_err_ratelimited("No extended access flag support on %s\n", + iommu->name); + return -EINVAL; + } + + pasid_pte_config_nestd(iommu, &new_pte, s1_cfg, s2_domain, did); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; + spin_unlock(&iommu->lock); + + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); + + return 0; +} + +/* + * Interfaces to setup or teardown a pasid table to the scalable-mode + * context table entry: + */ + +static void device_pasid_table_teardown(struct device *dev, u8 bus, u8 devfn) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct context_entry *context; + u16 did; + + spin_lock(&iommu->lock); + context = iommu_context_addr(iommu, bus, devfn, false); + if (!context) { + spin_unlock(&iommu->lock); + return; + } + + did = context_domain_id(context); + context_clear_entry(context); + __iommu_flush_cache(iommu, context, sizeof(*context)); + spin_unlock(&iommu->lock); + intel_context_flush_no_pasid(info, context, did); +} + +static int pci_pasid_table_teardown(struct pci_dev *pdev, u16 alias, void *data) +{ + struct device *dev = data; + + if (dev == &pdev->dev) + device_pasid_table_teardown(dev, PCI_BUS_NUM(alias), alias & 0xff); + + return 0; +} + +void intel_pasid_teardown_sm_context(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + if (!dev_is_pci(dev)) { + device_pasid_table_teardown(dev, info->bus, info->devfn); + return; + } + + pci_for_each_dma_alias(to_pci_dev(dev), pci_pasid_table_teardown, dev); +} + +/* + * Get the PASID directory size for scalable mode context entry. + * Value of X in the PDTS field of a scalable mode context entry + * indicates PASID directory with 2^(X + 7) entries. + */ +static unsigned long context_get_sm_pds(struct pasid_table *table) +{ + unsigned long pds, max_pde; + + max_pde = table->max_pasid >> PASID_PDE_SHIFT; + pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS); + if (pds < 7) + return 0; + + return pds - 7; +} + +static int context_entry_set_pasid_table(struct context_entry *context, + struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct pasid_table *table = info->pasid_table; + struct intel_iommu *iommu = info->iommu; + unsigned long pds; + + context_clear_entry(context); + + pds = context_get_sm_pds(table); + context->lo = (u64)virt_to_phys(table->table) | context_pdts(pds); + context_set_sm_rid2pasid(context, IOMMU_NO_PASID); + + if (info->ats_supported) + context_set_sm_dte(context); + if (info->pasid_supported) + context_set_pasid(context); + if (info->pri_supported) + context_set_sm_pre(context); + + context_set_fault_enable(context); + context_set_present(context); + __iommu_flush_cache(iommu, context, sizeof(*context)); + + return 0; +} + +static int device_pasid_table_setup(struct device *dev, u8 bus, u8 devfn) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct context_entry *context; + + spin_lock(&iommu->lock); + context = iommu_context_addr(iommu, bus, devfn, true); + if (!context) { + spin_unlock(&iommu->lock); + return -ENOMEM; + } + + if (context_present(context) && !context_copied(iommu, bus, devfn)) { + spin_unlock(&iommu->lock); + return 0; + } + + if (context_copied(iommu, bus, devfn)) { + context_clear_entry(context); + __iommu_flush_cache(iommu, context, sizeof(*context)); + + /* + * For kdump cases, old valid entries may be cached due to + * the in-flight DMA and copied pgtable, but there is no + * unmapping behaviour for them, thus we need explicit cache + * flushes for all affected domain IDs and PASIDs used in + * the copied PASID table. Given that we have no idea about + * which domain IDs and PASIDs were used in the copied tables, + * upgrade them to global PASID and IOTLB cache invalidation. + */ + iommu->flush.flush_context(iommu, 0, + PCI_DEVID(bus, devfn), + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); + devtlb_invalidation_with_pasid(iommu, dev, IOMMU_NO_PASID); + + /* + * At this point, the device is supposed to finish reset at + * its driver probe stage, so no in-flight DMA will exist, + * and we don't need to worry anymore hereafter. + */ + clear_context_copied(iommu, bus, devfn); + } + + context_entry_set_pasid_table(context, dev); + spin_unlock(&iommu->lock); + + /* + * It's a non-present to present mapping. If hardware doesn't cache + * non-present entry we don't need to flush the caches. If it does + * cache non-present entries, then it does so in the special + * domain #0, which we have to flush: + */ + if (cap_caching_mode(iommu->cap)) { + iommu->flush.flush_context(iommu, 0, + PCI_DEVID(bus, devfn), + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH); + } + + return 0; +} + +static int pci_pasid_table_setup(struct pci_dev *pdev, u16 alias, void *data) +{ + struct device *dev = data; + + if (dev != &pdev->dev) + return 0; + + return device_pasid_table_setup(dev, PCI_BUS_NUM(alias), alias & 0xff); +} + +/* + * Set the device's PASID table to its context table entry. + * + * The PASID table is set to the context entries of both device itself + * and its alias requester ID for DMA. + */ +int intel_pasid_setup_sm_context(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + if (!dev_is_pci(dev)) + return device_pasid_table_setup(dev, info->bus, info->devfn); + + return pci_for_each_dma_alias(to_pci_dev(dev), pci_pasid_table_setup, dev); +} + +/* + * Global Device-TLB invalidation following changes in a context entry which + * was present. + */ +static void __context_flush_dev_iotlb(struct device_domain_info *info) +{ + if (!info->ats_enabled) + return; + + qi_flush_dev_iotlb(info->iommu, PCI_DEVID(info->bus, info->devfn), + info->pfsid, info->ats_qdep, 0, MAX_AGAW_PFN_WIDTH); + + /* + * There is no guarantee that the device DMA is stopped when it reaches + * here. Therefore, always attempt the extra device TLB invalidation + * quirk. The impact on performance is acceptable since this is not a + * performance-critical path. + */ + quirk_extra_dev_tlb_flush(info, 0, MAX_AGAW_PFN_WIDTH, IOMMU_NO_PASID, + info->ats_qdep); +} + +/* + * Cache invalidations after change in a context table entry that was present + * according to the Spec 6.5.3.3 (Guidance to Software for Invalidations). + * This helper can only be used when IOMMU is working in the legacy mode or + * IOMMU is in scalable mode but all PASID table entries of the device are + * non-present. + */ +void intel_context_flush_no_pasid(struct device_domain_info *info, + struct context_entry *context, u16 did) +{ + struct intel_iommu *iommu = info->iommu; + + /* + * Device-selective context-cache invalidation. The Domain-ID field + * of the Context-cache Invalidate Descriptor is ignored by hardware + * when operating in scalable mode. Therefore the @did value doesn't + * matter in scalable mode. + */ + iommu->flush.flush_context(iommu, did, PCI_DEVID(info->bus, info->devfn), + DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); + + /* + * For legacy mode: + * - Domain-selective IOTLB invalidation + * - Global Device-TLB invalidation to all affected functions + */ + if (!sm_supported(iommu)) { + iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); + __context_flush_dev_iotlb(info); + + return; + } + + __context_flush_dev_iotlb(info); +} diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h new file mode 100644 index 000000000000..b4c85242dc79 --- /dev/null +++ b/drivers/iommu/intel/pasid.h @@ -0,0 +1,326 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * pasid.h - PASID idr, table and entry header + * + * Copyright (C) 2018 Intel Corporation + * + * Author: Lu Baolu <baolu.lu@linux.intel.com> + */ + +#ifndef __INTEL_PASID_H +#define __INTEL_PASID_H + +#define PASID_MAX 0x100000 +#define PASID_PTE_MASK 0x3F +#define PASID_PTE_PRESENT 1 +#define PASID_PTE_FPD 2 +#define PDE_PFN_MASK PAGE_MASK +#define PASID_PDE_SHIFT 6 +#define MAX_NR_PASID_BITS 20 +#define PASID_TBL_ENTRIES BIT(PASID_PDE_SHIFT) + +#define is_pasid_enabled(entry) (((entry)->lo >> 3) & 0x1) +#define get_pasid_dir_size(entry) (1 << ((((entry)->lo >> 9) & 0x7) + 7)) + +#define PASID_FLAG_NESTED BIT(1) +#define PASID_FLAG_PAGE_SNOOP BIT(2) +#define PASID_FLAG_PWSNP BIT(2) + +/* + * The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first- + * level translation, otherwise, 4-level paging will be used. + */ +#define PASID_FLAG_FL5LP BIT(1) + +struct pasid_dir_entry { + u64 val; +}; + +struct pasid_entry { + u64 val[8]; +}; + +#define PASID_ENTRY_PGTT_FL_ONLY (1) +#define PASID_ENTRY_PGTT_SL_ONLY (2) +#define PASID_ENTRY_PGTT_NESTED (3) +#define PASID_ENTRY_PGTT_PT (4) + +/* The representative of a PASID table */ +struct pasid_table { + void *table; /* pasid table pointer */ + u32 max_pasid; /* max pasid */ +}; + +/* Get PRESENT bit of a PASID directory entry. */ +static inline bool pasid_pde_is_present(struct pasid_dir_entry *pde) +{ + return READ_ONCE(pde->val) & PASID_PTE_PRESENT; +} + +/* Get PASID table from a PASID directory entry. */ +static inline struct pasid_entry * +get_pasid_table_from_pde(struct pasid_dir_entry *pde) +{ + if (!pasid_pde_is_present(pde)) + return NULL; + + return phys_to_virt(READ_ONCE(pde->val) & PDE_PFN_MASK); +} + +/* Get PRESENT bit of a PASID table entry. */ +static inline bool pasid_pte_is_present(struct pasid_entry *pte) +{ + return READ_ONCE(pte->val[0]) & PASID_PTE_PRESENT; +} + +/* Get FPD(Fault Processing Disable) bit of a PASID table entry */ +static inline bool pasid_pte_is_fault_disabled(struct pasid_entry *pte) +{ + return READ_ONCE(pte->val[0]) & PASID_PTE_FPD; +} + +/* Get PGTT field of a PASID table entry */ +static inline u16 pasid_pte_get_pgtt(struct pasid_entry *pte) +{ + return (u16)((READ_ONCE(pte->val[0]) >> 6) & 0x7); +} + +static inline void pasid_clear_entry(struct pasid_entry *pe) +{ + WRITE_ONCE(pe->val[0], 0); + WRITE_ONCE(pe->val[1], 0); + WRITE_ONCE(pe->val[2], 0); + WRITE_ONCE(pe->val[3], 0); + WRITE_ONCE(pe->val[4], 0); + WRITE_ONCE(pe->val[5], 0); + WRITE_ONCE(pe->val[6], 0); + WRITE_ONCE(pe->val[7], 0); +} + +static inline void pasid_clear_entry_with_fpd(struct pasid_entry *pe) +{ + WRITE_ONCE(pe->val[0], PASID_PTE_FPD); + WRITE_ONCE(pe->val[1], 0); + WRITE_ONCE(pe->val[2], 0); + WRITE_ONCE(pe->val[3], 0); + WRITE_ONCE(pe->val[4], 0); + WRITE_ONCE(pe->val[5], 0); + WRITE_ONCE(pe->val[6], 0); + WRITE_ONCE(pe->val[7], 0); +} + +static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits) +{ + u64 old; + + old = READ_ONCE(*ptr); + WRITE_ONCE(*ptr, (old & ~mask) | bits); +} + +static inline u64 pasid_get_bits(u64 *ptr) +{ + return READ_ONCE(*ptr); +} + +/* + * Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode + * PASID entry. + */ +static inline void +pasid_set_domain_id(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[1], GENMASK_ULL(15, 0), value); +} + +/* + * Get domain ID value of a scalable mode PASID entry. + */ +static inline u16 +pasid_get_domain_id(struct pasid_entry *pe) +{ + return (u16)(READ_ONCE(pe->val[1]) & GENMASK_ULL(15, 0)); +} + +/* + * Setup the SLPTPTR(Second Level Page Table Pointer) field (Bit 12~63) + * of a scalable mode PASID entry. + */ +static inline void +pasid_set_slptr(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[0], VTD_PAGE_MASK, value); +} + +/* + * Setup the AW(Address Width) field (Bit 2~4) of a scalable mode PASID + * entry. + */ +static inline void +pasid_set_address_width(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[0], GENMASK_ULL(4, 2), value << 2); +} + +/* + * Setup the PGTT(PASID Granular Translation Type) field (Bit 6~8) + * of a scalable mode PASID entry. + */ +static inline void +pasid_set_translation_type(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[0], GENMASK_ULL(8, 6), value << 6); +} + +/* + * Enable fault processing by clearing the FPD(Fault Processing + * Disable) field (Bit 1) of a scalable mode PASID entry. + */ +static inline void pasid_set_fault_enable(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 1, 0); +} + +/* + * Enable second level A/D bits by setting the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry. + */ +static inline void pasid_set_ssade(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 9, 1 << 9); +} + +/* + * Disable second level A/D bits by clearing the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry. + */ +static inline void pasid_clear_ssade(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 9, 0); +} + +/* + * Checks if second level A/D bits specifically the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry is set. + */ +static inline bool pasid_get_ssade(struct pasid_entry *pe) +{ + return pasid_get_bits(&pe->val[0]) & (1 << 9); +} + +/* + * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a + * scalable mode PASID entry. + */ +static inline void pasid_set_sre(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 0, 1); +} + +/* + * Setup the WPE(Write Protect Enable) field (Bit 132) of a + * scalable mode PASID entry. + */ +static inline void pasid_set_wpe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 4, 1 << 4); +} + +/* + * Setup the P(Present) field (Bit 0) of a scalable mode PASID + * entry. + */ +static inline void pasid_set_present(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 0, 1); +} + +/* + * Setup Page Walk Snoop bit (Bit 87) of a scalable mode PASID + * entry. + */ +static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value) +{ + pasid_set_bits(&pe->val[1], 1 << 23, value << 23); +} + +/* + * Setup the Page Snoop (PGSNP) field (Bit 88) of a scalable mode + * PASID entry. + */ +static inline void +pasid_set_pgsnp(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[1], 1ULL << 24, 1ULL << 24); +} + +/* + * Setup the First Level Page table Pointer field (Bit 140~191) + * of a scalable mode PASID entry. + */ +static inline void +pasid_set_flptr(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[2], VTD_PAGE_MASK, value); +} + +/* + * Setup the First Level Paging Mode field (Bit 130~131) of a + * scalable mode PASID entry. + */ +static inline void +pasid_set_flpm(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2); +} + +/* + * Setup the Extended Access Flag Enable (EAFE) field (Bit 135) + * of a scalable mode PASID entry. + */ +static inline void pasid_set_eafe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7); +} + +extern unsigned int intel_pasid_max_id; +int intel_pasid_alloc_table(struct device *dev); +void intel_pasid_free_table(struct device *dev); +struct pasid_table *intel_pasid_get_table(struct device *dev); +int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev, + phys_addr_t fsptptr, u32 pasid, u16 did, + int flags); +int intel_pasid_setup_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, u32 pasid); +int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, + struct device *dev, u32 pasid, + bool enabled); +int intel_pasid_setup_pass_through(struct intel_iommu *iommu, + struct device *dev, u32 pasid); +int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, + u32 pasid, struct dmar_domain *domain); +int intel_pasid_replace_first_level(struct intel_iommu *iommu, + struct device *dev, phys_addr_t fsptptr, + u32 pasid, u16 did, u16 old_did, int flags); +int intel_pasid_replace_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, u16 old_did, + u32 pasid); +int intel_pasid_replace_pass_through(struct intel_iommu *iommu, + struct device *dev, u16 old_did, + u32 pasid); +int intel_pasid_replace_nested(struct intel_iommu *iommu, + struct device *dev, u32 pasid, + u16 old_did, struct dmar_domain *domain); + +void intel_pasid_tear_down_entry(struct intel_iommu *iommu, + struct device *dev, u32 pasid, + bool fault_ignore); +void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, + struct device *dev, u32 pasid); +int intel_pasid_setup_sm_context(struct device *dev); +void intel_pasid_teardown_sm_context(struct device *dev); +#endif /* __INTEL_PASID_H */ diff --git a/drivers/iommu/intel/perf.c b/drivers/iommu/intel/perf.c new file mode 100644 index 000000000000..dceeadc3ee7c --- /dev/null +++ b/drivers/iommu/intel/perf.c @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * perf.c - performance monitor + * + * Copyright (C) 2021 Intel Corporation + * + * Author: Lu Baolu <baolu.lu@linux.intel.com> + * Fenghua Yu <fenghua.yu@intel.com> + */ + +#include <linux/spinlock.h> + +#include "iommu.h" +#include "perf.h" + +static DEFINE_SPINLOCK(latency_lock); + +bool dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type) +{ + struct latency_statistic *lstat = iommu->perf_statistic; + + return lstat && lstat[type].enabled; +} + +int dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type) +{ + struct latency_statistic *lstat; + unsigned long flags; + int ret = -EBUSY; + + if (dmar_latency_enabled(iommu, type)) + return 0; + + spin_lock_irqsave(&latency_lock, flags); + if (!iommu->perf_statistic) { + iommu->perf_statistic = kcalloc(DMAR_LATENCY_NUM, sizeof(*lstat), + GFP_ATOMIC); + if (!iommu->perf_statistic) { + ret = -ENOMEM; + goto unlock_out; + } + } + + lstat = iommu->perf_statistic; + + if (!lstat[type].enabled) { + lstat[type].enabled = true; + lstat[type].counter[COUNTS_MIN] = UINT_MAX; + ret = 0; + } +unlock_out: + spin_unlock_irqrestore(&latency_lock, flags); + + return ret; +} + +void dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type) +{ + struct latency_statistic *lstat = iommu->perf_statistic; + unsigned long flags; + + if (!dmar_latency_enabled(iommu, type)) + return; + + spin_lock_irqsave(&latency_lock, flags); + memset(&lstat[type], 0, sizeof(*lstat) * DMAR_LATENCY_NUM); + spin_unlock_irqrestore(&latency_lock, flags); +} + +void dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 latency) +{ + struct latency_statistic *lstat = iommu->perf_statistic; + unsigned long flags; + u64 min, max; + + if (!dmar_latency_enabled(iommu, type)) + return; + + spin_lock_irqsave(&latency_lock, flags); + if (latency < 100) + lstat[type].counter[COUNTS_10e2]++; + else if (latency < 1000) + lstat[type].counter[COUNTS_10e3]++; + else if (latency < 10000) + lstat[type].counter[COUNTS_10e4]++; + else if (latency < 100000) + lstat[type].counter[COUNTS_10e5]++; + else if (latency < 1000000) + lstat[type].counter[COUNTS_10e6]++; + else if (latency < 10000000) + lstat[type].counter[COUNTS_10e7]++; + else + lstat[type].counter[COUNTS_10e8_plus]++; + + min = lstat[type].counter[COUNTS_MIN]; + max = lstat[type].counter[COUNTS_MAX]; + lstat[type].counter[COUNTS_MIN] = min_t(u64, min, latency); + lstat[type].counter[COUNTS_MAX] = max_t(u64, max, latency); + lstat[type].counter[COUNTS_SUM] += latency; + lstat[type].samples++; + spin_unlock_irqrestore(&latency_lock, flags); +} + +static char *latency_counter_names[] = { + " <0.1us", + " 0.1us-1us", " 1us-10us", " 10us-100us", + " 100us-1ms", " 1ms-10ms", " >=10ms", + " min(us)", " max(us)", " average(us)" +}; + +static char *latency_type_names[] = { + " inv_iotlb", " inv_devtlb", " inv_iec", + " svm_prq" +}; + +void dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) +{ + struct latency_statistic *lstat = iommu->perf_statistic; + unsigned long flags; + int bytes = 0, i, j; + + memset(str, 0, size); + + for (i = 0; i < COUNTS_NUM; i++) + bytes += scnprintf(str + bytes, size - bytes, + "%s", latency_counter_names[i]); + + spin_lock_irqsave(&latency_lock, flags); + for (i = 0; i < DMAR_LATENCY_NUM; i++) { + if (!dmar_latency_enabled(iommu, i)) + continue; + + bytes += scnprintf(str + bytes, size - bytes, + "\n%s", latency_type_names[i]); + + for (j = 0; j < COUNTS_NUM; j++) { + u64 val = lstat[i].counter[j]; + + switch (j) { + case COUNTS_MIN: + if (val == UINT_MAX) + val = 0; + else + val = div_u64(val, 1000); + break; + case COUNTS_MAX: + val = div_u64(val, 1000); + break; + case COUNTS_SUM: + if (lstat[i].samples) + val = div_u64(val, (lstat[i].samples * 1000)); + else + val = 0; + break; + default: + break; + } + + bytes += scnprintf(str + bytes, size - bytes, + "%12lld", val); + } + } + spin_unlock_irqrestore(&latency_lock, flags); +} diff --git a/drivers/iommu/intel/perf.h b/drivers/iommu/intel/perf.h new file mode 100644 index 000000000000..1d4baad7e852 --- /dev/null +++ b/drivers/iommu/intel/perf.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * perf.h - performance monitor header + * + * Copyright (C) 2021 Intel Corporation + * + * Author: Lu Baolu <baolu.lu@linux.intel.com> + */ + +enum latency_type { + DMAR_LATENCY_INV_IOTLB = 0, + DMAR_LATENCY_INV_DEVTLB, + DMAR_LATENCY_INV_IEC, + DMAR_LATENCY_NUM +}; + +enum latency_count { + COUNTS_10e2 = 0, /* < 0.1us */ + COUNTS_10e3, /* 0.1us ~ 1us */ + COUNTS_10e4, /* 1us ~ 10us */ + COUNTS_10e5, /* 10us ~ 100us */ + COUNTS_10e6, /* 100us ~ 1ms */ + COUNTS_10e7, /* 1ms ~ 10ms */ + COUNTS_10e8_plus, /* 10ms and plus*/ + COUNTS_MIN, + COUNTS_MAX, + COUNTS_SUM, + COUNTS_NUM +}; + +struct latency_statistic { + bool enabled; + u64 counter[COUNTS_NUM]; + u64 samples; +}; + +#ifdef CONFIG_DMAR_PERF +int dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type); +void dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type); +bool dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type); +void dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, + u64 latency); +void dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size); +#else +static inline int +dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type) +{ + return -EINVAL; +} + +static inline void +dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type) +{ +} + +static inline bool +dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type) +{ + return false; +} + +static inline void +dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 latency) +{ +} + +static inline void +dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) +{ +} +#endif /* CONFIG_DMAR_PERF */ diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c new file mode 100644 index 000000000000..75f493bcb353 --- /dev/null +++ b/drivers/iommu/intel/perfmon.c @@ -0,0 +1,790 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Support Intel IOMMU PerfMon + * Copyright(c) 2023 Intel Corporation. + */ +#define pr_fmt(fmt) "DMAR: " fmt +#define dev_fmt(fmt) pr_fmt(fmt) + +#include <linux/dmar.h> +#include "iommu.h" +#include "perfmon.h" + +PMU_FORMAT_ATTR(event, "config:0-27"); /* ES: Events Select */ +PMU_FORMAT_ATTR(event_group, "config:28-31"); /* EGI: Event Group Index */ + +static struct attribute *iommu_pmu_format_attrs[] = { + &format_attr_event_group.attr, + &format_attr_event.attr, + NULL +}; + +static struct attribute_group iommu_pmu_format_attr_group = { + .name = "format", + .attrs = iommu_pmu_format_attrs, +}; + +/* The available events are added in attr_update later */ +static struct attribute *attrs_empty[] = { + NULL +}; + +static struct attribute_group iommu_pmu_events_attr_group = { + .name = "events", + .attrs = attrs_empty, +}; + +static const struct attribute_group *iommu_pmu_attr_groups[] = { + &iommu_pmu_format_attr_group, + &iommu_pmu_events_attr_group, + NULL +}; + +static inline struct iommu_pmu *dev_to_iommu_pmu(struct device *dev) +{ + /* + * The perf_event creates its own dev for each PMU. + * See pmu_dev_alloc() + */ + return container_of(dev_get_drvdata(dev), struct iommu_pmu, pmu); +} + +#define IOMMU_PMU_ATTR(_name, _format, _filter) \ + PMU_FORMAT_ATTR(_name, _format); \ + \ +static struct attribute *_name##_attr[] = { \ + &format_attr_##_name.attr, \ + NULL \ +}; \ + \ +static umode_t \ +_name##_is_visible(struct kobject *kobj, struct attribute *attr, int i) \ +{ \ + struct device *dev = kobj_to_dev(kobj); \ + struct iommu_pmu *iommu_pmu = dev_to_iommu_pmu(dev); \ + \ + if (!iommu_pmu) \ + return 0; \ + return (iommu_pmu->filter & _filter) ? attr->mode : 0; \ +} \ + \ +static struct attribute_group _name = { \ + .name = "format", \ + .attrs = _name##_attr, \ + .is_visible = _name##_is_visible, \ +}; + +IOMMU_PMU_ATTR(filter_requester_id_en, "config1:0", IOMMU_PMU_FILTER_REQUESTER_ID); +IOMMU_PMU_ATTR(filter_domain_en, "config1:1", IOMMU_PMU_FILTER_DOMAIN); +IOMMU_PMU_ATTR(filter_pasid_en, "config1:2", IOMMU_PMU_FILTER_PASID); +IOMMU_PMU_ATTR(filter_ats_en, "config1:3", IOMMU_PMU_FILTER_ATS); +IOMMU_PMU_ATTR(filter_page_table_en, "config1:4", IOMMU_PMU_FILTER_PAGE_TABLE); +IOMMU_PMU_ATTR(filter_requester_id, "config1:16-31", IOMMU_PMU_FILTER_REQUESTER_ID); +IOMMU_PMU_ATTR(filter_domain, "config1:32-47", IOMMU_PMU_FILTER_DOMAIN); +IOMMU_PMU_ATTR(filter_pasid, "config2:0-21", IOMMU_PMU_FILTER_PASID); +IOMMU_PMU_ATTR(filter_ats, "config2:24-28", IOMMU_PMU_FILTER_ATS); +IOMMU_PMU_ATTR(filter_page_table, "config2:32-36", IOMMU_PMU_FILTER_PAGE_TABLE); + +#define iommu_pmu_en_requester_id(e) ((e) & 0x1) +#define iommu_pmu_en_domain(e) (((e) >> 1) & 0x1) +#define iommu_pmu_en_pasid(e) (((e) >> 2) & 0x1) +#define iommu_pmu_en_ats(e) (((e) >> 3) & 0x1) +#define iommu_pmu_en_page_table(e) (((e) >> 4) & 0x1) +#define iommu_pmu_get_requester_id(filter) (((filter) >> 16) & 0xffff) +#define iommu_pmu_get_domain(filter) (((filter) >> 32) & 0xffff) +#define iommu_pmu_get_pasid(filter) ((filter) & 0x3fffff) +#define iommu_pmu_get_ats(filter) (((filter) >> 24) & 0x1f) +#define iommu_pmu_get_page_table(filter) (((filter) >> 32) & 0x1f) + +#define iommu_pmu_set_filter(_name, _config, _filter, _idx, _econfig) \ +{ \ + if ((iommu_pmu->filter & _filter) && iommu_pmu_en_##_name(_econfig)) { \ + dmar_writel(iommu_pmu->cfg_reg + _idx * IOMMU_PMU_CFG_OFFSET + \ + IOMMU_PMU_CFG_SIZE + \ + (ffs(_filter) - 1) * IOMMU_PMU_CFG_FILTERS_OFFSET, \ + iommu_pmu_get_##_name(_config) | IOMMU_PMU_FILTER_EN);\ + } \ +} + +#define iommu_pmu_clear_filter(_filter, _idx) \ +{ \ + if (iommu_pmu->filter & _filter) { \ + dmar_writel(iommu_pmu->cfg_reg + _idx * IOMMU_PMU_CFG_OFFSET + \ + IOMMU_PMU_CFG_SIZE + \ + (ffs(_filter) - 1) * IOMMU_PMU_CFG_FILTERS_OFFSET, \ + 0); \ + } \ +} + +/* + * Define the event attr related functions + * Input: _name: event attr name + * _string: string of the event in sysfs + * _g_idx: event group encoding + * _event: event encoding + */ +#define IOMMU_PMU_EVENT_ATTR(_name, _string, _g_idx, _event) \ + PMU_EVENT_ATTR_STRING(_name, event_attr_##_name, _string) \ + \ +static struct attribute *_name##_attr[] = { \ + &event_attr_##_name.attr.attr, \ + NULL \ +}; \ + \ +static umode_t \ +_name##_is_visible(struct kobject *kobj, struct attribute *attr, int i) \ +{ \ + struct device *dev = kobj_to_dev(kobj); \ + struct iommu_pmu *iommu_pmu = dev_to_iommu_pmu(dev); \ + \ + if (!iommu_pmu) \ + return 0; \ + return (iommu_pmu->evcap[_g_idx] & _event) ? attr->mode : 0; \ +} \ + \ +static struct attribute_group _name = { \ + .name = "events", \ + .attrs = _name##_attr, \ + .is_visible = _name##_is_visible, \ +}; + +IOMMU_PMU_EVENT_ATTR(iommu_clocks, "event_group=0x0,event=0x001", 0x0, 0x001) +IOMMU_PMU_EVENT_ATTR(iommu_requests, "event_group=0x0,event=0x002", 0x0, 0x002) +IOMMU_PMU_EVENT_ATTR(pw_occupancy, "event_group=0x0,event=0x004", 0x0, 0x004) +IOMMU_PMU_EVENT_ATTR(ats_blocked, "event_group=0x0,event=0x008", 0x0, 0x008) +IOMMU_PMU_EVENT_ATTR(iommu_mrds, "event_group=0x1,event=0x001", 0x1, 0x001) +IOMMU_PMU_EVENT_ATTR(iommu_mem_blocked, "event_group=0x1,event=0x020", 0x1, 0x020) +IOMMU_PMU_EVENT_ATTR(pg_req_posted, "event_group=0x1,event=0x040", 0x1, 0x040) +IOMMU_PMU_EVENT_ATTR(ctxt_cache_lookup, "event_group=0x2,event=0x001", 0x2, 0x001) +IOMMU_PMU_EVENT_ATTR(ctxt_cache_hit, "event_group=0x2,event=0x002", 0x2, 0x002) +IOMMU_PMU_EVENT_ATTR(pasid_cache_lookup, "event_group=0x2,event=0x004", 0x2, 0x004) +IOMMU_PMU_EVENT_ATTR(pasid_cache_hit, "event_group=0x2,event=0x008", 0x2, 0x008) +IOMMU_PMU_EVENT_ATTR(ss_nonleaf_lookup, "event_group=0x2,event=0x010", 0x2, 0x010) +IOMMU_PMU_EVENT_ATTR(ss_nonleaf_hit, "event_group=0x2,event=0x020", 0x2, 0x020) +IOMMU_PMU_EVENT_ATTR(fs_nonleaf_lookup, "event_group=0x2,event=0x040", 0x2, 0x040) +IOMMU_PMU_EVENT_ATTR(fs_nonleaf_hit, "event_group=0x2,event=0x080", 0x2, 0x080) +IOMMU_PMU_EVENT_ATTR(hpt_nonleaf_lookup, "event_group=0x2,event=0x100", 0x2, 0x100) +IOMMU_PMU_EVENT_ATTR(hpt_nonleaf_hit, "event_group=0x2,event=0x200", 0x2, 0x200) +IOMMU_PMU_EVENT_ATTR(iotlb_lookup, "event_group=0x3,event=0x001", 0x3, 0x001) +IOMMU_PMU_EVENT_ATTR(iotlb_hit, "event_group=0x3,event=0x002", 0x3, 0x002) +IOMMU_PMU_EVENT_ATTR(hpt_leaf_lookup, "event_group=0x3,event=0x004", 0x3, 0x004) +IOMMU_PMU_EVENT_ATTR(hpt_leaf_hit, "event_group=0x3,event=0x008", 0x3, 0x008) +IOMMU_PMU_EVENT_ATTR(int_cache_lookup, "event_group=0x4,event=0x001", 0x4, 0x001) +IOMMU_PMU_EVENT_ATTR(int_cache_hit_nonposted, "event_group=0x4,event=0x002", 0x4, 0x002) +IOMMU_PMU_EVENT_ATTR(int_cache_hit_posted, "event_group=0x4,event=0x004", 0x4, 0x004) + +static const struct attribute_group *iommu_pmu_attr_update[] = { + &filter_requester_id_en, + &filter_domain_en, + &filter_pasid_en, + &filter_ats_en, + &filter_page_table_en, + &filter_requester_id, + &filter_domain, + &filter_pasid, + &filter_ats, + &filter_page_table, + &iommu_clocks, + &iommu_requests, + &pw_occupancy, + &ats_blocked, + &iommu_mrds, + &iommu_mem_blocked, + &pg_req_posted, + &ctxt_cache_lookup, + &ctxt_cache_hit, + &pasid_cache_lookup, + &pasid_cache_hit, + &ss_nonleaf_lookup, + &ss_nonleaf_hit, + &fs_nonleaf_lookup, + &fs_nonleaf_hit, + &hpt_nonleaf_lookup, + &hpt_nonleaf_hit, + &iotlb_lookup, + &iotlb_hit, + &hpt_leaf_lookup, + &hpt_leaf_hit, + &int_cache_lookup, + &int_cache_hit_nonposted, + &int_cache_hit_posted, + NULL +}; + +static inline void __iomem * +iommu_event_base(struct iommu_pmu *iommu_pmu, int idx) +{ + return iommu_pmu->cntr_reg + idx * iommu_pmu->cntr_stride; +} + +static inline void __iomem * +iommu_config_base(struct iommu_pmu *iommu_pmu, int idx) +{ + return iommu_pmu->cfg_reg + idx * IOMMU_PMU_CFG_OFFSET; +} + +static inline struct iommu_pmu *iommu_event_to_pmu(struct perf_event *event) +{ + return container_of(event->pmu, struct iommu_pmu, pmu); +} + +static inline u64 iommu_event_config(struct perf_event *event) +{ + u64 config = event->attr.config; + + return (iommu_event_select(config) << IOMMU_EVENT_CFG_ES_SHIFT) | + (iommu_event_group(config) << IOMMU_EVENT_CFG_EGI_SHIFT) | + IOMMU_EVENT_CFG_INT; +} + +static inline bool is_iommu_pmu_event(struct iommu_pmu *iommu_pmu, + struct perf_event *event) +{ + return event->pmu == &iommu_pmu->pmu; +} + +static int iommu_pmu_validate_event(struct perf_event *event) +{ + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event); + u32 event_group = iommu_event_group(event->attr.config); + + if (event_group >= iommu_pmu->num_eg) + return -EINVAL; + + return 0; +} + +static int iommu_pmu_validate_group(struct perf_event *event) +{ + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event); + struct perf_event *sibling; + int nr = 0; + + /* + * All events in a group must be scheduled simultaneously. + * Check whether there is enough counters for all the events. + */ + for_each_sibling_event(sibling, event->group_leader) { + if (!is_iommu_pmu_event(iommu_pmu, sibling) || + sibling->state <= PERF_EVENT_STATE_OFF) + continue; + + if (++nr > iommu_pmu->num_cntr) + return -EINVAL; + } + + return 0; +} + +static int iommu_pmu_event_init(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* sampling not supported */ + if (event->attr.sample_period) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + if (iommu_pmu_validate_event(event)) + return -EINVAL; + + hwc->config = iommu_event_config(event); + + return iommu_pmu_validate_group(event); +} + +static void iommu_pmu_event_update(struct perf_event *event) +{ + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event); + struct hw_perf_event *hwc = &event->hw; + u64 prev_count, new_count, delta; + int shift = 64 - iommu_pmu->cntr_width; + +again: + prev_count = local64_read(&hwc->prev_count); + new_count = dmar_readq(iommu_event_base(iommu_pmu, hwc->idx)); + if (local64_xchg(&hwc->prev_count, new_count) != prev_count) + goto again; + + /* + * The counter width is enumerated. Always shift the counter + * before using it. + */ + delta = (new_count << shift) - (prev_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); +} + +static void iommu_pmu_start(struct perf_event *event, int flags) +{ + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event); + struct intel_iommu *iommu = iommu_pmu->iommu; + struct hw_perf_event *hwc = &event->hw; + u64 count; + + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(hwc->idx < 0 || hwc->idx >= IOMMU_PMU_IDX_MAX)) + return; + + if (flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + + hwc->state = 0; + + /* Always reprogram the period */ + count = dmar_readq(iommu_event_base(iommu_pmu, hwc->idx)); + local64_set((&hwc->prev_count), count); + + /* + * The error of ecmd will be ignored. + * - The existing perf_event subsystem doesn't handle the error. + * Only IOMMU PMU returns runtime HW error. We don't want to + * change the existing generic interfaces for the specific case. + * - It's a corner case caused by HW, which is very unlikely to + * happen. There is nothing SW can do. + * - The worst case is that the user will get <not count> with + * perf command, which can give the user some hints. + */ + ecmd_submit_sync(iommu, DMA_ECMD_ENABLE, hwc->idx, 0); + + perf_event_update_userpage(event); +} + +static void iommu_pmu_stop(struct perf_event *event, int flags) +{ + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event); + struct intel_iommu *iommu = iommu_pmu->iommu; + struct hw_perf_event *hwc = &event->hw; + + if (!(hwc->state & PERF_HES_STOPPED)) { + ecmd_submit_sync(iommu, DMA_ECMD_DISABLE, hwc->idx, 0); + + iommu_pmu_event_update(event); + + hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; + } +} + +static inline int +iommu_pmu_validate_per_cntr_event(struct iommu_pmu *iommu_pmu, + int idx, struct perf_event *event) +{ + u32 event_group = iommu_event_group(event->attr.config); + u32 select = iommu_event_select(event->attr.config); + + if (!(iommu_pmu->cntr_evcap[idx][event_group] & select)) + return -EINVAL; + + return 0; +} + +static int iommu_pmu_assign_event(struct iommu_pmu *iommu_pmu, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int idx; + + /* + * The counters which support limited events are usually at the end. + * Schedule them first to accommodate more events. + */ + for (idx = iommu_pmu->num_cntr - 1; idx >= 0; idx--) { + if (test_and_set_bit(idx, iommu_pmu->used_mask)) + continue; + /* Check per-counter event capabilities */ + if (!iommu_pmu_validate_per_cntr_event(iommu_pmu, idx, event)) + break; + clear_bit(idx, iommu_pmu->used_mask); + } + if (idx < 0) + return -EINVAL; + + iommu_pmu->event_list[idx] = event; + hwc->idx = idx; + + /* config events */ + dmar_writeq(iommu_config_base(iommu_pmu, idx), hwc->config); + + iommu_pmu_set_filter(requester_id, event->attr.config1, + IOMMU_PMU_FILTER_REQUESTER_ID, idx, + event->attr.config1); + iommu_pmu_set_filter(domain, event->attr.config1, + IOMMU_PMU_FILTER_DOMAIN, idx, + event->attr.config1); + iommu_pmu_set_filter(pasid, event->attr.config2, + IOMMU_PMU_FILTER_PASID, idx, + event->attr.config1); + iommu_pmu_set_filter(ats, event->attr.config2, + IOMMU_PMU_FILTER_ATS, idx, + event->attr.config1); + iommu_pmu_set_filter(page_table, event->attr.config2, + IOMMU_PMU_FILTER_PAGE_TABLE, idx, + event->attr.config1); + + return 0; +} + +static int iommu_pmu_add(struct perf_event *event, int flags) +{ + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event); + struct hw_perf_event *hwc = &event->hw; + int ret; + + ret = iommu_pmu_assign_event(iommu_pmu, event); + if (ret < 0) + return ret; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (flags & PERF_EF_START) + iommu_pmu_start(event, 0); + + return 0; +} + +static void iommu_pmu_del(struct perf_event *event, int flags) +{ + struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event); + int idx = event->hw.idx; + + iommu_pmu_stop(event, PERF_EF_UPDATE); + + iommu_pmu_clear_filter(IOMMU_PMU_FILTER_REQUESTER_ID, idx); + iommu_pmu_clear_filter(IOMMU_PMU_FILTER_DOMAIN, idx); + iommu_pmu_clear_filter(IOMMU_PMU_FILTER_PASID, idx); + iommu_pmu_clear_filter(IOMMU_PMU_FILTER_ATS, idx); + iommu_pmu_clear_filter(IOMMU_PMU_FILTER_PAGE_TABLE, idx); + + iommu_pmu->event_list[idx] = NULL; + event->hw.idx = -1; + clear_bit(idx, iommu_pmu->used_mask); + + perf_event_update_userpage(event); +} + +static void iommu_pmu_enable(struct pmu *pmu) +{ + struct iommu_pmu *iommu_pmu = container_of(pmu, struct iommu_pmu, pmu); + struct intel_iommu *iommu = iommu_pmu->iommu; + + ecmd_submit_sync(iommu, DMA_ECMD_UNFREEZE, 0, 0); +} + +static void iommu_pmu_disable(struct pmu *pmu) +{ + struct iommu_pmu *iommu_pmu = container_of(pmu, struct iommu_pmu, pmu); + struct intel_iommu *iommu = iommu_pmu->iommu; + + ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, 0); +} + +static void iommu_pmu_counter_overflow(struct iommu_pmu *iommu_pmu) +{ + struct perf_event *event; + u64 status; + int i; + + /* + * Two counters may be overflowed very close. Always check + * whether there are more to handle. + */ + while ((status = dmar_readq(iommu_pmu->overflow))) { + for_each_set_bit(i, (unsigned long *)&status, iommu_pmu->num_cntr) { + /* + * Find the assigned event of the counter. + * Accumulate the value into the event->count. + */ + event = iommu_pmu->event_list[i]; + if (!event) { + pr_warn_once("Cannot find the assigned event for counter %d\n", i); + continue; + } + iommu_pmu_event_update(event); + } + + dmar_writeq(iommu_pmu->overflow, status); + } +} + +static irqreturn_t iommu_pmu_irq_handler(int irq, void *dev_id) +{ + struct intel_iommu *iommu = dev_id; + + if (!dmar_readl(iommu->reg + DMAR_PERFINTRSTS_REG)) + return IRQ_NONE; + + iommu_pmu_counter_overflow(iommu->pmu); + + /* Clear the status bit */ + dmar_writel(iommu->reg + DMAR_PERFINTRSTS_REG, DMA_PERFINTRSTS_PIS); + + return IRQ_HANDLED; +} + +static int __iommu_pmu_register(struct intel_iommu *iommu) +{ + struct iommu_pmu *iommu_pmu = iommu->pmu; + + iommu_pmu->pmu.name = iommu->name; + iommu_pmu->pmu.task_ctx_nr = perf_invalid_context; + iommu_pmu->pmu.event_init = iommu_pmu_event_init; + iommu_pmu->pmu.pmu_enable = iommu_pmu_enable; + iommu_pmu->pmu.pmu_disable = iommu_pmu_disable; + iommu_pmu->pmu.add = iommu_pmu_add; + iommu_pmu->pmu.del = iommu_pmu_del; + iommu_pmu->pmu.start = iommu_pmu_start; + iommu_pmu->pmu.stop = iommu_pmu_stop; + iommu_pmu->pmu.read = iommu_pmu_event_update; + iommu_pmu->pmu.attr_groups = iommu_pmu_attr_groups; + iommu_pmu->pmu.attr_update = iommu_pmu_attr_update; + iommu_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; + iommu_pmu->pmu.scope = PERF_PMU_SCOPE_SYS_WIDE; + iommu_pmu->pmu.module = THIS_MODULE; + + return perf_pmu_register(&iommu_pmu->pmu, iommu_pmu->pmu.name, -1); +} + +static inline void __iomem * +get_perf_reg_address(struct intel_iommu *iommu, u32 offset) +{ + u32 off = dmar_readl(iommu->reg + offset); + + return iommu->reg + off; +} + +int alloc_iommu_pmu(struct intel_iommu *iommu) +{ + struct iommu_pmu *iommu_pmu; + int i, j, ret; + u64 perfcap; + u32 cap; + + if (!ecap_pms(iommu->ecap)) + return 0; + + /* The IOMMU PMU requires the ECMD support as well */ + if (!cap_ecmds(iommu->cap)) + return -ENODEV; + + perfcap = dmar_readq(iommu->reg + DMAR_PERFCAP_REG); + /* The performance monitoring is not supported. */ + if (!perfcap) + return -ENODEV; + + /* Sanity check for the number of the counters and event groups */ + if (!pcap_num_cntr(perfcap) || !pcap_num_event_group(perfcap)) + return -ENODEV; + + /* The interrupt on overflow is required */ + if (!pcap_interrupt(perfcap)) + return -ENODEV; + + /* Check required Enhanced Command Capability */ + if (!ecmd_has_pmu_essential(iommu)) + return -ENODEV; + + iommu_pmu = kzalloc(sizeof(*iommu_pmu), GFP_KERNEL); + if (!iommu_pmu) + return -ENOMEM; + + iommu_pmu->num_cntr = pcap_num_cntr(perfcap); + if (iommu_pmu->num_cntr > IOMMU_PMU_IDX_MAX) { + pr_warn_once("The number of IOMMU counters %d > max(%d), clipping!", + iommu_pmu->num_cntr, IOMMU_PMU_IDX_MAX); + iommu_pmu->num_cntr = IOMMU_PMU_IDX_MAX; + } + + iommu_pmu->cntr_width = pcap_cntr_width(perfcap); + iommu_pmu->filter = pcap_filters_mask(perfcap); + iommu_pmu->cntr_stride = pcap_cntr_stride(perfcap); + iommu_pmu->num_eg = pcap_num_event_group(perfcap); + + iommu_pmu->evcap = kcalloc(iommu_pmu->num_eg, sizeof(u64), GFP_KERNEL); + if (!iommu_pmu->evcap) { + ret = -ENOMEM; + goto free_pmu; + } + + /* Parse event group capabilities */ + for (i = 0; i < iommu_pmu->num_eg; i++) { + u64 pcap; + + pcap = dmar_readq(iommu->reg + DMAR_PERFEVNTCAP_REG + + i * IOMMU_PMU_CAP_REGS_STEP); + iommu_pmu->evcap[i] = pecap_es(pcap); + } + + iommu_pmu->cntr_evcap = kcalloc(iommu_pmu->num_cntr, sizeof(u32 *), GFP_KERNEL); + if (!iommu_pmu->cntr_evcap) { + ret = -ENOMEM; + goto free_pmu_evcap; + } + for (i = 0; i < iommu_pmu->num_cntr; i++) { + iommu_pmu->cntr_evcap[i] = kcalloc(iommu_pmu->num_eg, sizeof(u32), GFP_KERNEL); + if (!iommu_pmu->cntr_evcap[i]) { + ret = -ENOMEM; + goto free_pmu_cntr_evcap; + } + /* + * Set to the global capabilities, will adjust according + * to per-counter capabilities later. + */ + for (j = 0; j < iommu_pmu->num_eg; j++) + iommu_pmu->cntr_evcap[i][j] = (u32)iommu_pmu->evcap[j]; + } + + iommu_pmu->cfg_reg = get_perf_reg_address(iommu, DMAR_PERFCFGOFF_REG); + iommu_pmu->cntr_reg = get_perf_reg_address(iommu, DMAR_PERFCNTROFF_REG); + iommu_pmu->overflow = get_perf_reg_address(iommu, DMAR_PERFOVFOFF_REG); + + /* + * Check per-counter capabilities. All counters should have the + * same capabilities on Interrupt on Overflow Support and Counter + * Width. + */ + for (i = 0; i < iommu_pmu->num_cntr; i++) { + cap = dmar_readl(iommu_pmu->cfg_reg + + i * IOMMU_PMU_CFG_OFFSET + + IOMMU_PMU_CFG_CNTRCAP_OFFSET); + if (!iommu_cntrcap_pcc(cap)) + continue; + + /* + * It's possible that some counters have a different + * capability because of e.g., HW bug. Check the corner + * case here and simply drop those counters. + */ + if ((iommu_cntrcap_cw(cap) != iommu_pmu->cntr_width) || + !iommu_cntrcap_ios(cap)) { + iommu_pmu->num_cntr = i; + pr_warn("PMU counter capability inconsistent, counter number reduced to %d\n", + iommu_pmu->num_cntr); + } + + /* Clear the pre-defined events group */ + for (j = 0; j < iommu_pmu->num_eg; j++) + iommu_pmu->cntr_evcap[i][j] = 0; + + /* Override with per-counter event capabilities */ + for (j = 0; j < iommu_cntrcap_egcnt(cap); j++) { + cap = dmar_readl(iommu_pmu->cfg_reg + i * IOMMU_PMU_CFG_OFFSET + + IOMMU_PMU_CFG_CNTREVCAP_OFFSET + + (j * IOMMU_PMU_OFF_REGS_STEP)); + iommu_pmu->cntr_evcap[i][iommu_event_group(cap)] = iommu_event_select(cap); + /* + * Some events may only be supported by a specific counter. + * Track them in the evcap as well. + */ + iommu_pmu->evcap[iommu_event_group(cap)] |= iommu_event_select(cap); + } + } + + iommu_pmu->iommu = iommu; + iommu->pmu = iommu_pmu; + + return 0; + +free_pmu_cntr_evcap: + for (i = 0; i < iommu_pmu->num_cntr; i++) + kfree(iommu_pmu->cntr_evcap[i]); + kfree(iommu_pmu->cntr_evcap); +free_pmu_evcap: + kfree(iommu_pmu->evcap); +free_pmu: + kfree(iommu_pmu); + + return ret; +} + +void free_iommu_pmu(struct intel_iommu *iommu) +{ + struct iommu_pmu *iommu_pmu = iommu->pmu; + + if (!iommu_pmu) + return; + + if (iommu_pmu->evcap) { + int i; + + for (i = 0; i < iommu_pmu->num_cntr; i++) + kfree(iommu_pmu->cntr_evcap[i]); + kfree(iommu_pmu->cntr_evcap); + } + kfree(iommu_pmu->evcap); + kfree(iommu_pmu); + iommu->pmu = NULL; +} + +static int iommu_pmu_set_interrupt(struct intel_iommu *iommu) +{ + struct iommu_pmu *iommu_pmu = iommu->pmu; + int irq, ret; + + irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PERF + iommu->seq_id, iommu->node, iommu); + if (irq <= 0) + return -EINVAL; + + snprintf(iommu_pmu->irq_name, sizeof(iommu_pmu->irq_name), "dmar%d-perf", iommu->seq_id); + + iommu->perf_irq = irq; + ret = request_threaded_irq(irq, NULL, iommu_pmu_irq_handler, + IRQF_ONESHOT, iommu_pmu->irq_name, iommu); + if (ret) { + dmar_free_hwirq(irq); + iommu->perf_irq = 0; + return ret; + } + return 0; +} + +static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu) +{ + if (!iommu->perf_irq) + return; + + free_irq(iommu->perf_irq, iommu); + dmar_free_hwirq(iommu->perf_irq); + iommu->perf_irq = 0; +} + +void iommu_pmu_register(struct intel_iommu *iommu) +{ + struct iommu_pmu *iommu_pmu = iommu->pmu; + + if (!iommu_pmu) + return; + + if (__iommu_pmu_register(iommu)) + goto err; + + /* Set interrupt for overflow */ + if (iommu_pmu_set_interrupt(iommu)) + goto unregister; + + return; + +unregister: + perf_pmu_unregister(&iommu_pmu->pmu); +err: + pr_err("Failed to register PMU for iommu (seq_id = %d)\n", iommu->seq_id); + free_iommu_pmu(iommu); +} + +void iommu_pmu_unregister(struct intel_iommu *iommu) +{ + struct iommu_pmu *iommu_pmu = iommu->pmu; + + if (!iommu_pmu) + return; + + iommu_pmu_unset_interrupt(iommu); + perf_pmu_unregister(&iommu_pmu->pmu); +} diff --git a/drivers/iommu/intel/perfmon.h b/drivers/iommu/intel/perfmon.h new file mode 100644 index 000000000000..58606af9a2b9 --- /dev/null +++ b/drivers/iommu/intel/perfmon.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * PERFCFGOFF_REG, PERFFRZOFF_REG + * PERFOVFOFF_REG, PERFCNTROFF_REG + */ +#define IOMMU_PMU_NUM_OFF_REGS 4 +#define IOMMU_PMU_OFF_REGS_STEP 4 + +#define IOMMU_PMU_FILTER_REQUESTER_ID 0x01 +#define IOMMU_PMU_FILTER_DOMAIN 0x02 +#define IOMMU_PMU_FILTER_PASID 0x04 +#define IOMMU_PMU_FILTER_ATS 0x08 +#define IOMMU_PMU_FILTER_PAGE_TABLE 0x10 + +#define IOMMU_PMU_FILTER_EN BIT(31) + +#define IOMMU_PMU_CFG_OFFSET 0x100 +#define IOMMU_PMU_CFG_CNTRCAP_OFFSET 0x80 +#define IOMMU_PMU_CFG_CNTREVCAP_OFFSET 0x84 +#define IOMMU_PMU_CFG_SIZE 0x8 +#define IOMMU_PMU_CFG_FILTERS_OFFSET 0x4 + +#define IOMMU_PMU_CAP_REGS_STEP 8 + +#define iommu_cntrcap_pcc(p) ((p) & 0x1) +#define iommu_cntrcap_cw(p) (((p) >> 8) & 0xff) +#define iommu_cntrcap_ios(p) (((p) >> 16) & 0x1) +#define iommu_cntrcap_egcnt(p) (((p) >> 28) & 0xf) + +#define IOMMU_EVENT_CFG_EGI_SHIFT 8 +#define IOMMU_EVENT_CFG_ES_SHIFT 32 +#define IOMMU_EVENT_CFG_INT BIT_ULL(1) + +#define iommu_event_select(p) ((p) & 0xfffffff) +#define iommu_event_group(p) (((p) >> 28) & 0xf) + +#ifdef CONFIG_INTEL_IOMMU_PERF_EVENTS +int alloc_iommu_pmu(struct intel_iommu *iommu); +void free_iommu_pmu(struct intel_iommu *iommu); +void iommu_pmu_register(struct intel_iommu *iommu); +void iommu_pmu_unregister(struct intel_iommu *iommu); +#else +static inline int +alloc_iommu_pmu(struct intel_iommu *iommu) +{ + return 0; +} + +static inline void +free_iommu_pmu(struct intel_iommu *iommu) +{ +} + +static inline void +iommu_pmu_register(struct intel_iommu *iommu) +{ +} + +static inline void +iommu_pmu_unregister(struct intel_iommu *iommu) +{ +} +#endif /* CONFIG_INTEL_IOMMU_PERF_EVENTS */ diff --git a/drivers/iommu/intel/prq.c b/drivers/iommu/intel/prq.c new file mode 100644 index 000000000000..ff63c228e6e1 --- /dev/null +++ b/drivers/iommu/intel/prq.c @@ -0,0 +1,396 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 Intel Corporation + * + * Originally split from drivers/iommu/intel/svm.c + */ + +#include <linux/pci.h> +#include <linux/pci-ats.h> + +#include "iommu.h" +#include "pasid.h" +#include "../iommu-pages.h" +#include "trace.h" + +/* Page request queue descriptor */ +struct page_req_dsc { + union { + struct { + u64 type:8; + u64 pasid_present:1; + u64 rsvd:7; + u64 rid:16; + u64 pasid:20; + u64 exe_req:1; + u64 pm_req:1; + u64 rsvd2:10; + }; + u64 qw_0; + }; + union { + struct { + u64 rd_req:1; + u64 wr_req:1; + u64 lpig:1; + u64 prg_index:9; + u64 addr:52; + }; + u64 qw_1; + }; + u64 qw_2; + u64 qw_3; +}; + +/** + * intel_iommu_drain_pasid_prq - Drain page requests and responses for a pasid + * @dev: target device + * @pasid: pasid for draining + * + * Drain all pending page requests and responses related to @pasid in both + * software and hardware. This is supposed to be called after the device + * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB + * and DevTLB have been invalidated. + * + * It waits until all pending page requests for @pasid in the page fault + * queue are completed by the prq handling thread. Then follow the steps + * described in VT-d spec CH7.10 to drain all page requests and page + * responses pending in the hardware. + */ +void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid) +{ + struct device_domain_info *info; + struct dmar_domain *domain; + struct intel_iommu *iommu; + struct qi_desc desc[3]; + int head, tail; + u16 sid, did; + + info = dev_iommu_priv_get(dev); + if (!info->iopf_refcount) + return; + + iommu = info->iommu; + domain = info->domain; + sid = PCI_DEVID(info->bus, info->devfn); + did = domain ? domain_id_iommu(domain, iommu) : FLPT_DEFAULT_DID; + + /* + * Check and wait until all pending page requests in the queue are + * handled by the prq handling thread. + */ +prq_retry: + reinit_completion(&iommu->prq_complete); + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + while (head != tail) { + struct page_req_dsc *req; + + req = &iommu->prq[head / sizeof(*req)]; + if (req->rid != sid || + (req->pasid_present && pasid != req->pasid) || + (!req->pasid_present && pasid != IOMMU_NO_PASID)) { + head = (head + sizeof(*req)) & PRQ_RING_MASK; + continue; + } + + wait_for_completion(&iommu->prq_complete); + goto prq_retry; + } + + iopf_queue_flush_dev(dev); + + /* + * Perform steps described in VT-d spec CH7.10 to drain page + * requests and responses in hardware. + */ + memset(desc, 0, sizeof(desc)); + desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) | + QI_IWD_FENCE | + QI_IWD_TYPE; + if (pasid == IOMMU_NO_PASID) { + qi_desc_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH, &desc[1]); + qi_desc_dev_iotlb(sid, info->pfsid, info->ats_qdep, 0, + MAX_AGAW_PFN_WIDTH, &desc[2]); + } else { + qi_desc_piotlb(did, pasid, 0, -1, 0, &desc[1]); + qi_desc_dev_iotlb_pasid(sid, info->pfsid, pasid, info->ats_qdep, + 0, MAX_AGAW_PFN_WIDTH, &desc[2]); + } +qi_retry: + reinit_completion(&iommu->prq_complete); + qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN); + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { + wait_for_completion(&iommu->prq_complete); + goto qi_retry; + } +} + +static bool is_canonical_address(u64 addr) +{ + int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); + long saddr = (long)addr; + + return (((saddr << shift) >> shift) == saddr); +} + +static void handle_bad_prq_event(struct intel_iommu *iommu, + struct page_req_dsc *req, int result) +{ + struct qi_desc desc = { }; + + pr_err("%s: Invalid page request: %08llx %08llx\n", + iommu->name, ((unsigned long long *)req)[0], + ((unsigned long long *)req)[1]); + + if (!req->lpig) + return; + + desc.qw0 = QI_PGRP_PASID(req->pasid) | + QI_PGRP_DID(req->rid) | + QI_PGRP_PASID_P(req->pasid_present) | + QI_PGRP_RESP_CODE(result) | + QI_PGRP_RESP_TYPE; + desc.qw1 = QI_PGRP_IDX(req->prg_index); + + qi_submit_sync(iommu, &desc, 1, 0); +} + +static int prq_to_iommu_prot(struct page_req_dsc *req) +{ + int prot = 0; + + if (req->rd_req) + prot |= IOMMU_FAULT_PERM_READ; + if (req->wr_req) + prot |= IOMMU_FAULT_PERM_WRITE; + if (req->exe_req) + prot |= IOMMU_FAULT_PERM_EXEC; + if (req->pm_req) + prot |= IOMMU_FAULT_PERM_PRIV; + + return prot; +} + +static void intel_prq_report(struct intel_iommu *iommu, struct device *dev, + struct page_req_dsc *desc) +{ + struct iopf_fault event = { }; + + /* Fill in event data for device specific processing */ + event.fault.type = IOMMU_FAULT_PAGE_REQ; + event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT; + event.fault.prm.pasid = desc->pasid; + event.fault.prm.grpid = desc->prg_index; + event.fault.prm.perm = prq_to_iommu_prot(desc); + + if (desc->lpig) + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; + if (desc->pasid_present) { + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; + } + + iommu_report_device_fault(dev, &event); +} + +static irqreturn_t prq_event_thread(int irq, void *d) +{ + struct intel_iommu *iommu = d; + struct page_req_dsc *req; + int head, tail, handled; + struct device *dev; + u64 address; + + /* + * Clear PPR bit before reading head/tail registers, to ensure that + * we get a new interrupt if needed. + */ + writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG); + + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + handled = (head != tail); + while (head != tail) { + req = &iommu->prq[head / sizeof(*req)]; + address = (u64)req->addr << VTD_PAGE_SHIFT; + + if (unlikely(!is_canonical_address(address))) { + pr_err("IOMMU: %s: Address is not canonical\n", + iommu->name); +bad_req: + handle_bad_prq_event(iommu, req, QI_RESP_INVALID); + goto prq_advance; + } + + if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) { + pr_err("IOMMU: %s: Page request in Privilege Mode\n", + iommu->name); + goto bad_req; + } + + if (unlikely(req->exe_req && req->rd_req)) { + pr_err("IOMMU: %s: Execution request not supported\n", + iommu->name); + goto bad_req; + } + + /* Drop Stop Marker message. No need for a response. */ + if (unlikely(req->lpig && !req->rd_req && !req->wr_req)) + goto prq_advance; + + /* + * If prq is to be handled outside iommu driver via receiver of + * the fault notifiers, we skip the page response here. + */ + mutex_lock(&iommu->iopf_lock); + dev = device_rbtree_find(iommu, req->rid); + if (!dev) { + mutex_unlock(&iommu->iopf_lock); + goto bad_req; + } + + intel_prq_report(iommu, dev, req); + trace_prq_report(iommu, dev, req->qw_0, req->qw_1, + req->qw_2, req->qw_3, + iommu->prq_seq_number++); + mutex_unlock(&iommu->iopf_lock); +prq_advance: + head = (head + sizeof(*req)) & PRQ_RING_MASK; + } + + dmar_writeq(iommu->reg + DMAR_PQH_REG, tail); + + /* + * Clear the page request overflow bit and wake up all threads that + * are waiting for the completion of this handling. + */ + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { + pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n", + iommu->name); + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + if (head == tail) { + iopf_queue_discard_partial(iommu->iopf_queue); + writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); + pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared", + iommu->name); + } + } + + if (!completion_done(&iommu->prq_complete)) + complete(&iommu->prq_complete); + + return IRQ_RETVAL(handled); +} + +int intel_iommu_enable_prq(struct intel_iommu *iommu) +{ + struct iopf_queue *iopfq; + int irq, ret; + + iommu->prq = + iommu_alloc_pages_node_sz(iommu->node, GFP_KERNEL, PRQ_SIZE); + if (!iommu->prq) { + pr_warn("IOMMU: %s: Failed to allocate page request queue\n", + iommu->name); + return -ENOMEM; + } + + irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu); + if (irq <= 0) { + pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n", + iommu->name); + ret = -EINVAL; + goto free_prq; + } + iommu->pr_irq = irq; + + snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name), + "dmar%d-iopfq", iommu->seq_id); + iopfq = iopf_queue_alloc(iommu->iopfq_name); + if (!iopfq) { + pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name); + ret = -ENOMEM; + goto free_hwirq; + } + iommu->iopf_queue = iopfq; + + snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id); + + ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT, + iommu->prq_name, iommu); + if (ret) { + pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n", + iommu->name); + goto free_iopfq; + } + dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); + + init_completion(&iommu->prq_complete); + + return 0; + +free_iopfq: + iopf_queue_free(iommu->iopf_queue); + iommu->iopf_queue = NULL; +free_hwirq: + dmar_free_hwirq(irq); + iommu->pr_irq = 0; +free_prq: + iommu_free_pages(iommu->prq); + iommu->prq = NULL; + + return ret; +} + +int intel_iommu_finish_prq(struct intel_iommu *iommu) +{ + dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL); + + if (iommu->pr_irq) { + free_irq(iommu->pr_irq, iommu); + dmar_free_hwirq(iommu->pr_irq); + iommu->pr_irq = 0; + } + + if (iommu->iopf_queue) { + iopf_queue_free(iommu->iopf_queue); + iommu->iopf_queue = NULL; + } + + iommu_free_pages(iommu->prq); + iommu->prq = NULL; + + return 0; +} + +void intel_iommu_page_response(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + u8 bus = info->bus, devfn = info->devfn; + struct iommu_fault_page_request *prm; + struct qi_desc desc; + bool pasid_present; + u16 sid; + + prm = &evt->fault.prm; + sid = PCI_DEVID(bus, devfn); + pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + + desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) | + QI_PGRP_PASID_P(pasid_present) | + QI_PGRP_RESP_CODE(msg->code) | + QI_PGRP_RESP_TYPE; + desc.qw1 = QI_PGRP_IDX(prm->grpid); + desc.qw2 = 0; + desc.qw3 = 0; + + qi_submit_sync(iommu, &desc, 1, 0); +} diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c new file mode 100644 index 000000000000..71de7947971f --- /dev/null +++ b/drivers/iommu/intel/svm.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright © 2015 Intel Corporation. + * + * Authors: David Woodhouse <dwmw2@infradead.org> + */ + +#include <linux/mmu_notifier.h> +#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/slab.h> +#include <linux/rculist.h> +#include <linux/pci.h> +#include <linux/pci-ats.h> +#include <linux/dmar.h> +#include <linux/interrupt.h> +#include <linux/mm_types.h> +#include <linux/xarray.h> +#include <asm/page.h> +#include <asm/fpu/api.h> + +#include "iommu.h" +#include "pasid.h" +#include "perf.h" +#include "../iommu-pages.h" +#include "trace.h" + +void intel_svm_check(struct intel_iommu *iommu) +{ + if (!pasid_supported(iommu)) + return; + + if (cpu_feature_enabled(X86_FEATURE_GBPAGES) && + !cap_fl1gp_support(iommu->cap)) { + pr_err("%s SVM disabled, incompatible 1GB page capability\n", + iommu->name); + return; + } + + if (cpu_feature_enabled(X86_FEATURE_LA57) && + !cap_fl5lp_support(iommu->cap)) { + pr_err("%s SVM disabled, incompatible paging mode\n", + iommu->name); + return; + } + + iommu->flags |= VTD_FLAG_SVM_CAPABLE; +} + +/* Pages have been freed at this point */ +static void intel_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct dmar_domain *domain = container_of(mn, struct dmar_domain, notifier); + + if (start == 0 && end == ULONG_MAX) { + cache_tag_flush_all(domain); + return; + } + + /* + * The mm_types defines vm_end as the first byte after the end address, + * different from IOMMU subsystem using the last address of an address + * range. + */ + cache_tag_flush_range(domain, start, end - 1, 0); +} + +static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) +{ + struct dmar_domain *domain = container_of(mn, struct dmar_domain, notifier); + struct dev_pasid_info *dev_pasid; + struct device_domain_info *info; + unsigned long flags; + + /* This might end up being called from exit_mmap(), *before* the page + * tables are cleared. And __mmu_notifier_release() will delete us from + * the list of notifiers so that our invalidate_range() callback doesn't + * get called when the page tables are cleared. So we need to protect + * against hardware accessing those page tables. + * + * We do it by clearing the entry in the PASID table and then flushing + * the IOTLB and the PASID table caches. This might upset hardware; + * perhaps we'll want to point the PASID to a dummy PGD (like the zero + * page) so that we end up taking a fault that the hardware really + * *has* to handle gracefully without affecting other processes. + */ + spin_lock_irqsave(&domain->lock, flags); + list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { + info = dev_iommu_priv_get(dev_pasid->dev); + intel_pasid_tear_down_entry(info->iommu, dev_pasid->dev, + dev_pasid->pasid, true); + } + spin_unlock_irqrestore(&domain->lock, flags); + +} + +static void intel_mm_free_notifier(struct mmu_notifier *mn) +{ + struct dmar_domain *domain = container_of(mn, struct dmar_domain, notifier); + + kfree(domain->qi_batch); + kfree(domain); +} + +static const struct mmu_notifier_ops intel_mmuops = { + .release = intel_mm_release, + .arch_invalidate_secondary_tlbs = intel_arch_invalidate_secondary_tlbs, + .free_notifier = intel_mm_free_notifier, +}; + +static int intel_iommu_sva_supported(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu; + + if (!info || dmar_disabled) + return -EINVAL; + + iommu = info->iommu; + if (!iommu) + return -EINVAL; + + if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) + return -ENODEV; + + if (!info->pasid_enabled || !info->ats_enabled) + return -EINVAL; + + /* + * Devices having device-specific I/O fault handling should not + * support PCI/PRI. The IOMMU side has no means to check the + * capability of device-specific IOPF. Therefore, IOMMU can only + * default that if the device driver enables SVA on a non-PRI + * device, it will handle IOPF in its own way. + */ + if (!info->pri_supported) + return 0; + + /* Devices supporting PRI should have it enabled. */ + if (!info->pri_enabled) + return -EINVAL; + + return 0; +} + +static int intel_svm_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct mm_struct *mm = domain->mm; + struct dev_pasid_info *dev_pasid; + unsigned long sflags; + int ret = 0; + + ret = intel_iommu_sva_supported(dev); + if (ret) + return ret; + + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); + if (IS_ERR(dev_pasid)) + return PTR_ERR(dev_pasid); + + ret = iopf_for_domain_replace(domain, old, dev); + if (ret) + goto out_remove_dev_pasid; + + /* Setup the pasid table: */ + sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; + sflags |= PASID_FLAG_PWSNP; + ret = __domain_setup_first_level(iommu, dev, pasid, + FLPT_DEFAULT_DID, __pa(mm->pgd), + sflags, old); + if (ret) + goto out_unwind_iopf; + + domain_remove_dev_pasid(old, dev, pasid); + + return 0; +out_unwind_iopf: + iopf_for_domain_replace(old, domain, dev); +out_remove_dev_pasid: + domain_remove_dev_pasid(domain, dev, pasid); + return ret; +} + +static void intel_svm_domain_free(struct iommu_domain *domain) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + + /* dmar_domain free is deferred to the mmu free_notifier callback. */ + mmu_notifier_put(&dmar_domain->notifier); +} + +static const struct iommu_domain_ops intel_svm_domain_ops = { + .set_dev_pasid = intel_svm_set_dev_pasid, + .free = intel_svm_domain_free +}; + +struct iommu_domain *intel_svm_domain_alloc(struct device *dev, + struct mm_struct *mm) +{ + struct dmar_domain *domain; + int ret; + + ret = intel_iommu_sva_supported(dev); + if (ret) + return ERR_PTR(ret); + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return ERR_PTR(-ENOMEM); + + domain->domain.ops = &intel_svm_domain_ops; + INIT_LIST_HEAD(&domain->dev_pasids); + INIT_LIST_HEAD(&domain->cache_tags); + spin_lock_init(&domain->cache_lock); + spin_lock_init(&domain->lock); + + domain->notifier.ops = &intel_mmuops; + ret = mmu_notifier_register(&domain->notifier, mm); + if (ret) { + kfree(domain); + return ERR_PTR(ret); + } + + return &domain->domain; +} diff --git a/drivers/iommu/intel/trace.c b/drivers/iommu/intel/trace.c new file mode 100644 index 000000000000..117e626e3ea9 --- /dev/null +++ b/drivers/iommu/intel/trace.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel IOMMU trace support + * + * Copyright (C) 2019 Intel Corporation + * + * Author: Lu Baolu <baolu.lu@linux.intel.com> + */ + +#include <linux/string.h> +#include <linux/types.h> + +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/drivers/iommu/intel/trace.h b/drivers/iommu/intel/trace.h new file mode 100644 index 000000000000..6311ba3f1691 --- /dev/null +++ b/drivers/iommu/intel/trace.h @@ -0,0 +1,191 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Intel IOMMU trace support + * + * Copyright (C) 2019 Intel Corporation + * + * Author: Lu Baolu <baolu.lu@linux.intel.com> + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM intel_iommu + +#if !defined(_TRACE_INTEL_IOMMU_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_INTEL_IOMMU_H + +#include <linux/tracepoint.h> + +#include "iommu.h" + +#define MSG_MAX 256 + +TRACE_EVENT(qi_submit, + TP_PROTO(struct intel_iommu *iommu, u64 qw0, u64 qw1, u64 qw2, u64 qw3), + + TP_ARGS(iommu, qw0, qw1, qw2, qw3), + + TP_STRUCT__entry( + __field(u64, qw0) + __field(u64, qw1) + __field(u64, qw2) + __field(u64, qw3) + __string(iommu, iommu->name) + ), + + TP_fast_assign( + __assign_str(iommu); + __entry->qw0 = qw0; + __entry->qw1 = qw1; + __entry->qw2 = qw2; + __entry->qw3 = qw3; + ), + + TP_printk("%s %s: 0x%llx 0x%llx 0x%llx 0x%llx", + __print_symbolic(__entry->qw0 & 0xf, + { QI_CC_TYPE, "cc_inv" }, + { QI_IOTLB_TYPE, "iotlb_inv" }, + { QI_DIOTLB_TYPE, "dev_tlb_inv" }, + { QI_IEC_TYPE, "iec_inv" }, + { QI_IWD_TYPE, "inv_wait" }, + { QI_EIOTLB_TYPE, "p_iotlb_inv" }, + { QI_PC_TYPE, "pc_inv" }, + { QI_DEIOTLB_TYPE, "p_dev_tlb_inv" }, + { QI_PGRP_RESP_TYPE, "page_grp_resp" }), + __get_str(iommu), + __entry->qw0, __entry->qw1, __entry->qw2, __entry->qw3 + ) +); + +TRACE_EVENT(prq_report, + TP_PROTO(struct intel_iommu *iommu, struct device *dev, + u64 dw0, u64 dw1, u64 dw2, u64 dw3, + unsigned long seq), + + TP_ARGS(iommu, dev, dw0, dw1, dw2, dw3, seq), + + TP_STRUCT__entry( + __field(u64, dw0) + __field(u64, dw1) + __field(u64, dw2) + __field(u64, dw3) + __field(unsigned long, seq) + __string(iommu, iommu->name) + __string(dev, dev_name(dev)) + __dynamic_array(char, buff, MSG_MAX) + ), + + TP_fast_assign( + __entry->dw0 = dw0; + __entry->dw1 = dw1; + __entry->dw2 = dw2; + __entry->dw3 = dw3; + __entry->seq = seq; + __assign_str(iommu); + __assign_str(dev); + ), + + TP_printk("%s/%s seq# %ld: %s", + __get_str(iommu), __get_str(dev), __entry->seq, + decode_prq_descriptor(__get_str(buff), MSG_MAX, __entry->dw0, + __entry->dw1, __entry->dw2, __entry->dw3) + ) +); + +DECLARE_EVENT_CLASS(cache_tag_log, + TP_PROTO(struct cache_tag *tag), + TP_ARGS(tag), + TP_STRUCT__entry( + __string(iommu, tag->iommu->name) + __string(dev, dev_name(tag->dev)) + __field(u16, type) + __field(u16, domain_id) + __field(u32, pasid) + __field(u32, users) + ), + TP_fast_assign( + __assign_str(iommu); + __assign_str(dev); + __entry->type = tag->type; + __entry->domain_id = tag->domain_id; + __entry->pasid = tag->pasid; + __entry->users = tag->users; + ), + TP_printk("%s/%s type %s did %d pasid %d ref %d", + __get_str(iommu), __get_str(dev), + __print_symbolic(__entry->type, + { CACHE_TAG_IOTLB, "iotlb" }, + { CACHE_TAG_DEVTLB, "devtlb" }, + { CACHE_TAG_NESTING_IOTLB, "nesting_iotlb" }, + { CACHE_TAG_NESTING_DEVTLB, "nesting_devtlb" }), + __entry->domain_id, __entry->pasid, __entry->users + ) +); + +DEFINE_EVENT(cache_tag_log, cache_tag_assign, + TP_PROTO(struct cache_tag *tag), + TP_ARGS(tag) +); + +DEFINE_EVENT(cache_tag_log, cache_tag_unassign, + TP_PROTO(struct cache_tag *tag), + TP_ARGS(tag) +); + +DECLARE_EVENT_CLASS(cache_tag_flush, + TP_PROTO(struct cache_tag *tag, unsigned long start, unsigned long end, + unsigned long addr, unsigned long pages, unsigned long mask), + TP_ARGS(tag, start, end, addr, pages, mask), + TP_STRUCT__entry( + __string(iommu, tag->iommu->name) + __string(dev, dev_name(tag->dev)) + __field(u16, type) + __field(u16, domain_id) + __field(u32, pasid) + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned long, addr) + __field(unsigned long, pages) + __field(unsigned long, mask) + ), + TP_fast_assign( + __assign_str(iommu); + __assign_str(dev); + __entry->type = tag->type; + __entry->domain_id = tag->domain_id; + __entry->pasid = tag->pasid; + __entry->start = start; + __entry->end = end; + __entry->addr = addr; + __entry->pages = pages; + __entry->mask = mask; + ), + TP_printk("%s %s[%d] type %s did %d [0x%lx-0x%lx] addr 0x%lx pages 0x%lx mask 0x%lx", + __get_str(iommu), __get_str(dev), __entry->pasid, + __print_symbolic(__entry->type, + { CACHE_TAG_IOTLB, "iotlb" }, + { CACHE_TAG_DEVTLB, "devtlb" }, + { CACHE_TAG_NESTING_IOTLB, "nesting_iotlb" }, + { CACHE_TAG_NESTING_DEVTLB, "nesting_devtlb" }), + __entry->domain_id, __entry->start, __entry->end, + __entry->addr, __entry->pages, __entry->mask + ) +); + +DEFINE_EVENT(cache_tag_flush, cache_tag_flush_range, + TP_PROTO(struct cache_tag *tag, unsigned long start, unsigned long end, + unsigned long addr, unsigned long pages, unsigned long mask), + TP_ARGS(tag, start, end, addr, pages, mask) +); + +DEFINE_EVENT(cache_tag_flush, cache_tag_flush_range_np, + TP_PROTO(struct cache_tag *tag, unsigned long start, unsigned long end, + unsigned long addr, unsigned long pages, unsigned long mask), + TP_ARGS(tag, start, end, addr, pages, mask) +); +#endif /* _TRACE_INTEL_IOMMU_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH ../../drivers/iommu/intel/ +#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h> |
