author | Rodrigo Vivi <rodrigo.vivi@intel.com> | 2022-12-30 04:09:09 -0500 |
---|---|---|
committer | Rodrigo Vivi <rodrigo.vivi@intel.com> | 2022-12-30 04:18:36 -0500 |
commit | b501d4dc83aa3940189b68045cadc8b3eac73988 (patch) | |
tree | e5310a4fafd59505c3db86bb05baf964cb8da944 /drivers/iommu | |
parent | 4071d98b296a5bc5fd4b15ec651bd05800ec9510 (diff) | |
parent | 1b929c02afd37871d5afb9d498426f83432e71c2 (diff) | |
Merge drm/drm-next into drm-intel-gt-next
Sync after v6.2-rc1 landed in drm-next.
We need to get some dependencies in place before we can merge
the fixes series from Gwan-gyeong and Chris.
References: https://lore.kernel.org/all/Y6x5JCDnh2rvh4lA@intel.com/
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Diffstat (limited to 'drivers/iommu'): 59 files changed, 8745 insertions, 1147 deletions
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index dc5f7a156ff5..79707685d54a 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -188,6 +188,7 @@ config MSM_IOMMU source "drivers/iommu/amd/Kconfig" source "drivers/iommu/intel/Kconfig" +source "drivers/iommu/iommufd/Kconfig" config IRQ_REMAP bool "Support for Interrupt Remapping" @@ -389,7 +390,7 @@ config ARM_SMMU_V3 depends on ARM64 select IOMMU_API select IOMMU_IO_PGTABLE_LPAE - select GENERIC_MSI_IRQ_DOMAIN + select GENERIC_MSI_IRQ help Support for implementations of the ARM System MMU architecture version 3 providing translation support to a PCIe root complex. @@ -474,13 +475,13 @@ config QCOM_IOMMU Support for IOMMU on certain Qualcomm SoCs. config HYPERV_IOMMU - bool "Hyper-V x2APIC IRQ Handling" + bool "Hyper-V IRQ Handling" depends on HYPERV && X86 select IOMMU_API default HYPERV help - Stub IOMMU driver to handle IRQs as to allow Hyper-V Linux - guests to run with x2APIC mode enabled. + Stub IOMMU driver to handle IRQs to support Hyper-V Linux + guest and root partitions. config VIRTIO_IOMMU tristate "Virtio IOMMU driver" diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index cc9f381013c3..f461d0651385 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += amd/ intel/ arm/ +obj-y += amd/ intel/ arm/ iommufd/ obj-$(CONFIG_IOMMU_API) += iommu.o obj-$(CONFIG_IOMMU_API) += iommu-traces.o obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o @@ -28,6 +28,6 @@ obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o obj-$(CONFIG_S390_IOMMU) += s390-iommu.o obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o -obj-$(CONFIG_IOMMU_SVA) += iommu-sva-lib.o io-pgfault.o +obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o io-pgfault.o obj-$(CONFIG_SPRD_IOMMU) += sprd-iommu.o obj-$(CONFIG_APPLE_DART) += apple-dart.o diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 1d0a70c85333..3d684190b4d5 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -734,7 +734,6 @@ struct amd_iommu { u8 max_counters; #ifdef CONFIG_IRQ_REMAP struct irq_domain *ir_domain; - struct irq_domain *msi_domain; struct amd_irte_ops *irte_ops; #endif diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 1a2d425bf568..467b194975b3 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -85,7 +85,7 @@ #define LOOP_TIMEOUT 2000000 -#define IVRS_GET_SBDF_ID(seg, bus, dev, fd) (((seg & 0xffff) << 16) | ((bus & 0xff) << 8) \ +#define IVRS_GET_SBDF_ID(seg, bus, dev, fn) (((seg & 0xffff) << 16) | ((bus & 0xff) << 8) \ | ((dev & 0x1f) << 3) | (fn & 0x7)) /* @@ -3402,18 +3402,24 @@ static int __init parse_amd_iommu_options(char *str) static int __init parse_ivrs_ioapic(char *str) { u32 seg = 0, bus, dev, fn; - int ret, id, i; + int id, i; u32 devid; - ret = sscanf(str, "[%d]=%x:%x.%x", &id, &bus, &dev, &fn); - if (ret != 4) { - ret = sscanf(str, "[%d]=%x:%x:%x.%x", &id, &seg, &bus, &dev, &fn); - if (ret != 5) { - pr_err("Invalid command line: ivrs_ioapic%s\n", str); - return 1; - } + if (sscanf(str, "=%d@%x:%x.%x", &id, &bus, &dev, &fn) == 4 || + sscanf(str, "=%d@%x:%x:%x.%x", &id, &seg, &bus, &dev, &fn) == 5) + goto found; + + if (sscanf(str, "[%d]=%x:%x.%x", &id, &bus, &dev, &fn) == 4 || + sscanf(str, "[%d]=%x:%x:%x.%x", &id, &seg, &bus, &dev, &fn) == 5) { + pr_warn("ivrs_ioapic%s option format deprecated; use ivrs_ioapic=%d@%04x:%02x:%02x.%d 
instead\n", + str, id, seg, bus, dev, fn); + goto found; } + pr_err("Invalid command line: ivrs_ioapic%s\n", str); + return 1; + +found: if (early_ioapic_map_size == EARLY_MAP_SIZE) { pr_err("Early IOAPIC map overflow - ignoring ivrs_ioapic%s\n", str); @@ -3434,18 +3440,24 @@ static int __init parse_ivrs_ioapic(char *str) static int __init parse_ivrs_hpet(char *str) { u32 seg = 0, bus, dev, fn; - int ret, id, i; + int id, i; u32 devid; - ret = sscanf(str, "[%d]=%x:%x.%x", &id, &bus, &dev, &fn); - if (ret != 4) { - ret = sscanf(str, "[%d]=%x:%x:%x.%x", &id, &seg, &bus, &dev, &fn); - if (ret != 5) { - pr_err("Invalid command line: ivrs_hpet%s\n", str); - return 1; - } + if (sscanf(str, "=%d@%x:%x.%x", &id, &bus, &dev, &fn) == 4 || + sscanf(str, "=%d@%x:%x:%x.%x", &id, &seg, &bus, &dev, &fn) == 5) + goto found; + + if (sscanf(str, "[%d]=%x:%x.%x", &id, &bus, &dev, &fn) == 4 || + sscanf(str, "[%d]=%x:%x:%x.%x", &id, &seg, &bus, &dev, &fn) == 5) { + pr_warn("ivrs_hpet%s option format deprecated; use ivrs_hpet=%d@%04x:%02x:%02x.%d instead\n", + str, id, seg, bus, dev, fn); + goto found; } + pr_err("Invalid command line: ivrs_hpet%s\n", str); + return 1; + +found: if (early_hpet_map_size == EARLY_MAP_SIZE) { pr_err("Early HPET map overflow - ignoring ivrs_hpet%s\n", str); @@ -3466,19 +3478,36 @@ static int __init parse_ivrs_hpet(char *str) static int __init parse_ivrs_acpihid(char *str) { u32 seg = 0, bus, dev, fn; - char *hid, *uid, *p; + char *hid, *uid, *p, *addr; char acpiid[ACPIHID_UID_LEN + ACPIHID_HID_LEN] = {0}; - int ret, i; - - ret = sscanf(str, "[%x:%x.%x]=%s", &bus, &dev, &fn, acpiid); - if (ret != 4) { - ret = sscanf(str, "[%x:%x:%x.%x]=%s", &seg, &bus, &dev, &fn, acpiid); - if (ret != 5) { - pr_err("Invalid command line: ivrs_acpihid(%s)\n", str); - return 1; + int i; + + addr = strchr(str, '@'); + if (!addr) { + if (sscanf(str, "[%x:%x.%x]=%s", &bus, &dev, &fn, acpiid) == 4 || + sscanf(str, "[%x:%x:%x.%x]=%s", &seg, &bus, &dev, &fn, acpiid) == 5) { + pr_warn("ivrs_acpihid%s option format deprecated; use ivrs_acpihid=%s@%04x:%02x:%02x.%d instead\n", + str, acpiid, seg, bus, dev, fn); + goto found; } + goto not_found; } + /* We have the '@', make it the terminator to get just the acpiid */ + *addr++ = 0; + + if (sscanf(str, "=%s", acpiid) != 1) + goto not_found; + + if (sscanf(addr, "%x:%x.%x", &bus, &dev, &fn) == 3 || + sscanf(addr, "%x:%x:%x.%x", &seg, &bus, &dev, &fn) == 4) + goto found; + +not_found: + pr_err("Invalid command line: ivrs_acpihid%s\n", str); + return 1; + +found: p = acpiid; hid = strsep(&p, ":"); uid = p; @@ -3488,6 +3517,13 @@ static int __init parse_ivrs_acpihid(char *str) return 1; } + /* + * Ignore leading zeroes after ':', so e.g., AMDI0095:00 + * will match AMDI0095:0 in the second strcmp in acpi_dev_hid_uid_match + */ + while (*uid == '0' && *(uid + 1)) + uid++; + i = early_acpihid_map_size++; memcpy(early_acpihid_map[i].hid, hid, strlen(hid)); memcpy(early_acpihid_map[i].uid, uid, strlen(uid)); diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index d3b39d0416fa..cbeaab55c0db 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -767,7 +767,7 @@ EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier); static void iommu_poll_ga_log(struct amd_iommu *iommu) { - u32 head, tail, cnt = 0; + u32 head, tail; if (iommu->ga_log == NULL) return; @@ -780,7 +780,6 @@ static void iommu_poll_ga_log(struct amd_iommu *iommu) u64 log_entry; raw = (u64 *)(iommu->ga_log + head); - cnt++; /* Avoid memcpy function-call overhead */ log_entry = 
*raw; @@ -812,10 +811,10 @@ static void amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) { if (!irq_remapping_enabled || !dev_is_pci(dev) || - pci_dev_has_special_msi_domain(to_pci_dev(dev))) + !pci_dev_has_default_msi_parent_domain(to_pci_dev(dev))) return; - dev_set_msi_domain(dev, iommu->msi_domain); + dev_set_msi_domain(dev, iommu->ir_domain); } #else /* CONFIG_IRQ_REMAP */ @@ -2155,21 +2154,13 @@ static void amd_iommu_detach_device(struct iommu_domain *dom, static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev) { + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct protection_domain *domain = to_pdomain(dom); - struct iommu_dev_data *dev_data; - struct amd_iommu *iommu; + struct amd_iommu *iommu = rlookup_amd_iommu(dev); int ret; - if (!check_device(dev)) - return -EINVAL; - - dev_data = dev_iommu_priv_get(dev); dev_data->defer_attach = false; - iommu = rlookup_amd_iommu(dev); - if (!iommu) - return -EINVAL; - if (dev_data->domain) detach_device(dev); @@ -2286,6 +2277,8 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) return false; case IOMMU_CAP_PRE_BOOT_PROTECTION: return amdr_ivrs_remap_support; + case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: + return true; default: break; } @@ -3294,17 +3287,9 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, if (!info) return -EINVAL; - if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI && - info->type != X86_IRQ_ALLOC_TYPE_PCI_MSIX) + if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI) return -EINVAL; - /* - * With IRQ remapping enabled, don't need contiguous CPU vectors - * to support multiple MSI interrupts. - */ - if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI) - info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; - sbdf = get_devid(info); if (sbdf < 0) return -EINVAL; @@ -3656,6 +3641,21 @@ static struct irq_chip amd_ir_chip = { .irq_compose_msi_msg = ir_compose_msi_msg, }; +static const struct msi_parent_ops amdvi_msi_parent_ops = { + .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | + MSI_FLAG_MULTI_PCI_MSI | + MSI_FLAG_PCI_IMS, + .prefix = "IR-", + .init_dev_msi_info = msi_parent_init_dev_msi_info, +}; + +static const struct msi_parent_ops virt_amdvi_msi_parent_ops = { + .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | + MSI_FLAG_MULTI_PCI_MSI, + .prefix = "vIR-", + .init_dev_msi_info = msi_parent_init_dev_msi_info, +}; + int amd_iommu_create_irq_domain(struct amd_iommu *iommu) { struct fwnode_handle *fn; @@ -3663,16 +3663,21 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu) fn = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index); if (!fn) return -ENOMEM; - iommu->ir_domain = irq_domain_create_tree(fn, &amd_ir_domain_ops, iommu); + iommu->ir_domain = irq_domain_create_hierarchy(arch_get_ir_parent_domain(), 0, 0, + fn, &amd_ir_domain_ops, iommu); if (!iommu->ir_domain) { irq_domain_free_fwnode(fn); return -ENOMEM; } - iommu->ir_domain->parent = arch_get_ir_parent_domain(); - iommu->msi_domain = arch_create_remap_msi_irq_domain(iommu->ir_domain, - "AMD-IR-MSI", - iommu->index); + irq_domain_update_bus_token(iommu->ir_domain, DOMAIN_BUS_AMDVI); + iommu->ir_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT; + + if (amd_iommu_np_cache) + iommu->ir_domain->msi_parent_ops = &virt_amdvi_msi_parent_ops; + else + iommu->ir_domain->msi_parent_ops = &amdvi_msi_parent_ops; + return 0; } diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c index 6a1f02c62dff..864e4ffb6aa9 100644 --- 
a/drivers/iommu/amd/iommu_v2.c +++ b/drivers/iommu/amd/iommu_v2.c @@ -587,6 +587,7 @@ out_drop_state: put_device_state(dev_state); out: + pci_dev_put(pdev); return ret; } @@ -639,7 +640,9 @@ int amd_iommu_bind_pasid(struct pci_dev *pdev, u32 pasid, if (pasid_state->mm == NULL) goto out_free; - mmu_notifier_register(&pasid_state->mn, mm); + ret = mmu_notifier_register(&pasid_state->mn, mm); + if (ret) + goto out_free; ret = set_pasid_state(dev_state, pasid_state, pasid); if (ret) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 5968a568aae2..a5a63b1c947e 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -10,7 +10,7 @@ #include <linux/slab.h> #include "arm-smmu-v3.h" -#include "../../iommu-sva-lib.h" +#include "../../iommu-sva.h" #include "../../io-pgtable-arm.h" struct arm_smmu_mmu_notifier { @@ -344,11 +344,6 @@ __arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm) if (!bond) return ERR_PTR(-ENOMEM); - /* Allocate a PASID for this mm if necessary */ - ret = iommu_sva_alloc_pasid(mm, 1, (1U << master->ssid_bits) - 1); - if (ret) - goto err_free_bond; - bond->mm = mm; bond->sva.dev = dev; refcount_set(&bond->refs, 1); @@ -367,42 +362,6 @@ err_free_bond: return ERR_PTR(ret); } -struct iommu_sva * -arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, void *drvdata) -{ - struct iommu_sva *handle; - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - - if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) - return ERR_PTR(-EINVAL); - - mutex_lock(&sva_lock); - handle = __arm_smmu_sva_bind(dev, mm); - mutex_unlock(&sva_lock); - return handle; -} - -void arm_smmu_sva_unbind(struct iommu_sva *handle) -{ - struct arm_smmu_bond *bond = sva_to_bond(handle); - - mutex_lock(&sva_lock); - if (refcount_dec_and_test(&bond->refs)) { - list_del(&bond->list); - arm_smmu_mmu_notifier_put(bond->smmu_mn); - kfree(bond); - } - mutex_unlock(&sva_lock); -} - -u32 arm_smmu_sva_get_pasid(struct iommu_sva *handle) -{ - struct arm_smmu_bond *bond = sva_to_bond(handle); - - return bond->mm->pasid; -} - bool arm_smmu_sva_supported(struct arm_smmu_device *smmu) { unsigned long reg, fld; @@ -550,3 +509,64 @@ void arm_smmu_sva_notifier_synchronize(void) */ mmu_notifier_synchronize(); } + +void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t id) +{ + struct mm_struct *mm = domain->mm; + struct arm_smmu_bond *bond = NULL, *t; + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + + mutex_lock(&sva_lock); + list_for_each_entry(t, &master->bonds, list) { + if (t->mm == mm) { + bond = t; + break; + } + } + + if (!WARN_ON(!bond) && refcount_dec_and_test(&bond->refs)) { + list_del(&bond->list); + arm_smmu_mmu_notifier_put(bond->smmu_mn); + kfree(bond); + } + mutex_unlock(&sva_lock); +} + +static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t id) +{ + int ret = 0; + struct iommu_sva *handle; + struct mm_struct *mm = domain->mm; + + mutex_lock(&sva_lock); + handle = __arm_smmu_sva_bind(dev, mm); + if (IS_ERR(handle)) + ret = PTR_ERR(handle); + mutex_unlock(&sva_lock); + + return ret; +} + +static void arm_smmu_sva_domain_free(struct iommu_domain *domain) +{ + kfree(domain); +} + +static const struct iommu_domain_ops arm_smmu_sva_domain_ops = { + .set_dev_pasid = arm_smmu_sva_set_dev_pasid, + .free = arm_smmu_sva_domain_free +}; 
+ +struct iommu_domain *arm_smmu_sva_domain_alloc(void) +{ + struct iommu_domain *domain; + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return NULL; + domain->ops = &arm_smmu_sva_domain_ops; + + return domain; +} diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 6d5df91c5c46..ab160198edd6 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -29,7 +29,7 @@ #include "arm-smmu-v3.h" #include "../../dma-iommu.h" -#include "../../iommu-sva-lib.h" +#include "../../iommu-sva.h" static bool disable_bypass = true; module_param(disable_bypass, bool, 0444); @@ -2009,6 +2009,9 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) { struct arm_smmu_domain *smmu_domain; + if (type == IOMMU_DOMAIN_SVA) + return arm_smmu_sva_domain_alloc(); + if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA && type != IOMMU_DOMAIN_DMA_FQ && @@ -2430,23 +2433,14 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) goto out_unlock; } } else if (smmu_domain->smmu != smmu) { - dev_err(dev, - "cannot attach to SMMU %s (upstream of %s)\n", - dev_name(smmu_domain->smmu->dev), - dev_name(smmu->dev)); - ret = -ENXIO; + ret = -EINVAL; goto out_unlock; } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 && master->ssid_bits != smmu_domain->s1_cfg.s1cdmax) { - dev_err(dev, - "cannot attach to incompatible domain (%u SSID bits != %u)\n", - smmu_domain->s1_cfg.s1cdmax, master->ssid_bits); ret = -EINVAL; goto out_unlock; } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 && smmu_domain->stall_enabled != master->stall_enabled) { - dev_err(dev, "cannot attach to stall-%s domain\n", - smmu_domain->stall_enabled ? 
"enabled" : "disabled"); ret = -EINVAL; goto out_unlock; } @@ -2838,6 +2832,17 @@ static int arm_smmu_def_domain_type(struct device *dev) return 0; } +static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid) +{ + struct iommu_domain *domain; + + domain = iommu_get_domain_for_dev_pasid(dev, pasid, IOMMU_DOMAIN_SVA); + if (WARN_ON(IS_ERR(domain)) || !domain) + return; + + arm_smmu_sva_remove_dev_pasid(domain, dev, pasid); +} + static struct iommu_ops arm_smmu_ops = { .capable = arm_smmu_capable, .domain_alloc = arm_smmu_domain_alloc, @@ -2846,11 +2851,9 @@ static struct iommu_ops arm_smmu_ops = { .device_group = arm_smmu_device_group, .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, + .remove_dev_pasid = arm_smmu_remove_dev_pasid, .dev_enable_feat = arm_smmu_dev_enable_feature, .dev_disable_feat = arm_smmu_dev_disable_feature, - .sva_bind = arm_smmu_sva_bind, - .sva_unbind = arm_smmu_sva_unbind, - .sva_get_pasid = arm_smmu_sva_get_pasid, .page_response = arm_smmu_page_response, .def_domain_type = arm_smmu_def_domain_type, .pgsize_bitmap = -1UL, /* Restricted during device attach */ @@ -3543,6 +3546,7 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) /* SID/SSID sizes */ smmu->ssid_bits = FIELD_GET(IDR1_SSIDSIZE, reg); smmu->sid_bits = FIELD_GET(IDR1_SIDSIZE, reg); + smmu->iommu.max_pasids = 1UL << smmu->ssid_bits; /* * If the SMMU supports fewer bits than would fill a single L2 stream diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index cd48590ada30..8d772ea8a583 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -754,11 +754,10 @@ bool arm_smmu_master_sva_enabled(struct arm_smmu_master *master); int arm_smmu_master_enable_sva(struct arm_smmu_master *master); int arm_smmu_master_disable_sva(struct arm_smmu_master *master); bool arm_smmu_master_iopf_supported(struct arm_smmu_master *master); -struct iommu_sva *arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, - void *drvdata); -void arm_smmu_sva_unbind(struct iommu_sva *handle); -u32 arm_smmu_sva_get_pasid(struct iommu_sva *handle); void arm_smmu_sva_notifier_synchronize(void); +struct iommu_domain *arm_smmu_sva_domain_alloc(void); +void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t id); #else /* CONFIG_ARM_SMMU_V3_SVA */ static inline bool arm_smmu_sva_supported(struct arm_smmu_device *smmu) { @@ -790,19 +789,17 @@ static inline bool arm_smmu_master_iopf_supported(struct arm_smmu_master *master return false; } -static inline struct iommu_sva * -arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, void *drvdata) +static inline void arm_smmu_sva_notifier_synchronize(void) {} + +static inline struct iommu_domain *arm_smmu_sva_domain_alloc(void) { - return ERR_PTR(-ENODEV); + return NULL; } -static inline void arm_smmu_sva_unbind(struct iommu_sva *handle) {} - -static inline u32 arm_smmu_sva_get_pasid(struct iommu_sva *handle) +static inline void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, + ioasid_t id) { - return IOMMU_PASID_INVALID; } - -static inline void arm_smmu_sva_notifier_synchronize(void) {} #endif /* CONFIG_ARM_SMMU_V3_SVA */ #endif /* _ARM_SMMU_V3_H */ diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c index 658f3cc83278..9dc772f2cbb2 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c +++ 
b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c @@ -136,6 +136,9 @@ int arm_mmu500_reset(struct arm_smmu_device *smmu) reg = arm_smmu_cb_read(smmu, i, ARM_SMMU_CB_ACTLR); reg &= ~ARM_MMU500_ACTLR_CPRE; arm_smmu_cb_write(smmu, i, ARM_SMMU_CB_ACTLR, reg); + reg = arm_smmu_cb_read(smmu, i, ARM_SMMU_CB_ACTLR); + if (reg & ARM_MMU500_ACTLR_CPRE) + dev_warn_once(smmu->dev, "Failed to disable prefetcher [errata #841119 and #826419], check ACR.CACHE_LOCK\n"); } return 0; diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c index 6eed8e67a0ca..74e9ef2fd580 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c @@ -10,16 +10,6 @@ #include "arm-smmu.h" #include "arm-smmu-qcom.h" -enum qcom_smmu_impl_reg_offset { - QCOM_SMMU_TBU_PWR_STATUS, - QCOM_SMMU_STATS_SYNC_INV_TBU_ACK, - QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR, -}; - -struct qcom_smmu_config { - const u32 *reg_offset; -}; - void qcom_smmu_tlb_sync_debug(struct arm_smmu_device *smmu) { int ret; @@ -59,84 +49,3 @@ void qcom_smmu_tlb_sync_debug(struct arm_smmu_device *smmu) tbu_pwr_status, sync_inv_ack, sync_inv_progress); } } - -/* Implementation Defined Register Space 0 register offsets */ -static const u32 qcom_smmu_impl0_reg_offset[] = { - [QCOM_SMMU_TBU_PWR_STATUS] = 0x2204, - [QCOM_SMMU_STATS_SYNC_INV_TBU_ACK] = 0x25dc, - [QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR] = 0x2670, -}; - -static const struct qcom_smmu_config qcm2290_smmu_cfg = { - .reg_offset = qcom_smmu_impl0_reg_offset, -}; - -static const struct qcom_smmu_config sc7180_smmu_cfg = { - .reg_offset = qcom_smmu_impl0_reg_offset, -}; - -static const struct qcom_smmu_config sc7280_smmu_cfg = { - .reg_offset = qcom_smmu_impl0_reg_offset, -}; - -static const struct qcom_smmu_config sc8180x_smmu_cfg = { - .reg_offset = qcom_smmu_impl0_reg_offset, -}; - -static const struct qcom_smmu_config sc8280xp_smmu_cfg = { - .reg_offset = qcom_smmu_impl0_reg_offset, -}; - -static const struct qcom_smmu_config sm6125_smmu_cfg = { - .reg_offset = qcom_smmu_impl0_reg_offset, -}; - -static const struct qcom_smmu_config sm6350_smmu_cfg = { - .reg_offset = qcom_smmu_impl0_reg_offset, -}; - -static const struct qcom_smmu_config sm8150_smmu_cfg = { - .reg_offset = qcom_smmu_impl0_reg_offset, -}; - -static const struct qcom_smmu_config sm8250_smmu_cfg = { - .reg_offset = qcom_smmu_impl0_reg_offset, -}; - -static const struct qcom_smmu_config sm8350_smmu_cfg = { - .reg_offset = qcom_smmu_impl0_reg_offset, -}; - -static const struct qcom_smmu_config sm8450_smmu_cfg = { - .reg_offset = qcom_smmu_impl0_reg_offset, -}; - -static const struct of_device_id __maybe_unused qcom_smmu_impl_debug_match[] = { - { .compatible = "qcom,msm8998-smmu-v2" }, - { .compatible = "qcom,qcm2290-smmu-500", .data = &qcm2290_smmu_cfg }, - { .compatible = "qcom,sc7180-smmu-500", .data = &sc7180_smmu_cfg }, - { .compatible = "qcom,sc7280-smmu-500", .data = &sc7280_smmu_cfg}, - { .compatible = "qcom,sc8180x-smmu-500", .data = &sc8180x_smmu_cfg }, - { .compatible = "qcom,sc8280xp-smmu-500", .data = &sc8280xp_smmu_cfg }, - { .compatible = "qcom,sdm630-smmu-v2" }, - { .compatible = "qcom,sdm845-smmu-500" }, - { .compatible = "qcom,sm6125-smmu-500", .data = &sm6125_smmu_cfg}, - { .compatible = "qcom,sm6350-smmu-500", .data = &sm6350_smmu_cfg}, - { .compatible = "qcom,sm8150-smmu-500", .data = &sm8150_smmu_cfg }, - { .compatible = "qcom,sm8250-smmu-500", .data = &sm8250_smmu_cfg }, - { .compatible = "qcom,sm8350-smmu-500", .data = 
&sm8350_smmu_cfg }, - { .compatible = "qcom,sm8450-smmu-500", .data = &sm8450_smmu_cfg }, - { } -}; - -const void *qcom_smmu_impl_data(struct arm_smmu_device *smmu) -{ - const struct of_device_id *match; - const struct device_node *np = smmu->dev->of_node; - - match = of_match_node(qcom_smmu_impl_debug_match, np); - if (!match) - return NULL; - - return match->data; -} diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index b2708de25ea3..91d404deb115 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -361,6 +361,8 @@ static int qcom_sdm845_smmu500_reset(struct arm_smmu_device *smmu) { int ret; + arm_mmu500_reset(smmu); + /* * To address performance degradation in non-real time clients, * such as USB and UFS, turn off wait-for-safe on sdm845 based boards, @@ -374,41 +376,67 @@ static int qcom_sdm845_smmu500_reset(struct arm_smmu_device *smmu) return ret; } -static int qcom_smmu500_reset(struct arm_smmu_device *smmu) -{ - const struct device_node *np = smmu->dev->of_node; - - arm_mmu500_reset(smmu); - - if (of_device_is_compatible(np, "qcom,sdm845-smmu-500")) - return qcom_sdm845_smmu500_reset(smmu); +static const struct arm_smmu_impl qcom_smmu_v2_impl = { + .init_context = qcom_smmu_init_context, + .cfg_probe = qcom_smmu_cfg_probe, + .def_domain_type = qcom_smmu_def_domain_type, + .write_s2cr = qcom_smmu_write_s2cr, + .tlb_sync = qcom_smmu_tlb_sync, +}; - return 0; -} +static const struct arm_smmu_impl qcom_smmu_500_impl = { + .init_context = qcom_smmu_init_context, + .cfg_probe = qcom_smmu_cfg_probe, + .def_domain_type = qcom_smmu_def_domain_type, + .reset = arm_mmu500_reset, + .write_s2cr = qcom_smmu_write_s2cr, + .tlb_sync = qcom_smmu_tlb_sync, +}; -static const struct arm_smmu_impl qcom_smmu_impl = { +static const struct arm_smmu_impl sdm845_smmu_500_impl = { .init_context = qcom_smmu_init_context, .cfg_probe = qcom_smmu_cfg_probe, .def_domain_type = qcom_smmu_def_domain_type, - .reset = qcom_smmu500_reset, + .reset = qcom_sdm845_smmu500_reset, .write_s2cr = qcom_smmu_write_s2cr, .tlb_sync = qcom_smmu_tlb_sync, }; -static const struct arm_smmu_impl qcom_adreno_smmu_impl = { +static const struct arm_smmu_impl qcom_adreno_smmu_v2_impl = { + .init_context = qcom_adreno_smmu_init_context, + .def_domain_type = qcom_smmu_def_domain_type, + .alloc_context_bank = qcom_adreno_smmu_alloc_context_bank, + .write_sctlr = qcom_adreno_smmu_write_sctlr, + .tlb_sync = qcom_smmu_tlb_sync, +}; + +static const struct arm_smmu_impl qcom_adreno_smmu_500_impl = { .init_context = qcom_adreno_smmu_init_context, .def_domain_type = qcom_smmu_def_domain_type, - .reset = qcom_smmu500_reset, + .reset = arm_mmu500_reset, .alloc_context_bank = qcom_adreno_smmu_alloc_context_bank, .write_sctlr = qcom_adreno_smmu_write_sctlr, .tlb_sync = qcom_smmu_tlb_sync, }; static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu, - const struct arm_smmu_impl *impl) + const struct qcom_smmu_match_data *data) { + const struct device_node *np = smmu->dev->of_node; + const struct arm_smmu_impl *impl; struct qcom_smmu *qsmmu; + if (!data) + return ERR_PTR(-EINVAL); + + if (np && of_device_is_compatible(np, "qcom,adreno-smmu")) + impl = data->adreno_impl; + else + impl = data->impl; + + if (!impl) + return smmu; + /* Check to make sure qcom_scm has finished probing */ if (!qcom_scm_is_available()) return ERR_PTR(-EPROBE_DEFER); @@ -418,27 +446,77 @@ static struct arm_smmu_device *qcom_smmu_create(struct 
arm_smmu_device *smmu, return ERR_PTR(-ENOMEM); qsmmu->smmu.impl = impl; - qsmmu->cfg = qcom_smmu_impl_data(smmu); + qsmmu->cfg = data->cfg; return &qsmmu->smmu; } +/* Implementation Defined Register Space 0 register offsets */ +static const u32 qcom_smmu_impl0_reg_offset[] = { + [QCOM_SMMU_TBU_PWR_STATUS] = 0x2204, + [QCOM_SMMU_STATS_SYNC_INV_TBU_ACK] = 0x25dc, + [QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR] = 0x2670, +}; + +static const struct qcom_smmu_config qcom_smmu_impl0_cfg = { + .reg_offset = qcom_smmu_impl0_reg_offset, +}; + +/* + * It is not yet possible to use MDP SMMU with the bypass quirk on the msm8996, + * there are not enough context banks. + */ +static const struct qcom_smmu_match_data msm8996_smmu_data = { + .impl = NULL, + .adreno_impl = &qcom_adreno_smmu_v2_impl, +}; + +static const struct qcom_smmu_match_data qcom_smmu_v2_data = { + .impl = &qcom_smmu_v2_impl, + .adreno_impl = &qcom_adreno_smmu_v2_impl, +}; + +static const struct qcom_smmu_match_data sdm845_smmu_500_data = { + .impl = &sdm845_smmu_500_impl, + /* + * No need for adreno impl here. On sdm845 the Adreno SMMU is handled + * by the separate sdm845-smmu-v2 device. + */ + /* Also no debug configuration. */ +}; + +static const struct qcom_smmu_match_data qcom_smmu_500_impl0_data = { + .impl = &qcom_smmu_500_impl, + .adreno_impl = &qcom_adreno_smmu_500_impl, + .cfg = &qcom_smmu_impl0_cfg, +}; + +/* + * Do not add any more qcom,SOC-smmu-500 entries to this list, unless they need + * special handling and can not be covered by the qcom,smmu-500 entry. + */ static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = { - { .compatible = "qcom,msm8998-smmu-v2" }, - { .compatible = "qcom,qcm2290-smmu-500" }, - { .compatible = "qcom,sc7180-smmu-500" }, - { .compatible = "qcom,sc7280-smmu-500" }, - { .compatible = "qcom,sc8180x-smmu-500" }, - { .compatible = "qcom,sc8280xp-smmu-500" }, - { .compatible = "qcom,sdm630-smmu-v2" }, - { .compatible = "qcom,sdm845-smmu-500" }, - { .compatible = "qcom,sm6125-smmu-500" }, - { .compatible = "qcom,sm6350-smmu-500" }, - { .compatible = "qcom,sm6375-smmu-500" }, - { .compatible = "qcom,sm8150-smmu-500" }, - { .compatible = "qcom,sm8250-smmu-500" }, - { .compatible = "qcom,sm8350-smmu-500" }, - { .compatible = "qcom,sm8450-smmu-500" }, + { .compatible = "qcom,msm8996-smmu-v2", .data = &msm8996_smmu_data }, + { .compatible = "qcom,msm8998-smmu-v2", .data = &qcom_smmu_v2_data }, + { .compatible = "qcom,qcm2290-smmu-500", .data = &qcom_smmu_500_impl0_data }, + { .compatible = "qcom,qdu1000-smmu-500", .data = &qcom_smmu_500_impl0_data }, + { .compatible = "qcom,sc7180-smmu-500", .data = &qcom_smmu_500_impl0_data }, + { .compatible = "qcom,sc7280-smmu-500", .data = &qcom_smmu_500_impl0_data }, + { .compatible = "qcom,sc8180x-smmu-500", .data = &qcom_smmu_500_impl0_data }, + { .compatible = "qcom,sc8280xp-smmu-500", .data = &qcom_smmu_500_impl0_data }, + { .compatible = "qcom,sdm630-smmu-v2", .data = &qcom_smmu_v2_data }, + { .compatible = "qcom,sdm845-smmu-v2", .data = &qcom_smmu_v2_data }, + { .compatible = "qcom,sdm845-smmu-500", .data = &sdm845_smmu_500_data }, + { .compatible = "qcom,sm6115-smmu-500", .data = &qcom_smmu_500_impl0_data}, + { .compatible = "qcom,sm6125-smmu-500", .data = &qcom_smmu_500_impl0_data }, + { .compatible = "qcom,sm6350-smmu-v2", .data = &qcom_smmu_v2_data }, + { .compatible = "qcom,sm6350-smmu-500", .data = &qcom_smmu_500_impl0_data }, + { .compatible = "qcom,sm6375-smmu-500", .data = &qcom_smmu_500_impl0_data }, + { .compatible = 
"qcom,sm8150-smmu-500", .data = &qcom_smmu_500_impl0_data }, + { .compatible = "qcom,sm8250-smmu-500", .data = &qcom_smmu_500_impl0_data }, + { .compatible = "qcom,sm8350-smmu-500", .data = &qcom_smmu_500_impl0_data }, + { .compatible = "qcom,sm8450-smmu-500", .data = &qcom_smmu_500_impl0_data }, + { .compatible = "qcom,smmu-500", .data = &qcom_smmu_500_impl0_data }, { } }; @@ -453,26 +531,19 @@ static struct acpi_platform_list qcom_acpi_platlist[] = { struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu) { const struct device_node *np = smmu->dev->of_node; + const struct of_device_id *match; #ifdef CONFIG_ACPI if (np == NULL) { /* Match platform for ACPI boot */ if (acpi_match_platform_list(qcom_acpi_platlist) >= 0) - return qcom_smmu_create(smmu, &qcom_smmu_impl); + return qcom_smmu_create(smmu, &qcom_smmu_500_impl0_data); } #endif - /* - * Do not change this order of implementation, i.e., first adreno - * smmu impl and then apss smmu since we can have both implementing - * arm,mmu-500 in which case we will miss setting adreno smmu specific - * features if the order is changed. - */ - if (of_device_is_compatible(np, "qcom,adreno-smmu")) - return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); - - if (of_match_node(qcom_smmu_impl_of_match, np)) - return qcom_smmu_create(smmu, &qcom_smmu_impl); + match = of_match_node(qcom_smmu_impl_of_match, np); + if (match) + return qcom_smmu_create(smmu, match->data); return smmu; } diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h index 99ec8f8629a0..593910567b88 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h @@ -14,15 +14,26 @@ struct qcom_smmu { u32 stall_enabled; }; +enum qcom_smmu_impl_reg_offset { + QCOM_SMMU_TBU_PWR_STATUS, + QCOM_SMMU_STATS_SYNC_INV_TBU_ACK, + QCOM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR, +}; + +struct qcom_smmu_config { + const u32 *reg_offset; +}; + +struct qcom_smmu_match_data { + const struct qcom_smmu_config *cfg; + const struct arm_smmu_impl *impl; + const struct arm_smmu_impl *adreno_impl; +}; + #ifdef CONFIG_ARM_SMMU_QCOM_DEBUG void qcom_smmu_tlb_sync_debug(struct arm_smmu_device *smmu); -const void *qcom_smmu_impl_data(struct arm_smmu_device *smmu); #else static inline void qcom_smmu_tlb_sync_debug(struct arm_smmu_device *smmu) { } -static inline const void *qcom_smmu_impl_data(struct arm_smmu_device *smmu) -{ - return NULL; -} #endif #endif /* _ARM_SMMU_QCOM_H */ diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 30dab1418e3f..719fbca1fe52 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1150,9 +1150,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) * different SMMUs. */ if (smmu_domain->smmu != smmu) { - dev_err(dev, - "cannot attach to SMMU %s whilst already attached to domain on SMMU %s\n", - dev_name(smmu_domain->smmu->dev), dev_name(smmu->dev)); ret = -EINVAL; goto rpm_put; } diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c index 3869c3ecda8c..270c3d9128ba 100644 --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c @@ -381,13 +381,8 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev * Sanity check the domain. We don't support domains across * different IOMMUs. 
*/ - if (qcom_domain->iommu != qcom_iommu) { - dev_err(dev, "cannot attach to IOMMU %s while already " - "attached to domain on IOMMU %s\n", - dev_name(qcom_domain->iommu->dev), - dev_name(qcom_iommu->dev)); + if (qcom_domain->iommu != qcom_iommu) return -EINVAL; - } return 0; } @@ -415,7 +410,8 @@ static void qcom_iommu_detach_dev(struct iommu_domain *domain, struct device *de } static int qcom_iommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) + phys_addr_t paddr, size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped) { int ret; unsigned long flags; @@ -426,13 +422,14 @@ static int qcom_iommu_map(struct iommu_domain *domain, unsigned long iova, return -ENODEV; spin_lock_irqsave(&qcom_domain->pgtbl_lock, flags); - ret = ops->map(ops, iova, paddr, size, prot, GFP_ATOMIC); + ret = ops->map_pages(ops, iova, paddr, pgsize, pgcount, prot, GFP_ATOMIC, mapped); spin_unlock_irqrestore(&qcom_domain->pgtbl_lock, flags); return ret; } static size_t qcom_iommu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t size, struct iommu_iotlb_gather *gather) + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *gather) { size_t ret; unsigned long flags; @@ -449,7 +446,7 @@ static size_t qcom_iommu_unmap(struct iommu_domain *domain, unsigned long iova, */ pm_runtime_get_sync(qcom_domain->iommu->dev); spin_lock_irqsave(&qcom_domain->pgtbl_lock, flags); - ret = ops->unmap(ops, iova, size, gather); + ret = ops->unmap_pages(ops, iova, pgsize, pgcount, gather); spin_unlock_irqrestore(&qcom_domain->pgtbl_lock, flags); pm_runtime_put_sync(qcom_domain->iommu->dev); @@ -587,8 +584,8 @@ static const struct iommu_ops qcom_iommu_ops = { .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = qcom_iommu_attach_dev, .detach_dev = qcom_iommu_detach_dev, - .map = qcom_iommu_map, - .unmap = qcom_iommu_unmap, + .map_pages = qcom_iommu_map, + .unmap_pages = qcom_iommu_unmap, .flush_iotlb_all = qcom_iommu_flush_iotlb_all, .iotlb_sync = qcom_iommu_iotlb_sync, .iova_to_phys = qcom_iommu_iova_to_phys, diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 9297b741f5e8..f798c44e0903 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -744,9 +744,6 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev, /* IOMMU can map any pages, so himem can also be used here */ gfp |= __GFP_NOWARN | __GFP_HIGHMEM; - /* It makes no sense to muck about with huge pages */ - gfp &= ~__GFP_COMP; - while (count) { struct page *page = NULL; unsigned int order_size; diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index 45fd4850bacb..b0cde2211987 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -708,10 +708,6 @@ static int exynos_sysmmu_probe(struct platform_device *pdev) if (ret) return ret; - ret = iommu_device_register(&data->iommu, &exynos_iommu_ops, dev); - if (ret) - goto err_iommu_register; - platform_set_drvdata(pdev, data); if (PG_ENT_SHIFT < 0) { @@ -743,11 +739,13 @@ static int exynos_sysmmu_probe(struct platform_device *pdev) pm_runtime_enable(dev); + ret = iommu_device_register(&data->iommu, &exynos_iommu_ops, dev); + if (ret) + goto err_dma_set_mask; + return 0; err_dma_set_mask: - iommu_device_unregister(&data->iommu); -err_iommu_register: iommu_device_sysfs_remove(&data->iommu); return ret; } @@ -1432,12 +1430,6 @@ static int __init exynos_iommu_init(void) return -ENOMEM; } - ret = 
platform_driver_register(&exynos_sysmmu_driver); - if (ret) { - pr_err("%s: Failed to register driver\n", __func__); - goto err_reg_driver; - } - zero_lv2_table = kmem_cache_zalloc(lv2table_kmem_cache, GFP_KERNEL); if (zero_lv2_table == NULL) { pr_err("%s: Failed to allocate zero level2 page table\n", @@ -1446,10 +1438,16 @@ static int __init exynos_iommu_init(void) goto err_zero_lv2; } + ret = platform_driver_register(&exynos_sysmmu_driver); + if (ret) { + pr_err("%s: Failed to register driver\n", __func__); + goto err_reg_driver; + } + return 0; -err_zero_lv2: - platform_driver_unregister(&exynos_sysmmu_driver); err_reg_driver: + platform_driver_unregister(&exynos_sysmmu_driver); +err_zero_lv2: kmem_cache_destroy(lv2table_kmem_cache); return ret; } diff --git a/drivers/iommu/fsl_pamu.c b/drivers/iommu/fsl_pamu.c index 0d03f837a5d4..05d820fb1d0b 100644 --- a/drivers/iommu/fsl_pamu.c +++ b/drivers/iommu/fsl_pamu.c @@ -211,7 +211,7 @@ int pamu_config_ppaace(int liodn, u32 omi, u32 stashid, int prot) ppaace->op_encode.index_ot.omi = omi; } else if (~omi != 0) { pr_debug("bad operation mapping index: %d\n", omi); - return -EINVAL; + return -ENODEV; } /* configure stash id */ @@ -779,7 +779,7 @@ static int fsl_pamu_probe(struct platform_device *pdev) of_get_address(dev->of_node, 0, &size, NULL); irq = irq_of_parse_and_map(dev->of_node, 0); - if (irq == NO_IRQ) { + if (!irq) { dev_warn(dev, "no interrupts listed in PAMU node\n"); goto error; } @@ -868,7 +868,7 @@ static int fsl_pamu_probe(struct platform_device *pdev) ret = create_csd(ppaact_phys, mem_size, csd_port_id); if (ret) { dev_err(dev, "could not create coherence subdomain\n"); - return ret; + goto error; } } @@ -903,7 +903,7 @@ static int fsl_pamu_probe(struct platform_device *pdev) return 0; error: - if (irq != NO_IRQ) + if (irq) free_irq(irq, data); kfree_sensitive(data); diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c index fa20f4b03e12..4408ac3c49b6 100644 --- a/drivers/iommu/fsl_pamu_domain.c +++ b/drivers/iommu/fsl_pamu_domain.c @@ -258,7 +258,7 @@ static int fsl_pamu_attach_device(struct iommu_domain *domain, liodn = of_get_property(dev->of_node, "fsl,liodn", &len); if (!liodn) { pr_debug("missing fsl,liodn property at %pOF\n", dev->of_node); - return -EINVAL; + return -ENODEV; } spin_lock_irqsave(&dma_domain->domain_lock, flags); @@ -267,7 +267,7 @@ static int fsl_pamu_attach_device(struct iommu_domain *domain, if (liodn[i] >= PAACE_NUMBER_ENTRIES) { pr_debug("Invalid liodn %d, attach device failed for %pOF\n", liodn[i], dev->of_node); - ret = -EINVAL; + ret = -ENODEV; break; } diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index e190bb8c225c..8302db7f783e 100644 --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -122,9 +122,12 @@ static int __init hyperv_prepare_irq_remapping(void) const char *name; const struct irq_domain_ops *ops; + /* + * For a Hyper-V root partition, ms_hyperv_msi_ext_dest_id() + * will always return false. 
+ */ if (!hypervisor_is_type(X86_HYPER_MS_HYPERV) || - x86_init.hyper.msi_ext_dest_id() || - !x2apic_supported()) + x86_init.hyper.msi_ext_dest_id()) return -ENODEV; if (hv_root_partition) { @@ -170,7 +173,9 @@ static int __init hyperv_prepare_irq_remapping(void) static int __init hyperv_enable_irq_remapping(void) { - return IRQ_REMAP_X2APIC_MODE; + if (x2apic_supported()) + return IRQ_REMAP_X2APIC_MODE; + return IRQ_REMAP_XAPIC_MODE; } struct irq_remap_ops hyperv_irq_remap_ops = { diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 5a8f780e7ffd..b00a0ceb2d13 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -820,6 +820,7 @@ int __init dmar_dev_scope_init(void) info = dmar_alloc_pci_notify_info(dev, BUS_NOTIFY_ADD_DEVICE); if (!info) { + pci_dev_put(dev); return dmar_dev_scope_status; } else { dmar_pci_bus_add_dev(info); @@ -1105,6 +1106,13 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) raw_spin_lock_init(&iommu->register_lock); /* + * A value of N in PSS field of eCap register indicates hardware + * supports PASID field of N+1 bits. + */ + if (pasid_supported(iommu)) + iommu->iommu.max_pasids = 2UL << ecap_pss(iommu->ecap); + + /* * This is only for hotplug; at boot time intel_iommu_enabled won't * be set yet. When intel_iommu_init() runs, it registers the units * present at boot time, then sets intel_iommu_enabled. diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 48cdcd0a5cf3..59df7e42fd53 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -27,7 +27,7 @@ #include "iommu.h" #include "../dma-iommu.h" #include "../irq_remapping.h" -#include "../iommu-sva-lib.h" +#include "../iommu-sva.h" #include "pasid.h" #include "cap_audit.h" @@ -277,7 +277,8 @@ static LIST_HEAD(dmar_satc_units); #define for_each_rmrr_units(rmrr) \ list_for_each_entry(rmrr, &dmar_rmrr_units, list) -static void dmar_remove_one_dev_info(struct device *dev); +static void device_block_translation(struct device *dev); +static void intel_iommu_domain_free(struct iommu_domain *domain); int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); @@ -382,11 +383,6 @@ static inline int domain_type_is_si(struct dmar_domain *domain) return domain->domain.type == IOMMU_DOMAIN_IDENTITY; } -static inline bool domain_use_first_level(struct dmar_domain *domain) -{ - return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL; -} - static inline int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) { @@ -500,7 +496,7 @@ static int domain_update_iommu_superpage(struct dmar_domain *domain, rcu_read_lock(); for_each_active_iommu(iommu, drhd) { if (iommu != skip) { - if (domain && domain_use_first_level(domain)) { + if (domain && domain->use_first_level) { if (!cap_fl1gp_support(iommu->cap)) mask = 0x1; } else { @@ -578,7 +574,7 @@ static void domain_update_iommu_cap(struct dmar_domain *domain) * paging and 57-bits with 5-level paging). Hence, skip bit * [N-1]. 
*/ - if (domain_use_first_level(domain)) + if (domain->use_first_level) domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); else domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); @@ -779,19 +775,6 @@ static void domain_flush_cache(struct dmar_domain *domain, clflush_cache_range(addr, size); } -static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn) -{ - struct context_entry *context; - int ret = 0; - - spin_lock(&iommu->lock); - context = iommu_context_addr(iommu, bus, devfn, 0); - if (context) - ret = context_present(context); - spin_unlock(&iommu->lock); - return ret; -} - static void free_context_table(struct intel_iommu *iommu) { struct context_entry *context; @@ -959,11 +942,9 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; - if (domain_use_first_level(domain)) { - pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US; - if (iommu_is_dma_domain(&domain->domain)) - pteval |= DMA_FL_PTE_ACCESS; - } + if (domain->use_first_level) + pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; + if (cmpxchg64(&pte->val, 0ULL, pteval)) /* Someone else set it while we were thinking; use theirs. */ free_pgtable_page(tmp_page); @@ -1398,11 +1379,29 @@ static void domain_update_iotlb(struct dmar_domain *domain) spin_unlock_irqrestore(&domain->lock, flags); } +/* + * The extra devTLB flush quirk impacts those QAT devices with PCI device + * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() + * check because it applies only to the built-in QAT devices and it doesn't + * grant additional privileges. + */ +#define BUGGY_QAT_DEVID_MASK 0x4940 +static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) +{ + if (pdev->vendor != PCI_VENDOR_ID_INTEL) + return false; + + if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK) + return false; + + return true; +} + static void iommu_enable_pci_caps(struct device_domain_info *info) { struct pci_dev *pdev; - if (!info || !dev_is_pci(info->dev)) + if (!dev_is_pci(info->dev)) return; pdev = to_pci_dev(info->dev); @@ -1442,7 +1441,7 @@ static void iommu_enable_pci_caps(struct device_domain_info *info) } } -static void iommu_disable_dev_iotlb(struct device_domain_info *info) +static void iommu_disable_pci_caps(struct device_domain_info *info) { struct pci_dev *pdev; @@ -1480,6 +1479,7 @@ static void __iommu_flush_dev_iotlb(struct device_domain_info *info, qdep = info->ats_qdep; qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, qdep, addr, mask); + quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep); } static void iommu_flush_dev_iotlb(struct dmar_domain *domain, @@ -1512,7 +1512,7 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, if (ih) ih = 1 << 6; - if (domain_use_first_level(domain)) { + if (domain->use_first_level) { qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih); } else { unsigned long bitmask = aligned_pages - 1; @@ -1566,7 +1566,7 @@ static inline void __mapping_notify_one(struct intel_iommu *iommu, * It's a non-present to present mapping. Only flush if caching mode * and second level. 
*/ - if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain)) + if (cap_caching_mode(iommu->cap) && !domain->use_first_level) iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1); else iommu_flush_write_buffer(iommu); @@ -1582,7 +1582,7 @@ static void intel_flush_iotlb_all(struct iommu_domain *domain) struct intel_iommu *iommu = info->iommu; u16 did = domain_id_iommu(dmar_domain, iommu); - if (domain_use_first_level(dmar_domain)) + if (dmar_domain->use_first_level) qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0); else iommu->flush.flush_iotlb(iommu, did, 0, 0, @@ -1755,7 +1755,7 @@ static struct dmar_domain *alloc_domain(unsigned int type) domain->nid = NUMA_NO_NODE; if (first_level_by_default(type)) - domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL; + domain->use_first_level = true; domain->has_iotlb_device = false; INIT_LIST_HEAD(&domain->devices); spin_lock_init(&domain->lock); @@ -2047,7 +2047,6 @@ static int domain_context_mapping_one(struct dmar_domain *domain, } else { iommu_flush_write_buffer(iommu); } - iommu_enable_pci_caps(info); ret = 0; @@ -2099,30 +2098,6 @@ domain_context_mapping(struct dmar_domain *domain, struct device *dev) &domain_context_mapping_cb, &data); } -static int domain_context_mapped_cb(struct pci_dev *pdev, - u16 alias, void *opaque) -{ - struct intel_iommu *iommu = opaque; - - return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff); -} - -static int domain_context_mapped(struct device *dev) -{ - struct intel_iommu *iommu; - u8 bus, devfn; - - iommu = device_to_iommu(dev, &bus, &devfn); - if (!iommu) - return -ENODEV; - - if (!dev_is_pci(dev)) - return device_context_mapped(iommu, bus, devfn); - - return !pci_for_each_dma_alias(to_pci_dev(dev), - domain_context_mapped_cb, iommu); -} - /* Returns a number of VTD pages, but aligned to MM page size */ static inline unsigned long aligned_nrpages(unsigned long host_addr, size_t size) @@ -2212,7 +2187,7 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); attr |= DMA_FL_PTE_PRESENT; - if (domain_use_first_level(domain)) { + if (domain->use_first_level) { attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; if (prot & DMA_PTE_WRITE) attr |= DMA_FL_PTE_DIRTY; @@ -2455,7 +2430,8 @@ static int __init si_domain_init(int hw) return 0; } -static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) +static int dmar_domain_attach_device(struct dmar_domain *domain, + struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu; @@ -2477,18 +2453,11 @@ static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) /* PASID table is mandatory for a PCI device in scalable mode. 
*/ if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { - ret = intel_pasid_alloc_table(dev); - if (ret) { - dev_err(dev, "PASID table allocation failed\n"); - dmar_remove_one_dev_info(dev); - return ret; - } - /* Setup the PASID entry for requests without PASID: */ if (hw_pass_through && domain_type_is_si(domain)) ret = intel_pasid_setup_pass_through(iommu, domain, dev, PASID_RID2PASID); - else if (domain_use_first_level(domain)) + else if (domain->use_first_level) ret = domain_setup_first_level(iommu, domain, dev, PASID_RID2PASID); else @@ -2496,7 +2465,7 @@ static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) dev, PASID_RID2PASID); if (ret) { dev_err(dev, "Setup RID2PASID failed\n"); - dmar_remove_one_dev_info(dev); + device_block_translation(dev); return ret; } } @@ -2504,10 +2473,12 @@ static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) ret = domain_context_mapping(domain, dev); if (ret) { dev_err(dev, "Domain context map failed\n"); - dmar_remove_one_dev_info(dev); + device_block_translation(dev); return ret; } + iommu_enable_pci_caps(info); + return 0; } @@ -3856,8 +3827,10 @@ static inline bool has_external_pci(void) struct pci_dev *pdev = NULL; for_each_pci_dev(pdev) - if (pdev->external_facing) + if (pdev->external_facing) { + pci_dev_put(pdev); return true; + } return false; } @@ -4106,9 +4079,8 @@ static void dmar_remove_one_dev_info(struct device *dev) intel_pasid_tear_down_entry(iommu, info->dev, PASID_RID2PASID, false); - iommu_disable_dev_iotlb(info); + iommu_disable_pci_caps(info); domain_context_clear(info); - intel_pasid_free_table(info->dev); } spin_lock_irqsave(&domain->lock, flags); @@ -4119,6 +4091,37 @@ static void dmar_remove_one_dev_info(struct device *dev) info->domain = NULL; } +/* + * Clear the page table pointer in context or pasid table entries so that + * all DMA requests without PASID from the device are blocked. If the page + * table has been set, clean up the data structures. 
+ */ +static void device_block_translation(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + unsigned long flags; + + iommu_disable_pci_caps(info); + if (!dev_is_real_dma_subdevice(dev)) { + if (sm_supported(iommu)) + intel_pasid_tear_down_entry(iommu, dev, + PASID_RID2PASID, false); + else + domain_context_clear(info); + } + + if (!info->domain) + return; + + spin_lock_irqsave(&info->domain->lock, flags); + list_del(&info->link); + spin_unlock_irqrestore(&info->domain->lock, flags); + + domain_detach_iommu(info->domain, iommu); + info->domain = NULL; +} + static int md_domain_init(struct dmar_domain *domain, int guest_width) { int adjust_width; @@ -4140,12 +4143,28 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) return 0; } +static int blocking_domain_attach_dev(struct iommu_domain *domain, + struct device *dev) +{ + device_block_translation(dev); + return 0; +} + +static struct iommu_domain blocking_domain = { + .ops = &(const struct iommu_domain_ops) { + .attach_dev = blocking_domain_attach_dev, + .free = intel_iommu_domain_free + } +}; + static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) { struct dmar_domain *dmar_domain; struct iommu_domain *domain; switch (type) { + case IOMMU_DOMAIN_BLOCKED: + return &blocking_domain; case IOMMU_DOMAIN_DMA: case IOMMU_DOMAIN_DMA_FQ: case IOMMU_DOMAIN_UNMANAGED: @@ -4169,6 +4188,8 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) return domain; case IOMMU_DOMAIN_IDENTITY: return &si_domain->domain; + case IOMMU_DOMAIN_SVA: + return intel_svm_domain_alloc(); default: return NULL; } @@ -4178,7 +4199,7 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) static void intel_iommu_domain_free(struct iommu_domain *domain) { - if (domain != &si_domain->domain) + if (domain != &si_domain->domain && domain != &blocking_domain) domain_exit(to_dmar_domain(domain)); } @@ -4194,19 +4215,15 @@ static int prepare_domain_attach_device(struct iommu_domain *domain, return -ENODEV; if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) - return -EOPNOTSUPP; + return -EINVAL; /* check if this iommu agaw is sufficient for max mapped address */ addr_width = agaw_to_width(iommu->agaw); if (addr_width > cap_mgaw(iommu->cap)) addr_width = cap_mgaw(iommu->cap); - if (dmar_domain->max_addr > (1LL << addr_width)) { - dev_err(dev, "%s: iommu width (%d) is not " - "sufficient for the mapped address (%llx)\n", - __func__, addr_width, dmar_domain->max_addr); - return -EFAULT; - } + if (dmar_domain->max_addr > (1LL << addr_width)) + return -EINVAL; dmar_domain->gaw = addr_width; /* @@ -4229,6 +4246,7 @@ static int prepare_domain_attach_device(struct iommu_domain *domain, static int intel_iommu_attach_device(struct iommu_domain *domain, struct device *dev) { + struct device_domain_info *info = dev_iommu_priv_get(dev); int ret; if (domain->type == IOMMU_DOMAIN_UNMANAGED && @@ -4237,25 +4255,14 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, return -EPERM; } - /* normally dev is not mapped */ - if (unlikely(domain_context_mapped(dev))) { - struct device_domain_info *info = dev_iommu_priv_get(dev); - - if (info->domain) - dmar_remove_one_dev_info(dev); - } + if (info->domain) + device_block_translation(dev); ret = prepare_domain_attach_device(domain, dev); if (ret) return ret; - return domain_add_dev_info(to_dmar_domain(domain), dev); -} - -static void intel_iommu_detach_device(struct iommu_domain 
*domain, - struct device *dev) -{ - dmar_remove_one_dev_info(dev); + return dmar_domain_attach_device(to_dmar_domain(domain), dev); } static int intel_iommu_map(struct iommu_domain *domain, @@ -4419,7 +4426,7 @@ static void domain_set_force_snooping(struct dmar_domain *domain) * Second level page table supports per-PTE snoop control. The * iommu_map() interface will handle this by setting SNP bit. */ - if (!domain_use_first_level(domain)) { + if (!domain->use_first_level) { domain->set_pte_snp = true; return; } @@ -4452,14 +4459,20 @@ static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) { - if (cap == IOMMU_CAP_CACHE_COHERENCY) + struct device_domain_info *info = dev_iommu_priv_get(dev); + + switch (cap) { + case IOMMU_CAP_CACHE_COHERENCY: return true; - if (cap == IOMMU_CAP_INTR_REMAP) + case IOMMU_CAP_INTR_REMAP: return irq_remapping_enabled == 1; - if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION) + case IOMMU_CAP_PRE_BOOT_PROTECTION: return dmar_platform_optin(); - - return false; + case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: + return ecap_sc_support(info->iommu->ecap); + default: + return false; + } } static struct iommu_device *intel_iommu_probe_device(struct device *dev) @@ -4468,6 +4481,7 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) struct device_domain_info *info; struct intel_iommu *iommu; u8 bus, devfn; + int ret; iommu = device_to_iommu(dev, &bus, &devfn); if (!iommu || !iommu->iommu.ops) @@ -4492,9 +4506,10 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) if (dev_is_pci(dev)) { if (ecap_dev_iotlb_support(iommu->ecap) && pci_ats_supported(pdev) && - dmar_ats_supported(pdev, iommu)) + dmar_ats_supported(pdev, iommu)) { info->ats_supported = 1; - + info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev); + } if (sm_supported(iommu)) { if (pasid_supported(iommu)) { int features = pci_pasid_features(pdev); @@ -4511,6 +4526,16 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) dev_iommu_priv_set(dev, info); + if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { + ret = intel_pasid_alloc_table(dev); + if (ret) { + dev_err(dev, "PASID table allocation failed\n"); + dev_iommu_priv_set(dev, NULL); + kfree(info); + return ERR_PTR(ret); + } + } + return &iommu->iommu; } @@ -4519,6 +4544,7 @@ static void intel_iommu_release_device(struct device *dev) struct device_domain_info *info = dev_iommu_priv_get(dev); dmar_remove_one_dev_info(dev); + intel_pasid_free_table(dev); dev_iommu_priv_set(dev, NULL); kfree(info); set_dma_ops(dev, NULL); @@ -4712,6 +4738,28 @@ static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, __mapping_notify_one(info->iommu, dmar_domain, pfn, pages); } +static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) +{ + struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); + struct iommu_domain *domain; + + /* Domain type specific cleanup: */ + domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0); + if (domain) { + switch (domain->type) { + case IOMMU_DOMAIN_SVA: + intel_svm_remove_dev_pasid(dev, pasid); + break; + default: + /* should never reach here */ + WARN_ON(1); + break; + } + } + + intel_pasid_tear_down_entry(iommu, dev, pasid, false); +} + const struct iommu_ops intel_iommu_ops = { .capable = intel_iommu_capable, .domain_alloc = intel_iommu_domain_alloc, @@ -4724,16 +4772,13 @@ const struct iommu_ops intel_iommu_ops = { .dev_disable_feat = 
intel_iommu_dev_disable_feat, .is_attach_deferred = intel_iommu_is_attach_deferred, .def_domain_type = device_def_domain_type, + .remove_dev_pasid = intel_iommu_remove_dev_pasid, .pgsize_bitmap = SZ_4K, #ifdef CONFIG_INTEL_IOMMU_SVM - .sva_bind = intel_svm_bind, - .sva_unbind = intel_svm_unbind, - .sva_get_pasid = intel_svm_get_pasid, .page_response = intel_svm_page_response, #endif .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = intel_iommu_attach_device, - .detach_dev = intel_iommu_detach_device, .map_pages = intel_iommu_map_pages, .unmap_pages = intel_iommu_unmap_pages, .iotlb_sync_map = intel_iommu_iotlb_sync_map, @@ -4933,3 +4978,48 @@ static void __init check_tylersburg_isoch(void) pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", vtisochctrl); } + +/* + * Here we deal with a device TLB defect where device may inadvertently issue ATS + * invalidation completion before posted writes initiated with translated address + * that utilized translations matching the invalidation address range, violating + * the invalidation completion ordering. + * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is + * vulnerable to this defect. In other words, any dTLB invalidation initiated not + * under the control of the trusted/privileged host device driver must use this + * quirk. + * Device TLBs are invalidated under the following six conditions: + * 1. Device driver does DMA API unmap IOVA + * 2. Device driver unbind a PASID from a process, sva_unbind_device() + * 3. PASID is torn down, after PASID cache is flushed. e.g. process + * exit_mmap() due to crash + * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where + * VM has to free pages that were unmapped + * 5. Userspace driver unmaps a DMA buffer + * 6. Cache invalidation in vSVA usage (upcoming) + * + * For #1 and #2, device drivers are responsible for stopping DMA traffic + * before unmap/unbind. For #3, iommu driver gets mmu_notifier to + * invalidate TLB the same way as normal user unmap which will use this quirk. + * The dTLB invalidation after PASID cache flush does not need this quirk. + * + * As a reminder, #6 will *NEED* this quirk as we enable nested translation. + */ +void quirk_extra_dev_tlb_flush(struct device_domain_info *info, + unsigned long address, unsigned long mask, + u32 pasid, u16 qdep) +{ + u16 sid; + + if (likely(!info->dtlb_extra_inval)) + return; + + sid = PCI_DEVID(info->bus, info->devfn); + if (pasid == PASID_RID2PASID) { + qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, + qdep, address, mask); + } else { + qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid, + pasid, qdep, address, mask); + } +} diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 92023dff9513..06e61e474856 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -480,8 +480,6 @@ enum { #define VTD_FLAG_IRQ_REMAP_PRE_ENABLED (1 << 1) #define VTD_FLAG_SVM_CAPABLE (1 << 2) -extern int intel_iommu_sm; - #define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap)) #define pasid_supported(iommu) (sm_supported(iommu) && \ ecap_pasid((iommu)->ecap)) @@ -517,14 +515,6 @@ struct context_entry { u64 hi; }; -/* - * When VT-d works in the scalable mode, it allows DMA translation to - * happen through either first level or second level page table. This - * bit marks that the DMA translation for the domain goes through the - * first level page table, otherwise, it goes through the second level. 
- */ -#define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(1) - struct iommu_domain_info { struct intel_iommu *iommu; unsigned int refcnt; /* Refcount of devices per iommu */ @@ -541,6 +531,11 @@ struct dmar_domain { u8 iommu_coherency: 1; /* indicate coherency of iommu access */ u8 force_snooping : 1; /* Create IOPTEs with snoop control */ u8 set_pte_snp:1; + u8 use_first_level:1; /* DMA translation for the domain goes + * through the first level page table, + * otherwise, goes through the second + * level. + */ spinlock_t lock; /* Protect device tracking lists */ struct list_head devices; /* all devices' list */ @@ -550,8 +545,6 @@ struct dmar_domain { /* adjusted guest address width, 0 is level 2 30-bit */ int agaw; - - int flags; /* flags to find out type of domain */ int iommu_superpage;/* Level of superpages supported: 0 == 4KiB (no superpages), 1 == 2MiB, 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ @@ -600,7 +593,6 @@ struct intel_iommu { #ifdef CONFIG_IRQ_REMAP struct ir_table *ir_table; /* Interrupt remapping info */ struct irq_domain *ir_domain; - struct irq_domain *ir_msi_domain; #endif struct iommu_device iommu; /* IOMMU core code handle */ int node; @@ -623,6 +615,7 @@ struct device_domain_info { u8 pri_enabled:1; u8 ats_supported:1; u8 ats_enabled:1; + u8 dtlb_extra_inval:1; /* Quirk for devices need extra flush */ u8 ats_qdep; struct device *dev; /* it's NULL for PCIe-to-PCI bridge */ struct intel_iommu *iommu; /* IOMMU used by this device */ @@ -728,6 +721,9 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, u32 pasid, u16 qdep, u64 addr, unsigned int size_order); +void quirk_extra_dev_tlb_flush(struct device_domain_info *info, + unsigned long address, unsigned long pages, + u32 pasid, u16 qdep); void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu, u32 pasid); @@ -750,12 +746,10 @@ struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn); extern void intel_svm_check(struct intel_iommu *iommu); extern int intel_svm_enable_prq(struct intel_iommu *iommu); extern int intel_svm_finish_prq(struct intel_iommu *iommu); -struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, - void *drvdata); -void intel_svm_unbind(struct iommu_sva *handle); -u32 intel_svm_get_pasid(struct iommu_sva *handle); int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg); +struct iommu_domain *intel_svm_domain_alloc(void); +void intel_svm_remove_dev_pasid(struct device *dev, ioasid_t pasid); struct intel_svm_dev { struct list_head list; @@ -780,6 +774,14 @@ struct intel_svm { }; #else static inline void intel_svm_check(struct intel_iommu *iommu) {} +static inline struct iommu_domain *intel_svm_domain_alloc(void) +{ + return NULL; +} + +static inline void intel_svm_remove_dev_pasid(struct device *dev, ioasid_t pasid) +{ +} #endif #ifdef CONFIG_INTEL_IOMMU_DEBUGFS @@ -795,6 +797,7 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, extern const struct iommu_ops intel_iommu_ops; #ifdef CONFIG_INTEL_IOMMU +extern int intel_iommu_sm; extern int iommu_calculate_agaw(struct intel_iommu *iommu); extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu); extern int dmar_disabled; @@ -810,6 +813,7 @@ static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu) } #define dmar_disabled (1) #define intel_iommu_enabled (0) +#define intel_iommu_sm (0) #endif static inline const 
char *decode_prq_descriptor(char *str, size_t size, diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 5962bb5027d0..f58f5f57af78 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -82,6 +82,7 @@ static const struct irq_domain_ops intel_ir_domain_ops; static void iommu_disable_irq_remapping(struct intel_iommu *iommu); static int __init parse_ioapics_under_ir(void); +static const struct msi_parent_ops dmar_msi_parent_ops, virt_dmar_msi_parent_ops; static bool ir_pre_enabled(struct intel_iommu *iommu) { @@ -173,7 +174,6 @@ static int modify_irte(struct irq_2_iommu *irq_iommu, index = irq_iommu->irte_index + irq_iommu->sub_handle; irte = &iommu->ir_table->base[index]; -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) if ((irte->pst == 1) || (irte_modified->pst == 1)) { bool ret; @@ -187,11 +187,9 @@ static int modify_irte(struct irq_2_iommu *irq_iommu, * same as the old value. */ WARN_ON(!ret); - } else -#endif - { - set_64bit(&irte->low, irte_modified->low); - set_64bit(&irte->high, irte_modified->high); + } else { + WRITE_ONCE(irte->low, irte_modified->low); + WRITE_ONCE(irte->high, irte_modified->high); } __iommu_flush_cache(iommu, irte, sizeof(*irte)); @@ -230,7 +228,7 @@ static struct irq_domain *map_dev_to_ir(struct pci_dev *dev) { struct dmar_drhd_unit *drhd = dmar_find_matched_drhd_unit(dev); - return drhd ? drhd->iommu->ir_msi_domain : NULL; + return drhd ? drhd->iommu->ir_domain : NULL; } static int clear_entries(struct irq_2_iommu *irq_iommu) @@ -249,8 +247,8 @@ static int clear_entries(struct irq_2_iommu *irq_iommu) end = start + (1 << irq_iommu->irte_mask); for (entry = start; entry < end; entry++) { - set_64bit(&entry->low, 0); - set_64bit(&entry->high, 0); + WRITE_ONCE(entry->low, 0); + WRITE_ONCE(entry->high, 0); } bitmap_release_region(iommu->ir_table->bitmap, index, irq_iommu->irte_mask); @@ -573,10 +571,14 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id); goto out_free_fwnode; } - iommu->ir_msi_domain = - arch_create_remap_msi_irq_domain(iommu->ir_domain, - "INTEL-IR-MSI", - iommu->seq_id); + + irq_domain_update_bus_token(iommu->ir_domain, DOMAIN_BUS_DMAR); + iommu->ir_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT; + + if (cap_caching_mode(iommu->cap)) + iommu->ir_domain->msi_parent_ops = &virt_dmar_msi_parent_ops; + else + iommu->ir_domain->msi_parent_ops = &dmar_msi_parent_ops; ir_table->base = page_address(pages); ir_table->bitmap = bitmap; @@ -620,9 +622,6 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) return 0; out_free_ir_domain: - if (iommu->ir_msi_domain) - irq_domain_remove(iommu->ir_msi_domain); - iommu->ir_msi_domain = NULL; irq_domain_remove(iommu->ir_domain); iommu->ir_domain = NULL; out_free_fwnode: @@ -644,13 +643,6 @@ static void intel_teardown_irq_remapping(struct intel_iommu *iommu) struct fwnode_handle *fn; if (iommu && iommu->ir_table) { - if (iommu->ir_msi_domain) { - fn = iommu->ir_msi_domain->fwnode; - - irq_domain_remove(iommu->ir_msi_domain); - irq_domain_free_fwnode(fn); - iommu->ir_msi_domain = NULL; - } if (iommu->ir_domain) { fn = iommu->ir_domain->fwnode; @@ -1107,7 +1099,7 @@ error: */ void intel_irq_remap_add_device(struct dmar_pci_notify_info *info) { - if (!irq_remapping_enabled || pci_dev_has_special_msi_domain(info->dev)) + if (!irq_remapping_enabled || !pci_dev_has_default_msi_parent_domain(info->dev)) return; dev_set_msi_domain(&info->dev->dev, 
map_dev_to_ir(info->dev)); @@ -1334,17 +1326,9 @@ static int intel_irq_remapping_alloc(struct irq_domain *domain, if (!info || !iommu) return -EINVAL; - if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI && - info->type != X86_IRQ_ALLOC_TYPE_PCI_MSIX) + if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI) return -EINVAL; - /* - * With IRQ remapping enabled, don't need contiguous CPU vectors - * to support multiple MSI interrupts. - */ - if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI) - info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; - ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); if (ret < 0) return ret; @@ -1445,6 +1429,21 @@ static const struct irq_domain_ops intel_ir_domain_ops = { .deactivate = intel_irq_remapping_deactivate, }; +static const struct msi_parent_ops dmar_msi_parent_ops = { + .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | + MSI_FLAG_MULTI_PCI_MSI | + MSI_FLAG_PCI_IMS, + .prefix = "IR-", + .init_dev_msi_info = msi_parent_init_dev_msi_info, +}; + +static const struct msi_parent_ops virt_dmar_msi_parent_ops = { + .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | + MSI_FLAG_MULTI_PCI_MSI, + .prefix = "vIR-", + .init_dev_msi_info = msi_parent_init_dev_msi_info, +}; + /* * Support of Interrupt Remapping Unit Hotplug */ diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index c30ddac40ee5..fb3c7020028d 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -101,8 +101,10 @@ int intel_pasid_alloc_table(struct device *dev) might_sleep(); info = dev_iommu_priv_get(dev); - if (WARN_ON(!info || !dev_is_pci(dev) || info->pasid_table)) - return -EINVAL; + if (WARN_ON(!info || !dev_is_pci(dev))) + return -ENODEV; + if (WARN_ON(info->pasid_table)) + return -EEXIST; pasid_table = kzalloc(sizeof(*pasid_table), GFP_KERNEL); if (!pasid_table) @@ -642,7 +644,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, * Since it is a second level only translation setup, we should * set SRE bit as well (addresses are expected to be GPAs). */ - if (pasid != PASID_RID2PASID) + if (pasid != PASID_RID2PASID && ecap_srs(iommu->ecap)) pasid_set_sre(pte); pasid_set_present(pte); spin_unlock(&iommu->lock); @@ -685,7 +687,8 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, * We should set SRE bit as well since the addresses are expected * to be GPAs. */ - pasid_set_sre(pte); + if (ecap_srs(iommu->ecap)) + pasid_set_sre(pte); pasid_set_present(pte); spin_unlock(&iommu->lock); diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 7d08eb034f2d..c76b66263467 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -24,7 +24,7 @@ #include "iommu.h" #include "pasid.h" #include "perf.h" -#include "../iommu-sva-lib.h" +#include "../iommu-sva.h" #include "trace.h" static irqreturn_t prq_event_thread(int irq, void *d); @@ -184,10 +184,13 @@ static void __flush_svm_range_dev(struct intel_svm *svm, return; qi_flush_piotlb(sdev->iommu, sdev->did, svm->pasid, address, pages, ih); - if (info->ats_enabled) + if (info->ats_enabled) { qi_flush_dev_iotlb_pasid(sdev->iommu, sdev->sid, info->pfsid, svm->pasid, sdev->qdep, address, order_base_2(pages)); + quirk_extra_dev_tlb_flush(info, address, order_base_2(pages), + svm->pasid, sdev->qdep); + } } static void intel_flush_svm_range_dev(struct intel_svm *svm, @@ -296,19 +299,9 @@ out: return 0; } -static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm, - unsigned int flags) -{ - ioasid_t max_pasid = dev_is_pci(dev) ? 
- pci_max_pasids(to_pci_dev(dev)) : intel_pasid_max_id; - - return iommu_sva_alloc_pasid(mm, PASID_MIN, max_pasid - 1); -} - static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, struct device *dev, - struct mm_struct *mm, - unsigned int flags) + struct mm_struct *mm) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_svm_dev *sdev; @@ -324,22 +317,18 @@ static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, svm->pasid = mm->pasid; svm->mm = mm; - svm->flags = flags; INIT_LIST_HEAD_RCU(&svm->devs); - if (!(flags & SVM_FLAG_SUPERVISOR_MODE)) { - svm->notifier.ops = &intel_mmuops; - ret = mmu_notifier_register(&svm->notifier, mm); - if (ret) { - kfree(svm); - return ERR_PTR(ret); - } + svm->notifier.ops = &intel_mmuops; + ret = mmu_notifier_register(&svm->notifier, mm); + if (ret) { + kfree(svm); + return ERR_PTR(ret); } ret = pasid_private_add(svm->pasid, svm); if (ret) { - if (svm->notifier.ops) - mmu_notifier_unregister(&svm->notifier, mm); + mmu_notifier_unregister(&svm->notifier, mm); kfree(svm); return ERR_PTR(ret); } @@ -374,9 +363,7 @@ static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, } /* Setup the pasid table: */ - sflags = (flags & SVM_FLAG_SUPERVISOR_MODE) ? - PASID_FLAG_SUPERVISOR_MODE : 0; - sflags |= cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; + sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, mm->pasid, FLPT_DEFAULT_DID, sflags); if (ret) @@ -390,8 +377,7 @@ free_sdev: kfree(sdev); free_svm: if (list_empty(&svm->devs)) { - if (svm->notifier.ops) - mmu_notifier_unregister(&svm->notifier, mm); + mmu_notifier_unregister(&svm->notifier, mm); pasid_private_remove(mm->pasid); kfree(svm); } @@ -745,12 +731,16 @@ bad_req: * If prq is to be handled outside iommu driver via receiver of * the fault notifiers, we skip the page response here. 
*/ - if (!pdev || intel_svm_prq_report(iommu, &pdev->dev, req)) - handle_bad_prq_event(iommu, req, QI_RESP_INVALID); + if (!pdev) + goto bad_req; - trace_prq_report(iommu, &pdev->dev, req->qw_0, req->qw_1, - req->priv_data[0], req->priv_data[1], - iommu->prq_seq_number++); + if (intel_svm_prq_report(iommu, &pdev->dev, req)) + handle_bad_prq_event(iommu, req, QI_RESP_INVALID); + else + trace_prq_report(iommu, &pdev->dev, req->qw_0, req->qw_1, + req->priv_data[0], req->priv_data[1], + iommu->prq_seq_number++); + pci_dev_put(pdev); prq_advance: head = (head + sizeof(*req)) & PRQ_RING_MASK; } @@ -780,67 +770,6 @@ prq_advance: return IRQ_RETVAL(handled); } -struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata) -{ - struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); - unsigned int flags = 0; - struct iommu_sva *sva; - int ret; - - if (drvdata) - flags = *(unsigned int *)drvdata; - - if (flags & SVM_FLAG_SUPERVISOR_MODE) { - if (!ecap_srs(iommu->ecap)) { - dev_err(dev, "%s: Supervisor PASID not supported\n", - iommu->name); - return ERR_PTR(-EOPNOTSUPP); - } - - if (mm) { - dev_err(dev, "%s: Supervisor PASID with user provided mm\n", - iommu->name); - return ERR_PTR(-EINVAL); - } - - mm = &init_mm; - } - - mutex_lock(&pasid_mutex); - ret = intel_svm_alloc_pasid(dev, mm, flags); - if (ret) { - mutex_unlock(&pasid_mutex); - return ERR_PTR(ret); - } - - sva = intel_svm_bind_mm(iommu, dev, mm, flags); - mutex_unlock(&pasid_mutex); - - return sva; -} - -void intel_svm_unbind(struct iommu_sva *sva) -{ - struct intel_svm_dev *sdev = to_intel_svm_dev(sva); - - mutex_lock(&pasid_mutex); - intel_svm_unbind_mm(sdev->dev, sdev->pasid); - mutex_unlock(&pasid_mutex); -} - -u32 intel_svm_get_pasid(struct iommu_sva *sva) -{ - struct intel_svm_dev *sdev; - u32 pasid; - - mutex_lock(&pasid_mutex); - sdev = to_intel_svm_dev(sva); - pasid = sdev->pasid; - mutex_unlock(&pasid_mutex); - - return pasid; -} - int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg) @@ -911,3 +840,50 @@ int intel_svm_page_response(struct device *dev, out: return ret; } + +void intel_svm_remove_dev_pasid(struct device *dev, ioasid_t pasid) +{ + mutex_lock(&pasid_mutex); + intel_svm_unbind_mm(dev, pasid); + mutex_unlock(&pasid_mutex); +} + +static int intel_svm_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct mm_struct *mm = domain->mm; + struct iommu_sva *sva; + int ret = 0; + + mutex_lock(&pasid_mutex); + sva = intel_svm_bind_mm(iommu, dev, mm); + if (IS_ERR(sva)) + ret = PTR_ERR(sva); + mutex_unlock(&pasid_mutex); + + return ret; +} + +static void intel_svm_domain_free(struct iommu_domain *domain) +{ + kfree(to_dmar_domain(domain)); +} + +static const struct iommu_domain_ops intel_svm_domain_ops = { + .set_dev_pasid = intel_svm_set_dev_pasid, + .free = intel_svm_domain_free +}; + +struct iommu_domain *intel_svm_domain_alloc(void) +{ + struct dmar_domain *domain; + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return NULL; + domain->domain.ops = &intel_svm_domain_ops; + + return &domain->domain; +} diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 1df8c1dcae77..e5b8b9110c13 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -11,7 +11,7 @@ #include <linux/slab.h> #include <linux/workqueue.h> -#include "iommu-sva-lib.h" 
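
With intel_svm_bind()/intel_svm_unbind() gone and Intel SVA now exposed as a domain with set_dev_pasid(), a client driver is expected to go through the generic helpers only. A minimal sketch of that call sequence follows; the function name, the use of current->mm and the abbreviated error handling are illustrative, not taken from this series.

#include <linux/iommu.h>

/* Sketch: bind the current process to a PASID-capable device. */
static int example_enable_sva(struct device *dev)
{
	struct iommu_sva *handle;
	u32 pasid;
	int ret;

	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA);
	if (ret)
		return ret;

	/* No drvdata/flags argument; the supervisor-mode path is gone. */
	handle = iommu_sva_bind_device(dev, current->mm);
	if (IS_ERR(handle)) {
		iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA);
		return PTR_ERR(handle);
	}

	pasid = iommu_sva_get_pasid(handle);
	/* ... program pasid into the device, do work, then tear down ... */
	iommu_sva_unbind_device(handle);
	iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA);
	return 0;
}
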
+#include "iommu-sva.h" /** * struct iopf_queue - IO Page Fault queue @@ -69,69 +69,18 @@ static int iopf_complete_group(struct device *dev, struct iopf_fault *iopf, return iommu_page_response(dev, &resp); } -static enum iommu_page_response_code -iopf_handle_single(struct iopf_fault *iopf) -{ - vm_fault_t ret; - struct mm_struct *mm; - struct vm_area_struct *vma; - unsigned int access_flags = 0; - unsigned int fault_flags = FAULT_FLAG_REMOTE; - struct iommu_fault_page_request *prm = &iopf->fault.prm; - enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID; - - if (!(prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID)) - return status; - - mm = iommu_sva_find(prm->pasid); - if (IS_ERR_OR_NULL(mm)) - return status; - - mmap_read_lock(mm); - - vma = find_extend_vma(mm, prm->addr); - if (!vma) - /* Unmapped area */ - goto out_put_mm; - - if (prm->perm & IOMMU_FAULT_PERM_READ) - access_flags |= VM_READ; - - if (prm->perm & IOMMU_FAULT_PERM_WRITE) { - access_flags |= VM_WRITE; - fault_flags |= FAULT_FLAG_WRITE; - } - - if (prm->perm & IOMMU_FAULT_PERM_EXEC) { - access_flags |= VM_EXEC; - fault_flags |= FAULT_FLAG_INSTRUCTION; - } - - if (!(prm->perm & IOMMU_FAULT_PERM_PRIV)) - fault_flags |= FAULT_FLAG_USER; - - if (access_flags & ~vma->vm_flags) - /* Access fault */ - goto out_put_mm; - - ret = handle_mm_fault(vma, prm->addr, fault_flags, NULL); - status = ret & VM_FAULT_ERROR ? IOMMU_PAGE_RESP_INVALID : - IOMMU_PAGE_RESP_SUCCESS; - -out_put_mm: - mmap_read_unlock(mm); - mmput(mm); - - return status; -} - -static void iopf_handle_group(struct work_struct *work) +static void iopf_handler(struct work_struct *work) { struct iopf_group *group; + struct iommu_domain *domain; struct iopf_fault *iopf, *next; enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS; group = container_of(work, struct iopf_group, work); + domain = iommu_get_domain_for_dev_pasid(group->dev, + group->last_fault.fault.prm.pasid, 0); + if (!domain || !domain->iopf_handler) + status = IOMMU_PAGE_RESP_INVALID; list_for_each_entry_safe(iopf, next, &group->faults, list) { /* @@ -139,7 +88,8 @@ static void iopf_handle_group(struct work_struct *work) * faults in the group if there is an error. */ if (status == IOMMU_PAGE_RESP_SUCCESS) - status = iopf_handle_single(iopf); + status = domain->iopf_handler(&iopf->fault, + domain->fault_data); if (!(iopf->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) @@ -181,6 +131,13 @@ static void iopf_handle_group(struct work_struct *work) * request completes, outstanding faults will have been dealt with by the time * the PASID is freed. * + * Any valid page fault will be eventually routed to an iommu domain and the + * page fault handler installed there will get called. The users of this + * handling framework should guarantee that the iommu domain could only be + * freed after the device has stopped generating page faults (or the iommu + * hardware has been set to block the page faults) and the pending page faults + * have been flushed. + * * Return: 0 on success and <0 on error. 
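
The per-domain handler that iopf_handler() dispatches to has the same shape as iommu_sva_handle_iopf(); in this series only SVA domains install one, but the contract is generic. A purely illustrative handler, with the interpretation of the data cookie left to whoever owns the domain:

/* Sketch of a handler matching domain->iopf_handler / domain->fault_data. */
static enum iommu_page_response_code
example_iopf_handler(struct iommu_fault *fault, void *data)
{
	struct iommu_fault_page_request *prm = &fault->prm;

	if (!(prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID))
		return IOMMU_PAGE_RESP_INVALID;

	/* Resolve prm->addr for prm->pasid against the context in "data". */
	return IOMMU_PAGE_RESP_SUCCESS;
}
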
*/ int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) @@ -235,7 +192,7 @@ int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) group->last_fault.fault = *fault; INIT_LIST_HEAD(&group->faults); list_add(&group->last_fault.list, &group->faults); - INIT_WORK(&group->work, iopf_handle_group); + INIT_WORK(&group->work, iopf_handler); /* See if we have partial faults for this group */ list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index ba3115fd0f86..75f244a3e12d 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -564,8 +564,7 @@ static int arm_v7s_map_pages(struct io_pgtable_ops *ops, unsigned long iova, iova += pgsize; paddr += pgsize; - if (mapped) - *mapped += pgsize; + *mapped += pgsize; } /* * Synchronise all PTE updates for the new mapping before there's @@ -576,12 +575,6 @@ static int arm_v7s_map_pages(struct io_pgtable_ops *ops, unsigned long iova, return ret; } -static int arm_v7s_map(struct io_pgtable_ops *ops, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) -{ - return arm_v7s_map_pages(ops, iova, paddr, size, 1, prot, gfp, NULL); -} - static void arm_v7s_free_pgtable(struct io_pgtable *iop) { struct arm_v7s_io_pgtable *data = io_pgtable_to_data(iop); @@ -764,12 +757,6 @@ static size_t arm_v7s_unmap_pages(struct io_pgtable_ops *ops, unsigned long iova return unmapped; } -static size_t arm_v7s_unmap(struct io_pgtable_ops *ops, unsigned long iova, - size_t size, struct iommu_iotlb_gather *gather) -{ - return arm_v7s_unmap_pages(ops, iova, size, 1, gather); -} - static phys_addr_t arm_v7s_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova) { @@ -842,9 +829,7 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg, goto out_free_data; data->iop.ops = (struct io_pgtable_ops) { - .map = arm_v7s_map, .map_pages = arm_v7s_map_pages, - .unmap = arm_v7s_unmap, .unmap_pages = arm_v7s_unmap_pages, .iova_to_phys = arm_v7s_iova_to_phys, }; @@ -954,6 +939,7 @@ static int __init arm_v7s_do_selftests(void) }; unsigned int iova, size, iova_start; unsigned int i, loopnr = 0; + size_t mapped; selftest_running = true; @@ -984,15 +970,16 @@ static int __init arm_v7s_do_selftests(void) iova = 0; for_each_set_bit(i, &cfg.pgsize_bitmap, BITS_PER_LONG) { size = 1UL << i; - if (ops->map(ops, iova, iova, size, IOMMU_READ | - IOMMU_WRITE | - IOMMU_NOEXEC | - IOMMU_CACHE, GFP_KERNEL)) + if (ops->map_pages(ops, iova, iova, size, 1, + IOMMU_READ | IOMMU_WRITE | + IOMMU_NOEXEC | IOMMU_CACHE, + GFP_KERNEL, &mapped)) return __FAIL(ops); /* Overlapping mappings */ - if (!ops->map(ops, iova, iova + size, size, - IOMMU_READ | IOMMU_NOEXEC, GFP_KERNEL)) + if (!ops->map_pages(ops, iova, iova + size, size, 1, + IOMMU_READ | IOMMU_NOEXEC, GFP_KERNEL, + &mapped)) return __FAIL(ops); if (ops->iova_to_phys(ops, iova + 42) != (iova + 42)) @@ -1007,11 +994,12 @@ static int __init arm_v7s_do_selftests(void) size = 1UL << __ffs(cfg.pgsize_bitmap); while (i < loopnr) { iova_start = i * SZ_16M; - if (ops->unmap(ops, iova_start + size, size, NULL) != size) + if (ops->unmap_pages(ops, iova_start + size, size, 1, NULL) != size) return __FAIL(ops); /* Remap of partial unmap */ - if (ops->map(ops, iova_start + size, size, size, IOMMU_READ, GFP_KERNEL)) + if (ops->map_pages(ops, iova_start + size, size, size, 1, + IOMMU_READ, GFP_KERNEL, &mapped)) return __FAIL(ops); if (ops->iova_to_phys(ops, iova_start + size + 42) @@ 
-1025,14 +1013,15 @@ static int __init arm_v7s_do_selftests(void) for_each_set_bit(i, &cfg.pgsize_bitmap, BITS_PER_LONG) { size = 1UL << i; - if (ops->unmap(ops, iova, size, NULL) != size) + if (ops->unmap_pages(ops, iova, size, 1, NULL) != size) return __FAIL(ops); if (ops->iova_to_phys(ops, iova + 42)) return __FAIL(ops); /* Remap full block */ - if (ops->map(ops, iova, iova, size, IOMMU_WRITE, GFP_KERNEL)) + if (ops->map_pages(ops, iova, iova, size, 1, IOMMU_WRITE, + GFP_KERNEL, &mapped)) return __FAIL(ops); if (ops->iova_to_phys(ops, iova + 42) != (iova + 42)) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 0ba817e86346..72dcdd468cf3 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -360,7 +360,7 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova, max_entries = ARM_LPAE_PTES_PER_TABLE(data) - map_idx_start; num_entries = min_t(int, pgcount, max_entries); ret = arm_lpae_init_pte(data, iova, paddr, prot, lvl, num_entries, ptep); - if (!ret && mapped) + if (!ret) *mapped += num_entries * size; return ret; @@ -496,13 +496,6 @@ static int arm_lpae_map_pages(struct io_pgtable_ops *ops, unsigned long iova, return ret; } -static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova, - phys_addr_t paddr, size_t size, int iommu_prot, gfp_t gfp) -{ - return arm_lpae_map_pages(ops, iova, paddr, size, 1, iommu_prot, gfp, - NULL); -} - static void __arm_lpae_free_pgtable(struct arm_lpae_io_pgtable *data, int lvl, arm_lpae_iopte *ptep) { @@ -682,12 +675,6 @@ static size_t arm_lpae_unmap_pages(struct io_pgtable_ops *ops, unsigned long iov data->start_level, ptep); } -static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova, - size_t size, struct iommu_iotlb_gather *gather) -{ - return arm_lpae_unmap_pages(ops, iova, size, 1, gather); -} - static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova) { @@ -799,9 +786,7 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg) data->pgd_bits = va_bits - (data->bits_per_level * (levels - 1)); data->iop.ops = (struct io_pgtable_ops) { - .map = arm_lpae_map, .map_pages = arm_lpae_map_pages, - .unmap = arm_lpae_unmap, .unmap_pages = arm_lpae_unmap_pages, .iova_to_phys = arm_lpae_iova_to_phys, }; @@ -1176,7 +1161,7 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg) int i, j; unsigned long iova; - size_t size; + size_t size, mapped; struct io_pgtable_ops *ops; selftest_running = true; @@ -1209,15 +1194,16 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg) for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { size = 1UL << j; - if (ops->map(ops, iova, iova, size, IOMMU_READ | - IOMMU_WRITE | - IOMMU_NOEXEC | - IOMMU_CACHE, GFP_KERNEL)) + if (ops->map_pages(ops, iova, iova, size, 1, + IOMMU_READ | IOMMU_WRITE | + IOMMU_NOEXEC | IOMMU_CACHE, + GFP_KERNEL, &mapped)) return __FAIL(ops, i); /* Overlapping mappings */ - if (!ops->map(ops, iova, iova + size, size, - IOMMU_READ | IOMMU_NOEXEC, GFP_KERNEL)) + if (!ops->map_pages(ops, iova, iova + size, size, 1, + IOMMU_READ | IOMMU_NOEXEC, + GFP_KERNEL, &mapped)) return __FAIL(ops, i); if (ops->iova_to_phys(ops, iova + 42) != (iova + 42)) @@ -1228,11 +1214,12 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg) /* Partial unmap */ size = 1UL << __ffs(cfg->pgsize_bitmap); - if (ops->unmap(ops, SZ_1G + size, size, NULL) != size) + if (ops->unmap_pages(ops, SZ_1G + size, size, 1, NULL) != size) return __FAIL(ops, i); /* 
Remap of partial unmap */ - if (ops->map(ops, SZ_1G + size, size, size, IOMMU_READ, GFP_KERNEL)) + if (ops->map_pages(ops, SZ_1G + size, size, size, 1, + IOMMU_READ, GFP_KERNEL, &mapped)) return __FAIL(ops, i); if (ops->iova_to_phys(ops, SZ_1G + size + 42) != (size + 42)) @@ -1243,14 +1230,15 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg) for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { size = 1UL << j; - if (ops->unmap(ops, iova, size, NULL) != size) + if (ops->unmap_pages(ops, iova, size, 1, NULL) != size) return __FAIL(ops, i); if (ops->iova_to_phys(ops, iova + 42)) return __FAIL(ops, i); /* Remap full block */ - if (ops->map(ops, iova, iova, size, IOMMU_WRITE, GFP_KERNEL)) + if (ops->map_pages(ops, iova, iova, size, 1, + IOMMU_WRITE, GFP_KERNEL, &mapped)) return __FAIL(ops, i); if (ops->iova_to_phys(ops, iova + 42) != (iova + 42)) diff --git a/drivers/iommu/iommu-sva-lib.c b/drivers/iommu/iommu-sva-lib.c deleted file mode 100644 index 106506143896..000000000000 --- a/drivers/iommu/iommu-sva-lib.c +++ /dev/null @@ -1,71 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Helpers for IOMMU drivers implementing SVA - */ -#include <linux/mutex.h> -#include <linux/sched/mm.h> - -#include "iommu-sva-lib.h" - -static DEFINE_MUTEX(iommu_sva_lock); -static DECLARE_IOASID_SET(iommu_sva_pasid); - -/** - * iommu_sva_alloc_pasid - Allocate a PASID for the mm - * @mm: the mm - * @min: minimum PASID value (inclusive) - * @max: maximum PASID value (inclusive) - * - * Try to allocate a PASID for this mm, or take a reference to the existing one - * provided it fits within the [@min, @max] range. On success the PASID is - * available in mm->pasid and will be available for the lifetime of the mm. - * - * Returns 0 on success and < 0 on error. - */ -int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max) -{ - int ret = 0; - ioasid_t pasid; - - if (min == INVALID_IOASID || max == INVALID_IOASID || - min == 0 || max < min) - return -EINVAL; - - mutex_lock(&iommu_sva_lock); - /* Is a PASID already associated with this mm? */ - if (pasid_valid(mm->pasid)) { - if (mm->pasid < min || mm->pasid >= max) - ret = -EOVERFLOW; - goto out; - } - - pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm); - if (!pasid_valid(pasid)) - ret = -ENOMEM; - else - mm_pasid_set(mm, pasid); -out: - mutex_unlock(&iommu_sva_lock); - return ret; -} -EXPORT_SYMBOL_GPL(iommu_sva_alloc_pasid); - -/* ioasid_find getter() requires a void * argument */ -static bool __mmget_not_zero(void *mm) -{ - return mmget_not_zero(mm); -} - -/** - * iommu_sva_find() - Find mm associated to the given PASID - * @pasid: Process Address Space ID assigned to the mm - * - * On success a reference to the mm is taken, and must be released with mmput(). - * - * Returns the mm corresponding to this PASID, or an error if not found. 
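
The selftest conversions above show the shape of the change for io-pgtable users: the single-entry ->map()/->unmap() callbacks are dropped and everything goes through the page-count interface, with the mapped size now unconditionally reported through the out-parameter. A condensed sketch of the equivalent single-page call, with arbitrary size and permission values:

/* Sketch: one-page map/unmap via the remaining *_pages ops. */
static int example_map_one(struct io_pgtable_ops *ops, unsigned long iova,
			   phys_addr_t paddr)
{
	size_t mapped = 0;
	int ret;

	/* pgsize = SZ_4K, pgcount = 1: what ops->map() used to do. */
	ret = ops->map_pages(ops, iova, paddr, SZ_4K, 1,
			     IOMMU_READ | IOMMU_WRITE, GFP_KERNEL, &mapped);
	if (ret)
		return ret;

	/* Likewise for the old ops->unmap(). */
	if (ops->unmap_pages(ops, iova, SZ_4K, 1, NULL) != SZ_4K)
		return -EIO;
	return 0;
}
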
- */ -struct mm_struct *iommu_sva_find(ioasid_t pasid) -{ - return ioasid_find(&iommu_sva_pasid, pasid, __mmget_not_zero); -} -EXPORT_SYMBOL_GPL(iommu_sva_find); diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c new file mode 100644 index 000000000000..24bf9b2b58aa --- /dev/null +++ b/drivers/iommu/iommu-sva.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Helpers for IOMMU drivers implementing SVA + */ +#include <linux/mutex.h> +#include <linux/sched/mm.h> +#include <linux/iommu.h> + +#include "iommu-sva.h" + +static DEFINE_MUTEX(iommu_sva_lock); +static DECLARE_IOASID_SET(iommu_sva_pasid); + +/** + * iommu_sva_alloc_pasid - Allocate a PASID for the mm + * @mm: the mm + * @min: minimum PASID value (inclusive) + * @max: maximum PASID value (inclusive) + * + * Try to allocate a PASID for this mm, or take a reference to the existing one + * provided it fits within the [@min, @max] range. On success the PASID is + * available in mm->pasid and will be available for the lifetime of the mm. + * + * Returns 0 on success and < 0 on error. + */ +int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max) +{ + int ret = 0; + ioasid_t pasid; + + if (min == INVALID_IOASID || max == INVALID_IOASID || + min == 0 || max < min) + return -EINVAL; + + mutex_lock(&iommu_sva_lock); + /* Is a PASID already associated with this mm? */ + if (pasid_valid(mm->pasid)) { + if (mm->pasid < min || mm->pasid >= max) + ret = -EOVERFLOW; + goto out; + } + + pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm); + if (!pasid_valid(pasid)) + ret = -ENOMEM; + else + mm_pasid_set(mm, pasid); +out: + mutex_unlock(&iommu_sva_lock); + return ret; +} +EXPORT_SYMBOL_GPL(iommu_sva_alloc_pasid); + +/* ioasid_find getter() requires a void * argument */ +static bool __mmget_not_zero(void *mm) +{ + return mmget_not_zero(mm); +} + +/** + * iommu_sva_find() - Find mm associated to the given PASID + * @pasid: Process Address Space ID assigned to the mm + * + * On success a reference to the mm is taken, and must be released with mmput(). + * + * Returns the mm corresponding to this PASID, or an error if not found. + */ +struct mm_struct *iommu_sva_find(ioasid_t pasid) +{ + return ioasid_find(&iommu_sva_pasid, pasid, __mmget_not_zero); +} +EXPORT_SYMBOL_GPL(iommu_sva_find); + +/** + * iommu_sva_bind_device() - Bind a process address space to a device + * @dev: the device + * @mm: the mm to bind, caller must hold a reference to mm_users + * + * Create a bond between device and address space, allowing the device to + * access the mm using the PASID returned by iommu_sva_get_pasid(). If a + * bond already exists between @device and @mm, an additional internal + * reference is taken. Caller must call iommu_sva_unbind_device() + * to release each reference. + * + * iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA) must be called first, to + * initialize the required SVA features. + * + * On error, returns an ERR_PTR value. + */ +struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) +{ + struct iommu_domain *domain; + struct iommu_sva *handle; + ioasid_t max_pasids; + int ret; + + max_pasids = dev->iommu->max_pasids; + if (!max_pasids) + return ERR_PTR(-EOPNOTSUPP); + + /* Allocate mm->pasid if necessary. */ + ret = iommu_sva_alloc_pasid(mm, 1, max_pasids - 1); + if (ret) + return ERR_PTR(ret); + + handle = kzalloc(sizeof(*handle), GFP_KERNEL); + if (!handle) + return ERR_PTR(-ENOMEM); + + mutex_lock(&iommu_sva_lock); + /* Search for an existing domain. 
*/ + domain = iommu_get_domain_for_dev_pasid(dev, mm->pasid, + IOMMU_DOMAIN_SVA); + if (IS_ERR(domain)) { + ret = PTR_ERR(domain); + goto out_unlock; + } + + if (domain) { + domain->users++; + goto out; + } + + /* Allocate a new domain and set it on device pasid. */ + domain = iommu_sva_domain_alloc(dev, mm); + if (!domain) { + ret = -ENOMEM; + goto out_unlock; + } + + ret = iommu_attach_device_pasid(domain, dev, mm->pasid); + if (ret) + goto out_free_domain; + domain->users = 1; +out: + mutex_unlock(&iommu_sva_lock); + handle->dev = dev; + handle->domain = domain; + + return handle; + +out_free_domain: + iommu_domain_free(domain); +out_unlock: + mutex_unlock(&iommu_sva_lock); + kfree(handle); + + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(iommu_sva_bind_device); + +/** + * iommu_sva_unbind_device() - Remove a bond created with iommu_sva_bind_device + * @handle: the handle returned by iommu_sva_bind_device() + * + * Put reference to a bond between device and address space. The device should + * not be issuing any more transaction for this PASID. All outstanding page + * requests for this PASID must have been flushed to the IOMMU. + */ +void iommu_sva_unbind_device(struct iommu_sva *handle) +{ + struct iommu_domain *domain = handle->domain; + ioasid_t pasid = domain->mm->pasid; + struct device *dev = handle->dev; + + mutex_lock(&iommu_sva_lock); + if (--domain->users == 0) { + iommu_detach_device_pasid(domain, dev, pasid); + iommu_domain_free(domain); + } + mutex_unlock(&iommu_sva_lock); + kfree(handle); +} +EXPORT_SYMBOL_GPL(iommu_sva_unbind_device); + +u32 iommu_sva_get_pasid(struct iommu_sva *handle) +{ + struct iommu_domain *domain = handle->domain; + + return domain->mm->pasid; +} +EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); + +/* + * I/O page fault handler for SVA + */ +enum iommu_page_response_code +iommu_sva_handle_iopf(struct iommu_fault *fault, void *data) +{ + vm_fault_t ret; + struct vm_area_struct *vma; + struct mm_struct *mm = data; + unsigned int access_flags = 0; + unsigned int fault_flags = FAULT_FLAG_REMOTE; + struct iommu_fault_page_request *prm = &fault->prm; + enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID; + + if (!(prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID)) + return status; + + if (!mmget_not_zero(mm)) + return status; + + mmap_read_lock(mm); + + vma = find_extend_vma(mm, prm->addr); + if (!vma) + /* Unmapped area */ + goto out_put_mm; + + if (prm->perm & IOMMU_FAULT_PERM_READ) + access_flags |= VM_READ; + + if (prm->perm & IOMMU_FAULT_PERM_WRITE) { + access_flags |= VM_WRITE; + fault_flags |= FAULT_FLAG_WRITE; + } + + if (prm->perm & IOMMU_FAULT_PERM_EXEC) { + access_flags |= VM_EXEC; + fault_flags |= FAULT_FLAG_INSTRUCTION; + } + + if (!(prm->perm & IOMMU_FAULT_PERM_PRIV)) + fault_flags |= FAULT_FLAG_USER; + + if (access_flags & ~vma->vm_flags) + /* Access fault */ + goto out_put_mm; + + ret = handle_mm_fault(vma, prm->addr, fault_flags, NULL); + status = ret & VM_FAULT_ERROR ? 
IOMMU_PAGE_RESP_INVALID : + IOMMU_PAGE_RESP_SUCCESS; + +out_put_mm: + mmap_read_unlock(mm); + mmput(mm); + + return status; +} diff --git a/drivers/iommu/iommu-sva-lib.h b/drivers/iommu/iommu-sva.h index 8909ea1094e3..7215a761b962 100644 --- a/drivers/iommu/iommu-sva-lib.h +++ b/drivers/iommu/iommu-sva.h @@ -2,8 +2,8 @@ /* * SVA library for IOMMU drivers */ -#ifndef _IOMMU_SVA_LIB_H -#define _IOMMU_SVA_LIB_H +#ifndef _IOMMU_SVA_H +#define _IOMMU_SVA_H #include <linux/ioasid.h> #include <linux/mm_types.h> @@ -26,6 +26,8 @@ int iopf_queue_flush_dev(struct device *dev); struct iopf_queue *iopf_queue_alloc(const char *name); void iopf_queue_free(struct iopf_queue *queue); int iopf_queue_discard_partial(struct iopf_queue *queue); +enum iommu_page_response_code +iommu_sva_handle_iopf(struct iommu_fault *fault, void *data); #else /* CONFIG_IOMMU_SVA */ static inline int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) @@ -63,5 +65,11 @@ static inline int iopf_queue_discard_partial(struct iopf_queue *queue) { return -ENODEV; } + +static inline enum iommu_page_response_code +iommu_sva_handle_iopf(struct iommu_fault *fault, void *data) +{ + return IOMMU_PAGE_RESP_INVALID; +} #endif /* CONFIG_IOMMU_SVA */ -#endif /* _IOMMU_SVA_LIB_H */ +#endif /* _IOMMU_SVA_H */ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 65a3b3d886dc..de91dd88705b 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -21,6 +21,7 @@ #include <linux/idr.h> #include <linux/err.h> #include <linux/pci.h> +#include <linux/pci-ats.h> #include <linux/bitops.h> #include <linux/platform_device.h> #include <linux/property.h> @@ -28,9 +29,12 @@ #include <linux/module.h> #include <linux/cc_platform.h> #include <trace/events/iommu.h> +#include <linux/sched/mm.h> #include "dma-iommu.h" +#include "iommu-sva.h" + static struct kset *iommu_group_kset; static DEFINE_IDA(iommu_group_ida); @@ -42,6 +46,7 @@ struct iommu_group { struct kobject kobj; struct kobject *devices_kobj; struct list_head devices; + struct xarray pasid_array; struct mutex mutex; void *iommu_data; void (*iommu_data_release)(void *iommu_data); @@ -278,18 +283,46 @@ static void dev_iommu_free(struct device *dev) kfree(param); } +static u32 dev_iommu_get_max_pasids(struct device *dev) +{ + u32 max_pasids = 0, bits = 0; + int ret; + + if (dev_is_pci(dev)) { + ret = pci_max_pasids(to_pci_dev(dev)); + if (ret > 0) + max_pasids = ret; + } else { + ret = device_property_read_u32(dev, "pasid-num-bits", &bits); + if (!ret) + max_pasids = 1UL << bits; + } + + return min_t(u32, max_pasids, dev->iommu->iommu_dev->max_pasids); +} + static int __iommu_probe_device(struct device *dev, struct list_head *group_list) { const struct iommu_ops *ops = dev->bus->iommu_ops; struct iommu_device *iommu_dev; struct iommu_group *group; + static DEFINE_MUTEX(iommu_probe_device_lock); int ret; if (!ops) return -ENODEV; - - if (!dev_iommu_get(dev)) - return -ENOMEM; + /* + * Serialise to avoid races between IOMMU drivers registering in + * parallel and/or the "replay" calls from ACPI/OF code via client + * driver probe. Once the latter have been cleaned up we should + * probably be able to use device_lock() here to minimise the scope, + * but for now enforcing a simple global ordering is fine. 
+ */ + mutex_lock(&iommu_probe_device_lock); + if (!dev_iommu_get(dev)) { + ret = -ENOMEM; + goto err_unlock; + } if (!try_module_get(ops->owner)) { ret = -EINVAL; @@ -303,17 +336,21 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list } dev->iommu->iommu_dev = iommu_dev; + dev->iommu->max_pasids = dev_iommu_get_max_pasids(dev); group = iommu_group_get_for_dev(dev); if (IS_ERR(group)) { ret = PTR_ERR(group); goto out_release; } - iommu_group_put(group); + mutex_lock(&group->mutex); if (group_list && !group->default_domain && list_empty(&group->entry)) list_add_tail(&group->entry, group_list); + mutex_unlock(&group->mutex); + iommu_group_put(group); + mutex_unlock(&iommu_probe_device_lock); iommu_device_link(iommu_dev, dev); return 0; @@ -328,6 +365,9 @@ out_module_put: err_free: dev_iommu_free(dev); +err_unlock: + mutex_unlock(&iommu_probe_device_lock); + return ret; } @@ -703,6 +743,7 @@ struct iommu_group *iommu_group_alloc(void) mutex_init(&group->mutex); INIT_LIST_HEAD(&group->devices); INIT_LIST_HEAD(&group->entry); + xa_init(&group->pasid_array); ret = ida_alloc(&iommu_group_ida, GFP_KERNEL); if (ret < 0) { @@ -1799,11 +1840,11 @@ int bus_iommu_probe(struct bus_type *bus) return ret; list_for_each_entry_safe(group, next, &group_list, entry) { + mutex_lock(&group->mutex); + /* Remove item from the list */ list_del_init(&group->entry); - mutex_lock(&group->mutex); - /* Try to allocate default domain */ probe_alloc_default_domain(bus, group); @@ -1912,6 +1953,8 @@ EXPORT_SYMBOL_GPL(iommu_domain_alloc); void iommu_domain_free(struct iommu_domain *domain) { + if (domain->type == IOMMU_DOMAIN_SVA) + mmdrop(domain->mm); iommu_put_dma_cookie(domain); domain->ops->free(domain); } @@ -1949,6 +1992,18 @@ static int __iommu_attach_device(struct iommu_domain *domain, return ret; } +/** + * iommu_attach_device - Attach an IOMMU domain to a device + * @domain: IOMMU domain to attach + * @dev: Device that will be attached + * + * Returns 0 on success and error code on failure + * + * Note that EINVAL can be treated as a soft failure, indicating + * that certain configuration of the domain is incompatible with + * the device. In this case attaching a different domain to the + * device may succeed. + */ int iommu_attach_device(struct iommu_domain *domain, struct device *dev) { struct iommu_group *group; @@ -2075,6 +2130,18 @@ static int __iommu_attach_group(struct iommu_domain *domain, return ret; } +/** + * iommu_attach_group - Attach an IOMMU domain to an IOMMU group + * @domain: IOMMU domain to attach + * @group: IOMMU group that will be attached + * + * Returns 0 on success and error code on failure + * + * Note that EINVAL can be treated as a soft failure, indicating + * that certain configuration of the domain is incompatible with + * the group. In this case attaching a different domain to the + * group may succeed. + */ int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) { int ret; @@ -2726,98 +2793,6 @@ int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat) } EXPORT_SYMBOL_GPL(iommu_dev_disable_feature); -/** - * iommu_sva_bind_device() - Bind a process address space to a device - * @dev: the device - * @mm: the mm to bind, caller must hold a reference to it - * @drvdata: opaque data pointer to pass to bind callback - * - * Create a bond between device and address space, allowing the device to access - * the mm using the returned PASID. 
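
The -EINVAL "soft failure" convention documented above for iommu_attach_device() and iommu_attach_group() is meant for callers that can retry with another domain. A schematic consumer, with both domains assumed to have been allocated elsewhere:

/* Sketch: treat -EINVAL as "incompatible domain", not a hard error. */
static int example_attach(struct iommu_domain *preferred,
			  struct iommu_domain *fallback, struct device *dev)
{
	int ret;

	ret = iommu_attach_device(preferred, dev);
	if (ret != -EINVAL)
		return ret;	/* success, or a hard failure */

	/* Soft failure: this domain and device don't fit; try the fallback. */
	return iommu_attach_device(fallback, dev);
}
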
If a bond already exists between @device and - * @mm, it is returned and an additional reference is taken. Caller must call - * iommu_sva_unbind_device() to release each reference. - * - * iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA) must be called first, to - * initialize the required SVA features. - * - * On error, returns an ERR_PTR value. - */ -struct iommu_sva * -iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, void *drvdata) -{ - struct iommu_group *group; - struct iommu_sva *handle = ERR_PTR(-EINVAL); - const struct iommu_ops *ops = dev_iommu_ops(dev); - - if (!ops->sva_bind) - return ERR_PTR(-ENODEV); - - group = iommu_group_get(dev); - if (!group) - return ERR_PTR(-ENODEV); - - /* Ensure device count and domain don't change while we're binding */ - mutex_lock(&group->mutex); - - /* - * To keep things simple, SVA currently doesn't support IOMMU groups - * with more than one device. Existing SVA-capable systems are not - * affected by the problems that required IOMMU groups (lack of ACS - * isolation, device ID aliasing and other hardware issues). - */ - if (iommu_group_device_count(group) != 1) - goto out_unlock; - - handle = ops->sva_bind(dev, mm, drvdata); - -out_unlock: - mutex_unlock(&group->mutex); - iommu_group_put(group); - - return handle; -} -EXPORT_SYMBOL_GPL(iommu_sva_bind_device); - -/** - * iommu_sva_unbind_device() - Remove a bond created with iommu_sva_bind_device - * @handle: the handle returned by iommu_sva_bind_device() - * - * Put reference to a bond between device and address space. The device should - * not be issuing any more transaction for this PASID. All outstanding page - * requests for this PASID must have been flushed to the IOMMU. - */ -void iommu_sva_unbind_device(struct iommu_sva *handle) -{ - struct iommu_group *group; - struct device *dev = handle->dev; - const struct iommu_ops *ops = dev_iommu_ops(dev); - - if (!ops->sva_unbind) - return; - - group = iommu_group_get(dev); - if (!group) - return; - - mutex_lock(&group->mutex); - ops->sva_unbind(handle); - mutex_unlock(&group->mutex); - - iommu_group_put(group); -} -EXPORT_SYMBOL_GPL(iommu_sva_unbind_device); - -u32 iommu_sva_get_pasid(struct iommu_sva *handle) -{ - const struct iommu_ops *ops = dev_iommu_ops(handle->dev); - - if (!ops->sva_get_pasid) - return IOMMU_PASID_INVALID; - - return ops->sva_get_pasid(handle); -} -EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); - /* * Changes the default domain of an iommu group that has *only* one device * @@ -3087,7 +3062,8 @@ int iommu_device_use_default_domain(struct device *dev) mutex_lock(&group->mutex); if (group->owner_cnt) { - if (group->owner || !iommu_is_default_domain(group)) { + if (group->owner || !iommu_is_default_domain(group) || + !xa_empty(&group->pasid_array)) { ret = -EBUSY; goto unlock_out; } @@ -3118,7 +3094,7 @@ void iommu_device_unuse_default_domain(struct device *dev) return; mutex_lock(&group->mutex); - if (!WARN_ON(!group->owner_cnt)) + if (!WARN_ON(!group->owner_cnt || !xa_empty(&group->pasid_array))) group->owner_cnt--; mutex_unlock(&group->mutex); @@ -3148,40 +3124,49 @@ static int __iommu_group_alloc_blocking_domain(struct iommu_group *group) return 0; } +static int __iommu_take_dma_ownership(struct iommu_group *group, void *owner) +{ + int ret; + + if ((group->domain && group->domain != group->default_domain) || + !xa_empty(&group->pasid_array)) + return -EBUSY; + + ret = __iommu_group_alloc_blocking_domain(group); + if (ret) + return ret; + ret = __iommu_group_set_domain(group, group->blocking_domain); + if (ret) 
+ return ret; + + group->owner = owner; + group->owner_cnt++; + return 0; +} + /** * iommu_group_claim_dma_owner() - Set DMA ownership of a group * @group: The group. * @owner: Caller specified pointer. Used for exclusive ownership. * - * This is to support backward compatibility for vfio which manages - * the dma ownership in iommu_group level. New invocations on this - * interface should be prohibited. + * This is to support backward compatibility for vfio which manages the dma + * ownership in iommu_group level. New invocations on this interface should be + * prohibited. Only a single owner may exist for a group. */ int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner) { int ret = 0; + if (WARN_ON(!owner)) + return -EINVAL; + mutex_lock(&group->mutex); if (group->owner_cnt) { ret = -EPERM; goto unlock_out; - } else { - if (group->domain && group->domain != group->default_domain) { - ret = -EBUSY; - goto unlock_out; - } - - ret = __iommu_group_alloc_blocking_domain(group); - if (ret) - goto unlock_out; - - ret = __iommu_group_set_domain(group, group->blocking_domain); - if (ret) - goto unlock_out; - group->owner = owner; } - group->owner_cnt++; + ret = __iommu_take_dma_ownership(group, owner); unlock_out: mutex_unlock(&group->mutex); @@ -3190,30 +3175,92 @@ unlock_out: EXPORT_SYMBOL_GPL(iommu_group_claim_dma_owner); /** - * iommu_group_release_dma_owner() - Release DMA ownership of a group - * @group: The group. + * iommu_device_claim_dma_owner() - Set DMA ownership of a device + * @dev: The device. + * @owner: Caller specified pointer. Used for exclusive ownership. * - * Release the DMA ownership claimed by iommu_group_claim_dma_owner(). + * Claim the DMA ownership of a device. Multiple devices in the same group may + * concurrently claim ownership if they present the same owner value. Returns 0 + * on success and error code on failure */ -void iommu_group_release_dma_owner(struct iommu_group *group) +int iommu_device_claim_dma_owner(struct device *dev, void *owner) { - int ret; + struct iommu_group *group = iommu_group_get(dev); + int ret = 0; + + if (!group) + return -ENODEV; + if (WARN_ON(!owner)) + return -EINVAL; mutex_lock(&group->mutex); - if (WARN_ON(!group->owner_cnt || !group->owner)) + if (group->owner_cnt) { + if (group->owner != owner) { + ret = -EPERM; + goto unlock_out; + } + group->owner_cnt++; goto unlock_out; + } + + ret = __iommu_take_dma_ownership(group, owner); +unlock_out: + mutex_unlock(&group->mutex); + iommu_group_put(group); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_device_claim_dma_owner); + +static void __iommu_release_dma_ownership(struct iommu_group *group) +{ + int ret; + + if (WARN_ON(!group->owner_cnt || !group->owner || + !xa_empty(&group->pasid_array))) + return; group->owner_cnt = 0; group->owner = NULL; ret = __iommu_group_set_domain(group, group->default_domain); WARN(ret, "iommu driver failed to attach the default domain"); +} -unlock_out: +/** + * iommu_group_release_dma_owner() - Release DMA ownership of a group + * @dev: The device + * + * Release the DMA ownership claimed by iommu_group_claim_dma_owner(). + */ +void iommu_group_release_dma_owner(struct iommu_group *group) +{ + mutex_lock(&group->mutex); + __iommu_release_dma_ownership(group); mutex_unlock(&group->mutex); } EXPORT_SYMBOL_GPL(iommu_group_release_dma_owner); /** + * iommu_device_release_dma_owner() - Release DMA ownership of a device + * @group: The device. + * + * Release the DMA ownership claimed by iommu_device_claim_dma_owner(). 
+ */ +void iommu_device_release_dma_owner(struct device *dev) +{ + struct iommu_group *group = iommu_group_get(dev); + + mutex_lock(&group->mutex); + if (group->owner_cnt > 1) + group->owner_cnt--; + else + __iommu_release_dma_ownership(group); + mutex_unlock(&group->mutex); + iommu_group_put(group); +} +EXPORT_SYMBOL_GPL(iommu_device_release_dma_owner); + +/** * iommu_group_dma_owner_claimed() - Query group dma ownership status * @group: The group. * @@ -3231,3 +3278,150 @@ bool iommu_group_dma_owner_claimed(struct iommu_group *group) return user; } EXPORT_SYMBOL_GPL(iommu_group_dma_owner_claimed); + +static int __iommu_set_group_pasid(struct iommu_domain *domain, + struct iommu_group *group, ioasid_t pasid) +{ + struct group_device *device; + int ret = 0; + + list_for_each_entry(device, &group->devices, list) { + ret = domain->ops->set_dev_pasid(domain, device->dev, pasid); + if (ret) + break; + } + + return ret; +} + +static void __iommu_remove_group_pasid(struct iommu_group *group, + ioasid_t pasid) +{ + struct group_device *device; + const struct iommu_ops *ops; + + list_for_each_entry(device, &group->devices, list) { + ops = dev_iommu_ops(device->dev); + ops->remove_dev_pasid(device->dev, pasid); + } +} + +/* + * iommu_attach_device_pasid() - Attach a domain to pasid of device + * @domain: the iommu domain. + * @dev: the attached device. + * @pasid: the pasid of the device. + * + * Return: 0 on success, or an error. + */ +int iommu_attach_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ + struct iommu_group *group; + void *curr; + int ret; + + if (!domain->ops->set_dev_pasid) + return -EOPNOTSUPP; + + group = iommu_group_get(dev); + if (!group) + return -ENODEV; + + mutex_lock(&group->mutex); + curr = xa_cmpxchg(&group->pasid_array, pasid, NULL, domain, GFP_KERNEL); + if (curr) { + ret = xa_err(curr) ? : -EBUSY; + goto out_unlock; + } + + ret = __iommu_set_group_pasid(domain, group, pasid); + if (ret) { + __iommu_remove_group_pasid(group, pasid); + xa_erase(&group->pasid_array, pasid); + } +out_unlock: + mutex_unlock(&group->mutex); + iommu_group_put(group); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_attach_device_pasid); + +/* + * iommu_detach_device_pasid() - Detach the domain from pasid of device + * @domain: the iommu domain. + * @dev: the attached device. + * @pasid: the pasid of the device. + * + * The @domain must have been attached to @pasid of the @dev with + * iommu_attach_device_pasid(). + */ +void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, + ioasid_t pasid) +{ + struct iommu_group *group = iommu_group_get(dev); + + mutex_lock(&group->mutex); + __iommu_remove_group_pasid(group, pasid); + WARN_ON(xa_erase(&group->pasid_array, pasid) != domain); + mutex_unlock(&group->mutex); + + iommu_group_put(group); +} +EXPORT_SYMBOL_GPL(iommu_detach_device_pasid); + +/* + * iommu_get_domain_for_dev_pasid() - Retrieve domain for @pasid of @dev + * @dev: the queried device + * @pasid: the pasid of the device + * @type: matched domain type, 0 for any match + * + * This is a variant of iommu_get_domain_for_dev(). It returns the existing + * domain attached to pasid of a device. Callers must hold a lock around this + * function, and both iommu_attach/detach_dev_pasid() whenever a domain of + * type is being manipulated. This API does not internally resolve races with + * attach/detach. + * + * Return: attached domain on success, NULL otherwise. 
+ */ +struct iommu_domain *iommu_get_domain_for_dev_pasid(struct device *dev, + ioasid_t pasid, + unsigned int type) +{ + struct iommu_domain *domain; + struct iommu_group *group; + + group = iommu_group_get(dev); + if (!group) + return NULL; + + xa_lock(&group->pasid_array); + domain = xa_load(&group->pasid_array, pasid); + if (type && domain && domain->type != type) + domain = ERR_PTR(-EBUSY); + xa_unlock(&group->pasid_array); + iommu_group_put(group); + + return domain; +} +EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev_pasid); + +struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm) +{ + const struct iommu_ops *ops = dev_iommu_ops(dev); + struct iommu_domain *domain; + + domain = ops->domain_alloc(IOMMU_DOMAIN_SVA); + if (!domain) + return NULL; + + domain->type = IOMMU_DOMAIN_SVA; + mmgrab(mm); + domain->mm = mm; + domain->iopf_handler = iommu_sva_handle_iopf; + domain->fault_data = mm; + + return domain; +} diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig new file mode 100644 index 000000000000..8306616b6d81 --- /dev/null +++ b/drivers/iommu/iommufd/Kconfig @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: GPL-2.0-only +config IOMMUFD + tristate "IOMMU Userspace API" + select INTERVAL_TREE + select INTERVAL_TREE_SPAN_ITER + select IOMMU_API + default n + help + Provides /dev/iommu, the user API to control the IOMMU subsystem as + it relates to managing IO page tables that point at user space memory. + + If you don't know what to do here, say N. + +if IOMMUFD +config IOMMUFD_VFIO_CONTAINER + bool "IOMMUFD provides the VFIO container /dev/vfio/vfio" + depends on VFIO && !VFIO_CONTAINER + default VFIO && !VFIO_CONTAINER + help + IOMMUFD will provide /dev/vfio/vfio instead of VFIO. This relies on + IOMMUFD providing compatibility emulation to give the same ioctls. + It provides an option to build a kernel with legacy VFIO components + removed. + + IOMMUFD VFIO container emulation is known to lack certain features + of the native VFIO container, such as no-IOMMU support, peer-to-peer + DMA mapping, PPC IOMMU support, as well as other potentially + undiscovered gaps. This option is currently intended for the + purpose of testing IOMMUFD with unmodified userspace supporting VFIO + and making use of the Type1 VFIO IOMMU backend. General purpose + enabling of this option is currently discouraged. + + Unless testing IOMMUFD, say N here. 
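
Tying together the PASID interfaces added to iommu.c above (iommu_attach_device_pasid(), iommu_get_domain_for_dev_pasid(), iommu_detach_device_pasid() and iommu_sva_domain_alloc()), here is a compressed sketch of one attach/lookup/detach cycle; PASID allocation and the serialization against concurrent attach/detach that the lookup explicitly leaves to the caller are assumed to be handled elsewhere:

/* Sketch: attach an SVA domain to a PASID, look it up, tear it down. */
static int example_sva_pasid(struct device *dev, struct mm_struct *mm,
			     ioasid_t pasid)
{
	struct iommu_domain *domain;
	int ret;

	domain = iommu_sva_domain_alloc(dev, mm);
	if (!domain)
		return -ENOMEM;

	ret = iommu_attach_device_pasid(domain, dev, pasid);
	if (ret) {
		iommu_domain_free(domain);
		return ret;
	}

	/* Later: refetch it, filtered on type (0 would match any type). */
	domain = iommu_get_domain_for_dev_pasid(dev, pasid, IOMMU_DOMAIN_SVA);
	if (!IS_ERR_OR_NULL(domain)) {
		iommu_detach_device_pasid(domain, dev, pasid);
		iommu_domain_free(domain);
	}
	return 0;
}
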
+ +config IOMMUFD_TEST + bool "IOMMU Userspace API Test support" + depends on DEBUG_KERNEL + depends on FAULT_INJECTION + depends on RUNTIME_TESTING_MENU + default n + help + This is dangerous, do not enable unless running + tools/testing/selftests/iommu +endif diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile new file mode 100644 index 000000000000..8aeba81800c5 --- /dev/null +++ b/drivers/iommu/iommufd/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0-only +iommufd-y := \ + device.o \ + hw_pagetable.o \ + io_pagetable.o \ + ioas.o \ + main.o \ + pages.o \ + vfio_compat.o + +iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o + +obj-$(CONFIG_IOMMUFD) += iommufd.o diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c new file mode 100644 index 000000000000..d81f93a321af --- /dev/null +++ b/drivers/iommu/iommufd/device.c @@ -0,0 +1,772 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#include <linux/iommufd.h> +#include <linux/slab.h> +#include <linux/iommu.h> +#include <linux/irqdomain.h> + +#include "io_pagetable.h" +#include "iommufd_private.h" + +static bool allow_unsafe_interrupts; +module_param(allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC( + allow_unsafe_interrupts, + "Allow IOMMUFD to bind to devices even if the platform cannot isolate " + "the MSI interrupt window. Enabling this is a security weakness."); + +/* + * A iommufd_device object represents the binding relationship between a + * consuming driver and the iommufd. These objects are created/destroyed by + * external drivers, not by userspace. + */ +struct iommufd_device { + struct iommufd_object obj; + struct iommufd_ctx *ictx; + struct iommufd_hw_pagetable *hwpt; + /* Head at iommufd_hw_pagetable::devices */ + struct list_head devices_item; + /* always the physical device */ + struct device *dev; + struct iommu_group *group; + bool enforce_cache_coherency; +}; + +void iommufd_device_destroy(struct iommufd_object *obj) +{ + struct iommufd_device *idev = + container_of(obj, struct iommufd_device, obj); + + iommu_device_release_dma_owner(idev->dev); + iommu_group_put(idev->group); + iommufd_ctx_put(idev->ictx); +} + +/** + * iommufd_device_bind - Bind a physical device to an iommu fd + * @ictx: iommufd file descriptor + * @dev: Pointer to a physical device struct + * @id: Output ID number to return to userspace for this device + * + * A successful bind establishes an ownership over the device and returns + * struct iommufd_device pointer, otherwise returns error pointer. + * + * A driver using this API must set driver_managed_dma and must not touch + * the device until this routine succeeds and establishes ownership. + * + * Binding a PCI device places the entire RID under iommufd control. + * + * The caller must undo this with iommufd_device_unbind() + */ +struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, + struct device *dev, u32 *id) +{ + struct iommufd_device *idev; + struct iommu_group *group; + int rc; + + /* + * iommufd always sets IOMMU_CACHE because we offer no way for userspace + * to restore cache coherency. 
+ */ + if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) + return ERR_PTR(-EINVAL); + + group = iommu_group_get(dev); + if (!group) + return ERR_PTR(-ENODEV); + + rc = iommu_device_claim_dma_owner(dev, ictx); + if (rc) + goto out_group_put; + + idev = iommufd_object_alloc(ictx, idev, IOMMUFD_OBJ_DEVICE); + if (IS_ERR(idev)) { + rc = PTR_ERR(idev); + goto out_release_owner; + } + idev->ictx = ictx; + iommufd_ctx_get(ictx); + idev->dev = dev; + idev->enforce_cache_coherency = + device_iommu_capable(dev, IOMMU_CAP_ENFORCE_CACHE_COHERENCY); + /* The calling driver is a user until iommufd_device_unbind() */ + refcount_inc(&idev->obj.users); + /* group refcount moves into iommufd_device */ + idev->group = group; + + /* + * If the caller fails after this success it must call + * iommufd_unbind_device() which is safe since we hold this refcount. + * This also means the device is a leaf in the graph and no other object + * can take a reference on it. + */ + iommufd_object_finalize(ictx, &idev->obj); + *id = idev->obj.id; + return idev; + +out_release_owner: + iommu_device_release_dma_owner(dev); +out_group_put: + iommu_group_put(group); + return ERR_PTR(rc); +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, IOMMUFD); + +/** + * iommufd_device_unbind - Undo iommufd_device_bind() + * @idev: Device returned by iommufd_device_bind() + * + * Release the device from iommufd control. The DMA ownership will return back + * to unowned with DMA controlled by the DMA API. This invalidates the + * iommufd_device pointer, other APIs that consume it must not be called + * concurrently. + */ +void iommufd_device_unbind(struct iommufd_device *idev) +{ + bool was_destroyed; + + was_destroyed = iommufd_object_destroy_user(idev->ictx, &idev->obj); + WARN_ON(!was_destroyed); +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, IOMMUFD); + +static int iommufd_device_setup_msi(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt, + phys_addr_t sw_msi_start) +{ + int rc; + + /* + * If the IOMMU driver gives a IOMMU_RESV_SW_MSI then it is asking us to + * call iommu_get_msi_cookie() on its behalf. This is necessary to setup + * the MSI window so iommu_dma_prepare_msi() can install pages into our + * domain after request_irq(). If it is not done interrupts will not + * work on this domain. + * + * FIXME: This is conceptually broken for iommufd since we want to allow + * userspace to change the domains, eg switch from an identity IOAS to a + * DMA IOAS. There is currently no way to create a MSI window that + * matches what the IRQ layer actually expects in a newly created + * domain. + */ + if (sw_msi_start != PHYS_ADDR_MAX && !hwpt->msi_cookie) { + rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start); + if (rc) + return rc; + + /* + * iommu_get_msi_cookie() can only be called once per domain, + * it returns -EBUSY on later calls. + */ + hwpt->msi_cookie = true; + } + + /* + * For historical compat with VFIO the insecure interrupt path is + * allowed if the module parameter is set. Insecure means that a MemWr + * operation from the device (eg a simple DMA) cannot trigger an + * interrupt outside this iommufd context. + */ + if (!device_iommu_capable(idev->dev, IOMMU_CAP_INTR_REMAP) && + !irq_domain_check_msi_remap()) { + if (!allow_unsafe_interrupts) + return -EPERM; + + dev_warn( + idev->dev, + "MSI interrupts are not secure, they cannot be isolated by the platform. " + "Check that platform features like interrupt remapping are enabled. 
" + "Use the \"allow_unsafe_interrupts\" module parameter to override\n"); + } + return 0; +} + +static bool iommufd_hw_pagetable_has_group(struct iommufd_hw_pagetable *hwpt, + struct iommu_group *group) +{ + struct iommufd_device *cur_dev; + + list_for_each_entry(cur_dev, &hwpt->devices, devices_item) + if (cur_dev->group == group) + return true; + return false; +} + +static int iommufd_device_do_attach(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt) +{ + phys_addr_t sw_msi_start = PHYS_ADDR_MAX; + int rc; + + mutex_lock(&hwpt->devices_lock); + + /* + * Try to upgrade the domain we have, it is an iommu driver bug to + * report IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail + * enforce_cache_coherency when there are no devices attached to the + * domain. + */ + if (idev->enforce_cache_coherency && !hwpt->enforce_cache_coherency) { + if (hwpt->domain->ops->enforce_cache_coherency) + hwpt->enforce_cache_coherency = + hwpt->domain->ops->enforce_cache_coherency( + hwpt->domain); + if (!hwpt->enforce_cache_coherency) { + WARN_ON(list_empty(&hwpt->devices)); + rc = -EINVAL; + goto out_unlock; + } + } + + rc = iopt_table_enforce_group_resv_regions(&hwpt->ioas->iopt, idev->dev, + idev->group, &sw_msi_start); + if (rc) + goto out_unlock; + + rc = iommufd_device_setup_msi(idev, hwpt, sw_msi_start); + if (rc) + goto out_iova; + + /* + * FIXME: Hack around missing a device-centric iommu api, only attach to + * the group once for the first device that is in the group. + */ + if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) { + rc = iommu_attach_group(hwpt->domain, idev->group); + if (rc) + goto out_iova; + + if (list_empty(&hwpt->devices)) { + rc = iopt_table_add_domain(&hwpt->ioas->iopt, + hwpt->domain); + if (rc) + goto out_detach; + } + } + + idev->hwpt = hwpt; + refcount_inc(&hwpt->obj.users); + list_add(&idev->devices_item, &hwpt->devices); + mutex_unlock(&hwpt->devices_lock); + return 0; + +out_detach: + iommu_detach_group(hwpt->domain, idev->group); +out_iova: + iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); +out_unlock: + mutex_unlock(&hwpt->devices_lock); + return rc; +} + +/* + * When automatically managing the domains we search for a compatible domain in + * the iopt and if one is found use it, otherwise create a new domain. + * Automatic domain selection will never pick a manually created domain. + */ +static int iommufd_device_auto_get_domain(struct iommufd_device *idev, + struct iommufd_ioas *ioas) +{ + struct iommufd_hw_pagetable *hwpt; + int rc; + + /* + * There is no differentiation when domains are allocated, so any domain + * that is willing to attach to the device is interchangeable with any + * other. + */ + mutex_lock(&ioas->mutex); + list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) { + if (!hwpt->auto_domain) + continue; + + rc = iommufd_device_do_attach(idev, hwpt); + + /* + * -EINVAL means the domain is incompatible with the device. + * Other error codes should propagate to userspace as failure. + * Success means the domain is attached. 
+ */ + if (rc == -EINVAL) + continue; + goto out_unlock; + } + + hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev->dev); + if (IS_ERR(hwpt)) { + rc = PTR_ERR(hwpt); + goto out_unlock; + } + hwpt->auto_domain = true; + + rc = iommufd_device_do_attach(idev, hwpt); + if (rc) + goto out_abort; + list_add_tail(&hwpt->hwpt_item, &ioas->hwpt_list); + + mutex_unlock(&ioas->mutex); + iommufd_object_finalize(idev->ictx, &hwpt->obj); + return 0; + +out_abort: + iommufd_object_abort_and_destroy(idev->ictx, &hwpt->obj); +out_unlock: + mutex_unlock(&ioas->mutex); + return rc; +} + +/** + * iommufd_device_attach - Connect a device from an iommu_domain + * @idev: device to attach + * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE + * Output the IOMMUFD_OBJ_HW_PAGETABLE ID + * + * This connects the device to an iommu_domain, either automatically or manually + * selected. Once this completes the device could do DMA. + * + * The caller should return the resulting pt_id back to userspace. + * This function is undone by calling iommufd_device_detach(). + */ +int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) +{ + struct iommufd_object *pt_obj; + int rc; + + pt_obj = iommufd_get_object(idev->ictx, *pt_id, IOMMUFD_OBJ_ANY); + if (IS_ERR(pt_obj)) + return PTR_ERR(pt_obj); + + switch (pt_obj->type) { + case IOMMUFD_OBJ_HW_PAGETABLE: { + struct iommufd_hw_pagetable *hwpt = + container_of(pt_obj, struct iommufd_hw_pagetable, obj); + + rc = iommufd_device_do_attach(idev, hwpt); + if (rc) + goto out_put_pt_obj; + + mutex_lock(&hwpt->ioas->mutex); + list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list); + mutex_unlock(&hwpt->ioas->mutex); + break; + } + case IOMMUFD_OBJ_IOAS: { + struct iommufd_ioas *ioas = + container_of(pt_obj, struct iommufd_ioas, obj); + + rc = iommufd_device_auto_get_domain(idev, ioas); + if (rc) + goto out_put_pt_obj; + break; + } + default: + rc = -EINVAL; + goto out_put_pt_obj; + } + + refcount_inc(&idev->obj.users); + *pt_id = idev->hwpt->obj.id; + rc = 0; + +out_put_pt_obj: + iommufd_put_object(pt_obj); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD); + +/** + * iommufd_device_detach - Disconnect a device to an iommu_domain + * @idev: device to detach + * + * Undo iommufd_device_attach(). This disconnects the idev from the previously + * attached pt_id. The device returns back to a blocked DMA translation. 
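+ *
+ * A rough lifecycle sketch for an external driver (error handling trimmed,
+ * local names are illustrative and not part of this patch):
+ *
+ *	idev = iommufd_device_bind(ictx, dev, &dev_id);
+ *	if (IS_ERR(idev))
+ *		return PTR_ERR(idev);
+ *	rc = iommufd_device_attach(idev, &pt_id);
+ *	if (rc) {
+ *		iommufd_device_unbind(idev);
+ *		return rc;
+ *	}
+ *	... the device may now DMA through the attached hw_pagetable ...
+ *	iommufd_device_detach(idev);
+ *	iommufd_device_unbind(idev);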
+ */ +void iommufd_device_detach(struct iommufd_device *idev) +{ + struct iommufd_hw_pagetable *hwpt = idev->hwpt; + + mutex_lock(&hwpt->ioas->mutex); + mutex_lock(&hwpt->devices_lock); + list_del(&idev->devices_item); + if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) { + if (list_empty(&hwpt->devices)) { + iopt_table_remove_domain(&hwpt->ioas->iopt, + hwpt->domain); + list_del(&hwpt->hwpt_item); + } + iommu_detach_group(hwpt->domain, idev->group); + } + iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); + mutex_unlock(&hwpt->devices_lock); + mutex_unlock(&hwpt->ioas->mutex); + + if (hwpt->auto_domain) + iommufd_object_destroy_user(idev->ictx, &hwpt->obj); + else + refcount_dec(&hwpt->obj.users); + + idev->hwpt = NULL; + + refcount_dec(&idev->obj.users); +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD); + +void iommufd_access_destroy_object(struct iommufd_object *obj) +{ + struct iommufd_access *access = + container_of(obj, struct iommufd_access, obj); + + iopt_remove_access(&access->ioas->iopt, access); + iommufd_ctx_put(access->ictx); + refcount_dec(&access->ioas->obj.users); +} + +/** + * iommufd_access_create - Create an iommufd_access + * @ictx: iommufd file descriptor + * @ioas_id: ID for a IOMMUFD_OBJ_IOAS + * @ops: Driver's ops to associate with the access + * @data: Opaque data to pass into ops functions + * + * An iommufd_access allows a driver to read/write to the IOAS without using + * DMA. The underlying CPU memory can be accessed using the + * iommufd_access_pin_pages() or iommufd_access_rw() functions. + * + * The provided ops are required to use iommufd_access_pin_pages(). + */ +struct iommufd_access * +iommufd_access_create(struct iommufd_ctx *ictx, u32 ioas_id, + const struct iommufd_access_ops *ops, void *data) +{ + struct iommufd_access *access; + struct iommufd_object *obj; + int rc; + + /* + * There is no uAPI for the access object, but to keep things symmetric + * use the object infrastructure anyhow. + */ + access = iommufd_object_alloc(ictx, access, IOMMUFD_OBJ_ACCESS); + if (IS_ERR(access)) + return access; + + access->data = data; + access->ops = ops; + + obj = iommufd_get_object(ictx, ioas_id, IOMMUFD_OBJ_IOAS); + if (IS_ERR(obj)) { + rc = PTR_ERR(obj); + goto out_abort; + } + access->ioas = container_of(obj, struct iommufd_ioas, obj); + iommufd_ref_to_users(obj); + + if (ops->needs_pin_pages) + access->iova_alignment = PAGE_SIZE; + else + access->iova_alignment = 1; + rc = iopt_add_access(&access->ioas->iopt, access); + if (rc) + goto out_put_ioas; + + /* The calling driver is a user until iommufd_access_destroy() */ + refcount_inc(&access->obj.users); + access->ictx = ictx; + iommufd_ctx_get(ictx); + iommufd_object_finalize(ictx, &access->obj); + return access; +out_put_ioas: + refcount_dec(&access->ioas->obj.users); +out_abort: + iommufd_object_abort(ictx, &access->obj); + return ERR_PTR(rc); +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_create, IOMMUFD); + +/** + * iommufd_access_destroy - Destroy an iommufd_access + * @access: The access to destroy + * + * The caller must stop using the access before destroying it. 
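+ *
+ * Paired with iommufd_access_create() the expected shape is roughly (the
+ * ops table, data pointer and ioas_id are the caller's own; this is an
+ * illustrative sketch, not part of this patch):
+ *
+ *	access = iommufd_access_create(ictx, ioas_id, &my_access_ops, my_data);
+ *	if (IS_ERR(access))
+ *		return PTR_ERR(access);
+ *	... iommufd_access_pin_pages() / iommufd_access_rw() ...
+ *	iommufd_access_destroy(access);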
+ */ +void iommufd_access_destroy(struct iommufd_access *access) +{ + bool was_destroyed; + + was_destroyed = iommufd_object_destroy_user(access->ictx, &access->obj); + WARN_ON(!was_destroyed); +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, IOMMUFD); + +/** + * iommufd_access_notify_unmap - Notify users of an iopt to stop using it + * @iopt: iopt to work on + * @iova: Starting iova in the iopt + * @length: Number of bytes + * + * After this function returns there should be no users attached to the pages + * linked to this iopt that intersect with iova,length. Anyone that has attached + * a user through iopt_access_pages() needs to detach it through + * iommufd_access_unpin_pages() before this function returns. + * + * iommufd_access_destroy() will wait for any outstanding unmap callback to + * complete. Once iommufd_access_destroy() no unmap ops are running or will + * run in the future. Due to this a driver must not create locking that prevents + * unmap to complete while iommufd_access_destroy() is running. + */ +void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, + unsigned long length) +{ + struct iommufd_ioas *ioas = + container_of(iopt, struct iommufd_ioas, iopt); + struct iommufd_access *access; + unsigned long index; + + xa_lock(&ioas->iopt.access_list); + xa_for_each(&ioas->iopt.access_list, index, access) { + if (!iommufd_lock_obj(&access->obj)) + continue; + xa_unlock(&ioas->iopt.access_list); + + access->ops->unmap(access->data, iova, length); + + iommufd_put_object(&access->obj); + xa_lock(&ioas->iopt.access_list); + } + xa_unlock(&ioas->iopt.access_list); +} + +/** + * iommufd_access_unpin_pages() - Undo iommufd_access_pin_pages + * @access: IOAS access to act on + * @iova: Starting IOVA + * @length: Number of bytes to access + * + * Return the struct page's. The caller must stop accessing them before calling + * this. The iova/length must exactly match the one provided to access_pages. 
+ */ +void iommufd_access_unpin_pages(struct iommufd_access *access, + unsigned long iova, unsigned long length) +{ + struct io_pagetable *iopt = &access->ioas->iopt; + struct iopt_area_contig_iter iter; + unsigned long last_iova; + struct iopt_area *area; + + if (WARN_ON(!length) || + WARN_ON(check_add_overflow(iova, length - 1, &last_iova))) + return; + + down_read(&iopt->iova_rwsem); + iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) + iopt_area_remove_access( + area, iopt_area_iova_to_index(area, iter.cur_iova), + iopt_area_iova_to_index( + area, + min(last_iova, iopt_area_last_iova(area)))); + up_read(&iopt->iova_rwsem); + WARN_ON(!iopt_area_contig_done(&iter)); +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_unpin_pages, IOMMUFD); + +static bool iopt_area_contig_is_aligned(struct iopt_area_contig_iter *iter) +{ + if (iopt_area_start_byte(iter->area, iter->cur_iova) % PAGE_SIZE) + return false; + + if (!iopt_area_contig_done(iter) && + (iopt_area_start_byte(iter->area, iopt_area_last_iova(iter->area)) % + PAGE_SIZE) != (PAGE_SIZE - 1)) + return false; + return true; +} + +static bool check_area_prot(struct iopt_area *area, unsigned int flags) +{ + if (flags & IOMMUFD_ACCESS_RW_WRITE) + return area->iommu_prot & IOMMU_WRITE; + return area->iommu_prot & IOMMU_READ; +} + +/** + * iommufd_access_pin_pages() - Return a list of pages under the iova + * @access: IOAS access to act on + * @iova: Starting IOVA + * @length: Number of bytes to access + * @out_pages: Output page list + * @flags: IOPMMUFD_ACCESS_RW_* flags + * + * Reads @length bytes starting at iova and returns the struct page * pointers. + * These can be kmap'd by the caller for CPU access. + * + * The caller must perform iommufd_access_unpin_pages() when done to balance + * this. + * + * This API always requires a page aligned iova. This happens naturally if the + * ioas alignment is >= PAGE_SIZE and the iova is PAGE_SIZE aligned. However + * smaller alignments have corner cases where this API can fail on otherwise + * aligned iova. 
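+ *
+ * A hedged sketch of the expected pairing, assuming the access was created
+ * with needs_pin_pages and the page array has room for the whole range
+ * (local names are illustrative only):
+ *
+ *	rc = iommufd_access_pin_pages(access, iova, npages << PAGE_SHIFT,
+ *				      pages, IOMMUFD_ACCESS_RW_WRITE);
+ *	if (rc)
+ *		return rc;
+ *	... kmap_local_page() and use the pages ...
+ *	iommufd_access_unpin_pages(access, iova, npages << PAGE_SHIFT);
+ *
+ * For a simple copy that does not need to hold pages, iommufd_access_rw()
+ * below can be used instead.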
+ */ +int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, + unsigned long length, struct page **out_pages, + unsigned int flags) +{ + struct io_pagetable *iopt = &access->ioas->iopt; + struct iopt_area_contig_iter iter; + unsigned long last_iova; + struct iopt_area *area; + int rc; + + /* Driver's ops don't support pin_pages */ + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(access->iova_alignment != PAGE_SIZE || !access->ops->unmap)) + return -EINVAL; + + if (!length) + return -EINVAL; + if (check_add_overflow(iova, length - 1, &last_iova)) + return -EOVERFLOW; + + down_read(&iopt->iova_rwsem); + iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { + unsigned long last = min(last_iova, iopt_area_last_iova(area)); + unsigned long last_index = iopt_area_iova_to_index(area, last); + unsigned long index = + iopt_area_iova_to_index(area, iter.cur_iova); + + if (area->prevent_access || + !iopt_area_contig_is_aligned(&iter)) { + rc = -EINVAL; + goto err_remove; + } + + if (!check_area_prot(area, flags)) { + rc = -EPERM; + goto err_remove; + } + + rc = iopt_area_add_access(area, index, last_index, out_pages, + flags); + if (rc) + goto err_remove; + out_pages += last_index - index + 1; + } + if (!iopt_area_contig_done(&iter)) { + rc = -ENOENT; + goto err_remove; + } + + up_read(&iopt->iova_rwsem); + return 0; + +err_remove: + if (iova < iter.cur_iova) { + last_iova = iter.cur_iova - 1; + iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) + iopt_area_remove_access( + area, + iopt_area_iova_to_index(area, iter.cur_iova), + iopt_area_iova_to_index( + area, min(last_iova, + iopt_area_last_iova(area)))); + } + up_read(&iopt->iova_rwsem); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, IOMMUFD); + +/** + * iommufd_access_rw - Read or write data under the iova + * @access: IOAS access to act on + * @iova: Starting IOVA + * @data: Kernel buffer to copy to/from + * @length: Number of bytes to access + * @flags: IOMMUFD_ACCESS_RW_* flags + * + * Copy kernel to/from data into the range given by IOVA/length. If flags + * indicates IOMMUFD_ACCESS_RW_KTHREAD then a large copy can be optimized + * by changing it into copy_to/from_user(). + */ +int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, + void *data, size_t length, unsigned int flags) +{ + struct io_pagetable *iopt = &access->ioas->iopt; + struct iopt_area_contig_iter iter; + struct iopt_area *area; + unsigned long last_iova; + int rc; + + if (!length) + return -EINVAL; + if (check_add_overflow(iova, length - 1, &last_iova)) + return -EOVERFLOW; + + down_read(&iopt->iova_rwsem); + iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { + unsigned long last = min(last_iova, iopt_area_last_iova(area)); + unsigned long bytes = (last - iter.cur_iova) + 1; + + if (area->prevent_access) { + rc = -EINVAL; + goto err_out; + } + + if (!check_area_prot(area, flags)) { + rc = -EPERM; + goto err_out; + } + + rc = iopt_pages_rw_access( + area->pages, iopt_area_start_byte(area, iter.cur_iova), + data, bytes, flags); + if (rc) + goto err_out; + data += bytes; + } + if (!iopt_area_contig_done(&iter)) + rc = -ENOENT; +err_out: + up_read(&iopt->iova_rwsem); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, IOMMUFD); + +#ifdef CONFIG_IOMMUFD_TEST +/* + * Creating a real iommufd_device is too hard, bypass creating a iommufd_device + * and go directly to attaching a domain. 
+ */ +struct iommufd_hw_pagetable * +iommufd_device_selftest_attach(struct iommufd_ctx *ictx, + struct iommufd_ioas *ioas, + struct device *mock_dev) +{ + struct iommufd_hw_pagetable *hwpt; + int rc; + + hwpt = iommufd_hw_pagetable_alloc(ictx, ioas, mock_dev); + if (IS_ERR(hwpt)) + return hwpt; + + rc = iopt_table_add_domain(&hwpt->ioas->iopt, hwpt->domain); + if (rc) + goto out_hwpt; + + refcount_inc(&hwpt->obj.users); + iommufd_object_finalize(ictx, &hwpt->obj); + return hwpt; + +out_hwpt: + iommufd_object_abort_and_destroy(ictx, &hwpt->obj); + return ERR_PTR(rc); +} + +void iommufd_device_selftest_detach(struct iommufd_ctx *ictx, + struct iommufd_hw_pagetable *hwpt) +{ + iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain); + refcount_dec(&hwpt->obj.users); +} +#endif diff --git a/drivers/iommu/iommufd/double_span.h b/drivers/iommu/iommufd/double_span.h new file mode 100644 index 000000000000..b37aab7488c0 --- /dev/null +++ b/drivers/iommu/iommufd/double_span.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. + */ +#ifndef __IOMMUFD_DOUBLE_SPAN_H +#define __IOMMUFD_DOUBLE_SPAN_H + +#include <linux/interval_tree.h> + +/* + * This is a variation of the general interval_tree_span_iter that computes the + * spans over the union of two different interval trees. Used ranges are broken + * up and reported based on the tree that provides the interval. The first span + * always takes priority. Like interval_tree_span_iter it is greedy and the same + * value of is_used will not repeat on two iteration cycles. + */ +struct interval_tree_double_span_iter { + struct rb_root_cached *itrees[2]; + struct interval_tree_span_iter spans[2]; + union { + unsigned long start_hole; + unsigned long start_used; + }; + union { + unsigned long last_hole; + unsigned long last_used; + }; + /* 0 = hole, 1 = used span[0], 2 = used span[1], -1 done iteration */ + int is_used; +}; + +void interval_tree_double_span_iter_update( + struct interval_tree_double_span_iter *iter); +void interval_tree_double_span_iter_first( + struct interval_tree_double_span_iter *iter, + struct rb_root_cached *itree1, struct rb_root_cached *itree2, + unsigned long first_index, unsigned long last_index); +void interval_tree_double_span_iter_next( + struct interval_tree_double_span_iter *iter); + +static inline bool +interval_tree_double_span_iter_done(struct interval_tree_double_span_iter *state) +{ + return state->is_used == -1; +} + +#define interval_tree_for_each_double_span(span, itree1, itree2, first_index, \ + last_index) \ + for (interval_tree_double_span_iter_first(span, itree1, itree2, \ + first_index, last_index); \ + !interval_tree_double_span_iter_done(span); \ + interval_tree_double_span_iter_next(span)) + +#endif diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c new file mode 100644 index 000000000000..43d473989a06 --- /dev/null +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#include <linux/iommu.h> + +#include "iommufd_private.h" + +void iommufd_hw_pagetable_destroy(struct iommufd_object *obj) +{ + struct iommufd_hw_pagetable *hwpt = + container_of(obj, struct iommufd_hw_pagetable, obj); + + WARN_ON(!list_empty(&hwpt->devices)); + + iommu_domain_free(hwpt->domain); + refcount_dec(&hwpt->ioas->obj.users); + mutex_destroy(&hwpt->devices_lock); +} + +/** + * iommufd_hw_pagetable_alloc() - Get an 
iommu_domain for a device + * @ictx: iommufd context + * @ioas: IOAS to associate the domain with + * @dev: Device to get an iommu_domain for + * + * Allocate a new iommu_domain and return it as a hw_pagetable. + */ +struct iommufd_hw_pagetable * +iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, + struct device *dev) +{ + struct iommufd_hw_pagetable *hwpt; + int rc; + + hwpt = iommufd_object_alloc(ictx, hwpt, IOMMUFD_OBJ_HW_PAGETABLE); + if (IS_ERR(hwpt)) + return hwpt; + + hwpt->domain = iommu_domain_alloc(dev->bus); + if (!hwpt->domain) { + rc = -ENOMEM; + goto out_abort; + } + + INIT_LIST_HEAD(&hwpt->devices); + INIT_LIST_HEAD(&hwpt->hwpt_item); + mutex_init(&hwpt->devices_lock); + /* Pairs with iommufd_hw_pagetable_destroy() */ + refcount_inc(&ioas->obj.users); + hwpt->ioas = ioas; + return hwpt; + +out_abort: + iommufd_object_abort(ictx, &hwpt->obj); + return ERR_PTR(rc); +} diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c new file mode 100644 index 000000000000..e0ae72b9e67f --- /dev/null +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -0,0 +1,1216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + * + * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The + * PFNs can be placed into an iommu_domain, or returned to the caller as a page + * list for access by an in-kernel user. + * + * The datastructure uses the iopt_pages to optimize the storage of the PFNs + * between the domains and xarray. + */ +#include <linux/iommufd.h> +#include <linux/lockdep.h> +#include <linux/iommu.h> +#include <linux/sched/mm.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/errno.h> + +#include "io_pagetable.h" +#include "double_span.h" + +struct iopt_pages_list { + struct iopt_pages *pages; + struct iopt_area *area; + struct list_head next; + unsigned long start_byte; + unsigned long length; +}; + +struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter, + struct io_pagetable *iopt, + unsigned long iova, + unsigned long last_iova) +{ + lockdep_assert_held(&iopt->iova_rwsem); + + iter->cur_iova = iova; + iter->last_iova = last_iova; + iter->area = iopt_area_iter_first(iopt, iova, iova); + if (!iter->area) + return NULL; + if (!iter->area->pages) { + iter->area = NULL; + return NULL; + } + return iter->area; +} + +struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter) +{ + unsigned long last_iova; + + if (!iter->area) + return NULL; + last_iova = iopt_area_last_iova(iter->area); + if (iter->last_iova <= last_iova) + return NULL; + + iter->cur_iova = last_iova + 1; + iter->area = iopt_area_iter_next(iter->area, iter->cur_iova, + iter->last_iova); + if (!iter->area) + return NULL; + if (iter->cur_iova != iopt_area_iova(iter->area) || + !iter->area->pages) { + iter->area = NULL; + return NULL; + } + return iter->area; +} + +static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span, + unsigned long length, + unsigned long iova_alignment, + unsigned long page_offset) +{ + if (span->is_used || span->last_hole - span->start_hole < length - 1) + return false; + + span->start_hole = ALIGN(span->start_hole, iova_alignment) | + page_offset; + if (span->start_hole > span->last_hole || + span->last_hole - span->start_hole < length - 1) + return false; + return true; +} + +static bool __alloc_iova_check_used(struct interval_tree_span_iter *span, + unsigned long length, + unsigned long 
iova_alignment, + unsigned long page_offset) +{ + if (span->is_hole || span->last_used - span->start_used < length - 1) + return false; + + span->start_used = ALIGN(span->start_used, iova_alignment) | + page_offset; + if (span->start_used > span->last_used || + span->last_used - span->start_used < length - 1) + return false; + return true; +} + +/* + * Automatically find a block of IOVA that is not being used and not reserved. + * Does not return a 0 IOVA even if it is valid. + */ +static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova, + unsigned long uptr, unsigned long length) +{ + unsigned long page_offset = uptr % PAGE_SIZE; + struct interval_tree_double_span_iter used_span; + struct interval_tree_span_iter allowed_span; + unsigned long iova_alignment; + + lockdep_assert_held(&iopt->iova_rwsem); + + /* Protect roundup_pow-of_two() from overflow */ + if (length == 0 || length >= ULONG_MAX / 2) + return -EOVERFLOW; + + /* + * Keep alignment present in the uptr when building the IOVA, this + * increases the chance we can map a THP. + */ + if (!uptr) + iova_alignment = roundup_pow_of_two(length); + else + iova_alignment = min_t(unsigned long, + roundup_pow_of_two(length), + 1UL << __ffs64(uptr)); + + if (iova_alignment < iopt->iova_alignment) + return -EINVAL; + + interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree, + PAGE_SIZE, ULONG_MAX - PAGE_SIZE) { + if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) { + allowed_span.start_used = PAGE_SIZE; + allowed_span.last_used = ULONG_MAX - PAGE_SIZE; + allowed_span.is_hole = false; + } + + if (!__alloc_iova_check_used(&allowed_span, length, + iova_alignment, page_offset)) + continue; + + interval_tree_for_each_double_span( + &used_span, &iopt->reserved_itree, &iopt->area_itree, + allowed_span.start_used, allowed_span.last_used) { + if (!__alloc_iova_check_hole(&used_span, length, + iova_alignment, + page_offset)) + continue; + + *iova = used_span.start_hole; + return 0; + } + } + return -ENOSPC; +} + +static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova, + unsigned long length) +{ + unsigned long last; + + lockdep_assert_held(&iopt->iova_rwsem); + + if ((iova & (iopt->iova_alignment - 1))) + return -EINVAL; + + if (check_add_overflow(iova, length - 1, &last)) + return -EOVERFLOW; + + /* No reserved IOVA intersects the range */ + if (iopt_reserved_iter_first(iopt, iova, last)) + return -EINVAL; + + /* Check that there is not already a mapping in the range */ + if (iopt_area_iter_first(iopt, iova, last)) + return -EEXIST; + return 0; +} + +/* + * The area takes a slice of the pages from start_bytes to start_byte + length + */ +static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area, + struct iopt_pages *pages, unsigned long iova, + unsigned long start_byte, unsigned long length, + int iommu_prot) +{ + lockdep_assert_held_write(&iopt->iova_rwsem); + + if ((iommu_prot & IOMMU_WRITE) && !pages->writable) + return -EPERM; + + area->iommu_prot = iommu_prot; + area->page_offset = start_byte % PAGE_SIZE; + if (area->page_offset & (iopt->iova_alignment - 1)) + return -EINVAL; + + area->node.start = iova; + if (check_add_overflow(iova, length - 1, &area->node.last)) + return -EOVERFLOW; + + area->pages_node.start = start_byte / PAGE_SIZE; + if (check_add_overflow(start_byte, length - 1, &area->pages_node.last)) + return -EOVERFLOW; + area->pages_node.last = area->pages_node.last / PAGE_SIZE; + if (WARN_ON(area->pages_node.last >= pages->npages)) + return -EOVERFLOW; + + /* + * The area 
is inserted with a NULL pages indicating it is not fully + * initialized yet. + */ + area->iopt = iopt; + interval_tree_insert(&area->node, &iopt->area_itree); + return 0; +} + +static int iopt_alloc_area_pages(struct io_pagetable *iopt, + struct list_head *pages_list, + unsigned long length, unsigned long *dst_iova, + int iommu_prot, unsigned int flags) +{ + struct iopt_pages_list *elm; + unsigned long iova; + int rc = 0; + + list_for_each_entry(elm, pages_list, next) { + elm->area = kzalloc(sizeof(*elm->area), GFP_KERNEL_ACCOUNT); + if (!elm->area) + return -ENOMEM; + } + + down_write(&iopt->iova_rwsem); + if ((length & (iopt->iova_alignment - 1)) || !length) { + rc = -EINVAL; + goto out_unlock; + } + + if (flags & IOPT_ALLOC_IOVA) { + /* Use the first entry to guess the ideal IOVA alignment */ + elm = list_first_entry(pages_list, struct iopt_pages_list, + next); + rc = iopt_alloc_iova( + iopt, dst_iova, + (uintptr_t)elm->pages->uptr + elm->start_byte, length); + if (rc) + goto out_unlock; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) { + rc = -EINVAL; + goto out_unlock; + } + } else { + rc = iopt_check_iova(iopt, *dst_iova, length); + if (rc) + goto out_unlock; + } + + /* + * Areas are created with a NULL pages so that the IOVA space is + * reserved and we can unlock the iova_rwsem. + */ + iova = *dst_iova; + list_for_each_entry(elm, pages_list, next) { + rc = iopt_insert_area(iopt, elm->area, elm->pages, iova, + elm->start_byte, elm->length, iommu_prot); + if (rc) + goto out_unlock; + iova += elm->length; + } + +out_unlock: + up_write(&iopt->iova_rwsem); + return rc; +} + +static void iopt_abort_area(struct iopt_area *area) +{ + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(area->pages); + if (area->iopt) { + down_write(&area->iopt->iova_rwsem); + interval_tree_remove(&area->node, &area->iopt->area_itree); + up_write(&area->iopt->iova_rwsem); + } + kfree(area); +} + +void iopt_free_pages_list(struct list_head *pages_list) +{ + struct iopt_pages_list *elm; + + while ((elm = list_first_entry_or_null(pages_list, + struct iopt_pages_list, next))) { + if (elm->area) + iopt_abort_area(elm->area); + if (elm->pages) + iopt_put_pages(elm->pages); + list_del(&elm->next); + kfree(elm); + } +} + +static int iopt_fill_domains_pages(struct list_head *pages_list) +{ + struct iopt_pages_list *undo_elm; + struct iopt_pages_list *elm; + int rc; + + list_for_each_entry(elm, pages_list, next) { + rc = iopt_area_fill_domains(elm->area, elm->pages); + if (rc) + goto err_undo; + } + return 0; + +err_undo: + list_for_each_entry(undo_elm, pages_list, next) { + if (undo_elm == elm) + break; + iopt_area_unfill_domains(undo_elm->area, undo_elm->pages); + } + return rc; +} + +int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, + unsigned long length, unsigned long *dst_iova, + int iommu_prot, unsigned int flags) +{ + struct iopt_pages_list *elm; + int rc; + + rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova, + iommu_prot, flags); + if (rc) + return rc; + + down_read(&iopt->domains_rwsem); + rc = iopt_fill_domains_pages(pages_list); + if (rc) + goto out_unlock_domains; + + down_write(&iopt->iova_rwsem); + list_for_each_entry(elm, pages_list, next) { + /* + * area->pages must be set inside the domains_rwsem to ensure + * any newly added domains will get filled. Moves the reference + * in from the list. 
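+ * After this point the list entries no longer own the area or pages
+ * references, so a later iopt_free_pages_list() on the same list will not
+ * free them.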
+ */ + elm->area->pages = elm->pages; + elm->pages = NULL; + elm->area = NULL; + } + up_write(&iopt->iova_rwsem); +out_unlock_domains: + up_read(&iopt->domains_rwsem); + return rc; +} + +/** + * iopt_map_user_pages() - Map a user VA to an iova in the io page table + * @ictx: iommufd_ctx the iopt is part of + * @iopt: io_pagetable to act on + * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains + * the chosen iova on output. Otherwise is the iova to map to on input + * @uptr: User VA to map + * @length: Number of bytes to map + * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping + * @flags: IOPT_ALLOC_IOVA or zero + * + * iova, uptr, and length must be aligned to iova_alignment. For domain backed + * page tables this will pin the pages and load them into the domain at iova. + * For non-domain page tables this will only setup a lazy reference and the + * caller must use iopt_access_pages() to touch them. + * + * iopt_unmap_iova() must be called to undo this before the io_pagetable can be + * destroyed. + */ +int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, + unsigned long *iova, void __user *uptr, + unsigned long length, int iommu_prot, + unsigned int flags) +{ + struct iopt_pages_list elm = {}; + LIST_HEAD(pages_list); + int rc; + + elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE); + if (IS_ERR(elm.pages)) + return PTR_ERR(elm.pages); + if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM && + elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER) + elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM; + elm.start_byte = uptr - elm.pages->uptr; + elm.length = length; + list_add(&elm.next, &pages_list); + + rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags); + if (rc) { + if (elm.area) + iopt_abort_area(elm.area); + if (elm.pages) + iopt_put_pages(elm.pages); + return rc; + } + return 0; +} + +int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova, + unsigned long length, struct list_head *pages_list) +{ + struct iopt_area_contig_iter iter; + unsigned long last_iova; + struct iopt_area *area; + int rc; + + if (!length) + return -EINVAL; + if (check_add_overflow(iova, length - 1, &last_iova)) + return -EOVERFLOW; + + down_read(&iopt->iova_rwsem); + iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { + struct iopt_pages_list *elm; + unsigned long last = min(last_iova, iopt_area_last_iova(area)); + + elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT); + if (!elm) { + rc = -ENOMEM; + goto err_free; + } + elm->start_byte = iopt_area_start_byte(area, iter.cur_iova); + elm->pages = area->pages; + elm->length = (last - iter.cur_iova) + 1; + kref_get(&elm->pages->kref); + list_add_tail(&elm->next, pages_list); + } + if (!iopt_area_contig_done(&iter)) { + rc = -ENOENT; + goto err_free; + } + up_read(&iopt->iova_rwsem); + return 0; +err_free: + up_read(&iopt->iova_rwsem); + iopt_free_pages_list(pages_list); + return rc; +} + +static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start, + unsigned long last, unsigned long *unmapped) +{ + struct iopt_area *area; + unsigned long unmapped_bytes = 0; + int rc = -ENOENT; + + /* + * The domains_rwsem must be held in read mode any time any area->pages + * is NULL. This prevents domain attach/detatch from running + * concurrently with cleaning up the area. 
+ */ +again: + down_read(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + while ((area = iopt_area_iter_first(iopt, start, last))) { + unsigned long area_last = iopt_area_last_iova(area); + unsigned long area_first = iopt_area_iova(area); + struct iopt_pages *pages; + + /* Userspace should not race map/unmap's of the same area */ + if (!area->pages) { + rc = -EBUSY; + goto out_unlock_iova; + } + + if (area_first < start || area_last > last) { + rc = -ENOENT; + goto out_unlock_iova; + } + + /* + * num_accesses writers must hold the iova_rwsem too, so we can + * safely read it under the write side of the iovam_rwsem + * without the pages->mutex. + */ + if (area->num_accesses) { + start = area_first; + area->prevent_access = true; + up_write(&iopt->iova_rwsem); + up_read(&iopt->domains_rwsem); + iommufd_access_notify_unmap(iopt, area_first, + iopt_area_length(area)); + if (WARN_ON(READ_ONCE(area->num_accesses))) + return -EDEADLOCK; + goto again; + } + + pages = area->pages; + area->pages = NULL; + up_write(&iopt->iova_rwsem); + + iopt_area_unfill_domains(area, pages); + iopt_abort_area(area); + iopt_put_pages(pages); + + unmapped_bytes += area_last - area_first + 1; + + down_write(&iopt->iova_rwsem); + } + if (unmapped_bytes) + rc = 0; + +out_unlock_iova: + up_write(&iopt->iova_rwsem); + up_read(&iopt->domains_rwsem); + if (unmapped) + *unmapped = unmapped_bytes; + return rc; +} + +/** + * iopt_unmap_iova() - Remove a range of iova + * @iopt: io_pagetable to act on + * @iova: Starting iova to unmap + * @length: Number of bytes to unmap + * @unmapped: Return number of bytes unmapped + * + * The requested range must be a superset of existing ranges. + * Splitting/truncating IOVA mappings is not allowed. + */ +int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, + unsigned long length, unsigned long *unmapped) +{ + unsigned long iova_last; + + if (!length) + return -EINVAL; + + if (check_add_overflow(iova, length - 1, &iova_last)) + return -EOVERFLOW; + + return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped); +} + +int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped) +{ + int rc; + + rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped); + /* If the IOVAs are empty then unmap all succeeds */ + if (rc == -ENOENT) + return 0; + return rc; +} + +/* The caller must always free all the nodes in the allowed_iova rb_root. 
*/ +int iopt_set_allow_iova(struct io_pagetable *iopt, + struct rb_root_cached *allowed_iova) +{ + struct iopt_allowed *allowed; + + down_write(&iopt->iova_rwsem); + swap(*allowed_iova, iopt->allowed_itree); + + for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed; + allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) { + if (iopt_reserved_iter_first(iopt, allowed->node.start, + allowed->node.last)) { + swap(*allowed_iova, iopt->allowed_itree); + up_write(&iopt->iova_rwsem); + return -EADDRINUSE; + } + } + up_write(&iopt->iova_rwsem); + return 0; +} + +int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start, + unsigned long last, void *owner) +{ + struct iopt_reserved *reserved; + + lockdep_assert_held_write(&iopt->iova_rwsem); + + if (iopt_area_iter_first(iopt, start, last) || + iopt_allowed_iter_first(iopt, start, last)) + return -EADDRINUSE; + + reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT); + if (!reserved) + return -ENOMEM; + reserved->node.start = start; + reserved->node.last = last; + reserved->owner = owner; + interval_tree_insert(&reserved->node, &iopt->reserved_itree); + return 0; +} + +static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner) +{ + struct iopt_reserved *reserved, *next; + + lockdep_assert_held_write(&iopt->iova_rwsem); + + for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved; + reserved = next) { + next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX); + + if (reserved->owner == owner) { + interval_tree_remove(&reserved->node, + &iopt->reserved_itree); + kfree(reserved); + } + } +} + +void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner) +{ + down_write(&iopt->iova_rwsem); + __iopt_remove_reserved_iova(iopt, owner); + up_write(&iopt->iova_rwsem); +} + +void iopt_init_table(struct io_pagetable *iopt) +{ + init_rwsem(&iopt->iova_rwsem); + init_rwsem(&iopt->domains_rwsem); + iopt->area_itree = RB_ROOT_CACHED; + iopt->allowed_itree = RB_ROOT_CACHED; + iopt->reserved_itree = RB_ROOT_CACHED; + xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT); + xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC); + + /* + * iopt's start as SW tables that can use the entire size_t IOVA space + * due to the use of size_t in the APIs. They have no alignment + * restriction. + */ + iopt->iova_alignment = 1; +} + +void iopt_destroy_table(struct io_pagetable *iopt) +{ + struct interval_tree_node *node; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + iopt_remove_reserved_iova(iopt, NULL); + + while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0, + ULONG_MAX))) { + interval_tree_remove(node, &iopt->allowed_itree); + kfree(container_of(node, struct iopt_allowed, node)); + } + + WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root)); + WARN_ON(!xa_empty(&iopt->domains)); + WARN_ON(!xa_empty(&iopt->access_list)); + WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root)); +} + +/** + * iopt_unfill_domain() - Unfill a domain with PFNs + * @iopt: io_pagetable to act on + * @domain: domain to unfill + * + * This is used when removing a domain from the iopt. Every area in the iopt + * will be unmapped from the domain. The domain must already be removed from the + * domains xarray. + */ +static void iopt_unfill_domain(struct io_pagetable *iopt, + struct iommu_domain *domain) +{ + struct iopt_area *area; + + lockdep_assert_held(&iopt->iova_rwsem); + lockdep_assert_held_write(&iopt->domains_rwsem); + + /* + * Some other domain is holding all the pfns still, rapidly unmap this + * domain. 
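+ * Only the mappings in this domain are torn down; the PFNs stay pinned
+ * because an arbitrary remaining domain is promoted to storage_domain
+ * below.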
+ */ + if (iopt->next_domain_id != 0) { + /* Pick an arbitrary remaining domain to act as storage */ + struct iommu_domain *storage_domain = + xa_load(&iopt->domains, 0); + + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + struct iopt_pages *pages = area->pages; + + if (!pages) + continue; + + mutex_lock(&pages->mutex); + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(!area->storage_domain); + if (area->storage_domain == domain) + area->storage_domain = storage_domain; + mutex_unlock(&pages->mutex); + + iopt_area_unmap_domain(area, domain); + } + return; + } + + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + struct iopt_pages *pages = area->pages; + + if (!pages) + continue; + + mutex_lock(&pages->mutex); + interval_tree_remove(&area->pages_node, &pages->domains_itree); + WARN_ON(area->storage_domain != domain); + area->storage_domain = NULL; + iopt_area_unfill_domain(area, pages, domain); + mutex_unlock(&pages->mutex); + } +} + +/** + * iopt_fill_domain() - Fill a domain with PFNs + * @iopt: io_pagetable to act on + * @domain: domain to fill + * + * Fill the domain with PFNs from every area in the iopt. On failure the domain + * is left unchanged. + */ +static int iopt_fill_domain(struct io_pagetable *iopt, + struct iommu_domain *domain) +{ + struct iopt_area *end_area; + struct iopt_area *area; + int rc; + + lockdep_assert_held(&iopt->iova_rwsem); + lockdep_assert_held_write(&iopt->domains_rwsem); + + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + struct iopt_pages *pages = area->pages; + + if (!pages) + continue; + + mutex_lock(&pages->mutex); + rc = iopt_area_fill_domain(area, domain); + if (rc) { + mutex_unlock(&pages->mutex); + goto out_unfill; + } + if (!area->storage_domain) { + WARN_ON(iopt->next_domain_id != 0); + area->storage_domain = domain; + interval_tree_insert(&area->pages_node, + &pages->domains_itree); + } + mutex_unlock(&pages->mutex); + } + return 0; + +out_unfill: + end_area = area; + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + struct iopt_pages *pages = area->pages; + + if (area == end_area) + break; + if (!pages) + continue; + mutex_lock(&pages->mutex); + if (iopt->next_domain_id == 0) { + interval_tree_remove(&area->pages_node, + &pages->domains_itree); + area->storage_domain = NULL; + } + iopt_area_unfill_domain(area, pages, domain); + mutex_unlock(&pages->mutex); + } + return rc; +} + +/* All existing area's conform to an increased page size */ +static int iopt_check_iova_alignment(struct io_pagetable *iopt, + unsigned long new_iova_alignment) +{ + unsigned long align_mask = new_iova_alignment - 1; + struct iopt_area *area; + + lockdep_assert_held(&iopt->iova_rwsem); + lockdep_assert_held(&iopt->domains_rwsem); + + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) + if ((iopt_area_iova(area) & align_mask) || + (iopt_area_length(area) & align_mask) || + (area->page_offset & align_mask)) + return -EADDRINUSE; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) { + struct iommufd_access *access; + unsigned long index; + + xa_for_each(&iopt->access_list, index, access) + if (WARN_ON(access->iova_alignment > + new_iova_alignment)) + return -EADDRINUSE; + } + return 0; +} + +int iopt_table_add_domain(struct io_pagetable *iopt, + struct iommu_domain 
*domain) +{ + const struct iommu_domain_geometry *geometry = &domain->geometry; + struct iommu_domain *iter_domain; + unsigned int new_iova_alignment; + unsigned long index; + int rc; + + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + + xa_for_each(&iopt->domains, index, iter_domain) { + if (WARN_ON(iter_domain == domain)) { + rc = -EEXIST; + goto out_unlock; + } + } + + /* + * The io page size drives the iova_alignment. Internally the iopt_pages + * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE + * objects into the iommu_domain. + * + * A iommu_domain must always be able to accept PAGE_SIZE to be + * compatible as we can't guarantee higher contiguity. + */ + new_iova_alignment = max_t(unsigned long, + 1UL << __ffs(domain->pgsize_bitmap), + iopt->iova_alignment); + if (new_iova_alignment > PAGE_SIZE) { + rc = -EINVAL; + goto out_unlock; + } + if (new_iova_alignment != iopt->iova_alignment) { + rc = iopt_check_iova_alignment(iopt, new_iova_alignment); + if (rc) + goto out_unlock; + } + + /* No area exists that is outside the allowed domain aperture */ + if (geometry->aperture_start != 0) { + rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1, + domain); + if (rc) + goto out_reserved; + } + if (geometry->aperture_end != ULONG_MAX) { + rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1, + ULONG_MAX, domain); + if (rc) + goto out_reserved; + } + + rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL); + if (rc) + goto out_reserved; + + rc = iopt_fill_domain(iopt, domain); + if (rc) + goto out_release; + + iopt->iova_alignment = new_iova_alignment; + xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL); + iopt->next_domain_id++; + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); + return 0; +out_release: + xa_release(&iopt->domains, iopt->next_domain_id); +out_reserved: + __iopt_remove_reserved_iova(iopt, domain); +out_unlock: + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); + return rc; +} + +static int iopt_calculate_iova_alignment(struct io_pagetable *iopt) +{ + unsigned long new_iova_alignment; + struct iommufd_access *access; + struct iommu_domain *domain; + unsigned long index; + + lockdep_assert_held_write(&iopt->iova_rwsem); + lockdep_assert_held(&iopt->domains_rwsem); + + /* See batch_iommu_map_small() */ + if (iopt->disable_large_pages) + new_iova_alignment = PAGE_SIZE; + else + new_iova_alignment = 1; + + xa_for_each(&iopt->domains, index, domain) + new_iova_alignment = max_t(unsigned long, + 1UL << __ffs(domain->pgsize_bitmap), + new_iova_alignment); + xa_for_each(&iopt->access_list, index, access) + new_iova_alignment = max_t(unsigned long, + access->iova_alignment, + new_iova_alignment); + + if (new_iova_alignment > iopt->iova_alignment) { + int rc; + + rc = iopt_check_iova_alignment(iopt, new_iova_alignment); + if (rc) + return rc; + } + iopt->iova_alignment = new_iova_alignment; + return 0; +} + +void iopt_table_remove_domain(struct io_pagetable *iopt, + struct iommu_domain *domain) +{ + struct iommu_domain *iter_domain = NULL; + unsigned long index; + + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + + xa_for_each(&iopt->domains, index, iter_domain) + if (iter_domain == domain) + break; + if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id) + goto out_unlock; + + /* + * Compress the xarray to keep it linear by swapping the entry to erase + * with the tail entry and shrinking the tail. 
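+ * For example, removing the domain at index 1 of three stores the domain
+ * from index 2 into slot 1 and drops next_domain_id from 3 to 2.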
+ */ + iopt->next_domain_id--; + iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id); + if (index != iopt->next_domain_id) + xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL); + + iopt_unfill_domain(iopt, domain); + __iopt_remove_reserved_iova(iopt, domain); + + WARN_ON(iopt_calculate_iova_alignment(iopt)); +out_unlock: + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); +} + +/** + * iopt_area_split - Split an area into two parts at iova + * @area: The area to split + * @iova: Becomes the last of a new area + * + * This splits an area into two. It is part of the VFIO compatibility to allow + * poking a hole in the mapping. The two areas continue to point at the same + * iopt_pages, just with different starting bytes. + */ +static int iopt_area_split(struct iopt_area *area, unsigned long iova) +{ + unsigned long alignment = area->iopt->iova_alignment; + unsigned long last_iova = iopt_area_last_iova(area); + unsigned long start_iova = iopt_area_iova(area); + unsigned long new_start = iova + 1; + struct io_pagetable *iopt = area->iopt; + struct iopt_pages *pages = area->pages; + struct iopt_area *lhs; + struct iopt_area *rhs; + int rc; + + lockdep_assert_held_write(&iopt->iova_rwsem); + + if (iova == start_iova || iova == last_iova) + return 0; + + if (!pages || area->prevent_access) + return -EBUSY; + + if (new_start & (alignment - 1) || + iopt_area_start_byte(area, new_start) & (alignment - 1)) + return -EINVAL; + + lhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT); + if (!lhs) + return -ENOMEM; + + rhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT); + if (!rhs) { + rc = -ENOMEM; + goto err_free_lhs; + } + + mutex_lock(&pages->mutex); + /* + * Splitting is not permitted if an access exists, we don't track enough + * information to split existing accesses. + */ + if (area->num_accesses) { + rc = -EINVAL; + goto err_unlock; + } + + /* + * Splitting is not permitted if a domain could have been mapped with + * huge pages. 
+ */ + if (area->storage_domain && !iopt->disable_large_pages) { + rc = -EINVAL; + goto err_unlock; + } + + interval_tree_remove(&area->node, &iopt->area_itree); + rc = iopt_insert_area(iopt, lhs, area->pages, start_iova, + iopt_area_start_byte(area, start_iova), + (new_start - 1) - start_iova + 1, + area->iommu_prot); + if (WARN_ON(rc)) + goto err_insert; + + rc = iopt_insert_area(iopt, rhs, area->pages, new_start, + iopt_area_start_byte(area, new_start), + last_iova - new_start + 1, area->iommu_prot); + if (WARN_ON(rc)) + goto err_remove_lhs; + + lhs->storage_domain = area->storage_domain; + lhs->pages = area->pages; + rhs->storage_domain = area->storage_domain; + rhs->pages = area->pages; + kref_get(&rhs->pages->kref); + kfree(area); + mutex_unlock(&pages->mutex); + + /* + * No change to domains or accesses because the pages hasn't been + * changed + */ + return 0; + +err_remove_lhs: + interval_tree_remove(&lhs->node, &iopt->area_itree); +err_insert: + interval_tree_insert(&area->node, &iopt->area_itree); +err_unlock: + mutex_unlock(&pages->mutex); + kfree(rhs); +err_free_lhs: + kfree(lhs); + return rc; +} + +int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas, + size_t num_iovas) +{ + int rc = 0; + int i; + + down_write(&iopt->iova_rwsem); + for (i = 0; i < num_iovas; i++) { + struct iopt_area *area; + + area = iopt_area_iter_first(iopt, iovas[i], iovas[i]); + if (!area) + continue; + rc = iopt_area_split(area, iovas[i]); + if (rc) + break; + } + up_write(&iopt->iova_rwsem); + return rc; +} + +void iopt_enable_large_pages(struct io_pagetable *iopt) +{ + int rc; + + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + WRITE_ONCE(iopt->disable_large_pages, false); + rc = iopt_calculate_iova_alignment(iopt); + WARN_ON(rc); + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); +} + +int iopt_disable_large_pages(struct io_pagetable *iopt) +{ + int rc = 0; + + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + if (iopt->disable_large_pages) + goto out_unlock; + + /* Won't do it if domains already have pages mapped in them */ + if (!xa_empty(&iopt->domains) && + !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) { + rc = -EINVAL; + goto out_unlock; + } + + WRITE_ONCE(iopt->disable_large_pages, true); + rc = iopt_calculate_iova_alignment(iopt); + if (rc) + WRITE_ONCE(iopt->disable_large_pages, false); +out_unlock: + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); + return rc; +} + +int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access) +{ + int rc; + + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + rc = xa_alloc(&iopt->access_list, &access->iopt_access_list_id, access, + xa_limit_16b, GFP_KERNEL_ACCOUNT); + if (rc) + goto out_unlock; + + rc = iopt_calculate_iova_alignment(iopt); + if (rc) { + xa_erase(&iopt->access_list, access->iopt_access_list_id); + goto out_unlock; + } + +out_unlock: + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); + return rc; +} + +void iopt_remove_access(struct io_pagetable *iopt, + struct iommufd_access *access) +{ + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + WARN_ON(xa_erase(&iopt->access_list, access->iopt_access_list_id) != + access); + WARN_ON(iopt_calculate_iova_alignment(iopt)); + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); +} + +/* Narrow the valid_iova_itree to include reserved ranges from a group. 
*/ +int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt, + struct device *device, + struct iommu_group *group, + phys_addr_t *sw_msi_start) +{ + struct iommu_resv_region *resv; + struct iommu_resv_region *tmp; + LIST_HEAD(group_resv_regions); + unsigned int num_hw_msi = 0; + unsigned int num_sw_msi = 0; + int rc; + + down_write(&iopt->iova_rwsem); + rc = iommu_get_group_resv_regions(group, &group_resv_regions); + if (rc) + goto out_unlock; + + list_for_each_entry(resv, &group_resv_regions, list) { + if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE) + continue; + + if (sw_msi_start && resv->type == IOMMU_RESV_MSI) + num_hw_msi++; + if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) { + *sw_msi_start = resv->start; + num_sw_msi++; + } + + rc = iopt_reserve_iova(iopt, resv->start, + resv->length - 1 + resv->start, device); + if (rc) + goto out_reserved; + } + + /* Drivers must offer sane combinations of regions */ + if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) { + rc = -EINVAL; + goto out_reserved; + } + + rc = 0; + goto out_free_resv; + +out_reserved: + __iopt_remove_reserved_iova(iopt, device); +out_free_resv: + list_for_each_entry_safe(resv, tmp, &group_resv_regions, list) + kfree(resv); +out_unlock: + up_write(&iopt->iova_rwsem); + return rc; +} diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h new file mode 100644 index 000000000000..0ec3509b7e33 --- /dev/null +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -0,0 +1,241 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + * + */ +#ifndef __IO_PAGETABLE_H +#define __IO_PAGETABLE_H + +#include <linux/interval_tree.h> +#include <linux/mutex.h> +#include <linux/kref.h> +#include <linux/xarray.h> + +#include "iommufd_private.h" + +struct iommu_domain; + +/* + * Each io_pagetable is composed of intervals of areas which cover regions of + * the iova that are backed by something. iova not covered by areas is not + * populated in the page table. Each area is fully populated with pages. + * + * iovas are in byte units, but must be iopt->iova_alignment aligned. + * + * pages can be NULL, this means some other thread is still working on setting + * up or tearing down the area. When observed under the write side of the + * domain_rwsem a NULL pages must mean the area is still being setup and no + * domains are filled. + * + * storage_domain points at an arbitrary iommu_domain that is holding the PFNs + * for this area. It is locked by the pages->mutex. This simplifies the locking + * as the pages code can rely on the storage_domain without having to get the + * iopt->domains_rwsem. 
+ * + * The io_pagetable::iova_rwsem protects node + * The iopt_pages::mutex protects pages_node + * iopt and iommu_prot are immutable + * The pages::mutex protects num_accesses + */ +struct iopt_area { + struct interval_tree_node node; + struct interval_tree_node pages_node; + struct io_pagetable *iopt; + struct iopt_pages *pages; + struct iommu_domain *storage_domain; + /* How many bytes into the first page the area starts */ + unsigned int page_offset; + /* IOMMU_READ, IOMMU_WRITE, etc */ + int iommu_prot; + bool prevent_access : 1; + unsigned int num_accesses; +}; + +struct iopt_allowed { + struct interval_tree_node node; +}; + +struct iopt_reserved { + struct interval_tree_node node; + void *owner; +}; + +int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages); +void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages); + +int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain); +void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, + struct iommu_domain *domain); +void iopt_area_unmap_domain(struct iopt_area *area, + struct iommu_domain *domain); + +static inline unsigned long iopt_area_index(struct iopt_area *area) +{ + return area->pages_node.start; +} + +static inline unsigned long iopt_area_last_index(struct iopt_area *area) +{ + return area->pages_node.last; +} + +static inline unsigned long iopt_area_iova(struct iopt_area *area) +{ + return area->node.start; +} + +static inline unsigned long iopt_area_last_iova(struct iopt_area *area) +{ + return area->node.last; +} + +static inline size_t iopt_area_length(struct iopt_area *area) +{ + return (area->node.last - area->node.start) + 1; +} + +/* + * Number of bytes from the start of the iopt_pages that the iova begins. 
+ * iopt_area_start_byte() / PAGE_SIZE encodes the starting page index + * iopt_area_start_byte() % PAGE_SIZE encodes the offset within that page + */ +static inline unsigned long iopt_area_start_byte(struct iopt_area *area, + unsigned long iova) +{ + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(iova < iopt_area_iova(area) || + iova > iopt_area_last_iova(area)); + return (iova - iopt_area_iova(area)) + area->page_offset + + iopt_area_index(area) * PAGE_SIZE; +} + +static inline unsigned long iopt_area_iova_to_index(struct iopt_area *area, + unsigned long iova) +{ + return iopt_area_start_byte(area, iova) / PAGE_SIZE; +} + +#define __make_iopt_iter(name) \ + static inline struct iopt_##name *iopt_##name##_iter_first( \ + struct io_pagetable *iopt, unsigned long start, \ + unsigned long last) \ + { \ + struct interval_tree_node *node; \ + \ + lockdep_assert_held(&iopt->iova_rwsem); \ + node = interval_tree_iter_first(&iopt->name##_itree, start, \ + last); \ + if (!node) \ + return NULL; \ + return container_of(node, struct iopt_##name, node); \ + } \ + static inline struct iopt_##name *iopt_##name##_iter_next( \ + struct iopt_##name *last_node, unsigned long start, \ + unsigned long last) \ + { \ + struct interval_tree_node *node; \ + \ + node = interval_tree_iter_next(&last_node->node, start, last); \ + if (!node) \ + return NULL; \ + return container_of(node, struct iopt_##name, node); \ + } + +__make_iopt_iter(area) +__make_iopt_iter(allowed) +__make_iopt_iter(reserved) + +struct iopt_area_contig_iter { + unsigned long cur_iova; + unsigned long last_iova; + struct iopt_area *area; +}; +struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter, + struct io_pagetable *iopt, + unsigned long iova, + unsigned long last_iova); +struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter); + +static inline bool iopt_area_contig_done(struct iopt_area_contig_iter *iter) +{ + return iter->area && iter->last_iova <= iopt_area_last_iova(iter->area); +} + +/* + * Iterate over a contiguous list of areas that span the iova,last_iova range. + * The caller must check iopt_area_contig_done() after the loop to see if + * contiguous areas existed. + */ +#define iopt_for_each_contig_area(iter, area, iopt, iova, last_iova) \ + for (area = iopt_area_contig_init(iter, iopt, iova, last_iova); area; \ + area = iopt_area_contig_next(iter)) + +enum { + IOPT_PAGES_ACCOUNT_NONE = 0, + IOPT_PAGES_ACCOUNT_USER = 1, + IOPT_PAGES_ACCOUNT_MM = 2, +}; + +/* + * This holds a pinned page list for multiple areas of IO address space. The + * pages always originate from a linear chunk of userspace VA. Multiple + * io_pagetable's, through their iopt_area's, can share a single iopt_pages + * which avoids multi-pinning and double accounting of page consumption. + * + * indexes in this structure are measured in PAGE_SIZE units, are 0 based from + * the start of the uptr and extend to npages. pages are pinned dynamically + * according to the intervals in the access_itree and domains_itree, npinned + * records the current number of pages pinned. 
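iopt_area_start_byte() is the pivot between the two coordinate systems: IOVAs (node) and page indexes into the backing iopt_pages (pages_node), glued together by page_offset. The stand-alone sketch below reruns the same arithmetic with concrete numbers; it assumes 4 KiB pages and uses hypothetical names.

#include <stdio.h>

#define PG_SIZE 4096UL	/* assumes 4 KiB pages for the illustration */

/* Mirrors iopt_area_start_byte(): byte offset into the backing iopt_pages */
static unsigned long start_byte(unsigned long iova, unsigned long area_iova,
				unsigned long page_offset,
				unsigned long first_index)
{
	return (iova - area_iova) + page_offset + first_index * PG_SIZE;
}

int main(void)
{
	/* area begins 0x200 bytes into page index 3 of its iopt_pages */
	unsigned long area_iova = 0x40000000, page_offset = 0x200, first = 3;
	unsigned long iova = area_iova + 0x1000;
	unsigned long byte = start_byte(iova, area_iova, page_offset, first);

	/* byte / PG_SIZE is the page index, byte % PG_SIZE the in-page offset */
	printf("byte %#lx -> page index %lu, offset in page %#lx\n",
	       byte, byte / PG_SIZE, byte % PG_SIZE);
	return 0;
}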
+ */ +struct iopt_pages { + struct kref kref; + struct mutex mutex; + size_t npages; + size_t npinned; + size_t last_npinned; + struct task_struct *source_task; + struct mm_struct *source_mm; + struct user_struct *source_user; + void __user *uptr; + bool writable:1; + u8 account_mode; + + struct xarray pinned_pfns; + /* Of iopt_pages_access::node */ + struct rb_root_cached access_itree; + /* Of iopt_area::pages_node */ + struct rb_root_cached domains_itree; +}; + +struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, + bool writable); +void iopt_release_pages(struct kref *kref); +static inline void iopt_put_pages(struct iopt_pages *pages) +{ + kref_put(&pages->kref, iopt_release_pages); +} + +void iopt_pages_fill_from_xarray(struct iopt_pages *pages, unsigned long start, + unsigned long last, struct page **out_pages); +int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start, + unsigned long last, struct page **out_pages); +void iopt_pages_unfill_xarray(struct iopt_pages *pages, unsigned long start, + unsigned long last); + +int iopt_area_add_access(struct iopt_area *area, unsigned long start, + unsigned long last, struct page **out_pages, + unsigned int flags); +void iopt_area_remove_access(struct iopt_area *area, unsigned long start, + unsigned long last); +int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, + void *data, unsigned long length, unsigned int flags); + +/* + * Each interval represents an active iopt_access_pages(), it acts as an + * interval lock that keeps the PFNs pinned and stored in the xarray. + */ +struct iopt_pages_access { + struct interval_tree_node node; + unsigned int users; +}; + +#endif diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c new file mode 100644 index 000000000000..31577e9d434f --- /dev/null +++ b/drivers/iommu/iommufd/ioas.c @@ -0,0 +1,398 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#include <linux/interval_tree.h> +#include <linux/iommufd.h> +#include <linux/iommu.h> +#include <uapi/linux/iommufd.h> + +#include "io_pagetable.h" + +void iommufd_ioas_destroy(struct iommufd_object *obj) +{ + struct iommufd_ioas *ioas = container_of(obj, struct iommufd_ioas, obj); + int rc; + + rc = iopt_unmap_all(&ioas->iopt, NULL); + WARN_ON(rc && rc != -ENOENT); + iopt_destroy_table(&ioas->iopt); + mutex_destroy(&ioas->mutex); +} + +struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx) +{ + struct iommufd_ioas *ioas; + + ioas = iommufd_object_alloc(ictx, ioas, IOMMUFD_OBJ_IOAS); + if (IS_ERR(ioas)) + return ioas; + + iopt_init_table(&ioas->iopt); + INIT_LIST_HEAD(&ioas->hwpt_list); + mutex_init(&ioas->mutex); + return ioas; +} + +int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_alloc *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + int rc; + + if (cmd->flags) + return -EOPNOTSUPP; + + ioas = iommufd_ioas_alloc(ucmd->ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + cmd->out_ioas_id = ioas->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_table; + iommufd_object_finalize(ucmd->ictx, &ioas->obj); + return 0; + +out_table: + iommufd_object_abort_and_destroy(ucmd->ictx, &ioas->obj); + return rc; +} + +int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd) +{ + struct iommu_iova_range __user *ranges; + struct iommu_ioas_iova_ranges *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + struct interval_tree_span_iter span; + u32 max_iovas; + int 
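iommufd_ioas_alloc_ioctl() above is reached from user space through IOMMU_IOAS_ALLOC on the /dev/iommu character device registered later in this patch. A hedged calling sketch follows; the field names match the handler, but the exact structure layout (including the leading size member read by the ioctl dispatcher) comes from the uAPI header, which is not part of this hunk.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

int main(void)
{
	struct iommu_ioas_alloc cmd = {
		.size = sizeof(cmd),	/* read first by the ioctl dispatcher */
		.flags = 0,		/* any set flag is rejected above */
	};
	int fd = open("/dev/iommu", O_RDWR);

	if (fd < 0 || ioctl(fd, IOMMU_IOAS_ALLOC, &cmd)) {
		perror("iommufd");
		return 1;
	}
	printf("new IOAS id %u\n", cmd.out_ioas_id);
	return 0;
}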
rc; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + down_read(&ioas->iopt.iova_rwsem); + max_iovas = cmd->num_iovas; + ranges = u64_to_user_ptr(cmd->allowed_iovas); + cmd->num_iovas = 0; + cmd->out_iova_alignment = ioas->iopt.iova_alignment; + interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0, + ULONG_MAX) { + if (!span.is_hole) + continue; + if (cmd->num_iovas < max_iovas) { + struct iommu_iova_range elm = { + .start = span.start_hole, + .last = span.last_hole, + }; + + if (copy_to_user(&ranges[cmd->num_iovas], &elm, + sizeof(elm))) { + rc = -EFAULT; + goto out_put; + } + } + cmd->num_iovas++; + } + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_put; + if (cmd->num_iovas > max_iovas) + rc = -EMSGSIZE; +out_put: + up_read(&ioas->iopt.iova_rwsem); + iommufd_put_object(&ioas->obj); + return rc; +} + +static int iommufd_ioas_load_iovas(struct rb_root_cached *itree, + struct iommu_iova_range __user *ranges, + u32 num) +{ + u32 i; + + for (i = 0; i != num; i++) { + struct iommu_iova_range range; + struct iopt_allowed *allowed; + + if (copy_from_user(&range, ranges + i, sizeof(range))) + return -EFAULT; + + if (range.start >= range.last) + return -EINVAL; + + if (interval_tree_iter_first(itree, range.start, range.last)) + return -EINVAL; + + allowed = kzalloc(sizeof(*allowed), GFP_KERNEL_ACCOUNT); + if (!allowed) + return -ENOMEM; + allowed->node.start = range.start; + allowed->node.last = range.last; + + interval_tree_insert(&allowed->node, itree); + } + return 0; +} + +int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_allow_iovas *cmd = ucmd->cmd; + struct rb_root_cached allowed_iova = RB_ROOT_CACHED; + struct interval_tree_node *node; + struct iommufd_ioas *ioas; + struct io_pagetable *iopt; + int rc = 0; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + iopt = &ioas->iopt; + + rc = iommufd_ioas_load_iovas(&allowed_iova, + u64_to_user_ptr(cmd->allowed_iovas), + cmd->num_iovas); + if (rc) + goto out_free; + + /* + * We want the allowed tree update to be atomic, so we have to keep the + * original nodes around, and keep track of the new nodes as we allocate + * memory for them. The simplest solution is to have a new/old tree and + * then swap new for old. On success we free the old tree, on failure we + * free the new tree. + */ + rc = iopt_set_allow_iova(iopt, &allowed_iova); +out_free: + while ((node = interval_tree_iter_first(&allowed_iova, 0, ULONG_MAX))) { + interval_tree_remove(node, &allowed_iova); + kfree(container_of(node, struct iopt_allowed, node)); + } + iommufd_put_object(&ioas->obj); + return rc; +} + +static int conv_iommu_prot(u32 map_flags) +{ + /* + * We provide no manual cache coherency ioctls to userspace and most + * architectures make the CPU ops for cache flushing privileged. + * Therefore we require the underlying IOMMU to support CPU coherent + * operation. Support for IOMMU_CACHE is enforced by the + * IOMMU_CAP_CACHE_COHERENCY test during bind. 
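Because iommufd_ioas_iova_ranges() reports the true number of ranges and fails with EMSGSIZE when the caller's buffer was too small, user space can probe with num_iovas == 0 and then retry with an exact-size buffer. The sketch below shows that two-pass pattern; fd and ioas_id are assumed to come from an earlier IOMMU_IOAS_ALLOC, the structure layout is taken from the uAPI header, and the racy case of the range list growing between the two calls is ignored.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

static int dump_iova_ranges(int fd, unsigned int ioas_id)
{
	struct iommu_ioas_iova_ranges cmd = {
		.size = sizeof(cmd),
		.ioas_id = ioas_id,
		.num_iovas = 0,		/* pass 1: only learn the count */
	};
	struct iommu_iova_range *ranges;
	unsigned int i;

	if (ioctl(fd, IOMMU_IOAS_IOVA_RANGES, &cmd) && errno != EMSGSIZE)
		return -1;
	if (!cmd.num_iovas)
		return 0;

	ranges = calloc(cmd.num_iovas, sizeof(*ranges));
	if (!ranges)
		return -1;
	cmd.allowed_iovas = (unsigned long long)(uintptr_t)ranges;
	if (ioctl(fd, IOMMU_IOAS_IOVA_RANGES, &cmd)) {	/* pass 2: fill it */
		free(ranges);
		return -1;
	}
	printf("alignment %llu\n", (unsigned long long)cmd.out_iova_alignment);
	for (i = 0; i != cmd.num_iovas; i++)
		printf("  [%#llx, %#llx]\n",
		       (unsigned long long)ranges[i].start,
		       (unsigned long long)ranges[i].last);
	free(ranges);
	return 0;
}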
+ */ + int iommu_prot = IOMMU_CACHE; + + if (map_flags & IOMMU_IOAS_MAP_WRITEABLE) + iommu_prot |= IOMMU_WRITE; + if (map_flags & IOMMU_IOAS_MAP_READABLE) + iommu_prot |= IOMMU_READ; + return iommu_prot; +} + +int iommufd_ioas_map(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_map *cmd = ucmd->cmd; + unsigned long iova = cmd->iova; + struct iommufd_ioas *ioas; + unsigned int flags = 0; + int rc; + + if ((cmd->flags & + ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE)) || + cmd->__reserved) + return -EOPNOTSUPP; + if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) + return -EOVERFLOW; + + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) + flags = IOPT_ALLOC_IOVA; + rc = iopt_map_user_pages(ucmd->ictx, &ioas->iopt, &iova, + u64_to_user_ptr(cmd->user_va), cmd->length, + conv_iommu_prot(cmd->flags), flags); + if (rc) + goto out_put; + + cmd->iova = iova; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); +out_put: + iommufd_put_object(&ioas->obj); + return rc; +} + +int iommufd_ioas_copy(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_copy *cmd = ucmd->cmd; + struct iommufd_ioas *src_ioas; + struct iommufd_ioas *dst_ioas; + unsigned int flags = 0; + LIST_HEAD(pages_list); + unsigned long iova; + int rc; + + iommufd_test_syz_conv_iova_id(ucmd, cmd->src_ioas_id, &cmd->src_iova, + &cmd->flags); + + if ((cmd->flags & + ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE))) + return -EOPNOTSUPP; + if (cmd->length >= ULONG_MAX || cmd->src_iova >= ULONG_MAX || + cmd->dst_iova >= ULONG_MAX) + return -EOVERFLOW; + + src_ioas = iommufd_get_ioas(ucmd, cmd->src_ioas_id); + if (IS_ERR(src_ioas)) + return PTR_ERR(src_ioas); + rc = iopt_get_pages(&src_ioas->iopt, cmd->src_iova, cmd->length, + &pages_list); + iommufd_put_object(&src_ioas->obj); + if (rc) + return rc; + + dst_ioas = iommufd_get_ioas(ucmd, cmd->dst_ioas_id); + if (IS_ERR(dst_ioas)) { + rc = PTR_ERR(dst_ioas); + goto out_pages; + } + + if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) + flags = IOPT_ALLOC_IOVA; + iova = cmd->dst_iova; + rc = iopt_map_pages(&dst_ioas->iopt, &pages_list, cmd->length, &iova, + conv_iommu_prot(cmd->flags), flags); + if (rc) + goto out_put_dst; + + cmd->dst_iova = iova; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); +out_put_dst: + iommufd_put_object(&dst_ioas->obj); +out_pages: + iopt_free_pages_list(&pages_list); + return rc; +} + +int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_unmap *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + unsigned long unmapped = 0; + int rc; + + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + if (cmd->iova == 0 && cmd->length == U64_MAX) { + rc = iopt_unmap_all(&ioas->iopt, &unmapped); + if (rc) + goto out_put; + } else { + if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) { + rc = -EOVERFLOW; + goto out_put; + } + rc = iopt_unmap_iova(&ioas->iopt, cmd->iova, cmd->length, + &unmapped); + if (rc) + goto out_put; + } + + cmd->length = unmapped; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + +out_put: + iommufd_put_object(&ioas->obj); + return rc; +} + +int iommufd_option_rlimit_mode(struct iommu_option *cmd, + struct iommufd_ctx *ictx) +{ + if (cmd->object_id) + return -EOPNOTSUPP; + + if (cmd->op == IOMMU_OPTION_OP_GET) { + cmd->val64 = ictx->account_mode == IOPT_PAGES_ACCOUNT_MM; + return 0; + } + if (cmd->op == IOMMU_OPTION_OP_SET) { + int rc = 
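conv_iommu_prot() above means a mapping always gets IOMMU_CACHE plus whichever of IOMMU_READ/IOMMU_WRITE the caller asked for, and when IOMMU_IOAS_MAP_FIXED_IOVA is absent the kernel chooses the IOVA and writes it back into the command. A hedged user-space sketch, with fd and ioas_id assumed to come from earlier ioctls and the structure layout from the uAPI header:

#include <stddef.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

/* buf must stay valid for as long as the mapping exists. */
static int map_buffer(int fd, unsigned int ioas_id, void *buf, size_t len,
		      unsigned long long *iova)
{
	struct iommu_ioas_map cmd = {
		.size = sizeof(cmd),
		.flags = IOMMU_IOAS_MAP_READABLE | IOMMU_IOAS_MAP_WRITEABLE,
		/* no IOMMU_IOAS_MAP_FIXED_IOVA: the kernel picks the IOVA */
		.ioas_id = ioas_id,
		.user_va = (uintptr_t)buf,
		.length = len,
	};

	if (ioctl(fd, IOMMU_IOAS_MAP, &cmd))
		return -1;
	*iova = cmd.iova;	/* the chosen IOVA is written back */
	return 0;
}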
0; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + xa_lock(&ictx->objects); + if (!xa_empty(&ictx->objects)) { + rc = -EBUSY; + } else { + if (cmd->val64 == 0) + ictx->account_mode = IOPT_PAGES_ACCOUNT_USER; + else if (cmd->val64 == 1) + ictx->account_mode = IOPT_PAGES_ACCOUNT_MM; + else + rc = -EINVAL; + } + xa_unlock(&ictx->objects); + + return rc; + } + return -EOPNOTSUPP; +} + +static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd, + struct iommufd_ioas *ioas) +{ + if (cmd->op == IOMMU_OPTION_OP_GET) { + cmd->val64 = !ioas->iopt.disable_large_pages; + return 0; + } + if (cmd->op == IOMMU_OPTION_OP_SET) { + if (cmd->val64 == 0) + return iopt_disable_large_pages(&ioas->iopt); + if (cmd->val64 == 1) { + iopt_enable_large_pages(&ioas->iopt); + return 0; + } + return -EINVAL; + } + return -EOPNOTSUPP; +} + +int iommufd_ioas_option(struct iommufd_ucmd *ucmd) +{ + struct iommu_option *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + int rc = 0; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + ioas = iommufd_get_ioas(ucmd, cmd->object_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + switch (cmd->option_id) { + case IOMMU_OPTION_HUGE_PAGES: + rc = iommufd_ioas_option_huge_pages(cmd, ioas); + break; + default: + rc = -EOPNOTSUPP; + } + + iommufd_put_object(&ioas->obj); + return rc; +} diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h new file mode 100644 index 000000000000..222e86591f8a --- /dev/null +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -0,0 +1,307 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#ifndef __IOMMUFD_PRIVATE_H +#define __IOMMUFD_PRIVATE_H + +#include <linux/rwsem.h> +#include <linux/xarray.h> +#include <linux/refcount.h> +#include <linux/uaccess.h> + +struct iommu_domain; +struct iommu_group; +struct iommu_option; + +struct iommufd_ctx { + struct file *file; + struct xarray objects; + + u8 account_mode; + struct iommufd_ioas *vfio_ioas; +}; + +/* + * The IOVA to PFN map. The map automatically copies the PFNs into multiple + * domains and permits sharing of PFNs between io_pagetable instances. This + * supports both a design where IOAS's are 1:1 with a domain (eg because the + * domain is HW customized), or where the IOAS is 1:N with multiple generic + * domains. The io_pagetable holds an interval tree of iopt_areas which point + * to shared iopt_pages which hold the pfns mapped to the page table. 
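IOMMU_OPTION_HUGE_PAGES is a per-IOAS knob (object_id carries the IOAS id), while IOMMU_OPTION_RLIMIT_MODE is global, requires CAP_SYS_RESOURCE and can only be flipped while no objects exist. The sketch below disables huge pages on one IOAS, for example so that later iopt_cut_iova() splits remain possible; per the handler this must happen before any attached domain holds mappings. Field names come from the handlers above; the structure layout is assumed from the uAPI header.

#include <sys/ioctl.h>
#include <linux/iommufd.h>

static int ioas_disable_huge_pages(int fd, unsigned int ioas_id)
{
	struct iommu_option cmd = {
		.size = sizeof(cmd),
		.option_id = IOMMU_OPTION_HUGE_PAGES,
		.op = IOMMU_OPTION_OP_SET,
		.object_id = ioas_id,	/* this option is per-IOAS */
		.val64 = 0,		/* 0 = disabled, 1 = enabled */
	};

	return ioctl(fd, IOMMU_OPTION, &cmd);
}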
+ * + * The locking order is domains_rwsem -> iova_rwsem -> pages::mutex + */ +struct io_pagetable { + struct rw_semaphore domains_rwsem; + struct xarray domains; + struct xarray access_list; + unsigned int next_domain_id; + + struct rw_semaphore iova_rwsem; + struct rb_root_cached area_itree; + /* IOVA that cannot become reserved, struct iopt_allowed */ + struct rb_root_cached allowed_itree; + /* IOVA that cannot be allocated, struct iopt_reserved */ + struct rb_root_cached reserved_itree; + u8 disable_large_pages; + unsigned long iova_alignment; +}; + +void iopt_init_table(struct io_pagetable *iopt); +void iopt_destroy_table(struct io_pagetable *iopt); +int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova, + unsigned long length, struct list_head *pages_list); +void iopt_free_pages_list(struct list_head *pages_list); +enum { + IOPT_ALLOC_IOVA = 1 << 0, +}; +int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, + unsigned long *iova, void __user *uptr, + unsigned long length, int iommu_prot, + unsigned int flags); +int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, + unsigned long length, unsigned long *dst_iova, + int iommu_prot, unsigned int flags); +int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, + unsigned long length, unsigned long *unmapped); +int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped); + +void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, + unsigned long length); +int iopt_table_add_domain(struct io_pagetable *iopt, + struct iommu_domain *domain); +void iopt_table_remove_domain(struct io_pagetable *iopt, + struct iommu_domain *domain); +int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt, + struct device *device, + struct iommu_group *group, + phys_addr_t *sw_msi_start); +int iopt_set_allow_iova(struct io_pagetable *iopt, + struct rb_root_cached *allowed_iova); +int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start, + unsigned long last, void *owner); +void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner); +int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas, + size_t num_iovas); +void iopt_enable_large_pages(struct io_pagetable *iopt); +int iopt_disable_large_pages(struct io_pagetable *iopt); + +struct iommufd_ucmd { + struct iommufd_ctx *ictx; + void __user *ubuffer; + u32 user_size; + void *cmd; +}; + +int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd, + unsigned long arg); + +/* Copy the response in ucmd->cmd back to userspace. */ +static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd, + size_t cmd_len) +{ + if (copy_to_user(ucmd->ubuffer, ucmd->cmd, + min_t(size_t, ucmd->user_size, cmd_len))) + return -EFAULT; + return 0; +} + +enum iommufd_object_type { + IOMMUFD_OBJ_NONE, + IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, + IOMMUFD_OBJ_DEVICE, + IOMMUFD_OBJ_HW_PAGETABLE, + IOMMUFD_OBJ_IOAS, + IOMMUFD_OBJ_ACCESS, +#ifdef CONFIG_IOMMUFD_TEST + IOMMUFD_OBJ_SELFTEST, +#endif +}; + +/* Base struct for all objects with a userspace ID handle. 
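The comment above fixes the only safe nesting: domains_rwsem, then iova_rwsem, then the iopt_pages mutex, released in reverse. The stand-alone pthread model below is illustrative only (the real code uses kernel rwsems and a mutex); it simply restates the ordering discipline that writers such as iopt_disable_large_pages() follow.

#include <pthread.h>
#include <stdio.h>

/* Stand-in locks modelling io_pagetable::domains_rwsem, ::iova_rwsem and
 * iopt_pages::mutex. */
static pthread_rwlock_t domains_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t iova_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t pages_mutex = PTHREAD_MUTEX_INITIALIZER;

static void writer_path(void)
{
	/* Always nest in the documented order ... */
	pthread_rwlock_wrlock(&domains_rwsem);
	pthread_rwlock_wrlock(&iova_rwsem);
	pthread_mutex_lock(&pages_mutex);

	puts("holding domains -> iova -> pages");

	/* ... and release in the reverse order. */
	pthread_mutex_unlock(&pages_mutex);
	pthread_rwlock_unlock(&iova_rwsem);
	pthread_rwlock_unlock(&domains_rwsem);
}

int main(void)
{
	writer_path();
	return 0;
}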
*/ +struct iommufd_object { + struct rw_semaphore destroy_rwsem; + refcount_t users; + enum iommufd_object_type type; + unsigned int id; +}; + +static inline bool iommufd_lock_obj(struct iommufd_object *obj) +{ + if (!down_read_trylock(&obj->destroy_rwsem)) + return false; + if (!refcount_inc_not_zero(&obj->users)) { + up_read(&obj->destroy_rwsem); + return false; + } + return true; +} + +struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, + enum iommufd_object_type type); +static inline void iommufd_put_object(struct iommufd_object *obj) +{ + refcount_dec(&obj->users); + up_read(&obj->destroy_rwsem); +} + +/** + * iommufd_ref_to_users() - Switch from destroy_rwsem to users refcount + * protection + * @obj - Object to release + * + * Objects have two refcount protections (destroy_rwsem and the refcount_t + * users). Holding either of these will prevent the object from being destroyed. + * + * Depending on the use case, one protection or the other is appropriate. In + * most cases references are being protected by the destroy_rwsem. This allows + * orderly destruction of the object because iommufd_object_destroy_user() will + * wait for it to become unlocked. However, as a rwsem, it cannot be held across + * a system call return. So cases that have longer term needs must switch + * to the weaker users refcount_t. + * + * With users protection iommufd_object_destroy_user() will return false, + * refusing to destroy the object, causing -EBUSY to userspace. + */ +static inline void iommufd_ref_to_users(struct iommufd_object *obj) +{ + up_read(&obj->destroy_rwsem); + /* iommufd_lock_obj() obtains users as well */ +} +void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj); +void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, + struct iommufd_object *obj); +void iommufd_object_finalize(struct iommufd_ctx *ictx, + struct iommufd_object *obj); +bool iommufd_object_destroy_user(struct iommufd_ctx *ictx, + struct iommufd_object *obj); +struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, + size_t size, + enum iommufd_object_type type); + +#define iommufd_object_alloc(ictx, ptr, type) \ + container_of(_iommufd_object_alloc( \ + ictx, \ + sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \ + offsetof(typeof(*(ptr)), \ + obj) != 0), \ + type), \ + typeof(*(ptr)), obj) + +/* + * The IO Address Space (IOAS) pagetable is a virtual page table backed by the + * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The + * mapping is copied into all of the associated domains and made available to + * in-kernel users. + * + * Every iommu_domain that is created is wrapped in a iommufd_hw_pagetable + * object. When we go to attach a device to an IOAS we need to get an + * iommu_domain and wrapping iommufd_hw_pagetable for it. + * + * An iommu_domain & iommfd_hw_pagetable will be automatically selected + * for a device based on the hwpt_list. If no suitable iommu_domain + * is found a new iommu_domain will be created. 
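iommufd_object_alloc() only works because every user-visible object embeds struct iommufd_object as its first member, which the BUILD_BUG_ON_ZERO() in the macro enforces, so generic code can pass around the header and the owner recovers its container. The stand-alone sketch below shows the same embedding trick with trimmed, hypothetical types.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* Trimmed stand-ins for the kernel types; illustrative only. */
struct obj_hdr { unsigned int id; };

struct my_ioas {
	struct obj_hdr obj;	/* must stay the first member */
	int payload;
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
	struct my_ioas *ioas = calloc(1, sizeof(*ioas));
	struct obj_hdr *hdr;

	if (!ioas)
		return 1;
	ioas->obj.id = 42;
	ioas->payload = 7;

	/* Generic code sees only the header ... */
	hdr = &ioas->obj;
	/* ... and the owner recovers its container, as iommufd_get_ioas() does. */
	printf("id %u payload %d\n", hdr->id,
	       container_of(hdr, struct my_ioas, obj)->payload);
	free(ioas);
	return 0;
}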
+ */ +struct iommufd_ioas { + struct iommufd_object obj; + struct io_pagetable iopt; + struct mutex mutex; + struct list_head hwpt_list; +}; + +static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ucmd *ucmd, + u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_IOAS), + struct iommufd_ioas, obj); +} + +struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx); +int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd); +void iommufd_ioas_destroy(struct iommufd_object *obj); +int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd); +int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd); +int iommufd_ioas_map(struct iommufd_ucmd *ucmd); +int iommufd_ioas_copy(struct iommufd_ucmd *ucmd); +int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd); +int iommufd_ioas_option(struct iommufd_ucmd *ucmd); +int iommufd_option_rlimit_mode(struct iommu_option *cmd, + struct iommufd_ctx *ictx); + +int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd); + +/* + * A HW pagetable is called an iommu_domain inside the kernel. This user object + * allows directly creating and inspecting the domains. Domains that have kernel + * owned page tables will be associated with an iommufd_ioas that provides the + * IOVA to PFN map. + */ +struct iommufd_hw_pagetable { + struct iommufd_object obj; + struct iommufd_ioas *ioas; + struct iommu_domain *domain; + bool auto_domain : 1; + bool enforce_cache_coherency : 1; + bool msi_cookie : 1; + /* Head at iommufd_ioas::hwpt_list */ + struct list_head hwpt_item; + struct mutex devices_lock; + struct list_head devices; +}; + +struct iommufd_hw_pagetable * +iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, + struct device *dev); +void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); + +void iommufd_device_destroy(struct iommufd_object *obj); + +struct iommufd_access { + struct iommufd_object obj; + struct iommufd_ctx *ictx; + struct iommufd_ioas *ioas; + const struct iommufd_access_ops *ops; + void *data; + unsigned long iova_alignment; + u32 iopt_access_list_id; +}; + +int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access); +void iopt_remove_access(struct io_pagetable *iopt, + struct iommufd_access *access); +void iommufd_access_destroy_object(struct iommufd_object *obj); + +#ifdef CONFIG_IOMMUFD_TEST +struct iommufd_hw_pagetable * +iommufd_device_selftest_attach(struct iommufd_ctx *ictx, + struct iommufd_ioas *ioas, + struct device *mock_dev); +void iommufd_device_selftest_detach(struct iommufd_ctx *ictx, + struct iommufd_hw_pagetable *hwpt); +int iommufd_test(struct iommufd_ucmd *ucmd); +void iommufd_selftest_destroy(struct iommufd_object *obj); +extern size_t iommufd_test_memory_limit; +void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, + unsigned int ioas_id, u64 *iova, u32 *flags); +bool iommufd_should_fail(void); +void __init iommufd_test_init(void); +void iommufd_test_exit(void); +#else +static inline void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, + unsigned int ioas_id, + u64 *iova, u32 *flags) +{ +} +static inline bool iommufd_should_fail(void) +{ + return false; +} +static inline void __init iommufd_test_init(void) +{ +} +static inline void iommufd_test_exit(void) +{ +} +#endif +#endif diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h new file mode 100644 index 000000000000..1d96a8f466fd --- /dev/null +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* 
Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + */ +#ifndef _UAPI_IOMMUFD_TEST_H +#define _UAPI_IOMMUFD_TEST_H + +#include <linux/types.h> +#include <linux/iommufd.h> + +enum { + IOMMU_TEST_OP_ADD_RESERVED = 1, + IOMMU_TEST_OP_MOCK_DOMAIN, + IOMMU_TEST_OP_MD_CHECK_MAP, + IOMMU_TEST_OP_MD_CHECK_REFS, + IOMMU_TEST_OP_CREATE_ACCESS, + IOMMU_TEST_OP_DESTROY_ACCESS_PAGES, + IOMMU_TEST_OP_ACCESS_PAGES, + IOMMU_TEST_OP_ACCESS_RW, + IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT, +}; + +enum { + MOCK_APERTURE_START = 1UL << 24, + MOCK_APERTURE_LAST = (1UL << 31) - 1, +}; + +enum { + MOCK_FLAGS_ACCESS_WRITE = 1 << 0, + MOCK_FLAGS_ACCESS_SYZ = 1 << 16, +}; + +enum { + MOCK_ACCESS_RW_WRITE = 1 << 0, + MOCK_ACCESS_RW_SLOW_PATH = 1 << 2, +}; + +enum { + MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES = 1 << 0, +}; + +struct iommu_test_cmd { + __u32 size; + __u32 op; + __u32 id; + __u32 __reserved; + union { + struct { + __aligned_u64 start; + __aligned_u64 length; + } add_reserved; + struct { + __u32 out_device_id; + __u32 out_hwpt_id; + } mock_domain; + struct { + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 uptr; + } check_map; + struct { + __aligned_u64 length; + __aligned_u64 uptr; + __u32 refs; + } check_refs; + struct { + __u32 out_access_fd; + __u32 flags; + } create_access; + struct { + __u32 access_pages_id; + } destroy_access_pages; + struct { + __u32 flags; + __u32 out_access_pages_id; + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 uptr; + } access_pages; + struct { + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 uptr; + __u32 flags; + } access_rw; + struct { + __u32 limit; + } memory_limit; + }; + __u32 last; +}; +#define IOMMU_TEST_CMD _IO(IOMMUFD_TYPE, IOMMUFD_CMD_BASE + 32) + +#endif diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c new file mode 100644 index 000000000000..083e6fcbe10a --- /dev/null +++ b/drivers/iommu/iommufd/main.c @@ -0,0 +1,460 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2021 Intel Corporation + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + * + * iommufd provides control over the IOMMU HW objects created by IOMMU kernel + * drivers. IOMMU HW objects revolve around IO page tables that map incoming DMA + * addresses (IOVA) to CPU addresses. + */ +#define pr_fmt(fmt) "iommufd: " fmt + +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/miscdevice.h> +#include <linux/mutex.h> +#include <linux/bug.h> +#include <uapi/linux/iommufd.h> +#include <linux/iommufd.h> + +#include "io_pagetable.h" +#include "iommufd_private.h" +#include "iommufd_test.h" + +struct iommufd_object_ops { + void (*destroy)(struct iommufd_object *obj); +}; +static const struct iommufd_object_ops iommufd_object_ops[]; +static struct miscdevice vfio_misc_dev; + +struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, + size_t size, + enum iommufd_object_type type) +{ + struct iommufd_object *obj; + int rc; + + obj = kzalloc(size, GFP_KERNEL_ACCOUNT); + if (!obj) + return ERR_PTR(-ENOMEM); + obj->type = type; + init_rwsem(&obj->destroy_rwsem); + refcount_set(&obj->users, 1); + + /* + * Reserve an ID in the xarray but do not publish the pointer yet since + * the caller hasn't initialized it yet. Once the pointer is published + * in the xarray and visible to other threads we can't reliably destroy + * it anymore, so the caller must complete all errorable operations + * before calling iommufd_object_finalize(). 
+ */ + rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, + xa_limit_32b, GFP_KERNEL_ACCOUNT); + if (rc) + goto out_free; + return obj; +out_free: + kfree(obj); + return ERR_PTR(rc); +} + +/* + * Allow concurrent access to the object. + * + * Once another thread can see the object pointer it can prevent object + * destruction. Expect for special kernel-only objects there is no in-kernel way + * to reliably destroy a single object. Thus all APIs that are creating objects + * must use iommufd_object_abort() to handle their errors and only call + * iommufd_object_finalize() once object creation cannot fail. + */ +void iommufd_object_finalize(struct iommufd_ctx *ictx, + struct iommufd_object *obj) +{ + void *old; + + old = xa_store(&ictx->objects, obj->id, obj, GFP_KERNEL); + /* obj->id was returned from xa_alloc() so the xa_store() cannot fail */ + WARN_ON(old); +} + +/* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */ +void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj) +{ + void *old; + + old = xa_erase(&ictx->objects, obj->id); + WARN_ON(old); + kfree(obj); +} + +/* + * Abort an object that has been fully initialized and needs destroy, but has + * not been finalized. + */ +void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, + struct iommufd_object *obj) +{ + iommufd_object_ops[obj->type].destroy(obj); + iommufd_object_abort(ictx, obj); +} + +struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, + enum iommufd_object_type type) +{ + struct iommufd_object *obj; + + if (iommufd_should_fail()) + return ERR_PTR(-ENOENT); + + xa_lock(&ictx->objects); + obj = xa_load(&ictx->objects, id); + if (!obj || (type != IOMMUFD_OBJ_ANY && obj->type != type) || + !iommufd_lock_obj(obj)) + obj = ERR_PTR(-ENOENT); + xa_unlock(&ictx->objects); + return obj; +} + +/* + * The caller holds a users refcount and wants to destroy the object. Returns + * true if the object was destroyed. In all cases the caller no longer has a + * reference on obj. + */ +bool iommufd_object_destroy_user(struct iommufd_ctx *ictx, + struct iommufd_object *obj) +{ + /* + * The purpose of the destroy_rwsem is to ensure deterministic + * destruction of objects used by external drivers and destroyed by this + * function. Any temporary increment of the refcount must hold the read + * side of this, such as during ioctl execution. 
+ */ + down_write(&obj->destroy_rwsem); + xa_lock(&ictx->objects); + refcount_dec(&obj->users); + if (!refcount_dec_if_one(&obj->users)) { + xa_unlock(&ictx->objects); + up_write(&obj->destroy_rwsem); + return false; + } + __xa_erase(&ictx->objects, obj->id); + if (ictx->vfio_ioas && &ictx->vfio_ioas->obj == obj) + ictx->vfio_ioas = NULL; + xa_unlock(&ictx->objects); + up_write(&obj->destroy_rwsem); + + iommufd_object_ops[obj->type].destroy(obj); + kfree(obj); + return true; +} + +static int iommufd_destroy(struct iommufd_ucmd *ucmd) +{ + struct iommu_destroy *cmd = ucmd->cmd; + struct iommufd_object *obj; + + obj = iommufd_get_object(ucmd->ictx, cmd->id, IOMMUFD_OBJ_ANY); + if (IS_ERR(obj)) + return PTR_ERR(obj); + iommufd_ref_to_users(obj); + /* See iommufd_ref_to_users() */ + if (!iommufd_object_destroy_user(ucmd->ictx, obj)) + return -EBUSY; + return 0; +} + +static int iommufd_fops_open(struct inode *inode, struct file *filp) +{ + struct iommufd_ctx *ictx; + + ictx = kzalloc(sizeof(*ictx), GFP_KERNEL_ACCOUNT); + if (!ictx) + return -ENOMEM; + + /* + * For compatibility with VFIO when /dev/vfio/vfio is opened we default + * to the same rlimit accounting as vfio uses. + */ + if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER) && + filp->private_data == &vfio_misc_dev) { + ictx->account_mode = IOPT_PAGES_ACCOUNT_MM; + pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n"); + } + + xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); + ictx->file = filp; + filp->private_data = ictx; + return 0; +} + +static int iommufd_fops_release(struct inode *inode, struct file *filp) +{ + struct iommufd_ctx *ictx = filp->private_data; + struct iommufd_object *obj; + + /* + * The objects in the xarray form a graph of "users" counts, and we have + * to destroy them in a depth first manner. Leaf objects will reduce the + * users count of interior objects when they are destroyed. + * + * Repeatedly destroying all the "1 users" leaf objects will progress + * until the entire list is destroyed. If this can't progress then there + * is some bug related to object refcounting. 
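From user space the path above is IOMMU_DESTROY: if anything still holds a users reference the object is left alone and the ioctl fails with EBUSY, so callers detach or tear down dependants first and retry. A hedged sketch, with obj_id assumed to come from an earlier allocation and the structure layout from the uAPI header:

#include <errno.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

static int destroy_object(int fd, unsigned int obj_id)
{
	struct iommu_destroy cmd = {
		.size = sizeof(cmd),
		.id = obj_id,
	};

	if (ioctl(fd, IOMMU_DESTROY, &cmd)) {
		if (errno == EBUSY)
			fprintf(stderr, "object %u still in use\n", obj_id);
		return -1;
	}
	return 0;
}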
+ */ + while (!xa_empty(&ictx->objects)) { + unsigned int destroyed = 0; + unsigned long index; + + xa_for_each(&ictx->objects, index, obj) { + if (!refcount_dec_if_one(&obj->users)) + continue; + destroyed++; + xa_erase(&ictx->objects, index); + iommufd_object_ops[obj->type].destroy(obj); + kfree(obj); + } + /* Bug related to users refcount */ + if (WARN_ON(!destroyed)) + break; + } + kfree(ictx); + return 0; +} + +static int iommufd_option(struct iommufd_ucmd *ucmd) +{ + struct iommu_option *cmd = ucmd->cmd; + int rc; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + switch (cmd->option_id) { + case IOMMU_OPTION_RLIMIT_MODE: + rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx); + break; + case IOMMU_OPTION_HUGE_PAGES: + rc = iommufd_ioas_option(ucmd); + break; + default: + return -EOPNOTSUPP; + } + if (rc) + return rc; + if (copy_to_user(&((struct iommu_option __user *)ucmd->ubuffer)->val64, + &cmd->val64, sizeof(cmd->val64))) + return -EFAULT; + return 0; +} + +union ucmd_buffer { + struct iommu_destroy destroy; + struct iommu_ioas_alloc alloc; + struct iommu_ioas_allow_iovas allow_iovas; + struct iommu_ioas_iova_ranges iova_ranges; + struct iommu_ioas_map map; + struct iommu_ioas_unmap unmap; +#ifdef CONFIG_IOMMUFD_TEST + struct iommu_test_cmd test; +#endif +}; + +struct iommufd_ioctl_op { + unsigned int size; + unsigned int min_size; + unsigned int ioctl_num; + int (*execute)(struct iommufd_ucmd *ucmd); +}; + +#define IOCTL_OP(_ioctl, _fn, _struct, _last) \ + [_IOC_NR(_ioctl) - IOMMUFD_CMD_BASE] = { \ + .size = sizeof(_struct) + \ + BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \ + sizeof(_struct)), \ + .min_size = offsetofend(_struct, _last), \ + .ioctl_num = _ioctl, \ + .execute = _fn, \ + } +static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { + IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id), + IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl, + struct iommu_ioas_alloc, out_ioas_id), + IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas, + struct iommu_ioas_allow_iovas, allowed_iovas), + IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy, + src_iova), + IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges, + struct iommu_ioas_iova_ranges, out_iova_alignment), + IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map, + iova), + IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap, + length), + IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, + val64), + IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas, + __reserved), +#ifdef CONFIG_IOMMUFD_TEST + IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last), +#endif +}; + +static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct iommufd_ctx *ictx = filp->private_data; + const struct iommufd_ioctl_op *op; + struct iommufd_ucmd ucmd = {}; + union ucmd_buffer buf; + unsigned int nr; + int ret; + + nr = _IOC_NR(cmd); + if (nr < IOMMUFD_CMD_BASE || + (nr - IOMMUFD_CMD_BASE) >= ARRAY_SIZE(iommufd_ioctl_ops)) + return iommufd_vfio_ioctl(ictx, cmd, arg); + + ucmd.ictx = ictx; + ucmd.ubuffer = (void __user *)arg; + ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer); + if (ret) + return ret; + + op = &iommufd_ioctl_ops[nr - IOMMUFD_CMD_BASE]; + if (op->ioctl_num != cmd) + return -ENOIOCTLCMD; + if (ucmd.user_size < op->min_size) + return -EINVAL; + + ucmd.cmd = &buf; + ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer, + ucmd.user_size); + if (ret) + return 
ret; + ret = op->execute(&ucmd); + return ret; +} + +static const struct file_operations iommufd_fops = { + .owner = THIS_MODULE, + .open = iommufd_fops_open, + .release = iommufd_fops_release, + .unlocked_ioctl = iommufd_fops_ioctl, +}; + +/** + * iommufd_ctx_get - Get a context reference + * @ictx: Context to get + * + * The caller must already hold a valid reference to ictx. + */ +void iommufd_ctx_get(struct iommufd_ctx *ictx) +{ + get_file(ictx->file); +} +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_get, IOMMUFD); + +/** + * iommufd_ctx_from_file - Acquires a reference to the iommufd context + * @file: File to obtain the reference from + * + * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. The struct file + * remains owned by the caller and the caller must still do fput. On success + * the caller is responsible to call iommufd_ctx_put(). + */ +struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) +{ + struct iommufd_ctx *ictx; + + if (file->f_op != &iommufd_fops) + return ERR_PTR(-EBADFD); + ictx = file->private_data; + iommufd_ctx_get(ictx); + return ictx; +} +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, IOMMUFD); + +/** + * iommufd_ctx_put - Put back a reference + * @ictx: Context to put back + */ +void iommufd_ctx_put(struct iommufd_ctx *ictx) +{ + fput(ictx->file); +} +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD); + +static const struct iommufd_object_ops iommufd_object_ops[] = { + [IOMMUFD_OBJ_ACCESS] = { + .destroy = iommufd_access_destroy_object, + }, + [IOMMUFD_OBJ_DEVICE] = { + .destroy = iommufd_device_destroy, + }, + [IOMMUFD_OBJ_IOAS] = { + .destroy = iommufd_ioas_destroy, + }, + [IOMMUFD_OBJ_HW_PAGETABLE] = { + .destroy = iommufd_hw_pagetable_destroy, + }, +#ifdef CONFIG_IOMMUFD_TEST + [IOMMUFD_OBJ_SELFTEST] = { + .destroy = iommufd_selftest_destroy, + }, +#endif +}; + +static struct miscdevice iommu_misc_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "iommu", + .fops = &iommufd_fops, + .nodename = "iommu", + .mode = 0660, +}; + + +static struct miscdevice vfio_misc_dev = { + .minor = VFIO_MINOR, + .name = "vfio", + .fops = &iommufd_fops, + .nodename = "vfio/vfio", + .mode = 0666, +}; + +static int __init iommufd_init(void) +{ + int ret; + + ret = misc_register(&iommu_misc_dev); + if (ret) + return ret; + + if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) { + ret = misc_register(&vfio_misc_dev); + if (ret) + goto err_misc; + } + iommufd_test_init(); + return 0; +err_misc: + misc_deregister(&iommu_misc_dev); + return ret; +} + +static void __exit iommufd_exit(void) +{ + iommufd_test_exit(); + if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) + misc_deregister(&vfio_misc_dev); + misc_deregister(&iommu_misc_dev); +} + +module_init(iommufd_init); +module_exit(iommufd_exit); + +#if IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER) +MODULE_ALIAS_MISCDEV(VFIO_MINOR); +MODULE_ALIAS("devname:vfio/vfio"); +#endif +MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices"); +MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c new file mode 100644 index 000000000000..1e1d3509efae --- /dev/null +++ b/drivers/iommu/iommufd/pages.c @@ -0,0 +1,1977 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + * + * The iopt_pages is the center of the storage and motion of PFNs. Each + * iopt_pages represents a logical linear array of full PFNs. The array is 0 + * based and has npages in it. 
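The dispatcher shown above reads the leading size word, insists on at least min_size, and uses copy_struct_from_user(), which zero-fills any tail an older caller did not supply and rejects unknown non-zero bytes; on the way out only the caller's size is copied back. In practice user space just sets size to sizeof() of the struct it was built against, as in this sketch, which also uses the iova == 0 / length == UINT64_MAX "unmap everything" convention accepted by iommufd_ioas_unmap() earlier in the patch. Layout assumptions come from the uAPI header.

#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

/* Unmap every mapping in an IOAS and report how many bytes went away. */
static int ioas_unmap_all(int fd, unsigned int ioas_id)
{
	struct iommu_ioas_unmap cmd = {
		.size = sizeof(cmd),	/* first __u32, read by the dispatcher */
		.ioas_id = ioas_id,
		.iova = 0,
		.length = UINT64_MAX,	/* "everything" per the handler */
	};

	if (ioctl(fd, IOMMU_IOAS_UNMAP, &cmd))
		return -1;
	printf("unmapped %llu bytes\n", (unsigned long long)cmd.length);
	return 0;
}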
Accessors use 'index' to refer to the entry in + * this logical array, regardless of its storage location. + * + * PFNs are stored in a tiered scheme: + * 1) iopt_pages::pinned_pfns xarray + * 2) An iommu_domain + * 3) The origin of the PFNs, i.e. the userspace pointer + * + * PFN have to be copied between all combinations of tiers, depending on the + * configuration. + * + * When a PFN is taken out of the userspace pointer it is pinned exactly once. + * The storage locations of the PFN's index are tracked in the two interval + * trees. If no interval includes the index then it is not pinned. + * + * If access_itree includes the PFN's index then an in-kernel access has + * requested the page. The PFN is stored in the xarray so other requestors can + * continue to find it. + * + * If the domains_itree includes the PFN's index then an iommu_domain is storing + * the PFN and it can be read back using iommu_iova_to_phys(). To avoid + * duplicating storage the xarray is not used if only iommu_domains are using + * the PFN's index. + * + * As a general principle this is designed so that destroy never fails. This + * means removing an iommu_domain or releasing a in-kernel access will not fail + * due to insufficient memory. In practice this means some cases have to hold + * PFNs in the xarray even though they are also being stored in an iommu_domain. + * + * While the iopt_pages can use an iommu_domain as storage, it does not have an + * IOVA itself. Instead the iopt_area represents a range of IOVA and uses the + * iopt_pages as the PFN provider. Multiple iopt_areas can share the iopt_pages + * and reference their own slice of the PFN array, with sub page granularity. + * + * In this file the term 'last' indicates an inclusive and closed interval, eg + * [0,0] refers to a single PFN. 'end' means an open range, eg [0,0) refers to + * no PFNs. + * + * Be cautious of overflow. An IOVA can go all the way up to U64_MAX, so + * last_iova + 1 can overflow. An iopt_pages index will always be much less than + * ULONG_MAX so last_index + 1 cannot overflow. + */ +#include <linux/overflow.h> +#include <linux/slab.h> +#include <linux/iommu.h> +#include <linux/sched/mm.h> +#include <linux/highmem.h> +#include <linux/kthread.h> +#include <linux/iommufd.h> + +#include "io_pagetable.h" +#include "double_span.h" + +#ifndef CONFIG_IOMMUFD_TEST +#define TEMP_MEMORY_LIMIT 65536 +#else +#define TEMP_MEMORY_LIMIT iommufd_test_memory_limit +#endif +#define BATCH_BACKUP_SIZE 32 + +/* + * More memory makes pin_user_pages() and the batching more efficient, but as + * this is only a performance optimization don't try too hard to get it. A 64k + * allocation can hold about 26M of 4k pages and 13G of 2M pages in an + * pfn_batch. Various destroy paths cannot fail and provide a small amount of + * stack memory as a backup contingency. If backup_len is given this cannot + * fail. 
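The tiering rules above boil down to a small decision: an index covered by the access_itree must live in the pinned_pfns xarray so in-kernel users can keep finding it, an index covered only by the domains_itree is read back from the iommu_domain, and anything else is not pinned at all. The stand-alone classifier below merely restates that rule with hypothetical names; the real code walks the two interval trees.

#include <stdbool.h>
#include <stdio.h>

enum pfn_home { PFN_NOT_PINNED, PFN_IN_XARRAY, PFN_IN_DOMAIN_ONLY };

/* Illustrative restatement of the comment above. */
static enum pfn_home classify_pfn(bool covered_by_access,
				  bool covered_by_domain)
{
	if (covered_by_access)
		return PFN_IN_XARRAY;	   /* kept for in-kernel accesses */
	if (covered_by_domain)
		return PFN_IN_DOMAIN_ONLY; /* read via iommu_iova_to_phys() */
	return PFN_NOT_PINNED;		   /* only the user VA knows it */
}

int main(void)
{
	printf("%d %d %d\n", classify_pfn(true, true),
	       classify_pfn(false, true), classify_pfn(false, false));
	return 0;
}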
+ */ +static void *temp_kmalloc(size_t *size, void *backup, size_t backup_len) +{ + void *res; + + if (WARN_ON(*size == 0)) + return NULL; + + if (*size < backup_len) + return backup; + + if (!backup && iommufd_should_fail()) + return NULL; + + *size = min_t(size_t, *size, TEMP_MEMORY_LIMIT); + res = kmalloc(*size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + if (res) + return res; + *size = PAGE_SIZE; + if (backup_len) { + res = kmalloc(*size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + if (res) + return res; + *size = backup_len; + return backup; + } + return kmalloc(*size, GFP_KERNEL); +} + +void interval_tree_double_span_iter_update( + struct interval_tree_double_span_iter *iter) +{ + unsigned long last_hole = ULONG_MAX; + unsigned int i; + + for (i = 0; i != ARRAY_SIZE(iter->spans); i++) { + if (interval_tree_span_iter_done(&iter->spans[i])) { + iter->is_used = -1; + return; + } + + if (iter->spans[i].is_hole) { + last_hole = min(last_hole, iter->spans[i].last_hole); + continue; + } + + iter->is_used = i + 1; + iter->start_used = iter->spans[i].start_used; + iter->last_used = min(iter->spans[i].last_used, last_hole); + return; + } + + iter->is_used = 0; + iter->start_hole = iter->spans[0].start_hole; + iter->last_hole = + min(iter->spans[0].last_hole, iter->spans[1].last_hole); +} + +void interval_tree_double_span_iter_first( + struct interval_tree_double_span_iter *iter, + struct rb_root_cached *itree1, struct rb_root_cached *itree2, + unsigned long first_index, unsigned long last_index) +{ + unsigned int i; + + iter->itrees[0] = itree1; + iter->itrees[1] = itree2; + for (i = 0; i != ARRAY_SIZE(iter->spans); i++) + interval_tree_span_iter_first(&iter->spans[i], iter->itrees[i], + first_index, last_index); + interval_tree_double_span_iter_update(iter); +} + +void interval_tree_double_span_iter_next( + struct interval_tree_double_span_iter *iter) +{ + unsigned int i; + + if (iter->is_used == -1 || + iter->last_hole == iter->spans[0].last_index) { + iter->is_used = -1; + return; + } + + for (i = 0; i != ARRAY_SIZE(iter->spans); i++) + interval_tree_span_iter_advance( + &iter->spans[i], iter->itrees[i], iter->last_hole + 1); + interval_tree_double_span_iter_update(iter); +} + +static void iopt_pages_add_npinned(struct iopt_pages *pages, size_t npages) +{ + int rc; + + rc = check_add_overflow(pages->npinned, npages, &pages->npinned); + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(rc || pages->npinned > pages->npages); +} + +static void iopt_pages_sub_npinned(struct iopt_pages *pages, size_t npages) +{ + int rc; + + rc = check_sub_overflow(pages->npinned, npages, &pages->npinned); + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(rc || pages->npinned > pages->npages); +} + +static void iopt_pages_err_unpin(struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index, + struct page **page_list) +{ + unsigned long npages = last_index - start_index + 1; + + unpin_user_pages(page_list, npages); + iopt_pages_sub_npinned(pages, npages); +} + +/* + * index is the number of PAGE_SIZE units from the start of the area's + * iopt_pages. If the iova is sub page-size then the area has an iova that + * covers a portion of the first and last pages in the range. 
+ */ +static unsigned long iopt_area_index_to_iova(struct iopt_area *area, + unsigned long index) +{ + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(index < iopt_area_index(area) || + index > iopt_area_last_index(area)); + index -= iopt_area_index(area); + if (index == 0) + return iopt_area_iova(area); + return iopt_area_iova(area) - area->page_offset + index * PAGE_SIZE; +} + +static unsigned long iopt_area_index_to_iova_last(struct iopt_area *area, + unsigned long index) +{ + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(index < iopt_area_index(area) || + index > iopt_area_last_index(area)); + if (index == iopt_area_last_index(area)) + return iopt_area_last_iova(area); + return iopt_area_iova(area) - area->page_offset + + (index - iopt_area_index(area) + 1) * PAGE_SIZE - 1; +} + +static void iommu_unmap_nofail(struct iommu_domain *domain, unsigned long iova, + size_t size) +{ + size_t ret; + + ret = iommu_unmap(domain, iova, size); + /* + * It is a logic error in this code or a driver bug if the IOMMU unmaps + * something other than exactly as requested. This implies that the + * iommu driver may not fail unmap for reasons beyond bad agruments. + * Particularly, the iommu driver may not do a memory allocation on the + * unmap path. + */ + WARN_ON(ret != size); +} + +static void iopt_area_unmap_domain_range(struct iopt_area *area, + struct iommu_domain *domain, + unsigned long start_index, + unsigned long last_index) +{ + unsigned long start_iova = iopt_area_index_to_iova(area, start_index); + + iommu_unmap_nofail(domain, start_iova, + iopt_area_index_to_iova_last(area, last_index) - + start_iova + 1); +} + +static struct iopt_area *iopt_pages_find_domain_area(struct iopt_pages *pages, + unsigned long index) +{ + struct interval_tree_node *node; + + node = interval_tree_iter_first(&pages->domains_itree, index, index); + if (!node) + return NULL; + return container_of(node, struct iopt_area, pages_node); +} + +/* + * A simple datastructure to hold a vector of PFNs, optimized for contiguous + * PFNs. This is used as a temporary holding memory for shuttling pfns from one + * place to another. Generally everything is made more efficient if operations + * work on the largest possible grouping of pfns. 
eg fewer lock/unlock cycles, + * better cache locality, etc + */ +struct pfn_batch { + unsigned long *pfns; + u32 *npfns; + unsigned int array_size; + unsigned int end; + unsigned int total_pfns; +}; + +static void batch_clear(struct pfn_batch *batch) +{ + batch->total_pfns = 0; + batch->end = 0; + batch->pfns[0] = 0; + batch->npfns[0] = 0; +} + +/* + * Carry means we carry a portion of the final hugepage over to the front of the + * batch + */ +static void batch_clear_carry(struct pfn_batch *batch, unsigned int keep_pfns) +{ + if (!keep_pfns) + return batch_clear(batch); + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(!batch->end || + batch->npfns[batch->end - 1] < keep_pfns); + + batch->total_pfns = keep_pfns; + batch->npfns[0] = keep_pfns; + batch->pfns[0] = batch->pfns[batch->end - 1] + + (batch->npfns[batch->end - 1] - keep_pfns); + batch->end = 0; +} + +static void batch_skip_carry(struct pfn_batch *batch, unsigned int skip_pfns) +{ + if (!batch->total_pfns) + return; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(batch->total_pfns != batch->npfns[0]); + skip_pfns = min(batch->total_pfns, skip_pfns); + batch->pfns[0] += skip_pfns; + batch->npfns[0] -= skip_pfns; + batch->total_pfns -= skip_pfns; +} + +static int __batch_init(struct pfn_batch *batch, size_t max_pages, void *backup, + size_t backup_len) +{ + const size_t elmsz = sizeof(*batch->pfns) + sizeof(*batch->npfns); + size_t size = max_pages * elmsz; + + batch->pfns = temp_kmalloc(&size, backup, backup_len); + if (!batch->pfns) + return -ENOMEM; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && WARN_ON(size < elmsz)) + return -EINVAL; + batch->array_size = size / elmsz; + batch->npfns = (u32 *)(batch->pfns + batch->array_size); + batch_clear(batch); + return 0; +} + +static int batch_init(struct pfn_batch *batch, size_t max_pages) +{ + return __batch_init(batch, max_pages, NULL, 0); +} + +static void batch_init_backup(struct pfn_batch *batch, size_t max_pages, + void *backup, size_t backup_len) +{ + __batch_init(batch, max_pages, backup, backup_len); +} + +static void batch_destroy(struct pfn_batch *batch, void *backup) +{ + if (batch->pfns != backup) + kfree(batch->pfns); +} + +/* true if the pfn was added, false otherwise */ +static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn) +{ + const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns)); + + if (batch->end && + pfn == batch->pfns[batch->end - 1] + batch->npfns[batch->end - 1] && + batch->npfns[batch->end - 1] != MAX_NPFNS) { + batch->npfns[batch->end - 1]++; + batch->total_pfns++; + return true; + } + if (batch->end == batch->array_size) + return false; + batch->total_pfns++; + batch->pfns[batch->end] = pfn; + batch->npfns[batch->end] = 1; + batch->end++; + return true; +} + +/* + * Fill the batch with pfns from the domain. When the batch is full, or it + * reaches last_index, the function will return. The caller should use + * batch->total_pfns to determine the starting point for the next iteration. + */ +static void batch_from_domain(struct pfn_batch *batch, + struct iommu_domain *domain, + struct iopt_area *area, unsigned long start_index, + unsigned long last_index) +{ + unsigned int page_offset = 0; + unsigned long iova; + phys_addr_t phys; + + iova = iopt_area_index_to_iova(area, start_index); + if (start_index == iopt_area_index(area)) + page_offset = area->page_offset; + while (start_index <= last_index) { + /* + * This is pretty slow, it would be nice to get the page size + * back from the driver, or have the driver directly fill the + * batch. 
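batch_add_pfn() above run-length encodes the PFNs: a physically contiguous stretch costs a single (pfn, npfns) slot, which is what makes contiguous pinnings and huge pages cheap to shuttle between the tiers. The stand-alone sketch below reproduces the core of that encoding with a tiny fixed-size batch; it omits the npfns saturation limit and the carry/skip handling of the real structure.

#include <stdbool.h>
#include <stdio.h>

/* Simplified model of struct pfn_batch: runs of contiguous PFNs stored as
 * (first_pfn, npfns) pairs, as batch_add_pfn() does above. */
struct mini_batch {
	unsigned long pfns[8];
	unsigned int npfns[8];
	unsigned int end;
};

static bool mini_add(struct mini_batch *b, unsigned long pfn)
{
	if (b->end && pfn == b->pfns[b->end - 1] + b->npfns[b->end - 1]) {
		b->npfns[b->end - 1]++;	/* extends the previous run */
		return true;
	}
	if (b->end == 8)
		return false;		/* batch full, caller must flush */
	b->pfns[b->end] = pfn;
	b->npfns[b->end] = 1;
	b->end++;
	return true;
}

int main(void)
{
	static const unsigned long in[] = { 100, 101, 102, 200, 201, 500 };
	struct mini_batch b = { .end = 0 };
	unsigned int i;

	for (i = 0; i < sizeof(in) / sizeof(in[0]); i++)
		mini_add(&b, in[i]);
	for (i = 0; i < b.end; i++)
		printf("run: pfn %lu x%u\n", b.pfns[i], b.npfns[i]);
	return 0;	/* prints 100 x3, 200 x2, 500 x1 */
}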
+ */ + phys = iommu_iova_to_phys(domain, iova) - page_offset; + if (!batch_add_pfn(batch, PHYS_PFN(phys))) + return; + iova += PAGE_SIZE - page_offset; + page_offset = 0; + start_index++; + } +} + +static struct page **raw_pages_from_domain(struct iommu_domain *domain, + struct iopt_area *area, + unsigned long start_index, + unsigned long last_index, + struct page **out_pages) +{ + unsigned int page_offset = 0; + unsigned long iova; + phys_addr_t phys; + + iova = iopt_area_index_to_iova(area, start_index); + if (start_index == iopt_area_index(area)) + page_offset = area->page_offset; + while (start_index <= last_index) { + phys = iommu_iova_to_phys(domain, iova) - page_offset; + *(out_pages++) = pfn_to_page(PHYS_PFN(phys)); + iova += PAGE_SIZE - page_offset; + page_offset = 0; + start_index++; + } + return out_pages; +} + +/* Continues reading a domain until we reach a discontinuity in the pfns. */ +static void batch_from_domain_continue(struct pfn_batch *batch, + struct iommu_domain *domain, + struct iopt_area *area, + unsigned long start_index, + unsigned long last_index) +{ + unsigned int array_size = batch->array_size; + + batch->array_size = batch->end; + batch_from_domain(batch, domain, area, start_index, last_index); + batch->array_size = array_size; +} + +/* + * This is part of the VFIO compatibility support for VFIO_TYPE1_IOMMU. That + * mode permits splitting a mapped area up, and then one of the splits is + * unmapped. Doing this normally would cause us to violate our invariant of + * pairing map/unmap. Thus, to support old VFIO compatibility disable support + * for batching consecutive PFNs. All PFNs mapped into the iommu are done in + * PAGE_SIZE units, not larger or smaller. + */ +static int batch_iommu_map_small(struct iommu_domain *domain, + unsigned long iova, phys_addr_t paddr, + size_t size, int prot) +{ + unsigned long start_iova = iova; + int rc; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(paddr % PAGE_SIZE || iova % PAGE_SIZE || + size % PAGE_SIZE); + + while (size) { + rc = iommu_map(domain, iova, paddr, PAGE_SIZE, prot); + if (rc) + goto err_unmap; + iova += PAGE_SIZE; + paddr += PAGE_SIZE; + size -= PAGE_SIZE; + } + return 0; + +err_unmap: + if (start_iova != iova) + iommu_unmap_nofail(domain, start_iova, iova - start_iova); + return rc; +} + +static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, + struct iopt_area *area, unsigned long start_index) +{ + bool disable_large_pages = area->iopt->disable_large_pages; + unsigned long last_iova = iopt_area_last_iova(area); + unsigned int page_offset = 0; + unsigned long start_iova; + unsigned long next_iova; + unsigned int cur = 0; + unsigned long iova; + int rc; + + /* The first index might be a partial page */ + if (start_index == iopt_area_index(area)) + page_offset = area->page_offset; + next_iova = iova = start_iova = + iopt_area_index_to_iova(area, start_index); + while (cur < batch->end) { + next_iova = min(last_iova + 1, + next_iova + batch->npfns[cur] * PAGE_SIZE - + page_offset); + if (disable_large_pages) + rc = batch_iommu_map_small( + domain, iova, + PFN_PHYS(batch->pfns[cur]) + page_offset, + next_iova - iova, area->iommu_prot); + else + rc = iommu_map(domain, iova, + PFN_PHYS(batch->pfns[cur]) + page_offset, + next_iova - iova, area->iommu_prot); + if (rc) + goto err_unmap; + iova = next_iova; + page_offset = 0; + cur++; + } + return 0; +err_unmap: + if (start_iova != iova) + iommu_unmap_nofail(domain, start_iova, iova - start_iova); + return rc; +} + +static void 
batch_from_xarray(struct pfn_batch *batch, struct xarray *xa, + unsigned long start_index, + unsigned long last_index) +{ + XA_STATE(xas, xa, start_index); + void *entry; + + rcu_read_lock(); + while (true) { + entry = xas_next(&xas); + if (xas_retry(&xas, entry)) + continue; + WARN_ON(!xa_is_value(entry)); + if (!batch_add_pfn(batch, xa_to_value(entry)) || + start_index == last_index) + break; + start_index++; + } + rcu_read_unlock(); +} + +static void batch_from_xarray_clear(struct pfn_batch *batch, struct xarray *xa, + unsigned long start_index, + unsigned long last_index) +{ + XA_STATE(xas, xa, start_index); + void *entry; + + xas_lock(&xas); + while (true) { + entry = xas_next(&xas); + if (xas_retry(&xas, entry)) + continue; + WARN_ON(!xa_is_value(entry)); + if (!batch_add_pfn(batch, xa_to_value(entry))) + break; + xas_store(&xas, NULL); + if (start_index == last_index) + break; + start_index++; + } + xas_unlock(&xas); +} + +static void clear_xarray(struct xarray *xa, unsigned long start_index, + unsigned long last_index) +{ + XA_STATE(xas, xa, start_index); + void *entry; + + xas_lock(&xas); + xas_for_each(&xas, entry, last_index) + xas_store(&xas, NULL); + xas_unlock(&xas); +} + +static int pages_to_xarray(struct xarray *xa, unsigned long start_index, + unsigned long last_index, struct page **pages) +{ + struct page **end_pages = pages + (last_index - start_index) + 1; + struct page **half_pages = pages + (end_pages - pages) / 2; + XA_STATE(xas, xa, start_index); + + do { + void *old; + + xas_lock(&xas); + while (pages != end_pages) { + /* xarray does not participate in fault injection */ + if (pages == half_pages && iommufd_should_fail()) { + xas_set_err(&xas, -EINVAL); + xas_unlock(&xas); + /* aka xas_destroy() */ + xas_nomem(&xas, GFP_KERNEL); + goto err_clear; + } + + old = xas_store(&xas, xa_mk_value(page_to_pfn(*pages))); + if (xas_error(&xas)) + break; + WARN_ON(old); + pages++; + xas_next(&xas); + } + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + +err_clear: + if (xas_error(&xas)) { + if (xas.xa_index != start_index) + clear_xarray(xa, start_index, xas.xa_index - 1); + return xas_error(&xas); + } + return 0; +} + +static void batch_from_pages(struct pfn_batch *batch, struct page **pages, + size_t npages) +{ + struct page **end = pages + npages; + + for (; pages != end; pages++) + if (!batch_add_pfn(batch, page_to_pfn(*pages))) + break; +} + +static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages, + unsigned int first_page_off, size_t npages) +{ + unsigned int cur = 0; + + while (first_page_off) { + if (batch->npfns[cur] > first_page_off) + break; + first_page_off -= batch->npfns[cur]; + cur++; + } + + while (npages) { + size_t to_unpin = min_t(size_t, npages, + batch->npfns[cur] - first_page_off); + + unpin_user_page_range_dirty_lock( + pfn_to_page(batch->pfns[cur] + first_page_off), + to_unpin, pages->writable); + iopt_pages_sub_npinned(pages, to_unpin); + cur++; + first_page_off = 0; + npages -= to_unpin; + } +} + +static void copy_data_page(struct page *page, void *data, unsigned long offset, + size_t length, unsigned int flags) +{ + void *mem; + + mem = kmap_local_page(page); + if (flags & IOMMUFD_ACCESS_RW_WRITE) { + memcpy(mem + offset, data, length); + set_page_dirty_lock(page); + } else { + memcpy(data, mem + offset, length); + } + kunmap_local(mem); +} + +static unsigned long batch_rw(struct pfn_batch *batch, void *data, + unsigned long offset, unsigned long length, + unsigned int flags) +{ + unsigned long copied = 0; + unsigned int 
npage = 0; + unsigned int cur = 0; + + while (cur < batch->end) { + unsigned long bytes = min(length, PAGE_SIZE - offset); + + copy_data_page(pfn_to_page(batch->pfns[cur] + npage), data, + offset, bytes, flags); + offset = 0; + length -= bytes; + data += bytes; + copied += bytes; + npage++; + if (npage == batch->npfns[cur]) { + npage = 0; + cur++; + } + if (!length) + break; + } + return copied; +} + +/* pfn_reader_user is just the pin_user_pages() path */ +struct pfn_reader_user { + struct page **upages; + size_t upages_len; + unsigned long upages_start; + unsigned long upages_end; + unsigned int gup_flags; + /* + * 1 means mmget() and mmap_read_lock(), 0 means only mmget(), -1 is + * neither + */ + int locked; +}; + +static void pfn_reader_user_init(struct pfn_reader_user *user, + struct iopt_pages *pages) +{ + user->upages = NULL; + user->upages_start = 0; + user->upages_end = 0; + user->locked = -1; + + user->gup_flags = FOLL_LONGTERM; + if (pages->writable) + user->gup_flags |= FOLL_WRITE; +} + +static void pfn_reader_user_destroy(struct pfn_reader_user *user, + struct iopt_pages *pages) +{ + if (user->locked != -1) { + if (user->locked) + mmap_read_unlock(pages->source_mm); + if (pages->source_mm != current->mm) + mmput(pages->source_mm); + user->locked = -1; + } + + kfree(user->upages); + user->upages = NULL; +} + +static int pfn_reader_user_pin(struct pfn_reader_user *user, + struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index) +{ + bool remote_mm = pages->source_mm != current->mm; + unsigned long npages; + uintptr_t uptr; + long rc; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(last_index < start_index)) + return -EINVAL; + + if (!user->upages) { + /* All undone in pfn_reader_destroy() */ + user->upages_len = + (last_index - start_index + 1) * sizeof(*user->upages); + user->upages = temp_kmalloc(&user->upages_len, NULL, 0); + if (!user->upages) + return -ENOMEM; + } + + if (user->locked == -1) { + /* + * The majority of usages will run the map task within the mm + * providing the pages, so we can optimize into + * get_user_pages_fast() + */ + if (remote_mm) { + if (!mmget_not_zero(pages->source_mm)) + return -EFAULT; + } + user->locked = 0; + } + + npages = min_t(unsigned long, last_index - start_index + 1, + user->upages_len / sizeof(*user->upages)); + + + if (iommufd_should_fail()) + return -EFAULT; + + uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE); + if (!remote_mm) + rc = pin_user_pages_fast(uptr, npages, user->gup_flags, + user->upages); + else { + if (!user->locked) { + mmap_read_lock(pages->source_mm); + user->locked = 1; + } + rc = pin_user_pages_remote(pages->source_mm, uptr, npages, + user->gup_flags, user->upages, NULL, + &user->locked); + } + if (rc <= 0) { + if (WARN_ON(!rc)) + return -EFAULT; + return rc; + } + iopt_pages_add_npinned(pages, rc); + user->upages_start = start_index; + user->upages_end = start_index + rc; + return 0; +} + +/* This is the "modern" and faster accounting method used by io_uring */ +static int incr_user_locked_vm(struct iopt_pages *pages, unsigned long npages) +{ + unsigned long lock_limit; + unsigned long cur_pages; + unsigned long new_pages; + + lock_limit = task_rlimit(pages->source_task, RLIMIT_MEMLOCK) >> + PAGE_SHIFT; + do { + cur_pages = atomic_long_read(&pages->source_user->locked_vm); + new_pages = cur_pages + npages; + if (new_pages > lock_limit) + return -ENOMEM; + } while (atomic_long_cmpxchg(&pages->source_user->locked_vm, cur_pages, + new_pages) != cur_pages); + return 0; +} + 
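The cmpxchg loop that closes just above (incr_user_locked_vm(), the "modern" accounting method) charges pinned pages against the RLIMIT_MEMLOCK budget on a per-user atomic counter, retrying instead of taking any lock. The following is a minimal userspace C sketch of that pattern only, not the kernel code: the names locked_vm_model, try_charge and uncharge are invented for the example, and the kernel uses atomic_long_cmpxchg() on the user_struct counter rather than C11 atomics.

/*
 * Sketch of lock-free "charge against a limit" accounting: concurrent
 * callers may race, but the compare-and-swap retry loop guarantees the
 * counter never exceeds lock_limit and no mutex is needed.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct locked_vm_model {
	atomic_ulong locked_vm;   /* pages currently charged */
	unsigned long lock_limit; /* analogue of RLIMIT_MEMLOCK >> PAGE_SHIFT */
};

static bool try_charge(struct locked_vm_model *m, unsigned long npages)
{
	unsigned long cur = atomic_load(&m->locked_vm);
	unsigned long new;

	do {
		new = cur + npages;
		if (new > m->lock_limit || new < cur) /* over limit or overflow */
			return false;
		/* On failure, 'cur' is reloaded with the current value and we retry. */
	} while (!atomic_compare_exchange_weak(&m->locked_vm, &cur, new));
	return true;
}

static void uncharge(struct locked_vm_model *m, unsigned long npages)
{
	atomic_fetch_sub(&m->locked_vm, npages);
}

int main(void)
{
	struct locked_vm_model m = { .lock_limit = 4 };

	atomic_init(&m.locked_vm, 0);
	printf("charge 3: %s\n", try_charge(&m, 3) ? "ok" : "over limit");
	printf("charge 2: %s\n", try_charge(&m, 2) ? "ok" : "over limit");
	uncharge(&m, 3);
	printf("charge 2: %s\n", try_charge(&m, 2) ? "ok" : "over limit");
	return 0;
}

The retry-on-race structure mirrors the do/while around atomic_long_cmpxchg() above; compared with the mmap_write_lock()-based VFIO-compatible accounting path that follows next in the patch, concurrent pin and unpin callers never serialize on the source mm.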
+static void decr_user_locked_vm(struct iopt_pages *pages, unsigned long npages) +{ + if (WARN_ON(atomic_long_read(&pages->source_user->locked_vm) < npages)) + return; + atomic_long_sub(npages, &pages->source_user->locked_vm); +} + +/* This is the accounting method used for compatibility with VFIO */ +static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages, + bool inc, struct pfn_reader_user *user) +{ + bool do_put = false; + int rc; + + if (user && user->locked) { + mmap_read_unlock(pages->source_mm); + user->locked = 0; + /* If we had the lock then we also have a get */ + } else if ((!user || !user->upages) && + pages->source_mm != current->mm) { + if (!mmget_not_zero(pages->source_mm)) + return -EINVAL; + do_put = true; + } + + mmap_write_lock(pages->source_mm); + rc = __account_locked_vm(pages->source_mm, npages, inc, + pages->source_task, false); + mmap_write_unlock(pages->source_mm); + + if (do_put) + mmput(pages->source_mm); + return rc; +} + +static int do_update_pinned(struct iopt_pages *pages, unsigned long npages, + bool inc, struct pfn_reader_user *user) +{ + int rc = 0; + + switch (pages->account_mode) { + case IOPT_PAGES_ACCOUNT_NONE: + break; + case IOPT_PAGES_ACCOUNT_USER: + if (inc) + rc = incr_user_locked_vm(pages, npages); + else + decr_user_locked_vm(pages, npages); + break; + case IOPT_PAGES_ACCOUNT_MM: + rc = update_mm_locked_vm(pages, npages, inc, user); + break; + } + if (rc) + return rc; + + pages->last_npinned = pages->npinned; + if (inc) + atomic64_add(npages, &pages->source_mm->pinned_vm); + else + atomic64_sub(npages, &pages->source_mm->pinned_vm); + return 0; +} + +static void update_unpinned(struct iopt_pages *pages) +{ + if (WARN_ON(pages->npinned > pages->last_npinned)) + return; + if (pages->npinned == pages->last_npinned) + return; + do_update_pinned(pages, pages->last_npinned - pages->npinned, false, + NULL); +} + +/* + * Changes in the number of pages pinned is done after the pages have been read + * and processed. If the user lacked the limit then the error unwind will unpin + * everything that was just pinned. This is because it is expensive to calculate + * how many pages we have already pinned within a range to generate an accurate + * prediction in advance of doing the work to actually pin them. + */ +static int pfn_reader_user_update_pinned(struct pfn_reader_user *user, + struct iopt_pages *pages) +{ + unsigned long npages; + bool inc; + + lockdep_assert_held(&pages->mutex); + + if (pages->npinned == pages->last_npinned) + return 0; + + if (pages->npinned < pages->last_npinned) { + npages = pages->last_npinned - pages->npinned; + inc = false; + } else { + if (iommufd_should_fail()) + return -ENOMEM; + npages = pages->npinned - pages->last_npinned; + inc = true; + } + return do_update_pinned(pages, npages, inc, user); +} + +/* + * PFNs are stored in three places, in order of preference: + * - The iopt_pages xarray. This is only populated if there is a + * iopt_pages_access + * - The iommu_domain under an area + * - The original PFN source, ie pages->source_mm + * + * This iterator reads the pfns optimizing to load according to the + * above order. 
+ */ +struct pfn_reader { + struct iopt_pages *pages; + struct interval_tree_double_span_iter span; + struct pfn_batch batch; + unsigned long batch_start_index; + unsigned long batch_end_index; + unsigned long last_index; + + struct pfn_reader_user user; +}; + +static int pfn_reader_update_pinned(struct pfn_reader *pfns) +{ + return pfn_reader_user_update_pinned(&pfns->user, pfns->pages); +} + +/* + * The batch can contain a mixture of pages that are still in use and pages that + * need to be unpinned. Unpin only pages that are not held anywhere else. + */ +static void pfn_reader_unpin(struct pfn_reader *pfns) +{ + unsigned long last = pfns->batch_end_index - 1; + unsigned long start = pfns->batch_start_index; + struct interval_tree_double_span_iter span; + struct iopt_pages *pages = pfns->pages; + + lockdep_assert_held(&pages->mutex); + + interval_tree_for_each_double_span(&span, &pages->access_itree, + &pages->domains_itree, start, last) { + if (span.is_used) + continue; + + batch_unpin(&pfns->batch, pages, span.start_hole - start, + span.last_hole - span.start_hole + 1); + } +} + +/* Process a single span to load it from the proper storage */ +static int pfn_reader_fill_span(struct pfn_reader *pfns) +{ + struct interval_tree_double_span_iter *span = &pfns->span; + unsigned long start_index = pfns->batch_end_index; + struct iopt_area *area; + int rc; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(span->last_used < start_index)) + return -EINVAL; + + if (span->is_used == 1) { + batch_from_xarray(&pfns->batch, &pfns->pages->pinned_pfns, + start_index, span->last_used); + return 0; + } + + if (span->is_used == 2) { + /* + * Pull as many pages from the first domain we find in the + * target span. If it is too small then we will be called again + * and we'll find another area. 
+ */ + area = iopt_pages_find_domain_area(pfns->pages, start_index); + if (WARN_ON(!area)) + return -EINVAL; + + /* The storage_domain cannot change without the pages mutex */ + batch_from_domain( + &pfns->batch, area->storage_domain, area, start_index, + min(iopt_area_last_index(area), span->last_used)); + return 0; + } + + if (start_index >= pfns->user.upages_end) { + rc = pfn_reader_user_pin(&pfns->user, pfns->pages, start_index, + span->last_hole); + if (rc) + return rc; + } + + batch_from_pages(&pfns->batch, + pfns->user.upages + + (start_index - pfns->user.upages_start), + pfns->user.upages_end - start_index); + return 0; +} + +static bool pfn_reader_done(struct pfn_reader *pfns) +{ + return pfns->batch_start_index == pfns->last_index + 1; +} + +static int pfn_reader_next(struct pfn_reader *pfns) +{ + int rc; + + batch_clear(&pfns->batch); + pfns->batch_start_index = pfns->batch_end_index; + + while (pfns->batch_end_index != pfns->last_index + 1) { + unsigned int npfns = pfns->batch.total_pfns; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(interval_tree_double_span_iter_done(&pfns->span))) + return -EINVAL; + + rc = pfn_reader_fill_span(pfns); + if (rc) + return rc; + + if (WARN_ON(!pfns->batch.total_pfns)) + return -EINVAL; + + pfns->batch_end_index = + pfns->batch_start_index + pfns->batch.total_pfns; + if (pfns->batch_end_index == pfns->span.last_used + 1) + interval_tree_double_span_iter_next(&pfns->span); + + /* Batch is full */ + if (npfns == pfns->batch.total_pfns) + return 0; + } + return 0; +} + +static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages, + unsigned long start_index, unsigned long last_index) +{ + int rc; + + lockdep_assert_held(&pages->mutex); + + pfns->pages = pages; + pfns->batch_start_index = start_index; + pfns->batch_end_index = start_index; + pfns->last_index = last_index; + pfn_reader_user_init(&pfns->user, pages); + rc = batch_init(&pfns->batch, last_index - start_index + 1); + if (rc) + return rc; + interval_tree_double_span_iter_first(&pfns->span, &pages->access_itree, + &pages->domains_itree, start_index, + last_index); + return 0; +} + +/* + * There are many assertions regarding the state of pages->npinned vs + * pages->last_pinned, for instance something like unmapping a domain must only + * decrement the npinned, and pfn_reader_destroy() must be called only after all + * the pins are updated. This is fine for success flows, but error flows + * sometimes need to release the pins held inside the pfn_reader before going on + * to complete unmapping and releasing pins held in domains. 
+ */ +static void pfn_reader_release_pins(struct pfn_reader *pfns) +{ + struct iopt_pages *pages = pfns->pages; + + if (pfns->user.upages_end > pfns->batch_end_index) { + size_t npages = pfns->user.upages_end - pfns->batch_end_index; + + /* Any pages not transferred to the batch are just unpinned */ + unpin_user_pages(pfns->user.upages + (pfns->batch_end_index - + pfns->user.upages_start), + npages); + iopt_pages_sub_npinned(pages, npages); + pfns->user.upages_end = pfns->batch_end_index; + } + if (pfns->batch_start_index != pfns->batch_end_index) { + pfn_reader_unpin(pfns); + pfns->batch_start_index = pfns->batch_end_index; + } +} + +static void pfn_reader_destroy(struct pfn_reader *pfns) +{ + struct iopt_pages *pages = pfns->pages; + + pfn_reader_release_pins(pfns); + pfn_reader_user_destroy(&pfns->user, pfns->pages); + batch_destroy(&pfns->batch, NULL); + WARN_ON(pages->last_npinned != pages->npinned); +} + +static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages, + unsigned long start_index, unsigned long last_index) +{ + int rc; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(last_index < start_index)) + return -EINVAL; + + rc = pfn_reader_init(pfns, pages, start_index, last_index); + if (rc) + return rc; + rc = pfn_reader_next(pfns); + if (rc) { + pfn_reader_destroy(pfns); + return rc; + } + return 0; +} + +struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, + bool writable) +{ + struct iopt_pages *pages; + + /* + * The iommu API uses size_t as the length, and protect the DIV_ROUND_UP + * below from overflow + */ + if (length > SIZE_MAX - PAGE_SIZE || length == 0) + return ERR_PTR(-EINVAL); + + pages = kzalloc(sizeof(*pages), GFP_KERNEL_ACCOUNT); + if (!pages) + return ERR_PTR(-ENOMEM); + + kref_init(&pages->kref); + xa_init_flags(&pages->pinned_pfns, XA_FLAGS_ACCOUNT); + mutex_init(&pages->mutex); + pages->source_mm = current->mm; + mmgrab(pages->source_mm); + pages->uptr = (void __user *)ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE); + pages->npages = DIV_ROUND_UP(length + (uptr - pages->uptr), PAGE_SIZE); + pages->access_itree = RB_ROOT_CACHED; + pages->domains_itree = RB_ROOT_CACHED; + pages->writable = writable; + if (capable(CAP_IPC_LOCK)) + pages->account_mode = IOPT_PAGES_ACCOUNT_NONE; + else + pages->account_mode = IOPT_PAGES_ACCOUNT_USER; + pages->source_task = current->group_leader; + get_task_struct(current->group_leader); + pages->source_user = get_uid(current_user()); + return pages; +} + +void iopt_release_pages(struct kref *kref) +{ + struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref); + + WARN_ON(!RB_EMPTY_ROOT(&pages->access_itree.rb_root)); + WARN_ON(!RB_EMPTY_ROOT(&pages->domains_itree.rb_root)); + WARN_ON(pages->npinned); + WARN_ON(!xa_empty(&pages->pinned_pfns)); + mmdrop(pages->source_mm); + mutex_destroy(&pages->mutex); + put_task_struct(pages->source_task); + free_uid(pages->source_user); + kfree(pages); +} + +static void +iopt_area_unpin_domain(struct pfn_batch *batch, struct iopt_area *area, + struct iopt_pages *pages, struct iommu_domain *domain, + unsigned long start_index, unsigned long last_index, + unsigned long *unmapped_end_index, + unsigned long real_last_index) +{ + while (start_index <= last_index) { + unsigned long batch_last_index; + + if (*unmapped_end_index <= last_index) { + unsigned long start = + max(start_index, *unmapped_end_index); + + batch_from_domain(batch, domain, area, start, + last_index); + batch_last_index = start + batch->total_pfns - 1; + } else { + 
batch_last_index = last_index; + } + + /* + * unmaps must always 'cut' at a place where the pfns are not + * contiguous to pair with the maps that always install + * contiguous pages. Thus, if we have to stop unpinning in the + * middle of the domains we need to keep reading pfns until we + * find a cut point to do the unmap. The pfns we read are + * carried over and either skipped or integrated into the next + * batch. + */ + if (batch_last_index == last_index && + last_index != real_last_index) + batch_from_domain_continue(batch, domain, area, + last_index + 1, + real_last_index); + + if (*unmapped_end_index <= batch_last_index) { + iopt_area_unmap_domain_range( + area, domain, *unmapped_end_index, + start_index + batch->total_pfns - 1); + *unmapped_end_index = start_index + batch->total_pfns; + } + + /* unpin must follow unmap */ + batch_unpin(batch, pages, 0, + batch_last_index - start_index + 1); + start_index = batch_last_index + 1; + + batch_clear_carry(batch, + *unmapped_end_index - batch_last_index - 1); + } +} + +static void __iopt_area_unfill_domain(struct iopt_area *area, + struct iopt_pages *pages, + struct iommu_domain *domain, + unsigned long last_index) +{ + struct interval_tree_double_span_iter span; + unsigned long start_index = iopt_area_index(area); + unsigned long unmapped_end_index = start_index; + u64 backup[BATCH_BACKUP_SIZE]; + struct pfn_batch batch; + + lockdep_assert_held(&pages->mutex); + + /* + * For security we must not unpin something that is still DMA mapped, + * so this must unmap any IOVA before we go ahead and unpin the pages. + * This creates a complexity where we need to skip over unpinning pages + * held in the xarray, but continue to unmap from the domain. + * + * The domain unmap cannot stop in the middle of a contiguous range of + * PFNs. To solve this problem the unpinning step will read ahead to the + * end of any contiguous span, unmap that whole span, and then only + * unpin the leading part that does not have any accesses. The residual + * PFNs that were unmapped but not unpinned are called a "carry" in the + * batch as they are moved to the front of the PFN list and continue on + * to the next iteration(s). + */ + batch_init_backup(&batch, last_index + 1, backup, sizeof(backup)); + interval_tree_for_each_double_span(&span, &pages->domains_itree, + &pages->access_itree, start_index, + last_index) { + if (span.is_used) { + batch_skip_carry(&batch, + span.last_used - span.start_used + 1); + continue; + } + iopt_area_unpin_domain(&batch, area, pages, domain, + span.start_hole, span.last_hole, + &unmapped_end_index, last_index); + } + /* + * If the range ends in a access then we do the residual unmap without + * any unpins. + */ + if (unmapped_end_index != last_index + 1) + iopt_area_unmap_domain_range(area, domain, unmapped_end_index, + last_index); + WARN_ON(batch.total_pfns); + batch_destroy(&batch, backup); + update_unpinned(pages); +} + +static void iopt_area_unfill_partial_domain(struct iopt_area *area, + struct iopt_pages *pages, + struct iommu_domain *domain, + unsigned long end_index) +{ + if (end_index != iopt_area_index(area)) + __iopt_area_unfill_domain(area, pages, domain, end_index - 1); +} + +/** + * iopt_area_unmap_domain() - Unmap without unpinning PFNs in a domain + * @area: The IOVA range to unmap + * @domain: The domain to unmap + * + * The caller must know that unpinning is not required, usually because there + * are other domains in the iopt. 
+ */ +void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain) +{ + iommu_unmap_nofail(domain, iopt_area_iova(area), + iopt_area_length(area)); +} + +/** + * iopt_area_unfill_domain() - Unmap and unpin PFNs in a domain + * @area: IOVA area to use + * @pages: page supplier for the area (area->pages is NULL) + * @domain: Domain to unmap from + * + * The domain should be removed from the domains_itree before calling. The + * domain will always be unmapped, but the PFNs may not be unpinned if there are + * still accesses. + */ +void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, + struct iommu_domain *domain) +{ + __iopt_area_unfill_domain(area, pages, domain, + iopt_area_last_index(area)); +} + +/** + * iopt_area_fill_domain() - Map PFNs from the area into a domain + * @area: IOVA area to use + * @domain: Domain to load PFNs into + * + * Read the pfns from the area's underlying iopt_pages and map them into the + * given domain. Called when attaching a new domain to an io_pagetable. + */ +int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain) +{ + unsigned long done_end_index; + struct pfn_reader pfns; + int rc; + + lockdep_assert_held(&area->pages->mutex); + + rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area), + iopt_area_last_index(area)); + if (rc) + return rc; + + while (!pfn_reader_done(&pfns)) { + done_end_index = pfns.batch_start_index; + rc = batch_to_domain(&pfns.batch, domain, area, + pfns.batch_start_index); + if (rc) + goto out_unmap; + done_end_index = pfns.batch_end_index; + + rc = pfn_reader_next(&pfns); + if (rc) + goto out_unmap; + } + + rc = pfn_reader_update_pinned(&pfns); + if (rc) + goto out_unmap; + goto out_destroy; + +out_unmap: + pfn_reader_release_pins(&pfns); + iopt_area_unfill_partial_domain(area, area->pages, domain, + done_end_index); +out_destroy: + pfn_reader_destroy(&pfns); + return rc; +} + +/** + * iopt_area_fill_domains() - Install PFNs into the area's domains + * @area: The area to act on + * @pages: The pages associated with the area (area->pages is NULL) + * + * Called during area creation. The area is freshly created and not inserted in + * the domains_itree yet. PFNs are read and loaded into every domain held in the + * area's io_pagetable and the area is installed in the domains_itree. + * + * On failure all domains are left unchanged. 
+ */ +int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages) +{ + unsigned long done_first_end_index; + unsigned long done_all_end_index; + struct iommu_domain *domain; + unsigned long unmap_index; + struct pfn_reader pfns; + unsigned long index; + int rc; + + lockdep_assert_held(&area->iopt->domains_rwsem); + + if (xa_empty(&area->iopt->domains)) + return 0; + + mutex_lock(&pages->mutex); + rc = pfn_reader_first(&pfns, pages, iopt_area_index(area), + iopt_area_last_index(area)); + if (rc) + goto out_unlock; + + while (!pfn_reader_done(&pfns)) { + done_first_end_index = pfns.batch_end_index; + done_all_end_index = pfns.batch_start_index; + xa_for_each(&area->iopt->domains, index, domain) { + rc = batch_to_domain(&pfns.batch, domain, area, + pfns.batch_start_index); + if (rc) + goto out_unmap; + } + done_all_end_index = done_first_end_index; + + rc = pfn_reader_next(&pfns); + if (rc) + goto out_unmap; + } + rc = pfn_reader_update_pinned(&pfns); + if (rc) + goto out_unmap; + + area->storage_domain = xa_load(&area->iopt->domains, 0); + interval_tree_insert(&area->pages_node, &pages->domains_itree); + goto out_destroy; + +out_unmap: + pfn_reader_release_pins(&pfns); + xa_for_each(&area->iopt->domains, unmap_index, domain) { + unsigned long end_index; + + if (unmap_index < index) + end_index = done_first_end_index; + else + end_index = done_all_end_index; + + /* + * The area is not yet part of the domains_itree so we have to + * manage the unpinning specially. The last domain does the + * unpin, every other domain is just unmapped. + */ + if (unmap_index != area->iopt->next_domain_id - 1) { + if (end_index != iopt_area_index(area)) + iopt_area_unmap_domain_range( + area, domain, iopt_area_index(area), + end_index - 1); + } else { + iopt_area_unfill_partial_domain(area, pages, domain, + end_index); + } + } +out_destroy: + pfn_reader_destroy(&pfns); +out_unlock: + mutex_unlock(&pages->mutex); + return rc; +} + +/** + * iopt_area_unfill_domains() - unmap PFNs from the area's domains + * @area: The area to act on + * @pages: The pages associated with the area (area->pages is NULL) + * + * Called during area destruction. This unmaps the iova's covered by all the + * area's domains and releases the PFNs. 
+ */ +void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages) +{ + struct io_pagetable *iopt = area->iopt; + struct iommu_domain *domain; + unsigned long index; + + lockdep_assert_held(&iopt->domains_rwsem); + + mutex_lock(&pages->mutex); + if (!area->storage_domain) + goto out_unlock; + + xa_for_each(&iopt->domains, index, domain) + if (domain != area->storage_domain) + iopt_area_unmap_domain_range( + area, domain, iopt_area_index(area), + iopt_area_last_index(area)); + + interval_tree_remove(&area->pages_node, &pages->domains_itree); + iopt_area_unfill_domain(area, pages, area->storage_domain); + area->storage_domain = NULL; +out_unlock: + mutex_unlock(&pages->mutex); +} + +static void iopt_pages_unpin_xarray(struct pfn_batch *batch, + struct iopt_pages *pages, + unsigned long start_index, + unsigned long end_index) +{ + while (start_index <= end_index) { + batch_from_xarray_clear(batch, &pages->pinned_pfns, start_index, + end_index); + batch_unpin(batch, pages, 0, batch->total_pfns); + start_index += batch->total_pfns; + batch_clear(batch); + } +} + +/** + * iopt_pages_unfill_xarray() - Update the xarry after removing an access + * @pages: The pages to act on + * @start_index: Starting PFN index + * @last_index: Last PFN index + * + * Called when an iopt_pages_access is removed, removes pages from the itree. + * The access should already be removed from the access_itree. + */ +void iopt_pages_unfill_xarray(struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index) +{ + struct interval_tree_double_span_iter span; + u64 backup[BATCH_BACKUP_SIZE]; + struct pfn_batch batch; + bool batch_inited = false; + + lockdep_assert_held(&pages->mutex); + + interval_tree_for_each_double_span(&span, &pages->access_itree, + &pages->domains_itree, start_index, + last_index) { + if (!span.is_used) { + if (!batch_inited) { + batch_init_backup(&batch, + last_index - start_index + 1, + backup, sizeof(backup)); + batch_inited = true; + } + iopt_pages_unpin_xarray(&batch, pages, span.start_hole, + span.last_hole); + } else if (span.is_used == 2) { + /* Covered by a domain */ + clear_xarray(&pages->pinned_pfns, span.start_used, + span.last_used); + } + /* Otherwise covered by an existing access */ + } + if (batch_inited) + batch_destroy(&batch, backup); + update_unpinned(pages); +} + +/** + * iopt_pages_fill_from_xarray() - Fast path for reading PFNs + * @pages: The pages to act on + * @start_index: The first page index in the range + * @last_index: The last page index in the range + * @out_pages: The output array to return the pages + * + * This can be called if the caller is holding a refcount on an + * iopt_pages_access that is known to have already been filled. It quickly reads + * the pages directly from the xarray. + * + * This is part of the SW iommu interface to read pages for in-kernel use. 
+ */ +void iopt_pages_fill_from_xarray(struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index, + struct page **out_pages) +{ + XA_STATE(xas, &pages->pinned_pfns, start_index); + void *entry; + + rcu_read_lock(); + while (start_index <= last_index) { + entry = xas_next(&xas); + if (xas_retry(&xas, entry)) + continue; + WARN_ON(!xa_is_value(entry)); + *(out_pages++) = pfn_to_page(xa_to_value(entry)); + start_index++; + } + rcu_read_unlock(); +} + +static int iopt_pages_fill_from_domain(struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index, + struct page **out_pages) +{ + while (start_index != last_index + 1) { + unsigned long domain_last; + struct iopt_area *area; + + area = iopt_pages_find_domain_area(pages, start_index); + if (WARN_ON(!area)) + return -EINVAL; + + domain_last = min(iopt_area_last_index(area), last_index); + out_pages = raw_pages_from_domain(area->storage_domain, area, + start_index, domain_last, + out_pages); + start_index = domain_last + 1; + } + return 0; +} + +static int iopt_pages_fill_from_mm(struct iopt_pages *pages, + struct pfn_reader_user *user, + unsigned long start_index, + unsigned long last_index, + struct page **out_pages) +{ + unsigned long cur_index = start_index; + int rc; + + while (cur_index != last_index + 1) { + user->upages = out_pages + (cur_index - start_index); + rc = pfn_reader_user_pin(user, pages, cur_index, last_index); + if (rc) + goto out_unpin; + cur_index = user->upages_end; + } + return 0; + +out_unpin: + if (start_index != cur_index) + iopt_pages_err_unpin(pages, start_index, cur_index - 1, + out_pages); + return rc; +} + +/** + * iopt_pages_fill_xarray() - Read PFNs + * @pages: The pages to act on + * @start_index: The first page index in the range + * @last_index: The last page index in the range + * @out_pages: The output array to return the pages, may be NULL + * + * This populates the xarray and returns the pages in out_pages. As the slow + * path this is able to copy pages from other storage tiers into the xarray. + * + * On failure the xarray is left unchanged. + * + * This is part of the SW iommu interface to read pages for in-kernel use. 
+ */ +int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start_index, + unsigned long last_index, struct page **out_pages) +{ + struct interval_tree_double_span_iter span; + unsigned long xa_end = start_index; + struct pfn_reader_user user; + int rc; + + lockdep_assert_held(&pages->mutex); + + pfn_reader_user_init(&user, pages); + user.upages_len = (last_index - start_index + 1) * sizeof(*out_pages); + interval_tree_for_each_double_span(&span, &pages->access_itree, + &pages->domains_itree, start_index, + last_index) { + struct page **cur_pages; + + if (span.is_used == 1) { + cur_pages = out_pages + (span.start_used - start_index); + iopt_pages_fill_from_xarray(pages, span.start_used, + span.last_used, cur_pages); + continue; + } + + if (span.is_used == 2) { + cur_pages = out_pages + (span.start_used - start_index); + iopt_pages_fill_from_domain(pages, span.start_used, + span.last_used, cur_pages); + rc = pages_to_xarray(&pages->pinned_pfns, + span.start_used, span.last_used, + cur_pages); + if (rc) + goto out_clean_xa; + xa_end = span.last_used + 1; + continue; + } + + /* hole */ + cur_pages = out_pages + (span.start_hole - start_index); + rc = iopt_pages_fill_from_mm(pages, &user, span.start_hole, + span.last_hole, cur_pages); + if (rc) + goto out_clean_xa; + rc = pages_to_xarray(&pages->pinned_pfns, span.start_hole, + span.last_hole, cur_pages); + if (rc) { + iopt_pages_err_unpin(pages, span.start_hole, + span.last_hole, cur_pages); + goto out_clean_xa; + } + xa_end = span.last_hole + 1; + } + rc = pfn_reader_user_update_pinned(&user, pages); + if (rc) + goto out_clean_xa; + user.upages = NULL; + pfn_reader_user_destroy(&user, pages); + return 0; + +out_clean_xa: + if (start_index != xa_end) + iopt_pages_unfill_xarray(pages, start_index, xa_end - 1); + user.upages = NULL; + pfn_reader_user_destroy(&user, pages); + return rc; +} + +/* + * This uses the pfn_reader instead of taking a shortcut by using the mm. It can + * do every scenario and is fully consistent with what an iommu_domain would + * see. + */ +static int iopt_pages_rw_slow(struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index, unsigned long offset, + void *data, unsigned long length, + unsigned int flags) +{ + struct pfn_reader pfns; + int rc; + + mutex_lock(&pages->mutex); + + rc = pfn_reader_first(&pfns, pages, start_index, last_index); + if (rc) + goto out_unlock; + + while (!pfn_reader_done(&pfns)) { + unsigned long done; + + done = batch_rw(&pfns.batch, data, offset, length, flags); + data += done; + length -= done; + offset = 0; + pfn_reader_unpin(&pfns); + + rc = pfn_reader_next(&pfns); + if (rc) + goto out_destroy; + } + if (WARN_ON(length != 0)) + rc = -EINVAL; +out_destroy: + pfn_reader_destroy(&pfns); +out_unlock: + mutex_unlock(&pages->mutex); + return rc; +} + +/* + * A medium speed path that still allows DMA inconsistencies, but doesn't do any + * memory allocations or interval tree searches. + */ +static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index, + unsigned long offset, void *data, + unsigned long length, unsigned int flags) +{ + struct page *page = NULL; + int rc; + + if (!mmget_not_zero(pages->source_mm)) + return iopt_pages_rw_slow(pages, index, index, offset, data, + length, flags); + + if (iommufd_should_fail()) { + rc = -EINVAL; + goto out_mmput; + } + + mmap_read_lock(pages->source_mm); + rc = pin_user_pages_remote( + pages->source_mm, (uintptr_t)(pages->uptr + index * PAGE_SIZE), + 1, (flags & IOMMUFD_ACCESS_RW_WRITE) ? 
FOLL_WRITE : 0, &page, + NULL, NULL); + mmap_read_unlock(pages->source_mm); + if (rc != 1) { + if (WARN_ON(rc >= 0)) + rc = -EINVAL; + goto out_mmput; + } + copy_data_page(page, data, offset, length, flags); + unpin_user_page(page); + rc = 0; + +out_mmput: + mmput(pages->source_mm); + return rc; +} + +/** + * iopt_pages_rw_access - Copy to/from a linear slice of the pages + * @pages: pages to act on + * @start_byte: First byte of pages to copy to/from + * @data: Kernel buffer to get/put the data + * @length: Number of bytes to copy + * @flags: IOMMUFD_ACCESS_RW_* flags + * + * This will find each page in the range, kmap it and then memcpy to/from + * the given kernel buffer. + */ +int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, + void *data, unsigned long length, unsigned int flags) +{ + unsigned long start_index = start_byte / PAGE_SIZE; + unsigned long last_index = (start_byte + length - 1) / PAGE_SIZE; + bool change_mm = current->mm != pages->source_mm; + int rc = 0; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + (flags & __IOMMUFD_ACCESS_RW_SLOW_PATH)) + change_mm = true; + + if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable) + return -EPERM; + + if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) { + if (start_index == last_index) + return iopt_pages_rw_page(pages, start_index, + start_byte % PAGE_SIZE, data, + length, flags); + return iopt_pages_rw_slow(pages, start_index, last_index, + start_byte % PAGE_SIZE, data, length, + flags); + } + + /* + * Try to copy using copy_to_user(). We do this as a fast path and + * ignore any pinning inconsistencies, unlike a real DMA path. + */ + if (change_mm) { + if (!mmget_not_zero(pages->source_mm)) + return iopt_pages_rw_slow(pages, start_index, + last_index, + start_byte % PAGE_SIZE, data, + length, flags); + kthread_use_mm(pages->source_mm); + } + + if (flags & IOMMUFD_ACCESS_RW_WRITE) { + if (copy_to_user(pages->uptr + start_byte, data, length)) + rc = -EFAULT; + } else { + if (copy_from_user(data, pages->uptr + start_byte, length)) + rc = -EFAULT; + } + + if (change_mm) { + kthread_unuse_mm(pages->source_mm); + mmput(pages->source_mm); + } + + return rc; +} + +static struct iopt_pages_access * +iopt_pages_get_exact_access(struct iopt_pages *pages, unsigned long index, + unsigned long last) +{ + struct interval_tree_node *node; + + lockdep_assert_held(&pages->mutex); + + /* There can be overlapping ranges in this interval tree */ + for (node = interval_tree_iter_first(&pages->access_itree, index, last); + node; node = interval_tree_iter_next(node, index, last)) + if (node->start == index && node->last == last) + return container_of(node, struct iopt_pages_access, + node); + return NULL; +} + +/** + * iopt_area_add_access() - Record an in-knerel access for PFNs + * @area: The source of PFNs + * @start_index: First page index + * @last_index: Inclusive last page index + * @out_pages: Output list of struct page's representing the PFNs + * @flags: IOMMUFD_ACCESS_RW_* flags + * + * Record that an in-kernel access will be accessing the pages, ensure they are + * pinned, and return the PFNs as a simple list of 'struct page *'. 
+ * + * This should be undone through a matching call to iopt_area_remove_access() + */ +int iopt_area_add_access(struct iopt_area *area, unsigned long start_index, + unsigned long last_index, struct page **out_pages, + unsigned int flags) +{ + struct iopt_pages *pages = area->pages; + struct iopt_pages_access *access; + int rc; + + if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable) + return -EPERM; + + mutex_lock(&pages->mutex); + access = iopt_pages_get_exact_access(pages, start_index, last_index); + if (access) { + area->num_accesses++; + access->users++; + iopt_pages_fill_from_xarray(pages, start_index, last_index, + out_pages); + mutex_unlock(&pages->mutex); + return 0; + } + + access = kzalloc(sizeof(*access), GFP_KERNEL_ACCOUNT); + if (!access) { + rc = -ENOMEM; + goto err_unlock; + } + + rc = iopt_pages_fill_xarray(pages, start_index, last_index, out_pages); + if (rc) + goto err_free; + + access->node.start = start_index; + access->node.last = last_index; + access->users = 1; + area->num_accesses++; + interval_tree_insert(&access->node, &pages->access_itree); + mutex_unlock(&pages->mutex); + return 0; + +err_free: + kfree(access); +err_unlock: + mutex_unlock(&pages->mutex); + return rc; +} + +/** + * iopt_area_remove_access() - Release an in-kernel access for PFNs + * @area: The source of PFNs + * @start_index: First page index + * @last_index: Inclusive last page index + * + * Undo iopt_area_add_access() and unpin the pages if necessary. The caller + * must stop using the PFNs before calling this. + */ +void iopt_area_remove_access(struct iopt_area *area, unsigned long start_index, + unsigned long last_index) +{ + struct iopt_pages *pages = area->pages; + struct iopt_pages_access *access; + + mutex_lock(&pages->mutex); + access = iopt_pages_get_exact_access(pages, start_index, last_index); + if (WARN_ON(!access)) + goto out_unlock; + + WARN_ON(area->num_accesses == 0 || access->users == 0); + area->num_accesses--; + access->users--; + if (access->users) + goto out_unlock; + + interval_tree_remove(&access->node, &pages->access_itree); + iopt_pages_unfill_xarray(pages, start_index, last_index); + kfree(access); +out_unlock: + mutex_unlock(&pages->mutex); +} diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c new file mode 100644 index 000000000000..cfb5fe9a5e0e --- /dev/null +++ b/drivers/iommu/iommufd/selftest.c @@ -0,0 +1,853 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + * + * Kernel side components to support tools/testing/selftests/iommu + */ +#include <linux/slab.h> +#include <linux/iommu.h> +#include <linux/xarray.h> +#include <linux/file.h> +#include <linux/anon_inodes.h> +#include <linux/fault-inject.h> +#include <uapi/linux/iommufd.h> + +#include "io_pagetable.h" +#include "iommufd_private.h" +#include "iommufd_test.h" + +static DECLARE_FAULT_ATTR(fail_iommufd); +static struct dentry *dbgfs_root; + +size_t iommufd_test_memory_limit = 65536; + +enum { + MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2, + + /* + * Like a real page table alignment requires the low bits of the address + * to be zero. xarray also requires the high bit to be zero, so we store + * the pfns shifted. The upper bits are used for metadata. 
+ */ + MOCK_PFN_MASK = ULONG_MAX / MOCK_IO_PAGE_SIZE, + + _MOCK_PFN_START = MOCK_PFN_MASK + 1, + MOCK_PFN_START_IOVA = _MOCK_PFN_START, + MOCK_PFN_LAST_IOVA = _MOCK_PFN_START, +}; + +/* + * Syzkaller has trouble randomizing the correct iova to use since it is linked + * to the map ioctl's output, and it has no ide about that. So, simplify things. + * In syzkaller mode the 64 bit IOVA is converted into an nth area and offset + * value. This has a much smaller randomization space and syzkaller can hit it. + */ +static unsigned long iommufd_test_syz_conv_iova(struct io_pagetable *iopt, + u64 *iova) +{ + struct syz_layout { + __u32 nth_area; + __u32 offset; + }; + struct syz_layout *syz = (void *)iova; + unsigned int nth = syz->nth_area; + struct iopt_area *area; + + down_read(&iopt->iova_rwsem); + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + if (nth == 0) { + up_read(&iopt->iova_rwsem); + return iopt_area_iova(area) + syz->offset; + } + nth--; + } + up_read(&iopt->iova_rwsem); + + return 0; +} + +void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, + unsigned int ioas_id, u64 *iova, u32 *flags) +{ + struct iommufd_ioas *ioas; + + if (!(*flags & MOCK_FLAGS_ACCESS_SYZ)) + return; + *flags &= ~(u32)MOCK_FLAGS_ACCESS_SYZ; + + ioas = iommufd_get_ioas(ucmd, ioas_id); + if (IS_ERR(ioas)) + return; + *iova = iommufd_test_syz_conv_iova(&ioas->iopt, iova); + iommufd_put_object(&ioas->obj); +} + +struct mock_iommu_domain { + struct iommu_domain domain; + struct xarray pfns; +}; + +enum selftest_obj_type { + TYPE_IDEV, +}; + +struct selftest_obj { + struct iommufd_object obj; + enum selftest_obj_type type; + + union { + struct { + struct iommufd_hw_pagetable *hwpt; + struct iommufd_ctx *ictx; + struct device mock_dev; + } idev; + }; +}; + +static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type) +{ + struct mock_iommu_domain *mock; + + if (WARN_ON(iommu_domain_type != IOMMU_DOMAIN_UNMANAGED)) + return NULL; + + mock = kzalloc(sizeof(*mock), GFP_KERNEL); + if (!mock) + return NULL; + mock->domain.geometry.aperture_start = MOCK_APERTURE_START; + mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; + mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; + xa_init(&mock->pfns); + return &mock->domain; +} + +static void mock_domain_free(struct iommu_domain *domain) +{ + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + + WARN_ON(!xa_empty(&mock->pfns)); + kfree(mock); +} + +static int mock_domain_map_pages(struct iommu_domain *domain, + unsigned long iova, phys_addr_t paddr, + size_t pgsize, size_t pgcount, int prot, + gfp_t gfp, size_t *mapped) +{ + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + unsigned long flags = MOCK_PFN_START_IOVA; + unsigned long start_iova = iova; + + /* + * xarray does not reliably work with fault injection because it does a + * retry allocation, so put our own failure point. 
+ */ + if (iommufd_should_fail()) + return -ENOENT; + + WARN_ON(iova % MOCK_IO_PAGE_SIZE); + WARN_ON(pgsize % MOCK_IO_PAGE_SIZE); + for (; pgcount; pgcount--) { + size_t cur; + + for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { + void *old; + + if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) + flags = MOCK_PFN_LAST_IOVA; + old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE, + xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) | + flags), + gfp); + if (xa_is_err(old)) { + for (; start_iova != iova; + start_iova += MOCK_IO_PAGE_SIZE) + xa_erase(&mock->pfns, + start_iova / + MOCK_IO_PAGE_SIZE); + return xa_err(old); + } + WARN_ON(old); + iova += MOCK_IO_PAGE_SIZE; + paddr += MOCK_IO_PAGE_SIZE; + *mapped += MOCK_IO_PAGE_SIZE; + flags = 0; + } + } + return 0; +} + +static size_t mock_domain_unmap_pages(struct iommu_domain *domain, + unsigned long iova, size_t pgsize, + size_t pgcount, + struct iommu_iotlb_gather *iotlb_gather) +{ + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + bool first = true; + size_t ret = 0; + void *ent; + + WARN_ON(iova % MOCK_IO_PAGE_SIZE); + WARN_ON(pgsize % MOCK_IO_PAGE_SIZE); + + for (; pgcount; pgcount--) { + size_t cur; + + for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { + ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); + WARN_ON(!ent); + /* + * iommufd generates unmaps that must be a strict + * superset of the map's performend So every starting + * IOVA should have been an iova passed to map, and the + * + * First IOVA must be present and have been a first IOVA + * passed to map_pages + */ + if (first) { + WARN_ON(!(xa_to_value(ent) & + MOCK_PFN_START_IOVA)); + first = false; + } + if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) + WARN_ON(!(xa_to_value(ent) & + MOCK_PFN_LAST_IOVA)); + + iova += MOCK_IO_PAGE_SIZE; + ret += MOCK_IO_PAGE_SIZE; + } + } + return ret; +} + +static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain, + dma_addr_t iova) +{ + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + void *ent; + + WARN_ON(iova % MOCK_IO_PAGE_SIZE); + ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); + WARN_ON(!ent); + return (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE; +} + +static const struct iommu_ops mock_ops = { + .owner = THIS_MODULE, + .pgsize_bitmap = MOCK_IO_PAGE_SIZE, + .domain_alloc = mock_domain_alloc, + .default_domain_ops = + &(struct iommu_domain_ops){ + .free = mock_domain_free, + .map_pages = mock_domain_map_pages, + .unmap_pages = mock_domain_unmap_pages, + .iova_to_phys = mock_domain_iova_to_phys, + }, +}; + +static inline struct iommufd_hw_pagetable * +get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, + struct mock_iommu_domain **mock) +{ + struct iommufd_hw_pagetable *hwpt; + struct iommufd_object *obj; + + obj = iommufd_get_object(ucmd->ictx, mockpt_id, + IOMMUFD_OBJ_HW_PAGETABLE); + if (IS_ERR(obj)) + return ERR_CAST(obj); + hwpt = container_of(obj, struct iommufd_hw_pagetable, obj); + if (hwpt->domain->ops != mock_ops.default_domain_ops) { + iommufd_put_object(&hwpt->obj); + return ERR_PTR(-EINVAL); + } + *mock = container_of(hwpt->domain, struct mock_iommu_domain, domain); + return hwpt; +} + +/* Create an hw_pagetable with the mock domain so we can test the domain ops */ +static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + static struct bus_type mock_bus = { .iommu_ops = &mock_ops }; + struct iommufd_hw_pagetable *hwpt; + struct selftest_obj 
*sobj; + struct iommufd_ioas *ioas; + int rc; + + ioas = iommufd_get_ioas(ucmd, cmd->id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + sobj = iommufd_object_alloc(ucmd->ictx, sobj, IOMMUFD_OBJ_SELFTEST); + if (IS_ERR(sobj)) { + rc = PTR_ERR(sobj); + goto out_ioas; + } + sobj->idev.ictx = ucmd->ictx; + sobj->type = TYPE_IDEV; + sobj->idev.mock_dev.bus = &mock_bus; + + hwpt = iommufd_device_selftest_attach(ucmd->ictx, ioas, + &sobj->idev.mock_dev); + if (IS_ERR(hwpt)) { + rc = PTR_ERR(hwpt); + goto out_sobj; + } + sobj->idev.hwpt = hwpt; + + /* Userspace must destroy both of these IDs to destroy the object */ + cmd->mock_domain.out_hwpt_id = hwpt->obj.id; + cmd->mock_domain.out_device_id = sobj->obj.id; + iommufd_object_finalize(ucmd->ictx, &sobj->obj); + iommufd_put_object(&ioas->obj); + return iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + +out_sobj: + iommufd_object_abort(ucmd->ictx, &sobj->obj); +out_ioas: + iommufd_put_object(&ioas->obj); + return rc; +} + +/* Add an additional reserved IOVA to the IOAS */ +static int iommufd_test_add_reserved(struct iommufd_ucmd *ucmd, + unsigned int mockpt_id, + unsigned long start, size_t length) +{ + struct iommufd_ioas *ioas; + int rc; + + ioas = iommufd_get_ioas(ucmd, mockpt_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + down_write(&ioas->iopt.iova_rwsem); + rc = iopt_reserve_iova(&ioas->iopt, start, start + length - 1, NULL); + up_write(&ioas->iopt.iova_rwsem); + iommufd_put_object(&ioas->obj); + return rc; +} + +/* Check that every pfn under each iova matches the pfn under a user VA */ +static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd, + unsigned int mockpt_id, unsigned long iova, + size_t length, void __user *uptr) +{ + struct iommufd_hw_pagetable *hwpt; + struct mock_iommu_domain *mock; + int rc; + + if (iova % MOCK_IO_PAGE_SIZE || length % MOCK_IO_PAGE_SIZE || + (uintptr_t)uptr % MOCK_IO_PAGE_SIZE) + return -EINVAL; + + hwpt = get_md_pagetable(ucmd, mockpt_id, &mock); + if (IS_ERR(hwpt)) + return PTR_ERR(hwpt); + + for (; length; length -= MOCK_IO_PAGE_SIZE) { + struct page *pages[1]; + unsigned long pfn; + long npages; + void *ent; + + npages = get_user_pages_fast((uintptr_t)uptr & PAGE_MASK, 1, 0, + pages); + if (npages < 0) { + rc = npages; + goto out_put; + } + if (WARN_ON(npages != 1)) { + rc = -EFAULT; + goto out_put; + } + pfn = page_to_pfn(pages[0]); + put_page(pages[0]); + + ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); + if (!ent || + (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE != + pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) { + rc = -EINVAL; + goto out_put; + } + iova += MOCK_IO_PAGE_SIZE; + uptr += MOCK_IO_PAGE_SIZE; + } + rc = 0; + +out_put: + iommufd_put_object(&hwpt->obj); + return rc; +} + +/* Check that the page ref count matches, to look for missing pin/unpins */ +static int iommufd_test_md_check_refs(struct iommufd_ucmd *ucmd, + void __user *uptr, size_t length, + unsigned int refs) +{ + if (length % PAGE_SIZE || (uintptr_t)uptr % PAGE_SIZE) + return -EINVAL; + + for (; length; length -= PAGE_SIZE) { + struct page *pages[1]; + long npages; + + npages = get_user_pages_fast((uintptr_t)uptr, 1, 0, pages); + if (npages < 0) + return npages; + if (WARN_ON(npages != 1)) + return -EFAULT; + if (!PageCompound(pages[0])) { + unsigned int count; + + count = page_ref_count(pages[0]); + if (count / GUP_PIN_COUNTING_BIAS != refs) { + put_page(pages[0]); + return -EIO; + } + } + put_page(pages[0]); + uptr += PAGE_SIZE; + } + return 0; +} + +struct selftest_access { + struct iommufd_access 
*access; + struct file *file; + struct mutex lock; + struct list_head items; + unsigned int next_id; + bool destroying; +}; + +struct selftest_access_item { + struct list_head items_elm; + unsigned long iova; + size_t length; + unsigned int id; +}; + +static const struct file_operations iommfd_test_staccess_fops; + +static struct selftest_access *iommufd_access_get(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADFD); + + if (file->f_op != &iommfd_test_staccess_fops) { + fput(file); + return ERR_PTR(-EBADFD); + } + return file->private_data; +} + +static void iommufd_test_access_unmap(void *data, unsigned long iova, + unsigned long length) +{ + unsigned long iova_last = iova + length - 1; + struct selftest_access *staccess = data; + struct selftest_access_item *item; + struct selftest_access_item *tmp; + + mutex_lock(&staccess->lock); + list_for_each_entry_safe(item, tmp, &staccess->items, items_elm) { + if (iova > item->iova + item->length - 1 || + iova_last < item->iova) + continue; + list_del(&item->items_elm); + iommufd_access_unpin_pages(staccess->access, item->iova, + item->length); + kfree(item); + } + mutex_unlock(&staccess->lock); +} + +static int iommufd_test_access_item_destroy(struct iommufd_ucmd *ucmd, + unsigned int access_id, + unsigned int item_id) +{ + struct selftest_access_item *item; + struct selftest_access *staccess; + + staccess = iommufd_access_get(access_id); + if (IS_ERR(staccess)) + return PTR_ERR(staccess); + + mutex_lock(&staccess->lock); + list_for_each_entry(item, &staccess->items, items_elm) { + if (item->id == item_id) { + list_del(&item->items_elm); + iommufd_access_unpin_pages(staccess->access, item->iova, + item->length); + mutex_unlock(&staccess->lock); + kfree(item); + fput(staccess->file); + return 0; + } + } + mutex_unlock(&staccess->lock); + fput(staccess->file); + return -ENOENT; +} + +static int iommufd_test_staccess_release(struct inode *inode, + struct file *filep) +{ + struct selftest_access *staccess = filep->private_data; + + if (staccess->access) { + iommufd_test_access_unmap(staccess, 0, ULONG_MAX); + iommufd_access_destroy(staccess->access); + } + mutex_destroy(&staccess->lock); + kfree(staccess); + return 0; +} + +static const struct iommufd_access_ops selftest_access_ops_pin = { + .needs_pin_pages = 1, + .unmap = iommufd_test_access_unmap, +}; + +static const struct iommufd_access_ops selftest_access_ops = { + .unmap = iommufd_test_access_unmap, +}; + +static const struct file_operations iommfd_test_staccess_fops = { + .release = iommufd_test_staccess_release, +}; + +static struct selftest_access *iommufd_test_alloc_access(void) +{ + struct selftest_access *staccess; + struct file *filep; + + staccess = kzalloc(sizeof(*staccess), GFP_KERNEL_ACCOUNT); + if (!staccess) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&staccess->items); + mutex_init(&staccess->lock); + + filep = anon_inode_getfile("[iommufd_test_staccess]", + &iommfd_test_staccess_fops, staccess, + O_RDWR); + if (IS_ERR(filep)) { + kfree(staccess); + return ERR_CAST(filep); + } + staccess->file = filep; + return staccess; +} + +static int iommufd_test_create_access(struct iommufd_ucmd *ucmd, + unsigned int ioas_id, unsigned int flags) +{ + struct iommu_test_cmd *cmd = ucmd->cmd; + struct selftest_access *staccess; + struct iommufd_access *access; + int fdno; + int rc; + + if (flags & ~MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES) + return -EOPNOTSUPP; + + staccess = iommufd_test_alloc_access(); + if (IS_ERR(staccess)) + return 
PTR_ERR(staccess); + + fdno = get_unused_fd_flags(O_CLOEXEC); + if (fdno < 0) { + rc = -ENOMEM; + goto out_free_staccess; + } + + access = iommufd_access_create( + ucmd->ictx, ioas_id, + (flags & MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES) ? + &selftest_access_ops_pin : + &selftest_access_ops, + staccess); + if (IS_ERR(access)) { + rc = PTR_ERR(access); + goto out_put_fdno; + } + cmd->create_access.out_access_fd = fdno; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_destroy; + + staccess->access = access; + fd_install(fdno, staccess->file); + return 0; + +out_destroy: + iommufd_access_destroy(access); +out_put_fdno: + put_unused_fd(fdno); +out_free_staccess: + fput(staccess->file); + return rc; +} + +/* Check that the pages in a page array match the pages in the user VA */ +static int iommufd_test_check_pages(void __user *uptr, struct page **pages, + size_t npages) +{ + for (; npages; npages--) { + struct page *tmp_pages[1]; + long rc; + + rc = get_user_pages_fast((uintptr_t)uptr, 1, 0, tmp_pages); + if (rc < 0) + return rc; + if (WARN_ON(rc != 1)) + return -EFAULT; + put_page(tmp_pages[0]); + if (tmp_pages[0] != *pages) + return -EBADE; + pages++; + uptr += PAGE_SIZE; + } + return 0; +} + +static int iommufd_test_access_pages(struct iommufd_ucmd *ucmd, + unsigned int access_id, unsigned long iova, + size_t length, void __user *uptr, + u32 flags) +{ + struct iommu_test_cmd *cmd = ucmd->cmd; + struct selftest_access_item *item; + struct selftest_access *staccess; + struct page **pages; + size_t npages; + int rc; + + /* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */ + if (length > 16*1024*1024) + return -ENOMEM; + + if (flags & ~(MOCK_FLAGS_ACCESS_WRITE | MOCK_FLAGS_ACCESS_SYZ)) + return -EOPNOTSUPP; + + staccess = iommufd_access_get(access_id); + if (IS_ERR(staccess)) + return PTR_ERR(staccess); + + if (staccess->access->ops != &selftest_access_ops_pin) { + rc = -EOPNOTSUPP; + goto out_put; + } + + if (flags & MOCK_FLAGS_ACCESS_SYZ) + iova = iommufd_test_syz_conv_iova(&staccess->access->ioas->iopt, + &cmd->access_pages.iova); + + npages = (ALIGN(iova + length, PAGE_SIZE) - + ALIGN_DOWN(iova, PAGE_SIZE)) / + PAGE_SIZE; + pages = kvcalloc(npages, sizeof(*pages), GFP_KERNEL_ACCOUNT); + if (!pages) { + rc = -ENOMEM; + goto out_put; + } + + /* + * Drivers will need to think very carefully about this locking. The + * core code can do multiple unmaps instantaneously after + * iommufd_access_pin_pages() and *all* the unmaps must not return until + * the range is unpinned. This simple implementation puts a global lock + * around the pin, which may not suit drivers that want this to be a + * performance path. drivers that get this wrong will trigger WARN_ON + * races and cause EDEADLOCK failures to userspace. 
+ */ + mutex_lock(&staccess->lock); + rc = iommufd_access_pin_pages(staccess->access, iova, length, pages, + flags & MOCK_FLAGS_ACCESS_WRITE); + if (rc) + goto out_unlock; + + /* For syzkaller allow uptr to be NULL to skip this check */ + if (uptr) { + rc = iommufd_test_check_pages( + uptr - (iova - ALIGN_DOWN(iova, PAGE_SIZE)), pages, + npages); + if (rc) + goto out_unaccess; + } + + item = kzalloc(sizeof(*item), GFP_KERNEL_ACCOUNT); + if (!item) { + rc = -ENOMEM; + goto out_unaccess; + } + + item->iova = iova; + item->length = length; + item->id = staccess->next_id++; + list_add_tail(&item->items_elm, &staccess->items); + + cmd->access_pages.out_access_pages_id = item->id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_free_item; + goto out_unlock; + +out_free_item: + list_del(&item->items_elm); + kfree(item); +out_unaccess: + iommufd_access_unpin_pages(staccess->access, iova, length); +out_unlock: + mutex_unlock(&staccess->lock); + kvfree(pages); +out_put: + fput(staccess->file); + return rc; +} + +static int iommufd_test_access_rw(struct iommufd_ucmd *ucmd, + unsigned int access_id, unsigned long iova, + size_t length, void __user *ubuf, + unsigned int flags) +{ + struct iommu_test_cmd *cmd = ucmd->cmd; + struct selftest_access *staccess; + void *tmp; + int rc; + + /* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */ + if (length > 16*1024*1024) + return -ENOMEM; + + if (flags & ~(MOCK_ACCESS_RW_WRITE | MOCK_ACCESS_RW_SLOW_PATH | + MOCK_FLAGS_ACCESS_SYZ)) + return -EOPNOTSUPP; + + staccess = iommufd_access_get(access_id); + if (IS_ERR(staccess)) + return PTR_ERR(staccess); + + tmp = kvzalloc(length, GFP_KERNEL_ACCOUNT); + if (!tmp) { + rc = -ENOMEM; + goto out_put; + } + + if (flags & MOCK_ACCESS_RW_WRITE) { + if (copy_from_user(tmp, ubuf, length)) { + rc = -EFAULT; + goto out_free; + } + } + + if (flags & MOCK_FLAGS_ACCESS_SYZ) + iova = iommufd_test_syz_conv_iova(&staccess->access->ioas->iopt, + &cmd->access_rw.iova); + + rc = iommufd_access_rw(staccess->access, iova, tmp, length, flags); + if (rc) + goto out_free; + if (!(flags & MOCK_ACCESS_RW_WRITE)) { + if (copy_to_user(ubuf, tmp, length)) { + rc = -EFAULT; + goto out_free; + } + } + +out_free: + kvfree(tmp); +out_put: + fput(staccess->file); + return rc; +} +static_assert((unsigned int)MOCK_ACCESS_RW_WRITE == IOMMUFD_ACCESS_RW_WRITE); +static_assert((unsigned int)MOCK_ACCESS_RW_SLOW_PATH == + __IOMMUFD_ACCESS_RW_SLOW_PATH); + +void iommufd_selftest_destroy(struct iommufd_object *obj) +{ + struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj); + + switch (sobj->type) { + case TYPE_IDEV: + iommufd_device_selftest_detach(sobj->idev.ictx, + sobj->idev.hwpt); + break; + } +} + +int iommufd_test(struct iommufd_ucmd *ucmd) +{ + struct iommu_test_cmd *cmd = ucmd->cmd; + + switch (cmd->op) { + case IOMMU_TEST_OP_ADD_RESERVED: + return iommufd_test_add_reserved(ucmd, cmd->id, + cmd->add_reserved.start, + cmd->add_reserved.length); + case IOMMU_TEST_OP_MOCK_DOMAIN: + return iommufd_test_mock_domain(ucmd, cmd); + case IOMMU_TEST_OP_MD_CHECK_MAP: + return iommufd_test_md_check_pa( + ucmd, cmd->id, cmd->check_map.iova, + cmd->check_map.length, + u64_to_user_ptr(cmd->check_map.uptr)); + case IOMMU_TEST_OP_MD_CHECK_REFS: + return iommufd_test_md_check_refs( + ucmd, u64_to_user_ptr(cmd->check_refs.uptr), + cmd->check_refs.length, cmd->check_refs.refs); + case IOMMU_TEST_OP_CREATE_ACCESS: + return iommufd_test_create_access(ucmd, cmd->id, + cmd->create_access.flags); + case 
IOMMU_TEST_OP_ACCESS_PAGES: + return iommufd_test_access_pages( + ucmd, cmd->id, cmd->access_pages.iova, + cmd->access_pages.length, + u64_to_user_ptr(cmd->access_pages.uptr), + cmd->access_pages.flags); + case IOMMU_TEST_OP_ACCESS_RW: + return iommufd_test_access_rw( + ucmd, cmd->id, cmd->access_rw.iova, + cmd->access_rw.length, + u64_to_user_ptr(cmd->access_rw.uptr), + cmd->access_rw.flags); + case IOMMU_TEST_OP_DESTROY_ACCESS_PAGES: + return iommufd_test_access_item_destroy( + ucmd, cmd->id, cmd->destroy_access_pages.access_pages_id); + case IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT: + /* Protect _batch_init(), can not be less than elmsz */ + if (cmd->memory_limit.limit < + sizeof(unsigned long) + sizeof(u32)) + return -EINVAL; + iommufd_test_memory_limit = cmd->memory_limit.limit; + return 0; + default: + return -EOPNOTSUPP; + } +} + +bool iommufd_should_fail(void) +{ + return should_fail(&fail_iommufd, 1); +} + +void __init iommufd_test_init(void) +{ + dbgfs_root = + fault_create_debugfs_attr("fail_iommufd", NULL, &fail_iommufd); +} + +void iommufd_test_exit(void) +{ + debugfs_remove_recursive(dbgfs_root); +} diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c new file mode 100644 index 000000000000..3ceca0e8311c --- /dev/null +++ b/drivers/iommu/iommufd/vfio_compat.c @@ -0,0 +1,472 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#include <linux/file.h> +#include <linux/interval_tree.h> +#include <linux/iommu.h> +#include <linux/iommufd.h> +#include <linux/slab.h> +#include <linux/vfio.h> +#include <uapi/linux/vfio.h> +#include <uapi/linux/iommufd.h> + +#include "iommufd_private.h" + +static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx) +{ + struct iommufd_ioas *ioas = ERR_PTR(-ENODEV); + + xa_lock(&ictx->objects); + if (!ictx->vfio_ioas || !iommufd_lock_obj(&ictx->vfio_ioas->obj)) + goto out_unlock; + ioas = ictx->vfio_ioas; +out_unlock: + xa_unlock(&ictx->objects); + return ioas; +} + +/** + * iommufd_vfio_compat_ioas_id - Return the IOAS ID that vfio should use + * @ictx: Context to operate on + * @out_ioas_id: The ioas_id the caller should use + * + * The compatibility IOAS is the IOAS that the vfio compatibility ioctls operate + * on since they do not have an IOAS ID input in their ABI. Only attaching a + * group should cause a default creation of the internal ioas, this returns the + * existing ioas if it has already been assigned somehow. + */ +int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, u32 *out_ioas_id) +{ + struct iommufd_ioas *ioas = NULL; + struct iommufd_ioas *out_ioas; + + ioas = iommufd_ioas_alloc(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + xa_lock(&ictx->objects); + if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) + out_ioas = ictx->vfio_ioas; + else { + out_ioas = ioas; + ictx->vfio_ioas = ioas; + } + xa_unlock(&ictx->objects); + + *out_ioas_id = out_ioas->obj.id; + if (out_ioas != ioas) { + iommufd_put_object(&out_ioas->obj); + iommufd_object_abort(ictx, &ioas->obj); + return 0; + } + /* + * An automatically created compat IOAS is treated as a userspace + * created object. Userspace can learn the ID via IOMMU_VFIO_IOAS_GET, + * and if not manually destroyed it will be destroyed automatically + * at iommufd release. 
+ */ + iommufd_object_finalize(ictx, &ioas->obj); + return 0; +} +EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_id, IOMMUFD_VFIO); + +int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd) +{ + struct iommu_vfio_ioas *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + + if (cmd->__reserved) + return -EOPNOTSUPP; + switch (cmd->op) { + case IOMMU_VFIO_IOAS_GET: + ioas = get_compat_ioas(ucmd->ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + cmd->ioas_id = ioas->obj.id; + iommufd_put_object(&ioas->obj); + return iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + + case IOMMU_VFIO_IOAS_SET: + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + xa_lock(&ucmd->ictx->objects); + ucmd->ictx->vfio_ioas = ioas; + xa_unlock(&ucmd->ictx->objects); + iommufd_put_object(&ioas->obj); + return 0; + + case IOMMU_VFIO_IOAS_CLEAR: + xa_lock(&ucmd->ictx->objects); + ucmd->ictx->vfio_ioas = NULL; + xa_unlock(&ucmd->ictx->objects); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd, + void __user *arg) +{ + u32 supported_flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); + struct vfio_iommu_type1_dma_map map; + int iommu_prot = IOMMU_CACHE; + struct iommufd_ioas *ioas; + unsigned long iova; + int rc; + + if (copy_from_user(&map, arg, minsz)) + return -EFAULT; + + if (map.argsz < minsz || map.flags & ~supported_flags) + return -EINVAL; + + if (map.flags & VFIO_DMA_MAP_FLAG_READ) + iommu_prot |= IOMMU_READ; + if (map.flags & VFIO_DMA_MAP_FLAG_WRITE) + iommu_prot |= IOMMU_WRITE; + + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + /* + * Maps created through the legacy interface always use VFIO compatible + * rlimit accounting. If the user wishes to use the faster user based + * rlimit accounting then they must use the new interface. + */ + iova = map.iova; + rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova, u64_to_user_ptr(map.vaddr), + map.size, iommu_prot, 0); + iommufd_put_object(&ioas->obj); + return rc; +} + +static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd, + void __user *arg) +{ + size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); + /* + * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new + * dirty tracking direction: + * https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/ + * https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/ + */ + u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL; + struct vfio_iommu_type1_dma_unmap unmap; + unsigned long unmapped = 0; + struct iommufd_ioas *ioas; + int rc; + + if (copy_from_user(&unmap, arg, minsz)) + return -EFAULT; + + if (unmap.argsz < minsz || unmap.flags & ~supported_flags) + return -EINVAL; + + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) { + if (unmap.iova != 0 || unmap.size != 0) { + rc = -EINVAL; + goto err_put; + } + rc = iopt_unmap_all(&ioas->iopt, &unmapped); + } else { + if (READ_ONCE(ioas->iopt.disable_large_pages)) { + /* + * Create cuts at the start and last of the requested + * range. If the start IOVA is 0 then it doesn't need to + * be cut. + */ + unsigned long iovas[] = { unmap.iova + unmap.size - 1, + unmap.iova - 1 }; + + rc = iopt_cut_iova(&ioas->iopt, iovas, + unmap.iova ? 
2 : 1); + if (rc) + goto err_put; + } + rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size, + &unmapped); + } + unmap.size = unmapped; + if (copy_to_user(arg, &unmap, minsz)) + rc = -EFAULT; + +err_put: + iommufd_put_object(&ioas->obj); + return rc; +} + +static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx) +{ + struct iommufd_hw_pagetable *hwpt; + struct iommufd_ioas *ioas; + int rc = 1; + + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + mutex_lock(&ioas->mutex); + list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) { + if (!hwpt->enforce_cache_coherency) { + rc = 0; + break; + } + } + mutex_unlock(&ioas->mutex); + + iommufd_put_object(&ioas->obj); + return rc; +} + +static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx, + unsigned long type) +{ + switch (type) { + case VFIO_TYPE1_IOMMU: + case VFIO_TYPE1v2_IOMMU: + case VFIO_UNMAP_ALL: + return 1; + + case VFIO_DMA_CC_IOMMU: + return iommufd_vfio_cc_iommu(ictx); + + /* + * This is obsolete, and to be removed from VFIO. It was an incomplete + * idea that got merged. + * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/ + */ + case VFIO_TYPE1_NESTING_IOMMU: + return 0; + + /* + * VFIO_DMA_MAP_FLAG_VADDR + * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/ + * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/ + * + * It is hard to see how this could be implemented safely. + */ + case VFIO_UPDATE_VADDR: + default: + return 0; + } +} + +static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type) +{ + struct iommufd_ioas *ioas = NULL; + int rc = 0; + + if (type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU) + return -EINVAL; + + /* VFIO fails the set_iommu if there is no group */ + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + /* + * The difference between TYPE1 and TYPE1v2 is the ability to unmap in + * the middle of mapped ranges. This is complicated by huge page support + * which creates single large IOPTEs that cannot be split by the iommu + * driver. TYPE1 is very old at this point and likely nothing uses it, + * however it is simple enough to emulate by simply disabling the + * problematic large IOPTEs. Then we can safely unmap within any range. 
+ */ + if (type == VFIO_TYPE1_IOMMU) + rc = iopt_disable_large_pages(&ioas->iopt); + iommufd_put_object(&ioas->obj); + return rc; +} + +static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas) +{ + struct io_pagetable *iopt = &ioas->iopt; + unsigned long pgsize_bitmap = ULONG_MAX; + struct iommu_domain *domain; + unsigned long index; + + down_read(&iopt->domains_rwsem); + xa_for_each(&iopt->domains, index, domain) + pgsize_bitmap &= domain->pgsize_bitmap; + + /* See vfio_update_pgsize_bitmap() */ + if (pgsize_bitmap & ~PAGE_MASK) { + pgsize_bitmap &= PAGE_MASK; + pgsize_bitmap |= PAGE_SIZE; + } + pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment); + up_read(&iopt->domains_rwsem); + return pgsize_bitmap; +} + +static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas, + struct vfio_info_cap_header __user *cur, + size_t avail) +{ + struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas = + container_of(cur, + struct vfio_iommu_type1_info_cap_iova_range __user, + header); + struct vfio_iommu_type1_info_cap_iova_range cap_iovas = { + .header = { + .id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, + .version = 1, + }, + }; + struct interval_tree_span_iter span; + + interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0, + ULONG_MAX) { + struct vfio_iova_range range; + + if (!span.is_hole) + continue; + range.start = span.start_hole; + range.end = span.last_hole; + if (avail >= struct_size(&cap_iovas, iova_ranges, + cap_iovas.nr_iovas + 1) && + copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas], + &range, sizeof(range))) + return -EFAULT; + cap_iovas.nr_iovas++; + } + if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas) && + copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas))) + return -EFAULT; + return struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas); +} + +static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas, + struct vfio_info_cap_header __user *cur, + size_t avail) +{ + struct vfio_iommu_type1_info_dma_avail cap_dma = { + .header = { + .id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL, + .version = 1, + }, + /* + * iommufd's limit is based on the cgroup's memory limit. + * Normally vfio would return U16_MAX here, and provide a module + * parameter to adjust it. Since S390 qemu userspace actually + * pays attention and needs a value bigger than U16_MAX return + * U32_MAX. 
+ */ + .avail = U32_MAX, + }; + + if (avail >= sizeof(cap_dma) && + copy_to_user(cur, &cap_dma, sizeof(cap_dma))) + return -EFAULT; + return sizeof(cap_dma); +} + +static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx, + void __user *arg) +{ + typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas, + struct vfio_info_cap_header __user *cur, + size_t avail); + static const fill_cap_fn fill_fns[] = { + iommufd_fill_cap_dma_avail, + iommufd_fill_cap_iova, + }; + size_t minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); + struct vfio_info_cap_header __user *last_cap = NULL; + struct vfio_iommu_type1_info info; + struct iommufd_ioas *ioas; + size_t total_cap_size; + int rc; + int i; + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + minsz = min_t(size_t, info.argsz, sizeof(info)); + + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + info.flags = VFIO_IOMMU_INFO_PGSIZES; + info.iova_pgsizes = iommufd_get_pagesizes(ioas); + info.cap_offset = 0; + + down_read(&ioas->iopt.iova_rwsem); + total_cap_size = sizeof(info); + for (i = 0; i != ARRAY_SIZE(fill_fns); i++) { + int cap_size; + + if (info.argsz > total_cap_size) + cap_size = fill_fns[i](ioas, arg + total_cap_size, + info.argsz - total_cap_size); + else + cap_size = fill_fns[i](ioas, NULL, 0); + if (cap_size < 0) { + rc = cap_size; + goto out_put; + } + if (last_cap && info.argsz >= total_cap_size && + put_user(total_cap_size, &last_cap->next)) { + rc = -EFAULT; + goto out_put; + } + last_cap = arg + total_cap_size; + total_cap_size += cap_size; + } + + /* + * If the user did not provide enough space then only some caps are + * returned and the argsz will be updated to the correct amount to get + * all caps. + */ + if (info.argsz >= total_cap_size) + info.cap_offset = sizeof(info); + info.argsz = total_cap_size; + info.flags |= VFIO_IOMMU_INFO_CAPS; + if (copy_to_user(arg, &info, minsz)) { + rc = -EFAULT; + goto out_put; + } + rc = 0; + +out_put: + up_read(&ioas->iopt.iova_rwsem); + iommufd_put_object(&ioas->obj); + return rc; +} + +int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd, + unsigned long arg) +{ + void __user *uarg = (void __user *)arg; + + switch (cmd) { + case VFIO_GET_API_VERSION: + return VFIO_API_VERSION; + case VFIO_SET_IOMMU: + return iommufd_vfio_set_iommu(ictx, arg); + case VFIO_CHECK_EXTENSION: + return iommufd_vfio_check_extension(ictx, arg); + case VFIO_IOMMU_GET_INFO: + return iommufd_vfio_iommu_get_info(ictx, uarg); + case VFIO_IOMMU_MAP_DMA: + return iommufd_vfio_map_dma(ictx, cmd, uarg); + case VFIO_IOMMU_UNMAP_DMA: + return iommufd_vfio_unmap_dma(ictx, cmd, uarg); + case VFIO_IOMMU_DIRTY_PAGES: + default: + return -ENOIOCTLCMD; + } + return -ENOIOCTLCMD; +} diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index 3b30c0752274..a003bd5fc65c 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -628,8 +628,6 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain, * Something is wrong, we can't attach two devices using * different IOMMUs to the same domain. 
*/ - dev_err(dev, "Can't attach IPMMU %s to domain on IPMMU %s\n", - dev_name(mmu->dev), dev_name(domain->mmu->dev)); ret = -EINVAL; } else dev_info(dev, "Reusing IPMMU context %u\n", domain->context_id); @@ -661,22 +659,22 @@ static void ipmmu_detach_device(struct iommu_domain *io_domain, } static int ipmmu_map(struct iommu_domain *io_domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) + phys_addr_t paddr, size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped) { struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain); - if (!domain) - return -ENODEV; - - return domain->iop->map(domain->iop, iova, paddr, size, prot, gfp); + return domain->iop->map_pages(domain->iop, iova, paddr, pgsize, pgcount, + prot, gfp, mapped); } static size_t ipmmu_unmap(struct iommu_domain *io_domain, unsigned long iova, - size_t size, struct iommu_iotlb_gather *gather) + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *gather) { struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain); - return domain->iop->unmap(domain->iop, iova, size, gather); + return domain->iop->unmap_pages(domain->iop, iova, pgsize, pgcount, gather); } static void ipmmu_flush_iotlb_all(struct iommu_domain *io_domain) @@ -879,8 +877,8 @@ static const struct iommu_ops ipmmu_ops = { .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = ipmmu_attach_device, .detach_dev = ipmmu_detach_device, - .map = ipmmu_map, - .unmap = ipmmu_unmap, + .map_pages = ipmmu_map, + .unmap_pages = ipmmu_unmap, .flush_iotlb_all = ipmmu_flush_iotlb_all, .iotlb_sync = ipmmu_iotlb_sync, .iova_to_phys = ipmmu_iova_to_phys, diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 16179a9a7283..c60624910872 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -471,14 +471,16 @@ fail: } static int msm_iommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t pa, size_t len, int prot, gfp_t gfp) + phys_addr_t pa, size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped) { struct msm_priv *priv = to_msm_priv(domain); unsigned long flags; int ret; spin_lock_irqsave(&priv->pgtlock, flags); - ret = priv->iop->map(priv->iop, iova, pa, len, prot, GFP_ATOMIC); + ret = priv->iop->map_pages(priv->iop, iova, pa, pgsize, pgcount, prot, + GFP_ATOMIC, mapped); spin_unlock_irqrestore(&priv->pgtlock, flags); return ret; @@ -493,16 +495,18 @@ static void msm_iommu_sync_map(struct iommu_domain *domain, unsigned long iova, } static size_t msm_iommu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t len, struct iommu_iotlb_gather *gather) + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *gather) { struct msm_priv *priv = to_msm_priv(domain); unsigned long flags; + size_t ret; spin_lock_irqsave(&priv->pgtlock, flags); - len = priv->iop->unmap(priv->iop, iova, len, gather); + ret = priv->iop->unmap_pages(priv->iop, iova, pgsize, pgcount, gather); spin_unlock_irqrestore(&priv->pgtlock, flags); - return len; + return ret; } static phys_addr_t msm_iommu_iova_to_phys(struct iommu_domain *domain, @@ -679,8 +683,8 @@ static struct iommu_ops msm_iommu_ops = { .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = msm_iommu_attach_dev, .detach_dev = msm_iommu_detach_dev, - .map = msm_iommu_map, - .unmap = msm_iommu_unmap, + .map_pages = msm_iommu_map, + .unmap_pages = msm_iommu_unmap, /* * Nothing is needed here, the barrier to guarantee * completion of the tlb sync operation is implicitly diff --git 
a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 2ab2ecfe01f8..2badd6acfb23 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -108,8 +108,12 @@ #define F_MMU_INT_ID_SUB_COMM_ID(a) (((a) >> 7) & 0x3) #define F_MMU_INT_ID_COMM_ID_EXT(a) (((a) >> 10) & 0x7) #define F_MMU_INT_ID_SUB_COMM_ID_EXT(a) (((a) >> 7) & 0x7) +/* Macro for 5 bits length port ID field (default) */ #define F_MMU_INT_ID_LARB_ID(a) (((a) >> 7) & 0x7) #define F_MMU_INT_ID_PORT_ID(a) (((a) >> 2) & 0x1f) +/* Macro for 6 bits length port ID field */ +#define F_MMU_INT_ID_LARB_ID_WID_6(a) (((a) >> 8) & 0x7) +#define F_MMU_INT_ID_PORT_ID_WID_6(a) (((a) >> 2) & 0x3f) #define MTK_PROTECT_PA_ALIGN 256 #define MTK_IOMMU_BANK_SZ 0x1000 @@ -139,6 +143,7 @@ #define IFA_IOMMU_PCIE_SUPPORT BIT(16) #define PGTABLE_PA_35_EN BIT(17) #define TF_PORT_TO_ADDR_MT8173 BIT(18) +#define INT_ID_PORT_WIDTH_6 BIT(19) #define MTK_IOMMU_HAS_FLAG_MASK(pdata, _x, mask) \ ((((pdata)->flags) & (mask)) == (_x)) @@ -165,6 +170,7 @@ enum mtk_iommu_plat { M4U_MT8186, M4U_MT8192, M4U_MT8195, + M4U_MT8365, }; struct mtk_iommu_iova_region { @@ -223,10 +229,7 @@ struct mtk_iommu_data { struct device *smicomm_dev; struct mtk_iommu_bank_data *bank; - - struct dma_iommu_mapping *mapping; /* For mtk_iommu_v1.c */ struct regmap *pericfg; - struct mutex mutex; /* Protect m4u_group/m4u_dom above */ /* @@ -441,20 +444,25 @@ static irqreturn_t mtk_iommu_isr(int irq, void *dev_id) fault_pa |= (u64)pa34_32 << 32; if (MTK_IOMMU_IS_TYPE(plat_data, MTK_IOMMU_TYPE_MM)) { - fault_port = F_MMU_INT_ID_PORT_ID(regval); if (MTK_IOMMU_HAS_FLAG(plat_data, HAS_SUB_COMM_2BITS)) { fault_larb = F_MMU_INT_ID_COMM_ID(regval); sub_comm = F_MMU_INT_ID_SUB_COMM_ID(regval); + fault_port = F_MMU_INT_ID_PORT_ID(regval); } else if (MTK_IOMMU_HAS_FLAG(plat_data, HAS_SUB_COMM_3BITS)) { fault_larb = F_MMU_INT_ID_COMM_ID_EXT(regval); sub_comm = F_MMU_INT_ID_SUB_COMM_ID_EXT(regval); + fault_port = F_MMU_INT_ID_PORT_ID(regval); + } else if (MTK_IOMMU_HAS_FLAG(plat_data, INT_ID_PORT_WIDTH_6)) { + fault_port = F_MMU_INT_ID_PORT_ID_WID_6(regval); + fault_larb = F_MMU_INT_ID_LARB_ID_WID_6(regval); } else { + fault_port = F_MMU_INT_ID_PORT_ID(regval); fault_larb = F_MMU_INT_ID_LARB_ID(regval); } fault_larb = data->plat_data->larbid_remap[fault_larb][sub_comm]; } - if (report_iommu_fault(&dom->domain, bank->parent_dev, fault_iova, + if (!dom || report_iommu_fault(&dom->domain, bank->parent_dev, fault_iova, write ? 
IOMMU_FAULT_WRITE : IOMMU_FAULT_READ)) { dev_err_ratelimited( bank->parent_dev, @@ -609,7 +617,7 @@ static int mtk_iommu_domain_finalise(struct mtk_iommu_domain *dom, dom->iop = alloc_io_pgtable_ops(ARM_V7S, &dom->cfg, data); if (!dom->iop) { dev_err(data->dev, "Failed to alloc io pgtable\n"); - return -EINVAL; + return -ENOMEM; } /* Update our support page sizes bitmap */ @@ -668,7 +676,7 @@ static int mtk_iommu_attach_device(struct iommu_domain *domain, ret = mtk_iommu_domain_finalise(dom, frstdata, region_id); if (ret) { mutex_unlock(&dom->mutex); - return -ENODEV; + return ret; } dom->bank = &data->bank[bankid]; } @@ -711,7 +719,8 @@ static void mtk_iommu_detach_device(struct iommu_domain *domain, } static int mtk_iommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) + phys_addr_t paddr, size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped) { struct mtk_iommu_domain *dom = to_mtk_domain(domain); @@ -720,17 +729,17 @@ static int mtk_iommu_map(struct iommu_domain *domain, unsigned long iova, paddr |= BIT_ULL(32); /* Synchronize with the tlb_lock */ - return dom->iop->map(dom->iop, iova, paddr, size, prot, gfp); + return dom->iop->map_pages(dom->iop, iova, paddr, pgsize, pgcount, prot, gfp, mapped); } static size_t mtk_iommu_unmap(struct iommu_domain *domain, - unsigned long iova, size_t size, + unsigned long iova, size_t pgsize, size_t pgcount, struct iommu_iotlb_gather *gather) { struct mtk_iommu_domain *dom = to_mtk_domain(domain); - iommu_iotlb_gather_add_range(gather, iova, size); - return dom->iop->unmap(dom->iop, iova, size, gather); + iommu_iotlb_gather_add_range(gather, iova, pgsize * pgcount); + return dom->iop->unmap_pages(dom->iop, iova, pgsize, pgcount, gather); } static void mtk_iommu_flush_iotlb_all(struct iommu_domain *domain) @@ -938,8 +947,8 @@ static const struct iommu_ops mtk_iommu_ops = { .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = mtk_iommu_attach_device, .detach_dev = mtk_iommu_detach_device, - .map = mtk_iommu_map, - .unmap = mtk_iommu_unmap, + .map_pages = mtk_iommu_map, + .unmap_pages = mtk_iommu_unmap, .flush_iotlb_all = mtk_iommu_flush_iotlb_all, .iotlb_sync = mtk_iommu_iotlb_sync, .iotlb_sync_map = mtk_iommu_sync_map, @@ -1043,21 +1052,26 @@ static const struct component_master_ops mtk_iommu_com_ops = { static int mtk_iommu_mm_dts_parse(struct device *dev, struct component_match **match, struct mtk_iommu_data *data) { - struct device_node *larbnode, *smicomm_node, *smi_subcomm_node; - struct platform_device *plarbdev; + struct device_node *larbnode, *frst_avail_smicomm_node = NULL; + struct platform_device *plarbdev, *pcommdev; struct device_link *link; int i, larb_nr, ret; larb_nr = of_count_phandle_with_args(dev->of_node, "mediatek,larbs", NULL); if (larb_nr < 0) return larb_nr; + if (larb_nr == 0 || larb_nr > MTK_LARB_NR_MAX) + return -EINVAL; for (i = 0; i < larb_nr; i++) { + struct device_node *smicomm_node, *smi_subcomm_node; u32 id; larbnode = of_parse_phandle(dev->of_node, "mediatek,larbs", i); - if (!larbnode) - return -EINVAL; + if (!larbnode) { + ret = -EINVAL; + goto err_larbdev_put; + } if (!of_device_is_available(larbnode)) { of_node_put(larbnode); @@ -1067,48 +1081,91 @@ static int mtk_iommu_mm_dts_parse(struct device *dev, struct component_match **m ret = of_property_read_u32(larbnode, "mediatek,larb-id", &id); if (ret)/* The id is consecutive if there is no this property */ id = i; + if (id >= MTK_LARB_NR_MAX) { + of_node_put(larbnode); + ret 
= -EINVAL; + goto err_larbdev_put; + } plarbdev = of_find_device_by_node(larbnode); + of_node_put(larbnode); if (!plarbdev) { - of_node_put(larbnode); - return -ENODEV; + ret = -ENODEV; + goto err_larbdev_put; } - if (!plarbdev->dev.driver) { - of_node_put(larbnode); - return -EPROBE_DEFER; + if (data->larb_imu[id].dev) { + platform_device_put(plarbdev); + ret = -EEXIST; + goto err_larbdev_put; } data->larb_imu[id].dev = &plarbdev->dev; - component_match_add_release(dev, match, component_release_of, - component_compare_of, larbnode); + if (!plarbdev->dev.driver) { + ret = -EPROBE_DEFER; + goto err_larbdev_put; + } + + /* Get smi-(sub)-common dev from the last larb. */ + smi_subcomm_node = of_parse_phandle(larbnode, "mediatek,smi", 0); + if (!smi_subcomm_node) { + ret = -EINVAL; + goto err_larbdev_put; + } + + /* + * It may have two level smi-common. the node is smi-sub-common if it + * has a new mediatek,smi property. otherwise it is smi-commmon. + */ + smicomm_node = of_parse_phandle(smi_subcomm_node, "mediatek,smi", 0); + if (smicomm_node) + of_node_put(smi_subcomm_node); + else + smicomm_node = smi_subcomm_node; + + /* + * All the larbs that connect to one IOMMU must connect with the same + * smi-common. + */ + if (!frst_avail_smicomm_node) { + frst_avail_smicomm_node = smicomm_node; + } else if (frst_avail_smicomm_node != smicomm_node) { + dev_err(dev, "mediatek,smi property is not right @larb%d.", id); + of_node_put(smicomm_node); + ret = -EINVAL; + goto err_larbdev_put; + } else { + of_node_put(smicomm_node); + } + + component_match_add(dev, match, component_compare_dev, &plarbdev->dev); + platform_device_put(plarbdev); } - /* Get smi-(sub)-common dev from the last larb. */ - smi_subcomm_node = of_parse_phandle(larbnode, "mediatek,smi", 0); - if (!smi_subcomm_node) + if (!frst_avail_smicomm_node) return -EINVAL; - /* - * It may have two level smi-common. the node is smi-sub-common if it - * has a new mediatek,smi property. otherwise it is smi-commmon. - */ - smicomm_node = of_parse_phandle(smi_subcomm_node, "mediatek,smi", 0); - if (smicomm_node) - of_node_put(smi_subcomm_node); - else - smicomm_node = smi_subcomm_node; - - plarbdev = of_find_device_by_node(smicomm_node); - of_node_put(smicomm_node); - data->smicomm_dev = &plarbdev->dev; + pcommdev = of_find_device_by_node(frst_avail_smicomm_node); + of_node_put(frst_avail_smicomm_node); + if (!pcommdev) + return -ENODEV; + data->smicomm_dev = &pcommdev->dev; link = device_link_add(data->smicomm_dev, dev, DL_FLAG_STATELESS | DL_FLAG_PM_RUNTIME); + platform_device_put(pcommdev); if (!link) { dev_err(dev, "Unable to link %s.\n", dev_name(data->smicomm_dev)); return -EINVAL; } return 0; + +err_larbdev_put: + for (i = MTK_LARB_NR_MAX - 1; i >= 0; i--) { + if (!data->larb_imu[i].dev) + continue; + put_device(data->larb_imu[i].dev); + } + return ret; } static int mtk_iommu_probe(struct platform_device *pdev) @@ -1173,6 +1230,8 @@ static int mtk_iommu_probe(struct platform_device *pdev) banks_num = data->plat_data->banks_num; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -EINVAL; if (resource_size(res) < banks_num * MTK_IOMMU_BANK_SZ) { dev_err(dev, "banknr %d. 
res %pR is not enough.\n", banks_num, res); return -EINVAL; @@ -1516,6 +1575,17 @@ static const struct mtk_iommu_plat_data mt8195_data_vpp = { {4, MTK_INVALID_LARBID, MTK_INVALID_LARBID, MTK_INVALID_LARBID, 6}}, }; +static const struct mtk_iommu_plat_data mt8365_data = { + .m4u_plat = M4U_MT8365, + .flags = RESET_AXI | INT_ID_PORT_WIDTH_6, + .inv_sel_reg = REG_MMU_INV_SEL_GEN1, + .banks_num = 1, + .banks_enable = {true}, + .iova_region = single_domain, + .iova_region_nr = ARRAY_SIZE(single_domain), + .larbid_remap = {{0}, {1}, {2}, {3}, {4}, {5}}, /* Linear mapping. */ +}; + static const struct of_device_id mtk_iommu_of_ids[] = { { .compatible = "mediatek,mt2712-m4u", .data = &mt2712_data}, { .compatible = "mediatek,mt6779-m4u", .data = &mt6779_data}, @@ -1528,6 +1598,7 @@ static const struct of_device_id mtk_iommu_of_ids[] = { { .compatible = "mediatek,mt8195-iommu-infra", .data = &mt8195_data_infra}, { .compatible = "mediatek,mt8195-iommu-vdo", .data = &mt8195_data_vdo}, { .compatible = "mediatek,mt8195-iommu-vpp", .data = &mt8195_data_vpp}, + { .compatible = "mediatek,mt8365-m4u", .data = &mt8365_data}, {} }; diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 6e0e65831eb7..69682ee068d2 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -327,44 +327,42 @@ static void mtk_iommu_v1_detach_device(struct iommu_domain *domain, struct devic } static int mtk_iommu_v1_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) + phys_addr_t paddr, size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped) { struct mtk_iommu_v1_domain *dom = to_mtk_domain(domain); - unsigned int page_num = size >> MT2701_IOMMU_PAGE_SHIFT; unsigned long flags; unsigned int i; u32 *pgt_base_iova = dom->pgt_va + (iova >> MT2701_IOMMU_PAGE_SHIFT); u32 pabase = (u32)paddr; - int map_size = 0; spin_lock_irqsave(&dom->pgtlock, flags); - for (i = 0; i < page_num; i++) { - if (pgt_base_iova[i]) { - memset(pgt_base_iova, 0, i * sizeof(u32)); + for (i = 0; i < pgcount; i++) { + if (pgt_base_iova[i]) break; - } pgt_base_iova[i] = pabase | F_DESC_VALID | F_DESC_NONSEC; pabase += MT2701_IOMMU_PAGE_SIZE; - map_size += MT2701_IOMMU_PAGE_SIZE; } spin_unlock_irqrestore(&dom->pgtlock, flags); - mtk_iommu_v1_tlb_flush_range(dom->data, iova, size); + *mapped = i * MT2701_IOMMU_PAGE_SIZE; + mtk_iommu_v1_tlb_flush_range(dom->data, iova, *mapped); - return map_size == size ? 0 : -EEXIST; + return i == pgcount ? 
0 : -EEXIST; } static size_t mtk_iommu_v1_unmap(struct iommu_domain *domain, unsigned long iova, - size_t size, struct iommu_iotlb_gather *gather) + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *gather) { struct mtk_iommu_v1_domain *dom = to_mtk_domain(domain); unsigned long flags; u32 *pgt_base_iova = dom->pgt_va + (iova >> MT2701_IOMMU_PAGE_SHIFT); - unsigned int page_num = size >> MT2701_IOMMU_PAGE_SHIFT; + size_t size = pgcount * MT2701_IOMMU_PAGE_SIZE; spin_lock_irqsave(&dom->pgtlock, flags); - memset(pgt_base_iova, 0, page_num * sizeof(u32)); + memset(pgt_base_iova, 0, pgcount * sizeof(u32)); spin_unlock_irqrestore(&dom->pgtlock, flags); mtk_iommu_v1_tlb_flush_range(dom->data, iova, size); @@ -586,13 +584,13 @@ static const struct iommu_ops mtk_iommu_v1_ops = { .release_device = mtk_iommu_v1_release_device, .def_domain_type = mtk_iommu_v1_def_domain_type, .device_group = generic_device_group, - .pgsize_bitmap = ~0UL << MT2701_IOMMU_PAGE_SHIFT, + .pgsize_bitmap = MT2701_IOMMU_PAGE_SIZE, .owner = THIS_MODULE, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = mtk_iommu_v1_attach_device, .detach_dev = mtk_iommu_v1_detach_device, - .map = mtk_iommu_v1_map, - .unmap = mtk_iommu_v1_unmap, + .map_pages = mtk_iommu_v1_map, + .unmap_pages = mtk_iommu_v1_unmap, .iova_to_phys = mtk_iommu_v1_iova_to_phys, .free = mtk_iommu_v1_domain_free, } diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 5696314ae69e..00d98f08732f 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -9,7 +9,6 @@ #include <linux/iommu.h> #include <linux/limits.h> #include <linux/module.h> -#include <linux/msi.h> #include <linux/of.h> #include <linux/of_iommu.h> #include <linux/of_pci.h> diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 07ee2600113c..2fd7702c6709 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1414,7 +1414,7 @@ static int omap_iommu_attach_init(struct device *dev, odomain->num_iommus = omap_iommu_count(dev); if (!odomain->num_iommus) - return -EINVAL; + return -ENODEV; odomain->iommus = kcalloc(odomain->num_iommus, sizeof(*iommu), GFP_ATOMIC); @@ -1464,7 +1464,7 @@ omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) if (!arch_data || !arch_data->iommu_dev) { dev_err(dev, "device doesn't have an associated iommu\n"); - return -EINVAL; + return -ENODEV; } spin_lock(&omap_domain->lock); @@ -1472,7 +1472,7 @@ omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) /* only a single client device can be attached to a domain */ if (omap_domain->dev) { dev_err(dev, "iommu domain is already attached\n"); - ret = -EBUSY; + ret = -EINVAL; goto out; } diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index a3fc59b814ab..a68eadd64f38 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -280,19 +280,17 @@ static u32 rk_mk_pte(phys_addr_t page, int prot) * 11:9 - Page address bit 34:32 * 8:4 - Page address bit 39:35 * 3 - Security - * 2 - Readable - * 1 - Writable + * 2 - Writable + * 1 - Readable * 0 - 1 if Page @ Page address is valid */ -#define RK_PTE_PAGE_READABLE_V2 BIT(2) -#define RK_PTE_PAGE_WRITABLE_V2 BIT(1) static u32 rk_mk_pte_v2(phys_addr_t page, int prot) { u32 flags = 0; - flags |= (prot & IOMMU_READ) ? RK_PTE_PAGE_READABLE_V2 : 0; - flags |= (prot & IOMMU_WRITE) ? RK_PTE_PAGE_WRITABLE_V2 : 0; + flags |= (prot & IOMMU_READ) ? RK_PTE_PAGE_READABLE : 0; + flags |= (prot & IOMMU_WRITE) ? 
RK_PTE_PAGE_WRITABLE : 0; return rk_mk_dte_v2(page) | flags; } diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index 3c071782f6f1..ed33c6cce083 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -10,28 +10,18 @@ #include <linux/iommu.h> #include <linux/iommu-helper.h> #include <linux/sizes.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> #include <asm/pci_dma.h> -/* - * Physically contiguous memory regions can be mapped with 4 KiB alignment, - * we allow all page sizes that are an order of 4KiB (no special large page - * support so far). - */ -#define S390_IOMMU_PGSIZES (~0xFFFUL) - static const struct iommu_ops s390_iommu_ops; struct s390_domain { struct iommu_domain domain; struct list_head devices; unsigned long *dma_table; - spinlock_t dma_table_lock; spinlock_t list_lock; -}; - -struct s390_domain_device { - struct list_head list; - struct zpci_dev *zdev; + struct rcu_head rcu; }; static struct s390_domain *to_s390_domain(struct iommu_domain *dom) @@ -67,119 +57,125 @@ static struct iommu_domain *s390_domain_alloc(unsigned domain_type) kfree(s390_domain); return NULL; } + s390_domain->domain.geometry.force_aperture = true; + s390_domain->domain.geometry.aperture_start = 0; + s390_domain->domain.geometry.aperture_end = ZPCI_TABLE_SIZE_RT - 1; - spin_lock_init(&s390_domain->dma_table_lock); spin_lock_init(&s390_domain->list_lock); - INIT_LIST_HEAD(&s390_domain->devices); + INIT_LIST_HEAD_RCU(&s390_domain->devices); return &s390_domain->domain; } -static void s390_domain_free(struct iommu_domain *domain) +static void s390_iommu_rcu_free_domain(struct rcu_head *head) { - struct s390_domain *s390_domain = to_s390_domain(domain); + struct s390_domain *s390_domain = container_of(head, struct s390_domain, rcu); dma_cleanup_tables(s390_domain->dma_table); kfree(s390_domain); } +static void s390_domain_free(struct iommu_domain *domain) +{ + struct s390_domain *s390_domain = to_s390_domain(domain); + + rcu_read_lock(); + WARN_ON(!list_empty(&s390_domain->devices)); + rcu_read_unlock(); + + call_rcu(&s390_domain->rcu, s390_iommu_rcu_free_domain); +} + +static void __s390_iommu_detach_device(struct zpci_dev *zdev) +{ + struct s390_domain *s390_domain = zdev->s390_domain; + unsigned long flags; + + if (!s390_domain) + return; + + spin_lock_irqsave(&s390_domain->list_lock, flags); + list_del_rcu(&zdev->iommu_list); + spin_unlock_irqrestore(&s390_domain->list_lock, flags); + + zpci_unregister_ioat(zdev, 0); + zdev->s390_domain = NULL; + zdev->dma_table = NULL; +} + static int s390_iommu_attach_device(struct iommu_domain *domain, struct device *dev) { struct s390_domain *s390_domain = to_s390_domain(domain); struct zpci_dev *zdev = to_zpci_dev(dev); - struct s390_domain_device *domain_device; unsigned long flags; - int cc, rc; + u8 status; + int cc; if (!zdev) return -ENODEV; - domain_device = kzalloc(sizeof(*domain_device), GFP_KERNEL); - if (!domain_device) - return -ENOMEM; - - if (zdev->dma_table && !zdev->s390_domain) { - cc = zpci_dma_exit_device(zdev); - if (cc) { - rc = -EIO; - goto out_free; - } - } + if (WARN_ON(domain->geometry.aperture_start > zdev->end_dma || + domain->geometry.aperture_end < zdev->start_dma)) + return -EINVAL; if (zdev->s390_domain) - zpci_unregister_ioat(zdev, 0); + __s390_iommu_detach_device(zdev); + else if (zdev->dma_table) + zpci_dma_exit_device(zdev); - zdev->dma_table = s390_domain->dma_table; cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table)); - if (cc) { - rc 
= -EIO; - goto out_restore; - } + virt_to_phys(s390_domain->dma_table), &status); + /* + * If the device is undergoing error recovery the reset code + * will re-establish the new domain. + */ + if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL) + return -EIO; + zdev->dma_table = s390_domain->dma_table; - spin_lock_irqsave(&s390_domain->list_lock, flags); - /* First device defines the DMA range limits */ - if (list_empty(&s390_domain->devices)) { - domain->geometry.aperture_start = zdev->start_dma; - domain->geometry.aperture_end = zdev->end_dma; - domain->geometry.force_aperture = true; - /* Allow only devices with identical DMA range limits */ - } else if (domain->geometry.aperture_start != zdev->start_dma || - domain->geometry.aperture_end != zdev->end_dma) { - rc = -EINVAL; - spin_unlock_irqrestore(&s390_domain->list_lock, flags); - goto out_restore; - } - domain_device->zdev = zdev; + zdev->dma_table = s390_domain->dma_table; zdev->s390_domain = s390_domain; - list_add(&domain_device->list, &s390_domain->devices); + + spin_lock_irqsave(&s390_domain->list_lock, flags); + list_add_rcu(&zdev->iommu_list, &s390_domain->devices); spin_unlock_irqrestore(&s390_domain->list_lock, flags); return 0; - -out_restore: - if (!zdev->s390_domain) { - zpci_dma_init_device(zdev); - } else { - zdev->dma_table = zdev->s390_domain->dma_table; - zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table)); - } -out_free: - kfree(domain_device); - - return rc; } static void s390_iommu_detach_device(struct iommu_domain *domain, struct device *dev) { - struct s390_domain *s390_domain = to_s390_domain(domain); struct zpci_dev *zdev = to_zpci_dev(dev); - struct s390_domain_device *domain_device, *tmp; - unsigned long flags; - int found = 0; - if (!zdev) - return; + WARN_ON(zdev->s390_domain != to_s390_domain(domain)); - spin_lock_irqsave(&s390_domain->list_lock, flags); - list_for_each_entry_safe(domain_device, tmp, &s390_domain->devices, - list) { - if (domain_device->zdev == zdev) { - list_del(&domain_device->list); - kfree(domain_device); - found = 1; - break; - } + __s390_iommu_detach_device(zdev); + zpci_dma_init_device(zdev); +} + +static void s390_iommu_get_resv_regions(struct device *dev, + struct list_head *list) +{ + struct zpci_dev *zdev = to_zpci_dev(dev); + struct iommu_resv_region *region; + + if (zdev->start_dma) { + region = iommu_alloc_resv_region(0, zdev->start_dma, 0, + IOMMU_RESV_RESERVED, GFP_KERNEL); + if (!region) + return; + list_add_tail(®ion->list, list); } - spin_unlock_irqrestore(&s390_domain->list_lock, flags); - if (found && (zdev->s390_domain == s390_domain)) { - zdev->s390_domain = NULL; - zpci_unregister_ioat(zdev, 0); - zpci_dma_init_device(zdev); + if (zdev->end_dma < ZPCI_TABLE_SIZE_RT - 1) { + region = iommu_alloc_resv_region(zdev->end_dma + 1, + ZPCI_TABLE_SIZE_RT - zdev->end_dma - 1, + 0, IOMMU_RESV_RESERVED, GFP_KERNEL); + if (!region) + return; + list_add_tail(®ion->list, list); } } @@ -192,55 +188,88 @@ static struct iommu_device *s390_iommu_probe_device(struct device *dev) zdev = to_zpci_dev(dev); + if (zdev->start_dma > zdev->end_dma || + zdev->start_dma > ZPCI_TABLE_SIZE_RT - 1) + return ERR_PTR(-EINVAL); + + if (zdev->end_dma > ZPCI_TABLE_SIZE_RT - 1) + zdev->end_dma = ZPCI_TABLE_SIZE_RT - 1; + return &zdev->iommu_dev; } static void s390_iommu_release_device(struct device *dev) { struct zpci_dev *zdev = to_zpci_dev(dev); - struct iommu_domain *domain; /* - * This is a workaround for a scenario where the IOMMU API common code - * "forgets" 
to call the detach_dev callback: After binding a device - * to vfio-pci and completing the VFIO_SET_IOMMU ioctl (which triggers - * the attach_dev), removing the device via - * "echo 1 > /sys/bus/pci/devices/.../remove" won't trigger detach_dev, - * only release_device will be called via the BUS_NOTIFY_REMOVED_DEVICE - * notifier. - * - * So let's call detach_dev from here if it hasn't been called before. + * release_device is expected to detach any domain currently attached + * to the device, but keep it attached to other devices in the group. */ - if (zdev && zdev->s390_domain) { - domain = iommu_get_domain_for_dev(dev); - if (domain) - s390_iommu_detach_device(domain, dev); + if (zdev) + __s390_iommu_detach_device(zdev); +} + +static void s390_iommu_flush_iotlb_all(struct iommu_domain *domain) +{ + struct s390_domain *s390_domain = to_s390_domain(domain); + struct zpci_dev *zdev; + + rcu_read_lock(); + list_for_each_entry_rcu(zdev, &s390_domain->devices, iommu_list) { + zpci_refresh_trans((u64)zdev->fh << 32, zdev->start_dma, + zdev->end_dma - zdev->start_dma + 1); } + rcu_read_unlock(); } -static int s390_iommu_update_trans(struct s390_domain *s390_domain, - phys_addr_t pa, dma_addr_t dma_addr, - size_t size, int flags) +static void s390_iommu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) { - struct s390_domain_device *domain_device; - phys_addr_t page_addr = pa & PAGE_MASK; - dma_addr_t start_dma_addr = dma_addr; - unsigned long irq_flags, nr_pages, i; - unsigned long *entry; - int rc = 0; + struct s390_domain *s390_domain = to_s390_domain(domain); + size_t size = gather->end - gather->start + 1; + struct zpci_dev *zdev; - if (dma_addr < s390_domain->domain.geometry.aperture_start || - dma_addr + size > s390_domain->domain.geometry.aperture_end) - return -EINVAL; + /* If gather was never added to there is nothing to flush */ + if (!gather->end) + return; - nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; - if (!nr_pages) - return 0; + rcu_read_lock(); + list_for_each_entry_rcu(zdev, &s390_domain->devices, iommu_list) { + zpci_refresh_trans((u64)zdev->fh << 32, gather->start, + size); + } + rcu_read_unlock(); +} + +static void s390_iommu_iotlb_sync_map(struct iommu_domain *domain, + unsigned long iova, size_t size) +{ + struct s390_domain *s390_domain = to_s390_domain(domain); + struct zpci_dev *zdev; + + rcu_read_lock(); + list_for_each_entry_rcu(zdev, &s390_domain->devices, iommu_list) { + if (!zdev->tlb_refresh) + continue; + zpci_refresh_trans((u64)zdev->fh << 32, + iova, size); + } + rcu_read_unlock(); +} + +static int s390_iommu_validate_trans(struct s390_domain *s390_domain, + phys_addr_t pa, dma_addr_t dma_addr, + unsigned long nr_pages, int flags) +{ + phys_addr_t page_addr = pa & PAGE_MASK; + unsigned long *entry; + unsigned long i; + int rc; - spin_lock_irqsave(&s390_domain->dma_table_lock, irq_flags); for (i = 0; i < nr_pages; i++) { entry = dma_walk_cpu_trans(s390_domain->dma_table, dma_addr); - if (!entry) { + if (unlikely(!entry)) { rc = -ENOMEM; goto undo_cpu_trans; } @@ -249,47 +278,70 @@ static int s390_iommu_update_trans(struct s390_domain *s390_domain, dma_addr += PAGE_SIZE; } - spin_lock(&s390_domain->list_lock); - list_for_each_entry(domain_device, &s390_domain->devices, list) { - rc = zpci_refresh_trans((u64) domain_device->zdev->fh << 32, - start_dma_addr, nr_pages * PAGE_SIZE); - if (rc) + return 0; + +undo_cpu_trans: + while (i-- > 0) { + dma_addr -= PAGE_SIZE; + entry = dma_walk_cpu_trans(s390_domain->dma_table, + dma_addr); + if 
(!entry)
+			break;
+		dma_update_cpu_trans(entry, 0, ZPCI_PTE_INVALID);
 	}
-	spin_unlock(&s390_domain->list_lock);
-undo_cpu_trans:
-	if (rc && ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID)) {
-		flags = ZPCI_PTE_INVALID;
-		while (i-- > 0) {
-			page_addr -= PAGE_SIZE;
-			dma_addr -= PAGE_SIZE;
-			entry = dma_walk_cpu_trans(s390_domain->dma_table,
-						   dma_addr);
-			if (!entry)
-				break;
-			dma_update_cpu_trans(entry, page_addr, flags);
+	return rc;
+}
+
+static int s390_iommu_invalidate_trans(struct s390_domain *s390_domain,
+				       dma_addr_t dma_addr, unsigned long nr_pages)
+{
+	unsigned long *entry;
+	unsigned long i;
+	int rc = 0;
+
+	for (i = 0; i < nr_pages; i++) {
+		entry = dma_walk_cpu_trans(s390_domain->dma_table, dma_addr);
+		if (unlikely(!entry)) {
+			rc = -EINVAL;
+			break;
+		}
+		dma_update_cpu_trans(entry, 0, ZPCI_PTE_INVALID);
+		dma_addr += PAGE_SIZE;
+	}
-	spin_unlock_irqrestore(&s390_domain->dma_table_lock, irq_flags);
 
 	return rc;
 }
 
-static int s390_iommu_map(struct iommu_domain *domain, unsigned long iova,
-			  phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+static int s390_iommu_map_pages(struct iommu_domain *domain,
+				unsigned long iova, phys_addr_t paddr,
+				size_t pgsize, size_t pgcount,
+				int prot, gfp_t gfp, size_t *mapped)
 {
 	struct s390_domain *s390_domain = to_s390_domain(domain);
+	size_t size = pgcount << __ffs(pgsize);
 	int flags = ZPCI_PTE_VALID, rc = 0;
 
+	if (pgsize != SZ_4K)
+		return -EINVAL;
+
+	if (iova < s390_domain->domain.geometry.aperture_start ||
+	    (iova + size - 1) > s390_domain->domain.geometry.aperture_end)
+		return -EINVAL;
+
+	if (!IS_ALIGNED(iova | paddr, pgsize))
+		return -EINVAL;
+
 	if (!(prot & IOMMU_READ))
 		return -EINVAL;
 
 	if (!(prot & IOMMU_WRITE))
 		flags |= ZPCI_TABLE_PROTECTED;
 
-	rc = s390_iommu_update_trans(s390_domain, paddr, iova,
-				     size, flags);
+	rc = s390_iommu_validate_trans(s390_domain, paddr, iova,
+				       pgcount, flags);
+	if (!rc)
+		*mapped = size;
 
 	return rc;
 }
 
@@ -298,7 +350,8 @@ static phys_addr_t s390_iommu_iova_to_phys(struct iommu_domain *domain,
 					   dma_addr_t iova)
 {
 	struct s390_domain *s390_domain = to_s390_domain(domain);
-	unsigned long *sto, *pto, *rto, flags;
+	unsigned long *rto, *sto, *pto;
+	unsigned long ste, pte, rte;
 	unsigned int rtx, sx, px;
 	phys_addr_t phys = 0;
 
@@ -311,38 +364,40 @@ static phys_addr_t s390_iommu_iova_to_phys(struct iommu_domain *domain,
 	px = calc_px(iova);
 	rto = s390_domain->dma_table;
 
-	spin_lock_irqsave(&s390_domain->dma_table_lock, flags);
-	if (rto && reg_entry_isvalid(rto[rtx])) {
-		sto = get_rt_sto(rto[rtx]);
-		if (sto && reg_entry_isvalid(sto[sx])) {
-			pto = get_st_pto(sto[sx]);
-			if (pto && pt_entry_isvalid(pto[px]))
-				phys = pto[px] & ZPCI_PTE_ADDR_MASK;
+	rte = READ_ONCE(rto[rtx]);
+	if (reg_entry_isvalid(rte)) {
+		sto = get_rt_sto(rte);
+		ste = READ_ONCE(sto[sx]);
+		if (reg_entry_isvalid(ste)) {
+			pto = get_st_pto(ste);
+			pte = READ_ONCE(pto[px]);
+			if (pt_entry_isvalid(pte))
+				phys = pte & ZPCI_PTE_ADDR_MASK;
 		}
 	}
-	spin_unlock_irqrestore(&s390_domain->dma_table_lock, flags);
 
 	return phys;
 }
 
-static size_t s390_iommu_unmap(struct iommu_domain *domain,
-			       unsigned long iova, size_t size,
-			       struct iommu_iotlb_gather *gather)
+static size_t s390_iommu_unmap_pages(struct iommu_domain *domain,
+				     unsigned long iova,
+				     size_t pgsize, size_t pgcount,
+				     struct iommu_iotlb_gather *gather)
 {
 	struct s390_domain *s390_domain = to_s390_domain(domain);
-	int flags = ZPCI_PTE_INVALID;
-	phys_addr_t paddr;
+	size_t size = pgcount << __ffs(pgsize);
 	int rc;
 
-	paddr = s390_iommu_iova_to_phys(domain, iova);
-	if (!paddr)
+	if (WARN_ON(iova < s390_domain->domain.geometry.aperture_start ||
+	    (iova + size - 1) > s390_domain->domain.geometry.aperture_end))
 		return 0;
 
-	rc = s390_iommu_update_trans(s390_domain, paddr, iova,
-				     size, flags);
+	rc = s390_iommu_invalidate_trans(s390_domain, iova, pgcount);
 	if (rc)
 		return 0;
 
+	iommu_iotlb_gather_add_range(gather, iova, size);
+
 	return size;
 }
 
@@ -380,12 +435,16 @@ static const struct iommu_ops s390_iommu_ops = {
 	.probe_device = s390_iommu_probe_device,
 	.release_device = s390_iommu_release_device,
 	.device_group = generic_device_group,
-	.pgsize_bitmap = S390_IOMMU_PGSIZES,
+	.pgsize_bitmap = SZ_4K,
+	.get_resv_regions = s390_iommu_get_resv_regions,
 	.default_domain_ops = &(const struct iommu_domain_ops) {
 		.attach_dev = s390_iommu_attach_device,
 		.detach_dev = s390_iommu_detach_device,
-		.map = s390_iommu_map,
-		.unmap = s390_iommu_unmap,
+		.map_pages = s390_iommu_map_pages,
+		.unmap_pages = s390_iommu_unmap_pages,
+		.flush_iotlb_all = s390_iommu_flush_iotlb_all,
+		.iotlb_sync = s390_iommu_iotlb_sync,
+		.iotlb_sync_map = s390_iommu_iotlb_sync_map,
 		.iova_to_phys = s390_iommu_iova_to_phys,
 		.free = s390_domain_free,
 	}
diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c
index fadd2c907222..219bfa11f7f4 100644
--- a/drivers/iommu/sprd-iommu.c
+++ b/drivers/iommu/sprd-iommu.c
@@ -237,10 +237,8 @@ static int sprd_iommu_attach_device(struct iommu_domain *domain,
 	struct sprd_iommu_domain *dom = to_sprd_domain(domain);
 	size_t pgt_size = sprd_iommu_pgt_size(domain);
 
-	if (dom->sdev) {
-		pr_err("There's already a device attached to this domain.\n");
+	if (dom->sdev)
 		return -EINVAL;
-	}
 
 	dom->pgt_va = dma_alloc_coherent(sdev->dev, pgt_size, &dom->pgt_pa, GFP_KERNEL);
 	if (!dom->pgt_va)
@@ -273,10 +271,11 @@ static void sprd_iommu_detach_device(struct iommu_domain *domain,
 }
 
 static int sprd_iommu_map(struct iommu_domain *domain, unsigned long iova,
-			  phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+			  phys_addr_t paddr, size_t pgsize, size_t pgcount,
+			  int prot, gfp_t gfp, size_t *mapped)
 {
 	struct sprd_iommu_domain *dom = to_sprd_domain(domain);
-	unsigned int page_num = size >> SPRD_IOMMU_PAGE_SHIFT;
+	size_t size = pgcount * SPRD_IOMMU_PAGE_SIZE;
 	unsigned long flags;
 	unsigned int i;
 	u32 *pgt_base_iova;
@@ -298,35 +297,37 @@ static int sprd_iommu_map(struct iommu_domain *domain, unsigned long iova,
 	pgt_base_iova = dom->pgt_va + ((iova - start) >> SPRD_IOMMU_PAGE_SHIFT);
 
 	spin_lock_irqsave(&dom->pgtlock, flags);
-	for (i = 0; i < page_num; i++) {
+	for (i = 0; i < pgcount; i++) {
 		pgt_base_iova[i] = pabase >> SPRD_IOMMU_PAGE_SHIFT;
 		pabase += SPRD_IOMMU_PAGE_SIZE;
 	}
 	spin_unlock_irqrestore(&dom->pgtlock, flags);
 
+	*mapped = size;
 	return 0;
 }
 
 static size_t sprd_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
-			size_t size, struct iommu_iotlb_gather *iotlb_gather)
+			size_t pgsize, size_t pgcount,
+			struct iommu_iotlb_gather *iotlb_gather)
 {
 	struct sprd_iommu_domain *dom = to_sprd_domain(domain);
 	unsigned long flags;
 	u32 *pgt_base_iova;
-	unsigned int page_num = size >> SPRD_IOMMU_PAGE_SHIFT;
+	size_t size = pgcount * SPRD_IOMMU_PAGE_SIZE;
 	unsigned long start = domain->geometry.aperture_start;
 	unsigned long end = domain->geometry.aperture_end;
 
 	if (iova < start || (iova + size) > (end + 1))
-		return -EINVAL;
+		return 0;
 
 	pgt_base_iova = dom->pgt_va + ((iova - start) >> SPRD_IOMMU_PAGE_SHIFT);
 
 	spin_lock_irqsave(&dom->pgtlock, flags);
-	memset(pgt_base_iova, 0, page_num * sizeof(u32));
+	memset(pgt_base_iova, 0, pgcount * sizeof(u32));
 	spin_unlock_irqrestore(&dom->pgtlock, flags);
 
-	return 0;
+	return size;
 }
 
 static void sprd_iommu_sync_map(struct iommu_domain *domain,
@@ -409,13 +410,13 @@ static const struct iommu_ops sprd_iommu_ops = {
 	.probe_device	= sprd_iommu_probe_device,
 	.device_group	= sprd_iommu_device_group,
 	.of_xlate	= sprd_iommu_of_xlate,
-	.pgsize_bitmap	= ~0UL << SPRD_IOMMU_PAGE_SHIFT,
+	.pgsize_bitmap	= SPRD_IOMMU_PAGE_SIZE,
 	.owner		= THIS_MODULE,
 	.default_domain_ops = &(const struct iommu_domain_ops) {
 		.attach_dev	= sprd_iommu_attach_device,
 		.detach_dev	= sprd_iommu_detach_device,
-		.map		= sprd_iommu_map,
-		.unmap		= sprd_iommu_unmap,
+		.map_pages	= sprd_iommu_map,
+		.unmap_pages	= sprd_iommu_unmap,
 		.iotlb_sync_map	= sprd_iommu_sync_map,
 		.iotlb_sync	= sprd_iommu_sync,
 		.iova_to_phys	= sprd_iommu_iova_to_phys,
diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c
index cd9b74ee24de..5b585eace3d4 100644
--- a/drivers/iommu/sun50i-iommu.c
+++ b/drivers/iommu/sun50i-iommu.c
@@ -27,6 +27,7 @@
 #include <linux/types.h>
 
 #define IOMMU_RESET_REG			0x010
+#define IOMMU_RESET_RELEASE_ALL			0xffffffff
 #define IOMMU_ENABLE_REG		0x020
 #define IOMMU_ENABLE_ENABLE			BIT(0)
 
@@ -92,6 +93,8 @@
 #define NUM_PT_ENTRIES			256
 #define PT_SIZE			(NUM_PT_ENTRIES * PT_ENTRY_SIZE)
 
+#define SPAGE_SIZE			4096
+
 struct sun50i_iommu {
 	struct iommu_device iommu;
 
@@ -270,7 +273,7 @@ static u32 sun50i_mk_pte(phys_addr_t page, int prot)
 	enum sun50i_iommu_aci aci;
 	u32 flags = 0;
 
-	if (prot & (IOMMU_READ | IOMMU_WRITE))
+	if ((prot & (IOMMU_READ | IOMMU_WRITE)) == (IOMMU_READ | IOMMU_WRITE))
 		aci = SUN50I_IOMMU_ACI_RD_WR;
 	else if (prot & IOMMU_READ)
 		aci = SUN50I_IOMMU_ACI_RD;
@@ -294,6 +297,62 @@ static void sun50i_table_flush(struct sun50i_iommu_domain *sun50i_domain,
 	dma_sync_single_for_device(iommu->dev, dma, size, DMA_TO_DEVICE);
 }
 
+static void sun50i_iommu_zap_iova(struct sun50i_iommu *iommu,
+				  unsigned long iova)
+{
+	u32 reg;
+	int ret;
+
+	iommu_write(iommu, IOMMU_TLB_IVLD_ADDR_REG, iova);
+	iommu_write(iommu, IOMMU_TLB_IVLD_ADDR_MASK_REG, GENMASK(31, 12));
+	iommu_write(iommu, IOMMU_TLB_IVLD_ENABLE_REG,
+		    IOMMU_TLB_IVLD_ENABLE_ENABLE);
+
+	ret = readl_poll_timeout_atomic(iommu->base + IOMMU_TLB_IVLD_ENABLE_REG,
+					reg, !reg, 1, 2000);
+	if (ret)
+		dev_warn(iommu->dev, "TLB invalidation timed out!\n");
+}
+
+static void sun50i_iommu_zap_ptw_cache(struct sun50i_iommu *iommu,
+				       unsigned long iova)
+{
+	u32 reg;
+	int ret;
+
+	iommu_write(iommu, IOMMU_PC_IVLD_ADDR_REG, iova);
+	iommu_write(iommu, IOMMU_PC_IVLD_ENABLE_REG,
+		    IOMMU_PC_IVLD_ENABLE_ENABLE);
+
+	ret = readl_poll_timeout_atomic(iommu->base + IOMMU_PC_IVLD_ENABLE_REG,
+					reg, !reg, 1, 2000);
+	if (ret)
+		dev_warn(iommu->dev, "PTW cache invalidation timed out!\n");
+}
+
+static void sun50i_iommu_zap_range(struct sun50i_iommu *iommu,
+				   unsigned long iova, size_t size)
+{
+	assert_spin_locked(&iommu->iommu_lock);
+
+	iommu_write(iommu, IOMMU_AUTO_GATING_REG, 0);
+
+	sun50i_iommu_zap_iova(iommu, iova);
+	sun50i_iommu_zap_iova(iommu, iova + SPAGE_SIZE);
+	if (size > SPAGE_SIZE) {
+		sun50i_iommu_zap_iova(iommu, iova + size);
+		sun50i_iommu_zap_iova(iommu, iova + size + SPAGE_SIZE);
+	}
+	sun50i_iommu_zap_ptw_cache(iommu, iova);
+	sun50i_iommu_zap_ptw_cache(iommu, iova + SZ_1M);
+	if (size > SZ_1M) {
+		sun50i_iommu_zap_ptw_cache(iommu, iova + size);
+		sun50i_iommu_zap_ptw_cache(iommu, iova + size + SZ_1M);
+	}
+
+	iommu_write(iommu, IOMMU_AUTO_GATING_REG, IOMMU_AUTO_GATING_ENABLE);
+}
+
 static int sun50i_iommu_flush_all_tlb(struct sun50i_iommu *iommu)
 {
 	u32 reg;
@@ -343,6 +402,18 @@ static void sun50i_iommu_flush_iotlb_all(struct iommu_domain *domain)
 	spin_unlock_irqrestore(&iommu->iommu_lock, flags);
 }
 
+static void sun50i_iommu_iotlb_sync_map(struct iommu_domain *domain,
+					unsigned long iova, size_t size)
+{
+	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
+	struct sun50i_iommu *iommu = sun50i_domain->iommu;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iommu->iommu_lock, flags);
+	sun50i_iommu_zap_range(iommu, iova, size);
+	spin_unlock_irqrestore(&iommu->iommu_lock, flags);
+}
+
 static void sun50i_iommu_iotlb_sync(struct iommu_domain *domain,
 				    struct iommu_iotlb_gather *gather)
 {
@@ -511,7 +582,7 @@ static u32 *sun50i_dte_get_page_table(struct sun50i_iommu_domain *sun50i_domain,
 		sun50i_iommu_free_page_table(iommu, drop_pt);
 	}
 
-	sun50i_table_flush(sun50i_domain, page_table, PT_SIZE);
+	sun50i_table_flush(sun50i_domain, page_table, NUM_PT_ENTRIES);
 	sun50i_table_flush(sun50i_domain, dte_addr, 1);
 
 	return page_table;
@@ -601,7 +672,6 @@ static struct iommu_domain *sun50i_iommu_domain_alloc(unsigned type)
 	struct sun50i_iommu_domain *sun50i_domain;
 
 	if (type != IOMMU_DOMAIN_DMA &&
-	    type != IOMMU_DOMAIN_IDENTITY &&
 	    type != IOMMU_DOMAIN_UNMANAGED)
 		return NULL;
 
@@ -766,6 +836,7 @@ static const struct iommu_ops sun50i_iommu_ops = {
 	.attach_dev	= sun50i_iommu_attach_device,
 	.detach_dev	= sun50i_iommu_detach_device,
 	.flush_iotlb_all = sun50i_iommu_flush_iotlb_all,
+	.iotlb_sync_map = sun50i_iommu_iotlb_sync_map,
 	.iotlb_sync	= sun50i_iommu_iotlb_sync,
 	.iova_to_phys	= sun50i_iommu_iova_to_phys,
 	.map		= sun50i_iommu_map,
@@ -785,6 +856,8 @@ static void sun50i_iommu_report_fault(struct sun50i_iommu *iommu,
 		report_iommu_fault(iommu->domain, iommu->dev, iova, prot);
 	else
 		dev_err(iommu->dev, "Page fault while iommu not attached to any domain?\n");
+
+	sun50i_iommu_zap_range(iommu, iova, SPAGE_SIZE);
 }
 
 static phys_addr_t sun50i_iommu_handle_pt_irq(struct sun50i_iommu *iommu,
@@ -868,8 +941,8 @@ static phys_addr_t sun50i_iommu_handle_perm_irq(struct sun50i_iommu *iommu)
 
 static irqreturn_t sun50i_iommu_irq(int irq, void *dev_id)
 {
+	u32 status, l1_status, l2_status, resets;
 	struct sun50i_iommu *iommu = dev_id;
-	u32 status;
 
 	spin_lock(&iommu->iommu_lock);
 
@@ -879,6 +952,9 @@ static irqreturn_t sun50i_iommu_irq(int irq, void *dev_id)
 		return IRQ_NONE;
 	}
 
+	l1_status = iommu_read(iommu, IOMMU_L1PG_INT_REG);
+	l2_status = iommu_read(iommu, IOMMU_L2PG_INT_REG);
+
 	if (status & IOMMU_INT_INVALID_L2PG)
 		sun50i_iommu_handle_pt_irq(iommu,
 					   IOMMU_INT_ERR_ADDR_L2_REG,
@@ -892,8 +968,9 @@ static irqreturn_t sun50i_iommu_irq(int irq, void *dev_id)
 
 	iommu_write(iommu, IOMMU_INT_CLR_REG, status);
 
-	iommu_write(iommu, IOMMU_RESET_REG, ~status);
-	iommu_write(iommu, IOMMU_RESET_REG, status);
+	resets = (status | l1_status | l2_status) & IOMMU_INT_MASTER_MASK;
+	iommu_write(iommu, IOMMU_RESET_REG, ~resets);
+	iommu_write(iommu, IOMMU_RESET_REG, IOMMU_RESET_RELEASE_ALL);
 
 	spin_unlock(&iommu->iommu_lock);
 
diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c
index e5ca3cf1a949..ed53279d1106 100644
--- a/drivers/iommu/tegra-gart.c
+++ b/drivers/iommu/tegra-gart.c
@@ -112,7 +112,7 @@ static int gart_iommu_attach_dev(struct iommu_domain *domain,
 	spin_lock(&gart->dom_lock);
 
 	if (gart->active_domain && gart->active_domain != domain) {
-		ret = -EBUSY;
+		ret = -EINVAL;
 	} else if (dev_iommu_priv_get(dev) != domain) {
 		dev_iommu_priv_set(dev, domain);
 		gart->active_domain = domain;
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index 8b1b5c270e50..5b8fe9bfa9a5 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -670,7 +670,7 @@ static int viommu_domain_finalise(struct viommu_endpoint *vdev,
 		dev_err(vdev->dev,
 			"granule 0x%lx larger than system page size 0x%lx\n",
 			viommu_page_size, PAGE_SIZE);
-		return -EINVAL;
+		return -ENODEV;
 	}
 
 	ret = ida_alloc_range(&viommu->domain_ids, viommu->first_domain,
@@ -697,7 +697,7 @@ static int viommu_domain_finalise(struct viommu_endpoint *vdev,
 		if (ret) {
 			ida_free(&viommu->domain_ids, vdomain->id);
 			vdomain->viommu = NULL;
-			return -EOPNOTSUPP;
+			return ret;
 		}
 	}
 
@@ -734,8 +734,7 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
 		 */
 		ret = viommu_domain_finalise(vdev, domain);
 	} else if (vdomain->viommu != vdev->viommu) {
-		dev_err(dev, "cannot attach to foreign vIOMMU\n");
-		ret = -EXDEV;
+		ret = -EINVAL;
 	}
 	mutex_unlock(&vdomain->mutex);