summaryrefslogtreecommitdiff
path: root/drivers/iommu
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/iommu')
-rw-r--r--drivers/iommu/Kconfig149
-rw-r--r--drivers/iommu/Makefile8
-rw-r--r--drivers/iommu/amd/Kconfig6
-rw-r--r--drivers/iommu/amd/Makefile2
-rw-r--r--drivers/iommu/amd/amd_iommu.h49
-rw-r--r--drivers/iommu/amd/amd_iommu_types.h264
-rw-r--r--drivers/iommu/amd/debugfs.c378
-rw-r--r--drivers/iommu/amd/init.c883
-rw-r--r--drivers/iommu/amd/io_pgtable.c607
-rw-r--r--drivers/iommu/amd/io_pgtable_v2.c387
-rw-r--r--drivers/iommu/amd/iommu.c1796
-rw-r--r--drivers/iommu/amd/pasid.c11
-rw-r--r--drivers/iommu/amd/ppr.c2
-rw-r--r--drivers/iommu/apple-dart.c94
-rw-r--r--drivers/iommu/arm/Kconfig144
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/Makefile7
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c485
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c542
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c172
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c1868
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h393
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c1349
-rw-r--r--drivers/iommu/arm/arm-smmu/arm-smmu-impl.c9
-rw-r--r--drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c4
-rw-r--r--drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c94
-rw-r--r--drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c253
-rw-r--r--drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h5
-rw-r--r--drivers/iommu/arm/arm-smmu/arm-smmu.c163
-rw-r--r--drivers/iommu/arm/arm-smmu/arm-smmu.h73
-rw-r--r--drivers/iommu/arm/arm-smmu/qcom_iommu.c35
-rw-r--r--drivers/iommu/dma-iommu.c952
-rw-r--r--drivers/iommu/dma-iommu.h14
-rw-r--r--drivers/iommu/exynos-iommu.c47
-rw-r--r--drivers/iommu/fsl_pamu_domain.c18
-rw-r--r--drivers/iommu/generic_pt/.kunitconfig14
-rw-r--r--drivers/iommu/generic_pt/Kconfig79
-rw-r--r--drivers/iommu/generic_pt/fmt/Makefile28
-rw-r--r--drivers/iommu/generic_pt/fmt/amdv1.h411
-rw-r--r--drivers/iommu/generic_pt/fmt/defs_amdv1.h21
-rw-r--r--drivers/iommu/generic_pt/fmt/defs_vtdss.h21
-rw-r--r--drivers/iommu/generic_pt/fmt/defs_x86_64.h21
-rw-r--r--drivers/iommu/generic_pt/fmt/iommu_amdv1.c15
-rw-r--r--drivers/iommu/generic_pt/fmt/iommu_mock.c10
-rw-r--r--drivers/iommu/generic_pt/fmt/iommu_template.h48
-rw-r--r--drivers/iommu/generic_pt/fmt/iommu_vtdss.c10
-rw-r--r--drivers/iommu/generic_pt/fmt/iommu_x86_64.c11
-rw-r--r--drivers/iommu/generic_pt/fmt/vtdss.h285
-rw-r--r--drivers/iommu/generic_pt/fmt/x86_64.h279
-rw-r--r--drivers/iommu/generic_pt/iommu_pt.h1289
-rw-r--r--drivers/iommu/generic_pt/kunit_generic_pt.h823
-rw-r--r--drivers/iommu/generic_pt/kunit_iommu.h184
-rw-r--r--drivers/iommu/generic_pt/kunit_iommu_pt.h487
-rw-r--r--drivers/iommu/generic_pt/pt_common.h389
-rw-r--r--drivers/iommu/generic_pt/pt_defs.h332
-rw-r--r--drivers/iommu/generic_pt/pt_fmt_defaults.h295
-rw-r--r--drivers/iommu/generic_pt/pt_iter.h636
-rw-r--r--drivers/iommu/generic_pt/pt_log2.h122
-rw-r--r--drivers/iommu/hyperv-iommu.c41
-rw-r--r--drivers/iommu/intel/Kconfig9
-rw-r--r--drivers/iommu/intel/Makefile7
-rw-r--r--drivers/iommu/intel/cache.c285
-rw-r--r--drivers/iommu/intel/cap_audit.c217
-rw-r--r--drivers/iommu/intel/cap_audit.h131
-rw-r--r--drivers/iommu/intel/debugfs.c29
-rw-r--r--drivers/iommu/intel/dmar.c144
-rw-r--r--drivers/iommu/intel/iommu.c2495
-rw-r--r--drivers/iommu/intel/iommu.h407
-rw-r--r--drivers/iommu/intel/irq_remapping.c149
-rw-r--r--drivers/iommu/intel/nested.c96
-rw-r--r--drivers/iommu/intel/pasid.c558
-rw-r--r--drivers/iommu/intel/pasid.h45
-rw-r--r--drivers/iommu/intel/perf.c10
-rw-r--r--drivers/iommu/intel/perf.h5
-rw-r--r--drivers/iommu/intel/perfmon.c111
-rw-r--r--drivers/iommu/intel/prq.c396
-rw-r--r--drivers/iommu/intel/svm.c480
-rw-r--r--drivers/iommu/intel/trace.h5
-rw-r--r--drivers/iommu/io-pgfault.c122
-rw-r--r--drivers/iommu/io-pgtable-arm-selftests.c214
-rw-r--r--drivers/iommu/io-pgtable-arm-v7s.c152
-rw-r--r--drivers/iommu/io-pgtable-arm.c627
-rw-r--r--drivers/iommu/io-pgtable-dart.c159
-rw-r--r--drivers/iommu/io-pgtable.c4
-rw-r--r--drivers/iommu/iommu-pages.c253
-rw-r--r--drivers/iommu/iommu-pages.h224
-rw-r--r--drivers/iommu/iommu-priv.h41
-rw-r--r--drivers/iommu/iommu-sva.c96
-rw-r--r--drivers/iommu/iommu-sysfs.c2
-rw-r--r--drivers/iommu/iommu.c1055
-rw-r--r--drivers/iommu/iommufd/Kconfig5
-rw-r--r--drivers/iommu/iommufd/Makefile7
-rw-r--r--drivers/iommu/iommufd/device.c713
-rw-r--r--drivers/iommu/iommufd/driver.c304
-rw-r--r--drivers/iommu/iommufd/eventq.c541
-rw-r--r--drivers/iommu/iommufd/hw_pagetable.c203
-rw-r--r--drivers/iommu/iommufd/io_pagetable.c248
-rw-r--r--drivers/iommu/iommufd/io_pagetable.h83
-rw-r--r--drivers/iommu/iommufd/ioas.c267
-rw-r--r--drivers/iommu/iommufd/iommufd_private.h383
-rw-r--r--drivers/iommu/iommufd/iommufd_test.h123
-rw-r--r--drivers/iommu/iommufd/iova_bitmap.c138
-rw-r--r--drivers/iommu/iommufd/main.c322
-rw-r--r--drivers/iommu/iommufd/pages.c721
-rw-r--r--drivers/iommu/iommufd/selftest.c1447
-rw-r--r--drivers/iommu/iommufd/vfio_compat.c13
-rw-r--r--drivers/iommu/iommufd/viommu.c430
-rw-r--r--drivers/iommu/iova.c9
-rw-r--r--drivers/iommu/ipmmu-vmsa.c49
-rw-r--r--drivers/iommu/msm_iommu.c69
-rw-r--r--drivers/iommu/mtk_iommu.c255
-rw-r--r--drivers/iommu/mtk_iommu_v1.c91
-rw-r--r--drivers/iommu/of_iommu.c68
-rw-r--r--drivers/iommu/omap-iommu.c76
-rw-r--r--drivers/iommu/omap-iommu.h2
-rw-r--r--drivers/iommu/riscv/Kconfig20
-rw-r--r--drivers/iommu/riscv/Makefile3
-rw-r--r--drivers/iommu/riscv/iommu-bits.h784
-rw-r--r--drivers/iommu/riscv/iommu-pci.c128
-rw-r--r--drivers/iommu/riscv/iommu-platform.c179
-rw-r--r--drivers/iommu/riscv/iommu.c1682
-rw-r--r--drivers/iommu/riscv/iommu.h89
-rw-r--r--drivers/iommu/rockchip-iommu.c104
-rw-r--r--drivers/iommu/s390-iommu.c570
-rw-r--r--drivers/iommu/sprd-iommu.c10
-rw-r--r--drivers/iommu/sun50i-iommu.c30
-rw-r--r--drivers/iommu/tegra-smmu.c132
-rw-r--r--drivers/iommu/virtio-iommu.c219
127 files changed, 26398 insertions, 9761 deletions
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index c04584be3089..99095645134f 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -40,12 +40,13 @@ config IOMMU_IO_PGTABLE_LPAE
sizes at both stage-1 and stage-2, as well as address spaces
up to 48-bits in size.
-config IOMMU_IO_PGTABLE_LPAE_SELFTEST
- bool "LPAE selftests"
- depends on IOMMU_IO_PGTABLE_LPAE
+config IOMMU_IO_PGTABLE_LPAE_KUNIT_TEST
+ tristate "KUnit tests for LPAE"
+ depends on IOMMU_IO_PGTABLE_LPAE && KUNIT
+ default KUNIT_ALL_TESTS
help
- Enable self-tests for LPAE page table allocator. This performs
- a series of page-table consistency checks during boot.
+ Enable kunit tests for LPAE page table allocator. This performs
+ a series of page-table consistency checks.
If unsure, say N here.
@@ -151,10 +152,9 @@ config OF_IOMMU
# IOMMU-agnostic DMA-mapping layer
config IOMMU_DMA
def_bool ARM64 || X86 || S390
- select DMA_OPS
+ select DMA_OPS_HELPERS
select IOMMU_API
select IOMMU_IOVA
- select IRQ_MSI_IOMMU
select NEED_SG_DMA_LENGTH
select NEED_SG_DMA_FLAGS if SWIOTLB
@@ -193,13 +193,15 @@ config MSM_IOMMU
If unsure, say N here.
source "drivers/iommu/amd/Kconfig"
+source "drivers/iommu/arm/Kconfig"
source "drivers/iommu/intel/Kconfig"
source "drivers/iommu/iommufd/Kconfig"
+source "drivers/iommu/riscv/Kconfig"
config IRQ_REMAP
bool "Support for Interrupt Remapping"
depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI
- select DMAR_TABLE if INTEL_IOMMU
+ select IRQ_MSI_LIB
help
Supports Interrupt remapping for IO-APIC and MSI devices.
To use x2apic mode in the CPU's which support x2APIC enhancements or
@@ -246,7 +248,7 @@ config SUN50I_IOMMU
config TEGRA_IOMMU_SMMU
bool "NVIDIA Tegra SMMU Support"
- depends on ARCH_TEGRA
+ depends on ARCH_TEGRA || COMPILE_TEST
depends on TEGRA_AHB
depends on TEGRA_MC
select IOMMU_API
@@ -305,7 +307,6 @@ config APPLE_DART
depends on !GENERIC_ATOMIC64 # for IOMMU_IO_PGTABLE_DART
select IOMMU_API
select IOMMU_IO_PGTABLE_DART
- default ARCH_APPLE
help
Support for Apple DART (Device Address Resolution Table) IOMMUs
found in Apple ARM SoCs like the M1.
@@ -314,117 +315,6 @@ config APPLE_DART
Say Y here if you are using an Apple SoC.
-# ARM IOMMU support
-config ARM_SMMU
- tristate "ARM Ltd. System MMU (SMMU) Support"
- depends on ARM64 || ARM || COMPILE_TEST
- depends on !GENERIC_ATOMIC64 # for IOMMU_IO_PGTABLE_LPAE
- select IOMMU_API
- select IOMMU_IO_PGTABLE_LPAE
- select ARM_DMA_USE_IOMMU if ARM
- help
- Support for implementations of the ARM System MMU architecture
- versions 1 and 2.
-
- Say Y here if your SoC includes an IOMMU device implementing
- the ARM SMMU architecture.
-
-config ARM_SMMU_LEGACY_DT_BINDINGS
- bool "Support the legacy \"mmu-masters\" devicetree bindings"
- depends on ARM_SMMU=y && OF
- help
- Support for the badly designed and deprecated "mmu-masters"
- devicetree bindings. This allows some DMA masters to attach
- to the SMMU but does not provide any support via the DMA API.
- If you're lucky, you might be able to get VFIO up and running.
-
- If you say Y here then you'll make me very sad. Instead, say N
- and move your firmware to the utopian future that was 2016.
-
-config ARM_SMMU_DISABLE_BYPASS_BY_DEFAULT
- bool "Default to disabling bypass on ARM SMMU v1 and v2"
- depends on ARM_SMMU
- default y
- help
- Say Y here to (by default) disable bypass streams such that
- incoming transactions from devices that are not attached to
- an iommu domain will report an abort back to the device and
- will not be allowed to pass through the SMMU.
-
- Any old kernels that existed before this KConfig was
- introduced would default to _allowing_ bypass (AKA the
- equivalent of NO for this config). However the default for
- this option is YES because the old behavior is insecure.
-
- There are few reasons to allow unmatched stream bypass, and
- even fewer good ones. If saying YES here breaks your board
- you should work on fixing your board. This KConfig option
- is expected to be removed in the future and we'll simply
- hardcode the bypass disable in the code.
-
- NOTE: the kernel command line parameter
- 'arm-smmu.disable_bypass' will continue to override this
- config.
-
-config ARM_SMMU_QCOM
- def_tristate y
- depends on ARM_SMMU && ARCH_QCOM
- select QCOM_SCM
- help
- When running on a Qualcomm platform that has the custom variant
- of the ARM SMMU, this needs to be built into the SMMU driver.
-
-config ARM_SMMU_QCOM_DEBUG
- bool "ARM SMMU QCOM implementation defined debug support"
- depends on ARM_SMMU_QCOM=y
- help
- Support for implementation specific debug features in ARM SMMU
- hardware found in QTI platforms. This include support for
- the Translation Buffer Units (TBU) that can be used to obtain
- additional information when debugging memory management issues
- like context faults.
-
- Say Y here to enable debug for issues such as context faults
- or TLB sync timeouts which requires implementation defined
- register dumps.
-
-config ARM_SMMU_V3
- tristate "ARM Ltd. System MMU Version 3 (SMMUv3) Support"
- depends on ARM64
- select IOMMU_API
- select IOMMU_IO_PGTABLE_LPAE
- select GENERIC_MSI_IRQ
- help
- Support for implementations of the ARM System MMU architecture
- version 3 providing translation support to a PCIe root complex.
-
- Say Y here if your system includes an IOMMU device implementing
- the ARM SMMUv3 architecture.
-
-if ARM_SMMU_V3
-config ARM_SMMU_V3_SVA
- bool "Shared Virtual Addressing support for the ARM SMMUv3"
- select IOMMU_SVA
- select IOMMU_IOPF
- select MMU_NOTIFIER
- help
- Support for sharing process address spaces with devices using the
- SMMUv3.
-
- Say Y here if your system supports SVA extensions such as PCIe PASID
- and PRI.
-
-config ARM_SMMU_V3_KUNIT_TEST
- tristate "KUnit tests for arm-smmu-v3 driver" if !KUNIT_ALL_TESTS
- depends on KUNIT
- depends on ARM_SMMU_V3_SVA
- default KUNIT_ALL_TESTS
- help
- Enable this option to unit-test arm-smmu-v3 driver functions.
-
- If unsure, say N.
-endif
-
config S390_IOMMU
def_bool y if S390 && PCI
depends on S390 && PCI
@@ -449,8 +339,7 @@ config MTK_IOMMU
config MTK_IOMMU_V1
tristate "MediaTek IOMMU Version 1 (M4U gen1) Support"
- depends on ARM
- depends on ARCH_MEDIATEK || COMPILE_TEST
+ depends on (ARCH_MEDIATEK && ARM) || COMPILE_TEST
select ARM_DMA_USE_IOMMU
select IOMMU_API
select MEMORY
@@ -462,18 +351,6 @@ config MTK_IOMMU_V1
if unsure, say N here.
-config QCOM_IOMMU
- # Note: iommu drivers cannot (yet?) be built as modules
- bool "Qualcomm IOMMU Support"
- depends on ARCH_QCOM || COMPILE_TEST
- depends on !GENERIC_ATOMIC64 # for IOMMU_IO_PGTABLE_LPAE
- select QCOM_SCM
- select IOMMU_API
- select IOMMU_IO_PGTABLE_LPAE
- select ARM_DMA_USE_IOMMU
- help
- Support for IOMMU on certain Qualcomm SoCs.
-
config HYPERV_IOMMU
bool "Hyper-V IRQ Handling"
depends on HYPERV && X86
@@ -508,3 +385,5 @@ config SPRD_IOMMU
Say Y here if you want to use the multimedia devices listed above.
endif # IOMMU_SUPPORT
+
+source "drivers/iommu/generic_pt/Kconfig"
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 542760d963ec..8e8843316c4b 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -1,6 +1,11 @@
# SPDX-License-Identifier: GPL-2.0
-obj-y += amd/ intel/ arm/ iommufd/
+obj-y += arm/ iommufd/
+obj-$(CONFIG_AMD_IOMMU) += amd/
+obj-$(CONFIG_INTEL_IOMMU) += intel/
+obj-$(CONFIG_RISCV_IOMMU) += riscv/
+obj-$(CONFIG_GENERIC_PT) += generic_pt/fmt/
obj-$(CONFIG_IOMMU_API) += iommu.o
+obj-$(CONFIG_IOMMU_SUPPORT) += iommu-pages.o
obj-$(CONFIG_IOMMU_API) += iommu-traces.o
obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
obj-$(CONFIG_IOMMU_DEBUGFS) += iommu-debugfs.o
@@ -8,6 +13,7 @@ obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o
obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o
obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
+obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE_KUNIT_TEST) += io-pgtable-arm-selftests.o
obj-$(CONFIG_IOMMU_IO_PGTABLE_DART) += io-pgtable-dart.o
obj-$(CONFIG_IOMMU_IOVA) += iova.o
obj-$(CONFIG_OF_IOMMU) += of_iommu.o
diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig
index 994063e5586f..f2acf471cb5d 100644
--- a/drivers/iommu/amd/Kconfig
+++ b/drivers/iommu/amd/Kconfig
@@ -7,13 +7,17 @@ config AMD_IOMMU
select PCI_ATS
select PCI_PRI
select PCI_PASID
+ select IRQ_MSI_LIB
select MMU_NOTIFIER
select IOMMU_API
select IOMMU_IOVA
- select IOMMU_IO_PGTABLE
select IOMMU_SVA
select IOMMU_IOPF
select IOMMUFD_DRIVER if IOMMUFD
+ select GENERIC_PT
+ select IOMMU_PT
+ select IOMMU_PT_AMDV1
+ select IOMMU_PT_X86_64
depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE
help
With this option you can enable support for AMD IOMMU hardware in
diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile
index 9de33b2d42f5..5412a563c697 100644
--- a/drivers/iommu/amd/Makefile
+++ b/drivers/iommu/amd/Makefile
@@ -1,3 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_AMD_IOMMU) += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o ppr.o pasid.o
+obj-y += iommu.o init.o quirks.o ppr.o pasid.o
obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o
diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index 2d5945c982bd..25044d28f28a 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -16,7 +16,6 @@ irqreturn_t amd_iommu_int_thread_evtlog(int irq, void *data);
irqreturn_t amd_iommu_int_thread_pprlog(int irq, void *data);
irqreturn_t amd_iommu_int_thread_galog(int irq, void *data);
irqreturn_t amd_iommu_int_handler(int irq, void *data);
-void amd_iommu_apply_erratum_63(struct amd_iommu *iommu, u16 devid);
void amd_iommu_restart_log(struct amd_iommu *iommu, const char *evt_type,
u8 cntrl_intr, u8 cntrl_log,
u32 status_run_mask, u32 status_overflow_mask);
@@ -29,9 +28,9 @@ void *__init iommu_alloc_4k_pages(struct amd_iommu *iommu,
gfp_t gfp, size_t size);
#ifdef CONFIG_AMD_IOMMU_DEBUGFS
-void amd_iommu_debugfs_setup(struct amd_iommu *iommu);
+void amd_iommu_debugfs_setup(void);
#else
-static inline void amd_iommu_debugfs_setup(struct amd_iommu *iommu) {}
+static inline void amd_iommu_debugfs_setup(void) {}
#endif
/* Needed for interrupt remapping */
@@ -41,17 +40,21 @@ void amd_iommu_disable(void);
int amd_iommu_reenable(int mode);
int amd_iommu_enable_faulting(unsigned int cpu);
extern int amd_iommu_guest_ir;
-extern enum io_pgtable_fmt amd_iommu_pgtable;
+extern enum protection_domain_mode amd_iommu_pgtable;
extern int amd_iommu_gpt_level;
+extern u8 amd_iommu_hpt_level;
+extern unsigned long amd_iommu_pgsize_bitmap;
+extern bool amd_iommu_hatdis;
/* Protection domain ops */
-struct protection_domain *protection_domain_alloc(unsigned int type);
-void protection_domain_free(struct protection_domain *domain);
+void amd_iommu_init_identity_domain(void);
+struct protection_domain *protection_domain_alloc(void);
struct iommu_domain *amd_iommu_domain_alloc_sva(struct device *dev,
struct mm_struct *mm);
void amd_iommu_domain_free(struct iommu_domain *dom);
int iommu_sva_set_dev_pasid(struct iommu_domain *domain,
- struct device *dev, ioasid_t pasid);
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old);
void amd_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
struct iommu_domain *domain);
@@ -85,16 +88,10 @@ int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag);
* the IOMMU used by this driver.
*/
void amd_iommu_flush_all_caches(struct amd_iommu *iommu);
-void amd_iommu_update_and_flush_device_table(struct protection_domain *domain);
-void amd_iommu_domain_update(struct protection_domain *domain);
-void amd_iommu_dev_update_dte(struct iommu_dev_data *dev_data, bool set);
-void amd_iommu_domain_flush_complete(struct protection_domain *domain);
void amd_iommu_domain_flush_pages(struct protection_domain *domain,
u64 address, size_t size);
void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data,
ioasid_t pasid, u64 address, size_t size);
-void amd_iommu_dev_flush_pasid_all(struct iommu_dev_data *dev_data,
- ioasid_t pasid);
#ifdef CONFIG_IRQ_REMAP
int amd_iommu_create_irq_domain(struct amd_iommu *iommu);
@@ -121,14 +118,14 @@ static inline bool check_feature2(u64 mask)
return (amd_iommu_efr2 & mask);
}
-static inline int check_feature_gpt_level(void)
+static inline bool amd_iommu_v2_pgtbl_supported(void)
{
- return ((amd_iommu_efr >> FEATURE_GATS_SHIFT) & FEATURE_GATS_MASK);
+ return (check_feature(FEATURE_GIOSUP) && check_feature(FEATURE_GT));
}
static inline bool amd_iommu_gt_ppr_supported(void)
{
- return (check_feature(FEATURE_GT) &&
+ return (amd_iommu_v2_pgtbl_supported() &&
check_feature(FEATURE_PPR) &&
check_feature(FEATURE_EPHSUP));
}
@@ -143,19 +140,6 @@ static inline void *iommu_phys_to_virt(unsigned long paddr)
return phys_to_virt(__sme_clr(paddr));
}
-static inline
-void amd_iommu_domain_set_pt_root(struct protection_domain *domain, u64 root)
-{
- domain->iop.root = (u64 *)(root & PAGE_MASK);
- domain->iop.mode = root & 7; /* lowest 3 bits encode pgtable mode */
-}
-
-static inline
-void amd_iommu_domain_clr_pt_root(struct protection_domain *domain)
-{
- amd_iommu_domain_set_pt_root(domain, 0);
-}
-
static inline int get_pci_sbdf_id(struct pci_dev *pdev)
{
int seg = pci_domain_nr(pdev->bus);
@@ -164,6 +148,8 @@ static inline int get_pci_sbdf_id(struct pci_dev *pdev)
return PCI_SEG_DEVID_TO_SBDF(seg, devid);
}
+bool amd_iommu_ht_range_ignore(void);
+
/*
* This must be called after device probe completes. During probe
* use rlookup_amd_iommu() get the iommu.
@@ -185,7 +171,6 @@ static inline struct protection_domain *to_pdomain(struct iommu_domain *dom)
}
bool translation_pre_enabled(struct amd_iommu *iommu);
-bool amd_iommu_is_attach_deferred(struct device *dev);
int __init add_special_device(u8 type, u8 id, u32 *devid, bool cmd_line);
#ifdef CONFIG_DMI
@@ -193,9 +178,11 @@ void amd_iommu_apply_ivrs_quirks(void);
#else
static inline void amd_iommu_apply_ivrs_quirks(void) { }
#endif
+struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 segid, u16 devid);
void amd_iommu_domain_set_pgtable(struct protection_domain *domain,
u64 *root, int mode);
struct dev_table_entry *get_dev_table(struct amd_iommu *iommu);
+struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid);
-#endif
+#endif /* AMD_IOMMU_H */
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index 2b76b5dedc1d..320733e7d8b4 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -8,6 +8,7 @@
#ifndef _ASM_X86_AMD_IOMMU_TYPES_H
#define _ASM_X86_AMD_IOMMU_TYPES_H
+#include <linux/bitfield.h>
#include <linux/iommu.h>
#include <linux/types.h>
#include <linux/mmu_notifier.h>
@@ -17,7 +18,7 @@
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/irqreturn.h>
-#include <linux/io-pgtable.h>
+#include <linux/generic_pt/iommu.h>
/*
* Maximum number of IOMMUs supported
@@ -28,8 +29,6 @@
* some size calculation constants
*/
#define DEV_TABLE_ENTRY_SIZE 32
-#define ALIAS_TABLE_ENTRY_SIZE 2
-#define RLOOKUP_TABLE_ENTRY_SIZE (sizeof(void *))
/* Capability offsets used by the driver */
#define MMIO_CAP_HDR_OFFSET 0x00
@@ -95,26 +94,28 @@
#define FEATURE_GA BIT_ULL(7)
#define FEATURE_HE BIT_ULL(8)
#define FEATURE_PC BIT_ULL(9)
-#define FEATURE_GATS_SHIFT (12)
-#define FEATURE_GATS_MASK (3ULL)
+#define FEATURE_HATS GENMASK_ULL(11, 10)
+#define FEATURE_GATS GENMASK_ULL(13, 12)
+#define FEATURE_GLX GENMASK_ULL(15, 14)
#define FEATURE_GAM_VAPIC BIT_ULL(21)
+#define FEATURE_PASMAX GENMASK_ULL(36, 32)
#define FEATURE_GIOSUP BIT_ULL(48)
#define FEATURE_HASUP BIT_ULL(49)
#define FEATURE_EPHSUP BIT_ULL(50)
#define FEATURE_HDSUP BIT_ULL(52)
#define FEATURE_SNP BIT_ULL(63)
-#define FEATURE_PASID_SHIFT 32
-#define FEATURE_PASID_MASK (0x1fULL << FEATURE_PASID_SHIFT)
-
-#define FEATURE_GLXVAL_SHIFT 14
-#define FEATURE_GLXVAL_MASK (0x03ULL << FEATURE_GLXVAL_SHIFT)
/* Extended Feature 2 Bits */
-#define FEATURE_SNPAVICSUP_SHIFT 5
-#define FEATURE_SNPAVICSUP_MASK (0x07ULL << FEATURE_SNPAVICSUP_SHIFT)
+#define FEATURE_SEVSNPIO_SUP BIT_ULL(1)
+#define FEATURE_SNPAVICSUP GENMASK_ULL(7, 5)
#define FEATURE_SNPAVICSUP_GAM(x) \
- ((x & FEATURE_SNPAVICSUP_MASK) >> FEATURE_SNPAVICSUP_SHIFT == 0x1)
+ (FIELD_GET(FEATURE_SNPAVICSUP, x) == 0x1)
+#define FEATURE_HT_RANGE_IGNORE BIT_ULL(11)
+
+#define FEATURE_NUM_INT_REMAP_SUP GENMASK_ULL(9, 8)
+#define FEATURE_NUM_INT_REMAP_SUP_2K(x) \
+ (FIELD_GET(FEATURE_NUM_INT_REMAP_SUP, x) == 0x1)
/* Note:
* The current driver only support 16-bit PASID.
@@ -179,12 +180,16 @@
#define CONTROL_GAM_EN 25
#define CONTROL_GALOG_EN 28
#define CONTROL_GAINT_EN 29
+#define CONTROL_NUM_INT_REMAP_MODE 43
+#define CONTROL_NUM_INT_REMAP_MODE_MASK 0x03
+#define CONTROL_NUM_INT_REMAP_MODE_2K 0x01
+#define CONTROL_EPH_EN 45
#define CONTROL_XT_EN 50
#define CONTROL_INTCAPXT_EN 51
#define CONTROL_IRTCACHEDIS 59
#define CONTROL_SNPAVIC_EN 61
-#define CTRL_INV_TO_MASK (7 << CONTROL_INV_TIMEOUT)
+#define CTRL_INV_TO_MASK 7
#define CTRL_INV_TO_NONE 0
#define CTRL_INV_TO_1MS 1
#define CTRL_INV_TO_10MS 2
@@ -224,6 +229,8 @@
#define DEV_ENTRY_EX 0x67
#define DEV_ENTRY_SYSMGT1 0x68
#define DEV_ENTRY_SYSMGT2 0x69
+#define DTE_DATA1_SYSMGT_MASK GENMASK_ULL(41, 40)
+
#define DEV_ENTRY_IRQ_TBL_EN 0x80
#define DEV_ENTRY_INIT_PASS 0xb8
#define DEV_ENTRY_EINT_PASS 0xb9
@@ -241,6 +248,10 @@
#define CMD_BUFFER_ENTRIES 512
#define MMIO_CMD_SIZE_SHIFT 56
#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
+#define MMIO_CMD_HEAD_MASK GENMASK_ULL(18, 4) /* Command buffer head ptr field [18:4] */
+#define MMIO_CMD_BUFFER_HEAD(x) FIELD_GET(MMIO_CMD_HEAD_MASK, (x))
+#define MMIO_CMD_TAIL_MASK GENMASK_ULL(18, 4) /* Command buffer tail ptr field [18:4] */
+#define MMIO_CMD_BUFFER_TAIL(x) FIELD_GET(MMIO_CMD_TAIL_MASK, (x))
/* constants for event buffer handling */
#define EVT_BUFFER_SIZE 8192 /* 512 entries */
@@ -294,8 +305,13 @@
* that we support.
*
* 512GB Pages are not supported due to a hardware bug
+ * Page sizes >= the 52 bit max physical address of the CPU are not supported.
*/
-#define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38))
+#define AMD_IOMMU_PGSIZES (GENMASK_ULL(51, 12) ^ SZ_512G)
+
+/* Special mode where page-sizes are limited to 4 KiB */
+#define AMD_IOMMU_PGSIZES_4K (PAGE_SIZE)
+
/* 4K, 2MB, 1G page sizes are supported */
#define AMD_IOMMU_PGSIZES_V2 (PAGE_SIZE | (1ULL << 21) | (1ULL << 30))
@@ -305,15 +321,14 @@
#define DTE_IRQ_REMAP_INTCTL (2ULL << 60)
#define DTE_IRQ_REMAP_ENABLE 1ULL
-/*
- * AMD IOMMU hardware only support 512 IRTEs despite
- * the architectural limitation of 2048 entries.
- */
#define DTE_INTTAB_ALIGNMENT 128
-#define DTE_INTTABLEN_VALUE 9ULL
-#define DTE_INTTABLEN (DTE_INTTABLEN_VALUE << 1)
#define DTE_INTTABLEN_MASK (0xfULL << 1)
-#define MAX_IRQS_PER_TABLE (1 << DTE_INTTABLEN_VALUE)
+#define DTE_INTTABLEN_VALUE_512 9ULL
+#define DTE_INTTABLEN_512 (DTE_INTTABLEN_VALUE_512 << 1)
+#define MAX_IRQS_PER_TABLE_512 BIT(DTE_INTTABLEN_VALUE_512)
+#define DTE_INTTABLEN_VALUE_2K 11ULL
+#define DTE_INTTABLEN_2K (DTE_INTTABLEN_VALUE_2K << 1)
+#define MAX_IRQS_PER_TABLE_2K BIT(DTE_INTTABLEN_VALUE_2K)
#define PAGE_MODE_NONE 0x00
#define PAGE_MODE_1_LEVEL 0x01
@@ -327,76 +342,7 @@
#define GUEST_PGTABLE_4_LEVEL 0x00
#define GUEST_PGTABLE_5_LEVEL 0x01
-#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9))
-#define PM_LEVEL_SIZE(x) (((x) < 6) ? \
- ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \
- (0xffffffffffffffffULL))
-#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
-#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL)
-#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \
- IOMMU_PTE_PR | IOMMU_PTE_IR | IOMMU_PTE_IW)
-#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL)
-
-#define PM_MAP_4k 0
#define PM_ADDR_MASK 0x000ffffffffff000ULL
-#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \
- (~((1ULL << (12 + ((lvl) * 9))) - 1)))
-#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr))
-
-/*
- * Returns the page table level to use for a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_LEVEL(pagesize) \
- ((__ffs(pagesize) - 12) / 9)
-/*
- * Returns the number of ptes to use for a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_PTE_COUNT(pagesize) \
- (1ULL << ((__ffs(pagesize) - 12) % 9))
-
-/*
- * Aligns a given io-virtual address to a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_ALIGN(address, pagesize) \
- ((address) & ~((pagesize) - 1))
-/*
- * Creates an IOMMU PTE for an address and a given pagesize
- * The PTE has no permission bits set
- * Pagesize is expected to be a power-of-two larger than 4096
- */
-#define PAGE_SIZE_PTE(address, pagesize) \
- (((address) | ((pagesize) - 1)) & \
- (~(pagesize >> 1)) & PM_ADDR_MASK)
-
-/*
- * Takes a PTE value with mode=0x07 and returns the page size it maps
- */
-#define PTE_PAGE_SIZE(pte) \
- (1ULL << (1 + ffz(((pte) | 0xfffULL))))
-
-/*
- * Takes a page-table level and returns the default page-size for this level
- */
-#define PTE_LEVEL_PAGE_SIZE(level) \
- (1ULL << (12 + (9 * (level))))
-
-/*
- * The IOPTE dirty bit
- */
-#define IOMMU_PTE_HD_BIT (6)
-
-/*
- * Bit value definition for I/O PTE fields
- */
-#define IOMMU_PTE_PR BIT_ULL(0)
-#define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT)
-#define IOMMU_PTE_U BIT_ULL(59)
-#define IOMMU_PTE_FC BIT_ULL(60)
-#define IOMMU_PTE_IR BIT_ULL(61)
-#define IOMMU_PTE_IW BIT_ULL(62)
/*
* Bit value definition for DTE fields
@@ -406,8 +352,7 @@
#define DTE_FLAG_HAD (3ULL << 7)
#define DTE_FLAG_GIOV BIT_ULL(54)
#define DTE_FLAG_GV BIT_ULL(55)
-#define DTE_GLX_SHIFT (56)
-#define DTE_GLX_MASK (3)
+#define DTE_GLX GENMASK_ULL(57, 56)
#define DTE_FLAG_IR BIT_ULL(61)
#define DTE_FLAG_IW BIT_ULL(62)
@@ -415,27 +360,17 @@
#define DTE_FLAG_MASK (0x3ffULL << 32)
#define DEV_DOMID_MASK 0xffffULL
-#define DTE_GCR3_VAL_A(x) (((x) >> 12) & 0x00007ULL)
-#define DTE_GCR3_VAL_B(x) (((x) >> 15) & 0x0ffffULL)
-#define DTE_GCR3_VAL_C(x) (((x) >> 31) & 0x1fffffULL)
-
-#define DTE_GCR3_INDEX_A 0
-#define DTE_GCR3_INDEX_B 1
-#define DTE_GCR3_INDEX_C 1
-
-#define DTE_GCR3_SHIFT_A 58
-#define DTE_GCR3_SHIFT_B 16
-#define DTE_GCR3_SHIFT_C 43
+#define DTE_GCR3_14_12 GENMASK_ULL(60, 58)
+#define DTE_GCR3_30_15 GENMASK_ULL(31, 16)
+#define DTE_GCR3_51_31 GENMASK_ULL(63, 43)
#define DTE_GPT_LEVEL_SHIFT 54
+#define DTE_GPT_LEVEL_MASK GENMASK_ULL(55, 54)
#define GCR3_VALID 0x01ULL
-#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
-#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR)
-#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD)
-#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
-#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
+/* DTE[128:179] | DTE[184:191] */
+#define DTE_DATA2_INTR_MASK ~GENMASK_ULL(55, 52)
#define IOMMU_PROT_MASK 0x03
#define IOMMU_PROT_IR 0x01
@@ -456,6 +391,9 @@
/* IOMMU Feature Reporting Field (for IVHD type 10h */
#define IOMMU_FEAT_GASUP_SHIFT 6
+/* IOMMU HATDIS for IVHD type 11h and 40h */
+#define IOMMU_IVHD_ATTR_HATDIS_SHIFT 0
+
/* IOMMU Extended Feature Register (EFR) */
#define IOMMU_EFR_XTSUP_SHIFT 2
#define IOMMU_EFR_GASUP_SHIFT 7
@@ -471,7 +409,7 @@ extern bool amd_iommu_dump;
#define DUMP_printk(format, arg...) \
do { \
if (amd_iommu_dump) \
- pr_info("AMD-Vi: " format, ## arg); \
+ pr_info(format, ## arg); \
} while(0);
/* global flag if IOMMUs cache non-present entries */
@@ -493,9 +431,6 @@ extern const struct iommu_ops amd_iommu_ops;
/* IVRS indicates that pre-boot remapping was enabled */
extern bool amdr_ivrs_remap_support;
-/* kmem_cache to get tables with 128 byte alignement */
-extern struct kmem_cache *amd_iommu_irq_cache;
-
#define PCI_SBDF_TO_SEGID(sbdf) (((sbdf) >> 16) & 0xffff)
#define PCI_SBDF_TO_DEVID(sbdf) ((sbdf) & 0xffff)
#define PCI_SEG_DEVID_TO_SBDF(seg, devid) ((((u32)(seg) & 0xffff) << 16) | \
@@ -519,6 +454,9 @@ extern struct kmem_cache *amd_iommu_irq_cache;
#define for_each_pdom_dev_data_safe(pdom_dev_data, next, pdom) \
list_for_each_entry_safe((pdom_dev_data), (next), &pdom->dev_data_list, list)
+#define for_each_ivhd_dte_flags(entry) \
+ list_for_each_entry((entry), &amd_ivhd_dev_flags_list, list)
+
struct amd_iommu;
struct iommu_domain;
struct irq_domain;
@@ -526,19 +464,6 @@ struct amd_irte_ops;
#define AMD_IOMMU_FLAG_TRANS_PRE_ENABLED (1 << 0)
-#define io_pgtable_to_data(x) \
- container_of((x), struct amd_io_pgtable, iop)
-
-#define io_pgtable_ops_to_data(x) \
- io_pgtable_to_data(io_pgtable_ops_to_pgtable(x))
-
-#define io_pgtable_ops_to_domain(x) \
- container_of(io_pgtable_ops_to_data(x), \
- struct protection_domain, iop)
-
-#define io_pgtable_cfg_to_data(x) \
- container_of((x), struct amd_io_pgtable, pgtbl_cfg)
-
struct gcr3_tbl_info {
u64 *gcr3_tbl; /* Guest CR3 table */
int glx; /* Number of levels for GCR3 table */
@@ -546,16 +471,9 @@ struct gcr3_tbl_info {
u16 domid; /* Per device domain ID */
};
-struct amd_io_pgtable {
- struct io_pgtable_cfg pgtbl_cfg;
- struct io_pgtable iop;
- int mode;
- u64 *root;
- u64 *pgd; /* v2 pgtable pgd pointer */
-};
-
enum protection_domain_mode {
- PD_MODE_V1 = 1,
+ PD_MODE_NONE,
+ PD_MODE_V1,
PD_MODE_V2,
};
@@ -569,26 +487,36 @@ struct pdom_dev_data {
struct list_head list;
};
+/* Keeps track of the IOMMUs attached to protection domain */
+struct pdom_iommu_info {
+ struct amd_iommu *iommu; /* IOMMUs attach to protection domain */
+ u32 refcnt; /* Count of attached dev/pasid per domain/IOMMU */
+};
+
/*
* This structure contains generic data for IOMMU protection domains
* independent of their use.
*/
struct protection_domain {
+ union {
+ struct iommu_domain domain;
+ struct pt_iommu iommu;
+ struct pt_iommu_amdv1 amdv1;
+ struct pt_iommu_x86_64 amdv2;
+ };
struct list_head dev_list; /* List of all devices in this domain */
- struct iommu_domain domain; /* generic domain handle used by
- iommu core code */
- struct amd_io_pgtable iop;
spinlock_t lock; /* mostly used to lock the page table*/
u16 id; /* the domain id written to the device table */
- int nid; /* Node ID */
enum protection_domain_mode pd_mode; /* Track page table type */
bool dirty_tracking; /* dirty tracking is enabled in the domain */
- unsigned dev_cnt; /* devices assigned to this domain */
- unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
+ struct xarray iommu_array; /* per-IOMMU reference count */
struct mmu_notifier mn; /* mmu notifier for the SVA domain */
struct list_head dev_data_list; /* List of pdom_dev_data */
};
+PT_IOMMU_CHECK_DOMAIN(struct protection_domain, iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv1.iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv2.iommu, domain);
/*
* This structure contains information about one PCI segment in the system.
@@ -609,12 +537,6 @@ struct amd_iommu_pci_seg {
/* Size of the device table */
u32 dev_table_size;
- /* Size of the alias table */
- u32 alias_table_size;
-
- /* Size of the rlookup table */
- u32 rlookup_table_size;
-
/*
* device table virtual address
*
@@ -785,10 +707,17 @@ struct amd_iommu {
u32 flags;
volatile u64 *cmd_sem;
atomic64_t cmd_sem_val;
+ /*
+ * Track physical address to directly use it in build_completion_wait()
+ * and avoid adding any special checks and handling for kdump.
+ */
+ u64 cmd_sem_paddr;
#ifdef CONFIG_AMD_IOMMU_DEBUGFS
/* DebugFS Info */
struct dentry *debugfs;
+ int dbg_mmio_offset;
+ int dbg_cap_offset;
#endif
/* IOPF support */
@@ -836,7 +765,8 @@ struct devid_map {
*/
struct iommu_dev_data {
/*Protect against attach/detach races */
- spinlock_t lock;
+ struct mutex mutex;
+ spinlock_t dte_lock; /* DTE lock for 256-bit access */
struct list_head list; /* For domain->dev_list */
struct llist_node dev_data_list; /* For global dev_data_list */
@@ -845,6 +775,7 @@ struct iommu_dev_data {
struct device *dev;
u16 devid; /* PCI Device ID */
+ unsigned int max_irqs; /* Maximum IRQs supported by device */
u32 max_pasids; /* Max supported PASIDs */
u32 flags; /* Holds AMD_IOMMU_DEVICE_FLAG_<*> */
int ats_qdep;
@@ -878,16 +809,31 @@ extern struct list_head amd_iommu_pci_seg_list;
extern struct list_head amd_iommu_list;
/*
- * Array with pointers to each IOMMU struct
- * The indices are referenced in the protection domains
+ * Structure defining one entry in the device table
*/
-extern struct amd_iommu *amd_iommus[MAX_IOMMUS];
+struct dev_table_entry {
+ union {
+ u64 data[4];
+ u128 data128[2];
+ };
+};
/*
- * Structure defining one entry in the device table
+ * Structure defining one entry in the command buffer
*/
-struct dev_table_entry {
- u64 data[4];
+struct iommu_cmd {
+ u32 data[4];
+};
+
+/*
+ * Structure to sture persistent DTE flags from IVHD
+ */
+struct ivhd_dte_flags {
+ struct list_head list;
+ u16 segid;
+ u16 devid_first;
+ u16 devid_last;
+ struct dev_table_entry dte;
};
/*
@@ -914,17 +860,14 @@ struct unity_map_entry {
* Data structures for device handling
*/
-/* size of the dma_ops aperture as power of 2 */
-extern unsigned amd_iommu_aperture_order;
-
-/* allocation bitmap for domain ids */
-extern unsigned long *amd_iommu_pd_alloc_bitmap;
-
extern bool amd_iommu_force_isolation;
/* Max levels of glxval supported */
extern int amd_iommu_max_glx_val;
+/* IDA to track protection domain IDs */
+extern struct ida pdom_ids;
+
/* Global EFR and EFR2 registers */
extern u64 amd_iommu_efr;
extern u64 amd_iommu_efr2;
@@ -1046,7 +989,6 @@ struct irq_2_irte {
};
struct amd_ir_data {
- u32 cached_ga_tag;
struct amd_iommu *iommu;
struct irq_2_irte irq_2_irte;
struct msi_msg msi_entry;
diff --git a/drivers/iommu/amd/debugfs.c b/drivers/iommu/amd/debugfs.c
index 545372fcc72f..20b04996441d 100644
--- a/drivers/iommu/amd/debugfs.c
+++ b/drivers/iommu/amd/debugfs.c
@@ -11,22 +11,382 @@
#include <linux/pci.h>
#include "amd_iommu.h"
+#include "../irq_remapping.h"
static struct dentry *amd_iommu_debugfs;
-static DEFINE_MUTEX(amd_iommu_debugfs_lock);
#define MAX_NAME_LEN 20
+#define OFS_IN_SZ 8
+#define DEVID_IN_SZ 16
-void amd_iommu_debugfs_setup(struct amd_iommu *iommu)
+static int sbdf = -1;
+
+static ssize_t iommu_mmio_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct seq_file *m = filp->private_data;
+ struct amd_iommu *iommu = m->private;
+ int ret;
+
+ iommu->dbg_mmio_offset = -1;
+
+ if (cnt > OFS_IN_SZ)
+ return -EINVAL;
+
+ ret = kstrtou32_from_user(ubuf, cnt, 0, &iommu->dbg_mmio_offset);
+ if (ret)
+ return ret;
+
+ if (iommu->dbg_mmio_offset > iommu->mmio_phys_end - sizeof(u64)) {
+ iommu->dbg_mmio_offset = -1;
+ return -EINVAL;
+ }
+
+ return cnt;
+}
+
+static int iommu_mmio_show(struct seq_file *m, void *unused)
+{
+ struct amd_iommu *iommu = m->private;
+ u64 value;
+
+ if (iommu->dbg_mmio_offset < 0) {
+ seq_puts(m, "Please provide mmio register's offset\n");
+ return 0;
+ }
+
+ value = readq(iommu->mmio_base + iommu->dbg_mmio_offset);
+ seq_printf(m, "Offset:0x%x Value:0x%016llx\n", iommu->dbg_mmio_offset, value);
+
+ return 0;
+}
+DEFINE_SHOW_STORE_ATTRIBUTE(iommu_mmio);
+
+static ssize_t iommu_capability_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct seq_file *m = filp->private_data;
+ struct amd_iommu *iommu = m->private;
+ int ret;
+
+ iommu->dbg_cap_offset = -1;
+
+ if (cnt > OFS_IN_SZ)
+ return -EINVAL;
+
+ ret = kstrtou32_from_user(ubuf, cnt, 0, &iommu->dbg_cap_offset);
+ if (ret)
+ return ret;
+
+ /* Capability register at offset 0x14 is the last IOMMU capability register. */
+ if (iommu->dbg_cap_offset > 0x14) {
+ iommu->dbg_cap_offset = -1;
+ return -EINVAL;
+ }
+
+ return cnt;
+}
+
+static int iommu_capability_show(struct seq_file *m, void *unused)
+{
+ struct amd_iommu *iommu = m->private;
+ u32 value;
+ int err;
+
+ if (iommu->dbg_cap_offset < 0) {
+ seq_puts(m, "Please provide capability register's offset in the range [0x00 - 0x14]\n");
+ return 0;
+ }
+
+ err = pci_read_config_dword(iommu->dev, iommu->cap_ptr + iommu->dbg_cap_offset, &value);
+ if (err) {
+ seq_printf(m, "Not able to read capability register at 0x%x\n",
+ iommu->dbg_cap_offset);
+ return 0;
+ }
+
+ seq_printf(m, "Offset:0x%x Value:0x%08x\n", iommu->dbg_cap_offset, value);
+
+ return 0;
+}
+DEFINE_SHOW_STORE_ATTRIBUTE(iommu_capability);
+
+static int iommu_cmdbuf_show(struct seq_file *m, void *unused)
+{
+ struct amd_iommu *iommu = m->private;
+ struct iommu_cmd *cmd;
+ unsigned long flag;
+ u32 head, tail;
+ int i;
+
+ raw_spin_lock_irqsave(&iommu->lock, flag);
+ head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+ tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+ seq_printf(m, "CMD Buffer Head Offset:%d Tail Offset:%d\n",
+ (head >> 4) & 0x7fff, (tail >> 4) & 0x7fff);
+ for (i = 0; i < CMD_BUFFER_ENTRIES; i++) {
+ cmd = (struct iommu_cmd *)(iommu->cmd_buf + i * sizeof(*cmd));
+ seq_printf(m, "%3d: %08x %08x %08x %08x\n", i, cmd->data[0],
+ cmd->data[1], cmd->data[2], cmd->data[3]);
+ }
+ raw_spin_unlock_irqrestore(&iommu->lock, flag);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(iommu_cmdbuf);
+
+static ssize_t devid_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct amd_iommu_pci_seg *pci_seg;
+ int seg, bus, slot, func;
+ struct amd_iommu *iommu;
+ char *srcid_ptr;
+ u16 devid;
+ int i;
+
+ sbdf = -1;
+
+ if (cnt >= DEVID_IN_SZ)
+ return -EINVAL;
+
+ srcid_ptr = memdup_user_nul(ubuf, cnt);
+ if (IS_ERR(srcid_ptr))
+ return PTR_ERR(srcid_ptr);
+
+ i = sscanf(srcid_ptr, "%x:%x:%x.%x", &seg, &bus, &slot, &func);
+ if (i != 4) {
+ i = sscanf(srcid_ptr, "%x:%x.%x", &bus, &slot, &func);
+ if (i != 3) {
+ kfree(srcid_ptr);
+ return -EINVAL;
+ }
+ seg = 0;
+ }
+
+ devid = PCI_DEVID(bus, PCI_DEVFN(slot, func));
+
+ /* Check if user device id input is a valid input */
+ for_each_pci_segment(pci_seg) {
+ if (pci_seg->id != seg)
+ continue;
+ if (devid > pci_seg->last_bdf) {
+ kfree(srcid_ptr);
+ return -EINVAL;
+ }
+ iommu = pci_seg->rlookup_table[devid];
+ if (!iommu) {
+ kfree(srcid_ptr);
+ return -ENODEV;
+ }
+ break;
+ }
+
+ if (pci_seg->id != seg) {
+ kfree(srcid_ptr);
+ return -EINVAL;
+ }
+
+ sbdf = PCI_SEG_DEVID_TO_SBDF(seg, devid);
+
+ kfree(srcid_ptr);
+
+ return cnt;
+}
+
+static int devid_show(struct seq_file *m, void *unused)
{
+ u16 devid;
+
+ if (sbdf >= 0) {
+ devid = PCI_SBDF_TO_DEVID(sbdf);
+ seq_printf(m, "%04x:%02x:%02x.%x\n", PCI_SBDF_TO_SEGID(sbdf),
+ PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid));
+ } else
+ seq_puts(m, "No or Invalid input provided\n");
+
+ return 0;
+}
+DEFINE_SHOW_STORE_ATTRIBUTE(devid);
+
+static void dump_dte(struct seq_file *m, struct amd_iommu_pci_seg *pci_seg, u16 devid)
+{
+ struct dev_table_entry *dev_table;
+ struct amd_iommu *iommu;
+
+ iommu = pci_seg->rlookup_table[devid];
+ if (!iommu)
+ return;
+
+ dev_table = get_dev_table(iommu);
+ if (!dev_table) {
+ seq_puts(m, "Device table not found");
+ return;
+ }
+
+ seq_printf(m, "%-12s %16s %16s %16s %16s iommu\n", "DeviceId",
+ "QWORD[3]", "QWORD[2]", "QWORD[1]", "QWORD[0]");
+ seq_printf(m, "%04x:%02x:%02x.%x ", pci_seg->id, PCI_BUS_NUM(devid),
+ PCI_SLOT(devid), PCI_FUNC(devid));
+ for (int i = 3; i >= 0; --i)
+ seq_printf(m, "%016llx ", dev_table[devid].data[i]);
+ seq_printf(m, "iommu%d\n", iommu->index);
+}
+
+static int iommu_devtbl_show(struct seq_file *m, void *unused)
+{
+ struct amd_iommu_pci_seg *pci_seg;
+ u16 seg, devid;
+
+ if (sbdf < 0) {
+ seq_puts(m, "Enter a valid device ID to 'devid' file\n");
+ return 0;
+ }
+ seg = PCI_SBDF_TO_SEGID(sbdf);
+ devid = PCI_SBDF_TO_DEVID(sbdf);
+
+ for_each_pci_segment(pci_seg) {
+ if (pci_seg->id != seg)
+ continue;
+ dump_dte(m, pci_seg, devid);
+ break;
+ }
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(iommu_devtbl);
+
+static void dump_128_irte(struct seq_file *m, struct irq_remap_table *table, u16 int_tab_len)
+{
+ struct irte_ga *ptr, *irte;
+ int index;
+
+ for (index = 0; index < int_tab_len; index++) {
+ ptr = (struct irte_ga *)table->table;
+ irte = &ptr[index];
+
+ if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) &&
+ !irte->lo.fields_vapic.valid)
+ continue;
+ else if (!irte->lo.fields_remap.valid)
+ continue;
+ seq_printf(m, "IRT[%04d] %016llx %016llx\n", index, irte->hi.val, irte->lo.val);
+ }
+}
+
+static void dump_32_irte(struct seq_file *m, struct irq_remap_table *table, u16 int_tab_len)
+{
+ union irte *ptr, *irte;
+ int index;
+
+ for (index = 0; index < int_tab_len; index++) {
+ ptr = (union irte *)table->table;
+ irte = &ptr[index];
+
+ if (!irte->fields.valid)
+ continue;
+ seq_printf(m, "IRT[%04d] %08x\n", index, irte->val);
+ }
+}
+
+static void dump_irte(struct seq_file *m, u16 devid, struct amd_iommu_pci_seg *pci_seg)
+{
+ struct dev_table_entry *dev_table;
+ struct irq_remap_table *table;
+ struct amd_iommu *iommu;
+ unsigned long flags;
+ u16 int_tab_len;
+
+ table = pci_seg->irq_lookup_table[devid];
+ if (!table) {
+ seq_printf(m, "IRQ lookup table not set for %04x:%02x:%02x:%x\n",
+ pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid));
+ return;
+ }
+
+ iommu = pci_seg->rlookup_table[devid];
+ if (!iommu)
+ return;
+
+ dev_table = get_dev_table(iommu);
+ if (!dev_table) {
+ seq_puts(m, "Device table not found");
+ return;
+ }
+
+ int_tab_len = dev_table[devid].data[2] & DTE_INTTABLEN_MASK;
+ if (int_tab_len != DTE_INTTABLEN_512 && int_tab_len != DTE_INTTABLEN_2K) {
+ seq_puts(m, "The device's DTE contains an invalid IRT length value.");
+ return;
+ }
+
+ seq_printf(m, "DeviceId %04x:%02x:%02x.%x\n", pci_seg->id, PCI_BUS_NUM(devid),
+ PCI_SLOT(devid), PCI_FUNC(devid));
+
+ raw_spin_lock_irqsave(&table->lock, flags);
+ if (AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
+ dump_128_irte(m, table, BIT(int_tab_len >> 1));
+ else
+ dump_32_irte(m, table, BIT(int_tab_len >> 1));
+ seq_puts(m, "\n");
+ raw_spin_unlock_irqrestore(&table->lock, flags);
+}
+
+static int iommu_irqtbl_show(struct seq_file *m, void *unused)
+{
+ struct amd_iommu_pci_seg *pci_seg;
+ u16 devid, seg;
+
+ if (!irq_remapping_enabled) {
+ seq_puts(m, "Interrupt remapping is disabled\n");
+ return 0;
+ }
+
+ if (sbdf < 0) {
+ seq_puts(m, "Enter a valid device ID to 'devid' file\n");
+ return 0;
+ }
+
+ seg = PCI_SBDF_TO_SEGID(sbdf);
+ devid = PCI_SBDF_TO_DEVID(sbdf);
+
+ for_each_pci_segment(pci_seg) {
+ if (pci_seg->id != seg)
+ continue;
+ dump_irte(m, devid, pci_seg);
+ break;
+ }
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(iommu_irqtbl);
+
+void amd_iommu_debugfs_setup(void)
+{
+ struct amd_iommu *iommu;
char name[MAX_NAME_LEN + 1];
- mutex_lock(&amd_iommu_debugfs_lock);
- if (!amd_iommu_debugfs)
- amd_iommu_debugfs = debugfs_create_dir("amd",
- iommu_debugfs_dir);
- mutex_unlock(&amd_iommu_debugfs_lock);
+ amd_iommu_debugfs = debugfs_create_dir("amd", iommu_debugfs_dir);
+
+ for_each_iommu(iommu) {
+ iommu->dbg_mmio_offset = -1;
+ iommu->dbg_cap_offset = -1;
+
+ snprintf(name, MAX_NAME_LEN, "iommu%02d", iommu->index);
+ iommu->debugfs = debugfs_create_dir(name, amd_iommu_debugfs);
+
+ debugfs_create_file("mmio", 0644, iommu->debugfs, iommu,
+ &iommu_mmio_fops);
+ debugfs_create_file("capability", 0644, iommu->debugfs, iommu,
+ &iommu_capability_fops);
+ debugfs_create_file("cmdbuf", 0444, iommu->debugfs, iommu,
+ &iommu_cmdbuf_fops);
+ }
- snprintf(name, MAX_NAME_LEN, "iommu%02d", iommu->index);
- iommu->debugfs = debugfs_create_dir(name, amd_iommu_debugfs);
+ debugfs_create_file("devid", 0644, amd_iommu_debugfs, NULL,
+ &devid_fops);
+ debugfs_create_file("devtbl", 0444, amd_iommu_debugfs, NULL,
+ &iommu_devtbl_fops);
+ debugfs_create_file("irqtbl", 0444, amd_iommu_debugfs, NULL,
+ &iommu_irqtbl_fops);
}
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index c89d85b54a1a..4b2953418977 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -12,7 +12,6 @@
#include <linux/acpi.h>
#include <linux/list.h>
#include <linux/bitmap.h>
-#include <linux/slab.h>
#include <linux/syscore_ops.h>
#include <linux/interrupt.h>
#include <linux/msi.h>
@@ -152,7 +151,9 @@ struct ivmd_header {
bool amd_iommu_dump;
bool amd_iommu_irq_remap __read_mostly;
-enum io_pgtable_fmt amd_iommu_pgtable = AMD_IOMMU_V1;
+enum protection_domain_mode amd_iommu_pgtable = PD_MODE_V1;
+/* Host page table level */
+u8 amd_iommu_hpt_level;
/* Guest page table level */
int amd_iommu_gpt_level = PAGE_MODE_4_LEVEL;
@@ -169,16 +170,16 @@ static int amd_iommu_target_ivhd_type;
u64 amd_iommu_efr;
u64 amd_iommu_efr2;
+/* Host (v1) page table is not supported*/
+bool amd_iommu_hatdis;
+
/* SNP is enabled on the system? */
bool amd_iommu_snp_en;
EXPORT_SYMBOL(amd_iommu_snp_en);
LIST_HEAD(amd_iommu_pci_seg_list); /* list of all PCI segments */
-LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
- system */
-
-/* Array to assign indices to IOMMUs*/
-struct amd_iommu *amd_iommus[MAX_IOMMUS];
+LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */
+LIST_HEAD(amd_ivhd_dev_flags_list); /* list of all IVHD device entry settings */
/* Number of IOMMUs present in the system */
static int amd_iommus_present;
@@ -192,11 +193,7 @@ bool amdr_ivrs_remap_support __read_mostly;
bool amd_iommu_force_isolation __read_mostly;
-/*
- * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap
- * to know which ones are already in use.
- */
-unsigned long *amd_iommu_pd_alloc_bitmap;
+unsigned long amd_iommu_pgsize_bitmap __ro_after_init = AMD_IOMMU_PGSIZES;
enum iommu_init_state {
IOMMU_START_STATE,
@@ -226,7 +223,6 @@ static bool __initdata cmdline_maps;
static enum iommu_init_state init_state = IOMMU_START_STATE;
static int amd_iommu_enable_interrupts(void);
-static int __init iommu_go_to_state(enum iommu_init_state state);
static void init_device_table_dma(struct amd_iommu_pci_seg *pci_seg);
static bool amd_iommu_pre_enabled = true;
@@ -252,17 +248,14 @@ static void init_translation_status(struct amd_iommu *iommu)
iommu->flags |= AMD_IOMMU_FLAG_TRANS_PRE_ENABLED;
}
-static inline unsigned long tbl_size(int entry_size, int last_bdf)
+int amd_iommu_get_num_iommus(void)
{
- unsigned shift = PAGE_SHIFT +
- get_order((last_bdf + 1) * entry_size);
-
- return 1UL << shift;
+ return amd_iommus_present;
}
-int amd_iommu_get_num_iommus(void)
+bool amd_iommu_ht_range_ignore(void)
{
- return amd_iommus_present;
+ return check_feature2(FEATURE_HT_RANGE_IGNORE);
}
/*
@@ -413,39 +406,35 @@ static void iommu_set_device_table(struct amd_iommu *iommu)
BUG_ON(iommu->mmio_base == NULL);
+ if (is_kdump_kernel())
+ return;
+
entry = iommu_virt_to_phys(dev_table);
entry |= (dev_table_size >> 12) - 1;
memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
&entry, sizeof(entry));
}
-/* Generic functions to enable/disable certain features of the IOMMU. */
-void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
+static void iommu_feature_set(struct amd_iommu *iommu, u64 val, u64 mask, u8 shift)
{
u64 ctrl;
ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET);
- ctrl |= (1ULL << bit);
+ mask <<= shift;
+ ctrl &= ~mask;
+ ctrl |= (val << shift) & mask;
writeq(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
}
-static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
+/* Generic functions to enable/disable certain features of the IOMMU. */
+void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
{
- u64 ctrl;
-
- ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET);
- ctrl &= ~(1ULL << bit);
- writeq(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
+ iommu_feature_set(iommu, 1ULL, 1ULL, bit);
}
-static void iommu_set_inv_tlb_timeout(struct amd_iommu *iommu, int timeout)
+static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
{
- u64 ctrl;
-
- ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET);
- ctrl &= ~CTRL_INV_TO_MASK;
- ctrl |= (timeout << CONTROL_INV_TIMEOUT) & CTRL_INV_TO_MASK;
- writeq(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
+ iommu_feature_set(iommu, 0ULL, 1ULL, bit);
}
/* Function to enable the hardware */
@@ -650,8 +639,8 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table, u16 pci_
/* Allocate per PCI segment device table */
static inline int __init alloc_dev_table(struct amd_iommu_pci_seg *pci_seg)
{
- pci_seg->dev_table = iommu_alloc_pages(GFP_KERNEL | GFP_DMA32,
- get_order(pci_seg->dev_table_size));
+ pci_seg->dev_table = iommu_alloc_pages_sz(GFP_KERNEL | GFP_DMA32,
+ pci_seg->dev_table_size);
if (!pci_seg->dev_table)
return -ENOMEM;
@@ -660,16 +649,19 @@ static inline int __init alloc_dev_table(struct amd_iommu_pci_seg *pci_seg)
static inline void free_dev_table(struct amd_iommu_pci_seg *pci_seg)
{
- iommu_free_pages(pci_seg->dev_table,
- get_order(pci_seg->dev_table_size));
+ if (is_kdump_kernel())
+ memunmap((void *)pci_seg->dev_table);
+ else
+ iommu_free_pages(pci_seg->dev_table);
pci_seg->dev_table = NULL;
}
/* Allocate per PCI segment IOMMU rlookup table. */
static inline int __init alloc_rlookup_table(struct amd_iommu_pci_seg *pci_seg)
{
- pci_seg->rlookup_table = iommu_alloc_pages(GFP_KERNEL,
- get_order(pci_seg->rlookup_table_size));
+ pci_seg->rlookup_table = kvcalloc(pci_seg->last_bdf + 1,
+ sizeof(*pci_seg->rlookup_table),
+ GFP_KERNEL);
if (pci_seg->rlookup_table == NULL)
return -ENOMEM;
@@ -678,17 +670,15 @@ static inline int __init alloc_rlookup_table(struct amd_iommu_pci_seg *pci_seg)
static inline void free_rlookup_table(struct amd_iommu_pci_seg *pci_seg)
{
- iommu_free_pages(pci_seg->rlookup_table,
- get_order(pci_seg->rlookup_table_size));
+ kvfree(pci_seg->rlookup_table);
pci_seg->rlookup_table = NULL;
}
static inline int __init alloc_irq_lookup_table(struct amd_iommu_pci_seg *pci_seg)
{
- pci_seg->irq_lookup_table = iommu_alloc_pages(GFP_KERNEL,
- get_order(pci_seg->rlookup_table_size));
- kmemleak_alloc(pci_seg->irq_lookup_table,
- pci_seg->rlookup_table_size, 1, GFP_KERNEL);
+ pci_seg->irq_lookup_table = kvcalloc(pci_seg->last_bdf + 1,
+ sizeof(*pci_seg->irq_lookup_table),
+ GFP_KERNEL);
if (pci_seg->irq_lookup_table == NULL)
return -ENOMEM;
@@ -697,9 +687,7 @@ static inline int __init alloc_irq_lookup_table(struct amd_iommu_pci_seg *pci_se
static inline void free_irq_lookup_table(struct amd_iommu_pci_seg *pci_seg)
{
- kmemleak_free(pci_seg->irq_lookup_table);
- iommu_free_pages(pci_seg->irq_lookup_table,
- get_order(pci_seg->rlookup_table_size));
+ kvfree(pci_seg->irq_lookup_table);
pci_seg->irq_lookup_table = NULL;
}
@@ -707,8 +695,9 @@ static int __init alloc_alias_table(struct amd_iommu_pci_seg *pci_seg)
{
int i;
- pci_seg->alias_table = iommu_alloc_pages(GFP_KERNEL,
- get_order(pci_seg->alias_table_size));
+ pci_seg->alias_table = kvmalloc_array(pci_seg->last_bdf + 1,
+ sizeof(*pci_seg->alias_table),
+ GFP_KERNEL);
if (!pci_seg->alias_table)
return -ENOMEM;
@@ -723,11 +712,30 @@ static int __init alloc_alias_table(struct amd_iommu_pci_seg *pci_seg)
static void __init free_alias_table(struct amd_iommu_pci_seg *pci_seg)
{
- iommu_free_pages(pci_seg->alias_table,
- get_order(pci_seg->alias_table_size));
+ kvfree(pci_seg->alias_table);
pci_seg->alias_table = NULL;
}
+static inline void *iommu_memremap(unsigned long paddr, size_t size)
+{
+ phys_addr_t phys;
+
+ if (!paddr)
+ return NULL;
+
+ /*
+ * Obtain true physical address in kdump kernel when SME is enabled.
+ * Currently, previous kernel with SME enabled and kdump kernel
+ * with SME support disabled is not supported.
+ */
+ phys = __sme_clr(paddr);
+
+ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ return (__force void *)ioremap_encrypted(phys, size);
+ else
+ return memremap(phys, size, MEMREMAP_WB);
+}
+
/*
* Allocates the command buffer. This buffer is per AMD IOMMU. We can
* write commands to that buffer later and the IOMMU will execute them
@@ -735,8 +743,7 @@ static void __init free_alias_table(struct amd_iommu_pci_seg *pci_seg)
*/
static int __init alloc_command_buffer(struct amd_iommu *iommu)
{
- iommu->cmd_buf = iommu_alloc_pages(GFP_KERNEL,
- get_order(CMD_BUFFER_SIZE));
+ iommu->cmd_buf = iommu_alloc_pages_sz(GFP_KERNEL, CMD_BUFFER_SIZE);
return iommu->cmd_buf ? 0 : -ENOMEM;
}
@@ -814,11 +821,16 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
BUG_ON(iommu->cmd_buf == NULL);
- entry = iommu_virt_to_phys(iommu->cmd_buf);
- entry |= MMIO_CMD_SIZE_512;
-
- memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
- &entry, sizeof(entry));
+ if (!is_kdump_kernel()) {
+ /*
+ * Command buffer is re-used for kdump kernel and setting
+ * of MMIO register is not required.
+ */
+ entry = iommu_virt_to_phys(iommu->cmd_buf);
+ entry |= MMIO_CMD_SIZE_512;
+ memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
+ &entry, sizeof(entry));
+ }
amd_iommu_reset_cmd_buffer(iommu);
}
@@ -833,20 +845,22 @@ static void iommu_disable_command_buffer(struct amd_iommu *iommu)
static void __init free_command_buffer(struct amd_iommu *iommu)
{
- iommu_free_pages(iommu->cmd_buf, get_order(CMD_BUFFER_SIZE));
+ iommu_free_pages(iommu->cmd_buf);
}
void *__init iommu_alloc_4k_pages(struct amd_iommu *iommu, gfp_t gfp,
size_t size)
{
- int order = get_order(size);
- void *buf = iommu_alloc_pages(gfp, order);
+ void *buf;
- if (buf &&
- check_feature(FEATURE_SNP) &&
- set_memory_4k((unsigned long)buf, (1 << order))) {
- iommu_free_pages(buf, order);
- buf = NULL;
+ size = PAGE_ALIGN(size);
+ buf = iommu_alloc_pages_sz(gfp, size);
+ if (!buf)
+ return NULL;
+ if (check_feature(FEATURE_SNP) &&
+ set_memory_4k((unsigned long)buf, size / PAGE_SIZE)) {
+ iommu_free_pages(buf);
+ return NULL;
}
return buf;
@@ -867,10 +881,15 @@ static void iommu_enable_event_buffer(struct amd_iommu *iommu)
BUG_ON(iommu->evt_buf == NULL);
- entry = iommu_virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
-
- memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
- &entry, sizeof(entry));
+ if (!is_kdump_kernel()) {
+ /*
+ * Event buffer is re-used for kdump kernel and setting
+ * of MMIO register is not required.
+ */
+ entry = iommu_virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
+ memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
+ &entry, sizeof(entry));
+ }
/* set head and tail to zero manually */
writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
@@ -889,14 +908,14 @@ static void iommu_disable_event_buffer(struct amd_iommu *iommu)
static void __init free_event_buffer(struct amd_iommu *iommu)
{
- iommu_free_pages(iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
+ iommu_free_pages(iommu->evt_buf);
}
static void free_ga_log(struct amd_iommu *iommu)
{
#ifdef CONFIG_IRQ_REMAP
- iommu_free_pages(iommu->ga_log, get_order(GA_LOG_SIZE));
- iommu_free_pages(iommu->ga_log_tail, get_order(8));
+ iommu_free_pages(iommu->ga_log);
+ iommu_free_pages(iommu->ga_log_tail);
#endif
}
@@ -941,11 +960,11 @@ static int iommu_init_ga_log(struct amd_iommu *iommu)
if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
return 0;
- iommu->ga_log = iommu_alloc_pages(GFP_KERNEL, get_order(GA_LOG_SIZE));
+ iommu->ga_log = iommu_alloc_pages_sz(GFP_KERNEL, GA_LOG_SIZE);
if (!iommu->ga_log)
goto err_out;
- iommu->ga_log_tail = iommu_alloc_pages(GFP_KERNEL, get_order(8));
+ iommu->ga_log_tail = iommu_alloc_pages_sz(GFP_KERNEL, 8);
if (!iommu->ga_log_tail)
goto err_out;
@@ -959,14 +978,129 @@ err_out:
static int __init alloc_cwwb_sem(struct amd_iommu *iommu)
{
iommu->cmd_sem = iommu_alloc_4k_pages(iommu, GFP_KERNEL, 1);
+ if (!iommu->cmd_sem)
+ return -ENOMEM;
+ iommu->cmd_sem_paddr = iommu_virt_to_phys((void *)iommu->cmd_sem);
+ return 0;
+}
+
+static int __init remap_event_buffer(struct amd_iommu *iommu)
+{
+ u64 paddr;
+
+ pr_info_once("Re-using event buffer from the previous kernel\n");
+ paddr = readq(iommu->mmio_base + MMIO_EVT_BUF_OFFSET) & PM_ADDR_MASK;
+ iommu->evt_buf = iommu_memremap(paddr, EVT_BUFFER_SIZE);
- return iommu->cmd_sem ? 0 : -ENOMEM;
+ return iommu->evt_buf ? 0 : -ENOMEM;
+}
+
+static int __init remap_command_buffer(struct amd_iommu *iommu)
+{
+ u64 paddr;
+
+ pr_info_once("Re-using command buffer from the previous kernel\n");
+ paddr = readq(iommu->mmio_base + MMIO_CMD_BUF_OFFSET) & PM_ADDR_MASK;
+ iommu->cmd_buf = iommu_memremap(paddr, CMD_BUFFER_SIZE);
+
+ return iommu->cmd_buf ? 0 : -ENOMEM;
+}
+
+static int __init remap_or_alloc_cwwb_sem(struct amd_iommu *iommu)
+{
+ u64 paddr;
+
+ if (check_feature(FEATURE_SNP)) {
+ /*
+ * When SNP is enabled, the exclusion base register is used for the
+ * completion wait buffer (CWB) address. Read and re-use it.
+ */
+ pr_info_once("Re-using CWB buffers from the previous kernel\n");
+ paddr = readq(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET) & PM_ADDR_MASK;
+ iommu->cmd_sem = iommu_memremap(paddr, PAGE_SIZE);
+ if (!iommu->cmd_sem)
+ return -ENOMEM;
+ iommu->cmd_sem_paddr = paddr;
+ } else {
+ return alloc_cwwb_sem(iommu);
+ }
+
+ return 0;
+}
+
+static int __init alloc_iommu_buffers(struct amd_iommu *iommu)
+{
+ int ret;
+
+ /*
+ * Reuse/Remap the previous kernel's allocated completion wait
+ * command and event buffers for kdump boot.
+ */
+ if (is_kdump_kernel()) {
+ ret = remap_or_alloc_cwwb_sem(iommu);
+ if (ret)
+ return ret;
+
+ ret = remap_command_buffer(iommu);
+ if (ret)
+ return ret;
+
+ ret = remap_event_buffer(iommu);
+ if (ret)
+ return ret;
+ } else {
+ ret = alloc_cwwb_sem(iommu);
+ if (ret)
+ return ret;
+
+ ret = alloc_command_buffer(iommu);
+ if (ret)
+ return ret;
+
+ ret = alloc_event_buffer(iommu);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
}
static void __init free_cwwb_sem(struct amd_iommu *iommu)
{
if (iommu->cmd_sem)
- iommu_free_page((void *)iommu->cmd_sem);
+ iommu_free_pages((void *)iommu->cmd_sem);
+}
+static void __init unmap_cwwb_sem(struct amd_iommu *iommu)
+{
+ if (iommu->cmd_sem) {
+ if (check_feature(FEATURE_SNP))
+ memunmap((void *)iommu->cmd_sem);
+ else
+ iommu_free_pages((void *)iommu->cmd_sem);
+ }
+}
+
+static void __init unmap_command_buffer(struct amd_iommu *iommu)
+{
+ memunmap((void *)iommu->cmd_buf);
+}
+
+static void __init unmap_event_buffer(struct amd_iommu *iommu)
+{
+ memunmap(iommu->evt_buf);
+}
+
+static void __init free_iommu_buffers(struct amd_iommu *iommu)
+{
+ if (is_kdump_kernel()) {
+ unmap_cwwb_sem(iommu);
+ unmap_command_buffer(iommu);
+ unmap_event_buffer(iommu);
+ } else {
+ free_cwwb_sem(iommu);
+ free_command_buffer(iommu);
+ free_event_buffer(iommu);
+ }
}
static void iommu_enable_xt(struct amd_iommu *iommu)
@@ -991,47 +1125,20 @@ static void iommu_enable_gt(struct amd_iommu *iommu)
}
/* sets a specific bit in the device table entry. */
-static void __set_dev_entry_bit(struct dev_table_entry *dev_table,
- u16 devid, u8 bit)
-{
- int i = (bit >> 6) & 0x03;
- int _bit = bit & 0x3f;
-
- dev_table[devid].data[i] |= (1UL << _bit);
-}
-
-static void set_dev_entry_bit(struct amd_iommu *iommu, u16 devid, u8 bit)
-{
- struct dev_table_entry *dev_table = get_dev_table(iommu);
-
- return __set_dev_entry_bit(dev_table, devid, bit);
-}
-
-static int __get_dev_entry_bit(struct dev_table_entry *dev_table,
- u16 devid, u8 bit)
+static void set_dte_bit(struct dev_table_entry *dte, u8 bit)
{
int i = (bit >> 6) & 0x03;
int _bit = bit & 0x3f;
- return (dev_table[devid].data[i] & (1UL << _bit)) >> _bit;
+ dte->data[i] |= (1UL << _bit);
}
-static int get_dev_entry_bit(struct amd_iommu *iommu, u16 devid, u8 bit)
+static bool __reuse_device_table(struct amd_iommu *iommu)
{
- struct dev_table_entry *dev_table = get_dev_table(iommu);
-
- return __get_dev_entry_bit(dev_table, devid, bit);
-}
-
-static bool __copy_device_table(struct amd_iommu *iommu)
-{
- u64 int_ctl, int_tab_len, entry = 0;
struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
- struct dev_table_entry *old_devtb = NULL;
- u32 lo, hi, devid, old_devtb_size;
+ u32 lo, hi, old_devtb_size;
phys_addr_t old_devtb_phys;
- u16 dom_id, dte_v, irq_v;
- u64 tmp;
+ u64 entry;
/* Each IOMMU use separate device table with the same size */
lo = readl(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET);
@@ -1056,62 +1163,20 @@ static bool __copy_device_table(struct amd_iommu *iommu)
pr_err("The address of old device table is above 4G, not trustworthy!\n");
return false;
}
- old_devtb = (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT) && is_kdump_kernel())
- ? (__force void *)ioremap_encrypted(old_devtb_phys,
- pci_seg->dev_table_size)
- : memremap(old_devtb_phys, pci_seg->dev_table_size, MEMREMAP_WB);
-
- if (!old_devtb)
- return false;
- pci_seg->old_dev_tbl_cpy = iommu_alloc_pages(GFP_KERNEL | GFP_DMA32,
- get_order(pci_seg->dev_table_size));
+ /*
+ * Re-use the previous kernel's device table for kdump.
+ */
+ pci_seg->old_dev_tbl_cpy = iommu_memremap(old_devtb_phys, pci_seg->dev_table_size);
if (pci_seg->old_dev_tbl_cpy == NULL) {
- pr_err("Failed to allocate memory for copying old device table!\n");
- memunmap(old_devtb);
+ pr_err("Failed to remap memory for reusing old device table!\n");
return false;
}
- for (devid = 0; devid <= pci_seg->last_bdf; ++devid) {
- pci_seg->old_dev_tbl_cpy[devid] = old_devtb[devid];
- dom_id = old_devtb[devid].data[1] & DEV_DOMID_MASK;
- dte_v = old_devtb[devid].data[0] & DTE_FLAG_V;
-
- if (dte_v && dom_id) {
- pci_seg->old_dev_tbl_cpy[devid].data[0] = old_devtb[devid].data[0];
- pci_seg->old_dev_tbl_cpy[devid].data[1] = old_devtb[devid].data[1];
- __set_bit(dom_id, amd_iommu_pd_alloc_bitmap);
- /* If gcr3 table existed, mask it out */
- if (old_devtb[devid].data[0] & DTE_FLAG_GV) {
- tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
- tmp |= DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
- pci_seg->old_dev_tbl_cpy[devid].data[1] &= ~tmp;
- tmp = DTE_GCR3_VAL_A(~0ULL) << DTE_GCR3_SHIFT_A;
- tmp |= DTE_FLAG_GV;
- pci_seg->old_dev_tbl_cpy[devid].data[0] &= ~tmp;
- }
- }
-
- irq_v = old_devtb[devid].data[2] & DTE_IRQ_REMAP_ENABLE;
- int_ctl = old_devtb[devid].data[2] & DTE_IRQ_REMAP_INTCTL_MASK;
- int_tab_len = old_devtb[devid].data[2] & DTE_INTTABLEN_MASK;
- if (irq_v && (int_ctl || int_tab_len)) {
- if ((int_ctl != DTE_IRQ_REMAP_INTCTL) ||
- (int_tab_len != DTE_INTTABLEN)) {
- pr_err("Wrong old irq remapping flag: %#x\n", devid);
- memunmap(old_devtb);
- return false;
- }
-
- pci_seg->old_dev_tbl_cpy[devid].data[2] = old_devtb[devid].data[2];
- }
- }
- memunmap(old_devtb);
-
return true;
}
-static bool copy_device_table(void)
+static bool reuse_device_table(void)
{
struct amd_iommu *iommu;
struct amd_iommu_pci_seg *pci_seg;
@@ -1119,17 +1184,17 @@ static bool copy_device_table(void)
if (!amd_iommu_pre_enabled)
return false;
- pr_warn("Translation is already enabled - trying to copy translation structures\n");
+ pr_warn("Translation is already enabled - trying to reuse translation structures\n");
/*
* All IOMMUs within PCI segment shares common device table.
- * Hence copy device table only once per PCI segment.
+ * Hence reuse device table only once per PCI segment.
*/
for_each_pci_segment(pci_seg) {
for_each_iommu(iommu) {
if (pci_seg->id != iommu->pci_seg->id)
continue;
- if (!__copy_device_table(iommu))
+ if (!__reuse_device_table(iommu))
return false;
break;
}
@@ -1138,42 +1203,107 @@ static bool copy_device_table(void)
return true;
}
-void amd_iommu_apply_erratum_63(struct amd_iommu *iommu, u16 devid)
+struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 segid, u16 devid)
{
- int sysmgt;
+ struct ivhd_dte_flags *e;
+ unsigned int best_len = UINT_MAX;
+ struct dev_table_entry *dte = NULL;
- sysmgt = get_dev_entry_bit(iommu, devid, DEV_ENTRY_SYSMGT1) |
- (get_dev_entry_bit(iommu, devid, DEV_ENTRY_SYSMGT2) << 1);
+ for_each_ivhd_dte_flags(e) {
+ /*
+ * Need to go through the whole list to find the smallest range,
+ * which contains the devid.
+ */
+ if ((e->segid == segid) &&
+ (e->devid_first <= devid) && (devid <= e->devid_last)) {
+ unsigned int len = e->devid_last - e->devid_first;
- if (sysmgt == 0x01)
- set_dev_entry_bit(iommu, devid, DEV_ENTRY_IW);
+ if (len < best_len) {
+ dte = &(e->dte);
+ best_len = len;
+ }
+ }
+ }
+ return dte;
+}
+
+static bool search_ivhd_dte_flags(u16 segid, u16 first, u16 last)
+{
+ struct ivhd_dte_flags *e;
+
+ for_each_ivhd_dte_flags(e) {
+ if ((e->segid == segid) &&
+ (e->devid_first == first) &&
+ (e->devid_last == last))
+ return true;
+ }
+ return false;
}
/*
* This function takes the device specific flags read from the ACPI
* table and sets up the device table entry with that information
*/
-static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
- u16 devid, u32 flags, u32 ext_flags)
+static void __init
+set_dev_entry_from_acpi_range(struct amd_iommu *iommu, u16 first, u16 last,
+ u32 flags, u32 ext_flags)
{
- if (flags & ACPI_DEVFLAG_INITPASS)
- set_dev_entry_bit(iommu, devid, DEV_ENTRY_INIT_PASS);
- if (flags & ACPI_DEVFLAG_EXTINT)
- set_dev_entry_bit(iommu, devid, DEV_ENTRY_EINT_PASS);
- if (flags & ACPI_DEVFLAG_NMI)
- set_dev_entry_bit(iommu, devid, DEV_ENTRY_NMI_PASS);
- if (flags & ACPI_DEVFLAG_SYSMGT1)
- set_dev_entry_bit(iommu, devid, DEV_ENTRY_SYSMGT1);
- if (flags & ACPI_DEVFLAG_SYSMGT2)
- set_dev_entry_bit(iommu, devid, DEV_ENTRY_SYSMGT2);
- if (flags & ACPI_DEVFLAG_LINT0)
- set_dev_entry_bit(iommu, devid, DEV_ENTRY_LINT0_PASS);
- if (flags & ACPI_DEVFLAG_LINT1)
- set_dev_entry_bit(iommu, devid, DEV_ENTRY_LINT1_PASS);
+ int i;
+ struct dev_table_entry dte = {};
+
+ /* Parse IVHD DTE setting flags and store information */
+ if (flags) {
+ struct ivhd_dte_flags *d;
- amd_iommu_apply_erratum_63(iommu, devid);
+ if (search_ivhd_dte_flags(iommu->pci_seg->id, first, last))
+ return;
+
+ d = kzalloc(sizeof(struct ivhd_dte_flags), GFP_KERNEL);
+ if (!d)
+ return;
- amd_iommu_set_rlookup_table(iommu, devid);
+ pr_debug("%s: devid range %#x:%#x\n", __func__, first, last);
+
+ if (flags & ACPI_DEVFLAG_INITPASS)
+ set_dte_bit(&dte, DEV_ENTRY_INIT_PASS);
+ if (flags & ACPI_DEVFLAG_EXTINT)
+ set_dte_bit(&dte, DEV_ENTRY_EINT_PASS);
+ if (flags & ACPI_DEVFLAG_NMI)
+ set_dte_bit(&dte, DEV_ENTRY_NMI_PASS);
+ if (flags & ACPI_DEVFLAG_SYSMGT1)
+ set_dte_bit(&dte, DEV_ENTRY_SYSMGT1);
+ if (flags & ACPI_DEVFLAG_SYSMGT2)
+ set_dte_bit(&dte, DEV_ENTRY_SYSMGT2);
+ if (flags & ACPI_DEVFLAG_LINT0)
+ set_dte_bit(&dte, DEV_ENTRY_LINT0_PASS);
+ if (flags & ACPI_DEVFLAG_LINT1)
+ set_dte_bit(&dte, DEV_ENTRY_LINT1_PASS);
+
+ /* Apply erratum 63, which needs info in initial_dte */
+ if (FIELD_GET(DTE_DATA1_SYSMGT_MASK, dte.data[1]) == 0x1)
+ dte.data[0] |= DTE_FLAG_IW;
+
+ memcpy(&d->dte, &dte, sizeof(dte));
+ d->segid = iommu->pci_seg->id;
+ d->devid_first = first;
+ d->devid_last = last;
+ list_add_tail(&d->list, &amd_ivhd_dev_flags_list);
+ }
+
+ for (i = first; i <= last; i++) {
+ if (flags) {
+ struct dev_table_entry *dev_table = get_dev_table(iommu);
+
+ memcpy(&dev_table[i], &dte, sizeof(dte));
+ }
+ amd_iommu_set_rlookup_table(iommu, i);
+ }
+}
+
+static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
+ u16 devid, u32 flags, u32 ext_flags)
+{
+ set_dev_entry_from_acpi_range(iommu, devid, devid, flags, ext_flags);
}
int __init add_special_device(u8 type, u8 id, u32 *devid, bool cmd_line)
@@ -1241,7 +1371,7 @@ static int __init add_acpi_hid_device(u8 *hid, u8 *uid, u32 *devid,
entry->cmd_line = cmd_line;
entry->root_devid = (entry->devid & (~0x7));
- pr_info("%s, add hid:%s, uid:%s, rdevid:%d\n",
+ pr_info("%s, add hid:%s, uid:%s, rdevid:%#x\n",
entry->cmd_line ? "cmd" : "ivrs",
entry->hid, entry->uid, entry->root_devid);
@@ -1333,15 +1463,12 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu,
switch (e->type) {
case IVHD_DEV_ALL:
- DUMP_printk(" DEV_ALL\t\t\tflags: %02x\n", e->flags);
-
- for (dev_i = 0; dev_i <= pci_seg->last_bdf; ++dev_i)
- set_dev_entry_from_acpi(iommu, dev_i, e->flags, 0);
+ DUMP_printk(" DEV_ALL\t\t\tsetting: %#02x\n", e->flags);
+ set_dev_entry_from_acpi_range(iommu, 0, pci_seg->last_bdf, e->flags, 0);
break;
case IVHD_DEV_SELECT:
- DUMP_printk(" DEV_SELECT\t\t\t devid: %04x:%02x:%02x.%x "
- "flags: %02x\n",
+ DUMP_printk(" DEV_SELECT\t\t\tdevid: %04x:%02x:%02x.%x flags: %#02x\n",
seg_id, PCI_BUS_NUM(e->devid),
PCI_SLOT(e->devid),
PCI_FUNC(e->devid),
@@ -1352,8 +1479,7 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu,
break;
case IVHD_DEV_SELECT_RANGE_START:
- DUMP_printk(" DEV_SELECT_RANGE_START\t "
- "devid: %04x:%02x:%02x.%x flags: %02x\n",
+ DUMP_printk(" DEV_SELECT_RANGE_START\tdevid: %04x:%02x:%02x.%x flags: %#02x\n",
seg_id, PCI_BUS_NUM(e->devid),
PCI_SLOT(e->devid),
PCI_FUNC(e->devid),
@@ -1366,8 +1492,7 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu,
break;
case IVHD_DEV_ALIAS:
- DUMP_printk(" DEV_ALIAS\t\t\t devid: %04x:%02x:%02x.%x "
- "flags: %02x devid_to: %02x:%02x.%x\n",
+ DUMP_printk(" DEV_ALIAS\t\t\tdevid: %04x:%02x:%02x.%x flags: %#02x devid_to: %02x:%02x.%x\n",
seg_id, PCI_BUS_NUM(e->devid),
PCI_SLOT(e->devid),
PCI_FUNC(e->devid),
@@ -1384,9 +1509,7 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu,
break;
case IVHD_DEV_ALIAS_RANGE:
- DUMP_printk(" DEV_ALIAS_RANGE\t\t "
- "devid: %04x:%02x:%02x.%x flags: %02x "
- "devid_to: %04x:%02x:%02x.%x\n",
+ DUMP_printk(" DEV_ALIAS_RANGE\t\tdevid: %04x:%02x:%02x.%x flags: %#02x devid_to: %04x:%02x:%02x.%x\n",
seg_id, PCI_BUS_NUM(e->devid),
PCI_SLOT(e->devid),
PCI_FUNC(e->devid),
@@ -1403,8 +1526,7 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu,
break;
case IVHD_DEV_EXT_SELECT:
- DUMP_printk(" DEV_EXT_SELECT\t\t devid: %04x:%02x:%02x.%x "
- "flags: %02x ext: %08x\n",
+ DUMP_printk(" DEV_EXT_SELECT\t\tdevid: %04x:%02x:%02x.%x flags: %#02x ext: %08x\n",
seg_id, PCI_BUS_NUM(e->devid),
PCI_SLOT(e->devid),
PCI_FUNC(e->devid),
@@ -1416,8 +1538,7 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu,
break;
case IVHD_DEV_EXT_SELECT_RANGE:
- DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: "
- "%04x:%02x:%02x.%x flags: %02x ext: %08x\n",
+ DUMP_printk(" DEV_EXT_SELECT_RANGE\tdevid: %04x:%02x:%02x.%x flags: %#02x ext: %08x\n",
seg_id, PCI_BUS_NUM(e->devid),
PCI_SLOT(e->devid),
PCI_FUNC(e->devid),
@@ -1430,21 +1551,18 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu,
break;
case IVHD_DEV_RANGE_END:
- DUMP_printk(" DEV_RANGE_END\t\t devid: %04x:%02x:%02x.%x\n",
+ DUMP_printk(" DEV_RANGE_END\t\tdevid: %04x:%02x:%02x.%x\n",
seg_id, PCI_BUS_NUM(e->devid),
PCI_SLOT(e->devid),
PCI_FUNC(e->devid));
devid = e->devid;
- for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
- if (alias) {
+ if (alias) {
+ for (dev_i = devid_start; dev_i <= devid; ++dev_i)
pci_seg->alias_table[dev_i] = devid_to;
- set_dev_entry_from_acpi(iommu,
- devid_to, flags, ext_flags);
- }
- set_dev_entry_from_acpi(iommu, dev_i,
- flags, ext_flags);
+ set_dev_entry_from_acpi(iommu, devid_to, flags, ext_flags);
}
+ set_dev_entry_from_acpi_range(iommu, devid_start, devid, flags, ext_flags);
break;
case IVHD_DEV_SPECIAL: {
u8 handle, type;
@@ -1463,11 +1581,12 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu,
else
var = "UNKNOWN";
- DUMP_printk(" DEV_SPECIAL(%s[%d])\t\tdevid: %04x:%02x:%02x.%x\n",
+ DUMP_printk(" DEV_SPECIAL(%s[%d])\t\tdevid: %04x:%02x:%02x.%x, flags: %#02x\n",
var, (int)handle,
seg_id, PCI_BUS_NUM(devid),
PCI_SLOT(devid),
- PCI_FUNC(devid));
+ PCI_FUNC(devid),
+ e->flags);
ret = add_special_device(type, handle, &devid, false);
if (ret)
@@ -1527,11 +1646,12 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu,
}
devid = PCI_SEG_DEVID_TO_SBDF(seg_id, e->devid);
- DUMP_printk(" DEV_ACPI_HID(%s[%s])\t\tdevid: %04x:%02x:%02x.%x\n",
+ DUMP_printk(" DEV_ACPI_HID(%s[%s])\t\tdevid: %04x:%02x:%02x.%x, flags: %#02x\n",
hid, uid, seg_id,
PCI_BUS_NUM(devid),
PCI_SLOT(devid),
- PCI_FUNC(devid));
+ PCI_FUNC(devid),
+ e->flags);
flags = e->flags;
@@ -1580,9 +1700,9 @@ static struct amd_iommu_pci_seg *__init alloc_pci_segment(u16 id,
pci_seg->last_bdf = last_bdf;
DUMP_printk("PCI segment : 0x%0x, last bdf : 0x%04x\n", id, last_bdf);
- pci_seg->dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE, last_bdf);
- pci_seg->alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE, last_bdf);
- pci_seg->rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE, last_bdf);
+ pci_seg->dev_table_size =
+ max(roundup_pow_of_two((last_bdf + 1) * DEV_TABLE_ENTRY_SIZE),
+ SZ_4K);
pci_seg->id = id;
init_llist_head(&pci_seg->dev_data_list);
@@ -1590,13 +1710,22 @@ static struct amd_iommu_pci_seg *__init alloc_pci_segment(u16 id,
list_add_tail(&pci_seg->list, &amd_iommu_pci_seg_list);
if (alloc_dev_table(pci_seg))
- return NULL;
+ goto err_free_pci_seg;
if (alloc_alias_table(pci_seg))
- return NULL;
+ goto err_free_dev_table;
if (alloc_rlookup_table(pci_seg))
- return NULL;
+ goto err_free_alias_table;
return pci_seg;
+
+err_free_alias_table:
+ free_alias_table(pci_seg);
+err_free_dev_table:
+ free_dev_table(pci_seg);
+err_free_pci_seg:
+ list_del(&pci_seg->list);
+ kfree(pci_seg);
+ return NULL;
}
static struct amd_iommu_pci_seg *__init get_pci_segment(u16 id,
@@ -1637,9 +1766,7 @@ static void __init free_sysfs(struct amd_iommu *iommu)
static void __init free_iommu_one(struct amd_iommu *iommu)
{
free_sysfs(iommu);
- free_cwwb_sem(iommu);
- free_command_buffer(iommu);
- free_event_buffer(iommu);
+ free_iommu_buffers(iommu);
amd_iommu_free_ppr_log(iommu);
free_ga_log(iommu);
iommu_unmap_mmio_space(iommu);
@@ -1742,9 +1869,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h,
return -ENOSYS;
}
- /* Index is fine - add IOMMU to the array */
- amd_iommus[iommu->index] = iommu;
-
/*
* Copy data from ACPI table entry to the iommu struct
*/
@@ -1762,13 +1886,8 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h,
else
iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET;
- /*
- * Note: GA (128-bit IRTE) mode requires cmpxchg16b supports.
- * GAM also requires GA mode. Therefore, we need to
- * check cmpxchg16b support before enabling it.
- */
- if (!boot_cpu_has(X86_FEATURE_CX16) ||
- ((h->efr_attr & (0x1 << IOMMU_FEAT_GASUP_SHIFT)) == 0))
+ /* GAM requires GA mode. */
+ if ((h->efr_attr & (0x1 << IOMMU_FEAT_GASUP_SHIFT)) == 0)
amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY;
break;
case 0x11:
@@ -1778,13 +1897,8 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h,
else
iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET;
- /*
- * Note: GA (128-bit IRTE) mode requires cmpxchg16b supports.
- * XT, GAM also requires GA mode. Therefore, we need to
- * check cmpxchg16b support before enabling them.
- */
- if (!boot_cpu_has(X86_FEATURE_CX16) ||
- ((h->efr_reg & (0x1 << IOMMU_EFR_GASUP_SHIFT)) == 0)) {
+ /* XT and GAM require GA mode. */
+ if ((h->efr_reg & (0x1 << IOMMU_EFR_GASUP_SHIFT)) == 0) {
amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY;
break;
}
@@ -1792,6 +1906,11 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h,
if (h->efr_reg & BIT(IOMMU_EFR_XTSUP_SHIFT))
amd_iommu_xt_mode = IRQ_REMAP_X2APIC_MODE;
+ if (h->efr_attr & BIT(IOMMU_IVHD_ATTR_HATDIS_SHIFT)) {
+ pr_warn_once("Host Address Translation is not supported.\n");
+ amd_iommu_hatdis = true;
+ }
+
early_iommu_features_init(iommu, h);
break;
@@ -1811,14 +1930,9 @@ static int __init init_iommu_one_late(struct amd_iommu *iommu)
{
int ret;
- if (alloc_cwwb_sem(iommu))
- return -ENOMEM;
-
- if (alloc_command_buffer(iommu))
- return -ENOMEM;
-
- if (alloc_event_buffer(iommu))
- return -ENOMEM;
+ ret = alloc_iommu_buffers(iommu);
+ if (ret)
+ return ret;
iommu->int_enabled = false;
@@ -2024,9 +2138,6 @@ static int __init iommu_init_pci(struct amd_iommu *iommu)
if (!iommu->dev)
return -ENODEV;
- /* Prevent binding other PCI device drivers to IOMMU devices */
- iommu->dev->match_driver = false;
-
/* ACPI _PRT won't have an IRQ for IOMMU */
iommu->dev->irq_managed = 1;
@@ -2042,14 +2153,12 @@ static int __init iommu_init_pci(struct amd_iommu *iommu)
int glxval;
u64 pasmax;
- pasmax = amd_iommu_efr & FEATURE_PASID_MASK;
- pasmax >>= FEATURE_PASID_SHIFT;
+ pasmax = FIELD_GET(FEATURE_PASMAX, amd_iommu_efr);
iommu->iommu.max_pasids = (1 << (pasmax + 1)) - 1;
BUG_ON(iommu->iommu.max_pasids & ~PASID_MASK);
- glxval = amd_iommu_efr & FEATURE_GLXVAL_MASK;
- glxval >>= FEATURE_GLXVAL_SHIFT;
+ glxval = FIELD_GET(FEATURE_GLX, amd_iommu_efr);
if (amd_iommu_max_glx_val == -1)
amd_iommu_max_glx_val = glxval;
@@ -2070,14 +2179,6 @@ static int __init iommu_init_pci(struct amd_iommu *iommu)
init_iommu_perf_ctr(iommu);
- if (amd_iommu_pgtable == AMD_IOMMU_V2) {
- if (!check_feature(FEATURE_GIOSUP) ||
- !check_feature(FEATURE_GT)) {
- pr_warn("Cannot enable v2 page table for DMA-API. Fallback to v1.\n");
- amd_iommu_pgtable = AMD_IOMMU_V1;
- }
- }
-
if (is_rd890_iommu(iommu->dev)) {
int i, j;
@@ -2125,7 +2226,15 @@ static int __init iommu_init_pci(struct amd_iommu *iommu)
return ret;
}
- iommu_device_register(&iommu->iommu, &amd_iommu_ops, NULL);
+ ret = iommu_device_register(&iommu->iommu, &amd_iommu_ops, NULL);
+ if (ret || amd_iommu_pgtable == PD_MODE_NONE) {
+ /*
+ * Remove sysfs if DMA translation is not supported by the
+ * IOMMU. Do not return an error to enable IRQ remapping
+ * in state_next(), DTE[V, TV] must eventually be set to 0.
+ */
+ iommu_device_sysfs_remove(&iommu->iommu);
+ }
return pci_enable_device(iommu->dev);
}
@@ -2152,6 +2261,9 @@ static void print_iommu_info(void)
if (check_feature(FEATURE_SNP))
pr_cont(" SNP");
+ if (check_feature2(FEATURE_SEVSNPIO_SUP))
+ pr_cont(" SEV-TIO");
+
pr_cont("\n");
}
@@ -2160,7 +2272,7 @@ static void print_iommu_info(void)
if (amd_iommu_xt_mode == IRQ_REMAP_X2APIC_MODE)
pr_info("X2APIC enabled\n");
}
- if (amd_iommu_pgtable == AMD_IOMMU_V2) {
+ if (amd_iommu_pgtable == PD_MODE_V2) {
pr_info("V2 page table enabled (Paging mode : %d level)\n",
amd_iommu_gpt_level);
}
@@ -2172,6 +2284,9 @@ static int __init amd_iommu_init_pci(void)
struct amd_iommu_pci_seg *pci_seg;
int ret;
+ /* Init global identity domain before registering IOMMU */
+ amd_iommu_init_identity_domain();
+
for_each_iommu(iommu) {
ret = iommu_init_pci(iommu);
if (ret) {
@@ -2344,7 +2459,7 @@ static struct irq_chip intcapxt_controller = {
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_set_affinity = intcapxt_set_affinity,
.irq_set_wake = intcapxt_set_wake,
- .flags = IRQCHIP_MASK_ON_SUSPEND,
+ .flags = IRQCHIP_MASK_ON_SUSPEND | IRQCHIP_MOVE_DEFERRED,
};
static const struct irq_domain_ops intcapxt_domain_ops = {
@@ -2583,13 +2698,13 @@ static void init_device_table_dma(struct amd_iommu_pci_seg *pci_seg)
u32 devid;
struct dev_table_entry *dev_table = pci_seg->dev_table;
- if (dev_table == NULL)
+ if (!dev_table || amd_iommu_pgtable == PD_MODE_NONE)
return;
for (devid = 0; devid <= pci_seg->last_bdf; ++devid) {
- __set_dev_entry_bit(dev_table, devid, DEV_ENTRY_VALID);
+ set_dte_bit(&dev_table[devid], DEV_ENTRY_VALID);
if (!amd_iommu_snp_en)
- __set_dev_entry_bit(dev_table, devid, DEV_ENTRY_TRANSLATION);
+ set_dte_bit(&dev_table[devid], DEV_ENTRY_TRANSLATION);
}
}
@@ -2617,8 +2732,7 @@ static void init_device_table(void)
for_each_pci_segment(pci_seg) {
for (devid = 0; devid <= pci_seg->last_bdf; ++devid)
- __set_dev_entry_bit(pci_seg->dev_table,
- devid, DEV_ENTRY_IRQ_TBL_EN);
+ set_dte_bit(&pci_seg->dev_table[devid], DEV_ENTRY_IRQ_TBL_EN);
}
}
@@ -2646,7 +2760,11 @@ static void iommu_init_flags(struct amd_iommu *iommu)
iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
/* Set IOTLB invalidation timeout to 1s */
- iommu_set_inv_tlb_timeout(iommu, CTRL_INV_TO_1S);
+ iommu_feature_set(iommu, CTRL_INV_TO_1S, CTRL_INV_TO_MASK, CONTROL_INV_TIMEOUT);
+
+ /* Enable Enhanced Peripheral Page Request Handling */
+ if (check_feature(FEATURE_EPHSUP))
+ iommu_feature_enable(iommu, CONTROL_EPH_EN);
}
static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
@@ -2735,6 +2853,17 @@ static void iommu_enable_irtcachedis(struct amd_iommu *iommu)
iommu->irtcachedis_enabled ? "disabled" : "enabled");
}
+static void iommu_enable_2k_int(struct amd_iommu *iommu)
+{
+ if (!FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2))
+ return;
+
+ iommu_feature_set(iommu,
+ CONTROL_NUM_INT_REMAP_MODE_2K,
+ CONTROL_NUM_INT_REMAP_MODE_MASK,
+ CONTROL_NUM_INT_REMAP_MODE);
+}
+
static void early_enable_iommu(struct amd_iommu *iommu)
{
iommu_disable(iommu);
@@ -2747,6 +2876,7 @@ static void early_enable_iommu(struct amd_iommu *iommu)
iommu_enable_ga(iommu);
iommu_enable_xt(iommu);
iommu_enable_irtcachedis(iommu);
+ iommu_enable_2k_int(iommu);
iommu_enable(iommu);
amd_iommu_flush_all_caches(iommu);
}
@@ -2755,8 +2885,8 @@ static void early_enable_iommu(struct amd_iommu *iommu)
* This function finally enables all IOMMUs found in the system after
* they have been initialized.
*
- * Or if in kdump kernel and IOMMUs are all pre-enabled, try to copy
- * the old content of device table entries. Not this case or copy failed,
+ * Or if in kdump kernel and IOMMUs are all pre-enabled, try to reuse
+ * the old content of device table entries. Not this case or reuse failed,
* just continue as normal kernel does.
*/
static void early_enable_iommus(void)
@@ -2764,19 +2894,25 @@ static void early_enable_iommus(void)
struct amd_iommu *iommu;
struct amd_iommu_pci_seg *pci_seg;
- if (!copy_device_table()) {
+ if (!reuse_device_table()) {
/*
- * If come here because of failure in copying device table from old
+ * If come here because of failure in reusing device table from old
* kernel with all IOMMUs enabled, print error message and try to
* free allocated old_dev_tbl_cpy.
*/
- if (amd_iommu_pre_enabled)
- pr_err("Failed to copy DEV table from previous kernel.\n");
+ if (amd_iommu_pre_enabled) {
+ pr_err("Failed to reuse DEV table from previous kernel.\n");
+ /*
+ * Bail out early if unable to remap/reuse DEV table from
+ * previous kernel if SNP enabled as IOMMU commands will
+ * time out without DEV table and cause kdump boot panic.
+ */
+ BUG_ON(check_feature(FEATURE_SNP));
+ }
for_each_pci_segment(pci_seg) {
if (pci_seg->old_dev_tbl_cpy != NULL) {
- iommu_free_pages(pci_seg->old_dev_tbl_cpy,
- get_order(pci_seg->dev_table_size));
+ memunmap((void *)pci_seg->old_dev_tbl_cpy);
pci_seg->old_dev_tbl_cpy = NULL;
}
}
@@ -2786,11 +2922,10 @@ static void early_enable_iommus(void)
early_enable_iommu(iommu);
}
} else {
- pr_info("Copied DEV table from previous kernel.\n");
+ pr_info("Reused DEV table from previous kernel.\n");
for_each_pci_segment(pci_seg) {
- iommu_free_pages(pci_seg->dev_table,
- get_order(pci_seg->dev_table_size));
+ iommu_free_pages(pci_seg->dev_table);
pci_seg->dev_table = pci_seg->old_dev_tbl_cpy;
}
@@ -2803,6 +2938,7 @@ static void early_enable_iommus(void)
iommu_enable_ga(iommu);
iommu_enable_xt(iommu);
iommu_enable_irtcachedis(iommu);
+ iommu_enable_2k_int(iommu);
iommu_set_device_table(iommu);
amd_iommu_flush_all_caches(iommu);
}
@@ -2882,11 +3018,6 @@ static void enable_iommus_vapic(void)
#endif
}
-static void enable_iommus(void)
-{
- early_enable_iommus();
-}
-
static void disable_iommus(void)
{
struct amd_iommu *iommu;
@@ -2905,7 +3036,7 @@ static void disable_iommus(void)
* disable suspend until real resume implemented
*/
-static void amd_iommu_resume(void)
+static void amd_iommu_resume(void *data)
{
struct amd_iommu *iommu;
@@ -2913,12 +3044,13 @@ static void amd_iommu_resume(void)
iommu_apply_resume_quirks(iommu);
/* re-load the hardware */
- enable_iommus();
+ for_each_iommu(iommu)
+ early_enable_iommu(iommu);
amd_iommu_enable_interrupts();
}
-static int amd_iommu_suspend(void)
+static int amd_iommu_suspend(void *data)
{
/* disable IOMMUs to go out of the way for BIOS */
disable_iommus();
@@ -2926,16 +3058,17 @@ static int amd_iommu_suspend(void)
return 0;
}
-static struct syscore_ops amd_iommu_syscore_ops = {
+static const struct syscore_ops amd_iommu_syscore_ops = {
.suspend = amd_iommu_suspend,
.resume = amd_iommu_resume,
};
+static struct syscore amd_iommu_syscore = {
+ .ops = &amd_iommu_syscore_ops,
+};
+
static void __init free_iommu_resources(void)
{
- kmem_cache_destroy(amd_iommu_irq_cache);
- amd_iommu_irq_cache = NULL;
-
free_iommu_all();
free_pci_segments();
}
@@ -2994,9 +3127,7 @@ static bool __init check_ioapic_information(void)
static void __init free_dma_resources(void)
{
- iommu_free_pages(amd_iommu_pd_alloc_bitmap,
- get_order(MAX_DOMAIN_ID / 8));
- amd_iommu_pd_alloc_bitmap = NULL;
+ ida_destroy(&pdom_ids);
free_unity_maps();
}
@@ -3036,8 +3167,9 @@ static void __init ivinfo_init(void *ivrs)
static int __init early_amd_iommu_init(void)
{
struct acpi_table_header *ivrs_base;
- int remap_cache_sz, ret;
+ int ret;
acpi_status status;
+ u8 efr_hats;
if (!amd_iommu_detected)
return -ENODEV;
@@ -3051,6 +3183,12 @@ static int __init early_amd_iommu_init(void)
return -EINVAL;
}
+ if (!boot_cpu_has(X86_FEATURE_CX16)) {
+ pr_err("Failed to initialize. The CMPXCHG16B feature is required.\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
/*
* Validate checksum here so we don't need to do it when
* we actually parse the table
@@ -3064,20 +3202,6 @@ static int __init early_amd_iommu_init(void)
amd_iommu_target_ivhd_type = get_highest_supported_ivhd_type(ivrs_base);
DUMP_printk("Using IVHD type %#x\n", amd_iommu_target_ivhd_type);
- /* Device table - directly used by all IOMMUs */
- ret = -ENOMEM;
-
- amd_iommu_pd_alloc_bitmap = iommu_alloc_pages(GFP_KERNEL,
- get_order(MAX_DOMAIN_ID / 8));
- if (amd_iommu_pd_alloc_bitmap == NULL)
- goto out;
-
- /*
- * never allocate domain 0 because its used as the non-allocated and
- * error value placeholder
- */
- __set_bit(0, amd_iommu_pd_alloc_bitmap);
-
/*
* now the data structures are allocated and basically initialized
* start the real acpi table scan
@@ -3088,9 +3212,40 @@ static int __init early_amd_iommu_init(void)
/* 5 level guest page table */
if (cpu_feature_enabled(X86_FEATURE_LA57) &&
- check_feature_gpt_level() == GUEST_PGTABLE_5_LEVEL)
+ FIELD_GET(FEATURE_GATS, amd_iommu_efr) == GUEST_PGTABLE_5_LEVEL)
amd_iommu_gpt_level = PAGE_MODE_5_LEVEL;
+ efr_hats = FIELD_GET(FEATURE_HATS, amd_iommu_efr);
+ if (efr_hats != 0x3) {
+ /*
+ * efr[HATS] bits specify the maximum host translation level
+ * supported, with LEVEL 4 being initial max level.
+ */
+ amd_iommu_hpt_level = efr_hats + PAGE_MODE_4_LEVEL;
+ } else {
+ pr_warn_once(FW_BUG "Disable host address translation due to invalid translation level (%#x).\n",
+ efr_hats);
+ amd_iommu_hatdis = true;
+ }
+
+ if (amd_iommu_pgtable == PD_MODE_V2) {
+ if (!amd_iommu_v2_pgtbl_supported()) {
+ pr_warn("Cannot enable v2 page table for DMA-API. Fallback to v1.\n");
+ amd_iommu_pgtable = PD_MODE_V1;
+ }
+ }
+
+ if (amd_iommu_hatdis) {
+ /*
+ * Host (v1) page table is not available. Attempt to use
+ * Guest (v2) page table.
+ */
+ if (amd_iommu_v2_pgtbl_supported())
+ amd_iommu_pgtable = PD_MODE_V2;
+ else
+ amd_iommu_pgtable = PD_MODE_NONE;
+ }
+
/* Disable any previously enabled IOMMUs */
if (!is_kdump_kernel() || amd_iommu_disabled)
disable_iommus();
@@ -3100,22 +3255,7 @@ static int __init early_amd_iommu_init(void)
if (amd_iommu_irq_remap) {
struct amd_iommu_pci_seg *pci_seg;
- /*
- * Interrupt remapping enabled, create kmem_cache for the
- * remapping tables.
- */
ret = -ENOMEM;
- if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
- remap_cache_sz = MAX_IRQS_PER_TABLE * sizeof(u32);
- else
- remap_cache_sz = MAX_IRQS_PER_TABLE * (sizeof(u64) * 2);
- amd_iommu_irq_cache = kmem_cache_create("irq_remap_cache",
- remap_cache_sz,
- DTE_INTTAB_ALIGNMENT,
- 0, NULL);
- if (!amd_iommu_irq_cache)
- goto out;
-
for_each_pci_segment(pci_seg) {
if (alloc_irq_lookup_table(pci_seg))
goto out;
@@ -3196,7 +3336,7 @@ out:
return true;
}
-static void iommu_snp_enable(void)
+static __init void iommu_snp_enable(void)
{
#ifdef CONFIG_KVM_AMD_SEV
if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
@@ -3210,7 +3350,7 @@ static void iommu_snp_enable(void)
goto disable_snp;
}
- if (amd_iommu_pgtable != AMD_IOMMU_V1) {
+ if (amd_iommu_pgtable != PD_MODE_V1) {
pr_warn("SNP: IOMMU is configured with V2 page table mode, SNP cannot be supported.\n");
goto disable_snp;
}
@@ -3221,6 +3361,14 @@ static void iommu_snp_enable(void)
goto disable_snp;
}
+ /*
+ * Enable host SNP support once SNP support is checked on IOMMU.
+ */
+ if (snp_rmptable_init()) {
+ pr_warn("SNP: RMP initialization failed, SNP cannot be supported.\n");
+ goto disable_snp;
+ }
+
pr_info("IOMMU SNP support enabled.\n");
return;
@@ -3263,7 +3411,7 @@ static int __init state_next(void)
init_state = IOMMU_ENABLED;
break;
case IOMMU_ENABLED:
- register_syscore_ops(&amd_iommu_syscore_ops);
+ register_syscore(&amd_iommu_syscore);
iommu_snp_enable();
ret = amd_iommu_init_pci();
init_state = ret ? IOMMU_INIT_ERROR : IOMMU_PCI_INIT;
@@ -3320,6 +3468,19 @@ static int __init iommu_go_to_state(enum iommu_init_state state)
ret = state_next();
}
+ /*
+ * SNP platform initilazation requires IOMMUs to be fully configured.
+ * If the SNP support on IOMMUs has NOT been checked, simply mark SNP
+ * as unsupported. If the SNP support on IOMMUs has been checked and
+ * host SNP support enabled but RMP enforcement has not been enabled
+ * in IOMMUs, then the system is in a half-baked state, but can limp
+ * along as all memory should be Hypervisor-Owned in the RMP. WARN,
+ * but leave SNP as "supported" to avoid confusing the kernel.
+ */
+ if (ret && cc_platform_has(CC_ATTR_HOST_SEV_SNP) &&
+ !WARN_ON_ONCE(amd_iommu_snp_en))
+ cc_platform_clear(CC_ATTR_HOST_SEV_SNP);
+
return ret;
}
@@ -3353,12 +3514,12 @@ int __init amd_iommu_enable(void)
void amd_iommu_disable(void)
{
- amd_iommu_suspend();
+ amd_iommu_suspend(NULL);
}
int amd_iommu_reenable(int mode)
{
- amd_iommu_resume();
+ amd_iommu_resume(NULL);
return 0;
}
@@ -3377,7 +3538,6 @@ int amd_iommu_enable_faulting(unsigned int cpu)
*/
static int __init amd_iommu_init(void)
{
- struct amd_iommu *iommu;
int ret;
ret = iommu_go_to_state(IOMMU_INITIALIZED);
@@ -3391,8 +3551,8 @@ static int __init amd_iommu_init(void)
}
#endif
- for_each_iommu(iommu)
- amd_iommu_debugfs_setup(iommu);
+ if (!ret)
+ amd_iommu_debugfs_setup();
return ret;
}
@@ -3423,25 +3583,28 @@ static bool amd_iommu_sme_check(void)
* IOMMUs
*
****************************************************************************/
-int __init amd_iommu_detect(void)
+void __init amd_iommu_detect(void)
{
int ret;
if (no_iommu || (iommu_detected && !gart_iommu_aperture))
- return -ENODEV;
+ goto disable_snp;
if (!amd_iommu_sme_check())
- return -ENODEV;
+ goto disable_snp;
ret = iommu_go_to_state(IOMMU_IVRS_DETECTED);
if (ret)
- return ret;
+ goto disable_snp;
amd_iommu_detected = true;
iommu_detected = 1;
x86_init.iommu.iommu_init = amd_iommu_init;
+ return;
- return 1;
+disable_snp:
+ if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ cc_platform_clear(CC_ATTR_HOST_SEV_SNP);
}
/****************************************************************************
@@ -3489,11 +3652,17 @@ static int __init parse_amd_iommu_options(char *str)
} else if (strncmp(str, "force_isolation", 15) == 0) {
amd_iommu_force_isolation = true;
} else if (strncmp(str, "pgtbl_v1", 8) == 0) {
- amd_iommu_pgtable = AMD_IOMMU_V1;
+ amd_iommu_pgtable = PD_MODE_V1;
} else if (strncmp(str, "pgtbl_v2", 8) == 0) {
- amd_iommu_pgtable = AMD_IOMMU_V2;
+ amd_iommu_pgtable = PD_MODE_V2;
} else if (strncmp(str, "irtcachedis", 11) == 0) {
amd_iommu_irtcachedis = true;
+ } else if (strncmp(str, "nohugepages", 11) == 0) {
+ pr_info("Restricting V1 page-sizes to 4KiB");
+ amd_iommu_pgsize_bitmap = AMD_IOMMU_PGSIZES_4K;
+ } else if (strncmp(str, "v2_pgsizes_only", 15) == 0) {
+ pr_info("Restricting V1 page-sizes to 4KiB/2MiB/1GiB");
+ amd_iommu_pgsize_bitmap = AMD_IOMMU_PGSIZES_V2;
} else {
pr_notice("Unknown option - '%s'\n", str);
}
@@ -3588,7 +3757,7 @@ static int __init parse_ivrs_acpihid(char *str)
{
u32 seg = 0, bus, dev, fn;
char *hid, *uid, *p, *addr;
- char acpiid[ACPIID_LEN] = {0};
+ char acpiid[ACPIID_LEN + 1] = { }; /* size with NULL terminator */
int i;
addr = strchr(str, '@');
@@ -3614,7 +3783,7 @@ static int __init parse_ivrs_acpihid(char *str)
/* We have the '@', make it the terminator to get just the acpiid */
*addr++ = 0;
- if (strlen(str) > ACPIID_LEN + 1)
+ if (strlen(str) > ACPIID_LEN)
goto not_found;
if (sscanf(str, "=%s", acpiid) != 1)
@@ -3645,6 +3814,14 @@ found:
while (*uid == '0' && *(uid + 1))
uid++;
+ if (strlen(hid) >= ACPIHID_HID_LEN) {
+ pr_err("Invalid command line: hid is too long\n");
+ return 1;
+ } else if (strlen(uid) >= ACPIHID_UID_LEN) {
+ pr_err("Invalid command line: uid is too long\n");
+ return 1;
+ }
+
i = early_acpihid_map_size++;
memcpy(early_acpihid_map[i].hid, hid, strlen(hid));
memcpy(early_acpihid_map[i].uid, uid, strlen(uid));
@@ -3854,4 +4031,10 @@ int amd_iommu_snp_disable(void)
return 0;
}
EXPORT_SYMBOL_GPL(amd_iommu_snp_disable);
+
+bool amd_iommu_sev_tio_supported(void)
+{
+ return check_feature2(FEATURE_SEVSNPIO_SUP);
+}
+EXPORT_SYMBOL_GPL(amd_iommu_sev_tio_supported);
#endif
diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
deleted file mode 100644
index 9d9a7fde59e7..000000000000
--- a/drivers/iommu/amd/io_pgtable.c
+++ /dev/null
@@ -1,607 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * CPU-agnostic AMD IO page table allocator.
- *
- * Copyright (C) 2020 Advanced Micro Devices, Inc.
- * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
- */
-
-#define pr_fmt(fmt) "AMD-Vi: " fmt
-#define dev_fmt(fmt) pr_fmt(fmt)
-
-#include <linux/atomic.h>
-#include <linux/bitops.h>
-#include <linux/io-pgtable.h>
-#include <linux/kernel.h>
-#include <linux/sizes.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-#include <linux/dma-mapping.h>
-
-#include <asm/barrier.h>
-
-#include "amd_iommu_types.h"
-#include "amd_iommu.h"
-#include "../iommu-pages.h"
-
-static void v1_tlb_flush_all(void *cookie)
-{
-}
-
-static void v1_tlb_flush_walk(unsigned long iova, size_t size,
- size_t granule, void *cookie)
-{
-}
-
-static void v1_tlb_add_page(struct iommu_iotlb_gather *gather,
- unsigned long iova, size_t granule,
- void *cookie)
-{
-}
-
-static const struct iommu_flush_ops v1_flush_ops = {
- .tlb_flush_all = v1_tlb_flush_all,
- .tlb_flush_walk = v1_tlb_flush_walk,
- .tlb_add_page = v1_tlb_add_page,
-};
-
-/*
- * Helper function to get the first pte of a large mapping
- */
-static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
- unsigned long *count)
-{
- unsigned long pte_mask, pg_size, cnt;
- u64 *fpte;
-
- pg_size = PTE_PAGE_SIZE(*pte);
- cnt = PAGE_SIZE_PTE_COUNT(pg_size);
- pte_mask = ~((cnt << 3) - 1);
- fpte = (u64 *)(((unsigned long)pte) & pte_mask);
-
- if (page_size)
- *page_size = pg_size;
-
- if (count)
- *count = cnt;
-
- return fpte;
-}
-
-/****************************************************************************
- *
- * The functions below are used the create the page table mappings for
- * unity mapped regions.
- *
- ****************************************************************************/
-
-static void free_pt_page(u64 *pt, struct list_head *freelist)
-{
- struct page *p = virt_to_page(pt);
-
- list_add_tail(&p->lru, freelist);
-}
-
-static void free_pt_lvl(u64 *pt, struct list_head *freelist, int lvl)
-{
- u64 *p;
- int i;
-
- for (i = 0; i < 512; ++i) {
- /* PTE present? */
- if (!IOMMU_PTE_PRESENT(pt[i]))
- continue;
-
- /* Large PTE? */
- if (PM_PTE_LEVEL(pt[i]) == 0 ||
- PM_PTE_LEVEL(pt[i]) == 7)
- continue;
-
- /*
- * Free the next level. No need to look at l1 tables here since
- * they can only contain leaf PTEs; just free them directly.
- */
- p = IOMMU_PTE_PAGE(pt[i]);
- if (lvl > 2)
- free_pt_lvl(p, freelist, lvl - 1);
- else
- free_pt_page(p, freelist);
- }
-
- free_pt_page(pt, freelist);
-}
-
-static void free_sub_pt(u64 *root, int mode, struct list_head *freelist)
-{
- switch (mode) {
- case PAGE_MODE_NONE:
- case PAGE_MODE_7_LEVEL:
- break;
- case PAGE_MODE_1_LEVEL:
- free_pt_page(root, freelist);
- break;
- case PAGE_MODE_2_LEVEL:
- case PAGE_MODE_3_LEVEL:
- case PAGE_MODE_4_LEVEL:
- case PAGE_MODE_5_LEVEL:
- case PAGE_MODE_6_LEVEL:
- free_pt_lvl(root, freelist, mode);
- break;
- default:
- BUG();
- }
-}
-
-void amd_iommu_domain_set_pgtable(struct protection_domain *domain,
- u64 *root, int mode)
-{
- u64 pt_root;
-
- /* lowest 3 bits encode pgtable mode */
- pt_root = mode & 7;
- pt_root |= (u64)root;
-
- amd_iommu_domain_set_pt_root(domain, pt_root);
-}
-
-/*
- * This function is used to add another level to an IO page table. Adding
- * another level increases the size of the address space by 9 bits to a size up
- * to 64 bits.
- */
-static bool increase_address_space(struct protection_domain *domain,
- unsigned long address,
- gfp_t gfp)
-{
- unsigned long flags;
- bool ret = true;
- u64 *pte;
-
- pte = iommu_alloc_page_node(domain->nid, gfp);
- if (!pte)
- return false;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- if (address <= PM_LEVEL_SIZE(domain->iop.mode))
- goto out;
-
- ret = false;
- if (WARN_ON_ONCE(domain->iop.mode == PAGE_MODE_6_LEVEL))
- goto out;
-
- *pte = PM_LEVEL_PDE(domain->iop.mode, iommu_virt_to_phys(domain->iop.root));
-
- domain->iop.root = pte;
- domain->iop.mode += 1;
- amd_iommu_update_and_flush_device_table(domain);
- amd_iommu_domain_flush_complete(domain);
-
- /*
- * Device Table needs to be updated and flushed before the new root can
- * be published.
- */
- amd_iommu_domain_set_pgtable(domain, pte, domain->iop.mode);
-
- pte = NULL;
- ret = true;
-
-out:
- spin_unlock_irqrestore(&domain->lock, flags);
- iommu_free_page(pte);
-
- return ret;
-}
-
-static u64 *alloc_pte(struct protection_domain *domain,
- unsigned long address,
- unsigned long page_size,
- u64 **pte_page,
- gfp_t gfp,
- bool *updated)
-{
- int level, end_lvl;
- u64 *pte, *page;
-
- BUG_ON(!is_power_of_2(page_size));
-
- while (address > PM_LEVEL_SIZE(domain->iop.mode)) {
- /*
- * Return an error if there is no memory to update the
- * page-table.
- */
- if (!increase_address_space(domain, address, gfp))
- return NULL;
- }
-
-
- level = domain->iop.mode - 1;
- pte = &domain->iop.root[PM_LEVEL_INDEX(level, address)];
- address = PAGE_SIZE_ALIGN(address, page_size);
- end_lvl = PAGE_SIZE_LEVEL(page_size);
-
- while (level > end_lvl) {
- u64 __pte, __npte;
- int pte_level;
-
- __pte = *pte;
- pte_level = PM_PTE_LEVEL(__pte);
-
- /*
- * If we replace a series of large PTEs, we need
- * to tear down all of them.
- */
- if (IOMMU_PTE_PRESENT(__pte) &&
- pte_level == PAGE_MODE_7_LEVEL) {
- unsigned long count, i;
- u64 *lpte;
-
- lpte = first_pte_l7(pte, NULL, &count);
-
- /*
- * Unmap the replicated PTEs that still match the
- * original large mapping
- */
- for (i = 0; i < count; ++i)
- cmpxchg64(&lpte[i], __pte, 0ULL);
-
- *updated = true;
- continue;
- }
-
- if (!IOMMU_PTE_PRESENT(__pte) ||
- pte_level == PAGE_MODE_NONE) {
- page = iommu_alloc_page_node(domain->nid, gfp);
-
- if (!page)
- return NULL;
-
- __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));
-
- /* pte could have been changed somewhere. */
- if (!try_cmpxchg64(pte, &__pte, __npte))
- iommu_free_page(page);
- else if (IOMMU_PTE_PRESENT(__pte))
- *updated = true;
-
- continue;
- }
-
- /* No level skipping support yet */
- if (pte_level != level)
- return NULL;
-
- level -= 1;
-
- pte = IOMMU_PTE_PAGE(__pte);
-
- if (pte_page && level == end_lvl)
- *pte_page = pte;
-
- pte = &pte[PM_LEVEL_INDEX(level, address)];
- }
-
- return pte;
-}
-
-/*
- * This function checks if there is a PTE for a given dma address. If
- * there is one, it returns the pointer to it.
- */
-static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
- unsigned long address,
- unsigned long *page_size)
-{
- int level;
- u64 *pte;
-
- *page_size = 0;
-
- if (address > PM_LEVEL_SIZE(pgtable->mode))
- return NULL;
-
- level = pgtable->mode - 1;
- pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
- *page_size = PTE_LEVEL_PAGE_SIZE(level);
-
- while (level > 0) {
-
- /* Not Present */
- if (!IOMMU_PTE_PRESENT(*pte))
- return NULL;
-
- /* Large PTE */
- if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL ||
- PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE)
- break;
-
- /* No level skipping support yet */
- if (PM_PTE_LEVEL(*pte) != level)
- return NULL;
-
- level -= 1;
-
- /* Walk to the next level */
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[PM_LEVEL_INDEX(level, address)];
- *page_size = PTE_LEVEL_PAGE_SIZE(level);
- }
-
- /*
- * If we have a series of large PTEs, make
- * sure to return a pointer to the first one.
- */
- if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
- pte = first_pte_l7(pte, page_size, NULL);
-
- return pte;
-}
-
-static void free_clear_pte(u64 *pte, u64 pteval, struct list_head *freelist)
-{
- u64 *pt;
- int mode;
-
- while (!try_cmpxchg64(pte, &pteval, 0))
- pr_warn("AMD-Vi: IOMMU pte changed since we read it\n");
-
- if (!IOMMU_PTE_PRESENT(pteval))
- return;
-
- pt = IOMMU_PTE_PAGE(pteval);
- mode = IOMMU_PTE_MODE(pteval);
-
- free_sub_pt(pt, mode, freelist);
-}
-
-/*
- * Generic mapping functions. It maps a physical address into a DMA
- * address space. It allocates the page table pages if necessary.
- * In the future it can be extended to a generic mapping function
- * supporting all features of AMD IOMMU page tables like level skipping
- * and full 64 bit address spaces.
- */
-static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
- phys_addr_t paddr, size_t pgsize, size_t pgcount,
- int prot, gfp_t gfp, size_t *mapped)
-{
- struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
- LIST_HEAD(freelist);
- bool updated = false;
- u64 __pte, *pte;
- int ret, i, count;
- size_t size = pgcount << __ffs(pgsize);
- unsigned long o_iova = iova;
-
- BUG_ON(!IS_ALIGNED(iova, pgsize));
- BUG_ON(!IS_ALIGNED(paddr, pgsize));
-
- ret = -EINVAL;
- if (!(prot & IOMMU_PROT_MASK))
- goto out;
-
- while (pgcount > 0) {
- count = PAGE_SIZE_PTE_COUNT(pgsize);
- pte = alloc_pte(dom, iova, pgsize, NULL, gfp, &updated);
-
- ret = -ENOMEM;
- if (!pte)
- goto out;
-
- for (i = 0; i < count; ++i)
- free_clear_pte(&pte[i], pte[i], &freelist);
-
- if (!list_empty(&freelist))
- updated = true;
-
- if (count > 1) {
- __pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize);
- __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
- } else
- __pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;
-
- if (prot & IOMMU_PROT_IR)
- __pte |= IOMMU_PTE_IR;
- if (prot & IOMMU_PROT_IW)
- __pte |= IOMMU_PTE_IW;
-
- for (i = 0; i < count; ++i)
- pte[i] = __pte;
-
- iova += pgsize;
- paddr += pgsize;
- pgcount--;
- if (mapped)
- *mapped += pgsize;
- }
-
- ret = 0;
-
-out:
- if (updated) {
- unsigned long flags;
-
- spin_lock_irqsave(&dom->lock, flags);
- /*
- * Flush domain TLB(s) and wait for completion. Any Device-Table
- * Updates and flushing already happened in
- * increase_address_space().
- */
- amd_iommu_domain_flush_pages(dom, o_iova, size);
- spin_unlock_irqrestore(&dom->lock, flags);
- }
-
- /* Everything flushed out, free pages now */
- iommu_put_pages_list(&freelist);
-
- return ret;
-}
-
-static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops,
- unsigned long iova,
- size_t pgsize, size_t pgcount,
- struct iommu_iotlb_gather *gather)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- unsigned long long unmapped;
- unsigned long unmap_size;
- u64 *pte;
- size_t size = pgcount << __ffs(pgsize);
-
- BUG_ON(!is_power_of_2(pgsize));
-
- unmapped = 0;
-
- while (unmapped < size) {
- pte = fetch_pte(pgtable, iova, &unmap_size);
- if (pte) {
- int i, count;
-
- count = PAGE_SIZE_PTE_COUNT(unmap_size);
- for (i = 0; i < count; i++)
- pte[i] = 0ULL;
- } else {
- return unmapped;
- }
-
- iova = (iova & ~(unmap_size - 1)) + unmap_size;
- unmapped += unmap_size;
- }
-
- return unmapped;
-}
-
-static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- unsigned long offset_mask, pte_pgsize;
- u64 *pte, __pte;
-
- pte = fetch_pte(pgtable, iova, &pte_pgsize);
-
- if (!pte || !IOMMU_PTE_PRESENT(*pte))
- return 0;
-
- offset_mask = pte_pgsize - 1;
- __pte = __sme_clr(*pte & PM_ADDR_MASK);
-
- return (__pte & ~offset_mask) | (iova & offset_mask);
-}
-
-static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
- unsigned long flags)
-{
- bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
- bool dirty = false;
- int i, count;
-
- /*
- * 2.2.3.2 Host Dirty Support
- * When a non-default page size is used , software must OR the
- * Dirty bits in all of the replicated host PTEs used to map
- * the page. The IOMMU does not guarantee the Dirty bits are
- * set in all of the replicated PTEs. Any portion of the page
- * may have been written even if the Dirty bit is set in only
- * one of the replicated PTEs.
- */
- count = PAGE_SIZE_PTE_COUNT(size);
- for (i = 0; i < count && test_only; i++) {
- if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
- dirty = true;
- break;
- }
- }
-
- for (i = 0; i < count && !test_only; i++) {
- if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
- (unsigned long *)&ptep[i])) {
- dirty = true;
- }
- }
-
- return dirty;
-}
-
-static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
- unsigned long iova, size_t size,
- unsigned long flags,
- struct iommu_dirty_bitmap *dirty)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- unsigned long end = iova + size - 1;
-
- do {
- unsigned long pgsize = 0;
- u64 *ptep, pte;
-
- ptep = fetch_pte(pgtable, iova, &pgsize);
- if (ptep)
- pte = READ_ONCE(*ptep);
- if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
- pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
- iova += pgsize;
- continue;
- }
-
- /*
- * Mark the whole IOVA range as dirty even if only one of
- * the replicated PTEs were marked dirty.
- */
- if (pte_test_and_clear_dirty(ptep, pgsize, flags))
- iommu_dirty_bitmap_record(dirty, iova, pgsize);
- iova += pgsize;
- } while (iova < end);
-
- return 0;
-}
-
-/*
- * ----------------------------------------------------
- */
-static void v1_free_pgtable(struct io_pgtable *iop)
-{
- struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, iop);
- struct protection_domain *dom;
- LIST_HEAD(freelist);
-
- if (pgtable->mode == PAGE_MODE_NONE)
- return;
-
- dom = container_of(pgtable, struct protection_domain, iop);
-
- /* Page-table is not visible to IOMMU anymore, so free it */
- BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
- pgtable->mode > PAGE_MODE_6_LEVEL);
-
- free_sub_pt(pgtable->root, pgtable->mode, &freelist);
-
- /* Update data structure */
- amd_iommu_domain_clr_pt_root(dom);
-
- /* Make changes visible to IOMMUs */
- amd_iommu_domain_update(dom);
-
- iommu_put_pages_list(&freelist);
-}
-
-static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);
-
- cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES,
- cfg->ias = IOMMU_IN_ADDR_BIT_SIZE,
- cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE,
- cfg->tlb = &v1_flush_ops;
-
- pgtable->iop.ops.map_pages = iommu_v1_map_pages;
- pgtable->iop.ops.unmap_pages = iommu_v1_unmap_pages;
- pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;
- pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;
-
- return &pgtable->iop;
-}
-
-struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
- .alloc = v1_alloc_pgtable,
- .free = v1_free_pgtable,
-};
diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c
deleted file mode 100644
index 78ac37c5ccc1..000000000000
--- a/drivers/iommu/amd/io_pgtable_v2.c
+++ /dev/null
@@ -1,387 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * CPU-agnostic AMD IO page table v2 allocator.
- *
- * Copyright (C) 2022, 2023 Advanced Micro Devices, Inc.
- * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
- * Author: Vasant Hegde <vasant.hegde@amd.com>
- */
-
-#define pr_fmt(fmt) "AMD-Vi: " fmt
-#define dev_fmt(fmt) pr_fmt(fmt)
-
-#include <linux/bitops.h>
-#include <linux/io-pgtable.h>
-#include <linux/kernel.h>
-
-#include <asm/barrier.h>
-
-#include "amd_iommu_types.h"
-#include "amd_iommu.h"
-#include "../iommu-pages.h"
-
-#define IOMMU_PAGE_PRESENT BIT_ULL(0) /* Is present */
-#define IOMMU_PAGE_RW BIT_ULL(1) /* Writeable */
-#define IOMMU_PAGE_USER BIT_ULL(2) /* Userspace addressable */
-#define IOMMU_PAGE_PWT BIT_ULL(3) /* Page write through */
-#define IOMMU_PAGE_PCD BIT_ULL(4) /* Page cache disabled */
-#define IOMMU_PAGE_ACCESS BIT_ULL(5) /* Was accessed (updated by IOMMU) */
-#define IOMMU_PAGE_DIRTY BIT_ULL(6) /* Was written to (updated by IOMMU) */
-#define IOMMU_PAGE_PSE BIT_ULL(7) /* Page Size Extensions */
-#define IOMMU_PAGE_NX BIT_ULL(63) /* No execute */
-
-#define MAX_PTRS_PER_PAGE 512
-
-#define IOMMU_PAGE_SIZE_2M BIT_ULL(21)
-#define IOMMU_PAGE_SIZE_1G BIT_ULL(30)
-
-
-static inline int get_pgtable_level(void)
-{
- return amd_iommu_gpt_level;
-}
-
-static inline bool is_large_pte(u64 pte)
-{
- return (pte & IOMMU_PAGE_PSE);
-}
-
-static inline u64 set_pgtable_attr(u64 *page)
-{
- u64 prot;
-
- prot = IOMMU_PAGE_PRESENT | IOMMU_PAGE_RW | IOMMU_PAGE_USER;
- prot |= IOMMU_PAGE_ACCESS | IOMMU_PAGE_DIRTY;
-
- return (iommu_virt_to_phys(page) | prot);
-}
-
-static inline void *get_pgtable_pte(u64 pte)
-{
- return iommu_phys_to_virt(pte & PM_ADDR_MASK);
-}
-
-static u64 set_pte_attr(u64 paddr, u64 pg_size, int prot)
-{
- u64 pte;
-
- pte = __sme_set(paddr & PM_ADDR_MASK);
- pte |= IOMMU_PAGE_PRESENT | IOMMU_PAGE_USER;
- pte |= IOMMU_PAGE_ACCESS | IOMMU_PAGE_DIRTY;
-
- if (prot & IOMMU_PROT_IW)
- pte |= IOMMU_PAGE_RW;
-
- /* Large page */
- if (pg_size == IOMMU_PAGE_SIZE_1G || pg_size == IOMMU_PAGE_SIZE_2M)
- pte |= IOMMU_PAGE_PSE;
-
- return pte;
-}
-
-static inline u64 get_alloc_page_size(u64 size)
-{
- if (size >= IOMMU_PAGE_SIZE_1G)
- return IOMMU_PAGE_SIZE_1G;
-
- if (size >= IOMMU_PAGE_SIZE_2M)
- return IOMMU_PAGE_SIZE_2M;
-
- return PAGE_SIZE;
-}
-
-static inline int page_size_to_level(u64 pg_size)
-{
- if (pg_size == IOMMU_PAGE_SIZE_1G)
- return PAGE_MODE_3_LEVEL;
- if (pg_size == IOMMU_PAGE_SIZE_2M)
- return PAGE_MODE_2_LEVEL;
-
- return PAGE_MODE_1_LEVEL;
-}
-
-static void free_pgtable(u64 *pt, int level)
-{
- u64 *p;
- int i;
-
- for (i = 0; i < MAX_PTRS_PER_PAGE; i++) {
- /* PTE present? */
- if (!IOMMU_PTE_PRESENT(pt[i]))
- continue;
-
- if (is_large_pte(pt[i]))
- continue;
-
- /*
- * Free the next level. No need to look at l1 tables here since
- * they can only contain leaf PTEs; just free them directly.
- */
- p = get_pgtable_pte(pt[i]);
- if (level > 2)
- free_pgtable(p, level - 1);
- else
- iommu_free_page(p);
- }
-
- iommu_free_page(pt);
-}
-
-/* Allocate page table */
-static u64 *v2_alloc_pte(int nid, u64 *pgd, unsigned long iova,
- unsigned long pg_size, gfp_t gfp, bool *updated)
-{
- u64 *pte, *page;
- int level, end_level;
-
- level = get_pgtable_level() - 1;
- end_level = page_size_to_level(pg_size);
- pte = &pgd[PM_LEVEL_INDEX(level, iova)];
- iova = PAGE_SIZE_ALIGN(iova, PAGE_SIZE);
-
- while (level >= end_level) {
- u64 __pte, __npte;
-
- __pte = *pte;
-
- if (IOMMU_PTE_PRESENT(__pte) && is_large_pte(__pte)) {
- /* Unmap large pte */
- cmpxchg64(pte, *pte, 0ULL);
- *updated = true;
- continue;
- }
-
- if (!IOMMU_PTE_PRESENT(__pte)) {
- page = iommu_alloc_page_node(nid, gfp);
- if (!page)
- return NULL;
-
- __npte = set_pgtable_attr(page);
- /* pte could have been changed somewhere. */
- if (cmpxchg64(pte, __pte, __npte) != __pte)
- iommu_free_page(page);
- else if (IOMMU_PTE_PRESENT(__pte))
- *updated = true;
-
- continue;
- }
-
- level -= 1;
- pte = get_pgtable_pte(__pte);
- pte = &pte[PM_LEVEL_INDEX(level, iova)];
- }
-
- /* Tear down existing pte entries */
- if (IOMMU_PTE_PRESENT(*pte)) {
- u64 *__pte;
-
- *updated = true;
- __pte = get_pgtable_pte(*pte);
- cmpxchg64(pte, *pte, 0ULL);
- if (pg_size == IOMMU_PAGE_SIZE_1G)
- free_pgtable(__pte, end_level - 1);
- else if (pg_size == IOMMU_PAGE_SIZE_2M)
- iommu_free_page(__pte);
- }
-
- return pte;
-}
-
-/*
- * This function checks if there is a PTE for a given dma address.
- * If there is one, it returns the pointer to it.
- */
-static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
- unsigned long iova, unsigned long *page_size)
-{
- u64 *pte;
- int level;
-
- level = get_pgtable_level() - 1;
- pte = &pgtable->pgd[PM_LEVEL_INDEX(level, iova)];
- /* Default page size is 4K */
- *page_size = PAGE_SIZE;
-
- while (level) {
- /* Not present */
- if (!IOMMU_PTE_PRESENT(*pte))
- return NULL;
-
- /* Walk to the next level */
- pte = get_pgtable_pte(*pte);
- pte = &pte[PM_LEVEL_INDEX(level - 1, iova)];
-
- /* Large page */
- if (is_large_pte(*pte)) {
- if (level == PAGE_MODE_3_LEVEL)
- *page_size = IOMMU_PAGE_SIZE_1G;
- else if (level == PAGE_MODE_2_LEVEL)
- *page_size = IOMMU_PAGE_SIZE_2M;
- else
- return NULL; /* Wrongly set PSE bit in PTE */
-
- break;
- }
-
- level -= 1;
- }
-
- return pte;
-}
-
-static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
- phys_addr_t paddr, size_t pgsize, size_t pgcount,
- int prot, gfp_t gfp, size_t *mapped)
-{
- struct protection_domain *pdom = io_pgtable_ops_to_domain(ops);
- struct io_pgtable_cfg *cfg = &pdom->iop.iop.cfg;
- u64 *pte;
- unsigned long map_size;
- unsigned long mapped_size = 0;
- unsigned long o_iova = iova;
- size_t size = pgcount << __ffs(pgsize);
- int ret = 0;
- bool updated = false;
-
- if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize) || !pgcount)
- return -EINVAL;
-
- if (!(prot & IOMMU_PROT_MASK))
- return -EINVAL;
-
- while (mapped_size < size) {
- map_size = get_alloc_page_size(pgsize);
- pte = v2_alloc_pte(pdom->nid, pdom->iop.pgd,
- iova, map_size, gfp, &updated);
- if (!pte) {
- ret = -EINVAL;
- goto out;
- }
-
- *pte = set_pte_attr(paddr, map_size, prot);
-
- iova += map_size;
- paddr += map_size;
- mapped_size += map_size;
- }
-
-out:
- if (updated)
- amd_iommu_domain_flush_pages(pdom, o_iova, size);
-
- if (mapped)
- *mapped += mapped_size;
-
- return ret;
-}
-
-static unsigned long iommu_v2_unmap_pages(struct io_pgtable_ops *ops,
- unsigned long iova,
- size_t pgsize, size_t pgcount,
- struct iommu_iotlb_gather *gather)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- struct io_pgtable_cfg *cfg = &pgtable->iop.cfg;
- unsigned long unmap_size;
- unsigned long unmapped = 0;
- size_t size = pgcount << __ffs(pgsize);
- u64 *pte;
-
- if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize || !pgcount))
- return 0;
-
- while (unmapped < size) {
- pte = fetch_pte(pgtable, iova, &unmap_size);
- if (!pte)
- return unmapped;
-
- *pte = 0ULL;
-
- iova = (iova & ~(unmap_size - 1)) + unmap_size;
- unmapped += unmap_size;
- }
-
- return unmapped;
-}
-
-static phys_addr_t iommu_v2_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
- unsigned long offset_mask, pte_pgsize;
- u64 *pte, __pte;
-
- pte = fetch_pte(pgtable, iova, &pte_pgsize);
- if (!pte || !IOMMU_PTE_PRESENT(*pte))
- return 0;
-
- offset_mask = pte_pgsize - 1;
- __pte = __sme_clr(*pte & PM_ADDR_MASK);
-
- return (__pte & ~offset_mask) | (iova & offset_mask);
-}
-
-/*
- * ----------------------------------------------------
- */
-static void v2_tlb_flush_all(void *cookie)
-{
-}
-
-static void v2_tlb_flush_walk(unsigned long iova, size_t size,
- size_t granule, void *cookie)
-{
-}
-
-static void v2_tlb_add_page(struct iommu_iotlb_gather *gather,
- unsigned long iova, size_t granule,
- void *cookie)
-{
-}
-
-static const struct iommu_flush_ops v2_flush_ops = {
- .tlb_flush_all = v2_tlb_flush_all,
- .tlb_flush_walk = v2_tlb_flush_walk,
- .tlb_add_page = v2_tlb_add_page,
-};
-
-static void v2_free_pgtable(struct io_pgtable *iop)
-{
- struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, iop);
-
- if (!pgtable || !pgtable->pgd)
- return;
-
- /* Free page table */
- free_pgtable(pgtable->pgd, get_pgtable_level());
- pgtable->pgd = NULL;
-}
-
-static struct io_pgtable *v2_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
-{
- struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);
- struct protection_domain *pdom = (struct protection_domain *)cookie;
- int ias = IOMMU_IN_ADDR_BIT_SIZE;
-
- pgtable->pgd = iommu_alloc_page_node(pdom->nid, GFP_ATOMIC);
- if (!pgtable->pgd)
- return NULL;
-
- if (get_pgtable_level() == PAGE_MODE_5_LEVEL)
- ias = 57;
-
- pgtable->iop.ops.map_pages = iommu_v2_map_pages;
- pgtable->iop.ops.unmap_pages = iommu_v2_unmap_pages;
- pgtable->iop.ops.iova_to_phys = iommu_v2_iova_to_phys;
-
- cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES_V2,
- cfg->ias = ias,
- cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE,
- cfg->tlb = &v2_flush_ops;
-
- return &pgtable->iop;
-}
-
-struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns = {
- .alloc = v2_alloc_pgtable,
- .free = v2_free_pgtable,
-};
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index b19e8c0f48fa..9f1d56a5e145 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -14,20 +14,22 @@
#include <linux/pci-ats.h>
#include <linux/bitmap.h>
#include <linux/slab.h>
+#include <linux/string_choices.h>
#include <linux/debugfs.h>
#include <linux/scatterlist.h>
#include <linux/dma-map-ops.h>
#include <linux/dma-direct.h>
+#include <linux/idr.h>
#include <linux/iommu-helper.h>
#include <linux/delay.h>
#include <linux/amd-iommu.h>
#include <linux/notifier.h>
#include <linux/export.h>
#include <linux/irq.h>
+#include <linux/irqchip/irq-msi-lib.h>
#include <linux/msi.h>
#include <linux/irqdomain.h>
#include <linux/percpu.h>
-#include <linux/io-pgtable.h>
#include <linux/cc_platform.h>
#include <asm/irq_remapping.h>
#include <asm/io_apic.h>
@@ -38,9 +40,9 @@
#include <asm/gart.h>
#include <asm/dma.h>
#include <uapi/linux/iommufd.h>
+#include <linux/generic_pt/iommu.h>
#include "amd_iommu.h"
-#include "../dma-iommu.h"
#include "../irq_remapping.h"
#include "../iommu-pages.h"
@@ -52,32 +54,36 @@
#define HT_RANGE_START (0xfd00000000ULL)
#define HT_RANGE_END (0xffffffffffULL)
-#define DEFAULT_PGTABLE_LEVEL PAGE_MODE_3_LEVEL
-
-static DEFINE_SPINLOCK(pd_bitmap_lock);
-
LIST_HEAD(ioapic_map);
LIST_HEAD(hpet_map);
LIST_HEAD(acpihid_map);
const struct iommu_ops amd_iommu_ops;
-static const struct iommu_dirty_ops amd_dirty_ops;
int amd_iommu_max_glx_val = -1;
/*
- * general struct to manage commands send to an IOMMU
+ * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap
+ * to know which ones are already in use.
*/
-struct iommu_cmd {
- u32 data[4];
-};
+DEFINE_IDA(pdom_ids);
-struct kmem_cache *amd_iommu_irq_cache;
-
-static void detach_device(struct device *dev);
+static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev,
+ struct iommu_domain *old);
static void set_dte_entry(struct amd_iommu *iommu,
- struct iommu_dev_data *dev_data);
+ struct iommu_dev_data *dev_data,
+ phys_addr_t top_paddr, unsigned int top_level);
+
+static void amd_iommu_change_top(struct pt_iommu *iommu_table,
+ phys_addr_t top_paddr, unsigned int top_level);
+
+static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid);
+
+static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid);
+static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain);
+static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
+ bool enable);
/****************************************************************************
*
@@ -85,6 +91,132 @@ static void set_dte_entry(struct amd_iommu *iommu,
*
****************************************************************************/
+static __always_inline void amd_iommu_atomic128_set(__int128 *ptr, __int128 val)
+{
+ /*
+ * Note:
+ * We use arch_cmpxchg128_local() because:
+ * - Need cmpxchg16b instruction mainly for 128-bit store to DTE
+ * (not necessary for cmpxchg since this function is already
+ * protected by a spin_lock for this DTE).
+ * - Neither need LOCK_PREFIX nor try loop because of the spin_lock.
+ */
+ arch_cmpxchg128_local(ptr, *ptr, val);
+}
+
+static void write_dte_upper128(struct dev_table_entry *ptr, struct dev_table_entry *new)
+{
+ struct dev_table_entry old;
+
+ old.data128[1] = ptr->data128[1];
+ /*
+ * Preserve DTE_DATA2_INTR_MASK. This needs to be
+ * done here since it requires to be inside
+ * spin_lock(&dev_data->dte_lock) context.
+ */
+ new->data[2] &= ~DTE_DATA2_INTR_MASK;
+ new->data[2] |= old.data[2] & DTE_DATA2_INTR_MASK;
+
+ amd_iommu_atomic128_set(&ptr->data128[1], new->data128[1]);
+}
+
+static void write_dte_lower128(struct dev_table_entry *ptr, struct dev_table_entry *new)
+{
+ amd_iommu_atomic128_set(&ptr->data128[0], new->data128[0]);
+}
+
+/*
+ * Note:
+ * IOMMU reads the entire Device Table entry in a single 256-bit transaction
+ * but the driver is programming DTE using 2 128-bit cmpxchg. So, the driver
+ * need to ensure the following:
+ * - DTE[V|GV] bit is being written last when setting.
+ * - DTE[V|GV] bit is being written first when clearing.
+ *
+ * This function is used only by code, which updates DMA translation part of the DTE.
+ * So, only consider control bits related to DMA when updating the entry.
+ */
+static void update_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_data,
+ struct dev_table_entry *new)
+{
+ unsigned long flags;
+ struct dev_table_entry *dev_table = get_dev_table(iommu);
+ struct dev_table_entry *ptr = &dev_table[dev_data->devid];
+
+ spin_lock_irqsave(&dev_data->dte_lock, flags);
+
+ if (!(ptr->data[0] & DTE_FLAG_V)) {
+ /* Existing DTE is not valid. */
+ write_dte_upper128(ptr, new);
+ write_dte_lower128(ptr, new);
+ iommu_flush_dte_sync(iommu, dev_data->devid);
+ } else if (!(new->data[0] & DTE_FLAG_V)) {
+ /* Existing DTE is valid. New DTE is not valid. */
+ write_dte_lower128(ptr, new);
+ write_dte_upper128(ptr, new);
+ iommu_flush_dte_sync(iommu, dev_data->devid);
+ } else if (!FIELD_GET(DTE_FLAG_GV, ptr->data[0])) {
+ /*
+ * Both DTEs are valid.
+ * Existing DTE has no guest page table.
+ */
+ write_dte_upper128(ptr, new);
+ write_dte_lower128(ptr, new);
+ iommu_flush_dte_sync(iommu, dev_data->devid);
+ } else if (!FIELD_GET(DTE_FLAG_GV, new->data[0])) {
+ /*
+ * Both DTEs are valid.
+ * Existing DTE has guest page table,
+ * new DTE has no guest page table,
+ */
+ write_dte_lower128(ptr, new);
+ write_dte_upper128(ptr, new);
+ iommu_flush_dte_sync(iommu, dev_data->devid);
+ } else if (FIELD_GET(DTE_GPT_LEVEL_MASK, ptr->data[2]) !=
+ FIELD_GET(DTE_GPT_LEVEL_MASK, new->data[2])) {
+ /*
+ * Both DTEs are valid and have guest page table,
+ * but have different number of levels. So, we need
+ * to upadte both upper and lower 128-bit value, which
+ * require disabling and flushing.
+ */
+ struct dev_table_entry clear = {};
+
+ /* First disable DTE */
+ write_dte_lower128(ptr, &clear);
+ iommu_flush_dte_sync(iommu, dev_data->devid);
+
+ /* Then update DTE */
+ write_dte_upper128(ptr, new);
+ write_dte_lower128(ptr, new);
+ iommu_flush_dte_sync(iommu, dev_data->devid);
+ } else {
+ /*
+ * Both DTEs are valid and have guest page table,
+ * and same number of levels. We just need to only
+ * update the lower 128-bit. So no need to disable DTE.
+ */
+ write_dte_lower128(ptr, new);
+ }
+
+ spin_unlock_irqrestore(&dev_data->dte_lock, flags);
+}
+
+static void get_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_data,
+ struct dev_table_entry *dte)
+{
+ unsigned long flags;
+ struct dev_table_entry *ptr;
+ struct dev_table_entry *dev_table = get_dev_table(iommu);
+
+ ptr = &dev_table[dev_data->devid];
+
+ spin_lock_irqsave(&dev_data->dte_lock, flags);
+ dte->data128[0] = ptr->data128[0];
+ dte->data128[1] = ptr->data128[1];
+ spin_unlock_irqrestore(&dev_data->dte_lock, flags);
+}
+
static inline bool pdom_is_v2_pgtbl_mode(struct protection_domain *pdom)
{
return (pdom && (pdom->pd_mode == PD_MODE_V2));
@@ -109,7 +241,9 @@ static inline int get_acpihid_device_id(struct device *dev,
struct acpihid_map_entry **entry)
{
struct acpi_device *adev = ACPI_COMPANION(dev);
- struct acpihid_map_entry *p;
+ struct acpihid_map_entry *p, *p1 = NULL;
+ int hid_count = 0;
+ bool fw_bug;
if (!adev)
return -ENODEV;
@@ -117,12 +251,33 @@ static inline int get_acpihid_device_id(struct device *dev,
list_for_each_entry(p, &acpihid_map, list) {
if (acpi_dev_hid_uid_match(adev, p->hid,
p->uid[0] ? p->uid : NULL)) {
- if (entry)
- *entry = p;
- return p->devid;
+ p1 = p;
+ fw_bug = false;
+ hid_count = 1;
+ break;
+ }
+
+ /*
+ * Count HID matches w/o UID, raise FW_BUG but allow exactly one match
+ */
+ if (acpi_dev_hid_match(adev, p->hid)) {
+ p1 = p;
+ hid_count++;
+ fw_bug = true;
}
}
- return -EINVAL;
+
+ if (!p1)
+ return -EINVAL;
+ if (fw_bug)
+ dev_err_once(dev, FW_BUG "No ACPI device matched UID, but %d device%s matched HID.\n",
+ hid_count, str_plural(hid_count));
+ if (hid_count > 1)
+ return -EINVAL;
+ if (entry)
+ *entry = p1;
+
+ return p1->devid;
}
static inline int get_device_sbdf_id(struct device *dev)
@@ -204,7 +359,8 @@ static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid)
if (!dev_data)
return NULL;
- spin_lock_init(&dev_data->lock);
+ mutex_init(&dev_data->mutex);
+ spin_lock_init(&dev_data->dte_lock);
dev_data->devid = devid;
ratelimit_default_init(&dev_data->rs);
@@ -212,7 +368,7 @@ static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid)
return dev_data;
}
-static struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid)
+struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid)
{
struct iommu_dev_data *dev_data;
struct llist_node *node;
@@ -232,9 +388,11 @@ static struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid
static int clone_alias(struct pci_dev *pdev, u16 alias, void *data)
{
+ struct dev_table_entry new;
struct amd_iommu *iommu;
- struct dev_table_entry *dev_table;
+ struct iommu_dev_data *dev_data, *alias_data;
u16 devid = pci_dev_id(pdev);
+ int ret = 0;
if (devid == alias)
return 0;
@@ -243,13 +401,27 @@ static int clone_alias(struct pci_dev *pdev, u16 alias, void *data)
if (!iommu)
return 0;
- amd_iommu_set_rlookup_table(iommu, alias);
- dev_table = get_dev_table(iommu);
- memcpy(dev_table[alias].data,
- dev_table[devid].data,
- sizeof(dev_table[alias].data));
+ /* Copy the data from pdev */
+ dev_data = dev_iommu_priv_get(&pdev->dev);
+ if (!dev_data) {
+ pr_err("%s : Failed to get dev_data for 0x%x\n", __func__, devid);
+ ret = -EINVAL;
+ goto out;
+ }
+ get_dte256(iommu, dev_data, &new);
- return 0;
+ /* Setup alias */
+ alias_data = find_dev_data(iommu, alias);
+ if (!alias_data) {
+ pr_err("%s : Failed to get alias dev_data for 0x%x\n", __func__, alias);
+ ret = -EINVAL;
+ goto out;
+ }
+ update_dte256(iommu, alias_data, &new);
+
+ amd_iommu_set_rlookup_table(iommu, alias);
+out:
+ return ret;
}
static void clone_aliases(struct amd_iommu *iommu, struct device *dev)
@@ -462,8 +634,8 @@ static inline void pdev_disable_cap_pasid(struct pci_dev *pdev)
static void pdev_enable_caps(struct pci_dev *pdev)
{
- pdev_enable_cap_ats(pdev);
pdev_enable_cap_pasid(pdev);
+ pdev_enable_cap_ats(pdev);
pdev_enable_cap_pri(pdev);
}
@@ -522,6 +694,12 @@ static int iommu_init_device(struct amd_iommu *iommu, struct device *dev)
return -ENOMEM;
dev_data->dev = dev;
+
+ /*
+ * The dev_iommu_priv_set() needes to be called before setup_aliases.
+ * Otherwise, subsequent call to dev_iommu_priv_get() will fail.
+ */
+ dev_iommu_priv_set(dev, dev_data);
setup_aliases(iommu, dev);
/*
@@ -535,8 +713,6 @@ static int iommu_init_device(struct amd_iommu *iommu, struct device *dev)
dev_data->flags = pdev_get_caps(to_pci_dev(dev));
}
- dev_iommu_priv_set(dev, dev_data);
-
return 0;
}
@@ -557,22 +733,6 @@ static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev)
setup_aliases(iommu, dev);
}
-static void amd_iommu_uninit_device(struct device *dev)
-{
- struct iommu_dev_data *dev_data;
-
- dev_data = dev_iommu_priv_get(dev);
- if (!dev_data)
- return;
-
- if (dev_data->domain)
- detach_device(dev);
-
- /*
- * We keep dev_data around for unplugged devices and reuse it when the
- * device is re-plugged - not doing so would introduce a ton of races.
- */
-}
/****************************************************************************
*
@@ -583,10 +743,13 @@ static void amd_iommu_uninit_device(struct device *dev)
static void dump_dte_entry(struct amd_iommu *iommu, u16 devid)
{
int i;
- struct dev_table_entry *dev_table = get_dev_table(iommu);
+ struct dev_table_entry dte;
+ struct iommu_dev_data *dev_data = find_dev_data(iommu, devid);
+
+ get_dte256(iommu, dev_data, &dte);
for (i = 0; i < 4; ++i)
- pr_err("DTE[%d]: %016llx\n", i, dev_table[devid].data[i]);
+ pr_err("DTE[%d]: %016llx\n", i, dte.data[i]);
}
static void dump_command(unsigned long phys_addr)
@@ -726,7 +889,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
int type, devid, flags, tag;
volatile u32 *event = __evt;
int count = 0;
- u64 address;
+ u64 address, ctrl;
u32 pasid;
retry:
@@ -736,6 +899,7 @@ retry:
(event[1] & EVENT_DOMID_MASK_LO);
flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
address = (u64)(((u64)event[3]) << 32) | event[2];
+ ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET);
if (type == 0) {
/* Did we hit the erratum? */
@@ -757,6 +921,7 @@ retry:
dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
pasid, address, flags);
+ dev_err(dev, "Control Reg : 0x%llx\n", ctrl);
dump_dte_entry(iommu, devid);
break;
case EVENT_TYPE_DEV_TAB_ERR:
@@ -825,10 +990,12 @@ static void iommu_poll_events(struct amd_iommu *iommu)
while (head != tail) {
iommu_print_event(iommu, iommu->evt_buf + head);
+
+ /* Update head pointer of hardware ring-buffer */
head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE;
+ writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
}
- writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
}
#ifdef CONFIG_IRQ_REMAP
@@ -838,6 +1005,14 @@ int amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
{
iommu_ga_log_notifier = notifier;
+ /*
+ * Ensure all in-flight IRQ handlers run to completion before returning
+ * to the caller, e.g. to ensure module code isn't unloaded while it's
+ * being executed in the IRQ handler.
+ */
+ if (!notifier)
+ synchronize_rcu();
+
return 0;
}
EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier);
@@ -987,6 +1162,25 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
*
****************************************************************************/
+static void dump_command_buffer(struct amd_iommu *iommu)
+{
+ struct iommu_cmd *cmd;
+ u32 head, tail;
+ int i;
+
+ head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+ tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+
+ pr_err("CMD Buffer head=%llu tail=%llu\n", MMIO_CMD_BUFFER_HEAD(head),
+ MMIO_CMD_BUFFER_TAIL(tail));
+
+ for (i = 0; i < CMD_BUFFER_ENTRIES; i++) {
+ cmd = (struct iommu_cmd *)(iommu->cmd_buf + i * sizeof(*cmd));
+ pr_err("%3d: %08x %08x %08x %08x\n", i, cmd->data[0], cmd->data[1], cmd->data[2],
+ cmd->data[3]);
+ }
+}
+
static int wait_on_sem(struct amd_iommu *iommu, u64 data)
{
int i = 0;
@@ -997,7 +1191,14 @@ static int wait_on_sem(struct amd_iommu *iommu, u64 data)
}
if (i == LOOP_TIMEOUT) {
- pr_alert("Completion-Wait loop timed out\n");
+
+ pr_alert("IOMMU %04x:%02x:%02x.%01x: Completion-Wait loop timed out\n",
+ iommu->pci_seg->id, PCI_BUS_NUM(iommu->devid),
+ PCI_SLOT(iommu->devid), PCI_FUNC(iommu->devid));
+
+ if (amd_iommu_dump)
+ DO_ONCE_LITE(dump_command_buffer, iommu);
+
return -EIO;
}
@@ -1026,7 +1227,7 @@ static void build_completion_wait(struct iommu_cmd *cmd,
struct amd_iommu *iommu,
u64 data)
{
- u64 paddr = iommu_virt_to_phys((void *)iommu->cmd_sem);
+ u64 paddr = iommu->cmd_sem_paddr;
memset(cmd, 0, sizeof(*cmd));
cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
@@ -1230,7 +1431,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
if (!iommu->need_sync)
return 0;
- data = atomic64_add_return(1, &iommu->cmd_sem_val);
+ data = atomic64_inc_return(&iommu->cmd_sem_val);
build_completion_wait(&cmd, iommu, data);
raw_spin_lock_irqsave(&iommu->lock, flags);
@@ -1247,6 +1448,21 @@ out_unlock:
return ret;
}
+static void domain_flush_complete(struct protection_domain *domain)
+{
+ struct pdom_iommu_info *pdom_iommu_info;
+ unsigned long i;
+
+ lockdep_assert_held(&domain->lock);
+
+ /*
+ * Devices of this domain are behind this IOMMU
+ * We need to wait for completion of all commands.
+ */
+ xa_for_each(&domain->iommu_array, i, pdom_iommu_info)
+ iommu_completion_wait(pdom_iommu_info->iommu);
+}
+
static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
{
struct iommu_cmd cmd;
@@ -1256,6 +1472,15 @@ static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
return iommu_queue_command(iommu, &cmd);
}
+static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid)
+{
+ int ret;
+
+ ret = iommu_flush_dte(iommu, devid);
+ if (!ret)
+ iommu_completion_wait(iommu);
+}
+
static void amd_iommu_flush_dte_all(struct amd_iommu *iommu)
{
u32 devid;
@@ -1410,6 +1635,7 @@ static int domain_flush_pages_v2(struct protection_domain *pdom,
struct iommu_cmd cmd;
int ret = 0;
+ lockdep_assert_held(&pdom->lock);
list_for_each_entry(dev_data, &pdom->dev_list, list) {
struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
u16 domid = dev_data->gcr3_info.domid;
@@ -1426,21 +1652,22 @@ static int domain_flush_pages_v2(struct protection_domain *pdom,
static int domain_flush_pages_v1(struct protection_domain *pdom,
u64 address, size_t size)
{
+ struct pdom_iommu_info *pdom_iommu_info;
struct iommu_cmd cmd;
- int ret = 0, i;
+ int ret = 0;
+ unsigned long i;
+
+ lockdep_assert_held(&pdom->lock);
build_inv_iommu_pages(&cmd, address, size,
pdom->id, IOMMU_NO_PASID, false);
- for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
- if (!pdom->dev_iommu[i])
- continue;
-
+ xa_for_each(&pdom->iommu_array, i, pdom_iommu_info) {
/*
* Devices of this domain are behind this IOMMU
* We need a TLB flush
*/
- ret |= iommu_queue_command(amd_iommus[i], &cmd);
+ ret |= iommu_queue_command(pdom_iommu_info->iommu, &cmd);
}
return ret;
@@ -1458,6 +1685,8 @@ static void __domain_flush_pages(struct protection_domain *domain,
ioasid_t pasid = IOMMU_NO_PASID;
bool gn = false;
+ lockdep_assert_held(&domain->lock);
+
if (pdom_is_v2_pgtbl_mode(domain)) {
gn = true;
ret = domain_flush_pages_v2(domain, address, size);
@@ -1479,11 +1708,13 @@ static void __domain_flush_pages(struct protection_domain *domain,
void amd_iommu_domain_flush_pages(struct protection_domain *domain,
u64 address, size_t size)
{
+ lockdep_assert_held(&domain->lock);
+
if (likely(!amd_iommu_np_cache)) {
__domain_flush_pages(domain, address, size);
/* Wait until IOMMU TLB and all device IOTLB flushes are complete */
- amd_iommu_domain_flush_complete(domain);
+ domain_flush_complete(domain);
return;
}
@@ -1523,7 +1754,7 @@ void amd_iommu_domain_flush_pages(struct protection_domain *domain,
}
/* Wait until IOMMU TLB and all device IOTLB flushes are complete */
- amd_iommu_domain_flush_complete(domain);
+ domain_flush_complete(domain);
}
/* Flush the whole IO/TLB for a given protection domain - including PDE */
@@ -1549,79 +1780,11 @@ void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data,
iommu_completion_wait(iommu);
}
-void amd_iommu_dev_flush_pasid_all(struct iommu_dev_data *dev_data,
- ioasid_t pasid)
+static void dev_flush_pasid_all(struct iommu_dev_data *dev_data,
+ ioasid_t pasid)
{
- amd_iommu_dev_flush_pasid_pages(dev_data, 0,
- CMD_INV_IOMMU_ALL_PAGES_ADDRESS, pasid);
-}
-
-void amd_iommu_domain_flush_complete(struct protection_domain *domain)
-{
- int i;
-
- for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
- if (domain && !domain->dev_iommu[i])
- continue;
-
- /*
- * Devices of this domain are behind this IOMMU
- * We need to wait for completion of all commands.
- */
- iommu_completion_wait(amd_iommus[i]);
- }
-}
-
-/* Flush the not present cache if it exists */
-static void domain_flush_np_cache(struct protection_domain *domain,
- dma_addr_t iova, size_t size)
-{
- if (unlikely(amd_iommu_np_cache)) {
- unsigned long flags;
-
- spin_lock_irqsave(&domain->lock, flags);
- amd_iommu_domain_flush_pages(domain, iova, size);
- spin_unlock_irqrestore(&domain->lock, flags);
- }
-}
-
-
-/*
- * This function flushes the DTEs for all devices in domain
- */
-static void domain_flush_devices(struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data;
-
- list_for_each_entry(dev_data, &domain->dev_list, list)
- device_flush_dte(dev_data);
-}
-
-static void update_device_table(struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data;
-
- list_for_each_entry(dev_data, &domain->dev_list, list) {
- struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
-
- set_dte_entry(iommu, dev_data);
- clone_aliases(iommu, dev_data->dev);
- }
-}
-
-void amd_iommu_update_and_flush_device_table(struct protection_domain *domain)
-{
- update_device_table(domain);
- domain_flush_devices(domain);
-}
-
-void amd_iommu_domain_update(struct protection_domain *domain)
-{
- /* Update device table */
- amd_iommu_update_and_flush_device_table(domain);
-
- /* Flush domain TLB(s) and wait for completion */
- amd_iommu_domain_flush_all(domain);
+ amd_iommu_dev_flush_pasid_pages(dev_data, pasid, 0,
+ CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
}
int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag)
@@ -1649,31 +1812,14 @@ int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag)
*
****************************************************************************/
-static u16 domain_id_alloc(void)
+static int pdom_id_alloc(void)
{
- unsigned long flags;
- int id;
-
- spin_lock_irqsave(&pd_bitmap_lock, flags);
- id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
- BUG_ON(id == 0);
- if (id > 0 && id < MAX_DOMAIN_ID)
- __set_bit(id, amd_iommu_pd_alloc_bitmap);
- else
- id = 0;
- spin_unlock_irqrestore(&pd_bitmap_lock, flags);
-
- return id;
+ return ida_alloc_range(&pdom_ids, 1, MAX_DOMAIN_ID - 1, GFP_ATOMIC);
}
-static void domain_id_free(int id)
+static void pdom_id_free(int id)
{
- unsigned long flags;
-
- spin_lock_irqsave(&pd_bitmap_lock, flags);
- if (id > 0 && id < MAX_DOMAIN_ID)
- __clear_bit(id, amd_iommu_pd_alloc_bitmap);
- spin_unlock_irqrestore(&pd_bitmap_lock, flags);
+ ida_free(&pdom_ids, id);
}
static void free_gcr3_tbl_level1(u64 *tbl)
@@ -1687,7 +1833,7 @@ static void free_gcr3_tbl_level1(u64 *tbl)
ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
- iommu_free_page(ptr);
+ iommu_free_pages(ptr);
}
}
@@ -1718,9 +1864,9 @@ static void free_gcr3_table(struct gcr3_tbl_info *gcr3_info)
gcr3_info->glx = 0;
/* Free per device domain ID */
- domain_id_free(gcr3_info->domid);
+ pdom_id_free(gcr3_info->domid);
- iommu_free_page(gcr3_info->gcr3_tbl);
+ iommu_free_pages(gcr3_info->gcr3_tbl);
gcr3_info->gcr3_tbl = NULL;
}
@@ -1745,6 +1891,7 @@ static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info,
{
int levels = get_gcr3_levels(pasids);
int nid = iommu ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE;
+ int domid;
if (levels > amd_iommu_max_glx_val)
return -EINVAL;
@@ -1753,11 +1900,14 @@ static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info,
return -EBUSY;
/* Allocate per device domain ID */
- gcr3_info->domid = domain_id_alloc();
+ domid = pdom_id_alloc();
+ if (domid <= 0)
+ return -ENOSPC;
+ gcr3_info->domid = domid;
- gcr3_info->gcr3_tbl = iommu_alloc_page_node(nid, GFP_ATOMIC);
+ gcr3_info->gcr3_tbl = iommu_alloc_pages_node_sz(nid, GFP_ATOMIC, SZ_4K);
if (gcr3_info->gcr3_tbl == NULL) {
- domain_id_free(gcr3_info->domid);
+ pdom_id_free(domid);
return -ENOMEM;
}
@@ -1816,7 +1966,7 @@ static int update_gcr3(struct iommu_dev_data *dev_data,
else
*pte = 0;
- amd_iommu_dev_flush_pasid_all(dev_data, pasid);
+ dev_flush_pasid_all(dev_data, pasid);
return 0;
}
@@ -1851,90 +2001,127 @@ int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid)
return ret;
}
+static void make_clear_dte(struct iommu_dev_data *dev_data, struct dev_table_entry *ptr,
+ struct dev_table_entry *new)
+{
+ /* All existing DTE must have V bit set */
+ new->data128[0] = DTE_FLAG_V;
+ new->data128[1] = 0;
+}
+
+/*
+ * Note:
+ * The old value for GCR3 table and GPT have been cleared from caller.
+ */
+static void set_dte_gcr3_table(struct amd_iommu *iommu,
+ struct iommu_dev_data *dev_data,
+ struct dev_table_entry *target)
+{
+ struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
+ u64 gcr3;
+
+ if (!gcr3_info->gcr3_tbl)
+ return;
+
+ pr_debug("%s: devid=%#x, glx=%#x, gcr3_tbl=%#llx\n",
+ __func__, dev_data->devid, gcr3_info->glx,
+ (unsigned long long)gcr3_info->gcr3_tbl);
+
+ gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
+
+ target->data[0] |= DTE_FLAG_GV |
+ FIELD_PREP(DTE_GLX, gcr3_info->glx) |
+ FIELD_PREP(DTE_GCR3_14_12, gcr3 >> 12);
+ if (pdom_is_v2_pgtbl_mode(dev_data->domain))
+ target->data[0] |= DTE_FLAG_GIOV;
+
+ target->data[1] |= FIELD_PREP(DTE_GCR3_30_15, gcr3 >> 15) |
+ FIELD_PREP(DTE_GCR3_51_31, gcr3 >> 31);
+
+ /* Guest page table can only support 4 and 5 levels */
+ if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL)
+ target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_5_LEVEL);
+ else
+ target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL);
+}
+
static void set_dte_entry(struct amd_iommu *iommu,
- struct iommu_dev_data *dev_data)
+ struct iommu_dev_data *dev_data,
+ phys_addr_t top_paddr, unsigned int top_level)
{
- u64 pte_root = 0;
- u64 flags = 0;
- u32 old_domid;
- u16 devid = dev_data->devid;
u16 domid;
+ u32 old_domid;
+ struct dev_table_entry *initial_dte;
+ struct dev_table_entry new = {};
struct protection_domain *domain = dev_data->domain;
- struct dev_table_entry *dev_table = get_dev_table(iommu);
struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
+ struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid];
+ struct pt_iommu_amdv1_hw_info pt_info;
+
+ make_clear_dte(dev_data, dte, &new);
if (gcr3_info && gcr3_info->gcr3_tbl)
domid = dev_data->gcr3_info.domid;
- else
+ else {
domid = domain->id;
- if (domain->iop.mode != PAGE_MODE_NONE)
- pte_root = iommu_virt_to_phys(domain->iop.root);
+ if (domain->domain.type & __IOMMU_DOMAIN_PAGING) {
+ /*
+ * When updating the IO pagetable, the new top and level
+ * are provided as parameters. For other operations i.e.
+ * device attach, retrieve the current pagetable info
+ * via the IOMMU PT API.
+ */
+ if (top_paddr) {
+ pt_info.host_pt_root = top_paddr;
+ pt_info.mode = top_level + 1;
+ } else {
+ WARN_ON(top_paddr || top_level);
+ pt_iommu_amdv1_hw_info(&domain->amdv1,
+ &pt_info);
+ }
- pte_root |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
- << DEV_ENTRY_MODE_SHIFT;
+ new.data[0] |= __sme_set(pt_info.host_pt_root) |
+ (pt_info.mode & DEV_ENTRY_MODE_MASK)
+ << DEV_ENTRY_MODE_SHIFT;
+ }
+ }
- pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;
+ new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW;
/*
- * When SNP is enabled, Only set TV bit when IOMMU
- * page translation is in use.
+ * When SNP is enabled, we can only support TV=1 with non-zero domain ID.
+ * This is prevented by the SNP-enable and IOMMU_DOMAIN_IDENTITY check in
+ * do_iommu_domain_alloc().
*/
- if (!amd_iommu_snp_en || (domid != 0))
- pte_root |= DTE_FLAG_TV;
-
- flags = dev_table[devid].data[1];
-
- if (dev_data->ats_enabled)
- flags |= DTE_FLAG_IOTLB;
+ WARN_ON(amd_iommu_snp_en && (domid == 0));
+ new.data[0] |= DTE_FLAG_TV;
if (dev_data->ppr)
- pte_root |= 1ULL << DEV_ENTRY_PPR;
+ new.data[0] |= 1ULL << DEV_ENTRY_PPR;
if (domain->dirty_tracking)
- pte_root |= DTE_FLAG_HAD;
-
- if (gcr3_info && gcr3_info->gcr3_tbl) {
- u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
- u64 glx = gcr3_info->glx;
- u64 tmp;
-
- pte_root |= DTE_FLAG_GV;
- pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
-
- /* First mask out possible old values for GCR3 table */
- tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
- flags &= ~tmp;
-
- tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
- flags &= ~tmp;
-
- /* Encode GCR3 table into DTE */
- tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
- pte_root |= tmp;
+ new.data[0] |= DTE_FLAG_HAD;
- tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
- flags |= tmp;
-
- tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
- flags |= tmp;
+ if (dev_data->ats_enabled)
+ new.data[1] |= DTE_FLAG_IOTLB;
- if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) {
- dev_table[devid].data[2] |=
- ((u64)GUEST_PGTABLE_5_LEVEL << DTE_GPT_LEVEL_SHIFT);
- }
+ old_domid = READ_ONCE(dte->data[1]) & DEV_DOMID_MASK;
+ new.data[1] |= domid;
- /* GIOV is supported with V2 page table mode only */
- if (pdom_is_v2_pgtbl_mode(domain))
- pte_root |= DTE_FLAG_GIOV;
+ /*
+ * Restore cached persistent DTE bits, which can be set by information
+ * in IVRS table. See set_dev_entry_from_acpi().
+ */
+ initial_dte = amd_iommu_get_ivhd_dte_flags(iommu->pci_seg->id, dev_data->devid);
+ if (initial_dte) {
+ new.data128[0] |= initial_dte->data128[0];
+ new.data128[1] |= initial_dte->data128[1];
}
- flags &= ~DEV_DOMID_MASK;
- flags |= domid;
+ set_dte_gcr3_table(iommu, dev_data, &new);
- old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK;
- dev_table[devid].data[1] = flags;
- dev_table[devid].data[0] = pte_root;
+ update_dte256(iommu, dev_data, &new);
/*
* A kdump kernel might be replacing a domain ID that was copied from
@@ -1946,30 +2133,27 @@ static void set_dte_entry(struct amd_iommu *iommu,
}
}
-static void clear_dte_entry(struct amd_iommu *iommu, u16 devid)
+/*
+ * Clear DMA-remap related flags to block all DMA (blockeded domain)
+ */
+static void clear_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_data)
{
- struct dev_table_entry *dev_table = get_dev_table(iommu);
-
- /* remove entry from the device table seen by the hardware */
- dev_table[devid].data[0] = DTE_FLAG_V;
-
- if (!amd_iommu_snp_en)
- dev_table[devid].data[0] |= DTE_FLAG_TV;
-
- dev_table[devid].data[1] &= DTE_FLAG_MASK;
+ struct dev_table_entry new = {};
+ struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid];
- amd_iommu_apply_erratum_63(iommu, devid);
+ make_clear_dte(dev_data, dte, &new);
+ update_dte256(iommu, dev_data, &new);
}
/* Update and flush DTE for the given device */
-void amd_iommu_dev_update_dte(struct iommu_dev_data *dev_data, bool set)
+static void dev_update_dte(struct iommu_dev_data *dev_data, bool set)
{
struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
if (set)
- set_dte_entry(iommu, dev_data);
+ set_dte_entry(iommu, dev_data, 0, 0);
else
- clear_dte_entry(iommu, dev_data->devid);
+ clear_dte_entry(iommu, dev_data);
clone_aliases(iommu, dev_data->dev);
device_flush_dte(dev_data);
@@ -1985,6 +2169,7 @@ static int init_gcr3_table(struct iommu_dev_data *dev_data,
{
struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
int max_pasids = dev_data->max_pasids;
+ struct pt_iommu_x86_64_hw_info pt_info;
int ret = 0;
/*
@@ -2007,7 +2192,8 @@ static int init_gcr3_table(struct iommu_dev_data *dev_data,
if (!pdom_is_v2_pgtbl_mode(pdom))
return ret;
- ret = update_gcr3(dev_data, 0, iommu_virt_to_phys(pdom->iop.pgd), true);
+ pt_iommu_x86_64_hw_info(&pdom->amdv2, &pt_info);
+ ret = update_gcr3(dev_data, 0, __sme_set(pt_info.gcr3_pt), true);
if (ret)
free_gcr3_table(&dev_data->gcr3_info);
@@ -2028,56 +2214,64 @@ static void destroy_gcr3_table(struct iommu_dev_data *dev_data,
free_gcr3_table(gcr3_info);
}
-static int do_attach(struct iommu_dev_data *dev_data,
- struct protection_domain *domain)
+static int pdom_attach_iommu(struct amd_iommu *iommu,
+ struct protection_domain *pdom)
{
- struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
+ struct pdom_iommu_info *pdom_iommu_info, *curr;
+ unsigned long flags;
int ret = 0;
- /* Update data structures */
- dev_data->domain = domain;
- list_add(&dev_data->list, &domain->dev_list);
+ spin_lock_irqsave(&pdom->lock, flags);
- /* Update NUMA Node ID */
- if (domain->nid == NUMA_NO_NODE)
- domain->nid = dev_to_node(dev_data->dev);
+ pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index);
+ if (pdom_iommu_info) {
+ pdom_iommu_info->refcnt++;
+ goto out_unlock;
+ }
- /* Do reference counting */
- domain->dev_iommu[iommu->index] += 1;
- domain->dev_cnt += 1;
+ pdom_iommu_info = kzalloc(sizeof(*pdom_iommu_info), GFP_ATOMIC);
+ if (!pdom_iommu_info) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
- /* Setup GCR3 table */
- if (pdom_is_sva_capable(domain)) {
- ret = init_gcr3_table(dev_data, domain);
- if (ret)
- return ret;
+ pdom_iommu_info->iommu = iommu;
+ pdom_iommu_info->refcnt = 1;
+
+ curr = xa_cmpxchg(&pdom->iommu_array, iommu->index,
+ NULL, pdom_iommu_info, GFP_ATOMIC);
+ if (curr) {
+ kfree(pdom_iommu_info);
+ ret = -ENOSPC;
+ goto out_unlock;
}
+out_unlock:
+ spin_unlock_irqrestore(&pdom->lock, flags);
return ret;
}
-static void do_detach(struct iommu_dev_data *dev_data)
+static void pdom_detach_iommu(struct amd_iommu *iommu,
+ struct protection_domain *pdom)
{
- struct protection_domain *domain = dev_data->domain;
- struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
-
- /* Clear DTE and flush the entry */
- amd_iommu_dev_update_dte(dev_data, false);
+ struct pdom_iommu_info *pdom_iommu_info;
+ unsigned long flags;
- /* Flush IOTLB and wait for the flushes to finish */
- amd_iommu_domain_flush_all(domain);
+ spin_lock_irqsave(&pdom->lock, flags);
- /* Clear GCR3 table */
- if (pdom_is_sva_capable(domain))
- destroy_gcr3_table(dev_data, domain);
+ pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index);
+ if (!pdom_iommu_info) {
+ spin_unlock_irqrestore(&pdom->lock, flags);
+ return;
+ }
- /* Update data structures */
- dev_data->domain = NULL;
- list_del(&dev_data->list);
+ pdom_iommu_info->refcnt--;
+ if (pdom_iommu_info->refcnt == 0) {
+ xa_erase(&pdom->iommu_array, iommu->index);
+ kfree(pdom_iommu_info);
+ }
- /* decrease reference counters - needs to happen after the flushes */
- domain->dev_iommu[iommu->index] -= 1;
- domain->dev_cnt -= 1;
+ spin_unlock_irqrestore(&pdom->lock, flags);
}
/*
@@ -2087,28 +2281,60 @@ static void do_detach(struct iommu_dev_data *dev_data)
static int attach_device(struct device *dev,
struct protection_domain *domain)
{
- struct iommu_dev_data *dev_data;
+ struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
+ struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
+ struct pci_dev *pdev;
unsigned long flags;
int ret = 0;
- spin_lock_irqsave(&domain->lock, flags);
-
- dev_data = dev_iommu_priv_get(dev);
-
- spin_lock(&dev_data->lock);
+ mutex_lock(&dev_data->mutex);
if (dev_data->domain != NULL) {
ret = -EBUSY;
goto out;
}
- ret = do_attach(dev_data, domain);
+ /* Do reference counting */
+ ret = pdom_attach_iommu(iommu, domain);
+ if (ret)
+ goto out;
-out:
- spin_unlock(&dev_data->lock);
+ /* Setup GCR3 table */
+ if (pdom_is_sva_capable(domain)) {
+ ret = init_gcr3_table(dev_data, domain);
+ if (ret) {
+ pdom_detach_iommu(iommu, domain);
+ goto out;
+ }
+ }
+
+ pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL;
+ if (pdev && pdom_is_sva_capable(domain)) {
+ pdev_enable_caps(pdev);
+
+ /*
+ * Device can continue to function even if IOPF
+ * enablement failed. Hence in error path just
+ * disable device PRI support.
+ */
+ if (amd_iommu_iopf_add_device(iommu, dev_data))
+ pdev_disable_cap_pri(pdev);
+ } else if (pdev) {
+ pdev_enable_cap_ats(pdev);
+ }
+ /* Update data structures */
+ dev_data->domain = domain;
+ spin_lock_irqsave(&domain->lock, flags);
+ list_add(&dev_data->list, &domain->dev_list);
spin_unlock_irqrestore(&domain->lock, flags);
+ /* Update device table */
+ dev_update_dte(dev_data, true);
+
+out:
+ mutex_unlock(&dev_data->mutex);
+
return ret;
}
@@ -2118,14 +2344,11 @@ out:
static void detach_device(struct device *dev)
{
struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
- struct protection_domain *domain = dev_data->domain;
struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
+ struct protection_domain *domain = dev_data->domain;
unsigned long flags;
- bool ppr = dev_data->ppr;
-
- spin_lock_irqsave(&domain->lock, flags);
- spin_lock(&dev_data->lock);
+ mutex_lock(&dev_data->mutex);
/*
* First check if the device is still attached. It might already
@@ -2136,27 +2359,36 @@ static void detach_device(struct device *dev)
if (WARN_ON(!dev_data->domain))
goto out;
- if (ppr) {
+ /* Remove IOPF handler */
+ if (dev_data->ppr) {
iopf_queue_flush_dev(dev);
-
- /* Updated here so that it gets reflected in DTE */
- dev_data->ppr = false;
+ amd_iommu_iopf_remove_device(iommu, dev_data);
}
- do_detach(dev_data);
+ if (dev_is_pci(dev))
+ pdev_disable_caps(to_pci_dev(dev));
-out:
- spin_unlock(&dev_data->lock);
+ /* Clear DTE and flush the entry */
+ dev_update_dte(dev_data, false);
+ /* Flush IOTLB and wait for the flushes to finish */
+ spin_lock_irqsave(&domain->lock, flags);
+ amd_iommu_domain_flush_all(domain);
+ list_del(&dev_data->list);
spin_unlock_irqrestore(&domain->lock, flags);
- /* Remove IOPF handler */
- if (ppr)
- amd_iommu_iopf_remove_device(iommu, dev_data);
+ /* Clear GCR3 table */
+ if (pdom_is_sva_capable(domain))
+ destroy_gcr3_table(dev_data, domain);
- if (dev_is_pci(dev))
- pdev_disable_caps(to_pci_dev(dev));
+ /* Update data structures */
+ dev_data->domain = NULL;
+ /* decrease reference counters - needs to happen after the flushes */
+ pdom_detach_iommu(iommu, domain);
+
+out:
+ mutex_unlock(&dev_data->mutex);
}
static struct iommu_device *amd_iommu_probe_device(struct device *dev)
@@ -2185,11 +2417,12 @@ static struct iommu_device *amd_iommu_probe_device(struct device *dev)
dev_err(dev, "Failed to initialize - trying to proceed anyway\n");
iommu_dev = ERR_PTR(ret);
iommu_ignore_device(iommu, dev);
- } else {
- amd_iommu_set_pci_msi_domain(dev, iommu);
- iommu_dev = &iommu->iommu;
+ goto out_err;
}
+ amd_iommu_set_pci_msi_domain(dev, iommu);
+ iommu_dev = &iommu->iommu;
+
/*
* If IOMMU and device supports PASID then it will contain max
* supported PASIDs, else it will be zero.
@@ -2201,24 +2434,38 @@ static struct iommu_device *amd_iommu_probe_device(struct device *dev)
pci_max_pasids(to_pci_dev(dev)));
}
+ if (amd_iommu_pgtable == PD_MODE_NONE) {
+ pr_warn_once("%s: DMA translation not supported by iommu.\n",
+ __func__);
+ iommu_dev = ERR_PTR(-ENODEV);
+ goto out_err;
+ }
+
+out_err:
+
iommu_completion_wait(iommu);
+ if (FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2))
+ dev_data->max_irqs = MAX_IRQS_PER_TABLE_2K;
+ else
+ dev_data->max_irqs = MAX_IRQS_PER_TABLE_512;
+
+ if (dev_is_pci(dev))
+ pci_prepare_ats(to_pci_dev(dev), PAGE_SHIFT);
+
return iommu_dev;
}
static void amd_iommu_release_device(struct device *dev)
{
- struct amd_iommu *iommu;
-
- if (!check_device(dev))
- return;
+ struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
- iommu = rlookup_amd_iommu(dev);
- if (!iommu)
- return;
+ WARN_ON(dev_data->domain);
- amd_iommu_uninit_device(dev);
- iommu_completion_wait(iommu);
+ /*
+ * We keep dev_data around for unplugged devices and reuse it when the
+ * device is re-plugged - not doing so would introduce a ton of races.
+ */
}
static struct iommu_group *amd_iommu_device_group(struct device *dev)
@@ -2239,236 +2486,384 @@ static struct iommu_group *amd_iommu_device_group(struct device *dev)
*
*****************************************************************************/
-static void cleanup_domain(struct protection_domain *domain)
+static void protection_domain_init(struct protection_domain *domain)
{
- struct iommu_dev_data *entry;
+ spin_lock_init(&domain->lock);
+ INIT_LIST_HEAD(&domain->dev_list);
+ INIT_LIST_HEAD(&domain->dev_data_list);
+ xa_init(&domain->iommu_array);
+}
- lockdep_assert_held(&domain->lock);
+struct protection_domain *protection_domain_alloc(void)
+{
+ struct protection_domain *domain;
+ int domid;
- if (!domain->dev_cnt)
- return;
+ domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+ if (!domain)
+ return NULL;
- while (!list_empty(&domain->dev_list)) {
- entry = list_first_entry(&domain->dev_list,
- struct iommu_dev_data, list);
- BUG_ON(!entry->domain);
- do_detach(entry);
+ domid = pdom_id_alloc();
+ if (domid <= 0) {
+ kfree(domain);
+ return NULL;
}
- WARN_ON(domain->dev_cnt != 0);
+ domain->id = domid;
+
+ protection_domain_init(domain);
+
+ return domain;
}
-void protection_domain_free(struct protection_domain *domain)
+static bool amd_iommu_hd_support(struct amd_iommu *iommu)
{
- if (!domain)
- return;
-
- if (domain->iop.pgtbl_cfg.tlb)
- free_io_pgtable_ops(&domain->iop.iop.ops);
+ if (amd_iommu_hatdis)
+ return false;
- if (domain->iop.root)
- iommu_free_page(domain->iop.root);
+ return iommu && (iommu->features & FEATURE_HDSUP);
+}
- if (domain->id)
- domain_id_free(domain->id);
+static spinlock_t *amd_iommu_get_top_lock(struct pt_iommu *iommupt)
+{
+ struct protection_domain *pdom =
+ container_of(iommupt, struct protection_domain, iommu);
- kfree(domain);
+ return &pdom->lock;
}
-static int protection_domain_init_v1(struct protection_domain *domain, int mode)
+/*
+ * Update all HW references to the domain with a new pgtable configuration.
+ */
+static void amd_iommu_change_top(struct pt_iommu *iommu_table,
+ phys_addr_t top_paddr, unsigned int top_level)
{
- u64 *pt_root = NULL;
+ struct protection_domain *pdom =
+ container_of(iommu_table, struct protection_domain, iommu);
+ struct iommu_dev_data *dev_data;
+
+ lockdep_assert_held(&pdom->lock);
- BUG_ON(mode < PAGE_MODE_NONE || mode > PAGE_MODE_6_LEVEL);
+ /* Update the DTE for all devices attached to this domain */
+ list_for_each_entry(dev_data, &pdom->dev_list, list) {
+ struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
- if (mode != PAGE_MODE_NONE) {
- pt_root = iommu_alloc_page(GFP_KERNEL);
- if (!pt_root)
- return -ENOMEM;
+ /* Update the HW references with the new level and top ptr */
+ set_dte_entry(iommu, dev_data, top_paddr, top_level);
+ clone_aliases(iommu, dev_data->dev);
}
- domain->pd_mode = PD_MODE_V1;
- amd_iommu_domain_set_pgtable(domain, pt_root, mode);
+ list_for_each_entry(dev_data, &pdom->dev_list, list)
+ device_flush_dte(dev_data);
- return 0;
+ domain_flush_complete(pdom);
}
-static int protection_domain_init_v2(struct protection_domain *pdom)
+/*
+ * amd_iommu_iotlb_sync_map() is used to generate flushes for non-present to
+ * present (ie mapping) operations. It is a NOP if the IOMMU doesn't have non
+ * present caching (like hypervisor shadowing).
+ */
+static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
+ unsigned long iova, size_t size)
{
- pdom->pd_mode = PD_MODE_V2;
- pdom->domain.pgsize_bitmap = AMD_IOMMU_PGSIZES_V2;
+ struct protection_domain *domain = to_pdomain(dom);
+ unsigned long flags;
+ if (likely(!amd_iommu_np_cache))
+ return 0;
+
+ spin_lock_irqsave(&domain->lock, flags);
+ amd_iommu_domain_flush_pages(domain, iova, size);
+ spin_unlock_irqrestore(&domain->lock, flags);
return 0;
}
-struct protection_domain *protection_domain_alloc(unsigned int type)
+static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
+{
+ struct protection_domain *dom = to_pdomain(domain);
+ unsigned long flags;
+
+ spin_lock_irqsave(&dom->lock, flags);
+ amd_iommu_domain_flush_all(dom);
+ spin_unlock_irqrestore(&dom->lock, flags);
+}
+
+static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
{
- struct io_pgtable_ops *pgtbl_ops;
+ struct protection_domain *dom = to_pdomain(domain);
+ unsigned long flags;
+
+ spin_lock_irqsave(&dom->lock, flags);
+ amd_iommu_domain_flush_pages(dom, gather->start,
+ gather->end - gather->start + 1);
+ spin_unlock_irqrestore(&dom->lock, flags);
+ iommu_put_pages_list(&gather->freelist);
+}
+
+static const struct pt_iommu_driver_ops amd_hw_driver_ops_v1 = {
+ .get_top_lock = amd_iommu_get_top_lock,
+ .change_top = amd_iommu_change_top,
+};
+
+static const struct iommu_domain_ops amdv1_ops = {
+ IOMMU_PT_DOMAIN_OPS(amdv1),
+ .iotlb_sync_map = amd_iommu_iotlb_sync_map,
+ .flush_iotlb_all = amd_iommu_flush_iotlb_all,
+ .iotlb_sync = amd_iommu_iotlb_sync,
+ .attach_dev = amd_iommu_attach_device,
+ .free = amd_iommu_domain_free,
+ .enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
+};
+
+static const struct iommu_dirty_ops amdv1_dirty_ops = {
+ IOMMU_PT_DIRTY_OPS(amdv1),
+ .set_dirty_tracking = amd_iommu_set_dirty_tracking,
+};
+
+static struct iommu_domain *amd_iommu_domain_alloc_paging_v1(struct device *dev,
+ u32 flags)
+{
+ struct pt_iommu_amdv1_cfg cfg = {};
struct protection_domain *domain;
- int pgtable;
int ret;
- domain = kzalloc(sizeof(*domain), GFP_KERNEL);
- if (!domain)
- return NULL;
+ if (amd_iommu_hatdis)
+ return ERR_PTR(-EOPNOTSUPP);
- domain->id = domain_id_alloc();
- if (!domain->id)
- goto out_err;
+ domain = protection_domain_alloc();
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
- spin_lock_init(&domain->lock);
- INIT_LIST_HEAD(&domain->dev_list);
- INIT_LIST_HEAD(&domain->dev_data_list);
- domain->nid = NUMA_NO_NODE;
+ domain->pd_mode = PD_MODE_V1;
+ domain->iommu.driver_ops = &amd_hw_driver_ops_v1;
+ domain->iommu.nid = dev_to_node(dev);
+ if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
+ domain->domain.dirty_ops = &amdv1_dirty_ops;
- switch (type) {
- /* No need to allocate io pgtable ops in passthrough mode */
- case IOMMU_DOMAIN_IDENTITY:
- case IOMMU_DOMAIN_SVA:
- return domain;
- case IOMMU_DOMAIN_DMA:
- pgtable = amd_iommu_pgtable;
- break;
/*
- * Force IOMMU v1 page table when allocating
- * domain for pass-through devices.
+ * Someday FORCE_COHERENCE should be set by
+ * amd_iommu_enforce_cache_coherency() like VT-d does.
*/
- case IOMMU_DOMAIN_UNMANAGED:
- pgtable = AMD_IOMMU_V1;
- break;
- default:
- goto out_err;
- }
+ cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) |
+ BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) |
+ BIT(PT_FEAT_AMDV1_FORCE_COHERENCE);
- switch (pgtable) {
- case AMD_IOMMU_V1:
- ret = protection_domain_init_v1(domain, DEFAULT_PGTABLE_LEVEL);
- break;
- case AMD_IOMMU_V2:
- ret = protection_domain_init_v2(domain);
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- if (ret)
- goto out_err;
-
- pgtbl_ops = alloc_io_pgtable_ops(pgtable, &domain->iop.pgtbl_cfg, domain);
- if (!pgtbl_ops)
- goto out_err;
+ /*
+ * AMD's IOMMU can flush as many pages as necessary in a single flush.
+ * Unless we run in a virtual machine, which can be inferred according
+ * to whether "non-present cache" is on, it is probably best to prefer
+ * (potentially) too extensive TLB flushing (i.e., more misses) over
+ * multiple TLB flushes (i.e., more flushes). For virtual machines the
+ * hypervisor needs to synchronize the host IOMMU PTEs with those of
+ * the guest, and the trade-off is different: unnecessary TLB flushes
+ * should be avoided.
+ */
+ if (amd_iommu_np_cache)
+ cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS);
+ else
+ cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE);
- return domain;
-out_err:
- protection_domain_free(domain);
- return NULL;
-}
+ cfg.common.hw_max_vasz_lg2 =
+ min(64, (amd_iommu_hpt_level - 1) * 9 + 21);
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.starting_level = 2;
+ domain->domain.ops = &amdv1_ops;
-static inline u64 dma_max_address(void)
-{
- if (amd_iommu_pgtable == AMD_IOMMU_V1)
- return ~0ULL;
+ ret = pt_iommu_amdv1_init(&domain->amdv1, &cfg, GFP_KERNEL);
+ if (ret) {
+ amd_iommu_domain_free(&domain->domain);
+ return ERR_PTR(ret);
+ }
- /* V2 with 4/5 level page table */
- return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
+ /*
+ * Narrow the supported page sizes to those selected by the kernel
+ * command line.
+ */
+ domain->domain.pgsize_bitmap &= amd_iommu_pgsize_bitmap;
+ return &domain->domain;
}
-static bool amd_iommu_hd_support(struct amd_iommu *iommu)
-{
- return iommu && (iommu->features & FEATURE_HDSUP);
-}
+static const struct iommu_domain_ops amdv2_ops = {
+ IOMMU_PT_DOMAIN_OPS(x86_64),
+ .iotlb_sync_map = amd_iommu_iotlb_sync_map,
+ .flush_iotlb_all = amd_iommu_flush_iotlb_all,
+ .iotlb_sync = amd_iommu_iotlb_sync,
+ .attach_dev = amd_iommu_attach_device,
+ .free = amd_iommu_domain_free,
+ /*
+ * Note the AMDv2 page table format does not support a Force Coherency
+ * bit, so enforce_cache_coherency should not be set. However VFIO is
+ * not prepared to handle a case where some domains will support
+ * enforcement and others do not. VFIO and iommufd will have to be fixed
+ * before it can fully use the V2 page table. See the comment in
+ * iommufd_hwpt_paging_alloc(). For now leave things as they have
+ * historically been and lie about enforce_cache_coherencey.
+ */
+ .enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
+};
-static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
- struct device *dev, u32 flags)
+static struct iommu_domain *amd_iommu_domain_alloc_paging_v2(struct device *dev,
+ u32 flags)
{
- bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+ struct pt_iommu_x86_64_cfg cfg = {};
struct protection_domain *domain;
- struct amd_iommu *iommu = NULL;
-
- if (dev)
- iommu = get_amd_iommu_from_dev(dev);
-
- /*
- * Since DTE[Mode]=0 is prohibited on SNP-enabled system,
- * default to use IOMMU_DOMAIN_DMA[_FQ].
- */
- if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY))
- return ERR_PTR(-EINVAL);
+ int ret;
- if (dirty_tracking && !amd_iommu_hd_support(iommu))
+ if (!amd_iommu_v2_pgtbl_supported())
return ERR_PTR(-EOPNOTSUPP);
- domain = protection_domain_alloc(type);
+ domain = protection_domain_alloc();
if (!domain)
return ERR_PTR(-ENOMEM);
- domain->domain.geometry.aperture_start = 0;
- domain->domain.geometry.aperture_end = dma_max_address();
- domain->domain.geometry.force_aperture = true;
+ domain->pd_mode = PD_MODE_V2;
+ domain->iommu.nid = dev_to_node(dev);
- if (iommu) {
- domain->domain.type = type;
- domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap;
- domain->domain.ops = iommu->iommu.ops->default_domain_ops;
+ cfg.common.features = BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES);
+ if (amd_iommu_np_cache)
+ cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS);
+ else
+ cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE);
- if (dirty_tracking)
- domain->domain.dirty_ops = &amd_dirty_ops;
+ /*
+ * The v2 table behaves differently if it is attached to PASID 0 vs a
+ * non-zero PASID. On PASID 0 it has no sign extension and the full
+ * 57/48 bits decode the lower addresses. Otherwise it behaves like a
+ * normal sign extended x86 page table. Since we want the domain to work
+ * in both modes the top bit is removed and PT_FEAT_SIGN_EXTEND is not
+ * set which creates a table that is compatible in both modes.
+ */
+ if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) {
+ cfg.common.hw_max_vasz_lg2 = 56;
+ cfg.top_level = 4;
+ } else {
+ cfg.common.hw_max_vasz_lg2 = 47;
+ cfg.top_level = 3;
}
+ cfg.common.hw_max_oasz_lg2 = 52;
+ domain->domain.ops = &amdv2_ops;
+ ret = pt_iommu_x86_64_init(&domain->amdv2, &cfg, GFP_KERNEL);
+ if (ret) {
+ amd_iommu_domain_free(&domain->domain);
+ return ERR_PTR(ret);
+ }
return &domain->domain;
}
-static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type)
+static struct iommu_domain *
+amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
+ const struct iommu_user_data *user_data)
+
{
- struct iommu_domain *domain;
+ struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
+ const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
+ IOMMU_HWPT_ALLOC_PASID;
- domain = do_iommu_domain_alloc(type, NULL, 0);
- if (IS_ERR(domain))
- return NULL;
+ if ((flags & ~supported_flags) || user_data)
+ return ERR_PTR(-EOPNOTSUPP);
- return domain;
+ switch (flags & supported_flags) {
+ case IOMMU_HWPT_ALLOC_DIRTY_TRACKING:
+ /* Allocate domain with v1 page table for dirty tracking */
+ if (!amd_iommu_hd_support(iommu))
+ break;
+ return amd_iommu_domain_alloc_paging_v1(dev, flags);
+ case IOMMU_HWPT_ALLOC_PASID:
+ /* Allocate domain with v2 page table if IOMMU supports PASID. */
+ if (!amd_iommu_pasid_supported())
+ break;
+ return amd_iommu_domain_alloc_paging_v2(dev, flags);
+ case 0: {
+ struct iommu_domain *ret;
+
+ /* If nothing specific is required use the kernel commandline default */
+ if (amd_iommu_pgtable == PD_MODE_V1) {
+ ret = amd_iommu_domain_alloc_paging_v1(dev, flags);
+ if (ret != ERR_PTR(-EOPNOTSUPP))
+ return ret;
+ return amd_iommu_domain_alloc_paging_v2(dev, flags);
+ }
+ ret = amd_iommu_domain_alloc_paging_v2(dev, flags);
+ if (ret != ERR_PTR(-EOPNOTSUPP))
+ return ret;
+ return amd_iommu_domain_alloc_paging_v1(dev, flags);
+ }
+ default:
+ break;
+ }
+ return ERR_PTR(-EOPNOTSUPP);
}
-static struct iommu_domain *
-amd_iommu_domain_alloc_user(struct device *dev, u32 flags,
- struct iommu_domain *parent,
- const struct iommu_user_data *user_data)
+void amd_iommu_domain_free(struct iommu_domain *dom)
+{
+ struct protection_domain *domain = to_pdomain(dom);
+
+ WARN_ON(!list_empty(&domain->dev_list));
+ pt_iommu_deinit(&domain->iommu);
+ pdom_id_free(domain->id);
+ kfree(domain);
+}
+static int blocked_domain_attach_device(struct iommu_domain *domain,
+ struct device *dev,
+ struct iommu_domain *old)
{
- unsigned int type = IOMMU_DOMAIN_UNMANAGED;
+ struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
- if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data)
- return ERR_PTR(-EOPNOTSUPP);
+ if (dev_data->domain)
+ detach_device(dev);
+
+ /* Clear DTE and flush the entry */
+ mutex_lock(&dev_data->mutex);
+ dev_update_dte(dev_data, false);
+ mutex_unlock(&dev_data->mutex);
- return do_iommu_domain_alloc(type, dev, flags);
+ return 0;
}
-void amd_iommu_domain_free(struct iommu_domain *dom)
+static int blocked_domain_set_dev_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old)
{
- struct protection_domain *domain;
- unsigned long flags;
+ amd_iommu_remove_dev_pasid(dev, pasid, old);
+ return 0;
+}
- if (!dom)
- return;
+static struct iommu_domain blocked_domain = {
+ .type = IOMMU_DOMAIN_BLOCKED,
+ .ops = &(const struct iommu_domain_ops) {
+ .attach_dev = blocked_domain_attach_device,
+ .set_dev_pasid = blocked_domain_set_dev_pasid,
+ }
+};
- domain = to_pdomain(dom);
+static struct protection_domain identity_domain;
- spin_lock_irqsave(&domain->lock, flags);
+static const struct iommu_domain_ops identity_domain_ops = {
+ .attach_dev = amd_iommu_attach_device,
+};
- cleanup_domain(domain);
+void amd_iommu_init_identity_domain(void)
+{
+ struct iommu_domain *domain = &identity_domain.domain;
- spin_unlock_irqrestore(&domain->lock, flags);
+ domain->type = IOMMU_DOMAIN_IDENTITY;
+ domain->ops = &identity_domain_ops;
+ domain->owner = &amd_iommu_ops;
+
+ identity_domain.id = pdom_id_alloc();
- protection_domain_free(domain);
+ protection_domain_init(&identity_domain);
}
-static int amd_iommu_attach_device(struct iommu_domain *dom,
- struct device *dev)
+static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev,
+ struct iommu_domain *old)
{
struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
struct protection_domain *domain = to_pdomain(dom);
struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
- struct pci_dev *pdev;
int ret;
/*
@@ -2501,114 +2896,9 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
}
#endif
- pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL;
- if (pdev && pdom_is_sva_capable(domain)) {
- pdev_enable_caps(pdev);
-
- /*
- * Device can continue to function even if IOPF
- * enablement failed. Hence in error path just
- * disable device PRI support.
- */
- if (amd_iommu_iopf_add_device(iommu, dev_data))
- pdev_disable_cap_pri(pdev);
- } else if (pdev) {
- pdev_enable_cap_ats(pdev);
- }
-
- /* Update device table */
- amd_iommu_dev_update_dte(dev_data, true);
-
- return ret;
-}
-
-static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
- unsigned long iova, size_t size)
-{
- struct protection_domain *domain = to_pdomain(dom);
- struct io_pgtable_ops *ops = &domain->iop.iop.ops;
-
- if (ops->map_pages)
- domain_flush_np_cache(domain, iova, size);
- return 0;
-}
-
-static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova,
- phys_addr_t paddr, size_t pgsize, size_t pgcount,
- int iommu_prot, gfp_t gfp, size_t *mapped)
-{
- struct protection_domain *domain = to_pdomain(dom);
- struct io_pgtable_ops *ops = &domain->iop.iop.ops;
- int prot = 0;
- int ret = -EINVAL;
-
- if ((domain->pd_mode == PD_MODE_V1) &&
- (domain->iop.mode == PAGE_MODE_NONE))
- return -EINVAL;
-
- if (iommu_prot & IOMMU_READ)
- prot |= IOMMU_PROT_IR;
- if (iommu_prot & IOMMU_WRITE)
- prot |= IOMMU_PROT_IW;
-
- if (ops->map_pages) {
- ret = ops->map_pages(ops, iova, paddr, pgsize,
- pgcount, prot, gfp, mapped);
- }
-
return ret;
}
-static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain,
- struct iommu_iotlb_gather *gather,
- unsigned long iova, size_t size)
-{
- /*
- * AMD's IOMMU can flush as many pages as necessary in a single flush.
- * Unless we run in a virtual machine, which can be inferred according
- * to whether "non-present cache" is on, it is probably best to prefer
- * (potentially) too extensive TLB flushing (i.e., more misses) over
- * mutliple TLB flushes (i.e., more flushes). For virtual machines the
- * hypervisor needs to synchronize the host IOMMU PTEs with those of
- * the guest, and the trade-off is different: unnecessary TLB flushes
- * should be avoided.
- */
- if (amd_iommu_np_cache &&
- iommu_iotlb_gather_is_disjoint(gather, iova, size))
- iommu_iotlb_sync(domain, gather);
-
- iommu_iotlb_gather_add_range(gather, iova, size);
-}
-
-static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova,
- size_t pgsize, size_t pgcount,
- struct iommu_iotlb_gather *gather)
-{
- struct protection_domain *domain = to_pdomain(dom);
- struct io_pgtable_ops *ops = &domain->iop.iop.ops;
- size_t r;
-
- if ((domain->pd_mode == PD_MODE_V1) &&
- (domain->iop.mode == PAGE_MODE_NONE))
- return 0;
-
- r = (ops->unmap_pages) ? ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0;
-
- if (r)
- amd_iommu_iotlb_gather_add_page(dom, gather, iova, r);
-
- return r;
-}
-
-static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
- dma_addr_t iova)
-{
- struct protection_domain *domain = to_pdomain(dom);
- struct io_pgtable_ops *ops = &domain->iop.iop.ops;
-
- return ops->iova_to_phys(ops, iova);
-}
-
static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
{
switch (cap) {
@@ -2638,12 +2928,12 @@ static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
bool enable)
{
struct protection_domain *pdomain = to_pdomain(domain);
- struct dev_table_entry *dev_table;
+ struct dev_table_entry *dte;
struct iommu_dev_data *dev_data;
bool domain_flush = false;
struct amd_iommu *iommu;
unsigned long flags;
- u64 pte_root;
+ u64 new;
spin_lock_irqsave(&pdomain->lock, flags);
if (!(pdomain->dirty_tracking ^ enable)) {
@@ -2652,16 +2942,15 @@ static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
}
list_for_each_entry(dev_data, &pdomain->dev_list, list) {
+ spin_lock(&dev_data->dte_lock);
iommu = get_amd_iommu_from_dev_data(dev_data);
-
- dev_table = get_dev_table(iommu);
- pte_root = dev_table[dev_data->devid].data[0];
-
- pte_root = (enable ? pte_root | DTE_FLAG_HAD :
- pte_root & ~DTE_FLAG_HAD);
+ dte = &get_dev_table(iommu)[dev_data->devid];
+ new = dte->data[0];
+ new = (enable ? new | DTE_FLAG_HAD : new & ~DTE_FLAG_HAD);
+ dte->data[0] = new;
+ spin_unlock(&dev_data->dte_lock);
/* Flush device DTE */
- dev_table[dev_data->devid].data[0] = pte_root;
device_flush_dte(dev_data);
domain_flush = true;
}
@@ -2676,28 +2965,6 @@ static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
return 0;
}
-static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
- unsigned long iova, size_t size,
- unsigned long flags,
- struct iommu_dirty_bitmap *dirty)
-{
- struct protection_domain *pdomain = to_pdomain(domain);
- struct io_pgtable_ops *ops = &pdomain->iop.iop.ops;
- unsigned long lflags;
-
- if (!ops || !ops->read_and_clear_dirty)
- return -EOPNOTSUPP;
-
- spin_lock_irqsave(&pdomain->lock, lflags);
- if (!pdomain->dirty_tracking && dirty->bitmap) {
- spin_unlock_irqrestore(&pdomain->lock, lflags);
- return -EINVAL;
- }
- spin_unlock_irqrestore(&pdomain->lock, lflags);
-
- return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
-}
-
static void amd_iommu_get_resv_regions(struct device *dev,
struct list_head *head)
{
@@ -2749,6 +3016,9 @@ static void amd_iommu_get_resv_regions(struct device *dev,
return;
list_add_tail(&region->list, head);
+ if (amd_iommu_ht_range_ignore())
+ return;
+
region = iommu_alloc_resv_region(HT_RANGE_START,
HT_RANGE_END - HT_RANGE_START + 1,
0, IOMMU_RESV_RESERVED, GFP_KERNEL);
@@ -2757,35 +3027,13 @@ static void amd_iommu_get_resv_regions(struct device *dev,
list_add_tail(&region->list, head);
}
-bool amd_iommu_is_attach_deferred(struct device *dev)
+static bool amd_iommu_is_attach_deferred(struct device *dev)
{
struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
return dev_data->defer_attach;
}
-static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
-{
- struct protection_domain *dom = to_pdomain(domain);
- unsigned long flags;
-
- spin_lock_irqsave(&dom->lock, flags);
- amd_iommu_domain_flush_all(dom);
- spin_unlock_irqrestore(&dom->lock, flags);
-}
-
-static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
- struct iommu_iotlb_gather *gather)
-{
- struct protection_domain *dom = to_pdomain(domain);
- unsigned long flags;
-
- spin_lock_irqsave(&dom->lock, flags);
- amd_iommu_domain_flush_pages(dom, gather->start,
- gather->end - gather->start + 1);
- spin_unlock_irqrestore(&dom->lock, flags);
-}
-
static int amd_iommu_def_domain_type(struct device *dev)
{
struct iommu_dev_data *dev_data;
@@ -2820,70 +3068,20 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
return true;
}
-static const struct iommu_dirty_ops amd_dirty_ops = {
- .set_dirty_tracking = amd_iommu_set_dirty_tracking,
- .read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
-};
-
-static int amd_iommu_dev_enable_feature(struct device *dev,
- enum iommu_dev_features feat)
-{
- int ret = 0;
-
- switch (feat) {
- case IOMMU_DEV_FEAT_IOPF:
- case IOMMU_DEV_FEAT_SVA:
- break;
- default:
- ret = -EINVAL;
- break;
- }
- return ret;
-}
-
-static int amd_iommu_dev_disable_feature(struct device *dev,
- enum iommu_dev_features feat)
-{
- int ret = 0;
-
- switch (feat) {
- case IOMMU_DEV_FEAT_IOPF:
- case IOMMU_DEV_FEAT_SVA:
- break;
- default:
- ret = -EINVAL;
- break;
- }
- return ret;
-}
-
const struct iommu_ops amd_iommu_ops = {
.capable = amd_iommu_capable,
- .domain_alloc = amd_iommu_domain_alloc,
- .domain_alloc_user = amd_iommu_domain_alloc_user,
+ .blocked_domain = &blocked_domain,
+ .release_domain = &blocked_domain,
+ .identity_domain = &identity_domain.domain,
+ .domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags,
.domain_alloc_sva = amd_iommu_domain_alloc_sva,
.probe_device = amd_iommu_probe_device,
.release_device = amd_iommu_release_device,
.device_group = amd_iommu_device_group,
.get_resv_regions = amd_iommu_get_resv_regions,
.is_attach_deferred = amd_iommu_is_attach_deferred,
- .pgsize_bitmap = AMD_IOMMU_PGSIZES,
.def_domain_type = amd_iommu_def_domain_type,
- .dev_enable_feat = amd_iommu_dev_enable_feature,
- .dev_disable_feat = amd_iommu_dev_disable_feature,
- .remove_dev_pasid = amd_iommu_remove_dev_pasid,
.page_response = amd_iommu_page_response,
- .default_domain_ops = &(const struct iommu_domain_ops) {
- .attach_dev = amd_iommu_attach_device,
- .map_pages = amd_iommu_map_pages,
- .unmap_pages = amd_iommu_unmap_pages,
- .iotlb_sync_map = amd_iommu_iotlb_sync_map,
- .iova_to_phys = amd_iommu_iova_to_phys,
- .flush_iotlb_all = amd_iommu_flush_iotlb_all,
- .iotlb_sync = amd_iommu_iotlb_sync,
- .free = amd_iommu_domain_free,
- .enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
- }
};
#ifdef CONFIG_IRQ_REMAP
@@ -2908,7 +3106,7 @@ static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid)
return;
build_inv_irt(&cmd, devid);
- data = atomic64_add_return(1, &iommu->cmd_sem_val);
+ data = atomic64_inc_return(&iommu->cmd_sem_val);
build_completion_wait(&cmd2, iommu, data);
raw_spin_lock_irqsave(&iommu->lock, flags);
@@ -2923,20 +3121,33 @@ out:
raw_spin_unlock_irqrestore(&iommu->lock, flags);
}
+static inline u8 iommu_get_int_tablen(struct iommu_dev_data *dev_data)
+{
+ if (dev_data && dev_data->max_irqs == MAX_IRQS_PER_TABLE_2K)
+ return DTE_INTTABLEN_2K;
+ return DTE_INTTABLEN_512;
+}
+
static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid,
struct irq_remap_table *table)
{
- u64 dte;
- struct dev_table_entry *dev_table = get_dev_table(iommu);
+ u64 new;
+ struct dev_table_entry *dte = &get_dev_table(iommu)[devid];
+ struct iommu_dev_data *dev_data = search_dev_data(iommu, devid);
+
+ if (dev_data)
+ spin_lock(&dev_data->dte_lock);
- dte = dev_table[devid].data[2];
- dte &= ~DTE_IRQ_PHYS_ADDR_MASK;
- dte |= iommu_virt_to_phys(table->table);
- dte |= DTE_IRQ_REMAP_INTCTL;
- dte |= DTE_INTTABLEN;
- dte |= DTE_IRQ_REMAP_ENABLE;
+ new = READ_ONCE(dte->data[2]);
+ new &= ~DTE_IRQ_PHYS_ADDR_MASK;
+ new |= iommu_virt_to_phys(table->table);
+ new |= DTE_IRQ_REMAP_INTCTL;
+ new |= iommu_get_int_tablen(dev_data);
+ new |= DTE_IRQ_REMAP_ENABLE;
+ WRITE_ONCE(dte->data[2], new);
- dev_table[devid].data[2] = dte;
+ if (dev_data)
+ spin_unlock(&dev_data->dte_lock);
}
static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid)
@@ -2957,7 +3168,7 @@ static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid)
return table;
}
-static struct irq_remap_table *__alloc_irq_table(void)
+static struct irq_remap_table *__alloc_irq_table(int nid, size_t size)
{
struct irq_remap_table *table;
@@ -2965,19 +3176,14 @@ static struct irq_remap_table *__alloc_irq_table(void)
if (!table)
return NULL;
- table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL);
+ table->table = iommu_alloc_pages_node_sz(
+ nid, GFP_KERNEL, max(DTE_INTTAB_ALIGNMENT, size));
if (!table->table) {
kfree(table);
return NULL;
}
raw_spin_lock_init(&table->lock);
- if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
- memset(table->table, 0,
- MAX_IRQS_PER_TABLE * sizeof(u32));
- else
- memset(table->table, 0,
- (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
return table;
}
@@ -3009,13 +3215,23 @@ static int set_remap_table_entry_alias(struct pci_dev *pdev, u16 alias,
return 0;
}
+static inline size_t get_irq_table_size(unsigned int max_irqs)
+{
+ if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
+ return max_irqs * sizeof(u32);
+
+ return max_irqs * (sizeof(u64) * 2);
+}
+
static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu,
- u16 devid, struct pci_dev *pdev)
+ u16 devid, struct pci_dev *pdev,
+ unsigned int max_irqs)
{
struct irq_remap_table *table = NULL;
struct irq_remap_table *new_table = NULL;
struct amd_iommu_pci_seg *pci_seg;
unsigned long flags;
+ int nid = iommu && iommu->dev ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE;
u16 alias;
spin_lock_irqsave(&iommu_table_lock, flags);
@@ -3034,7 +3250,7 @@ static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu,
spin_unlock_irqrestore(&iommu_table_lock, flags);
/* Nothing there yet, allocate new irq remapping table */
- new_table = __alloc_irq_table();
+ new_table = __alloc_irq_table(nid, get_irq_table_size(max_irqs));
if (!new_table)
return NULL;
@@ -3069,20 +3285,21 @@ out_unlock:
spin_unlock_irqrestore(&iommu_table_lock, flags);
if (new_table) {
- kmem_cache_free(amd_iommu_irq_cache, new_table->table);
+ iommu_free_pages(new_table->table);
kfree(new_table);
}
return table;
}
static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count,
- bool align, struct pci_dev *pdev)
+ bool align, struct pci_dev *pdev,
+ unsigned long max_irqs)
{
struct irq_remap_table *table;
int index, c, alignment = 1;
unsigned long flags;
- table = alloc_irq_table(iommu, devid, pdev);
+ table = alloc_irq_table(iommu, devid, pdev, max_irqs);
if (!table)
return -ENODEV;
@@ -3093,7 +3310,7 @@ static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count,
/* Scan table for free entries */
for (index = ALIGN(table->min_index, alignment), c = 0;
- index < MAX_IRQS_PER_TABLE;) {
+ index < max_irqs;) {
if (!iommu->irte_ops->is_allocated(table, index)) {
c += 1;
} else {
@@ -3155,7 +3372,7 @@ static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
struct irte_ga *irte)
{
- bool ret;
+ int ret;
ret = __modify_irte_ga(iommu, devid, index, irte);
if (ret)
@@ -3363,6 +3580,14 @@ static void fill_msi_msg(struct msi_msg *msg, u32 index)
msg->data = index;
msg->address_lo = 0;
msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW;
+ /*
+ * The struct msi_msg.dest_mode_logical is used to set the DM bit
+ * in MSI Message Address Register. For device w/ 2K int-remap support,
+ * this is bit must be set to 1 regardless of the actual destination
+ * mode, which is signified by the IRTE[DM].
+ */
+ if (FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2))
+ msg->arch_addr_lo.dest_mode_logical = true;
msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH;
}
@@ -3425,6 +3650,8 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
struct amd_ir_data *data = NULL;
struct amd_iommu *iommu;
struct irq_cfg *cfg;
+ struct iommu_dev_data *dev_data;
+ unsigned long max_irqs;
int i, ret, devid, seg, sbdf;
int index;
@@ -3443,6 +3670,9 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
if (!iommu)
return -EINVAL;
+ dev_data = search_dev_data(iommu, devid);
+ max_irqs = dev_data ? dev_data->max_irqs : MAX_IRQS_PER_TABLE_512;
+
ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
if (ret < 0)
return ret;
@@ -3450,7 +3680,7 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
struct irq_remap_table *table;
- table = alloc_irq_table(iommu, devid, NULL);
+ table = alloc_irq_table(iommu, devid, NULL, max_irqs);
if (table) {
if (!table->min_index) {
/*
@@ -3471,9 +3701,11 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI);
index = alloc_irq_index(iommu, devid, nr_irqs, align,
- msi_desc_to_pci_dev(info->desc));
+ msi_desc_to_pci_dev(info->desc),
+ max_irqs);
} else {
- index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL);
+ index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL,
+ max_irqs);
}
if (index < 0) {
@@ -3510,7 +3742,6 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
irq_data->chip_data = data;
irq_data->chip = &amd_ir_chip;
irq_remapping_prepare_irte(data, cfg, info, devid, index, i);
- irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
}
return 0;
@@ -3612,13 +3843,70 @@ static const struct irq_domain_ops amd_ir_domain_ops = {
.deactivate = irq_remapping_deactivate,
};
-int amd_iommu_activate_guest_mode(void *data)
+static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu,
+ bool ga_log_intr)
+{
+ if (cpu >= 0) {
+ entry->lo.fields_vapic.destination =
+ APICID_TO_IRTE_DEST_LO(cpu);
+ entry->hi.fields.destination =
+ APICID_TO_IRTE_DEST_HI(cpu);
+ entry->lo.fields_vapic.is_run = true;
+ entry->lo.fields_vapic.ga_log_intr = false;
+ } else {
+ entry->lo.fields_vapic.is_run = false;
+ entry->lo.fields_vapic.ga_log_intr = ga_log_intr;
+ }
+}
+
+/*
+ * Update the pCPU information for an IRTE that is configured to post IRQs to
+ * a vCPU, without issuing an IOMMU invalidation for the IRTE.
+ *
+ * If the vCPU is associated with a pCPU (@cpu >= 0), configure the Destination
+ * with the pCPU's APIC ID, set IsRun, and clear GALogIntr. If the vCPU isn't
+ * associated with a pCPU (@cpu < 0), clear IsRun and set/clear GALogIntr based
+ * on input from the caller (e.g. KVM only requests GALogIntr when the vCPU is
+ * blocking and requires a notification wake event). I.e. treat vCPUs that are
+ * associated with a pCPU as running. This API is intended to be used when a
+ * vCPU is scheduled in/out (or stops running for any reason), to do a fast
+ * update of IsRun, GALogIntr, and (conditionally) Destination.
+ *
+ * Per the IOMMU spec, the Destination, IsRun, and GATag fields are not cached
+ * and thus don't require an invalidation to ensure the IOMMU consumes fresh
+ * information.
+ */
+int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr)
+{
+ struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
+ struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
+
+ if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)))
+ return -EINVAL;
+
+ if (!entry || !entry->lo.fields_vapic.guest_mode)
+ return 0;
+
+ if (!ir_data->iommu)
+ return -ENODEV;
+
+ __amd_iommu_update_ga(entry, cpu, ga_log_intr);
+
+ return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
+ ir_data->irq_2_irte.index, entry);
+}
+EXPORT_SYMBOL(amd_iommu_update_ga);
+
+int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr)
{
struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
u64 valid;
- if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || !entry)
+ if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)))
+ return -EINVAL;
+
+ if (!entry)
return 0;
valid = entry->lo.fields_vapic.valid;
@@ -3628,11 +3916,12 @@ int amd_iommu_activate_guest_mode(void *data)
entry->lo.fields_vapic.valid = valid;
entry->lo.fields_vapic.guest_mode = 1;
- entry->lo.fields_vapic.ga_log_intr = 1;
entry->hi.fields.ga_root_ptr = ir_data->ga_root_ptr;
entry->hi.fields.vector = ir_data->ga_vector;
entry->lo.fields_vapic.ga_tag = ir_data->ga_tag;
+ __amd_iommu_update_ga(entry, cpu, ga_log_intr);
+
return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
ir_data->irq_2_irte.index, entry);
}
@@ -3645,8 +3934,10 @@ int amd_iommu_deactivate_guest_mode(void *data)
struct irq_cfg *cfg = ir_data->cfg;
u64 valid;
- if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
- !entry || !entry->lo.fields_vapic.guest_mode)
+ if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)))
+ return -EINVAL;
+
+ if (!entry || !entry->lo.fields_vapic.guest_mode)
return 0;
valid = entry->lo.fields_remap.valid;
@@ -3668,15 +3959,17 @@ int amd_iommu_deactivate_guest_mode(void *data)
}
EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode);
-static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
+static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info)
{
int ret;
- struct amd_iommu_pi_data *pi_data = vcpu_info;
- struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
+ struct amd_iommu_pi_data *pi_data = info;
struct amd_ir_data *ir_data = data->chip_data;
struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
struct iommu_dev_data *dev_data;
+ if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)))
+ return -EINVAL;
+
if (ir_data->iommu == NULL)
return -EINVAL;
@@ -3687,38 +3980,23 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
* we should not modify the IRTE
*/
if (!dev_data || !dev_data->use_vapic)
- return 0;
+ return -EINVAL;
ir_data->cfg = irqd_cfg(data);
- pi_data->ir_data = ir_data;
- /* Note:
- * SVM tries to set up for VAPIC mode, but we are in
- * legacy mode. So, we force legacy mode instead.
- */
- if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
- pr_debug("%s: Fall back to using intr legacy remap\n",
- __func__);
- pi_data->is_guest_mode = false;
- }
+ if (pi_data) {
+ pi_data->ir_data = ir_data;
- pi_data->prev_ga_tag = ir_data->cached_ga_tag;
- if (pi_data->is_guest_mode) {
- ir_data->ga_root_ptr = (pi_data->base >> 12);
- ir_data->ga_vector = vcpu_pi_info->vector;
+ ir_data->ga_root_ptr = (pi_data->vapic_addr >> 12);
+ ir_data->ga_vector = pi_data->vector;
ir_data->ga_tag = pi_data->ga_tag;
- ret = amd_iommu_activate_guest_mode(ir_data);
- if (!ret)
- ir_data->cached_ga_tag = pi_data->ga_tag;
+ if (pi_data->is_guest_mode)
+ ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu,
+ pi_data->ga_log_intr);
+ else
+ ret = amd_iommu_deactivate_guest_mode(ir_data);
} else {
ret = amd_iommu_deactivate_guest_mode(ir_data);
-
- /*
- * This communicates the ga_tag back to the caller
- * so that it can do all the necessary clean up.
- */
- if (!ret)
- ir_data->cached_ga_tag = 0;
}
return ret;
@@ -3785,54 +4063,32 @@ static struct irq_chip amd_ir_chip = {
static const struct msi_parent_ops amdvi_msi_parent_ops = {
.supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | MSI_FLAG_MULTI_PCI_MSI,
+ .bus_select_token = DOMAIN_BUS_AMDVI,
+ .bus_select_mask = MATCH_PCI_MSI,
.prefix = "IR-",
.init_dev_msi_info = msi_parent_init_dev_msi_info,
};
int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
{
- struct fwnode_handle *fn;
+ struct irq_domain_info info = {
+ .fwnode = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index),
+ .ops = &amd_ir_domain_ops,
+ .domain_flags = IRQ_DOMAIN_FLAG_ISOLATED_MSI,
+ .host_data = iommu,
+ .parent = arch_get_ir_parent_domain(),
+ };
- fn = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index);
- if (!fn)
+ if (!info.fwnode)
return -ENOMEM;
- iommu->ir_domain = irq_domain_create_hierarchy(arch_get_ir_parent_domain(), 0, 0,
- fn, &amd_ir_domain_ops, iommu);
+
+ iommu->ir_domain = msi_create_parent_irq_domain(&info, &amdvi_msi_parent_ops);
if (!iommu->ir_domain) {
- irq_domain_free_fwnode(fn);
+ irq_domain_free_fwnode(info.fwnode);
return -ENOMEM;
}
-
- irq_domain_update_bus_token(iommu->ir_domain, DOMAIN_BUS_AMDVI);
- iommu->ir_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT |
- IRQ_DOMAIN_FLAG_ISOLATED_MSI;
- iommu->ir_domain->msi_parent_ops = &amdvi_msi_parent_ops;
-
return 0;
}
-
-int amd_iommu_update_ga(int cpu, bool is_run, void *data)
-{
- struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
- struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
-
- if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
- !entry || !entry->lo.fields_vapic.guest_mode)
- return 0;
-
- if (!ir_data->iommu)
- return -ENODEV;
-
- if (cpu >= 0) {
- entry->lo.fields_vapic.destination =
- APICID_TO_IRTE_DEST_LO(cpu);
- entry->hi.fields.destination =
- APICID_TO_IRTE_DEST_HI(cpu);
- }
- entry->lo.fields_vapic.is_run = is_run;
-
- return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
- ir_data->irq_2_irte.index, entry);
-}
-EXPORT_SYMBOL(amd_iommu_update_ga);
#endif
+
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
diff --git a/drivers/iommu/amd/pasid.c b/drivers/iommu/amd/pasid.c
index a68215f2b3e1..77c8e9a91cbc 100644
--- a/drivers/iommu/amd/pasid.c
+++ b/drivers/iommu/amd/pasid.c
@@ -100,7 +100,8 @@ static const struct mmu_notifier_ops sva_mn = {
};
int iommu_sva_set_dev_pasid(struct iommu_domain *domain,
- struct device *dev, ioasid_t pasid)
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old)
{
struct pdom_dev_data *pdom_dev_data;
struct protection_domain *sva_pdom = to_pdomain(domain);
@@ -108,6 +109,9 @@ int iommu_sva_set_dev_pasid(struct iommu_domain *domain,
unsigned long flags;
int ret = -EINVAL;
+ if (old)
+ return -EOPNOTSUPP;
+
/* PASID zero is used for requests from the I/O device without PASID */
if (!is_pasid_valid(dev_data, pasid))
return ret;
@@ -181,16 +185,17 @@ struct iommu_domain *amd_iommu_domain_alloc_sva(struct device *dev,
struct protection_domain *pdom;
int ret;
- pdom = protection_domain_alloc(IOMMU_DOMAIN_SVA);
+ pdom = protection_domain_alloc();
if (!pdom)
return ERR_PTR(-ENOMEM);
pdom->domain.ops = &amd_sva_domain_ops;
pdom->mn.ops = &sva_mn;
+ pdom->domain.type = IOMMU_DOMAIN_SVA;
ret = mmu_notifier_register(&pdom->mn, mm);
if (ret) {
- protection_domain_free(pdom);
+ amd_iommu_domain_free(&pdom->domain);
return ERR_PTR(ret);
}
diff --git a/drivers/iommu/amd/ppr.c b/drivers/iommu/amd/ppr.c
index 7c67d69f0b8c..e6767c057d01 100644
--- a/drivers/iommu/amd/ppr.c
+++ b/drivers/iommu/amd/ppr.c
@@ -48,7 +48,7 @@ void amd_iommu_enable_ppr_log(struct amd_iommu *iommu)
void __init amd_iommu_free_ppr_log(struct amd_iommu *iommu)
{
- iommu_free_pages(iommu->ppr_log, get_order(PPR_LOG_SIZE));
+ iommu_free_pages(iommu->ppr_log);
}
/*
diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c
index eb1e62cd499a..83a5aabcd15d 100644
--- a/drivers/iommu/apple-dart.c
+++ b/drivers/iommu/apple-dart.c
@@ -36,7 +36,7 @@
#define DART_MAX_STREAMS 256
#define DART_MAX_TTBR 4
-#define MAX_DARTS_PER_DEVICE 2
+#define MAX_DARTS_PER_DEVICE 3
/* Common registers */
@@ -122,6 +122,8 @@
#define DART_T8110_ERROR_ADDR_LO 0x170
#define DART_T8110_ERROR_ADDR_HI 0x174
+#define DART_T8110_ERROR_STREAMS 0x1c0
+
#define DART_T8110_PROTECT 0x200
#define DART_T8110_UNPROTECT 0x204
#define DART_T8110_PROTECT_LOCK 0x208
@@ -133,6 +135,7 @@
#define DART_T8110_TCR 0x1000
#define DART_T8110_TCR_REMAP GENMASK(11, 8)
#define DART_T8110_TCR_REMAP_EN BIT(7)
+#define DART_T8110_TCR_FOUR_LEVEL BIT(3)
#define DART_T8110_TCR_BYPASS_DAPF BIT(2)
#define DART_T8110_TCR_BYPASS_DART BIT(1)
#define DART_T8110_TCR_TRANSLATE_ENABLE BIT(0)
@@ -166,22 +169,23 @@ struct apple_dart_hw {
int max_sid_count;
- u64 lock;
- u64 lock_bit;
+ u32 lock;
+ u32 lock_bit;
- u64 error;
+ u32 error;
- u64 enable_streams;
+ u32 enable_streams;
- u64 tcr;
- u64 tcr_enabled;
- u64 tcr_disabled;
- u64 tcr_bypass;
+ u32 tcr;
+ u32 tcr_enabled;
+ u32 tcr_disabled;
+ u32 tcr_bypass;
+ u32 tcr_4level;
- u64 ttbr;
- u64 ttbr_valid;
- u64 ttbr_addr_field_shift;
- u64 ttbr_shift;
+ u32 ttbr;
+ u32 ttbr_valid;
+ u32 ttbr_addr_field_shift;
+ u32 ttbr_shift;
int ttbr_count;
};
@@ -217,6 +221,7 @@ struct apple_dart {
u32 pgsize;
u32 num_streams;
u32 supports_bypass : 1;
+ u32 four_level : 1;
struct iommu_group *sid2group[DART_MAX_STREAMS];
struct iommu_device iommu;
@@ -277,6 +282,9 @@ struct apple_dart_domain {
* @streams: streams for this device
*/
struct apple_dart_master_cfg {
+ /* Intersection of DART capabilitles */
+ u32 supports_bypass : 1;
+
struct apple_dart_stream_map stream_maps[MAX_DARTS_PER_DEVICE];
};
@@ -302,13 +310,19 @@ static struct apple_dart_domain *to_dart_domain(struct iommu_domain *dom)
}
static void
-apple_dart_hw_enable_translation(struct apple_dart_stream_map *stream_map)
+apple_dart_hw_enable_translation(struct apple_dart_stream_map *stream_map, int levels)
{
struct apple_dart *dart = stream_map->dart;
+ u32 tcr = dart->hw->tcr_enabled;
int sid;
+ if (levels == 4)
+ tcr |= dart->hw->tcr_4level;
+
+ WARN_ON(levels != 3 && levels != 4);
+ WARN_ON(levels == 4 && !dart->four_level);
for_each_set_bit(sid, stream_map->sidmap, dart->num_streams)
- writel(dart->hw->tcr_enabled, dart->regs + DART_TCR(dart, sid));
+ writel(tcr, dart->regs + DART_TCR(dart, sid));
}
static void apple_dart_hw_disable_dma(struct apple_dart_stream_map *stream_map)
@@ -566,7 +580,8 @@ apple_dart_setup_translation(struct apple_dart_domain *domain,
for (; i < stream_map->dart->hw->ttbr_count; ++i)
apple_dart_hw_clear_ttbr(stream_map, i);
- apple_dart_hw_enable_translation(stream_map);
+ apple_dart_hw_enable_translation(stream_map,
+ pgtbl_cfg->apple_dart_cfg.n_levels);
stream_map->dart->hw->invalidate_tlb(stream_map);
}
@@ -611,7 +626,7 @@ static int apple_dart_finalize_domain(struct apple_dart_domain *dart_domain,
dart_domain->domain.pgsize_bitmap = pgtbl_cfg.pgsize_bitmap;
dart_domain->domain.geometry.aperture_start = 0;
dart_domain->domain.geometry.aperture_end =
- (dma_addr_t)DMA_BIT_MASK(dart->ias);
+ (dma_addr_t)DMA_BIT_MASK(pgtbl_cfg.ias);
dart_domain->domain.geometry.force_aperture = true;
dart_domain->finalized = true;
@@ -657,7 +672,8 @@ static int apple_dart_domain_add_streams(struct apple_dart_domain *domain,
}
static int apple_dart_attach_dev_paging(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
int ret, i;
struct apple_dart_stream_map *stream_map;
@@ -678,13 +694,14 @@ static int apple_dart_attach_dev_paging(struct iommu_domain *domain,
}
static int apple_dart_attach_dev_identity(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev);
struct apple_dart_stream_map *stream_map;
int i;
- if (!cfg->stream_maps[0].dart->supports_bypass)
+ if (!cfg->supports_bypass)
return -EINVAL;
for_each_stream_map(i, cfg, stream_map)
@@ -702,7 +719,8 @@ static struct iommu_domain apple_dart_identity_domain = {
};
static int apple_dart_attach_dev_blocked(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev);
struct apple_dart_stream_map *stream_map;
@@ -773,8 +791,7 @@ static void apple_dart_domain_free(struct iommu_domain *domain)
{
struct apple_dart_domain *dart_domain = to_dart_domain(domain);
- if (dart_domain->pgtbl_ops)
- free_io_pgtable_ops(dart_domain->pgtbl_ops);
+ free_io_pgtable_ops(dart_domain->pgtbl_ops);
kfree(dart_domain);
}
@@ -788,24 +805,31 @@ static int apple_dart_of_xlate(struct device *dev,
struct apple_dart *cfg_dart;
int i, sid;
+ put_device(&iommu_pdev->dev);
+
if (args->args_count != 1)
return -EINVAL;
sid = args->args[0];
- if (!cfg)
+ if (!cfg) {
cfg = kzalloc(sizeof(*cfg), GFP_KERNEL);
- if (!cfg)
- return -ENOMEM;
+ if (!cfg)
+ return -ENOMEM;
+ /* Will be ANDed with DART capabilities */
+ cfg->supports_bypass = true;
+ }
dev_iommu_priv_set(dev, cfg);
cfg_dart = cfg->stream_maps[0].dart;
if (cfg_dart) {
- if (cfg_dart->supports_bypass != dart->supports_bypass)
- return -EINVAL;
if (cfg_dart->pgsize != dart->pgsize)
return -EINVAL;
+ if (cfg_dart->ias != dart->ias)
+ return -EINVAL;
}
+ cfg->supports_bypass &= dart->supports_bypass;
+
for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) {
if (cfg->stream_maps[i].dart == dart) {
set_bit(sid, cfg->stream_maps[i].sidmap);
@@ -945,7 +969,7 @@ static int apple_dart_def_domain_type(struct device *dev)
if (cfg->stream_maps[0].dart->pgsize > PAGE_SIZE)
return IOMMU_DOMAIN_IDENTITY;
- if (!cfg->stream_maps[0].dart->supports_bypass)
+ if (!cfg->supports_bypass)
return IOMMU_DOMAIN_DMA;
return 0;
@@ -986,7 +1010,6 @@ static const struct iommu_ops apple_dart_iommu_ops = {
.of_xlate = apple_dart_of_xlate,
.def_domain_type = apple_dart_def_domain_type,
.get_resv_regions = apple_dart_get_resv_regions,
- .pgsize_bitmap = -1UL, /* Restricted during dart probe */
.owner = THIS_MODULE,
.default_domain_ops = &(const struct iommu_domain_ops) {
.attach_dev = apple_dart_attach_dev_paging,
@@ -1073,6 +1096,9 @@ static irqreturn_t apple_dart_t8110_irq(int irq, void *dev)
error, stream_idx, error_code, fault_name, addr);
writel(error, dart->regs + DART_T8110_ERROR);
+ for (int i = 0; i < BITS_TO_U32(dart->num_streams); i++)
+ writel(U32_MAX, dart->regs + DART_T8110_ERROR_STREAMS + 4 * i);
+
return IRQ_HANDLED;
}
@@ -1133,6 +1159,7 @@ static int apple_dart_probe(struct platform_device *pdev)
dart->ias = FIELD_GET(DART_T8110_PARAMS3_VA_WIDTH, dart_params[2]);
dart->oas = FIELD_GET(DART_T8110_PARAMS3_PA_WIDTH, dart_params[2]);
dart->num_streams = FIELD_GET(DART_T8110_PARAMS4_NUM_SIDS, dart_params[3]);
+ dart->four_level = dart->ias > 36;
break;
}
@@ -1165,9 +1192,9 @@ static int apple_dart_probe(struct platform_device *pdev)
dev_info(
&pdev->dev,
- "DART [pagesize %x, %d streams, bypass support: %d, bypass forced: %d] initialized\n",
+ "DART [pagesize %x, %d streams, bypass support: %d, bypass forced: %d, AS %d -> %d] initialized\n",
dart->pgsize, dart->num_streams, dart->supports_bypass,
- dart->pgsize > PAGE_SIZE);
+ dart->pgsize > PAGE_SIZE, dart->ias, dart->oas);
return 0;
err_sysfs_remove:
@@ -1288,6 +1315,7 @@ static const struct apple_dart_hw apple_dart_hw_t8110 = {
.tcr_enabled = DART_T8110_TCR_TRANSLATE_ENABLE,
.tcr_disabled = 0,
.tcr_bypass = DART_T8110_TCR_BYPASS_DAPF | DART_T8110_TCR_BYPASS_DART,
+ .tcr_4level = DART_T8110_TCR_FOUR_LEVEL,
.ttbr = DART_T8110_TTBR,
.ttbr_valid = DART_T8110_TTBR_VALID,
@@ -1352,7 +1380,7 @@ static struct platform_driver apple_dart_driver = {
.pm = pm_sleep_ptr(&apple_dart_pm_ops),
},
.probe = apple_dart_probe,
- .remove_new = apple_dart_remove,
+ .remove = apple_dart_remove,
};
module_platform_driver(apple_dart_driver);
diff --git a/drivers/iommu/arm/Kconfig b/drivers/iommu/arm/Kconfig
new file mode 100644
index 000000000000..ef42bbe07dbe
--- /dev/null
+++ b/drivers/iommu/arm/Kconfig
@@ -0,0 +1,144 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# ARM IOMMU support
+config ARM_SMMU
+ tristate "ARM Ltd. System MMU (SMMU) Support"
+ depends on ARM64 || ARM || COMPILE_TEST
+ depends on !GENERIC_ATOMIC64 # for IOMMU_IO_PGTABLE_LPAE
+ select IOMMU_API
+ select IOMMU_IO_PGTABLE_LPAE
+ select ARM_DMA_USE_IOMMU if ARM
+ help
+ Support for implementations of the ARM System MMU architecture
+ versions 1 and 2.
+
+ Say Y here if your SoC includes an IOMMU device implementing
+ the ARM SMMU architecture.
+
+if ARM_SMMU
+config ARM_SMMU_LEGACY_DT_BINDINGS
+ bool "Support the legacy \"mmu-masters\" devicetree bindings"
+ depends on ARM_SMMU=y && OF
+ help
+ Support for the badly designed and deprecated "mmu-masters"
+ devicetree bindings. This allows some DMA masters to attach
+ to the SMMU but does not provide any support via the DMA API.
+ If you're lucky, you might be able to get VFIO up and running.
+
+ If you say Y here then you'll make me very sad. Instead, say N
+ and move your firmware to the utopian future that was 2016.
+
+config ARM_SMMU_DISABLE_BYPASS_BY_DEFAULT
+ bool "Disable unmatched stream bypass by default" if EXPERT
+ default y
+ help
+ If your firmware is broken and fails to describe StreamIDs which
+ Linux should know about in order to manage the SMMU correctly and
+ securely, and you don't want to boot with the 'arm-smmu.disable_bypass=0'
+ command line parameter, then as a last resort you can turn it off
+ by default here. But don't. This option may be removed at any time.
+
+ Note that 'arm-smmu.disable_bypass=1' will still take precedence.
+
+config ARM_SMMU_MMU_500_CPRE_ERRATA
+ bool "Enable errata workaround for CPRE in SMMU reset path"
+ default y
+ help
+ Say Y here (by default) to apply workaround to disable
+ MMU-500's next-page prefetcher for sake of 4 known errata.
+
+ Say N here only when it is sure that any errata related to
+ prefetch enablement are not applicable on the platform.
+ Refer silicon-errata.rst for info on errata IDs.
+
+config ARM_SMMU_QCOM
+ def_tristate y
+ depends on ARCH_QCOM
+ select QCOM_SCM
+ help
+ When running on a Qualcomm platform that has the custom variant
+ of the ARM SMMU, this needs to be built into the SMMU driver.
+
+config ARM_SMMU_QCOM_DEBUG
+ bool "ARM SMMU QCOM implementation defined debug support"
+ depends on ARM_SMMU_QCOM=y
+ help
+ Support for implementation specific debug features in ARM SMMU
+ hardware found in QTI platforms. This include support for
+ the Translation Buffer Units (TBU) that can be used to obtain
+ additional information when debugging memory management issues
+ like context faults.
+
+ Say Y here to enable debug for issues such as context faults
+ or TLB sync timeouts which requires implementation defined
+ register dumps.
+endif
+
+config ARM_SMMU_V3
+ tristate "ARM Ltd. System MMU Version 3 (SMMUv3) Support"
+ depends on ARM64
+ select IOMMU_API
+ select IOMMU_IO_PGTABLE_LPAE
+ select GENERIC_MSI_IRQ
+ select IOMMUFD_DRIVER if IOMMUFD
+ help
+ Support for implementations of the ARM System MMU architecture
+ version 3 providing translation support to a PCIe root complex.
+
+ Say Y here if your system includes an IOMMU device implementing
+ the ARM SMMUv3 architecture.
+
+if ARM_SMMU_V3
+config ARM_SMMU_V3_SVA
+ bool "Shared Virtual Addressing support for the ARM SMMUv3"
+ select IOMMU_SVA
+ select IOMMU_IOPF
+ select MMU_NOTIFIER
+ help
+ Support for sharing process address spaces with devices using the
+ SMMUv3.
+
+ Say Y here if your system supports SVA extensions such as PCIe PASID
+ and PRI.
+
+config ARM_SMMU_V3_IOMMUFD
+ bool "Enable IOMMUFD features for ARM SMMUv3 (EXPERIMENTAL)"
+ depends on IOMMUFD
+ help
+ Support for IOMMUFD features intended to support virtual machines
+ with accelerated virtual IOMMUs.
+
+ Say Y here if you are doing development and testing on this feature.
+
+config ARM_SMMU_V3_KUNIT_TEST
+ tristate "KUnit tests for arm-smmu-v3 driver" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ depends on ARM_SMMU_V3_SVA
+ default KUNIT_ALL_TESTS
+ help
+ Enable this option to unit-test arm-smmu-v3 driver functions.
+
+ If unsure, say N.
+
+config TEGRA241_CMDQV
+ bool "NVIDIA Tegra241 CMDQ-V extension support for ARM SMMUv3"
+ depends on ACPI
+ help
+ Support for NVIDIA CMDQ-Virtualization extension for ARM SMMUv3. The
+ CMDQ-V extension is similar to v3.3 ECMDQ for multi command queues
+ support, except with virtualization capabilities.
+
+ Say Y here if your system is NVIDIA Tegra241 (Grace) or it has the same
+ CMDQ-V extension.
+endif
+
+config QCOM_IOMMU
+ # Note: iommu drivers cannot (yet?) be built as modules
+ bool "Qualcomm IOMMU Support"
+ depends on ARCH_QCOM || COMPILE_TEST
+ depends on !GENERIC_ATOMIC64 # for IOMMU_IO_PGTABLE_LPAE
+ select QCOM_SCM
+ select IOMMU_API
+ select IOMMU_IO_PGTABLE_LPAE
+ select ARM_DMA_USE_IOMMU
+ help
+ Support for IOMMU on certain Qualcomm SoCs.
diff --git a/drivers/iommu/arm/arm-smmu-v3/Makefile b/drivers/iommu/arm/arm-smmu-v3/Makefile
index 014a997753a8..493a659cc66b 100644
--- a/drivers/iommu/arm/arm-smmu-v3/Makefile
+++ b/drivers/iommu/arm/arm-smmu-v3/Makefile
@@ -1,7 +1,8 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_ARM_SMMU_V3) += arm_smmu_v3.o
-arm_smmu_v3-objs-y += arm-smmu-v3.o
-arm_smmu_v3-objs-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o
-arm_smmu_v3-objs := $(arm_smmu_v3-objs-y)
+arm_smmu_v3-y := arm-smmu-v3.o
+arm_smmu_v3-$(CONFIG_ARM_SMMU_V3_IOMMUFD) += arm-smmu-v3-iommufd.o
+arm_smmu_v3-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o
+arm_smmu_v3-$(CONFIG_TEGRA241_CMDQV) += tegra241-cmdqv.o
obj-$(CONFIG_ARM_SMMU_V3_KUNIT_TEST) += arm-smmu-v3-test.o
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
new file mode 100644
index 000000000000..93fdadd07431
--- /dev/null
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
@@ -0,0 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+
+#include <uapi/linux/iommufd.h>
+
+#include "arm-smmu-v3.h"
+
+void *arm_smmu_hw_info(struct device *dev, u32 *length,
+ enum iommu_hw_info_type *type)
+{
+ struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+ const struct arm_smmu_impl_ops *impl_ops = master->smmu->impl_ops;
+ struct iommu_hw_info_arm_smmuv3 *info;
+ u32 __iomem *base_idr;
+ unsigned int i;
+
+ if (*type != IOMMU_HW_INFO_TYPE_DEFAULT &&
+ *type != IOMMU_HW_INFO_TYPE_ARM_SMMUV3) {
+ if (!impl_ops || !impl_ops->hw_info)
+ return ERR_PTR(-EOPNOTSUPP);
+ return impl_ops->hw_info(master->smmu, length, type);
+ }
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return ERR_PTR(-ENOMEM);
+
+ base_idr = master->smmu->base + ARM_SMMU_IDR0;
+ for (i = 0; i <= 5; i++)
+ info->idr[i] = readl_relaxed(base_idr + i);
+ info->iidr = readl_relaxed(master->smmu->base + ARM_SMMU_IIDR);
+ info->aidr = readl_relaxed(master->smmu->base + ARM_SMMU_AIDR);
+
+ *length = sizeof(*info);
+ *type = IOMMU_HW_INFO_TYPE_ARM_SMMUV3;
+
+ return info;
+}
+
+static void arm_smmu_make_nested_cd_table_ste(
+ struct arm_smmu_ste *target, struct arm_smmu_master *master,
+ struct arm_smmu_nested_domain *nested_domain, bool ats_enabled)
+{
+ arm_smmu_make_s2_domain_ste(
+ target, master, nested_domain->vsmmu->s2_parent, ats_enabled);
+
+ target->data[0] = cpu_to_le64(STRTAB_STE_0_V |
+ FIELD_PREP(STRTAB_STE_0_CFG,
+ STRTAB_STE_0_CFG_NESTED));
+ target->data[0] |= nested_domain->ste[0] &
+ ~cpu_to_le64(STRTAB_STE_0_CFG);
+ target->data[1] |= nested_domain->ste[1];
+ /* Merge events for DoS mitigations on eventq */
+ target->data[1] |= cpu_to_le64(STRTAB_STE_1_MEV);
+}
+
+/*
+ * Create a physical STE from the virtual STE that userspace provided when it
+ * created the nested domain. Using the vSTE userspace can request:
+ * - Non-valid STE
+ * - Abort STE
+ * - Bypass STE (install the S2, no CD table)
+ * - CD table STE (install the S2 and the userspace CD table)
+ */
+static void arm_smmu_make_nested_domain_ste(
+ struct arm_smmu_ste *target, struct arm_smmu_master *master,
+ struct arm_smmu_nested_domain *nested_domain, bool ats_enabled)
+{
+ unsigned int cfg =
+ FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(nested_domain->ste[0]));
+
+ /*
+ * Userspace can request a non-valid STE through the nesting interface.
+ * We relay that into an abort physical STE with the intention that
+ * C_BAD_STE for this SID can be generated to userspace.
+ */
+ if (!(nested_domain->ste[0] & cpu_to_le64(STRTAB_STE_0_V)))
+ cfg = STRTAB_STE_0_CFG_ABORT;
+
+ switch (cfg) {
+ case STRTAB_STE_0_CFG_S1_TRANS:
+ arm_smmu_make_nested_cd_table_ste(target, master, nested_domain,
+ ats_enabled);
+ break;
+ case STRTAB_STE_0_CFG_BYPASS:
+ arm_smmu_make_s2_domain_ste(target, master,
+ nested_domain->vsmmu->s2_parent,
+ ats_enabled);
+ break;
+ case STRTAB_STE_0_CFG_ABORT:
+ default:
+ arm_smmu_make_abort_ste(target);
+ break;
+ }
+}
+
+int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state,
+ struct arm_smmu_nested_domain *nested_domain)
+{
+ unsigned int cfg =
+ FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(nested_domain->ste[0]));
+ struct arm_smmu_vmaster *vmaster;
+ unsigned long vsid;
+ int ret;
+
+ iommu_group_mutex_assert(state->master->dev);
+
+ ret = iommufd_viommu_get_vdev_id(&nested_domain->vsmmu->core,
+ state->master->dev, &vsid);
+ /*
+ * Attaching to a translate nested domain must allocate a vDEVICE prior,
+ * as CD/ATS invalidations and vevents require a vSID to work properly.
+ * A abort/bypass domain is allowed to attach w/o vmaster for GBPA case.
+ */
+ if (ret) {
+ if (cfg == STRTAB_STE_0_CFG_ABORT ||
+ cfg == STRTAB_STE_0_CFG_BYPASS)
+ return 0;
+ return ret;
+ }
+
+ vmaster = kzalloc(sizeof(*vmaster), GFP_KERNEL);
+ if (!vmaster)
+ return -ENOMEM;
+ vmaster->vsmmu = nested_domain->vsmmu;
+ vmaster->vsid = vsid;
+ state->vmaster = vmaster;
+
+ return 0;
+}
+
+void arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state)
+{
+ struct arm_smmu_master *master = state->master;
+
+ mutex_lock(&master->smmu->streams_mutex);
+ kfree(master->vmaster);
+ master->vmaster = state->vmaster;
+ mutex_unlock(&master->smmu->streams_mutex);
+}
+
+void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master)
+{
+ struct arm_smmu_attach_state state = { .master = master };
+
+ arm_smmu_attach_commit_vmaster(&state);
+}
+
+static int arm_smmu_attach_dev_nested(struct iommu_domain *domain,
+ struct device *dev,
+ struct iommu_domain *old_domain)
+{
+ struct arm_smmu_nested_domain *nested_domain =
+ to_smmu_nested_domain(domain);
+ struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+ struct arm_smmu_attach_state state = {
+ .master = master,
+ .old_domain = old_domain,
+ .ssid = IOMMU_NO_PASID,
+ };
+ struct arm_smmu_ste ste;
+ int ret;
+
+ if (nested_domain->vsmmu->smmu != master->smmu)
+ return -EINVAL;
+ if (arm_smmu_ssids_in_use(&master->cd_table))
+ return -EBUSY;
+
+ mutex_lock(&arm_smmu_asid_lock);
+ /*
+ * The VM has to control the actual ATS state at the PCI device because
+ * we forward the invalidations directly from the VM. If the VM doesn't
+ * think ATS is on it will not generate ATC flushes and the ATC will
+ * become incoherent. Since we can't access the actual virtual PCI ATS
+ * config bit here base this off the EATS value in the STE. If the EATS
+ * is set then the VM must generate ATC flushes.
+ */
+ state.disable_ats = !nested_domain->enable_ats;
+ ret = arm_smmu_attach_prepare(&state, domain);
+ if (ret) {
+ mutex_unlock(&arm_smmu_asid_lock);
+ return ret;
+ }
+
+ arm_smmu_make_nested_domain_ste(&ste, master, nested_domain,
+ state.ats_enabled);
+ arm_smmu_install_ste_for_dev(master, &ste);
+ arm_smmu_attach_commit(&state);
+ mutex_unlock(&arm_smmu_asid_lock);
+ return 0;
+}
+
+static void arm_smmu_domain_nested_free(struct iommu_domain *domain)
+{
+ kfree(to_smmu_nested_domain(domain));
+}
+
+static const struct iommu_domain_ops arm_smmu_nested_ops = {
+ .attach_dev = arm_smmu_attach_dev_nested,
+ .free = arm_smmu_domain_nested_free,
+};
+
+static int arm_smmu_validate_vste(struct iommu_hwpt_arm_smmuv3 *arg,
+ bool *enable_ats)
+{
+ unsigned int eats;
+ unsigned int cfg;
+
+ if (!(arg->ste[0] & cpu_to_le64(STRTAB_STE_0_V))) {
+ memset(arg->ste, 0, sizeof(arg->ste));
+ return 0;
+ }
+
+ /* EIO is reserved for invalid STE data. */
+ if ((arg->ste[0] & ~STRTAB_STE_0_NESTING_ALLOWED) ||
+ (arg->ste[1] & ~STRTAB_STE_1_NESTING_ALLOWED))
+ return -EIO;
+
+ cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(arg->ste[0]));
+ if (cfg != STRTAB_STE_0_CFG_ABORT && cfg != STRTAB_STE_0_CFG_BYPASS &&
+ cfg != STRTAB_STE_0_CFG_S1_TRANS)
+ return -EIO;
+
+ /*
+ * Only Full ATS or ATS UR is supported
+ * The EATS field will be set by arm_smmu_make_nested_domain_ste()
+ */
+ eats = FIELD_GET(STRTAB_STE_1_EATS, le64_to_cpu(arg->ste[1]));
+ arg->ste[1] &= ~cpu_to_le64(STRTAB_STE_1_EATS);
+ if (eats != STRTAB_STE_1_EATS_ABT && eats != STRTAB_STE_1_EATS_TRANS)
+ return -EIO;
+
+ if (cfg == STRTAB_STE_0_CFG_S1_TRANS)
+ *enable_ats = (eats == STRTAB_STE_1_EATS_TRANS);
+ return 0;
+}
+
+struct iommu_domain *
+arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
+ const struct iommu_user_data *user_data)
+{
+ struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core);
+ struct arm_smmu_nested_domain *nested_domain;
+ struct iommu_hwpt_arm_smmuv3 arg;
+ bool enable_ats = false;
+ int ret;
+
+ if (flags)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ ret = iommu_copy_struct_from_user(&arg, user_data,
+ IOMMU_HWPT_DATA_ARM_SMMUV3, ste);
+ if (ret)
+ return ERR_PTR(ret);
+
+ ret = arm_smmu_validate_vste(&arg, &enable_ats);
+ if (ret)
+ return ERR_PTR(ret);
+
+ nested_domain = kzalloc(sizeof(*nested_domain), GFP_KERNEL_ACCOUNT);
+ if (!nested_domain)
+ return ERR_PTR(-ENOMEM);
+
+ nested_domain->domain.type = IOMMU_DOMAIN_NESTED;
+ nested_domain->domain.ops = &arm_smmu_nested_ops;
+ nested_domain->enable_ats = enable_ats;
+ nested_domain->vsmmu = vsmmu;
+ nested_domain->ste[0] = arg.ste[0];
+ nested_domain->ste[1] = arg.ste[1] & ~cpu_to_le64(STRTAB_STE_1_EATS);
+
+ return &nested_domain->domain;
+}
+
+static int arm_vsmmu_vsid_to_sid(struct arm_vsmmu *vsmmu, u32 vsid, u32 *sid)
+{
+ struct arm_smmu_master *master;
+ struct device *dev;
+ int ret = 0;
+
+ xa_lock(&vsmmu->core.vdevs);
+ dev = iommufd_viommu_find_dev(&vsmmu->core, (unsigned long)vsid);
+ if (!dev) {
+ ret = -EIO;
+ goto unlock;
+ }
+ master = dev_iommu_priv_get(dev);
+
+ /* At this moment, iommufd only supports PCI device that has one SID */
+ if (sid)
+ *sid = master->streams[0].id;
+unlock:
+ xa_unlock(&vsmmu->core.vdevs);
+ return ret;
+}
+
+/* This is basically iommu_viommu_arm_smmuv3_invalidate in u64 for conversion */
+struct arm_vsmmu_invalidation_cmd {
+ union {
+ u64 cmd[2];
+ struct iommu_viommu_arm_smmuv3_invalidate ucmd;
+ };
+};
+
+/*
+ * Convert, in place, the raw invalidation command into an internal format that
+ * can be passed to arm_smmu_cmdq_issue_cmdlist(). Internally commands are
+ * stored in CPU endian.
+ *
+ * Enforce the VMID or SID on the command.
+ */
+static int arm_vsmmu_convert_user_cmd(struct arm_vsmmu *vsmmu,
+ struct arm_vsmmu_invalidation_cmd *cmd)
+{
+ /* Commands are le64 stored in u64 */
+ cmd->cmd[0] = le64_to_cpu(cmd->ucmd.cmd[0]);
+ cmd->cmd[1] = le64_to_cpu(cmd->ucmd.cmd[1]);
+
+ switch (cmd->cmd[0] & CMDQ_0_OP) {
+ case CMDQ_OP_TLBI_NSNH_ALL:
+ /* Convert to NH_ALL */
+ cmd->cmd[0] = CMDQ_OP_TLBI_NH_ALL |
+ FIELD_PREP(CMDQ_TLBI_0_VMID, vsmmu->vmid);
+ cmd->cmd[1] = 0;
+ break;
+ case CMDQ_OP_TLBI_NH_VA:
+ case CMDQ_OP_TLBI_NH_VAA:
+ case CMDQ_OP_TLBI_NH_ALL:
+ case CMDQ_OP_TLBI_NH_ASID:
+ cmd->cmd[0] &= ~CMDQ_TLBI_0_VMID;
+ cmd->cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, vsmmu->vmid);
+ break;
+ case CMDQ_OP_ATC_INV:
+ case CMDQ_OP_CFGI_CD:
+ case CMDQ_OP_CFGI_CD_ALL: {
+ u32 sid, vsid = FIELD_GET(CMDQ_CFGI_0_SID, cmd->cmd[0]);
+
+ if (arm_vsmmu_vsid_to_sid(vsmmu, vsid, &sid))
+ return -EIO;
+ cmd->cmd[0] &= ~CMDQ_CFGI_0_SID;
+ cmd->cmd[0] |= FIELD_PREP(CMDQ_CFGI_0_SID, sid);
+ break;
+ }
+ default:
+ return -EIO;
+ }
+ return 0;
+}
+
+int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu,
+ struct iommu_user_data_array *array)
+{
+ struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core);
+ struct arm_smmu_device *smmu = vsmmu->smmu;
+ struct arm_vsmmu_invalidation_cmd *last;
+ struct arm_vsmmu_invalidation_cmd *cmds;
+ struct arm_vsmmu_invalidation_cmd *cur;
+ struct arm_vsmmu_invalidation_cmd *end;
+ int ret;
+
+ cmds = kcalloc(array->entry_num, sizeof(*cmds), GFP_KERNEL);
+ if (!cmds)
+ return -ENOMEM;
+ cur = cmds;
+ end = cmds + array->entry_num;
+
+ static_assert(sizeof(*cmds) == 2 * sizeof(u64));
+ ret = iommu_copy_struct_from_full_user_array(
+ cmds, sizeof(*cmds), array,
+ IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3);
+ if (ret)
+ goto out;
+
+ last = cmds;
+ while (cur != end) {
+ ret = arm_vsmmu_convert_user_cmd(vsmmu, cur);
+ if (ret)
+ goto out;
+
+ /* FIXME work in blocks of CMDQ_BATCH_ENTRIES and copy each block? */
+ cur++;
+ if (cur != end && (cur - last) != CMDQ_BATCH_ENTRIES - 1)
+ continue;
+
+ /* FIXME always uses the main cmdq rather than trying to group by type */
+ ret = arm_smmu_cmdq_issue_cmdlist(smmu, &smmu->cmdq, last->cmd,
+ cur - last, true);
+ if (ret) {
+ cur--;
+ goto out;
+ }
+ last = cur;
+ }
+out:
+ array->entry_num = cur - cmds;
+ kfree(cmds);
+ return ret;
+}
+
+static const struct iommufd_viommu_ops arm_vsmmu_ops = {
+ .alloc_domain_nested = arm_vsmmu_alloc_domain_nested,
+ .cache_invalidate = arm_vsmmu_cache_invalidate,
+};
+
+size_t arm_smmu_get_viommu_size(struct device *dev,
+ enum iommu_viommu_type viommu_type)
+{
+ struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+ struct arm_smmu_device *smmu = master->smmu;
+
+ if (!(smmu->features & ARM_SMMU_FEAT_NESTING))
+ return 0;
+
+ /*
+ * FORCE_SYNC is not set with FEAT_NESTING. Some study of the exact HW
+ * defect is needed to determine if arm_vsmmu_cache_invalidate() needs
+ * any change to remove this.
+ */
+ if (WARN_ON(smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC))
+ return 0;
+
+ /*
+ * Must support some way to prevent the VM from bypassing the cache
+ * because VFIO currently does not do any cache maintenance. canwbs
+ * indicates the device is fully coherent and no cache maintenance is
+ * ever required, even for PCI No-Snoop. S2FWB means the S1 can't make
+ * things non-coherent using the memattr, but No-Snoop behavior is not
+ * effected.
+ */
+ if (!arm_smmu_master_canwbs(master) &&
+ !(smmu->features & ARM_SMMU_FEAT_S2FWB))
+ return 0;
+
+ if (viommu_type == IOMMU_VIOMMU_TYPE_ARM_SMMUV3)
+ return VIOMMU_STRUCT_SIZE(struct arm_vsmmu, core);
+
+ if (!smmu->impl_ops || !smmu->impl_ops->get_viommu_size)
+ return 0;
+ return smmu->impl_ops->get_viommu_size(viommu_type);
+}
+
+int arm_vsmmu_init(struct iommufd_viommu *viommu,
+ struct iommu_domain *parent_domain,
+ const struct iommu_user_data *user_data)
+{
+ struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core);
+ struct arm_smmu_device *smmu =
+ container_of(viommu->iommu_dev, struct arm_smmu_device, iommu);
+ struct arm_smmu_domain *s2_parent = to_smmu_domain(parent_domain);
+
+ if (s2_parent->smmu != smmu)
+ return -EINVAL;
+
+ vsmmu->smmu = smmu;
+ vsmmu->s2_parent = s2_parent;
+ /* FIXME Move VMID allocation from the S2 domain allocation to here */
+ vsmmu->vmid = s2_parent->s2_cfg.vmid;
+
+ if (viommu->type == IOMMU_VIOMMU_TYPE_ARM_SMMUV3) {
+ viommu->ops = &arm_vsmmu_ops;
+ return 0;
+ }
+
+ return smmu->impl_ops->vsmmu_init(vsmmu, user_data);
+}
+
+int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, u64 *evt)
+{
+ struct iommu_vevent_arm_smmuv3 vevt;
+ int i;
+
+ lockdep_assert_held(&vmaster->vsmmu->smmu->streams_mutex);
+
+ vevt.evt[0] = cpu_to_le64((evt[0] & ~EVTQ_0_SID) |
+ FIELD_PREP(EVTQ_0_SID, vmaster->vsid));
+ for (i = 1; i < EVTQ_ENT_DWORDS; i++)
+ vevt.evt[i] = cpu_to_le64(evt[i]);
+
+ return iommufd_viommu_report_event(&vmaster->vsmmu->core,
+ IOMMU_VEVENTQ_TYPE_ARM_SMMUV3, &vevt,
+ sizeof(vevt));
+}
+
+MODULE_IMPORT_NS("IOMMUFD");
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index e490ffb38015..59a480974d80 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -13,103 +13,29 @@
#include "arm-smmu-v3.h"
#include "../../io-pgtable-arm.h"
-struct arm_smmu_mmu_notifier {
- struct mmu_notifier mn;
- struct arm_smmu_ctx_desc *cd;
- bool cleared;
- refcount_t refs;
- struct list_head list;
- struct arm_smmu_domain *domain;
-};
-
-#define mn_to_smmu(mn) container_of(mn, struct arm_smmu_mmu_notifier, mn)
-
-struct arm_smmu_bond {
- struct mm_struct *mm;
- struct arm_smmu_mmu_notifier *smmu_mn;
- struct list_head list;
-};
-
-#define sva_to_bond(handle) \
- container_of(handle, struct arm_smmu_bond, sva)
-
-static DEFINE_MUTEX(sva_lock);
-
-static void
+static void __maybe_unused
arm_smmu_update_s1_domain_cd_entry(struct arm_smmu_domain *smmu_domain)
{
- struct arm_smmu_master *master;
+ struct arm_smmu_master_domain *master_domain;
struct arm_smmu_cd target_cd;
unsigned long flags;
spin_lock_irqsave(&smmu_domain->devices_lock, flags);
- list_for_each_entry(master, &smmu_domain->devices, domain_head) {
+ list_for_each_entry(master_domain, &smmu_domain->devices, devices_elm) {
+ struct arm_smmu_master *master = master_domain->master;
struct arm_smmu_cd *cdptr;
- /* S1 domains only support RID attachment right now */
- cdptr = arm_smmu_get_cd_ptr(master, IOMMU_NO_PASID);
+ cdptr = arm_smmu_get_cd_ptr(master, master_domain->ssid);
if (WARN_ON(!cdptr))
continue;
arm_smmu_make_s1_cd(&target_cd, master, smmu_domain);
- arm_smmu_write_cd_entry(master, IOMMU_NO_PASID, cdptr,
+ arm_smmu_write_cd_entry(master, master_domain->ssid, cdptr,
&target_cd);
}
spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
}
-/*
- * Check if the CPU ASID is available on the SMMU side. If a private context
- * descriptor is using it, try to replace it.
- */
-static struct arm_smmu_ctx_desc *
-arm_smmu_share_asid(struct mm_struct *mm, u16 asid)
-{
- int ret;
- u32 new_asid;
- struct arm_smmu_ctx_desc *cd;
- struct arm_smmu_device *smmu;
- struct arm_smmu_domain *smmu_domain;
-
- cd = xa_load(&arm_smmu_asid_xa, asid);
- if (!cd)
- return NULL;
-
- if (cd->mm) {
- if (WARN_ON(cd->mm != mm))
- return ERR_PTR(-EINVAL);
- /* All devices bound to this mm use the same cd struct. */
- refcount_inc(&cd->refs);
- return cd;
- }
-
- smmu_domain = container_of(cd, struct arm_smmu_domain, cd);
- smmu = smmu_domain->smmu;
-
- ret = xa_alloc(&arm_smmu_asid_xa, &new_asid, cd,
- XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL);
- if (ret)
- return ERR_PTR(-ENOSPC);
- /*
- * Race with unmap: TLB invalidations will start targeting the new ASID,
- * which isn't assigned yet. We'll do an invalidate-all on the old ASID
- * later, so it doesn't matter.
- */
- cd->asid = new_asid;
- /*
- * Update ASID and invalidate CD in all associated masters. There will
- * be some overlap between use of both ASIDs, until we invalidate the
- * TLB.
- */
- arm_smmu_update_s1_domain_cd_entry(smmu_domain);
-
- /* Invalidate TLB entries previously associated with that context */
- arm_smmu_tlb_inv_asid(smmu, asid);
-
- xa_erase(&arm_smmu_asid_xa, asid);
- return NULL;
-}
-
static u64 page_size_to_cd(void)
{
static_assert(PAGE_SIZE == SZ_4K || PAGE_SIZE == SZ_16K ||
@@ -184,71 +110,17 @@ void arm_smmu_make_sva_cd(struct arm_smmu_cd *target,
* from the current CPU register
*/
target->data[3] = cpu_to_le64(read_sysreg(mair_el1));
-}
-EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_sva_cd);
-
-static struct arm_smmu_ctx_desc *arm_smmu_alloc_shared_cd(struct mm_struct *mm)
-{
- u16 asid;
- int err = 0;
- struct arm_smmu_ctx_desc *cd;
- struct arm_smmu_ctx_desc *ret = NULL;
-
- /* Don't free the mm until we release the ASID */
- mmgrab(mm);
-
- asid = arm64_mm_context_get(mm);
- if (!asid) {
- err = -ESRCH;
- goto out_drop_mm;
- }
-
- cd = kzalloc(sizeof(*cd), GFP_KERNEL);
- if (!cd) {
- err = -ENOMEM;
- goto out_put_context;
- }
-
- refcount_set(&cd->refs, 1);
-
- mutex_lock(&arm_smmu_asid_lock);
- ret = arm_smmu_share_asid(mm, asid);
- if (ret) {
- mutex_unlock(&arm_smmu_asid_lock);
- goto out_free_cd;
- }
-
- err = xa_insert(&arm_smmu_asid_xa, asid, cd, GFP_KERNEL);
- mutex_unlock(&arm_smmu_asid_lock);
-
- if (err)
- goto out_free_asid;
- cd->asid = asid;
- cd->mm = mm;
-
- return cd;
-
-out_free_asid:
- arm_smmu_free_asid(cd);
-out_free_cd:
- kfree(cd);
-out_put_context:
- arm64_mm_context_put(mm);
-out_drop_mm:
- mmdrop(mm);
- return err < 0 ? ERR_PTR(err) : ret;
-}
-
-static void arm_smmu_free_shared_cd(struct arm_smmu_ctx_desc *cd)
-{
- if (arm_smmu_free_asid(cd)) {
- /* Unpin ASID */
- arm64_mm_context_put(cd->mm);
- mmdrop(cd->mm);
- kfree(cd);
- }
+ /*
+ * Note that we don't bother with S1PIE on the SMMU, we just rely on
+ * our default encoding scheme matching direct permissions anyway.
+ * SMMU has no notion of S1POE nor GCS, so make sure that is clear if
+ * either is enabled for CPUs, just in case anyone imagines otherwise.
+ */
+ if (system_supports_poe() || system_supports_gcs())
+ dev_warn_once(master->smmu->dev, "SVA devices ignore permission overlays and GCS\n");
}
+EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_sva_cd);
/*
* Cloned from the MAX_TLBI_OPS in arch/arm64/include/asm/tlbflush.h, this
@@ -264,8 +136,8 @@ static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn,
unsigned long start,
unsigned long end)
{
- struct arm_smmu_mmu_notifier *smmu_mn = mn_to_smmu(mn);
- struct arm_smmu_domain *smmu_domain = smmu_mn->domain;
+ struct arm_smmu_domain *smmu_domain =
+ container_of(mn, struct arm_smmu_domain, mmu_notifier);
size_t size;
/*
@@ -282,62 +154,50 @@ static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn,
size = 0;
}
- if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_BTM)) {
- if (!size)
- arm_smmu_tlb_inv_asid(smmu_domain->smmu,
- smmu_mn->cd->asid);
- else
- arm_smmu_tlb_inv_range_asid(start, size,
- smmu_mn->cd->asid,
- PAGE_SIZE, false,
- smmu_domain);
- }
+ if (!size)
+ arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_domain->cd.asid);
+ else
+ arm_smmu_tlb_inv_range_asid(start, size, smmu_domain->cd.asid,
+ PAGE_SIZE, false, smmu_domain);
- arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), start,
- size);
+ arm_smmu_atc_inv_domain(smmu_domain, start, size);
}
static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
- struct arm_smmu_mmu_notifier *smmu_mn = mn_to_smmu(mn);
- struct arm_smmu_domain *smmu_domain = smmu_mn->domain;
- struct arm_smmu_master *master;
+ struct arm_smmu_domain *smmu_domain =
+ container_of(mn, struct arm_smmu_domain, mmu_notifier);
+ struct arm_smmu_master_domain *master_domain;
unsigned long flags;
- mutex_lock(&sva_lock);
- if (smmu_mn->cleared) {
- mutex_unlock(&sva_lock);
- return;
- }
-
/*
* DMA may still be running. Keep the cd valid to avoid C_BAD_CD events,
* but disable translation.
*/
spin_lock_irqsave(&smmu_domain->devices_lock, flags);
- list_for_each_entry(master, &smmu_domain->devices, domain_head) {
+ list_for_each_entry(master_domain, &smmu_domain->devices,
+ devices_elm) {
+ struct arm_smmu_master *master = master_domain->master;
struct arm_smmu_cd target;
struct arm_smmu_cd *cdptr;
- cdptr = arm_smmu_get_cd_ptr(master, mm_get_enqcmd_pasid(mm));
+ cdptr = arm_smmu_get_cd_ptr(master, master_domain->ssid);
if (WARN_ON(!cdptr))
continue;
- arm_smmu_make_sva_cd(&target, master, NULL, smmu_mn->cd->asid);
- arm_smmu_write_cd_entry(master, mm_get_enqcmd_pasid(mm), cdptr,
+ arm_smmu_make_sva_cd(&target, master, NULL,
+ smmu_domain->cd.asid);
+ arm_smmu_write_cd_entry(master, master_domain->ssid, cdptr,
&target);
}
spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
- arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_mn->cd->asid);
- arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), 0, 0);
-
- smmu_mn->cleared = true;
- mutex_unlock(&sva_lock);
+ arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_domain->cd.asid);
+ arm_smmu_atc_inv_domain(smmu_domain, 0, 0);
}
static void arm_smmu_mmu_notifier_free(struct mmu_notifier *mn)
{
- kfree(mn_to_smmu(mn));
+ kfree(container_of(mn, struct arm_smmu_domain, mmu_notifier));
}
static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = {
@@ -346,127 +206,6 @@ static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = {
.free_notifier = arm_smmu_mmu_notifier_free,
};
-/* Allocate or get existing MMU notifier for this {domain, mm} pair */
-static struct arm_smmu_mmu_notifier *
-arm_smmu_mmu_notifier_get(struct arm_smmu_domain *smmu_domain,
- struct mm_struct *mm)
-{
- int ret;
- struct arm_smmu_ctx_desc *cd;
- struct arm_smmu_mmu_notifier *smmu_mn;
-
- list_for_each_entry(smmu_mn, &smmu_domain->mmu_notifiers, list) {
- if (smmu_mn->mn.mm == mm) {
- refcount_inc(&smmu_mn->refs);
- return smmu_mn;
- }
- }
-
- cd = arm_smmu_alloc_shared_cd(mm);
- if (IS_ERR(cd))
- return ERR_CAST(cd);
-
- smmu_mn = kzalloc(sizeof(*smmu_mn), GFP_KERNEL);
- if (!smmu_mn) {
- ret = -ENOMEM;
- goto err_free_cd;
- }
-
- refcount_set(&smmu_mn->refs, 1);
- smmu_mn->cd = cd;
- smmu_mn->domain = smmu_domain;
- smmu_mn->mn.ops = &arm_smmu_mmu_notifier_ops;
-
- ret = mmu_notifier_register(&smmu_mn->mn, mm);
- if (ret) {
- kfree(smmu_mn);
- goto err_free_cd;
- }
-
- list_add(&smmu_mn->list, &smmu_domain->mmu_notifiers);
- return smmu_mn;
-
-err_free_cd:
- arm_smmu_free_shared_cd(cd);
- return ERR_PTR(ret);
-}
-
-static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn)
-{
- struct mm_struct *mm = smmu_mn->mn.mm;
- struct arm_smmu_ctx_desc *cd = smmu_mn->cd;
- struct arm_smmu_domain *smmu_domain = smmu_mn->domain;
-
- if (!refcount_dec_and_test(&smmu_mn->refs))
- return;
-
- list_del(&smmu_mn->list);
-
- /*
- * If we went through clear(), we've already invalidated, and no
- * new TLB entry can have been formed.
- */
- if (!smmu_mn->cleared) {
- arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid);
- arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), 0,
- 0);
- }
-
- /* Frees smmu_mn */
- mmu_notifier_put(&smmu_mn->mn);
- arm_smmu_free_shared_cd(cd);
-}
-
-static int __arm_smmu_sva_bind(struct device *dev, ioasid_t pasid,
- struct mm_struct *mm)
-{
- int ret;
- struct arm_smmu_cd target;
- struct arm_smmu_cd *cdptr;
- struct arm_smmu_bond *bond;
- struct arm_smmu_master *master = dev_iommu_priv_get(dev);
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
- struct arm_smmu_domain *smmu_domain;
-
- if (!(domain->type & __IOMMU_DOMAIN_PAGING))
- return -ENODEV;
- smmu_domain = to_smmu_domain(domain);
- if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1)
- return -ENODEV;
-
- if (!master || !master->sva_enabled)
- return -ENODEV;
-
- bond = kzalloc(sizeof(*bond), GFP_KERNEL);
- if (!bond)
- return -ENOMEM;
-
- bond->mm = mm;
-
- bond->smmu_mn = arm_smmu_mmu_notifier_get(smmu_domain, mm);
- if (IS_ERR(bond->smmu_mn)) {
- ret = PTR_ERR(bond->smmu_mn);
- goto err_free_bond;
- }
-
- cdptr = arm_smmu_alloc_cd_ptr(master, mm_get_enqcmd_pasid(mm));
- if (!cdptr) {
- ret = -ENOMEM;
- goto err_put_notifier;
- }
- arm_smmu_make_sva_cd(&target, master, mm, bond->smmu_mn->cd->asid);
- arm_smmu_write_cd_entry(master, pasid, cdptr, &target);
-
- list_add(&bond->list, &master->bonds);
- return 0;
-
-err_put_notifier:
- arm_smmu_mmu_notifier_put(bond->smmu_mn);
-err_free_bond:
- kfree(bond);
- return ret;
-}
-
bool arm_smmu_sva_supported(struct arm_smmu_device *smmu)
{
unsigned long reg, fld;
@@ -474,8 +213,15 @@ bool arm_smmu_sva_supported(struct arm_smmu_device *smmu)
unsigned long asid_bits;
u32 feat_mask = ARM_SMMU_FEAT_COHERENCY;
- if (vabits_actual == 52)
+ if (vabits_actual == 52) {
+ /* We don't support LPA2 */
+ if (PAGE_SIZE != SZ_64K)
+ return false;
feat_mask |= ARM_SMMU_FEAT_VAX;
+ }
+
+ if (system_supports_bbml2_noabort())
+ feat_mask |= ARM_SMMU_FEAT_BBML2;
if ((smmu->features & feat_mask) != feat_mask)
return false;
@@ -512,89 +258,6 @@ bool arm_smmu_sva_supported(struct arm_smmu_device *smmu)
return true;
}
-bool arm_smmu_master_iopf_supported(struct arm_smmu_master *master)
-{
- /* We're not keeping track of SIDs in fault events */
- if (master->num_streams != 1)
- return false;
-
- return master->stall_enabled;
-}
-
-bool arm_smmu_master_sva_supported(struct arm_smmu_master *master)
-{
- if (!(master->smmu->features & ARM_SMMU_FEAT_SVA))
- return false;
-
- /* SSID support is mandatory for the moment */
- return master->ssid_bits;
-}
-
-bool arm_smmu_master_sva_enabled(struct arm_smmu_master *master)
-{
- bool enabled;
-
- mutex_lock(&sva_lock);
- enabled = master->sva_enabled;
- mutex_unlock(&sva_lock);
- return enabled;
-}
-
-static int arm_smmu_master_sva_enable_iopf(struct arm_smmu_master *master)
-{
- struct device *dev = master->dev;
-
- /*
- * Drivers for devices supporting PRI or stall should enable IOPF first.
- * Others have device-specific fault handlers and don't need IOPF.
- */
- if (!arm_smmu_master_iopf_supported(master))
- return 0;
-
- if (!master->iopf_enabled)
- return -EINVAL;
-
- return iopf_queue_add_device(master->smmu->evtq.iopf, dev);
-}
-
-static void arm_smmu_master_sva_disable_iopf(struct arm_smmu_master *master)
-{
- struct device *dev = master->dev;
-
- if (!master->iopf_enabled)
- return;
-
- iopf_queue_remove_device(master->smmu->evtq.iopf, dev);
-}
-
-int arm_smmu_master_enable_sva(struct arm_smmu_master *master)
-{
- int ret;
-
- mutex_lock(&sva_lock);
- ret = arm_smmu_master_sva_enable_iopf(master);
- if (!ret)
- master->sva_enabled = true;
- mutex_unlock(&sva_lock);
-
- return ret;
-}
-
-int arm_smmu_master_disable_sva(struct arm_smmu_master *master)
-{
- mutex_lock(&sva_lock);
- if (!list_empty(&master->bonds)) {
- dev_err(master->dev, "cannot disable SVA, device is bound\n");
- mutex_unlock(&sva_lock);
- return -EBUSY;
- }
- arm_smmu_master_sva_disable_iopf(master);
- master->sva_enabled = false;
- mutex_unlock(&sva_lock);
-
- return 0;
-}
-
void arm_smmu_sva_notifier_synchronize(void)
{
/*
@@ -604,51 +267,55 @@ void arm_smmu_sva_notifier_synchronize(void)
mmu_notifier_synchronize();
}
-void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain,
- struct device *dev, ioasid_t id)
+static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t id,
+ struct iommu_domain *old)
{
- struct mm_struct *mm = domain->mm;
- struct arm_smmu_bond *bond = NULL, *t;
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+ struct arm_smmu_cd target;
+ int ret;
- mutex_lock(&sva_lock);
-
- arm_smmu_clear_cd(master, id);
-
- list_for_each_entry(t, &master->bonds, list) {
- if (t->mm == mm) {
- bond = t;
- break;
- }
- }
-
- if (!WARN_ON(!bond)) {
- list_del(&bond->list);
- arm_smmu_mmu_notifier_put(bond->smmu_mn);
- kfree(bond);
- }
- mutex_unlock(&sva_lock);
-}
-
-static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain,
- struct device *dev, ioasid_t id)
-{
- int ret = 0;
- struct mm_struct *mm = domain->mm;
+ if (!(master->smmu->features & ARM_SMMU_FEAT_SVA))
+ return -EOPNOTSUPP;
- if (mm_get_enqcmd_pasid(mm) != id)
+ /* Prevent arm_smmu_mm_release from being called while we are attaching */
+ if (!mmget_not_zero(domain->mm))
return -EINVAL;
- mutex_lock(&sva_lock);
- ret = __arm_smmu_sva_bind(dev, id, mm);
- mutex_unlock(&sva_lock);
+ /*
+ * This does not need the arm_smmu_asid_lock because SVA domains never
+ * get reassigned
+ */
+ arm_smmu_make_sva_cd(&target, master, domain->mm, smmu_domain->cd.asid);
+ ret = arm_smmu_set_pasid(master, smmu_domain, id, &target, old);
+ mmput(domain->mm);
return ret;
}
static void arm_smmu_sva_domain_free(struct iommu_domain *domain)
{
- kfree(domain);
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+ /*
+ * Ensure the ASID is empty in the iommu cache before allowing reuse.
+ */
+ arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_domain->cd.asid);
+
+ /*
+ * Notice that the arm_smmu_mm_arch_invalidate_secondary_tlbs op can
+ * still be called/running at this point. We allow the ASID to be
+ * reused, and if there is a race then it just suffers harmless
+ * unnecessary invalidation.
+ */
+ xa_erase(&arm_smmu_asid_xa, smmu_domain->cd.asid);
+
+ /*
+ * Actual free is defered to the SRCU callback
+ * arm_smmu_mmu_notifier_free()
+ */
+ mmu_notifier_put(&smmu_domain->mmu_notifier);
}
static const struct iommu_domain_ops arm_smmu_sva_domain_ops = {
@@ -656,14 +323,47 @@ static const struct iommu_domain_ops arm_smmu_sva_domain_ops = {
.free = arm_smmu_sva_domain_free
};
-struct iommu_domain *arm_smmu_sva_domain_alloc(void)
+struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev,
+ struct mm_struct *mm)
{
- struct iommu_domain *domain;
+ struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+ struct arm_smmu_device *smmu = master->smmu;
+ struct arm_smmu_domain *smmu_domain;
+ u32 asid;
+ int ret;
+
+ if (!(master->smmu->features & ARM_SMMU_FEAT_SVA))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ smmu_domain = arm_smmu_domain_alloc();
+ if (IS_ERR(smmu_domain))
+ return ERR_CAST(smmu_domain);
+ smmu_domain->domain.type = IOMMU_DOMAIN_SVA;
+ smmu_domain->domain.ops = &arm_smmu_sva_domain_ops;
- domain = kzalloc(sizeof(*domain), GFP_KERNEL);
- if (!domain)
- return NULL;
- domain->ops = &arm_smmu_sva_domain_ops;
+ /*
+ * Choose page_size as the leaf page size for invalidation when
+ * ARM_SMMU_FEAT_RANGE_INV is present
+ */
+ smmu_domain->domain.pgsize_bitmap = PAGE_SIZE;
+ smmu_domain->smmu = smmu;
- return domain;
+ ret = xa_alloc(&arm_smmu_asid_xa, &asid, smmu_domain,
+ XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL);
+ if (ret)
+ goto err_free;
+
+ smmu_domain->cd.asid = asid;
+ smmu_domain->mmu_notifier.ops = &arm_smmu_mmu_notifier_ops;
+ ret = mmu_notifier_register(&smmu_domain->mmu_notifier, mm);
+ if (ret)
+ goto err_asid;
+
+ return &smmu_domain->domain;
+
+err_asid:
+ xa_erase(&arm_smmu_asid_xa, smmu_domain->cd.asid);
+err_free:
+ kfree(smmu_domain);
+ return ERR_PTR(ret);
}
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c
index 315e487fd990..d2671bfd3798 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c
@@ -30,6 +30,11 @@ static struct mm_struct sva_mm = {
.pgd = (void *)0xdaedbeefdeadbeefULL,
};
+enum arm_smmu_test_master_feat {
+ ARM_SMMU_MASTER_TEST_ATS = BIT(0),
+ ARM_SMMU_MASTER_TEST_STALL = BIT(1),
+};
+
static bool arm_smmu_entry_differs_in_used_bits(const __le64 *entry,
const __le64 *used_bits,
const __le64 *target,
@@ -144,6 +149,14 @@ static void arm_smmu_v3_test_ste_expect_transition(
KUNIT_EXPECT_MEMEQ(test, target->data, cur_copy.data, sizeof(cur_copy));
}
+static void arm_smmu_v3_test_ste_expect_non_hitless_transition(
+ struct kunit *test, const struct arm_smmu_ste *cur,
+ const struct arm_smmu_ste *target, unsigned int num_syncs_expected)
+{
+ arm_smmu_v3_test_ste_expect_transition(test, cur, target,
+ num_syncs_expected, false);
+}
+
static void arm_smmu_v3_test_ste_expect_hitless_transition(
struct kunit *test, const struct arm_smmu_ste *cur,
const struct arm_smmu_ste *target, unsigned int num_syncs_expected)
@@ -155,16 +168,23 @@ static void arm_smmu_v3_test_ste_expect_hitless_transition(
static const dma_addr_t fake_cdtab_dma_addr = 0xF0F0F0F0F0F0;
static void arm_smmu_test_make_cdtable_ste(struct arm_smmu_ste *ste,
- const dma_addr_t dma_addr)
+ unsigned int s1dss,
+ const dma_addr_t dma_addr,
+ enum arm_smmu_test_master_feat feat)
{
+ bool ats_enabled = feat & ARM_SMMU_MASTER_TEST_ATS;
+ bool stall_enabled = feat & ARM_SMMU_MASTER_TEST_STALL;
+
struct arm_smmu_master master = {
+ .ats_enabled = ats_enabled,
.cd_table.cdtab_dma = dma_addr,
.cd_table.s1cdmax = 0xFF,
.cd_table.s1fmt = STRTAB_STE_0_S1FMT_64K_L2,
.smmu = &smmu,
+ .stall_enabled = stall_enabled,
};
- arm_smmu_make_cdtable_ste(ste, &master);
+ arm_smmu_make_cdtable_ste(ste, &master, ats_enabled, s1dss);
}
static void arm_smmu_v3_write_ste_test_bypass_to_abort(struct kunit *test)
@@ -194,7 +214,8 @@ static void arm_smmu_v3_write_ste_test_cdtable_to_abort(struct kunit *test)
{
struct arm_smmu_ste ste;
- arm_smmu_test_make_cdtable_ste(&ste, fake_cdtab_dma_addr);
+ arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0,
+ fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS);
arm_smmu_v3_test_ste_expect_hitless_transition(test, &ste, &abort_ste,
NUM_EXPECTED_SYNCS(2));
}
@@ -203,7 +224,8 @@ static void arm_smmu_v3_write_ste_test_abort_to_cdtable(struct kunit *test)
{
struct arm_smmu_ste ste;
- arm_smmu_test_make_cdtable_ste(&ste, fake_cdtab_dma_addr);
+ arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0,
+ fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS);
arm_smmu_v3_test_ste_expect_hitless_transition(test, &abort_ste, &ste,
NUM_EXPECTED_SYNCS(2));
}
@@ -212,7 +234,8 @@ static void arm_smmu_v3_write_ste_test_cdtable_to_bypass(struct kunit *test)
{
struct arm_smmu_ste ste;
- arm_smmu_test_make_cdtable_ste(&ste, fake_cdtab_dma_addr);
+ arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0,
+ fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS);
arm_smmu_v3_test_ste_expect_hitless_transition(test, &ste, &bypass_ste,
NUM_EXPECTED_SYNCS(3));
}
@@ -221,17 +244,63 @@ static void arm_smmu_v3_write_ste_test_bypass_to_cdtable(struct kunit *test)
{
struct arm_smmu_ste ste;
- arm_smmu_test_make_cdtable_ste(&ste, fake_cdtab_dma_addr);
+ arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0,
+ fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS);
arm_smmu_v3_test_ste_expect_hitless_transition(test, &bypass_ste, &ste,
NUM_EXPECTED_SYNCS(3));
}
+static void arm_smmu_v3_write_ste_test_cdtable_s1dss_change(struct kunit *test)
+{
+ struct arm_smmu_ste ste;
+ struct arm_smmu_ste s1dss_bypass;
+
+ arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0,
+ fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS);
+ arm_smmu_test_make_cdtable_ste(&s1dss_bypass, STRTAB_STE_1_S1DSS_BYPASS,
+ fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS);
+
+ /*
+ * Flipping s1dss on a CD table STE only involves changes to the second
+ * qword of an STE and can be done in a single write.
+ */
+ arm_smmu_v3_test_ste_expect_hitless_transition(
+ test, &ste, &s1dss_bypass, NUM_EXPECTED_SYNCS(1));
+ arm_smmu_v3_test_ste_expect_hitless_transition(
+ test, &s1dss_bypass, &ste, NUM_EXPECTED_SYNCS(1));
+}
+
+static void
+arm_smmu_v3_write_ste_test_s1dssbypass_to_stebypass(struct kunit *test)
+{
+ struct arm_smmu_ste s1dss_bypass;
+
+ arm_smmu_test_make_cdtable_ste(&s1dss_bypass, STRTAB_STE_1_S1DSS_BYPASS,
+ fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS);
+ arm_smmu_v3_test_ste_expect_hitless_transition(
+ test, &s1dss_bypass, &bypass_ste, NUM_EXPECTED_SYNCS(2));
+}
+
+static void
+arm_smmu_v3_write_ste_test_stebypass_to_s1dssbypass(struct kunit *test)
+{
+ struct arm_smmu_ste s1dss_bypass;
+
+ arm_smmu_test_make_cdtable_ste(&s1dss_bypass, STRTAB_STE_1_S1DSS_BYPASS,
+ fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS);
+ arm_smmu_v3_test_ste_expect_hitless_transition(
+ test, &bypass_ste, &s1dss_bypass, NUM_EXPECTED_SYNCS(2));
+}
+
static void arm_smmu_test_make_s2_ste(struct arm_smmu_ste *ste,
- bool ats_enabled)
+ enum arm_smmu_test_master_feat feat)
{
+ bool ats_enabled = feat & ARM_SMMU_MASTER_TEST_ATS;
+ bool stall_enabled = feat & ARM_SMMU_MASTER_TEST_STALL;
struct arm_smmu_master master = {
- .smmu = &smmu,
.ats_enabled = ats_enabled,
+ .smmu = &smmu,
+ .stall_enabled = stall_enabled,
};
struct io_pgtable io_pgtable = {};
struct arm_smmu_domain smmu_domain = {
@@ -247,14 +316,14 @@ static void arm_smmu_test_make_s2_ste(struct arm_smmu_ste *ste,
io_pgtable.cfg.arm_lpae_s2_cfg.vtcr.sl = 3;
io_pgtable.cfg.arm_lpae_s2_cfg.vtcr.tsz = 4;
- arm_smmu_make_s2_domain_ste(ste, &master, &smmu_domain);
+ arm_smmu_make_s2_domain_ste(ste, &master, &smmu_domain, ats_enabled);
}
static void arm_smmu_v3_write_ste_test_s2_to_abort(struct kunit *test)
{
struct arm_smmu_ste ste;
- arm_smmu_test_make_s2_ste(&ste, true);
+ arm_smmu_test_make_s2_ste(&ste, ARM_SMMU_MASTER_TEST_ATS);
arm_smmu_v3_test_ste_expect_hitless_transition(test, &ste, &abort_ste,
NUM_EXPECTED_SYNCS(2));
}
@@ -263,7 +332,7 @@ static void arm_smmu_v3_write_ste_test_abort_to_s2(struct kunit *test)
{
struct arm_smmu_ste ste;
- arm_smmu_test_make_s2_ste(&ste, true);
+ arm_smmu_test_make_s2_ste(&ste, ARM_SMMU_MASTER_TEST_ATS);
arm_smmu_v3_test_ste_expect_hitless_transition(test, &abort_ste, &ste,
NUM_EXPECTED_SYNCS(2));
}
@@ -272,7 +341,7 @@ static void arm_smmu_v3_write_ste_test_s2_to_bypass(struct kunit *test)
{
struct arm_smmu_ste ste;
- arm_smmu_test_make_s2_ste(&ste, true);
+ arm_smmu_test_make_s2_ste(&ste, ARM_SMMU_MASTER_TEST_ATS);
arm_smmu_v3_test_ste_expect_hitless_transition(test, &ste, &bypass_ste,
NUM_EXPECTED_SYNCS(2));
}
@@ -281,11 +350,53 @@ static void arm_smmu_v3_write_ste_test_bypass_to_s2(struct kunit *test)
{
struct arm_smmu_ste ste;
- arm_smmu_test_make_s2_ste(&ste, true);
+ arm_smmu_test_make_s2_ste(&ste, ARM_SMMU_MASTER_TEST_ATS);
arm_smmu_v3_test_ste_expect_hitless_transition(test, &bypass_ste, &ste,
NUM_EXPECTED_SYNCS(2));
}
+static void arm_smmu_v3_write_ste_test_s1_to_s2(struct kunit *test)
+{
+ struct arm_smmu_ste s1_ste;
+ struct arm_smmu_ste s2_ste;
+
+ arm_smmu_test_make_cdtable_ste(&s1_ste, STRTAB_STE_1_S1DSS_SSID0,
+ fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS);
+ arm_smmu_test_make_s2_ste(&s2_ste, ARM_SMMU_MASTER_TEST_ATS);
+ arm_smmu_v3_test_ste_expect_hitless_transition(test, &s1_ste, &s2_ste,
+ NUM_EXPECTED_SYNCS(3));
+}
+
+static void arm_smmu_v3_write_ste_test_s2_to_s1(struct kunit *test)
+{
+ struct arm_smmu_ste s1_ste;
+ struct arm_smmu_ste s2_ste;
+
+ arm_smmu_test_make_cdtable_ste(&s1_ste, STRTAB_STE_1_S1DSS_SSID0,
+ fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS);
+ arm_smmu_test_make_s2_ste(&s2_ste, ARM_SMMU_MASTER_TEST_ATS);
+ arm_smmu_v3_test_ste_expect_hitless_transition(test, &s2_ste, &s1_ste,
+ NUM_EXPECTED_SYNCS(3));
+}
+
+static void arm_smmu_v3_write_ste_test_non_hitless(struct kunit *test)
+{
+ struct arm_smmu_ste ste;
+ struct arm_smmu_ste ste_2;
+
+ /*
+ * Although no flow resembles this in practice, one way to force an STE
+ * update to be non-hitless is to change its CD table pointer as well as
+ * s1 dss field in the same update.
+ */
+ arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0,
+ fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS);
+ arm_smmu_test_make_cdtable_ste(&ste_2, STRTAB_STE_1_S1DSS_BYPASS,
+ 0x4B4B4b4B4B, ARM_SMMU_MASTER_TEST_ATS);
+ arm_smmu_v3_test_ste_expect_non_hitless_transition(
+ test, &ste, &ste_2, NUM_EXPECTED_SYNCS(3));
+}
+
static void arm_smmu_v3_test_cd_expect_transition(
struct kunit *test, const struct arm_smmu_cd *cur,
const struct arm_smmu_cd *target, unsigned int num_syncs_expected,
@@ -407,6 +518,30 @@ static void arm_smmu_test_make_sva_release_cd(struct arm_smmu_cd *cd,
arm_smmu_make_sva_cd(cd, &master, NULL, asid);
}
+static void arm_smmu_v3_write_ste_test_s1_to_s2_stall(struct kunit *test)
+{
+ struct arm_smmu_ste s1_ste;
+ struct arm_smmu_ste s2_ste;
+
+ arm_smmu_test_make_cdtable_ste(&s1_ste, STRTAB_STE_1_S1DSS_SSID0,
+ fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_STALL);
+ arm_smmu_test_make_s2_ste(&s2_ste, ARM_SMMU_MASTER_TEST_STALL);
+ arm_smmu_v3_test_ste_expect_hitless_transition(test, &s1_ste, &s2_ste,
+ NUM_EXPECTED_SYNCS(3));
+}
+
+static void arm_smmu_v3_write_ste_test_s2_to_s1_stall(struct kunit *test)
+{
+ struct arm_smmu_ste s1_ste;
+ struct arm_smmu_ste s2_ste;
+
+ arm_smmu_test_make_cdtable_ste(&s1_ste, STRTAB_STE_1_S1DSS_SSID0,
+ fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_STALL);
+ arm_smmu_test_make_s2_ste(&s2_ste, ARM_SMMU_MASTER_TEST_STALL);
+ arm_smmu_v3_test_ste_expect_hitless_transition(test, &s2_ste, &s1_ste,
+ NUM_EXPECTED_SYNCS(3));
+}
+
static void arm_smmu_v3_write_cd_test_sva_clear(struct kunit *test)
{
struct arm_smmu_cd cd = {};
@@ -439,12 +574,20 @@ static struct kunit_case arm_smmu_v3_test_cases[] = {
KUNIT_CASE(arm_smmu_v3_write_ste_test_abort_to_cdtable),
KUNIT_CASE(arm_smmu_v3_write_ste_test_cdtable_to_bypass),
KUNIT_CASE(arm_smmu_v3_write_ste_test_bypass_to_cdtable),
+ KUNIT_CASE(arm_smmu_v3_write_ste_test_cdtable_s1dss_change),
+ KUNIT_CASE(arm_smmu_v3_write_ste_test_s1dssbypass_to_stebypass),
+ KUNIT_CASE(arm_smmu_v3_write_ste_test_stebypass_to_s1dssbypass),
KUNIT_CASE(arm_smmu_v3_write_ste_test_s2_to_abort),
KUNIT_CASE(arm_smmu_v3_write_ste_test_abort_to_s2),
KUNIT_CASE(arm_smmu_v3_write_ste_test_s2_to_bypass),
KUNIT_CASE(arm_smmu_v3_write_ste_test_bypass_to_s2),
+ KUNIT_CASE(arm_smmu_v3_write_ste_test_s1_to_s2),
+ KUNIT_CASE(arm_smmu_v3_write_ste_test_s2_to_s1),
+ KUNIT_CASE(arm_smmu_v3_write_ste_test_non_hitless),
KUNIT_CASE(arm_smmu_v3_write_cd_test_s1_clear),
KUNIT_CASE(arm_smmu_v3_write_cd_test_s1_change_asid),
+ KUNIT_CASE(arm_smmu_v3_write_ste_test_s1_to_s2_stall),
+ KUNIT_CASE(arm_smmu_v3_write_ste_test_s2_to_s1_stall),
KUNIT_CASE(arm_smmu_v3_write_cd_test_sva_clear),
KUNIT_CASE(arm_smmu_v3_write_cd_test_sva_release),
{},
@@ -464,5 +607,6 @@ static struct kunit_suite arm_smmu_v3_test_module = {
};
kunit_test_suites(&arm_smmu_v3_test_module);
-MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING);
+MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING");
+MODULE_DESCRIPTION("KUnit tests for arm-smmu-v3 driver");
MODULE_LICENSE("GPL v2");
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index ab415e107054..d16d35c78c06 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -26,7 +26,9 @@
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/platform_device.h>
+#include <linux/string_choices.h>
#include <kunit/visibility.h>
+#include <uapi/linux/iommufd.h>
#include "arm-smmu-v3.h"
#include "../../dma-iommu.h"
@@ -36,6 +38,9 @@ module_param(disable_msipolling, bool, 0444);
MODULE_PARM_DESC(disable_msipolling,
"Disable MSI-based polling for CMD_SYNC completion.");
+static const struct iommu_ops arm_smmu_ops;
+static struct iommu_dirty_ops arm_smmu_dirty_ops;
+
enum arm_smmu_msi_index {
EVTQ_MSI_INDEX,
GERROR_MSI_INDEX,
@@ -79,8 +84,28 @@ static struct arm_smmu_option_prop arm_smmu_options[] = {
{ 0, NULL},
};
-static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain,
- struct arm_smmu_device *smmu);
+static const char * const event_str[] = {
+ [EVT_ID_BAD_STREAMID_CONFIG] = "C_BAD_STREAMID",
+ [EVT_ID_STE_FETCH_FAULT] = "F_STE_FETCH",
+ [EVT_ID_BAD_STE_CONFIG] = "C_BAD_STE",
+ [EVT_ID_STREAM_DISABLED_FAULT] = "F_STREAM_DISABLED",
+ [EVT_ID_BAD_SUBSTREAMID_CONFIG] = "C_BAD_SUBSTREAMID",
+ [EVT_ID_CD_FETCH_FAULT] = "F_CD_FETCH",
+ [EVT_ID_BAD_CD_CONFIG] = "C_BAD_CD",
+ [EVT_ID_TRANSLATION_FAULT] = "F_TRANSLATION",
+ [EVT_ID_ADDR_SIZE_FAULT] = "F_ADDR_SIZE",
+ [EVT_ID_ACCESS_FAULT] = "F_ACCESS",
+ [EVT_ID_PERMISSION_FAULT] = "F_PERMISSION",
+ [EVT_ID_VMS_FETCH_FAULT] = "F_VMS_FETCH",
+};
+
+static const char * const event_class_str[] = {
+ [0] = "CD fetch",
+ [1] = "Stage 1 translation table fetch",
+ [2] = "Input address caused fault",
+ [3] = "Reserved",
+};
+
static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master);
static void parse_driver_options(struct arm_smmu_device *smmu)
@@ -291,6 +316,7 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
case CMDQ_OP_TLBI_NH_ASID:
cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
fallthrough;
+ case CMDQ_OP_TLBI_NH_ALL:
case CMDQ_OP_TLBI_S12_VMALL:
cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
break;
@@ -342,14 +368,30 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
return 0;
}
-static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu)
+static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq_ent *ent)
+{
+ struct arm_smmu_cmdq *cmdq = NULL;
+
+ if (smmu->impl_ops && smmu->impl_ops->get_secondary_cmdq)
+ cmdq = smmu->impl_ops->get_secondary_cmdq(smmu, ent);
+
+ return cmdq ?: &smmu->cmdq;
+}
+
+static bool arm_smmu_cmdq_needs_busy_polling(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq *cmdq)
{
- return &smmu->cmdq;
+ if (cmdq == &smmu->cmdq)
+ return false;
+
+ return smmu->options & ARM_SMMU_OPT_TEGRA241_CMDQV;
}
static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
- struct arm_smmu_queue *q, u32 prod)
+ struct arm_smmu_cmdq *cmdq, u32 prod)
{
+ struct arm_smmu_queue *q = &cmdq->q;
struct arm_smmu_cmdq_ent ent = {
.opcode = CMDQ_OP_CMD_SYNC,
};
@@ -364,10 +406,12 @@ static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
}
arm_smmu_cmdq_build_cmd(cmd, &ent);
+ if (arm_smmu_cmdq_needs_busy_polling(smmu, cmdq))
+ u64p_replace_bits(cmd, CMDQ_SYNC_0_CS_NONE, CMDQ_SYNC_0_CS);
}
-static void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu,
- struct arm_smmu_queue *q)
+void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq *cmdq)
{
static const char * const cerror_str[] = {
[CMDQ_ERR_CERROR_NONE_IDX] = "No error",
@@ -375,6 +419,7 @@ static void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu,
[CMDQ_ERR_CERROR_ABT_IDX] = "Abort on command fetch",
[CMDQ_ERR_CERROR_ATC_INV_IDX] = "ATC invalidate timeout",
};
+ struct arm_smmu_queue *q = &cmdq->q;
int i;
u64 cmd[CMDQ_ENT_DWORDS];
@@ -417,13 +462,15 @@ static void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu,
/* Convert the erroneous command into a CMD_SYNC */
arm_smmu_cmdq_build_cmd(cmd, &cmd_sync);
+ if (arm_smmu_cmdq_needs_busy_polling(smmu, cmdq))
+ u64p_replace_bits(cmd, CMDQ_SYNC_0_CS_NONE, CMDQ_SYNC_0_CS);
queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
}
static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
{
- __arm_smmu_cmdq_skip_err(smmu, &smmu->cmdq.q);
+ __arm_smmu_cmdq_skip_err(smmu, &smmu->cmdq);
}
/*
@@ -588,11 +635,11 @@ static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq,
/* Wait for the command queue to become non-full */
static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq *cmdq,
struct arm_smmu_ll_queue *llq)
{
unsigned long flags;
struct arm_smmu_queue_poll qp;
- struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu);
int ret = 0;
/*
@@ -623,11 +670,11 @@ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
* Must be called with the cmdq lock held in some capacity.
*/
static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq *cmdq,
struct arm_smmu_ll_queue *llq)
{
int ret = 0;
struct arm_smmu_queue_poll qp;
- struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu);
u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod));
queue_poll_init(smmu, &qp);
@@ -647,10 +694,10 @@ static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
* Must be called with the cmdq lock held in some capacity.
*/
static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq *cmdq,
struct arm_smmu_ll_queue *llq)
{
struct arm_smmu_queue_poll qp;
- struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu);
u32 prod = llq->prod;
int ret = 0;
@@ -697,12 +744,14 @@ static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
}
static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq *cmdq,
struct arm_smmu_ll_queue *llq)
{
- if (smmu->options & ARM_SMMU_OPT_MSIPOLL)
- return __arm_smmu_cmdq_poll_until_msi(smmu, llq);
+ if (smmu->options & ARM_SMMU_OPT_MSIPOLL &&
+ !arm_smmu_cmdq_needs_busy_polling(smmu, cmdq))
+ return __arm_smmu_cmdq_poll_until_msi(smmu, cmdq, llq);
- return __arm_smmu_cmdq_poll_until_consumed(smmu, llq);
+ return __arm_smmu_cmdq_poll_until_consumed(smmu, cmdq, llq);
}
static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
@@ -738,14 +787,14 @@ static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
* insert their own list of commands then all of the commands from one
* CPU will appear before any of the commands from the other CPU.
*/
-static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
- u64 *cmds, int n, bool sync)
+int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq *cmdq, u64 *cmds, int n,
+ bool sync)
{
u64 cmd_sync[CMDQ_ENT_DWORDS];
u32 prod;
unsigned long flags;
bool owner;
- struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu);
struct arm_smmu_ll_queue llq, head;
int ret = 0;
@@ -759,7 +808,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
while (!queue_has_space(&llq, n + sync)) {
local_irq_restore(flags);
- if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
+ if (arm_smmu_cmdq_poll_until_not_full(smmu, cmdq, &llq))
dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
local_irq_save(flags);
}
@@ -785,7 +834,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n);
if (sync) {
prod = queue_inc_prod_n(&llq, n);
- arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, &cmdq->q, prod);
+ arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, cmdq, prod);
queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS);
/*
@@ -835,7 +884,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
/* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */
if (sync) {
llq.prod = queue_inc_prod_n(&llq, n);
- ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq);
+ ret = arm_smmu_cmdq_poll_until_sync(smmu, cmdq, &llq);
if (ret) {
dev_err_ratelimited(smmu->dev,
"CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n",
@@ -870,7 +919,8 @@ static int __arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
return -EINVAL;
}
- return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, sync);
+ return arm_smmu_cmdq_issue_cmdlist(
+ smmu, arm_smmu_get_cmdq(smmu, ent), cmd, 1, sync);
}
static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
@@ -885,21 +935,33 @@ static int arm_smmu_cmdq_issue_cmd_with_sync(struct arm_smmu_device *smmu,
return __arm_smmu_cmdq_issue_cmd(smmu, ent, true);
}
+static void arm_smmu_cmdq_batch_init(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq_batch *cmds,
+ struct arm_smmu_cmdq_ent *ent)
+{
+ cmds->num = 0;
+ cmds->cmdq = arm_smmu_get_cmdq(smmu, ent);
+}
+
static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
struct arm_smmu_cmdq_batch *cmds,
struct arm_smmu_cmdq_ent *cmd)
{
+ bool unsupported_cmd = !arm_smmu_cmdq_supports_cmd(cmds->cmdq, cmd);
+ bool force_sync = (cmds->num == CMDQ_BATCH_ENTRIES - 1) &&
+ (smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC);
int index;
- if (cmds->num == CMDQ_BATCH_ENTRIES - 1 &&
- (smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC)) {
- arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
- cmds->num = 0;
+ if (force_sync || unsupported_cmd) {
+ arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmdq, cmds->cmds,
+ cmds->num, true);
+ arm_smmu_cmdq_batch_init(smmu, cmds, cmd);
}
if (cmds->num == CMDQ_BATCH_ENTRIES) {
- arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
- cmds->num = 0;
+ arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmdq, cmds->cmds,
+ cmds->num, false);
+ arm_smmu_cmdq_batch_init(smmu, cmds, cmd);
}
index = cmds->num * CMDQ_ENT_DWORDS;
@@ -915,7 +977,8 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu,
struct arm_smmu_cmdq_batch *cmds)
{
- return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
+ return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmdq, cmds->cmds,
+ cmds->num, true);
}
static void arm_smmu_page_response(struct device *dev, struct iopf_fault *unused,
@@ -989,18 +1052,28 @@ void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits)
cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR |
STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH |
STRTAB_STE_1_S1STALLD | STRTAB_STE_1_STRW |
- STRTAB_STE_1_EATS);
+ STRTAB_STE_1_EATS | STRTAB_STE_1_MEV);
used_bits[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID);
+
+ /*
+ * See 13.5 Summary of attribute/permission configuration fields
+ * for the SHCFG behavior.
+ */
+ if (FIELD_GET(STRTAB_STE_1_S1DSS, le64_to_cpu(ent[1])) ==
+ STRTAB_STE_1_S1DSS_BYPASS)
+ used_bits[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG);
}
/* S2 translates */
if (cfg & BIT(1)) {
used_bits[1] |=
- cpu_to_le64(STRTAB_STE_1_EATS | STRTAB_STE_1_SHCFG);
+ cpu_to_le64(STRTAB_STE_1_S2FWB | STRTAB_STE_1_EATS |
+ STRTAB_STE_1_SHCFG | STRTAB_STE_1_MEV);
used_bits[2] |=
cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR |
STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI |
- STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2R);
+ STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2S |
+ STRTAB_STE_2_S2R);
used_bits[3] |= cpu_to_le64(STRTAB_STE_3_S2TTB_MASK);
}
@@ -1158,7 +1231,7 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master,
},
};
- cmds.num = 0;
+ arm_smmu_cmdq_batch_init(smmu, &cmds, &cmd);
for (i = 0; i < master->num_streams; i++) {
cmd.cfgi.sid = master->streams[i].id;
arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
@@ -1167,52 +1240,40 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master,
arm_smmu_cmdq_batch_submit(smmu, &cmds);
}
-static int arm_smmu_alloc_cd_leaf_table(struct arm_smmu_device *smmu,
- struct arm_smmu_l1_ctx_desc *l1_desc)
+static void arm_smmu_write_cd_l1_desc(struct arm_smmu_cdtab_l1 *dst,
+ dma_addr_t l2ptr_dma)
{
- size_t size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3);
+ u64 val = (l2ptr_dma & CTXDESC_L1_DESC_L2PTR_MASK) | CTXDESC_L1_DESC_V;
- l1_desc->l2ptr = dmam_alloc_coherent(smmu->dev, size,
- &l1_desc->l2ptr_dma, GFP_KERNEL);
- if (!l1_desc->l2ptr) {
- dev_warn(smmu->dev,
- "failed to allocate context descriptor table\n");
- return -ENOMEM;
- }
- return 0;
+ /* The HW has 64 bit atomicity with stores to the L2 CD table */
+ WRITE_ONCE(dst->l2ptr, cpu_to_le64(val));
}
-static void arm_smmu_write_cd_l1_desc(__le64 *dst,
- struct arm_smmu_l1_ctx_desc *l1_desc)
+static dma_addr_t arm_smmu_cd_l1_get_desc(const struct arm_smmu_cdtab_l1 *src)
{
- u64 val = (l1_desc->l2ptr_dma & CTXDESC_L1_DESC_L2PTR_MASK) |
- CTXDESC_L1_DESC_V;
-
- /* The HW has 64 bit atomicity with stores to the L2 CD table */
- WRITE_ONCE(*dst, cpu_to_le64(val));
+ return le64_to_cpu(src->l2ptr) & CTXDESC_L1_DESC_L2PTR_MASK;
}
struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master,
u32 ssid)
{
- struct arm_smmu_l1_ctx_desc *l1_desc;
+ struct arm_smmu_cdtab_l2 *l2;
struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table;
- if (!cd_table->cdtab)
+ if (!arm_smmu_cdtab_allocated(cd_table))
return NULL;
if (cd_table->s1fmt == STRTAB_STE_0_S1FMT_LINEAR)
- return (struct arm_smmu_cd *)(cd_table->cdtab +
- ssid * CTXDESC_CD_DWORDS);
+ return &cd_table->linear.table[ssid];
- l1_desc = &cd_table->l1_desc[ssid / CTXDESC_L2_ENTRIES];
- if (!l1_desc->l2ptr)
+ l2 = cd_table->l2.l2ptrs[arm_smmu_cdtab_l1_idx(ssid)];
+ if (!l2)
return NULL;
- return &l1_desc->l2ptr[ssid % CTXDESC_L2_ENTRIES];
+ return &l2->cds[arm_smmu_cdtab_l2_idx(ssid)];
}
-struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master,
- u32 ssid)
+static struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master,
+ u32 ssid)
{
struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table;
struct arm_smmu_device *smmu = master->smmu;
@@ -1220,24 +1281,25 @@ struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master,
might_sleep();
iommu_group_mutex_assert(master->dev);
- if (!cd_table->cdtab) {
+ if (!arm_smmu_cdtab_allocated(cd_table)) {
if (arm_smmu_alloc_cd_tables(master))
return NULL;
}
if (cd_table->s1fmt == STRTAB_STE_0_S1FMT_64K_L2) {
- unsigned int idx = ssid / CTXDESC_L2_ENTRIES;
- struct arm_smmu_l1_ctx_desc *l1_desc;
+ unsigned int idx = arm_smmu_cdtab_l1_idx(ssid);
+ struct arm_smmu_cdtab_l2 **l2ptr = &cd_table->l2.l2ptrs[idx];
- l1_desc = &cd_table->l1_desc[idx];
- if (!l1_desc->l2ptr) {
- __le64 *l1ptr;
+ if (!*l2ptr) {
+ dma_addr_t l2ptr_dma;
- if (arm_smmu_alloc_cd_leaf_table(smmu, l1_desc))
+ *l2ptr = dma_alloc_coherent(smmu->dev, sizeof(**l2ptr),
+ &l2ptr_dma, GFP_KERNEL);
+ if (!*l2ptr)
return NULL;
- l1ptr = cd_table->cdtab + idx * CTXDESC_L1_DESC_DWORDS;
- arm_smmu_write_cd_l1_desc(l1ptr, l1_desc);
+ arm_smmu_write_cd_l1_desc(&cd_table->l2.l1tab[idx],
+ l2ptr_dma);
/* An invalid L1CD can be cached */
arm_smmu_sync_cd(master, ssid, false);
}
@@ -1289,6 +1351,8 @@ void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid,
struct arm_smmu_cd *cdptr,
const struct arm_smmu_cd *target)
{
+ bool target_valid = target->data[0] & cpu_to_le64(CTXDESC_CD_0_V);
+ bool cur_valid = cdptr->data[0] & cpu_to_le64(CTXDESC_CD_0_V);
struct arm_smmu_cd_writer cd_writer = {
.writer = {
.ops = &arm_smmu_cd_writer_ops,
@@ -1297,6 +1361,13 @@ void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid,
.ssid = ssid,
};
+ if (ssid != IOMMU_NO_PASID && cur_valid != target_valid) {
+ if (cur_valid)
+ master->cd_table.used_ssids--;
+ else
+ master->cd_table.used_ssids++;
+ }
+
arm_smmu_write_entry(&cd_writer.writer, cdptr->data, target->data);
}
@@ -1331,6 +1402,12 @@ void arm_smmu_make_s1_cd(struct arm_smmu_cd *target,
CTXDESC_CD_0_ASET |
FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid)
);
+
+ /* To enable dirty flag update, set both Access flag and dirty state update */
+ if (pgtbl_cfg->quirks & IO_PGTABLE_QUIRK_ARM_HD)
+ target->data[0] |= cpu_to_le64(CTXDESC_CD_0_TCR_HA |
+ CTXDESC_CD_0_TCR_HD);
+
target->data[1] = cpu_to_le64(pgtbl_cfg->arm_lpae_s1_cfg.ttbr &
CTXDESC_CD_1_TTB0_MASK);
target->data[3] = cpu_to_le64(pgtbl_cfg->arm_lpae_s1_cfg.mair);
@@ -1342,7 +1419,7 @@ void arm_smmu_clear_cd(struct arm_smmu_master *master, ioasid_t ssid)
struct arm_smmu_cd target = {};
struct arm_smmu_cd *cdptr;
- if (!master->cd_table.cdtab)
+ if (!arm_smmu_cdtab_allocated(&master->cd_table))
return;
cdptr = arm_smmu_get_cd_ptr(master, ssid);
if (WARN_ON(!cdptr))
@@ -1364,99 +1441,83 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master)
if (!(smmu->features & ARM_SMMU_FEAT_2_LVL_CDTAB) ||
max_contexts <= CTXDESC_L2_ENTRIES) {
cd_table->s1fmt = STRTAB_STE_0_S1FMT_LINEAR;
- cd_table->num_l1_ents = max_contexts;
+ cd_table->linear.num_ents = max_contexts;
- l1size = max_contexts * (CTXDESC_CD_DWORDS << 3);
+ l1size = max_contexts * sizeof(struct arm_smmu_cd);
+ cd_table->linear.table = dma_alloc_coherent(smmu->dev, l1size,
+ &cd_table->cdtab_dma,
+ GFP_KERNEL);
+ if (!cd_table->linear.table)
+ return -ENOMEM;
} else {
cd_table->s1fmt = STRTAB_STE_0_S1FMT_64K_L2;
- cd_table->num_l1_ents = DIV_ROUND_UP(max_contexts,
- CTXDESC_L2_ENTRIES);
+ cd_table->l2.num_l1_ents =
+ DIV_ROUND_UP(max_contexts, CTXDESC_L2_ENTRIES);
- cd_table->l1_desc = devm_kcalloc(smmu->dev, cd_table->num_l1_ents,
- sizeof(*cd_table->l1_desc),
- GFP_KERNEL);
- if (!cd_table->l1_desc)
+ cd_table->l2.l2ptrs = kcalloc(cd_table->l2.num_l1_ents,
+ sizeof(*cd_table->l2.l2ptrs),
+ GFP_KERNEL);
+ if (!cd_table->l2.l2ptrs)
return -ENOMEM;
- l1size = cd_table->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3);
- }
-
- cd_table->cdtab = dmam_alloc_coherent(smmu->dev, l1size, &cd_table->cdtab_dma,
- GFP_KERNEL);
- if (!cd_table->cdtab) {
- dev_warn(smmu->dev, "failed to allocate context descriptor\n");
- ret = -ENOMEM;
- goto err_free_l1;
+ l1size = cd_table->l2.num_l1_ents * sizeof(struct arm_smmu_cdtab_l1);
+ cd_table->l2.l1tab = dma_alloc_coherent(smmu->dev, l1size,
+ &cd_table->cdtab_dma,
+ GFP_KERNEL);
+ if (!cd_table->l2.l1tab) {
+ ret = -ENOMEM;
+ goto err_free_l2ptrs;
+ }
}
-
return 0;
-err_free_l1:
- if (cd_table->l1_desc) {
- devm_kfree(smmu->dev, cd_table->l1_desc);
- cd_table->l1_desc = NULL;
- }
+err_free_l2ptrs:
+ kfree(cd_table->l2.l2ptrs);
+ cd_table->l2.l2ptrs = NULL;
return ret;
}
static void arm_smmu_free_cd_tables(struct arm_smmu_master *master)
{
int i;
- size_t size, l1size;
struct arm_smmu_device *smmu = master->smmu;
struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table;
- if (cd_table->l1_desc) {
- size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3);
-
- for (i = 0; i < cd_table->num_l1_ents; i++) {
- if (!cd_table->l1_desc[i].l2ptr)
+ if (cd_table->s1fmt != STRTAB_STE_0_S1FMT_LINEAR) {
+ for (i = 0; i < cd_table->l2.num_l1_ents; i++) {
+ if (!cd_table->l2.l2ptrs[i])
continue;
- dmam_free_coherent(smmu->dev, size,
- cd_table->l1_desc[i].l2ptr,
- cd_table->l1_desc[i].l2ptr_dma);
+ dma_free_coherent(smmu->dev,
+ sizeof(*cd_table->l2.l2ptrs[i]),
+ cd_table->l2.l2ptrs[i],
+ arm_smmu_cd_l1_get_desc(&cd_table->l2.l1tab[i]));
}
- devm_kfree(smmu->dev, cd_table->l1_desc);
- cd_table->l1_desc = NULL;
+ kfree(cd_table->l2.l2ptrs);
- l1size = cd_table->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3);
+ dma_free_coherent(smmu->dev,
+ cd_table->l2.num_l1_ents *
+ sizeof(struct arm_smmu_cdtab_l1),
+ cd_table->l2.l1tab, cd_table->cdtab_dma);
} else {
- l1size = cd_table->num_l1_ents * (CTXDESC_CD_DWORDS << 3);
- }
-
- dmam_free_coherent(smmu->dev, l1size, cd_table->cdtab, cd_table->cdtab_dma);
- cd_table->cdtab_dma = 0;
- cd_table->cdtab = NULL;
-}
-
-bool arm_smmu_free_asid(struct arm_smmu_ctx_desc *cd)
-{
- bool free;
- struct arm_smmu_ctx_desc *old_cd;
-
- if (!cd->asid)
- return false;
-
- free = refcount_dec_and_test(&cd->refs);
- if (free) {
- old_cd = xa_erase(&arm_smmu_asid_xa, cd->asid);
- WARN_ON(old_cd != cd);
+ dma_free_coherent(smmu->dev,
+ cd_table->linear.num_ents *
+ sizeof(struct arm_smmu_cd),
+ cd_table->linear.table, cd_table->cdtab_dma);
}
- return free;
}
/* Stream table manipulation functions */
-static void
-arm_smmu_write_strtab_l1_desc(__le64 *dst, struct arm_smmu_strtab_l1_desc *desc)
+static void arm_smmu_write_strtab_l1_desc(struct arm_smmu_strtab_l1 *dst,
+ dma_addr_t l2ptr_dma)
{
u64 val = 0;
- val |= FIELD_PREP(STRTAB_L1_DESC_SPAN, desc->span);
- val |= desc->l2ptr_dma & STRTAB_L1_DESC_L2PTR_MASK;
+ val |= FIELD_PREP(STRTAB_L1_DESC_SPAN, STRTAB_SPLIT + 1);
+ val |= l2ptr_dma & STRTAB_L1_DESC_L2PTR_MASK;
/* The HW has 64 bit atomicity with stores to the L2 STE table */
- WRITE_ONCE(*dst, cpu_to_le64(val));
+ WRITE_ONCE(dst->l2ptr, cpu_to_le64(val));
}
struct arm_smmu_ste_writer {
@@ -1511,7 +1572,6 @@ static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid,
}
}
-VISIBLE_IF_KUNIT
void arm_smmu_make_abort_ste(struct arm_smmu_ste *target)
{
memset(target, 0, sizeof(*target));
@@ -1538,7 +1598,8 @@ EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_bypass_ste);
VISIBLE_IF_KUNIT
void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target,
- struct arm_smmu_master *master)
+ struct arm_smmu_master *master, bool ats_enabled,
+ unsigned int s1dss)
{
struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table;
struct arm_smmu_device *smmu = master->smmu;
@@ -1552,7 +1613,7 @@ void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target,
FIELD_PREP(STRTAB_STE_0_S1CDMAX, cd_table->s1cdmax));
target->data[1] = cpu_to_le64(
- FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) |
+ FIELD_PREP(STRTAB_STE_1_S1DSS, s1dss) |
FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) |
FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) |
FIELD_PREP(STRTAB_STE_1_S1CSH, ARM_SMMU_SH_ISH) |
@@ -1561,7 +1622,12 @@ void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target,
STRTAB_STE_1_S1STALLD :
0) |
FIELD_PREP(STRTAB_STE_1_EATS,
- master->ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0));
+ ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0));
+
+ if ((smmu->features & ARM_SMMU_FEAT_ATTR_TYPES_OVR) &&
+ s1dss == STRTAB_STE_1_S1DSS_BYPASS)
+ target->data[1] |= cpu_to_le64(FIELD_PREP(
+ STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING));
if (smmu->features & ARM_SMMU_FEAT_E2H) {
/*
@@ -1588,10 +1654,10 @@ void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target,
}
EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_cdtable_ste);
-VISIBLE_IF_KUNIT
void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
struct arm_smmu_master *master,
- struct arm_smmu_domain *smmu_domain)
+ struct arm_smmu_domain *smmu_domain,
+ bool ats_enabled)
{
struct arm_smmu_s2_cfg *s2_cfg = &smmu_domain->s2_cfg;
const struct io_pgtable_cfg *pgtbl_cfg =
@@ -1608,8 +1674,10 @@ void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
target->data[1] = cpu_to_le64(
FIELD_PREP(STRTAB_STE_1_EATS,
- master->ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0));
+ ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0));
+ if (pgtbl_cfg->quirks & IO_PGTABLE_QUIRK_ARM_S2FWB)
+ target->data[1] |= cpu_to_le64(STRTAB_STE_1_S2FWB);
if (smmu->features & ARM_SMMU_FEAT_ATTR_TYPES_OVR)
target->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG,
STRTAB_STE_1_SHCFG_INCOMING));
@@ -1629,6 +1697,7 @@ void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
STRTAB_STE_2_S2ENDI |
#endif
STRTAB_STE_2_S2PTW |
+ (master->stall_enabled ? STRTAB_STE_2_S2S : 0) |
STRTAB_STE_2_S2R);
target->data[3] = cpu_to_le64(pgtbl_cfg->arm_lpae_s2_cfg.vttbr &
@@ -1653,66 +1722,111 @@ static void arm_smmu_init_initial_stes(struct arm_smmu_ste *strtab,
static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid)
{
- size_t size;
- void *strtab;
+ dma_addr_t l2ptr_dma;
struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
- struct arm_smmu_strtab_l1_desc *desc = &cfg->l1_desc[sid >> STRTAB_SPLIT];
+ struct arm_smmu_strtab_l2 **l2table;
- if (desc->l2ptr)
+ l2table = &cfg->l2.l2ptrs[arm_smmu_strtab_l1_idx(sid)];
+ if (*l2table)
return 0;
- size = 1 << (STRTAB_SPLIT + ilog2(STRTAB_STE_DWORDS) + 3);
- strtab = &cfg->strtab[(sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS];
-
- desc->span = STRTAB_SPLIT + 1;
- desc->l2ptr = dmam_alloc_coherent(smmu->dev, size, &desc->l2ptr_dma,
- GFP_KERNEL);
- if (!desc->l2ptr) {
+ *l2table = dmam_alloc_coherent(smmu->dev, sizeof(**l2table),
+ &l2ptr_dma, GFP_KERNEL);
+ if (!*l2table) {
dev_err(smmu->dev,
"failed to allocate l2 stream table for SID %u\n",
sid);
return -ENOMEM;
}
- arm_smmu_init_initial_stes(desc->l2ptr, 1 << STRTAB_SPLIT);
- arm_smmu_write_strtab_l1_desc(strtab, desc);
+ arm_smmu_init_initial_stes((*l2table)->stes,
+ ARRAY_SIZE((*l2table)->stes));
+ arm_smmu_write_strtab_l1_desc(&cfg->l2.l1tab[arm_smmu_strtab_l1_idx(sid)],
+ l2ptr_dma);
+ return 0;
+}
+
+static int arm_smmu_streams_cmp_key(const void *lhs, const struct rb_node *rhs)
+{
+ struct arm_smmu_stream *stream_rhs =
+ rb_entry(rhs, struct arm_smmu_stream, node);
+ const u32 *sid_lhs = lhs;
+
+ if (*sid_lhs < stream_rhs->id)
+ return -1;
+ if (*sid_lhs > stream_rhs->id)
+ return 1;
return 0;
}
+static int arm_smmu_streams_cmp_node(struct rb_node *lhs,
+ const struct rb_node *rhs)
+{
+ return arm_smmu_streams_cmp_key(
+ &rb_entry(lhs, struct arm_smmu_stream, node)->id, rhs);
+}
+
static struct arm_smmu_master *
arm_smmu_find_master(struct arm_smmu_device *smmu, u32 sid)
{
struct rb_node *node;
- struct arm_smmu_stream *stream;
lockdep_assert_held(&smmu->streams_mutex);
- node = smmu->streams.rb_node;
- while (node) {
- stream = rb_entry(node, struct arm_smmu_stream, node);
- if (stream->id < sid)
- node = node->rb_right;
- else if (stream->id > sid)
- node = node->rb_left;
- else
- return stream->master;
- }
-
- return NULL;
+ node = rb_find(&sid, &smmu->streams, arm_smmu_streams_cmp_key);
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct arm_smmu_stream, node)->master;
}
/* IRQ and event handlers */
-static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt)
+static void arm_smmu_decode_event(struct arm_smmu_device *smmu, u64 *raw,
+ struct arm_smmu_event *event)
+{
+ struct arm_smmu_master *master;
+
+ event->id = FIELD_GET(EVTQ_0_ID, raw[0]);
+ event->sid = FIELD_GET(EVTQ_0_SID, raw[0]);
+ event->ssv = FIELD_GET(EVTQ_0_SSV, raw[0]);
+ event->ssid = event->ssv ? FIELD_GET(EVTQ_0_SSID, raw[0]) : IOMMU_NO_PASID;
+ event->privileged = FIELD_GET(EVTQ_1_PnU, raw[1]);
+ event->instruction = FIELD_GET(EVTQ_1_InD, raw[1]);
+ event->s2 = FIELD_GET(EVTQ_1_S2, raw[1]);
+ event->read = FIELD_GET(EVTQ_1_RnW, raw[1]);
+ event->stag = FIELD_GET(EVTQ_1_STAG, raw[1]);
+ event->stall = FIELD_GET(EVTQ_1_STALL, raw[1]);
+ event->class = FIELD_GET(EVTQ_1_CLASS, raw[1]);
+ event->iova = FIELD_GET(EVTQ_2_ADDR, raw[2]);
+ event->ipa = raw[3] & EVTQ_3_IPA;
+ event->fetch_addr = raw[3] & EVTQ_3_FETCH_ADDR;
+ event->ttrnw = FIELD_GET(EVTQ_1_TT_READ, raw[1]);
+ event->class_tt = false;
+ event->dev = NULL;
+
+ if (event->id == EVT_ID_PERMISSION_FAULT)
+ event->class_tt = (event->class == EVTQ_1_CLASS_TT);
+
+ mutex_lock(&smmu->streams_mutex);
+ master = arm_smmu_find_master(smmu, event->sid);
+ if (master)
+ event->dev = get_device(master->dev);
+ mutex_unlock(&smmu->streams_mutex);
+}
+
+static int arm_smmu_handle_event(struct arm_smmu_device *smmu, u64 *evt,
+ struct arm_smmu_event *event)
{
int ret = 0;
u32 perm = 0;
struct arm_smmu_master *master;
- bool ssid_valid = evt[0] & EVTQ_0_SSV;
- u32 sid = FIELD_GET(EVTQ_0_SID, evt[0]);
struct iopf_fault fault_evt = { };
struct iommu_fault *flt = &fault_evt.fault;
- switch (FIELD_GET(EVTQ_0_ID, evt[0])) {
+ switch (event->id) {
+ case EVT_ID_BAD_STE_CONFIG:
+ case EVT_ID_STREAM_DISABLED_FAULT:
+ case EVT_ID_BAD_SUBSTREAMID_CONFIG:
+ case EVT_ID_BAD_CD_CONFIG:
case EVT_ID_TRANSLATION_FAULT:
case EVT_ID_ADDR_SIZE_FAULT:
case EVT_ID_ACCESS_FAULT:
@@ -1722,73 +1836,126 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt)
return -EOPNOTSUPP;
}
- /* Stage-2 is always pinned at the moment */
- if (evt[1] & EVTQ_1_S2)
- return -EFAULT;
-
- if (!(evt[1] & EVTQ_1_STALL))
- return -EOPNOTSUPP;
-
- if (evt[1] & EVTQ_1_RnW)
- perm |= IOMMU_FAULT_PERM_READ;
- else
- perm |= IOMMU_FAULT_PERM_WRITE;
+ if (event->stall) {
+ if (event->read)
+ perm |= IOMMU_FAULT_PERM_READ;
+ else
+ perm |= IOMMU_FAULT_PERM_WRITE;
- if (evt[1] & EVTQ_1_InD)
- perm |= IOMMU_FAULT_PERM_EXEC;
+ if (event->instruction)
+ perm |= IOMMU_FAULT_PERM_EXEC;
- if (evt[1] & EVTQ_1_PnU)
- perm |= IOMMU_FAULT_PERM_PRIV;
+ if (event->privileged)
+ perm |= IOMMU_FAULT_PERM_PRIV;
- flt->type = IOMMU_FAULT_PAGE_REQ;
- flt->prm = (struct iommu_fault_page_request) {
- .flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE,
- .grpid = FIELD_GET(EVTQ_1_STAG, evt[1]),
- .perm = perm,
- .addr = FIELD_GET(EVTQ_2_ADDR, evt[2]),
- };
+ flt->type = IOMMU_FAULT_PAGE_REQ;
+ flt->prm = (struct iommu_fault_page_request){
+ .flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE,
+ .grpid = event->stag,
+ .perm = perm,
+ .addr = event->iova,
+ };
- if (ssid_valid) {
- flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
- flt->prm.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]);
+ if (event->ssv) {
+ flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
+ flt->prm.pasid = event->ssid;
+ }
}
mutex_lock(&smmu->streams_mutex);
- master = arm_smmu_find_master(smmu, sid);
+ master = arm_smmu_find_master(smmu, event->sid);
if (!master) {
ret = -EINVAL;
goto out_unlock;
}
- iommu_report_device_fault(master->dev, &fault_evt);
+ if (event->stall)
+ ret = iommu_report_device_fault(master->dev, &fault_evt);
+ else if (master->vmaster && !event->s2)
+ ret = arm_vmaster_report_event(master->vmaster, evt);
+ else
+ ret = -EOPNOTSUPP; /* Unhandled events should be pinned */
out_unlock:
mutex_unlock(&smmu->streams_mutex);
return ret;
}
+static void arm_smmu_dump_raw_event(struct arm_smmu_device *smmu, u64 *raw,
+ struct arm_smmu_event *event)
+{
+ int i;
+
+ dev_err(smmu->dev, "event 0x%02x received:\n", event->id);
+
+ for (i = 0; i < EVTQ_ENT_DWORDS; ++i)
+ dev_err(smmu->dev, "\t0x%016llx\n", raw[i]);
+}
+
+#define ARM_SMMU_EVT_KNOWN(e) ((e)->id < ARRAY_SIZE(event_str) && event_str[(e)->id])
+#define ARM_SMMU_LOG_EVT_STR(e) ARM_SMMU_EVT_KNOWN(e) ? event_str[(e)->id] : "UNKNOWN"
+#define ARM_SMMU_LOG_CLIENT(e) (e)->dev ? dev_name((e)->dev) : "(unassigned sid)"
+
+static void arm_smmu_dump_event(struct arm_smmu_device *smmu, u64 *raw,
+ struct arm_smmu_event *evt,
+ struct ratelimit_state *rs)
+{
+ if (!__ratelimit(rs))
+ return;
+
+ arm_smmu_dump_raw_event(smmu, raw, evt);
+
+ switch (evt->id) {
+ case EVT_ID_TRANSLATION_FAULT:
+ case EVT_ID_ADDR_SIZE_FAULT:
+ case EVT_ID_ACCESS_FAULT:
+ case EVT_ID_PERMISSION_FAULT:
+ dev_err(smmu->dev, "event: %s client: %s sid: %#x ssid: %#x iova: %#llx ipa: %#llx",
+ ARM_SMMU_LOG_EVT_STR(evt), ARM_SMMU_LOG_CLIENT(evt),
+ evt->sid, evt->ssid, evt->iova, evt->ipa);
+
+ dev_err(smmu->dev, "%s %s %s %s \"%s\"%s%s stag: %#x",
+ evt->privileged ? "priv" : "unpriv",
+ evt->instruction ? "inst" : "data",
+ str_read_write(evt->read),
+ evt->s2 ? "s2" : "s1", event_class_str[evt->class],
+ evt->class_tt ? (evt->ttrnw ? " ttd_read" : " ttd_write") : "",
+ evt->stall ? " stall" : "", evt->stag);
+
+ break;
+
+ case EVT_ID_STE_FETCH_FAULT:
+ case EVT_ID_CD_FETCH_FAULT:
+ case EVT_ID_VMS_FETCH_FAULT:
+ dev_err(smmu->dev, "event: %s client: %s sid: %#x ssid: %#x fetch_addr: %#llx",
+ ARM_SMMU_LOG_EVT_STR(evt), ARM_SMMU_LOG_CLIENT(evt),
+ evt->sid, evt->ssid, evt->fetch_addr);
+
+ break;
+
+ default:
+ dev_err(smmu->dev, "event: %s client: %s sid: %#x ssid: %#x",
+ ARM_SMMU_LOG_EVT_STR(evt), ARM_SMMU_LOG_CLIENT(evt),
+ evt->sid, evt->ssid);
+ }
+}
+
static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
{
- int i, ret;
+ u64 evt[EVTQ_ENT_DWORDS];
+ struct arm_smmu_event event = {0};
struct arm_smmu_device *smmu = dev;
struct arm_smmu_queue *q = &smmu->evtq.q;
struct arm_smmu_ll_queue *llq = &q->llq;
static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- u64 evt[EVTQ_ENT_DWORDS];
do {
while (!queue_remove_raw(q, evt)) {
- u8 id = FIELD_GET(EVTQ_0_ID, evt[0]);
-
- ret = arm_smmu_handle_evt(smmu, evt);
- if (!ret || !__ratelimit(&rs))
- continue;
-
- dev_info(smmu->dev, "event 0x%02x received:\n", id);
- for (i = 0; i < ARRAY_SIZE(evt); ++i)
- dev_info(smmu->dev, "\t0x%016llx\n",
- (unsigned long long)evt[i]);
+ arm_smmu_decode_event(smmu, evt, &event);
+ if (arm_smmu_handle_event(smmu, evt, &event))
+ arm_smmu_dump_event(smmu, evt, &event, &rs);
+ put_device(event.dev);
cond_resched();
}
@@ -1995,15 +2162,16 @@ arm_smmu_atc_inv_to_cmd(int ssid, unsigned long iova, size_t size,
cmd->atc.size = log2_span;
}
-static int arm_smmu_atc_inv_master(struct arm_smmu_master *master)
+static int arm_smmu_atc_inv_master(struct arm_smmu_master *master,
+ ioasid_t ssid)
{
int i;
struct arm_smmu_cmdq_ent cmd;
struct arm_smmu_cmdq_batch cmds;
- arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd);
+ arm_smmu_atc_inv_to_cmd(ssid, 0, 0, &cmd);
- cmds.num = 0;
+ arm_smmu_cmdq_batch_init(master->smmu, &cmds, &cmd);
for (i = 0; i < master->num_streams; i++) {
cmd.atc.sid = master->streams[i].id;
arm_smmu_cmdq_batch_add(master->smmu, &cmds, &cmd);
@@ -2012,13 +2180,15 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master *master)
return arm_smmu_cmdq_batch_submit(master->smmu, &cmds);
}
-int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid,
+int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
unsigned long iova, size_t size)
{
+ struct arm_smmu_master_domain *master_domain;
int i;
unsigned long flags;
- struct arm_smmu_cmdq_ent cmd;
- struct arm_smmu_master *master;
+ struct arm_smmu_cmdq_ent cmd = {
+ .opcode = CMDQ_OP_ATC_INV,
+ };
struct arm_smmu_cmdq_batch cmds;
if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS))
@@ -2041,15 +2211,27 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid,
if (!atomic_read(&smmu_domain->nr_ats_masters))
return 0;
- arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd);
-
- cmds.num = 0;
+ arm_smmu_cmdq_batch_init(smmu_domain->smmu, &cmds, &cmd);
spin_lock_irqsave(&smmu_domain->devices_lock, flags);
- list_for_each_entry(master, &smmu_domain->devices, domain_head) {
+ list_for_each_entry(master_domain, &smmu_domain->devices,
+ devices_elm) {
+ struct arm_smmu_master *master = master_domain->master;
+
if (!master->ats_enabled)
continue;
+ if (master_domain->nested_ats_flush) {
+ /*
+ * If a S2 used as a nesting parent is changed we have
+ * no option but to completely flush the ATC.
+ */
+ arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd);
+ } else {
+ arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size,
+ &cmd);
+ }
+
for (i = 0; i < master->num_streams; i++) {
cmd.atc.sid = master->streams[i].id;
arm_smmu_cmdq_batch_add(smmu_domain->smmu, &cmds, &cmd);
@@ -2081,7 +2263,7 @@ static void arm_smmu_tlb_inv_context(void *cookie)
cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid;
arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
}
- arm_smmu_atc_inv_domain(smmu_domain, IOMMU_NO_PASID, 0, 0);
+ arm_smmu_atc_inv_domain(smmu_domain, 0, 0);
}
static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
@@ -2120,7 +2302,7 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
num_pages++;
}
- cmds.num = 0;
+ arm_smmu_cmdq_batch_init(smmu, &cmds, cmd);
while (iova < end) {
if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
@@ -2175,11 +2357,20 @@ static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
}
__arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain);
+ if (smmu_domain->nest_parent) {
+ /*
+ * When the S2 domain changes all the nested S1 ASIDs have to be
+ * flushed too.
+ */
+ cmd.opcode = CMDQ_OP_TLBI_NH_ALL;
+ arm_smmu_cmdq_issue_cmd_with_sync(smmu_domain->smmu, &cmd);
+ }
+
/*
* Unfortunately, this can't be leaf-only since we may have
* zapped an entire table.
*/
- arm_smmu_atc_inv_domain(smmu_domain, IOMMU_NO_PASID, iova, size);
+ arm_smmu_atc_inv_domain(smmu_domain, iova, size);
}
void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid,
@@ -2220,6 +2411,13 @@ static const struct iommu_flush_ops arm_smmu_flush_ops = {
.tlb_add_page = arm_smmu_tlb_inv_page_nosync,
};
+static bool arm_smmu_dbm_capable(struct arm_smmu_device *smmu)
+{
+ u32 features = (ARM_SMMU_FEAT_HD | ARM_SMMU_FEAT_COHERENCY);
+
+ return (smmu->features & features) == features;
+}
+
/* IOMMU API */
static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap)
{
@@ -2229,54 +2427,53 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap)
case IOMMU_CAP_CACHE_COHERENCY:
/* Assume that a coherent TCU implies coherent TBUs */
return master->smmu->features & ARM_SMMU_FEAT_COHERENCY;
+ case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
+ return arm_smmu_master_canwbs(master);
case IOMMU_CAP_NOEXEC:
case IOMMU_CAP_DEFERRED_FLUSH:
return true;
+ case IOMMU_CAP_DIRTY_TRACKING:
+ return arm_smmu_dbm_capable(master->smmu);
default:
return false;
}
}
-static struct iommu_domain *arm_smmu_domain_alloc(unsigned type)
+static bool arm_smmu_enforce_cache_coherency(struct iommu_domain *domain)
{
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+ struct arm_smmu_master_domain *master_domain;
+ unsigned long flags;
+ bool ret = true;
- if (type == IOMMU_DOMAIN_SVA)
- return arm_smmu_sva_domain_alloc();
- return ERR_PTR(-EOPNOTSUPP);
+ spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+ list_for_each_entry(master_domain, &smmu_domain->devices,
+ devices_elm) {
+ if (!arm_smmu_master_canwbs(master_domain->master)) {
+ ret = false;
+ break;
+ }
+ }
+ smmu_domain->enforce_cache_coherency = ret;
+ spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+ return ret;
}
-static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev)
+struct arm_smmu_domain *arm_smmu_domain_alloc(void)
{
struct arm_smmu_domain *smmu_domain;
- /*
- * Allocate the domain and initialise some of its data structures.
- * We can't really do anything meaningful until we've added a
- * master.
- */
smmu_domain = kzalloc(sizeof(*smmu_domain), GFP_KERNEL);
if (!smmu_domain)
return ERR_PTR(-ENOMEM);
- mutex_init(&smmu_domain->init_mutex);
INIT_LIST_HEAD(&smmu_domain->devices);
spin_lock_init(&smmu_domain->devices_lock);
- INIT_LIST_HEAD(&smmu_domain->mmu_notifiers);
-
- if (dev) {
- struct arm_smmu_master *master = dev_iommu_priv_get(dev);
- int ret;
- ret = arm_smmu_domain_finalise(smmu_domain, master->smmu);
- if (ret) {
- kfree(smmu_domain);
- return ERR_PTR(ret);
- }
- }
- return &smmu_domain->domain;
+ return smmu_domain;
}
-static void arm_smmu_domain_free(struct iommu_domain *domain)
+static void arm_smmu_domain_free_paging(struct iommu_domain *domain)
{
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct arm_smmu_device *smmu = smmu_domain->smmu;
@@ -2287,7 +2484,7 @@ static void arm_smmu_domain_free(struct iommu_domain *domain)
if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
/* Prevent SVA from touching the CD while we're freeing it */
mutex_lock(&arm_smmu_asid_lock);
- arm_smmu_free_asid(&smmu_domain->cd);
+ xa_erase(&arm_smmu_asid_xa, smmu_domain->cd.asid);
mutex_unlock(&arm_smmu_asid_lock);
} else {
struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg;
@@ -2302,14 +2499,12 @@ static int arm_smmu_domain_finalise_s1(struct arm_smmu_device *smmu,
struct arm_smmu_domain *smmu_domain)
{
int ret;
- u32 asid;
+ u32 asid = 0;
struct arm_smmu_ctx_desc *cd = &smmu_domain->cd;
- refcount_set(&cd->refs, 1);
-
/* Prevent SVA from modifying the ASID until it is written to the CD */
mutex_lock(&arm_smmu_asid_lock);
- ret = xa_alloc(&arm_smmu_asid_xa, &asid, cd,
+ ret = xa_alloc(&arm_smmu_asid_xa, &asid, smmu_domain,
XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL);
cd->asid = (u16)asid;
mutex_unlock(&arm_smmu_asid_lock);
@@ -2333,49 +2528,51 @@ static int arm_smmu_domain_finalise_s2(struct arm_smmu_device *smmu,
}
static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain,
- struct arm_smmu_device *smmu)
+ struct arm_smmu_device *smmu, u32 flags)
{
int ret;
- unsigned long ias, oas;
enum io_pgtable_fmt fmt;
struct io_pgtable_cfg pgtbl_cfg;
struct io_pgtable_ops *pgtbl_ops;
int (*finalise_stage_fn)(struct arm_smmu_device *smmu,
struct arm_smmu_domain *smmu_domain);
+ bool enable_dirty = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
- /* Restrict the stage to what we can actually support */
- if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S1))
- smmu_domain->stage = ARM_SMMU_DOMAIN_S2;
- if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S2))
- smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
+ pgtbl_cfg = (struct io_pgtable_cfg) {
+ .pgsize_bitmap = smmu->pgsize_bitmap,
+ .coherent_walk = smmu->features & ARM_SMMU_FEAT_COHERENCY,
+ .tlb = &arm_smmu_flush_ops,
+ .iommu_dev = smmu->dev,
+ };
switch (smmu_domain->stage) {
- case ARM_SMMU_DOMAIN_S1:
- ias = (smmu->features & ARM_SMMU_FEAT_VAX) ? 52 : 48;
- ias = min_t(unsigned long, ias, VA_BITS);
- oas = smmu->ias;
+ case ARM_SMMU_DOMAIN_S1: {
+ unsigned long ias = (smmu->features &
+ ARM_SMMU_FEAT_VAX) ? 52 : 48;
+
+ pgtbl_cfg.ias = min_t(unsigned long, ias, VA_BITS);
+ pgtbl_cfg.oas = smmu->ias;
+ if (enable_dirty)
+ pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_ARM_HD;
fmt = ARM_64_LPAE_S1;
finalise_stage_fn = arm_smmu_domain_finalise_s1;
break;
+ }
case ARM_SMMU_DOMAIN_S2:
- ias = smmu->ias;
- oas = smmu->oas;
+ if (enable_dirty)
+ return -EOPNOTSUPP;
+ pgtbl_cfg.ias = smmu->ias;
+ pgtbl_cfg.oas = smmu->oas;
fmt = ARM_64_LPAE_S2;
finalise_stage_fn = arm_smmu_domain_finalise_s2;
+ if ((smmu->features & ARM_SMMU_FEAT_S2FWB) &&
+ (flags & IOMMU_HWPT_ALLOC_NEST_PARENT))
+ pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_ARM_S2FWB;
break;
default:
return -EINVAL;
}
- pgtbl_cfg = (struct io_pgtable_cfg) {
- .pgsize_bitmap = smmu->pgsize_bitmap,
- .ias = ias,
- .oas = oas,
- .coherent_walk = smmu->features & ARM_SMMU_FEAT_COHERENCY,
- .tlb = &arm_smmu_flush_ops,
- .iommu_dev = smmu->dev,
- };
-
pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
if (!pgtbl_ops)
return -ENOMEM;
@@ -2383,6 +2580,8 @@ static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain,
smmu_domain->domain.pgsize_bitmap = pgtbl_cfg.pgsize_bitmap;
smmu_domain->domain.geometry.aperture_end = (1UL << pgtbl_cfg.ias) - 1;
smmu_domain->domain.geometry.force_aperture = true;
+ if (enable_dirty && smmu_domain->stage == ARM_SMMU_DOMAIN_S1)
+ smmu_domain->domain.dirty_ops = &arm_smmu_dirty_ops;
ret = finalise_stage_fn(smmu, smmu_domain);
if (ret < 0) {
@@ -2401,25 +2600,28 @@ arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid)
struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) {
- unsigned int idx1, idx2;
-
/* Two-level walk */
- idx1 = (sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS;
- idx2 = sid & ((1 << STRTAB_SPLIT) - 1);
- return &cfg->l1_desc[idx1].l2ptr[idx2];
+ return &cfg->l2.l2ptrs[arm_smmu_strtab_l1_idx(sid)]
+ ->stes[arm_smmu_strtab_l2_idx(sid)];
} else {
/* Simple linear lookup */
- return (struct arm_smmu_ste *)&cfg
- ->strtab[sid * STRTAB_STE_DWORDS];
+ return &cfg->linear.table[sid];
}
}
-static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master,
- const struct arm_smmu_ste *target)
+void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master,
+ const struct arm_smmu_ste *target)
{
int i, j;
struct arm_smmu_device *smmu = master->smmu;
+ master->cd_table.in_ste =
+ FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(target->data[0])) ==
+ STRTAB_STE_0_CFG_S1_TRANS;
+ master->ste_ats_enabled =
+ FIELD_GET(STRTAB_STE_1_EATS, le64_to_cpu(target->data[1])) ==
+ STRTAB_STE_1_EATS_TRANS;
+
for (i = 0; i < master->num_streams; ++i) {
u32 sid = master->streams[i].id;
struct arm_smmu_ste *step =
@@ -2451,46 +2653,24 @@ static bool arm_smmu_ats_supported(struct arm_smmu_master *master)
return dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev));
}
-static void arm_smmu_enable_ats(struct arm_smmu_master *master,
- struct arm_smmu_domain *smmu_domain)
+static void arm_smmu_enable_ats(struct arm_smmu_master *master)
{
size_t stu;
struct pci_dev *pdev;
struct arm_smmu_device *smmu = master->smmu;
- /* Don't enable ATS at the endpoint if it's not enabled in the STE */
- if (!master->ats_enabled)
- return;
-
/* Smallest Translation Unit: log2 of the smallest supported granule */
stu = __ffs(smmu->pgsize_bitmap);
pdev = to_pci_dev(master->dev);
- atomic_inc(&smmu_domain->nr_ats_masters);
/*
* ATC invalidation of PASID 0 causes the entire ATC to be flushed.
*/
- arm_smmu_atc_inv_master(master);
+ arm_smmu_atc_inv_master(master, IOMMU_NO_PASID);
if (pci_enable_ats(pdev, stu))
dev_err(master->dev, "Failed to enable ATS (STU %zu)\n", stu);
}
-static void arm_smmu_disable_ats(struct arm_smmu_master *master,
- struct arm_smmu_domain *smmu_domain)
-{
- if (!master->ats_enabled)
- return;
-
- pci_disable_ats(to_pci_dev(master->dev));
- /*
- * Ensure ATS is disabled at the endpoint before we issue the
- * ATC invalidation via the SMMU.
- */
- wmb();
- arm_smmu_atc_inv_master(master);
- atomic_dec(&smmu_domain->nr_ats_masters);
-}
-
static int arm_smmu_enable_pasid(struct arm_smmu_master *master)
{
int ret;
@@ -2538,68 +2718,320 @@ static void arm_smmu_disable_pasid(struct arm_smmu_master *master)
pci_disable_pasid(pdev);
}
-static void arm_smmu_detach_dev(struct arm_smmu_master *master)
+static struct arm_smmu_master_domain *
+arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain,
+ struct iommu_domain *domain,
+ struct arm_smmu_master *master,
+ ioasid_t ssid, bool nested_ats_flush)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(master->dev);
- struct arm_smmu_domain *smmu_domain;
+ struct arm_smmu_master_domain *master_domain;
+
+ lockdep_assert_held(&smmu_domain->devices_lock);
+
+ list_for_each_entry(master_domain, &smmu_domain->devices,
+ devices_elm) {
+ if (master_domain->master == master &&
+ master_domain->domain == domain &&
+ master_domain->ssid == ssid &&
+ master_domain->nested_ats_flush == nested_ats_flush)
+ return master_domain;
+ }
+ return NULL;
+}
+
+/*
+ * If the domain uses the smmu_domain->devices list return the arm_smmu_domain
+ * structure, otherwise NULL. These domains track attached devices so they can
+ * issue invalidations.
+ */
+static struct arm_smmu_domain *
+to_smmu_domain_devices(struct iommu_domain *domain)
+{
+ /* The domain can be NULL only when processing the first attach */
+ if (!domain)
+ return NULL;
+ if ((domain->type & __IOMMU_DOMAIN_PAGING) ||
+ domain->type == IOMMU_DOMAIN_SVA)
+ return to_smmu_domain(domain);
+ if (domain->type == IOMMU_DOMAIN_NESTED)
+ return to_smmu_nested_domain(domain)->vsmmu->s2_parent;
+ return NULL;
+}
+
+static int arm_smmu_enable_iopf(struct arm_smmu_master *master,
+ struct arm_smmu_master_domain *master_domain)
+{
+ int ret;
+
+ iommu_group_mutex_assert(master->dev);
+
+ if (!IS_ENABLED(CONFIG_ARM_SMMU_V3_SVA))
+ return -EOPNOTSUPP;
+
+ /*
+ * Drivers for devices supporting PRI or stall require iopf others have
+ * device-specific fault handlers and don't need IOPF, so this is not a
+ * failure.
+ */
+ if (!master->stall_enabled)
+ return 0;
+
+ /* We're not keeping track of SIDs in fault events */
+ if (master->num_streams != 1)
+ return -EOPNOTSUPP;
+
+ if (master->iopf_refcount) {
+ master->iopf_refcount++;
+ master_domain->using_iopf = true;
+ return 0;
+ }
+
+ ret = iopf_queue_add_device(master->smmu->evtq.iopf, master->dev);
+ if (ret)
+ return ret;
+ master->iopf_refcount = 1;
+ master_domain->using_iopf = true;
+ return 0;
+}
+
+static void arm_smmu_disable_iopf(struct arm_smmu_master *master,
+ struct arm_smmu_master_domain *master_domain)
+{
+ iommu_group_mutex_assert(master->dev);
+
+ if (!IS_ENABLED(CONFIG_ARM_SMMU_V3_SVA))
+ return;
+
+ if (!master_domain || !master_domain->using_iopf)
+ return;
+
+ master->iopf_refcount--;
+ if (master->iopf_refcount == 0)
+ iopf_queue_remove_device(master->smmu->evtq.iopf, master->dev);
+}
+
+static void arm_smmu_remove_master_domain(struct arm_smmu_master *master,
+ struct iommu_domain *domain,
+ ioasid_t ssid)
+{
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain_devices(domain);
+ struct arm_smmu_master_domain *master_domain;
+ bool nested_ats_flush = false;
unsigned long flags;
- if (!domain || !(domain->type & __IOMMU_DOMAIN_PAGING))
+ if (!smmu_domain)
return;
- smmu_domain = to_smmu_domain(domain);
- arm_smmu_disable_ats(master, smmu_domain);
+ if (domain->type == IOMMU_DOMAIN_NESTED)
+ nested_ats_flush = to_smmu_nested_domain(domain)->enable_ats;
spin_lock_irqsave(&smmu_domain->devices_lock, flags);
- list_del_init(&master->domain_head);
+ master_domain = arm_smmu_find_master_domain(smmu_domain, domain, master,
+ ssid, nested_ats_flush);
+ if (master_domain) {
+ list_del(&master_domain->devices_elm);
+ if (master->ats_enabled)
+ atomic_dec(&smmu_domain->nr_ats_masters);
+ }
spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
- master->ats_enabled = false;
+ arm_smmu_disable_iopf(master, master_domain);
+ kfree(master_domain);
}
-static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
+/*
+ * Start the sequence to attach a domain to a master. The sequence contains three
+ * steps:
+ * arm_smmu_attach_prepare()
+ * arm_smmu_install_ste_for_dev()
+ * arm_smmu_attach_commit()
+ *
+ * If prepare succeeds then the sequence must be completed. The STE installed
+ * must set the STE.EATS field according to state.ats_enabled.
+ *
+ * If the device supports ATS then this determines if EATS should be enabled
+ * in the STE, and starts sequencing EATS disable if required.
+ *
+ * The change of the EATS in the STE and the PCI ATS config space is managed by
+ * this sequence to be in the right order so that if PCI ATS is enabled then
+ * STE.ETAS is enabled.
+ *
+ * new_domain can be a non-paging domain. In this case ATS will not be enabled,
+ * and invalidations won't be tracked.
+ */
+int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
+ struct iommu_domain *new_domain)
{
- int ret = 0;
+ struct arm_smmu_master *master = state->master;
+ struct arm_smmu_master_domain *master_domain;
+ struct arm_smmu_domain *smmu_domain =
+ to_smmu_domain_devices(new_domain);
unsigned long flags;
+ int ret;
+
+ /*
+ * arm_smmu_share_asid() must not see two domains pointing to the same
+ * arm_smmu_master_domain contents otherwise it could randomly write one
+ * or the other to the CD.
+ */
+ lockdep_assert_held(&arm_smmu_asid_lock);
+
+ if (smmu_domain || state->cd_needs_ats) {
+ /*
+ * The SMMU does not support enabling ATS with bypass/abort.
+ * When the STE is in bypass (STE.Config[2:0] == 0b100), ATS
+ * Translation Requests and Translated transactions are denied
+ * as though ATS is disabled for the stream (STE.EATS == 0b00),
+ * causing F_BAD_ATS_TREQ and F_TRANSL_FORBIDDEN events
+ * (IHI0070Ea 5.2 Stream Table Entry).
+ *
+ * However, if we have installed a CD table and are using S1DSS
+ * then ATS will work in S1DSS bypass. See "13.6.4 Full ATS
+ * skipping stage 1".
+ *
+ * Disable ATS if we are going to create a normal 0b100 bypass
+ * STE.
+ */
+ state->ats_enabled = !state->disable_ats &&
+ arm_smmu_ats_supported(master);
+ }
+
+ if (smmu_domain) {
+ if (new_domain->type == IOMMU_DOMAIN_NESTED) {
+ ret = arm_smmu_attach_prepare_vmaster(
+ state, to_smmu_nested_domain(new_domain));
+ if (ret)
+ return ret;
+ }
+
+ master_domain = kzalloc(sizeof(*master_domain), GFP_KERNEL);
+ if (!master_domain) {
+ ret = -ENOMEM;
+ goto err_free_vmaster;
+ }
+ master_domain->domain = new_domain;
+ master_domain->master = master;
+ master_domain->ssid = state->ssid;
+ if (new_domain->type == IOMMU_DOMAIN_NESTED)
+ master_domain->nested_ats_flush =
+ to_smmu_nested_domain(new_domain)->enable_ats;
+
+ if (new_domain->iopf_handler) {
+ ret = arm_smmu_enable_iopf(master, master_domain);
+ if (ret)
+ goto err_free_master_domain;
+ }
+
+ /*
+ * During prepare we want the current smmu_domain and new
+ * smmu_domain to be in the devices list before we change any
+ * HW. This ensures that both domains will send ATS
+ * invalidations to the master until we are done.
+ *
+ * It is tempting to make this list only track masters that are
+ * using ATS, but arm_smmu_share_asid() also uses this to change
+ * the ASID of a domain, unrelated to ATS.
+ *
+ * Notice if we are re-attaching the same domain then the list
+ * will have two identical entries and commit will remove only
+ * one of them.
+ */
+ spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+ if (smmu_domain->enforce_cache_coherency &&
+ !arm_smmu_master_canwbs(master)) {
+ spin_unlock_irqrestore(&smmu_domain->devices_lock,
+ flags);
+ ret = -EINVAL;
+ goto err_iopf;
+ }
+
+ if (state->ats_enabled)
+ atomic_inc(&smmu_domain->nr_ats_masters);
+ list_add(&master_domain->devices_elm, &smmu_domain->devices);
+ spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+ }
+
+ if (!state->ats_enabled && master->ats_enabled) {
+ pci_disable_ats(to_pci_dev(master->dev));
+ /*
+ * This is probably overkill, but the config write for disabling
+ * ATS should complete before the STE is configured to generate
+ * UR to avoid AER noise.
+ */
+ wmb();
+ }
+ return 0;
+
+err_iopf:
+ arm_smmu_disable_iopf(master, master_domain);
+err_free_master_domain:
+ kfree(master_domain);
+err_free_vmaster:
+ kfree(state->vmaster);
+ return ret;
+}
+
+/*
+ * Commit is done after the STE/CD are configured with the EATS setting. It
+ * completes synchronizing the PCI device's ATC and finishes manipulating the
+ * smmu_domain->devices list.
+ */
+void arm_smmu_attach_commit(struct arm_smmu_attach_state *state)
+{
+ struct arm_smmu_master *master = state->master;
+
+ lockdep_assert_held(&arm_smmu_asid_lock);
+
+ arm_smmu_attach_commit_vmaster(state);
+
+ if (state->ats_enabled && !master->ats_enabled) {
+ arm_smmu_enable_ats(master);
+ } else if (state->ats_enabled && master->ats_enabled) {
+ /*
+ * The translation has changed, flush the ATC. At this point the
+ * SMMU is translating for the new domain and both the old&new
+ * domain will issue invalidations.
+ */
+ arm_smmu_atc_inv_master(master, state->ssid);
+ } else if (!state->ats_enabled && master->ats_enabled) {
+ /* ATS is being switched off, invalidate the entire ATC */
+ arm_smmu_atc_inv_master(master, IOMMU_NO_PASID);
+ }
+
+ arm_smmu_remove_master_domain(master, state->old_domain, state->ssid);
+ master->ats_enabled = state->ats_enabled;
+}
+
+static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev,
+ struct iommu_domain *old_domain)
+{
+ int ret = 0;
struct arm_smmu_ste target;
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct arm_smmu_device *smmu;
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+ struct arm_smmu_attach_state state = {
+ .old_domain = old_domain,
+ .ssid = IOMMU_NO_PASID,
+ };
struct arm_smmu_master *master;
struct arm_smmu_cd *cdptr;
if (!fwspec)
return -ENOENT;
- master = dev_iommu_priv_get(dev);
+ state.master = master = dev_iommu_priv_get(dev);
smmu = master->smmu;
- /*
- * Checking that SVA is disabled ensures that this device isn't bound to
- * any mm, and can be safely detached from its old domain. Bonds cannot
- * be removed concurrently since we're holding the group mutex.
- */
- if (arm_smmu_master_sva_enabled(master)) {
- dev_err(dev, "cannot attach - SVA enabled\n");
- return -EBUSY;
- }
-
- mutex_lock(&smmu_domain->init_mutex);
-
- if (!smmu_domain->smmu) {
- ret = arm_smmu_domain_finalise(smmu_domain, smmu);
- } else if (smmu_domain->smmu != smmu)
- ret = -EINVAL;
-
- mutex_unlock(&smmu_domain->init_mutex);
- if (ret)
- return ret;
+ if (smmu_domain->smmu != smmu)
+ return -EINVAL;
if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
cdptr = arm_smmu_alloc_cd_ptr(master, IOMMU_NO_PASID);
if (!cdptr)
return -ENOMEM;
- }
+ } else if (arm_smmu_ssids_in_use(&master->cd_table))
+ return -EBUSY;
/*
* Prevent arm_smmu_share_asid() from trying to change the ASID
@@ -2609,13 +3041,11 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
*/
mutex_lock(&arm_smmu_asid_lock);
- arm_smmu_detach_dev(master);
-
- master->ats_enabled = arm_smmu_ats_supported(master);
-
- spin_lock_irqsave(&smmu_domain->devices_lock, flags);
- list_add(&master->domain_head, &smmu_domain->devices);
- spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+ ret = arm_smmu_attach_prepare(&state, domain);
+ if (ret) {
+ mutex_unlock(&arm_smmu_asid_lock);
+ return ret;
+ }
switch (smmu_domain->stage) {
case ARM_SMMU_DOMAIN_S1: {
@@ -2624,29 +3054,165 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
arm_smmu_make_s1_cd(&target_cd, master, smmu_domain);
arm_smmu_write_cd_entry(master, IOMMU_NO_PASID, cdptr,
&target_cd);
- arm_smmu_make_cdtable_ste(&target, master);
+ arm_smmu_make_cdtable_ste(&target, master, state.ats_enabled,
+ STRTAB_STE_1_S1DSS_SSID0);
arm_smmu_install_ste_for_dev(master, &target);
break;
}
case ARM_SMMU_DOMAIN_S2:
- arm_smmu_make_s2_domain_ste(&target, master, smmu_domain);
+ arm_smmu_make_s2_domain_ste(&target, master, smmu_domain,
+ state.ats_enabled);
arm_smmu_install_ste_for_dev(master, &target);
arm_smmu_clear_cd(master, IOMMU_NO_PASID);
break;
}
- arm_smmu_enable_ats(master, smmu_domain);
+ arm_smmu_attach_commit(&state);
mutex_unlock(&arm_smmu_asid_lock);
return 0;
}
-static int arm_smmu_attach_dev_ste(struct device *dev,
- struct arm_smmu_ste *ste)
+static int arm_smmu_s1_set_dev_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t id,
+ struct iommu_domain *old)
{
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+ struct arm_smmu_device *smmu = master->smmu;
+ struct arm_smmu_cd target_cd;
- if (arm_smmu_master_sva_enabled(master))
- return -EBUSY;
+ if (smmu_domain->smmu != smmu)
+ return -EINVAL;
+
+ if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1)
+ return -EINVAL;
+
+ /*
+ * We can read cd.asid outside the lock because arm_smmu_set_pasid()
+ * will fix it
+ */
+ arm_smmu_make_s1_cd(&target_cd, master, smmu_domain);
+ return arm_smmu_set_pasid(master, to_smmu_domain(domain), id,
+ &target_cd, old);
+}
+
+static void arm_smmu_update_ste(struct arm_smmu_master *master,
+ struct iommu_domain *sid_domain,
+ bool ats_enabled)
+{
+ unsigned int s1dss = STRTAB_STE_1_S1DSS_TERMINATE;
+ struct arm_smmu_ste ste;
+
+ if (master->cd_table.in_ste && master->ste_ats_enabled == ats_enabled)
+ return;
+
+ if (sid_domain->type == IOMMU_DOMAIN_IDENTITY)
+ s1dss = STRTAB_STE_1_S1DSS_BYPASS;
+ else
+ WARN_ON(sid_domain->type != IOMMU_DOMAIN_BLOCKED);
+
+ /*
+ * Change the STE into a cdtable one with SID IDENTITY/BLOCKED behavior
+ * using s1dss if necessary. If the cd_table is already installed then
+ * the S1DSS is correct and this will just update the EATS. Otherwise it
+ * installs the entire thing. This will be hitless.
+ */
+ arm_smmu_make_cdtable_ste(&ste, master, ats_enabled, s1dss);
+ arm_smmu_install_ste_for_dev(master, &ste);
+}
+
+int arm_smmu_set_pasid(struct arm_smmu_master *master,
+ struct arm_smmu_domain *smmu_domain, ioasid_t pasid,
+ struct arm_smmu_cd *cd, struct iommu_domain *old)
+{
+ struct iommu_domain *sid_domain = iommu_get_domain_for_dev(master->dev);
+ struct arm_smmu_attach_state state = {
+ .master = master,
+ .ssid = pasid,
+ .old_domain = old,
+ };
+ struct arm_smmu_cd *cdptr;
+ int ret;
+
+ /* The core code validates pasid */
+
+ if (smmu_domain->smmu != master->smmu)
+ return -EINVAL;
+
+ if (!master->cd_table.in_ste &&
+ sid_domain->type != IOMMU_DOMAIN_IDENTITY &&
+ sid_domain->type != IOMMU_DOMAIN_BLOCKED)
+ return -EINVAL;
+
+ cdptr = arm_smmu_alloc_cd_ptr(master, pasid);
+ if (!cdptr)
+ return -ENOMEM;
+
+ mutex_lock(&arm_smmu_asid_lock);
+ ret = arm_smmu_attach_prepare(&state, &smmu_domain->domain);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * We don't want to obtain to the asid_lock too early, so fix up the
+ * caller set ASID under the lock in case it changed.
+ */
+ cd->data[0] &= ~cpu_to_le64(CTXDESC_CD_0_ASID);
+ cd->data[0] |= cpu_to_le64(
+ FIELD_PREP(CTXDESC_CD_0_ASID, smmu_domain->cd.asid));
+
+ arm_smmu_write_cd_entry(master, pasid, cdptr, cd);
+ arm_smmu_update_ste(master, sid_domain, state.ats_enabled);
+
+ arm_smmu_attach_commit(&state);
+
+out_unlock:
+ mutex_unlock(&arm_smmu_asid_lock);
+ return ret;
+}
+
+static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old_domain)
+{
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(old_domain);
+ struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+
+ mutex_lock(&arm_smmu_asid_lock);
+ arm_smmu_clear_cd(master, pasid);
+ if (master->ats_enabled)
+ arm_smmu_atc_inv_master(master, pasid);
+ arm_smmu_remove_master_domain(master, &smmu_domain->domain, pasid);
+ mutex_unlock(&arm_smmu_asid_lock);
+
+ /*
+ * When the last user of the CD table goes away downgrade the STE back
+ * to a non-cd_table one, by re-attaching its sid_domain.
+ */
+ if (!arm_smmu_ssids_in_use(&master->cd_table)) {
+ struct iommu_domain *sid_domain =
+ iommu_get_domain_for_dev(master->dev);
+
+ if (sid_domain->type == IOMMU_DOMAIN_IDENTITY ||
+ sid_domain->type == IOMMU_DOMAIN_BLOCKED)
+ sid_domain->ops->attach_dev(sid_domain, dev,
+ sid_domain);
+ }
+ return 0;
+}
+
+static void arm_smmu_attach_dev_ste(struct iommu_domain *domain,
+ struct iommu_domain *old_domain,
+ struct device *dev,
+ struct arm_smmu_ste *ste,
+ unsigned int s1dss)
+{
+ struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+ struct arm_smmu_attach_state state = {
+ .master = master,
+ .old_domain = old_domain,
+ .ssid = IOMMU_NO_PASID,
+ };
/*
* Do not allow any ASID to be changed while are working on the STE,
@@ -2655,15 +3221,25 @@ static int arm_smmu_attach_dev_ste(struct device *dev,
mutex_lock(&arm_smmu_asid_lock);
/*
- * The SMMU does not support enabling ATS with bypass/abort. When the
- * STE is in bypass (STE.Config[2:0] == 0b100), ATS Translation Requests
- * and Translated transactions are denied as though ATS is disabled for
- * the stream (STE.EATS == 0b00), causing F_BAD_ATS_TREQ and
- * F_TRANSL_FORBIDDEN events (IHI0070Ea 5.2 Stream Table Entry).
+ * If the CD table is not in use we can use the provided STE, otherwise
+ * we use a cdtable STE with the provided S1DSS.
*/
- arm_smmu_detach_dev(master);
-
+ if (arm_smmu_ssids_in_use(&master->cd_table)) {
+ /*
+ * If a CD table has to be present then we need to run with ATS
+ * on because we have to assume a PASID is using ATS. For
+ * IDENTITY this will setup things so that S1DSS=bypass which
+ * follows the explanation in "13.6.4 Full ATS skipping stage 1"
+ * and allows for ATS on the RID to work.
+ */
+ state.cd_needs_ats = true;
+ arm_smmu_attach_prepare(&state, domain);
+ arm_smmu_make_cdtable_ste(ste, master, state.ats_enabled, s1dss);
+ } else {
+ arm_smmu_attach_prepare(&state, domain);
+ }
arm_smmu_install_ste_for_dev(master, ste);
+ arm_smmu_attach_commit(&state);
mutex_unlock(&arm_smmu_asid_lock);
/*
@@ -2672,17 +3248,20 @@ static int arm_smmu_attach_dev_ste(struct device *dev,
* descriptor from arm_smmu_share_asid().
*/
arm_smmu_clear_cd(master, IOMMU_NO_PASID);
- return 0;
}
static int arm_smmu_attach_dev_identity(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old_domain)
{
struct arm_smmu_ste ste;
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+ arm_smmu_master_clear_vmaster(master);
arm_smmu_make_bypass_ste(master->smmu, &ste);
- return arm_smmu_attach_dev_ste(dev, &ste);
+ arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste,
+ STRTAB_STE_1_S1DSS_BYPASS);
+ return 0;
}
static const struct iommu_domain_ops arm_smmu_identity_ops = {
@@ -2695,16 +3274,22 @@ static struct iommu_domain arm_smmu_identity_domain = {
};
static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old_domain)
{
struct arm_smmu_ste ste;
+ struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+ arm_smmu_master_clear_vmaster(master);
arm_smmu_make_abort_ste(&ste);
- return arm_smmu_attach_dev_ste(dev, &ste);
+ arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste,
+ STRTAB_STE_1_S1DSS_TERMINATE);
+ return 0;
}
static const struct iommu_domain_ops arm_smmu_blocked_ops = {
.attach_dev = arm_smmu_attach_dev_blocked,
+ .set_dev_pasid = arm_smmu_blocking_set_dev_pasid,
};
static struct iommu_domain arm_smmu_blocked_domain = {
@@ -2712,6 +3297,69 @@ static struct iommu_domain arm_smmu_blocked_domain = {
.ops = &arm_smmu_blocked_ops,
};
+static struct iommu_domain *
+arm_smmu_domain_alloc_paging_flags(struct device *dev, u32 flags,
+ const struct iommu_user_data *user_data)
+{
+ struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+ struct arm_smmu_device *smmu = master->smmu;
+ const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
+ IOMMU_HWPT_ALLOC_PASID |
+ IOMMU_HWPT_ALLOC_NEST_PARENT;
+ struct arm_smmu_domain *smmu_domain;
+ int ret;
+
+ if (flags & ~PAGING_FLAGS)
+ return ERR_PTR(-EOPNOTSUPP);
+ if (user_data)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ smmu_domain = arm_smmu_domain_alloc();
+ if (IS_ERR(smmu_domain))
+ return ERR_CAST(smmu_domain);
+
+ switch (flags) {
+ case 0:
+ /* Prefer S1 if available */
+ if (smmu->features & ARM_SMMU_FEAT_TRANS_S1)
+ smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
+ else
+ smmu_domain->stage = ARM_SMMU_DOMAIN_S2;
+ break;
+ case IOMMU_HWPT_ALLOC_NEST_PARENT:
+ if (!(smmu->features & ARM_SMMU_FEAT_NESTING)) {
+ ret = -EOPNOTSUPP;
+ goto err_free;
+ }
+ smmu_domain->stage = ARM_SMMU_DOMAIN_S2;
+ smmu_domain->nest_parent = true;
+ break;
+ case IOMMU_HWPT_ALLOC_DIRTY_TRACKING:
+ case IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_ALLOC_PASID:
+ case IOMMU_HWPT_ALLOC_PASID:
+ if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S1)) {
+ ret = -EOPNOTSUPP;
+ goto err_free;
+ }
+ smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
+ break;
+ default:
+ ret = -EOPNOTSUPP;
+ goto err_free;
+ }
+
+ smmu_domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
+ smmu_domain->domain.ops = arm_smmu_ops.default_domain_ops;
+ ret = arm_smmu_domain_finalise(smmu_domain, smmu, flags);
+ if (ret)
+ goto err_free;
+ return &smmu_domain->domain;
+
+err_free:
+ kfree(smmu_domain);
+ return ERR_PTR(ret);
+}
+
static int arm_smmu_map_pages(struct iommu_domain *domain, unsigned long iova,
phys_addr_t paddr, size_t pgsize, size_t pgcount,
int prot, gfp_t gfp, size_t *mapped)
@@ -2774,20 +3422,17 @@ static struct platform_driver arm_smmu_driver;
static
struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode)
{
- struct device *dev = driver_find_device_by_fwnode(&arm_smmu_driver.driver,
- fwnode);
+ struct device *dev = bus_find_device_by_fwnode(&platform_bus_type, fwnode);
+
put_device(dev);
return dev ? dev_get_drvdata(dev) : NULL;
}
static bool arm_smmu_sid_in_range(struct arm_smmu_device *smmu, u32 sid)
{
- unsigned long limit = smmu->strtab_cfg.num_l1_ents;
-
if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB)
- limit *= 1UL << STRTAB_SPLIT;
-
- return sid < limit;
+ return arm_smmu_strtab_l1_idx(sid) < smmu->strtab_cfg.l2.num_l1_ents;
+ return sid < smmu->strtab_cfg.linear.num_ents;
}
static int arm_smmu_init_sid_strtab(struct arm_smmu_device *smmu, u32 sid)
@@ -2808,8 +3453,6 @@ static int arm_smmu_insert_master(struct arm_smmu_device *smmu,
{
int i;
int ret = 0;
- struct arm_smmu_stream *new_stream, *cur_stream;
- struct rb_node **new_node, *parent_node = NULL;
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(master->dev);
master->streams = kcalloc(fwspec->num_ids, sizeof(*master->streams),
@@ -2820,9 +3463,10 @@ static int arm_smmu_insert_master(struct arm_smmu_device *smmu,
mutex_lock(&smmu->streams_mutex);
for (i = 0; i < fwspec->num_ids; i++) {
+ struct arm_smmu_stream *new_stream = &master->streams[i];
+ struct rb_node *existing;
u32 sid = fwspec->ids[i];
- new_stream = &master->streams[i];
new_stream->id = sid;
new_stream->master = master;
@@ -2831,28 +3475,23 @@ static int arm_smmu_insert_master(struct arm_smmu_device *smmu,
break;
/* Insert into SID tree */
- new_node = &(smmu->streams.rb_node);
- while (*new_node) {
- cur_stream = rb_entry(*new_node, struct arm_smmu_stream,
- node);
- parent_node = *new_node;
- if (cur_stream->id > new_stream->id) {
- new_node = &((*new_node)->rb_left);
- } else if (cur_stream->id < new_stream->id) {
- new_node = &((*new_node)->rb_right);
- } else {
- dev_warn(master->dev,
- "stream %u already in tree\n",
- cur_stream->id);
- ret = -EINVAL;
- break;
- }
- }
- if (ret)
- break;
+ existing = rb_find_add(&new_stream->node, &smmu->streams,
+ arm_smmu_streams_cmp_node);
+ if (existing) {
+ struct arm_smmu_master *existing_master =
+ rb_entry(existing, struct arm_smmu_stream, node)
+ ->master;
+
+ /* Bridged PCI devices may end up with duplicated IDs */
+ if (existing_master == master)
+ continue;
- rb_link_node(&new_stream->node, parent_node, new_node);
- rb_insert_color(&new_stream->node, &smmu->streams);
+ dev_warn(master->dev,
+ "Aliasing StreamID 0x%x (from %s) unsupported, expect DMA to be broken\n",
+ sid, dev_name(existing_master->dev));
+ ret = -ENODEV;
+ break;
+ }
}
if (ret) {
@@ -2882,8 +3521,6 @@ static void arm_smmu_remove_master(struct arm_smmu_master *master)
kfree(master->streams);
}
-static struct iommu_ops arm_smmu_ops;
-
static struct iommu_device *arm_smmu_probe_device(struct device *dev)
{
int ret;
@@ -2904,8 +3541,6 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev)
master->dev = dev;
master->smmu = smmu;
- INIT_LIST_HEAD(&master->bonds);
- INIT_LIST_HEAD(&master->domain_head);
dev_iommu_priv_set(dev, master);
ret = arm_smmu_insert_master(smmu, master);
@@ -2934,6 +3569,12 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev)
smmu->features & ARM_SMMU_FEAT_STALL_FORCE)
master->stall_enabled = true;
+ if (dev_is_pci(dev)) {
+ unsigned int stu = __ffs(smmu->pgsize_bitmap);
+
+ pci_prepare_ats(to_pci_dev(dev), stu);
+ }
+
return &smmu->iommu;
err_free_master:
@@ -2945,22 +3586,36 @@ static void arm_smmu_release_device(struct device *dev)
{
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
- if (WARN_ON(arm_smmu_master_sva_enabled(master)))
- iopf_queue_remove_device(master->smmu->evtq.iopf, dev);
-
- /* Put the STE back to what arm_smmu_init_strtab() sets */
- if (dev->iommu->require_direct)
- arm_smmu_attach_dev_identity(&arm_smmu_identity_domain, dev);
- else
- arm_smmu_attach_dev_blocked(&arm_smmu_blocked_domain, dev);
+ WARN_ON(master->iopf_refcount);
arm_smmu_disable_pasid(master);
arm_smmu_remove_master(master);
- if (master->cd_table.cdtab)
+ if (arm_smmu_cdtab_allocated(&master->cd_table))
arm_smmu_free_cd_tables(master);
kfree(master);
}
+static int arm_smmu_read_and_clear_dirty(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ unsigned long flags,
+ struct iommu_dirty_bitmap *dirty)
+{
+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+ struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
+
+ return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
+}
+
+static int arm_smmu_set_dirty_tracking(struct iommu_domain *domain,
+ bool enabled)
+{
+ /*
+ * Always enabled and the dirty bitmap is cleared prior to
+ * set_dirty_tracking().
+ */
+ return 0;
+}
+
static struct iommu_group *arm_smmu_device_group(struct device *dev)
{
struct iommu_group *group;
@@ -2978,21 +3633,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
return group;
}
-static int arm_smmu_enable_nesting(struct iommu_domain *domain)
-{
- struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
- int ret = 0;
-
- mutex_lock(&smmu_domain->init_mutex);
- if (smmu_domain->smmu)
- ret = -EPERM;
- else
- smmu_domain->stage = ARM_SMMU_DOMAIN_S2;
- mutex_unlock(&smmu_domain->init_mutex);
-
- return ret;
-}
-
static int arm_smmu_of_xlate(struct device *dev,
const struct of_phandle_args *args)
{
@@ -3015,58 +3655,6 @@ static void arm_smmu_get_resv_regions(struct device *dev,
iommu_dma_get_resv_regions(dev, head);
}
-static int arm_smmu_dev_enable_feature(struct device *dev,
- enum iommu_dev_features feat)
-{
- struct arm_smmu_master *master = dev_iommu_priv_get(dev);
-
- if (!master)
- return -ENODEV;
-
- switch (feat) {
- case IOMMU_DEV_FEAT_IOPF:
- if (!arm_smmu_master_iopf_supported(master))
- return -EINVAL;
- if (master->iopf_enabled)
- return -EBUSY;
- master->iopf_enabled = true;
- return 0;
- case IOMMU_DEV_FEAT_SVA:
- if (!arm_smmu_master_sva_supported(master))
- return -EINVAL;
- if (arm_smmu_master_sva_enabled(master))
- return -EBUSY;
- return arm_smmu_master_enable_sva(master);
- default:
- return -EINVAL;
- }
-}
-
-static int arm_smmu_dev_disable_feature(struct device *dev,
- enum iommu_dev_features feat)
-{
- struct arm_smmu_master *master = dev_iommu_priv_get(dev);
-
- if (!master)
- return -EINVAL;
-
- switch (feat) {
- case IOMMU_DEV_FEAT_IOPF:
- if (!master->iopf_enabled)
- return -EINVAL;
- if (master->sva_enabled)
- return -EBUSY;
- master->iopf_enabled = false;
- return 0;
- case IOMMU_DEV_FEAT_SVA:
- if (!arm_smmu_master_sva_enabled(master))
- return -EINVAL;
- return arm_smmu_master_disable_sva(master);
- default:
- return -EINVAL;
- }
-}
-
/*
* HiSilicon PCIe tune and trace device can be used to trace TLP headers on the
* PCIe link and save the data to memory by DMA. The hardware is restricted to
@@ -3087,49 +3675,48 @@ static int arm_smmu_def_domain_type(struct device *dev)
return 0;
}
-static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
- struct iommu_domain *domain)
-{
- arm_smmu_sva_remove_dev_pasid(domain, dev, pasid);
-}
-
-static struct iommu_ops arm_smmu_ops = {
+static const struct iommu_ops arm_smmu_ops = {
.identity_domain = &arm_smmu_identity_domain,
.blocked_domain = &arm_smmu_blocked_domain,
+ .release_domain = &arm_smmu_blocked_domain,
.capable = arm_smmu_capable,
- .domain_alloc = arm_smmu_domain_alloc,
- .domain_alloc_paging = arm_smmu_domain_alloc_paging,
+ .hw_info = arm_smmu_hw_info,
+ .domain_alloc_sva = arm_smmu_sva_domain_alloc,
+ .domain_alloc_paging_flags = arm_smmu_domain_alloc_paging_flags,
.probe_device = arm_smmu_probe_device,
.release_device = arm_smmu_release_device,
.device_group = arm_smmu_device_group,
.of_xlate = arm_smmu_of_xlate,
.get_resv_regions = arm_smmu_get_resv_regions,
- .remove_dev_pasid = arm_smmu_remove_dev_pasid,
- .dev_enable_feat = arm_smmu_dev_enable_feature,
- .dev_disable_feat = arm_smmu_dev_disable_feature,
.page_response = arm_smmu_page_response,
.def_domain_type = arm_smmu_def_domain_type,
- .pgsize_bitmap = -1UL, /* Restricted during device attach */
+ .get_viommu_size = arm_smmu_get_viommu_size,
+ .viommu_init = arm_vsmmu_init,
+ .user_pasid_table = 1,
.owner = THIS_MODULE,
.default_domain_ops = &(const struct iommu_domain_ops) {
.attach_dev = arm_smmu_attach_dev,
+ .enforce_cache_coherency = arm_smmu_enforce_cache_coherency,
+ .set_dev_pasid = arm_smmu_s1_set_dev_pasid,
.map_pages = arm_smmu_map_pages,
.unmap_pages = arm_smmu_unmap_pages,
.flush_iotlb_all = arm_smmu_flush_iotlb_all,
.iotlb_sync = arm_smmu_iotlb_sync,
.iova_to_phys = arm_smmu_iova_to_phys,
- .enable_nesting = arm_smmu_enable_nesting,
- .free = arm_smmu_domain_free,
+ .free = arm_smmu_domain_free_paging,
}
};
+static struct iommu_dirty_ops arm_smmu_dirty_ops = {
+ .read_and_clear_dirty = arm_smmu_read_and_clear_dirty,
+ .set_dirty_tracking = arm_smmu_set_dirty_tracking,
+};
+
/* Probing and initialisation functions */
-static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
- struct arm_smmu_queue *q,
- void __iomem *page,
- unsigned long prod_off,
- unsigned long cons_off,
- size_t dwords, const char *name)
+int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
+ struct arm_smmu_queue *q, void __iomem *page,
+ unsigned long prod_off, unsigned long cons_off,
+ size_t dwords, const char *name)
{
size_t qsz;
@@ -3167,9 +3754,9 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
return 0;
}
-static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
+int arm_smmu_cmdq_init(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq *cmdq)
{
- struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
unsigned int nents = 1 << cmdq->q.llq.max_n_shift;
atomic_set(&cmdq->owner_prod, 0);
@@ -3194,7 +3781,7 @@ static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
if (ret)
return ret;
- ret = arm_smmu_cmdq_init(smmu);
+ ret = arm_smmu_cmdq_init(smmu, &smmu->cmdq);
if (ret)
return ret;
@@ -3221,109 +3808,71 @@ static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
PRIQ_ENT_DWORDS, "priq");
}
-static int arm_smmu_init_l1_strtab(struct arm_smmu_device *smmu)
-{
- unsigned int i;
- struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
- void *strtab = smmu->strtab_cfg.strtab;
-
- cfg->l1_desc = devm_kcalloc(smmu->dev, cfg->num_l1_ents,
- sizeof(*cfg->l1_desc), GFP_KERNEL);
- if (!cfg->l1_desc)
- return -ENOMEM;
-
- for (i = 0; i < cfg->num_l1_ents; ++i) {
- arm_smmu_write_strtab_l1_desc(strtab, &cfg->l1_desc[i]);
- strtab += STRTAB_L1_DESC_DWORDS << 3;
- }
-
- return 0;
-}
-
static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu)
{
- void *strtab;
- u64 reg;
- u32 size, l1size;
+ u32 l1size;
struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+ unsigned int last_sid_idx =
+ arm_smmu_strtab_l1_idx((1ULL << smmu->sid_bits) - 1);
/* Calculate the L1 size, capped to the SIDSIZE. */
- size = STRTAB_L1_SZ_SHIFT - (ilog2(STRTAB_L1_DESC_DWORDS) + 3);
- size = min(size, smmu->sid_bits - STRTAB_SPLIT);
- cfg->num_l1_ents = 1 << size;
-
- size += STRTAB_SPLIT;
- if (size < smmu->sid_bits)
+ cfg->l2.num_l1_ents = min(last_sid_idx + 1, STRTAB_MAX_L1_ENTRIES);
+ if (cfg->l2.num_l1_ents <= last_sid_idx)
dev_warn(smmu->dev,
"2-level strtab only covers %u/%u bits of SID\n",
- size, smmu->sid_bits);
+ ilog2(cfg->l2.num_l1_ents * STRTAB_NUM_L2_STES),
+ smmu->sid_bits);
- l1size = cfg->num_l1_ents * (STRTAB_L1_DESC_DWORDS << 3);
- strtab = dmam_alloc_coherent(smmu->dev, l1size, &cfg->strtab_dma,
- GFP_KERNEL);
- if (!strtab) {
+ l1size = cfg->l2.num_l1_ents * sizeof(struct arm_smmu_strtab_l1);
+ cfg->l2.l1tab = dmam_alloc_coherent(smmu->dev, l1size, &cfg->l2.l1_dma,
+ GFP_KERNEL);
+ if (!cfg->l2.l1tab) {
dev_err(smmu->dev,
"failed to allocate l1 stream table (%u bytes)\n",
l1size);
return -ENOMEM;
}
- cfg->strtab = strtab;
- /* Configure strtab_base_cfg for 2 levels */
- reg = FIELD_PREP(STRTAB_BASE_CFG_FMT, STRTAB_BASE_CFG_FMT_2LVL);
- reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, size);
- reg |= FIELD_PREP(STRTAB_BASE_CFG_SPLIT, STRTAB_SPLIT);
- cfg->strtab_base_cfg = reg;
+ cfg->l2.l2ptrs = devm_kcalloc(smmu->dev, cfg->l2.num_l1_ents,
+ sizeof(*cfg->l2.l2ptrs), GFP_KERNEL);
+ if (!cfg->l2.l2ptrs)
+ return -ENOMEM;
- return arm_smmu_init_l1_strtab(smmu);
+ return 0;
}
static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu)
{
- void *strtab;
- u64 reg;
u32 size;
struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
- size = (1 << smmu->sid_bits) * (STRTAB_STE_DWORDS << 3);
- strtab = dmam_alloc_coherent(smmu->dev, size, &cfg->strtab_dma,
- GFP_KERNEL);
- if (!strtab) {
+ size = (1 << smmu->sid_bits) * sizeof(struct arm_smmu_ste);
+ cfg->linear.table = dmam_alloc_coherent(smmu->dev, size,
+ &cfg->linear.ste_dma,
+ GFP_KERNEL);
+ if (!cfg->linear.table) {
dev_err(smmu->dev,
"failed to allocate linear stream table (%u bytes)\n",
size);
return -ENOMEM;
}
- cfg->strtab = strtab;
- cfg->num_l1_ents = 1 << smmu->sid_bits;
+ cfg->linear.num_ents = 1 << smmu->sid_bits;
- /* Configure strtab_base_cfg for a linear table covering all SIDs */
- reg = FIELD_PREP(STRTAB_BASE_CFG_FMT, STRTAB_BASE_CFG_FMT_LINEAR);
- reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, smmu->sid_bits);
- cfg->strtab_base_cfg = reg;
-
- arm_smmu_init_initial_stes(strtab, cfg->num_l1_ents);
+ arm_smmu_init_initial_stes(cfg->linear.table, cfg->linear.num_ents);
return 0;
}
static int arm_smmu_init_strtab(struct arm_smmu_device *smmu)
{
- u64 reg;
int ret;
if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB)
ret = arm_smmu_init_strtab_2lvl(smmu);
else
ret = arm_smmu_init_strtab_linear(smmu);
-
if (ret)
return ret;
- /* Set the strtab base address */
- reg = smmu->strtab_cfg.strtab_dma & STRTAB_BASE_ADDR_MASK;
- reg |= STRTAB_BASE_RA;
- smmu->strtab_cfg.strtab_base = reg;
-
ida_init(&smmu->vmid_map);
return 0;
@@ -3340,7 +3889,14 @@ static int arm_smmu_init_structures(struct arm_smmu_device *smmu)
if (ret)
return ret;
- return arm_smmu_init_strtab(smmu);
+ ret = arm_smmu_init_strtab(smmu);
+ if (ret)
+ return ret;
+
+ if (smmu->impl_ops && smmu->impl_ops->init_structures)
+ return smmu->impl_ops->init_structures(smmu);
+
+ return 0;
}
static int arm_smmu_write_reg_sync(struct arm_smmu_device *smmu, u32 val,
@@ -3532,6 +4088,30 @@ static int arm_smmu_device_disable(struct arm_smmu_device *smmu)
return ret;
}
+static void arm_smmu_write_strtab(struct arm_smmu_device *smmu)
+{
+ struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+ dma_addr_t dma;
+ u32 reg;
+
+ if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) {
+ reg = FIELD_PREP(STRTAB_BASE_CFG_FMT,
+ STRTAB_BASE_CFG_FMT_2LVL) |
+ FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE,
+ ilog2(cfg->l2.num_l1_ents) + STRTAB_SPLIT) |
+ FIELD_PREP(STRTAB_BASE_CFG_SPLIT, STRTAB_SPLIT);
+ dma = cfg->l2.l1_dma;
+ } else {
+ reg = FIELD_PREP(STRTAB_BASE_CFG_FMT,
+ STRTAB_BASE_CFG_FMT_LINEAR) |
+ FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, smmu->sid_bits);
+ dma = cfg->linear.ste_dma;
+ }
+ writeq_relaxed((dma & STRTAB_BASE_ADDR_MASK) | STRTAB_BASE_RA,
+ smmu->base + ARM_SMMU_STRTAB_BASE);
+ writel_relaxed(reg, smmu->base + ARM_SMMU_STRTAB_BASE_CFG);
+}
+
static int arm_smmu_device_reset(struct arm_smmu_device *smmu)
{
int ret;
@@ -3567,10 +4147,7 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu)
writel_relaxed(reg, smmu->base + ARM_SMMU_CR2);
/* Stream table */
- writeq_relaxed(smmu->strtab_cfg.strtab_base,
- smmu->base + ARM_SMMU_STRTAB_BASE);
- writel_relaxed(smmu->strtab_cfg.strtab_base_cfg,
- smmu->base + ARM_SMMU_STRTAB_BASE_CFG);
+ arm_smmu_write_strtab(smmu);
/* Command queue */
writeq_relaxed(smmu->cmdq.q.q_base, smmu->base + ARM_SMMU_CMDQ_BASE);
@@ -3657,6 +4234,14 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu)
return ret;
}
+ if (smmu->impl_ops && smmu->impl_ops->device_reset) {
+ ret = smmu->impl_ops->device_reset(smmu);
+ if (ret) {
+ dev_err(smmu->dev, "failed to reset impl\n");
+ return ret;
+ }
+ }
+
return 0;
}
@@ -3698,6 +4283,28 @@ static void arm_smmu_device_iidr_probe(struct arm_smmu_device *smmu)
}
}
+static void arm_smmu_get_httu(struct arm_smmu_device *smmu, u32 reg)
+{
+ u32 fw_features = smmu->features & (ARM_SMMU_FEAT_HA | ARM_SMMU_FEAT_HD);
+ u32 hw_features = 0;
+
+ switch (FIELD_GET(IDR0_HTTU, reg)) {
+ case IDR0_HTTU_ACCESS_DIRTY:
+ hw_features |= ARM_SMMU_FEAT_HD;
+ fallthrough;
+ case IDR0_HTTU_ACCESS:
+ hw_features |= ARM_SMMU_FEAT_HA;
+ }
+
+ if (smmu->dev->of_node)
+ smmu->features |= hw_features;
+ else if (hw_features != fw_features)
+ /* ACPI IORT sets the HTTU bits */
+ dev_warn(smmu->dev,
+ "IDR0.HTTU features(0x%x) overridden by FW configuration (0x%x)\n",
+ hw_features, fw_features);
+}
+
static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
{
u32 reg;
@@ -3758,13 +4365,15 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
smmu->features |= ARM_SMMU_FEAT_E2H;
}
+ arm_smmu_get_httu(smmu, reg);
+
/*
* The coherency feature as set by FW is used in preference to the ID
* register, but warn on mismatch.
*/
if (!!(reg & IDR0_COHACC) != coherent)
dev_warn(smmu->dev, "IDR0.COHACC overridden by FW configuration (%s)\n",
- coherent ? "true" : "false");
+ str_true_false(coherent));
switch (FIELD_GET(IDR0_STALL_MODEL, reg)) {
case IDR0_STALL_MODEL_FORCE:
@@ -3847,6 +4456,11 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
reg = readl_relaxed(smmu->base + ARM_SMMU_IDR3);
if (FIELD_GET(IDR3_RIL, reg))
smmu->features |= ARM_SMMU_FEAT_RANGE_INV;
+ if (FIELD_GET(IDR3_FWB, reg))
+ smmu->features |= ARM_SMMU_FEAT_S2FWB;
+
+ if (FIELD_GET(IDR3_BBM, reg) == 2)
+ smmu->features |= ARM_SMMU_FEAT_BBML2;
/* IDR5 */
reg = readl_relaxed(smmu->base + ARM_SMMU_IDR5);
@@ -3895,11 +4509,6 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
smmu->oas = 48;
}
- if (arm_smmu_ops.pgsize_bitmap == -1UL)
- arm_smmu_ops.pgsize_bitmap = smmu->pgsize_bitmap;
- else
- arm_smmu_ops.pgsize_bitmap |= smmu->pgsize_bitmap;
-
/* Set the DMA mask for our table walker */
if (dma_set_mask_and_coherent(smmu->dev, DMA_BIT_MASK(smmu->oas)))
dev_warn(smmu->dev,
@@ -3922,18 +4531,55 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
}
#ifdef CONFIG_ACPI
-static void acpi_smmu_get_options(u32 model, struct arm_smmu_device *smmu)
+#ifdef CONFIG_TEGRA241_CMDQV
+static void acpi_smmu_dsdt_probe_tegra241_cmdqv(struct acpi_iort_node *node,
+ struct arm_smmu_device *smmu)
+{
+ const char *uid = kasprintf(GFP_KERNEL, "%u", node->identifier);
+ struct acpi_device *adev;
+
+ /* Look for an NVDA200C node whose _UID matches the SMMU node ID */
+ adev = acpi_dev_get_first_match_dev("NVDA200C", uid, -1);
+ if (adev) {
+ /* Tegra241 CMDQV driver is responsible for put_device() */
+ smmu->impl_dev = &adev->dev;
+ smmu->options |= ARM_SMMU_OPT_TEGRA241_CMDQV;
+ dev_info(smmu->dev, "found companion CMDQV device: %s\n",
+ dev_name(smmu->impl_dev));
+ }
+ kfree(uid);
+}
+#else
+static void acpi_smmu_dsdt_probe_tegra241_cmdqv(struct acpi_iort_node *node,
+ struct arm_smmu_device *smmu)
{
- switch (model) {
+}
+#endif
+
+static int acpi_smmu_iort_probe_model(struct acpi_iort_node *node,
+ struct arm_smmu_device *smmu)
+{
+ struct acpi_iort_smmu_v3 *iort_smmu =
+ (struct acpi_iort_smmu_v3 *)node->node_data;
+
+ switch (iort_smmu->model) {
case ACPI_IORT_SMMU_V3_CAVIUM_CN99XX:
smmu->options |= ARM_SMMU_OPT_PAGE0_REGS_ONLY;
break;
case ACPI_IORT_SMMU_V3_HISILICON_HI161X:
smmu->options |= ARM_SMMU_OPT_SKIP_PREFETCH;
break;
+ case ACPI_IORT_SMMU_V3_GENERIC:
+ /*
+ * Tegra241 implementation stores its SMMU options and impl_dev
+ * in DSDT. Thus, go through the ACPI tables unconditionally.
+ */
+ acpi_smmu_dsdt_probe_tegra241_cmdqv(node, smmu);
+ break;
}
dev_notice(smmu->dev, "option mask 0x%x\n", smmu->options);
+ return 0;
}
static int arm_smmu_device_acpi_probe(struct platform_device *pdev,
@@ -3948,12 +4594,18 @@ static int arm_smmu_device_acpi_probe(struct platform_device *pdev,
/* Retrieve SMMUv3 specific data */
iort_smmu = (struct acpi_iort_smmu_v3 *)node->node_data;
- acpi_smmu_get_options(iort_smmu->model, smmu);
-
if (iort_smmu->flags & ACPI_IORT_SMMU_V3_COHACC_OVERRIDE)
smmu->features |= ARM_SMMU_FEAT_COHERENCY;
- return 0;
+ switch (FIELD_GET(ACPI_IORT_SMMU_V3_HTTU_OVERRIDE, iort_smmu->flags)) {
+ case IDR0_HTTU_ACCESS_DIRTY:
+ smmu->features |= ARM_SMMU_FEAT_HD;
+ fallthrough;
+ case IDR0_HTTU_ACCESS:
+ smmu->features |= ARM_SMMU_FEAT_HA;
+ }
+
+ return acpi_smmu_iort_probe_model(node, smmu);
}
#else
static inline int arm_smmu_device_acpi_probe(struct platform_device *pdev,
@@ -4034,6 +4686,53 @@ static void arm_smmu_rmr_install_bypass_ste(struct arm_smmu_device *smmu)
iort_put_rmr_sids(dev_fwnode(smmu->dev), &rmr_list);
}
+static void arm_smmu_impl_remove(void *data)
+{
+ struct arm_smmu_device *smmu = data;
+
+ if (smmu->impl_ops && smmu->impl_ops->device_remove)
+ smmu->impl_ops->device_remove(smmu);
+}
+
+/*
+ * Probe all the compiled in implementations. Each one checks to see if it
+ * matches this HW and if so returns a devm_krealloc'd arm_smmu_device which
+ * replaces the callers. Otherwise the original is returned or ERR_PTR.
+ */
+static struct arm_smmu_device *arm_smmu_impl_probe(struct arm_smmu_device *smmu)
+{
+ struct arm_smmu_device *new_smmu = ERR_PTR(-ENODEV);
+ const struct arm_smmu_impl_ops *ops;
+ int ret;
+
+ if (smmu->impl_dev && (smmu->options & ARM_SMMU_OPT_TEGRA241_CMDQV))
+ new_smmu = tegra241_cmdqv_probe(smmu);
+
+ if (new_smmu == ERR_PTR(-ENODEV))
+ return smmu;
+ if (IS_ERR(new_smmu))
+ return new_smmu;
+
+ ops = new_smmu->impl_ops;
+ if (ops) {
+ /* get_viommu_size and vsmmu_init ops must be paired */
+ if (WARN_ON(!ops->get_viommu_size != !ops->vsmmu_init)) {
+ ret = -EINVAL;
+ goto err_remove;
+ }
+ }
+
+ ret = devm_add_action_or_reset(new_smmu->dev, arm_smmu_impl_remove,
+ new_smmu);
+ if (ret)
+ return ERR_PTR(ret);
+ return new_smmu;
+
+err_remove:
+ arm_smmu_impl_remove(new_smmu);
+ return ERR_PTR(ret);
+}
+
static int arm_smmu_device_probe(struct platform_device *pdev)
{
int irq, ret;
@@ -4055,6 +4754,10 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
if (ret)
return ret;
+ smmu = arm_smmu_impl_probe(smmu);
+ if (IS_ERR(smmu))
+ return PTR_ERR(smmu);
+
/* Base address */
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
if (!res)
@@ -4108,7 +4811,7 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
/* Initialise in-memory data structures */
ret = arm_smmu_init_structures(smmu);
if (ret)
- return ret;
+ goto err_free_iopf;
/* Record our private device structure */
platform_set_drvdata(pdev, smmu);
@@ -4119,22 +4822,29 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
/* Reset the device */
ret = arm_smmu_device_reset(smmu);
if (ret)
- return ret;
+ goto err_disable;
/* And we're up. Go go go! */
ret = iommu_device_sysfs_add(&smmu->iommu, dev, NULL,
"smmu3.%pa", &ioaddr);
if (ret)
- return ret;
+ goto err_disable;
ret = iommu_device_register(&smmu->iommu, &arm_smmu_ops, dev);
if (ret) {
dev_err(dev, "Failed to register iommu\n");
- iommu_device_sysfs_remove(&smmu->iommu);
- return ret;
+ goto err_free_sysfs;
}
return 0;
+
+err_free_sysfs:
+ iommu_device_sysfs_remove(&smmu->iommu);
+err_disable:
+ arm_smmu_device_disable(smmu);
+err_free_iopf:
+ iopf_queue_free(smmu->evtq.iopf);
+ return ret;
}
static void arm_smmu_device_remove(struct platform_device *pdev)
@@ -4174,7 +4884,7 @@ static struct platform_driver arm_smmu_driver = {
.suppress_bind_attrs = true,
},
.probe = arm_smmu_device_probe,
- .remove_new = arm_smmu_device_remove,
+ .remove = arm_smmu_device_remove,
.shutdown = arm_smmu_device_shutdown,
};
module_driver(arm_smmu_driver, platform_driver_register,
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 1242a086c9f9..ae23aacc3840 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -10,10 +10,14 @@
#include <linux/bitfield.h>
#include <linux/iommu.h>
+#include <linux/iommufd.h>
#include <linux/kernel.h>
#include <linux/mmzone.h>
#include <linux/sizes.h>
+struct arm_smmu_device;
+struct arm_vsmmu;
+
/* MMIO registers */
#define ARM_SMMU_IDR0 0x0
#define IDR0_ST_LVL GENMASK(28, 27)
@@ -33,6 +37,9 @@
#define IDR0_ASID16 (1 << 12)
#define IDR0_ATS (1 << 10)
#define IDR0_HYP (1 << 9)
+#define IDR0_HTTU GENMASK(7, 6)
+#define IDR0_HTTU_ACCESS 1
+#define IDR0_HTTU_ACCESS_DIRTY 2
#define IDR0_COHACC (1 << 4)
#define IDR0_TTF GENMASK(3, 2)
#define IDR0_TTF_AARCH64 2
@@ -52,7 +59,9 @@
#define IDR1_SIDSIZE GENMASK(5, 0)
#define ARM_SMMU_IDR3 0xc
+#define IDR3_FWB (1 << 8)
#define IDR3_RIL (1 << 10)
+#define IDR3_BBM GENMASK(12, 11)
#define ARM_SMMU_IDR5 0x14
#define IDR5_STALL_MAX GENMASK(31, 16)
@@ -76,6 +85,8 @@
#define IIDR_REVISION GENMASK(15, 12)
#define IIDR_IMPLEMENTER GENMASK(11, 0)
+#define ARM_SMMU_AIDR 0x1C
+
#define ARM_SMMU_CR0 0x20
#define CR0_ATSCHK (1 << 4)
#define CR0_CMDQEN (1 << 3)
@@ -199,10 +210,8 @@
* 2lvl: 128k L1 entries,
* 256 lazy entries per table (each table covers a PCI bus)
*/
-#define STRTAB_L1_SZ_SHIFT 20
#define STRTAB_SPLIT 8
-#define STRTAB_L1_DESC_DWORDS 1
#define STRTAB_L1_DESC_SPAN GENMASK_ULL(4, 0)
#define STRTAB_L1_DESC_L2PTR_MASK GENMASK_ULL(51, 6)
@@ -212,12 +221,33 @@ struct arm_smmu_ste {
__le64 data[STRTAB_STE_DWORDS];
};
+#define STRTAB_NUM_L2_STES (1 << STRTAB_SPLIT)
+struct arm_smmu_strtab_l2 {
+ struct arm_smmu_ste stes[STRTAB_NUM_L2_STES];
+};
+
+struct arm_smmu_strtab_l1 {
+ __le64 l2ptr;
+};
+#define STRTAB_MAX_L1_ENTRIES (1 << 17)
+
+static inline u32 arm_smmu_strtab_l1_idx(u32 sid)
+{
+ return sid / STRTAB_NUM_L2_STES;
+}
+
+static inline u32 arm_smmu_strtab_l2_idx(u32 sid)
+{
+ return sid % STRTAB_NUM_L2_STES;
+}
+
#define STRTAB_STE_0_V (1UL << 0)
#define STRTAB_STE_0_CFG GENMASK_ULL(3, 1)
#define STRTAB_STE_0_CFG_ABORT 0
#define STRTAB_STE_0_CFG_BYPASS 4
#define STRTAB_STE_0_CFG_S1_TRANS 5
#define STRTAB_STE_0_CFG_S2_TRANS 6
+#define STRTAB_STE_0_CFG_NESTED 7
#define STRTAB_STE_0_S1FMT GENMASK_ULL(5, 4)
#define STRTAB_STE_0_S1FMT_LINEAR 0
@@ -238,6 +268,8 @@ struct arm_smmu_ste {
#define STRTAB_STE_1_S1COR GENMASK_ULL(5, 4)
#define STRTAB_STE_1_S1CSH GENMASK_ULL(7, 6)
+#define STRTAB_STE_1_MEV (1UL << 19)
+#define STRTAB_STE_1_S2FWB (1UL << 25)
#define STRTAB_STE_1_S1STALLD (1UL << 27)
#define STRTAB_STE_1_EATS GENMASK_ULL(29, 28)
@@ -264,10 +296,20 @@ struct arm_smmu_ste {
#define STRTAB_STE_2_S2AA64 (1UL << 51)
#define STRTAB_STE_2_S2ENDI (1UL << 52)
#define STRTAB_STE_2_S2PTW (1UL << 54)
+#define STRTAB_STE_2_S2S (1UL << 57)
#define STRTAB_STE_2_S2R (1UL << 58)
#define STRTAB_STE_3_S2TTB_MASK GENMASK_ULL(51, 4)
+/* These bits can be controlled by userspace for STRTAB_STE_0_CFG_NESTED */
+#define STRTAB_STE_0_NESTING_ALLOWED \
+ cpu_to_le64(STRTAB_STE_0_V | STRTAB_STE_0_CFG | STRTAB_STE_0_S1FMT | \
+ STRTAB_STE_0_S1CTXPTR_MASK | STRTAB_STE_0_S1CDMAX)
+#define STRTAB_STE_1_NESTING_ALLOWED \
+ cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR | \
+ STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH | \
+ STRTAB_STE_1_S1STALLD | STRTAB_STE_1_EATS)
+
/*
* Context descriptors.
*
@@ -277,7 +319,6 @@ struct arm_smmu_ste {
*/
#define CTXDESC_L2_ENTRIES 1024
-#define CTXDESC_L1_DESC_DWORDS 1
#define CTXDESC_L1_DESC_V (1UL << 0)
#define CTXDESC_L1_DESC_L2PTR_MASK GENMASK_ULL(51, 12)
@@ -287,6 +328,24 @@ struct arm_smmu_cd {
__le64 data[CTXDESC_CD_DWORDS];
};
+struct arm_smmu_cdtab_l2 {
+ struct arm_smmu_cd cds[CTXDESC_L2_ENTRIES];
+};
+
+struct arm_smmu_cdtab_l1 {
+ __le64 l2ptr;
+};
+
+static inline unsigned int arm_smmu_cdtab_l1_idx(unsigned int ssid)
+{
+ return ssid / CTXDESC_L2_ENTRIES;
+}
+
+static inline unsigned int arm_smmu_cdtab_l2_idx(unsigned int ssid)
+{
+ return ssid % CTXDESC_L2_ENTRIES;
+}
+
#define CTXDESC_CD_0_TCR_T0SZ GENMASK_ULL(5, 0)
#define CTXDESC_CD_0_TCR_TG0 GENMASK_ULL(7, 6)
#define CTXDESC_CD_0_TCR_IRGN0 GENMASK_ULL(9, 8)
@@ -301,6 +360,9 @@ struct arm_smmu_cd {
#define CTXDESC_CD_0_TCR_IPS GENMASK_ULL(34, 32)
#define CTXDESC_CD_0_TCR_TBI0 (1ULL << 38)
+#define CTXDESC_CD_0_TCR_HA (1UL << 43)
+#define CTXDESC_CD_0_TCR_HD (1UL << 42)
+
#define CTXDESC_CD_0_AA64 (1UL << 41)
#define CTXDESC_CD_0_S (1UL << 44)
#define CTXDESC_CD_0_R (1UL << 45)
@@ -314,7 +376,7 @@ struct arm_smmu_cd {
* When the SMMU only supports linear context descriptor tables, pick a
* reasonable size limit (64kB).
*/
-#define CTXDESC_LINEAR_CDMAX ilog2(SZ_64K / (CTXDESC_CD_DWORDS << 3))
+#define CTXDESC_LINEAR_CDMAX ilog2(SZ_64K / sizeof(struct arm_smmu_cd))
/* Command queue */
#define CMDQ_ENT_SZ_SHIFT 4
@@ -393,10 +455,18 @@ struct arm_smmu_cd {
#define EVTQ_0_ID GENMASK_ULL(7, 0)
+#define EVT_ID_BAD_STREAMID_CONFIG 0x02
+#define EVT_ID_STE_FETCH_FAULT 0x03
+#define EVT_ID_BAD_STE_CONFIG 0x04
+#define EVT_ID_STREAM_DISABLED_FAULT 0x06
+#define EVT_ID_BAD_SUBSTREAMID_CONFIG 0x08
+#define EVT_ID_CD_FETCH_FAULT 0x09
+#define EVT_ID_BAD_CD_CONFIG 0x0a
#define EVT_ID_TRANSLATION_FAULT 0x10
#define EVT_ID_ADDR_SIZE_FAULT 0x11
#define EVT_ID_ACCESS_FAULT 0x12
#define EVT_ID_PERMISSION_FAULT 0x13
+#define EVT_ID_VMS_FETCH_FAULT 0x25
#define EVTQ_0_SSV (1UL << 11)
#define EVTQ_0_SSID GENMASK_ULL(31, 12)
@@ -408,9 +478,11 @@ struct arm_smmu_cd {
#define EVTQ_1_RnW (1UL << 35)
#define EVTQ_1_S2 (1UL << 39)
#define EVTQ_1_CLASS GENMASK_ULL(41, 40)
+#define EVTQ_1_CLASS_TT 0x01
#define EVTQ_1_TT_READ (1UL << 44)
#define EVTQ_2_ADDR GENMASK_ULL(63, 0)
#define EVTQ_3_IPA GENMASK_ULL(51, 12)
+#define EVTQ_3_FETCH_ADDR GENMASK_ULL(51, 3)
/* PRI queue */
#define PRIQ_ENT_SZ_SHIFT 4
@@ -467,8 +539,10 @@ struct arm_smmu_cmdq_ent {
};
} cfgi;
+ #define CMDQ_OP_TLBI_NH_ALL 0x10
#define CMDQ_OP_TLBI_NH_ASID 0x11
#define CMDQ_OP_TLBI_NH_VA 0x12
+ #define CMDQ_OP_TLBI_NH_VAA 0x13
#define CMDQ_OP_TLBI_EL2_ALL 0x20
#define CMDQ_OP_TLBI_EL2_ASID 0x21
#define CMDQ_OP_TLBI_EL2_VA 0x22
@@ -560,10 +634,18 @@ struct arm_smmu_cmdq {
atomic_long_t *valid_map;
atomic_t owner_prod;
atomic_t lock;
+ bool (*supports_cmd)(struct arm_smmu_cmdq_ent *ent);
};
+static inline bool arm_smmu_cmdq_supports_cmd(struct arm_smmu_cmdq *cmdq,
+ struct arm_smmu_cmdq_ent *ent)
+{
+ return cmdq->supports_cmd ? cmdq->supports_cmd(ent) : true;
+}
+
struct arm_smmu_cmdq_batch {
u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
+ struct arm_smmu_cmdq *cmdq;
int num;
};
@@ -578,52 +660,86 @@ struct arm_smmu_priq {
};
/* High-level stream table and context descriptor structures */
-struct arm_smmu_strtab_l1_desc {
- u8 span;
-
- struct arm_smmu_ste *l2ptr;
- dma_addr_t l2ptr_dma;
-};
-
struct arm_smmu_ctx_desc {
u16 asid;
-
- refcount_t refs;
- struct mm_struct *mm;
-};
-
-struct arm_smmu_l1_ctx_desc {
- struct arm_smmu_cd *l2ptr;
- dma_addr_t l2ptr_dma;
};
struct arm_smmu_ctx_desc_cfg {
- __le64 *cdtab;
+ union {
+ struct {
+ struct arm_smmu_cd *table;
+ unsigned int num_ents;
+ } linear;
+ struct {
+ struct arm_smmu_cdtab_l1 *l1tab;
+ struct arm_smmu_cdtab_l2 **l2ptrs;
+ unsigned int num_l1_ents;
+ } l2;
+ };
dma_addr_t cdtab_dma;
- struct arm_smmu_l1_ctx_desc *l1_desc;
- unsigned int num_l1_ents;
+ unsigned int used_ssids;
+ u8 in_ste;
u8 s1fmt;
/* log2 of the maximum number of CDs supported by this table */
u8 s1cdmax;
};
+static inline bool
+arm_smmu_cdtab_allocated(struct arm_smmu_ctx_desc_cfg *cfg)
+{
+ return cfg->linear.table || cfg->l2.l1tab;
+}
+
+/* True if the cd table has SSIDS > 0 in use. */
+static inline bool arm_smmu_ssids_in_use(struct arm_smmu_ctx_desc_cfg *cd_table)
+{
+ return cd_table->used_ssids;
+}
+
struct arm_smmu_s2_cfg {
u16 vmid;
};
struct arm_smmu_strtab_cfg {
- __le64 *strtab;
- dma_addr_t strtab_dma;
- struct arm_smmu_strtab_l1_desc *l1_desc;
- unsigned int num_l1_ents;
+ union {
+ struct {
+ struct arm_smmu_ste *table;
+ dma_addr_t ste_dma;
+ unsigned int num_ents;
+ } linear;
+ struct {
+ struct arm_smmu_strtab_l1 *l1tab;
+ struct arm_smmu_strtab_l2 **l2ptrs;
+ dma_addr_t l1_dma;
+ unsigned int num_l1_ents;
+ } l2;
+ };
+};
- u64 strtab_base;
- u32 strtab_base_cfg;
+struct arm_smmu_impl_ops {
+ int (*device_reset)(struct arm_smmu_device *smmu);
+ void (*device_remove)(struct arm_smmu_device *smmu);
+ int (*init_structures)(struct arm_smmu_device *smmu);
+ struct arm_smmu_cmdq *(*get_secondary_cmdq)(
+ struct arm_smmu_device *smmu, struct arm_smmu_cmdq_ent *ent);
+ /*
+ * An implementation should define its own type other than the default
+ * IOMMU_HW_INFO_TYPE_ARM_SMMUV3. And it must validate the input @type
+ * to return its own structure.
+ */
+ void *(*hw_info)(struct arm_smmu_device *smmu, u32 *length,
+ enum iommu_hw_info_type *type);
+ size_t (*get_viommu_size)(enum iommu_viommu_type viommu_type);
+ int (*vsmmu_init)(struct arm_vsmmu *vsmmu,
+ const struct iommu_user_data *user_data);
};
/* An SMMUv3 instance */
struct arm_smmu_device {
struct device *dev;
+ struct device *impl_dev;
+ const struct arm_smmu_impl_ops *impl_ops;
+
void __iomem *base;
void __iomem *page1;
@@ -648,12 +764,17 @@ struct arm_smmu_device {
#define ARM_SMMU_FEAT_E2H (1 << 18)
#define ARM_SMMU_FEAT_NESTING (1 << 19)
#define ARM_SMMU_FEAT_ATTR_TYPES_OVR (1 << 20)
+#define ARM_SMMU_FEAT_HA (1 << 21)
+#define ARM_SMMU_FEAT_HD (1 << 22)
+#define ARM_SMMU_FEAT_S2FWB (1 << 23)
+#define ARM_SMMU_FEAT_BBML2 (1 << 24)
u32 features;
#define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0)
#define ARM_SMMU_OPT_PAGE0_REGS_ONLY (1 << 1)
#define ARM_SMMU_OPT_MSIPOLL (1 << 2)
#define ARM_SMMU_OPT_CMDQ_FORCE_SYNC (1 << 3)
+#define ARM_SMMU_OPT_TEGRA241_CMDQV (1 << 4)
u32 options;
struct arm_smmu_cmdq cmdq;
@@ -692,21 +813,45 @@ struct arm_smmu_stream {
struct rb_node node;
};
+struct arm_smmu_vmaster {
+ struct arm_vsmmu *vsmmu;
+ unsigned long vsid;
+};
+
+struct arm_smmu_event {
+ u8 stall : 1,
+ ssv : 1,
+ privileged : 1,
+ instruction : 1,
+ s2 : 1,
+ read : 1,
+ ttrnw : 1,
+ class_tt : 1;
+ u8 id;
+ u8 class;
+ u16 stag;
+ u32 sid;
+ u32 ssid;
+ u64 iova;
+ u64 ipa;
+ u64 fetch_addr;
+ struct device *dev;
+};
+
/* SMMU private data for each master */
struct arm_smmu_master {
struct arm_smmu_device *smmu;
struct device *dev;
- struct list_head domain_head;
struct arm_smmu_stream *streams;
+ struct arm_smmu_vmaster *vmaster; /* use smmu->streams_mutex */
/* Locked by the iommu core using the group mutex */
struct arm_smmu_ctx_desc_cfg cd_table;
unsigned int num_streams;
- bool ats_enabled;
+ bool ats_enabled : 1;
+ bool ste_ats_enabled : 1;
bool stall_enabled;
- bool sva_enabled;
- bool iopf_enabled;
- struct list_head bonds;
unsigned int ssid_bits;
+ unsigned int iopf_refcount;
};
/* SMMU private data for an IOMMU domain */
@@ -717,7 +862,6 @@ enum arm_smmu_domain_stage {
struct arm_smmu_domain {
struct arm_smmu_device *smmu;
- struct mutex init_mutex; /* Protects smmu pointer */
struct io_pgtable_ops *pgtbl_ops;
atomic_t nr_ats_masters;
@@ -730,10 +874,21 @@ struct arm_smmu_domain {
struct iommu_domain domain;
+ /* List of struct arm_smmu_master_domain */
struct list_head devices;
spinlock_t devices_lock;
+ bool enforce_cache_coherency : 1;
+ bool nest_parent : 1;
+
+ struct mmu_notifier mmu_notifier;
+};
+
+struct arm_smmu_nested_domain {
+ struct iommu_domain domain;
+ struct arm_vsmmu *vsmmu;
+ bool enable_ats : 1;
- struct list_head mmu_notifiers;
+ __le64 ste[2];
};
/* The following are exposed for testing purposes. */
@@ -748,37 +903,59 @@ struct arm_smmu_entry_writer_ops {
void (*sync)(struct arm_smmu_entry_writer *writer);
};
+void arm_smmu_make_abort_ste(struct arm_smmu_ste *target);
+void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
+ struct arm_smmu_master *master,
+ struct arm_smmu_domain *smmu_domain,
+ bool ats_enabled);
+
#if IS_ENABLED(CONFIG_KUNIT)
void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits);
void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *cur,
const __le64 *target);
void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits);
-void arm_smmu_make_abort_ste(struct arm_smmu_ste *target);
void arm_smmu_make_bypass_ste(struct arm_smmu_device *smmu,
struct arm_smmu_ste *target);
void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target,
- struct arm_smmu_master *master);
-void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
- struct arm_smmu_master *master,
- struct arm_smmu_domain *smmu_domain);
+ struct arm_smmu_master *master, bool ats_enabled,
+ unsigned int s1dss);
void arm_smmu_make_sva_cd(struct arm_smmu_cd *target,
struct arm_smmu_master *master, struct mm_struct *mm,
u16 asid);
#endif
+struct arm_smmu_master_domain {
+ struct list_head devices_elm;
+ struct arm_smmu_master *master;
+ /*
+ * For nested domains the master_domain is threaded onto the S2 parent,
+ * this points to the IOMMU_DOMAIN_NESTED to disambiguate the masters.
+ */
+ struct iommu_domain *domain;
+ ioasid_t ssid;
+ bool nested_ats_flush : 1;
+ bool using_iopf : 1;
+};
+
static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
{
return container_of(dom, struct arm_smmu_domain, domain);
}
+static inline struct arm_smmu_nested_domain *
+to_smmu_nested_domain(struct iommu_domain *dom)
+{
+ return container_of(dom, struct arm_smmu_nested_domain, domain);
+}
+
extern struct xarray arm_smmu_asid_xa;
extern struct mutex arm_smmu_asid_lock;
+struct arm_smmu_domain *arm_smmu_domain_alloc(void);
+
void arm_smmu_clear_cd(struct arm_smmu_master *master, ioasid_t ssid);
struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master,
u32 ssid);
-struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master,
- u32 ssid);
void arm_smmu_make_s1_cd(struct arm_smmu_cd *target,
struct arm_smmu_master *master,
struct arm_smmu_domain *smmu_domain);
@@ -786,67 +963,135 @@ void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid,
struct arm_smmu_cd *cdptr,
const struct arm_smmu_cd *target);
+int arm_smmu_set_pasid(struct arm_smmu_master *master,
+ struct arm_smmu_domain *smmu_domain, ioasid_t pasid,
+ struct arm_smmu_cd *cd, struct iommu_domain *old);
+
void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid);
void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid,
size_t granule, bool leaf,
struct arm_smmu_domain *smmu_domain);
-bool arm_smmu_free_asid(struct arm_smmu_ctx_desc *cd);
-int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid,
+int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
unsigned long iova, size_t size);
+void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq *cmdq);
+int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
+ struct arm_smmu_queue *q, void __iomem *page,
+ unsigned long prod_off, unsigned long cons_off,
+ size_t dwords, const char *name);
+int arm_smmu_cmdq_init(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq *cmdq);
+
+static inline bool arm_smmu_master_canwbs(struct arm_smmu_master *master)
+{
+ return dev_iommu_fwspec_get(master->dev)->flags &
+ IOMMU_FWSPEC_PCI_RC_CANWBS;
+}
+
+struct arm_smmu_attach_state {
+ /* Inputs */
+ struct iommu_domain *old_domain;
+ struct arm_smmu_master *master;
+ bool cd_needs_ats;
+ bool disable_ats;
+ ioasid_t ssid;
+ /* Resulting state */
+ struct arm_smmu_vmaster *vmaster;
+ bool ats_enabled;
+};
+
+int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
+ struct iommu_domain *new_domain);
+void arm_smmu_attach_commit(struct arm_smmu_attach_state *state);
+void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master,
+ const struct arm_smmu_ste *target);
+
+int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq *cmdq, u64 *cmds, int n,
+ bool sync);
+
#ifdef CONFIG_ARM_SMMU_V3_SVA
bool arm_smmu_sva_supported(struct arm_smmu_device *smmu);
-bool arm_smmu_master_sva_supported(struct arm_smmu_master *master);
-bool arm_smmu_master_sva_enabled(struct arm_smmu_master *master);
-int arm_smmu_master_enable_sva(struct arm_smmu_master *master);
-int arm_smmu_master_disable_sva(struct arm_smmu_master *master);
-bool arm_smmu_master_iopf_supported(struct arm_smmu_master *master);
void arm_smmu_sva_notifier_synchronize(void);
-struct iommu_domain *arm_smmu_sva_domain_alloc(void);
-void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain,
- struct device *dev, ioasid_t id);
+struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev,
+ struct mm_struct *mm);
#else /* CONFIG_ARM_SMMU_V3_SVA */
static inline bool arm_smmu_sva_supported(struct arm_smmu_device *smmu)
{
return false;
}
-static inline bool arm_smmu_master_sva_supported(struct arm_smmu_master *master)
-{
- return false;
-}
+static inline void arm_smmu_sva_notifier_synchronize(void) {}
-static inline bool arm_smmu_master_sva_enabled(struct arm_smmu_master *master)
-{
- return false;
-}
+#define arm_smmu_sva_domain_alloc NULL
+
+#endif /* CONFIG_ARM_SMMU_V3_SVA */
-static inline int arm_smmu_master_enable_sva(struct arm_smmu_master *master)
+#ifdef CONFIG_TEGRA241_CMDQV
+struct arm_smmu_device *tegra241_cmdqv_probe(struct arm_smmu_device *smmu);
+#else /* CONFIG_TEGRA241_CMDQV */
+static inline struct arm_smmu_device *
+tegra241_cmdqv_probe(struct arm_smmu_device *smmu)
{
- return -ENODEV;
+ return ERR_PTR(-ENODEV);
}
+#endif /* CONFIG_TEGRA241_CMDQV */
+
+struct arm_vsmmu {
+ struct iommufd_viommu core;
+ struct arm_smmu_device *smmu;
+ struct arm_smmu_domain *s2_parent;
+ u16 vmid;
+};
-static inline int arm_smmu_master_disable_sva(struct arm_smmu_master *master)
+#if IS_ENABLED(CONFIG_ARM_SMMU_V3_IOMMUFD)
+void *arm_smmu_hw_info(struct device *dev, u32 *length,
+ enum iommu_hw_info_type *type);
+size_t arm_smmu_get_viommu_size(struct device *dev,
+ enum iommu_viommu_type viommu_type);
+int arm_vsmmu_init(struct iommufd_viommu *viommu,
+ struct iommu_domain *parent_domain,
+ const struct iommu_user_data *user_data);
+int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state,
+ struct arm_smmu_nested_domain *nested_domain);
+void arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state);
+void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master);
+int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, u64 *evt);
+struct iommu_domain *
+arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
+ const struct iommu_user_data *user_data);
+int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu,
+ struct iommu_user_data_array *array);
+#else
+#define arm_smmu_get_viommu_size NULL
+#define arm_smmu_hw_info NULL
+#define arm_vsmmu_init NULL
+#define arm_vsmmu_alloc_domain_nested NULL
+#define arm_vsmmu_cache_invalidate NULL
+
+static inline int
+arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state,
+ struct arm_smmu_nested_domain *nested_domain)
{
- return -ENODEV;
+ return 0;
}
-static inline bool arm_smmu_master_iopf_supported(struct arm_smmu_master *master)
+static inline void
+arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state)
{
- return false;
}
-static inline void arm_smmu_sva_notifier_synchronize(void) {}
-
-static inline struct iommu_domain *arm_smmu_sva_domain_alloc(void)
+static inline void
+arm_smmu_master_clear_vmaster(struct arm_smmu_master *master)
{
- return NULL;
}
-static inline void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain,
- struct device *dev,
- ioasid_t id)
+static inline int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster,
+ u64 *evt)
{
+ return -EOPNOTSUPP;
}
-#endif /* CONFIG_ARM_SMMU_V3_SVA */
+#endif /* CONFIG_ARM_SMMU_V3_IOMMUFD */
+
#endif /* _ARM_SMMU_V3_H */
diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
new file mode 100644
index 000000000000..378104cd395e
--- /dev/null
+++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
@@ -0,0 +1,1349 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2021-2024 NVIDIA CORPORATION & AFFILIATES. */
+
+#define dev_fmt(fmt) "tegra241_cmdqv: " fmt
+
+#include <linux/acpi.h>
+#include <linux/debugfs.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/iommu.h>
+#include <linux/iommufd.h>
+#include <linux/iopoll.h>
+#include <uapi/linux/iommufd.h>
+
+#include <acpi/acpixf.h>
+
+#include "arm-smmu-v3.h"
+
+/* CMDQV register page base and size defines */
+#define TEGRA241_CMDQV_CONFIG_BASE (0)
+#define TEGRA241_CMDQV_CONFIG_SIZE (SZ_64K)
+#define TEGRA241_VCMDQ_PAGE0_BASE (TEGRA241_CMDQV_CONFIG_BASE + SZ_64K)
+#define TEGRA241_VCMDQ_PAGE1_BASE (TEGRA241_VCMDQ_PAGE0_BASE + SZ_64K)
+#define TEGRA241_VINTF_PAGE_BASE (TEGRA241_VCMDQ_PAGE1_BASE + SZ_64K)
+
+/* CMDQV global base regs */
+#define TEGRA241_CMDQV_CONFIG 0x0000
+#define CMDQV_EN BIT(0)
+
+#define TEGRA241_CMDQV_PARAM 0x0004
+#define CMDQV_NUM_SID_PER_VM_LOG2 GENMASK(15, 12)
+#define CMDQV_NUM_VINTF_LOG2 GENMASK(11, 8)
+#define CMDQV_NUM_VCMDQ_LOG2 GENMASK(7, 4)
+#define CMDQV_VER GENMASK(3, 0)
+
+#define TEGRA241_CMDQV_STATUS 0x0008
+#define CMDQV_ENABLED BIT(0)
+
+#define TEGRA241_CMDQV_VINTF_ERR_MAP 0x0014
+#define TEGRA241_CMDQV_VINTF_INT_MASK 0x001C
+#define TEGRA241_CMDQV_CMDQ_ERR_MAP(m) (0x0024 + 0x4*(m))
+
+#define TEGRA241_CMDQV_CMDQ_ALLOC(q) (0x0200 + 0x4*(q))
+#define CMDQV_CMDQ_ALLOC_VINTF GENMASK(20, 15)
+#define CMDQV_CMDQ_ALLOC_LVCMDQ GENMASK(7, 1)
+#define CMDQV_CMDQ_ALLOCATED BIT(0)
+
+/* VINTF base regs */
+#define TEGRA241_VINTF(v) (0x1000 + 0x100*(v))
+
+#define TEGRA241_VINTF_CONFIG 0x0000
+#define VINTF_HYP_OWN BIT(17)
+#define VINTF_VMID GENMASK(16, 1)
+#define VINTF_EN BIT(0)
+
+#define TEGRA241_VINTF_STATUS 0x0004
+#define VINTF_STATUS GENMASK(3, 1)
+#define VINTF_ENABLED BIT(0)
+
+#define TEGRA241_VINTF_SID_MATCH(s) (0x0040 + 0x4*(s))
+#define TEGRA241_VINTF_SID_REPLACE(s) (0x0080 + 0x4*(s))
+
+#define TEGRA241_VINTF_LVCMDQ_ERR_MAP_64(m) \
+ (0x00C0 + 0x8*(m))
+#define LVCMDQ_ERR_MAP_NUM_64 2
+
+/* VCMDQ base regs */
+/* -- PAGE0 -- */
+#define TEGRA241_VCMDQ_PAGE0(q) (TEGRA241_VCMDQ_PAGE0_BASE + 0x80*(q))
+
+#define TEGRA241_VCMDQ_CONS 0x00000
+#define VCMDQ_CONS_ERR GENMASK(30, 24)
+
+#define TEGRA241_VCMDQ_PROD 0x00004
+
+#define TEGRA241_VCMDQ_CONFIG 0x00008
+#define VCMDQ_EN BIT(0)
+
+#define TEGRA241_VCMDQ_STATUS 0x0000C
+#define VCMDQ_ENABLED BIT(0)
+
+#define TEGRA241_VCMDQ_GERROR 0x00010
+#define TEGRA241_VCMDQ_GERRORN 0x00014
+
+/* -- PAGE1 -- */
+#define TEGRA241_VCMDQ_PAGE1(q) (TEGRA241_VCMDQ_PAGE1_BASE + 0x80*(q))
+#define VCMDQ_ADDR GENMASK(47, 5)
+#define VCMDQ_LOG2SIZE GENMASK(4, 0)
+
+#define TEGRA241_VCMDQ_BASE 0x00000
+#define TEGRA241_VCMDQ_CONS_INDX_BASE 0x00008
+
+/* VINTF logical-VCMDQ pages */
+#define TEGRA241_VINTFi_PAGE0(i) (TEGRA241_VINTF_PAGE_BASE + SZ_128K*(i))
+#define TEGRA241_VINTFi_PAGE1(i) (TEGRA241_VINTFi_PAGE0(i) + SZ_64K)
+#define TEGRA241_VINTFi_LVCMDQ_PAGE0(i, q) \
+ (TEGRA241_VINTFi_PAGE0(i) + 0x80*(q))
+#define TEGRA241_VINTFi_LVCMDQ_PAGE1(i, q) \
+ (TEGRA241_VINTFi_PAGE1(i) + 0x80*(q))
+
+/* MMIO helpers */
+#define REG_CMDQV(_cmdqv, _regname) \
+ ((_cmdqv)->base + TEGRA241_CMDQV_##_regname)
+#define REG_VINTF(_vintf, _regname) \
+ ((_vintf)->base + TEGRA241_VINTF_##_regname)
+#define REG_VCMDQ_PAGE0(_vcmdq, _regname) \
+ ((_vcmdq)->page0 + TEGRA241_VCMDQ_##_regname)
+#define REG_VCMDQ_PAGE1(_vcmdq, _regname) \
+ ((_vcmdq)->page1 + TEGRA241_VCMDQ_##_regname)
+
+
+static bool disable_cmdqv;
+module_param(disable_cmdqv, bool, 0444);
+MODULE_PARM_DESC(disable_cmdqv,
+ "This allows to disable CMDQV HW and use default SMMU internal CMDQ.");
+
+static bool bypass_vcmdq;
+module_param(bypass_vcmdq, bool, 0444);
+MODULE_PARM_DESC(bypass_vcmdq,
+ "This allows to bypass VCMDQ for debugging use or perf comparison.");
+
+/**
+ * struct tegra241_vcmdq - Virtual Command Queue
+ * @core: Embedded iommufd_hw_queue structure
+ * @idx: Global index in the CMDQV
+ * @lidx: Local index in the VINTF
+ * @enabled: Enable status
+ * @cmdqv: Parent CMDQV pointer
+ * @vintf: Parent VINTF pointer
+ * @prev: Previous LVCMDQ to depend on
+ * @cmdq: Command Queue struct
+ * @page0: MMIO Page0 base address
+ * @page1: MMIO Page1 base address
+ */
+struct tegra241_vcmdq {
+ struct iommufd_hw_queue core;
+
+ u16 idx;
+ u16 lidx;
+
+ bool enabled;
+
+ struct tegra241_cmdqv *cmdqv;
+ struct tegra241_vintf *vintf;
+ struct tegra241_vcmdq *prev;
+ struct arm_smmu_cmdq cmdq;
+
+ void __iomem *page0;
+ void __iomem *page1;
+};
+#define hw_queue_to_vcmdq(v) container_of(v, struct tegra241_vcmdq, core)
+
+/**
+ * struct tegra241_vintf - Virtual Interface
+ * @vsmmu: Embedded arm_vsmmu structure
+ * @idx: Global index in the CMDQV
+ * @enabled: Enable status
+ * @hyp_own: Owned by hypervisor (in-kernel)
+ * @cmdqv: Parent CMDQV pointer
+ * @lvcmdqs: List of logical VCMDQ pointers
+ * @lvcmdq_mutex: Lock to serialize user-allocated lvcmdqs
+ * @base: MMIO base address
+ * @mmap_offset: Offset argument for mmap() syscall
+ * @sids: Stream ID mapping resources
+ */
+struct tegra241_vintf {
+ struct arm_vsmmu vsmmu;
+
+ u16 idx;
+
+ bool enabled;
+ bool hyp_own;
+
+ struct tegra241_cmdqv *cmdqv;
+ struct tegra241_vcmdq **lvcmdqs;
+ struct mutex lvcmdq_mutex; /* user space race */
+
+ void __iomem *base;
+ unsigned long mmap_offset;
+
+ struct ida sids;
+};
+#define viommu_to_vintf(v) container_of(v, struct tegra241_vintf, vsmmu.core)
+
+/**
+ * struct tegra241_vintf_sid - Virtual Interface Stream ID Mapping
+ * @core: Embedded iommufd_vdevice structure, holding virtual Stream ID
+ * @vintf: Parent VINTF pointer
+ * @sid: Physical Stream ID
+ * @idx: Mapping index in the VINTF
+ */
+struct tegra241_vintf_sid {
+ struct iommufd_vdevice core;
+ struct tegra241_vintf *vintf;
+ u32 sid;
+ u8 idx;
+};
+#define vdev_to_vsid(v) container_of(v, struct tegra241_vintf_sid, core)
+
+/**
+ * struct tegra241_cmdqv - CMDQ-V for SMMUv3
+ * @smmu: SMMUv3 device
+ * @dev: CMDQV device
+ * @base: MMIO base address
+ * @base_phys: MMIO physical base address, for mmap
+ * @irq: IRQ number
+ * @num_vintfs: Total number of VINTFs
+ * @num_vcmdqs: Total number of VCMDQs
+ * @num_lvcmdqs_per_vintf: Number of logical VCMDQs per VINTF
+ * @num_sids_per_vintf: Total number of SID mappings per VINTF
+ * @vintf_ids: VINTF id allocator
+ * @vintfs: List of VINTFs
+ */
+struct tegra241_cmdqv {
+ struct arm_smmu_device smmu;
+ struct device *dev;
+
+ void __iomem *base;
+ phys_addr_t base_phys;
+ int irq;
+
+ /* CMDQV Hardware Params */
+ u16 num_vintfs;
+ u16 num_vcmdqs;
+ u16 num_lvcmdqs_per_vintf;
+ u16 num_sids_per_vintf;
+
+ struct ida vintf_ids;
+
+ struct tegra241_vintf **vintfs;
+};
+
+/* Config and Polling Helpers */
+
+static inline int tegra241_cmdqv_write_config(struct tegra241_cmdqv *cmdqv,
+ void __iomem *addr_config,
+ void __iomem *addr_status,
+ u32 regval, const char *header,
+ bool *out_enabled)
+{
+ bool en = regval & BIT(0);
+ int ret;
+
+ writel(regval, addr_config);
+ ret = readl_poll_timeout(addr_status, regval,
+ en ? regval & BIT(0) : !(regval & BIT(0)),
+ 1, ARM_SMMU_POLL_TIMEOUT_US);
+ if (ret)
+ dev_err(cmdqv->dev, "%sfailed to %sable, STATUS=0x%08X\n",
+ header, en ? "en" : "dis", regval);
+ if (out_enabled)
+ WRITE_ONCE(*out_enabled, regval & BIT(0));
+ return ret;
+}
+
+static inline int cmdqv_write_config(struct tegra241_cmdqv *cmdqv, u32 regval)
+{
+ return tegra241_cmdqv_write_config(cmdqv,
+ REG_CMDQV(cmdqv, CONFIG),
+ REG_CMDQV(cmdqv, STATUS),
+ regval, "CMDQV: ", NULL);
+}
+
+static inline int vintf_write_config(struct tegra241_vintf *vintf, u32 regval)
+{
+ char header[16];
+
+ snprintf(header, 16, "VINTF%u: ", vintf->idx);
+ return tegra241_cmdqv_write_config(vintf->cmdqv,
+ REG_VINTF(vintf, CONFIG),
+ REG_VINTF(vintf, STATUS),
+ regval, header, &vintf->enabled);
+}
+
+static inline char *lvcmdq_error_header(struct tegra241_vcmdq *vcmdq,
+ char *header, int hlen)
+{
+ WARN_ON(hlen < 64);
+ if (WARN_ON(!vcmdq->vintf))
+ return "";
+ snprintf(header, hlen, "VINTF%u: VCMDQ%u/LVCMDQ%u: ",
+ vcmdq->vintf->idx, vcmdq->idx, vcmdq->lidx);
+ return header;
+}
+
+static inline int vcmdq_write_config(struct tegra241_vcmdq *vcmdq, u32 regval)
+{
+ char header[64], *h = lvcmdq_error_header(vcmdq, header, 64);
+
+ return tegra241_cmdqv_write_config(vcmdq->cmdqv,
+ REG_VCMDQ_PAGE0(vcmdq, CONFIG),
+ REG_VCMDQ_PAGE0(vcmdq, STATUS),
+ regval, h, &vcmdq->enabled);
+}
+
+/* ISR Functions */
+
+static void tegra241_vintf_user_handle_error(struct tegra241_vintf *vintf)
+{
+ struct iommufd_viommu *viommu = &vintf->vsmmu.core;
+ struct iommu_vevent_tegra241_cmdqv vevent_data;
+ int i;
+
+ for (i = 0; i < LVCMDQ_ERR_MAP_NUM_64; i++) {
+ u64 err = readq_relaxed(REG_VINTF(vintf, LVCMDQ_ERR_MAP_64(i)));
+
+ vevent_data.lvcmdq_err_map[i] = cpu_to_le64(err);
+ }
+
+ iommufd_viommu_report_event(viommu, IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV,
+ &vevent_data, sizeof(vevent_data));
+}
+
+static void tegra241_vintf0_handle_error(struct tegra241_vintf *vintf)
+{
+ int i;
+
+ for (i = 0; i < LVCMDQ_ERR_MAP_NUM_64; i++) {
+ u64 map = readq_relaxed(REG_VINTF(vintf, LVCMDQ_ERR_MAP_64(i)));
+
+ while (map) {
+ unsigned long lidx = __ffs64(map);
+ struct tegra241_vcmdq *vcmdq = vintf->lvcmdqs[lidx];
+ u32 gerror = readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERROR));
+
+ __arm_smmu_cmdq_skip_err(&vintf->cmdqv->smmu, &vcmdq->cmdq);
+ writel(gerror, REG_VCMDQ_PAGE0(vcmdq, GERRORN));
+ map &= ~BIT_ULL(lidx);
+ }
+ }
+}
+
+static irqreturn_t tegra241_cmdqv_isr(int irq, void *devid)
+{
+ struct tegra241_cmdqv *cmdqv = (struct tegra241_cmdqv *)devid;
+ void __iomem *reg_vintf_map = REG_CMDQV(cmdqv, VINTF_ERR_MAP);
+ char err_str[256];
+ u64 vintf_map;
+
+ /* Use readl_relaxed() as register addresses are not 64-bit aligned */
+ vintf_map = (u64)readl_relaxed(reg_vintf_map + 0x4) << 32 |
+ (u64)readl_relaxed(reg_vintf_map);
+
+ snprintf(err_str, sizeof(err_str),
+ "vintf_map: %016llx, vcmdq_map %08x:%08x:%08x:%08x", vintf_map,
+ readl_relaxed(REG_CMDQV(cmdqv, CMDQ_ERR_MAP(3))),
+ readl_relaxed(REG_CMDQV(cmdqv, CMDQ_ERR_MAP(2))),
+ readl_relaxed(REG_CMDQV(cmdqv, CMDQ_ERR_MAP(1))),
+ readl_relaxed(REG_CMDQV(cmdqv, CMDQ_ERR_MAP(0))));
+
+ dev_warn(cmdqv->dev, "unexpected error reported. %s\n", err_str);
+
+ /* Handle VINTF0 and its LVCMDQs */
+ if (vintf_map & BIT_ULL(0)) {
+ tegra241_vintf0_handle_error(cmdqv->vintfs[0]);
+ vintf_map &= ~BIT_ULL(0);
+ }
+
+ /* Handle other user VINTFs and their LVCMDQs */
+ while (vintf_map) {
+ unsigned long idx = __ffs64(vintf_map);
+
+ tegra241_vintf_user_handle_error(cmdqv->vintfs[idx]);
+ vintf_map &= ~BIT_ULL(idx);
+ }
+
+ return IRQ_HANDLED;
+}
+
+/* Command Queue Function */
+
+static bool tegra241_guest_vcmdq_supports_cmd(struct arm_smmu_cmdq_ent *ent)
+{
+ switch (ent->opcode) {
+ case CMDQ_OP_TLBI_NH_ASID:
+ case CMDQ_OP_TLBI_NH_VA:
+ case CMDQ_OP_ATC_INV:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static struct arm_smmu_cmdq *
+tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq_ent *ent)
+{
+ struct tegra241_cmdqv *cmdqv =
+ container_of(smmu, struct tegra241_cmdqv, smmu);
+ struct tegra241_vintf *vintf = cmdqv->vintfs[0];
+ struct tegra241_vcmdq *vcmdq;
+ u16 lidx;
+
+ if (READ_ONCE(bypass_vcmdq))
+ return NULL;
+
+ /* Use SMMU CMDQ if VINTF0 is uninitialized */
+ if (!READ_ONCE(vintf->enabled))
+ return NULL;
+
+ /*
+ * Select a LVCMDQ to use. Here we use a temporal solution to
+ * balance out traffic on cmdq issuing: each cmdq has its own
+ * lock, if all cpus issue cmdlist using the same cmdq, only
+ * one CPU at a time can enter the process, while the others
+ * will be spinning at the same lock.
+ */
+ lidx = raw_smp_processor_id() % cmdqv->num_lvcmdqs_per_vintf;
+ vcmdq = vintf->lvcmdqs[lidx];
+ if (!vcmdq || !READ_ONCE(vcmdq->enabled))
+ return NULL;
+
+ /* Unsupported CMD goes for smmu->cmdq pathway */
+ if (!arm_smmu_cmdq_supports_cmd(&vcmdq->cmdq, ent))
+ return NULL;
+ return &vcmdq->cmdq;
+}
+
+/* HW Reset Functions */
+
+/*
+ * When a guest-owned VCMDQ is disabled, if the guest did not enqueue a CMD_SYNC
+ * following an ATC_INV command at the end of the guest queue while this ATC_INV
+ * is timed out, the TIMEOUT will not be reported until this VCMDQ gets assigned
+ * to the next VM, which will be a false alarm potentially causing some unwanted
+ * behavior in the new VM. Thus, a guest-owned VCMDQ must flush the TIMEOUT when
+ * it gets disabled. This can be done by just issuing a CMD_SYNC to SMMU CMDQ.
+ */
+static void tegra241_vcmdq_hw_flush_timeout(struct tegra241_vcmdq *vcmdq)
+{
+ struct arm_smmu_device *smmu = &vcmdq->cmdqv->smmu;
+ u64 cmd_sync[CMDQ_ENT_DWORDS] = {};
+
+ cmd_sync[0] = FIELD_PREP(CMDQ_0_OP, CMDQ_OP_CMD_SYNC) |
+ FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_NONE);
+
+ /*
+ * It does not hurt to insert another CMD_SYNC, taking advantage of the
+ * arm_smmu_cmdq_issue_cmdlist() that waits for the CMD_SYNC completion.
+ */
+ arm_smmu_cmdq_issue_cmdlist(smmu, &smmu->cmdq, cmd_sync, 1, true);
+}
+
+/* This function is for LVCMDQ, so @vcmdq must not be unmapped yet */
+static void tegra241_vcmdq_hw_deinit(struct tegra241_vcmdq *vcmdq)
+{
+ char header[64], *h = lvcmdq_error_header(vcmdq, header, 64);
+ u32 gerrorn, gerror;
+
+ if (vcmdq_write_config(vcmdq, 0)) {
+ dev_err(vcmdq->cmdqv->dev,
+ "%sGERRORN=0x%X, GERROR=0x%X, CONS=0x%X\n", h,
+ readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERRORN)),
+ readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERROR)),
+ readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, CONS)));
+ }
+ tegra241_vcmdq_hw_flush_timeout(vcmdq);
+
+ writel_relaxed(0, REG_VCMDQ_PAGE0(vcmdq, PROD));
+ writel_relaxed(0, REG_VCMDQ_PAGE0(vcmdq, CONS));
+ writeq_relaxed(0, REG_VCMDQ_PAGE1(vcmdq, BASE));
+ writeq_relaxed(0, REG_VCMDQ_PAGE1(vcmdq, CONS_INDX_BASE));
+
+ gerrorn = readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERRORN));
+ gerror = readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERROR));
+ if (gerror != gerrorn) {
+ dev_warn(vcmdq->cmdqv->dev,
+ "%suncleared error detected, resetting\n", h);
+ writel(gerror, REG_VCMDQ_PAGE0(vcmdq, GERRORN));
+ }
+
+ dev_dbg(vcmdq->cmdqv->dev, "%sdeinited\n", h);
+}
+
+/* This function is for LVCMDQ, so @vcmdq must be mapped prior */
+static int tegra241_vcmdq_hw_init(struct tegra241_vcmdq *vcmdq)
+{
+ char header[64], *h = lvcmdq_error_header(vcmdq, header, 64);
+ int ret;
+
+ /* Reset VCMDQ */
+ tegra241_vcmdq_hw_deinit(vcmdq);
+
+ /* Configure and enable VCMDQ */
+ writeq_relaxed(vcmdq->cmdq.q.q_base, REG_VCMDQ_PAGE1(vcmdq, BASE));
+
+ ret = vcmdq_write_config(vcmdq, VCMDQ_EN);
+ if (ret) {
+ dev_err(vcmdq->cmdqv->dev,
+ "%sGERRORN=0x%X, GERROR=0x%X, CONS=0x%X\n", h,
+ readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERRORN)),
+ readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERROR)),
+ readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, CONS)));
+ return ret;
+ }
+
+ dev_dbg(vcmdq->cmdqv->dev, "%sinited\n", h);
+ return 0;
+}
+
+/* Unmap a global VCMDQ from the pre-assigned LVCMDQ */
+static void tegra241_vcmdq_unmap_lvcmdq(struct tegra241_vcmdq *vcmdq)
+{
+ u32 regval = readl(REG_CMDQV(vcmdq->cmdqv, CMDQ_ALLOC(vcmdq->idx)));
+ char header[64], *h = lvcmdq_error_header(vcmdq, header, 64);
+
+ writel(regval & ~CMDQV_CMDQ_ALLOCATED,
+ REG_CMDQV(vcmdq->cmdqv, CMDQ_ALLOC(vcmdq->idx)));
+ dev_dbg(vcmdq->cmdqv->dev, "%sunmapped\n", h);
+}
+
+static void tegra241_vintf_hw_deinit(struct tegra241_vintf *vintf)
+{
+ u16 lidx = vintf->cmdqv->num_lvcmdqs_per_vintf;
+ int sidx;
+
+ /* HW requires to unmap LVCMDQs in descending order */
+ while (lidx--) {
+ if (vintf->lvcmdqs && vintf->lvcmdqs[lidx]) {
+ tegra241_vcmdq_hw_deinit(vintf->lvcmdqs[lidx]);
+ tegra241_vcmdq_unmap_lvcmdq(vintf->lvcmdqs[lidx]);
+ }
+ }
+ vintf_write_config(vintf, 0);
+ for (sidx = 0; sidx < vintf->cmdqv->num_sids_per_vintf; sidx++) {
+ writel(0, REG_VINTF(vintf, SID_MATCH(sidx)));
+ writel(0, REG_VINTF(vintf, SID_REPLACE(sidx)));
+ }
+}
+
+/* Map a global VCMDQ to the pre-assigned LVCMDQ */
+static void tegra241_vcmdq_map_lvcmdq(struct tegra241_vcmdq *vcmdq)
+{
+ u32 regval = readl(REG_CMDQV(vcmdq->cmdqv, CMDQ_ALLOC(vcmdq->idx)));
+ char header[64], *h = lvcmdq_error_header(vcmdq, header, 64);
+
+ writel(regval | CMDQV_CMDQ_ALLOCATED,
+ REG_CMDQV(vcmdq->cmdqv, CMDQ_ALLOC(vcmdq->idx)));
+ dev_dbg(vcmdq->cmdqv->dev, "%smapped\n", h);
+}
+
+static int tegra241_vintf_hw_init(struct tegra241_vintf *vintf, bool hyp_own)
+{
+ u32 regval;
+ u16 lidx;
+ int ret;
+
+ /* Reset VINTF */
+ tegra241_vintf_hw_deinit(vintf);
+
+ /* Configure and enable VINTF */
+ /*
+ * Note that HYP_OWN bit is wired to zero when running in guest kernel,
+ * whether enabling it here or not, as !HYP_OWN cmdq HWs only support a
+ * restricted set of supported commands.
+ */
+ regval = FIELD_PREP(VINTF_HYP_OWN, hyp_own) |
+ FIELD_PREP(VINTF_VMID, vintf->vsmmu.vmid);
+ writel(regval, REG_VINTF(vintf, CONFIG));
+
+ ret = vintf_write_config(vintf, regval | VINTF_EN);
+ if (ret)
+ return ret;
+ /*
+ * As being mentioned above, HYP_OWN bit is wired to zero for a guest
+ * kernel, so read it back from HW to ensure that reflects in hyp_own
+ */
+ vintf->hyp_own = !!(VINTF_HYP_OWN & readl(REG_VINTF(vintf, CONFIG)));
+
+ /* HW requires to map LVCMDQs in ascending order */
+ for (lidx = 0; lidx < vintf->cmdqv->num_lvcmdqs_per_vintf; lidx++) {
+ if (vintf->lvcmdqs && vintf->lvcmdqs[lidx]) {
+ tegra241_vcmdq_map_lvcmdq(vintf->lvcmdqs[lidx]);
+ ret = tegra241_vcmdq_hw_init(vintf->lvcmdqs[lidx]);
+ if (ret) {
+ tegra241_vintf_hw_deinit(vintf);
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int tegra241_cmdqv_hw_reset(struct arm_smmu_device *smmu)
+{
+ struct tegra241_cmdqv *cmdqv =
+ container_of(smmu, struct tegra241_cmdqv, smmu);
+ u16 qidx, lidx, idx;
+ u32 regval;
+ int ret;
+
+ /* Reset CMDQV */
+ regval = readl_relaxed(REG_CMDQV(cmdqv, CONFIG));
+ ret = cmdqv_write_config(cmdqv, regval & ~CMDQV_EN);
+ if (ret)
+ return ret;
+ ret = cmdqv_write_config(cmdqv, regval | CMDQV_EN);
+ if (ret)
+ return ret;
+
+ /* Assign preallocated global VCMDQs to each VINTF as LVCMDQs */
+ for (idx = 0, qidx = 0; idx < cmdqv->num_vintfs; idx++) {
+ for (lidx = 0; lidx < cmdqv->num_lvcmdqs_per_vintf; lidx++) {
+ regval = FIELD_PREP(CMDQV_CMDQ_ALLOC_VINTF, idx);
+ regval |= FIELD_PREP(CMDQV_CMDQ_ALLOC_LVCMDQ, lidx);
+ writel_relaxed(regval,
+ REG_CMDQV(cmdqv, CMDQ_ALLOC(qidx++)));
+ }
+ }
+
+ return tegra241_vintf_hw_init(cmdqv->vintfs[0], true);
+}
+
+/* VCMDQ Resource Helpers */
+
+static int tegra241_vcmdq_alloc_smmu_cmdq(struct tegra241_vcmdq *vcmdq)
+{
+ struct arm_smmu_device *smmu = &vcmdq->cmdqv->smmu;
+ struct arm_smmu_cmdq *cmdq = &vcmdq->cmdq;
+ struct arm_smmu_queue *q = &cmdq->q;
+ char name[16];
+ u32 regval;
+ int ret;
+
+ snprintf(name, 16, "vcmdq%u", vcmdq->idx);
+
+ /* Cap queue size to SMMU's IDR1.CMDQS and ensure natural alignment */
+ regval = readl_relaxed(smmu->base + ARM_SMMU_IDR1);
+ q->llq.max_n_shift =
+ min_t(u32, CMDQ_MAX_SZ_SHIFT, FIELD_GET(IDR1_CMDQS, regval));
+
+ /* Use the common helper to init the VCMDQ, and then... */
+ ret = arm_smmu_init_one_queue(smmu, q, vcmdq->page0,
+ TEGRA241_VCMDQ_PROD, TEGRA241_VCMDQ_CONS,
+ CMDQ_ENT_DWORDS, name);
+ if (ret)
+ return ret;
+
+ /* ...override q_base to write VCMDQ_BASE registers */
+ q->q_base = q->base_dma & VCMDQ_ADDR;
+ q->q_base |= FIELD_PREP(VCMDQ_LOG2SIZE, q->llq.max_n_shift);
+
+ if (!vcmdq->vintf->hyp_own)
+ cmdq->supports_cmd = tegra241_guest_vcmdq_supports_cmd;
+
+ return arm_smmu_cmdq_init(smmu, cmdq);
+}
+
+/* VINTF Logical VCMDQ Resource Helpers */
+
+static void tegra241_vintf_deinit_lvcmdq(struct tegra241_vintf *vintf, u16 lidx)
+{
+ vintf->lvcmdqs[lidx] = NULL;
+}
+
+static int tegra241_vintf_init_lvcmdq(struct tegra241_vintf *vintf, u16 lidx,
+ struct tegra241_vcmdq *vcmdq)
+{
+ struct tegra241_cmdqv *cmdqv = vintf->cmdqv;
+ u16 idx = vintf->idx;
+
+ vcmdq->idx = idx * cmdqv->num_lvcmdqs_per_vintf + lidx;
+ vcmdq->lidx = lidx;
+ vcmdq->cmdqv = cmdqv;
+ vcmdq->vintf = vintf;
+ vcmdq->page0 = cmdqv->base + TEGRA241_VINTFi_LVCMDQ_PAGE0(idx, lidx);
+ vcmdq->page1 = cmdqv->base + TEGRA241_VINTFi_LVCMDQ_PAGE1(idx, lidx);
+
+ vintf->lvcmdqs[lidx] = vcmdq;
+ return 0;
+}
+
+static void tegra241_vintf_free_lvcmdq(struct tegra241_vintf *vintf, u16 lidx)
+{
+ struct tegra241_vcmdq *vcmdq = vintf->lvcmdqs[lidx];
+ char header[64];
+
+ /* Note that the lvcmdq queue memory space is managed by devres */
+
+ tegra241_vintf_deinit_lvcmdq(vintf, lidx);
+
+ dev_dbg(vintf->cmdqv->dev,
+ "%sdeallocated\n", lvcmdq_error_header(vcmdq, header, 64));
+ /* Guest-owned VCMDQ is free-ed with hw_queue by iommufd core */
+ if (vcmdq->vintf->hyp_own)
+ kfree(vcmdq);
+}
+
+static struct tegra241_vcmdq *
+tegra241_vintf_alloc_lvcmdq(struct tegra241_vintf *vintf, u16 lidx)
+{
+ struct tegra241_cmdqv *cmdqv = vintf->cmdqv;
+ struct tegra241_vcmdq *vcmdq;
+ char header[64];
+ int ret;
+
+ vcmdq = kzalloc(sizeof(*vcmdq), GFP_KERNEL);
+ if (!vcmdq)
+ return ERR_PTR(-ENOMEM);
+
+ ret = tegra241_vintf_init_lvcmdq(vintf, lidx, vcmdq);
+ if (ret)
+ goto free_vcmdq;
+
+ /* Build an arm_smmu_cmdq for each LVCMDQ */
+ ret = tegra241_vcmdq_alloc_smmu_cmdq(vcmdq);
+ if (ret)
+ goto deinit_lvcmdq;
+
+ dev_dbg(cmdqv->dev,
+ "%sallocated\n", lvcmdq_error_header(vcmdq, header, 64));
+ return vcmdq;
+
+deinit_lvcmdq:
+ tegra241_vintf_deinit_lvcmdq(vintf, lidx);
+free_vcmdq:
+ kfree(vcmdq);
+ return ERR_PTR(ret);
+}
+
+/* VINTF Resource Helpers */
+
+static void tegra241_cmdqv_deinit_vintf(struct tegra241_cmdqv *cmdqv, u16 idx)
+{
+ kfree(cmdqv->vintfs[idx]->lvcmdqs);
+ ida_free(&cmdqv->vintf_ids, idx);
+ cmdqv->vintfs[idx] = NULL;
+}
+
+static int tegra241_cmdqv_init_vintf(struct tegra241_cmdqv *cmdqv, u16 max_idx,
+ struct tegra241_vintf *vintf)
+{
+
+ u16 idx;
+ int ret;
+
+ ret = ida_alloc_max(&cmdqv->vintf_ids, max_idx, GFP_KERNEL);
+ if (ret < 0)
+ return ret;
+ idx = ret;
+
+ vintf->idx = idx;
+ vintf->cmdqv = cmdqv;
+ vintf->base = cmdqv->base + TEGRA241_VINTF(idx);
+
+ vintf->lvcmdqs = kcalloc(cmdqv->num_lvcmdqs_per_vintf,
+ sizeof(*vintf->lvcmdqs), GFP_KERNEL);
+ if (!vintf->lvcmdqs) {
+ ida_free(&cmdqv->vintf_ids, idx);
+ return -ENOMEM;
+ }
+
+ cmdqv->vintfs[idx] = vintf;
+ return ret;
+}
+
+/* Remove Helpers */
+
+static void tegra241_cmdqv_remove_vintf(struct tegra241_cmdqv *cmdqv, u16 idx)
+{
+ struct tegra241_vintf *vintf = cmdqv->vintfs[idx];
+ u16 lidx;
+
+ tegra241_vintf_hw_deinit(vintf);
+
+ /* Remove LVCMDQ resources */
+ for (lidx = 0; lidx < vintf->cmdqv->num_lvcmdqs_per_vintf; lidx++)
+ if (vintf->lvcmdqs[lidx])
+ tegra241_vintf_free_lvcmdq(vintf, lidx);
+
+ dev_dbg(cmdqv->dev, "VINTF%u: deallocated\n", vintf->idx);
+ tegra241_cmdqv_deinit_vintf(cmdqv, idx);
+ if (!vintf->hyp_own) {
+ mutex_destroy(&vintf->lvcmdq_mutex);
+ ida_destroy(&vintf->sids);
+ /* Guest-owned VINTF is free-ed with viommu by iommufd core */
+ } else {
+ kfree(vintf);
+ }
+}
+
+static void tegra241_cmdqv_remove(struct arm_smmu_device *smmu)
+{
+ struct tegra241_cmdqv *cmdqv =
+ container_of(smmu, struct tegra241_cmdqv, smmu);
+ u16 idx;
+
+ /* Remove VINTF resources */
+ for (idx = 0; idx < cmdqv->num_vintfs; idx++) {
+ if (cmdqv->vintfs[idx]) {
+ /* Only vintf0 should remain at this stage */
+ WARN_ON(idx > 0);
+ tegra241_cmdqv_remove_vintf(cmdqv, idx);
+ }
+ }
+
+ /* Remove cmdqv resources */
+ ida_destroy(&cmdqv->vintf_ids);
+
+ if (cmdqv->irq > 0)
+ free_irq(cmdqv->irq, cmdqv);
+ iounmap(cmdqv->base);
+ kfree(cmdqv->vintfs);
+ put_device(cmdqv->dev); /* smmu->impl_dev */
+}
+
+static int
+tegra241_cmdqv_init_vintf_user(struct arm_vsmmu *vsmmu,
+ const struct iommu_user_data *user_data);
+
+static void *tegra241_cmdqv_hw_info(struct arm_smmu_device *smmu, u32 *length,
+ enum iommu_hw_info_type *type)
+{
+ struct tegra241_cmdqv *cmdqv =
+ container_of(smmu, struct tegra241_cmdqv, smmu);
+ struct iommu_hw_info_tegra241_cmdqv *info;
+ u32 regval;
+
+ if (*type != IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return ERR_PTR(-ENOMEM);
+
+ regval = readl_relaxed(REG_CMDQV(cmdqv, PARAM));
+ info->log2vcmdqs = ilog2(cmdqv->num_lvcmdqs_per_vintf);
+ info->log2vsids = ilog2(cmdqv->num_sids_per_vintf);
+ info->version = FIELD_GET(CMDQV_VER, regval);
+
+ *length = sizeof(*info);
+ *type = IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV;
+ return info;
+}
+
+static size_t tegra241_cmdqv_get_vintf_size(enum iommu_viommu_type viommu_type)
+{
+ if (viommu_type != IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV)
+ return 0;
+ return VIOMMU_STRUCT_SIZE(struct tegra241_vintf, vsmmu.core);
+}
+
+static struct arm_smmu_impl_ops tegra241_cmdqv_impl_ops = {
+ /* For in-kernel use */
+ .get_secondary_cmdq = tegra241_cmdqv_get_cmdq,
+ .device_reset = tegra241_cmdqv_hw_reset,
+ .device_remove = tegra241_cmdqv_remove,
+ /* For user-space use */
+ .hw_info = tegra241_cmdqv_hw_info,
+ .get_viommu_size = tegra241_cmdqv_get_vintf_size,
+ .vsmmu_init = tegra241_cmdqv_init_vintf_user,
+};
+
+/* Probe Functions */
+
+static int tegra241_cmdqv_acpi_is_memory(struct acpi_resource *res, void *data)
+{
+ struct resource_win win;
+
+ return !acpi_dev_resource_address_space(res, &win);
+}
+
+static int tegra241_cmdqv_acpi_get_irqs(struct acpi_resource *ares, void *data)
+{
+ struct resource r;
+ int *irq = data;
+
+ if (*irq <= 0 && acpi_dev_resource_interrupt(ares, 0, &r))
+ *irq = r.start;
+ return 1; /* No need to add resource to the list */
+}
+
+static struct resource *
+tegra241_cmdqv_find_acpi_resource(struct device *dev, int *irq)
+{
+ struct acpi_device *adev = to_acpi_device(dev);
+ struct list_head resource_list;
+ struct resource_entry *rentry;
+ struct resource *res = NULL;
+ int ret;
+
+ INIT_LIST_HEAD(&resource_list);
+ ret = acpi_dev_get_resources(adev, &resource_list,
+ tegra241_cmdqv_acpi_is_memory, NULL);
+ if (ret < 0) {
+ dev_err(dev, "failed to get memory resource: %d\n", ret);
+ return NULL;
+ }
+
+ rentry = list_first_entry_or_null(&resource_list,
+ struct resource_entry, node);
+ if (!rentry) {
+ dev_err(dev, "failed to get memory resource entry\n");
+ goto free_list;
+ }
+
+ /* Caller must free the res */
+ res = kzalloc(sizeof(*res), GFP_KERNEL);
+ if (!res)
+ goto free_list;
+
+ *res = *rentry->res;
+
+ acpi_dev_free_resource_list(&resource_list);
+
+ INIT_LIST_HEAD(&resource_list);
+
+ if (irq)
+ ret = acpi_dev_get_resources(adev, &resource_list,
+ tegra241_cmdqv_acpi_get_irqs, irq);
+ if (ret < 0 || !irq || *irq <= 0)
+ dev_warn(dev, "no interrupt. errors will not be reported\n");
+
+free_list:
+ acpi_dev_free_resource_list(&resource_list);
+ return res;
+}
+
+static int tegra241_cmdqv_init_structures(struct arm_smmu_device *smmu)
+{
+ struct tegra241_cmdqv *cmdqv =
+ container_of(smmu, struct tegra241_cmdqv, smmu);
+ struct tegra241_vintf *vintf;
+ int lidx;
+ int ret;
+
+ vintf = kzalloc(sizeof(*vintf), GFP_KERNEL);
+ if (!vintf)
+ return -ENOMEM;
+
+ /* Init VINTF0 for in-kernel use */
+ ret = tegra241_cmdqv_init_vintf(cmdqv, 0, vintf);
+ if (ret) {
+ dev_err(cmdqv->dev, "failed to init vintf0: %d\n", ret);
+ return ret;
+ }
+
+ /* Preallocate logical VCMDQs to VINTF0 */
+ for (lidx = 0; lidx < cmdqv->num_lvcmdqs_per_vintf; lidx++) {
+ struct tegra241_vcmdq *vcmdq;
+
+ vcmdq = tegra241_vintf_alloc_lvcmdq(vintf, lidx);
+ if (IS_ERR(vcmdq))
+ return PTR_ERR(vcmdq);
+ }
+
+ /* Now, we are ready to run all the impl ops */
+ smmu->impl_ops = &tegra241_cmdqv_impl_ops;
+ return 0;
+}
+
+#ifdef CONFIG_IOMMU_DEBUGFS
+static struct dentry *cmdqv_debugfs_dir;
+#endif
+
+static struct arm_smmu_device *
+__tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res,
+ int irq)
+{
+ static const struct arm_smmu_impl_ops init_ops = {
+ .init_structures = tegra241_cmdqv_init_structures,
+ .device_remove = tegra241_cmdqv_remove,
+ };
+ struct tegra241_cmdqv *cmdqv = NULL;
+ struct arm_smmu_device *new_smmu;
+ void __iomem *base;
+ u32 regval;
+ int ret;
+
+ static_assert(offsetof(struct tegra241_cmdqv, smmu) == 0);
+
+ base = ioremap(res->start, resource_size(res));
+ if (!base) {
+ dev_err(smmu->dev, "failed to ioremap\n");
+ return NULL;
+ }
+
+ regval = readl(base + TEGRA241_CMDQV_CONFIG);
+ if (disable_cmdqv) {
+ dev_info(smmu->dev, "Detected disable_cmdqv=true\n");
+ writel(regval & ~CMDQV_EN, base + TEGRA241_CMDQV_CONFIG);
+ goto iounmap;
+ }
+
+ cmdqv = devm_krealloc(smmu->dev, smmu, sizeof(*cmdqv), GFP_KERNEL);
+ if (!cmdqv)
+ goto iounmap;
+ new_smmu = &cmdqv->smmu;
+
+ cmdqv->irq = irq;
+ cmdqv->base = base;
+ cmdqv->dev = smmu->impl_dev;
+ cmdqv->base_phys = res->start;
+
+ if (cmdqv->irq > 0) {
+ ret = request_threaded_irq(irq, NULL, tegra241_cmdqv_isr,
+ IRQF_ONESHOT, "tegra241-cmdqv",
+ cmdqv);
+ if (ret) {
+ dev_err(cmdqv->dev, "failed to request irq (%d): %d\n",
+ cmdqv->irq, ret);
+ goto iounmap;
+ }
+ }
+
+ regval = readl_relaxed(REG_CMDQV(cmdqv, PARAM));
+ cmdqv->num_vintfs = 1 << FIELD_GET(CMDQV_NUM_VINTF_LOG2, regval);
+ cmdqv->num_vcmdqs = 1 << FIELD_GET(CMDQV_NUM_VCMDQ_LOG2, regval);
+ cmdqv->num_lvcmdqs_per_vintf = cmdqv->num_vcmdqs / cmdqv->num_vintfs;
+ cmdqv->num_sids_per_vintf =
+ 1 << FIELD_GET(CMDQV_NUM_SID_PER_VM_LOG2, regval);
+
+ cmdqv->vintfs =
+ kcalloc(cmdqv->num_vintfs, sizeof(*cmdqv->vintfs), GFP_KERNEL);
+ if (!cmdqv->vintfs)
+ goto free_irq;
+
+ ida_init(&cmdqv->vintf_ids);
+
+#ifdef CONFIG_IOMMU_DEBUGFS
+ if (!cmdqv_debugfs_dir) {
+ cmdqv_debugfs_dir =
+ debugfs_create_dir("tegra241_cmdqv", iommu_debugfs_dir);
+ debugfs_create_bool("bypass_vcmdq", 0644, cmdqv_debugfs_dir,
+ &bypass_vcmdq);
+ }
+#endif
+
+ /* Provide init-level ops only, until tegra241_cmdqv_init_structures */
+ new_smmu->impl_ops = &init_ops;
+
+ return new_smmu;
+
+free_irq:
+ if (cmdqv->irq > 0)
+ free_irq(cmdqv->irq, cmdqv);
+iounmap:
+ iounmap(base);
+ return NULL;
+}
+
+struct arm_smmu_device *tegra241_cmdqv_probe(struct arm_smmu_device *smmu)
+{
+ struct arm_smmu_device *new_smmu;
+ struct resource *res = NULL;
+ int irq;
+
+ if (!smmu->dev->of_node)
+ res = tegra241_cmdqv_find_acpi_resource(smmu->impl_dev, &irq);
+ if (!res)
+ goto out_fallback;
+
+ new_smmu = __tegra241_cmdqv_probe(smmu, res, irq);
+ kfree(res);
+
+ if (new_smmu)
+ return new_smmu;
+
+out_fallback:
+ dev_info(smmu->impl_dev, "Falling back to standard SMMU CMDQ\n");
+ smmu->options &= ~ARM_SMMU_OPT_TEGRA241_CMDQV;
+ put_device(smmu->impl_dev);
+ return ERR_PTR(-ENODEV);
+}
+
+/* User space VINTF and VCMDQ Functions */
+
+static size_t tegra241_vintf_get_vcmdq_size(struct iommufd_viommu *viommu,
+ enum iommu_hw_queue_type queue_type)
+{
+ if (queue_type != IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV)
+ return 0;
+ return HW_QUEUE_STRUCT_SIZE(struct tegra241_vcmdq, core);
+}
+
+static int tegra241_vcmdq_hw_init_user(struct tegra241_vcmdq *vcmdq)
+{
+ char header[64];
+
+ /* Configure the vcmdq only; User space does the enabling */
+ writeq_relaxed(vcmdq->cmdq.q.q_base, REG_VCMDQ_PAGE1(vcmdq, BASE));
+
+ dev_dbg(vcmdq->cmdqv->dev, "%sinited at host PA 0x%llx size 0x%lx\n",
+ lvcmdq_error_header(vcmdq, header, 64),
+ vcmdq->cmdq.q.q_base & VCMDQ_ADDR,
+ 1UL << (vcmdq->cmdq.q.q_base & VCMDQ_LOG2SIZE));
+ return 0;
+}
+
+static void
+tegra241_vintf_destroy_lvcmdq_user(struct iommufd_hw_queue *hw_queue)
+{
+ struct tegra241_vcmdq *vcmdq = hw_queue_to_vcmdq(hw_queue);
+
+ mutex_lock(&vcmdq->vintf->lvcmdq_mutex);
+ tegra241_vcmdq_hw_deinit(vcmdq);
+ tegra241_vcmdq_unmap_lvcmdq(vcmdq);
+ tegra241_vintf_free_lvcmdq(vcmdq->vintf, vcmdq->lidx);
+ if (vcmdq->prev)
+ iommufd_hw_queue_undepend(vcmdq, vcmdq->prev, core);
+ mutex_unlock(&vcmdq->vintf->lvcmdq_mutex);
+}
+
+static int tegra241_vintf_alloc_lvcmdq_user(struct iommufd_hw_queue *hw_queue,
+ u32 lidx, phys_addr_t base_addr_pa)
+{
+ struct tegra241_vintf *vintf = viommu_to_vintf(hw_queue->viommu);
+ struct tegra241_vcmdq *vcmdq = hw_queue_to_vcmdq(hw_queue);
+ struct tegra241_cmdqv *cmdqv = vintf->cmdqv;
+ struct arm_smmu_device *smmu = &cmdqv->smmu;
+ struct tegra241_vcmdq *prev = NULL;
+ u32 log2size, max_n_shift;
+ char header[64];
+ int ret;
+
+ if (hw_queue->type != IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV)
+ return -EOPNOTSUPP;
+ if (lidx >= cmdqv->num_lvcmdqs_per_vintf)
+ return -EINVAL;
+
+ mutex_lock(&vintf->lvcmdq_mutex);
+
+ if (vintf->lvcmdqs[lidx]) {
+ ret = -EEXIST;
+ goto unlock;
+ }
+
+ /*
+ * HW requires to map LVCMDQs in ascending order, so reject if the
+ * previous lvcmdqs is not allocated yet.
+ */
+ if (lidx) {
+ prev = vintf->lvcmdqs[lidx - 1];
+ if (!prev) {
+ ret = -EIO;
+ goto unlock;
+ }
+ }
+
+ /*
+ * hw_queue->length must be a power of 2, in range of
+ * [ 32, 2 ^ (idr[1].CMDQS + CMDQ_ENT_SZ_SHIFT) ]
+ */
+ max_n_shift = FIELD_GET(IDR1_CMDQS,
+ readl_relaxed(smmu->base + ARM_SMMU_IDR1));
+ if (!is_power_of_2(hw_queue->length) || hw_queue->length < 32 ||
+ hw_queue->length > (1 << (max_n_shift + CMDQ_ENT_SZ_SHIFT))) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+ log2size = ilog2(hw_queue->length) - CMDQ_ENT_SZ_SHIFT;
+
+ /* base_addr_pa must be aligned to hw_queue->length */
+ if (base_addr_pa & ~VCMDQ_ADDR ||
+ base_addr_pa & (hw_queue->length - 1)) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ /*
+ * HW requires to unmap LVCMDQs in descending order, so destroy() must
+ * follow this rule. Set a dependency on its previous LVCMDQ so iommufd
+ * core will help enforce it.
+ */
+ if (prev) {
+ ret = iommufd_hw_queue_depend(vcmdq, prev, core);
+ if (ret)
+ goto unlock;
+ }
+ vcmdq->prev = prev;
+
+ ret = tegra241_vintf_init_lvcmdq(vintf, lidx, vcmdq);
+ if (ret)
+ goto undepend_vcmdq;
+
+ dev_dbg(cmdqv->dev, "%sallocated\n",
+ lvcmdq_error_header(vcmdq, header, 64));
+
+ tegra241_vcmdq_map_lvcmdq(vcmdq);
+
+ vcmdq->cmdq.q.q_base = base_addr_pa & VCMDQ_ADDR;
+ vcmdq->cmdq.q.q_base |= log2size;
+
+ ret = tegra241_vcmdq_hw_init_user(vcmdq);
+ if (ret)
+ goto unmap_lvcmdq;
+
+ hw_queue->destroy = &tegra241_vintf_destroy_lvcmdq_user;
+ mutex_unlock(&vintf->lvcmdq_mutex);
+ return 0;
+
+unmap_lvcmdq:
+ tegra241_vcmdq_unmap_lvcmdq(vcmdq);
+ tegra241_vintf_deinit_lvcmdq(vintf, lidx);
+undepend_vcmdq:
+ if (vcmdq->prev)
+ iommufd_hw_queue_undepend(vcmdq, vcmdq->prev, core);
+unlock:
+ mutex_unlock(&vintf->lvcmdq_mutex);
+ return ret;
+}
+
+static void tegra241_cmdqv_destroy_vintf_user(struct iommufd_viommu *viommu)
+{
+ struct tegra241_vintf *vintf = viommu_to_vintf(viommu);
+
+ if (vintf->mmap_offset)
+ iommufd_viommu_destroy_mmap(&vintf->vsmmu.core,
+ vintf->mmap_offset);
+ tegra241_cmdqv_remove_vintf(vintf->cmdqv, vintf->idx);
+}
+
+static void tegra241_vintf_destroy_vsid(struct iommufd_vdevice *vdev)
+{
+ struct tegra241_vintf_sid *vsid = vdev_to_vsid(vdev);
+ struct tegra241_vintf *vintf = vsid->vintf;
+
+ writel(0, REG_VINTF(vintf, SID_MATCH(vsid->idx)));
+ writel(0, REG_VINTF(vintf, SID_REPLACE(vsid->idx)));
+ ida_free(&vintf->sids, vsid->idx);
+ dev_dbg(vintf->cmdqv->dev,
+ "VINTF%u: deallocated SID_REPLACE%d for pSID=%x\n", vintf->idx,
+ vsid->idx, vsid->sid);
+}
+
+static int tegra241_vintf_init_vsid(struct iommufd_vdevice *vdev)
+{
+ struct device *dev = iommufd_vdevice_to_device(vdev);
+ struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+ struct tegra241_vintf *vintf = viommu_to_vintf(vdev->viommu);
+ struct tegra241_vintf_sid *vsid = vdev_to_vsid(vdev);
+ struct arm_smmu_stream *stream = &master->streams[0];
+ u64 virt_sid = vdev->virt_id;
+ int sidx;
+
+ if (virt_sid > UINT_MAX)
+ return -EINVAL;
+
+ WARN_ON_ONCE(master->num_streams != 1);
+
+ /* Find an empty pair of SID_REPLACE and SID_MATCH */
+ sidx = ida_alloc_max(&vintf->sids, vintf->cmdqv->num_sids_per_vintf - 1,
+ GFP_KERNEL);
+ if (sidx < 0)
+ return sidx;
+
+ writel(stream->id, REG_VINTF(vintf, SID_REPLACE(sidx)));
+ writel(virt_sid << 1 | 0x1, REG_VINTF(vintf, SID_MATCH(sidx)));
+ dev_dbg(vintf->cmdqv->dev,
+ "VINTF%u: allocated SID_REPLACE%d for pSID=%x, vSID=%x\n",
+ vintf->idx, sidx, stream->id, (u32)virt_sid);
+
+ vsid->idx = sidx;
+ vsid->vintf = vintf;
+ vsid->sid = stream->id;
+
+ vdev->destroy = &tegra241_vintf_destroy_vsid;
+ return 0;
+}
+
+static struct iommufd_viommu_ops tegra241_cmdqv_viommu_ops = {
+ .destroy = tegra241_cmdqv_destroy_vintf_user,
+ .alloc_domain_nested = arm_vsmmu_alloc_domain_nested,
+ /* Non-accelerated commands will be still handled by the kernel */
+ .cache_invalidate = arm_vsmmu_cache_invalidate,
+ .vdevice_size = VDEVICE_STRUCT_SIZE(struct tegra241_vintf_sid, core),
+ .vdevice_init = tegra241_vintf_init_vsid,
+ .get_hw_queue_size = tegra241_vintf_get_vcmdq_size,
+ .hw_queue_init_phys = tegra241_vintf_alloc_lvcmdq_user,
+};
+
+static int
+tegra241_cmdqv_init_vintf_user(struct arm_vsmmu *vsmmu,
+ const struct iommu_user_data *user_data)
+{
+ struct tegra241_cmdqv *cmdqv =
+ container_of(vsmmu->smmu, struct tegra241_cmdqv, smmu);
+ struct tegra241_vintf *vintf = viommu_to_vintf(&vsmmu->core);
+ struct iommu_viommu_tegra241_cmdqv data;
+ phys_addr_t page0_base;
+ int ret;
+
+ /*
+ * Unsupported type should be rejected by tegra241_cmdqv_get_vintf_size.
+ * Seeing one here indicates a kernel bug or some data corruption.
+ */
+ if (WARN_ON(vsmmu->core.type != IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV))
+ return -EOPNOTSUPP;
+
+ if (!user_data)
+ return -EINVAL;
+
+ ret = iommu_copy_struct_from_user(&data, user_data,
+ IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV,
+ out_vintf_mmap_length);
+ if (ret)
+ return ret;
+
+ ret = tegra241_cmdqv_init_vintf(cmdqv, cmdqv->num_vintfs - 1, vintf);
+ if (ret < 0) {
+ dev_err(cmdqv->dev, "no more available vintf\n");
+ return ret;
+ }
+
+ /*
+ * Initialize the user-owned VINTF without a LVCMDQ, as it cannot pre-
+ * allocate a LVCMDQ until user space wants one, for security reasons.
+ * It is different than the kernel-owned VINTF0, which had pre-assigned
+ * and pre-allocated global VCMDQs that would be mapped to the LVCMDQs
+ * by the tegra241_vintf_hw_init() call.
+ */
+ ret = tegra241_vintf_hw_init(vintf, false);
+ if (ret)
+ goto deinit_vintf;
+
+ page0_base = cmdqv->base_phys + TEGRA241_VINTFi_PAGE0(vintf->idx);
+ ret = iommufd_viommu_alloc_mmap(&vintf->vsmmu.core, page0_base, SZ_64K,
+ &vintf->mmap_offset);
+ if (ret)
+ goto hw_deinit_vintf;
+
+ data.out_vintf_mmap_length = SZ_64K;
+ data.out_vintf_mmap_offset = vintf->mmap_offset;
+ ret = iommu_copy_struct_to_user(user_data, &data,
+ IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV,
+ out_vintf_mmap_length);
+ if (ret)
+ goto free_mmap;
+
+ ida_init(&vintf->sids);
+ mutex_init(&vintf->lvcmdq_mutex);
+
+ dev_dbg(cmdqv->dev, "VINTF%u: allocated with vmid (%d)\n", vintf->idx,
+ vintf->vsmmu.vmid);
+
+ vsmmu->core.ops = &tegra241_cmdqv_viommu_ops;
+ return 0;
+
+free_mmap:
+ iommufd_viommu_destroy_mmap(&vintf->vsmmu.core, vintf->mmap_offset);
+hw_deinit_vintf:
+ tegra241_vintf_hw_deinit(vintf);
+deinit_vintf:
+ tegra241_cmdqv_deinit_vintf(cmdqv, vintf->idx);
+ return ret;
+}
+
+MODULE_IMPORT_NS("IOMMUFD");
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
index 9dc772f2cbb2..db9b9a8e139c 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
@@ -110,7 +110,6 @@ static struct arm_smmu_device *cavium_smmu_impl_init(struct arm_smmu_device *smm
int arm_mmu500_reset(struct arm_smmu_device *smmu)
{
u32 reg, major;
- int i;
/*
* On MMU-500 r2p0 onwards we need to clear ACR.CACHE_LOCK before
* writes to the context bank ACTLRs will stick. And we just hope that
@@ -128,18 +127,20 @@ int arm_mmu500_reset(struct arm_smmu_device *smmu)
reg |= ARM_MMU500_ACR_SMTNMB_TLBEN | ARM_MMU500_ACR_S2CRB_TLBEN;
arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_sACR, reg);
+#ifdef CONFIG_ARM_SMMU_MMU_500_CPRE_ERRATA
/*
* Disable MMU-500's not-particularly-beneficial next-page
- * prefetcher for the sake of errata #841119 and #826419.
+ * prefetcher for the sake of at least 5 known errata.
*/
- for (i = 0; i < smmu->num_context_banks; ++i) {
+ for (int i = 0; i < smmu->num_context_banks; ++i) {
reg = arm_smmu_cb_read(smmu, i, ARM_SMMU_CB_ACTLR);
reg &= ~ARM_MMU500_ACTLR_CPRE;
arm_smmu_cb_write(smmu, i, ARM_SMMU_CB_ACTLR, reg);
reg = arm_smmu_cb_read(smmu, i, ARM_SMMU_CB_ACTLR);
if (reg & ARM_MMU500_ACTLR_CPRE)
- dev_warn_once(smmu->dev, "Failed to disable prefetcher [errata #841119 and #826419], check ACR.CACHE_LOCK\n");
+ dev_warn_once(smmu->dev, "Failed to disable prefetcher for errata workarounds, check SACR.CACHE_LOCK\n");
}
+#endif
return 0;
}
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c b/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c
index 957d988b6d83..2fce4f6d4e1b 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c
@@ -200,7 +200,7 @@ static irqreturn_t nvidia_smmu_context_fault_bank(int irq,
void __iomem *cb_base = nvidia_smmu_page(smmu, inst, smmu->numpage + idx);
fsr = readl_relaxed(cb_base + ARM_SMMU_CB_FSR);
- if (!(fsr & ARM_SMMU_FSR_FAULT))
+ if (!(fsr & ARM_SMMU_CB_FSR_FAULT))
return IRQ_NONE;
fsynr = readl_relaxed(cb_base + ARM_SMMU_CB_FSYNR0);
@@ -277,7 +277,7 @@ static int nvidia_smmu_init_context(struct arm_smmu_domain *smmu_domain,
*/
if (of_device_is_compatible(np, "nvidia,tegra234-smmu") ||
of_device_is_compatible(np, "nvidia,tegra194-smmu")) {
- smmu->pgsize_bitmap = PAGE_SIZE;
+ smmu->pgsize_bitmap &= GENMASK(PAGE_SHIFT, 0);
pgtbl_cfg->pgsize_bitmap = smmu->pgsize_bitmap;
}
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c
index 552199cbd9e2..65e0ef6539fe 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c
@@ -73,7 +73,7 @@ void qcom_smmu_tlb_sync_debug(struct arm_smmu_device *smmu)
if (__ratelimit(&rs)) {
dev_err(smmu->dev, "TLB sync timed out -- SMMU may be deadlocked\n");
- cfg = qsmmu->cfg;
+ cfg = qsmmu->data->cfg;
if (!cfg)
return;
@@ -141,7 +141,7 @@ static int qcom_tbu_halt(struct qcom_tbu *tbu, struct arm_smmu_domain *smmu_doma
writel_relaxed(val, tbu->base + DEBUG_SID_HALT_REG);
fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR);
- if ((fsr & ARM_SMMU_FSR_FAULT) && (fsr & ARM_SMMU_FSR_SS)) {
+ if ((fsr & ARM_SMMU_CB_FSR_FAULT) && (fsr & ARM_SMMU_CB_FSR_SS)) {
u32 sctlr_orig, sctlr;
/*
@@ -298,7 +298,7 @@ static phys_addr_t qcom_iova_to_phys(struct arm_smmu_domain *smmu_domain,
arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, sctlr);
fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR);
- if (fsr & ARM_SMMU_FSR_FAULT) {
+ if (fsr & ARM_SMMU_CB_FSR_FAULT) {
/* Clear pending interrupts */
arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);
@@ -306,7 +306,7 @@ static phys_addr_t qcom_iova_to_phys(struct arm_smmu_domain *smmu_domain,
* TBU halt takes care of resuming any stalled transcation.
* Kept it here for completeness sake.
*/
- if (fsr & ARM_SMMU_FSR_SS)
+ if (fsr & ARM_SMMU_CB_FSR_SS)
arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME,
ARM_SMMU_RESUME_TERMINATE);
}
@@ -320,11 +320,11 @@ static phys_addr_t qcom_iova_to_phys(struct arm_smmu_domain *smmu_domain,
phys = qcom_tbu_trigger_atos(smmu_domain, tbu, iova, sid);
fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR);
- if (fsr & ARM_SMMU_FSR_FAULT) {
+ if (fsr & ARM_SMMU_CB_FSR_FAULT) {
/* Clear pending interrupts */
arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);
- if (fsr & ARM_SMMU_FSR_SS)
+ if (fsr & ARM_SMMU_CB_FSR_SS)
arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME,
ARM_SMMU_RESUME_TERMINATE);
}
@@ -383,68 +383,53 @@ irqreturn_t qcom_smmu_context_fault(int irq, void *dev)
struct arm_smmu_domain *smmu_domain = dev;
struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
struct arm_smmu_device *smmu = smmu_domain->smmu;
- u32 fsr, fsynr, cbfrsynra, resume = 0;
+ struct arm_smmu_context_fault_info cfi;
+ u32 resume = 0;
int idx = smmu_domain->cfg.cbndx;
phys_addr_t phys_soft;
- unsigned long iova;
int ret, tmp;
static DEFINE_RATELIMIT_STATE(_rs,
DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR);
- if (!(fsr & ARM_SMMU_FSR_FAULT))
- return IRQ_NONE;
+ arm_smmu_read_context_fault_info(smmu, idx, &cfi);
- fsynr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSYNR0);
- iova = arm_smmu_cb_readq(smmu, idx, ARM_SMMU_CB_FAR);
- cbfrsynra = arm_smmu_gr1_read(smmu, ARM_SMMU_GR1_CBFRSYNRA(idx));
+ if (!(cfi.fsr & ARM_SMMU_CB_FSR_FAULT))
+ return IRQ_NONE;
if (list_empty(&tbu_list)) {
- ret = report_iommu_fault(&smmu_domain->domain, NULL, iova,
- fsynr & ARM_SMMU_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ);
+ ret = report_iommu_fault(&smmu_domain->domain, NULL, cfi.iova,
+ cfi.fsynr & ARM_SMMU_CB_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ);
if (ret == -ENOSYS)
- dev_err_ratelimited(smmu->dev,
- "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n",
- fsr, iova, fsynr, cbfrsynra, idx);
+ arm_smmu_print_context_fault_info(smmu, idx, &cfi);
+
+ arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, cfi.fsr);
+
+ if (cfi.fsr & ARM_SMMU_CB_FSR_SS) {
+ arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME,
+ ret == -EAGAIN ? 0 : ARM_SMMU_RESUME_TERMINATE);
+ }
- arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);
return IRQ_HANDLED;
}
- phys_soft = ops->iova_to_phys(ops, iova);
+ phys_soft = ops->iova_to_phys(ops, cfi.iova);
- tmp = report_iommu_fault(&smmu_domain->domain, NULL, iova,
- fsynr & ARM_SMMU_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ);
+ tmp = report_iommu_fault(&smmu_domain->domain, NULL, cfi.iova,
+ cfi.fsynr & ARM_SMMU_CB_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ);
if (!tmp || tmp == -EBUSY) {
- dev_dbg(smmu->dev,
- "Context fault handled by client: iova=0x%08lx, fsr=0x%x, fsynr=0x%x, cb=%d\n",
- iova, fsr, fsynr, idx);
- dev_dbg(smmu->dev, "soft iova-to-phys=%pa\n", &phys_soft);
ret = IRQ_HANDLED;
resume = ARM_SMMU_RESUME_TERMINATE;
+ } else if (tmp == -EAGAIN) {
+ ret = IRQ_HANDLED;
+ resume = 0;
} else {
- phys_addr_t phys_atos = qcom_smmu_verify_fault(smmu_domain, iova, fsr);
+ phys_addr_t phys_atos = qcom_smmu_verify_fault(smmu_domain, cfi.iova, cfi.fsr);
if (__ratelimit(&_rs)) {
- dev_err(smmu->dev,
- "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n",
- fsr, iova, fsynr, cbfrsynra, idx);
- dev_err(smmu->dev,
- "FSR = %08x [%s%s%s%s%s%s%s%s%s], SID=0x%x\n",
- fsr,
- (fsr & 0x02) ? "TF " : "",
- (fsr & 0x04) ? "AFF " : "",
- (fsr & 0x08) ? "PF " : "",
- (fsr & 0x10) ? "EF " : "",
- (fsr & 0x20) ? "TLBMCF " : "",
- (fsr & 0x40) ? "TLBLKF " : "",
- (fsr & 0x80) ? "MHF " : "",
- (fsr & 0x40000000) ? "SS " : "",
- (fsr & 0x80000000) ? "MULTI " : "",
- cbfrsynra);
+ arm_smmu_print_context_fault_info(smmu, idx, &cfi);
dev_err(smmu->dev,
"soft iova-to-phys=%pa\n", &phys_soft);
@@ -478,17 +463,17 @@ irqreturn_t qcom_smmu_context_fault(int irq, void *dev)
*/
if (tmp != -EBUSY) {
/* Clear the faulting FSR */
- arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);
+ arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, cfi.fsr);
/* Retry or terminate any stalled transactions */
- if (fsr & ARM_SMMU_FSR_SS)
+ if (cfi.fsr & ARM_SMMU_CB_FSR_SS)
arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME, resume);
}
return ret;
}
-static int qcom_tbu_probe(struct platform_device *pdev)
+int qcom_tbu_probe(struct platform_device *pdev)
{
struct of_phandle_args args = { .args_count = 2 };
struct device_node *np = pdev->dev.of_node;
@@ -530,18 +515,3 @@ static int qcom_tbu_probe(struct platform_device *pdev)
return 0;
}
-
-static const struct of_device_id qcom_tbu_of_match[] = {
- { .compatible = "qcom,sc7280-tbu" },
- { .compatible = "qcom,sdm845-tbu" },
- { }
-};
-
-static struct platform_driver qcom_tbu_driver = {
- .driver = {
- .name = "qcom_tbu",
- .of_match_table = qcom_tbu_of_match,
- },
- .probe = qcom_tbu_probe,
-};
-builtin_platform_driver(qcom_tbu_driver);
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
index 25f034677f56..573085349df3 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -8,12 +8,48 @@
#include <linux/delay.h>
#include <linux/of_device.h>
#include <linux/firmware/qcom/qcom_scm.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
#include "arm-smmu.h"
#include "arm-smmu-qcom.h"
#define QCOM_DUMMY_VAL -1
+/*
+ * SMMU-500 TRM defines BIT(0) as CMTLB (Enable context caching in the
+ * macro TLB) and BIT(1) as CPRE (Enable context caching in the prefetch
+ * buffer). The remaining bits are implementation defined and vary across
+ * SoCs.
+ */
+
+#define CPRE (1 << 1)
+#define CMTLB (1 << 0)
+#define PREFETCH_SHIFT 8
+#define PREFETCH_DEFAULT 0
+#define PREFETCH_SHALLOW (1 << PREFETCH_SHIFT)
+#define PREFETCH_MODERATE (2 << PREFETCH_SHIFT)
+#define PREFETCH_DEEP (3 << PREFETCH_SHIFT)
+#define GFX_ACTLR_PRR (1 << 5)
+
+static const struct of_device_id qcom_smmu_actlr_client_of_match[] = {
+ { .compatible = "qcom,adreno",
+ .data = (const void *) (PREFETCH_DEEP | CPRE | CMTLB) },
+ { .compatible = "qcom,adreno-gmu",
+ .data = (const void *) (PREFETCH_DEEP | CPRE | CMTLB) },
+ { .compatible = "qcom,adreno-smmu",
+ .data = (const void *) (PREFETCH_DEEP | CPRE | CMTLB) },
+ { .compatible = "qcom,fastrpc",
+ .data = (const void *) (PREFETCH_DEEP | CPRE | CMTLB) },
+ { .compatible = "qcom,sc7280-mdss",
+ .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) },
+ { .compatible = "qcom,sc7280-venus",
+ .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) },
+ { .compatible = "qcom,sm8550-mdss",
+ .data = (const void *) (PREFETCH_DEFAULT | CMTLB) },
+ { }
+};
+
static struct qcom_smmu *to_qcom_smmu(struct arm_smmu_device *smmu)
{
return container_of(smmu, struct qcom_smmu, smmu);
@@ -76,25 +112,80 @@ static void qcom_adreno_smmu_set_stall(const void *cookie, bool enabled)
{
struct arm_smmu_domain *smmu_domain = (void *)cookie;
struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
- struct qcom_smmu *qsmmu = to_qcom_smmu(smmu_domain->smmu);
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ struct qcom_smmu *qsmmu = to_qcom_smmu(smmu);
+ u32 mask = BIT(cfg->cbndx);
+ bool stall_changed = !!(qsmmu->stall_enabled & mask) != enabled;
+ unsigned long flags;
if (enabled)
- qsmmu->stall_enabled |= BIT(cfg->cbndx);
+ qsmmu->stall_enabled |= mask;
else
- qsmmu->stall_enabled &= ~BIT(cfg->cbndx);
+ qsmmu->stall_enabled &= ~mask;
+
+ /*
+ * If the device is on and we changed the setting, update the register.
+ * The spec pseudocode says that CFCFG is resampled after a fault, and
+ * we believe that no implementations cache it in the TLB, so it should
+ * be safe to change it without a TLB invalidation.
+ */
+ if (stall_changed && pm_runtime_get_if_active(smmu->dev) > 0) {
+ u32 reg;
+
+ spin_lock_irqsave(&smmu_domain->cb_lock, flags);
+ reg = arm_smmu_cb_read(smmu, cfg->cbndx, ARM_SMMU_CB_SCTLR);
+
+ if (enabled)
+ reg |= ARM_SMMU_SCTLR_CFCFG;
+ else
+ reg &= ~ARM_SMMU_SCTLR_CFCFG;
+
+ arm_smmu_cb_write(smmu, cfg->cbndx, ARM_SMMU_CB_SCTLR, reg);
+ spin_unlock_irqrestore(&smmu_domain->cb_lock, flags);
+
+ pm_runtime_put_autosuspend(smmu->dev);
+ }
}
-static void qcom_adreno_smmu_resume_translation(const void *cookie, bool terminate)
+static void qcom_adreno_smmu_set_prr_bit(const void *cookie, bool set)
{
struct arm_smmu_domain *smmu_domain = (void *)cookie;
- struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
struct arm_smmu_device *smmu = smmu_domain->smmu;
+ struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
u32 reg = 0;
+ int ret;
- if (terminate)
- reg |= ARM_SMMU_RESUME_TERMINATE;
+ ret = pm_runtime_resume_and_get(smmu->dev);
+ if (ret < 0) {
+ dev_err(smmu->dev, "failed to get runtime PM: %d\n", ret);
+ return;
+ }
- arm_smmu_cb_write(smmu, cfg->cbndx, ARM_SMMU_CB_RESUME, reg);
+ reg = arm_smmu_cb_read(smmu, cfg->cbndx, ARM_SMMU_CB_ACTLR);
+ reg &= ~GFX_ACTLR_PRR;
+ if (set)
+ reg |= FIELD_PREP(GFX_ACTLR_PRR, 1);
+ arm_smmu_cb_write(smmu, cfg->cbndx, ARM_SMMU_CB_ACTLR, reg);
+ pm_runtime_put_autosuspend(smmu->dev);
+}
+
+static void qcom_adreno_smmu_set_prr_addr(const void *cookie, phys_addr_t page_addr)
+{
+ struct arm_smmu_domain *smmu_domain = (void *)cookie;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ int ret;
+
+ ret = pm_runtime_resume_and_get(smmu->dev);
+ if (ret < 0) {
+ dev_err(smmu->dev, "failed to get runtime PM: %d\n", ret);
+ return;
+ }
+
+ writel_relaxed(lower_32_bits(page_addr),
+ smmu->base + ARM_SMMU_GFX_PRR_CFG_LADDR);
+ writel_relaxed(upper_32_bits(page_addr),
+ smmu->base + ARM_SMMU_GFX_PRR_CFG_UADDR);
+ pm_runtime_put_autosuspend(smmu->dev);
}
#define QCOM_ADRENO_SMMU_GPU_SID 0
@@ -205,13 +296,37 @@ static bool qcom_adreno_can_do_ttbr1(struct arm_smmu_device *smmu)
return true;
}
+static void qcom_smmu_set_actlr_dev(struct device *dev, struct arm_smmu_device *smmu, int cbndx,
+ const struct of_device_id *client_match)
+{
+ const struct of_device_id *match =
+ of_match_device(client_match, dev);
+
+ if (!match) {
+ dev_dbg(dev, "no ACTLR settings present\n");
+ return;
+ }
+
+ arm_smmu_cb_write(smmu, cbndx, ARM_SMMU_CB_ACTLR, (unsigned long)match->data);
+}
+
static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain,
struct io_pgtable_cfg *pgtbl_cfg, struct device *dev)
{
+ const struct device_node *np = smmu_domain->smmu->dev->of_node;
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ struct qcom_smmu *qsmmu = to_qcom_smmu(smmu);
+ const struct of_device_id *client_match;
+ int cbndx = smmu_domain->cfg.cbndx;
struct adreno_smmu_priv *priv;
smmu_domain->cfg.flush_walk_prefer_tlbiasid = true;
+ client_match = qsmmu->data->client_match;
+
+ if (client_match)
+ qcom_smmu_set_actlr_dev(dev, smmu, cbndx, client_match);
+
/* Only enable split pagetables for the GPU device (SID 0) */
if (!qcom_adreno_smmu_is_gpu_device(dev))
return 0;
@@ -236,7 +351,15 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain,
priv->set_ttbr0_cfg = qcom_adreno_smmu_set_ttbr0_cfg;
priv->get_fault_info = qcom_adreno_smmu_get_fault_info;
priv->set_stall = qcom_adreno_smmu_set_stall;
- priv->resume_translation = qcom_adreno_smmu_resume_translation;
+ priv->set_prr_bit = NULL;
+ priv->set_prr_addr = NULL;
+
+ if (of_device_is_compatible(np, "qcom,smmu-500") &&
+ !of_device_is_compatible(np, "qcom,sm8250-smmu-500") &&
+ of_device_is_compatible(np, "qcom,adreno-smmu")) {
+ priv->set_prr_bit = qcom_adreno_smmu_set_prr_bit;
+ priv->set_prr_addr = qcom_adreno_smmu_set_prr_addr;
+ }
return 0;
}
@@ -244,9 +367,11 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain,
static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = {
{ .compatible = "qcom,adreno" },
{ .compatible = "qcom,adreno-gmu" },
+ { .compatible = "qcom,glymur-mdss" },
{ .compatible = "qcom,mdp4" },
{ .compatible = "qcom,mdss" },
{ .compatible = "qcom,qcm2290-mdss" },
+ { .compatible = "qcom,sar2130p-mdss" },
{ .compatible = "qcom,sc7180-mdss" },
{ .compatible = "qcom,sc7180-mss-pil" },
{ .compatible = "qcom,sc7280-mdss" },
@@ -256,6 +381,7 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = {
{ .compatible = "qcom,sdm670-mdss" },
{ .compatible = "qcom,sdm845-mdss" },
{ .compatible = "qcom,sdm845-mss-pil" },
+ { .compatible = "qcom,sm6115-mdss" },
{ .compatible = "qcom,sm6350-mdss" },
{ .compatible = "qcom,sm6375-mdss" },
{ .compatible = "qcom,sm8150-mdss" },
@@ -267,8 +393,18 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = {
static int qcom_smmu_init_context(struct arm_smmu_domain *smmu_domain,
struct io_pgtable_cfg *pgtbl_cfg, struct device *dev)
{
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ struct qcom_smmu *qsmmu = to_qcom_smmu(smmu);
+ const struct of_device_id *client_match;
+ int cbndx = smmu_domain->cfg.cbndx;
+
smmu_domain->cfg.flush_walk_prefer_tlbiasid = true;
+ client_match = qsmmu->data->client_match;
+
+ if (client_match)
+ qcom_smmu_set_actlr_dev(dev, smmu, cbndx, client_match);
+
return 0;
}
@@ -281,18 +417,34 @@ static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu)
int i;
/*
- * Some platforms support more than the Arm SMMU architected maximum of
- * 128 stream matching groups. For unknown reasons, the additional
- * groups don't exhibit the same behavior as the architected registers,
- * so limit the groups to 128 until the behavior is fixed for the other
- * groups.
+ * MSM8998 LPASS SMMU reports 13 context banks, but accessing
+ * the last context bank crashes the system.
*/
- if (smmu->num_mapping_groups > 128) {
- dev_notice(smmu->dev, "\tLimiting the stream matching groups to 128\n");
- smmu->num_mapping_groups = 128;
+ if (of_device_is_compatible(smmu->dev->of_node, "qcom,msm8998-smmu-v2") &&
+ smmu->num_context_banks == 13) {
+ smmu->num_context_banks = 12;
+ } else if (of_device_is_compatible(smmu->dev->of_node, "qcom,sdm630-smmu-v2")) {
+ if (smmu->num_context_banks == 21) /* SDM630 / SDM660 A2NOC SMMU */
+ smmu->num_context_banks = 7;
+ else if (smmu->num_context_banks == 14) /* SDM630 / SDM660 LPASS SMMU */
+ smmu->num_context_banks = 13;
}
- last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1);
+ /*
+ * Some platforms support more than the Arm SMMU architected maximum of
+ * 128 stream matching groups. The additional registers appear to have
+ * the same behavior as the architected registers in the hardware.
+ * However, on some firmware versions, the hypervisor does not
+ * correctly trap and emulate accesses to the additional registers,
+ * resulting in unexpected behavior.
+ *
+ * If there are more than 128 groups, use the last reliable group to
+ * detect if we need to apply the bypass quirk.
+ */
+ if (smmu->num_mapping_groups > 128)
+ last_s2cr = ARM_SMMU_GR0_S2CR(127);
+ else
+ last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1);
/*
* With some firmware versions writes to S2CR of type FAULT are
@@ -315,6 +467,11 @@ static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu)
reg = FIELD_PREP(ARM_SMMU_CBAR_TYPE, CBAR_TYPE_S1_TRANS_S2_BYPASS);
arm_smmu_gr1_write(smmu, ARM_SMMU_GR1_CBAR(qsmmu->bypass_cbndx), reg);
+
+ if (smmu->num_mapping_groups > 128) {
+ dev_notice(smmu->dev, "\tLimiting the stream matching groups to 128\n");
+ smmu->num_mapping_groups = 128;
+ }
}
for (i = 0; i < smmu->num_mapping_groups; i++) {
@@ -336,6 +493,19 @@ static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu)
return 0;
}
+static int qcom_adreno_smmuv2_cfg_probe(struct arm_smmu_device *smmu)
+{
+ /* Support for 16K pages is advertised on some SoCs, but it doesn't seem to work */
+ smmu->features &= ~ARM_SMMU_FEAT_FMT_AARCH64_16K;
+
+ /* TZ protects several last context banks, hide them from Linux */
+ if (of_device_is_compatible(smmu->dev->of_node, "qcom,sdm630-smmu-v2") &&
+ smmu->num_context_banks == 5)
+ smmu->num_context_banks = 2;
+
+ return 0;
+}
+
static void qcom_smmu_write_s2cr(struct arm_smmu_device *smmu, int idx)
{
struct arm_smmu_s2cr *s2cr = smmu->s2crs + idx;
@@ -434,10 +604,12 @@ static const struct arm_smmu_impl sdm845_smmu_500_impl = {
static const struct arm_smmu_impl qcom_adreno_smmu_v2_impl = {
.init_context = qcom_adreno_smmu_init_context,
+ .cfg_probe = qcom_adreno_smmuv2_cfg_probe,
.def_domain_type = qcom_smmu_def_domain_type,
.alloc_context_bank = qcom_adreno_smmu_alloc_context_bank,
.write_sctlr = qcom_adreno_smmu_write_sctlr,
.tlb_sync = qcom_smmu_tlb_sync,
+ .context_fault_needs_threaded_irq = true,
};
static const struct arm_smmu_impl qcom_adreno_smmu_500_impl = {
@@ -447,6 +619,7 @@ static const struct arm_smmu_impl qcom_adreno_smmu_500_impl = {
.alloc_context_bank = qcom_adreno_smmu_alloc_context_bank,
.write_sctlr = qcom_adreno_smmu_write_sctlr,
.tlb_sync = qcom_smmu_tlb_sync,
+ .context_fault_needs_threaded_irq = true,
};
static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu,
@@ -469,14 +642,15 @@ static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu,
/* Check to make sure qcom_scm has finished probing */
if (!qcom_scm_is_available())
- return ERR_PTR(-EPROBE_DEFER);
+ return ERR_PTR(dev_err_probe(smmu->dev, -EPROBE_DEFER,
+ "qcom_scm not ready\n"));
qsmmu = devm_krealloc(smmu->dev, smmu, sizeof(*qsmmu), GFP_KERNEL);
if (!qsmmu)
return ERR_PTR(-ENOMEM);
qsmmu->smmu.impl = impl;
- qsmmu->cfg = data->cfg;
+ qsmmu->data = data;
return &qsmmu->smmu;
}
@@ -519,6 +693,7 @@ static const struct qcom_smmu_match_data qcom_smmu_500_impl0_data = {
.impl = &qcom_smmu_500_impl,
.adreno_impl = &qcom_adreno_smmu_500_impl,
.cfg = &qcom_smmu_impl0_cfg,
+ .client_match = qcom_smmu_actlr_client_of_match,
};
/*
@@ -536,6 +711,7 @@ static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = {
{ .compatible = "qcom,sc8180x-smmu-500", .data = &qcom_smmu_500_impl0_data },
{ .compatible = "qcom,sc8280xp-smmu-500", .data = &qcom_smmu_500_impl0_data },
{ .compatible = "qcom,sdm630-smmu-v2", .data = &qcom_smmu_v2_data },
+ { .compatible = "qcom,sdm670-smmu-v2", .data = &qcom_smmu_v2_data },
{ .compatible = "qcom,sdm845-smmu-v2", .data = &qcom_smmu_v2_data },
{ .compatible = "qcom,sdm845-smmu-500", .data = &sdm845_smmu_500_data },
{ .compatible = "qcom,sm6115-smmu-500", .data = &qcom_smmu_500_impl0_data},
@@ -561,10 +737,47 @@ static struct acpi_platform_list qcom_acpi_platlist[] = {
};
#endif
+static int qcom_smmu_tbu_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ int ret;
+
+ if (IS_ENABLED(CONFIG_ARM_SMMU_QCOM_DEBUG)) {
+ ret = qcom_tbu_probe(pdev);
+ if (ret)
+ return ret;
+ }
+
+ if (dev->pm_domain) {
+ pm_runtime_set_active(dev);
+ pm_runtime_enable(dev);
+ }
+
+ return 0;
+}
+
+static const struct of_device_id qcom_smmu_tbu_of_match[] = {
+ { .compatible = "qcom,sc7280-tbu" },
+ { .compatible = "qcom,sdm845-tbu" },
+ { }
+};
+
+static struct platform_driver qcom_smmu_tbu_driver = {
+ .driver = {
+ .name = "qcom_tbu",
+ .of_match_table = qcom_smmu_tbu_of_match,
+ },
+ .probe = qcom_smmu_tbu_probe,
+};
+
struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu)
{
const struct device_node *np = smmu->dev->of_node;
const struct of_device_id *match;
+ static u8 tbu_registered;
+
+ if (!tbu_registered++)
+ platform_driver_register(&qcom_smmu_tbu_driver);
#ifdef CONFIG_ACPI
if (np == NULL) {
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h
index 9bb3ae7d62da..8addd453f5f1 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h
@@ -8,7 +8,7 @@
struct qcom_smmu {
struct arm_smmu_device smmu;
- const struct qcom_smmu_config *cfg;
+ const struct qcom_smmu_match_data *data;
bool bypass_quirk;
u8 bypass_cbndx;
u32 stall_enabled;
@@ -28,14 +28,17 @@ struct qcom_smmu_match_data {
const struct qcom_smmu_config *cfg;
const struct arm_smmu_impl *impl;
const struct arm_smmu_impl *adreno_impl;
+ const struct of_device_id * const client_match;
};
irqreturn_t qcom_smmu_context_fault(int irq, void *dev);
#ifdef CONFIG_ARM_SMMU_QCOM_DEBUG
void qcom_smmu_tlb_sync_debug(struct arm_smmu_device *smmu);
+int qcom_tbu_probe(struct platform_device *pdev);
#else
static inline void qcom_smmu_tlb_sync_debug(struct arm_smmu_device *smmu) { }
+static inline int qcom_tbu_probe(struct platform_device *pdev) { return -EINVAL; }
#endif
#endif /* _ARM_SMMU_QCOM_H */
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 87c81f75cf84..5e690cf85ec9 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -34,6 +34,7 @@
#include <linux/pm_runtime.h>
#include <linux/ratelimit.h>
#include <linux/slab.h>
+#include <linux/string_choices.h>
#include <linux/fsl/mc.h>
@@ -78,8 +79,11 @@ static inline int arm_smmu_rpm_get(struct arm_smmu_device *smmu)
static inline void arm_smmu_rpm_put(struct arm_smmu_device *smmu)
{
- if (pm_runtime_enabled(smmu->dev))
- pm_runtime_put_autosuspend(smmu->dev);
+ if (pm_runtime_enabled(smmu->dev)) {
+ pm_runtime_mark_last_busy(smmu->dev);
+ __pm_runtime_put_autosuspend(smmu->dev);
+
+ }
}
static void arm_smmu_rpm_use_autosuspend(struct arm_smmu_device *smmu)
@@ -105,7 +109,7 @@ static struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
}
static struct platform_driver arm_smmu_driver;
-static struct iommu_ops arm_smmu_ops;
+static const struct iommu_ops arm_smmu_ops;
#ifdef CONFIG_ARM_SMMU_LEGACY_DT_BINDINGS
static struct device_node *dev_get_dev_node(struct device *dev)
@@ -178,8 +182,7 @@ static int arm_smmu_register_legacy_master(struct device *dev,
it.cur_count = 1;
}
- err = iommu_fwspec_init(dev, &smmu_dev->of_node->fwnode,
- &arm_smmu_ops);
+ err = iommu_fwspec_init(dev, NULL);
if (err)
return err;
@@ -405,32 +408,78 @@ static const struct iommu_flush_ops arm_smmu_s2_tlb_ops_v1 = {
.tlb_add_page = arm_smmu_tlb_add_page_s2_v1,
};
+
+void arm_smmu_read_context_fault_info(struct arm_smmu_device *smmu, int idx,
+ struct arm_smmu_context_fault_info *cfi)
+{
+ cfi->iova = arm_smmu_cb_readq(smmu, idx, ARM_SMMU_CB_FAR);
+ cfi->fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR);
+ cfi->fsynr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSYNR0);
+ cfi->cbfrsynra = arm_smmu_gr1_read(smmu, ARM_SMMU_GR1_CBFRSYNRA(idx));
+}
+
+void arm_smmu_print_context_fault_info(struct arm_smmu_device *smmu, int idx,
+ const struct arm_smmu_context_fault_info *cfi)
+{
+ dev_err(smmu->dev,
+ "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n",
+ cfi->fsr, cfi->iova, cfi->fsynr, cfi->cbfrsynra, idx);
+
+ dev_err(smmu->dev, "FSR = %08x [%s%sFormat=%u%s%s%s%s%s%s%s%s], SID=0x%x\n",
+ cfi->fsr,
+ (cfi->fsr & ARM_SMMU_CB_FSR_MULTI) ? "MULTI " : "",
+ (cfi->fsr & ARM_SMMU_CB_FSR_SS) ? "SS " : "",
+ (u32)FIELD_GET(ARM_SMMU_CB_FSR_FORMAT, cfi->fsr),
+ (cfi->fsr & ARM_SMMU_CB_FSR_UUT) ? " UUT" : "",
+ (cfi->fsr & ARM_SMMU_CB_FSR_ASF) ? " ASF" : "",
+ (cfi->fsr & ARM_SMMU_CB_FSR_TLBLKF) ? " TLBLKF" : "",
+ (cfi->fsr & ARM_SMMU_CB_FSR_TLBMCF) ? " TLBMCF" : "",
+ (cfi->fsr & ARM_SMMU_CB_FSR_EF) ? " EF" : "",
+ (cfi->fsr & ARM_SMMU_CB_FSR_PF) ? " PF" : "",
+ (cfi->fsr & ARM_SMMU_CB_FSR_AFF) ? " AFF" : "",
+ (cfi->fsr & ARM_SMMU_CB_FSR_TF) ? " TF" : "",
+ cfi->cbfrsynra);
+
+ dev_err(smmu->dev, "FSYNR0 = %08x [S1CBNDX=%u%s%s%s%s%s%s PLVL=%u]\n",
+ cfi->fsynr,
+ (u32)FIELD_GET(ARM_SMMU_CB_FSYNR0_S1CBNDX, cfi->fsynr),
+ (cfi->fsynr & ARM_SMMU_CB_FSYNR0_AFR) ? " AFR" : "",
+ (cfi->fsynr & ARM_SMMU_CB_FSYNR0_PTWF) ? " PTWF" : "",
+ (cfi->fsynr & ARM_SMMU_CB_FSYNR0_NSATTR) ? " NSATTR" : "",
+ (cfi->fsynr & ARM_SMMU_CB_FSYNR0_IND) ? " IND" : "",
+ (cfi->fsynr & ARM_SMMU_CB_FSYNR0_PNU) ? " PNU" : "",
+ (cfi->fsynr & ARM_SMMU_CB_FSYNR0_WNR) ? " WNR" : "",
+ (u32)FIELD_GET(ARM_SMMU_CB_FSYNR0_PLVL, cfi->fsynr));
+}
+
static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
{
- u32 fsr, fsynr, cbfrsynra;
- unsigned long iova;
+ struct arm_smmu_context_fault_info cfi;
struct arm_smmu_domain *smmu_domain = dev;
struct arm_smmu_device *smmu = smmu_domain->smmu;
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
int idx = smmu_domain->cfg.cbndx;
int ret;
- fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR);
- if (!(fsr & ARM_SMMU_FSR_FAULT))
+ arm_smmu_read_context_fault_info(smmu, idx, &cfi);
+
+ if (!(cfi.fsr & ARM_SMMU_CB_FSR_FAULT))
return IRQ_NONE;
- fsynr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSYNR0);
- iova = arm_smmu_cb_readq(smmu, idx, ARM_SMMU_CB_FAR);
- cbfrsynra = arm_smmu_gr1_read(smmu, ARM_SMMU_GR1_CBFRSYNRA(idx));
+ ret = report_iommu_fault(&smmu_domain->domain, NULL, cfi.iova,
+ cfi.fsynr & ARM_SMMU_CB_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ);
- ret = report_iommu_fault(&smmu_domain->domain, NULL, iova,
- fsynr & ARM_SMMU_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ);
+ if (ret == -ENOSYS && __ratelimit(&rs))
+ arm_smmu_print_context_fault_info(smmu, idx, &cfi);
- if (ret == -ENOSYS)
- dev_err_ratelimited(smmu->dev,
- "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n",
- fsr, iova, fsynr, cbfrsynra, idx);
+ arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, cfi.fsr);
+
+ if (cfi.fsr & ARM_SMMU_CB_FSR_SS) {
+ arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME,
+ ret == -EAGAIN ? 0 : ARM_SMMU_RESUME_TERMINATE);
+ }
- arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);
return IRQ_HANDLED;
}
@@ -870,6 +919,8 @@ static void arm_smmu_destroy_domain_context(struct arm_smmu_domain *smmu_domain)
static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev)
{
struct arm_smmu_domain *smmu_domain;
+ struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev);
+ struct arm_smmu_device *smmu = cfg->smmu;
/*
* Allocate the domain and initialise some of its data structures.
@@ -882,6 +933,7 @@ static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev)
mutex_init(&smmu_domain->init_mutex);
spin_lock_init(&smmu_domain->cb_lock);
+ smmu_domain->domain.pgsize_bitmap = smmu->pgsize_bitmap;
return &smmu_domain->domain;
}
@@ -1113,7 +1165,8 @@ static void arm_smmu_master_install_s2crs(struct arm_smmu_master_cfg *cfg,
}
}
-static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev,
+ struct iommu_domain *old)
{
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
@@ -1155,7 +1208,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
/* Looks ok, so add the device to the domain */
arm_smmu_master_install_s2crs(cfg, S2CR_TYPE_TRANS,
smmu_domain->cfg.cbndx, fwspec);
- arm_smmu_rpm_use_autosuspend(smmu);
rpm_put:
arm_smmu_rpm_put(smmu);
return ret;
@@ -1178,13 +1230,13 @@ static int arm_smmu_attach_dev_type(struct device *dev,
return ret;
arm_smmu_master_install_s2crs(cfg, type, 0, fwspec);
- arm_smmu_rpm_use_autosuspend(smmu);
arm_smmu_rpm_put(smmu);
return 0;
}
static int arm_smmu_attach_dev_identity(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
return arm_smmu_attach_dev_type(dev, S2CR_TYPE_BYPASS);
}
@@ -1199,7 +1251,8 @@ static struct iommu_domain arm_smmu_identity_domain = {
};
static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
return arm_smmu_attach_dev_type(dev, S2CR_TYPE_FAULT);
}
@@ -1306,7 +1359,7 @@ static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain,
arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_ATS1PR, va);
reg = arm_smmu_page(smmu, ARM_SMMU_CB(smmu, idx)) + ARM_SMMU_CB_ATSR;
- if (readl_poll_timeout_atomic(reg, tmp, !(tmp & ARM_SMMU_ATSR_ACTIVE),
+ if (readl_poll_timeout_atomic(reg, tmp, !(tmp & ARM_SMMU_CB_ATSR_ACTIVE),
5, 50)) {
spin_unlock_irqrestore(&smmu_domain->cb_lock, flags);
dev_err(dev,
@@ -1372,8 +1425,8 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap)
static
struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode)
{
- struct device *dev = driver_find_device_by_fwnode(&arm_smmu_driver.driver,
- fwnode);
+ struct device *dev = bus_find_device_by_fwnode(&platform_bus_type, fwnode);
+
put_device(dev);
return dev ? dev_get_drvdata(dev) : NULL;
}
@@ -1446,7 +1499,6 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev)
out_cfg_free:
kfree(cfg);
out_free:
- iommu_fwspec_free(dev);
return ERR_PTR(ret);
}
@@ -1519,21 +1571,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
return group;
}
-static int arm_smmu_enable_nesting(struct iommu_domain *domain)
-{
- struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
- int ret = 0;
-
- mutex_lock(&smmu_domain->init_mutex);
- if (smmu_domain->smmu)
- ret = -EPERM;
- else
- smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
- mutex_unlock(&smmu_domain->init_mutex);
-
- return ret;
-}
-
static int arm_smmu_set_pgtable_quirks(struct iommu_domain *domain,
unsigned long quirks)
{
@@ -1596,7 +1633,7 @@ static int arm_smmu_def_domain_type(struct device *dev)
return 0;
}
-static struct iommu_ops arm_smmu_ops = {
+static const struct iommu_ops arm_smmu_ops = {
.identity_domain = &arm_smmu_identity_domain,
.blocked_domain = &arm_smmu_blocked_domain,
.capable = arm_smmu_capable,
@@ -1608,7 +1645,6 @@ static struct iommu_ops arm_smmu_ops = {
.of_xlate = arm_smmu_of_xlate,
.get_resv_regions = arm_smmu_get_resv_regions,
.def_domain_type = arm_smmu_def_domain_type,
- .pgsize_bitmap = -1UL, /* Restricted during device attach */
.owner = THIS_MODULE,
.default_domain_ops = &(const struct iommu_domain_ops) {
.attach_dev = arm_smmu_attach_dev,
@@ -1617,7 +1653,6 @@ static struct iommu_ops arm_smmu_ops = {
.flush_iotlb_all = arm_smmu_flush_iotlb_all,
.iotlb_sync = arm_smmu_iotlb_sync,
.iova_to_phys = arm_smmu_iova_to_phys,
- .enable_nesting = arm_smmu_enable_nesting,
.set_pgtable_quirks = arm_smmu_set_pgtable_quirks,
.free = arm_smmu_domain_free,
}
@@ -1642,7 +1677,7 @@ static void arm_smmu_device_reset(struct arm_smmu_device *smmu)
/* Make sure all context banks are disabled and clear CB_FSR */
for (i = 0; i < smmu->num_context_banks; ++i) {
arm_smmu_write_context_bank(smmu, i);
- arm_smmu_cb_write(smmu, i, ARM_SMMU_CB_FSR, ARM_SMMU_FSR_FAULT);
+ arm_smmu_cb_write(smmu, i, ARM_SMMU_CB_FSR, ARM_SMMU_CB_FSR_FAULT);
}
/* Invalidate the TLB, just in case */
@@ -1889,10 +1924,6 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
if (smmu->features & ARM_SMMU_FEAT_FMT_AARCH64_64K)
smmu->pgsize_bitmap |= SZ_64K | SZ_512M;
- if (arm_smmu_ops.pgsize_bitmap == -1UL)
- arm_smmu_ops.pgsize_bitmap = smmu->pgsize_bitmap;
- else
- arm_smmu_ops.pgsize_bitmap |= smmu->pgsize_bitmap;
dev_notice(smmu->dev, "\tSupported page sizes: 0x%08lx\n",
smmu->pgsize_bitmap);
@@ -2083,7 +2114,7 @@ static void arm_smmu_rmr_install_bypass_smr(struct arm_smmu_device *smmu)
}
dev_notice(smmu->dev, "\tpreserved %d boot mapping%s\n", cnt,
- cnt == 1 ? "" : "s");
+ str_plural(cnt));
iort_put_rmr_sids(dev_fwnode(smmu->dev), &rmr_list);
}
@@ -2193,29 +2224,26 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
i, irq);
}
+ platform_set_drvdata(pdev, smmu);
+
+ /* Check for RMRs and install bypass SMRs if any */
+ arm_smmu_rmr_install_bypass_smr(smmu);
+
+ arm_smmu_device_reset(smmu);
+ arm_smmu_test_smr_masks(smmu);
+
err = iommu_device_sysfs_add(&smmu->iommu, smmu->dev, NULL,
"smmu.%pa", &smmu->ioaddr);
- if (err) {
- dev_err(dev, "Failed to register iommu in sysfs\n");
- return err;
- }
+ if (err)
+ return dev_err_probe(dev, err, "Failed to register iommu in sysfs\n");
err = iommu_device_register(&smmu->iommu, &arm_smmu_ops,
using_legacy_binding ? NULL : dev);
if (err) {
- dev_err(dev, "Failed to register iommu\n");
iommu_device_sysfs_remove(&smmu->iommu);
- return err;
+ return dev_err_probe(dev, err, "Failed to register iommu\n");
}
- platform_set_drvdata(pdev, smmu);
-
- /* Check for RMRs and install bypass SMRs if any */
- arm_smmu_rmr_install_bypass_smr(smmu);
-
- arm_smmu_device_reset(smmu);
- arm_smmu_test_smr_masks(smmu);
-
/*
* We want to avoid touching dev->power.lock in fastpaths unless
* it's really going to do something useful - pm_runtime_enabled()
@@ -2225,6 +2253,7 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
if (dev->pm_domain) {
pm_runtime_set_active(dev);
pm_runtime_enable(dev);
+ arm_smmu_rpm_use_autosuspend(smmu);
}
return 0;
@@ -2333,7 +2362,7 @@ static struct platform_driver arm_smmu_driver = {
.suppress_bind_attrs = true,
},
.probe = arm_smmu_device_probe,
- .remove_new = arm_smmu_device_remove,
+ .remove = arm_smmu_device_remove,
.shutdown = arm_smmu_device_shutdown,
};
module_platform_driver(arm_smmu_driver);
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index 4765c6945c34..2dbf3243b5ad 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -154,6 +154,8 @@ enum arm_smmu_cbar_type {
#define ARM_SMMU_SCTLR_M BIT(0)
#define ARM_SMMU_CB_ACTLR 0x4
+#define ARM_SMMU_GFX_PRR_CFG_LADDR 0x6008
+#define ARM_SMMU_GFX_PRR_CFG_UADDR 0x600C
#define ARM_SMMU_CB_RESUME 0x8
#define ARM_SMMU_RESUME_TERMINATE BIT(0)
@@ -196,34 +198,42 @@ enum arm_smmu_cbar_type {
#define ARM_SMMU_CB_PAR_F BIT(0)
#define ARM_SMMU_CB_FSR 0x58
-#define ARM_SMMU_FSR_MULTI BIT(31)
-#define ARM_SMMU_FSR_SS BIT(30)
-#define ARM_SMMU_FSR_UUT BIT(8)
-#define ARM_SMMU_FSR_ASF BIT(7)
-#define ARM_SMMU_FSR_TLBLKF BIT(6)
-#define ARM_SMMU_FSR_TLBMCF BIT(5)
-#define ARM_SMMU_FSR_EF BIT(4)
-#define ARM_SMMU_FSR_PF BIT(3)
-#define ARM_SMMU_FSR_AFF BIT(2)
-#define ARM_SMMU_FSR_TF BIT(1)
-
-#define ARM_SMMU_FSR_IGN (ARM_SMMU_FSR_AFF | \
- ARM_SMMU_FSR_ASF | \
- ARM_SMMU_FSR_TLBMCF | \
- ARM_SMMU_FSR_TLBLKF)
-
-#define ARM_SMMU_FSR_FAULT (ARM_SMMU_FSR_MULTI | \
- ARM_SMMU_FSR_SS | \
- ARM_SMMU_FSR_UUT | \
- ARM_SMMU_FSR_EF | \
- ARM_SMMU_FSR_PF | \
- ARM_SMMU_FSR_TF | \
- ARM_SMMU_FSR_IGN)
+#define ARM_SMMU_CB_FSR_MULTI BIT(31)
+#define ARM_SMMU_CB_FSR_SS BIT(30)
+#define ARM_SMMU_CB_FSR_FORMAT GENMASK(10, 9)
+#define ARM_SMMU_CB_FSR_UUT BIT(8)
+#define ARM_SMMU_CB_FSR_ASF BIT(7)
+#define ARM_SMMU_CB_FSR_TLBLKF BIT(6)
+#define ARM_SMMU_CB_FSR_TLBMCF BIT(5)
+#define ARM_SMMU_CB_FSR_EF BIT(4)
+#define ARM_SMMU_CB_FSR_PF BIT(3)
+#define ARM_SMMU_CB_FSR_AFF BIT(2)
+#define ARM_SMMU_CB_FSR_TF BIT(1)
+
+#define ARM_SMMU_CB_FSR_IGN (ARM_SMMU_CB_FSR_AFF | \
+ ARM_SMMU_CB_FSR_ASF | \
+ ARM_SMMU_CB_FSR_TLBMCF | \
+ ARM_SMMU_CB_FSR_TLBLKF)
+
+#define ARM_SMMU_CB_FSR_FAULT (ARM_SMMU_CB_FSR_MULTI | \
+ ARM_SMMU_CB_FSR_SS | \
+ ARM_SMMU_CB_FSR_UUT | \
+ ARM_SMMU_CB_FSR_EF | \
+ ARM_SMMU_CB_FSR_PF | \
+ ARM_SMMU_CB_FSR_TF | \
+ ARM_SMMU_CB_FSR_IGN)
#define ARM_SMMU_CB_FAR 0x60
#define ARM_SMMU_CB_FSYNR0 0x68
-#define ARM_SMMU_FSYNR0_WNR BIT(4)
+#define ARM_SMMU_CB_FSYNR0_PLVL GENMASK(1, 0)
+#define ARM_SMMU_CB_FSYNR0_WNR BIT(4)
+#define ARM_SMMU_CB_FSYNR0_PNU BIT(5)
+#define ARM_SMMU_CB_FSYNR0_IND BIT(6)
+#define ARM_SMMU_CB_FSYNR0_NSATTR BIT(8)
+#define ARM_SMMU_CB_FSYNR0_PTWF BIT(10)
+#define ARM_SMMU_CB_FSYNR0_AFR BIT(11)
+#define ARM_SMMU_CB_FSYNR0_S1CBNDX GENMASK(23, 16)
#define ARM_SMMU_CB_FSYNR1 0x6c
@@ -237,7 +247,7 @@ enum arm_smmu_cbar_type {
#define ARM_SMMU_CB_ATS1PR 0x800
#define ARM_SMMU_CB_ATSR 0x8f0
-#define ARM_SMMU_ATSR_ACTIVE BIT(0)
+#define ARM_SMMU_CB_ATSR_ACTIVE BIT(0)
#define ARM_SMMU_RESUME_TERMINATE BIT(0)
@@ -533,4 +543,17 @@ struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu);
void arm_smmu_write_context_bank(struct arm_smmu_device *smmu, int idx);
int arm_mmu500_reset(struct arm_smmu_device *smmu);
+struct arm_smmu_context_fault_info {
+ unsigned long iova;
+ u32 fsr;
+ u32 fsynr;
+ u32 cbfrsynra;
+};
+
+void arm_smmu_read_context_fault_info(struct arm_smmu_device *smmu, int idx,
+ struct arm_smmu_context_fault_info *cfi);
+
+void arm_smmu_print_context_fault_info(struct arm_smmu_device *smmu, int idx,
+ const struct arm_smmu_context_fault_info *cfi);
+
#endif /* _ARM_SMMU_H */
diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
index e079bb7a993e..f69d9276dc55 100644
--- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c
+++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
@@ -194,7 +194,7 @@ static irqreturn_t qcom_iommu_fault(int irq, void *dev)
fsr = iommu_readl(ctx, ARM_SMMU_CB_FSR);
- if (!(fsr & ARM_SMMU_FSR_FAULT))
+ if (!(fsr & ARM_SMMU_CB_FSR_FAULT))
return IRQ_NONE;
fsynr = iommu_readl(ctx, ARM_SMMU_CB_FSYNR0);
@@ -229,7 +229,7 @@ static int qcom_iommu_init_domain(struct iommu_domain *domain,
goto out_unlock;
pgtbl_cfg = (struct io_pgtable_cfg) {
- .pgsize_bitmap = qcom_iommu_ops.pgsize_bitmap,
+ .pgsize_bitmap = domain->pgsize_bitmap,
.ias = 32,
.oas = 40,
.tlb = &qcom_flush_ops,
@@ -246,8 +246,6 @@ static int qcom_iommu_init_domain(struct iommu_domain *domain,
goto out_clear_iommu;
}
- /* Update the domain's page sizes to reflect the page table format */
- domain->pgsize_bitmap = pgtbl_cfg.pgsize_bitmap;
domain->geometry.aperture_end = (1ULL << pgtbl_cfg.ias) - 1;
domain->geometry.force_aperture = true;
@@ -274,7 +272,7 @@ static int qcom_iommu_init_domain(struct iommu_domain *domain,
/* Clear context bank fault address fault status registers */
iommu_writel(ctx, ARM_SMMU_CB_FAR, 0);
- iommu_writel(ctx, ARM_SMMU_CB_FSR, ARM_SMMU_FSR_FAULT);
+ iommu_writel(ctx, ARM_SMMU_CB_FSR, ARM_SMMU_CB_FSR_FAULT);
/* TTBRs */
iommu_writeq(ctx, ARM_SMMU_CB_TTBR0,
@@ -337,6 +335,7 @@ static struct iommu_domain *qcom_iommu_domain_alloc_paging(struct device *dev)
mutex_init(&qcom_domain->init_mutex);
spin_lock_init(&qcom_domain->pgtbl_lock);
+ qcom_domain->domain.pgsize_bitmap = SZ_4K;
return &qcom_domain->domain;
}
@@ -360,7 +359,8 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain)
kfree(qcom_domain);
}
-static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int qcom_iommu_attach_dev(struct iommu_domain *domain,
+ struct device *dev, struct iommu_domain *old)
{
struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev);
struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
@@ -389,18 +389,18 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev
}
static int qcom_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct qcom_iommu_domain *qcom_domain;
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev);
unsigned int i;
- if (domain == identity_domain || !domain)
+ if (old == identity_domain || !old)
return 0;
- qcom_domain = to_qcom_iommu_domain(domain);
+ qcom_domain = to_qcom_iommu_domain(old);
if (WARN_ON(!qcom_domain->iommu))
return -EINVAL;
@@ -566,14 +566,14 @@ static int qcom_iommu_of_xlate(struct device *dev,
qcom_iommu = platform_get_drvdata(iommu_pdev);
+ put_device(&iommu_pdev->dev);
+
/* make sure the asid specified in dt is valid, so we don't have
* to sanity check this elsewhere:
*/
if (WARN_ON(asid > qcom_iommu->max_asid) ||
- WARN_ON(qcom_iommu->ctxs[asid] == NULL)) {
- put_device(&iommu_pdev->dev);
+ WARN_ON(qcom_iommu->ctxs[asid] == NULL))
return -EINVAL;
- }
if (!dev_iommu_priv_get(dev)) {
dev_iommu_priv_set(dev, qcom_iommu);
@@ -582,10 +582,8 @@ static int qcom_iommu_of_xlate(struct device *dev,
* multiple different iommu devices. Multiple context
* banks are ok, but multiple devices are not:
*/
- if (WARN_ON(qcom_iommu != dev_iommu_priv_get(dev))) {
- put_device(&iommu_pdev->dev);
+ if (WARN_ON(qcom_iommu != dev_iommu_priv_get(dev)))
return -EINVAL;
- }
}
return iommu_fwspec_add_ids(dev, &asid, 1);
@@ -598,7 +596,6 @@ static const struct iommu_ops qcom_iommu_ops = {
.probe_device = qcom_iommu_probe_device,
.device_group = generic_device_group,
.of_xlate = qcom_iommu_of_xlate,
- .pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M,
.default_domain_ops = &(const struct iommu_domain_ops) {
.attach_dev = qcom_iommu_attach_dev,
.map_pages = qcom_iommu_map,
@@ -759,7 +756,7 @@ static struct platform_driver qcom_iommu_ctx_driver = {
.of_match_table = ctx_of_match,
},
.probe = qcom_iommu_ctx_probe,
- .remove_new = qcom_iommu_ctx_remove,
+ .remove = qcom_iommu_ctx_remove,
};
static bool qcom_iommu_has_secure_context(struct qcom_iommu_dev *qcom_iommu)
@@ -931,7 +928,7 @@ static struct platform_driver qcom_iommu_driver = {
.pm = &qcom_iommu_pm_ops,
},
.probe = qcom_iommu_device_probe,
- .remove_new = qcom_iommu_device_remove,
+ .remove = qcom_iommu_device_remove,
};
static int __init qcom_iommu_init(void)
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 43520e7275cc..c92088855450 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -17,14 +17,17 @@
#include <linux/gfp.h>
#include <linux/huge_mm.h>
#include <linux/iommu.h>
+#include <linux/iommu-dma.h>
#include <linux/iova.h>
#include <linux/irq.h>
#include <linux/list_sort.h>
#include <linux/memremap.h>
#include <linux/mm.h>
#include <linux/mutex.h>
+#include <linux/msi.h>
#include <linux/of_iommu.h>
#include <linux/pci.h>
+#include <linux/pci-p2pdma.h>
#include <linux/scatterlist.h>
#include <linux/spinlock.h>
#include <linux/swiotlb.h>
@@ -40,11 +43,6 @@ struct iommu_dma_msi_page {
phys_addr_t phys;
};
-enum iommu_dma_cookie_type {
- IOMMU_DMA_IOVA_COOKIE,
- IOMMU_DMA_MSI_COOKIE,
-};
-
enum iommu_dma_queue_type {
IOMMU_DMA_OPTS_PER_CPU_QUEUE,
IOMMU_DMA_OPTS_SINGLE_QUEUE,
@@ -57,35 +55,30 @@ struct iommu_dma_options {
};
struct iommu_dma_cookie {
- enum iommu_dma_cookie_type type;
+ struct iova_domain iovad;
+ struct list_head msi_page_list;
+ /* Flush queue */
union {
- /* Full allocator for IOMMU_DMA_IOVA_COOKIE */
- struct {
- struct iova_domain iovad;
- /* Flush queue */
- union {
- struct iova_fq *single_fq;
- struct iova_fq __percpu *percpu_fq;
- };
- /* Number of TLB flushes that have been started */
- atomic64_t fq_flush_start_cnt;
- /* Number of TLB flushes that have been finished */
- atomic64_t fq_flush_finish_cnt;
- /* Timer to regularily empty the flush queues */
- struct timer_list fq_timer;
- /* 1 when timer is active, 0 when not */
- atomic_t fq_timer_on;
- };
- /* Trivial linear page allocator for IOMMU_DMA_MSI_COOKIE */
- dma_addr_t msi_iova;
+ struct iova_fq *single_fq;
+ struct iova_fq __percpu *percpu_fq;
};
- struct list_head msi_page_list;
-
+ /* Number of TLB flushes that have been started */
+ atomic64_t fq_flush_start_cnt;
+ /* Number of TLB flushes that have been finished */
+ atomic64_t fq_flush_finish_cnt;
+ /* Timer to regularily empty the flush queues */
+ struct timer_list fq_timer;
+ /* 1 when timer is active, 0 when not */
+ atomic_t fq_timer_on;
/* Domain for flush queue callback; NULL if flush queue not in use */
- struct iommu_domain *fq_domain;
+ struct iommu_domain *fq_domain;
/* Options for dma-iommu use */
- struct iommu_dma_options options;
- struct mutex mutex;
+ struct iommu_dma_options options;
+};
+
+struct iommu_dma_msi_cookie {
+ dma_addr_t msi_iova;
+ struct list_head msi_page_list;
};
static DEFINE_STATIC_KEY_FALSE(iommu_deferred_attach_enabled);
@@ -113,7 +106,7 @@ early_param("iommu.forcedac", iommu_dma_forcedac_setup);
struct iova_fq_entry {
unsigned long iova_pfn;
unsigned long pages;
- struct list_head freelist;
+ struct iommu_pages_list freelist;
u64 counter; /* Flush counter when this entry was added */
};
@@ -162,6 +155,8 @@ static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq
fq->entries[idx].iova_pfn,
fq->entries[idx].pages);
+ fq->entries[idx].freelist =
+ IOMMU_PAGES_LIST_INIT(fq->entries[idx].freelist);
fq->head = (fq->head + 1) & fq->mod_mask;
}
}
@@ -184,7 +179,8 @@ static void fq_flush_iotlb(struct iommu_dma_cookie *cookie)
static void fq_flush_timeout(struct timer_list *t)
{
- struct iommu_dma_cookie *cookie = from_timer(cookie, t, fq_timer);
+ struct iommu_dma_cookie *cookie = timer_container_of(cookie, t,
+ fq_timer);
int cpu;
atomic_set(&cookie->fq_timer_on, 0);
@@ -200,7 +196,7 @@ static void fq_flush_timeout(struct timer_list *t)
static void queue_iova(struct iommu_dma_cookie *cookie,
unsigned long pfn, unsigned long pages,
- struct list_head *freelist)
+ struct iommu_pages_list *freelist)
{
struct iova_fq *fq;
unsigned long flags;
@@ -239,7 +235,7 @@ static void queue_iova(struct iommu_dma_cookie *cookie,
fq->entries[idx].iova_pfn = pfn;
fq->entries[idx].pages = pages;
fq->entries[idx].counter = atomic64_read(&cookie->fq_flush_start_cnt);
- list_splice(freelist, &fq->entries[idx].freelist);
+ iommu_pages_list_splice(freelist, &fq->entries[idx].freelist);
spin_unlock_irqrestore(&fq->lock, flags);
@@ -279,7 +275,7 @@ static void iommu_dma_free_fq(struct iommu_dma_cookie *cookie)
if (!cookie->fq_domain)
return;
- del_timer_sync(&cookie->fq_timer);
+ timer_delete_sync(&cookie->fq_timer);
if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE)
iommu_dma_free_fq_single(cookie->single_fq);
else
@@ -297,7 +293,8 @@ static void iommu_dma_init_one_fq(struct iova_fq *fq, size_t fq_size)
spin_lock_init(&fq->lock);
for (i = 0; i < fq_size; i++)
- INIT_LIST_HEAD(&fq->entries[i].freelist);
+ fq->entries[i].freelist =
+ IOMMU_PAGES_LIST_INIT(fq->entries[i].freelist);
}
static int iommu_dma_init_fq_single(struct iommu_dma_cookie *cookie)
@@ -364,39 +361,24 @@ int iommu_dma_init_fq(struct iommu_domain *domain)
return 0;
}
-static inline size_t cookie_msi_granule(struct iommu_dma_cookie *cookie)
-{
- if (cookie->type == IOMMU_DMA_IOVA_COOKIE)
- return cookie->iovad.granule;
- return PAGE_SIZE;
-}
-
-static struct iommu_dma_cookie *cookie_alloc(enum iommu_dma_cookie_type type)
-{
- struct iommu_dma_cookie *cookie;
-
- cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);
- if (cookie) {
- INIT_LIST_HEAD(&cookie->msi_page_list);
- cookie->type = type;
- }
- return cookie;
-}
-
/**
* iommu_get_dma_cookie - Acquire DMA-API resources for a domain
* @domain: IOMMU domain to prepare for DMA-API usage
*/
int iommu_get_dma_cookie(struct iommu_domain *domain)
{
- if (domain->iova_cookie)
+ struct iommu_dma_cookie *cookie;
+
+ if (domain->cookie_type != IOMMU_COOKIE_NONE)
return -EEXIST;
- domain->iova_cookie = cookie_alloc(IOMMU_DMA_IOVA_COOKIE);
- if (!domain->iova_cookie)
+ cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);
+ if (!cookie)
return -ENOMEM;
- mutex_init(&domain->iova_cookie->mutex);
+ INIT_LIST_HEAD(&cookie->msi_page_list);
+ domain->cookie_type = IOMMU_COOKIE_DMA_IOVA;
+ domain->iova_cookie = cookie;
return 0;
}
@@ -414,48 +396,56 @@ int iommu_get_dma_cookie(struct iommu_domain *domain)
*/
int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
{
- struct iommu_dma_cookie *cookie;
+ struct iommu_dma_msi_cookie *cookie;
if (domain->type != IOMMU_DOMAIN_UNMANAGED)
return -EINVAL;
- if (domain->iova_cookie)
+ if (domain->cookie_type != IOMMU_COOKIE_NONE)
return -EEXIST;
- cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE);
+ cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);
if (!cookie)
return -ENOMEM;
cookie->msi_iova = base;
- domain->iova_cookie = cookie;
+ INIT_LIST_HEAD(&cookie->msi_page_list);
+ domain->cookie_type = IOMMU_COOKIE_DMA_MSI;
+ domain->msi_cookie = cookie;
return 0;
}
EXPORT_SYMBOL(iommu_get_msi_cookie);
/**
* iommu_put_dma_cookie - Release a domain's DMA mapping resources
- * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() or
- * iommu_get_msi_cookie()
+ * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
*/
void iommu_put_dma_cookie(struct iommu_domain *domain)
{
struct iommu_dma_cookie *cookie = domain->iova_cookie;
struct iommu_dma_msi_page *msi, *tmp;
- if (!cookie)
- return;
-
- if (cookie->type == IOMMU_DMA_IOVA_COOKIE && cookie->iovad.granule) {
+ if (cookie->iovad.granule) {
iommu_dma_free_fq(cookie);
put_iova_domain(&cookie->iovad);
}
+ list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list)
+ kfree(msi);
+ kfree(cookie);
+}
- list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list) {
- list_del(&msi->list);
+/**
+ * iommu_put_msi_cookie - Release a domain's MSI mapping resources
+ * @domain: IOMMU domain previously prepared by iommu_get_msi_cookie()
+ */
+void iommu_put_msi_cookie(struct iommu_domain *domain)
+{
+ struct iommu_dma_msi_cookie *cookie = domain->msi_cookie;
+ struct iommu_dma_msi_page *msi, *tmp;
+
+ list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list)
kfree(msi);
- }
kfree(cookie);
- domain->iova_cookie = NULL;
}
/**
@@ -675,7 +665,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, struct device *dev
struct iova_domain *iovad;
int ret;
- if (!cookie || cookie->type != IOMMU_DMA_IOVA_COOKIE)
+ if (!cookie || domain->cookie_type != IOMMU_COOKIE_DMA_IOVA)
return -EINVAL;
iovad = &cookie->iovad;
@@ -697,23 +687,20 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, struct device *dev
domain->geometry.aperture_start >> order);
/* start_pfn is always nonzero for an already-initialised domain */
- mutex_lock(&cookie->mutex);
if (iovad->start_pfn) {
if (1UL << order != iovad->granule ||
base_pfn != iovad->start_pfn) {
pr_warn("Incompatible range for DMA domain\n");
- ret = -EFAULT;
- goto done_unlock;
+ return -EFAULT;
}
- ret = 0;
- goto done_unlock;
+ return 0;
}
init_iova_domain(iovad, 1UL << order, base_pfn);
ret = iova_domain_init_rcaches(iovad);
if (ret)
- goto done_unlock;
+ return ret;
iommu_dma_init_options(&cookie->options, dev);
@@ -722,11 +709,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, struct device *dev
(!device_iommu_capable(dev, IOMMU_CAP_DEFERRED_FLUSH) || iommu_dma_init_fq(domain)))
domain->type = IOMMU_DOMAIN_DMA;
- ret = iova_reserve_iommu_regions(dev, domain);
-
-done_unlock:
- mutex_unlock(&cookie->mutex);
- return ret;
+ return iova_reserve_iommu_regions(dev, domain);
}
/**
@@ -741,7 +724,12 @@ done_unlock:
static int dma_info_to_prot(enum dma_data_direction dir, bool coherent,
unsigned long attrs)
{
- int prot = coherent ? IOMMU_CACHE : 0;
+ int prot;
+
+ if (attrs & DMA_ATTR_MMIO)
+ prot = IOMMU_MMIO;
+ else
+ prot = coherent ? IOMMU_CACHE : 0;
if (attrs & DMA_ATTR_PRIVILEGED)
prot |= IOMMU_PRIV;
@@ -765,9 +753,9 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
struct iova_domain *iovad = &cookie->iovad;
unsigned long shift, iova_len, iova;
- if (cookie->type == IOMMU_DMA_MSI_COOKIE) {
- cookie->msi_iova += size;
- return cookie->msi_iova - size;
+ if (domain->cookie_type == IOMMU_COOKIE_DMA_MSI) {
+ domain->msi_cookie->msi_iova += size;
+ return domain->msi_cookie->msi_iova - size;
}
shift = iova_shift(iovad);
@@ -804,16 +792,16 @@ done:
return (dma_addr_t)iova << shift;
}
-static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie,
- dma_addr_t iova, size_t size, struct iommu_iotlb_gather *gather)
+static void iommu_dma_free_iova(struct iommu_domain *domain, dma_addr_t iova,
+ size_t size, struct iommu_iotlb_gather *gather)
{
- struct iova_domain *iovad = &cookie->iovad;
+ struct iova_domain *iovad = &domain->iova_cookie->iovad;
/* The MSI case is only ever cleaning up its most recent allocation */
- if (cookie->type == IOMMU_DMA_MSI_COOKIE)
- cookie->msi_iova -= size;
+ if (domain->cookie_type == IOMMU_COOKIE_DMA_MSI)
+ domain->msi_cookie->msi_iova -= size;
else if (gather && gather->queued)
- queue_iova(cookie, iova_pfn(iovad, iova),
+ queue_iova(domain->iova_cookie, iova_pfn(iovad, iova),
size >> iova_shift(iovad),
&gather->freelist);
else
@@ -841,7 +829,7 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr,
if (!iotlb_gather.queued)
iommu_iotlb_sync(domain, &iotlb_gather);
- iommu_dma_free_iova(cookie, dma_addr, size, &iotlb_gather);
+ iommu_dma_free_iova(domain, dma_addr, size, &iotlb_gather);
}
static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
@@ -869,7 +857,7 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
return DMA_MAPPING_ERROR;
if (iommu_map(domain, iova, phys - iova_off, size, prot, GFP_ATOMIC)) {
- iommu_dma_free_iova(cookie, iova, size, NULL);
+ iommu_dma_free_iova(domain, iova, size, NULL);
return DMA_MAPPING_ERROR;
}
return iova + iova_off;
@@ -939,8 +927,7 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev,
* but an IOMMU which supports smaller pages might not map the whole thing.
*/
static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev,
- size_t size, struct sg_table *sgt, gfp_t gfp, pgprot_t prot,
- unsigned long attrs)
+ size_t size, struct sg_table *sgt, gfp_t gfp, unsigned long attrs)
{
struct iommu_domain *domain = iommu_get_dma_domain(dev);
struct iommu_dma_cookie *cookie = domain->iova_cookie;
@@ -1007,22 +994,21 @@ static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev,
out_free_sg:
sg_free_table(sgt);
out_free_iova:
- iommu_dma_free_iova(cookie, iova, size, NULL);
+ iommu_dma_free_iova(domain, iova, size, NULL);
out_free_pages:
__iommu_dma_free_pages(pages, count);
return NULL;
}
static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t gfp, pgprot_t prot,
- unsigned long attrs)
+ dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
struct page **pages;
struct sg_table sgt;
void *vaddr;
+ pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs);
- pages = __iommu_dma_alloc_noncontiguous(dev, size, &sgt, gfp, prot,
- attrs);
+ pages = __iommu_dma_alloc_noncontiguous(dev, size, &sgt, gfp, attrs);
if (!pages)
return NULL;
*dma_handle = sgt.sgl->dma_address;
@@ -1039,9 +1025,23 @@ out_unmap:
return NULL;
}
-static struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev,
- size_t size, enum dma_data_direction dir, gfp_t gfp,
- unsigned long attrs)
+/*
+ * This is the actual return value from the iommu_dma_alloc_noncontiguous.
+ *
+ * The users of the DMA API should only care about the sg_table, but to make
+ * the DMA-API internal vmaping and freeing easier we stash away the page
+ * array as well (except for the fallback case). This can go away any time,
+ * e.g. when a vmap-variant that takes a scatterlist comes along.
+ */
+struct dma_sgt_handle {
+ struct sg_table sgt;
+ struct page **pages;
+};
+#define sgt_handle(sgt) \
+ container_of((sgt), struct dma_sgt_handle, sgt)
+
+struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev, size_t size,
+ enum dma_data_direction dir, gfp_t gfp, unsigned long attrs)
{
struct dma_sgt_handle *sh;
@@ -1049,8 +1049,7 @@ static struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev,
if (!sh)
return NULL;
- sh->pages = __iommu_dma_alloc_noncontiguous(dev, size, &sh->sgt, gfp,
- PAGE_KERNEL, attrs);
+ sh->pages = __iommu_dma_alloc_noncontiguous(dev, size, &sh->sgt, gfp, attrs);
if (!sh->pages) {
kfree(sh);
return NULL;
@@ -1058,7 +1057,7 @@ static struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev,
return &sh->sgt;
}
-static void iommu_dma_free_noncontiguous(struct device *dev, size_t size,
+void iommu_dma_free_noncontiguous(struct device *dev, size_t size,
struct sg_table *sgt, enum dma_data_direction dir)
{
struct dma_sgt_handle *sh = sgt_handle(sgt);
@@ -1069,8 +1068,26 @@ static void iommu_dma_free_noncontiguous(struct device *dev, size_t size,
kfree(sh);
}
-static void iommu_dma_sync_single_for_cpu(struct device *dev,
- dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
+void *iommu_dma_vmap_noncontiguous(struct device *dev, size_t size,
+ struct sg_table *sgt)
+{
+ unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ return vmap(sgt_handle(sgt)->pages, count, VM_MAP, PAGE_KERNEL);
+}
+
+int iommu_dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma,
+ size_t size, struct sg_table *sgt)
+{
+ unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ if (vma->vm_pgoff >= count || vma_pages(vma) > count - vma->vm_pgoff)
+ return -ENXIO;
+ return vm_map_pages(vma, sgt_handle(sgt)->pages, count);
+}
+
+void iommu_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
+ size_t size, enum dma_data_direction dir)
{
phys_addr_t phys;
@@ -1081,12 +1098,11 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev,
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu(phys, size, dir);
- if (is_swiotlb_buffer(dev, phys))
- swiotlb_sync_single_for_cpu(dev, phys, size, dir);
+ swiotlb_sync_single_for_cpu(dev, phys, size, dir);
}
-static void iommu_dma_sync_single_for_device(struct device *dev,
- dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
+void iommu_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle,
+ size_t size, enum dma_data_direction dir)
{
phys_addr_t phys;
@@ -1094,16 +1110,14 @@ static void iommu_dma_sync_single_for_device(struct device *dev,
return;
phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
- if (is_swiotlb_buffer(dev, phys))
- swiotlb_sync_single_for_device(dev, phys, size, dir);
+ swiotlb_sync_single_for_device(dev, phys, size, dir);
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_device(phys, size, dir);
}
-static void iommu_dma_sync_sg_for_cpu(struct device *dev,
- struct scatterlist *sgl, int nelems,
- enum dma_data_direction dir)
+void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
+ int nelems, enum dma_data_direction dir)
{
struct scatterlist *sg;
int i;
@@ -1117,9 +1131,8 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev,
arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir);
}
-static void iommu_dma_sync_sg_for_device(struct device *dev,
- struct scatterlist *sgl, int nelems,
- enum dma_data_direction dir)
+void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
+ int nelems, enum dma_data_direction dir)
{
struct scatterlist *sg;
int i;
@@ -1134,11 +1147,57 @@ static void iommu_dma_sync_sg_for_device(struct device *dev,
arch_sync_dma_for_device(sg_phys(sg), sg->length, dir);
}
-static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size, enum dma_data_direction dir,
- unsigned long attrs)
+static phys_addr_t iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ struct iommu_domain *domain = iommu_get_dma_domain(dev);
+ struct iova_domain *iovad = &domain->iova_cookie->iovad;
+
+ if (!is_swiotlb_active(dev)) {
+ dev_warn_once(dev, "DMA bounce buffers are inactive, unable to map unaligned transaction.\n");
+ return (phys_addr_t)DMA_MAPPING_ERROR;
+ }
+
+ trace_swiotlb_bounced(dev, phys, size);
+
+ phys = swiotlb_tbl_map_single(dev, phys, size, iova_mask(iovad), dir,
+ attrs);
+
+ /*
+ * Untrusted devices should not see padding areas with random leftover
+ * kernel data, so zero the pre- and post-padding.
+ * swiotlb_tbl_map_single() has initialized the bounce buffer proper to
+ * the contents of the original memory buffer.
+ */
+ if (phys != (phys_addr_t)DMA_MAPPING_ERROR && dev_is_untrusted(dev)) {
+ size_t start, virt = (size_t)phys_to_virt(phys);
+
+ /* Pre-padding */
+ start = iova_align_down(iovad, virt);
+ memset((void *)start, 0, virt - start);
+
+ /* Post-padding */
+ start = virt + size;
+ memset((void *)start, 0, iova_align(iovad, start) - start);
+ }
+
+ return phys;
+}
+
+/*
+ * Checks if a physical buffer has unaligned boundaries with respect to
+ * the IOMMU granule. Returns non-zero if either the start or end
+ * address is not aligned to the granule boundary.
+ */
+static inline size_t iova_unaligned(struct iova_domain *iovad, phys_addr_t phys,
+ size_t size)
+{
+ return iova_offset(iovad, phys | size);
+}
+
+dma_addr_t iommu_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
{
- phys_addr_t phys = page_to_phys(page) + offset;
bool coherent = dev_is_dma_coherent(dev);
int prot = dma_info_to_prot(dir, coherent, attrs);
struct iommu_domain *domain = iommu_get_dma_domain(dev);
@@ -1147,60 +1206,39 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
dma_addr_t iova, dma_mask = dma_get_mask(dev);
/*
- * If both the physical buffer start address and size are
- * page aligned, we don't need to use a bounce page.
+ * If both the physical buffer start address and size are page aligned,
+ * we don't need to use a bounce page.
*/
if (dev_use_swiotlb(dev, size, dir) &&
- iova_offset(iovad, phys | size)) {
- if (!is_swiotlb_active(dev)) {
- dev_warn_once(dev, "DMA bounce buffers are inactive, unable to map unaligned transaction.\n");
+ iova_unaligned(iovad, phys, size)) {
+ if (attrs & DMA_ATTR_MMIO)
return DMA_MAPPING_ERROR;
- }
-
- trace_swiotlb_bounced(dev, phys, size);
- phys = swiotlb_tbl_map_single(dev, phys, size,
- iova_mask(iovad), dir, attrs);
-
- if (phys == DMA_MAPPING_ERROR)
+ phys = iommu_dma_map_swiotlb(dev, phys, size, dir, attrs);
+ if (phys == (phys_addr_t)DMA_MAPPING_ERROR)
return DMA_MAPPING_ERROR;
-
- /*
- * Untrusted devices should not see padding areas with random
- * leftover kernel data, so zero the pre- and post-padding.
- * swiotlb_tbl_map_single() has initialized the bounce buffer
- * proper to the contents of the original memory buffer.
- */
- if (dev_is_untrusted(dev)) {
- size_t start, virt = (size_t)phys_to_virt(phys);
-
- /* Pre-padding */
- start = iova_align_down(iovad, virt);
- memset((void *)start, 0, virt - start);
-
- /* Post-padding */
- start = virt + size;
- memset((void *)start, 0,
- iova_align(iovad, start) - start);
- }
}
- if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
arch_sync_dma_for_device(phys, size, dir);
iova = __iommu_dma_map(dev, phys, size, prot, dma_mask);
- if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(dev, phys))
+ if (iova == DMA_MAPPING_ERROR && !(attrs & DMA_ATTR_MMIO))
swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
return iova;
}
-static void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
+void iommu_dma_unmap_phys(struct device *dev, dma_addr_t dma_handle,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
- struct iommu_domain *domain = iommu_get_dma_domain(dev);
phys_addr_t phys;
- phys = iommu_iova_to_phys(domain, dma_handle);
+ if (attrs & DMA_ATTR_MMIO) {
+ __iommu_dma_unmap(dev, dma_handle, size);
+ return;
+ }
+
+ phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
if (WARN_ON(!phys))
return;
@@ -1209,8 +1247,7 @@ static void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
__iommu_dma_unmap(dev, dma_handle, size);
- if (unlikely(is_swiotlb_buffer(dev, phys)))
- swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
+ swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
}
/*
@@ -1314,7 +1351,7 @@ static void iommu_dma_unmap_sg_swiotlb(struct device *dev, struct scatterlist *s
int i;
for_each_sg(sg, s, nents, i)
- iommu_dma_unmap_page(dev, sg_dma_address(s),
+ iommu_dma_unmap_phys(dev, sg_dma_address(s),
sg_dma_len(s), dir, attrs);
}
@@ -1327,8 +1364,8 @@ static int iommu_dma_map_sg_swiotlb(struct device *dev, struct scatterlist *sg,
sg_dma_mark_swiotlb(sg);
for_each_sg(sg, s, nents, i) {
- sg_dma_address(s) = iommu_dma_map_page(dev, sg_page(s),
- s->offset, s->length, dir, attrs);
+ sg_dma_address(s) = iommu_dma_map_phys(dev, sg_phys(s),
+ s->length, dir, attrs);
if (sg_dma_address(s) == DMA_MAPPING_ERROR)
goto out_unmap;
sg_dma_len(s) = s->length;
@@ -1348,8 +1385,8 @@ out_unmap:
* impedance-matching, to be able to hand off a suitably-aligned list,
* but still preserve the original offsets and sizes for the caller.
*/
-static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
- int nents, enum dma_data_direction dir, unsigned long attrs)
+int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+ enum dma_data_direction dir, unsigned long attrs)
{
struct iommu_domain *domain = iommu_get_dma_domain(dev);
struct iommu_dma_cookie *cookie = domain->iova_cookie;
@@ -1357,7 +1394,6 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
struct scatterlist *s, *prev = NULL;
int prot = dma_info_to_prot(dir, dev_is_dma_coherent(dev), attrs);
struct pci_p2pdma_map_state p2pdma_state = {};
- enum pci_p2pdma_map_type map;
dma_addr_t iova;
size_t iova_len = 0;
unsigned long mask = dma_get_seg_boundary(dev);
@@ -1387,28 +1423,30 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
size_t s_length = s->length;
size_t pad_len = (mask - iova_len + 1) & mask;
- if (is_pci_p2pdma_page(sg_page(s))) {
- map = pci_p2pdma_map_segment(&p2pdma_state, dev, s);
- switch (map) {
- case PCI_P2PDMA_MAP_BUS_ADDR:
- /*
- * iommu_map_sg() will skip this segment as
- * it is marked as a bus address,
- * __finalise_sg() will copy the dma address
- * into the output segment.
- */
- continue;
- case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
- /*
- * Mapping through host bridge should be
- * mapped with regular IOVAs, thus we
- * do nothing here and continue below.
- */
- break;
- default:
- ret = -EREMOTEIO;
- goto out_restore_sg;
- }
+ switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(s))) {
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ /*
+ * Mapping through host bridge should be mapped with
+ * regular IOVAs, thus we do nothing here and continue
+ * below.
+ */
+ break;
+ case PCI_P2PDMA_MAP_NONE:
+ break;
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ /*
+ * iommu_map_sg() will skip this segment as it is marked
+ * as a bus address, __finalise_sg() will copy the dma
+ * address into the output segment.
+ */
+ s->dma_address = pci_p2pdma_bus_addr_map(
+ p2pdma_state.mem, sg_phys(s));
+ sg_dma_len(s) = sg->length;
+ sg_dma_mark_bus_address(s);
+ continue;
+ default:
+ ret = -EREMOTEIO;
+ goto out_restore_sg;
}
sg_dma_address(s) = s_iova_off;
@@ -1459,7 +1497,7 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
return __finalise_sg(dev, sg, nents, iova);
out_free_iova:
- iommu_dma_free_iova(cookie, iova, iova_len, NULL);
+ iommu_dma_free_iova(domain, iova, iova_len, NULL);
out_restore_sg:
__invalidate_sg(sg, nents);
out:
@@ -1468,8 +1506,8 @@ out:
return ret;
}
-static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
- int nents, enum dma_data_direction dir, unsigned long attrs)
+void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
+ enum dma_data_direction dir, unsigned long attrs)
{
dma_addr_t end = 0, start;
struct scatterlist *tmp;
@@ -1518,20 +1556,6 @@ static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
__iommu_dma_unmap(dev, start, end - start);
}
-static dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys,
- size_t size, enum dma_data_direction dir, unsigned long attrs)
-{
- return __iommu_dma_map(dev, phys, size,
- dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO,
- dma_get_mask(dev));
-}
-
-static void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle,
- size_t size, enum dma_data_direction dir, unsigned long attrs)
-{
- __iommu_dma_unmap(dev, handle, size);
-}
-
static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr)
{
size_t alloc_size = PAGE_ALIGN(size);
@@ -1563,7 +1587,7 @@ static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr)
dma_free_contiguous(dev, page, alloc_size);
}
-static void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr,
+void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr,
dma_addr_t handle, unsigned long attrs)
{
__iommu_dma_unmap(dev, handle, size);
@@ -1607,8 +1631,8 @@ out_free_pages:
return NULL;
}
-static void *iommu_dma_alloc(struct device *dev, size_t size,
- dma_addr_t *handle, gfp_t gfp, unsigned long attrs)
+void *iommu_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
+ gfp_t gfp, unsigned long attrs)
{
bool coherent = dev_is_dma_coherent(dev);
int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
@@ -1619,8 +1643,7 @@ static void *iommu_dma_alloc(struct device *dev, size_t size,
if (gfpflags_allow_blocking(gfp) &&
!(attrs & DMA_ATTR_FORCE_CONTIGUOUS)) {
- return iommu_dma_alloc_remap(dev, size, handle, gfp,
- dma_pgprot(dev, PAGE_KERNEL, attrs), attrs);
+ return iommu_dma_alloc_remap(dev, size, handle, gfp, attrs);
}
if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
@@ -1642,7 +1665,7 @@ static void *iommu_dma_alloc(struct device *dev, size_t size,
return cpu_addr;
}
-static int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
+int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
{
@@ -1673,7 +1696,7 @@ static int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
vma->vm_page_prot);
}
-static int iommu_dma_get_sgtable(struct device *dev, struct sg_table *sgt,
+int iommu_dma_get_sgtable(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
{
@@ -1700,19 +1723,19 @@ static int iommu_dma_get_sgtable(struct device *dev, struct sg_table *sgt,
return ret;
}
-static unsigned long iommu_dma_get_merge_boundary(struct device *dev)
+unsigned long iommu_dma_get_merge_boundary(struct device *dev)
{
struct iommu_domain *domain = iommu_get_dma_domain(dev);
return (1UL << __ffs(domain->pgsize_bitmap)) - 1;
}
-static size_t iommu_dma_opt_mapping_size(void)
+size_t iommu_dma_opt_mapping_size(void)
{
return iova_rcache_range();
}
-static size_t iommu_dma_max_mapping_size(struct device *dev)
+size_t iommu_dma_max_mapping_size(struct device *dev)
{
if (dev_is_untrusted(dev))
return swiotlb_max_mapping_size(dev);
@@ -1720,31 +1743,359 @@ static size_t iommu_dma_max_mapping_size(struct device *dev)
return SIZE_MAX;
}
-static const struct dma_map_ops iommu_dma_ops = {
- .flags = DMA_F_PCI_P2PDMA_SUPPORTED |
- DMA_F_CAN_SKIP_SYNC,
- .alloc = iommu_dma_alloc,
- .free = iommu_dma_free,
- .alloc_pages_op = dma_common_alloc_pages,
- .free_pages = dma_common_free_pages,
- .alloc_noncontiguous = iommu_dma_alloc_noncontiguous,
- .free_noncontiguous = iommu_dma_free_noncontiguous,
- .mmap = iommu_dma_mmap,
- .get_sgtable = iommu_dma_get_sgtable,
- .map_page = iommu_dma_map_page,
- .unmap_page = iommu_dma_unmap_page,
- .map_sg = iommu_dma_map_sg,
- .unmap_sg = iommu_dma_unmap_sg,
- .sync_single_for_cpu = iommu_dma_sync_single_for_cpu,
- .sync_single_for_device = iommu_dma_sync_single_for_device,
- .sync_sg_for_cpu = iommu_dma_sync_sg_for_cpu,
- .sync_sg_for_device = iommu_dma_sync_sg_for_device,
- .map_resource = iommu_dma_map_resource,
- .unmap_resource = iommu_dma_unmap_resource,
- .get_merge_boundary = iommu_dma_get_merge_boundary,
- .opt_mapping_size = iommu_dma_opt_mapping_size,
- .max_mapping_size = iommu_dma_max_mapping_size,
-};
+/**
+ * dma_iova_try_alloc - Try to allocate an IOVA space
+ * @dev: Device to allocate the IOVA space for
+ * @state: IOVA state
+ * @phys: physical address
+ * @size: IOVA size
+ *
+ * Check if @dev supports the IOVA-based DMA API, and if yes allocate IOVA space
+ * for the given base address and size.
+ *
+ * Note: @phys is only used to calculate the IOVA alignment. Callers that always
+ * do PAGE_SIZE aligned transfers can safely pass 0 here.
+ *
+ * Returns %true if the IOVA-based DMA API can be used and IOVA space has been
+ * allocated, or %false if the regular DMA API should be used.
+ */
+bool dma_iova_try_alloc(struct device *dev, struct dma_iova_state *state,
+ phys_addr_t phys, size_t size)
+{
+ struct iommu_dma_cookie *cookie;
+ struct iommu_domain *domain;
+ struct iova_domain *iovad;
+ size_t iova_off;
+ dma_addr_t addr;
+
+ memset(state, 0, sizeof(*state));
+ if (!use_dma_iommu(dev))
+ return false;
+
+ domain = iommu_get_dma_domain(dev);
+ cookie = domain->iova_cookie;
+ iovad = &cookie->iovad;
+ iova_off = iova_offset(iovad, phys);
+
+ if (static_branch_unlikely(&iommu_deferred_attach_enabled) &&
+ iommu_deferred_attach(dev, iommu_get_domain_for_dev(dev)))
+ return false;
+
+ if (WARN_ON_ONCE(!size))
+ return false;
+
+ /*
+ * DMA_IOVA_USE_SWIOTLB is flag which is set by dma-iommu
+ * internals, make sure that caller didn't set it and/or
+ * didn't use this interface to map SIZE_MAX.
+ */
+ if (WARN_ON_ONCE((u64)size & DMA_IOVA_USE_SWIOTLB))
+ return false;
+
+ addr = iommu_dma_alloc_iova(domain,
+ iova_align(iovad, size + iova_off),
+ dma_get_mask(dev), dev);
+ if (!addr)
+ return false;
+
+ state->addr = addr + iova_off;
+ state->__size = size;
+ return true;
+}
+EXPORT_SYMBOL_GPL(dma_iova_try_alloc);
+
+/**
+ * dma_iova_free - Free an IOVA space
+ * @dev: Device to free the IOVA space for
+ * @state: IOVA state
+ *
+ * Undoes a successful dma_try_iova_alloc().
+ *
+ * Note that all dma_iova_link() calls need to be undone first. For callers
+ * that never call dma_iova_unlink(), dma_iova_destroy() can be used instead
+ * which unlinks all ranges and frees the IOVA space in a single efficient
+ * operation.
+ */
+void dma_iova_free(struct device *dev, struct dma_iova_state *state)
+{
+ struct iommu_domain *domain = iommu_get_dma_domain(dev);
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ struct iova_domain *iovad = &cookie->iovad;
+ size_t iova_start_pad = iova_offset(iovad, state->addr);
+ size_t size = dma_iova_size(state);
+
+ iommu_dma_free_iova(domain, state->addr - iova_start_pad,
+ iova_align(iovad, size + iova_start_pad), NULL);
+}
+EXPORT_SYMBOL_GPL(dma_iova_free);
+
+static int __dma_iova_link(struct device *dev, dma_addr_t addr,
+ phys_addr_t phys, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ bool coherent = dev_is_dma_coherent(dev);
+ int prot = dma_info_to_prot(dir, coherent, attrs);
+
+ if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
+ arch_sync_dma_for_device(phys, size, dir);
+
+ return iommu_map_nosync(iommu_get_dma_domain(dev), addr, phys, size,
+ prot, GFP_ATOMIC);
+}
+
+static int iommu_dma_iova_bounce_and_link(struct device *dev, dma_addr_t addr,
+ phys_addr_t phys, size_t bounce_len,
+ enum dma_data_direction dir, unsigned long attrs,
+ size_t iova_start_pad)
+{
+ struct iommu_domain *domain = iommu_get_dma_domain(dev);
+ struct iova_domain *iovad = &domain->iova_cookie->iovad;
+ phys_addr_t bounce_phys;
+ int error;
+
+ bounce_phys = iommu_dma_map_swiotlb(dev, phys, bounce_len, dir, attrs);
+ if (bounce_phys == DMA_MAPPING_ERROR)
+ return -ENOMEM;
+
+ error = __dma_iova_link(dev, addr - iova_start_pad,
+ bounce_phys - iova_start_pad,
+ iova_align(iovad, bounce_len), dir, attrs);
+ if (error)
+ swiotlb_tbl_unmap_single(dev, bounce_phys, bounce_len, dir,
+ attrs);
+ return error;
+}
+
+static int iommu_dma_iova_link_swiotlb(struct device *dev,
+ struct dma_iova_state *state, phys_addr_t phys, size_t offset,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ struct iommu_domain *domain = iommu_get_dma_domain(dev);
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ struct iova_domain *iovad = &cookie->iovad;
+ size_t iova_start_pad = iova_offset(iovad, phys);
+ size_t iova_end_pad = iova_offset(iovad, phys + size);
+ dma_addr_t addr = state->addr + offset;
+ size_t mapped = 0;
+ int error;
+
+ if (iova_start_pad) {
+ size_t bounce_len = min(size, iovad->granule - iova_start_pad);
+
+ error = iommu_dma_iova_bounce_and_link(dev, addr, phys,
+ bounce_len, dir, attrs, iova_start_pad);
+ if (error)
+ return error;
+ state->__size |= DMA_IOVA_USE_SWIOTLB;
+
+ mapped += bounce_len;
+ size -= bounce_len;
+ if (!size)
+ return 0;
+ }
+
+ size -= iova_end_pad;
+ error = __dma_iova_link(dev, addr + mapped, phys + mapped, size, dir,
+ attrs);
+ if (error)
+ goto out_unmap;
+ mapped += size;
+
+ if (iova_end_pad) {
+ error = iommu_dma_iova_bounce_and_link(dev, addr + mapped,
+ phys + mapped, iova_end_pad, dir, attrs, 0);
+ if (error)
+ goto out_unmap;
+ state->__size |= DMA_IOVA_USE_SWIOTLB;
+ }
+
+ return 0;
+
+out_unmap:
+ dma_iova_unlink(dev, state, 0, mapped, dir, attrs);
+ return error;
+}
+
+/**
+ * dma_iova_link - Link a range of IOVA space
+ * @dev: DMA device
+ * @state: IOVA state
+ * @phys: physical address to link
+ * @offset: offset into the IOVA state to map into
+ * @size: size of the buffer
+ * @dir: DMA direction
+ * @attrs: attributes of mapping properties
+ *
+ * Link a range of IOVA space for the given IOVA state without IOTLB sync.
+ * This function is used to link multiple physical addresses in contiguous
+ * IOVA space without performing costly IOTLB sync.
+ *
+ * The caller is responsible to call to dma_iova_sync() to sync IOTLB at
+ * the end of linkage.
+ */
+int dma_iova_link(struct device *dev, struct dma_iova_state *state,
+ phys_addr_t phys, size_t offset, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ struct iommu_domain *domain = iommu_get_dma_domain(dev);
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ struct iova_domain *iovad = &cookie->iovad;
+ size_t iova_start_pad = iova_offset(iovad, phys);
+
+ if (WARN_ON_ONCE(iova_start_pad && offset > 0))
+ return -EIO;
+
+ if (dev_use_swiotlb(dev, size, dir) &&
+ iova_unaligned(iovad, phys, size)) {
+ if (attrs & DMA_ATTR_MMIO)
+ return -EPERM;
+
+ return iommu_dma_iova_link_swiotlb(dev, state, phys, offset,
+ size, dir, attrs);
+ }
+
+ return __dma_iova_link(dev, state->addr + offset - iova_start_pad,
+ phys - iova_start_pad,
+ iova_align(iovad, size + iova_start_pad), dir, attrs);
+}
+EXPORT_SYMBOL_GPL(dma_iova_link);
+
+/**
+ * dma_iova_sync - Sync IOTLB
+ * @dev: DMA device
+ * @state: IOVA state
+ * @offset: offset into the IOVA state to sync
+ * @size: size of the buffer
+ *
+ * Sync IOTLB for the given IOVA state. This function should be called on
+ * the IOVA-contiguous range created by one ore more dma_iova_link() calls
+ * to sync the IOTLB.
+ */
+int dma_iova_sync(struct device *dev, struct dma_iova_state *state,
+ size_t offset, size_t size)
+{
+ struct iommu_domain *domain = iommu_get_dma_domain(dev);
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ struct iova_domain *iovad = &cookie->iovad;
+ dma_addr_t addr = state->addr + offset;
+ size_t iova_start_pad = iova_offset(iovad, addr);
+
+ return iommu_sync_map(domain, addr - iova_start_pad,
+ iova_align(iovad, size + iova_start_pad));
+}
+EXPORT_SYMBOL_GPL(dma_iova_sync);
+
+static void iommu_dma_iova_unlink_range_slow(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ struct iommu_domain *domain = iommu_get_dma_domain(dev);
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ struct iova_domain *iovad = &cookie->iovad;
+ size_t iova_start_pad = iova_offset(iovad, addr);
+ dma_addr_t end = addr + size;
+
+ do {
+ phys_addr_t phys;
+ size_t len;
+
+ phys = iommu_iova_to_phys(domain, addr);
+ if (WARN_ON(!phys))
+ /* Something very horrible happen here */
+ return;
+
+ len = min_t(size_t,
+ end - addr, iovad->granule - iova_start_pad);
+
+ if (!dev_is_dma_coherent(dev) &&
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
+ arch_sync_dma_for_cpu(phys, len, dir);
+
+ swiotlb_tbl_unmap_single(dev, phys, len, dir, attrs);
+
+ addr += len;
+ iova_start_pad = 0;
+ } while (addr < end);
+}
+
+static void __iommu_dma_iova_unlink(struct device *dev,
+ struct dma_iova_state *state, size_t offset, size_t size,
+ enum dma_data_direction dir, unsigned long attrs,
+ bool free_iova)
+{
+ struct iommu_domain *domain = iommu_get_dma_domain(dev);
+ struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ struct iova_domain *iovad = &cookie->iovad;
+ dma_addr_t addr = state->addr + offset;
+ size_t iova_start_pad = iova_offset(iovad, addr);
+ struct iommu_iotlb_gather iotlb_gather;
+ size_t unmapped;
+
+ if ((state->__size & DMA_IOVA_USE_SWIOTLB) ||
+ (!dev_is_dma_coherent(dev) &&
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))))
+ iommu_dma_iova_unlink_range_slow(dev, addr, size, dir, attrs);
+
+ iommu_iotlb_gather_init(&iotlb_gather);
+ iotlb_gather.queued = free_iova && READ_ONCE(cookie->fq_domain);
+
+ size = iova_align(iovad, size + iova_start_pad);
+ addr -= iova_start_pad;
+ unmapped = iommu_unmap_fast(domain, addr, size, &iotlb_gather);
+ WARN_ON(unmapped != size);
+
+ if (!iotlb_gather.queued)
+ iommu_iotlb_sync(domain, &iotlb_gather);
+ if (free_iova)
+ iommu_dma_free_iova(domain, addr, size, &iotlb_gather);
+}
+
+/**
+ * dma_iova_unlink - Unlink a range of IOVA space
+ * @dev: DMA device
+ * @state: IOVA state
+ * @offset: offset into the IOVA state to unlink
+ * @size: size of the buffer
+ * @dir: DMA direction
+ * @attrs: attributes of mapping properties
+ *
+ * Unlink a range of IOVA space for the given IOVA state.
+ */
+void dma_iova_unlink(struct device *dev, struct dma_iova_state *state,
+ size_t offset, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ __iommu_dma_iova_unlink(dev, state, offset, size, dir, attrs, false);
+}
+EXPORT_SYMBOL_GPL(dma_iova_unlink);
+
+/**
+ * dma_iova_destroy - Finish a DMA mapping transaction
+ * @dev: DMA device
+ * @state: IOVA state
+ * @mapped_len: number of bytes to unmap
+ * @dir: DMA direction
+ * @attrs: attributes of mapping properties
+ *
+ * Unlink the IOVA range up to @mapped_len and free the entire IOVA space. The
+ * range of IOVA from dma_addr to @mapped_len must all be linked, and be the
+ * only linked IOVA in state.
+ */
+void dma_iova_destroy(struct device *dev, struct dma_iova_state *state,
+ size_t mapped_len, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ if (mapped_len)
+ __iommu_dma_iova_unlink(dev, state, 0, mapped_len, dir, attrs,
+ true);
+ else
+ /*
+ * We can be here if first call to dma_iova_link() failed and
+ * there is nothing to unlink, so let's be more clear.
+ */
+ dma_iova_free(dev, state);
+}
+EXPORT_SYMBOL_GPL(dma_iova_destroy);
void iommu_setup_dma_ops(struct device *dev)
{
@@ -1753,32 +2104,58 @@ void iommu_setup_dma_ops(struct device *dev)
if (dev_is_pci(dev))
dev->iommu->pci_32bit_workaround = !iommu_dma_forcedac;
- if (iommu_is_dma_domain(domain)) {
- if (iommu_dma_init_domain(domain, dev))
- goto out_err;
- dev->dma_ops = &iommu_dma_ops;
- } else if (dev->dma_ops == &iommu_dma_ops) {
- /* Clean up if we've switched *from* a DMA domain */
- dev->dma_ops = NULL;
- }
+ dev->dma_iommu = iommu_is_dma_domain(domain);
+ if (dev->dma_iommu && iommu_dma_init_domain(domain, dev))
+ goto out_err;
return;
out_err:
- pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n",
- dev_name(dev));
+ pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n",
+ dev_name(dev));
+ dev->dma_iommu = false;
+}
+
+static bool has_msi_cookie(const struct iommu_domain *domain)
+{
+ return domain && (domain->cookie_type == IOMMU_COOKIE_DMA_IOVA ||
+ domain->cookie_type == IOMMU_COOKIE_DMA_MSI);
+}
+
+static size_t cookie_msi_granule(const struct iommu_domain *domain)
+{
+ switch (domain->cookie_type) {
+ case IOMMU_COOKIE_DMA_IOVA:
+ return domain->iova_cookie->iovad.granule;
+ case IOMMU_COOKIE_DMA_MSI:
+ return PAGE_SIZE;
+ default:
+ BUG();
+ }
+}
+
+static struct list_head *cookie_msi_pages(const struct iommu_domain *domain)
+{
+ switch (domain->cookie_type) {
+ case IOMMU_COOKIE_DMA_IOVA:
+ return &domain->iova_cookie->msi_page_list;
+ case IOMMU_COOKIE_DMA_MSI:
+ return &domain->msi_cookie->msi_page_list;
+ default:
+ BUG();
+ }
}
static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
phys_addr_t msi_addr, struct iommu_domain *domain)
{
- struct iommu_dma_cookie *cookie = domain->iova_cookie;
+ struct list_head *msi_page_list = cookie_msi_pages(domain);
struct iommu_dma_msi_page *msi_page;
dma_addr_t iova;
int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
- size_t size = cookie_msi_granule(cookie);
+ size_t size = cookie_msi_granule(domain);
msi_addr &= ~(phys_addr_t)(size - 1);
- list_for_each_entry(msi_page, &cookie->msi_page_list, list)
+ list_for_each_entry(msi_page, msi_page_list, list)
if (msi_page->phys == msi_addr)
return msi_page;
@@ -1796,70 +2173,35 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
INIT_LIST_HEAD(&msi_page->list);
msi_page->phys = msi_addr;
msi_page->iova = iova;
- list_add(&msi_page->list, &cookie->msi_page_list);
+ list_add(&msi_page->list, msi_page_list);
return msi_page;
out_free_iova:
- iommu_dma_free_iova(cookie, iova, size, NULL);
+ iommu_dma_free_iova(domain, iova, size, NULL);
out_free_page:
kfree(msi_page);
return NULL;
}
-/**
- * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain
- * @desc: MSI descriptor, will store the MSI page
- * @msi_addr: MSI target address to be mapped
- *
- * Return: 0 on success or negative error code if the mapping failed.
- */
-int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
+int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
+ phys_addr_t msi_addr)
{
struct device *dev = msi_desc_to_dev(desc);
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
- struct iommu_dma_msi_page *msi_page;
- static DEFINE_MUTEX(msi_prepare_lock); /* see below */
+ const struct iommu_dma_msi_page *msi_page;
- if (!domain || !domain->iova_cookie) {
- desc->iommu_cookie = NULL;
+ if (!has_msi_cookie(domain)) {
+ msi_desc_set_iommu_msi_iova(desc, 0, 0);
return 0;
}
- /*
- * In fact the whole prepare operation should already be serialised by
- * irq_domain_mutex further up the callchain, but that's pretty subtle
- * on its own, so consider this locking as failsafe documentation...
- */
- mutex_lock(&msi_prepare_lock);
+ iommu_group_mutex_assert(dev);
msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain);
- mutex_unlock(&msi_prepare_lock);
-
- msi_desc_set_iommu_cookie(desc, msi_page);
-
if (!msi_page)
return -ENOMEM;
- return 0;
-}
-
-/**
- * iommu_dma_compose_msi_msg() - Apply translation to an MSI message
- * @desc: MSI descriptor prepared by iommu_dma_prepare_msi()
- * @msg: MSI message containing target physical address
- */
-void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
-{
- struct device *dev = msi_desc_to_dev(desc);
- const struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
- const struct iommu_dma_msi_page *msi_page;
- msi_page = msi_desc_get_iommu_cookie(desc);
-
- if (!domain || !domain->iova_cookie || WARN_ON(!msi_page))
- return;
-
- msg->address_hi = upper_32_bits(msi_page->iova);
- msg->address_lo &= cookie_msi_granule(domain->iova_cookie) - 1;
- msg->address_lo += lower_32_bits(msi_page->iova);
+ msi_desc_set_iommu_msi_iova(desc, msi_page->iova,
+ ilog2(cookie_msi_granule(domain)));
+ return 0;
}
static int iommu_dma_init(void)
diff --git a/drivers/iommu/dma-iommu.h b/drivers/iommu/dma-iommu.h
index c12d63457c76..eca201c1f963 100644
--- a/drivers/iommu/dma-iommu.h
+++ b/drivers/iommu/dma-iommu.h
@@ -13,11 +13,15 @@ void iommu_setup_dma_ops(struct device *dev);
int iommu_get_dma_cookie(struct iommu_domain *domain);
void iommu_put_dma_cookie(struct iommu_domain *domain);
+void iommu_put_msi_cookie(struct iommu_domain *domain);
int iommu_dma_init_fq(struct iommu_domain *domain);
void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list);
+int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
+ phys_addr_t msi_addr);
+
extern bool iommu_dma_forcedac;
#else /* CONFIG_IOMMU_DMA */
@@ -40,9 +44,19 @@ static inline void iommu_put_dma_cookie(struct iommu_domain *domain)
{
}
+static inline void iommu_put_msi_cookie(struct iommu_domain *domain)
+{
+}
+
static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)
{
}
+static inline int iommu_dma_sw_msi(struct iommu_domain *domain,
+ struct msi_desc *desc, phys_addr_t msi_addr)
+{
+ return -ENODEV;
+}
+
#endif /* CONFIG_IOMMU_DMA */
#endif /* __DMA_IOMMU_H */
diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c
index c666ecab955d..b512c6b939ac 100644
--- a/drivers/iommu/exynos-iommu.c
+++ b/drivers/iommu/exynos-iommu.c
@@ -22,6 +22,7 @@
#include <linux/pm_runtime.h>
#include <linux/slab.h>
+#include "dma-iommu.h"
#include "iommu-pages.h"
typedef u32 sysmmu_iova_t;
@@ -249,7 +250,7 @@ struct exynos_iommu_domain {
struct list_head clients; /* list of sysmmu_drvdata.domain_node */
sysmmu_pte_t *pgtable; /* lv1 page table, 16KB */
short *lv2entcnt; /* free lv2 entry counter for each section */
- spinlock_t lock; /* lock for modyfying list of clients */
+ spinlock_t lock; /* lock for modifying list of clients */
spinlock_t pgtablelock; /* lock for modifying page table @ pgtable */
struct iommu_domain domain; /* generic domain data structure */
};
@@ -292,7 +293,7 @@ struct sysmmu_drvdata {
struct clk *aclk; /* SYSMMU's aclk clock */
struct clk *pclk; /* SYSMMU's pclk clock */
struct clk *clk_master; /* master's device clock */
- spinlock_t lock; /* lock for modyfying state */
+ spinlock_t lock; /* lock for modifying state */
bool active; /* current status */
struct exynos_iommu_domain *domain; /* domain we belong to */
struct list_head domain_node; /* node for domain clients list */
@@ -746,7 +747,7 @@ static int exynos_sysmmu_probe(struct platform_device *pdev)
ret = devm_request_irq(dev, irq, exynos_sysmmu_irq, 0,
dev_name(dev), data);
if (ret) {
- dev_err(dev, "Unabled to register handler of irq %d\n", irq);
+ dev_err(dev, "Unable to register handler of irq %d\n", irq);
return ret;
}
@@ -832,7 +833,7 @@ static int __maybe_unused exynos_sysmmu_suspend(struct device *dev)
struct exynos_iommu_owner *owner = dev_iommu_priv_get(master);
mutex_lock(&owner->rpm_lock);
- if (&data->domain->domain != &exynos_identity_domain) {
+ if (data->domain) {
dev_dbg(data->sysmmu, "saving state\n");
__sysmmu_disable(data);
}
@@ -850,7 +851,7 @@ static int __maybe_unused exynos_sysmmu_resume(struct device *dev)
struct exynos_iommu_owner *owner = dev_iommu_priv_get(master);
mutex_lock(&owner->rpm_lock);
- if (&data->domain->domain != &exynos_identity_domain) {
+ if (data->domain) {
dev_dbg(data->sysmmu, "restoring state\n");
__sysmmu_enable(data);
}
@@ -902,11 +903,11 @@ static struct iommu_domain *exynos_iommu_domain_alloc_paging(struct device *dev)
if (!domain)
return NULL;
- domain->pgtable = iommu_alloc_pages(GFP_KERNEL, 2);
+ domain->pgtable = iommu_alloc_pages_sz(GFP_KERNEL, SZ_16K);
if (!domain->pgtable)
goto err_pgtable;
- domain->lv2entcnt = iommu_alloc_pages(GFP_KERNEL, 1);
+ domain->lv2entcnt = iommu_alloc_pages_sz(GFP_KERNEL, SZ_8K);
if (!domain->lv2entcnt)
goto err_counter;
@@ -925,6 +926,8 @@ static struct iommu_domain *exynos_iommu_domain_alloc_paging(struct device *dev)
spin_lock_init(&domain->pgtablelock);
INIT_LIST_HEAD(&domain->clients);
+ domain->domain.pgsize_bitmap = SECT_SIZE | LPAGE_SIZE | SPAGE_SIZE;
+
domain->domain.geometry.aperture_start = 0;
domain->domain.geometry.aperture_end = ~0UL;
domain->domain.geometry.force_aperture = true;
@@ -932,9 +935,9 @@ static struct iommu_domain *exynos_iommu_domain_alloc_paging(struct device *dev)
return &domain->domain;
err_lv2ent:
- iommu_free_pages(domain->lv2entcnt, 1);
+ iommu_free_pages(domain->lv2entcnt);
err_counter:
- iommu_free_pages(domain->pgtable, 2);
+ iommu_free_pages(domain->pgtable);
err_pgtable:
kfree(domain);
return NULL;
@@ -975,13 +978,14 @@ static void exynos_iommu_domain_free(struct iommu_domain *iommu_domain)
phys_to_virt(base));
}
- iommu_free_pages(domain->pgtable, 2);
- iommu_free_pages(domain->lv2entcnt, 1);
+ iommu_free_pages(domain->pgtable);
+ iommu_free_pages(domain->lv2entcnt);
kfree(domain);
}
static int exynos_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
struct exynos_iommu_domain *domain;
@@ -1032,7 +1036,8 @@ static struct iommu_domain exynos_identity_domain = {
};
static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain);
struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
@@ -1041,7 +1046,7 @@ static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain,
unsigned long flags;
int err;
- err = exynos_iommu_identity_attach(&exynos_identity_domain, dev);
+ err = exynos_iommu_identity_attach(&exynos_identity_domain, dev, old);
if (err)
return err;
@@ -1426,8 +1431,6 @@ static void exynos_iommu_release_device(struct device *dev)
struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev);
struct sysmmu_drvdata *data;
- WARN_ON(exynos_iommu_identity_attach(&exynos_identity_domain, dev));
-
list_for_each_entry(data, &owner->controllers, owner_node)
device_link_del(data->link);
}
@@ -1443,17 +1446,14 @@ static int exynos_iommu_of_xlate(struct device *dev,
return -ENODEV;
data = platform_get_drvdata(sysmmu);
- if (!data) {
- put_device(&sysmmu->dev);
+ put_device(&sysmmu->dev);
+ if (!data)
return -ENODEV;
- }
if (!owner) {
owner = kzalloc(sizeof(*owner), GFP_KERNEL);
- if (!owner) {
- put_device(&sysmmu->dev);
+ if (!owner)
return -ENOMEM;
- }
INIT_LIST_HEAD(&owner->controllers);
mutex_init(&owner->rpm_lock);
@@ -1473,11 +1473,12 @@ static int exynos_iommu_of_xlate(struct device *dev,
static const struct iommu_ops exynos_iommu_ops = {
.identity_domain = &exynos_identity_domain,
+ .release_domain = &exynos_identity_domain,
.domain_alloc_paging = exynos_iommu_domain_alloc_paging,
.device_group = generic_device_group,
.probe_device = exynos_iommu_probe_device,
.release_device = exynos_iommu_release_device,
- .pgsize_bitmap = SECT_SIZE | LPAGE_SIZE | SPAGE_SIZE,
+ .get_resv_regions = iommu_dma_get_resv_regions,
.of_xlate = exynos_iommu_of_xlate,
.default_domain_ops = &(const struct iommu_domain_ops) {
.attach_dev = exynos_iommu_attach_device,
diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c
index e9d2bff4659b..9664ef9840d2 100644
--- a/drivers/iommu/fsl_pamu_domain.c
+++ b/drivers/iommu/fsl_pamu_domain.c
@@ -64,7 +64,7 @@ static int update_liodn_stash(int liodn, struct fsl_dma_domain *dma_domain,
spin_lock_irqsave(&iommu_lock, flags);
ret = pamu_update_paace_stash(liodn, val);
if (ret) {
- pr_debug("Failed to update SPAACE for liodn %d\n ", liodn);
+ pr_debug("Failed to update SPAACE for liodn %d\n", liodn);
spin_unlock_irqrestore(&iommu_lock, flags);
return ret;
}
@@ -238,7 +238,7 @@ static int update_domain_stash(struct fsl_dma_domain *dma_domain, u32 val)
}
static int fsl_pamu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
unsigned long flags;
@@ -298,9 +298,9 @@ static int fsl_pamu_attach_device(struct iommu_domain *domain,
* switches to what looks like BLOCKING.
*/
static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct fsl_dma_domain *dma_domain;
const u32 *prop;
int len;
@@ -311,11 +311,11 @@ static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain,
* Hack to keep things working as they always have, only leaving an
* UNMANAGED domain makes it BLOCKING.
*/
- if (domain == platform_domain || !domain ||
- domain->type != IOMMU_DOMAIN_UNMANAGED)
+ if (old == platform_domain || !old ||
+ old->type != IOMMU_DOMAIN_UNMANAGED)
return 0;
- dma_domain = to_fsl_dma_domain(domain);
+ dma_domain = to_fsl_dma_domain(old);
/*
* Use LIODN of the PCI controller while detaching a
@@ -416,14 +416,12 @@ static struct iommu_group *fsl_pamu_device_group(struct device *dev)
static struct iommu_device *fsl_pamu_probe_device(struct device *dev)
{
- int len;
-
/*
* uboot must fill the fsl,liodn for platform devices to be supported by
* the iommu.
*/
if (!dev_is_pci(dev) &&
- !of_get_property(dev->of_node, "fsl,liodn", &len))
+ !of_property_present(dev->of_node, "fsl,liodn"))
return ERR_PTR(-ENODEV);
return &pamu_iommu;
diff --git a/drivers/iommu/generic_pt/.kunitconfig b/drivers/iommu/generic_pt/.kunitconfig
new file mode 100644
index 000000000000..52ac9e661ffd
--- /dev/null
+++ b/drivers/iommu/generic_pt/.kunitconfig
@@ -0,0 +1,14 @@
+CONFIG_KUNIT=y
+CONFIG_GENERIC_PT=y
+CONFIG_DEBUG_GENERIC_PT=y
+CONFIG_IOMMU_PT=y
+CONFIG_IOMMU_PT_AMDV1=y
+CONFIG_IOMMU_PT_VTDSS=y
+CONFIG_IOMMU_PT_X86_64=y
+CONFIG_IOMMU_PT_KUNIT_TEST=y
+
+CONFIG_IOMMUFD=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_FAULT_INJECTION=y
+CONFIG_RUNTIME_TESTING_MENU=y
+CONFIG_IOMMUFD_TEST=y
diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig
new file mode 100644
index 000000000000..ce4fb4786914
--- /dev/null
+++ b/drivers/iommu/generic_pt/Kconfig
@@ -0,0 +1,79 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menuconfig GENERIC_PT
+ bool "Generic Radix Page Table" if COMPILE_TEST
+ help
+ Generic library for building radix tree page tables.
+
+ Generic PT provides a set of HW page table formats and a common
+ set of APIs to work with them.
+
+if GENERIC_PT
+config DEBUG_GENERIC_PT
+ bool "Extra debugging checks for GENERIC_PT"
+ help
+ Enable extra run time debugging checks for GENERIC_PT code. This
+ incurs a runtime cost and should not be enabled for production
+ kernels.
+
+ The kunit tests require this to be enabled to get full coverage.
+
+config IOMMU_PT
+ tristate "IOMMU Page Tables"
+ select IOMMU_API
+ depends on IOMMU_SUPPORT
+ depends on GENERIC_PT
+ help
+ Generic library for building IOMMU page tables
+
+ IOMMU_PT provides an implementation of the page table operations
+ related to struct iommu_domain using GENERIC_PT. It provides a single
+ implementation of the page table operations that can be shared by
+ multiple drivers.
+
+if IOMMU_PT
+config IOMMU_PT_AMDV1
+ tristate "IOMMU page table for 64-bit AMD IOMMU v1"
+ depends on !GENERIC_ATOMIC64 # for cmpxchg64
+ help
+ iommu_domain implementation for the AMD v1 page table. AMDv1 is the
+ "host" page table. It supports granular page sizes of almost every
+ power of 2 and decodes the full 64-bit IOVA space.
+
+ Selected automatically by an IOMMU driver that uses this format.
+
+config IOMMU_PT_VTDSS
+ tristate "IOMMU page table for Intel VT-d Second Stage"
+ depends on !GENERIC_ATOMIC64 # for cmpxchg64
+ help
+ iommu_domain implementation for the Intel VT-d's 64 bit 3/4/5
+ level Second Stage page table. It is similar to the X86_64 format with
+ 4K/2M/1G page sizes.
+
+ Selected automatically by an IOMMU driver that uses this format.
+
+config IOMMU_PT_X86_64
+ tristate "IOMMU page table for x86 64-bit, 4/5 levels"
+ depends on !GENERIC_ATOMIC64 # for cmpxchg64
+ help
+ iommu_domain implementation for the x86 64-bit 4/5 level page table.
+ It supports 4K/2M/1G page sizes and can decode a sign-extended
+ portion of the 64-bit IOVA space.
+
+ Selected automatically by an IOMMU driver that uses this format.
+
+config IOMMU_PT_KUNIT_TEST
+ tristate "IOMMU Page Table KUnit Test" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1
+ depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64
+ depends on IOMMU_PT_VTDSS || !IOMMU_PT_VTDSS
+ default KUNIT_ALL_TESTS
+ help
+ Enable kunit tests for GENERIC_PT and IOMMU_PT that covers all the
+ enabled page table formats. The test covers most of the GENERIC_PT
+ functions provided by the page table format, as well as covering the
+ iommu_domain related functions.
+
+endif
+endif
diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile
new file mode 100644
index 000000000000..976b49ec97dc
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/Makefile
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0
+
+iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1
+iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock
+
+iommu_pt_fmt-$(CONFIG_IOMMU_PT_VTDSS) += vtdss
+
+iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64
+
+IOMMU_PT_KUNIT_TEST :=
+define create_format
+obj-$(2) += iommu_$(1).o
+iommu_pt_kunit_test-y += kunit_iommu_$(1).o
+CFLAGS_kunit_iommu_$(1).o += -DGENERIC_PT_KUNIT=1
+IOMMU_PT_KUNIT_TEST := iommu_pt_kunit_test.o
+
+endef
+
+$(eval $(foreach fmt,$(iommu_pt_fmt-y),$(call create_format,$(fmt),y)))
+$(eval $(foreach fmt,$(iommu_pt_fmt-m),$(call create_format,$(fmt),m)))
+
+# The kunit objects are constructed by compiling the main source
+# with -DGENERIC_PT_KUNIT
+$(obj)/kunit_iommu_%.o: $(src)/iommu_%.c FORCE
+ $(call rule_mkdir)
+ $(call if_changed_dep,cc_o_c)
+
+obj-$(CONFIG_IOMMU_PT_KUNIT_TEST) += $(IOMMU_PT_KUNIT_TEST)
diff --git a/drivers/iommu/generic_pt/fmt/amdv1.h b/drivers/iommu/generic_pt/fmt/amdv1.h
new file mode 100644
index 000000000000..aa8e1a8ec95f
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/amdv1.h
@@ -0,0 +1,411 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * AMD IOMMU v1 page table
+ *
+ * This is described in Section "2.2.3 I/O Page Tables for Host Translations"
+ * of the "AMD I/O Virtualization Technology (IOMMU) Specification"
+ *
+ * Note the level numbering here matches the core code, so level 0 is the same
+ * as mode 1.
+ *
+ */
+#ifndef __GENERIC_PT_FMT_AMDV1_H
+#define __GENERIC_PT_FMT_AMDV1_H
+
+#include "defs_amdv1.h"
+#include "../pt_defs.h"
+
+#include <asm/page.h>
+#include <linux/bitfield.h>
+#include <linux/container_of.h>
+#include <linux/mem_encrypt.h>
+#include <linux/minmax.h>
+#include <linux/sizes.h>
+#include <linux/string.h>
+
+enum {
+ PT_ITEM_WORD_SIZE = sizeof(u64),
+ /*
+ * The IOMMUFD selftest uses the AMDv1 format with some alterations It
+ * uses a 2k page size to test cases where the CPU page size is not the
+ * same.
+ */
+#ifdef AMDV1_IOMMUFD_SELFTEST
+ PT_MAX_VA_ADDRESS_LG2 = 56,
+ PT_MAX_OUTPUT_ADDRESS_LG2 = 51,
+ PT_MAX_TOP_LEVEL = 4,
+ PT_GRANULE_LG2SZ = 11,
+#else
+ PT_MAX_VA_ADDRESS_LG2 = 64,
+ PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
+ PT_MAX_TOP_LEVEL = 5,
+ PT_GRANULE_LG2SZ = 12,
+#endif
+ PT_TABLEMEM_LG2SZ = 12,
+
+ /* The DTE only has these bits for the top phyiscal address */
+ PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
+};
+
+/* PTE bits */
+enum {
+ AMDV1PT_FMT_PR = BIT(0),
+ AMDV1PT_FMT_D = BIT(6),
+ AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9),
+ AMDV1PT_FMT_OA = GENMASK_ULL(51, 12),
+ AMDV1PT_FMT_FC = BIT_ULL(60),
+ AMDV1PT_FMT_IR = BIT_ULL(61),
+ AMDV1PT_FMT_IW = BIT_ULL(62),
+};
+
+/*
+ * gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum, make
+ * these defines to avoid it.
+ */
+#define AMDV1PT_FMT_NL_DEFAULT 0
+#define AMDV1PT_FMT_NL_SIZE 7
+
+static inline pt_oaddr_t amdv1pt_table_pa(const struct pt_state *pts)
+{
+ u64 entry = pts->entry;
+
+ if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+ entry = __sme_clr(entry);
+ return oalog2_mul(FIELD_GET(AMDV1PT_FMT_OA, entry), PT_GRANULE_LG2SZ);
+}
+#define pt_table_pa amdv1pt_table_pa
+
+/* Returns the oa for the start of the contiguous entry */
+static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts)
+{
+ u64 entry = pts->entry;
+ pt_oaddr_t oa;
+
+ if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+ entry = __sme_clr(entry);
+ oa = FIELD_GET(AMDV1PT_FMT_OA, entry);
+
+ if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) == AMDV1PT_FMT_NL_SIZE) {
+ unsigned int sz_bits = oaffz(oa);
+
+ oa = oalog2_set_mod(oa, 0, sz_bits);
+ } else if (PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) !=
+ AMDV1PT_FMT_NL_DEFAULT))
+ return 0;
+ return oalog2_mul(oa, PT_GRANULE_LG2SZ);
+}
+#define pt_entry_oa amdv1pt_entry_oa
+
+static inline bool amdv1pt_can_have_leaf(const struct pt_state *pts)
+{
+ /*
+ * Table 15: Page Table Level Parameters
+ * The top most level cannot have translation entries
+ */
+ return pts->level < PT_MAX_TOP_LEVEL;
+}
+#define pt_can_have_leaf amdv1pt_can_have_leaf
+
+/* Body in pt_fmt_defaults.h */
+static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts);
+
+static inline unsigned int
+amdv1pt_entry_num_contig_lg2(const struct pt_state *pts)
+{
+ u32 code;
+
+ if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) ==
+ AMDV1PT_FMT_NL_DEFAULT)
+ return ilog2(1);
+
+ PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) !=
+ AMDV1PT_FMT_NL_SIZE);
+
+ /*
+ * The contiguous size is encoded in the length of a string of 1's in
+ * the low bits of the OA. Reverse the equation:
+ * code = log2_to_int(num_contig_lg2 + item_lg2sz -
+ * PT_GRANULE_LG2SZ - 1) - 1
+ * Which can be expressed as:
+ * num_contig_lg2 = oalog2_ffz(code) + 1 -
+ * item_lg2sz - PT_GRANULE_LG2SZ
+ *
+ * Assume the bit layout is correct and remove the masking. Reorganize
+ * the equation to move all the arithmetic before the ffz.
+ */
+ code = pts->entry >> (__bf_shf(AMDV1PT_FMT_OA) - 1 +
+ pt_table_item_lg2sz(pts) - PT_GRANULE_LG2SZ);
+ return ffz_t(u32, code);
+}
+#define pt_entry_num_contig_lg2 amdv1pt_entry_num_contig_lg2
+
+static inline unsigned int amdv1pt_num_items_lg2(const struct pt_state *pts)
+{
+ /*
+ * Top entry covers bits [63:57] only, this is handled through
+ * max_vasz_lg2.
+ */
+ if (PT_WARN_ON(pts->level == 5))
+ return 7;
+ return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
+}
+#define pt_num_items_lg2 amdv1pt_num_items_lg2
+
+static inline pt_vaddr_t amdv1pt_possible_sizes(const struct pt_state *pts)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ if (!amdv1pt_can_have_leaf(pts))
+ return 0;
+
+ /*
+ * Table 14: Example Page Size Encodings
+ * Address bits 51:32 can be used to encode page sizes greater than 4
+ * Gbytes. Address bits 63:52 are zero-extended.
+ *
+ * 512GB Pages are not supported due to a hardware bug.
+ * Otherwise every power of two size is supported.
+ */
+ return GENMASK_ULL(min(51, isz_lg2 + amdv1pt_num_items_lg2(pts) - 1),
+ isz_lg2) & ~SZ_512G;
+}
+#define pt_possible_sizes amdv1pt_possible_sizes
+
+static inline enum pt_entry_type amdv1pt_load_entry_raw(struct pt_state *pts)
+{
+ const u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ unsigned int next_level;
+ u64 entry;
+
+ pts->entry = entry = READ_ONCE(*tablep);
+ if (!(entry & AMDV1PT_FMT_PR))
+ return PT_ENTRY_EMPTY;
+
+ next_level = FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry);
+ if (pts->level == 0 || next_level == AMDV1PT_FMT_NL_DEFAULT ||
+ next_level == AMDV1PT_FMT_NL_SIZE)
+ return PT_ENTRY_OA;
+ return PT_ENTRY_TABLE;
+}
+#define pt_load_entry_raw amdv1pt_load_entry_raw
+
+static inline void
+amdv1pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+ unsigned int oasz_lg2,
+ const struct pt_write_attrs *attrs)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 entry;
+
+ if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
+ return;
+
+ entry = AMDV1PT_FMT_PR |
+ FIELD_PREP(AMDV1PT_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
+ attrs->descriptor_bits;
+
+ if (oasz_lg2 == isz_lg2) {
+ entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
+ AMDV1PT_FMT_NL_DEFAULT);
+ WRITE_ONCE(*tablep, entry);
+ } else {
+ unsigned int num_contig_lg2 = oasz_lg2 - isz_lg2;
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
+ AMDV1PT_FMT_NL_SIZE) |
+ FIELD_PREP(AMDV1PT_FMT_OA,
+ oalog2_to_int(oasz_lg2 - PT_GRANULE_LG2SZ -
+ 1) -
+ 1);
+
+ /* See amdv1pt_clear_entries() */
+ if (num_contig_lg2 <= ilog2(32)) {
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, entry);
+ } else {
+ memset64(tablep, entry, log2_to_int(num_contig_lg2));
+ }
+ }
+ pts->entry = entry;
+}
+#define pt_install_leaf_entry amdv1pt_install_leaf_entry
+
+static inline bool amdv1pt_install_table(struct pt_state *pts,
+ pt_oaddr_t table_pa,
+ const struct pt_write_attrs *attrs)
+{
+ u64 entry;
+
+ /*
+ * IR and IW are ANDed from the table levels along with the PTE. We
+ * always control permissions from the PTE, so always set IR and IW for
+ * tables.
+ */
+ entry = AMDV1PT_FMT_PR |
+ FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, pts->level) |
+ FIELD_PREP(AMDV1PT_FMT_OA,
+ log2_div(table_pa, PT_GRANULE_LG2SZ)) |
+ AMDV1PT_FMT_IR | AMDV1PT_FMT_IW;
+ if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+ entry = __sme_set(entry);
+ return pt_table_install64(pts, entry);
+}
+#define pt_install_table amdv1pt_install_table
+
+static inline void amdv1pt_attr_from_entry(const struct pt_state *pts,
+ struct pt_write_attrs *attrs)
+{
+ attrs->descriptor_bits =
+ pts->entry & (AMDV1PT_FMT_FC | AMDV1PT_FMT_IR | AMDV1PT_FMT_IW);
+}
+#define pt_attr_from_entry amdv1pt_attr_from_entry
+
+static inline void amdv1pt_clear_entries(struct pt_state *pts,
+ unsigned int num_contig_lg2)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ /*
+ * gcc generates rep stos for the io-pgtable code, and this difference
+ * can show in microbenchmarks with larger contiguous page sizes.
+ * rep is slower for small cases.
+ */
+ if (num_contig_lg2 <= ilog2(32)) {
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, 0);
+ } else {
+ memset64(tablep, 0, log2_to_int(num_contig_lg2));
+ }
+}
+#define pt_clear_entries amdv1pt_clear_entries
+
+static inline bool amdv1pt_entry_is_write_dirty(const struct pt_state *pts)
+{
+ unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
+ u64 *tablep = pt_cur_table(pts, u64) +
+ log2_set_mod(pts->index, 0, num_contig_lg2);
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ for (; tablep != end; tablep++)
+ if (READ_ONCE(*tablep) & AMDV1PT_FMT_D)
+ return true;
+ return false;
+}
+#define pt_entry_is_write_dirty amdv1pt_entry_is_write_dirty
+
+static inline void amdv1pt_entry_make_write_clean(struct pt_state *pts)
+{
+ unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
+ u64 *tablep = pt_cur_table(pts, u64) +
+ log2_set_mod(pts->index, 0, num_contig_lg2);
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)AMDV1PT_FMT_D);
+}
+#define pt_entry_make_write_clean amdv1pt_entry_make_write_clean
+
+static inline bool amdv1pt_entry_make_write_dirty(struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 new = pts->entry | AMDV1PT_FMT_D;
+
+ return try_cmpxchg64(tablep, &pts->entry, new);
+}
+#define pt_entry_make_write_dirty amdv1pt_entry_make_write_dirty
+
+/* --- iommu */
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+#define pt_iommu_table pt_iommu_amdv1
+
+/* The common struct is in the per-format common struct */
+static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
+{
+ return &container_of(iommu_table, struct pt_iommu_amdv1, iommu)
+ ->amdpt.common;
+}
+
+static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
+{
+ return &container_of(common, struct pt_iommu_amdv1, amdpt.common)->iommu;
+}
+
+static inline int amdv1pt_iommu_set_prot(struct pt_common *common,
+ struct pt_write_attrs *attrs,
+ unsigned int iommu_prot)
+{
+ u64 pte = 0;
+
+ if (pt_feature(common, PT_FEAT_AMDV1_FORCE_COHERENCE))
+ pte |= AMDV1PT_FMT_FC;
+ if (iommu_prot & IOMMU_READ)
+ pte |= AMDV1PT_FMT_IR;
+ if (iommu_prot & IOMMU_WRITE)
+ pte |= AMDV1PT_FMT_IW;
+
+ /*
+ * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
+ * control this. For now if the tables use sme_set then so do the ptes.
+ */
+ if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES))
+ pte = __sme_set(pte);
+
+ attrs->descriptor_bits = pte;
+ return 0;
+}
+#define pt_iommu_set_prot amdv1pt_iommu_set_prot
+
+static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table,
+ const struct pt_iommu_amdv1_cfg *cfg)
+{
+ struct pt_amdv1 *table = &iommu_table->amdpt;
+ unsigned int max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2;
+
+ if (cfg->starting_level == 0 || cfg->starting_level > PT_MAX_TOP_LEVEL)
+ return -EINVAL;
+
+ if (!pt_feature(&table->common, PT_FEAT_DYNAMIC_TOP) &&
+ cfg->starting_level != PT_MAX_TOP_LEVEL)
+ max_vasz_lg2 = PT_GRANULE_LG2SZ +
+ (PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64))) *
+ (cfg->starting_level + 1);
+
+ table->common.max_vasz_lg2 =
+ min(max_vasz_lg2, cfg->common.hw_max_vasz_lg2);
+ table->common.max_oasz_lg2 =
+ min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
+ pt_top_set_level(&table->common, cfg->starting_level);
+ return 0;
+}
+#define pt_iommu_fmt_init amdv1pt_iommu_fmt_init
+
+#ifndef PT_FMT_VARIANT
+static inline void
+amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table,
+ const struct pt_range *top_range,
+ struct pt_iommu_amdv1_hw_info *info)
+{
+ info->host_pt_root = virt_to_phys(top_range->top_table);
+ PT_WARN_ON(info->host_pt_root & ~PT_TOP_PHYS_MASK);
+ info->mode = top_range->top_level + 1;
+}
+#define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info
+#endif
+
+#if defined(GENERIC_PT_KUNIT)
+static const struct pt_iommu_amdv1_cfg amdv1_kunit_fmt_cfgs[] = {
+ /* Matches what io_pgtable does */
+ [0] = { .starting_level = 2 },
+};
+#define kunit_fmt_cfgs amdv1_kunit_fmt_cfgs
+enum { KUNIT_FMT_FEATURES = 0 };
+#endif
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/defs_amdv1.h b/drivers/iommu/generic_pt/fmt/defs_amdv1.h
new file mode 100644
index 000000000000..0b9614ca6d10
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/defs_amdv1.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ */
+#ifndef __GENERIC_PT_FMT_DEFS_AMDV1_H
+#define __GENERIC_PT_FMT_DEFS_AMDV1_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/types.h>
+
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+
+struct amdv1pt_write_attrs {
+ u64 descriptor_bits;
+ gfp_t gfp;
+};
+#define pt_write_attrs amdv1pt_write_attrs
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/defs_vtdss.h b/drivers/iommu/generic_pt/fmt/defs_vtdss.h
new file mode 100644
index 000000000000..4a239bcaae2a
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/defs_vtdss.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ *
+ */
+#ifndef __GENERIC_PT_FMT_DEFS_VTDSS_H
+#define __GENERIC_PT_FMT_DEFS_VTDSS_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/types.h>
+
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+
+struct vtdss_pt_write_attrs {
+ u64 descriptor_bits;
+ gfp_t gfp;
+};
+#define pt_write_attrs vtdss_pt_write_attrs
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/defs_x86_64.h b/drivers/iommu/generic_pt/fmt/defs_x86_64.h
new file mode 100644
index 000000000000..6f589e1f55d3
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/defs_x86_64.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ */
+#ifndef __GENERIC_PT_FMT_DEFS_X86_64_H
+#define __GENERIC_PT_FMT_DEFS_X86_64_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/types.h>
+
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+
+struct x86_64_pt_write_attrs {
+ u64 descriptor_bits;
+ gfp_t gfp;
+};
+#define pt_write_attrs x86_64_pt_write_attrs
+
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/iommu_amdv1.c b/drivers/iommu/generic_pt/fmt/iommu_amdv1.c
new file mode 100644
index 000000000000..72a2337d0c55
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_amdv1.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#define PT_FMT amdv1
+#define PT_SUPPORTED_FEATURES \
+ (BIT(PT_FEAT_FULL_VA) | BIT(PT_FEAT_DYNAMIC_TOP) | \
+ BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \
+ BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \
+ BIT(PT_FEAT_AMDV1_FORCE_COHERENCE))
+#define PT_FORCE_ENABLED_FEATURES \
+ (BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \
+ BIT(PT_FEAT_AMDV1_FORCE_COHERENCE))
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/iommu_mock.c b/drivers/iommu/generic_pt/fmt/iommu_mock.c
new file mode 100644
index 000000000000..74e597cba9d9
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_mock.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#define AMDV1_IOMMUFD_SELFTEST 1
+#define PT_FMT amdv1
+#define PT_FMT_VARIANT mock
+#define PT_SUPPORTED_FEATURES 0
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/iommu_template.h b/drivers/iommu/generic_pt/fmt/iommu_template.h
new file mode 100644
index 000000000000..d28e86abdf2e
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_template.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Template to build the iommu module and kunit from the format and
+ * implementation headers.
+ *
+ * The format should have:
+ * #define PT_FMT <name>
+ * #define PT_SUPPORTED_FEATURES (BIT(PT_FEAT_xx) | BIT(PT_FEAT_yy))
+ * And optionally:
+ * #define PT_FORCE_ENABLED_FEATURES ..
+ * #define PT_FMT_VARIANT <suffix>
+ */
+#include <linux/args.h>
+#include <linux/stringify.h>
+
+#ifdef PT_FMT_VARIANT
+#define PTPFX_RAW \
+ CONCATENATE(CONCATENATE(PT_FMT, _), PT_FMT_VARIANT)
+#else
+#define PTPFX_RAW PT_FMT
+#endif
+
+#define PTPFX CONCATENATE(PTPFX_RAW, _)
+
+#define _PT_FMT_H PT_FMT.h
+#define PT_FMT_H __stringify(_PT_FMT_H)
+
+#define _PT_DEFS_H CONCATENATE(defs_, _PT_FMT_H)
+#define PT_DEFS_H __stringify(_PT_DEFS_H)
+
+#include <linux/generic_pt/common.h>
+#include PT_DEFS_H
+#include "../pt_defs.h"
+#include PT_FMT_H
+#include "../pt_common.h"
+
+#ifndef GENERIC_PT_KUNIT
+#include "../iommu_pt.h"
+#else
+/*
+ * The makefile will compile the .c file twice, once with GENERIC_PT_KUNIT set
+ * which means we are building the kunit modle.
+ */
+#include "../kunit_generic_pt.h"
+#include "../kunit_iommu_pt.h"
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/iommu_vtdss.c b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c
new file mode 100644
index 000000000000..f551711e2a33
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#define PT_FMT vtdss
+#define PT_SUPPORTED_FEATURES \
+ (BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_VTDSS_FORCE_COHERENCE) | \
+ BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) | BIT(PT_FEAT_DMA_INCOHERENT))
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/iommu_x86_64.c b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c
new file mode 100644
index 000000000000..5472660c2d71
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#define PT_FMT x86_64
+#define PT_SUPPORTED_FEATURES \
+ (BIT(PT_FEAT_SIGN_EXTEND) | BIT(PT_FEAT_FLUSH_RANGE) | \
+ BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \
+ BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES) | BIT(PT_FEAT_DMA_INCOHERENT))
+
+#include "iommu_template.h"
diff --git a/drivers/iommu/generic_pt/fmt/vtdss.h b/drivers/iommu/generic_pt/fmt/vtdss.h
new file mode 100644
index 000000000000..f5f8981edde7
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/vtdss.h
@@ -0,0 +1,285 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Intel VT-d Second Stange 5/4 level page table
+ *
+ * This is described in
+ * Section "3.7 Second-Stage Translation"
+ * Section "9.8 Second-Stage Paging Entries"
+ *
+ * Of the "Intel Virtualization Technology for Directed I/O Architecture
+ * Specification".
+ *
+ * The named levels in the spec map to the pts->level as:
+ * Table/SS-PTE - 0
+ * Directory/SS-PDE - 1
+ * Directory Ptr/SS-PDPTE - 2
+ * PML4/SS-PML4E - 3
+ * PML5/SS-PML5E - 4
+ */
+#ifndef __GENERIC_PT_FMT_VTDSS_H
+#define __GENERIC_PT_FMT_VTDSS_H
+
+#include "defs_vtdss.h"
+#include "../pt_defs.h"
+
+#include <linux/bitfield.h>
+#include <linux/container_of.h>
+#include <linux/log2.h>
+
+enum {
+ PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
+ PT_MAX_VA_ADDRESS_LG2 = 57,
+ PT_ITEM_WORD_SIZE = sizeof(u64),
+ PT_MAX_TOP_LEVEL = 4,
+ PT_GRANULE_LG2SZ = 12,
+ PT_TABLEMEM_LG2SZ = 12,
+
+ /* SSPTPTR is 4k aligned and limited by HAW */
+ PT_TOP_PHYS_MASK = GENMASK_ULL(63, 12),
+};
+
+/* Shared descriptor bits */
+enum {
+ VTDSS_FMT_R = BIT(0),
+ VTDSS_FMT_W = BIT(1),
+ VTDSS_FMT_A = BIT(8),
+ VTDSS_FMT_D = BIT(9),
+ VTDSS_FMT_SNP = BIT(11),
+ VTDSS_FMT_OA = GENMASK_ULL(51, 12),
+};
+
+/* PDPTE/PDE */
+enum {
+ VTDSS_FMT_PS = BIT(7),
+};
+
+#define common_to_vtdss_pt(common_ptr) \
+ container_of_const(common_ptr, struct pt_vtdss, common)
+#define to_vtdss_pt(pts) common_to_vtdss_pt((pts)->range->common)
+
+static inline pt_oaddr_t vtdss_pt_table_pa(const struct pt_state *pts)
+{
+ return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry),
+ PT_TABLEMEM_LG2SZ);
+}
+#define pt_table_pa vtdss_pt_table_pa
+
+static inline pt_oaddr_t vtdss_pt_entry_oa(const struct pt_state *pts)
+{
+ return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry),
+ PT_GRANULE_LG2SZ);
+}
+#define pt_entry_oa vtdss_pt_entry_oa
+
+static inline bool vtdss_pt_can_have_leaf(const struct pt_state *pts)
+{
+ return pts->level <= 2;
+}
+#define pt_can_have_leaf vtdss_pt_can_have_leaf
+
+static inline unsigned int vtdss_pt_num_items_lg2(const struct pt_state *pts)
+{
+ return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
+}
+#define pt_num_items_lg2 vtdss_pt_num_items_lg2
+
+static inline enum pt_entry_type vtdss_pt_load_entry_raw(struct pt_state *pts)
+{
+ const u64 *tablep = pt_cur_table(pts, u64);
+ u64 entry;
+
+ pts->entry = entry = READ_ONCE(tablep[pts->index]);
+ if (!entry)
+ return PT_ENTRY_EMPTY;
+ if (pts->level == 0 ||
+ (vtdss_pt_can_have_leaf(pts) && (pts->entry & VTDSS_FMT_PS)))
+ return PT_ENTRY_OA;
+ return PT_ENTRY_TABLE;
+}
+#define pt_load_entry_raw vtdss_pt_load_entry_raw
+
+static inline void
+vtdss_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+ unsigned int oasz_lg2,
+ const struct pt_write_attrs *attrs)
+{
+ u64 *tablep = pt_cur_table(pts, u64);
+ u64 entry;
+
+ if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
+ return;
+
+ entry = FIELD_PREP(VTDSS_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
+ attrs->descriptor_bits;
+ if (pts->level != 0)
+ entry |= VTDSS_FMT_PS;
+
+ WRITE_ONCE(tablep[pts->index], entry);
+ pts->entry = entry;
+}
+#define pt_install_leaf_entry vtdss_pt_install_leaf_entry
+
+static inline bool vtdss_pt_install_table(struct pt_state *pts,
+ pt_oaddr_t table_pa,
+ const struct pt_write_attrs *attrs)
+{
+ u64 entry;
+
+ entry = VTDSS_FMT_R | VTDSS_FMT_W |
+ FIELD_PREP(VTDSS_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ));
+ return pt_table_install64(pts, entry);
+}
+#define pt_install_table vtdss_pt_install_table
+
+static inline void vtdss_pt_attr_from_entry(const struct pt_state *pts,
+ struct pt_write_attrs *attrs)
+{
+ attrs->descriptor_bits = pts->entry &
+ (VTDSS_FMT_R | VTDSS_FMT_W | VTDSS_FMT_SNP);
+}
+#define pt_attr_from_entry vtdss_pt_attr_from_entry
+
+static inline bool vtdss_pt_entry_is_write_dirty(const struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+
+ return READ_ONCE(*tablep) & VTDSS_FMT_D;
+}
+#define pt_entry_is_write_dirty vtdss_pt_entry_is_write_dirty
+
+static inline void vtdss_pt_entry_make_write_clean(struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+
+ WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)VTDSS_FMT_D);
+}
+#define pt_entry_make_write_clean vtdss_pt_entry_make_write_clean
+
+static inline bool vtdss_pt_entry_make_write_dirty(struct pt_state *pts)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 new = pts->entry | VTDSS_FMT_D;
+
+ return try_cmpxchg64(tablep, &pts->entry, new);
+}
+#define pt_entry_make_write_dirty vtdss_pt_entry_make_write_dirty
+
+static inline unsigned int vtdss_pt_max_sw_bit(struct pt_common *common)
+{
+ return 10;
+}
+#define pt_max_sw_bit vtdss_pt_max_sw_bit
+
+static inline u64 vtdss_pt_sw_bit(unsigned int bitnr)
+{
+ if (__builtin_constant_p(bitnr) && bitnr > 10)
+ BUILD_BUG();
+
+ /* Bits marked Ignored in the specification */
+ switch (bitnr) {
+ case 0:
+ return BIT(10);
+ case 1 ... 9:
+ return BIT_ULL((bitnr - 1) + 52);
+ case 10:
+ return BIT_ULL(63);
+ /* Some bits in 9-3 are available in some entries */
+ default:
+ PT_WARN_ON(true);
+ return 0;
+ }
+}
+#define pt_sw_bit vtdss_pt_sw_bit
+
+/* --- iommu */
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+#define pt_iommu_table pt_iommu_vtdss
+
+/* The common struct is in the per-format common struct */
+static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
+{
+ return &container_of(iommu_table, struct pt_iommu_table, iommu)
+ ->vtdss_pt.common;
+}
+
+static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
+{
+ return &container_of(common, struct pt_iommu_table, vtdss_pt.common)
+ ->iommu;
+}
+
+static inline int vtdss_pt_iommu_set_prot(struct pt_common *common,
+ struct pt_write_attrs *attrs,
+ unsigned int iommu_prot)
+{
+ u64 pte = 0;
+
+ /*
+ * VTDSS does not have a present bit, so we tell if any entry is present
+ * by checking for R or W.
+ */
+ if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE)))
+ return -EINVAL;
+
+ if (iommu_prot & IOMMU_READ)
+ pte |= VTDSS_FMT_R;
+ if (iommu_prot & IOMMU_WRITE)
+ pte |= VTDSS_FMT_W;
+ if (pt_feature(common, PT_FEAT_VTDSS_FORCE_COHERENCE))
+ pte |= VTDSS_FMT_SNP;
+
+ if (pt_feature(common, PT_FEAT_VTDSS_FORCE_WRITEABLE) &&
+ !(iommu_prot & IOMMU_WRITE)) {
+ pr_err_ratelimited(
+ "Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
+ return -EINVAL;
+ }
+
+ attrs->descriptor_bits = pte;
+ return 0;
+}
+#define pt_iommu_set_prot vtdss_pt_iommu_set_prot
+
+static inline int vtdss_pt_iommu_fmt_init(struct pt_iommu_vtdss *iommu_table,
+ const struct pt_iommu_vtdss_cfg *cfg)
+{
+ struct pt_vtdss *table = &iommu_table->vtdss_pt;
+
+ if (cfg->top_level > 4 || cfg->top_level < 2)
+ return -EOPNOTSUPP;
+
+ pt_top_set_level(&table->common, cfg->top_level);
+ return 0;
+}
+#define pt_iommu_fmt_init vtdss_pt_iommu_fmt_init
+
+static inline void
+vtdss_pt_iommu_fmt_hw_info(struct pt_iommu_vtdss *table,
+ const struct pt_range *top_range,
+ struct pt_iommu_vtdss_hw_info *info)
+{
+ info->ssptptr = virt_to_phys(top_range->top_table);
+ PT_WARN_ON(info->ssptptr & ~PT_TOP_PHYS_MASK);
+ /*
+ * top_level = 2 = 3 level table aw=1
+ * top_level = 3 = 4 level table aw=2
+ * top_level = 4 = 5 level table aw=3
+ */
+ info->aw = top_range->top_level - 1;
+}
+#define pt_iommu_fmt_hw_info vtdss_pt_iommu_fmt_hw_info
+
+#if defined(GENERIC_PT_KUNIT)
+static const struct pt_iommu_vtdss_cfg vtdss_kunit_fmt_cfgs[] = {
+ [0] = { .common.hw_max_vasz_lg2 = 39, .top_level = 2},
+ [1] = { .common.hw_max_vasz_lg2 = 48, .top_level = 3},
+ [2] = { .common.hw_max_vasz_lg2 = 57, .top_level = 4},
+};
+#define kunit_fmt_cfgs vtdss_kunit_fmt_cfgs
+enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) };
+#endif
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/x86_64.h b/drivers/iommu/generic_pt/fmt/x86_64.h
new file mode 100644
index 000000000000..210748d9d6e8
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/x86_64.h
@@ -0,0 +1,279 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * x86 page table. Supports the 4 and 5 level variations.
+ *
+ * The 4 and 5 level version is described in:
+ * Section "4.4 4-Level Paging and 5-Level Paging" of the Intel Software
+ * Developer's Manual Volume 3
+ *
+ * Section "9.7 First-Stage Paging Entries" of the "Intel Virtualization
+ * Technology for Directed I/O Architecture Specification"
+ *
+ * Section "2.2.6 I/O Page Tables for Guest Translations" of the "AMD I/O
+ * Virtualization Technology (IOMMU) Specification"
+ *
+ * It is used by x86 CPUs, AMD and VT-d IOMMU HW.
+ *
+ * Note the 3 level format is very similar and almost implemented here. The
+ * reserved/ignored layout is different and there are functional bit
+ * differences.
+ *
+ * This format uses PT_FEAT_SIGN_EXTEND to have a upper/non-canonical/lower
+ * split. PT_FEAT_SIGN_EXTEND is optional as AMD IOMMU sometimes uses non-sign
+ * extended addressing with this page table format.
+ *
+ * The named levels in the spec map to the pts->level as:
+ * Table/PTE - 0
+ * Directory/PDE - 1
+ * Directory Ptr/PDPTE - 2
+ * PML4/PML4E - 3
+ * PML5/PML5E - 4
+ */
+#ifndef __GENERIC_PT_FMT_X86_64_H
+#define __GENERIC_PT_FMT_X86_64_H
+
+#include "defs_x86_64.h"
+#include "../pt_defs.h"
+
+#include <linux/bitfield.h>
+#include <linux/container_of.h>
+#include <linux/log2.h>
+#include <linux/mem_encrypt.h>
+
+enum {
+ PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
+ PT_MAX_VA_ADDRESS_LG2 = 57,
+ PT_ITEM_WORD_SIZE = sizeof(u64),
+ PT_MAX_TOP_LEVEL = 4,
+ PT_GRANULE_LG2SZ = 12,
+ PT_TABLEMEM_LG2SZ = 12,
+
+ /*
+ * For AMD the GCR3 Base only has these bits. For VT-d FSPTPTR is 4k
+ * aligned and is limited by the architected HAW
+ */
+ PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
+};
+
+/* Shared descriptor bits */
+enum {
+ X86_64_FMT_P = BIT(0),
+ X86_64_FMT_RW = BIT(1),
+ X86_64_FMT_U = BIT(2),
+ X86_64_FMT_A = BIT(5),
+ X86_64_FMT_D = BIT(6),
+ X86_64_FMT_OA = GENMASK_ULL(51, 12),
+ X86_64_FMT_XD = BIT_ULL(63),
+};
+
+/* PDPTE/PDE */
+enum {
+ X86_64_FMT_PS = BIT(7),
+};
+
+static inline pt_oaddr_t x86_64_pt_table_pa(const struct pt_state *pts)
+{
+ u64 entry = pts->entry;
+
+ if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+ entry = __sme_clr(entry);
+ return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry),
+ PT_TABLEMEM_LG2SZ);
+}
+#define pt_table_pa x86_64_pt_table_pa
+
+static inline pt_oaddr_t x86_64_pt_entry_oa(const struct pt_state *pts)
+{
+ u64 entry = pts->entry;
+
+ if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+ entry = __sme_clr(entry);
+ return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry),
+ PT_GRANULE_LG2SZ);
+}
+#define pt_entry_oa x86_64_pt_entry_oa
+
+static inline bool x86_64_pt_can_have_leaf(const struct pt_state *pts)
+{
+ return pts->level <= 2;
+}
+#define pt_can_have_leaf x86_64_pt_can_have_leaf
+
+static inline unsigned int x86_64_pt_num_items_lg2(const struct pt_state *pts)
+{
+ return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
+}
+#define pt_num_items_lg2 x86_64_pt_num_items_lg2
+
+static inline enum pt_entry_type x86_64_pt_load_entry_raw(struct pt_state *pts)
+{
+ const u64 *tablep = pt_cur_table(pts, u64);
+ u64 entry;
+
+ pts->entry = entry = READ_ONCE(tablep[pts->index]);
+ if (!(entry & X86_64_FMT_P))
+ return PT_ENTRY_EMPTY;
+ if (pts->level == 0 ||
+ (x86_64_pt_can_have_leaf(pts) && (entry & X86_64_FMT_PS)))
+ return PT_ENTRY_OA;
+ return PT_ENTRY_TABLE;
+}
+#define pt_load_entry_raw x86_64_pt_load_entry_raw
+
+static inline void
+x86_64_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+ unsigned int oasz_lg2,
+ const struct pt_write_attrs *attrs)
+{
+ u64 *tablep = pt_cur_table(pts, u64);
+ u64 entry;
+
+ if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
+ return;
+
+ entry = X86_64_FMT_P |
+ FIELD_PREP(X86_64_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
+ attrs->descriptor_bits;
+ if (pts->level != 0)
+ entry |= X86_64_FMT_PS;
+
+ WRITE_ONCE(tablep[pts->index], entry);
+ pts->entry = entry;
+}
+#define pt_install_leaf_entry x86_64_pt_install_leaf_entry
+
+static inline bool x86_64_pt_install_table(struct pt_state *pts,
+ pt_oaddr_t table_pa,
+ const struct pt_write_attrs *attrs)
+{
+ u64 entry;
+
+ entry = X86_64_FMT_P | X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A |
+ FIELD_PREP(X86_64_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ));
+ if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+ entry = __sme_set(entry);
+ return pt_table_install64(pts, entry);
+}
+#define pt_install_table x86_64_pt_install_table
+
+static inline void x86_64_pt_attr_from_entry(const struct pt_state *pts,
+ struct pt_write_attrs *attrs)
+{
+ attrs->descriptor_bits = pts->entry &
+ (X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A |
+ X86_64_FMT_D | X86_64_FMT_XD);
+}
+#define pt_attr_from_entry x86_64_pt_attr_from_entry
+
+static inline unsigned int x86_64_pt_max_sw_bit(struct pt_common *common)
+{
+ return 12;
+}
+#define pt_max_sw_bit x86_64_pt_max_sw_bit
+
+static inline u64 x86_64_pt_sw_bit(unsigned int bitnr)
+{
+ if (__builtin_constant_p(bitnr) && bitnr > 12)
+ BUILD_BUG();
+
+ /* Bits marked Ignored/AVL in the specification */
+ switch (bitnr) {
+ case 0:
+ return BIT(9);
+ case 1:
+ return BIT(11);
+ case 2 ... 12:
+ return BIT_ULL((bitnr - 2) + 52);
+ /* Some bits in 8,6,4,3 are available in some entries */
+ default:
+ PT_WARN_ON(true);
+ return 0;
+ }
+}
+#define pt_sw_bit x86_64_pt_sw_bit
+
+/* --- iommu */
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+#define pt_iommu_table pt_iommu_x86_64
+
+/* The common struct is in the per-format common struct */
+static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
+{
+ return &container_of(iommu_table, struct pt_iommu_table, iommu)
+ ->x86_64_pt.common;
+}
+
+static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
+{
+ return &container_of(common, struct pt_iommu_table, x86_64_pt.common)
+ ->iommu;
+}
+
+static inline int x86_64_pt_iommu_set_prot(struct pt_common *common,
+ struct pt_write_attrs *attrs,
+ unsigned int iommu_prot)
+{
+ u64 pte;
+
+ pte = X86_64_FMT_U | X86_64_FMT_A;
+ if (iommu_prot & IOMMU_WRITE)
+ pte |= X86_64_FMT_RW | X86_64_FMT_D;
+
+ /*
+ * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
+ * control this. For now if the tables use sme_set then so do the ptes.
+ */
+ if (pt_feature(common, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
+ pte = __sme_set(pte);
+
+ attrs->descriptor_bits = pte;
+ return 0;
+}
+#define pt_iommu_set_prot x86_64_pt_iommu_set_prot
+
+static inline int
+x86_64_pt_iommu_fmt_init(struct pt_iommu_x86_64 *iommu_table,
+ const struct pt_iommu_x86_64_cfg *cfg)
+{
+ struct pt_x86_64 *table = &iommu_table->x86_64_pt;
+
+ if (cfg->top_level < 3 || cfg->top_level > 4)
+ return -EOPNOTSUPP;
+
+ pt_top_set_level(&table->common, cfg->top_level);
+
+ table->common.max_oasz_lg2 =
+ min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
+ return 0;
+}
+#define pt_iommu_fmt_init x86_64_pt_iommu_fmt_init
+
+static inline void
+x86_64_pt_iommu_fmt_hw_info(struct pt_iommu_x86_64 *table,
+ const struct pt_range *top_range,
+ struct pt_iommu_x86_64_hw_info *info)
+{
+ info->gcr3_pt = virt_to_phys(top_range->top_table);
+ PT_WARN_ON(info->gcr3_pt & ~PT_TOP_PHYS_MASK);
+ info->levels = top_range->top_level + 1;
+}
+#define pt_iommu_fmt_hw_info x86_64_pt_iommu_fmt_hw_info
+
+#if defined(GENERIC_PT_KUNIT)
+static const struct pt_iommu_x86_64_cfg x86_64_kunit_fmt_cfgs[] = {
+ [0] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND),
+ .common.hw_max_vasz_lg2 = 48, .top_level = 3 },
+ [1] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND),
+ .common.hw_max_vasz_lg2 = 57, .top_level = 4 },
+ /* AMD IOMMU PASID 0 formats with no SIGN_EXTEND */
+ [2] = { .common.hw_max_vasz_lg2 = 47, .top_level = 3 },
+ [3] = { .common.hw_max_vasz_lg2 = 56, .top_level = 4},
+};
+#define kunit_fmt_cfgs x86_64_kunit_fmt_cfgs
+enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_SIGN_EXTEND)};
+#endif
+#endif
diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
new file mode 100644
index 000000000000..97aeda1ad01c
--- /dev/null
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -0,0 +1,1289 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * "Templated C code" for implementing the iommu operations for page tables.
+ * This is compiled multiple times, over all the page table formats to pick up
+ * the per-format definitions.
+ */
+#ifndef __GENERIC_PT_IOMMU_PT_H
+#define __GENERIC_PT_IOMMU_PT_H
+
+#include "pt_iter.h"
+
+#include <linux/export.h>
+#include <linux/iommu.h>
+#include "../iommu-pages.h"
+#include <linux/cleanup.h>
+#include <linux/dma-mapping.h>
+
+enum {
+ SW_BIT_CACHE_FLUSH_DONE = 0,
+};
+
+static void flush_writes_range(const struct pt_state *pts,
+ unsigned int start_index, unsigned int end_index)
+{
+ if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_flush_incoherent(
+ iommu_from_common(pts->range->common)->iommu_device,
+ pts->table, start_index * PT_ITEM_WORD_SIZE,
+ (end_index - start_index) * PT_ITEM_WORD_SIZE);
+}
+
+static void flush_writes_item(const struct pt_state *pts)
+{
+ if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_flush_incoherent(
+ iommu_from_common(pts->range->common)->iommu_device,
+ pts->table, pts->index * PT_ITEM_WORD_SIZE,
+ PT_ITEM_WORD_SIZE);
+}
+
+static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather,
+ struct pt_iommu *iommu_table, pt_vaddr_t iova,
+ pt_vaddr_t len,
+ struct iommu_pages_list *free_list)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_stop_incoherent_list(free_list,
+ iommu_table->iommu_device);
+
+ if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) &&
+ iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) {
+ iommu_iotlb_sync(&iommu_table->domain, iotlb_gather);
+ /*
+ * Note that the sync frees the gather's free list, so we must
+ * not have any pages on that list that are covered by iova/len
+ */
+ } else if (pt_feature(common, PT_FEAT_FLUSH_RANGE)) {
+ iommu_iotlb_gather_add_range(iotlb_gather, iova, len);
+ }
+
+ iommu_pages_list_splice(free_list, &iotlb_gather->freelist);
+}
+
+#define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op)
+
+static int make_range_ul(struct pt_common *common, struct pt_range *range,
+ unsigned long iova, unsigned long len)
+{
+ unsigned long last;
+
+ if (unlikely(len == 0))
+ return -EINVAL;
+
+ if (check_add_overflow(iova, len - 1, &last))
+ return -EOVERFLOW;
+
+ *range = pt_make_range(common, iova, last);
+ if (sizeof(iova) > sizeof(range->va)) {
+ if (unlikely(range->va != iova || range->last_va != last))
+ return -EOVERFLOW;
+ }
+ return 0;
+}
+
+static __maybe_unused int make_range_u64(struct pt_common *common,
+ struct pt_range *range, u64 iova,
+ u64 len)
+{
+ if (unlikely(iova > ULONG_MAX || len > ULONG_MAX))
+ return -EOVERFLOW;
+ return make_range_ul(common, range, iova, len);
+}
+
+/*
+ * Some APIs use unsigned long, while othersuse dma_addr_t as the type. Dispatch
+ * to the correct validation based on the type.
+ */
+#define make_range_no_check(common, range, iova, len) \
+ ({ \
+ int ret; \
+ if (sizeof(iova) > sizeof(unsigned long) || \
+ sizeof(len) > sizeof(unsigned long)) \
+ ret = make_range_u64(common, range, iova, len); \
+ else \
+ ret = make_range_ul(common, range, iova, len); \
+ ret; \
+ })
+
+#define make_range(common, range, iova, len) \
+ ({ \
+ int ret = make_range_no_check(common, range, iova, len); \
+ if (!ret) \
+ ret = pt_check_range(range); \
+ ret; \
+ })
+
+static inline unsigned int compute_best_pgsize(struct pt_state *pts,
+ pt_oaddr_t oa)
+{
+ struct pt_iommu *iommu_table = iommu_from_common(pts->range->common);
+
+ if (!pt_can_have_leaf(pts))
+ return 0;
+
+ /*
+ * The page size is limited by the domain's bitmap. This allows the core
+ * code to reduce the supported page sizes by changing the bitmap.
+ */
+ return pt_compute_best_pgsize(pt_possible_sizes(pts) &
+ iommu_table->domain.pgsize_bitmap,
+ pts->range->va, pts->range->last_va, oa);
+}
+
+static __always_inline int __do_iova_to_phys(struct pt_range *range, void *arg,
+ unsigned int level,
+ struct pt_table_p *table,
+ pt_level_fn_t descend_fn)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ pt_oaddr_t *res = arg;
+
+ switch (pt_load_single_entry(&pts)) {
+ case PT_ENTRY_EMPTY:
+ return -ENOENT;
+ case PT_ENTRY_TABLE:
+ return pt_descend(&pts, arg, descend_fn);
+ case PT_ENTRY_OA:
+ *res = pt_entry_oa_exact(&pts);
+ return 0;
+ }
+ return -ENOENT;
+}
+PT_MAKE_LEVELS(__iova_to_phys, __do_iova_to_phys);
+
+/**
+ * iova_to_phys() - Return the output address for the given IOVA
+ * @domain: Table to query
+ * @iova: IO virtual address to query
+ *
+ * Determine the output address from the given IOVA. @iova may have any
+ * alignment, the returned physical will be adjusted with any sub page offset.
+ *
+ * Context: The caller must hold a read range lock that includes @iova.
+ *
+ * Return: 0 if there is no translation for the given iova.
+ */
+phys_addr_t DOMAIN_NS(iova_to_phys)(struct iommu_domain *domain,
+ dma_addr_t iova)
+{
+ struct pt_iommu *iommu_table =
+ container_of(domain, struct pt_iommu, domain);
+ struct pt_range range;
+ pt_oaddr_t res;
+ int ret;
+
+ ret = make_range(common_from_iommu(iommu_table), &range, iova, 1);
+ if (ret)
+ return ret;
+
+ ret = pt_walk_range(&range, __iova_to_phys, &res);
+ /* PHYS_ADDR_MAX would be a better error code */
+ if (ret)
+ return 0;
+ return res;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU");
+
+struct pt_iommu_dirty_args {
+ struct iommu_dirty_bitmap *dirty;
+ unsigned int flags;
+};
+
+static void record_dirty(struct pt_state *pts,
+ struct pt_iommu_dirty_args *dirty,
+ unsigned int num_contig_lg2)
+{
+ pt_vaddr_t dirty_len;
+
+ if (num_contig_lg2 != ilog2(1)) {
+ unsigned int index = pts->index;
+ unsigned int end_index = log2_set_mod_max_t(
+ unsigned int, pts->index, num_contig_lg2);
+
+ /* Adjust for being contained inside a contiguous page */
+ end_index = min(end_index, pts->end_index);
+ dirty_len = (end_index - index) *
+ log2_to_int(pt_table_item_lg2sz(pts));
+ } else {
+ dirty_len = log2_to_int(pt_table_item_lg2sz(pts));
+ }
+
+ if (dirty->dirty->bitmap)
+ iova_bitmap_set(dirty->dirty->bitmap, pts->range->va,
+ dirty_len);
+
+ if (!(dirty->flags & IOMMU_DIRTY_NO_CLEAR)) {
+ /*
+ * No write log required because DMA incoherence and atomic
+ * dirty tracking bits can't work together
+ */
+ pt_entry_make_write_clean(pts);
+ iommu_iotlb_gather_add_range(dirty->dirty->gather,
+ pts->range->va, dirty_len);
+ }
+}
+
+static inline int __read_and_clear_dirty(struct pt_range *range, void *arg,
+ unsigned int level,
+ struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_dirty_args *dirty = arg;
+ int ret;
+
+ for_each_pt_level_entry(&pts) {
+ if (pts.type == PT_ENTRY_TABLE) {
+ ret = pt_descend(&pts, arg, __read_and_clear_dirty);
+ if (ret)
+ return ret;
+ continue;
+ }
+ if (pts.type == PT_ENTRY_OA && pt_entry_is_write_dirty(&pts))
+ record_dirty(&pts, dirty,
+ pt_entry_num_contig_lg2(&pts));
+ }
+ return 0;
+}
+
+/**
+ * read_and_clear_dirty() - Manipulate the HW set write dirty state
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @size: Length of the IOVA
+ * @flags: A bitmap of IOMMU_DIRTY_NO_CLEAR
+ * @dirty: Place to store the dirty bits
+ *
+ * Iterate over all the entries in the mapped range and record their write dirty
+ * status in iommu_dirty_bitmap. If IOMMU_DIRTY_NO_CLEAR is not specified then
+ * the entries will be left dirty, otherwise they are returned to being not
+ * write dirty.
+ *
+ * Context: The caller must hold a read range lock that includes @iova.
+ *
+ * Returns: -ERRNO on failure, 0 on success.
+ */
+int DOMAIN_NS(read_and_clear_dirty)(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ unsigned long flags,
+ struct iommu_dirty_bitmap *dirty)
+{
+ struct pt_iommu *iommu_table =
+ container_of(domain, struct pt_iommu, domain);
+ struct pt_iommu_dirty_args dirty_args = {
+ .dirty = dirty,
+ .flags = flags,
+ };
+ struct pt_range range;
+ int ret;
+
+#if !IS_ENABLED(CONFIG_IOMMUFD_DRIVER) || !defined(pt_entry_is_write_dirty)
+ return -EOPNOTSUPP;
+#endif
+
+ ret = make_range(common_from_iommu(iommu_table), &range, iova, size);
+ if (ret)
+ return ret;
+
+ ret = pt_walk_range(&range, __read_and_clear_dirty, &dirty_args);
+ PT_WARN_ON(ret);
+ return ret;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(read_and_clear_dirty), "GENERIC_PT_IOMMU");
+
+static inline int __set_dirty(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+
+ switch (pt_load_single_entry(&pts)) {
+ case PT_ENTRY_EMPTY:
+ return -ENOENT;
+ case PT_ENTRY_TABLE:
+ return pt_descend(&pts, arg, __set_dirty);
+ case PT_ENTRY_OA:
+ if (!pt_entry_make_write_dirty(&pts))
+ return -EAGAIN;
+ return 0;
+ }
+ return -ENOENT;
+}
+
+static int __maybe_unused NS(set_dirty)(struct pt_iommu *iommu_table,
+ dma_addr_t iova)
+{
+ struct pt_range range;
+ int ret;
+
+ ret = make_range(common_from_iommu(iommu_table), &range, iova, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Note: There is no locking here yet, if the test suite races this it
+ * can crash. It should use RCU locking eventually.
+ */
+ return pt_walk_range(&range, __set_dirty, NULL);
+}
+
+struct pt_iommu_collect_args {
+ struct iommu_pages_list free_list;
+ /* Fail if any OAs are within the range */
+ u8 check_mapped : 1;
+};
+
+static int __collect_tables(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_collect_args *collect = arg;
+ int ret;
+
+ if (!collect->check_mapped && !pt_can_have_table(&pts))
+ return 0;
+
+ for_each_pt_level_entry(&pts) {
+ if (pts.type == PT_ENTRY_TABLE) {
+ iommu_pages_list_add(&collect->free_list, pts.table_lower);
+ ret = pt_descend(&pts, arg, __collect_tables);
+ if (ret)
+ return ret;
+ continue;
+ }
+ if (pts.type == PT_ENTRY_OA && collect->check_mapped)
+ return -EADDRINUSE;
+ }
+ return 0;
+}
+
+enum alloc_mode {ALLOC_NORMAL, ALLOC_DEFER_COHERENT_FLUSH};
+
+/* Allocate a table, the empty table will be ready to be installed. */
+static inline struct pt_table_p *_table_alloc(struct pt_common *common,
+ size_t lg2sz, gfp_t gfp,
+ enum alloc_mode mode)
+{
+ struct pt_iommu *iommu_table = iommu_from_common(common);
+ struct pt_table_p *table_mem;
+
+ table_mem = iommu_alloc_pages_node_sz(iommu_table->nid, gfp,
+ log2_to_int(lg2sz));
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) &&
+ mode == ALLOC_NORMAL) {
+ int ret = iommu_pages_start_incoherent(
+ table_mem, iommu_table->iommu_device);
+ if (ret) {
+ iommu_free_pages(table_mem);
+ return ERR_PTR(ret);
+ }
+ }
+ return table_mem;
+}
+
+static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
+ uintptr_t top_of_table,
+ gfp_t gfp,
+ enum alloc_mode mode)
+{
+ /*
+ * Top doesn't need the free list or otherwise, so it technically
+ * doesn't need to use iommu pages. Use the API anyhow as the top is
+ * usually not smaller than PAGE_SIZE to keep things simple.
+ */
+ return _table_alloc(common, pt_top_memsize_lg2(common, top_of_table),
+ gfp, mode);
+}
+
+/* Allocate an interior table */
+static inline struct pt_table_p *table_alloc(const struct pt_state *parent_pts,
+ gfp_t gfp, enum alloc_mode mode)
+{
+ struct pt_state child_pts =
+ pt_init(parent_pts->range, parent_pts->level - 1, NULL);
+
+ return _table_alloc(parent_pts->range->common,
+ pt_num_items_lg2(&child_pts) +
+ ilog2(PT_ITEM_WORD_SIZE),
+ gfp, mode);
+}
+
+static inline int pt_iommu_new_table(struct pt_state *pts,
+ struct pt_write_attrs *attrs)
+{
+ struct pt_table_p *table_mem;
+ phys_addr_t phys;
+
+ /* Given PA/VA/length can't be represented */
+ if (PT_WARN_ON(!pt_can_have_table(pts)))
+ return -ENXIO;
+
+ table_mem = table_alloc(pts, attrs->gfp, ALLOC_NORMAL);
+ if (IS_ERR(table_mem))
+ return PTR_ERR(table_mem);
+
+ phys = virt_to_phys(table_mem);
+ if (!pt_install_table(pts, phys, attrs)) {
+ iommu_pages_free_incoherent(
+ table_mem,
+ iommu_from_common(pts->range->common)->iommu_device);
+ return -EAGAIN;
+ }
+
+ if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT)) {
+ flush_writes_item(pts);
+ pt_set_sw_bit_release(pts, SW_BIT_CACHE_FLUSH_DONE);
+ }
+
+ if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) {
+ /*
+ * The underlying table can't store the physical table address.
+ * This happens when kunit testing tables outside their normal
+ * environment where a CPU might be limited.
+ */
+ pt_load_single_entry(pts);
+ if (PT_WARN_ON(pt_table_pa(pts) != phys)) {
+ pt_clear_entries(pts, ilog2(1));
+ iommu_pages_free_incoherent(
+ table_mem, iommu_from_common(pts->range->common)
+ ->iommu_device);
+ return -EINVAL;
+ }
+ }
+
+ pts->table_lower = table_mem;
+ return 0;
+}
+
+struct pt_iommu_map_args {
+ struct iommu_iotlb_gather *iotlb_gather;
+ struct pt_write_attrs attrs;
+ pt_oaddr_t oa;
+ unsigned int leaf_pgsize_lg2;
+ unsigned int leaf_level;
+};
+
+/*
+ * This will recursively check any tables in the block to validate they are
+ * empty and then free them through the gather.
+ */
+static int clear_contig(const struct pt_state *start_pts,
+ struct iommu_iotlb_gather *iotlb_gather,
+ unsigned int step, unsigned int pgsize_lg2)
+{
+ struct pt_iommu *iommu_table =
+ iommu_from_common(start_pts->range->common);
+ struct pt_range range = *start_pts->range;
+ struct pt_state pts =
+ pt_init(&range, start_pts->level, start_pts->table);
+ struct pt_iommu_collect_args collect = { .check_mapped = true };
+ int ret;
+
+ pts.index = start_pts->index;
+ pts.end_index = start_pts->index + step;
+ for (; _pt_iter_load(&pts); pt_next_entry(&pts)) {
+ if (pts.type == PT_ENTRY_TABLE) {
+ collect.free_list =
+ IOMMU_PAGES_LIST_INIT(collect.free_list);
+ ret = pt_walk_descend_all(&pts, __collect_tables,
+ &collect);
+ if (ret)
+ return ret;
+
+ /*
+ * The table item must be cleared before we can update
+ * the gather
+ */
+ pt_clear_entries(&pts, ilog2(1));
+ flush_writes_item(&pts);
+
+ iommu_pages_list_add(&collect.free_list,
+ pt_table_ptr(&pts));
+ gather_range_pages(
+ iotlb_gather, iommu_table, range.va,
+ log2_to_int(pt_table_item_lg2sz(&pts)),
+ &collect.free_list);
+ } else if (pts.type != PT_ENTRY_EMPTY) {
+ return -EADDRINUSE;
+ }
+ }
+ return 0;
+}
+
+static int __map_range_leaf(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_map_args *map = arg;
+ unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2;
+ unsigned int start_index;
+ pt_oaddr_t oa = map->oa;
+ unsigned int step;
+ bool need_contig;
+ int ret = 0;
+
+ PT_WARN_ON(map->leaf_level != level);
+ PT_WARN_ON(!pt_can_have_leaf(&pts));
+
+ step = log2_to_int_t(unsigned int,
+ leaf_pgsize_lg2 - pt_table_item_lg2sz(&pts));
+ need_contig = leaf_pgsize_lg2 != pt_table_item_lg2sz(&pts);
+
+ _pt_iter_first(&pts);
+ start_index = pts.index;
+ do {
+ pts.type = pt_load_entry_raw(&pts);
+ if (pts.type != PT_ENTRY_EMPTY || need_contig) {
+ if (pts.index != start_index)
+ pt_index_to_va(&pts);
+ ret = clear_contig(&pts, map->iotlb_gather, step,
+ leaf_pgsize_lg2);
+ if (ret)
+ break;
+ }
+
+ if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) {
+ pt_index_to_va(&pts);
+ PT_WARN_ON(compute_best_pgsize(&pts, oa) !=
+ leaf_pgsize_lg2);
+ }
+ pt_install_leaf_entry(&pts, oa, leaf_pgsize_lg2, &map->attrs);
+
+ oa += log2_to_int(leaf_pgsize_lg2);
+ pts.index += step;
+ } while (pts.index < pts.end_index);
+
+ flush_writes_range(&pts, start_index, pts.index);
+
+ map->oa = oa;
+ return ret;
+}
+
+static int __map_range(struct pt_range *range, void *arg, unsigned int level,
+ struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_map_args *map = arg;
+ int ret;
+
+ PT_WARN_ON(map->leaf_level == level);
+ PT_WARN_ON(!pt_can_have_table(&pts));
+
+ _pt_iter_first(&pts);
+
+ /* Descend to a child table */
+ do {
+ pts.type = pt_load_entry_raw(&pts);
+
+ if (pts.type != PT_ENTRY_TABLE) {
+ if (pts.type != PT_ENTRY_EMPTY)
+ return -EADDRINUSE;
+ ret = pt_iommu_new_table(&pts, &map->attrs);
+ if (ret) {
+ /*
+ * Racing with another thread installing a table
+ */
+ if (ret == -EAGAIN)
+ continue;
+ return ret;
+ }
+ } else {
+ pts.table_lower = pt_table_ptr(&pts);
+ /*
+ * Racing with a shared pt_iommu_new_table()? The other
+ * thread is still flushing the cache, so we have to
+ * also flush it to ensure that when our thread's map
+ * completes all the table items leading to our mapping
+ * are visible.
+ *
+ * This requires the pt_set_bit_release() to be a
+ * release of the cache flush so that this can acquire
+ * visibility at the iommu.
+ */
+ if (pts_feature(&pts, PT_FEAT_DMA_INCOHERENT) &&
+ !pt_test_sw_bit_acquire(&pts,
+ SW_BIT_CACHE_FLUSH_DONE))
+ flush_writes_item(&pts);
+ }
+
+ /*
+ * The already present table can possibly be shared with another
+ * concurrent map.
+ */
+ if (map->leaf_level == level - 1)
+ ret = pt_descend(&pts, arg, __map_range_leaf);
+ else
+ ret = pt_descend(&pts, arg, __map_range);
+ if (ret)
+ return ret;
+
+ pts.index++;
+ pt_index_to_va(&pts);
+ if (pts.index >= pts.end_index)
+ break;
+ } while (true);
+ return 0;
+}
+
+/*
+ * Fast path for the easy case of mapping a 4k page to an already allocated
+ * table. This is a common workload. If it returns EAGAIN run the full algorithm
+ * instead.
+ */
+static __always_inline int __do_map_single_page(struct pt_range *range,
+ void *arg, unsigned int level,
+ struct pt_table_p *table,
+ pt_level_fn_t descend_fn)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_iommu_map_args *map = arg;
+
+ pts.type = pt_load_single_entry(&pts);
+ if (level == 0) {
+ if (pts.type != PT_ENTRY_EMPTY)
+ return -EADDRINUSE;
+ pt_install_leaf_entry(&pts, map->oa, PAGE_SHIFT,
+ &map->attrs);
+ /* No flush, not used when incoherent */
+ map->oa += PAGE_SIZE;
+ return 0;
+ }
+ if (pts.type == PT_ENTRY_TABLE)
+ return pt_descend(&pts, arg, descend_fn);
+ /* Something else, use the slow path */
+ return -EAGAIN;
+}
+PT_MAKE_LEVELS(__map_single_page, __do_map_single_page);
+
+/*
+ * Add a table to the top, increasing the top level as much as necessary to
+ * encompass range.
+ */
+static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range,
+ struct pt_iommu_map_args *map)
+{
+ struct iommu_pages_list free_list = IOMMU_PAGES_LIST_INIT(free_list);
+ struct pt_common *common = common_from_iommu(iommu_table);
+ uintptr_t top_of_table = READ_ONCE(common->top_of_table);
+ uintptr_t new_top_of_table = top_of_table;
+ struct pt_table_p *table_mem;
+ unsigned int new_level;
+ spinlock_t *domain_lock;
+ unsigned long flags;
+ int ret;
+
+ while (true) {
+ struct pt_range top_range =
+ _pt_top_range(common, new_top_of_table);
+ struct pt_state pts = pt_init_top(&top_range);
+
+ top_range.va = range->va;
+ top_range.last_va = range->last_va;
+
+ if (!pt_check_range(&top_range) &&
+ map->leaf_level <= pts.level) {
+ new_level = pts.level;
+ break;
+ }
+
+ pts.level++;
+ if (pts.level > PT_MAX_TOP_LEVEL ||
+ pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2) {
+ ret = -ERANGE;
+ goto err_free;
+ }
+
+ table_mem =
+ table_alloc_top(common, _pt_top_set(NULL, pts.level),
+ map->attrs.gfp, ALLOC_DEFER_COHERENT_FLUSH);
+ if (IS_ERR(table_mem)) {
+ ret = PTR_ERR(table_mem);
+ goto err_free;
+ }
+ iommu_pages_list_add(&free_list, table_mem);
+
+ /* The new table links to the lower table always at index 0 */
+ top_range.va = 0;
+ top_range.top_level = pts.level;
+ pts.table_lower = pts.table;
+ pts.table = table_mem;
+ pt_load_single_entry(&pts);
+ PT_WARN_ON(pts.index != 0);
+ pt_install_table(&pts, virt_to_phys(pts.table_lower),
+ &map->attrs);
+ new_top_of_table = _pt_top_set(pts.table, pts.level);
+ }
+
+ /*
+ * Avoid double flushing, flush it once after all pt_install_table()
+ */
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) {
+ ret = iommu_pages_start_incoherent_list(
+ &free_list, iommu_table->iommu_device);
+ if (ret)
+ goto err_free;
+ }
+
+ /*
+ * top_of_table is write locked by the spinlock, but readers can use
+ * READ_ONCE() to get the value. Since we encode both the level and the
+ * pointer in one quanta the lockless reader will always see something
+ * valid. The HW must be updated to the new level under the spinlock
+ * before top_of_table is updated so that concurrent readers don't map
+ * into the new level until it is fully functional. If another thread
+ * already updated it while we were working then throw everything away
+ * and try again.
+ */
+ domain_lock = iommu_table->driver_ops->get_top_lock(iommu_table);
+ spin_lock_irqsave(domain_lock, flags);
+ if (common->top_of_table != top_of_table ||
+ top_of_table == new_top_of_table) {
+ spin_unlock_irqrestore(domain_lock, flags);
+ ret = -EAGAIN;
+ goto err_free;
+ }
+
+ /*
+ * We do not issue any flushes for change_top on the expectation that
+ * any walk cache will not become a problem by adding another layer to
+ * the tree. Misses will rewalk from the updated top pointer, hits
+ * continue to be correct. Negative caching is fine too since all the
+ * new IOVA added by the new top is non-present.
+ */
+ iommu_table->driver_ops->change_top(
+ iommu_table, virt_to_phys(table_mem), new_level);
+ WRITE_ONCE(common->top_of_table, new_top_of_table);
+ spin_unlock_irqrestore(domain_lock, flags);
+ return 0;
+
+err_free:
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_stop_incoherent_list(&free_list,
+ iommu_table->iommu_device);
+ iommu_put_pages_list(&free_list);
+ return ret;
+}
+
+static int check_map_range(struct pt_iommu *iommu_table, struct pt_range *range,
+ struct pt_iommu_map_args *map)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+ int ret;
+
+ do {
+ ret = pt_check_range(range);
+ if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+ return ret;
+
+ if (!ret && map->leaf_level <= range->top_level)
+ break;
+
+ ret = increase_top(iommu_table, range, map);
+ if (ret && ret != -EAGAIN)
+ return ret;
+
+ /* Reload the new top */
+ *range = pt_make_range(common, range->va, range->last_va);
+ } while (ret);
+ PT_WARN_ON(pt_check_range(range));
+ return 0;
+}
+
+static int do_map(struct pt_range *range, struct pt_common *common,
+ bool single_page, struct pt_iommu_map_args *map)
+{
+ /*
+ * The __map_single_page() fast path does not support DMA_INCOHERENT
+ * flushing to keep its .text small.
+ */
+ if (single_page && !pt_feature(common, PT_FEAT_DMA_INCOHERENT)) {
+ int ret;
+
+ ret = pt_walk_range(range, __map_single_page, map);
+ if (ret != -EAGAIN)
+ return ret;
+ /* EAGAIN falls through to the full path */
+ }
+
+ if (map->leaf_level == range->top_level)
+ return pt_walk_range(range, __map_range_leaf, map);
+ return pt_walk_range(range, __map_range, map);
+}
+
+/**
+ * map_pages() - Install translation for an IOVA range
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @paddr: Physical/Output address to start
+ * @pgsize: Length of each page
+ * @pgcount: Length of the range in pgsize units starting from @iova
+ * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO
+ * @gfp: GFP flags for any memory allocations
+ * @mapped: Total bytes successfully mapped
+ *
+ * The range starting at IOVA will have paddr installed into it. The caller
+ * must specify a valid pgsize and pgcount to segment the range into compatible
+ * blocks.
+ *
+ * On error the caller will probably want to invoke unmap on the range from iova
+ * up to the amount indicated by @mapped to return the table back to an
+ * unchanged state.
+ *
+ * Context: The caller must hold a write range lock that includes the whole
+ * range.
+ *
+ * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA that were
+ * mapped are added to @mapped, @mapped is not zerod first.
+ */
+int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t pgsize, size_t pgcount,
+ int prot, gfp_t gfp, size_t *mapped)
+{
+ struct pt_iommu *iommu_table =
+ container_of(domain, struct pt_iommu, domain);
+ pt_vaddr_t pgsize_bitmap = iommu_table->domain.pgsize_bitmap;
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct iommu_iotlb_gather iotlb_gather;
+ pt_vaddr_t len = pgsize * pgcount;
+ struct pt_iommu_map_args map = {
+ .iotlb_gather = &iotlb_gather,
+ .oa = paddr,
+ .leaf_pgsize_lg2 = vaffs(pgsize),
+ };
+ bool single_page = false;
+ struct pt_range range;
+ int ret;
+
+ iommu_iotlb_gather_init(&iotlb_gather);
+
+ if (WARN_ON(!(prot & (IOMMU_READ | IOMMU_WRITE))))
+ return -EINVAL;
+
+ /* Check the paddr doesn't exceed what the table can store */
+ if ((sizeof(pt_oaddr_t) < sizeof(paddr) &&
+ (pt_vaddr_t)paddr > PT_VADDR_MAX) ||
+ (common->max_oasz_lg2 != PT_VADDR_MAX_LG2 &&
+ oalog2_div(paddr, common->max_oasz_lg2)))
+ return -ERANGE;
+
+ ret = pt_iommu_set_prot(common, &map.attrs, prot);
+ if (ret)
+ return ret;
+ map.attrs.gfp = gfp;
+
+ ret = make_range_no_check(common, &range, iova, len);
+ if (ret)
+ return ret;
+
+ /* Calculate target page size and level for the leaves */
+ if (pt_has_system_page_size(common) && pgsize == PAGE_SIZE &&
+ pgcount == 1) {
+ PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE));
+ if (log2_mod(iova | paddr, PAGE_SHIFT))
+ return -ENXIO;
+ map.leaf_pgsize_lg2 = PAGE_SHIFT;
+ map.leaf_level = 0;
+ single_page = true;
+ } else {
+ map.leaf_pgsize_lg2 = pt_compute_best_pgsize(
+ pgsize_bitmap, range.va, range.last_va, paddr);
+ if (!map.leaf_pgsize_lg2)
+ return -ENXIO;
+ map.leaf_level =
+ pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2);
+ }
+
+ ret = check_map_range(iommu_table, &range, &map);
+ if (ret)
+ return ret;
+
+ PT_WARN_ON(map.leaf_level > range.top_level);
+
+ ret = do_map(&range, common, single_page, &map);
+
+ /*
+ * Table levels were freed and replaced with large items, flush any walk
+ * cache that may refer to the freed levels.
+ */
+ if (!iommu_pages_list_empty(&iotlb_gather.freelist))
+ iommu_iotlb_sync(&iommu_table->domain, &iotlb_gather);
+
+ /* Bytes successfully mapped */
+ PT_WARN_ON(!ret && map.oa - paddr != len);
+ *mapped += map.oa - paddr;
+ return ret;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), "GENERIC_PT_IOMMU");
+
+struct pt_unmap_args {
+ struct iommu_pages_list free_list;
+ pt_vaddr_t unmapped;
+};
+
+static __maybe_unused int __unmap_range(struct pt_range *range, void *arg,
+ unsigned int level,
+ struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct pt_unmap_args *unmap = arg;
+ unsigned int num_oas = 0;
+ unsigned int start_index;
+ int ret = 0;
+
+ _pt_iter_first(&pts);
+ start_index = pts.index;
+ pts.type = pt_load_entry_raw(&pts);
+ /*
+ * A starting index is in the middle of a contiguous entry
+ *
+ * The IOMMU API does not require drivers to support unmapping parts of
+ * large pages. Long ago VFIO would try to split maps but the current
+ * version never does.
+ *
+ * Instead when unmap reaches a partial unmap of the start of a large
+ * IOPTE it should remove the entire IOPTE and return that size to the
+ * caller.
+ */
+ if (pts.type == PT_ENTRY_OA) {
+ if (log2_mod(range->va, pt_entry_oa_lg2sz(&pts)))
+ return -EINVAL;
+ /* Micro optimization */
+ goto start_oa;
+ }
+
+ do {
+ if (pts.type != PT_ENTRY_OA) {
+ bool fully_covered;
+
+ if (pts.type != PT_ENTRY_TABLE) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (pts.index != start_index)
+ pt_index_to_va(&pts);
+ pts.table_lower = pt_table_ptr(&pts);
+
+ fully_covered = pt_entry_fully_covered(
+ &pts, pt_table_item_lg2sz(&pts));
+
+ ret = pt_descend(&pts, arg, __unmap_range);
+ if (ret)
+ break;
+
+ /*
+ * If the unmapping range fully covers the table then we
+ * can free it as well. The clear is delayed until we
+ * succeed in clearing the lower table levels.
+ */
+ if (fully_covered) {
+ iommu_pages_list_add(&unmap->free_list,
+ pts.table_lower);
+ pt_clear_entries(&pts, ilog2(1));
+ }
+ pts.index++;
+ } else {
+ unsigned int num_contig_lg2;
+start_oa:
+ /*
+ * If the caller requested an last that falls within a
+ * single entry then the entire entry is unmapped and
+ * the length returned will be larger than requested.
+ */
+ num_contig_lg2 = pt_entry_num_contig_lg2(&pts);
+ pt_clear_entries(&pts, num_contig_lg2);
+ num_oas += log2_to_int(num_contig_lg2);
+ pts.index += log2_to_int(num_contig_lg2);
+ }
+ if (pts.index >= pts.end_index)
+ break;
+ pts.type = pt_load_entry_raw(&pts);
+ } while (true);
+
+ unmap->unmapped += log2_mul(num_oas, pt_table_item_lg2sz(&pts));
+ flush_writes_range(&pts, start_index, pts.index);
+
+ return ret;
+}
+
+/**
+ * unmap_pages() - Make a range of IOVA empty/not present
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @pgsize: Length of each page
+ * @pgcount: Length of the range in pgsize units starting from @iova
+ * @iotlb_gather: Gather struct that must be flushed on return
+ *
+ * unmap_pages() will remove a translation created by map_pages(). It cannot
+ * subdivide a mapping created by map_pages(), so it should be called with IOVA
+ * ranges that match those passed to map_pages(). The IOVA range can aggregate
+ * contiguous map_pages() calls so long as no individual range is split.
+ *
+ * Context: The caller must hold a write range lock that includes
+ * the whole range.
+ *
+ * Returns: Number of bytes of VA unmapped. iova + res will be the point
+ * unmapping stopped.
+ */
+size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova,
+ size_t pgsize, size_t pgcount,
+ struct iommu_iotlb_gather *iotlb_gather)
+{
+ struct pt_iommu *iommu_table =
+ container_of(domain, struct pt_iommu, domain);
+ struct pt_unmap_args unmap = { .free_list = IOMMU_PAGES_LIST_INIT(
+ unmap.free_list) };
+ pt_vaddr_t len = pgsize * pgcount;
+ struct pt_range range;
+ int ret;
+
+ ret = make_range(common_from_iommu(iommu_table), &range, iova, len);
+ if (ret)
+ return 0;
+
+ pt_walk_range(&range, __unmap_range, &unmap);
+
+ gather_range_pages(iotlb_gather, iommu_table, iova, len,
+ &unmap.free_list);
+
+ return unmap.unmapped;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unmap_pages), "GENERIC_PT_IOMMU");
+
+static void NS(get_info)(struct pt_iommu *iommu_table,
+ struct pt_iommu_info *info)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_range range = pt_top_range(common);
+ struct pt_state pts = pt_init_top(&range);
+ pt_vaddr_t pgsize_bitmap = 0;
+
+ if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) {
+ for (pts.level = 0; pts.level <= PT_MAX_TOP_LEVEL;
+ pts.level++) {
+ if (pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2)
+ break;
+ pgsize_bitmap |= pt_possible_sizes(&pts);
+ }
+ } else {
+ for (pts.level = 0; pts.level <= range.top_level; pts.level++)
+ pgsize_bitmap |= pt_possible_sizes(&pts);
+ }
+
+ /* Hide page sizes larger than the maximum OA */
+ info->pgsize_bitmap = oalog2_mod(pgsize_bitmap, common->max_oasz_lg2);
+}
+
+static void NS(deinit)(struct pt_iommu *iommu_table)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_range range = pt_all_range(common);
+ struct pt_iommu_collect_args collect = {
+ .free_list = IOMMU_PAGES_LIST_INIT(collect.free_list),
+ };
+
+ iommu_pages_list_add(&collect.free_list, range.top_table);
+ pt_walk_range(&range, __collect_tables, &collect);
+
+ /*
+ * The driver has to already have fenced the HW access to the page table
+ * and invalidated any caching referring to this memory.
+ */
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+ iommu_pages_stop_incoherent_list(&collect.free_list,
+ iommu_table->iommu_device);
+ iommu_put_pages_list(&collect.free_list);
+}
+
+static const struct pt_iommu_ops NS(ops) = {
+#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \
+ IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty)
+ .set_dirty = NS(set_dirty),
+#endif
+ .get_info = NS(get_info),
+ .deinit = NS(deinit),
+};
+
+static int pt_init_common(struct pt_common *common)
+{
+ struct pt_range top_range = pt_top_range(common);
+
+ if (PT_WARN_ON(top_range.top_level > PT_MAX_TOP_LEVEL))
+ return -EINVAL;
+
+ if (top_range.top_level == PT_MAX_TOP_LEVEL ||
+ common->max_vasz_lg2 == top_range.max_vasz_lg2)
+ common->features &= ~BIT(PT_FEAT_DYNAMIC_TOP);
+
+ if (top_range.max_vasz_lg2 == PT_VADDR_MAX_LG2)
+ common->features |= BIT(PT_FEAT_FULL_VA);
+
+ /* Requested features must match features compiled into this format */
+ if ((common->features & ~(unsigned int)PT_SUPPORTED_FEATURES) ||
+ (!IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) &&
+ (common->features & PT_FORCE_ENABLED_FEATURES) !=
+ PT_FORCE_ENABLED_FEATURES))
+ return -EOPNOTSUPP;
+
+ /*
+ * Check if the top level of the page table is too small to hold the
+ * specified maxvasz.
+ */
+ if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
+ top_range.top_level != PT_MAX_TOP_LEVEL) {
+ struct pt_state pts = { .range = &top_range,
+ .level = top_range.top_level };
+
+ if (common->max_vasz_lg2 >
+ pt_num_items_lg2(&pts) + pt_table_item_lg2sz(&pts))
+ return -EOPNOTSUPP;
+ }
+
+ if (common->max_oasz_lg2 == 0)
+ common->max_oasz_lg2 = pt_max_oa_lg2(common);
+ else
+ common->max_oasz_lg2 = min(common->max_oasz_lg2,
+ pt_max_oa_lg2(common));
+ return 0;
+}
+
+static int pt_iommu_init_domain(struct pt_iommu *iommu_table,
+ struct iommu_domain *domain)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_iommu_info info;
+ struct pt_range range;
+
+ NS(get_info)(iommu_table, &info);
+
+ domain->type = __IOMMU_DOMAIN_PAGING;
+ domain->pgsize_bitmap = info.pgsize_bitmap;
+
+ if (pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+ range = _pt_top_range(common,
+ _pt_top_set(NULL, PT_MAX_TOP_LEVEL));
+ else
+ range = pt_top_range(common);
+
+ /* A 64-bit high address space table on a 32-bit system cannot work. */
+ domain->geometry.aperture_start = (unsigned long)range.va;
+ if ((pt_vaddr_t)domain->geometry.aperture_start != range.va)
+ return -EOVERFLOW;
+
+ /*
+ * The aperture is limited to what the API can do after considering all
+ * the different types dma_addr_t/unsigned long/pt_vaddr_t that are used
+ * to store a VA. Set the aperture to something that is valid for all
+ * cases. Saturate instead of truncate the end if the types are smaller
+ * than the top range. aperture_end should be called aperture_last.
+ */
+ domain->geometry.aperture_end = (unsigned long)range.last_va;
+ if ((pt_vaddr_t)domain->geometry.aperture_end != range.last_va) {
+ domain->geometry.aperture_end = ULONG_MAX;
+ domain->pgsize_bitmap &= ULONG_MAX;
+ }
+ domain->geometry.force_aperture = true;
+
+ return 0;
+}
+
+static void pt_iommu_zero(struct pt_iommu_table *fmt_table)
+{
+ struct pt_iommu *iommu_table = &fmt_table->iommu;
+ struct pt_iommu cfg = *iommu_table;
+
+ static_assert(offsetof(struct pt_iommu_table, iommu.domain) == 0);
+ memset_after(fmt_table, 0, iommu.domain);
+
+ /* The caller can initialize some of these values */
+ iommu_table->iommu_device = cfg.iommu_device;
+ iommu_table->driver_ops = cfg.driver_ops;
+ iommu_table->nid = cfg.nid;
+}
+
+#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg)
+#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init)
+
+int pt_iommu_init(struct pt_iommu_table *fmt_table,
+ const struct pt_iommu_table_cfg *cfg, gfp_t gfp)
+{
+ struct pt_iommu *iommu_table = &fmt_table->iommu;
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_table_p *table_mem;
+ int ret;
+
+ if (cfg->common.hw_max_vasz_lg2 > PT_MAX_VA_ADDRESS_LG2 ||
+ !cfg->common.hw_max_vasz_lg2 || !cfg->common.hw_max_oasz_lg2)
+ return -EINVAL;
+
+ pt_iommu_zero(fmt_table);
+ common->features = cfg->common.features;
+ common->max_vasz_lg2 = cfg->common.hw_max_vasz_lg2;
+ common->max_oasz_lg2 = cfg->common.hw_max_oasz_lg2;
+ ret = pt_iommu_fmt_init(fmt_table, cfg);
+ if (ret)
+ return ret;
+
+ if (cfg->common.hw_max_oasz_lg2 > pt_max_oa_lg2(common))
+ return -EINVAL;
+
+ ret = pt_init_common(common);
+ if (ret)
+ return ret;
+
+ if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
+ WARN_ON(!iommu_table->driver_ops ||
+ !iommu_table->driver_ops->change_top ||
+ !iommu_table->driver_ops->get_top_lock))
+ return -EINVAL;
+
+ if (pt_feature(common, PT_FEAT_SIGN_EXTEND) &&
+ (pt_feature(common, PT_FEAT_FULL_VA) ||
+ pt_feature(common, PT_FEAT_DYNAMIC_TOP)))
+ return -EINVAL;
+
+ if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) &&
+ WARN_ON(!iommu_table->iommu_device))
+ return -EINVAL;
+
+ ret = pt_iommu_init_domain(iommu_table, &iommu_table->domain);
+ if (ret)
+ return ret;
+
+ table_mem = table_alloc_top(common, common->top_of_table, gfp,
+ ALLOC_NORMAL);
+ if (IS_ERR(table_mem))
+ return PTR_ERR(table_mem);
+ pt_top_set(common, table_mem, pt_top_get_level(common));
+
+ /* Must be last, see pt_iommu_deinit() */
+ iommu_table->ops = &NS(ops);
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(pt_iommu_init, "GENERIC_PT_IOMMU");
+
+#ifdef pt_iommu_fmt_hw_info
+#define pt_iommu_table_hw_info CONCATENATE(pt_iommu_table, _hw_info)
+#define pt_iommu_hw_info CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), hw_info)
+void pt_iommu_hw_info(struct pt_iommu_table *fmt_table,
+ struct pt_iommu_table_hw_info *info)
+{
+ struct pt_iommu *iommu_table = &fmt_table->iommu;
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_range top_range = pt_top_range(common);
+
+ pt_iommu_fmt_hw_info(fmt_table, &top_range, info);
+}
+EXPORT_SYMBOL_NS_GPL(pt_iommu_hw_info, "GENERIC_PT_IOMMU");
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IOMMU Page table implementation for " __stringify(PTPFX_RAW));
+MODULE_IMPORT_NS("GENERIC_PT");
+/* For iommu_dirty_bitmap_record() */
+MODULE_IMPORT_NS("IOMMUFD");
+
+#endif /* __GENERIC_PT_IOMMU_PT_H */
diff --git a/drivers/iommu/generic_pt/kunit_generic_pt.h b/drivers/iommu/generic_pt/kunit_generic_pt.h
new file mode 100644
index 000000000000..68278bf15cfe
--- /dev/null
+++ b/drivers/iommu/generic_pt/kunit_generic_pt.h
@@ -0,0 +1,823 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Test the format API directly.
+ *
+ */
+#include "kunit_iommu.h"
+#include "pt_iter.h"
+
+static void do_map(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa,
+ pt_vaddr_t len)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ int ret;
+
+ KUNIT_ASSERT_EQ(test, len, (size_t)len);
+
+ ret = iommu_map(&priv->domain, va, pa, len, IOMMU_READ | IOMMU_WRITE,
+ GFP_KERNEL);
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "map_pages", ret);
+}
+
+#define KUNIT_ASSERT_PT_LOAD(test, pts, entry) \
+ ({ \
+ pt_load_entry(pts); \
+ KUNIT_ASSERT_EQ(test, (pts)->type, entry); \
+ })
+
+struct check_levels_arg {
+ struct kunit *test;
+ void *fn_arg;
+ void (*fn)(struct kunit *test, struct pt_state *pts, void *arg);
+};
+
+static int __check_all_levels(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct check_levels_arg *chk = arg;
+ struct kunit *test = chk->test;
+ int ret;
+
+ _pt_iter_first(&pts);
+
+
+ /*
+ * If we were able to use the full VA space this should always be the
+ * last index in each table.
+ */
+ if (!(IS_32BIT && range->max_vasz_lg2 > 32)) {
+ if (pt_feature(range->common, PT_FEAT_SIGN_EXTEND) &&
+ pts.level == pts.range->top_level)
+ KUNIT_ASSERT_EQ(test, pts.index,
+ log2_to_int(range->max_vasz_lg2 - 1 -
+ pt_table_item_lg2sz(&pts)) -
+ 1);
+ else
+ KUNIT_ASSERT_EQ(test, pts.index,
+ log2_to_int(pt_table_oa_lg2sz(&pts) -
+ pt_table_item_lg2sz(&pts)) -
+ 1);
+ }
+
+ if (pt_can_have_table(&pts)) {
+ pt_load_single_entry(&pts);
+ KUNIT_ASSERT_EQ(test, pts.type, PT_ENTRY_TABLE);
+ ret = pt_descend(&pts, arg, __check_all_levels);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ /* Index 0 is used by the test */
+ if (IS_32BIT && !pts.index)
+ return 0;
+ KUNIT_ASSERT_NE(chk->test, pts.index, 0);
+ }
+
+ /*
+ * A format should not create a table with only one entry, at least this
+ * test approach won't work.
+ */
+ KUNIT_ASSERT_GT(chk->test, pts.end_index, 1);
+
+ /*
+ * For increase top we end up using index 0 for the original top's tree,
+ * so use index 1 for testing instead.
+ */
+ pts.index = 0;
+ pt_index_to_va(&pts);
+ pt_load_single_entry(&pts);
+ if (pts.type == PT_ENTRY_TABLE && pts.end_index > 2) {
+ pts.index = 1;
+ pt_index_to_va(&pts);
+ }
+ (*chk->fn)(chk->test, &pts, chk->fn_arg);
+ return 0;
+}
+
+/*
+ * Call fn for each level in the table with a pts setup to index 0 in a table
+ * for that level. This allows writing tests that run on every level.
+ * The test can use every index in the table except the last one.
+ */
+static void check_all_levels(struct kunit *test,
+ void (*fn)(struct kunit *test,
+ struct pt_state *pts, void *arg),
+ void *fn_arg)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range range = pt_top_range(priv->common);
+ struct check_levels_arg chk = {
+ .test = test,
+ .fn = fn,
+ .fn_arg = fn_arg,
+ };
+ int ret;
+
+ if (pt_feature(priv->common, PT_FEAT_DYNAMIC_TOP) &&
+ priv->common->max_vasz_lg2 > range.max_vasz_lg2)
+ range.last_va = fvalog2_set_mod_max(range.va,
+ priv->common->max_vasz_lg2);
+
+ /*
+ * Map a page at the highest VA, this will populate all the levels so we
+ * can then iterate over them. Index 0 will be used for testing.
+ */
+ if (IS_32BIT && range.max_vasz_lg2 > 32)
+ range.last_va = (u32)range.last_va;
+ range.va = range.last_va - (priv->smallest_pgsz - 1);
+ do_map(test, range.va, 0, priv->smallest_pgsz);
+
+ range = pt_make_range(priv->common, range.va, range.last_va);
+ ret = pt_walk_range(&range, __check_all_levels, &chk);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+}
+
+static void test_init(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+
+ /* Fixture does the setup */
+ KUNIT_ASSERT_NE(test, priv->info.pgsize_bitmap, 0);
+}
+
+/*
+ * Basic check that the log2_* functions are working, especially at the integer
+ * limits.
+ */
+static void test_bitops(struct kunit *test)
+{
+ int i;
+
+ KUNIT_ASSERT_EQ(test, fls_t(u32, 0), 0);
+ KUNIT_ASSERT_EQ(test, fls_t(u32, 1), 1);
+ KUNIT_ASSERT_EQ(test, fls_t(u32, BIT(2)), 3);
+ KUNIT_ASSERT_EQ(test, fls_t(u32, U32_MAX), 32);
+
+ KUNIT_ASSERT_EQ(test, fls_t(u64, 0), 0);
+ KUNIT_ASSERT_EQ(test, fls_t(u64, 1), 1);
+ KUNIT_ASSERT_EQ(test, fls_t(u64, BIT(2)), 3);
+ KUNIT_ASSERT_EQ(test, fls_t(u64, U64_MAX), 64);
+
+ KUNIT_ASSERT_EQ(test, ffs_t(u32, 1), 0);
+ KUNIT_ASSERT_EQ(test, ffs_t(u32, BIT(2)), 2);
+ KUNIT_ASSERT_EQ(test, ffs_t(u32, BIT(31)), 31);
+
+ KUNIT_ASSERT_EQ(test, ffs_t(u64, 1), 0);
+ KUNIT_ASSERT_EQ(test, ffs_t(u64, BIT(2)), 2);
+ KUNIT_ASSERT_EQ(test, ffs_t(u64, BIT_ULL(63)), 63);
+
+ for (i = 0; i != 31; i++)
+ KUNIT_ASSERT_EQ(test, ffz_t(u64, BIT_ULL(i) - 1), i);
+
+ for (i = 0; i != 63; i++)
+ KUNIT_ASSERT_EQ(test, ffz_t(u64, BIT_ULL(i) - 1), i);
+
+ for (i = 0; i != 32; i++) {
+ u64 val = get_random_u64();
+
+ KUNIT_ASSERT_EQ(test, log2_mod_t(u32, val, ffs_t(u32, val)), 0);
+ KUNIT_ASSERT_EQ(test, log2_mod_t(u64, val, ffs_t(u64, val)), 0);
+
+ KUNIT_ASSERT_EQ(test, log2_mod_t(u32, val, ffz_t(u32, val)),
+ log2_to_max_int_t(u32, ffz_t(u32, val)));
+ KUNIT_ASSERT_EQ(test, log2_mod_t(u64, val, ffz_t(u64, val)),
+ log2_to_max_int_t(u64, ffz_t(u64, val)));
+ }
+}
+
+static unsigned int ref_best_pgsize(pt_vaddr_t pgsz_bitmap, pt_vaddr_t va,
+ pt_vaddr_t last_va, pt_oaddr_t oa)
+{
+ pt_vaddr_t pgsz_lg2;
+
+ /* Brute force the constraints described in pt_compute_best_pgsize() */
+ for (pgsz_lg2 = PT_VADDR_MAX_LG2 - 1; pgsz_lg2 != 0; pgsz_lg2--) {
+ if ((pgsz_bitmap & log2_to_int(pgsz_lg2)) &&
+ log2_mod(va, pgsz_lg2) == 0 &&
+ oalog2_mod(oa, pgsz_lg2) == 0 &&
+ va + log2_to_int(pgsz_lg2) - 1 <= last_va &&
+ log2_div_eq(va, va + log2_to_int(pgsz_lg2) - 1, pgsz_lg2) &&
+ oalog2_div_eq(oa, oa + log2_to_int(pgsz_lg2) - 1, pgsz_lg2))
+ return pgsz_lg2;
+ }
+ return 0;
+}
+
+/* Check that the bit logic in pt_compute_best_pgsize() works. */
+static void test_best_pgsize(struct kunit *test)
+{
+ unsigned int a_lg2;
+ unsigned int b_lg2;
+ unsigned int c_lg2;
+
+ /* Try random prefixes with every suffix combination */
+ for (a_lg2 = 1; a_lg2 != 10; a_lg2++) {
+ for (b_lg2 = 1; b_lg2 != 10; b_lg2++) {
+ for (c_lg2 = 1; c_lg2 != 10; c_lg2++) {
+ pt_vaddr_t pgsz_bitmap = get_random_u64();
+ pt_vaddr_t va = get_random_u64() << a_lg2;
+ pt_oaddr_t oa = get_random_u64() << b_lg2;
+ pt_vaddr_t last_va = log2_set_mod_max(
+ get_random_u64(), c_lg2);
+
+ if (va > last_va)
+ swap(va, last_va);
+ KUNIT_ASSERT_EQ(
+ test,
+ pt_compute_best_pgsize(pgsz_bitmap, va,
+ last_va, oa),
+ ref_best_pgsize(pgsz_bitmap, va,
+ last_va, oa));
+ }
+ }
+ }
+
+ /* 0 prefix, every suffix */
+ for (c_lg2 = 1; c_lg2 != PT_VADDR_MAX_LG2 - 1; c_lg2++) {
+ pt_vaddr_t pgsz_bitmap = get_random_u64();
+ pt_vaddr_t va = 0;
+ pt_oaddr_t oa = 0;
+ pt_vaddr_t last_va = log2_set_mod_max(0, c_lg2);
+
+ KUNIT_ASSERT_EQ(test,
+ pt_compute_best_pgsize(pgsz_bitmap, va, last_va,
+ oa),
+ ref_best_pgsize(pgsz_bitmap, va, last_va, oa));
+ }
+
+ /* 1's prefix, every suffix */
+ for (a_lg2 = 1; a_lg2 != 10; a_lg2++) {
+ for (b_lg2 = 1; b_lg2 != 10; b_lg2++) {
+ for (c_lg2 = 1; c_lg2 != 10; c_lg2++) {
+ pt_vaddr_t pgsz_bitmap = get_random_u64();
+ pt_vaddr_t va = PT_VADDR_MAX << a_lg2;
+ pt_oaddr_t oa = PT_VADDR_MAX << b_lg2;
+ pt_vaddr_t last_va = PT_VADDR_MAX;
+
+ KUNIT_ASSERT_EQ(
+ test,
+ pt_compute_best_pgsize(pgsz_bitmap, va,
+ last_va, oa),
+ ref_best_pgsize(pgsz_bitmap, va,
+ last_va, oa));
+ }
+ }
+ }
+
+ /* pgsize_bitmap is always 0 */
+ for (a_lg2 = 1; a_lg2 != 10; a_lg2++) {
+ for (b_lg2 = 1; b_lg2 != 10; b_lg2++) {
+ for (c_lg2 = 1; c_lg2 != 10; c_lg2++) {
+ pt_vaddr_t pgsz_bitmap = 0;
+ pt_vaddr_t va = get_random_u64() << a_lg2;
+ pt_oaddr_t oa = get_random_u64() << b_lg2;
+ pt_vaddr_t last_va = log2_set_mod_max(
+ get_random_u64(), c_lg2);
+
+ if (va > last_va)
+ swap(va, last_va);
+ KUNIT_ASSERT_EQ(
+ test,
+ pt_compute_best_pgsize(pgsz_bitmap, va,
+ last_va, oa),
+ 0);
+ }
+ }
+ }
+
+ if (sizeof(pt_vaddr_t) <= 4)
+ return;
+
+ /* over 32 bit page sizes */
+ for (a_lg2 = 32; a_lg2 != 42; a_lg2++) {
+ for (b_lg2 = 32; b_lg2 != 42; b_lg2++) {
+ for (c_lg2 = 32; c_lg2 != 42; c_lg2++) {
+ pt_vaddr_t pgsz_bitmap = get_random_u64();
+ pt_vaddr_t va = get_random_u64() << a_lg2;
+ pt_oaddr_t oa = get_random_u64() << b_lg2;
+ pt_vaddr_t last_va = log2_set_mod_max(
+ get_random_u64(), c_lg2);
+
+ if (va > last_va)
+ swap(va, last_va);
+ KUNIT_ASSERT_EQ(
+ test,
+ pt_compute_best_pgsize(pgsz_bitmap, va,
+ last_va, oa),
+ ref_best_pgsize(pgsz_bitmap, va,
+ last_va, oa));
+ }
+ }
+ }
+}
+
+/*
+ * Check that pt_install_table() and pt_table_pa() match
+ */
+static void test_lvl_table_ptr(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ pt_oaddr_t paddr =
+ log2_set_mod(priv->test_oa, 0, priv->smallest_pgsz_lg2);
+ struct pt_write_attrs attrs = {};
+
+ if (!pt_can_have_table(pts))
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ));
+
+ pt_load_single_entry(pts);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+
+ KUNIT_ASSERT_TRUE(test, pt_install_table(pts, paddr, &attrs));
+
+ /* A second install should pass because install updates pts->entry. */
+ KUNIT_ASSERT_EQ(test, pt_install_table(pts, paddr, &attrs), true);
+
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_TABLE);
+ KUNIT_ASSERT_EQ(test, pt_table_pa(pts), paddr);
+
+ pt_clear_entries(pts, ilog2(1));
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+}
+
+static void test_table_ptr(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_table_ptr, NULL);
+}
+
+struct lvl_radix_arg {
+ pt_vaddr_t vbits;
+};
+
+/*
+ * Check pt_table_oa_lg2sz() and pt_table_item_lg2sz() they need to decode a
+ * continuous list of VA across all the levels that covers the entire advertised
+ * VA space.
+ */
+static void test_lvl_radix(struct kunit *test, struct pt_state *pts, void *arg)
+{
+ unsigned int table_lg2sz = pt_table_oa_lg2sz(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct lvl_radix_arg *radix = arg;
+
+ /* Every bit below us is decoded */
+ KUNIT_ASSERT_EQ(test, log2_set_mod_max(0, isz_lg2), radix->vbits);
+
+ /* We are not decoding bits someone else is */
+ KUNIT_ASSERT_EQ(test, log2_div(radix->vbits, isz_lg2), 0);
+
+ /* Can't decode past the pt_vaddr_t size */
+ KUNIT_ASSERT_LE(test, table_lg2sz, PT_VADDR_MAX_LG2);
+ KUNIT_ASSERT_EQ(test, fvalog2_div(table_lg2sz, PT_MAX_VA_ADDRESS_LG2),
+ 0);
+
+ radix->vbits = fvalog2_set_mod_max(0, table_lg2sz);
+}
+
+static void test_max_va(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range range = pt_top_range(priv->common);
+
+ KUNIT_ASSERT_GE(test, priv->common->max_vasz_lg2, range.max_vasz_lg2);
+}
+
+static void test_table_radix(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct lvl_radix_arg radix = { .vbits = priv->smallest_pgsz - 1 };
+ struct pt_range range;
+
+ check_all_levels(test, test_lvl_radix, &radix);
+
+ range = pt_top_range(priv->common);
+ if (range.max_vasz_lg2 == PT_VADDR_MAX_LG2) {
+ KUNIT_ASSERT_EQ(test, radix.vbits, PT_VADDR_MAX);
+ } else {
+ if (!IS_32BIT)
+ KUNIT_ASSERT_EQ(test,
+ log2_set_mod_max(0, range.max_vasz_lg2),
+ radix.vbits);
+ KUNIT_ASSERT_EQ(test, log2_div(radix.vbits, range.max_vasz_lg2),
+ 0);
+ }
+}
+
+static unsigned int safe_pt_num_items_lg2(const struct pt_state *pts)
+{
+ struct pt_range top_range = pt_top_range(pts->range->common);
+ struct pt_state top_pts = pt_init_top(&top_range);
+
+ /*
+ * Avoid calling pt_num_items_lg2() on the top, instead we can derive
+ * the size of the top table from the top range.
+ */
+ if (pts->level == top_range.top_level)
+ return ilog2(pt_range_to_end_index(&top_pts));
+ return pt_num_items_lg2(pts);
+}
+
+static void test_lvl_possible_sizes(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ unsigned int num_items_lg2 = safe_pt_num_items_lg2(pts);
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ if (!pt_can_have_leaf(pts)) {
+ KUNIT_ASSERT_EQ(test, pgsize_bitmap, 0);
+ return;
+ }
+
+ /* No bits for sizes that would be outside this table */
+ KUNIT_ASSERT_EQ(test, log2_mod(pgsize_bitmap, isz_lg2), 0);
+ KUNIT_ASSERT_EQ(
+ test, fvalog2_div(pgsize_bitmap, num_items_lg2 + isz_lg2), 0);
+
+ /*
+ * Non contiguous must be supported. AMDv1 has a HW bug where it does
+ * not support it on one of the levels.
+ */
+ if ((u64)pgsize_bitmap != 0xff0000000000ULL ||
+ strcmp(__stringify(PTPFX_RAW), "amdv1") != 0)
+ KUNIT_ASSERT_TRUE(test, pgsize_bitmap & log2_to_int(isz_lg2));
+ else
+ KUNIT_ASSERT_NE(test, pgsize_bitmap, 0);
+
+ /* A contiguous entry should not span the whole table */
+ if (num_items_lg2 + isz_lg2 != PT_VADDR_MAX_LG2)
+ KUNIT_ASSERT_FALSE(
+ test,
+ pgsize_bitmap & log2_to_int(num_items_lg2 + isz_lg2));
+}
+
+static void test_entry_possible_sizes(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_possible_sizes, NULL);
+}
+
+static void sweep_all_pgsizes(struct kunit *test, struct pt_state *pts,
+ struct pt_write_attrs *attrs,
+ pt_oaddr_t test_oaddr)
+{
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ unsigned int len_lg2;
+
+ if (pts->index != 0)
+ return;
+
+ for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2 - 1; len_lg2++) {
+ struct pt_state sub_pts = *pts;
+ pt_oaddr_t oaddr;
+
+ if (!(pgsize_bitmap & log2_to_int(len_lg2)))
+ continue;
+
+ oaddr = log2_set_mod(test_oaddr, 0, len_lg2);
+ pt_install_leaf_entry(pts, oaddr, len_lg2, attrs);
+ /* Verify that every contiguous item translates correctly */
+ for (sub_pts.index = 0;
+ sub_pts.index != log2_to_int(len_lg2 - isz_lg2);
+ sub_pts.index++) {
+ KUNIT_ASSERT_PT_LOAD(test, &sub_pts, PT_ENTRY_OA);
+ KUNIT_ASSERT_EQ(test, pt_item_oa(&sub_pts),
+ oaddr + sub_pts.index *
+ oalog2_mul(1, isz_lg2));
+ KUNIT_ASSERT_EQ(test, pt_entry_oa(&sub_pts), oaddr);
+ KUNIT_ASSERT_EQ(test, pt_entry_num_contig_lg2(&sub_pts),
+ len_lg2 - isz_lg2);
+ }
+
+ pt_clear_entries(pts, len_lg2 - isz_lg2);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+ }
+}
+
+/*
+ * Check that pt_install_leaf_entry() and pt_entry_oa() match.
+ * Check that pt_clear_entries() works.
+ */
+static void test_lvl_entry_oa(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ unsigned int max_oa_lg2 = pts->range->common->max_oasz_lg2;
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_write_attrs attrs = {};
+
+ if (!pt_can_have_leaf(pts))
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ));
+
+ sweep_all_pgsizes(test, pts, &attrs, priv->test_oa);
+
+ /* Check that the table can store the boundary OAs */
+ sweep_all_pgsizes(test, pts, &attrs, 0);
+ if (max_oa_lg2 == PT_OADDR_MAX_LG2)
+ sweep_all_pgsizes(test, pts, &attrs, PT_OADDR_MAX);
+ else
+ sweep_all_pgsizes(test, pts, &attrs,
+ oalog2_to_max_int(max_oa_lg2));
+}
+
+static void test_entry_oa(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_entry_oa, NULL);
+}
+
+/* Test pt_attr_from_entry() */
+static void test_lvl_attr_from_entry(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct kunit_iommu_priv *priv = test->priv;
+ unsigned int len_lg2;
+ unsigned int prot;
+
+ if (!pt_can_have_leaf(pts))
+ return;
+
+ for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2; len_lg2++) {
+ if (!(pgsize_bitmap & log2_to_int(len_lg2)))
+ continue;
+ for (prot = 0; prot <= (IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE |
+ IOMMU_NOEXEC | IOMMU_MMIO);
+ prot++) {
+ pt_oaddr_t oaddr;
+ struct pt_write_attrs attrs = {};
+ u64 good_entry;
+
+ /*
+ * If the format doesn't support this combination of
+ * prot bits skip it
+ */
+ if (pt_iommu_set_prot(pts->range->common, &attrs,
+ prot)) {
+ /* But RW has to be supported */
+ KUNIT_ASSERT_NE(test, prot,
+ IOMMU_READ | IOMMU_WRITE);
+ continue;
+ }
+
+ oaddr = log2_set_mod(priv->test_oa, 0, len_lg2);
+ pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA);
+
+ good_entry = pts->entry;
+
+ memset(&attrs, 0, sizeof(attrs));
+ pt_attr_from_entry(pts, &attrs);
+
+ pt_clear_entries(pts, len_lg2 - isz_lg2);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+
+ pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA);
+
+ /*
+ * The descriptor produced by pt_attr_from_entry()
+ * produce an identical entry value when re-written
+ */
+ KUNIT_ASSERT_EQ(test, good_entry, pts->entry);
+
+ pt_clear_entries(pts, len_lg2 - isz_lg2);
+ }
+ }
+}
+
+static void test_attr_from_entry(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_attr_from_entry, NULL);
+}
+
+static void test_lvl_dirty(struct kunit *test, struct pt_state *pts, void *arg)
+{
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct kunit_iommu_priv *priv = test->priv;
+ unsigned int start_idx = pts->index;
+ struct pt_write_attrs attrs = {};
+ unsigned int len_lg2;
+
+ if (!pt_can_have_leaf(pts))
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ | IOMMU_WRITE));
+
+ for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2; len_lg2++) {
+ pt_oaddr_t oaddr;
+ unsigned int i;
+
+ if (!(pgsize_bitmap & log2_to_int(len_lg2)))
+ continue;
+
+ oaddr = log2_set_mod(priv->test_oa, 0, len_lg2);
+ pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA);
+
+ pt_load_entry(pts);
+ pt_entry_make_write_clean(pts);
+ pt_load_entry(pts);
+ KUNIT_ASSERT_FALSE(test, pt_entry_is_write_dirty(pts));
+
+ for (i = 0; i != log2_to_int(len_lg2 - isz_lg2); i++) {
+ /* dirty every contiguous entry */
+ pts->index = start_idx + i;
+ pt_load_entry(pts);
+ KUNIT_ASSERT_TRUE(test, pt_entry_make_write_dirty(pts));
+ pts->index = start_idx;
+ pt_load_entry(pts);
+ KUNIT_ASSERT_TRUE(test, pt_entry_is_write_dirty(pts));
+
+ pt_entry_make_write_clean(pts);
+ pt_load_entry(pts);
+ KUNIT_ASSERT_FALSE(test, pt_entry_is_write_dirty(pts));
+ }
+
+ pt_clear_entries(pts, len_lg2 - isz_lg2);
+ }
+}
+
+static __maybe_unused void test_dirty(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+
+ if (!pt_dirty_supported(priv->common))
+ kunit_skip(test,
+ "Page table features do not support dirty tracking");
+
+ check_all_levels(test, test_lvl_dirty, NULL);
+}
+
+static void test_lvl_sw_bit_leaf(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct pt_write_attrs attrs = {};
+ unsigned int len_lg2;
+
+ if (!pt_can_have_leaf(pts))
+ return;
+ if (pts->index != 0)
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ));
+
+ for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2 - 1; len_lg2++) {
+ pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, len_lg2);
+ struct pt_write_attrs new_attrs = {};
+ unsigned int bitnr;
+
+ if (!(pgsize_bitmap & log2_to_int(len_lg2)))
+ continue;
+
+ pt_install_leaf_entry(pts, paddr, len_lg2, &attrs);
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common);
+ bitnr++)
+ KUNIT_ASSERT_FALSE(test,
+ pt_test_sw_bit_acquire(pts, bitnr));
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common);
+ bitnr++) {
+ KUNIT_ASSERT_FALSE(test,
+ pt_test_sw_bit_acquire(pts, bitnr));
+ pt_set_sw_bit_release(pts, bitnr);
+ KUNIT_ASSERT_TRUE(test,
+ pt_test_sw_bit_acquire(pts, bitnr));
+ }
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common);
+ bitnr++)
+ KUNIT_ASSERT_TRUE(test,
+ pt_test_sw_bit_acquire(pts, bitnr));
+
+ KUNIT_ASSERT_EQ(test, pt_item_oa(pts), paddr);
+
+ /* SW bits didn't leak into the attrs */
+ pt_attr_from_entry(pts, &new_attrs);
+ KUNIT_ASSERT_MEMEQ(test, &new_attrs, &attrs, sizeof(attrs));
+
+ pt_clear_entries(pts, len_lg2 - isz_lg2);
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+ }
+}
+
+static __maybe_unused void test_sw_bit_leaf(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_sw_bit_leaf, NULL);
+}
+
+static void test_lvl_sw_bit_table(struct kunit *test, struct pt_state *pts,
+ void *arg)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_write_attrs attrs = {};
+ pt_oaddr_t paddr =
+ log2_set_mod(priv->test_oa, 0, priv->smallest_pgsz_lg2);
+ unsigned int bitnr;
+
+ if (!pt_can_have_leaf(pts))
+ return;
+ if (pts->index != 0)
+ return;
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
+ pt_iommu_set_prot(pts->range->common, &attrs,
+ IOMMU_READ));
+
+ KUNIT_ASSERT_TRUE(test, pt_install_table(pts, paddr, &attrs));
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++)
+ KUNIT_ASSERT_FALSE(test, pt_test_sw_bit_acquire(pts, bitnr));
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++) {
+ KUNIT_ASSERT_FALSE(test, pt_test_sw_bit_acquire(pts, bitnr));
+ pt_set_sw_bit_release(pts, bitnr);
+ KUNIT_ASSERT_TRUE(test, pt_test_sw_bit_acquire(pts, bitnr));
+ }
+
+ for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++)
+ KUNIT_ASSERT_TRUE(test, pt_test_sw_bit_acquire(pts, bitnr));
+
+ KUNIT_ASSERT_EQ(test, pt_table_pa(pts), paddr);
+
+ pt_clear_entries(pts, ilog2(1));
+ KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+}
+
+static __maybe_unused void test_sw_bit_table(struct kunit *test)
+{
+ check_all_levels(test, test_lvl_sw_bit_table, NULL);
+}
+
+static struct kunit_case generic_pt_test_cases[] = {
+ KUNIT_CASE_FMT(test_init),
+ KUNIT_CASE_FMT(test_bitops),
+ KUNIT_CASE_FMT(test_best_pgsize),
+ KUNIT_CASE_FMT(test_table_ptr),
+ KUNIT_CASE_FMT(test_max_va),
+ KUNIT_CASE_FMT(test_table_radix),
+ KUNIT_CASE_FMT(test_entry_possible_sizes),
+ KUNIT_CASE_FMT(test_entry_oa),
+ KUNIT_CASE_FMT(test_attr_from_entry),
+#ifdef pt_entry_is_write_dirty
+ KUNIT_CASE_FMT(test_dirty),
+#endif
+#ifdef pt_sw_bit
+ KUNIT_CASE_FMT(test_sw_bit_leaf),
+ KUNIT_CASE_FMT(test_sw_bit_table),
+#endif
+ {},
+};
+
+static int pt_kunit_generic_pt_init(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv;
+ int ret;
+
+ priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+ ret = pt_kunit_priv_init(test, priv);
+ if (ret) {
+ kunit_kfree(test, priv);
+ return ret;
+ }
+ test->priv = priv;
+ return 0;
+}
+
+static void pt_kunit_generic_pt_exit(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+
+ if (!test->priv)
+ return;
+
+ pt_iommu_deinit(priv->iommu);
+ kunit_kfree(test, test->priv);
+}
+
+static struct kunit_suite NS(generic_pt_suite) = {
+ .name = __stringify(NS(fmt_test)),
+ .init = pt_kunit_generic_pt_init,
+ .exit = pt_kunit_generic_pt_exit,
+ .test_cases = generic_pt_test_cases,
+};
+kunit_test_suites(&NS(generic_pt_suite));
diff --git a/drivers/iommu/generic_pt/kunit_iommu.h b/drivers/iommu/generic_pt/kunit_iommu.h
new file mode 100644
index 000000000000..22c9e4c4dd97
--- /dev/null
+++ b/drivers/iommu/generic_pt/kunit_iommu.h
@@ -0,0 +1,184 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#ifndef __GENERIC_PT_KUNIT_IOMMU_H
+#define __GENERIC_PT_KUNIT_IOMMU_H
+
+#define GENERIC_PT_KUNIT 1
+#include <kunit/device.h>
+#include <kunit/test.h>
+#include "../iommu-pages.h"
+#include "pt_iter.h"
+
+#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg)
+#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init)
+int pt_iommu_init(struct pt_iommu_table *fmt_table,
+ const struct pt_iommu_table_cfg *cfg, gfp_t gfp);
+
+/* The format can provide a list of configurations it would like to test */
+#ifdef kunit_fmt_cfgs
+static const void *kunit_pt_gen_params_cfg(struct kunit *test, const void *prev,
+ char *desc)
+{
+ uintptr_t cfg_id = (uintptr_t)prev;
+
+ cfg_id++;
+ if (cfg_id >= ARRAY_SIZE(kunit_fmt_cfgs) + 1)
+ return NULL;
+ snprintf(desc, KUNIT_PARAM_DESC_SIZE, "%s_cfg_%u",
+ __stringify(PTPFX_RAW), (unsigned int)(cfg_id - 1));
+ return (void *)cfg_id;
+}
+#define KUNIT_CASE_FMT(test_name) \
+ KUNIT_CASE_PARAM(test_name, kunit_pt_gen_params_cfg)
+#else
+#define KUNIT_CASE_FMT(test_name) KUNIT_CASE(test_name)
+#endif
+
+#define KUNIT_ASSERT_NO_ERRNO(test, ret) \
+ KUNIT_ASSERT_EQ_MSG(test, ret, 0, KUNIT_SUBSUBTEST_INDENT "errno %pe", \
+ ERR_PTR(ret))
+
+#define KUNIT_ASSERT_NO_ERRNO_FN(test, fn, ret) \
+ KUNIT_ASSERT_EQ_MSG(test, ret, 0, \
+ KUNIT_SUBSUBTEST_INDENT "errno %pe from %s", \
+ ERR_PTR(ret), fn)
+
+/*
+ * When the test is run on a 32 bit system unsigned long can be 32 bits. This
+ * cause the iommu op signatures to be restricted to 32 bits. Meaning the test
+ * has to be mindful not to create any VA's over the 32 bit limit. Reduce the
+ * scope of the testing as the main purpose of checking on full 32 bit is to
+ * look for 32bitism in the core code. Run the test on i386 with X86_PAE=y to
+ * get the full coverage when dma_addr_t & phys_addr_t are 8 bytes
+ */
+#define IS_32BIT (sizeof(unsigned long) == 4)
+
+struct kunit_iommu_priv {
+ union {
+ struct iommu_domain domain;
+ struct pt_iommu_table fmt_table;
+ };
+ spinlock_t top_lock;
+ struct device *dummy_dev;
+ struct pt_iommu *iommu;
+ struct pt_common *common;
+ struct pt_iommu_table_cfg cfg;
+ struct pt_iommu_info info;
+ unsigned int smallest_pgsz_lg2;
+ pt_vaddr_t smallest_pgsz;
+ unsigned int largest_pgsz_lg2;
+ pt_oaddr_t test_oa;
+ pt_vaddr_t safe_pgsize_bitmap;
+ unsigned long orig_nr_secondary_pagetable;
+
+};
+PT_IOMMU_CHECK_DOMAIN(struct kunit_iommu_priv, fmt_table.iommu, domain);
+
+static void pt_kunit_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
+{
+ iommu_put_pages_list(&gather->freelist);
+}
+
+#define IOMMU_PT_DOMAIN_OPS1(x) IOMMU_PT_DOMAIN_OPS(x)
+static const struct iommu_domain_ops kunit_pt_ops = {
+ IOMMU_PT_DOMAIN_OPS1(PTPFX_RAW),
+ .iotlb_sync = &pt_kunit_iotlb_sync,
+};
+
+static void pt_kunit_change_top(struct pt_iommu *iommu_table,
+ phys_addr_t top_paddr, unsigned int top_level)
+{
+}
+
+static spinlock_t *pt_kunit_get_top_lock(struct pt_iommu *iommu_table)
+{
+ struct kunit_iommu_priv *priv = container_of(
+ iommu_table, struct kunit_iommu_priv, fmt_table.iommu);
+
+ return &priv->top_lock;
+}
+
+static const struct pt_iommu_driver_ops pt_kunit_driver_ops = {
+ .change_top = &pt_kunit_change_top,
+ .get_top_lock = &pt_kunit_get_top_lock,
+};
+
+static int pt_kunit_priv_init(struct kunit *test, struct kunit_iommu_priv *priv)
+{
+ unsigned int va_lg2sz;
+ int ret;
+
+ /* Enough so the memory allocator works */
+ priv->dummy_dev = kunit_device_register(test, "pt_kunit_dev");
+ if (IS_ERR(priv->dummy_dev))
+ return PTR_ERR(priv->dummy_dev);
+ set_dev_node(priv->dummy_dev, NUMA_NO_NODE);
+
+ spin_lock_init(&priv->top_lock);
+
+#ifdef kunit_fmt_cfgs
+ priv->cfg = kunit_fmt_cfgs[((uintptr_t)test->param_value) - 1];
+ /*
+ * The format can set a list of features that the kunit_fmt_cfgs
+ * controls, other features are default to on.
+ */
+ priv->cfg.common.features |= PT_SUPPORTED_FEATURES &
+ (~KUNIT_FMT_FEATURES);
+#else
+ priv->cfg.common.features = PT_SUPPORTED_FEATURES;
+#endif
+
+ /* Defaults, for the kunit */
+ if (!priv->cfg.common.hw_max_vasz_lg2)
+ priv->cfg.common.hw_max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2;
+ if (!priv->cfg.common.hw_max_oasz_lg2)
+ priv->cfg.common.hw_max_oasz_lg2 = pt_max_oa_lg2(NULL);
+
+ priv->fmt_table.iommu.nid = NUMA_NO_NODE;
+ priv->fmt_table.iommu.driver_ops = &pt_kunit_driver_ops;
+ priv->fmt_table.iommu.iommu_device = priv->dummy_dev;
+ priv->domain.ops = &kunit_pt_ops;
+ ret = pt_iommu_init(&priv->fmt_table, &priv->cfg, GFP_KERNEL);
+ if (ret) {
+ if (ret == -EOVERFLOW)
+ kunit_skip(
+ test,
+ "This configuration cannot be tested on 32 bit");
+ return ret;
+ }
+
+ priv->iommu = &priv->fmt_table.iommu;
+ priv->common = common_from_iommu(&priv->fmt_table.iommu);
+ priv->iommu->ops->get_info(priv->iommu, &priv->info);
+
+ /*
+ * size_t is used to pass the mapping length, it can be 32 bit, truncate
+ * the pagesizes so we don't use large sizes.
+ */
+ priv->info.pgsize_bitmap = (size_t)priv->info.pgsize_bitmap;
+
+ priv->smallest_pgsz_lg2 = vaffs(priv->info.pgsize_bitmap);
+ priv->smallest_pgsz = log2_to_int(priv->smallest_pgsz_lg2);
+ priv->largest_pgsz_lg2 =
+ vafls((dma_addr_t)priv->info.pgsize_bitmap) - 1;
+
+ priv->test_oa =
+ oalog2_mod(0x74a71445deadbeef, priv->common->max_oasz_lg2);
+
+ /*
+ * We run out of VA space if the mappings get too big, make something
+ * smaller that can safely pass through dma_addr_t API.
+ */
+ va_lg2sz = priv->common->max_vasz_lg2;
+ if (IS_32BIT && va_lg2sz > 32)
+ va_lg2sz = 32;
+ priv->safe_pgsize_bitmap =
+ log2_mod(priv->info.pgsize_bitmap, va_lg2sz - 1);
+
+ return 0;
+}
+
+#endif
diff --git a/drivers/iommu/generic_pt/kunit_iommu_pt.h b/drivers/iommu/generic_pt/kunit_iommu_pt.h
new file mode 100644
index 000000000000..e8a63c8ea850
--- /dev/null
+++ b/drivers/iommu/generic_pt/kunit_iommu_pt.h
@@ -0,0 +1,487 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#include "kunit_iommu.h"
+#include "pt_iter.h"
+#include <linux/generic_pt/iommu.h>
+#include <linux/iommu.h>
+
+static void do_map(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa,
+ pt_vaddr_t len);
+
+struct count_valids {
+ u64 per_size[PT_VADDR_MAX_LG2];
+};
+
+static int __count_valids(struct pt_range *range, void *arg, unsigned int level,
+ struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ struct count_valids *valids = arg;
+
+ for_each_pt_level_entry(&pts) {
+ if (pts.type == PT_ENTRY_TABLE) {
+ pt_descend(&pts, arg, __count_valids);
+ continue;
+ }
+ if (pts.type == PT_ENTRY_OA) {
+ valids->per_size[pt_entry_oa_lg2sz(&pts)]++;
+ continue;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Number of valid table entries. This counts contiguous entries as a single
+ * valid.
+ */
+static unsigned int count_valids(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range range = pt_top_range(priv->common);
+ struct count_valids valids = {};
+ u64 total = 0;
+ unsigned int i;
+
+ KUNIT_ASSERT_NO_ERRNO(test,
+ pt_walk_range(&range, __count_valids, &valids));
+
+ for (i = 0; i != ARRAY_SIZE(valids.per_size); i++)
+ total += valids.per_size[i];
+ return total;
+}
+
+/* Only a single page size is present, count the number of valid entries */
+static unsigned int count_valids_single(struct kunit *test, pt_vaddr_t pgsz)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range range = pt_top_range(priv->common);
+ struct count_valids valids = {};
+ u64 total = 0;
+ unsigned int i;
+
+ KUNIT_ASSERT_NO_ERRNO(test,
+ pt_walk_range(&range, __count_valids, &valids));
+
+ for (i = 0; i != ARRAY_SIZE(valids.per_size); i++) {
+ if ((1ULL << i) == pgsz)
+ total = valids.per_size[i];
+ else
+ KUNIT_ASSERT_EQ(test, valids.per_size[i], 0);
+ }
+ return total;
+}
+
+static void do_unmap(struct kunit *test, pt_vaddr_t va, pt_vaddr_t len)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ size_t ret;
+
+ ret = iommu_unmap(&priv->domain, va, len);
+ KUNIT_ASSERT_EQ(test, ret, len);
+}
+
+static void check_iova(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa,
+ pt_vaddr_t len)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ pt_vaddr_t pfn = log2_div(va, priv->smallest_pgsz_lg2);
+ pt_vaddr_t end_pfn = pfn + log2_div(len, priv->smallest_pgsz_lg2);
+
+ for (; pfn != end_pfn; pfn++) {
+ phys_addr_t res = iommu_iova_to_phys(&priv->domain,
+ pfn * priv->smallest_pgsz);
+
+ KUNIT_ASSERT_EQ(test, res, (phys_addr_t)pa);
+ if (res != pa)
+ break;
+ pa += priv->smallest_pgsz;
+ }
+}
+
+static void test_increase_level(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_common *common = priv->common;
+
+ if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+ kunit_skip(test, "PT_FEAT_DYNAMIC_TOP not set for this format");
+
+ if (IS_32BIT)
+ kunit_skip(test, "Unable to test on 32bit");
+
+ KUNIT_ASSERT_GT(test, common->max_vasz_lg2,
+ pt_top_range(common).max_vasz_lg2);
+
+ /* Add every possible level to the max */
+ while (common->max_vasz_lg2 != pt_top_range(common).max_vasz_lg2) {
+ struct pt_range top_range = pt_top_range(common);
+
+ if (top_range.va == 0)
+ do_map(test, top_range.last_va + 1, 0,
+ priv->smallest_pgsz);
+ else
+ do_map(test, top_range.va - priv->smallest_pgsz, 0,
+ priv->smallest_pgsz);
+
+ KUNIT_ASSERT_EQ(test, pt_top_range(common).top_level,
+ top_range.top_level + 1);
+ KUNIT_ASSERT_GE(test, common->max_vasz_lg2,
+ pt_top_range(common).max_vasz_lg2);
+ }
+}
+
+static void test_map_simple(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range range = pt_top_range(priv->common);
+ struct count_valids valids = {};
+ pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap;
+ unsigned int pgsz_lg2;
+ pt_vaddr_t cur_va;
+
+ /* Map every reported page size */
+ cur_va = range.va + priv->smallest_pgsz * 256;
+ for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+ pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2);
+ u64 len = log2_to_int(pgsz_lg2);
+
+ if (!(pgsize_bitmap & len))
+ continue;
+
+ cur_va = ALIGN(cur_va, len);
+ do_map(test, cur_va, paddr, len);
+ if (len <= SZ_2G)
+ check_iova(test, cur_va, paddr, len);
+ cur_va += len;
+ }
+
+ /* The read interface reports that every page size was created */
+ range = pt_top_range(priv->common);
+ KUNIT_ASSERT_NO_ERRNO(test,
+ pt_walk_range(&range, __count_valids, &valids));
+ for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+ if (pgsize_bitmap & (1ULL << pgsz_lg2))
+ KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 1);
+ else
+ KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 0);
+ }
+
+ /* Unmap works */
+ range = pt_top_range(priv->common);
+ cur_va = range.va + priv->smallest_pgsz * 256;
+ for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+ u64 len = log2_to_int(pgsz_lg2);
+
+ if (!(pgsize_bitmap & len))
+ continue;
+ cur_va = ALIGN(cur_va, len);
+ do_unmap(test, cur_va, len);
+ cur_va += len;
+ }
+ KUNIT_ASSERT_EQ(test, count_valids(test), 0);
+}
+
+/*
+ * Test to convert a table pointer into an OA by mapping something small,
+ * unmapping it so as to leave behind a table pointer, then mapping something
+ * larger that will convert the table into an OA.
+ */
+static void test_map_table_to_oa(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ pt_vaddr_t limited_pgbitmap =
+ priv->info.pgsize_bitmap % (IS_32BIT ? SZ_2G : SZ_16G);
+ struct pt_range range = pt_top_range(priv->common);
+ unsigned int pgsz_lg2;
+ pt_vaddr_t max_pgsize;
+ pt_vaddr_t cur_va;
+
+ max_pgsize = 1ULL << (vafls(limited_pgbitmap) - 1);
+ KUNIT_ASSERT_TRUE(test, priv->info.pgsize_bitmap & max_pgsize);
+
+ for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+ pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2);
+ u64 len = log2_to_int(pgsz_lg2);
+ pt_vaddr_t offset;
+
+ if (!(priv->info.pgsize_bitmap & len))
+ continue;
+ if (len > max_pgsize)
+ break;
+
+ cur_va = ALIGN(range.va + priv->smallest_pgsz * 256,
+ max_pgsize);
+ for (offset = 0; offset != max_pgsize; offset += len)
+ do_map(test, cur_va + offset, paddr + offset, len);
+ check_iova(test, cur_va, paddr, max_pgsize);
+ KUNIT_ASSERT_EQ(test, count_valids_single(test, len),
+ log2_div(max_pgsize, pgsz_lg2));
+
+ if (len == max_pgsize) {
+ do_unmap(test, cur_va, max_pgsize);
+ } else {
+ do_unmap(test, cur_va, max_pgsize / 2);
+ for (offset = max_pgsize / 2; offset != max_pgsize;
+ offset += len)
+ do_unmap(test, cur_va + offset, len);
+ }
+
+ KUNIT_ASSERT_EQ(test, count_valids(test), 0);
+ }
+}
+
+/*
+ * Test unmapping a small page at the start of a large page. This always unmaps
+ * the large page.
+ */
+static void test_unmap_split(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range top_range = pt_top_range(priv->common);
+ pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap;
+ unsigned int pgsz_lg2;
+ unsigned int count = 0;
+
+ for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+ pt_vaddr_t base_len = log2_to_int(pgsz_lg2);
+ unsigned int next_pgsz_lg2;
+
+ if (!(pgsize_bitmap & base_len))
+ continue;
+
+ for (next_pgsz_lg2 = pgsz_lg2 + 1;
+ next_pgsz_lg2 != PT_VADDR_MAX_LG2; next_pgsz_lg2++) {
+ pt_vaddr_t next_len = log2_to_int(next_pgsz_lg2);
+ pt_vaddr_t vaddr = top_range.va;
+ pt_oaddr_t paddr = 0;
+ size_t gnmapped;
+
+ if (!(pgsize_bitmap & next_len))
+ continue;
+
+ do_map(test, vaddr, paddr, next_len);
+ gnmapped = iommu_unmap(&priv->domain, vaddr, base_len);
+ KUNIT_ASSERT_EQ(test, gnmapped, next_len);
+
+ /* Make sure unmap doesn't keep going */
+ do_map(test, vaddr, paddr, next_len);
+ do_map(test, vaddr + next_len, paddr, next_len);
+ gnmapped = iommu_unmap(&priv->domain, vaddr, base_len);
+ KUNIT_ASSERT_EQ(test, gnmapped, next_len);
+ gnmapped = iommu_unmap(&priv->domain, vaddr + next_len,
+ next_len);
+ KUNIT_ASSERT_EQ(test, gnmapped, next_len);
+
+ count++;
+ }
+ }
+
+ if (count == 0)
+ kunit_skip(test, "Test needs two page sizes");
+}
+
+static void unmap_collisions(struct kunit *test, struct maple_tree *mt,
+ pt_vaddr_t start, pt_vaddr_t last)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ MA_STATE(mas, mt, start, last);
+ void *entry;
+
+ mtree_lock(mt);
+ mas_for_each(&mas, entry, last) {
+ pt_vaddr_t mas_start = mas.index;
+ pt_vaddr_t len = (mas.last - mas_start) + 1;
+ pt_oaddr_t paddr;
+
+ mas_erase(&mas);
+ mas_pause(&mas);
+ mtree_unlock(mt);
+
+ paddr = oalog2_mod(mas_start, priv->common->max_oasz_lg2);
+ check_iova(test, mas_start, paddr, len);
+ do_unmap(test, mas_start, len);
+ mtree_lock(mt);
+ }
+ mtree_unlock(mt);
+}
+
+static void clamp_range(struct kunit *test, struct pt_range *range)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+
+ if (range->last_va - range->va > SZ_1G)
+ range->last_va = range->va + SZ_1G;
+ KUNIT_ASSERT_NE(test, range->last_va, PT_VADDR_MAX);
+ if (range->va <= MAPLE_RESERVED_RANGE)
+ range->va =
+ ALIGN(MAPLE_RESERVED_RANGE, priv->smallest_pgsz);
+}
+
+/*
+ * Randomly map and unmap ranges that can large physical pages. If a random
+ * range overlaps with existing ranges then unmap them. This hits all the
+ * special cases.
+ */
+static void test_random_map(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range upper_range = pt_upper_range(priv->common);
+ struct pt_range top_range = pt_top_range(priv->common);
+ struct maple_tree mt;
+ unsigned int iter;
+
+ mt_init(&mt);
+
+ /*
+ * Shrink the range so randomization is more likely to have
+ * intersections
+ */
+ clamp_range(test, &top_range);
+ clamp_range(test, &upper_range);
+
+ for (iter = 0; iter != 1000; iter++) {
+ struct pt_range *range = &top_range;
+ pt_oaddr_t paddr;
+ pt_vaddr_t start;
+ pt_vaddr_t end;
+ int ret;
+
+ if (pt_feature(priv->common, PT_FEAT_SIGN_EXTEND) &&
+ ULONG_MAX >= PT_VADDR_MAX && get_random_u32_inclusive(0, 1))
+ range = &upper_range;
+
+ start = get_random_u32_below(
+ min(U32_MAX, range->last_va - range->va));
+ end = get_random_u32_below(
+ min(U32_MAX, range->last_va - start));
+
+ start = ALIGN_DOWN(start, priv->smallest_pgsz);
+ end = ALIGN(end, priv->smallest_pgsz);
+ start += range->va;
+ end += start;
+ if (start < range->va || end > range->last_va + 1 ||
+ start >= end)
+ continue;
+
+ /* Try overmapping to test the failure handling */
+ paddr = oalog2_mod(start, priv->common->max_oasz_lg2);
+ ret = iommu_map(&priv->domain, start, paddr, end - start,
+ IOMMU_READ | IOMMU_WRITE, GFP_KERNEL);
+ if (ret) {
+ KUNIT_ASSERT_EQ(test, ret, -EADDRINUSE);
+ unmap_collisions(test, &mt, start, end - 1);
+ do_map(test, start, paddr, end - start);
+ }
+
+ KUNIT_ASSERT_NO_ERRNO_FN(test, "mtree_insert_range",
+ mtree_insert_range(&mt, start, end - 1,
+ XA_ZERO_ENTRY,
+ GFP_KERNEL));
+
+ check_iova(test, start, paddr, end - start);
+ if (iter % 100)
+ cond_resched();
+ }
+
+ unmap_collisions(test, &mt, 0, PT_VADDR_MAX);
+ KUNIT_ASSERT_EQ(test, count_valids(test), 0);
+
+ mtree_destroy(&mt);
+}
+
+/* See https://lore.kernel.org/r/b9b18a03-63a2-4065-a27e-d92dd5c860bc@amd.com */
+static void test_pgsize_boundary(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range top_range = pt_top_range(priv->common);
+
+ if (top_range.va != 0 || top_range.last_va < 0xfef9ffff ||
+ priv->smallest_pgsz != SZ_4K)
+ kunit_skip(test, "Format does not have the required range");
+
+ do_map(test, 0xfef80000, 0x208b95d000, 0xfef9ffff - 0xfef80000 + 1);
+}
+
+/* See https://lore.kernel.org/r/20250826143816.38686-1-eugkoira@amazon.com */
+static void test_mixed(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range top_range = pt_top_range(priv->common);
+ u64 start = 0x3fe400ULL << 12;
+ u64 end = 0x4c0600ULL << 12;
+ pt_vaddr_t len = end - start;
+ pt_oaddr_t oa = start;
+
+ if (top_range.last_va <= start || sizeof(unsigned long) == 4)
+ kunit_skip(test, "range is too small");
+ if ((priv->safe_pgsize_bitmap & GENMASK(30, 21)) != (BIT(30) | BIT(21)))
+ kunit_skip(test, "incompatible psize");
+
+ do_map(test, start, oa, len);
+ /* 14 2M, 3 1G, 3 2M */
+ KUNIT_ASSERT_EQ(test, count_valids(test), 20);
+ check_iova(test, start, oa, len);
+}
+
+static struct kunit_case iommu_test_cases[] = {
+ KUNIT_CASE_FMT(test_increase_level),
+ KUNIT_CASE_FMT(test_map_simple),
+ KUNIT_CASE_FMT(test_map_table_to_oa),
+ KUNIT_CASE_FMT(test_unmap_split),
+ KUNIT_CASE_FMT(test_random_map),
+ KUNIT_CASE_FMT(test_pgsize_boundary),
+ KUNIT_CASE_FMT(test_mixed),
+ {},
+};
+
+static int pt_kunit_iommu_init(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv;
+ int ret;
+
+ priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ priv->orig_nr_secondary_pagetable =
+ global_node_page_state(NR_SECONDARY_PAGETABLE);
+ ret = pt_kunit_priv_init(test, priv);
+ if (ret) {
+ kunit_kfree(test, priv);
+ return ret;
+ }
+ test->priv = priv;
+ return 0;
+}
+
+static void pt_kunit_iommu_exit(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+
+ if (!test->priv)
+ return;
+
+ pt_iommu_deinit(priv->iommu);
+ /*
+ * Look for memory leaks, assumes kunit is running isolated and nothing
+ * else is using secondary page tables.
+ */
+ KUNIT_ASSERT_EQ(test, priv->orig_nr_secondary_pagetable,
+ global_node_page_state(NR_SECONDARY_PAGETABLE));
+ kunit_kfree(test, test->priv);
+}
+
+static struct kunit_suite NS(iommu_suite) = {
+ .name = __stringify(NS(iommu_test)),
+ .init = pt_kunit_iommu_init,
+ .exit = pt_kunit_iommu_exit,
+ .test_cases = iommu_test_cases,
+};
+kunit_test_suites(&NS(iommu_suite));
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Kunit for generic page table");
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
diff --git a/drivers/iommu/generic_pt/pt_common.h b/drivers/iommu/generic_pt/pt_common.h
new file mode 100644
index 000000000000..e1123d35c907
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_common.h
@@ -0,0 +1,389 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * This header is included after the format. It contains definitions
+ * that build on the format definitions to create the basic format API.
+ *
+ * The format API is listed here, with kdocs. The functions without bodies are
+ * implemented in the format using the pattern:
+ * static inline FMTpt_XXX(..) {..}
+ * #define pt_XXX FMTpt_XXX
+ *
+ * If the format doesn't implement a function then pt_fmt_defaults.h can provide
+ * a generic version.
+ *
+ * The routines marked "@pts: Entry to query" operate on the entire contiguous
+ * entry and can be called with a pts->index pointing to any sub item that makes
+ * up that entry.
+ *
+ * The header order is:
+ * pt_defs.h
+ * FMT.h
+ * pt_common.h
+ */
+#ifndef __GENERIC_PT_PT_COMMON_H
+#define __GENERIC_PT_PT_COMMON_H
+
+#include "pt_defs.h"
+#include "pt_fmt_defaults.h"
+
+/**
+ * pt_attr_from_entry() - Convert the permission bits back to attrs
+ * @pts: Entry to convert from
+ * @attrs: Resulting attrs
+ *
+ * Fill in the attrs with the permission bits encoded in the current leaf entry.
+ * The attrs should be usable with pt_install_leaf_entry() to reconstruct the
+ * same entry.
+ */
+static inline void pt_attr_from_entry(const struct pt_state *pts,
+ struct pt_write_attrs *attrs);
+
+/**
+ * pt_can_have_leaf() - True if the current level can have an OA entry
+ * @pts: The current level
+ *
+ * True if the current level can support pt_install_leaf_entry(). A leaf
+ * entry produce an OA.
+ */
+static inline bool pt_can_have_leaf(const struct pt_state *pts);
+
+/**
+ * pt_can_have_table() - True if the current level can have a lower table
+ * @pts: The current level
+ *
+ * Every level except 0 is allowed to have a lower table.
+ */
+static inline bool pt_can_have_table(const struct pt_state *pts)
+{
+ /* No further tables at level 0 */
+ return pts->level > 0;
+}
+
+/**
+ * pt_clear_entries() - Make entries empty (non-present)
+ * @pts: Starting table index
+ * @num_contig_lg2: Number of contiguous items to clear
+ *
+ * Clear a run of entries. A cleared entry will load back as PT_ENTRY_EMPTY
+ * and does not have any effect on table walking. The starting index must be
+ * aligned to num_contig_lg2.
+ */
+static inline void pt_clear_entries(struct pt_state *pts,
+ unsigned int num_contig_lg2);
+
+/**
+ * pt_entry_make_write_dirty() - Make an entry dirty
+ * @pts: Table entry to change
+ *
+ * Make pt_entry_is_write_dirty() return true for this entry. This can be called
+ * asynchronously with any other table manipulation under a RCU lock and must
+ * not corrupt the table.
+ */
+static inline bool pt_entry_make_write_dirty(struct pt_state *pts);
+
+/**
+ * pt_entry_make_write_clean() - Make the entry write clean
+ * @pts: Table entry to change
+ *
+ * Modify the entry so that pt_entry_is_write_dirty() == false. The HW will
+ * eventually be notified of this change via a TLB flush, which is the point
+ * that the HW must become synchronized. Any "write dirty" prior to the TLB
+ * flush can be lost, but once the TLB flush completes all writes must make
+ * their entries write dirty.
+ *
+ * The format should alter the entry in a way that is compatible with any
+ * concurrent update from HW. The entire contiguous entry is changed.
+ */
+static inline void pt_entry_make_write_clean(struct pt_state *pts);
+
+/**
+ * pt_entry_is_write_dirty() - True if the entry has been written to
+ * @pts: Entry to query
+ *
+ * "write dirty" means that the HW has written to the OA translated
+ * by this entry. If the entry is contiguous then the consolidated
+ * "write dirty" for all the items must be returned.
+ */
+static inline bool pt_entry_is_write_dirty(const struct pt_state *pts);
+
+/**
+ * pt_dirty_supported() - True if the page table supports dirty tracking
+ * @common: Page table to query
+ */
+static inline bool pt_dirty_supported(struct pt_common *common);
+
+/**
+ * pt_entry_num_contig_lg2() - Number of contiguous items for this leaf entry
+ * @pts: Entry to query
+ *
+ * Return the number of contiguous items this leaf entry spans. If the entry
+ * is single item it returns ilog2(1).
+ */
+static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts);
+
+/**
+ * pt_entry_oa() - Output Address for this leaf entry
+ * @pts: Entry to query
+ *
+ * Return the output address for the start of the entry. If the entry
+ * is contiguous this returns the same value for each sub-item. I.e.::
+ *
+ * log2_mod(pt_entry_oa(), pt_entry_oa_lg2sz()) == 0
+ *
+ * See pt_item_oa(). The format should implement one of these two functions
+ * depending on how it stores the OAs in the table.
+ */
+static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts);
+
+/**
+ * pt_entry_oa_lg2sz() - Return the size of an OA entry
+ * @pts: Entry to query
+ *
+ * If the entry is not contiguous this returns pt_table_item_lg2sz(), otherwise
+ * it returns the total VA/OA size of the entire contiguous entry.
+ */
+static inline unsigned int pt_entry_oa_lg2sz(const struct pt_state *pts)
+{
+ return pt_entry_num_contig_lg2(pts) + pt_table_item_lg2sz(pts);
+}
+
+/**
+ * pt_entry_oa_exact() - Return the complete OA for an entry
+ * @pts: Entry to query
+ *
+ * During iteration the first entry could have a VA with an offset from the
+ * natural start of the entry. Return the exact OA including the pts's VA
+ * offset.
+ */
+static inline pt_oaddr_t pt_entry_oa_exact(const struct pt_state *pts)
+{
+ return _pt_entry_oa_fast(pts) |
+ log2_mod(pts->range->va, pt_entry_oa_lg2sz(pts));
+}
+
+/**
+ * pt_full_va_prefix() - The top bits of the VA
+ * @common: Page table to query
+ *
+ * This is usually 0, but some formats have their VA space going downward from
+ * PT_VADDR_MAX, and will return that instead. This value must always be
+ * adjusted by struct pt_common max_vasz_lg2.
+ */
+static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common);
+
+/**
+ * pt_has_system_page_size() - True if level 0 can install a PAGE_SHIFT entry
+ * @common: Page table to query
+ *
+ * If true the caller can use, at level 0, pt_install_leaf_entry(PAGE_SHIFT).
+ * This is useful to create optimized paths for common cases of PAGE_SIZE
+ * mappings.
+ */
+static inline bool pt_has_system_page_size(const struct pt_common *common);
+
+/**
+ * pt_install_leaf_entry() - Write a leaf entry to the table
+ * @pts: Table index to change
+ * @oa: Output Address for this leaf
+ * @oasz_lg2: Size in VA/OA for this leaf
+ * @attrs: Attributes to modify the entry
+ *
+ * A leaf OA entry will return PT_ENTRY_OA from pt_load_entry(). It translates
+ * the VA indicated by pts to the given OA.
+ *
+ * For a single item non-contiguous entry oasz_lg2 is pt_table_item_lg2sz().
+ * For contiguous it is pt_table_item_lg2sz() + num_contig_lg2.
+ *
+ * This must not be called if pt_can_have_leaf() == false. Contiguous sizes
+ * not indicated by pt_possible_sizes() must not be specified.
+ */
+static inline void pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
+ unsigned int oasz_lg2,
+ const struct pt_write_attrs *attrs);
+
+/**
+ * pt_install_table() - Write a table entry to the table
+ * @pts: Table index to change
+ * @table_pa: CPU physical address of the lower table's memory
+ * @attrs: Attributes to modify the table index
+ *
+ * A table entry will return PT_ENTRY_TABLE from pt_load_entry(). The table_pa
+ * is the table at pts->level - 1. This is done by cmpxchg so pts must have the
+ * current entry loaded. The pts is updated with the installed entry.
+ *
+ * This must not be called if pt_can_have_table() == false.
+ *
+ * Returns: true if the table was installed successfully.
+ */
+static inline bool pt_install_table(struct pt_state *pts, pt_oaddr_t table_pa,
+ const struct pt_write_attrs *attrs);
+
+/**
+ * pt_item_oa() - Output Address for this leaf item
+ * @pts: Item to query
+ *
+ * Return the output address for this item. If the item is part of a contiguous
+ * entry it returns the value of the OA for this individual sub item.
+ *
+ * See pt_entry_oa(). The format should implement one of these two functions
+ * depending on how it stores the OA's in the table.
+ */
+static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts);
+
+/**
+ * pt_load_entry_raw() - Read from the location pts points at into the pts
+ * @pts: Table index to load
+ *
+ * Return the type of entry that was loaded. pts->entry will be filled in with
+ * the entry's content. See pt_load_entry()
+ */
+static inline enum pt_entry_type pt_load_entry_raw(struct pt_state *pts);
+
+/**
+ * pt_max_oa_lg2() - Return the maximum OA the table format can hold
+ * @common: Page table to query
+ *
+ * The value oalog2_to_max_int(pt_max_oa_lg2()) is the MAX for the
+ * OA. This is the absolute maximum address the table can hold. struct pt_common
+ * max_oasz_lg2 sets a lower dynamic maximum based on HW capability.
+ */
+static inline unsigned int
+pt_max_oa_lg2(const struct pt_common *common);
+
+/**
+ * pt_num_items_lg2() - Return the number of items in this table level
+ * @pts: The current level
+ *
+ * The number of items in a table level defines the number of bits this level
+ * decodes from the VA. This function is not called for the top level,
+ * so it does not need to compute a special value for the top case. The
+ * result for the top is based on pt_common max_vasz_lg2.
+ *
+ * The value is used as part of determining the table indexes via the
+ * equation::
+ *
+ * log2_mod(log2_div(VA, pt_table_item_lg2sz()), pt_num_items_lg2())
+ */
+static inline unsigned int pt_num_items_lg2(const struct pt_state *pts);
+
+/**
+ * pt_pgsz_lg2_to_level - Return the level that maps the page size
+ * @common: Page table to query
+ * @pgsize_lg2: Log2 page size
+ *
+ * Returns the table level that will map the given page size. The page
+ * size must be part of the pt_possible_sizes() for some level.
+ */
+static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common,
+ unsigned int pgsize_lg2);
+
+/**
+ * pt_possible_sizes() - Return a bitmap of possible output sizes at this level
+ * @pts: The current level
+ *
+ * Each level has a list of possible output sizes that can be installed as
+ * leaf entries. If pt_can_have_leaf() is false returns zero.
+ *
+ * Otherwise the bit in position pt_table_item_lg2sz() should be set indicating
+ * that a non-contiguous single item leaf entry is supported. The following
+ * pt_num_items_lg2() number of bits can be set indicating contiguous entries
+ * are supported. Bit pt_table_item_lg2sz() + pt_num_items_lg2() must not be
+ * set, contiguous entries cannot span the entire table.
+ *
+ * The OR of pt_possible_sizes() of all levels is the typical bitmask of all
+ * supported sizes in the entire table.
+ */
+static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts);
+
+/**
+ * pt_table_item_lg2sz() - Size of a single item entry in this table level
+ * @pts: The current level
+ *
+ * The size of the item specifies how much VA and OA a single item occupies.
+ *
+ * See pt_entry_oa_lg2sz() for the same value including the effect of contiguous
+ * entries.
+ */
+static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts);
+
+/**
+ * pt_table_oa_lg2sz() - Return the VA/OA size of the entire table
+ * @pts: The current level
+ *
+ * Return the size of VA decoded by the entire table level.
+ */
+static inline unsigned int pt_table_oa_lg2sz(const struct pt_state *pts)
+{
+ if (pts->range->top_level == pts->level)
+ return pts->range->max_vasz_lg2;
+ return min_t(unsigned int, pts->range->common->max_vasz_lg2,
+ pt_num_items_lg2(pts) + pt_table_item_lg2sz(pts));
+}
+
+/**
+ * pt_table_pa() - Return the CPU physical address of the table entry
+ * @pts: Entry to query
+ *
+ * This is only ever called on PT_ENTRY_TABLE entries. Must return the same
+ * value passed to pt_install_table().
+ */
+static inline pt_oaddr_t pt_table_pa(const struct pt_state *pts);
+
+/**
+ * pt_table_ptr() - Return a CPU pointer for a table item
+ * @pts: Entry to query
+ *
+ * Same as pt_table_pa() but returns a CPU pointer.
+ */
+static inline struct pt_table_p *pt_table_ptr(const struct pt_state *pts)
+{
+ return __va(pt_table_pa(pts));
+}
+
+/**
+ * pt_max_sw_bit() - Return the maximum software bit usable for any level and
+ * entry
+ * @common: Page table
+ *
+ * The swbit can be passed as bitnr to the other sw_bit functions.
+ */
+static inline unsigned int pt_max_sw_bit(struct pt_common *common);
+
+/**
+ * pt_test_sw_bit_acquire() - Read a software bit in an item
+ * @pts: Entry to read
+ * @bitnr: Bit to read
+ *
+ * Software bits are ignored by HW and can be used for any purpose by the
+ * software. This does a test bit and acquire operation.
+ */
+static inline bool pt_test_sw_bit_acquire(struct pt_state *pts,
+ unsigned int bitnr);
+
+/**
+ * pt_set_sw_bit_release() - Set a software bit in an item
+ * @pts: Entry to set
+ * @bitnr: Bit to set
+ *
+ * Software bits are ignored by HW and can be used for any purpose by the
+ * software. This does a set bit and release operation.
+ */
+static inline void pt_set_sw_bit_release(struct pt_state *pts,
+ unsigned int bitnr);
+
+/**
+ * pt_load_entry() - Read from the location pts points at into the pts
+ * @pts: Table index to load
+ *
+ * Set the type of entry that was loaded. pts->entry and pts->table_lower
+ * will be filled in with the entry's content.
+ */
+static inline void pt_load_entry(struct pt_state *pts)
+{
+ pts->type = pt_load_entry_raw(pts);
+ if (pts->type == PT_ENTRY_TABLE)
+ pts->table_lower = pt_table_ptr(pts);
+}
+#endif
diff --git a/drivers/iommu/generic_pt/pt_defs.h b/drivers/iommu/generic_pt/pt_defs.h
new file mode 100644
index 000000000000..c25544d72f97
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_defs.h
@@ -0,0 +1,332 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * This header is included before the format. It contains definitions
+ * that are required to compile the format. The header order is:
+ * pt_defs.h
+ * fmt_XX.h
+ * pt_common.h
+ */
+#ifndef __GENERIC_PT_DEFS_H
+#define __GENERIC_PT_DEFS_H
+
+#include <linux/generic_pt/common.h>
+
+#include <linux/types.h>
+#include <linux/atomic.h>
+#include <linux/bits.h>
+#include <linux/limits.h>
+#include <linux/bug.h>
+#include <linux/kconfig.h>
+#include "pt_log2.h"
+
+/* Header self-compile default defines */
+#ifndef pt_write_attrs
+typedef u64 pt_vaddr_t;
+typedef u64 pt_oaddr_t;
+#endif
+
+struct pt_table_p;
+
+enum {
+ PT_VADDR_MAX = sizeof(pt_vaddr_t) == 8 ? U64_MAX : U32_MAX,
+ PT_VADDR_MAX_LG2 = sizeof(pt_vaddr_t) == 8 ? 64 : 32,
+ PT_OADDR_MAX = sizeof(pt_oaddr_t) == 8 ? U64_MAX : U32_MAX,
+ PT_OADDR_MAX_LG2 = sizeof(pt_oaddr_t) == 8 ? 64 : 32,
+};
+
+/*
+ * The format instantiation can have features wired off or on to optimize the
+ * code gen. Supported features are just a reflection of what the current set of
+ * kernel users want to use.
+ */
+#ifndef PT_SUPPORTED_FEATURES
+#define PT_SUPPORTED_FEATURES 0
+#endif
+
+/*
+ * When in debug mode we compile all formats with all features. This allows the
+ * kunit to test the full matrix. SIGN_EXTEND can't co-exist with DYNAMIC_TOP or
+ * FULL_VA. DMA_INCOHERENT requires a SW bit that not all formats have
+ */
+#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)
+enum {
+ PT_ORIG_SUPPORTED_FEATURES = PT_SUPPORTED_FEATURES,
+ PT_DEBUG_SUPPORTED_FEATURES =
+ UINT_MAX &
+ ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_DMA_INCOHERENT) ?
+ 0 :
+ BIT(PT_FEAT_DMA_INCOHERENT))) &
+ ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_SIGN_EXTEND)) ?
+ BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_FULL_VA) :
+ BIT(PT_FEAT_SIGN_EXTEND)),
+};
+#undef PT_SUPPORTED_FEATURES
+#define PT_SUPPORTED_FEATURES PT_DEBUG_SUPPORTED_FEATURES
+#endif
+
+#ifndef PT_FORCE_ENABLED_FEATURES
+#define PT_FORCE_ENABLED_FEATURES 0
+#endif
+
+/**
+ * DOC: Generic Page Table Language
+ *
+ * Language used in Generic Page Table
+ * VA
+ * The input address to the page table, often the virtual address.
+ * OA
+ * The output address from the page table, often the physical address.
+ * leaf
+ * An entry that results in an output address.
+ * start/end
+ * An half-open range, e.g. [0,0) refers to no VA.
+ * start/last
+ * An inclusive closed range, e.g. [0,0] refers to the VA 0
+ * common
+ * The generic page table container struct pt_common
+ * level
+ * Level 0 is always a table of only leaves with no futher table pointers.
+ * Increasing levels increase the size of the table items. The least
+ * significant VA bits used to index page tables are used to index the Level
+ * 0 table. The various labels for table levels used by HW descriptions are
+ * not used.
+ * top_level
+ * The inclusive highest level of the table. A two-level table
+ * has a top level of 1.
+ * table
+ * A linear array of translation items for that level.
+ * index
+ * The position in a table of an element: item = table[index]
+ * item
+ * A single index in a table
+ * entry
+ * A single logical element in a table. If contiguous pages are not
+ * supported then item and entry are the same thing, otherwise entry refers
+ * to all the items that comprise a single contiguous translation.
+ * item/entry_size
+ * The number of bytes of VA the table index translates for.
+ * If the item is a table entry then the next table covers
+ * this size. If the entry translates to an output address then the
+ * full OA is: OA | (VA % entry_size)
+ * contig_count
+ * The number of consecutive items fused into a single entry.
+ * item_size * contig_count is the size of that entry's translation.
+ * lg2
+ * Indicates the value is encoded as log2, i.e. 1<<x is the actual value.
+ * Normally the compiler is fine to optimize divide and mod with log2 values
+ * automatically when inlining, however if the values are not constant
+ * expressions it can't. So we do it by hand; we want to avoid 64-bit
+ * divmod.
+ */
+
+/* Returned by pt_load_entry() and for_each_pt_level_entry() */
+enum pt_entry_type {
+ PT_ENTRY_EMPTY,
+ /* Entry is valid and points to a lower table level */
+ PT_ENTRY_TABLE,
+ /* Entry is valid and returns an output address */
+ PT_ENTRY_OA,
+};
+
+struct pt_range {
+ struct pt_common *common;
+ struct pt_table_p *top_table;
+ pt_vaddr_t va;
+ pt_vaddr_t last_va;
+ u8 top_level;
+ u8 max_vasz_lg2;
+};
+
+/*
+ * Similar to xa_state, this records information about an in-progress parse at a
+ * single level.
+ */
+struct pt_state {
+ struct pt_range *range;
+ struct pt_table_p *table;
+ struct pt_table_p *table_lower;
+ u64 entry;
+ enum pt_entry_type type;
+ unsigned short index;
+ unsigned short end_index;
+ u8 level;
+};
+
+#define pt_cur_table(pts, type) ((type *)((pts)->table))
+
+/*
+ * Try to install a new table pointer. The locking methodology requires this to
+ * be atomic (multiple threads can race to install a pointer). The losing
+ * threads will fail the atomic and return false. They should free any memory
+ * and reparse the table level again.
+ */
+#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64)
+static inline bool pt_table_install64(struct pt_state *pts, u64 table_entry)
+{
+ u64 *entryp = pt_cur_table(pts, u64) + pts->index;
+ u64 old_entry = pts->entry;
+ bool ret;
+
+ /*
+ * Ensure the zero'd table content itself is visible before its PTE can
+ * be. release is a NOP on !SMP, but the HW is still doing an acquire.
+ */
+ if (!IS_ENABLED(CONFIG_SMP))
+ dma_wmb();
+ ret = try_cmpxchg64_release(entryp, &old_entry, table_entry);
+ if (ret)
+ pts->entry = table_entry;
+ return ret;
+}
+#endif
+
+static inline bool pt_table_install32(struct pt_state *pts, u32 table_entry)
+{
+ u32 *entryp = pt_cur_table(pts, u32) + pts->index;
+ u32 old_entry = pts->entry;
+ bool ret;
+
+ /*
+ * Ensure the zero'd table content itself is visible before its PTE can
+ * be. release is a NOP on !SMP, but the HW is still doing an acquire.
+ */
+ if (!IS_ENABLED(CONFIG_SMP))
+ dma_wmb();
+ ret = try_cmpxchg_release(entryp, &old_entry, table_entry);
+ if (ret)
+ pts->entry = table_entry;
+ return ret;
+}
+
+#define PT_SUPPORTED_FEATURE(feature_nr) (PT_SUPPORTED_FEATURES & BIT(feature_nr))
+
+static inline bool pt_feature(const struct pt_common *common,
+ unsigned int feature_nr)
+{
+ if (PT_FORCE_ENABLED_FEATURES & BIT(feature_nr))
+ return true;
+ if (!PT_SUPPORTED_FEATURE(feature_nr))
+ return false;
+ return common->features & BIT(feature_nr);
+}
+
+static inline bool pts_feature(const struct pt_state *pts,
+ unsigned int feature_nr)
+{
+ return pt_feature(pts->range->common, feature_nr);
+}
+
+/*
+ * PT_WARN_ON is used for invariants that the kunit should be checking can't
+ * happen.
+ */
+#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)
+#define PT_WARN_ON WARN_ON
+#else
+static inline bool PT_WARN_ON(bool condition)
+{
+ return false;
+}
+#endif
+
+/* These all work on the VA type */
+#define log2_to_int(a_lg2) log2_to_int_t(pt_vaddr_t, a_lg2)
+#define log2_to_max_int(a_lg2) log2_to_max_int_t(pt_vaddr_t, a_lg2)
+#define log2_div(a, b_lg2) log2_div_t(pt_vaddr_t, a, b_lg2)
+#define log2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_vaddr_t, a, b, c_lg2)
+#define log2_mod(a, b_lg2) log2_mod_t(pt_vaddr_t, a, b_lg2)
+#define log2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_vaddr_t, a, b_lg2)
+#define log2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_vaddr_t, a, val, b_lg2)
+#define log2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_vaddr_t, a, b_lg2)
+#define log2_mul(a, b_lg2) log2_mul_t(pt_vaddr_t, a, b_lg2)
+#define vaffs(a) ffs_t(pt_vaddr_t, a)
+#define vafls(a) fls_t(pt_vaddr_t, a)
+#define vaffz(a) ffz_t(pt_vaddr_t, a)
+
+/*
+ * The full VA (fva) versions permit the lg2 value to be == PT_VADDR_MAX_LG2 and
+ * generate a useful defined result. The non-fva versions will malfunction at
+ * this extreme.
+ */
+static inline pt_vaddr_t fvalog2_div(pt_vaddr_t a, unsigned int b_lg2)
+{
+ if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
+ return 0;
+ return log2_div_t(pt_vaddr_t, a, b_lg2);
+}
+
+static inline pt_vaddr_t fvalog2_mod(pt_vaddr_t a, unsigned int b_lg2)
+{
+ if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
+ return a;
+ return log2_mod_t(pt_vaddr_t, a, b_lg2);
+}
+
+static inline bool fvalog2_div_eq(pt_vaddr_t a, pt_vaddr_t b,
+ unsigned int c_lg2)
+{
+ if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && c_lg2 == PT_VADDR_MAX_LG2)
+ return true;
+ return log2_div_eq_t(pt_vaddr_t, a, b, c_lg2);
+}
+
+static inline pt_vaddr_t fvalog2_set_mod(pt_vaddr_t a, pt_vaddr_t val,
+ unsigned int b_lg2)
+{
+ if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
+ return val;
+ return log2_set_mod_t(pt_vaddr_t, a, val, b_lg2);
+}
+
+static inline pt_vaddr_t fvalog2_set_mod_max(pt_vaddr_t a, unsigned int b_lg2)
+{
+ if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
+ return PT_VADDR_MAX;
+ return log2_set_mod_max_t(pt_vaddr_t, a, b_lg2);
+}
+
+/* These all work on the OA type */
+#define oalog2_to_int(a_lg2) log2_to_int_t(pt_oaddr_t, a_lg2)
+#define oalog2_to_max_int(a_lg2) log2_to_max_int_t(pt_oaddr_t, a_lg2)
+#define oalog2_div(a, b_lg2) log2_div_t(pt_oaddr_t, a, b_lg2)
+#define oalog2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_oaddr_t, a, b, c_lg2)
+#define oalog2_mod(a, b_lg2) log2_mod_t(pt_oaddr_t, a, b_lg2)
+#define oalog2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_oaddr_t, a, b_lg2)
+#define oalog2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_oaddr_t, a, val, b_lg2)
+#define oalog2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_oaddr_t, a, b_lg2)
+#define oalog2_mul(a, b_lg2) log2_mul_t(pt_oaddr_t, a, b_lg2)
+#define oaffs(a) ffs_t(pt_oaddr_t, a)
+#define oafls(a) fls_t(pt_oaddr_t, a)
+#define oaffz(a) ffz_t(pt_oaddr_t, a)
+
+static inline uintptr_t _pt_top_set(struct pt_table_p *table_mem,
+ unsigned int top_level)
+{
+ return top_level | (uintptr_t)table_mem;
+}
+
+static inline void pt_top_set(struct pt_common *common,
+ struct pt_table_p *table_mem,
+ unsigned int top_level)
+{
+ WRITE_ONCE(common->top_of_table, _pt_top_set(table_mem, top_level));
+}
+
+static inline void pt_top_set_level(struct pt_common *common,
+ unsigned int top_level)
+{
+ pt_top_set(common, NULL, top_level);
+}
+
+static inline unsigned int pt_top_get_level(const struct pt_common *common)
+{
+ return READ_ONCE(common->top_of_table) % (1 << PT_TOP_LEVEL_BITS);
+}
+
+static inline bool pt_check_install_leaf_args(struct pt_state *pts,
+ pt_oaddr_t oa,
+ unsigned int oasz_lg2);
+
+#endif
diff --git a/drivers/iommu/generic_pt/pt_fmt_defaults.h b/drivers/iommu/generic_pt/pt_fmt_defaults.h
new file mode 100644
index 000000000000..69fb7c2314ca
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_fmt_defaults.h
@@ -0,0 +1,295 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Default definitions for formats that don't define these functions.
+ */
+#ifndef __GENERIC_PT_PT_FMT_DEFAULTS_H
+#define __GENERIC_PT_PT_FMT_DEFAULTS_H
+
+#include "pt_defs.h"
+#include <linux/log2.h>
+
+/* Header self-compile default defines */
+#ifndef pt_load_entry_raw
+#include "fmt/amdv1.h"
+#endif
+
+/*
+ * The format must provide PT_GRANULE_LG2SZ, PT_TABLEMEM_LG2SZ, and
+ * PT_ITEM_WORD_SIZE. They must be the same at every level excluding the top.
+ */
+#ifndef pt_table_item_lg2sz
+static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts)
+{
+ return PT_GRANULE_LG2SZ +
+ (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE)) * pts->level;
+}
+#endif
+
+#ifndef pt_pgsz_lg2_to_level
+static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common,
+ unsigned int pgsize_lg2)
+{
+ return ((unsigned int)(pgsize_lg2 - PT_GRANULE_LG2SZ)) /
+ (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE));
+}
+#endif
+
+/*
+ * If not supplied by the format then contiguous pages are not supported.
+ *
+ * If contiguous pages are supported then the format must also provide
+ * pt_contig_count_lg2() if it supports a single contiguous size per level,
+ * or pt_possible_sizes() if it supports multiple sizes per level.
+ */
+#ifndef pt_entry_num_contig_lg2
+static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts)
+{
+ return ilog2(1);
+}
+
+/*
+ * Return the number of contiguous OA items forming an entry at this table level
+ */
+static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts)
+{
+ return ilog2(1);
+}
+#endif
+
+/* If not supplied by the format then dirty tracking is not supported */
+#ifndef pt_entry_is_write_dirty
+static inline bool pt_entry_is_write_dirty(const struct pt_state *pts)
+{
+ return false;
+}
+
+static inline void pt_entry_make_write_clean(struct pt_state *pts)
+{
+}
+
+static inline bool pt_dirty_supported(struct pt_common *common)
+{
+ return false;
+}
+#else
+/* If not supplied then dirty tracking is always enabled */
+#ifndef pt_dirty_supported
+static inline bool pt_dirty_supported(struct pt_common *common)
+{
+ return true;
+}
+#endif
+#endif
+
+#ifndef pt_entry_make_write_dirty
+static inline bool pt_entry_make_write_dirty(struct pt_state *pts)
+{
+ return false;
+}
+#endif
+
+/*
+ * Format supplies either:
+ * pt_entry_oa - OA is at the start of a contiguous entry
+ * or
+ * pt_item_oa - OA is adjusted for every item in a contiguous entry
+ *
+ * Build the missing one
+ *
+ * The internal helper _pt_entry_oa_fast() allows generating
+ * an efficient pt_entry_oa_exact(), it doesn't care which
+ * option is selected.
+ */
+#ifdef pt_entry_oa
+static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts)
+{
+ return pt_entry_oa(pts) |
+ log2_mul(pts->index, pt_table_item_lg2sz(pts));
+}
+#define _pt_entry_oa_fast pt_entry_oa
+#endif
+
+#ifdef pt_item_oa
+static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts)
+{
+ return log2_set_mod(pt_item_oa(pts), 0,
+ pt_entry_num_contig_lg2(pts) +
+ pt_table_item_lg2sz(pts));
+}
+#define _pt_entry_oa_fast pt_item_oa
+#endif
+
+/*
+ * If not supplied by the format then use the constant
+ * PT_MAX_OUTPUT_ADDRESS_LG2.
+ */
+#ifndef pt_max_oa_lg2
+static inline unsigned int
+pt_max_oa_lg2(const struct pt_common *common)
+{
+ return PT_MAX_OUTPUT_ADDRESS_LG2;
+}
+#endif
+
+#ifndef pt_has_system_page_size
+static inline bool pt_has_system_page_size(const struct pt_common *common)
+{
+ return PT_GRANULE_LG2SZ == PAGE_SHIFT;
+}
+#endif
+
+/*
+ * If not supplied by the format then assume only one contiguous size determined
+ * by pt_contig_count_lg2()
+ */
+#ifndef pt_possible_sizes
+static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts);
+
+/* Return a bitmap of possible leaf page sizes at this level */
+static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ if (!pt_can_have_leaf(pts))
+ return 0;
+ return log2_to_int(isz_lg2) |
+ log2_to_int(pt_contig_count_lg2(pts) + isz_lg2);
+}
+#endif
+
+/* If not supplied by the format then use 0. */
+#ifndef pt_full_va_prefix
+static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common)
+{
+ return 0;
+}
+#endif
+
+/* If not supplied by the format then zero fill using PT_ITEM_WORD_SIZE */
+#ifndef pt_clear_entries
+static inline void pt_clear_entries64(struct pt_state *pts,
+ unsigned int num_contig_lg2)
+{
+ u64 *tablep = pt_cur_table(pts, u64) + pts->index;
+ u64 *end = tablep + log2_to_int(num_contig_lg2);
+
+ PT_WARN_ON(log2_mod(pts->index, num_contig_lg2));
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, 0);
+}
+
+static inline void pt_clear_entries32(struct pt_state *pts,
+ unsigned int num_contig_lg2)
+{
+ u32 *tablep = pt_cur_table(pts, u32) + pts->index;
+ u32 *end = tablep + log2_to_int(num_contig_lg2);
+
+ PT_WARN_ON(log2_mod(pts->index, num_contig_lg2));
+ for (; tablep != end; tablep++)
+ WRITE_ONCE(*tablep, 0);
+}
+
+static inline void pt_clear_entries(struct pt_state *pts,
+ unsigned int num_contig_lg2)
+{
+ if (PT_ITEM_WORD_SIZE == sizeof(u32))
+ pt_clear_entries32(pts, num_contig_lg2);
+ else
+ pt_clear_entries64(pts, num_contig_lg2);
+}
+#define pt_clear_entries pt_clear_entries
+#endif
+
+/* If not supplied then SW bits are not supported */
+#ifdef pt_sw_bit
+static inline bool pt_test_sw_bit_acquire(struct pt_state *pts,
+ unsigned int bitnr)
+{
+ /* Acquire, pairs with pt_set_sw_bit_release() */
+ smp_mb();
+ /* For a contiguous entry the sw bit is only stored in the first item. */
+ return pts->entry & pt_sw_bit(bitnr);
+}
+#define pt_test_sw_bit_acquire pt_test_sw_bit_acquire
+
+static inline void pt_set_sw_bit_release(struct pt_state *pts,
+ unsigned int bitnr)
+{
+#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64)
+ if (PT_ITEM_WORD_SIZE == sizeof(u64)) {
+ u64 *entryp = pt_cur_table(pts, u64) + pts->index;
+ u64 old_entry = pts->entry;
+ u64 new_entry;
+
+ do {
+ new_entry = old_entry | pt_sw_bit(bitnr);
+ } while (!try_cmpxchg64_release(entryp, &old_entry, new_entry));
+ pts->entry = new_entry;
+ return;
+ }
+#endif
+ if (PT_ITEM_WORD_SIZE == sizeof(u32)) {
+ u32 *entryp = pt_cur_table(pts, u32) + pts->index;
+ u32 old_entry = pts->entry;
+ u32 new_entry;
+
+ do {
+ new_entry = old_entry | pt_sw_bit(bitnr);
+ } while (!try_cmpxchg_release(entryp, &old_entry, new_entry));
+ pts->entry = new_entry;
+ } else
+ BUILD_BUG();
+}
+#define pt_set_sw_bit_release pt_set_sw_bit_release
+#else
+static inline unsigned int pt_max_sw_bit(struct pt_common *common)
+{
+ return 0;
+}
+
+extern void __pt_no_sw_bit(void);
+static inline bool pt_test_sw_bit_acquire(struct pt_state *pts,
+ unsigned int bitnr)
+{
+ __pt_no_sw_bit();
+ return false;
+}
+
+static inline void pt_set_sw_bit_release(struct pt_state *pts,
+ unsigned int bitnr)
+{
+ __pt_no_sw_bit();
+}
+#endif
+
+/*
+ * Format can call in the pt_install_leaf_entry() to check the arguments are all
+ * aligned correctly.
+ */
+static inline bool pt_check_install_leaf_args(struct pt_state *pts,
+ pt_oaddr_t oa,
+ unsigned int oasz_lg2)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ if (PT_WARN_ON(oalog2_mod(oa, oasz_lg2)))
+ return false;
+
+#ifdef pt_possible_sizes
+ if (PT_WARN_ON(isz_lg2 > oasz_lg2 ||
+ oasz_lg2 > isz_lg2 + pt_num_items_lg2(pts)))
+ return false;
+#else
+ if (PT_WARN_ON(oasz_lg2 != isz_lg2 &&
+ oasz_lg2 != isz_lg2 + pt_contig_count_lg2(pts)))
+ return false;
+#endif
+
+ if (PT_WARN_ON(oalog2_mod(pts->index, oasz_lg2 - isz_lg2)))
+ return false;
+ return true;
+}
+
+#endif
diff --git a/drivers/iommu/generic_pt/pt_iter.h b/drivers/iommu/generic_pt/pt_iter.h
new file mode 100644
index 000000000000..c0d8617cce29
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_iter.h
@@ -0,0 +1,636 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Iterators for Generic Page Table
+ */
+#ifndef __GENERIC_PT_PT_ITER_H
+#define __GENERIC_PT_PT_ITER_H
+
+#include "pt_common.h"
+
+#include <linux/errno.h>
+
+/*
+ * Use to mangle symbols so that backtraces and the symbol table are
+ * understandable. Any non-inlined function should get mangled like this.
+ */
+#define NS(fn) CONCATENATE(PTPFX, fn)
+
+/**
+ * pt_check_range() - Validate the range can be iterated
+ * @range: Range to validate
+ *
+ * Check that VA and last_va fall within the permitted range of VAs. If the
+ * format is using PT_FEAT_SIGN_EXTEND then this also checks the sign extension
+ * is correct.
+ */
+static inline int pt_check_range(struct pt_range *range)
+{
+ pt_vaddr_t prefix;
+
+ PT_WARN_ON(!range->max_vasz_lg2);
+
+ if (pt_feature(range->common, PT_FEAT_SIGN_EXTEND)) {
+ PT_WARN_ON(range->common->max_vasz_lg2 != range->max_vasz_lg2);
+ prefix = fvalog2_div(range->va, range->max_vasz_lg2 - 1) ?
+ PT_VADDR_MAX :
+ 0;
+ } else {
+ prefix = pt_full_va_prefix(range->common);
+ }
+
+ if (!fvalog2_div_eq(range->va, prefix, range->max_vasz_lg2) ||
+ !fvalog2_div_eq(range->last_va, prefix, range->max_vasz_lg2))
+ return -ERANGE;
+ return 0;
+}
+
+/**
+ * pt_index_to_va() - Update range->va to the current pts->index
+ * @pts: Iteration State
+ *
+ * Adjust range->va to match the current index. This is done in a lazy manner
+ * since computing the VA takes several instructions and is rarely required.
+ */
+static inline void pt_index_to_va(struct pt_state *pts)
+{
+ pt_vaddr_t lower_va;
+
+ lower_va = log2_mul(pts->index, pt_table_item_lg2sz(pts));
+ pts->range->va = fvalog2_set_mod(pts->range->va, lower_va,
+ pt_table_oa_lg2sz(pts));
+}
+
+/*
+ * Add index_count_lg2 number of entries to pts's VA and index. The VA will be
+ * adjusted to the end of the contiguous block if it is currently in the middle.
+ */
+static inline void _pt_advance(struct pt_state *pts,
+ unsigned int index_count_lg2)
+{
+ pts->index = log2_set_mod(pts->index + log2_to_int(index_count_lg2), 0,
+ index_count_lg2);
+}
+
+/**
+ * pt_entry_fully_covered() - Check if the item or entry is entirely contained
+ * within pts->range
+ * @pts: Iteration State
+ * @oasz_lg2: The size of the item to check, pt_table_item_lg2sz() or
+ * pt_entry_oa_lg2sz()
+ *
+ * Returns: true if the item is fully enclosed by the pts->range.
+ */
+static inline bool pt_entry_fully_covered(const struct pt_state *pts,
+ unsigned int oasz_lg2)
+{
+ struct pt_range *range = pts->range;
+
+ /* Range begins at the start of the entry */
+ if (log2_mod(pts->range->va, oasz_lg2))
+ return false;
+
+ /* Range ends past the end of the entry */
+ if (!log2_div_eq(range->va, range->last_va, oasz_lg2))
+ return true;
+
+ /* Range ends at the end of the entry */
+ return log2_mod_eq_max(range->last_va, oasz_lg2);
+}
+
+/**
+ * pt_range_to_index() - Starting index for an iteration
+ * @pts: Iteration State
+ *
+ * Return: the starting index for the iteration in pts.
+ */
+static inline unsigned int pt_range_to_index(const struct pt_state *pts)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+ PT_WARN_ON(pts->level > pts->range->top_level);
+ if (pts->range->top_level == pts->level)
+ return log2_div(fvalog2_mod(pts->range->va,
+ pts->range->max_vasz_lg2),
+ isz_lg2);
+ return log2_mod(log2_div(pts->range->va, isz_lg2),
+ pt_num_items_lg2(pts));
+}
+
+/**
+ * pt_range_to_end_index() - Ending index iteration
+ * @pts: Iteration State
+ *
+ * Return: the last index for the iteration in pts.
+ */
+static inline unsigned int pt_range_to_end_index(const struct pt_state *pts)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+ struct pt_range *range = pts->range;
+ unsigned int num_entries_lg2;
+
+ if (range->va == range->last_va)
+ return pts->index + 1;
+
+ if (pts->range->top_level == pts->level)
+ return log2_div(fvalog2_mod(pts->range->last_va,
+ pts->range->max_vasz_lg2),
+ isz_lg2) +
+ 1;
+
+ num_entries_lg2 = pt_num_items_lg2(pts);
+
+ /* last_va falls within this table */
+ if (log2_div_eq(range->va, range->last_va, num_entries_lg2 + isz_lg2))
+ return log2_mod(log2_div(pts->range->last_va, isz_lg2),
+ num_entries_lg2) +
+ 1;
+
+ return log2_to_int(num_entries_lg2);
+}
+
+static inline void _pt_iter_first(struct pt_state *pts)
+{
+ pts->index = pt_range_to_index(pts);
+ pts->end_index = pt_range_to_end_index(pts);
+ PT_WARN_ON(pts->index > pts->end_index);
+}
+
+static inline bool _pt_iter_load(struct pt_state *pts)
+{
+ if (pts->index >= pts->end_index)
+ return false;
+ pt_load_entry(pts);
+ return true;
+}
+
+/**
+ * pt_next_entry() - Advance pts to the next entry
+ * @pts: Iteration State
+ *
+ * Update pts to go to the next index at this level. If pts is pointing at a
+ * contiguous entry then the index may advance my more than one.
+ */
+static inline void pt_next_entry(struct pt_state *pts)
+{
+ if (pts->type == PT_ENTRY_OA &&
+ !__builtin_constant_p(pt_entry_num_contig_lg2(pts) == 0))
+ _pt_advance(pts, pt_entry_num_contig_lg2(pts));
+ else
+ pts->index++;
+ pt_index_to_va(pts);
+}
+
+/**
+ * for_each_pt_level_entry() - For loop wrapper over entries in the range
+ * @pts: Iteration State
+ *
+ * This is the basic iteration primitive. It iterates over all the entries in
+ * pts->range that fall within the pts's current table level. Each step does
+ * pt_load_entry(pts).
+ */
+#define for_each_pt_level_entry(pts) \
+ for (_pt_iter_first(pts); _pt_iter_load(pts); pt_next_entry(pts))
+
+/**
+ * pt_load_single_entry() - Version of pt_load_entry() usable within a walker
+ * @pts: Iteration State
+ *
+ * Alternative to for_each_pt_level_entry() if the walker function uses only a
+ * single entry.
+ */
+static inline enum pt_entry_type pt_load_single_entry(struct pt_state *pts)
+{
+ pts->index = pt_range_to_index(pts);
+ pt_load_entry(pts);
+ return pts->type;
+}
+
+static __always_inline struct pt_range _pt_top_range(struct pt_common *common,
+ uintptr_t top_of_table)
+{
+ struct pt_range range = {
+ .common = common,
+ .top_table =
+ (struct pt_table_p *)(top_of_table &
+ ~(uintptr_t)PT_TOP_LEVEL_MASK),
+ .top_level = top_of_table % (1 << PT_TOP_LEVEL_BITS),
+ };
+ struct pt_state pts = { .range = &range, .level = range.top_level };
+ unsigned int max_vasz_lg2;
+
+ max_vasz_lg2 = common->max_vasz_lg2;
+ if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
+ pts.level != PT_MAX_TOP_LEVEL)
+ max_vasz_lg2 = min_t(unsigned int, common->max_vasz_lg2,
+ pt_num_items_lg2(&pts) +
+ pt_table_item_lg2sz(&pts));
+
+ /*
+ * The top range will default to the lower region only with sign extend.
+ */
+ range.max_vasz_lg2 = max_vasz_lg2;
+ if (pt_feature(common, PT_FEAT_SIGN_EXTEND))
+ max_vasz_lg2--;
+
+ range.va = fvalog2_set_mod(pt_full_va_prefix(common), 0, max_vasz_lg2);
+ range.last_va =
+ fvalog2_set_mod_max(pt_full_va_prefix(common), max_vasz_lg2);
+ return range;
+}
+
+/**
+ * pt_top_range() - Return a range that spans part of the top level
+ * @common: Table
+ *
+ * For PT_FEAT_SIGN_EXTEND this will return the lower range, and cover half the
+ * total page table. Otherwise it returns the entire page table.
+ */
+static __always_inline struct pt_range pt_top_range(struct pt_common *common)
+{
+ /*
+ * The top pointer can change without locking. We capture the value and
+ * it's level here and are safe to walk it so long as both values are
+ * captured without tearing.
+ */
+ return _pt_top_range(common, READ_ONCE(common->top_of_table));
+}
+
+/**
+ * pt_all_range() - Return a range that spans the entire page table
+ * @common: Table
+ *
+ * The returned range spans the whole page table. Due to how PT_FEAT_SIGN_EXTEND
+ * is supported range->va and range->last_va will be incorrect during the
+ * iteration and must not be accessed.
+ */
+static inline struct pt_range pt_all_range(struct pt_common *common)
+{
+ struct pt_range range = pt_top_range(common);
+
+ if (!pt_feature(common, PT_FEAT_SIGN_EXTEND))
+ return range;
+
+ /*
+ * Pretend the table is linear from 0 without a sign extension. This
+ * generates the correct indexes for iteration.
+ */
+ range.last_va = fvalog2_set_mod_max(0, range.max_vasz_lg2);
+ return range;
+}
+
+/**
+ * pt_upper_range() - Return a range that spans part of the top level
+ * @common: Table
+ *
+ * For PT_FEAT_SIGN_EXTEND this will return the upper range, and cover half the
+ * total page table. Otherwise it returns the entire page table.
+ */
+static inline struct pt_range pt_upper_range(struct pt_common *common)
+{
+ struct pt_range range = pt_top_range(common);
+
+ if (!pt_feature(common, PT_FEAT_SIGN_EXTEND))
+ return range;
+
+ range.va = fvalog2_set_mod(PT_VADDR_MAX, 0, range.max_vasz_lg2 - 1);
+ range.last_va = PT_VADDR_MAX;
+ return range;
+}
+
+/**
+ * pt_make_range() - Return a range that spans part of the table
+ * @common: Table
+ * @va: Start address
+ * @last_va: Last address
+ *
+ * The caller must validate the range with pt_check_range() before using it.
+ */
+static __always_inline struct pt_range
+pt_make_range(struct pt_common *common, pt_vaddr_t va, pt_vaddr_t last_va)
+{
+ struct pt_range range =
+ _pt_top_range(common, READ_ONCE(common->top_of_table));
+
+ range.va = va;
+ range.last_va = last_va;
+
+ return range;
+}
+
+/*
+ * Span a slice of the table starting at a lower table level from an active
+ * walk.
+ */
+static __always_inline struct pt_range
+pt_make_child_range(const struct pt_range *parent, pt_vaddr_t va,
+ pt_vaddr_t last_va)
+{
+ struct pt_range range = *parent;
+
+ range.va = va;
+ range.last_va = last_va;
+
+ PT_WARN_ON(last_va < va);
+ PT_WARN_ON(pt_check_range(&range));
+
+ return range;
+}
+
+/**
+ * pt_init() - Initialize a pt_state on the stack
+ * @range: Range pointer to embed in the state
+ * @level: Table level for the state
+ * @table: Pointer to the table memory at level
+ *
+ * Helper to initialize the on-stack pt_state from walker arguments.
+ */
+static __always_inline struct pt_state
+pt_init(struct pt_range *range, unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = {
+ .range = range,
+ .table = table,
+ .level = level,
+ };
+ return pts;
+}
+
+/**
+ * pt_init_top() - Initialize a pt_state on the stack
+ * @range: Range pointer to embed in the state
+ *
+ * The pt_state points to the top most level.
+ */
+static __always_inline struct pt_state pt_init_top(struct pt_range *range)
+{
+ return pt_init(range, range->top_level, range->top_table);
+}
+
+typedef int (*pt_level_fn_t)(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table);
+
+/**
+ * pt_descend() - Recursively invoke the walker for the lower level
+ * @pts: Iteration State
+ * @arg: Value to pass to the function
+ * @fn: Walker function to call
+ *
+ * pts must point to a table item. Invoke fn as a walker on the table
+ * pts points to.
+ */
+static __always_inline int pt_descend(struct pt_state *pts, void *arg,
+ pt_level_fn_t fn)
+{
+ int ret;
+
+ if (PT_WARN_ON(!pts->table_lower))
+ return -EINVAL;
+
+ ret = (*fn)(pts->range, arg, pts->level - 1, pts->table_lower);
+ return ret;
+}
+
+/**
+ * pt_walk_range() - Walk over a VA range
+ * @range: Range pointer
+ * @fn: Walker function to call
+ * @arg: Value to pass to the function
+ *
+ * Walk over a VA range. The caller should have done a validity check, at
+ * least calling pt_check_range(), when building range. The walk will
+ * start at the top most table.
+ */
+static __always_inline int pt_walk_range(struct pt_range *range,
+ pt_level_fn_t fn, void *arg)
+{
+ return fn(range, arg, range->top_level, range->top_table);
+}
+
+/*
+ * pt_walk_descend() - Recursively invoke the walker for a slice of a lower
+ * level
+ * @pts: Iteration State
+ * @va: Start address
+ * @last_va: Last address
+ * @fn: Walker function to call
+ * @arg: Value to pass to the function
+ *
+ * With pts pointing at a table item this will descend and over a slice of the
+ * lower table. The caller must ensure that va/last_va are within the table
+ * item. This creates a new walk and does not alter pts or pts->range.
+ */
+static __always_inline int pt_walk_descend(const struct pt_state *pts,
+ pt_vaddr_t va, pt_vaddr_t last_va,
+ pt_level_fn_t fn, void *arg)
+{
+ struct pt_range range = pt_make_child_range(pts->range, va, last_va);
+
+ if (PT_WARN_ON(!pt_can_have_table(pts)) ||
+ PT_WARN_ON(!pts->table_lower))
+ return -EINVAL;
+
+ return fn(&range, arg, pts->level - 1, pts->table_lower);
+}
+
+/*
+ * pt_walk_descend_all() - Recursively invoke the walker for a table item
+ * @parent_pts: Iteration State
+ * @fn: Walker function to call
+ * @arg: Value to pass to the function
+ *
+ * With pts pointing at a table item this will descend and over the entire lower
+ * table. This creates a new walk and does not alter pts or pts->range.
+ */
+static __always_inline int
+pt_walk_descend_all(const struct pt_state *parent_pts, pt_level_fn_t fn,
+ void *arg)
+{
+ unsigned int isz_lg2 = pt_table_item_lg2sz(parent_pts);
+
+ return pt_walk_descend(parent_pts,
+ log2_set_mod(parent_pts->range->va, 0, isz_lg2),
+ log2_set_mod_max(parent_pts->range->va, isz_lg2),
+ fn, arg);
+}
+
+/**
+ * pt_range_slice() - Return a range that spans indexes
+ * @pts: Iteration State
+ * @start_index: Starting index within pts
+ * @end_index: Ending index within pts
+ *
+ * Create a range than spans an index range of the current table level
+ * pt_state points at.
+ */
+static inline struct pt_range pt_range_slice(const struct pt_state *pts,
+ unsigned int start_index,
+ unsigned int end_index)
+{
+ unsigned int table_lg2sz = pt_table_oa_lg2sz(pts);
+ pt_vaddr_t last_va;
+ pt_vaddr_t va;
+
+ va = fvalog2_set_mod(pts->range->va,
+ log2_mul(start_index, pt_table_item_lg2sz(pts)),
+ table_lg2sz);
+ last_va = fvalog2_set_mod(
+ pts->range->va,
+ log2_mul(end_index, pt_table_item_lg2sz(pts)) - 1, table_lg2sz);
+ return pt_make_child_range(pts->range, va, last_va);
+}
+
+/**
+ * pt_top_memsize_lg2()
+ * @common: Table
+ * @top_of_table: Top of table value from _pt_top_set()
+ *
+ * Compute the allocation size of the top table. For PT_FEAT_DYNAMIC_TOP this
+ * will compute the top size assuming the table will grow.
+ */
+static inline unsigned int pt_top_memsize_lg2(struct pt_common *common,
+ uintptr_t top_of_table)
+{
+ struct pt_range range = _pt_top_range(common, top_of_table);
+ struct pt_state pts = pt_init_top(&range);
+ unsigned int num_items_lg2;
+
+ num_items_lg2 = common->max_vasz_lg2 - pt_table_item_lg2sz(&pts);
+ if (range.top_level != PT_MAX_TOP_LEVEL &&
+ pt_feature(common, PT_FEAT_DYNAMIC_TOP))
+ num_items_lg2 = min(num_items_lg2, pt_num_items_lg2(&pts));
+
+ /* Round up the allocation size to the minimum alignment */
+ return max(ffs_t(u64, PT_TOP_PHYS_MASK),
+ num_items_lg2 + ilog2(PT_ITEM_WORD_SIZE));
+}
+
+/**
+ * pt_compute_best_pgsize() - Determine the best page size for leaf entries
+ * @pgsz_bitmap: Permitted page sizes
+ * @va: Starting virtual address for the leaf entry
+ * @last_va: Last virtual address for the leaf entry, sets the max page size
+ * @oa: Starting output address for the leaf entry
+ *
+ * Compute the largest page size for va, last_va, and oa together and return it
+ * in lg2. The largest page size depends on the format's supported page sizes at
+ * this level, and the relative alignment of the VA and OA addresses. 0 means
+ * the OA cannot be stored with the provided pgsz_bitmap.
+ */
+static inline unsigned int pt_compute_best_pgsize(pt_vaddr_t pgsz_bitmap,
+ pt_vaddr_t va,
+ pt_vaddr_t last_va,
+ pt_oaddr_t oa)
+{
+ unsigned int best_pgsz_lg2;
+ unsigned int pgsz_lg2;
+ pt_vaddr_t len = last_va - va + 1;
+ pt_vaddr_t mask;
+
+ if (PT_WARN_ON(va >= last_va))
+ return 0;
+
+ /*
+ * Given a VA/OA pair the best page size is the largest page size
+ * where:
+ *
+ * 1) VA and OA start at the page. Bitwise this is the count of least
+ * significant 0 bits.
+ * This also implies that last_va/oa has the same prefix as va/oa.
+ */
+ mask = va | oa;
+
+ /*
+ * 2) The page size is not larger than the last_va (length). Since page
+ * sizes are always power of two this can't be larger than the
+ * largest power of two factor of the length.
+ */
+ mask |= log2_to_int(vafls(len) - 1);
+
+ best_pgsz_lg2 = vaffs(mask);
+
+ /* Choose the highest bit <= best_pgsz_lg2 */
+ if (best_pgsz_lg2 < PT_VADDR_MAX_LG2 - 1)
+ pgsz_bitmap = log2_mod(pgsz_bitmap, best_pgsz_lg2 + 1);
+
+ pgsz_lg2 = vafls(pgsz_bitmap);
+ if (!pgsz_lg2)
+ return 0;
+
+ pgsz_lg2--;
+
+ PT_WARN_ON(log2_mod(va, pgsz_lg2) != 0);
+ PT_WARN_ON(oalog2_mod(oa, pgsz_lg2) != 0);
+ PT_WARN_ON(va + log2_to_int(pgsz_lg2) - 1 > last_va);
+ PT_WARN_ON(!log2_div_eq(va, va + log2_to_int(pgsz_lg2) - 1, pgsz_lg2));
+ PT_WARN_ON(
+ !oalog2_div_eq(oa, oa + log2_to_int(pgsz_lg2) - 1, pgsz_lg2));
+ return pgsz_lg2;
+}
+
+#define _PT_MAKE_CALL_LEVEL(fn) \
+ static __always_inline int fn(struct pt_range *range, void *arg, \
+ unsigned int level, \
+ struct pt_table_p *table) \
+ { \
+ static_assert(PT_MAX_TOP_LEVEL <= 5); \
+ if (level == 0) \
+ return CONCATENATE(fn, 0)(range, arg, 0, table); \
+ if (level == 1 || PT_MAX_TOP_LEVEL == 1) \
+ return CONCATENATE(fn, 1)(range, arg, 1, table); \
+ if (level == 2 || PT_MAX_TOP_LEVEL == 2) \
+ return CONCATENATE(fn, 2)(range, arg, 2, table); \
+ if (level == 3 || PT_MAX_TOP_LEVEL == 3) \
+ return CONCATENATE(fn, 3)(range, arg, 3, table); \
+ if (level == 4 || PT_MAX_TOP_LEVEL == 4) \
+ return CONCATENATE(fn, 4)(range, arg, 4, table); \
+ return CONCATENATE(fn, 5)(range, arg, 5, table); \
+ }
+
+static inline int __pt_make_level_fn_err(struct pt_range *range, void *arg,
+ unsigned int unused_level,
+ struct pt_table_p *table)
+{
+ static_assert(PT_MAX_TOP_LEVEL <= 5);
+ return -EPROTOTYPE;
+}
+
+#define __PT_MAKE_LEVEL_FN(fn, level, descend_fn, do_fn) \
+ static inline int fn(struct pt_range *range, void *arg, \
+ unsigned int unused_level, \
+ struct pt_table_p *table) \
+ { \
+ return do_fn(range, arg, level, table, descend_fn); \
+ }
+
+/**
+ * PT_MAKE_LEVELS() - Build an unwound walker
+ * @fn: Name of the walker function
+ * @do_fn: Function to call at each level
+ *
+ * This builds a function call tree that can be fully inlined.
+ * The caller must provide a function body in an __always_inline function::
+ *
+ * static __always_inline int do_fn(struct pt_range *range, void *arg,
+ * unsigned int level, struct pt_table_p *table,
+ * pt_level_fn_t descend_fn)
+ *
+ * An inline function will be created for each table level that calls do_fn with
+ * a compile time constant for level and a pointer to the next lower function.
+ * This generates an optimally inlined walk where each of the functions sees a
+ * constant level and can codegen the exact constants/etc for that level.
+ *
+ * Note this can produce a lot of code!
+ */
+#define PT_MAKE_LEVELS(fn, do_fn) \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 0), 0, __pt_make_level_fn_err, \
+ do_fn); \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 1), 1, CONCATENATE(fn, 0), do_fn); \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 2), 2, CONCATENATE(fn, 1), do_fn); \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 3), 3, CONCATENATE(fn, 2), do_fn); \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 4), 4, CONCATENATE(fn, 3), do_fn); \
+ __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 5), 5, CONCATENATE(fn, 4), do_fn); \
+ _PT_MAKE_CALL_LEVEL(fn)
+
+#endif
diff --git a/drivers/iommu/generic_pt/pt_log2.h b/drivers/iommu/generic_pt/pt_log2.h
new file mode 100644
index 000000000000..6dbbed119238
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_log2.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * Helper macros for working with log2 values
+ *
+ */
+#ifndef __GENERIC_PT_LOG2_H
+#define __GENERIC_PT_LOG2_H
+#include <linux/bitops.h>
+#include <linux/limits.h>
+
+/* Compute a */
+#define log2_to_int_t(type, a_lg2) ((type)(((type)1) << (a_lg2)))
+static_assert(log2_to_int_t(unsigned int, 0) == 1);
+
+/* Compute a - 1 (aka all low bits set) */
+#define log2_to_max_int_t(type, a_lg2) ((type)(log2_to_int_t(type, a_lg2) - 1))
+
+/* Compute a / b */
+#define log2_div_t(type, a, b_lg2) ((type)(((type)a) >> (b_lg2)))
+static_assert(log2_div_t(unsigned int, 4, 2) == 1);
+
+/*
+ * Compute:
+ * a / c == b / c
+ * aka the high bits are equal
+ */
+#define log2_div_eq_t(type, a, b, c_lg2) \
+ (log2_div_t(type, (a) ^ (b), c_lg2) == 0)
+static_assert(log2_div_eq_t(unsigned int, 1, 1, 2));
+
+/* Compute a % b */
+#define log2_mod_t(type, a, b_lg2) \
+ ((type)(((type)a) & log2_to_max_int_t(type, b_lg2)))
+static_assert(log2_mod_t(unsigned int, 1, 2) == 1);
+
+/*
+ * Compute:
+ * a % b == b - 1
+ * aka the low bits are all 1s
+ */
+#define log2_mod_eq_max_t(type, a, b_lg2) \
+ (log2_mod_t(type, a, b_lg2) == log2_to_max_int_t(type, b_lg2))
+static_assert(log2_mod_eq_max_t(unsigned int, 3, 2));
+
+/*
+ * Return a value such that:
+ * a / b == ret / b
+ * ret % b == val
+ * aka set the low bits to val. val must be < b
+ */
+#define log2_set_mod_t(type, a, val, b_lg2) \
+ ((((type)(a)) & (~log2_to_max_int_t(type, b_lg2))) | ((type)(val)))
+static_assert(log2_set_mod_t(unsigned int, 3, 1, 2) == 1);
+
+/* Return a value such that:
+ * a / b == ret / b
+ * ret % b == b - 1
+ * aka set the low bits to all 1s
+ */
+#define log2_set_mod_max_t(type, a, b_lg2) \
+ (((type)(a)) | log2_to_max_int_t(type, b_lg2))
+static_assert(log2_set_mod_max_t(unsigned int, 2, 2) == 3);
+
+/* Compute a * b */
+#define log2_mul_t(type, a, b_lg2) ((type)(((type)a) << (b_lg2)))
+static_assert(log2_mul_t(unsigned int, 2, 2) == 8);
+
+#define _dispatch_sz(type, fn, a) \
+ (sizeof(type) == 4 ? fn##32((u32)a) : fn##64(a))
+
+/*
+ * Return the highest value such that:
+ * fls_t(u32, 0) == 0
+ * fls_t(u3, 1) == 1
+ * a >= log2_to_int(ret - 1)
+ * aka find last set bit
+ */
+static inline unsigned int fls32(u32 a)
+{
+ return fls(a);
+}
+#define fls_t(type, a) _dispatch_sz(type, fls, a)
+
+/*
+ * Return the highest value such that:
+ * ffs_t(u32, 0) == UNDEFINED
+ * ffs_t(u32, 1) == 0
+ * log_mod(a, ret) == 0
+ * aka find first set bit
+ */
+static inline unsigned int __ffs32(u32 a)
+{
+ return __ffs(a);
+}
+#define ffs_t(type, a) _dispatch_sz(type, __ffs, a)
+
+/*
+ * Return the highest value such that:
+ * ffz_t(u32, U32_MAX) == UNDEFINED
+ * ffz_t(u32, 0) == 0
+ * ffz_t(u32, 1) == 1
+ * log_mod(a, ret) == log_to_max_int(ret)
+ * aka find first zero bit
+ */
+static inline unsigned int ffz32(u32 a)
+{
+ return ffz(a);
+}
+static inline unsigned int ffz64(u64 a)
+{
+ if (sizeof(u64) == sizeof(unsigned long))
+ return ffz(a);
+
+ if ((u32)a == U32_MAX)
+ return ffz32(a >> 32) + 32;
+ return ffz32(a);
+}
+#define ffz_t(type, a) _dispatch_sz(type, ffz, a)
+
+#endif
diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c
index 8a5c17b97310..0961ac805944 100644
--- a/drivers/iommu/hyperv-iommu.c
+++ b/drivers/iommu/hyperv-iommu.c
@@ -130,7 +130,7 @@ static int __init hyperv_prepare_irq_remapping(void)
x86_init.hyper.msi_ext_dest_id())
return -ENODEV;
- if (hv_root_partition) {
+ if (hv_root_partition()) {
name = "HYPERV-ROOT-IR";
ops = &hyperv_root_ir_domain_ops;
} else {
@@ -151,7 +151,7 @@ static int __init hyperv_prepare_irq_remapping(void)
return -ENOMEM;
}
- if (hv_root_partition)
+ if (hv_root_partition())
return 0; /* The rest is only relevant to guests */
/*
@@ -164,8 +164,8 @@ static int __init hyperv_prepare_irq_remapping(void)
* max cpu affinity for IOAPIC irqs. Scan cpu 0-255 and set cpu
* into ioapic_max_cpumask if its APIC ID is less than 256.
*/
- for (i = min_t(unsigned int, num_possible_cpus() - 1, 255); i >= 0; i--)
- if (cpu_physical_id(i) < 256)
+ for (i = min_t(unsigned int, nr_cpu_ids - 1, 255); i >= 0; i--)
+ if (cpu_possible(i) && cpu_physical_id(i) < 256)
cpumask_set_cpu(i, &ioapic_max_cpumask);
return 0;
@@ -193,15 +193,13 @@ struct hyperv_root_ir_data {
static void
hyperv_root_ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
{
- u64 status;
- u32 vector;
- struct irq_cfg *cfg;
- int ioapic_id;
- const struct cpumask *affinity;
- int cpu;
- struct hv_interrupt_entry entry;
struct hyperv_root_ir_data *data = irq_data->chip_data;
+ struct hv_interrupt_entry entry;
+ const struct cpumask *affinity;
struct IO_APIC_route_entry e;
+ struct irq_cfg *cfg;
+ int cpu, ioapic_id;
+ u32 vector;
cfg = irqd_cfg(irq_data);
affinity = irq_data_get_effective_affinity_mask(irq_data);
@@ -214,23 +212,16 @@ hyperv_root_ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
&& data->entry.ioapic_rte.as_uint64) {
entry = data->entry;
- status = hv_unmap_ioapic_interrupt(ioapic_id, &entry);
-
- if (status != HV_STATUS_SUCCESS)
- pr_debug("%s: unexpected unmap status %lld\n", __func__, status);
+ (void)hv_unmap_ioapic_interrupt(ioapic_id, &entry);
data->entry.ioapic_rte.as_uint64 = 0;
data->entry.source = 0; /* Invalid source */
}
- status = hv_map_ioapic_interrupt(ioapic_id, data->is_level, cpu,
- vector, &entry);
-
- if (status != HV_STATUS_SUCCESS) {
- pr_err("%s: map hypercall failed, status %lld\n", __func__, status);
+ if (hv_map_ioapic_interrupt(ioapic_id, data->is_level, cpu,
+ vector, &entry))
return;
- }
data->entry = entry;
@@ -322,10 +313,10 @@ static void hyperv_root_irq_remapping_free(struct irq_domain *domain,
data = irq_data->chip_data;
e = &data->entry;
- if (e->source == HV_DEVICE_TYPE_IOAPIC
- && e->ioapic_rte.as_uint64)
- hv_unmap_ioapic_interrupt(data->ioapic_id,
- &data->entry);
+ if (e->source == HV_DEVICE_TYPE_IOAPIC &&
+ e->ioapic_rte.as_uint64)
+ (void)hv_unmap_ioapic_interrupt(data->ioapic_id,
+ &data->entry);
kfree(data);
}
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index f52fb39c968e..5471f814e073 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -12,9 +12,13 @@ config DMAR_DEBUG
config INTEL_IOMMU
bool "Support for Intel IOMMU using DMA Remapping Devices"
depends on PCI_MSI && ACPI && X86
- select DMA_OPS
select IOMMU_API
+ select GENERIC_PT
+ select IOMMU_PT
+ select IOMMU_PT_X86_64
+ select IOMMU_PT_VTDSS
select IOMMU_IOVA
+ select IOMMU_IOPF
select IOMMUFD_DRIVER if IOMMUFD
select NEED_DMA_MAP_STATE
select DMAR_TABLE
@@ -51,7 +55,6 @@ config INTEL_IOMMU_SVM
depends on X86_64
select MMU_NOTIFIER
select IOMMU_SVA
- select IOMMU_IOPF
help
Shared Virtual Memory (SVM) provides a facility for devices
to access DMA resources through process address space by
@@ -67,7 +70,7 @@ config INTEL_IOMMU_DEFAULT_ON
config INTEL_IOMMU_FLOPPY_WA
def_bool y
- depends on X86
+ depends on X86 && BLK_DEV_FD
help
Floppy disk drivers are known to bypass DMA API calls
thereby failing to work when IOMMU is enabled. This
diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile
index c8beb0281559..ada651c4a01b 100644
--- a/drivers/iommu/intel/Makefile
+++ b/drivers/iommu/intel/Makefile
@@ -1,11 +1,8 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_DMAR_TABLE) += dmar.o
-obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o cache.o
-obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o
+obj-y += iommu.o pasid.o nested.o cache.o prq.o
+obj-$(CONFIG_DMAR_TABLE) += dmar.o trace.o
obj-$(CONFIG_DMAR_PERF) += perf.o
obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o
obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o
-ifdef CONFIG_INTEL_IOMMU
obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o
-endif
obj-$(CONFIG_INTEL_IOMMU_PERF_EVENTS) += perfmon.o
diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c
index e8418cdd8331..265e7290256b 100644
--- a/drivers/iommu/intel/cache.c
+++ b/drivers/iommu/intel/cache.c
@@ -40,13 +40,13 @@ static bool cache_tage_match(struct cache_tag *tag, u16 domain_id,
}
/* Assign a cache tag with specified type to domain. */
-static int cache_tag_assign(struct dmar_domain *domain, u16 did,
- struct device *dev, ioasid_t pasid,
- enum cache_tag_type type)
+int cache_tag_assign(struct dmar_domain *domain, u16 did, struct device *dev,
+ ioasid_t pasid, enum cache_tag_type type)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct intel_iommu *iommu = info->iommu;
struct cache_tag *tag, *temp;
+ struct list_head *prev;
unsigned long flags;
tag = kzalloc(sizeof(*tag), GFP_KERNEL);
@@ -65,6 +65,7 @@ static int cache_tag_assign(struct dmar_domain *domain, u16 did,
tag->dev = iommu->iommu.dev;
spin_lock_irqsave(&domain->cache_lock, flags);
+ prev = &domain->cache_tags;
list_for_each_entry(temp, &domain->cache_tags, node) {
if (cache_tage_match(temp, did, iommu, dev, pasid, type)) {
temp->users++;
@@ -73,8 +74,15 @@ static int cache_tag_assign(struct dmar_domain *domain, u16 did,
trace_cache_tag_assign(temp);
return 0;
}
+ if (temp->iommu == iommu)
+ prev = &temp->node;
}
- list_add_tail(&tag->node, &domain->cache_tags);
+ /*
+ * Link cache tags of same iommu unit together, so corresponding
+ * flush ops can be batched for iommu unit.
+ */
+ list_add(&tag->node, prev);
+
spin_unlock_irqrestore(&domain->cache_lock, flags);
trace_cache_tag_assign(tag);
@@ -105,12 +113,35 @@ static void cache_tag_unassign(struct dmar_domain *domain, u16 did,
spin_unlock_irqrestore(&domain->cache_lock, flags);
}
+/* domain->qi_batch will be freed in iommu_free_domain() path. */
+static int domain_qi_batch_alloc(struct dmar_domain *domain)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&domain->cache_lock, flags);
+ if (domain->qi_batch)
+ goto out_unlock;
+
+ domain->qi_batch = kzalloc(sizeof(*domain->qi_batch), GFP_ATOMIC);
+ if (!domain->qi_batch)
+ ret = -ENOMEM;
+out_unlock:
+ spin_unlock_irqrestore(&domain->cache_lock, flags);
+
+ return ret;
+}
+
static int __cache_tag_assign_domain(struct dmar_domain *domain, u16 did,
struct device *dev, ioasid_t pasid)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
int ret;
+ ret = domain_qi_batch_alloc(domain);
+ if (ret)
+ return ret;
+
ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_IOTLB);
if (ret || !info->ats_enabled)
return ret;
@@ -139,6 +170,10 @@ static int __cache_tag_assign_parent_domain(struct dmar_domain *domain, u16 did,
struct device_domain_info *info = dev_iommu_priv_get(dev);
int ret;
+ ret = domain_qi_batch_alloc(domain);
+ if (ret)
+ return ret;
+
ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_NESTING_IOTLB);
if (ret || !info->ats_enabled)
return ret;
@@ -245,7 +280,8 @@ static unsigned long calculate_psi_aligned_address(unsigned long start,
* shared_bits are all equal in both pfn and end_pfn.
*/
shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
- mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
+ mask = shared_bits ? __ffs(shared_bits) : MAX_AGAW_PFN_WIDTH;
+ aligned_pages = 1UL << mask;
}
*_pages = aligned_pages;
@@ -254,6 +290,138 @@ static unsigned long calculate_psi_aligned_address(unsigned long start,
return ALIGN_DOWN(start, VTD_PAGE_SIZE << mask);
}
+static void qi_batch_flush_descs(struct intel_iommu *iommu, struct qi_batch *batch)
+{
+ if (!iommu || !batch->index)
+ return;
+
+ qi_submit_sync(iommu, batch->descs, batch->index, 0);
+
+ /* Reset the index value and clean the whole batch buffer. */
+ memset(batch, 0, sizeof(*batch));
+}
+
+static void qi_batch_increment_index(struct intel_iommu *iommu, struct qi_batch *batch)
+{
+ if (++batch->index == QI_MAX_BATCHED_DESC_COUNT)
+ qi_batch_flush_descs(iommu, batch);
+}
+
+static void qi_batch_add_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+ unsigned int size_order, u64 type,
+ struct qi_batch *batch)
+{
+ qi_desc_iotlb(iommu, did, addr, size_order, type, &batch->descs[batch->index]);
+ qi_batch_increment_index(iommu, batch);
+}
+
+static void qi_batch_add_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+ u16 qdep, u64 addr, unsigned int mask,
+ struct qi_batch *batch)
+{
+ /*
+ * According to VT-d spec, software is recommended to not submit any Device-TLB
+ * invalidation requests while address remapping hardware is disabled.
+ */
+ if (!(iommu->gcmd & DMA_GCMD_TE))
+ return;
+
+ qi_desc_dev_iotlb(sid, pfsid, qdep, addr, mask, &batch->descs[batch->index]);
+ qi_batch_increment_index(iommu, batch);
+}
+
+static void qi_batch_add_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid,
+ u64 addr, unsigned long npages, bool ih,
+ struct qi_batch *batch)
+{
+ /*
+ * npages == -1 means a PASID-selective invalidation, otherwise,
+ * a positive value for Page-selective-within-PASID invalidation.
+ * 0 is not a valid input.
+ */
+ if (!npages)
+ return;
+
+ qi_desc_piotlb(did, pasid, addr, npages, ih, &batch->descs[batch->index]);
+ qi_batch_increment_index(iommu, batch);
+}
+
+static void qi_batch_add_pasid_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+ u32 pasid, u16 qdep, u64 addr,
+ unsigned int size_order, struct qi_batch *batch)
+{
+ /*
+ * According to VT-d spec, software is recommended to not submit any
+ * Device-TLB invalidation requests while address remapping hardware
+ * is disabled.
+ */
+ if (!(iommu->gcmd & DMA_GCMD_TE))
+ return;
+
+ qi_desc_dev_iotlb_pasid(sid, pfsid, pasid, qdep, addr, size_order,
+ &batch->descs[batch->index]);
+ qi_batch_increment_index(iommu, batch);
+}
+
+static void cache_tag_flush_iotlb(struct dmar_domain *domain, struct cache_tag *tag,
+ unsigned long addr, unsigned long pages,
+ unsigned long mask, int ih)
+{
+ struct intel_iommu *iommu = tag->iommu;
+ u64 type = DMA_TLB_PSI_FLUSH;
+
+ if (intel_domain_is_fs_paging(domain)) {
+ qi_batch_add_piotlb(iommu, tag->domain_id, tag->pasid, addr,
+ pages, ih, domain->qi_batch);
+ return;
+ }
+
+ /*
+ * Fallback to domain selective flush if no PSI support or the size
+ * is too big.
+ */
+ if (!cap_pgsel_inv(iommu->cap) ||
+ mask > cap_max_amask_val(iommu->cap) || pages == -1) {
+ addr = 0;
+ mask = 0;
+ ih = 0;
+ type = DMA_TLB_DSI_FLUSH;
+ }
+
+ if (ecap_qis(iommu->ecap))
+ qi_batch_add_iotlb(iommu, tag->domain_id, addr | ih, mask, type,
+ domain->qi_batch);
+ else
+ __iommu_flush_iotlb(iommu, tag->domain_id, addr | ih, mask, type);
+}
+
+static void cache_tag_flush_devtlb_psi(struct dmar_domain *domain, struct cache_tag *tag,
+ unsigned long addr, unsigned long mask)
+{
+ struct intel_iommu *iommu = tag->iommu;
+ struct device_domain_info *info;
+ u16 sid;
+
+ info = dev_iommu_priv_get(tag->dev);
+ sid = PCI_DEVID(info->bus, info->devfn);
+
+ if (tag->pasid == IOMMU_NO_PASID) {
+ qi_batch_add_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep,
+ addr, mask, domain->qi_batch);
+ if (info->dtlb_extra_inval)
+ qi_batch_add_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep,
+ addr, mask, domain->qi_batch);
+ return;
+ }
+
+ qi_batch_add_pasid_dev_iotlb(iommu, sid, info->pfsid, tag->pasid,
+ info->ats_qdep, addr, mask, domain->qi_batch);
+ if (info->dtlb_extra_inval)
+ qi_batch_add_pasid_dev_iotlb(iommu, sid, info->pfsid, tag->pasid,
+ info->ats_qdep, addr, mask,
+ domain->qi_batch);
+}
+
/*
* Invalidates a range of IOVA from @start (inclusive) to @end (inclusive)
* when the memory mappings in the target domain have been modified.
@@ -261,38 +429,29 @@ static unsigned long calculate_psi_aligned_address(unsigned long start,
void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start,
unsigned long end, int ih)
{
+ struct intel_iommu *iommu = NULL;
unsigned long pages, mask, addr;
struct cache_tag *tag;
unsigned long flags;
- addr = calculate_psi_aligned_address(start, end, &pages, &mask);
+ if (start == 0 && end == ULONG_MAX) {
+ addr = 0;
+ pages = -1;
+ mask = MAX_AGAW_PFN_WIDTH;
+ } else {
+ addr = calculate_psi_aligned_address(start, end, &pages, &mask);
+ }
spin_lock_irqsave(&domain->cache_lock, flags);
list_for_each_entry(tag, &domain->cache_tags, node) {
- struct intel_iommu *iommu = tag->iommu;
- struct device_domain_info *info;
- u16 sid;
+ if (iommu && iommu != tag->iommu)
+ qi_batch_flush_descs(iommu, domain->qi_batch);
+ iommu = tag->iommu;
switch (tag->type) {
case CACHE_TAG_IOTLB:
case CACHE_TAG_NESTING_IOTLB:
- if (domain->use_first_level) {
- qi_flush_piotlb(iommu, tag->domain_id,
- tag->pasid, addr, pages, ih);
- } else {
- /*
- * Fallback to domain selective flush if no
- * PSI support or the size is too big.
- */
- if (!cap_pgsel_inv(iommu->cap) ||
- mask > cap_max_amask_val(iommu->cap))
- iommu->flush.flush_iotlb(iommu, tag->domain_id,
- 0, 0, DMA_TLB_DSI_FLUSH);
- else
- iommu->flush.flush_iotlb(iommu, tag->domain_id,
- addr | ih, mask,
- DMA_TLB_PSI_FLUSH);
- }
+ cache_tag_flush_iotlb(domain, tag, addr, pages, mask, ih);
break;
case CACHE_TAG_NESTING_DEVTLB:
/*
@@ -306,23 +465,13 @@ void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start,
mask = MAX_AGAW_PFN_WIDTH;
fallthrough;
case CACHE_TAG_DEVTLB:
- info = dev_iommu_priv_get(tag->dev);
- sid = PCI_DEVID(info->bus, info->devfn);
-
- if (tag->pasid == IOMMU_NO_PASID)
- qi_flush_dev_iotlb(iommu, sid, info->pfsid,
- info->ats_qdep, addr, mask);
- else
- qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
- tag->pasid, info->ats_qdep,
- addr, mask);
-
- quirk_extra_dev_tlb_flush(info, addr, mask, tag->pasid, info->ats_qdep);
+ cache_tag_flush_devtlb_psi(domain, tag, addr, mask);
break;
}
trace_cache_tag_flush_range(tag, start, end, addr, pages, mask);
}
+ qi_batch_flush_descs(iommu, domain->qi_batch);
spin_unlock_irqrestore(&domain->cache_lock, flags);
}
@@ -332,40 +481,7 @@ void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start,
*/
void cache_tag_flush_all(struct dmar_domain *domain)
{
- struct cache_tag *tag;
- unsigned long flags;
-
- spin_lock_irqsave(&domain->cache_lock, flags);
- list_for_each_entry(tag, &domain->cache_tags, node) {
- struct intel_iommu *iommu = tag->iommu;
- struct device_domain_info *info;
- u16 sid;
-
- switch (tag->type) {
- case CACHE_TAG_IOTLB:
- case CACHE_TAG_NESTING_IOTLB:
- if (domain->use_first_level)
- qi_flush_piotlb(iommu, tag->domain_id,
- tag->pasid, 0, -1, 0);
- else
- iommu->flush.flush_iotlb(iommu, tag->domain_id,
- 0, 0, DMA_TLB_DSI_FLUSH);
- break;
- case CACHE_TAG_DEVTLB:
- case CACHE_TAG_NESTING_DEVTLB:
- info = dev_iommu_priv_get(tag->dev);
- sid = PCI_DEVID(info->bus, info->devfn);
-
- qi_flush_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep,
- 0, MAX_AGAW_PFN_WIDTH);
- quirk_extra_dev_tlb_flush(info, 0, MAX_AGAW_PFN_WIDTH,
- IOMMU_NO_PASID, info->ats_qdep);
- break;
- }
-
- trace_cache_tag_flush_all(tag);
- }
- spin_unlock_irqrestore(&domain->cache_lock, flags);
+ cache_tag_flush_range(domain, 0, ULONG_MAX, 0);
}
/*
@@ -382,6 +498,7 @@ void cache_tag_flush_all(struct dmar_domain *domain)
void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start,
unsigned long end)
{
+ struct intel_iommu *iommu = NULL;
unsigned long pages, mask, addr;
struct cache_tag *tag;
unsigned long flags;
@@ -390,30 +507,22 @@ void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start,
spin_lock_irqsave(&domain->cache_lock, flags);
list_for_each_entry(tag, &domain->cache_tags, node) {
- struct intel_iommu *iommu = tag->iommu;
+ if (iommu && iommu != tag->iommu)
+ qi_batch_flush_descs(iommu, domain->qi_batch);
+ iommu = tag->iommu;
- if (!cap_caching_mode(iommu->cap) || domain->use_first_level) {
+ if (!cap_caching_mode(iommu->cap) ||
+ intel_domain_is_fs_paging(domain)) {
iommu_flush_write_buffer(iommu);
continue;
}
if (tag->type == CACHE_TAG_IOTLB ||
- tag->type == CACHE_TAG_NESTING_IOTLB) {
- /*
- * Fallback to domain selective flush if no
- * PSI support or the size is too big.
- */
- if (!cap_pgsel_inv(iommu->cap) ||
- mask > cap_max_amask_val(iommu->cap))
- iommu->flush.flush_iotlb(iommu, tag->domain_id,
- 0, 0, DMA_TLB_DSI_FLUSH);
- else
- iommu->flush.flush_iotlb(iommu, tag->domain_id,
- addr, mask,
- DMA_TLB_PSI_FLUSH);
- }
+ tag->type == CACHE_TAG_NESTING_IOTLB)
+ cache_tag_flush_iotlb(domain, tag, addr, pages, mask, 0);
trace_cache_tag_flush_range_np(tag, start, end, addr, pages, mask);
}
+ qi_batch_flush_descs(iommu, domain->qi_batch);
spin_unlock_irqrestore(&domain->cache_lock, flags);
}
diff --git a/drivers/iommu/intel/cap_audit.c b/drivers/iommu/intel/cap_audit.c
deleted file mode 100644
index 9862dc20b35e..000000000000
--- a/drivers/iommu/intel/cap_audit.c
+++ /dev/null
@@ -1,217 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * cap_audit.c - audit iommu capabilities for boot time and hot plug
- *
- * Copyright (C) 2021 Intel Corporation
- *
- * Author: Kyung Min Park <kyung.min.park@intel.com>
- * Lu Baolu <baolu.lu@linux.intel.com>
- */
-
-#define pr_fmt(fmt) "DMAR: " fmt
-
-#include "iommu.h"
-#include "cap_audit.h"
-
-static u64 intel_iommu_cap_sanity;
-static u64 intel_iommu_ecap_sanity;
-
-static inline void check_irq_capabilities(struct intel_iommu *a,
- struct intel_iommu *b)
-{
- CHECK_FEATURE_MISMATCH(a, b, cap, pi_support, CAP_PI_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, eim_support, ECAP_EIM_MASK);
-}
-
-static inline void check_dmar_capabilities(struct intel_iommu *a,
- struct intel_iommu *b)
-{
- MINIMAL_FEATURE_IOMMU(b, cap, CAP_MAMV_MASK);
- MINIMAL_FEATURE_IOMMU(b, cap, CAP_NFR_MASK);
- MINIMAL_FEATURE_IOMMU(b, cap, CAP_SLLPS_MASK);
- MINIMAL_FEATURE_IOMMU(b, cap, CAP_FRO_MASK);
- MINIMAL_FEATURE_IOMMU(b, cap, CAP_MGAW_MASK);
- MINIMAL_FEATURE_IOMMU(b, cap, CAP_SAGAW_MASK);
- MINIMAL_FEATURE_IOMMU(b, cap, CAP_NDOMS_MASK);
- MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_PSS_MASK);
- MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_MHMV_MASK);
- MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_IRO_MASK);
-
- CHECK_FEATURE_MISMATCH(a, b, cap, fl5lp_support, CAP_FL5LP_MASK);
- CHECK_FEATURE_MISMATCH(a, b, cap, fl1gp_support, CAP_FL1GP_MASK);
- CHECK_FEATURE_MISMATCH(a, b, cap, read_drain, CAP_RD_MASK);
- CHECK_FEATURE_MISMATCH(a, b, cap, write_drain, CAP_WD_MASK);
- CHECK_FEATURE_MISMATCH(a, b, cap, pgsel_inv, CAP_PSI_MASK);
- CHECK_FEATURE_MISMATCH(a, b, cap, zlr, CAP_ZLR_MASK);
- CHECK_FEATURE_MISMATCH(a, b, cap, caching_mode, CAP_CM_MASK);
- CHECK_FEATURE_MISMATCH(a, b, cap, phmr, CAP_PHMR_MASK);
- CHECK_FEATURE_MISMATCH(a, b, cap, plmr, CAP_PLMR_MASK);
- CHECK_FEATURE_MISMATCH(a, b, cap, rwbf, CAP_RWBF_MASK);
- CHECK_FEATURE_MISMATCH(a, b, cap, afl, CAP_AFL_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, rps, ECAP_RPS_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, smpwc, ECAP_SMPWC_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, flts, ECAP_FLTS_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, slts, ECAP_SLTS_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, nwfs, ECAP_NWFS_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, slads, ECAP_SLADS_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, smts, ECAP_SMTS_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, pds, ECAP_PDS_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, dit, ECAP_DIT_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, pasid, ECAP_PASID_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, eafs, ECAP_EAFS_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, srs, ECAP_SRS_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, ers, ECAP_ERS_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, prs, ECAP_PRS_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, nest, ECAP_NEST_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, mts, ECAP_MTS_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, sc_support, ECAP_SC_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, pass_through, ECAP_PT_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, dev_iotlb_support, ECAP_DT_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, qis, ECAP_QI_MASK);
- CHECK_FEATURE_MISMATCH(a, b, ecap, coherent, ECAP_C_MASK);
-}
-
-static int cap_audit_hotplug(struct intel_iommu *iommu, enum cap_audit_type type)
-{
- bool mismatch = false;
- u64 old_cap = intel_iommu_cap_sanity;
- u64 old_ecap = intel_iommu_ecap_sanity;
-
- if (type == CAP_AUDIT_HOTPLUG_IRQR) {
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, pi_support, CAP_PI_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, eim_support, ECAP_EIM_MASK);
- goto out;
- }
-
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, fl5lp_support, CAP_FL5LP_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, fl1gp_support, CAP_FL1GP_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, read_drain, CAP_RD_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, write_drain, CAP_WD_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, pgsel_inv, CAP_PSI_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, zlr, CAP_ZLR_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, caching_mode, CAP_CM_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, phmr, CAP_PHMR_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, plmr, CAP_PLMR_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, rwbf, CAP_RWBF_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, afl, CAP_AFL_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, rps, ECAP_RPS_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, smpwc, ECAP_SMPWC_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, flts, ECAP_FLTS_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, slts, ECAP_SLTS_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, nwfs, ECAP_NWFS_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, slads, ECAP_SLADS_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, smts, ECAP_SMTS_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pds, ECAP_PDS_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, dit, ECAP_DIT_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pasid, ECAP_PASID_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, eafs, ECAP_EAFS_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, srs, ECAP_SRS_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, ers, ECAP_ERS_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, prs, ECAP_PRS_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, nest, ECAP_NEST_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, mts, ECAP_MTS_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, sc_support, ECAP_SC_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pass_through, ECAP_PT_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, dev_iotlb_support, ECAP_DT_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, qis, ECAP_QI_MASK);
- CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, coherent, ECAP_C_MASK);
-
- /* Abort hot plug if the hot plug iommu feature is smaller than global */
- MINIMAL_FEATURE_HOTPLUG(iommu, cap, max_amask_val, CAP_MAMV_MASK, mismatch);
- MINIMAL_FEATURE_HOTPLUG(iommu, cap, num_fault_regs, CAP_NFR_MASK, mismatch);
- MINIMAL_FEATURE_HOTPLUG(iommu, cap, super_page_val, CAP_SLLPS_MASK, mismatch);
- MINIMAL_FEATURE_HOTPLUG(iommu, cap, fault_reg_offset, CAP_FRO_MASK, mismatch);
- MINIMAL_FEATURE_HOTPLUG(iommu, cap, mgaw, CAP_MGAW_MASK, mismatch);
- MINIMAL_FEATURE_HOTPLUG(iommu, cap, sagaw, CAP_SAGAW_MASK, mismatch);
- MINIMAL_FEATURE_HOTPLUG(iommu, cap, ndoms, CAP_NDOMS_MASK, mismatch);
- MINIMAL_FEATURE_HOTPLUG(iommu, ecap, pss, ECAP_PSS_MASK, mismatch);
- MINIMAL_FEATURE_HOTPLUG(iommu, ecap, max_handle_mask, ECAP_MHMV_MASK, mismatch);
- MINIMAL_FEATURE_HOTPLUG(iommu, ecap, iotlb_offset, ECAP_IRO_MASK, mismatch);
-
-out:
- if (mismatch) {
- intel_iommu_cap_sanity = old_cap;
- intel_iommu_ecap_sanity = old_ecap;
- return -EFAULT;
- }
-
- return 0;
-}
-
-static int cap_audit_static(struct intel_iommu *iommu, enum cap_audit_type type)
-{
- struct dmar_drhd_unit *d;
- struct intel_iommu *i;
- int rc = 0;
-
- rcu_read_lock();
- if (list_empty(&dmar_drhd_units))
- goto out;
-
- for_each_active_iommu(i, d) {
- if (!iommu) {
- intel_iommu_ecap_sanity = i->ecap;
- intel_iommu_cap_sanity = i->cap;
- iommu = i;
- continue;
- }
-
- if (type == CAP_AUDIT_STATIC_DMAR)
- check_dmar_capabilities(iommu, i);
- else
- check_irq_capabilities(iommu, i);
- }
-
- /*
- * If the system is sane to support scalable mode, either SL or FL
- * should be sane.
- */
- if (intel_cap_smts_sanity() &&
- !intel_cap_flts_sanity() && !intel_cap_slts_sanity())
- rc = -EOPNOTSUPP;
-
-out:
- rcu_read_unlock();
- return rc;
-}
-
-int intel_cap_audit(enum cap_audit_type type, struct intel_iommu *iommu)
-{
- switch (type) {
- case CAP_AUDIT_STATIC_DMAR:
- case CAP_AUDIT_STATIC_IRQR:
- return cap_audit_static(iommu, type);
- case CAP_AUDIT_HOTPLUG_DMAR:
- case CAP_AUDIT_HOTPLUG_IRQR:
- return cap_audit_hotplug(iommu, type);
- default:
- break;
- }
-
- return -EFAULT;
-}
-
-bool intel_cap_smts_sanity(void)
-{
- return ecap_smts(intel_iommu_ecap_sanity);
-}
-
-bool intel_cap_pasid_sanity(void)
-{
- return ecap_pasid(intel_iommu_ecap_sanity);
-}
-
-bool intel_cap_nest_sanity(void)
-{
- return ecap_nest(intel_iommu_ecap_sanity);
-}
-
-bool intel_cap_flts_sanity(void)
-{
- return ecap_flts(intel_iommu_ecap_sanity);
-}
-
-bool intel_cap_slts_sanity(void)
-{
- return ecap_slts(intel_iommu_ecap_sanity);
-}
diff --git a/drivers/iommu/intel/cap_audit.h b/drivers/iommu/intel/cap_audit.h
deleted file mode 100644
index d07b75938961..000000000000
--- a/drivers/iommu/intel/cap_audit.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * cap_audit.h - audit iommu capabilities header
- *
- * Copyright (C) 2021 Intel Corporation
- *
- * Author: Kyung Min Park <kyung.min.park@intel.com>
- */
-
-/*
- * Capability Register Mask
- */
-#define CAP_FL5LP_MASK BIT_ULL(60)
-#define CAP_PI_MASK BIT_ULL(59)
-#define CAP_FL1GP_MASK BIT_ULL(56)
-#define CAP_RD_MASK BIT_ULL(55)
-#define CAP_WD_MASK BIT_ULL(54)
-#define CAP_MAMV_MASK GENMASK_ULL(53, 48)
-#define CAP_NFR_MASK GENMASK_ULL(47, 40)
-#define CAP_PSI_MASK BIT_ULL(39)
-#define CAP_SLLPS_MASK GENMASK_ULL(37, 34)
-#define CAP_FRO_MASK GENMASK_ULL(33, 24)
-#define CAP_ZLR_MASK BIT_ULL(22)
-#define CAP_MGAW_MASK GENMASK_ULL(21, 16)
-#define CAP_SAGAW_MASK GENMASK_ULL(12, 8)
-#define CAP_CM_MASK BIT_ULL(7)
-#define CAP_PHMR_MASK BIT_ULL(6)
-#define CAP_PLMR_MASK BIT_ULL(5)
-#define CAP_RWBF_MASK BIT_ULL(4)
-#define CAP_AFL_MASK BIT_ULL(3)
-#define CAP_NDOMS_MASK GENMASK_ULL(2, 0)
-
-/*
- * Extended Capability Register Mask
- */
-#define ECAP_RPS_MASK BIT_ULL(49)
-#define ECAP_SMPWC_MASK BIT_ULL(48)
-#define ECAP_FLTS_MASK BIT_ULL(47)
-#define ECAP_SLTS_MASK BIT_ULL(46)
-#define ECAP_SLADS_MASK BIT_ULL(45)
-#define ECAP_VCS_MASK BIT_ULL(44)
-#define ECAP_SMTS_MASK BIT_ULL(43)
-#define ECAP_PDS_MASK BIT_ULL(42)
-#define ECAP_DIT_MASK BIT_ULL(41)
-#define ECAP_PASID_MASK BIT_ULL(40)
-#define ECAP_PSS_MASK GENMASK_ULL(39, 35)
-#define ECAP_EAFS_MASK BIT_ULL(34)
-#define ECAP_NWFS_MASK BIT_ULL(33)
-#define ECAP_SRS_MASK BIT_ULL(31)
-#define ECAP_ERS_MASK BIT_ULL(30)
-#define ECAP_PRS_MASK BIT_ULL(29)
-#define ECAP_NEST_MASK BIT_ULL(26)
-#define ECAP_MTS_MASK BIT_ULL(25)
-#define ECAP_MHMV_MASK GENMASK_ULL(23, 20)
-#define ECAP_IRO_MASK GENMASK_ULL(17, 8)
-#define ECAP_SC_MASK BIT_ULL(7)
-#define ECAP_PT_MASK BIT_ULL(6)
-#define ECAP_EIM_MASK BIT_ULL(4)
-#define ECAP_DT_MASK BIT_ULL(2)
-#define ECAP_QI_MASK BIT_ULL(1)
-#define ECAP_C_MASK BIT_ULL(0)
-
-/*
- * u64 intel_iommu_cap_sanity, intel_iommu_ecap_sanity will be adjusted as each
- * IOMMU gets audited.
- */
-#define DO_CHECK_FEATURE_MISMATCH(a, b, cap, feature, MASK) \
-do { \
- if (cap##_##feature(a) != cap##_##feature(b)) { \
- intel_iommu_##cap##_sanity &= ~(MASK); \
- pr_info("IOMMU feature %s inconsistent", #feature); \
- } \
-} while (0)
-
-#define CHECK_FEATURE_MISMATCH(a, b, cap, feature, MASK) \
- DO_CHECK_FEATURE_MISMATCH((a)->cap, (b)->cap, cap, feature, MASK)
-
-#define CHECK_FEATURE_MISMATCH_HOTPLUG(b, cap, feature, MASK) \
-do { \
- if (cap##_##feature(intel_iommu_##cap##_sanity)) \
- DO_CHECK_FEATURE_MISMATCH(intel_iommu_##cap##_sanity, \
- (b)->cap, cap, feature, MASK); \
-} while (0)
-
-#define MINIMAL_FEATURE_IOMMU(iommu, cap, MASK) \
-do { \
- u64 min_feature = intel_iommu_##cap##_sanity & (MASK); \
- min_feature = min_t(u64, min_feature, (iommu)->cap & (MASK)); \
- intel_iommu_##cap##_sanity = (intel_iommu_##cap##_sanity & ~(MASK)) | \
- min_feature; \
-} while (0)
-
-#define MINIMAL_FEATURE_HOTPLUG(iommu, cap, feature, MASK, mismatch) \
-do { \
- if ((intel_iommu_##cap##_sanity & (MASK)) > \
- (cap##_##feature((iommu)->cap))) \
- mismatch = true; \
- else \
- (iommu)->cap = ((iommu)->cap & ~(MASK)) | \
- (intel_iommu_##cap##_sanity & (MASK)); \
-} while (0)
-
-enum cap_audit_type {
- CAP_AUDIT_STATIC_DMAR,
- CAP_AUDIT_STATIC_IRQR,
- CAP_AUDIT_HOTPLUG_DMAR,
- CAP_AUDIT_HOTPLUG_IRQR,
-};
-
-bool intel_cap_smts_sanity(void);
-bool intel_cap_pasid_sanity(void);
-bool intel_cap_nest_sanity(void);
-bool intel_cap_flts_sanity(void);
-bool intel_cap_slts_sanity(void);
-
-static inline bool scalable_mode_support(void)
-{
- return (intel_iommu_sm && intel_cap_smts_sanity());
-}
-
-static inline bool pasid_mode_support(void)
-{
- return scalable_mode_support() && intel_cap_pasid_sanity();
-}
-
-static inline bool nested_mode_support(void)
-{
- return scalable_mode_support() && intel_cap_nest_sanity();
-}
-
-int intel_cap_audit(enum cap_audit_type type, struct intel_iommu *iommu);
diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c
index affbf4a1558d..617fd81a80f0 100644
--- a/drivers/iommu/intel/debugfs.c
+++ b/drivers/iommu/intel/debugfs.c
@@ -62,8 +62,6 @@ static const struct iommu_regset iommu_regs_64[] = {
IOMMU_REGSET_ENTRY(CAP),
IOMMU_REGSET_ENTRY(ECAP),
IOMMU_REGSET_ENTRY(RTADDR),
- IOMMU_REGSET_ENTRY(CCMD),
- IOMMU_REGSET_ENTRY(AFLOG),
IOMMU_REGSET_ENTRY(PHMBASE),
IOMMU_REGSET_ENTRY(PHMLIMIT),
IOMMU_REGSET_ENTRY(IQH),
@@ -435,8 +433,21 @@ static int domain_translation_struct_show(struct seq_file *m,
}
pgd &= VTD_PAGE_MASK;
} else { /* legacy mode */
- pgd = context->lo & VTD_PAGE_MASK;
- agaw = context->hi & 7;
+ u8 tt = (u8)(context->lo & GENMASK_ULL(3, 2)) >> 2;
+
+ /*
+ * According to Translation Type(TT),
+ * get the page table pointer(SSPTPTR).
+ */
+ switch (tt) {
+ case CONTEXT_TT_MULTI_LEVEL:
+ case CONTEXT_TT_DEV_IOTLB:
+ pgd = context->lo & VTD_PAGE_MASK;
+ agaw = context->hi & 7;
+ break;
+ default:
+ goto iommu_unlock;
+ }
}
seq_printf(m, "Device %04x:%02x:%02x.%x ",
@@ -648,17 +659,11 @@ DEFINE_SHOW_ATTRIBUTE(ir_translation_struct);
static void latency_show_one(struct seq_file *m, struct intel_iommu *iommu,
struct dmar_drhd_unit *drhd)
{
- int ret;
-
seq_printf(m, "IOMMU: %s Register Base Address: %llx\n",
iommu->name, drhd->reg_base_addr);
- ret = dmar_latency_snapshot(iommu, debug_buf, DEBUG_BUFFER_SIZE);
- if (ret < 0)
- seq_puts(m, "Failed to get latency snapshot");
- else
- seq_puts(m, debug_buf);
- seq_puts(m, "\n");
+ dmar_latency_snapshot(iommu, debug_buf, DEBUG_BUFFER_SIZE);
+ seq_printf(m, "%s\n", debug_buf);
}
static int latency_show(struct seq_file *m, void *v)
diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index 304e84949ca7..ec975c73cfe6 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -935,14 +935,11 @@ void __init detect_intel_iommu(void)
pci_request_acs();
}
-#ifdef CONFIG_X86
if (!ret) {
x86_init.iommu.iommu_init = intel_iommu_init;
x86_platform.iommu_shutdown = intel_iommu_shutdown;
}
-#endif
-
if (dmar_tbl) {
acpi_put_table(dmar_tbl);
dmar_tbl = NULL;
@@ -1060,7 +1057,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
err = iommu->seq_id;
goto error;
}
- sprintf(iommu->name, "dmar%d", iommu->seq_id);
+ snprintf(iommu->name, sizeof(iommu->name), "dmar%d", iommu->seq_id);
err = map_iommu(iommu, drhd);
if (err) {
@@ -1099,6 +1096,9 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
spin_lock_init(&iommu->device_rbtree_lock);
mutex_init(&iommu->iopf_lock);
iommu->node = NUMA_NO_NODE;
+ spin_lock_init(&iommu->lock);
+ ida_init(&iommu->domain_ida);
+ mutex_init(&iommu->did_lock);
ver = readl(iommu->reg + DMAR_VER_REG);
pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n",
@@ -1187,7 +1187,7 @@ static void free_iommu(struct intel_iommu *iommu)
}
if (iommu->qi) {
- iommu_free_page(iommu->qi->desc);
+ iommu_free_pages(iommu->qi->desc);
kfree(iommu->qi->desc_status);
kfree(iommu->qi);
}
@@ -1195,6 +1195,7 @@ static void free_iommu(struct intel_iommu *iommu)
if (iommu->reg)
unmap_iommu(iommu);
+ ida_destroy(&iommu->domain_ida);
ida_free(&dmar_seq_ids, iommu->seq_id);
kfree(iommu);
}
@@ -1204,9 +1205,7 @@ static void free_iommu(struct intel_iommu *iommu)
*/
static inline void reclaim_free_desc(struct q_inval *qi)
{
- while (qi->desc_status[qi->free_tail] == QI_DONE ||
- qi->desc_status[qi->free_tail] == QI_ABORT) {
- qi->desc_status[qi->free_tail] = QI_FREE;
+ while (qi->desc_status[qi->free_tail] == QI_FREE && qi->free_tail != qi->free_head) {
qi->free_tail = (qi->free_tail + 1) % QI_LENGTH;
qi->free_cnt++;
}
@@ -1446,7 +1445,7 @@ restart:
*/
writel(qi->free_head << shift, iommu->reg + DMAR_IQT_REG);
- while (qi->desc_status[wait_index] != QI_DONE) {
+ while (READ_ONCE(qi->desc_status[wait_index]) != QI_DONE) {
/*
* We will leave the interrupts disabled, to prevent interrupt
* context to queue another cmd while a cmd is already submitted
@@ -1463,8 +1462,16 @@ restart:
raw_spin_lock(&qi->q_lock);
}
- for (i = 0; i < count; i++)
- qi->desc_status[(index + i) % QI_LENGTH] = QI_DONE;
+ /*
+ * The reclaim code can free descriptors from multiple submissions
+ * starting from the tail of the queue. When count == 0, the
+ * status of the standalone wait descriptor at the tail of the queue
+ * must be set to QI_FREE to allow the reclaim code to proceed.
+ * It is also possible that descriptors from one of the previous
+ * submissions has to be reclaimed by a subsequent submission.
+ */
+ for (i = 0; i <= count; i++)
+ qi->desc_status[(index + i) % QI_LENGTH] = QI_FREE;
reclaim_free_desc(qi);
raw_spin_unlock_irqrestore(&qi->q_lock, flags);
@@ -1520,24 +1527,9 @@ void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
unsigned int size_order, u64 type)
{
- u8 dw = 0, dr = 0;
-
struct qi_desc desc;
- int ih = 0;
-
- if (cap_write_drain(iommu->cap))
- dw = 1;
-
- if (cap_read_drain(iommu->cap))
- dr = 1;
-
- desc.qw0 = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw)
- | QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE;
- desc.qw1 = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih)
- | QI_IOTLB_AM(size_order);
- desc.qw2 = 0;
- desc.qw3 = 0;
+ qi_desc_iotlb(iommu, did, addr, size_order, type, &desc);
qi_submit_sync(iommu, &desc, 1, 0);
}
@@ -1555,20 +1547,7 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
if (!(iommu->gcmd & DMA_GCMD_TE))
return;
- if (mask) {
- addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1;
- desc.qw1 = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE;
- } else
- desc.qw1 = QI_DEV_IOTLB_ADDR(addr);
-
- if (qdep >= QI_DEV_IOTLB_MAX_INVS)
- qdep = 0;
-
- desc.qw0 = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
- QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid);
- desc.qw2 = 0;
- desc.qw3 = 0;
-
+ qi_desc_dev_iotlb(sid, pfsid, qdep, addr, mask, &desc);
qi_submit_sync(iommu, &desc, 1, 0);
}
@@ -1588,28 +1567,7 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr,
return;
}
- if (npages == -1) {
- desc.qw0 = QI_EIOTLB_PASID(pasid) |
- QI_EIOTLB_DID(did) |
- QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
- QI_EIOTLB_TYPE;
- desc.qw1 = 0;
- } else {
- int mask = ilog2(__roundup_pow_of_two(npages));
- unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask));
-
- if (WARN_ON_ONCE(!IS_ALIGNED(addr, align)))
- addr = ALIGN_DOWN(addr, align);
-
- desc.qw0 = QI_EIOTLB_PASID(pasid) |
- QI_EIOTLB_DID(did) |
- QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) |
- QI_EIOTLB_TYPE;
- desc.qw1 = QI_EIOTLB_ADDR(addr) |
- QI_EIOTLB_IH(ih) |
- QI_EIOTLB_AM(mask);
- }
-
+ qi_desc_piotlb(did, pasid, addr, npages, ih, &desc);
qi_submit_sync(iommu, &desc, 1, 0);
}
@@ -1617,7 +1575,6 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr,
void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid,
u32 pasid, u16 qdep, u64 addr, unsigned int size_order)
{
- unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1);
struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0};
/*
@@ -1629,40 +1586,9 @@ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid,
if (!(iommu->gcmd & DMA_GCMD_TE))
return;
- desc.qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) |
- QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE |
- QI_DEV_IOTLB_PFSID(pfsid);
-
- /*
- * If S bit is 0, we only flush a single page. If S bit is set,
- * The least significant zero bit indicates the invalidation address
- * range. VT-d spec 6.5.2.6.
- * e.g. address bit 12[0] indicates 8KB, 13[0] indicates 16KB.
- * size order = 0 is PAGE_SIZE 4KB
- * Max Invs Pending (MIP) is set to 0 for now until we have DIT in
- * ECAP.
- */
- if (!IS_ALIGNED(addr, VTD_PAGE_SIZE << size_order))
- pr_warn_ratelimited("Invalidate non-aligned address %llx, order %d\n",
- addr, size_order);
-
- /* Take page address */
- desc.qw1 = QI_DEV_EIOTLB_ADDR(addr);
-
- if (size_order) {
- /*
- * Existing 0s in address below size_order may be the least
- * significant bit, we must set them to 1s to avoid having
- * smaller size than desired.
- */
- desc.qw1 |= GENMASK_ULL(size_order + VTD_PAGE_SHIFT - 1,
- VTD_PAGE_SHIFT);
- /* Clear size_order bit to indicate size */
- desc.qw1 &= ~mask;
- /* Set the S bit to indicate flushing more than 1 page */
- desc.qw1 |= QI_DEV_EIOTLB_SIZE;
- }
-
+ qi_desc_dev_iotlb_pasid(sid, pfsid, pasid,
+ qdep, addr, size_order,
+ &desc);
qi_submit_sync(iommu, &desc, 1, 0);
}
@@ -1756,7 +1682,6 @@ int dmar_enable_qi(struct intel_iommu *iommu)
{
struct q_inval *qi;
void *desc;
- int order;
if (!ecap_qis(iommu->ecap))
return -ENOENT;
@@ -1777,8 +1702,9 @@ int dmar_enable_qi(struct intel_iommu *iommu)
* Need two pages to accommodate 256 descriptors of 256 bits each
* if the remapping hardware supports scalable mode translation.
*/
- order = ecap_smts(iommu->ecap) ? 1 : 0;
- desc = iommu_alloc_pages_node(iommu->node, GFP_ATOMIC, order);
+ desc = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC,
+ ecap_smts(iommu->ecap) ? SZ_8K :
+ SZ_4K);
if (!desc) {
kfree(qi);
iommu->qi = NULL;
@@ -1789,7 +1715,7 @@ int dmar_enable_qi(struct intel_iommu *iommu)
qi->desc_status = kcalloc(QI_LENGTH, sizeof(int), GFP_ATOMIC);
if (!qi->desc_status) {
- iommu_free_page(qi->desc);
+ iommu_free_pages(qi->desc);
kfree(qi);
iommu->qi = NULL;
return -ENOMEM;
@@ -1970,19 +1896,6 @@ void dmar_msi_write(int irq, struct msi_msg *msg)
raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
-void dmar_msi_read(int irq, struct msi_msg *msg)
-{
- struct intel_iommu *iommu = irq_get_handler_data(irq);
- int reg = dmar_msi_reg(iommu, irq);
- unsigned long flag;
-
- raw_spin_lock_irqsave(&iommu->register_lock, flag);
- msg->data = readl(iommu->reg + reg + 4);
- msg->address_lo = readl(iommu->reg + reg + 8);
- msg->address_hi = readl(iommu->reg + reg + 12);
- raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
u8 fault_reason, u32 pasid, u16 source_id,
unsigned long long addr)
@@ -2131,6 +2044,7 @@ int enable_drhd_fault_handling(unsigned int cpu)
/*
* Enable fault control interrupt.
*/
+ guard(rwsem_read)(&dmar_global_lock);
for_each_iommu(iommu, drhd) {
u32 fault_status;
int ret;
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index fd11a080380c..134302fbcd92 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -29,13 +29,12 @@
#include "../irq_remapping.h"
#include "../iommu-pages.h"
#include "pasid.h"
-#include "cap_audit.h"
#include "perfmon.h"
#define ROOT_SIZE VTD_PAGE_SIZE
#define CONTEXT_SIZE VTD_PAGE_SIZE
-#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
+#define IS_GFX_DEVICE(pdev) pci_is_display(pdev)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
@@ -46,18 +45,13 @@
#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
-#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
-#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
-
-/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
- to match. That way, we can use 'unsigned long' for PFNs with impunity. */
-#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
- __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
-#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
-
static void __init check_tylersburg_isoch(void);
+static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
+ bool enable);
static int rwbf_quirk;
+#define rwbf_required(iommu) (rwbf_quirk || cap_rwbf((iommu)->cap))
+
/*
* set to 1 to panic kernel if can't successfully enable VT-d
* (used when kernel is launched w/ TXT)
@@ -167,15 +161,6 @@ static void device_rbtree_remove(struct device_domain_info *info)
spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
}
-/*
- * This domain is a statically identity mapping domain.
- * 1. This domain creats a static 1:1 mapping to all usable memory.
- * 2. It maps to each iommu if successful.
- * 3. Each iommu mapps to this domain if successful.
- */
-static struct dmar_domain *si_domain;
-static int hw_pass_through = 1;
-
struct dmar_rmrr_unit {
struct list_head list; /* list of rmrr units */
struct acpi_dmar_header *hdr; /* ACPI header */
@@ -225,7 +210,6 @@ static int disable_igfx_iommu;
#define IDENTMAP_AZALIA 4
const struct iommu_ops intel_iommu_ops;
-static const struct iommu_dirty_ops intel_dirty_ops;
static bool translation_pre_enabled(struct intel_iommu *iommu)
{
@@ -293,18 +277,6 @@ static int __init intel_iommu_setup(char *str)
}
__setup("intel_iommu=", intel_iommu_setup);
-static int domain_type_is_si(struct dmar_domain *domain)
-{
- return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
-}
-
-static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
-{
- int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
-
- return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
-}
-
/*
* Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
* Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
@@ -366,135 +338,6 @@ static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
}
-static void domain_update_iommu_coherency(struct dmar_domain *domain)
-{
- struct iommu_domain_info *info;
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- bool found = false;
- unsigned long i;
-
- domain->iommu_coherency = true;
- xa_for_each(&domain->iommu_array, i, info) {
- found = true;
- if (!iommu_paging_structure_coherency(info->iommu)) {
- domain->iommu_coherency = false;
- break;
- }
- }
- if (found)
- return;
-
- /* No hardware attached; use lowest common denominator */
- rcu_read_lock();
- for_each_active_iommu(iommu, drhd) {
- if (!iommu_paging_structure_coherency(iommu)) {
- domain->iommu_coherency = false;
- break;
- }
- }
- rcu_read_unlock();
-}
-
-static int domain_update_iommu_superpage(struct dmar_domain *domain,
- struct intel_iommu *skip)
-{
- struct dmar_drhd_unit *drhd;
- struct intel_iommu *iommu;
- int mask = 0x3;
-
- if (!intel_iommu_superpage)
- return 0;
-
- /* set iommu_superpage to the smallest common denominator */
- rcu_read_lock();
- for_each_active_iommu(iommu, drhd) {
- if (iommu != skip) {
- if (domain && domain->use_first_level) {
- if (!cap_fl1gp_support(iommu->cap))
- mask = 0x1;
- } else {
- mask &= cap_super_page_val(iommu->cap);
- }
-
- if (!mask)
- break;
- }
- }
- rcu_read_unlock();
-
- return fls(mask);
-}
-
-static int domain_update_device_node(struct dmar_domain *domain)
-{
- struct device_domain_info *info;
- int nid = NUMA_NO_NODE;
- unsigned long flags;
-
- spin_lock_irqsave(&domain->lock, flags);
- list_for_each_entry(info, &domain->devices, link) {
- /*
- * There could possibly be multiple device numa nodes as devices
- * within the same domain may sit behind different IOMMUs. There
- * isn't perfect answer in such situation, so we select first
- * come first served policy.
- */
- nid = dev_to_node(info->dev);
- if (nid != NUMA_NO_NODE)
- break;
- }
- spin_unlock_irqrestore(&domain->lock, flags);
-
- return nid;
-}
-
-/* Return the super pagesize bitmap if supported. */
-static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
-{
- unsigned long bitmap = 0;
-
- /*
- * 1-level super page supports page size of 2MiB, 2-level super page
- * supports page size of both 2MiB and 1GiB.
- */
- if (domain->iommu_superpage == 1)
- bitmap |= SZ_2M;
- else if (domain->iommu_superpage == 2)
- bitmap |= SZ_2M | SZ_1G;
-
- return bitmap;
-}
-
-/* Some capabilities may be different across iommus */
-void domain_update_iommu_cap(struct dmar_domain *domain)
-{
- domain_update_iommu_coherency(domain);
- domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
-
- /*
- * If RHSA is missing, we should default to the device numa domain
- * as fall back.
- */
- if (domain->nid == NUMA_NO_NODE)
- domain->nid = domain_update_device_node(domain);
-
- /*
- * First-level translation restricts the input-address to a
- * canonical address (i.e., address bits 63:N have the same
- * value as address bit [N-1], where N is 48-bits with 4-level
- * paging and 57-bits with 5-level paging). Hence, skip bit
- * [N-1].
- */
- if (domain->use_first_level)
- domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
- else
- domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
-
- domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
- domain_update_iotlb(domain);
-}
-
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
u8 devfn, int alloc)
{
@@ -524,7 +367,8 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
if (!alloc)
return NULL;
- context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
+ context = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC,
+ SZ_4K);
if (!context)
return NULL;
@@ -680,13 +524,6 @@ out:
return iommu;
}
-static void domain_flush_cache(struct dmar_domain *domain,
- void *addr, int size)
-{
- if (!domain->iommu_coherency)
- clflush_cache_range(addr, size);
-}
-
static void free_context_table(struct intel_iommu *iommu)
{
struct context_entry *context;
@@ -698,17 +535,17 @@ static void free_context_table(struct intel_iommu *iommu)
for (i = 0; i < ROOT_ENTRY_NR; i++) {
context = iommu_context_addr(iommu, i, 0, 0);
if (context)
- iommu_free_page(context);
+ iommu_free_pages(context);
if (!sm_supported(iommu))
continue;
context = iommu_context_addr(iommu, i, 0x80, 0);
if (context)
- iommu_free_page(context);
+ iommu_free_pages(context);
}
- iommu_free_page(iommu->root_entry);
+ iommu_free_pages(iommu->root_entry);
iommu->root_entry = NULL;
}
@@ -722,14 +559,15 @@ static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
while (1) {
offset = pfn_level_offset(pfn, level);
pte = &parent[offset];
- if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
- pr_info("PTE not present at level %d\n", level);
- break;
- }
pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
- if (level == 1)
+ if (!dma_pte_present(pte)) {
+ pr_info("page table not present at level %d\n", level - 1);
+ break;
+ }
+
+ if (level == 1 || dma_pte_superpage(pte))
break;
parent = phys_to_virt(dma_pte_addr(pte));
@@ -752,11 +590,11 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
/* root entry dump */
- rt_entry = &iommu->root_entry[bus];
- if (!rt_entry) {
- pr_info("root table entry is not present\n");
+ if (!iommu->root_entry) {
+ pr_info("root table is not present\n");
return;
}
+ rt_entry = &iommu->root_entry[bus];
if (sm_supported(iommu))
pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
@@ -767,7 +605,7 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
/* context entry dump */
ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
if (!ctx_entry) {
- pr_info("context table entry is not present\n");
+ pr_info("context table is not present\n");
return;
}
@@ -776,17 +614,23 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
/* legacy mode does not require PASID entries */
if (!sm_supported(iommu)) {
+ if (!context_present(ctx_entry)) {
+ pr_info("legacy mode page table is not present\n");
+ return;
+ }
level = agaw_to_level(ctx_entry->hi & 7);
pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
goto pgtable_walk;
}
- /* get the pointer to pasid directory entry */
- dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
- if (!dir) {
- pr_info("pasid directory entry is not present\n");
+ if (!context_present(ctx_entry)) {
+ pr_info("pasid directory table is not present\n");
return;
}
+
+ /* get the pointer to pasid directory entry */
+ dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
+
/* For request-without-pasid, get the pasid from context entry */
if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
pasid = IOMMU_NO_PASID;
@@ -798,7 +642,7 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
/* get the pointer to the pasid table entry */
entries = get_pasid_table_from_pde(pde);
if (!entries) {
- pr_info("pasid table entry is not present\n");
+ pr_info("pasid table is not present\n");
return;
}
index = pasid & PASID_PTE_MASK;
@@ -806,6 +650,11 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
for (i = 0; i < ARRAY_SIZE(pte->val); i++)
pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
+ if (!pasid_pte_is_present(pte)) {
+ pr_info("scalable mode page table is not present\n");
+ return;
+ }
+
if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
@@ -819,286 +668,12 @@ pgtable_walk:
}
#endif
-static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
- unsigned long pfn, int *target_level,
- gfp_t gfp)
-{
- struct dma_pte *parent, *pte;
- int level = agaw_to_level(domain->agaw);
- int offset;
-
- if (!domain_pfn_supported(domain, pfn))
- /* Address beyond IOMMU's addressing capabilities. */
- return NULL;
-
- parent = domain->pgd;
-
- while (1) {
- void *tmp_page;
-
- offset = pfn_level_offset(pfn, level);
- pte = &parent[offset];
- if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
- break;
- if (level == *target_level)
- break;
-
- if (!dma_pte_present(pte)) {
- uint64_t pteval, tmp;
-
- tmp_page = iommu_alloc_page_node(domain->nid, gfp);
-
- if (!tmp_page)
- return NULL;
-
- domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
- pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
- if (domain->use_first_level)
- pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
-
- tmp = 0ULL;
- if (!try_cmpxchg64(&pte->val, &tmp, pteval))
- /* Someone else set it while we were thinking; use theirs. */
- iommu_free_page(tmp_page);
- else
- domain_flush_cache(domain, pte, sizeof(*pte));
- }
- if (level == 1)
- break;
-
- parent = phys_to_virt(dma_pte_addr(pte));
- level--;
- }
-
- if (!*target_level)
- *target_level = level;
-
- return pte;
-}
-
-/* return address's pte at specific level */
-static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
- unsigned long pfn,
- int level, int *large_page)
-{
- struct dma_pte *parent, *pte;
- int total = agaw_to_level(domain->agaw);
- int offset;
-
- parent = domain->pgd;
- while (level <= total) {
- offset = pfn_level_offset(pfn, total);
- pte = &parent[offset];
- if (level == total)
- return pte;
-
- if (!dma_pte_present(pte)) {
- *large_page = total;
- break;
- }
-
- if (dma_pte_superpage(pte)) {
- *large_page = total;
- return pte;
- }
-
- parent = phys_to_virt(dma_pte_addr(pte));
- total--;
- }
- return NULL;
-}
-
-/* clear last level pte, a tlb flush should be followed */
-static void dma_pte_clear_range(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long last_pfn)
-{
- unsigned int large_page;
- struct dma_pte *first_pte, *pte;
-
- if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
- WARN_ON(start_pfn > last_pfn))
- return;
-
- /* we don't need lock here; nobody else touches the iova range */
- do {
- large_page = 1;
- first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
- if (!pte) {
- start_pfn = align_to_level(start_pfn + 1, large_page + 1);
- continue;
- }
- do {
- dma_clear_pte(pte);
- start_pfn += lvl_to_nr_pages(large_page);
- pte++;
- } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
-
- domain_flush_cache(domain, first_pte,
- (void *)pte - (void *)first_pte);
-
- } while (start_pfn && start_pfn <= last_pfn);
-}
-
-static void dma_pte_free_level(struct dmar_domain *domain, int level,
- int retain_level, struct dma_pte *pte,
- unsigned long pfn, unsigned long start_pfn,
- unsigned long last_pfn)
-{
- pfn = max(start_pfn, pfn);
- pte = &pte[pfn_level_offset(pfn, level)];
-
- do {
- unsigned long level_pfn;
- struct dma_pte *level_pte;
-
- if (!dma_pte_present(pte) || dma_pte_superpage(pte))
- goto next;
-
- level_pfn = pfn & level_mask(level);
- level_pte = phys_to_virt(dma_pte_addr(pte));
-
- if (level > 2) {
- dma_pte_free_level(domain, level - 1, retain_level,
- level_pte, level_pfn, start_pfn,
- last_pfn);
- }
-
- /*
- * Free the page table if we're below the level we want to
- * retain and the range covers the entire table.
- */
- if (level < retain_level && !(start_pfn > level_pfn ||
- last_pfn < level_pfn + level_size(level) - 1)) {
- dma_clear_pte(pte);
- domain_flush_cache(domain, pte, sizeof(*pte));
- iommu_free_page(level_pte);
- }
-next:
- pfn += level_size(level);
- } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
-}
-
-/*
- * clear last level (leaf) ptes and free page table pages below the
- * level we wish to keep intact.
- */
-static void dma_pte_free_pagetable(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long last_pfn,
- int retain_level)
-{
- dma_pte_clear_range(domain, start_pfn, last_pfn);
-
- /* We don't need lock here; nobody else touches the iova range */
- dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
- domain->pgd, 0, start_pfn, last_pfn);
-
- /* free pgd */
- if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
- iommu_free_page(domain->pgd);
- domain->pgd = NULL;
- }
-}
-
-/* When a page at a given level is being unlinked from its parent, we don't
- need to *modify* it at all. All we need to do is make a list of all the
- pages which can be freed just as soon as we've flushed the IOTLB and we
- know the hardware page-walk will no longer touch them.
- The 'pte' argument is the *parent* PTE, pointing to the page that is to
- be freed. */
-static void dma_pte_list_pagetables(struct dmar_domain *domain,
- int level, struct dma_pte *pte,
- struct list_head *freelist)
-{
- struct page *pg;
-
- pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
- list_add_tail(&pg->lru, freelist);
-
- if (level == 1)
- return;
-
- pte = page_address(pg);
- do {
- if (dma_pte_present(pte) && !dma_pte_superpage(pte))
- dma_pte_list_pagetables(domain, level - 1, pte, freelist);
- pte++;
- } while (!first_pte_in_page(pte));
-}
-
-static void dma_pte_clear_level(struct dmar_domain *domain, int level,
- struct dma_pte *pte, unsigned long pfn,
- unsigned long start_pfn, unsigned long last_pfn,
- struct list_head *freelist)
-{
- struct dma_pte *first_pte = NULL, *last_pte = NULL;
-
- pfn = max(start_pfn, pfn);
- pte = &pte[pfn_level_offset(pfn, level)];
-
- do {
- unsigned long level_pfn = pfn & level_mask(level);
-
- if (!dma_pte_present(pte))
- goto next;
-
- /* If range covers entire pagetable, free it */
- if (start_pfn <= level_pfn &&
- last_pfn >= level_pfn + level_size(level) - 1) {
- /* These suborbinate page tables are going away entirely. Don't
- bother to clear them; we're just going to *free* them. */
- if (level > 1 && !dma_pte_superpage(pte))
- dma_pte_list_pagetables(domain, level - 1, pte, freelist);
-
- dma_clear_pte(pte);
- if (!first_pte)
- first_pte = pte;
- last_pte = pte;
- } else if (level > 1) {
- /* Recurse down into a level that isn't *entirely* obsolete */
- dma_pte_clear_level(domain, level - 1,
- phys_to_virt(dma_pte_addr(pte)),
- level_pfn, start_pfn, last_pfn,
- freelist);
- }
-next:
- pfn = level_pfn + level_size(level);
- } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
-
- if (first_pte)
- domain_flush_cache(domain, first_pte,
- (void *)++last_pte - (void *)first_pte);
-}
-
-/* We can't just free the pages because the IOMMU may still be walking
- the page tables, and may have cached the intermediate levels. The
- pages can only be freed after the IOTLB flush has been done. */
-static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
- unsigned long last_pfn, struct list_head *freelist)
-{
- if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
- WARN_ON(start_pfn > last_pfn))
- return;
-
- /* we don't need lock here; nobody else touches the iova range */
- dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
- domain->pgd, 0, start_pfn, last_pfn, freelist);
-
- /* free pgd */
- if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
- struct page *pgd_page = virt_to_page(domain->pgd);
- list_add_tail(&pgd_page->lru, freelist);
- domain->pgd = NULL;
- }
-}
-
/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
struct root_entry *root;
- root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
+ root = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, SZ_4K);
if (!root) {
pr_err("Allocating root entry for %s failed\n",
iommu->name);
@@ -1199,9 +774,8 @@ static void __iommu_flush_context(struct intel_iommu *iommu,
raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
-/* return value determine if we need a write buffer flush */
-static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
- u64 addr, unsigned int size_order, u64 type)
+void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+ unsigned int size_order, u64 type)
{
int tlb_offset = ecap_iotlb_offset(iommu->ecap);
u64 val = 0, val_iva = 0;
@@ -1270,32 +844,6 @@ domain_lookup_dev_info(struct dmar_domain *domain,
return NULL;
}
-void domain_update_iotlb(struct dmar_domain *domain)
-{
- struct dev_pasid_info *dev_pasid;
- struct device_domain_info *info;
- bool has_iotlb_device = false;
- unsigned long flags;
-
- spin_lock_irqsave(&domain->lock, flags);
- list_for_each_entry(info, &domain->devices, link) {
- if (info->ats_enabled) {
- has_iotlb_device = true;
- break;
- }
- }
-
- list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
- info = dev_iommu_priv_get(dev_pasid->dev);
- if (info->ats_enabled) {
- has_iotlb_device = true;
- break;
- }
- }
- domain->has_iotlb_device = has_iotlb_device;
- spin_unlock_irqrestore(&domain->lock, flags);
-}
-
/*
* The extra devTLB flush quirk impacts those QAT devices with PCI device
* IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
@@ -1314,64 +862,59 @@ static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
return true;
}
-static void iommu_enable_pci_caps(struct device_domain_info *info)
+static void iommu_enable_pci_ats(struct device_domain_info *info)
{
struct pci_dev *pdev;
- if (!dev_is_pci(info->dev))
+ if (!info->ats_supported)
return;
pdev = to_pci_dev(info->dev);
+ if (!pci_ats_page_aligned(pdev))
+ return;
- /* The PCIe spec, in its wisdom, declares that the behaviour of
- the device if you enable PASID support after ATS support is
- undefined. So always enable PASID support on devices which
- have it, even if we can't yet know if we're ever going to
- use it. */
- if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
- info->pasid_enabled = 1;
-
- if (info->ats_supported && pci_ats_page_aligned(pdev) &&
- !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
+ if (!pci_enable_ats(pdev, VTD_PAGE_SHIFT))
info->ats_enabled = 1;
- domain_update_iotlb(info->domain);
- }
}
-static void iommu_disable_pci_caps(struct device_domain_info *info)
+static void iommu_disable_pci_ats(struct device_domain_info *info)
+{
+ if (!info->ats_enabled)
+ return;
+
+ pci_disable_ats(to_pci_dev(info->dev));
+ info->ats_enabled = 0;
+}
+
+static void iommu_enable_pci_pri(struct device_domain_info *info)
{
struct pci_dev *pdev;
- if (!dev_is_pci(info->dev))
+ if (!info->ats_enabled || !info->pri_supported)
return;
pdev = to_pci_dev(info->dev);
+ /* PASID is required in PRG Response Message. */
+ if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
+ return;
- if (info->ats_enabled) {
- pci_disable_ats(pdev);
- info->ats_enabled = 0;
- domain_update_iotlb(info->domain);
- }
+ if (pci_reset_pri(pdev))
+ return;
- if (info->pasid_enabled) {
- pci_disable_pasid(pdev);
- info->pasid_enabled = 0;
- }
+ if (!pci_enable_pri(pdev, PRQ_DEPTH))
+ info->pri_enabled = 1;
}
-static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
- u64 addr, unsigned int mask)
+static void iommu_disable_pci_pri(struct device_domain_info *info)
{
- u16 sid, qdep;
-
- if (!info || !info->ats_enabled)
+ if (!info->pri_enabled)
return;
- sid = info->bus << 8 | info->devfn;
- qdep = info->ats_qdep;
- qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
- qdep, addr, mask);
- quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
+ if (WARN_ON(info->iopf_refcount))
+ iopf_queue_remove_device(info->iommu->iopf_queue, info->dev);
+
+ pci_disable_pri(to_pci_dev(info->dev));
+ info->pri_enabled = 0;
}
static void intel_flush_iotlb_all(struct iommu_domain *domain)
@@ -1435,52 +978,13 @@ static void iommu_disable_translation(struct intel_iommu *iommu)
raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
-static int iommu_init_domains(struct intel_iommu *iommu)
-{
- u32 ndomains;
-
- ndomains = cap_ndoms(iommu->cap);
- pr_debug("%s: Number of Domains supported <%d>\n",
- iommu->name, ndomains);
-
- spin_lock_init(&iommu->lock);
-
- iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
- if (!iommu->domain_ids)
- return -ENOMEM;
-
- /*
- * If Caching mode is set, then invalid translations are tagged
- * with domain-id 0, hence we need to pre-allocate it. We also
- * use domain-id 0 as a marker for non-allocated domain-id, so
- * make sure it is not used for a real domain.
- */
- set_bit(0, iommu->domain_ids);
-
- /*
- * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
- * entry for first-level or pass-through translation modes should
- * be programmed with a domain id different from those used for
- * second-level or nested translation. We reserve a domain id for
- * this purpose.
- */
- if (sm_supported(iommu))
- set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
-
- return 0;
-}
-
static void disable_dmar_iommu(struct intel_iommu *iommu)
{
- if (!iommu->domain_ids)
- return;
-
/*
* All iommu domains must have been detached from the devices,
* hence there should be no domain IDs in use.
*/
- if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
- > NUM_RESERVED_DID))
+ if (WARN_ON(!ida_is_empty(&iommu->domain_ida)))
return;
if (iommu->gcmd & DMA_GCMD_TE)
@@ -1489,11 +993,6 @@ static void disable_dmar_iommu(struct intel_iommu *iommu)
static void free_dmar_iommu(struct intel_iommu *iommu)
{
- if (iommu->domain_ids) {
- bitmap_free(iommu->domain_ids);
- iommu->domain_ids = NULL;
- }
-
if (iommu->copied_tables) {
bitmap_free(iommu->copied_tables);
iommu->copied_tables = NULL;
@@ -1502,58 +1001,30 @@ static void free_dmar_iommu(struct intel_iommu *iommu)
/* free context mapping */
free_context_table(iommu);
-#ifdef CONFIG_INTEL_IOMMU_SVM
- if (pasid_supported(iommu)) {
- if (ecap_prs(iommu->ecap))
- intel_svm_finish_prq(iommu);
- }
-#endif
+ if (ecap_prs(iommu->ecap))
+ intel_iommu_finish_prq(iommu);
}
/*
* Check and return whether first level is used by default for
* DMA translation.
*/
-static bool first_level_by_default(unsigned int type)
+static bool first_level_by_default(struct intel_iommu *iommu)
{
/* Only SL is available in legacy mode */
- if (!scalable_mode_support())
+ if (!sm_supported(iommu))
return false;
/* Only level (either FL or SL) is available, just use it */
- if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
- return intel_cap_flts_sanity();
-
- /* Both levels are available, decide it based on domain type */
- return type != IOMMU_DOMAIN_UNMANAGED;
-}
-
-static struct dmar_domain *alloc_domain(unsigned int type)
-{
- struct dmar_domain *domain;
+ if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap))
+ return ecap_flts(iommu->ecap);
- domain = kzalloc(sizeof(*domain), GFP_KERNEL);
- if (!domain)
- return NULL;
-
- domain->nid = NUMA_NO_NODE;
- if (first_level_by_default(type))
- domain->use_first_level = true;
- domain->has_iotlb_device = false;
- INIT_LIST_HEAD(&domain->devices);
- INIT_LIST_HEAD(&domain->dev_pasids);
- INIT_LIST_HEAD(&domain->cache_tags);
- spin_lock_init(&domain->lock);
- spin_lock_init(&domain->cache_lock);
- xa_init(&domain->iommu_array);
-
- return domain;
+ return true;
}
int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
{
struct iommu_domain_info *info, *curr;
- unsigned long ndomains;
int num, ret = -ENOSPC;
if (domain->domain.type == IOMMU_DOMAIN_SVA)
@@ -1563,41 +1034,36 @@ int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
if (!info)
return -ENOMEM;
- spin_lock(&iommu->lock);
+ guard(mutex)(&iommu->did_lock);
curr = xa_load(&domain->iommu_array, iommu->seq_id);
if (curr) {
curr->refcnt++;
- spin_unlock(&iommu->lock);
kfree(info);
return 0;
}
- ndomains = cap_ndoms(iommu->cap);
- num = find_first_zero_bit(iommu->domain_ids, ndomains);
- if (num >= ndomains) {
+ num = ida_alloc_range(&iommu->domain_ida, IDA_START_DID,
+ cap_ndoms(iommu->cap) - 1, GFP_KERNEL);
+ if (num < 0) {
pr_err("%s: No free domain ids\n", iommu->name);
goto err_unlock;
}
- set_bit(num, iommu->domain_ids);
info->refcnt = 1;
info->did = num;
info->iommu = iommu;
curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
- NULL, info, GFP_ATOMIC);
+ NULL, info, GFP_KERNEL);
if (curr) {
ret = xa_err(curr) ? : -EBUSY;
goto err_clear;
}
- domain_update_iommu_cap(domain);
- spin_unlock(&iommu->lock);
return 0;
err_clear:
- clear_bit(info->did, iommu->domain_ids);
+ ida_free(&iommu->domain_ida, info->did);
err_unlock:
- spin_unlock(&iommu->lock);
kfree(info);
return ret;
}
@@ -1609,45 +1075,68 @@ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
if (domain->domain.type == IOMMU_DOMAIN_SVA)
return;
- spin_lock(&iommu->lock);
+ guard(mutex)(&iommu->did_lock);
info = xa_load(&domain->iommu_array, iommu->seq_id);
if (--info->refcnt == 0) {
- clear_bit(info->did, iommu->domain_ids);
+ ida_free(&iommu->domain_ida, info->did);
xa_erase(&domain->iommu_array, iommu->seq_id);
- domain->nid = NUMA_NO_NODE;
- domain_update_iommu_cap(domain);
kfree(info);
}
- spin_unlock(&iommu->lock);
}
-static int guestwidth_to_adjustwidth(int gaw)
+/*
+ * For kdump cases, old valid entries may be cached due to the
+ * in-flight DMA and copied pgtable, but there is no unmapping
+ * behaviour for them, thus we need an explicit cache flush for
+ * the newly-mapped device. For kdump, at this point, the device
+ * is supposed to finish reset at its driver probe stage, so no
+ * in-flight DMA will exist, and we don't need to worry anymore
+ * hereafter.
+ */
+static void copied_context_tear_down(struct intel_iommu *iommu,
+ struct context_entry *context,
+ u8 bus, u8 devfn)
{
- int agaw;
- int r = (gaw - 12) % 9;
+ u16 did_old;
- if (r == 0)
- agaw = gaw;
- else
- agaw = gaw + 9 - r;
- if (agaw > 64)
- agaw = 64;
- return agaw;
-}
+ if (!context_copied(iommu, bus, devfn))
+ return;
-static void domain_exit(struct dmar_domain *domain)
-{
- if (domain->pgd) {
- LIST_HEAD(freelist);
+ assert_spin_locked(&iommu->lock);
+
+ did_old = context_domain_id(context);
+ context_clear_entry(context);
- domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
- iommu_put_pages_list(&freelist);
+ if (did_old < cap_ndoms(iommu->cap)) {
+ iommu->flush.flush_context(iommu, did_old,
+ PCI_DEVID(bus, devfn),
+ DMA_CCMD_MASK_NOBIT,
+ DMA_CCMD_DEVICE_INVL);
+ iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
+ DMA_TLB_DSI_FLUSH);
}
- if (WARN_ON(!list_empty(&domain->devices)))
- return;
+ clear_context_copied(iommu, bus, devfn);
+}
- kfree(domain);
+/*
+ * It's a non-present to present mapping. If hardware doesn't cache
+ * non-present entry we only need to flush the write-buffer. If the
+ * _does_ cache non-present entries, then it does so in the special
+ * domain #0, which we have to flush:
+ */
+static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
+ u8 bus, u8 devfn)
+{
+ if (cap_caching_mode(iommu->cap)) {
+ iommu->flush.flush_context(iommu, 0,
+ PCI_DEVID(bus, devfn),
+ DMA_CCMD_MASK_NOBIT,
+ DMA_CCMD_DEVICE_INVL);
+ iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
+ } else {
+ iommu_flush_write_buffer(iommu);
+ }
}
static int domain_context_mapping_one(struct dmar_domain *domain,
@@ -1658,12 +1147,14 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
domain_lookup_dev_info(domain, iommu, bus, devfn);
u16 did = domain_id_iommu(domain, iommu);
int translation = CONTEXT_TT_MULTI_LEVEL;
- struct dma_pte *pgd = domain->pgd;
+ struct pt_iommu_vtdss_hw_info pt_info;
struct context_entry *context;
- int agaw, ret;
+ int ret;
- if (hw_pass_through && domain_type_is_si(domain))
- translation = CONTEXT_TT_PASS_THROUGH;
+ if (WARN_ON(!intel_domain_is_ss_paging(domain)))
+ return -EINVAL;
+
+ pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info);
pr_debug("Set context mapping for %02x:%02x.%d\n",
bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
@@ -1678,83 +1169,23 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
if (context_present(context) && !context_copied(iommu, bus, devfn))
goto out_unlock;
- /*
- * For kdump cases, old valid entries may be cached due to the
- * in-flight DMA and copied pgtable, but there is no unmapping
- * behaviour for them, thus we need an explicit cache flush for
- * the newly-mapped device. For kdump, at this point, the device
- * is supposed to finish reset at its driver probe stage, so no
- * in-flight DMA will exist, and we don't need to worry anymore
- * hereafter.
- */
- if (context_copied(iommu, bus, devfn)) {
- u16 did_old = context_domain_id(context);
-
- if (did_old < cap_ndoms(iommu->cap)) {
- iommu->flush.flush_context(iommu, did_old,
- (((u16)bus) << 8) | devfn,
- DMA_CCMD_MASK_NOBIT,
- DMA_CCMD_DEVICE_INVL);
- iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
- DMA_TLB_DSI_FLUSH);
- }
-
- clear_context_copied(iommu, bus, devfn);
- }
-
+ copied_context_tear_down(iommu, context, bus, devfn);
context_clear_entry(context);
context_set_domain_id(context, did);
- if (translation != CONTEXT_TT_PASS_THROUGH) {
- /*
- * Skip top levels of page tables for iommu which has
- * less agaw than default. Unnecessary for PT mode.
- */
- for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
- ret = -ENOMEM;
- pgd = phys_to_virt(dma_pte_addr(pgd));
- if (!dma_pte_present(pgd))
- goto out_unlock;
- }
-
- if (info && info->ats_supported)
- translation = CONTEXT_TT_DEV_IOTLB;
- else
- translation = CONTEXT_TT_MULTI_LEVEL;
-
- context_set_address_root(context, virt_to_phys(pgd));
- context_set_address_width(context, agaw);
- } else {
- /*
- * In pass through mode, AW must be programmed to
- * indicate the largest AGAW value supported by
- * hardware. And ASR is ignored by hardware.
- */
- context_set_address_width(context, iommu->msagaw);
- }
+ if (info && info->ats_supported)
+ translation = CONTEXT_TT_DEV_IOTLB;
+ else
+ translation = CONTEXT_TT_MULTI_LEVEL;
+ context_set_address_root(context, pt_info.ssptptr);
+ context_set_address_width(context, pt_info.aw);
context_set_translation_type(context, translation);
context_set_fault_enable(context);
context_set_present(context);
if (!ecap_coherent(iommu->ecap))
clflush_cache_range(context, sizeof(*context));
-
- /*
- * It's a non-present to present mapping. If hardware doesn't cache
- * non-present entry we only need to flush the write-buffer. If the
- * _does_ cache non-present entries, then it does so in the special
- * domain #0, which we have to flush:
- */
- if (cap_caching_mode(iommu->cap)) {
- iommu->flush.flush_context(iommu, 0,
- (((u16)bus) << 8) | devfn,
- DMA_CCMD_MASK_NOBIT,
- DMA_CCMD_DEVICE_INVL);
- iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
- } else {
- iommu_flush_write_buffer(iommu);
- }
-
+ context_present_cache_flush(iommu, did, bus, devfn);
ret = 0;
out_unlock:
@@ -1780,177 +1211,17 @@ domain_context_mapping(struct dmar_domain *domain, struct device *dev)
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct intel_iommu *iommu = info->iommu;
u8 bus = info->bus, devfn = info->devfn;
+ int ret;
if (!dev_is_pci(dev))
return domain_context_mapping_one(domain, iommu, bus, devfn);
- return pci_for_each_dma_alias(to_pci_dev(dev),
- domain_context_mapping_cb, domain);
-}
-
-/* Return largest possible superpage level for a given mapping */
-static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
- unsigned long phy_pfn, unsigned long pages)
-{
- int support, level = 1;
- unsigned long pfnmerge;
-
- support = domain->iommu_superpage;
-
- /* To use a large page, the virtual *and* physical addresses
- must be aligned to 2MiB/1GiB/etc. Lower bits set in either
- of them will mean we have to use smaller pages. So just
- merge them and check both at once. */
- pfnmerge = iov_pfn | phy_pfn;
-
- while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
- pages >>= VTD_STRIDE_SHIFT;
- if (!pages)
- break;
- pfnmerge >>= VTD_STRIDE_SHIFT;
- level++;
- support--;
- }
- return level;
-}
-
-/*
- * Ensure that old small page tables are removed to make room for superpage(s).
- * We're going to add new large pages, so make sure we don't remove their parent
- * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
- */
-static void switch_to_super_page(struct dmar_domain *domain,
- unsigned long start_pfn,
- unsigned long end_pfn, int level)
-{
- unsigned long lvl_pages = lvl_to_nr_pages(level);
- struct dma_pte *pte = NULL;
-
- while (start_pfn <= end_pfn) {
- if (!pte)
- pte = pfn_to_dma_pte(domain, start_pfn, &level,
- GFP_ATOMIC);
-
- if (dma_pte_present(pte)) {
- dma_pte_free_pagetable(domain, start_pfn,
- start_pfn + lvl_pages - 1,
- level + 1);
-
- cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
- end_pfn << VTD_PAGE_SHIFT, 0);
- }
-
- pte++;
- start_pfn += lvl_pages;
- if (first_pte_in_page(pte))
- pte = NULL;
- }
-}
-
-static int
-__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
- unsigned long phys_pfn, unsigned long nr_pages, int prot,
- gfp_t gfp)
-{
- struct dma_pte *first_pte = NULL, *pte = NULL;
- unsigned int largepage_lvl = 0;
- unsigned long lvl_pages = 0;
- phys_addr_t pteval;
- u64 attr;
-
- if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
- return -EINVAL;
-
- if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
- return -EINVAL;
-
- if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
- pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
- return -EINVAL;
- }
-
- attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
- attr |= DMA_FL_PTE_PRESENT;
- if (domain->use_first_level) {
- attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
- if (prot & DMA_PTE_WRITE)
- attr |= DMA_FL_PTE_DIRTY;
- }
-
- domain->has_mappings = true;
-
- pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
-
- while (nr_pages > 0) {
- uint64_t tmp;
-
- if (!pte) {
- largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
- phys_pfn, nr_pages);
-
- pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
- gfp);
- if (!pte)
- return -ENOMEM;
- first_pte = pte;
-
- lvl_pages = lvl_to_nr_pages(largepage_lvl);
-
- /* It is large page*/
- if (largepage_lvl > 1) {
- unsigned long end_pfn;
- unsigned long pages_to_remove;
-
- pteval |= DMA_PTE_LARGE_PAGE;
- pages_to_remove = min_t(unsigned long, nr_pages,
- nr_pte_to_next_page(pte) * lvl_pages);
- end_pfn = iov_pfn + pages_to_remove - 1;
- switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
- } else {
- pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
- }
-
- }
- /* We don't need lock here, nobody else
- * touches the iova range
- */
- tmp = 0ULL;
- if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
- static int dumps = 5;
- pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
- iov_pfn, tmp, (unsigned long long)pteval);
- if (dumps) {
- dumps--;
- debug_dma_dump_mappings(NULL);
- }
- WARN_ON(1);
- }
+ ret = pci_for_each_dma_alias(to_pci_dev(dev),
+ domain_context_mapping_cb, domain);
+ if (ret)
+ return ret;
- nr_pages -= lvl_pages;
- iov_pfn += lvl_pages;
- phys_pfn += lvl_pages;
- pteval += lvl_pages * VTD_PAGE_SIZE;
-
- /* If the next PTE would be the first in a new page, then we
- * need to flush the cache on the entries we've just written.
- * And then we'll need to recalculate 'pte', so clear it and
- * let it get set again in the if (!pte) block above.
- *
- * If we're done (!nr_pages) we need to flush the cache too.
- *
- * Also if we've been setting superpages, we may need to
- * recalculate 'pte' and switch back to smaller pages for the
- * end of the mapping, if the trailing size is not enough to
- * use another superpage (i.e. nr_pages < lvl_pages).
- */
- pte++;
- if (!nr_pages || first_pte_in_page(pte) ||
- (largepage_lvl > 1 && nr_pages < lvl_pages)) {
- domain_flush_cache(domain, first_pte,
- (void *)pte - (void *)first_pte);
- pte = NULL;
- }
- }
+ iommu_enable_pci_ats(info);
return 0;
}
@@ -1959,7 +1230,7 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8
{
struct intel_iommu *iommu = info->iommu;
struct context_entry *context;
- u16 did_old;
+ u16 did;
spin_lock(&iommu->lock);
context = iommu_context_addr(iommu, bus, devfn, 0);
@@ -1968,138 +1239,74 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8
return;
}
- did_old = context_domain_id(context);
-
+ did = context_domain_id(context);
context_clear_entry(context);
__iommu_flush_cache(iommu, context, sizeof(*context));
spin_unlock(&iommu->lock);
- iommu->flush.flush_context(iommu,
- did_old,
- (((u16)bus) << 8) | devfn,
- DMA_CCMD_MASK_NOBIT,
- DMA_CCMD_DEVICE_INVL);
-
- iommu->flush.flush_iotlb(iommu,
- did_old,
- 0,
- 0,
- DMA_TLB_DSI_FLUSH);
-
- __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
+ intel_context_flush_no_pasid(info, context, did);
}
-static int domain_setup_first_level(struct intel_iommu *iommu,
- struct dmar_domain *domain,
- struct device *dev,
- u32 pasid)
+int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev,
+ ioasid_t pasid, u16 did, phys_addr_t fsptptr,
+ int flags, struct iommu_domain *old)
{
- struct dma_pte *pgd = domain->pgd;
- int agaw, level;
- int flags = 0;
-
- /*
- * Skip top levels of page tables for iommu which has
- * less agaw than default. Unnecessary for PT mode.
- */
- for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
- pgd = phys_to_virt(dma_pte_addr(pgd));
- if (!dma_pte_present(pgd))
- return -ENOMEM;
- }
-
- level = agaw_to_level(agaw);
- if (level != 4 && level != 5)
- return -EINVAL;
-
- if (level == 5)
- flags |= PASID_FLAG_FL5LP;
-
- if (domain->force_snooping)
- flags |= PASID_FLAG_PAGE_SNOOP;
-
- return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
- domain_id_iommu(domain, iommu),
- flags);
+ if (!old)
+ return intel_pasid_setup_first_level(iommu, dev, fsptptr, pasid,
+ did, flags);
+ return intel_pasid_replace_first_level(iommu, dev, fsptptr, pasid, did,
+ iommu_domain_did(old, iommu),
+ flags);
}
-static bool dev_is_real_dma_subdevice(struct device *dev)
+static int domain_setup_second_level(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old)
{
- return dev && dev_is_pci(dev) &&
- pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
+ if (!old)
+ return intel_pasid_setup_second_level(iommu, domain,
+ dev, pasid);
+ return intel_pasid_replace_second_level(iommu, domain, dev,
+ iommu_domain_did(old, iommu),
+ pasid);
}
-static int iommu_domain_identity_map(struct dmar_domain *domain,
- unsigned long first_vpfn,
- unsigned long last_vpfn)
+static int domain_setup_passthrough(struct intel_iommu *iommu,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old)
{
- /*
- * RMRR range might have overlap with physical memory range,
- * clear it first
- */
- dma_pte_clear_range(domain, first_vpfn, last_vpfn);
-
- return __domain_mapping(domain, first_vpfn,
- first_vpfn, last_vpfn - first_vpfn + 1,
- DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
+ if (!old)
+ return intel_pasid_setup_pass_through(iommu, dev, pasid);
+ return intel_pasid_replace_pass_through(iommu, dev,
+ iommu_domain_did(old, iommu),
+ pasid);
}
-static int md_domain_init(struct dmar_domain *domain, int guest_width);
-
-static int __init si_domain_init(int hw)
+static int domain_setup_first_level(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev,
+ u32 pasid, struct iommu_domain *old)
{
- struct dmar_rmrr_unit *rmrr;
- struct device *dev;
- int i, nid, ret;
-
- si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
- if (!si_domain)
- return -EFAULT;
+ struct pt_iommu_x86_64_hw_info pt_info;
+ unsigned int flags = 0;
- if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
- domain_exit(si_domain);
- si_domain = NULL;
- return -EFAULT;
- }
-
- if (hw)
- return 0;
-
- for_each_online_node(nid) {
- unsigned long start_pfn, end_pfn;
- int i;
-
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
- ret = iommu_domain_identity_map(si_domain,
- mm_to_dma_pfn_start(start_pfn),
- mm_to_dma_pfn_end(end_pfn));
- if (ret)
- return ret;
- }
- }
+ pt_iommu_x86_64_hw_info(&domain->fspt, &pt_info);
+ if (WARN_ON(pt_info.levels != 4 && pt_info.levels != 5))
+ return -EINVAL;
- /*
- * Identity map the RMRRs so that devices with RMRRs could also use
- * the si_domain.
- */
- for_each_rmrr_units(rmrr) {
- for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
- i, dev) {
- unsigned long long start = rmrr->base_address;
- unsigned long long end = rmrr->end_address;
+ if (pt_info.levels == 5)
+ flags |= PASID_FLAG_FL5LP;
- if (WARN_ON(end < start ||
- end >> agaw_to_width(si_domain->agaw)))
- continue;
+ if (domain->force_snooping)
+ flags |= PASID_FLAG_PAGE_SNOOP;
- ret = iommu_domain_identity_map(si_domain,
- mm_to_dma_pfn_start(start >> PAGE_SHIFT),
- mm_to_dma_pfn_end(end >> PAGE_SHIFT));
- if (ret)
- return ret;
- }
- }
+ if (!(domain->fspt.x86_64_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)))
+ flags |= PASID_FLAG_PWSNP;
- return 0;
+ return __domain_setup_first_level(iommu, dev, pasid,
+ domain_id_iommu(domain, iommu),
+ pt_info.gcr3_pt, flags, old);
}
static int dmar_domain_attach_device(struct dmar_domain *domain,
@@ -2115,6 +1322,7 @@ static int dmar_domain_attach_device(struct dmar_domain *domain,
return ret;
info->domain = domain;
+ info->domain_attached = true;
spin_lock_irqsave(&domain->lock, flags);
list_add(&info->link, &domain->devices);
spin_unlock_irqrestore(&domain->lock, flags);
@@ -2124,19 +1332,18 @@ static int dmar_domain_attach_device(struct dmar_domain *domain,
if (!sm_supported(iommu))
ret = domain_context_mapping(domain, dev);
- else if (hw_pass_through && domain_type_is_si(domain))
- ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
- else if (domain->use_first_level)
- ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID);
- else
- ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID);
+ else if (intel_domain_is_fs_paging(domain))
+ ret = domain_setup_first_level(iommu, domain, dev,
+ IOMMU_NO_PASID, NULL);
+ else if (intel_domain_is_ss_paging(domain))
+ ret = domain_setup_second_level(iommu, domain, dev,
+ IOMMU_NO_PASID, NULL);
+ else if (WARN_ON(true))
+ ret = -EINVAL;
if (ret)
goto out_block_translation;
- if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
- iommu_enable_pci_caps(info);
-
ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
if (ret)
goto out_block_translation;
@@ -2177,19 +1384,18 @@ static bool device_rmrr_is_relaxable(struct device *dev)
return false;
}
-/*
- * Return the required default domain type for a specific device.
- *
- * @dev: the device in query
- * @startup: true if this is during early boot
- *
- * Returns:
- * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
- * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
- * - 0: both identity and dynamic domains work for this device
- */
static int device_def_domain_type(struct device *dev)
{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct intel_iommu *iommu = info->iommu;
+
+ /*
+ * Hardware does not support the passthrough translation mode.
+ * Always use a dynamaic mapping domain.
+ */
+ if (!ecap_pass_through(iommu->ecap))
+ return IOMMU_DOMAIN_DMA;
+
if (dev_is_pci(dev)) {
struct pci_dev *pdev = to_pci_dev(dev);
@@ -2287,7 +1493,8 @@ static int copy_context_table(struct intel_iommu *iommu,
if (!old_ce)
goto out;
- new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
+ new_ce = iommu_alloc_pages_node_sz(iommu->node,
+ GFP_KERNEL, SZ_4K);
if (!new_ce)
goto out_unmap;
@@ -2302,7 +1509,7 @@ static int copy_context_table(struct intel_iommu *iommu,
did = context_domain_id(&ce);
if (did >= 0 && did < cap_ndoms(iommu->cap))
- set_bit(did, iommu->domain_ids);
+ ida_alloc_range(&iommu->domain_ida, did, did, GFP_KERNEL);
set_context_copied(iommu, bus, devfn);
new_ce[idx] = ce;
@@ -2410,10 +1617,6 @@ static int __init init_dmars(void)
struct intel_iommu *iommu;
int ret;
- ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
- if (ret)
- goto free_iommu;
-
for_each_iommu(iommu, drhd) {
if (drhd->ignored) {
iommu_disable_translation(iommu);
@@ -2433,11 +1636,6 @@ static int __init init_dmars(void)
}
intel_iommu_init_qi(iommu);
-
- ret = iommu_init_domains(iommu);
- if (ret)
- goto free_iommu;
-
init_translation_status(iommu);
if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
@@ -2480,8 +1678,6 @@ static int __init init_dmars(void)
}
}
- if (!ecap_pass_through(iommu->ecap))
- hw_pass_through = 0;
intel_svm_check(iommu);
}
@@ -2497,10 +1693,6 @@ static int __init init_dmars(void)
check_tylersburg_isoch();
- ret = si_domain_init(hw_pass_through);
- if (ret)
- goto free_iommu;
-
/*
* for each drhd
* enable fault log
@@ -2521,19 +1713,18 @@ static int __init init_dmars(void)
iommu_flush_write_buffer(iommu);
-#ifdef CONFIG_INTEL_IOMMU_SVM
- if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
+ if (ecap_prs(iommu->ecap)) {
/*
* Call dmar_alloc_hwirq() with dmar_global_lock held,
* could cause possible lock race condition.
*/
up_write(&dmar_global_lock);
- ret = intel_svm_enable_prq(iommu);
+ ret = intel_iommu_enable_prq(iommu);
down_write(&dmar_global_lock);
if (ret)
goto free_iommu;
}
-#endif
+
ret = dmar_set_interrupt(iommu);
if (ret)
goto free_iommu;
@@ -2546,10 +1737,6 @@ free_iommu:
disable_dmar_iommu(iommu);
free_dmar_iommu(iommu);
}
- if (si_domain) {
- domain_exit(si_domain);
- si_domain = NULL;
- }
return ret;
}
@@ -2638,7 +1825,7 @@ static void iommu_flush_all(void)
}
}
-static int iommu_suspend(void)
+static int iommu_suspend(void *data)
{
struct dmar_drhd_unit *drhd;
struct intel_iommu *iommu = NULL;
@@ -2665,7 +1852,7 @@ static int iommu_suspend(void)
return 0;
}
-static void iommu_resume(void)
+static void iommu_resume(void *data)
{
struct dmar_drhd_unit *drhd;
struct intel_iommu *iommu = NULL;
@@ -2696,14 +1883,18 @@ static void iommu_resume(void)
}
}
-static struct syscore_ops iommu_syscore_ops = {
+static const struct syscore_ops iommu_syscore_ops = {
.resume = iommu_resume,
.suspend = iommu_suspend,
};
+static struct syscore iommu_syscore = {
+ .ops = &iommu_syscore_ops,
+};
+
static void __init init_iommu_pm_ops(void)
{
- register_syscore_ops(&iommu_syscore_ops);
+ register_syscore(&iommu_syscore);
}
#else
@@ -2917,25 +2108,8 @@ int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
- int sp, ret;
struct intel_iommu *iommu = dmaru->iommu;
-
- ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
- if (ret)
- goto out;
-
- if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
- pr_warn("%s: Doesn't support hardware pass through.\n",
- iommu->name);
- return -ENXIO;
- }
-
- sp = domain_update_iommu_superpage(NULL, iommu) - 1;
- if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
- pr_warn("%s: Doesn't support large page.\n",
- iommu->name);
- return -ENXIO;
- }
+ int ret;
/*
* Disable translation if already enabled prior to OS handover.
@@ -2943,9 +2117,7 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
if (iommu->gcmd & DMA_GCMD_TE)
iommu_disable_translation(iommu);
- ret = iommu_init_domains(iommu);
- if (ret == 0)
- ret = iommu_alloc_root_entry(iommu);
+ ret = iommu_alloc_root_entry(iommu);
if (ret)
goto out;
@@ -2963,13 +2135,12 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
intel_iommu_init_qi(iommu);
iommu_flush_write_buffer(iommu);
-#ifdef CONFIG_INTEL_IOMMU_SVM
- if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
- ret = intel_svm_enable_prq(iommu);
+ if (ecap_prs(iommu->ecap)) {
+ ret = intel_iommu_enable_prq(iommu);
if (ret)
goto disable_iommu;
}
-#endif
+
ret = dmar_set_interrupt(iommu);
if (ret)
goto disable_iommu;
@@ -3037,7 +2208,6 @@ static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
struct device *tmp;
int i;
- dev = pci_physfn(dev);
rcu_read_lock();
list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
@@ -3054,15 +2224,16 @@ out:
return satcu;
}
-static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
+static bool dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
{
- int i, ret = 1;
- struct pci_bus *bus;
struct pci_dev *bridge = NULL;
- struct device *tmp;
- struct acpi_dmar_atsr *atsr;
struct dmar_atsr_unit *atsru;
struct dmar_satc_unit *satcu;
+ struct acpi_dmar_atsr *atsr;
+ bool supported = true;
+ struct pci_bus *bus;
+ struct device *tmp;
+ int i;
dev = pci_physfn(dev);
satcu = dmar_find_matched_satc_unit(dev);
@@ -3080,11 +2251,11 @@ static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
bridge = bus->self;
/* If it's an integrated device, allow ATS */
if (!bridge)
- return 1;
+ return true;
/* Connected via non-PCIe: no ATS */
if (!pci_is_pcie(bridge) ||
pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
- return 0;
+ return false;
/* If we found the root port, look it up in the ATSR */
if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
break;
@@ -3103,11 +2274,11 @@ static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
if (atsru->include_all)
goto out;
}
- ret = 0;
+ supported = false;
out:
rcu_read_unlock();
- return ret;
+ return supported;
}
int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
@@ -3180,43 +2351,6 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
return 0;
}
-static int intel_iommu_memory_notifier(struct notifier_block *nb,
- unsigned long val, void *v)
-{
- struct memory_notify *mhp = v;
- unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
- unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
- mhp->nr_pages - 1);
-
- switch (val) {
- case MEM_GOING_ONLINE:
- if (iommu_domain_identity_map(si_domain,
- start_vpfn, last_vpfn)) {
- pr_warn("Failed to build identity map for [%lx-%lx]\n",
- start_vpfn, last_vpfn);
- return NOTIFY_BAD;
- }
- break;
-
- case MEM_OFFLINE:
- case MEM_CANCEL_ONLINE:
- {
- LIST_HEAD(freelist);
-
- domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
- iommu_put_pages_list(&freelist);
- }
- break;
- }
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block intel_iommu_memory_nb = {
- .notifier_call = intel_iommu_memory_notifier,
- .priority = 0
-};
-
static void intel_disable_iommus(void)
{
struct intel_iommu *iommu = NULL;
@@ -3234,16 +2368,19 @@ void intel_iommu_shutdown(void)
if (no_iommu || dmar_disabled)
return;
- down_write(&dmar_global_lock);
+ /*
+ * All other CPUs were brought down, hotplug interrupts were disabled,
+ * no lock and RCU checking needed anymore
+ */
+ list_for_each_entry(drhd, &dmar_drhd_units, list) {
+ iommu = drhd->iommu;
- /* Disable PMRs explicitly here. */
- for_each_iommu(iommu, drhd)
+ /* Disable PMRs explicitly here. */
iommu_disable_protect_mem_regions(iommu);
- /* Make sure the IOMMUs are switched off */
- intel_disable_iommus();
-
- up_write(&dmar_global_lock);
+ /* Make sure the IOMMUs are switched off */
+ iommu_disable_translation(iommu);
+ }
}
static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
@@ -3299,9 +2436,14 @@ static ssize_t domains_used_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct intel_iommu *iommu = dev_to_intel_iommu(dev);
- return sysfs_emit(buf, "%d\n",
- bitmap_weight(iommu->domain_ids,
- cap_ndoms(iommu->cap)));
+ unsigned int count = 0;
+ int id;
+
+ for (id = 0; id < cap_ndoms(iommu->cap); id++)
+ if (ida_exists(&iommu->domain_ida, id))
+ count++;
+
+ return sysfs_emit(buf, "%d\n", count);
}
static DEVICE_ATTR_RO(domains_used);
@@ -3376,6 +2518,7 @@ static int __init probe_acpi_namespace_devices(void)
if (dev->bus != &acpi_bus_type)
continue;
+ up_read(&dmar_global_lock);
adev = to_acpi_device(dev);
mutex_lock(&adev->physical_node_lock);
list_for_each_entry(pn,
@@ -3385,6 +2528,7 @@ static int __init probe_acpi_namespace_devices(void)
break;
}
mutex_unlock(&adev->physical_node_lock);
+ down_read(&dmar_global_lock);
if (ret)
return ret;
@@ -3502,23 +2646,25 @@ int __init intel_iommu_init(void)
* the virtual and physical IOMMU page-tables.
*/
if (cap_caching_mode(iommu->cap) &&
- !first_level_by_default(IOMMU_DOMAIN_DMA)) {
+ !first_level_by_default(iommu)) {
pr_info_once("IOMMU batching disallowed due to virtualization\n");
iommu_set_dma_strict();
}
iommu_device_sysfs_add(&iommu->iommu, NULL,
intel_iommu_groups,
"%s", iommu->name);
+ /*
+ * The iommu device probe is protected by the iommu_probe_device_lock.
+ * Release the dmar_global_lock before entering the device probe path
+ * to avoid unnecessary lock order splat.
+ */
+ up_read(&dmar_global_lock);
iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
+ down_read(&dmar_global_lock);
iommu_pmu_register(iommu);
}
- up_read(&dmar_global_lock);
-
- if (si_domain && !hw_pass_through)
- register_memory_notifier(&intel_iommu_memory_nb);
- down_read(&dmar_global_lock);
if (probe_acpi_namespace_devices())
pr_warn("ACPI name space devices didn't probe correctly\n");
@@ -3559,11 +2705,14 @@ static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *op
*/
static void domain_context_clear(struct device_domain_info *info)
{
- if (!dev_is_pci(info->dev))
+ if (!dev_is_pci(info->dev)) {
domain_context_clear_one(info, info->bus, info->devfn);
+ return;
+ }
pci_for_each_dma_alias(to_pci_dev(info->dev),
&domain_context_clear_one_cb, info);
+ iommu_disable_pci_ats(info);
}
/*
@@ -3577,7 +2726,13 @@ void device_block_translation(struct device *dev)
struct intel_iommu *iommu = info->iommu;
unsigned long flags;
- iommu_disable_pci_caps(info);
+ /* Device in DMA blocking state. Noting to do. */
+ if (!info->domain_attached)
+ return;
+
+ if (info->domain)
+ cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
+
if (!dev_is_real_dma_subdevice(dev)) {
if (sm_supported(iommu))
intel_pasid_tear_down_entry(iommu, dev,
@@ -3586,6 +2741,9 @@ void device_block_translation(struct device *dev)
domain_context_clear(info);
}
+ /* Device now in DMA blocking state. */
+ info->domain_attached = false;
+
if (!info->domain)
return;
@@ -3593,335 +2751,410 @@ void device_block_translation(struct device *dev)
list_del(&info->link);
spin_unlock_irqrestore(&info->domain->lock, flags);
- cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
domain_detach_iommu(info->domain, iommu);
info->domain = NULL;
}
-static int md_domain_init(struct dmar_domain *domain, int guest_width)
-{
- int adjust_width;
-
- /* calculate AGAW */
- domain->gaw = guest_width;
- adjust_width = guestwidth_to_adjustwidth(guest_width);
- domain->agaw = width_to_agaw(adjust_width);
-
- domain->iommu_coherency = false;
- domain->iommu_superpage = 0;
- domain->max_addr = 0;
-
- /* always allocate the top pgd */
- domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC);
- if (!domain->pgd)
- return -ENOMEM;
- domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
- return 0;
-}
-
static int blocking_domain_attach_dev(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+
+ iopf_for_domain_remove(info->domain ? &info->domain->domain : NULL, dev);
device_block_translation(dev);
return 0;
}
+static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old);
+
static struct iommu_domain blocking_domain = {
.type = IOMMU_DOMAIN_BLOCKED,
.ops = &(const struct iommu_domain_ops) {
.attach_dev = blocking_domain_attach_dev,
+ .set_dev_pasid = blocking_domain_set_dev_pasid,
}
};
-static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
+static struct dmar_domain *paging_domain_alloc(void)
{
- struct dmar_domain *dmar_domain;
- struct iommu_domain *domain;
+ struct dmar_domain *domain;
- switch (type) {
- case IOMMU_DOMAIN_DMA:
- case IOMMU_DOMAIN_UNMANAGED:
- dmar_domain = alloc_domain(type);
- if (!dmar_domain) {
- pr_err("Can't allocate dmar_domain\n");
- return NULL;
- }
- if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
- pr_err("Domain initialization failed\n");
- domain_exit(dmar_domain);
- return NULL;
- }
+ domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
- domain = &dmar_domain->domain;
- domain->geometry.aperture_start = 0;
- domain->geometry.aperture_end =
- __DOMAIN_MAX_ADDR(dmar_domain->gaw);
- domain->geometry.force_aperture = true;
+ INIT_LIST_HEAD(&domain->devices);
+ INIT_LIST_HEAD(&domain->dev_pasids);
+ INIT_LIST_HEAD(&domain->cache_tags);
+ spin_lock_init(&domain->lock);
+ spin_lock_init(&domain->cache_lock);
+ xa_init(&domain->iommu_array);
+ INIT_LIST_HEAD(&domain->s1_domains);
+ spin_lock_init(&domain->s1_lock);
- return domain;
- case IOMMU_DOMAIN_IDENTITY:
- return &si_domain->domain;
- default:
- return NULL;
+ return domain;
+}
+
+static unsigned int compute_vasz_lg2_fs(struct intel_iommu *iommu,
+ unsigned int *top_level)
+{
+ unsigned int mgaw = cap_mgaw(iommu->cap);
+
+ /*
+ * Spec 3.6 First-Stage Translation:
+ *
+ * Software must limit addresses to less than the minimum of MGAW
+ * and the lower canonical address width implied by FSPM (i.e.,
+ * 47-bit when FSPM is 4-level and 56-bit when FSPM is 5-level).
+ */
+ if (mgaw > 48 && cap_fl5lp_support(iommu->cap)) {
+ *top_level = 4;
+ return min(57, mgaw);
}
- return NULL;
+ /* Four level is always supported */
+ *top_level = 3;
+ return min(48, mgaw);
}
static struct iommu_domain *
-intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
- struct iommu_domain *parent,
- const struct iommu_user_data *user_data)
+intel_iommu_domain_alloc_first_stage(struct device *dev,
+ struct intel_iommu *iommu, u32 flags)
{
- struct device_domain_info *info = dev_iommu_priv_get(dev);
- bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
- bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
- struct intel_iommu *iommu = info->iommu;
+ struct pt_iommu_x86_64_cfg cfg = {};
struct dmar_domain *dmar_domain;
- struct iommu_domain *domain;
-
- /* Must be NESTING domain */
- if (parent) {
- if (!nested_supported(iommu) || flags)
- return ERR_PTR(-EOPNOTSUPP);
- return intel_nested_domain_alloc(parent, user_data);
- }
+ int ret;
- if (flags &
- (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
- return ERR_PTR(-EOPNOTSUPP);
- if (nested_parent && !nested_supported(iommu))
+ if (flags & ~IOMMU_HWPT_ALLOC_PASID)
return ERR_PTR(-EOPNOTSUPP);
- if (user_data || (dirty_tracking && !ssads_supported(iommu)))
+
+ /* Only SL is available in legacy mode */
+ if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
return ERR_PTR(-EOPNOTSUPP);
+ dmar_domain = paging_domain_alloc();
+ if (IS_ERR(dmar_domain))
+ return ERR_CAST(dmar_domain);
+
+ cfg.common.hw_max_vasz_lg2 =
+ compute_vasz_lg2_fs(iommu, &cfg.top_level);
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) |
+ BIT(PT_FEAT_FLUSH_RANGE);
+ /* First stage always uses scalable mode */
+ if (!ecap_smpwc(iommu->ecap))
+ cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT);
+ dmar_domain->iommu.iommu_device = dev;
+ dmar_domain->iommu.nid = dev_to_node(dev);
+ dmar_domain->domain.ops = &intel_fs_paging_domain_ops;
/*
- * domain_alloc_user op needs to fully initialize a domain before
- * return, so uses iommu_domain_alloc() here for simple.
+ * iotlb sync for map is only needed for legacy implementations that
+ * explicitly require flushing internal write buffers to ensure memory
+ * coherence.
*/
- domain = iommu_domain_alloc(dev->bus);
- if (!domain)
- return ERR_PTR(-ENOMEM);
+ if (rwbf_required(iommu))
+ dmar_domain->iotlb_sync_map = true;
- dmar_domain = to_dmar_domain(domain);
-
- if (nested_parent) {
- dmar_domain->nested_parent = true;
- INIT_LIST_HEAD(&dmar_domain->s1_domains);
- spin_lock_init(&dmar_domain->s1_lock);
+ ret = pt_iommu_x86_64_init(&dmar_domain->fspt, &cfg, GFP_KERNEL);
+ if (ret) {
+ kfree(dmar_domain);
+ return ERR_PTR(ret);
}
- if (dirty_tracking) {
- if (dmar_domain->use_first_level) {
- iommu_domain_free(domain);
- return ERR_PTR(-EOPNOTSUPP);
- }
- domain->dirty_ops = &intel_dirty_ops;
- }
+ if (!cap_fl1gp_support(iommu->cap))
+ dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G;
+ if (!intel_iommu_superpage)
+ dmar_domain->domain.pgsize_bitmap = SZ_4K;
- return domain;
+ return &dmar_domain->domain;
}
-static void intel_iommu_domain_free(struct iommu_domain *domain)
+static unsigned int compute_vasz_lg2_ss(struct intel_iommu *iommu,
+ unsigned int *top_level)
{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ unsigned int sagaw = cap_sagaw(iommu->cap);
+ unsigned int mgaw = cap_mgaw(iommu->cap);
- WARN_ON(dmar_domain->nested_parent &&
- !list_empty(&dmar_domain->s1_domains));
- if (domain != &si_domain->domain)
- domain_exit(dmar_domain);
+ /*
+ * Find the largest table size that both the mgaw and sagaw support.
+ * This sets the valid range of IOVA and the top starting level.
+ * Some HW may only support a 4 or 5 level walk but must limit IOVA to
+ * 3 levels.
+ */
+ if (mgaw > 48 && sagaw >= BIT(3)) {
+ *top_level = 4;
+ return min(57, mgaw);
+ } else if (mgaw > 39 && sagaw >= BIT(2)) {
+ *top_level = 3 + ffs(sagaw >> 3);
+ return min(48, mgaw);
+ } else if (mgaw > 30 && sagaw >= BIT(1)) {
+ *top_level = 2 + ffs(sagaw >> 2);
+ return min(39, mgaw);
+ }
+ return 0;
}
-int prepare_domain_attach_device(struct iommu_domain *domain,
- struct device *dev)
+static const struct iommu_dirty_ops intel_second_stage_dirty_ops = {
+ IOMMU_PT_DIRTY_OPS(vtdss),
+ .set_dirty_tracking = intel_iommu_set_dirty_tracking,
+};
+
+static struct iommu_domain *
+intel_iommu_domain_alloc_second_stage(struct device *dev,
+ struct intel_iommu *iommu, u32 flags)
{
- struct device_domain_info *info = dev_iommu_priv_get(dev);
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- struct intel_iommu *iommu = info->iommu;
- int addr_width;
+ struct pt_iommu_vtdss_cfg cfg = {};
+ struct dmar_domain *dmar_domain;
+ unsigned int sslps;
+ int ret;
- if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
- return -EINVAL;
+ if (flags &
+ (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
+ IOMMU_HWPT_ALLOC_PASID)))
+ return ERR_PTR(-EOPNOTSUPP);
- if (domain->dirty_ops && !ssads_supported(iommu))
- return -EINVAL;
+ if (((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) &&
+ !nested_supported(iommu)) ||
+ ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) &&
+ !ssads_supported(iommu)))
+ return ERR_PTR(-EOPNOTSUPP);
- /* check if this iommu agaw is sufficient for max mapped address */
- addr_width = agaw_to_width(iommu->agaw);
- if (addr_width > cap_mgaw(iommu->cap))
- addr_width = cap_mgaw(iommu->cap);
+ /* Legacy mode always supports second stage */
+ if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
+ return ERR_PTR(-EOPNOTSUPP);
- if (dmar_domain->max_addr > (1LL << addr_width))
- return -EINVAL;
- dmar_domain->gaw = addr_width;
+ dmar_domain = paging_domain_alloc();
+ if (IS_ERR(dmar_domain))
+ return ERR_CAST(dmar_domain);
+
+ cfg.common.hw_max_vasz_lg2 = compute_vasz_lg2_ss(iommu, &cfg.top_level);
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.common.features = BIT(PT_FEAT_FLUSH_RANGE);
/*
- * Knock out extra levels of page tables if necessary
+ * Read-only mapping is disallowed on the domain which serves as the
+ * parent in a nested configuration, due to HW errata
+ * (ERRATA_772415_SPR17)
*/
- while (iommu->agaw < dmar_domain->agaw) {
- struct dma_pte *pte;
+ if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT)
+ cfg.common.features |= BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE);
- pte = dmar_domain->pgd;
- if (dma_pte_present(pte)) {
- dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
- iommu_free_page(pte);
- }
- dmar_domain->agaw--;
+ if (!iommu_paging_structure_coherency(iommu))
+ cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT);
+ dmar_domain->iommu.iommu_device = dev;
+ dmar_domain->iommu.nid = dev_to_node(dev);
+ dmar_domain->domain.ops = &intel_ss_paging_domain_ops;
+ dmar_domain->nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
+
+ if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
+ dmar_domain->domain.dirty_ops = &intel_second_stage_dirty_ops;
+
+ ret = pt_iommu_vtdss_init(&dmar_domain->sspt, &cfg, GFP_KERNEL);
+ if (ret) {
+ kfree(dmar_domain);
+ return ERR_PTR(ret);
}
- if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
- context_copied(iommu, info->bus, info->devfn))
- return intel_pasid_setup_sm_context(dev);
+ /* Adjust the supported page sizes to HW capability */
+ sslps = cap_super_page_val(iommu->cap);
+ if (!(sslps & BIT(0)))
+ dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_2M;
+ if (!(sslps & BIT(1)))
+ dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G;
+ if (!intel_iommu_superpage)
+ dmar_domain->domain.pgsize_bitmap = SZ_4K;
- return 0;
+ /*
+ * Besides the internal write buffer flush, the caching mode used for
+ * legacy nested translation (which utilizes shadowing page tables)
+ * also requires iotlb sync on map.
+ */
+ if (rwbf_required(iommu) || cap_caching_mode(iommu->cap))
+ dmar_domain->iotlb_sync_map = true;
+
+ return &dmar_domain->domain;
}
-static int intel_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+static struct iommu_domain *
+intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
+ const struct iommu_user_data *user_data)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
- int ret;
-
- if (info->domain)
- device_block_translation(dev);
+ struct intel_iommu *iommu = info->iommu;
+ struct iommu_domain *domain;
- ret = prepare_domain_attach_device(domain, dev);
- if (ret)
- return ret;
+ if (user_data)
+ return ERR_PTR(-EOPNOTSUPP);
- return dmar_domain_attach_device(to_dmar_domain(domain), dev);
+ /* Prefer first stage if possible by default. */
+ domain = intel_iommu_domain_alloc_first_stage(dev, iommu, flags);
+ if (domain != ERR_PTR(-EOPNOTSUPP))
+ return domain;
+ return intel_iommu_domain_alloc_second_stage(dev, iommu, flags);
}
-static int intel_iommu_map(struct iommu_domain *domain,
- unsigned long iova, phys_addr_t hpa,
- size_t size, int iommu_prot, gfp_t gfp)
+static void intel_iommu_domain_free(struct iommu_domain *domain)
{
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- u64 max_addr;
- int prot = 0;
-
- if (iommu_prot & IOMMU_READ)
- prot |= DMA_PTE_READ;
- if (iommu_prot & IOMMU_WRITE)
- prot |= DMA_PTE_WRITE;
- if (dmar_domain->set_pte_snp)
- prot |= DMA_PTE_SNP;
-
- max_addr = iova + size;
- if (dmar_domain->max_addr < max_addr) {
- u64 end;
-
- /* check if minimum agaw is sufficient for mapped address */
- end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
- if (end < max_addr) {
- pr_err("%s: iommu width (%d) is not "
- "sufficient for the mapped address (%llx)\n",
- __func__, dmar_domain->gaw, max_addr);
- return -EFAULT;
- }
- dmar_domain->max_addr = max_addr;
- }
- /* Round up size to next multiple of PAGE_SIZE, if it and
- the low bits of hpa would take us onto the next page */
- size = aligned_nrpages(hpa, size);
- return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
- hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
+
+ if (WARN_ON(dmar_domain->nested_parent &&
+ !list_empty(&dmar_domain->s1_domains)))
+ return;
+
+ if (WARN_ON(!list_empty(&dmar_domain->devices)))
+ return;
+
+ pt_iommu_deinit(&dmar_domain->iommu);
+
+ kfree(dmar_domain->qi_batch);
+ kfree(dmar_domain);
}
-static int intel_iommu_map_pages(struct iommu_domain *domain,
- unsigned long iova, phys_addr_t paddr,
- size_t pgsize, size_t pgcount,
- int prot, gfp_t gfp, size_t *mapped)
+static int paging_domain_compatible_first_stage(struct dmar_domain *dmar_domain,
+ struct intel_iommu *iommu)
{
- unsigned long pgshift = __ffs(pgsize);
- size_t size = pgcount << pgshift;
- int ret;
+ if (WARN_ON(dmar_domain->domain.dirty_ops ||
+ dmar_domain->nested_parent))
+ return -EINVAL;
- if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
+ /* Only SL is available in legacy mode */
+ if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
return -EINVAL;
- if (!IS_ALIGNED(iova | paddr, pgsize))
+ if (!ecap_smpwc(iommu->ecap) &&
+ !(dmar_domain->fspt.x86_64_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)))
return -EINVAL;
- ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
- if (!ret && mapped)
- *mapped = size;
+ /* Supports the number of table levels */
+ if (!cap_fl5lp_support(iommu->cap) &&
+ dmar_domain->fspt.x86_64_pt.common.max_vasz_lg2 > 48)
+ return -EINVAL;
- return ret;
+ /* Same page size support */
+ if (!cap_fl1gp_support(iommu->cap) &&
+ (dmar_domain->domain.pgsize_bitmap & SZ_1G))
+ return -EINVAL;
+
+ /* iotlb sync on map requirement */
+ if ((rwbf_required(iommu)) && !dmar_domain->iotlb_sync_map)
+ return -EINVAL;
+
+ return 0;
}
-static size_t intel_iommu_unmap(struct iommu_domain *domain,
- unsigned long iova, size_t size,
- struct iommu_iotlb_gather *gather)
+static int
+paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain,
+ struct intel_iommu *iommu)
{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- unsigned long start_pfn, last_pfn;
- int level = 0;
+ unsigned int vasz_lg2 = dmar_domain->sspt.vtdss_pt.common.max_vasz_lg2;
+ unsigned int sslps = cap_super_page_val(iommu->cap);
+ struct pt_iommu_vtdss_hw_info pt_info;
- /* Cope with horrid API which requires us to unmap more than the
- size argument if it happens to be a large-page mapping. */
- if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
- &level, GFP_ATOMIC)))
- return 0;
+ pt_iommu_vtdss_hw_info(&dmar_domain->sspt, &pt_info);
- if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
- size = VTD_PAGE_SIZE << level_to_offset_bits(level);
+ if (dmar_domain->domain.dirty_ops && !ssads_supported(iommu))
+ return -EINVAL;
+ if (dmar_domain->nested_parent && !nested_supported(iommu))
+ return -EINVAL;
+
+ /* Legacy mode always supports second stage */
+ if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
+ return -EINVAL;
- start_pfn = iova >> VTD_PAGE_SHIFT;
- last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
+ if (!iommu_paging_structure_coherency(iommu) &&
+ !(dmar_domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)))
+ return -EINVAL;
- domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
+ /* Address width falls within the capability */
+ if (cap_mgaw(iommu->cap) < vasz_lg2)
+ return -EINVAL;
- if (dmar_domain->max_addr == iova + size)
- dmar_domain->max_addr = iova;
+ /* Page table level is supported. */
+ if (!(cap_sagaw(iommu->cap) & BIT(pt_info.aw)))
+ return -EINVAL;
+
+ /* Same page size support */
+ if (!(sslps & BIT(0)) && (dmar_domain->domain.pgsize_bitmap & SZ_2M))
+ return -EINVAL;
+ if (!(sslps & BIT(1)) && (dmar_domain->domain.pgsize_bitmap & SZ_1G))
+ return -EINVAL;
+
+ /* iotlb sync on map requirement */
+ if ((rwbf_required(iommu) || cap_caching_mode(iommu->cap)) &&
+ !dmar_domain->iotlb_sync_map)
+ return -EINVAL;
/*
- * We do not use page-selective IOTLB invalidation in flush queue,
- * so there is no need to track page and sync iotlb.
+ * FIXME this is locked wrong, it needs to be under the
+ * dmar_domain->lock
*/
- if (!iommu_iotlb_gather_queued(gather))
- iommu_iotlb_gather_add_page(domain, gather, iova, size);
+ if ((dmar_domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_VTDSS_FORCE_COHERENCE)) &&
+ !ecap_sc_support(iommu->ecap))
+ return -EINVAL;
+ return 0;
+}
- return size;
+int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct intel_iommu *iommu = info->iommu;
+ int ret = -EINVAL;
+
+ if (intel_domain_is_fs_paging(dmar_domain))
+ ret = paging_domain_compatible_first_stage(dmar_domain, iommu);
+ else if (intel_domain_is_ss_paging(dmar_domain))
+ ret = paging_domain_compatible_second_stage(dmar_domain, iommu);
+ else if (WARN_ON(true))
+ ret = -EINVAL;
+ if (ret)
+ return ret;
+
+ if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
+ context_copied(iommu, info->bus, info->devfn))
+ return intel_pasid_setup_sm_context(dev);
+
+ return 0;
}
-static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
- unsigned long iova,
- size_t pgsize, size_t pgcount,
- struct iommu_iotlb_gather *gather)
+static int intel_iommu_attach_device(struct iommu_domain *domain,
+ struct device *dev,
+ struct iommu_domain *old)
{
- unsigned long pgshift = __ffs(pgsize);
- size_t size = pgcount << pgshift;
+ int ret;
+
+ device_block_translation(dev);
- return intel_iommu_unmap(domain, iova, size, gather);
+ ret = paging_domain_compatible(domain, dev);
+ if (ret)
+ return ret;
+
+ ret = iopf_for_domain_set(domain, dev);
+ if (ret)
+ return ret;
+
+ ret = dmar_domain_attach_device(to_dmar_domain(domain), dev);
+ if (ret)
+ iopf_for_domain_remove(domain, dev);
+
+ return ret;
}
static void intel_iommu_tlb_sync(struct iommu_domain *domain,
struct iommu_iotlb_gather *gather)
{
cache_tag_flush_range(to_dmar_domain(domain), gather->start,
- gather->end, list_empty(&gather->freelist));
+ gather->end,
+ iommu_pages_list_empty(&gather->freelist));
iommu_put_pages_list(&gather->freelist);
}
-static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
- dma_addr_t iova)
-{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- struct dma_pte *pte;
- int level = 0;
- u64 phys = 0;
-
- pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
- GFP_ATOMIC);
- if (pte && dma_pte_present(pte))
- phys = dma_pte_addr(pte) +
- (iova & (BIT_MASK(level_to_offset_bits(level) +
- VTD_PAGE_SHIFT) - 1));
-
- return phys;
-}
-
static bool domain_support_force_snooping(struct dmar_domain *domain)
{
struct device_domain_info *info;
@@ -3938,44 +3171,41 @@ static bool domain_support_force_snooping(struct dmar_domain *domain)
return support;
}
-static void domain_set_force_snooping(struct dmar_domain *domain)
+static bool intel_iommu_enforce_cache_coherency_fs(struct iommu_domain *domain)
{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
struct device_domain_info *info;
- assert_spin_locked(&domain->lock);
- /*
- * Second level page table supports per-PTE snoop control. The
- * iommu_map() interface will handle this by setting SNP bit.
- */
- if (!domain->use_first_level) {
- domain->set_pte_snp = true;
- return;
- }
+ guard(spinlock_irqsave)(&dmar_domain->lock);
+
+ if (dmar_domain->force_snooping)
+ return true;
- list_for_each_entry(info, &domain->devices, link)
+ if (!domain_support_force_snooping(dmar_domain))
+ return false;
+
+ dmar_domain->force_snooping = true;
+ list_for_each_entry(info, &dmar_domain->devices, link)
intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
IOMMU_NO_PASID);
+ return true;
}
-static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
+static bool intel_iommu_enforce_cache_coherency_ss(struct iommu_domain *domain)
{
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- unsigned long flags;
-
- if (dmar_domain->force_snooping)
- return true;
- spin_lock_irqsave(&dmar_domain->lock, flags);
- if (!domain_support_force_snooping(dmar_domain) ||
- (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
- spin_unlock_irqrestore(&dmar_domain->lock, flags);
+ guard(spinlock_irqsave)(&dmar_domain->lock);
+ if (!domain_support_force_snooping(dmar_domain))
return false;
- }
- domain_set_force_snooping(dmar_domain);
+ /*
+ * Second level page table supports per-PTE snoop control. The
+ * iommu_map() interface will handle this by setting SNP bit.
+ */
+ dmar_domain->sspt.vtdss_pt.common.features |=
+ BIT(PT_FEAT_VTDSS_FORCE_COHERENCE);
dmar_domain->force_snooping = true;
- spin_unlock_irqrestore(&dmar_domain->lock, flags);
-
return true;
}
@@ -4053,13 +3283,14 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev)
}
if (info->ats_supported && ecap_prs(iommu->ecap) &&
- pci_pri_supported(pdev))
+ ecap_pds(iommu->ecap) && pci_pri_supported(pdev))
info->pri_supported = 1;
}
}
dev_iommu_priv_set(dev, info);
if (pdev && pci_ats_supported(pdev)) {
+ pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
ret = device_rbtree_insert(iommu, info);
if (ret)
goto free;
@@ -4092,11 +3323,48 @@ free:
return ERR_PTR(ret);
}
+static void intel_iommu_probe_finalize(struct device *dev)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct intel_iommu *iommu = info->iommu;
+
+ /*
+ * The PCIe spec, in its wisdom, declares that the behaviour of the
+ * device is undefined if you enable PASID support after ATS support.
+ * So always enable PASID support on devices which have it, even if
+ * we can't yet know if we're ever going to use it.
+ */
+ if (info->pasid_supported &&
+ !pci_enable_pasid(to_pci_dev(dev), info->pasid_supported & ~1))
+ info->pasid_enabled = 1;
+
+ if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
+ iommu_enable_pci_ats(info);
+ /* Assign a DEVTLB cache tag to the default domain. */
+ if (info->ats_enabled && info->domain) {
+ u16 did = domain_id_iommu(info->domain, iommu);
+
+ if (cache_tag_assign(info->domain, did, dev,
+ IOMMU_NO_PASID, CACHE_TAG_DEVTLB))
+ iommu_disable_pci_ats(info);
+ }
+ }
+ iommu_enable_pci_pri(info);
+}
+
static void intel_iommu_release_device(struct device *dev)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct intel_iommu *iommu = info->iommu;
+ iommu_disable_pci_pri(info);
+ iommu_disable_pci_ats(info);
+
+ if (info->pasid_enabled) {
+ pci_disable_pasid(to_pci_dev(dev));
+ info->pasid_enabled = 0;
+ }
+
mutex_lock(&iommu->iopf_lock);
if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
device_rbtree_remove(info);
@@ -4109,7 +3377,6 @@ static void intel_iommu_release_device(struct device *dev)
intel_pasid_free_table(dev);
intel_iommu_debugfs_remove_dev(info);
kfree(info);
- set_dma_ops(dev, NULL);
}
static void intel_iommu_get_resv_regions(struct device *device,
@@ -4178,132 +3445,44 @@ static struct iommu_group *intel_iommu_device_group(struct device *dev)
return generic_device_group(dev);
}
-static int intel_iommu_enable_sva(struct device *dev)
-{
- struct device_domain_info *info = dev_iommu_priv_get(dev);
- struct intel_iommu *iommu;
-
- if (!info || dmar_disabled)
- return -EINVAL;
-
- iommu = info->iommu;
- if (!iommu)
- return -EINVAL;
-
- if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
- return -ENODEV;
-
- if (!info->pasid_enabled || !info->ats_enabled)
- return -EINVAL;
-
- /*
- * Devices having device-specific I/O fault handling should not
- * support PCI/PRI. The IOMMU side has no means to check the
- * capability of device-specific IOPF. Therefore, IOMMU can only
- * default that if the device driver enables SVA on a non-PRI
- * device, it will handle IOPF in its own way.
- */
- if (!info->pri_supported)
- return 0;
-
- /* Devices supporting PRI should have it enabled. */
- if (!info->pri_enabled)
- return -EINVAL;
-
- return 0;
-}
-
-static int intel_iommu_enable_iopf(struct device *dev)
+int intel_iommu_enable_iopf(struct device *dev)
{
- struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
struct device_domain_info *info = dev_iommu_priv_get(dev);
- struct intel_iommu *iommu;
+ struct intel_iommu *iommu = info->iommu;
int ret;
- if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
+ if (!info->pri_enabled)
return -ENODEV;
- if (info->pri_enabled)
- return -EBUSY;
-
- iommu = info->iommu;
- if (!iommu)
- return -EINVAL;
-
- /* PASID is required in PRG Response Message. */
- if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
- return -EINVAL;
-
- ret = pci_reset_pri(pdev);
- if (ret)
- return ret;
+ /* pri_enabled is protected by the group mutex. */
+ iommu_group_mutex_assert(dev);
+ if (info->iopf_refcount) {
+ info->iopf_refcount++;
+ return 0;
+ }
ret = iopf_queue_add_device(iommu->iopf_queue, dev);
if (ret)
return ret;
- ret = pci_enable_pri(pdev, PRQ_DEPTH);
- if (ret) {
- iopf_queue_remove_device(iommu->iopf_queue, dev);
- return ret;
- }
-
- info->pri_enabled = 1;
+ info->iopf_refcount = 1;
return 0;
}
-static int intel_iommu_disable_iopf(struct device *dev)
+void intel_iommu_disable_iopf(struct device *dev)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct intel_iommu *iommu = info->iommu;
- if (!info->pri_enabled)
- return -EINVAL;
-
- /*
- * PCIe spec states that by clearing PRI enable bit, the Page
- * Request Interface will not issue new page requests, but has
- * outstanding page requests that have been transmitted or are
- * queued for transmission. This is supposed to be called after
- * the device driver has stopped DMA, all PASIDs have been
- * unbound and the outstanding PRQs have been drained.
- */
- pci_disable_pri(to_pci_dev(dev));
- info->pri_enabled = 0;
- iopf_queue_remove_device(iommu->iopf_queue, dev);
-
- return 0;
-}
-
-static int
-intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
-{
- switch (feat) {
- case IOMMU_DEV_FEAT_IOPF:
- return intel_iommu_enable_iopf(dev);
-
- case IOMMU_DEV_FEAT_SVA:
- return intel_iommu_enable_sva(dev);
-
- default:
- return -ENODEV;
- }
-}
-
-static int
-intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
-{
- switch (feat) {
- case IOMMU_DEV_FEAT_IOPF:
- return intel_iommu_disable_iopf(dev);
+ if (WARN_ON(!info->pri_enabled || !info->iopf_refcount))
+ return;
- case IOMMU_DEV_FEAT_SVA:
- return 0;
+ iommu_group_mutex_assert(dev);
+ if (--info->iopf_refcount)
+ return;
- default:
- return -ENODEV;
- }
+ iopf_queue_remove_device(iommu->iopf_queue, dev);
}
static bool intel_iommu_is_attach_deferred(struct device *dev)
@@ -4333,20 +3512,31 @@ static bool risky_device(struct pci_dev *pdev)
static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
unsigned long iova, size_t size)
{
- cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+
+ if (dmar_domain->iotlb_sync_map)
+ cache_tag_flush_range_np(dmar_domain, iova, iova + size - 1);
return 0;
}
-static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
- struct iommu_domain *domain)
+void domain_remove_dev_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
struct dev_pasid_info *curr, *dev_pasid = NULL;
struct intel_iommu *iommu = info->iommu;
+ struct dmar_domain *dmar_domain;
unsigned long flags;
+ if (!domain)
+ return;
+
+ /* Identity domain has no meta data for pasid. */
+ if (domain->type == IOMMU_DOMAIN_IDENTITY)
+ return;
+
+ dmar_domain = to_dmar_domain(domain);
spin_lock_irqsave(&dmar_domain->lock, flags);
list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
if (curr->dev == dev && curr->pasid == pasid) {
@@ -4355,27 +3545,79 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
break;
}
}
- WARN_ON_ONCE(!dev_pasid);
spin_unlock_irqrestore(&dmar_domain->lock, flags);
cache_tag_unassign_domain(dmar_domain, dev, pasid);
domain_detach_iommu(dmar_domain, iommu);
- intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
+ if (!WARN_ON_ONCE(!dev_pasid)) {
+ intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
+ kfree(dev_pasid);
+ }
+}
+
+static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+
+ intel_pasid_tear_down_entry(info->iommu, dev, pasid, false);
+ iopf_for_domain_remove(old, dev);
+ domain_remove_dev_pasid(old, dev, pasid);
+
+ return 0;
+}
+
+struct dev_pasid_info *
+domain_add_dev_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct intel_iommu *iommu = info->iommu;
+ struct dev_pasid_info *dev_pasid;
+ unsigned long flags;
+ int ret;
+
+ dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
+ if (!dev_pasid)
+ return ERR_PTR(-ENOMEM);
+
+ ret = domain_attach_iommu(dmar_domain, iommu);
+ if (ret)
+ goto out_free;
+
+ ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
+ if (ret)
+ goto out_detach_iommu;
+
+ dev_pasid->dev = dev;
+ dev_pasid->pasid = pasid;
+ spin_lock_irqsave(&dmar_domain->lock, flags);
+ list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
+ spin_unlock_irqrestore(&dmar_domain->lock, flags);
+
+ return dev_pasid;
+out_detach_iommu:
+ domain_detach_iommu(dmar_domain, iommu);
+out_free:
kfree(dev_pasid);
- intel_pasid_tear_down_entry(iommu, dev, pasid, false);
- intel_drain_pasid_prq(dev, pasid);
+ return ERR_PTR(ret);
}
static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
- struct device *dev, ioasid_t pasid)
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
struct intel_iommu *iommu = info->iommu;
struct dev_pasid_info *dev_pasid;
- unsigned long flags;
int ret;
+ if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
+ return -EINVAL;
+
if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
return -EOPNOTSUPP;
@@ -4385,58 +3627,54 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
if (context_copied(iommu, info->bus, info->devfn))
return -EBUSY;
- ret = prepare_domain_attach_device(domain, dev);
+ ret = paging_domain_compatible(domain, dev);
if (ret)
return ret;
- dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
- if (!dev_pasid)
- return -ENOMEM;
+ dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
+ if (IS_ERR(dev_pasid))
+ return PTR_ERR(dev_pasid);
- ret = domain_attach_iommu(dmar_domain, iommu);
+ ret = iopf_for_domain_replace(domain, old, dev);
if (ret)
- goto out_free;
+ goto out_remove_dev_pasid;
- ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
- if (ret)
- goto out_detach_iommu;
-
- if (domain_type_is_si(dmar_domain))
- ret = intel_pasid_setup_pass_through(iommu, dev, pasid);
- else if (dmar_domain->use_first_level)
+ if (intel_domain_is_fs_paging(dmar_domain))
ret = domain_setup_first_level(iommu, dmar_domain,
- dev, pasid);
- else
- ret = intel_pasid_setup_second_level(iommu, dmar_domain,
- dev, pasid);
+ dev, pasid, old);
+ else if (intel_domain_is_ss_paging(dmar_domain))
+ ret = domain_setup_second_level(iommu, dmar_domain,
+ dev, pasid, old);
+ else if (WARN_ON(true))
+ ret = -EINVAL;
+
if (ret)
- goto out_unassign_tag;
+ goto out_unwind_iopf;
- dev_pasid->dev = dev;
- dev_pasid->pasid = pasid;
- spin_lock_irqsave(&dmar_domain->lock, flags);
- list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
- spin_unlock_irqrestore(&dmar_domain->lock, flags);
+ domain_remove_dev_pasid(old, dev, pasid);
- if (domain->type & __IOMMU_DOMAIN_PAGING)
- intel_iommu_debugfs_create_dev_pasid(dev_pasid);
+ intel_iommu_debugfs_create_dev_pasid(dev_pasid);
return 0;
-out_unassign_tag:
- cache_tag_unassign_domain(dmar_domain, dev, pasid);
-out_detach_iommu:
- domain_detach_iommu(dmar_domain, iommu);
-out_free:
- kfree(dev_pasid);
+
+out_unwind_iopf:
+ iopf_for_domain_replace(old, domain, dev);
+out_remove_dev_pasid:
+ domain_remove_dev_pasid(domain, dev, pasid);
return ret;
}
-static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
+static void *intel_iommu_hw_info(struct device *dev, u32 *length,
+ enum iommu_hw_info_type *type)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct intel_iommu *iommu = info->iommu;
struct iommu_hw_info_vtd *vtd;
+ if (*type != IOMMU_HW_INFO_TYPE_DEFAULT &&
+ *type != IOMMU_HW_INFO_TYPE_INTEL_VTD)
+ return ERR_PTR(-EOPNOTSUPP);
+
vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
if (!vtd)
return ERR_PTR(-ENOMEM);
@@ -4530,82 +3768,163 @@ err_unwind:
return ret;
}
-static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
- unsigned long iova, size_t size,
- unsigned long flags,
- struct iommu_dirty_bitmap *dirty)
+static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
{
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- unsigned long end = iova + size - 1;
- unsigned long pgsize;
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct intel_iommu *iommu = info->iommu;
+ struct context_entry *context;
+
+ spin_lock(&iommu->lock);
+ context = iommu_context_addr(iommu, bus, devfn, 1);
+ if (!context) {
+ spin_unlock(&iommu->lock);
+ return -ENOMEM;
+ }
+
+ if (context_present(context) && !context_copied(iommu, bus, devfn)) {
+ spin_unlock(&iommu->lock);
+ return 0;
+ }
+
+ copied_context_tear_down(iommu, context, bus, devfn);
+ context_clear_entry(context);
+ context_set_domain_id(context, FLPT_DEFAULT_DID);
/*
- * IOMMUFD core calls into a dirty tracking disabled domain without an
- * IOVA bitmap set in order to clean dirty bits in all PTEs that might
- * have occurred when we stopped dirty tracking. This ensures that we
- * never inherit dirtied bits from a previous cycle.
+ * In pass through mode, AW must be programmed to indicate the largest
+ * AGAW value supported by hardware. And ASR is ignored by hardware.
*/
- if (!dmar_domain->dirty_tracking && dirty->bitmap)
- return -EINVAL;
+ context_set_address_width(context, iommu->msagaw);
+ context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
+ context_set_fault_enable(context);
+ context_set_present(context);
+ if (!ecap_coherent(iommu->ecap))
+ clflush_cache_range(context, sizeof(*context));
+ context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
+ spin_unlock(&iommu->lock);
+
+ return 0;
+}
- do {
- struct dma_pte *pte;
- int lvl = 0;
+static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
+{
+ struct device *dev = data;
- pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
- GFP_ATOMIC);
- pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
- if (!pte || !dma_pte_present(pte)) {
- iova += pgsize;
- continue;
- }
+ return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
+}
+
+static int device_setup_pass_through(struct device *dev)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
- if (dma_sl_pte_test_and_clear_dirty(pte, flags))
- iommu_dirty_bitmap_record(dirty, iova, pgsize);
- iova += pgsize;
- } while (iova < end);
+ if (!dev_is_pci(dev))
+ return context_setup_pass_through(dev, info->bus, info->devfn);
+
+ return pci_for_each_dma_alias(to_pci_dev(dev),
+ context_setup_pass_through_cb, dev);
+}
+static int identity_domain_attach_dev(struct iommu_domain *domain,
+ struct device *dev,
+ struct iommu_domain *old)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct intel_iommu *iommu = info->iommu;
+ int ret;
+
+ device_block_translation(dev);
+
+ if (dev_is_real_dma_subdevice(dev))
+ return 0;
+
+ /*
+ * No PRI support with the global identity domain. No need to enable or
+ * disable PRI in this path as the iommu has been put in the blocking
+ * state.
+ */
+ if (sm_supported(iommu))
+ ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
+ else
+ ret = device_setup_pass_through(dev);
+
+ if (!ret)
+ info->domain_attached = true;
+
+ return ret;
+}
+
+static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct intel_iommu *iommu = info->iommu;
+ int ret;
+
+ if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
+ return -EOPNOTSUPP;
+
+ ret = iopf_for_domain_replace(domain, old, dev);
+ if (ret)
+ return ret;
+
+ ret = domain_setup_passthrough(iommu, dev, pasid, old);
+ if (ret) {
+ iopf_for_domain_replace(old, domain, dev);
+ return ret;
+ }
+
+ domain_remove_dev_pasid(old, dev, pasid);
return 0;
}
-static const struct iommu_dirty_ops intel_dirty_ops = {
- .set_dirty_tracking = intel_iommu_set_dirty_tracking,
- .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
+static struct iommu_domain identity_domain = {
+ .type = IOMMU_DOMAIN_IDENTITY,
+ .ops = &(const struct iommu_domain_ops) {
+ .attach_dev = identity_domain_attach_dev,
+ .set_dev_pasid = identity_domain_set_dev_pasid,
+ },
+};
+
+const struct iommu_domain_ops intel_fs_paging_domain_ops = {
+ IOMMU_PT_DOMAIN_OPS(x86_64),
+ .attach_dev = intel_iommu_attach_device,
+ .set_dev_pasid = intel_iommu_set_dev_pasid,
+ .iotlb_sync_map = intel_iommu_iotlb_sync_map,
+ .flush_iotlb_all = intel_flush_iotlb_all,
+ .iotlb_sync = intel_iommu_tlb_sync,
+ .free = intel_iommu_domain_free,
+ .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_fs,
+};
+
+const struct iommu_domain_ops intel_ss_paging_domain_ops = {
+ IOMMU_PT_DOMAIN_OPS(vtdss),
+ .attach_dev = intel_iommu_attach_device,
+ .set_dev_pasid = intel_iommu_set_dev_pasid,
+ .iotlb_sync_map = intel_iommu_iotlb_sync_map,
+ .flush_iotlb_all = intel_flush_iotlb_all,
+ .iotlb_sync = intel_iommu_tlb_sync,
+ .free = intel_iommu_domain_free,
+ .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_ss,
};
const struct iommu_ops intel_iommu_ops = {
.blocked_domain = &blocking_domain,
.release_domain = &blocking_domain,
+ .identity_domain = &identity_domain,
.capable = intel_iommu_capable,
.hw_info = intel_iommu_hw_info,
- .domain_alloc = intel_iommu_domain_alloc,
- .domain_alloc_user = intel_iommu_domain_alloc_user,
+ .domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags,
.domain_alloc_sva = intel_svm_domain_alloc,
+ .domain_alloc_nested = intel_iommu_domain_alloc_nested,
.probe_device = intel_iommu_probe_device,
+ .probe_finalize = intel_iommu_probe_finalize,
.release_device = intel_iommu_release_device,
.get_resv_regions = intel_iommu_get_resv_regions,
.device_group = intel_iommu_device_group,
- .dev_enable_feat = intel_iommu_dev_enable_feat,
- .dev_disable_feat = intel_iommu_dev_disable_feat,
.is_attach_deferred = intel_iommu_is_attach_deferred,
.def_domain_type = device_def_domain_type,
- .remove_dev_pasid = intel_iommu_remove_dev_pasid,
- .pgsize_bitmap = SZ_4K,
-#ifdef CONFIG_INTEL_IOMMU_SVM
- .page_response = intel_svm_page_response,
-#endif
- .default_domain_ops = &(const struct iommu_domain_ops) {
- .attach_dev = intel_iommu_attach_device,
- .set_dev_pasid = intel_iommu_set_dev_pasid,
- .map_pages = intel_iommu_map_pages,
- .unmap_pages = intel_iommu_unmap_pages,
- .iotlb_sync_map = intel_iommu_iotlb_sync_map,
- .flush_iotlb_all = intel_flush_iotlb_all,
- .iotlb_sync = intel_iommu_tlb_sync,
- .iova_to_phys = intel_iommu_iova_to_phys,
- .free = intel_iommu_domain_free,
- .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
- }
+ .page_response = intel_iommu_page_response,
};
static void quirk_iommu_igfx(struct pci_dev *dev)
@@ -4626,6 +3945,9 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
+/* QM57/QS57 integrated gfx malfunctions with dmar */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_iommu_igfx);
+
/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
@@ -4703,7 +4025,6 @@ static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
@@ -4897,3 +4218,5 @@ err:
return ret;
}
+
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index eaf015b4353b..25c5e22096d4 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -22,8 +22,9 @@
#include <linux/bitfield.h>
#include <linux/xarray.h>
#include <linux/perf_event.h>
+#include <linux/pci.h>
+#include <linux/generic_pt/iommu.h>
-#include <asm/cacheflush.h>
#include <asm/iommu.h>
#include <uapi/linux/iommufd.h>
@@ -49,7 +50,6 @@
#define DMA_FL_PTE_US BIT_ULL(2)
#define DMA_FL_PTE_ACCESS BIT_ULL(5)
#define DMA_FL_PTE_DIRTY BIT_ULL(6)
-#define DMA_FL_PTE_XD BIT_ULL(63)
#define DMA_SL_PTE_DIRTY_BIT 9
#define DMA_SL_PTE_DIRTY BIT_ULL(DMA_SL_PTE_DIRTY_BIT)
@@ -77,7 +77,6 @@
#define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data register */
#define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr register */
#define DMAR_FEUADDR_REG 0x44 /* Upper address register */
-#define DMAR_AFLOG_REG 0x58 /* Advanced Fault control */
#define DMAR_PMEN_REG 0x64 /* Enable Protected Memory Region */
#define DMAR_PLMBASE_REG 0x68 /* PMRR Low addr */
#define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */
@@ -173,8 +172,6 @@
#define cap_pgsel_inv(c) (((c) >> 39) & 1)
#define cap_super_page_val(c) (((c) >> 34) & 0xf)
-#define cap_super_offset(c) (((find_first_bit(&cap_super_page_val(c), 4)) \
- * OFFSET_STRIDE) + 21)
#define cap_fault_reg_offset(c) ((((c) >> 24) & 0x3ff) * 16)
#define cap_max_fault_reg_offset(c) \
@@ -462,7 +459,6 @@ enum {
#define QI_PGRP_PASID(pasid) (((u64)(pasid)) << 32)
/* Page group response descriptor QW1 */
-#define QI_PGRP_LPIG(x) (((u64)(x)) << 2)
#define QI_PGRP_IDX(idx) (((u64)(idx)) << 3)
@@ -493,14 +489,13 @@ struct q_inval {
/* Page Request Queue depth */
#define PRQ_ORDER 4
-#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20)
-#define PRQ_DEPTH ((0x1000 << PRQ_ORDER) >> 5)
+#define PRQ_SIZE (SZ_4K << PRQ_ORDER)
+#define PRQ_RING_MASK (PRQ_SIZE - 0x20)
+#define PRQ_DEPTH (PRQ_SIZE >> 5)
struct dmar_pci_notify_info;
#ifdef CONFIG_IRQ_REMAP
-/* 1MB - maximum possible interrupt remapping table size */
-#define INTR_REMAP_PAGE_ORDER 8
#define INTR_REMAP_TABLE_REG_SIZE 0xf
#define INTR_REMAP_TABLE_REG_SIZE_MASK 0xf
@@ -542,7 +537,8 @@ enum {
#define pasid_supported(iommu) (sm_supported(iommu) && \
ecap_pasid((iommu)->ecap))
#define ssads_supported(iommu) (sm_supported(iommu) && \
- ecap_slads((iommu)->ecap))
+ ecap_slads((iommu)->ecap) && \
+ ecap_smpwc(iommu->ecap))
#define nested_supported(iommu) (sm_supported(iommu) && \
ecap_nest((iommu)->ecap))
@@ -585,23 +581,36 @@ struct iommu_domain_info {
* to VT-d spec, section 9.3 */
};
+/*
+ * We start simply by using a fixed size for the batched descriptors. This
+ * size is currently sufficient for our needs. Future improvements could
+ * involve dynamically allocating the batch buffer based on actual demand,
+ * allowing us to adjust the batch size for optimal performance in different
+ * scenarios.
+ */
+#define QI_MAX_BATCHED_DESC_COUNT 16
+struct qi_batch {
+ struct qi_desc descs[QI_MAX_BATCHED_DESC_COUNT];
+ unsigned int index;
+};
+
struct dmar_domain {
- int nid; /* node id */
+ union {
+ struct iommu_domain domain;
+ struct pt_iommu iommu;
+ /* First stage page table */
+ struct pt_iommu_x86_64 fspt;
+ /* Second stage page table */
+ struct pt_iommu_vtdss sspt;
+ };
+
struct xarray iommu_array; /* Attached IOMMU array */
- u8 has_iotlb_device: 1;
- u8 iommu_coherency: 1; /* indicate coherency of iommu access */
- u8 force_snooping : 1; /* Create IOPTEs with snoop control */
- u8 set_pte_snp:1;
- u8 use_first_level:1; /* DMA translation for the domain goes
- * through the first level page table,
- * otherwise, goes through the second
- * level.
- */
+ u8 force_snooping:1; /* Create PASID entry with snoop control */
u8 dirty_tracking:1; /* Dirty tracking is enabled */
u8 nested_parent:1; /* Has other domains nested on it */
- u8 has_mappings:1; /* Has mappings configured through
- * iommu_map() interface.
+ u8 iotlb_sync_map:1; /* Need to flush IOTLB cache or write
+ * buffer when creating mappings.
*/
spinlock_t lock; /* Protect device tracking lists */
@@ -610,27 +619,11 @@ struct dmar_domain {
spinlock_t cache_lock; /* Protect the cache tag list */
struct list_head cache_tags; /* Cache tag list */
+ struct qi_batch *qi_batch; /* Batched QI descriptors */
- int iommu_superpage;/* Level of superpages supported:
- 0 == 4KiB (no superpages), 1 == 2MiB,
- 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
union {
/* DMA remapping domain */
struct {
- /* virtual address */
- struct dma_pte *pgd;
- /* max guest address width */
- int gaw;
- /*
- * adjusted guest address width:
- * 0: level 2 30-bit
- * 1: level 3 39-bit
- * 2: level 4 48-bit
- * 3: level 5 57-bit
- */
- int agaw;
- /* maximum mapped address */
- u64 max_addr;
/* Protect the s1_domains list */
spinlock_t s1_lock;
/* Track s1_domains nested on this domain */
@@ -641,8 +634,6 @@ struct dmar_domain {
struct {
/* parent page table which the user domain is nested on */
struct dmar_domain *s2_domain;
- /* user page table pointer (in GPA) */
- unsigned long s1_pgtbl;
/* page table attributes */
struct iommu_hwpt_vtd_s1 s1_cfg;
/* link to parent domain siblings */
@@ -654,10 +645,10 @@ struct dmar_domain {
struct mmu_notifier notifier;
};
};
-
- struct iommu_domain domain; /* generic domain data structure for
- iommu core */
};
+PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, sspt.iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, fspt.iommu, domain);
/*
* In theory, the VT-d 4.0 spec can support up to 2 ^ 16 counters.
@@ -688,8 +679,6 @@ struct iommu_pmu {
DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX);
struct perf_event *event_list[IOMMU_PMU_IDX_MAX];
unsigned char irq_name[16];
- struct hlist_node cpuhp_node;
- int cpu;
};
#define IOMMU_IRQ_ID_OFFSET_PRQ (DMAR_UNITS_SUPPORTED)
@@ -710,22 +699,22 @@ struct intel_iommu {
int msagaw; /* max sagaw of this iommu */
unsigned int irq, pr_irq, perf_irq;
u16 segment; /* PCI segment# */
- unsigned char name[13]; /* Device Name */
+ unsigned char name[16]; /* Device Name */
#ifdef CONFIG_INTEL_IOMMU
- unsigned long *domain_ids; /* bitmap of domains */
+ /* mutex to protect domain_ida */
+ struct mutex did_lock;
+ struct ida domain_ida; /* domain id allocator */
unsigned long *copied_tables; /* bitmap of copied tables */
spinlock_t lock; /* protect context, domain ids */
struct root_entry *root_entry; /* virtual address */
struct iommu_flush flush;
#endif
-#ifdef CONFIG_INTEL_IOMMU_SVM
struct page_req_dsc *prq;
unsigned char prq_name[16]; /* Name for PRQ interrupt */
unsigned long prq_seq_number;
struct completion prq_complete;
-#endif
struct iopf_queue *iopf_queue;
unsigned char iopfq_name[16];
/* Synchronization between fault report and iommu device release. */
@@ -766,7 +755,9 @@ struct device_domain_info {
u8 ats_supported:1;
u8 ats_enabled:1;
u8 dtlb_extra_inval:1; /* Quirk for devices need extra flush */
+ u8 domain_attached:1; /* Device has domain attached */
u8 ats_qdep;
+ unsigned int iopf_refcount;
struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
struct intel_iommu *iommu; /* IOMMU used by this device */
struct dmar_domain *domain; /* pointer to domain */
@@ -800,6 +791,24 @@ static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
return container_of(dom, struct dmar_domain, domain);
}
+/*
+ * Domain ID 0 and 1 are reserved:
+ *
+ * If Caching mode is set, then invalid translations are tagged
+ * with domain-id 0, hence we need to pre-allocate it. We also
+ * use domain-id 0 as a marker for non-allocated domain-id, so
+ * make sure it is not used for a real domain.
+ *
+ * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
+ * entry for first-level or pass-through translation modes should
+ * be programmed with a domain id different from those used for
+ * second-level or nested translation. We reserve a domain id for
+ * this purpose. This domain id is also used for identity domain
+ * in legacy mode.
+ */
+#define FLPT_DEFAULT_DID 1
+#define IDA_START_DID 2
+
/* Retrieve the domain ID which has allocated to the domain */
static inline u16
domain_id_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
@@ -810,6 +819,21 @@ domain_id_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
return info->did;
}
+static inline u16
+iommu_domain_did(struct iommu_domain *domain, struct intel_iommu *iommu)
+{
+ if (domain->type == IOMMU_DOMAIN_SVA ||
+ domain->type == IOMMU_DOMAIN_IDENTITY)
+ return FLPT_DEFAULT_DID;
+ return domain_id_iommu(to_dmar_domain(domain), iommu);
+}
+
+static inline bool dev_is_real_dma_subdevice(struct device *dev)
+{
+ return dev && dev_is_pci(dev) &&
+ pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
+}
+
/*
* 0: readable
* 1: writable
@@ -823,19 +847,13 @@ struct dma_pte {
u64 val;
};
-static inline void dma_clear_pte(struct dma_pte *pte)
-{
- pte->val = 0;
-}
-
static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
- return pte->val & VTD_PAGE_MASK & (~DMA_FL_PTE_XD);
+ return pte->val & VTD_PAGE_MASK;
#else
/* Must have a full atomic 64-bit read */
- return __cmpxchg64(&pte->val, 0ULL, 0ULL) &
- VTD_PAGE_MASK & (~DMA_FL_PTE_XD);
+ return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
#endif
}
@@ -844,32 +862,11 @@ static inline bool dma_pte_present(struct dma_pte *pte)
return (pte->val & 3) != 0;
}
-static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
- unsigned long flags)
-{
- if (flags & IOMMU_DIRTY_NO_CLEAR)
- return (pte->val & DMA_SL_PTE_DIRTY) != 0;
-
- return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT,
- (unsigned long *)&pte->val);
-}
-
static inline bool dma_pte_superpage(struct dma_pte *pte)
{
return (pte->val & DMA_PTE_LARGE_PAGE);
}
-static inline bool first_pte_in_page(struct dma_pte *pte)
-{
- return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE);
-}
-
-static inline int nr_pte_to_next_page(struct dma_pte *pte)
-{
- return first_pte_in_page(pte) ? BIT_ULL(VTD_STRIDE_SHIFT) :
- (struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte;
-}
-
static inline bool context_present(struct context_entry *context)
{
return (context->lo & 1);
@@ -885,11 +882,6 @@ static inline int agaw_to_level(int agaw)
return agaw + 2;
}
-static inline int agaw_to_width(int agaw)
-{
- return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
-}
-
static inline int width_to_agaw(int width)
{
return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
@@ -905,44 +897,6 @@ static inline int pfn_level_offset(u64 pfn, int level)
return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}
-static inline u64 level_mask(int level)
-{
- return -1ULL << level_to_offset_bits(level);
-}
-
-static inline u64 level_size(int level)
-{
- return 1ULL << level_to_offset_bits(level);
-}
-
-static inline u64 align_to_level(u64 pfn, int level)
-{
- return (pfn + level_size(level) - 1) & level_mask(level);
-}
-
-static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
-{
- return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
-}
-
-/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
- are never going to work. */
-static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
-{
- return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
-}
-static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
-{
- return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1;
-}
-static inline unsigned long page_to_dma_pfn(struct page *pg)
-{
- return mm_to_dma_pfn_start(page_to_pfn(pg));
-}
-static inline unsigned long virt_to_dma_pfn(void *p)
-{
- return page_to_dma_pfn(virt_to_page(p));
-}
static inline void context_set_present(struct context_entry *context)
{
@@ -1047,6 +1001,15 @@ static inline void context_set_sm_pre(struct context_entry *context)
context->lo |= BIT_ULL(4);
}
+/*
+ * Clear the PRE(Page Request Enable) field of a scalable mode context
+ * entry.
+ */
+static inline void context_clear_sm_pre(struct context_entry *context)
+{
+ context->lo &= ~BIT_ULL(4);
+}
+
/* Returns a number of VTD pages, but aligned to MM page size */
static inline unsigned long aligned_nrpages(unsigned long host_addr, size_t size)
{
@@ -1060,6 +1023,115 @@ static inline unsigned long nrpages_to_size(unsigned long npages)
return npages << VTD_PAGE_SHIFT;
}
+static inline void qi_desc_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+ unsigned int size_order, u64 type,
+ struct qi_desc *desc)
+{
+ u8 dw = 0, dr = 0;
+ int ih = addr & 1;
+
+ if (cap_write_drain(iommu->cap))
+ dw = 1;
+
+ if (cap_read_drain(iommu->cap))
+ dr = 1;
+
+ desc->qw0 = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw)
+ | QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE;
+ desc->qw1 = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih)
+ | QI_IOTLB_AM(size_order);
+ desc->qw2 = 0;
+ desc->qw3 = 0;
+}
+
+static inline void qi_desc_dev_iotlb(u16 sid, u16 pfsid, u16 qdep, u64 addr,
+ unsigned int mask, struct qi_desc *desc)
+{
+ if (mask) {
+ addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1;
+ desc->qw1 = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE;
+ } else {
+ desc->qw1 = QI_DEV_IOTLB_ADDR(addr);
+ }
+
+ if (qdep >= QI_DEV_IOTLB_MAX_INVS)
+ qdep = 0;
+
+ desc->qw0 = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
+ QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid);
+ desc->qw2 = 0;
+ desc->qw3 = 0;
+}
+
+static inline void qi_desc_piotlb(u16 did, u32 pasid, u64 addr,
+ unsigned long npages, bool ih,
+ struct qi_desc *desc)
+{
+ if (npages == -1) {
+ desc->qw0 = QI_EIOTLB_PASID(pasid) |
+ QI_EIOTLB_DID(did) |
+ QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
+ QI_EIOTLB_TYPE;
+ desc->qw1 = 0;
+ } else {
+ int mask = ilog2(__roundup_pow_of_two(npages));
+ unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask));
+
+ if (WARN_ON_ONCE(!IS_ALIGNED(addr, align)))
+ addr = ALIGN_DOWN(addr, align);
+
+ desc->qw0 = QI_EIOTLB_PASID(pasid) |
+ QI_EIOTLB_DID(did) |
+ QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) |
+ QI_EIOTLB_TYPE;
+ desc->qw1 = QI_EIOTLB_ADDR(addr) |
+ QI_EIOTLB_IH(ih) |
+ QI_EIOTLB_AM(mask);
+ }
+}
+
+static inline void qi_desc_dev_iotlb_pasid(u16 sid, u16 pfsid, u32 pasid,
+ u16 qdep, u64 addr,
+ unsigned int size_order,
+ struct qi_desc *desc)
+{
+ unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1);
+
+ desc->qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) |
+ QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE |
+ QI_DEV_IOTLB_PFSID(pfsid);
+
+ /*
+ * If S bit is 0, we only flush a single page. If S bit is set,
+ * The least significant zero bit indicates the invalidation address
+ * range. VT-d spec 6.5.2.6.
+ * e.g. address bit 12[0] indicates 8KB, 13[0] indicates 16KB.
+ * size order = 0 is PAGE_SIZE 4KB
+ * Max Invs Pending (MIP) is set to 0 for now until we have DIT in
+ * ECAP.
+ */
+ if (!IS_ALIGNED(addr, VTD_PAGE_SIZE << size_order))
+ pr_warn_ratelimited("Invalidate non-aligned address %llx, order %d\n",
+ addr, size_order);
+
+ /* Take page address */
+ desc->qw1 = QI_DEV_EIOTLB_ADDR(addr);
+
+ if (size_order) {
+ /*
+ * Existing 0s in address below size_order may be the least
+ * significant bit, we must set them to 1s to avoid having
+ * smaller size than desired.
+ */
+ desc->qw1 |= GENMASK_ULL(size_order + VTD_PAGE_SHIFT - 1,
+ VTD_PAGE_SHIFT);
+ /* Clear size_order bit to indicate size */
+ desc->qw1 &= ~mask;
+ /* Set the S bit to indicate flushing more than 1 page */
+ desc->qw1 |= QI_DEV_EIOTLB_SIZE;
+ }
+}
+
/* Convert value to context PASID directory size field coding. */
#define context_pdts(pds) (((pds) & 0x7) << 9)
@@ -1091,25 +1163,37 @@ void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu,
int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
unsigned int count, unsigned long options);
+
+void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+ unsigned int size_order, u64 type);
/*
* Options used in qi_submit_sync:
* QI_OPT_WAIT_DRAIN - Wait for PRQ drain completion, spec 6.5.2.8.
*/
#define QI_OPT_WAIT_DRAIN BIT(0)
-void domain_update_iotlb(struct dmar_domain *domain);
int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
void device_block_translation(struct device *dev);
-int prepare_domain_attach_device(struct iommu_domain *domain,
- struct device *dev);
-void domain_update_iommu_cap(struct dmar_domain *domain);
+int paging_domain_compatible(struct iommu_domain *domain, struct device *dev);
+
+struct dev_pasid_info *
+domain_add_dev_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid);
+void domain_remove_dev_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid);
+
+int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev,
+ ioasid_t pasid, u16 did, phys_addr_t fsptptr,
+ int flags, struct iommu_domain *old);
int dmar_ir_support(void);
void iommu_flush_write_buffer(struct intel_iommu *iommu);
-struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
- const struct iommu_user_data *user_data);
+struct iommu_domain *
+intel_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
+ u32 flags,
+ const struct iommu_user_data *user_data);
struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid);
enum cache_tag_type {
@@ -1135,6 +1219,8 @@ struct cache_tag {
unsigned int users;
};
+int cache_tag_assign(struct dmar_domain *domain, u16 did, struct device *dev,
+ ioasid_t pasid, enum cache_tag_type type);
int cache_tag_assign_domain(struct dmar_domain *domain,
struct device *dev, ioasid_t pasid);
void cache_tag_unassign_domain(struct dmar_domain *domain,
@@ -1145,18 +1231,57 @@ void cache_tag_flush_all(struct dmar_domain *domain);
void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start,
unsigned long end);
+void intel_context_flush_no_pasid(struct device_domain_info *info,
+ struct context_entry *context, u16 did);
+
+int intel_iommu_enable_prq(struct intel_iommu *iommu);
+int intel_iommu_finish_prq(struct intel_iommu *iommu);
+void intel_iommu_page_response(struct device *dev, struct iopf_fault *evt,
+ struct iommu_page_response *msg);
+void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid);
+
+int intel_iommu_enable_iopf(struct device *dev);
+void intel_iommu_disable_iopf(struct device *dev);
+
+static inline int iopf_for_domain_set(struct iommu_domain *domain,
+ struct device *dev)
+{
+ if (!domain || !domain->iopf_handler)
+ return 0;
+
+ return intel_iommu_enable_iopf(dev);
+}
+
+static inline void iopf_for_domain_remove(struct iommu_domain *domain,
+ struct device *dev)
+{
+ if (!domain || !domain->iopf_handler)
+ return;
+
+ intel_iommu_disable_iopf(dev);
+}
+
+static inline int iopf_for_domain_replace(struct iommu_domain *new,
+ struct iommu_domain *old,
+ struct device *dev)
+{
+ int ret;
+
+ ret = iopf_for_domain_set(new, dev);
+ if (ret)
+ return ret;
+
+ iopf_for_domain_remove(old, dev);
+
+ return 0;
+}
+
#ifdef CONFIG_INTEL_IOMMU_SVM
void intel_svm_check(struct intel_iommu *iommu);
-int intel_svm_enable_prq(struct intel_iommu *iommu);
-int intel_svm_finish_prq(struct intel_iommu *iommu);
-void intel_svm_page_response(struct device *dev, struct iopf_fault *evt,
- struct iommu_page_response *msg);
struct iommu_domain *intel_svm_domain_alloc(struct device *dev,
struct mm_struct *mm);
-void intel_drain_pasid_prq(struct device *dev, u32 pasid);
#else
static inline void intel_svm_check(struct intel_iommu *iommu) {}
-static inline void intel_drain_pasid_prq(struct device *dev, u32 pasid) {}
static inline struct iommu_domain *intel_svm_domain_alloc(struct device *dev,
struct mm_struct *mm)
{
@@ -1183,6 +1308,18 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
u8 devfn, int alloc);
extern const struct iommu_ops intel_iommu_ops;
+extern const struct iommu_domain_ops intel_fs_paging_domain_ops;
+extern const struct iommu_domain_ops intel_ss_paging_domain_ops;
+
+static inline bool intel_domain_is_fs_paging(struct dmar_domain *domain)
+{
+ return domain->domain.ops == &intel_fs_paging_domain_ops;
+}
+
+static inline bool intel_domain_is_ss_paging(struct dmar_domain *domain)
+{
+ return domain->domain.ops == &intel_ss_paging_domain_ops;
+}
#ifdef CONFIG_INTEL_IOMMU
extern int intel_iommu_sm;
diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
index e4a70886678c..4f9b01dc91e8 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -10,6 +10,7 @@
#include <linux/hpet.h>
#include <linux/pci.h>
#include <linux/irq.h>
+#include <linux/irqchip/irq-msi-lib.h>
#include <linux/acpi.h>
#include <linux/irqdomain.h>
#include <linux/crash_dump.h>
@@ -24,12 +25,6 @@
#include "iommu.h"
#include "../irq_remapping.h"
#include "../iommu-pages.h"
-#include "cap_audit.h"
-
-enum irq_mode {
- IRQ_REMAPPING,
- IRQ_POSTING,
-};
struct ioapic_scope {
struct intel_iommu *iommu;
@@ -50,8 +45,8 @@ struct irq_2_iommu {
u16 irte_index;
u16 sub_handle;
u8 irte_mask;
- enum irq_mode mode;
bool posted_msi;
+ bool posted_vcpu;
};
struct intel_ir_data {
@@ -139,7 +134,6 @@ static int alloc_irte(struct intel_iommu *iommu,
irq_iommu->irte_index = index;
irq_iommu->sub_handle = 0;
irq_iommu->irte_mask = mask;
- irq_iommu->mode = IRQ_REMAPPING;
}
raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
@@ -194,8 +188,6 @@ static int modify_irte(struct irq_2_iommu *irq_iommu,
rc = qi_flush_iec(iommu, index, 0);
- /* Update iommu mode according to the IRTE mode */
- irq_iommu->mode = irte->pst ? IRQ_POSTING : IRQ_REMAPPING;
raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
return rc;
@@ -312,7 +304,7 @@ static int set_ioapic_sid(struct irte *irte, int apic)
for (i = 0; i < MAX_IO_APICS; i++) {
if (ir_ioapic[i].iommu && ir_ioapic[i].id == apic) {
- sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn;
+ sid = PCI_DEVID(ir_ioapic[i].bus, ir_ioapic[i].devfn);
break;
}
}
@@ -337,7 +329,7 @@ static int set_hpet_sid(struct irte *irte, u8 id)
for (i = 0; i < MAX_HPET_TBS; i++) {
if (ir_hpet[i].iommu && ir_hpet[i].id == id) {
- sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn;
+ sid = PCI_DEVID(ir_hpet[i].bus, ir_hpet[i].devfn);
break;
}
}
@@ -527,8 +519,14 @@ static void iommu_enable_irq_remapping(struct intel_iommu *iommu)
static int intel_setup_irq_remapping(struct intel_iommu *iommu)
{
+ struct irq_domain_info info = {
+ .ops = &intel_ir_domain_ops,
+ .parent = arch_get_ir_parent_domain(),
+ .domain_flags = IRQ_DOMAIN_FLAG_ISOLATED_MSI,
+ .size = INTR_REMAP_TABLE_ENTRIES,
+ .host_data = iommu,
+ };
struct ir_table *ir_table;
- struct fwnode_handle *fn;
unsigned long *bitmap;
void *ir_table_base;
@@ -539,11 +537,11 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu)
if (!ir_table)
return -ENOMEM;
- ir_table_base = iommu_alloc_pages_node(iommu->node, GFP_KERNEL,
- INTR_REMAP_PAGE_ORDER);
+ /* 1MB - maximum possible interrupt remapping table size */
+ ir_table_base =
+ iommu_alloc_pages_node_sz(iommu->node, GFP_KERNEL, SZ_1M);
if (!ir_table_base) {
- pr_err("IR%d: failed to allocate pages of order %d\n",
- iommu->seq_id, INTR_REMAP_PAGE_ORDER);
+ pr_err("IR%d: failed to allocate 1M of pages\n", iommu->seq_id);
goto out_free_table;
}
@@ -553,25 +551,16 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu)
goto out_free_pages;
}
- fn = irq_domain_alloc_named_id_fwnode("INTEL-IR", iommu->seq_id);
- if (!fn)
+ info.fwnode = irq_domain_alloc_named_id_fwnode("INTEL-IR", iommu->seq_id);
+ if (!info.fwnode)
goto out_free_bitmap;
- iommu->ir_domain =
- irq_domain_create_hierarchy(arch_get_ir_parent_domain(),
- 0, INTR_REMAP_TABLE_ENTRIES,
- fn, &intel_ir_domain_ops,
- iommu);
+ iommu->ir_domain = msi_create_parent_irq_domain(&info, &dmar_msi_parent_ops);
if (!iommu->ir_domain) {
pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id);
goto out_free_fwnode;
}
- irq_domain_update_bus_token(iommu->ir_domain, DOMAIN_BUS_DMAR);
- iommu->ir_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT |
- IRQ_DOMAIN_FLAG_ISOLATED_MSI;
- iommu->ir_domain->msi_parent_ops = &dmar_msi_parent_ops;
-
ir_table->base = ir_table_base;
ir_table->bitmap = bitmap;
iommu->ir_table = ir_table;
@@ -597,8 +586,8 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu)
if (ir_pre_enabled(iommu)) {
if (!is_kdump_kernel()) {
- pr_warn("IRQ remapping was enabled on %s but we are not in kdump mode\n",
- iommu->name);
+ pr_info_once("IRQ remapping was enabled on %s but we are not in kdump mode\n",
+ iommu->name);
clear_ir_pre_enabled(iommu);
iommu_disable_irq_remapping(iommu);
} else if (iommu_load_old_irte(iommu))
@@ -617,11 +606,11 @@ out_free_ir_domain:
irq_domain_remove(iommu->ir_domain);
iommu->ir_domain = NULL;
out_free_fwnode:
- irq_domain_free_fwnode(fn);
+ irq_domain_free_fwnode(info.fwnode);
out_free_bitmap:
bitmap_free(bitmap);
out_free_pages:
- iommu_free_pages(ir_table_base, INTR_REMAP_PAGE_ORDER);
+ iommu_free_pages(ir_table_base);
out_free_table:
kfree(ir_table);
@@ -642,7 +631,7 @@ static void intel_teardown_irq_remapping(struct intel_iommu *iommu)
irq_domain_free_fwnode(fn);
iommu->ir_domain = NULL;
}
- iommu_free_pages(iommu->ir_table->base, INTR_REMAP_PAGE_ORDER);
+ iommu_free_pages(iommu->ir_table->base);
bitmap_free(iommu->ir_table->bitmap);
kfree(iommu->ir_table);
iommu->ir_table = NULL;
@@ -727,9 +716,6 @@ static int __init intel_prepare_irq_remapping(void)
if (dmar_table_init() < 0)
return -ENODEV;
- if (intel_cap_audit(CAP_AUDIT_STATIC_IRQR, NULL))
- return -ENODEV;
-
if (!dmar_ir_support())
return -ENODEV;
@@ -1173,7 +1159,26 @@ static void intel_ir_reconfigure_irte_posted(struct irq_data *irqd)
static inline void intel_ir_reconfigure_irte_posted(struct irq_data *irqd) {}
#endif
-static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force)
+static void __intel_ir_reconfigure_irte(struct irq_data *irqd, bool force_host)
+{
+ struct intel_ir_data *ir_data = irqd->chip_data;
+
+ /*
+ * Don't modify IRTEs for IRQs that are being posted to vCPUs if the
+ * host CPU affinity changes.
+ */
+ if (ir_data->irq_2_iommu.posted_vcpu && !force_host)
+ return;
+
+ ir_data->irq_2_iommu.posted_vcpu = false;
+
+ if (ir_data->irq_2_iommu.posted_msi)
+ intel_ir_reconfigure_irte_posted(irqd);
+ else
+ modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry);
+}
+
+static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force_host)
{
struct intel_ir_data *ir_data = irqd->chip_data;
struct irte *irte = &ir_data->irte_entry;
@@ -1186,10 +1191,7 @@ static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force)
irte->vector = cfg->vector;
irte->dest_id = IRTE_DEST(cfg->dest_apicid);
- if (ir_data->irq_2_iommu.posted_msi)
- intel_ir_reconfigure_irte_posted(irqd);
- else if (force || ir_data->irq_2_iommu.mode == IRQ_REMAPPING)
- modify_irte(&ir_data->irq_2_iommu, irte);
+ __intel_ir_reconfigure_irte(irqd, force_host);
}
/*
@@ -1240,11 +1242,11 @@ static void intel_ir_compose_msi_msg(struct irq_data *irq_data,
static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info)
{
struct intel_ir_data *ir_data = data->chip_data;
- struct vcpu_data *vcpu_pi_info = info;
+ struct intel_iommu_pi_data *pi_data = info;
/* stop posting interrupts, back to the default mode */
- if (!vcpu_pi_info) {
- modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry);
+ if (!pi_data) {
+ __intel_ir_reconfigure_irte(data, true);
} else {
struct irte irte_pi;
@@ -1261,12 +1263,13 @@ static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info)
/* Update the posted mode fields */
irte_pi.p_pst = 1;
irte_pi.p_urgent = 0;
- irte_pi.p_vector = vcpu_pi_info->vector;
- irte_pi.pda_l = (vcpu_pi_info->pi_desc_addr >>
+ irte_pi.p_vector = pi_data->vector;
+ irte_pi.pda_l = (pi_data->pi_desc_addr >>
(32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT);
- irte_pi.pda_h = (vcpu_pi_info->pi_desc_addr >> 32) &
+ irte_pi.pda_h = (pi_data->pi_desc_addr >> 32) &
~(-1UL << PDA_HIGH_BIT);
+ ir_data->irq_2_iommu.posted_vcpu = true;
modify_irte(&ir_data->irq_2_iommu, &irte_pi);
}
@@ -1282,43 +1285,44 @@ static struct irq_chip intel_ir_chip = {
};
/*
- * With posted MSIs, all vectors are multiplexed into a single notification
- * vector. Devices MSIs are then dispatched in a demux loop where
- * EOIs can be coalesced as well.
+ * With posted MSIs, the MSI vectors are multiplexed into a single notification
+ * vector, and only the notification vector is sent to the APIC IRR. Device
+ * MSIs are then dispatched in a demux loop that harvests the MSIs from the
+ * CPU's Posted Interrupt Request bitmap. I.e. Posted MSIs never get sent to
+ * the APIC IRR, and thus do not need an EOI. The notification handler instead
+ * performs a single EOI after processing the PIR.
*
- * "INTEL-IR-POST" IRQ chip does not do EOI on ACK, thus the dummy irq_ack()
- * function. Instead EOI is performed by the posted interrupt notification
- * handler.
+ * Note! Pending SMP/CPU affinity changes, which are per MSI, must still be
+ * honored, only the APIC EOI is omitted.
*
* For the example below, 3 MSIs are coalesced into one CPU notification. Only
- * one apic_eoi() is needed.
+ * one apic_eoi() is needed, but each MSI needs to process pending changes to
+ * its CPU affinity.
*
* __sysvec_posted_msi_notification()
* irq_enter();
* handle_edge_irq()
* irq_chip_ack_parent()
- * dummy(); // No EOI
+ * irq_move_irq(); // No EOI
* handle_irq_event()
* driver_handler()
* handle_edge_irq()
* irq_chip_ack_parent()
- * dummy(); // No EOI
+ * irq_move_irq(); // No EOI
* handle_irq_event()
* driver_handler()
* handle_edge_irq()
* irq_chip_ack_parent()
- * dummy(); // No EOI
+ * irq_move_irq(); // No EOI
* handle_irq_event()
* driver_handler()
* apic_eoi()
* irq_exit()
+ *
*/
-
-static void dummy_ack(struct irq_data *d) { }
-
static struct irq_chip intel_ir_chip_post_msi = {
.name = "INTEL-IR-POST",
- .irq_ack = dummy_ack,
+ .irq_ack = irq_move_irq,
.irq_set_affinity = intel_ir_set_affinity,
.irq_compose_msi_msg = intel_ir_compose_msi_msg,
.irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity,
@@ -1352,12 +1356,11 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
case X86_IRQ_ALLOC_TYPE_IOAPIC:
/* Set source-id of interrupt request */
set_ioapic_sid(irte, info->devid);
- apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n",
- info->devid, irte->present, irte->fpd,
- irte->dst_mode, irte->redir_hint,
- irte->trigger_mode, irte->dlvry_mode,
- irte->avail, irte->vector, irte->dest_id,
- irte->sid, irte->sq, irte->svt);
+ apic_pr_verbose("IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n",
+ info->devid, irte->present, irte->fpd, irte->dst_mode,
+ irte->redir_hint, irte->trigger_mode, irte->dlvry_mode,
+ irte->avail, irte->vector, irte->dest_id, irte->sid,
+ irte->sq, irte->svt);
sub_handle = info->ioapic.pin;
break;
case X86_IRQ_ALLOC_TYPE_HPET:
@@ -1464,7 +1467,6 @@ static int intel_irq_remapping_alloc(struct irq_domain *domain,
else
irq_data->chip = &intel_ir_chip;
intel_irq_remapping_prepare_irte(ird, irq_cfg, info, index, i);
- irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
}
return 0;
@@ -1495,6 +1497,9 @@ static void intel_irq_remapping_deactivate(struct irq_domain *domain,
struct intel_ir_data *data = irq_data->chip_data;
struct irte entry;
+ WARN_ON_ONCE(data->irq_2_iommu.posted_vcpu);
+ data->irq_2_iommu.posted_vcpu = false;
+
memset(&entry, 0, sizeof(entry));
modify_irte(&data->irq_2_iommu, &entry);
}
@@ -1523,6 +1528,8 @@ static const struct irq_domain_ops intel_ir_domain_ops = {
static const struct msi_parent_ops dmar_msi_parent_ops = {
.supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | MSI_FLAG_MULTI_PCI_MSI,
+ .bus_select_token = DOMAIN_BUS_DMAR,
+ .bus_select_mask = MATCH_PCI_MSI,
.prefix = "IR-",
.init_dev_msi_info = msi_parent_init_dev_msi_info,
};
@@ -1535,10 +1542,6 @@ static int dmar_ir_add(struct dmar_drhd_unit *dmaru, struct intel_iommu *iommu)
int ret;
int eim = x2apic_enabled();
- ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_IRQR, iommu);
- if (ret)
- return ret;
-
if (eim && !ecap_eim_support(iommu->ecap)) {
pr_info("DRHD %Lx: EIM not supported by DRHD, ecap %Lx\n",
iommu->reg_phys, iommu->ecap);
diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c
index 16a2bcf5cfeb..a3fb8c193ca6 100644
--- a/drivers/iommu/intel/nested.c
+++ b/drivers/iommu/intel/nested.c
@@ -19,7 +19,7 @@
#include "pasid.h"
static int intel_nested_attach_dev(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
@@ -27,20 +27,14 @@ static int intel_nested_attach_dev(struct iommu_domain *domain,
unsigned long flags;
int ret = 0;
- if (info->domain)
- device_block_translation(dev);
-
- if (iommu->agaw < dmar_domain->s2_domain->agaw) {
- dev_err_ratelimited(dev, "Adjusted guest address width not compatible\n");
- return -ENODEV;
- }
+ device_block_translation(dev);
/*
* Stage-1 domain cannot work alone, it is nested on a s2_domain.
* The s2_domain will be used in nested translation, hence needs
* to ensure the s2_domain is compatible with this IOMMU.
*/
- ret = prepare_domain_attach_device(&dmar_domain->s2_domain->domain, dev);
+ ret = paging_domain_compatible(&dmar_domain->s2_domain->domain, dev);
if (ret) {
dev_err_ratelimited(dev, "s2 domain is not compatible\n");
return ret;
@@ -56,19 +50,24 @@ static int intel_nested_attach_dev(struct iommu_domain *domain,
if (ret)
goto detach_iommu;
+ ret = iopf_for_domain_set(domain, dev);
+ if (ret)
+ goto unassign_tag;
+
ret = intel_pasid_setup_nested(iommu, dev,
IOMMU_NO_PASID, dmar_domain);
if (ret)
- goto unassign_tag;
+ goto disable_iopf;
info->domain = dmar_domain;
+ info->domain_attached = true;
spin_lock_irqsave(&dmar_domain->lock, flags);
list_add(&info->link, &dmar_domain->devices);
spin_unlock_irqrestore(&dmar_domain->lock, flags);
- domain_update_iotlb(dmar_domain);
-
return 0;
+disable_iopf:
+ iopf_for_domain_remove(domain, dev);
unassign_tag:
cache_tag_unassign_domain(dmar_domain, dev, IOMMU_NO_PASID);
detach_iommu:
@@ -85,6 +84,7 @@ static void intel_nested_domain_free(struct iommu_domain *domain)
spin_lock(&s2_domain->s1_lock);
list_del(&dmar_domain->s2_link);
spin_unlock(&s2_domain->s1_lock);
+ kfree(dmar_domain->qi_batch);
kfree(dmar_domain);
}
@@ -131,25 +131,87 @@ out:
return ret;
}
+static int domain_setup_nested(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old)
+{
+ if (!old)
+ return intel_pasid_setup_nested(iommu, dev, pasid, domain);
+ return intel_pasid_replace_nested(iommu, dev, pasid,
+ iommu_domain_did(old, iommu),
+ domain);
+}
+
+static int intel_nested_set_dev_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct intel_iommu *iommu = info->iommu;
+ struct dev_pasid_info *dev_pasid;
+ int ret;
+
+ if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
+ return -EOPNOTSUPP;
+
+ if (context_copied(iommu, info->bus, info->devfn))
+ return -EBUSY;
+
+ ret = paging_domain_compatible(&dmar_domain->s2_domain->domain, dev);
+ if (ret)
+ return ret;
+
+ dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
+ if (IS_ERR(dev_pasid))
+ return PTR_ERR(dev_pasid);
+
+ ret = iopf_for_domain_replace(domain, old, dev);
+ if (ret)
+ goto out_remove_dev_pasid;
+
+ ret = domain_setup_nested(iommu, dmar_domain, dev, pasid, old);
+ if (ret)
+ goto out_unwind_iopf;
+
+ domain_remove_dev_pasid(old, dev, pasid);
+
+ return 0;
+
+out_unwind_iopf:
+ iopf_for_domain_replace(old, domain, dev);
+out_remove_dev_pasid:
+ domain_remove_dev_pasid(domain, dev, pasid);
+ return ret;
+}
+
static const struct iommu_domain_ops intel_nested_domain_ops = {
.attach_dev = intel_nested_attach_dev,
+ .set_dev_pasid = intel_nested_set_dev_pasid,
.free = intel_nested_domain_free,
.cache_invalidate_user = intel_nested_cache_invalidate_user,
};
-struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
- const struct iommu_user_data *user_data)
+struct iommu_domain *
+intel_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
+ u32 flags,
+ const struct iommu_user_data *user_data)
{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
struct dmar_domain *s2_domain = to_dmar_domain(parent);
+ struct intel_iommu *iommu = info->iommu;
struct iommu_hwpt_vtd_s1 vtd;
struct dmar_domain *domain;
int ret;
+ if (!nested_supported(iommu) || flags & ~IOMMU_HWPT_ALLOC_PASID)
+ return ERR_PTR(-EOPNOTSUPP);
+
/* Must be nested domain */
if (user_data->type != IOMMU_HWPT_DATA_VTD_S1)
return ERR_PTR(-EOPNOTSUPP);
- if (parent->ops != intel_iommu_ops.default_domain_ops ||
- !s2_domain->nested_parent)
+ if (!intel_domain_is_ss_paging(s2_domain) || !s2_domain->nested_parent)
return ERR_PTR(-EINVAL);
ret = iommu_copy_struct_from_user(&vtd, user_data,
@@ -161,9 +223,7 @@ struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
if (!domain)
return ERR_PTR(-ENOMEM);
- domain->use_first_level = true;
domain->s2_domain = s2_domain;
- domain->s1_pgtbl = vtd.pgtbl_addr;
domain->s1_cfg = vtd;
domain->domain.ops = &intel_nested_domain_ops;
domain->domain.type = IOMMU_DOMAIN_NESTED;
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index abce19e2ad6f..3e2255057079 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -60,14 +60,14 @@ int intel_pasid_alloc_table(struct device *dev)
size = max_pasid >> (PASID_PDE_SHIFT - 3);
order = size ? get_order(size) : 0;
- dir = iommu_alloc_pages_node(info->iommu->node, GFP_KERNEL, order);
+ dir = iommu_alloc_pages_node_sz(info->iommu->node, GFP_KERNEL,
+ 1 << (order + PAGE_SHIFT));
if (!dir) {
kfree(pasid_table);
return -ENOMEM;
}
pasid_table->table = dir;
- pasid_table->order = order;
pasid_table->max_pasid = 1 << (order + PAGE_SHIFT + 3);
info->pasid_table = pasid_table;
@@ -97,10 +97,10 @@ void intel_pasid_free_table(struct device *dev)
max_pde = pasid_table->max_pasid >> PASID_PDE_SHIFT;
for (i = 0; i < max_pde; i++) {
table = get_pasid_table_from_pde(&dir[i]);
- iommu_free_page(table);
+ iommu_free_pages(table);
}
- iommu_free_pages(pasid_table->table, pasid_table->order);
+ iommu_free_pages(pasid_table->table);
kfree(pasid_table);
}
@@ -146,7 +146,10 @@ static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid)
retry:
entries = get_pasid_table_from_pde(&dir[dir_index]);
if (!entries) {
- entries = iommu_alloc_page_node(info->iommu->node, GFP_ATOMIC);
+ u64 tmp;
+
+ entries = iommu_alloc_pages_node_sz(info->iommu->node,
+ GFP_ATOMIC, SZ_4K);
if (!entries)
return NULL;
@@ -156,9 +159,10 @@ retry:
* clear. However, this entry might be populated by others
* while we are preparing it. Use theirs with a retry.
*/
- if (cmpxchg64(&dir[dir_index].val, 0ULL,
- (u64)virt_to_phys(entries) | PASID_PTE_PRESENT)) {
- iommu_free_page(entries);
+ tmp = 0ULL;
+ if (!try_cmpxchg64(&dir[dir_index].val, &tmp,
+ (u64)virt_to_phys(entries) | PASID_PTE_PRESENT)) {
+ iommu_free_pages(entries);
goto retry;
}
if (!ecap_coherent(info->iommu->ecap)) {
@@ -217,7 +221,7 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu,
if (pci_dev_is_disconnected(to_pci_dev(dev)))
return;
- sid = info->bus << 8 | info->devfn;
+ sid = PCI_DEVID(info->bus, info->devfn);
qdep = info->ats_qdep;
pfsid = info->pfsid;
@@ -241,11 +245,31 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev,
spin_lock(&iommu->lock);
pte = intel_pasid_get_entry(dev, pasid);
- if (WARN_ON(!pte) || !pasid_pte_is_present(pte)) {
+ if (WARN_ON(!pte)) {
spin_unlock(&iommu->lock);
return;
}
+ if (!pasid_pte_is_present(pte)) {
+ if (!pasid_pte_is_fault_disabled(pte)) {
+ WARN_ON(READ_ONCE(pte->val[0]) != 0);
+ spin_unlock(&iommu->lock);
+ return;
+ }
+
+ /*
+ * When a PASID is used for SVA by a device, it's possible
+ * that the pasid entry is non-present with the Fault
+ * Processing Disabled bit set. Clear the pasid entry and
+ * drain the PRQ for the PASID before return.
+ */
+ pasid_clear_entry(pte);
+ spin_unlock(&iommu->lock);
+ intel_iommu_drain_pasid_prq(dev, pasid);
+
+ return;
+ }
+
did = pasid_get_domain_id(pte);
pgtt = pasid_pte_get_pgtt(pte);
intel_pasid_clear_entry(dev, pasid, fault_ignore);
@@ -261,9 +285,9 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev,
else
iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
- /* Device IOTLB doesn't need to be flushed in caching mode. */
- if (!cap_caching_mode(iommu->cap))
- devtlb_invalidation_with_pasid(iommu, dev, pasid);
+ devtlb_invalidation_with_pasid(iommu, dev, pasid);
+ if (!fault_ignore)
+ intel_iommu_drain_pasid_prq(dev, pasid);
}
/*
@@ -286,12 +310,72 @@ static void pasid_flush_caches(struct intel_iommu *iommu,
}
/*
+ * This function is supposed to be used after caller updates the fields
+ * except for the SSADE and P bit of a pasid table entry. It does the
+ * below:
+ * - Flush cacheline if needed
+ * - Flush the caches per Table 28 ”Guidance to Software for Invalidations“
+ * of VT-d spec 5.0.
+ */
+static void intel_pasid_flush_present(struct intel_iommu *iommu,
+ struct device *dev,
+ u32 pasid, u16 did,
+ struct pasid_entry *pte)
+{
+ if (!ecap_coherent(iommu->ecap))
+ clflush_cache_range(pte, sizeof(*pte));
+
+ /*
+ * VT-d spec 5.0 table28 states guides for cache invalidation:
+ *
+ * - PASID-selective-within-Domain PASID-cache invalidation
+ * - PASID-selective PASID-based IOTLB invalidation
+ * - If (pasid is RID_PASID)
+ * - Global Device-TLB invalidation to affected functions
+ * Else
+ * - PASID-based Device-TLB invalidation (with S=1 and
+ * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions
+ */
+ pasid_cache_invalidation_with_pasid(iommu, did, pasid);
+ qi_flush_piotlb(iommu, did, pasid, 0, -1, 0);
+
+ devtlb_invalidation_with_pasid(iommu, dev, pasid);
+}
+
+/*
* Set up the scalable mode pasid table entry for first only
* translation type.
*/
-int intel_pasid_setup_first_level(struct intel_iommu *iommu,
- struct device *dev, pgd_t *pgd,
- u32 pasid, u16 did, int flags)
+static void pasid_pte_config_first_level(struct intel_iommu *iommu,
+ struct pasid_entry *pte,
+ phys_addr_t fsptptr, u16 did,
+ int flags)
+{
+ lockdep_assert_held(&iommu->lock);
+
+ pasid_clear_entry(pte);
+
+ /* Setup the first level page table pointer: */
+ pasid_set_flptr(pte, fsptptr);
+
+ if (flags & PASID_FLAG_FL5LP)
+ pasid_set_flpm(pte, 1);
+
+ if (flags & PASID_FLAG_PAGE_SNOOP)
+ pasid_set_pgsnp(pte);
+
+ pasid_set_domain_id(pte, did);
+ pasid_set_address_width(pte, iommu->agaw);
+ pasid_set_page_snoop(pte, flags & PASID_FLAG_PWSNP);
+
+ /* Setup Present and PASID Granular Transfer Type: */
+ pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY);
+ pasid_set_present(pte);
+}
+
+int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev,
+ phys_addr_t fsptptr, u32 pasid, u16 did,
+ int flags)
{
struct pasid_entry *pte;
@@ -319,64 +403,93 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu,
return -EBUSY;
}
- pasid_clear_entry(pte);
+ pasid_pte_config_first_level(iommu, pte, fsptptr, did, flags);
- /* Setup the first level page table pointer: */
- pasid_set_flptr(pte, (u64)__pa(pgd));
+ spin_unlock(&iommu->lock);
- if (flags & PASID_FLAG_FL5LP)
- pasid_set_flpm(pte, 1);
+ pasid_flush_caches(iommu, pte, pasid, did);
- if (flags & PASID_FLAG_PAGE_SNOOP)
- pasid_set_pgsnp(pte);
+ return 0;
+}
- pasid_set_domain_id(pte, did);
- pasid_set_address_width(pte, iommu->agaw);
- pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
- pasid_set_nxe(pte);
+int intel_pasid_replace_first_level(struct intel_iommu *iommu,
+ struct device *dev, phys_addr_t fsptptr,
+ u32 pasid, u16 did, u16 old_did,
+ int flags)
+{
+ struct pasid_entry *pte, new_pte;
- /* Setup Present and PASID Granular Transfer Type: */
- pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY);
- pasid_set_present(pte);
+ if (!ecap_flts(iommu->ecap)) {
+ pr_err("No first level translation support on %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
+
+ if ((flags & PASID_FLAG_FL5LP) && !cap_fl5lp_support(iommu->cap)) {
+ pr_err("No 5-level paging support for first-level on %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
+
+ pasid_pte_config_first_level(iommu, &new_pte, fsptptr, did, flags);
+
+ spin_lock(&iommu->lock);
+ pte = intel_pasid_get_entry(dev, pasid);
+ if (!pte) {
+ spin_unlock(&iommu->lock);
+ return -ENODEV;
+ }
+
+ if (!pasid_pte_is_present(pte)) {
+ spin_unlock(&iommu->lock);
+ return -EINVAL;
+ }
+
+ WARN_ON(old_did != pasid_get_domain_id(pte));
+
+ *pte = new_pte;
spin_unlock(&iommu->lock);
- pasid_flush_caches(iommu, pte, pasid, did);
+ intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
+ intel_iommu_drain_pasid_prq(dev, pasid);
return 0;
}
/*
- * Skip top levels of page tables for iommu which has less agaw
- * than default. Unnecessary for PT mode.
+ * Set up the scalable mode pasid entry for second only translation type.
*/
-static int iommu_skip_agaw(struct dmar_domain *domain,
- struct intel_iommu *iommu,
- struct dma_pte **pgd)
+static void pasid_pte_config_second_level(struct intel_iommu *iommu,
+ struct pasid_entry *pte,
+ struct dmar_domain *domain, u16 did)
{
- int agaw;
+ struct pt_iommu_vtdss_hw_info pt_info;
- for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
- *pgd = phys_to_virt(dma_pte_addr(*pgd));
- if (!dma_pte_present(*pgd))
- return -EINVAL;
- }
+ lockdep_assert_held(&iommu->lock);
- return agaw;
+ pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info);
+ pasid_clear_entry(pte);
+ pasid_set_domain_id(pte, did);
+ pasid_set_slptr(pte, pt_info.ssptptr);
+ pasid_set_address_width(pte, pt_info.aw);
+ pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY);
+ pasid_set_fault_enable(pte);
+ pasid_set_page_snoop(pte, !(domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)));
+ if (domain->dirty_tracking)
+ pasid_set_ssade(pte);
+
+ pasid_set_present(pte);
}
-/*
- * Set up the scalable mode pasid entry for second only translation type.
- */
int intel_pasid_setup_second_level(struct intel_iommu *iommu,
struct dmar_domain *domain,
struct device *dev, u32 pasid)
{
struct pasid_entry *pte;
- struct dma_pte *pgd;
- u64 pgd_val;
- int agaw;
u16 did;
+
/*
* If hardware advertises no support for second level
* translation, return directly.
@@ -387,16 +500,50 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
return -EINVAL;
}
- pgd = domain->pgd;
- agaw = iommu_skip_agaw(domain, iommu, &pgd);
- if (agaw < 0) {
- dev_err(dev, "Invalid domain page table\n");
+ did = domain_id_iommu(domain, iommu);
+
+ spin_lock(&iommu->lock);
+ pte = intel_pasid_get_entry(dev, pasid);
+ if (!pte) {
+ spin_unlock(&iommu->lock);
+ return -ENODEV;
+ }
+
+ if (pasid_pte_is_present(pte)) {
+ spin_unlock(&iommu->lock);
+ return -EBUSY;
+ }
+
+ pasid_pte_config_second_level(iommu, pte, domain, did);
+ spin_unlock(&iommu->lock);
+
+ pasid_flush_caches(iommu, pte, pasid, did);
+
+ return 0;
+}
+
+int intel_pasid_replace_second_level(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev, u16 old_did,
+ u32 pasid)
+{
+ struct pasid_entry *pte, new_pte;
+ u16 did;
+
+ /*
+ * If hardware advertises no support for second level
+ * translation, return directly.
+ */
+ if (!ecap_slts(iommu->ecap)) {
+ pr_err("No second level translation support on %s\n",
+ iommu->name);
return -EINVAL;
}
- pgd_val = virt_to_phys(pgd);
did = domain_id_iommu(domain, iommu);
+ pasid_pte_config_second_level(iommu, &new_pte, domain, did);
+
spin_lock(&iommu->lock);
pte = intel_pasid_get_entry(dev, pasid);
if (!pte) {
@@ -404,25 +551,18 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
return -ENODEV;
}
- if (pasid_pte_is_present(pte)) {
+ if (!pasid_pte_is_present(pte)) {
spin_unlock(&iommu->lock);
- return -EBUSY;
+ return -EINVAL;
}
- pasid_clear_entry(pte);
- pasid_set_domain_id(pte, did);
- pasid_set_slptr(pte, pgd_val);
- pasid_set_address_width(pte, agaw);
- pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY);
- pasid_set_fault_enable(pte);
- pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
- if (domain->dirty_tracking)
- pasid_set_ssade(pte);
+ WARN_ON(old_did != pasid_get_domain_id(pte));
- pasid_set_present(pte);
+ *pte = new_pte;
spin_unlock(&iommu->lock);
- pasid_flush_caches(iommu, pte, pasid, did);
+ intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
+ intel_iommu_drain_pasid_prq(dev, pasid);
return 0;
}
@@ -491,9 +631,7 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
- /* Device IOTLB doesn't need to be flushed in caching mode. */
- if (!cap_caching_mode(iommu->cap))
- devtlb_invalidation_with_pasid(iommu, dev, pasid);
+ devtlb_invalidation_with_pasid(iommu, dev, pasid);
return 0;
}
@@ -501,6 +639,20 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
/*
* Set up the scalable mode pasid entry for passthrough translation type.
*/
+static void pasid_pte_config_pass_through(struct intel_iommu *iommu,
+ struct pasid_entry *pte, u16 did)
+{
+ lockdep_assert_held(&iommu->lock);
+
+ pasid_clear_entry(pte);
+ pasid_set_domain_id(pte, did);
+ pasid_set_address_width(pte, iommu->agaw);
+ pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT);
+ pasid_set_fault_enable(pte);
+ pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+ pasid_set_present(pte);
+}
+
int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
struct device *dev, u32 pasid)
{
@@ -519,13 +671,7 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
return -EBUSY;
}
- pasid_clear_entry(pte);
- pasid_set_domain_id(pte, did);
- pasid_set_address_width(pte, iommu->agaw);
- pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT);
- pasid_set_fault_enable(pte);
- pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
- pasid_set_present(pte);
+ pasid_pte_config_pass_through(iommu, pte, did);
spin_unlock(&iommu->lock);
pasid_flush_caches(iommu, pte, pasid, did);
@@ -533,6 +679,38 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
return 0;
}
+int intel_pasid_replace_pass_through(struct intel_iommu *iommu,
+ struct device *dev, u16 old_did,
+ u32 pasid)
+{
+ struct pasid_entry *pte, new_pte;
+ u16 did = FLPT_DEFAULT_DID;
+
+ pasid_pte_config_pass_through(iommu, &new_pte, did);
+
+ spin_lock(&iommu->lock);
+ pte = intel_pasid_get_entry(dev, pasid);
+ if (!pte) {
+ spin_unlock(&iommu->lock);
+ return -ENODEV;
+ }
+
+ if (!pasid_pte_is_present(pte)) {
+ spin_unlock(&iommu->lock);
+ return -EINVAL;
+ }
+
+ WARN_ON(old_did != pasid_get_domain_id(pte));
+
+ *pte = new_pte;
+ spin_unlock(&iommu->lock);
+
+ intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
+ intel_iommu_drain_pasid_prq(dev, pasid);
+
+ return 0;
+}
+
/*
* Set the page snoop control for a pasid entry which has been set up.
*/
@@ -553,26 +731,50 @@ void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu,
did = pasid_get_domain_id(pte);
spin_unlock(&iommu->lock);
- if (!ecap_coherent(iommu->ecap))
- clflush_cache_range(pte, sizeof(*pte));
+ intel_pasid_flush_present(iommu, dev, pasid, did, pte);
+}
- /*
- * VT-d spec 3.4 table23 states guides for cache invalidation:
- *
- * - PASID-selective-within-Domain PASID-cache invalidation
- * - PASID-selective PASID-based IOTLB invalidation
- * - If (pasid is RID_PASID)
- * - Global Device-TLB invalidation to affected functions
- * Else
- * - PASID-based Device-TLB invalidation (with S=1 and
- * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions
- */
- pasid_cache_invalidation_with_pasid(iommu, did, pasid);
- qi_flush_piotlb(iommu, did, pasid, 0, -1, 0);
+static void pasid_pte_config_nestd(struct intel_iommu *iommu,
+ struct pasid_entry *pte,
+ struct iommu_hwpt_vtd_s1 *s1_cfg,
+ struct dmar_domain *s2_domain,
+ u16 did)
+{
+ struct pt_iommu_vtdss_hw_info pt_info;
+
+ lockdep_assert_held(&iommu->lock);
+
+ pt_iommu_vtdss_hw_info(&s2_domain->sspt, &pt_info);
+
+ pasid_clear_entry(pte);
- /* Device IOTLB doesn't need to be flushed in caching mode. */
- if (!cap_caching_mode(iommu->cap))
- devtlb_invalidation_with_pasid(iommu, dev, pasid);
+ if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL)
+ pasid_set_flpm(pte, 1);
+
+ pasid_set_flptr(pte, s1_cfg->pgtbl_addr);
+
+ if (s1_cfg->flags & IOMMU_VTD_S1_SRE) {
+ pasid_set_sre(pte);
+ if (s1_cfg->flags & IOMMU_VTD_S1_WPE)
+ pasid_set_wpe(pte);
+ }
+
+ if (s1_cfg->flags & IOMMU_VTD_S1_EAFE)
+ pasid_set_eafe(pte);
+
+ if (s2_domain->force_snooping)
+ pasid_set_pgsnp(pte);
+
+ pasid_set_slptr(pte, pt_info.ssptptr);
+ pasid_set_fault_enable(pte);
+ pasid_set_domain_id(pte, did);
+ pasid_set_address_width(pte, pt_info.aw);
+ pasid_set_page_snoop(pte, !(s2_domain->sspt.vtdss_pt.common.features &
+ BIT(PT_FEAT_DMA_INCOHERENT)));
+ if (s2_domain->dirty_tracking)
+ pasid_set_ssade(pte);
+ pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED);
+ pasid_set_present(pte);
}
/**
@@ -590,10 +792,8 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
u32 pasid, struct dmar_domain *domain)
{
struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg;
- pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl;
struct dmar_domain *s2_domain = domain->s2_domain;
u16 did = domain_id_iommu(domain, iommu);
- struct dma_pte *pgd = s2_domain->pgd;
struct pasid_entry *pte;
/* Address width should match the address width supported by hardware */
@@ -636,37 +836,73 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
return -EBUSY;
}
- pasid_clear_entry(pte);
+ pasid_pte_config_nestd(iommu, pte, s1_cfg, s2_domain, did);
+ spin_unlock(&iommu->lock);
- if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL)
- pasid_set_flpm(pte, 1);
+ pasid_flush_caches(iommu, pte, pasid, did);
- pasid_set_flptr(pte, (uintptr_t)s1_gpgd);
+ return 0;
+}
- if (s1_cfg->flags & IOMMU_VTD_S1_SRE) {
- pasid_set_sre(pte);
- if (s1_cfg->flags & IOMMU_VTD_S1_WPE)
- pasid_set_wpe(pte);
+int intel_pasid_replace_nested(struct intel_iommu *iommu,
+ struct device *dev, u32 pasid,
+ u16 old_did, struct dmar_domain *domain)
+{
+ struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg;
+ struct dmar_domain *s2_domain = domain->s2_domain;
+ u16 did = domain_id_iommu(domain, iommu);
+ struct pasid_entry *pte, new_pte;
+
+ /* Address width should match the address width supported by hardware */
+ switch (s1_cfg->addr_width) {
+ case ADDR_WIDTH_4LEVEL:
+ break;
+ case ADDR_WIDTH_5LEVEL:
+ if (!cap_fl5lp_support(iommu->cap)) {
+ dev_err_ratelimited(dev,
+ "5-level paging not supported\n");
+ return -EINVAL;
+ }
+ break;
+ default:
+ dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n",
+ s1_cfg->addr_width);
+ return -EINVAL;
}
- if (s1_cfg->flags & IOMMU_VTD_S1_EAFE)
- pasid_set_eafe(pte);
+ if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) {
+ pr_err_ratelimited("No supervisor request support on %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
- if (s2_domain->force_snooping)
- pasid_set_pgsnp(pte);
+ if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) {
+ pr_err_ratelimited("No extended access flag support on %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
- pasid_set_slptr(pte, virt_to_phys(pgd));
- pasid_set_fault_enable(pte);
- pasid_set_domain_id(pte, did);
- pasid_set_address_width(pte, s2_domain->agaw);
- pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
- if (s2_domain->dirty_tracking)
- pasid_set_ssade(pte);
- pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED);
- pasid_set_present(pte);
+ pasid_pte_config_nestd(iommu, &new_pte, s1_cfg, s2_domain, did);
+
+ spin_lock(&iommu->lock);
+ pte = intel_pasid_get_entry(dev, pasid);
+ if (!pte) {
+ spin_unlock(&iommu->lock);
+ return -ENODEV;
+ }
+
+ if (!pasid_pte_is_present(pte)) {
+ spin_unlock(&iommu->lock);
+ return -EINVAL;
+ }
+
+ WARN_ON(old_did != pasid_get_domain_id(pte));
+
+ *pte = new_pte;
spin_unlock(&iommu->lock);
- pasid_flush_caches(iommu, pte, pasid, did);
+ intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
+ intel_iommu_drain_pasid_prq(dev, pasid);
return 0;
}
@@ -681,6 +917,7 @@ static void device_pasid_table_teardown(struct device *dev, u8 bus, u8 devfn)
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct intel_iommu *iommu = info->iommu;
struct context_entry *context;
+ u16 did;
spin_lock(&iommu->lock);
context = iommu_context_addr(iommu, bus, devfn, false);
@@ -689,28 +926,11 @@ static void device_pasid_table_teardown(struct device *dev, u8 bus, u8 devfn)
return;
}
+ did = context_domain_id(context);
context_clear_entry(context);
__iommu_flush_cache(iommu, context, sizeof(*context));
spin_unlock(&iommu->lock);
-
- /*
- * Cache invalidation for changes to a scalable-mode context table
- * entry.
- *
- * Section 6.5.3.3 of the VT-d spec:
- * - Device-selective context-cache invalidation;
- * - Domain-selective PASID-cache invalidation to affected domains
- * (can be skipped if all PASID entries were not-present);
- * - Domain-selective IOTLB invalidation to affected domains;
- * - Global Device-TLB invalidation to affected functions.
- *
- * The iommu has been parked in the blocking state. All domains have
- * been detached from the device or PASID. The PASID and IOTLB caches
- * have been invalidated during the domain detach path.
- */
- iommu->flush.flush_context(iommu, 0, PCI_DEVID(bus, devfn),
- DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL);
- devtlb_invalidation_with_pasid(iommu, dev, IOMMU_NO_PASID);
+ intel_context_flush_no_pasid(info, context, did);
}
static int pci_pasid_table_teardown(struct pci_dev *pdev, u16 alias, void *data)
@@ -768,10 +988,10 @@ static int context_entry_set_pasid_table(struct context_entry *context,
if (info->ats_supported)
context_set_sm_dte(context);
- if (info->pri_supported)
- context_set_sm_pre(context);
if (info->pasid_supported)
context_set_pasid(context);
+ if (info->pri_supported)
+ context_set_sm_pre(context);
context_set_fault_enable(context);
context_set_present(context);
@@ -872,3 +1092,61 @@ int intel_pasid_setup_sm_context(struct device *dev)
return pci_for_each_dma_alias(to_pci_dev(dev), pci_pasid_table_setup, dev);
}
+
+/*
+ * Global Device-TLB invalidation following changes in a context entry which
+ * was present.
+ */
+static void __context_flush_dev_iotlb(struct device_domain_info *info)
+{
+ if (!info->ats_enabled)
+ return;
+
+ qi_flush_dev_iotlb(info->iommu, PCI_DEVID(info->bus, info->devfn),
+ info->pfsid, info->ats_qdep, 0, MAX_AGAW_PFN_WIDTH);
+
+ /*
+ * There is no guarantee that the device DMA is stopped when it reaches
+ * here. Therefore, always attempt the extra device TLB invalidation
+ * quirk. The impact on performance is acceptable since this is not a
+ * performance-critical path.
+ */
+ quirk_extra_dev_tlb_flush(info, 0, MAX_AGAW_PFN_WIDTH, IOMMU_NO_PASID,
+ info->ats_qdep);
+}
+
+/*
+ * Cache invalidations after change in a context table entry that was present
+ * according to the Spec 6.5.3.3 (Guidance to Software for Invalidations).
+ * This helper can only be used when IOMMU is working in the legacy mode or
+ * IOMMU is in scalable mode but all PASID table entries of the device are
+ * non-present.
+ */
+void intel_context_flush_no_pasid(struct device_domain_info *info,
+ struct context_entry *context, u16 did)
+{
+ struct intel_iommu *iommu = info->iommu;
+
+ /*
+ * Device-selective context-cache invalidation. The Domain-ID field
+ * of the Context-cache Invalidate Descriptor is ignored by hardware
+ * when operating in scalable mode. Therefore the @did value doesn't
+ * matter in scalable mode.
+ */
+ iommu->flush.flush_context(iommu, did, PCI_DEVID(info->bus, info->devfn),
+ DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL);
+
+ /*
+ * For legacy mode:
+ * - Domain-selective IOTLB invalidation
+ * - Global Device-TLB invalidation to all affected functions
+ */
+ if (!sm_supported(iommu)) {
+ iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
+ __context_flush_dev_iotlb(info);
+
+ return;
+ }
+
+ __context_flush_dev_iotlb(info);
+}
diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
index da9978fef7ac..b4c85242dc79 100644
--- a/drivers/iommu/intel/pasid.h
+++ b/drivers/iommu/intel/pasid.h
@@ -22,15 +22,9 @@
#define is_pasid_enabled(entry) (((entry)->lo >> 3) & 0x1)
#define get_pasid_dir_size(entry) (1 << ((((entry)->lo >> 9) & 0x7) + 7))
-/*
- * Domain ID reserved for pasid entries programmed for first-level
- * only and pass-through transfer modes.
- */
-#define FLPT_DEFAULT_DID 1
-#define NUM_RESERVED_DID 2
-
#define PASID_FLAG_NESTED BIT(1)
#define PASID_FLAG_PAGE_SNOOP BIT(2)
+#define PASID_FLAG_PWSNP BIT(2)
/*
* The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first-
@@ -54,7 +48,6 @@ struct pasid_entry {
/* The representative of a PASID table */
struct pasid_table {
void *table; /* pasid table pointer */
- int order; /* page order of pasid table */
u32 max_pasid; /* max pasid */
};
@@ -80,6 +73,12 @@ static inline bool pasid_pte_is_present(struct pasid_entry *pte)
return READ_ONCE(pte->val[0]) & PASID_PTE_PRESENT;
}
+/* Get FPD(Fault Processing Disable) bit of a PASID table entry */
+static inline bool pasid_pte_is_fault_disabled(struct pasid_entry *pte)
+{
+ return READ_ONCE(pte->val[0]) & PASID_PTE_FPD;
+}
+
/* Get PGTT field of a PASID table entry */
static inline u16 pasid_pte_get_pgtt(struct pasid_entry *pte)
{
@@ -248,16 +247,6 @@ static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value)
}
/*
- * Setup No Execute Enable bit (Bit 133) of a scalable mode PASID
- * entry. It is required when XD bit of the first level page table
- * entry is about to be set.
- */
-static inline void pasid_set_nxe(struct pasid_entry *pe)
-{
- pasid_set_bits(&pe->val[2], 1 << 5, 1 << 5);
-}
-
-/*
* Setup the Page Snoop (PGSNP) field (Bit 88) of a scalable mode
* PASID entry.
*/
@@ -300,9 +289,9 @@ extern unsigned int intel_pasid_max_id;
int intel_pasid_alloc_table(struct device *dev);
void intel_pasid_free_table(struct device *dev);
struct pasid_table *intel_pasid_get_table(struct device *dev);
-int intel_pasid_setup_first_level(struct intel_iommu *iommu,
- struct device *dev, pgd_t *pgd,
- u32 pasid, u16 did, int flags);
+int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev,
+ phys_addr_t fsptptr, u32 pasid, u16 did,
+ int flags);
int intel_pasid_setup_second_level(struct intel_iommu *iommu,
struct dmar_domain *domain,
struct device *dev, u32 pasid);
@@ -313,6 +302,20 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
struct device *dev, u32 pasid);
int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
u32 pasid, struct dmar_domain *domain);
+int intel_pasid_replace_first_level(struct intel_iommu *iommu,
+ struct device *dev, phys_addr_t fsptptr,
+ u32 pasid, u16 did, u16 old_did, int flags);
+int intel_pasid_replace_second_level(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev, u16 old_did,
+ u32 pasid);
+int intel_pasid_replace_pass_through(struct intel_iommu *iommu,
+ struct device *dev, u16 old_did,
+ u32 pasid);
+int intel_pasid_replace_nested(struct intel_iommu *iommu,
+ struct device *dev, u32 pasid,
+ u16 old_did, struct dmar_domain *domain);
+
void intel_pasid_tear_down_entry(struct intel_iommu *iommu,
struct device *dev, u32 pasid,
bool fault_ignore);
diff --git a/drivers/iommu/intel/perf.c b/drivers/iommu/intel/perf.c
index adc4de6bbd88..dceeadc3ee7c 100644
--- a/drivers/iommu/intel/perf.c
+++ b/drivers/iommu/intel/perf.c
@@ -113,7 +113,7 @@ static char *latency_type_names[] = {
" svm_prq"
};
-int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size)
+void dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size)
{
struct latency_statistic *lstat = iommu->perf_statistic;
unsigned long flags;
@@ -122,7 +122,7 @@ int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size)
memset(str, 0, size);
for (i = 0; i < COUNTS_NUM; i++)
- bytes += snprintf(str + bytes, size - bytes,
+ bytes += scnprintf(str + bytes, size - bytes,
"%s", latency_counter_names[i]);
spin_lock_irqsave(&latency_lock, flags);
@@ -130,7 +130,7 @@ int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size)
if (!dmar_latency_enabled(iommu, i))
continue;
- bytes += snprintf(str + bytes, size - bytes,
+ bytes += scnprintf(str + bytes, size - bytes,
"\n%s", latency_type_names[i]);
for (j = 0; j < COUNTS_NUM; j++) {
@@ -156,11 +156,9 @@ int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size)
break;
}
- bytes += snprintf(str + bytes, size - bytes,
+ bytes += scnprintf(str + bytes, size - bytes,
"%12lld", val);
}
}
spin_unlock_irqrestore(&latency_lock, flags);
-
- return bytes;
}
diff --git a/drivers/iommu/intel/perf.h b/drivers/iommu/intel/perf.h
index df9a36942d64..1d4baad7e852 100644
--- a/drivers/iommu/intel/perf.h
+++ b/drivers/iommu/intel/perf.h
@@ -40,7 +40,7 @@ void dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type);
bool dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type);
void dmar_latency_update(struct intel_iommu *iommu, enum latency_type type,
u64 latency);
-int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size);
+void dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size);
#else
static inline int
dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type)
@@ -64,9 +64,8 @@ dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 laten
{
}
-static inline int
+static inline void
dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size)
{
- return 0;
}
#endif /* CONFIG_DMAR_PERF */
diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c
index 44083d01852d..75f493bcb353 100644
--- a/drivers/iommu/intel/perfmon.c
+++ b/drivers/iommu/intel/perfmon.c
@@ -34,28 +34,9 @@ static struct attribute_group iommu_pmu_events_attr_group = {
.attrs = attrs_empty,
};
-static cpumask_t iommu_pmu_cpu_mask;
-
-static ssize_t
-cpumask_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
- return cpumap_print_to_pagebuf(true, buf, &iommu_pmu_cpu_mask);
-}
-static DEVICE_ATTR_RO(cpumask);
-
-static struct attribute *iommu_pmu_cpumask_attrs[] = {
- &dev_attr_cpumask.attr,
- NULL
-};
-
-static struct attribute_group iommu_pmu_cpumask_attr_group = {
- .attrs = iommu_pmu_cpumask_attrs,
-};
-
static const struct attribute_group *iommu_pmu_attr_groups[] = {
&iommu_pmu_format_attr_group,
&iommu_pmu_events_attr_group,
- &iommu_pmu_cpumask_attr_group,
NULL
};
@@ -565,6 +546,7 @@ static int __iommu_pmu_register(struct intel_iommu *iommu)
iommu_pmu->pmu.attr_groups = iommu_pmu_attr_groups;
iommu_pmu->pmu.attr_update = iommu_pmu_attr_update;
iommu_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
+ iommu_pmu->pmu.scope = PERF_PMU_SCOPE_SYS_WIDE;
iommu_pmu->pmu.module = THIS_MODULE;
return perf_pmu_register(&iommu_pmu->pmu, iommu_pmu->pmu.name, -1);
@@ -773,89 +755,6 @@ static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu)
iommu->perf_irq = 0;
}
-static int iommu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
-{
- struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node);
-
- if (cpumask_empty(&iommu_pmu_cpu_mask))
- cpumask_set_cpu(cpu, &iommu_pmu_cpu_mask);
-
- if (cpumask_test_cpu(cpu, &iommu_pmu_cpu_mask))
- iommu_pmu->cpu = cpu;
-
- return 0;
-}
-
-static int iommu_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
-{
- struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node);
- int target = cpumask_first(&iommu_pmu_cpu_mask);
-
- /*
- * The iommu_pmu_cpu_mask has been updated when offline the CPU
- * for the first iommu_pmu. Migrate the other iommu_pmu to the
- * new target.
- */
- if (target < nr_cpu_ids && target != iommu_pmu->cpu) {
- perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target);
- iommu_pmu->cpu = target;
- return 0;
- }
-
- if (!cpumask_test_and_clear_cpu(cpu, &iommu_pmu_cpu_mask))
- return 0;
-
- target = cpumask_any_but(cpu_online_mask, cpu);
-
- if (target < nr_cpu_ids)
- cpumask_set_cpu(target, &iommu_pmu_cpu_mask);
- else
- return 0;
-
- perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target);
- iommu_pmu->cpu = target;
-
- return 0;
-}
-
-static int nr_iommu_pmu;
-static enum cpuhp_state iommu_cpuhp_slot;
-
-static int iommu_pmu_cpuhp_setup(struct iommu_pmu *iommu_pmu)
-{
- int ret;
-
- if (!nr_iommu_pmu) {
- ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
- "driver/iommu/intel/perfmon:online",
- iommu_pmu_cpu_online,
- iommu_pmu_cpu_offline);
- if (ret < 0)
- return ret;
- iommu_cpuhp_slot = ret;
- }
-
- ret = cpuhp_state_add_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node);
- if (ret) {
- if (!nr_iommu_pmu)
- cpuhp_remove_multi_state(iommu_cpuhp_slot);
- return ret;
- }
- nr_iommu_pmu++;
-
- return 0;
-}
-
-static void iommu_pmu_cpuhp_free(struct iommu_pmu *iommu_pmu)
-{
- cpuhp_state_remove_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node);
-
- if (--nr_iommu_pmu)
- return;
-
- cpuhp_remove_multi_state(iommu_cpuhp_slot);
-}
-
void iommu_pmu_register(struct intel_iommu *iommu)
{
struct iommu_pmu *iommu_pmu = iommu->pmu;
@@ -866,17 +765,12 @@ void iommu_pmu_register(struct intel_iommu *iommu)
if (__iommu_pmu_register(iommu))
goto err;
- if (iommu_pmu_cpuhp_setup(iommu_pmu))
- goto unregister;
-
/* Set interrupt for overflow */
if (iommu_pmu_set_interrupt(iommu))
- goto cpuhp_free;
+ goto unregister;
return;
-cpuhp_free:
- iommu_pmu_cpuhp_free(iommu_pmu);
unregister:
perf_pmu_unregister(&iommu_pmu->pmu);
err:
@@ -892,6 +786,5 @@ void iommu_pmu_unregister(struct intel_iommu *iommu)
return;
iommu_pmu_unset_interrupt(iommu);
- iommu_pmu_cpuhp_free(iommu_pmu);
perf_pmu_unregister(&iommu_pmu->pmu);
}
diff --git a/drivers/iommu/intel/prq.c b/drivers/iommu/intel/prq.c
new file mode 100644
index 000000000000..ff63c228e6e1
--- /dev/null
+++ b/drivers/iommu/intel/prq.c
@@ -0,0 +1,396 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015 Intel Corporation
+ *
+ * Originally split from drivers/iommu/intel/svm.c
+ */
+
+#include <linux/pci.h>
+#include <linux/pci-ats.h>
+
+#include "iommu.h"
+#include "pasid.h"
+#include "../iommu-pages.h"
+#include "trace.h"
+
+/* Page request queue descriptor */
+struct page_req_dsc {
+ union {
+ struct {
+ u64 type:8;
+ u64 pasid_present:1;
+ u64 rsvd:7;
+ u64 rid:16;
+ u64 pasid:20;
+ u64 exe_req:1;
+ u64 pm_req:1;
+ u64 rsvd2:10;
+ };
+ u64 qw_0;
+ };
+ union {
+ struct {
+ u64 rd_req:1;
+ u64 wr_req:1;
+ u64 lpig:1;
+ u64 prg_index:9;
+ u64 addr:52;
+ };
+ u64 qw_1;
+ };
+ u64 qw_2;
+ u64 qw_3;
+};
+
+/**
+ * intel_iommu_drain_pasid_prq - Drain page requests and responses for a pasid
+ * @dev: target device
+ * @pasid: pasid for draining
+ *
+ * Drain all pending page requests and responses related to @pasid in both
+ * software and hardware. This is supposed to be called after the device
+ * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
+ * and DevTLB have been invalidated.
+ *
+ * It waits until all pending page requests for @pasid in the page fault
+ * queue are completed by the prq handling thread. Then follow the steps
+ * described in VT-d spec CH7.10 to drain all page requests and page
+ * responses pending in the hardware.
+ */
+void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid)
+{
+ struct device_domain_info *info;
+ struct dmar_domain *domain;
+ struct intel_iommu *iommu;
+ struct qi_desc desc[3];
+ int head, tail;
+ u16 sid, did;
+
+ info = dev_iommu_priv_get(dev);
+ if (!info->iopf_refcount)
+ return;
+
+ iommu = info->iommu;
+ domain = info->domain;
+ sid = PCI_DEVID(info->bus, info->devfn);
+ did = domain ? domain_id_iommu(domain, iommu) : FLPT_DEFAULT_DID;
+
+ /*
+ * Check and wait until all pending page requests in the queue are
+ * handled by the prq handling thread.
+ */
+prq_retry:
+ reinit_completion(&iommu->prq_complete);
+ tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
+ head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
+ while (head != tail) {
+ struct page_req_dsc *req;
+
+ req = &iommu->prq[head / sizeof(*req)];
+ if (req->rid != sid ||
+ (req->pasid_present && pasid != req->pasid) ||
+ (!req->pasid_present && pasid != IOMMU_NO_PASID)) {
+ head = (head + sizeof(*req)) & PRQ_RING_MASK;
+ continue;
+ }
+
+ wait_for_completion(&iommu->prq_complete);
+ goto prq_retry;
+ }
+
+ iopf_queue_flush_dev(dev);
+
+ /*
+ * Perform steps described in VT-d spec CH7.10 to drain page
+ * requests and responses in hardware.
+ */
+ memset(desc, 0, sizeof(desc));
+ desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
+ QI_IWD_FENCE |
+ QI_IWD_TYPE;
+ if (pasid == IOMMU_NO_PASID) {
+ qi_desc_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH, &desc[1]);
+ qi_desc_dev_iotlb(sid, info->pfsid, info->ats_qdep, 0,
+ MAX_AGAW_PFN_WIDTH, &desc[2]);
+ } else {
+ qi_desc_piotlb(did, pasid, 0, -1, 0, &desc[1]);
+ qi_desc_dev_iotlb_pasid(sid, info->pfsid, pasid, info->ats_qdep,
+ 0, MAX_AGAW_PFN_WIDTH, &desc[2]);
+ }
+qi_retry:
+ reinit_completion(&iommu->prq_complete);
+ qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
+ if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
+ wait_for_completion(&iommu->prq_complete);
+ goto qi_retry;
+ }
+}
+
+static bool is_canonical_address(u64 addr)
+{
+ int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
+ long saddr = (long)addr;
+
+ return (((saddr << shift) >> shift) == saddr);
+}
+
+static void handle_bad_prq_event(struct intel_iommu *iommu,
+ struct page_req_dsc *req, int result)
+{
+ struct qi_desc desc = { };
+
+ pr_err("%s: Invalid page request: %08llx %08llx\n",
+ iommu->name, ((unsigned long long *)req)[0],
+ ((unsigned long long *)req)[1]);
+
+ if (!req->lpig)
+ return;
+
+ desc.qw0 = QI_PGRP_PASID(req->pasid) |
+ QI_PGRP_DID(req->rid) |
+ QI_PGRP_PASID_P(req->pasid_present) |
+ QI_PGRP_RESP_CODE(result) |
+ QI_PGRP_RESP_TYPE;
+ desc.qw1 = QI_PGRP_IDX(req->prg_index);
+
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
+
+static int prq_to_iommu_prot(struct page_req_dsc *req)
+{
+ int prot = 0;
+
+ if (req->rd_req)
+ prot |= IOMMU_FAULT_PERM_READ;
+ if (req->wr_req)
+ prot |= IOMMU_FAULT_PERM_WRITE;
+ if (req->exe_req)
+ prot |= IOMMU_FAULT_PERM_EXEC;
+ if (req->pm_req)
+ prot |= IOMMU_FAULT_PERM_PRIV;
+
+ return prot;
+}
+
+static void intel_prq_report(struct intel_iommu *iommu, struct device *dev,
+ struct page_req_dsc *desc)
+{
+ struct iopf_fault event = { };
+
+ /* Fill in event data for device specific processing */
+ event.fault.type = IOMMU_FAULT_PAGE_REQ;
+ event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT;
+ event.fault.prm.pasid = desc->pasid;
+ event.fault.prm.grpid = desc->prg_index;
+ event.fault.prm.perm = prq_to_iommu_prot(desc);
+
+ if (desc->lpig)
+ event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
+ if (desc->pasid_present) {
+ event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
+ event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
+ }
+
+ iommu_report_device_fault(dev, &event);
+}
+
+static irqreturn_t prq_event_thread(int irq, void *d)
+{
+ struct intel_iommu *iommu = d;
+ struct page_req_dsc *req;
+ int head, tail, handled;
+ struct device *dev;
+ u64 address;
+
+ /*
+ * Clear PPR bit before reading head/tail registers, to ensure that
+ * we get a new interrupt if needed.
+ */
+ writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);
+
+ tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
+ head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
+ handled = (head != tail);
+ while (head != tail) {
+ req = &iommu->prq[head / sizeof(*req)];
+ address = (u64)req->addr << VTD_PAGE_SHIFT;
+
+ if (unlikely(!is_canonical_address(address))) {
+ pr_err("IOMMU: %s: Address is not canonical\n",
+ iommu->name);
+bad_req:
+ handle_bad_prq_event(iommu, req, QI_RESP_INVALID);
+ goto prq_advance;
+ }
+
+ if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) {
+ pr_err("IOMMU: %s: Page request in Privilege Mode\n",
+ iommu->name);
+ goto bad_req;
+ }
+
+ if (unlikely(req->exe_req && req->rd_req)) {
+ pr_err("IOMMU: %s: Execution request not supported\n",
+ iommu->name);
+ goto bad_req;
+ }
+
+ /* Drop Stop Marker message. No need for a response. */
+ if (unlikely(req->lpig && !req->rd_req && !req->wr_req))
+ goto prq_advance;
+
+ /*
+ * If prq is to be handled outside iommu driver via receiver of
+ * the fault notifiers, we skip the page response here.
+ */
+ mutex_lock(&iommu->iopf_lock);
+ dev = device_rbtree_find(iommu, req->rid);
+ if (!dev) {
+ mutex_unlock(&iommu->iopf_lock);
+ goto bad_req;
+ }
+
+ intel_prq_report(iommu, dev, req);
+ trace_prq_report(iommu, dev, req->qw_0, req->qw_1,
+ req->qw_2, req->qw_3,
+ iommu->prq_seq_number++);
+ mutex_unlock(&iommu->iopf_lock);
+prq_advance:
+ head = (head + sizeof(*req)) & PRQ_RING_MASK;
+ }
+
+ dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);
+
+ /*
+ * Clear the page request overflow bit and wake up all threads that
+ * are waiting for the completion of this handling.
+ */
+ if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
+ pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n",
+ iommu->name);
+ head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
+ tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
+ if (head == tail) {
+ iopf_queue_discard_partial(iommu->iopf_queue);
+ writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);
+ pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared",
+ iommu->name);
+ }
+ }
+
+ if (!completion_done(&iommu->prq_complete))
+ complete(&iommu->prq_complete);
+
+ return IRQ_RETVAL(handled);
+}
+
+int intel_iommu_enable_prq(struct intel_iommu *iommu)
+{
+ struct iopf_queue *iopfq;
+ int irq, ret;
+
+ iommu->prq =
+ iommu_alloc_pages_node_sz(iommu->node, GFP_KERNEL, PRQ_SIZE);
+ if (!iommu->prq) {
+ pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
+ iommu->name);
+ return -ENOMEM;
+ }
+
+ irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu);
+ if (irq <= 0) {
+ pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
+ iommu->name);
+ ret = -EINVAL;
+ goto free_prq;
+ }
+ iommu->pr_irq = irq;
+
+ snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name),
+ "dmar%d-iopfq", iommu->seq_id);
+ iopfq = iopf_queue_alloc(iommu->iopfq_name);
+ if (!iopfq) {
+ pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name);
+ ret = -ENOMEM;
+ goto free_hwirq;
+ }
+ iommu->iopf_queue = iopfq;
+
+ snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);
+
+ ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
+ iommu->prq_name, iommu);
+ if (ret) {
+ pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
+ iommu->name);
+ goto free_iopfq;
+ }
+ dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
+ dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
+ dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);
+
+ init_completion(&iommu->prq_complete);
+
+ return 0;
+
+free_iopfq:
+ iopf_queue_free(iommu->iopf_queue);
+ iommu->iopf_queue = NULL;
+free_hwirq:
+ dmar_free_hwirq(irq);
+ iommu->pr_irq = 0;
+free_prq:
+ iommu_free_pages(iommu->prq);
+ iommu->prq = NULL;
+
+ return ret;
+}
+
+int intel_iommu_finish_prq(struct intel_iommu *iommu)
+{
+ dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
+ dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
+ dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);
+
+ if (iommu->pr_irq) {
+ free_irq(iommu->pr_irq, iommu);
+ dmar_free_hwirq(iommu->pr_irq);
+ iommu->pr_irq = 0;
+ }
+
+ if (iommu->iopf_queue) {
+ iopf_queue_free(iommu->iopf_queue);
+ iommu->iopf_queue = NULL;
+ }
+
+ iommu_free_pages(iommu->prq);
+ iommu->prq = NULL;
+
+ return 0;
+}
+
+void intel_iommu_page_response(struct device *dev, struct iopf_fault *evt,
+ struct iommu_page_response *msg)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct intel_iommu *iommu = info->iommu;
+ u8 bus = info->bus, devfn = info->devfn;
+ struct iommu_fault_page_request *prm;
+ struct qi_desc desc;
+ bool pasid_present;
+ u16 sid;
+
+ prm = &evt->fault.prm;
+ sid = PCI_DEVID(bus, devfn);
+ pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
+
+ desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) |
+ QI_PGRP_PASID_P(pasid_present) |
+ QI_PGRP_RESP_CODE(msg->code) |
+ QI_PGRP_RESP_TYPE;
+ desc.qw1 = QI_PGRP_IDX(prm->grpid);
+ desc.qw2 = 0;
+ desc.qw3 = 0;
+
+ qi_submit_sync(iommu, &desc, 1, 0);
+}
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 0e3a9b38bef2..71de7947971f 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -25,92 +25,6 @@
#include "../iommu-pages.h"
#include "trace.h"
-static irqreturn_t prq_event_thread(int irq, void *d);
-
-int intel_svm_enable_prq(struct intel_iommu *iommu)
-{
- struct iopf_queue *iopfq;
- int irq, ret;
-
- iommu->prq = iommu_alloc_pages_node(iommu->node, GFP_KERNEL, PRQ_ORDER);
- if (!iommu->prq) {
- pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
- iommu->name);
- return -ENOMEM;
- }
-
- irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu);
- if (irq <= 0) {
- pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
- iommu->name);
- ret = -EINVAL;
- goto free_prq;
- }
- iommu->pr_irq = irq;
-
- snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name),
- "dmar%d-iopfq", iommu->seq_id);
- iopfq = iopf_queue_alloc(iommu->iopfq_name);
- if (!iopfq) {
- pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name);
- ret = -ENOMEM;
- goto free_hwirq;
- }
- iommu->iopf_queue = iopfq;
-
- snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);
-
- ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
- iommu->prq_name, iommu);
- if (ret) {
- pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
- iommu->name);
- goto free_iopfq;
- }
- dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
- dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
- dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);
-
- init_completion(&iommu->prq_complete);
-
- return 0;
-
-free_iopfq:
- iopf_queue_free(iommu->iopf_queue);
- iommu->iopf_queue = NULL;
-free_hwirq:
- dmar_free_hwirq(irq);
- iommu->pr_irq = 0;
-free_prq:
- iommu_free_pages(iommu->prq, PRQ_ORDER);
- iommu->prq = NULL;
-
- return ret;
-}
-
-int intel_svm_finish_prq(struct intel_iommu *iommu)
-{
- dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
- dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
- dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);
-
- if (iommu->pr_irq) {
- free_irq(iommu->pr_irq, iommu);
- dmar_free_hwirq(iommu->pr_irq);
- iommu->pr_irq = 0;
- }
-
- if (iommu->iopf_queue) {
- iopf_queue_free(iommu->iopf_queue);
- iommu->iopf_queue = NULL;
- }
-
- iommu_free_pages(iommu->prq, PRQ_ORDER);
- iommu->prq = NULL;
-
- return 0;
-}
-
void intel_svm_check(struct intel_iommu *iommu)
{
if (!pasid_supported(iommu))
@@ -184,7 +98,10 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
static void intel_mm_free_notifier(struct mmu_notifier *mn)
{
- kfree(container_of(mn, struct dmar_domain, notifier));
+ struct dmar_domain *domain = container_of(mn, struct dmar_domain, notifier);
+
+ kfree(domain->qi_batch);
+ kfree(domain);
}
static const struct mmu_notifier_ops intel_mmuops = {
@@ -193,359 +110,81 @@ static const struct mmu_notifier_ops intel_mmuops = {
.free_notifier = intel_mm_free_notifier,
};
-static int intel_svm_set_dev_pasid(struct iommu_domain *domain,
- struct device *dev, ioasid_t pasid)
+static int intel_iommu_sva_supported(struct device *dev)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
- struct dmar_domain *dmar_domain = to_dmar_domain(domain);
- struct intel_iommu *iommu = info->iommu;
- struct mm_struct *mm = domain->mm;
- struct dev_pasid_info *dev_pasid;
- unsigned long sflags;
- unsigned long flags;
- int ret = 0;
-
- dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
- if (!dev_pasid)
- return -ENOMEM;
-
- dev_pasid->dev = dev;
- dev_pasid->pasid = pasid;
-
- ret = cache_tag_assign_domain(to_dmar_domain(domain), dev, pasid);
- if (ret)
- goto free_dev_pasid;
-
- /* Setup the pasid table: */
- sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0;
- ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, pasid,
- FLPT_DEFAULT_DID, sflags);
- if (ret)
- goto unassign_tag;
-
- spin_lock_irqsave(&dmar_domain->lock, flags);
- list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
- spin_unlock_irqrestore(&dmar_domain->lock, flags);
-
- return 0;
-
-unassign_tag:
- cache_tag_unassign_domain(to_dmar_domain(domain), dev, pasid);
-free_dev_pasid:
- kfree(dev_pasid);
-
- return ret;
-}
-
-/* Page request queue descriptor */
-struct page_req_dsc {
- union {
- struct {
- u64 type:8;
- u64 pasid_present:1;
- u64 rsvd:7;
- u64 rid:16;
- u64 pasid:20;
- u64 exe_req:1;
- u64 pm_req:1;
- u64 rsvd2:10;
- };
- u64 qw_0;
- };
- union {
- struct {
- u64 rd_req:1;
- u64 wr_req:1;
- u64 lpig:1;
- u64 prg_index:9;
- u64 addr:52;
- };
- u64 qw_1;
- };
- u64 qw_2;
- u64 qw_3;
-};
-
-static bool is_canonical_address(u64 addr)
-{
- int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
- long saddr = (long) addr;
-
- return (((saddr << shift) >> shift) == saddr);
-}
-
-/**
- * intel_drain_pasid_prq - Drain page requests and responses for a pasid
- * @dev: target device
- * @pasid: pasid for draining
- *
- * Drain all pending page requests and responses related to @pasid in both
- * software and hardware. This is supposed to be called after the device
- * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
- * and DevTLB have been invalidated.
- *
- * It waits until all pending page requests for @pasid in the page fault
- * queue are completed by the prq handling thread. Then follow the steps
- * described in VT-d spec CH7.10 to drain all page requests and page
- * responses pending in the hardware.
- */
-void intel_drain_pasid_prq(struct device *dev, u32 pasid)
-{
- struct device_domain_info *info;
- struct dmar_domain *domain;
struct intel_iommu *iommu;
- struct qi_desc desc[3];
- struct pci_dev *pdev;
- int head, tail;
- u16 sid, did;
- int qdep;
-
- info = dev_iommu_priv_get(dev);
- if (WARN_ON(!info || !dev_is_pci(dev)))
- return;
- if (!info->pri_enabled)
- return;
+ if (!info || dmar_disabled)
+ return -EINVAL;
iommu = info->iommu;
- domain = info->domain;
- pdev = to_pci_dev(dev);
- sid = PCI_DEVID(info->bus, info->devfn);
- did = domain_id_iommu(domain, iommu);
- qdep = pci_ats_queue_depth(pdev);
+ if (!iommu)
+ return -EINVAL;
- /*
- * Check and wait until all pending page requests in the queue are
- * handled by the prq handling thread.
- */
-prq_retry:
- reinit_completion(&iommu->prq_complete);
- tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
- head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
- while (head != tail) {
- struct page_req_dsc *req;
-
- req = &iommu->prq[head / sizeof(*req)];
- if (!req->pasid_present || req->pasid != pasid) {
- head = (head + sizeof(*req)) & PRQ_RING_MASK;
- continue;
- }
-
- wait_for_completion(&iommu->prq_complete);
- goto prq_retry;
- }
+ if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
+ return -ENODEV;
- iopf_queue_flush_dev(dev);
+ if (!info->pasid_enabled || !info->ats_enabled)
+ return -EINVAL;
/*
- * Perform steps described in VT-d spec CH7.10 to drain page
- * requests and responses in hardware.
+ * Devices having device-specific I/O fault handling should not
+ * support PCI/PRI. The IOMMU side has no means to check the
+ * capability of device-specific IOPF. Therefore, IOMMU can only
+ * default that if the device driver enables SVA on a non-PRI
+ * device, it will handle IOPF in its own way.
*/
- memset(desc, 0, sizeof(desc));
- desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
- QI_IWD_FENCE |
- QI_IWD_TYPE;
- desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
- QI_EIOTLB_DID(did) |
- QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
- QI_EIOTLB_TYPE;
- desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
- QI_DEV_EIOTLB_SID(sid) |
- QI_DEV_EIOTLB_QDEP(qdep) |
- QI_DEIOTLB_TYPE |
- QI_DEV_IOTLB_PFSID(info->pfsid);
-qi_retry:
- reinit_completion(&iommu->prq_complete);
- qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
- if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
- wait_for_completion(&iommu->prq_complete);
- goto qi_retry;
- }
-}
-
-static int prq_to_iommu_prot(struct page_req_dsc *req)
-{
- int prot = 0;
-
- if (req->rd_req)
- prot |= IOMMU_FAULT_PERM_READ;
- if (req->wr_req)
- prot |= IOMMU_FAULT_PERM_WRITE;
- if (req->exe_req)
- prot |= IOMMU_FAULT_PERM_EXEC;
- if (req->pm_req)
- prot |= IOMMU_FAULT_PERM_PRIV;
-
- return prot;
-}
-
-static void intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev,
- struct page_req_dsc *desc)
-{
- struct iopf_fault event = { };
-
- /* Fill in event data for device specific processing */
- event.fault.type = IOMMU_FAULT_PAGE_REQ;
- event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT;
- event.fault.prm.pasid = desc->pasid;
- event.fault.prm.grpid = desc->prg_index;
- event.fault.prm.perm = prq_to_iommu_prot(desc);
-
- if (desc->lpig)
- event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
- if (desc->pasid_present) {
- event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
- event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
- }
-
- iommu_report_device_fault(dev, &event);
-}
-
-static void handle_bad_prq_event(struct intel_iommu *iommu,
- struct page_req_dsc *req, int result)
-{
- struct qi_desc desc = { };
-
- pr_err("%s: Invalid page request: %08llx %08llx\n",
- iommu->name, ((unsigned long long *)req)[0],
- ((unsigned long long *)req)[1]);
-
- if (!req->lpig)
- return;
+ if (!info->pri_supported)
+ return 0;
- desc.qw0 = QI_PGRP_PASID(req->pasid) |
- QI_PGRP_DID(req->rid) |
- QI_PGRP_PASID_P(req->pasid_present) |
- QI_PGRP_RESP_CODE(result) |
- QI_PGRP_RESP_TYPE;
- desc.qw1 = QI_PGRP_IDX(req->prg_index) |
- QI_PGRP_LPIG(req->lpig);
+ /* Devices supporting PRI should have it enabled. */
+ if (!info->pri_enabled)
+ return -EINVAL;
- qi_submit_sync(iommu, &desc, 1, 0);
+ return 0;
}
-static irqreturn_t prq_event_thread(int irq, void *d)
+static int intel_svm_set_dev_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old)
{
- struct intel_iommu *iommu = d;
- struct page_req_dsc *req;
- int head, tail, handled;
- struct device *dev;
- u64 address;
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct intel_iommu *iommu = info->iommu;
+ struct mm_struct *mm = domain->mm;
+ struct dev_pasid_info *dev_pasid;
+ unsigned long sflags;
+ int ret = 0;
- /*
- * Clear PPR bit before reading head/tail registers, to ensure that
- * we get a new interrupt if needed.
- */
- writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);
-
- tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
- head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
- handled = (head != tail);
- while (head != tail) {
- req = &iommu->prq[head / sizeof(*req)];
- address = (u64)req->addr << VTD_PAGE_SHIFT;
-
- if (unlikely(!req->pasid_present)) {
- pr_err("IOMMU: %s: Page request without PASID\n",
- iommu->name);
-bad_req:
- handle_bad_prq_event(iommu, req, QI_RESP_INVALID);
- goto prq_advance;
- }
-
- if (unlikely(!is_canonical_address(address))) {
- pr_err("IOMMU: %s: Address is not canonical\n",
- iommu->name);
- goto bad_req;
- }
-
- if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) {
- pr_err("IOMMU: %s: Page request in Privilege Mode\n",
- iommu->name);
- goto bad_req;
- }
-
- if (unlikely(req->exe_req && req->rd_req)) {
- pr_err("IOMMU: %s: Execution request not supported\n",
- iommu->name);
- goto bad_req;
- }
-
- /* Drop Stop Marker message. No need for a response. */
- if (unlikely(req->lpig && !req->rd_req && !req->wr_req))
- goto prq_advance;
-
- /*
- * If prq is to be handled outside iommu driver via receiver of
- * the fault notifiers, we skip the page response here.
- */
- mutex_lock(&iommu->iopf_lock);
- dev = device_rbtree_find(iommu, req->rid);
- if (!dev) {
- mutex_unlock(&iommu->iopf_lock);
- goto bad_req;
- }
-
- intel_svm_prq_report(iommu, dev, req);
- trace_prq_report(iommu, dev, req->qw_0, req->qw_1,
- req->qw_2, req->qw_3,
- iommu->prq_seq_number++);
- mutex_unlock(&iommu->iopf_lock);
-prq_advance:
- head = (head + sizeof(*req)) & PRQ_RING_MASK;
- }
+ ret = intel_iommu_sva_supported(dev);
+ if (ret)
+ return ret;
- dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);
+ dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
+ if (IS_ERR(dev_pasid))
+ return PTR_ERR(dev_pasid);
- /*
- * Clear the page request overflow bit and wake up all threads that
- * are waiting for the completion of this handling.
- */
- if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
- pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n",
- iommu->name);
- head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
- tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
- if (head == tail) {
- iopf_queue_discard_partial(iommu->iopf_queue);
- writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);
- pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared",
- iommu->name);
- }
- }
+ ret = iopf_for_domain_replace(domain, old, dev);
+ if (ret)
+ goto out_remove_dev_pasid;
- if (!completion_done(&iommu->prq_complete))
- complete(&iommu->prq_complete);
+ /* Setup the pasid table: */
+ sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0;
+ sflags |= PASID_FLAG_PWSNP;
+ ret = __domain_setup_first_level(iommu, dev, pasid,
+ FLPT_DEFAULT_DID, __pa(mm->pgd),
+ sflags, old);
+ if (ret)
+ goto out_unwind_iopf;
- return IRQ_RETVAL(handled);
-}
+ domain_remove_dev_pasid(old, dev, pasid);
-void intel_svm_page_response(struct device *dev, struct iopf_fault *evt,
- struct iommu_page_response *msg)
-{
- struct device_domain_info *info = dev_iommu_priv_get(dev);
- struct intel_iommu *iommu = info->iommu;
- u8 bus = info->bus, devfn = info->devfn;
- struct iommu_fault_page_request *prm;
- struct qi_desc desc;
- bool pasid_present;
- bool last_page;
- u16 sid;
-
- prm = &evt->fault.prm;
- sid = PCI_DEVID(bus, devfn);
- pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
- last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
-
- desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) |
- QI_PGRP_PASID_P(pasid_present) |
- QI_PGRP_RESP_CODE(msg->code) |
- QI_PGRP_RESP_TYPE;
- desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page);
- desc.qw2 = 0;
- desc.qw3 = 0;
-
- qi_submit_sync(iommu, &desc, 1, 0);
+ return 0;
+out_unwind_iopf:
+ iopf_for_domain_replace(old, domain, dev);
+out_remove_dev_pasid:
+ domain_remove_dev_pasid(domain, dev, pasid);
+ return ret;
}
static void intel_svm_domain_free(struct iommu_domain *domain)
@@ -567,12 +206,15 @@ struct iommu_domain *intel_svm_domain_alloc(struct device *dev,
struct dmar_domain *domain;
int ret;
+ ret = intel_iommu_sva_supported(dev);
+ if (ret)
+ return ERR_PTR(ret);
+
domain = kzalloc(sizeof(*domain), GFP_KERNEL);
if (!domain)
return ERR_PTR(-ENOMEM);
domain->domain.ops = &intel_svm_domain_ops;
- domain->use_first_level = true;
INIT_LIST_HEAD(&domain->dev_pasids);
INIT_LIST_HEAD(&domain->cache_tags);
spin_lock_init(&domain->cache_lock);
diff --git a/drivers/iommu/intel/trace.h b/drivers/iommu/intel/trace.h
index 9defdae6ebae..6311ba3f1691 100644
--- a/drivers/iommu/intel/trace.h
+++ b/drivers/iommu/intel/trace.h
@@ -130,11 +130,6 @@ DEFINE_EVENT(cache_tag_log, cache_tag_unassign,
TP_ARGS(tag)
);
-DEFINE_EVENT(cache_tag_log, cache_tag_flush_all,
- TP_PROTO(struct cache_tag *tag),
- TP_ARGS(tag)
-);
-
DECLARE_EVENT_CLASS(cache_tag_flush,
TP_PROTO(struct cache_tag *tag, unsigned long start, unsigned long end,
unsigned long addr, unsigned long pages, unsigned long mask),
diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c
index 06d78fcc79fd..8b5926c1452e 100644
--- a/drivers/iommu/io-pgfault.c
+++ b/drivers/iommu/io-pgfault.c
@@ -59,30 +59,6 @@ void iopf_free_group(struct iopf_group *group)
}
EXPORT_SYMBOL_GPL(iopf_free_group);
-static struct iommu_domain *get_domain_for_iopf(struct device *dev,
- struct iommu_fault *fault)
-{
- struct iommu_domain *domain;
-
- if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) {
- domain = iommu_get_domain_for_dev_pasid(dev, fault->prm.pasid, 0);
- if (IS_ERR(domain))
- domain = NULL;
- } else {
- domain = iommu_get_domain_for_dev(dev);
- }
-
- if (!domain || !domain->iopf_handler) {
- dev_warn_ratelimited(dev,
- "iopf (pasid %d) without domain attached or handler installed\n",
- fault->prm.pasid);
-
- return NULL;
- }
-
- return domain;
-}
-
/* Non-last request of a group. Postpone until the last one. */
static int report_partial_fault(struct iommu_fault_param *fault_param,
struct iommu_fault *fault)
@@ -134,9 +110,64 @@ static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param,
list_add(&group->pending_node, &iopf_param->faults);
mutex_unlock(&iopf_param->lock);
+ group->fault_count = list_count_nodes(&group->faults);
+
return group;
}
+static struct iommu_attach_handle *find_fault_handler(struct device *dev,
+ struct iopf_fault *evt)
+{
+ struct iommu_fault *fault = &evt->fault;
+ struct iommu_attach_handle *attach_handle;
+
+ if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) {
+ attach_handle = iommu_attach_handle_get(dev->iommu_group,
+ fault->prm.pasid, 0);
+ if (IS_ERR(attach_handle)) {
+ const struct iommu_ops *ops = dev_iommu_ops(dev);
+
+ if (!ops->user_pasid_table)
+ return NULL;
+ /*
+ * The iommu driver for this device supports user-
+ * managed PASID table. Therefore page faults for
+ * any PASID should go through the NESTING domain
+ * attached to the device RID.
+ */
+ attach_handle = iommu_attach_handle_get(
+ dev->iommu_group, IOMMU_NO_PASID,
+ IOMMU_DOMAIN_NESTED);
+ if (IS_ERR(attach_handle))
+ return NULL;
+ }
+ } else {
+ attach_handle = iommu_attach_handle_get(dev->iommu_group,
+ IOMMU_NO_PASID, 0);
+
+ if (IS_ERR(attach_handle))
+ return NULL;
+ }
+
+ if (!attach_handle->domain->iopf_handler)
+ return NULL;
+
+ return attach_handle;
+}
+
+static void iopf_error_response(struct device *dev, struct iopf_fault *evt)
+{
+ const struct iommu_ops *ops = dev_iommu_ops(dev);
+ struct iommu_fault *fault = &evt->fault;
+ struct iommu_page_response resp = {
+ .pasid = fault->prm.pasid,
+ .grpid = fault->prm.grpid,
+ .code = IOMMU_PAGE_RESP_INVALID
+ };
+
+ ops->page_response(dev, evt, &resp);
+}
+
/**
* iommu_report_device_fault() - Report fault event to device driver
* @dev: the device
@@ -175,23 +206,39 @@ static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param,
* handling framework should guarantee that the iommu domain could only be
* freed after the device has stopped generating page faults (or the iommu
* hardware has been set to block the page faults) and the pending page faults
- * have been flushed.
+ * have been flushed. In case no page fault handler is attached or no iopf params
+ * are setup, then the ops->page_response() is called to complete the evt.
+ *
+ * Returns 0 on success, or an error in case of a bad/failed iopf setup.
*/
-void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt)
+int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt)
{
+ struct iommu_attach_handle *attach_handle;
struct iommu_fault *fault = &evt->fault;
struct iommu_fault_param *iopf_param;
struct iopf_group abort_group = {};
struct iopf_group *group;
+ attach_handle = find_fault_handler(dev, evt);
+ if (!attach_handle)
+ goto err_bad_iopf;
+
+ /*
+ * Something has gone wrong if a fault capable domain is attached but no
+ * iopf_param is setup
+ */
iopf_param = iopf_get_dev_fault_param(dev);
if (WARN_ON(!iopf_param))
- return;
+ goto err_bad_iopf;
if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) {
- report_partial_fault(iopf_param, fault);
+ int ret;
+
+ ret = report_partial_fault(iopf_param, fault);
iopf_put_dev_fault_param(iopf_param);
/* A request that is not the last does not need to be ack'd */
+
+ return ret;
}
/*
@@ -206,25 +253,33 @@ void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt)
if (group == &abort_group)
goto err_abort;
- group->domain = get_domain_for_iopf(dev, fault);
- if (!group->domain)
- goto err_abort;
+ group->attach_handle = attach_handle;
/*
* On success iopf_handler must call iopf_group_response() and
* iopf_free_group()
*/
- if (group->domain->iopf_handler(group))
+ if (group->attach_handle->domain->iopf_handler(group))
goto err_abort;
- return;
+ return 0;
err_abort:
+ dev_warn_ratelimited(dev, "iopf with pasid %d aborted\n",
+ fault->prm.pasid);
iopf_group_response(group, IOMMU_PAGE_RESP_FAILURE);
if (group == &abort_group)
__iopf_free_group(group);
else
iopf_free_group(group);
+
+ return 0;
+
+err_bad_iopf:
+ if (fault->type == IOMMU_FAULT_PAGE_REQ)
+ iopf_error_response(dev, evt);
+
+ return -EINVAL;
}
EXPORT_SYMBOL_GPL(iommu_report_device_fault);
@@ -423,6 +478,7 @@ void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev)
ops->page_response(dev, iopf, &resp);
list_del_init(&group->pending_node);
+ iopf_free_group(group);
}
mutex_unlock(&fault_param->lock);
diff --git a/drivers/iommu/io-pgtable-arm-selftests.c b/drivers/iommu/io-pgtable-arm-selftests.c
new file mode 100644
index 000000000000..334e70350924
--- /dev/null
+++ b/drivers/iommu/io-pgtable-arm-selftests.c
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * CPU-agnostic ARM page table allocator.
+ *
+ * Copyright (C) 2014 ARM Limited
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ */
+
+#define pr_fmt(fmt) "arm-lpae io-pgtable: " fmt
+
+#include <kunit/device.h>
+#include <kunit/test.h>
+#include <linux/io-pgtable.h>
+#include <linux/kernel.h>
+
+#include "io-pgtable-arm.h"
+
+static struct io_pgtable_cfg *cfg_cookie;
+
+static void dummy_tlb_flush_all(void *cookie)
+{
+ WARN_ON(cookie != cfg_cookie);
+}
+
+static void dummy_tlb_flush(unsigned long iova, size_t size,
+ size_t granule, void *cookie)
+{
+ WARN_ON(cookie != cfg_cookie);
+ WARN_ON(!(size & cfg_cookie->pgsize_bitmap));
+}
+
+static void dummy_tlb_add_page(struct iommu_iotlb_gather *gather,
+ unsigned long iova, size_t granule,
+ void *cookie)
+{
+ dummy_tlb_flush(iova, granule, granule, cookie);
+}
+
+static const struct iommu_flush_ops dummy_tlb_ops = {
+ .tlb_flush_all = dummy_tlb_flush_all,
+ .tlb_flush_walk = dummy_tlb_flush,
+ .tlb_add_page = dummy_tlb_add_page,
+};
+
+#define __FAIL(test, i) ({ \
+ KUNIT_FAIL(test, "test failed for fmt idx %d\n", (i)); \
+ -EFAULT; \
+})
+
+static int arm_lpae_run_tests(struct kunit *test, struct io_pgtable_cfg *cfg)
+{
+ static const enum io_pgtable_fmt fmts[] = {
+ ARM_64_LPAE_S1,
+ ARM_64_LPAE_S2,
+ };
+
+ int i, j;
+ unsigned long iova;
+ size_t size, mapped;
+ struct io_pgtable_ops *ops;
+
+ for (i = 0; i < ARRAY_SIZE(fmts); ++i) {
+ cfg_cookie = cfg;
+ ops = alloc_io_pgtable_ops(fmts[i], cfg, cfg);
+ if (!ops) {
+ kunit_err(test, "failed to allocate io pgtable ops\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * Initial sanity checks.
+ * Empty page tables shouldn't provide any translations.
+ */
+ if (ops->iova_to_phys(ops, 42))
+ return __FAIL(test, i);
+
+ if (ops->iova_to_phys(ops, SZ_1G + 42))
+ return __FAIL(test, i);
+
+ if (ops->iova_to_phys(ops, SZ_2G + 42))
+ return __FAIL(test, i);
+
+ /*
+ * Distinct mappings of different granule sizes.
+ */
+ iova = 0;
+ for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
+ size = 1UL << j;
+
+ if (ops->map_pages(ops, iova, iova, size, 1,
+ IOMMU_READ | IOMMU_WRITE |
+ IOMMU_NOEXEC | IOMMU_CACHE,
+ GFP_KERNEL, &mapped))
+ return __FAIL(test, i);
+
+ /* Overlapping mappings */
+ if (!ops->map_pages(ops, iova, iova + size, size, 1,
+ IOMMU_READ | IOMMU_NOEXEC,
+ GFP_KERNEL, &mapped))
+ return __FAIL(test, i);
+
+ if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
+ return __FAIL(test, i);
+
+ iova += SZ_1G;
+ }
+
+ /* Full unmap */
+ iova = 0;
+ for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
+ size = 1UL << j;
+
+ if (ops->unmap_pages(ops, iova, size, 1, NULL) != size)
+ return __FAIL(test, i);
+
+ if (ops->iova_to_phys(ops, iova + 42))
+ return __FAIL(test, i);
+
+ /* Remap full block */
+ if (ops->map_pages(ops, iova, iova, size, 1,
+ IOMMU_WRITE, GFP_KERNEL, &mapped))
+ return __FAIL(test, i);
+
+ if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
+ return __FAIL(test, i);
+
+ iova += SZ_1G;
+ }
+
+ /*
+ * Map/unmap the last largest supported page of the IAS, this can
+ * trigger corner cases in the concatednated page tables.
+ */
+ mapped = 0;
+ size = 1UL << __fls(cfg->pgsize_bitmap);
+ iova = (1UL << cfg->ias) - size;
+ if (ops->map_pages(ops, iova, iova, size, 1,
+ IOMMU_READ | IOMMU_WRITE |
+ IOMMU_NOEXEC | IOMMU_CACHE,
+ GFP_KERNEL, &mapped))
+ return __FAIL(test, i);
+ if (mapped != size)
+ return __FAIL(test, i);
+ if (ops->unmap_pages(ops, iova, size, 1, NULL) != size)
+ return __FAIL(test, i);
+
+ free_io_pgtable_ops(ops);
+ }
+
+ return 0;
+}
+
+static void arm_lpae_do_selftests(struct kunit *test)
+{
+ static const unsigned long pgsize[] = {
+ SZ_4K | SZ_2M | SZ_1G,
+ SZ_16K | SZ_32M,
+ SZ_64K | SZ_512M,
+ };
+
+ static const unsigned int address_size[] = {
+ 32, 36, 40, 42, 44, 48,
+ };
+
+ int i, j, k, pass = 0, fail = 0;
+ struct device *dev;
+ struct io_pgtable_cfg cfg = {
+ .tlb = &dummy_tlb_ops,
+ .coherent_walk = true,
+ .quirks = IO_PGTABLE_QUIRK_NO_WARN,
+ };
+
+ dev = kunit_device_register(test, "io-pgtable-test");
+ KUNIT_EXPECT_NOT_ERR_OR_NULL(test, dev);
+ if (IS_ERR_OR_NULL(dev))
+ return;
+
+ cfg.iommu_dev = dev;
+
+ for (i = 0; i < ARRAY_SIZE(pgsize); ++i) {
+ for (j = 0; j < ARRAY_SIZE(address_size); ++j) {
+ /* Don't use ias > oas as it is not valid for stage-2. */
+ for (k = 0; k <= j; ++k) {
+ cfg.pgsize_bitmap = pgsize[i];
+ cfg.ias = address_size[k];
+ cfg.oas = address_size[j];
+ kunit_info(test, "pgsize_bitmap 0x%08lx, IAS %u OAS %u\n",
+ pgsize[i], cfg.ias, cfg.oas);
+ if (arm_lpae_run_tests(test, &cfg))
+ fail++;
+ else
+ pass++;
+ }
+ }
+ }
+
+ kunit_info(test, "completed with %d PASS %d FAIL\n", pass, fail);
+}
+
+static struct kunit_case io_pgtable_arm_test_cases[] = {
+ KUNIT_CASE(arm_lpae_do_selftests),
+ {},
+};
+
+static struct kunit_suite io_pgtable_arm_test = {
+ .name = "io-pgtable-arm-test",
+ .test_cases = io_pgtable_arm_test_cases,
+};
+
+kunit_test_suite(io_pgtable_arm_test);
+
+MODULE_DESCRIPTION("io-pgtable-arm library kunit tests");
+MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c
index 75f244a3e12d..523355e91a2c 100644
--- a/drivers/iommu/io-pgtable-arm-v7s.c
+++ b/drivers/iommu/io-pgtable-arm-v7s.c
@@ -166,7 +166,6 @@ struct arm_v7s_io_pgtable {
arm_v7s_iopte *pgd;
struct kmem_cache *l2_tables;
- spinlock_t split_lock;
};
static bool arm_v7s_pte_is_cont(arm_v7s_iopte pte, int lvl);
@@ -363,25 +362,6 @@ static arm_v7s_iopte arm_v7s_prot_to_pte(int prot, int lvl,
return pte;
}
-static int arm_v7s_pte_to_prot(arm_v7s_iopte pte, int lvl)
-{
- int prot = IOMMU_READ;
- arm_v7s_iopte attr = pte >> ARM_V7S_ATTR_SHIFT(lvl);
-
- if (!(attr & ARM_V7S_PTE_AP_RDONLY))
- prot |= IOMMU_WRITE;
- if (!(attr & ARM_V7S_PTE_AP_UNPRIV))
- prot |= IOMMU_PRIV;
- if ((attr & (ARM_V7S_TEX_MASK << ARM_V7S_TEX_SHIFT)) == 0)
- prot |= IOMMU_MMIO;
- else if (pte & ARM_V7S_ATTR_C)
- prot |= IOMMU_CACHE;
- if (pte & ARM_V7S_ATTR_XN(lvl))
- prot |= IOMMU_NOEXEC;
-
- return prot;
-}
-
static arm_v7s_iopte arm_v7s_pte_to_cont(arm_v7s_iopte pte, int lvl)
{
if (lvl == 1) {
@@ -398,23 +378,6 @@ static arm_v7s_iopte arm_v7s_pte_to_cont(arm_v7s_iopte pte, int lvl)
return pte;
}
-static arm_v7s_iopte arm_v7s_cont_to_pte(arm_v7s_iopte pte, int lvl)
-{
- if (lvl == 1) {
- pte &= ~ARM_V7S_CONT_SECTION;
- } else if (lvl == 2) {
- arm_v7s_iopte xn = pte & BIT(ARM_V7S_CONT_PAGE_XN_SHIFT);
- arm_v7s_iopte tex = pte & (ARM_V7S_CONT_PAGE_TEX_MASK <<
- ARM_V7S_CONT_PAGE_TEX_SHIFT);
-
- pte ^= xn | tex | ARM_V7S_PTE_TYPE_CONT_PAGE;
- pte |= (xn >> ARM_V7S_CONT_PAGE_XN_SHIFT) |
- (tex >> ARM_V7S_CONT_PAGE_TEX_SHIFT) |
- ARM_V7S_PTE_TYPE_PAGE;
- }
- return pte;
-}
-
static bool arm_v7s_pte_is_cont(arm_v7s_iopte pte, int lvl)
{
if (lvl == 1 && !ARM_V7S_PTE_IS_TABLE(pte, lvl))
@@ -552,9 +515,8 @@ static int arm_v7s_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
paddr >= (1ULL << data->iop.cfg.oas)))
return -ERANGE;
- /* If no access, then nothing to do */
if (!(prot & (IOMMU_READ | IOMMU_WRITE)))
- return 0;
+ return -EINVAL;
while (pgcount--) {
ret = __arm_v7s_map(data, iova, paddr, pgsize, prot, 1, data->pgd,
@@ -592,77 +554,6 @@ static void arm_v7s_free_pgtable(struct io_pgtable *iop)
kfree(data);
}
-static arm_v7s_iopte arm_v7s_split_cont(struct arm_v7s_io_pgtable *data,
- unsigned long iova, int idx, int lvl,
- arm_v7s_iopte *ptep)
-{
- struct io_pgtable *iop = &data->iop;
- arm_v7s_iopte pte;
- size_t size = ARM_V7S_BLOCK_SIZE(lvl);
- int i;
-
- /* Check that we didn't lose a race to get the lock */
- pte = *ptep;
- if (!arm_v7s_pte_is_cont(pte, lvl))
- return pte;
-
- ptep -= idx & (ARM_V7S_CONT_PAGES - 1);
- pte = arm_v7s_cont_to_pte(pte, lvl);
- for (i = 0; i < ARM_V7S_CONT_PAGES; i++)
- ptep[i] = pte + i * size;
-
- __arm_v7s_pte_sync(ptep, ARM_V7S_CONT_PAGES, &iop->cfg);
-
- size *= ARM_V7S_CONT_PAGES;
- io_pgtable_tlb_flush_walk(iop, iova, size, size);
- return pte;
-}
-
-static size_t arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data,
- struct iommu_iotlb_gather *gather,
- unsigned long iova, size_t size,
- arm_v7s_iopte blk_pte,
- arm_v7s_iopte *ptep)
-{
- struct io_pgtable_cfg *cfg = &data->iop.cfg;
- arm_v7s_iopte pte, *tablep;
- int i, unmap_idx, num_entries, num_ptes;
-
- tablep = __arm_v7s_alloc_table(2, GFP_ATOMIC, data);
- if (!tablep)
- return 0; /* Bytes unmapped */
-
- num_ptes = ARM_V7S_PTES_PER_LVL(2, cfg);
- num_entries = size >> ARM_V7S_LVL_SHIFT(2);
- unmap_idx = ARM_V7S_LVL_IDX(iova, 2, cfg);
-
- pte = arm_v7s_prot_to_pte(arm_v7s_pte_to_prot(blk_pte, 1), 2, cfg);
- if (num_entries > 1)
- pte = arm_v7s_pte_to_cont(pte, 2);
-
- for (i = 0; i < num_ptes; i += num_entries, pte += size) {
- /* Unmap! */
- if (i == unmap_idx)
- continue;
-
- __arm_v7s_set_pte(&tablep[i], pte, num_entries, cfg);
- }
-
- pte = arm_v7s_install_table(tablep, ptep, blk_pte, cfg);
- if (pte != blk_pte) {
- __arm_v7s_free_table(tablep, 2, data);
-
- if (!ARM_V7S_PTE_IS_TABLE(pte, 1))
- return 0;
-
- tablep = iopte_deref(pte, 1, data);
- return __arm_v7s_unmap(data, gather, iova, size, 2, tablep);
- }
-
- io_pgtable_tlb_add_page(&data->iop, gather, iova, size);
- return size;
-}
-
static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data,
struct iommu_iotlb_gather *gather,
unsigned long iova, size_t size, int lvl,
@@ -695,11 +586,8 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data,
* case in a lock for the sake of correctness and be done with it.
*/
if (num_entries <= 1 && arm_v7s_pte_is_cont(pte[0], lvl)) {
- unsigned long flags;
-
- spin_lock_irqsave(&data->split_lock, flags);
- pte[0] = arm_v7s_split_cont(data, iova, idx, lvl, ptep);
- spin_unlock_irqrestore(&data->split_lock, flags);
+ WARN_ONCE(true, "Unmap of a partial large IOPTE is not allowed");
+ return 0;
}
/* If the size matches this level, we're in the right place */
@@ -722,12 +610,8 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data,
}
return size;
} else if (lvl == 1 && !ARM_V7S_PTE_IS_TABLE(pte[0], lvl)) {
- /*
- * Insert a table at the next level to map the old region,
- * minus the part we want to unmap
- */
- return arm_v7s_split_blk_unmap(data, gather, iova, size, pte[0],
- ptep);
+ WARN_ONCE(true, "Unmap of a partial large IOPTE is not allowed");
+ return 0;
}
/* Keep on walkin' */
@@ -812,8 +696,6 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg,
if (!data)
return NULL;
- spin_lock_init(&data->split_lock);
-
/*
* ARM_MTK_TTBR_EXT extend the translation table base support larger
* memory address.
@@ -937,8 +819,8 @@ static int __init arm_v7s_do_selftests(void)
.quirks = IO_PGTABLE_QUIRK_ARM_NS,
.pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M,
};
- unsigned int iova, size, iova_start;
- unsigned int i, loopnr = 0;
+ unsigned int iova, size;
+ unsigned int i;
size_t mapped;
selftest_running = true;
@@ -986,26 +868,6 @@ static int __init arm_v7s_do_selftests(void)
return __FAIL(ops);
iova += SZ_16M;
- loopnr++;
- }
-
- /* Partial unmap */
- i = 1;
- size = 1UL << __ffs(cfg.pgsize_bitmap);
- while (i < loopnr) {
- iova_start = i * SZ_16M;
- if (ops->unmap_pages(ops, iova_start + size, size, 1, NULL) != size)
- return __FAIL(ops);
-
- /* Remap of partial unmap */
- if (ops->map_pages(ops, iova_start + size, size, size, 1,
- IOMMU_READ, GFP_KERNEL, &mapped))
- return __FAIL(ops);
-
- if (ops->iova_to_phys(ops, iova_start + size + 42)
- != (size + 42))
- return __FAIL(ops);
- i++;
}
/* Full unmap */
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 3d23b924cec1..e6626004b323 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -12,7 +12,6 @@
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/io-pgtable.h>
-#include <linux/kernel.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
@@ -76,6 +75,7 @@
#define ARM_LPAE_PTE_NSTABLE (((arm_lpae_iopte)1) << 63)
#define ARM_LPAE_PTE_XN (((arm_lpae_iopte)3) << 53)
+#define ARM_LPAE_PTE_DBM (((arm_lpae_iopte)1) << 51)
#define ARM_LPAE_PTE_AF (((arm_lpae_iopte)1) << 10)
#define ARM_LPAE_PTE_SH_NS (((arm_lpae_iopte)0) << 8)
#define ARM_LPAE_PTE_SH_OS (((arm_lpae_iopte)2) << 8)
@@ -83,17 +83,16 @@
#define ARM_LPAE_PTE_NS (((arm_lpae_iopte)1) << 5)
#define ARM_LPAE_PTE_VALID (((arm_lpae_iopte)1) << 0)
-#define ARM_LPAE_PTE_ATTR_LO_MASK (((arm_lpae_iopte)0x3ff) << 2)
-/* Ignore the contiguous bit for block splitting */
-#define ARM_LPAE_PTE_ATTR_HI_MASK (((arm_lpae_iopte)6) << 52)
-#define ARM_LPAE_PTE_ATTR_MASK (ARM_LPAE_PTE_ATTR_LO_MASK | \
- ARM_LPAE_PTE_ATTR_HI_MASK)
/* Software bit for solving coherency races */
#define ARM_LPAE_PTE_SW_SYNC (((arm_lpae_iopte)1) << 55)
/* Stage-1 PTE */
#define ARM_LPAE_PTE_AP_UNPRIV (((arm_lpae_iopte)1) << 6)
-#define ARM_LPAE_PTE_AP_RDONLY (((arm_lpae_iopte)2) << 6)
+#define ARM_LPAE_PTE_AP_RDONLY_BIT 7
+#define ARM_LPAE_PTE_AP_RDONLY (((arm_lpae_iopte)1) << \
+ ARM_LPAE_PTE_AP_RDONLY_BIT)
+#define ARM_LPAE_PTE_AP_WR_CLEAN_MASK (ARM_LPAE_PTE_AP_RDONLY | \
+ ARM_LPAE_PTE_DBM)
#define ARM_LPAE_PTE_ATTRINDX_SHIFT 2
#define ARM_LPAE_PTE_nG (((arm_lpae_iopte)1) << 11)
@@ -101,6 +100,18 @@
#define ARM_LPAE_PTE_HAP_FAULT (((arm_lpae_iopte)0) << 6)
#define ARM_LPAE_PTE_HAP_READ (((arm_lpae_iopte)1) << 6)
#define ARM_LPAE_PTE_HAP_WRITE (((arm_lpae_iopte)2) << 6)
+/*
+ * For !FWB these code to:
+ * 1111 = Normal outer write back cachable / Inner Write Back Cachable
+ * Permit S1 to override
+ * 0101 = Normal Non-cachable / Inner Non-cachable
+ * 0001 = Device / Device-nGnRE
+ * For S2FWB these code:
+ * 0110 Force Normal Write Back
+ * 0101 Normal* is forced Normal-NC, Device unchanged
+ * 0001 Force Device-nGnRE
+ */
+#define ARM_LPAE_PTE_MEMATTR_FWB_WB (((arm_lpae_iopte)0x6) << 2)
#define ARM_LPAE_PTE_MEMATTR_OIWB (((arm_lpae_iopte)0xf) << 2)
#define ARM_LPAE_PTE_MEMATTR_NC (((arm_lpae_iopte)0x5) << 2)
#define ARM_LPAE_PTE_MEMATTR_DEV (((arm_lpae_iopte)0x1) << 2)
@@ -137,7 +148,11 @@
#define iopte_type(pte) \
(((pte) >> ARM_LPAE_PTE_TYPE_SHIFT) & ARM_LPAE_PTE_TYPE_MASK)
-#define iopte_prot(pte) ((pte) & ARM_LPAE_PTE_ATTR_MASK)
+#define iopte_writeable_dirty(pte) \
+ (((pte) & ARM_LPAE_PTE_AP_WR_CLEAN_MASK) == ARM_LPAE_PTE_DBM)
+
+#define iopte_set_writeable_clean(ptep) \
+ set_bit(ARM_LPAE_PTE_AP_RDONLY_BIT, (unsigned long *)(ptep))
struct arm_lpae_io_pgtable {
struct io_pgtable iop;
@@ -160,6 +175,13 @@ static inline bool iopte_leaf(arm_lpae_iopte pte, int lvl,
return iopte_type(pte) == ARM_LPAE_PTE_TYPE_BLOCK;
}
+static inline bool iopte_table(arm_lpae_iopte pte, int lvl)
+{
+ if (lvl == (ARM_LPAE_MAX_LEVELS - 1))
+ return false;
+ return iopte_type(pte) == ARM_LPAE_PTE_TYPE_TABLE;
+}
+
static arm_lpae_iopte paddr_to_iopte(phys_addr_t paddr,
struct arm_lpae_io_pgtable *data)
{
@@ -181,7 +203,45 @@ static phys_addr_t iopte_to_paddr(arm_lpae_iopte pte,
return (paddr | (paddr << (48 - 12))) & (ARM_LPAE_PTE_ADDR_MASK << 4);
}
-static bool selftest_running = false;
+/*
+ * Convert an index returned by ARM_LPAE_PGD_IDX(), which can point into
+ * a concatenated PGD, into the maximum number of entries that can be
+ * mapped in the same table page.
+ */
+static inline int arm_lpae_max_entries(int i, struct arm_lpae_io_pgtable *data)
+{
+ int ptes_per_table = ARM_LPAE_PTES_PER_TABLE(data);
+
+ return ptes_per_table - (i & (ptes_per_table - 1));
+}
+
+/*
+ * Check if concatenated PGDs are mandatory according to Arm DDI0487 (K.a)
+ * 1) R_DXBSH: For 16KB, and 48-bit input size, use level 1 instead of 0.
+ * 2) R_SRKBC: After de-ciphering the table for PA size and valid initial lookup
+ * a) 40 bits PA size with 4K: use level 1 instead of level 0 (2 tables for ias = oas)
+ * b) 40 bits PA size with 16K: use level 2 instead of level 1 (16 tables for ias = oas)
+ * c) 42 bits PA size with 4K: use level 1 instead of level 0 (8 tables for ias = oas)
+ * d) 48 bits PA size with 16K: use level 1 instead of level 0 (2 tables for ias = oas)
+ */
+static inline bool arm_lpae_concat_mandatory(struct io_pgtable_cfg *cfg,
+ struct arm_lpae_io_pgtable *data)
+{
+ unsigned int ias = cfg->ias;
+ unsigned int oas = cfg->oas;
+
+ /* Covers 1 and 2.d */
+ if ((ARM_LPAE_GRANULE(data) == SZ_16K) && (data->start_level == 0))
+ return (oas == 48) || (ias == 48);
+
+ /* Covers 2.a and 2.c */
+ if ((ARM_LPAE_GRANULE(data) == SZ_4K) && (data->start_level == 0))
+ return (oas == 40) || (oas == 42);
+
+ /* Case 2.b */
+ return (ARM_LPAE_GRANULE(data) == SZ_16K) &&
+ (data->start_level == 1) && (oas == 40);
+}
static dma_addr_t __arm_lpae_dma_addr(void *pages)
{
@@ -193,16 +253,20 @@ static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp,
void *cookie)
{
struct device *dev = cfg->iommu_dev;
- int order = get_order(size);
+ size_t alloc_size;
dma_addr_t dma;
void *pages;
- VM_BUG_ON((gfp & __GFP_HIGHMEM));
-
+ /*
+ * For very small starting-level translation tables the HW requires a
+ * minimum alignment of at least 64 to cover all cases.
+ */
+ alloc_size = max(size, 64);
if (cfg->alloc)
- pages = cfg->alloc(cookie, size, gfp);
+ pages = cfg->alloc(cookie, alloc_size, gfp);
else
- pages = iommu_alloc_pages_node(dev_to_node(dev), gfp, order);
+ pages = iommu_alloc_pages_node_sz(dev_to_node(dev), gfp,
+ alloc_size);
if (!pages)
return NULL;
@@ -230,7 +294,7 @@ out_free:
if (cfg->free)
cfg->free(cookie, pages, size);
else
- iommu_free_pages(pages, order);
+ iommu_free_pages(pages);
return NULL;
}
@@ -246,7 +310,7 @@ static void __arm_lpae_free_pages(void *pages, size_t size,
if (cfg->free)
cfg->free(cookie, pages, size);
else
- iommu_free_pages(pages, get_order(size));
+ iommu_free_pages(pages);
}
static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries,
@@ -256,13 +320,13 @@ static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries,
sizeof(*ptep) * num_entries, DMA_TO_DEVICE);
}
-static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg)
+static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg, int num_entries)
{
+ for (int i = 0; i < num_entries; i++)
+ ptep[i] = 0;
- *ptep = 0;
-
- if (!cfg->coherent_walk)
- __arm_lpae_sync_pte(ptep, 1, cfg);
+ if (!cfg->coherent_walk && num_entries)
+ __arm_lpae_sync_pte(ptep, num_entries, cfg);
}
static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
@@ -301,7 +365,7 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
for (i = 0; i < num_entries; i++)
if (iopte_leaf(ptep[i], lvl, data->iop.fmt)) {
/* We require an unmap first */
- WARN_ON(!selftest_running);
+ WARN_ON(!(data->iop.cfg.quirks & IO_PGTABLE_QUIRK_NO_WARN));
return -EEXIST;
} else if (iopte_type(ptep[i]) == ARM_LPAE_PTE_TYPE_TABLE) {
/*
@@ -372,7 +436,7 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
/* If we can install a leaf entry at this level, then do so */
if (size == block_size) {
- max_entries = ARM_LPAE_PTES_PER_TABLE(data) - map_idx_start;
+ max_entries = arm_lpae_max_entries(map_idx_start, data);
num_entries = min_t(int, pgcount, max_entries);
ret = arm_lpae_init_pte(data, iova, paddr, prot, lvl, num_entries, ptep);
if (!ret)
@@ -403,7 +467,7 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
cptep = iopte_deref(pte, data);
} else if (pte) {
/* We require an unmap first */
- WARN_ON(!selftest_running);
+ WARN_ON(!(cfg->quirks & IO_PGTABLE_QUIRK_NO_WARN));
return -EEXIST;
}
@@ -422,6 +486,8 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
pte = ARM_LPAE_PTE_nG;
if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
pte |= ARM_LPAE_PTE_AP_RDONLY;
+ else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_HD)
+ pte |= ARM_LPAE_PTE_DBM;
if (!(prot & IOMMU_PRIV))
pte |= ARM_LPAE_PTE_AP_UNPRIV;
} else {
@@ -438,12 +504,16 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
*/
if (data->iop.fmt == ARM_64_LPAE_S2 ||
data->iop.fmt == ARM_32_LPAE_S2) {
- if (prot & IOMMU_MMIO)
+ if (prot & IOMMU_MMIO) {
pte |= ARM_LPAE_PTE_MEMATTR_DEV;
- else if (prot & IOMMU_CACHE)
- pte |= ARM_LPAE_PTE_MEMATTR_OIWB;
- else
+ } else if (prot & IOMMU_CACHE) {
+ if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_S2FWB)
+ pte |= ARM_LPAE_PTE_MEMATTR_FWB_WB;
+ else
+ pte |= ARM_LPAE_PTE_MEMATTR_OIWB;
+ } else {
pte |= ARM_LPAE_PTE_MEMATTR_NC;
+ }
} else {
if (prot & IOMMU_MMIO)
pte |= (ARM_LPAE_MAIR_ATTR_IDX_DEV
@@ -495,9 +565,8 @@ static int arm_lpae_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
if (WARN_ON(iaext || paddr >> cfg->oas))
return -ERANGE;
- /* If no access, then nothing to do */
if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE)))
- return 0;
+ return -EINVAL;
prot = arm_lpae_prot_to_pte(data, iommu_prot);
ret = __arm_lpae_map(data, iova, paddr, pgsize, pgcount, prot, lvl,
@@ -550,66 +619,6 @@ static void arm_lpae_free_pgtable(struct io_pgtable *iop)
kfree(data);
}
-static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,
- struct iommu_iotlb_gather *gather,
- unsigned long iova, size_t size,
- arm_lpae_iopte blk_pte, int lvl,
- arm_lpae_iopte *ptep, size_t pgcount)
-{
- struct io_pgtable_cfg *cfg = &data->iop.cfg;
- arm_lpae_iopte pte, *tablep;
- phys_addr_t blk_paddr;
- size_t tablesz = ARM_LPAE_GRANULE(data);
- size_t split_sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
- int ptes_per_table = ARM_LPAE_PTES_PER_TABLE(data);
- int i, unmap_idx_start = -1, num_entries = 0, max_entries;
-
- if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
- return 0;
-
- tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg, data->iop.cookie);
- if (!tablep)
- return 0; /* Bytes unmapped */
-
- if (size == split_sz) {
- unmap_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data);
- max_entries = ptes_per_table - unmap_idx_start;
- num_entries = min_t(int, pgcount, max_entries);
- }
-
- blk_paddr = iopte_to_paddr(blk_pte, data);
- pte = iopte_prot(blk_pte);
-
- for (i = 0; i < ptes_per_table; i++, blk_paddr += split_sz) {
- /* Unmap! */
- if (i >= unmap_idx_start && i < (unmap_idx_start + num_entries))
- continue;
-
- __arm_lpae_init_pte(data, blk_paddr, pte, lvl, 1, &tablep[i]);
- }
-
- pte = arm_lpae_install_table(tablep, ptep, blk_pte, data);
- if (pte != blk_pte) {
- __arm_lpae_free_pages(tablep, tablesz, cfg, data->iop.cookie);
- /*
- * We may race against someone unmapping another part of this
- * block, but anything else is invalid. We can't misinterpret
- * a page entry here since we're never at the last level.
- */
- if (iopte_type(pte) != ARM_LPAE_PTE_TYPE_TABLE)
- return 0;
-
- tablep = iopte_deref(pte, data);
- } else if (unmap_idx_start >= 0) {
- for (i = 0; i < num_entries; i++)
- io_pgtable_tlb_add_page(&data->iop, gather, iova + i * size, size);
-
- return num_entries * size;
- }
-
- return __arm_lpae_unmap(data, gather, iova, size, pgcount, lvl, tablep);
-}
-
static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
struct iommu_iotlb_gather *gather,
unsigned long iova, size_t size, size_t pgcount,
@@ -626,42 +635,45 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
unmap_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data);
ptep += unmap_idx_start;
pte = READ_ONCE(*ptep);
- if (WARN_ON(!pte))
- return 0;
+ if (!pte) {
+ WARN_ON(!(data->iop.cfg.quirks & IO_PGTABLE_QUIRK_NO_WARN));
+ return -ENOENT;
+ }
/* If the size matches this level, we're in the right place */
if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) {
- max_entries = ARM_LPAE_PTES_PER_TABLE(data) - unmap_idx_start;
+ max_entries = arm_lpae_max_entries(unmap_idx_start, data);
num_entries = min_t(int, pgcount, max_entries);
- while (i < num_entries) {
- pte = READ_ONCE(*ptep);
- if (WARN_ON(!pte))
+ /* Find and handle non-leaf entries */
+ for (i = 0; i < num_entries; i++) {
+ pte = READ_ONCE(ptep[i]);
+ if (!pte) {
+ WARN_ON(!(data->iop.cfg.quirks & IO_PGTABLE_QUIRK_NO_WARN));
break;
-
- __arm_lpae_clear_pte(ptep, &iop->cfg);
+ }
if (!iopte_leaf(pte, lvl, iop->fmt)) {
+ __arm_lpae_clear_pte(&ptep[i], &iop->cfg, 1);
+
/* Also flush any partial walks */
io_pgtable_tlb_flush_walk(iop, iova + i * size, size,
ARM_LPAE_GRANULE(data));
__arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
- } else if (!iommu_iotlb_gather_queued(gather)) {
- io_pgtable_tlb_add_page(iop, gather, iova + i * size, size);
}
-
- ptep++;
- i++;
}
+ /* Clear the remaining entries */
+ __arm_lpae_clear_pte(ptep, &iop->cfg, i);
+
+ if (gather && !iommu_iotlb_gather_queued(gather))
+ for (int j = 0; j < i; j++)
+ io_pgtable_tlb_add_page(iop, gather, iova + j * size, size);
+
return i * size;
} else if (iopte_leaf(pte, lvl, iop->fmt)) {
- /*
- * Insert a table at the next level to map the old region,
- * minus the part we want to unmap
- */
- return arm_lpae_split_blk_unmap(data, gather, iova, size, pte,
- lvl + 1, ptep, pgcount);
+ WARN_ONCE(true, "Unmap of a partial large IOPTE is not allowed");
+ return 0;
}
/* Keep on walkin' */
@@ -690,40 +702,172 @@ static size_t arm_lpae_unmap_pages(struct io_pgtable_ops *ops, unsigned long iov
data->start_level, ptep);
}
+struct io_pgtable_walk_data {
+ struct io_pgtable *iop;
+ void *data;
+ int (*visit)(struct io_pgtable_walk_data *walk_data, int lvl,
+ arm_lpae_iopte *ptep, size_t size);
+ unsigned long flags;
+ u64 addr;
+ const u64 end;
+};
+
+static int __arm_lpae_iopte_walk(struct arm_lpae_io_pgtable *data,
+ struct io_pgtable_walk_data *walk_data,
+ arm_lpae_iopte *ptep,
+ int lvl);
+
+struct iova_to_phys_data {
+ arm_lpae_iopte pte;
+ int lvl;
+};
+
+static int visit_iova_to_phys(struct io_pgtable_walk_data *walk_data, int lvl,
+ arm_lpae_iopte *ptep, size_t size)
+{
+ struct iova_to_phys_data *data = walk_data->data;
+ data->pte = *ptep;
+ data->lvl = lvl;
+ return 0;
+}
+
static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops,
unsigned long iova)
{
struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
- arm_lpae_iopte pte, *ptep = data->pgd;
- int lvl = data->start_level;
+ struct iova_to_phys_data d;
+ struct io_pgtable_walk_data walk_data = {
+ .data = &d,
+ .visit = visit_iova_to_phys,
+ .addr = iova,
+ .end = iova + 1,
+ };
+ int ret;
+
+ ret = __arm_lpae_iopte_walk(data, &walk_data, data->pgd, data->start_level);
+ if (ret)
+ return 0;
+
+ iova &= (ARM_LPAE_BLOCK_SIZE(d.lvl, data) - 1);
+ return iopte_to_paddr(d.pte, data) | iova;
+}
+
+static int visit_pgtable_walk(struct io_pgtable_walk_data *walk_data, int lvl,
+ arm_lpae_iopte *ptep, size_t size)
+{
+ struct arm_lpae_io_pgtable_walk_data *data = walk_data->data;
+ data->ptes[lvl] = *ptep;
+ return 0;
+}
- do {
- /* Valid IOPTE pointer? */
- if (!ptep)
- return 0;
+static int arm_lpae_pgtable_walk(struct io_pgtable_ops *ops, unsigned long iova,
+ void *wd)
+{
+ struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
+ struct io_pgtable_walk_data walk_data = {
+ .data = wd,
+ .visit = visit_pgtable_walk,
+ .addr = iova,
+ .end = iova + 1,
+ };
- /* Grab the IOPTE we're interested in */
- ptep += ARM_LPAE_LVL_IDX(iova, lvl, data);
- pte = READ_ONCE(*ptep);
+ return __arm_lpae_iopte_walk(data, &walk_data, data->pgd, data->start_level);
+}
- /* Valid entry? */
- if (!pte)
- return 0;
+static int io_pgtable_visit(struct arm_lpae_io_pgtable *data,
+ struct io_pgtable_walk_data *walk_data,
+ arm_lpae_iopte *ptep, int lvl)
+{
+ struct io_pgtable *iop = &data->iop;
+ arm_lpae_iopte pte = READ_ONCE(*ptep);
- /* Leaf entry? */
- if (iopte_leaf(pte, lvl, data->iop.fmt))
- goto found_translation;
+ size_t size = ARM_LPAE_BLOCK_SIZE(lvl, data);
+ int ret = walk_data->visit(walk_data, lvl, ptep, size);
+ if (ret)
+ return ret;
- /* Take it to the next level */
- ptep = iopte_deref(pte, data);
- } while (++lvl < ARM_LPAE_MAX_LEVELS);
+ if (iopte_leaf(pte, lvl, iop->fmt)) {
+ walk_data->addr += size;
+ return 0;
+ }
+
+ if (!iopte_table(pte, lvl)) {
+ return -EINVAL;
+ }
+
+ ptep = iopte_deref(pte, data);
+ return __arm_lpae_iopte_walk(data, walk_data, ptep, lvl + 1);
+}
+
+static int __arm_lpae_iopte_walk(struct arm_lpae_io_pgtable *data,
+ struct io_pgtable_walk_data *walk_data,
+ arm_lpae_iopte *ptep,
+ int lvl)
+{
+ u32 idx;
+ int max_entries, ret;
+
+ if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
+ return -EINVAL;
+
+ if (lvl == data->start_level)
+ max_entries = ARM_LPAE_PGD_SIZE(data) / sizeof(arm_lpae_iopte);
+ else
+ max_entries = ARM_LPAE_PTES_PER_TABLE(data);
+
+ for (idx = ARM_LPAE_LVL_IDX(walk_data->addr, lvl, data);
+ (idx < max_entries) && (walk_data->addr < walk_data->end); ++idx) {
+ ret = io_pgtable_visit(data, walk_data, ptep + idx, lvl);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int visit_dirty(struct io_pgtable_walk_data *walk_data, int lvl,
+ arm_lpae_iopte *ptep, size_t size)
+{
+ struct iommu_dirty_bitmap *dirty = walk_data->data;
+
+ if (!iopte_leaf(*ptep, lvl, walk_data->iop->fmt))
+ return 0;
+
+ if (iopte_writeable_dirty(*ptep)) {
+ iommu_dirty_bitmap_record(dirty, walk_data->addr, size);
+ if (!(walk_data->flags & IOMMU_DIRTY_NO_CLEAR))
+ iopte_set_writeable_clean(ptep);
+ }
- /* Ran out of page tables to walk */
return 0;
+}
+
+static int arm_lpae_read_and_clear_dirty(struct io_pgtable_ops *ops,
+ unsigned long iova, size_t size,
+ unsigned long flags,
+ struct iommu_dirty_bitmap *dirty)
+{
+ struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
+ struct io_pgtable_cfg *cfg = &data->iop.cfg;
+ struct io_pgtable_walk_data walk_data = {
+ .iop = &data->iop,
+ .data = dirty,
+ .visit = visit_dirty,
+ .flags = flags,
+ .addr = iova,
+ .end = iova + size,
+ };
+ arm_lpae_iopte *ptep = data->pgd;
+ int lvl = data->start_level;
+
+ if (WARN_ON(!size))
+ return -EINVAL;
+ if (WARN_ON((iova + size - 1) & ~(BIT(cfg->ias) - 1)))
+ return -EINVAL;
+ if (data->iop.fmt != ARM_64_LPAE_S1)
+ return -EINVAL;
-found_translation:
- iova &= (ARM_LPAE_BLOCK_SIZE(lvl, data) - 1);
- return iopte_to_paddr(pte, data) | iova;
+ return __arm_lpae_iopte_walk(data, &walk_data, ptep, lvl);
}
static void arm_lpae_restrict_pgsizes(struct io_pgtable_cfg *cfg)
@@ -804,6 +948,8 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg)
.map_pages = arm_lpae_map_pages,
.unmap_pages = arm_lpae_unmap_pages,
.iova_to_phys = arm_lpae_iova_to_phys,
+ .read_and_clear_dirty = arm_lpae_read_and_clear_dirty,
+ .pgtable_walk = arm_lpae_pgtable_walk,
};
return data;
@@ -819,7 +965,9 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
IO_PGTABLE_QUIRK_ARM_TTBR1 |
- IO_PGTABLE_QUIRK_ARM_OUTER_WBWA))
+ IO_PGTABLE_QUIRK_ARM_OUTER_WBWA |
+ IO_PGTABLE_QUIRK_ARM_HD |
+ IO_PGTABLE_QUIRK_NO_WARN))
return NULL;
data = arm_lpae_alloc_pgtable(cfg);
@@ -920,26 +1068,20 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie)
struct arm_lpae_io_pgtable *data;
typeof(&cfg->arm_lpae_s2_cfg.vtcr) vtcr = &cfg->arm_lpae_s2_cfg.vtcr;
- /* The NS quirk doesn't apply at stage 2 */
- if (cfg->quirks)
+ if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_S2FWB |
+ IO_PGTABLE_QUIRK_NO_WARN))
return NULL;
data = arm_lpae_alloc_pgtable(cfg);
if (!data)
return NULL;
- /*
- * Concatenate PGDs at level 1 if possible in order to reduce
- * the depth of the stage-2 walk.
- */
- if (data->start_level == 0) {
- unsigned long pgd_pages;
-
- pgd_pages = ARM_LPAE_PGD_SIZE(data) / sizeof(arm_lpae_iopte);
- if (pgd_pages <= ARM_LPAE_S2_MAX_CONCAT_PAGES) {
- data->pgd_bits += data->bits_per_level;
- data->start_level++;
- }
+ if (arm_lpae_concat_mandatory(cfg, data)) {
+ if (WARN_ON((ARM_LPAE_PGD_SIZE(data) / sizeof(arm_lpae_iopte)) >
+ ARM_LPAE_S2_MAX_CONCAT_PAGES))
+ return NULL;
+ data->pgd_bits += data->bits_per_level;
+ data->start_level++;
}
/* VTCR */
@@ -1123,196 +1265,3 @@ struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns = {
.alloc = arm_mali_lpae_alloc_pgtable,
.free = arm_lpae_free_pgtable,
};
-
-#ifdef CONFIG_IOMMU_IO_PGTABLE_LPAE_SELFTEST
-
-static struct io_pgtable_cfg *cfg_cookie __initdata;
-
-static void __init dummy_tlb_flush_all(void *cookie)
-{
- WARN_ON(cookie != cfg_cookie);
-}
-
-static void __init dummy_tlb_flush(unsigned long iova, size_t size,
- size_t granule, void *cookie)
-{
- WARN_ON(cookie != cfg_cookie);
- WARN_ON(!(size & cfg_cookie->pgsize_bitmap));
-}
-
-static void __init dummy_tlb_add_page(struct iommu_iotlb_gather *gather,
- unsigned long iova, size_t granule,
- void *cookie)
-{
- dummy_tlb_flush(iova, granule, granule, cookie);
-}
-
-static const struct iommu_flush_ops dummy_tlb_ops __initconst = {
- .tlb_flush_all = dummy_tlb_flush_all,
- .tlb_flush_walk = dummy_tlb_flush,
- .tlb_add_page = dummy_tlb_add_page,
-};
-
-static void __init arm_lpae_dump_ops(struct io_pgtable_ops *ops)
-{
- struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
- struct io_pgtable_cfg *cfg = &data->iop.cfg;
-
- pr_err("cfg: pgsize_bitmap 0x%lx, ias %u-bit\n",
- cfg->pgsize_bitmap, cfg->ias);
- pr_err("data: %d levels, 0x%zx pgd_size, %u pg_shift, %u bits_per_level, pgd @ %p\n",
- ARM_LPAE_MAX_LEVELS - data->start_level, ARM_LPAE_PGD_SIZE(data),
- ilog2(ARM_LPAE_GRANULE(data)), data->bits_per_level, data->pgd);
-}
-
-#define __FAIL(ops, i) ({ \
- WARN(1, "selftest: test failed for fmt idx %d\n", (i)); \
- arm_lpae_dump_ops(ops); \
- selftest_running = false; \
- -EFAULT; \
-})
-
-static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg)
-{
- static const enum io_pgtable_fmt fmts[] __initconst = {
- ARM_64_LPAE_S1,
- ARM_64_LPAE_S2,
- };
-
- int i, j;
- unsigned long iova;
- size_t size, mapped;
- struct io_pgtable_ops *ops;
-
- selftest_running = true;
-
- for (i = 0; i < ARRAY_SIZE(fmts); ++i) {
- cfg_cookie = cfg;
- ops = alloc_io_pgtable_ops(fmts[i], cfg, cfg);
- if (!ops) {
- pr_err("selftest: failed to allocate io pgtable ops\n");
- return -ENOMEM;
- }
-
- /*
- * Initial sanity checks.
- * Empty page tables shouldn't provide any translations.
- */
- if (ops->iova_to_phys(ops, 42))
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, SZ_1G + 42))
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, SZ_2G + 42))
- return __FAIL(ops, i);
-
- /*
- * Distinct mappings of different granule sizes.
- */
- iova = 0;
- for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
- size = 1UL << j;
-
- if (ops->map_pages(ops, iova, iova, size, 1,
- IOMMU_READ | IOMMU_WRITE |
- IOMMU_NOEXEC | IOMMU_CACHE,
- GFP_KERNEL, &mapped))
- return __FAIL(ops, i);
-
- /* Overlapping mappings */
- if (!ops->map_pages(ops, iova, iova + size, size, 1,
- IOMMU_READ | IOMMU_NOEXEC,
- GFP_KERNEL, &mapped))
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
- return __FAIL(ops, i);
-
- iova += SZ_1G;
- }
-
- /* Partial unmap */
- size = 1UL << __ffs(cfg->pgsize_bitmap);
- if (ops->unmap_pages(ops, SZ_1G + size, size, 1, NULL) != size)
- return __FAIL(ops, i);
-
- /* Remap of partial unmap */
- if (ops->map_pages(ops, SZ_1G + size, size, size, 1,
- IOMMU_READ, GFP_KERNEL, &mapped))
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, SZ_1G + size + 42) != (size + 42))
- return __FAIL(ops, i);
-
- /* Full unmap */
- iova = 0;
- for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
- size = 1UL << j;
-
- if (ops->unmap_pages(ops, iova, size, 1, NULL) != size)
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, iova + 42))
- return __FAIL(ops, i);
-
- /* Remap full block */
- if (ops->map_pages(ops, iova, iova, size, 1,
- IOMMU_WRITE, GFP_KERNEL, &mapped))
- return __FAIL(ops, i);
-
- if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
- return __FAIL(ops, i);
-
- iova += SZ_1G;
- }
-
- free_io_pgtable_ops(ops);
- }
-
- selftest_running = false;
- return 0;
-}
-
-static int __init arm_lpae_do_selftests(void)
-{
- static const unsigned long pgsize[] __initconst = {
- SZ_4K | SZ_2M | SZ_1G,
- SZ_16K | SZ_32M,
- SZ_64K | SZ_512M,
- };
-
- static const unsigned int ias[] __initconst = {
- 32, 36, 40, 42, 44, 48,
- };
-
- int i, j, pass = 0, fail = 0;
- struct device dev;
- struct io_pgtable_cfg cfg = {
- .tlb = &dummy_tlb_ops,
- .oas = 48,
- .coherent_walk = true,
- .iommu_dev = &dev,
- };
-
- /* __arm_lpae_alloc_pages() merely needs dev_to_node() to work */
- set_dev_node(&dev, NUMA_NO_NODE);
-
- for (i = 0; i < ARRAY_SIZE(pgsize); ++i) {
- for (j = 0; j < ARRAY_SIZE(ias); ++j) {
- cfg.pgsize_bitmap = pgsize[i];
- cfg.ias = ias[j];
- pr_info("selftest: pgsize_bitmap 0x%08lx, IAS %u\n",
- pgsize[i], ias[j]);
- if (arm_lpae_run_tests(&cfg))
- fail++;
- else
- pass++;
- }
- }
-
- pr_info("selftest: completed with %d PASS %d FAIL\n", pass, fail);
- return fail ? -EFAULT : 0;
-}
-subsys_initcall(arm_lpae_do_selftests);
-#endif
diff --git a/drivers/iommu/io-pgtable-dart.c b/drivers/iommu/io-pgtable-dart.c
index ad28031e1e93..54d287cc0dd1 100644
--- a/drivers/iommu/io-pgtable-dart.c
+++ b/drivers/iommu/io-pgtable-dart.c
@@ -27,8 +27,9 @@
#define DART1_MAX_ADDR_BITS 36
-#define DART_MAX_TABLES 4
-#define DART_LEVELS 2
+#define DART_MAX_TABLE_BITS 2
+#define DART_MAX_TABLES BIT(DART_MAX_TABLE_BITS)
+#define DART_MAX_LEVELS 4 /* Includes TTBR level */
/* Struct accessors */
#define io_pgtable_to_data(x) \
@@ -68,6 +69,7 @@
struct dart_io_pgtable {
struct io_pgtable iop;
+ int levels;
int tbl_bits;
int bits_per_level;
@@ -107,14 +109,6 @@ static phys_addr_t iopte_to_paddr(dart_iopte pte,
return paddr;
}
-static void *__dart_alloc_pages(size_t size, gfp_t gfp)
-{
- int order = get_order(size);
-
- VM_BUG_ON((gfp & __GFP_HIGHMEM));
- return iommu_alloc_pages(gfp, order);
-}
-
static int dart_init_pte(struct dart_io_pgtable *data,
unsigned long iova, phys_addr_t paddr,
dart_iopte prot, int num_entries,
@@ -135,7 +129,6 @@ static int dart_init_pte(struct dart_io_pgtable *data,
pte |= FIELD_PREP(APPLE_DART_PTE_SUBPAGE_START, 0);
pte |= FIELD_PREP(APPLE_DART_PTE_SUBPAGE_END, 0xfff);
- pte |= APPLE_DART1_PTE_PROT_SP_DIS;
pte |= APPLE_DART_PTE_VALID;
for (i = 0; i < num_entries; i++)
@@ -165,44 +158,45 @@ static dart_iopte dart_install_table(dart_iopte *table,
return old;
}
-static int dart_get_table(struct dart_io_pgtable *data, unsigned long iova)
+static int dart_get_index(struct dart_io_pgtable *data, unsigned long iova, int level)
{
- return (iova >> (3 * data->bits_per_level + ilog2(sizeof(dart_iopte)))) &
- ((1 << data->tbl_bits) - 1);
+ return (iova >> (level * data->bits_per_level + ilog2(sizeof(dart_iopte)))) &
+ ((1 << data->bits_per_level) - 1);
}
-static int dart_get_l1_index(struct dart_io_pgtable *data, unsigned long iova)
-{
-
- return (iova >> (2 * data->bits_per_level + ilog2(sizeof(dart_iopte)))) &
- ((1 << data->bits_per_level) - 1);
-}
-
-static int dart_get_l2_index(struct dart_io_pgtable *data, unsigned long iova)
+static int dart_get_last_index(struct dart_io_pgtable *data, unsigned long iova)
{
return (iova >> (data->bits_per_level + ilog2(sizeof(dart_iopte)))) &
((1 << data->bits_per_level) - 1);
}
-static dart_iopte *dart_get_l2(struct dart_io_pgtable *data, unsigned long iova)
+static dart_iopte *dart_get_last(struct dart_io_pgtable *data, unsigned long iova)
{
dart_iopte pte, *ptep;
- int tbl = dart_get_table(data, iova);
+ int level = data->levels;
+ int tbl = dart_get_index(data, iova, level);
+
+ if (tbl >= (1 << data->tbl_bits))
+ return NULL;
ptep = data->pgd[tbl];
if (!ptep)
return NULL;
- ptep += dart_get_l1_index(data, iova);
- pte = READ_ONCE(*ptep);
+ while (--level > 1) {
+ ptep += dart_get_index(data, iova, level);
+ pte = READ_ONCE(*ptep);
- /* Valid entry? */
- if (!pte)
- return NULL;
+ /* Valid entry? */
+ if (!pte)
+ return NULL;
- /* Deref to get level 2 table */
- return iopte_deref(pte, data);
+ /* Deref to get next level table */
+ ptep = iopte_deref(pte, data);
+ }
+
+ return ptep;
}
static dart_iopte dart_prot_to_pte(struct dart_io_pgtable *data,
@@ -211,6 +205,7 @@ static dart_iopte dart_prot_to_pte(struct dart_io_pgtable *data,
dart_iopte pte = 0;
if (data->iop.fmt == APPLE_DART) {
+ pte |= APPLE_DART1_PTE_PROT_SP_DIS;
if (!(prot & IOMMU_WRITE))
pte |= APPLE_DART1_PTE_PROT_NO_WRITE;
if (!(prot & IOMMU_READ))
@@ -238,6 +233,7 @@ static int dart_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
int ret = 0, tbl, num_entries, max_entries, map_idx_start;
dart_iopte pte, *cptep, *ptep;
dart_iopte prot;
+ int level = data->levels;
if (WARN_ON(pgsize != cfg->pgsize_bitmap))
return -EINVAL;
@@ -245,35 +241,39 @@ static int dart_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
if (WARN_ON(paddr >> cfg->oas))
return -ERANGE;
- /* If no access, then nothing to do */
if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE)))
- return 0;
+ return -EINVAL;
+
+ tbl = dart_get_index(data, iova, level);
- tbl = dart_get_table(data, iova);
+ if (tbl >= (1 << data->tbl_bits))
+ return -ENOMEM;
ptep = data->pgd[tbl];
- ptep += dart_get_l1_index(data, iova);
- pte = READ_ONCE(*ptep);
+ while (--level > 1) {
+ ptep += dart_get_index(data, iova, level);
+ pte = READ_ONCE(*ptep);
- /* no L2 table present */
- if (!pte) {
- cptep = __dart_alloc_pages(tblsz, gfp);
- if (!cptep)
- return -ENOMEM;
+ /* no table present */
+ if (!pte) {
+ cptep = iommu_alloc_pages_sz(gfp, tblsz);
+ if (!cptep)
+ return -ENOMEM;
- pte = dart_install_table(cptep, ptep, 0, data);
- if (pte)
- iommu_free_pages(cptep, get_order(tblsz));
+ pte = dart_install_table(cptep, ptep, 0, data);
+ if (pte)
+ iommu_free_pages(cptep);
- /* L2 table is present (now) */
- pte = READ_ONCE(*ptep);
- }
+ /* L2 table is present (now) */
+ pte = READ_ONCE(*ptep);
+ }
- ptep = iopte_deref(pte, data);
+ ptep = iopte_deref(pte, data);
+ }
/* install a leaf entries into L2 table */
prot = dart_prot_to_pte(data, iommu_prot);
- map_idx_start = dart_get_l2_index(data, iova);
+ map_idx_start = dart_get_last_index(data, iova);
max_entries = DART_PTES_PER_TABLE(data) - map_idx_start;
num_entries = min_t(int, pgcount, max_entries);
ptep += map_idx_start;
@@ -302,13 +302,13 @@ static size_t dart_unmap_pages(struct io_pgtable_ops *ops, unsigned long iova,
if (WARN_ON(pgsize != cfg->pgsize_bitmap || !pgcount))
return 0;
- ptep = dart_get_l2(data, iova);
+ ptep = dart_get_last(data, iova);
/* Valid L2 IOPTE pointer? */
if (WARN_ON(!ptep))
return 0;
- unmap_idx_start = dart_get_l2_index(data, iova);
+ unmap_idx_start = dart_get_last_index(data, iova);
ptep += unmap_idx_start;
max_entries = DART_PTES_PER_TABLE(data) - unmap_idx_start;
@@ -339,13 +339,13 @@ static phys_addr_t dart_iova_to_phys(struct io_pgtable_ops *ops,
struct dart_io_pgtable *data = io_pgtable_ops_to_data(ops);
dart_iopte pte, *ptep;
- ptep = dart_get_l2(data, iova);
+ ptep = dart_get_last(data, iova);
/* Valid L2 IOPTE pointer? */
if (!ptep)
return 0;
- ptep += dart_get_l2_index(data, iova);
+ ptep += dart_get_last_index(data, iova);
pte = READ_ONCE(*ptep);
/* Found translation */
@@ -362,21 +362,37 @@ static struct dart_io_pgtable *
dart_alloc_pgtable(struct io_pgtable_cfg *cfg)
{
struct dart_io_pgtable *data;
- int tbl_bits, bits_per_level, va_bits, pg_shift;
+ int levels, max_tbl_bits, tbl_bits, bits_per_level, va_bits, pg_shift;
+
+ /*
+ * Old 4K page DARTs can use up to 4 top-level tables.
+ * Newer ones only ever use a maximum of 1.
+ */
+ if (cfg->pgsize_bitmap == SZ_4K)
+ max_tbl_bits = DART_MAX_TABLE_BITS;
+ else
+ max_tbl_bits = 0;
pg_shift = __ffs(cfg->pgsize_bitmap);
bits_per_level = pg_shift - ilog2(sizeof(dart_iopte));
va_bits = cfg->ias - pg_shift;
- tbl_bits = max_t(int, 0, va_bits - (bits_per_level * DART_LEVELS));
- if ((1 << tbl_bits) > DART_MAX_TABLES)
+ levels = max_t(int, 2, (va_bits - max_tbl_bits + bits_per_level - 1) / bits_per_level);
+
+ if (levels > (DART_MAX_LEVELS - 1))
+ return NULL;
+
+ tbl_bits = max_t(int, 0, va_bits - (bits_per_level * levels));
+
+ if (tbl_bits > max_tbl_bits)
return NULL;
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
return NULL;
+ data->levels = levels + 1; /* Table level counts as one level */
data->tbl_bits = tbl_bits;
data->bits_per_level = bits_per_level;
@@ -412,9 +428,11 @@ apple_dart_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
return NULL;
cfg->apple_dart_cfg.n_ttbrs = 1 << data->tbl_bits;
+ cfg->apple_dart_cfg.n_levels = data->levels;
for (i = 0; i < cfg->apple_dart_cfg.n_ttbrs; ++i) {
- data->pgd[i] = __dart_alloc_pages(DART_GRANULE(data), GFP_KERNEL);
+ data->pgd[i] =
+ iommu_alloc_pages_sz(GFP_KERNEL, DART_GRANULE(data));
if (!data->pgd[i])
goto out_free_data;
cfg->apple_dart_cfg.ttbr[i] = virt_to_phys(data->pgd[i]);
@@ -424,32 +442,37 @@ apple_dart_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
out_free_data:
while (--i >= 0) {
- iommu_free_pages(data->pgd[i],
- get_order(DART_GRANULE(data)));
+ iommu_free_pages(data->pgd[i]);
}
kfree(data);
return NULL;
}
-static void apple_dart_free_pgtable(struct io_pgtable *iop)
+static void apple_dart_free_pgtables(struct dart_io_pgtable *data, dart_iopte *ptep, int level)
{
- struct dart_io_pgtable *data = io_pgtable_to_data(iop);
- int order = get_order(DART_GRANULE(data));
- dart_iopte *ptep, *end;
- int i;
+ dart_iopte *end;
+ dart_iopte *start = ptep;
- for (i = 0; i < (1 << data->tbl_bits) && data->pgd[i]; ++i) {
- ptep = data->pgd[i];
+ if (level > 1) {
end = (void *)ptep + DART_GRANULE(data);
while (ptep != end) {
dart_iopte pte = *ptep++;
if (pte)
- iommu_free_pages(iopte_deref(pte, data), order);
+ apple_dart_free_pgtables(data, iopte_deref(pte, data), level - 1);
}
- iommu_free_pages(data->pgd[i], order);
}
+ iommu_free_pages(start);
+}
+
+static void apple_dart_free_pgtable(struct io_pgtable *iop)
+{
+ struct dart_io_pgtable *data = io_pgtable_to_data(iop);
+ int i;
+
+ for (i = 0; i < (1 << data->tbl_bits) && data->pgd[i]; ++i)
+ apple_dart_free_pgtables(data, data->pgd[i], data->levels - 1);
kfree(data);
}
diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
index 8841c1487f00..843fec8e8a51 100644
--- a/drivers/iommu/io-pgtable.c
+++ b/drivers/iommu/io-pgtable.c
@@ -28,10 +28,6 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = {
#ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S
[ARM_V7S] = &io_pgtable_arm_v7s_init_fns,
#endif
-#ifdef CONFIG_AMD_IOMMU
- [AMD_IOMMU_V1] = &io_pgtable_amd_iommu_v1_init_fns,
- [AMD_IOMMU_V2] = &io_pgtable_amd_iommu_v2_init_fns,
-#endif
};
static int check_custom_allocator(enum io_pgtable_fmt fmt,
diff --git a/drivers/iommu/iommu-pages.c b/drivers/iommu/iommu-pages.c
new file mode 100644
index 000000000000..3bab175d8557
--- /dev/null
+++ b/drivers/iommu/iommu-pages.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+#include "iommu-pages.h"
+#include <linux/dma-mapping.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+
+#define IOPTDESC_MATCH(pg_elm, elm) \
+ static_assert(offsetof(struct page, pg_elm) == \
+ offsetof(struct ioptdesc, elm))
+IOPTDESC_MATCH(flags, __page_flags);
+IOPTDESC_MATCH(lru, iopt_freelist_elm); /* Ensure bit 0 is clear */
+IOPTDESC_MATCH(mapping, __page_mapping);
+IOPTDESC_MATCH(private, _private);
+IOPTDESC_MATCH(page_type, __page_type);
+IOPTDESC_MATCH(_refcount, __page_refcount);
+#ifdef CONFIG_MEMCG
+IOPTDESC_MATCH(memcg_data, memcg_data);
+#endif
+#undef IOPTDESC_MATCH
+static_assert(sizeof(struct ioptdesc) <= sizeof(struct page));
+
+static inline size_t ioptdesc_mem_size(struct ioptdesc *desc)
+{
+ return 1UL << (folio_order(ioptdesc_folio(desc)) + PAGE_SHIFT);
+}
+
+/**
+ * iommu_alloc_pages_node_sz - Allocate a zeroed page of a given size from
+ * specific NUMA node
+ * @nid: memory NUMA node id
+ * @gfp: buddy allocator flags
+ * @size: Memory size to allocate, rounded up to a power of 2
+ *
+ * Returns the virtual address of the allocated page. The page must be freed
+ * either by calling iommu_free_pages() or via iommu_put_pages_list(). The
+ * returned allocation is round_up_pow_two(size) big, and is physically aligned
+ * to its size.
+ */
+void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size)
+{
+ struct ioptdesc *iopt;
+ unsigned long pgcnt;
+ struct folio *folio;
+ unsigned int order;
+
+ /* This uses page_address() on the memory. */
+ if (WARN_ON(gfp & __GFP_HIGHMEM))
+ return NULL;
+
+ /*
+ * Currently sub page allocations result in a full page being returned.
+ */
+ order = get_order(size);
+
+ /*
+ * __folio_alloc_node() does not handle NUMA_NO_NODE like
+ * alloc_pages_node() did.
+ */
+ if (nid == NUMA_NO_NODE)
+ nid = numa_mem_id();
+
+ folio = __folio_alloc_node(gfp | __GFP_ZERO, order, nid);
+ if (unlikely(!folio))
+ return NULL;
+
+ iopt = folio_ioptdesc(folio);
+ iopt->incoherent = false;
+
+ /*
+ * All page allocations that should be reported to as "iommu-pagetables"
+ * to userspace must use one of the functions below. This includes
+ * allocations of page-tables and other per-iommu_domain configuration
+ * structures.
+ *
+ * This is necessary for the proper accounting as IOMMU state can be
+ * rather large, i.e. multiple gigabytes in size.
+ */
+ pgcnt = 1UL << order;
+ mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, pgcnt);
+ lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, pgcnt);
+
+ return folio_address(folio);
+}
+EXPORT_SYMBOL_GPL(iommu_alloc_pages_node_sz);
+
+static void __iommu_free_desc(struct ioptdesc *iopt)
+{
+ struct folio *folio = ioptdesc_folio(iopt);
+ const unsigned long pgcnt = folio_nr_pages(folio);
+
+ if (IOMMU_PAGES_USE_DMA_API)
+ WARN_ON_ONCE(iopt->incoherent);
+
+ mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, -pgcnt);
+ lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, -pgcnt);
+ folio_put(folio);
+}
+
+/**
+ * iommu_free_pages - free pages
+ * @virt: virtual address of the page to be freed.
+ *
+ * The page must have have been allocated by iommu_alloc_pages_node_sz()
+ */
+void iommu_free_pages(void *virt)
+{
+ if (!virt)
+ return;
+ __iommu_free_desc(virt_to_ioptdesc(virt));
+}
+EXPORT_SYMBOL_GPL(iommu_free_pages);
+
+/**
+ * iommu_put_pages_list - free a list of pages.
+ * @list: The list of pages to be freed
+ *
+ * Frees a list of pages allocated by iommu_alloc_pages_node_sz(). On return the
+ * passed list is invalid, the caller must use IOMMU_PAGES_LIST_INIT to reinit
+ * the list if it expects to use it again.
+ */
+void iommu_put_pages_list(struct iommu_pages_list *list)
+{
+ struct ioptdesc *iopt, *tmp;
+
+ list_for_each_entry_safe(iopt, tmp, &list->pages, iopt_freelist_elm)
+ __iommu_free_desc(iopt);
+}
+EXPORT_SYMBOL_GPL(iommu_put_pages_list);
+
+/**
+ * iommu_pages_start_incoherent - Setup the page for cache incoherent operation
+ * @virt: The page to setup
+ * @dma_dev: The iommu device
+ *
+ * For incoherent memory this will use the DMA API to manage the cache flushing
+ * on some arches. This is a lot of complexity compared to just calling
+ * arch_sync_dma_for_device(), but it is what the existing ARM iommu drivers
+ * have been doing. The DMA API requires keeping track of the DMA map and
+ * freeing it when required. This keeps track of the dma map inside the ioptdesc
+ * so that error paths are simple for the caller.
+ */
+int iommu_pages_start_incoherent(void *virt, struct device *dma_dev)
+{
+ struct ioptdesc *iopt = virt_to_ioptdesc(virt);
+ dma_addr_t dma;
+
+ if (WARN_ON(iopt->incoherent))
+ return -EINVAL;
+
+ if (!IOMMU_PAGES_USE_DMA_API) {
+ iommu_pages_flush_incoherent(dma_dev, virt, 0,
+ ioptdesc_mem_size(iopt));
+ } else {
+ dma = dma_map_single(dma_dev, virt, ioptdesc_mem_size(iopt),
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(dma_dev, dma))
+ return -EINVAL;
+
+ /*
+ * The DMA API is not allowed to do anything other than DMA
+ * direct. It would be nice to also check
+ * dev_is_dma_coherent(dma_dev));
+ */
+ if (WARN_ON(dma != virt_to_phys(virt))) {
+ dma_unmap_single(dma_dev, dma, ioptdesc_mem_size(iopt),
+ DMA_TO_DEVICE);
+ return -EOPNOTSUPP;
+ }
+ }
+
+ iopt->incoherent = 1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_pages_start_incoherent);
+
+/**
+ * iommu_pages_start_incoherent_list - Make a list of pages incoherent
+ * @list: The list of pages to setup
+ * @dma_dev: The iommu device
+ *
+ * Perform iommu_pages_start_incoherent() across all of list.
+ *
+ * If this fails the caller must call iommu_pages_stop_incoherent_list().
+ */
+int iommu_pages_start_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev)
+{
+ struct ioptdesc *cur;
+ int ret;
+
+ list_for_each_entry(cur, &list->pages, iopt_freelist_elm) {
+ if (WARN_ON(cur->incoherent))
+ continue;
+
+ ret = iommu_pages_start_incoherent(
+ folio_address(ioptdesc_folio(cur)), dma_dev);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_pages_start_incoherent_list);
+
+/**
+ * iommu_pages_stop_incoherent_list - Undo incoherence across a list
+ * @list: The list of pages to release
+ * @dma_dev: The iommu device
+ *
+ * Revert iommu_pages_start_incoherent() across all of the list. Pages that did
+ * not call or succeed iommu_pages_start_incoherent() will be ignored.
+ */
+#if IOMMU_PAGES_USE_DMA_API
+void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev)
+{
+ struct ioptdesc *cur;
+
+ list_for_each_entry(cur, &list->pages, iopt_freelist_elm) {
+ struct folio *folio = ioptdesc_folio(cur);
+
+ if (!cur->incoherent)
+ continue;
+ dma_unmap_single(dma_dev, virt_to_phys(folio_address(folio)),
+ ioptdesc_mem_size(cur), DMA_TO_DEVICE);
+ cur->incoherent = 0;
+ }
+}
+EXPORT_SYMBOL_GPL(iommu_pages_stop_incoherent_list);
+
+/**
+ * iommu_pages_free_incoherent - Free an incoherent page
+ * @virt: virtual address of the page to be freed.
+ * @dma_dev: The iommu device
+ *
+ * If the page is incoherent it made coherent again then freed.
+ */
+void iommu_pages_free_incoherent(void *virt, struct device *dma_dev)
+{
+ struct ioptdesc *iopt = virt_to_ioptdesc(virt);
+
+ if (iopt->incoherent) {
+ dma_unmap_single(dma_dev, virt_to_phys(virt),
+ ioptdesc_mem_size(iopt), DMA_TO_DEVICE);
+ iopt->incoherent = 0;
+ }
+ __iommu_free_desc(iopt);
+}
+EXPORT_SYMBOL_GPL(iommu_pages_free_incoherent);
+#endif
diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h
index 82ebf0033081..ae9da4f571f6 100644
--- a/drivers/iommu/iommu-pages.h
+++ b/drivers/iommu/iommu-pages.h
@@ -7,180 +7,142 @@
#ifndef __IOMMU_PAGES_H
#define __IOMMU_PAGES_H
-#include <linux/vmstat.h>
-#include <linux/gfp.h>
-#include <linux/mm.h>
-
-/*
- * All page allocations that should be reported to as "iommu-pagetables" to
- * userspace must use one of the functions below. This includes allocations of
- * page-tables and other per-iommu_domain configuration structures.
- *
- * This is necessary for the proper accounting as IOMMU state can be rather
- * large, i.e. multiple gigabytes in size.
- */
+#include <linux/iommu.h>
/**
- * __iommu_alloc_account - account for newly allocated page.
- * @page: head struct page of the page.
- * @order: order of the page
+ * struct ioptdesc - Memory descriptor for IOMMU page tables
+ * @iopt_freelist_elm: List element for a struct iommu_pages_list
+ *
+ * This struct overlays struct page for now. Do not modify without a good
+ * understanding of the issues.
*/
-static inline void __iommu_alloc_account(struct page *page, int order)
+struct ioptdesc {
+ unsigned long __page_flags;
+
+ struct list_head iopt_freelist_elm;
+ unsigned long __page_mapping;
+ union {
+ u8 incoherent;
+ pgoff_t __index;
+ };
+ void *_private;
+
+ unsigned int __page_type;
+ atomic_t __page_refcount;
+#ifdef CONFIG_MEMCG
+ unsigned long memcg_data;
+#endif
+};
+
+static inline struct ioptdesc *folio_ioptdesc(struct folio *folio)
{
- const long pgcnt = 1l << order;
-
- mod_node_page_state(page_pgdat(page), NR_IOMMU_PAGES, pgcnt);
- mod_lruvec_page_state(page, NR_SECONDARY_PAGETABLE, pgcnt);
+ return (struct ioptdesc *)folio;
}
-/**
- * __iommu_free_account - account a page that is about to be freed.
- * @page: head struct page of the page.
- * @order: order of the page
- */
-static inline void __iommu_free_account(struct page *page, int order)
+static inline struct folio *ioptdesc_folio(struct ioptdesc *iopt)
{
- const long pgcnt = 1l << order;
-
- mod_node_page_state(page_pgdat(page), NR_IOMMU_PAGES, -pgcnt);
- mod_lruvec_page_state(page, NR_SECONDARY_PAGETABLE, -pgcnt);
+ return (struct folio *)iopt;
}
-/**
- * __iommu_alloc_pages - allocate a zeroed page of a given order.
- * @gfp: buddy allocator flags
- * @order: page order
- *
- * returns the head struct page of the allocated page.
- */
-static inline struct page *__iommu_alloc_pages(gfp_t gfp, int order)
+static inline struct ioptdesc *virt_to_ioptdesc(void *virt)
{
- struct page *page;
-
- page = alloc_pages(gfp | __GFP_ZERO, order);
- if (unlikely(!page))
- return NULL;
-
- __iommu_alloc_account(page, order);
-
- return page;
+ return folio_ioptdesc(virt_to_folio(virt));
}
+void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size);
+void iommu_free_pages(void *virt);
+void iommu_put_pages_list(struct iommu_pages_list *list);
+
/**
- * __iommu_free_pages - free page of a given order
- * @page: head struct page of the page
- * @order: page order
+ * iommu_pages_list_add - add the page to a iommu_pages_list
+ * @list: List to add the page to
+ * @virt: Address returned from iommu_alloc_pages_node_sz()
*/
-static inline void __iommu_free_pages(struct page *page, int order)
+static inline void iommu_pages_list_add(struct iommu_pages_list *list,
+ void *virt)
{
- if (!page)
- return;
-
- __iommu_free_account(page, order);
- __free_pages(page, order);
+ list_add_tail(&virt_to_ioptdesc(virt)->iopt_freelist_elm, &list->pages);
}
/**
- * iommu_alloc_pages_node - allocate a zeroed page of a given order from
- * specific NUMA node.
- * @nid: memory NUMA node id
- * @gfp: buddy allocator flags
- * @order: page order
+ * iommu_pages_list_splice - Put all the pages in list from into list to
+ * @from: Source list of pages
+ * @to: Destination list of pages
*
- * returns the virtual address of the allocated page
+ * from must be re-initialized after calling this function if it is to be
+ * used again.
*/
-static inline void *iommu_alloc_pages_node(int nid, gfp_t gfp, int order)
+static inline void iommu_pages_list_splice(struct iommu_pages_list *from,
+ struct iommu_pages_list *to)
{
- struct page *page = alloc_pages_node(nid, gfp | __GFP_ZERO, order);
-
- if (unlikely(!page))
- return NULL;
-
- __iommu_alloc_account(page, order);
-
- return page_address(page);
+ list_splice(&from->pages, &to->pages);
}
/**
- * iommu_alloc_pages - allocate a zeroed page of a given order
- * @gfp: buddy allocator flags
- * @order: page order
- *
- * returns the virtual address of the allocated page
+ * iommu_pages_list_empty - True if the list is empty
+ * @list: List to check
*/
-static inline void *iommu_alloc_pages(gfp_t gfp, int order)
+static inline bool iommu_pages_list_empty(struct iommu_pages_list *list)
{
- struct page *page = __iommu_alloc_pages(gfp, order);
-
- if (unlikely(!page))
- return NULL;
-
- return page_address(page);
+ return list_empty(&list->pages);
}
/**
- * iommu_alloc_page_node - allocate a zeroed page at specific NUMA node.
+ * iommu_alloc_pages_sz - Allocate a zeroed page of a given size from
+ * specific NUMA node
* @nid: memory NUMA node id
* @gfp: buddy allocator flags
+ * @size: Memory size to allocate, this is rounded up to a power of 2
*
- * returns the virtual address of the allocated page
+ * Returns the virtual address of the allocated page.
*/
-static inline void *iommu_alloc_page_node(int nid, gfp_t gfp)
+static inline void *iommu_alloc_pages_sz(gfp_t gfp, size_t size)
{
- return iommu_alloc_pages_node(nid, gfp, 0);
+ return iommu_alloc_pages_node_sz(NUMA_NO_NODE, gfp, size);
}
-/**
- * iommu_alloc_page - allocate a zeroed page
- * @gfp: buddy allocator flags
- *
- * returns the virtual address of the allocated page
- */
-static inline void *iommu_alloc_page(gfp_t gfp)
+int iommu_pages_start_incoherent(void *virt, struct device *dma_dev);
+int iommu_pages_start_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev);
+
+#ifdef CONFIG_X86
+#define IOMMU_PAGES_USE_DMA_API 0
+#include <linux/cacheflush.h>
+
+static inline void iommu_pages_flush_incoherent(struct device *dma_dev,
+ void *virt, size_t offset,
+ size_t len)
{
- return iommu_alloc_pages(gfp, 0);
+ clflush_cache_range(virt + offset, len);
}
-
-/**
- * iommu_free_pages - free page of a given order
- * @virt: virtual address of the page to be freed.
- * @order: page order
- */
-static inline void iommu_free_pages(void *virt, int order)
+static inline void
+iommu_pages_stop_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev)
{
- if (!virt)
- return;
-
- __iommu_free_pages(virt_to_page(virt), order);
+ /*
+ * For performance leave the incoherent flag alone which turns this into
+ * a NOP. For X86 the rest of the stop/free flow ignores the flag.
+ */
}
-
-/**
- * iommu_free_page - free page
- * @virt: virtual address of the page to be freed.
- */
-static inline void iommu_free_page(void *virt)
+static inline void iommu_pages_free_incoherent(void *virt,
+ struct device *dma_dev)
{
- iommu_free_pages(virt, 0);
+ iommu_free_pages(virt);
}
+#else
+#define IOMMU_PAGES_USE_DMA_API 1
+#include <linux/dma-mapping.h>
-/**
- * iommu_put_pages_list - free a list of pages.
- * @page: the head of the lru list to be freed.
- *
- * There are no locking requirement for these pages, as they are going to be
- * put on a free list as soon as refcount reaches 0. Pages are put on this LRU
- * list once they are removed from the IOMMU page tables. However, they can
- * still be access through debugfs.
- */
-static inline void iommu_put_pages_list(struct list_head *page)
+static inline void iommu_pages_flush_incoherent(struct device *dma_dev,
+ void *virt, size_t offset,
+ size_t len)
{
- while (!list_empty(page)) {
- struct page *p = list_entry(page->prev, struct page, lru);
-
- list_del(&p->lru);
- __iommu_free_account(p, 0);
- put_page(p);
- }
+ dma_sync_single_for_device(dma_dev, (uintptr_t)virt + offset, len,
+ DMA_TO_DEVICE);
}
+void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list,
+ struct device *dma_dev);
+void iommu_pages_free_incoherent(void *virt, struct device *dma_dev);
+#endif
-#endif /* __IOMMU_PAGES_H */
+#endif /* __IOMMU_PAGES_H */
diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h
index 5f731d994803..c95394cd03a7 100644
--- a/drivers/iommu/iommu-priv.h
+++ b/drivers/iommu/iommu-priv.h
@@ -5,6 +5,7 @@
#define __LINUX_IOMMU_PRIV_H
#include <linux/iommu.h>
+#include <linux/msi.h>
static inline const struct iommu_ops *dev_iommu_ops(struct device *dev)
{
@@ -17,8 +18,16 @@ static inline const struct iommu_ops *dev_iommu_ops(struct device *dev)
return dev->iommu->iommu_dev->ops;
}
-int iommu_group_replace_domain(struct iommu_group *group,
- struct iommu_domain *new_domain);
+void dev_iommu_free(struct device *dev);
+
+const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode);
+
+static inline const struct iommu_ops *iommu_fwspec_ops(struct iommu_fwspec *fwspec)
+{
+ return iommu_ops_from_fwnode(fwspec ? fwspec->iommu_fwnode : NULL);
+}
+
+void iommu_fwspec_free(struct device *dev);
int iommu_device_register_bus(struct iommu_device *iommu,
const struct iommu_ops *ops,
@@ -28,4 +37,32 @@ void iommu_device_unregister_bus(struct iommu_device *iommu,
const struct bus_type *bus,
struct notifier_block *nb);
+int iommu_mock_device_add(struct device *dev, struct iommu_device *iommu);
+
+struct iommu_attach_handle *iommu_attach_handle_get(struct iommu_group *group,
+ ioasid_t pasid,
+ unsigned int type);
+int iommu_attach_group_handle(struct iommu_domain *domain,
+ struct iommu_group *group,
+ struct iommu_attach_handle *handle);
+void iommu_detach_group_handle(struct iommu_domain *domain,
+ struct iommu_group *group);
+int iommu_replace_group_handle(struct iommu_group *group,
+ struct iommu_domain *new_domain,
+ struct iommu_attach_handle *handle);
+
+#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER_CORE) && IS_ENABLED(CONFIG_IRQ_MSI_IOMMU)
+int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
+ phys_addr_t msi_addr);
+#else /* !CONFIG_IOMMUFD_DRIVER_CORE || !CONFIG_IRQ_MSI_IOMMU */
+static inline int iommufd_sw_msi(struct iommu_domain *domain,
+ struct msi_desc *desc, phys_addr_t msi_addr)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_IOMMUFD_DRIVER_CORE && CONFIG_IRQ_MSI_IOMMU */
+
+int iommu_replace_device_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_attach_handle *handle);
#endif /* __LINUX_IOMMU_PRIV_H */
diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index 18a35e798b72..d236aef80a8d 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -10,6 +10,10 @@
#include "iommu-priv.h"
static DEFINE_MUTEX(iommu_sva_lock);
+static bool iommu_sva_present;
+static LIST_HEAD(iommu_sva_mms);
+static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
+ struct mm_struct *mm);
/* Allocate a PASID for the mm within range (inclusive) */
static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct device *dev)
@@ -40,8 +44,8 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de
return ERR_PTR(-ENOSPC);
}
iommu_mm->pasid = pasid;
+ iommu_mm->mm = mm;
INIT_LIST_HEAD(&iommu_mm->sva_domains);
- INIT_LIST_HEAD(&iommu_mm->sva_handles);
/*
* Make sure the write to mm->iommu_mm is not reordered in front of
* initialization to iommu_mm fields. If it does, readers may see a
@@ -62,18 +66,20 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de
* reference is taken. Caller must call iommu_sva_unbind_device()
* to release each reference.
*
- * iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA) must be called first, to
- * initialize the required SVA features.
- *
* On error, returns an ERR_PTR value.
*/
struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm)
{
+ struct iommu_group *group = dev->iommu_group;
+ struct iommu_attach_handle *attach_handle;
struct iommu_mm_data *iommu_mm;
struct iommu_domain *domain;
struct iommu_sva *handle;
int ret;
+ if (!group)
+ return ERR_PTR(-ENODEV);
+
mutex_lock(&iommu_sva_lock);
/* Allocate mm->pasid if necessary. */
@@ -83,12 +89,22 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm
goto out_unlock;
}
- list_for_each_entry(handle, &mm->iommu_mm->sva_handles, handle_item) {
- if (handle->dev == dev) {
- refcount_inc(&handle->users);
- mutex_unlock(&iommu_sva_lock);
- return handle;
+ /* A bond already exists, just take a reference`. */
+ attach_handle = iommu_attach_handle_get(group, iommu_mm->pasid, IOMMU_DOMAIN_SVA);
+ if (!IS_ERR(attach_handle)) {
+ handle = container_of(attach_handle, struct iommu_sva, handle);
+ if (attach_handle->domain->mm != mm) {
+ ret = -EBUSY;
+ goto out_unlock;
}
+ refcount_inc(&handle->users);
+ mutex_unlock(&iommu_sva_lock);
+ return handle;
+ }
+
+ if (PTR_ERR(attach_handle) != -ENOENT) {
+ ret = PTR_ERR(attach_handle);
+ goto out_unlock;
}
handle = kzalloc(sizeof(*handle), GFP_KERNEL);
@@ -99,7 +115,8 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm
/* Search for an existing domain. */
list_for_each_entry(domain, &mm->iommu_mm->sva_domains, next) {
- ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid);
+ ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid,
+ &handle->handle);
if (!ret) {
domain->users++;
goto out;
@@ -113,18 +130,22 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm
goto out_free_handle;
}
- ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid);
+ ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid,
+ &handle->handle);
if (ret)
goto out_free_domain;
domain->users = 1;
- list_add(&domain->next, &mm->iommu_mm->sva_domains);
+ if (list_empty(&iommu_mm->sva_domains)) {
+ if (list_empty(&iommu_sva_mms))
+ iommu_sva_present = true;
+ list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms);
+ }
+ list_add(&domain->next, &iommu_mm->sva_domains);
out:
refcount_set(&handle->users, 1);
- list_add(&handle->handle_item, &mm->iommu_mm->sva_handles);
mutex_unlock(&iommu_sva_lock);
handle->dev = dev;
- handle->domain = domain;
return handle;
out_free_domain:
@@ -147,7 +168,7 @@ EXPORT_SYMBOL_GPL(iommu_sva_bind_device);
*/
void iommu_sva_unbind_device(struct iommu_sva *handle)
{
- struct iommu_domain *domain = handle->domain;
+ struct iommu_domain *domain = handle->handle.domain;
struct iommu_mm_data *iommu_mm = domain->mm->iommu_mm;
struct device *dev = handle->dev;
@@ -156,13 +177,19 @@ void iommu_sva_unbind_device(struct iommu_sva *handle)
mutex_unlock(&iommu_sva_lock);
return;
}
- list_del(&handle->handle_item);
iommu_detach_device_pasid(domain, dev, iommu_mm->pasid);
if (--domain->users == 0) {
list_del(&domain->next);
iommu_domain_free(domain);
}
+
+ if (list_empty(&iommu_mm->sva_domains)) {
+ list_del(&iommu_mm->mm_list_elm);
+ if (list_empty(&iommu_sva_mms))
+ iommu_sva_present = false;
+ }
+
mutex_unlock(&iommu_sva_lock);
kfree(handle);
}
@@ -170,7 +197,7 @@ EXPORT_SYMBOL_GPL(iommu_sva_unbind_device);
u32 iommu_sva_get_pasid(struct iommu_sva *handle)
{
- struct iommu_domain *domain = handle->domain;
+ struct iommu_domain *domain = handle->handle.domain;
return mm_get_enqcmd_pasid(domain->mm);
}
@@ -259,7 +286,8 @@ static void iommu_sva_handle_iopf(struct work_struct *work)
if (status != IOMMU_PAGE_RESP_SUCCESS)
break;
- status = iommu_sva_handle_mm(&iopf->fault, group->domain->mm);
+ status = iommu_sva_handle_mm(&iopf->fault,
+ group->attach_handle->domain->mm);
}
iopf_group_response(group, status);
@@ -277,23 +305,21 @@ static int iommu_sva_iopf_handler(struct iopf_group *group)
return 0;
}
-struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
- struct mm_struct *mm)
+static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
+ struct mm_struct *mm)
{
const struct iommu_ops *ops = dev_iommu_ops(dev);
struct iommu_domain *domain;
- if (ops->domain_alloc_sva) {
- domain = ops->domain_alloc_sva(dev, mm);
- if (IS_ERR(domain))
- return domain;
- } else {
- domain = ops->domain_alloc(IOMMU_DOMAIN_SVA);
- if (!domain)
- return ERR_PTR(-ENOMEM);
- }
+ if (!ops->domain_alloc_sva)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ domain = ops->domain_alloc_sva(dev, mm);
+ if (IS_ERR(domain))
+ return domain;
domain->type = IOMMU_DOMAIN_SVA;
+ domain->cookie_type = IOMMU_COOKIE_SVA;
mmgrab(mm);
domain->mm = mm;
domain->owner = ops;
@@ -301,3 +327,15 @@ struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
return domain;
}
+
+void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
+{
+ struct iommu_mm_data *iommu_mm;
+
+ guard(mutex)(&iommu_sva_lock);
+ if (!iommu_sva_present)
+ return;
+
+ list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
+ mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
+}
diff --git a/drivers/iommu/iommu-sysfs.c b/drivers/iommu/iommu-sysfs.c
index cbe378c34ba3..170022c09536 100644
--- a/drivers/iommu/iommu-sysfs.c
+++ b/drivers/iommu/iommu-sysfs.c
@@ -34,7 +34,7 @@ static void release_device(struct device *dev)
kfree(dev);
}
-static struct class iommu_class = {
+static const struct class iommu_class = {
.name = "iommu",
.dev_release = release_device,
.dev_groups = dev_groups,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 9df7cc75c1bc..2ca990dfbb88 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -18,6 +18,7 @@
#include <linux/errno.h>
#include <linux/host1x_context_bus.h>
#include <linux/iommu.h>
+#include <linux/iommufd.h>
#include <linux/idr.h>
#include <linux/err.h>
#include <linux/pci.h>
@@ -32,6 +33,7 @@
#include <trace/events/iommu.h>
#include <linux/sched/mm.h>
#include <linux/msi.h>
+#include <uapi/linux/iommufd.h>
#include "dma-iommu.h"
#include "iommu-priv.h"
@@ -44,6 +46,9 @@ static unsigned int iommu_def_domain_type __read_mostly;
static bool iommu_dma_strict __read_mostly = IS_ENABLED(CONFIG_IOMMU_DEFAULT_DMA_STRICT);
static u32 iommu_cmd_line __read_mostly;
+/* Tags used with xa_tag_pointer() in group->pasid_array */
+enum { IOMMU_PASID_ARRAY_DOMAIN = 0, IOMMU_PASID_ARRAY_HANDLE = 1 };
+
struct iommu_group {
struct kobject kobj;
struct kobject *devices_kobj;
@@ -90,15 +95,17 @@ static const char * const iommu_group_resv_type_string[] = {
#define IOMMU_CMD_LINE_DMA_API BIT(0)
#define IOMMU_CMD_LINE_STRICT BIT(1)
+static int bus_iommu_probe(const struct bus_type *bus);
static int iommu_bus_notifier(struct notifier_block *nb,
unsigned long action, void *data);
static void iommu_release_device(struct device *dev);
-static struct iommu_domain *
-__iommu_group_domain_alloc(struct iommu_group *group, unsigned int type);
static int __iommu_attach_device(struct iommu_domain *domain,
- struct device *dev);
+ struct device *dev, struct iommu_domain *old);
static int __iommu_attach_group(struct iommu_domain *domain,
struct iommu_group *group);
+static struct iommu_domain *__iommu_paging_domain_alloc_flags(struct device *dev,
+ unsigned int type,
+ unsigned int flags);
enum {
IOMMU_SET_DOMAIN_MUST_SUCCEED = 1 << 0,
@@ -107,6 +114,7 @@ enum {
static int __iommu_device_set_domain(struct iommu_group *group,
struct device *dev,
struct iommu_domain *new_domain,
+ struct iommu_domain *old_domain,
unsigned int flags);
static int __iommu_group_set_domain_internal(struct iommu_group *group,
struct iommu_domain *new_domain,
@@ -133,6 +141,8 @@ static struct group_device *iommu_group_alloc_device(struct iommu_group *group,
struct device *dev);
static void __iommu_group_free_device(struct iommu_group *group,
struct group_device *grp_dev);
+static void iommu_domain_init(struct iommu_domain *domain, unsigned int type,
+ const struct iommu_ops *ops);
#define IOMMU_GROUP_ATTR(_name, _mode, _show, _store) \
struct iommu_group_attribute iommu_group_attr_##_name = \
@@ -268,6 +278,8 @@ int iommu_device_register(struct iommu_device *iommu,
err = bus_iommu_probe(iommu_buses[i]);
if (err)
iommu_device_unregister(iommu);
+ else
+ WRITE_ONCE(iommu->ready, true);
return err;
}
EXPORT_SYMBOL_GPL(iommu_device_register);
@@ -293,6 +305,7 @@ void iommu_device_unregister_bus(struct iommu_device *iommu,
struct notifier_block *nb)
{
bus_unregister_notifier(bus, nb);
+ fwnode_remove_software_node(iommu->fwnode);
iommu_device_unregister(iommu);
}
EXPORT_SYMBOL_GPL(iommu_device_unregister_bus);
@@ -315,6 +328,12 @@ int iommu_device_register_bus(struct iommu_device *iommu,
if (err)
return err;
+ iommu->fwnode = fwnode_create_software_node(NULL, NULL);
+ if (IS_ERR(iommu->fwnode)) {
+ bus_unregister_notifier(bus, nb);
+ return PTR_ERR(iommu->fwnode);
+ }
+
spin_lock(&iommu_device_lock);
list_add_tail(&iommu->list, &iommu_device_list);
spin_unlock(&iommu_device_lock);
@@ -324,9 +343,28 @@ int iommu_device_register_bus(struct iommu_device *iommu,
iommu_device_unregister_bus(iommu, bus, nb);
return err;
}
+ WRITE_ONCE(iommu->ready, true);
return 0;
}
EXPORT_SYMBOL_GPL(iommu_device_register_bus);
+
+int iommu_mock_device_add(struct device *dev, struct iommu_device *iommu)
+{
+ int rc;
+
+ mutex_lock(&iommu_probe_device_lock);
+ rc = iommu_fwspec_init(dev, iommu->fwnode);
+ mutex_unlock(&iommu_probe_device_lock);
+
+ if (rc)
+ return rc;
+
+ rc = device_add(dev);
+ if (rc)
+ iommu_fwspec_free(dev);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(iommu_mock_device_add);
#endif
static struct dev_iommu *dev_iommu_get(struct device *dev)
@@ -347,7 +385,7 @@ static struct dev_iommu *dev_iommu_get(struct device *dev)
return param;
}
-static void dev_iommu_free(struct device *dev)
+void dev_iommu_free(struct device *dev)
{
struct dev_iommu *param = dev->iommu;
@@ -399,14 +437,42 @@ EXPORT_SYMBOL_GPL(dev_iommu_priv_set);
* Init the dev->iommu and dev->iommu_group in the struct device and get the
* driver probed
*/
-static int iommu_init_device(struct device *dev, const struct iommu_ops *ops)
+static int iommu_init_device(struct device *dev)
{
+ const struct iommu_ops *ops;
struct iommu_device *iommu_dev;
struct iommu_group *group;
int ret;
if (!dev_iommu_get(dev))
return -ENOMEM;
+ /*
+ * For FDT-based systems and ACPI IORT/VIOT, the common firmware parsing
+ * is buried in the bus dma_configure path. Properly unpicking that is
+ * still a big job, so for now just invoke the whole thing. The device
+ * already having a driver bound means dma_configure has already run and
+ * found no IOMMU to wait for, so there's no point calling it again.
+ */
+ if (!dev->iommu->fwspec && !dev->driver && dev->bus->dma_configure) {
+ mutex_unlock(&iommu_probe_device_lock);
+ dev->bus->dma_configure(dev);
+ mutex_lock(&iommu_probe_device_lock);
+ /* If another instance finished the job for us, skip it */
+ if (!dev->iommu || dev->iommu_group)
+ return -ENODEV;
+ }
+ /*
+ * At this point, relevant devices either now have a fwspec which will
+ * match ops registered with a non-NULL fwnode, or we can reasonably
+ * assume that only one of Intel, AMD, s390, PAMU or legacy SMMUv2 can
+ * be present, and that any of their registered instances has suitable
+ * ops for probing, and thus cheekily co-opt the same mechanism.
+ */
+ ops = iommu_fwspec_ops(dev->iommu->fwspec);
+ if (!ops) {
+ ret = -ENODEV;
+ goto err_free;
+ }
if (!try_module_get(ops->owner)) {
ret = -EINVAL;
@@ -477,8 +543,21 @@ static void iommu_deinit_device(struct device *dev)
* Regardless, if a delayed attach never occurred, then the release
* should still avoid touching any hardware configuration either.
*/
- if (!dev->iommu->attach_deferred && ops->release_domain)
- ops->release_domain->ops->attach_dev(ops->release_domain, dev);
+ if (!dev->iommu->attach_deferred && ops->release_domain) {
+ struct iommu_domain *release_domain = ops->release_domain;
+
+ /*
+ * If the device requires direct mappings then it should not
+ * be parked on a BLOCKED domain during release as that would
+ * break the direct mappings.
+ */
+ if (dev->iommu->require_direct && ops->identity_domain &&
+ release_domain == ops->blocked_domain)
+ release_domain = ops->identity_domain;
+
+ release_domain->ops->attach_dev(release_domain, dev,
+ group->domain);
+ }
if (ops->release_device)
ops->release_device(dev);
@@ -503,35 +582,27 @@ static void iommu_deinit_device(struct device *dev)
dev->iommu_group = NULL;
module_put(ops->owner);
dev_iommu_free(dev);
+#ifdef CONFIG_IOMMU_DMA
+ dev->dma_iommu = false;
+#endif
+}
+
+static struct iommu_domain *pasid_array_entry_to_domain(void *entry)
+{
+ if (xa_pointer_tag(entry) == IOMMU_PASID_ARRAY_DOMAIN)
+ return xa_untag_pointer(entry);
+ return ((struct iommu_attach_handle *)xa_untag_pointer(entry))->domain;
}
DEFINE_MUTEX(iommu_probe_device_lock);
static int __iommu_probe_device(struct device *dev, struct list_head *group_list)
{
- const struct iommu_ops *ops;
- struct iommu_fwspec *fwspec;
struct iommu_group *group;
struct group_device *gdev;
int ret;
/*
- * For FDT-based systems and ACPI IORT/VIOT, drivers register IOMMU
- * instances with non-NULL fwnodes, and client devices should have been
- * identified with a fwspec by this point. Otherwise, we can currently
- * assume that only one of Intel, AMD, s390, PAMU or legacy SMMUv2 can
- * be present, and that any of their registered instances has suitable
- * ops for probing, and thus cheekily co-opt the same mechanism.
- */
- fwspec = dev_iommu_fwspec_get(dev);
- if (fwspec && fwspec->ops)
- ops = fwspec->ops;
- else
- ops = iommu_ops_from_fwnode(NULL);
-
- if (!ops)
- return -ENODEV;
- /*
* Serialise to avoid races between IOMMU drivers registering in
* parallel and/or the "replay" calls from ACPI/OF code via client
* driver probe. Once the latter have been cleaned up we should
@@ -544,9 +615,15 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list
if (dev->iommu_group)
return 0;
- ret = iommu_init_device(dev, ops);
+ ret = iommu_init_device(dev);
if (ret)
return ret;
+ /*
+ * And if we do now see any replay calls, they would indicate someone
+ * misusing the dma_configure path outside bus code.
+ */
+ if (dev->driver)
+ dev_WARN(dev, "late IOMMU probe at driver bind, something fishy here!\n");
group = dev->iommu_group;
gdev = iommu_group_alloc_device(group, dev);
@@ -565,7 +642,8 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list
if (group->default_domain)
iommu_create_device_direct_mappings(group->default_domain, dev);
if (group->domain) {
- ret = __iommu_device_set_domain(group, dev, group->domain, 0);
+ ret = __iommu_device_set_domain(group, dev, group->domain, NULL,
+ 0);
if (ret)
goto err_remove_gdev;
} else if (!group->default_domain && !group_list) {
@@ -1147,10 +1225,6 @@ map_end:
}
}
-
- if (!list_empty(&mappings) && iommu_is_dma_domain(domain))
- iommu_flush_iotlb_all(domain);
-
out:
iommu_put_resv_regions(dev, &mappings);
@@ -1592,12 +1666,57 @@ struct iommu_group *fsl_mc_device_group(struct device *dev)
}
EXPORT_SYMBOL_GPL(fsl_mc_device_group);
+static struct iommu_domain *__iommu_alloc_identity_domain(struct device *dev)
+{
+ const struct iommu_ops *ops = dev_iommu_ops(dev);
+ struct iommu_domain *domain;
+
+ if (ops->identity_domain)
+ return ops->identity_domain;
+
+ if (ops->domain_alloc_identity) {
+ domain = ops->domain_alloc_identity(dev);
+ if (IS_ERR(domain))
+ return domain;
+ } else {
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ iommu_domain_init(domain, IOMMU_DOMAIN_IDENTITY, ops);
+ return domain;
+}
+
static struct iommu_domain *
__iommu_group_alloc_default_domain(struct iommu_group *group, int req_type)
{
+ struct device *dev = iommu_group_first_dev(group);
+ struct iommu_domain *dom;
+
if (group->default_domain && group->default_domain->type == req_type)
return group->default_domain;
- return __iommu_group_domain_alloc(group, req_type);
+
+ /*
+ * When allocating the DMA API domain assume that the driver is going to
+ * use PASID and make sure the RID's domain is PASID compatible.
+ */
+ if (req_type & __IOMMU_DOMAIN_PAGING) {
+ dom = __iommu_paging_domain_alloc_flags(dev, req_type,
+ dev->iommu->max_pasids ? IOMMU_HWPT_ALLOC_PASID : 0);
+
+ /*
+ * If driver does not support PASID feature then
+ * try to allocate non-PASID domain
+ */
+ if (PTR_ERR(dom) == -EOPNOTSUPP)
+ dom = __iommu_paging_domain_alloc_flags(dev, req_type, 0);
+
+ return dom;
+ }
+
+ if (req_type == IOMMU_DOMAIN_IDENTITY)
+ return __iommu_alloc_identity_domain(dev);
+
+ return ERR_PTR(-EINVAL);
}
/*
@@ -1714,7 +1833,7 @@ static int iommu_get_def_domain_type(struct iommu_group *group,
group->id);
/*
- * Try to recover, drivers are allowed to force IDENITY or DMA, IDENTITY
+ * Try to recover, drivers are allowed to force IDENTITY or DMA, IDENTITY
* takes precedence.
*/
if (type == IOMMU_DOMAIN_IDENTITY)
@@ -1801,7 +1920,7 @@ static void iommu_group_do_probe_finalize(struct device *dev)
ops->probe_finalize(dev);
}
-int bus_iommu_probe(const struct bus_type *bus)
+static int bus_iommu_probe(const struct bus_type *bus)
{
struct iommu_group *group, *next;
LIST_HEAD(group_list);
@@ -1847,31 +1966,6 @@ int bus_iommu_probe(const struct bus_type *bus)
}
/**
- * iommu_present() - make platform-specific assumptions about an IOMMU
- * @bus: bus to check
- *
- * Do not use this function. You want device_iommu_mapped() instead.
- *
- * Return: true if some IOMMU is present and aware of devices on the given bus;
- * in general it may not be the only IOMMU, and it may not have anything to do
- * with whatever device you are ultimately interested in.
- */
-bool iommu_present(const struct bus_type *bus)
-{
- bool ret = false;
-
- for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) {
- if (iommu_buses[i] == bus) {
- spin_lock(&iommu_device_lock);
- ret = !list_empty(&iommu_device_list);
- spin_unlock(&iommu_device_lock);
- }
- }
- return ret;
-}
-EXPORT_SYMBOL_GPL(iommu_present);
-
-/**
* device_iommu_capable() - check for a general IOMMU capability
* @dev: device to which the capability would be relevant, if available
* @cap: IOMMU capability
@@ -1933,110 +2027,87 @@ void iommu_set_fault_handler(struct iommu_domain *domain,
iommu_fault_handler_t handler,
void *token)
{
- BUG_ON(!domain);
+ if (WARN_ON(!domain || domain->cookie_type != IOMMU_COOKIE_NONE))
+ return;
+ domain->cookie_type = IOMMU_COOKIE_FAULT_HANDLER;
domain->handler = handler;
domain->handler_token = token;
}
EXPORT_SYMBOL_GPL(iommu_set_fault_handler);
-static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops,
- struct device *dev,
- unsigned int type)
+static void iommu_domain_init(struct iommu_domain *domain, unsigned int type,
+ const struct iommu_ops *ops)
{
- struct iommu_domain *domain;
- unsigned int alloc_type = type & IOMMU_DOMAIN_ALLOC_FLAGS;
-
- if (alloc_type == IOMMU_DOMAIN_IDENTITY && ops->identity_domain)
- return ops->identity_domain;
- else if (alloc_type == IOMMU_DOMAIN_BLOCKED && ops->blocked_domain)
- return ops->blocked_domain;
- else if (type & __IOMMU_DOMAIN_PAGING && ops->domain_alloc_paging)
- domain = ops->domain_alloc_paging(dev);
- else if (ops->domain_alloc)
- domain = ops->domain_alloc(alloc_type);
- else
- return ERR_PTR(-EOPNOTSUPP);
-
- /*
- * Many domain_alloc ops now return ERR_PTR, make things easier for the
- * driver by accepting ERR_PTR from all domain_alloc ops instead of
- * having two rules.
- */
- if (IS_ERR(domain))
- return domain;
- if (!domain)
- return ERR_PTR(-ENOMEM);
-
domain->type = type;
domain->owner = ops;
- /*
- * If not already set, assume all sizes by default; the driver
- * may override this later
- */
- if (!domain->pgsize_bitmap)
- domain->pgsize_bitmap = ops->pgsize_bitmap;
-
if (!domain->ops)
domain->ops = ops->default_domain_ops;
-
- if (iommu_is_dma_domain(domain)) {
- int rc;
-
- rc = iommu_get_dma_cookie(domain);
- if (rc) {
- iommu_domain_free(domain);
- return ERR_PTR(rc);
- }
- }
- return domain;
}
static struct iommu_domain *
-__iommu_group_domain_alloc(struct iommu_group *group, unsigned int type)
+__iommu_paging_domain_alloc_flags(struct device *dev, unsigned int type,
+ unsigned int flags)
{
- struct device *dev = iommu_group_first_dev(group);
+ const struct iommu_ops *ops;
+ struct iommu_domain *domain;
- return __iommu_domain_alloc(dev_iommu_ops(dev), dev, type);
-}
+ if (!dev_has_iommu(dev))
+ return ERR_PTR(-ENODEV);
-static int __iommu_domain_alloc_dev(struct device *dev, void *data)
-{
- const struct iommu_ops **ops = data;
+ ops = dev_iommu_ops(dev);
- if (!dev_has_iommu(dev))
- return 0;
+ if (ops->domain_alloc_paging && !flags)
+ domain = ops->domain_alloc_paging(dev);
+ else if (ops->domain_alloc_paging_flags)
+ domain = ops->domain_alloc_paging_flags(dev, flags, NULL);
+#if IS_ENABLED(CONFIG_FSL_PAMU)
+ else if (ops->domain_alloc && !flags)
+ domain = ops->domain_alloc(IOMMU_DOMAIN_UNMANAGED);
+#endif
+ else
+ return ERR_PTR(-EOPNOTSUPP);
- if (WARN_ONCE(*ops && *ops != dev_iommu_ops(dev),
- "Multiple IOMMU drivers present for bus %s, which the public IOMMU API can't fully support yet. You will still need to disable one or more for this to work, sorry!\n",
- dev_bus_name(dev)))
- return -EBUSY;
+ if (IS_ERR(domain))
+ return domain;
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
- *ops = dev_iommu_ops(dev);
- return 0;
+ iommu_domain_init(domain, type, ops);
+ return domain;
}
-struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus)
+/**
+ * iommu_paging_domain_alloc_flags() - Allocate a paging domain
+ * @dev: device for which the domain is allocated
+ * @flags: Bitmap of iommufd_hwpt_alloc_flags
+ *
+ * Allocate a paging domain which will be managed by a kernel driver. Return
+ * allocated domain if successful, or an ERR pointer for failure.
+ */
+struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev,
+ unsigned int flags)
{
- const struct iommu_ops *ops = NULL;
- int err = bus_for_each_dev(bus, NULL, &ops, __iommu_domain_alloc_dev);
- struct iommu_domain *domain;
-
- if (err || !ops)
- return NULL;
-
- domain = __iommu_domain_alloc(ops, NULL, IOMMU_DOMAIN_UNMANAGED);
- if (IS_ERR(domain))
- return NULL;
- return domain;
+ return __iommu_paging_domain_alloc_flags(dev,
+ IOMMU_DOMAIN_UNMANAGED, flags);
}
-EXPORT_SYMBOL_GPL(iommu_domain_alloc);
+EXPORT_SYMBOL_GPL(iommu_paging_domain_alloc_flags);
void iommu_domain_free(struct iommu_domain *domain)
{
- if (domain->type == IOMMU_DOMAIN_SVA)
+ switch (domain->cookie_type) {
+ case IOMMU_COOKIE_DMA_IOVA:
+ iommu_put_dma_cookie(domain);
+ break;
+ case IOMMU_COOKIE_DMA_MSI:
+ iommu_put_msi_cookie(domain);
+ break;
+ case IOMMU_COOKIE_SVA:
mmdrop(domain->mm);
- iommu_put_dma_cookie(domain);
+ break;
+ default:
+ break;
+ }
if (domain->ops->free)
domain->ops->free(domain);
}
@@ -2059,14 +2130,14 @@ static void __iommu_group_set_core_domain(struct iommu_group *group)
}
static int __iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
int ret;
if (unlikely(domain->ops->attach_dev == NULL))
return -ENODEV;
- ret = domain->ops->attach_dev(domain, dev);
+ ret = domain->ops->attach_dev(domain, dev, old);
if (ret)
return ret;
dev->iommu->attach_deferred = 0;
@@ -2115,7 +2186,7 @@ EXPORT_SYMBOL_GPL(iommu_attach_device);
int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain)
{
if (dev->iommu && dev->iommu->attach_deferred)
- return __iommu_attach_device(domain, dev);
+ return __iommu_attach_device(domain, dev, NULL);
return 0;
}
@@ -2160,6 +2231,30 @@ struct iommu_domain *iommu_get_dma_domain(struct device *dev)
return dev->iommu_group->default_domain;
}
+static void *iommu_make_pasid_array_entry(struct iommu_domain *domain,
+ struct iommu_attach_handle *handle)
+{
+ if (handle) {
+ handle->domain = domain;
+ return xa_tag_pointer(handle, IOMMU_PASID_ARRAY_HANDLE);
+ }
+
+ return xa_tag_pointer(domain, IOMMU_PASID_ARRAY_DOMAIN);
+}
+
+static bool domain_iommu_ops_compatible(const struct iommu_ops *ops,
+ struct iommu_domain *domain)
+{
+ if (domain->owner == ops)
+ return true;
+
+ /* For static domains, owner isn't set. */
+ if (domain == ops->blocked_domain || domain == ops->identity_domain)
+ return true;
+
+ return false;
+}
+
static int __iommu_attach_group(struct iommu_domain *domain,
struct iommu_group *group)
{
@@ -2170,7 +2265,8 @@ static int __iommu_attach_group(struct iommu_domain *domain,
return -EBUSY;
dev = iommu_group_first_dev(group);
- if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain->owner)
+ if (!dev_has_iommu(dev) ||
+ !domain_iommu_ops_compatible(dev_iommu_ops(dev), domain))
return -EINVAL;
return __iommu_group_set_domain(group, domain);
@@ -2200,35 +2296,10 @@ int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group)
}
EXPORT_SYMBOL_GPL(iommu_attach_group);
-/**
- * iommu_group_replace_domain - replace the domain that a group is attached to
- * @new_domain: new IOMMU domain to replace with
- * @group: IOMMU group that will be attached to the new domain
- *
- * This API allows the group to switch domains without being forced to go to
- * the blocking domain in-between.
- *
- * If the currently attached domain is a core domain (e.g. a default_domain),
- * it will act just like the iommu_attach_group().
- */
-int iommu_group_replace_domain(struct iommu_group *group,
- struct iommu_domain *new_domain)
-{
- int ret;
-
- if (!new_domain)
- return -EINVAL;
-
- mutex_lock(&group->mutex);
- ret = __iommu_group_set_domain(group, new_domain);
- mutex_unlock(&group->mutex);
- return ret;
-}
-EXPORT_SYMBOL_NS_GPL(iommu_group_replace_domain, IOMMUFD_INTERNAL);
-
static int __iommu_device_set_domain(struct iommu_group *group,
struct device *dev,
struct iommu_domain *new_domain,
+ struct iommu_domain *old_domain,
unsigned int flags)
{
int ret;
@@ -2254,7 +2325,7 @@ static int __iommu_device_set_domain(struct iommu_group *group,
dev->iommu->attach_deferred = 0;
}
- ret = __iommu_attach_device(new_domain, dev);
+ ret = __iommu_attach_device(new_domain, dev, old_domain);
if (ret) {
/*
* If we have a blocking domain then try to attach that in hopes
@@ -2264,7 +2335,8 @@ static int __iommu_device_set_domain(struct iommu_group *group,
if ((flags & IOMMU_SET_DOMAIN_MUST_SUCCEED) &&
group->blocking_domain &&
group->blocking_domain != new_domain)
- __iommu_attach_device(group->blocking_domain, dev);
+ __iommu_attach_device(group->blocking_domain, dev,
+ old_domain);
return ret;
}
return 0;
@@ -2311,7 +2383,7 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group,
result = 0;
for_each_group_device(group, gdev) {
ret = __iommu_device_set_domain(group, gdev->dev, new_domain,
- flags);
+ group->domain, flags);
if (ret) {
result = ret;
/*
@@ -2336,6 +2408,9 @@ err_revert:
*/
last_gdev = gdev;
for_each_group_device(group, gdev) {
+ /* No need to revert the last gdev that failed to set domain */
+ if (gdev == last_gdev)
+ break;
/*
* A NULL domain can happen only for first probe, in which case
* we leave group->domain as NULL and let release clean
@@ -2343,10 +2418,8 @@ err_revert:
*/
if (group->domain)
WARN_ON(__iommu_device_set_domain(
- group, gdev->dev, group->domain,
+ group, gdev->dev, group->domain, new_domain,
IOMMU_SET_DOMAIN_MUST_SUCCEED));
- if (gdev == last_gdev)
- break;
}
return ret;
}
@@ -2377,6 +2450,7 @@ static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova,
unsigned int pgsize_idx, pgsize_idx_next;
unsigned long pgsizes;
size_t offset, pgsize, pgsize_next;
+ size_t offset_end;
unsigned long addr_merge = paddr | iova;
/* Page sizes supported by the hardware and small enough for @size */
@@ -2417,7 +2491,8 @@ static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova,
* If size is big enough to accommodate the larger page, reduce
* the number of smaller pages.
*/
- if (offset + pgsize_next <= size)
+ if (!check_add_overflow(offset, pgsize_next, &offset_end) &&
+ offset_end <= size)
size = offset;
out_set_count:
@@ -2425,8 +2500,8 @@ out_set_count:
return pgsize;
}
-static int __iommu_map(struct iommu_domain *domain, unsigned long iova,
- phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
{
const struct iommu_domain_ops *ops = domain->ops;
unsigned long orig_iova = iova;
@@ -2435,12 +2510,19 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova,
phys_addr_t orig_paddr = paddr;
int ret = 0;
+ might_sleep_if(gfpflags_allow_blocking(gfp));
+
if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING)))
return -EINVAL;
if (WARN_ON(!ops->map_pages || domain->pgsize_bitmap == 0UL))
return -ENODEV;
+ /* Discourage passing strange GFP flags */
+ if (WARN_ON_ONCE(gfp & (__GFP_COMP | __GFP_DMA | __GFP_DMA32 |
+ __GFP_HIGHMEM)))
+ return -EINVAL;
+
/* find out the minimum page size supported */
min_pagesz = 1 << __ffs(domain->pgsize_bitmap);
@@ -2488,31 +2570,27 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova,
return ret;
}
-int iommu_map(struct iommu_domain *domain, unsigned long iova,
- phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+int iommu_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size)
{
const struct iommu_domain_ops *ops = domain->ops;
- int ret;
-
- might_sleep_if(gfpflags_allow_blocking(gfp));
- /* Discourage passing strange GFP flags */
- if (WARN_ON_ONCE(gfp & (__GFP_COMP | __GFP_DMA | __GFP_DMA32 |
- __GFP_HIGHMEM)))
- return -EINVAL;
+ if (!ops->iotlb_sync_map)
+ return 0;
+ return ops->iotlb_sync_map(domain, iova, size);
+}
- ret = __iommu_map(domain, iova, paddr, size, prot, gfp);
- if (ret == 0 && ops->iotlb_sync_map) {
- ret = ops->iotlb_sync_map(domain, iova, size);
- if (ret)
- goto out_err;
- }
+int iommu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+ int ret;
- return ret;
+ ret = iommu_map_nosync(domain, iova, paddr, size, prot, gfp);
+ if (ret)
+ return ret;
-out_err:
- /* undo mappings already done */
- iommu_unmap(domain, iova, size);
+ ret = iommu_sync_map(domain, iova, size);
+ if (ret)
+ iommu_unmap(domain, iova, size);
return ret;
}
@@ -2572,6 +2650,20 @@ static size_t __iommu_unmap(struct iommu_domain *domain,
return unmapped;
}
+/**
+ * iommu_unmap() - Remove mappings from a range of IOVA
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @size: Length of the range starting from @iova
+ *
+ * iommu_unmap() will remove a translation created by iommu_map(). It cannot
+ * subdivide a mapping created by iommu_map(), so it should be called with IOVA
+ * ranges that match what was passed to iommu_map(). The range can aggregate
+ * contiguous iommu_map() calls so long as no individual range is split.
+ *
+ * Returns: Number of bytes of IOVA unmapped. iova + res will be the point
+ * unmapping stopped.
+ */
size_t iommu_unmap(struct iommu_domain *domain,
unsigned long iova, size_t size)
{
@@ -2586,6 +2678,25 @@ size_t iommu_unmap(struct iommu_domain *domain,
}
EXPORT_SYMBOL_GPL(iommu_unmap);
+/**
+ * iommu_unmap_fast() - Remove mappings from a range of IOVA without IOTLB sync
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @size: Length of the range starting from @iova
+ * @iotlb_gather: range information for a pending IOTLB flush
+ *
+ * iommu_unmap_fast() will remove a translation created by iommu_map().
+ * It can't subdivide a mapping created by iommu_map(), so it should be
+ * called with IOVA ranges that match what was passed to iommu_map(). The
+ * range can aggregate contiguous iommu_map() calls so long as no individual
+ * range is split.
+ *
+ * Basically iommu_unmap_fast() is the same as iommu_unmap() but for callers
+ * which manage the IOTLB flushing externally to perform a batched sync.
+ *
+ * Returns: Number of bytes of IOVA unmapped. iova + res will be the point
+ * unmapping stopped.
+ */
size_t iommu_unmap_fast(struct iommu_domain *domain,
unsigned long iova, size_t size,
struct iommu_iotlb_gather *iotlb_gather)
@@ -2598,26 +2709,17 @@ ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
struct scatterlist *sg, unsigned int nents, int prot,
gfp_t gfp)
{
- const struct iommu_domain_ops *ops = domain->ops;
size_t len = 0, mapped = 0;
phys_addr_t start;
unsigned int i = 0;
int ret;
- might_sleep_if(gfpflags_allow_blocking(gfp));
-
- /* Discourage passing strange GFP flags */
- if (WARN_ON_ONCE(gfp & (__GFP_COMP | __GFP_DMA | __GFP_DMA32 |
- __GFP_HIGHMEM)))
- return -EINVAL;
-
while (i <= nents) {
phys_addr_t s_phys = sg_phys(sg);
if (len && s_phys != start + len) {
- ret = __iommu_map(domain, iova + mapped, start,
+ ret = iommu_map_nosync(domain, iova + mapped, start,
len, prot, gfp);
-
if (ret)
goto out_err;
@@ -2640,11 +2742,10 @@ next:
sg = sg_next(sg);
}
- if (ops->iotlb_sync_map) {
- ret = ops->iotlb_sync_map(domain, iova, mapped);
- if (ret)
- goto out_err;
- }
+ ret = iommu_sync_map(domain, iova, mapped);
+ if (ret)
+ goto out_err;
+
return mapped;
out_err:
@@ -2688,7 +2789,8 @@ int report_iommu_fault(struct iommu_domain *domain, struct device *dev,
* if upper layers showed interest and installed a fault handler,
* invoke it.
*/
- if (domain->handler)
+ if (domain->cookie_type == IOMMU_COOKIE_FAULT_HANDLER &&
+ domain->handler)
ret = domain->handler(domain, dev, iova, flags,
domain->handler_token);
@@ -2709,16 +2811,6 @@ static int __init iommu_init(void)
}
core_initcall(iommu_init);
-int iommu_enable_nesting(struct iommu_domain *domain)
-{
- if (domain->type != IOMMU_DOMAIN_UNMANAGED)
- return -EINVAL;
- if (!domain->ops->enable_nesting)
- return -EINVAL;
- return domain->ops->enable_nesting(domain);
-}
-EXPORT_SYMBOL_GPL(iommu_enable_nesting);
-
int iommu_set_pgtable_quirks(struct iommu_domain *domain,
unsigned long quirk)
{
@@ -2807,28 +2899,39 @@ bool iommu_default_passthrough(void)
}
EXPORT_SYMBOL_GPL(iommu_default_passthrough);
-const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode)
+static const struct iommu_device *iommu_from_fwnode(const struct fwnode_handle *fwnode)
{
- const struct iommu_ops *ops = NULL;
- struct iommu_device *iommu;
+ const struct iommu_device *iommu, *ret = NULL;
spin_lock(&iommu_device_lock);
list_for_each_entry(iommu, &iommu_device_list, list)
if (iommu->fwnode == fwnode) {
- ops = iommu->ops;
+ ret = iommu;
break;
}
spin_unlock(&iommu_device_lock);
- return ops;
+ return ret;
}
-int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
- const struct iommu_ops *ops)
+const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode)
{
+ const struct iommu_device *iommu = iommu_from_fwnode(fwnode);
+
+ return iommu ? iommu->ops : NULL;
+}
+
+int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode)
+{
+ const struct iommu_device *iommu = iommu_from_fwnode(iommu_fwnode);
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ if (!iommu)
+ return driver_deferred_probe_check_state(dev);
+ if (!dev->iommu && !READ_ONCE(iommu->ready))
+ return -EPROBE_DEFER;
+
if (fwspec)
- return ops == fwspec->ops ? 0 : -EINVAL;
+ return iommu->ops == iommu_fwspec_ops(fwspec) ? 0 : -EINVAL;
if (!dev_iommu_get(dev))
return -ENOMEM;
@@ -2838,9 +2941,8 @@ int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
if (!fwspec)
return -ENOMEM;
- of_node_get(to_of_node(iommu_fwnode));
+ fwnode_handle_get(iommu_fwnode);
fwspec->iommu_fwnode = iommu_fwnode;
- fwspec->ops = ops;
dev_iommu_fwspec_set(dev, fwspec);
return 0;
}
@@ -2856,7 +2958,6 @@ void iommu_fwspec_free(struct device *dev)
dev_iommu_fwspec_set(dev, NULL);
}
}
-EXPORT_SYMBOL_GPL(iommu_fwspec_free);
int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids)
{
@@ -2884,38 +2985,6 @@ int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids)
}
EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids);
-/*
- * Per device IOMMU features.
- */
-int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat)
-{
- if (dev_has_iommu(dev)) {
- const struct iommu_ops *ops = dev_iommu_ops(dev);
-
- if (ops->dev_enable_feat)
- return ops->dev_enable_feat(dev, feat);
- }
-
- return -ENODEV;
-}
-EXPORT_SYMBOL_GPL(iommu_dev_enable_feature);
-
-/*
- * The device drivers should do the necessary cleanups before calling this.
- */
-int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat)
-{
- if (dev_has_iommu(dev)) {
- const struct iommu_ops *ops = dev_iommu_ops(dev);
-
- if (ops->dev_disable_feat)
- return ops->dev_disable_feat(dev, feat);
- }
-
- return -EBUSY;
-}
-EXPORT_SYMBOL_GPL(iommu_dev_disable_feature);
-
/**
* iommu_setup_default_domain - Set the default_domain for the group
* @group: Group to change
@@ -2949,6 +3018,14 @@ static int iommu_setup_default_domain(struct iommu_group *group,
if (group->default_domain == dom)
return 0;
+ if (iommu_is_dma_domain(dom)) {
+ ret = iommu_get_dma_cookie(dom);
+ if (ret) {
+ iommu_domain_free(dom);
+ return ret;
+ }
+ }
+
/*
* IOMMU_RESV_DIRECT and IOMMU_RESV_DIRECT_RELAXABLE regions must be
* mapped before their device is attached, in order to guarantee
@@ -3096,6 +3173,11 @@ int iommu_device_use_default_domain(struct device *dev)
return 0;
mutex_lock(&group->mutex);
+ /* We may race against bus_iommu_probe() finalising groups here */
+ if (!group->default_domain) {
+ ret = -EPROBE_DEFER;
+ goto unlock_out;
+ }
if (group->owner_cnt) {
if (group->domain != group->default_domain || group->owner ||
!xa_empty(&group->pasid_array)) {
@@ -3136,22 +3218,25 @@ void iommu_device_unuse_default_domain(struct device *dev)
static int __iommu_group_alloc_blocking_domain(struct iommu_group *group)
{
+ struct device *dev = iommu_group_first_dev(group);
+ const struct iommu_ops *ops = dev_iommu_ops(dev);
struct iommu_domain *domain;
if (group->blocking_domain)
return 0;
- domain = __iommu_group_domain_alloc(group, IOMMU_DOMAIN_BLOCKED);
- if (IS_ERR(domain)) {
- /*
- * For drivers that do not yet understand IOMMU_DOMAIN_BLOCKED
- * create an empty domain instead.
- */
- domain = __iommu_group_domain_alloc(group,
- IOMMU_DOMAIN_UNMANAGED);
- if (IS_ERR(domain))
- return PTR_ERR(domain);
+ if (ops->blocked_domain) {
+ group->blocking_domain = ops->blocked_domain;
+ return 0;
}
+
+ /*
+ * For drivers that do not yet understand IOMMU_DOMAIN_BLOCKED create an
+ * empty PAGING domain instead.
+ */
+ domain = iommu_paging_domain_alloc(dev);
+ if (IS_ERR(domain))
+ return PTR_ERR(domain);
group->blocking_domain = domain;
return 0;
}
@@ -3308,16 +3393,30 @@ bool iommu_group_dma_owner_claimed(struct iommu_group *group)
}
EXPORT_SYMBOL_GPL(iommu_group_dma_owner_claimed);
+static void iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
+ struct iommu_domain *domain)
+{
+ const struct iommu_ops *ops = dev_iommu_ops(dev);
+ struct iommu_domain *blocked_domain = ops->blocked_domain;
+
+ WARN_ON(blocked_domain->ops->set_dev_pasid(blocked_domain,
+ dev, pasid, domain));
+}
+
static int __iommu_set_group_pasid(struct iommu_domain *domain,
- struct iommu_group *group, ioasid_t pasid)
+ struct iommu_group *group, ioasid_t pasid,
+ struct iommu_domain *old)
{
struct group_device *device, *last_gdev;
int ret;
for_each_group_device(group, device) {
- ret = domain->ops->set_dev_pasid(domain, device->dev, pasid);
- if (ret)
- goto err_revert;
+ if (device->dev->iommu->max_pasids > 0) {
+ ret = domain->ops->set_dev_pasid(domain, device->dev,
+ pasid, old);
+ if (ret)
+ goto err_revert;
+ }
}
return 0;
@@ -3325,11 +3424,20 @@ static int __iommu_set_group_pasid(struct iommu_domain *domain,
err_revert:
last_gdev = device;
for_each_group_device(group, device) {
- const struct iommu_ops *ops = dev_iommu_ops(device->dev);
-
if (device == last_gdev)
break;
- ops->remove_dev_pasid(device->dev, pasid, domain);
+ if (device->dev->iommu->max_pasids > 0) {
+ /*
+ * If no old domain, undo the succeeded devices/pasid.
+ * Otherwise, rollback the succeeded devices/pasid to
+ * the old domain. And it is a driver bug to fail
+ * attaching with a previously good domain.
+ */
+ if (!old ||
+ WARN_ON(old->ops->set_dev_pasid(old, device->dev,
+ pasid, domain)))
+ iommu_remove_dev_pasid(device->dev, pasid, domain);
+ }
}
return ret;
}
@@ -3339,11 +3447,10 @@ static void __iommu_remove_group_pasid(struct iommu_group *group,
struct iommu_domain *domain)
{
struct group_device *device;
- const struct iommu_ops *ops;
for_each_group_device(group, device) {
- ops = dev_iommu_ops(device->dev);
- ops->remove_dev_pasid(device->dev, pasid, domain);
+ if (device->dev->iommu->max_pasids > 0)
+ iommu_remove_dev_pasid(device->dev, pasid, domain);
}
}
@@ -3352,107 +3459,195 @@ static void __iommu_remove_group_pasid(struct iommu_group *group,
* @domain: the iommu domain.
* @dev: the attached device.
* @pasid: the pasid of the device.
+ * @handle: the attach handle.
+ *
+ * Caller should always provide a new handle to avoid race with the paths
+ * that have lockless reference to handle if it intends to pass a valid handle.
*
* Return: 0 on success, or an error.
*/
int iommu_attach_device_pasid(struct iommu_domain *domain,
- struct device *dev, ioasid_t pasid)
+ struct device *dev, ioasid_t pasid,
+ struct iommu_attach_handle *handle)
{
/* Caller must be a probed driver on dev */
struct iommu_group *group = dev->iommu_group;
struct group_device *device;
- void *curr;
+ const struct iommu_ops *ops;
+ void *entry;
int ret;
- if (!domain->ops->set_dev_pasid)
- return -EOPNOTSUPP;
-
if (!group)
return -ENODEV;
- if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain->owner ||
+ ops = dev_iommu_ops(dev);
+
+ if (!domain->ops->set_dev_pasid ||
+ !ops->blocked_domain ||
+ !ops->blocked_domain->ops->set_dev_pasid)
+ return -EOPNOTSUPP;
+
+ if (!domain_iommu_ops_compatible(ops, domain) ||
pasid == IOMMU_NO_PASID)
return -EINVAL;
mutex_lock(&group->mutex);
for_each_group_device(group, device) {
- if (pasid >= device->dev->iommu->max_pasids) {
+ /*
+ * Skip PASID validation for devices without PASID support
+ * (max_pasids = 0). These devices cannot issue transactions
+ * with PASID, so they don't affect group's PASID usage.
+ */
+ if ((device->dev->iommu->max_pasids > 0) &&
+ (pasid >= device->dev->iommu->max_pasids)) {
ret = -EINVAL;
goto out_unlock;
}
}
- curr = xa_cmpxchg(&group->pasid_array, pasid, NULL, domain, GFP_KERNEL);
- if (curr) {
- ret = xa_err(curr) ? : -EBUSY;
+ entry = iommu_make_pasid_array_entry(domain, handle);
+
+ /*
+ * Entry present is a failure case. Use xa_insert() instead of
+ * xa_reserve().
+ */
+ ret = xa_insert(&group->pasid_array, pasid, XA_ZERO_ENTRY, GFP_KERNEL);
+ if (ret)
+ goto out_unlock;
+
+ ret = __iommu_set_group_pasid(domain, group, pasid, NULL);
+ if (ret) {
+ xa_release(&group->pasid_array, pasid);
goto out_unlock;
}
- ret = __iommu_set_group_pasid(domain, group, pasid);
- if (ret)
- xa_erase(&group->pasid_array, pasid);
+ /*
+ * The xa_insert() above reserved the memory, and the group->mutex is
+ * held, this cannot fail. The new domain cannot be visible until the
+ * operation succeeds as we cannot tolerate PRIs becoming concurrently
+ * queued and then failing attach.
+ */
+ WARN_ON(xa_is_err(xa_store(&group->pasid_array,
+ pasid, entry, GFP_KERNEL)));
+
out_unlock:
mutex_unlock(&group->mutex);
return ret;
}
EXPORT_SYMBOL_GPL(iommu_attach_device_pasid);
-/*
- * iommu_detach_device_pasid() - Detach the domain from pasid of device
- * @domain: the iommu domain.
+/**
+ * iommu_replace_device_pasid - Replace the domain that a specific pasid
+ * of the device is attached to
+ * @domain: the new iommu domain
* @dev: the attached device.
* @pasid: the pasid of the device.
+ * @handle: the attach handle.
*
- * The @domain must have been attached to @pasid of the @dev with
- * iommu_attach_device_pasid().
+ * This API allows the pasid to switch domains. The @pasid should have been
+ * attached. Otherwise, this fails. The pasid will keep the old configuration
+ * if replacement failed.
+ *
+ * Caller should always provide a new handle to avoid race with the paths
+ * that have lockless reference to handle if it intends to pass a valid handle.
+ *
+ * Return 0 on success, or an error.
*/
-void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev,
- ioasid_t pasid)
+int iommu_replace_device_pasid(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_attach_handle *handle)
{
/* Caller must be a probed driver on dev */
struct iommu_group *group = dev->iommu_group;
+ struct iommu_attach_handle *entry;
+ struct iommu_domain *curr_domain;
+ void *curr;
+ int ret;
+
+ if (!group)
+ return -ENODEV;
+
+ if (!domain->ops->set_dev_pasid)
+ return -EOPNOTSUPP;
+
+ if (!domain_iommu_ops_compatible(dev_iommu_ops(dev), domain) ||
+ pasid == IOMMU_NO_PASID || !handle)
+ return -EINVAL;
mutex_lock(&group->mutex);
- __iommu_remove_group_pasid(group, pasid, domain);
- WARN_ON(xa_erase(&group->pasid_array, pasid) != domain);
+ entry = iommu_make_pasid_array_entry(domain, handle);
+ curr = xa_cmpxchg(&group->pasid_array, pasid, NULL,
+ XA_ZERO_ENTRY, GFP_KERNEL);
+ if (xa_is_err(curr)) {
+ ret = xa_err(curr);
+ goto out_unlock;
+ }
+
+ /*
+ * No domain (with or without handle) attached, hence not
+ * a replace case.
+ */
+ if (!curr) {
+ xa_release(&group->pasid_array, pasid);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /*
+ * Reusing handle is problematic as there are paths that refers
+ * the handle without lock. To avoid race, reject the callers that
+ * attempt it.
+ */
+ if (curr == entry) {
+ WARN_ON(1);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ curr_domain = pasid_array_entry_to_domain(curr);
+ ret = 0;
+
+ if (curr_domain != domain) {
+ ret = __iommu_set_group_pasid(domain, group,
+ pasid, curr_domain);
+ if (ret)
+ goto out_unlock;
+ }
+
+ /*
+ * The above xa_cmpxchg() reserved the memory, and the
+ * group->mutex is held, this cannot fail.
+ */
+ WARN_ON(xa_is_err(xa_store(&group->pasid_array,
+ pasid, entry, GFP_KERNEL)));
+
+out_unlock:
mutex_unlock(&group->mutex);
+ return ret;
}
-EXPORT_SYMBOL_GPL(iommu_detach_device_pasid);
+EXPORT_SYMBOL_NS_GPL(iommu_replace_device_pasid, "IOMMUFD_INTERNAL");
/*
- * iommu_get_domain_for_dev_pasid() - Retrieve domain for @pasid of @dev
- * @dev: the queried device
- * @pasid: the pasid of the device
- * @type: matched domain type, 0 for any match
- *
- * This is a variant of iommu_get_domain_for_dev(). It returns the existing
- * domain attached to pasid of a device. Callers must hold a lock around this
- * function, and both iommu_attach/detach_dev_pasid() whenever a domain of
- * type is being manipulated. This API does not internally resolve races with
- * attach/detach.
+ * iommu_detach_device_pasid() - Detach the domain from pasid of device
+ * @domain: the iommu domain.
+ * @dev: the attached device.
+ * @pasid: the pasid of the device.
*
- * Return: attached domain on success, NULL otherwise.
+ * The @domain must have been attached to @pasid of the @dev with
+ * iommu_attach_device_pasid().
*/
-struct iommu_domain *iommu_get_domain_for_dev_pasid(struct device *dev,
- ioasid_t pasid,
- unsigned int type)
+void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev,
+ ioasid_t pasid)
{
/* Caller must be a probed driver on dev */
struct iommu_group *group = dev->iommu_group;
- struct iommu_domain *domain;
-
- if (!group)
- return NULL;
-
- xa_lock(&group->pasid_array);
- domain = xa_load(&group->pasid_array, pasid);
- if (type && domain && domain->type != type)
- domain = ERR_PTR(-EBUSY);
- xa_unlock(&group->pasid_array);
- return domain;
+ mutex_lock(&group->mutex);
+ __iommu_remove_group_pasid(group, pasid, domain);
+ xa_erase(&group->pasid_array, pasid);
+ mutex_unlock(&group->mutex);
}
-EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev_pasid);
+EXPORT_SYMBOL_GPL(iommu_detach_device_pasid);
ioasid_t iommu_alloc_global_pasid(struct device *dev)
{
@@ -3480,3 +3675,201 @@ void iommu_free_global_pasid(ioasid_t pasid)
ida_free(&iommu_global_pasid_ida, pasid);
}
EXPORT_SYMBOL_GPL(iommu_free_global_pasid);
+
+/**
+ * iommu_attach_handle_get - Return the attach handle
+ * @group: the iommu group that domain was attached to
+ * @pasid: the pasid within the group
+ * @type: matched domain type, 0 for any match
+ *
+ * Return handle or ERR_PTR(-ENOENT) on none, ERR_PTR(-EBUSY) on mismatch.
+ *
+ * Return the attach handle to the caller. The life cycle of an iommu attach
+ * handle is from the time when the domain is attached to the time when the
+ * domain is detached. Callers are required to synchronize the call of
+ * iommu_attach_handle_get() with domain attachment and detachment. The attach
+ * handle can only be used during its life cycle.
+ */
+struct iommu_attach_handle *
+iommu_attach_handle_get(struct iommu_group *group, ioasid_t pasid, unsigned int type)
+{
+ struct iommu_attach_handle *handle;
+ void *entry;
+
+ xa_lock(&group->pasid_array);
+ entry = xa_load(&group->pasid_array, pasid);
+ if (!entry || xa_pointer_tag(entry) != IOMMU_PASID_ARRAY_HANDLE) {
+ handle = ERR_PTR(-ENOENT);
+ } else {
+ handle = xa_untag_pointer(entry);
+ if (type && handle->domain->type != type)
+ handle = ERR_PTR(-EBUSY);
+ }
+ xa_unlock(&group->pasid_array);
+
+ return handle;
+}
+EXPORT_SYMBOL_NS_GPL(iommu_attach_handle_get, "IOMMUFD_INTERNAL");
+
+/**
+ * iommu_attach_group_handle - Attach an IOMMU domain to an IOMMU group
+ * @domain: IOMMU domain to attach
+ * @group: IOMMU group that will be attached
+ * @handle: attach handle
+ *
+ * Returns 0 on success and error code on failure.
+ *
+ * This is a variant of iommu_attach_group(). It allows the caller to provide
+ * an attach handle and use it when the domain is attached. This is currently
+ * used by IOMMUFD to deliver the I/O page faults.
+ *
+ * Caller should always provide a new handle to avoid race with the paths
+ * that have lockless reference to handle.
+ */
+int iommu_attach_group_handle(struct iommu_domain *domain,
+ struct iommu_group *group,
+ struct iommu_attach_handle *handle)
+{
+ void *entry;
+ int ret;
+
+ if (!handle)
+ return -EINVAL;
+
+ mutex_lock(&group->mutex);
+ entry = iommu_make_pasid_array_entry(domain, handle);
+ ret = xa_insert(&group->pasid_array,
+ IOMMU_NO_PASID, XA_ZERO_ENTRY, GFP_KERNEL);
+ if (ret)
+ goto out_unlock;
+
+ ret = __iommu_attach_group(domain, group);
+ if (ret) {
+ xa_release(&group->pasid_array, IOMMU_NO_PASID);
+ goto out_unlock;
+ }
+
+ /*
+ * The xa_insert() above reserved the memory, and the group->mutex is
+ * held, this cannot fail. The new domain cannot be visible until the
+ * operation succeeds as we cannot tolerate PRIs becoming concurrently
+ * queued and then failing attach.
+ */
+ WARN_ON(xa_is_err(xa_store(&group->pasid_array,
+ IOMMU_NO_PASID, entry, GFP_KERNEL)));
+
+out_unlock:
+ mutex_unlock(&group->mutex);
+ return ret;
+}
+EXPORT_SYMBOL_NS_GPL(iommu_attach_group_handle, "IOMMUFD_INTERNAL");
+
+/**
+ * iommu_detach_group_handle - Detach an IOMMU domain from an IOMMU group
+ * @domain: IOMMU domain to attach
+ * @group: IOMMU group that will be attached
+ *
+ * Detach the specified IOMMU domain from the specified IOMMU group.
+ * It must be used in conjunction with iommu_attach_group_handle().
+ */
+void iommu_detach_group_handle(struct iommu_domain *domain,
+ struct iommu_group *group)
+{
+ mutex_lock(&group->mutex);
+ __iommu_group_set_core_domain(group);
+ xa_erase(&group->pasid_array, IOMMU_NO_PASID);
+ mutex_unlock(&group->mutex);
+}
+EXPORT_SYMBOL_NS_GPL(iommu_detach_group_handle, "IOMMUFD_INTERNAL");
+
+/**
+ * iommu_replace_group_handle - replace the domain that a group is attached to
+ * @group: IOMMU group that will be attached to the new domain
+ * @new_domain: new IOMMU domain to replace with
+ * @handle: attach handle
+ *
+ * This API allows the group to switch domains without being forced to go to
+ * the blocking domain in-between. It allows the caller to provide an attach
+ * handle for the new domain and use it when the domain is attached.
+ *
+ * If the currently attached domain is a core domain (e.g. a default_domain),
+ * it will act just like the iommu_attach_group_handle().
+ *
+ * Caller should always provide a new handle to avoid race with the paths
+ * that have lockless reference to handle.
+ */
+int iommu_replace_group_handle(struct iommu_group *group,
+ struct iommu_domain *new_domain,
+ struct iommu_attach_handle *handle)
+{
+ void *curr, *entry;
+ int ret;
+
+ if (!new_domain || !handle)
+ return -EINVAL;
+
+ mutex_lock(&group->mutex);
+ entry = iommu_make_pasid_array_entry(new_domain, handle);
+ ret = xa_reserve(&group->pasid_array, IOMMU_NO_PASID, GFP_KERNEL);
+ if (ret)
+ goto err_unlock;
+
+ ret = __iommu_group_set_domain(group, new_domain);
+ if (ret)
+ goto err_release;
+
+ curr = xa_store(&group->pasid_array, IOMMU_NO_PASID, entry, GFP_KERNEL);
+ WARN_ON(xa_is_err(curr));
+
+ mutex_unlock(&group->mutex);
+
+ return 0;
+err_release:
+ xa_release(&group->pasid_array, IOMMU_NO_PASID);
+err_unlock:
+ mutex_unlock(&group->mutex);
+ return ret;
+}
+EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL");
+
+#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU)
+/**
+ * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain
+ * @desc: MSI descriptor, will store the MSI page
+ * @msi_addr: MSI target address to be mapped
+ *
+ * The implementation of sw_msi() should take msi_addr and map it to
+ * an IOVA in the domain and call msi_desc_set_iommu_msi_iova() with the
+ * mapping information.
+ *
+ * Return: 0 on success or negative error code if the mapping failed.
+ */
+int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
+{
+ struct device *dev = msi_desc_to_dev(desc);
+ struct iommu_group *group = dev->iommu_group;
+ int ret = 0;
+
+ if (!group)
+ return 0;
+
+ mutex_lock(&group->mutex);
+ /* An IDENTITY domain must pass through */
+ if (group->domain && group->domain->type != IOMMU_DOMAIN_IDENTITY) {
+ switch (group->domain->cookie_type) {
+ case IOMMU_COOKIE_DMA_MSI:
+ case IOMMU_COOKIE_DMA_IOVA:
+ ret = iommu_dma_sw_msi(group->domain, desc, msi_addr);
+ break;
+ case IOMMU_COOKIE_IOMMUFD:
+ ret = iommufd_sw_msi(group->domain, desc, msi_addr);
+ break;
+ default:
+ ret = -EOPNOTSUPP;
+ break;
+ }
+ }
+ mutex_unlock(&group->mutex);
+ return ret;
+}
+#endif /* CONFIG_IRQ_MSI_IOMMU */
diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig
index 76656fe0470d..eae3f03629b0 100644
--- a/drivers/iommu/iommufd/Kconfig
+++ b/drivers/iommu/iommufd/Kconfig
@@ -1,4 +1,8 @@
# SPDX-License-Identifier: GPL-2.0-only
+config IOMMUFD_DRIVER_CORE
+ bool
+ default (IOMMUFD_DRIVER || IOMMUFD) if IOMMUFD!=n
+
config IOMMUFD
tristate "IOMMU Userspace API"
select INTERVAL_TREE
@@ -37,6 +41,7 @@ config IOMMUFD_TEST
depends on DEBUG_KERNEL
depends on FAULT_INJECTION
depends on RUNTIME_TESTING_MENU
+ depends on IOMMU_PT_AMDV1
select IOMMUFD_DRIVER
default n
help
diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index 34b446146961..71d692c9a8f4 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -1,14 +1,19 @@
# SPDX-License-Identifier: GPL-2.0-only
iommufd-y := \
device.o \
+ eventq.o \
hw_pagetable.o \
io_pagetable.o \
ioas.o \
main.o \
pages.o \
- vfio_compat.o
+ vfio_compat.o \
+ viommu.o
iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
obj-$(CONFIG_IOMMUFD) += iommufd.o
obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o
+
+iommufd_driver-y := driver.o
+obj-$(CONFIG_IOMMUFD_DRIVER_CORE) += iommufd_driver.o
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 873630c111c1..4c842368289f 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -1,12 +1,13 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
*/
+#include <linux/iommu.h>
#include <linux/iommufd.h>
+#include <linux/pci-ats.h>
#include <linux/slab.h>
-#include <linux/iommu.h>
#include <uapi/linux/iommufd.h>
-#include "../iommu-priv.h"
+#include "../iommu-priv.h"
#include "io_pagetable.h"
#include "iommufd_private.h"
@@ -17,12 +18,17 @@ MODULE_PARM_DESC(
"Allow IOMMUFD to bind to devices even if the platform cannot isolate "
"the MSI interrupt window. Enabling this is a security weakness.");
+struct iommufd_attach {
+ struct iommufd_hw_pagetable *hwpt;
+ struct xarray device_array;
+};
+
static void iommufd_group_release(struct kref *kref)
{
struct iommufd_group *igroup =
container_of(kref, struct iommufd_group, ref);
- WARN_ON(igroup->hwpt || !list_empty(&igroup->device_list));
+ WARN_ON(!xa_empty(&igroup->pasid_attach));
xa_cmpxchg(&igroup->ictx->groups, iommu_group_id(igroup->group), igroup,
NULL, GFP_KERNEL);
@@ -89,7 +95,7 @@ static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx,
kref_init(&new_igroup->ref);
mutex_init(&new_igroup->lock);
- INIT_LIST_HEAD(&new_igroup->device_list);
+ xa_init(&new_igroup->pasid_attach);
new_igroup->sw_msi_start = PHYS_ADDR_MAX;
/* group reference moves into new_igroup */
new_igroup->group = group;
@@ -131,6 +137,57 @@ static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx,
}
}
+static void iommufd_device_remove_vdev(struct iommufd_device *idev)
+{
+ struct iommufd_vdevice *vdev;
+
+ mutex_lock(&idev->igroup->lock);
+ /* prevent new references from vdev */
+ idev->destroying = true;
+ /* vdev has been completely destroyed by userspace */
+ if (!idev->vdev)
+ goto out_unlock;
+
+ vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
+ /*
+ * An ongoing vdev destroy ioctl has removed the vdev from the object
+ * xarray, but has not finished iommufd_vdevice_destroy() yet as it
+ * needs the same mutex. We exit the locking then wait on wait_cnt
+ * reference for the vdev destruction.
+ */
+ if (IS_ERR(vdev))
+ goto out_unlock;
+
+ /* Should never happen */
+ if (WARN_ON(vdev != idev->vdev)) {
+ iommufd_put_object(idev->ictx, &vdev->obj);
+ goto out_unlock;
+ }
+
+ /*
+ * vdev is still alive. Hold a users refcount to prevent racing with
+ * userspace destruction, then use iommufd_object_tombstone_user() to
+ * destroy it and leave a tombstone.
+ */
+ refcount_inc(&vdev->obj.users);
+ iommufd_put_object(idev->ictx, &vdev->obj);
+ mutex_unlock(&idev->igroup->lock);
+ iommufd_object_tombstone_user(idev->ictx, &vdev->obj);
+ return;
+
+out_unlock:
+ mutex_unlock(&idev->igroup->lock);
+}
+
+void iommufd_device_pre_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_device *idev =
+ container_of(obj, struct iommufd_device, obj);
+
+ /* Release the wait_cnt reference on this */
+ iommufd_device_remove_vdev(idev);
+}
+
void iommufd_device_destroy(struct iommufd_object *obj)
{
struct iommufd_device *idev =
@@ -232,7 +289,7 @@ out_group_put:
iommufd_put_group(igroup);
return ERR_PTR(rc);
}
-EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, "IOMMUFD");
/**
* iommufd_ctx_has_group - True if any device within the group is bound
@@ -263,7 +320,7 @@ bool iommufd_ctx_has_group(struct iommufd_ctx *ictx, struct iommu_group *group)
xa_unlock(&ictx->objects);
return false;
}
-EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, "IOMMUFD");
/**
* iommufd_device_unbind - Undo iommufd_device_bind()
@@ -278,69 +335,97 @@ void iommufd_device_unbind(struct iommufd_device *idev)
{
iommufd_object_destroy_user(idev->ictx, &idev->obj);
}
-EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, "IOMMUFD");
struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev)
{
return idev->ictx;
}
-EXPORT_SYMBOL_NS_GPL(iommufd_device_to_ictx, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iommufd_device_to_ictx, "IOMMUFD");
u32 iommufd_device_to_id(struct iommufd_device *idev)
{
return idev->obj.id;
}
-EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, "IOMMUFD");
+
+static unsigned int iommufd_group_device_num(struct iommufd_group *igroup,
+ ioasid_t pasid)
+{
+ struct iommufd_attach *attach;
+ struct iommufd_device *idev;
+ unsigned int count = 0;
+ unsigned long index;
+
+ lockdep_assert_held(&igroup->lock);
+ attach = xa_load(&igroup->pasid_attach, pasid);
+ if (attach)
+ xa_for_each(&attach->device_array, index, idev)
+ count++;
+ return count;
+}
+
+#ifdef CONFIG_IRQ_MSI_IOMMU
static int iommufd_group_setup_msi(struct iommufd_group *igroup,
struct iommufd_hwpt_paging *hwpt_paging)
{
- phys_addr_t sw_msi_start = igroup->sw_msi_start;
- int rc;
+ struct iommufd_ctx *ictx = igroup->ictx;
+ struct iommufd_sw_msi_map *cur;
+
+ if (igroup->sw_msi_start == PHYS_ADDR_MAX)
+ return 0;
/*
- * If the IOMMU driver gives a IOMMU_RESV_SW_MSI then it is asking us to
- * call iommu_get_msi_cookie() on its behalf. This is necessary to setup
- * the MSI window so iommu_dma_prepare_msi() can install pages into our
- * domain after request_irq(). If it is not done interrupts will not
- * work on this domain.
- *
- * FIXME: This is conceptually broken for iommufd since we want to allow
- * userspace to change the domains, eg switch from an identity IOAS to a
- * DMA IOAS. There is currently no way to create a MSI window that
- * matches what the IRQ layer actually expects in a newly created
- * domain.
+ * Install all the MSI pages the device has been using into the domain
*/
- if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) {
- rc = iommu_get_msi_cookie(hwpt_paging->common.domain,
- sw_msi_start);
+ guard(mutex)(&ictx->sw_msi_lock);
+ list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) {
+ int rc;
+
+ if (cur->sw_msi_start != igroup->sw_msi_start ||
+ !test_bit(cur->id, igroup->required_sw_msi.bitmap))
+ continue;
+
+ rc = iommufd_sw_msi_install(ictx, hwpt_paging, cur);
if (rc)
return rc;
-
- /*
- * iommu_get_msi_cookie() can only be called once per domain,
- * it returns -EBUSY on later calls.
- */
- hwpt_paging->msi_cookie = true;
}
return 0;
}
+#else
+static inline int
+iommufd_group_setup_msi(struct iommufd_group *igroup,
+ struct iommufd_hwpt_paging *hwpt_paging)
+{
+ return 0;
+}
+#endif
-static int iommufd_hwpt_paging_attach(struct iommufd_hwpt_paging *hwpt_paging,
- struct iommufd_device *idev)
+static bool
+iommufd_group_first_attach(struct iommufd_group *igroup, ioasid_t pasid)
{
+ lockdep_assert_held(&igroup->lock);
+ return !xa_load(&igroup->pasid_attach, pasid);
+}
+
+static int
+iommufd_device_attach_reserved_iova(struct iommufd_device *idev,
+ struct iommufd_hwpt_paging *hwpt_paging)
+{
+ struct iommufd_group *igroup = idev->igroup;
int rc;
- lockdep_assert_held(&idev->igroup->lock);
+ lockdep_assert_held(&igroup->lock);
rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt,
idev->dev,
- &idev->igroup->sw_msi_start);
+ &igroup->sw_msi_start);
if (rc)
return rc;
- if (list_empty(&idev->igroup->device_list)) {
- rc = iommufd_group_setup_msi(idev->igroup, hwpt_paging);
+ if (iommufd_group_first_attach(igroup, IOMMU_NO_PASID)) {
+ rc = iommufd_group_setup_msi(igroup, hwpt_paging);
if (rc) {
iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt,
idev->dev);
@@ -350,22 +435,216 @@ static int iommufd_hwpt_paging_attach(struct iommufd_hwpt_paging *hwpt_paging,
return 0;
}
+/* The device attach/detach/replace helpers for attach_handle */
+
+static bool iommufd_device_is_attached(struct iommufd_device *idev,
+ ioasid_t pasid)
+{
+ struct iommufd_attach *attach;
+
+ attach = xa_load(&idev->igroup->pasid_attach, pasid);
+ return xa_load(&attach->device_array, idev->obj.id);
+}
+
+static int iommufd_hwpt_pasid_compat(struct iommufd_hw_pagetable *hwpt,
+ struct iommufd_device *idev,
+ ioasid_t pasid)
+{
+ struct iommufd_group *igroup = idev->igroup;
+
+ lockdep_assert_held(&igroup->lock);
+
+ if (pasid == IOMMU_NO_PASID) {
+ unsigned long start = IOMMU_NO_PASID;
+
+ if (!hwpt->pasid_compat &&
+ xa_find_after(&igroup->pasid_attach,
+ &start, UINT_MAX, XA_PRESENT))
+ return -EINVAL;
+ } else {
+ struct iommufd_attach *attach;
+
+ if (!hwpt->pasid_compat)
+ return -EINVAL;
+
+ attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID);
+ if (attach && attach->hwpt && !attach->hwpt->pasid_compat)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static bool iommufd_hwpt_compatible_device(struct iommufd_hw_pagetable *hwpt,
+ struct iommufd_device *idev)
+{
+ struct pci_dev *pdev;
+
+ if (!hwpt->fault || !dev_is_pci(idev->dev))
+ return true;
+
+ /*
+ * Once we turn on PCI/PRI support for VF, the response failure code
+ * should not be forwarded to the hardware due to PRI being a shared
+ * resource between PF and VFs. There is no coordination for this
+ * shared capability. This waits for a vPRI reset to recover.
+ */
+ pdev = to_pci_dev(idev->dev);
+
+ return (!pdev->is_virtfn || !pci_pri_supported(pdev));
+}
+
+static int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt,
+ struct iommufd_device *idev,
+ ioasid_t pasid)
+{
+ struct iommufd_attach_handle *handle;
+ int rc;
+
+ if (!iommufd_hwpt_compatible_device(hwpt, idev))
+ return -EINVAL;
+
+ rc = iommufd_hwpt_pasid_compat(hwpt, idev, pasid);
+ if (rc)
+ return rc;
+
+ handle = kzalloc(sizeof(*handle), GFP_KERNEL);
+ if (!handle)
+ return -ENOMEM;
+
+ handle->idev = idev;
+ if (pasid == IOMMU_NO_PASID)
+ rc = iommu_attach_group_handle(hwpt->domain, idev->igroup->group,
+ &handle->handle);
+ else
+ rc = iommu_attach_device_pasid(hwpt->domain, idev->dev, pasid,
+ &handle->handle);
+ if (rc)
+ goto out_free_handle;
+
+ return 0;
+
+out_free_handle:
+ kfree(handle);
+ return rc;
+}
+
+static struct iommufd_attach_handle *
+iommufd_device_get_attach_handle(struct iommufd_device *idev, ioasid_t pasid)
+{
+ struct iommu_attach_handle *handle;
+
+ lockdep_assert_held(&idev->igroup->lock);
+
+ handle = iommu_attach_handle_get(idev->igroup->group, pasid, 0);
+ if (IS_ERR(handle))
+ return NULL;
+ return to_iommufd_handle(handle);
+}
+
+static void iommufd_hwpt_detach_device(struct iommufd_hw_pagetable *hwpt,
+ struct iommufd_device *idev,
+ ioasid_t pasid)
+{
+ struct iommufd_attach_handle *handle;
+
+ handle = iommufd_device_get_attach_handle(idev, pasid);
+ if (pasid == IOMMU_NO_PASID)
+ iommu_detach_group_handle(hwpt->domain, idev->igroup->group);
+ else
+ iommu_detach_device_pasid(hwpt->domain, idev->dev, pasid);
+
+ iommufd_auto_response_faults(hwpt, handle);
+ kfree(handle);
+}
+
+static int iommufd_hwpt_replace_device(struct iommufd_device *idev,
+ ioasid_t pasid,
+ struct iommufd_hw_pagetable *hwpt,
+ struct iommufd_hw_pagetable *old)
+{
+ struct iommufd_attach_handle *handle, *old_handle;
+ int rc;
+
+ if (!iommufd_hwpt_compatible_device(hwpt, idev))
+ return -EINVAL;
+
+ rc = iommufd_hwpt_pasid_compat(hwpt, idev, pasid);
+ if (rc)
+ return rc;
+
+ old_handle = iommufd_device_get_attach_handle(idev, pasid);
+
+ handle = kzalloc(sizeof(*handle), GFP_KERNEL);
+ if (!handle)
+ return -ENOMEM;
+
+ handle->idev = idev;
+ if (pasid == IOMMU_NO_PASID)
+ rc = iommu_replace_group_handle(idev->igroup->group,
+ hwpt->domain, &handle->handle);
+ else
+ rc = iommu_replace_device_pasid(hwpt->domain, idev->dev,
+ pasid, &handle->handle);
+ if (rc)
+ goto out_free_handle;
+
+ iommufd_auto_response_faults(hwpt, old_handle);
+ kfree(old_handle);
+
+ return 0;
+
+out_free_handle:
+ kfree(handle);
+ return rc;
+}
+
int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
- struct iommufd_device *idev)
+ struct iommufd_device *idev, ioasid_t pasid)
{
+ struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt);
+ bool attach_resv = hwpt_paging && pasid == IOMMU_NO_PASID;
+ struct iommufd_group *igroup = idev->igroup;
+ struct iommufd_hw_pagetable *old_hwpt;
+ struct iommufd_attach *attach;
int rc;
- mutex_lock(&idev->igroup->lock);
+ mutex_lock(&igroup->lock);
- if (idev->igroup->hwpt != NULL && idev->igroup->hwpt != hwpt) {
- rc = -EINVAL;
+ attach = xa_cmpxchg(&igroup->pasid_attach, pasid, NULL,
+ XA_ZERO_ENTRY, GFP_KERNEL);
+ if (xa_is_err(attach)) {
+ rc = xa_err(attach);
goto err_unlock;
}
- if (hwpt_is_paging(hwpt)) {
- rc = iommufd_hwpt_paging_attach(to_hwpt_paging(hwpt), idev);
+ if (!attach) {
+ attach = kzalloc(sizeof(*attach), GFP_KERNEL);
+ if (!attach) {
+ rc = -ENOMEM;
+ goto err_release_pasid;
+ }
+ xa_init(&attach->device_array);
+ }
+
+ old_hwpt = attach->hwpt;
+
+ rc = xa_insert(&attach->device_array, idev->obj.id, XA_ZERO_ENTRY,
+ GFP_KERNEL);
+ if (rc) {
+ WARN_ON(rc == -EBUSY && !old_hwpt);
+ goto err_free_attach;
+ }
+
+ if (old_hwpt && old_hwpt != hwpt) {
+ rc = -EINVAL;
+ goto err_release_devid;
+ }
+
+ if (attach_resv) {
+ rc = iommufd_device_attach_reserved_iova(idev, hwpt_paging);
if (rc)
- goto err_unlock;
+ goto err_release_devid;
}
/*
@@ -375,52 +654,76 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
* reserved regions are only updated during individual device
* attachment.
*/
- if (list_empty(&idev->igroup->device_list)) {
- rc = iommu_attach_group(hwpt->domain, idev->igroup->group);
+ if (iommufd_group_first_attach(igroup, pasid)) {
+ rc = iommufd_hwpt_attach_device(hwpt, idev, pasid);
if (rc)
goto err_unresv;
- idev->igroup->hwpt = hwpt;
+ attach->hwpt = hwpt;
+ WARN_ON(xa_is_err(xa_store(&igroup->pasid_attach, pasid, attach,
+ GFP_KERNEL)));
}
refcount_inc(&hwpt->obj.users);
- list_add_tail(&idev->group_item, &idev->igroup->device_list);
- mutex_unlock(&idev->igroup->lock);
+ WARN_ON(xa_is_err(xa_store(&attach->device_array, idev->obj.id,
+ idev, GFP_KERNEL)));
+ mutex_unlock(&igroup->lock);
return 0;
err_unresv:
- if (hwpt_is_paging(hwpt))
- iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt,
- idev->dev);
+ if (attach_resv)
+ iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev);
+err_release_devid:
+ xa_release(&attach->device_array, idev->obj.id);
+err_free_attach:
+ if (iommufd_group_first_attach(igroup, pasid))
+ kfree(attach);
+err_release_pasid:
+ if (iommufd_group_first_attach(igroup, pasid))
+ xa_release(&igroup->pasid_attach, pasid);
err_unlock:
- mutex_unlock(&idev->igroup->lock);
+ mutex_unlock(&igroup->lock);
return rc;
}
struct iommufd_hw_pagetable *
-iommufd_hw_pagetable_detach(struct iommufd_device *idev)
+iommufd_hw_pagetable_detach(struct iommufd_device *idev, ioasid_t pasid)
{
- struct iommufd_hw_pagetable *hwpt = idev->igroup->hwpt;
+ struct iommufd_group *igroup = idev->igroup;
+ struct iommufd_hwpt_paging *hwpt_paging;
+ struct iommufd_hw_pagetable *hwpt;
+ struct iommufd_attach *attach;
- mutex_lock(&idev->igroup->lock);
- list_del(&idev->group_item);
- if (list_empty(&idev->igroup->device_list)) {
- iommu_detach_group(hwpt->domain, idev->igroup->group);
- idev->igroup->hwpt = NULL;
+ mutex_lock(&igroup->lock);
+ attach = xa_load(&igroup->pasid_attach, pasid);
+ if (!attach) {
+ mutex_unlock(&igroup->lock);
+ return NULL;
}
- if (hwpt_is_paging(hwpt))
- iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt,
- idev->dev);
- mutex_unlock(&idev->igroup->lock);
+
+ hwpt = attach->hwpt;
+ hwpt_paging = find_hwpt_paging(hwpt);
+
+ xa_erase(&attach->device_array, idev->obj.id);
+ if (xa_empty(&attach->device_array)) {
+ iommufd_hwpt_detach_device(hwpt, idev, pasid);
+ xa_erase(&igroup->pasid_attach, pasid);
+ kfree(attach);
+ }
+ if (hwpt_paging && pasid == IOMMU_NO_PASID)
+ iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev);
+ mutex_unlock(&igroup->lock);
+
+ iommufd_hw_pagetable_put(idev->ictx, hwpt);
/* Caller must destroy hwpt */
return hwpt;
}
static struct iommufd_hw_pagetable *
-iommufd_device_do_attach(struct iommufd_device *idev,
+iommufd_device_do_attach(struct iommufd_device *idev, ioasid_t pasid,
struct iommufd_hw_pagetable *hwpt)
{
int rc;
- rc = iommufd_hw_pagetable_attach(hwpt, idev);
+ rc = iommufd_hw_pagetable_attach(hwpt, idev, pasid);
if (rc)
return ERR_PTR(rc);
return NULL;
@@ -430,27 +733,33 @@ static void
iommufd_group_remove_reserved_iova(struct iommufd_group *igroup,
struct iommufd_hwpt_paging *hwpt_paging)
{
+ struct iommufd_attach *attach;
struct iommufd_device *cur;
+ unsigned long index;
lockdep_assert_held(&igroup->lock);
- list_for_each_entry(cur, &igroup->device_list, group_item)
+ attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID);
+ xa_for_each(&attach->device_array, index, cur)
iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, cur->dev);
}
static int
-iommufd_group_do_replace_paging(struct iommufd_group *igroup,
- struct iommufd_hwpt_paging *hwpt_paging)
+iommufd_group_do_replace_reserved_iova(struct iommufd_group *igroup,
+ struct iommufd_hwpt_paging *hwpt_paging)
{
- struct iommufd_hw_pagetable *old_hwpt = igroup->hwpt;
+ struct iommufd_hwpt_paging *old_hwpt_paging;
+ struct iommufd_attach *attach;
struct iommufd_device *cur;
+ unsigned long index;
int rc;
lockdep_assert_held(&igroup->lock);
- if (!hwpt_is_paging(old_hwpt) ||
- hwpt_paging->ioas != to_hwpt_paging(old_hwpt)->ioas) {
- list_for_each_entry(cur, &igroup->device_list, group_item) {
+ attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID);
+ old_hwpt_paging = find_hwpt_paging(attach->hwpt);
+ if (!old_hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas) {
+ xa_for_each(&attach->device_array, index, cur) {
rc = iopt_table_enforce_dev_resv_regions(
&hwpt_paging->ioas->iopt, cur->dev, NULL);
if (rc)
@@ -469,70 +778,81 @@ err_unresv:
}
static struct iommufd_hw_pagetable *
-iommufd_device_do_replace(struct iommufd_device *idev,
+iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid,
struct iommufd_hw_pagetable *hwpt)
{
+ struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt);
+ bool attach_resv = hwpt_paging && pasid == IOMMU_NO_PASID;
+ struct iommufd_hwpt_paging *old_hwpt_paging;
struct iommufd_group *igroup = idev->igroup;
struct iommufd_hw_pagetable *old_hwpt;
+ struct iommufd_attach *attach;
unsigned int num_devices;
int rc;
- mutex_lock(&idev->igroup->lock);
+ mutex_lock(&igroup->lock);
+
+ attach = xa_load(&igroup->pasid_attach, pasid);
+ if (!attach) {
+ rc = -EINVAL;
+ goto err_unlock;
+ }
+
+ old_hwpt = attach->hwpt;
- if (igroup->hwpt == NULL) {
+ WARN_ON(!old_hwpt || xa_empty(&attach->device_array));
+
+ if (!iommufd_device_is_attached(idev, pasid)) {
rc = -EINVAL;
goto err_unlock;
}
- if (hwpt == igroup->hwpt) {
- mutex_unlock(&idev->igroup->lock);
+ if (hwpt == old_hwpt) {
+ mutex_unlock(&igroup->lock);
return NULL;
}
- old_hwpt = igroup->hwpt;
- if (hwpt_is_paging(hwpt)) {
- rc = iommufd_group_do_replace_paging(igroup,
- to_hwpt_paging(hwpt));
+ if (attach_resv) {
+ rc = iommufd_group_do_replace_reserved_iova(igroup, hwpt_paging);
if (rc)
goto err_unlock;
}
- rc = iommu_group_replace_domain(igroup->group, hwpt->domain);
+ rc = iommufd_hwpt_replace_device(idev, pasid, hwpt, old_hwpt);
if (rc)
goto err_unresv;
- if (hwpt_is_paging(old_hwpt) &&
- (!hwpt_is_paging(hwpt) ||
- to_hwpt_paging(hwpt)->ioas != to_hwpt_paging(old_hwpt)->ioas))
- iommufd_group_remove_reserved_iova(igroup,
- to_hwpt_paging(old_hwpt));
+ old_hwpt_paging = find_hwpt_paging(old_hwpt);
+ if (old_hwpt_paging && pasid == IOMMU_NO_PASID &&
+ (!hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas))
+ iommufd_group_remove_reserved_iova(igroup, old_hwpt_paging);
- igroup->hwpt = hwpt;
+ attach->hwpt = hwpt;
- num_devices = list_count_nodes(&igroup->device_list);
+ num_devices = iommufd_group_device_num(igroup, pasid);
/*
- * Move the refcounts held by the device_list to the new hwpt. Retain a
+ * Move the refcounts held by the device_array to the new hwpt. Retain a
* refcount for this thread as the caller will free it.
*/
refcount_add(num_devices, &hwpt->obj.users);
if (num_devices > 1)
WARN_ON(refcount_sub_and_test(num_devices - 1,
&old_hwpt->obj.users));
- mutex_unlock(&idev->igroup->lock);
+ mutex_unlock(&igroup->lock);
/* Caller must destroy old_hwpt */
return old_hwpt;
err_unresv:
- if (hwpt_is_paging(hwpt))
- iommufd_group_remove_reserved_iova(igroup,
- to_hwpt_paging(old_hwpt));
+ if (attach_resv)
+ iommufd_group_remove_reserved_iova(igroup, hwpt_paging);
err_unlock:
- mutex_unlock(&idev->igroup->lock);
+ mutex_unlock(&igroup->lock);
return ERR_PTR(rc);
}
typedef struct iommufd_hw_pagetable *(*attach_fn)(
- struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt);
+ struct iommufd_device *idev, ioasid_t pasid,
+ struct iommufd_hw_pagetable *hwpt);
/*
* When automatically managing the domains we search for a compatible domain in
@@ -540,7 +860,7 @@ typedef struct iommufd_hw_pagetable *(*attach_fn)(
* Automatic domain selection will never pick a manually created domain.
*/
static struct iommufd_hw_pagetable *
-iommufd_device_auto_get_domain(struct iommufd_device *idev,
+iommufd_device_auto_get_domain(struct iommufd_device *idev, ioasid_t pasid,
struct iommufd_ioas *ioas, u32 *pt_id,
attach_fn do_attach)
{
@@ -569,7 +889,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev,
hwpt = &hwpt_paging->common;
if (!iommufd_lock_obj(&hwpt->obj))
continue;
- destroy_hwpt = (*do_attach)(idev, hwpt);
+ destroy_hwpt = (*do_attach)(idev, pasid, hwpt);
if (IS_ERR(destroy_hwpt)) {
iommufd_put_object(idev->ictx, &hwpt->obj);
/*
@@ -587,8 +907,8 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev,
goto out_unlock;
}
- hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, 0,
- immediate_attach, NULL);
+ hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, pasid,
+ 0, immediate_attach, NULL);
if (IS_ERR(hwpt_paging)) {
destroy_hwpt = ERR_CAST(hwpt_paging);
goto out_unlock;
@@ -596,7 +916,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev,
hwpt = &hwpt_paging->common;
if (!immediate_attach) {
- destroy_hwpt = (*do_attach)(idev, hwpt);
+ destroy_hwpt = (*do_attach)(idev, pasid, hwpt);
if (IS_ERR(destroy_hwpt))
goto out_abort;
} else {
@@ -617,8 +937,9 @@ out_unlock:
return destroy_hwpt;
}
-static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id,
- attach_fn do_attach)
+static int iommufd_device_change_pt(struct iommufd_device *idev,
+ ioasid_t pasid,
+ u32 *pt_id, attach_fn do_attach)
{
struct iommufd_hw_pagetable *destroy_hwpt;
struct iommufd_object *pt_obj;
@@ -633,7 +954,7 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id,
struct iommufd_hw_pagetable *hwpt =
container_of(pt_obj, struct iommufd_hw_pagetable, obj);
- destroy_hwpt = (*do_attach)(idev, hwpt);
+ destroy_hwpt = (*do_attach)(idev, pasid, hwpt);
if (IS_ERR(destroy_hwpt))
goto out_put_pt_obj;
break;
@@ -642,8 +963,8 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id,
struct iommufd_ioas *ioas =
container_of(pt_obj, struct iommufd_ioas, obj);
- destroy_hwpt = iommufd_device_auto_get_domain(idev, ioas, pt_id,
- do_attach);
+ destroy_hwpt = iommufd_device_auto_get_domain(idev, pasid, ioas,
+ pt_id, do_attach);
if (IS_ERR(destroy_hwpt))
goto out_put_pt_obj;
break;
@@ -665,22 +986,26 @@ out_put_pt_obj:
}
/**
- * iommufd_device_attach - Connect a device to an iommu_domain
+ * iommufd_device_attach - Connect a device/pasid to an iommu_domain
* @idev: device to attach
+ * @pasid: pasid to attach
* @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING
* Output the IOMMUFD_OBJ_HWPT_PAGING ID
*
- * This connects the device to an iommu_domain, either automatically or manually
- * selected. Once this completes the device could do DMA.
+ * This connects the device/pasid to an iommu_domain, either automatically
+ * or manually selected. Once this completes the device could do DMA with
+ * @pasid. @pasid is IOMMU_NO_PASID if this attach is for no pasid usage.
*
* The caller should return the resulting pt_id back to userspace.
* This function is undone by calling iommufd_device_detach().
*/
-int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id)
+int iommufd_device_attach(struct iommufd_device *idev, ioasid_t pasid,
+ u32 *pt_id)
{
int rc;
- rc = iommufd_device_change_pt(idev, pt_id, &iommufd_device_do_attach);
+ rc = iommufd_device_change_pt(idev, pasid, pt_id,
+ &iommufd_device_do_attach);
if (rc)
return rc;
@@ -691,11 +1016,12 @@ int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id)
refcount_inc(&idev->obj.users);
return 0;
}
-EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, "IOMMUFD");
/**
- * iommufd_device_replace - Change the device's iommu_domain
+ * iommufd_device_replace - Change the device/pasid's iommu_domain
* @idev: device to change
+ * @pasid: pasid to change
* @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING
* Output the IOMMUFD_OBJ_HWPT_PAGING ID
*
@@ -706,31 +1032,36 @@ EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD);
*
* If it fails then no change is made to the attachment. The iommu driver may
* implement this so there is no disruption in translation. This can only be
- * called if iommufd_device_attach() has already succeeded.
+ * called if iommufd_device_attach() has already succeeded. @pasid is
+ * IOMMU_NO_PASID for no pasid usage.
*/
-int iommufd_device_replace(struct iommufd_device *idev, u32 *pt_id)
+int iommufd_device_replace(struct iommufd_device *idev, ioasid_t pasid,
+ u32 *pt_id)
{
- return iommufd_device_change_pt(idev, pt_id,
+ return iommufd_device_change_pt(idev, pasid, pt_id,
&iommufd_device_do_replace);
}
-EXPORT_SYMBOL_NS_GPL(iommufd_device_replace, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iommufd_device_replace, "IOMMUFD");
/**
- * iommufd_device_detach - Disconnect a device to an iommu_domain
+ * iommufd_device_detach - Disconnect a device/device to an iommu_domain
* @idev: device to detach
+ * @pasid: pasid to detach
*
* Undo iommufd_device_attach(). This disconnects the idev from the previously
* attached pt_id. The device returns back to a blocked DMA translation.
+ * @pasid is IOMMU_NO_PASID for no pasid usage.
*/
-void iommufd_device_detach(struct iommufd_device *idev)
+void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid)
{
struct iommufd_hw_pagetable *hwpt;
- hwpt = iommufd_hw_pagetable_detach(idev);
- iommufd_hw_pagetable_put(idev->ictx, hwpt);
+ hwpt = iommufd_hw_pagetable_detach(idev, pasid);
+ if (!hwpt)
+ return;
refcount_dec(&idev->obj.users);
}
-EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, "IOMMUFD");
/*
* On success, it will refcount_inc() at a valid new_ioas and refcount_dec() at
@@ -769,7 +1100,7 @@ static int iommufd_access_change_ioas(struct iommufd_access *access,
}
if (cur_ioas) {
- if (access->ops->unmap) {
+ if (!iommufd_access_is_internal(access) && access->ops->unmap) {
mutex_unlock(&access->ioas_lock);
access->ops->unmap(access->data, 0, ULONG_MAX);
mutex_lock(&access->ioas_lock);
@@ -805,7 +1136,39 @@ void iommufd_access_destroy_object(struct iommufd_object *obj)
if (access->ioas)
WARN_ON(iommufd_access_change_ioas(access, NULL));
mutex_unlock(&access->ioas_lock);
- iommufd_ctx_put(access->ictx);
+ if (!iommufd_access_is_internal(access))
+ iommufd_ctx_put(access->ictx);
+}
+
+static struct iommufd_access *__iommufd_access_create(struct iommufd_ctx *ictx)
+{
+ struct iommufd_access *access;
+
+ /*
+ * There is no uAPI for the access object, but to keep things symmetric
+ * use the object infrastructure anyhow.
+ */
+ access = iommufd_object_alloc(ictx, access, IOMMUFD_OBJ_ACCESS);
+ if (IS_ERR(access))
+ return access;
+
+ /* The calling driver is a user until iommufd_access_destroy() */
+ refcount_inc(&access->obj.users);
+ mutex_init(&access->ioas_lock);
+ return access;
+}
+
+struct iommufd_access *iommufd_access_create_internal(struct iommufd_ctx *ictx)
+{
+ struct iommufd_access *access;
+
+ access = __iommufd_access_create(ictx);
+ if (IS_ERR(access))
+ return access;
+ access->iova_alignment = PAGE_SIZE;
+
+ iommufd_object_finalize(ictx, &access->obj);
+ return access;
}
/**
@@ -827,11 +1190,7 @@ iommufd_access_create(struct iommufd_ctx *ictx,
{
struct iommufd_access *access;
- /*
- * There is no uAPI for the access object, but to keep things symmetric
- * use the object infrastructure anyhow.
- */
- access = iommufd_object_alloc(ictx, access, IOMMUFD_OBJ_ACCESS);
+ access = __iommufd_access_create(ictx);
if (IS_ERR(access))
return access;
@@ -843,16 +1202,13 @@ iommufd_access_create(struct iommufd_ctx *ictx,
else
access->iova_alignment = 1;
- /* The calling driver is a user until iommufd_access_destroy() */
- refcount_inc(&access->obj.users);
access->ictx = ictx;
iommufd_ctx_get(ictx);
iommufd_object_finalize(ictx, &access->obj);
*id = access->obj.id;
- mutex_init(&access->ioas_lock);
return access;
}
-EXPORT_SYMBOL_NS_GPL(iommufd_access_create, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iommufd_access_create, "IOMMUFD");
/**
* iommufd_access_destroy - Destroy an iommufd_access
@@ -864,7 +1220,7 @@ void iommufd_access_destroy(struct iommufd_access *access)
{
iommufd_object_destroy_user(access->ictx, &access->obj);
}
-EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, "IOMMUFD");
void iommufd_access_detach(struct iommufd_access *access)
{
@@ -876,7 +1232,7 @@ void iommufd_access_detach(struct iommufd_access *access)
WARN_ON(iommufd_access_change_ioas(access, NULL));
mutex_unlock(&access->ioas_lock);
}
-EXPORT_SYMBOL_NS_GPL(iommufd_access_detach, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iommufd_access_detach, "IOMMUFD");
int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id)
{
@@ -892,7 +1248,23 @@ int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id)
mutex_unlock(&access->ioas_lock);
return rc;
}
-EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, "IOMMUFD");
+
+int iommufd_access_attach_internal(struct iommufd_access *access,
+ struct iommufd_ioas *ioas)
+{
+ int rc;
+
+ mutex_lock(&access->ioas_lock);
+ if (WARN_ON(access->ioas)) {
+ mutex_unlock(&access->ioas_lock);
+ return -EINVAL;
+ }
+
+ rc = iommufd_access_change_ioas(access, ioas);
+ mutex_unlock(&access->ioas_lock);
+ return rc;
+}
int iommufd_access_replace(struct iommufd_access *access, u32 ioas_id)
{
@@ -907,7 +1279,7 @@ int iommufd_access_replace(struct iommufd_access *access, u32 ioas_id)
mutex_unlock(&access->ioas_lock);
return rc;
}
-EXPORT_SYMBOL_NS_GPL(iommufd_access_replace, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iommufd_access_replace, "IOMMUFD");
/**
* iommufd_access_notify_unmap - Notify users of an iopt to stop using it
@@ -935,7 +1307,8 @@ void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
xa_lock(&ioas->iopt.access_list);
xa_for_each(&ioas->iopt.access_list, index, access) {
- if (!iommufd_lock_obj(&access->obj))
+ if (!iommufd_lock_obj(&access->obj) ||
+ iommufd_access_is_internal(access))
continue;
xa_unlock(&ioas->iopt.access_list);
@@ -959,6 +1332,7 @@ void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
void iommufd_access_unpin_pages(struct iommufd_access *access,
unsigned long iova, unsigned long length)
{
+ bool internal = iommufd_access_is_internal(access);
struct iopt_area_contig_iter iter;
struct io_pagetable *iopt;
unsigned long last_iova;
@@ -985,12 +1359,13 @@ void iommufd_access_unpin_pages(struct iommufd_access *access,
area, iopt_area_iova_to_index(area, iter.cur_iova),
iopt_area_iova_to_index(
area,
- min(last_iova, iopt_area_last_iova(area))));
+ min(last_iova, iopt_area_last_iova(area))),
+ internal);
WARN_ON(!iopt_area_contig_done(&iter));
up_read(&iopt->iova_rwsem);
mutex_unlock(&access->ioas_lock);
}
-EXPORT_SYMBOL_NS_GPL(iommufd_access_unpin_pages, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iommufd_access_unpin_pages, "IOMMUFD");
static bool iopt_area_contig_is_aligned(struct iopt_area_contig_iter *iter)
{
@@ -1034,6 +1409,7 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova,
unsigned long length, struct page **out_pages,
unsigned int flags)
{
+ bool internal = iommufd_access_is_internal(access);
struct iopt_area_contig_iter iter;
struct io_pagetable *iopt;
unsigned long last_iova;
@@ -1042,7 +1418,8 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova,
/* Driver's ops don't support pin_pages */
if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
- WARN_ON(access->iova_alignment != PAGE_SIZE || !access->ops->unmap))
+ WARN_ON(access->iova_alignment != PAGE_SIZE ||
+ (!internal && !access->ops->unmap)))
return -EINVAL;
if (!length)
@@ -1076,7 +1453,7 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova,
}
rc = iopt_area_add_access(area, index, last_index, out_pages,
- flags);
+ flags, internal);
if (rc)
goto err_remove;
out_pages += last_index - index + 1;
@@ -1099,13 +1476,14 @@ err_remove:
iopt_area_iova_to_index(area, iter.cur_iova),
iopt_area_iova_to_index(
area, min(last_iova,
- iopt_area_last_iova(area))));
+ iopt_area_last_iova(area))),
+ internal);
}
up_read(&iopt->iova_rwsem);
mutex_unlock(&access->ioas_lock);
return rc;
}
-EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, "IOMMUFD");
/**
* iommufd_access_rw - Read or write data under the iova
@@ -1126,7 +1504,7 @@ int iommufd_access_rw(struct iommufd_access *access, unsigned long iova,
struct io_pagetable *iopt;
struct iopt_area *area;
unsigned long last_iova;
- int rc;
+ int rc = -EINVAL;
if (!length)
return -EINVAL;
@@ -1169,10 +1547,11 @@ err_out:
mutex_unlock(&access->ioas_lock);
return rc;
}
-EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, "IOMMUFD");
int iommufd_get_hw_info(struct iommufd_ucmd *ucmd)
{
+ const u32 SUPPORTED_FLAGS = IOMMU_HW_INFO_FLAG_INPUT_TYPE;
struct iommu_hw_info *cmd = ucmd->cmd;
void __user *user_ptr = u64_to_user_ptr(cmd->data_uptr);
const struct iommu_ops *ops;
@@ -1182,9 +1561,15 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd)
void *data;
int rc;
- if (cmd->flags || cmd->__reserved)
+ if (cmd->flags & ~SUPPORTED_FLAGS)
+ return -EOPNOTSUPP;
+ if (cmd->__reserved[0] || cmd->__reserved[1] || cmd->__reserved[2])
return -EOPNOTSUPP;
+ /* Clear the type field since drivers don't support a random input */
+ if (!(cmd->flags & IOMMU_HW_INFO_FLAG_INPUT_TYPE))
+ cmd->in_data_type = IOMMU_HW_INFO_TYPE_DEFAULT;
+
idev = iommufd_get_device(ucmd, cmd->dev_id);
if (IS_ERR(idev))
return PTR_ERR(idev);
@@ -1203,7 +1588,7 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd)
*/
if (WARN_ON_ONCE(cmd->out_data_type ==
IOMMU_HW_INFO_TYPE_NONE)) {
- rc = -ENODEV;
+ rc = -EOPNOTSUPP;
goto out_free;
}
} else {
@@ -1239,6 +1624,36 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd)
if (device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING))
cmd->out_capabilities |= IOMMU_HW_CAP_DIRTY_TRACKING;
+ cmd->out_max_pasid_log2 = 0;
+ /*
+ * Currently, all iommu drivers enable PASID in the probe_device()
+ * op if iommu and device supports it. So the max_pasids stored in
+ * dev->iommu indicates both PASID support and enable status. A
+ * non-zero dev->iommu->max_pasids means PASID is supported and
+ * enabled. The iommufd only reports PASID capability to userspace
+ * if it's enabled.
+ */
+ if (idev->dev->iommu->max_pasids) {
+ cmd->out_max_pasid_log2 = ilog2(idev->dev->iommu->max_pasids);
+
+ if (dev_is_pci(idev->dev)) {
+ struct pci_dev *pdev = to_pci_dev(idev->dev);
+ int ctrl;
+
+ ctrl = pci_pasid_status(pdev);
+
+ WARN_ON_ONCE(ctrl < 0 ||
+ !(ctrl & PCI_PASID_CTRL_ENABLE));
+
+ if (ctrl & PCI_PASID_CTRL_EXEC)
+ cmd->out_capabilities |=
+ IOMMU_HW_CAP_PCI_PASID_EXEC;
+ if (ctrl & PCI_PASID_CTRL_PRIV)
+ cmd->out_capabilities |=
+ IOMMU_HW_CAP_PCI_PASID_PRIV;
+ }
+ }
+
rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
out_free:
kfree(data);
diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c
new file mode 100644
index 000000000000..21d4a35538f6
--- /dev/null
+++ b/drivers/iommu/iommufd/driver.c
@@ -0,0 +1,304 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#include "iommufd_private.h"
+
+/* Driver should use a per-structure helper in include/linux/iommufd.h */
+int _iommufd_object_depend(struct iommufd_object *obj_dependent,
+ struct iommufd_object *obj_depended)
+{
+ /* Reject self dependency that dead locks */
+ if (obj_dependent == obj_depended)
+ return -EINVAL;
+ /* Only support dependency between two objects of the same type */
+ if (obj_dependent->type != obj_depended->type)
+ return -EINVAL;
+
+ refcount_inc(&obj_depended->users);
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(_iommufd_object_depend, "IOMMUFD");
+
+/* Driver should use a per-structure helper in include/linux/iommufd.h */
+void _iommufd_object_undepend(struct iommufd_object *obj_dependent,
+ struct iommufd_object *obj_depended)
+{
+ if (WARN_ON_ONCE(obj_dependent == obj_depended ||
+ obj_dependent->type != obj_depended->type))
+ return;
+
+ refcount_dec(&obj_depended->users);
+}
+EXPORT_SYMBOL_NS_GPL(_iommufd_object_undepend, "IOMMUFD");
+
+/*
+ * Allocate an @offset to return to user space to use for an mmap() syscall
+ *
+ * Driver should use a per-structure helper in include/linux/iommufd.h
+ */
+int _iommufd_alloc_mmap(struct iommufd_ctx *ictx, struct iommufd_object *owner,
+ phys_addr_t mmio_addr, size_t length,
+ unsigned long *offset)
+{
+ struct iommufd_mmap *immap;
+ unsigned long startp;
+ int rc;
+
+ if (!PAGE_ALIGNED(mmio_addr))
+ return -EINVAL;
+ if (!length || !PAGE_ALIGNED(length))
+ return -EINVAL;
+
+ immap = kzalloc(sizeof(*immap), GFP_KERNEL);
+ if (!immap)
+ return -ENOMEM;
+ immap->owner = owner;
+ immap->length = length;
+ immap->mmio_addr = mmio_addr;
+
+ /* Skip the first page to ease caller identifying the returned offset */
+ rc = mtree_alloc_range(&ictx->mt_mmap, &startp, immap, immap->length,
+ PAGE_SIZE, ULONG_MAX, GFP_KERNEL);
+ if (rc < 0) {
+ kfree(immap);
+ return rc;
+ }
+
+ /* mmap() syscall will right-shift the offset in vma->vm_pgoff too */
+ immap->vm_pgoff = startp >> PAGE_SHIFT;
+ *offset = startp;
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(_iommufd_alloc_mmap, "IOMMUFD");
+
+/* Driver should use a per-structure helper in include/linux/iommufd.h */
+void _iommufd_destroy_mmap(struct iommufd_ctx *ictx,
+ struct iommufd_object *owner, unsigned long offset)
+{
+ struct iommufd_mmap *immap;
+
+ immap = mtree_erase(&ictx->mt_mmap, offset);
+ WARN_ON_ONCE(!immap || immap->owner != owner);
+ kfree(immap);
+}
+EXPORT_SYMBOL_NS_GPL(_iommufd_destroy_mmap, "IOMMUFD");
+
+struct device *iommufd_vdevice_to_device(struct iommufd_vdevice *vdev)
+{
+ return vdev->idev->dev;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_vdevice_to_device, "IOMMUFD");
+
+/* Caller should xa_lock(&viommu->vdevs) to protect the return value */
+struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
+ unsigned long vdev_id)
+{
+ struct iommufd_vdevice *vdev;
+
+ lockdep_assert_held(&viommu->vdevs.xa_lock);
+
+ vdev = xa_load(&viommu->vdevs, vdev_id);
+ return vdev ? iommufd_vdevice_to_device(vdev) : NULL;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_viommu_find_dev, "IOMMUFD");
+
+/* Return -ENOENT if device is not associated to the vIOMMU */
+int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu,
+ struct device *dev, unsigned long *vdev_id)
+{
+ struct iommufd_vdevice *vdev;
+ unsigned long index;
+ int rc = -ENOENT;
+
+ if (WARN_ON_ONCE(!vdev_id))
+ return -EINVAL;
+
+ xa_lock(&viommu->vdevs);
+ xa_for_each(&viommu->vdevs, index, vdev) {
+ if (iommufd_vdevice_to_device(vdev) == dev) {
+ *vdev_id = vdev->virt_id;
+ rc = 0;
+ break;
+ }
+ }
+ xa_unlock(&viommu->vdevs);
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_viommu_get_vdev_id, "IOMMUFD");
+
+/*
+ * Typically called in driver's threaded IRQ handler.
+ * The @type and @event_data must be defined in include/uapi/linux/iommufd.h
+ */
+int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
+ enum iommu_veventq_type type, void *event_data,
+ size_t data_len)
+{
+ struct iommufd_veventq *veventq;
+ struct iommufd_vevent *vevent;
+ int rc = 0;
+
+ if (WARN_ON_ONCE(!data_len || !event_data))
+ return -EINVAL;
+
+ down_read(&viommu->veventqs_rwsem);
+
+ veventq = iommufd_viommu_find_veventq(viommu, type);
+ if (!veventq) {
+ rc = -EOPNOTSUPP;
+ goto out_unlock_veventqs;
+ }
+
+ spin_lock(&veventq->common.lock);
+ if (veventq->num_events == veventq->depth) {
+ vevent = &veventq->lost_events_header;
+ goto out_set_header;
+ }
+
+ vevent = kzalloc(struct_size(vevent, event_data, data_len), GFP_ATOMIC);
+ if (!vevent) {
+ rc = -ENOMEM;
+ vevent = &veventq->lost_events_header;
+ goto out_set_header;
+ }
+ vevent->data_len = data_len;
+ memcpy(vevent->event_data, event_data, data_len);
+ veventq->num_events++;
+
+out_set_header:
+ iommufd_vevent_handler(veventq, vevent);
+ spin_unlock(&veventq->common.lock);
+out_unlock_veventqs:
+ up_read(&viommu->veventqs_rwsem);
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_viommu_report_event, "IOMMUFD");
+
+#ifdef CONFIG_IRQ_MSI_IOMMU
+/*
+ * Get a iommufd_sw_msi_map for the msi physical address requested by the irq
+ * layer. The mapping to IOVA is global to the iommufd file descriptor, every
+ * domain that is attached to a device using the same MSI parameters will use
+ * the same IOVA.
+ */
+static struct iommufd_sw_msi_map *
+iommufd_sw_msi_get_map(struct iommufd_ctx *ictx, phys_addr_t msi_addr,
+ phys_addr_t sw_msi_start)
+{
+ struct iommufd_sw_msi_map *cur;
+ unsigned int max_pgoff = 0;
+
+ lockdep_assert_held(&ictx->sw_msi_lock);
+
+ list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) {
+ if (cur->sw_msi_start != sw_msi_start)
+ continue;
+ max_pgoff = max(max_pgoff, cur->pgoff + 1);
+ if (cur->msi_addr == msi_addr)
+ return cur;
+ }
+
+ if (ictx->sw_msi_id >=
+ BITS_PER_BYTE * sizeof_field(struct iommufd_sw_msi_maps, bitmap))
+ return ERR_PTR(-EOVERFLOW);
+
+ cur = kzalloc(sizeof(*cur), GFP_KERNEL);
+ if (!cur)
+ return ERR_PTR(-ENOMEM);
+
+ cur->sw_msi_start = sw_msi_start;
+ cur->msi_addr = msi_addr;
+ cur->pgoff = max_pgoff;
+ cur->id = ictx->sw_msi_id++;
+ list_add_tail(&cur->sw_msi_item, &ictx->sw_msi_list);
+ return cur;
+}
+
+int iommufd_sw_msi_install(struct iommufd_ctx *ictx,
+ struct iommufd_hwpt_paging *hwpt_paging,
+ struct iommufd_sw_msi_map *msi_map)
+{
+ unsigned long iova;
+
+ lockdep_assert_held(&ictx->sw_msi_lock);
+
+ iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE;
+ if (!test_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap)) {
+ int rc;
+
+ rc = iommu_map(hwpt_paging->common.domain, iova,
+ msi_map->msi_addr, PAGE_SIZE,
+ IOMMU_WRITE | IOMMU_READ | IOMMU_MMIO,
+ GFP_KERNEL_ACCOUNT);
+ if (rc)
+ return rc;
+ __set_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_sw_msi_install, "IOMMUFD_INTERNAL");
+
+/*
+ * Called by the irq code if the platform translates the MSI address through the
+ * IOMMU. msi_addr is the physical address of the MSI page. iommufd will
+ * allocate a fd global iova for the physical page that is the same on all
+ * domains and devices.
+ */
+int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
+ phys_addr_t msi_addr)
+{
+ struct device *dev = msi_desc_to_dev(desc);
+ struct iommufd_hwpt_paging *hwpt_paging;
+ struct iommu_attach_handle *raw_handle;
+ struct iommufd_attach_handle *handle;
+ struct iommufd_sw_msi_map *msi_map;
+ struct iommufd_ctx *ictx;
+ unsigned long iova;
+ int rc;
+
+ /*
+ * It is safe to call iommu_attach_handle_get() here because the iommu
+ * core code invokes this under the group mutex which also prevents any
+ * change of the attach handle for the duration of this function.
+ */
+ iommu_group_mutex_assert(dev);
+
+ raw_handle =
+ iommu_attach_handle_get(dev->iommu_group, IOMMU_NO_PASID, 0);
+ if (IS_ERR(raw_handle))
+ return 0;
+ hwpt_paging = find_hwpt_paging(domain->iommufd_hwpt);
+
+ handle = to_iommufd_handle(raw_handle);
+ /* No IOMMU_RESV_SW_MSI means no change to the msi_msg */
+ if (handle->idev->igroup->sw_msi_start == PHYS_ADDR_MAX)
+ return 0;
+
+ ictx = handle->idev->ictx;
+ guard(mutex)(&ictx->sw_msi_lock);
+ /*
+ * The input msi_addr is the exact byte offset of the MSI doorbell, we
+ * assume the caller has checked that it is contained with a MMIO region
+ * that is secure to map at PAGE_SIZE.
+ */
+ msi_map = iommufd_sw_msi_get_map(handle->idev->ictx,
+ msi_addr & PAGE_MASK,
+ handle->idev->igroup->sw_msi_start);
+ if (IS_ERR(msi_map))
+ return PTR_ERR(msi_map);
+
+ rc = iommufd_sw_msi_install(ictx, hwpt_paging, msi_map);
+ if (rc)
+ return rc;
+ __set_bit(msi_map->id, handle->idev->igroup->required_sw_msi.bitmap);
+
+ iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE;
+ msi_desc_set_iommu_msi_iova(desc, iova, PAGE_SHIFT);
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_sw_msi, "IOMMUFD");
+#endif
+
+MODULE_DESCRIPTION("iommufd code shared with builtin modules");
+MODULE_IMPORT_NS("IOMMUFD_INTERNAL");
+MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/iommufd/eventq.c b/drivers/iommu/iommufd/eventq.c
new file mode 100644
index 000000000000..e23d9ee4fe38
--- /dev/null
+++ b/drivers/iommu/iommufd/eventq.c
@@ -0,0 +1,541 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2024 Intel Corporation
+ */
+#define pr_fmt(fmt) "iommufd: " fmt
+
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/iommufd.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/poll.h>
+#include <uapi/linux/iommufd.h>
+
+#include "../iommu-priv.h"
+#include "iommufd_private.h"
+
+/* IOMMUFD_OBJ_FAULT Functions */
+void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt,
+ struct iommufd_attach_handle *handle)
+{
+ struct iommufd_fault *fault = hwpt->fault;
+ struct iopf_group *group, *next;
+ struct list_head free_list;
+ unsigned long index;
+
+ if (!fault || !handle)
+ return;
+ INIT_LIST_HEAD(&free_list);
+
+ mutex_lock(&fault->mutex);
+ spin_lock(&fault->common.lock);
+ list_for_each_entry_safe(group, next, &fault->common.deliver, node) {
+ if (group->attach_handle != &handle->handle)
+ continue;
+ list_move(&group->node, &free_list);
+ }
+ spin_unlock(&fault->common.lock);
+
+ list_for_each_entry_safe(group, next, &free_list, node) {
+ list_del(&group->node);
+ iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
+ iopf_free_group(group);
+ }
+
+ xa_for_each(&fault->response, index, group) {
+ if (group->attach_handle != &handle->handle)
+ continue;
+ xa_erase(&fault->response, index);
+ iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
+ iopf_free_group(group);
+ }
+ mutex_unlock(&fault->mutex);
+}
+
+void iommufd_fault_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_eventq *eventq =
+ container_of(obj, struct iommufd_eventq, obj);
+ struct iommufd_fault *fault = eventq_to_fault(eventq);
+ struct iopf_group *group, *next;
+ unsigned long index;
+
+ /*
+ * The iommufd object's reference count is zero at this point.
+ * We can be confident that no other threads are currently
+ * accessing this pointer. Therefore, acquiring the mutex here
+ * is unnecessary.
+ */
+ list_for_each_entry_safe(group, next, &fault->common.deliver, node) {
+ list_del(&group->node);
+ iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
+ iopf_free_group(group);
+ }
+ xa_for_each(&fault->response, index, group) {
+ xa_erase(&fault->response, index);
+ iopf_group_response(group, IOMMU_PAGE_RESP_INVALID);
+ iopf_free_group(group);
+ }
+ xa_destroy(&fault->response);
+ mutex_destroy(&fault->mutex);
+}
+
+static void iommufd_compose_fault_message(struct iommu_fault *fault,
+ struct iommu_hwpt_pgfault *hwpt_fault,
+ struct iommufd_device *idev,
+ u32 cookie)
+{
+ hwpt_fault->flags = fault->prm.flags;
+ hwpt_fault->dev_id = idev->obj.id;
+ hwpt_fault->pasid = fault->prm.pasid;
+ hwpt_fault->grpid = fault->prm.grpid;
+ hwpt_fault->perm = fault->prm.perm;
+ hwpt_fault->addr = fault->prm.addr;
+ hwpt_fault->length = 0;
+ hwpt_fault->cookie = cookie;
+}
+
+/* Fetch the first node out of the fault->deliver list */
+static struct iopf_group *
+iommufd_fault_deliver_fetch(struct iommufd_fault *fault)
+{
+ struct list_head *list = &fault->common.deliver;
+ struct iopf_group *group = NULL;
+
+ spin_lock(&fault->common.lock);
+ if (!list_empty(list)) {
+ group = list_first_entry(list, struct iopf_group, node);
+ list_del(&group->node);
+ }
+ spin_unlock(&fault->common.lock);
+ return group;
+}
+
+/* Restore a node back to the head of the fault->deliver list */
+static void iommufd_fault_deliver_restore(struct iommufd_fault *fault,
+ struct iopf_group *group)
+{
+ spin_lock(&fault->common.lock);
+ list_add(&group->node, &fault->common.deliver);
+ spin_unlock(&fault->common.lock);
+}
+
+static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ size_t fault_size = sizeof(struct iommu_hwpt_pgfault);
+ struct iommufd_eventq *eventq = filep->private_data;
+ struct iommufd_fault *fault = eventq_to_fault(eventq);
+ struct iommu_hwpt_pgfault data = {};
+ struct iommufd_device *idev;
+ struct iopf_group *group;
+ struct iopf_fault *iopf;
+ size_t done = 0;
+ int rc = 0;
+
+ if (*ppos || count % fault_size)
+ return -ESPIPE;
+
+ mutex_lock(&fault->mutex);
+ while ((group = iommufd_fault_deliver_fetch(fault))) {
+ if (done >= count ||
+ group->fault_count * fault_size > count - done) {
+ iommufd_fault_deliver_restore(fault, group);
+ break;
+ }
+
+ rc = xa_alloc(&fault->response, &group->cookie, group,
+ xa_limit_32b, GFP_KERNEL);
+ if (rc) {
+ iommufd_fault_deliver_restore(fault, group);
+ break;
+ }
+
+ idev = to_iommufd_handle(group->attach_handle)->idev;
+ list_for_each_entry(iopf, &group->faults, list) {
+ iommufd_compose_fault_message(&iopf->fault,
+ &data, idev,
+ group->cookie);
+ if (copy_to_user(buf + done, &data, fault_size)) {
+ xa_erase(&fault->response, group->cookie);
+ iommufd_fault_deliver_restore(fault, group);
+ rc = -EFAULT;
+ break;
+ }
+ done += fault_size;
+ }
+ }
+ mutex_unlock(&fault->mutex);
+
+ return done == 0 ? rc : done;
+}
+
+static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ size_t response_size = sizeof(struct iommu_hwpt_page_response);
+ struct iommufd_eventq *eventq = filep->private_data;
+ struct iommufd_fault *fault = eventq_to_fault(eventq);
+ struct iommu_hwpt_page_response response;
+ struct iopf_group *group;
+ size_t done = 0;
+ int rc = 0;
+
+ if (*ppos || count % response_size)
+ return -ESPIPE;
+
+ mutex_lock(&fault->mutex);
+ while (count > done) {
+ rc = copy_from_user(&response, buf + done, response_size);
+ if (rc)
+ break;
+
+ static_assert((int)IOMMUFD_PAGE_RESP_SUCCESS ==
+ (int)IOMMU_PAGE_RESP_SUCCESS);
+ static_assert((int)IOMMUFD_PAGE_RESP_INVALID ==
+ (int)IOMMU_PAGE_RESP_INVALID);
+ if (response.code != IOMMUFD_PAGE_RESP_SUCCESS &&
+ response.code != IOMMUFD_PAGE_RESP_INVALID) {
+ rc = -EINVAL;
+ break;
+ }
+
+ group = xa_erase(&fault->response, response.cookie);
+ if (!group) {
+ rc = -EINVAL;
+ break;
+ }
+
+ iopf_group_response(group, response.code);
+ iopf_free_group(group);
+ done += response_size;
+ }
+ mutex_unlock(&fault->mutex);
+
+ return done == 0 ? rc : done;
+}
+
+/* IOMMUFD_OBJ_VEVENTQ Functions */
+
+void iommufd_veventq_abort(struct iommufd_object *obj)
+{
+ struct iommufd_eventq *eventq =
+ container_of(obj, struct iommufd_eventq, obj);
+ struct iommufd_veventq *veventq = eventq_to_veventq(eventq);
+ struct iommufd_viommu *viommu = veventq->viommu;
+ struct iommufd_vevent *cur, *next;
+
+ lockdep_assert_held_write(&viommu->veventqs_rwsem);
+
+ list_for_each_entry_safe(cur, next, &eventq->deliver, node) {
+ list_del(&cur->node);
+ if (cur != &veventq->lost_events_header)
+ kfree(cur);
+ }
+
+ refcount_dec(&viommu->obj.users);
+ list_del(&veventq->node);
+}
+
+void iommufd_veventq_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_veventq *veventq = eventq_to_veventq(
+ container_of(obj, struct iommufd_eventq, obj));
+
+ down_write(&veventq->viommu->veventqs_rwsem);
+ iommufd_veventq_abort(obj);
+ up_write(&veventq->viommu->veventqs_rwsem);
+}
+
+static struct iommufd_vevent *
+iommufd_veventq_deliver_fetch(struct iommufd_veventq *veventq)
+{
+ struct iommufd_eventq *eventq = &veventq->common;
+ struct list_head *list = &eventq->deliver;
+ struct iommufd_vevent *vevent = NULL;
+
+ spin_lock(&eventq->lock);
+ if (!list_empty(list)) {
+ struct iommufd_vevent *next;
+
+ next = list_first_entry(list, struct iommufd_vevent, node);
+ /* Make a copy of the lost_events_header for copy_to_user */
+ if (next == &veventq->lost_events_header) {
+ vevent = kzalloc(sizeof(*vevent), GFP_ATOMIC);
+ if (!vevent)
+ goto out_unlock;
+ }
+ list_del(&next->node);
+ if (vevent)
+ memcpy(vevent, next, sizeof(*vevent));
+ else
+ vevent = next;
+ }
+out_unlock:
+ spin_unlock(&eventq->lock);
+ return vevent;
+}
+
+static void iommufd_veventq_deliver_restore(struct iommufd_veventq *veventq,
+ struct iommufd_vevent *vevent)
+{
+ struct iommufd_eventq *eventq = &veventq->common;
+ struct list_head *list = &eventq->deliver;
+
+ spin_lock(&eventq->lock);
+ if (vevent_for_lost_events_header(vevent)) {
+ /* Remove the copy of the lost_events_header */
+ kfree(vevent);
+ vevent = NULL;
+ /* An empty list needs the lost_events_header back */
+ if (list_empty(list))
+ vevent = &veventq->lost_events_header;
+ }
+ if (vevent)
+ list_add(&vevent->node, list);
+ spin_unlock(&eventq->lock);
+}
+
+static ssize_t iommufd_veventq_fops_read(struct file *filep, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct iommufd_eventq *eventq = filep->private_data;
+ struct iommufd_veventq *veventq = eventq_to_veventq(eventq);
+ struct iommufd_vevent_header *hdr;
+ struct iommufd_vevent *cur;
+ size_t done = 0;
+ int rc = 0;
+
+ if (*ppos)
+ return -ESPIPE;
+
+ while ((cur = iommufd_veventq_deliver_fetch(veventq))) {
+ /* Validate the remaining bytes against the header size */
+ if (done >= count || sizeof(*hdr) > count - done) {
+ iommufd_veventq_deliver_restore(veventq, cur);
+ break;
+ }
+ hdr = &cur->header;
+
+ /* If being a normal vEVENT, validate against the full size */
+ if (!vevent_for_lost_events_header(cur) &&
+ sizeof(hdr) + cur->data_len > count - done) {
+ iommufd_veventq_deliver_restore(veventq, cur);
+ break;
+ }
+
+ if (copy_to_user(buf + done, hdr, sizeof(*hdr))) {
+ iommufd_veventq_deliver_restore(veventq, cur);
+ rc = -EFAULT;
+ break;
+ }
+ done += sizeof(*hdr);
+
+ if (cur->data_len &&
+ copy_to_user(buf + done, cur->event_data, cur->data_len)) {
+ iommufd_veventq_deliver_restore(veventq, cur);
+ rc = -EFAULT;
+ break;
+ }
+ spin_lock(&eventq->lock);
+ if (!vevent_for_lost_events_header(cur))
+ veventq->num_events--;
+ spin_unlock(&eventq->lock);
+ done += cur->data_len;
+ kfree(cur);
+ }
+
+ return done == 0 ? rc : done;
+}
+
+/* Common Event Queue Functions */
+
+static __poll_t iommufd_eventq_fops_poll(struct file *filep,
+ struct poll_table_struct *wait)
+{
+ struct iommufd_eventq *eventq = filep->private_data;
+ __poll_t pollflags = 0;
+
+ if (eventq->obj.type == IOMMUFD_OBJ_FAULT)
+ pollflags |= EPOLLOUT;
+
+ poll_wait(filep, &eventq->wait_queue, wait);
+ spin_lock(&eventq->lock);
+ if (!list_empty(&eventq->deliver))
+ pollflags |= EPOLLIN | EPOLLRDNORM;
+ spin_unlock(&eventq->lock);
+
+ return pollflags;
+}
+
+static int iommufd_eventq_fops_release(struct inode *inode, struct file *filep)
+{
+ struct iommufd_eventq *eventq = filep->private_data;
+
+ refcount_dec(&eventq->obj.users);
+ iommufd_ctx_put(eventq->ictx);
+ return 0;
+}
+
+#define INIT_EVENTQ_FOPS(read_op, write_op) \
+ ((const struct file_operations){ \
+ .owner = THIS_MODULE, \
+ .open = nonseekable_open, \
+ .read = read_op, \
+ .write = write_op, \
+ .poll = iommufd_eventq_fops_poll, \
+ .release = iommufd_eventq_fops_release, \
+ })
+
+static int iommufd_eventq_init(struct iommufd_eventq *eventq, char *name,
+ struct iommufd_ctx *ictx,
+ const struct file_operations *fops)
+{
+ struct file *filep;
+
+ spin_lock_init(&eventq->lock);
+ INIT_LIST_HEAD(&eventq->deliver);
+ init_waitqueue_head(&eventq->wait_queue);
+
+ /* The filep is fput() by the core code during failure */
+ filep = anon_inode_getfile(name, fops, eventq, O_RDWR);
+ if (IS_ERR(filep))
+ return PTR_ERR(filep);
+
+ eventq->ictx = ictx;
+ iommufd_ctx_get(eventq->ictx);
+ eventq->filep = filep;
+ refcount_inc(&eventq->obj.users);
+
+ return get_unused_fd_flags(O_CLOEXEC);
+}
+
+static const struct file_operations iommufd_fault_fops =
+ INIT_EVENTQ_FOPS(iommufd_fault_fops_read, iommufd_fault_fops_write);
+
+int iommufd_fault_alloc(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_fault_alloc *cmd = ucmd->cmd;
+ struct iommufd_fault *fault;
+ int fdno;
+ int rc;
+
+ if (cmd->flags)
+ return -EOPNOTSUPP;
+
+ fault = __iommufd_object_alloc_ucmd(ucmd, fault, IOMMUFD_OBJ_FAULT,
+ common.obj);
+ if (IS_ERR(fault))
+ return PTR_ERR(fault);
+
+ xa_init_flags(&fault->response, XA_FLAGS_ALLOC1);
+ mutex_init(&fault->mutex);
+
+ fdno = iommufd_eventq_init(&fault->common, "[iommufd-pgfault]",
+ ucmd->ictx, &iommufd_fault_fops);
+ if (fdno < 0)
+ return fdno;
+
+ cmd->out_fault_id = fault->common.obj.id;
+ cmd->out_fault_fd = fdno;
+
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_put_fdno;
+
+ fd_install(fdno, fault->common.filep);
+
+ return 0;
+out_put_fdno:
+ put_unused_fd(fdno);
+ return rc;
+}
+
+int iommufd_fault_iopf_handler(struct iopf_group *group)
+{
+ struct iommufd_hw_pagetable *hwpt;
+ struct iommufd_fault *fault;
+
+ hwpt = group->attach_handle->domain->iommufd_hwpt;
+ fault = hwpt->fault;
+
+ spin_lock(&fault->common.lock);
+ list_add_tail(&group->node, &fault->common.deliver);
+ spin_unlock(&fault->common.lock);
+
+ wake_up_interruptible(&fault->common.wait_queue);
+
+ return 0;
+}
+
+static const struct file_operations iommufd_veventq_fops =
+ INIT_EVENTQ_FOPS(iommufd_veventq_fops_read, NULL);
+
+int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_veventq_alloc *cmd = ucmd->cmd;
+ struct iommufd_veventq *veventq;
+ struct iommufd_viommu *viommu;
+ int fdno;
+ int rc;
+
+ if (cmd->flags || cmd->__reserved ||
+ cmd->type == IOMMU_VEVENTQ_TYPE_DEFAULT)
+ return -EOPNOTSUPP;
+ if (!cmd->veventq_depth)
+ return -EINVAL;
+
+ viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
+ if (IS_ERR(viommu))
+ return PTR_ERR(viommu);
+
+ down_write(&viommu->veventqs_rwsem);
+
+ if (iommufd_viommu_find_veventq(viommu, cmd->type)) {
+ rc = -EEXIST;
+ goto out_unlock_veventqs;
+ }
+
+ veventq = __iommufd_object_alloc(ucmd->ictx, veventq,
+ IOMMUFD_OBJ_VEVENTQ, common.obj);
+ if (IS_ERR(veventq)) {
+ rc = PTR_ERR(veventq);
+ goto out_unlock_veventqs;
+ }
+
+ veventq->type = cmd->type;
+ veventq->viommu = viommu;
+ refcount_inc(&viommu->obj.users);
+ veventq->depth = cmd->veventq_depth;
+ list_add_tail(&veventq->node, &viommu->veventqs);
+ veventq->lost_events_header.header.flags =
+ IOMMU_VEVENTQ_FLAG_LOST_EVENTS;
+
+ fdno = iommufd_eventq_init(&veventq->common, "[iommufd-viommu-event]",
+ ucmd->ictx, &iommufd_veventq_fops);
+ if (fdno < 0) {
+ rc = fdno;
+ goto out_abort;
+ }
+
+ cmd->out_veventq_id = veventq->common.obj.id;
+ cmd->out_veventq_fd = fdno;
+
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_put_fdno;
+
+ iommufd_object_finalize(ucmd->ictx, &veventq->common.obj);
+ fd_install(fdno, veventq->common.filep);
+ goto out_unlock_veventqs;
+
+out_put_fdno:
+ put_unused_fd(fdno);
+out_abort:
+ iommufd_object_abort_and_destroy(ucmd->ictx, &veventq->common.obj);
+out_unlock_veventqs:
+ up_write(&viommu->veventqs_rwsem);
+ iommufd_put_object(ucmd->ictx, &viommu->obj);
+ return rc;
+}
diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index 33d142f8057d..fe789c2dc0c9 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -8,6 +8,15 @@
#include "../iommu-priv.h"
#include "iommufd_private.h"
+static void __iommufd_hwpt_destroy(struct iommufd_hw_pagetable *hwpt)
+{
+ if (hwpt->domain)
+ iommu_domain_free(hwpt->domain);
+
+ if (hwpt->fault)
+ refcount_dec(&hwpt->fault->common.obj.users);
+}
+
void iommufd_hwpt_paging_destroy(struct iommufd_object *obj)
{
struct iommufd_hwpt_paging *hwpt_paging =
@@ -22,9 +31,7 @@ void iommufd_hwpt_paging_destroy(struct iommufd_object *obj)
hwpt_paging->common.domain);
}
- if (hwpt_paging->common.domain)
- iommu_domain_free(hwpt_paging->common.domain);
-
+ __iommufd_hwpt_destroy(&hwpt_paging->common);
refcount_dec(&hwpt_paging->ioas->obj.users);
}
@@ -49,10 +56,11 @@ void iommufd_hwpt_nested_destroy(struct iommufd_object *obj)
struct iommufd_hwpt_nested *hwpt_nested =
container_of(obj, struct iommufd_hwpt_nested, common.obj);
- if (hwpt_nested->common.domain)
- iommu_domain_free(hwpt_nested->common.domain);
-
- refcount_dec(&hwpt_nested->parent->common.obj.users);
+ __iommufd_hwpt_destroy(&hwpt_nested->common);
+ if (hwpt_nested->viommu)
+ refcount_dec(&hwpt_nested->viommu->obj.users);
+ else
+ refcount_dec(&hwpt_nested->parent->common.obj.users);
}
void iommufd_hwpt_nested_abort(struct iommufd_object *obj)
@@ -82,6 +90,7 @@ iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging)
* @ictx: iommufd context
* @ioas: IOAS to associate the domain with
* @idev: Device to get an iommu_domain for
+ * @pasid: PASID to get an iommu_domain for
* @flags: Flags from userspace
* @immediate_attach: True if idev should be attached to the hwpt
* @user_data: The user provided driver specific data describing the domain to
@@ -97,12 +106,14 @@ iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging)
*/
struct iommufd_hwpt_paging *
iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
- struct iommufd_device *idev, u32 flags,
- bool immediate_attach,
+ struct iommufd_device *idev, ioasid_t pasid,
+ u32 flags, bool immediate_attach,
const struct iommu_user_data *user_data)
{
const u32 valid_flags = IOMMU_HWPT_ALLOC_NEST_PARENT |
- IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+ IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
+ IOMMU_HWPT_FAULT_ID_VALID |
+ IOMMU_HWPT_ALLOC_PASID;
const struct iommu_ops *ops = dev_iommu_ops(idev->dev);
struct iommufd_hwpt_paging *hwpt_paging;
struct iommufd_hw_pagetable *hwpt;
@@ -110,16 +121,23 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
lockdep_assert_held(&ioas->mutex);
- if ((flags || user_data) && !ops->domain_alloc_user)
+ if ((flags || user_data) && !ops->domain_alloc_paging_flags)
return ERR_PTR(-EOPNOTSUPP);
if (flags & ~valid_flags)
return ERR_PTR(-EOPNOTSUPP);
+ if ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) &&
+ !device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING))
+ return ERR_PTR(-EOPNOTSUPP);
+ if ((flags & IOMMU_HWPT_FAULT_ID_VALID) &&
+ (flags & IOMMU_HWPT_ALLOC_NEST_PARENT))
+ return ERR_PTR(-EOPNOTSUPP);
hwpt_paging = __iommufd_object_alloc(
ictx, hwpt_paging, IOMMUFD_OBJ_HWPT_PAGING, common.obj);
if (IS_ERR(hwpt_paging))
return ERR_CAST(hwpt_paging);
hwpt = &hwpt_paging->common;
+ hwpt->pasid_compat = flags & IOMMU_HWPT_ALLOC_PASID;
INIT_LIST_HEAD(&hwpt_paging->hwpt_item);
/* Pairs with iommufd_hw_pagetable_destroy() */
@@ -127,9 +145,9 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
hwpt_paging->ioas = ioas;
hwpt_paging->nest_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
- if (ops->domain_alloc_user) {
- hwpt->domain = ops->domain_alloc_user(idev->dev, flags, NULL,
- user_data);
+ if (ops->domain_alloc_paging_flags) {
+ hwpt->domain = ops->domain_alloc_paging_flags(idev->dev,
+ flags & ~IOMMU_HWPT_FAULT_ID_VALID, user_data);
if (IS_ERR(hwpt->domain)) {
rc = PTR_ERR(hwpt->domain);
hwpt->domain = NULL;
@@ -137,12 +155,15 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
}
hwpt->domain->owner = ops;
} else {
- hwpt->domain = iommu_domain_alloc(idev->dev->bus);
- if (!hwpt->domain) {
- rc = -ENOMEM;
+ hwpt->domain = iommu_paging_domain_alloc(idev->dev);
+ if (IS_ERR(hwpt->domain)) {
+ rc = PTR_ERR(hwpt->domain);
+ hwpt->domain = NULL;
goto out_abort;
}
}
+ hwpt->domain->iommufd_hwpt = hwpt;
+ hwpt->domain->cookie_type = IOMMU_COOKIE_IOMMUFD;
/*
* Set the coherency mode before we do iopt_table_add_domain() as some
@@ -171,7 +192,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
* sequence. Once those drivers are fixed this should be removed.
*/
if (immediate_attach) {
- rc = iommufd_hw_pagetable_attach(hwpt, idev);
+ rc = iommufd_hw_pagetable_attach(hwpt, idev, pasid);
if (rc)
goto out_abort;
}
@@ -184,7 +205,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
out_detach:
if (immediate_attach)
- iommufd_hw_pagetable_detach(idev);
+ iommufd_hw_pagetable_detach(idev, pasid);
out_abort:
iommufd_object_abort_and_destroy(ictx, &hwpt->obj);
return ERR_PTR(rc);
@@ -213,9 +234,11 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
struct iommufd_hw_pagetable *hwpt;
int rc;
- if (flags || !user_data->len || !ops->domain_alloc_user)
+ if ((flags & ~(IOMMU_HWPT_FAULT_ID_VALID | IOMMU_HWPT_ALLOC_PASID)) ||
+ !user_data->len || !ops->domain_alloc_nested)
return ERR_PTR(-EOPNOTSUPP);
- if (parent->auto_domain || !parent->nest_parent)
+ if (parent->auto_domain || !parent->nest_parent ||
+ parent->common.domain->owner != ops)
return ERR_PTR(-EINVAL);
hwpt_nested = __iommufd_object_alloc(
@@ -223,21 +246,25 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
if (IS_ERR(hwpt_nested))
return ERR_CAST(hwpt_nested);
hwpt = &hwpt_nested->common;
+ hwpt->pasid_compat = flags & IOMMU_HWPT_ALLOC_PASID;
refcount_inc(&parent->common.obj.users);
hwpt_nested->parent = parent;
- hwpt->domain = ops->domain_alloc_user(idev->dev, flags,
- parent->common.domain, user_data);
+ hwpt->domain = ops->domain_alloc_nested(
+ idev->dev, parent->common.domain,
+ flags & ~IOMMU_HWPT_FAULT_ID_VALID, user_data);
if (IS_ERR(hwpt->domain)) {
rc = PTR_ERR(hwpt->domain);
hwpt->domain = NULL;
goto out_abort;
}
hwpt->domain->owner = ops;
+ hwpt->domain->iommufd_hwpt = hwpt;
+ hwpt->domain->cookie_type = IOMMU_COOKIE_IOMMUFD;
if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
- rc = -EINVAL;
+ rc = -EOPNOTSUPP;
goto out_abort;
}
return hwpt_nested;
@@ -247,6 +274,63 @@ out_abort:
return ERR_PTR(rc);
}
+/**
+ * iommufd_viommu_alloc_hwpt_nested() - Get a hwpt_nested for a vIOMMU
+ * @viommu: vIOMMU ojbect to associate the hwpt_nested/domain with
+ * @flags: Flags from userspace
+ * @user_data: user_data pointer. Must be valid
+ *
+ * Allocate a new IOMMU_DOMAIN_NESTED for a vIOMMU and return it as a NESTED
+ * hw_pagetable.
+ */
+static struct iommufd_hwpt_nested *
+iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags,
+ const struct iommu_user_data *user_data)
+{
+ struct iommufd_hwpt_nested *hwpt_nested;
+ struct iommufd_hw_pagetable *hwpt;
+ int rc;
+
+ if (flags & ~(IOMMU_HWPT_FAULT_ID_VALID | IOMMU_HWPT_ALLOC_PASID))
+ return ERR_PTR(-EOPNOTSUPP);
+ if (!user_data->len)
+ return ERR_PTR(-EOPNOTSUPP);
+ if (!viommu->ops || !viommu->ops->alloc_domain_nested)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ hwpt_nested = __iommufd_object_alloc(
+ viommu->ictx, hwpt_nested, IOMMUFD_OBJ_HWPT_NESTED, common.obj);
+ if (IS_ERR(hwpt_nested))
+ return ERR_CAST(hwpt_nested);
+ hwpt = &hwpt_nested->common;
+ hwpt->pasid_compat = flags & IOMMU_HWPT_ALLOC_PASID;
+
+ hwpt_nested->viommu = viommu;
+ refcount_inc(&viommu->obj.users);
+ hwpt_nested->parent = viommu->hwpt;
+
+ hwpt->domain = viommu->ops->alloc_domain_nested(
+ viommu, flags & ~IOMMU_HWPT_FAULT_ID_VALID, user_data);
+ if (IS_ERR(hwpt->domain)) {
+ rc = PTR_ERR(hwpt->domain);
+ hwpt->domain = NULL;
+ goto out_abort;
+ }
+ hwpt->domain->iommufd_hwpt = hwpt;
+ hwpt->domain->owner = viommu->iommu_dev->ops;
+ hwpt->domain->cookie_type = IOMMU_COOKIE_IOMMUFD;
+
+ if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
+ rc = -EOPNOTSUPP;
+ goto out_abort;
+ }
+ return hwpt_nested;
+
+out_abort:
+ iommufd_object_abort_and_destroy(viommu->ictx, &hwpt->obj);
+ return ERR_PTR(rc);
+}
+
int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
{
struct iommu_hwpt_alloc *cmd = ucmd->cmd;
@@ -283,8 +367,8 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
ioas = container_of(pt_obj, struct iommufd_ioas, obj);
mutex_lock(&ioas->mutex);
hwpt_paging = iommufd_hwpt_paging_alloc(
- ucmd->ictx, ioas, idev, cmd->flags, false,
- user_data.len ? &user_data : NULL);
+ ucmd->ictx, ioas, idev, IOMMU_NO_PASID, cmd->flags,
+ false, user_data.len ? &user_data : NULL);
if (IS_ERR(hwpt_paging)) {
rc = PTR_ERR(hwpt_paging);
goto out_unlock;
@@ -303,11 +387,41 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
goto out_unlock;
}
hwpt = &hwpt_nested->common;
+ } else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) {
+ struct iommufd_hwpt_nested *hwpt_nested;
+ struct iommufd_viommu *viommu;
+
+ viommu = container_of(pt_obj, struct iommufd_viommu, obj);
+ if (viommu->iommu_dev != __iommu_get_iommu_dev(idev->dev)) {
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+ hwpt_nested = iommufd_viommu_alloc_hwpt_nested(
+ viommu, cmd->flags, &user_data);
+ if (IS_ERR(hwpt_nested)) {
+ rc = PTR_ERR(hwpt_nested);
+ goto out_unlock;
+ }
+ hwpt = &hwpt_nested->common;
} else {
rc = -EINVAL;
goto out_put_pt;
}
+ if (cmd->flags & IOMMU_HWPT_FAULT_ID_VALID) {
+ struct iommufd_fault *fault;
+
+ fault = iommufd_get_fault(ucmd, cmd->fault_id);
+ if (IS_ERR(fault)) {
+ rc = PTR_ERR(fault);
+ goto out_hwpt;
+ }
+ hwpt->fault = fault;
+ hwpt->domain->iopf_handler = iommufd_fault_iopf_handler;
+ refcount_inc(&fault->common.obj.users);
+ iommufd_put_object(ucmd->ictx, &fault->common.obj);
+ }
+
cmd->out_hwpt_id = hwpt->obj.id;
rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
if (rc)
@@ -384,7 +498,7 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd)
.entry_len = cmd->entry_len,
.entry_num = cmd->entry_num,
};
- struct iommufd_hw_pagetable *hwpt;
+ struct iommufd_object *pt_obj;
u32 done_num = 0;
int rc;
@@ -398,17 +512,40 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd)
goto out;
}
- hwpt = iommufd_get_hwpt_nested(ucmd, cmd->hwpt_id);
- if (IS_ERR(hwpt)) {
- rc = PTR_ERR(hwpt);
+ pt_obj = iommufd_get_object(ucmd->ictx, cmd->hwpt_id, IOMMUFD_OBJ_ANY);
+ if (IS_ERR(pt_obj)) {
+ rc = PTR_ERR(pt_obj);
goto out;
}
+ if (pt_obj->type == IOMMUFD_OBJ_HWPT_NESTED) {
+ struct iommufd_hw_pagetable *hwpt =
+ container_of(pt_obj, struct iommufd_hw_pagetable, obj);
+
+ if (!hwpt->domain->ops ||
+ !hwpt->domain->ops->cache_invalidate_user) {
+ rc = -EOPNOTSUPP;
+ goto out_put_pt;
+ }
+ rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain,
+ &data_array);
+ } else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) {
+ struct iommufd_viommu *viommu =
+ container_of(pt_obj, struct iommufd_viommu, obj);
+
+ if (!viommu->ops || !viommu->ops->cache_invalidate) {
+ rc = -EOPNOTSUPP;
+ goto out_put_pt;
+ }
+ rc = viommu->ops->cache_invalidate(viommu, &data_array);
+ } else {
+ rc = -EINVAL;
+ goto out_put_pt;
+ }
- rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain,
- &data_array);
done_num = data_array.entry_num;
- iommufd_put_object(ucmd->ictx, &hwpt->obj);
+out_put_pt:
+ iommufd_put_object(ucmd->ictx, pt_obj);
out:
cmd->entry_num = done_num;
if (iommufd_ucmd_respond(ucmd, sizeof(*cmd)))
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 05fd9d3abf1b..54cf4d856179 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -8,17 +8,19 @@
* The datastructure uses the iopt_pages to optimize the storage of the PFNs
* between the domains and xarray.
*/
+#include <linux/dma-buf.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/lockdep.h>
-#include <linux/iommu.h>
#include <linux/sched/mm.h>
-#include <linux/err.h>
#include <linux/slab.h>
-#include <linux/errno.h>
#include <uapi/linux/iommufd.h>
-#include "io_pagetable.h"
#include "double_span.h"
+#include "io_pagetable.h"
struct iopt_pages_list {
struct iopt_pages *pages;
@@ -70,36 +72,45 @@ struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
return iter->area;
}
-static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
- unsigned long length,
- unsigned long iova_alignment,
- unsigned long page_offset)
+static bool __alloc_iova_check_range(unsigned long *start, unsigned long last,
+ unsigned long length,
+ unsigned long iova_alignment,
+ unsigned long page_offset)
{
- if (span->is_used || span->last_hole - span->start_hole < length - 1)
+ unsigned long aligned_start;
+
+ /* ALIGN_UP() */
+ if (check_add_overflow(*start, iova_alignment - 1, &aligned_start))
return false;
+ aligned_start &= ~(iova_alignment - 1);
+ aligned_start |= page_offset;
- span->start_hole = ALIGN(span->start_hole, iova_alignment) |
- page_offset;
- if (span->start_hole > span->last_hole ||
- span->last_hole - span->start_hole < length - 1)
+ if (aligned_start >= last || last - aligned_start < length - 1)
return false;
+ *start = aligned_start;
return true;
}
-static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
+static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
unsigned long length,
unsigned long iova_alignment,
unsigned long page_offset)
{
- if (span->is_hole || span->last_used - span->start_used < length - 1)
+ if (span->is_used)
return false;
+ return __alloc_iova_check_range(&span->start_hole, span->last_hole,
+ length, iova_alignment, page_offset);
+}
- span->start_used = ALIGN(span->start_used, iova_alignment) |
- page_offset;
- if (span->start_used > span->last_used ||
- span->last_used - span->start_used < length - 1)
+static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
+ unsigned long length,
+ unsigned long iova_alignment,
+ unsigned long page_offset)
+{
+ if (span->is_hole)
return false;
- return true;
+ return __alloc_iova_check_range(&span->start_used, span->last_used,
+ length, iova_alignment, page_offset);
}
/*
@@ -107,11 +118,12 @@ static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
* Does not return a 0 IOVA even if it is valid.
*/
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
- unsigned long uptr, unsigned long length)
+ unsigned long addr, unsigned long length)
{
- unsigned long page_offset = uptr % PAGE_SIZE;
+ unsigned long page_offset = addr % PAGE_SIZE;
struct interval_tree_double_span_iter used_span;
struct interval_tree_span_iter allowed_span;
+ unsigned long max_alignment = PAGE_SIZE;
unsigned long iova_alignment;
lockdep_assert_held(&iopt->iova_rwsem);
@@ -121,15 +133,22 @@ static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
return -EOVERFLOW;
/*
- * Keep alignment present in the uptr when building the IOVA, this
+ * Keep alignment present in addr when building the IOVA, which
* increases the chance we can map a THP.
*/
- if (!uptr)
+ if (!addr)
iova_alignment = roundup_pow_of_two(length);
else
iova_alignment = min_t(unsigned long,
roundup_pow_of_two(length),
- 1UL << __ffs64(uptr));
+ 1UL << __ffs64(addr));
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ max_alignment = HPAGE_SIZE;
+#endif
+ /* Protect against ALIGN() overflow */
+ if (iova_alignment >= max_alignment)
+ iova_alignment = max_alignment;
if (iova_alignment < iopt->iova_alignment)
return -EINVAL;
@@ -240,6 +259,7 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt,
int iommu_prot, unsigned int flags)
{
struct iopt_pages_list *elm;
+ unsigned long start;
unsigned long iova;
int rc = 0;
@@ -259,9 +279,18 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt,
/* Use the first entry to guess the ideal IOVA alignment */
elm = list_first_entry(pages_list, struct iopt_pages_list,
next);
- rc = iopt_alloc_iova(
- iopt, dst_iova,
- (uintptr_t)elm->pages->uptr + elm->start_byte, length);
+ switch (elm->pages->type) {
+ case IOPT_ADDRESS_USER:
+ start = elm->start_byte + (uintptr_t)elm->pages->uptr;
+ break;
+ case IOPT_ADDRESS_FILE:
+ start = elm->start_byte + elm->pages->start;
+ break;
+ case IOPT_ADDRESS_DMABUF:
+ start = elm->start_byte + elm->pages->dmabuf.start;
+ break;
+ }
+ rc = iopt_alloc_iova(iopt, dst_iova, start, length);
if (rc)
goto out_unlock;
if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
@@ -376,6 +405,34 @@ out_unlock_domains:
return rc;
}
+static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
+ struct iopt_pages *pages, unsigned long *iova,
+ unsigned long length, unsigned long start_byte,
+ int iommu_prot, unsigned int flags)
+{
+ struct iopt_pages_list elm = {};
+ LIST_HEAD(pages_list);
+ int rc;
+
+ elm.pages = pages;
+ elm.start_byte = start_byte;
+ if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
+ elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
+ elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
+ elm.length = length;
+ list_add(&elm.next, &pages_list);
+
+ rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
+ if (rc) {
+ if (elm.area)
+ iopt_abort_area(elm.area);
+ if (elm.pages)
+ iopt_put_pages(elm.pages);
+ return rc;
+ }
+ return 0;
+}
+
/**
* iopt_map_user_pages() - Map a user VA to an iova in the io page table
* @ictx: iommufd_ctx the iopt is part of
@@ -400,29 +457,69 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
unsigned long length, int iommu_prot,
unsigned int flags)
{
- struct iopt_pages_list elm = {};
- LIST_HEAD(pages_list);
- int rc;
+ struct iopt_pages *pages;
- elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
- if (IS_ERR(elm.pages))
- return PTR_ERR(elm.pages);
- if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
- elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
- elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
- elm.start_byte = uptr - elm.pages->uptr;
- elm.length = length;
- list_add(&elm.next, &pages_list);
+ pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
- rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
- if (rc) {
- if (elm.area)
- iopt_abort_area(elm.area);
- if (elm.pages)
- iopt_put_pages(elm.pages);
- return rc;
+ return iopt_map_common(ictx, iopt, pages, iova, length,
+ uptr - pages->uptr, iommu_prot, flags);
+}
+
+/**
+ * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file.
+ * @ictx: iommufd_ctx the iopt is part of
+ * @iopt: io_pagetable to act on
+ * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
+ * the chosen iova on output. Otherwise is the iova to map to on input
+ * @fd: fdno of a file to map
+ * @start: map file starting at this byte offset
+ * @length: Number of bytes to map
+ * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
+ * @flags: IOPT_ALLOC_IOVA or zero
+ */
+int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
+ unsigned long *iova, int fd, unsigned long start,
+ unsigned long length, int iommu_prot,
+ unsigned int flags)
+{
+ struct iopt_pages *pages;
+ struct dma_buf *dmabuf;
+ unsigned long start_byte;
+ unsigned long last;
+
+ if (!length)
+ return -EINVAL;
+ if (check_add_overflow(start, length - 1, &last))
+ return -EOVERFLOW;
+
+ start_byte = start - ALIGN_DOWN(start, PAGE_SIZE);
+ dmabuf = dma_buf_get(fd);
+ if (!IS_ERR(dmabuf)) {
+ pages = iopt_alloc_dmabuf_pages(ictx, dmabuf, start_byte, start,
+ length,
+ iommu_prot & IOMMU_WRITE);
+ if (IS_ERR(pages)) {
+ dma_buf_put(dmabuf);
+ return PTR_ERR(pages);
+ }
+ } else {
+ struct file *file;
+
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+
+ pages = iopt_alloc_file_pages(file, start_byte, start, length,
+ iommu_prot & IOMMU_WRITE);
+ fput(file);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
}
- return 0;
+
+ return iopt_map_common(ictx, iopt, pages, iova, length,
+ start_byte, iommu_prot, flags);
}
struct iova_bitmap_fn_arg {
@@ -643,7 +740,8 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
struct iopt_area *area;
unsigned long unmapped_bytes = 0;
unsigned int tries = 0;
- int rc = -ENOENT;
+ /* If there are no mapped entries then success */
+ int rc = 0;
/*
* The domains_rwsem must be held in read mode any time any area->pages
@@ -664,6 +762,12 @@ again:
goto out_unlock_iova;
}
+ /* The area is locked by an object that has not been destroyed */
+ if (area->num_locks) {
+ rc = -EBUSY;
+ goto out_unlock_iova;
+ }
+
if (area_first < start || area_last > last) {
rc = -ENOENT;
goto out_unlock_iova;
@@ -688,8 +792,10 @@ again:
iommufd_access_notify_unmap(iopt, area_first, length);
/* Something is not responding to unmap requests. */
tries++;
- if (WARN_ON(tries > 100))
- return -EDEADLOCK;
+ if (WARN_ON(tries > 100)) {
+ rc = -EDEADLOCK;
+ goto out_unmapped;
+ }
goto again;
}
@@ -705,12 +811,11 @@ again:
down_write(&iopt->iova_rwsem);
}
- if (unmapped_bytes)
- rc = 0;
out_unlock_iova:
up_write(&iopt->iova_rwsem);
up_read(&iopt->domains_rwsem);
+out_unmapped:
if (unmapped)
*unmapped = unmapped_bytes;
return rc;
@@ -742,13 +847,8 @@ int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
{
- int rc;
-
- rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
/* If the IOVAs are empty then unmap all succeeds */
- if (rc == -ENOENT)
- return 0;
- return rc;
+ return iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
}
/* The caller must always free all the nodes in the allowed_iova rb_root. */
@@ -894,9 +994,15 @@ static void iopt_unfill_domain(struct io_pagetable *iopt,
WARN_ON(!area->storage_domain);
if (area->storage_domain == domain)
area->storage_domain = storage_domain;
+ if (iopt_is_dmabuf(pages)) {
+ if (!iopt_dmabuf_revoked(pages))
+ iopt_area_unmap_domain(area, domain);
+ iopt_dmabuf_untrack_domain(pages, area, domain);
+ }
mutex_unlock(&pages->mutex);
- iopt_area_unmap_domain(area, domain);
+ if (!iopt_is_dmabuf(pages))
+ iopt_area_unmap_domain(area, domain);
}
return;
}
@@ -913,6 +1019,8 @@ static void iopt_unfill_domain(struct io_pagetable *iopt,
WARN_ON(area->storage_domain != domain);
area->storage_domain = NULL;
iopt_area_unfill_domain(area, pages, domain);
+ if (iopt_is_dmabuf(pages))
+ iopt_dmabuf_untrack_domain(pages, area, domain);
mutex_unlock(&pages->mutex);
}
}
@@ -942,10 +1050,16 @@ static int iopt_fill_domain(struct io_pagetable *iopt,
if (!pages)
continue;
- mutex_lock(&pages->mutex);
+ guard(mutex)(&pages->mutex);
+ if (iopt_is_dmabuf(pages)) {
+ rc = iopt_dmabuf_track_domain(pages, area, domain);
+ if (rc)
+ goto out_unfill;
+ }
rc = iopt_area_fill_domain(area, domain);
if (rc) {
- mutex_unlock(&pages->mutex);
+ if (iopt_is_dmabuf(pages))
+ iopt_dmabuf_untrack_domain(pages, area, domain);
goto out_unfill;
}
if (!area->storage_domain) {
@@ -954,7 +1068,6 @@ static int iopt_fill_domain(struct io_pagetable *iopt,
interval_tree_insert(&area->pages_node,
&pages->domains_itree);
}
- mutex_unlock(&pages->mutex);
}
return 0;
@@ -975,6 +1088,8 @@ out_unfill:
area->storage_domain = NULL;
}
iopt_area_unfill_domain(area, pages, domain);
+ if (iopt_is_dmabuf(pages))
+ iopt_dmabuf_untrack_domain(pages, area, domain);
mutex_unlock(&pages->mutex);
}
return rc;
@@ -1185,6 +1300,10 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova)
if (!pages || area->prevent_access)
return -EBUSY;
+ /* Maintaining the domains_itree below is a bit complicated */
+ if (iopt_is_dmabuf(pages))
+ return -EOPNOTSUPP;
+
if (new_start & (alignment - 1) ||
iopt_area_start_byte(area, new_start) & (alignment - 1))
return -EINVAL;
@@ -1355,8 +1474,7 @@ out_unlock:
}
void iopt_remove_access(struct io_pagetable *iopt,
- struct iommufd_access *access,
- u32 iopt_access_list_id)
+ struct iommufd_access *access, u32 iopt_access_list_id)
{
down_write(&iopt->domains_rwsem);
down_write(&iopt->iova_rwsem);
diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
index 0ec3509b7e33..14cd052fd320 100644
--- a/drivers/iommu/iommufd/io_pagetable.h
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -5,9 +5,10 @@
#ifndef __IO_PAGETABLE_H
#define __IO_PAGETABLE_H
+#include <linux/dma-buf.h>
#include <linux/interval_tree.h>
-#include <linux/mutex.h>
#include <linux/kref.h>
+#include <linux/mutex.h>
#include <linux/xarray.h>
#include "iommufd_private.h"
@@ -48,6 +49,7 @@ struct iopt_area {
int iommu_prot;
bool prevent_access : 1;
unsigned int num_accesses;
+ unsigned int num_locks;
};
struct iopt_allowed {
@@ -68,6 +70,16 @@ void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages,
void iopt_area_unmap_domain(struct iopt_area *area,
struct iommu_domain *domain);
+int iopt_dmabuf_track_domain(struct iopt_pages *pages, struct iopt_area *area,
+ struct iommu_domain *domain);
+void iopt_dmabuf_untrack_domain(struct iopt_pages *pages,
+ struct iopt_area *area,
+ struct iommu_domain *domain);
+int iopt_dmabuf_track_all_domains(struct iopt_area *area,
+ struct iopt_pages *pages);
+void iopt_dmabuf_untrack_all_domains(struct iopt_area *area,
+ struct iopt_pages *pages);
+
static inline unsigned long iopt_area_index(struct iopt_area *area)
{
return area->pages_node.start;
@@ -173,6 +185,27 @@ enum {
IOPT_PAGES_ACCOUNT_NONE = 0,
IOPT_PAGES_ACCOUNT_USER = 1,
IOPT_PAGES_ACCOUNT_MM = 2,
+ IOPT_PAGES_ACCOUNT_MODE_NUM = 3,
+};
+
+enum iopt_address_type {
+ IOPT_ADDRESS_USER = 0,
+ IOPT_ADDRESS_FILE,
+ IOPT_ADDRESS_DMABUF,
+};
+
+struct iopt_pages_dmabuf_track {
+ struct iommu_domain *domain;
+ struct iopt_area *area;
+ struct list_head elm;
+};
+
+struct iopt_pages_dmabuf {
+ struct dma_buf_attachment *attach;
+ struct dma_buf_phys_vec phys;
+ /* Always PAGE_SIZE aligned */
+ unsigned long start;
+ struct list_head tracker;
};
/*
@@ -195,7 +228,16 @@ struct iopt_pages {
struct task_struct *source_task;
struct mm_struct *source_mm;
struct user_struct *source_user;
- void __user *uptr;
+ enum iopt_address_type type;
+ union {
+ void __user *uptr; /* IOPT_ADDRESS_USER */
+ struct { /* IOPT_ADDRESS_FILE */
+ struct file *file;
+ unsigned long start;
+ };
+ /* IOPT_ADDRESS_DMABUF */
+ struct iopt_pages_dmabuf dmabuf;
+ };
bool writable:1;
u8 account_mode;
@@ -206,8 +248,32 @@ struct iopt_pages {
struct rb_root_cached domains_itree;
};
-struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
- bool writable);
+static inline bool iopt_is_dmabuf(struct iopt_pages *pages)
+{
+ if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+ return false;
+ return pages->type == IOPT_ADDRESS_DMABUF;
+}
+
+static inline bool iopt_dmabuf_revoked(struct iopt_pages *pages)
+{
+ lockdep_assert_held(&pages->mutex);
+ if (iopt_is_dmabuf(pages))
+ return pages->dmabuf.phys.len == 0;
+ return false;
+}
+
+struct iopt_pages *iopt_alloc_user_pages(void __user *uptr,
+ unsigned long length, bool writable);
+struct iopt_pages *iopt_alloc_file_pages(struct file *file,
+ unsigned long start_byte,
+ unsigned long start,
+ unsigned long length, bool writable);
+struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx,
+ struct dma_buf *dmabuf,
+ unsigned long start_byte,
+ unsigned long start,
+ unsigned long length, bool writable);
void iopt_release_pages(struct kref *kref);
static inline void iopt_put_pages(struct iopt_pages *pages)
{
@@ -223,9 +289,9 @@ void iopt_pages_unfill_xarray(struct iopt_pages *pages, unsigned long start,
int iopt_area_add_access(struct iopt_area *area, unsigned long start,
unsigned long last, struct page **out_pages,
- unsigned int flags);
+ unsigned int flags, bool lock_area);
void iopt_area_remove_access(struct iopt_area *area, unsigned long start,
- unsigned long last);
+ unsigned long last, bool unlock_area);
int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte,
void *data, unsigned long length, unsigned int flags);
@@ -238,4 +304,9 @@ struct iopt_pages_access {
unsigned int users;
};
+struct pfn_reader_user;
+
+int iopt_pages_update_pinned(struct iopt_pages *pages, unsigned long npages,
+ bool inc, struct pfn_reader_user *user);
+
#endif
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
index 742248276548..f4721afedadc 100644
--- a/drivers/iommu/iommufd/ioas.c
+++ b/drivers/iommu/iommufd/ioas.c
@@ -2,9 +2,10 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
*/
+#include <linux/file.h>
#include <linux/interval_tree.h>
-#include <linux/iommufd.h>
#include <linux/iommu.h>
+#include <linux/iommufd.h>
#include <uapi/linux/iommufd.h>
#include "io_pagetable.h"
@@ -51,7 +52,10 @@ int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd)
rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
if (rc)
goto out_table;
+
+ down_read(&ucmd->ictx->ioas_creation_lock);
iommufd_object_finalize(ucmd->ictx, &ioas->obj);
+ up_read(&ucmd->ictx->ioas_creation_lock);
return 0;
out_table:
@@ -197,6 +201,46 @@ static int conv_iommu_prot(u32 map_flags)
return iommu_prot;
}
+int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_map_file *cmd = ucmd->cmd;
+ unsigned long iova = cmd->iova;
+ struct iommufd_ioas *ioas;
+ unsigned int flags = 0;
+ int rc;
+
+ if (cmd->flags &
+ ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
+ IOMMU_IOAS_MAP_READABLE))
+ return -EOPNOTSUPP;
+
+ if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX)
+ return -EOVERFLOW;
+
+ if (!(cmd->flags &
+ (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE)))
+ return -EINVAL;
+
+ ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
+ flags = IOPT_ALLOC_IOVA;
+
+ rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, cmd->fd,
+ cmd->start, cmd->length,
+ conv_iommu_prot(cmd->flags), flags);
+ if (rc)
+ goto out_put;
+
+ cmd->iova = iova;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+out_put:
+ iommufd_put_object(ucmd->ictx, &ioas->obj);
+ return rc;
+}
+
int iommufd_ioas_map(struct iommufd_ucmd *ucmd)
{
struct iommu_ioas_map *cmd = ucmd->cmd;
@@ -213,6 +257,10 @@ int iommufd_ioas_map(struct iommufd_ucmd *ucmd)
if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX)
return -EOVERFLOW;
+ if (!(cmd->flags &
+ (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE)))
+ return -EINVAL;
+
ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
if (IS_ERR(ioas))
return PTR_ERR(ioas);
@@ -253,6 +301,10 @@ int iommufd_ioas_copy(struct iommufd_ucmd *ucmd)
cmd->dst_iova >= ULONG_MAX)
return -EOVERFLOW;
+ if (!(cmd->flags &
+ (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE)))
+ return -EINVAL;
+
src_ioas = iommufd_get_ioas(ucmd->ictx, cmd->src_ioas_id);
if (IS_ERR(src_ioas))
return PTR_ERR(src_ioas);
@@ -309,6 +361,10 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd)
&unmapped);
if (rc)
goto out_put;
+ if (!unmapped) {
+ rc = -ENOENT;
+ goto out_put;
+ }
}
cmd->length = unmapped;
@@ -319,6 +375,215 @@ out_put:
return rc;
}
+static void iommufd_release_all_iova_rwsem(struct iommufd_ctx *ictx,
+ struct xarray *ioas_list)
+{
+ struct iommufd_ioas *ioas;
+ unsigned long index;
+
+ xa_for_each(ioas_list, index, ioas) {
+ up_write(&ioas->iopt.iova_rwsem);
+ refcount_dec(&ioas->obj.users);
+ }
+ up_write(&ictx->ioas_creation_lock);
+ xa_destroy(ioas_list);
+}
+
+static int iommufd_take_all_iova_rwsem(struct iommufd_ctx *ictx,
+ struct xarray *ioas_list)
+{
+ struct iommufd_object *obj;
+ unsigned long index;
+ int rc;
+
+ /*
+ * This is very ugly, it is done instead of adding a lock around
+ * pages->source_mm, which is a performance path for mdev, we just
+ * obtain the write side of all the iova_rwsems which also protects the
+ * pages->source_*. Due to copies we can't know which IOAS could read
+ * from the pages, so we just lock everything. This is the only place
+ * locks are nested and they are uniformly taken in ID order.
+ *
+ * ioas_creation_lock prevents new IOAS from being installed in the
+ * xarray while we do this, and also prevents more than one thread from
+ * holding nested locks.
+ */
+ down_write(&ictx->ioas_creation_lock);
+ xa_lock(&ictx->objects);
+ xa_for_each(&ictx->objects, index, obj) {
+ struct iommufd_ioas *ioas;
+
+ if (!obj || obj->type != IOMMUFD_OBJ_IOAS)
+ continue;
+
+ if (!refcount_inc_not_zero(&obj->users))
+ continue;
+
+ xa_unlock(&ictx->objects);
+
+ ioas = container_of(obj, struct iommufd_ioas, obj);
+ down_write_nest_lock(&ioas->iopt.iova_rwsem,
+ &ictx->ioas_creation_lock);
+
+ rc = xa_err(xa_store(ioas_list, index, ioas, GFP_KERNEL));
+ if (rc) {
+ iommufd_release_all_iova_rwsem(ictx, ioas_list);
+ return rc;
+ }
+
+ xa_lock(&ictx->objects);
+ }
+ xa_unlock(&ictx->objects);
+ return 0;
+}
+
+static bool need_charge_update(struct iopt_pages *pages)
+{
+ switch (pages->account_mode) {
+ case IOPT_PAGES_ACCOUNT_NONE:
+ return false;
+ case IOPT_PAGES_ACCOUNT_MM:
+ return pages->source_mm != current->mm;
+ case IOPT_PAGES_ACCOUNT_USER:
+ /*
+ * Update when mm changes because it also accounts
+ * in mm->pinned_vm.
+ */
+ return (pages->source_user != current_user()) ||
+ (pages->source_mm != current->mm);
+ }
+ return true;
+}
+
+static int charge_current(unsigned long *npinned)
+{
+ struct iopt_pages tmp = {
+ .source_mm = current->mm,
+ .source_task = current->group_leader,
+ .source_user = current_user(),
+ };
+ unsigned int account_mode;
+ int rc;
+
+ for (account_mode = 0; account_mode != IOPT_PAGES_ACCOUNT_MODE_NUM;
+ account_mode++) {
+ if (!npinned[account_mode])
+ continue;
+
+ tmp.account_mode = account_mode;
+ rc = iopt_pages_update_pinned(&tmp, npinned[account_mode], true,
+ NULL);
+ if (rc)
+ goto err_undo;
+ }
+ return 0;
+
+err_undo:
+ while (account_mode != 0) {
+ account_mode--;
+ if (!npinned[account_mode])
+ continue;
+ tmp.account_mode = account_mode;
+ iopt_pages_update_pinned(&tmp, npinned[account_mode], false,
+ NULL);
+ }
+ return rc;
+}
+
+static void change_mm(struct iopt_pages *pages)
+{
+ struct task_struct *old_task = pages->source_task;
+ struct user_struct *old_user = pages->source_user;
+ struct mm_struct *old_mm = pages->source_mm;
+
+ pages->source_mm = current->mm;
+ mmgrab(pages->source_mm);
+ mmdrop(old_mm);
+
+ pages->source_task = current->group_leader;
+ get_task_struct(pages->source_task);
+ put_task_struct(old_task);
+
+ pages->source_user = get_uid(current_user());
+ free_uid(old_user);
+}
+
+#define for_each_ioas_area(_xa, _index, _ioas, _area) \
+ xa_for_each((_xa), (_index), (_ioas)) \
+ for (_area = iopt_area_iter_first(&_ioas->iopt, 0, ULONG_MAX); \
+ _area; \
+ _area = iopt_area_iter_next(_area, 0, ULONG_MAX))
+
+int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_change_process *cmd = ucmd->cmd;
+ struct iommufd_ctx *ictx = ucmd->ictx;
+ unsigned long all_npinned[IOPT_PAGES_ACCOUNT_MODE_NUM] = {};
+ struct iommufd_ioas *ioas;
+ struct iopt_area *area;
+ struct iopt_pages *pages;
+ struct xarray ioas_list;
+ unsigned long index;
+ int rc;
+
+ if (cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ xa_init(&ioas_list);
+ rc = iommufd_take_all_iova_rwsem(ictx, &ioas_list);
+ if (rc)
+ return rc;
+
+ for_each_ioas_area(&ioas_list, index, ioas, area) {
+ if (area->pages->type != IOPT_ADDRESS_FILE) {
+ rc = -EINVAL;
+ goto out;
+ }
+ }
+
+ /*
+ * Count last_pinned pages, then clear it to avoid double counting
+ * if the same iopt_pages is visited multiple times in this loop.
+ * Since we are under all the locks, npinned == last_npinned, so we
+ * can easily restore last_npinned before we return.
+ */
+ for_each_ioas_area(&ioas_list, index, ioas, area) {
+ pages = area->pages;
+
+ if (need_charge_update(pages)) {
+ all_npinned[pages->account_mode] += pages->last_npinned;
+ pages->last_npinned = 0;
+ }
+ }
+
+ rc = charge_current(all_npinned);
+
+ if (rc) {
+ /* Charge failed. Fix last_npinned and bail. */
+ for_each_ioas_area(&ioas_list, index, ioas, area)
+ area->pages->last_npinned = area->pages->npinned;
+ goto out;
+ }
+
+ for_each_ioas_area(&ioas_list, index, ioas, area) {
+ pages = area->pages;
+
+ /* Uncharge the old one (which also restores last_npinned) */
+ if (need_charge_update(pages)) {
+ int r = iopt_pages_update_pinned(pages, pages->npinned,
+ false, NULL);
+
+ if (WARN_ON(r))
+ rc = r;
+ }
+ change_mm(pages);
+ }
+
+out:
+ iommufd_release_all_iova_rwsem(ictx, &ioas_list);
+ return rc;
+}
+
int iommufd_option_rlimit_mode(struct iommu_option *cmd,
struct iommufd_ctx *ictx)
{
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 991f864d1f9b..eb6d1a70f673 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -4,24 +4,54 @@
#ifndef __IOMMUFD_PRIVATE_H
#define __IOMMUFD_PRIVATE_H
-#include <linux/rwsem.h>
-#include <linux/xarray.h>
-#include <linux/refcount.h>
-#include <linux/uaccess.h>
#include <linux/iommu.h>
+#include <linux/iommufd.h>
#include <linux/iova_bitmap.h>
+#include <linux/maple_tree.h>
+#include <linux/rwsem.h>
+#include <linux/uaccess.h>
+#include <linux/xarray.h>
#include <uapi/linux/iommufd.h>
+#include "../iommu-priv.h"
+
struct iommu_domain;
struct iommu_group;
struct iommu_option;
struct iommufd_device;
+struct dma_buf_attachment;
+struct dma_buf_phys_vec;
+
+struct iommufd_sw_msi_map {
+ struct list_head sw_msi_item;
+ phys_addr_t sw_msi_start;
+ phys_addr_t msi_addr;
+ unsigned int pgoff;
+ unsigned int id;
+};
+
+/* Bitmap of struct iommufd_sw_msi_map::id */
+struct iommufd_sw_msi_maps {
+ DECLARE_BITMAP(bitmap, 64);
+};
+
+#ifdef CONFIG_IRQ_MSI_IOMMU
+int iommufd_sw_msi_install(struct iommufd_ctx *ictx,
+ struct iommufd_hwpt_paging *hwpt_paging,
+ struct iommufd_sw_msi_map *msi_map);
+#endif
struct iommufd_ctx {
struct file *file;
struct xarray objects;
struct xarray groups;
wait_queue_head_t destroy_wait;
+ struct rw_semaphore ioas_creation_lock;
+ struct maple_tree mt_mmap;
+
+ struct mutex sw_msi_lock;
+ struct list_head sw_msi_list;
+ unsigned int sw_msi_id;
u8 account_mode;
/* Compatibility with VFIO no iommu */
@@ -29,6 +59,18 @@ struct iommufd_ctx {
struct iommufd_ioas *vfio_ioas;
};
+/* Entry for iommufd_ctx::mt_mmap */
+struct iommufd_mmap {
+ struct iommufd_object *owner;
+
+ /* Page-shifted start position in mt_mmap to validate vma->vm_pgoff */
+ unsigned long vm_pgoff;
+
+ /* Physical range for io_remap_pfn_range() */
+ phys_addr_t mmio_addr;
+ size_t length;
+};
+
/*
* The IOVA to PFN map. The map automatically copies the PFNs into multiple
* domains and permits sharing of PFNs between io_pagetable instances. This
@@ -67,6 +109,10 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
unsigned long *iova, void __user *uptr,
unsigned long length, int iommu_prot,
unsigned int flags);
+int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
+ unsigned long *iova, int fd,
+ unsigned long start, unsigned long length,
+ int iommu_prot, unsigned int flags);
int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
unsigned long length, unsigned long *dst_iova,
int iommu_prot, unsigned int flags);
@@ -105,6 +151,7 @@ struct iommufd_ucmd {
void __user *ubuffer;
u32 user_size;
void *cmd;
+ struct iommufd_object *new_obj;
};
int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,
@@ -120,33 +167,11 @@ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd,
return 0;
}
-enum iommufd_object_type {
- IOMMUFD_OBJ_NONE,
- IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
- IOMMUFD_OBJ_DEVICE,
- IOMMUFD_OBJ_HWPT_PAGING,
- IOMMUFD_OBJ_HWPT_NESTED,
- IOMMUFD_OBJ_IOAS,
- IOMMUFD_OBJ_ACCESS,
-#ifdef CONFIG_IOMMUFD_TEST
- IOMMUFD_OBJ_SELFTEST,
-#endif
- IOMMUFD_OBJ_MAX,
-};
-
-/* Base struct for all objects with a userspace ID handle. */
-struct iommufd_object {
- refcount_t shortterm_users;
- refcount_t users;
- enum iommufd_object_type type;
- unsigned int id;
-};
-
static inline bool iommufd_lock_obj(struct iommufd_object *obj)
{
if (!refcount_inc_not_zero(&obj->users))
return false;
- if (!refcount_inc_not_zero(&obj->shortterm_users)) {
+ if (!refcount_inc_not_zero(&obj->wait_cnt)) {
/*
* If the caller doesn't already have a ref on obj this must be
* called under the xa_lock. Otherwise the caller is holding a
@@ -164,11 +189,11 @@ static inline void iommufd_put_object(struct iommufd_ctx *ictx,
struct iommufd_object *obj)
{
/*
- * Users first, then shortterm so that REMOVE_WAIT_SHORTTERM never sees
- * a spurious !0 users with a 0 shortterm_users.
+ * Users first, then wait_cnt so that REMOVE_WAIT never sees a spurious
+ * !0 users with a 0 wait_cnt.
*/
refcount_dec(&obj->users);
- if (refcount_dec_and_test(&obj->shortterm_users))
+ if (refcount_dec_and_test(&obj->wait_cnt))
wake_up_interruptible_all(&ictx->destroy_wait);
}
@@ -179,7 +204,8 @@ void iommufd_object_finalize(struct iommufd_ctx *ictx,
struct iommufd_object *obj);
enum {
- REMOVE_WAIT_SHORTTERM = 1,
+ REMOVE_WAIT = BIT(0),
+ REMOVE_OBJ_TOMBSTONE = BIT(1),
};
int iommufd_object_remove(struct iommufd_ctx *ictx,
struct iommufd_object *to_destroy, u32 id,
@@ -187,15 +213,35 @@ int iommufd_object_remove(struct iommufd_ctx *ictx,
/*
* The caller holds a users refcount and wants to destroy the object. At this
- * point the caller has no shortterm_users reference and at least the xarray
- * will be holding one.
+ * point the caller has no wait_cnt reference and at least the xarray will be
+ * holding one.
*/
static inline void iommufd_object_destroy_user(struct iommufd_ctx *ictx,
struct iommufd_object *obj)
{
int ret;
- ret = iommufd_object_remove(ictx, obj, obj->id, REMOVE_WAIT_SHORTTERM);
+ ret = iommufd_object_remove(ictx, obj, obj->id, REMOVE_WAIT);
+
+ /*
+ * If there is a bug and we couldn't destroy the object then we did put
+ * back the caller's users refcount and will eventually try to free it
+ * again during close.
+ */
+ WARN_ON(ret);
+}
+
+/*
+ * Similar to iommufd_object_destroy_user(), except that the object ID is left
+ * reserved/tombstoned.
+ */
+static inline void iommufd_object_tombstone_user(struct iommufd_ctx *ictx,
+ struct iommufd_object *obj)
+{
+ int ret;
+
+ ret = iommufd_object_remove(ictx, obj, obj->id,
+ REMOVE_WAIT | REMOVE_OBJ_TOMBSTONE);
/*
* If there is a bug and we couldn't destroy the object then we did put
@@ -222,6 +268,11 @@ iommufd_object_put_and_try_destroy(struct iommufd_ctx *ictx,
iommufd_object_remove(ictx, obj, obj->id, 0);
}
+/*
+ * Callers of these normal object allocators must call iommufd_object_finalize()
+ * to finalize the object, or call iommufd_object_abort_and_destroy() to revert
+ * the allocation.
+ */
struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
size_t size,
enum iommufd_object_type type);
@@ -239,6 +290,26 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
__iommufd_object_alloc(ictx, ptr, type, obj)
/*
+ * Callers of these _ucmd allocators should not call iommufd_object_finalize()
+ * or iommufd_object_abort_and_destroy(), as the core automatically does that.
+ */
+struct iommufd_object *
+_iommufd_object_alloc_ucmd(struct iommufd_ucmd *ucmd, size_t size,
+ enum iommufd_object_type type);
+
+#define __iommufd_object_alloc_ucmd(ucmd, ptr, type, obj) \
+ container_of(_iommufd_object_alloc_ucmd( \
+ ucmd, \
+ sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \
+ offsetof(typeof(*(ptr)), \
+ obj) != 0), \
+ type), \
+ typeof(*(ptr)), obj)
+
+#define iommufd_object_alloc_ucmd(ucmd, ptr, type) \
+ __iommufd_object_alloc_ucmd(ucmd, ptr, type, obj)
+
+/*
* The IO Address Space (IOAS) pagetable is a virtual page table backed by the
* io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The
* mapping is copied into all of the associated domains and made available to
@@ -262,8 +333,7 @@ struct iommufd_ioas {
static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ctx *ictx,
u32 id)
{
- return container_of(iommufd_get_object(ictx, id,
- IOMMUFD_OBJ_IOAS),
+ return container_of(iommufd_get_object(ictx, id, IOMMUFD_OBJ_IOAS),
struct iommufd_ioas, obj);
}
@@ -273,6 +343,8 @@ void iommufd_ioas_destroy(struct iommufd_object *obj);
int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd);
int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd);
int iommufd_ioas_map(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd);
int iommufd_ioas_copy(struct iommufd_ucmd *ucmd);
int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
@@ -292,6 +364,8 @@ int iommufd_check_iova_range(struct io_pagetable *iopt,
struct iommufd_hw_pagetable {
struct iommufd_object obj;
struct iommu_domain *domain;
+ struct iommufd_fault *fault;
+ bool pasid_compat : 1;
};
struct iommufd_hwpt_paging {
@@ -299,15 +373,16 @@ struct iommufd_hwpt_paging {
struct iommufd_ioas *ioas;
bool auto_domain : 1;
bool enforce_cache_coherency : 1;
- bool msi_cookie : 1;
bool nest_parent : 1;
/* Head at iommufd_ioas::hwpt_list */
struct list_head hwpt_item;
+ struct iommufd_sw_msi_maps present_sw_msi;
};
struct iommufd_hwpt_nested {
struct iommufd_hw_pagetable common;
struct iommufd_hwpt_paging *parent;
+ struct iommufd_viommu *viommu;
};
static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt)
@@ -321,6 +396,25 @@ to_hwpt_paging(struct iommufd_hw_pagetable *hwpt)
return container_of(hwpt, struct iommufd_hwpt_paging, common);
}
+static inline struct iommufd_hwpt_nested *
+to_hwpt_nested(struct iommufd_hw_pagetable *hwpt)
+{
+ return container_of(hwpt, struct iommufd_hwpt_nested, common);
+}
+
+static inline struct iommufd_hwpt_paging *
+find_hwpt_paging(struct iommufd_hw_pagetable *hwpt)
+{
+ switch (hwpt->obj.type) {
+ case IOMMUFD_OBJ_HWPT_PAGING:
+ return to_hwpt_paging(hwpt);
+ case IOMMUFD_OBJ_HWPT_NESTED:
+ return to_hwpt_nested(hwpt)->parent;
+ default:
+ return NULL;
+ }
+}
+
static inline struct iommufd_hwpt_paging *
iommufd_get_hwpt_paging(struct iommufd_ucmd *ucmd, u32 id)
{
@@ -342,13 +436,13 @@ int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd);
struct iommufd_hwpt_paging *
iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
- struct iommufd_device *idev, u32 flags,
- bool immediate_attach,
+ struct iommufd_device *idev, ioasid_t pasid,
+ u32 flags, bool immediate_attach,
const struct iommu_user_data *user_data);
int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
- struct iommufd_device *idev);
+ struct iommufd_device *idev, ioasid_t pasid);
struct iommufd_hw_pagetable *
-iommufd_hw_pagetable_detach(struct iommufd_device *idev);
+iommufd_hw_pagetable_detach(struct iommufd_device *idev, ioasid_t pasid);
void iommufd_hwpt_paging_destroy(struct iommufd_object *obj);
void iommufd_hwpt_paging_abort(struct iommufd_object *obj);
void iommufd_hwpt_nested_destroy(struct iommufd_object *obj);
@@ -362,9 +456,8 @@ static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx,
if (hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING) {
struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt);
- lockdep_assert_not_held(&hwpt_paging->ioas->mutex);
-
if (hwpt_paging->auto_domain) {
+ lockdep_assert_not_held(&hwpt_paging->ioas->mutex);
iommufd_object_put_and_try_destroy(ictx, &hwpt->obj);
return;
}
@@ -372,13 +465,15 @@ static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx,
refcount_dec(&hwpt->obj.users);
}
+struct iommufd_attach;
+
struct iommufd_group {
struct kref ref;
struct mutex lock;
struct iommufd_ctx *ictx;
struct iommu_group *group;
- struct iommufd_hw_pagetable *hwpt;
- struct list_head device_list;
+ struct xarray pasid_attach;
+ struct iommufd_sw_msi_maps required_sw_msi;
phys_addr_t sw_msi_start;
};
@@ -395,6 +490,8 @@ struct iommufd_device {
/* always the physical device */
struct device *dev;
bool enforce_cache_coherency;
+ struct iommufd_vdevice *vdev;
+ bool destroying;
};
static inline struct iommufd_device *
@@ -405,9 +502,12 @@ iommufd_get_device(struct iommufd_ucmd *ucmd, u32 id)
struct iommufd_device, obj);
}
+void iommufd_device_pre_destroy(struct iommufd_object *obj);
void iommufd_device_destroy(struct iommufd_object *obj);
int iommufd_get_hw_info(struct iommufd_ucmd *ucmd);
+struct device *iommufd_global_device(void);
+
struct iommufd_access {
struct iommufd_object obj;
struct iommufd_ctx *ictx;
@@ -422,10 +522,191 @@ struct iommufd_access {
int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access);
void iopt_remove_access(struct io_pagetable *iopt,
- struct iommufd_access *access,
- u32 iopt_access_list_id);
+ struct iommufd_access *access, u32 iopt_access_list_id);
void iommufd_access_destroy_object(struct iommufd_object *obj);
+/* iommufd_access for internal use */
+static inline bool iommufd_access_is_internal(struct iommufd_access *access)
+{
+ return !access->ictx;
+}
+
+struct iommufd_access *iommufd_access_create_internal(struct iommufd_ctx *ictx);
+
+static inline void
+iommufd_access_destroy_internal(struct iommufd_ctx *ictx,
+ struct iommufd_access *access)
+{
+ iommufd_object_destroy_user(ictx, &access->obj);
+}
+
+int iommufd_access_attach_internal(struct iommufd_access *access,
+ struct iommufd_ioas *ioas);
+
+static inline void iommufd_access_detach_internal(struct iommufd_access *access)
+{
+ iommufd_access_detach(access);
+}
+
+struct iommufd_eventq {
+ struct iommufd_object obj;
+ struct iommufd_ctx *ictx;
+ struct file *filep;
+
+ spinlock_t lock; /* protects the deliver list */
+ struct list_head deliver;
+
+ struct wait_queue_head wait_queue;
+};
+
+struct iommufd_attach_handle {
+ struct iommu_attach_handle handle;
+ struct iommufd_device *idev;
+};
+
+/* Convert an iommu attach handle to iommufd handle. */
+#define to_iommufd_handle(hdl) container_of(hdl, struct iommufd_attach_handle, handle)
+
+/*
+ * An iommufd_fault object represents an interface to deliver I/O page faults
+ * to the user space. These objects are created/destroyed by the user space and
+ * associated with hardware page table objects during page-table allocation.
+ */
+struct iommufd_fault {
+ struct iommufd_eventq common;
+ struct mutex mutex; /* serializes response flows */
+ struct xarray response;
+};
+
+static inline struct iommufd_fault *
+eventq_to_fault(struct iommufd_eventq *eventq)
+{
+ return container_of(eventq, struct iommufd_fault, common);
+}
+
+static inline struct iommufd_fault *
+iommufd_get_fault(struct iommufd_ucmd *ucmd, u32 id)
+{
+ return container_of(iommufd_get_object(ucmd->ictx, id,
+ IOMMUFD_OBJ_FAULT),
+ struct iommufd_fault, common.obj);
+}
+
+int iommufd_fault_alloc(struct iommufd_ucmd *ucmd);
+void iommufd_fault_destroy(struct iommufd_object *obj);
+int iommufd_fault_iopf_handler(struct iopf_group *group);
+void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt,
+ struct iommufd_attach_handle *handle);
+
+/* An iommufd_vevent represents a vIOMMU event in an iommufd_veventq */
+struct iommufd_vevent {
+ struct iommufd_vevent_header header;
+ struct list_head node; /* for iommufd_eventq::deliver */
+ ssize_t data_len;
+ u64 event_data[] __counted_by(data_len);
+};
+
+#define vevent_for_lost_events_header(vevent) \
+ (vevent->header.flags & IOMMU_VEVENTQ_FLAG_LOST_EVENTS)
+
+/*
+ * An iommufd_veventq object represents an interface to deliver vIOMMU events to
+ * the user space. It is created/destroyed by the user space and associated with
+ * a vIOMMU object during the allocations.
+ */
+struct iommufd_veventq {
+ struct iommufd_eventq common;
+ struct iommufd_viommu *viommu;
+ struct list_head node; /* for iommufd_viommu::veventqs */
+
+ enum iommu_veventq_type type;
+ unsigned int depth;
+
+ /* Use common.lock for protection */
+ u32 num_events;
+ u32 sequence;
+
+ /* Must be last as it ends in a flexible-array member. */
+ struct iommufd_vevent lost_events_header;
+};
+
+static inline struct iommufd_veventq *
+eventq_to_veventq(struct iommufd_eventq *eventq)
+{
+ return container_of(eventq, struct iommufd_veventq, common);
+}
+
+static inline struct iommufd_veventq *
+iommufd_get_veventq(struct iommufd_ucmd *ucmd, u32 id)
+{
+ return container_of(iommufd_get_object(ucmd->ictx, id,
+ IOMMUFD_OBJ_VEVENTQ),
+ struct iommufd_veventq, common.obj);
+}
+
+int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd);
+void iommufd_veventq_destroy(struct iommufd_object *obj);
+void iommufd_veventq_abort(struct iommufd_object *obj);
+
+static inline void iommufd_vevent_handler(struct iommufd_veventq *veventq,
+ struct iommufd_vevent *vevent)
+{
+ struct iommufd_eventq *eventq = &veventq->common;
+
+ lockdep_assert_held(&eventq->lock);
+
+ /*
+ * Remove the lost_events_header and add the new node at the same time.
+ * Note the new node can be lost_events_header, for a sequence update.
+ */
+ if (list_is_last(&veventq->lost_events_header.node, &eventq->deliver))
+ list_del(&veventq->lost_events_header.node);
+ list_add_tail(&vevent->node, &eventq->deliver);
+ vevent->header.sequence = veventq->sequence;
+ veventq->sequence = (veventq->sequence + 1) & INT_MAX;
+
+ wake_up_interruptible(&eventq->wait_queue);
+}
+
+static inline struct iommufd_viommu *
+iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id)
+{
+ return container_of(iommufd_get_object(ucmd->ictx, id,
+ IOMMUFD_OBJ_VIOMMU),
+ struct iommufd_viommu, obj);
+}
+
+static inline struct iommufd_veventq *
+iommufd_viommu_find_veventq(struct iommufd_viommu *viommu,
+ enum iommu_veventq_type type)
+{
+ struct iommufd_veventq *veventq, *next;
+
+ lockdep_assert_held(&viommu->veventqs_rwsem);
+
+ list_for_each_entry_safe(veventq, next, &viommu->veventqs, node) {
+ if (veventq->type == type)
+ return veventq;
+ }
+ return NULL;
+}
+
+int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_viommu_destroy(struct iommufd_object *obj);
+int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_vdevice_destroy(struct iommufd_object *obj);
+void iommufd_vdevice_abort(struct iommufd_object *obj);
+int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_hw_queue_destroy(struct iommufd_object *obj);
+
+static inline struct iommufd_vdevice *
+iommufd_get_vdevice(struct iommufd_ctx *ictx, u32 id)
+{
+ return container_of(iommufd_get_object(ictx, id,
+ IOMMUFD_OBJ_VDEVICE),
+ struct iommufd_vdevice, obj);
+}
+
#ifdef CONFIG_IOMMUFD_TEST
int iommufd_test(struct iommufd_ucmd *ucmd);
void iommufd_selftest_destroy(struct iommufd_object *obj);
@@ -436,6 +717,8 @@ bool iommufd_should_fail(void);
int __init iommufd_test_init(void);
void iommufd_test_exit(void);
bool iommufd_selftest_is_mock_dev(struct device *dev);
+int iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+ struct dma_buf_phys_vec *phys);
#else
static inline void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
unsigned int ioas_id,
@@ -457,5 +740,11 @@ static inline bool iommufd_selftest_is_mock_dev(struct device *dev)
{
return false;
}
+static inline int
+iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+ struct dma_buf_phys_vec *phys)
+{
+ return -EOPNOTSUPP;
+}
#endif
#endif
diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h
index e854d3f67205..73e73e1ec158 100644
--- a/drivers/iommu/iommufd/iommufd_test.h
+++ b/drivers/iommu/iommufd/iommufd_test.h
@@ -4,8 +4,8 @@
#ifndef _UAPI_IOMMUFD_TEST_H
#define _UAPI_IOMMUFD_TEST_H
-#include <linux/types.h>
#include <linux/iommufd.h>
+#include <linux/types.h>
enum {
IOMMU_TEST_OP_ADD_RESERVED = 1,
@@ -22,11 +22,29 @@ enum {
IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS,
IOMMU_TEST_OP_DIRTY,
IOMMU_TEST_OP_MD_CHECK_IOTLB,
+ IOMMU_TEST_OP_TRIGGER_IOPF,
+ IOMMU_TEST_OP_DEV_CHECK_CACHE,
+ IOMMU_TEST_OP_TRIGGER_VEVENT,
+ IOMMU_TEST_OP_PASID_ATTACH,
+ IOMMU_TEST_OP_PASID_REPLACE,
+ IOMMU_TEST_OP_PASID_DETACH,
+ IOMMU_TEST_OP_PASID_CHECK_HWPT,
+ IOMMU_TEST_OP_DMABUF_GET,
+ IOMMU_TEST_OP_DMABUF_REVOKE,
+};
+
+enum {
+ MOCK_IOMMUPT_DEFAULT = 0,
+ MOCK_IOMMUPT_HUGE,
+ MOCK_IOMMUPT_AMDV1,
};
+/* These values are true for MOCK_IOMMUPT_DEFAULT */
enum {
MOCK_APERTURE_START = 1UL << 24,
MOCK_APERTURE_LAST = (1UL << 31) - 1,
+ MOCK_PAGE_SIZE = 2048,
+ MOCK_HUGE_PAGE_SIZE = 512 * MOCK_PAGE_SIZE,
};
enum {
@@ -45,7 +63,7 @@ enum {
enum {
MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0,
- MOCK_FLAGS_DEVICE_HUGE_IOVA = 1 << 1,
+ MOCK_FLAGS_DEVICE_PASID = 1 << 2,
};
enum {
@@ -53,6 +71,14 @@ enum {
MOCK_NESTED_DOMAIN_IOTLB_NUM = 4,
};
+enum {
+ MOCK_DEV_CACHE_ID_MAX = 3,
+ MOCK_DEV_CACHE_NUM = 4,
+};
+
+/* Reserved for special pasid replace test */
+#define IOMMU_TEST_PASID_RESERVED 1024
+
struct iommu_test_cmd {
__u32 size;
__u32 op;
@@ -127,11 +153,55 @@ struct iommu_test_cmd {
__u32 id;
__u32 iotlb;
} check_iotlb;
+ struct {
+ __u32 dev_id;
+ __u32 pasid;
+ __u32 grpid;
+ __u32 perm;
+ __u64 addr;
+ } trigger_iopf;
+ struct {
+ __u32 id;
+ __u32 cache;
+ } check_dev_cache;
+ struct {
+ __u32 dev_id;
+ } trigger_vevent;
+ struct {
+ __u32 pasid;
+ __u32 pt_id;
+ /* @id is stdev_id */
+ } pasid_attach;
+ struct {
+ __u32 pasid;
+ __u32 pt_id;
+ /* @id is stdev_id */
+ } pasid_replace;
+ struct {
+ __u32 pasid;
+ /* @id is stdev_id */
+ } pasid_detach;
+ struct {
+ __u32 pasid;
+ __u32 hwpt_id;
+ /* @id is stdev_id */
+ } pasid_check;
+ struct {
+ __u32 length;
+ __u32 open_flags;
+ } dmabuf_get;
+ struct {
+ __s32 dmabuf_fd;
+ __u32 revoked;
+ } dmabuf_revoke;
};
__u32 last;
};
#define IOMMU_TEST_CMD _IO(IOMMUFD_TYPE, IOMMUFD_CMD_BASE + 32)
+/* Mock device/iommu PASID width */
+#define MOCK_PASID_WIDTH 20
+
/* Mock structs for IOMMU_DEVICE_GET_HW_INFO ioctl */
#define IOMMU_HW_INFO_TYPE_SELFTEST 0xfeedbeef
#define IOMMU_HW_INFO_SELFTEST_REGVAL 0xdeadbeef
@@ -144,6 +214,7 @@ struct iommu_test_hw_info {
/* Should not be equal to any defined value in enum iommu_hwpt_data_type */
#define IOMMU_HWPT_DATA_SELFTEST 0xdead
#define IOMMU_TEST_IOTLB_DEFAULT 0xbadbeef
+#define IOMMU_TEST_DEV_CACHE_DEFAULT 0xbaddad
/**
* struct iommu_hwpt_selftest
@@ -152,6 +223,7 @@ struct iommu_test_hw_info {
*/
struct iommu_hwpt_selftest {
__u32 iotlb;
+ __u32 pagetable_type;
};
/* Should not be equal to any defined value in enum iommu_hwpt_invalidate_data_type */
@@ -172,4 +244,51 @@ struct iommu_hwpt_invalidate_selftest {
__u32 iotlb_id;
};
+#define IOMMU_VIOMMU_TYPE_SELFTEST 0xdeadbeef
+
+/**
+ * struct iommu_viommu_selftest - vIOMMU data for Mock driver
+ * (IOMMU_VIOMMU_TYPE_SELFTEST)
+ * @in_data: Input random data from user space
+ * @out_data: Output data (matching @in_data) to user space
+ * @out_mmap_offset: The offset argument for mmap syscall
+ * @out_mmap_length: The length argument for mmap syscall
+ *
+ * Simply set @out_data=@in_data for a loopback test
+ */
+struct iommu_viommu_selftest {
+ __u32 in_data;
+ __u32 out_data;
+ __aligned_u64 out_mmap_offset;
+ __aligned_u64 out_mmap_length;
+};
+
+/* Should not be equal to any defined value in enum iommu_viommu_invalidate_data_type */
+#define IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST 0xdeadbeef
+#define IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST_INVALID 0xdadbeef
+
+/**
+ * struct iommu_viommu_invalidate_selftest - Invalidation data for Mock VIOMMU
+ * (IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST)
+ * @flags: Invalidate flags
+ * @cache_id: Invalidate cache entry index
+ *
+ * If IOMMU_TEST_INVALIDATE_ALL is set in @flags, @cache_id will be ignored
+ */
+struct iommu_viommu_invalidate_selftest {
+#define IOMMU_TEST_INVALIDATE_FLAG_ALL (1 << 0)
+ __u32 flags;
+ __u32 vdev_id;
+ __u32 cache_id;
+};
+
+#define IOMMU_VEVENTQ_TYPE_SELFTEST 0xbeefbeef
+
+struct iommu_viommu_event_selftest {
+ __u32 virt_id;
+};
+
+#define IOMMU_HW_QUEUE_TYPE_SELFTEST 0xdeadbeef
+#define IOMMU_TEST_HW_QUEUE_MAX 2
+
#endif
diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c
index db8c46bee155..b5b67a9d3fb3 100644
--- a/drivers/iommu/iommufd/iova_bitmap.c
+++ b/drivers/iommu/iommufd/iova_bitmap.c
@@ -3,10 +3,10 @@
* Copyright (c) 2022, Oracle and/or its affiliates.
* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
*/
+#include <linux/highmem.h>
#include <linux/iova_bitmap.h>
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/highmem.h>
#define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE)
@@ -35,6 +35,9 @@ struct iova_bitmap_map {
/* base IOVA representing bit 0 of the first page */
unsigned long iova;
+ /* mapped length */
+ unsigned long length;
+
/* page size order that each bit granules to */
unsigned long pgshift;
@@ -113,9 +116,6 @@ struct iova_bitmap {
/* length of the IOVA range for the whole bitmap */
size_t length;
-
- /* length of the IOVA range set ahead the pinned pages */
- unsigned long set_ahead_length;
};
/*
@@ -130,9 +130,8 @@ struct iova_bitmap {
static unsigned long iova_bitmap_offset_to_index(struct iova_bitmap *bitmap,
unsigned long iova)
{
- unsigned long pgsize = 1 << bitmap->mapped.pgshift;
-
- return iova / (BITS_PER_TYPE(*bitmap->bitmap) * pgsize);
+ return (iova >> bitmap->mapped.pgshift) /
+ BITS_PER_TYPE(*bitmap->bitmap);
}
/*
@@ -156,6 +155,8 @@ static unsigned long iova_bitmap_mapped_iova(struct iova_bitmap *bitmap)
return bitmap->iova + iova_bitmap_index_to_offset(bitmap, skip);
}
+static unsigned long iova_bitmap_mapped_length(struct iova_bitmap *bitmap);
+
/*
* Pins the bitmap user pages for the current range window.
* This is internal to IOVA bitmap and called when advancing the
@@ -206,6 +207,7 @@ static int iova_bitmap_get(struct iova_bitmap *bitmap)
* aligned.
*/
mapped->pgoff = offset_in_page(addr);
+ mapped->length = iova_bitmap_mapped_length(bitmap);
return 0;
}
@@ -263,16 +265,13 @@ struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length,
goto err;
}
- rc = iova_bitmap_get(bitmap);
- if (rc)
- goto err;
return bitmap;
err:
iova_bitmap_free(bitmap);
return ERR_PTR(rc);
}
-EXPORT_SYMBOL_NS_GPL(iova_bitmap_alloc, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iova_bitmap_alloc, "IOMMUFD");
/**
* iova_bitmap_free() - Frees an IOVA bitmap object
@@ -294,7 +293,7 @@ void iova_bitmap_free(struct iova_bitmap *bitmap)
kfree(bitmap);
}
-EXPORT_SYMBOL_NS_GPL(iova_bitmap_free, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iova_bitmap_free, "IOMMUFD");
/*
* Returns the remaining bitmap indexes from mapped_total_index to process for
@@ -338,65 +337,34 @@ static unsigned long iova_bitmap_mapped_length(struct iova_bitmap *bitmap)
}
/*
- * Returns true if there's not more data to iterate.
+ * Returns true if [@iova..@iova+@length-1] is part of the mapped IOVA range.
*/
-static bool iova_bitmap_done(struct iova_bitmap *bitmap)
+static bool iova_bitmap_mapped_range(struct iova_bitmap_map *mapped,
+ unsigned long iova, size_t length)
{
- return bitmap->mapped_base_index >= bitmap->mapped_total_index;
-}
-
-static int iova_bitmap_set_ahead(struct iova_bitmap *bitmap,
- size_t set_ahead_length)
-{
- int ret = 0;
-
- while (set_ahead_length > 0 && !iova_bitmap_done(bitmap)) {
- unsigned long length = iova_bitmap_mapped_length(bitmap);
- unsigned long iova = iova_bitmap_mapped_iova(bitmap);
-
- ret = iova_bitmap_get(bitmap);
- if (ret)
- break;
-
- length = min(length, set_ahead_length);
- iova_bitmap_set(bitmap, iova, length);
-
- set_ahead_length -= length;
- bitmap->mapped_base_index +=
- iova_bitmap_offset_to_index(bitmap, length - 1) + 1;
- iova_bitmap_put(bitmap);
- }
-
- bitmap->set_ahead_length = 0;
- return ret;
+ return mapped->npages &&
+ (iova >= mapped->iova &&
+ (iova + length - 1) <= (mapped->iova + mapped->length - 1));
}
/*
- * Advances to the next range, releases the current pinned
+ * Advances to a selected range, releases the current pinned
* pages and pins the next set of bitmap pages.
* Returns 0 on success or otherwise errno.
*/
-static int iova_bitmap_advance(struct iova_bitmap *bitmap)
+static int iova_bitmap_advance_to(struct iova_bitmap *bitmap,
+ unsigned long iova)
{
- unsigned long iova = iova_bitmap_mapped_length(bitmap) - 1;
- unsigned long count = iova_bitmap_offset_to_index(bitmap, iova) + 1;
+ unsigned long index;
- bitmap->mapped_base_index += count;
+ index = iova_bitmap_offset_to_index(bitmap, iova - bitmap->iova);
+ if (index >= bitmap->mapped_total_index)
+ return -EINVAL;
+ bitmap->mapped_base_index = index;
iova_bitmap_put(bitmap);
- if (iova_bitmap_done(bitmap))
- return 0;
-
- /* Iterate, set and skip any bits requested for next iteration */
- if (bitmap->set_ahead_length) {
- int ret;
- ret = iova_bitmap_set_ahead(bitmap, bitmap->set_ahead_length);
- if (ret)
- return ret;
- }
-
- /* When advancing the index we pin the next set of bitmap pages */
+ /* Pin the next set of bitmap pages */
return iova_bitmap_get(bitmap);
}
@@ -416,19 +384,9 @@ static int iova_bitmap_advance(struct iova_bitmap *bitmap)
int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
iova_bitmap_fn_t fn)
{
- int ret = 0;
-
- for (; !iova_bitmap_done(bitmap) && !ret;
- ret = iova_bitmap_advance(bitmap)) {
- ret = fn(bitmap, iova_bitmap_mapped_iova(bitmap),
- iova_bitmap_mapped_length(bitmap), opaque);
- if (ret)
- break;
- }
-
- return ret;
+ return fn(bitmap, bitmap->iova, bitmap->length, opaque);
}
-EXPORT_SYMBOL_NS_GPL(iova_bitmap_for_each, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iova_bitmap_for_each, "IOMMUFD");
/**
* iova_bitmap_set() - Records an IOVA range in bitmap
@@ -444,11 +402,24 @@ void iova_bitmap_set(struct iova_bitmap *bitmap,
unsigned long iova, size_t length)
{
struct iova_bitmap_map *mapped = &bitmap->mapped;
- unsigned long cur_bit = ((iova - mapped->iova) >>
- mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE;
- unsigned long last_bit = (((iova + length - 1) - mapped->iova) >>
- mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE;
- unsigned long last_page_idx = mapped->npages - 1;
+ unsigned long cur_bit, last_bit, last_page_idx;
+
+update_indexes:
+ if (unlikely(!iova_bitmap_mapped_range(mapped, iova, length))) {
+ /*
+ * The attempt to advance the base index to @iova
+ * may fail if it's out of bounds, or pinning the pages
+ * returns an error.
+ */
+ if (iova_bitmap_advance_to(bitmap, iova))
+ return;
+ }
+
+ last_page_idx = mapped->npages - 1;
+ cur_bit = ((iova - mapped->iova) >>
+ mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE;
+ last_bit = (((iova + length - 1) - mapped->iova) >>
+ mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE;
do {
unsigned int page_idx = cur_bit / BITS_PER_PAGE;
@@ -457,18 +428,19 @@ void iova_bitmap_set(struct iova_bitmap *bitmap,
last_bit - cur_bit + 1);
void *kaddr;
- if (unlikely(page_idx > last_page_idx))
- break;
+ if (unlikely(page_idx > last_page_idx)) {
+ unsigned long left =
+ ((last_bit - cur_bit + 1) << mapped->pgshift);
+
+ iova += (length - left);
+ length = left;
+ goto update_indexes;
+ }
kaddr = kmap_local_page(mapped->pages[page_idx]);
bitmap_set(kaddr, offset, nbits);
kunmap_local(kaddr);
cur_bit += nbits;
} while (cur_bit <= last_bit);
-
- if (unlikely(cur_bit <= last_bit)) {
- bitmap->set_ahead_length =
- ((last_bit - cur_bit + 1) << bitmap->mapped.pgshift);
- }
}
-EXPORT_SYMBOL_NS_GPL(iova_bitmap_set, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iova_bitmap_set, "IOMMUFD");
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 39b32932c61e..5cc4b08c25f5 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -8,21 +8,23 @@
*/
#define pr_fmt(fmt) "iommufd: " fmt
+#include <linux/bug.h>
#include <linux/file.h>
#include <linux/fs.h>
-#include <linux/module.h>
-#include <linux/slab.h>
+#include <linux/iommufd.h>
#include <linux/miscdevice.h>
+#include <linux/module.h>
#include <linux/mutex.h>
-#include <linux/bug.h>
+#include <linux/slab.h>
#include <uapi/linux/iommufd.h>
-#include <linux/iommufd.h>
#include "io_pagetable.h"
#include "iommufd_private.h"
#include "iommufd_test.h"
struct iommufd_object_ops {
+ size_t file_offset;
+ void (*pre_destroy)(struct iommufd_object *obj);
void (*destroy)(struct iommufd_object *obj);
void (*abort)(struct iommufd_object *obj);
};
@@ -41,7 +43,7 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
return ERR_PTR(-ENOMEM);
obj->type = type;
/* Starts out bias'd by 1 until it is removed from the xarray */
- refcount_set(&obj->shortterm_users, 1);
+ refcount_set(&obj->wait_cnt, 1);
refcount_set(&obj->users, 1);
/*
@@ -51,8 +53,8 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
* it anymore, so the caller must complete all errorable operations
* before calling iommufd_object_finalize().
*/
- rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY,
- xa_limit_31b, GFP_KERNEL_ACCOUNT);
+ rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, xa_limit_31b,
+ GFP_KERNEL_ACCOUNT);
if (rc)
goto out_free;
return obj;
@@ -61,6 +63,33 @@ out_free:
return ERR_PTR(rc);
}
+struct iommufd_object *_iommufd_object_alloc_ucmd(struct iommufd_ucmd *ucmd,
+ size_t size,
+ enum iommufd_object_type type)
+{
+ struct iommufd_object *new_obj;
+
+ /* Something is coded wrong if this is hit */
+ if (WARN_ON(ucmd->new_obj))
+ return ERR_PTR(-EBUSY);
+
+ /*
+ * An abort op means that its caller needs to invoke it within a lock in
+ * the caller. So it doesn't work with _iommufd_object_alloc_ucmd() that
+ * will invoke the abort op in iommufd_object_abort_and_destroy(), which
+ * must be outside the caller's lock.
+ */
+ if (WARN_ON(iommufd_object_ops[type].abort))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ new_obj = _iommufd_object_alloc(ucmd->ictx, size, type);
+ if (IS_ERR(new_obj))
+ return new_obj;
+
+ ucmd->new_obj = new_obj;
+ return new_obj;
+}
+
/*
* Allow concurrent access to the object.
*
@@ -73,20 +102,30 @@ out_free:
void iommufd_object_finalize(struct iommufd_ctx *ictx,
struct iommufd_object *obj)
{
+ XA_STATE(xas, &ictx->objects, obj->id);
void *old;
- old = xa_store(&ictx->objects, obj->id, obj, GFP_KERNEL);
- /* obj->id was returned from xa_alloc() so the xa_store() cannot fail */
- WARN_ON(old);
+ xa_lock(&ictx->objects);
+ old = xas_store(&xas, obj);
+ xa_unlock(&ictx->objects);
+ /* obj->id was returned from xa_alloc() so the xas_store() cannot fail */
+ WARN_ON(old != XA_ZERO_ENTRY);
}
/* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */
void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj)
{
+ XA_STATE(xas, &ictx->objects, obj->id);
void *old;
- old = xa_erase(&ictx->objects, obj->id);
- WARN_ON(old);
+ xa_lock(&ictx->objects);
+ old = xas_store(&xas, NULL);
+ xa_unlock(&ictx->objects);
+ WARN_ON(old != XA_ZERO_ENTRY);
+
+ if (WARN_ON(!refcount_dec_and_test(&obj->users)))
+ return;
+
kfree(obj);
}
@@ -97,10 +136,30 @@ void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj)
void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx,
struct iommufd_object *obj)
{
- if (iommufd_object_ops[obj->type].abort)
- iommufd_object_ops[obj->type].abort(obj);
+ const struct iommufd_object_ops *ops = &iommufd_object_ops[obj->type];
+
+ if (ops->file_offset) {
+ struct file **filep = ((void *)obj) + ops->file_offset;
+
+ /*
+ * A file should hold a users refcount while the file is open
+ * and put it back in its release. The file should hold a
+ * pointer to obj in their private data. Normal fput() is
+ * deferred to a workqueue and can get out of order with the
+ * following kfree(obj). Using the sync version ensures the
+ * release happens immediately. During abort we require the file
+ * refcount is one at this point - meaning the object alloc
+ * function cannot do anything to allow another thread to take a
+ * refcount prior to a guaranteed success.
+ */
+ if (*filep)
+ __fput_sync(*filep);
+ }
+
+ if (ops->abort)
+ ops->abort(obj);
else
- iommufd_object_ops[obj->type].destroy(obj);
+ ops->destroy(obj);
iommufd_object_abort(ictx, obj);
}
@@ -121,20 +180,22 @@ struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id,
return obj;
}
-static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx,
- struct iommufd_object *to_destroy)
+static int iommufd_object_dec_wait(struct iommufd_ctx *ictx,
+ struct iommufd_object *to_destroy)
{
- if (refcount_dec_and_test(&to_destroy->shortterm_users))
+ if (refcount_dec_and_test(&to_destroy->wait_cnt))
return 0;
+ if (iommufd_object_ops[to_destroy->type].pre_destroy)
+ iommufd_object_ops[to_destroy->type].pre_destroy(to_destroy);
+
if (wait_event_timeout(ictx->destroy_wait,
- refcount_read(&to_destroy->shortterm_users) ==
- 0,
- msecs_to_jiffies(10000)))
+ refcount_read(&to_destroy->wait_cnt) == 0,
+ msecs_to_jiffies(60000)))
return 0;
pr_crit("Time out waiting for iommufd object to become free\n");
- refcount_inc(&to_destroy->shortterm_users);
+ refcount_inc(&to_destroy->wait_cnt);
return -EBUSY;
}
@@ -148,17 +209,18 @@ int iommufd_object_remove(struct iommufd_ctx *ictx,
{
struct iommufd_object *obj;
XA_STATE(xas, &ictx->objects, id);
- bool zerod_shortterm = false;
+ bool zerod_wait_cnt = false;
int ret;
/*
- * The purpose of the shortterm_users is to ensure deterministic
- * destruction of objects used by external drivers and destroyed by this
- * function. Any temporary increment of the refcount must increment
- * shortterm_users, such as during ioctl execution.
+ * The purpose of the wait_cnt is to ensure deterministic destruction
+ * of objects used by external drivers and destroyed by this function.
+ * Incrementing this wait_cnt should either be short lived, such as
+ * during ioctl execution, or be revoked and blocked during
+ * pre_destroy(), such as vdev holding the idev's refcount.
*/
- if (flags & REMOVE_WAIT_SHORTTERM) {
- ret = iommufd_object_dec_wait_shortterm(ictx, to_destroy);
+ if (flags & REMOVE_WAIT) {
+ ret = iommufd_object_dec_wait(ictx, to_destroy);
if (ret) {
/*
* We have a bug. Put back the callers reference and
@@ -167,7 +229,7 @@ int iommufd_object_remove(struct iommufd_ctx *ictx,
refcount_dec(&to_destroy->users);
return ret;
}
- zerod_shortterm = true;
+ zerod_wait_cnt = true;
}
xa_lock(&ictx->objects);
@@ -193,17 +255,17 @@ int iommufd_object_remove(struct iommufd_ctx *ictx,
goto err_xa;
}
- xas_store(&xas, NULL);
+ xas_store(&xas, (flags & REMOVE_OBJ_TOMBSTONE) ? XA_ZERO_ENTRY : NULL);
if (ictx->vfio_ioas == container_of(obj, struct iommufd_ioas, obj))
ictx->vfio_ioas = NULL;
xa_unlock(&ictx->objects);
/*
- * Since users is zero any positive users_shortterm must be racing
+ * Since users is zero any positive wait_cnt must be racing
* iommufd_put_object(), or we have a bug.
*/
- if (!zerod_shortterm) {
- ret = iommufd_object_dec_wait_shortterm(ictx, obj);
+ if (!zerod_wait_cnt) {
+ ret = iommufd_object_dec_wait(ictx, obj);
if (WARN_ON(ret))
return ret;
}
@@ -213,9 +275,9 @@ int iommufd_object_remove(struct iommufd_ctx *ictx,
return 0;
err_xa:
- if (zerod_shortterm) {
+ if (zerod_wait_cnt) {
/* Restore the xarray owned reference */
- refcount_set(&obj->shortterm_users, 1);
+ refcount_set(&obj->wait_cnt, 1);
}
xa_unlock(&ictx->objects);
@@ -248,10 +310,14 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n");
}
+ init_rwsem(&ictx->ioas_creation_lock);
xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT);
xa_init(&ictx->groups);
ictx->file = filp;
+ mt_init_flags(&ictx->mt_mmap, MT_FLAGS_ALLOC_RANGE);
init_waitqueue_head(&ictx->destroy_wait);
+ mutex_init(&ictx->sw_msi_lock);
+ INIT_LIST_HEAD(&ictx->sw_msi_list);
filp->private_data = ictx;
return 0;
}
@@ -259,6 +325,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
static int iommufd_fops_release(struct inode *inode, struct file *filp)
{
struct iommufd_ctx *ictx = filp->private_data;
+ struct iommufd_sw_msi_map *next;
+ struct iommufd_sw_msi_map *cur;
struct iommufd_object *obj;
/*
@@ -273,20 +341,47 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp)
while (!xa_empty(&ictx->objects)) {
unsigned int destroyed = 0;
unsigned long index;
+ bool empty = true;
+ /*
+ * We can't use xa_empty() to end the loop as the tombstones
+ * are stored as XA_ZERO_ENTRY in the xarray. However
+ * xa_for_each() automatically converts them to NULL and skips
+ * them causing xa_empty() to be kept false. Thus once
+ * xa_for_each() finds no further !NULL entries the loop is
+ * done.
+ */
xa_for_each(&ictx->objects, index, obj) {
+ empty = false;
if (!refcount_dec_if_one(&obj->users))
continue;
+
destroyed++;
xa_erase(&ictx->objects, index);
iommufd_object_ops[obj->type].destroy(obj);
kfree(obj);
}
+
+ if (empty)
+ break;
+
/* Bug related to users refcount */
if (WARN_ON(!destroyed))
break;
}
+
+ /*
+ * There may be some tombstones left over from
+ * iommufd_object_tombstone_user()
+ */
+ xa_destroy(&ictx->objects);
+
WARN_ON(!xa_empty(&ictx->groups));
+
+ mutex_destroy(&ictx->sw_msi_lock);
+ list_for_each_entry_safe(cur, next, &ictx->sw_msi_list, sw_msi_item)
+ kfree(cur);
+
kfree(ictx);
return 0;
}
@@ -319,7 +414,9 @@ static int iommufd_option(struct iommufd_ucmd *ucmd)
union ucmd_buffer {
struct iommu_destroy destroy;
+ struct iommu_fault_alloc fault;
struct iommu_hw_info info;
+ struct iommu_hw_queue_alloc hw_queue;
struct iommu_hwpt_alloc hwpt;
struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap;
struct iommu_hwpt_invalidate cache;
@@ -331,7 +428,10 @@ union ucmd_buffer {
struct iommu_ioas_map map;
struct iommu_ioas_unmap unmap;
struct iommu_option option;
+ struct iommu_vdevice_alloc vdev;
+ struct iommu_veventq_alloc veventq;
struct iommu_vfio_ioas vfio_ioas;
+ struct iommu_viommu_alloc viommu;
#ifdef CONFIG_IOMMUFD_TEST
struct iommu_test_cmd test;
#endif
@@ -355,8 +455,12 @@ struct iommufd_ioctl_op {
}
static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id),
+ IOCTL_OP(IOMMU_FAULT_QUEUE_ALLOC, iommufd_fault_alloc,
+ struct iommu_fault_alloc, out_fault_fd),
IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info,
__reserved),
+ IOCTL_OP(IOMMU_HW_QUEUE_ALLOC, iommufd_hw_queue_alloc_ioctl,
+ struct iommu_hw_queue_alloc, length),
IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc,
__reserved),
IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap,
@@ -369,18 +473,26 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
struct iommu_ioas_alloc, out_ioas_id),
IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
struct iommu_ioas_allow_iovas, allowed_iovas),
+ IOCTL_OP(IOMMU_IOAS_CHANGE_PROCESS, iommufd_ioas_change_process,
+ struct iommu_ioas_change_process, __reserved),
IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy,
src_iova),
IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges,
struct iommu_ioas_iova_ranges, out_iova_alignment),
- IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map,
- iova),
+ IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map, iova),
+ IOCTL_OP(IOMMU_IOAS_MAP_FILE, iommufd_ioas_map_file,
+ struct iommu_ioas_map_file, iova),
IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap,
length),
- IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option,
- val64),
+ IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, val64),
+ IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl,
+ struct iommu_vdevice_alloc, virt_id),
+ IOCTL_OP(IOMMU_VEVENTQ_ALLOC, iommufd_veventq_alloc,
+ struct iommu_veventq_alloc, out_veventq_fd),
IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
__reserved),
+ IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl,
+ struct iommu_viommu_alloc, out_viommu_id),
#ifdef CONFIG_IOMMUFD_TEST
IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
#endif
@@ -419,14 +531,91 @@ static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd,
if (ret)
return ret;
ret = op->execute(&ucmd);
+
+ if (ucmd.new_obj) {
+ if (ret)
+ iommufd_object_abort_and_destroy(ictx, ucmd.new_obj);
+ else
+ iommufd_object_finalize(ictx, ucmd.new_obj);
+ }
return ret;
}
+static void iommufd_fops_vma_open(struct vm_area_struct *vma)
+{
+ struct iommufd_mmap *immap = vma->vm_private_data;
+
+ refcount_inc(&immap->owner->users);
+}
+
+static void iommufd_fops_vma_close(struct vm_area_struct *vma)
+{
+ struct iommufd_mmap *immap = vma->vm_private_data;
+
+ refcount_dec(&immap->owner->users);
+}
+
+static const struct vm_operations_struct iommufd_vma_ops = {
+ .open = iommufd_fops_vma_open,
+ .close = iommufd_fops_vma_close,
+};
+
+/* The vm_pgoff must be pre-allocated from mt_mmap, and given to user space */
+static int iommufd_fops_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct iommufd_ctx *ictx = filp->private_data;
+ size_t length = vma->vm_end - vma->vm_start;
+ struct iommufd_mmap *immap;
+ int rc;
+
+ if (!PAGE_ALIGNED(length))
+ return -EINVAL;
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+ if (vma->vm_flags & VM_EXEC)
+ return -EPERM;
+
+ mtree_lock(&ictx->mt_mmap);
+ /* vma->vm_pgoff carries a page-shifted start position to an immap */
+ immap = mtree_load(&ictx->mt_mmap, vma->vm_pgoff << PAGE_SHIFT);
+ if (!immap || !refcount_inc_not_zero(&immap->owner->users)) {
+ mtree_unlock(&ictx->mt_mmap);
+ return -ENXIO;
+ }
+ mtree_unlock(&ictx->mt_mmap);
+
+ /*
+ * mtree_load() returns the immap for any contained mmio_addr, so only
+ * allow the exact immap thing to be mapped
+ */
+ if (vma->vm_pgoff != immap->vm_pgoff || length != immap->length) {
+ rc = -ENXIO;
+ goto err_refcount;
+ }
+
+ vma->vm_pgoff = 0;
+ vma->vm_private_data = immap;
+ vma->vm_ops = &iommufd_vma_ops;
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+ rc = io_remap_pfn_range(vma, vma->vm_start,
+ immap->mmio_addr >> PAGE_SHIFT, length,
+ vma->vm_page_prot);
+ if (rc)
+ goto err_refcount;
+ return 0;
+
+err_refcount:
+ refcount_dec(&immap->owner->users);
+ return rc;
+}
+
static const struct file_operations iommufd_fops = {
.owner = THIS_MODULE,
.open = iommufd_fops_open,
.release = iommufd_fops_release,
.unlocked_ioctl = iommufd_fops_ioctl,
+ .mmap = iommufd_fops_mmap,
};
/**
@@ -439,7 +628,7 @@ void iommufd_ctx_get(struct iommufd_ctx *ictx)
{
get_file(ictx->file);
}
-EXPORT_SYMBOL_NS_GPL(iommufd_ctx_get, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iommufd_ctx_get, "IOMMUFD");
/**
* iommufd_ctx_from_file - Acquires a reference to the iommufd context
@@ -459,7 +648,7 @@ struct iommufd_ctx *iommufd_ctx_from_file(struct file *file)
iommufd_ctx_get(ictx);
return ictx;
}
-EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, "IOMMUFD");
/**
* iommufd_ctx_from_fd - Acquires a reference to the iommufd context
@@ -483,7 +672,7 @@ struct iommufd_ctx *iommufd_ctx_from_fd(int fd)
/* fget is the same as iommufd_ctx_get() */
return file->private_data;
}
-EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_fd, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_fd, "IOMMUFD");
/**
* iommufd_ctx_put - Put back a reference
@@ -493,17 +682,28 @@ void iommufd_ctx_put(struct iommufd_ctx *ictx)
{
fput(ictx->file);
}
-EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD);
+EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, "IOMMUFD");
+
+#define IOMMUFD_FILE_OFFSET(_struct, _filep, _obj) \
+ .file_offset = (offsetof(_struct, _filep) + \
+ BUILD_BUG_ON_ZERO(!__same_type( \
+ struct file *, ((_struct *)NULL)->_filep)) + \
+ BUILD_BUG_ON_ZERO(offsetof(_struct, _obj)))
static const struct iommufd_object_ops iommufd_object_ops[] = {
[IOMMUFD_OBJ_ACCESS] = {
.destroy = iommufd_access_destroy_object,
},
[IOMMUFD_OBJ_DEVICE] = {
+ .pre_destroy = iommufd_device_pre_destroy,
.destroy = iommufd_device_destroy,
},
- [IOMMUFD_OBJ_IOAS] = {
- .destroy = iommufd_ioas_destroy,
+ [IOMMUFD_OBJ_FAULT] = {
+ .destroy = iommufd_fault_destroy,
+ IOMMUFD_FILE_OFFSET(struct iommufd_fault, common.filep, common.obj),
+ },
+ [IOMMUFD_OBJ_HW_QUEUE] = {
+ .destroy = iommufd_hw_queue_destroy,
},
[IOMMUFD_OBJ_HWPT_PAGING] = {
.destroy = iommufd_hwpt_paging_destroy,
@@ -513,6 +713,21 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
.destroy = iommufd_hwpt_nested_destroy,
.abort = iommufd_hwpt_nested_abort,
},
+ [IOMMUFD_OBJ_IOAS] = {
+ .destroy = iommufd_ioas_destroy,
+ },
+ [IOMMUFD_OBJ_VDEVICE] = {
+ .destroy = iommufd_vdevice_destroy,
+ .abort = iommufd_vdevice_abort,
+ },
+ [IOMMUFD_OBJ_VEVENTQ] = {
+ .destroy = iommufd_veventq_destroy,
+ .abort = iommufd_veventq_abort,
+ IOMMUFD_FILE_OFFSET(struct iommufd_veventq, common.filep, common.obj),
+ },
+ [IOMMUFD_OBJ_VIOMMU] = {
+ .destroy = iommufd_viommu_destroy,
+ },
#ifdef CONFIG_IOMMUFD_TEST
[IOMMUFD_OBJ_SELFTEST] = {
.destroy = iommufd_selftest_destroy,
@@ -528,7 +743,6 @@ static struct miscdevice iommu_misc_dev = {
.mode = 0660,
};
-
static struct miscdevice vfio_misc_dev = {
.minor = VFIO_MINOR,
.name = "vfio",
@@ -537,6 +751,15 @@ static struct miscdevice vfio_misc_dev = {
.mode = 0666,
};
+/*
+ * Used only by DMABUF, returns a valid struct device to use as a dummy struct
+ * device for attachment.
+ */
+struct device *iommufd_global_device(void)
+{
+ return iommu_misc_dev.this_device;
+}
+
static int __init iommufd_init(void)
{
int ret;
@@ -578,7 +801,8 @@ module_exit(iommufd_exit);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
#endif
-MODULE_IMPORT_NS(IOMMUFD_INTERNAL);
-MODULE_IMPORT_NS(IOMMUFD);
+MODULE_IMPORT_NS("IOMMUFD_INTERNAL");
+MODULE_IMPORT_NS("IOMMUFD");
+MODULE_IMPORT_NS("DMA_BUF");
MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices");
MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index 528f356238b3..dbe51ecb9a20 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -45,16 +45,20 @@
* last_iova + 1 can overflow. An iopt_pages index will always be much less than
* ULONG_MAX so last_index + 1 cannot overflow.
*/
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
+#include <linux/file.h>
+#include <linux/highmem.h>
+#include <linux/iommu.h>
+#include <linux/iommufd.h>
+#include <linux/kthread.h>
#include <linux/overflow.h>
#include <linux/slab.h>
-#include <linux/iommu.h>
#include <linux/sched/mm.h>
-#include <linux/highmem.h>
-#include <linux/kthread.h>
-#include <linux/iommufd.h>
+#include <linux/vfio_pci_core.h>
-#include "io_pagetable.h"
#include "double_span.h"
+#include "io_pagetable.h"
#ifndef CONFIG_IOMMUFD_TEST
#define TEMP_MEMORY_LIMIT 65536
@@ -257,6 +261,11 @@ static struct iopt_area *iopt_pages_find_domain_area(struct iopt_pages *pages,
return container_of(node, struct iopt_area, pages_node);
}
+enum batch_kind {
+ BATCH_CPU_MEMORY = 0,
+ BATCH_MMIO,
+};
+
/*
* A simple datastructure to hold a vector of PFNs, optimized for contiguous
* PFNs. This is used as a temporary holding memory for shuttling pfns from one
@@ -270,7 +279,9 @@ struct pfn_batch {
unsigned int array_size;
unsigned int end;
unsigned int total_pfns;
+ enum batch_kind kind;
};
+enum { MAX_NPFNS = type_max(typeof(((struct pfn_batch *)0)->npfns[0])) };
static void batch_clear(struct pfn_batch *batch)
{
@@ -346,27 +357,47 @@ static void batch_destroy(struct pfn_batch *batch, void *backup)
kfree(batch->pfns);
}
-/* true if the pfn was added, false otherwise */
-static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn)
+static bool batch_add_pfn_num(struct pfn_batch *batch, unsigned long pfn,
+ u32 nr, enum batch_kind kind)
{
- const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns));
+ unsigned int end = batch->end;
- if (batch->end &&
- pfn == batch->pfns[batch->end - 1] + batch->npfns[batch->end - 1] &&
- batch->npfns[batch->end - 1] != MAX_NPFNS) {
- batch->npfns[batch->end - 1]++;
- batch->total_pfns++;
- return true;
+ if (batch->kind != kind) {
+ /* One kind per batch */
+ if (batch->end != 0)
+ return false;
+ batch->kind = kind;
}
- if (batch->end == batch->array_size)
+
+ if (end && pfn == batch->pfns[end - 1] + batch->npfns[end - 1] &&
+ nr <= MAX_NPFNS - batch->npfns[end - 1]) {
+ batch->npfns[end - 1] += nr;
+ } else if (end < batch->array_size) {
+ batch->pfns[end] = pfn;
+ batch->npfns[end] = nr;
+ batch->end++;
+ } else {
return false;
- batch->total_pfns++;
- batch->pfns[batch->end] = pfn;
- batch->npfns[batch->end] = 1;
- batch->end++;
+ }
+
+ batch->total_pfns += nr;
return true;
}
+static void batch_remove_pfn_num(struct pfn_batch *batch, unsigned long nr)
+{
+ batch->npfns[batch->end - 1] -= nr;
+ if (batch->npfns[batch->end - 1] == 0)
+ batch->end--;
+ batch->total_pfns -= nr;
+}
+
+/* true if the pfn was added, false otherwise */
+static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn)
+{
+ return batch_add_pfn_num(batch, pfn, 1, BATCH_CPU_MEMORY);
+}
+
/*
* Fill the batch with pfns from the domain. When the batch is full, or it
* reaches last_index, the function will return. The caller should use
@@ -477,6 +508,7 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain,
{
bool disable_large_pages = area->iopt->disable_large_pages;
unsigned long last_iova = iopt_area_last_iova(area);
+ int iommu_prot = area->iommu_prot;
unsigned int page_offset = 0;
unsigned long start_iova;
unsigned long next_iova;
@@ -484,6 +516,11 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain,
unsigned long iova;
int rc;
+ if (batch->kind == BATCH_MMIO) {
+ iommu_prot &= ~IOMMU_CACHE;
+ iommu_prot |= IOMMU_MMIO;
+ }
+
/* The first index might be a partial page */
if (start_index == iopt_area_index(area))
page_offset = area->page_offset;
@@ -497,11 +534,11 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain,
rc = batch_iommu_map_small(
domain, iova,
PFN_PHYS(batch->pfns[cur]) + page_offset,
- next_iova - iova, area->iommu_prot);
+ next_iova - iova, iommu_prot);
else
rc = iommu_map(domain, iova,
PFN_PHYS(batch->pfns[cur]) + page_offset,
- next_iova - iova, area->iommu_prot,
+ next_iova - iova, iommu_prot,
GFP_KERNEL_ACCOUNT);
if (rc)
goto err_unmap;
@@ -622,6 +659,41 @@ static void batch_from_pages(struct pfn_batch *batch, struct page **pages,
break;
}
+static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p,
+ unsigned long *offset_p, unsigned long npages)
+{
+ int rc = 0;
+ struct folio **folios = *folios_p;
+ unsigned long offset = *offset_p;
+
+ while (npages) {
+ struct folio *folio = *folios;
+ unsigned long nr = folio_nr_pages(folio) - offset;
+ unsigned long pfn = page_to_pfn(folio_page(folio, offset));
+
+ nr = min(nr, npages);
+ npages -= nr;
+
+ if (!batch_add_pfn_num(batch, pfn, nr, BATCH_CPU_MEMORY))
+ break;
+ if (nr > 1) {
+ rc = folio_add_pins(folio, nr - 1);
+ if (rc) {
+ batch_remove_pfn_num(batch, nr);
+ goto out;
+ }
+ }
+
+ folios++;
+ offset = 0;
+ }
+
+out:
+ *folios_p = folios;
+ *offset_p = offset;
+ return rc;
+}
+
static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages,
unsigned int first_page_off, size_t npages)
{
@@ -703,19 +775,32 @@ struct pfn_reader_user {
* neither
*/
int locked;
+
+ /* The following are only valid if file != NULL. */
+ struct file *file;
+ struct folio **ufolios;
+ size_t ufolios_len;
+ unsigned long ufolios_offset;
+ struct folio **ufolios_next;
};
static void pfn_reader_user_init(struct pfn_reader_user *user,
struct iopt_pages *pages)
{
user->upages = NULL;
+ user->upages_len = 0;
user->upages_start = 0;
user->upages_end = 0;
user->locked = -1;
-
user->gup_flags = FOLL_LONGTERM;
if (pages->writable)
user->gup_flags |= FOLL_WRITE;
+
+ user->file = (pages->type == IOPT_ADDRESS_FILE) ? pages->file : NULL;
+ user->ufolios = NULL;
+ user->ufolios_len = 0;
+ user->ufolios_next = NULL;
+ user->ufolios_offset = 0;
}
static void pfn_reader_user_destroy(struct pfn_reader_user *user,
@@ -724,13 +809,67 @@ static void pfn_reader_user_destroy(struct pfn_reader_user *user,
if (user->locked != -1) {
if (user->locked)
mmap_read_unlock(pages->source_mm);
- if (pages->source_mm != current->mm)
+ if (!user->file && pages->source_mm != current->mm)
mmput(pages->source_mm);
user->locked = -1;
}
kfree(user->upages);
user->upages = NULL;
+ kfree(user->ufolios);
+ user->ufolios = NULL;
+}
+
+static long pin_memfd_pages(struct pfn_reader_user *user, unsigned long start,
+ unsigned long npages)
+{
+ unsigned long i;
+ unsigned long offset;
+ unsigned long npages_out = 0;
+ struct page **upages = user->upages;
+ unsigned long end = start + (npages << PAGE_SHIFT) - 1;
+ long nfolios = user->ufolios_len / sizeof(*user->ufolios);
+
+ /*
+ * todo: memfd_pin_folios should return the last pinned offset so
+ * we can compute npages pinned, and avoid looping over folios here
+ * if upages == NULL.
+ */
+ nfolios = memfd_pin_folios(user->file, start, end, user->ufolios,
+ nfolios, &offset);
+ if (nfolios <= 0)
+ return nfolios;
+
+ offset >>= PAGE_SHIFT;
+ user->ufolios_next = user->ufolios;
+ user->ufolios_offset = offset;
+
+ for (i = 0; i < nfolios; i++) {
+ struct folio *folio = user->ufolios[i];
+ unsigned long nr = folio_nr_pages(folio);
+ unsigned long npin = min(nr - offset, npages);
+
+ npages -= npin;
+ npages_out += npin;
+
+ if (upages) {
+ if (npin == 1) {
+ *upages++ = folio_page(folio, offset);
+ } else {
+ int rc = folio_add_pins(folio, npin - 1);
+
+ if (rc)
+ return rc;
+
+ while (npin--)
+ *upages++ = folio_page(folio, offset++);
+ }
+ }
+
+ offset = 0;
+ }
+
+ return npages_out;
}
static int pfn_reader_user_pin(struct pfn_reader_user *user,
@@ -739,7 +878,9 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user,
unsigned long last_index)
{
bool remote_mm = pages->source_mm != current->mm;
- unsigned long npages;
+ unsigned long npages = last_index - start_index + 1;
+ unsigned long start;
+ unsigned long unum;
uintptr_t uptr;
long rc;
@@ -747,40 +888,50 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user,
WARN_ON(last_index < start_index))
return -EINVAL;
- if (!user->upages) {
+ if (!user->file && !user->upages) {
/* All undone in pfn_reader_destroy() */
- user->upages_len =
- (last_index - start_index + 1) * sizeof(*user->upages);
+ user->upages_len = npages * sizeof(*user->upages);
user->upages = temp_kmalloc(&user->upages_len, NULL, 0);
if (!user->upages)
return -ENOMEM;
}
+ if (user->file && !user->ufolios) {
+ user->ufolios_len = npages * sizeof(*user->ufolios);
+ user->ufolios = temp_kmalloc(&user->ufolios_len, NULL, 0);
+ if (!user->ufolios)
+ return -ENOMEM;
+ }
+
if (user->locked == -1) {
/*
* The majority of usages will run the map task within the mm
* providing the pages, so we can optimize into
* get_user_pages_fast()
*/
- if (remote_mm) {
+ if (!user->file && remote_mm) {
if (!mmget_not_zero(pages->source_mm))
return -EFAULT;
}
user->locked = 0;
}
- npages = min_t(unsigned long, last_index - start_index + 1,
- user->upages_len / sizeof(*user->upages));
-
+ unum = user->file ? user->ufolios_len / sizeof(*user->ufolios) :
+ user->upages_len / sizeof(*user->upages);
+ npages = min_t(unsigned long, npages, unum);
if (iommufd_should_fail())
return -EFAULT;
- uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE);
- if (!remote_mm)
+ if (user->file) {
+ start = pages->start + (start_index * PAGE_SIZE);
+ rc = pin_memfd_pages(user, start, npages);
+ } else if (!remote_mm) {
+ uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE);
rc = pin_user_pages_fast(uptr, npages, user->gup_flags,
user->upages);
- else {
+ } else {
+ uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE);
if (!user->locked) {
mmap_read_lock(pages->source_mm);
user->locked = 1;
@@ -809,13 +960,14 @@ static int incr_user_locked_vm(struct iopt_pages *pages, unsigned long npages)
lock_limit = task_rlimit(pages->source_task, RLIMIT_MEMLOCK) >>
PAGE_SHIFT;
+
+ cur_pages = atomic_long_read(&pages->source_user->locked_vm);
do {
- cur_pages = atomic_long_read(&pages->source_user->locked_vm);
new_pages = cur_pages + npages;
if (new_pages > lock_limit)
return -ENOMEM;
- } while (atomic_long_cmpxchg(&pages->source_user->locked_vm, cur_pages,
- new_pages) != cur_pages);
+ } while (!atomic_long_try_cmpxchg(&pages->source_user->locked_vm,
+ &cur_pages, new_pages));
return 0;
}
@@ -837,7 +989,8 @@ static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages,
mmap_read_unlock(pages->source_mm);
user->locked = 0;
/* If we had the lock then we also have a get */
- } else if ((!user || !user->upages) &&
+
+ } else if ((!user || (!user->upages && !user->ufolios)) &&
pages->source_mm != current->mm) {
if (!mmget_not_zero(pages->source_mm))
return -EINVAL;
@@ -854,8 +1007,8 @@ static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages,
return rc;
}
-static int do_update_pinned(struct iopt_pages *pages, unsigned long npages,
- bool inc, struct pfn_reader_user *user)
+int iopt_pages_update_pinned(struct iopt_pages *pages, unsigned long npages,
+ bool inc, struct pfn_reader_user *user)
{
int rc = 0;
@@ -889,8 +1042,8 @@ static void update_unpinned(struct iopt_pages *pages)
return;
if (pages->npinned == pages->last_npinned)
return;
- do_update_pinned(pages, pages->last_npinned - pages->npinned, false,
- NULL);
+ iopt_pages_update_pinned(pages, pages->last_npinned - pages->npinned,
+ false, NULL);
}
/*
@@ -920,7 +1073,42 @@ static int pfn_reader_user_update_pinned(struct pfn_reader_user *user,
npages = pages->npinned - pages->last_npinned;
inc = true;
}
- return do_update_pinned(pages, npages, inc, user);
+ return iopt_pages_update_pinned(pages, npages, inc, user);
+}
+
+struct pfn_reader_dmabuf {
+ struct dma_buf_phys_vec phys;
+ unsigned long start_offset;
+};
+
+static int pfn_reader_dmabuf_init(struct pfn_reader_dmabuf *dmabuf,
+ struct iopt_pages *pages)
+{
+ /* Callers must not get here if the dmabuf was already revoked */
+ if (WARN_ON(iopt_dmabuf_revoked(pages)))
+ return -EINVAL;
+
+ dmabuf->phys = pages->dmabuf.phys;
+ dmabuf->start_offset = pages->dmabuf.start;
+ return 0;
+}
+
+static int pfn_reader_fill_dmabuf(struct pfn_reader_dmabuf *dmabuf,
+ struct pfn_batch *batch,
+ unsigned long start_index,
+ unsigned long last_index)
+{
+ unsigned long start = dmabuf->start_offset + start_index * PAGE_SIZE;
+
+ /*
+ * start/last_index and start are all PAGE_SIZE aligned, the batch is
+ * always filled using page size aligned PFNs just like the other types.
+ * If the dmabuf has been sliced on a sub page offset then the common
+ * batch to domain code will adjust it before mapping to the domain.
+ */
+ batch_add_pfn_num(batch, PHYS_PFN(dmabuf->phys.paddr + start),
+ last_index - start_index + 1, BATCH_MMIO);
+ return 0;
}
/*
@@ -941,7 +1129,10 @@ struct pfn_reader {
unsigned long batch_end_index;
unsigned long last_index;
- struct pfn_reader_user user;
+ union {
+ struct pfn_reader_user user;
+ struct pfn_reader_dmabuf dmabuf;
+ };
};
static int pfn_reader_update_pinned(struct pfn_reader *pfns)
@@ -977,6 +1168,8 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns)
{
struct interval_tree_double_span_iter *span = &pfns->span;
unsigned long start_index = pfns->batch_end_index;
+ struct pfn_reader_user *user;
+ unsigned long npages;
struct iopt_area *area;
int rc;
@@ -1007,18 +1200,29 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns)
return 0;
}
- if (start_index >= pfns->user.upages_end) {
- rc = pfn_reader_user_pin(&pfns->user, pfns->pages, start_index,
+ if (iopt_is_dmabuf(pfns->pages))
+ return pfn_reader_fill_dmabuf(&pfns->dmabuf, &pfns->batch,
+ start_index, span->last_hole);
+
+ user = &pfns->user;
+ if (start_index >= user->upages_end) {
+ rc = pfn_reader_user_pin(user, pfns->pages, start_index,
span->last_hole);
if (rc)
return rc;
}
- batch_from_pages(&pfns->batch,
- pfns->user.upages +
- (start_index - pfns->user.upages_start),
- pfns->user.upages_end - start_index);
- return 0;
+ npages = user->upages_end - start_index;
+ start_index -= user->upages_start;
+ rc = 0;
+
+ if (!user->file)
+ batch_from_pages(&pfns->batch, user->upages + start_index,
+ npages);
+ else
+ rc = batch_from_folios(&pfns->batch, &user->ufolios_next,
+ &user->ufolios_offset, npages);
+ return rc;
}
static bool pfn_reader_done(struct pfn_reader *pfns)
@@ -1070,7 +1274,10 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages,
pfns->batch_start_index = start_index;
pfns->batch_end_index = start_index;
pfns->last_index = last_index;
- pfn_reader_user_init(&pfns->user, pages);
+ if (iopt_is_dmabuf(pages))
+ pfn_reader_dmabuf_init(&pfns->dmabuf, pages);
+ else
+ pfn_reader_user_init(&pfns->user, pages);
rc = batch_init(&pfns->batch, last_index - start_index + 1);
if (rc)
return rc;
@@ -1091,16 +1298,29 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages,
static void pfn_reader_release_pins(struct pfn_reader *pfns)
{
struct iopt_pages *pages = pfns->pages;
+ struct pfn_reader_user *user;
- if (pfns->user.upages_end > pfns->batch_end_index) {
- size_t npages = pfns->user.upages_end - pfns->batch_end_index;
+ if (iopt_is_dmabuf(pages))
+ return;
+ user = &pfns->user;
+ if (user->upages_end > pfns->batch_end_index) {
/* Any pages not transferred to the batch are just unpinned */
- unpin_user_pages(pfns->user.upages + (pfns->batch_end_index -
- pfns->user.upages_start),
- npages);
+
+ unsigned long npages = user->upages_end - pfns->batch_end_index;
+ unsigned long start_index = pfns->batch_end_index -
+ user->upages_start;
+
+ if (!user->file) {
+ unpin_user_pages(user->upages + start_index, npages);
+ } else {
+ long n = user->ufolios_len / sizeof(*user->ufolios);
+
+ unpin_folios(user->ufolios_next,
+ user->ufolios + n - user->ufolios_next);
+ }
iopt_pages_sub_npinned(pages, npages);
- pfns->user.upages_end = pfns->batch_end_index;
+ user->upages_end = pfns->batch_end_index;
}
if (pfns->batch_start_index != pfns->batch_end_index) {
pfn_reader_unpin(pfns);
@@ -1113,7 +1333,8 @@ static void pfn_reader_destroy(struct pfn_reader *pfns)
struct iopt_pages *pages = pfns->pages;
pfn_reader_release_pins(pfns);
- pfn_reader_user_destroy(&pfns->user, pfns->pages);
+ if (!iopt_is_dmabuf(pfns->pages))
+ pfn_reader_user_destroy(&pfns->user, pfns->pages);
batch_destroy(&pfns->batch, NULL);
WARN_ON(pages->last_npinned != pages->npinned);
}
@@ -1138,11 +1359,10 @@ static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages,
return 0;
}
-struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
- bool writable)
+static struct iopt_pages *iopt_alloc_pages(unsigned long start_byte,
+ unsigned long length, bool writable)
{
struct iopt_pages *pages;
- unsigned long end;
/*
* The iommu API uses size_t as the length, and protect the DIV_ROUND_UP
@@ -1151,9 +1371,6 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
if (length > SIZE_MAX - PAGE_SIZE || length == 0)
return ERR_PTR(-EINVAL);
- if (check_add_overflow((unsigned long)uptr, length, &end))
- return ERR_PTR(-EOVERFLOW);
-
pages = kzalloc(sizeof(*pages), GFP_KERNEL_ACCOUNT);
if (!pages)
return ERR_PTR(-ENOMEM);
@@ -1163,8 +1380,7 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
mutex_init(&pages->mutex);
pages->source_mm = current->mm;
mmgrab(pages->source_mm);
- pages->uptr = (void __user *)ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE);
- pages->npages = DIV_ROUND_UP(length + (uptr - pages->uptr), PAGE_SIZE);
+ pages->npages = DIV_ROUND_UP(length + start_byte, PAGE_SIZE);
pages->access_itree = RB_ROOT_CACHED;
pages->domains_itree = RB_ROOT_CACHED;
pages->writable = writable;
@@ -1178,6 +1394,253 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
return pages;
}
+struct iopt_pages *iopt_alloc_user_pages(void __user *uptr,
+ unsigned long length, bool writable)
+{
+ struct iopt_pages *pages;
+ unsigned long end;
+ void __user *uptr_down =
+ (void __user *)ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE);
+
+ if (check_add_overflow((unsigned long)uptr, length, &end))
+ return ERR_PTR(-EOVERFLOW);
+
+ pages = iopt_alloc_pages(uptr - uptr_down, length, writable);
+ if (IS_ERR(pages))
+ return pages;
+ pages->uptr = uptr_down;
+ pages->type = IOPT_ADDRESS_USER;
+ return pages;
+}
+
+struct iopt_pages *iopt_alloc_file_pages(struct file *file,
+ unsigned long start_byte,
+ unsigned long start,
+ unsigned long length, bool writable)
+
+{
+ struct iopt_pages *pages;
+
+ pages = iopt_alloc_pages(start_byte, length, writable);
+ if (IS_ERR(pages))
+ return pages;
+ pages->file = get_file(file);
+ pages->start = start - start_byte;
+ pages->type = IOPT_ADDRESS_FILE;
+ return pages;
+}
+
+static void iopt_revoke_notify(struct dma_buf_attachment *attach)
+{
+ struct iopt_pages *pages = attach->importer_priv;
+ struct iopt_pages_dmabuf_track *track;
+
+ guard(mutex)(&pages->mutex);
+ if (iopt_dmabuf_revoked(pages))
+ return;
+
+ list_for_each_entry(track, &pages->dmabuf.tracker, elm) {
+ struct iopt_area *area = track->area;
+
+ iopt_area_unmap_domain_range(area, track->domain,
+ iopt_area_index(area),
+ iopt_area_last_index(area));
+ }
+ pages->dmabuf.phys.len = 0;
+}
+
+static struct dma_buf_attach_ops iopt_dmabuf_attach_revoke_ops = {
+ .allow_peer2peer = true,
+ .move_notify = iopt_revoke_notify,
+};
+
+/*
+ * iommufd and vfio have a circular dependency. Future work for a phys
+ * based private interconnect will remove this.
+ */
+static int
+sym_vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+ struct dma_buf_phys_vec *phys)
+{
+ typeof(&vfio_pci_dma_buf_iommufd_map) fn;
+ int rc;
+
+ rc = iommufd_test_dma_buf_iommufd_map(attachment, phys);
+ if (rc != -EOPNOTSUPP)
+ return rc;
+
+ if (!IS_ENABLED(CONFIG_VFIO_PCI_DMABUF))
+ return -EOPNOTSUPP;
+
+ fn = symbol_get(vfio_pci_dma_buf_iommufd_map);
+ if (!fn)
+ return -EOPNOTSUPP;
+ rc = fn(attachment, phys);
+ symbol_put(vfio_pci_dma_buf_iommufd_map);
+ return rc;
+}
+
+static int iopt_map_dmabuf(struct iommufd_ctx *ictx, struct iopt_pages *pages,
+ struct dma_buf *dmabuf)
+{
+ struct dma_buf_attachment *attach;
+ int rc;
+
+ attach = dma_buf_dynamic_attach(dmabuf, iommufd_global_device(),
+ &iopt_dmabuf_attach_revoke_ops, pages);
+ if (IS_ERR(attach))
+ return PTR_ERR(attach);
+
+ dma_resv_lock(dmabuf->resv, NULL);
+ /*
+ * Lock ordering requires the mutex to be taken inside the reservation,
+ * make sure lockdep sees this.
+ */
+ if (IS_ENABLED(CONFIG_LOCKDEP)) {
+ mutex_lock(&pages->mutex);
+ mutex_unlock(&pages->mutex);
+ }
+
+ rc = sym_vfio_pci_dma_buf_iommufd_map(attach, &pages->dmabuf.phys);
+ if (rc)
+ goto err_detach;
+
+ dma_resv_unlock(dmabuf->resv);
+
+ /* On success iopt_release_pages() will detach and put the dmabuf. */
+ pages->dmabuf.attach = attach;
+ return 0;
+
+err_detach:
+ dma_resv_unlock(dmabuf->resv);
+ dma_buf_detach(dmabuf, attach);
+ return rc;
+}
+
+struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx,
+ struct dma_buf *dmabuf,
+ unsigned long start_byte,
+ unsigned long start,
+ unsigned long length, bool writable)
+{
+ static struct lock_class_key pages_dmabuf_mutex_key;
+ struct iopt_pages *pages;
+ int rc;
+
+ if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (dmabuf->size <= (start + length - 1) ||
+ length / PAGE_SIZE >= MAX_NPFNS)
+ return ERR_PTR(-EINVAL);
+
+ pages = iopt_alloc_pages(start_byte, length, writable);
+ if (IS_ERR(pages))
+ return pages;
+
+ /*
+ * The mmap_lock can be held when obtaining the dmabuf reservation lock
+ * which creates a locking cycle with the pages mutex which is held
+ * while obtaining the mmap_lock. This locking path is not present for
+ * IOPT_ADDRESS_DMABUF so split the lock class.
+ */
+ lockdep_set_class(&pages->mutex, &pages_dmabuf_mutex_key);
+
+ /* dmabuf does not use pinned page accounting. */
+ pages->account_mode = IOPT_PAGES_ACCOUNT_NONE;
+ pages->type = IOPT_ADDRESS_DMABUF;
+ pages->dmabuf.start = start - start_byte;
+ INIT_LIST_HEAD(&pages->dmabuf.tracker);
+
+ rc = iopt_map_dmabuf(ictx, pages, dmabuf);
+ if (rc) {
+ iopt_put_pages(pages);
+ return ERR_PTR(rc);
+ }
+
+ return pages;
+}
+
+int iopt_dmabuf_track_domain(struct iopt_pages *pages, struct iopt_area *area,
+ struct iommu_domain *domain)
+{
+ struct iopt_pages_dmabuf_track *track;
+
+ lockdep_assert_held(&pages->mutex);
+ if (WARN_ON(!iopt_is_dmabuf(pages)))
+ return -EINVAL;
+
+ list_for_each_entry(track, &pages->dmabuf.tracker, elm)
+ if (WARN_ON(track->domain == domain && track->area == area))
+ return -EINVAL;
+
+ track = kzalloc(sizeof(*track), GFP_KERNEL);
+ if (!track)
+ return -ENOMEM;
+ track->domain = domain;
+ track->area = area;
+ list_add_tail(&track->elm, &pages->dmabuf.tracker);
+
+ return 0;
+}
+
+void iopt_dmabuf_untrack_domain(struct iopt_pages *pages,
+ struct iopt_area *area,
+ struct iommu_domain *domain)
+{
+ struct iopt_pages_dmabuf_track *track;
+
+ lockdep_assert_held(&pages->mutex);
+ WARN_ON(!iopt_is_dmabuf(pages));
+
+ list_for_each_entry(track, &pages->dmabuf.tracker, elm) {
+ if (track->domain == domain && track->area == area) {
+ list_del(&track->elm);
+ kfree(track);
+ return;
+ }
+ }
+ WARN_ON(true);
+}
+
+int iopt_dmabuf_track_all_domains(struct iopt_area *area,
+ struct iopt_pages *pages)
+{
+ struct iopt_pages_dmabuf_track *track;
+ struct iommu_domain *domain;
+ unsigned long index;
+ int rc;
+
+ list_for_each_entry(track, &pages->dmabuf.tracker, elm)
+ if (WARN_ON(track->area == area))
+ return -EINVAL;
+
+ xa_for_each(&area->iopt->domains, index, domain) {
+ rc = iopt_dmabuf_track_domain(pages, area, domain);
+ if (rc)
+ goto err_untrack;
+ }
+ return 0;
+err_untrack:
+ iopt_dmabuf_untrack_all_domains(area, pages);
+ return rc;
+}
+
+void iopt_dmabuf_untrack_all_domains(struct iopt_area *area,
+ struct iopt_pages *pages)
+{
+ struct iopt_pages_dmabuf_track *track;
+ struct iopt_pages_dmabuf_track *tmp;
+
+ list_for_each_entry_safe(track, tmp, &pages->dmabuf.tracker,
+ elm) {
+ if (track->area == area) {
+ list_del(&track->elm);
+ kfree(track);
+ }
+ }
+}
+
void iopt_release_pages(struct kref *kref)
{
struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref);
@@ -1190,6 +1653,15 @@ void iopt_release_pages(struct kref *kref)
mutex_destroy(&pages->mutex);
put_task_struct(pages->source_task);
free_uid(pages->source_user);
+ if (iopt_is_dmabuf(pages) && pages->dmabuf.attach) {
+ struct dma_buf *dmabuf = pages->dmabuf.attach->dmabuf;
+
+ dma_buf_detach(dmabuf, pages->dmabuf.attach);
+ dma_buf_put(dmabuf);
+ WARN_ON(!list_empty(&pages->dmabuf.tracker));
+ } else if (pages->type == IOPT_ADDRESS_FILE) {
+ fput(pages->file);
+ }
kfree(pages);
}
@@ -1267,6 +1739,14 @@ static void __iopt_area_unfill_domain(struct iopt_area *area,
lockdep_assert_held(&pages->mutex);
+ if (iopt_is_dmabuf(pages)) {
+ if (WARN_ON(iopt_dmabuf_revoked(pages)))
+ return;
+ iopt_area_unmap_domain_range(area, domain, start_index,
+ last_index);
+ return;
+ }
+
/*
* For security we must not unpin something that is still DMA mapped,
* so this must unmap any IOVA before we go ahead and unpin the pages.
@@ -1342,6 +1822,9 @@ void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain)
void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages,
struct iommu_domain *domain)
{
+ if (iopt_dmabuf_revoked(pages))
+ return;
+
__iopt_area_unfill_domain(area, pages, domain,
iopt_area_last_index(area));
}
@@ -1362,6 +1845,9 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
lockdep_assert_held(&area->pages->mutex);
+ if (iopt_dmabuf_revoked(area->pages))
+ return 0;
+
rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area),
iopt_area_last_index(area));
if (rc)
@@ -1421,33 +1907,44 @@ int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages)
return 0;
mutex_lock(&pages->mutex);
- rc = pfn_reader_first(&pfns, pages, iopt_area_index(area),
- iopt_area_last_index(area));
- if (rc)
- goto out_unlock;
+ if (iopt_is_dmabuf(pages)) {
+ rc = iopt_dmabuf_track_all_domains(area, pages);
+ if (rc)
+ goto out_unlock;
+ }
- while (!pfn_reader_done(&pfns)) {
- done_first_end_index = pfns.batch_end_index;
- done_all_end_index = pfns.batch_start_index;
- xa_for_each(&area->iopt->domains, index, domain) {
- rc = batch_to_domain(&pfns.batch, domain, area,
- pfns.batch_start_index);
+ if (!iopt_dmabuf_revoked(pages)) {
+ rc = pfn_reader_first(&pfns, pages, iopt_area_index(area),
+ iopt_area_last_index(area));
+ if (rc)
+ goto out_untrack;
+
+ while (!pfn_reader_done(&pfns)) {
+ done_first_end_index = pfns.batch_end_index;
+ done_all_end_index = pfns.batch_start_index;
+ xa_for_each(&area->iopt->domains, index, domain) {
+ rc = batch_to_domain(&pfns.batch, domain, area,
+ pfns.batch_start_index);
+ if (rc)
+ goto out_unmap;
+ }
+ done_all_end_index = done_first_end_index;
+
+ rc = pfn_reader_next(&pfns);
if (rc)
goto out_unmap;
}
- done_all_end_index = done_first_end_index;
-
- rc = pfn_reader_next(&pfns);
+ rc = pfn_reader_update_pinned(&pfns);
if (rc)
goto out_unmap;
+
+ pfn_reader_destroy(&pfns);
}
- rc = pfn_reader_update_pinned(&pfns);
- if (rc)
- goto out_unmap;
area->storage_domain = xa_load(&area->iopt->domains, 0);
interval_tree_insert(&area->pages_node, &pages->domains_itree);
- goto out_destroy;
+ mutex_unlock(&pages->mutex);
+ return 0;
out_unmap:
pfn_reader_release_pins(&pfns);
@@ -1474,8 +1971,10 @@ out_unmap:
end_index);
}
}
-out_destroy:
pfn_reader_destroy(&pfns);
+out_untrack:
+ if (iopt_is_dmabuf(pages))
+ iopt_dmabuf_untrack_all_domains(area, pages);
out_unlock:
mutex_unlock(&pages->mutex);
return rc;
@@ -1501,16 +2000,22 @@ void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages)
if (!area->storage_domain)
goto out_unlock;
- xa_for_each(&iopt->domains, index, domain)
- if (domain != area->storage_domain)
+ xa_for_each(&iopt->domains, index, domain) {
+ if (domain == area->storage_domain)
+ continue;
+
+ if (!iopt_dmabuf_revoked(pages))
iopt_area_unmap_domain_range(
area, domain, iopt_area_index(area),
iopt_area_last_index(area));
+ }
if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
WARN_ON(RB_EMPTY_NODE(&area->pages_node.rb));
interval_tree_remove(&area->pages_node, &pages->domains_itree);
iopt_area_unfill_domain(area, pages, area->storage_domain);
+ if (iopt_is_dmabuf(pages))
+ iopt_dmabuf_untrack_all_domains(area, pages);
area->storage_domain = NULL;
out_unlock:
mutex_unlock(&pages->mutex);
@@ -1629,11 +2134,11 @@ static int iopt_pages_fill_from_domain(struct iopt_pages *pages,
return 0;
}
-static int iopt_pages_fill_from_mm(struct iopt_pages *pages,
- struct pfn_reader_user *user,
- unsigned long start_index,
- unsigned long last_index,
- struct page **out_pages)
+static int iopt_pages_fill(struct iopt_pages *pages,
+ struct pfn_reader_user *user,
+ unsigned long start_index,
+ unsigned long last_index,
+ struct page **out_pages)
{
unsigned long cur_index = start_index;
int rc;
@@ -1707,8 +2212,8 @@ int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start_index,
/* hole */
cur_pages = out_pages + (span.start_hole - start_index);
- rc = iopt_pages_fill_from_mm(pages, &user, span.start_hole,
- span.last_hole, cur_pages);
+ rc = iopt_pages_fill(pages, &user, span.start_hole,
+ span.last_hole, cur_pages);
if (rc)
goto out_clean_xa;
rc = pages_to_xarray(&pages->pinned_pfns, span.start_hole,
@@ -1788,6 +2293,10 @@ static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index,
struct page *page = NULL;
int rc;
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
+ WARN_ON(pages->type != IOPT_ADDRESS_USER))
+ return -EINVAL;
+
if (!mmget_not_zero(pages->source_mm))
return iopt_pages_rw_slow(pages, index, index, offset, data,
length, flags);
@@ -1843,6 +2352,14 @@ int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte,
if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable)
return -EPERM;
+ if (iopt_is_dmabuf(pages))
+ return -EINVAL;
+
+ if (pages->type != IOPT_ADDRESS_USER)
+ return iopt_pages_rw_slow(pages, start_index, last_index,
+ start_byte % PAGE_SIZE, data, length,
+ flags);
+
if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) {
if (start_index == last_index)
return iopt_pages_rw_page(pages, start_index,
@@ -1906,6 +2423,7 @@ iopt_pages_get_exact_access(struct iopt_pages *pages, unsigned long index,
* @last_index: Inclusive last page index
* @out_pages: Output list of struct page's representing the PFNs
* @flags: IOMMUFD_ACCESS_RW_* flags
+ * @lock_area: Fail userspace munmap on this area
*
* Record that an in-kernel access will be accessing the pages, ensure they are
* pinned, and return the PFNs as a simple list of 'struct page *'.
@@ -1913,8 +2431,8 @@ iopt_pages_get_exact_access(struct iopt_pages *pages, unsigned long index,
* This should be undone through a matching call to iopt_area_remove_access()
*/
int iopt_area_add_access(struct iopt_area *area, unsigned long start_index,
- unsigned long last_index, struct page **out_pages,
- unsigned int flags)
+ unsigned long last_index, struct page **out_pages,
+ unsigned int flags, bool lock_area)
{
struct iopt_pages *pages = area->pages;
struct iopt_pages_access *access;
@@ -1927,6 +2445,8 @@ int iopt_area_add_access(struct iopt_area *area, unsigned long start_index,
access = iopt_pages_get_exact_access(pages, start_index, last_index);
if (access) {
area->num_accesses++;
+ if (lock_area)
+ area->num_locks++;
access->users++;
iopt_pages_fill_from_xarray(pages, start_index, last_index,
out_pages);
@@ -1948,6 +2468,8 @@ int iopt_area_add_access(struct iopt_area *area, unsigned long start_index,
access->node.last = last_index;
access->users = 1;
area->num_accesses++;
+ if (lock_area)
+ area->num_locks++;
interval_tree_insert(&access->node, &pages->access_itree);
mutex_unlock(&pages->mutex);
return 0;
@@ -1964,12 +2486,13 @@ err_unlock:
* @area: The source of PFNs
* @start_index: First page index
* @last_index: Inclusive last page index
+ * @unlock_area: Must match the matching iopt_area_add_access()'s lock_area
*
* Undo iopt_area_add_access() and unpin the pages if necessary. The caller
* must stop using the PFNs before calling this.
*/
void iopt_area_remove_access(struct iopt_area *area, unsigned long start_index,
- unsigned long last_index)
+ unsigned long last_index, bool unlock_area)
{
struct iopt_pages *pages = area->pages;
struct iopt_pages_access *access;
@@ -1980,6 +2503,10 @@ void iopt_area_remove_access(struct iopt_area *area, unsigned long start_index,
goto out_unlock;
WARN_ON(area->num_accesses == 0 || access->users == 0);
+ if (unlock_area) {
+ WARN_ON(area->num_locks == 0);
+ area->num_locks--;
+ }
area->num_accesses--;
access->users--;
if (access->users)
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index 7a2199470f31..c4322fd26f93 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -3,14 +3,19 @@
*
* Kernel side components to support tools/testing/selftests/iommu
*/
-#include <linux/slab.h>
-#include <linux/iommu.h>
-#include <linux/xarray.h>
-#include <linux/file.h>
#include <linux/anon_inodes.h>
+#include <linux/debugfs.h>
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
#include <linux/fault-inject.h>
+#include <linux/file.h>
+#include <linux/iommu.h>
#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
#include <uapi/linux/iommufd.h>
+#include <linux/generic_pt/iommu.h>
+#include "../iommu-pages.h"
#include "../iommu-priv.h"
#include "io_pagetable.h"
@@ -40,23 +45,11 @@ static DEFINE_IDA(mock_dev_ida);
enum {
MOCK_DIRTY_TRACK = 1,
- MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2,
- MOCK_HUGE_PAGE_SIZE = 512 * MOCK_IO_PAGE_SIZE,
-
- /*
- * Like a real page table alignment requires the low bits of the address
- * to be zero. xarray also requires the high bit to be zero, so we store
- * the pfns shifted. The upper bits are used for metadata.
- */
- MOCK_PFN_MASK = ULONG_MAX / MOCK_IO_PAGE_SIZE,
-
- _MOCK_PFN_START = MOCK_PFN_MASK + 1,
- MOCK_PFN_START_IOVA = _MOCK_PFN_START,
- MOCK_PFN_LAST_IOVA = _MOCK_PFN_START,
- MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1,
- MOCK_PFN_HUGE_IOVA = _MOCK_PFN_START << 2,
};
+static int mock_dev_enable_iopf(struct device *dev, struct iommu_domain *domain);
+static void mock_dev_disable_iopf(struct device *dev, struct iommu_domain *domain);
+
/*
* Syzkaller has trouble randomizing the correct iova to use since it is linked
* to the map ioctl's output, and it has no ide about that. So, simplify things.
@@ -120,27 +113,84 @@ void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
}
struct mock_iommu_domain {
+ union {
+ struct iommu_domain domain;
+ struct pt_iommu iommu;
+ struct pt_iommu_amdv1 amdv1;
+ };
unsigned long flags;
- struct iommu_domain domain;
- struct xarray pfns;
};
+PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, iommu, domain);
+PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, amdv1.iommu, domain);
+
+static inline struct mock_iommu_domain *
+to_mock_domain(struct iommu_domain *domain)
+{
+ return container_of(domain, struct mock_iommu_domain, domain);
+}
struct mock_iommu_domain_nested {
struct iommu_domain domain;
- struct mock_iommu_domain *parent;
+ struct mock_viommu *mock_viommu;
u32 iotlb[MOCK_NESTED_DOMAIN_IOTLB_NUM];
};
+static inline struct mock_iommu_domain_nested *
+to_mock_nested(struct iommu_domain *domain)
+{
+ return container_of(domain, struct mock_iommu_domain_nested, domain);
+}
+
+struct mock_viommu {
+ struct iommufd_viommu core;
+ struct mock_iommu_domain *s2_parent;
+ struct mock_hw_queue *hw_queue[IOMMU_TEST_HW_QUEUE_MAX];
+ struct mutex queue_mutex;
+
+ unsigned long mmap_offset;
+ u32 *page; /* Mmap page to test u32 type of in_data */
+};
+
+static inline struct mock_viommu *to_mock_viommu(struct iommufd_viommu *viommu)
+{
+ return container_of(viommu, struct mock_viommu, core);
+}
+
+struct mock_hw_queue {
+ struct iommufd_hw_queue core;
+ struct mock_viommu *mock_viommu;
+ struct mock_hw_queue *prev;
+ u16 index;
+};
+
+static inline struct mock_hw_queue *
+to_mock_hw_queue(struct iommufd_hw_queue *hw_queue)
+{
+ return container_of(hw_queue, struct mock_hw_queue, core);
+}
+
enum selftest_obj_type {
TYPE_IDEV,
};
struct mock_dev {
struct device dev;
+ struct mock_viommu *viommu;
+ struct rw_semaphore viommu_rwsem;
unsigned long flags;
+ unsigned long vdev_id;
int id;
+ u32 cache[MOCK_DEV_CACHE_NUM];
+ atomic_t pasid_1024_fake_error;
+ unsigned int iopf_refcount;
+ struct iommu_domain *domain;
};
+static inline struct mock_dev *to_mock_dev(struct device *dev)
+{
+ return container_of(dev, struct mock_dev, dev);
+}
+
struct selftest_obj {
struct iommufd_object obj;
enum selftest_obj_type type;
@@ -154,19 +204,94 @@ struct selftest_obj {
};
};
+static inline struct selftest_obj *to_selftest_obj(struct iommufd_object *obj)
+{
+ return container_of(obj, struct selftest_obj, obj);
+}
+
static int mock_domain_nop_attach(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
- struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+ struct mock_dev *mdev = to_mock_dev(dev);
+ struct mock_viommu *new_viommu = NULL;
+ unsigned long vdev_id = 0;
+ int rc;
if (domain->dirty_ops && (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY))
return -EINVAL;
+ iommu_group_mutex_assert(dev);
+ if (domain->type == IOMMU_DOMAIN_NESTED) {
+ new_viommu = to_mock_nested(domain)->mock_viommu;
+ if (new_viommu) {
+ rc = iommufd_viommu_get_vdev_id(&new_viommu->core, dev,
+ &vdev_id);
+ if (rc)
+ return rc;
+ }
+ }
+ if (new_viommu != mdev->viommu) {
+ down_write(&mdev->viommu_rwsem);
+ mdev->viommu = new_viommu;
+ mdev->vdev_id = vdev_id;
+ up_write(&mdev->viommu_rwsem);
+ }
+
+ rc = mock_dev_enable_iopf(dev, domain);
+ if (rc)
+ return rc;
+
+ mock_dev_disable_iopf(dev, mdev->domain);
+ mdev->domain = domain;
+
+ return 0;
+}
+
+static int mock_domain_set_dev_pasid_nop(struct iommu_domain *domain,
+ struct device *dev, ioasid_t pasid,
+ struct iommu_domain *old)
+{
+ struct mock_dev *mdev = to_mock_dev(dev);
+ int rc;
+
+ /*
+ * Per the first attach with pasid 1024, set the
+ * mdev->pasid_1024_fake_error. Hence the second call of this op
+ * can fake an error to validate the error path of the core. This
+ * is helpful to test the case in which the iommu core needs to
+ * rollback to the old domain due to driver failure. e.g. replace.
+ * User should be careful about the third call of this op, it shall
+ * succeed since the mdev->pasid_1024_fake_error is cleared in the
+ * second call.
+ */
+ if (pasid == 1024) {
+ if (domain->type == IOMMU_DOMAIN_BLOCKED) {
+ atomic_set(&mdev->pasid_1024_fake_error, 0);
+ } else if (atomic_read(&mdev->pasid_1024_fake_error)) {
+ /*
+ * Clear the flag, and fake an error to fail the
+ * replacement.
+ */
+ atomic_set(&mdev->pasid_1024_fake_error, 0);
+ return -ENOMEM;
+ } else {
+ /* Set the flag to fake an error in next call */
+ atomic_set(&mdev->pasid_1024_fake_error, 1);
+ }
+ }
+
+ rc = mock_dev_enable_iopf(dev, domain);
+ if (rc)
+ return rc;
+
+ mock_dev_disable_iopf(dev, old);
+
return 0;
}
static const struct iommu_domain_ops mock_blocking_ops = {
.attach_dev = mock_domain_nop_attach,
+ .set_dev_pasid = mock_domain_set_dev_pasid_nop
};
static struct iommu_domain mock_blocking_domain = {
@@ -174,10 +299,15 @@ static struct iommu_domain mock_blocking_domain = {
.ops = &mock_blocking_ops,
};
-static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type)
+static void *mock_domain_hw_info(struct device *dev, u32 *length,
+ enum iommu_hw_info_type *type)
{
struct iommu_test_hw_info *info;
+ if (*type != IOMMU_HW_INFO_TYPE_DEFAULT &&
+ *type != IOMMU_HW_INFO_TYPE_SELFTEST)
+ return ERR_PTR(-EOPNOTSUPP);
+
info = kzalloc(sizeof(*info), GFP_KERNEL);
if (!info)
return ERR_PTR(-ENOMEM);
@@ -192,8 +322,7 @@ static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type)
static int mock_domain_set_dirty_tracking(struct iommu_domain *domain,
bool enable)
{
- struct mock_iommu_domain *mock =
- container_of(domain, struct mock_iommu_domain, domain);
+ struct mock_iommu_domain *mock = to_mock_domain(domain);
unsigned long flags = mock->flags;
if (enable && !domain->dirty_ops)
@@ -209,309 +338,540 @@ static int mock_domain_set_dirty_tracking(struct iommu_domain *domain,
return 0;
}
-static bool mock_test_and_clear_dirty(struct mock_iommu_domain *mock,
- unsigned long iova, size_t page_size,
- unsigned long flags)
+static struct mock_iommu_domain_nested *
+__mock_domain_alloc_nested(const struct iommu_user_data *user_data)
{
- unsigned long cur, end = iova + page_size - 1;
- bool dirty = false;
- void *ent, *old;
-
- for (cur = iova; cur < end; cur += MOCK_IO_PAGE_SIZE) {
- ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE);
- if (!ent || !(xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA))
- continue;
+ struct mock_iommu_domain_nested *mock_nested;
+ struct iommu_hwpt_selftest user_cfg;
+ int rc, i;
- dirty = true;
- /* Clear dirty */
- if (!(flags & IOMMU_DIRTY_NO_CLEAR)) {
- unsigned long val;
+ if (user_data->type != IOMMU_HWPT_DATA_SELFTEST)
+ return ERR_PTR(-EOPNOTSUPP);
- val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA;
- old = xa_store(&mock->pfns, cur / MOCK_IO_PAGE_SIZE,
- xa_mk_value(val), GFP_KERNEL);
- WARN_ON_ONCE(ent != old);
- }
- }
+ rc = iommu_copy_struct_from_user(&user_cfg, user_data,
+ IOMMU_HWPT_DATA_SELFTEST, iotlb);
+ if (rc)
+ return ERR_PTR(rc);
- return dirty;
+ mock_nested = kzalloc(sizeof(*mock_nested), GFP_KERNEL);
+ if (!mock_nested)
+ return ERR_PTR(-ENOMEM);
+ mock_nested->domain.ops = &domain_nested_ops;
+ mock_nested->domain.type = IOMMU_DOMAIN_NESTED;
+ for (i = 0; i < MOCK_NESTED_DOMAIN_IOTLB_NUM; i++)
+ mock_nested->iotlb[i] = user_cfg.iotlb;
+ return mock_nested;
}
-static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain,
- unsigned long iova, size_t size,
- unsigned long flags,
- struct iommu_dirty_bitmap *dirty)
+static struct iommu_domain *
+mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
+ u32 flags, const struct iommu_user_data *user_data)
{
- struct mock_iommu_domain *mock =
- container_of(domain, struct mock_iommu_domain, domain);
- unsigned long end = iova + size;
- void *ent;
+ struct mock_iommu_domain_nested *mock_nested;
+ struct mock_iommu_domain *mock_parent;
- if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap)
- return -EINVAL;
+ if (flags & ~IOMMU_HWPT_ALLOC_PASID)
+ return ERR_PTR(-EOPNOTSUPP);
+ if (!parent || !(parent->type & __IOMMU_DOMAIN_PAGING))
+ return ERR_PTR(-EINVAL);
- do {
- unsigned long pgsize = MOCK_IO_PAGE_SIZE;
- unsigned long head;
+ mock_parent = to_mock_domain(parent);
+ if (!mock_parent)
+ return ERR_PTR(-EINVAL);
- ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
- if (!ent) {
- iova += pgsize;
- continue;
- }
+ mock_nested = __mock_domain_alloc_nested(user_data);
+ if (IS_ERR(mock_nested))
+ return ERR_CAST(mock_nested);
+ return &mock_nested->domain;
+}
- if (xa_to_value(ent) & MOCK_PFN_HUGE_IOVA)
- pgsize = MOCK_HUGE_PAGE_SIZE;
- head = iova & ~(pgsize - 1);
+static void mock_domain_free(struct iommu_domain *domain)
+{
+ struct mock_iommu_domain *mock = to_mock_domain(domain);
- /* Clear dirty */
- if (mock_test_and_clear_dirty(mock, head, pgsize, flags))
- iommu_dirty_bitmap_record(dirty, head, pgsize);
- iova = head + pgsize;
- } while (iova < end);
+ pt_iommu_deinit(&mock->iommu);
+ kfree(mock);
+}
- return 0;
+static void mock_iotlb_sync(struct iommu_domain *domain,
+ struct iommu_iotlb_gather *gather)
+{
+ iommu_put_pages_list(&gather->freelist);
}
-const struct iommu_dirty_ops dirty_ops = {
+static const struct iommu_domain_ops amdv1_mock_ops = {
+ IOMMU_PT_DOMAIN_OPS(amdv1_mock),
+ .free = mock_domain_free,
+ .attach_dev = mock_domain_nop_attach,
+ .set_dev_pasid = mock_domain_set_dev_pasid_nop,
+ .iotlb_sync = &mock_iotlb_sync,
+};
+
+static const struct iommu_domain_ops amdv1_mock_huge_ops = {
+ IOMMU_PT_DOMAIN_OPS(amdv1_mock),
+ .free = mock_domain_free,
+ .attach_dev = mock_domain_nop_attach,
+ .set_dev_pasid = mock_domain_set_dev_pasid_nop,
+ .iotlb_sync = &mock_iotlb_sync,
+};
+#undef pt_iommu_amdv1_mock_map_pages
+
+static const struct iommu_dirty_ops amdv1_mock_dirty_ops = {
+ IOMMU_PT_DIRTY_OPS(amdv1_mock),
.set_dirty_tracking = mock_domain_set_dirty_tracking,
- .read_and_clear_dirty = mock_domain_read_and_clear_dirty,
};
-static struct iommu_domain *mock_domain_alloc_paging(struct device *dev)
+static const struct iommu_domain_ops amdv1_ops = {
+ IOMMU_PT_DOMAIN_OPS(amdv1),
+ .free = mock_domain_free,
+ .attach_dev = mock_domain_nop_attach,
+ .set_dev_pasid = mock_domain_set_dev_pasid_nop,
+ .iotlb_sync = &mock_iotlb_sync,
+};
+
+static const struct iommu_dirty_ops amdv1_dirty_ops = {
+ IOMMU_PT_DIRTY_OPS(amdv1),
+ .set_dirty_tracking = mock_domain_set_dirty_tracking,
+};
+
+static struct mock_iommu_domain *
+mock_domain_alloc_pgtable(struct device *dev,
+ const struct iommu_hwpt_selftest *user_cfg, u32 flags)
{
- struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
struct mock_iommu_domain *mock;
+ int rc;
mock = kzalloc(sizeof(*mock), GFP_KERNEL);
if (!mock)
- return NULL;
- mock->domain.geometry.aperture_start = MOCK_APERTURE_START;
- mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST;
- mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE;
- if (dev && mdev->flags & MOCK_FLAGS_DEVICE_HUGE_IOVA)
- mock->domain.pgsize_bitmap |= MOCK_HUGE_PAGE_SIZE;
- mock->domain.ops = mock_ops.default_domain_ops;
+ return ERR_PTR(-ENOMEM);
mock->domain.type = IOMMU_DOMAIN_UNMANAGED;
- xa_init(&mock->pfns);
- return &mock->domain;
-}
-static struct iommu_domain *
-__mock_domain_alloc_nested(struct mock_iommu_domain *mock_parent,
- const struct iommu_hwpt_selftest *user_cfg)
-{
- struct mock_iommu_domain_nested *mock_nested;
- int i;
+ mock->amdv1.iommu.nid = NUMA_NO_NODE;
+
+ switch (user_cfg->pagetable_type) {
+ case MOCK_IOMMUPT_DEFAULT:
+ case MOCK_IOMMUPT_HUGE: {
+ struct pt_iommu_amdv1_cfg cfg = {};
+
+ /* The mock version has a 2k page size */
+ cfg.common.hw_max_vasz_lg2 = 56;
+ cfg.common.hw_max_oasz_lg2 = 51;
+ cfg.starting_level = 2;
+ if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE)
+ mock->domain.ops = &amdv1_mock_huge_ops;
+ else
+ mock->domain.ops = &amdv1_mock_ops;
+ rc = pt_iommu_amdv1_mock_init(&mock->amdv1, &cfg, GFP_KERNEL);
+ if (rc)
+ goto err_free;
+
+ /*
+ * In huge mode userspace should only provide huge pages, we
+ * have to include PAGE_SIZE for the domain to be accepted by
+ * iommufd.
+ */
+ if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE)
+ mock->domain.pgsize_bitmap = MOCK_HUGE_PAGE_SIZE |
+ PAGE_SIZE;
+ if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
+ mock->domain.dirty_ops = &amdv1_mock_dirty_ops;
+ break;
+ }
- mock_nested = kzalloc(sizeof(*mock_nested), GFP_KERNEL);
- if (!mock_nested)
- return ERR_PTR(-ENOMEM);
- mock_nested->parent = mock_parent;
- mock_nested->domain.ops = &domain_nested_ops;
- mock_nested->domain.type = IOMMU_DOMAIN_NESTED;
- for (i = 0; i < MOCK_NESTED_DOMAIN_IOTLB_NUM; i++)
- mock_nested->iotlb[i] = user_cfg->iotlb;
- return &mock_nested->domain;
+ case MOCK_IOMMUPT_AMDV1: {
+ struct pt_iommu_amdv1_cfg cfg = {};
+
+ cfg.common.hw_max_vasz_lg2 = 64;
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) |
+ BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) |
+ BIT(PT_FEAT_AMDV1_FORCE_COHERENCE);
+ cfg.starting_level = 2;
+ mock->domain.ops = &amdv1_ops;
+ rc = pt_iommu_amdv1_init(&mock->amdv1, &cfg, GFP_KERNEL);
+ if (rc)
+ goto err_free;
+ if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
+ mock->domain.dirty_ops = &amdv1_dirty_ops;
+ break;
+ }
+ default:
+ rc = -EOPNOTSUPP;
+ goto err_free;
+ }
+
+ /*
+ * Override the real aperture to the MOCK aperture for test purposes.
+ */
+ if (user_cfg->pagetable_type == MOCK_IOMMUPT_DEFAULT) {
+ WARN_ON(mock->domain.geometry.aperture_start != 0);
+ WARN_ON(mock->domain.geometry.aperture_end < MOCK_APERTURE_LAST);
+
+ mock->domain.geometry.aperture_start = MOCK_APERTURE_START;
+ mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST;
+ }
+
+ return mock;
+err_free:
+ kfree(mock);
+ return ERR_PTR(rc);
}
static struct iommu_domain *
-mock_domain_alloc_user(struct device *dev, u32 flags,
- struct iommu_domain *parent,
- const struct iommu_user_data *user_data)
+mock_domain_alloc_paging_flags(struct device *dev, u32 flags,
+ const struct iommu_user_data *user_data)
{
- struct mock_iommu_domain *mock_parent;
- struct iommu_hwpt_selftest user_cfg;
+ bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+ const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
+ IOMMU_HWPT_ALLOC_NEST_PARENT |
+ IOMMU_HWPT_ALLOC_PASID;
+ struct mock_dev *mdev = to_mock_dev(dev);
+ bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY;
+ struct iommu_hwpt_selftest user_cfg = {};
+ struct mock_iommu_domain *mock;
int rc;
- /* must be mock_domain */
- if (!parent) {
- struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
- bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
- bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY;
- struct iommu_domain *domain;
-
- if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT |
- IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
- return ERR_PTR(-EOPNOTSUPP);
- if (user_data || (has_dirty_flag && no_dirty_ops))
- return ERR_PTR(-EOPNOTSUPP);
- domain = mock_domain_alloc_paging(dev);
- if (!domain)
- return ERR_PTR(-ENOMEM);
- if (has_dirty_flag)
- container_of(domain, struct mock_iommu_domain, domain)
- ->domain.dirty_ops = &dirty_ops;
- return domain;
- }
-
- /* must be mock_domain_nested */
- if (user_data->type != IOMMU_HWPT_DATA_SELFTEST || flags)
+ if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops))
return ERR_PTR(-EOPNOTSUPP);
- if (!parent || parent->ops != mock_ops.default_domain_ops)
- return ERR_PTR(-EINVAL);
- mock_parent = container_of(parent, struct mock_iommu_domain, domain);
- if (!mock_parent)
- return ERR_PTR(-EINVAL);
+ if (user_data && (user_data->type != IOMMU_HWPT_DATA_SELFTEST &&
+ user_data->type != IOMMU_HWPT_DATA_NONE))
+ return ERR_PTR(-EOPNOTSUPP);
- rc = iommu_copy_struct_from_user(&user_cfg, user_data,
- IOMMU_HWPT_DATA_SELFTEST, iotlb);
- if (rc)
- return ERR_PTR(rc);
+ if (user_data) {
+ rc = iommu_copy_struct_from_user(
+ &user_cfg, user_data, IOMMU_HWPT_DATA_SELFTEST, iotlb);
+ if (rc)
+ return ERR_PTR(rc);
+ }
- return __mock_domain_alloc_nested(mock_parent, &user_cfg);
+ mock = mock_domain_alloc_pgtable(dev, &user_cfg, flags);
+ if (IS_ERR(mock))
+ return ERR_CAST(mock);
+ return &mock->domain;
}
-static void mock_domain_free(struct iommu_domain *domain)
+static bool mock_domain_capable(struct device *dev, enum iommu_cap cap)
{
- struct mock_iommu_domain *mock =
- container_of(domain, struct mock_iommu_domain, domain);
+ struct mock_dev *mdev = to_mock_dev(dev);
- WARN_ON(!xa_empty(&mock->pfns));
- kfree(mock);
+ switch (cap) {
+ case IOMMU_CAP_CACHE_COHERENCY:
+ return true;
+ case IOMMU_CAP_DIRTY_TRACKING:
+ return !(mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY);
+ default:
+ break;
+ }
+
+ return false;
}
-static int mock_domain_map_pages(struct iommu_domain *domain,
- unsigned long iova, phys_addr_t paddr,
- size_t pgsize, size_t pgcount, int prot,
- gfp_t gfp, size_t *mapped)
+static struct iopf_queue *mock_iommu_iopf_queue;
+
+static struct mock_iommu_device {
+ struct iommu_device iommu_dev;
+ struct completion complete;
+ refcount_t users;
+} mock_iommu;
+
+static struct iommu_device *mock_probe_device(struct device *dev)
{
- struct mock_iommu_domain *mock =
- container_of(domain, struct mock_iommu_domain, domain);
- unsigned long flags = MOCK_PFN_START_IOVA;
- unsigned long start_iova = iova;
+ if (dev->bus != &iommufd_mock_bus_type.bus)
+ return ERR_PTR(-ENODEV);
+ return &mock_iommu.iommu_dev;
+}
- /*
- * xarray does not reliably work with fault injection because it does a
- * retry allocation, so put our own failure point.
- */
- if (iommufd_should_fail())
- return -ENOENT;
+static void mock_domain_page_response(struct device *dev, struct iopf_fault *evt,
+ struct iommu_page_response *msg)
+{
+}
- WARN_ON(iova % MOCK_IO_PAGE_SIZE);
- WARN_ON(pgsize % MOCK_IO_PAGE_SIZE);
- for (; pgcount; pgcount--) {
- size_t cur;
+static int mock_dev_enable_iopf(struct device *dev, struct iommu_domain *domain)
+{
+ struct mock_dev *mdev = to_mock_dev(dev);
+ int ret;
- for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) {
- void *old;
+ if (!domain || !domain->iopf_handler)
+ return 0;
- if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize)
- flags = MOCK_PFN_LAST_IOVA;
- if (pgsize != MOCK_IO_PAGE_SIZE) {
- flags |= MOCK_PFN_HUGE_IOVA;
- }
- old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE,
- xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) |
- flags),
- gfp);
- if (xa_is_err(old)) {
- for (; start_iova != iova;
- start_iova += MOCK_IO_PAGE_SIZE)
- xa_erase(&mock->pfns,
- start_iova /
- MOCK_IO_PAGE_SIZE);
- return xa_err(old);
- }
- WARN_ON(old);
- iova += MOCK_IO_PAGE_SIZE;
- paddr += MOCK_IO_PAGE_SIZE;
- *mapped += MOCK_IO_PAGE_SIZE;
- flags = 0;
- }
+ if (!mock_iommu_iopf_queue)
+ return -ENODEV;
+
+ if (mdev->iopf_refcount) {
+ mdev->iopf_refcount++;
+ return 0;
}
+
+ ret = iopf_queue_add_device(mock_iommu_iopf_queue, dev);
+ if (ret)
+ return ret;
+
+ mdev->iopf_refcount = 1;
+
return 0;
}
-static size_t mock_domain_unmap_pages(struct iommu_domain *domain,
- unsigned long iova, size_t pgsize,
- size_t pgcount,
- struct iommu_iotlb_gather *iotlb_gather)
+static void mock_dev_disable_iopf(struct device *dev, struct iommu_domain *domain)
{
- struct mock_iommu_domain *mock =
- container_of(domain, struct mock_iommu_domain, domain);
- bool first = true;
- size_t ret = 0;
- void *ent;
+ struct mock_dev *mdev = to_mock_dev(dev);
- WARN_ON(iova % MOCK_IO_PAGE_SIZE);
- WARN_ON(pgsize % MOCK_IO_PAGE_SIZE);
+ if (!domain || !domain->iopf_handler)
+ return;
- for (; pgcount; pgcount--) {
- size_t cur;
+ if (--mdev->iopf_refcount)
+ return;
- for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) {
- ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
+ iopf_queue_remove_device(mock_iommu_iopf_queue, dev);
+}
- /*
- * iommufd generates unmaps that must be a strict
- * superset of the map's performend So every
- * starting/ending IOVA should have been an iova passed
- * to map.
- *
- * This simple logic doesn't work when the HUGE_PAGE is
- * turned on since the core code will automatically
- * switch between the two page sizes creating a break in
- * the unmap calls. The break can land in the middle of
- * contiguous IOVA.
- */
- if (!(domain->pgsize_bitmap & MOCK_HUGE_PAGE_SIZE)) {
- if (first) {
- WARN_ON(ent && !(xa_to_value(ent) &
- MOCK_PFN_START_IOVA));
- first = false;
- }
- if (pgcount == 1 &&
- cur + MOCK_IO_PAGE_SIZE == pgsize)
- WARN_ON(ent && !(xa_to_value(ent) &
- MOCK_PFN_LAST_IOVA));
- }
+static void mock_viommu_destroy(struct iommufd_viommu *viommu)
+{
+ struct mock_iommu_device *mock_iommu = container_of(
+ viommu->iommu_dev, struct mock_iommu_device, iommu_dev);
+ struct mock_viommu *mock_viommu = to_mock_viommu(viommu);
+
+ if (refcount_dec_and_test(&mock_iommu->users))
+ complete(&mock_iommu->complete);
+ if (mock_viommu->mmap_offset)
+ iommufd_viommu_destroy_mmap(&mock_viommu->core,
+ mock_viommu->mmap_offset);
+ free_page((unsigned long)mock_viommu->page);
+ mutex_destroy(&mock_viommu->queue_mutex);
+
+ /* iommufd core frees mock_viommu and viommu */
+}
+
+static struct iommu_domain *
+mock_viommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
+ const struct iommu_user_data *user_data)
+{
+ struct mock_viommu *mock_viommu = to_mock_viommu(viommu);
+ struct mock_iommu_domain_nested *mock_nested;
+
+ if (flags & ~IOMMU_HWPT_ALLOC_PASID)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ mock_nested = __mock_domain_alloc_nested(user_data);
+ if (IS_ERR(mock_nested))
+ return ERR_CAST(mock_nested);
+ mock_nested->mock_viommu = mock_viommu;
+ return &mock_nested->domain;
+}
+
+static int mock_viommu_cache_invalidate(struct iommufd_viommu *viommu,
+ struct iommu_user_data_array *array)
+{
+ struct iommu_viommu_invalidate_selftest *cmds;
+ struct iommu_viommu_invalidate_selftest *cur;
+ struct iommu_viommu_invalidate_selftest *end;
+ int rc;
+
+ /* A zero-length array is allowed to validate the array type */
+ if (array->entry_num == 0 &&
+ array->type == IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST) {
+ array->entry_num = 0;
+ return 0;
+ }
+
+ cmds = kcalloc(array->entry_num, sizeof(*cmds), GFP_KERNEL);
+ if (!cmds)
+ return -ENOMEM;
+ cur = cmds;
+ end = cmds + array->entry_num;
+
+ static_assert(sizeof(*cmds) == 3 * sizeof(u32));
+ rc = iommu_copy_struct_from_full_user_array(
+ cmds, sizeof(*cmds), array,
+ IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST);
+ if (rc)
+ goto out;
+
+ while (cur != end) {
+ struct mock_dev *mdev;
+ struct device *dev;
+ int i;
- iova += MOCK_IO_PAGE_SIZE;
- ret += MOCK_IO_PAGE_SIZE;
+ if (cur->flags & ~IOMMU_TEST_INVALIDATE_FLAG_ALL) {
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (cur->cache_id > MOCK_DEV_CACHE_ID_MAX) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ xa_lock(&viommu->vdevs);
+ dev = iommufd_viommu_find_dev(viommu,
+ (unsigned long)cur->vdev_id);
+ if (!dev) {
+ xa_unlock(&viommu->vdevs);
+ rc = -EINVAL;
+ goto out;
+ }
+ mdev = container_of(dev, struct mock_dev, dev);
+
+ if (cur->flags & IOMMU_TEST_INVALIDATE_FLAG_ALL) {
+ /* Invalidate all cache entries and ignore cache_id */
+ for (i = 0; i < MOCK_DEV_CACHE_NUM; i++)
+ mdev->cache[i] = 0;
+ } else {
+ mdev->cache[cur->cache_id] = 0;
}
+ xa_unlock(&viommu->vdevs);
+
+ cur++;
}
- return ret;
+out:
+ array->entry_num = cur - cmds;
+ kfree(cmds);
+ return rc;
}
-static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain,
- dma_addr_t iova)
+static size_t mock_viommu_get_hw_queue_size(struct iommufd_viommu *viommu,
+ enum iommu_hw_queue_type queue_type)
{
- struct mock_iommu_domain *mock =
- container_of(domain, struct mock_iommu_domain, domain);
- void *ent;
+ if (queue_type != IOMMU_HW_QUEUE_TYPE_SELFTEST)
+ return 0;
+ return HW_QUEUE_STRUCT_SIZE(struct mock_hw_queue, core);
+}
- WARN_ON(iova % MOCK_IO_PAGE_SIZE);
- ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
- WARN_ON(!ent);
- return (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE;
+static void mock_hw_queue_destroy(struct iommufd_hw_queue *hw_queue)
+{
+ struct mock_hw_queue *mock_hw_queue = to_mock_hw_queue(hw_queue);
+ struct mock_viommu *mock_viommu = mock_hw_queue->mock_viommu;
+
+ mutex_lock(&mock_viommu->queue_mutex);
+ mock_viommu->hw_queue[mock_hw_queue->index] = NULL;
+ if (mock_hw_queue->prev)
+ iommufd_hw_queue_undepend(mock_hw_queue, mock_hw_queue->prev,
+ core);
+ mutex_unlock(&mock_viommu->queue_mutex);
}
-static bool mock_domain_capable(struct device *dev, enum iommu_cap cap)
+/* Test iommufd_hw_queue_depend/undepend() */
+static int mock_hw_queue_init_phys(struct iommufd_hw_queue *hw_queue, u32 index,
+ phys_addr_t base_addr_pa)
{
- struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+ struct mock_viommu *mock_viommu = to_mock_viommu(hw_queue->viommu);
+ struct mock_hw_queue *mock_hw_queue = to_mock_hw_queue(hw_queue);
+ struct mock_hw_queue *prev = NULL;
+ int rc = 0;
- switch (cap) {
- case IOMMU_CAP_CACHE_COHERENCY:
- return true;
- case IOMMU_CAP_DIRTY_TRACKING:
- return !(mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY);
- default:
- break;
+ if (index >= IOMMU_TEST_HW_QUEUE_MAX)
+ return -EINVAL;
+
+ mutex_lock(&mock_viommu->queue_mutex);
+
+ if (mock_viommu->hw_queue[index]) {
+ rc = -EEXIST;
+ goto unlock;
}
- return false;
+ if (index) {
+ prev = mock_viommu->hw_queue[index - 1];
+ if (!prev) {
+ rc = -EIO;
+ goto unlock;
+ }
+ }
+
+ /*
+ * Test to catch a kernel bug if the core converted the physical address
+ * incorrectly. Let mock_domain_iova_to_phys() WARN_ON if it fails.
+ */
+ if (base_addr_pa != iommu_iova_to_phys(&mock_viommu->s2_parent->domain,
+ hw_queue->base_addr)) {
+ rc = -EFAULT;
+ goto unlock;
+ }
+
+ if (prev) {
+ rc = iommufd_hw_queue_depend(mock_hw_queue, prev, core);
+ if (rc)
+ goto unlock;
+ }
+
+ mock_hw_queue->prev = prev;
+ mock_hw_queue->mock_viommu = mock_viommu;
+ mock_viommu->hw_queue[index] = mock_hw_queue;
+
+ hw_queue->destroy = &mock_hw_queue_destroy;
+unlock:
+ mutex_unlock(&mock_viommu->queue_mutex);
+ return rc;
}
-static struct iommu_device mock_iommu_device = {
+static struct iommufd_viommu_ops mock_viommu_ops = {
+ .destroy = mock_viommu_destroy,
+ .alloc_domain_nested = mock_viommu_alloc_domain_nested,
+ .cache_invalidate = mock_viommu_cache_invalidate,
+ .get_hw_queue_size = mock_viommu_get_hw_queue_size,
+ .hw_queue_init_phys = mock_hw_queue_init_phys,
};
-static struct iommu_device *mock_probe_device(struct device *dev)
+static size_t mock_get_viommu_size(struct device *dev,
+ enum iommu_viommu_type viommu_type)
{
- if (dev->bus != &iommufd_mock_bus_type.bus)
- return ERR_PTR(-ENODEV);
- return &mock_iommu_device;
+ if (viommu_type != IOMMU_VIOMMU_TYPE_SELFTEST)
+ return 0;
+ return VIOMMU_STRUCT_SIZE(struct mock_viommu, core);
+}
+
+static int mock_viommu_init(struct iommufd_viommu *viommu,
+ struct iommu_domain *parent_domain,
+ const struct iommu_user_data *user_data)
+{
+ struct mock_iommu_device *mock_iommu = container_of(
+ viommu->iommu_dev, struct mock_iommu_device, iommu_dev);
+ struct mock_viommu *mock_viommu = to_mock_viommu(viommu);
+ struct iommu_viommu_selftest data;
+ int rc;
+
+ if (user_data) {
+ rc = iommu_copy_struct_from_user(
+ &data, user_data, IOMMU_VIOMMU_TYPE_SELFTEST, out_data);
+ if (rc)
+ return rc;
+
+ /* Allocate two pages */
+ mock_viommu->page =
+ (u32 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
+ if (!mock_viommu->page)
+ return -ENOMEM;
+
+ rc = iommufd_viommu_alloc_mmap(&mock_viommu->core,
+ __pa(mock_viommu->page),
+ PAGE_SIZE * 2,
+ &mock_viommu->mmap_offset);
+ if (rc)
+ goto err_free_page;
+
+ /* For loopback tests on both the page and out_data */
+ *mock_viommu->page = data.in_data;
+ data.out_data = data.in_data;
+ data.out_mmap_length = PAGE_SIZE * 2;
+ data.out_mmap_offset = mock_viommu->mmap_offset;
+ rc = iommu_copy_struct_to_user(
+ user_data, &data, IOMMU_VIOMMU_TYPE_SELFTEST, out_data);
+ if (rc)
+ goto err_destroy_mmap;
+ }
+
+ refcount_inc(&mock_iommu->users);
+ mutex_init(&mock_viommu->queue_mutex);
+ mock_viommu->s2_parent = to_mock_domain(parent_domain);
+
+ viommu->ops = &mock_viommu_ops;
+ return 0;
+
+err_destroy_mmap:
+ iommufd_viommu_destroy_mmap(&mock_viommu->core,
+ mock_viommu->mmap_offset);
+err_free_page:
+ free_page((unsigned long)mock_viommu->page);
+ return rc;
}
static const struct iommu_ops mock_ops = {
@@ -522,37 +882,28 @@ static const struct iommu_ops mock_ops = {
.default_domain = &mock_blocking_domain,
.blocked_domain = &mock_blocking_domain,
.owner = THIS_MODULE,
- .pgsize_bitmap = MOCK_IO_PAGE_SIZE,
.hw_info = mock_domain_hw_info,
- .domain_alloc_paging = mock_domain_alloc_paging,
- .domain_alloc_user = mock_domain_alloc_user,
+ .domain_alloc_paging_flags = mock_domain_alloc_paging_flags,
+ .domain_alloc_nested = mock_domain_alloc_nested,
.capable = mock_domain_capable,
.device_group = generic_device_group,
.probe_device = mock_probe_device,
- .default_domain_ops =
- &(struct iommu_domain_ops){
- .free = mock_domain_free,
- .attach_dev = mock_domain_nop_attach,
- .map_pages = mock_domain_map_pages,
- .unmap_pages = mock_domain_unmap_pages,
- .iova_to_phys = mock_domain_iova_to_phys,
- },
+ .page_response = mock_domain_page_response,
+ .user_pasid_table = true,
+ .get_viommu_size = mock_get_viommu_size,
+ .viommu_init = mock_viommu_init,
};
static void mock_domain_free_nested(struct iommu_domain *domain)
{
- struct mock_iommu_domain_nested *mock_nested =
- container_of(domain, struct mock_iommu_domain_nested, domain);
-
- kfree(mock_nested);
+ kfree(to_mock_nested(domain));
}
static int
mock_domain_cache_invalidate_user(struct iommu_domain *domain,
struct iommu_user_data_array *array)
{
- struct mock_iommu_domain_nested *mock_nested =
- container_of(domain, struct mock_iommu_domain_nested, domain);
+ struct mock_iommu_domain_nested *mock_nested = to_mock_nested(domain);
struct iommu_hwpt_invalidate_selftest inv;
u32 processed = 0;
int i = 0, j;
@@ -600,6 +951,7 @@ static struct iommu_domain_ops domain_nested_ops = {
.free = mock_domain_free_nested,
.attach_dev = mock_domain_nop_attach,
.cache_invalidate_user = mock_domain_cache_invalidate_user,
+ .set_dev_pasid = mock_domain_set_dev_pasid_nop,
};
static inline struct iommufd_hw_pagetable *
@@ -623,11 +975,11 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
if (IS_ERR(hwpt))
return hwpt;
if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED ||
- hwpt->domain->ops != mock_ops.default_domain_ops) {
+ hwpt->domain->owner != &mock_ops) {
iommufd_put_object(ucmd->ictx, &hwpt->obj);
return ERR_PTR(-EINVAL);
}
- *mock = container_of(hwpt->domain, struct mock_iommu_domain, domain);
+ *mock = to_mock_domain(hwpt->domain);
return hwpt;
}
@@ -645,14 +997,13 @@ get_md_pagetable_nested(struct iommufd_ucmd *ucmd, u32 mockpt_id,
iommufd_put_object(ucmd->ictx, &hwpt->obj);
return ERR_PTR(-EINVAL);
}
- *mock_nested = container_of(hwpt->domain,
- struct mock_iommu_domain_nested, domain);
+ *mock_nested = to_mock_nested(hwpt->domain);
return hwpt;
}
static void mock_dev_release(struct device *dev)
{
- struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+ struct mock_dev *mdev = to_mock_dev(dev);
ida_free(&mock_dev_ida, mdev->id);
kfree(mdev);
@@ -660,21 +1011,29 @@ static void mock_dev_release(struct device *dev)
static struct mock_dev *mock_dev_create(unsigned long dev_flags)
{
+ struct property_entry prop[] = {
+ PROPERTY_ENTRY_U32("pasid-num-bits", 0),
+ {},
+ };
+ const u32 valid_flags = MOCK_FLAGS_DEVICE_NO_DIRTY |
+ MOCK_FLAGS_DEVICE_PASID;
struct mock_dev *mdev;
- int rc;
+ int rc, i;
- if (dev_flags &
- ~(MOCK_FLAGS_DEVICE_NO_DIRTY | MOCK_FLAGS_DEVICE_HUGE_IOVA))
+ if (dev_flags & ~valid_flags)
return ERR_PTR(-EINVAL);
mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
if (!mdev)
return ERR_PTR(-ENOMEM);
+ init_rwsem(&mdev->viommu_rwsem);
device_initialize(&mdev->dev);
mdev->flags = dev_flags;
mdev->dev.release = mock_dev_release;
mdev->dev.bus = &iommufd_mock_bus_type.bus;
+ for (i = 0; i < MOCK_DEV_CACHE_NUM; i++)
+ mdev->cache[i] = IOMMU_TEST_DEV_CACHE_DEFAULT;
rc = ida_alloc(&mock_dev_ida, GFP_KERNEL);
if (rc < 0)
@@ -685,7 +1044,16 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags)
if (rc)
goto err_put;
- rc = device_add(&mdev->dev);
+ if (dev_flags & MOCK_FLAGS_DEVICE_PASID)
+ prop[0] = PROPERTY_ENTRY_U32("pasid-num-bits", MOCK_PASID_WIDTH);
+
+ rc = device_create_managed_software_node(&mdev->dev, prop, NULL);
+ if (rc) {
+ dev_err(&mdev->dev, "add pasid-num-bits property failed, rc: %d", rc);
+ goto err_put;
+ }
+
+ rc = iommu_mock_device_add(&mdev->dev, &mock_iommu.iommu_dev);
if (rc)
goto err_put;
return mdev;
@@ -740,7 +1108,7 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd,
}
sobj->idev.idev = idev;
- rc = iommufd_device_attach(idev, &pt_id);
+ rc = iommufd_device_attach(idev, IOMMU_NO_PASID, &pt_id);
if (rc)
goto out_unbind;
@@ -755,7 +1123,7 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd,
return 0;
out_detach:
- iommufd_device_detach(idev);
+ iommufd_device_detach(idev, IOMMU_NO_PASID);
out_unbind:
iommufd_device_unbind(idev);
out_mdev:
@@ -765,39 +1133,49 @@ out_sobj:
return rc;
}
-/* Replace the mock domain with a manually allocated hw_pagetable */
-static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd,
- unsigned int device_id, u32 pt_id,
- struct iommu_test_cmd *cmd)
+static struct selftest_obj *
+iommufd_test_get_selftest_obj(struct iommufd_ctx *ictx, u32 id)
{
struct iommufd_object *dev_obj;
struct selftest_obj *sobj;
- int rc;
/*
* Prefer to use the OBJ_SELFTEST because the destroy_rwsem will ensure
* it doesn't race with detach, which is not allowed.
*/
- dev_obj =
- iommufd_get_object(ucmd->ictx, device_id, IOMMUFD_OBJ_SELFTEST);
+ dev_obj = iommufd_get_object(ictx, id, IOMMUFD_OBJ_SELFTEST);
if (IS_ERR(dev_obj))
- return PTR_ERR(dev_obj);
+ return ERR_CAST(dev_obj);
- sobj = container_of(dev_obj, struct selftest_obj, obj);
+ sobj = to_selftest_obj(dev_obj);
if (sobj->type != TYPE_IDEV) {
- rc = -EINVAL;
- goto out_dev_obj;
+ iommufd_put_object(ictx, dev_obj);
+ return ERR_PTR(-EINVAL);
}
+ return sobj;
+}
- rc = iommufd_device_replace(sobj->idev.idev, &pt_id);
+/* Replace the mock domain with a manually allocated hw_pagetable */
+static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd,
+ unsigned int device_id, u32 pt_id,
+ struct iommu_test_cmd *cmd)
+{
+ struct selftest_obj *sobj;
+ int rc;
+
+ sobj = iommufd_test_get_selftest_obj(ucmd->ictx, device_id);
+ if (IS_ERR(sobj))
+ return PTR_ERR(sobj);
+
+ rc = iommufd_device_replace(sobj->idev.idev, IOMMU_NO_PASID, &pt_id);
if (rc)
- goto out_dev_obj;
+ goto out_sobj;
cmd->mock_domain_replace.pt_id = pt_id;
rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
-out_dev_obj:
- iommufd_put_object(ucmd->ictx, dev_obj);
+out_sobj:
+ iommufd_put_object(ucmd->ictx, &sobj->obj);
return rc;
}
@@ -826,23 +1204,25 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd,
{
struct iommufd_hw_pagetable *hwpt;
struct mock_iommu_domain *mock;
+ unsigned int page_size;
uintptr_t end;
int rc;
- if (iova % MOCK_IO_PAGE_SIZE || length % MOCK_IO_PAGE_SIZE ||
- (uintptr_t)uptr % MOCK_IO_PAGE_SIZE ||
- check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end))
- return -EINVAL;
-
hwpt = get_md_pagetable(ucmd, mockpt_id, &mock);
if (IS_ERR(hwpt))
return PTR_ERR(hwpt);
- for (; length; length -= MOCK_IO_PAGE_SIZE) {
+ page_size = 1 << __ffs(mock->domain.pgsize_bitmap);
+ if (iova % page_size || length % page_size ||
+ (uintptr_t)uptr % page_size ||
+ check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end))
+ return -EINVAL;
+
+ for (; length; length -= page_size) {
struct page *pages[1];
+ phys_addr_t io_phys;
unsigned long pfn;
long npages;
- void *ent;
npages = get_user_pages_fast((uintptr_t)uptr & PAGE_MASK, 1, 0,
pages);
@@ -857,15 +1237,14 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd,
pfn = page_to_pfn(pages[0]);
put_page(pages[0]);
- ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
- if (!ent ||
- (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE !=
- pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) {
+ io_phys = mock->domain.ops->iova_to_phys(&mock->domain, iova);
+ if (io_phys !=
+ pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) {
rc = -EINVAL;
goto out_put;
}
- iova += MOCK_IO_PAGE_SIZE;
- uptr += MOCK_IO_PAGE_SIZE;
+ iova += page_size;
+ uptr += page_size;
}
rc = 0;
@@ -909,9 +1288,8 @@ static int iommufd_test_md_check_refs(struct iommufd_ucmd *ucmd,
return 0;
}
-static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd,
- u32 mockpt_id, unsigned int iotlb_id,
- u32 iotlb)
+static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd, u32 mockpt_id,
+ unsigned int iotlb_id, u32 iotlb)
{
struct mock_iommu_domain_nested *mock_nested;
struct iommufd_hw_pagetable *hwpt;
@@ -921,8 +1299,7 @@ static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd,
if (IS_ERR(hwpt))
return PTR_ERR(hwpt);
- mock_nested = container_of(hwpt->domain,
- struct mock_iommu_domain_nested, domain);
+ mock_nested = to_mock_nested(hwpt->domain);
if (iotlb_id > MOCK_NESTED_DOMAIN_IOTLB_ID_MAX ||
mock_nested->iotlb[iotlb_id] != iotlb)
@@ -931,6 +1308,24 @@ static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd,
return rc;
}
+static int iommufd_test_dev_check_cache(struct iommufd_ucmd *ucmd, u32 idev_id,
+ unsigned int cache_id, u32 cache)
+{
+ struct iommufd_device *idev;
+ struct mock_dev *mdev;
+ int rc = 0;
+
+ idev = iommufd_get_device(ucmd, idev_id);
+ if (IS_ERR(idev))
+ return PTR_ERR(idev);
+ mdev = container_of(idev->dev, struct mock_dev, dev);
+
+ if (cache_id > MOCK_DEV_CACHE_ID_MAX || mdev->cache[cache_id] != cache)
+ rc = -EINVAL;
+ iommufd_put_object(ucmd->ictx, &idev->obj);
+ return rc;
+}
+
struct selftest_access {
struct iommufd_access *access;
struct file *file;
@@ -1167,7 +1562,7 @@ static int iommufd_test_access_pages(struct iommufd_ucmd *ucmd,
int rc;
/* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */
- if (length > 16*1024*1024)
+ if (length > 16 * 1024 * 1024)
return -ENOMEM;
if (flags & ~(MOCK_FLAGS_ACCESS_WRITE | MOCK_FLAGS_ACCESS_SYZ))
@@ -1184,7 +1579,7 @@ static int iommufd_test_access_pages(struct iommufd_ucmd *ucmd,
if (flags & MOCK_FLAGS_ACCESS_SYZ)
iova = iommufd_test_syz_conv_iova(staccess->access,
- &cmd->access_pages.iova);
+ &cmd->access_pages.iova);
npages = (ALIGN(iova + length, PAGE_SIZE) -
ALIGN_DOWN(iova, PAGE_SIZE)) /
@@ -1260,7 +1655,7 @@ static int iommufd_test_access_rw(struct iommufd_ucmd *ucmd,
int rc;
/* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */
- if (length > 16*1024*1024)
+ if (length > 16 * 1024 * 1024)
return -ENOMEM;
if (flags & ~(MOCK_ACCESS_RW_WRITE | MOCK_ACCESS_RW_SLOW_PATH |
@@ -1286,7 +1681,7 @@ static int iommufd_test_access_rw(struct iommufd_ucmd *ucmd,
if (flags & MOCK_FLAGS_ACCESS_SYZ)
iova = iommufd_test_syz_conv_iova(staccess->access,
- &cmd->access_rw.iova);
+ &cmd->access_rw.iova);
rc = iommufd_access_rw(staccess->access, iova, tmp, length, flags);
if (rc)
@@ -1313,7 +1708,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id,
unsigned long page_size, void __user *uptr,
u32 flags)
{
- unsigned long bitmap_size, i, max;
+ unsigned long i, max;
struct iommu_test_cmd *cmd = ucmd->cmd;
struct iommufd_hw_pagetable *hwpt;
struct mock_iommu_domain *mock;
@@ -1328,42 +1723,29 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id,
if (IS_ERR(hwpt))
return PTR_ERR(hwpt);
- if (!(mock->flags & MOCK_DIRTY_TRACK)) {
+ if (!(mock->flags & MOCK_DIRTY_TRACK) || !mock->iommu.ops->set_dirty) {
rc = -EINVAL;
goto out_put;
}
max = length / page_size;
- bitmap_size = max / BITS_PER_BYTE;
-
- tmp = kvzalloc(bitmap_size, GFP_KERNEL_ACCOUNT);
+ tmp = kvzalloc(DIV_ROUND_UP(max, BITS_PER_LONG) * sizeof(unsigned long),
+ GFP_KERNEL_ACCOUNT);
if (!tmp) {
rc = -ENOMEM;
goto out_put;
}
- if (copy_from_user(tmp, uptr, bitmap_size)) {
+ if (copy_from_user(tmp, uptr, DIV_ROUND_UP(max, BITS_PER_BYTE))) {
rc = -EFAULT;
goto out_free;
}
for (i = 0; i < max; i++) {
- unsigned long cur = iova + i * page_size;
- void *ent, *old;
-
if (!test_bit(i, (unsigned long *)tmp))
continue;
-
- ent = xa_load(&mock->pfns, cur / page_size);
- if (ent) {
- unsigned long val;
-
- val = xa_to_value(ent) | MOCK_PFN_DIRTY_IOVA;
- old = xa_store(&mock->pfns, cur / page_size,
- xa_mk_value(val), GFP_KERNEL);
- WARN_ON_ONCE(ent != old);
- count++;
- }
+ mock->iommu.ops->set_dirty(&mock->iommu, iova + i * page_size);
+ count++;
}
cmd->dirty.out_nr_dirty = count;
@@ -1375,19 +1757,330 @@ out_put:
return rc;
}
+static int iommufd_test_trigger_iopf(struct iommufd_ucmd *ucmd,
+ struct iommu_test_cmd *cmd)
+{
+ struct iopf_fault event = {};
+ struct iommufd_device *idev;
+
+ idev = iommufd_get_device(ucmd, cmd->trigger_iopf.dev_id);
+ if (IS_ERR(idev))
+ return PTR_ERR(idev);
+
+ event.fault.prm.flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
+ if (cmd->trigger_iopf.pasid != IOMMU_NO_PASID)
+ event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
+ event.fault.type = IOMMU_FAULT_PAGE_REQ;
+ event.fault.prm.addr = cmd->trigger_iopf.addr;
+ event.fault.prm.pasid = cmd->trigger_iopf.pasid;
+ event.fault.prm.grpid = cmd->trigger_iopf.grpid;
+ event.fault.prm.perm = cmd->trigger_iopf.perm;
+
+ iommu_report_device_fault(idev->dev, &event);
+ iommufd_put_object(ucmd->ictx, &idev->obj);
+
+ return 0;
+}
+
+static int iommufd_test_trigger_vevent(struct iommufd_ucmd *ucmd,
+ struct iommu_test_cmd *cmd)
+{
+ struct iommu_viommu_event_selftest test = {};
+ struct iommufd_device *idev;
+ struct mock_dev *mdev;
+ int rc = -ENOENT;
+
+ idev = iommufd_get_device(ucmd, cmd->trigger_vevent.dev_id);
+ if (IS_ERR(idev))
+ return PTR_ERR(idev);
+ mdev = to_mock_dev(idev->dev);
+
+ down_read(&mdev->viommu_rwsem);
+ if (!mdev->viommu || !mdev->vdev_id)
+ goto out_unlock;
+
+ test.virt_id = mdev->vdev_id;
+ rc = iommufd_viommu_report_event(&mdev->viommu->core,
+ IOMMU_VEVENTQ_TYPE_SELFTEST, &test,
+ sizeof(test));
+out_unlock:
+ up_read(&mdev->viommu_rwsem);
+ iommufd_put_object(ucmd->ictx, &idev->obj);
+
+ return rc;
+}
+
+static inline struct iommufd_hw_pagetable *
+iommufd_get_hwpt(struct iommufd_ucmd *ucmd, u32 id)
+{
+ struct iommufd_object *pt_obj;
+
+ pt_obj = iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_ANY);
+ if (IS_ERR(pt_obj))
+ return ERR_CAST(pt_obj);
+
+ if (pt_obj->type != IOMMUFD_OBJ_HWPT_NESTED &&
+ pt_obj->type != IOMMUFD_OBJ_HWPT_PAGING) {
+ iommufd_put_object(ucmd->ictx, pt_obj);
+ return ERR_PTR(-EINVAL);
+ }
+
+ return container_of(pt_obj, struct iommufd_hw_pagetable, obj);
+}
+
+static int iommufd_test_pasid_check_hwpt(struct iommufd_ucmd *ucmd,
+ struct iommu_test_cmd *cmd)
+{
+ u32 hwpt_id = cmd->pasid_check.hwpt_id;
+ struct iommu_domain *attached_domain;
+ struct iommu_attach_handle *handle;
+ struct iommufd_hw_pagetable *hwpt;
+ struct selftest_obj *sobj;
+ struct mock_dev *mdev;
+ int rc = 0;
+
+ sobj = iommufd_test_get_selftest_obj(ucmd->ictx, cmd->id);
+ if (IS_ERR(sobj))
+ return PTR_ERR(sobj);
+
+ mdev = sobj->idev.mock_dev;
+
+ handle = iommu_attach_handle_get(mdev->dev.iommu_group,
+ cmd->pasid_check.pasid, 0);
+ if (IS_ERR(handle))
+ attached_domain = NULL;
+ else
+ attached_domain = handle->domain;
+
+ /* hwpt_id == 0 means to check if pasid is detached */
+ if (!hwpt_id) {
+ if (attached_domain)
+ rc = -EINVAL;
+ goto out_sobj;
+ }
+
+ hwpt = iommufd_get_hwpt(ucmd, hwpt_id);
+ if (IS_ERR(hwpt)) {
+ rc = PTR_ERR(hwpt);
+ goto out_sobj;
+ }
+
+ if (attached_domain != hwpt->domain)
+ rc = -EINVAL;
+
+ iommufd_put_object(ucmd->ictx, &hwpt->obj);
+out_sobj:
+ iommufd_put_object(ucmd->ictx, &sobj->obj);
+ return rc;
+}
+
+static int iommufd_test_pasid_attach(struct iommufd_ucmd *ucmd,
+ struct iommu_test_cmd *cmd)
+{
+ struct selftest_obj *sobj;
+ int rc;
+
+ sobj = iommufd_test_get_selftest_obj(ucmd->ictx, cmd->id);
+ if (IS_ERR(sobj))
+ return PTR_ERR(sobj);
+
+ rc = iommufd_device_attach(sobj->idev.idev, cmd->pasid_attach.pasid,
+ &cmd->pasid_attach.pt_id);
+ if (rc)
+ goto out_sobj;
+
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ iommufd_device_detach(sobj->idev.idev, cmd->pasid_attach.pasid);
+
+out_sobj:
+ iommufd_put_object(ucmd->ictx, &sobj->obj);
+ return rc;
+}
+
+static int iommufd_test_pasid_replace(struct iommufd_ucmd *ucmd,
+ struct iommu_test_cmd *cmd)
+{
+ struct selftest_obj *sobj;
+ int rc;
+
+ sobj = iommufd_test_get_selftest_obj(ucmd->ictx, cmd->id);
+ if (IS_ERR(sobj))
+ return PTR_ERR(sobj);
+
+ rc = iommufd_device_replace(sobj->idev.idev, cmd->pasid_attach.pasid,
+ &cmd->pasid_attach.pt_id);
+ if (rc)
+ goto out_sobj;
+
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+
+out_sobj:
+ iommufd_put_object(ucmd->ictx, &sobj->obj);
+ return rc;
+}
+
+static int iommufd_test_pasid_detach(struct iommufd_ucmd *ucmd,
+ struct iommu_test_cmd *cmd)
+{
+ struct selftest_obj *sobj;
+
+ sobj = iommufd_test_get_selftest_obj(ucmd->ictx, cmd->id);
+ if (IS_ERR(sobj))
+ return PTR_ERR(sobj);
+
+ iommufd_device_detach(sobj->idev.idev, cmd->pasid_detach.pasid);
+ iommufd_put_object(ucmd->ictx, &sobj->obj);
+ return 0;
+}
+
void iommufd_selftest_destroy(struct iommufd_object *obj)
{
- struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj);
+ struct selftest_obj *sobj = to_selftest_obj(obj);
switch (sobj->type) {
case TYPE_IDEV:
- iommufd_device_detach(sobj->idev.idev);
+ iommufd_device_detach(sobj->idev.idev, IOMMU_NO_PASID);
iommufd_device_unbind(sobj->idev.idev);
mock_dev_destroy(sobj->idev.mock_dev);
break;
}
}
+struct iommufd_test_dma_buf {
+ void *memory;
+ size_t length;
+ bool revoked;
+};
+
+static int iommufd_test_dma_buf_attach(struct dma_buf *dmabuf,
+ struct dma_buf_attachment *attachment)
+{
+ return 0;
+}
+
+static void iommufd_test_dma_buf_detach(struct dma_buf *dmabuf,
+ struct dma_buf_attachment *attachment)
+{
+}
+
+static struct sg_table *
+iommufd_test_dma_buf_map(struct dma_buf_attachment *attachment,
+ enum dma_data_direction dir)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static void iommufd_test_dma_buf_unmap(struct dma_buf_attachment *attachment,
+ struct sg_table *sgt,
+ enum dma_data_direction dir)
+{
+}
+
+static void iommufd_test_dma_buf_release(struct dma_buf *dmabuf)
+{
+ struct iommufd_test_dma_buf *priv = dmabuf->priv;
+
+ kfree(priv->memory);
+ kfree(priv);
+}
+
+static const struct dma_buf_ops iommufd_test_dmabuf_ops = {
+ .attach = iommufd_test_dma_buf_attach,
+ .detach = iommufd_test_dma_buf_detach,
+ .map_dma_buf = iommufd_test_dma_buf_map,
+ .release = iommufd_test_dma_buf_release,
+ .unmap_dma_buf = iommufd_test_dma_buf_unmap,
+};
+
+int iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
+ struct dma_buf_phys_vec *phys)
+{
+ struct iommufd_test_dma_buf *priv = attachment->dmabuf->priv;
+
+ dma_resv_assert_held(attachment->dmabuf->resv);
+
+ if (attachment->dmabuf->ops != &iommufd_test_dmabuf_ops)
+ return -EOPNOTSUPP;
+
+ if (priv->revoked)
+ return -ENODEV;
+
+ phys->paddr = virt_to_phys(priv->memory);
+ phys->len = priv->length;
+ return 0;
+}
+
+static int iommufd_test_dmabuf_get(struct iommufd_ucmd *ucmd,
+ unsigned int open_flags,
+ size_t len)
+{
+ DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+ struct iommufd_test_dma_buf *priv;
+ struct dma_buf *dmabuf;
+ int rc;
+
+ len = ALIGN(len, PAGE_SIZE);
+ if (len == 0 || len > PAGE_SIZE * 512)
+ return -EINVAL;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ priv->length = len;
+ priv->memory = kzalloc(len, GFP_KERNEL);
+ if (!priv->memory) {
+ rc = -ENOMEM;
+ goto err_free;
+ }
+
+ exp_info.ops = &iommufd_test_dmabuf_ops;
+ exp_info.size = len;
+ exp_info.flags = open_flags;
+ exp_info.priv = priv;
+
+ dmabuf = dma_buf_export(&exp_info);
+ if (IS_ERR(dmabuf)) {
+ rc = PTR_ERR(dmabuf);
+ goto err_free;
+ }
+
+ return dma_buf_fd(dmabuf, open_flags);
+
+err_free:
+ kfree(priv->memory);
+ kfree(priv);
+ return rc;
+}
+
+static int iommufd_test_dmabuf_revoke(struct iommufd_ucmd *ucmd, int fd,
+ bool revoked)
+{
+ struct iommufd_test_dma_buf *priv;
+ struct dma_buf *dmabuf;
+ int rc = 0;
+
+ dmabuf = dma_buf_get(fd);
+ if (IS_ERR(dmabuf))
+ return PTR_ERR(dmabuf);
+
+ if (dmabuf->ops != &iommufd_test_dmabuf_ops) {
+ rc = -EOPNOTSUPP;
+ goto err_put;
+ }
+
+ priv = dmabuf->priv;
+ dma_resv_lock(dmabuf->resv, NULL);
+ priv->revoked = revoked;
+ dma_buf_move_notify(dmabuf);
+ dma_resv_unlock(dmabuf->resv);
+
+err_put:
+ dma_buf_put(dmabuf);
+ return rc;
+}
+
int iommufd_test(struct iommufd_ucmd *ucmd)
{
struct iommu_test_cmd *cmd = ucmd->cmd;
@@ -1416,6 +2109,10 @@ int iommufd_test(struct iommufd_ucmd *ucmd)
return iommufd_test_md_check_iotlb(ucmd, cmd->id,
cmd->check_iotlb.id,
cmd->check_iotlb.iotlb);
+ case IOMMU_TEST_OP_DEV_CHECK_CACHE:
+ return iommufd_test_dev_check_cache(ucmd, cmd->id,
+ cmd->check_dev_cache.id,
+ cmd->check_dev_cache.cache);
case IOMMU_TEST_OP_CREATE_ACCESS:
return iommufd_test_create_access(ucmd, cmd->id,
cmd->create_access.flags);
@@ -1450,6 +2147,25 @@ int iommufd_test(struct iommufd_ucmd *ucmd)
cmd->dirty.page_size,
u64_to_user_ptr(cmd->dirty.uptr),
cmd->dirty.flags);
+ case IOMMU_TEST_OP_TRIGGER_IOPF:
+ return iommufd_test_trigger_iopf(ucmd, cmd);
+ case IOMMU_TEST_OP_TRIGGER_VEVENT:
+ return iommufd_test_trigger_vevent(ucmd, cmd);
+ case IOMMU_TEST_OP_PASID_ATTACH:
+ return iommufd_test_pasid_attach(ucmd, cmd);
+ case IOMMU_TEST_OP_PASID_REPLACE:
+ return iommufd_test_pasid_replace(ucmd, cmd);
+ case IOMMU_TEST_OP_PASID_DETACH:
+ return iommufd_test_pasid_detach(ucmd, cmd);
+ case IOMMU_TEST_OP_PASID_CHECK_HWPT:
+ return iommufd_test_pasid_check_hwpt(ucmd, cmd);
+ case IOMMU_TEST_OP_DMABUF_GET:
+ return iommufd_test_dmabuf_get(ucmd, cmd->dmabuf_get.open_flags,
+ cmd->dmabuf_get.length);
+ case IOMMU_TEST_OP_DMABUF_REVOKE:
+ return iommufd_test_dmabuf_revoke(ucmd,
+ cmd->dmabuf_revoke.dmabuf_fd,
+ cmd->dmabuf_revoke.revoked);
default:
return -EOPNOTSUPP;
}
@@ -1480,21 +2196,28 @@ int __init iommufd_test_init(void)
if (rc)
goto err_platform;
- rc = iommu_device_sysfs_add(&mock_iommu_device,
+ rc = iommu_device_sysfs_add(&mock_iommu.iommu_dev,
&selftest_iommu_dev->dev, NULL, "%s",
dev_name(&selftest_iommu_dev->dev));
if (rc)
goto err_bus;
- rc = iommu_device_register_bus(&mock_iommu_device, &mock_ops,
- &iommufd_mock_bus_type.bus,
- &iommufd_mock_bus_type.nb);
+ rc = iommu_device_register_bus(&mock_iommu.iommu_dev, &mock_ops,
+ &iommufd_mock_bus_type.bus,
+ &iommufd_mock_bus_type.nb);
if (rc)
goto err_sysfs;
+
+ refcount_set(&mock_iommu.users, 1);
+ init_completion(&mock_iommu.complete);
+
+ mock_iommu_iopf_queue = iopf_queue_alloc("mock-iopfq");
+ mock_iommu.iommu_dev.max_pasids = (1 << MOCK_PASID_WIDTH);
+
return 0;
err_sysfs:
- iommu_device_sysfs_remove(&mock_iommu_device);
+ iommu_device_sysfs_remove(&mock_iommu.iommu_dev);
err_bus:
bus_unregister(&iommufd_mock_bus_type.bus);
err_platform:
@@ -1504,13 +2227,37 @@ err_dbgfs:
return rc;
}
+static void iommufd_test_wait_for_users(void)
+{
+ if (refcount_dec_and_test(&mock_iommu.users))
+ return;
+ /*
+ * Time out waiting for iommu device user count to become 0.
+ *
+ * Note that this is just making an example here, since the selftest is
+ * built into the iommufd module, i.e. it only unplugs the iommu device
+ * when unloading the module. So, it is expected that this WARN_ON will
+ * not trigger, as long as any iommufd FDs are open.
+ */
+ WARN_ON(!wait_for_completion_timeout(&mock_iommu.complete,
+ msecs_to_jiffies(10000)));
+}
+
void iommufd_test_exit(void)
{
- iommu_device_sysfs_remove(&mock_iommu_device);
- iommu_device_unregister_bus(&mock_iommu_device,
+ if (mock_iommu_iopf_queue) {
+ iopf_queue_free(mock_iommu_iopf_queue);
+ mock_iommu_iopf_queue = NULL;
+ }
+
+ iommufd_test_wait_for_users();
+ iommu_device_sysfs_remove(&mock_iommu.iommu_dev);
+ iommu_device_unregister_bus(&mock_iommu.iommu_dev,
&iommufd_mock_bus_type.bus,
&iommufd_mock_bus_type.nb);
bus_unregister(&iommufd_mock_bus_type.bus);
platform_device_unregister(selftest_iommu_dev);
debugfs_remove_recursive(dbgfs_root);
}
+
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c
index a3ad5f0b6c59..a258ee2f4579 100644
--- a/drivers/iommu/iommufd/vfio_compat.c
+++ b/drivers/iommu/iommufd/vfio_compat.c
@@ -44,7 +44,7 @@ int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)
iommufd_put_object(ictx, &ioas->obj);
return 0;
}
-EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, IOMMUFD_VFIO);
+EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, "IOMMUFD_VFIO");
/**
* iommufd_vfio_compat_set_no_iommu - Called when a no-iommu device is attached
@@ -66,7 +66,7 @@ int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx)
xa_unlock(&ictx->objects);
return ret;
}
-EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_set_no_iommu, IOMMUFD_VFIO);
+EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_set_no_iommu, "IOMMUFD_VFIO");
/**
* iommufd_vfio_compat_ioas_create - Ensure the compat IOAS is created
@@ -118,7 +118,7 @@ out_abort:
iommufd_object_abort(ictx, &ioas->obj);
return ret;
}
-EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create, IOMMUFD_VFIO);
+EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create, "IOMMUFD_VFIO");
int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd)
{
@@ -291,12 +291,7 @@ static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,
case VFIO_DMA_CC_IOMMU:
return iommufd_vfio_cc_iommu(ictx);
- /*
- * This is obsolete, and to be removed from VFIO. It was an incomplete
- * idea that got merged.
- * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/
- */
- case VFIO_TYPE1_NESTING_IOMMU:
+ case __VFIO_RESERVED_TYPE1_NESTING_IOMMU:
return 0;
/*
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
new file mode 100644
index 000000000000..462b457ffd0c
--- /dev/null
+++ b/drivers/iommu/iommufd/viommu.c
@@ -0,0 +1,430 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#include "iommufd_private.h"
+
+void iommufd_viommu_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_viommu *viommu =
+ container_of(obj, struct iommufd_viommu, obj);
+
+ if (viommu->ops && viommu->ops->destroy)
+ viommu->ops->destroy(viommu);
+ refcount_dec(&viommu->hwpt->common.obj.users);
+ xa_destroy(&viommu->vdevs);
+}
+
+int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_viommu_alloc *cmd = ucmd->cmd;
+ const struct iommu_user_data user_data = {
+ .type = cmd->type,
+ .uptr = u64_to_user_ptr(cmd->data_uptr),
+ .len = cmd->data_len,
+ };
+ struct iommufd_hwpt_paging *hwpt_paging;
+ struct iommufd_viommu *viommu;
+ struct iommufd_device *idev;
+ const struct iommu_ops *ops;
+ size_t viommu_size;
+ int rc;
+
+ if (cmd->flags || cmd->type == IOMMU_VIOMMU_TYPE_DEFAULT)
+ return -EOPNOTSUPP;
+
+ idev = iommufd_get_device(ucmd, cmd->dev_id);
+ if (IS_ERR(idev))
+ return PTR_ERR(idev);
+
+ ops = dev_iommu_ops(idev->dev);
+ if (!ops->get_viommu_size || !ops->viommu_init) {
+ rc = -EOPNOTSUPP;
+ goto out_put_idev;
+ }
+
+ viommu_size = ops->get_viommu_size(idev->dev, cmd->type);
+ if (!viommu_size) {
+ rc = -EOPNOTSUPP;
+ goto out_put_idev;
+ }
+
+ /*
+ * It is a driver bug for providing a viommu_size smaller than the core
+ * vIOMMU structure size
+ */
+ if (WARN_ON_ONCE(viommu_size < sizeof(*viommu))) {
+ rc = -EOPNOTSUPP;
+ goto out_put_idev;
+ }
+
+ hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
+ if (IS_ERR(hwpt_paging)) {
+ rc = PTR_ERR(hwpt_paging);
+ goto out_put_idev;
+ }
+
+ if (!hwpt_paging->nest_parent) {
+ rc = -EINVAL;
+ goto out_put_hwpt;
+ }
+
+ viommu = (struct iommufd_viommu *)_iommufd_object_alloc_ucmd(
+ ucmd, viommu_size, IOMMUFD_OBJ_VIOMMU);
+ if (IS_ERR(viommu)) {
+ rc = PTR_ERR(viommu);
+ goto out_put_hwpt;
+ }
+
+ xa_init(&viommu->vdevs);
+ viommu->type = cmd->type;
+ viommu->ictx = ucmd->ictx;
+ viommu->hwpt = hwpt_paging;
+ refcount_inc(&viommu->hwpt->common.obj.users);
+ INIT_LIST_HEAD(&viommu->veventqs);
+ init_rwsem(&viommu->veventqs_rwsem);
+ /*
+ * It is the most likely case that a physical IOMMU is unpluggable. A
+ * pluggable IOMMU instance (if exists) is responsible for refcounting
+ * on its own.
+ */
+ viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev);
+
+ rc = ops->viommu_init(viommu, hwpt_paging->common.domain,
+ user_data.len ? &user_data : NULL);
+ if (rc)
+ goto out_put_hwpt;
+
+ /* It is a driver bug that viommu->ops isn't filled */
+ if (WARN_ON_ONCE(!viommu->ops)) {
+ rc = -EOPNOTSUPP;
+ goto out_put_hwpt;
+ }
+
+ cmd->out_viommu_id = viommu->obj.id;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+
+out_put_hwpt:
+ iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj);
+out_put_idev:
+ iommufd_put_object(ucmd->ictx, &idev->obj);
+ return rc;
+}
+
+void iommufd_vdevice_abort(struct iommufd_object *obj)
+{
+ struct iommufd_vdevice *vdev =
+ container_of(obj, struct iommufd_vdevice, obj);
+ struct iommufd_viommu *viommu = vdev->viommu;
+ struct iommufd_device *idev = vdev->idev;
+
+ lockdep_assert_held(&idev->igroup->lock);
+
+ if (vdev->destroy)
+ vdev->destroy(vdev);
+ /* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */
+ xa_cmpxchg(&viommu->vdevs, vdev->virt_id, vdev, NULL, GFP_KERNEL);
+ refcount_dec(&viommu->obj.users);
+ idev->vdev = NULL;
+}
+
+void iommufd_vdevice_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_vdevice *vdev =
+ container_of(obj, struct iommufd_vdevice, obj);
+ struct iommufd_device *idev = vdev->idev;
+ struct iommufd_ctx *ictx = idev->ictx;
+
+ mutex_lock(&idev->igroup->lock);
+ iommufd_vdevice_abort(obj);
+ mutex_unlock(&idev->igroup->lock);
+ iommufd_put_object(ictx, &idev->obj);
+}
+
+int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_vdevice_alloc *cmd = ucmd->cmd;
+ struct iommufd_vdevice *vdev, *curr;
+ size_t vdev_size = sizeof(*vdev);
+ struct iommufd_viommu *viommu;
+ struct iommufd_device *idev;
+ u64 virt_id = cmd->virt_id;
+ int rc = 0;
+
+ /* virt_id indexes an xarray */
+ if (virt_id > ULONG_MAX)
+ return -EINVAL;
+
+ viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
+ if (IS_ERR(viommu))
+ return PTR_ERR(viommu);
+
+ idev = iommufd_get_device(ucmd, cmd->dev_id);
+ if (IS_ERR(idev)) {
+ rc = PTR_ERR(idev);
+ goto out_put_viommu;
+ }
+
+ if (viommu->iommu_dev != __iommu_get_iommu_dev(idev->dev)) {
+ rc = -EINVAL;
+ goto out_put_idev;
+ }
+
+ mutex_lock(&idev->igroup->lock);
+ if (idev->destroying) {
+ rc = -ENOENT;
+ goto out_unlock_igroup;
+ }
+
+ if (idev->vdev) {
+ rc = -EEXIST;
+ goto out_unlock_igroup;
+ }
+
+ if (viommu->ops && viommu->ops->vdevice_size) {
+ /*
+ * It is a driver bug for:
+ * - ops->vdevice_size smaller than the core structure size
+ * - not implementing a pairing ops->vdevice_init op
+ */
+ if (WARN_ON_ONCE(viommu->ops->vdevice_size < vdev_size ||
+ !viommu->ops->vdevice_init)) {
+ rc = -EOPNOTSUPP;
+ goto out_put_idev;
+ }
+ vdev_size = viommu->ops->vdevice_size;
+ }
+
+ vdev = (struct iommufd_vdevice *)_iommufd_object_alloc(
+ ucmd->ictx, vdev_size, IOMMUFD_OBJ_VDEVICE);
+ if (IS_ERR(vdev)) {
+ rc = PTR_ERR(vdev);
+ goto out_unlock_igroup;
+ }
+
+ vdev->virt_id = virt_id;
+ vdev->viommu = viommu;
+ refcount_inc(&viommu->obj.users);
+ /*
+ * A wait_cnt reference is held on the idev so long as we have the
+ * pointer. iommufd_device_pre_destroy() will revoke it before the
+ * idev real destruction.
+ */
+ vdev->idev = idev;
+
+ /*
+ * iommufd_device_destroy() delays until idev->vdev is NULL before
+ * freeing the idev, which only happens once the vdev is finished
+ * destruction.
+ */
+ idev->vdev = vdev;
+
+ curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, GFP_KERNEL);
+ if (curr) {
+ rc = xa_err(curr) ?: -EEXIST;
+ goto out_abort;
+ }
+
+ if (viommu->ops && viommu->ops->vdevice_init) {
+ rc = viommu->ops->vdevice_init(vdev);
+ if (rc)
+ goto out_abort;
+ }
+
+ cmd->out_vdevice_id = vdev->obj.id;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_abort;
+ iommufd_object_finalize(ucmd->ictx, &vdev->obj);
+ goto out_unlock_igroup;
+
+out_abort:
+ iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj);
+out_unlock_igroup:
+ mutex_unlock(&idev->igroup->lock);
+out_put_idev:
+ if (rc)
+ iommufd_put_object(ucmd->ictx, &idev->obj);
+out_put_viommu:
+ iommufd_put_object(ucmd->ictx, &viommu->obj);
+ return rc;
+}
+
+static void iommufd_hw_queue_destroy_access(struct iommufd_ctx *ictx,
+ struct iommufd_access *access,
+ u64 base_iova, size_t length)
+{
+ u64 aligned_iova = PAGE_ALIGN_DOWN(base_iova);
+ u64 offset = base_iova - aligned_iova;
+
+ iommufd_access_unpin_pages(access, aligned_iova,
+ PAGE_ALIGN(length + offset));
+ iommufd_access_detach_internal(access);
+ iommufd_access_destroy_internal(ictx, access);
+}
+
+void iommufd_hw_queue_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_hw_queue *hw_queue =
+ container_of(obj, struct iommufd_hw_queue, obj);
+
+ if (hw_queue->destroy)
+ hw_queue->destroy(hw_queue);
+ if (hw_queue->access)
+ iommufd_hw_queue_destroy_access(hw_queue->viommu->ictx,
+ hw_queue->access,
+ hw_queue->base_addr,
+ hw_queue->length);
+ if (hw_queue->viommu)
+ refcount_dec(&hw_queue->viommu->obj.users);
+}
+
+/*
+ * When the HW accesses the guest queue via physical addresses, the underlying
+ * physical pages of the guest queue must be contiguous. Also, for the security
+ * concern that IOMMUFD_CMD_IOAS_UNMAP could potentially remove the mappings of
+ * the guest queue from the nesting parent iopt while the HW is still accessing
+ * the guest queue memory physically, such a HW queue must require an access to
+ * pin the underlying pages and prevent that from happening.
+ */
+static struct iommufd_access *
+iommufd_hw_queue_alloc_phys(struct iommu_hw_queue_alloc *cmd,
+ struct iommufd_viommu *viommu, phys_addr_t *base_pa)
+{
+ u64 aligned_iova = PAGE_ALIGN_DOWN(cmd->nesting_parent_iova);
+ u64 offset = cmd->nesting_parent_iova - aligned_iova;
+ struct iommufd_access *access;
+ struct page **pages;
+ size_t max_npages;
+ size_t length;
+ size_t i;
+ int rc;
+
+ /* max_npages = DIV_ROUND_UP(offset + cmd->length, PAGE_SIZE) */
+ if (check_add_overflow(offset, cmd->length, &length))
+ return ERR_PTR(-ERANGE);
+ if (check_add_overflow(length, PAGE_SIZE - 1, &length))
+ return ERR_PTR(-ERANGE);
+ max_npages = length / PAGE_SIZE;
+ /* length needs to be page aligned too */
+ length = max_npages * PAGE_SIZE;
+
+ /*
+ * Use kvcalloc() to avoid memory fragmentation for a large page array.
+ * Set __GFP_NOWARN to avoid syzkaller blowups
+ */
+ pages = kvcalloc(max_npages, sizeof(*pages), GFP_KERNEL | __GFP_NOWARN);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ access = iommufd_access_create_internal(viommu->ictx);
+ if (IS_ERR(access)) {
+ rc = PTR_ERR(access);
+ goto out_free;
+ }
+
+ rc = iommufd_access_attach_internal(access, viommu->hwpt->ioas);
+ if (rc)
+ goto out_destroy;
+
+ rc = iommufd_access_pin_pages(access, aligned_iova, length, pages, 0);
+ if (rc)
+ goto out_detach;
+
+ /* Validate if the underlying physical pages are contiguous */
+ for (i = 1; i < max_npages; i++) {
+ if (page_to_pfn(pages[i]) == page_to_pfn(pages[i - 1]) + 1)
+ continue;
+ rc = -EFAULT;
+ goto out_unpin;
+ }
+
+ *base_pa = (page_to_pfn(pages[0]) << PAGE_SHIFT) + offset;
+ kvfree(pages);
+ return access;
+
+out_unpin:
+ iommufd_access_unpin_pages(access, aligned_iova, length);
+out_detach:
+ iommufd_access_detach_internal(access);
+out_destroy:
+ iommufd_access_destroy_internal(viommu->ictx, access);
+out_free:
+ kvfree(pages);
+ return ERR_PTR(rc);
+}
+
+int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_hw_queue_alloc *cmd = ucmd->cmd;
+ struct iommufd_hw_queue *hw_queue;
+ struct iommufd_viommu *viommu;
+ struct iommufd_access *access;
+ size_t hw_queue_size;
+ phys_addr_t base_pa;
+ u64 last;
+ int rc;
+
+ if (cmd->flags || cmd->type == IOMMU_HW_QUEUE_TYPE_DEFAULT)
+ return -EOPNOTSUPP;
+ if (!cmd->length)
+ return -EINVAL;
+ if (check_add_overflow(cmd->nesting_parent_iova, cmd->length - 1,
+ &last))
+ return -EOVERFLOW;
+
+ viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
+ if (IS_ERR(viommu))
+ return PTR_ERR(viommu);
+
+ if (!viommu->ops || !viommu->ops->get_hw_queue_size ||
+ !viommu->ops->hw_queue_init_phys) {
+ rc = -EOPNOTSUPP;
+ goto out_put_viommu;
+ }
+
+ hw_queue_size = viommu->ops->get_hw_queue_size(viommu, cmd->type);
+ if (!hw_queue_size) {
+ rc = -EOPNOTSUPP;
+ goto out_put_viommu;
+ }
+
+ /*
+ * It is a driver bug for providing a hw_queue_size smaller than the
+ * core HW queue structure size
+ */
+ if (WARN_ON_ONCE(hw_queue_size < sizeof(*hw_queue))) {
+ rc = -EOPNOTSUPP;
+ goto out_put_viommu;
+ }
+
+ hw_queue = (struct iommufd_hw_queue *)_iommufd_object_alloc_ucmd(
+ ucmd, hw_queue_size, IOMMUFD_OBJ_HW_QUEUE);
+ if (IS_ERR(hw_queue)) {
+ rc = PTR_ERR(hw_queue);
+ goto out_put_viommu;
+ }
+
+ access = iommufd_hw_queue_alloc_phys(cmd, viommu, &base_pa);
+ if (IS_ERR(access)) {
+ rc = PTR_ERR(access);
+ goto out_put_viommu;
+ }
+
+ hw_queue->viommu = viommu;
+ refcount_inc(&viommu->obj.users);
+ hw_queue->access = access;
+ hw_queue->type = cmd->type;
+ hw_queue->length = cmd->length;
+ hw_queue->base_addr = cmd->nesting_parent_iova;
+
+ rc = viommu->ops->hw_queue_init_phys(hw_queue, cmd->index, base_pa);
+ if (rc)
+ goto out_put_viommu;
+
+ cmd->out_hw_queue_id = hw_queue->obj.id;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+
+out_put_viommu:
+ iommufd_put_object(ucmd->ictx, &viommu->obj);
+ return rc;
+}
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index d59d0ea2fd21..18f839721813 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -6,6 +6,7 @@
*/
#include <linux/iova.h>
+#include <linux/kmemleak.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/smp.h>
@@ -506,7 +507,7 @@ __adjust_overlap_range(struct iova *iova,
* reserve_iova - reserves an iova in the given range
* @iovad: - iova domain pointer
* @pfn_lo: - lower page frame address
- * @pfn_hi:- higher pfn adderss
+ * @pfn_hi:- higher pfn address
* This function allocates reserves the address range from pfn_lo to pfn_hi so
* that this address is not dished out as part of alloc_iova.
*/
@@ -673,6 +674,11 @@ static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
{
struct iova_magazine *mag = rcache->depot;
+ /*
+ * As the mag->next pointer is moved to rcache->depot and reset via
+ * the mag->size assignment, mark it as a transient false positive.
+ */
+ kmemleak_transient_leak(mag->next);
rcache->depot = mag->next;
mag->size = IOVA_MAG_SIZE;
rcache->depot_size--;
@@ -1000,4 +1006,5 @@ void iova_cache_put(void)
EXPORT_SYMBOL_GPL(iova_cache_put);
MODULE_AUTHOR("Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>");
+MODULE_DESCRIPTION("IOMMU I/O Virtual Address management");
MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index b657cc09605f..ca848288dbf2 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -430,7 +430,7 @@ static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain)
* non-secure mode.
*/
domain->cfg.quirks = IO_PGTABLE_QUIRK_ARM_NS;
- domain->cfg.pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K;
+ domain->cfg.pgsize_bitmap = domain->io_domain.pgsize_bitmap;
domain->cfg.ias = 32;
domain->cfg.oas = 40;
domain->cfg.tlb = &ipmmu_flush_ops;
@@ -571,6 +571,7 @@ static struct iommu_domain *ipmmu_domain_alloc_paging(struct device *dev)
return NULL;
mutex_init(&domain->mutex);
+ domain->io_domain.pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K;
return &domain->io_domain;
}
@@ -589,7 +590,7 @@ static void ipmmu_domain_free(struct iommu_domain *io_domain)
}
static int ipmmu_attach_device(struct iommu_domain *io_domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct ipmmu_vmsa_device *mmu = to_ipmmu(dev);
@@ -636,17 +637,17 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain,
}
static int ipmmu_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *io_domain = iommu_get_domain_for_dev(dev);
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct ipmmu_vmsa_domain *domain;
unsigned int i;
- if (io_domain == identity_domain || !io_domain)
+ if (old == identity_domain || !old)
return 0;
- domain = to_vmsa_domain(io_domain);
+ domain = to_vmsa_domain(old);
for (i = 0; i < fwspec->num_ids; ++i)
ipmmu_utlb_disable(domain, fwspec->ids[i]);
@@ -719,6 +720,8 @@ static int ipmmu_init_platform_device(struct device *dev,
dev_iommu_priv_set(dev, platform_get_drvdata(ipmmu_pdev));
+ put_device(&ipmmu_pdev->dev);
+
return 0;
}
@@ -804,8 +807,7 @@ static int ipmmu_init_arm_mapping(struct device *dev)
if (!mmu->mapping) {
struct dma_iommu_mapping *mapping;
- mapping = arm_iommu_create_mapping(&platform_bus_type,
- SZ_1G, SZ_2G);
+ mapping = arm_iommu_create_mapping(dev, SZ_1G, SZ_2G);
if (IS_ERR(mapping)) {
dev_err(mmu->dev, "failed to create ARM IOMMU mapping\n");
ret = PTR_ERR(mapping);
@@ -883,7 +885,6 @@ static const struct iommu_ops ipmmu_ops = {
*/
.device_group = IS_ENABLED(CONFIG_ARM) && !IS_ENABLED(CONFIG_IOMMU_DMA)
? generic_device_group : generic_single_device_group,
- .pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K,
.of_xlate = ipmmu_of_xlate,
.default_domain_ops = &(const struct iommu_domain_ops) {
.attach_dev = ipmmu_attach_device,
@@ -1082,31 +1083,25 @@ static int ipmmu_probe(struct platform_device *pdev)
}
}
+ platform_set_drvdata(pdev, mmu);
/*
* Register the IPMMU to the IOMMU subsystem in the following cases:
* - R-Car Gen2 IPMMU (all devices registered)
* - R-Car Gen3 IPMMU (leaf devices only - skip root IPMMU-MM device)
*/
- if (!mmu->features->has_cache_leaf_nodes || !ipmmu_is_root(mmu)) {
- ret = iommu_device_sysfs_add(&mmu->iommu, &pdev->dev, NULL,
- dev_name(&pdev->dev));
- if (ret)
- return ret;
-
- ret = iommu_device_register(&mmu->iommu, &ipmmu_ops, &pdev->dev);
- if (ret)
- return ret;
- }
+ if (mmu->features->has_cache_leaf_nodes && ipmmu_is_root(mmu))
+ return 0;
- /*
- * We can't create the ARM mapping here as it requires the bus to have
- * an IOMMU, which only happens when bus_set_iommu() is called in
- * ipmmu_init() after the probe function returns.
- */
+ ret = iommu_device_sysfs_add(&mmu->iommu, &pdev->dev, NULL, "%s",
+ dev_name(&pdev->dev));
+ if (ret)
+ return ret;
- platform_set_drvdata(pdev, mmu);
+ ret = iommu_device_register(&mmu->iommu, &ipmmu_ops, &pdev->dev);
+ if (ret)
+ iommu_device_sysfs_remove(&mmu->iommu);
- return 0;
+ return ret;
}
static void ipmmu_remove(struct platform_device *pdev)
@@ -1160,6 +1155,6 @@ static struct platform_driver ipmmu_driver = {
.pm = pm_sleep_ptr(&ipmmu_pm),
},
.probe = ipmmu_probe,
- .remove_new = ipmmu_remove,
+ .remove = ipmmu_remove,
};
builtin_platform_driver(ipmmu_driver);
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index 989e0869d805..819add75a665 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -312,6 +312,8 @@ static struct iommu_domain *msm_iommu_domain_alloc_paging(struct device *dev)
INIT_LIST_HEAD(&priv->list_attached);
+ priv->domain.pgsize_bitmap = MSM_IOMMU_PGSIZES;
+
priv->domain.geometry.aperture_start = 0;
priv->domain.geometry.aperture_end = (1ULL << 32) - 1;
priv->domain.geometry.force_aperture = true;
@@ -339,7 +341,7 @@ static int msm_iommu_domain_config(struct msm_priv *priv)
spin_lock_init(&priv->pgtlock);
priv->cfg = (struct io_pgtable_cfg) {
- .pgsize_bitmap = msm_iommu_ops.pgsize_bitmap,
+ .pgsize_bitmap = priv->domain.pgsize_bitmap,
.ias = 32,
.oas = 32,
.tlb = &msm_iommu_flush_ops,
@@ -352,8 +354,6 @@ static int msm_iommu_domain_config(struct msm_priv *priv)
return -EINVAL;
}
- msm_iommu_ops.pgsize_bitmap = priv->cfg.pgsize_bitmap;
-
return 0;
}
@@ -391,7 +391,8 @@ static struct iommu_device *msm_iommu_probe_device(struct device *dev)
return &iommu->iommu;
}
-static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev,
+ struct iommu_domain *old)
{
int ret = 0;
unsigned long flags;
@@ -441,19 +442,19 @@ fail:
}
static int msm_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct msm_priv *priv;
unsigned long flags;
struct msm_iommu_dev *iommu;
struct msm_iommu_ctx_dev *master;
int ret = 0;
- if (domain == identity_domain || !domain)
+ if (old == identity_domain || !old)
return 0;
- priv = to_msm_priv(domain);
+ priv = to_msm_priv(old);
free_io_pgtable_ops(priv->iop);
spin_lock_irqsave(&msm_iommu_lock, flags);
@@ -692,7 +693,6 @@ static struct iommu_ops msm_iommu_ops = {
.domain_alloc_paging = msm_iommu_domain_alloc_paging,
.probe_device = msm_iommu_probe_device,
.device_group = generic_device_group,
- .pgsize_bitmap = MSM_IOMMU_PGSIZES,
.of_xlate = qcom_iommu_of_xlate,
.default_domain_ops = &(const struct iommu_domain_ops) {
.attach_dev = msm_iommu_attach_dev,
@@ -725,47 +725,32 @@ static int msm_iommu_probe(struct platform_device *pdev)
iommu->dev = &pdev->dev;
INIT_LIST_HEAD(&iommu->ctx_list);
- iommu->pclk = devm_clk_get(iommu->dev, "smmu_pclk");
+ iommu->pclk = devm_clk_get_prepared(iommu->dev, "smmu_pclk");
if (IS_ERR(iommu->pclk))
return dev_err_probe(iommu->dev, PTR_ERR(iommu->pclk),
"could not get smmu_pclk\n");
- ret = clk_prepare(iommu->pclk);
- if (ret)
- return dev_err_probe(iommu->dev, ret,
- "could not prepare smmu_pclk\n");
-
- iommu->clk = devm_clk_get(iommu->dev, "iommu_clk");
- if (IS_ERR(iommu->clk)) {
- clk_unprepare(iommu->pclk);
+ iommu->clk = devm_clk_get_prepared(iommu->dev, "iommu_clk");
+ if (IS_ERR(iommu->clk))
return dev_err_probe(iommu->dev, PTR_ERR(iommu->clk),
"could not get iommu_clk\n");
- }
-
- ret = clk_prepare(iommu->clk);
- if (ret) {
- clk_unprepare(iommu->pclk);
- return dev_err_probe(iommu->dev, ret, "could not prepare iommu_clk\n");
- }
r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
iommu->base = devm_ioremap_resource(iommu->dev, r);
if (IS_ERR(iommu->base)) {
ret = dev_err_probe(iommu->dev, PTR_ERR(iommu->base), "could not get iommu base\n");
- goto fail;
+ return ret;
}
ioaddr = r->start;
iommu->irq = platform_get_irq(pdev, 0);
- if (iommu->irq < 0) {
- ret = -ENODEV;
- goto fail;
- }
+ if (iommu->irq < 0)
+ return -ENODEV;
ret = of_property_read_u32(iommu->dev->of_node, "qcom,ncb", &val);
if (ret) {
dev_err(iommu->dev, "could not get ncb\n");
- goto fail;
+ return ret;
}
iommu->ncb = val;
@@ -780,8 +765,7 @@ static int msm_iommu_probe(struct platform_device *pdev)
if (!par) {
pr_err("Invalid PAR value detected\n");
- ret = -ENODEV;
- goto fail;
+ return -ENODEV;
}
ret = devm_request_threaded_irq(iommu->dev, iommu->irq, NULL,
@@ -791,7 +775,7 @@ static int msm_iommu_probe(struct platform_device *pdev)
iommu);
if (ret) {
pr_err("Request IRQ %d failed with ret=%d\n", iommu->irq, ret);
- goto fail;
+ return ret;
}
list_add(&iommu->dev_node, &qcom_iommu_devices);
@@ -800,23 +784,19 @@ static int msm_iommu_probe(struct platform_device *pdev)
"msm-smmu.%pa", &ioaddr);
if (ret) {
pr_err("Could not add msm-smmu at %pa to sysfs\n", &ioaddr);
- goto fail;
+ return ret;
}
ret = iommu_device_register(&iommu->iommu, &msm_iommu_ops, &pdev->dev);
if (ret) {
pr_err("Could not register msm-smmu at %pa\n", &ioaddr);
- goto fail;
+ return ret;
}
pr_info("device mapped at %p, irq %d with %d ctx banks\n",
iommu->base, iommu->irq, iommu->ncb);
return ret;
-fail:
- clk_unprepare(iommu->clk);
- clk_unprepare(iommu->pclk);
- return ret;
}
static const struct of_device_id msm_iommu_dt_match[] = {
@@ -824,20 +804,11 @@ static const struct of_device_id msm_iommu_dt_match[] = {
{}
};
-static void msm_iommu_remove(struct platform_device *pdev)
-{
- struct msm_iommu_dev *iommu = platform_get_drvdata(pdev);
-
- clk_unprepare(iommu->clk);
- clk_unprepare(iommu->pclk);
-}
-
static struct platform_driver msm_iommu_driver = {
.driver = {
.name = "msm_iommu",
.of_match_table = msm_iommu_dt_match,
},
.probe = msm_iommu_probe,
- .remove_new = msm_iommu_remove,
};
builtin_platform_driver(msm_iommu_driver);
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
index 6a2707fe7a78..60fcd3d3b5eb 100644
--- a/drivers/iommu/mtk_iommu.c
+++ b/drivers/iommu/mtk_iommu.c
@@ -29,6 +29,7 @@
#include <linux/spinlock.h>
#include <linux/soc/mediatek/infracfg.h>
#include <linux/soc/mediatek/mtk_sip_svc.h>
+#include <linux/string_choices.h>
#include <asm/barrier.h>
#include <soc/mediatek/smi.h>
@@ -138,6 +139,7 @@
/* 2 bits: iommu type */
#define MTK_IOMMU_TYPE_MM (0x0 << 13)
#define MTK_IOMMU_TYPE_INFRA (0x1 << 13)
+#define MTK_IOMMU_TYPE_APU (0x2 << 13)
#define MTK_IOMMU_TYPE_MASK (0x3 << 13)
/* PM and clock always on. e.g. infra iommu */
#define PM_CLK_AO BIT(15)
@@ -146,6 +148,7 @@
#define TF_PORT_TO_ADDR_MT8173 BIT(18)
#define INT_ID_PORT_WIDTH_6 BIT(19)
#define CFG_IFA_MASTER_IN_ATF BIT(20)
+#define DL_WITH_MULTI_LARB BIT(21)
#define MTK_IOMMU_HAS_FLAG_MASK(pdata, _x, mask) \
((((pdata)->flags) & (mask)) == (_x))
@@ -171,6 +174,7 @@ enum mtk_iommu_plat {
M4U_MT8183,
M4U_MT8186,
M4U_MT8188,
+ M4U_MT8189,
M4U_MT8192,
M4U_MT8195,
M4U_MT8365,
@@ -334,6 +338,8 @@ static int mtk_iommu_hw_init(const struct mtk_iommu_data *data, unsigned int ban
*/
#define MTK_IOMMU_4GB_MODE_REMAP_BASE 0x140000000UL
+static LIST_HEAD(apulist); /* List the apu iommu HWs */
+static LIST_HEAD(infralist); /* List the iommu_infra HW */
static LIST_HEAD(m4ulist); /* List all the M4U HWs */
#define for_each_m4u(data, head) list_for_each_entry(data, head, list)
@@ -349,6 +355,15 @@ static const struct mtk_iommu_iova_region single_domain[] = {
#define MT8192_MULTI_REGION_NR (IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) ? \
MT8192_MULTI_REGION_NR_MAX : 1)
+static const struct mtk_iommu_iova_region mt8189_multi_dom_apu[] = {
+ { .iova_base = 0x200000ULL, .size = SZ_512M}, /* APU SECURE */
+#if IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT)
+ { .iova_base = SZ_1G, .size = 0xc0000000}, /* APU CODE */
+ { .iova_base = 0x70000000ULL, .size = 0x12600000}, /* APU VLM */
+ { .iova_base = SZ_4G, .size = SZ_4G * 3}, /* APU VPU */
+#endif
+};
+
static const struct mtk_iommu_iova_region mt8192_multi_dom[MT8192_MULTI_REGION_NR] = {
{ .iova_base = 0x0, .size = MTK_IOMMU_IOVA_SZ_4G}, /* 0 ~ 4G, */
#if IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT)
@@ -510,7 +525,7 @@ static irqreturn_t mtk_iommu_isr(int irq, void *dev_id)
bank->parent_dev,
"fault type=0x%x iova=0x%llx pa=0x%llx master=0x%x(larb=%d port=%d) layer=%d %s\n",
int_state, fault_iova, fault_pa, regval, fault_larb, fault_port,
- layer, write ? "write" : "read");
+ layer, str_write_read(write));
}
/* Interrupt clear */
@@ -602,7 +617,7 @@ static int mtk_iommu_config(struct mtk_iommu_data *data, struct device *dev,
larb_mmu->bank[portid] = upper_32_bits(region->iova_base);
dev_dbg(dev, "%s iommu for larb(%s) port 0x%lx region %d rgn-bank %d.\n",
- enable ? "enable" : "disable", dev_name(larb_mmu->dev),
+ str_enable_disable(enable), dev_name(larb_mmu->dev),
portid_msk, regionid, upper_32_bits(region->iova_base));
if (enable)
@@ -630,8 +645,8 @@ static int mtk_iommu_config(struct mtk_iommu_data *data, struct device *dev,
}
if (ret)
dev_err(dev, "%s iommu(%s) inframaster 0x%lx fail(%d).\n",
- enable ? "enable" : "disable",
- dev_name(data->dev), portid_msk, ret);
+ str_enable_disable(enable), dev_name(data->dev),
+ portid_msk, ret);
}
return ret;
}
@@ -647,7 +662,7 @@ static int mtk_iommu_domain_finalise(struct mtk_iommu_domain *dom,
if (share_dom) {
dom->iop = share_dom->iop;
dom->cfg = share_dom->cfg;
- dom->domain.pgsize_bitmap = share_dom->cfg.pgsize_bitmap;
+ dom->domain.pgsize_bitmap = share_dom->domain.pgsize_bitmap;
goto update_iova_region;
}
@@ -655,7 +670,7 @@ static int mtk_iommu_domain_finalise(struct mtk_iommu_domain *dom,
.quirks = IO_PGTABLE_QUIRK_ARM_NS |
IO_PGTABLE_QUIRK_NO_PERMS |
IO_PGTABLE_QUIRK_ARM_MTK_EXT,
- .pgsize_bitmap = mtk_iommu_ops.pgsize_bitmap,
+ .pgsize_bitmap = dom->domain.pgsize_bitmap,
.ias = MTK_IOMMU_HAS_FLAG(data->plat_data, IOVA_34_EN) ? 34 : 32,
.iommu_dev = data->dev,
};
@@ -674,9 +689,6 @@ static int mtk_iommu_domain_finalise(struct mtk_iommu_domain *dom,
return -ENOMEM;
}
- /* Update our support page sizes bitmap */
- dom->domain.pgsize_bitmap = dom->cfg.pgsize_bitmap;
-
data->share_dom = dom;
update_iova_region:
@@ -696,6 +708,7 @@ static struct iommu_domain *mtk_iommu_domain_alloc_paging(struct device *dev)
if (!dom)
return NULL;
mutex_init(&dom->mutex);
+ dom->domain.pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M;
return &dom->domain;
}
@@ -706,7 +719,7 @@ static void mtk_iommu_domain_free(struct iommu_domain *domain)
}
static int mtk_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct mtk_iommu_data *data = dev_iommu_priv_get(dev), *frstdata;
struct mtk_iommu_domain *dom = to_mtk_domain(domain);
@@ -774,12 +787,12 @@ err_unlock:
}
static int mtk_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
- if (domain == identity_domain || !domain)
+ if (old == identity_domain || !old)
return 0;
mtk_iommu_config(data, dev, false, 0);
@@ -866,6 +879,7 @@ static struct iommu_device *mtk_iommu_probe_device(struct device *dev)
struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
struct device_link *link;
struct device *larbdev;
+ unsigned long larbid_msk = 0;
unsigned int larbid, larbidx, i;
if (!MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM))
@@ -873,30 +887,50 @@ static struct iommu_device *mtk_iommu_probe_device(struct device *dev)
/*
* Link the consumer device with the smi-larb device(supplier).
- * The device that connects with each a larb is a independent HW.
- * All the ports in each a device should be in the same larbs.
+ * w/DL_WITH_MULTI_LARB: the master may connect with multi larbs,
+ * we should create device link with each larb.
+ * w/o DL_WITH_MULTI_LARB: the master must connect with one larb,
+ * otherwise fail.
*/
larbid = MTK_M4U_TO_LARB(fwspec->ids[0]);
if (larbid >= MTK_LARB_NR_MAX)
return ERR_PTR(-EINVAL);
+ larbid_msk |= BIT(larbid);
+
for (i = 1; i < fwspec->num_ids; i++) {
larbidx = MTK_M4U_TO_LARB(fwspec->ids[i]);
- if (larbid != larbidx) {
+ if (MTK_IOMMU_HAS_FLAG(data->plat_data, DL_WITH_MULTI_LARB)) {
+ larbid_msk |= BIT(larbidx);
+ } else if (larbid != larbidx) {
dev_err(dev, "Can only use one larb. Fail@larb%d-%d.\n",
larbid, larbidx);
return ERR_PTR(-EINVAL);
}
}
- larbdev = data->larb_imu[larbid].dev;
- if (!larbdev)
- return ERR_PTR(-EINVAL);
- link = device_link_add(dev, larbdev,
- DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS);
- if (!link)
- dev_err(dev, "Unable to link %s\n", dev_name(larbdev));
+ for_each_set_bit(larbid, &larbid_msk, 32) {
+ larbdev = data->larb_imu[larbid].dev;
+ if (!larbdev)
+ return ERR_PTR(-EINVAL);
+
+ link = device_link_add(dev, larbdev,
+ DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS);
+ if (!link) {
+ dev_err(dev, "Unable to link %s\n", dev_name(larbdev));
+ goto link_remove;
+ }
+ }
+
return &data->iommu;
+
+link_remove:
+ for_each_set_bit(i, &larbid_msk, larbid) {
+ larbdev = data->larb_imu[i].dev;
+ device_link_remove(dev, larbdev);
+ }
+
+ return ERR_PTR(-ENODEV);
}
static void mtk_iommu_release_device(struct device *dev)
@@ -904,11 +938,19 @@ static void mtk_iommu_release_device(struct device *dev)
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct mtk_iommu_data *data;
struct device *larbdev;
- unsigned int larbid;
+ unsigned int larbid, i;
+ unsigned long larbid_msk = 0;
data = dev_iommu_priv_get(dev);
- if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) {
- larbid = MTK_M4U_TO_LARB(fwspec->ids[0]);
+ if (!MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM))
+ return;
+
+ for (i = 0; i < fwspec->num_ids; i++) {
+ larbid = MTK_M4U_TO_LARB(fwspec->ids[i]);
+ larbid_msk |= BIT(larbid);
+ }
+
+ for_each_set_bit(larbid, &larbid_msk, 32) {
larbdev = data->larb_imu[larbid].dev;
device_link_remove(dev, larbdev);
}
@@ -975,6 +1017,8 @@ static int mtk_iommu_of_xlate(struct device *dev,
return -EINVAL;
dev_iommu_priv_set(dev, platform_get_drvdata(m4updev));
+
+ put_device(&m4updev->dev);
}
return iommu_fwspec_add_ids(dev, args->args, 1);
@@ -1018,7 +1062,6 @@ static const struct iommu_ops mtk_iommu_ops = {
.device_group = mtk_iommu_device_group,
.of_xlate = mtk_iommu_of_xlate,
.get_resv_regions = mtk_iommu_get_resv_regions,
- .pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M,
.owner = THIS_MODULE,
.default_domain_ops = &(const struct iommu_domain_ops) {
.attach_dev = mtk_iommu_attach_device,
@@ -1213,16 +1256,19 @@ static int mtk_iommu_mm_dts_parse(struct device *dev, struct component_match **m
}
component_match_add(dev, match, component_compare_dev, &plarbdev->dev);
- platform_device_put(plarbdev);
}
- if (!frst_avail_smicomm_node)
- return -EINVAL;
+ if (!frst_avail_smicomm_node) {
+ ret = -EINVAL;
+ goto err_larbdev_put;
+ }
pcommdev = of_find_device_by_node(frst_avail_smicomm_node);
of_node_put(frst_avail_smicomm_node);
- if (!pcommdev)
- return -ENODEV;
+ if (!pcommdev) {
+ ret = -ENODEV;
+ goto err_larbdev_put;
+ }
data->smicomm_dev = &pcommdev->dev;
link = device_link_add(data->smicomm_dev, dev,
@@ -1230,16 +1276,16 @@ static int mtk_iommu_mm_dts_parse(struct device *dev, struct component_match **m
platform_device_put(pcommdev);
if (!link) {
dev_err(dev, "Unable to link %s.\n", dev_name(data->smicomm_dev));
- return -EINVAL;
+ ret = -EINVAL;
+ goto err_larbdev_put;
}
return 0;
err_larbdev_put:
- for (i = MTK_LARB_NR_MAX - 1; i >= 0; i--) {
- if (!data->larb_imu[i].dev)
- continue;
+ /* id mapping may not be linear, loop the whole array */
+ for (i = 0; i < MTK_LARB_NR_MAX; i++)
put_device(data->larb_imu[i].dev);
- }
+
return ret;
}
@@ -1371,15 +1417,6 @@ static int mtk_iommu_probe(struct platform_device *pdev)
platform_set_drvdata(pdev, data);
mutex_init(&data->mutex);
- ret = iommu_device_sysfs_add(&data->iommu, dev, NULL,
- "mtk-iommu.%pa", &ioaddr);
- if (ret)
- goto out_link_remove;
-
- ret = iommu_device_register(&data->iommu, &mtk_iommu_ops, dev);
- if (ret)
- goto out_sysfs_remove;
-
if (MTK_IOMMU_HAS_FLAG(data->plat_data, SHARE_PGTABLE)) {
list_add_tail(&data->list, data->plat_data->hw_list);
data->hw_list = data->plat_data->hw_list;
@@ -1389,21 +1426,34 @@ static int mtk_iommu_probe(struct platform_device *pdev)
data->hw_list = &data->hw_list_head;
}
+ ret = iommu_device_sysfs_add(&data->iommu, dev, NULL,
+ "mtk-iommu.%pa", &ioaddr);
+ if (ret)
+ goto out_list_del;
+
+ ret = iommu_device_register(&data->iommu, &mtk_iommu_ops, dev);
+ if (ret)
+ goto out_sysfs_remove;
+
if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) {
ret = component_master_add_with_match(dev, &mtk_iommu_com_ops, match);
if (ret)
- goto out_list_del;
+ goto out_device_unregister;
}
return ret;
-out_list_del:
- list_del(&data->list);
+out_device_unregister:
iommu_device_unregister(&data->iommu);
out_sysfs_remove:
iommu_device_sysfs_remove(&data->iommu);
-out_link_remove:
- if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM))
+out_list_del:
+ list_del(&data->list);
+ if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) {
device_link_remove(data->smicomm_dev, dev);
+
+ for (i = 0; i < MTK_LARB_NR_MAX; i++)
+ put_device(data->larb_imu[i].dev);
+ }
out_runtime_disable:
pm_runtime_disable(dev);
return ret;
@@ -1423,6 +1473,9 @@ static void mtk_iommu_remove(struct platform_device *pdev)
if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) {
device_link_remove(data->smicomm_dev, &pdev->dev);
component_master_del(&pdev->dev, &mtk_iommu_com_ops);
+
+ for (i = 0; i < MTK_LARB_NR_MAX; i++)
+ put_device(data->larb_imu[i].dev);
}
pm_runtime_disable(&pdev->dev);
for (i = 0; i < data->plat_data->banks_num; i++) {
@@ -1549,6 +1602,31 @@ static const struct mtk_iommu_plat_data mt6795_data = {
.larbid_remap = {{0}, {1}, {2}, {3}, {4}}, /* Linear mapping. */
};
+static const unsigned int mt8192_larb_region_msk[MT8192_MULTI_REGION_NR_MAX][MTK_LARB_NR_MAX] = {
+ [0] = {~0, ~0}, /* Region0: larb0/1 */
+ [1] = {0, 0, 0, 0, ~0, ~0, 0, ~0}, /* Region1: larb4/5/7 */
+ [2] = {0, 0, ~0, 0, 0, 0, 0, 0, /* Region2: larb2/9/11/13/14/16/17/18/19/20 */
+ 0, ~0, 0, ~0, 0, ~(u32)(BIT(9) | BIT(10)), ~(u32)(BIT(4) | BIT(5)), 0,
+ ~0, ~0, ~0, ~0, ~0},
+ [3] = {0},
+ [4] = {[13] = BIT(9) | BIT(10)}, /* larb13 port9/10 */
+ [5] = {[14] = BIT(4) | BIT(5)}, /* larb14 port4/5 */
+};
+
+static const struct mtk_iommu_plat_data mt6893_data = {
+ .m4u_plat = M4U_MT8192,
+ .flags = HAS_BCLK | OUT_ORDER_WR_EN | HAS_SUB_COMM_2BITS |
+ WR_THROT_EN | IOVA_34_EN | SHARE_PGTABLE | MTK_IOMMU_TYPE_MM,
+ .inv_sel_reg = REG_MMU_INV_SEL_GEN2,
+ .banks_num = 1,
+ .banks_enable = {true},
+ .iova_region = mt8192_multi_dom,
+ .iova_region_nr = ARRAY_SIZE(mt8192_multi_dom),
+ .iova_region_larb_msk = mt8192_larb_region_msk,
+ .larbid_remap = {{0}, {1}, {4, 5}, {7}, {2}, {9, 11, 19, 20},
+ {0, 14, 16}, {0, 13, 18, 17}},
+};
+
static const struct mtk_iommu_plat_data mt8167_data = {
.m4u_plat = M4U_MT8167,
.flags = RESET_AXI | HAS_LEGACY_IVRP_PADDR | MTK_IOMMU_TYPE_MM,
@@ -1599,7 +1677,7 @@ static const unsigned int mt8186_larb_region_msk[MT8192_MULTI_REGION_NR_MAX][MTK
static const struct mtk_iommu_plat_data mt8186_data_mm = {
.m4u_plat = M4U_MT8186,
.flags = HAS_BCLK | HAS_SUB_COMM_2BITS | OUT_ORDER_WR_EN |
- WR_THROT_EN | IOVA_34_EN | MTK_IOMMU_TYPE_MM,
+ WR_THROT_EN | IOVA_34_EN | MTK_IOMMU_TYPE_MM | PGTABLE_PA_35_EN,
.larbid_remap = {{0}, {1, MTK_INVALID_LARBID, 8}, {4}, {7}, {2}, {9, 11, 19, 20},
{MTK_INVALID_LARBID, 14, 16},
{MTK_INVALID_LARBID, 13, MTK_INVALID_LARBID, 17}},
@@ -1672,15 +1750,64 @@ static const struct mtk_iommu_plat_data mt8188_data_vpp = {
27, 28 /* ccu0 */, MTK_INVALID_LARBID}, {4, 6}},
};
-static const unsigned int mt8192_larb_region_msk[MT8192_MULTI_REGION_NR_MAX][MTK_LARB_NR_MAX] = {
- [0] = {~0, ~0}, /* Region0: larb0/1 */
- [1] = {0, 0, 0, 0, ~0, ~0, 0, ~0}, /* Region1: larb4/5/7 */
- [2] = {0, 0, ~0, 0, 0, 0, 0, 0, /* Region2: larb2/9/11/13/14/16/17/18/19/20 */
- 0, ~0, 0, ~0, 0, ~(u32)(BIT(9) | BIT(10)), ~(u32)(BIT(4) | BIT(5)), 0,
- ~0, ~0, ~0, ~0, ~0},
- [3] = {0},
- [4] = {[13] = BIT(9) | BIT(10)}, /* larb13 port9/10 */
- [5] = {[14] = BIT(4) | BIT(5)}, /* larb14 port4/5 */
+static const unsigned int mt8189_apu_region_msk[][MTK_LARB_NR_MAX] = {
+ [0] = {[0] = BIT(2)}, /* Region0: fake larb 0 APU_SECURE */
+ [1] = {[0] = BIT(1)}, /* Region1: fake larb 0 APU_CODE */
+ [2] = {[0] = BIT(3)}, /* Region2: fake larb 0 APU_VLM */
+ [3] = {[0] = BIT(0)}, /* Region3: fake larb 0 APU_DATA */
+};
+
+static const struct mtk_iommu_plat_data mt8189_data_apu = {
+ .m4u_plat = M4U_MT8189,
+ .flags = IOVA_34_EN | DCM_DISABLE |
+ MTK_IOMMU_TYPE_APU | PGTABLE_PA_35_EN,
+ .hw_list = &apulist,
+ .inv_sel_reg = REG_MMU_INV_SEL_GEN2,
+ .banks_num = 1,
+ .banks_enable = {true},
+ .iova_region = mt8189_multi_dom_apu,
+ .iova_region_nr = ARRAY_SIZE(mt8189_multi_dom_apu),
+ .larbid_remap = {{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}},
+ .iova_region_larb_msk = mt8189_apu_region_msk,
+};
+
+static const struct mtk_iommu_plat_data mt8189_data_infra = {
+ .m4u_plat = M4U_MT8189,
+ .flags = WR_THROT_EN | DCM_DISABLE | MTK_IOMMU_TYPE_INFRA |
+ CFG_IFA_MASTER_IN_ATF | SHARE_PGTABLE | PGTABLE_PA_35_EN,
+ .hw_list = &infralist,
+ .banks_num = 1,
+ .banks_enable = {true},
+ .inv_sel_reg = REG_MMU_INV_SEL_GEN2,
+ .iova_region = single_domain,
+ .iova_region_nr = ARRAY_SIZE(single_domain),
+};
+
+static const u32 mt8189_larb_region_msk[MT8192_MULTI_REGION_NR_MAX][MTK_LARB_NR_MAX] = {
+ [0] = {~0, ~0, ~0, [22] = BIT(0)}, /* Region0: all ports for larb0/1/2 */
+ [1] = {[3] = ~0, [4] = ~0}, /* Region1: all ports for larb4(3)/7(4) */
+ [2] = {[5] = ~0, [6] = ~0, /* Region2: all ports for larb9(5)/11(6) */
+ [7] = ~0, [8] = ~0, /* Region2: all ports for larb13(7)/14(8) */
+ [9] = ~0, [10] = ~0, /* Region2: all ports for larb16(9)/17(10) */
+ [11] = ~0, [12] = ~0, /* Region2: all ports for larb19(11)/20(12) */
+ [21] = ~0}, /* Region2: larb21 fake GCE larb */
+};
+
+static const struct mtk_iommu_plat_data mt8189_data_mm = {
+ .m4u_plat = M4U_MT8189,
+ .flags = HAS_BCLK | HAS_SUB_COMM_3BITS | OUT_ORDER_WR_EN |
+ WR_THROT_EN | IOVA_34_EN | MTK_IOMMU_TYPE_MM |
+ PGTABLE_PA_35_EN | DL_WITH_MULTI_LARB,
+ .hw_list = &m4ulist,
+ .inv_sel_reg = REG_MMU_INV_SEL_GEN2,
+ .banks_num = 5,
+ .banks_enable = {true, false, false, false, false},
+ .iova_region = mt8192_multi_dom,
+ .iova_region_nr = ARRAY_SIZE(mt8192_multi_dom),
+ .iova_region_larb_msk = mt8189_larb_region_msk,
+ .larbid_remap = {{0}, {1}, {21/* GCE_D */, 21/* GCE_M */, 2},
+ {19, 20, 9, 11}, {7}, {4},
+ {13, 17}, {14, 16}},
};
static const struct mtk_iommu_plat_data mt8192_data = {
@@ -1776,6 +1903,7 @@ static const struct of_device_id mtk_iommu_of_ids[] = {
{ .compatible = "mediatek,mt2712-m4u", .data = &mt2712_data},
{ .compatible = "mediatek,mt6779-m4u", .data = &mt6779_data},
{ .compatible = "mediatek,mt6795-m4u", .data = &mt6795_data},
+ { .compatible = "mediatek,mt6893-iommu-mm", .data = &mt6893_data},
{ .compatible = "mediatek,mt8167-m4u", .data = &mt8167_data},
{ .compatible = "mediatek,mt8173-m4u", .data = &mt8173_data},
{ .compatible = "mediatek,mt8183-m4u", .data = &mt8183_data},
@@ -1783,6 +1911,9 @@ static const struct of_device_id mtk_iommu_of_ids[] = {
{ .compatible = "mediatek,mt8188-iommu-infra", .data = &mt8188_data_infra},
{ .compatible = "mediatek,mt8188-iommu-vdo", .data = &mt8188_data_vdo},
{ .compatible = "mediatek,mt8188-iommu-vpp", .data = &mt8188_data_vpp},
+ { .compatible = "mediatek,mt8189-iommu-apu", .data = &mt8189_data_apu},
+ { .compatible = "mediatek,mt8189-iommu-infra", .data = &mt8189_data_infra},
+ { .compatible = "mediatek,mt8189-iommu-mm", .data = &mt8189_data_mm},
{ .compatible = "mediatek,mt8192-m4u", .data = &mt8192_data},
{ .compatible = "mediatek,mt8195-iommu-infra", .data = &mt8195_data_infra},
{ .compatible = "mediatek,mt8195-iommu-vdo", .data = &mt8195_data_vdo},
@@ -1794,7 +1925,7 @@ MODULE_DEVICE_TABLE(of, mtk_iommu_of_ids);
static struct platform_driver mtk_iommu_driver = {
.probe = mtk_iommu_probe,
- .remove_new = mtk_iommu_remove,
+ .remove = mtk_iommu_remove,
.driver = {
.name = "mtk-iommu",
.of_match_table = mtk_iommu_of_ids,
diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c
index d6e4002200bd..c8d8eff5373d 100644
--- a/drivers/iommu/mtk_iommu_v1.c
+++ b/drivers/iommu/mtk_iommu_v1.c
@@ -25,12 +25,22 @@
#include <linux/platform_device.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/string_choices.h>
#include <asm/barrier.h>
-#include <asm/dma-iommu.h>
#include <dt-bindings/memory/mtk-memory-port.h>
#include <dt-bindings/memory/mt2701-larb-port.h>
#include <soc/mediatek/smi.h>
+#if defined(CONFIG_ARM)
+#include <asm/dma-iommu.h>
+#else
+#define arm_iommu_create_mapping(...) NULL
+#define arm_iommu_attach_device(...) -ENODEV
+struct dma_iommu_mapping {
+ struct iommu_domain *domain;
+};
+#endif
+
#define REG_MMU_PT_BASE_ADDR 0x000
#define F_ALL_INVLD 0x2
@@ -243,7 +253,7 @@ static void mtk_iommu_v1_config(struct mtk_iommu_v1_data *data,
larb_mmu = &data->larb_imu[larbid];
dev_dbg(dev, "%s iommu port: %d\n",
- enable ? "enable" : "disable", portid);
+ str_enable_disable(enable), portid);
if (enable)
larb_mmu->mmu |= MTK_SMI_MMU_EN(portid);
@@ -278,6 +288,8 @@ static struct iommu_domain *mtk_iommu_v1_domain_alloc_paging(struct device *dev)
if (!dom)
return NULL;
+ dom->domain.pgsize_bitmap = MT2701_IOMMU_PAGE_SIZE;
+
return &dom->domain;
}
@@ -291,7 +303,9 @@ static void mtk_iommu_v1_domain_free(struct iommu_domain *domain)
kfree(to_mtk_domain(domain));
}
-static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device *dev)
+static int mtk_iommu_v1_attach_device(struct iommu_domain *domain,
+ struct device *dev,
+ struct iommu_domain *old)
{
struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev);
struct mtk_iommu_v1_domain *dom = to_mtk_domain(domain);
@@ -317,7 +331,8 @@ static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device
}
static int mtk_iommu_v1_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev);
@@ -401,7 +416,6 @@ static const struct iommu_ops mtk_iommu_v1_ops;
static int mtk_iommu_v1_create_mapping(struct device *dev,
const struct of_phandle_args *args)
{
- struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct mtk_iommu_v1_data *data;
struct platform_device *m4updev;
struct dma_iommu_mapping *mtk_mapping;
@@ -413,14 +427,9 @@ static int mtk_iommu_v1_create_mapping(struct device *dev,
return -EINVAL;
}
- if (!fwspec) {
- ret = iommu_fwspec_init(dev, &args->np->fwnode, &mtk_iommu_v1_ops);
- if (ret)
- return ret;
- fwspec = dev_iommu_fwspec_get(dev);
- } else if (dev_iommu_fwspec_get(dev)->ops != &mtk_iommu_v1_ops) {
- return -EINVAL;
- }
+ ret = iommu_fwspec_init(dev, of_fwnode_handle(args->np));
+ if (ret)
+ return ret;
if (!dev_iommu_priv_get(dev)) {
/* Get the m4u device */
@@ -429,6 +438,8 @@ static int mtk_iommu_v1_create_mapping(struct device *dev,
return -EINVAL;
dev_iommu_priv_set(dev, platform_get_drvdata(m4updev));
+
+ put_device(&m4updev->dev);
}
ret = iommu_fwspec_add_ids(dev, args->args, 1);
@@ -439,8 +450,7 @@ static int mtk_iommu_v1_create_mapping(struct device *dev,
mtk_mapping = data->mapping;
if (!mtk_mapping) {
/* MTK iommu support 4GB iova address space. */
- mtk_mapping = arm_iommu_create_mapping(&platform_bus_type,
- 0, 1ULL << 32);
+ mtk_mapping = arm_iommu_create_mapping(dev, 0, 1ULL << 32);
if (IS_ERR(mtk_mapping))
return PTR_ERR(mtk_mapping);
@@ -452,22 +462,13 @@ static int mtk_iommu_v1_create_mapping(struct device *dev,
static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev)
{
- struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct iommu_fwspec *fwspec = NULL;
struct of_phandle_args iommu_spec;
struct mtk_iommu_v1_data *data;
int err, idx = 0, larbid, larbidx;
struct device_link *link;
struct device *larbdev;
- /*
- * In the deferred case, free the existed fwspec.
- * Always initialize the fwspec internally.
- */
- if (fwspec) {
- iommu_fwspec_free(dev);
- fwspec = dev_iommu_fwspec_get(dev);
- }
-
while (!of_parse_phandle_with_args(dev->of_node, "iommus",
"#iommu-cells",
idx, &iommu_spec)) {
@@ -482,6 +483,9 @@ static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev)
idx++;
}
+ if (!fwspec)
+ return ERR_PTR(-ENODEV);
+
data = dev_iommu_priv_get(dev);
/* Link the consumer device with the smi-larb device(supplier) */
@@ -512,14 +516,10 @@ static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev)
static void mtk_iommu_v1_probe_finalize(struct device *dev)
{
- struct dma_iommu_mapping *mtk_mapping;
- struct mtk_iommu_v1_data *data;
+ __maybe_unused struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev);
int err;
- data = dev_iommu_priv_get(dev);
- mtk_mapping = data->mapping;
-
- err = arm_iommu_attach_device(dev, mtk_mapping);
+ err = arm_iommu_attach_device(dev, data->mapping);
if (err)
dev_err(dev, "Can't create IOMMU mapping - DMA-OPS will not work\n");
}
@@ -585,7 +585,6 @@ static const struct iommu_ops mtk_iommu_v1_ops = {
.probe_finalize = mtk_iommu_v1_probe_finalize,
.release_device = mtk_iommu_v1_release_device,
.device_group = generic_device_group,
- .pgsize_bitmap = MT2701_IOMMU_PAGE_SIZE,
.owner = THIS_MODULE,
.default_domain_ops = &(const struct iommu_domain_ops) {
.attach_dev = mtk_iommu_v1_attach_device,
@@ -647,13 +646,18 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev)
if (larb_nr < 0)
return larb_nr;
+ if (larb_nr > MTK_LARB_NR_MAX)
+ return -EINVAL;
+
for (i = 0; i < larb_nr; i++) {
struct device_node *larbnode;
struct platform_device *plarbdev;
larbnode = of_parse_phandle(dev->of_node, "mediatek,larbs", i);
- if (!larbnode)
- return -EINVAL;
+ if (!larbnode) {
+ ret = -EINVAL;
+ goto out_put_larbs;
+ }
if (!of_device_is_available(larbnode)) {
of_node_put(larbnode);
@@ -663,11 +667,14 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev)
plarbdev = of_find_device_by_node(larbnode);
if (!plarbdev) {
of_node_put(larbnode);
- return -ENODEV;
+ ret = -ENODEV;
+ goto out_put_larbs;
}
if (!plarbdev->dev.driver) {
of_node_put(larbnode);
- return -EPROBE_DEFER;
+ put_device(&plarbdev->dev);
+ ret = -EPROBE_DEFER;
+ goto out_put_larbs;
}
data->larb_imu[i].dev = &plarbdev->dev;
@@ -679,7 +686,7 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev)
ret = mtk_iommu_v1_hw_init(data);
if (ret)
- return ret;
+ goto out_put_larbs;
ret = iommu_device_sysfs_add(&data->iommu, &pdev->dev, NULL,
dev_name(&pdev->dev));
@@ -701,12 +708,17 @@ out_sysfs_remove:
iommu_device_sysfs_remove(&data->iommu);
out_clk_unprepare:
clk_disable_unprepare(data->bclk);
+out_put_larbs:
+ for (i = 0; i < MTK_LARB_NR_MAX; i++)
+ put_device(data->larb_imu[i].dev);
+
return ret;
}
static void mtk_iommu_v1_remove(struct platform_device *pdev)
{
struct mtk_iommu_v1_data *data = platform_get_drvdata(pdev);
+ int i;
iommu_device_sysfs_remove(&data->iommu);
iommu_device_unregister(&data->iommu);
@@ -714,6 +726,9 @@ static void mtk_iommu_v1_remove(struct platform_device *pdev)
clk_disable_unprepare(data->bclk);
devm_free_irq(&pdev->dev, data->irq, data);
component_master_del(&pdev->dev, &mtk_iommu_v1_com_ops);
+
+ for (i = 0; i < MTK_LARB_NR_MAX; i++)
+ put_device(data->larb_imu[i].dev);
}
static int __maybe_unused mtk_iommu_v1_suspend(struct device *dev)
@@ -752,7 +767,7 @@ static const struct dev_pm_ops mtk_iommu_v1_pm_ops = {
static struct platform_driver mtk_iommu_v1_driver = {
.probe = mtk_iommu_v1_probe,
- .remove_new = mtk_iommu_v1_remove,
+ .remove = mtk_iommu_v1_remove,
.driver = {
.name = "mtk-iommu-v1",
.of_match_table = mtk_iommu_v1_of_ids,
diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index 3afe0b48a48d..6b989a62def2 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -17,30 +17,23 @@
#include <linux/slab.h>
#include <linux/fsl/mc.h>
+#include "iommu-priv.h"
+
static int of_iommu_xlate(struct device *dev,
struct of_phandle_args *iommu_spec)
{
const struct iommu_ops *ops;
- struct fwnode_handle *fwnode = &iommu_spec->np->fwnode;
int ret;
- ops = iommu_ops_from_fwnode(fwnode);
- if ((ops && !ops->of_xlate) ||
- !of_device_is_available(iommu_spec->np))
+ if (!of_device_is_available(iommu_spec->np))
return -ENODEV;
- ret = iommu_fwspec_init(dev, fwnode, ops);
+ ret = iommu_fwspec_init(dev, of_fwnode_handle(iommu_spec->np));
if (ret)
return ret;
- /*
- * The otherwise-empty fwspec handily serves to indicate the specific
- * IOMMU device we're waiting for, which will be useful if we ever get
- * a proper probe-ordering dependency mechanism in future.
- */
- if (!ops)
- return driver_deferred_probe_check_state(dev);
- if (!try_module_get(ops->owner))
+ ops = iommu_ops_from_fwnode(&iommu_spec->np->fwnode);
+ if (!ops->of_xlate || !try_module_get(ops->owner))
return -ENODEV;
ret = ops->of_xlate(dev, iommu_spec);
@@ -105,6 +98,14 @@ static int of_iommu_configure_device(struct device_node *master_np,
of_iommu_configure_dev(master_np, dev);
}
+static void of_pci_check_device_ats(struct device *dev, struct device_node *np)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+ if (fwspec && of_property_read_bool(np, "ats-supported"))
+ fwspec->flags |= IOMMU_FWSPEC_PCI_RC_ATS;
+}
+
/*
* Returns:
* 0 on success, an iommu was configured
@@ -115,7 +116,7 @@ static int of_iommu_configure_device(struct device_node *master_np,
int of_iommu_configure(struct device *dev, struct device_node *master_np,
const u32 *id)
{
- struct iommu_fwspec *fwspec;
+ bool dev_iommu_present;
int err;
if (!master_np)
@@ -123,15 +124,11 @@ int of_iommu_configure(struct device *dev, struct device_node *master_np,
/* Serialise to make dev->iommu stable under our potential fwspec */
mutex_lock(&iommu_probe_device_lock);
- fwspec = dev_iommu_fwspec_get(dev);
- if (fwspec) {
- if (fwspec->ops) {
- mutex_unlock(&iommu_probe_device_lock);
- return 0;
- }
- /* In the deferred case, start again from scratch */
- iommu_fwspec_free(dev);
+ if (dev_iommu_fwspec_get(dev)) {
+ mutex_unlock(&iommu_probe_device_lock);
+ return 0;
}
+ dev_iommu_present = dev->iommu;
/*
* We don't currently walk up the tree looking for a parent IOMMU.
@@ -147,23 +144,28 @@ int of_iommu_configure(struct device *dev, struct device_node *master_np,
pci_request_acs();
err = pci_for_each_dma_alias(to_pci_dev(dev),
of_pci_iommu_init, &info);
+ of_pci_check_device_ats(dev, master_np);
} else {
err = of_iommu_configure_device(master_np, dev, id);
}
+
+ if (err && dev_iommu_present)
+ iommu_fwspec_free(dev);
+ else if (err && dev->iommu)
+ dev_iommu_free(dev);
mutex_unlock(&iommu_probe_device_lock);
- if (err == -ENODEV || err == -EPROBE_DEFER)
- return err;
- if (err)
- goto err_log;
+ /*
+ * If we're not on the iommu_probe_device() path (as indicated by the
+ * initial dev->iommu) then try to simulate it. This should no longer
+ * happen unless of_dma_configure() is being misused outside bus code.
+ */
+ if (!err && dev->bus && !dev_iommu_present)
+ err = iommu_probe_device(dev);
- err = iommu_probe_device(dev);
- if (err)
- goto err_log;
- return 0;
+ if (err && err != -EPROBE_DEFER)
+ dev_dbg(dev, "Adding to IOMMU failed: %d\n", err);
-err_log:
- dev_dbg(dev, "Adding to IOMMU failed: %pe\n", ERR_PTR(err));
return err;
}
@@ -219,7 +221,7 @@ void of_iommu_get_resv_regions(struct device *dev, struct list_head *list)
* that represent reservations in the IOVA space, which are regions that should
* not be mapped.
*/
- if (of_find_property(it.node, "reg", NULL)) {
+ if (of_property_present(it.node, "reg")) {
err = of_address_to_resource(it.node, 0, &phys);
if (err < 0) {
dev_err(dev, "failed to parse memory region %pOF: %d\n",
diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
index c9528065a59a..768973b7e511 100644
--- a/drivers/iommu/omap-iommu.c
+++ b/drivers/iommu/omap-iommu.c
@@ -1123,29 +1123,15 @@ static int omap_iommu_dra7_get_dsp_system_cfg(struct platform_device *pdev,
struct omap_iommu *obj)
{
struct device_node *np = pdev->dev.of_node;
- int ret;
if (!of_device_is_compatible(np, "ti,dra7-dsp-iommu"))
return 0;
- if (!of_property_read_bool(np, "ti,syscon-mmuconfig")) {
- dev_err(&pdev->dev, "ti,syscon-mmuconfig property is missing\n");
- return -EINVAL;
- }
-
- obj->syscfg =
- syscon_regmap_lookup_by_phandle(np, "ti,syscon-mmuconfig");
- if (IS_ERR(obj->syscfg)) {
- /* can fail with -EPROBE_DEFER */
- ret = PTR_ERR(obj->syscfg);
- return ret;
- }
-
- if (of_property_read_u32_index(np, "ti,syscon-mmuconfig", 1,
- &obj->id)) {
- dev_err(&pdev->dev, "couldn't get the IOMMU instance id within subsystem\n");
- return -EINVAL;
- }
+ obj->syscfg = syscon_regmap_lookup_by_phandle_args(np, "ti,syscon-mmuconfig",
+ 1, &obj->id);
+ if (IS_ERR(obj->syscfg))
+ return dev_err_probe(&pdev->dev, PTR_ERR(obj->syscfg),
+ "ti,syscon-mmuconfig property is missing\n");
if (obj->id != 0 && obj->id != 1) {
dev_err(&pdev->dev, "invalid IOMMU instance id\n");
@@ -1230,25 +1216,24 @@ static int omap_iommu_probe(struct platform_device *pdev)
if (err)
return err;
- err = iommu_device_register(&obj->iommu, &omap_iommu_ops, &pdev->dev);
- if (err)
- goto out_sysfs;
obj->has_iommu_driver = true;
}
+ err = iommu_device_register(&obj->iommu, &omap_iommu_ops, &pdev->dev);
+ if (err)
+ goto out_sysfs;
+
pm_runtime_enable(obj->dev);
omap_iommu_debugfs_add(obj);
dev_info(&pdev->dev, "%s registered\n", obj->name);
- /* Re-probe bus to probe device attached to this IOMMU */
- bus_iommu_probe(&platform_bus_type);
-
return 0;
out_sysfs:
- iommu_device_sysfs_remove(&obj->iommu);
+ if (obj->has_iommu_driver)
+ iommu_device_sysfs_remove(&obj->iommu);
return err;
}
@@ -1256,10 +1241,10 @@ static void omap_iommu_remove(struct platform_device *pdev)
{
struct omap_iommu *obj = platform_get_drvdata(pdev);
- if (obj->has_iommu_driver) {
+ if (obj->has_iommu_driver)
iommu_device_sysfs_remove(&obj->iommu);
- iommu_device_unregister(&obj->iommu);
- }
+
+ iommu_device_unregister(&obj->iommu);
omap_iommu_debugfs_remove(obj);
@@ -1286,7 +1271,7 @@ static const struct of_device_id omap_iommu_of_match[] = {
static struct platform_driver omap_iommu_driver = {
.probe = omap_iommu_probe,
- .remove_new = omap_iommu_remove,
+ .remove = omap_iommu_remove,
.driver = {
.name = "omap-iommu",
.pm = &omap_iommu_pm_ops,
@@ -1318,8 +1303,8 @@ static int omap_iommu_map(struct iommu_domain *domain, unsigned long da,
struct omap_iommu_device *iommu;
struct omap_iommu *oiommu;
struct iotlb_entry e;
+ int ret = -EINVAL;
int omap_pgsz;
- u32 ret = -EINVAL;
int i;
omap_pgsz = bytes_to_iopgsz(bytes);
@@ -1446,8 +1431,8 @@ static void omap_iommu_detach_fini(struct omap_iommu_domain *odomain)
odomain->iommus = NULL;
}
-static int
-omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int omap_iommu_attach_dev(struct iommu_domain *domain,
+ struct device *dev, struct iommu_domain *old)
{
struct omap_iommu_arch_data *arch_data = dev_iommu_priv_get(dev);
struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
@@ -1551,15 +1536,15 @@ static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain,
}
static int omap_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct omap_iommu_domain *omap_domain;
- if (domain == identity_domain || !domain)
+ if (old == identity_domain || !old)
return 0;
- omap_domain = to_omap_domain(domain);
+ omap_domain = to_omap_domain(old);
spin_lock(&omap_domain->lock);
_omap_iommu_detach_dev(omap_domain, dev);
spin_unlock(&omap_domain->lock);
@@ -1585,6 +1570,8 @@ static struct iommu_domain *omap_iommu_domain_alloc_paging(struct device *dev)
spin_lock_init(&omap_domain->lock);
+ omap_domain->domain.pgsize_bitmap = OMAP_IOMMU_PGSIZES;
+
omap_domain->domain.geometry.aperture_start = 0;
omap_domain->domain.geometry.aperture_end = (1ULL << 32) - 1;
omap_domain->domain.geometry.force_aperture = true;
@@ -1681,23 +1668,20 @@ static struct iommu_device *omap_iommu_probe_device(struct device *dev)
}
pdev = of_find_device_by_node(np);
+ of_node_put(np);
if (!pdev) {
- of_node_put(np);
kfree(arch_data);
return ERR_PTR(-ENODEV);
}
oiommu = platform_get_drvdata(pdev);
+ put_device(&pdev->dev);
if (!oiommu) {
- of_node_put(np);
kfree(arch_data);
return ERR_PTR(-EINVAL);
}
tmp->iommu_dev = oiommu;
- tmp->dev = &pdev->dev;
-
- of_node_put(np);
}
dev_iommu_priv_set(dev, arch_data);
@@ -1723,13 +1707,19 @@ static void omap_iommu_release_device(struct device *dev)
}
+static int omap_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args)
+{
+ /* TODO: collect args->np to save re-parsing in probe above */
+ return 0;
+}
+
static const struct iommu_ops omap_iommu_ops = {
.identity_domain = &omap_iommu_identity_domain,
.domain_alloc_paging = omap_iommu_domain_alloc_paging,
.probe_device = omap_iommu_probe_device,
.release_device = omap_iommu_release_device,
.device_group = generic_single_device_group,
- .pgsize_bitmap = OMAP_IOMMU_PGSIZES,
+ .of_xlate = omap_iommu_of_xlate,
.default_domain_ops = &(const struct iommu_domain_ops) {
.attach_dev = omap_iommu_attach_dev,
.map_pages = omap_iommu_map,
diff --git a/drivers/iommu/omap-iommu.h b/drivers/iommu/omap-iommu.h
index 27697109ec79..50b39be61abc 100644
--- a/drivers/iommu/omap-iommu.h
+++ b/drivers/iommu/omap-iommu.h
@@ -88,7 +88,6 @@ struct omap_iommu {
/**
* struct omap_iommu_arch_data - omap iommu private data
* @iommu_dev: handle of the OMAP iommu device
- * @dev: handle of the iommu device
*
* This is an omap iommu private data object, which binds an iommu user
* to its iommu device. This object should be placed at the iommu user's
@@ -97,7 +96,6 @@ struct omap_iommu {
*/
struct omap_iommu_arch_data {
struct omap_iommu *iommu_dev;
- struct device *dev;
};
struct cr_regs {
diff --git a/drivers/iommu/riscv/Kconfig b/drivers/iommu/riscv/Kconfig
new file mode 100644
index 000000000000..c071816f59a6
--- /dev/null
+++ b/drivers/iommu/riscv/Kconfig
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# RISC-V IOMMU support
+
+config RISCV_IOMMU
+ bool "RISC-V IOMMU Support"
+ depends on RISCV && 64BIT
+ default y
+ select IOMMU_API
+ help
+ Support for implementations of the RISC-V IOMMU architecture that
+ complements the RISC-V MMU capabilities, providing similar address
+ translation and protection functions for accesses from I/O devices.
+
+ Say Y here if your SoC includes an IOMMU device implementing
+ the RISC-V IOMMU architecture.
+
+config RISCV_IOMMU_PCI
+ def_bool y if RISCV_IOMMU && PCI_MSI
+ help
+ Support for the PCIe implementation of RISC-V IOMMU architecture.
diff --git a/drivers/iommu/riscv/Makefile b/drivers/iommu/riscv/Makefile
new file mode 100644
index 000000000000..b5929f9f23e6
--- /dev/null
+++ b/drivers/iommu/riscv/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-y += iommu.o iommu-platform.o
+obj-$(CONFIG_RISCV_IOMMU_PCI) += iommu-pci.o
diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h
new file mode 100644
index 000000000000..98daf0e1a306
--- /dev/null
+++ b/drivers/iommu/riscv/iommu-bits.h
@@ -0,0 +1,784 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright © 2022-2024 Rivos Inc.
+ * Copyright © 2023 FORTH-ICS/CARV
+ * Copyright © 2023 RISC-V IOMMU Task Group
+ *
+ * RISC-V IOMMU - Register Layout and Data Structures.
+ *
+ * Based on the 'RISC-V IOMMU Architecture Specification', Version 1.0
+ * Published at https://github.com/riscv-non-isa/riscv-iommu
+ *
+ */
+
+#ifndef _RISCV_IOMMU_BITS_H_
+#define _RISCV_IOMMU_BITS_H_
+
+#include <linux/types.h>
+#include <linux/bitfield.h>
+#include <linux/bits.h>
+
+/*
+ * Chapter 5: Memory Mapped register interface
+ */
+
+/* Common field positions */
+#define RISCV_IOMMU_PPN_FIELD GENMASK_ULL(53, 10)
+#define RISCV_IOMMU_QUEUE_LOG2SZ_FIELD GENMASK_ULL(4, 0)
+#define RISCV_IOMMU_QUEUE_INDEX_FIELD GENMASK_ULL(31, 0)
+#define RISCV_IOMMU_QUEUE_ENABLE BIT(0)
+#define RISCV_IOMMU_QUEUE_INTR_ENABLE BIT(1)
+#define RISCV_IOMMU_QUEUE_MEM_FAULT BIT(8)
+#define RISCV_IOMMU_QUEUE_OVERFLOW BIT(9)
+#define RISCV_IOMMU_QUEUE_ACTIVE BIT(16)
+#define RISCV_IOMMU_QUEUE_BUSY BIT(17)
+
+#define RISCV_IOMMU_ATP_PPN_FIELD GENMASK_ULL(43, 0)
+#define RISCV_IOMMU_ATP_MODE_FIELD GENMASK_ULL(63, 60)
+
+/* 5.3 IOMMU Capabilities (64bits) */
+#define RISCV_IOMMU_REG_CAPABILITIES 0x0000
+#define RISCV_IOMMU_CAPABILITIES_VERSION GENMASK_ULL(7, 0)
+#define RISCV_IOMMU_CAPABILITIES_SV32 BIT_ULL(8)
+#define RISCV_IOMMU_CAPABILITIES_SV39 BIT_ULL(9)
+#define RISCV_IOMMU_CAPABILITIES_SV48 BIT_ULL(10)
+#define RISCV_IOMMU_CAPABILITIES_SV57 BIT_ULL(11)
+#define RISCV_IOMMU_CAPABILITIES_SVPBMT BIT_ULL(15)
+#define RISCV_IOMMU_CAPABILITIES_SV32X4 BIT_ULL(16)
+#define RISCV_IOMMU_CAPABILITIES_SV39X4 BIT_ULL(17)
+#define RISCV_IOMMU_CAPABILITIES_SV48X4 BIT_ULL(18)
+#define RISCV_IOMMU_CAPABILITIES_SV57X4 BIT_ULL(19)
+#define RISCV_IOMMU_CAPABILITIES_AMO_MRIF BIT_ULL(21)
+#define RISCV_IOMMU_CAPABILITIES_MSI_FLAT BIT_ULL(22)
+#define RISCV_IOMMU_CAPABILITIES_MSI_MRIF BIT_ULL(23)
+#define RISCV_IOMMU_CAPABILITIES_AMO_HWAD BIT_ULL(24)
+#define RISCV_IOMMU_CAPABILITIES_ATS BIT_ULL(25)
+#define RISCV_IOMMU_CAPABILITIES_T2GPA BIT_ULL(26)
+#define RISCV_IOMMU_CAPABILITIES_END BIT_ULL(27)
+#define RISCV_IOMMU_CAPABILITIES_IGS GENMASK_ULL(29, 28)
+#define RISCV_IOMMU_CAPABILITIES_HPM BIT_ULL(30)
+#define RISCV_IOMMU_CAPABILITIES_DBG BIT_ULL(31)
+#define RISCV_IOMMU_CAPABILITIES_PAS GENMASK_ULL(37, 32)
+#define RISCV_IOMMU_CAPABILITIES_PD8 BIT_ULL(38)
+#define RISCV_IOMMU_CAPABILITIES_PD17 BIT_ULL(39)
+#define RISCV_IOMMU_CAPABILITIES_PD20 BIT_ULL(40)
+
+/**
+ * enum riscv_iommu_igs_settings - Interrupt Generation Support Settings
+ * @RISCV_IOMMU_CAPABILITIES_IGS_MSI: IOMMU supports only MSI generation
+ * @RISCV_IOMMU_CAPABILITIES_IGS_WSI: IOMMU supports only Wired-Signaled interrupt
+ * @RISCV_IOMMU_CAPABILITIES_IGS_BOTH: IOMMU supports both MSI and WSI generation
+ * @RISCV_IOMMU_CAPABILITIES_IGS_RSRV: Reserved for standard use
+ */
+enum riscv_iommu_igs_settings {
+ RISCV_IOMMU_CAPABILITIES_IGS_MSI = 0,
+ RISCV_IOMMU_CAPABILITIES_IGS_WSI = 1,
+ RISCV_IOMMU_CAPABILITIES_IGS_BOTH = 2,
+ RISCV_IOMMU_CAPABILITIES_IGS_RSRV = 3
+};
+
+/* 5.4 Features control register (32bits) */
+#define RISCV_IOMMU_REG_FCTL 0x0008
+#define RISCV_IOMMU_FCTL_BE BIT(0)
+#define RISCV_IOMMU_FCTL_WSI BIT(1)
+#define RISCV_IOMMU_FCTL_GXL BIT(2)
+
+/* 5.5 Device-directory-table pointer (64bits) */
+#define RISCV_IOMMU_REG_DDTP 0x0010
+#define RISCV_IOMMU_DDTP_IOMMU_MODE GENMASK_ULL(3, 0)
+#define RISCV_IOMMU_DDTP_BUSY BIT_ULL(4)
+#define RISCV_IOMMU_DDTP_PPN RISCV_IOMMU_PPN_FIELD
+
+/**
+ * enum riscv_iommu_ddtp_modes - IOMMU translation modes
+ * @RISCV_IOMMU_DDTP_IOMMU_MODE_OFF: No inbound transactions allowed
+ * @RISCV_IOMMU_DDTP_IOMMU_MODE_BARE: Pass-through mode
+ * @RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL: One-level DDT
+ * @RISCV_IOMMU_DDTP_IOMMU_MODE_2LVL: Two-level DDT
+ * @RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL: Three-level DDT
+ * @RISCV_IOMMU_DDTP_IOMMU_MODE_MAX: Max value allowed by specification
+ */
+enum riscv_iommu_ddtp_modes {
+ RISCV_IOMMU_DDTP_IOMMU_MODE_OFF = 0,
+ RISCV_IOMMU_DDTP_IOMMU_MODE_BARE = 1,
+ RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL = 2,
+ RISCV_IOMMU_DDTP_IOMMU_MODE_2LVL = 3,
+ RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL = 4,
+ RISCV_IOMMU_DDTP_IOMMU_MODE_MAX = 4
+};
+
+/* 5.6 Command Queue Base (64bits) */
+#define RISCV_IOMMU_REG_CQB 0x0018
+#define RISCV_IOMMU_CQB_ENTRIES RISCV_IOMMU_QUEUE_LOG2SZ_FIELD
+#define RISCV_IOMMU_CQB_PPN RISCV_IOMMU_PPN_FIELD
+
+/* 5.7 Command Queue head (32bits) */
+#define RISCV_IOMMU_REG_CQH 0x0020
+#define RISCV_IOMMU_CQH_INDEX RISCV_IOMMU_QUEUE_INDEX_FIELD
+
+/* 5.8 Command Queue tail (32bits) */
+#define RISCV_IOMMU_REG_CQT 0x0024
+#define RISCV_IOMMU_CQT_INDEX RISCV_IOMMU_QUEUE_INDEX_FIELD
+
+/* 5.9 Fault Queue Base (64bits) */
+#define RISCV_IOMMU_REG_FQB 0x0028
+#define RISCV_IOMMU_FQB_ENTRIES RISCV_IOMMU_QUEUE_LOG2SZ_FIELD
+#define RISCV_IOMMU_FQB_PPN RISCV_IOMMU_PPN_FIELD
+
+/* 5.10 Fault Queue Head (32bits) */
+#define RISCV_IOMMU_REG_FQH 0x0030
+#define RISCV_IOMMU_FQH_INDEX RISCV_IOMMU_QUEUE_INDEX_FIELD
+
+/* 5.11 Fault Queue tail (32bits) */
+#define RISCV_IOMMU_REG_FQT 0x0034
+#define RISCV_IOMMU_FQT_INDEX RISCV_IOMMU_QUEUE_INDEX_FIELD
+
+/* 5.12 Page Request Queue base (64bits) */
+#define RISCV_IOMMU_REG_PQB 0x0038
+#define RISCV_IOMMU_PQB_ENTRIES RISCV_IOMMU_QUEUE_LOG2SZ_FIELD
+#define RISCV_IOMMU_PQB_PPN RISCV_IOMMU_PPN_FIELD
+
+/* 5.13 Page Request Queue head (32bits) */
+#define RISCV_IOMMU_REG_PQH 0x0040
+#define RISCV_IOMMU_PQH_INDEX RISCV_IOMMU_QUEUE_INDEX_FIELD
+
+/* 5.14 Page Request Queue tail (32bits) */
+#define RISCV_IOMMU_REG_PQT 0x0044
+#define RISCV_IOMMU_PQT_INDEX_MASK RISCV_IOMMU_QUEUE_INDEX_FIELD
+
+/* 5.15 Command Queue CSR (32bits) */
+#define RISCV_IOMMU_REG_CQCSR 0x0048
+#define RISCV_IOMMU_CQCSR_CQEN RISCV_IOMMU_QUEUE_ENABLE
+#define RISCV_IOMMU_CQCSR_CIE RISCV_IOMMU_QUEUE_INTR_ENABLE
+#define RISCV_IOMMU_CQCSR_CQMF RISCV_IOMMU_QUEUE_MEM_FAULT
+#define RISCV_IOMMU_CQCSR_CMD_TO BIT(9)
+#define RISCV_IOMMU_CQCSR_CMD_ILL BIT(10)
+#define RISCV_IOMMU_CQCSR_FENCE_W_IP BIT(11)
+#define RISCV_IOMMU_CQCSR_CQON RISCV_IOMMU_QUEUE_ACTIVE
+#define RISCV_IOMMU_CQCSR_BUSY RISCV_IOMMU_QUEUE_BUSY
+
+/* 5.16 Fault Queue CSR (32bits) */
+#define RISCV_IOMMU_REG_FQCSR 0x004C
+#define RISCV_IOMMU_FQCSR_FQEN RISCV_IOMMU_QUEUE_ENABLE
+#define RISCV_IOMMU_FQCSR_FIE RISCV_IOMMU_QUEUE_INTR_ENABLE
+#define RISCV_IOMMU_FQCSR_FQMF RISCV_IOMMU_QUEUE_MEM_FAULT
+#define RISCV_IOMMU_FQCSR_FQOF RISCV_IOMMU_QUEUE_OVERFLOW
+#define RISCV_IOMMU_FQCSR_FQON RISCV_IOMMU_QUEUE_ACTIVE
+#define RISCV_IOMMU_FQCSR_BUSY RISCV_IOMMU_QUEUE_BUSY
+
+/* 5.17 Page Request Queue CSR (32bits) */
+#define RISCV_IOMMU_REG_PQCSR 0x0050
+#define RISCV_IOMMU_PQCSR_PQEN RISCV_IOMMU_QUEUE_ENABLE
+#define RISCV_IOMMU_PQCSR_PIE RISCV_IOMMU_QUEUE_INTR_ENABLE
+#define RISCV_IOMMU_PQCSR_PQMF RISCV_IOMMU_QUEUE_MEM_FAULT
+#define RISCV_IOMMU_PQCSR_PQOF RISCV_IOMMU_QUEUE_OVERFLOW
+#define RISCV_IOMMU_PQCSR_PQON RISCV_IOMMU_QUEUE_ACTIVE
+#define RISCV_IOMMU_PQCSR_BUSY RISCV_IOMMU_QUEUE_BUSY
+
+/* 5.18 Interrupt Pending Status (32bits) */
+#define RISCV_IOMMU_REG_IPSR 0x0054
+
+#define RISCV_IOMMU_INTR_CQ 0
+#define RISCV_IOMMU_INTR_FQ 1
+#define RISCV_IOMMU_INTR_PM 2
+#define RISCV_IOMMU_INTR_PQ 3
+#define RISCV_IOMMU_INTR_COUNT 4
+
+#define RISCV_IOMMU_IPSR_CIP BIT(RISCV_IOMMU_INTR_CQ)
+#define RISCV_IOMMU_IPSR_FIP BIT(RISCV_IOMMU_INTR_FQ)
+#define RISCV_IOMMU_IPSR_PMIP BIT(RISCV_IOMMU_INTR_PM)
+#define RISCV_IOMMU_IPSR_PIP BIT(RISCV_IOMMU_INTR_PQ)
+
+/* 5.19 Performance monitoring counter overflow status (32bits) */
+#define RISCV_IOMMU_REG_IOCOUNTOVF 0x0058
+#define RISCV_IOMMU_IOCOUNTOVF_CY BIT(0)
+#define RISCV_IOMMU_IOCOUNTOVF_HPM GENMASK_ULL(31, 1)
+
+/* 5.20 Performance monitoring counter inhibits (32bits) */
+#define RISCV_IOMMU_REG_IOCOUNTINH 0x005C
+#define RISCV_IOMMU_IOCOUNTINH_CY BIT(0)
+#define RISCV_IOMMU_IOCOUNTINH_HPM GENMASK(31, 1)
+
+/* 5.21 Performance monitoring cycles counter (64bits) */
+#define RISCV_IOMMU_REG_IOHPMCYCLES 0x0060
+#define RISCV_IOMMU_IOHPMCYCLES_COUNTER GENMASK_ULL(62, 0)
+#define RISCV_IOMMU_IOHPMCYCLES_OF BIT_ULL(63)
+
+/* 5.22 Performance monitoring event counters (31 * 64bits) */
+#define RISCV_IOMMU_REG_IOHPMCTR_BASE 0x0068
+#define RISCV_IOMMU_REG_IOHPMCTR(_n) (RISCV_IOMMU_REG_IOHPMCTR_BASE + ((_n) * 0x8))
+
+/* 5.23 Performance monitoring event selectors (31 * 64bits) */
+#define RISCV_IOMMU_REG_IOHPMEVT_BASE 0x0160
+#define RISCV_IOMMU_REG_IOHPMEVT(_n) (RISCV_IOMMU_REG_IOHPMEVT_BASE + ((_n) * 0x8))
+#define RISCV_IOMMU_IOHPMEVT_EVENTID GENMASK_ULL(14, 0)
+#define RISCV_IOMMU_IOHPMEVT_DMASK BIT_ULL(15)
+#define RISCV_IOMMU_IOHPMEVT_PID_PSCID GENMASK_ULL(35, 16)
+#define RISCV_IOMMU_IOHPMEVT_DID_GSCID GENMASK_ULL(59, 36)
+#define RISCV_IOMMU_IOHPMEVT_PV_PSCV BIT_ULL(60)
+#define RISCV_IOMMU_IOHPMEVT_DV_GSCV BIT_ULL(61)
+#define RISCV_IOMMU_IOHPMEVT_IDT BIT_ULL(62)
+#define RISCV_IOMMU_IOHPMEVT_OF BIT_ULL(63)
+
+/* Number of defined performance-monitoring event selectors */
+#define RISCV_IOMMU_IOHPMEVT_CNT 31
+
+/**
+ * enum riscv_iommu_hpmevent_id - Performance-monitoring event identifier
+ *
+ * @RISCV_IOMMU_HPMEVENT_INVALID: Invalid event, do not count
+ * @RISCV_IOMMU_HPMEVENT_URQ: Untranslated requests
+ * @RISCV_IOMMU_HPMEVENT_TRQ: Translated requests
+ * @RISCV_IOMMU_HPMEVENT_ATS_RQ: ATS translation requests
+ * @RISCV_IOMMU_HPMEVENT_TLB_MISS: TLB misses
+ * @RISCV_IOMMU_HPMEVENT_DD_WALK: Device directory walks
+ * @RISCV_IOMMU_HPMEVENT_PD_WALK: Process directory walks
+ * @RISCV_IOMMU_HPMEVENT_S_VS_WALKS: First-stage page table walks
+ * @RISCV_IOMMU_HPMEVENT_G_WALKS: Second-stage page table walks
+ * @RISCV_IOMMU_HPMEVENT_MAX: Value to denote maximum Event IDs
+ */
+enum riscv_iommu_hpmevent_id {
+ RISCV_IOMMU_HPMEVENT_INVALID = 0,
+ RISCV_IOMMU_HPMEVENT_URQ = 1,
+ RISCV_IOMMU_HPMEVENT_TRQ = 2,
+ RISCV_IOMMU_HPMEVENT_ATS_RQ = 3,
+ RISCV_IOMMU_HPMEVENT_TLB_MISS = 4,
+ RISCV_IOMMU_HPMEVENT_DD_WALK = 5,
+ RISCV_IOMMU_HPMEVENT_PD_WALK = 6,
+ RISCV_IOMMU_HPMEVENT_S_VS_WALKS = 7,
+ RISCV_IOMMU_HPMEVENT_G_WALKS = 8,
+ RISCV_IOMMU_HPMEVENT_MAX = 9
+};
+
+/* 5.24 Translation request IOVA (64bits) */
+#define RISCV_IOMMU_REG_TR_REQ_IOVA 0x0258
+#define RISCV_IOMMU_TR_REQ_IOVA_VPN GENMASK_ULL(63, 12)
+
+/* 5.25 Translation request control (64bits) */
+#define RISCV_IOMMU_REG_TR_REQ_CTL 0x0260
+#define RISCV_IOMMU_TR_REQ_CTL_GO_BUSY BIT_ULL(0)
+#define RISCV_IOMMU_TR_REQ_CTL_PRIV BIT_ULL(1)
+#define RISCV_IOMMU_TR_REQ_CTL_EXE BIT_ULL(2)
+#define RISCV_IOMMU_TR_REQ_CTL_NW BIT_ULL(3)
+#define RISCV_IOMMU_TR_REQ_CTL_PID GENMASK_ULL(31, 12)
+#define RISCV_IOMMU_TR_REQ_CTL_PV BIT_ULL(32)
+#define RISCV_IOMMU_TR_REQ_CTL_DID GENMASK_ULL(63, 40)
+
+/* 5.26 Translation request response (64bits) */
+#define RISCV_IOMMU_REG_TR_RESPONSE 0x0268
+#define RISCV_IOMMU_TR_RESPONSE_FAULT BIT_ULL(0)
+#define RISCV_IOMMU_TR_RESPONSE_PBMT GENMASK_ULL(8, 7)
+#define RISCV_IOMMU_TR_RESPONSE_SZ BIT_ULL(9)
+#define RISCV_IOMMU_TR_RESPONSE_PPN RISCV_IOMMU_PPN_FIELD
+
+/* 5.27 Interrupt cause to vector (64bits) */
+#define RISCV_IOMMU_REG_ICVEC 0x02F8
+#define RISCV_IOMMU_ICVEC_CIV GENMASK_ULL(3, 0)
+#define RISCV_IOMMU_ICVEC_FIV GENMASK_ULL(7, 4)
+#define RISCV_IOMMU_ICVEC_PMIV GENMASK_ULL(11, 8)
+#define RISCV_IOMMU_ICVEC_PIV GENMASK_ULL(15, 12)
+
+/* 5.28 MSI Configuration table (32 * 64bits) */
+#define RISCV_IOMMU_REG_MSI_CFG_TBL 0x0300
+#define RISCV_IOMMU_REG_MSI_CFG_TBL_ADDR(_n) \
+ (RISCV_IOMMU_REG_MSI_CFG_TBL + ((_n) * 0x10))
+#define RISCV_IOMMU_MSI_CFG_TBL_ADDR GENMASK_ULL(55, 2)
+#define RISCV_IOMMU_REG_MSI_CFG_TBL_DATA(_n) \
+ (RISCV_IOMMU_REG_MSI_CFG_TBL + ((_n) * 0x10) + 0x08)
+#define RISCV_IOMMU_MSI_CFG_TBL_DATA GENMASK_ULL(31, 0)
+#define RISCV_IOMMU_REG_MSI_CFG_TBL_CTRL(_n) \
+ (RISCV_IOMMU_REG_MSI_CFG_TBL + ((_n) * 0x10) + 0x0C)
+#define RISCV_IOMMU_MSI_CFG_TBL_CTRL_M BIT_ULL(0)
+
+#define RISCV_IOMMU_REG_SIZE 0x1000
+
+/*
+ * Chapter 2: Data structures
+ */
+
+/*
+ * Device Directory Table macros for non-leaf nodes
+ */
+#define RISCV_IOMMU_DDTE_V BIT_ULL(0)
+#define RISCV_IOMMU_DDTE_PPN RISCV_IOMMU_PPN_FIELD
+
+/**
+ * struct riscv_iommu_dc - Device Context
+ * @tc: Translation Control
+ * @iohgatp: I/O Hypervisor guest address translation and protection
+ * (Second stage context)
+ * @ta: Translation Attributes
+ * @fsc: First stage context
+ * @msiptp: MSI page table pointer
+ * @msi_addr_mask: MSI address mask
+ * @msi_addr_pattern: MSI address pattern
+ * @_reserved: Reserved for future use, padding
+ *
+ * This structure is used for leaf nodes on the Device Directory Table,
+ * in case RISCV_IOMMU_CAPABILITIES_MSI_FLAT is not set, the bottom 4 fields
+ * are not present and are skipped with pointer arithmetic to avoid
+ * casting, check out riscv_iommu_get_dc().
+ * See section 2.1 for more details
+ */
+struct riscv_iommu_dc {
+ u64 tc;
+ u64 iohgatp;
+ u64 ta;
+ u64 fsc;
+ u64 msiptp;
+ u64 msi_addr_mask;
+ u64 msi_addr_pattern;
+ u64 _reserved;
+};
+
+/* Translation control fields */
+#define RISCV_IOMMU_DC_TC_V BIT_ULL(0)
+#define RISCV_IOMMU_DC_TC_EN_ATS BIT_ULL(1)
+#define RISCV_IOMMU_DC_TC_EN_PRI BIT_ULL(2)
+#define RISCV_IOMMU_DC_TC_T2GPA BIT_ULL(3)
+#define RISCV_IOMMU_DC_TC_DTF BIT_ULL(4)
+#define RISCV_IOMMU_DC_TC_PDTV BIT_ULL(5)
+#define RISCV_IOMMU_DC_TC_PRPR BIT_ULL(6)
+#define RISCV_IOMMU_DC_TC_GADE BIT_ULL(7)
+#define RISCV_IOMMU_DC_TC_SADE BIT_ULL(8)
+#define RISCV_IOMMU_DC_TC_DPE BIT_ULL(9)
+#define RISCV_IOMMU_DC_TC_SBE BIT_ULL(10)
+#define RISCV_IOMMU_DC_TC_SXL BIT_ULL(11)
+
+/* Second-stage (aka G-stage) context fields */
+#define RISCV_IOMMU_DC_IOHGATP_PPN RISCV_IOMMU_ATP_PPN_FIELD
+#define RISCV_IOMMU_DC_IOHGATP_GSCID GENMASK_ULL(59, 44)
+#define RISCV_IOMMU_DC_IOHGATP_MODE RISCV_IOMMU_ATP_MODE_FIELD
+
+/**
+ * enum riscv_iommu_dc_iohgatp_modes - Guest address translation/protection modes
+ * @RISCV_IOMMU_DC_IOHGATP_MODE_BARE: No translation/protection
+ * @RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4: Sv32x4 (2-bit extension of Sv32), when fctl.GXL == 1
+ * @RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4: Sv39x4 (2-bit extension of Sv39), when fctl.GXL == 0
+ * @RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4: Sv48x4 (2-bit extension of Sv48), when fctl.GXL == 0
+ * @RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4: Sv57x4 (2-bit extension of Sv57), when fctl.GXL == 0
+ */
+enum riscv_iommu_dc_iohgatp_modes {
+ RISCV_IOMMU_DC_IOHGATP_MODE_BARE = 0,
+ RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4 = 8,
+ RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4 = 8,
+ RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4 = 9,
+ RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4 = 10
+};
+
+/* Translation attributes fields */
+#define RISCV_IOMMU_DC_TA_PSCID GENMASK_ULL(31, 12)
+
+/* First-stage context fields */
+#define RISCV_IOMMU_DC_FSC_PPN RISCV_IOMMU_ATP_PPN_FIELD
+#define RISCV_IOMMU_DC_FSC_MODE RISCV_IOMMU_ATP_MODE_FIELD
+
+/**
+ * enum riscv_iommu_dc_fsc_atp_modes - First stage address translation/protection modes
+ * @RISCV_IOMMU_DC_FSC_MODE_BARE: No translation/protection
+ * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32: Sv32, when dc.tc.SXL == 1
+ * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39: Sv39, when dc.tc.SXL == 0
+ * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48: Sv48, when dc.tc.SXL == 0
+ * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57: Sv57, when dc.tc.SXL == 0
+ * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8: 1lvl PDT, 8bit process ids
+ * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17: 2lvl PDT, 17bit process ids
+ * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20: 3lvl PDT, 20bit process ids
+ *
+ * FSC holds IOSATP when RISCV_IOMMU_DC_TC_PDTV is 0 and PDTP otherwise.
+ * IOSATP controls the first stage address translation (same as the satp register on
+ * the RISC-V MMU), and PDTP holds the process directory table, used to select a
+ * first stage page table based on a process id (for devices that support multiple
+ * process ids).
+ */
+enum riscv_iommu_dc_fsc_atp_modes {
+ RISCV_IOMMU_DC_FSC_MODE_BARE = 0,
+ RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32 = 8,
+ RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 = 8,
+ RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48 = 9,
+ RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57 = 10,
+ RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8 = 1,
+ RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17 = 2,
+ RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20 = 3
+};
+
+/* MSI page table pointer */
+#define RISCV_IOMMU_DC_MSIPTP_PPN RISCV_IOMMU_ATP_PPN_FIELD
+#define RISCV_IOMMU_DC_MSIPTP_MODE RISCV_IOMMU_ATP_MODE_FIELD
+#define RISCV_IOMMU_DC_MSIPTP_MODE_OFF 0
+#define RISCV_IOMMU_DC_MSIPTP_MODE_FLAT 1
+
+/* MSI address mask */
+#define RISCV_IOMMU_DC_MSI_ADDR_MASK GENMASK_ULL(51, 0)
+
+/* MSI address pattern */
+#define RISCV_IOMMU_DC_MSI_PATTERN GENMASK_ULL(51, 0)
+
+/**
+ * struct riscv_iommu_pc - Process Context
+ * @ta: Translation Attributes
+ * @fsc: First stage context
+ *
+ * This structure is used for leaf nodes on the Process Directory Table
+ * See section 2.3 for more details
+ */
+struct riscv_iommu_pc {
+ u64 ta;
+ u64 fsc;
+};
+
+/* Translation attributes fields */
+#define RISCV_IOMMU_PC_TA_V BIT_ULL(0)
+#define RISCV_IOMMU_PC_TA_ENS BIT_ULL(1)
+#define RISCV_IOMMU_PC_TA_SUM BIT_ULL(2)
+#define RISCV_IOMMU_PC_TA_PSCID GENMASK_ULL(31, 12)
+
+/* First stage context fields */
+#define RISCV_IOMMU_PC_FSC_PPN RISCV_IOMMU_ATP_PPN_FIELD
+#define RISCV_IOMMU_PC_FSC_MODE RISCV_IOMMU_ATP_MODE_FIELD
+
+/*
+ * Chapter 3: In-memory queue interface
+ */
+
+/**
+ * struct riscv_iommu_command - Generic IOMMU command structure
+ * @dword0: Includes the opcode and the function identifier
+ * @dword1: Opcode specific data
+ *
+ * The commands are interpreted as two 64bit fields, where the first
+ * 7bits of the first field are the opcode which also defines the
+ * command's format, followed by a 3bit field that specifies the
+ * function invoked by that command, and the rest is opcode-specific.
+ * This is a generic struct which will be populated differently
+ * according to each command. For more infos on the commands and
+ * the command queue check section 3.1.
+ */
+struct riscv_iommu_command {
+ u64 dword0;
+ u64 dword1;
+};
+
+/* Fields on dword0, common for all commands */
+#define RISCV_IOMMU_CMD_OPCODE GENMASK_ULL(6, 0)
+#define RISCV_IOMMU_CMD_FUNC GENMASK_ULL(9, 7)
+
+/* 3.1.1 IOMMU Page-table cache invalidation */
+/* Fields on dword0 */
+#define RISCV_IOMMU_CMD_IOTINVAL_OPCODE 1
+#define RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA 0
+#define RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA 1
+#define RISCV_IOMMU_CMD_IOTINVAL_AV BIT_ULL(10)
+#define RISCV_IOMMU_CMD_IOTINVAL_PSCID GENMASK_ULL(31, 12)
+#define RISCV_IOMMU_CMD_IOTINVAL_PSCV BIT_ULL(32)
+#define RISCV_IOMMU_CMD_IOTINVAL_GV BIT_ULL(33)
+#define RISCV_IOMMU_CMD_IOTINVAL_GSCID GENMASK_ULL(59, 44)
+/* dword1[61:10] is the 4K-aligned page address */
+#define RISCV_IOMMU_CMD_IOTINVAL_ADDR GENMASK_ULL(61, 10)
+
+/* 3.1.2 IOMMU Command Queue Fences */
+/* Fields on dword0 */
+#define RISCV_IOMMU_CMD_IOFENCE_OPCODE 2
+#define RISCV_IOMMU_CMD_IOFENCE_FUNC_C 0
+#define RISCV_IOMMU_CMD_IOFENCE_AV BIT_ULL(10)
+#define RISCV_IOMMU_CMD_IOFENCE_WSI BIT_ULL(11)
+#define RISCV_IOMMU_CMD_IOFENCE_PR BIT_ULL(12)
+#define RISCV_IOMMU_CMD_IOFENCE_PW BIT_ULL(13)
+#define RISCV_IOMMU_CMD_IOFENCE_DATA GENMASK_ULL(63, 32)
+/* dword1 is the address, word-size aligned and shifted to the right by two bits. */
+
+/* 3.1.3 IOMMU Directory cache invalidation */
+/* Fields on dword0 */
+#define RISCV_IOMMU_CMD_IODIR_OPCODE 3
+#define RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT 0
+#define RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT 1
+#define RISCV_IOMMU_CMD_IODIR_PID GENMASK_ULL(31, 12)
+#define RISCV_IOMMU_CMD_IODIR_DV BIT_ULL(33)
+#define RISCV_IOMMU_CMD_IODIR_DID GENMASK_ULL(63, 40)
+/* dword1 is reserved for standard use */
+
+/* 3.1.4 IOMMU PCIe ATS */
+/* Fields on dword0 */
+#define RISCV_IOMMU_CMD_ATS_OPCODE 4
+#define RISCV_IOMMU_CMD_ATS_FUNC_INVAL 0
+#define RISCV_IOMMU_CMD_ATS_FUNC_PRGR 1
+#define RISCV_IOMMU_CMD_ATS_PID GENMASK_ULL(31, 12)
+#define RISCV_IOMMU_CMD_ATS_PV BIT_ULL(32)
+#define RISCV_IOMMU_CMD_ATS_DSV BIT_ULL(33)
+#define RISCV_IOMMU_CMD_ATS_RID GENMASK_ULL(55, 40)
+#define RISCV_IOMMU_CMD_ATS_DSEG GENMASK_ULL(63, 56)
+/* dword1 is the ATS payload, two different payload types for INVAL and PRGR */
+
+/* ATS.INVAL payload*/
+#define RISCV_IOMMU_CMD_ATS_INVAL_G BIT_ULL(0)
+/* Bits 1 - 10 are zeroed */
+#define RISCV_IOMMU_CMD_ATS_INVAL_S BIT_ULL(11)
+#define RISCV_IOMMU_CMD_ATS_INVAL_UADDR GENMASK_ULL(63, 12)
+
+/* ATS.PRGR payload */
+/* Bits 0 - 31 are zeroed */
+#define RISCV_IOMMU_CMD_ATS_PRGR_PRG_INDEX GENMASK_ULL(40, 32)
+/* Bits 41 - 43 are zeroed */
+#define RISCV_IOMMU_CMD_ATS_PRGR_RESP_CODE GENMASK_ULL(47, 44)
+#define RISCV_IOMMU_CMD_ATS_PRGR_DST_ID GENMASK_ULL(63, 48)
+
+/**
+ * struct riscv_iommu_fq_record - Fault/Event Queue Record
+ * @hdr: Header, includes fault/event cause, PID/DID, transaction type etc
+ * @_reserved: Low 32bits for custom use, high 32bits for standard use
+ * @iotval: Transaction-type/cause specific format
+ * @iotval2: Cause specific format
+ *
+ * The fault/event queue reports events and failures raised when
+ * processing transactions. Each record is a 32byte structure where
+ * the first dword has a fixed format for providing generic infos
+ * regarding the fault/event, and two more dwords are there for
+ * fault/event-specific information. For more details see section
+ * 3.2.
+ */
+struct riscv_iommu_fq_record {
+ u64 hdr;
+ u64 _reserved;
+ u64 iotval;
+ u64 iotval2;
+};
+
+/* Fields on header */
+#define RISCV_IOMMU_FQ_HDR_CAUSE GENMASK_ULL(11, 0)
+#define RISCV_IOMMU_FQ_HDR_PID GENMASK_ULL(31, 12)
+#define RISCV_IOMMU_FQ_HDR_PV BIT_ULL(32)
+#define RISCV_IOMMU_FQ_HDR_PRIV BIT_ULL(33)
+#define RISCV_IOMMU_FQ_HDR_TTYP GENMASK_ULL(39, 34)
+#define RISCV_IOMMU_FQ_HDR_DID GENMASK_ULL(63, 40)
+
+/**
+ * enum riscv_iommu_fq_causes - Fault/event cause values
+ * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT: Instruction access fault
+ * @RISCV_IOMMU_FQ_CAUSE_RD_ADDR_MISALIGNED: Read address misaligned
+ * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT: Read load fault
+ * @RISCV_IOMMU_FQ_CAUSE_WR_ADDR_MISALIGNED: Write/AMO address misaligned
+ * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT: Write/AMO access fault
+ * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT_S: Instruction page fault
+ * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S: Read page fault
+ * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S: Write/AMO page fault
+ * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT_VS: Instruction guest page fault
+ * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS: Read guest page fault
+ * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS: Write/AMO guest page fault
+ * @RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED: All inbound transactions disallowed
+ * @RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT: DDT entry load access fault
+ * @RISCV_IOMMU_FQ_CAUSE_DDT_INVALID: DDT entry invalid
+ * @RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED: DDT entry misconfigured
+ * @RISCV_IOMMU_FQ_CAUSE_TTYP_BLOCKED: Transaction type disallowed
+ * @RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT: MSI PTE load access fault
+ * @RISCV_IOMMU_FQ_CAUSE_MSI_INVALID: MSI PTE invalid
+ * @RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED: MSI PTE misconfigured
+ * @RISCV_IOMMU_FQ_CAUSE_MRIF_FAULT: MRIF access fault
+ * @RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT: PDT entry load access fault
+ * @RISCV_IOMMU_FQ_CAUSE_PDT_INVALID: PDT entry invalid
+ * @RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED: PDT entry misconfigured
+ * @RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED: DDT data corruption
+ * @RISCV_IOMMU_FQ_CAUSE_PDT_CORRUPTED: PDT data corruption
+ * @RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED: MSI page table data corruption
+ * @RISCV_IOMMU_FQ_CAUSE_MRIF_CORRUIPTED: MRIF data corruption
+ * @RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR: Internal data path error
+ * @RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT: IOMMU MSI write access fault
+ * @RISCV_IOMMU_FQ_CAUSE_PT_CORRUPTED: First/second stage page table data corruption
+ *
+ * Values are on table 11 of the spec, encodings 275 - 2047 are reserved for standard
+ * use, and 2048 - 4095 for custom use.
+ */
+enum riscv_iommu_fq_causes {
+ RISCV_IOMMU_FQ_CAUSE_INST_FAULT = 1,
+ RISCV_IOMMU_FQ_CAUSE_RD_ADDR_MISALIGNED = 4,
+ RISCV_IOMMU_FQ_CAUSE_RD_FAULT = 5,
+ RISCV_IOMMU_FQ_CAUSE_WR_ADDR_MISALIGNED = 6,
+ RISCV_IOMMU_FQ_CAUSE_WR_FAULT = 7,
+ RISCV_IOMMU_FQ_CAUSE_INST_FAULT_S = 12,
+ RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S = 13,
+ RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S = 15,
+ RISCV_IOMMU_FQ_CAUSE_INST_FAULT_VS = 20,
+ RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS = 21,
+ RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS = 23,
+ RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED = 256,
+ RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT = 257,
+ RISCV_IOMMU_FQ_CAUSE_DDT_INVALID = 258,
+ RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED = 259,
+ RISCV_IOMMU_FQ_CAUSE_TTYP_BLOCKED = 260,
+ RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT = 261,
+ RISCV_IOMMU_FQ_CAUSE_MSI_INVALID = 262,
+ RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED = 263,
+ RISCV_IOMMU_FQ_CAUSE_MRIF_FAULT = 264,
+ RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT = 265,
+ RISCV_IOMMU_FQ_CAUSE_PDT_INVALID = 266,
+ RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED = 267,
+ RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED = 268,
+ RISCV_IOMMU_FQ_CAUSE_PDT_CORRUPTED = 269,
+ RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED = 270,
+ RISCV_IOMMU_FQ_CAUSE_MRIF_CORRUIPTED = 271,
+ RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR = 272,
+ RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT = 273,
+ RISCV_IOMMU_FQ_CAUSE_PT_CORRUPTED = 274
+};
+
+/**
+ * enum riscv_iommu_fq_ttypes: Fault/event transaction types
+ * @RISCV_IOMMU_FQ_TTYP_NONE: None. Fault not caused by an inbound transaction.
+ * @RISCV_IOMMU_FQ_TTYP_UADDR_INST_FETCH: Instruction fetch from untranslated address
+ * @RISCV_IOMMU_FQ_TTYP_UADDR_RD: Read from untranslated address
+ * @RISCV_IOMMU_FQ_TTYP_UADDR_WR: Write/AMO to untranslated address
+ * @RISCV_IOMMU_FQ_TTYP_TADDR_INST_FETCH: Instruction fetch from translated address
+ * @RISCV_IOMMU_FQ_TTYP_TADDR_RD: Read from translated address
+ * @RISCV_IOMMU_FQ_TTYP_TADDR_WR: Write/AMO to translated address
+ * @RISCV_IOMMU_FQ_TTYP_PCIE_ATS_REQ: PCIe ATS translation request
+ * @RISCV_IOMMU_FQ_TTYP_PCIE_MSG_REQ: PCIe message request
+ *
+ * Values are on table 12 of the spec, type 4 and 10 - 31 are reserved for standard use
+ * and 31 - 63 for custom use.
+ */
+enum riscv_iommu_fq_ttypes {
+ RISCV_IOMMU_FQ_TTYP_NONE = 0,
+ RISCV_IOMMU_FQ_TTYP_UADDR_INST_FETCH = 1,
+ RISCV_IOMMU_FQ_TTYP_UADDR_RD = 2,
+ RISCV_IOMMU_FQ_TTYP_UADDR_WR = 3,
+ RISCV_IOMMU_FQ_TTYP_TADDR_INST_FETCH = 5,
+ RISCV_IOMMU_FQ_TTYP_TADDR_RD = 6,
+ RISCV_IOMMU_FQ_TTYP_TADDR_WR = 7,
+ RISCV_IOMMU_FQ_TTYP_PCIE_ATS_REQ = 8,
+ RISCV_IOMMU_FQ_TTYP_PCIE_MSG_REQ = 9,
+};
+
+/**
+ * struct riscv_iommu_pq_record - PCIe Page Request record
+ * @hdr: Header, includes PID, DID etc
+ * @payload: Holds the page address, request group and permission bits
+ *
+ * For more infos on the PCIe Page Request queue see chapter 3.3.
+ */
+struct riscv_iommu_pq_record {
+ u64 hdr;
+ u64 payload;
+};
+
+/* Header fields */
+#define RISCV_IOMMU_PQ_HDR_PID GENMASK_ULL(31, 12)
+#define RISCV_IOMMU_PQ_HDR_PV BIT_ULL(32)
+#define RISCV_IOMMU_PQ_HDR_PRIV BIT_ULL(33)
+#define RISCV_IOMMU_PQ_HDR_EXEC BIT_ULL(34)
+#define RISCV_IOMMU_PQ_HDR_DID GENMASK_ULL(63, 40)
+
+/* Payload fields */
+#define RISCV_IOMMU_PQ_PAYLOAD_R BIT_ULL(0)
+#define RISCV_IOMMU_PQ_PAYLOAD_W BIT_ULL(1)
+#define RISCV_IOMMU_PQ_PAYLOAD_L BIT_ULL(2)
+#define RISCV_IOMMU_PQ_PAYLOAD_RWL_MASK GENMASK_ULL(2, 0)
+#define RISCV_IOMMU_PQ_PAYLOAD_PRGI GENMASK_ULL(11, 3) /* Page Request Group Index */
+#define RISCV_IOMMU_PQ_PAYLOAD_ADDR GENMASK_ULL(63, 12)
+
+/**
+ * struct riscv_iommu_msipte - MSI Page Table Entry
+ * @pte: MSI PTE
+ * @mrif_info: Memory-resident interrupt file info
+ *
+ * The MSI Page Table is used for virtualizing MSIs, so that when
+ * a device sends an MSI to a guest, the IOMMU can reroute it
+ * by translating the MSI address, either to a guest interrupt file
+ * or a memory resident interrupt file (MRIF). Note that this page table
+ * is an array of MSI PTEs, not a multi-level pt, each entry
+ * is a leaf entry. For more infos check out the AIA spec, chapter 9.5.
+ *
+ * Also in basic mode the mrif_info field is ignored by the IOMMU and can
+ * be used by software, any other reserved fields on pte must be zeroed-out
+ * by software.
+ */
+struct riscv_iommu_msipte {
+ u64 pte;
+ u64 mrif_info;
+};
+
+/* Fields on pte */
+#define RISCV_IOMMU_MSIPTE_V BIT_ULL(0)
+#define RISCV_IOMMU_MSIPTE_M GENMASK_ULL(2, 1)
+#define RISCV_IOMMU_MSIPTE_MRIF_ADDR GENMASK_ULL(53, 7) /* When M == 1 (MRIF mode) */
+#define RISCV_IOMMU_MSIPTE_PPN RISCV_IOMMU_PPN_FIELD /* When M == 3 (basic mode) */
+#define RISCV_IOMMU_MSIPTE_C BIT_ULL(63)
+
+/* Fields on mrif_info */
+#define RISCV_IOMMU_MSIPTE_MRIF_NID GENMASK_ULL(9, 0)
+#define RISCV_IOMMU_MSIPTE_MRIF_NPPN RISCV_IOMMU_PPN_FIELD
+#define RISCV_IOMMU_MSIPTE_MRIF_NID_MSB BIT_ULL(60)
+
+/* Helper functions: command structure builders. */
+
+static inline void riscv_iommu_cmd_inval_vma(struct riscv_iommu_command *cmd)
+{
+ cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOTINVAL_OPCODE) |
+ FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA);
+ cmd->dword1 = 0;
+}
+
+static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd,
+ u64 addr)
+{
+ cmd->dword1 = FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_ADDR, phys_to_pfn(addr));
+ cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV;
+}
+
+static inline void riscv_iommu_cmd_inval_set_pscid(struct riscv_iommu_command *cmd,
+ int pscid)
+{
+ cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_PSCID, pscid) |
+ RISCV_IOMMU_CMD_IOTINVAL_PSCV;
+}
+
+static inline void riscv_iommu_cmd_inval_set_gscid(struct riscv_iommu_command *cmd,
+ int gscid)
+{
+ cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_GSCID, gscid) |
+ RISCV_IOMMU_CMD_IOTINVAL_GV;
+}
+
+static inline void riscv_iommu_cmd_iofence(struct riscv_iommu_command *cmd)
+{
+ cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
+ FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) |
+ RISCV_IOMMU_CMD_IOFENCE_PR | RISCV_IOMMU_CMD_IOFENCE_PW;
+ cmd->dword1 = 0;
+}
+
+static inline void riscv_iommu_cmd_iofence_set_av(struct riscv_iommu_command *cmd,
+ u64 addr, u32 data)
+{
+ cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) |
+ FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) |
+ FIELD_PREP(RISCV_IOMMU_CMD_IOFENCE_DATA, data) |
+ RISCV_IOMMU_CMD_IOFENCE_AV;
+ cmd->dword1 = addr >> 2;
+}
+
+static inline void riscv_iommu_cmd_iodir_inval_ddt(struct riscv_iommu_command *cmd)
+{
+ cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
+ FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT);
+ cmd->dword1 = 0;
+}
+
+static inline void riscv_iommu_cmd_iodir_inval_pdt(struct riscv_iommu_command *cmd)
+{
+ cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) |
+ FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT);
+ cmd->dword1 = 0;
+}
+
+static inline void riscv_iommu_cmd_iodir_set_did(struct riscv_iommu_command *cmd,
+ unsigned int devid)
+{
+ cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_DID, devid) |
+ RISCV_IOMMU_CMD_IODIR_DV;
+}
+
+static inline void riscv_iommu_cmd_iodir_set_pid(struct riscv_iommu_command *cmd,
+ unsigned int pasid)
+{
+ cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_PID, pasid);
+}
+
+#endif /* _RISCV_IOMMU_BITS_H_ */
diff --git a/drivers/iommu/riscv/iommu-pci.c b/drivers/iommu/riscv/iommu-pci.c
new file mode 100644
index 000000000000..d82d2b00904c
--- /dev/null
+++ b/drivers/iommu/riscv/iommu-pci.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright © 2022-2024 Rivos Inc.
+ * Copyright © 2023 FORTH-ICS/CARV
+ *
+ * RISCV IOMMU as a PCIe device
+ *
+ * Authors
+ * Tomasz Jeznach <tjeznach@rivosinc.com>
+ * Nick Kossifidis <mick@ics.forth.gr>
+ */
+
+#include <linux/compiler.h>
+#include <linux/init.h>
+#include <linux/iommu.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+
+#include "iommu-bits.h"
+#include "iommu.h"
+
+/* QEMU RISC-V IOMMU implementation */
+#define PCI_DEVICE_ID_REDHAT_RISCV_IOMMU 0x0014
+
+/* Rivos Inc. assigned PCI Vendor and Device IDs */
+#ifndef PCI_VENDOR_ID_RIVOS
+#define PCI_VENDOR_ID_RIVOS 0x1efd
+#endif
+
+#define PCI_DEVICE_ID_RIVOS_RISCV_IOMMU_GA 0x0008
+
+static int riscv_iommu_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+ struct device *dev = &pdev->dev;
+ struct riscv_iommu_device *iommu;
+ int rc, vec;
+
+ rc = pcim_enable_device(pdev);
+ if (rc)
+ return rc;
+
+ if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM))
+ return -ENODEV;
+
+ if (pci_resource_len(pdev, 0) < RISCV_IOMMU_REG_SIZE)
+ return -ENODEV;
+
+ rc = pcim_iomap_regions(pdev, BIT(0), pci_name(pdev));
+ if (rc)
+ return dev_err_probe(dev, rc, "pcim_iomap_regions failed\n");
+
+ iommu = devm_kzalloc(dev, sizeof(*iommu), GFP_KERNEL);
+ if (!iommu)
+ return -ENOMEM;
+
+ iommu->dev = dev;
+ iommu->reg = pcim_iomap_table(pdev)[0];
+
+ pci_set_master(pdev);
+ dev_set_drvdata(dev, iommu);
+
+ /* Check device reported capabilities / features. */
+ iommu->caps = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAPABILITIES);
+ iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
+
+ /* The PCI driver only uses MSIs, make sure the IOMMU supports this */
+ switch (FIELD_GET(RISCV_IOMMU_CAPABILITIES_IGS, iommu->caps)) {
+ case RISCV_IOMMU_CAPABILITIES_IGS_MSI:
+ case RISCV_IOMMU_CAPABILITIES_IGS_BOTH:
+ break;
+ default:
+ return dev_err_probe(dev, -ENODEV,
+ "unable to use message-signaled interrupts\n");
+ }
+
+ /* Allocate and assign IRQ vectors for the various events */
+ rc = pci_alloc_irq_vectors(pdev, 1, RISCV_IOMMU_INTR_COUNT,
+ PCI_IRQ_MSIX | PCI_IRQ_MSI);
+ if (rc <= 0)
+ return dev_err_probe(dev, -ENODEV,
+ "unable to allocate irq vectors\n");
+
+ iommu->irqs_count = rc;
+ for (vec = 0; vec < iommu->irqs_count; vec++)
+ iommu->irqs[vec] = msi_get_virq(dev, vec);
+
+ /* Enable message-signaled interrupts, fctl.WSI */
+ if (iommu->fctl & RISCV_IOMMU_FCTL_WSI) {
+ iommu->fctl ^= RISCV_IOMMU_FCTL_WSI;
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, iommu->fctl);
+ }
+
+ return riscv_iommu_init(iommu);
+}
+
+static void riscv_iommu_pci_remove(struct pci_dev *pdev)
+{
+ struct riscv_iommu_device *iommu = dev_get_drvdata(&pdev->dev);
+
+ riscv_iommu_remove(iommu);
+}
+
+static void riscv_iommu_pci_shutdown(struct pci_dev *pdev)
+{
+ struct riscv_iommu_device *iommu = dev_get_drvdata(&pdev->dev);
+
+ riscv_iommu_disable(iommu);
+}
+
+static const struct pci_device_id riscv_iommu_pci_tbl[] = {
+ {PCI_VDEVICE(REDHAT, PCI_DEVICE_ID_REDHAT_RISCV_IOMMU), 0},
+ {PCI_VDEVICE(RIVOS, PCI_DEVICE_ID_RIVOS_RISCV_IOMMU_GA), 0},
+ {0,}
+};
+
+static struct pci_driver riscv_iommu_pci_driver = {
+ .name = KBUILD_MODNAME,
+ .id_table = riscv_iommu_pci_tbl,
+ .probe = riscv_iommu_pci_probe,
+ .remove = riscv_iommu_pci_remove,
+ .shutdown = riscv_iommu_pci_shutdown,
+ .driver = {
+ .suppress_bind_attrs = true,
+ },
+};
+
+builtin_pci_driver(riscv_iommu_pci_driver);
diff --git a/drivers/iommu/riscv/iommu-platform.c b/drivers/iommu/riscv/iommu-platform.c
new file mode 100644
index 000000000000..83a28c83f991
--- /dev/null
+++ b/drivers/iommu/riscv/iommu-platform.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * RISC-V IOMMU as a platform device
+ *
+ * Copyright © 2023 FORTH-ICS/CARV
+ * Copyright © 2023-2024 Rivos Inc.
+ *
+ * Authors
+ * Nick Kossifidis <mick@ics.forth.gr>
+ * Tomasz Jeznach <tjeznach@rivosinc.com>
+ */
+
+#include <linux/acpi.h>
+#include <linux/irqchip/riscv-imsic.h>
+#include <linux/kernel.h>
+#include <linux/msi.h>
+#include <linux/of_irq.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+
+#include "iommu-bits.h"
+#include "iommu.h"
+
+static void riscv_iommu_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
+{
+ struct device *dev = msi_desc_to_dev(desc);
+ struct riscv_iommu_device *iommu = dev_get_drvdata(dev);
+ u16 idx = desc->msi_index;
+ u64 addr;
+
+ addr = ((u64)msg->address_hi << 32) | msg->address_lo;
+
+ if (addr != (addr & RISCV_IOMMU_MSI_CFG_TBL_ADDR)) {
+ dev_err_once(dev,
+ "uh oh, the IOMMU can't send MSIs to 0x%llx, sending to 0x%llx instead\n",
+ addr, addr & RISCV_IOMMU_MSI_CFG_TBL_ADDR);
+ }
+
+ addr &= RISCV_IOMMU_MSI_CFG_TBL_ADDR;
+
+ riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_MSI_CFG_TBL_ADDR(idx), addr);
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_MSI_CFG_TBL_DATA(idx), msg->data);
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_MSI_CFG_TBL_CTRL(idx), 0);
+}
+
+static int riscv_iommu_platform_probe(struct platform_device *pdev)
+{
+ enum riscv_iommu_igs_settings igs;
+ struct device *dev = &pdev->dev;
+ struct riscv_iommu_device *iommu = NULL;
+ struct irq_domain *msi_domain;
+ struct resource *res = NULL;
+ int vec, ret;
+
+ iommu = devm_kzalloc(dev, sizeof(*iommu), GFP_KERNEL);
+ if (!iommu)
+ return -ENOMEM;
+
+ iommu->dev = dev;
+ iommu->reg = devm_platform_get_and_ioremap_resource(pdev, 0, &res);
+ if (IS_ERR(iommu->reg))
+ return dev_err_probe(dev, PTR_ERR(iommu->reg),
+ "could not map register region\n");
+
+ dev_set_drvdata(dev, iommu);
+
+ /* Check device reported capabilities / features. */
+ iommu->caps = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAPABILITIES);
+ iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
+
+ iommu->irqs_count = platform_irq_count(pdev);
+ if (iommu->irqs_count <= 0)
+ return dev_err_probe(dev, -ENODEV,
+ "no IRQ resources provided\n");
+ if (iommu->irqs_count > RISCV_IOMMU_INTR_COUNT)
+ iommu->irqs_count = RISCV_IOMMU_INTR_COUNT;
+
+ igs = FIELD_GET(RISCV_IOMMU_CAPABILITIES_IGS, iommu->caps);
+ switch (igs) {
+ case RISCV_IOMMU_CAPABILITIES_IGS_BOTH:
+ case RISCV_IOMMU_CAPABILITIES_IGS_MSI:
+ if (is_of_node(dev_fwnode(dev))) {
+ of_msi_configure(dev, to_of_node(dev->fwnode));
+ } else {
+ msi_domain = irq_find_matching_fwnode(imsic_acpi_get_fwnode(dev),
+ DOMAIN_BUS_PLATFORM_MSI);
+ dev_set_msi_domain(dev, msi_domain);
+ }
+
+ if (!dev_get_msi_domain(dev)) {
+ dev_warn(dev, "failed to find an MSI domain\n");
+ goto msi_fail;
+ }
+
+ ret = platform_device_msi_init_and_alloc_irqs(dev, iommu->irqs_count,
+ riscv_iommu_write_msi_msg);
+ if (ret) {
+ dev_warn(dev, "failed to allocate MSIs\n");
+ goto msi_fail;
+ }
+
+ for (vec = 0; vec < iommu->irqs_count; vec++)
+ iommu->irqs[vec] = msi_get_virq(dev, vec);
+
+ /* Enable message-signaled interrupts, fctl.WSI */
+ if (iommu->fctl & RISCV_IOMMU_FCTL_WSI) {
+ iommu->fctl ^= RISCV_IOMMU_FCTL_WSI;
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, iommu->fctl);
+ }
+
+ dev_info(dev, "using MSIs\n");
+ break;
+
+msi_fail:
+ if (igs != RISCV_IOMMU_CAPABILITIES_IGS_BOTH) {
+ return dev_err_probe(dev, -ENODEV,
+ "unable to use wire-signaled interrupts\n");
+ }
+
+ fallthrough;
+
+ case RISCV_IOMMU_CAPABILITIES_IGS_WSI:
+ for (vec = 0; vec < iommu->irqs_count; vec++)
+ iommu->irqs[vec] = platform_get_irq(pdev, vec);
+
+ /* Enable wire-signaled interrupts, fctl.WSI */
+ if (!(iommu->fctl & RISCV_IOMMU_FCTL_WSI)) {
+ iommu->fctl |= RISCV_IOMMU_FCTL_WSI;
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, iommu->fctl);
+ }
+ dev_info(dev, "using wire-signaled interrupts\n");
+ break;
+ default:
+ return dev_err_probe(dev, -ENODEV, "invalid IGS\n");
+ }
+
+ return riscv_iommu_init(iommu);
+};
+
+static void riscv_iommu_platform_remove(struct platform_device *pdev)
+{
+ struct riscv_iommu_device *iommu = dev_get_drvdata(&pdev->dev);
+ bool msi = !(iommu->fctl & RISCV_IOMMU_FCTL_WSI);
+
+ riscv_iommu_remove(iommu);
+
+ if (msi)
+ platform_device_msi_free_irqs_all(&pdev->dev);
+};
+
+static void riscv_iommu_platform_shutdown(struct platform_device *pdev)
+{
+ riscv_iommu_disable(dev_get_drvdata(&pdev->dev));
+};
+
+static const struct of_device_id riscv_iommu_of_match[] = {
+ {.compatible = "riscv,iommu",},
+ {},
+};
+
+static const struct acpi_device_id riscv_iommu_acpi_match[] = {
+ { "RSCV0004", 0 },
+ {}
+};
+MODULE_DEVICE_TABLE(acpi, riscv_iommu_acpi_match);
+
+static struct platform_driver riscv_iommu_platform_driver = {
+ .probe = riscv_iommu_platform_probe,
+ .remove = riscv_iommu_platform_remove,
+ .shutdown = riscv_iommu_platform_shutdown,
+ .driver = {
+ .name = "riscv,iommu",
+ .of_match_table = riscv_iommu_of_match,
+ .suppress_bind_attrs = true,
+ .acpi_match_table = riscv_iommu_acpi_match,
+ },
+};
+
+builtin_platform_driver(riscv_iommu_platform_driver);
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
new file mode 100644
index 000000000000..d9429097a2b5
--- /dev/null
+++ b/drivers/iommu/riscv/iommu.c
@@ -0,0 +1,1682 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IOMMU API for RISC-V IOMMU implementations.
+ *
+ * Copyright © 2022-2024 Rivos Inc.
+ * Copyright © 2023 FORTH-ICS/CARV
+ *
+ * Authors
+ * Tomasz Jeznach <tjeznach@rivosinc.com>
+ * Nick Kossifidis <mick@ics.forth.gr>
+ */
+
+#define pr_fmt(fmt) "riscv-iommu: " fmt
+
+#include <linux/acpi.h>
+#include <linux/acpi_rimt.h>
+#include <linux/compiler.h>
+#include <linux/crash_dump.h>
+#include <linux/init.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+
+#include "../iommu-pages.h"
+#include "iommu-bits.h"
+#include "iommu.h"
+
+/* Timeouts in [us] */
+#define RISCV_IOMMU_QCSR_TIMEOUT 150000
+#define RISCV_IOMMU_QUEUE_TIMEOUT 150000
+#define RISCV_IOMMU_DDTP_TIMEOUT 10000000
+#define RISCV_IOMMU_IOTINVAL_TIMEOUT 90000000
+
+/* Number of entries per CMD/FLT queue, should be <= INT_MAX */
+#define RISCV_IOMMU_DEF_CQ_COUNT 8192
+#define RISCV_IOMMU_DEF_FQ_COUNT 4096
+
+/* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */
+#define phys_to_ppn(pa) (((pa) >> 2) & (((1ULL << 44) - 1) << 10))
+#define ppn_to_phys(pn) (((pn) << 2) & (((1ULL << 44) - 1) << 12))
+
+#define dev_to_iommu(dev) \
+ iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu)
+
+/* IOMMU PSCID allocation namespace. */
+static DEFINE_IDA(riscv_iommu_pscids);
+#define RISCV_IOMMU_MAX_PSCID (BIT(20) - 1)
+
+/* Device resource-managed allocations */
+struct riscv_iommu_devres {
+ void *addr;
+};
+
+static void riscv_iommu_devres_pages_release(struct device *dev, void *res)
+{
+ struct riscv_iommu_devres *devres = res;
+
+ iommu_free_pages(devres->addr);
+}
+
+static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p)
+{
+ struct riscv_iommu_devres *devres = res;
+ struct riscv_iommu_devres *target = p;
+
+ return devres->addr == target->addr;
+}
+
+static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu,
+ unsigned int size)
+{
+ struct riscv_iommu_devres *devres;
+ void *addr;
+
+ addr = iommu_alloc_pages_node_sz(dev_to_node(iommu->dev),
+ GFP_KERNEL_ACCOUNT, size);
+ if (unlikely(!addr))
+ return NULL;
+
+ devres = devres_alloc(riscv_iommu_devres_pages_release,
+ sizeof(struct riscv_iommu_devres), GFP_KERNEL);
+
+ if (unlikely(!devres)) {
+ iommu_free_pages(addr);
+ return NULL;
+ }
+
+ devres->addr = addr;
+
+ devres_add(iommu->dev, devres);
+
+ return addr;
+}
+
+static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr)
+{
+ struct riscv_iommu_devres devres = { .addr = addr };
+
+ devres_release(iommu->dev, riscv_iommu_devres_pages_release,
+ riscv_iommu_devres_pages_match, &devres);
+}
+
+/*
+ * Hardware queue allocation and management.
+ */
+
+/* Setup queue base, control registers and default queue length */
+#define RISCV_IOMMU_QUEUE_INIT(q, name) do { \
+ struct riscv_iommu_queue *_q = q; \
+ _q->qid = RISCV_IOMMU_INTR_ ## name; \
+ _q->qbr = RISCV_IOMMU_REG_ ## name ## B; \
+ _q->qcr = RISCV_IOMMU_REG_ ## name ## CSR; \
+ _q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\
+} while (0)
+
+/* Note: offsets are the same for all queues */
+#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB))
+#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB))
+#define Q_ITEM(q, index) ((q)->mask & (index))
+#define Q_IPSR(q) BIT((q)->qid)
+
+/*
+ * Discover queue ring buffer hardware configuration, allocate in-memory
+ * ring buffer or use fixed I/O memory location, configure queue base register.
+ * Must be called before hardware queue is enabled.
+ *
+ * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT()
+ * @entry_size - queue single element size in bytes.
+ */
+static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu,
+ struct riscv_iommu_queue *queue,
+ size_t entry_size)
+{
+ unsigned int logsz;
+ u64 qb, rb;
+
+ /*
+ * Use WARL base register property to discover maximum allowed
+ * number of entries and optional fixed IO address for queue location.
+ */
+ riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD);
+ qb = riscv_iommu_readq(iommu, queue->qbr);
+
+ /*
+ * Calculate and verify hardware supported queue length, as reported
+ * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1).
+ * Update queue size based on hardware supported value.
+ */
+ logsz = ilog2(queue->mask);
+ if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb))
+ logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb);
+
+ /*
+ * Use WARL base register property to discover an optional fixed IO
+ * address for queue ring buffer location. Otherwise allocate contiguous
+ * system memory.
+ */
+ if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) {
+ const size_t queue_size = entry_size << (logsz + 1);
+
+ queue->phys = pfn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb));
+ queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size);
+ } else {
+ do {
+ const size_t queue_size = entry_size << (logsz + 1);
+
+ queue->base = riscv_iommu_get_pages(
+ iommu, max(queue_size, SZ_4K));
+ queue->phys = __pa(queue->base);
+ } while (!queue->base && logsz-- > 0);
+ }
+
+ if (!queue->base)
+ return -ENOMEM;
+
+ qb = phys_to_ppn(queue->phys) |
+ FIELD_PREP(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, logsz);
+
+ /* Update base register and read back to verify hw accepted our write */
+ riscv_iommu_writeq(iommu, queue->qbr, qb);
+ rb = riscv_iommu_readq(iommu, queue->qbr);
+ if (rb != qb) {
+ dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid);
+ return -ENODEV;
+ }
+
+ /* Update actual queue mask */
+ queue->mask = (2U << logsz) - 1;
+
+ dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries",
+ queue->qid, logsz + 1);
+
+ return 0;
+}
+
+/* Check interrupt queue status, IPSR */
+static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data)
+{
+ struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
+
+ if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue))
+ return IRQ_WAKE_THREAD;
+
+ return IRQ_NONE;
+}
+
+static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n)
+{
+ /* Reuse ICVEC.CIV mask for all interrupt vectors mapping. */
+ return (iommu->icvec >> (n * 4)) & RISCV_IOMMU_ICVEC_CIV;
+}
+
+/*
+ * Enable queue processing in the hardware, register interrupt handler.
+ *
+ * @queue - data structure, already allocated with riscv_iommu_queue_alloc()
+ * @irq_handler - threaded interrupt handler.
+ */
+static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu,
+ struct riscv_iommu_queue *queue,
+ irq_handler_t irq_handler)
+{
+ const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)];
+ u32 csr;
+ int rc;
+
+ if (queue->iommu)
+ return -EBUSY;
+
+ /* Polling not implemented */
+ if (!irq)
+ return -ENODEV;
+
+ queue->iommu = iommu;
+ rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler,
+ IRQF_ONESHOT | IRQF_SHARED,
+ dev_name(iommu->dev), queue);
+ if (rc) {
+ queue->iommu = NULL;
+ return rc;
+ }
+
+ /* Empty queue before enabling it */
+ if (queue->qid == RISCV_IOMMU_INTR_CQ)
+ riscv_iommu_writel(queue->iommu, Q_TAIL(queue), 0);
+ else
+ riscv_iommu_writel(queue->iommu, Q_HEAD(queue), 0);
+
+ /*
+ * Enable queue with interrupts, clear any memory fault if any.
+ * Wait for the hardware to acknowledge request and activate queue
+ * processing.
+ * Note: All CSR bitfields are in the same offsets for all queues.
+ */
+ riscv_iommu_writel(iommu, queue->qcr,
+ RISCV_IOMMU_QUEUE_ENABLE |
+ RISCV_IOMMU_QUEUE_INTR_ENABLE |
+ RISCV_IOMMU_QUEUE_MEM_FAULT);
+
+ riscv_iommu_readl_timeout(iommu, queue->qcr,
+ csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
+ 10, RISCV_IOMMU_QCSR_TIMEOUT);
+
+ if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE |
+ RISCV_IOMMU_QUEUE_BUSY |
+ RISCV_IOMMU_QUEUE_MEM_FAULT))) {
+ /* Best effort to stop and disable failing hardware queue. */
+ riscv_iommu_writel(iommu, queue->qcr, 0);
+ free_irq(irq, queue);
+ queue->iommu = NULL;
+ dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid);
+ return -EBUSY;
+ }
+
+ /* Clear any pending interrupt flag. */
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
+
+ return 0;
+}
+
+/*
+ * Disable queue. Wait for the hardware to acknowledge request and
+ * stop processing enqueued requests. Report errors but continue.
+ */
+static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue)
+{
+ struct riscv_iommu_device *iommu = queue->iommu;
+ u32 csr;
+
+ if (!iommu)
+ return;
+
+ free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue);
+ riscv_iommu_writel(iommu, queue->qcr, 0);
+ riscv_iommu_readl_timeout(iommu, queue->qcr,
+ csr, !(csr & RISCV_IOMMU_QUEUE_BUSY),
+ 10, RISCV_IOMMU_QCSR_TIMEOUT);
+
+ if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY))
+ dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n",
+ queue->qid, csr);
+
+ queue->iommu = NULL;
+}
+
+/*
+ * Returns number of available valid queue entries and the first item index.
+ * Update shadow producer index if necessary.
+ */
+static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue,
+ unsigned int *index)
+{
+ unsigned int head = atomic_read(&queue->head);
+ unsigned int tail = atomic_read(&queue->tail);
+ unsigned int last = Q_ITEM(queue, tail);
+ int available = (int)(tail - head);
+
+ *index = head;
+
+ if (available > 0)
+ return available;
+
+ /* read hardware producer index, check reserved register bits are not set. */
+ if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue),
+ tail, (tail & ~queue->mask) == 0,
+ 0, RISCV_IOMMU_QUEUE_TIMEOUT)) {
+ dev_err_once(queue->iommu->dev,
+ "Hardware error: queue access timeout\n");
+ return 0;
+ }
+
+ if (tail == last)
+ return 0;
+
+ /* update shadow producer index */
+ return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head);
+}
+
+/*
+ * Release processed queue entries, should match riscv_iommu_queue_consume() calls.
+ */
+static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count)
+{
+ const unsigned int head = atomic_add_return(count, &queue->head);
+
+ riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head));
+}
+
+/* Return actual consumer index based on hardware reported queue head index. */
+static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue)
+{
+ const unsigned int cons = atomic_read(&queue->head);
+ const unsigned int last = Q_ITEM(queue, cons);
+ unsigned int head;
+
+ if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
+ !(head & ~queue->mask),
+ 0, RISCV_IOMMU_QUEUE_TIMEOUT))
+ return cons;
+
+ return cons + ((head - last) & queue->mask);
+}
+
+/* Wait for submitted item to be processed. */
+static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue,
+ unsigned int index,
+ unsigned int timeout_us)
+{
+ unsigned int cons = atomic_read(&queue->head);
+
+ /* Already processed by the consumer */
+ if ((int)(cons - index) > 0)
+ return 0;
+
+ /* Monitor consumer index */
+ return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons,
+ (int)(cons - index) > 0, 0, timeout_us);
+}
+
+/* Enqueue an entry and wait to be processed if timeout_us > 0
+ *
+ * Error handling for IOMMU hardware not responding in reasonable time
+ * will be added as separate patch series along with other RAS features.
+ * For now, only report hardware failure and continue.
+ */
+static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue,
+ void *entry, size_t entry_size)
+{
+ unsigned int prod;
+ unsigned int head;
+ unsigned int tail;
+ unsigned long flags;
+
+ /* Do not preempt submission flow. */
+ local_irq_save(flags);
+
+ /* 1. Allocate some space in the queue */
+ prod = atomic_inc_return(&queue->prod) - 1;
+ head = atomic_read(&queue->head);
+
+ /* 2. Wait for space availability. */
+ if ((prod - head) > queue->mask) {
+ if (readx_poll_timeout(atomic_read, &queue->head,
+ head, (prod - head) < queue->mask,
+ 0, RISCV_IOMMU_QUEUE_TIMEOUT))
+ goto err_busy;
+ } else if ((prod - head) == queue->mask) {
+ const unsigned int last = Q_ITEM(queue, head);
+
+ if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head,
+ !(head & ~queue->mask) && head != last,
+ 0, RISCV_IOMMU_QUEUE_TIMEOUT))
+ goto err_busy;
+ atomic_add((head - last) & queue->mask, &queue->head);
+ }
+
+ /* 3. Store entry in the ring buffer */
+ memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size);
+
+ /* 4. Wait for all previous entries to be ready */
+ if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail,
+ 0, RISCV_IOMMU_QUEUE_TIMEOUT))
+ goto err_busy;
+
+ /*
+ * 5. Make sure the ring buffer update (whether in normal or I/O memory) is
+ * completed and visible before signaling the tail doorbell to fetch
+ * the next command. 'fence ow, ow'
+ */
+ dma_wmb();
+ riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1));
+
+ /*
+ * 6. Make sure the doorbell write to the device has finished before updating
+ * the shadow tail index in normal memory. 'fence o, w'
+ */
+ mmiowb();
+ atomic_inc(&queue->tail);
+
+ /* 7. Complete submission and restore local interrupts */
+ local_irq_restore(flags);
+
+ return prod;
+
+err_busy:
+ local_irq_restore(flags);
+ dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n");
+
+ return prod;
+}
+
+/*
+ * IOMMU Command queue chapter 3.1
+ */
+
+/* Command queue interrupt handler thread function */
+static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data)
+{
+ const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
+ unsigned int ctrl;
+
+ /* Clear MF/CQ errors, complete error recovery to be implemented. */
+ ctrl = riscv_iommu_readl(queue->iommu, queue->qcr);
+ if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO |
+ RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) {
+ riscv_iommu_writel(queue->iommu, queue->qcr, ctrl);
+ dev_warn(queue->iommu->dev,
+ "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n",
+ queue->qid,
+ !!(ctrl & RISCV_IOMMU_CQCSR_CQMF),
+ !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO),
+ !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL),
+ !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP));
+ }
+
+ /* Placeholder for command queue interrupt notifiers */
+
+ /* Clear command interrupt pending. */
+ riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
+
+ return IRQ_HANDLED;
+}
+
+/* Send command to the IOMMU command queue */
+static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
+ struct riscv_iommu_command *cmd)
+{
+ riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd));
+}
+
+/* Send IOFENCE.C command and wait for all scheduled commands to complete. */
+static void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu,
+ unsigned int timeout_us)
+{
+ struct riscv_iommu_command cmd;
+ unsigned int prod;
+
+ riscv_iommu_cmd_iofence(&cmd);
+ prod = riscv_iommu_queue_send(&iommu->cmdq, &cmd, sizeof(cmd));
+
+ if (!timeout_us)
+ return;
+
+ if (riscv_iommu_queue_wait(&iommu->cmdq, prod, timeout_us))
+ dev_err_once(iommu->dev,
+ "Hardware error: command execution timeout\n");
+}
+
+/*
+ * IOMMU Fault/Event queue chapter 3.2
+ */
+
+static void riscv_iommu_fault(struct riscv_iommu_device *iommu,
+ struct riscv_iommu_fq_record *event)
+{
+ unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
+ unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);
+
+ /* Placeholder for future fault handling implementation, report only. */
+ if (err)
+ dev_warn_ratelimited(iommu->dev,
+ "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n",
+ err, devid, event->iotval, event->iotval2);
+}
+
+/* Fault queue interrupt handler thread function */
+static irqreturn_t riscv_iommu_fltq_process(int irq, void *data)
+{
+ struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data;
+ struct riscv_iommu_device *iommu = queue->iommu;
+ struct riscv_iommu_fq_record *events;
+ unsigned int ctrl, idx;
+ int cnt, len;
+
+ events = (struct riscv_iommu_fq_record *)queue->base;
+
+ /* Clear fault interrupt pending and process all received fault events. */
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue));
+
+ do {
+ cnt = riscv_iommu_queue_consume(queue, &idx);
+ for (len = 0; len < cnt; idx++, len++)
+ riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]);
+ riscv_iommu_queue_release(queue, cnt);
+ } while (cnt > 0);
+
+ /* Clear MF/OF errors, complete error recovery to be implemented. */
+ ctrl = riscv_iommu_readl(iommu, queue->qcr);
+ if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) {
+ riscv_iommu_writel(iommu, queue->qcr, ctrl);
+ dev_warn(iommu->dev,
+ "Queue #%u error; memory fault:%d overflow:%d\n",
+ queue->qid,
+ !!(ctrl & RISCV_IOMMU_FQCSR_FQMF),
+ !!(ctrl & RISCV_IOMMU_FQCSR_FQOF));
+ }
+
+ return IRQ_HANDLED;
+}
+
+/* Lookup and initialize device context info structure. */
+static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu,
+ unsigned int devid)
+{
+ const bool base_format = !(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT);
+ unsigned int depth;
+ unsigned long ddt, old, new;
+ void *ptr;
+ u8 ddi_bits[3] = { 0 };
+ u64 *ddtp = NULL;
+
+ /* Make sure the mode is valid */
+ if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL ||
+ iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL)
+ return NULL;
+
+ /*
+ * Device id partitioning for base format:
+ * DDI[0]: bits 0 - 6 (1st level) (7 bits)
+ * DDI[1]: bits 7 - 15 (2nd level) (9 bits)
+ * DDI[2]: bits 16 - 23 (3rd level) (8 bits)
+ *
+ * For extended format:
+ * DDI[0]: bits 0 - 5 (1st level) (6 bits)
+ * DDI[1]: bits 6 - 14 (2nd level) (9 bits)
+ * DDI[2]: bits 15 - 23 (3rd level) (9 bits)
+ */
+ if (base_format) {
+ ddi_bits[0] = 7;
+ ddi_bits[1] = 7 + 9;
+ ddi_bits[2] = 7 + 9 + 8;
+ } else {
+ ddi_bits[0] = 6;
+ ddi_bits[1] = 6 + 9;
+ ddi_bits[2] = 6 + 9 + 9;
+ }
+
+ /* Make sure device id is within range */
+ depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL;
+ if (devid >= (1 << ddi_bits[depth]))
+ return NULL;
+
+ /* Get to the level of the non-leaf node that holds the device context */
+ for (ddtp = iommu->ddt_root; depth-- > 0;) {
+ const int split = ddi_bits[depth];
+ /*
+ * Each non-leaf node is 64bits wide and on each level
+ * nodes are indexed by DDI[depth].
+ */
+ ddtp += (devid >> split) & 0x1FF;
+
+ /*
+ * Check if this node has been populated and if not
+ * allocate a new level and populate it.
+ */
+ do {
+ ddt = READ_ONCE(*(unsigned long *)ddtp);
+ if (ddt & RISCV_IOMMU_DDTE_V) {
+ ddtp = __va(ppn_to_phys(ddt));
+ break;
+ }
+
+ ptr = riscv_iommu_get_pages(iommu, SZ_4K);
+ if (!ptr)
+ return NULL;
+
+ new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V;
+ old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new);
+
+ if (old == ddt) {
+ ddtp = (u64 *)ptr;
+ break;
+ }
+
+ /* Race setting DDT detected, re-read and retry. */
+ riscv_iommu_free_pages(iommu, ptr);
+ } while (1);
+ }
+
+ /*
+ * Grab the node that matches DDI[depth], note that when using base
+ * format the device context is 4 * 64bits, and the extended format
+ * is 8 * 64bits, hence the (3 - base_format) below.
+ */
+ ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format);
+
+ return (struct riscv_iommu_dc *)ddtp;
+}
+
+/*
+ * This is best effort IOMMU translation shutdown flow.
+ * Disable IOMMU without waiting for hardware response.
+ */
+void riscv_iommu_disable(struct riscv_iommu_device *iommu)
+{
+ riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
+ FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE,
+ RISCV_IOMMU_DDTP_IOMMU_MODE_BARE));
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQCSR, 0);
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FQCSR, 0);
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0);
+}
+
+#define riscv_iommu_read_ddtp(iommu) ({ \
+ u64 ddtp; \
+ riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \
+ !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \
+ RISCV_IOMMU_DDTP_TIMEOUT); \
+ ddtp; })
+
+static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu)
+{
+ u64 ddtp;
+ unsigned int mode;
+
+ ddtp = riscv_iommu_read_ddtp(iommu);
+ if (ddtp & RISCV_IOMMU_DDTP_BUSY)
+ return -EBUSY;
+
+ /*
+ * It is optional for the hardware to report a fixed address for device
+ * directory root page when DDT.MODE is OFF or BARE.
+ */
+ mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
+ if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE ||
+ mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) {
+ /* Use WARL to discover hardware fixed DDT PPN */
+ riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP,
+ FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode));
+ ddtp = riscv_iommu_read_ddtp(iommu);
+ if (ddtp & RISCV_IOMMU_DDTP_BUSY)
+ return -EBUSY;
+
+ iommu->ddt_phys = ppn_to_phys(ddtp);
+ if (iommu->ddt_phys)
+ iommu->ddt_root = devm_ioremap(iommu->dev,
+ iommu->ddt_phys, PAGE_SIZE);
+ if (iommu->ddt_root)
+ memset(iommu->ddt_root, 0, PAGE_SIZE);
+ }
+
+ if (!iommu->ddt_root) {
+ iommu->ddt_root = riscv_iommu_get_pages(iommu, SZ_4K);
+ iommu->ddt_phys = __pa(iommu->ddt_root);
+ }
+
+ if (!iommu->ddt_root)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/*
+ * Discover supported DDT modes starting from requested value,
+ * configure DDTP register with accepted mode and root DDT address.
+ * Accepted iommu->ddt_mode is updated on success.
+ */
+static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
+ unsigned int ddtp_mode)
+{
+ struct device *dev = iommu->dev;
+ u64 ddtp, rq_ddtp;
+ unsigned int mode, rq_mode = ddtp_mode;
+ struct riscv_iommu_command cmd;
+
+ ddtp = riscv_iommu_read_ddtp(iommu);
+ if (ddtp & RISCV_IOMMU_DDTP_BUSY)
+ return -EBUSY;
+
+ /* Disallow state transition from xLVL to xLVL. */
+ mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
+ if (mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
+ mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF &&
+ rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE &&
+ rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF)
+ return -EINVAL;
+
+ do {
+ rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, rq_mode);
+ if (rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
+ rq_ddtp |= phys_to_ppn(iommu->ddt_phys);
+
+ riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp);
+ ddtp = riscv_iommu_read_ddtp(iommu);
+ if (ddtp & RISCV_IOMMU_DDTP_BUSY) {
+ dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n",
+ rq_mode, ddtp);
+ return -EBUSY;
+ }
+
+ /* Verify IOMMU hardware accepts new DDTP config. */
+ mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp);
+
+ if (rq_mode == mode)
+ break;
+
+ /* Hardware mandatory DDTP mode has not been accepted. */
+ if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) {
+ dev_err(dev, "DDTP update failed hw: %llx vs %llx\n",
+ ddtp, rq_ddtp);
+ return -EINVAL;
+ }
+
+ /*
+ * Mode field is WARL, an IOMMU may support a subset of
+ * directory table levels in which case if we tried to set
+ * an unsupported number of levels we'll readback either
+ * a valid xLVL or off/bare. If we got off/bare, try again
+ * with a smaller xLVL.
+ */
+ if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL &&
+ rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) {
+ dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode);
+ rq_mode--;
+ continue;
+ }
+
+ /*
+ * We tried all supported modes and IOMMU hardware failed to
+ * accept new settings, something went very wrong since off/bare
+ * and at least one xLVL must be supported.
+ */
+ dev_err(dev, "DDTP hw mode %u, failed to set %u\n",
+ mode, ddtp_mode);
+ return -EINVAL;
+ } while (1);
+
+ iommu->ddt_mode = mode;
+ if (mode != ddtp_mode)
+ dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode);
+
+ /* Invalidate device context cache */
+ riscv_iommu_cmd_iodir_inval_ddt(&cmd);
+ riscv_iommu_cmd_send(iommu, &cmd);
+
+ /* Invalidate address translation cache */
+ riscv_iommu_cmd_inval_vma(&cmd);
+ riscv_iommu_cmd_send(iommu, &cmd);
+
+ /* IOFENCE.C */
+ riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
+
+ return 0;
+}
+
+/* This struct contains protection domain specific IOMMU driver data. */
+struct riscv_iommu_domain {
+ struct iommu_domain domain;
+ struct list_head bonds;
+ spinlock_t lock; /* protect bonds list updates. */
+ int pscid;
+ bool amo_enabled;
+ int numa_node;
+ unsigned int pgd_mode;
+ unsigned long *pgd_root;
+};
+
+#define iommu_domain_to_riscv(iommu_domain) \
+ container_of(iommu_domain, struct riscv_iommu_domain, domain)
+
+/* Private IOMMU data for managed devices, dev_iommu_priv_* */
+struct riscv_iommu_info {
+ struct riscv_iommu_domain *domain;
+};
+
+/*
+ * Linkage between an iommu_domain and attached devices.
+ *
+ * Protection domain requiring IOATC and DevATC translation cache invalidations,
+ * should be linked to attached devices using a riscv_iommu_bond structure.
+ * Devices should be linked to the domain before first use and unlinked after
+ * the translations from the referenced protection domain can no longer be used.
+ * Blocking and identity domains are not tracked here, as the IOMMU hardware
+ * does not cache negative and/or identity (BARE mode) translations, and DevATC
+ * is disabled for those protection domains.
+ *
+ * The device pointer and IOMMU data remain stable in the bond struct after
+ * _probe_device() where it's attached to the managed IOMMU, up to the
+ * completion of the _release_device() call. The release of the bond structure
+ * is synchronized with the device release.
+ */
+struct riscv_iommu_bond {
+ struct list_head list;
+ struct rcu_head rcu;
+ struct device *dev;
+};
+
+static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain,
+ struct device *dev)
+{
+ struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+ struct riscv_iommu_bond *bond;
+ struct list_head *bonds;
+
+ bond = kzalloc(sizeof(*bond), GFP_KERNEL);
+ if (!bond)
+ return -ENOMEM;
+ bond->dev = dev;
+
+ /*
+ * List of devices attached to the domain is arranged based on
+ * managed IOMMU device.
+ */
+
+ spin_lock(&domain->lock);
+ list_for_each(bonds, &domain->bonds)
+ if (dev_to_iommu(list_entry(bonds, struct riscv_iommu_bond, list)->dev) == iommu)
+ break;
+ list_add_rcu(&bond->list, bonds);
+ spin_unlock(&domain->lock);
+
+ /* Synchronize with riscv_iommu_iotlb_inval() sequence. See comment below. */
+ smp_mb();
+
+ return 0;
+}
+
+static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain,
+ struct device *dev)
+{
+ struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+ struct riscv_iommu_bond *bond, *found = NULL;
+ struct riscv_iommu_command cmd;
+ int count = 0;
+
+ if (!domain)
+ return;
+
+ spin_lock(&domain->lock);
+ list_for_each_entry(bond, &domain->bonds, list) {
+ if (found && count)
+ break;
+ else if (bond->dev == dev)
+ found = bond;
+ else if (dev_to_iommu(bond->dev) == iommu)
+ count++;
+ }
+ if (found)
+ list_del_rcu(&found->list);
+ spin_unlock(&domain->lock);
+ kfree_rcu(found, rcu);
+
+ /*
+ * If this was the last bond between this domain and the IOMMU
+ * invalidate all cached entries for domain's PSCID.
+ */
+ if (!count) {
+ riscv_iommu_cmd_inval_vma(&cmd);
+ riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
+ riscv_iommu_cmd_send(iommu, &cmd);
+
+ riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
+ }
+}
+
+/*
+ * Send IOTLB.INVAL for whole address space for ranges larger than 2MB.
+ * This limit will be replaced with range invalidations, if supported by
+ * the hardware, when RISC-V IOMMU architecture specification update for
+ * range invalidations update will be available.
+ */
+#define RISCV_IOMMU_IOTLB_INVAL_LIMIT (2 << 20)
+
+static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
+ unsigned long start, unsigned long end)
+{
+ struct riscv_iommu_bond *bond;
+ struct riscv_iommu_device *iommu, *prev;
+ struct riscv_iommu_command cmd;
+ unsigned long len = end - start + 1;
+ unsigned long iova;
+
+ /*
+ * For each IOMMU linked with this protection domain (via bonds->dev),
+ * an IOTLB invaliation command will be submitted and executed.
+ *
+ * Possbile race with domain attach flow is handled by sequencing
+ * bond creation - riscv_iommu_bond_link(), and device directory
+ * update - riscv_iommu_iodir_update().
+ *
+ * PTE Update / IOTLB Inval Device attach & directory update
+ * -------------------------- --------------------------
+ * update page table entries add dev to the bond list
+ * FENCE RW,RW FENCE RW,RW
+ * For all IOMMUs: (can be empty) Update FSC/PSCID
+ * FENCE IOW,IOW FENCE IOW,IOW
+ * IOTLB.INVAL IODIR.INVAL
+ * IOFENCE.C
+ *
+ * If bond list is not updated with new device, directory context will
+ * be configured with already valid page table content. If an IOMMU is
+ * linked to the protection domain it will receive invalidation
+ * requests for updated page table entries.
+ */
+ smp_mb();
+
+ rcu_read_lock();
+
+ prev = NULL;
+ list_for_each_entry_rcu(bond, &domain->bonds, list) {
+ iommu = dev_to_iommu(bond->dev);
+
+ /*
+ * IOTLB invalidation request can be safely omitted if already sent
+ * to the IOMMU for the same PSCID, and with domain->bonds list
+ * arranged based on the device's IOMMU, it's sufficient to check
+ * last device the invalidation was sent to.
+ */
+ if (iommu == prev)
+ continue;
+
+ riscv_iommu_cmd_inval_vma(&cmd);
+ riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
+ if (len && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
+ for (iova = start; iova < end; iova += PAGE_SIZE) {
+ riscv_iommu_cmd_inval_set_addr(&cmd, iova);
+ riscv_iommu_cmd_send(iommu, &cmd);
+ }
+ } else {
+ riscv_iommu_cmd_send(iommu, &cmd);
+ }
+ prev = iommu;
+ }
+
+ prev = NULL;
+ list_for_each_entry_rcu(bond, &domain->bonds, list) {
+ iommu = dev_to_iommu(bond->dev);
+ if (iommu == prev)
+ continue;
+
+ riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
+ prev = iommu;
+ }
+ rcu_read_unlock();
+}
+
+#define RISCV_IOMMU_FSC_BARE 0
+
+/*
+ * Update IODIR for the device.
+ *
+ * During the execution of riscv_iommu_probe_device(), IODIR entries are
+ * allocated for the device's identifiers. Device context invalidation
+ * becomes necessary only if one of the updated entries was previously
+ * marked as valid, given that invalid device context entries are not
+ * cached by the IOMMU hardware.
+ * In this implementation, updating a valid device context while the
+ * device is not quiesced might be disruptive, potentially causing
+ * interim translation faults.
+ */
+static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
+ struct device *dev, u64 fsc, u64 ta)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct riscv_iommu_dc *dc;
+ struct riscv_iommu_command cmd;
+ bool sync_required = false;
+ u64 tc;
+ int i;
+
+ for (i = 0; i < fwspec->num_ids; i++) {
+ dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
+ tc = READ_ONCE(dc->tc);
+ if (!(tc & RISCV_IOMMU_DC_TC_V))
+ continue;
+
+ WRITE_ONCE(dc->tc, tc & ~RISCV_IOMMU_DC_TC_V);
+
+ /* Invalidate device context cached values */
+ riscv_iommu_cmd_iodir_inval_ddt(&cmd);
+ riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
+ riscv_iommu_cmd_send(iommu, &cmd);
+ sync_required = true;
+ }
+
+ if (sync_required)
+ riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
+
+ /*
+ * For device context with DC_TC_PDTV = 0, translation attributes valid bit
+ * is stored as DC_TC_V bit (both sharing the same location at BIT(0)).
+ */
+ for (i = 0; i < fwspec->num_ids; i++) {
+ dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
+ tc = READ_ONCE(dc->tc);
+ tc |= ta & RISCV_IOMMU_DC_TC_V;
+
+ WRITE_ONCE(dc->fsc, fsc);
+ WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
+ /* Update device context, write TC.V as the last step. */
+ dma_wmb();
+ WRITE_ONCE(dc->tc, tc);
+
+ /* Invalidate device context after update */
+ riscv_iommu_cmd_iodir_inval_ddt(&cmd);
+ riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
+ riscv_iommu_cmd_send(iommu, &cmd);
+ }
+
+ riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);
+}
+
+/*
+ * IOVA page translation tree management.
+ */
+
+static void riscv_iommu_iotlb_flush_all(struct iommu_domain *iommu_domain)
+{
+ struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+
+ riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
+}
+
+static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain,
+ struct iommu_iotlb_gather *gather)
+{
+ struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+
+ riscv_iommu_iotlb_inval(domain, gather->start, gather->end);
+}
+
+#define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t)))
+
+#define _io_pte_present(pte) ((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE))
+#define _io_pte_leaf(pte) ((pte) & _PAGE_LEAF)
+#define _io_pte_none(pte) ((pte) == 0)
+#define _io_pte_entry(pn, prot) ((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot))
+
+static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain,
+ unsigned long pte,
+ struct iommu_pages_list *freelist)
+{
+ unsigned long *ptr;
+ int i;
+
+ if (!_io_pte_present(pte) || _io_pte_leaf(pte))
+ return;
+
+ ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
+
+ /* Recursively free all sub page table pages */
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte = READ_ONCE(ptr[i]);
+ if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte)
+ riscv_iommu_pte_free(domain, pte, freelist);
+ }
+
+ if (freelist)
+ iommu_pages_list_add(freelist, ptr);
+ else
+ iommu_free_pages(ptr);
+}
+
+static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain,
+ unsigned long iova, size_t pgsize,
+ gfp_t gfp)
+{
+ unsigned long *ptr = domain->pgd_root;
+ unsigned long pte, old;
+ int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
+ void *addr;
+
+ do {
+ const int shift = PAGE_SHIFT + PT_SHIFT * level;
+
+ ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
+ /*
+ * Note: returned entry might be a non-leaf if there was
+ * existing mapping with smaller granularity. Up to the caller
+ * to replace and invalidate.
+ */
+ if (((size_t)1 << shift) == pgsize)
+ return ptr;
+pte_retry:
+ pte = READ_ONCE(*ptr);
+ /*
+ * This is very likely incorrect as we should not be adding
+ * new mapping with smaller granularity on top
+ * of existing 2M/1G mapping. Fail.
+ */
+ if (_io_pte_present(pte) && _io_pte_leaf(pte))
+ return NULL;
+ /*
+ * Non-leaf entry is missing, allocate and try to add to the
+ * page table. This might race with other mappings, retry.
+ */
+ if (_io_pte_none(pte)) {
+ addr = iommu_alloc_pages_node_sz(domain->numa_node, gfp,
+ SZ_4K);
+ if (!addr)
+ return NULL;
+ old = pte;
+ pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE);
+ if (cmpxchg_relaxed(ptr, old, pte) != old) {
+ iommu_free_pages(addr);
+ goto pte_retry;
+ }
+ }
+ ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
+ } while (level-- > 0);
+
+ return NULL;
+}
+
+static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain,
+ unsigned long iova, size_t *pte_pgsize)
+{
+ unsigned long *ptr = domain->pgd_root;
+ unsigned long pte;
+ int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
+
+ do {
+ const int shift = PAGE_SHIFT + PT_SHIFT * level;
+
+ ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
+ pte = READ_ONCE(*ptr);
+ if (_io_pte_present(pte) && _io_pte_leaf(pte)) {
+ *pte_pgsize = (size_t)1 << shift;
+ return ptr;
+ }
+ if (_io_pte_none(pte))
+ return NULL;
+ ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
+ } while (level-- > 0);
+
+ return NULL;
+}
+
+static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
+ unsigned long iova, phys_addr_t phys,
+ size_t pgsize, size_t pgcount, int prot,
+ gfp_t gfp, size_t *mapped)
+{
+ struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+ size_t size = 0;
+ unsigned long *ptr;
+ unsigned long pte, old, pte_prot;
+ int rc = 0;
+ struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist);
+
+ if (!(prot & IOMMU_WRITE))
+ pte_prot = _PAGE_BASE | _PAGE_READ;
+ else if (domain->amo_enabled)
+ pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE;
+ else
+ pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY;
+
+ while (pgcount) {
+ ptr = riscv_iommu_pte_alloc(domain, iova, pgsize, gfp);
+ if (!ptr) {
+ rc = -ENOMEM;
+ break;
+ }
+
+ old = READ_ONCE(*ptr);
+ pte = _io_pte_entry(phys_to_pfn(phys), pte_prot);
+ if (cmpxchg_relaxed(ptr, old, pte) != old)
+ continue;
+
+ riscv_iommu_pte_free(domain, old, &freelist);
+
+ size += pgsize;
+ iova += pgsize;
+ phys += pgsize;
+ --pgcount;
+ }
+
+ *mapped = size;
+
+ if (!iommu_pages_list_empty(&freelist)) {
+ /*
+ * In 1.0 spec version, the smallest scope we can use to
+ * invalidate all levels of page table (i.e. leaf and non-leaf)
+ * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0.
+ * This will be updated with hardware support for
+ * capability.NL (non-leaf) IOTINVAL command.
+ */
+ riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
+ iommu_put_pages_list(&freelist);
+ }
+
+ return rc;
+}
+
+static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
+ unsigned long iova, size_t pgsize,
+ size_t pgcount,
+ struct iommu_iotlb_gather *gather)
+{
+ struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+ size_t size = pgcount << __ffs(pgsize);
+ unsigned long *ptr, old;
+ size_t unmapped = 0;
+ size_t pte_size;
+
+ while (unmapped < size) {
+ ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
+ if (!ptr)
+ return unmapped;
+
+ /* partial unmap is not allowed, fail. */
+ if (iova & (pte_size - 1))
+ return unmapped;
+
+ old = READ_ONCE(*ptr);
+ if (cmpxchg_relaxed(ptr, old, 0) != old)
+ continue;
+
+ iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
+ pte_size);
+
+ iova += pte_size;
+ unmapped += pte_size;
+ }
+
+ return unmapped;
+}
+
+static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
+ dma_addr_t iova)
+{
+ struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+ size_t pte_size;
+ unsigned long *ptr;
+
+ ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
+ if (!ptr)
+ return 0;
+
+ return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1));
+}
+
+static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain)
+{
+ struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+ const unsigned long pfn = virt_to_pfn(domain->pgd_root);
+
+ WARN_ON(!list_empty(&domain->bonds));
+
+ if ((int)domain->pscid > 0)
+ ida_free(&riscv_iommu_pscids, domain->pscid);
+
+ riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL);
+ kfree(domain);
+}
+
+static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_mode)
+{
+ switch (pgd_mode) {
+ case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
+ return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39;
+
+ case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
+ return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48;
+
+ case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
+ return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57;
+ }
+ return false;
+}
+
+static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
+ struct device *dev,
+ struct iommu_domain *old)
+{
+ struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+ struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+ struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
+ u64 fsc, ta;
+
+ if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
+ return -ENODEV;
+
+ fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) |
+ FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root));
+ ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) |
+ RISCV_IOMMU_PC_TA_V;
+
+ if (riscv_iommu_bond_link(domain, dev))
+ return -ENOMEM;
+
+ riscv_iommu_iodir_update(iommu, dev, fsc, ta);
+ riscv_iommu_bond_unlink(info->domain, dev);
+ info->domain = domain;
+
+ return 0;
+}
+
+static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = {
+ .attach_dev = riscv_iommu_attach_paging_domain,
+ .free = riscv_iommu_free_paging_domain,
+ .map_pages = riscv_iommu_map_pages,
+ .unmap_pages = riscv_iommu_unmap_pages,
+ .iova_to_phys = riscv_iommu_iova_to_phys,
+ .iotlb_sync = riscv_iommu_iotlb_sync,
+ .flush_iotlb_all = riscv_iommu_iotlb_flush_all,
+};
+
+static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
+{
+ struct riscv_iommu_domain *domain;
+ struct riscv_iommu_device *iommu;
+ unsigned int pgd_mode;
+ dma_addr_t va_mask;
+ int va_bits;
+
+ iommu = dev_to_iommu(dev);
+ if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) {
+ pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57;
+ va_bits = 57;
+ } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) {
+ pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48;
+ va_bits = 48;
+ } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) {
+ pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39;
+ va_bits = 39;
+ } else {
+ dev_err(dev, "cannot find supported page table mode\n");
+ return ERR_PTR(-ENODEV);
+ }
+
+ domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD_RCU(&domain->bonds);
+ spin_lock_init(&domain->lock);
+ domain->numa_node = dev_to_node(iommu->dev);
+ domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD);
+ domain->pgd_mode = pgd_mode;
+ domain->pgd_root = iommu_alloc_pages_node_sz(domain->numa_node,
+ GFP_KERNEL_ACCOUNT, SZ_4K);
+ if (!domain->pgd_root) {
+ kfree(domain);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
+ RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
+ if (domain->pscid < 0) {
+ iommu_free_pages(domain->pgd_root);
+ kfree(domain);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /*
+ * Note: RISC-V Privilege spec mandates that virtual addresses
+ * need to be sign-extended, so if (VA_BITS - 1) is set, all
+ * bits >= VA_BITS need to also be set or else we'll get a
+ * page fault. However the code that creates the mappings
+ * above us (e.g. iommu_dma_alloc_iova()) won't do that for us
+ * for now, so we'll end up with invalid virtual addresses
+ * to map. As a workaround until we get this sorted out
+ * limit the available virtual addresses to VA_BITS - 1.
+ */
+ va_mask = DMA_BIT_MASK(va_bits - 1);
+
+ domain->domain.geometry.aperture_start = 0;
+ domain->domain.geometry.aperture_end = va_mask;
+ domain->domain.geometry.force_aperture = true;
+ domain->domain.pgsize_bitmap = va_mask & (SZ_4K | SZ_2M | SZ_1G | SZ_512G);
+
+ domain->domain.ops = &riscv_iommu_paging_domain_ops;
+
+ return &domain->domain;
+}
+
+static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
+ struct device *dev,
+ struct iommu_domain *old)
+{
+ struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+ struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
+
+ /* Make device context invalid, translation requests will fault w/ #258 */
+ riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0);
+ riscv_iommu_bond_unlink(info->domain, dev);
+ info->domain = NULL;
+
+ return 0;
+}
+
+static struct iommu_domain riscv_iommu_blocking_domain = {
+ .type = IOMMU_DOMAIN_BLOCKED,
+ .ops = &(const struct iommu_domain_ops) {
+ .attach_dev = riscv_iommu_attach_blocking_domain,
+ }
+};
+
+static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
+ struct device *dev,
+ struct iommu_domain *old)
+{
+ struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+ struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
+
+ riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V);
+ riscv_iommu_bond_unlink(info->domain, dev);
+ info->domain = NULL;
+
+ return 0;
+}
+
+static struct iommu_domain riscv_iommu_identity_domain = {
+ .type = IOMMU_DOMAIN_IDENTITY,
+ .ops = &(const struct iommu_domain_ops) {
+ .attach_dev = riscv_iommu_attach_identity_domain,
+ }
+};
+
+static struct iommu_group *riscv_iommu_device_group(struct device *dev)
+{
+ if (dev_is_pci(dev))
+ return pci_device_group(dev);
+ return generic_device_group(dev);
+}
+
+static int riscv_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args)
+{
+ return iommu_fwspec_add_ids(dev, args->args, 1);
+}
+
+static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
+{
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+ struct riscv_iommu_device *iommu;
+ struct riscv_iommu_info *info;
+ struct riscv_iommu_dc *dc;
+ u64 tc;
+ int i;
+
+ if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids)
+ return ERR_PTR(-ENODEV);
+
+ iommu = dev_get_drvdata(fwspec->iommu_fwnode->dev);
+ if (!iommu)
+ return ERR_PTR(-ENODEV);
+
+ /*
+ * IOMMU hardware operating in fail-over BARE mode will provide
+ * identity translation for all connected devices anyway...
+ */
+ if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)
+ return ERR_PTR(-ENODEV);
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return ERR_PTR(-ENOMEM);
+ /*
+ * Allocate and pre-configure device context entries in
+ * the device directory. Do not mark the context valid yet.
+ */
+ tc = 0;
+ if (iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD)
+ tc |= RISCV_IOMMU_DC_TC_SADE;
+ for (i = 0; i < fwspec->num_ids; i++) {
+ dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
+ if (!dc) {
+ kfree(info);
+ return ERR_PTR(-ENODEV);
+ }
+ if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V)
+ dev_warn(dev, "already attached to IOMMU device directory\n");
+ WRITE_ONCE(dc->tc, tc);
+ }
+
+ dev_iommu_priv_set(dev, info);
+
+ return &iommu->iommu;
+}
+
+static void riscv_iommu_release_device(struct device *dev)
+{
+ struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
+
+ kfree_rcu_mightsleep(info);
+}
+
+static const struct iommu_ops riscv_iommu_ops = {
+ .of_xlate = riscv_iommu_of_xlate,
+ .identity_domain = &riscv_iommu_identity_domain,
+ .blocked_domain = &riscv_iommu_blocking_domain,
+ .release_domain = &riscv_iommu_blocking_domain,
+ .domain_alloc_paging = riscv_iommu_alloc_paging_domain,
+ .device_group = riscv_iommu_device_group,
+ .probe_device = riscv_iommu_probe_device,
+ .release_device = riscv_iommu_release_device,
+};
+
+static int riscv_iommu_init_check(struct riscv_iommu_device *iommu)
+{
+ u64 ddtp;
+
+ /*
+ * Make sure the IOMMU is switched off or in pass-through mode during
+ * regular boot flow and disable translation when we boot into a kexec
+ * kernel and the previous kernel left them enabled.
+ */
+ ddtp = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_DDTP);
+ if (ddtp & RISCV_IOMMU_DDTP_BUSY)
+ return -EBUSY;
+
+ if (FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp) >
+ RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) {
+ if (!is_kdump_kernel())
+ return -EBUSY;
+ riscv_iommu_disable(iommu);
+ }
+
+ /* Configure accesses to in-memory data structures for CPU-native byte order. */
+ if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
+ !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) {
+ if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_END))
+ return -EINVAL;
+ riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL,
+ iommu->fctl ^ RISCV_IOMMU_FCTL_BE);
+ iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL);
+ if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) !=
+ !!(iommu->fctl & RISCV_IOMMU_FCTL_BE))
+ return -EINVAL;
+ }
+
+ /*
+ * Distribute interrupt vectors, always use first vector for CIV.
+ * At least one interrupt is required. Read back and verify.
+ */
+ if (!iommu->irqs_count)
+ return -EINVAL;
+
+ iommu->icvec = FIELD_PREP(RISCV_IOMMU_ICVEC_FIV, 1 % iommu->irqs_count) |
+ FIELD_PREP(RISCV_IOMMU_ICVEC_PIV, 2 % iommu->irqs_count) |
+ FIELD_PREP(RISCV_IOMMU_ICVEC_PMIV, 3 % iommu->irqs_count);
+ riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_ICVEC, iommu->icvec);
+ iommu->icvec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_ICVEC);
+ if (max(max(FIELD_GET(RISCV_IOMMU_ICVEC_CIV, iommu->icvec),
+ FIELD_GET(RISCV_IOMMU_ICVEC_FIV, iommu->icvec)),
+ max(FIELD_GET(RISCV_IOMMU_ICVEC_PIV, iommu->icvec),
+ FIELD_GET(RISCV_IOMMU_ICVEC_PMIV, iommu->icvec))) >= iommu->irqs_count)
+ return -EINVAL;
+
+ return 0;
+}
+
+void riscv_iommu_remove(struct riscv_iommu_device *iommu)
+{
+ iommu_device_unregister(&iommu->iommu);
+ iommu_device_sysfs_remove(&iommu->iommu);
+ riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
+ riscv_iommu_queue_disable(&iommu->cmdq);
+ riscv_iommu_queue_disable(&iommu->fltq);
+}
+
+int riscv_iommu_init(struct riscv_iommu_device *iommu)
+{
+ int rc;
+
+ RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ);
+ RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ);
+
+ rc = riscv_iommu_init_check(iommu);
+ if (rc)
+ return dev_err_probe(iommu->dev, rc, "unexpected device state\n");
+
+ rc = riscv_iommu_iodir_alloc(iommu);
+ if (rc)
+ return rc;
+
+ rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq,
+ sizeof(struct riscv_iommu_command));
+ if (rc)
+ return rc;
+
+ rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq,
+ sizeof(struct riscv_iommu_fq_record));
+ if (rc)
+ return rc;
+
+ rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process);
+ if (rc)
+ return rc;
+
+ rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process);
+ if (rc)
+ goto err_queue_disable;
+
+ rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX);
+ if (rc)
+ goto err_queue_disable;
+
+ rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s",
+ dev_name(iommu->dev));
+ if (rc) {
+ dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n");
+ goto err_iodir_off;
+ }
+
+ if (!acpi_disabled) {
+ rc = rimt_iommu_register(iommu->dev);
+ if (rc) {
+ dev_err_probe(iommu->dev, rc, "cannot register iommu with RIMT\n");
+ goto err_remove_sysfs;
+ }
+ }
+
+ rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev);
+ if (rc) {
+ dev_err_probe(iommu->dev, rc, "cannot register iommu interface\n");
+ goto err_remove_sysfs;
+ }
+
+ return 0;
+
+err_remove_sysfs:
+ iommu_device_sysfs_remove(&iommu->iommu);
+err_iodir_off:
+ riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF);
+err_queue_disable:
+ riscv_iommu_queue_disable(&iommu->fltq);
+ riscv_iommu_queue_disable(&iommu->cmdq);
+ return rc;
+}
diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
new file mode 100644
index 000000000000..46df79dd5495
--- /dev/null
+++ b/drivers/iommu/riscv/iommu.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright © 2022-2024 Rivos Inc.
+ * Copyright © 2023 FORTH-ICS/CARV
+ *
+ * Authors
+ * Tomasz Jeznach <tjeznach@rivosinc.com>
+ * Nick Kossifidis <mick@ics.forth.gr>
+ */
+
+#ifndef _RISCV_IOMMU_H_
+#define _RISCV_IOMMU_H_
+
+#include <linux/iommu.h>
+#include <linux/types.h>
+#include <linux/iopoll.h>
+
+#include "iommu-bits.h"
+
+struct riscv_iommu_device;
+
+struct riscv_iommu_queue {
+ atomic_t prod; /* unbounded producer allocation index */
+ atomic_t head; /* unbounded shadow ring buffer consumer index */
+ atomic_t tail; /* unbounded shadow ring buffer producer index */
+ unsigned int mask; /* index mask, queue length - 1 */
+ unsigned int irq; /* allocated interrupt number */
+ struct riscv_iommu_device *iommu; /* iommu device handling the queue when active */
+ void *base; /* ring buffer kernel pointer */
+ dma_addr_t phys; /* ring buffer physical address */
+ u16 qbr; /* base register offset, head and tail reference */
+ u16 qcr; /* control and status register offset */
+ u8 qid; /* queue identifier, same as RISCV_IOMMU_INTR_XX */
+};
+
+struct riscv_iommu_device {
+ /* iommu core interface */
+ struct iommu_device iommu;
+
+ /* iommu hardware */
+ struct device *dev;
+
+ /* hardware control register space */
+ void __iomem *reg;
+
+ /* supported and enabled hardware capabilities */
+ u64 caps;
+ u32 fctl;
+
+ /* available interrupt numbers, MSI or WSI */
+ unsigned int irqs[RISCV_IOMMU_INTR_COUNT];
+ unsigned int irqs_count;
+ unsigned int icvec;
+
+ /* hardware queues */
+ struct riscv_iommu_queue cmdq;
+ struct riscv_iommu_queue fltq;
+
+ /* device directory */
+ unsigned int ddt_mode;
+ dma_addr_t ddt_phys;
+ u64 *ddt_root;
+};
+
+int riscv_iommu_init(struct riscv_iommu_device *iommu);
+void riscv_iommu_remove(struct riscv_iommu_device *iommu);
+void riscv_iommu_disable(struct riscv_iommu_device *iommu);
+
+#define riscv_iommu_readl(iommu, addr) \
+ readl_relaxed((iommu)->reg + (addr))
+
+#define riscv_iommu_readq(iommu, addr) \
+ readq_relaxed((iommu)->reg + (addr))
+
+#define riscv_iommu_writel(iommu, addr, val) \
+ writel_relaxed((val), (iommu)->reg + (addr))
+
+#define riscv_iommu_writeq(iommu, addr, val) \
+ writeq_relaxed((val), (iommu)->reg + (addr))
+
+#define riscv_iommu_readq_timeout(iommu, addr, val, cond, delay_us, timeout_us) \
+ readx_poll_timeout(readq_relaxed, (iommu)->reg + (addr), val, cond, \
+ delay_us, timeout_us)
+
+#define riscv_iommu_readl_timeout(iommu, addr, val, cond, delay_us, timeout_us) \
+ readx_poll_timeout(readl_relaxed, (iommu)->reg + (addr), val, cond, \
+ delay_us, timeout_us)
+
+#endif
diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
index 4b369419b32c..85f3667e797c 100644
--- a/drivers/iommu/rockchip-iommu.c
+++ b/drivers/iommu/rockchip-iommu.c
@@ -25,6 +25,7 @@
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/string_choices.h>
#include "iommu-pages.h"
@@ -87,6 +88,7 @@ struct rk_iommu_domain {
dma_addr_t dt_dma;
spinlock_t iommus_lock; /* lock for iommus list */
spinlock_t dt_lock; /* lock for modifying page directory table */
+ struct device *dma_dev;
struct iommu_domain domain;
};
@@ -122,7 +124,6 @@ struct rk_iommudata {
struct rk_iommu *iommu;
};
-static struct device *dma_dev;
static const struct rk_iommu_ops *rk_ops;
static struct iommu_domain rk_identity_domain;
@@ -131,7 +132,7 @@ static inline void rk_table_flush(struct rk_iommu_domain *dom, dma_addr_t dma,
{
size_t size = count * sizeof(u32); /* count of u32 entry */
- dma_sync_single_for_device(dma_dev, dma, size, DMA_TO_DEVICE);
+ dma_sync_single_for_device(dom->dma_dev, dma, size, DMA_TO_DEVICE);
}
static struct rk_iommu_domain *to_rk_domain(struct iommu_domain *dom)
@@ -611,7 +612,7 @@ static irqreturn_t rk_iommu_irq(int irq, void *dev_id)
dev_err(iommu->dev, "Page fault at %pad of type %s\n",
&iova,
- (flags == IOMMU_FAULT_WRITE) ? "write" : "read");
+ str_write_read(flags == IOMMU_FAULT_WRITE));
log_iova(iommu, i, iova);
@@ -729,14 +730,15 @@ static u32 *rk_dte_get_page_table(struct rk_iommu_domain *rk_domain,
if (rk_dte_is_pt_valid(dte))
goto done;
- page_table = iommu_alloc_page(GFP_ATOMIC | rk_ops->gfp_flags);
+ page_table = iommu_alloc_pages_sz(GFP_ATOMIC | rk_ops->gfp_flags,
+ SPAGE_SIZE);
if (!page_table)
return ERR_PTR(-ENOMEM);
- pt_dma = dma_map_single(dma_dev, page_table, SPAGE_SIZE, DMA_TO_DEVICE);
- if (dma_mapping_error(dma_dev, pt_dma)) {
- dev_err(dma_dev, "DMA mapping error while allocating page table\n");
- iommu_free_page(page_table);
+ pt_dma = dma_map_single(rk_domain->dma_dev, page_table, SPAGE_SIZE, DMA_TO_DEVICE);
+ if (dma_mapping_error(rk_domain->dma_dev, pt_dma)) {
+ dev_err(rk_domain->dma_dev, "DMA mapping error while allocating page table\n");
+ iommu_free_pages(page_table);
return ERR_PTR(-ENOMEM);
}
@@ -958,7 +960,8 @@ out_disable_clocks:
}
static int rk_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct rk_iommu *iommu;
struct rk_iommu_domain *rk_domain;
@@ -1003,7 +1006,7 @@ static struct iommu_domain rk_identity_domain = {
};
static int rk_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct rk_iommu *iommu;
struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
@@ -1024,7 +1027,7 @@ static int rk_iommu_attach_device(struct iommu_domain *domain,
if (iommu->domain == domain)
return 0;
- ret = rk_iommu_identity_attach(&rk_identity_domain, dev);
+ ret = rk_iommu_identity_attach(&rk_identity_domain, dev, old);
if (ret)
return ret;
@@ -1039,8 +1042,17 @@ static int rk_iommu_attach_device(struct iommu_domain *domain,
return 0;
ret = rk_iommu_enable(iommu);
- if (ret)
- WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev));
+ if (ret) {
+ /*
+ * Note rk_iommu_identity_attach() might fail before physically
+ * attaching the dev to iommu->domain, in which case the actual
+ * old domain for this revert should be rk_identity_domain v.s.
+ * iommu->domain. Since rk_iommu_identity_attach() does not care
+ * about the old domain argument for now, this is not a problem.
+ */
+ WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev,
+ iommu->domain));
+ }
pm_runtime_put(iommu->dev);
@@ -1050,9 +1062,7 @@ static int rk_iommu_attach_device(struct iommu_domain *domain,
static struct iommu_domain *rk_iommu_domain_alloc_paging(struct device *dev)
{
struct rk_iommu_domain *rk_domain;
-
- if (!dma_dev)
- return NULL;
+ struct rk_iommu *iommu;
rk_domain = kzalloc(sizeof(*rk_domain), GFP_KERNEL);
if (!rk_domain)
@@ -1063,14 +1073,17 @@ static struct iommu_domain *rk_iommu_domain_alloc_paging(struct device *dev)
* Each level1 (dt) and level2 (pt) table has 1024 4-byte entries.
* Allocate one 4 KiB page for each table.
*/
- rk_domain->dt = iommu_alloc_page(GFP_KERNEL | rk_ops->gfp_flags);
+ rk_domain->dt = iommu_alloc_pages_sz(GFP_KERNEL | rk_ops->gfp_flags,
+ SPAGE_SIZE);
if (!rk_domain->dt)
goto err_free_domain;
- rk_domain->dt_dma = dma_map_single(dma_dev, rk_domain->dt,
+ iommu = rk_iommu_from_dev(dev);
+ rk_domain->dma_dev = iommu->dev;
+ rk_domain->dt_dma = dma_map_single(rk_domain->dma_dev, rk_domain->dt,
SPAGE_SIZE, DMA_TO_DEVICE);
- if (dma_mapping_error(dma_dev, rk_domain->dt_dma)) {
- dev_err(dma_dev, "DMA map error for DT\n");
+ if (dma_mapping_error(rk_domain->dma_dev, rk_domain->dt_dma)) {
+ dev_err(rk_domain->dma_dev, "DMA map error for DT\n");
goto err_free_dt;
}
@@ -1078,6 +1091,8 @@ static struct iommu_domain *rk_iommu_domain_alloc_paging(struct device *dev)
spin_lock_init(&rk_domain->dt_lock);
INIT_LIST_HEAD(&rk_domain->iommus);
+ rk_domain->domain.pgsize_bitmap = RK_IOMMU_PGSIZE_BITMAP;
+
rk_domain->domain.geometry.aperture_start = 0;
rk_domain->domain.geometry.aperture_end = DMA_BIT_MASK(32);
rk_domain->domain.geometry.force_aperture = true;
@@ -1085,7 +1100,7 @@ static struct iommu_domain *rk_iommu_domain_alloc_paging(struct device *dev)
return &rk_domain->domain;
err_free_dt:
- iommu_free_page(rk_domain->dt);
+ iommu_free_pages(rk_domain->dt);
err_free_domain:
kfree(rk_domain);
@@ -1104,15 +1119,15 @@ static void rk_iommu_domain_free(struct iommu_domain *domain)
if (rk_dte_is_pt_valid(dte)) {
phys_addr_t pt_phys = rk_ops->pt_address(dte);
u32 *page_table = phys_to_virt(pt_phys);
- dma_unmap_single(dma_dev, pt_phys,
+ dma_unmap_single(rk_domain->dma_dev, pt_phys,
SPAGE_SIZE, DMA_TO_DEVICE);
- iommu_free_page(page_table);
+ iommu_free_pages(page_table);
}
}
- dma_unmap_single(dma_dev, rk_domain->dt_dma,
+ dma_unmap_single(rk_domain->dma_dev, rk_domain->dt_dma,
SPAGE_SIZE, DMA_TO_DEVICE);
- iommu_free_page(rk_domain->dt);
+ iommu_free_pages(rk_domain->dt);
kfree(rk_domain);
}
@@ -1147,14 +1162,13 @@ static int rk_iommu_of_xlate(struct device *dev,
struct platform_device *iommu_dev;
struct rk_iommudata *data;
- data = devm_kzalloc(dma_dev, sizeof(*data), GFP_KERNEL);
+ iommu_dev = of_find_device_by_node(args->np);
+
+ data = devm_kzalloc(&iommu_dev->dev, sizeof(*data), GFP_KERNEL);
if (!data)
return -ENOMEM;
- iommu_dev = of_find_device_by_node(args->np);
-
data->iommu = platform_get_drvdata(iommu_dev);
- data->iommu->domain = &rk_identity_domain;
dev_iommu_priv_set(dev, data);
platform_device_put(iommu_dev);
@@ -1168,7 +1182,6 @@ static const struct iommu_ops rk_iommu_ops = {
.probe_device = rk_iommu_probe_device,
.release_device = rk_iommu_release_device,
.device_group = generic_single_device_group,
- .pgsize_bitmap = RK_IOMMU_PGSIZE_BITMAP,
.of_xlate = rk_iommu_of_xlate,
.default_domain_ops = &(const struct iommu_domain_ops) {
.attach_dev = rk_iommu_attach_device,
@@ -1192,6 +1205,8 @@ static int rk_iommu_probe(struct platform_device *pdev)
if (!iommu)
return -ENOMEM;
+ iommu->domain = &rk_identity_domain;
+
platform_set_drvdata(pdev, iommu);
iommu->dev = dev;
iommu->num_mmu = 0;
@@ -1255,22 +1270,6 @@ static int rk_iommu_probe(struct platform_device *pdev)
if (err)
return err;
- err = iommu_device_sysfs_add(&iommu->iommu, dev, NULL, dev_name(dev));
- if (err)
- goto err_unprepare_clocks;
-
- err = iommu_device_register(&iommu->iommu, &rk_iommu_ops, dev);
- if (err)
- goto err_remove_sysfs;
-
- /*
- * Use the first registered IOMMU device for domain to use with DMA
- * API, since a domain might not physically correspond to a single
- * IOMMU device..
- */
- if (!dma_dev)
- dma_dev = &pdev->dev;
-
pm_runtime_enable(dev);
for (i = 0; i < iommu->num_irq; i++) {
@@ -1289,12 +1288,19 @@ static int rk_iommu_probe(struct platform_device *pdev)
dma_set_mask_and_coherent(dev, rk_ops->dma_bit_mask);
+ err = iommu_device_sysfs_add(&iommu->iommu, dev, NULL, dev_name(dev));
+ if (err)
+ goto err_pm_disable;
+
+ err = iommu_device_register(&iommu->iommu, &rk_iommu_ops, dev);
+ if (err)
+ goto err_remove_sysfs;
+
return 0;
-err_pm_disable:
- pm_runtime_disable(dev);
err_remove_sysfs:
iommu_device_sysfs_remove(&iommu->iommu);
-err_unprepare_clocks:
+err_pm_disable:
+ pm_runtime_disable(dev);
clk_bulk_unprepare(iommu->num_clocks, iommu->clocks);
return err;
}
diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index d8eaa7ea380b..fe679850af28 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -16,7 +16,7 @@
#include "dma-iommu.h"
-static const struct iommu_ops s390_iommu_ops;
+static const struct iommu_ops s390_iommu_ops, s390_iommu_rtr_ops;
static struct kmem_cache *dma_region_table_cache;
static struct kmem_cache *dma_page_table_cache;
@@ -31,8 +31,21 @@ struct s390_domain {
unsigned long *dma_table;
spinlock_t list_lock;
struct rcu_head rcu;
+ u8 origin_type;
};
+static struct iommu_domain blocking_domain;
+
+static inline unsigned int calc_rfx(dma_addr_t ptr)
+{
+ return ((unsigned long)ptr >> ZPCI_RF_SHIFT) & ZPCI_INDEX_MASK;
+}
+
+static inline unsigned int calc_rsx(dma_addr_t ptr)
+{
+ return ((unsigned long)ptr >> ZPCI_RS_SHIFT) & ZPCI_INDEX_MASK;
+}
+
static inline unsigned int calc_rtx(dma_addr_t ptr)
{
return ((unsigned long)ptr >> ZPCI_RT_SHIFT) & ZPCI_INDEX_MASK;
@@ -54,6 +67,20 @@ static inline void set_pt_pfaa(unsigned long *entry, phys_addr_t pfaa)
*entry |= (pfaa & ZPCI_PTE_ADDR_MASK);
}
+static inline void set_rf_rso(unsigned long *entry, phys_addr_t rso)
+{
+ *entry &= ZPCI_RTE_FLAG_MASK;
+ *entry |= (rso & ZPCI_RTE_ADDR_MASK);
+ *entry |= ZPCI_TABLE_TYPE_RFX;
+}
+
+static inline void set_rs_rto(unsigned long *entry, phys_addr_t rto)
+{
+ *entry &= ZPCI_RTE_FLAG_MASK;
+ *entry |= (rto & ZPCI_RTE_ADDR_MASK);
+ *entry |= ZPCI_TABLE_TYPE_RSX;
+}
+
static inline void set_rt_sto(unsigned long *entry, phys_addr_t sto)
{
*entry &= ZPCI_RTE_FLAG_MASK;
@@ -68,6 +95,22 @@ static inline void set_st_pto(unsigned long *entry, phys_addr_t pto)
*entry |= ZPCI_TABLE_TYPE_SX;
}
+static inline void validate_rf_entry(unsigned long *entry)
+{
+ *entry &= ~ZPCI_TABLE_VALID_MASK;
+ *entry &= ~ZPCI_TABLE_OFFSET_MASK;
+ *entry |= ZPCI_TABLE_VALID;
+ *entry |= ZPCI_TABLE_LEN_RFX;
+}
+
+static inline void validate_rs_entry(unsigned long *entry)
+{
+ *entry &= ~ZPCI_TABLE_VALID_MASK;
+ *entry &= ~ZPCI_TABLE_OFFSET_MASK;
+ *entry |= ZPCI_TABLE_VALID;
+ *entry |= ZPCI_TABLE_LEN_RSX;
+}
+
static inline void validate_rt_entry(unsigned long *entry)
{
*entry &= ~ZPCI_TABLE_VALID_MASK;
@@ -118,6 +161,22 @@ static inline int pt_entry_isvalid(unsigned long entry)
return (entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID;
}
+static inline unsigned long *get_rf_rso(unsigned long entry)
+{
+ if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RFX)
+ return phys_to_virt(entry & ZPCI_RTE_ADDR_MASK);
+ else
+ return NULL;
+}
+
+static inline unsigned long *get_rs_rto(unsigned long entry)
+{
+ if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RSX)
+ return phys_to_virt(entry & ZPCI_RTE_ADDR_MASK);
+ else
+ return NULL;
+}
+
static inline unsigned long *get_rt_sto(unsigned long entry)
{
if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RTX)
@@ -189,18 +248,59 @@ static void dma_free_seg_table(unsigned long entry)
dma_free_cpu_table(sto);
}
-static void dma_cleanup_tables(unsigned long *table)
+static void dma_free_rt_table(unsigned long entry)
{
+ unsigned long *rto = get_rs_rto(entry);
int rtx;
- if (!table)
+ for (rtx = 0; rtx < ZPCI_TABLE_ENTRIES; rtx++)
+ if (reg_entry_isvalid(rto[rtx]))
+ dma_free_seg_table(rto[rtx]);
+
+ dma_free_cpu_table(rto);
+}
+
+static void dma_free_rs_table(unsigned long entry)
+{
+ unsigned long *rso = get_rf_rso(entry);
+ int rsx;
+
+ for (rsx = 0; rsx < ZPCI_TABLE_ENTRIES; rsx++)
+ if (reg_entry_isvalid(rso[rsx]))
+ dma_free_rt_table(rso[rsx]);
+
+ dma_free_cpu_table(rso);
+}
+
+static void dma_cleanup_tables(struct s390_domain *domain)
+{
+ int rtx, rsx, rfx;
+
+ if (!domain->dma_table)
return;
- for (rtx = 0; rtx < ZPCI_TABLE_ENTRIES; rtx++)
- if (reg_entry_isvalid(table[rtx]))
- dma_free_seg_table(table[rtx]);
+ switch (domain->origin_type) {
+ case ZPCI_TABLE_TYPE_RFX:
+ for (rfx = 0; rfx < ZPCI_TABLE_ENTRIES; rfx++)
+ if (reg_entry_isvalid(domain->dma_table[rfx]))
+ dma_free_rs_table(domain->dma_table[rfx]);
+ break;
+ case ZPCI_TABLE_TYPE_RSX:
+ for (rsx = 0; rsx < ZPCI_TABLE_ENTRIES; rsx++)
+ if (reg_entry_isvalid(domain->dma_table[rsx]))
+ dma_free_rt_table(domain->dma_table[rsx]);
+ break;
+ case ZPCI_TABLE_TYPE_RTX:
+ for (rtx = 0; rtx < ZPCI_TABLE_ENTRIES; rtx++)
+ if (reg_entry_isvalid(domain->dma_table[rtx]))
+ dma_free_seg_table(domain->dma_table[rtx]);
+ break;
+ default:
+ WARN_ONCE(1, "Invalid IOMMU table (%x)\n", domain->origin_type);
+ return;
+ }
- dma_free_cpu_table(table);
+ dma_free_cpu_table(domain->dma_table);
}
static unsigned long *dma_alloc_page_table(gfp_t gfp)
@@ -216,6 +316,70 @@ static unsigned long *dma_alloc_page_table(gfp_t gfp)
return table;
}
+static unsigned long *dma_walk_rs_table(unsigned long *rso,
+ dma_addr_t dma_addr, gfp_t gfp)
+{
+ unsigned int rsx = calc_rsx(dma_addr);
+ unsigned long old_rse, rse;
+ unsigned long *rsep, *rto;
+
+ rsep = &rso[rsx];
+ rse = READ_ONCE(*rsep);
+ if (reg_entry_isvalid(rse)) {
+ rto = get_rs_rto(rse);
+ } else {
+ rto = dma_alloc_cpu_table(gfp);
+ if (!rto)
+ return NULL;
+
+ set_rs_rto(&rse, virt_to_phys(rto));
+ validate_rs_entry(&rse);
+ entry_clr_protected(&rse);
+
+ old_rse = cmpxchg(rsep, ZPCI_TABLE_INVALID, rse);
+ if (old_rse != ZPCI_TABLE_INVALID) {
+ /* Somone else was faster, use theirs */
+ dma_free_cpu_table(rto);
+ rto = get_rs_rto(old_rse);
+ }
+ }
+ return rto;
+}
+
+static unsigned long *dma_walk_rf_table(unsigned long *rfo,
+ dma_addr_t dma_addr, gfp_t gfp)
+{
+ unsigned int rfx = calc_rfx(dma_addr);
+ unsigned long old_rfe, rfe;
+ unsigned long *rfep, *rso;
+
+ rfep = &rfo[rfx];
+ rfe = READ_ONCE(*rfep);
+ if (reg_entry_isvalid(rfe)) {
+ rso = get_rf_rso(rfe);
+ } else {
+ rso = dma_alloc_cpu_table(gfp);
+ if (!rso)
+ return NULL;
+
+ set_rf_rso(&rfe, virt_to_phys(rso));
+ validate_rf_entry(&rfe);
+ entry_clr_protected(&rfe);
+
+ old_rfe = cmpxchg(rfep, ZPCI_TABLE_INVALID, rfe);
+ if (old_rfe != ZPCI_TABLE_INVALID) {
+ /* Somone else was faster, use theirs */
+ dma_free_cpu_table(rso);
+ rso = get_rf_rso(old_rfe);
+ }
+ }
+
+ if (!rso)
+ return NULL;
+
+ return dma_walk_rs_table(rso, dma_addr, gfp);
+}
+
static unsigned long *dma_get_seg_table_origin(unsigned long *rtep, gfp_t gfp)
{
unsigned long old_rte, rte;
@@ -269,11 +433,31 @@ static unsigned long *dma_get_page_table_origin(unsigned long *step, gfp_t gfp)
return pto;
}
-static unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr, gfp_t gfp)
+static unsigned long *dma_walk_region_tables(struct s390_domain *domain,
+ dma_addr_t dma_addr, gfp_t gfp)
{
- unsigned long *sto, *pto;
+ switch (domain->origin_type) {
+ case ZPCI_TABLE_TYPE_RFX:
+ return dma_walk_rf_table(domain->dma_table, dma_addr, gfp);
+ case ZPCI_TABLE_TYPE_RSX:
+ return dma_walk_rs_table(domain->dma_table, dma_addr, gfp);
+ case ZPCI_TABLE_TYPE_RTX:
+ return domain->dma_table;
+ default:
+ return NULL;
+ }
+}
+
+static unsigned long *dma_walk_cpu_trans(struct s390_domain *domain,
+ dma_addr_t dma_addr, gfp_t gfp)
+{
+ unsigned long *rto, *sto, *pto;
unsigned int rtx, sx, px;
+ rto = dma_walk_region_tables(domain, dma_addr, gfp);
+ if (!rto)
+ return NULL;
+
rtx = calc_rtx(dma_addr);
sto = dma_get_seg_table_origin(&rto[rtx], gfp);
if (!sto)
@@ -327,9 +511,25 @@ static bool s390_iommu_capable(struct device *dev, enum iommu_cap cap)
}
}
+static inline u64 max_tbl_size(struct s390_domain *domain)
+{
+ switch (domain->origin_type) {
+ case ZPCI_TABLE_TYPE_RTX:
+ return ZPCI_TABLE_SIZE_RT - 1;
+ case ZPCI_TABLE_TYPE_RSX:
+ return ZPCI_TABLE_SIZE_RS - 1;
+ case ZPCI_TABLE_TYPE_RFX:
+ return U64_MAX;
+ default:
+ return 0;
+ }
+}
+
static struct iommu_domain *s390_domain_alloc_paging(struct device *dev)
{
+ struct zpci_dev *zdev = to_zpci_dev(dev);
struct s390_domain *s390_domain;
+ u64 aperture_size;
s390_domain = kzalloc(sizeof(*s390_domain), GFP_KERNEL);
if (!s390_domain)
@@ -340,9 +540,27 @@ static struct iommu_domain *s390_domain_alloc_paging(struct device *dev)
kfree(s390_domain);
return NULL;
}
+
+ aperture_size = min(s390_iommu_aperture,
+ zdev->end_dma - zdev->start_dma + 1);
+ if (aperture_size <= (ZPCI_TABLE_SIZE_RT - zdev->start_dma)) {
+ s390_domain->origin_type = ZPCI_TABLE_TYPE_RTX;
+ } else if (aperture_size <= (ZPCI_TABLE_SIZE_RS - zdev->start_dma) &&
+ (zdev->dtsm & ZPCI_IOTA_DT_RS)) {
+ s390_domain->origin_type = ZPCI_TABLE_TYPE_RSX;
+ } else if (zdev->dtsm & ZPCI_IOTA_DT_RF) {
+ s390_domain->origin_type = ZPCI_TABLE_TYPE_RFX;
+ } else {
+ /* Assume RTX available */
+ s390_domain->origin_type = ZPCI_TABLE_TYPE_RTX;
+ aperture_size = ZPCI_TABLE_SIZE_RT - zdev->start_dma;
+ }
+ zdev->end_dma = zdev->start_dma + aperture_size - 1;
+
+ s390_domain->domain.pgsize_bitmap = SZ_4K;
s390_domain->domain.geometry.force_aperture = true;
s390_domain->domain.geometry.aperture_start = 0;
- s390_domain->domain.geometry.aperture_end = ZPCI_TABLE_SIZE_RT - 1;
+ s390_domain->domain.geometry.aperture_end = max_tbl_size(s390_domain);
spin_lock_init(&s390_domain->list_lock);
INIT_LIST_HEAD_RCU(&s390_domain->devices);
@@ -354,7 +572,7 @@ static void s390_iommu_rcu_free_domain(struct rcu_head *head)
{
struct s390_domain *s390_domain = container_of(head, struct s390_domain, rcu);
- dma_cleanup_tables(s390_domain->dma_table);
+ dma_cleanup_tables(s390_domain);
kfree(s390_domain);
}
@@ -369,24 +587,116 @@ static void s390_domain_free(struct iommu_domain *domain)
call_rcu(&s390_domain->rcu, s390_iommu_rcu_free_domain);
}
-static void s390_iommu_detach_device(struct iommu_domain *domain,
- struct device *dev)
+static void zdev_s390_domain_update(struct zpci_dev *zdev,
+ struct iommu_domain *domain)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&zdev->dom_lock, flags);
+ zdev->s390_domain = domain;
+ spin_unlock_irqrestore(&zdev->dom_lock, flags);
+}
+
+static u64 get_iota_region_flag(struct s390_domain *domain)
+{
+ switch (domain->origin_type) {
+ case ZPCI_TABLE_TYPE_RTX:
+ return ZPCI_IOTA_RTTO_FLAG;
+ case ZPCI_TABLE_TYPE_RSX:
+ return ZPCI_IOTA_RSTO_FLAG;
+ case ZPCI_TABLE_TYPE_RFX:
+ return ZPCI_IOTA_RFTO_FLAG;
+ default:
+ WARN_ONCE(1, "Invalid IOMMU table (%x)\n", domain->origin_type);
+ return 0;
+ }
+}
+
+static bool reg_ioat_propagate_error(int cc, u8 status)
+{
+ /*
+ * If the device is in the error state the reset routine
+ * will register the IOAT of the newly set domain on re-enable
+ */
+ if (cc == ZPCI_CC_ERR && status == ZPCI_PCI_ST_FUNC_NOT_AVAIL)
+ return false;
+ /*
+ * If the device was removed treat registration as success
+ * and let the subsequent error event trigger tear down.
+ */
+ if (cc == ZPCI_CC_INVAL_HANDLE)
+ return false;
+ return cc != ZPCI_CC_OK;
+}
+
+static int s390_iommu_domain_reg_ioat(struct zpci_dev *zdev,
+ struct iommu_domain *domain, u8 *status)
+{
+ struct s390_domain *s390_domain;
+ int rc = 0;
+ u64 iota;
+
+ switch (domain->type) {
+ case IOMMU_DOMAIN_IDENTITY:
+ rc = zpci_register_ioat(zdev, 0, zdev->start_dma,
+ zdev->end_dma, 0, status);
+ break;
+ case IOMMU_DOMAIN_BLOCKED:
+ /* Nothing to do in this case */
+ break;
+ default:
+ s390_domain = to_s390_domain(domain);
+ iota = virt_to_phys(s390_domain->dma_table) |
+ get_iota_region_flag(s390_domain);
+ rc = zpci_register_ioat(zdev, 0, zdev->start_dma,
+ zdev->end_dma, iota, status);
+ }
+
+ return rc;
+}
+
+int zpci_iommu_register_ioat(struct zpci_dev *zdev, u8 *status)
+{
+ unsigned long flags;
+ int rc;
+
+ spin_lock_irqsave(&zdev->dom_lock, flags);
+
+ rc = s390_iommu_domain_reg_ioat(zdev, zdev->s390_domain, status);
+
+ spin_unlock_irqrestore(&zdev->dom_lock, flags);
+
+ return rc;
+}
+
+static int blocking_domain_attach_device(struct iommu_domain *domain,
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct s390_domain *s390_domain = to_s390_domain(domain);
struct zpci_dev *zdev = to_zpci_dev(dev);
+ struct s390_domain *s390_domain;
unsigned long flags;
- spin_lock_irqsave(&s390_domain->list_lock, flags);
- list_del_rcu(&zdev->iommu_list);
- spin_unlock_irqrestore(&s390_domain->list_lock, flags);
+ if (zdev->s390_domain->type == IOMMU_DOMAIN_BLOCKED)
+ return 0;
+
+ s390_domain = to_s390_domain(zdev->s390_domain);
+ if (zdev->dma_table) {
+ spin_lock_irqsave(&s390_domain->list_lock, flags);
+ list_del_rcu(&zdev->iommu_list);
+ spin_unlock_irqrestore(&s390_domain->list_lock, flags);
+ }
zpci_unregister_ioat(zdev, 0);
- zdev->s390_domain = NULL;
zdev->dma_table = NULL;
+ zdev_s390_domain_update(zdev, domain);
+
+ return 0;
}
static int s390_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct s390_domain *s390_domain = to_s390_domain(domain);
struct zpci_dev *zdev = to_zpci_dev(dev);
@@ -401,20 +711,14 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
domain->geometry.aperture_end < zdev->start_dma))
return -EINVAL;
- if (zdev->s390_domain)
- s390_iommu_detach_device(&zdev->s390_domain->domain, dev);
+ blocking_domain_attach_device(&blocking_domain, dev, old);
- cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
- virt_to_phys(s390_domain->dma_table), &status);
- /*
- * If the device is undergoing error recovery the reset code
- * will re-establish the new domain.
- */
- if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL)
+ /* If we fail now DMA remains blocked via blocking domain */
+ cc = s390_iommu_domain_reg_ioat(zdev, domain, &status);
+ if (reg_ioat_propagate_error(cc, status))
return -EIO;
-
zdev->dma_table = s390_domain->dma_table;
- zdev->s390_domain = s390_domain;
+ zdev_s390_domain_update(zdev, domain);
spin_lock_irqsave(&s390_domain->list_lock, flags);
list_add_rcu(&zdev->iommu_list, &s390_domain->devices);
@@ -428,6 +732,8 @@ static void s390_iommu_get_resv_regions(struct device *dev,
{
struct zpci_dev *zdev = to_zpci_dev(dev);
struct iommu_resv_region *region;
+ u64 max_size, end_resv;
+ unsigned long flags;
if (zdev->start_dma) {
region = iommu_alloc_resv_region(0, zdev->start_dma, 0,
@@ -437,10 +743,21 @@ static void s390_iommu_get_resv_regions(struct device *dev,
list_add_tail(&region->list, list);
}
- if (zdev->end_dma < ZPCI_TABLE_SIZE_RT - 1) {
- region = iommu_alloc_resv_region(zdev->end_dma + 1,
- ZPCI_TABLE_SIZE_RT - zdev->end_dma - 1,
- 0, IOMMU_RESV_RESERVED, GFP_KERNEL);
+ spin_lock_irqsave(&zdev->dom_lock, flags);
+ if (zdev->s390_domain->type == IOMMU_DOMAIN_BLOCKED ||
+ zdev->s390_domain->type == IOMMU_DOMAIN_IDENTITY) {
+ spin_unlock_irqrestore(&zdev->dom_lock, flags);
+ return;
+ }
+
+ max_size = max_tbl_size(to_s390_domain(zdev->s390_domain));
+ spin_unlock_irqrestore(&zdev->dom_lock, flags);
+
+ if (zdev->end_dma < max_size) {
+ end_resv = max_size - zdev->end_dma;
+ region = iommu_alloc_resv_region(zdev->end_dma + 1, end_resv,
+ 0, IOMMU_RESV_RESERVED,
+ GFP_KERNEL);
if (!region)
return;
list_add_tail(&region->list, list);
@@ -456,29 +773,17 @@ static struct iommu_device *s390_iommu_probe_device(struct device *dev)
zdev = to_zpci_dev(dev);
- if (zdev->start_dma > zdev->end_dma ||
- zdev->start_dma > ZPCI_TABLE_SIZE_RT - 1)
+ if (zdev->start_dma > zdev->end_dma)
return ERR_PTR(-EINVAL);
- if (zdev->end_dma > ZPCI_TABLE_SIZE_RT - 1)
- zdev->end_dma = ZPCI_TABLE_SIZE_RT - 1;
-
if (zdev->tlb_refresh)
dev->iommu->shadow_on_flush = 1;
- return &zdev->iommu_dev;
-}
+ /* Start with DMA blocked */
+ spin_lock_init(&zdev->dom_lock);
+ zdev_s390_domain_update(zdev, &blocking_domain);
-static void s390_iommu_release_device(struct device *dev)
-{
- struct zpci_dev *zdev = to_zpci_dev(dev);
-
- /*
- * release_device is expected to detach any domain currently attached
- * to the device, but keep it attached to other devices in the group.
- */
- if (zdev)
- s390_iommu_detach_device(&zdev->s390_domain->domain, dev);
+ return &zdev->iommu_dev;
}
static int zpci_refresh_all(struct zpci_dev *zdev)
@@ -560,8 +865,7 @@ static int s390_iommu_validate_trans(struct s390_domain *s390_domain,
int rc;
for (i = 0; i < nr_pages; i++) {
- entry = dma_walk_cpu_trans(s390_domain->dma_table, dma_addr,
- gfp);
+ entry = dma_walk_cpu_trans(s390_domain, dma_addr, gfp);
if (unlikely(!entry)) {
rc = -ENOMEM;
goto undo_cpu_trans;
@@ -576,8 +880,7 @@ static int s390_iommu_validate_trans(struct s390_domain *s390_domain,
undo_cpu_trans:
while (i-- > 0) {
dma_addr -= PAGE_SIZE;
- entry = dma_walk_cpu_trans(s390_domain->dma_table,
- dma_addr, gfp);
+ entry = dma_walk_cpu_trans(s390_domain, dma_addr, gfp);
if (!entry)
break;
dma_update_cpu_trans(entry, 0, ZPCI_PTE_INVALID);
@@ -594,8 +897,7 @@ static int s390_iommu_invalidate_trans(struct s390_domain *s390_domain,
int rc = 0;
for (i = 0; i < nr_pages; i++) {
- entry = dma_walk_cpu_trans(s390_domain->dma_table, dma_addr,
- GFP_ATOMIC);
+ entry = dma_walk_cpu_trans(s390_domain, dma_addr, GFP_ATOMIC);
if (unlikely(!entry)) {
rc = -EINVAL;
break;
@@ -639,6 +941,51 @@ static int s390_iommu_map_pages(struct iommu_domain *domain,
return rc;
}
+static unsigned long *get_rso_from_iova(struct s390_domain *domain,
+ dma_addr_t iova)
+{
+ unsigned long *rfo;
+ unsigned long rfe;
+ unsigned int rfx;
+
+ switch (domain->origin_type) {
+ case ZPCI_TABLE_TYPE_RFX:
+ rfo = domain->dma_table;
+ rfx = calc_rfx(iova);
+ rfe = READ_ONCE(rfo[rfx]);
+ if (!reg_entry_isvalid(rfe))
+ return NULL;
+ return get_rf_rso(rfe);
+ case ZPCI_TABLE_TYPE_RSX:
+ return domain->dma_table;
+ default:
+ return NULL;
+ }
+}
+
+static unsigned long *get_rto_from_iova(struct s390_domain *domain,
+ dma_addr_t iova)
+{
+ unsigned long *rso;
+ unsigned long rse;
+ unsigned int rsx;
+
+ switch (domain->origin_type) {
+ case ZPCI_TABLE_TYPE_RFX:
+ case ZPCI_TABLE_TYPE_RSX:
+ rso = get_rso_from_iova(domain, iova);
+ rsx = calc_rsx(iova);
+ rse = READ_ONCE(rso[rsx]);
+ if (!reg_entry_isvalid(rse))
+ return NULL;
+ return get_rs_rto(rse);
+ case ZPCI_TABLE_TYPE_RTX:
+ return domain->dma_table;
+ default:
+ return NULL;
+ }
+}
+
static phys_addr_t s390_iommu_iova_to_phys(struct iommu_domain *domain,
dma_addr_t iova)
{
@@ -652,10 +999,13 @@ static phys_addr_t s390_iommu_iova_to_phys(struct iommu_domain *domain,
iova > domain->geometry.aperture_end)
return 0;
+ rto = get_rto_from_iova(s390_domain, iova);
+ if (!rto)
+ return 0;
+
rtx = calc_rtx(iova);
sx = calc_sx(iova);
px = calc_px(iova);
- rto = s390_domain->dma_table;
rte = READ_ONCE(rto[rtx]);
if (reg_entry_isvalid(rte)) {
@@ -697,14 +1047,20 @@ static size_t s390_iommu_unmap_pages(struct iommu_domain *domain,
struct zpci_iommu_ctrs *zpci_get_iommu_ctrs(struct zpci_dev *zdev)
{
- if (!zdev || !zdev->s390_domain)
+ struct s390_domain *s390_domain;
+
+ lockdep_assert_held(&zdev->dom_lock);
+
+ if (zdev->s390_domain->type == IOMMU_DOMAIN_BLOCKED ||
+ zdev->s390_domain->type == IOMMU_DOMAIN_IDENTITY)
return NULL;
- return &zdev->s390_domain->ctrs;
+
+ s390_domain = to_s390_domain(zdev->s390_domain);
+ return &s390_domain->ctrs;
}
int zpci_init_iommu(struct zpci_dev *zdev)
{
- u64 aperture_size;
int rc = 0;
rc = iommu_device_sysfs_add(&zdev->iommu_dev, NULL, NULL,
@@ -712,16 +1068,16 @@ int zpci_init_iommu(struct zpci_dev *zdev)
if (rc)
goto out_err;
- rc = iommu_device_register(&zdev->iommu_dev, &s390_iommu_ops, NULL);
+ if (zdev->rtr_avail) {
+ rc = iommu_device_register(&zdev->iommu_dev,
+ &s390_iommu_rtr_ops, NULL);
+ } else {
+ rc = iommu_device_register(&zdev->iommu_dev, &s390_iommu_ops,
+ NULL);
+ }
if (rc)
goto out_sysfs;
- zdev->start_dma = PAGE_ALIGN(zdev->start_dma);
- aperture_size = min3(s390_iommu_aperture,
- ZPCI_TABLE_SIZE_RT - zdev->start_dma,
- zdev->end_dma - zdev->start_dma + 1);
- zdev->end_dma = zdev->start_dma + aperture_size - 1;
-
return 0;
out_sysfs:
@@ -776,22 +1132,66 @@ static int __init s390_iommu_init(void)
}
subsys_initcall(s390_iommu_init);
-static const struct iommu_ops s390_iommu_ops = {
- .capable = s390_iommu_capable,
- .domain_alloc_paging = s390_domain_alloc_paging,
- .probe_device = s390_iommu_probe_device,
- .release_device = s390_iommu_release_device,
- .device_group = generic_device_group,
- .pgsize_bitmap = SZ_4K,
- .get_resv_regions = s390_iommu_get_resv_regions,
- .default_domain_ops = &(const struct iommu_domain_ops) {
- .attach_dev = s390_iommu_attach_device,
- .map_pages = s390_iommu_map_pages,
- .unmap_pages = s390_iommu_unmap_pages,
- .flush_iotlb_all = s390_iommu_flush_iotlb_all,
- .iotlb_sync = s390_iommu_iotlb_sync,
- .iotlb_sync_map = s390_iommu_iotlb_sync_map,
- .iova_to_phys = s390_iommu_iova_to_phys,
- .free = s390_domain_free,
+static int s390_attach_dev_identity(struct iommu_domain *domain,
+ struct device *dev,
+ struct iommu_domain *old)
+{
+ struct zpci_dev *zdev = to_zpci_dev(dev);
+ u8 status;
+ int cc;
+
+ blocking_domain_attach_device(&blocking_domain, dev, old);
+
+ /* If we fail now DMA remains blocked via blocking domain */
+ cc = s390_iommu_domain_reg_ioat(zdev, domain, &status);
+ if (reg_ioat_propagate_error(cc, status))
+ return -EIO;
+
+ zdev_s390_domain_update(zdev, domain);
+
+ return 0;
+}
+
+static const struct iommu_domain_ops s390_identity_ops = {
+ .attach_dev = s390_attach_dev_identity,
+};
+
+static struct iommu_domain s390_identity_domain = {
+ .type = IOMMU_DOMAIN_IDENTITY,
+ .ops = &s390_identity_ops,
+};
+
+static struct iommu_domain blocking_domain = {
+ .type = IOMMU_DOMAIN_BLOCKED,
+ .ops = &(const struct iommu_domain_ops) {
+ .attach_dev = blocking_domain_attach_device,
}
};
+
+#define S390_IOMMU_COMMON_OPS() \
+ .blocked_domain = &blocking_domain, \
+ .release_domain = &blocking_domain, \
+ .capable = s390_iommu_capable, \
+ .domain_alloc_paging = s390_domain_alloc_paging, \
+ .probe_device = s390_iommu_probe_device, \
+ .device_group = generic_device_group, \
+ .get_resv_regions = s390_iommu_get_resv_regions, \
+ .default_domain_ops = &(const struct iommu_domain_ops) { \
+ .attach_dev = s390_iommu_attach_device, \
+ .map_pages = s390_iommu_map_pages, \
+ .unmap_pages = s390_iommu_unmap_pages, \
+ .flush_iotlb_all = s390_iommu_flush_iotlb_all, \
+ .iotlb_sync = s390_iommu_iotlb_sync, \
+ .iotlb_sync_map = s390_iommu_iotlb_sync_map, \
+ .iova_to_phys = s390_iommu_iova_to_phys, \
+ .free = s390_domain_free, \
+ }
+
+static const struct iommu_ops s390_iommu_ops = {
+ S390_IOMMU_COMMON_OPS()
+};
+
+static const struct iommu_ops s390_iommu_rtr_ops = {
+ .identity_domain = &s390_identity_domain,
+ S390_IOMMU_COMMON_OPS()
+};
diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c
index ba53571a8239..555d4505c747 100644
--- a/drivers/iommu/sprd-iommu.c
+++ b/drivers/iommu/sprd-iommu.c
@@ -143,6 +143,8 @@ static struct iommu_domain *sprd_iommu_domain_alloc_paging(struct device *dev)
spin_lock_init(&dom->pgtlock);
+ dom->domain.pgsize_bitmap = SPRD_IOMMU_PAGE_SIZE;
+
dom->domain.geometry.aperture_start = 0;
dom->domain.geometry.aperture_end = SZ_256M - 1;
dom->domain.geometry.force_aperture = true;
@@ -232,8 +234,8 @@ static void sprd_iommu_cleanup(struct sprd_iommu_domain *dom)
pgt_size = sprd_iommu_pgt_size(&dom->domain);
dma_free_coherent(dom->sdev->dev, pgt_size, dom->pgt_va, dom->pgt_pa);
- dom->sdev = NULL;
sprd_iommu_hw_en(dom->sdev, false);
+ dom->sdev = NULL;
}
static void sprd_iommu_domain_free(struct iommu_domain *domain)
@@ -245,7 +247,8 @@ static void sprd_iommu_domain_free(struct iommu_domain *domain)
}
static int sprd_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct sprd_iommu_device *sdev = dev_iommu_priv_get(dev);
struct sprd_iommu_domain *dom = to_sprd_domain(domain);
@@ -410,7 +413,6 @@ static const struct iommu_ops sprd_iommu_ops = {
.probe_device = sprd_iommu_probe_device,
.device_group = generic_single_device_group,
.of_xlate = sprd_iommu_of_xlate,
- .pgsize_bitmap = SPRD_IOMMU_PAGE_SIZE,
.owner = THIS_MODULE,
.default_domain_ops = &(const struct iommu_domain_ops) {
.attach_dev = sprd_iommu_attach_device,
@@ -531,7 +533,7 @@ static struct platform_driver sprd_iommu_driver = {
.suppress_bind_attrs = true,
},
.probe = sprd_iommu_probe,
- .remove_new = sprd_iommu_remove,
+ .remove = sprd_iommu_remove,
};
module_platform_driver(sprd_iommu_driver);
diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c
index c519b991749d..90b26fe21817 100644
--- a/drivers/iommu/sun50i-iommu.c
+++ b/drivers/iommu/sun50i-iommu.c
@@ -452,6 +452,7 @@ static int sun50i_iommu_enable(struct sun50i_iommu *iommu)
IOMMU_TLB_PREFETCH_MASTER_ENABLE(3) |
IOMMU_TLB_PREFETCH_MASTER_ENABLE(4) |
IOMMU_TLB_PREFETCH_MASTER_ENABLE(5));
+ iommu_write(iommu, IOMMU_BYPASS_REG, 0);
iommu_write(iommu, IOMMU_INT_ENABLE_REG, IOMMU_INT_MASK);
iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(SUN50I_IOMMU_ACI_NONE),
IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 0) |
@@ -601,6 +602,14 @@ static int sun50i_iommu_map(struct iommu_domain *domain, unsigned long iova,
u32 *page_table, *pte_addr;
int ret = 0;
+ /* the IOMMU can only handle 32-bit addresses, both input and output */
+ if ((uint64_t)paddr >> 32) {
+ ret = -EINVAL;
+ dev_warn_once(iommu->dev,
+ "attempt to map address beyond 4GB\n");
+ goto out;
+ }
+
page_table = sun50i_dte_get_page_table(sun50i_domain, iova, gfp);
if (IS_ERR(page_table)) {
ret = PTR_ERR(page_table);
@@ -681,12 +690,15 @@ sun50i_iommu_domain_alloc_paging(struct device *dev)
if (!sun50i_domain)
return NULL;
- sun50i_domain->dt = iommu_alloc_pages(GFP_KERNEL, get_order(DT_SIZE));
+ sun50i_domain->dt =
+ iommu_alloc_pages_sz(GFP_KERNEL | GFP_DMA32, DT_SIZE);
if (!sun50i_domain->dt)
goto err_free_domain;
refcount_set(&sun50i_domain->refcnt, 1);
+ sun50i_domain->domain.pgsize_bitmap = SZ_4K;
+
sun50i_domain->domain.geometry.aperture_start = 0;
sun50i_domain->domain.geometry.aperture_end = DMA_BIT_MASK(32);
sun50i_domain->domain.geometry.force_aperture = true;
@@ -703,7 +715,7 @@ static void sun50i_iommu_domain_free(struct iommu_domain *domain)
{
struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
- iommu_free_pages(sun50i_domain->dt, get_order(DT_SIZE));
+ iommu_free_pages(sun50i_domain->dt);
sun50i_domain->dt = NULL;
kfree(sun50i_domain);
@@ -759,7 +771,8 @@ static void sun50i_iommu_detach_domain(struct sun50i_iommu *iommu,
}
static int sun50i_iommu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct sun50i_iommu *iommu = dev_iommu_priv_get(dev);
struct sun50i_iommu_domain *sun50i_domain;
@@ -785,7 +798,8 @@ static struct iommu_domain sun50i_iommu_identity_domain = {
};
static int sun50i_iommu_attach_device(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
struct sun50i_iommu *iommu;
@@ -801,7 +815,7 @@ static int sun50i_iommu_attach_device(struct iommu_domain *domain,
if (iommu->domain == domain)
return 0;
- sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev);
+ sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev, old);
sun50i_iommu_attach_domain(iommu, sun50i_domain);
@@ -827,12 +841,13 @@ static int sun50i_iommu_of_xlate(struct device *dev,
dev_iommu_priv_set(dev, platform_get_drvdata(iommu_pdev));
+ put_device(&iommu_pdev->dev);
+
return iommu_fwspec_add_ids(dev, &id, 1);
}
static const struct iommu_ops sun50i_iommu_ops = {
.identity_domain = &sun50i_iommu_identity_domain,
- .pgsize_bitmap = SZ_4K,
.device_group = generic_single_device_group,
.domain_alloc_paging = sun50i_iommu_domain_alloc_paging,
.of_xlate = sun50i_iommu_of_xlate,
@@ -996,7 +1011,7 @@ static int sun50i_iommu_probe(struct platform_device *pdev)
iommu->pt_pool = kmem_cache_create(dev_name(&pdev->dev),
PT_SIZE, PT_SIZE,
- SLAB_HWCACHE_ALIGN,
+ SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA32,
NULL);
if (!iommu->pt_pool)
return -ENOMEM;
@@ -1057,6 +1072,7 @@ err_free_cache:
static const struct of_device_id sun50i_iommu_dt[] = {
{ .compatible = "allwinner,sun50i-h6-iommu", },
+ { .compatible = "allwinner,sun50i-h616-iommu", },
{ /* sentinel */ },
};
MODULE_DEVICE_TABLE(of, sun50i_iommu_dt);
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
index f86c7ae91814..c391e7f2cde6 100644
--- a/drivers/iommu/tegra-smmu.c
+++ b/drivers/iommu/tegra-smmu.c
@@ -51,14 +51,17 @@ struct tegra_smmu {
struct iommu_device iommu; /* IOMMU Core code handle */
};
+struct tegra_pd;
+struct tegra_pt;
+
struct tegra_smmu_as {
struct iommu_domain domain;
struct tegra_smmu *smmu;
unsigned int use_count;
spinlock_t lock;
u32 *count;
- struct page **pts;
- struct page *pd;
+ struct tegra_pt **pts;
+ struct tegra_pd *pd;
dma_addr_t pd_dma;
unsigned id;
u32 attr;
@@ -155,6 +158,14 @@ static inline u32 smmu_readl(struct tegra_smmu *smmu, unsigned long offset)
#define SMMU_PDE_ATTR (SMMU_PDE_READABLE | SMMU_PDE_WRITABLE | \
SMMU_PDE_NONSECURE)
+struct tegra_pd {
+ u32 val[SMMU_NUM_PDE];
+};
+
+struct tegra_pt {
+ u32 val[SMMU_NUM_PTE];
+};
+
static unsigned int iova_pd_index(unsigned long iova)
{
return (iova >> SMMU_PDE_SHIFT) & (SMMU_NUM_PDE - 1);
@@ -284,7 +295,7 @@ static struct iommu_domain *tegra_smmu_domain_alloc_paging(struct device *dev)
as->attr = SMMU_PD_READABLE | SMMU_PD_WRITABLE | SMMU_PD_NONSECURE;
- as->pd = __iommu_alloc_pages(GFP_KERNEL | __GFP_DMA, 0);
+ as->pd = iommu_alloc_pages_sz(GFP_KERNEL | __GFP_DMA, SMMU_SIZE_PD);
if (!as->pd) {
kfree(as);
return NULL;
@@ -292,7 +303,7 @@ static struct iommu_domain *tegra_smmu_domain_alloc_paging(struct device *dev)
as->count = kcalloc(SMMU_NUM_PDE, sizeof(u32), GFP_KERNEL);
if (!as->count) {
- __iommu_free_pages(as->pd, 0);
+ iommu_free_pages(as->pd);
kfree(as);
return NULL;
}
@@ -300,13 +311,15 @@ static struct iommu_domain *tegra_smmu_domain_alloc_paging(struct device *dev)
as->pts = kcalloc(SMMU_NUM_PDE, sizeof(*as->pts), GFP_KERNEL);
if (!as->pts) {
kfree(as->count);
- __iommu_free_pages(as->pd, 0);
+ iommu_free_pages(as->pd);
kfree(as);
return NULL;
}
spin_lock_init(&as->lock);
+ as->domain.pgsize_bitmap = SZ_4K;
+
/* setup aperture */
as->domain.geometry.aperture_start = 0;
as->domain.geometry.aperture_end = 0xffffffff;
@@ -417,8 +430,8 @@ static int tegra_smmu_as_prepare(struct tegra_smmu *smmu,
goto unlock;
}
- as->pd_dma = dma_map_page(smmu->dev, as->pd, 0, SMMU_SIZE_PD,
- DMA_TO_DEVICE);
+ as->pd_dma =
+ dma_map_single(smmu->dev, as->pd, SMMU_SIZE_PD, DMA_TO_DEVICE);
if (dma_mapping_error(smmu->dev, as->pd_dma)) {
err = -ENOMEM;
goto unlock;
@@ -450,7 +463,7 @@ static int tegra_smmu_as_prepare(struct tegra_smmu *smmu,
return 0;
err_unmap:
- dma_unmap_page(smmu->dev, as->pd_dma, SMMU_SIZE_PD, DMA_TO_DEVICE);
+ dma_unmap_single(smmu->dev, as->pd_dma, SMMU_SIZE_PD, DMA_TO_DEVICE);
unlock:
mutex_unlock(&smmu->lock);
@@ -469,7 +482,7 @@ static void tegra_smmu_as_unprepare(struct tegra_smmu *smmu,
tegra_smmu_free_asid(smmu, as->id);
- dma_unmap_page(smmu->dev, as->pd_dma, SMMU_SIZE_PD, DMA_TO_DEVICE);
+ dma_unmap_single(smmu->dev, as->pd_dma, SMMU_SIZE_PD, DMA_TO_DEVICE);
as->smmu = NULL;
@@ -477,7 +490,7 @@ static void tegra_smmu_as_unprepare(struct tegra_smmu *smmu,
}
static int tegra_smmu_attach_dev(struct iommu_domain *domain,
- struct device *dev)
+ struct device *dev, struct iommu_domain *old)
{
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct tegra_smmu *smmu = dev_iommu_priv_get(dev);
@@ -511,9 +524,9 @@ disable:
}
static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain,
- struct device *dev)
+ struct device *dev,
+ struct iommu_domain *old)
{
- struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct tegra_smmu_as *as;
struct tegra_smmu *smmu;
@@ -522,10 +535,10 @@ static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain,
if (!fwspec)
return -ENODEV;
- if (domain == identity_domain || !domain)
+ if (old == identity_domain || !old)
return 0;
- as = to_smmu_as(domain);
+ as = to_smmu_as(old);
smmu = as->smmu;
for (index = 0; index < fwspec->num_ids; index++) {
tegra_smmu_disable(smmu, fwspec->ids[index], as->id);
@@ -548,11 +561,11 @@ static void tegra_smmu_set_pde(struct tegra_smmu_as *as, unsigned long iova,
{
unsigned int pd_index = iova_pd_index(iova);
struct tegra_smmu *smmu = as->smmu;
- u32 *pd = page_address(as->pd);
+ u32 *pd = &as->pd->val[pd_index];
unsigned long offset = pd_index * sizeof(*pd);
/* Set the page directory entry first */
- pd[pd_index] = value;
+ *pd = value;
/* The flush the page directory entry from caches */
dma_sync_single_range_for_device(smmu->dev, as->pd_dma, offset,
@@ -564,11 +577,9 @@ static void tegra_smmu_set_pde(struct tegra_smmu_as *as, unsigned long iova,
smmu_flush(smmu);
}
-static u32 *tegra_smmu_pte_offset(struct page *pt_page, unsigned long iova)
+static u32 *tegra_smmu_pte_offset(struct tegra_pt *pt, unsigned long iova)
{
- u32 *pt = page_address(pt_page);
-
- return pt + iova_pt_index(iova);
+ return &pt->val[iova_pt_index(iova)];
}
static u32 *tegra_smmu_pte_lookup(struct tegra_smmu_as *as, unsigned long iova,
@@ -576,21 +587,19 @@ static u32 *tegra_smmu_pte_lookup(struct tegra_smmu_as *as, unsigned long iova,
{
unsigned int pd_index = iova_pd_index(iova);
struct tegra_smmu *smmu = as->smmu;
- struct page *pt_page;
- u32 *pd;
+ struct tegra_pt *pt;
- pt_page = as->pts[pd_index];
- if (!pt_page)
+ pt = as->pts[pd_index];
+ if (!pt)
return NULL;
- pd = page_address(as->pd);
- *dmap = smmu_pde_to_dma(smmu, pd[pd_index]);
+ *dmap = smmu_pde_to_dma(smmu, as->pd->val[pd_index]);
- return tegra_smmu_pte_offset(pt_page, iova);
+ return tegra_smmu_pte_offset(pt, iova);
}
static u32 *as_get_pte(struct tegra_smmu_as *as, dma_addr_t iova,
- dma_addr_t *dmap, struct page *page)
+ dma_addr_t *dmap, struct tegra_pt *pt)
{
unsigned int pde = iova_pd_index(iova);
struct tegra_smmu *smmu = as->smmu;
@@ -598,30 +607,28 @@ static u32 *as_get_pte(struct tegra_smmu_as *as, dma_addr_t iova,
if (!as->pts[pde]) {
dma_addr_t dma;
- dma = dma_map_page(smmu->dev, page, 0, SMMU_SIZE_PT,
- DMA_TO_DEVICE);
+ dma = dma_map_single(smmu->dev, pt, SMMU_SIZE_PT,
+ DMA_TO_DEVICE);
if (dma_mapping_error(smmu->dev, dma)) {
- __iommu_free_pages(page, 0);
+ iommu_free_pages(pt);
return NULL;
}
if (!smmu_dma_addr_valid(smmu, dma)) {
- dma_unmap_page(smmu->dev, dma, SMMU_SIZE_PT,
- DMA_TO_DEVICE);
- __iommu_free_pages(page, 0);
+ dma_unmap_single(smmu->dev, dma, SMMU_SIZE_PT,
+ DMA_TO_DEVICE);
+ iommu_free_pages(pt);
return NULL;
}
- as->pts[pde] = page;
+ as->pts[pde] = pt;
tegra_smmu_set_pde(as, iova, SMMU_MK_PDE(dma, SMMU_PDE_ATTR |
SMMU_PDE_NEXT));
*dmap = dma;
} else {
- u32 *pd = page_address(as->pd);
-
- *dmap = smmu_pde_to_dma(smmu, pd[pde]);
+ *dmap = smmu_pde_to_dma(smmu, as->pd->val[pde]);
}
return tegra_smmu_pte_offset(as->pts[pde], iova);
@@ -637,7 +644,7 @@ static void tegra_smmu_pte_get_use(struct tegra_smmu_as *as, unsigned long iova)
static void tegra_smmu_pte_put_use(struct tegra_smmu_as *as, unsigned long iova)
{
unsigned int pde = iova_pd_index(iova);
- struct page *page = as->pts[pde];
+ struct tegra_pt *pt = as->pts[pde];
/*
* When no entries in this page table are used anymore, return the
@@ -645,13 +652,13 @@ static void tegra_smmu_pte_put_use(struct tegra_smmu_as *as, unsigned long iova)
*/
if (--as->count[pde] == 0) {
struct tegra_smmu *smmu = as->smmu;
- u32 *pd = page_address(as->pd);
- dma_addr_t pte_dma = smmu_pde_to_dma(smmu, pd[pde]);
+ dma_addr_t pte_dma = smmu_pde_to_dma(smmu, as->pd->val[pde]);
tegra_smmu_set_pde(as, iova, 0);
- dma_unmap_page(smmu->dev, pte_dma, SMMU_SIZE_PT, DMA_TO_DEVICE);
- __iommu_free_pages(page, 0);
+ dma_unmap_single(smmu->dev, pte_dma, SMMU_SIZE_PT,
+ DMA_TO_DEVICE);
+ iommu_free_pages(pt);
as->pts[pde] = NULL;
}
}
@@ -671,16 +678,16 @@ static void tegra_smmu_set_pte(struct tegra_smmu_as *as, unsigned long iova,
smmu_flush(smmu);
}
-static struct page *as_get_pde_page(struct tegra_smmu_as *as,
- unsigned long iova, gfp_t gfp,
- unsigned long *flags)
+static struct tegra_pt *as_get_pde_page(struct tegra_smmu_as *as,
+ unsigned long iova, gfp_t gfp,
+ unsigned long *flags)
{
unsigned int pde = iova_pd_index(iova);
- struct page *page = as->pts[pde];
+ struct tegra_pt *pt = as->pts[pde];
/* at first check whether allocation needs to be done at all */
- if (page)
- return page;
+ if (pt)
+ return pt;
/*
* In order to prevent exhaustion of the atomic memory pool, we
@@ -690,7 +697,7 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as,
if (gfpflags_allow_blocking(gfp))
spin_unlock_irqrestore(&as->lock, *flags);
- page = __iommu_alloc_pages(gfp | __GFP_DMA, 0);
+ pt = iommu_alloc_pages_sz(gfp | __GFP_DMA, SMMU_SIZE_PT);
if (gfpflags_allow_blocking(gfp))
spin_lock_irqsave(&as->lock, *flags);
@@ -701,13 +708,13 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as,
* if allocation succeeded and the allocation failure isn't fatal.
*/
if (as->pts[pde]) {
- if (page)
- __iommu_free_pages(page, 0);
+ if (pt)
+ iommu_free_pages(pt);
- page = as->pts[pde];
+ pt = as->pts[pde];
}
- return page;
+ return pt;
}
static int
@@ -717,15 +724,15 @@ __tegra_smmu_map(struct iommu_domain *domain, unsigned long iova,
{
struct tegra_smmu_as *as = to_smmu_as(domain);
dma_addr_t pte_dma;
- struct page *page;
+ struct tegra_pt *pt;
u32 pte_attrs;
u32 *pte;
- page = as_get_pde_page(as, iova, gfp, flags);
- if (!page)
+ pt = as_get_pde_page(as, iova, gfp, flags);
+ if (!pt)
return -ENOMEM;
- pte = as_get_pte(as, iova, &pte_dma, page);
+ pte = as_get_pte(as, iova, &pte_dma, pt);
if (!pte)
return -ENOMEM;
@@ -823,10 +830,9 @@ static struct tegra_smmu *tegra_smmu_find(struct device_node *np)
return NULL;
mc = platform_get_drvdata(pdev);
- if (!mc) {
- put_device(&pdev->dev);
+ put_device(&pdev->dev);
+ if (!mc)
return NULL;
- }
return mc->smmu;
}
@@ -837,7 +843,7 @@ static int tegra_smmu_configure(struct tegra_smmu *smmu, struct device *dev,
const struct iommu_ops *ops = smmu->iommu.ops;
int err;
- err = iommu_fwspec_init(dev, &dev->of_node->fwnode, ops);
+ err = iommu_fwspec_init(dev, dev_fwnode(smmu->dev));
if (err < 0) {
dev_err(dev, "failed to initialize fwspec: %d\n", err);
return err;
@@ -846,7 +852,6 @@ static int tegra_smmu_configure(struct tegra_smmu *smmu, struct device *dev,
err = ops->of_xlate(dev, args);
if (err < 0) {
dev_err(dev, "failed to parse SW group ID: %d\n", err);
- iommu_fwspec_free(dev);
return err;
}
@@ -998,7 +1003,6 @@ static const struct iommu_ops tegra_smmu_ops = {
.probe_device = tegra_smmu_probe_device,
.device_group = tegra_smmu_device_group,
.of_xlate = tegra_smmu_of_xlate,
- .pgsize_bitmap = SZ_4K,
.default_domain_ops = &(const struct iommu_domain_ops) {
.attach_dev = tegra_smmu_attach_dev,
.map_pages = tegra_smmu_map,
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index 36d680826b57..d314fa5cd847 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -48,6 +48,7 @@ struct viommu_dev {
u64 pgsize_bitmap;
u32 first_domain;
u32 last_domain;
+ u32 identity_domain_id;
/* Supported MAP flags */
u32 map_flags;
u32 probe_size;
@@ -62,7 +63,6 @@ struct viommu_mapping {
struct viommu_domain {
struct iommu_domain domain;
struct viommu_dev *viommu;
- struct mutex mutex; /* protects viommu pointer */
unsigned int id;
u32 map_flags;
@@ -70,7 +70,6 @@ struct viommu_domain {
struct rb_root_cached mappings;
unsigned long nr_endpoints;
- bool bypass;
};
struct viommu_endpoint {
@@ -97,6 +96,8 @@ struct viommu_event {
};
};
+static struct viommu_domain viommu_identity_domain;
+
#define to_viommu_domain(domain) \
container_of(domain, struct viommu_domain, domain)
@@ -305,6 +306,22 @@ out_unlock:
return ret;
}
+static int viommu_send_attach_req(struct viommu_dev *viommu, struct device *dev,
+ struct virtio_iommu_req_attach *req)
+{
+ int ret;
+ unsigned int i;
+ struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+ for (i = 0; i < fwspec->num_ids; i++) {
+ req->endpoint = cpu_to_le32(fwspec->ids[i]);
+ ret = viommu_send_req_sync(viommu, req, sizeof(*req));
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
/*
* viommu_add_mapping - add a mapping to the internal tree
*
@@ -637,71 +654,45 @@ static void viommu_event_handler(struct virtqueue *vq)
/* IOMMU API */
-static struct iommu_domain *viommu_domain_alloc(unsigned type)
+static struct iommu_domain *viommu_domain_alloc_paging(struct device *dev)
{
+ struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
+ struct viommu_dev *viommu = vdev->viommu;
+ unsigned long viommu_page_size;
struct viommu_domain *vdomain;
-
- if (type != IOMMU_DOMAIN_UNMANAGED &&
- type != IOMMU_DOMAIN_DMA &&
- type != IOMMU_DOMAIN_IDENTITY)
- return NULL;
-
- vdomain = kzalloc(sizeof(*vdomain), GFP_KERNEL);
- if (!vdomain)
- return NULL;
-
- mutex_init(&vdomain->mutex);
- spin_lock_init(&vdomain->mappings_lock);
- vdomain->mappings = RB_ROOT_CACHED;
-
- return &vdomain->domain;
-}
-
-static int viommu_domain_finalise(struct viommu_endpoint *vdev,
- struct iommu_domain *domain)
-{
int ret;
- unsigned long viommu_page_size;
- struct viommu_dev *viommu = vdev->viommu;
- struct viommu_domain *vdomain = to_viommu_domain(domain);
viommu_page_size = 1UL << __ffs(viommu->pgsize_bitmap);
if (viommu_page_size > PAGE_SIZE) {
dev_err(vdev->dev,
"granule 0x%lx larger than system page size 0x%lx\n",
viommu_page_size, PAGE_SIZE);
- return -ENODEV;
+ return ERR_PTR(-ENODEV);
}
- ret = ida_alloc_range(&viommu->domain_ids, viommu->first_domain,
- viommu->last_domain, GFP_KERNEL);
- if (ret < 0)
- return ret;
+ vdomain = kzalloc(sizeof(*vdomain), GFP_KERNEL);
+ if (!vdomain)
+ return ERR_PTR(-ENOMEM);
- vdomain->id = (unsigned int)ret;
+ spin_lock_init(&vdomain->mappings_lock);
+ vdomain->mappings = RB_ROOT_CACHED;
- domain->pgsize_bitmap = viommu->pgsize_bitmap;
- domain->geometry = viommu->geometry;
+ ret = ida_alloc_range(&viommu->domain_ids, viommu->first_domain,
+ viommu->last_domain, GFP_KERNEL);
+ if (ret < 0) {
+ kfree(vdomain);
+ return ERR_PTR(ret);
+ }
- vdomain->map_flags = viommu->map_flags;
- vdomain->viommu = viommu;
+ vdomain->id = (unsigned int)ret;
- if (domain->type == IOMMU_DOMAIN_IDENTITY) {
- if (virtio_has_feature(viommu->vdev,
- VIRTIO_IOMMU_F_BYPASS_CONFIG)) {
- vdomain->bypass = true;
- return 0;
- }
+ vdomain->domain.pgsize_bitmap = viommu->pgsize_bitmap;
+ vdomain->domain.geometry = viommu->geometry;
- ret = viommu_domain_map_identity(vdev, vdomain);
- if (ret) {
- ida_free(&viommu->domain_ids, vdomain->id);
- vdomain->viommu = NULL;
- return ret;
- }
- }
+ vdomain->map_flags = viommu->map_flags;
+ vdomain->viommu = viommu;
- return 0;
+ return &vdomain->domain;
}
static void viommu_domain_free(struct iommu_domain *domain)
@@ -717,29 +708,38 @@ static void viommu_domain_free(struct iommu_domain *domain)
kfree(vdomain);
}
-static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static struct iommu_domain *viommu_domain_alloc_identity(struct device *dev)
+{
+ struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
+ struct iommu_domain *domain;
+ int ret;
+
+ if (virtio_has_feature(vdev->viommu->vdev,
+ VIRTIO_IOMMU_F_BYPASS_CONFIG))
+ return &viommu_identity_domain.domain;
+
+ domain = viommu_domain_alloc_paging(dev);
+ if (IS_ERR(domain))
+ return domain;
+
+ ret = viommu_domain_map_identity(vdev, to_viommu_domain(domain));
+ if (ret) {
+ viommu_domain_free(domain);
+ return ERR_PTR(ret);
+ }
+ return domain;
+}
+
+static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev,
+ struct iommu_domain *old)
{
- int i;
int ret = 0;
struct virtio_iommu_req_attach req;
- struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
struct viommu_domain *vdomain = to_viommu_domain(domain);
- mutex_lock(&vdomain->mutex);
- if (!vdomain->viommu) {
- /*
- * Properly initialize the domain now that we know which viommu
- * owns it.
- */
- ret = viommu_domain_finalise(vdev, domain);
- } else if (vdomain->viommu != vdev->viommu) {
- ret = -EINVAL;
- }
- mutex_unlock(&vdomain->mutex);
-
- if (ret)
- return ret;
+ if (vdomain->viommu != vdev->viommu)
+ return -EINVAL;
/*
* In the virtio-iommu device, when attaching the endpoint to a new
@@ -761,16 +761,9 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
.domain = cpu_to_le32(vdomain->id),
};
- if (vdomain->bypass)
- req.flags |= cpu_to_le32(VIRTIO_IOMMU_ATTACH_F_BYPASS);
-
- for (i = 0; i < fwspec->num_ids; i++) {
- req.endpoint = cpu_to_le32(fwspec->ids[i]);
-
- ret = viommu_send_req_sync(vdomain->viommu, &req, sizeof(req));
- if (ret)
- return ret;
- }
+ ret = viommu_send_attach_req(vdomain->viommu, dev, &req);
+ if (ret)
+ return ret;
if (!vdomain->nr_endpoints) {
/*
@@ -788,6 +781,41 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
return 0;
}
+static int viommu_attach_identity_domain(struct iommu_domain *domain,
+ struct device *dev,
+ struct iommu_domain *old)
+{
+ int ret = 0;
+ struct virtio_iommu_req_attach req;
+ struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
+ struct viommu_domain *vdomain = to_viommu_domain(domain);
+
+ req = (struct virtio_iommu_req_attach) {
+ .head.type = VIRTIO_IOMMU_T_ATTACH,
+ .domain = cpu_to_le32(vdev->viommu->identity_domain_id),
+ .flags = cpu_to_le32(VIRTIO_IOMMU_ATTACH_F_BYPASS),
+ };
+
+ ret = viommu_send_attach_req(vdev->viommu, dev, &req);
+ if (ret)
+ return ret;
+
+ if (vdev->vdomain)
+ vdev->vdomain->nr_endpoints--;
+ vdomain->nr_endpoints++;
+ vdev->vdomain = vdomain;
+ return 0;
+}
+
+static struct viommu_domain viommu_identity_domain = {
+ .domain = {
+ .type = IOMMU_DOMAIN_IDENTITY,
+ .ops = &(const struct iommu_domain_ops) {
+ .attach_dev = viommu_attach_identity_domain,
+ },
+ },
+};
+
static void viommu_detach_dev(struct viommu_endpoint *vdev)
{
int i;
@@ -972,8 +1000,7 @@ static void viommu_get_resv_regions(struct device *dev, struct list_head *head)
iommu_dma_get_resv_regions(dev, head);
}
-static struct iommu_ops viommu_ops;
-static struct virtio_driver virtio_iommu_drv;
+static const struct bus_type *virtio_bus_type;
static int viommu_match_node(struct device *dev, const void *data)
{
@@ -982,8 +1009,9 @@ static int viommu_match_node(struct device *dev, const void *data)
static struct viommu_dev *viommu_get_by_fwnode(struct fwnode_handle *fwnode)
{
- struct device *dev = driver_find_device(&virtio_iommu_drv.driver, NULL,
- fwnode, viommu_match_node);
+ struct device *dev = bus_find_device(virtio_bus_type, NULL, fwnode,
+ viommu_match_node);
+
put_device(dev);
return dev ? dev_to_virtio(dev)->priv : NULL;
@@ -1060,9 +1088,10 @@ static bool viommu_capable(struct device *dev, enum iommu_cap cap)
}
}
-static struct iommu_ops viommu_ops = {
+static const struct iommu_ops viommu_ops = {
.capable = viommu_capable,
- .domain_alloc = viommu_domain_alloc,
+ .domain_alloc_identity = viommu_domain_alloc_identity,
+ .domain_alloc_paging = viommu_domain_alloc_paging,
.probe_device = viommu_probe_device,
.release_device = viommu_release_device,
.device_group = viommu_device_group,
@@ -1084,14 +1113,13 @@ static struct iommu_ops viommu_ops = {
static int viommu_init_vqs(struct viommu_dev *viommu)
{
struct virtio_device *vdev = dev_to_virtio(viommu->dev);
- const char *names[] = { "request", "event" };
- vq_callback_t *callbacks[] = {
- NULL, /* No async requests */
- viommu_event_handler,
+ struct virtqueue_info vqs_info[] = {
+ { "request" },
+ { "event", viommu_event_handler },
};
- return virtio_find_vqs(vdev, VIOMMU_NR_VQS, viommu->vqs, callbacks,
- names, NULL);
+ return virtio_find_vqs(vdev, VIOMMU_NR_VQS, viommu->vqs,
+ vqs_info, NULL);
}
static int viommu_fill_evtq(struct viommu_dev *viommu)
@@ -1134,6 +1162,9 @@ static int viommu_probe(struct virtio_device *vdev)
if (!viommu)
return -ENOMEM;
+ /* Borrow this for easy lookups later */
+ virtio_bus_type = dev->bus;
+
spin_lock_init(&viommu->request_lock);
ida_init(&viommu->domain_ids);
viommu->dev = dev;
@@ -1185,7 +1216,11 @@ static int viommu_probe(struct virtio_device *vdev)
if (virtio_has_feature(vdev, VIRTIO_IOMMU_F_MMIO))
viommu->map_flags |= VIRTIO_IOMMU_MAP_F_MMIO;
- viommu_ops.pgsize_bitmap = viommu->pgsize_bitmap;
+ /* Reserve an ID to use as the bypass domain */
+ if (virtio_has_feature(viommu->vdev, VIRTIO_IOMMU_F_BYPASS_CONFIG)) {
+ viommu->identity_domain_id = viommu->first_domain;
+ viommu->first_domain++;
+ }
virtio_device_ready(vdev);
@@ -1199,10 +1234,10 @@ static int viommu_probe(struct virtio_device *vdev)
if (ret)
goto err_free_vqs;
- iommu_device_register(&viommu->iommu, &viommu_ops, parent_dev);
-
vdev->priv = viommu;
+ iommu_device_register(&viommu->iommu, &viommu_ops, parent_dev);
+
dev_info(dev, "input address: %u bits\n",
order_base_2(viommu->geometry.aperture_end));
dev_info(dev, "page mask: %#llx\n", viommu->pgsize_bitmap);