Diffstat (limited to 'drivers/iommu')
127 files changed, 26398 insertions, 9761 deletions
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index c04584be3089..99095645134f 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -40,12 +40,13 @@ config IOMMU_IO_PGTABLE_LPAE sizes at both stage-1 and stage-2, as well as address spaces up to 48-bits in size. -config IOMMU_IO_PGTABLE_LPAE_SELFTEST - bool "LPAE selftests" - depends on IOMMU_IO_PGTABLE_LPAE +config IOMMU_IO_PGTABLE_LPAE_KUNIT_TEST + tristate "KUnit tests for LPAE" + depends on IOMMU_IO_PGTABLE_LPAE && KUNIT + default KUNIT_ALL_TESTS help - Enable self-tests for LPAE page table allocator. This performs - a series of page-table consistency checks during boot. + Enable kunit tests for LPAE page table allocator. This performs + a series of page-table consistency checks. If unsure, say N here. @@ -151,10 +152,9 @@ config OF_IOMMU # IOMMU-agnostic DMA-mapping layer config IOMMU_DMA def_bool ARM64 || X86 || S390 - select DMA_OPS + select DMA_OPS_HELPERS select IOMMU_API select IOMMU_IOVA - select IRQ_MSI_IOMMU select NEED_SG_DMA_LENGTH select NEED_SG_DMA_FLAGS if SWIOTLB @@ -193,13 +193,15 @@ config MSM_IOMMU If unsure, say N here. source "drivers/iommu/amd/Kconfig" +source "drivers/iommu/arm/Kconfig" source "drivers/iommu/intel/Kconfig" source "drivers/iommu/iommufd/Kconfig" +source "drivers/iommu/riscv/Kconfig" config IRQ_REMAP bool "Support for Interrupt Remapping" depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI - select DMAR_TABLE if INTEL_IOMMU + select IRQ_MSI_LIB help Supports Interrupt remapping for IO-APIC and MSI devices. To use x2apic mode in the CPU's which support x2APIC enhancements or @@ -246,7 +248,7 @@ config SUN50I_IOMMU config TEGRA_IOMMU_SMMU bool "NVIDIA Tegra SMMU Support" - depends on ARCH_TEGRA + depends on ARCH_TEGRA || COMPILE_TEST depends on TEGRA_AHB depends on TEGRA_MC select IOMMU_API @@ -305,7 +307,6 @@ config APPLE_DART depends on !GENERIC_ATOMIC64 # for IOMMU_IO_PGTABLE_DART select IOMMU_API select IOMMU_IO_PGTABLE_DART - default ARCH_APPLE help Support for Apple DART (Device Address Resolution Table) IOMMUs found in Apple ARM SoCs like the M1. @@ -314,117 +315,6 @@ config APPLE_DART Say Y here if you are using an Apple SoC. -# ARM IOMMU support -config ARM_SMMU - tristate "ARM Ltd. System MMU (SMMU) Support" - depends on ARM64 || ARM || COMPILE_TEST - depends on !GENERIC_ATOMIC64 # for IOMMU_IO_PGTABLE_LPAE - select IOMMU_API - select IOMMU_IO_PGTABLE_LPAE - select ARM_DMA_USE_IOMMU if ARM - help - Support for implementations of the ARM System MMU architecture - versions 1 and 2. - - Say Y here if your SoC includes an IOMMU device implementing - the ARM SMMU architecture. - -config ARM_SMMU_LEGACY_DT_BINDINGS - bool "Support the legacy \"mmu-masters\" devicetree bindings" - depends on ARM_SMMU=y && OF - help - Support for the badly designed and deprecated "mmu-masters" - devicetree bindings. This allows some DMA masters to attach - to the SMMU but does not provide any support via the DMA API. - If you're lucky, you might be able to get VFIO up and running. - - If you say Y here then you'll make me very sad. Instead, say N - and move your firmware to the utopian future that was 2016. - -config ARM_SMMU_DISABLE_BYPASS_BY_DEFAULT - bool "Default to disabling bypass on ARM SMMU v1 and v2" - depends on ARM_SMMU - default y - help - Say Y here to (by default) disable bypass streams such that - incoming transactions from devices that are not attached to - an iommu domain will report an abort back to the device and - will not be allowed to pass through the SMMU. 
- - Any old kernels that existed before this KConfig was - introduced would default to _allowing_ bypass (AKA the - equivalent of NO for this config). However the default for - this option is YES because the old behavior is insecure. - - There are few reasons to allow unmatched stream bypass, and - even fewer good ones. If saying YES here breaks your board - you should work on fixing your board. This KConfig option - is expected to be removed in the future and we'll simply - hardcode the bypass disable in the code. - - NOTE: the kernel command line parameter - 'arm-smmu.disable_bypass' will continue to override this - config. - -config ARM_SMMU_QCOM - def_tristate y - depends on ARM_SMMU && ARCH_QCOM - select QCOM_SCM - help - When running on a Qualcomm platform that has the custom variant - of the ARM SMMU, this needs to be built into the SMMU driver. - -config ARM_SMMU_QCOM_DEBUG - bool "ARM SMMU QCOM implementation defined debug support" - depends on ARM_SMMU_QCOM=y - help - Support for implementation specific debug features in ARM SMMU - hardware found in QTI platforms. This include support for - the Translation Buffer Units (TBU) that can be used to obtain - additional information when debugging memory management issues - like context faults. - - Say Y here to enable debug for issues such as context faults - or TLB sync timeouts which requires implementation defined - register dumps. - -config ARM_SMMU_V3 - tristate "ARM Ltd. System MMU Version 3 (SMMUv3) Support" - depends on ARM64 - select IOMMU_API - select IOMMU_IO_PGTABLE_LPAE - select GENERIC_MSI_IRQ - help - Support for implementations of the ARM System MMU architecture - version 3 providing translation support to a PCIe root complex. - - Say Y here if your system includes an IOMMU device implementing - the ARM SMMUv3 architecture. - -if ARM_SMMU_V3 -config ARM_SMMU_V3_SVA - bool "Shared Virtual Addressing support for the ARM SMMUv3" - select IOMMU_SVA - select IOMMU_IOPF - select MMU_NOTIFIER - help - Support for sharing process address spaces with devices using the - SMMUv3. - - Say Y here if your system supports SVA extensions such as PCIe PASID - and PRI. - -config ARM_SMMU_V3_KUNIT_TEST - tristate "KUnit tests for arm-smmu-v3 driver" if !KUNIT_ALL_TESTS - depends on KUNIT - depends on ARM_SMMU_V3_SVA - default KUNIT_ALL_TESTS - help - Enable this option to unit-test arm-smmu-v3 driver functions. - - If unsure, say N. -endif - config S390_IOMMU def_bool y if S390 && PCI depends on S390 && PCI @@ -449,8 +339,7 @@ config MTK_IOMMU config MTK_IOMMU_V1 tristate "MediaTek IOMMU Version 1 (M4U gen1) Support" - depends on ARM - depends on ARCH_MEDIATEK || COMPILE_TEST + depends on (ARCH_MEDIATEK && ARM) || COMPILE_TEST select ARM_DMA_USE_IOMMU select IOMMU_API select MEMORY @@ -462,18 +351,6 @@ config MTK_IOMMU_V1 if unsure, say N here. -config QCOM_IOMMU - # Note: iommu drivers cannot (yet?) be built as modules - bool "Qualcomm IOMMU Support" - depends on ARCH_QCOM || COMPILE_TEST - depends on !GENERIC_ATOMIC64 # for IOMMU_IO_PGTABLE_LPAE - select QCOM_SCM - select IOMMU_API - select IOMMU_IO_PGTABLE_LPAE - select ARM_DMA_USE_IOMMU - help - Support for IOMMU on certain Qualcomm SoCs. - config HYPERV_IOMMU bool "Hyper-V IRQ Handling" depends on HYPERV && X86 @@ -508,3 +385,5 @@ config SPRD_IOMMU Say Y here if you want to use the multimedia devices listed above. 
endif # IOMMU_SUPPORT + +source "drivers/iommu/generic_pt/Kconfig" diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 542760d963ec..8e8843316c4b 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -1,6 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += amd/ intel/ arm/ iommufd/ +obj-y += arm/ iommufd/ +obj-$(CONFIG_AMD_IOMMU) += amd/ +obj-$(CONFIG_INTEL_IOMMU) += intel/ +obj-$(CONFIG_RISCV_IOMMU) += riscv/ +obj-$(CONFIG_GENERIC_PT) += generic_pt/fmt/ obj-$(CONFIG_IOMMU_API) += iommu.o +obj-$(CONFIG_IOMMU_SUPPORT) += iommu-pages.o obj-$(CONFIG_IOMMU_API) += iommu-traces.o obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o obj-$(CONFIG_IOMMU_DEBUGFS) += iommu-debugfs.o @@ -8,6 +13,7 @@ obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o +obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE_KUNIT_TEST) += io-pgtable-arm-selftests.o obj-$(CONFIG_IOMMU_IO_PGTABLE_DART) += io-pgtable-dart.o obj-$(CONFIG_IOMMU_IOVA) += iova.o obj-$(CONFIG_OF_IOMMU) += of_iommu.o diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig index 994063e5586f..f2acf471cb5d 100644 --- a/drivers/iommu/amd/Kconfig +++ b/drivers/iommu/amd/Kconfig @@ -7,13 +7,17 @@ config AMD_IOMMU select PCI_ATS select PCI_PRI select PCI_PASID + select IRQ_MSI_LIB select MMU_NOTIFIER select IOMMU_API select IOMMU_IOVA - select IOMMU_IO_PGTABLE select IOMMU_SVA select IOMMU_IOPF select IOMMUFD_DRIVER if IOMMUFD + select GENERIC_PT + select IOMMU_PT + select IOMMU_PT_AMDV1 + select IOMMU_PT_X86_64 depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE help With this option you can enable support for AMD IOMMU hardware in diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile index 9de33b2d42f5..5412a563c697 100644 --- a/drivers/iommu/amd/Makefile +++ b/drivers/iommu/amd/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_AMD_IOMMU) += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o ppr.o pasid.o +obj-y += iommu.o init.o quirks.o ppr.o pasid.o obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 2d5945c982bd..25044d28f28a 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -16,7 +16,6 @@ irqreturn_t amd_iommu_int_thread_evtlog(int irq, void *data); irqreturn_t amd_iommu_int_thread_pprlog(int irq, void *data); irqreturn_t amd_iommu_int_thread_galog(int irq, void *data); irqreturn_t amd_iommu_int_handler(int irq, void *data); -void amd_iommu_apply_erratum_63(struct amd_iommu *iommu, u16 devid); void amd_iommu_restart_log(struct amd_iommu *iommu, const char *evt_type, u8 cntrl_intr, u8 cntrl_log, u32 status_run_mask, u32 status_overflow_mask); @@ -29,9 +28,9 @@ void *__init iommu_alloc_4k_pages(struct amd_iommu *iommu, gfp_t gfp, size_t size); #ifdef CONFIG_AMD_IOMMU_DEBUGFS -void amd_iommu_debugfs_setup(struct amd_iommu *iommu); +void amd_iommu_debugfs_setup(void); #else -static inline void amd_iommu_debugfs_setup(struct amd_iommu *iommu) {} +static inline void amd_iommu_debugfs_setup(void) {} #endif /* Needed for interrupt remapping */ @@ -41,17 +40,21 @@ void amd_iommu_disable(void); int amd_iommu_reenable(int mode); int amd_iommu_enable_faulting(unsigned int cpu); extern int amd_iommu_guest_ir; -extern enum io_pgtable_fmt amd_iommu_pgtable; +extern enum protection_domain_mode amd_iommu_pgtable; extern int amd_iommu_gpt_level; 
+extern u8 amd_iommu_hpt_level; +extern unsigned long amd_iommu_pgsize_bitmap; +extern bool amd_iommu_hatdis; /* Protection domain ops */ -struct protection_domain *protection_domain_alloc(unsigned int type); -void protection_domain_free(struct protection_domain *domain); +void amd_iommu_init_identity_domain(void); +struct protection_domain *protection_domain_alloc(void); struct iommu_domain *amd_iommu_domain_alloc_sva(struct device *dev, struct mm_struct *mm); void amd_iommu_domain_free(struct iommu_domain *dom); int iommu_sva_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid); + struct device *dev, ioasid_t pasid, + struct iommu_domain *old); void amd_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, struct iommu_domain *domain); @@ -85,16 +88,10 @@ int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag); * the IOMMU used by this driver. */ void amd_iommu_flush_all_caches(struct amd_iommu *iommu); -void amd_iommu_update_and_flush_device_table(struct protection_domain *domain); -void amd_iommu_domain_update(struct protection_domain *domain); -void amd_iommu_dev_update_dte(struct iommu_dev_data *dev_data, bool set); -void amd_iommu_domain_flush_complete(struct protection_domain *domain); void amd_iommu_domain_flush_pages(struct protection_domain *domain, u64 address, size_t size); void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data, ioasid_t pasid, u64 address, size_t size); -void amd_iommu_dev_flush_pasid_all(struct iommu_dev_data *dev_data, - ioasid_t pasid); #ifdef CONFIG_IRQ_REMAP int amd_iommu_create_irq_domain(struct amd_iommu *iommu); @@ -121,14 +118,14 @@ static inline bool check_feature2(u64 mask) return (amd_iommu_efr2 & mask); } -static inline int check_feature_gpt_level(void) +static inline bool amd_iommu_v2_pgtbl_supported(void) { - return ((amd_iommu_efr >> FEATURE_GATS_SHIFT) & FEATURE_GATS_MASK); + return (check_feature(FEATURE_GIOSUP) && check_feature(FEATURE_GT)); } static inline bool amd_iommu_gt_ppr_supported(void) { - return (check_feature(FEATURE_GT) && + return (amd_iommu_v2_pgtbl_supported() && check_feature(FEATURE_PPR) && check_feature(FEATURE_EPHSUP)); } @@ -143,19 +140,6 @@ static inline void *iommu_phys_to_virt(unsigned long paddr) return phys_to_virt(__sme_clr(paddr)); } -static inline -void amd_iommu_domain_set_pt_root(struct protection_domain *domain, u64 root) -{ - domain->iop.root = (u64 *)(root & PAGE_MASK); - domain->iop.mode = root & 7; /* lowest 3 bits encode pgtable mode */ -} - -static inline -void amd_iommu_domain_clr_pt_root(struct protection_domain *domain) -{ - amd_iommu_domain_set_pt_root(domain, 0); -} - static inline int get_pci_sbdf_id(struct pci_dev *pdev) { int seg = pci_domain_nr(pdev->bus); @@ -164,6 +148,8 @@ static inline int get_pci_sbdf_id(struct pci_dev *pdev) return PCI_SEG_DEVID_TO_SBDF(seg, devid); } +bool amd_iommu_ht_range_ignore(void); + /* * This must be called after device probe completes. During probe * use rlookup_amd_iommu() get the iommu. 
@@ -185,7 +171,6 @@ static inline struct protection_domain *to_pdomain(struct iommu_domain *dom) } bool translation_pre_enabled(struct amd_iommu *iommu); -bool amd_iommu_is_attach_deferred(struct device *dev); int __init add_special_device(u8 type, u8 id, u32 *devid, bool cmd_line); #ifdef CONFIG_DMI @@ -193,9 +178,11 @@ void amd_iommu_apply_ivrs_quirks(void); #else static inline void amd_iommu_apply_ivrs_quirks(void) { } #endif +struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 segid, u16 devid); void amd_iommu_domain_set_pgtable(struct protection_domain *domain, u64 *root, int mode); struct dev_table_entry *get_dev_table(struct amd_iommu *iommu); +struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid); -#endif +#endif /* AMD_IOMMU_H */ diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 2b76b5dedc1d..320733e7d8b4 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -8,6 +8,7 @@ #ifndef _ASM_X86_AMD_IOMMU_TYPES_H #define _ASM_X86_AMD_IOMMU_TYPES_H +#include <linux/bitfield.h> #include <linux/iommu.h> #include <linux/types.h> #include <linux/mmu_notifier.h> @@ -17,7 +18,7 @@ #include <linux/spinlock.h> #include <linux/pci.h> #include <linux/irqreturn.h> -#include <linux/io-pgtable.h> +#include <linux/generic_pt/iommu.h> /* * Maximum number of IOMMUs supported @@ -28,8 +29,6 @@ * some size calculation constants */ #define DEV_TABLE_ENTRY_SIZE 32 -#define ALIAS_TABLE_ENTRY_SIZE 2 -#define RLOOKUP_TABLE_ENTRY_SIZE (sizeof(void *)) /* Capability offsets used by the driver */ #define MMIO_CAP_HDR_OFFSET 0x00 @@ -95,26 +94,28 @@ #define FEATURE_GA BIT_ULL(7) #define FEATURE_HE BIT_ULL(8) #define FEATURE_PC BIT_ULL(9) -#define FEATURE_GATS_SHIFT (12) -#define FEATURE_GATS_MASK (3ULL) +#define FEATURE_HATS GENMASK_ULL(11, 10) +#define FEATURE_GATS GENMASK_ULL(13, 12) +#define FEATURE_GLX GENMASK_ULL(15, 14) #define FEATURE_GAM_VAPIC BIT_ULL(21) +#define FEATURE_PASMAX GENMASK_ULL(36, 32) #define FEATURE_GIOSUP BIT_ULL(48) #define FEATURE_HASUP BIT_ULL(49) #define FEATURE_EPHSUP BIT_ULL(50) #define FEATURE_HDSUP BIT_ULL(52) #define FEATURE_SNP BIT_ULL(63) -#define FEATURE_PASID_SHIFT 32 -#define FEATURE_PASID_MASK (0x1fULL << FEATURE_PASID_SHIFT) - -#define FEATURE_GLXVAL_SHIFT 14 -#define FEATURE_GLXVAL_MASK (0x03ULL << FEATURE_GLXVAL_SHIFT) /* Extended Feature 2 Bits */ -#define FEATURE_SNPAVICSUP_SHIFT 5 -#define FEATURE_SNPAVICSUP_MASK (0x07ULL << FEATURE_SNPAVICSUP_SHIFT) +#define FEATURE_SEVSNPIO_SUP BIT_ULL(1) +#define FEATURE_SNPAVICSUP GENMASK_ULL(7, 5) #define FEATURE_SNPAVICSUP_GAM(x) \ - ((x & FEATURE_SNPAVICSUP_MASK) >> FEATURE_SNPAVICSUP_SHIFT == 0x1) + (FIELD_GET(FEATURE_SNPAVICSUP, x) == 0x1) +#define FEATURE_HT_RANGE_IGNORE BIT_ULL(11) + +#define FEATURE_NUM_INT_REMAP_SUP GENMASK_ULL(9, 8) +#define FEATURE_NUM_INT_REMAP_SUP_2K(x) \ + (FIELD_GET(FEATURE_NUM_INT_REMAP_SUP, x) == 0x1) /* Note: * The current driver only support 16-bit PASID. 
@@ -179,12 +180,16 @@ #define CONTROL_GAM_EN 25 #define CONTROL_GALOG_EN 28 #define CONTROL_GAINT_EN 29 +#define CONTROL_NUM_INT_REMAP_MODE 43 +#define CONTROL_NUM_INT_REMAP_MODE_MASK 0x03 +#define CONTROL_NUM_INT_REMAP_MODE_2K 0x01 +#define CONTROL_EPH_EN 45 #define CONTROL_XT_EN 50 #define CONTROL_INTCAPXT_EN 51 #define CONTROL_IRTCACHEDIS 59 #define CONTROL_SNPAVIC_EN 61 -#define CTRL_INV_TO_MASK (7 << CONTROL_INV_TIMEOUT) +#define CTRL_INV_TO_MASK 7 #define CTRL_INV_TO_NONE 0 #define CTRL_INV_TO_1MS 1 #define CTRL_INV_TO_10MS 2 @@ -224,6 +229,8 @@ #define DEV_ENTRY_EX 0x67 #define DEV_ENTRY_SYSMGT1 0x68 #define DEV_ENTRY_SYSMGT2 0x69 +#define DTE_DATA1_SYSMGT_MASK GENMASK_ULL(41, 40) + #define DEV_ENTRY_IRQ_TBL_EN 0x80 #define DEV_ENTRY_INIT_PASS 0xb8 #define DEV_ENTRY_EINT_PASS 0xb9 @@ -241,6 +248,10 @@ #define CMD_BUFFER_ENTRIES 512 #define MMIO_CMD_SIZE_SHIFT 56 #define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT) +#define MMIO_CMD_HEAD_MASK GENMASK_ULL(18, 4) /* Command buffer head ptr field [18:4] */ +#define MMIO_CMD_BUFFER_HEAD(x) FIELD_GET(MMIO_CMD_HEAD_MASK, (x)) +#define MMIO_CMD_TAIL_MASK GENMASK_ULL(18, 4) /* Command buffer tail ptr field [18:4] */ +#define MMIO_CMD_BUFFER_TAIL(x) FIELD_GET(MMIO_CMD_TAIL_MASK, (x)) /* constants for event buffer handling */ #define EVT_BUFFER_SIZE 8192 /* 512 entries */ @@ -294,8 +305,13 @@ * that we support. * * 512GB Pages are not supported due to a hardware bug + * Page sizes >= the 52 bit max physical address of the CPU are not supported. */ -#define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38)) +#define AMD_IOMMU_PGSIZES (GENMASK_ULL(51, 12) ^ SZ_512G) + +/* Special mode where page-sizes are limited to 4 KiB */ +#define AMD_IOMMU_PGSIZES_4K (PAGE_SIZE) + /* 4K, 2MB, 1G page sizes are supported */ #define AMD_IOMMU_PGSIZES_V2 (PAGE_SIZE | (1ULL << 21) | (1ULL << 30)) @@ -305,15 +321,14 @@ #define DTE_IRQ_REMAP_INTCTL (2ULL << 60) #define DTE_IRQ_REMAP_ENABLE 1ULL -/* - * AMD IOMMU hardware only support 512 IRTEs despite - * the architectural limitation of 2048 entries. - */ #define DTE_INTTAB_ALIGNMENT 128 -#define DTE_INTTABLEN_VALUE 9ULL -#define DTE_INTTABLEN (DTE_INTTABLEN_VALUE << 1) #define DTE_INTTABLEN_MASK (0xfULL << 1) -#define MAX_IRQS_PER_TABLE (1 << DTE_INTTABLEN_VALUE) +#define DTE_INTTABLEN_VALUE_512 9ULL +#define DTE_INTTABLEN_512 (DTE_INTTABLEN_VALUE_512 << 1) +#define MAX_IRQS_PER_TABLE_512 BIT(DTE_INTTABLEN_VALUE_512) +#define DTE_INTTABLEN_VALUE_2K 11ULL +#define DTE_INTTABLEN_2K (DTE_INTTABLEN_VALUE_2K << 1) +#define MAX_IRQS_PER_TABLE_2K BIT(DTE_INTTABLEN_VALUE_2K) #define PAGE_MODE_NONE 0x00 #define PAGE_MODE_1_LEVEL 0x01 @@ -327,76 +342,7 @@ #define GUEST_PGTABLE_4_LEVEL 0x00 #define GUEST_PGTABLE_5_LEVEL 0x01 -#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9)) -#define PM_LEVEL_SIZE(x) (((x) < 6) ? 
\ - ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \ - (0xffffffffffffffffULL)) -#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL) -#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL) -#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \ - IOMMU_PTE_PR | IOMMU_PTE_IR | IOMMU_PTE_IW) -#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL) - -#define PM_MAP_4k 0 #define PM_ADDR_MASK 0x000ffffffffff000ULL -#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \ - (~((1ULL << (12 + ((lvl) * 9))) - 1))) -#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr)) - -/* - * Returns the page table level to use for a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_LEVEL(pagesize) \ - ((__ffs(pagesize) - 12) / 9) -/* - * Returns the number of ptes to use for a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_PTE_COUNT(pagesize) \ - (1ULL << ((__ffs(pagesize) - 12) % 9)) - -/* - * Aligns a given io-virtual address to a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_ALIGN(address, pagesize) \ - ((address) & ~((pagesize) - 1)) -/* - * Creates an IOMMU PTE for an address and a given pagesize - * The PTE has no permission bits set - * Pagesize is expected to be a power-of-two larger than 4096 - */ -#define PAGE_SIZE_PTE(address, pagesize) \ - (((address) | ((pagesize) - 1)) & \ - (~(pagesize >> 1)) & PM_ADDR_MASK) - -/* - * Takes a PTE value with mode=0x07 and returns the page size it maps - */ -#define PTE_PAGE_SIZE(pte) \ - (1ULL << (1 + ffz(((pte) | 0xfffULL)))) - -/* - * Takes a page-table level and returns the default page-size for this level - */ -#define PTE_LEVEL_PAGE_SIZE(level) \ - (1ULL << (12 + (9 * (level)))) - -/* - * The IOPTE dirty bit - */ -#define IOMMU_PTE_HD_BIT (6) - -/* - * Bit value definition for I/O PTE fields - */ -#define IOMMU_PTE_PR BIT_ULL(0) -#define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT) -#define IOMMU_PTE_U BIT_ULL(59) -#define IOMMU_PTE_FC BIT_ULL(60) -#define IOMMU_PTE_IR BIT_ULL(61) -#define IOMMU_PTE_IW BIT_ULL(62) /* * Bit value definition for DTE fields @@ -406,8 +352,7 @@ #define DTE_FLAG_HAD (3ULL << 7) #define DTE_FLAG_GIOV BIT_ULL(54) #define DTE_FLAG_GV BIT_ULL(55) -#define DTE_GLX_SHIFT (56) -#define DTE_GLX_MASK (3) +#define DTE_GLX GENMASK_ULL(57, 56) #define DTE_FLAG_IR BIT_ULL(61) #define DTE_FLAG_IW BIT_ULL(62) @@ -415,27 +360,17 @@ #define DTE_FLAG_MASK (0x3ffULL << 32) #define DEV_DOMID_MASK 0xffffULL -#define DTE_GCR3_VAL_A(x) (((x) >> 12) & 0x00007ULL) -#define DTE_GCR3_VAL_B(x) (((x) >> 15) & 0x0ffffULL) -#define DTE_GCR3_VAL_C(x) (((x) >> 31) & 0x1fffffULL) - -#define DTE_GCR3_INDEX_A 0 -#define DTE_GCR3_INDEX_B 1 -#define DTE_GCR3_INDEX_C 1 - -#define DTE_GCR3_SHIFT_A 58 -#define DTE_GCR3_SHIFT_B 16 -#define DTE_GCR3_SHIFT_C 43 +#define DTE_GCR3_14_12 GENMASK_ULL(60, 58) +#define DTE_GCR3_30_15 GENMASK_ULL(31, 16) +#define DTE_GCR3_51_31 GENMASK_ULL(63, 43) #define DTE_GPT_LEVEL_SHIFT 54 +#define DTE_GPT_LEVEL_MASK GENMASK_ULL(55, 54) #define GCR3_VALID 0x01ULL -#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) -#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR) -#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD) -#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK)) -#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) +/* DTE[128:179] | DTE[184:191] */ +#define DTE_DATA2_INTR_MASK ~GENMASK_ULL(55, 52) #define IOMMU_PROT_MASK 0x03 #define IOMMU_PROT_IR 0x01 @@ -456,6 +391,9 @@ /* IOMMU Feature Reporting 
Field (for IVHD type 10h */ #define IOMMU_FEAT_GASUP_SHIFT 6 +/* IOMMU HATDIS for IVHD type 11h and 40h */ +#define IOMMU_IVHD_ATTR_HATDIS_SHIFT 0 + /* IOMMU Extended Feature Register (EFR) */ #define IOMMU_EFR_XTSUP_SHIFT 2 #define IOMMU_EFR_GASUP_SHIFT 7 @@ -471,7 +409,7 @@ extern bool amd_iommu_dump; #define DUMP_printk(format, arg...) \ do { \ if (amd_iommu_dump) \ - pr_info("AMD-Vi: " format, ## arg); \ + pr_info(format, ## arg); \ } while(0); /* global flag if IOMMUs cache non-present entries */ @@ -493,9 +431,6 @@ extern const struct iommu_ops amd_iommu_ops; /* IVRS indicates that pre-boot remapping was enabled */ extern bool amdr_ivrs_remap_support; -/* kmem_cache to get tables with 128 byte alignement */ -extern struct kmem_cache *amd_iommu_irq_cache; - #define PCI_SBDF_TO_SEGID(sbdf) (((sbdf) >> 16) & 0xffff) #define PCI_SBDF_TO_DEVID(sbdf) ((sbdf) & 0xffff) #define PCI_SEG_DEVID_TO_SBDF(seg, devid) ((((u32)(seg) & 0xffff) << 16) | \ @@ -519,6 +454,9 @@ extern struct kmem_cache *amd_iommu_irq_cache; #define for_each_pdom_dev_data_safe(pdom_dev_data, next, pdom) \ list_for_each_entry_safe((pdom_dev_data), (next), &pdom->dev_data_list, list) +#define for_each_ivhd_dte_flags(entry) \ + list_for_each_entry((entry), &amd_ivhd_dev_flags_list, list) + struct amd_iommu; struct iommu_domain; struct irq_domain; @@ -526,19 +464,6 @@ struct amd_irte_ops; #define AMD_IOMMU_FLAG_TRANS_PRE_ENABLED (1 << 0) -#define io_pgtable_to_data(x) \ - container_of((x), struct amd_io_pgtable, iop) - -#define io_pgtable_ops_to_data(x) \ - io_pgtable_to_data(io_pgtable_ops_to_pgtable(x)) - -#define io_pgtable_ops_to_domain(x) \ - container_of(io_pgtable_ops_to_data(x), \ - struct protection_domain, iop) - -#define io_pgtable_cfg_to_data(x) \ - container_of((x), struct amd_io_pgtable, pgtbl_cfg) - struct gcr3_tbl_info { u64 *gcr3_tbl; /* Guest CR3 table */ int glx; /* Number of levels for GCR3 table */ @@ -546,16 +471,9 @@ struct gcr3_tbl_info { u16 domid; /* Per device domain ID */ }; -struct amd_io_pgtable { - struct io_pgtable_cfg pgtbl_cfg; - struct io_pgtable iop; - int mode; - u64 *root; - u64 *pgd; /* v2 pgtable pgd pointer */ -}; - enum protection_domain_mode { - PD_MODE_V1 = 1, + PD_MODE_NONE, + PD_MODE_V1, PD_MODE_V2, }; @@ -569,26 +487,36 @@ struct pdom_dev_data { struct list_head list; }; +/* Keeps track of the IOMMUs attached to protection domain */ +struct pdom_iommu_info { + struct amd_iommu *iommu; /* IOMMUs attach to protection domain */ + u32 refcnt; /* Count of attached dev/pasid per domain/IOMMU */ +}; + /* * This structure contains generic data for IOMMU protection domains * independent of their use. 
*/ struct protection_domain { + union { + struct iommu_domain domain; + struct pt_iommu iommu; + struct pt_iommu_amdv1 amdv1; + struct pt_iommu_x86_64 amdv2; + }; struct list_head dev_list; /* List of all devices in this domain */ - struct iommu_domain domain; /* generic domain handle used by - iommu core code */ - struct amd_io_pgtable iop; spinlock_t lock; /* mostly used to lock the page table*/ u16 id; /* the domain id written to the device table */ - int nid; /* Node ID */ enum protection_domain_mode pd_mode; /* Track page table type */ bool dirty_tracking; /* dirty tracking is enabled in the domain */ - unsigned dev_cnt; /* devices assigned to this domain */ - unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ + struct xarray iommu_array; /* per-IOMMU reference count */ struct mmu_notifier mn; /* mmu notifier for the SVA domain */ struct list_head dev_data_list; /* List of pdom_dev_data */ }; +PT_IOMMU_CHECK_DOMAIN(struct protection_domain, iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv1.iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv2.iommu, domain); /* * This structure contains information about one PCI segment in the system. @@ -609,12 +537,6 @@ struct amd_iommu_pci_seg { /* Size of the device table */ u32 dev_table_size; - /* Size of the alias table */ - u32 alias_table_size; - - /* Size of the rlookup table */ - u32 rlookup_table_size; - /* * device table virtual address * @@ -785,10 +707,17 @@ struct amd_iommu { u32 flags; volatile u64 *cmd_sem; atomic64_t cmd_sem_val; + /* + * Track physical address to directly use it in build_completion_wait() + * and avoid adding any special checks and handling for kdump. + */ + u64 cmd_sem_paddr; #ifdef CONFIG_AMD_IOMMU_DEBUGFS /* DebugFS Info */ struct dentry *debugfs; + int dbg_mmio_offset; + int dbg_cap_offset; #endif /* IOPF support */ @@ -836,7 +765,8 @@ struct devid_map { */ struct iommu_dev_data { /*Protect against attach/detach races */ - spinlock_t lock; + struct mutex mutex; + spinlock_t dte_lock; /* DTE lock for 256-bit access */ struct list_head list; /* For domain->dev_list */ struct llist_node dev_data_list; /* For global dev_data_list */ @@ -845,6 +775,7 @@ struct iommu_dev_data { struct device *dev; u16 devid; /* PCI Device ID */ + unsigned int max_irqs; /* Maximum IRQs supported by device */ u32 max_pasids; /* Max supported PASIDs */ u32 flags; /* Holds AMD_IOMMU_DEVICE_FLAG_<*> */ int ats_qdep; @@ -878,16 +809,31 @@ extern struct list_head amd_iommu_pci_seg_list; extern struct list_head amd_iommu_list; /* - * Array with pointers to each IOMMU struct - * The indices are referenced in the protection domains + * Structure defining one entry in the device table */ -extern struct amd_iommu *amd_iommus[MAX_IOMMUS]; +struct dev_table_entry { + union { + u64 data[4]; + u128 data128[2]; + }; +}; /* - * Structure defining one entry in the device table + * Structure defining one entry in the command buffer */ -struct dev_table_entry { - u64 data[4]; +struct iommu_cmd { + u32 data[4]; +}; + +/* + * Structure to store persistent DTE flags from IVHD + */ +struct ivhd_dte_flags { + struct list_head list; + u16 segid; + u16 devid_first; + u16 devid_last; + struct dev_table_entry dte; }; /* @@ -914,17 +860,14 @@ struct unity_map_entry { * Data structures for device handling */ -/* size of the dma_ops aperture as power of 2 */ -extern unsigned amd_iommu_aperture_order; - -/* allocation bitmap for domain ids */ -extern unsigned long *amd_iommu_pd_alloc_bitmap; - extern bool
amd_iommu_force_isolation; /* Max levels of glxval supported */ extern int amd_iommu_max_glx_val; +/* IDA to track protection domain IDs */ +extern struct ida pdom_ids; + /* Global EFR and EFR2 registers */ extern u64 amd_iommu_efr; extern u64 amd_iommu_efr2; @@ -1046,7 +989,6 @@ struct irq_2_irte { }; struct amd_ir_data { - u32 cached_ga_tag; struct amd_iommu *iommu; struct irq_2_irte irq_2_irte; struct msi_msg msi_entry; diff --git a/drivers/iommu/amd/debugfs.c b/drivers/iommu/amd/debugfs.c index 545372fcc72f..20b04996441d 100644 --- a/drivers/iommu/amd/debugfs.c +++ b/drivers/iommu/amd/debugfs.c @@ -11,22 +11,382 @@ #include <linux/pci.h> #include "amd_iommu.h" +#include "../irq_remapping.h" static struct dentry *amd_iommu_debugfs; -static DEFINE_MUTEX(amd_iommu_debugfs_lock); #define MAX_NAME_LEN 20 +#define OFS_IN_SZ 8 +#define DEVID_IN_SZ 16 -void amd_iommu_debugfs_setup(struct amd_iommu *iommu) +static int sbdf = -1; + +static ssize_t iommu_mmio_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct seq_file *m = filp->private_data; + struct amd_iommu *iommu = m->private; + int ret; + + iommu->dbg_mmio_offset = -1; + + if (cnt > OFS_IN_SZ) + return -EINVAL; + + ret = kstrtou32_from_user(ubuf, cnt, 0, &iommu->dbg_mmio_offset); + if (ret) + return ret; + + if (iommu->dbg_mmio_offset > iommu->mmio_phys_end - sizeof(u64)) { + iommu->dbg_mmio_offset = -1; + return -EINVAL; + } + + return cnt; +} + +static int iommu_mmio_show(struct seq_file *m, void *unused) +{ + struct amd_iommu *iommu = m->private; + u64 value; + + if (iommu->dbg_mmio_offset < 0) { + seq_puts(m, "Please provide mmio register's offset\n"); + return 0; + } + + value = readq(iommu->mmio_base + iommu->dbg_mmio_offset); + seq_printf(m, "Offset:0x%x Value:0x%016llx\n", iommu->dbg_mmio_offset, value); + + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(iommu_mmio); + +static ssize_t iommu_capability_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct seq_file *m = filp->private_data; + struct amd_iommu *iommu = m->private; + int ret; + + iommu->dbg_cap_offset = -1; + + if (cnt > OFS_IN_SZ) + return -EINVAL; + + ret = kstrtou32_from_user(ubuf, cnt, 0, &iommu->dbg_cap_offset); + if (ret) + return ret; + + /* Capability register at offset 0x14 is the last IOMMU capability register. 
*/ + if (iommu->dbg_cap_offset > 0x14) { + iommu->dbg_cap_offset = -1; + return -EINVAL; + } + + return cnt; +} + +static int iommu_capability_show(struct seq_file *m, void *unused) +{ + struct amd_iommu *iommu = m->private; + u32 value; + int err; + + if (iommu->dbg_cap_offset < 0) { + seq_puts(m, "Please provide capability register's offset in the range [0x00 - 0x14]\n"); + return 0; + } + + err = pci_read_config_dword(iommu->dev, iommu->cap_ptr + iommu->dbg_cap_offset, &value); + if (err) { + seq_printf(m, "Not able to read capability register at 0x%x\n", + iommu->dbg_cap_offset); + return 0; + } + + seq_printf(m, "Offset:0x%x Value:0x%08x\n", iommu->dbg_cap_offset, value); + + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(iommu_capability); + +static int iommu_cmdbuf_show(struct seq_file *m, void *unused) +{ + struct amd_iommu *iommu = m->private; + struct iommu_cmd *cmd; + unsigned long flag; + u32 head, tail; + int i; + + raw_spin_lock_irqsave(&iommu->lock, flag); + head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + seq_printf(m, "CMD Buffer Head Offset:%d Tail Offset:%d\n", + (head >> 4) & 0x7fff, (tail >> 4) & 0x7fff); + for (i = 0; i < CMD_BUFFER_ENTRIES; i++) { + cmd = (struct iommu_cmd *)(iommu->cmd_buf + i * sizeof(*cmd)); + seq_printf(m, "%3d: %08x %08x %08x %08x\n", i, cmd->data[0], + cmd->data[1], cmd->data[2], cmd->data[3]); + } + raw_spin_unlock_irqrestore(&iommu->lock, flag); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(iommu_cmdbuf); + +static ssize_t devid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct amd_iommu_pci_seg *pci_seg; + int seg, bus, slot, func; + struct amd_iommu *iommu; + char *srcid_ptr; + u16 devid; + int i; + + sbdf = -1; + + if (cnt >= DEVID_IN_SZ) + return -EINVAL; + + srcid_ptr = memdup_user_nul(ubuf, cnt); + if (IS_ERR(srcid_ptr)) + return PTR_ERR(srcid_ptr); + + i = sscanf(srcid_ptr, "%x:%x:%x.%x", &seg, &bus, &slot, &func); + if (i != 4) { + i = sscanf(srcid_ptr, "%x:%x.%x", &bus, &slot, &func); + if (i != 3) { + kfree(srcid_ptr); + return -EINVAL; + } + seg = 0; + } + + devid = PCI_DEVID(bus, PCI_DEVFN(slot, func)); + + /* Check if user device id input is a valid input */ + for_each_pci_segment(pci_seg) { + if (pci_seg->id != seg) + continue; + if (devid > pci_seg->last_bdf) { + kfree(srcid_ptr); + return -EINVAL; + } + iommu = pci_seg->rlookup_table[devid]; + if (!iommu) { + kfree(srcid_ptr); + return -ENODEV; + } + break; + } + + if (pci_seg->id != seg) { + kfree(srcid_ptr); + return -EINVAL; + } + + sbdf = PCI_SEG_DEVID_TO_SBDF(seg, devid); + + kfree(srcid_ptr); + + return cnt; +} + +static int devid_show(struct seq_file *m, void *unused) { + u16 devid; + + if (sbdf >= 0) { + devid = PCI_SBDF_TO_DEVID(sbdf); + seq_printf(m, "%04x:%02x:%02x.%x\n", PCI_SBDF_TO_SEGID(sbdf), + PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid)); + } else + seq_puts(m, "No or Invalid input provided\n"); + + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(devid); + +static void dump_dte(struct seq_file *m, struct amd_iommu_pci_seg *pci_seg, u16 devid) +{ + struct dev_table_entry *dev_table; + struct amd_iommu *iommu; + + iommu = pci_seg->rlookup_table[devid]; + if (!iommu) + return; + + dev_table = get_dev_table(iommu); + if (!dev_table) { + seq_puts(m, "Device table not found"); + return; + } + + seq_printf(m, "%-12s %16s %16s %16s %16s iommu\n", "DeviceId", + "QWORD[3]", "QWORD[2]", "QWORD[1]", "QWORD[0]"); + seq_printf(m, "%04x:%02x:%02x.%x ", pci_seg->id, 
PCI_BUS_NUM(devid), + PCI_SLOT(devid), PCI_FUNC(devid)); + for (int i = 3; i >= 0; --i) + seq_printf(m, "%016llx ", dev_table[devid].data[i]); + seq_printf(m, "iommu%d\n", iommu->index); +} + +static int iommu_devtbl_show(struct seq_file *m, void *unused) +{ + struct amd_iommu_pci_seg *pci_seg; + u16 seg, devid; + + if (sbdf < 0) { + seq_puts(m, "Enter a valid device ID to 'devid' file\n"); + return 0; + } + seg = PCI_SBDF_TO_SEGID(sbdf); + devid = PCI_SBDF_TO_DEVID(sbdf); + + for_each_pci_segment(pci_seg) { + if (pci_seg->id != seg) + continue; + dump_dte(m, pci_seg, devid); + break; + } + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(iommu_devtbl); + +static void dump_128_irte(struct seq_file *m, struct irq_remap_table *table, u16 int_tab_len) +{ + struct irte_ga *ptr, *irte; + int index; + + for (index = 0; index < int_tab_len; index++) { + ptr = (struct irte_ga *)table->table; + irte = &ptr[index]; + + if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) && + !irte->lo.fields_vapic.valid) + continue; + else if (!irte->lo.fields_remap.valid) + continue; + seq_printf(m, "IRT[%04d] %016llx %016llx\n", index, irte->hi.val, irte->lo.val); + } +} + +static void dump_32_irte(struct seq_file *m, struct irq_remap_table *table, u16 int_tab_len) +{ + union irte *ptr, *irte; + int index; + + for (index = 0; index < int_tab_len; index++) { + ptr = (union irte *)table->table; + irte = &ptr[index]; + + if (!irte->fields.valid) + continue; + seq_printf(m, "IRT[%04d] %08x\n", index, irte->val); + } +} + +static void dump_irte(struct seq_file *m, u16 devid, struct amd_iommu_pci_seg *pci_seg) +{ + struct dev_table_entry *dev_table; + struct irq_remap_table *table; + struct amd_iommu *iommu; + unsigned long flags; + u16 int_tab_len; + + table = pci_seg->irq_lookup_table[devid]; + if (!table) { + seq_printf(m, "IRQ lookup table not set for %04x:%02x:%02x:%x\n", + pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid)); + return; + } + + iommu = pci_seg->rlookup_table[devid]; + if (!iommu) + return; + + dev_table = get_dev_table(iommu); + if (!dev_table) { + seq_puts(m, "Device table not found"); + return; + } + + int_tab_len = dev_table[devid].data[2] & DTE_INTTABLEN_MASK; + if (int_tab_len != DTE_INTTABLEN_512 && int_tab_len != DTE_INTTABLEN_2K) { + seq_puts(m, "The device's DTE contains an invalid IRT length value."); + return; + } + + seq_printf(m, "DeviceId %04x:%02x:%02x.%x\n", pci_seg->id, PCI_BUS_NUM(devid), + PCI_SLOT(devid), PCI_FUNC(devid)); + + raw_spin_lock_irqsave(&table->lock, flags); + if (AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) + dump_128_irte(m, table, BIT(int_tab_len >> 1)); + else + dump_32_irte(m, table, BIT(int_tab_len >> 1)); + seq_puts(m, "\n"); + raw_spin_unlock_irqrestore(&table->lock, flags); +} + +static int iommu_irqtbl_show(struct seq_file *m, void *unused) +{ + struct amd_iommu_pci_seg *pci_seg; + u16 devid, seg; + + if (!irq_remapping_enabled) { + seq_puts(m, "Interrupt remapping is disabled\n"); + return 0; + } + + if (sbdf < 0) { + seq_puts(m, "Enter a valid device ID to 'devid' file\n"); + return 0; + } + + seg = PCI_SBDF_TO_SEGID(sbdf); + devid = PCI_SBDF_TO_DEVID(sbdf); + + for_each_pci_segment(pci_seg) { + if (pci_seg->id != seg) + continue; + dump_irte(m, devid, pci_seg); + break; + } + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(iommu_irqtbl); + +void amd_iommu_debugfs_setup(void) +{ + struct amd_iommu *iommu; char name[MAX_NAME_LEN + 1]; - mutex_lock(&amd_iommu_debugfs_lock); - if (!amd_iommu_debugfs) - amd_iommu_debugfs = debugfs_create_dir("amd", - iommu_debugfs_dir); 
- mutex_unlock(&amd_iommu_debugfs_lock); + amd_iommu_debugfs = debugfs_create_dir("amd", iommu_debugfs_dir); + + for_each_iommu(iommu) { + iommu->dbg_mmio_offset = -1; + iommu->dbg_cap_offset = -1; + + snprintf(name, MAX_NAME_LEN, "iommu%02d", iommu->index); + iommu->debugfs = debugfs_create_dir(name, amd_iommu_debugfs); + + debugfs_create_file("mmio", 0644, iommu->debugfs, iommu, + &iommu_mmio_fops); + debugfs_create_file("capability", 0644, iommu->debugfs, iommu, + &iommu_capability_fops); + debugfs_create_file("cmdbuf", 0444, iommu->debugfs, iommu, + &iommu_cmdbuf_fops); + } - snprintf(name, MAX_NAME_LEN, "iommu%02d", iommu->index); - iommu->debugfs = debugfs_create_dir(name, amd_iommu_debugfs); + debugfs_create_file("devid", 0644, amd_iommu_debugfs, NULL, + &devid_fops); + debugfs_create_file("devtbl", 0444, amd_iommu_debugfs, NULL, + &iommu_devtbl_fops); + debugfs_create_file("irqtbl", 0444, amd_iommu_debugfs, NULL, + &iommu_irqtbl_fops); } diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index c89d85b54a1a..4b2953418977 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -12,7 +12,6 @@ #include <linux/acpi.h> #include <linux/list.h> #include <linux/bitmap.h> -#include <linux/slab.h> #include <linux/syscore_ops.h> #include <linux/interrupt.h> #include <linux/msi.h> @@ -152,7 +151,9 @@ struct ivmd_header { bool amd_iommu_dump; bool amd_iommu_irq_remap __read_mostly; -enum io_pgtable_fmt amd_iommu_pgtable = AMD_IOMMU_V1; +enum protection_domain_mode amd_iommu_pgtable = PD_MODE_V1; +/* Host page table level */ +u8 amd_iommu_hpt_level; /* Guest page table level */ int amd_iommu_gpt_level = PAGE_MODE_4_LEVEL; @@ -169,16 +170,16 @@ static int amd_iommu_target_ivhd_type; u64 amd_iommu_efr; u64 amd_iommu_efr2; +/* Host (v1) page table is not supported*/ +bool amd_iommu_hatdis; + /* SNP is enabled on the system? */ bool amd_iommu_snp_en; EXPORT_SYMBOL(amd_iommu_snp_en); LIST_HEAD(amd_iommu_pci_seg_list); /* list of all PCI segments */ -LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the - system */ - -/* Array to assign indices to IOMMUs*/ -struct amd_iommu *amd_iommus[MAX_IOMMUS]; +LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */ +LIST_HEAD(amd_ivhd_dev_flags_list); /* list of all IVHD device entry settings */ /* Number of IOMMUs present in the system */ static int amd_iommus_present; @@ -192,11 +193,7 @@ bool amdr_ivrs_remap_support __read_mostly; bool amd_iommu_force_isolation __read_mostly; -/* - * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap - * to know which ones are already in use. 
- */ -unsigned long *amd_iommu_pd_alloc_bitmap; +unsigned long amd_iommu_pgsize_bitmap __ro_after_init = AMD_IOMMU_PGSIZES; enum iommu_init_state { IOMMU_START_STATE, @@ -226,7 +223,6 @@ static bool __initdata cmdline_maps; static enum iommu_init_state init_state = IOMMU_START_STATE; static int amd_iommu_enable_interrupts(void); -static int __init iommu_go_to_state(enum iommu_init_state state); static void init_device_table_dma(struct amd_iommu_pci_seg *pci_seg); static bool amd_iommu_pre_enabled = true; @@ -252,17 +248,14 @@ static void init_translation_status(struct amd_iommu *iommu) iommu->flags |= AMD_IOMMU_FLAG_TRANS_PRE_ENABLED; } -static inline unsigned long tbl_size(int entry_size, int last_bdf) +int amd_iommu_get_num_iommus(void) { - unsigned shift = PAGE_SHIFT + - get_order((last_bdf + 1) * entry_size); - - return 1UL << shift; + return amd_iommus_present; } -int amd_iommu_get_num_iommus(void) +bool amd_iommu_ht_range_ignore(void) { - return amd_iommus_present; + return check_feature2(FEATURE_HT_RANGE_IGNORE); } /* @@ -413,39 +406,35 @@ static void iommu_set_device_table(struct amd_iommu *iommu) BUG_ON(iommu->mmio_base == NULL); + if (is_kdump_kernel()) + return; + entry = iommu_virt_to_phys(dev_table); entry |= (dev_table_size >> 12) - 1; memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET, &entry, sizeof(entry)); } -/* Generic functions to enable/disable certain features of the IOMMU. */ -void iommu_feature_enable(struct amd_iommu *iommu, u8 bit) +static void iommu_feature_set(struct amd_iommu *iommu, u64 val, u64 mask, u8 shift) { u64 ctrl; ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET); - ctrl |= (1ULL << bit); + mask <<= shift; + ctrl &= ~mask; + ctrl |= (val << shift) & mask; writeq(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); } -static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit) +/* Generic functions to enable/disable certain features of the IOMMU. */ +void iommu_feature_enable(struct amd_iommu *iommu, u8 bit) { - u64 ctrl; - - ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET); - ctrl &= ~(1ULL << bit); - writeq(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); + iommu_feature_set(iommu, 1ULL, 1ULL, bit); } -static void iommu_set_inv_tlb_timeout(struct amd_iommu *iommu, int timeout) +static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit) { - u64 ctrl; - - ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET); - ctrl &= ~CTRL_INV_TO_MASK; - ctrl |= (timeout << CONTROL_INV_TIMEOUT) & CTRL_INV_TO_MASK; - writeq(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); + iommu_feature_set(iommu, 0ULL, 1ULL, bit); } /* Function to enable the hardware */ @@ -650,8 +639,8 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table, u16 pci_ /* Allocate per PCI segment device table */ static inline int __init alloc_dev_table(struct amd_iommu_pci_seg *pci_seg) { - pci_seg->dev_table = iommu_alloc_pages(GFP_KERNEL | GFP_DMA32, - get_order(pci_seg->dev_table_size)); + pci_seg->dev_table = iommu_alloc_pages_sz(GFP_KERNEL | GFP_DMA32, + pci_seg->dev_table_size); if (!pci_seg->dev_table) return -ENOMEM; @@ -660,16 +649,19 @@ static inline int __init alloc_dev_table(struct amd_iommu_pci_seg *pci_seg) static inline void free_dev_table(struct amd_iommu_pci_seg *pci_seg) { - iommu_free_pages(pci_seg->dev_table, - get_order(pci_seg->dev_table_size)); + if (is_kdump_kernel()) + memunmap((void *)pci_seg->dev_table); + else + iommu_free_pages(pci_seg->dev_table); pci_seg->dev_table = NULL; } /* Allocate per PCI segment IOMMU rlookup table. 
*/ static inline int __init alloc_rlookup_table(struct amd_iommu_pci_seg *pci_seg) { - pci_seg->rlookup_table = iommu_alloc_pages(GFP_KERNEL, - get_order(pci_seg->rlookup_table_size)); + pci_seg->rlookup_table = kvcalloc(pci_seg->last_bdf + 1, + sizeof(*pci_seg->rlookup_table), + GFP_KERNEL); if (pci_seg->rlookup_table == NULL) return -ENOMEM; @@ -678,17 +670,15 @@ static inline int __init alloc_rlookup_table(struct amd_iommu_pci_seg *pci_seg) static inline void free_rlookup_table(struct amd_iommu_pci_seg *pci_seg) { - iommu_free_pages(pci_seg->rlookup_table, - get_order(pci_seg->rlookup_table_size)); + kvfree(pci_seg->rlookup_table); pci_seg->rlookup_table = NULL; } static inline int __init alloc_irq_lookup_table(struct amd_iommu_pci_seg *pci_seg) { - pci_seg->irq_lookup_table = iommu_alloc_pages(GFP_KERNEL, - get_order(pci_seg->rlookup_table_size)); - kmemleak_alloc(pci_seg->irq_lookup_table, - pci_seg->rlookup_table_size, 1, GFP_KERNEL); + pci_seg->irq_lookup_table = kvcalloc(pci_seg->last_bdf + 1, + sizeof(*pci_seg->irq_lookup_table), + GFP_KERNEL); if (pci_seg->irq_lookup_table == NULL) return -ENOMEM; @@ -697,9 +687,7 @@ static inline int __init alloc_irq_lookup_table(struct amd_iommu_pci_seg *pci_se static inline void free_irq_lookup_table(struct amd_iommu_pci_seg *pci_seg) { - kmemleak_free(pci_seg->irq_lookup_table); - iommu_free_pages(pci_seg->irq_lookup_table, - get_order(pci_seg->rlookup_table_size)); + kvfree(pci_seg->irq_lookup_table); pci_seg->irq_lookup_table = NULL; } @@ -707,8 +695,9 @@ static int __init alloc_alias_table(struct amd_iommu_pci_seg *pci_seg) { int i; - pci_seg->alias_table = iommu_alloc_pages(GFP_KERNEL, - get_order(pci_seg->alias_table_size)); + pci_seg->alias_table = kvmalloc_array(pci_seg->last_bdf + 1, + sizeof(*pci_seg->alias_table), + GFP_KERNEL); if (!pci_seg->alias_table) return -ENOMEM; @@ -723,11 +712,30 @@ static int __init alloc_alias_table(struct amd_iommu_pci_seg *pci_seg) static void __init free_alias_table(struct amd_iommu_pci_seg *pci_seg) { - iommu_free_pages(pci_seg->alias_table, - get_order(pci_seg->alias_table_size)); + kvfree(pci_seg->alias_table); pci_seg->alias_table = NULL; } +static inline void *iommu_memremap(unsigned long paddr, size_t size) +{ + phys_addr_t phys; + + if (!paddr) + return NULL; + + /* + * Obtain true physical address in kdump kernel when SME is enabled. + * Currently, previous kernel with SME enabled and kdump kernel + * with SME support disabled is not supported. + */ + phys = __sme_clr(paddr); + + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + return (__force void *)ioremap_encrypted(phys, size); + else + return memremap(phys, size, MEMREMAP_WB); +} + /* * Allocates the command buffer. This buffer is per AMD IOMMU. We can * write commands to that buffer later and the IOMMU will execute them @@ -735,8 +743,7 @@ static void __init free_alias_table(struct amd_iommu_pci_seg *pci_seg) */ static int __init alloc_command_buffer(struct amd_iommu *iommu) { - iommu->cmd_buf = iommu_alloc_pages(GFP_KERNEL, - get_order(CMD_BUFFER_SIZE)); + iommu->cmd_buf = iommu_alloc_pages_sz(GFP_KERNEL, CMD_BUFFER_SIZE); return iommu->cmd_buf ? 
0 : -ENOMEM; } @@ -814,11 +821,16 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu) BUG_ON(iommu->cmd_buf == NULL); - entry = iommu_virt_to_phys(iommu->cmd_buf); - entry |= MMIO_CMD_SIZE_512; - - memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, - &entry, sizeof(entry)); + if (!is_kdump_kernel()) { + /* + * Command buffer is re-used for kdump kernel and setting + * of MMIO register is not required. + */ + entry = iommu_virt_to_phys(iommu->cmd_buf); + entry |= MMIO_CMD_SIZE_512; + memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, + &entry, sizeof(entry)); + } amd_iommu_reset_cmd_buffer(iommu); } @@ -833,20 +845,22 @@ static void iommu_disable_command_buffer(struct amd_iommu *iommu) static void __init free_command_buffer(struct amd_iommu *iommu) { - iommu_free_pages(iommu->cmd_buf, get_order(CMD_BUFFER_SIZE)); + iommu_free_pages(iommu->cmd_buf); } void *__init iommu_alloc_4k_pages(struct amd_iommu *iommu, gfp_t gfp, size_t size) { - int order = get_order(size); - void *buf = iommu_alloc_pages(gfp, order); + void *buf; - if (buf && - check_feature(FEATURE_SNP) && - set_memory_4k((unsigned long)buf, (1 << order))) { - iommu_free_pages(buf, order); - buf = NULL; + size = PAGE_ALIGN(size); + buf = iommu_alloc_pages_sz(gfp, size); + if (!buf) + return NULL; + if (check_feature(FEATURE_SNP) && + set_memory_4k((unsigned long)buf, size / PAGE_SIZE)) { + iommu_free_pages(buf); + return NULL; } return buf; @@ -867,10 +881,15 @@ static void iommu_enable_event_buffer(struct amd_iommu *iommu) BUG_ON(iommu->evt_buf == NULL); - entry = iommu_virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; - - memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, - &entry, sizeof(entry)); + if (!is_kdump_kernel()) { + /* + * Event buffer is re-used for kdump kernel and setting + * of MMIO register is not required. 
+ */ + entry = iommu_virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; + memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, + &entry, sizeof(entry)); + } /* set head and tail to zero manually */ writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); @@ -889,14 +908,14 @@ static void iommu_disable_event_buffer(struct amd_iommu *iommu) static void __init free_event_buffer(struct amd_iommu *iommu) { - iommu_free_pages(iommu->evt_buf, get_order(EVT_BUFFER_SIZE)); + iommu_free_pages(iommu->evt_buf); } static void free_ga_log(struct amd_iommu *iommu) { #ifdef CONFIG_IRQ_REMAP - iommu_free_pages(iommu->ga_log, get_order(GA_LOG_SIZE)); - iommu_free_pages(iommu->ga_log_tail, get_order(8)); + iommu_free_pages(iommu->ga_log); + iommu_free_pages(iommu->ga_log_tail); #endif } @@ -941,11 +960,11 @@ static int iommu_init_ga_log(struct amd_iommu *iommu) if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) return 0; - iommu->ga_log = iommu_alloc_pages(GFP_KERNEL, get_order(GA_LOG_SIZE)); + iommu->ga_log = iommu_alloc_pages_sz(GFP_KERNEL, GA_LOG_SIZE); if (!iommu->ga_log) goto err_out; - iommu->ga_log_tail = iommu_alloc_pages(GFP_KERNEL, get_order(8)); + iommu->ga_log_tail = iommu_alloc_pages_sz(GFP_KERNEL, 8); if (!iommu->ga_log_tail) goto err_out; @@ -959,14 +978,129 @@ err_out: static int __init alloc_cwwb_sem(struct amd_iommu *iommu) { iommu->cmd_sem = iommu_alloc_4k_pages(iommu, GFP_KERNEL, 1); + if (!iommu->cmd_sem) + return -ENOMEM; + iommu->cmd_sem_paddr = iommu_virt_to_phys((void *)iommu->cmd_sem); + return 0; +} + +static int __init remap_event_buffer(struct amd_iommu *iommu) +{ + u64 paddr; + + pr_info_once("Re-using event buffer from the previous kernel\n"); + paddr = readq(iommu->mmio_base + MMIO_EVT_BUF_OFFSET) & PM_ADDR_MASK; + iommu->evt_buf = iommu_memremap(paddr, EVT_BUFFER_SIZE); - return iommu->cmd_sem ? 0 : -ENOMEM; + return iommu->evt_buf ? 0 : -ENOMEM; +} + +static int __init remap_command_buffer(struct amd_iommu *iommu) +{ + u64 paddr; + + pr_info_once("Re-using command buffer from the previous kernel\n"); + paddr = readq(iommu->mmio_base + MMIO_CMD_BUF_OFFSET) & PM_ADDR_MASK; + iommu->cmd_buf = iommu_memremap(paddr, CMD_BUFFER_SIZE); + + return iommu->cmd_buf ? 0 : -ENOMEM; +} + +static int __init remap_or_alloc_cwwb_sem(struct amd_iommu *iommu) +{ + u64 paddr; + + if (check_feature(FEATURE_SNP)) { + /* + * When SNP is enabled, the exclusion base register is used for the + * completion wait buffer (CWB) address. Read and re-use it. + */ + pr_info_once("Re-using CWB buffers from the previous kernel\n"); + paddr = readq(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET) & PM_ADDR_MASK; + iommu->cmd_sem = iommu_memremap(paddr, PAGE_SIZE); + if (!iommu->cmd_sem) + return -ENOMEM; + iommu->cmd_sem_paddr = paddr; + } else { + return alloc_cwwb_sem(iommu); + } + + return 0; +} + +static int __init alloc_iommu_buffers(struct amd_iommu *iommu) +{ + int ret; + + /* + * Reuse/Remap the previous kernel's allocated completion wait + * command and event buffers for kdump boot. 
+ */ + if (is_kdump_kernel()) { + ret = remap_or_alloc_cwwb_sem(iommu); + if (ret) + return ret; + + ret = remap_command_buffer(iommu); + if (ret) + return ret; + + ret = remap_event_buffer(iommu); + if (ret) + return ret; + } else { + ret = alloc_cwwb_sem(iommu); + if (ret) + return ret; + + ret = alloc_command_buffer(iommu); + if (ret) + return ret; + + ret = alloc_event_buffer(iommu); + if (ret) + return ret; + } + + return 0; } static void __init free_cwwb_sem(struct amd_iommu *iommu) { if (iommu->cmd_sem) - iommu_free_page((void *)iommu->cmd_sem); + iommu_free_pages((void *)iommu->cmd_sem); +} +static void __init unmap_cwwb_sem(struct amd_iommu *iommu) +{ + if (iommu->cmd_sem) { + if (check_feature(FEATURE_SNP)) + memunmap((void *)iommu->cmd_sem); + else + iommu_free_pages((void *)iommu->cmd_sem); + } +} + +static void __init unmap_command_buffer(struct amd_iommu *iommu) +{ + memunmap((void *)iommu->cmd_buf); +} + +static void __init unmap_event_buffer(struct amd_iommu *iommu) +{ + memunmap(iommu->evt_buf); +} + +static void __init free_iommu_buffers(struct amd_iommu *iommu) +{ + if (is_kdump_kernel()) { + unmap_cwwb_sem(iommu); + unmap_command_buffer(iommu); + unmap_event_buffer(iommu); + } else { + free_cwwb_sem(iommu); + free_command_buffer(iommu); + free_event_buffer(iommu); + } } static void iommu_enable_xt(struct amd_iommu *iommu) @@ -991,47 +1125,20 @@ static void iommu_enable_gt(struct amd_iommu *iommu) } /* sets a specific bit in the device table entry. */ -static void __set_dev_entry_bit(struct dev_table_entry *dev_table, - u16 devid, u8 bit) -{ - int i = (bit >> 6) & 0x03; - int _bit = bit & 0x3f; - - dev_table[devid].data[i] |= (1UL << _bit); -} - -static void set_dev_entry_bit(struct amd_iommu *iommu, u16 devid, u8 bit) -{ - struct dev_table_entry *dev_table = get_dev_table(iommu); - - return __set_dev_entry_bit(dev_table, devid, bit); -} - -static int __get_dev_entry_bit(struct dev_table_entry *dev_table, - u16 devid, u8 bit) +static void set_dte_bit(struct dev_table_entry *dte, u8 bit) { int i = (bit >> 6) & 0x03; int _bit = bit & 0x3f; - return (dev_table[devid].data[i] & (1UL << _bit)) >> _bit; + dte->data[i] |= (1UL << _bit); } -static int get_dev_entry_bit(struct amd_iommu *iommu, u16 devid, u8 bit) +static bool __reuse_device_table(struct amd_iommu *iommu) { - struct dev_table_entry *dev_table = get_dev_table(iommu); - - return __get_dev_entry_bit(dev_table, devid, bit); -} - -static bool __copy_device_table(struct amd_iommu *iommu) -{ - u64 int_ctl, int_tab_len, entry = 0; struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; - struct dev_table_entry *old_devtb = NULL; - u32 lo, hi, devid, old_devtb_size; + u32 lo, hi, old_devtb_size; phys_addr_t old_devtb_phys; - u16 dom_id, dte_v, irq_v; - u64 tmp; + u64 entry; /* Each IOMMU use separate device table with the same size */ lo = readl(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET); @@ -1056,62 +1163,20 @@ static bool __copy_device_table(struct amd_iommu *iommu) pr_err("The address of old device table is above 4G, not trustworthy!\n"); return false; } - old_devtb = (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT) && is_kdump_kernel()) - ? (__force void *)ioremap_encrypted(old_devtb_phys, - pci_seg->dev_table_size) - : memremap(old_devtb_phys, pci_seg->dev_table_size, MEMREMAP_WB); - - if (!old_devtb) - return false; - pci_seg->old_dev_tbl_cpy = iommu_alloc_pages(GFP_KERNEL | GFP_DMA32, - get_order(pci_seg->dev_table_size)); + /* + * Re-use the previous kernel's device table for kdump. 
+ */ + pci_seg->old_dev_tbl_cpy = iommu_memremap(old_devtb_phys, pci_seg->dev_table_size); if (pci_seg->old_dev_tbl_cpy == NULL) { - pr_err("Failed to allocate memory for copying old device table!\n"); - memunmap(old_devtb); + pr_err("Failed to remap memory for reusing old device table!\n"); return false; } - for (devid = 0; devid <= pci_seg->last_bdf; ++devid) { - pci_seg->old_dev_tbl_cpy[devid] = old_devtb[devid]; - dom_id = old_devtb[devid].data[1] & DEV_DOMID_MASK; - dte_v = old_devtb[devid].data[0] & DTE_FLAG_V; - - if (dte_v && dom_id) { - pci_seg->old_dev_tbl_cpy[devid].data[0] = old_devtb[devid].data[0]; - pci_seg->old_dev_tbl_cpy[devid].data[1] = old_devtb[devid].data[1]; - __set_bit(dom_id, amd_iommu_pd_alloc_bitmap); - /* If gcr3 table existed, mask it out */ - if (old_devtb[devid].data[0] & DTE_FLAG_GV) { - tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B; - tmp |= DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C; - pci_seg->old_dev_tbl_cpy[devid].data[1] &= ~tmp; - tmp = DTE_GCR3_VAL_A(~0ULL) << DTE_GCR3_SHIFT_A; - tmp |= DTE_FLAG_GV; - pci_seg->old_dev_tbl_cpy[devid].data[0] &= ~tmp; - } - } - - irq_v = old_devtb[devid].data[2] & DTE_IRQ_REMAP_ENABLE; - int_ctl = old_devtb[devid].data[2] & DTE_IRQ_REMAP_INTCTL_MASK; - int_tab_len = old_devtb[devid].data[2] & DTE_INTTABLEN_MASK; - if (irq_v && (int_ctl || int_tab_len)) { - if ((int_ctl != DTE_IRQ_REMAP_INTCTL) || - (int_tab_len != DTE_INTTABLEN)) { - pr_err("Wrong old irq remapping flag: %#x\n", devid); - memunmap(old_devtb); - return false; - } - - pci_seg->old_dev_tbl_cpy[devid].data[2] = old_devtb[devid].data[2]; - } - } - memunmap(old_devtb); - return true; } -static bool copy_device_table(void) +static bool reuse_device_table(void) { struct amd_iommu *iommu; struct amd_iommu_pci_seg *pci_seg; @@ -1119,17 +1184,17 @@ static bool copy_device_table(void) if (!amd_iommu_pre_enabled) return false; - pr_warn("Translation is already enabled - trying to copy translation structures\n"); + pr_warn("Translation is already enabled - trying to reuse translation structures\n"); /* * All IOMMUs within PCI segment shares common device table. - * Hence copy device table only once per PCI segment. + * Hence reuse device table only once per PCI segment. */ for_each_pci_segment(pci_seg) { for_each_iommu(iommu) { if (pci_seg->id != iommu->pci_seg->id) continue; - if (!__copy_device_table(iommu)) + if (!__reuse_device_table(iommu)) return false; break; } @@ -1138,42 +1203,107 @@ static bool copy_device_table(void) return true; } -void amd_iommu_apply_erratum_63(struct amd_iommu *iommu, u16 devid) +struct dev_table_entry *amd_iommu_get_ivhd_dte_flags(u16 segid, u16 devid) { - int sysmgt; + struct ivhd_dte_flags *e; + unsigned int best_len = UINT_MAX; + struct dev_table_entry *dte = NULL; - sysmgt = get_dev_entry_bit(iommu, devid, DEV_ENTRY_SYSMGT1) | - (get_dev_entry_bit(iommu, devid, DEV_ENTRY_SYSMGT2) << 1); + for_each_ivhd_dte_flags(e) { + /* + * Need to go through the whole list to find the smallest range, + * which contains the devid. 
+ */ + if ((e->segid == segid) && + (e->devid_first <= devid) && (devid <= e->devid_last)) { + unsigned int len = e->devid_last - e->devid_first; - if (sysmgt == 0x01) - set_dev_entry_bit(iommu, devid, DEV_ENTRY_IW); + if (len < best_len) { + dte = &(e->dte); + best_len = len; + } + } + } + return dte; +} + +static bool search_ivhd_dte_flags(u16 segid, u16 first, u16 last) +{ + struct ivhd_dte_flags *e; + + for_each_ivhd_dte_flags(e) { + if ((e->segid == segid) && + (e->devid_first == first) && + (e->devid_last == last)) + return true; + } + return false; } /* * This function takes the device specific flags read from the ACPI * table and sets up the device table entry with that information */ -static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu, - u16 devid, u32 flags, u32 ext_flags) +static void __init +set_dev_entry_from_acpi_range(struct amd_iommu *iommu, u16 first, u16 last, + u32 flags, u32 ext_flags) { - if (flags & ACPI_DEVFLAG_INITPASS) - set_dev_entry_bit(iommu, devid, DEV_ENTRY_INIT_PASS); - if (flags & ACPI_DEVFLAG_EXTINT) - set_dev_entry_bit(iommu, devid, DEV_ENTRY_EINT_PASS); - if (flags & ACPI_DEVFLAG_NMI) - set_dev_entry_bit(iommu, devid, DEV_ENTRY_NMI_PASS); - if (flags & ACPI_DEVFLAG_SYSMGT1) - set_dev_entry_bit(iommu, devid, DEV_ENTRY_SYSMGT1); - if (flags & ACPI_DEVFLAG_SYSMGT2) - set_dev_entry_bit(iommu, devid, DEV_ENTRY_SYSMGT2); - if (flags & ACPI_DEVFLAG_LINT0) - set_dev_entry_bit(iommu, devid, DEV_ENTRY_LINT0_PASS); - if (flags & ACPI_DEVFLAG_LINT1) - set_dev_entry_bit(iommu, devid, DEV_ENTRY_LINT1_PASS); + int i; + struct dev_table_entry dte = {}; + + /* Parse IVHD DTE setting flags and store information */ + if (flags) { + struct ivhd_dte_flags *d; - amd_iommu_apply_erratum_63(iommu, devid); + if (search_ivhd_dte_flags(iommu->pci_seg->id, first, last)) + return; + + d = kzalloc(sizeof(struct ivhd_dte_flags), GFP_KERNEL); + if (!d) + return; - amd_iommu_set_rlookup_table(iommu, devid); + pr_debug("%s: devid range %#x:%#x\n", __func__, first, last); + + if (flags & ACPI_DEVFLAG_INITPASS) + set_dte_bit(&dte, DEV_ENTRY_INIT_PASS); + if (flags & ACPI_DEVFLAG_EXTINT) + set_dte_bit(&dte, DEV_ENTRY_EINT_PASS); + if (flags & ACPI_DEVFLAG_NMI) + set_dte_bit(&dte, DEV_ENTRY_NMI_PASS); + if (flags & ACPI_DEVFLAG_SYSMGT1) + set_dte_bit(&dte, DEV_ENTRY_SYSMGT1); + if (flags & ACPI_DEVFLAG_SYSMGT2) + set_dte_bit(&dte, DEV_ENTRY_SYSMGT2); + if (flags & ACPI_DEVFLAG_LINT0) + set_dte_bit(&dte, DEV_ENTRY_LINT0_PASS); + if (flags & ACPI_DEVFLAG_LINT1) + set_dte_bit(&dte, DEV_ENTRY_LINT1_PASS); + + /* Apply erratum 63, which needs info in initial_dte */ + if (FIELD_GET(DTE_DATA1_SYSMGT_MASK, dte.data[1]) == 0x1) + dte.data[0] |= DTE_FLAG_IW; + + memcpy(&d->dte, &dte, sizeof(dte)); + d->segid = iommu->pci_seg->id; + d->devid_first = first; + d->devid_last = last; + list_add_tail(&d->list, &amd_ivhd_dev_flags_list); + } + + for (i = first; i <= last; i++) { + if (flags) { + struct dev_table_entry *dev_table = get_dev_table(iommu); + + memcpy(&dev_table[i], &dte, sizeof(dte)); + } + amd_iommu_set_rlookup_table(iommu, i); + } +} + +static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu, + u16 devid, u32 flags, u32 ext_flags) +{ + set_dev_entry_from_acpi_range(iommu, devid, devid, flags, ext_flags); } int __init add_special_device(u8 type, u8 id, u32 *devid, bool cmd_line) @@ -1241,7 +1371,7 @@ static int __init add_acpi_hid_device(u8 *hid, u8 *uid, u32 *devid, entry->cmd_line = cmd_line; entry->root_devid = (entry->devid & (~0x7)); - pr_info("%s, add hid:%s, 
uid:%s, rdevid:%d\n", + pr_info("%s, add hid:%s, uid:%s, rdevid:%#x\n", entry->cmd_line ? "cmd" : "ivrs", entry->hid, entry->uid, entry->root_devid); @@ -1333,15 +1463,12 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, switch (e->type) { case IVHD_DEV_ALL: - DUMP_printk(" DEV_ALL\t\t\tflags: %02x\n", e->flags); - - for (dev_i = 0; dev_i <= pci_seg->last_bdf; ++dev_i) - set_dev_entry_from_acpi(iommu, dev_i, e->flags, 0); + DUMP_printk(" DEV_ALL\t\t\tsetting: %#02x\n", e->flags); + set_dev_entry_from_acpi_range(iommu, 0, pci_seg->last_bdf, e->flags, 0); break; case IVHD_DEV_SELECT: - DUMP_printk(" DEV_SELECT\t\t\t devid: %04x:%02x:%02x.%x " - "flags: %02x\n", + DUMP_printk(" DEV_SELECT\t\t\tdevid: %04x:%02x:%02x.%x flags: %#02x\n", seg_id, PCI_BUS_NUM(e->devid), PCI_SLOT(e->devid), PCI_FUNC(e->devid), @@ -1352,8 +1479,7 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, break; case IVHD_DEV_SELECT_RANGE_START: - DUMP_printk(" DEV_SELECT_RANGE_START\t " - "devid: %04x:%02x:%02x.%x flags: %02x\n", + DUMP_printk(" DEV_SELECT_RANGE_START\tdevid: %04x:%02x:%02x.%x flags: %#02x\n", seg_id, PCI_BUS_NUM(e->devid), PCI_SLOT(e->devid), PCI_FUNC(e->devid), @@ -1366,8 +1492,7 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, break; case IVHD_DEV_ALIAS: - DUMP_printk(" DEV_ALIAS\t\t\t devid: %04x:%02x:%02x.%x " - "flags: %02x devid_to: %02x:%02x.%x\n", + DUMP_printk(" DEV_ALIAS\t\t\tdevid: %04x:%02x:%02x.%x flags: %#02x devid_to: %02x:%02x.%x\n", seg_id, PCI_BUS_NUM(e->devid), PCI_SLOT(e->devid), PCI_FUNC(e->devid), @@ -1384,9 +1509,7 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, break; case IVHD_DEV_ALIAS_RANGE: - DUMP_printk(" DEV_ALIAS_RANGE\t\t " - "devid: %04x:%02x:%02x.%x flags: %02x " - "devid_to: %04x:%02x:%02x.%x\n", + DUMP_printk(" DEV_ALIAS_RANGE\t\tdevid: %04x:%02x:%02x.%x flags: %#02x devid_to: %04x:%02x:%02x.%x\n", seg_id, PCI_BUS_NUM(e->devid), PCI_SLOT(e->devid), PCI_FUNC(e->devid), @@ -1403,8 +1526,7 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, break; case IVHD_DEV_EXT_SELECT: - DUMP_printk(" DEV_EXT_SELECT\t\t devid: %04x:%02x:%02x.%x " - "flags: %02x ext: %08x\n", + DUMP_printk(" DEV_EXT_SELECT\t\tdevid: %04x:%02x:%02x.%x flags: %#02x ext: %08x\n", seg_id, PCI_BUS_NUM(e->devid), PCI_SLOT(e->devid), PCI_FUNC(e->devid), @@ -1416,8 +1538,7 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, break; case IVHD_DEV_EXT_SELECT_RANGE: - DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: " - "%04x:%02x:%02x.%x flags: %02x ext: %08x\n", + DUMP_printk(" DEV_EXT_SELECT_RANGE\tdevid: %04x:%02x:%02x.%x flags: %#02x ext: %08x\n", seg_id, PCI_BUS_NUM(e->devid), PCI_SLOT(e->devid), PCI_FUNC(e->devid), @@ -1430,21 +1551,18 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, break; case IVHD_DEV_RANGE_END: - DUMP_printk(" DEV_RANGE_END\t\t devid: %04x:%02x:%02x.%x\n", + DUMP_printk(" DEV_RANGE_END\t\tdevid: %04x:%02x:%02x.%x\n", seg_id, PCI_BUS_NUM(e->devid), PCI_SLOT(e->devid), PCI_FUNC(e->devid)); devid = e->devid; - for (dev_i = devid_start; dev_i <= devid; ++dev_i) { - if (alias) { + if (alias) { + for (dev_i = devid_start; dev_i <= devid; ++dev_i) pci_seg->alias_table[dev_i] = devid_to; - set_dev_entry_from_acpi(iommu, - devid_to, flags, ext_flags); - } - set_dev_entry_from_acpi(iommu, dev_i, - flags, ext_flags); + set_dev_entry_from_acpi(iommu, devid_to, flags, ext_flags); } + set_dev_entry_from_acpi_range(iommu, devid_start, devid, flags, ext_flags); break; case IVHD_DEV_SPECIAL: { 
u8 handle, type; @@ -1463,11 +1581,12 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, else var = "UNKNOWN"; - DUMP_printk(" DEV_SPECIAL(%s[%d])\t\tdevid: %04x:%02x:%02x.%x\n", + DUMP_printk(" DEV_SPECIAL(%s[%d])\t\tdevid: %04x:%02x:%02x.%x, flags: %#02x\n", var, (int)handle, seg_id, PCI_BUS_NUM(devid), PCI_SLOT(devid), - PCI_FUNC(devid)); + PCI_FUNC(devid), + e->flags); ret = add_special_device(type, handle, &devid, false); if (ret) @@ -1527,11 +1646,12 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, } devid = PCI_SEG_DEVID_TO_SBDF(seg_id, e->devid); - DUMP_printk(" DEV_ACPI_HID(%s[%s])\t\tdevid: %04x:%02x:%02x.%x\n", + DUMP_printk(" DEV_ACPI_HID(%s[%s])\t\tdevid: %04x:%02x:%02x.%x, flags: %#02x\n", hid, uid, seg_id, PCI_BUS_NUM(devid), PCI_SLOT(devid), - PCI_FUNC(devid)); + PCI_FUNC(devid), + e->flags); flags = e->flags; @@ -1580,9 +1700,9 @@ static struct amd_iommu_pci_seg *__init alloc_pci_segment(u16 id, pci_seg->last_bdf = last_bdf; DUMP_printk("PCI segment : 0x%0x, last bdf : 0x%04x\n", id, last_bdf); - pci_seg->dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE, last_bdf); - pci_seg->alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE, last_bdf); - pci_seg->rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE, last_bdf); + pci_seg->dev_table_size = + max(roundup_pow_of_two((last_bdf + 1) * DEV_TABLE_ENTRY_SIZE), + SZ_4K); pci_seg->id = id; init_llist_head(&pci_seg->dev_data_list); @@ -1590,13 +1710,22 @@ static struct amd_iommu_pci_seg *__init alloc_pci_segment(u16 id, list_add_tail(&pci_seg->list, &amd_iommu_pci_seg_list); if (alloc_dev_table(pci_seg)) - return NULL; + goto err_free_pci_seg; if (alloc_alias_table(pci_seg)) - return NULL; + goto err_free_dev_table; if (alloc_rlookup_table(pci_seg)) - return NULL; + goto err_free_alias_table; return pci_seg; + +err_free_alias_table: + free_alias_table(pci_seg); +err_free_dev_table: + free_dev_table(pci_seg); +err_free_pci_seg: + list_del(&pci_seg->list); + kfree(pci_seg); + return NULL; } static struct amd_iommu_pci_seg *__init get_pci_segment(u16 id, @@ -1637,9 +1766,7 @@ static void __init free_sysfs(struct amd_iommu *iommu) static void __init free_iommu_one(struct amd_iommu *iommu) { free_sysfs(iommu); - free_cwwb_sem(iommu); - free_command_buffer(iommu); - free_event_buffer(iommu); + free_iommu_buffers(iommu); amd_iommu_free_ppr_log(iommu); free_ga_log(iommu); iommu_unmap_mmio_space(iommu); @@ -1742,9 +1869,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h, return -ENOSYS; } - /* Index is fine - add IOMMU to the array */ - amd_iommus[iommu->index] = iommu; - /* * Copy data from ACPI table entry to the iommu struct */ @@ -1762,13 +1886,8 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h, else iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET; - /* - * Note: GA (128-bit IRTE) mode requires cmpxchg16b supports. - * GAM also requires GA mode. Therefore, we need to - * check cmpxchg16b support before enabling it. - */ - if (!boot_cpu_has(X86_FEATURE_CX16) || - ((h->efr_attr & (0x1 << IOMMU_FEAT_GASUP_SHIFT)) == 0)) + /* GAM requires GA mode. */ + if ((h->efr_attr & (0x1 << IOMMU_FEAT_GASUP_SHIFT)) == 0) amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY; break; case 0x11: @@ -1778,13 +1897,8 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h, else iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET; - /* - * Note: GA (128-bit IRTE) mode requires cmpxchg16b supports. - * XT, GAM also requires GA mode. 
Therefore, we need to - * check cmpxchg16b support before enabling them. - */ - if (!boot_cpu_has(X86_FEATURE_CX16) || - ((h->efr_reg & (0x1 << IOMMU_EFR_GASUP_SHIFT)) == 0)) { + /* XT and GAM require GA mode. */ + if ((h->efr_reg & (0x1 << IOMMU_EFR_GASUP_SHIFT)) == 0) { amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY; break; } @@ -1792,6 +1906,11 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h, if (h->efr_reg & BIT(IOMMU_EFR_XTSUP_SHIFT)) amd_iommu_xt_mode = IRQ_REMAP_X2APIC_MODE; + if (h->efr_attr & BIT(IOMMU_IVHD_ATTR_HATDIS_SHIFT)) { + pr_warn_once("Host Address Translation is not supported.\n"); + amd_iommu_hatdis = true; + } + early_iommu_features_init(iommu, h); break; @@ -1811,14 +1930,9 @@ static int __init init_iommu_one_late(struct amd_iommu *iommu) { int ret; - if (alloc_cwwb_sem(iommu)) - return -ENOMEM; - - if (alloc_command_buffer(iommu)) - return -ENOMEM; - - if (alloc_event_buffer(iommu)) - return -ENOMEM; + ret = alloc_iommu_buffers(iommu); + if (ret) + return ret; iommu->int_enabled = false; @@ -2024,9 +2138,6 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) if (!iommu->dev) return -ENODEV; - /* Prevent binding other PCI device drivers to IOMMU devices */ - iommu->dev->match_driver = false; - /* ACPI _PRT won't have an IRQ for IOMMU */ iommu->dev->irq_managed = 1; @@ -2042,14 +2153,12 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) int glxval; u64 pasmax; - pasmax = amd_iommu_efr & FEATURE_PASID_MASK; - pasmax >>= FEATURE_PASID_SHIFT; + pasmax = FIELD_GET(FEATURE_PASMAX, amd_iommu_efr); iommu->iommu.max_pasids = (1 << (pasmax + 1)) - 1; BUG_ON(iommu->iommu.max_pasids & ~PASID_MASK); - glxval = amd_iommu_efr & FEATURE_GLXVAL_MASK; - glxval >>= FEATURE_GLXVAL_SHIFT; + glxval = FIELD_GET(FEATURE_GLX, amd_iommu_efr); if (amd_iommu_max_glx_val == -1) amd_iommu_max_glx_val = glxval; @@ -2070,14 +2179,6 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) init_iommu_perf_ctr(iommu); - if (amd_iommu_pgtable == AMD_IOMMU_V2) { - if (!check_feature(FEATURE_GIOSUP) || - !check_feature(FEATURE_GT)) { - pr_warn("Cannot enable v2 page table for DMA-API. Fallback to v1.\n"); - amd_iommu_pgtable = AMD_IOMMU_V1; - } - } - if (is_rd890_iommu(iommu->dev)) { int i, j; @@ -2125,7 +2226,15 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) return ret; } - iommu_device_register(&iommu->iommu, &amd_iommu_ops, NULL); + ret = iommu_device_register(&iommu->iommu, &amd_iommu_ops, NULL); + if (ret || amd_iommu_pgtable == PD_MODE_NONE) { + /* + * Remove sysfs if DMA translation is not supported by the + * IOMMU. Do not return an error to enable IRQ remapping + * in state_next(), DTE[V, TV] must eventually be set to 0. 
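The FIELD_GET() conversions above are behavior-preserving: the helper derives the shift from the mask, so the extracted values match the old open-coded shift-and-mask exactly. As a quick check of the max_pasids arithmetic that follows the PASmax extraction (the field value here is hypothetical):

    #include <stdio.h>

    int main(void)
    {
        unsigned int pasmax = 15;   /* hypothetical EFR[PASmax] reading */
        unsigned int max_pasids = (1u << (pasmax + 1)) - 1;

        /* PASmax encodes the PASID width minus one: 16-bit PASIDs here */
        printf("max_pasids = %u\n", max_pasids);    /* 65535 */
        return 0;
    }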
+ */ + iommu_device_sysfs_remove(&iommu->iommu); + } return pci_enable_device(iommu->dev); } @@ -2152,6 +2261,9 @@ static void print_iommu_info(void) if (check_feature(FEATURE_SNP)) pr_cont(" SNP"); + if (check_feature2(FEATURE_SEVSNPIO_SUP)) + pr_cont(" SEV-TIO"); + pr_cont("\n"); } @@ -2160,7 +2272,7 @@ static void print_iommu_info(void) if (amd_iommu_xt_mode == IRQ_REMAP_X2APIC_MODE) pr_info("X2APIC enabled\n"); } - if (amd_iommu_pgtable == AMD_IOMMU_V2) { + if (amd_iommu_pgtable == PD_MODE_V2) { pr_info("V2 page table enabled (Paging mode : %d level)\n", amd_iommu_gpt_level); } @@ -2172,6 +2284,9 @@ static int __init amd_iommu_init_pci(void) struct amd_iommu_pci_seg *pci_seg; int ret; + /* Init global identity domain before registering IOMMU */ + amd_iommu_init_identity_domain(); + for_each_iommu(iommu) { ret = iommu_init_pci(iommu); if (ret) { @@ -2344,7 +2459,7 @@ static struct irq_chip intcapxt_controller = { .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_set_affinity = intcapxt_set_affinity, .irq_set_wake = intcapxt_set_wake, - .flags = IRQCHIP_MASK_ON_SUSPEND, + .flags = IRQCHIP_MASK_ON_SUSPEND | IRQCHIP_MOVE_DEFERRED, }; static const struct irq_domain_ops intcapxt_domain_ops = { @@ -2583,13 +2698,13 @@ static void init_device_table_dma(struct amd_iommu_pci_seg *pci_seg) u32 devid; struct dev_table_entry *dev_table = pci_seg->dev_table; - if (dev_table == NULL) + if (!dev_table || amd_iommu_pgtable == PD_MODE_NONE) return; for (devid = 0; devid <= pci_seg->last_bdf; ++devid) { - __set_dev_entry_bit(dev_table, devid, DEV_ENTRY_VALID); + set_dte_bit(&dev_table[devid], DEV_ENTRY_VALID); if (!amd_iommu_snp_en) - __set_dev_entry_bit(dev_table, devid, DEV_ENTRY_TRANSLATION); + set_dte_bit(&dev_table[devid], DEV_ENTRY_TRANSLATION); } } @@ -2617,8 +2732,7 @@ static void init_device_table(void) for_each_pci_segment(pci_seg) { for (devid = 0; devid <= pci_seg->last_bdf; ++devid) - __set_dev_entry_bit(pci_seg->dev_table, - devid, DEV_ENTRY_IRQ_TBL_EN); + set_dte_bit(&pci_seg->dev_table[devid], DEV_ENTRY_IRQ_TBL_EN); } } @@ -2646,7 +2760,11 @@ static void iommu_init_flags(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_COHERENT_EN); /* Set IOTLB invalidation timeout to 1s */ - iommu_set_inv_tlb_timeout(iommu, CTRL_INV_TO_1S); + iommu_feature_set(iommu, CTRL_INV_TO_1S, CTRL_INV_TO_MASK, CONTROL_INV_TIMEOUT); + + /* Enable Enhanced Peripheral Page Request Handling */ + if (check_feature(FEATURE_EPHSUP)) + iommu_feature_enable(iommu, CONTROL_EPH_EN); } static void iommu_apply_resume_quirks(struct amd_iommu *iommu) @@ -2735,6 +2853,17 @@ static void iommu_enable_irtcachedis(struct amd_iommu *iommu) iommu->irtcachedis_enabled ? "disabled" : "enabled"); } +static void iommu_enable_2k_int(struct amd_iommu *iommu) +{ + if (!FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2)) + return; + + iommu_feature_set(iommu, + CONTROL_NUM_INT_REMAP_MODE_2K, + CONTROL_NUM_INT_REMAP_MODE_MASK, + CONTROL_NUM_INT_REMAP_MODE); +} + static void early_enable_iommu(struct amd_iommu *iommu) { iommu_disable(iommu); @@ -2747,6 +2876,7 @@ static void early_enable_iommu(struct amd_iommu *iommu) iommu_enable_ga(iommu); iommu_enable_xt(iommu); iommu_enable_irtcachedis(iommu); + iommu_enable_2k_int(iommu); iommu_enable(iommu); amd_iommu_flush_all_caches(iommu); } @@ -2755,8 +2885,8 @@ static void early_enable_iommu(struct amd_iommu *iommu) * This function finally enables all IOMMUs found in the system after * they have been initialized. 
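Both the invalidation-timeout setup and iommu_enable_2k_int() above now go through iommu_feature_set(), whose body is outside this hunk. Judging from the call sites, it is a read-modify-write of one field in the control register; a sketch under that assumption (MMIO_CONTROL_OFFSET and the 64-bit accessors are assumed, not confirmed by this diff):

    /* Sketch only: the real iommu_feature_set() body is not shown here. */
    static void iommu_feature_set(struct amd_iommu *iommu, u64 val, u64 mask,
                                  u8 shift)
    {
        u64 ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET);

        /* Replace only the field at 'shift'; leave all other bits intact. */
        ctrl &= ~(mask << shift);
        ctrl |= (val << shift) & (mask << shift);

        writeq(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
    }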
* - * Or if in kdump kernel and IOMMUs are all pre-enabled, try to copy - * the old content of device table entries. Not this case or copy failed, + * Or if in kdump kernel and IOMMUs are all pre-enabled, try to reuse + * the old content of device table entries. If that is not the case or reuse fails, * just continue as normal kernel does. */ static void early_enable_iommus(void) @@ -2764,19 +2894,25 @@ struct amd_iommu *iommu; struct amd_iommu_pci_seg *pci_seg; - if (!copy_device_table()) { + if (!reuse_device_table()) { /* - * If come here because of failure in copying device table from old + * If we come here because of a failure in reusing the device table from the old * kernel with all IOMMUs enabled, print error message and try to * free allocated old_dev_tbl_cpy. */ - if (amd_iommu_pre_enabled) - pr_err("Failed to copy DEV table from previous kernel.\n"); + if (amd_iommu_pre_enabled) { + pr_err("Failed to reuse DEV table from previous kernel.\n"); + /* + * Bail out early if the DEV table from the previous kernel + * cannot be remapped/reused when SNP is enabled, as IOMMU + * commands will time out without a DEV table and panic the + * kdump boot. + */ + BUG_ON(check_feature(FEATURE_SNP)); + } for_each_pci_segment(pci_seg) { if (pci_seg->old_dev_tbl_cpy != NULL) { - iommu_free_pages(pci_seg->old_dev_tbl_cpy, - get_order(pci_seg->dev_table_size)); + memunmap((void *)pci_seg->old_dev_tbl_cpy); pci_seg->old_dev_tbl_cpy = NULL; } } @@ -2786,11 +2922,10 @@ early_enable_iommu(iommu); } } else { - pr_info("Copied DEV table from previous kernel.\n"); + pr_info("Reused DEV table from previous kernel.\n"); for_each_pci_segment(pci_seg) { - iommu_free_pages(pci_seg->dev_table, - get_order(pci_seg->dev_table_size)); + iommu_free_pages(pci_seg->dev_table); pci_seg->dev_table = pci_seg->old_dev_tbl_cpy; } @@ -2803,6 +2938,7 @@ iommu_enable_ga(iommu); iommu_enable_xt(iommu); iommu_enable_irtcachedis(iommu); + iommu_enable_2k_int(iommu); iommu_set_device_table(iommu); amd_iommu_flush_all_caches(iommu); } @@ -2882,11 +3018,6 @@ static void enable_iommus_vapic(void) #endif } -static void enable_iommus(void) -{ - early_enable_iommus(); -} - static void disable_iommus(void) { struct amd_iommu *iommu; @@ -2905,7 +3036,7 @@ * disable suspend until real resume implemented */ -static void amd_iommu_resume(void) +static void amd_iommu_resume(void *data) { struct amd_iommu *iommu; @@ -2913,12 +3044,13 @@ iommu_apply_resume_quirks(iommu); /* re-load the hardware */ - enable_iommus(); + for_each_iommu(iommu) + early_enable_iommu(iommu); amd_iommu_enable_interrupts(); } -static int amd_iommu_suspend(void) +static int amd_iommu_suspend(void *data) { /* disable IOMMUs to go out of the way for BIOS */ disable_iommus(); @@ -2926,16 +3058,17 @@ return 0; } -static struct syscore_ops amd_iommu_syscore_ops = { +static const struct syscore_ops amd_iommu_syscore_ops = { .suspend = amd_iommu_suspend, .resume = amd_iommu_resume, }; +static struct syscore amd_iommu_syscore = { + .ops = &amd_iommu_syscore_ops, +}; + static void __init free_iommu_resources(void) { - kmem_cache_destroy(amd_iommu_irq_cache); - amd_iommu_irq_cache = NULL; - free_iommu_all(); free_pci_segments(); } @@ -2994,9 +3127,9 @@ static bool __init check_ioapic_information(void) static void __init free_dma_resources(void) { - iommu_free_pages(amd_iommu_pd_alloc_bitmap, -
get_order(MAX_DOMAIN_ID / 8)); - amd_iommu_pd_alloc_bitmap = NULL; + ida_destroy(&pdom_ids); free_unity_maps(); } @@ -3036,8 +3167,9 @@ static void __init ivinfo_init(void *ivrs) static int __init early_amd_iommu_init(void) { struct acpi_table_header *ivrs_base; - int remap_cache_sz, ret; + int ret; acpi_status status; + u8 efr_hats; if (!amd_iommu_detected) return -ENODEV; @@ -3051,6 +3183,12 @@ static int __init early_amd_iommu_init(void) return -EINVAL; } + if (!boot_cpu_has(X86_FEATURE_CX16)) { + pr_err("Failed to initialize. The CMPXCHG16B feature is required.\n"); + ret = -EINVAL; + goto out; + } + /* * Validate checksum here so we don't need to do it when * we actually parse the table @@ -3064,20 +3202,6 @@ static int __init early_amd_iommu_init(void) amd_iommu_target_ivhd_type = get_highest_supported_ivhd_type(ivrs_base); DUMP_printk("Using IVHD type %#x\n", amd_iommu_target_ivhd_type); - /* Device table - directly used by all IOMMUs */ - ret = -ENOMEM; - - amd_iommu_pd_alloc_bitmap = iommu_alloc_pages(GFP_KERNEL, - get_order(MAX_DOMAIN_ID / 8)); - if (amd_iommu_pd_alloc_bitmap == NULL) - goto out; - - /* - * never allocate domain 0 because its used as the non-allocated and - * error value placeholder - */ - __set_bit(0, amd_iommu_pd_alloc_bitmap); - /* * now the data structures are allocated and basically initialized * start the real acpi table scan @@ -3088,9 +3212,40 @@ static int __init early_amd_iommu_init(void) /* 5 level guest page table */ if (cpu_feature_enabled(X86_FEATURE_LA57) && - check_feature_gpt_level() == GUEST_PGTABLE_5_LEVEL) + FIELD_GET(FEATURE_GATS, amd_iommu_efr) == GUEST_PGTABLE_5_LEVEL) amd_iommu_gpt_level = PAGE_MODE_5_LEVEL; + efr_hats = FIELD_GET(FEATURE_HATS, amd_iommu_efr); + if (efr_hats != 0x3) { + /* + * efr[HATS] bits specify the maximum host translation level + * supported, with LEVEL 4 being initial max level. + */ + amd_iommu_hpt_level = efr_hats + PAGE_MODE_4_LEVEL; + } else { + pr_warn_once(FW_BUG "Disable host address translation due to invalid translation level (%#x).\n", + efr_hats); + amd_iommu_hatdis = true; + } + + if (amd_iommu_pgtable == PD_MODE_V2) { + if (!amd_iommu_v2_pgtbl_supported()) { + pr_warn("Cannot enable v2 page table for DMA-API. Fallback to v1.\n"); + amd_iommu_pgtable = PD_MODE_V1; + } + } + + if (amd_iommu_hatdis) { + /* + * Host (v1) page table is not available. Attempt to use + * Guest (v2) page table. + */ + if (amd_iommu_v2_pgtbl_supported()) + amd_iommu_pgtable = PD_MODE_V2; + else + amd_iommu_pgtable = PD_MODE_NONE; + } + /* Disable any previously enabled IOMMUs */ if (!is_kdump_kernel() || amd_iommu_disabled) disable_iommus(); @@ -3100,22 +3255,7 @@ static int __init early_amd_iommu_init(void) if (amd_iommu_irq_remap) { struct amd_iommu_pci_seg *pci_seg; - /* - * Interrupt remapping enabled, create kmem_cache for the - * remapping tables. 
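The EFR[HATS] decode above leans on the driver's page-mode encoding, in which PAGE_MODE_n_LEVEL carries the numeric value n, so adding the raw field to PAGE_MODE_4_LEVEL yields the deepest supported host walk. A standalone check of the mapping (0x3 is the invalid/reserved encoding handled by the else branch):

    #include <stdio.h>

    #define PAGE_MODE_4_LEVEL 4 /* assumes PAGE_MODE_n_LEVEL == n, as in the driver */

    int main(void)
    {
        for (unsigned int hats = 0; hats <= 3; hats++) {
            if (hats != 0x3)
                printf("EFR[HATS]=%u -> %u-level host page table\n",
                       hats, hats + PAGE_MODE_4_LEVEL);
            else
                printf("EFR[HATS]=3 -> invalid, host translation disabled\n");
        }
        return 0;
    }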
- */ ret = -ENOMEM; - if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) - remap_cache_sz = MAX_IRQS_PER_TABLE * sizeof(u32); - else - remap_cache_sz = MAX_IRQS_PER_TABLE * (sizeof(u64) * 2); - amd_iommu_irq_cache = kmem_cache_create("irq_remap_cache", - remap_cache_sz, - DTE_INTTAB_ALIGNMENT, - 0, NULL); - if (!amd_iommu_irq_cache) - goto out; - for_each_pci_segment(pci_seg) { if (alloc_irq_lookup_table(pci_seg)) goto out; @@ -3196,7 +3336,7 @@ out: return true; } -static void iommu_snp_enable(void) +static __init void iommu_snp_enable(void) { #ifdef CONFIG_KVM_AMD_SEV if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return; @@ -3210,7 +3350,7 @@ goto disable_snp; } - if (amd_iommu_pgtable != AMD_IOMMU_V1) { + if (amd_iommu_pgtable != PD_MODE_V1) { pr_warn("SNP: IOMMU is configured with V2 page table mode, SNP cannot be supported.\n"); goto disable_snp; } @@ -3221,6 +3361,14 @@ goto disable_snp; } + /* + * Enable host SNP support once SNP support is checked on IOMMU. + */ + if (snp_rmptable_init()) { + pr_warn("SNP: RMP initialization failed, SNP cannot be supported.\n"); + goto disable_snp; + } + pr_info("IOMMU SNP support enabled.\n"); return; @@ -3263,7 +3411,7 @@ static int __init state_next(void) init_state = IOMMU_ENABLED; break; case IOMMU_ENABLED: - register_syscore_ops(&amd_iommu_syscore_ops); + register_syscore(&amd_iommu_syscore); iommu_snp_enable(); ret = amd_iommu_init_pci(); init_state = ret ? IOMMU_INIT_ERROR : IOMMU_PCI_INIT; @@ -3320,6 +3468,19 @@ static int __init iommu_go_to_state(enum iommu_init_state state) ret = state_next(); } + /* + * SNP platform initialization requires IOMMUs to be fully configured. + * If the SNP support on IOMMUs has NOT been checked, simply mark SNP + * as unsupported. If the SNP support on IOMMUs has been checked and + * host SNP support enabled but RMP enforcement has not been enabled + * in IOMMUs, then the system is in a half-baked state, but can limp + * along as all memory should be Hypervisor-Owned in the RMP. WARN, + * but leave SNP as "supported" to avoid confusing the kernel.
+ */ + if (ret && cc_platform_has(CC_ATTR_HOST_SEV_SNP) && + !WARN_ON_ONCE(amd_iommu_snp_en)) + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); + return ret; } @@ -3353,12 +3514,12 @@ int __init amd_iommu_enable(void) void amd_iommu_disable(void) { - amd_iommu_suspend(); + amd_iommu_suspend(NULL); } int amd_iommu_reenable(int mode) { - amd_iommu_resume(); + amd_iommu_resume(NULL); return 0; } @@ -3377,7 +3538,6 @@ int amd_iommu_enable_faulting(unsigned int cpu) */ static int __init amd_iommu_init(void) { - struct amd_iommu *iommu; int ret; ret = iommu_go_to_state(IOMMU_INITIALIZED); @@ -3391,8 +3551,8 @@ static int __init amd_iommu_init(void) } #endif - for_each_iommu(iommu) - amd_iommu_debugfs_setup(iommu); + if (!ret) + amd_iommu_debugfs_setup(); return ret; } @@ -3423,25 +3583,28 @@ static bool amd_iommu_sme_check(void) * IOMMUs * ****************************************************************************/ -int __init amd_iommu_detect(void) +void __init amd_iommu_detect(void) { int ret; if (no_iommu || (iommu_detected && !gart_iommu_aperture)) - return -ENODEV; + goto disable_snp; if (!amd_iommu_sme_check()) - return -ENODEV; + goto disable_snp; ret = iommu_go_to_state(IOMMU_IVRS_DETECTED); if (ret) - return ret; + goto disable_snp; amd_iommu_detected = true; iommu_detected = 1; x86_init.iommu.iommu_init = amd_iommu_init; + return; - return 1; +disable_snp: + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); } /**************************************************************************** @@ -3489,11 +3652,17 @@ static int __init parse_amd_iommu_options(char *str) } else if (strncmp(str, "force_isolation", 15) == 0) { amd_iommu_force_isolation = true; } else if (strncmp(str, "pgtbl_v1", 8) == 0) { - amd_iommu_pgtable = AMD_IOMMU_V1; + amd_iommu_pgtable = PD_MODE_V1; } else if (strncmp(str, "pgtbl_v2", 8) == 0) { - amd_iommu_pgtable = AMD_IOMMU_V2; + amd_iommu_pgtable = PD_MODE_V2; } else if (strncmp(str, "irtcachedis", 11) == 0) { amd_iommu_irtcachedis = true; + } else if (strncmp(str, "nohugepages", 11) == 0) { + pr_info("Restricting V1 page-sizes to 4KiB"); + amd_iommu_pgsize_bitmap = AMD_IOMMU_PGSIZES_4K; + } else if (strncmp(str, "v2_pgsizes_only", 15) == 0) { + pr_info("Restricting V1 page-sizes to 4KiB/2MiB/1GiB"); + amd_iommu_pgsize_bitmap = AMD_IOMMU_PGSIZES_V2; } else { pr_notice("Unknown option - '%s'\n", str); } @@ -3588,7 +3757,7 @@ static int __init parse_ivrs_acpihid(char *str) { u32 seg = 0, bus, dev, fn; char *hid, *uid, *p, *addr; - char acpiid[ACPIID_LEN] = {0}; + char acpiid[ACPIID_LEN + 1] = { }; /* size with NULL terminator */ int i; addr = strchr(str, '@'); @@ -3614,7 +3783,7 @@ static int __init parse_ivrs_acpihid(char *str) /* We have the '@', make it the terminator to get just the acpiid */ *addr++ = 0; - if (strlen(str) > ACPIID_LEN + 1) + if (strlen(str) > ACPIID_LEN) goto not_found; if (sscanf(str, "=%s", acpiid) != 1) @@ -3645,6 +3814,14 @@ found: while (*uid == '0' && *(uid + 1)) uid++; + if (strlen(hid) >= ACPIHID_HID_LEN) { + pr_err("Invalid command line: hid is too long\n"); + return 1; + } else if (strlen(uid) >= ACPIHID_UID_LEN) { + pr_err("Invalid command line: uid is too long\n"); + return 1; + } + i = early_acpihid_map_size++; memcpy(early_acpihid_map[i].hid, hid, strlen(hid)); memcpy(early_acpihid_map[i].uid, uid, strlen(uid)); @@ -3854,4 +4031,10 @@ int amd_iommu_snp_disable(void) return 0; } EXPORT_SYMBOL_GPL(amd_iommu_snp_disable); + +bool amd_iommu_sev_tio_supported(void) +{ + return 
check_feature2(FEATURE_SEVSNPIO_SUP); +} +EXPORT_SYMBOL_GPL(amd_iommu_sev_tio_supported); #endif diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c deleted file mode 100644 index 9d9a7fde59e7..000000000000 --- a/drivers/iommu/amd/io_pgtable.c +++ /dev/null @@ -1,607 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * CPU-agnostic AMD IO page table allocator. - * - * Copyright (C) 2020 Advanced Micro Devices, Inc. - * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> - */ - -#define pr_fmt(fmt) "AMD-Vi: " fmt -#define dev_fmt(fmt) pr_fmt(fmt) - -#include <linux/atomic.h> -#include <linux/bitops.h> -#include <linux/io-pgtable.h> -#include <linux/kernel.h> -#include <linux/sizes.h> -#include <linux/slab.h> -#include <linux/types.h> -#include <linux/dma-mapping.h> - -#include <asm/barrier.h> - -#include "amd_iommu_types.h" -#include "amd_iommu.h" -#include "../iommu-pages.h" - -static void v1_tlb_flush_all(void *cookie) -{ -} - -static void v1_tlb_flush_walk(unsigned long iova, size_t size, - size_t granule, void *cookie) -{ -} - -static void v1_tlb_add_page(struct iommu_iotlb_gather *gather, - unsigned long iova, size_t granule, - void *cookie) -{ -} - -static const struct iommu_flush_ops v1_flush_ops = { - .tlb_flush_all = v1_tlb_flush_all, - .tlb_flush_walk = v1_tlb_flush_walk, - .tlb_add_page = v1_tlb_add_page, -}; - -/* - * Helper function to get the first pte of a large mapping - */ -static u64 *first_pte_l7(u64 *pte, unsigned long *page_size, - unsigned long *count) -{ - unsigned long pte_mask, pg_size, cnt; - u64 *fpte; - - pg_size = PTE_PAGE_SIZE(*pte); - cnt = PAGE_SIZE_PTE_COUNT(pg_size); - pte_mask = ~((cnt << 3) - 1); - fpte = (u64 *)(((unsigned long)pte) & pte_mask); - - if (page_size) - *page_size = pg_size; - - if (count) - *count = cnt; - - return fpte; -} - -/**************************************************************************** - * - * The functions below are used the create the page table mappings for - * unity mapped regions. - * - ****************************************************************************/ - -static void free_pt_page(u64 *pt, struct list_head *freelist) -{ - struct page *p = virt_to_page(pt); - - list_add_tail(&p->lru, freelist); -} - -static void free_pt_lvl(u64 *pt, struct list_head *freelist, int lvl) -{ - u64 *p; - int i; - - for (i = 0; i < 512; ++i) { - /* PTE present? */ - if (!IOMMU_PTE_PRESENT(pt[i])) - continue; - - /* Large PTE? */ - if (PM_PTE_LEVEL(pt[i]) == 0 || - PM_PTE_LEVEL(pt[i]) == 7) - continue; - - /* - * Free the next level. No need to look at l1 tables here since - * they can only contain leaf PTEs; just free them directly. 
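first_pte_l7() in the deleted v1 allocator recovers the first of a group of replicated PTEs by pure address arithmetic: a mapping replicated across cnt eight-byte entries always starts on a (cnt * 8)-byte boundary, so masking the PTE address down suffices. Worked through for a 32 KiB mapping (the PTE address is made up):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        unsigned long pg_size = 32 * 1024;
        unsigned long cnt = pg_size / 4096;             /* 8 replicated 4 KiB PTEs */
        uintptr_t pte = 0xffff888012345678ULL;          /* hypothetical PTE address */
        uintptr_t mask = ~(((uintptr_t)cnt << 3) - 1);  /* align down to cnt * 8 bytes */

        /* prints cnt=8 first_pte=0xffff888012345640 */
        printf("cnt=%lu first_pte=%#lx\n", cnt, (unsigned long)(pte & mask));
        return 0;
    }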
- */ - p = IOMMU_PTE_PAGE(pt[i]); - if (lvl > 2) - free_pt_lvl(p, freelist, lvl - 1); - else - free_pt_page(p, freelist); - } - - free_pt_page(pt, freelist); -} - -static void free_sub_pt(u64 *root, int mode, struct list_head *freelist) -{ - switch (mode) { - case PAGE_MODE_NONE: - case PAGE_MODE_7_LEVEL: - break; - case PAGE_MODE_1_LEVEL: - free_pt_page(root, freelist); - break; - case PAGE_MODE_2_LEVEL: - case PAGE_MODE_3_LEVEL: - case PAGE_MODE_4_LEVEL: - case PAGE_MODE_5_LEVEL: - case PAGE_MODE_6_LEVEL: - free_pt_lvl(root, freelist, mode); - break; - default: - BUG(); - } -} - -void amd_iommu_domain_set_pgtable(struct protection_domain *domain, - u64 *root, int mode) -{ - u64 pt_root; - - /* lowest 3 bits encode pgtable mode */ - pt_root = mode & 7; - pt_root |= (u64)root; - - amd_iommu_domain_set_pt_root(domain, pt_root); -} - -/* - * This function is used to add another level to an IO page table. Adding - * another level increases the size of the address space by 9 bits to a size up - * to 64 bits. - */ -static bool increase_address_space(struct protection_domain *domain, - unsigned long address, - gfp_t gfp) -{ - unsigned long flags; - bool ret = true; - u64 *pte; - - pte = iommu_alloc_page_node(domain->nid, gfp); - if (!pte) - return false; - - spin_lock_irqsave(&domain->lock, flags); - - if (address <= PM_LEVEL_SIZE(domain->iop.mode)) - goto out; - - ret = false; - if (WARN_ON_ONCE(domain->iop.mode == PAGE_MODE_6_LEVEL)) - goto out; - - *pte = PM_LEVEL_PDE(domain->iop.mode, iommu_virt_to_phys(domain->iop.root)); - - domain->iop.root = pte; - domain->iop.mode += 1; - amd_iommu_update_and_flush_device_table(domain); - amd_iommu_domain_flush_complete(domain); - - /* - * Device Table needs to be updated and flushed before the new root can - * be published. - */ - amd_iommu_domain_set_pgtable(domain, pte, domain->iop.mode); - - pte = NULL; - ret = true; - -out: - spin_unlock_irqrestore(&domain->lock, flags); - iommu_free_page(pte); - - return ret; -} - -static u64 *alloc_pte(struct protection_domain *domain, - unsigned long address, - unsigned long page_size, - u64 **pte_page, - gfp_t gfp, - bool *updated) -{ - int level, end_lvl; - u64 *pte, *page; - - BUG_ON(!is_power_of_2(page_size)); - - while (address > PM_LEVEL_SIZE(domain->iop.mode)) { - /* - * Return an error if there is no memory to update the - * page-table. - */ - if (!increase_address_space(domain, address, gfp)) - return NULL; - } - - - level = domain->iop.mode - 1; - pte = &domain->iop.root[PM_LEVEL_INDEX(level, address)]; - address = PAGE_SIZE_ALIGN(address, page_size); - end_lvl = PAGE_SIZE_LEVEL(page_size); - - while (level > end_lvl) { - u64 __pte, __npte; - int pte_level; - - __pte = *pte; - pte_level = PM_PTE_LEVEL(__pte); - - /* - * If we replace a series of large PTEs, we need - * to tear down all of them. - */ - if (IOMMU_PTE_PRESENT(__pte) && - pte_level == PAGE_MODE_7_LEVEL) { - unsigned long count, i; - u64 *lpte; - - lpte = first_pte_l7(pte, NULL, &count); - - /* - * Unmap the replicated PTEs that still match the - * original large mapping - */ - for (i = 0; i < count; ++i) - cmpxchg64(&lpte[i], __pte, 0ULL); - - *updated = true; - continue; - } - - if (!IOMMU_PTE_PRESENT(__pte) || - pte_level == PAGE_MODE_NONE) { - page = iommu_alloc_page_node(domain->nid, gfp); - - if (!page) - return NULL; - - __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page)); - - /* pte could have been changed somewhere. 
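alloc_pte() publishes a newly allocated table level with a single compare-and-swap, so two mappers racing on the same slot cannot both install a page: the loser frees its copy and carries on with the winner's. The same publish-or-free pattern in miniature, as plain C11 (names are illustrative only):

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Publish newp into *slot only if the slot is still empty; on failure
     * another thread won the race, so free our copy and use the winner's. */
    static void *install_once(_Atomic(void *) *slot, void *newp)
    {
        void *expected = NULL;

        if (atomic_compare_exchange_strong(slot, &expected, newp))
            return newp;        /* we published it */

        free(newp);             /* lost the race */
        return expected;        /* value the winner installed */
    }

    int main(void)
    {
        _Atomic(void *) slot = NULL;
        void *table = install_once(&slot, malloc(4096));

        printf("table at %p\n", table);
        free(table);
        return 0;
    }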
*/ - if (!try_cmpxchg64(pte, &__pte, __npte)) - iommu_free_page(page); - else if (IOMMU_PTE_PRESENT(__pte)) - *updated = true; - - continue; - } - - /* No level skipping support yet */ - if (pte_level != level) - return NULL; - - level -= 1; - - pte = IOMMU_PTE_PAGE(__pte); - - if (pte_page && level == end_lvl) - *pte_page = pte; - - pte = &pte[PM_LEVEL_INDEX(level, address)]; - } - - return pte; -} - -/* - * This function checks if there is a PTE for a given dma address. If - * there is one, it returns the pointer to it. - */ -static u64 *fetch_pte(struct amd_io_pgtable *pgtable, - unsigned long address, - unsigned long *page_size) -{ - int level; - u64 *pte; - - *page_size = 0; - - if (address > PM_LEVEL_SIZE(pgtable->mode)) - return NULL; - - level = pgtable->mode - 1; - pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; - *page_size = PTE_LEVEL_PAGE_SIZE(level); - - while (level > 0) { - - /* Not Present */ - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; - - /* Large PTE */ - if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL || - PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE) - break; - - /* No level skipping support yet */ - if (PM_PTE_LEVEL(*pte) != level) - return NULL; - - level -= 1; - - /* Walk to the next level */ - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[PM_LEVEL_INDEX(level, address)]; - *page_size = PTE_LEVEL_PAGE_SIZE(level); - } - - /* - * If we have a series of large PTEs, make - * sure to return a pointer to the first one. - */ - if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL) - pte = first_pte_l7(pte, page_size, NULL); - - return pte; -} - -static void free_clear_pte(u64 *pte, u64 pteval, struct list_head *freelist) -{ - u64 *pt; - int mode; - - while (!try_cmpxchg64(pte, &pteval, 0)) - pr_warn("AMD-Vi: IOMMU pte changed since we read it\n"); - - if (!IOMMU_PTE_PRESENT(pteval)) - return; - - pt = IOMMU_PTE_PAGE(pteval); - mode = IOMMU_PTE_MODE(pteval); - - free_sub_pt(pt, mode, freelist); -} - -/* - * Generic mapping functions. It maps a physical address into a DMA - * address space. It allocates the page table pages if necessary. - * In the future it can be extended to a generic mapping function - * supporting all features of AMD IOMMU page tables like level skipping - * and full 64 bit address spaces. 
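fetch_pte() walks the v1 table by slicing nine IOVA bits per level above the twelve-bit page offset, which is all PM_LEVEL_INDEX() does. A standalone check of the index extraction, with an arbitrary address:

    #include <stdio.h>

    /* Mirrors the driver's PM_LEVEL_INDEX(): 9 translation bits per level. */
    #define PM_LEVEL_INDEX(lvl, addr) (((addr) >> (12 + 9 * (lvl))) & 0x1ffUL)

    int main(void)
    {
        unsigned long iova = 0x4512345000UL;    /* hypothetical IOVA */

        for (int lvl = 2; lvl >= 0; lvl--)
            printf("level %d index: %#lx\n", lvl, PM_LEVEL_INDEX(lvl, iova));
        return 0;
    }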
- */ -static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova, - phys_addr_t paddr, size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped) -{ - struct protection_domain *dom = io_pgtable_ops_to_domain(ops); - LIST_HEAD(freelist); - bool updated = false; - u64 __pte, *pte; - int ret, i, count; - size_t size = pgcount << __ffs(pgsize); - unsigned long o_iova = iova; - - BUG_ON(!IS_ALIGNED(iova, pgsize)); - BUG_ON(!IS_ALIGNED(paddr, pgsize)); - - ret = -EINVAL; - if (!(prot & IOMMU_PROT_MASK)) - goto out; - - while (pgcount > 0) { - count = PAGE_SIZE_PTE_COUNT(pgsize); - pte = alloc_pte(dom, iova, pgsize, NULL, gfp, &updated); - - ret = -ENOMEM; - if (!pte) - goto out; - - for (i = 0; i < count; ++i) - free_clear_pte(&pte[i], pte[i], &freelist); - - if (!list_empty(&freelist)) - updated = true; - - if (count > 1) { - __pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize); - __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC; - } else - __pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC; - - if (prot & IOMMU_PROT_IR) - __pte |= IOMMU_PTE_IR; - if (prot & IOMMU_PROT_IW) - __pte |= IOMMU_PTE_IW; - - for (i = 0; i < count; ++i) - pte[i] = __pte; - - iova += pgsize; - paddr += pgsize; - pgcount--; - if (mapped) - *mapped += pgsize; - } - - ret = 0; - -out: - if (updated) { - unsigned long flags; - - spin_lock_irqsave(&dom->lock, flags); - /* - * Flush domain TLB(s) and wait for completion. Any Device-Table - * Updates and flushing already happened in - * increase_address_space(). - */ - amd_iommu_domain_flush_pages(dom, o_iova, size); - spin_unlock_irqrestore(&dom->lock, flags); - } - - /* Everything flushed out, free pages now */ - iommu_put_pages_list(&freelist); - - return ret; -} - -static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops, - unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long long unmapped; - unsigned long unmap_size; - u64 *pte; - size_t size = pgcount << __ffs(pgsize); - - BUG_ON(!is_power_of_2(pgsize)); - - unmapped = 0; - - while (unmapped < size) { - pte = fetch_pte(pgtable, iova, &unmap_size); - if (pte) { - int i, count; - - count = PAGE_SIZE_PTE_COUNT(unmap_size); - for (i = 0; i < count; i++) - pte[i] = 0ULL; - } else { - return unmapped; - } - - iova = (iova & ~(unmap_size - 1)) + unmap_size; - unmapped += unmap_size; - } - - return unmapped; -} - -static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long offset_mask, pte_pgsize; - u64 *pte, __pte; - - pte = fetch_pte(pgtable, iova, &pte_pgsize); - - if (!pte || !IOMMU_PTE_PRESENT(*pte)) - return 0; - - offset_mask = pte_pgsize - 1; - __pte = __sme_clr(*pte & PM_ADDR_MASK); - - return (__pte & ~offset_mask) | (iova & offset_mask); -} - -static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size, - unsigned long flags) -{ - bool test_only = flags & IOMMU_DIRTY_NO_CLEAR; - bool dirty = false; - int i, count; - - /* - * 2.2.3.2 Host Dirty Support - * When a non-default page size is used , software must OR the - * Dirty bits in all of the replicated host PTEs used to map - * the page. The IOMMU does not guarantee the Dirty bits are - * set in all of the replicated PTEs. Any portion of the page - * may have been written even if the Dirty bit is set in only - * one of the replicated PTEs. 
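iommu_v1_iova_to_phys() composes its result from the PTE's frame number plus the IOVA's in-page offset, and the offset width falls directly out of the page size recorded in the PTE. For a 2 MiB leaf (all addresses hypothetical):

    #include <stdio.h>

    int main(void)
    {
        unsigned long pte_pgsize = 2UL << 20;           /* 2 MiB leaf */
        unsigned long offset_mask = pte_pgsize - 1;     /* iova bits [20:0] */
        unsigned long pte_phys = 0x80000000UL;          /* 2 MiB aligned frame */
        unsigned long iova = 0x40123456UL;

        /* prints phys = 0x80123456 */
        printf("phys = %#lx\n",
               (pte_phys & ~offset_mask) | (iova & offset_mask));
        return 0;
    }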
- */ - count = PAGE_SIZE_PTE_COUNT(size); - for (i = 0; i < count && test_only; i++) { - if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) { - dirty = true; - break; - } - } - - for (i = 0; i < count && !test_only; i++) { - if (test_and_clear_bit(IOMMU_PTE_HD_BIT, - (unsigned long *)&ptep[i])) { - dirty = true; - } - } - - return dirty; -} - -static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops, - unsigned long iova, size_t size, - unsigned long flags, - struct iommu_dirty_bitmap *dirty) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long end = iova + size - 1; - - do { - unsigned long pgsize = 0; - u64 *ptep, pte; - - ptep = fetch_pte(pgtable, iova, &pgsize); - if (ptep) - pte = READ_ONCE(*ptep); - if (!ptep || !IOMMU_PTE_PRESENT(pte)) { - pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0); - iova += pgsize; - continue; - } - - /* - * Mark the whole IOVA range as dirty even if only one of - * the replicated PTEs were marked dirty. - */ - if (pte_test_and_clear_dirty(ptep, pgsize, flags)) - iommu_dirty_bitmap_record(dirty, iova, pgsize); - iova += pgsize; - } while (iova < end); - - return 0; -} - -/* - * ---------------------------------------------------- - */ -static void v1_free_pgtable(struct io_pgtable *iop) -{ - struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, iop); - struct protection_domain *dom; - LIST_HEAD(freelist); - - if (pgtable->mode == PAGE_MODE_NONE) - return; - - dom = container_of(pgtable, struct protection_domain, iop); - - /* Page-table is not visible to IOMMU anymore, so free it */ - BUG_ON(pgtable->mode < PAGE_MODE_NONE || - pgtable->mode > PAGE_MODE_6_LEVEL); - - free_sub_pt(pgtable->root, pgtable->mode, &freelist); - - /* Update data structure */ - amd_iommu_domain_clr_pt_root(dom); - - /* Make changes visible to IOMMUs */ - amd_iommu_domain_update(dom); - - iommu_put_pages_list(&freelist); -} - -static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) -{ - struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); - - cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES, - cfg->ias = IOMMU_IN_ADDR_BIT_SIZE, - cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE, - cfg->tlb = &v1_flush_ops; - - pgtable->iop.ops.map_pages = iommu_v1_map_pages; - pgtable->iop.ops.unmap_pages = iommu_v1_unmap_pages; - pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys; - pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty; - - return &pgtable->iop; -} - -struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = { - .alloc = v1_alloc_pgtable, - .free = v1_free_pgtable, -}; diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c deleted file mode 100644 index 78ac37c5ccc1..000000000000 --- a/drivers/iommu/amd/io_pgtable_v2.c +++ /dev/null @@ -1,387 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * CPU-agnostic AMD IO page table v2 allocator. - * - * Copyright (C) 2022, 2023 Advanced Micro Devices, Inc. 
- * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> - * Author: Vasant Hegde <vasant.hegde@amd.com> - */ - -#define pr_fmt(fmt) "AMD-Vi: " fmt -#define dev_fmt(fmt) pr_fmt(fmt) - -#include <linux/bitops.h> -#include <linux/io-pgtable.h> -#include <linux/kernel.h> - -#include <asm/barrier.h> - -#include "amd_iommu_types.h" -#include "amd_iommu.h" -#include "../iommu-pages.h" - -#define IOMMU_PAGE_PRESENT BIT_ULL(0) /* Is present */ -#define IOMMU_PAGE_RW BIT_ULL(1) /* Writeable */ -#define IOMMU_PAGE_USER BIT_ULL(2) /* Userspace addressable */ -#define IOMMU_PAGE_PWT BIT_ULL(3) /* Page write through */ -#define IOMMU_PAGE_PCD BIT_ULL(4) /* Page cache disabled */ -#define IOMMU_PAGE_ACCESS BIT_ULL(5) /* Was accessed (updated by IOMMU) */ -#define IOMMU_PAGE_DIRTY BIT_ULL(6) /* Was written to (updated by IOMMU) */ -#define IOMMU_PAGE_PSE BIT_ULL(7) /* Page Size Extensions */ -#define IOMMU_PAGE_NX BIT_ULL(63) /* No execute */ - -#define MAX_PTRS_PER_PAGE 512 - -#define IOMMU_PAGE_SIZE_2M BIT_ULL(21) -#define IOMMU_PAGE_SIZE_1G BIT_ULL(30) - - -static inline int get_pgtable_level(void) -{ - return amd_iommu_gpt_level; -} - -static inline bool is_large_pte(u64 pte) -{ - return (pte & IOMMU_PAGE_PSE); -} - -static inline u64 set_pgtable_attr(u64 *page) -{ - u64 prot; - - prot = IOMMU_PAGE_PRESENT | IOMMU_PAGE_RW | IOMMU_PAGE_USER; - prot |= IOMMU_PAGE_ACCESS | IOMMU_PAGE_DIRTY; - - return (iommu_virt_to_phys(page) | prot); -} - -static inline void *get_pgtable_pte(u64 pte) -{ - return iommu_phys_to_virt(pte & PM_ADDR_MASK); -} - -static u64 set_pte_attr(u64 paddr, u64 pg_size, int prot) -{ - u64 pte; - - pte = __sme_set(paddr & PM_ADDR_MASK); - pte |= IOMMU_PAGE_PRESENT | IOMMU_PAGE_USER; - pte |= IOMMU_PAGE_ACCESS | IOMMU_PAGE_DIRTY; - - if (prot & IOMMU_PROT_IW) - pte |= IOMMU_PAGE_RW; - - /* Large page */ - if (pg_size == IOMMU_PAGE_SIZE_1G || pg_size == IOMMU_PAGE_SIZE_2M) - pte |= IOMMU_PAGE_PSE; - - return pte; -} - -static inline u64 get_alloc_page_size(u64 size) -{ - if (size >= IOMMU_PAGE_SIZE_1G) - return IOMMU_PAGE_SIZE_1G; - - if (size >= IOMMU_PAGE_SIZE_2M) - return IOMMU_PAGE_SIZE_2M; - - return PAGE_SIZE; -} - -static inline int page_size_to_level(u64 pg_size) -{ - if (pg_size == IOMMU_PAGE_SIZE_1G) - return PAGE_MODE_3_LEVEL; - if (pg_size == IOMMU_PAGE_SIZE_2M) - return PAGE_MODE_2_LEVEL; - - return PAGE_MODE_1_LEVEL; -} - -static void free_pgtable(u64 *pt, int level) -{ - u64 *p; - int i; - - for (i = 0; i < MAX_PTRS_PER_PAGE; i++) { - /* PTE present? */ - if (!IOMMU_PTE_PRESENT(pt[i])) - continue; - - if (is_large_pte(pt[i])) - continue; - - /* - * Free the next level. No need to look at l1 tables here since - * they can only contain leaf PTEs; just free them directly. 
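The deleted v2 allocator sizes each mapping step with get_alloc_page_size(), taking the largest hardware leaf (1 GiB, 2 MiB or 4 KiB) that fits the requested size. The selection rule in isolation:

    #include <stdio.h>

    #define SZ_4K (1UL << 12)
    #define SZ_2M (1UL << 21)
    #define SZ_1G (1UL << 30)

    /* Mirrors get_alloc_page_size(): pick the largest leaf that fits. */
    static unsigned long get_alloc_page_size(unsigned long size)
    {
        if (size >= SZ_1G)
            return SZ_1G;
        if (size >= SZ_2M)
            return SZ_2M;
        return SZ_4K;
    }

    int main(void)
    {
        unsigned long sizes[] = { SZ_4K, 3 * (SZ_2M / 2), SZ_2M, SZ_1G + SZ_2M };

        for (int i = 0; i < 4; i++)
            printf("size %#lx -> leaf %#lx\n", sizes[i],
                   get_alloc_page_size(sizes[i]));
        return 0;
    }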
- */ - p = get_pgtable_pte(pt[i]); - if (level > 2) - free_pgtable(p, level - 1); - else - iommu_free_page(p); - } - - iommu_free_page(pt); -} - -/* Allocate page table */ -static u64 *v2_alloc_pte(int nid, u64 *pgd, unsigned long iova, - unsigned long pg_size, gfp_t gfp, bool *updated) -{ - u64 *pte, *page; - int level, end_level; - - level = get_pgtable_level() - 1; - end_level = page_size_to_level(pg_size); - pte = &pgd[PM_LEVEL_INDEX(level, iova)]; - iova = PAGE_SIZE_ALIGN(iova, PAGE_SIZE); - - while (level >= end_level) { - u64 __pte, __npte; - - __pte = *pte; - - if (IOMMU_PTE_PRESENT(__pte) && is_large_pte(__pte)) { - /* Unmap large pte */ - cmpxchg64(pte, *pte, 0ULL); - *updated = true; - continue; - } - - if (!IOMMU_PTE_PRESENT(__pte)) { - page = iommu_alloc_page_node(nid, gfp); - if (!page) - return NULL; - - __npte = set_pgtable_attr(page); - /* pte could have been changed somewhere. */ - if (cmpxchg64(pte, __pte, __npte) != __pte) - iommu_free_page(page); - else if (IOMMU_PTE_PRESENT(__pte)) - *updated = true; - - continue; - } - - level -= 1; - pte = get_pgtable_pte(__pte); - pte = &pte[PM_LEVEL_INDEX(level, iova)]; - } - - /* Tear down existing pte entries */ - if (IOMMU_PTE_PRESENT(*pte)) { - u64 *__pte; - - *updated = true; - __pte = get_pgtable_pte(*pte); - cmpxchg64(pte, *pte, 0ULL); - if (pg_size == IOMMU_PAGE_SIZE_1G) - free_pgtable(__pte, end_level - 1); - else if (pg_size == IOMMU_PAGE_SIZE_2M) - iommu_free_page(__pte); - } - - return pte; -} - -/* - * This function checks if there is a PTE for a given dma address. - * If there is one, it returns the pointer to it. - */ -static u64 *fetch_pte(struct amd_io_pgtable *pgtable, - unsigned long iova, unsigned long *page_size) -{ - u64 *pte; - int level; - - level = get_pgtable_level() - 1; - pte = &pgtable->pgd[PM_LEVEL_INDEX(level, iova)]; - /* Default page size is 4K */ - *page_size = PAGE_SIZE; - - while (level) { - /* Not present */ - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; - - /* Walk to the next level */ - pte = get_pgtable_pte(*pte); - pte = &pte[PM_LEVEL_INDEX(level - 1, iova)]; - - /* Large page */ - if (is_large_pte(*pte)) { - if (level == PAGE_MODE_3_LEVEL) - *page_size = IOMMU_PAGE_SIZE_1G; - else if (level == PAGE_MODE_2_LEVEL) - *page_size = IOMMU_PAGE_SIZE_2M; - else - return NULL; /* Wrongly set PSE bit in PTE */ - - break; - } - - level -= 1; - } - - return pte; -} - -static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova, - phys_addr_t paddr, size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped) -{ - struct protection_domain *pdom = io_pgtable_ops_to_domain(ops); - struct io_pgtable_cfg *cfg = &pdom->iop.iop.cfg; - u64 *pte; - unsigned long map_size; - unsigned long mapped_size = 0; - unsigned long o_iova = iova; - size_t size = pgcount << __ffs(pgsize); - int ret = 0; - bool updated = false; - - if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize) || !pgcount) - return -EINVAL; - - if (!(prot & IOMMU_PROT_MASK)) - return -EINVAL; - - while (mapped_size < size) { - map_size = get_alloc_page_size(pgsize); - pte = v2_alloc_pte(pdom->nid, pdom->iop.pgd, - iova, map_size, gfp, &updated); - if (!pte) { - ret = -EINVAL; - goto out; - } - - *pte = set_pte_attr(paddr, map_size, prot); - - iova += map_size; - paddr += map_size; - mapped_size += map_size; - } - -out: - if (updated) - amd_iommu_domain_flush_pages(pdom, o_iova, size); - - if (mapped) - *mapped += mapped_size; - - return ret; -} - -static unsigned long iommu_v2_unmap_pages(struct io_pgtable_ops 
*ops, - unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - struct io_pgtable_cfg *cfg = &pgtable->iop.cfg; - unsigned long unmap_size; - unsigned long unmapped = 0; - size_t size = pgcount << __ffs(pgsize); - u64 *pte; - - if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize || !pgcount)) - return 0; - - while (unmapped < size) { - pte = fetch_pte(pgtable, iova, &unmap_size); - if (!pte) - return unmapped; - - *pte = 0ULL; - - iova = (iova & ~(unmap_size - 1)) + unmap_size; - unmapped += unmap_size; - } - - return unmapped; -} - -static phys_addr_t iommu_v2_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long offset_mask, pte_pgsize; - u64 *pte, __pte; - - pte = fetch_pte(pgtable, iova, &pte_pgsize); - if (!pte || !IOMMU_PTE_PRESENT(*pte)) - return 0; - - offset_mask = pte_pgsize - 1; - __pte = __sme_clr(*pte & PM_ADDR_MASK); - - return (__pte & ~offset_mask) | (iova & offset_mask); -} - -/* - * ---------------------------------------------------- - */ -static void v2_tlb_flush_all(void *cookie) -{ -} - -static void v2_tlb_flush_walk(unsigned long iova, size_t size, - size_t granule, void *cookie) -{ -} - -static void v2_tlb_add_page(struct iommu_iotlb_gather *gather, - unsigned long iova, size_t granule, - void *cookie) -{ -} - -static const struct iommu_flush_ops v2_flush_ops = { - .tlb_flush_all = v2_tlb_flush_all, - .tlb_flush_walk = v2_tlb_flush_walk, - .tlb_add_page = v2_tlb_add_page, -}; - -static void v2_free_pgtable(struct io_pgtable *iop) -{ - struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, iop); - - if (!pgtable || !pgtable->pgd) - return; - - /* Free page table */ - free_pgtable(pgtable->pgd, get_pgtable_level()); - pgtable->pgd = NULL; -} - -static struct io_pgtable *v2_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) -{ - struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); - struct protection_domain *pdom = (struct protection_domain *)cookie; - int ias = IOMMU_IN_ADDR_BIT_SIZE; - - pgtable->pgd = iommu_alloc_page_node(pdom->nid, GFP_ATOMIC); - if (!pgtable->pgd) - return NULL; - - if (get_pgtable_level() == PAGE_MODE_5_LEVEL) - ias = 57; - - pgtable->iop.ops.map_pages = iommu_v2_map_pages; - pgtable->iop.ops.unmap_pages = iommu_v2_unmap_pages; - pgtable->iop.ops.iova_to_phys = iommu_v2_iova_to_phys; - - cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES_V2, - cfg->ias = ias, - cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE, - cfg->tlb = &v2_flush_ops; - - return &pgtable->iop; -} - -struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns = { - .alloc = v2_alloc_pgtable, - .free = v2_free_pgtable, -}; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index b19e8c0f48fa..9f1d56a5e145 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -14,20 +14,22 @@ #include <linux/pci-ats.h> #include <linux/bitmap.h> #include <linux/slab.h> +#include <linux/string_choices.h> #include <linux/debugfs.h> #include <linux/scatterlist.h> #include <linux/dma-map-ops.h> #include <linux/dma-direct.h> +#include <linux/idr.h> #include <linux/iommu-helper.h> #include <linux/delay.h> #include <linux/amd-iommu.h> #include <linux/notifier.h> #include <linux/export.h> #include <linux/irq.h> +#include <linux/irqchip/irq-msi-lib.h> #include <linux/msi.h> #include <linux/irqdomain.h> #include <linux/percpu.h> -#include <linux/io-pgtable.h> 
#include <linux/cc_platform.h> #include <asm/irq_remapping.h> #include <asm/io_apic.h> @@ -38,9 +40,9 @@ #include <asm/gart.h> #include <asm/dma.h> #include <uapi/linux/iommufd.h> +#include <linux/generic_pt/iommu.h> #include "amd_iommu.h" -#include "../dma-iommu.h" #include "../irq_remapping.h" #include "../iommu-pages.h" @@ -52,32 +54,36 @@ #define HT_RANGE_START (0xfd00000000ULL) #define HT_RANGE_END (0xffffffffffULL) -#define DEFAULT_PGTABLE_LEVEL PAGE_MODE_3_LEVEL - -static DEFINE_SPINLOCK(pd_bitmap_lock); - LIST_HEAD(ioapic_map); LIST_HEAD(hpet_map); LIST_HEAD(acpihid_map); const struct iommu_ops amd_iommu_ops; -static const struct iommu_dirty_ops amd_dirty_ops; int amd_iommu_max_glx_val = -1; /* - * general struct to manage commands send to an IOMMU + * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap + * to know which ones are already in use. */ -struct iommu_cmd { - u32 data[4]; -}; +DEFINE_IDA(pdom_ids); -struct kmem_cache *amd_iommu_irq_cache; - -static void detach_device(struct device *dev); +static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev, + struct iommu_domain *old); static void set_dte_entry(struct amd_iommu *iommu, - struct iommu_dev_data *dev_data); + struct iommu_dev_data *dev_data, + phys_addr_t top_paddr, unsigned int top_level); + +static void amd_iommu_change_top(struct pt_iommu *iommu_table, + phys_addr_t top_paddr, unsigned int top_level); + +static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid); + +static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid); +static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain); +static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable); /**************************************************************************** * @@ -85,6 +91,132 @@ static void set_dte_entry(struct amd_iommu *iommu, * ****************************************************************************/ +static __always_inline void amd_iommu_atomic128_set(__int128 *ptr, __int128 val) +{ + /* + * Note: + * We use arch_cmpxchg128_local() because: + * - Need cmpxchg16b instruction mainly for 128-bit store to DTE + * (not necessary for cmpxchg since this function is already + * protected by a spin_lock for this DTE). + * - Neither need LOCK_PREFIX nor try loop because of the spin_lock. + */ + arch_cmpxchg128_local(ptr, *ptr, val); +} + +static void write_dte_upper128(struct dev_table_entry *ptr, struct dev_table_entry *new) +{ + struct dev_table_entry old; + + old.data128[1] = ptr->data128[1]; + /* + * Preserve DTE_DATA2_INTR_MASK. This needs to be + * done here since it requires to be inside + * spin_lock(&dev_data->dte_lock) context. + */ + new->data[2] &= ~DTE_DATA2_INTR_MASK; + new->data[2] |= old.data[2] & DTE_DATA2_INTR_MASK; + + amd_iommu_atomic128_set(&ptr->data128[1], new->data128[1]); +} + +static void write_dte_lower128(struct dev_table_entry *ptr, struct dev_table_entry *new) +{ + amd_iommu_atomic128_set(&ptr->data128[0], new->data128[0]); +} + +/* + * Note: + * IOMMU reads the entire Device Table entry in a single 256-bit transaction + * but the driver is programming DTE using 2 128-bit cmpxchg. So, the driver + * need to ensure the following: + * - DTE[V|GV] bit is being written last when setting. + * - DTE[V|GV] bit is being written first when clearing. + * + * This function is used only by code, which updates DMA translation part of the DTE. + * So, only consider control bits related to DMA when updating the entry. 
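The rule this comment encodes exists because the IOMMU fetches the whole 256-bit DTE in one transaction while the CPU stores at most 128 bits at a time: the half holding the valid bit must go live last when arming an entry and go dead first when disarming it. A userspace model of the arming order (field values are arbitrary; data[0] plays the role of the V-bit half):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    struct dte { uint64_t data[4]; };   /* data[0] carries the V bit */

    static void write_lower128(struct dte *d, const struct dte *n)
    {
        memcpy(&d->data[0], &n->data[0], 16);
    }

    static void write_upper128(struct dte *d, const struct dte *n)
    {
        memcpy(&d->data[2], &n->data[2], 16);
    }

    int main(void)
    {
        struct dte live = { { 0, 0, 0, 0 } };
        struct dte new  = { { 0x3, 0x1234, 0x5678, 0 } };

        write_upper128(&live, &new);    /* V still 0: entry stays inert    */
        write_lower128(&live, &new);    /* V written last: goes live whole */

        printf("data[0]=%#llx data[2]=%#llx\n",
               (unsigned long long)live.data[0],
               (unsigned long long)live.data[2]);
        return 0;
    }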
+ */ +static void update_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_data, + struct dev_table_entry *new) +{ + unsigned long flags; + struct dev_table_entry *dev_table = get_dev_table(iommu); + struct dev_table_entry *ptr = &dev_table[dev_data->devid]; + + spin_lock_irqsave(&dev_data->dte_lock, flags); + + if (!(ptr->data[0] & DTE_FLAG_V)) { + /* Existing DTE is not valid. */ + write_dte_upper128(ptr, new); + write_dte_lower128(ptr, new); + iommu_flush_dte_sync(iommu, dev_data->devid); + } else if (!(new->data[0] & DTE_FLAG_V)) { + /* Existing DTE is valid. New DTE is not valid. */ + write_dte_lower128(ptr, new); + write_dte_upper128(ptr, new); + iommu_flush_dte_sync(iommu, dev_data->devid); + } else if (!FIELD_GET(DTE_FLAG_GV, ptr->data[0])) { + /* + * Both DTEs are valid. + * Existing DTE has no guest page table. + */ + write_dte_upper128(ptr, new); + write_dte_lower128(ptr, new); + iommu_flush_dte_sync(iommu, dev_data->devid); + } else if (!FIELD_GET(DTE_FLAG_GV, new->data[0])) { + /* + * Both DTEs are valid. + * Existing DTE has a guest page table, + * new DTE has no guest page table. + */ + write_dte_lower128(ptr, new); + write_dte_upper128(ptr, new); + iommu_flush_dte_sync(iommu, dev_data->devid); + } else if (FIELD_GET(DTE_GPT_LEVEL_MASK, ptr->data[2]) != + FIELD_GET(DTE_GPT_LEVEL_MASK, new->data[2])) { + /* + * Both DTEs are valid and have a guest page table, + * but with a different number of levels. So, we need + * to update both the upper and lower 128-bit values, which + * requires disabling and flushing. + */ + struct dev_table_entry clear = {}; + + /* First disable DTE */ + write_dte_lower128(ptr, &clear); + iommu_flush_dte_sync(iommu, dev_data->devid); + + /* Then update DTE */ + write_dte_upper128(ptr, new); + write_dte_lower128(ptr, new); + iommu_flush_dte_sync(iommu, dev_data->devid); + } else { + /* + * Both DTEs are valid and have a guest page table + * with the same number of levels. We only need to + * update the lower 128 bits, so there is no need to + * disable the DTE. + */ + write_dte_lower128(ptr, new); + } + + spin_unlock_irqrestore(&dev_data->dte_lock, flags); +} + +static void get_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_data, + struct dev_table_entry *dte) +{ + unsigned long flags; + struct dev_table_entry *ptr; + struct dev_table_entry *dev_table = get_dev_table(iommu); + + ptr = &dev_table[dev_data->devid]; + + spin_lock_irqsave(&dev_data->dte_lock, flags); + dte->data128[0] = ptr->data128[0]; + dte->data128[1] = ptr->data128[1]; + spin_unlock_irqrestore(&dev_data->dte_lock, flags); +} + static inline bool pdom_is_v2_pgtbl_mode(struct protection_domain *pdom) { return (pdom && (pdom->pd_mode == PD_MODE_V2)); @@ -109,7 +241,9 @@ static inline int get_acpihid_device_id(struct device *dev, struct acpihid_map_entry **entry) { struct acpi_device *adev = ACPI_COMPANION(dev); - struct acpihid_map_entry *p; + struct acpihid_map_entry *p, *p1 = NULL; + int hid_count = 0; + bool fw_bug; if (!adev) return -ENODEV; @@ -117,12 +251,33 @@ static inline int get_acpihid_device_id(struct device *dev, list_for_each_entry(p, &acpihid_map, list) { if (acpi_dev_hid_uid_match(adev, p->hid, p->uid[0] ?
p->uid : NULL)) { - if (entry) - *entry = p; - return p->devid; + p1 = p; + fw_bug = false; + hid_count = 1; + break; + } + + /* + * Count HID matches w/o UID; raise FW_BUG but allow exactly one match. + */ + if (acpi_dev_hid_match(adev, p->hid)) { + p1 = p; + hid_count++; + fw_bug = true; + } } - return -EINVAL; + + if (!p1) + return -EINVAL; + if (fw_bug) + dev_err_once(dev, FW_BUG "No ACPI device matched UID, but %d device%s matched HID.\n", + hid_count, str_plural(hid_count)); + if (hid_count > 1) + return -EINVAL; + if (entry) + *entry = p1; + + return p1->devid; } static inline int get_device_sbdf_id(struct device *dev) @@ -204,7 +359,8 @@ static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid) if (!dev_data) return NULL; - spin_lock_init(&dev_data->lock); + mutex_init(&dev_data->mutex); + spin_lock_init(&dev_data->dte_lock); dev_data->devid = devid; ratelimit_default_init(&dev_data->rs); @@ -212,7 +368,7 @@ static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid) return dev_data; } -static struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid) +struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid) { struct iommu_dev_data *dev_data; struct llist_node *node; @@ -232,9 +388,11 @@ static struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid static int clone_alias(struct pci_dev *pdev, u16 alias, void *data) { + struct dev_table_entry new; struct amd_iommu *iommu; - struct dev_table_entry *dev_table; + struct iommu_dev_data *dev_data, *alias_data; u16 devid = pci_dev_id(pdev); + int ret = 0; if (devid == alias) return 0; @@ -243,13 +401,27 @@ static int clone_alias(struct pci_dev *pdev, u16 alias, void *data) if (!iommu) return 0; - amd_iommu_set_rlookup_table(iommu, alias); - dev_table = get_dev_table(iommu); - memcpy(dev_table[alias].data, - dev_table[devid].data, - sizeof(dev_table[alias].data)); + /* Copy the data from pdev */ + dev_data = dev_iommu_priv_get(&pdev->dev); + if (!dev_data) { + pr_err("%s : Failed to get dev_data for 0x%x\n", __func__, devid); + ret = -EINVAL; + goto out; + } + get_dte256(iommu, dev_data, &new); - return 0; + /* Setup alias */ + alias_data = find_dev_data(iommu, alias); + if (!alias_data) { + pr_err("%s : Failed to get alias dev_data for 0x%x\n", __func__, alias); + ret = -EINVAL; + goto out; + } + update_dte256(iommu, alias_data, &new); + + amd_iommu_set_rlookup_table(iommu, alias); +out: + return ret; } static void clone_aliases(struct amd_iommu *iommu, struct device *dev) @@ -462,8 +634,8 @@ static inline void pdev_disable_cap_pasid(struct pci_dev *pdev) static void pdev_enable_caps(struct pci_dev *pdev) { - pdev_enable_cap_ats(pdev); pdev_enable_cap_pasid(pdev); + pdev_enable_cap_ats(pdev); pdev_enable_cap_pri(pdev); } @@ -522,6 +694,12 @@ static int iommu_init_device(struct amd_iommu *iommu, struct device *dev) return -ENOMEM; dev_data->dev = dev; + + /* + * The dev_iommu_priv_set() needs to be called before setup_aliases. + * Otherwise, a subsequent call to dev_iommu_priv_get() will fail.
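The rewritten lookup above encodes a specific tolerance policy: an exact HID+UID match wins immediately; failing that, exactly one HID-only match is accepted while still flagging the firmware. A compilable sketch of just that policy, where lookup_devid(), the entry array, and the "AMDI0020" values are hypothetical stand-ins for get_acpihid_device_id() and acpihid_map:

#include <stdio.h>
#include <string.h>

struct acpihid_entry {
	const char *hid;
	const char *uid;
	int devid;
};

/* Returns a devid, or -1 on failure; models the lookup policy only. */
static int lookup_devid(const struct acpihid_entry *map, int n,
			const char *hid, const char *uid)
{
	const struct acpihid_entry *found = NULL;
	int hid_count = 0;

	for (int i = 0; i < n; i++) {
		/* Exact HID+UID match wins immediately, no firmware bug. */
		if (uid && !strcmp(map[i].hid, hid) && !strcmp(map[i].uid, uid))
			return map[i].devid;
		/* HID-only match: remember it, but the firmware is suspect. */
		if (!strcmp(map[i].hid, hid)) {
			found = &map[i];
			hid_count++;
		}
	}

	if (!found)
		return -1;
	fprintf(stderr, "FW_BUG: no UID match, %d HID match%s\n",
		hid_count, hid_count == 1 ? "" : "es");
	return hid_count == 1 ? found->devid : -1; /* tolerate exactly one */
}

int main(void)
{
	const struct acpihid_entry map[] = {
		{ "AMDI0020", "0", 0x40 },
		{ "AMDI0020", "1", 0x41 },
	};

	return lookup_devid(map, 2, "AMDI0020", "1") == 0x41 ? 0 : 1;
}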
+ */ + dev_iommu_priv_set(dev, dev_data); setup_aliases(iommu, dev); /* @@ -535,8 +713,6 @@ static int iommu_init_device(struct amd_iommu *iommu, struct device *dev) dev_data->flags = pdev_get_caps(to_pci_dev(dev)); } - dev_iommu_priv_set(dev, dev_data); - return 0; } @@ -557,22 +733,6 @@ static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev) setup_aliases(iommu, dev); } -static void amd_iommu_uninit_device(struct device *dev) -{ - struct iommu_dev_data *dev_data; - - dev_data = dev_iommu_priv_get(dev); - if (!dev_data) - return; - - if (dev_data->domain) - detach_device(dev); - - /* - * We keep dev_data around for unplugged devices and reuse it when the - * device is re-plugged - not doing so would introduce a ton of races. - */ -} /**************************************************************************** * @@ -583,10 +743,13 @@ static void amd_iommu_uninit_device(struct device *dev) static void dump_dte_entry(struct amd_iommu *iommu, u16 devid) { int i; - struct dev_table_entry *dev_table = get_dev_table(iommu); + struct dev_table_entry dte; + struct iommu_dev_data *dev_data = find_dev_data(iommu, devid); + + get_dte256(iommu, dev_data, &dte); for (i = 0; i < 4; ++i) - pr_err("DTE[%d]: %016llx\n", i, dev_table[devid].data[i]); + pr_err("DTE[%d]: %016llx\n", i, dte.data[i]); } static void dump_command(unsigned long phys_addr) @@ -726,7 +889,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) int type, devid, flags, tag; volatile u32 *event = __evt; int count = 0; - u64 address; + u64 address, ctrl; u32 pasid; retry: @@ -736,6 +899,7 @@ retry: (event[1] & EVENT_DOMID_MASK_LO); flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; address = (u64)(((u64)event[3]) << 32) | event[2]; + ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET); if (type == 0) { /* Did we hit the erratum? */ @@ -757,6 +921,7 @@ retry: dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n", iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), pasid, address, flags); + dev_err(dev, "Control Reg : 0x%llx\n", ctrl); dump_dte_entry(iommu, devid); break; case EVENT_TYPE_DEV_TAB_ERR: @@ -825,10 +990,12 @@ static void iommu_poll_events(struct amd_iommu *iommu) while (head != tail) { iommu_print_event(iommu, iommu->evt_buf + head); + + /* Update head pointer of hardware ring-buffer */ head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE; + writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); } - writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); } #ifdef CONFIG_IRQ_REMAP @@ -838,6 +1005,14 @@ int amd_iommu_register_ga_log_notifier(int (*notifier)(u32)) { iommu_ga_log_notifier = notifier; + /* + * Ensure all in-flight IRQ handlers run to completion before returning + * to the caller, e.g. to ensure module code isn't unloaded while it's + * being executed in the IRQ handler. 
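One small but deliberate change above: iommu_poll_events() now writes the hardware head pointer after every consumed entry instead of once per batch, so the device regains ring space immediately during long bursts. A self-contained model of the consumer loop; the constants and mmio_evt_head are illustrative stand-ins for the event-buffer geometry and the MMIO_EVT_HEAD_OFFSET register:

#include <stdint.h>

#define EVT_ENTRY_SIZE 16U
#define EVT_BUFFER_SIZE (512U * EVT_ENTRY_SIZE)

static volatile uint32_t mmio_evt_head; /* models the MMIO head register */

static void handle_event(const uint8_t *evt)
{
	(void)evt; /* the real driver decodes and logs the entry here */
}

static void poll_events(const uint8_t *evt_buf, uint32_t head, uint32_t tail)
{
	while (head != tail) {
		handle_event(evt_buf + head);
		/* Publish progress per entry so a long batch cannot make the
		 * ring look full to the hardware; this is the writel() that
		 * moved into the loop in the hunk above. */
		head = (head + EVT_ENTRY_SIZE) % EVT_BUFFER_SIZE;
		mmio_evt_head = head;
	}
}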
+ */ + if (!notifier) + synchronize_rcu(); + return 0; } EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier); @@ -987,6 +1162,25 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) * ****************************************************************************/ +static void dump_command_buffer(struct amd_iommu *iommu) +{ + struct iommu_cmd *cmd; + u32 head, tail; + int i; + + head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + + pr_err("CMD Buffer head=%llu tail=%llu\n", MMIO_CMD_BUFFER_HEAD(head), + MMIO_CMD_BUFFER_TAIL(tail)); + + for (i = 0; i < CMD_BUFFER_ENTRIES; i++) { + cmd = (struct iommu_cmd *)(iommu->cmd_buf + i * sizeof(*cmd)); + pr_err("%3d: %08x %08x %08x %08x\n", i, cmd->data[0], cmd->data[1], cmd->data[2], + cmd->data[3]); + } +} + static int wait_on_sem(struct amd_iommu *iommu, u64 data) { int i = 0; @@ -997,7 +1191,14 @@ static int wait_on_sem(struct amd_iommu *iommu, u64 data) } if (i == LOOP_TIMEOUT) { - pr_alert("Completion-Wait loop timed out\n"); + + pr_alert("IOMMU %04x:%02x:%02x.%01x: Completion-Wait loop timed out\n", + iommu->pci_seg->id, PCI_BUS_NUM(iommu->devid), + PCI_SLOT(iommu->devid), PCI_FUNC(iommu->devid)); + + if (amd_iommu_dump) + DO_ONCE_LITE(dump_command_buffer, iommu); + return -EIO; } @@ -1026,7 +1227,7 @@ static void build_completion_wait(struct iommu_cmd *cmd, struct amd_iommu *iommu, u64 data) { - u64 paddr = iommu_virt_to_phys((void *)iommu->cmd_sem); + u64 paddr = iommu->cmd_sem_paddr; memset(cmd, 0, sizeof(*cmd)); cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK; @@ -1230,7 +1431,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) if (!iommu->need_sync) return 0; - data = atomic64_add_return(1, &iommu->cmd_sem_val); + data = atomic64_inc_return(&iommu->cmd_sem_val); build_completion_wait(&cmd, iommu, data); raw_spin_lock_irqsave(&iommu->lock, flags); @@ -1247,6 +1448,21 @@ out_unlock: return ret; } +static void domain_flush_complete(struct protection_domain *domain) +{ + struct pdom_iommu_info *pdom_iommu_info; + unsigned long i; + + lockdep_assert_held(&domain->lock); + + /* + * Devices of this domain are behind this IOMMU + * We need to wait for completion of all commands. 
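wait_on_sem() and build_completion_wait() together form a sequence-number mailbox: each waiter bumps a 64-bit counter, posts a COMPLETION_WAIT command telling the IOMMU to store that value to a semaphore address, then polls until the value lands. A minimal userspace model under those assumptions; the hardware side is simulated by a stub, and LOOP_TIMEOUT mirrors the driver's bounded poll:

#include <stdatomic.h>
#include <stdint.h>
#include <errno.h>

#define LOOP_TIMEOUT 2000000UL

static atomic_uint_fast64_t cmd_sem_val; /* driver-side sequence counter */
static volatile uint64_t cmd_sem;        /* memory the IOMMU writes to */

/* Simulated hardware: a real IOMMU stores 'seq' only after all prior
 * commands in the buffer have completed. */
static void queue_completion_wait(uint64_t seq)
{
	cmd_sem = seq;
}

static int completion_wait(void)
{
	/* Fresh sequence number per waiter (atomic64_inc_return() upstream). */
	uint64_t seq = atomic_fetch_add(&cmd_sem_val, 1) + 1;

	queue_completion_wait(seq);

	/* Bounded poll; on timeout the driver now dumps the command buffer. */
	for (unsigned long i = 0; i < LOOP_TIMEOUT; i++) {
		if (cmd_sem == seq)
			return 0;
	}
	return -EIO;
}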
+ */ + xa_for_each(&domain->iommu_array, i, pdom_iommu_info) + iommu_completion_wait(pdom_iommu_info->iommu); +} + static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid) { struct iommu_cmd cmd; @@ -1256,6 +1472,15 @@ static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid) return iommu_queue_command(iommu, &cmd); } +static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid) +{ + int ret; + + ret = iommu_flush_dte(iommu, devid); + if (!ret) + iommu_completion_wait(iommu); +} + static void amd_iommu_flush_dte_all(struct amd_iommu *iommu) { u32 devid; @@ -1410,6 +1635,7 @@ static int domain_flush_pages_v2(struct protection_domain *pdom, struct iommu_cmd cmd; int ret = 0; + lockdep_assert_held(&pdom->lock); list_for_each_entry(dev_data, &pdom->dev_list, list) { struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); u16 domid = dev_data->gcr3_info.domid; @@ -1426,21 +1652,22 @@ static int domain_flush_pages_v2(struct protection_domain *pdom, static int domain_flush_pages_v1(struct protection_domain *pdom, u64 address, size_t size) { + struct pdom_iommu_info *pdom_iommu_info; struct iommu_cmd cmd; - int ret = 0, i; + int ret = 0; + unsigned long i; + + lockdep_assert_held(&pdom->lock); build_inv_iommu_pages(&cmd, address, size, pdom->id, IOMMU_NO_PASID, false); - for (i = 0; i < amd_iommu_get_num_iommus(); ++i) { - if (!pdom->dev_iommu[i]) - continue; - + xa_for_each(&pdom->iommu_array, i, pdom_iommu_info) { /* * Devices of this domain are behind this IOMMU * We need a TLB flush */ - ret |= iommu_queue_command(amd_iommus[i], &cmd); + ret |= iommu_queue_command(pdom_iommu_info->iommu, &cmd); } return ret; @@ -1458,6 +1685,8 @@ static void __domain_flush_pages(struct protection_domain *domain, ioasid_t pasid = IOMMU_NO_PASID; bool gn = false; + lockdep_assert_held(&domain->lock); + if (pdom_is_v2_pgtbl_mode(domain)) { gn = true; ret = domain_flush_pages_v2(domain, address, size); @@ -1479,11 +1708,13 @@ static void __domain_flush_pages(struct protection_domain *domain, void amd_iommu_domain_flush_pages(struct protection_domain *domain, u64 address, size_t size) { + lockdep_assert_held(&domain->lock); + if (likely(!amd_iommu_np_cache)) { __domain_flush_pages(domain, address, size); /* Wait until IOMMU TLB and all device IOTLB flushes are complete */ - amd_iommu_domain_flush_complete(domain); + domain_flush_complete(domain); return; } @@ -1523,7 +1754,7 @@ void amd_iommu_domain_flush_pages(struct protection_domain *domain, } /* Wait until IOMMU TLB and all device IOTLB flushes are complete */ - amd_iommu_domain_flush_complete(domain); + domain_flush_complete(domain); } /* Flush the whole IO/TLB for a given protection domain - including PDE */ @@ -1549,79 +1780,11 @@ void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data, iommu_completion_wait(iommu); } -void amd_iommu_dev_flush_pasid_all(struct iommu_dev_data *dev_data, - ioasid_t pasid) +static void dev_flush_pasid_all(struct iommu_dev_data *dev_data, + ioasid_t pasid) { - amd_iommu_dev_flush_pasid_pages(dev_data, 0, - CMD_INV_IOMMU_ALL_PAGES_ADDRESS, pasid); -} - -void amd_iommu_domain_flush_complete(struct protection_domain *domain) -{ - int i; - - for (i = 0; i < amd_iommu_get_num_iommus(); ++i) { - if (domain && !domain->dev_iommu[i]) - continue; - - /* - * Devices of this domain are behind this IOMMU - * We need to wait for completion of all commands. 
- */ - iommu_completion_wait(amd_iommus[i]); - } -} - -/* Flush the not present cache if it exists */ -static void domain_flush_np_cache(struct protection_domain *domain, - dma_addr_t iova, size_t size) -{ - if (unlikely(amd_iommu_np_cache)) { - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - amd_iommu_domain_flush_pages(domain, iova, size); - spin_unlock_irqrestore(&domain->lock, flags); - } -} - - -/* - * This function flushes the DTEs for all devices in domain - */ -static void domain_flush_devices(struct protection_domain *domain) -{ - struct iommu_dev_data *dev_data; - - list_for_each_entry(dev_data, &domain->dev_list, list) - device_flush_dte(dev_data); -} - -static void update_device_table(struct protection_domain *domain) -{ - struct iommu_dev_data *dev_data; - - list_for_each_entry(dev_data, &domain->dev_list, list) { - struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev); - - set_dte_entry(iommu, dev_data); - clone_aliases(iommu, dev_data->dev); - } -} - -void amd_iommu_update_and_flush_device_table(struct protection_domain *domain) -{ - update_device_table(domain); - domain_flush_devices(domain); -} - -void amd_iommu_domain_update(struct protection_domain *domain) -{ - /* Update device table */ - amd_iommu_update_and_flush_device_table(domain); - - /* Flush domain TLB(s) and wait for completion */ - amd_iommu_domain_flush_all(domain); + amd_iommu_dev_flush_pasid_pages(dev_data, pasid, 0, + CMD_INV_IOMMU_ALL_PAGES_ADDRESS); } int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag) @@ -1649,31 +1812,14 @@ int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag) * ****************************************************************************/ -static u16 domain_id_alloc(void) +static int pdom_id_alloc(void) { - unsigned long flags; - int id; - - spin_lock_irqsave(&pd_bitmap_lock, flags); - id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID); - BUG_ON(id == 0); - if (id > 0 && id < MAX_DOMAIN_ID) - __set_bit(id, amd_iommu_pd_alloc_bitmap); - else - id = 0; - spin_unlock_irqrestore(&pd_bitmap_lock, flags); - - return id; + return ida_alloc_range(&pdom_ids, 1, MAX_DOMAIN_ID - 1, GFP_ATOMIC); } -static void domain_id_free(int id) +static void pdom_id_free(int id) { - unsigned long flags; - - spin_lock_irqsave(&pd_bitmap_lock, flags); - if (id > 0 && id < MAX_DOMAIN_ID) - __clear_bit(id, amd_iommu_pd_alloc_bitmap); - spin_unlock_irqrestore(&pd_bitmap_lock, flags); + ida_free(&pdom_ids, id); } static void free_gcr3_tbl_level1(u64 *tbl) @@ -1687,7 +1833,7 @@ static void free_gcr3_tbl_level1(u64 *tbl) ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK); - iommu_free_page(ptr); + iommu_free_pages(ptr); } } @@ -1718,9 +1864,9 @@ static void free_gcr3_table(struct gcr3_tbl_info *gcr3_info) gcr3_info->glx = 0; /* Free per device domain ID */ - domain_id_free(gcr3_info->domid); + pdom_id_free(gcr3_info->domid); - iommu_free_page(gcr3_info->gcr3_tbl); + iommu_free_pages(gcr3_info->gcr3_tbl); gcr3_info->gcr3_tbl = NULL; } @@ -1745,6 +1891,7 @@ static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info, { int levels = get_gcr3_levels(pasids); int nid = iommu ? 
dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; + int domid; if (levels > amd_iommu_max_glx_val) return -EINVAL; @@ -1753,11 +1900,14 @@ static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info, return -EBUSY; /* Allocate per device domain ID */ - gcr3_info->domid = domain_id_alloc(); + domid = pdom_id_alloc(); + if (domid <= 0) + return -ENOSPC; + gcr3_info->domid = domid; - gcr3_info->gcr3_tbl = iommu_alloc_page_node(nid, GFP_ATOMIC); + gcr3_info->gcr3_tbl = iommu_alloc_pages_node_sz(nid, GFP_ATOMIC, SZ_4K); if (gcr3_info->gcr3_tbl == NULL) { - domain_id_free(gcr3_info->domid); + pdom_id_free(domid); return -ENOMEM; } @@ -1816,7 +1966,7 @@ static int update_gcr3(struct iommu_dev_data *dev_data, else *pte = 0; - amd_iommu_dev_flush_pasid_all(dev_data, pasid); + dev_flush_pasid_all(dev_data, pasid); return 0; } @@ -1851,90 +2001,127 @@ int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid) return ret; } +static void make_clear_dte(struct iommu_dev_data *dev_data, struct dev_table_entry *ptr, + struct dev_table_entry *new) +{ + /* All existing DTEs must have the V bit set */ + new->data128[0] = DTE_FLAG_V; + new->data128[1] = 0; +} + +/* + * Note: + * The old values for the GCR3 table and GPT have been cleared by the caller. + */ +static void set_dte_gcr3_table(struct amd_iommu *iommu, + struct iommu_dev_data *dev_data, + struct dev_table_entry *target) +{ + struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; + u64 gcr3; + + if (!gcr3_info->gcr3_tbl) + return; + + pr_debug("%s: devid=%#x, glx=%#x, gcr3_tbl=%#llx\n", + __func__, dev_data->devid, gcr3_info->glx, + (unsigned long long)gcr3_info->gcr3_tbl); + + gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl); + + target->data[0] |= DTE_FLAG_GV | + FIELD_PREP(DTE_GLX, gcr3_info->glx) | + FIELD_PREP(DTE_GCR3_14_12, gcr3 >> 12); + if (pdom_is_v2_pgtbl_mode(dev_data->domain)) + target->data[0] |= DTE_FLAG_GIOV; + + target->data[1] |= FIELD_PREP(DTE_GCR3_30_15, gcr3 >> 15) | + FIELD_PREP(DTE_GCR3_51_31, gcr3 >> 31); + + /* The guest page table can only support 4 and 5 levels */ + if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) + target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_5_LEVEL); + else + target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL); +} + static void set_dte_entry(struct amd_iommu *iommu, - struct iommu_dev_data *dev_data) + struct iommu_dev_data *dev_data, + phys_addr_t top_paddr, unsigned int top_level) { - u64 pte_root = 0; - u64 flags = 0; - u32 old_domid; - u16 devid = dev_data->devid; u16 domid; + u32 old_domid; + struct dev_table_entry *initial_dte; + struct dev_table_entry new = {}; struct protection_domain *domain = dev_data->domain; - struct dev_table_entry *dev_table = get_dev_table(iommu); struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; + struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid]; + struct pt_iommu_amdv1_hw_info pt_info; + + make_clear_dte(dev_data, dte, &new); if (gcr3_info && gcr3_info->gcr3_tbl) domid = dev_data->gcr3_info.domid; - else + else { domid = domain->id; - if (domain->iop.mode != PAGE_MODE_NONE) - pte_root = iommu_virt_to_phys(domain->iop.root); + if (domain->domain.type & __IOMMU_DOMAIN_PAGING) { + /* + * When updating the IO pagetable, the new top and level + * are provided as parameters. For other operations, i.e. + * device attach, retrieve the current pagetable info + * via the IOMMU PT API.
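set_dte_gcr3_table() above scatters one 52-bit, 4KB-aligned table address across three DTE fields: bits 14:12, 30:15 and 51:31 of the physical address. The split is easier to verify with explicit masks; this is a compilable sketch of the address arithmetic only, not the driver's FIELD_PREP encoding of the destination bit positions:

#include <stdint.h>
#include <assert.h>

/* The GCR3 table address is carried in three pieces of the DTE. */
struct gcr3_fields {
	uint64_t a; /* addr[14:12], 3 bits  */
	uint64_t b; /* addr[30:15], 16 bits */
	uint64_t c; /* addr[51:31], 21 bits */
};

static struct gcr3_fields split_gcr3(uint64_t paddr)
{
	return (struct gcr3_fields){
		.a = (paddr >> 12) & 0x7,
		.b = (paddr >> 15) & 0xffff,
		.c = (paddr >> 31) & 0x1fffff,
	};
}

static uint64_t join_gcr3(struct gcr3_fields f)
{
	return (f.a << 12) | (f.b << 15) | (f.c << 31);
}

int main(void)
{
	uint64_t paddr = 0x000812345678f000ULL; /* 4KB-aligned, below 2^52 */

	/* Round-trips for any 4KB-aligned address below 2^52. */
	assert(join_gcr3(split_gcr3(paddr)) == paddr);
	return 0;
}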
+ */ + if (top_paddr) { + pt_info.host_pt_root = top_paddr; + pt_info.mode = top_level + 1; + } else { + WARN_ON(top_paddr || top_level); + pt_iommu_amdv1_hw_info(&domain->amdv1, + &pt_info); + } - pte_root |= (domain->iop.mode & DEV_ENTRY_MODE_MASK) - << DEV_ENTRY_MODE_SHIFT; + new.data[0] |= __sme_set(pt_info.host_pt_root) | + (pt_info.mode & DEV_ENTRY_MODE_MASK) + << DEV_ENTRY_MODE_SHIFT; + } + } - pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V; + new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW; /* - * When SNP is enabled, Only set TV bit when IOMMU - * page translation is in use. + * When SNP is enabled, we can only support TV=1 with non-zero domain ID. + * This is prevented by the SNP-enable and IOMMU_DOMAIN_IDENTITY check in + * do_iommu_domain_alloc(). */ - if (!amd_iommu_snp_en || (domid != 0)) - pte_root |= DTE_FLAG_TV; - - flags = dev_table[devid].data[1]; - - if (dev_data->ats_enabled) - flags |= DTE_FLAG_IOTLB; + WARN_ON(amd_iommu_snp_en && (domid == 0)); + new.data[0] |= DTE_FLAG_TV; if (dev_data->ppr) - pte_root |= 1ULL << DEV_ENTRY_PPR; + new.data[0] |= 1ULL << DEV_ENTRY_PPR; if (domain->dirty_tracking) - pte_root |= DTE_FLAG_HAD; - - if (gcr3_info && gcr3_info->gcr3_tbl) { - u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl); - u64 glx = gcr3_info->glx; - u64 tmp; - - pte_root |= DTE_FLAG_GV; - pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT; - - /* First mask out possible old values for GCR3 table */ - tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B; - flags &= ~tmp; - - tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C; - flags &= ~tmp; - - /* Encode GCR3 table into DTE */ - tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A; - pte_root |= tmp; + new.data[0] |= DTE_FLAG_HAD; - tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B; - flags |= tmp; - - tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C; - flags |= tmp; + if (dev_data->ats_enabled) + new.data[1] |= DTE_FLAG_IOTLB; - if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) { - dev_table[devid].data[2] |= - ((u64)GUEST_PGTABLE_5_LEVEL << DTE_GPT_LEVEL_SHIFT); - } + old_domid = READ_ONCE(dte->data[1]) & DEV_DOMID_MASK; + new.data[1] |= domid; - /* GIOV is supported with V2 page table mode only */ - if (pdom_is_v2_pgtbl_mode(domain)) - pte_root |= DTE_FLAG_GIOV; + /* + * Restore cached persistent DTE bits, which can be set by information + * in IVRS table. See set_dev_entry_from_acpi(). 
+ */ + initial_dte = amd_iommu_get_ivhd_dte_flags(iommu->pci_seg->id, dev_data->devid); + if (initial_dte) { + new.data128[0] |= initial_dte->data128[0]; + new.data128[1] |= initial_dte->data128[1]; } - flags &= ~DEV_DOMID_MASK; - flags |= domid; + set_dte_gcr3_table(iommu, dev_data, &new); - old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK; - dev_table[devid].data[1] = flags; - dev_table[devid].data[0] = pte_root; + update_dte256(iommu, dev_data, &new); /* * A kdump kernel might be replacing a domain ID that was copied from @@ -1946,30 +2133,27 @@ static void set_dte_entry(struct amd_iommu *iommu, } } -static void clear_dte_entry(struct amd_iommu *iommu, u16 devid) +/* + * Clear DMA-remap related flags to block all DMA (blocked domain) + */ +static void clear_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_data) { - struct dev_table_entry *dev_table = get_dev_table(iommu); - - /* remove entry from the device table seen by the hardware */ - dev_table[devid].data[0] = DTE_FLAG_V; - - if (!amd_iommu_snp_en) - dev_table[devid].data[0] |= DTE_FLAG_TV; - - dev_table[devid].data[1] &= DTE_FLAG_MASK; + struct dev_table_entry new = {}; + struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid]; - amd_iommu_apply_erratum_63(iommu, devid); + make_clear_dte(dev_data, dte, &new); + update_dte256(iommu, dev_data, &new); } /* Update and flush DTE for the given device */ -void amd_iommu_dev_update_dte(struct iommu_dev_data *dev_data, bool set) +static void dev_update_dte(struct iommu_dev_data *dev_data, bool set) { struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); if (set) - set_dte_entry(iommu, dev_data); + set_dte_entry(iommu, dev_data, 0, 0); else - clear_dte_entry(iommu, dev_data->devid); + clear_dte_entry(iommu, dev_data); clone_aliases(iommu, dev_data->dev); device_flush_dte(dev_data); @@ -1985,6 +2169,7 @@ static int init_gcr3_table(struct iommu_dev_data *dev_data, { struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); int max_pasids = dev_data->max_pasids; + struct pt_iommu_x86_64_hw_info pt_info; int ret = 0; /* @@ -2007,7 +2192,8 @@ static int init_gcr3_table(struct iommu_dev_data *dev_data, if (!pdom_is_v2_pgtbl_mode(pdom)) return ret; - ret = update_gcr3(dev_data, 0, iommu_virt_to_phys(pdom->iop.pgd), true); + pt_iommu_x86_64_hw_info(&pdom->amdv2, &pt_info); + ret = update_gcr3(dev_data, 0, __sme_set(pt_info.gcr3_pt), true); if (ret) free_gcr3_table(&dev_data->gcr3_info); @@ -2028,56 +2214,64 @@ static void destroy_gcr3_table(struct iommu_dev_data *dev_data, struct protection_domain *pdom) -static int do_attach(struct iommu_dev_data *dev_data, - struct protection_domain *domain) +static int pdom_attach_iommu(struct amd_iommu *iommu, + struct protection_domain *pdom) { - struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); + struct pdom_iommu_info *pdom_iommu_info, *curr; + unsigned long flags; int ret = 0; - /* Update data structures */ - dev_data->domain = domain; - list_add(&dev_data->list, &domain->dev_list); + spin_lock_irqsave(&pdom->lock, flags); - /* Update NUMA Node ID */ - if (domain->nid == NUMA_NO_NODE) - domain->nid = dev_to_node(dev_data->dev); + pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index); + if (pdom_iommu_info) { + pdom_iommu_info->refcnt++; + goto out_unlock; + } - /* Do reference counting */ - domain->dev_iommu[iommu->index] += 1; - domain->dev_cnt += 1; + pdom_iommu_info = kzalloc(sizeof(*pdom_iommu_info), GFP_ATOMIC); + if (!pdom_iommu_info) { + ret = -ENOMEM; + goto out_unlock; + } - /*
Setup GCR3 table */ - if (pdom_is_sva_capable(domain)) { - ret = init_gcr3_table(dev_data, domain); - if (ret) - return ret; + pdom_iommu_info->iommu = iommu; + pdom_iommu_info->refcnt = 1; + + curr = xa_cmpxchg(&pdom->iommu_array, iommu->index, + NULL, pdom_iommu_info, GFP_ATOMIC); + if (curr) { + kfree(pdom_iommu_info); + ret = -ENOSPC; + goto out_unlock; } +out_unlock: + spin_unlock_irqrestore(&pdom->lock, flags); return ret; } -static void do_detach(struct iommu_dev_data *dev_data) +static void pdom_detach_iommu(struct amd_iommu *iommu, + struct protection_domain *pdom) { - struct protection_domain *domain = dev_data->domain; - struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); - - /* Clear DTE and flush the entry */ - amd_iommu_dev_update_dte(dev_data, false); + struct pdom_iommu_info *pdom_iommu_info; + unsigned long flags; - /* Flush IOTLB and wait for the flushes to finish */ - amd_iommu_domain_flush_all(domain); + spin_lock_irqsave(&pdom->lock, flags); - /* Clear GCR3 table */ - if (pdom_is_sva_capable(domain)) - destroy_gcr3_table(dev_data, domain); + pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index); + if (!pdom_iommu_info) { + spin_unlock_irqrestore(&pdom->lock, flags); + return; + } - /* Update data structures */ - dev_data->domain = NULL; - list_del(&dev_data->list); + pdom_iommu_info->refcnt--; + if (pdom_iommu_info->refcnt == 0) { + xa_erase(&pdom->iommu_array, iommu->index); + kfree(pdom_iommu_info); + } - /* decrease reference counters - needs to happen after the flushes */ - domain->dev_iommu[iommu->index] -= 1; - domain->dev_cnt -= 1; + spin_unlock_irqrestore(&pdom->lock, flags); } /* @@ -2087,28 +2281,60 @@ static void do_detach(struct iommu_dev_data *dev_data) static int attach_device(struct device *dev, struct protection_domain *domain) { - struct iommu_dev_data *dev_data; + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); + struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); + struct pci_dev *pdev; unsigned long flags; int ret = 0; - spin_lock_irqsave(&domain->lock, flags); - - dev_data = dev_iommu_priv_get(dev); - - spin_lock(&dev_data->lock); + mutex_lock(&dev_data->mutex); if (dev_data->domain != NULL) { ret = -EBUSY; goto out; } - ret = do_attach(dev_data, domain); + /* Do reference counting */ + ret = pdom_attach_iommu(iommu, domain); + if (ret) + goto out; -out: - spin_unlock(&dev_data->lock); + /* Setup GCR3 table */ + if (pdom_is_sva_capable(domain)) { + ret = init_gcr3_table(dev_data, domain); + if (ret) { + pdom_detach_iommu(iommu, domain); + goto out; + } + } + + pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL; + if (pdev && pdom_is_sva_capable(domain)) { + pdev_enable_caps(pdev); + + /* + * Device can continue to function even if IOPF + * enablement failed. Hence in error path just + * disable device PRI support. 
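pdom_attach_iommu()/pdom_detach_iommu() above follow a refcount-in-a-lookup-structure pattern: the first attach of a domain to a given IOMMU allocates a tracking node, later attaches only bump a count, and the last detach removes the node so domain flushes stop targeting that IOMMU. A minimal model using a flat array in place of the xarray; the domain lock and the xa_cmpxchg() race handling are omitted:

#include <stdlib.h>
#include <errno.h>

#define MAX_IOMMUS 32

struct pdom_iommu_info {
	int refcnt;
};

/* Model of protection_domain::iommu_array: one slot per IOMMU index. */
static struct pdom_iommu_info *iommu_array[MAX_IOMMUS];

/* First attach allocates the per-IOMMU node; later attaches just refcount. */
static int pdom_attach_iommu_model(unsigned int index)
{
	struct pdom_iommu_info *info = iommu_array[index];

	if (info) {
		info->refcnt++;
		return 0;
	}
	info = calloc(1, sizeof(*info));
	if (!info)
		return -ENOMEM;
	info->refcnt = 1;
	iommu_array[index] = info;
	return 0;
}

/* Last detach frees the node so flushes stop targeting this IOMMU. */
static void pdom_detach_iommu_model(unsigned int index)
{
	struct pdom_iommu_info *info = iommu_array[index];

	if (info && --info->refcnt == 0) {
		free(info);
		iommu_array[index] = NULL;
	}
}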
+ */ + if (amd_iommu_iopf_add_device(iommu, dev_data)) + pdev_disable_cap_pri(pdev); + } else if (pdev) { + pdev_enable_cap_ats(pdev); + } + /* Update data structures */ + dev_data->domain = domain; + spin_lock_irqsave(&domain->lock, flags); + list_add(&dev_data->list, &domain->dev_list); spin_unlock_irqrestore(&domain->lock, flags); + /* Update device table */ + dev_update_dte(dev_data, true); + +out: + mutex_unlock(&dev_data->mutex); + return ret; } @@ -2118,14 +2344,11 @@ out: static void detach_device(struct device *dev) { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); - struct protection_domain *domain = dev_data->domain; struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); + struct protection_domain *domain = dev_data->domain; unsigned long flags; - bool ppr = dev_data->ppr; - - spin_lock_irqsave(&domain->lock, flags); - spin_lock(&dev_data->lock); + mutex_lock(&dev_data->mutex); /* * First check if the device is still attached. It might already @@ -2136,27 +2359,36 @@ static void detach_device(struct device *dev) if (WARN_ON(!dev_data->domain)) goto out; - if (ppr) { + /* Remove IOPF handler */ + if (dev_data->ppr) { iopf_queue_flush_dev(dev); - - /* Updated here so that it gets reflected in DTE */ - dev_data->ppr = false; + amd_iommu_iopf_remove_device(iommu, dev_data); } - do_detach(dev_data); + if (dev_is_pci(dev)) + pdev_disable_caps(to_pci_dev(dev)); -out: - spin_unlock(&dev_data->lock); + /* Clear DTE and flush the entry */ + dev_update_dte(dev_data, false); + /* Flush IOTLB and wait for the flushes to finish */ + spin_lock_irqsave(&domain->lock, flags); + amd_iommu_domain_flush_all(domain); + list_del(&dev_data->list); spin_unlock_irqrestore(&domain->lock, flags); - /* Remove IOPF handler */ - if (ppr) - amd_iommu_iopf_remove_device(iommu, dev_data); + /* Clear GCR3 table */ + if (pdom_is_sva_capable(domain)) + destroy_gcr3_table(dev_data, domain); - if (dev_is_pci(dev)) - pdev_disable_caps(to_pci_dev(dev)); + /* Update data structures */ + dev_data->domain = NULL; + /* decrease reference counters - needs to happen after the flushes */ + pdom_detach_iommu(iommu, domain); + +out: + mutex_unlock(&dev_data->mutex); } static struct iommu_device *amd_iommu_probe_device(struct device *dev) @@ -2185,11 +2417,12 @@ static struct iommu_device *amd_iommu_probe_device(struct device *dev) dev_err(dev, "Failed to initialize - trying to proceed anyway\n"); iommu_dev = ERR_PTR(ret); iommu_ignore_device(iommu, dev); - } else { - amd_iommu_set_pci_msi_domain(dev, iommu); - iommu_dev = &iommu->iommu; + goto out_err; } + amd_iommu_set_pci_msi_domain(dev, iommu); + iommu_dev = &iommu->iommu; + /* * If IOMMU and device supports PASID then it will contain max * supported PASIDs, else it will be zero. 
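The reworked detach_device() above encodes a strict teardown order: invalidation has to precede freeing. A condensed sketch of that sequence with empty stand-in helpers; only the ordering is the point:

/* Stand-ins for the driver helpers named in the comments. */
static void iopf_flush_and_remove(void) {} /* iopf_queue_flush_dev() + iopf removal */
static void dte_clear_and_flush(void) {}   /* dev_update_dte(dev_data, false) */
static void iotlb_flush_domain(void) {}    /* amd_iommu_domain_flush_all() */
static void gcr3_destroy(void) {}          /* destroy_gcr3_table() */
static void iommu_refcount_drop(void) {}   /* pdom_detach_iommu() */

static void detach_sequence(void)
{
	iopf_flush_and_remove(); /* 1. no more page faults for this device */
	dte_clear_and_flush();   /* 2. block DMA at the device table */
	iotlb_flush_domain();    /* 3. purge stale translations */
	gcr3_destroy();          /* 4. only now free the GCR3 table */
	iommu_refcount_drop();   /* 5. drop the per-IOMMU refcount last */
}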
@@ -2201,24 +2434,38 @@ static struct iommu_device *amd_iommu_probe_device(struct device *dev) pci_max_pasids(to_pci_dev(dev))); } + if (amd_iommu_pgtable == PD_MODE_NONE) { + pr_warn_once("%s: DMA translation not supported by iommu.\n", + __func__); + iommu_dev = ERR_PTR(-ENODEV); + goto out_err; + } + +out_err: + iommu_completion_wait(iommu); + if (FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2)) + dev_data->max_irqs = MAX_IRQS_PER_TABLE_2K; + else + dev_data->max_irqs = MAX_IRQS_PER_TABLE_512; + + if (dev_is_pci(dev)) + pci_prepare_ats(to_pci_dev(dev), PAGE_SHIFT); + return iommu_dev; } static void amd_iommu_release_device(struct device *dev) { - struct amd_iommu *iommu; - - if (!check_device(dev)) - return; + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); - iommu = rlookup_amd_iommu(dev); - if (!iommu) - return; + WARN_ON(dev_data->domain); - amd_iommu_uninit_device(dev); - iommu_completion_wait(iommu); + /* + * We keep dev_data around for unplugged devices and reuse it when the + * device is re-plugged - not doing so would introduce a ton of races. + */ } static struct iommu_group *amd_iommu_device_group(struct device *dev) @@ -2239,236 +2486,384 @@ static struct iommu_group *amd_iommu_device_group(struct device *dev) * *****************************************************************************/ -static void cleanup_domain(struct protection_domain *domain) +static void protection_domain_init(struct protection_domain *domain) { - struct iommu_dev_data *entry; + spin_lock_init(&domain->lock); + INIT_LIST_HEAD(&domain->dev_list); + INIT_LIST_HEAD(&domain->dev_data_list); + xa_init(&domain->iommu_array); +} - lockdep_assert_held(&domain->lock); +struct protection_domain *protection_domain_alloc(void) +{ + struct protection_domain *domain; + int domid; - if (!domain->dev_cnt) - return; + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return NULL; - while (!list_empty(&domain->dev_list)) { - entry = list_first_entry(&domain->dev_list, - struct iommu_dev_data, list); - BUG_ON(!entry->domain); - do_detach(entry); + domid = pdom_id_alloc(); + if (domid <= 0) { + kfree(domain); + return NULL; } - WARN_ON(domain->dev_cnt != 0); + domain->id = domid; + + protection_domain_init(domain); + + return domain; } -void protection_domain_free(struct protection_domain *domain) +static bool amd_iommu_hd_support(struct amd_iommu *iommu) { - if (!domain) - return; - - if (domain->iop.pgtbl_cfg.tlb) - free_io_pgtable_ops(&domain->iop.iop.ops); + if (amd_iommu_hatdis) + return false; - if (domain->iop.root) - iommu_free_page(domain->iop.root); + return iommu && (iommu->features & FEATURE_HDSUP); +} - if (domain->id) - domain_id_free(domain->id); +static spinlock_t *amd_iommu_get_top_lock(struct pt_iommu *iommupt) +{ + struct protection_domain *pdom = + container_of(iommupt, struct protection_domain, iommu); - kfree(domain); + return &pdom->lock; } -static int protection_domain_init_v1(struct protection_domain *domain, int mode) +/* + * Update all HW references to the domain with a new pgtable configuration. 
+ */ +static void amd_iommu_change_top(struct pt_iommu *iommu_table, + phys_addr_t top_paddr, unsigned int top_level) { - u64 *pt_root = NULL; + struct protection_domain *pdom = + container_of(iommu_table, struct protection_domain, iommu); + struct iommu_dev_data *dev_data; + + lockdep_assert_held(&pdom->lock); - BUG_ON(mode < PAGE_MODE_NONE || mode > PAGE_MODE_6_LEVEL); + /* Update the DTE for all devices attached to this domain */ + list_for_each_entry(dev_data, &pdom->dev_list, list) { + struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev); - if (mode != PAGE_MODE_NONE) { - pt_root = iommu_alloc_page(GFP_KERNEL); - if (!pt_root) - return -ENOMEM; + /* Update the HW references with the new level and top ptr */ + set_dte_entry(iommu, dev_data, top_paddr, top_level); + clone_aliases(iommu, dev_data->dev); } - domain->pd_mode = PD_MODE_V1; - amd_iommu_domain_set_pgtable(domain, pt_root, mode); + list_for_each_entry(dev_data, &pdom->dev_list, list) + device_flush_dte(dev_data); - return 0; + domain_flush_complete(pdom); } -static int protection_domain_init_v2(struct protection_domain *pdom) +/* + * amd_iommu_iotlb_sync_map() is used to generate flushes for non-present to + * present (ie mapping) operations. It is a NOP if the IOMMU doesn't have non + * present caching (like hypervisor shadowing). + */ +static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom, + unsigned long iova, size_t size) { - pdom->pd_mode = PD_MODE_V2; - pdom->domain.pgsize_bitmap = AMD_IOMMU_PGSIZES_V2; + struct protection_domain *domain = to_pdomain(dom); + unsigned long flags; + if (likely(!amd_iommu_np_cache)) + return 0; + + spin_lock_irqsave(&domain->lock, flags); + amd_iommu_domain_flush_pages(domain, iova, size); + spin_unlock_irqrestore(&domain->lock, flags); return 0; } -struct protection_domain *protection_domain_alloc(unsigned int type) +static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) +{ + struct protection_domain *dom = to_pdomain(domain); + unsigned long flags; + + spin_lock_irqsave(&dom->lock, flags); + amd_iommu_domain_flush_all(dom); + spin_unlock_irqrestore(&dom->lock, flags); +} + +static void amd_iommu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) { - struct io_pgtable_ops *pgtbl_ops; + struct protection_domain *dom = to_pdomain(domain); + unsigned long flags; + + spin_lock_irqsave(&dom->lock, flags); + amd_iommu_domain_flush_pages(dom, gather->start, + gather->end - gather->start + 1); + spin_unlock_irqrestore(&dom->lock, flags); + iommu_put_pages_list(&gather->freelist); +} + +static const struct pt_iommu_driver_ops amd_hw_driver_ops_v1 = { + .get_top_lock = amd_iommu_get_top_lock, + .change_top = amd_iommu_change_top, +}; + +static const struct iommu_domain_ops amdv1_ops = { + IOMMU_PT_DOMAIN_OPS(amdv1), + .iotlb_sync_map = amd_iommu_iotlb_sync_map, + .flush_iotlb_all = amd_iommu_flush_iotlb_all, + .iotlb_sync = amd_iommu_iotlb_sync, + .attach_dev = amd_iommu_attach_device, + .free = amd_iommu_domain_free, + .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, +}; + +static const struct iommu_dirty_ops amdv1_dirty_ops = { + IOMMU_PT_DIRTY_OPS(amdv1), + .set_dirty_tracking = amd_iommu_set_dirty_tracking, +}; + +static struct iommu_domain *amd_iommu_domain_alloc_paging_v1(struct device *dev, + u32 flags) +{ + struct pt_iommu_amdv1_cfg cfg = {}; struct protection_domain *domain; - int pgtable; int ret; - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) - return NULL; + if (amd_iommu_hatdis) + return 
ERR_PTR(-EOPNOTSUPP); - domain->id = domain_id_alloc(); - if (!domain->id) - goto out_err; + domain = protection_domain_alloc(); + if (!domain) + return ERR_PTR(-ENOMEM); - spin_lock_init(&domain->lock); - INIT_LIST_HEAD(&domain->dev_list); - INIT_LIST_HEAD(&domain->dev_data_list); - domain->nid = NUMA_NO_NODE; + domain->pd_mode = PD_MODE_V1; + domain->iommu.driver_ops = &amd_hw_driver_ops_v1; + domain->iommu.nid = dev_to_node(dev); + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + domain->domain.dirty_ops = &amdv1_dirty_ops; - switch (type) { - /* No need to allocate io pgtable ops in passthrough mode */ - case IOMMU_DOMAIN_IDENTITY: - case IOMMU_DOMAIN_SVA: - return domain; - case IOMMU_DOMAIN_DMA: - pgtable = amd_iommu_pgtable; - break; /* - * Force IOMMU v1 page table when allocating - * domain for pass-through devices. + * Someday FORCE_COHERENCE should be set by + * amd_iommu_enforce_cache_coherency() like VT-d does. */ - case IOMMU_DOMAIN_UNMANAGED: - pgtable = AMD_IOMMU_V1; - break; - default: - goto out_err; - } + cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) | + BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE); - switch (pgtable) { - case AMD_IOMMU_V1: - ret = protection_domain_init_v1(domain, DEFAULT_PGTABLE_LEVEL); - break; - case AMD_IOMMU_V2: - ret = protection_domain_init_v2(domain); - break; - default: - ret = -EINVAL; - break; - } - - if (ret) - goto out_err; - - pgtbl_ops = alloc_io_pgtable_ops(pgtable, &domain->iop.pgtbl_cfg, domain); - if (!pgtbl_ops) - goto out_err; + /* + * AMD's IOMMU can flush as many pages as necessary in a single flush. + * Unless we run in a virtual machine, which can be inferred according + * to whether "non-present cache" is on, it is probably best to prefer + * (potentially) too extensive TLB flushing (i.e., more misses) over + * multiple TLB flushes (i.e., more flushes). For virtual machines the + * hypervisor needs to synchronize the host IOMMU PTEs with those of + * the guest, and the trade-off is different: unnecessary TLB flushes + * should be avoided. + */ + if (amd_iommu_np_cache) + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS); + else + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE); - return domain; -out_err: - protection_domain_free(domain); - return NULL; -} + cfg.common.hw_max_vasz_lg2 = + min(64, (amd_iommu_hpt_level - 1) * 9 + 21); + cfg.common.hw_max_oasz_lg2 = 52; + cfg.starting_level = 2; + domain->domain.ops = &amdv1_ops; -static inline u64 dma_max_address(void) -{ - if (amd_iommu_pgtable == AMD_IOMMU_V1) - return ~0ULL; + ret = pt_iommu_amdv1_init(&domain->amdv1, &cfg, GFP_KERNEL); + if (ret) { + amd_iommu_domain_free(&domain->domain); + return ERR_PTR(ret); + } - /* V2 with 4/5 level page table */ - return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1); + /* + * Narrow the supported page sizes to those selected by the kernel + * command line. + */ + domain->domain.pgsize_bitmap &= amd_iommu_pgsize_bitmap; + return &domain->domain; } -static bool amd_iommu_hd_support(struct amd_iommu *iommu) -{ - return iommu && (iommu->features & FEATURE_HDSUP); -} +static const struct iommu_domain_ops amdv2_ops = { + IOMMU_PT_DOMAIN_OPS(x86_64), + .iotlb_sync_map = amd_iommu_iotlb_sync_map, + .flush_iotlb_all = amd_iommu_flush_iotlb_all, + .iotlb_sync = amd_iommu_iotlb_sync, + .attach_dev = amd_iommu_attach_device, + .free = amd_iommu_domain_free, + /* + * Note the AMDv2 page table format does not support a Force Coherency + * bit, so enforce_cache_coherency should not be set. 
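The hw_max_vasz_lg2 expression above is compact arithmetic: a single level-1 table spans 21 bits of IOVA (9 index bits plus the 12-bit page offset) and every additional level adds 9 bits, clamped to a 64-bit address. A quick check of the formula for the plausible host page-table modes:

#include <stdio.h>

static int min_int(int a, int b) { return a < b ? a : b; }

int main(void)
{
	/* One level-1 table spans 21 bits; each further level adds 9. */
	for (int hpt_level = 3; hpt_level <= 6; hpt_level++)
		printf("%d-level host table -> %2d-bit IOVA space\n",
		       hpt_level, min_int(64, (hpt_level - 1) * 9 + 21));
	return 0; /* prints 39, 48, 57 and 64 (clamped) */
}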
However VFIO is + * not prepared to handle a case where some domains will support + * enforcement and others do not. VFIO and iommufd will have to be fixed + * before they can fully use the V2 page table. See the comment in + * iommufd_hwpt_paging_alloc(). For now leave things as they have + * historically been and lie about enforce_cache_coherency. + */ + .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, +}; -static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, - struct device *dev, u32 flags) +static struct iommu_domain *amd_iommu_domain_alloc_paging_v2(struct device *dev, + u32 flags) { - bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + struct pt_iommu_x86_64_cfg cfg = {}; struct protection_domain *domain; - struct amd_iommu *iommu = NULL; - - if (dev) - iommu = get_amd_iommu_from_dev(dev); - - /* - * Since DTE[Mode]=0 is prohibited on SNP-enabled system, - * default to use IOMMU_DOMAIN_DMA[_FQ]. - */ - if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY)) - return ERR_PTR(-EINVAL); + int ret; - if (dirty_tracking && !amd_iommu_hd_support(iommu)) + if (!amd_iommu_v2_pgtbl_supported()) return ERR_PTR(-EOPNOTSUPP); - domain = protection_domain_alloc(type); + domain = protection_domain_alloc(); if (!domain) return ERR_PTR(-ENOMEM); - domain->domain.geometry.aperture_start = 0; - domain->domain.geometry.aperture_end = dma_max_address(); - domain->domain.geometry.force_aperture = true; + domain->pd_mode = PD_MODE_V2; + domain->iommu.nid = dev_to_node(dev); - if (iommu) { - domain->domain.type = type; - domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap; - domain->domain.ops = iommu->iommu.ops->default_domain_ops; + cfg.common.features = BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES); + if (amd_iommu_np_cache) + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS); + else + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE); - if (dirty_tracking) - domain->domain.dirty_ops = &amd_dirty_ops; + /* + * The v2 table behaves differently if it is attached to PASID 0 vs a + * non-zero PASID. On PASID 0 it has no sign extension and the full + * 57/48 bits decode the lower addresses. Otherwise it behaves like a + * normal sign extended x86 page table. Since we want the domain to work + * in both modes, the top bit is removed and PT_FEAT_SIGN_EXTEND is not + * set, which creates a table that is compatible in both modes.
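To make the "top bit is removed" compromise concrete: a 4-level x86 table decodes 48 bits of VA and a 5-level one 57; giving up the top (sign) bit yields the 47- and 56-bit apertures configured just below. A one-line check of that arithmetic:

#include <stdio.h>

int main(void)
{
	for (int levels = 4; levels <= 5; levels++) {
		int va_bits = levels * 9 + 12; /* 48 or 57 on x86 */
		/* Dropping the top (sign) bit keeps one table valid both with
		 * and without sign extension (PASID 0 vs non-zero PASID). */
		printf("%d-level: %d VA bits -> hw_max_vasz_lg2 = %d\n",
		       levels, va_bits, va_bits - 1);
	}
	return 0; /* prints 47 and 56 */
}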
+ */ + if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) { + cfg.common.hw_max_vasz_lg2 = 56; + cfg.top_level = 4; + } else { + cfg.common.hw_max_vasz_lg2 = 47; + cfg.top_level = 3; } + cfg.common.hw_max_oasz_lg2 = 52; + domain->domain.ops = &amdv2_ops; + ret = pt_iommu_x86_64_init(&domain->amdv2, &cfg, GFP_KERNEL); + if (ret) { + amd_iommu_domain_free(&domain->domain); + return ERR_PTR(ret); + } return &domain->domain; } -static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type) +static struct iommu_domain * +amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, + const struct iommu_user_data *user_data) + { - struct iommu_domain *domain; + struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); + const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_PASID; - domain = do_iommu_domain_alloc(type, NULL, 0); - if (IS_ERR(domain)) - return NULL; + if ((flags & ~supported_flags) || user_data) + return ERR_PTR(-EOPNOTSUPP); - return domain; + switch (flags & supported_flags) { + case IOMMU_HWPT_ALLOC_DIRTY_TRACKING: + /* Allocate domain with v1 page table for dirty tracking */ + if (!amd_iommu_hd_support(iommu)) + break; + return amd_iommu_domain_alloc_paging_v1(dev, flags); + case IOMMU_HWPT_ALLOC_PASID: + /* Allocate domain with v2 page table if IOMMU supports PASID. */ + if (!amd_iommu_pasid_supported()) + break; + return amd_iommu_domain_alloc_paging_v2(dev, flags); + case 0: { + struct iommu_domain *ret; + + /* If nothing specific is required use the kernel commandline default */ + if (amd_iommu_pgtable == PD_MODE_V1) { + ret = amd_iommu_domain_alloc_paging_v1(dev, flags); + if (ret != ERR_PTR(-EOPNOTSUPP)) + return ret; + return amd_iommu_domain_alloc_paging_v2(dev, flags); + } + ret = amd_iommu_domain_alloc_paging_v2(dev, flags); + if (ret != ERR_PTR(-EOPNOTSUPP)) + return ret; + return amd_iommu_domain_alloc_paging_v1(dev, flags); + } + default: + break; + } + return ERR_PTR(-EOPNOTSUPP); } -static struct iommu_domain * -amd_iommu_domain_alloc_user(struct device *dev, u32 flags, - struct iommu_domain *parent, - const struct iommu_user_data *user_data) +void amd_iommu_domain_free(struct iommu_domain *dom) +{ + struct protection_domain *domain = to_pdomain(dom); + + WARN_ON(!list_empty(&domain->dev_list)); + pt_iommu_deinit(&domain->iommu); + pdom_id_free(domain->id); + kfree(domain); +} +static int blocked_domain_attach_device(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) { - unsigned int type = IOMMU_DOMAIN_UNMANAGED; + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); - if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data) - return ERR_PTR(-EOPNOTSUPP); + if (dev_data->domain) + detach_device(dev); + + /* Clear DTE and flush the entry */ + mutex_lock(&dev_data->mutex); + dev_update_dte(dev_data, false); + mutex_unlock(&dev_data->mutex); - return do_iommu_domain_alloc(type, dev, flags); + return 0; } -void amd_iommu_domain_free(struct iommu_domain *dom) +static int blocked_domain_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { - struct protection_domain *domain; - unsigned long flags; + amd_iommu_remove_dev_pasid(dev, pasid, old); + return 0; +} - if (!dom) - return; +static struct iommu_domain blocked_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = blocked_domain_attach_device, + .set_dev_pasid = blocked_domain_set_dev_pasid, + } +}; - domain = to_pdomain(dom); 
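The flag dispatch in amd_iommu_domain_alloc_paging_flags() above reduces to a small decision function: dirty tracking forces the v1 format, PASID forces v2, and with no constraint the command-line default is tried first with fallback to the other format. A simplified, compilable model; the capability booleans are hypothetical stand-ins for amd_iommu_hd_support()/amd_iommu_pasid_supported(), and the EOPNOTSUPP-driven fallback between the two allocators is collapsed into a preference:

#include <errno.h>
#include <stdbool.h>

enum pgtable { PGTABLE_V1, PGTABLE_V2 };

struct caps {
	bool hd_support; /* dirty tracking, v1 format only */
	bool pasid;      /* PASID, v2 format only */
	bool want_dirty;
	bool want_pasid;
	bool prefer_v1;  /* amd_iommu_pgtable command-line default */
};

static int pick_pgtable(const struct caps *c, enum pgtable *out)
{
	if (c->want_dirty && c->want_pasid)
		return -EOPNOTSUPP; /* no single format offers both */
	if (c->want_dirty) {
		if (!c->hd_support)
			return -EOPNOTSUPP;
		*out = PGTABLE_V1;
		return 0;
	}
	if (c->want_pasid) {
		if (!c->pasid)
			return -EOPNOTSUPP;
		*out = PGTABLE_V2;
		return 0;
	}
	/* No constraint: try the default format, fall back to the other. */
	*out = c->prefer_v1 ? PGTABLE_V1 : PGTABLE_V2;
	return 0;
}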
+static struct protection_domain identity_domain; - spin_lock_irqsave(&domain->lock, flags); +static const struct iommu_domain_ops identity_domain_ops = { + .attach_dev = amd_iommu_attach_device, +}; - cleanup_domain(domain); +void amd_iommu_init_identity_domain(void) +{ + struct iommu_domain *domain = &identity_domain.domain; - spin_unlock_irqrestore(&domain->lock, flags); + domain->type = IOMMU_DOMAIN_IDENTITY; + domain->ops = &identity_domain_ops; + domain->owner = &amd_iommu_ops; + + identity_domain.id = pdom_id_alloc(); - protection_domain_free(domain); + protection_domain_init(&identity_domain); } -static int amd_iommu_attach_device(struct iommu_domain *dom, - struct device *dev) +static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev, + struct iommu_domain *old) { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct protection_domain *domain = to_pdomain(dom); struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); - struct pci_dev *pdev; int ret; /* @@ -2501,114 +2896,9 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, } #endif - pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL; - if (pdev && pdom_is_sva_capable(domain)) { - pdev_enable_caps(pdev); - - /* - * Device can continue to function even if IOPF - * enablement failed. Hence in error path just - * disable device PRI support. - */ - if (amd_iommu_iopf_add_device(iommu, dev_data)) - pdev_disable_cap_pri(pdev); - } else if (pdev) { - pdev_enable_cap_ats(pdev); - } - - /* Update device table */ - amd_iommu_dev_update_dte(dev_data, true); - - return ret; -} - -static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom, - unsigned long iova, size_t size) -{ - struct protection_domain *domain = to_pdomain(dom); - struct io_pgtable_ops *ops = &domain->iop.iop.ops; - - if (ops->map_pages) - domain_flush_np_cache(domain, iova, size); - return 0; -} - -static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova, - phys_addr_t paddr, size_t pgsize, size_t pgcount, - int iommu_prot, gfp_t gfp, size_t *mapped) -{ - struct protection_domain *domain = to_pdomain(dom); - struct io_pgtable_ops *ops = &domain->iop.iop.ops; - int prot = 0; - int ret = -EINVAL; - - if ((domain->pd_mode == PD_MODE_V1) && - (domain->iop.mode == PAGE_MODE_NONE)) - return -EINVAL; - - if (iommu_prot & IOMMU_READ) - prot |= IOMMU_PROT_IR; - if (iommu_prot & IOMMU_WRITE) - prot |= IOMMU_PROT_IW; - - if (ops->map_pages) { - ret = ops->map_pages(ops, iova, paddr, pgsize, - pgcount, prot, gfp, mapped); - } - return ret; } -static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain, - struct iommu_iotlb_gather *gather, - unsigned long iova, size_t size) -{ - /* - * AMD's IOMMU can flush as many pages as necessary in a single flush. - * Unless we run in a virtual machine, which can be inferred according - * to whether "non-present cache" is on, it is probably best to prefer - * (potentially) too extensive TLB flushing (i.e., more misses) over - * mutliple TLB flushes (i.e., more flushes). For virtual machines the - * hypervisor needs to synchronize the host IOMMU PTEs with those of - * the guest, and the trade-off is different: unnecessary TLB flushes - * should be avoided. 
- */ - if (amd_iommu_np_cache && - iommu_iotlb_gather_is_disjoint(gather, iova, size)) - iommu_iotlb_sync(domain, gather); - - iommu_iotlb_gather_add_range(gather, iova, size); -} - -static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) -{ - struct protection_domain *domain = to_pdomain(dom); - struct io_pgtable_ops *ops = &domain->iop.iop.ops; - size_t r; - - if ((domain->pd_mode == PD_MODE_V1) && - (domain->iop.mode == PAGE_MODE_NONE)) - return 0; - - r = (ops->unmap_pages) ? ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0; - - if (r) - amd_iommu_iotlb_gather_add_page(dom, gather, iova, r); - - return r; -} - -static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, - dma_addr_t iova) -{ - struct protection_domain *domain = to_pdomain(dom); - struct io_pgtable_ops *ops = &domain->iop.iop.ops; - - return ops->iova_to_phys(ops, iova); -} - static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) { switch (cap) { @@ -2638,12 +2928,12 @@ static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, bool enable) { struct protection_domain *pdomain = to_pdomain(domain); - struct dev_table_entry *dev_table; + struct dev_table_entry *dte; struct iommu_dev_data *dev_data; bool domain_flush = false; struct amd_iommu *iommu; unsigned long flags; - u64 pte_root; + u64 new; spin_lock_irqsave(&pdomain->lock, flags); if (!(pdomain->dirty_tracking ^ enable)) { @@ -2652,16 +2942,15 @@ static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, } list_for_each_entry(dev_data, &pdomain->dev_list, list) { + spin_lock(&dev_data->dte_lock); iommu = get_amd_iommu_from_dev_data(dev_data); - - dev_table = get_dev_table(iommu); - pte_root = dev_table[dev_data->devid].data[0]; - - pte_root = (enable ? pte_root | DTE_FLAG_HAD : - pte_root & ~DTE_FLAG_HAD); + dte = &get_dev_table(iommu)[dev_data->devid]; + new = dte->data[0]; + new = (enable ? 
new | DTE_FLAG_HAD : new & ~DTE_FLAG_HAD); + dte->data[0] = new; + spin_unlock(&dev_data->dte_lock); /* Flush device DTE */ - dev_table[dev_data->devid].data[0] = pte_root; device_flush_dte(dev_data); domain_flush = true; } @@ -2676,28 +2965,6 @@ static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, return 0; } -static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain, - unsigned long iova, size_t size, - unsigned long flags, - struct iommu_dirty_bitmap *dirty) -{ - struct protection_domain *pdomain = to_pdomain(domain); - struct io_pgtable_ops *ops = &pdomain->iop.iop.ops; - unsigned long lflags; - - if (!ops || !ops->read_and_clear_dirty) - return -EOPNOTSUPP; - - spin_lock_irqsave(&pdomain->lock, lflags); - if (!pdomain->dirty_tracking && dirty->bitmap) { - spin_unlock_irqrestore(&pdomain->lock, lflags); - return -EINVAL; - } - spin_unlock_irqrestore(&pdomain->lock, lflags); - - return ops->read_and_clear_dirty(ops, iova, size, flags, dirty); -} - static void amd_iommu_get_resv_regions(struct device *dev, struct list_head *head) { @@ -2749,6 +3016,9 @@ static void amd_iommu_get_resv_regions(struct device *dev, return; list_add_tail(®ion->list, head); + if (amd_iommu_ht_range_ignore()) + return; + region = iommu_alloc_resv_region(HT_RANGE_START, HT_RANGE_END - HT_RANGE_START + 1, 0, IOMMU_RESV_RESERVED, GFP_KERNEL); @@ -2757,35 +3027,13 @@ static void amd_iommu_get_resv_regions(struct device *dev, list_add_tail(®ion->list, head); } -bool amd_iommu_is_attach_deferred(struct device *dev) +static bool amd_iommu_is_attach_deferred(struct device *dev) { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); return dev_data->defer_attach; } -static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) -{ - struct protection_domain *dom = to_pdomain(domain); - unsigned long flags; - - spin_lock_irqsave(&dom->lock, flags); - amd_iommu_domain_flush_all(dom); - spin_unlock_irqrestore(&dom->lock, flags); -} - -static void amd_iommu_iotlb_sync(struct iommu_domain *domain, - struct iommu_iotlb_gather *gather) -{ - struct protection_domain *dom = to_pdomain(domain); - unsigned long flags; - - spin_lock_irqsave(&dom->lock, flags); - amd_iommu_domain_flush_pages(dom, gather->start, - gather->end - gather->start + 1); - spin_unlock_irqrestore(&dom->lock, flags); -} - static int amd_iommu_def_domain_type(struct device *dev) { struct iommu_dev_data *dev_data; @@ -2820,70 +3068,20 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) return true; } -static const struct iommu_dirty_ops amd_dirty_ops = { - .set_dirty_tracking = amd_iommu_set_dirty_tracking, - .read_and_clear_dirty = amd_iommu_read_and_clear_dirty, -}; - -static int amd_iommu_dev_enable_feature(struct device *dev, - enum iommu_dev_features feat) -{ - int ret = 0; - - switch (feat) { - case IOMMU_DEV_FEAT_IOPF: - case IOMMU_DEV_FEAT_SVA: - break; - default: - ret = -EINVAL; - break; - } - return ret; -} - -static int amd_iommu_dev_disable_feature(struct device *dev, - enum iommu_dev_features feat) -{ - int ret = 0; - - switch (feat) { - case IOMMU_DEV_FEAT_IOPF: - case IOMMU_DEV_FEAT_SVA: - break; - default: - ret = -EINVAL; - break; - } - return ret; -} - const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, - .domain_alloc = amd_iommu_domain_alloc, - .domain_alloc_user = amd_iommu_domain_alloc_user, + .blocked_domain = &blocked_domain, + .release_domain = &blocked_domain, + .identity_domain = &identity_domain.domain, + .domain_alloc_paging_flags = 
amd_iommu_domain_alloc_paging_flags, .domain_alloc_sva = amd_iommu_domain_alloc_sva, .probe_device = amd_iommu_probe_device, .release_device = amd_iommu_release_device, .device_group = amd_iommu_device_group, .get_resv_regions = amd_iommu_get_resv_regions, .is_attach_deferred = amd_iommu_is_attach_deferred, - .pgsize_bitmap = AMD_IOMMU_PGSIZES, .def_domain_type = amd_iommu_def_domain_type, - .dev_enable_feat = amd_iommu_dev_enable_feature, - .dev_disable_feat = amd_iommu_dev_disable_feature, - .remove_dev_pasid = amd_iommu_remove_dev_pasid, .page_response = amd_iommu_page_response, - .default_domain_ops = &(const struct iommu_domain_ops) { - .attach_dev = amd_iommu_attach_device, - .map_pages = amd_iommu_map_pages, - .unmap_pages = amd_iommu_unmap_pages, - .iotlb_sync_map = amd_iommu_iotlb_sync_map, - .iova_to_phys = amd_iommu_iova_to_phys, - .flush_iotlb_all = amd_iommu_flush_iotlb_all, - .iotlb_sync = amd_iommu_iotlb_sync, - .free = amd_iommu_domain_free, - .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, - } }; #ifdef CONFIG_IRQ_REMAP @@ -2908,7 +3106,7 @@ static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid) return; build_inv_irt(&cmd, devid); - data = atomic64_add_return(1, &iommu->cmd_sem_val); + data = atomic64_inc_return(&iommu->cmd_sem_val); build_completion_wait(&cmd2, iommu, data); raw_spin_lock_irqsave(&iommu->lock, flags); @@ -2923,20 +3121,33 @@ out: raw_spin_unlock_irqrestore(&iommu->lock, flags); } +static inline u8 iommu_get_int_tablen(struct iommu_dev_data *dev_data) +{ + if (dev_data && dev_data->max_irqs == MAX_IRQS_PER_TABLE_2K) + return DTE_INTTABLEN_2K; + return DTE_INTTABLEN_512; +} + static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid, struct irq_remap_table *table) { - u64 dte; - struct dev_table_entry *dev_table = get_dev_table(iommu); + u64 new; + struct dev_table_entry *dte = &get_dev_table(iommu)[devid]; + struct iommu_dev_data *dev_data = search_dev_data(iommu, devid); + + if (dev_data) + spin_lock(&dev_data->dte_lock); - dte = dev_table[devid].data[2]; - dte &= ~DTE_IRQ_PHYS_ADDR_MASK; - dte |= iommu_virt_to_phys(table->table); - dte |= DTE_IRQ_REMAP_INTCTL; - dte |= DTE_INTTABLEN; - dte |= DTE_IRQ_REMAP_ENABLE; + new = READ_ONCE(dte->data[2]); + new &= ~DTE_IRQ_PHYS_ADDR_MASK; + new |= iommu_virt_to_phys(table->table); + new |= DTE_IRQ_REMAP_INTCTL; + new |= iommu_get_int_tablen(dev_data); + new |= DTE_IRQ_REMAP_ENABLE; + WRITE_ONCE(dte->data[2], new); - dev_table[devid].data[2] = dte; + if (dev_data) + spin_unlock(&dev_data->dte_lock); } static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid) @@ -2957,7 +3168,7 @@ static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid) return table; } -static struct irq_remap_table *__alloc_irq_table(void) +static struct irq_remap_table *__alloc_irq_table(int nid, size_t size) { struct irq_remap_table *table; @@ -2965,19 +3176,14 @@ static struct irq_remap_table *__alloc_irq_table(void) if (!table) return NULL; - table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL); + table->table = iommu_alloc_pages_node_sz( + nid, GFP_KERNEL, max(DTE_INTTAB_ALIGNMENT, size)); if (!table->table) { kfree(table); return NULL; } raw_spin_lock_init(&table->lock); - if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) - memset(table->table, 0, - MAX_IRQS_PER_TABLE * sizeof(u32)); - else - memset(table->table, 0, - (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2))); return table; } @@ -3009,13 +3215,23 @@ static int 
set_remap_table_entry_alias(struct pci_dev *pdev, u16 alias, return 0; } +static inline size_t get_irq_table_size(unsigned int max_irqs) +{ + if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) + return max_irqs * sizeof(u32); + + return max_irqs * (sizeof(u64) * 2); +} + static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu, - u16 devid, struct pci_dev *pdev) + u16 devid, struct pci_dev *pdev, + unsigned int max_irqs) { struct irq_remap_table *table = NULL; struct irq_remap_table *new_table = NULL; struct amd_iommu_pci_seg *pci_seg; unsigned long flags; + int nid = iommu && iommu->dev ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; u16 alias; spin_lock_irqsave(&iommu_table_lock, flags); @@ -3034,7 +3250,7 @@ static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu, spin_unlock_irqrestore(&iommu_table_lock, flags); /* Nothing there yet, allocate new irq remapping table */ - new_table = __alloc_irq_table(); + new_table = __alloc_irq_table(nid, get_irq_table_size(max_irqs)); if (!new_table) return NULL; @@ -3069,20 +3285,21 @@ out_unlock: spin_unlock_irqrestore(&iommu_table_lock, flags); if (new_table) { - kmem_cache_free(amd_iommu_irq_cache, new_table->table); + iommu_free_pages(new_table->table); kfree(new_table); } return table; } static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count, - bool align, struct pci_dev *pdev) + bool align, struct pci_dev *pdev, + unsigned long max_irqs) { struct irq_remap_table *table; int index, c, alignment = 1; unsigned long flags; - table = alloc_irq_table(iommu, devid, pdev); + table = alloc_irq_table(iommu, devid, pdev, max_irqs); if (!table) return -ENODEV; @@ -3093,7 +3310,7 @@ static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count, /* Scan table for free entries */ for (index = ALIGN(table->min_index, alignment), c = 0; - index < MAX_IRQS_PER_TABLE;) { + index < max_irqs;) { if (!iommu->irte_ops->is_allocated(table, index)) { c += 1; } else { @@ -3155,7 +3372,7 @@ static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, struct irte_ga *irte) { - bool ret; + int ret; ret = __modify_irte_ga(iommu, devid, index, irte); if (ret) @@ -3363,6 +3580,14 @@ static void fill_msi_msg(struct msi_msg *msg, u32 index) msg->data = index; msg->address_lo = 0; msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; + /* + * The struct msi_msg.dest_mode_logical is used to set the DM bit + * in the MSI Message Address Register. For devices w/ 2K int-remap support, + * this bit must be set to 1 regardless of the actual destination + * mode, which is signified by IRTE[DM]. + */ + if (FEATURE_NUM_INT_REMAP_SUP_2K(amd_iommu_efr2)) + msg->arch_addr_lo.dest_mode_logical = true; msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; } @@ -3425,6 +3650,8 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, struct amd_ir_data *data = NULL; struct amd_iommu *iommu; struct irq_cfg *cfg; + struct iommu_dev_data *dev_data; + unsigned long max_irqs; int i, ret, devid, seg, sbdf; int index; @@ -3443,6 +3670,9 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, if (!iommu) return -EINVAL; + dev_data = search_dev_data(iommu, devid); + max_irqs = dev_data ? 
dev_data->max_irqs : MAX_IRQS_PER_TABLE_512; + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); if (ret < 0) return ret; @@ -3450,7 +3680,7 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) { struct irq_remap_table *table; - table = alloc_irq_table(iommu, devid, NULL); + table = alloc_irq_table(iommu, devid, NULL, max_irqs); if (table) { if (!table->min_index) { /* @@ -3471,9 +3701,11 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI); index = alloc_irq_index(iommu, devid, nr_irqs, align, - msi_desc_to_pci_dev(info->desc)); + msi_desc_to_pci_dev(info->desc), + max_irqs); } else { - index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL); + index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL, + max_irqs); } if (index < 0) { @@ -3510,7 +3742,6 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, irq_data->chip_data = data; irq_data->chip = &amd_ir_chip; irq_remapping_prepare_irte(data, cfg, info, devid, index, i); - irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT); } return 0; @@ -3612,13 +3843,70 @@ static const struct irq_domain_ops amd_ir_domain_ops = { .deactivate = irq_remapping_deactivate, }; -int amd_iommu_activate_guest_mode(void *data) +static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu, + bool ga_log_intr) +{ + if (cpu >= 0) { + entry->lo.fields_vapic.destination = + APICID_TO_IRTE_DEST_LO(cpu); + entry->hi.fields.destination = + APICID_TO_IRTE_DEST_HI(cpu); + entry->lo.fields_vapic.is_run = true; + entry->lo.fields_vapic.ga_log_intr = false; + } else { + entry->lo.fields_vapic.is_run = false; + entry->lo.fields_vapic.ga_log_intr = ga_log_intr; + } +} + +/* + * Update the pCPU information for an IRTE that is configured to post IRQs to + * a vCPU, without issuing an IOMMU invalidation for the IRTE. + * + * If the vCPU is associated with a pCPU (@cpu >= 0), configure the Destination + * with the pCPU's APIC ID, set IsRun, and clear GALogIntr. If the vCPU isn't + * associated with a pCPU (@cpu < 0), clear IsRun and set/clear GALogIntr based + * on input from the caller (e.g. KVM only requests GALogIntr when the vCPU is + * blocking and requires a notification wake event). I.e. treat vCPUs that are + * associated with a pCPU as running. This API is intended to be used when a + * vCPU is scheduled in/out (or stops running for any reason), to do a fast + * update of IsRun, GALogIntr, and (conditionally) Destination. + * + * Per the IOMMU spec, the Destination, IsRun, and GATag fields are not cached + * and thus don't require an invalidation to ensure the IOMMU consumes fresh + * information. 
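+ * + * A minimal caller-side sketch (hypothetical KVM-style hooks; only the + * amd_iommu_update_ga() calls themselves are the real API): + * + * vCPU scheduled in on @cpu: amd_iommu_update_ga(ir_data, cpu, false); + * vCPU blocks, wants wake event: amd_iommu_update_ga(ir_data, -1, true); + * vCPU stops, no event needed: amd_iommu_update_ga(ir_data, -1, false);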
+ */ +int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr) +{ + struct amd_ir_data *ir_data = (struct amd_ir_data *)data; + struct irte_ga *entry = (struct irte_ga *) ir_data->entry; + + if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) + return -EINVAL; + + if (!entry || !entry->lo.fields_vapic.guest_mode) + return 0; + + if (!ir_data->iommu) + return -ENODEV; + + __amd_iommu_update_ga(entry, cpu, ga_log_intr); + + return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, + ir_data->irq_2_irte.index, entry); +} +EXPORT_SYMBOL(amd_iommu_update_ga); + +int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr) { struct amd_ir_data *ir_data = (struct amd_ir_data *)data; struct irte_ga *entry = (struct irte_ga *) ir_data->entry; u64 valid; - if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || !entry) + if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) + return -EINVAL; + + if (!entry) return 0; valid = entry->lo.fields_vapic.valid; @@ -3628,11 +3916,12 @@ int amd_iommu_activate_guest_mode(void *data) entry->lo.fields_vapic.valid = valid; entry->lo.fields_vapic.guest_mode = 1; - entry->lo.fields_vapic.ga_log_intr = 1; entry->hi.fields.ga_root_ptr = ir_data->ga_root_ptr; entry->hi.fields.vector = ir_data->ga_vector; entry->lo.fields_vapic.ga_tag = ir_data->ga_tag; + __amd_iommu_update_ga(entry, cpu, ga_log_intr); + return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, ir_data->irq_2_irte.index, entry); } @@ -3645,8 +3934,10 @@ int amd_iommu_deactivate_guest_mode(void *data) struct irq_cfg *cfg = ir_data->cfg; u64 valid; - if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || - !entry || !entry->lo.fields_vapic.guest_mode) + if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) + return -EINVAL; + + if (!entry || !entry->lo.fields_vapic.guest_mode) return 0; valid = entry->lo.fields_remap.valid; @@ -3668,15 +3959,17 @@ int amd_iommu_deactivate_guest_mode(void *data) } EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode); -static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info) +static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info) { int ret; - struct amd_iommu_pi_data *pi_data = vcpu_info; - struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data; + struct amd_iommu_pi_data *pi_data = info; struct amd_ir_data *ir_data = data->chip_data; struct irq_2_irte *irte_info = &ir_data->irq_2_irte; struct iommu_dev_data *dev_data; + if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) + return -EINVAL; + if (ir_data->iommu == NULL) return -EINVAL; @@ -3687,38 +3980,23 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info) * we should not modify the IRTE */ if (!dev_data || !dev_data->use_vapic) - return 0; + return -EINVAL; ir_data->cfg = irqd_cfg(data); - pi_data->ir_data = ir_data; - /* Note: - * SVM tries to set up for VAPIC mode, but we are in - * legacy mode. So, we force legacy mode instead. 
- */ - if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) { - pr_debug("%s: Fall back to using intr legacy remap\n", - __func__); - pi_data->is_guest_mode = false; - } + if (pi_data) { + pi_data->ir_data = ir_data; - pi_data->prev_ga_tag = ir_data->cached_ga_tag; - if (pi_data->is_guest_mode) { - ir_data->ga_root_ptr = (pi_data->base >> 12); - ir_data->ga_vector = vcpu_pi_info->vector; + ir_data->ga_root_ptr = (pi_data->vapic_addr >> 12); + ir_data->ga_vector = pi_data->vector; ir_data->ga_tag = pi_data->ga_tag; - ret = amd_iommu_activate_guest_mode(ir_data); - if (!ret) - ir_data->cached_ga_tag = pi_data->ga_tag; + if (pi_data->is_guest_mode) + ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu, + pi_data->ga_log_intr); + else + ret = amd_iommu_deactivate_guest_mode(ir_data); } else { ret = amd_iommu_deactivate_guest_mode(ir_data); - - /* - * This communicates the ga_tag back to the caller - * so that it can do all the necessary clean up. - */ - if (!ret) - ir_data->cached_ga_tag = 0; } return ret; @@ -3785,54 +4063,32 @@ static struct irq_chip amd_ir_chip = { static const struct msi_parent_ops amdvi_msi_parent_ops = { .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | MSI_FLAG_MULTI_PCI_MSI, + .bus_select_token = DOMAIN_BUS_AMDVI, + .bus_select_mask = MATCH_PCI_MSI, .prefix = "IR-", .init_dev_msi_info = msi_parent_init_dev_msi_info, }; int amd_iommu_create_irq_domain(struct amd_iommu *iommu) { - struct fwnode_handle *fn; + struct irq_domain_info info = { + .fwnode = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index), + .ops = &amd_ir_domain_ops, + .domain_flags = IRQ_DOMAIN_FLAG_ISOLATED_MSI, + .host_data = iommu, + .parent = arch_get_ir_parent_domain(), + }; - fn = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index); - if (!fn) + if (!info.fwnode) return -ENOMEM; - iommu->ir_domain = irq_domain_create_hierarchy(arch_get_ir_parent_domain(), 0, 0, - fn, &amd_ir_domain_ops, iommu); + + iommu->ir_domain = msi_create_parent_irq_domain(&info, &amdvi_msi_parent_ops); if (!iommu->ir_domain) { - irq_domain_free_fwnode(fn); + irq_domain_free_fwnode(info.fwnode); return -ENOMEM; } - - irq_domain_update_bus_token(iommu->ir_domain, DOMAIN_BUS_AMDVI); - iommu->ir_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT | - IRQ_DOMAIN_FLAG_ISOLATED_MSI; - iommu->ir_domain->msi_parent_ops = &amdvi_msi_parent_ops; - return 0; } - -int amd_iommu_update_ga(int cpu, bool is_run, void *data) -{ - struct amd_ir_data *ir_data = (struct amd_ir_data *)data; - struct irte_ga *entry = (struct irte_ga *) ir_data->entry; - - if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || - !entry || !entry->lo.fields_vapic.guest_mode) - return 0; - - if (!ir_data->iommu) - return -ENODEV; - - if (cpu >= 0) { - entry->lo.fields_vapic.destination = - APICID_TO_IRTE_DEST_LO(cpu); - entry->hi.fields.destination = - APICID_TO_IRTE_DEST_HI(cpu); - } - entry->lo.fields_vapic.is_run = is_run; - - return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, - ir_data->irq_2_irte.index, entry); -} -EXPORT_SYMBOL(amd_iommu_update_ga); #endif + +MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); diff --git a/drivers/iommu/amd/pasid.c b/drivers/iommu/amd/pasid.c index a68215f2b3e1..77c8e9a91cbc 100644 --- a/drivers/iommu/amd/pasid.c +++ b/drivers/iommu/amd/pasid.c @@ -100,7 +100,8 @@ static const struct mmu_notifier_ops sva_mn = { }; int iommu_sva_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { struct pdom_dev_data *pdom_dev_data; struct 
protection_domain *sva_pdom = to_pdomain(domain); @@ -108,6 +109,9 @@ int iommu_sva_set_dev_pasid(struct iommu_domain *domain, unsigned long flags; int ret = -EINVAL; + if (old) + return -EOPNOTSUPP; + /* PASID zero is used for requests from the I/O device without PASID */ if (!is_pasid_valid(dev_data, pasid)) return ret; @@ -181,16 +185,17 @@ struct iommu_domain *amd_iommu_domain_alloc_sva(struct device *dev, struct protection_domain *pdom; int ret; - pdom = protection_domain_alloc(IOMMU_DOMAIN_SVA); + pdom = protection_domain_alloc(); if (!pdom) return ERR_PTR(-ENOMEM); pdom->domain.ops = &amd_sva_domain_ops; pdom->mn.ops = &sva_mn; + pdom->domain.type = IOMMU_DOMAIN_SVA; ret = mmu_notifier_register(&pdom->mn, mm); if (ret) { - protection_domain_free(pdom); + amd_iommu_domain_free(&pdom->domain); return ERR_PTR(ret); } diff --git a/drivers/iommu/amd/ppr.c b/drivers/iommu/amd/ppr.c index 7c67d69f0b8c..e6767c057d01 100644 --- a/drivers/iommu/amd/ppr.c +++ b/drivers/iommu/amd/ppr.c @@ -48,7 +48,7 @@ void amd_iommu_enable_ppr_log(struct amd_iommu *iommu) void __init amd_iommu_free_ppr_log(struct amd_iommu *iommu) { - iommu_free_pages(iommu->ppr_log, get_order(PPR_LOG_SIZE)); + iommu_free_pages(iommu->ppr_log); } /* diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index eb1e62cd499a..83a5aabcd15d 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -36,7 +36,7 @@ #define DART_MAX_STREAMS 256 #define DART_MAX_TTBR 4 -#define MAX_DARTS_PER_DEVICE 2 +#define MAX_DARTS_PER_DEVICE 3 /* Common registers */ @@ -122,6 +122,8 @@ #define DART_T8110_ERROR_ADDR_LO 0x170 #define DART_T8110_ERROR_ADDR_HI 0x174 +#define DART_T8110_ERROR_STREAMS 0x1c0 + #define DART_T8110_PROTECT 0x200 #define DART_T8110_UNPROTECT 0x204 #define DART_T8110_PROTECT_LOCK 0x208 @@ -133,6 +135,7 @@ #define DART_T8110_TCR 0x1000 #define DART_T8110_TCR_REMAP GENMASK(11, 8) #define DART_T8110_TCR_REMAP_EN BIT(7) +#define DART_T8110_TCR_FOUR_LEVEL BIT(3) #define DART_T8110_TCR_BYPASS_DAPF BIT(2) #define DART_T8110_TCR_BYPASS_DART BIT(1) #define DART_T8110_TCR_TRANSLATE_ENABLE BIT(0) @@ -166,22 +169,23 @@ struct apple_dart_hw { int max_sid_count; - u64 lock; - u64 lock_bit; + u32 lock; + u32 lock_bit; - u64 error; + u32 error; - u64 enable_streams; + u32 enable_streams; - u64 tcr; - u64 tcr_enabled; - u64 tcr_disabled; - u64 tcr_bypass; + u32 tcr; + u32 tcr_enabled; + u32 tcr_disabled; + u32 tcr_bypass; + u32 tcr_4level; - u64 ttbr; - u64 ttbr_valid; - u64 ttbr_addr_field_shift; - u64 ttbr_shift; + u32 ttbr; + u32 ttbr_valid; + u32 ttbr_addr_field_shift; + u32 ttbr_shift; int ttbr_count; }; @@ -217,6 +221,7 @@ struct apple_dart { u32 pgsize; u32 num_streams; u32 supports_bypass : 1; + u32 four_level : 1; struct iommu_group *sid2group[DART_MAX_STREAMS]; struct iommu_device iommu; @@ -277,6 +282,9 @@ struct apple_dart_domain { * @streams: streams for this device */ struct apple_dart_master_cfg { + /* Intersection of DART capabilities */ + u32 supports_bypass : 1; + struct apple_dart_stream_map stream_maps[MAX_DARTS_PER_DEVICE]; }; @@ -302,13 +310,19 @@ static struct apple_dart_domain *to_dart_domain(struct iommu_domain *dom) } static void -apple_dart_hw_enable_translation(struct apple_dart_stream_map *stream_map) +apple_dart_hw_enable_translation(struct apple_dart_stream_map *stream_map, int levels) { struct apple_dart *dart = stream_map->dart; + u32 tcr = dart->hw->tcr_enabled; int sid; + if (levels == 4) + tcr |= dart->hw->tcr_4level; + + WARN_ON(levels != 3 && levels != 4); + WARN_ON(levels 
== 4 && !dart->four_level); for_each_set_bit(sid, stream_map->sidmap, dart->num_streams) - writel(dart->hw->tcr_enabled, dart->regs + DART_TCR(dart, sid)); + writel(tcr, dart->regs + DART_TCR(dart, sid)); } static void apple_dart_hw_disable_dma(struct apple_dart_stream_map *stream_map) @@ -566,7 +580,8 @@ apple_dart_setup_translation(struct apple_dart_domain *domain, for (; i < stream_map->dart->hw->ttbr_count; ++i) apple_dart_hw_clear_ttbr(stream_map, i); - apple_dart_hw_enable_translation(stream_map); + apple_dart_hw_enable_translation(stream_map, + pgtbl_cfg->apple_dart_cfg.n_levels); stream_map->dart->hw->invalidate_tlb(stream_map); } @@ -611,7 +626,7 @@ static int apple_dart_finalize_domain(struct apple_dart_domain *dart_domain, dart_domain->domain.pgsize_bitmap = pgtbl_cfg.pgsize_bitmap; dart_domain->domain.geometry.aperture_start = 0; dart_domain->domain.geometry.aperture_end = - (dma_addr_t)DMA_BIT_MASK(dart->ias); + (dma_addr_t)DMA_BIT_MASK(pgtbl_cfg.ias); dart_domain->domain.geometry.force_aperture = true; dart_domain->finalized = true; @@ -657,7 +672,8 @@ static int apple_dart_domain_add_streams(struct apple_dart_domain *domain, } static int apple_dart_attach_dev_paging(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { int ret, i; struct apple_dart_stream_map *stream_map; @@ -678,13 +694,14 @@ static int apple_dart_attach_dev_paging(struct iommu_domain *domain, } static int apple_dart_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); struct apple_dart_stream_map *stream_map; int i; - if (!cfg->stream_maps[0].dart->supports_bypass) + if (!cfg->supports_bypass) return -EINVAL; for_each_stream_map(i, cfg, stream_map) @@ -702,7 +719,8 @@ static struct iommu_domain apple_dart_identity_domain = { }; static int apple_dart_attach_dev_blocked(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); struct apple_dart_stream_map *stream_map; @@ -773,8 +791,7 @@ static void apple_dart_domain_free(struct iommu_domain *domain) { struct apple_dart_domain *dart_domain = to_dart_domain(domain); - if (dart_domain->pgtbl_ops) - free_io_pgtable_ops(dart_domain->pgtbl_ops); + free_io_pgtable_ops(dart_domain->pgtbl_ops); kfree(dart_domain); } @@ -788,24 +805,31 @@ static int apple_dart_of_xlate(struct device *dev, struct apple_dart *cfg_dart; int i, sid; + put_device(&iommu_pdev->dev); + if (args->args_count != 1) return -EINVAL; sid = args->args[0]; - if (!cfg) + if (!cfg) { cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); - if (!cfg) - return -ENOMEM; + if (!cfg) + return -ENOMEM; + /* Will be ANDed with DART capabilities */ + cfg->supports_bypass = true; + } dev_iommu_priv_set(dev, cfg); cfg_dart = cfg->stream_maps[0].dart; if (cfg_dart) { - if (cfg_dart->supports_bypass != dart->supports_bypass) - return -EINVAL; if (cfg_dart->pgsize != dart->pgsize) return -EINVAL; + if (cfg_dart->ias != dart->ias) + return -EINVAL; } + cfg->supports_bypass &= dart->supports_bypass; + for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) { if (cfg->stream_maps[i].dart == dart) { set_bit(sid, cfg->stream_maps[i].sidmap); @@ -945,7 +969,7 @@ static int apple_dart_def_domain_type(struct device *dev) if (cfg->stream_maps[0].dart->pgsize > PAGE_SIZE) return IOMMU_DOMAIN_IDENTITY; - if (!cfg->stream_maps[0].dart->supports_bypass) + if 
(!cfg->supports_bypass) return IOMMU_DOMAIN_DMA; return 0; @@ -986,7 +1010,6 @@ static const struct iommu_ops apple_dart_iommu_ops = { .of_xlate = apple_dart_of_xlate, .def_domain_type = apple_dart_def_domain_type, .get_resv_regions = apple_dart_get_resv_regions, - .pgsize_bitmap = -1UL, /* Restricted during dart probe */ .owner = THIS_MODULE, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = apple_dart_attach_dev_paging, @@ -1073,6 +1096,9 @@ static irqreturn_t apple_dart_t8110_irq(int irq, void *dev) error, stream_idx, error_code, fault_name, addr); writel(error, dart->regs + DART_T8110_ERROR); + for (int i = 0; i < BITS_TO_U32(dart->num_streams); i++) + writel(U32_MAX, dart->regs + DART_T8110_ERROR_STREAMS + 4 * i); + return IRQ_HANDLED; } @@ -1133,6 +1159,7 @@ static int apple_dart_probe(struct platform_device *pdev) dart->ias = FIELD_GET(DART_T8110_PARAMS3_VA_WIDTH, dart_params[2]); dart->oas = FIELD_GET(DART_T8110_PARAMS3_PA_WIDTH, dart_params[2]); dart->num_streams = FIELD_GET(DART_T8110_PARAMS4_NUM_SIDS, dart_params[3]); + dart->four_level = dart->ias > 36; break; } @@ -1165,9 +1192,9 @@ static int apple_dart_probe(struct platform_device *pdev) dev_info( &pdev->dev, - "DART [pagesize %x, %d streams, bypass support: %d, bypass forced: %d] initialized\n", + "DART [pagesize %x, %d streams, bypass support: %d, bypass forced: %d, AS %d -> %d] initialized\n", dart->pgsize, dart->num_streams, dart->supports_bypass, - dart->pgsize > PAGE_SIZE); + dart->pgsize > PAGE_SIZE, dart->ias, dart->oas); return 0; err_sysfs_remove: @@ -1288,6 +1315,7 @@ static const struct apple_dart_hw apple_dart_hw_t8110 = { .tcr_enabled = DART_T8110_TCR_TRANSLATE_ENABLE, .tcr_disabled = 0, .tcr_bypass = DART_T8110_TCR_BYPASS_DAPF | DART_T8110_TCR_BYPASS_DART, + .tcr_4level = DART_T8110_TCR_FOUR_LEVEL, .ttbr = DART_T8110_TTBR, .ttbr_valid = DART_T8110_TTBR_VALID, @@ -1352,7 +1380,7 @@ static struct platform_driver apple_dart_driver = { .pm = pm_sleep_ptr(&apple_dart_pm_ops), }, .probe = apple_dart_probe, - .remove_new = apple_dart_remove, + .remove = apple_dart_remove, }; module_platform_driver(apple_dart_driver); diff --git a/drivers/iommu/arm/Kconfig b/drivers/iommu/arm/Kconfig new file mode 100644 index 000000000000..ef42bbe07dbe --- /dev/null +++ b/drivers/iommu/arm/Kconfig @@ -0,0 +1,144 @@ +# SPDX-License-Identifier: GPL-2.0-only +# ARM IOMMU support +config ARM_SMMU + tristate "ARM Ltd. System MMU (SMMU) Support" + depends on ARM64 || ARM || COMPILE_TEST + depends on !GENERIC_ATOMIC64 # for IOMMU_IO_PGTABLE_LPAE + select IOMMU_API + select IOMMU_IO_PGTABLE_LPAE + select ARM_DMA_USE_IOMMU if ARM + help + Support for implementations of the ARM System MMU architecture + versions 1 and 2. + + Say Y here if your SoC includes an IOMMU device implementing + the ARM SMMU architecture. + +if ARM_SMMU +config ARM_SMMU_LEGACY_DT_BINDINGS + bool "Support the legacy \"mmu-masters\" devicetree bindings" + depends on ARM_SMMU=y && OF + help + Support for the badly designed and deprecated "mmu-masters" + devicetree bindings. This allows some DMA masters to attach + to the SMMU but does not provide any support via the DMA API. + If you're lucky, you might be able to get VFIO up and running. + + If you say Y here then you'll make me very sad. Instead, say N + and move your firmware to the utopian future that was 2016. 
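+# +# For reference, a legacy description looked roughly like this (illustrative +# devicetree fragment only, not a binding reference): +# +# smmu: iommu@2b400000 { +# compatible = "arm,mmu-500"; +# ... +# mmu-masters = <&dma0 0xd01 0xd02>; +# };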
+ +config ARM_SMMU_DISABLE_BYPASS_BY_DEFAULT + bool "Disable unmatched stream bypass by default" if EXPERT + default y + help + If your firmware is broken and fails to describe StreamIDs which + Linux should know about in order to manage the SMMU correctly and + securely, and you don't want to boot with the 'arm-smmu.disable_bypass=0' + command line parameter, then as a last resort you can turn it off + by default here. But don't. This option may be removed at any time. + + Note that 'arm-smmu.disable_bypass=1' will still take precedence. + +config ARM_SMMU_MMU_500_CPRE_ERRATA + bool "Enable errata workaround for CPRE in SMMU reset path" + default y + help + Say Y here (by default) to apply a workaround to disable the + MMU-500's next-page prefetcher, for the sake of 4 known errata. + + Say N here only if you are sure that any errata related to + prefetch enablement are not applicable on the platform. + Refer to silicon-errata.rst for info on errata IDs. + +config ARM_SMMU_QCOM + def_tristate y + depends on ARCH_QCOM + select QCOM_SCM + help + When running on a Qualcomm platform that has the custom variant + of the ARM SMMU, this needs to be built into the SMMU driver. + +config ARM_SMMU_QCOM_DEBUG + bool "ARM SMMU QCOM implementation defined debug support" + depends on ARM_SMMU_QCOM=y + help + Support for implementation specific debug features in ARM SMMU + hardware found in QTI platforms. This includes support for + the Translation Buffer Units (TBU) that can be used to obtain + additional information when debugging memory management issues + like context faults. + + Say Y here to enable debug for issues such as context faults + or TLB sync timeouts which require implementation defined + register dumps. +endif + +config ARM_SMMU_V3 + tristate "ARM Ltd. System MMU Version 3 (SMMUv3) Support" + depends on ARM64 + select IOMMU_API + select IOMMU_IO_PGTABLE_LPAE + select GENERIC_MSI_IRQ + select IOMMUFD_DRIVER if IOMMUFD + help + Support for implementations of the ARM System MMU architecture + version 3 providing translation support to a PCIe root complex. + + Say Y here if your system includes an IOMMU device implementing + the ARM SMMUv3 architecture. + +if ARM_SMMU_V3 +config ARM_SMMU_V3_SVA + bool "Shared Virtual Addressing support for the ARM SMMUv3" + select IOMMU_SVA + select IOMMU_IOPF + select MMU_NOTIFIER + help + Support for sharing process address spaces with devices using the + SMMUv3. + + Say Y here if your system supports SVA extensions such as PCIe PASID + and PRI. + +config ARM_SMMU_V3_IOMMUFD + bool "Enable IOMMUFD features for ARM SMMUv3 (EXPERIMENTAL)" + depends on IOMMUFD + help + Support for IOMMUFD features intended to support virtual machines + with accelerated virtual IOMMUs. + + Say Y here if you are doing development and testing on this feature. + +config ARM_SMMU_V3_KUNIT_TEST + tristate "KUnit tests for arm-smmu-v3 driver" if !KUNIT_ALL_TESTS + depends on KUNIT + depends on ARM_SMMU_V3_SVA + default KUNIT_ALL_TESTS + help + Enable this option to unit-test arm-smmu-v3 driver functions. + + If unsure, say N. + +config TEGRA241_CMDQV + bool "NVIDIA Tegra241 CMDQ-V extension support for ARM SMMUv3" + depends on ACPI + help + Support for NVIDIA CMDQ-Virtualization extension for ARM SMMUv3. The + CMDQ-V extension is similar to v3.3 ECMDQ for multi command queue + support, except with virtualization capabilities. + + Say Y here if your system is NVIDIA Tegra241 (Grace) or has the same + CMDQ-V extension. +endif + +config QCOM_IOMMU + # Note: iommu drivers cannot (yet?) 
be built as modules + bool "Qualcomm IOMMU Support" + depends on ARCH_QCOM || COMPILE_TEST + depends on !GENERIC_ATOMIC64 # for IOMMU_IO_PGTABLE_LPAE + select QCOM_SCM + select IOMMU_API + select IOMMU_IO_PGTABLE_LPAE + select ARM_DMA_USE_IOMMU + help + Support for IOMMU on certain Qualcomm SoCs. diff --git a/drivers/iommu/arm/arm-smmu-v3/Makefile b/drivers/iommu/arm/arm-smmu-v3/Makefile index 014a997753a8..493a659cc66b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/Makefile +++ b/drivers/iommu/arm/arm-smmu-v3/Makefile @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_ARM_SMMU_V3) += arm_smmu_v3.o -arm_smmu_v3-objs-y += arm-smmu-v3.o -arm_smmu_v3-objs-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o -arm_smmu_v3-objs := $(arm_smmu_v3-objs-y) +arm_smmu_v3-y := arm-smmu-v3.o +arm_smmu_v3-$(CONFIG_ARM_SMMU_V3_IOMMUFD) += arm-smmu-v3-iommufd.o +arm_smmu_v3-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o +arm_smmu_v3-$(CONFIG_TEGRA241_CMDQV) += tegra241-cmdqv.o obj-$(CONFIG_ARM_SMMU_V3_KUNIT_TEST) += arm-smmu-v3-test.o diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c new file mode 100644 index 000000000000..93fdadd07431 --- /dev/null +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -0,0 +1,485 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ + +#include <uapi/linux/iommufd.h> + +#include "arm-smmu-v3.h" + +void *arm_smmu_hw_info(struct device *dev, u32 *length, + enum iommu_hw_info_type *type) +{ + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + const struct arm_smmu_impl_ops *impl_ops = master->smmu->impl_ops; + struct iommu_hw_info_arm_smmuv3 *info; + u32 __iomem *base_idr; + unsigned int i; + + if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && + *type != IOMMU_HW_INFO_TYPE_ARM_SMMUV3) { + if (!impl_ops || !impl_ops->hw_info) + return ERR_PTR(-EOPNOTSUPP); + return impl_ops->hw_info(master->smmu, length, type); + } + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + base_idr = master->smmu->base + ARM_SMMU_IDR0; + for (i = 0; i <= 5; i++) + info->idr[i] = readl_relaxed(base_idr + i); + info->iidr = readl_relaxed(master->smmu->base + ARM_SMMU_IIDR); + info->aidr = readl_relaxed(master->smmu->base + ARM_SMMU_AIDR); + + *length = sizeof(*info); + *type = IOMMU_HW_INFO_TYPE_ARM_SMMUV3; + + return info; +} + +static void arm_smmu_make_nested_cd_table_ste( + struct arm_smmu_ste *target, struct arm_smmu_master *master, + struct arm_smmu_nested_domain *nested_domain, bool ats_enabled) +{ + arm_smmu_make_s2_domain_ste( + target, master, nested_domain->vsmmu->s2_parent, ats_enabled); + + target->data[0] = cpu_to_le64(STRTAB_STE_0_V | + FIELD_PREP(STRTAB_STE_0_CFG, + STRTAB_STE_0_CFG_NESTED)); + target->data[0] |= nested_domain->ste[0] & + ~cpu_to_le64(STRTAB_STE_0_CFG); + target->data[1] |= nested_domain->ste[1]; + /* Merge events for DoS mitigations on eventq */ + target->data[1] |= cpu_to_le64(STRTAB_STE_1_MEV); +} + +/* + * Create a physical STE from the virtual STE that userspace provided when it + * created the nested domain. 
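The hardware never sees the vSTE directly; the host folds it into a real STE together with the S2 fields it owns, as the switch below shows. 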
Using the vSTE userspace can request: + * - Non-valid STE + * - Abort STE + * - Bypass STE (install the S2, no CD table) + * - CD table STE (install the S2 and the userspace CD table) + */ +static void arm_smmu_make_nested_domain_ste( + struct arm_smmu_ste *target, struct arm_smmu_master *master, + struct arm_smmu_nested_domain *nested_domain, bool ats_enabled) +{ + unsigned int cfg = + FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(nested_domain->ste[0])); + + /* + * Userspace can request a non-valid STE through the nesting interface. + * We relay that into an abort physical STE with the intention that + * C_BAD_STE for this SID can be generated to userspace. + */ + if (!(nested_domain->ste[0] & cpu_to_le64(STRTAB_STE_0_V))) + cfg = STRTAB_STE_0_CFG_ABORT; + + switch (cfg) { + case STRTAB_STE_0_CFG_S1_TRANS: + arm_smmu_make_nested_cd_table_ste(target, master, nested_domain, + ats_enabled); + break; + case STRTAB_STE_0_CFG_BYPASS: + arm_smmu_make_s2_domain_ste(target, master, + nested_domain->vsmmu->s2_parent, + ats_enabled); + break; + case STRTAB_STE_0_CFG_ABORT: + default: + arm_smmu_make_abort_ste(target); + break; + } +} + +int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, + struct arm_smmu_nested_domain *nested_domain) +{ + unsigned int cfg = + FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(nested_domain->ste[0])); + struct arm_smmu_vmaster *vmaster; + unsigned long vsid; + int ret; + + iommu_group_mutex_assert(state->master->dev); + + ret = iommufd_viommu_get_vdev_id(&nested_domain->vsmmu->core, + state->master->dev, &vsid); + /* + * Attaching to a translating nested domain requires a vDEVICE to be + * allocated first, as CD/ATS invalidations and vevents require a vSID + * to work properly. + * An abort/bypass domain is allowed to attach w/o a vmaster for the GBPA case. + */ + if (ret) { + if (cfg == STRTAB_STE_0_CFG_ABORT || + cfg == STRTAB_STE_0_CFG_BYPASS) + return 0; + return ret; + } + + vmaster = kzalloc(sizeof(*vmaster), GFP_KERNEL); + if (!vmaster) + return -ENOMEM; + vmaster->vsmmu = nested_domain->vsmmu; + vmaster->vsid = vsid; + state->vmaster = vmaster; + + return 0; +} + +void arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state) +{ + struct arm_smmu_master *master = state->master; + + mutex_lock(&master->smmu->streams_mutex); + kfree(master->vmaster); + master->vmaster = state->vmaster; + mutex_unlock(&master->smmu->streams_mutex); +} + +void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master) +{ + struct arm_smmu_attach_state state = { .master = master }; + + arm_smmu_attach_commit_vmaster(&state); +} + +static int arm_smmu_attach_dev_nested(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old_domain) +{ + struct arm_smmu_nested_domain *nested_domain = + to_smmu_nested_domain(domain); + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_attach_state state = { + .master = master, + .old_domain = old_domain, + .ssid = IOMMU_NO_PASID, + }; + struct arm_smmu_ste ste; + int ret; + + if (nested_domain->vsmmu->smmu != master->smmu) + return -EINVAL; + if (arm_smmu_ssids_in_use(&master->cd_table)) + return -EBUSY; + + mutex_lock(&arm_smmu_asid_lock); + /* + * The VM has to control the actual ATS state at the PCI device because + * we forward the invalidations directly from the VM. If the VM doesn't + * think ATS is on it will not generate ATC flushes and the ATC will + * become incoherent. Since we can't access the actual virtual PCI ATS + * config bit here, base this off the EATS value in the STE. 
If the EATS + * is set then the VM must generate ATC flushes. + */ + state.disable_ats = !nested_domain->enable_ats; + ret = arm_smmu_attach_prepare(&state, domain); + if (ret) { + mutex_unlock(&arm_smmu_asid_lock); + return ret; + } + + arm_smmu_make_nested_domain_ste(&ste, master, nested_domain, + state.ats_enabled); + arm_smmu_install_ste_for_dev(master, &ste); + arm_smmu_attach_commit(&state); + mutex_unlock(&arm_smmu_asid_lock); + return 0; +} + +static void arm_smmu_domain_nested_free(struct iommu_domain *domain) +{ + kfree(to_smmu_nested_domain(domain)); +} + +static const struct iommu_domain_ops arm_smmu_nested_ops = { + .attach_dev = arm_smmu_attach_dev_nested, + .free = arm_smmu_domain_nested_free, +}; + +static int arm_smmu_validate_vste(struct iommu_hwpt_arm_smmuv3 *arg, + bool *enable_ats) +{ + unsigned int eats; + unsigned int cfg; + + if (!(arg->ste[0] & cpu_to_le64(STRTAB_STE_0_V))) { + memset(arg->ste, 0, sizeof(arg->ste)); + return 0; + } + + /* EIO is reserved for invalid STE data. */ + if ((arg->ste[0] & ~STRTAB_STE_0_NESTING_ALLOWED) || + (arg->ste[1] & ~STRTAB_STE_1_NESTING_ALLOWED)) + return -EIO; + + cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(arg->ste[0])); + if (cfg != STRTAB_STE_0_CFG_ABORT && cfg != STRTAB_STE_0_CFG_BYPASS && + cfg != STRTAB_STE_0_CFG_S1_TRANS) + return -EIO; + + /* + * Only Full ATS or ATS UR is supported + * The EATS field will be set by arm_smmu_make_nested_domain_ste() + */ + eats = FIELD_GET(STRTAB_STE_1_EATS, le64_to_cpu(arg->ste[1])); + arg->ste[1] &= ~cpu_to_le64(STRTAB_STE_1_EATS); + if (eats != STRTAB_STE_1_EATS_ABT && eats != STRTAB_STE_1_EATS_TRANS) + return -EIO; + + if (cfg == STRTAB_STE_0_CFG_S1_TRANS) + *enable_ats = (eats == STRTAB_STE_1_EATS_TRANS); + return 0; +} + +struct iommu_domain * +arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, + const struct iommu_user_data *user_data) +{ + struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core); + struct arm_smmu_nested_domain *nested_domain; + struct iommu_hwpt_arm_smmuv3 arg; + bool enable_ats = false; + int ret; + + if (flags) + return ERR_PTR(-EOPNOTSUPP); + + ret = iommu_copy_struct_from_user(&arg, user_data, + IOMMU_HWPT_DATA_ARM_SMMUV3, ste); + if (ret) + return ERR_PTR(ret); + + ret = arm_smmu_validate_vste(&arg, &enable_ats); + if (ret) + return ERR_PTR(ret); + + nested_domain = kzalloc(sizeof(*nested_domain), GFP_KERNEL_ACCOUNT); + if (!nested_domain) + return ERR_PTR(-ENOMEM); + + nested_domain->domain.type = IOMMU_DOMAIN_NESTED; + nested_domain->domain.ops = &arm_smmu_nested_ops; + nested_domain->enable_ats = enable_ats; + nested_domain->vsmmu = vsmmu; + nested_domain->ste[0] = arg.ste[0]; + nested_domain->ste[1] = arg.ste[1] & ~cpu_to_le64(STRTAB_STE_1_EATS); + + return &nested_domain->domain; +} + +static int arm_vsmmu_vsid_to_sid(struct arm_vsmmu *vsmmu, u32 vsid, u32 *sid) +{ + struct arm_smmu_master *master; + struct device *dev; + int ret = 0; + + xa_lock(&vsmmu->core.vdevs); + dev = iommufd_viommu_find_dev(&vsmmu->core, (unsigned long)vsid); + if (!dev) { + ret = -EIO; + goto unlock; + } + master = dev_iommu_priv_get(dev); + + /* At this moment, iommufd only supports PCI device that has one SID */ + if (sid) + *sid = master->streams[0].id; +unlock: + xa_unlock(&vsmmu->core.vdevs); + return ret; +} + +/* This is basically iommu_viommu_arm_smmuv3_invalidate in u64 for conversion */ +struct arm_vsmmu_invalidation_cmd { + union { + u64 cmd[2]; + struct iommu_viommu_arm_smmuv3_invalidate ucmd; + }; +}; + +/* + * Convert, in 
place, the raw invalidation command into an internal format that + * can be passed to arm_smmu_cmdq_issue_cmdlist(). Internally commands are + * stored in CPU endian. + * + * Enforce the VMID or SID on the command. + */ +static int arm_vsmmu_convert_user_cmd(struct arm_vsmmu *vsmmu, + struct arm_vsmmu_invalidation_cmd *cmd) +{ + /* Commands are le64 stored in u64 */ + cmd->cmd[0] = le64_to_cpu(cmd->ucmd.cmd[0]); + cmd->cmd[1] = le64_to_cpu(cmd->ucmd.cmd[1]); + + switch (cmd->cmd[0] & CMDQ_0_OP) { + case CMDQ_OP_TLBI_NSNH_ALL: + /* Convert to NH_ALL */ + cmd->cmd[0] = CMDQ_OP_TLBI_NH_ALL | + FIELD_PREP(CMDQ_TLBI_0_VMID, vsmmu->vmid); + cmd->cmd[1] = 0; + break; + case CMDQ_OP_TLBI_NH_VA: + case CMDQ_OP_TLBI_NH_VAA: + case CMDQ_OP_TLBI_NH_ALL: + case CMDQ_OP_TLBI_NH_ASID: + cmd->cmd[0] &= ~CMDQ_TLBI_0_VMID; + cmd->cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, vsmmu->vmid); + break; + case CMDQ_OP_ATC_INV: + case CMDQ_OP_CFGI_CD: + case CMDQ_OP_CFGI_CD_ALL: { + u32 sid, vsid = FIELD_GET(CMDQ_CFGI_0_SID, cmd->cmd[0]); + + if (arm_vsmmu_vsid_to_sid(vsmmu, vsid, &sid)) + return -EIO; + cmd->cmd[0] &= ~CMDQ_CFGI_0_SID; + cmd->cmd[0] |= FIELD_PREP(CMDQ_CFGI_0_SID, sid); + break; + } + default: + return -EIO; + } + return 0; +} + +int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu, + struct iommu_user_data_array *array) +{ + struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core); + struct arm_smmu_device *smmu = vsmmu->smmu; + struct arm_vsmmu_invalidation_cmd *last; + struct arm_vsmmu_invalidation_cmd *cmds; + struct arm_vsmmu_invalidation_cmd *cur; + struct arm_vsmmu_invalidation_cmd *end; + int ret; + + cmds = kcalloc(array->entry_num, sizeof(*cmds), GFP_KERNEL); + if (!cmds) + return -ENOMEM; + cur = cmds; + end = cmds + array->entry_num; + + static_assert(sizeof(*cmds) == 2 * sizeof(u64)); + ret = iommu_copy_struct_from_full_user_array( + cmds, sizeof(*cmds), array, + IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3); + if (ret) + goto out; + + last = cmds; + while (cur != end) { + ret = arm_vsmmu_convert_user_cmd(vsmmu, cur); + if (ret) + goto out; + + /* FIXME work in blocks of CMDQ_BATCH_ENTRIES and copy each block? */ + cur++; + if (cur != end && (cur - last) != CMDQ_BATCH_ENTRIES - 1) + continue; + + /* FIXME always uses the main cmdq rather than trying to group by type */ + ret = arm_smmu_cmdq_issue_cmdlist(smmu, &smmu->cmdq, last->cmd, + cur - last, true); + if (ret) { + cur--; + goto out; + } + last = cur; + } +out: + array->entry_num = cur - cmds; + kfree(cmds); + return ret; +} + +static const struct iommufd_viommu_ops arm_vsmmu_ops = { + .alloc_domain_nested = arm_vsmmu_alloc_domain_nested, + .cache_invalidate = arm_vsmmu_cache_invalidate, +}; + +size_t arm_smmu_get_viommu_size(struct device *dev, + enum iommu_viommu_type viommu_type) +{ + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_device *smmu = master->smmu; + + if (!(smmu->features & ARM_SMMU_FEAT_NESTING)) + return 0; + + /* + * FORCE_SYNC is not set with FEAT_NESTING. Some study of the exact HW + * defect is needed to determine if arm_vsmmu_cache_invalidate() needs + * any change to remove this. + */ + if (WARN_ON(smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC)) + return 0; + + /* + * Must support some way to prevent the VM from bypassing the cache + * because VFIO currently does not do any cache maintenance. canwbs + * indicates the device is fully coherent and no cache maintenance is + * ever required, even for PCI No-Snoop. 
S2FWB means the S1 can't make + * things non-coherent using the memattr, but No-Snoop behavior is not + * effected. + */ + if (!arm_smmu_master_canwbs(master) && + !(smmu->features & ARM_SMMU_FEAT_S2FWB)) + return 0; + + if (viommu_type == IOMMU_VIOMMU_TYPE_ARM_SMMUV3) + return VIOMMU_STRUCT_SIZE(struct arm_vsmmu, core); + + if (!smmu->impl_ops || !smmu->impl_ops->get_viommu_size) + return 0; + return smmu->impl_ops->get_viommu_size(viommu_type); +} + +int arm_vsmmu_init(struct iommufd_viommu *viommu, + struct iommu_domain *parent_domain, + const struct iommu_user_data *user_data) +{ + struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core); + struct arm_smmu_device *smmu = + container_of(viommu->iommu_dev, struct arm_smmu_device, iommu); + struct arm_smmu_domain *s2_parent = to_smmu_domain(parent_domain); + + if (s2_parent->smmu != smmu) + return -EINVAL; + + vsmmu->smmu = smmu; + vsmmu->s2_parent = s2_parent; + /* FIXME Move VMID allocation from the S2 domain allocation to here */ + vsmmu->vmid = s2_parent->s2_cfg.vmid; + + if (viommu->type == IOMMU_VIOMMU_TYPE_ARM_SMMUV3) { + viommu->ops = &arm_vsmmu_ops; + return 0; + } + + return smmu->impl_ops->vsmmu_init(vsmmu, user_data); +} + +int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, u64 *evt) +{ + struct iommu_vevent_arm_smmuv3 vevt; + int i; + + lockdep_assert_held(&vmaster->vsmmu->smmu->streams_mutex); + + vevt.evt[0] = cpu_to_le64((evt[0] & ~EVTQ_0_SID) | + FIELD_PREP(EVTQ_0_SID, vmaster->vsid)); + for (i = 1; i < EVTQ_ENT_DWORDS; i++) + vevt.evt[i] = cpu_to_le64(evt[i]); + + return iommufd_viommu_report_event(&vmaster->vsmmu->core, + IOMMU_VEVENTQ_TYPE_ARM_SMMUV3, &vevt, + sizeof(vevt)); +} + +MODULE_IMPORT_NS("IOMMUFD"); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index e490ffb38015..59a480974d80 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -13,103 +13,29 @@ #include "arm-smmu-v3.h" #include "../../io-pgtable-arm.h" -struct arm_smmu_mmu_notifier { - struct mmu_notifier mn; - struct arm_smmu_ctx_desc *cd; - bool cleared; - refcount_t refs; - struct list_head list; - struct arm_smmu_domain *domain; -}; - -#define mn_to_smmu(mn) container_of(mn, struct arm_smmu_mmu_notifier, mn) - -struct arm_smmu_bond { - struct mm_struct *mm; - struct arm_smmu_mmu_notifier *smmu_mn; - struct list_head list; -}; - -#define sva_to_bond(handle) \ - container_of(handle, struct arm_smmu_bond, sva) - -static DEFINE_MUTEX(sva_lock); - -static void +static void __maybe_unused arm_smmu_update_s1_domain_cd_entry(struct arm_smmu_domain *smmu_domain) { - struct arm_smmu_master *master; + struct arm_smmu_master_domain *master_domain; struct arm_smmu_cd target_cd; unsigned long flags; spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master, &smmu_domain->devices, domain_head) { + list_for_each_entry(master_domain, &smmu_domain->devices, devices_elm) { + struct arm_smmu_master *master = master_domain->master; struct arm_smmu_cd *cdptr; - /* S1 domains only support RID attachment right now */ - cdptr = arm_smmu_get_cd_ptr(master, IOMMU_NO_PASID); + cdptr = arm_smmu_get_cd_ptr(master, master_domain->ssid); if (WARN_ON(!cdptr)) continue; arm_smmu_make_s1_cd(&target_cd, master, smmu_domain); - arm_smmu_write_cd_entry(master, IOMMU_NO_PASID, cdptr, + arm_smmu_write_cd_entry(master, master_domain->ssid, cdptr, &target_cd); } spin_unlock_irqrestore(&smmu_domain->devices_lock, 
flags); } -/* - * Check if the CPU ASID is available on the SMMU side. If a private context - * descriptor is using it, try to replace it. - */ -static struct arm_smmu_ctx_desc * -arm_smmu_share_asid(struct mm_struct *mm, u16 asid) -{ - int ret; - u32 new_asid; - struct arm_smmu_ctx_desc *cd; - struct arm_smmu_device *smmu; - struct arm_smmu_domain *smmu_domain; - - cd = xa_load(&arm_smmu_asid_xa, asid); - if (!cd) - return NULL; - - if (cd->mm) { - if (WARN_ON(cd->mm != mm)) - return ERR_PTR(-EINVAL); - /* All devices bound to this mm use the same cd struct. */ - refcount_inc(&cd->refs); - return cd; - } - - smmu_domain = container_of(cd, struct arm_smmu_domain, cd); - smmu = smmu_domain->smmu; - - ret = xa_alloc(&arm_smmu_asid_xa, &new_asid, cd, - XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL); - if (ret) - return ERR_PTR(-ENOSPC); - /* - * Race with unmap: TLB invalidations will start targeting the new ASID, - * which isn't assigned yet. We'll do an invalidate-all on the old ASID - * later, so it doesn't matter. - */ - cd->asid = new_asid; - /* - * Update ASID and invalidate CD in all associated masters. There will - * be some overlap between use of both ASIDs, until we invalidate the - * TLB. - */ - arm_smmu_update_s1_domain_cd_entry(smmu_domain); - - /* Invalidate TLB entries previously associated with that context */ - arm_smmu_tlb_inv_asid(smmu, asid); - - xa_erase(&arm_smmu_asid_xa, asid); - return NULL; -} - static u64 page_size_to_cd(void) { static_assert(PAGE_SIZE == SZ_4K || PAGE_SIZE == SZ_16K || @@ -184,71 +110,17 @@ void arm_smmu_make_sva_cd(struct arm_smmu_cd *target, * from the current CPU register */ target->data[3] = cpu_to_le64(read_sysreg(mair_el1)); -} -EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_sva_cd); - -static struct arm_smmu_ctx_desc *arm_smmu_alloc_shared_cd(struct mm_struct *mm) -{ - u16 asid; - int err = 0; - struct arm_smmu_ctx_desc *cd; - struct arm_smmu_ctx_desc *ret = NULL; - - /* Don't free the mm until we release the ASID */ - mmgrab(mm); - - asid = arm64_mm_context_get(mm); - if (!asid) { - err = -ESRCH; - goto out_drop_mm; - } - - cd = kzalloc(sizeof(*cd), GFP_KERNEL); - if (!cd) { - err = -ENOMEM; - goto out_put_context; - } - - refcount_set(&cd->refs, 1); - - mutex_lock(&arm_smmu_asid_lock); - ret = arm_smmu_share_asid(mm, asid); - if (ret) { - mutex_unlock(&arm_smmu_asid_lock); - goto out_free_cd; - } - - err = xa_insert(&arm_smmu_asid_xa, asid, cd, GFP_KERNEL); - mutex_unlock(&arm_smmu_asid_lock); - - if (err) - goto out_free_asid; - cd->asid = asid; - cd->mm = mm; - - return cd; - -out_free_asid: - arm_smmu_free_asid(cd); -out_free_cd: - kfree(cd); -out_put_context: - arm64_mm_context_put(mm); -out_drop_mm: - mmdrop(mm); - return err < 0 ? ERR_PTR(err) : ret; -} - -static void arm_smmu_free_shared_cd(struct arm_smmu_ctx_desc *cd) -{ - if (arm_smmu_free_asid(cd)) { - /* Unpin ASID */ - arm64_mm_context_put(cd->mm); - mmdrop(cd->mm); - kfree(cd); - } + /* + * Note that we don't bother with S1PIE on the SMMU, we just rely on + * our default encoding scheme matching direct permissions anyway. + * SMMU has no notion of S1POE nor GCS, so make sure that is clear if + * either is enabled for CPUs, just in case anyone imagines otherwise. 
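If the warning below fires, it means DMA from SVA-capable devices is not subject to POE or GCS restrictions even though CPU accesses are. 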
+ */ + if (system_supports_poe() || system_supports_gcs()) + dev_warn_once(master->smmu->dev, "SVA devices ignore permission overlays and GCS\n"); } +EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_sva_cd); /* * Cloned from the MAX_TLBI_OPS in arch/arm64/include/asm/tlbflush.h, this @@ -264,8 +136,8 @@ static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, unsigned long start, unsigned long end) { - struct arm_smmu_mmu_notifier *smmu_mn = mn_to_smmu(mn); - struct arm_smmu_domain *smmu_domain = smmu_mn->domain; + struct arm_smmu_domain *smmu_domain = + container_of(mn, struct arm_smmu_domain, mmu_notifier); size_t size; /* @@ -282,62 +154,50 @@ static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, size = 0; } - if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_BTM)) { - if (!size) - arm_smmu_tlb_inv_asid(smmu_domain->smmu, - smmu_mn->cd->asid); - else - arm_smmu_tlb_inv_range_asid(start, size, - smmu_mn->cd->asid, - PAGE_SIZE, false, - smmu_domain); - } + if (!size) + arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_domain->cd.asid); + else + arm_smmu_tlb_inv_range_asid(start, size, smmu_domain->cd.asid, + PAGE_SIZE, false, smmu_domain); - arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), start, - size); + arm_smmu_atc_inv_domain(smmu_domain, start, size); } static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) { - struct arm_smmu_mmu_notifier *smmu_mn = mn_to_smmu(mn); - struct arm_smmu_domain *smmu_domain = smmu_mn->domain; - struct arm_smmu_master *master; + struct arm_smmu_domain *smmu_domain = + container_of(mn, struct arm_smmu_domain, mmu_notifier); + struct arm_smmu_master_domain *master_domain; unsigned long flags; - mutex_lock(&sva_lock); - if (smmu_mn->cleared) { - mutex_unlock(&sva_lock); - return; - } - /* * DMA may still be running. Keep the cd valid to avoid C_BAD_CD events, * but disable translation. 
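Concretely, each attached CD is rewritten as a "quiet" SVA CD (note the NULL mm passed to arm_smmu_make_sva_cd() below), which stays valid but no longer enables translation. 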
*/ spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master, &smmu_domain->devices, domain_head) { + list_for_each_entry(master_domain, &smmu_domain->devices, + devices_elm) { + struct arm_smmu_master *master = master_domain->master; struct arm_smmu_cd target; struct arm_smmu_cd *cdptr; - cdptr = arm_smmu_get_cd_ptr(master, mm_get_enqcmd_pasid(mm)); + cdptr = arm_smmu_get_cd_ptr(master, master_domain->ssid); if (WARN_ON(!cdptr)) continue; - arm_smmu_make_sva_cd(&target, master, NULL, smmu_mn->cd->asid); - arm_smmu_write_cd_entry(master, mm_get_enqcmd_pasid(mm), cdptr, + arm_smmu_make_sva_cd(&target, master, NULL, + smmu_domain->cd.asid); + arm_smmu_write_cd_entry(master, master_domain->ssid, cdptr, &target); } spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); - arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_mn->cd->asid); - arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), 0, 0); - - smmu_mn->cleared = true; - mutex_unlock(&sva_lock); + arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_domain->cd.asid); + arm_smmu_atc_inv_domain(smmu_domain, 0, 0); } static void arm_smmu_mmu_notifier_free(struct mmu_notifier *mn) { - kfree(mn_to_smmu(mn)); + kfree(container_of(mn, struct arm_smmu_domain, mmu_notifier)); } static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = { @@ -346,127 +206,6 @@ static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = { .free_notifier = arm_smmu_mmu_notifier_free, }; -/* Allocate or get existing MMU notifier for this {domain, mm} pair */ -static struct arm_smmu_mmu_notifier * -arm_smmu_mmu_notifier_get(struct arm_smmu_domain *smmu_domain, - struct mm_struct *mm) -{ - int ret; - struct arm_smmu_ctx_desc *cd; - struct arm_smmu_mmu_notifier *smmu_mn; - - list_for_each_entry(smmu_mn, &smmu_domain->mmu_notifiers, list) { - if (smmu_mn->mn.mm == mm) { - refcount_inc(&smmu_mn->refs); - return smmu_mn; - } - } - - cd = arm_smmu_alloc_shared_cd(mm); - if (IS_ERR(cd)) - return ERR_CAST(cd); - - smmu_mn = kzalloc(sizeof(*smmu_mn), GFP_KERNEL); - if (!smmu_mn) { - ret = -ENOMEM; - goto err_free_cd; - } - - refcount_set(&smmu_mn->refs, 1); - smmu_mn->cd = cd; - smmu_mn->domain = smmu_domain; - smmu_mn->mn.ops = &arm_smmu_mmu_notifier_ops; - - ret = mmu_notifier_register(&smmu_mn->mn, mm); - if (ret) { - kfree(smmu_mn); - goto err_free_cd; - } - - list_add(&smmu_mn->list, &smmu_domain->mmu_notifiers); - return smmu_mn; - -err_free_cd: - arm_smmu_free_shared_cd(cd); - return ERR_PTR(ret); -} - -static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn) -{ - struct mm_struct *mm = smmu_mn->mn.mm; - struct arm_smmu_ctx_desc *cd = smmu_mn->cd; - struct arm_smmu_domain *smmu_domain = smmu_mn->domain; - - if (!refcount_dec_and_test(&smmu_mn->refs)) - return; - - list_del(&smmu_mn->list); - - /* - * If we went through clear(), we've already invalidated, and no - * new TLB entry can have been formed. 
- */ - if (!smmu_mn->cleared) { - arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid); - arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), 0, - 0); - } - - /* Frees smmu_mn */ - mmu_notifier_put(&smmu_mn->mn); - arm_smmu_free_shared_cd(cd); -} - -static int __arm_smmu_sva_bind(struct device *dev, ioasid_t pasid, - struct mm_struct *mm) -{ - int ret; - struct arm_smmu_cd target; - struct arm_smmu_cd *cdptr; - struct arm_smmu_bond *bond; - struct arm_smmu_master *master = dev_iommu_priv_get(dev); - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct arm_smmu_domain *smmu_domain; - - if (!(domain->type & __IOMMU_DOMAIN_PAGING)) - return -ENODEV; - smmu_domain = to_smmu_domain(domain); - if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) - return -ENODEV; - - if (!master || !master->sva_enabled) - return -ENODEV; - - bond = kzalloc(sizeof(*bond), GFP_KERNEL); - if (!bond) - return -ENOMEM; - - bond->mm = mm; - - bond->smmu_mn = arm_smmu_mmu_notifier_get(smmu_domain, mm); - if (IS_ERR(bond->smmu_mn)) { - ret = PTR_ERR(bond->smmu_mn); - goto err_free_bond; - } - - cdptr = arm_smmu_alloc_cd_ptr(master, mm_get_enqcmd_pasid(mm)); - if (!cdptr) { - ret = -ENOMEM; - goto err_put_notifier; - } - arm_smmu_make_sva_cd(&target, master, mm, bond->smmu_mn->cd->asid); - arm_smmu_write_cd_entry(master, pasid, cdptr, &target); - - list_add(&bond->list, &master->bonds); - return 0; - -err_put_notifier: - arm_smmu_mmu_notifier_put(bond->smmu_mn); -err_free_bond: - kfree(bond); - return ret; -} - bool arm_smmu_sva_supported(struct arm_smmu_device *smmu) { unsigned long reg, fld; @@ -474,8 +213,15 @@ bool arm_smmu_sva_supported(struct arm_smmu_device *smmu) unsigned long asid_bits; u32 feat_mask = ARM_SMMU_FEAT_COHERENCY; - if (vabits_actual == 52) + if (vabits_actual == 52) { + /* We don't support LPA2 */ + if (PAGE_SIZE != SZ_64K) + return false; feat_mask |= ARM_SMMU_FEAT_VAX; + } + + if (system_supports_bbml2_noabort()) + feat_mask |= ARM_SMMU_FEAT_BBML2; if ((smmu->features & feat_mask) != feat_mask) return false; @@ -512,89 +258,6 @@ bool arm_smmu_sva_supported(struct arm_smmu_device *smmu) return true; } -bool arm_smmu_master_iopf_supported(struct arm_smmu_master *master) -{ - /* We're not keeping track of SIDs in fault events */ - if (master->num_streams != 1) - return false; - - return master->stall_enabled; -} - -bool arm_smmu_master_sva_supported(struct arm_smmu_master *master) -{ - if (!(master->smmu->features & ARM_SMMU_FEAT_SVA)) - return false; - - /* SSID support is mandatory for the moment */ - return master->ssid_bits; -} - -bool arm_smmu_master_sva_enabled(struct arm_smmu_master *master) -{ - bool enabled; - - mutex_lock(&sva_lock); - enabled = master->sva_enabled; - mutex_unlock(&sva_lock); - return enabled; -} - -static int arm_smmu_master_sva_enable_iopf(struct arm_smmu_master *master) -{ - struct device *dev = master->dev; - - /* - * Drivers for devices supporting PRI or stall should enable IOPF first. - * Others have device-specific fault handlers and don't need IOPF. 
- */ - if (!arm_smmu_master_iopf_supported(master)) - return 0; - - if (!master->iopf_enabled) - return -EINVAL; - - return iopf_queue_add_device(master->smmu->evtq.iopf, dev); -} - -static void arm_smmu_master_sva_disable_iopf(struct arm_smmu_master *master) -{ - struct device *dev = master->dev; - - if (!master->iopf_enabled) - return; - - iopf_queue_remove_device(master->smmu->evtq.iopf, dev); -} - -int arm_smmu_master_enable_sva(struct arm_smmu_master *master) -{ - int ret; - - mutex_lock(&sva_lock); - ret = arm_smmu_master_sva_enable_iopf(master); - if (!ret) - master->sva_enabled = true; - mutex_unlock(&sva_lock); - - return ret; -} - -int arm_smmu_master_disable_sva(struct arm_smmu_master *master) -{ - mutex_lock(&sva_lock); - if (!list_empty(&master->bonds)) { - dev_err(master->dev, "cannot disable SVA, device is bound\n"); - mutex_unlock(&sva_lock); - return -EBUSY; - } - arm_smmu_master_sva_disable_iopf(master); - master->sva_enabled = false; - mutex_unlock(&sva_lock); - - return 0; -} - void arm_smmu_sva_notifier_synchronize(void) { /* @@ -604,51 +267,55 @@ void arm_smmu_sva_notifier_synchronize(void) mmu_notifier_synchronize(); } -void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t id) +static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t id, + struct iommu_domain *old) { - struct mm_struct *mm = domain->mm; - struct arm_smmu_bond *bond = NULL, *t; + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_cd target; + int ret; - mutex_lock(&sva_lock); - - arm_smmu_clear_cd(master, id); - - list_for_each_entry(t, &master->bonds, list) { - if (t->mm == mm) { - bond = t; - break; - } - } - - if (!WARN_ON(!bond)) { - list_del(&bond->list); - arm_smmu_mmu_notifier_put(bond->smmu_mn); - kfree(bond); - } - mutex_unlock(&sva_lock); -} - -static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t id) -{ - int ret = 0; - struct mm_struct *mm = domain->mm; + if (!(master->smmu->features & ARM_SMMU_FEAT_SVA)) + return -EOPNOTSUPP; - if (mm_get_enqcmd_pasid(mm) != id) + /* Prevent arm_smmu_mm_release from being called while we are attaching */ + if (!mmget_not_zero(domain->mm)) return -EINVAL; - mutex_lock(&sva_lock); - ret = __arm_smmu_sva_bind(dev, id, mm); - mutex_unlock(&sva_lock); + /* + * This does not need the arm_smmu_asid_lock because SVA domains never + * get reassigned + */ + arm_smmu_make_sva_cd(&target, master, domain->mm, smmu_domain->cd.asid); + ret = arm_smmu_set_pasid(master, smmu_domain, id, &target, old); + mmput(domain->mm); return ret; } static void arm_smmu_sva_domain_free(struct iommu_domain *domain) { - kfree(domain); + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + + /* + * Ensure the ASID is empty in the iommu cache before allowing reuse. + */ + arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_domain->cd.asid); + + /* + * Notice that the arm_smmu_mm_arch_invalidate_secondary_tlbs op can + * still be called/running at this point. We allow the ASID to be + * reused, and if there is a race then it just suffers harmless + * unnecessary invalidation. 
+ */ + xa_erase(&arm_smmu_asid_xa, smmu_domain->cd.asid); + + /* + * Actual free is deferred to the SRCU callback + * arm_smmu_mmu_notifier_free() + */ + mmu_notifier_put(&smmu_domain->mmu_notifier); } static const struct iommu_domain_ops arm_smmu_sva_domain_ops = { @@ -656,14 +323,47 @@ static const struct iommu_domain_ops arm_smmu_sva_domain_ops = { .free = arm_smmu_sva_domain_free }; -struct iommu_domain *arm_smmu_sva_domain_alloc(void) +struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm) { - struct iommu_domain *domain; + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_device *smmu = master->smmu; + struct arm_smmu_domain *smmu_domain; + u32 asid; + int ret; + + if (!(master->smmu->features & ARM_SMMU_FEAT_SVA)) + return ERR_PTR(-EOPNOTSUPP); + + smmu_domain = arm_smmu_domain_alloc(); + if (IS_ERR(smmu_domain)) + return ERR_CAST(smmu_domain); + smmu_domain->domain.type = IOMMU_DOMAIN_SVA; + smmu_domain->domain.ops = &arm_smmu_sva_domain_ops; - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) - return NULL; - domain->ops = &arm_smmu_sva_domain_ops; + /* + * Choose page_size as the leaf page size for invalidation when + * ARM_SMMU_FEAT_RANGE_INV is present + */ + smmu_domain->domain.pgsize_bitmap = PAGE_SIZE; + smmu_domain->smmu = smmu; - return domain; + ret = xa_alloc(&arm_smmu_asid_xa, &asid, smmu_domain, + XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL); + if (ret) + goto err_free; + + smmu_domain->cd.asid = asid; + smmu_domain->mmu_notifier.ops = &arm_smmu_mmu_notifier_ops; + ret = mmu_notifier_register(&smmu_domain->mmu_notifier, mm); + if (ret) + goto err_asid; + + return &smmu_domain->domain; + +err_asid: + xa_erase(&arm_smmu_asid_xa, smmu_domain->cd.asid); +err_free: + kfree(smmu_domain); + return ERR_PTR(ret); } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c index 315e487fd990..d2671bfd3798 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c @@ -30,6 +30,11 @@ static struct mm_struct sva_mm = { .pgd = (void *)0xdaedbeefdeadbeefULL, }; +enum arm_smmu_test_master_feat { + ARM_SMMU_MASTER_TEST_ATS = BIT(0), + ARM_SMMU_MASTER_TEST_STALL = BIT(1), +}; + static bool arm_smmu_entry_differs_in_used_bits(const __le64 *entry, const __le64 *used_bits, const __le64 *target, @@ -144,6 +149,14 @@ static void arm_smmu_v3_test_ste_expect_transition( KUNIT_EXPECT_MEMEQ(test, target->data, cur_copy.data, sizeof(cur_copy)); } +static void arm_smmu_v3_test_ste_expect_non_hitless_transition( + struct kunit *test, const struct arm_smmu_ste *cur, + const struct arm_smmu_ste *target, unsigned int num_syncs_expected) +{ + arm_smmu_v3_test_ste_expect_transition(test, cur, target, + num_syncs_expected, false); +} + static void arm_smmu_v3_test_ste_expect_hitless_transition( struct kunit *test, const struct arm_smmu_ste *cur, const struct arm_smmu_ste *target, unsigned int num_syncs_expected) @@ -155,16 +168,23 @@ static void arm_smmu_v3_test_ste_expect_hitless_transition( static const dma_addr_t fake_cdtab_dma_addr = 0xF0F0F0F0F0F0; static void arm_smmu_test_make_cdtable_ste(struct arm_smmu_ste *ste, - const dma_addr_t dma_addr) + unsigned int s1dss, + const dma_addr_t dma_addr, + enum arm_smmu_test_master_feat feat) { + bool ats_enabled = feat & ARM_SMMU_MASTER_TEST_ATS; + bool stall_enabled = feat & ARM_SMMU_MASTER_TEST_STALL; + struct arm_smmu_master master = { + .ats_enabled = ats_enabled,
.cd_table.cdtab_dma = dma_addr, .cd_table.s1cdmax = 0xFF, .cd_table.s1fmt = STRTAB_STE_0_S1FMT_64K_L2, .smmu = &smmu, + .stall_enabled = stall_enabled, }; - arm_smmu_make_cdtable_ste(ste, &master); + arm_smmu_make_cdtable_ste(ste, &master, ats_enabled, s1dss); } static void arm_smmu_v3_write_ste_test_bypass_to_abort(struct kunit *test) @@ -194,7 +214,8 @@ static void arm_smmu_v3_write_ste_test_cdtable_to_abort(struct kunit *test) { struct arm_smmu_ste ste; - arm_smmu_test_make_cdtable_ste(&ste, fake_cdtab_dma_addr); + arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS); arm_smmu_v3_test_ste_expect_hitless_transition(test, &ste, &abort_ste, NUM_EXPECTED_SYNCS(2)); } @@ -203,7 +224,8 @@ static void arm_smmu_v3_write_ste_test_abort_to_cdtable(struct kunit *test) { struct arm_smmu_ste ste; - arm_smmu_test_make_cdtable_ste(&ste, fake_cdtab_dma_addr); + arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS); arm_smmu_v3_test_ste_expect_hitless_transition(test, &abort_ste, &ste, NUM_EXPECTED_SYNCS(2)); } @@ -212,7 +234,8 @@ static void arm_smmu_v3_write_ste_test_cdtable_to_bypass(struct kunit *test) { struct arm_smmu_ste ste; - arm_smmu_test_make_cdtable_ste(&ste, fake_cdtab_dma_addr); + arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS); arm_smmu_v3_test_ste_expect_hitless_transition(test, &ste, &bypass_ste, NUM_EXPECTED_SYNCS(3)); } @@ -221,17 +244,63 @@ static void arm_smmu_v3_write_ste_test_bypass_to_cdtable(struct kunit *test) { struct arm_smmu_ste ste; - arm_smmu_test_make_cdtable_ste(&ste, fake_cdtab_dma_addr); + arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS); arm_smmu_v3_test_ste_expect_hitless_transition(test, &bypass_ste, &ste, NUM_EXPECTED_SYNCS(3)); } +static void arm_smmu_v3_write_ste_test_cdtable_s1dss_change(struct kunit *test) +{ + struct arm_smmu_ste ste; + struct arm_smmu_ste s1dss_bypass; + + arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS); + arm_smmu_test_make_cdtable_ste(&s1dss_bypass, STRTAB_STE_1_S1DSS_BYPASS, + fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS); + + /* + * Flipping s1dss on a CD table STE only involves changes to the second + * qword of an STE and can be done in a single write. 
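The sync counts asserted below follow from how many qwords a transition touches. A minimal standalone model of the single-qword case (a simplification; the driver's arm_smmu_write_entry() additionally consults per-bit used masks):

#include <stdbool.h>
#include <stdint.h>

#define STE_DWORDS 8

/*
 * Toy rule: when current and target differ in exactly one 64-bit
 * word, the update can be one store plus one sync, which is what
 * NUM_EXPECTED_SYNCS(1) asserts for the S1DSS flip below.
 */
static bool single_qword_update(const uint64_t cur[STE_DWORDS],
                                const uint64_t target[STE_DWORDS])
{
        int changed = 0;
        int i;

        for (i = 0; i < STE_DWORDS; i++)
                changed += (cur[i] != target[i]);
        return changed == 1;
}

int main(void)
{
        uint64_t cur[STE_DWORDS] = { 1, 2, 3 };
        uint64_t target[STE_DWORDS] = { 1, 9, 3 };      /* only qword 1 differs */

        return single_qword_update(cur, target) ? 0 : 1;
}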
+ */ + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &ste, &s1dss_bypass, NUM_EXPECTED_SYNCS(1)); + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &s1dss_bypass, &ste, NUM_EXPECTED_SYNCS(1)); +} + +static void +arm_smmu_v3_write_ste_test_s1dssbypass_to_stebypass(struct kunit *test) +{ + struct arm_smmu_ste s1dss_bypass; + + arm_smmu_test_make_cdtable_ste(&s1dss_bypass, STRTAB_STE_1_S1DSS_BYPASS, + fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS); + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &s1dss_bypass, &bypass_ste, NUM_EXPECTED_SYNCS(2)); +} + +static void +arm_smmu_v3_write_ste_test_stebypass_to_s1dssbypass(struct kunit *test) +{ + struct arm_smmu_ste s1dss_bypass; + + arm_smmu_test_make_cdtable_ste(&s1dss_bypass, STRTAB_STE_1_S1DSS_BYPASS, + fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS); + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &bypass_ste, &s1dss_bypass, NUM_EXPECTED_SYNCS(2)); +} + static void arm_smmu_test_make_s2_ste(struct arm_smmu_ste *ste, - bool ats_enabled) + enum arm_smmu_test_master_feat feat) { + bool ats_enabled = feat & ARM_SMMU_MASTER_TEST_ATS; + bool stall_enabled = feat & ARM_SMMU_MASTER_TEST_STALL; struct arm_smmu_master master = { - .smmu = &smmu, .ats_enabled = ats_enabled, + .smmu = &smmu, + .stall_enabled = stall_enabled, }; struct io_pgtable io_pgtable = {}; struct arm_smmu_domain smmu_domain = { @@ -247,14 +316,14 @@ static void arm_smmu_test_make_s2_ste(struct arm_smmu_ste *ste, io_pgtable.cfg.arm_lpae_s2_cfg.vtcr.sl = 3; io_pgtable.cfg.arm_lpae_s2_cfg.vtcr.tsz = 4; - arm_smmu_make_s2_domain_ste(ste, &master, &smmu_domain); + arm_smmu_make_s2_domain_ste(ste, &master, &smmu_domain, ats_enabled); } static void arm_smmu_v3_write_ste_test_s2_to_abort(struct kunit *test) { struct arm_smmu_ste ste; - arm_smmu_test_make_s2_ste(&ste, true); + arm_smmu_test_make_s2_ste(&ste, ARM_SMMU_MASTER_TEST_ATS); arm_smmu_v3_test_ste_expect_hitless_transition(test, &ste, &abort_ste, NUM_EXPECTED_SYNCS(2)); } @@ -263,7 +332,7 @@ static void arm_smmu_v3_write_ste_test_abort_to_s2(struct kunit *test) { struct arm_smmu_ste ste; - arm_smmu_test_make_s2_ste(&ste, true); + arm_smmu_test_make_s2_ste(&ste, ARM_SMMU_MASTER_TEST_ATS); arm_smmu_v3_test_ste_expect_hitless_transition(test, &abort_ste, &ste, NUM_EXPECTED_SYNCS(2)); } @@ -272,7 +341,7 @@ static void arm_smmu_v3_write_ste_test_s2_to_bypass(struct kunit *test) { struct arm_smmu_ste ste; - arm_smmu_test_make_s2_ste(&ste, true); + arm_smmu_test_make_s2_ste(&ste, ARM_SMMU_MASTER_TEST_ATS); arm_smmu_v3_test_ste_expect_hitless_transition(test, &ste, &bypass_ste, NUM_EXPECTED_SYNCS(2)); } @@ -281,11 +350,53 @@ static void arm_smmu_v3_write_ste_test_bypass_to_s2(struct kunit *test) { struct arm_smmu_ste ste; - arm_smmu_test_make_s2_ste(&ste, true); + arm_smmu_test_make_s2_ste(&ste, ARM_SMMU_MASTER_TEST_ATS); arm_smmu_v3_test_ste_expect_hitless_transition(test, &bypass_ste, &ste, NUM_EXPECTED_SYNCS(2)); } +static void arm_smmu_v3_write_ste_test_s1_to_s2(struct kunit *test) +{ + struct arm_smmu_ste s1_ste; + struct arm_smmu_ste s2_ste; + + arm_smmu_test_make_cdtable_ste(&s1_ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS); + arm_smmu_test_make_s2_ste(&s2_ste, ARM_SMMU_MASTER_TEST_ATS); + arm_smmu_v3_test_ste_expect_hitless_transition(test, &s1_ste, &s2_ste, + NUM_EXPECTED_SYNCS(3)); +} + +static void arm_smmu_v3_write_ste_test_s2_to_s1(struct kunit *test) +{ + struct arm_smmu_ste s1_ste; + struct arm_smmu_ste s2_ste; + + arm_smmu_test_make_cdtable_ste(&s1_ste, 
STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS); + arm_smmu_test_make_s2_ste(&s2_ste, ARM_SMMU_MASTER_TEST_ATS); + arm_smmu_v3_test_ste_expect_hitless_transition(test, &s2_ste, &s1_ste, + NUM_EXPECTED_SYNCS(3)); +} + +static void arm_smmu_v3_write_ste_test_non_hitless(struct kunit *test) +{ + struct arm_smmu_ste ste; + struct arm_smmu_ste ste_2; + + /* + * Although no flow resembles this in practice, one way to force an STE + * update to be non-hitless is to change its CD table pointer as well as + * the S1DSS field in the same update. + */ + arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_ATS); + arm_smmu_test_make_cdtable_ste(&ste_2, STRTAB_STE_1_S1DSS_BYPASS, + 0x4B4B4b4B4B, ARM_SMMU_MASTER_TEST_ATS); + arm_smmu_v3_test_ste_expect_non_hitless_transition( + test, &ste, &ste_2, NUM_EXPECTED_SYNCS(3)); +} + static void arm_smmu_v3_test_cd_expect_transition( struct kunit *test, const struct arm_smmu_cd *cur, const struct arm_smmu_cd *target, unsigned int num_syncs_expected, @@ -407,6 +518,30 @@ static void arm_smmu_test_make_sva_release_cd(struct arm_smmu_cd *cd, arm_smmu_make_sva_cd(cd, &master, NULL, asid); } +static void arm_smmu_v3_write_ste_test_s1_to_s2_stall(struct kunit *test) +{ + struct arm_smmu_ste s1_ste; + struct arm_smmu_ste s2_ste; + + arm_smmu_test_make_cdtable_ste(&s1_ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_STALL); + arm_smmu_test_make_s2_ste(&s2_ste, ARM_SMMU_MASTER_TEST_STALL); + arm_smmu_v3_test_ste_expect_hitless_transition(test, &s1_ste, &s2_ste, + NUM_EXPECTED_SYNCS(3)); +} + +static void arm_smmu_v3_write_ste_test_s2_to_s1_stall(struct kunit *test) +{ + struct arm_smmu_ste s1_ste; + struct arm_smmu_ste s2_ste; + + arm_smmu_test_make_cdtable_ste(&s1_ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr, ARM_SMMU_MASTER_TEST_STALL); + arm_smmu_test_make_s2_ste(&s2_ste, ARM_SMMU_MASTER_TEST_STALL); + arm_smmu_v3_test_ste_expect_hitless_transition(test, &s2_ste, &s1_ste, + NUM_EXPECTED_SYNCS(3)); +} + static void arm_smmu_v3_write_cd_test_sva_clear(struct kunit *test) { struct arm_smmu_cd cd = {}; @@ -439,12 +574,20 @@ static struct kunit_case arm_smmu_v3_test_cases[] = { KUNIT_CASE(arm_smmu_v3_write_ste_test_abort_to_cdtable), KUNIT_CASE(arm_smmu_v3_write_ste_test_cdtable_to_bypass), KUNIT_CASE(arm_smmu_v3_write_ste_test_bypass_to_cdtable), + KUNIT_CASE(arm_smmu_v3_write_ste_test_cdtable_s1dss_change), + KUNIT_CASE(arm_smmu_v3_write_ste_test_s1dssbypass_to_stebypass), + KUNIT_CASE(arm_smmu_v3_write_ste_test_stebypass_to_s1dssbypass), KUNIT_CASE(arm_smmu_v3_write_ste_test_s2_to_abort), KUNIT_CASE(arm_smmu_v3_write_ste_test_abort_to_s2), KUNIT_CASE(arm_smmu_v3_write_ste_test_s2_to_bypass), KUNIT_CASE(arm_smmu_v3_write_ste_test_bypass_to_s2), + KUNIT_CASE(arm_smmu_v3_write_ste_test_s1_to_s2), + KUNIT_CASE(arm_smmu_v3_write_ste_test_s2_to_s1), + KUNIT_CASE(arm_smmu_v3_write_ste_test_non_hitless), KUNIT_CASE(arm_smmu_v3_write_cd_test_s1_clear), KUNIT_CASE(arm_smmu_v3_write_cd_test_s1_change_asid), + KUNIT_CASE(arm_smmu_v3_write_ste_test_s1_to_s2_stall), + KUNIT_CASE(arm_smmu_v3_write_ste_test_s2_to_s1_stall), KUNIT_CASE(arm_smmu_v3_write_cd_test_sva_clear), KUNIT_CASE(arm_smmu_v3_write_cd_test_sva_release), {}, @@ -464,5 +607,6 @@ static struct kunit_suite arm_smmu_v3_test_module = { }; kunit_test_suites(&arm_smmu_v3_test_module); -MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING); +MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING"); +MODULE_DESCRIPTION("KUnit tests
for arm-smmu-v3 driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index ab415e107054..d16d35c78c06 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -26,7 +26,9 @@ #include <linux/pci.h> #include <linux/pci-ats.h> #include <linux/platform_device.h> +#include <linux/string_choices.h> #include <kunit/visibility.h> +#include <uapi/linux/iommufd.h> #include "arm-smmu-v3.h" #include "../../dma-iommu.h" @@ -36,6 +38,9 @@ module_param(disable_msipolling, bool, 0444); MODULE_PARM_DESC(disable_msipolling, "Disable MSI-based polling for CMD_SYNC completion."); +static const struct iommu_ops arm_smmu_ops; +static struct iommu_dirty_ops arm_smmu_dirty_ops; + enum arm_smmu_msi_index { EVTQ_MSI_INDEX, GERROR_MSI_INDEX, @@ -79,8 +84,28 @@ static struct arm_smmu_option_prop arm_smmu_options[] = { { 0, NULL}, }; -static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, - struct arm_smmu_device *smmu); +static const char * const event_str[] = { + [EVT_ID_BAD_STREAMID_CONFIG] = "C_BAD_STREAMID", + [EVT_ID_STE_FETCH_FAULT] = "F_STE_FETCH", + [EVT_ID_BAD_STE_CONFIG] = "C_BAD_STE", + [EVT_ID_STREAM_DISABLED_FAULT] = "F_STREAM_DISABLED", + [EVT_ID_BAD_SUBSTREAMID_CONFIG] = "C_BAD_SUBSTREAMID", + [EVT_ID_CD_FETCH_FAULT] = "F_CD_FETCH", + [EVT_ID_BAD_CD_CONFIG] = "C_BAD_CD", + [EVT_ID_TRANSLATION_FAULT] = "F_TRANSLATION", + [EVT_ID_ADDR_SIZE_FAULT] = "F_ADDR_SIZE", + [EVT_ID_ACCESS_FAULT] = "F_ACCESS", + [EVT_ID_PERMISSION_FAULT] = "F_PERMISSION", + [EVT_ID_VMS_FETCH_FAULT] = "F_VMS_FETCH", +}; + +static const char * const event_class_str[] = { + [0] = "CD fetch", + [1] = "Stage 1 translation table fetch", + [2] = "Input address caused fault", + [3] = "Reserved", +}; + static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master); static void parse_driver_options(struct arm_smmu_device *smmu) @@ -291,6 +316,7 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) case CMDQ_OP_TLBI_NH_ASID: cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid); fallthrough; + case CMDQ_OP_TLBI_NH_ALL: case CMDQ_OP_TLBI_S12_VMALL: cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid); break; @@ -342,14 +368,30 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) return 0; } -static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu) +static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq_ent *ent) +{ + struct arm_smmu_cmdq *cmdq = NULL; + + if (smmu->impl_ops && smmu->impl_ops->get_secondary_cmdq) + cmdq = smmu->impl_ops->get_secondary_cmdq(smmu, ent); + + return cmdq ?: &smmu->cmdq; +} + +static bool arm_smmu_cmdq_needs_busy_polling(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq) { - return &smmu->cmdq; + if (cmdq == &smmu->cmdq) + return false; + + return smmu->options & ARM_SMMU_OPT_TEGRA241_CMDQV; } static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu, - struct arm_smmu_queue *q, u32 prod) + struct arm_smmu_cmdq *cmdq, u32 prod) { + struct arm_smmu_queue *q = &cmdq->q; struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC, }; @@ -364,10 +406,12 @@ static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu, } arm_smmu_cmdq_build_cmd(cmd, &ent); + if (arm_smmu_cmdq_needs_busy_polling(smmu, cmdq)) + u64p_replace_bits(cmd, CMDQ_SYNC_0_CS_NONE, CMDQ_SYNC_0_CS); } -static void 
__arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu, - struct arm_smmu_queue *q) +void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq) { static const char * const cerror_str[] = { [CMDQ_ERR_CERROR_NONE_IDX] = "No error", @@ -375,6 +419,7 @@ static void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu, [CMDQ_ERR_CERROR_ABT_IDX] = "Abort on command fetch", [CMDQ_ERR_CERROR_ATC_INV_IDX] = "ATC invalidate timeout", }; + struct arm_smmu_queue *q = &cmdq->q; int i; u64 cmd[CMDQ_ENT_DWORDS]; @@ -417,13 +462,15 @@ static void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu, /* Convert the erroneous command into a CMD_SYNC */ arm_smmu_cmdq_build_cmd(cmd, &cmd_sync); + if (arm_smmu_cmdq_needs_busy_polling(smmu, cmdq)) + u64p_replace_bits(cmd, CMDQ_SYNC_0_CS_NONE, CMDQ_SYNC_0_CS); queue_write(Q_ENT(q, cons), cmd, q->ent_dwords); } static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) { - __arm_smmu_cmdq_skip_err(smmu, &smmu->cmdq.q); + __arm_smmu_cmdq_skip_err(smmu, &smmu->cmdq); } /* @@ -588,11 +635,11 @@ static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq, /* Wait for the command queue to become non-full */ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq, struct arm_smmu_ll_queue *llq) { unsigned long flags; struct arm_smmu_queue_poll qp; - struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu); int ret = 0; /* @@ -623,11 +670,11 @@ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu, * Must be called with the cmdq lock held in some capacity. */ static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq, struct arm_smmu_ll_queue *llq) { int ret = 0; struct arm_smmu_queue_poll qp; - struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu); u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod)); queue_poll_init(smmu, &qp); @@ -647,10 +694,10 @@ static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu, * Must be called with the cmdq lock held in some capacity. */ static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq, struct arm_smmu_ll_queue *llq) { struct arm_smmu_queue_poll qp; - struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu); u32 prod = llq->prod; int ret = 0; @@ -697,12 +744,14 @@ static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu, } static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq, struct arm_smmu_ll_queue *llq) { - if (smmu->options & ARM_SMMU_OPT_MSIPOLL) - return __arm_smmu_cmdq_poll_until_msi(smmu, llq); + if (smmu->options & ARM_SMMU_OPT_MSIPOLL && + !arm_smmu_cmdq_needs_busy_polling(smmu, cmdq)) + return __arm_smmu_cmdq_poll_until_msi(smmu, cmdq, llq); - return __arm_smmu_cmdq_poll_until_consumed(smmu, llq); + return __arm_smmu_cmdq_poll_until_consumed(smmu, cmdq, llq); } static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds, @@ -738,14 +787,14 @@ static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds, * insert their own list of commands then all of the commands from one * CPU will appear before any of the commands from the other CPU. 
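Callers consume that ordering guarantee through the batch helpers this series reworks. A sketch of the intended calling pattern, modelled on arm_smmu_sync_cd() further down (kernel context assumed; the wrapper function here is illustrative):

/*
 * Batch several CFGI commands for one master and finish with a
 * synchronous submit. arm_smmu_cmdq_batch_init() now also selects the
 * queue (main cmdq or an implementation-defined secondary one) from
 * the first command, so later entries must be supported by it too.
 */
static void example_sync_cds(struct arm_smmu_device *smmu,
                             struct arm_smmu_master *master, int ssid)
{
        struct arm_smmu_cmdq_ent cmd = {
                .opcode = CMDQ_OP_CFGI_CD,
                .cfgi   = {
                        .ssid   = ssid,
                        .leaf   = true,
                },
        };
        struct arm_smmu_cmdq_batch cmds;
        int i;

        arm_smmu_cmdq_batch_init(smmu, &cmds, &cmd);
        for (i = 0; i < master->num_streams; i++) {
                cmd.cfgi.sid = master->streams[i].id;
                arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
        }
        arm_smmu_cmdq_batch_submit(smmu, &cmds);
}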
*/ -static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, - u64 *cmds, int n, bool sync) +int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq, u64 *cmds, int n, + bool sync) { u64 cmd_sync[CMDQ_ENT_DWORDS]; u32 prod; unsigned long flags; bool owner; - struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu); struct arm_smmu_ll_queue llq, head; int ret = 0; @@ -759,7 +808,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, while (!queue_has_space(&llq, n + sync)) { local_irq_restore(flags); - if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq)) + if (arm_smmu_cmdq_poll_until_not_full(smmu, cmdq, &llq)) dev_err_ratelimited(smmu->dev, "CMDQ timeout\n"); local_irq_save(flags); } @@ -785,7 +834,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n); if (sync) { prod = queue_inc_prod_n(&llq, n); - arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, &cmdq->q, prod); + arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, cmdq, prod); queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS); /* @@ -835,7 +884,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */ if (sync) { llq.prod = queue_inc_prod_n(&llq, n); - ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq); + ret = arm_smmu_cmdq_poll_until_sync(smmu, cmdq, &llq); if (ret) { dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n", @@ -870,7 +919,8 @@ static int __arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, return -EINVAL; } - return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, sync); + return arm_smmu_cmdq_issue_cmdlist( + smmu, arm_smmu_get_cmdq(smmu, ent), cmd, 1, sync); } static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, @@ -885,21 +935,33 @@ static int arm_smmu_cmdq_issue_cmd_with_sync(struct arm_smmu_device *smmu, return __arm_smmu_cmdq_issue_cmd(smmu, ent, true); } +static void arm_smmu_cmdq_batch_init(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq_batch *cmds, + struct arm_smmu_cmdq_ent *ent) +{ + cmds->num = 0; + cmds->cmdq = arm_smmu_get_cmdq(smmu, ent); +} + static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu, struct arm_smmu_cmdq_batch *cmds, struct arm_smmu_cmdq_ent *cmd) { + bool unsupported_cmd = !arm_smmu_cmdq_supports_cmd(cmds->cmdq, cmd); + bool force_sync = (cmds->num == CMDQ_BATCH_ENTRIES - 1) && + (smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC); int index; - if (cmds->num == CMDQ_BATCH_ENTRIES - 1 && - (smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC)) { - arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true); - cmds->num = 0; + if (force_sync || unsupported_cmd) { + arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmdq, cmds->cmds, + cmds->num, true); + arm_smmu_cmdq_batch_init(smmu, cmds, cmd); } if (cmds->num == CMDQ_BATCH_ENTRIES) { - arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false); - cmds->num = 0; + arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmdq, cmds->cmds, + cmds->num, false); + arm_smmu_cmdq_batch_init(smmu, cmds, cmd); } index = cmds->num * CMDQ_ENT_DWORDS; @@ -915,7 +977,8 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu, static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu, struct arm_smmu_cmdq_batch *cmds) { - return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true); + return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmdq, cmds->cmds, + cmds->num, true); } static 
void arm_smmu_page_response(struct device *dev, struct iopf_fault *unused, @@ -989,18 +1052,28 @@ void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits) cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR | STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH | STRTAB_STE_1_S1STALLD | STRTAB_STE_1_STRW | - STRTAB_STE_1_EATS); + STRTAB_STE_1_EATS | STRTAB_STE_1_MEV); used_bits[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID); + + /* + * See 13.5 Summary of attribute/permission configuration fields + * for the SHCFG behavior. + */ + if (FIELD_GET(STRTAB_STE_1_S1DSS, le64_to_cpu(ent[1])) == + STRTAB_STE_1_S1DSS_BYPASS) + used_bits[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG); } /* S2 translates */ if (cfg & BIT(1)) { used_bits[1] |= - cpu_to_le64(STRTAB_STE_1_EATS | STRTAB_STE_1_SHCFG); + cpu_to_le64(STRTAB_STE_1_S2FWB | STRTAB_STE_1_EATS | + STRTAB_STE_1_SHCFG | STRTAB_STE_1_MEV); used_bits[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR | STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI | - STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2R); + STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2S | + STRTAB_STE_2_S2R); used_bits[3] |= cpu_to_le64(STRTAB_STE_3_S2TTB_MASK); } @@ -1158,7 +1231,7 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master, }, }; - cmds.num = 0; + arm_smmu_cmdq_batch_init(smmu, &cmds, &cmd); for (i = 0; i < master->num_streams; i++) { cmd.cfgi.sid = master->streams[i].id; arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd); @@ -1167,52 +1240,40 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master, arm_smmu_cmdq_batch_submit(smmu, &cmds); } -static int arm_smmu_alloc_cd_leaf_table(struct arm_smmu_device *smmu, - struct arm_smmu_l1_ctx_desc *l1_desc) +static void arm_smmu_write_cd_l1_desc(struct arm_smmu_cdtab_l1 *dst, + dma_addr_t l2ptr_dma) { - size_t size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3); + u64 val = (l2ptr_dma & CTXDESC_L1_DESC_L2PTR_MASK) | CTXDESC_L1_DESC_V; - l1_desc->l2ptr = dmam_alloc_coherent(smmu->dev, size, - &l1_desc->l2ptr_dma, GFP_KERNEL); - if (!l1_desc->l2ptr) { - dev_warn(smmu->dev, - "failed to allocate context descriptor table\n"); - return -ENOMEM; - } - return 0; + /* The HW has 64 bit atomicity with stores to the L2 CD table */ + WRITE_ONCE(dst->l2ptr, cpu_to_le64(val)); } -static void arm_smmu_write_cd_l1_desc(__le64 *dst, - struct arm_smmu_l1_ctx_desc *l1_desc) +static dma_addr_t arm_smmu_cd_l1_get_desc(const struct arm_smmu_cdtab_l1 *src) { - u64 val = (l1_desc->l2ptr_dma & CTXDESC_L1_DESC_L2PTR_MASK) | - CTXDESC_L1_DESC_V; - - /* The HW has 64 bit atomicity with stores to the L2 CD table */ - WRITE_ONCE(*dst, cpu_to_le64(val)); + return le64_to_cpu(src->l2ptr) & CTXDESC_L1_DESC_L2PTR_MASK; } struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid) { - struct arm_smmu_l1_ctx_desc *l1_desc; + struct arm_smmu_cdtab_l2 *l2; struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; - if (!cd_table->cdtab) + if (!arm_smmu_cdtab_allocated(cd_table)) return NULL; if (cd_table->s1fmt == STRTAB_STE_0_S1FMT_LINEAR) - return (struct arm_smmu_cd *)(cd_table->cdtab + - ssid * CTXDESC_CD_DWORDS); + return &cd_table->linear.table[ssid]; - l1_desc = &cd_table->l1_desc[ssid / CTXDESC_L2_ENTRIES]; - if (!l1_desc->l2ptr) + l2 = cd_table->l2.l2ptrs[arm_smmu_cdtab_l1_idx(ssid)]; + if (!l2) return NULL; - return &l1_desc->l2ptr[ssid % CTXDESC_L2_ENTRIES]; + return &l2->cds[arm_smmu_cdtab_l2_idx(ssid)]; } -struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master, - u32 ssid) +static struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct 
arm_smmu_master *master, + u32 ssid) { struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; struct arm_smmu_device *smmu = master->smmu; @@ -1220,24 +1281,25 @@ struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master, might_sleep(); iommu_group_mutex_assert(master->dev); - if (!cd_table->cdtab) { + if (!arm_smmu_cdtab_allocated(cd_table)) { if (arm_smmu_alloc_cd_tables(master)) return NULL; } if (cd_table->s1fmt == STRTAB_STE_0_S1FMT_64K_L2) { - unsigned int idx = ssid / CTXDESC_L2_ENTRIES; - struct arm_smmu_l1_ctx_desc *l1_desc; + unsigned int idx = arm_smmu_cdtab_l1_idx(ssid); + struct arm_smmu_cdtab_l2 **l2ptr = &cd_table->l2.l2ptrs[idx]; - l1_desc = &cd_table->l1_desc[idx]; - if (!l1_desc->l2ptr) { - __le64 *l1ptr; + if (!*l2ptr) { + dma_addr_t l2ptr_dma; - if (arm_smmu_alloc_cd_leaf_table(smmu, l1_desc)) + *l2ptr = dma_alloc_coherent(smmu->dev, sizeof(**l2ptr), + &l2ptr_dma, GFP_KERNEL); + if (!*l2ptr) return NULL; - l1ptr = cd_table->cdtab + idx * CTXDESC_L1_DESC_DWORDS; - arm_smmu_write_cd_l1_desc(l1ptr, l1_desc); + arm_smmu_write_cd_l1_desc(&cd_table->l2.l1tab[idx], + l2ptr_dma); /* An invalid L1CD can be cached */ arm_smmu_sync_cd(master, ssid, false); } @@ -1289,6 +1351,8 @@ void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid, struct arm_smmu_cd *cdptr, const struct arm_smmu_cd *target) { + bool target_valid = target->data[0] & cpu_to_le64(CTXDESC_CD_0_V); + bool cur_valid = cdptr->data[0] & cpu_to_le64(CTXDESC_CD_0_V); struct arm_smmu_cd_writer cd_writer = { .writer = { .ops = &arm_smmu_cd_writer_ops, @@ -1297,6 +1361,13 @@ void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid, .ssid = ssid, }; + if (ssid != IOMMU_NO_PASID && cur_valid != target_valid) { + if (cur_valid) + master->cd_table.used_ssids--; + else + master->cd_table.used_ssids++; + } + arm_smmu_write_entry(&cd_writer.writer, cdptr->data, target->data); } @@ -1331,6 +1402,12 @@ void arm_smmu_make_s1_cd(struct arm_smmu_cd *target, CTXDESC_CD_0_ASET | FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid) ); + + /* To enable dirty flag update, set both Access flag and dirty state update */ + if (pgtbl_cfg->quirks & IO_PGTABLE_QUIRK_ARM_HD) + target->data[0] |= cpu_to_le64(CTXDESC_CD_0_TCR_HA | + CTXDESC_CD_0_TCR_HD); + target->data[1] = cpu_to_le64(pgtbl_cfg->arm_lpae_s1_cfg.ttbr & CTXDESC_CD_1_TTB0_MASK); target->data[3] = cpu_to_le64(pgtbl_cfg->arm_lpae_s1_cfg.mair); @@ -1342,7 +1419,7 @@ void arm_smmu_clear_cd(struct arm_smmu_master *master, ioasid_t ssid) struct arm_smmu_cd target = {}; struct arm_smmu_cd *cdptr; - if (!master->cd_table.cdtab) + if (!arm_smmu_cdtab_allocated(&master->cd_table)) return; cdptr = arm_smmu_get_cd_ptr(master, ssid); if (WARN_ON(!cdptr)) @@ -1364,99 +1441,83 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master) if (!(smmu->features & ARM_SMMU_FEAT_2_LVL_CDTAB) || max_contexts <= CTXDESC_L2_ENTRIES) { cd_table->s1fmt = STRTAB_STE_0_S1FMT_LINEAR; - cd_table->num_l1_ents = max_contexts; + cd_table->linear.num_ents = max_contexts; - l1size = max_contexts * (CTXDESC_CD_DWORDS << 3); + l1size = max_contexts * sizeof(struct arm_smmu_cd); + cd_table->linear.table = dma_alloc_coherent(smmu->dev, l1size, + &cd_table->cdtab_dma, + GFP_KERNEL); + if (!cd_table->linear.table) + return -ENOMEM; } else { cd_table->s1fmt = STRTAB_STE_0_S1FMT_64K_L2; - cd_table->num_l1_ents = DIV_ROUND_UP(max_contexts, - CTXDESC_L2_ENTRIES); + cd_table->l2.num_l1_ents = + DIV_ROUND_UP(max_contexts, CTXDESC_L2_ENTRIES); - cd_table->l1_desc = 
devm_kcalloc(smmu->dev, cd_table->num_l1_ents, - sizeof(*cd_table->l1_desc), - GFP_KERNEL); - if (!cd_table->l1_desc) + cd_table->l2.l2ptrs = kcalloc(cd_table->l2.num_l1_ents, + sizeof(*cd_table->l2.l2ptrs), + GFP_KERNEL); + if (!cd_table->l2.l2ptrs) return -ENOMEM; - l1size = cd_table->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3); - } - - cd_table->cdtab = dmam_alloc_coherent(smmu->dev, l1size, &cd_table->cdtab_dma, - GFP_KERNEL); - if (!cd_table->cdtab) { - dev_warn(smmu->dev, "failed to allocate context descriptor\n"); - ret = -ENOMEM; - goto err_free_l1; + l1size = cd_table->l2.num_l1_ents * sizeof(struct arm_smmu_cdtab_l1); + cd_table->l2.l1tab = dma_alloc_coherent(smmu->dev, l1size, + &cd_table->cdtab_dma, + GFP_KERNEL); + if (!cd_table->l2.l1tab) { + ret = -ENOMEM; + goto err_free_l2ptrs; + } } - return 0; -err_free_l1: - if (cd_table->l1_desc) { - devm_kfree(smmu->dev, cd_table->l1_desc); - cd_table->l1_desc = NULL; - } +err_free_l2ptrs: + kfree(cd_table->l2.l2ptrs); + cd_table->l2.l2ptrs = NULL; return ret; } static void arm_smmu_free_cd_tables(struct arm_smmu_master *master) { int i; - size_t size, l1size; struct arm_smmu_device *smmu = master->smmu; struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; - if (cd_table->l1_desc) { - size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3); - - for (i = 0; i < cd_table->num_l1_ents; i++) { - if (!cd_table->l1_desc[i].l2ptr) + if (cd_table->s1fmt != STRTAB_STE_0_S1FMT_LINEAR) { + for (i = 0; i < cd_table->l2.num_l1_ents; i++) { + if (!cd_table->l2.l2ptrs[i]) continue; - dmam_free_coherent(smmu->dev, size, - cd_table->l1_desc[i].l2ptr, - cd_table->l1_desc[i].l2ptr_dma); + dma_free_coherent(smmu->dev, + sizeof(*cd_table->l2.l2ptrs[i]), + cd_table->l2.l2ptrs[i], + arm_smmu_cd_l1_get_desc(&cd_table->l2.l1tab[i])); } - devm_kfree(smmu->dev, cd_table->l1_desc); - cd_table->l1_desc = NULL; + kfree(cd_table->l2.l2ptrs); - l1size = cd_table->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3); + dma_free_coherent(smmu->dev, + cd_table->l2.num_l1_ents * + sizeof(struct arm_smmu_cdtab_l1), + cd_table->l2.l1tab, cd_table->cdtab_dma); } else { - l1size = cd_table->num_l1_ents * (CTXDESC_CD_DWORDS << 3); - } - - dmam_free_coherent(smmu->dev, l1size, cd_table->cdtab, cd_table->cdtab_dma); - cd_table->cdtab_dma = 0; - cd_table->cdtab = NULL; -} - -bool arm_smmu_free_asid(struct arm_smmu_ctx_desc *cd) -{ - bool free; - struct arm_smmu_ctx_desc *old_cd; - - if (!cd->asid) - return false; - - free = refcount_dec_and_test(&cd->refs); - if (free) { - old_cd = xa_erase(&arm_smmu_asid_xa, cd->asid); - WARN_ON(old_cd != cd); + dma_free_coherent(smmu->dev, + cd_table->linear.num_ents * + sizeof(struct arm_smmu_cd), + cd_table->linear.table, cd_table->cdtab_dma); } - return free; } /* Stream table manipulation functions */ -static void -arm_smmu_write_strtab_l1_desc(__le64 *dst, struct arm_smmu_strtab_l1_desc *desc) +static void arm_smmu_write_strtab_l1_desc(struct arm_smmu_strtab_l1 *dst, + dma_addr_t l2ptr_dma) { u64 val = 0; - val |= FIELD_PREP(STRTAB_L1_DESC_SPAN, desc->span); - val |= desc->l2ptr_dma & STRTAB_L1_DESC_L2PTR_MASK; + val |= FIELD_PREP(STRTAB_L1_DESC_SPAN, STRTAB_SPLIT + 1); + val |= l2ptr_dma & STRTAB_L1_DESC_L2PTR_MASK; /* The HW has 64 bit atomicity with stores to the L2 STE table */ - WRITE_ONCE(*dst, cpu_to_le64(val)); + WRITE_ONCE(dst->l2ptr, cpu_to_le64(val)); } struct arm_smmu_ste_writer { @@ -1511,7 +1572,6 @@ static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid, } } -VISIBLE_IF_KUNIT void arm_smmu_make_abort_ste(struct 
arm_smmu_ste *target) { memset(target, 0, sizeof(*target)); @@ -1538,7 +1598,8 @@ EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_bypass_ste); VISIBLE_IF_KUNIT void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, - struct arm_smmu_master *master) + struct arm_smmu_master *master, bool ats_enabled, + unsigned int s1dss) { struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; struct arm_smmu_device *smmu = master->smmu; @@ -1552,7 +1613,7 @@ void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, FIELD_PREP(STRTAB_STE_0_S1CDMAX, cd_table->s1cdmax)); target->data[1] = cpu_to_le64( - FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) | + FIELD_PREP(STRTAB_STE_1_S1DSS, s1dss) | FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) | FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) | FIELD_PREP(STRTAB_STE_1_S1CSH, ARM_SMMU_SH_ISH) | @@ -1561,7 +1622,12 @@ void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, STRTAB_STE_1_S1STALLD : 0) | FIELD_PREP(STRTAB_STE_1_EATS, - master->ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0)); + ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0)); + + if ((smmu->features & ARM_SMMU_FEAT_ATTR_TYPES_OVR) && + s1dss == STRTAB_STE_1_S1DSS_BYPASS) + target->data[1] |= cpu_to_le64(FIELD_PREP( + STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); if (smmu->features & ARM_SMMU_FEAT_E2H) { /* @@ -1588,10 +1654,10 @@ void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, } EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_cdtable_ste); -VISIBLE_IF_KUNIT void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, struct arm_smmu_master *master, - struct arm_smmu_domain *smmu_domain) + struct arm_smmu_domain *smmu_domain, + bool ats_enabled) { struct arm_smmu_s2_cfg *s2_cfg = &smmu_domain->s2_cfg; const struct io_pgtable_cfg *pgtbl_cfg = @@ -1608,8 +1674,10 @@ void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, target->data[1] = cpu_to_le64( FIELD_PREP(STRTAB_STE_1_EATS, - master->ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0)); + ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0)); + if (pgtbl_cfg->quirks & IO_PGTABLE_QUIRK_ARM_S2FWB) + target->data[1] |= cpu_to_le64(STRTAB_STE_1_S2FWB); if (smmu->features & ARM_SMMU_FEAT_ATTR_TYPES_OVR) target->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); @@ -1629,6 +1697,7 @@ void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, STRTAB_STE_2_S2ENDI | #endif STRTAB_STE_2_S2PTW | + (master->stall_enabled ? 
STRTAB_STE_2_S2S : 0) | STRTAB_STE_2_S2R); target->data[3] = cpu_to_le64(pgtbl_cfg->arm_lpae_s2_cfg.vttbr & @@ -1653,66 +1722,111 @@ static void arm_smmu_init_initial_stes(struct arm_smmu_ste *strtab, static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid) { - size_t size; - void *strtab; + dma_addr_t l2ptr_dma; struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; - struct arm_smmu_strtab_l1_desc *desc = &cfg->l1_desc[sid >> STRTAB_SPLIT]; + struct arm_smmu_strtab_l2 **l2table; - if (desc->l2ptr) + l2table = &cfg->l2.l2ptrs[arm_smmu_strtab_l1_idx(sid)]; + if (*l2table) return 0; - size = 1 << (STRTAB_SPLIT + ilog2(STRTAB_STE_DWORDS) + 3); - strtab = &cfg->strtab[(sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS]; - - desc->span = STRTAB_SPLIT + 1; - desc->l2ptr = dmam_alloc_coherent(smmu->dev, size, &desc->l2ptr_dma, - GFP_KERNEL); - if (!desc->l2ptr) { + *l2table = dmam_alloc_coherent(smmu->dev, sizeof(**l2table), + &l2ptr_dma, GFP_KERNEL); + if (!*l2table) { dev_err(smmu->dev, "failed to allocate l2 stream table for SID %u\n", sid); return -ENOMEM; } - arm_smmu_init_initial_stes(desc->l2ptr, 1 << STRTAB_SPLIT); - arm_smmu_write_strtab_l1_desc(strtab, desc); + arm_smmu_init_initial_stes((*l2table)->stes, + ARRAY_SIZE((*l2table)->stes)); + arm_smmu_write_strtab_l1_desc(&cfg->l2.l1tab[arm_smmu_strtab_l1_idx(sid)], + l2ptr_dma); + return 0; +} + +static int arm_smmu_streams_cmp_key(const void *lhs, const struct rb_node *rhs) +{ + struct arm_smmu_stream *stream_rhs = + rb_entry(rhs, struct arm_smmu_stream, node); + const u32 *sid_lhs = lhs; + + if (*sid_lhs < stream_rhs->id) + return -1; + if (*sid_lhs > stream_rhs->id) + return 1; return 0; } +static int arm_smmu_streams_cmp_node(struct rb_node *lhs, + const struct rb_node *rhs) +{ + return arm_smmu_streams_cmp_key( + &rb_entry(lhs, struct arm_smmu_stream, node)->id, rhs); +} + static struct arm_smmu_master * arm_smmu_find_master(struct arm_smmu_device *smmu, u32 sid) { struct rb_node *node; - struct arm_smmu_stream *stream; lockdep_assert_held(&smmu->streams_mutex); - node = smmu->streams.rb_node; - while (node) { - stream = rb_entry(node, struct arm_smmu_stream, node); - if (stream->id < sid) - node = node->rb_right; - else if (stream->id > sid) - node = node->rb_left; - else - return stream->master; - } - - return NULL; + node = rb_find(&sid, &smmu->streams, arm_smmu_streams_cmp_key); + if (!node) + return NULL; + return rb_entry(node, struct arm_smmu_stream, node)->master; } /* IRQ and event handlers */ -static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) +static void arm_smmu_decode_event(struct arm_smmu_device *smmu, u64 *raw, + struct arm_smmu_event *event) +{ + struct arm_smmu_master *master; + + event->id = FIELD_GET(EVTQ_0_ID, raw[0]); + event->sid = FIELD_GET(EVTQ_0_SID, raw[0]); + event->ssv = FIELD_GET(EVTQ_0_SSV, raw[0]); + event->ssid = event->ssv ? 
FIELD_GET(EVTQ_0_SSID, raw[0]) : IOMMU_NO_PASID; + event->privileged = FIELD_GET(EVTQ_1_PnU, raw[1]); + event->instruction = FIELD_GET(EVTQ_1_InD, raw[1]); + event->s2 = FIELD_GET(EVTQ_1_S2, raw[1]); + event->read = FIELD_GET(EVTQ_1_RnW, raw[1]); + event->stag = FIELD_GET(EVTQ_1_STAG, raw[1]); + event->stall = FIELD_GET(EVTQ_1_STALL, raw[1]); + event->class = FIELD_GET(EVTQ_1_CLASS, raw[1]); + event->iova = FIELD_GET(EVTQ_2_ADDR, raw[2]); + event->ipa = raw[3] & EVTQ_3_IPA; + event->fetch_addr = raw[3] & EVTQ_3_FETCH_ADDR; + event->ttrnw = FIELD_GET(EVTQ_1_TT_READ, raw[1]); + event->class_tt = false; + event->dev = NULL; + + if (event->id == EVT_ID_PERMISSION_FAULT) + event->class_tt = (event->class == EVTQ_1_CLASS_TT); + + mutex_lock(&smmu->streams_mutex); + master = arm_smmu_find_master(smmu, event->sid); + if (master) + event->dev = get_device(master->dev); + mutex_unlock(&smmu->streams_mutex); +} + +static int arm_smmu_handle_event(struct arm_smmu_device *smmu, u64 *evt, + struct arm_smmu_event *event) { int ret = 0; u32 perm = 0; struct arm_smmu_master *master; - bool ssid_valid = evt[0] & EVTQ_0_SSV; - u32 sid = FIELD_GET(EVTQ_0_SID, evt[0]); struct iopf_fault fault_evt = { }; struct iommu_fault *flt = &fault_evt.fault; - switch (FIELD_GET(EVTQ_0_ID, evt[0])) { + switch (event->id) { + case EVT_ID_BAD_STE_CONFIG: + case EVT_ID_STREAM_DISABLED_FAULT: + case EVT_ID_BAD_SUBSTREAMID_CONFIG: + case EVT_ID_BAD_CD_CONFIG: case EVT_ID_TRANSLATION_FAULT: case EVT_ID_ADDR_SIZE_FAULT: case EVT_ID_ACCESS_FAULT: @@ -1722,73 +1836,126 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) return -EOPNOTSUPP; } - /* Stage-2 is always pinned at the moment */ - if (evt[1] & EVTQ_1_S2) - return -EFAULT; - - if (!(evt[1] & EVTQ_1_STALL)) - return -EOPNOTSUPP; - - if (evt[1] & EVTQ_1_RnW) - perm |= IOMMU_FAULT_PERM_READ; - else - perm |= IOMMU_FAULT_PERM_WRITE; + if (event->stall) { + if (event->read) + perm |= IOMMU_FAULT_PERM_READ; + else + perm |= IOMMU_FAULT_PERM_WRITE; - if (evt[1] & EVTQ_1_InD) - perm |= IOMMU_FAULT_PERM_EXEC; + if (event->instruction) + perm |= IOMMU_FAULT_PERM_EXEC; - if (evt[1] & EVTQ_1_PnU) - perm |= IOMMU_FAULT_PERM_PRIV; + if (event->privileged) + perm |= IOMMU_FAULT_PERM_PRIV; - flt->type = IOMMU_FAULT_PAGE_REQ; - flt->prm = (struct iommu_fault_page_request) { - .flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE, - .grpid = FIELD_GET(EVTQ_1_STAG, evt[1]), - .perm = perm, - .addr = FIELD_GET(EVTQ_2_ADDR, evt[2]), - }; + flt->type = IOMMU_FAULT_PAGE_REQ; + flt->prm = (struct iommu_fault_page_request){ + .flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE, + .grpid = event->stag, + .perm = perm, + .addr = event->iova, + }; - if (ssid_valid) { - flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - flt->prm.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]); + if (event->ssv) { + flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + flt->prm.pasid = event->ssid; + } } mutex_lock(&smmu->streams_mutex); - master = arm_smmu_find_master(smmu, sid); + master = arm_smmu_find_master(smmu, event->sid); if (!master) { ret = -EINVAL; goto out_unlock; } - iommu_report_device_fault(master->dev, &fault_evt); + if (event->stall) + ret = iommu_report_device_fault(master->dev, &fault_evt); + else if (master->vmaster && !event->s2) + ret = arm_vmaster_report_event(master->vmaster, evt); + else + ret = -EOPNOTSUPP; /* Unhandled events should be pinned */ out_unlock: mutex_unlock(&smmu->streams_mutex); return ret; } +static void arm_smmu_dump_raw_event(struct arm_smmu_device *smmu, u64 *raw, + 
struct arm_smmu_event *event) +{ + int i; + + dev_err(smmu->dev, "event 0x%02x received:\n", event->id); + + for (i = 0; i < EVTQ_ENT_DWORDS; ++i) + dev_err(smmu->dev, "\t0x%016llx\n", raw[i]); +} + +#define ARM_SMMU_EVT_KNOWN(e) ((e)->id < ARRAY_SIZE(event_str) && event_str[(e)->id]) +#define ARM_SMMU_LOG_EVT_STR(e) ARM_SMMU_EVT_KNOWN(e) ? event_str[(e)->id] : "UNKNOWN" +#define ARM_SMMU_LOG_CLIENT(e) (e)->dev ? dev_name((e)->dev) : "(unassigned sid)" + +static void arm_smmu_dump_event(struct arm_smmu_device *smmu, u64 *raw, + struct arm_smmu_event *evt, + struct ratelimit_state *rs) +{ + if (!__ratelimit(rs)) + return; + + arm_smmu_dump_raw_event(smmu, raw, evt); + + switch (evt->id) { + case EVT_ID_TRANSLATION_FAULT: + case EVT_ID_ADDR_SIZE_FAULT: + case EVT_ID_ACCESS_FAULT: + case EVT_ID_PERMISSION_FAULT: + dev_err(smmu->dev, "event: %s client: %s sid: %#x ssid: %#x iova: %#llx ipa: %#llx", + ARM_SMMU_LOG_EVT_STR(evt), ARM_SMMU_LOG_CLIENT(evt), + evt->sid, evt->ssid, evt->iova, evt->ipa); + + dev_err(smmu->dev, "%s %s %s %s \"%s\"%s%s stag: %#x", + evt->privileged ? "priv" : "unpriv", + evt->instruction ? "inst" : "data", + str_read_write(evt->read), + evt->s2 ? "s2" : "s1", event_class_str[evt->class], + evt->class_tt ? (evt->ttrnw ? " ttd_read" : " ttd_write") : "", + evt->stall ? " stall" : "", evt->stag); + + break; + + case EVT_ID_STE_FETCH_FAULT: + case EVT_ID_CD_FETCH_FAULT: + case EVT_ID_VMS_FETCH_FAULT: + dev_err(smmu->dev, "event: %s client: %s sid: %#x ssid: %#x fetch_addr: %#llx", + ARM_SMMU_LOG_EVT_STR(evt), ARM_SMMU_LOG_CLIENT(evt), + evt->sid, evt->ssid, evt->fetch_addr); + + break; + + default: + dev_err(smmu->dev, "event: %s client: %s sid: %#x ssid: %#x", + ARM_SMMU_LOG_EVT_STR(evt), ARM_SMMU_LOG_CLIENT(evt), + evt->sid, evt->ssid); + } +} + static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev) { - int i, ret; + u64 evt[EVTQ_ENT_DWORDS]; + struct arm_smmu_event event = {0}; struct arm_smmu_device *smmu = dev; struct arm_smmu_queue *q = &smmu->evtq.q; struct arm_smmu_ll_queue *llq = &q->llq; static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - u64 evt[EVTQ_ENT_DWORDS]; do { while (!queue_remove_raw(q, evt)) { - u8 id = FIELD_GET(EVTQ_0_ID, evt[0]); - - ret = arm_smmu_handle_evt(smmu, evt); - if (!ret || !__ratelimit(&rs)) - continue; - - dev_info(smmu->dev, "event 0x%02x received:\n", id); - for (i = 0; i < ARRAY_SIZE(evt); ++i) - dev_info(smmu->dev, "\t0x%016llx\n", - (unsigned long long)evt[i]); + arm_smmu_decode_event(smmu, evt, &event); + if (arm_smmu_handle_event(smmu, evt, &event)) + arm_smmu_dump_event(smmu, evt, &event, &rs); + put_device(event.dev); cond_resched(); } @@ -1995,15 +2162,16 @@ arm_smmu_atc_inv_to_cmd(int ssid, unsigned long iova, size_t size, cmd->atc.size = log2_span; } -static int arm_smmu_atc_inv_master(struct arm_smmu_master *master) +static int arm_smmu_atc_inv_master(struct arm_smmu_master *master, + ioasid_t ssid) { int i; struct arm_smmu_cmdq_ent cmd; struct arm_smmu_cmdq_batch cmds; - arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd); + arm_smmu_atc_inv_to_cmd(ssid, 0, 0, &cmd); - cmds.num = 0; + arm_smmu_cmdq_batch_init(master->smmu, &cmds, &cmd); for (i = 0; i < master->num_streams; i++) { cmd.atc.sid = master->streams[i].id; arm_smmu_cmdq_batch_add(master->smmu, &cmds, &cmd); @@ -2012,13 +2180,15 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master *master) return arm_smmu_cmdq_batch_submit(master->smmu, &cmds); } -int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int 
ssid, +int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, unsigned long iova, size_t size) { + struct arm_smmu_master_domain *master_domain; int i; unsigned long flags; - struct arm_smmu_cmdq_ent cmd; - struct arm_smmu_master *master; + struct arm_smmu_cmdq_ent cmd = { + .opcode = CMDQ_OP_ATC_INV, + }; struct arm_smmu_cmdq_batch cmds; if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS)) @@ -2041,15 +2211,27 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, if (!atomic_read(&smmu_domain->nr_ats_masters)) return 0; - arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd); - - cmds.num = 0; + arm_smmu_cmdq_batch_init(smmu_domain->smmu, &cmds, &cmd); spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master, &smmu_domain->devices, domain_head) { + list_for_each_entry(master_domain, &smmu_domain->devices, + devices_elm) { + struct arm_smmu_master *master = master_domain->master; + if (!master->ats_enabled) continue; + if (master_domain->nested_ats_flush) { + /* + * If an S2 used as a nesting parent is changed, we have + * no option but to completely flush the ATC. + */ + arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd); + } else { + arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size, + &cmd); + } + for (i = 0; i < master->num_streams; i++) { cmd.atc.sid = master->streams[i].id; arm_smmu_cmdq_batch_add(smmu_domain->smmu, &cmds, &cmd); @@ -2081,7 +2263,7 @@ static void arm_smmu_tlb_inv_context(void *cookie) cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid; arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); } - arm_smmu_atc_inv_domain(smmu_domain, IOMMU_NO_PASID, 0, 0); + arm_smmu_atc_inv_domain(smmu_domain, 0, 0); } static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd, @@ -2120,7 +2302,7 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd, num_pages++; } - cmds.num = 0; + arm_smmu_cmdq_batch_init(smmu, &cmds, cmd); while (iova < end) { if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) { @@ -2175,11 +2357,20 @@ static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size, } __arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain); + if (smmu_domain->nest_parent) { + /* + * When the S2 domain changes, all the nested S1 ASIDs have to be + * flushed too. + */ + cmd.opcode = CMDQ_OP_TLBI_NH_ALL; + arm_smmu_cmdq_issue_cmd_with_sync(smmu_domain->smmu, &cmd); + } + /* * Unfortunately, this can't be leaf-only since we may have * zapped an entire table.
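The ATC invalidations issued here are encoded as naturally aligned power-of-two spans, as in the tail of arm_smmu_atc_inv_to_cmd() above. A standalone model of that span computation (userspace toy; fls_long() is replaced by an explicit loop):

#include <stdint.h>
#include <stdio.h>

static void atc_span(uint64_t iova, uint64_t size, unsigned int grain_shift)
{
        uint64_t first = iova >> grain_shift;
        uint64_t last = (iova + size - 1) >> grain_shift;
        unsigned int log2_span = 0;

        /* highest bit where the first and last page numbers differ */
        while (first != last) {
                first >>= 1;
                last >>= 1;
                log2_span++;
        }
        printf("addr %#llx, span 2^%u pages\n",
               (unsigned long long)(first << (grain_shift + log2_span)),
               log2_span);
}

int main(void)
{
        /* pages 0x12..0x14 need an 8-page span starting at 0x10000 */
        atc_span(0x12000, 0x3000, 12);
        return 0;
}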
*/ - arm_smmu_atc_inv_domain(smmu_domain, IOMMU_NO_PASID, iova, size); + arm_smmu_atc_inv_domain(smmu_domain, iova, size); } void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid, @@ -2220,6 +2411,13 @@ static const struct iommu_flush_ops arm_smmu_flush_ops = { .tlb_add_page = arm_smmu_tlb_inv_page_nosync, }; +static bool arm_smmu_dbm_capable(struct arm_smmu_device *smmu) +{ + u32 features = (ARM_SMMU_FEAT_HD | ARM_SMMU_FEAT_COHERENCY); + + return (smmu->features & features) == features; +} + /* IOMMU API */ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) { @@ -2229,54 +2427,53 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) case IOMMU_CAP_CACHE_COHERENCY: /* Assume that a coherent TCU implies coherent TBUs */ return master->smmu->features & ARM_SMMU_FEAT_COHERENCY; + case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: + return arm_smmu_master_canwbs(master); case IOMMU_CAP_NOEXEC: case IOMMU_CAP_DEFERRED_FLUSH: return true; + case IOMMU_CAP_DIRTY_TRACKING: + return arm_smmu_dbm_capable(master->smmu); default: return false; } } -static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) +static bool arm_smmu_enforce_cache_coherency(struct iommu_domain *domain) { + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct arm_smmu_master_domain *master_domain; + unsigned long flags; + bool ret = true; - if (type == IOMMU_DOMAIN_SVA) - return arm_smmu_sva_domain_alloc(); - return ERR_PTR(-EOPNOTSUPP); + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + list_for_each_entry(master_domain, &smmu_domain->devices, + devices_elm) { + if (!arm_smmu_master_canwbs(master_domain->master)) { + ret = false; + break; + } + } + smmu_domain->enforce_cache_coherency = ret; + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + return ret; } -static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) +struct arm_smmu_domain *arm_smmu_domain_alloc(void) { struct arm_smmu_domain *smmu_domain; - /* - * Allocate the domain and initialise some of its data structures. - * We can't really do anything meaningful until we've added a - * master. 
- */ smmu_domain = kzalloc(sizeof(*smmu_domain), GFP_KERNEL); if (!smmu_domain) return ERR_PTR(-ENOMEM); - mutex_init(&smmu_domain->init_mutex); INIT_LIST_HEAD(&smmu_domain->devices); spin_lock_init(&smmu_domain->devices_lock); - INIT_LIST_HEAD(&smmu_domain->mmu_notifiers); - - if (dev) { - struct arm_smmu_master *master = dev_iommu_priv_get(dev); - int ret; - ret = arm_smmu_domain_finalise(smmu_domain, master->smmu); - if (ret) { - kfree(smmu_domain); - return ERR_PTR(ret); - } - } - return &smmu_domain->domain; + return smmu_domain; } -static void arm_smmu_domain_free(struct iommu_domain *domain) +static void arm_smmu_domain_free_paging(struct iommu_domain *domain) { struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; @@ -2287,7 +2484,7 @@ static void arm_smmu_domain_free(struct iommu_domain *domain) if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { /* Prevent SVA from touching the CD while we're freeing it */ mutex_lock(&arm_smmu_asid_lock); - arm_smmu_free_asid(&smmu_domain->cd); + xa_erase(&arm_smmu_asid_xa, smmu_domain->cd.asid); mutex_unlock(&arm_smmu_asid_lock); } else { struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg; @@ -2302,14 +2499,12 @@ static int arm_smmu_domain_finalise_s1(struct arm_smmu_device *smmu, struct arm_smmu_domain *smmu_domain) { int ret; - u32 asid; + u32 asid = 0; struct arm_smmu_ctx_desc *cd = &smmu_domain->cd; - refcount_set(&cd->refs, 1); - /* Prevent SVA from modifying the ASID until it is written to the CD */ mutex_lock(&arm_smmu_asid_lock); - ret = xa_alloc(&arm_smmu_asid_xa, &asid, cd, + ret = xa_alloc(&arm_smmu_asid_xa, &asid, smmu_domain, XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL); cd->asid = (u16)asid; mutex_unlock(&arm_smmu_asid_lock); @@ -2333,49 +2528,51 @@ static int arm_smmu_domain_finalise_s2(struct arm_smmu_device *smmu, } static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, - struct arm_smmu_device *smmu) + struct arm_smmu_device *smmu, u32 flags) { int ret; - unsigned long ias, oas; enum io_pgtable_fmt fmt; struct io_pgtable_cfg pgtbl_cfg; struct io_pgtable_ops *pgtbl_ops; int (*finalise_stage_fn)(struct arm_smmu_device *smmu, struct arm_smmu_domain *smmu_domain); + bool enable_dirty = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; - /* Restrict the stage to what we can actually support */ - if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S1)) - smmu_domain->stage = ARM_SMMU_DOMAIN_S2; - if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S2)) - smmu_domain->stage = ARM_SMMU_DOMAIN_S1; + pgtbl_cfg = (struct io_pgtable_cfg) { + .pgsize_bitmap = smmu->pgsize_bitmap, + .coherent_walk = smmu->features & ARM_SMMU_FEAT_COHERENCY, + .tlb = &arm_smmu_flush_ops, + .iommu_dev = smmu->dev, + }; switch (smmu_domain->stage) { - case ARM_SMMU_DOMAIN_S1: - ias = (smmu->features & ARM_SMMU_FEAT_VAX) ? 52 : 48; - ias = min_t(unsigned long, ias, VA_BITS); - oas = smmu->ias; + case ARM_SMMU_DOMAIN_S1: { + unsigned long ias = (smmu->features & + ARM_SMMU_FEAT_VAX) ? 
52 : 48; + + pgtbl_cfg.ias = min_t(unsigned long, ias, VA_BITS); + pgtbl_cfg.oas = smmu->ias; + if (enable_dirty) + pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_ARM_HD; fmt = ARM_64_LPAE_S1; finalise_stage_fn = arm_smmu_domain_finalise_s1; break; + } case ARM_SMMU_DOMAIN_S2: - ias = smmu->ias; - oas = smmu->oas; + if (enable_dirty) + return -EOPNOTSUPP; + pgtbl_cfg.ias = smmu->ias; + pgtbl_cfg.oas = smmu->oas; fmt = ARM_64_LPAE_S2; finalise_stage_fn = arm_smmu_domain_finalise_s2; + if ((smmu->features & ARM_SMMU_FEAT_S2FWB) && + (flags & IOMMU_HWPT_ALLOC_NEST_PARENT)) + pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_ARM_S2FWB; break; default: return -EINVAL; } - pgtbl_cfg = (struct io_pgtable_cfg) { - .pgsize_bitmap = smmu->pgsize_bitmap, - .ias = ias, - .oas = oas, - .coherent_walk = smmu->features & ARM_SMMU_FEAT_COHERENCY, - .tlb = &arm_smmu_flush_ops, - .iommu_dev = smmu->dev, - }; - pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain); if (!pgtbl_ops) return -ENOMEM; @@ -2383,6 +2580,8 @@ static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, smmu_domain->domain.pgsize_bitmap = pgtbl_cfg.pgsize_bitmap; smmu_domain->domain.geometry.aperture_end = (1UL << pgtbl_cfg.ias) - 1; smmu_domain->domain.geometry.force_aperture = true; + if (enable_dirty && smmu_domain->stage == ARM_SMMU_DOMAIN_S1) + smmu_domain->domain.dirty_ops = &arm_smmu_dirty_ops; ret = finalise_stage_fn(smmu, smmu_domain); if (ret < 0) { @@ -2401,25 +2600,28 @@ arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid) struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) { - unsigned int idx1, idx2; - /* Two-level walk */ - idx1 = (sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS; - idx2 = sid & ((1 << STRTAB_SPLIT) - 1); - return &cfg->l1_desc[idx1].l2ptr[idx2]; + return &cfg->l2.l2ptrs[arm_smmu_strtab_l1_idx(sid)] + ->stes[arm_smmu_strtab_l2_idx(sid)]; } else { /* Simple linear lookup */ - return (struct arm_smmu_ste *)&cfg - ->strtab[sid * STRTAB_STE_DWORDS]; + return &cfg->linear.table[sid]; } } -static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master, - const struct arm_smmu_ste *target) +void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master, + const struct arm_smmu_ste *target) { int i, j; struct arm_smmu_device *smmu = master->smmu; + master->cd_table.in_ste = + FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(target->data[0])) == + STRTAB_STE_0_CFG_S1_TRANS; + master->ste_ats_enabled = + FIELD_GET(STRTAB_STE_1_EATS, le64_to_cpu(target->data[1])) == + STRTAB_STE_1_EATS_TRANS; + for (i = 0; i < master->num_streams; ++i) { u32 sid = master->streams[i].id; struct arm_smmu_ste *step = @@ -2451,46 +2653,24 @@ static bool arm_smmu_ats_supported(struct arm_smmu_master *master) return dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)); } -static void arm_smmu_enable_ats(struct arm_smmu_master *master, - struct arm_smmu_domain *smmu_domain) +static void arm_smmu_enable_ats(struct arm_smmu_master *master) { size_t stu; struct pci_dev *pdev; struct arm_smmu_device *smmu = master->smmu; - /* Don't enable ATS at the endpoint if it's not enabled in the STE */ - if (!master->ats_enabled) - return; - /* Smallest Translation Unit: log2 of the smallest supported granule */ stu = __ffs(smmu->pgsize_bitmap); pdev = to_pci_dev(master->dev); - atomic_inc(&smmu_domain->nr_ats_masters); /* * ATC invalidation of PASID 0 causes the entire ATC to be flushed. 
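/*
 * Stand-alone illustration of the two-level SID split that
 * arm_smmu_get_step_for_sid() now relies on: with STRTAB_SPLIT == 8 each
 * L2 table holds 256 STEs, so the L1 index is sid / 256 and the L2 index
 * is sid % 256 (see arm_smmu_strtab_l1_idx/_l2_idx in the header hunk).
 */
#include <stdio.h>
#include <stdint.h>

#define STRTAB_SPLIT		8
#define STRTAB_NUM_L2_STES	(1u << STRTAB_SPLIT)

static uint32_t strtab_l1_idx(uint32_t sid) { return sid / STRTAB_NUM_L2_STES; }
static uint32_t strtab_l2_idx(uint32_t sid) { return sid % STRTAB_NUM_L2_STES; }

int main(void)
{
	uint32_t sid = 0x1234;

	/* SID 0x1234 -> L1 entry 0x12, STE 0x34 within that L2 table */
	printf("sid 0x%x -> l1 0x%x, l2 0x%x\n", sid,
	       strtab_l1_idx(sid), strtab_l2_idx(sid));
	return 0;
}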
*/ - arm_smmu_atc_inv_master(master); + arm_smmu_atc_inv_master(master, IOMMU_NO_PASID); if (pci_enable_ats(pdev, stu)) dev_err(master->dev, "Failed to enable ATS (STU %zu)\n", stu); } -static void arm_smmu_disable_ats(struct arm_smmu_master *master, - struct arm_smmu_domain *smmu_domain) -{ - if (!master->ats_enabled) - return; - - pci_disable_ats(to_pci_dev(master->dev)); - /* - * Ensure ATS is disabled at the endpoint before we issue the - * ATC invalidation via the SMMU. - */ - wmb(); - arm_smmu_atc_inv_master(master); - atomic_dec(&smmu_domain->nr_ats_masters); -} - static int arm_smmu_enable_pasid(struct arm_smmu_master *master) { int ret; @@ -2538,68 +2718,320 @@ static void arm_smmu_disable_pasid(struct arm_smmu_master *master) pci_disable_pasid(pdev); } -static void arm_smmu_detach_dev(struct arm_smmu_master *master) +static struct arm_smmu_master_domain * +arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain, + struct iommu_domain *domain, + struct arm_smmu_master *master, + ioasid_t ssid, bool nested_ats_flush) { - struct iommu_domain *domain = iommu_get_domain_for_dev(master->dev); - struct arm_smmu_domain *smmu_domain; + struct arm_smmu_master_domain *master_domain; + + lockdep_assert_held(&smmu_domain->devices_lock); + + list_for_each_entry(master_domain, &smmu_domain->devices, + devices_elm) { + if (master_domain->master == master && + master_domain->domain == domain && + master_domain->ssid == ssid && + master_domain->nested_ats_flush == nested_ats_flush) + return master_domain; + } + return NULL; +} + +/* + * If the domain uses the smmu_domain->devices list return the arm_smmu_domain + * structure, otherwise NULL. These domains track attached devices so they can + * issue invalidations. + */ +static struct arm_smmu_domain * +to_smmu_domain_devices(struct iommu_domain *domain) +{ + /* The domain can be NULL only when processing the first attach */ + if (!domain) + return NULL; + if ((domain->type & __IOMMU_DOMAIN_PAGING) || + domain->type == IOMMU_DOMAIN_SVA) + return to_smmu_domain(domain); + if (domain->type == IOMMU_DOMAIN_NESTED) + return to_smmu_nested_domain(domain)->vsmmu->s2_parent; + return NULL; +} + +static int arm_smmu_enable_iopf(struct arm_smmu_master *master, + struct arm_smmu_master_domain *master_domain) +{ + int ret; + + iommu_group_mutex_assert(master->dev); + + if (!IS_ENABLED(CONFIG_ARM_SMMU_V3_SVA)) + return -EOPNOTSUPP; + + /* + * Drivers for devices supporting PRI or stall require IOPF; others have + * device-specific fault handlers and don't need IOPF, so this is not a + * failure.
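/*
 * Reduced sketch of the iopf_refcount scheme that
 * arm_smmu_enable_iopf()/arm_smmu_disable_iopf() implement here: only
 * the 0 -> 1 transition adds the device to the fault queue and only the
 * 1 -> 0 transition removes it. The queue calls are stubbed; the real
 * code also records per-attachment usage in master_domain->using_iopf.
 */
struct iopf_sketch {
	unsigned int refcount;
};

static int iopf_sketch_get(struct iopf_sketch *m)
{
	if (m->refcount) {
		m->refcount++;
		return 0;
	}
	/* first user: iopf_queue_add_device() happens here */
	m->refcount = 1;
	return 0;
}

static void iopf_sketch_put(struct iopf_sketch *m)
{
	m->refcount--;
	if (m->refcount == 0) {
		/* last user: iopf_queue_remove_device() happens here */
	}
}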
+ */ + if (!master->stall_enabled) + return 0; + + /* We're not keeping track of SIDs in fault events */ + if (master->num_streams != 1) + return -EOPNOTSUPP; + + if (master->iopf_refcount) { + master->iopf_refcount++; + master_domain->using_iopf = true; + return 0; + } + + ret = iopf_queue_add_device(master->smmu->evtq.iopf, master->dev); + if (ret) + return ret; + master->iopf_refcount = 1; + master_domain->using_iopf = true; + return 0; +} + +static void arm_smmu_disable_iopf(struct arm_smmu_master *master, + struct arm_smmu_master_domain *master_domain) +{ + iommu_group_mutex_assert(master->dev); + + if (!IS_ENABLED(CONFIG_ARM_SMMU_V3_SVA)) + return; + + if (!master_domain || !master_domain->using_iopf) + return; + + master->iopf_refcount--; + if (master->iopf_refcount == 0) + iopf_queue_remove_device(master->smmu->evtq.iopf, master->dev); +} + +static void arm_smmu_remove_master_domain(struct arm_smmu_master *master, + struct iommu_domain *domain, + ioasid_t ssid) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain_devices(domain); + struct arm_smmu_master_domain *master_domain; + bool nested_ats_flush = false; unsigned long flags; - if (!domain || !(domain->type & __IOMMU_DOMAIN_PAGING)) + if (!smmu_domain) return; - smmu_domain = to_smmu_domain(domain); - arm_smmu_disable_ats(master, smmu_domain); + if (domain->type == IOMMU_DOMAIN_NESTED) + nested_ats_flush = to_smmu_nested_domain(domain)->enable_ats; spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_del_init(&master->domain_head); + master_domain = arm_smmu_find_master_domain(smmu_domain, domain, master, + ssid, nested_ats_flush); + if (master_domain) { + list_del(&master_domain->devices_elm); + if (master->ats_enabled) + atomic_dec(&smmu_domain->nr_ats_masters); + } spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); - master->ats_enabled = false; + arm_smmu_disable_iopf(master, master_domain); + kfree(master_domain); } -static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) +/* + * Start the sequence to attach a domain to a master. The sequence contains three + * steps: + * arm_smmu_attach_prepare() + * arm_smmu_install_ste_for_dev() + * arm_smmu_attach_commit() + * + * If prepare succeeds then the sequence must be completed. The STE installed + * must set the STE.EATS field according to state.ats_enabled. + * + * If the device supports ATS then this determines if EATS should be enabled + * in the STE, and starts sequencing EATS disable if required. + * + * The change of the EATS in the STE and the PCI ATS config space is managed by + * this sequence to be in the right order so that if PCI ATS is enabled then + * STE.EATS is enabled. + * + * new_domain can be a non-paging domain. In this case ATS will not be enabled, + * and invalidations won't be tracked. + */ +int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, + struct iommu_domain *new_domain) { - int ret = 0; + struct arm_smmu_master *master = state->master; + struct arm_smmu_master_domain *master_domain; + struct arm_smmu_domain *smmu_domain = + to_smmu_domain_devices(new_domain); unsigned long flags; + int ret; + + /* + * arm_smmu_share_asid() must not see two domains pointing to the same + * arm_smmu_master_domain contents, otherwise it could randomly write one + * or the other to the CD. + */ + lockdep_assert_held(&arm_smmu_asid_lock); + + if (smmu_domain || state->cd_needs_ats) { + /* + * The SMMU does not support enabling ATS with bypass/abort.
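/*
 * Condensed sketch of the three-step sequence documented above, in the
 * shape arm_smmu_attach_dev() uses it (error handling elided; how *ste
 * is built depends on the domain type):
 */
static void attach_sequence_sketch(struct arm_smmu_master *master,
				   struct iommu_domain *new_domain,
				   struct iommu_domain *old_domain,
				   struct arm_smmu_ste *ste)
{
	struct arm_smmu_attach_state state = {
		.master = master,
		.old_domain = old_domain,
		.ssid = IOMMU_NO_PASID,
	};

	if (arm_smmu_attach_prepare(&state, new_domain))
		return;				/* nothing installed yet */
	/* build *ste so STE.EATS matches state.ats_enabled, then: */
	arm_smmu_install_ste_for_dev(master, ste);
	arm_smmu_attach_commit(&state);		/* ATC flush + old-domain cleanup */
}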
+ * When the STE is in bypass (STE.Config[2:0] == 0b100), ATS + * Translation Requests and Translated transactions are denied + * as though ATS is disabled for the stream (STE.EATS == 0b00), + * causing F_BAD_ATS_TREQ and F_TRANSL_FORBIDDEN events + * (IHI0070Ea 5.2 Stream Table Entry). + * + * However, if we have installed a CD table and are using S1DSS + * then ATS will work in S1DSS bypass. See "13.6.4 Full ATS + * skipping stage 1". + * + * Disable ATS if we are going to create a normal 0b100 bypass + * STE. + */ + state->ats_enabled = !state->disable_ats && + arm_smmu_ats_supported(master); + } + + if (smmu_domain) { + if (new_domain->type == IOMMU_DOMAIN_NESTED) { + ret = arm_smmu_attach_prepare_vmaster( + state, to_smmu_nested_domain(new_domain)); + if (ret) + return ret; + } + + master_domain = kzalloc(sizeof(*master_domain), GFP_KERNEL); + if (!master_domain) { + ret = -ENOMEM; + goto err_free_vmaster; + } + master_domain->domain = new_domain; + master_domain->master = master; + master_domain->ssid = state->ssid; + if (new_domain->type == IOMMU_DOMAIN_NESTED) + master_domain->nested_ats_flush = + to_smmu_nested_domain(new_domain)->enable_ats; + + if (new_domain->iopf_handler) { + ret = arm_smmu_enable_iopf(master, master_domain); + if (ret) + goto err_free_master_domain; + } + + /* + * During prepare we want the current smmu_domain and new + * smmu_domain to be in the devices list before we change any + * HW. This ensures that both domains will send ATS + * invalidations to the master until we are done. + * + * It is tempting to make this list only track masters that are + * using ATS, but arm_smmu_share_asid() also uses this to change + * the ASID of a domain, unrelated to ATS. + * + * Notice if we are re-attaching the same domain then the list + * will have two identical entries and commit will remove only + * one of them. + */ + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + if (smmu_domain->enforce_cache_coherency && + !arm_smmu_master_canwbs(master)) { + spin_unlock_irqrestore(&smmu_domain->devices_lock, + flags); + ret = -EINVAL; + goto err_iopf; + } + + if (state->ats_enabled) + atomic_inc(&smmu_domain->nr_ats_masters); + list_add(&master_domain->devices_elm, &smmu_domain->devices); + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + } + + if (!state->ats_enabled && master->ats_enabled) { + pci_disable_ats(to_pci_dev(master->dev)); + /* + * This is probably overkill, but the config write for disabling + * ATS should complete before the STE is configured to generate + * UR to avoid AER noise. + */ + wmb(); + } + return 0; + +err_iopf: + arm_smmu_disable_iopf(master, master_domain); +err_free_master_domain: + kfree(master_domain); +err_free_vmaster: + kfree(state->vmaster); + return ret; +} + +/* + * Commit is done after the STE/CD are configured with the EATS setting. It + * completes synchronizing the PCI device's ATC and finishes manipulating the + * smmu_domain->devices list. + */ +void arm_smmu_attach_commit(struct arm_smmu_attach_state *state) +{ + struct arm_smmu_master *master = state->master; + + lockdep_assert_held(&arm_smmu_asid_lock); + + arm_smmu_attach_commit_vmaster(state); + + if (state->ats_enabled && !master->ats_enabled) { + arm_smmu_enable_ats(master); + } else if (state->ats_enabled && master->ats_enabled) { + /* + * The translation has changed, flush the ATC. At this point the + * SMMU is translating for the new domain and both the old&new + * domain will issue invalidations. 
+ */ + arm_smmu_atc_inv_master(master, state->ssid); + } else if (!state->ats_enabled && master->ats_enabled) { + /* ATS is being switched off, invalidate the entire ATC */ + arm_smmu_atc_inv_master(master, IOMMU_NO_PASID); + } + + arm_smmu_remove_master_domain(master, state->old_domain, state->ssid); + master->ats_enabled = state->ats_enabled; +} + +static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old_domain) +{ + int ret = 0; struct arm_smmu_ste target; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct arm_smmu_device *smmu; struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct arm_smmu_attach_state state = { + .old_domain = old_domain, + .ssid = IOMMU_NO_PASID, + }; struct arm_smmu_master *master; struct arm_smmu_cd *cdptr; if (!fwspec) return -ENOENT; - master = dev_iommu_priv_get(dev); + state.master = master = dev_iommu_priv_get(dev); smmu = master->smmu; - /* - * Checking that SVA is disabled ensures that this device isn't bound to - * any mm, and can be safely detached from its old domain. Bonds cannot - * be removed concurrently since we're holding the group mutex. - */ - if (arm_smmu_master_sva_enabled(master)) { - dev_err(dev, "cannot attach - SVA enabled\n"); - return -EBUSY; - } - - mutex_lock(&smmu_domain->init_mutex); - - if (!smmu_domain->smmu) { - ret = arm_smmu_domain_finalise(smmu_domain, smmu); - } else if (smmu_domain->smmu != smmu) - ret = -EINVAL; - - mutex_unlock(&smmu_domain->init_mutex); - if (ret) - return ret; + if (smmu_domain->smmu != smmu) + return -EINVAL; if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { cdptr = arm_smmu_alloc_cd_ptr(master, IOMMU_NO_PASID); if (!cdptr) return -ENOMEM; - } + } else if (arm_smmu_ssids_in_use(&master->cd_table)) + return -EBUSY; /* * Prevent arm_smmu_share_asid() from trying to change the ASID @@ -2609,13 +3041,11 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) */ mutex_lock(&arm_smmu_asid_lock); - arm_smmu_detach_dev(master); - - master->ats_enabled = arm_smmu_ats_supported(master); - - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_add(&master->domain_head, &smmu_domain->devices); - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + ret = arm_smmu_attach_prepare(&state, domain); + if (ret) { + mutex_unlock(&arm_smmu_asid_lock); + return ret; + } switch (smmu_domain->stage) { case ARM_SMMU_DOMAIN_S1: { @@ -2624,29 +3054,165 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) arm_smmu_make_s1_cd(&target_cd, master, smmu_domain); arm_smmu_write_cd_entry(master, IOMMU_NO_PASID, cdptr, &target_cd); - arm_smmu_make_cdtable_ste(&target, master); + arm_smmu_make_cdtable_ste(&target, master, state.ats_enabled, + STRTAB_STE_1_S1DSS_SSID0); arm_smmu_install_ste_for_dev(master, &target); break; } case ARM_SMMU_DOMAIN_S2: - arm_smmu_make_s2_domain_ste(&target, master, smmu_domain); + arm_smmu_make_s2_domain_ste(&target, master, smmu_domain, + state.ats_enabled); arm_smmu_install_ste_for_dev(master, &target); arm_smmu_clear_cd(master, IOMMU_NO_PASID); break; } - arm_smmu_enable_ats(master, smmu_domain); + arm_smmu_attach_commit(&state); mutex_unlock(&arm_smmu_asid_lock); return 0; } -static int arm_smmu_attach_dev_ste(struct device *dev, - struct arm_smmu_ste *ste) +static int arm_smmu_s1_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t id, + struct iommu_domain *old) { + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct 
arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_device *smmu = master->smmu; + struct arm_smmu_cd target_cd; - if (arm_smmu_master_sva_enabled(master)) - return -EBUSY; + if (smmu_domain->smmu != smmu) + return -EINVAL; + + if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) + return -EINVAL; + + /* + * We can read cd.asid outside the lock because arm_smmu_set_pasid() + * will fix it. + */ + arm_smmu_make_s1_cd(&target_cd, master, smmu_domain); + return arm_smmu_set_pasid(master, to_smmu_domain(domain), id, + &target_cd, old); +} + +static void arm_smmu_update_ste(struct arm_smmu_master *master, + struct iommu_domain *sid_domain, + bool ats_enabled) +{ + unsigned int s1dss = STRTAB_STE_1_S1DSS_TERMINATE; + struct arm_smmu_ste ste; + + if (master->cd_table.in_ste && master->ste_ats_enabled == ats_enabled) + return; + + if (sid_domain->type == IOMMU_DOMAIN_IDENTITY) + s1dss = STRTAB_STE_1_S1DSS_BYPASS; + else + WARN_ON(sid_domain->type != IOMMU_DOMAIN_BLOCKED); + + /* + * Change the STE into a cdtable one with SID IDENTITY/BLOCKED behavior + * using s1dss if necessary. If the cd_table is already installed then + * the S1DSS is correct and this will just update the EATS. Otherwise it + * installs the entire thing. This will be hitless. + */ + arm_smmu_make_cdtable_ste(&ste, master, ats_enabled, s1dss); + arm_smmu_install_ste_for_dev(master, &ste); +} + +int arm_smmu_set_pasid(struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain, ioasid_t pasid, + struct arm_smmu_cd *cd, struct iommu_domain *old) +{ + struct iommu_domain *sid_domain = iommu_get_domain_for_dev(master->dev); + struct arm_smmu_attach_state state = { + .master = master, + .ssid = pasid, + .old_domain = old, + }; + struct arm_smmu_cd *cdptr; + int ret; + + /* The core code validates pasid */ + + if (smmu_domain->smmu != master->smmu) + return -EINVAL; + + if (!master->cd_table.in_ste && + sid_domain->type != IOMMU_DOMAIN_IDENTITY && + sid_domain->type != IOMMU_DOMAIN_BLOCKED) + return -EINVAL; + + cdptr = arm_smmu_alloc_cd_ptr(master, pasid); + if (!cdptr) + return -ENOMEM; + + mutex_lock(&arm_smmu_asid_lock); + ret = arm_smmu_attach_prepare(&state, &smmu_domain->domain); + if (ret) + goto out_unlock; + + /* + * We don't want to take the asid_lock too early, so fix up the + * caller-set ASID under the lock in case it changed. + */ + cd->data[0] &= ~cpu_to_le64(CTXDESC_CD_0_ASID); + cd->data[0] |= cpu_to_le64( + FIELD_PREP(CTXDESC_CD_0_ASID, smmu_domain->cd.asid)); + + arm_smmu_write_cd_entry(master, pasid, cdptr, cd); + arm_smmu_update_ste(master, sid_domain, state.ats_enabled); + + arm_smmu_attach_commit(&state); + +out_unlock: + mutex_unlock(&arm_smmu_asid_lock); + return ret; +} + +static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old_domain) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain(old_domain); + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + + mutex_lock(&arm_smmu_asid_lock); + arm_smmu_clear_cd(master, pasid); + if (master->ats_enabled) + arm_smmu_atc_inv_master(master, pasid); + arm_smmu_remove_master_domain(master, &smmu_domain->domain, pasid); + mutex_unlock(&arm_smmu_asid_lock); + + /* + * When the last user of the CD table goes away, downgrade the STE back + * to a non-cd_table one by re-attaching its sid_domain.
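/*
 * Kernel-context sketch of the CD ASID fix-up arm_smmu_set_pasid() does
 * under arm_smmu_asid_lock above: clear the ASID field of CD word 0 and
 * re-insert the value read under the lock, so a caller-built CD cannot
 * carry a stale ASID. The field position follows the SMMUv3 spec.
 */
#include <linux/bitfield.h>
#include <linux/types.h>

#define CD_0_ASID	GENMASK_ULL(63, 48)	/* CTXDESC_CD_0_ASID */

static void cd_fixup_asid(__le64 *cd0, u16 asid)
{
	u64 val = le64_to_cpu(*cd0);

	val &= ~CD_0_ASID;
	val |= FIELD_PREP(CD_0_ASID, asid);
	*cd0 = cpu_to_le64(val);
}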
+ */ + if (!arm_smmu_ssids_in_use(&master->cd_table)) { + struct iommu_domain *sid_domain = + iommu_get_domain_for_dev(master->dev); + + if (sid_domain->type == IOMMU_DOMAIN_IDENTITY || + sid_domain->type == IOMMU_DOMAIN_BLOCKED) + sid_domain->ops->attach_dev(sid_domain, dev, + sid_domain); + } + return 0; +} + +static void arm_smmu_attach_dev_ste(struct iommu_domain *domain, + struct iommu_domain *old_domain, + struct device *dev, + struct arm_smmu_ste *ste, + unsigned int s1dss) +{ + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_attach_state state = { + .master = master, + .old_domain = old_domain, + .ssid = IOMMU_NO_PASID, + }; /* * Do not allow any ASID to be changed while we are working on the STE, @@ -2655,15 +3221,25 @@ static int arm_smmu_attach_dev_ste(struct device *dev, mutex_lock(&arm_smmu_asid_lock); /* - * The SMMU does not support enabling ATS with bypass/abort. When the - * STE is in bypass (STE.Config[2:0] == 0b100), ATS Translation Requests - * and Translated transactions are denied as though ATS is disabled for - * the stream (STE.EATS == 0b00), causing F_BAD_ATS_TREQ and - * F_TRANSL_FORBIDDEN events (IHI0070Ea 5.2 Stream Table Entry). + * If the CD table is not in use, we can use the provided STE, otherwise + * we use a cdtable STE with the provided S1DSS. */ - arm_smmu_detach_dev(master); - + if (arm_smmu_ssids_in_use(&master->cd_table)) { + /* + * If a CD table has to be present then we need to run with ATS + * on because we have to assume a PASID is using ATS. For + * IDENTITY this will set things up so that S1DSS=bypass, which + * follows the explanation in "13.6.4 Full ATS skipping stage 1" + * and allows for ATS on the RID to work. + */ + state.cd_needs_ats = true; + arm_smmu_attach_prepare(&state, domain); + arm_smmu_make_cdtable_ste(ste, master, state.ats_enabled, s1dss); + } else { + arm_smmu_attach_prepare(&state, domain); + } arm_smmu_install_ste_for_dev(master, ste); + arm_smmu_attach_commit(&state); mutex_unlock(&arm_smmu_asid_lock); /* @@ -2672,17 +3248,20 @@ static int arm_smmu_attach_dev_ste(struct device *dev, * descriptor from arm_smmu_share_asid().
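/*
 * Sketch of the S1DSS selection used by arm_smmu_update_ste() and the
 * callers of arm_smmu_attach_dev_ste() above: when the CD table must
 * stay installed, the RID's behaviour is expressed through S1DSS rather
 * than a plain bypass/abort STE.
 */
static unsigned int pick_s1dss_sketch(unsigned int sid_domain_type)
{
	if (sid_domain_type == IOMMU_DOMAIN_IDENTITY)
		return STRTAB_STE_1_S1DSS_BYPASS;	/* SSID0 bypasses stage 1 */
	return STRTAB_STE_1_S1DSS_TERMINATE;		/* SSID0 transactions abort */
}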
*/ arm_smmu_clear_cd(master, IOMMU_NO_PASID); - return 0; } static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old_domain) { struct arm_smmu_ste ste; struct arm_smmu_master *master = dev_iommu_priv_get(dev); + arm_smmu_master_clear_vmaster(master); arm_smmu_make_bypass_ste(master->smmu, &ste); - return arm_smmu_attach_dev_ste(dev, &ste); + arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste, + STRTAB_STE_1_S1DSS_BYPASS); + return 0; } static const struct iommu_domain_ops arm_smmu_identity_ops = { @@ -2695,16 +3274,22 @@ static struct iommu_domain arm_smmu_identity_domain = { }; static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old_domain) { struct arm_smmu_ste ste; + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + arm_smmu_master_clear_vmaster(master); arm_smmu_make_abort_ste(&ste); - return arm_smmu_attach_dev_ste(dev, &ste); + arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste, + STRTAB_STE_1_S1DSS_TERMINATE); + return 0; } static const struct iommu_domain_ops arm_smmu_blocked_ops = { .attach_dev = arm_smmu_attach_dev_blocked, + .set_dev_pasid = arm_smmu_blocking_set_dev_pasid, }; static struct iommu_domain arm_smmu_blocked_domain = { @@ -2712,6 +3297,69 @@ static struct iommu_domain arm_smmu_blocked_domain = { .ops = &arm_smmu_blocked_ops, }; +static struct iommu_domain * +arm_smmu_domain_alloc_paging_flags(struct device *dev, u32 flags, + const struct iommu_user_data *user_data) +{ + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_device *smmu = master->smmu; + const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_PASID | + IOMMU_HWPT_ALLOC_NEST_PARENT; + struct arm_smmu_domain *smmu_domain; + int ret; + + if (flags & ~PAGING_FLAGS) + return ERR_PTR(-EOPNOTSUPP); + if (user_data) + return ERR_PTR(-EOPNOTSUPP); + + smmu_domain = arm_smmu_domain_alloc(); + if (IS_ERR(smmu_domain)) + return ERR_CAST(smmu_domain); + + switch (flags) { + case 0: + /* Prefer S1 if available */ + if (smmu->features & ARM_SMMU_FEAT_TRANS_S1) + smmu_domain->stage = ARM_SMMU_DOMAIN_S1; + else + smmu_domain->stage = ARM_SMMU_DOMAIN_S2; + break; + case IOMMU_HWPT_ALLOC_NEST_PARENT: + if (!(smmu->features & ARM_SMMU_FEAT_NESTING)) { + ret = -EOPNOTSUPP; + goto err_free; + } + smmu_domain->stage = ARM_SMMU_DOMAIN_S2; + smmu_domain->nest_parent = true; + break; + case IOMMU_HWPT_ALLOC_DIRTY_TRACKING: + case IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_ALLOC_PASID: + case IOMMU_HWPT_ALLOC_PASID: + if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S1)) { + ret = -EOPNOTSUPP; + goto err_free; + } + smmu_domain->stage = ARM_SMMU_DOMAIN_S1; + break; + default: + ret = -EOPNOTSUPP; + goto err_free; + } + + smmu_domain->domain.type = IOMMU_DOMAIN_UNMANAGED; + smmu_domain->domain.ops = arm_smmu_ops.default_domain_ops; + ret = arm_smmu_domain_finalise(smmu_domain, smmu, flags); + if (ret) + goto err_free; + return &smmu_domain->domain; + +err_free: + kfree(smmu_domain); + return ERR_PTR(ret); +} + static int arm_smmu_map_pages(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) @@ -2774,20 +3422,17 @@ static struct platform_driver arm_smmu_driver; static struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode) { - struct device *dev = driver_find_device_by_fwnode(&arm_smmu_driver.driver, - 
fwnode); + struct device *dev = bus_find_device_by_fwnode(&platform_bus_type, fwnode); + put_device(dev); return dev ? dev_get_drvdata(dev) : NULL; } static bool arm_smmu_sid_in_range(struct arm_smmu_device *smmu, u32 sid) { - unsigned long limit = smmu->strtab_cfg.num_l1_ents; - if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) - limit *= 1UL << STRTAB_SPLIT; - - return sid < limit; + return arm_smmu_strtab_l1_idx(sid) < smmu->strtab_cfg.l2.num_l1_ents; + return sid < smmu->strtab_cfg.linear.num_ents; } static int arm_smmu_init_sid_strtab(struct arm_smmu_device *smmu, u32 sid) @@ -2808,8 +3453,6 @@ static int arm_smmu_insert_master(struct arm_smmu_device *smmu, { int i; int ret = 0; - struct arm_smmu_stream *new_stream, *cur_stream; - struct rb_node **new_node, *parent_node = NULL; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(master->dev); master->streams = kcalloc(fwspec->num_ids, sizeof(*master->streams), @@ -2820,9 +3463,10 @@ static int arm_smmu_insert_master(struct arm_smmu_device *smmu, mutex_lock(&smmu->streams_mutex); for (i = 0; i < fwspec->num_ids; i++) { + struct arm_smmu_stream *new_stream = &master->streams[i]; + struct rb_node *existing; u32 sid = fwspec->ids[i]; - new_stream = &master->streams[i]; new_stream->id = sid; new_stream->master = master; @@ -2831,28 +3475,23 @@ static int arm_smmu_insert_master(struct arm_smmu_device *smmu, break; /* Insert into SID tree */ - new_node = &(smmu->streams.rb_node); - while (*new_node) { - cur_stream = rb_entry(*new_node, struct arm_smmu_stream, - node); - parent_node = *new_node; - if (cur_stream->id > new_stream->id) { - new_node = &((*new_node)->rb_left); - } else if (cur_stream->id < new_stream->id) { - new_node = &((*new_node)->rb_right); - } else { - dev_warn(master->dev, - "stream %u already in tree\n", - cur_stream->id); - ret = -EINVAL; - break; - } - } - if (ret) - break; + existing = rb_find_add(&new_stream->node, &smmu->streams, + arm_smmu_streams_cmp_node); + if (existing) { + struct arm_smmu_master *existing_master = + rb_entry(existing, struct arm_smmu_stream, node) + ->master; + + /* Bridged PCI devices may end up with duplicated IDs */ + if (existing_master == master) + continue; - rb_link_node(&new_stream->node, parent_node, new_node); - rb_insert_color(&new_stream->node, &smmu->streams); + dev_warn(master->dev, + "Aliasing StreamID 0x%x (from %s) unsupported, expect DMA to be broken\n", + sid, dev_name(existing_master->dev)); + ret = -ENODEV; + break; + } } if (ret) { @@ -2882,8 +3521,6 @@ static void arm_smmu_remove_master(struct arm_smmu_master *master) kfree(master->streams); } -static struct iommu_ops arm_smmu_ops; - static struct iommu_device *arm_smmu_probe_device(struct device *dev) { int ret; @@ -2904,8 +3541,6 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) master->dev = dev; master->smmu = smmu; - INIT_LIST_HEAD(&master->bonds); - INIT_LIST_HEAD(&master->domain_head); dev_iommu_priv_set(dev, master); ret = arm_smmu_insert_master(smmu, master); @@ -2934,6 +3569,12 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) smmu->features & ARM_SMMU_FEAT_STALL_FORCE) master->stall_enabled = true; + if (dev_is_pci(dev)) { + unsigned int stu = __ffs(smmu->pgsize_bitmap); + + pci_prepare_ats(to_pci_dev(dev), stu); + } + return &smmu->iommu; err_free_master: @@ -2945,22 +3586,36 @@ static void arm_smmu_release_device(struct device *dev) { struct arm_smmu_master *master = dev_iommu_priv_get(dev); - if (WARN_ON(arm_smmu_master_sva_enabled(master))) - 
iopf_queue_remove_device(master->smmu->evtq.iopf, dev); - - /* Put the STE back to what arm_smmu_init_strtab() sets */ - if (dev->iommu->require_direct) - arm_smmu_attach_dev_identity(&arm_smmu_identity_domain, dev); - else - arm_smmu_attach_dev_blocked(&arm_smmu_blocked_domain, dev); + WARN_ON(master->iopf_refcount); arm_smmu_disable_pasid(master); arm_smmu_remove_master(master); - if (master->cd_table.cdtab) + if (arm_smmu_cdtab_allocated(&master->cd_table)) arm_smmu_free_cd_tables(master); kfree(master); } +static int arm_smmu_read_and_clear_dirty(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; + + return ops->read_and_clear_dirty(ops, iova, size, flags, dirty); +} + +static int arm_smmu_set_dirty_tracking(struct iommu_domain *domain, + bool enabled) +{ + /* + * Always enabled and the dirty bitmap is cleared prior to + * set_dirty_tracking(). + */ + return 0; +} + static struct iommu_group *arm_smmu_device_group(struct device *dev) { struct iommu_group *group; @@ -2978,21 +3633,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) return group; } -static int arm_smmu_enable_nesting(struct iommu_domain *domain) -{ - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - int ret = 0; - - mutex_lock(&smmu_domain->init_mutex); - if (smmu_domain->smmu) - ret = -EPERM; - else - smmu_domain->stage = ARM_SMMU_DOMAIN_S2; - mutex_unlock(&smmu_domain->init_mutex); - - return ret; -} - static int arm_smmu_of_xlate(struct device *dev, const struct of_phandle_args *args) { @@ -3015,58 +3655,6 @@ static void arm_smmu_get_resv_regions(struct device *dev, iommu_dma_get_resv_regions(dev, head); } -static int arm_smmu_dev_enable_feature(struct device *dev, - enum iommu_dev_features feat) -{ - struct arm_smmu_master *master = dev_iommu_priv_get(dev); - - if (!master) - return -ENODEV; - - switch (feat) { - case IOMMU_DEV_FEAT_IOPF: - if (!arm_smmu_master_iopf_supported(master)) - return -EINVAL; - if (master->iopf_enabled) - return -EBUSY; - master->iopf_enabled = true; - return 0; - case IOMMU_DEV_FEAT_SVA: - if (!arm_smmu_master_sva_supported(master)) - return -EINVAL; - if (arm_smmu_master_sva_enabled(master)) - return -EBUSY; - return arm_smmu_master_enable_sva(master); - default: - return -EINVAL; - } -} - -static int arm_smmu_dev_disable_feature(struct device *dev, - enum iommu_dev_features feat) -{ - struct arm_smmu_master *master = dev_iommu_priv_get(dev); - - if (!master) - return -EINVAL; - - switch (feat) { - case IOMMU_DEV_FEAT_IOPF: - if (!master->iopf_enabled) - return -EINVAL; - if (master->sva_enabled) - return -EBUSY; - master->iopf_enabled = false; - return 0; - case IOMMU_DEV_FEAT_SVA: - if (!arm_smmu_master_sva_enabled(master)) - return -EINVAL; - return arm_smmu_master_disable_sva(master); - default: - return -EINVAL; - } -} - /* * HiSilicon PCIe tune and trace device can be used to trace TLP headers on the * PCIe link and save the data to memory by DMA. 
The hardware is restricted to @@ -3087,49 +3675,48 @@ static int arm_smmu_def_domain_type(struct device *dev) return 0; } -static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid, - struct iommu_domain *domain) -{ - arm_smmu_sva_remove_dev_pasid(domain, dev, pasid); -} - -static struct iommu_ops arm_smmu_ops = { +static const struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, + .release_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, - .domain_alloc = arm_smmu_domain_alloc, - .domain_alloc_paging = arm_smmu_domain_alloc_paging, + .hw_info = arm_smmu_hw_info, + .domain_alloc_sva = arm_smmu_sva_domain_alloc, + .domain_alloc_paging_flags = arm_smmu_domain_alloc_paging_flags, .probe_device = arm_smmu_probe_device, .release_device = arm_smmu_release_device, .device_group = arm_smmu_device_group, .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, - .remove_dev_pasid = arm_smmu_remove_dev_pasid, - .dev_enable_feat = arm_smmu_dev_enable_feature, - .dev_disable_feat = arm_smmu_dev_disable_feature, .page_response = arm_smmu_page_response, .def_domain_type = arm_smmu_def_domain_type, - .pgsize_bitmap = -1UL, /* Restricted during device attach */ + .get_viommu_size = arm_smmu_get_viommu_size, + .viommu_init = arm_vsmmu_init, + .user_pasid_table = 1, .owner = THIS_MODULE, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = arm_smmu_attach_dev, + .enforce_cache_coherency = arm_smmu_enforce_cache_coherency, + .set_dev_pasid = arm_smmu_s1_set_dev_pasid, .map_pages = arm_smmu_map_pages, .unmap_pages = arm_smmu_unmap_pages, .flush_iotlb_all = arm_smmu_flush_iotlb_all, .iotlb_sync = arm_smmu_iotlb_sync, .iova_to_phys = arm_smmu_iova_to_phys, - .enable_nesting = arm_smmu_enable_nesting, - .free = arm_smmu_domain_free, + .free = arm_smmu_domain_free_paging, } }; +static struct iommu_dirty_ops arm_smmu_dirty_ops = { + .read_and_clear_dirty = arm_smmu_read_and_clear_dirty, + .set_dirty_tracking = arm_smmu_set_dirty_tracking, +}; + /* Probing and initialisation functions */ -static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, - struct arm_smmu_queue *q, - void __iomem *page, - unsigned long prod_off, - unsigned long cons_off, - size_t dwords, const char *name) +int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, + struct arm_smmu_queue *q, void __iomem *page, + unsigned long prod_off, unsigned long cons_off, + size_t dwords, const char *name) { size_t qsz; @@ -3167,9 +3754,9 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, return 0; } -static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu) +int arm_smmu_cmdq_init(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq) { - struct arm_smmu_cmdq *cmdq = &smmu->cmdq; unsigned int nents = 1 << cmdq->q.llq.max_n_shift; atomic_set(&cmdq->owner_prod, 0); @@ -3194,7 +3781,7 @@ static int arm_smmu_init_queues(struct arm_smmu_device *smmu) if (ret) return ret; - ret = arm_smmu_cmdq_init(smmu); + ret = arm_smmu_cmdq_init(smmu, &smmu->cmdq); if (ret) return ret; @@ -3221,109 +3808,71 @@ static int arm_smmu_init_queues(struct arm_smmu_device *smmu) PRIQ_ENT_DWORDS, "priq"); } -static int arm_smmu_init_l1_strtab(struct arm_smmu_device *smmu) -{ - unsigned int i; - struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; - void *strtab = smmu->strtab_cfg.strtab; - - cfg->l1_desc = devm_kcalloc(smmu->dev, cfg->num_l1_ents, - sizeof(*cfg->l1_desc), GFP_KERNEL); - if (!cfg->l1_desc) - 
return -ENOMEM; - - for (i = 0; i < cfg->num_l1_ents; ++i) { - arm_smmu_write_strtab_l1_desc(strtab, &cfg->l1_desc[i]); - strtab += STRTAB_L1_DESC_DWORDS << 3; - } - - return 0; -} - static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu) { - void *strtab; - u64 reg; - u32 size, l1size; + u32 l1size; struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; + unsigned int last_sid_idx = + arm_smmu_strtab_l1_idx((1ULL << smmu->sid_bits) - 1); /* Calculate the L1 size, capped to the SIDSIZE. */ - size = STRTAB_L1_SZ_SHIFT - (ilog2(STRTAB_L1_DESC_DWORDS) + 3); - size = min(size, smmu->sid_bits - STRTAB_SPLIT); - cfg->num_l1_ents = 1 << size; - - size += STRTAB_SPLIT; - if (size < smmu->sid_bits) + cfg->l2.num_l1_ents = min(last_sid_idx + 1, STRTAB_MAX_L1_ENTRIES); + if (cfg->l2.num_l1_ents <= last_sid_idx) dev_warn(smmu->dev, "2-level strtab only covers %u/%u bits of SID\n", - size, smmu->sid_bits); + ilog2(cfg->l2.num_l1_ents * STRTAB_NUM_L2_STES), + smmu->sid_bits); - l1size = cfg->num_l1_ents * (STRTAB_L1_DESC_DWORDS << 3); - strtab = dmam_alloc_coherent(smmu->dev, l1size, &cfg->strtab_dma, - GFP_KERNEL); - if (!strtab) { + l1size = cfg->l2.num_l1_ents * sizeof(struct arm_smmu_strtab_l1); + cfg->l2.l1tab = dmam_alloc_coherent(smmu->dev, l1size, &cfg->l2.l1_dma, + GFP_KERNEL); + if (!cfg->l2.l1tab) { dev_err(smmu->dev, "failed to allocate l1 stream table (%u bytes)\n", l1size); return -ENOMEM; } - cfg->strtab = strtab; - /* Configure strtab_base_cfg for 2 levels */ - reg = FIELD_PREP(STRTAB_BASE_CFG_FMT, STRTAB_BASE_CFG_FMT_2LVL); - reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, size); - reg |= FIELD_PREP(STRTAB_BASE_CFG_SPLIT, STRTAB_SPLIT); - cfg->strtab_base_cfg = reg; + cfg->l2.l2ptrs = devm_kcalloc(smmu->dev, cfg->l2.num_l1_ents, + sizeof(*cfg->l2.l2ptrs), GFP_KERNEL); + if (!cfg->l2.l2ptrs) + return -ENOMEM; - return arm_smmu_init_l1_strtab(smmu); + return 0; } static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu) { - void *strtab; - u64 reg; u32 size; struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; - size = (1 << smmu->sid_bits) * (STRTAB_STE_DWORDS << 3); - strtab = dmam_alloc_coherent(smmu->dev, size, &cfg->strtab_dma, - GFP_KERNEL); - if (!strtab) { + size = (1 << smmu->sid_bits) * sizeof(struct arm_smmu_ste); + cfg->linear.table = dmam_alloc_coherent(smmu->dev, size, + &cfg->linear.ste_dma, + GFP_KERNEL); + if (!cfg->linear.table) { dev_err(smmu->dev, "failed to allocate linear stream table (%u bytes)\n", size); return -ENOMEM; } - cfg->strtab = strtab; - cfg->num_l1_ents = 1 << smmu->sid_bits; + cfg->linear.num_ents = 1 << smmu->sid_bits; - /* Configure strtab_base_cfg for a linear table covering all SIDs */ - reg = FIELD_PREP(STRTAB_BASE_CFG_FMT, STRTAB_BASE_CFG_FMT_LINEAR); - reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, smmu->sid_bits); - cfg->strtab_base_cfg = reg; - - arm_smmu_init_initial_stes(strtab, cfg->num_l1_ents); + arm_smmu_init_initial_stes(cfg->linear.table, cfg->linear.num_ents); return 0; } static int arm_smmu_init_strtab(struct arm_smmu_device *smmu) { - u64 reg; int ret; if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) ret = arm_smmu_init_strtab_2lvl(smmu); else ret = arm_smmu_init_strtab_linear(smmu); - if (ret) return ret; - /* Set the strtab base address */ - reg = smmu->strtab_cfg.strtab_dma & STRTAB_BASE_ADDR_MASK; - reg |= STRTAB_BASE_RA; - smmu->strtab_cfg.strtab_base = reg; - ida_init(&smmu->vmid_map); return 0; @@ -3340,7 +3889,14 @@ static int arm_smmu_init_structures(struct arm_smmu_device *smmu) if (ret) return ret; - return 
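/*
 * Stand-alone version of the L1 sizing arithmetic in
 * arm_smmu_init_strtab_2lvl() above: one L1 entry covers 256 SIDs and
 * the table is capped at STRTAB_MAX_L1_ENTRIES (1 << 17).
 */
#include <stdio.h>
#include <stdint.h>

#define STRTAB_SPLIT		8
#define STRTAB_NUM_L2_STES	(1u << STRTAB_SPLIT)
#define STRTAB_MAX_L1_ENTRIES	(1u << 17)

int main(void)
{
	unsigned int sid_bits = 16;	/* example SIDSIZE */
	uint32_t last_sid_idx = ((1u << sid_bits) - 1) / STRTAB_NUM_L2_STES;
	uint32_t num_l1_ents = last_sid_idx + 1;

	if (num_l1_ents > STRTAB_MAX_L1_ENTRIES)
		num_l1_ents = STRTAB_MAX_L1_ENTRIES;

	/* 16 SID bits -> 256 L1 entries of 256 STEs each */
	printf("num_l1_ents = %u\n", num_l1_ents);
	return 0;
}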
arm_smmu_init_strtab(smmu); + ret = arm_smmu_init_strtab(smmu); + if (ret) + return ret; + + if (smmu->impl_ops && smmu->impl_ops->init_structures) + return smmu->impl_ops->init_structures(smmu); + + return 0; } static int arm_smmu_write_reg_sync(struct arm_smmu_device *smmu, u32 val, @@ -3532,6 +4088,30 @@ static int arm_smmu_device_disable(struct arm_smmu_device *smmu) return ret; } +static void arm_smmu_write_strtab(struct arm_smmu_device *smmu) +{ + struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; + dma_addr_t dma; + u32 reg; + + if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) { + reg = FIELD_PREP(STRTAB_BASE_CFG_FMT, + STRTAB_BASE_CFG_FMT_2LVL) | + FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, + ilog2(cfg->l2.num_l1_ents) + STRTAB_SPLIT) | + FIELD_PREP(STRTAB_BASE_CFG_SPLIT, STRTAB_SPLIT); + dma = cfg->l2.l1_dma; + } else { + reg = FIELD_PREP(STRTAB_BASE_CFG_FMT, + STRTAB_BASE_CFG_FMT_LINEAR) | + FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, smmu->sid_bits); + dma = cfg->linear.ste_dma; + } + writeq_relaxed((dma & STRTAB_BASE_ADDR_MASK) | STRTAB_BASE_RA, + smmu->base + ARM_SMMU_STRTAB_BASE); + writel_relaxed(reg, smmu->base + ARM_SMMU_STRTAB_BASE_CFG); +} + static int arm_smmu_device_reset(struct arm_smmu_device *smmu) { int ret; @@ -3567,10 +4147,7 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu) writel_relaxed(reg, smmu->base + ARM_SMMU_CR2); /* Stream table */ - writeq_relaxed(smmu->strtab_cfg.strtab_base, - smmu->base + ARM_SMMU_STRTAB_BASE); - writel_relaxed(smmu->strtab_cfg.strtab_base_cfg, - smmu->base + ARM_SMMU_STRTAB_BASE_CFG); + arm_smmu_write_strtab(smmu); /* Command queue */ writeq_relaxed(smmu->cmdq.q.q_base, smmu->base + ARM_SMMU_CMDQ_BASE); @@ -3657,6 +4234,14 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu) return ret; } + if (smmu->impl_ops && smmu->impl_ops->device_reset) { + ret = smmu->impl_ops->device_reset(smmu); + if (ret) { + dev_err(smmu->dev, "failed to reset impl\n"); + return ret; + } + } + return 0; } @@ -3698,6 +4283,28 @@ static void arm_smmu_device_iidr_probe(struct arm_smmu_device *smmu) } } +static void arm_smmu_get_httu(struct arm_smmu_device *smmu, u32 reg) +{ + u32 fw_features = smmu->features & (ARM_SMMU_FEAT_HA | ARM_SMMU_FEAT_HD); + u32 hw_features = 0; + + switch (FIELD_GET(IDR0_HTTU, reg)) { + case IDR0_HTTU_ACCESS_DIRTY: + hw_features |= ARM_SMMU_FEAT_HD; + fallthrough; + case IDR0_HTTU_ACCESS: + hw_features |= ARM_SMMU_FEAT_HA; + } + + if (smmu->dev->of_node) + smmu->features |= hw_features; + else if (hw_features != fw_features) + /* ACPI IORT sets the HTTU bits */ + dev_warn(smmu->dev, + "IDR0.HTTU features(0x%x) overridden by FW configuration (0x%x)\n", + hw_features, fw_features); +} + static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) { u32 reg; @@ -3758,13 +4365,15 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) smmu->features |= ARM_SMMU_FEAT_E2H; } + arm_smmu_get_httu(smmu, reg); + /* * The coherency feature as set by FW is used in preference to the ID * register, but warn on mismatch. */ if (!!(reg & IDR0_COHACC) != coherent) dev_warn(smmu->dev, "IDR0.COHACC overridden by FW configuration (%s)\n", - coherent ? 
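/*
 * Stand-alone sketch of the STRTAB_BASE_CFG packing now centralised in
 * arm_smmu_write_strtab() above. Field positions (FMT [17:16],
 * SPLIT [10:6], LOG2SIZE [5:0]) follow the SMMUv3 spec and are assumed
 * here; for a 2-level table LOG2SIZE covers
 * ilog2(num_l1_ents) + STRTAB_SPLIT bits of SID.
 */
#include <stdio.h>
#include <stdint.h>

#define FMT_2LVL	1u
#define SPLIT		8u

static uint32_t strtab_base_cfg_2lvl(uint32_t num_l1_ents)
{
	uint32_t log2size = (31 - __builtin_clz(num_l1_ents)) + SPLIT;

	return (FMT_2LVL << 16) | (SPLIT << 6) | log2size;
}

int main(void)
{
	/* 256 L1 entries -> LOG2SIZE = 8 + 8 = 16 */
	printf("STRTAB_BASE_CFG = 0x%x\n", strtab_base_cfg_2lvl(256));
	return 0;
}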
"true" : "false"); + str_true_false(coherent)); switch (FIELD_GET(IDR0_STALL_MODEL, reg)) { case IDR0_STALL_MODEL_FORCE: @@ -3847,6 +4456,11 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) reg = readl_relaxed(smmu->base + ARM_SMMU_IDR3); if (FIELD_GET(IDR3_RIL, reg)) smmu->features |= ARM_SMMU_FEAT_RANGE_INV; + if (FIELD_GET(IDR3_FWB, reg)) + smmu->features |= ARM_SMMU_FEAT_S2FWB; + + if (FIELD_GET(IDR3_BBM, reg) == 2) + smmu->features |= ARM_SMMU_FEAT_BBML2; /* IDR5 */ reg = readl_relaxed(smmu->base + ARM_SMMU_IDR5); @@ -3895,11 +4509,6 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) smmu->oas = 48; } - if (arm_smmu_ops.pgsize_bitmap == -1UL) - arm_smmu_ops.pgsize_bitmap = smmu->pgsize_bitmap; - else - arm_smmu_ops.pgsize_bitmap |= smmu->pgsize_bitmap; - /* Set the DMA mask for our table walker */ if (dma_set_mask_and_coherent(smmu->dev, DMA_BIT_MASK(smmu->oas))) dev_warn(smmu->dev, @@ -3922,18 +4531,55 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) } #ifdef CONFIG_ACPI -static void acpi_smmu_get_options(u32 model, struct arm_smmu_device *smmu) +#ifdef CONFIG_TEGRA241_CMDQV +static void acpi_smmu_dsdt_probe_tegra241_cmdqv(struct acpi_iort_node *node, + struct arm_smmu_device *smmu) +{ + const char *uid = kasprintf(GFP_KERNEL, "%u", node->identifier); + struct acpi_device *adev; + + /* Look for an NVDA200C node whose _UID matches the SMMU node ID */ + adev = acpi_dev_get_first_match_dev("NVDA200C", uid, -1); + if (adev) { + /* Tegra241 CMDQV driver is responsible for put_device() */ + smmu->impl_dev = &adev->dev; + smmu->options |= ARM_SMMU_OPT_TEGRA241_CMDQV; + dev_info(smmu->dev, "found companion CMDQV device: %s\n", + dev_name(smmu->impl_dev)); + } + kfree(uid); +} +#else +static void acpi_smmu_dsdt_probe_tegra241_cmdqv(struct acpi_iort_node *node, + struct arm_smmu_device *smmu) { - switch (model) { +} +#endif + +static int acpi_smmu_iort_probe_model(struct acpi_iort_node *node, + struct arm_smmu_device *smmu) +{ + struct acpi_iort_smmu_v3 *iort_smmu = + (struct acpi_iort_smmu_v3 *)node->node_data; + + switch (iort_smmu->model) { case ACPI_IORT_SMMU_V3_CAVIUM_CN99XX: smmu->options |= ARM_SMMU_OPT_PAGE0_REGS_ONLY; break; case ACPI_IORT_SMMU_V3_HISILICON_HI161X: smmu->options |= ARM_SMMU_OPT_SKIP_PREFETCH; break; + case ACPI_IORT_SMMU_V3_GENERIC: + /* + * Tegra241 implementation stores its SMMU options and impl_dev + * in DSDT. Thus, go through the ACPI tables unconditionally. 
+ */ + acpi_smmu_dsdt_probe_tegra241_cmdqv(node, smmu); + break; } dev_notice(smmu->dev, "option mask 0x%x\n", smmu->options); + return 0; } static int arm_smmu_device_acpi_probe(struct platform_device *pdev, @@ -3948,12 +4594,18 @@ static int arm_smmu_device_acpi_probe(struct platform_device *pdev, /* Retrieve SMMUv3 specific data */ iort_smmu = (struct acpi_iort_smmu_v3 *)node->node_data; - acpi_smmu_get_options(iort_smmu->model, smmu); - if (iort_smmu->flags & ACPI_IORT_SMMU_V3_COHACC_OVERRIDE) smmu->features |= ARM_SMMU_FEAT_COHERENCY; - return 0; + switch (FIELD_GET(ACPI_IORT_SMMU_V3_HTTU_OVERRIDE, iort_smmu->flags)) { + case IDR0_HTTU_ACCESS_DIRTY: + smmu->features |= ARM_SMMU_FEAT_HD; + fallthrough; + case IDR0_HTTU_ACCESS: + smmu->features |= ARM_SMMU_FEAT_HA; + } + + return acpi_smmu_iort_probe_model(node, smmu); } #else static inline int arm_smmu_device_acpi_probe(struct platform_device *pdev, @@ -4034,6 +4686,53 @@ static void arm_smmu_rmr_install_bypass_ste(struct arm_smmu_device *smmu) iort_put_rmr_sids(dev_fwnode(smmu->dev), &rmr_list); } +static void arm_smmu_impl_remove(void *data) +{ + struct arm_smmu_device *smmu = data; + + if (smmu->impl_ops && smmu->impl_ops->device_remove) + smmu->impl_ops->device_remove(smmu); +} + +/* + * Probe all the compiled in implementations. Each one checks to see if it + * matches this HW and if so returns a devm_krealloc'd arm_smmu_device which + * replaces the callers. Otherwise the original is returned or ERR_PTR. + */ +static struct arm_smmu_device *arm_smmu_impl_probe(struct arm_smmu_device *smmu) +{ + struct arm_smmu_device *new_smmu = ERR_PTR(-ENODEV); + const struct arm_smmu_impl_ops *ops; + int ret; + + if (smmu->impl_dev && (smmu->options & ARM_SMMU_OPT_TEGRA241_CMDQV)) + new_smmu = tegra241_cmdqv_probe(smmu); + + if (new_smmu == ERR_PTR(-ENODEV)) + return smmu; + if (IS_ERR(new_smmu)) + return new_smmu; + + ops = new_smmu->impl_ops; + if (ops) { + /* get_viommu_size and vsmmu_init ops must be paired */ + if (WARN_ON(!ops->get_viommu_size != !ops->vsmmu_init)) { + ret = -EINVAL; + goto err_remove; + } + } + + ret = devm_add_action_or_reset(new_smmu->dev, arm_smmu_impl_remove, + new_smmu); + if (ret) + return ERR_PTR(ret); + return new_smmu; + +err_remove: + arm_smmu_impl_remove(new_smmu); + return ERR_PTR(ret); +} + static int arm_smmu_device_probe(struct platform_device *pdev) { int irq, ret; @@ -4055,6 +4754,10 @@ static int arm_smmu_device_probe(struct platform_device *pdev) if (ret) return ret; + smmu = arm_smmu_impl_probe(smmu); + if (IS_ERR(smmu)) + return PTR_ERR(smmu); + /* Base address */ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) @@ -4108,7 +4811,7 @@ static int arm_smmu_device_probe(struct platform_device *pdev) /* Initialise in-memory data structures */ ret = arm_smmu_init_structures(smmu); if (ret) - return ret; + goto err_free_iopf; /* Record our private device structure */ platform_set_drvdata(pdev, smmu); @@ -4119,22 +4822,29 @@ static int arm_smmu_device_probe(struct platform_device *pdev) /* Reset the device */ ret = arm_smmu_device_reset(smmu); if (ret) - return ret; + goto err_disable; /* And we're up. Go go go! 
*/ ret = iommu_device_sysfs_add(&smmu->iommu, dev, NULL, "smmu3.%pa", &ioaddr); if (ret) - return ret; + goto err_disable; ret = iommu_device_register(&smmu->iommu, &arm_smmu_ops, dev); if (ret) { dev_err(dev, "Failed to register iommu\n"); - iommu_device_sysfs_remove(&smmu->iommu); - return ret; + goto err_free_sysfs; } return 0; + +err_free_sysfs: + iommu_device_sysfs_remove(&smmu->iommu); +err_disable: + arm_smmu_device_disable(smmu); +err_free_iopf: + iopf_queue_free(smmu->evtq.iopf); + return ret; } static void arm_smmu_device_remove(struct platform_device *pdev) @@ -4174,7 +4884,7 @@ static struct platform_driver arm_smmu_driver = { .suppress_bind_attrs = true, }, .probe = arm_smmu_device_probe, - .remove_new = arm_smmu_device_remove, + .remove = arm_smmu_device_remove, .shutdown = arm_smmu_device_shutdown, }; module_driver(arm_smmu_driver, platform_driver_register, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 1242a086c9f9..ae23aacc3840 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -10,10 +10,14 @@ #include <linux/bitfield.h> #include <linux/iommu.h> +#include <linux/iommufd.h> #include <linux/kernel.h> #include <linux/mmzone.h> #include <linux/sizes.h> +struct arm_smmu_device; +struct arm_vsmmu; + /* MMIO registers */ #define ARM_SMMU_IDR0 0x0 #define IDR0_ST_LVL GENMASK(28, 27) @@ -33,6 +37,9 @@ #define IDR0_ASID16 (1 << 12) #define IDR0_ATS (1 << 10) #define IDR0_HYP (1 << 9) +#define IDR0_HTTU GENMASK(7, 6) +#define IDR0_HTTU_ACCESS 1 +#define IDR0_HTTU_ACCESS_DIRTY 2 #define IDR0_COHACC (1 << 4) #define IDR0_TTF GENMASK(3, 2) #define IDR0_TTF_AARCH64 2 @@ -52,7 +59,9 @@ #define IDR1_SIDSIZE GENMASK(5, 0) #define ARM_SMMU_IDR3 0xc +#define IDR3_FWB (1 << 8) #define IDR3_RIL (1 << 10) +#define IDR3_BBM GENMASK(12, 11) #define ARM_SMMU_IDR5 0x14 #define IDR5_STALL_MAX GENMASK(31, 16) @@ -76,6 +85,8 @@ #define IIDR_REVISION GENMASK(15, 12) #define IIDR_IMPLEMENTER GENMASK(11, 0) +#define ARM_SMMU_AIDR 0x1C + #define ARM_SMMU_CR0 0x20 #define CR0_ATSCHK (1 << 4) #define CR0_CMDQEN (1 << 3) @@ -199,10 +210,8 @@ * 2lvl: 128k L1 entries, * 256 lazy entries per table (each table covers a PCI bus) */ -#define STRTAB_L1_SZ_SHIFT 20 #define STRTAB_SPLIT 8 -#define STRTAB_L1_DESC_DWORDS 1 #define STRTAB_L1_DESC_SPAN GENMASK_ULL(4, 0) #define STRTAB_L1_DESC_L2PTR_MASK GENMASK_ULL(51, 6) @@ -212,12 +221,33 @@ struct arm_smmu_ste { __le64 data[STRTAB_STE_DWORDS]; }; +#define STRTAB_NUM_L2_STES (1 << STRTAB_SPLIT) +struct arm_smmu_strtab_l2 { + struct arm_smmu_ste stes[STRTAB_NUM_L2_STES]; +}; + +struct arm_smmu_strtab_l1 { + __le64 l2ptr; +}; +#define STRTAB_MAX_L1_ENTRIES (1 << 17) + +static inline u32 arm_smmu_strtab_l1_idx(u32 sid) +{ + return sid / STRTAB_NUM_L2_STES; +} + +static inline u32 arm_smmu_strtab_l2_idx(u32 sid) +{ + return sid % STRTAB_NUM_L2_STES; +} + #define STRTAB_STE_0_V (1UL << 0) #define STRTAB_STE_0_CFG GENMASK_ULL(3, 1) #define STRTAB_STE_0_CFG_ABORT 0 #define STRTAB_STE_0_CFG_BYPASS 4 #define STRTAB_STE_0_CFG_S1_TRANS 5 #define STRTAB_STE_0_CFG_S2_TRANS 6 +#define STRTAB_STE_0_CFG_NESTED 7 #define STRTAB_STE_0_S1FMT GENMASK_ULL(5, 4) #define STRTAB_STE_0_S1FMT_LINEAR 0 @@ -238,6 +268,8 @@ struct arm_smmu_ste { #define STRTAB_STE_1_S1COR GENMASK_ULL(5, 4) #define STRTAB_STE_1_S1CSH GENMASK_ULL(7, 6) +#define STRTAB_STE_1_MEV (1UL << 19) +#define STRTAB_STE_1_S2FWB (1UL << 25) #define STRTAB_STE_1_S1STALLD (1UL << 27) #define STRTAB_STE_1_EATS 
GENMASK_ULL(29, 28) @@ -264,10 +296,20 @@ struct arm_smmu_ste { #define STRTAB_STE_2_S2AA64 (1UL << 51) #define STRTAB_STE_2_S2ENDI (1UL << 52) #define STRTAB_STE_2_S2PTW (1UL << 54) +#define STRTAB_STE_2_S2S (1UL << 57) #define STRTAB_STE_2_S2R (1UL << 58) #define STRTAB_STE_3_S2TTB_MASK GENMASK_ULL(51, 4) +/* These bits can be controlled by userspace for STRTAB_STE_0_CFG_NESTED */ +#define STRTAB_STE_0_NESTING_ALLOWED \ + cpu_to_le64(STRTAB_STE_0_V | STRTAB_STE_0_CFG | STRTAB_STE_0_S1FMT | \ + STRTAB_STE_0_S1CTXPTR_MASK | STRTAB_STE_0_S1CDMAX) +#define STRTAB_STE_1_NESTING_ALLOWED \ + cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR | \ + STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH | \ + STRTAB_STE_1_S1STALLD | STRTAB_STE_1_EATS) + /* * Context descriptors. * @@ -277,7 +319,6 @@ struct arm_smmu_ste { */ #define CTXDESC_L2_ENTRIES 1024 -#define CTXDESC_L1_DESC_DWORDS 1 #define CTXDESC_L1_DESC_V (1UL << 0) #define CTXDESC_L1_DESC_L2PTR_MASK GENMASK_ULL(51, 12) @@ -287,6 +328,24 @@ struct arm_smmu_cd { __le64 data[CTXDESC_CD_DWORDS]; }; +struct arm_smmu_cdtab_l2 { + struct arm_smmu_cd cds[CTXDESC_L2_ENTRIES]; +}; + +struct arm_smmu_cdtab_l1 { + __le64 l2ptr; +}; + +static inline unsigned int arm_smmu_cdtab_l1_idx(unsigned int ssid) +{ + return ssid / CTXDESC_L2_ENTRIES; +} + +static inline unsigned int arm_smmu_cdtab_l2_idx(unsigned int ssid) +{ + return ssid % CTXDESC_L2_ENTRIES; +} + #define CTXDESC_CD_0_TCR_T0SZ GENMASK_ULL(5, 0) #define CTXDESC_CD_0_TCR_TG0 GENMASK_ULL(7, 6) #define CTXDESC_CD_0_TCR_IRGN0 GENMASK_ULL(9, 8) @@ -301,6 +360,9 @@ struct arm_smmu_cd { #define CTXDESC_CD_0_TCR_IPS GENMASK_ULL(34, 32) #define CTXDESC_CD_0_TCR_TBI0 (1ULL << 38) +#define CTXDESC_CD_0_TCR_HA (1UL << 43) +#define CTXDESC_CD_0_TCR_HD (1UL << 42) + #define CTXDESC_CD_0_AA64 (1UL << 41) #define CTXDESC_CD_0_S (1UL << 44) #define CTXDESC_CD_0_R (1UL << 45) @@ -314,7 +376,7 @@ struct arm_smmu_cd { * When the SMMU only supports linear context descriptor tables, pick a * reasonable size limit (64kB). 
*/ -#define CTXDESC_LINEAR_CDMAX ilog2(SZ_64K / (CTXDESC_CD_DWORDS << 3)) +#define CTXDESC_LINEAR_CDMAX ilog2(SZ_64K / sizeof(struct arm_smmu_cd)) /* Command queue */ #define CMDQ_ENT_SZ_SHIFT 4 @@ -393,10 +455,18 @@ struct arm_smmu_cd { #define EVTQ_0_ID GENMASK_ULL(7, 0) +#define EVT_ID_BAD_STREAMID_CONFIG 0x02 +#define EVT_ID_STE_FETCH_FAULT 0x03 +#define EVT_ID_BAD_STE_CONFIG 0x04 +#define EVT_ID_STREAM_DISABLED_FAULT 0x06 +#define EVT_ID_BAD_SUBSTREAMID_CONFIG 0x08 +#define EVT_ID_CD_FETCH_FAULT 0x09 +#define EVT_ID_BAD_CD_CONFIG 0x0a #define EVT_ID_TRANSLATION_FAULT 0x10 #define EVT_ID_ADDR_SIZE_FAULT 0x11 #define EVT_ID_ACCESS_FAULT 0x12 #define EVT_ID_PERMISSION_FAULT 0x13 +#define EVT_ID_VMS_FETCH_FAULT 0x25 #define EVTQ_0_SSV (1UL << 11) #define EVTQ_0_SSID GENMASK_ULL(31, 12) @@ -408,9 +478,11 @@ struct arm_smmu_cd { #define EVTQ_1_RnW (1UL << 35) #define EVTQ_1_S2 (1UL << 39) #define EVTQ_1_CLASS GENMASK_ULL(41, 40) +#define EVTQ_1_CLASS_TT 0x01 #define EVTQ_1_TT_READ (1UL << 44) #define EVTQ_2_ADDR GENMASK_ULL(63, 0) #define EVTQ_3_IPA GENMASK_ULL(51, 12) +#define EVTQ_3_FETCH_ADDR GENMASK_ULL(51, 3) /* PRI queue */ #define PRIQ_ENT_SZ_SHIFT 4 @@ -467,8 +539,10 @@ struct arm_smmu_cmdq_ent { }; } cfgi; + #define CMDQ_OP_TLBI_NH_ALL 0x10 #define CMDQ_OP_TLBI_NH_ASID 0x11 #define CMDQ_OP_TLBI_NH_VA 0x12 + #define CMDQ_OP_TLBI_NH_VAA 0x13 #define CMDQ_OP_TLBI_EL2_ALL 0x20 #define CMDQ_OP_TLBI_EL2_ASID 0x21 #define CMDQ_OP_TLBI_EL2_VA 0x22 @@ -560,10 +634,18 @@ struct arm_smmu_cmdq { atomic_long_t *valid_map; atomic_t owner_prod; atomic_t lock; + bool (*supports_cmd)(struct arm_smmu_cmdq_ent *ent); }; +static inline bool arm_smmu_cmdq_supports_cmd(struct arm_smmu_cmdq *cmdq, + struct arm_smmu_cmdq_ent *ent) +{ + return cmdq->supports_cmd ? cmdq->supports_cmd(ent) : true; +} + struct arm_smmu_cmdq_batch { u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS]; + struct arm_smmu_cmdq *cmdq; int num; }; @@ -578,52 +660,86 @@ struct arm_smmu_priq { }; /* High-level stream table and context descriptor structures */ -struct arm_smmu_strtab_l1_desc { - u8 span; - - struct arm_smmu_ste *l2ptr; - dma_addr_t l2ptr_dma; -}; - struct arm_smmu_ctx_desc { u16 asid; - - refcount_t refs; - struct mm_struct *mm; -}; - -struct arm_smmu_l1_ctx_desc { - struct arm_smmu_cd *l2ptr; - dma_addr_t l2ptr_dma; }; struct arm_smmu_ctx_desc_cfg { - __le64 *cdtab; + union { + struct { + struct arm_smmu_cd *table; + unsigned int num_ents; + } linear; + struct { + struct arm_smmu_cdtab_l1 *l1tab; + struct arm_smmu_cdtab_l2 **l2ptrs; + unsigned int num_l1_ents; + } l2; + }; dma_addr_t cdtab_dma; - struct arm_smmu_l1_ctx_desc *l1_desc; - unsigned int num_l1_ents; + unsigned int used_ssids; + u8 in_ste; u8 s1fmt; /* log2 of the maximum number of CDs supported by this table */ u8 s1cdmax; }; +static inline bool +arm_smmu_cdtab_allocated(struct arm_smmu_ctx_desc_cfg *cfg) +{ + return cfg->linear.table || cfg->l2.l1tab; +} + +/* True if the cd table has SSIDS > 0 in use. 
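/*
 * Quick check of the CTXDESC_LINEAR_CDMAX rewrite above: a CD is 64
 * bytes (CTXDESC_CD_DWORDS == 8), so the old "CTXDESC_CD_DWORDS << 3"
 * and the new sizeof(struct arm_smmu_cd) agree:
 * ilog2(64K / 64) == ilog2(1024) == 10, i.e. at most 1024 CDs in a
 * linear table.
 */
#include <stdio.h>

struct cd_sketch { unsigned long long data[8]; };	/* 64 bytes, like arm_smmu_cd */

int main(void)
{
	unsigned int max_cds = (64 * 1024) / sizeof(struct cd_sketch);

	printf("CDMAX = ilog2(%u) = %u\n", max_cds,
	       31 - __builtin_clz(max_cds));	/* 1024 -> 10 */
	return 0;
}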
*/ +static inline bool arm_smmu_ssids_in_use(struct arm_smmu_ctx_desc_cfg *cd_table) +{ + return cd_table->used_ssids; +} + struct arm_smmu_s2_cfg { u16 vmid; }; struct arm_smmu_strtab_cfg { - __le64 *strtab; - dma_addr_t strtab_dma; - struct arm_smmu_strtab_l1_desc *l1_desc; - unsigned int num_l1_ents; + union { + struct { + struct arm_smmu_ste *table; + dma_addr_t ste_dma; + unsigned int num_ents; + } linear; + struct { + struct arm_smmu_strtab_l1 *l1tab; + struct arm_smmu_strtab_l2 **l2ptrs; + dma_addr_t l1_dma; + unsigned int num_l1_ents; + } l2; + }; +}; - u64 strtab_base; - u32 strtab_base_cfg; +struct arm_smmu_impl_ops { + int (*device_reset)(struct arm_smmu_device *smmu); + void (*device_remove)(struct arm_smmu_device *smmu); + int (*init_structures)(struct arm_smmu_device *smmu); + struct arm_smmu_cmdq *(*get_secondary_cmdq)( + struct arm_smmu_device *smmu, struct arm_smmu_cmdq_ent *ent); + /* + * An implementation should define its own type other than the default + * IOMMU_HW_INFO_TYPE_ARM_SMMUV3. And it must validate the input @type + * to return its own structure. + */ + void *(*hw_info)(struct arm_smmu_device *smmu, u32 *length, + enum iommu_hw_info_type *type); + size_t (*get_viommu_size)(enum iommu_viommu_type viommu_type); + int (*vsmmu_init)(struct arm_vsmmu *vsmmu, + const struct iommu_user_data *user_data); }; /* An SMMUv3 instance */ struct arm_smmu_device { struct device *dev; + struct device *impl_dev; + const struct arm_smmu_impl_ops *impl_ops; + void __iomem *base; void __iomem *page1; @@ -648,12 +764,17 @@ struct arm_smmu_device { #define ARM_SMMU_FEAT_E2H (1 << 18) #define ARM_SMMU_FEAT_NESTING (1 << 19) #define ARM_SMMU_FEAT_ATTR_TYPES_OVR (1 << 20) +#define ARM_SMMU_FEAT_HA (1 << 21) +#define ARM_SMMU_FEAT_HD (1 << 22) +#define ARM_SMMU_FEAT_S2FWB (1 << 23) +#define ARM_SMMU_FEAT_BBML2 (1 << 24) u32 features; #define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0) #define ARM_SMMU_OPT_PAGE0_REGS_ONLY (1 << 1) #define ARM_SMMU_OPT_MSIPOLL (1 << 2) #define ARM_SMMU_OPT_CMDQ_FORCE_SYNC (1 << 3) +#define ARM_SMMU_OPT_TEGRA241_CMDQV (1 << 4) u32 options; struct arm_smmu_cmdq cmdq; @@ -692,21 +813,45 @@ struct arm_smmu_stream { struct rb_node node; }; +struct arm_smmu_vmaster { + struct arm_vsmmu *vsmmu; + unsigned long vsid; +}; + +struct arm_smmu_event { + u8 stall : 1, + ssv : 1, + privileged : 1, + instruction : 1, + s2 : 1, + read : 1, + ttrnw : 1, + class_tt : 1; + u8 id; + u8 class; + u16 stag; + u32 sid; + u32 ssid; + u64 iova; + u64 ipa; + u64 fetch_addr; + struct device *dev; +}; + /* SMMU private data for each master */ struct arm_smmu_master { struct arm_smmu_device *smmu; struct device *dev; - struct list_head domain_head; struct arm_smmu_stream *streams; + struct arm_smmu_vmaster *vmaster; /* use smmu->streams_mutex */ /* Locked by the iommu core using the group mutex */ struct arm_smmu_ctx_desc_cfg cd_table; unsigned int num_streams; - bool ats_enabled; + bool ats_enabled : 1; + bool ste_ats_enabled : 1; bool stall_enabled; - bool sva_enabled; - bool iopf_enabled; - struct list_head bonds; unsigned int ssid_bits; + unsigned int iopf_refcount; }; /* SMMU private data for an IOMMU domain */ @@ -717,7 +862,6 @@ enum arm_smmu_domain_stage { struct arm_smmu_domain { struct arm_smmu_device *smmu; - struct mutex init_mutex; /* Protects smmu pointer */ struct io_pgtable_ops *pgtbl_ops; atomic_t nr_ats_masters; @@ -730,10 +874,21 @@ struct arm_smmu_domain { struct iommu_domain domain; + /* List of struct arm_smmu_master_domain */ struct list_head devices; spinlock_t 
spinlock_t devices_lock;
+ bool enforce_cache_coherency : 1;
+ bool nest_parent : 1;
+
+ struct mmu_notifier mmu_notifier;
+};
+
+struct arm_smmu_nested_domain {
+ struct iommu_domain domain;
+ struct arm_vsmmu *vsmmu;
+ bool enable_ats : 1;
- struct list_head mmu_notifiers;
+ __le64 ste[2];
};
/* The following are exposed for testing purposes. */
@@ -748,37 +903,59 @@ struct arm_smmu_entry_writer_ops {
void (*sync)(struct arm_smmu_entry_writer *writer);
};
+void arm_smmu_make_abort_ste(struct arm_smmu_ste *target);
+void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
+ struct arm_smmu_master *master,
+ struct arm_smmu_domain *smmu_domain,
+ bool ats_enabled);
+
#if IS_ENABLED(CONFIG_KUNIT)
void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits);
void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *cur,
const __le64 *target);
void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits);
-void arm_smmu_make_abort_ste(struct arm_smmu_ste *target);
void arm_smmu_make_bypass_ste(struct arm_smmu_device *smmu,
struct arm_smmu_ste *target);
void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target,
- struct arm_smmu_master *master);
-void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
- struct arm_smmu_master *master,
- struct arm_smmu_domain *smmu_domain);
+ struct arm_smmu_master *master, bool ats_enabled,
+ unsigned int s1dss);
void arm_smmu_make_sva_cd(struct arm_smmu_cd *target,
struct arm_smmu_master *master, struct mm_struct *mm,
u16 asid);
#endif
+struct arm_smmu_master_domain {
+ struct list_head devices_elm;
+ struct arm_smmu_master *master;
+ /*
+ * For nested domains the master_domain is threaded onto the S2 parent;
+ * this points to the IOMMU_DOMAIN_NESTED to disambiguate the masters.
+ */
+ struct iommu_domain *domain;
+ ioasid_t ssid;
+ bool nested_ats_flush : 1;
+ bool using_iopf : 1;
+};
+
static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
{
return container_of(dom, struct arm_smmu_domain, domain);
}
+static inline struct arm_smmu_nested_domain *
+to_smmu_nested_domain(struct iommu_domain *dom)
+{
+ return container_of(dom, struct arm_smmu_nested_domain, domain);
+}
+
extern struct xarray arm_smmu_asid_xa;
extern struct mutex arm_smmu_asid_lock;
+struct arm_smmu_domain *arm_smmu_domain_alloc(void);
+
void arm_smmu_clear_cd(struct arm_smmu_master *master, ioasid_t ssid);
struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master,
u32 ssid);
-struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master,
- u32 ssid);
void arm_smmu_make_s1_cd(struct arm_smmu_cd *target,
struct arm_smmu_master *master,
struct arm_smmu_domain *smmu_domain);
@@ -786,67 +963,135 @@ void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid,
struct arm_smmu_cd *cdptr,
const struct arm_smmu_cd *target);
+int arm_smmu_set_pasid(struct arm_smmu_master *master,
+ struct arm_smmu_domain *smmu_domain, ioasid_t pasid,
+ struct arm_smmu_cd *cd, struct iommu_domain *old);
+
void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid);
void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid,
size_t granule, bool leaf,
struct arm_smmu_domain *smmu_domain);
-bool arm_smmu_free_asid(struct arm_smmu_ctx_desc *cd);
-int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid,
+int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
unsigned long iova, size_t size);
+void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq *cmdq);
+int
arm_smmu_init_one_queue(struct arm_smmu_device *smmu, + struct arm_smmu_queue *q, void __iomem *page, + unsigned long prod_off, unsigned long cons_off, + size_t dwords, const char *name); +int arm_smmu_cmdq_init(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq); + +static inline bool arm_smmu_master_canwbs(struct arm_smmu_master *master) +{ + return dev_iommu_fwspec_get(master->dev)->flags & + IOMMU_FWSPEC_PCI_RC_CANWBS; +} + +struct arm_smmu_attach_state { + /* Inputs */ + struct iommu_domain *old_domain; + struct arm_smmu_master *master; + bool cd_needs_ats; + bool disable_ats; + ioasid_t ssid; + /* Resulting state */ + struct arm_smmu_vmaster *vmaster; + bool ats_enabled; +}; + +int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, + struct iommu_domain *new_domain); +void arm_smmu_attach_commit(struct arm_smmu_attach_state *state); +void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master, + const struct arm_smmu_ste *target); + +int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq, u64 *cmds, int n, + bool sync); + #ifdef CONFIG_ARM_SMMU_V3_SVA bool arm_smmu_sva_supported(struct arm_smmu_device *smmu); -bool arm_smmu_master_sva_supported(struct arm_smmu_master *master); -bool arm_smmu_master_sva_enabled(struct arm_smmu_master *master); -int arm_smmu_master_enable_sva(struct arm_smmu_master *master); -int arm_smmu_master_disable_sva(struct arm_smmu_master *master); -bool arm_smmu_master_iopf_supported(struct arm_smmu_master *master); void arm_smmu_sva_notifier_synchronize(void); -struct iommu_domain *arm_smmu_sva_domain_alloc(void); -void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t id); +struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm); #else /* CONFIG_ARM_SMMU_V3_SVA */ static inline bool arm_smmu_sva_supported(struct arm_smmu_device *smmu) { return false; } -static inline bool arm_smmu_master_sva_supported(struct arm_smmu_master *master) -{ - return false; -} +static inline void arm_smmu_sva_notifier_synchronize(void) {} -static inline bool arm_smmu_master_sva_enabled(struct arm_smmu_master *master) -{ - return false; -} +#define arm_smmu_sva_domain_alloc NULL + +#endif /* CONFIG_ARM_SMMU_V3_SVA */ -static inline int arm_smmu_master_enable_sva(struct arm_smmu_master *master) +#ifdef CONFIG_TEGRA241_CMDQV +struct arm_smmu_device *tegra241_cmdqv_probe(struct arm_smmu_device *smmu); +#else /* CONFIG_TEGRA241_CMDQV */ +static inline struct arm_smmu_device * +tegra241_cmdqv_probe(struct arm_smmu_device *smmu) { - return -ENODEV; + return ERR_PTR(-ENODEV); } +#endif /* CONFIG_TEGRA241_CMDQV */ + +struct arm_vsmmu { + struct iommufd_viommu core; + struct arm_smmu_device *smmu; + struct arm_smmu_domain *s2_parent; + u16 vmid; +}; -static inline int arm_smmu_master_disable_sva(struct arm_smmu_master *master) +#if IS_ENABLED(CONFIG_ARM_SMMU_V3_IOMMUFD) +void *arm_smmu_hw_info(struct device *dev, u32 *length, + enum iommu_hw_info_type *type); +size_t arm_smmu_get_viommu_size(struct device *dev, + enum iommu_viommu_type viommu_type); +int arm_vsmmu_init(struct iommufd_viommu *viommu, + struct iommu_domain *parent_domain, + const struct iommu_user_data *user_data); +int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, + struct arm_smmu_nested_domain *nested_domain); +void arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state); +void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master); +int 
arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, u64 *evt); +struct iommu_domain * +arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, + const struct iommu_user_data *user_data); +int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu, + struct iommu_user_data_array *array); +#else +#define arm_smmu_get_viommu_size NULL +#define arm_smmu_hw_info NULL +#define arm_vsmmu_init NULL +#define arm_vsmmu_alloc_domain_nested NULL +#define arm_vsmmu_cache_invalidate NULL + +static inline int +arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, + struct arm_smmu_nested_domain *nested_domain) { - return -ENODEV; + return 0; } -static inline bool arm_smmu_master_iopf_supported(struct arm_smmu_master *master) +static inline void +arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state) { - return false; } -static inline void arm_smmu_sva_notifier_synchronize(void) {} - -static inline struct iommu_domain *arm_smmu_sva_domain_alloc(void) +static inline void +arm_smmu_master_clear_vmaster(struct arm_smmu_master *master) { - return NULL; } -static inline void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, - struct device *dev, - ioasid_t id) +static inline int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, + u64 *evt) { + return -EOPNOTSUPP; } -#endif /* CONFIG_ARM_SMMU_V3_SVA */ +#endif /* CONFIG_ARM_SMMU_V3_IOMMUFD */ + #endif /* _ARM_SMMU_V3_H */ diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c new file mode 100644 index 000000000000..378104cd395e --- /dev/null +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -0,0 +1,1349 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2021-2024 NVIDIA CORPORATION & AFFILIATES. 
*/
+
+#define dev_fmt(fmt) "tegra241_cmdqv: " fmt
+
+#include <linux/acpi.h>
+#include <linux/debugfs.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/iommu.h>
+#include <linux/iommufd.h>
+#include <linux/iopoll.h>
+#include <uapi/linux/iommufd.h>
+
+#include <acpi/acpixf.h>
+
+#include "arm-smmu-v3.h"
+
+/* CMDQV register page base and size defines */
+#define TEGRA241_CMDQV_CONFIG_BASE (0)
+#define TEGRA241_CMDQV_CONFIG_SIZE (SZ_64K)
+#define TEGRA241_VCMDQ_PAGE0_BASE (TEGRA241_CMDQV_CONFIG_BASE + SZ_64K)
+#define TEGRA241_VCMDQ_PAGE1_BASE (TEGRA241_VCMDQ_PAGE0_BASE + SZ_64K)
+#define TEGRA241_VINTF_PAGE_BASE (TEGRA241_VCMDQ_PAGE1_BASE + SZ_64K)
+
+/* CMDQV global base regs */
+#define TEGRA241_CMDQV_CONFIG 0x0000
+#define CMDQV_EN BIT(0)
+
+#define TEGRA241_CMDQV_PARAM 0x0004
+#define CMDQV_NUM_SID_PER_VM_LOG2 GENMASK(15, 12)
+#define CMDQV_NUM_VINTF_LOG2 GENMASK(11, 8)
+#define CMDQV_NUM_VCMDQ_LOG2 GENMASK(7, 4)
+#define CMDQV_VER GENMASK(3, 0)
+
+#define TEGRA241_CMDQV_STATUS 0x0008
+#define CMDQV_ENABLED BIT(0)
+
+#define TEGRA241_CMDQV_VINTF_ERR_MAP 0x0014
+#define TEGRA241_CMDQV_VINTF_INT_MASK 0x001C
+#define TEGRA241_CMDQV_CMDQ_ERR_MAP(m) (0x0024 + 0x4*(m))
+
+#define TEGRA241_CMDQV_CMDQ_ALLOC(q) (0x0200 + 0x4*(q))
+#define CMDQV_CMDQ_ALLOC_VINTF GENMASK(20, 15)
+#define CMDQV_CMDQ_ALLOC_LVCMDQ GENMASK(7, 1)
+#define CMDQV_CMDQ_ALLOCATED BIT(0)
+
+/* VINTF base regs */
+#define TEGRA241_VINTF(v) (0x1000 + 0x100*(v))
+
+#define TEGRA241_VINTF_CONFIG 0x0000
+#define VINTF_HYP_OWN BIT(17)
+#define VINTF_VMID GENMASK(16, 1)
+#define VINTF_EN BIT(0)
+
+#define TEGRA241_VINTF_STATUS 0x0004
+#define VINTF_STATUS GENMASK(3, 1)
+#define VINTF_ENABLED BIT(0)
+
+#define TEGRA241_VINTF_SID_MATCH(s) (0x0040 + 0x4*(s))
+#define TEGRA241_VINTF_SID_REPLACE(s) (0x0080 + 0x4*(s))
+
+#define TEGRA241_VINTF_LVCMDQ_ERR_MAP_64(m) \
+ (0x00C0 + 0x8*(m))
+#define LVCMDQ_ERR_MAP_NUM_64 2
+
+/* VCMDQ base regs */
+/* -- PAGE0 -- */
+#define TEGRA241_VCMDQ_PAGE0(q) (TEGRA241_VCMDQ_PAGE0_BASE + 0x80*(q))
+
+#define TEGRA241_VCMDQ_CONS 0x00000
+#define VCMDQ_CONS_ERR GENMASK(30, 24)
+
+#define TEGRA241_VCMDQ_PROD 0x00004
+
+#define TEGRA241_VCMDQ_CONFIG 0x00008
+#define VCMDQ_EN BIT(0)
+
+#define TEGRA241_VCMDQ_STATUS 0x0000C
+#define VCMDQ_ENABLED BIT(0)
+
+#define TEGRA241_VCMDQ_GERROR 0x00010
+#define TEGRA241_VCMDQ_GERRORN 0x00014
+
+/* -- PAGE1 -- */
+#define TEGRA241_VCMDQ_PAGE1(q) (TEGRA241_VCMDQ_PAGE1_BASE + 0x80*(q))
+#define VCMDQ_ADDR GENMASK(47, 5)
+#define VCMDQ_LOG2SIZE GENMASK(4, 0)
+
+#define TEGRA241_VCMDQ_BASE 0x00000
+#define TEGRA241_VCMDQ_CONS_INDX_BASE 0x00008
+
+/* VINTF logical-VCMDQ pages */
+#define TEGRA241_VINTFi_PAGE0(i) (TEGRA241_VINTF_PAGE_BASE + SZ_128K*(i))
+#define TEGRA241_VINTFi_PAGE1(i) (TEGRA241_VINTFi_PAGE0(i) + SZ_64K)
+#define TEGRA241_VINTFi_LVCMDQ_PAGE0(i, q) \
+ (TEGRA241_VINTFi_PAGE0(i) + 0x80*(q))
+#define TEGRA241_VINTFi_LVCMDQ_PAGE1(i, q) \
+ (TEGRA241_VINTFi_PAGE1(i) + 0x80*(q))
+
+/* MMIO helpers */
+#define REG_CMDQV(_cmdqv, _regname) \
+ ((_cmdqv)->base + TEGRA241_CMDQV_##_regname)
+#define REG_VINTF(_vintf, _regname) \
+ ((_vintf)->base + TEGRA241_VINTF_##_regname)
+#define REG_VCMDQ_PAGE0(_vcmdq, _regname) \
+ ((_vcmdq)->page0 + TEGRA241_VCMDQ_##_regname)
+#define REG_VCMDQ_PAGE1(_vcmdq, _regname) \
+ ((_vcmdq)->page1 + TEGRA241_VCMDQ_##_regname)
+
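[Aside] The page-layout macros above make every register window pure arithmetic on the VINTF and LVCMDQ indices: three global 64K pages first, then one 128K logical-VCMDQ window per VINTF. A minimal illustration (not part of the patch) that pins the math down:

	/* VINTF1/LVCMDQ2: 3 * SZ_64K of global pages, plus 1 * SZ_128K, plus 2 * 0x80 */
	static_assert(TEGRA241_VINTFi_LVCMDQ_PAGE0(1, 2) ==
		      TEGRA241_VINTF_PAGE_BASE + SZ_128K + 2 * 0x80);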
+
+static bool disable_cmdqv;
+module_param(disable_cmdqv, bool, 0444);
+MODULE_PARM_DESC(disable_cmdqv,
+ "Disable the CMDQV HW and use the default SMMU internal CMDQ.");
+
+static bool bypass_vcmdq;
+module_param(bypass_vcmdq, bool, 0444);
+MODULE_PARM_DESC(bypass_vcmdq,
+ "Bypass the VCMDQ for debugging or performance comparison.");
+
+/**
+ * struct tegra241_vcmdq - Virtual Command Queue
+ * @core: Embedded iommufd_hw_queue structure
+ * @idx: Global index in the CMDQV
+ * @lidx: Local index in the VINTF
+ * @enabled: Enable status
+ * @cmdqv: Parent CMDQV pointer
+ * @vintf: Parent VINTF pointer
+ * @prev: Previous LVCMDQ to depend on
+ * @cmdq: Command Queue struct
+ * @page0: MMIO Page0 base address
+ * @page1: MMIO Page1 base address
+ */
+struct tegra241_vcmdq {
+ struct iommufd_hw_queue core;
+
+ u16 idx;
+ u16 lidx;
+
+ bool enabled;
+
+ struct tegra241_cmdqv *cmdqv;
+ struct tegra241_vintf *vintf;
+ struct tegra241_vcmdq *prev;
+ struct arm_smmu_cmdq cmdq;
+
+ void __iomem *page0;
+ void __iomem *page1;
+};
+#define hw_queue_to_vcmdq(v) container_of(v, struct tegra241_vcmdq, core)
+
+/**
+ * struct tegra241_vintf - Virtual Interface
+ * @vsmmu: Embedded arm_vsmmu structure
+ * @idx: Global index in the CMDQV
+ * @enabled: Enable status
+ * @hyp_own: Owned by hypervisor (in-kernel)
+ * @cmdqv: Parent CMDQV pointer
+ * @lvcmdqs: List of logical VCMDQ pointers
+ * @lvcmdq_mutex: Lock to serialize user-allocated lvcmdqs
+ * @base: MMIO base address
+ * @mmap_offset: Offset argument for mmap() syscall
+ * @sids: Stream ID mapping resources
+ */
+struct tegra241_vintf {
+ struct arm_vsmmu vsmmu;
+
+ u16 idx;
+
+ bool enabled;
+ bool hyp_own;
+
+ struct tegra241_cmdqv *cmdqv;
+ struct tegra241_vcmdq **lvcmdqs;
+ struct mutex lvcmdq_mutex; /* user space race */
+
+ void __iomem *base;
+ unsigned long mmap_offset;
+
+ struct ida sids;
+};
+#define viommu_to_vintf(v) container_of(v, struct tegra241_vintf, vsmmu.core)
+
+/**
+ * struct tegra241_vintf_sid - Virtual Interface Stream ID Mapping
+ * @core: Embedded iommufd_vdevice structure, holding virtual Stream ID
+ * @vintf: Parent VINTF pointer
+ * @sid: Physical Stream ID
+ * @idx: Mapping index in the VINTF
+ */
+struct tegra241_vintf_sid {
+ struct iommufd_vdevice core;
+ struct tegra241_vintf *vintf;
+ u32 sid;
+ u8 idx;
+};
+#define vdev_to_vsid(v) container_of(v, struct tegra241_vintf_sid, core)
+
+/**
+ * struct tegra241_cmdqv - CMDQ-V for SMMUv3
+ * @smmu: SMMUv3 device
+ * @dev: CMDQV device
+ * @base: MMIO base address
+ * @base_phys: MMIO physical base address, for mmap
+ * @irq: IRQ number
+ * @num_vintfs: Total number of VINTFs
+ * @num_vcmdqs: Total number of VCMDQs
+ * @num_lvcmdqs_per_vintf: Number of logical VCMDQs per VINTF
+ * @num_sids_per_vintf: Total number of SID mappings per VINTF
+ * @vintf_ids: VINTF id allocator
+ * @vintfs: List of VINTFs
+ */
+struct tegra241_cmdqv {
+ struct arm_smmu_device smmu;
+ struct device *dev;
+
+ void __iomem *base;
+ phys_addr_t base_phys;
+ int irq;
+
+ /* CMDQV Hardware Params */
+ u16 num_vintfs;
+ u16 num_vcmdqs;
+ u16 num_lvcmdqs_per_vintf;
+ u16 num_sids_per_vintf;
+
+ struct ida vintf_ids;
+
+ struct tegra241_vintf **vintfs;
+};
+
+/* Config and Polling Helpers */
+
+static inline int tegra241_cmdqv_write_config(struct tegra241_cmdqv *cmdqv,
+ void __iomem *addr_config,
+ void __iomem *addr_status,
+ u32 regval, const char *header,
+ bool *out_enabled)
+{
+ bool en = regval & BIT(0);
+ int ret;
+
+ writel(regval, addr_config);
+ ret = readl_poll_timeout(addr_status, regval,
+ en ? regval & BIT(0) : !(regval & BIT(0)),
+ 1, ARM_SMMU_POLL_TIMEOUT_US);
+ if (ret)
+ dev_err(cmdqv->dev, "%sfailed to %sable, STATUS=0x%08X\n",
+ header, en ?
"en" : "dis", regval); + if (out_enabled) + WRITE_ONCE(*out_enabled, regval & BIT(0)); + return ret; +} + +static inline int cmdqv_write_config(struct tegra241_cmdqv *cmdqv, u32 regval) +{ + return tegra241_cmdqv_write_config(cmdqv, + REG_CMDQV(cmdqv, CONFIG), + REG_CMDQV(cmdqv, STATUS), + regval, "CMDQV: ", NULL); +} + +static inline int vintf_write_config(struct tegra241_vintf *vintf, u32 regval) +{ + char header[16]; + + snprintf(header, 16, "VINTF%u: ", vintf->idx); + return tegra241_cmdqv_write_config(vintf->cmdqv, + REG_VINTF(vintf, CONFIG), + REG_VINTF(vintf, STATUS), + regval, header, &vintf->enabled); +} + +static inline char *lvcmdq_error_header(struct tegra241_vcmdq *vcmdq, + char *header, int hlen) +{ + WARN_ON(hlen < 64); + if (WARN_ON(!vcmdq->vintf)) + return ""; + snprintf(header, hlen, "VINTF%u: VCMDQ%u/LVCMDQ%u: ", + vcmdq->vintf->idx, vcmdq->idx, vcmdq->lidx); + return header; +} + +static inline int vcmdq_write_config(struct tegra241_vcmdq *vcmdq, u32 regval) +{ + char header[64], *h = lvcmdq_error_header(vcmdq, header, 64); + + return tegra241_cmdqv_write_config(vcmdq->cmdqv, + REG_VCMDQ_PAGE0(vcmdq, CONFIG), + REG_VCMDQ_PAGE0(vcmdq, STATUS), + regval, h, &vcmdq->enabled); +} + +/* ISR Functions */ + +static void tegra241_vintf_user_handle_error(struct tegra241_vintf *vintf) +{ + struct iommufd_viommu *viommu = &vintf->vsmmu.core; + struct iommu_vevent_tegra241_cmdqv vevent_data; + int i; + + for (i = 0; i < LVCMDQ_ERR_MAP_NUM_64; i++) { + u64 err = readq_relaxed(REG_VINTF(vintf, LVCMDQ_ERR_MAP_64(i))); + + vevent_data.lvcmdq_err_map[i] = cpu_to_le64(err); + } + + iommufd_viommu_report_event(viommu, IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV, + &vevent_data, sizeof(vevent_data)); +} + +static void tegra241_vintf0_handle_error(struct tegra241_vintf *vintf) +{ + int i; + + for (i = 0; i < LVCMDQ_ERR_MAP_NUM_64; i++) { + u64 map = readq_relaxed(REG_VINTF(vintf, LVCMDQ_ERR_MAP_64(i))); + + while (map) { + unsigned long lidx = __ffs64(map); + struct tegra241_vcmdq *vcmdq = vintf->lvcmdqs[lidx]; + u32 gerror = readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERROR)); + + __arm_smmu_cmdq_skip_err(&vintf->cmdqv->smmu, &vcmdq->cmdq); + writel(gerror, REG_VCMDQ_PAGE0(vcmdq, GERRORN)); + map &= ~BIT_ULL(lidx); + } + } +} + +static irqreturn_t tegra241_cmdqv_isr(int irq, void *devid) +{ + struct tegra241_cmdqv *cmdqv = (struct tegra241_cmdqv *)devid; + void __iomem *reg_vintf_map = REG_CMDQV(cmdqv, VINTF_ERR_MAP); + char err_str[256]; + u64 vintf_map; + + /* Use readl_relaxed() as register addresses are not 64-bit aligned */ + vintf_map = (u64)readl_relaxed(reg_vintf_map + 0x4) << 32 | + (u64)readl_relaxed(reg_vintf_map); + + snprintf(err_str, sizeof(err_str), + "vintf_map: %016llx, vcmdq_map %08x:%08x:%08x:%08x", vintf_map, + readl_relaxed(REG_CMDQV(cmdqv, CMDQ_ERR_MAP(3))), + readl_relaxed(REG_CMDQV(cmdqv, CMDQ_ERR_MAP(2))), + readl_relaxed(REG_CMDQV(cmdqv, CMDQ_ERR_MAP(1))), + readl_relaxed(REG_CMDQV(cmdqv, CMDQ_ERR_MAP(0)))); + + dev_warn(cmdqv->dev, "unexpected error reported. 
%s\n", err_str); + + /* Handle VINTF0 and its LVCMDQs */ + if (vintf_map & BIT_ULL(0)) { + tegra241_vintf0_handle_error(cmdqv->vintfs[0]); + vintf_map &= ~BIT_ULL(0); + } + + /* Handle other user VINTFs and their LVCMDQs */ + while (vintf_map) { + unsigned long idx = __ffs64(vintf_map); + + tegra241_vintf_user_handle_error(cmdqv->vintfs[idx]); + vintf_map &= ~BIT_ULL(idx); + } + + return IRQ_HANDLED; +} + +/* Command Queue Function */ + +static bool tegra241_guest_vcmdq_supports_cmd(struct arm_smmu_cmdq_ent *ent) +{ + switch (ent->opcode) { + case CMDQ_OP_TLBI_NH_ASID: + case CMDQ_OP_TLBI_NH_VA: + case CMDQ_OP_ATC_INV: + return true; + default: + return false; + } +} + +static struct arm_smmu_cmdq * +tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq_ent *ent) +{ + struct tegra241_cmdqv *cmdqv = + container_of(smmu, struct tegra241_cmdqv, smmu); + struct tegra241_vintf *vintf = cmdqv->vintfs[0]; + struct tegra241_vcmdq *vcmdq; + u16 lidx; + + if (READ_ONCE(bypass_vcmdq)) + return NULL; + + /* Use SMMU CMDQ if VINTF0 is uninitialized */ + if (!READ_ONCE(vintf->enabled)) + return NULL; + + /* + * Select a LVCMDQ to use. Here we use a temporal solution to + * balance out traffic on cmdq issuing: each cmdq has its own + * lock, if all cpus issue cmdlist using the same cmdq, only + * one CPU at a time can enter the process, while the others + * will be spinning at the same lock. + */ + lidx = raw_smp_processor_id() % cmdqv->num_lvcmdqs_per_vintf; + vcmdq = vintf->lvcmdqs[lidx]; + if (!vcmdq || !READ_ONCE(vcmdq->enabled)) + return NULL; + + /* Unsupported CMD goes for smmu->cmdq pathway */ + if (!arm_smmu_cmdq_supports_cmd(&vcmdq->cmdq, ent)) + return NULL; + return &vcmdq->cmdq; +} + +/* HW Reset Functions */ + +/* + * When a guest-owned VCMDQ is disabled, if the guest did not enqueue a CMD_SYNC + * following an ATC_INV command at the end of the guest queue while this ATC_INV + * is timed out, the TIMEOUT will not be reported until this VCMDQ gets assigned + * to the next VM, which will be a false alarm potentially causing some unwanted + * behavior in the new VM. Thus, a guest-owned VCMDQ must flush the TIMEOUT when + * it gets disabled. This can be done by just issuing a CMD_SYNC to SMMU CMDQ. + */ +static void tegra241_vcmdq_hw_flush_timeout(struct tegra241_vcmdq *vcmdq) +{ + struct arm_smmu_device *smmu = &vcmdq->cmdqv->smmu; + u64 cmd_sync[CMDQ_ENT_DWORDS] = {}; + + cmd_sync[0] = FIELD_PREP(CMDQ_0_OP, CMDQ_OP_CMD_SYNC) | + FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_NONE); + + /* + * It does not hurt to insert another CMD_SYNC, taking advantage of the + * arm_smmu_cmdq_issue_cmdlist() that waits for the CMD_SYNC completion. 
+
+/* HW Reset Functions */
+
+/*
+ * When a guest-owned VCMDQ is disabled, and the guest did not enqueue a
+ * CMD_SYNC following a timed-out ATC_INV command at the end of its queue,
+ * the TIMEOUT will not be reported until this VCMDQ gets assigned to the
+ * next VM, where it would be a false alarm potentially causing unwanted
+ * behavior in the new VM. Thus, a guest-owned VCMDQ must flush the TIMEOUT
+ * when it gets disabled, which can be done by simply issuing a CMD_SYNC to
+ * the SMMU CMDQ.
+ */
+static void tegra241_vcmdq_hw_flush_timeout(struct tegra241_vcmdq *vcmdq)
+{
+ struct arm_smmu_device *smmu = &vcmdq->cmdqv->smmu;
+ u64 cmd_sync[CMDQ_ENT_DWORDS] = {};
+
+ cmd_sync[0] = FIELD_PREP(CMDQ_0_OP, CMDQ_OP_CMD_SYNC) |
+ FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_NONE);
+
+ /*
+ * An extra CMD_SYNC does not hurt, and arm_smmu_cmdq_issue_cmdlist()
+ * conveniently waits for the CMD_SYNC completion.
+ */
+ arm_smmu_cmdq_issue_cmdlist(smmu, &smmu->cmdq, cmd_sync, 1, true);
+}
+
+/* This function is for LVCMDQ, so @vcmdq must not be unmapped yet */
+static void tegra241_vcmdq_hw_deinit(struct tegra241_vcmdq *vcmdq)
+{
+ char header[64], *h = lvcmdq_error_header(vcmdq, header, 64);
+ u32 gerrorn, gerror;
+
+ if (vcmdq_write_config(vcmdq, 0)) {
+ dev_err(vcmdq->cmdqv->dev,
+ "%sGERRORN=0x%X, GERROR=0x%X, CONS=0x%X\n", h,
+ readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERRORN)),
+ readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERROR)),
+ readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, CONS)));
+ }
+ tegra241_vcmdq_hw_flush_timeout(vcmdq);
+
+ writel_relaxed(0, REG_VCMDQ_PAGE0(vcmdq, PROD));
+ writel_relaxed(0, REG_VCMDQ_PAGE0(vcmdq, CONS));
+ writeq_relaxed(0, REG_VCMDQ_PAGE1(vcmdq, BASE));
+ writeq_relaxed(0, REG_VCMDQ_PAGE1(vcmdq, CONS_INDX_BASE));
+
+ gerrorn = readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERRORN));
+ gerror = readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERROR));
+ if (gerror != gerrorn) {
+ dev_warn(vcmdq->cmdqv->dev,
+ "%suncleared error detected, resetting\n", h);
+ writel(gerror, REG_VCMDQ_PAGE0(vcmdq, GERRORN));
+ }
+
+ dev_dbg(vcmdq->cmdqv->dev, "%sdeinited\n", h);
+}
+
+/* This function is for LVCMDQ, so @vcmdq must already be mapped */
+static int tegra241_vcmdq_hw_init(struct tegra241_vcmdq *vcmdq)
+{
+ char header[64], *h = lvcmdq_error_header(vcmdq, header, 64);
+ int ret;
+
+ /* Reset VCMDQ */
+ tegra241_vcmdq_hw_deinit(vcmdq);
+
+ /* Configure and enable VCMDQ */
+ writeq_relaxed(vcmdq->cmdq.q.q_base, REG_VCMDQ_PAGE1(vcmdq, BASE));
+
+ ret = vcmdq_write_config(vcmdq, VCMDQ_EN);
+ if (ret) {
+ dev_err(vcmdq->cmdqv->dev,
+ "%sGERRORN=0x%X, GERROR=0x%X, CONS=0x%X\n", h,
+ readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERRORN)),
+ readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERROR)),
+ readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, CONS)));
+ return ret;
+ }
+
+ dev_dbg(vcmdq->cmdqv->dev, "%sinited\n", h);
+ return 0;
+}
+
+/* Unmap a global VCMDQ from the pre-assigned LVCMDQ */
+static void tegra241_vcmdq_unmap_lvcmdq(struct tegra241_vcmdq *vcmdq)
+{
+ u32 regval = readl(REG_CMDQV(vcmdq->cmdqv, CMDQ_ALLOC(vcmdq->idx)));
+ char header[64], *h = lvcmdq_error_header(vcmdq, header, 64);
+
+ writel(regval & ~CMDQV_CMDQ_ALLOCATED,
+ REG_CMDQV(vcmdq->cmdqv, CMDQ_ALLOC(vcmdq->idx)));
+ dev_dbg(vcmdq->cmdqv->dev, "%sunmapped\n", h);
+}
+
+static void tegra241_vintf_hw_deinit(struct tegra241_vintf *vintf)
+{
+ u16 lidx = vintf->cmdqv->num_lvcmdqs_per_vintf;
+ int sidx;
+
+ /* HW requires LVCMDQs to be unmapped in descending order */
+ while (lidx--) {
+ if (vintf->lvcmdqs && vintf->lvcmdqs[lidx]) {
+ tegra241_vcmdq_hw_deinit(vintf->lvcmdqs[lidx]);
+ tegra241_vcmdq_unmap_lvcmdq(vintf->lvcmdqs[lidx]);
+ }
+ }
+ vintf_write_config(vintf, 0);
+ for (sidx = 0; sidx < vintf->cmdqv->num_sids_per_vintf; sidx++) {
+ writel(0, REG_VINTF(vintf, SID_MATCH(sidx)));
+ writel(0, REG_VINTF(vintf, SID_REPLACE(sidx)));
+ }
+}
+
+/* Map a global VCMDQ to the pre-assigned LVCMDQ */
+static void tegra241_vcmdq_map_lvcmdq(struct tegra241_vcmdq *vcmdq)
+{
+ u32 regval = readl(REG_CMDQV(vcmdq->cmdqv, CMDQ_ALLOC(vcmdq->idx)));
+ char header[64], *h = lvcmdq_error_header(vcmdq, header, 64);
+
+ writel(regval | CMDQV_CMDQ_ALLOCATED,
+ REG_CMDQV(vcmdq->cmdqv, CMDQ_ALLOC(vcmdq->idx)));
+ dev_dbg(vcmdq->cmdqv->dev, "%smapped\n", h);
+}
+
+static int tegra241_vintf_hw_init(struct tegra241_vintf *vintf, bool hyp_own)
+{
+ u32 regval;
+ u16 lidx;
+ int ret;
+
+ /* Reset VINTF */
+ tegra241_vintf_hw_deinit(vintf);
+
+ /* Configure and enable VINTF */
+ /*
+ * Note that the HYP_OWN
bit is wired to zero when running in a guest kernel,
+ * whether it is enabled here or not, as a !HYP_OWN cmdq HW only supports
+ * a restricted set of commands.
+ */
+ regval = FIELD_PREP(VINTF_HYP_OWN, hyp_own) |
+ FIELD_PREP(VINTF_VMID, vintf->vsmmu.vmid);
+ writel(regval, REG_VINTF(vintf, CONFIG));
+
+ ret = vintf_write_config(vintf, regval | VINTF_EN);
+ if (ret)
+ return ret;
+ /*
+ * As mentioned above, the HYP_OWN bit is wired to zero for a guest
+ * kernel, so read it back from the HW so that hyp_own reflects the
+ * real value.
+ */
+ vintf->hyp_own = !!(VINTF_HYP_OWN & readl(REG_VINTF(vintf, CONFIG)));
+
+ /* HW requires LVCMDQs to be mapped in ascending order */
+ for (lidx = 0; lidx < vintf->cmdqv->num_lvcmdqs_per_vintf; lidx++) {
+ if (vintf->lvcmdqs && vintf->lvcmdqs[lidx]) {
+ tegra241_vcmdq_map_lvcmdq(vintf->lvcmdqs[lidx]);
+ ret = tegra241_vcmdq_hw_init(vintf->lvcmdqs[lidx]);
+ if (ret) {
+ tegra241_vintf_hw_deinit(vintf);
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int tegra241_cmdqv_hw_reset(struct arm_smmu_device *smmu)
+{
+ struct tegra241_cmdqv *cmdqv =
+ container_of(smmu, struct tegra241_cmdqv, smmu);
+ u16 qidx, lidx, idx;
+ u32 regval;
+ int ret;
+
+ /* Reset CMDQV */
+ regval = readl_relaxed(REG_CMDQV(cmdqv, CONFIG));
+ ret = cmdqv_write_config(cmdqv, regval & ~CMDQV_EN);
+ if (ret)
+ return ret;
+ ret = cmdqv_write_config(cmdqv, regval | CMDQV_EN);
+ if (ret)
+ return ret;
+
+ /* Assign preallocated global VCMDQs to each VINTF as LVCMDQs */
+ for (idx = 0, qidx = 0; idx < cmdqv->num_vintfs; idx++) {
+ for (lidx = 0; lidx < cmdqv->num_lvcmdqs_per_vintf; lidx++) {
+ regval = FIELD_PREP(CMDQV_CMDQ_ALLOC_VINTF, idx);
+ regval |= FIELD_PREP(CMDQV_CMDQ_ALLOC_LVCMDQ, lidx);
+ writel_relaxed(regval,
+ REG_CMDQV(cmdqv, CMDQ_ALLOC(qidx++)));
+ }
+ }
+
+ return tegra241_vintf_hw_init(cmdqv->vintfs[0], true);
+}
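[Aside] A worked example (indices invented) of the CMDQ_ALLOC programming performed by the reset loop above, assuming two LVCMDQs per VINTF:

	/*
	 * With num_lvcmdqs_per_vintf == 2, global VCMDQ5 belongs to
	 * VINTF2/LVCMDQ1, so the loop writes:
	 *
	 *   regval = FIELD_PREP(CMDQV_CMDQ_ALLOC_VINTF, 2) |
	 *            FIELD_PREP(CMDQV_CMDQ_ALLOC_LVCMDQ, 1);
	 *   writel_relaxed(regval, REG_CMDQV(cmdqv, CMDQ_ALLOC(5)));
	 *
	 * tegra241_vcmdq_map_lvcmdq() later sets CMDQV_CMDQ_ALLOCATED (bit 0)
	 * in the same register to activate the mapping.
	 */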
+
+/* VCMDQ Resource Helpers */
+
+static int tegra241_vcmdq_alloc_smmu_cmdq(struct tegra241_vcmdq *vcmdq)
+{
+ struct arm_smmu_device *smmu = &vcmdq->cmdqv->smmu;
+ struct arm_smmu_cmdq *cmdq = &vcmdq->cmdq;
+ struct arm_smmu_queue *q = &cmdq->q;
+ char name[16];
+ u32 regval;
+ int ret;
+
+ snprintf(name, 16, "vcmdq%u", vcmdq->idx);
+
+ /* Cap queue size to SMMU's IDR1.CMDQS and ensure natural alignment */
+ regval = readl_relaxed(smmu->base + ARM_SMMU_IDR1);
+ q->llq.max_n_shift =
+ min_t(u32, CMDQ_MAX_SZ_SHIFT, FIELD_GET(IDR1_CMDQS, regval));
+
+ /* Use the common helper to init the VCMDQ, and then... */
+ ret = arm_smmu_init_one_queue(smmu, q, vcmdq->page0,
+ TEGRA241_VCMDQ_PROD, TEGRA241_VCMDQ_CONS,
+ CMDQ_ENT_DWORDS, name);
+ if (ret)
+ return ret;
+
+ /* ...override q_base to write VCMDQ_BASE registers */
+ q->q_base = q->base_dma & VCMDQ_ADDR;
+ q->q_base |= FIELD_PREP(VCMDQ_LOG2SIZE, q->llq.max_n_shift);
+
+ if (!vcmdq->vintf->hyp_own)
+ cmdq->supports_cmd = tegra241_guest_vcmdq_supports_cmd;
+
+ return arm_smmu_cmdq_init(smmu, cmdq);
+}
+
+/* VINTF Logical VCMDQ Resource Helpers */
+
+static void tegra241_vintf_deinit_lvcmdq(struct tegra241_vintf *vintf, u16 lidx)
+{
+ vintf->lvcmdqs[lidx] = NULL;
+}
+
+static int tegra241_vintf_init_lvcmdq(struct tegra241_vintf *vintf, u16 lidx,
+ struct tegra241_vcmdq *vcmdq)
+{
+ struct tegra241_cmdqv *cmdqv = vintf->cmdqv;
+ u16 idx = vintf->idx;
+
+ vcmdq->idx = idx * cmdqv->num_lvcmdqs_per_vintf + lidx;
+ vcmdq->lidx = lidx;
+ vcmdq->cmdqv = cmdqv;
+ vcmdq->vintf = vintf;
+ vcmdq->page0 = cmdqv->base + TEGRA241_VINTFi_LVCMDQ_PAGE0(idx, lidx);
+ vcmdq->page1 = cmdqv->base + TEGRA241_VINTFi_LVCMDQ_PAGE1(idx, lidx);
+
+ vintf->lvcmdqs[lidx] = vcmdq;
+ return 0;
+}
+
+static void tegra241_vintf_free_lvcmdq(struct tegra241_vintf *vintf, u16 lidx)
+{
+ struct tegra241_vcmdq *vcmdq = vintf->lvcmdqs[lidx];
+ char header[64];
+
+ /* Note that the lvcmdq queue memory space is managed by devres */
+
+ tegra241_vintf_deinit_lvcmdq(vintf, lidx);
+
+ dev_dbg(vintf->cmdqv->dev,
+ "%sdeallocated\n", lvcmdq_error_header(vcmdq, header, 64));
+ /* A guest-owned VCMDQ is freed with its hw_queue by the iommufd core */
+ if (vcmdq->vintf->hyp_own)
+ kfree(vcmdq);
+}
+
+static struct tegra241_vcmdq *
+tegra241_vintf_alloc_lvcmdq(struct tegra241_vintf *vintf, u16 lidx)
+{
+ struct tegra241_cmdqv *cmdqv = vintf->cmdqv;
+ struct tegra241_vcmdq *vcmdq;
+ char header[64];
+ int ret;
+
+ vcmdq = kzalloc(sizeof(*vcmdq), GFP_KERNEL);
+ if (!vcmdq)
+ return ERR_PTR(-ENOMEM);
+
+ ret = tegra241_vintf_init_lvcmdq(vintf, lidx, vcmdq);
+ if (ret)
+ goto free_vcmdq;
+
+ /* Build an arm_smmu_cmdq for each LVCMDQ */
+ ret = tegra241_vcmdq_alloc_smmu_cmdq(vcmdq);
+ if (ret)
+ goto deinit_lvcmdq;
+
+ dev_dbg(cmdqv->dev,
+ "%sallocated\n", lvcmdq_error_header(vcmdq, header, 64));
+ return vcmdq;
+
+deinit_lvcmdq:
+ tegra241_vintf_deinit_lvcmdq(vintf, lidx);
+free_vcmdq:
+ kfree(vcmdq);
+ return ERR_PTR(ret);
+}
+
+/* VINTF Resource Helpers */
+
+static void tegra241_cmdqv_deinit_vintf(struct tegra241_cmdqv *cmdqv, u16 idx)
+{
+ kfree(cmdqv->vintfs[idx]->lvcmdqs);
+ ida_free(&cmdqv->vintf_ids, idx);
+ cmdqv->vintfs[idx] = NULL;
+}
+
+static int tegra241_cmdqv_init_vintf(struct tegra241_cmdqv *cmdqv, u16 max_idx,
+ struct tegra241_vintf *vintf)
+{
+ u16 idx;
+ int ret;
+
+ ret = ida_alloc_max(&cmdqv->vintf_ids, max_idx, GFP_KERNEL);
+ if (ret < 0)
+ return ret;
+ idx = ret;
+
+ vintf->idx = idx;
+ vintf->cmdqv = cmdqv;
+ vintf->base = cmdqv->base + TEGRA241_VINTF(idx);
+
+ vintf->lvcmdqs = kcalloc(cmdqv->num_lvcmdqs_per_vintf,
+ sizeof(*vintf->lvcmdqs), GFP_KERNEL);
+ if (!vintf->lvcmdqs) {
+ ida_free(&cmdqv->vintf_ids, idx);
+ return -ENOMEM;
+ }
+
+ cmdqv->vintfs[idx] = vintf;
+ return ret;
+}
+
+/* Remove Helpers */
+
+static void tegra241_cmdqv_remove_vintf(struct tegra241_cmdqv *cmdqv, u16 idx)
+{
+ struct tegra241_vintf *vintf = cmdqv->vintfs[idx];
+ u16 lidx;
+
+ tegra241_vintf_hw_deinit(vintf);
+
+ /* Remove LVCMDQ resources */
+ for (lidx = 0; lidx < vintf->cmdqv->num_lvcmdqs_per_vintf; lidx++)
+ if (vintf->lvcmdqs[lidx])
+ tegra241_vintf_free_lvcmdq(vintf, lidx);
+
dev_dbg(cmdqv->dev, "VINTF%u: deallocated\n", vintf->idx); + tegra241_cmdqv_deinit_vintf(cmdqv, idx); + if (!vintf->hyp_own) { + mutex_destroy(&vintf->lvcmdq_mutex); + ida_destroy(&vintf->sids); + /* Guest-owned VINTF is free-ed with viommu by iommufd core */ + } else { + kfree(vintf); + } +} + +static void tegra241_cmdqv_remove(struct arm_smmu_device *smmu) +{ + struct tegra241_cmdqv *cmdqv = + container_of(smmu, struct tegra241_cmdqv, smmu); + u16 idx; + + /* Remove VINTF resources */ + for (idx = 0; idx < cmdqv->num_vintfs; idx++) { + if (cmdqv->vintfs[idx]) { + /* Only vintf0 should remain at this stage */ + WARN_ON(idx > 0); + tegra241_cmdqv_remove_vintf(cmdqv, idx); + } + } + + /* Remove cmdqv resources */ + ida_destroy(&cmdqv->vintf_ids); + + if (cmdqv->irq > 0) + free_irq(cmdqv->irq, cmdqv); + iounmap(cmdqv->base); + kfree(cmdqv->vintfs); + put_device(cmdqv->dev); /* smmu->impl_dev */ +} + +static int +tegra241_cmdqv_init_vintf_user(struct arm_vsmmu *vsmmu, + const struct iommu_user_data *user_data); + +static void *tegra241_cmdqv_hw_info(struct arm_smmu_device *smmu, u32 *length, + enum iommu_hw_info_type *type) +{ + struct tegra241_cmdqv *cmdqv = + container_of(smmu, struct tegra241_cmdqv, smmu); + struct iommu_hw_info_tegra241_cmdqv *info; + u32 regval; + + if (*type != IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV) + return ERR_PTR(-EOPNOTSUPP); + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + regval = readl_relaxed(REG_CMDQV(cmdqv, PARAM)); + info->log2vcmdqs = ilog2(cmdqv->num_lvcmdqs_per_vintf); + info->log2vsids = ilog2(cmdqv->num_sids_per_vintf); + info->version = FIELD_GET(CMDQV_VER, regval); + + *length = sizeof(*info); + *type = IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV; + return info; +} + +static size_t tegra241_cmdqv_get_vintf_size(enum iommu_viommu_type viommu_type) +{ + if (viommu_type != IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV) + return 0; + return VIOMMU_STRUCT_SIZE(struct tegra241_vintf, vsmmu.core); +} + +static struct arm_smmu_impl_ops tegra241_cmdqv_impl_ops = { + /* For in-kernel use */ + .get_secondary_cmdq = tegra241_cmdqv_get_cmdq, + .device_reset = tegra241_cmdqv_hw_reset, + .device_remove = tegra241_cmdqv_remove, + /* For user-space use */ + .hw_info = tegra241_cmdqv_hw_info, + .get_viommu_size = tegra241_cmdqv_get_vintf_size, + .vsmmu_init = tegra241_cmdqv_init_vintf_user, +}; + +/* Probe Functions */ + +static int tegra241_cmdqv_acpi_is_memory(struct acpi_resource *res, void *data) +{ + struct resource_win win; + + return !acpi_dev_resource_address_space(res, &win); +} + +static int tegra241_cmdqv_acpi_get_irqs(struct acpi_resource *ares, void *data) +{ + struct resource r; + int *irq = data; + + if (*irq <= 0 && acpi_dev_resource_interrupt(ares, 0, &r)) + *irq = r.start; + return 1; /* No need to add resource to the list */ +} + +static struct resource * +tegra241_cmdqv_find_acpi_resource(struct device *dev, int *irq) +{ + struct acpi_device *adev = to_acpi_device(dev); + struct list_head resource_list; + struct resource_entry *rentry; + struct resource *res = NULL; + int ret; + + INIT_LIST_HEAD(&resource_list); + ret = acpi_dev_get_resources(adev, &resource_list, + tegra241_cmdqv_acpi_is_memory, NULL); + if (ret < 0) { + dev_err(dev, "failed to get memory resource: %d\n", ret); + return NULL; + } + + rentry = list_first_entry_or_null(&resource_list, + struct resource_entry, node); + if (!rentry) { + dev_err(dev, "failed to get memory resource entry\n"); + goto free_list; + } + + /* Caller must free the res */ + res = 
+ res = kzalloc(sizeof(*res), GFP_KERNEL);
+ if (!res)
+ goto free_list;
+
+ *res = *rentry->res;
+
+ acpi_dev_free_resource_list(&resource_list);
+
+ INIT_LIST_HEAD(&resource_list);
+
+ if (irq)
+ ret = acpi_dev_get_resources(adev, &resource_list,
+ tegra241_cmdqv_acpi_get_irqs, irq);
+ if (ret < 0 || !irq || *irq <= 0)
+ dev_warn(dev, "no interrupt; errors will not be reported\n");
+
+free_list:
+ acpi_dev_free_resource_list(&resource_list);
+ return res;
+}
+
+static int tegra241_cmdqv_init_structures(struct arm_smmu_device *smmu)
+{
+ struct tegra241_cmdqv *cmdqv =
+ container_of(smmu, struct tegra241_cmdqv, smmu);
+ struct tegra241_vintf *vintf;
+ int lidx;
+ int ret;
+
+ vintf = kzalloc(sizeof(*vintf), GFP_KERNEL);
+ if (!vintf)
+ return -ENOMEM;
+
+ /* Init VINTF0 for in-kernel use */
+ ret = tegra241_cmdqv_init_vintf(cmdqv, 0, vintf);
+ if (ret) {
+ dev_err(cmdqv->dev, "failed to init vintf0: %d\n", ret);
+ return ret;
+ }
+
+ /* Preallocate logical VCMDQs to VINTF0 */
+ for (lidx = 0; lidx < cmdqv->num_lvcmdqs_per_vintf; lidx++) {
+ struct tegra241_vcmdq *vcmdq;
+
+ vcmdq = tegra241_vintf_alloc_lvcmdq(vintf, lidx);
+ if (IS_ERR(vcmdq))
+ return PTR_ERR(vcmdq);
+ }
+
+ /* Now, we are ready to run all the impl ops */
+ smmu->impl_ops = &tegra241_cmdqv_impl_ops;
+ return 0;
+}
+
+#ifdef CONFIG_IOMMU_DEBUGFS
+static struct dentry *cmdqv_debugfs_dir;
+#endif
+
+static struct arm_smmu_device *
+__tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res,
+ int irq)
+{
+ static const struct arm_smmu_impl_ops init_ops = {
+ .init_structures = tegra241_cmdqv_init_structures,
+ .device_remove = tegra241_cmdqv_remove,
+ };
+ struct tegra241_cmdqv *cmdqv = NULL;
+ struct arm_smmu_device *new_smmu;
+ void __iomem *base;
+ u32 regval;
+ int ret;
+
+ static_assert(offsetof(struct tegra241_cmdqv, smmu) == 0);
+
+ base = ioremap(res->start, resource_size(res));
+ if (!base) {
+ dev_err(smmu->dev, "failed to ioremap\n");
+ return NULL;
+ }
+
+ regval = readl(base + TEGRA241_CMDQV_CONFIG);
+ if (disable_cmdqv) {
+ dev_info(smmu->dev, "Detected disable_cmdqv=true\n");
+ writel(regval & ~CMDQV_EN, base + TEGRA241_CMDQV_CONFIG);
+ goto iounmap;
+ }
+
+ cmdqv = devm_krealloc(smmu->dev, smmu, sizeof(*cmdqv), GFP_KERNEL);
+ if (!cmdqv)
+ goto iounmap;
+ new_smmu = &cmdqv->smmu;
+
+ cmdqv->irq = irq;
+ cmdqv->base = base;
+ cmdqv->dev = smmu->impl_dev;
+ cmdqv->base_phys = res->start;
+
+ if (cmdqv->irq > 0) {
+ ret = request_threaded_irq(irq, NULL, tegra241_cmdqv_isr,
+ IRQF_ONESHOT, "tegra241-cmdqv",
+ cmdqv);
+ if (ret) {
+ dev_err(cmdqv->dev, "failed to request irq (%d): %d\n",
+ cmdqv->irq, ret);
+ goto iounmap;
+ }
+ }
+
+ regval = readl_relaxed(REG_CMDQV(cmdqv, PARAM));
+ cmdqv->num_vintfs = 1 << FIELD_GET(CMDQV_NUM_VINTF_LOG2, regval);
+ cmdqv->num_vcmdqs = 1 << FIELD_GET(CMDQV_NUM_VCMDQ_LOG2, regval);
+ cmdqv->num_lvcmdqs_per_vintf = cmdqv->num_vcmdqs / cmdqv->num_vintfs;
+ cmdqv->num_sids_per_vintf =
+ 1 << FIELD_GET(CMDQV_NUM_SID_PER_VM_LOG2, regval);
+
+ cmdqv->vintfs =
+ kcalloc(cmdqv->num_vintfs, sizeof(*cmdqv->vintfs), GFP_KERNEL);
+ if (!cmdqv->vintfs)
+ goto free_irq;
+
+ ida_init(&cmdqv->vintf_ids);
+
+#ifdef CONFIG_IOMMU_DEBUGFS
+ if (!cmdqv_debugfs_dir) {
+ cmdqv_debugfs_dir =
+ debugfs_create_dir("tegra241_cmdqv", iommu_debugfs_dir);
+ debugfs_create_bool("bypass_vcmdq", 0644, cmdqv_debugfs_dir,
+ &bypass_vcmdq);
+ }
+#endif
+
+ /* Provide init-level ops only, until tegra241_cmdqv_init_structures */
+ new_smmu->impl_ops = &init_ops;
+
+ return new_smmu;
+
+free_irq:
+ if (cmdqv->irq > 0)
+ free_irq(cmdqv->irq, cmdqv);
+iounmap:
+ iounmap(base);
+ return NULL;
+}
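[Aside] The devm_krealloc() above grows the existing devres allocation of the SMMU in place into the larger CMDQV container, which is only safe because the embedded struct sits at offset 0 (hence the static_assert). A stripped-down sketch of the idiom with invented types:

	struct base { long x; };
	struct container { struct base base; long extra; };	/* base must stay first */

	static struct base *grow(struct device *dev, struct base *b)
	{
		struct container *c;

		static_assert(offsetof(struct container, base) == 0);
		c = devm_krealloc(dev, b, sizeof(*c), GFP_KERNEL);
		/* The allocation may move, so callers must adopt the result */
		return c ? &c->base : NULL;
	}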
+
+struct arm_smmu_device *tegra241_cmdqv_probe(struct arm_smmu_device *smmu)
+{
+ struct arm_smmu_device *new_smmu;
+ struct resource *res = NULL;
+ int irq;
+
+ if (!smmu->dev->of_node)
+ res = tegra241_cmdqv_find_acpi_resource(smmu->impl_dev, &irq);
+ if (!res)
+ goto out_fallback;
+
+ new_smmu = __tegra241_cmdqv_probe(smmu, res, irq);
+ kfree(res);
+
+ if (new_smmu)
+ return new_smmu;
+
+out_fallback:
+ dev_info(smmu->impl_dev, "Falling back to standard SMMU CMDQ\n");
+ smmu->options &= ~ARM_SMMU_OPT_TEGRA241_CMDQV;
+ put_device(smmu->impl_dev);
+ return ERR_PTR(-ENODEV);
+}
+
+/* User space VINTF and VCMDQ Functions */
+
+static size_t tegra241_vintf_get_vcmdq_size(struct iommufd_viommu *viommu,
+ enum iommu_hw_queue_type queue_type)
+{
+ if (queue_type != IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV)
+ return 0;
+ return HW_QUEUE_STRUCT_SIZE(struct tegra241_vcmdq, core);
+}
+
+static int tegra241_vcmdq_hw_init_user(struct tegra241_vcmdq *vcmdq)
+{
+ char header[64];
+
+ /* Configure the vcmdq only; user space does the enabling */
+ writeq_relaxed(vcmdq->cmdq.q.q_base, REG_VCMDQ_PAGE1(vcmdq, BASE));
+
+ dev_dbg(vcmdq->cmdqv->dev, "%sinited at host PA 0x%llx size 0x%lx\n",
+ lvcmdq_error_header(vcmdq, header, 64),
+ vcmdq->cmdq.q.q_base & VCMDQ_ADDR,
+ 1UL << (vcmdq->cmdq.q.q_base & VCMDQ_LOG2SIZE));
+ return 0;
+}
+
+static void
+tegra241_vintf_destroy_lvcmdq_user(struct iommufd_hw_queue *hw_queue)
+{
+ struct tegra241_vcmdq *vcmdq = hw_queue_to_vcmdq(hw_queue);
+
+ mutex_lock(&vcmdq->vintf->lvcmdq_mutex);
+ tegra241_vcmdq_hw_deinit(vcmdq);
+ tegra241_vcmdq_unmap_lvcmdq(vcmdq);
+ tegra241_vintf_free_lvcmdq(vcmdq->vintf, vcmdq->lidx);
+ if (vcmdq->prev)
+ iommufd_hw_queue_undepend(vcmdq, vcmdq->prev, core);
+ mutex_unlock(&vcmdq->vintf->lvcmdq_mutex);
+}
+
+static int tegra241_vintf_alloc_lvcmdq_user(struct iommufd_hw_queue *hw_queue,
+ u32 lidx, phys_addr_t base_addr_pa)
+{
+ struct tegra241_vintf *vintf = viommu_to_vintf(hw_queue->viommu);
+ struct tegra241_vcmdq *vcmdq = hw_queue_to_vcmdq(hw_queue);
+ struct tegra241_cmdqv *cmdqv = vintf->cmdqv;
+ struct arm_smmu_device *smmu = &cmdqv->smmu;
+ struct tegra241_vcmdq *prev = NULL;
+ u32 log2size, max_n_shift;
+ char header[64];
+ int ret;
+
+ if (hw_queue->type != IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV)
+ return -EOPNOTSUPP;
+ if (lidx >= cmdqv->num_lvcmdqs_per_vintf)
+ return -EINVAL;
+
+ mutex_lock(&vintf->lvcmdq_mutex);
+
+ if (vintf->lvcmdqs[lidx]) {
+ ret = -EEXIST;
+ goto unlock;
+ }
+
+ /*
+ * HW requires LVCMDQs to be mapped in ascending order, so reject if
+ * the previous lvcmdq is not allocated yet.
+ */
+ if (lidx) {
+ prev = vintf->lvcmdqs[lidx - 1];
+ if (!prev) {
+ ret = -EIO;
+ goto unlock;
+ }
+ }
+
+ /*
+ * hw_queue->length must be a power of 2, in range of
+ * [ 32, 2 ^ (idr[1].CMDQS + CMDQ_ENT_SZ_SHIFT) ]
+ */
+ max_n_shift = FIELD_GET(IDR1_CMDQS,
+ readl_relaxed(smmu->base + ARM_SMMU_IDR1));
+ if (!is_power_of_2(hw_queue->length) || hw_queue->length < 32 ||
+ hw_queue->length > (1 << (max_n_shift + CMDQ_ENT_SZ_SHIFT))) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+ log2size = ilog2(hw_queue->length) - CMDQ_ENT_SZ_SHIFT;
+
+ /* base_addr_pa must be aligned to hw_queue->length */
+ if (base_addr_pa & ~VCMDQ_ADDR ||
+ base_addr_pa & (hw_queue->length - 1)) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ /*
+ * HW requires LVCMDQs to be unmapped in descending order, so destroy()
+ * must follow this rule.
Set a dependency on the previous LVCMDQ so that
+ * the iommufd core will help enforce it.
+ */
+ if (prev) {
+ ret = iommufd_hw_queue_depend(vcmdq, prev, core);
+ if (ret)
+ goto unlock;
+ }
+ vcmdq->prev = prev;
+
+ ret = tegra241_vintf_init_lvcmdq(vintf, lidx, vcmdq);
+ if (ret)
+ goto undepend_vcmdq;
+
+ dev_dbg(cmdqv->dev, "%sallocated\n",
+ lvcmdq_error_header(vcmdq, header, 64));
+
+ tegra241_vcmdq_map_lvcmdq(vcmdq);
+
+ vcmdq->cmdq.q.q_base = base_addr_pa & VCMDQ_ADDR;
+ vcmdq->cmdq.q.q_base |= log2size;
+
+ ret = tegra241_vcmdq_hw_init_user(vcmdq);
+ if (ret)
+ goto unmap_lvcmdq;
+
+ hw_queue->destroy = &tegra241_vintf_destroy_lvcmdq_user;
+ mutex_unlock(&vintf->lvcmdq_mutex);
+ return 0;
+
+unmap_lvcmdq:
+ tegra241_vcmdq_unmap_lvcmdq(vcmdq);
+ tegra241_vintf_deinit_lvcmdq(vintf, lidx);
+undepend_vcmdq:
+ if (vcmdq->prev)
+ iommufd_hw_queue_undepend(vcmdq, vcmdq->prev, core);
+unlock:
+ mutex_unlock(&vintf->lvcmdq_mutex);
+ return ret;
+}
+
+static void tegra241_cmdqv_destroy_vintf_user(struct iommufd_viommu *viommu)
+{
+ struct tegra241_vintf *vintf = viommu_to_vintf(viommu);
+
+ if (vintf->mmap_offset)
+ iommufd_viommu_destroy_mmap(&vintf->vsmmu.core,
+ vintf->mmap_offset);
+ tegra241_cmdqv_remove_vintf(vintf->cmdqv, vintf->idx);
+}
+
+static void tegra241_vintf_destroy_vsid(struct iommufd_vdevice *vdev)
+{
+ struct tegra241_vintf_sid *vsid = vdev_to_vsid(vdev);
+ struct tegra241_vintf *vintf = vsid->vintf;
+
+ writel(0, REG_VINTF(vintf, SID_MATCH(vsid->idx)));
+ writel(0, REG_VINTF(vintf, SID_REPLACE(vsid->idx)));
+ ida_free(&vintf->sids, vsid->idx);
+ dev_dbg(vintf->cmdqv->dev,
+ "VINTF%u: deallocated SID_REPLACE%d for pSID=%x\n", vintf->idx,
+ vsid->idx, vsid->sid);
+}
+
+static int tegra241_vintf_init_vsid(struct iommufd_vdevice *vdev)
+{
+ struct device *dev = iommufd_vdevice_to_device(vdev);
+ struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+ struct tegra241_vintf *vintf = viommu_to_vintf(vdev->viommu);
+ struct tegra241_vintf_sid *vsid = vdev_to_vsid(vdev);
+ struct arm_smmu_stream *stream = &master->streams[0];
+ u64 virt_sid = vdev->virt_id;
+ int sidx;
+
+ if (virt_sid > UINT_MAX)
+ return -EINVAL;
+
+ WARN_ON_ONCE(master->num_streams != 1);
+
+ /* Find an empty pair of SID_REPLACE and SID_MATCH */
+ sidx = ida_alloc_max(&vintf->sids, vintf->cmdqv->num_sids_per_vintf - 1,
+ GFP_KERNEL);
+ if (sidx < 0)
+ return sidx;
+
+ writel(stream->id, REG_VINTF(vintf, SID_REPLACE(sidx)));
+ writel(virt_sid << 1 | 0x1, REG_VINTF(vintf, SID_MATCH(sidx)));
+ dev_dbg(vintf->cmdqv->dev,
+ "VINTF%u: allocated SID_REPLACE%d for pSID=%x, vSID=%x\n",
+ vintf->idx, sidx, stream->id, (u32)virt_sid);
+
+ vsid->idx = sidx;
+ vsid->vintf = vintf;
+ vsid->sid = stream->id;
+
+ vdev->destroy = &tegra241_vintf_destroy_vsid;
+ return 0;
+}
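[Aside] A worked example (values invented) of the match/replace pair programmed by tegra241_vintf_init_vsid() above:

	/*
	 * Guest vSID 0x10 backed by physical StreamID 0x123, using slot 0:
	 *
	 *   SID_MATCH(0)   = 0x10 << 1 | 0x1 = 0x21  (bit 0 arms the slot)
	 *   SID_REPLACE(0) = 0x123
	 *
	 * A command issued on this VINTF's LVCMDQs with vSID 0x10 is then
	 * executed by the SMMU against physical SID 0x123.
	 */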
+
+static struct iommufd_viommu_ops tegra241_cmdqv_viommu_ops = {
+ .destroy = tegra241_cmdqv_destroy_vintf_user,
+ .alloc_domain_nested = arm_vsmmu_alloc_domain_nested,
+ /* Non-accelerated commands will still be handled by the kernel */
+ .cache_invalidate = arm_vsmmu_cache_invalidate,
+ .vdevice_size = VDEVICE_STRUCT_SIZE(struct tegra241_vintf_sid, core),
+ .vdevice_init = tegra241_vintf_init_vsid,
+ .get_hw_queue_size = tegra241_vintf_get_vcmdq_size,
+ .hw_queue_init_phys = tegra241_vintf_alloc_lvcmdq_user,
+};
+
+static int
+tegra241_cmdqv_init_vintf_user(struct arm_vsmmu *vsmmu,
+ const struct iommu_user_data *user_data)
+{
+ struct tegra241_cmdqv *cmdqv =
+ container_of(vsmmu->smmu, struct tegra241_cmdqv, smmu);
+ struct tegra241_vintf *vintf = viommu_to_vintf(&vsmmu->core);
+ struct iommu_viommu_tegra241_cmdqv data;
+ phys_addr_t page0_base;
+ int ret;
+
+ /*
+ * An unsupported type should have been rejected by
+ * tegra241_cmdqv_get_vintf_size; seeing one here indicates a kernel
+ * bug or some data corruption.
+ */
+ if (WARN_ON(vsmmu->core.type != IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV))
+ return -EOPNOTSUPP;
+
+ if (!user_data)
+ return -EINVAL;
+
+ ret = iommu_copy_struct_from_user(&data, user_data,
+ IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV,
+ out_vintf_mmap_length);
+ if (ret)
+ return ret;
+
+ ret = tegra241_cmdqv_init_vintf(cmdqv, cmdqv->num_vintfs - 1, vintf);
+ if (ret < 0) {
+ dev_err(cmdqv->dev, "no more available vintf\n");
+ return ret;
+ }
+
+ /*
+ * Initialize the user-owned VINTF without an LVCMDQ, as for security
+ * reasons it cannot pre-allocate an LVCMDQ until user space asks for
+ * one. This differs from the kernel-owned VINTF0, which has
+ * pre-assigned and pre-allocated global VCMDQs that are mapped to its
+ * LVCMDQs by the tegra241_vintf_hw_init() call.
+ */
+ ret = tegra241_vintf_hw_init(vintf, false);
+ if (ret)
+ goto deinit_vintf;
+
+ page0_base = cmdqv->base_phys + TEGRA241_VINTFi_PAGE0(vintf->idx);
+ ret = iommufd_viommu_alloc_mmap(&vintf->vsmmu.core, page0_base, SZ_64K,
+ &vintf->mmap_offset);
+ if (ret)
+ goto hw_deinit_vintf;
+
+ data.out_vintf_mmap_length = SZ_64K;
+ data.out_vintf_mmap_offset = vintf->mmap_offset;
+ ret = iommu_copy_struct_to_user(user_data, &data,
+ IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV,
+ out_vintf_mmap_length);
+ if (ret)
+ goto free_mmap;
+
+ ida_init(&vintf->sids);
+ mutex_init(&vintf->lvcmdq_mutex);
+
+ dev_dbg(cmdqv->dev, "VINTF%u: allocated with vmid (%d)\n", vintf->idx,
+ vintf->vsmmu.vmid);
+
+ vsmmu->core.ops = &tegra241_cmdqv_viommu_ops;
+ return 0;
+
+free_mmap:
+ iommufd_viommu_destroy_mmap(&vintf->vsmmu.core, vintf->mmap_offset);
+hw_deinit_vintf:
+ tegra241_vintf_hw_deinit(vintf);
+deinit_vintf:
+ tegra241_cmdqv_deinit_vintf(cmdqv, vintf->idx);
+ return ret;
+}
+
+MODULE_IMPORT_NS("IOMMUFD");
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
index 9dc772f2cbb2..db9b9a8e139c 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
@@ -110,7 +110,6 @@ static struct arm_smmu_device *cavium_smmu_impl_init(struct arm_smmu_device *smm
int arm_mmu500_reset(struct arm_smmu_device *smmu)
{
u32 reg, major;
- int i;
/*
* On MMU-500 r2p0 onwards we need to clear ACR.CACHE_LOCK before
* writes to the context bank ACTLRs will stick. And we just hope that
@@ -128,18 +127,20 @@ int arm_mmu500_reset(struct arm_smmu_device *smmu)
reg |= ARM_MMU500_ACR_SMTNMB_TLBEN | ARM_MMU500_ACR_S2CRB_TLBEN;
arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_sACR, reg);
+#ifdef CONFIG_ARM_SMMU_MMU_500_CPRE_ERRATA
/*
* Disable MMU-500's not-particularly-beneficial next-page
- * prefetcher for the sake of errata #841119 and #826419.
+ * prefetcher for the sake of at least 5 known errata.
*/
- for (i = 0; i < smmu->num_context_banks; ++i) {
+ for (int i = 0; i < smmu->num_context_banks; ++i) {
reg = arm_smmu_cb_read(smmu, i, ARM_SMMU_CB_ACTLR);
reg &= ~ARM_MMU500_ACTLR_CPRE;
arm_smmu_cb_write(smmu, i, ARM_SMMU_CB_ACTLR, reg);
reg = arm_smmu_cb_read(smmu, i, ARM_SMMU_CB_ACTLR);
if (reg & ARM_MMU500_ACTLR_CPRE)
- dev_warn_once(smmu->dev, "Failed to disable prefetcher [errata #841119 and #826419], check ACR.CACHE_LOCK\n");
+ dev_warn_once(smmu->dev, "Failed to disable prefetcher for errata workarounds, check SACR.CACHE_LOCK\n");
}
+#endif
return 0;
}
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c b/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c
index 957d988b6d83..2fce4f6d4e1b 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c
@@ -200,7 +200,7 @@ static irqreturn_t nvidia_smmu_context_fault_bank(int irq,
void __iomem *cb_base = nvidia_smmu_page(smmu, inst, smmu->numpage + idx);
fsr = readl_relaxed(cb_base + ARM_SMMU_CB_FSR);
- if (!(fsr & ARM_SMMU_FSR_FAULT))
+ if (!(fsr & ARM_SMMU_CB_FSR_FAULT))
return IRQ_NONE;
fsynr = readl_relaxed(cb_base + ARM_SMMU_CB_FSYNR0);
@@ -277,7 +277,7 @@ static int nvidia_smmu_init_context(struct arm_smmu_domain *smmu_domain,
*/
if (of_device_is_compatible(np, "nvidia,tegra234-smmu") ||
of_device_is_compatible(np, "nvidia,tegra194-smmu")) {
- smmu->pgsize_bitmap = PAGE_SIZE;
+ smmu->pgsize_bitmap &= GENMASK(PAGE_SHIFT, 0);
pgtbl_cfg->pgsize_bitmap = smmu->pgsize_bitmap;
}
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c
index 552199cbd9e2..65e0ef6539fe 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c
@@ -73,7 +73,7 @@ void qcom_smmu_tlb_sync_debug(struct arm_smmu_device *smmu)
if (__ratelimit(&rs)) {
dev_err(smmu->dev, "TLB sync timed out -- SMMU may be deadlocked\n");
- cfg = qsmmu->cfg;
+ cfg = qsmmu->data->cfg;
if (!cfg)
return;
@@ -141,7 +141,7 @@ static int qcom_tbu_halt(struct qcom_tbu *tbu, struct arm_smmu_domain *smmu_doma
writel_relaxed(val, tbu->base + DEBUG_SID_HALT_REG);
fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR);
- if ((fsr & ARM_SMMU_FSR_FAULT) && (fsr & ARM_SMMU_FSR_SS)) {
+ if ((fsr & ARM_SMMU_CB_FSR_FAULT) && (fsr & ARM_SMMU_CB_FSR_SS)) {
u32 sctlr_orig, sctlr;
/*
@@ -298,7 +298,7 @@ static phys_addr_t qcom_iova_to_phys(struct arm_smmu_domain *smmu_domain,
arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, sctlr);
fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR);
- if (fsr & ARM_SMMU_FSR_FAULT) {
+ if (fsr & ARM_SMMU_CB_FSR_FAULT) {
/* Clear pending interrupts */
arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr);
@@ -306,7 +306,7 @@ static phys_addr_t qcom_iova_to_phys(struct arm_smmu_domain *smmu_domain,
* TBU halt takes care of resuming any stalled transaction.
* Kept it here for completeness' sake.
*/ - if (fsr & ARM_SMMU_FSR_SS) + if (fsr & ARM_SMMU_CB_FSR_SS) arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME, ARM_SMMU_RESUME_TERMINATE); } @@ -320,11 +320,11 @@ static phys_addr_t qcom_iova_to_phys(struct arm_smmu_domain *smmu_domain, phys = qcom_tbu_trigger_atos(smmu_domain, tbu, iova, sid); fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR); - if (fsr & ARM_SMMU_FSR_FAULT) { + if (fsr & ARM_SMMU_CB_FSR_FAULT) { /* Clear pending interrupts */ arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr); - if (fsr & ARM_SMMU_FSR_SS) + if (fsr & ARM_SMMU_CB_FSR_SS) arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME, ARM_SMMU_RESUME_TERMINATE); } @@ -383,68 +383,53 @@ irqreturn_t qcom_smmu_context_fault(int irq, void *dev) struct arm_smmu_domain *smmu_domain = dev; struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; struct arm_smmu_device *smmu = smmu_domain->smmu; - u32 fsr, fsynr, cbfrsynra, resume = 0; + struct arm_smmu_context_fault_info cfi; + u32 resume = 0; int idx = smmu_domain->cfg.cbndx; phys_addr_t phys_soft; - unsigned long iova; int ret, tmp; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR); - if (!(fsr & ARM_SMMU_FSR_FAULT)) - return IRQ_NONE; + arm_smmu_read_context_fault_info(smmu, idx, &cfi); - fsynr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSYNR0); - iova = arm_smmu_cb_readq(smmu, idx, ARM_SMMU_CB_FAR); - cbfrsynra = arm_smmu_gr1_read(smmu, ARM_SMMU_GR1_CBFRSYNRA(idx)); + if (!(cfi.fsr & ARM_SMMU_CB_FSR_FAULT)) + return IRQ_NONE; if (list_empty(&tbu_list)) { - ret = report_iommu_fault(&smmu_domain->domain, NULL, iova, - fsynr & ARM_SMMU_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); + ret = report_iommu_fault(&smmu_domain->domain, NULL, cfi.iova, + cfi.fsynr & ARM_SMMU_CB_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); if (ret == -ENOSYS) - dev_err_ratelimited(smmu->dev, - "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n", - fsr, iova, fsynr, cbfrsynra, idx); + arm_smmu_print_context_fault_info(smmu, idx, &cfi); + + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, cfi.fsr); + + if (cfi.fsr & ARM_SMMU_CB_FSR_SS) { + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME, + ret == -EAGAIN ? 0 : ARM_SMMU_RESUME_TERMINATE); + } - arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr); return IRQ_HANDLED; } - phys_soft = ops->iova_to_phys(ops, iova); + phys_soft = ops->iova_to_phys(ops, cfi.iova); - tmp = report_iommu_fault(&smmu_domain->domain, NULL, iova, - fsynr & ARM_SMMU_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); + tmp = report_iommu_fault(&smmu_domain->domain, NULL, cfi.iova, + cfi.fsynr & ARM_SMMU_CB_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); if (!tmp || tmp == -EBUSY) { - dev_dbg(smmu->dev, - "Context fault handled by client: iova=0x%08lx, fsr=0x%x, fsynr=0x%x, cb=%d\n", - iova, fsr, fsynr, idx); - dev_dbg(smmu->dev, "soft iova-to-phys=%pa\n", &phys_soft); ret = IRQ_HANDLED; resume = ARM_SMMU_RESUME_TERMINATE; + } else if (tmp == -EAGAIN) { + ret = IRQ_HANDLED; + resume = 0; } else { - phys_addr_t phys_atos = qcom_smmu_verify_fault(smmu_domain, iova, fsr); + phys_addr_t phys_atos = qcom_smmu_verify_fault(smmu_domain, cfi.iova, cfi.fsr); if (__ratelimit(&_rs)) { - dev_err(smmu->dev, - "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n", - fsr, iova, fsynr, cbfrsynra, idx); - dev_err(smmu->dev, - "FSR = %08x [%s%s%s%s%s%s%s%s%s], SID=0x%x\n", - fsr, - (fsr & 0x02) ? "TF " : "", - (fsr & 0x04) ? 
"AFF " : "", - (fsr & 0x08) ? "PF " : "", - (fsr & 0x10) ? "EF " : "", - (fsr & 0x20) ? "TLBMCF " : "", - (fsr & 0x40) ? "TLBLKF " : "", - (fsr & 0x80) ? "MHF " : "", - (fsr & 0x40000000) ? "SS " : "", - (fsr & 0x80000000) ? "MULTI " : "", - cbfrsynra); + arm_smmu_print_context_fault_info(smmu, idx, &cfi); dev_err(smmu->dev, "soft iova-to-phys=%pa\n", &phys_soft); @@ -478,17 +463,17 @@ irqreturn_t qcom_smmu_context_fault(int irq, void *dev) */ if (tmp != -EBUSY) { /* Clear the faulting FSR */ - arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr); + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, cfi.fsr); /* Retry or terminate any stalled transactions */ - if (fsr & ARM_SMMU_FSR_SS) + if (cfi.fsr & ARM_SMMU_CB_FSR_SS) arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME, resume); } return ret; } -static int qcom_tbu_probe(struct platform_device *pdev) +int qcom_tbu_probe(struct platform_device *pdev) { struct of_phandle_args args = { .args_count = 2 }; struct device_node *np = pdev->dev.of_node; @@ -530,18 +515,3 @@ static int qcom_tbu_probe(struct platform_device *pdev) return 0; } - -static const struct of_device_id qcom_tbu_of_match[] = { - { .compatible = "qcom,sc7280-tbu" }, - { .compatible = "qcom,sdm845-tbu" }, - { } -}; - -static struct platform_driver qcom_tbu_driver = { - .driver = { - .name = "qcom_tbu", - .of_match_table = qcom_tbu_of_match, - }, - .probe = qcom_tbu_probe, -}; -builtin_platform_driver(qcom_tbu_driver); diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 25f034677f56..573085349df3 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -8,12 +8,48 @@ #include <linux/delay.h> #include <linux/of_device.h> #include <linux/firmware/qcom/qcom_scm.h> +#include <linux/platform_device.h> +#include <linux/pm_runtime.h> #include "arm-smmu.h" #include "arm-smmu-qcom.h" #define QCOM_DUMMY_VAL -1 +/* + * SMMU-500 TRM defines BIT(0) as CMTLB (Enable context caching in the + * macro TLB) and BIT(1) as CPRE (Enable context caching in the prefetch + * buffer). The remaining bits are implementation defined and vary across + * SoCs. 
+ */ + +#define CPRE (1 << 1) +#define CMTLB (1 << 0) +#define PREFETCH_SHIFT 8 +#define PREFETCH_DEFAULT 0 +#define PREFETCH_SHALLOW (1 << PREFETCH_SHIFT) +#define PREFETCH_MODERATE (2 << PREFETCH_SHIFT) +#define PREFETCH_DEEP (3 << PREFETCH_SHIFT) +#define GFX_ACTLR_PRR (1 << 5) + +static const struct of_device_id qcom_smmu_actlr_client_of_match[] = { + { .compatible = "qcom,adreno", + .data = (const void *) (PREFETCH_DEEP | CPRE | CMTLB) }, + { .compatible = "qcom,adreno-gmu", + .data = (const void *) (PREFETCH_DEEP | CPRE | CMTLB) }, + { .compatible = "qcom,adreno-smmu", + .data = (const void *) (PREFETCH_DEEP | CPRE | CMTLB) }, + { .compatible = "qcom,fastrpc", + .data = (const void *) (PREFETCH_DEEP | CPRE | CMTLB) }, + { .compatible = "qcom,sc7280-mdss", + .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, + { .compatible = "qcom,sc7280-venus", + .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, + { .compatible = "qcom,sm8550-mdss", + .data = (const void *) (PREFETCH_DEFAULT | CMTLB) }, + { } +}; + static struct qcom_smmu *to_qcom_smmu(struct arm_smmu_device *smmu) { return container_of(smmu, struct qcom_smmu, smmu); @@ -76,25 +112,80 @@ static void qcom_adreno_smmu_set_stall(const void *cookie, bool enabled) { struct arm_smmu_domain *smmu_domain = (void *)cookie; struct arm_smmu_cfg *cfg = &smmu_domain->cfg; - struct qcom_smmu *qsmmu = to_qcom_smmu(smmu_domain->smmu); + struct arm_smmu_device *smmu = smmu_domain->smmu; + struct qcom_smmu *qsmmu = to_qcom_smmu(smmu); + u32 mask = BIT(cfg->cbndx); + bool stall_changed = !!(qsmmu->stall_enabled & mask) != enabled; + unsigned long flags; if (enabled) - qsmmu->stall_enabled |= BIT(cfg->cbndx); + qsmmu->stall_enabled |= mask; else - qsmmu->stall_enabled &= ~BIT(cfg->cbndx); + qsmmu->stall_enabled &= ~mask; + + /* + * If the device is on and we changed the setting, update the register. + * The spec pseudocode says that CFCFG is resampled after a fault, and + * we believe that no implementations cache it in the TLB, so it should + * be safe to change it without a TLB invalidation. 
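
The conditional register update that follows can be modelled in isolation. Below is a hedged user-space sketch, not the driver itself: pthreads stand in for the cb_lock spinlock, an atomic flag stands in for pm_runtime_get_if_active(), and the CFCFG bit position is assumed to match ARM_SMMU_SCTLR_CFCFG. The point is the pattern: cache the state unconditionally, but only touch the hardware when the value changed and the device is powered.

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define SCTLR_CFCFG (1u << 7)  /* assumed stall-on-fault enable bit */

static pthread_mutex_t cb_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool device_active = true; /* stand-in for runtime PM state */
static uint32_t sctlr_reg;               /* stand-in for the MMIO register */
static uint32_t stall_enabled;           /* cached per-context-bank bitmap */

static void set_stall(unsigned int cbndx, bool enable)
{
	uint32_t mask = 1u << cbndx;
	bool changed = !!(stall_enabled & mask) != enable;

	if (enable)
		stall_enabled |= mask;
	else
		stall_enabled &= ~mask;

	/* Only perform the read-modify-write when the setting actually
	 * changed and the device is powered; otherwise the cached state
	 * is applied later, when the context bank is (re)programmed. */
	if (changed && atomic_load(&device_active)) {
		pthread_mutex_lock(&cb_lock);
		if (enable)
			sctlr_reg |= SCTLR_CFCFG;
		else
			sctlr_reg &= ~SCTLR_CFCFG;
		pthread_mutex_unlock(&cb_lock);
	}
}

int main(void)
{
	set_stall(3, true);
	assert(stall_enabled == (1u << 3) && (sctlr_reg & SCTLR_CFCFG));
	set_stall(3, false);
	assert(!stall_enabled && !(sctlr_reg & SCTLR_CFCFG));
	return 0;
}
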
+ */ + if (stall_changed && pm_runtime_get_if_active(smmu->dev) > 0) { + u32 reg; + + spin_lock_irqsave(&smmu_domain->cb_lock, flags); + reg = arm_smmu_cb_read(smmu, cfg->cbndx, ARM_SMMU_CB_SCTLR); + + if (enabled) + reg |= ARM_SMMU_SCTLR_CFCFG; + else + reg &= ~ARM_SMMU_SCTLR_CFCFG; + + arm_smmu_cb_write(smmu, cfg->cbndx, ARM_SMMU_CB_SCTLR, reg); + spin_unlock_irqrestore(&smmu_domain->cb_lock, flags); + + pm_runtime_put_autosuspend(smmu->dev); + } } -static void qcom_adreno_smmu_resume_translation(const void *cookie, bool terminate) +static void qcom_adreno_smmu_set_prr_bit(const void *cookie, bool set) { struct arm_smmu_domain *smmu_domain = (void *)cookie; - struct arm_smmu_cfg *cfg = &smmu_domain->cfg; struct arm_smmu_device *smmu = smmu_domain->smmu; + struct arm_smmu_cfg *cfg = &smmu_domain->cfg; u32 reg = 0; + int ret; - if (terminate) - reg |= ARM_SMMU_RESUME_TERMINATE; + ret = pm_runtime_resume_and_get(smmu->dev); + if (ret < 0) { + dev_err(smmu->dev, "failed to get runtime PM: %d\n", ret); + return; + } - arm_smmu_cb_write(smmu, cfg->cbndx, ARM_SMMU_CB_RESUME, reg); + reg = arm_smmu_cb_read(smmu, cfg->cbndx, ARM_SMMU_CB_ACTLR); + reg &= ~GFX_ACTLR_PRR; + if (set) + reg |= FIELD_PREP(GFX_ACTLR_PRR, 1); + arm_smmu_cb_write(smmu, cfg->cbndx, ARM_SMMU_CB_ACTLR, reg); + pm_runtime_put_autosuspend(smmu->dev); +} + +static void qcom_adreno_smmu_set_prr_addr(const void *cookie, phys_addr_t page_addr) +{ + struct arm_smmu_domain *smmu_domain = (void *)cookie; + struct arm_smmu_device *smmu = smmu_domain->smmu; + int ret; + + ret = pm_runtime_resume_and_get(smmu->dev); + if (ret < 0) { + dev_err(smmu->dev, "failed to get runtime PM: %d\n", ret); + return; + } + + writel_relaxed(lower_32_bits(page_addr), + smmu->base + ARM_SMMU_GFX_PRR_CFG_LADDR); + writel_relaxed(upper_32_bits(page_addr), + smmu->base + ARM_SMMU_GFX_PRR_CFG_UADDR); + pm_runtime_put_autosuspend(smmu->dev); } #define QCOM_ADRENO_SMMU_GPU_SID 0 @@ -205,13 +296,37 @@ static bool qcom_adreno_can_do_ttbr1(struct arm_smmu_device *smmu) return true; } +static void qcom_smmu_set_actlr_dev(struct device *dev, struct arm_smmu_device *smmu, int cbndx, + const struct of_device_id *client_match) +{ + const struct of_device_id *match = + of_match_device(client_match, dev); + + if (!match) { + dev_dbg(dev, "no ACTLR settings present\n"); + return; + } + + arm_smmu_cb_write(smmu, cbndx, ARM_SMMU_CB_ACTLR, (unsigned long)match->data); +} + static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, struct io_pgtable_cfg *pgtbl_cfg, struct device *dev) { + const struct device_node *np = smmu_domain->smmu->dev->of_node; + struct arm_smmu_device *smmu = smmu_domain->smmu; + struct qcom_smmu *qsmmu = to_qcom_smmu(smmu); + const struct of_device_id *client_match; + int cbndx = smmu_domain->cfg.cbndx; struct adreno_smmu_priv *priv; smmu_domain->cfg.flush_walk_prefer_tlbiasid = true; + client_match = qsmmu->data->client_match; + + if (client_match) + qcom_smmu_set_actlr_dev(dev, smmu, cbndx, client_match); + /* Only enable split pagetables for the GPU device (SID 0) */ if (!qcom_adreno_smmu_is_gpu_device(dev)) return 0; @@ -236,7 +351,15 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, priv->set_ttbr0_cfg = qcom_adreno_smmu_set_ttbr0_cfg; priv->get_fault_info = qcom_adreno_smmu_get_fault_info; priv->set_stall = qcom_adreno_smmu_set_stall; - priv->resume_translation = qcom_adreno_smmu_resume_translation; + priv->set_prr_bit = NULL; + priv->set_prr_addr = NULL; + + if (of_device_is_compatible(np, 
"qcom,smmu-500") && + !of_device_is_compatible(np, "qcom,sm8250-smmu-500") && + of_device_is_compatible(np, "qcom,adreno-smmu")) { + priv->set_prr_bit = qcom_adreno_smmu_set_prr_bit; + priv->set_prr_addr = qcom_adreno_smmu_set_prr_addr; + } return 0; } @@ -244,9 +367,11 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { .compatible = "qcom,adreno" }, { .compatible = "qcom,adreno-gmu" }, + { .compatible = "qcom,glymur-mdss" }, { .compatible = "qcom,mdp4" }, { .compatible = "qcom,mdss" }, { .compatible = "qcom,qcm2290-mdss" }, + { .compatible = "qcom,sar2130p-mdss" }, { .compatible = "qcom,sc7180-mdss" }, { .compatible = "qcom,sc7180-mss-pil" }, { .compatible = "qcom,sc7280-mdss" }, @@ -256,6 +381,7 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { .compatible = "qcom,sdm670-mdss" }, { .compatible = "qcom,sdm845-mdss" }, { .compatible = "qcom,sdm845-mss-pil" }, + { .compatible = "qcom,sm6115-mdss" }, { .compatible = "qcom,sm6350-mdss" }, { .compatible = "qcom,sm6375-mdss" }, { .compatible = "qcom,sm8150-mdss" }, @@ -267,8 +393,18 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { static int qcom_smmu_init_context(struct arm_smmu_domain *smmu_domain, struct io_pgtable_cfg *pgtbl_cfg, struct device *dev) { + struct arm_smmu_device *smmu = smmu_domain->smmu; + struct qcom_smmu *qsmmu = to_qcom_smmu(smmu); + const struct of_device_id *client_match; + int cbndx = smmu_domain->cfg.cbndx; + smmu_domain->cfg.flush_walk_prefer_tlbiasid = true; + client_match = qsmmu->data->client_match; + + if (client_match) + qcom_smmu_set_actlr_dev(dev, smmu, cbndx, client_match); + return 0; } @@ -281,18 +417,34 @@ static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) int i; /* - * Some platforms support more than the Arm SMMU architected maximum of - * 128 stream matching groups. For unknown reasons, the additional - * groups don't exhibit the same behavior as the architected registers, - * so limit the groups to 128 until the behavior is fixed for the other - * groups. + * MSM8998 LPASS SMMU reports 13 context banks, but accessing + * the last context bank crashes the system. */ - if (smmu->num_mapping_groups > 128) { - dev_notice(smmu->dev, "\tLimiting the stream matching groups to 128\n"); - smmu->num_mapping_groups = 128; + if (of_device_is_compatible(smmu->dev->of_node, "qcom,msm8998-smmu-v2") && + smmu->num_context_banks == 13) { + smmu->num_context_banks = 12; + } else if (of_device_is_compatible(smmu->dev->of_node, "qcom,sdm630-smmu-v2")) { + if (smmu->num_context_banks == 21) /* SDM630 / SDM660 A2NOC SMMU */ + smmu->num_context_banks = 7; + else if (smmu->num_context_banks == 14) /* SDM630 / SDM660 LPASS SMMU */ + smmu->num_context_banks = 13; } - last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); + /* + * Some platforms support more than the Arm SMMU architected maximum of + * 128 stream matching groups. The additional registers appear to have + * the same behavior as the architected registers in the hardware. + * However, on some firmware versions, the hypervisor does not + * correctly trap and emulate accesses to the additional registers, + * resulting in unexpected behavior. + * + * If there are more than 128 groups, use the last reliable group to + * detect if we need to apply the bypass quirk. 
+ */ + if (smmu->num_mapping_groups > 128) + last_s2cr = ARM_SMMU_GR0_S2CR(127); + else + last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); /* * With some firmware versions writes to S2CR of type FAULT are @@ -315,6 +467,11 @@ static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) reg = FIELD_PREP(ARM_SMMU_CBAR_TYPE, CBAR_TYPE_S1_TRANS_S2_BYPASS); arm_smmu_gr1_write(smmu, ARM_SMMU_GR1_CBAR(qsmmu->bypass_cbndx), reg); + + if (smmu->num_mapping_groups > 128) { + dev_notice(smmu->dev, "\tLimiting the stream matching groups to 128\n"); + smmu->num_mapping_groups = 128; + } } for (i = 0; i < smmu->num_mapping_groups; i++) { @@ -336,6 +493,19 @@ static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) return 0; } +static int qcom_adreno_smmuv2_cfg_probe(struct arm_smmu_device *smmu) +{ + /* Support for 16K pages is advertised on some SoCs, but it doesn't seem to work */ + smmu->features &= ~ARM_SMMU_FEAT_FMT_AARCH64_16K; + + /* TZ protects several last context banks, hide them from Linux */ + if (of_device_is_compatible(smmu->dev->of_node, "qcom,sdm630-smmu-v2") && + smmu->num_context_banks == 5) + smmu->num_context_banks = 2; + + return 0; +} + static void qcom_smmu_write_s2cr(struct arm_smmu_device *smmu, int idx) { struct arm_smmu_s2cr *s2cr = smmu->s2crs + idx; @@ -434,10 +604,12 @@ static const struct arm_smmu_impl sdm845_smmu_500_impl = { static const struct arm_smmu_impl qcom_adreno_smmu_v2_impl = { .init_context = qcom_adreno_smmu_init_context, + .cfg_probe = qcom_adreno_smmuv2_cfg_probe, .def_domain_type = qcom_smmu_def_domain_type, .alloc_context_bank = qcom_adreno_smmu_alloc_context_bank, .write_sctlr = qcom_adreno_smmu_write_sctlr, .tlb_sync = qcom_smmu_tlb_sync, + .context_fault_needs_threaded_irq = true, }; static const struct arm_smmu_impl qcom_adreno_smmu_500_impl = { @@ -447,6 +619,7 @@ static const struct arm_smmu_impl qcom_adreno_smmu_500_impl = { .alloc_context_bank = qcom_adreno_smmu_alloc_context_bank, .write_sctlr = qcom_adreno_smmu_write_sctlr, .tlb_sync = qcom_smmu_tlb_sync, + .context_fault_needs_threaded_irq = true, }; static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu, @@ -469,14 +642,15 @@ static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu, /* Check to make sure qcom_scm has finished probing */ if (!qcom_scm_is_available()) - return ERR_PTR(-EPROBE_DEFER); + return ERR_PTR(dev_err_probe(smmu->dev, -EPROBE_DEFER, + "qcom_scm not ready\n")); qsmmu = devm_krealloc(smmu->dev, smmu, sizeof(*qsmmu), GFP_KERNEL); if (!qsmmu) return ERR_PTR(-ENOMEM); qsmmu->smmu.impl = impl; - qsmmu->cfg = data->cfg; + qsmmu->data = data; return &qsmmu->smmu; } @@ -519,6 +693,7 @@ static const struct qcom_smmu_match_data qcom_smmu_500_impl0_data = { .impl = &qcom_smmu_500_impl, .adreno_impl = &qcom_adreno_smmu_500_impl, .cfg = &qcom_smmu_impl0_cfg, + .client_match = qcom_smmu_actlr_client_of_match, }; /* @@ -536,6 +711,7 @@ static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = { { .compatible = "qcom,sc8180x-smmu-500", .data = &qcom_smmu_500_impl0_data }, { .compatible = "qcom,sc8280xp-smmu-500", .data = &qcom_smmu_500_impl0_data }, { .compatible = "qcom,sdm630-smmu-v2", .data = &qcom_smmu_v2_data }, + { .compatible = "qcom,sdm670-smmu-v2", .data = &qcom_smmu_v2_data }, { .compatible = "qcom,sdm845-smmu-v2", .data = &qcom_smmu_v2_data }, { .compatible = "qcom,sdm845-smmu-500", .data = &sdm845_smmu_500_data }, { .compatible = "qcom,sm6115-smmu-500", .data = &qcom_smmu_500_impl0_data}, @@ -561,10 
+737,47 @@ static struct acpi_platform_list qcom_acpi_platlist[] = { }; #endif +static int qcom_smmu_tbu_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + int ret; + + if (IS_ENABLED(CONFIG_ARM_SMMU_QCOM_DEBUG)) { + ret = qcom_tbu_probe(pdev); + if (ret) + return ret; + } + + if (dev->pm_domain) { + pm_runtime_set_active(dev); + pm_runtime_enable(dev); + } + + return 0; +} + +static const struct of_device_id qcom_smmu_tbu_of_match[] = { + { .compatible = "qcom,sc7280-tbu" }, + { .compatible = "qcom,sdm845-tbu" }, + { } +}; + +static struct platform_driver qcom_smmu_tbu_driver = { + .driver = { + .name = "qcom_tbu", + .of_match_table = qcom_smmu_tbu_of_match, + }, + .probe = qcom_smmu_tbu_probe, +}; + struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu) { const struct device_node *np = smmu->dev->of_node; const struct of_device_id *match; + static u8 tbu_registered; + + if (!tbu_registered++) + platform_driver_register(&qcom_smmu_tbu_driver); #ifdef CONFIG_ACPI if (np == NULL) { diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h index 9bb3ae7d62da..8addd453f5f1 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h @@ -8,7 +8,7 @@ struct qcom_smmu { struct arm_smmu_device smmu; - const struct qcom_smmu_config *cfg; + const struct qcom_smmu_match_data *data; bool bypass_quirk; u8 bypass_cbndx; u32 stall_enabled; @@ -28,14 +28,17 @@ struct qcom_smmu_match_data { const struct qcom_smmu_config *cfg; const struct arm_smmu_impl *impl; const struct arm_smmu_impl *adreno_impl; + const struct of_device_id * const client_match; }; irqreturn_t qcom_smmu_context_fault(int irq, void *dev); #ifdef CONFIG_ARM_SMMU_QCOM_DEBUG void qcom_smmu_tlb_sync_debug(struct arm_smmu_device *smmu); +int qcom_tbu_probe(struct platform_device *pdev); #else static inline void qcom_smmu_tlb_sync_debug(struct arm_smmu_device *smmu) { } +static inline int qcom_tbu_probe(struct platform_device *pdev) { return -EINVAL; } #endif #endif /* _ARM_SMMU_QCOM_H */ diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 87c81f75cf84..5e690cf85ec9 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -34,6 +34,7 @@ #include <linux/pm_runtime.h> #include <linux/ratelimit.h> #include <linux/slab.h> +#include <linux/string_choices.h> #include <linux/fsl/mc.h> @@ -78,8 +79,11 @@ static inline int arm_smmu_rpm_get(struct arm_smmu_device *smmu) static inline void arm_smmu_rpm_put(struct arm_smmu_device *smmu) { - if (pm_runtime_enabled(smmu->dev)) - pm_runtime_put_autosuspend(smmu->dev); + if (pm_runtime_enabled(smmu->dev)) { + pm_runtime_mark_last_busy(smmu->dev); + __pm_runtime_put_autosuspend(smmu->dev); + + } } static void arm_smmu_rpm_use_autosuspend(struct arm_smmu_device *smmu) @@ -105,7 +109,7 @@ static struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom) } static struct platform_driver arm_smmu_driver; -static struct iommu_ops arm_smmu_ops; +static const struct iommu_ops arm_smmu_ops; #ifdef CONFIG_ARM_SMMU_LEGACY_DT_BINDINGS static struct device_node *dev_get_dev_node(struct device *dev) @@ -178,8 +182,7 @@ static int arm_smmu_register_legacy_master(struct device *dev, it.cur_count = 1; } - err = iommu_fwspec_init(dev, &smmu_dev->of_node->fwnode, - &arm_smmu_ops); + err = iommu_fwspec_init(dev, NULL); if (err) return err; @@ -405,32 +408,78 @@ static const struct iommu_flush_ops 
arm_smmu_s2_tlb_ops_v1 = { .tlb_add_page = arm_smmu_tlb_add_page_s2_v1, }; + +void arm_smmu_read_context_fault_info(struct arm_smmu_device *smmu, int idx, + struct arm_smmu_context_fault_info *cfi) +{ + cfi->iova = arm_smmu_cb_readq(smmu, idx, ARM_SMMU_CB_FAR); + cfi->fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR); + cfi->fsynr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSYNR0); + cfi->cbfrsynra = arm_smmu_gr1_read(smmu, ARM_SMMU_GR1_CBFRSYNRA(idx)); +} + +void arm_smmu_print_context_fault_info(struct arm_smmu_device *smmu, int idx, + const struct arm_smmu_context_fault_info *cfi) +{ + dev_err(smmu->dev, + "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n", + cfi->fsr, cfi->iova, cfi->fsynr, cfi->cbfrsynra, idx); + + dev_err(smmu->dev, "FSR = %08x [%s%sFormat=%u%s%s%s%s%s%s%s%s], SID=0x%x\n", + cfi->fsr, + (cfi->fsr & ARM_SMMU_CB_FSR_MULTI) ? "MULTI " : "", + (cfi->fsr & ARM_SMMU_CB_FSR_SS) ? "SS " : "", + (u32)FIELD_GET(ARM_SMMU_CB_FSR_FORMAT, cfi->fsr), + (cfi->fsr & ARM_SMMU_CB_FSR_UUT) ? " UUT" : "", + (cfi->fsr & ARM_SMMU_CB_FSR_ASF) ? " ASF" : "", + (cfi->fsr & ARM_SMMU_CB_FSR_TLBLKF) ? " TLBLKF" : "", + (cfi->fsr & ARM_SMMU_CB_FSR_TLBMCF) ? " TLBMCF" : "", + (cfi->fsr & ARM_SMMU_CB_FSR_EF) ? " EF" : "", + (cfi->fsr & ARM_SMMU_CB_FSR_PF) ? " PF" : "", + (cfi->fsr & ARM_SMMU_CB_FSR_AFF) ? " AFF" : "", + (cfi->fsr & ARM_SMMU_CB_FSR_TF) ? " TF" : "", + cfi->cbfrsynra); + + dev_err(smmu->dev, "FSYNR0 = %08x [S1CBNDX=%u%s%s%s%s%s%s PLVL=%u]\n", + cfi->fsynr, + (u32)FIELD_GET(ARM_SMMU_CB_FSYNR0_S1CBNDX, cfi->fsynr), + (cfi->fsynr & ARM_SMMU_CB_FSYNR0_AFR) ? " AFR" : "", + (cfi->fsynr & ARM_SMMU_CB_FSYNR0_PTWF) ? " PTWF" : "", + (cfi->fsynr & ARM_SMMU_CB_FSYNR0_NSATTR) ? " NSATTR" : "", + (cfi->fsynr & ARM_SMMU_CB_FSYNR0_IND) ? " IND" : "", + (cfi->fsynr & ARM_SMMU_CB_FSYNR0_PNU) ? " PNU" : "", + (cfi->fsynr & ARM_SMMU_CB_FSYNR0_WNR) ? " WNR" : "", + (u32)FIELD_GET(ARM_SMMU_CB_FSYNR0_PLVL, cfi->fsynr)); +} + static irqreturn_t arm_smmu_context_fault(int irq, void *dev) { - u32 fsr, fsynr, cbfrsynra; - unsigned long iova; + struct arm_smmu_context_fault_info cfi; struct arm_smmu_domain *smmu_domain = dev; struct arm_smmu_device *smmu = smmu_domain->smmu; + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); int idx = smmu_domain->cfg.cbndx; int ret; - fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR); - if (!(fsr & ARM_SMMU_FSR_FAULT)) + arm_smmu_read_context_fault_info(smmu, idx, &cfi); + + if (!(cfi.fsr & ARM_SMMU_CB_FSR_FAULT)) return IRQ_NONE; - fsynr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSYNR0); - iova = arm_smmu_cb_readq(smmu, idx, ARM_SMMU_CB_FAR); - cbfrsynra = arm_smmu_gr1_read(smmu, ARM_SMMU_GR1_CBFRSYNRA(idx)); + ret = report_iommu_fault(&smmu_domain->domain, NULL, cfi.iova, + cfi.fsynr & ARM_SMMU_CB_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); - ret = report_iommu_fault(&smmu_domain->domain, NULL, iova, - fsynr & ARM_SMMU_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); + if (ret == -ENOSYS && __ratelimit(&rs)) + arm_smmu_print_context_fault_info(smmu, idx, &cfi); - if (ret == -ENOSYS) - dev_err_ratelimited(smmu->dev, - "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n", - fsr, iova, fsynr, cbfrsynra, idx); + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, cfi.fsr); + + if (cfi.fsr & ARM_SMMU_CB_FSR_SS) { + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME, + ret == -EAGAIN ? 
0 : ARM_SMMU_RESUME_TERMINATE); + } - arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr); return IRQ_HANDLED; } @@ -870,6 +919,8 @@ static void arm_smmu_destroy_domain_context(struct arm_smmu_domain *smmu_domain) static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) { struct arm_smmu_domain *smmu_domain; + struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev); + struct arm_smmu_device *smmu = cfg->smmu; /* * Allocate the domain and initialise some of its data structures. @@ -882,6 +933,7 @@ static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) mutex_init(&smmu_domain->init_mutex); spin_lock_init(&smmu_domain->cb_lock); + smmu_domain->domain.pgsize_bitmap = smmu->pgsize_bitmap; return &smmu_domain->domain; } @@ -1113,7 +1165,8 @@ static void arm_smmu_master_install_s2crs(struct arm_smmu_master_cfg *cfg, } } -static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old) { struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); @@ -1155,7 +1208,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) /* Looks ok, so add the device to the domain */ arm_smmu_master_install_s2crs(cfg, S2CR_TYPE_TRANS, smmu_domain->cfg.cbndx, fwspec); - arm_smmu_rpm_use_autosuspend(smmu); rpm_put: arm_smmu_rpm_put(smmu); return ret; @@ -1178,13 +1230,13 @@ static int arm_smmu_attach_dev_type(struct device *dev, return ret; arm_smmu_master_install_s2crs(cfg, type, 0, fwspec); - arm_smmu_rpm_use_autosuspend(smmu); arm_smmu_rpm_put(smmu); return 0; } static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { return arm_smmu_attach_dev_type(dev, S2CR_TYPE_BYPASS); } @@ -1199,7 +1251,8 @@ static struct iommu_domain arm_smmu_identity_domain = { }; static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { return arm_smmu_attach_dev_type(dev, S2CR_TYPE_FAULT); } @@ -1306,7 +1359,7 @@ static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain, arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_ATS1PR, va); reg = arm_smmu_page(smmu, ARM_SMMU_CB(smmu, idx)) + ARM_SMMU_CB_ATSR; - if (readl_poll_timeout_atomic(reg, tmp, !(tmp & ARM_SMMU_ATSR_ACTIVE), + if (readl_poll_timeout_atomic(reg, tmp, !(tmp & ARM_SMMU_CB_ATSR_ACTIVE), 5, 50)) { spin_unlock_irqrestore(&smmu_domain->cb_lock, flags); dev_err(dev, @@ -1372,8 +1425,8 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) static struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode) { - struct device *dev = driver_find_device_by_fwnode(&arm_smmu_driver.driver, - fwnode); + struct device *dev = bus_find_device_by_fwnode(&platform_bus_type, fwnode); + put_device(dev); return dev ? 
dev_get_drvdata(dev) : NULL; } @@ -1446,7 +1499,6 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) out_cfg_free: kfree(cfg); out_free: - iommu_fwspec_free(dev); return ERR_PTR(ret); } @@ -1519,21 +1571,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) return group; } -static int arm_smmu_enable_nesting(struct iommu_domain *domain) -{ - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - int ret = 0; - - mutex_lock(&smmu_domain->init_mutex); - if (smmu_domain->smmu) - ret = -EPERM; - else - smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED; - mutex_unlock(&smmu_domain->init_mutex); - - return ret; -} - static int arm_smmu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks) { @@ -1596,7 +1633,7 @@ static int arm_smmu_def_domain_type(struct device *dev) return 0; } -static struct iommu_ops arm_smmu_ops = { +static const struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, @@ -1608,7 +1645,6 @@ static struct iommu_ops arm_smmu_ops = { .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, .def_domain_type = arm_smmu_def_domain_type, - .pgsize_bitmap = -1UL, /* Restricted during device attach */ .owner = THIS_MODULE, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = arm_smmu_attach_dev, @@ -1617,7 +1653,6 @@ static struct iommu_ops arm_smmu_ops = { .flush_iotlb_all = arm_smmu_flush_iotlb_all, .iotlb_sync = arm_smmu_iotlb_sync, .iova_to_phys = arm_smmu_iova_to_phys, - .enable_nesting = arm_smmu_enable_nesting, .set_pgtable_quirks = arm_smmu_set_pgtable_quirks, .free = arm_smmu_domain_free, } @@ -1642,7 +1677,7 @@ static void arm_smmu_device_reset(struct arm_smmu_device *smmu) /* Make sure all context banks are disabled and clear CB_FSR */ for (i = 0; i < smmu->num_context_banks; ++i) { arm_smmu_write_context_bank(smmu, i); - arm_smmu_cb_write(smmu, i, ARM_SMMU_CB_FSR, ARM_SMMU_FSR_FAULT); + arm_smmu_cb_write(smmu, i, ARM_SMMU_CB_FSR, ARM_SMMU_CB_FSR_FAULT); } /* Invalidate the TLB, just in case */ @@ -1889,10 +1924,6 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu) if (smmu->features & ARM_SMMU_FEAT_FMT_AARCH64_64K) smmu->pgsize_bitmap |= SZ_64K | SZ_512M; - if (arm_smmu_ops.pgsize_bitmap == -1UL) - arm_smmu_ops.pgsize_bitmap = smmu->pgsize_bitmap; - else - arm_smmu_ops.pgsize_bitmap |= smmu->pgsize_bitmap; dev_notice(smmu->dev, "\tSupported page sizes: 0x%08lx\n", smmu->pgsize_bitmap); @@ -2083,7 +2114,7 @@ static void arm_smmu_rmr_install_bypass_smr(struct arm_smmu_device *smmu) } dev_notice(smmu->dev, "\tpreserved %d boot mapping%s\n", cnt, - cnt == 1 ? "" : "s"); + str_plural(cnt)); iort_put_rmr_sids(dev_fwnode(smmu->dev), &rmr_list); } @@ -2193,29 +2224,26 @@ static int arm_smmu_device_probe(struct platform_device *pdev) i, irq); } + platform_set_drvdata(pdev, smmu); + + /* Check for RMRs and install bypass SMRs if any */ + arm_smmu_rmr_install_bypass_smr(smmu); + + arm_smmu_device_reset(smmu); + arm_smmu_test_smr_masks(smmu); + err = iommu_device_sysfs_add(&smmu->iommu, smmu->dev, NULL, "smmu.%pa", &smmu->ioaddr); - if (err) { - dev_err(dev, "Failed to register iommu in sysfs\n"); - return err; - } + if (err) + return dev_err_probe(dev, err, "Failed to register iommu in sysfs\n"); err = iommu_device_register(&smmu->iommu, &arm_smmu_ops, using_legacy_binding ? 
NULL : dev); if (err) { - dev_err(dev, "Failed to register iommu\n"); iommu_device_sysfs_remove(&smmu->iommu); - return err; + return dev_err_probe(dev, err, "Failed to register iommu\n"); } - platform_set_drvdata(pdev, smmu); - - /* Check for RMRs and install bypass SMRs if any */ - arm_smmu_rmr_install_bypass_smr(smmu); - - arm_smmu_device_reset(smmu); - arm_smmu_test_smr_masks(smmu); - /* * We want to avoid touching dev->power.lock in fastpaths unless * it's really going to do something useful - pm_runtime_enabled() @@ -2225,6 +2253,7 @@ static int arm_smmu_device_probe(struct platform_device *pdev) if (dev->pm_domain) { pm_runtime_set_active(dev); pm_runtime_enable(dev); + arm_smmu_rpm_use_autosuspend(smmu); } return 0; @@ -2333,7 +2362,7 @@ static struct platform_driver arm_smmu_driver = { .suppress_bind_attrs = true, }, .probe = arm_smmu_device_probe, - .remove_new = arm_smmu_device_remove, + .remove = arm_smmu_device_remove, .shutdown = arm_smmu_device_shutdown, }; module_platform_driver(arm_smmu_driver); diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index 4765c6945c34..2dbf3243b5ad 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -154,6 +154,8 @@ enum arm_smmu_cbar_type { #define ARM_SMMU_SCTLR_M BIT(0) #define ARM_SMMU_CB_ACTLR 0x4 +#define ARM_SMMU_GFX_PRR_CFG_LADDR 0x6008 +#define ARM_SMMU_GFX_PRR_CFG_UADDR 0x600C #define ARM_SMMU_CB_RESUME 0x8 #define ARM_SMMU_RESUME_TERMINATE BIT(0) @@ -196,34 +198,42 @@ enum arm_smmu_cbar_type { #define ARM_SMMU_CB_PAR_F BIT(0) #define ARM_SMMU_CB_FSR 0x58 -#define ARM_SMMU_FSR_MULTI BIT(31) -#define ARM_SMMU_FSR_SS BIT(30) -#define ARM_SMMU_FSR_UUT BIT(8) -#define ARM_SMMU_FSR_ASF BIT(7) -#define ARM_SMMU_FSR_TLBLKF BIT(6) -#define ARM_SMMU_FSR_TLBMCF BIT(5) -#define ARM_SMMU_FSR_EF BIT(4) -#define ARM_SMMU_FSR_PF BIT(3) -#define ARM_SMMU_FSR_AFF BIT(2) -#define ARM_SMMU_FSR_TF BIT(1) - -#define ARM_SMMU_FSR_IGN (ARM_SMMU_FSR_AFF | \ - ARM_SMMU_FSR_ASF | \ - ARM_SMMU_FSR_TLBMCF | \ - ARM_SMMU_FSR_TLBLKF) - -#define ARM_SMMU_FSR_FAULT (ARM_SMMU_FSR_MULTI | \ - ARM_SMMU_FSR_SS | \ - ARM_SMMU_FSR_UUT | \ - ARM_SMMU_FSR_EF | \ - ARM_SMMU_FSR_PF | \ - ARM_SMMU_FSR_TF | \ - ARM_SMMU_FSR_IGN) +#define ARM_SMMU_CB_FSR_MULTI BIT(31) +#define ARM_SMMU_CB_FSR_SS BIT(30) +#define ARM_SMMU_CB_FSR_FORMAT GENMASK(10, 9) +#define ARM_SMMU_CB_FSR_UUT BIT(8) +#define ARM_SMMU_CB_FSR_ASF BIT(7) +#define ARM_SMMU_CB_FSR_TLBLKF BIT(6) +#define ARM_SMMU_CB_FSR_TLBMCF BIT(5) +#define ARM_SMMU_CB_FSR_EF BIT(4) +#define ARM_SMMU_CB_FSR_PF BIT(3) +#define ARM_SMMU_CB_FSR_AFF BIT(2) +#define ARM_SMMU_CB_FSR_TF BIT(1) + +#define ARM_SMMU_CB_FSR_IGN (ARM_SMMU_CB_FSR_AFF | \ + ARM_SMMU_CB_FSR_ASF | \ + ARM_SMMU_CB_FSR_TLBMCF | \ + ARM_SMMU_CB_FSR_TLBLKF) + +#define ARM_SMMU_CB_FSR_FAULT (ARM_SMMU_CB_FSR_MULTI | \ + ARM_SMMU_CB_FSR_SS | \ + ARM_SMMU_CB_FSR_UUT | \ + ARM_SMMU_CB_FSR_EF | \ + ARM_SMMU_CB_FSR_PF | \ + ARM_SMMU_CB_FSR_TF | \ + ARM_SMMU_CB_FSR_IGN) #define ARM_SMMU_CB_FAR 0x60 #define ARM_SMMU_CB_FSYNR0 0x68 -#define ARM_SMMU_FSYNR0_WNR BIT(4) +#define ARM_SMMU_CB_FSYNR0_PLVL GENMASK(1, 0) +#define ARM_SMMU_CB_FSYNR0_WNR BIT(4) +#define ARM_SMMU_CB_FSYNR0_PNU BIT(5) +#define ARM_SMMU_CB_FSYNR0_IND BIT(6) +#define ARM_SMMU_CB_FSYNR0_NSATTR BIT(8) +#define ARM_SMMU_CB_FSYNR0_PTWF BIT(10) +#define ARM_SMMU_CB_FSYNR0_AFR BIT(11) +#define ARM_SMMU_CB_FSYNR0_S1CBNDX GENMASK(23, 16) #define ARM_SMMU_CB_FSYNR1 0x6c @@ -237,7 +247,7 @@ enum arm_smmu_cbar_type { #define 
ARM_SMMU_CB_ATS1PR 0x800 #define ARM_SMMU_CB_ATSR 0x8f0 -#define ARM_SMMU_ATSR_ACTIVE BIT(0) +#define ARM_SMMU_CB_ATSR_ACTIVE BIT(0) #define ARM_SMMU_RESUME_TERMINATE BIT(0) @@ -533,4 +543,17 @@ struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu); void arm_smmu_write_context_bank(struct arm_smmu_device *smmu, int idx); int arm_mmu500_reset(struct arm_smmu_device *smmu); +struct arm_smmu_context_fault_info { + unsigned long iova; + u32 fsr; + u32 fsynr; + u32 cbfrsynra; +}; + +void arm_smmu_read_context_fault_info(struct arm_smmu_device *smmu, int idx, + struct arm_smmu_context_fault_info *cfi); + +void arm_smmu_print_context_fault_info(struct arm_smmu_device *smmu, int idx, + const struct arm_smmu_context_fault_info *cfi); + #endif /* _ARM_SMMU_H */ diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c index e079bb7a993e..f69d9276dc55 100644 --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c @@ -194,7 +194,7 @@ static irqreturn_t qcom_iommu_fault(int irq, void *dev) fsr = iommu_readl(ctx, ARM_SMMU_CB_FSR); - if (!(fsr & ARM_SMMU_FSR_FAULT)) + if (!(fsr & ARM_SMMU_CB_FSR_FAULT)) return IRQ_NONE; fsynr = iommu_readl(ctx, ARM_SMMU_CB_FSYNR0); @@ -229,7 +229,7 @@ static int qcom_iommu_init_domain(struct iommu_domain *domain, goto out_unlock; pgtbl_cfg = (struct io_pgtable_cfg) { - .pgsize_bitmap = qcom_iommu_ops.pgsize_bitmap, + .pgsize_bitmap = domain->pgsize_bitmap, .ias = 32, .oas = 40, .tlb = &qcom_flush_ops, @@ -246,8 +246,6 @@ static int qcom_iommu_init_domain(struct iommu_domain *domain, goto out_clear_iommu; } - /* Update the domain's page sizes to reflect the page table format */ - domain->pgsize_bitmap = pgtbl_cfg.pgsize_bitmap; domain->geometry.aperture_end = (1ULL << pgtbl_cfg.ias) - 1; domain->geometry.force_aperture = true; @@ -274,7 +272,7 @@ static int qcom_iommu_init_domain(struct iommu_domain *domain, /* Clear context bank fault address fault status registers */ iommu_writel(ctx, ARM_SMMU_CB_FAR, 0); - iommu_writel(ctx, ARM_SMMU_CB_FSR, ARM_SMMU_FSR_FAULT); + iommu_writel(ctx, ARM_SMMU_CB_FSR, ARM_SMMU_CB_FSR_FAULT); /* TTBRs */ iommu_writeq(ctx, ARM_SMMU_CB_TTBR0, @@ -337,6 +335,7 @@ static struct iommu_domain *qcom_iommu_domain_alloc_paging(struct device *dev) mutex_init(&qcom_domain->init_mutex); spin_lock_init(&qcom_domain->pgtbl_lock); + qcom_domain->domain.pgsize_bitmap = SZ_4K; return &qcom_domain->domain; } @@ -360,7 +359,8 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain) kfree(qcom_domain); } -static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int qcom_iommu_attach_dev(struct iommu_domain *domain, + struct device *dev, struct iommu_domain *old) { struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev); struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain); @@ -389,18 +389,18 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev } static int qcom_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct qcom_iommu_domain *qcom_domain; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev); unsigned int i; - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; - qcom_domain = to_qcom_iommu_domain(domain); + qcom_domain = 
to_qcom_iommu_domain(old); if (WARN_ON(!qcom_domain->iommu)) return -EINVAL; @@ -566,14 +566,14 @@ static int qcom_iommu_of_xlate(struct device *dev, qcom_iommu = platform_get_drvdata(iommu_pdev); + put_device(&iommu_pdev->dev); + /* make sure the asid specified in dt is valid, so we don't have * to sanity check this elsewhere: */ if (WARN_ON(asid > qcom_iommu->max_asid) || - WARN_ON(qcom_iommu->ctxs[asid] == NULL)) { - put_device(&iommu_pdev->dev); + WARN_ON(qcom_iommu->ctxs[asid] == NULL)) return -EINVAL; - } if (!dev_iommu_priv_get(dev)) { dev_iommu_priv_set(dev, qcom_iommu); @@ -582,10 +582,8 @@ static int qcom_iommu_of_xlate(struct device *dev, * multiple different iommu devices. Multiple context * banks are ok, but multiple devices are not: */ - if (WARN_ON(qcom_iommu != dev_iommu_priv_get(dev))) { - put_device(&iommu_pdev->dev); + if (WARN_ON(qcom_iommu != dev_iommu_priv_get(dev))) return -EINVAL; - } } return iommu_fwspec_add_ids(dev, &asid, 1); @@ -598,7 +596,6 @@ static const struct iommu_ops qcom_iommu_ops = { .probe_device = qcom_iommu_probe_device, .device_group = generic_device_group, .of_xlate = qcom_iommu_of_xlate, - .pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = qcom_iommu_attach_dev, .map_pages = qcom_iommu_map, @@ -759,7 +756,7 @@ static struct platform_driver qcom_iommu_ctx_driver = { .of_match_table = ctx_of_match, }, .probe = qcom_iommu_ctx_probe, - .remove_new = qcom_iommu_ctx_remove, + .remove = qcom_iommu_ctx_remove, }; static bool qcom_iommu_has_secure_context(struct qcom_iommu_dev *qcom_iommu) @@ -931,7 +928,7 @@ static struct platform_driver qcom_iommu_driver = { .pm = &qcom_iommu_pm_ops, }, .probe = qcom_iommu_device_probe, - .remove_new = qcom_iommu_device_remove, + .remove = qcom_iommu_device_remove, }; static int __init qcom_iommu_init(void) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 43520e7275cc..c92088855450 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -17,14 +17,17 @@ #include <linux/gfp.h> #include <linux/huge_mm.h> #include <linux/iommu.h> +#include <linux/iommu-dma.h> #include <linux/iova.h> #include <linux/irq.h> #include <linux/list_sort.h> #include <linux/memremap.h> #include <linux/mm.h> #include <linux/mutex.h> +#include <linux/msi.h> #include <linux/of_iommu.h> #include <linux/pci.h> +#include <linux/pci-p2pdma.h> #include <linux/scatterlist.h> #include <linux/spinlock.h> #include <linux/swiotlb.h> @@ -40,11 +43,6 @@ struct iommu_dma_msi_page { phys_addr_t phys; }; -enum iommu_dma_cookie_type { - IOMMU_DMA_IOVA_COOKIE, - IOMMU_DMA_MSI_COOKIE, -}; - enum iommu_dma_queue_type { IOMMU_DMA_OPTS_PER_CPU_QUEUE, IOMMU_DMA_OPTS_SINGLE_QUEUE, @@ -57,35 +55,30 @@ struct iommu_dma_options { }; struct iommu_dma_cookie { - enum iommu_dma_cookie_type type; + struct iova_domain iovad; + struct list_head msi_page_list; + /* Flush queue */ union { - /* Full allocator for IOMMU_DMA_IOVA_COOKIE */ - struct { - struct iova_domain iovad; - /* Flush queue */ - union { - struct iova_fq *single_fq; - struct iova_fq __percpu *percpu_fq; - }; - /* Number of TLB flushes that have been started */ - atomic64_t fq_flush_start_cnt; - /* Number of TLB flushes that have been finished */ - atomic64_t fq_flush_finish_cnt; - /* Timer to regularily empty the flush queues */ - struct timer_list fq_timer; - /* 1 when timer is active, 0 when not */ - atomic_t fq_timer_on; - }; - /* Trivial linear page allocator for IOMMU_DMA_MSI_COOKIE */ - dma_addr_t 
msi_iova; + struct iova_fq *single_fq; + struct iova_fq __percpu *percpu_fq; }; - struct list_head msi_page_list; - + /* Number of TLB flushes that have been started */ + atomic64_t fq_flush_start_cnt; + /* Number of TLB flushes that have been finished */ + atomic64_t fq_flush_finish_cnt; + /* Timer to regularily empty the flush queues */ + struct timer_list fq_timer; + /* 1 when timer is active, 0 when not */ + atomic_t fq_timer_on; /* Domain for flush queue callback; NULL if flush queue not in use */ - struct iommu_domain *fq_domain; + struct iommu_domain *fq_domain; /* Options for dma-iommu use */ - struct iommu_dma_options options; - struct mutex mutex; + struct iommu_dma_options options; +}; + +struct iommu_dma_msi_cookie { + dma_addr_t msi_iova; + struct list_head msi_page_list; }; static DEFINE_STATIC_KEY_FALSE(iommu_deferred_attach_enabled); @@ -113,7 +106,7 @@ early_param("iommu.forcedac", iommu_dma_forcedac_setup); struct iova_fq_entry { unsigned long iova_pfn; unsigned long pages; - struct list_head freelist; + struct iommu_pages_list freelist; u64 counter; /* Flush counter when this entry was added */ }; @@ -162,6 +155,8 @@ static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq fq->entries[idx].iova_pfn, fq->entries[idx].pages); + fq->entries[idx].freelist = + IOMMU_PAGES_LIST_INIT(fq->entries[idx].freelist); fq->head = (fq->head + 1) & fq->mod_mask; } } @@ -184,7 +179,8 @@ static void fq_flush_iotlb(struct iommu_dma_cookie *cookie) static void fq_flush_timeout(struct timer_list *t) { - struct iommu_dma_cookie *cookie = from_timer(cookie, t, fq_timer); + struct iommu_dma_cookie *cookie = timer_container_of(cookie, t, + fq_timer); int cpu; atomic_set(&cookie->fq_timer_on, 0); @@ -200,7 +196,7 @@ static void fq_flush_timeout(struct timer_list *t) static void queue_iova(struct iommu_dma_cookie *cookie, unsigned long pfn, unsigned long pages, - struct list_head *freelist) + struct iommu_pages_list *freelist) { struct iova_fq *fq; unsigned long flags; @@ -239,7 +235,7 @@ static void queue_iova(struct iommu_dma_cookie *cookie, fq->entries[idx].iova_pfn = pfn; fq->entries[idx].pages = pages; fq->entries[idx].counter = atomic64_read(&cookie->fq_flush_start_cnt); - list_splice(freelist, &fq->entries[idx].freelist); + iommu_pages_list_splice(freelist, &fq->entries[idx].freelist); spin_unlock_irqrestore(&fq->lock, flags); @@ -279,7 +275,7 @@ static void iommu_dma_free_fq(struct iommu_dma_cookie *cookie) if (!cookie->fq_domain) return; - del_timer_sync(&cookie->fq_timer); + timer_delete_sync(&cookie->fq_timer); if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE) iommu_dma_free_fq_single(cookie->single_fq); else @@ -297,7 +293,8 @@ static void iommu_dma_init_one_fq(struct iova_fq *fq, size_t fq_size) spin_lock_init(&fq->lock); for (i = 0; i < fq_size; i++) - INIT_LIST_HEAD(&fq->entries[i].freelist); + fq->entries[i].freelist = + IOMMU_PAGES_LIST_INIT(fq->entries[i].freelist); } static int iommu_dma_init_fq_single(struct iommu_dma_cookie *cookie) @@ -364,39 +361,24 @@ int iommu_dma_init_fq(struct iommu_domain *domain) return 0; } -static inline size_t cookie_msi_granule(struct iommu_dma_cookie *cookie) -{ - if (cookie->type == IOMMU_DMA_IOVA_COOKIE) - return cookie->iovad.granule; - return PAGE_SIZE; -} - -static struct iommu_dma_cookie *cookie_alloc(enum iommu_dma_cookie_type type) -{ - struct iommu_dma_cookie *cookie; - - cookie = kzalloc(sizeof(*cookie), GFP_KERNEL); - if (cookie) { - INIT_LIST_HEAD(&cookie->msi_page_list); - cookie->type = type; - } - return 
cookie; -} - /** * iommu_get_dma_cookie - Acquire DMA-API resources for a domain * @domain: IOMMU domain to prepare for DMA-API usage */ int iommu_get_dma_cookie(struct iommu_domain *domain) { - if (domain->iova_cookie) + struct iommu_dma_cookie *cookie; + + if (domain->cookie_type != IOMMU_COOKIE_NONE) return -EEXIST; - domain->iova_cookie = cookie_alloc(IOMMU_DMA_IOVA_COOKIE); - if (!domain->iova_cookie) + cookie = kzalloc(sizeof(*cookie), GFP_KERNEL); + if (!cookie) return -ENOMEM; - mutex_init(&domain->iova_cookie->mutex); + INIT_LIST_HEAD(&cookie->msi_page_list); + domain->cookie_type = IOMMU_COOKIE_DMA_IOVA; + domain->iova_cookie = cookie; return 0; } @@ -414,48 +396,56 @@ int iommu_get_dma_cookie(struct iommu_domain *domain) */ int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base) { - struct iommu_dma_cookie *cookie; + struct iommu_dma_msi_cookie *cookie; if (domain->type != IOMMU_DOMAIN_UNMANAGED) return -EINVAL; - if (domain->iova_cookie) + if (domain->cookie_type != IOMMU_COOKIE_NONE) return -EEXIST; - cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE); + cookie = kzalloc(sizeof(*cookie), GFP_KERNEL); if (!cookie) return -ENOMEM; cookie->msi_iova = base; - domain->iova_cookie = cookie; + INIT_LIST_HEAD(&cookie->msi_page_list); + domain->cookie_type = IOMMU_COOKIE_DMA_MSI; + domain->msi_cookie = cookie; return 0; } EXPORT_SYMBOL(iommu_get_msi_cookie); /** * iommu_put_dma_cookie - Release a domain's DMA mapping resources - * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() or - * iommu_get_msi_cookie() + * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() */ void iommu_put_dma_cookie(struct iommu_domain *domain) { struct iommu_dma_cookie *cookie = domain->iova_cookie; struct iommu_dma_msi_page *msi, *tmp; - if (!cookie) - return; - - if (cookie->type == IOMMU_DMA_IOVA_COOKIE && cookie->iovad.granule) { + if (cookie->iovad.granule) { iommu_dma_free_fq(cookie); put_iova_domain(&cookie->iovad); } + list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list) + kfree(msi); + kfree(cookie); +} - list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list) { - list_del(&msi->list); +/** + * iommu_put_msi_cookie - Release a domain's MSI mapping resources + * @domain: IOMMU domain previously prepared by iommu_get_msi_cookie() + */ +void iommu_put_msi_cookie(struct iommu_domain *domain) +{ + struct iommu_dma_msi_cookie *cookie = domain->msi_cookie; + struct iommu_dma_msi_page *msi, *tmp; + + list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list) kfree(msi); - } kfree(cookie); - domain->iova_cookie = NULL; } /** @@ -675,7 +665,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, struct device *dev struct iova_domain *iovad; int ret; - if (!cookie || cookie->type != IOMMU_DMA_IOVA_COOKIE) + if (!cookie || domain->cookie_type != IOMMU_COOKIE_DMA_IOVA) return -EINVAL; iovad = &cookie->iovad; @@ -697,23 +687,20 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, struct device *dev domain->geometry.aperture_start >> order); /* start_pfn is always nonzero for an already-initialised domain */ - mutex_lock(&cookie->mutex); if (iovad->start_pfn) { if (1UL << order != iovad->granule || base_pfn != iovad->start_pfn) { pr_warn("Incompatible range for DMA domain\n"); - ret = -EFAULT; - goto done_unlock; + return -EFAULT; } - ret = 0; - goto done_unlock; + return 0; } init_iova_domain(iovad, 1UL << order, base_pfn); ret = iova_domain_init_rcaches(iovad); if (ret) - goto done_unlock; + return ret; 
iommu_dma_init_options(&cookie->options, dev); @@ -722,11 +709,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, struct device *dev (!device_iommu_capable(dev, IOMMU_CAP_DEFERRED_FLUSH) || iommu_dma_init_fq(domain))) domain->type = IOMMU_DOMAIN_DMA; - ret = iova_reserve_iommu_regions(dev, domain); - -done_unlock: - mutex_unlock(&cookie->mutex); - return ret; + return iova_reserve_iommu_regions(dev, domain); } /** @@ -741,7 +724,12 @@ done_unlock: static int dma_info_to_prot(enum dma_data_direction dir, bool coherent, unsigned long attrs) { - int prot = coherent ? IOMMU_CACHE : 0; + int prot; + + if (attrs & DMA_ATTR_MMIO) + prot = IOMMU_MMIO; + else + prot = coherent ? IOMMU_CACHE : 0; if (attrs & DMA_ATTR_PRIVILEGED) prot |= IOMMU_PRIV; @@ -765,9 +753,9 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain, struct iova_domain *iovad = &cookie->iovad; unsigned long shift, iova_len, iova; - if (cookie->type == IOMMU_DMA_MSI_COOKIE) { - cookie->msi_iova += size; - return cookie->msi_iova - size; + if (domain->cookie_type == IOMMU_COOKIE_DMA_MSI) { + domain->msi_cookie->msi_iova += size; + return domain->msi_cookie->msi_iova - size; } shift = iova_shift(iovad); @@ -804,16 +792,16 @@ done: return (dma_addr_t)iova << shift; } -static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie, - dma_addr_t iova, size_t size, struct iommu_iotlb_gather *gather) +static void iommu_dma_free_iova(struct iommu_domain *domain, dma_addr_t iova, + size_t size, struct iommu_iotlb_gather *gather) { - struct iova_domain *iovad = &cookie->iovad; + struct iova_domain *iovad = &domain->iova_cookie->iovad; /* The MSI case is only ever cleaning up its most recent allocation */ - if (cookie->type == IOMMU_DMA_MSI_COOKIE) - cookie->msi_iova -= size; + if (domain->cookie_type == IOMMU_COOKIE_DMA_MSI) + domain->msi_cookie->msi_iova -= size; else if (gather && gather->queued) - queue_iova(cookie, iova_pfn(iovad, iova), + queue_iova(domain->iova_cookie, iova_pfn(iovad, iova), size >> iova_shift(iovad), &gather->freelist); else @@ -841,7 +829,7 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, if (!iotlb_gather.queued) iommu_iotlb_sync(domain, &iotlb_gather); - iommu_dma_free_iova(cookie, dma_addr, size, &iotlb_gather); + iommu_dma_free_iova(domain, dma_addr, size, &iotlb_gather); } static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, @@ -869,7 +857,7 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, return DMA_MAPPING_ERROR; if (iommu_map(domain, iova, phys - iova_off, size, prot, GFP_ATOMIC)) { - iommu_dma_free_iova(cookie, iova, size, NULL); + iommu_dma_free_iova(domain, iova, size, NULL); return DMA_MAPPING_ERROR; } return iova + iova_off; @@ -939,8 +927,7 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev, * but an IOMMU which supports smaller pages might not map the whole thing. 
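
The DMA_ATTR_MMIO handling added to dma_info_to_prot() earlier in this file can be modelled in isolation. In the standalone sketch below all flag values are illustrative stand-ins rather than the kernel's; the point is only the precedence it encodes: MMIO replaces the cacheability hint outright, since an MMIO mapping must never be cacheable regardless of the device's coherency.

#include <assert.h>

enum { DMA_BIDIRECTIONAL, DMA_TO_DEVICE, DMA_FROM_DEVICE, DMA_NONE };
enum { IOMMU_READ = 1, IOMMU_WRITE = 2, IOMMU_CACHE = 4,
       IOMMU_MMIO = 8, IOMMU_PRIV = 16 };
#define DMA_ATTR_PRIVILEGED (1u << 0)
#define DMA_ATTR_MMIO       (1u << 1)

static int dma_info_to_prot(int dir, int coherent, unsigned int attrs)
{
	/* MMIO replaces, rather than augments, the cacheability hint. */
	int prot = (attrs & DMA_ATTR_MMIO) ? IOMMU_MMIO :
		   (coherent ? IOMMU_CACHE : 0);

	if (attrs & DMA_ATTR_PRIVILEGED)
		prot |= IOMMU_PRIV;

	switch (dir) {
	case DMA_BIDIRECTIONAL: return prot | IOMMU_READ | IOMMU_WRITE;
	case DMA_TO_DEVICE:     return prot | IOMMU_READ;
	case DMA_FROM_DEVICE:   return prot | IOMMU_WRITE;
	default:                return 0;
	}
}

int main(void)
{
	assert(dma_info_to_prot(DMA_TO_DEVICE, 1, DMA_ATTR_MMIO)
	       == (IOMMU_MMIO | IOMMU_READ));
	assert(dma_info_to_prot(DMA_FROM_DEVICE, 1, 0)
	       == (IOMMU_CACHE | IOMMU_WRITE));
	return 0;
}
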
*/ static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev, - size_t size, struct sg_table *sgt, gfp_t gfp, pgprot_t prot, - unsigned long attrs) + size_t size, struct sg_table *sgt, gfp_t gfp, unsigned long attrs) { struct iommu_domain *domain = iommu_get_dma_domain(dev); struct iommu_dma_cookie *cookie = domain->iova_cookie; @@ -1007,22 +994,21 @@ static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev, out_free_sg: sg_free_table(sgt); out_free_iova: - iommu_dma_free_iova(cookie, iova, size, NULL); + iommu_dma_free_iova(domain, iova, size, NULL); out_free_pages: __iommu_dma_free_pages(pages, count); return NULL; } static void *iommu_dma_alloc_remap(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, pgprot_t prot, - unsigned long attrs) + dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { struct page **pages; struct sg_table sgt; void *vaddr; + pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs); - pages = __iommu_dma_alloc_noncontiguous(dev, size, &sgt, gfp, prot, - attrs); + pages = __iommu_dma_alloc_noncontiguous(dev, size, &sgt, gfp, attrs); if (!pages) return NULL; *dma_handle = sgt.sgl->dma_address; @@ -1039,9 +1025,23 @@ out_unmap: return NULL; } -static struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev, - size_t size, enum dma_data_direction dir, gfp_t gfp, - unsigned long attrs) +/* + * This is the actual return value from the iommu_dma_alloc_noncontiguous. + * + * The users of the DMA API should only care about the sg_table, but to make + * the DMA-API internal vmaping and freeing easier we stash away the page + * array as well (except for the fallback case). This can go away any time, + * e.g. when a vmap-variant that takes a scatterlist comes along. + */ +struct dma_sgt_handle { + struct sg_table sgt; + struct page **pages; +}; +#define sgt_handle(sgt) \ + container_of((sgt), struct dma_sgt_handle, sgt) + +struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev, size_t size, + enum dma_data_direction dir, gfp_t gfp, unsigned long attrs) { struct dma_sgt_handle *sh; @@ -1049,8 +1049,7 @@ static struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev, if (!sh) return NULL; - sh->pages = __iommu_dma_alloc_noncontiguous(dev, size, &sh->sgt, gfp, - PAGE_KERNEL, attrs); + sh->pages = __iommu_dma_alloc_noncontiguous(dev, size, &sh->sgt, gfp, attrs); if (!sh->pages) { kfree(sh); return NULL; @@ -1058,7 +1057,7 @@ static struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev, return &sh->sgt; } -static void iommu_dma_free_noncontiguous(struct device *dev, size_t size, +void iommu_dma_free_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt, enum dma_data_direction dir) { struct dma_sgt_handle *sh = sgt_handle(sgt); @@ -1069,8 +1068,26 @@ static void iommu_dma_free_noncontiguous(struct device *dev, size_t size, kfree(sh); } -static void iommu_dma_sync_single_for_cpu(struct device *dev, - dma_addr_t dma_handle, size_t size, enum dma_data_direction dir) +void *iommu_dma_vmap_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt) +{ + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + return vmap(sgt_handle(sgt)->pages, count, VM_MAP, PAGE_KERNEL); +} + +int iommu_dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, + size_t size, struct sg_table *sgt) +{ + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + if (vma->vm_pgoff >= count || vma_pages(vma) > count - vma->vm_pgoff) + return -ENXIO; + return vm_map_pages(vma, 
sgt_handle(sgt)->pages, count); +} + +void iommu_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir) { phys_addr_t phys; @@ -1081,12 +1098,11 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev, if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_cpu(phys, size, dir); - if (is_swiotlb_buffer(dev, phys)) - swiotlb_sync_single_for_cpu(dev, phys, size, dir); + swiotlb_sync_single_for_cpu(dev, phys, size, dir); } -static void iommu_dma_sync_single_for_device(struct device *dev, - dma_addr_t dma_handle, size_t size, enum dma_data_direction dir) +void iommu_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir) { phys_addr_t phys; @@ -1094,16 +1110,14 @@ static void iommu_dma_sync_single_for_device(struct device *dev, return; phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); - if (is_swiotlb_buffer(dev, phys)) - swiotlb_sync_single_for_device(dev, phys, size, dir); + swiotlb_sync_single_for_device(dev, phys, size, dir); if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_device(phys, size, dir); } -static void iommu_dma_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) +void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir) { struct scatterlist *sg; int i; @@ -1117,9 +1131,8 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev, arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); } -static void iommu_dma_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) +void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir) { struct scatterlist *sg; int i; @@ -1134,11 +1147,57 @@ static void iommu_dma_sync_sg_for_device(struct device *dev, arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); } -static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) +static phys_addr_t iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iova_domain *iovad = &domain->iova_cookie->iovad; + + if (!is_swiotlb_active(dev)) { + dev_warn_once(dev, "DMA bounce buffers are inactive, unable to map unaligned transaction.\n"); + return (phys_addr_t)DMA_MAPPING_ERROR; + } + + trace_swiotlb_bounced(dev, phys, size); + + phys = swiotlb_tbl_map_single(dev, phys, size, iova_mask(iovad), dir, + attrs); + + /* + * Untrusted devices should not see padding areas with random leftover + * kernel data, so zero the pre- and post-padding. + * swiotlb_tbl_map_single() has initialized the bounce buffer proper to + * the contents of the original memory buffer. + */ + if (phys != (phys_addr_t)DMA_MAPPING_ERROR && dev_is_untrusted(dev)) { + size_t start, virt = (size_t)phys_to_virt(phys); + + /* Pre-padding */ + start = iova_align_down(iovad, virt); + memset((void *)start, 0, virt - start); + + /* Post-padding */ + start = virt + size; + memset((void *)start, 0, iova_align(iovad, start) - start); + } + + return phys; +} + +/* + * Checks if a physical buffer has unaligned boundaries with respect to + * the IOMMU granule. Returns non-zero if either the start or end + * address is not aligned to the granule boundary. 
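
The single-expression test implemented below rests on a small trick worth spelling out: OR-ing the start address with the size exposes a low bit if either one is not granule-aligned, so one mask test covers both boundaries. A standalone check follows (illustrative helper; the granule is assumed to be a power of two, as an IOMMU page size always is):

#include <assert.h>
#include <stdint.h>

static uint64_t iova_unaligned(uint64_t granule, uint64_t phys, uint64_t size)
{
	return (phys | size) & (granule - 1);   /* non-zero => needs bouncing */
}

int main(void)
{
	const uint64_t g = 0x1000;               /* 4 KiB granule */

	assert(iova_unaligned(g, 0x2000, 0x3000) == 0);   /* both aligned */
	assert(iova_unaligned(g, 0x2080, 0x3000) != 0);   /* unaligned start */
	assert(iova_unaligned(g, 0x2000, 0x2e00) != 0);   /* unaligned end */
	return 0;
}
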
+ */ +static inline size_t iova_unaligned(struct iova_domain *iovad, phys_addr_t phys, + size_t size) +{ + return iova_offset(iovad, phys | size); +} + +dma_addr_t iommu_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs) { - phys_addr_t phys = page_to_phys(page) + offset; bool coherent = dev_is_dma_coherent(dev); int prot = dma_info_to_prot(dir, coherent, attrs); struct iommu_domain *domain = iommu_get_dma_domain(dev); @@ -1147,60 +1206,39 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, dma_addr_t iova, dma_mask = dma_get_mask(dev); /* - * If both the physical buffer start address and size are - * page aligned, we don't need to use a bounce page. + * If both the physical buffer start address and size are page aligned, + * we don't need to use a bounce page. */ if (dev_use_swiotlb(dev, size, dir) && - iova_offset(iovad, phys | size)) { - if (!is_swiotlb_active(dev)) { - dev_warn_once(dev, "DMA bounce buffers are inactive, unable to map unaligned transaction.\n"); + iova_unaligned(iovad, phys, size)) { + if (attrs & DMA_ATTR_MMIO) return DMA_MAPPING_ERROR; - } - - trace_swiotlb_bounced(dev, phys, size); - phys = swiotlb_tbl_map_single(dev, phys, size, - iova_mask(iovad), dir, attrs); - - if (phys == DMA_MAPPING_ERROR) + phys = iommu_dma_map_swiotlb(dev, phys, size, dir, attrs); + if (phys == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; - - /* - * Untrusted devices should not see padding areas with random - * leftover kernel data, so zero the pre- and post-padding. - * swiotlb_tbl_map_single() has initialized the bounce buffer - * proper to the contents of the original memory buffer. - */ - if (dev_is_untrusted(dev)) { - size_t start, virt = (size_t)phys_to_virt(phys); - - /* Pre-padding */ - start = iova_align_down(iovad, virt); - memset((void *)start, 0, virt - start); - - /* Post-padding */ - start = virt + size; - memset((void *)start, 0, - iova_align(iovad, start) - start); - } } - if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) arch_sync_dma_for_device(phys, size, dir); iova = __iommu_dma_map(dev, phys, size, prot, dma_mask); - if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(dev, phys)) + if (iova == DMA_MAPPING_ERROR && !(attrs & DMA_ATTR_MMIO)) swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); return iova; } -static void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, +void iommu_dma_unmap_phys(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs) { - struct iommu_domain *domain = iommu_get_dma_domain(dev); phys_addr_t phys; - phys = iommu_iova_to_phys(domain, dma_handle); + if (attrs & DMA_ATTR_MMIO) { + __iommu_dma_unmap(dev, dma_handle, size); + return; + } + + phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); if (WARN_ON(!phys)) return; @@ -1209,8 +1247,7 @@ static void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, __iommu_dma_unmap(dev, dma_handle, size); - if (unlikely(is_swiotlb_buffer(dev, phys))) - swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); + swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); } /* @@ -1314,7 +1351,7 @@ static void iommu_dma_unmap_sg_swiotlb(struct device *dev, struct scatterlist *s int i; for_each_sg(sg, s, nents, i) - iommu_dma_unmap_page(dev, sg_dma_address(s), + iommu_dma_unmap_phys(dev, sg_dma_address(s), sg_dma_len(s), dir, attrs); } @@ -1327,8 
+1364,8 @@ static int iommu_dma_map_sg_swiotlb(struct device *dev, struct scatterlist *sg, sg_dma_mark_swiotlb(sg); for_each_sg(sg, s, nents, i) { - sg_dma_address(s) = iommu_dma_map_page(dev, sg_page(s), - s->offset, s->length, dir, attrs); + sg_dma_address(s) = iommu_dma_map_phys(dev, sg_phys(s), + s->length, dir, attrs); if (sg_dma_address(s) == DMA_MAPPING_ERROR) goto out_unmap; sg_dma_len(s) = s->length; @@ -1348,8 +1385,8 @@ out_unmap: * impedance-matching, to be able to hand off a suitably-aligned list, * but still preserve the original offsets and sizes for the caller. */ -static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, unsigned long attrs) +int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, unsigned long attrs) { struct iommu_domain *domain = iommu_get_dma_domain(dev); struct iommu_dma_cookie *cookie = domain->iova_cookie; @@ -1357,7 +1394,6 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, struct scatterlist *s, *prev = NULL; int prot = dma_info_to_prot(dir, dev_is_dma_coherent(dev), attrs); struct pci_p2pdma_map_state p2pdma_state = {}; - enum pci_p2pdma_map_type map; dma_addr_t iova; size_t iova_len = 0; unsigned long mask = dma_get_seg_boundary(dev); @@ -1387,28 +1423,30 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, size_t s_length = s->length; size_t pad_len = (mask - iova_len + 1) & mask; - if (is_pci_p2pdma_page(sg_page(s))) { - map = pci_p2pdma_map_segment(&p2pdma_state, dev, s); - switch (map) { - case PCI_P2PDMA_MAP_BUS_ADDR: - /* - * iommu_map_sg() will skip this segment as - * it is marked as a bus address, - * __finalise_sg() will copy the dma address - * into the output segment. - */ - continue; - case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: - /* - * Mapping through host bridge should be - * mapped with regular IOVAs, thus we - * do nothing here and continue below. - */ - break; - default: - ret = -EREMOTEIO; - goto out_restore_sg; - } + switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(s))) { + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + /* + * Mapping through host bridge should be mapped with + * regular IOVAs, thus we do nothing here and continue + * below. + */ + break; + case PCI_P2PDMA_MAP_NONE: + break; + case PCI_P2PDMA_MAP_BUS_ADDR: + /* + * iommu_map_sg() will skip this segment as it is marked + * as a bus address, __finalise_sg() will copy the dma + * address into the output segment. 
+ */ + s->dma_address = pci_p2pdma_bus_addr_map( + p2pdma_state.mem, sg_phys(s)); + sg_dma_len(s) = sg->length; + sg_dma_mark_bus_address(s); + continue; + default: + ret = -EREMOTEIO; + goto out_restore_sg; } sg_dma_address(s) = s_iova_off; @@ -1459,7 +1497,7 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, return __finalise_sg(dev, sg, nents, iova); out_free_iova: - iommu_dma_free_iova(cookie, iova, iova_len, NULL); + iommu_dma_free_iova(domain, iova, iova_len, NULL); out_restore_sg: __invalidate_sg(sg, nents); out: @@ -1468,8 +1506,8 @@ out: return ret; } -static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, unsigned long attrs) +void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, unsigned long attrs) { dma_addr_t end = 0, start; struct scatterlist *tmp; @@ -1518,20 +1556,6 @@ static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, __iommu_dma_unmap(dev, start, end - start); } -static dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - return __iommu_dma_map(dev, phys, size, - dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO, - dma_get_mask(dev)); -} - -static void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - __iommu_dma_unmap(dev, handle, size); -} - static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr) { size_t alloc_size = PAGE_ALIGN(size); @@ -1563,7 +1587,7 @@ static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr) dma_free_contiguous(dev, page, alloc_size); } -static void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr, +void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle, unsigned long attrs) { __iommu_dma_unmap(dev, handle, size); @@ -1607,8 +1631,8 @@ out_free_pages: return NULL; } -static void *iommu_dma_alloc(struct device *dev, size_t size, - dma_addr_t *handle, gfp_t gfp, unsigned long attrs) +void *iommu_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, + gfp_t gfp, unsigned long attrs) { bool coherent = dev_is_dma_coherent(dev); int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs); @@ -1619,8 +1643,7 @@ static void *iommu_dma_alloc(struct device *dev, size_t size, if (gfpflags_allow_blocking(gfp) && !(attrs & DMA_ATTR_FORCE_CONTIGUOUS)) { - return iommu_dma_alloc_remap(dev, size, handle, gfp, - dma_pgprot(dev, PAGE_KERNEL, attrs), attrs); + return iommu_dma_alloc_remap(dev, size, handle, gfp, attrs); } if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && @@ -1642,7 +1665,7 @@ static void *iommu_dma_alloc(struct device *dev, size_t size, return cpu_addr; } -static int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma, +int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { @@ -1673,7 +1696,7 @@ static int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma, vma->vm_page_prot); } -static int iommu_dma_get_sgtable(struct device *dev, struct sg_table *sgt, +int iommu_dma_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { @@ -1700,19 +1723,19 @@ static int iommu_dma_get_sgtable(struct device *dev, struct sg_table *sgt, return ret; } -static 
unsigned long iommu_dma_get_merge_boundary(struct device *dev) +unsigned long iommu_dma_get_merge_boundary(struct device *dev) { struct iommu_domain *domain = iommu_get_dma_domain(dev); return (1UL << __ffs(domain->pgsize_bitmap)) - 1; } -static size_t iommu_dma_opt_mapping_size(void) +size_t iommu_dma_opt_mapping_size(void) { return iova_rcache_range(); } -static size_t iommu_dma_max_mapping_size(struct device *dev) +size_t iommu_dma_max_mapping_size(struct device *dev) { if (dev_is_untrusted(dev)) return swiotlb_max_mapping_size(dev); @@ -1720,31 +1743,359 @@ static size_t iommu_dma_max_mapping_size(struct device *dev) return SIZE_MAX; } -static const struct dma_map_ops iommu_dma_ops = { - .flags = DMA_F_PCI_P2PDMA_SUPPORTED | - DMA_F_CAN_SKIP_SYNC, - .alloc = iommu_dma_alloc, - .free = iommu_dma_free, - .alloc_pages_op = dma_common_alloc_pages, - .free_pages = dma_common_free_pages, - .alloc_noncontiguous = iommu_dma_alloc_noncontiguous, - .free_noncontiguous = iommu_dma_free_noncontiguous, - .mmap = iommu_dma_mmap, - .get_sgtable = iommu_dma_get_sgtable, - .map_page = iommu_dma_map_page, - .unmap_page = iommu_dma_unmap_page, - .map_sg = iommu_dma_map_sg, - .unmap_sg = iommu_dma_unmap_sg, - .sync_single_for_cpu = iommu_dma_sync_single_for_cpu, - .sync_single_for_device = iommu_dma_sync_single_for_device, - .sync_sg_for_cpu = iommu_dma_sync_sg_for_cpu, - .sync_sg_for_device = iommu_dma_sync_sg_for_device, - .map_resource = iommu_dma_map_resource, - .unmap_resource = iommu_dma_unmap_resource, - .get_merge_boundary = iommu_dma_get_merge_boundary, - .opt_mapping_size = iommu_dma_opt_mapping_size, - .max_mapping_size = iommu_dma_max_mapping_size, -}; +/** + * dma_iova_try_alloc - Try to allocate an IOVA space + * @dev: Device to allocate the IOVA space for + * @state: IOVA state + * @phys: physical address + * @size: IOVA size + * + * Check if @dev supports the IOVA-based DMA API, and if yes allocate IOVA space + * for the given base address and size. + * + * Note: @phys is only used to calculate the IOVA alignment. Callers that always + * do PAGE_SIZE aligned transfers can safely pass 0 here. + * + * Returns %true if the IOVA-based DMA API can be used and IOVA space has been + * allocated, or %false if the regular DMA API should be used. + */ +bool dma_iova_try_alloc(struct device *dev, struct dma_iova_state *state, + phys_addr_t phys, size_t size) +{ + struct iommu_dma_cookie *cookie; + struct iommu_domain *domain; + struct iova_domain *iovad; + size_t iova_off; + dma_addr_t addr; + + memset(state, 0, sizeof(*state)); + if (!use_dma_iommu(dev)) + return false; + + domain = iommu_get_dma_domain(dev); + cookie = domain->iova_cookie; + iovad = &cookie->iovad; + iova_off = iova_offset(iovad, phys); + + if (static_branch_unlikely(&iommu_deferred_attach_enabled) && + iommu_deferred_attach(dev, iommu_get_domain_for_dev(dev))) + return false; + + if (WARN_ON_ONCE(!size)) + return false; + + /* + * DMA_IOVA_USE_SWIOTLB is a flag which is set by dma-iommu + * internals; make sure that the caller didn't set it and/or + * didn't use this interface to map SIZE_MAX. + */
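+ /* + * The size shares storage with this flag: state->__size carries both + * (see the |= in iommu_dma_iova_link_swiotlb() below), so a size that + * collides with the flag bit cannot be represented. + */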
+ if (WARN_ON_ONCE((u64)size & DMA_IOVA_USE_SWIOTLB)) + return false; + + addr = iommu_dma_alloc_iova(domain, + iova_align(iovad, size + iova_off), + dma_get_mask(dev), dev); + if (!addr) + return false; + + state->addr = addr + iova_off; + state->__size = size; + return true; +} +EXPORT_SYMBOL_GPL(dma_iova_try_alloc); + +/** + * dma_iova_free - Free an IOVA space + * @dev: Device to free the IOVA space for + * @state: IOVA state + * + * Undoes a successful dma_iova_try_alloc(). + * + * Note that all dma_iova_link() calls need to be undone first. For callers + * that never call dma_iova_unlink(), dma_iova_destroy() can be used instead + * which unlinks all ranges and frees the IOVA space in a single efficient + * operation. + */ +void dma_iova_free(struct device *dev, struct dma_iova_state *state) +{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + size_t iova_start_pad = iova_offset(iovad, state->addr); + size_t size = dma_iova_size(state); + + iommu_dma_free_iova(domain, state->addr - iova_start_pad, + iova_align(iovad, size + iova_start_pad), NULL); +} +EXPORT_SYMBOL_GPL(dma_iova_free); + +static int __dma_iova_link(struct device *dev, dma_addr_t addr, + phys_addr_t phys, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + bool coherent = dev_is_dma_coherent(dev); + int prot = dma_info_to_prot(dir, coherent, attrs); + + if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) + arch_sync_dma_for_device(phys, size, dir); + + return iommu_map_nosync(iommu_get_dma_domain(dev), addr, phys, size, + prot, GFP_ATOMIC); +} + +static int iommu_dma_iova_bounce_and_link(struct device *dev, dma_addr_t addr, + phys_addr_t phys, size_t bounce_len, + enum dma_data_direction dir, unsigned long attrs, + size_t iova_start_pad) +{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iova_domain *iovad = &domain->iova_cookie->iovad; + phys_addr_t bounce_phys; + int error; + + bounce_phys = iommu_dma_map_swiotlb(dev, phys, bounce_len, dir, attrs); + if (bounce_phys == DMA_MAPPING_ERROR) + return -ENOMEM; + + error = __dma_iova_link(dev, addr - iova_start_pad, + bounce_phys - iova_start_pad, + iova_align(iovad, bounce_len), dir, attrs); + if (error) + swiotlb_tbl_unmap_single(dev, bounce_phys, bounce_len, dir, + attrs); + return error; +} + +static int iommu_dma_iova_link_swiotlb(struct device *dev, + struct dma_iova_state *state, phys_addr_t phys, size_t offset, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + size_t iova_start_pad = iova_offset(iovad, phys); + size_t iova_end_pad = iova_offset(iovad, phys + size); + dma_addr_t addr = state->addr + offset; + size_t mapped = 0; + int error; + + if (iova_start_pad) { + size_t bounce_len = min(size, iovad->granule - iova_start_pad); + + error = iommu_dma_iova_bounce_and_link(dev, addr, phys, + bounce_len, dir, attrs, iova_start_pad); + if (error) + return error; + state->__size |= DMA_IOVA_USE_SWIOTLB; + + mapped += bounce_len; + size -= bounce_len; + if (!size) + return 0; + } + + size -= iova_end_pad; + error = __dma_iova_link(dev, addr + mapped, phys + mapped, size, dir, + attrs); + if (error) + goto out_unmap; + mapped += size;
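+ + /* + * If the buffer also ends on an unaligned boundary, bounce the tail + * through the swiotlb as was done for the head; only the IOVA-aligned + * middle of the buffer is linked directly. + */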
+ if (iova_end_pad) { + error = iommu_dma_iova_bounce_and_link(dev, addr + mapped, + phys + mapped, iova_end_pad, dir, attrs, 0); + if (error) + goto out_unmap; + state->__size |= DMA_IOVA_USE_SWIOTLB; + } + + return 0; + +out_unmap: + dma_iova_unlink(dev, state, 0, mapped, dir, attrs); + return error; +} + +/** + * dma_iova_link - Link a range of IOVA space + * @dev: DMA device + * @state: IOVA state + * @phys: physical address to link + * @offset: offset into the IOVA state to map into + * @size: size of the buffer + * @dir: DMA direction + * @attrs: attributes of mapping properties + * + * Link a range of IOVA space for the given IOVA state without IOTLB sync. + * This function is used to link multiple physical addresses in contiguous + * IOVA space without performing costly IOTLB sync. + * + * The caller is responsible for calling dma_iova_sync() to sync the IOTLB at + * the end of linkage. + */ +int dma_iova_link(struct device *dev, struct dma_iova_state *state, + phys_addr_t phys, size_t offset, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + size_t iova_start_pad = iova_offset(iovad, phys); + + if (WARN_ON_ONCE(iova_start_pad && offset > 0)) + return -EIO; + + if (dev_use_swiotlb(dev, size, dir) && + iova_unaligned(iovad, phys, size)) { + if (attrs & DMA_ATTR_MMIO) + return -EPERM; + + return iommu_dma_iova_link_swiotlb(dev, state, phys, offset, + size, dir, attrs); + } + + return __dma_iova_link(dev, state->addr + offset - iova_start_pad, + phys - iova_start_pad, + iova_align(iovad, size + iova_start_pad), dir, attrs); +} +EXPORT_SYMBOL_GPL(dma_iova_link);
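+ +/* + * A minimal usage sketch of the allocate/link/sync/destroy flow introduced + * above (dev, phys and len are assumed to be the caller's device, a + * physically contiguous buffer and its length; error handling condensed): + * + * struct dma_iova_state state; + * int ret; + * + * if (!dma_iova_try_alloc(dev, &state, phys, len)) + * return -EIO; // fall back to dma_map_page()/dma_map_sg() + * + * ret = dma_iova_link(dev, &state, phys, 0, len, DMA_TO_DEVICE, 0); + * if (ret) { + * dma_iova_free(dev, &state); // nothing is linked yet + * return ret; + * } + * + * ret = dma_iova_sync(dev, &state, 0, len); + * if (ret) { + * dma_iova_destroy(dev, &state, len, DMA_TO_DEVICE, 0); + * return ret; + * } + * + * // device may now DMA to state.addr .. state.addr + len - 1 + * + * dma_iova_destroy(dev, &state, len, DMA_TO_DEVICE, 0); + */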
+ +/** + * dma_iova_sync - Sync IOTLB + * @dev: DMA device + * @state: IOVA state + * @offset: offset into the IOVA state to sync + * @size: size of the buffer + * + * Sync IOTLB for the given IOVA state. This function should be called on + * the IOVA-contiguous range created by one or more dma_iova_link() calls + * to sync the IOTLB. + */ +int dma_iova_sync(struct device *dev, struct dma_iova_state *state, + size_t offset, size_t size) +{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + dma_addr_t addr = state->addr + offset; + size_t iova_start_pad = iova_offset(iovad, addr); + + return iommu_sync_map(domain, addr - iova_start_pad, + iova_align(iovad, size + iova_start_pad)); +} +EXPORT_SYMBOL_GPL(dma_iova_sync); + +static void iommu_dma_iova_unlink_range_slow(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + size_t iova_start_pad = iova_offset(iovad, addr); + dma_addr_t end = addr + size; + + do { + phys_addr_t phys; + size_t len; + + phys = iommu_iova_to_phys(domain, addr); + if (WARN_ON(!phys)) + /* Something very horrible happened here */ + return; + + len = min_t(size_t, + end - addr, iovad->granule - iova_start_pad); + + if (!dev_is_dma_coherent(dev) && + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) + arch_sync_dma_for_cpu(phys, len, dir); + + swiotlb_tbl_unmap_single(dev, phys, len, dir, attrs); + + addr += len; + iova_start_pad = 0; + } while (addr < end); } + +static void __iommu_dma_iova_unlink(struct device *dev, + struct dma_iova_state *state, size_t offset, size_t size, + enum dma_data_direction dir, unsigned long attrs, + bool free_iova) +{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + dma_addr_t addr = state->addr + offset; + size_t iova_start_pad = iova_offset(iovad, addr); + struct iommu_iotlb_gather iotlb_gather; + size_t unmapped; + + if ((state->__size & DMA_IOVA_USE_SWIOTLB) || + (!dev_is_dma_coherent(dev) && + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))) + iommu_dma_iova_unlink_range_slow(dev, addr, size, dir, attrs); + + iommu_iotlb_gather_init(&iotlb_gather); + iotlb_gather.queued = free_iova && READ_ONCE(cookie->fq_domain); + + size = iova_align(iovad, size + iova_start_pad); + addr -= iova_start_pad; + unmapped = iommu_unmap_fast(domain, addr, size, &iotlb_gather); + WARN_ON(unmapped != size); + + if (!iotlb_gather.queued) + iommu_iotlb_sync(domain, &iotlb_gather); + if (free_iova) + iommu_dma_free_iova(domain, addr, size, &iotlb_gather); +} + +/** + * dma_iova_unlink - Unlink a range of IOVA space + * @dev: DMA device + * @state: IOVA state + * @offset: offset into the IOVA state to unlink + * @size: size of the buffer + * @dir: DMA direction + * @attrs: attributes of mapping properties + * + * Unlink a range of IOVA space for the given IOVA state. + */ +void dma_iova_unlink(struct device *dev, struct dma_iova_state *state, + size_t offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + __iommu_dma_iova_unlink(dev, state, offset, size, dir, attrs, false); +} +EXPORT_SYMBOL_GPL(dma_iova_unlink); + +/** + * dma_iova_destroy - Finish a DMA mapping transaction + * @dev: DMA device + * @state: IOVA state + * @mapped_len: number of bytes to unmap + * @dir: DMA direction + * @attrs: attributes of mapping properties + * + * Unlink the IOVA range up to @mapped_len and free the entire IOVA space. The + * range of IOVA from dma_addr to @mapped_len must all be linked, and be the + * only linked IOVA in state.
+ */ +void dma_iova_destroy(struct device *dev, struct dma_iova_state *state, + size_t mapped_len, enum dma_data_direction dir, + unsigned long attrs) +{ + if (mapped_len) + __iommu_dma_iova_unlink(dev, state, 0, mapped_len, dir, attrs, + true); + else + /* + * We can be here if first call to dma_iova_link() failed and + * there is nothing to unlink, so let's be more clear. + */ + dma_iova_free(dev, state); +} +EXPORT_SYMBOL_GPL(dma_iova_destroy); void iommu_setup_dma_ops(struct device *dev) { @@ -1753,32 +2104,58 @@ void iommu_setup_dma_ops(struct device *dev) if (dev_is_pci(dev)) dev->iommu->pci_32bit_workaround = !iommu_dma_forcedac; - if (iommu_is_dma_domain(domain)) { - if (iommu_dma_init_domain(domain, dev)) - goto out_err; - dev->dma_ops = &iommu_dma_ops; - } else if (dev->dma_ops == &iommu_dma_ops) { - /* Clean up if we've switched *from* a DMA domain */ - dev->dma_ops = NULL; - } + dev->dma_iommu = iommu_is_dma_domain(domain); + if (dev->dma_iommu && iommu_dma_init_domain(domain, dev)) + goto out_err; return; out_err: - pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n", - dev_name(dev)); + pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n", + dev_name(dev)); + dev->dma_iommu = false; +} + +static bool has_msi_cookie(const struct iommu_domain *domain) +{ + return domain && (domain->cookie_type == IOMMU_COOKIE_DMA_IOVA || + domain->cookie_type == IOMMU_COOKIE_DMA_MSI); +} + +static size_t cookie_msi_granule(const struct iommu_domain *domain) +{ + switch (domain->cookie_type) { + case IOMMU_COOKIE_DMA_IOVA: + return domain->iova_cookie->iovad.granule; + case IOMMU_COOKIE_DMA_MSI: + return PAGE_SIZE; + default: + BUG(); + } +} + +static struct list_head *cookie_msi_pages(const struct iommu_domain *domain) +{ + switch (domain->cookie_type) { + case IOMMU_COOKIE_DMA_IOVA: + return &domain->iova_cookie->msi_page_list; + case IOMMU_COOKIE_DMA_MSI: + return &domain->msi_cookie->msi_page_list; + default: + BUG(); + } } static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, phys_addr_t msi_addr, struct iommu_domain *domain) { - struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct list_head *msi_page_list = cookie_msi_pages(domain); struct iommu_dma_msi_page *msi_page; dma_addr_t iova; int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO; - size_t size = cookie_msi_granule(cookie); + size_t size = cookie_msi_granule(domain); msi_addr &= ~(phys_addr_t)(size - 1); - list_for_each_entry(msi_page, &cookie->msi_page_list, list) + list_for_each_entry(msi_page, msi_page_list, list) if (msi_page->phys == msi_addr) return msi_page; @@ -1796,70 +2173,35 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, INIT_LIST_HEAD(&msi_page->list); msi_page->phys = msi_addr; msi_page->iova = iova; - list_add(&msi_page->list, &cookie->msi_page_list); + list_add(&msi_page->list, msi_page_list); return msi_page; out_free_iova: - iommu_dma_free_iova(cookie, iova, size, NULL); + iommu_dma_free_iova(domain, iova, size, NULL); out_free_page: kfree(msi_page); return NULL; } -/** - * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain - * @desc: MSI descriptor, will store the MSI page - * @msi_addr: MSI target address to be mapped - * - * Return: 0 on success or negative error code if the mapping failed. 
- */ -int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) +int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr) { struct device *dev = msi_desc_to_dev(desc); - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct iommu_dma_msi_page *msi_page; - static DEFINE_MUTEX(msi_prepare_lock); /* see below */ + const struct iommu_dma_msi_page *msi_page; - if (!domain || !domain->iova_cookie) { - desc->iommu_cookie = NULL; + if (!has_msi_cookie(domain)) { + msi_desc_set_iommu_msi_iova(desc, 0, 0); return 0; } - /* - * In fact the whole prepare operation should already be serialised by - * irq_domain_mutex further up the callchain, but that's pretty subtle - * on its own, so consider this locking as failsafe documentation... - */ - mutex_lock(&msi_prepare_lock); + iommu_group_mutex_assert(dev); msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain); - mutex_unlock(&msi_prepare_lock); - - msi_desc_set_iommu_cookie(desc, msi_page); - if (!msi_page) return -ENOMEM; - return 0; -} - -/** - * iommu_dma_compose_msi_msg() - Apply translation to an MSI message - * @desc: MSI descriptor prepared by iommu_dma_prepare_msi() - * @msg: MSI message containing target physical address - */ -void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) -{ - struct device *dev = msi_desc_to_dev(desc); - const struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - const struct iommu_dma_msi_page *msi_page; - msi_page = msi_desc_get_iommu_cookie(desc); - - if (!domain || !domain->iova_cookie || WARN_ON(!msi_page)) - return; - - msg->address_hi = upper_32_bits(msi_page->iova); - msg->address_lo &= cookie_msi_granule(domain->iova_cookie) - 1; - msg->address_lo += lower_32_bits(msi_page->iova); + msi_desc_set_iommu_msi_iova(desc, msi_page->iova, + ilog2(cookie_msi_granule(domain))); + return 0; } static int iommu_dma_init(void) diff --git a/drivers/iommu/dma-iommu.h b/drivers/iommu/dma-iommu.h index c12d63457c76..eca201c1f963 100644 --- a/drivers/iommu/dma-iommu.h +++ b/drivers/iommu/dma-iommu.h @@ -13,11 +13,15 @@ void iommu_setup_dma_ops(struct device *dev); int iommu_get_dma_cookie(struct iommu_domain *domain); void iommu_put_dma_cookie(struct iommu_domain *domain); +void iommu_put_msi_cookie(struct iommu_domain *domain); int iommu_dma_init_fq(struct iommu_domain *domain); void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list); +int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr); + extern bool iommu_dma_forcedac; #else /* CONFIG_IOMMU_DMA */ @@ -40,9 +44,19 @@ static inline void iommu_put_dma_cookie(struct iommu_domain *domain) { } +static inline void iommu_put_msi_cookie(struct iommu_domain *domain) +{ +} + static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list) { } +static inline int iommu_dma_sw_msi(struct iommu_domain *domain, + struct msi_desc *desc, phys_addr_t msi_addr) +{ + return -ENODEV; +} + #endif /* CONFIG_IOMMU_DMA */ #endif /* __DMA_IOMMU_H */ diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index c666ecab955d..b512c6b939ac 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -22,6 +22,7 @@ #include <linux/pm_runtime.h> #include <linux/slab.h> +#include "dma-iommu.h" #include "iommu-pages.h" typedef u32 sysmmu_iova_t; @@ -249,7 +250,7 @@ struct exynos_iommu_domain { struct list_head clients; /* list of sysmmu_drvdata.domain_node */ sysmmu_pte_t *pgtable; /* 
lv1 page table, 16KB */ short *lv2entcnt; /* free lv2 entry counter for each section */ - spinlock_t lock; /* lock for modyfying list of clients */ + spinlock_t lock; /* lock for modifying list of clients */ spinlock_t pgtablelock; /* lock for modifying page table @ pgtable */ struct iommu_domain domain; /* generic domain data structure */ }; @@ -292,7 +293,7 @@ struct sysmmu_drvdata { struct clk *aclk; /* SYSMMU's aclk clock */ struct clk *pclk; /* SYSMMU's pclk clock */ struct clk *clk_master; /* master's device clock */ - spinlock_t lock; /* lock for modyfying state */ + spinlock_t lock; /* lock for modifying state */ bool active; /* current status */ struct exynos_iommu_domain *domain; /* domain we belong to */ struct list_head domain_node; /* node for domain clients list */ @@ -746,7 +747,7 @@ static int exynos_sysmmu_probe(struct platform_device *pdev) ret = devm_request_irq(dev, irq, exynos_sysmmu_irq, 0, dev_name(dev), data); if (ret) { - dev_err(dev, "Unabled to register handler of irq %d\n", irq); + dev_err(dev, "Unable to register handler of irq %d\n", irq); return ret; } @@ -832,7 +833,7 @@ static int __maybe_unused exynos_sysmmu_suspend(struct device *dev) struct exynos_iommu_owner *owner = dev_iommu_priv_get(master); mutex_lock(&owner->rpm_lock); - if (&data->domain->domain != &exynos_identity_domain) { + if (data->domain) { dev_dbg(data->sysmmu, "saving state\n"); __sysmmu_disable(data); } @@ -850,7 +851,7 @@ static int __maybe_unused exynos_sysmmu_resume(struct device *dev) struct exynos_iommu_owner *owner = dev_iommu_priv_get(master); mutex_lock(&owner->rpm_lock); - if (&data->domain->domain != &exynos_identity_domain) { + if (data->domain) { dev_dbg(data->sysmmu, "restoring state\n"); __sysmmu_enable(data); } @@ -902,11 +903,11 @@ static struct iommu_domain *exynos_iommu_domain_alloc_paging(struct device *dev) if (!domain) return NULL; - domain->pgtable = iommu_alloc_pages(GFP_KERNEL, 2); + domain->pgtable = iommu_alloc_pages_sz(GFP_KERNEL, SZ_16K); if (!domain->pgtable) goto err_pgtable; - domain->lv2entcnt = iommu_alloc_pages(GFP_KERNEL, 1); + domain->lv2entcnt = iommu_alloc_pages_sz(GFP_KERNEL, SZ_8K); if (!domain->lv2entcnt) goto err_counter; @@ -925,6 +926,8 @@ static struct iommu_domain *exynos_iommu_domain_alloc_paging(struct device *dev) spin_lock_init(&domain->pgtablelock); INIT_LIST_HEAD(&domain->clients); + domain->domain.pgsize_bitmap = SECT_SIZE | LPAGE_SIZE | SPAGE_SIZE; + domain->domain.geometry.aperture_start = 0; domain->domain.geometry.aperture_end = ~0UL; domain->domain.geometry.force_aperture = true; @@ -932,9 +935,9 @@ static struct iommu_domain *exynos_iommu_domain_alloc_paging(struct device *dev) return &domain->domain; err_lv2ent: - iommu_free_pages(domain->lv2entcnt, 1); + iommu_free_pages(domain->lv2entcnt); err_counter: - iommu_free_pages(domain->pgtable, 2); + iommu_free_pages(domain->pgtable); err_pgtable: kfree(domain); return NULL; @@ -975,13 +978,14 @@ static void exynos_iommu_domain_free(struct iommu_domain *iommu_domain) phys_to_virt(base)); } - iommu_free_pages(domain->pgtable, 2); - iommu_free_pages(domain->lv2entcnt, 1); + iommu_free_pages(domain->pgtable); + iommu_free_pages(domain->lv2entcnt); kfree(domain); } static int exynos_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); struct exynos_iommu_domain *domain; @@ -1032,7 +1036,8 @@ static struct iommu_domain exynos_identity_domain = { }; 
static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain); struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); @@ -1041,7 +1046,7 @@ static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain, unsigned long flags; int err; - err = exynos_iommu_identity_attach(&exynos_identity_domain, dev); + err = exynos_iommu_identity_attach(&exynos_identity_domain, dev, old); if (err) return err; @@ -1426,8 +1431,6 @@ static void exynos_iommu_release_device(struct device *dev) struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); struct sysmmu_drvdata *data; - WARN_ON(exynos_iommu_identity_attach(&exynos_identity_domain, dev)); - list_for_each_entry(data, &owner->controllers, owner_node) device_link_del(data->link); } @@ -1443,17 +1446,14 @@ static int exynos_iommu_of_xlate(struct device *dev, return -ENODEV; data = platform_get_drvdata(sysmmu); - if (!data) { - put_device(&sysmmu->dev); + put_device(&sysmmu->dev); + if (!data) return -ENODEV; - } if (!owner) { owner = kzalloc(sizeof(*owner), GFP_KERNEL); - if (!owner) { - put_device(&sysmmu->dev); + if (!owner) return -ENOMEM; - } INIT_LIST_HEAD(&owner->controllers); mutex_init(&owner->rpm_lock); @@ -1473,11 +1473,12 @@ static int exynos_iommu_of_xlate(struct device *dev, static const struct iommu_ops exynos_iommu_ops = { .identity_domain = &exynos_identity_domain, + .release_domain = &exynos_identity_domain, .domain_alloc_paging = exynos_iommu_domain_alloc_paging, .device_group = generic_device_group, .probe_device = exynos_iommu_probe_device, .release_device = exynos_iommu_release_device, - .pgsize_bitmap = SECT_SIZE | LPAGE_SIZE | SPAGE_SIZE, + .get_resv_regions = iommu_dma_get_resv_regions, .of_xlate = exynos_iommu_of_xlate, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = exynos_iommu_attach_device, diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c index e9d2bff4659b..9664ef9840d2 100644 --- a/drivers/iommu/fsl_pamu_domain.c +++ b/drivers/iommu/fsl_pamu_domain.c @@ -64,7 +64,7 @@ static int update_liodn_stash(int liodn, struct fsl_dma_domain *dma_domain, spin_lock_irqsave(&iommu_lock, flags); ret = pamu_update_paace_stash(liodn, val); if (ret) { - pr_debug("Failed to update SPAACE for liodn %d\n ", liodn); + pr_debug("Failed to update SPAACE for liodn %d\n", liodn); spin_unlock_irqrestore(&iommu_lock, flags); return ret; } @@ -238,7 +238,7 @@ static int update_domain_stash(struct fsl_dma_domain *dma_domain, u32 val) } static int fsl_pamu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain); unsigned long flags; @@ -298,9 +298,9 @@ static int fsl_pamu_attach_device(struct iommu_domain *domain, * switches to what looks like BLOCKING. */ static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct fsl_dma_domain *dma_domain; const u32 *prop; int len; @@ -311,11 +311,11 @@ static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain, * Hack to keep things working as they always have, only leaving an * UNMANAGED domain makes it BLOCKING. 
*/ - if (domain == platform_domain || !domain || - domain->type != IOMMU_DOMAIN_UNMANAGED) + if (old == platform_domain || !old || + old->type != IOMMU_DOMAIN_UNMANAGED) return 0; - dma_domain = to_fsl_dma_domain(domain); + dma_domain = to_fsl_dma_domain(old); /* * Use LIODN of the PCI controller while detaching a @@ -416,14 +416,12 @@ static struct iommu_group *fsl_pamu_device_group(struct device *dev) static struct iommu_device *fsl_pamu_probe_device(struct device *dev) { - int len; - /* * uboot must fill the fsl,liodn for platform devices to be supported by * the iommu. */ if (!dev_is_pci(dev) && - !of_get_property(dev->of_node, "fsl,liodn", &len)) + !of_property_present(dev->of_node, "fsl,liodn")) return ERR_PTR(-ENODEV); return &pamu_iommu; diff --git a/drivers/iommu/generic_pt/.kunitconfig b/drivers/iommu/generic_pt/.kunitconfig new file mode 100644 index 000000000000..52ac9e661ffd --- /dev/null +++ b/drivers/iommu/generic_pt/.kunitconfig @@ -0,0 +1,14 @@ +CONFIG_KUNIT=y +CONFIG_GENERIC_PT=y +CONFIG_DEBUG_GENERIC_PT=y +CONFIG_IOMMU_PT=y +CONFIG_IOMMU_PT_AMDV1=y +CONFIG_IOMMU_PT_VTDSS=y +CONFIG_IOMMU_PT_X86_64=y +CONFIG_IOMMU_PT_KUNIT_TEST=y + +CONFIG_IOMMUFD=y +CONFIG_DEBUG_KERNEL=y +CONFIG_FAULT_INJECTION=y +CONFIG_RUNTIME_TESTING_MENU=y +CONFIG_IOMMUFD_TEST=y diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig new file mode 100644 index 000000000000..ce4fb4786914 --- /dev/null +++ b/drivers/iommu/generic_pt/Kconfig @@ -0,0 +1,79 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menuconfig GENERIC_PT + bool "Generic Radix Page Table" if COMPILE_TEST + help + Generic library for building radix tree page tables. + + Generic PT provides a set of HW page table formats and a common + set of APIs to work with them. + +if GENERIC_PT +config DEBUG_GENERIC_PT + bool "Extra debugging checks for GENERIC_PT" + help + Enable extra run time debugging checks for GENERIC_PT code. This + incurs a runtime cost and should not be enabled for production + kernels. + + The kunit tests require this to be enabled to get full coverage. + +config IOMMU_PT + tristate "IOMMU Page Tables" + select IOMMU_API + depends on IOMMU_SUPPORT + depends on GENERIC_PT + help + Generic library for building IOMMU page tables + + IOMMU_PT provides an implementation of the page table operations + related to struct iommu_domain using GENERIC_PT. It provides a single + implementation of the page table operations that can be shared by + multiple drivers. + +if IOMMU_PT +config IOMMU_PT_AMDV1 + tristate "IOMMU page table for 64-bit AMD IOMMU v1" + depends on !GENERIC_ATOMIC64 # for cmpxchg64 + help + iommu_domain implementation for the AMD v1 page table. AMDv1 is the + "host" page table. It supports granular page sizes of almost every + power of 2 and decodes the full 64-bit IOVA space. + + Selected automatically by an IOMMU driver that uses this format. + +config IOMMU_PT_VTDSS + tristate "IOMMU page table for Intel VT-d Second Stage" + depends on !GENERIC_ATOMIC64 # for cmpxchg64 + help + iommu_domain implementation for the Intel VT-d's 64 bit 3/4/5 + level Second Stage page table. It is similar to the X86_64 format with + 4K/2M/1G page sizes. + + Selected automatically by an IOMMU driver that uses this format. + +config IOMMU_PT_X86_64 + tristate "IOMMU page table for x86 64-bit, 4/5 levels" + depends on !GENERIC_ATOMIC64 # for cmpxchg64 + help + iommu_domain implementation for the x86 64-bit 4/5 level page table. 
+ It supports 4K/2M/1G page sizes and can decode a sign-extended + portion of the 64-bit IOVA space. + + Selected automatically by an IOMMU driver that uses this format. + +config IOMMU_PT_KUNIT_TEST + tristate "IOMMU Page Table KUnit Test" if !KUNIT_ALL_TESTS + depends on KUNIT + depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1 + depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64 + depends on IOMMU_PT_VTDSS || !IOMMU_PT_VTDSS + default KUNIT_ALL_TESTS + help + Enable kunit tests for GENERIC_PT and IOMMU_PT that cover all the + enabled page table formats. The test covers most of the GENERIC_PT + functions provided by the page table format, as well as covering the + iommu_domain related functions. + +endif +endif diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile new file mode 100644 index 000000000000..976b49ec97dc --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/Makefile @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: GPL-2.0 + +iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1 +iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock + +iommu_pt_fmt-$(CONFIG_IOMMU_PT_VTDSS) += vtdss + +iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64 + +IOMMU_PT_KUNIT_TEST := +define create_format +obj-$(2) += iommu_$(1).o +iommu_pt_kunit_test-y += kunit_iommu_$(1).o +CFLAGS_kunit_iommu_$(1).o += -DGENERIC_PT_KUNIT=1 +IOMMU_PT_KUNIT_TEST := iommu_pt_kunit_test.o + +endef + +$(eval $(foreach fmt,$(iommu_pt_fmt-y),$(call create_format,$(fmt),y))) +$(eval $(foreach fmt,$(iommu_pt_fmt-m),$(call create_format,$(fmt),m))) + +# The kunit objects are constructed by compiling the main source +# with -DGENERIC_PT_KUNIT +$(obj)/kunit_iommu_%.o: $(src)/iommu_%.c FORCE + $(call rule_mkdir) + $(call if_changed_dep,cc_o_c) + +obj-$(CONFIG_IOMMU_PT_KUNIT_TEST) += $(IOMMU_PT_KUNIT_TEST) diff --git a/drivers/iommu/generic_pt/fmt/amdv1.h b/drivers/iommu/generic_pt/fmt/amdv1.h new file mode 100644 index 000000000000..aa8e1a8ec95f --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/amdv1.h @@ -0,0 +1,411 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * AMD IOMMU v1 page table + * + * This is described in Section "2.2.3 I/O Page Tables for Host Translations" + * of the "AMD I/O Virtualization Technology (IOMMU) Specification" + * + * Note the level numbering here matches the core code, so level 0 is the same + * as mode 1. + * + */ +#ifndef __GENERIC_PT_FMT_AMDV1_H +#define __GENERIC_PT_FMT_AMDV1_H + +#include "defs_amdv1.h" +#include "../pt_defs.h" + +#include <asm/page.h> +#include <linux/bitfield.h> +#include <linux/container_of.h> +#include <linux/mem_encrypt.h> +#include <linux/minmax.h> +#include <linux/sizes.h> +#include <linux/string.h> + +enum { + PT_ITEM_WORD_SIZE = sizeof(u64), + /* + * The IOMMUFD selftest uses the AMDv1 format with some alterations. It + * uses a 2k page size to test cases where the CPU page size is not the + * same.
+ */ +#ifdef AMDV1_IOMMUFD_SELFTEST + PT_MAX_VA_ADDRESS_LG2 = 56, + PT_MAX_OUTPUT_ADDRESS_LG2 = 51, + PT_MAX_TOP_LEVEL = 4, + PT_GRANULE_LG2SZ = 11, +#else + PT_MAX_VA_ADDRESS_LG2 = 64, + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, + PT_MAX_TOP_LEVEL = 5, + PT_GRANULE_LG2SZ = 12, +#endif + PT_TABLEMEM_LG2SZ = 12, + + /* The DTE only has these bits for the top physical address */ + PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12), +}; + +/* PTE bits */ +enum { + AMDV1PT_FMT_PR = BIT(0), + AMDV1PT_FMT_D = BIT(6), + AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9), + AMDV1PT_FMT_OA = GENMASK_ULL(51, 12), + AMDV1PT_FMT_FC = BIT_ULL(60), + AMDV1PT_FMT_IR = BIT_ULL(61), + AMDV1PT_FMT_IW = BIT_ULL(62), +}; + +/* + * gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum; make + * these defines to avoid it. + */ +#define AMDV1PT_FMT_NL_DEFAULT 0 +#define AMDV1PT_FMT_NL_SIZE 7 + +static inline pt_oaddr_t amdv1pt_table_pa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + + if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + return oalog2_mul(FIELD_GET(AMDV1PT_FMT_OA, entry), PT_GRANULE_LG2SZ); +} +#define pt_table_pa amdv1pt_table_pa + +/* Returns the oa for the start of the contiguous entry */ +static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + pt_oaddr_t oa; + + if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + oa = FIELD_GET(AMDV1PT_FMT_OA, entry); + + if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) == AMDV1PT_FMT_NL_SIZE) { + unsigned int sz_bits = oaffz(oa); + + oa = oalog2_set_mod(oa, 0, sz_bits); + } else if (PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) != + AMDV1PT_FMT_NL_DEFAULT)) + return 0; + return oalog2_mul(oa, PT_GRANULE_LG2SZ); +} +#define pt_entry_oa amdv1pt_entry_oa + +static inline bool amdv1pt_can_have_leaf(const struct pt_state *pts) +{ + /* + * Table 15: Page Table Level Parameters + * The top most level cannot have translation entries + */ + return pts->level < PT_MAX_TOP_LEVEL; +} +#define pt_can_have_leaf amdv1pt_can_have_leaf + +/* Body in pt_fmt_defaults.h */ +static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts); + +static inline unsigned int +amdv1pt_entry_num_contig_lg2(const struct pt_state *pts) +{ + u32 code; + + if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) == + AMDV1PT_FMT_NL_DEFAULT) + return ilog2(1); + + PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) != + AMDV1PT_FMT_NL_SIZE); + + /* + * The contiguous size is encoded in the length of a string of 1's in + * the low bits of the OA. Reverse the equation: + * code = log2_to_int(num_contig_lg2 + item_lg2sz - + * PT_GRANULE_LG2SZ - 1) - 1 + * Which can be expressed as: + * num_contig_lg2 = oalog2_ffz(code) + 1 - + * item_lg2sz - PT_GRANULE_LG2SZ + * + * Assume the bit layout is correct and remove the masking. Reorganize + * the equation to move all the arithmetic before the ffz. + */ + code = pts->entry >> (__bf_shf(AMDV1PT_FMT_OA) - 1 + + pt_table_item_lg2sz(pts) - PT_GRANULE_LG2SZ); + return ffz_t(u32, code); +} +#define pt_entry_num_contig_lg2 amdv1pt_entry_num_contig_lg2 + +static inline unsigned int amdv1pt_num_items_lg2(const struct pt_state *pts) +{ + /* + * Top entry covers bits [63:57] only, this is handled through + * max_vasz_lg2.
+ */ + if (PT_WARN_ON(pts->level == 5)) + return 7; + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); +} +#define pt_num_items_lg2 amdv1pt_num_items_lg2 + +static inline pt_vaddr_t amdv1pt_possible_sizes(const struct pt_state *pts) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + if (!amdv1pt_can_have_leaf(pts)) + return 0; + + /* + * Table 14: Example Page Size Encodings + * Address bits 51:32 can be used to encode page sizes greater than 4 + * Gbytes. Address bits 63:52 are zero-extended. + * + * 512GB Pages are not supported due to a hardware bug. + * Otherwise every power of two size is supported. + */ + return GENMASK_ULL(min(51, isz_lg2 + amdv1pt_num_items_lg2(pts) - 1), + isz_lg2) & ~SZ_512G; +} +#define pt_possible_sizes amdv1pt_possible_sizes + +static inline enum pt_entry_type amdv1pt_load_entry_raw(struct pt_state *pts) +{ + const u64 *tablep = pt_cur_table(pts, u64) + pts->index; + unsigned int next_level; + u64 entry; + + pts->entry = entry = READ_ONCE(*tablep); + if (!(entry & AMDV1PT_FMT_PR)) + return PT_ENTRY_EMPTY; + + next_level = FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry); + if (pts->level == 0 || next_level == AMDV1PT_FMT_NL_DEFAULT || + next_level == AMDV1PT_FMT_NL_SIZE) + return PT_ENTRY_OA; + return PT_ENTRY_TABLE; +} +#define pt_load_entry_raw amdv1pt_load_entry_raw + +static inline void +amdv1pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 entry; + + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) + return; + + entry = AMDV1PT_FMT_PR | + FIELD_PREP(AMDV1PT_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) | + attrs->descriptor_bits; + + if (oasz_lg2 == isz_lg2) { + entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, + AMDV1PT_FMT_NL_DEFAULT); + WRITE_ONCE(*tablep, entry); + } else { + unsigned int num_contig_lg2 = oasz_lg2 - isz_lg2; + u64 *end = tablep + log2_to_int(num_contig_lg2); + + entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, + AMDV1PT_FMT_NL_SIZE) | + FIELD_PREP(AMDV1PT_FMT_OA, + oalog2_to_int(oasz_lg2 - PT_GRANULE_LG2SZ - + 1) - + 1); + + /* See amdv1pt_clear_entries() */ + if (num_contig_lg2 <= ilog2(32)) { + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, entry); + } else { + memset64(tablep, entry, log2_to_int(num_contig_lg2)); + } + } + pts->entry = entry; +} +#define pt_install_leaf_entry amdv1pt_install_leaf_entry + +static inline bool amdv1pt_install_table(struct pt_state *pts, + pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs) +{ + u64 entry; + + /* + * IR and IW are ANDed from the table levels along with the PTE. We + * always control permissions from the PTE, so always set IR and IW for + * tables. 
+ */ + entry = AMDV1PT_FMT_PR | + FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, pts->level) | + FIELD_PREP(AMDV1PT_FMT_OA, + log2_div(table_pa, PT_GRANULE_LG2SZ)) | + AMDV1PT_FMT_IR | AMDV1PT_FMT_IW; + if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + entry = __sme_set(entry); + return pt_table_install64(pts, entry); +} +#define pt_install_table amdv1pt_install_table + +static inline void amdv1pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + attrs->descriptor_bits = + pts->entry & (AMDV1PT_FMT_FC | AMDV1PT_FMT_IR | AMDV1PT_FMT_IW); +} +#define pt_attr_from_entry amdv1pt_attr_from_entry + +static inline void amdv1pt_clear_entries(struct pt_state *pts, + unsigned int num_contig_lg2) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 *end = tablep + log2_to_int(num_contig_lg2); + + /* + * gcc generates rep stos for the io-pgtable code, and this difference + * can show in microbenchmarks with larger contiguous page sizes. + * rep is slower for small cases. + */ + if (num_contig_lg2 <= ilog2(32)) { + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, 0); + } else { + memset64(tablep, 0, log2_to_int(num_contig_lg2)); + } +} +#define pt_clear_entries amdv1pt_clear_entries + +static inline bool amdv1pt_entry_is_write_dirty(const struct pt_state *pts) +{ + unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts); + u64 *tablep = pt_cur_table(pts, u64) + + log2_set_mod(pts->index, 0, num_contig_lg2); + u64 *end = tablep + log2_to_int(num_contig_lg2); + + for (; tablep != end; tablep++) + if (READ_ONCE(*tablep) & AMDV1PT_FMT_D) + return true; + return false; +} +#define pt_entry_is_write_dirty amdv1pt_entry_is_write_dirty + +static inline void amdv1pt_entry_make_write_clean(struct pt_state *pts) +{ + unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts); + u64 *tablep = pt_cur_table(pts, u64) + + log2_set_mod(pts->index, 0, num_contig_lg2); + u64 *end = tablep + log2_to_int(num_contig_lg2); + + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)AMDV1PT_FMT_D); +} +#define pt_entry_make_write_clean amdv1pt_entry_make_write_clean + +static inline bool amdv1pt_entry_make_write_dirty(struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 new = pts->entry | AMDV1PT_FMT_D; + + return try_cmpxchg64(tablep, &pts->entry, new); +} +#define pt_entry_make_write_dirty amdv1pt_entry_make_write_dirty + +/* --- iommu */ +#include <linux/generic_pt/iommu.h> +#include <linux/iommu.h> + +#define pt_iommu_table pt_iommu_amdv1 + +/* The common struct is in the per-format common struct */ +static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) +{ + return &container_of(iommu_table, struct pt_iommu_amdv1, iommu) + ->amdpt.common; +} + +static inline struct pt_iommu *iommu_from_common(struct pt_common *common) +{ + return &container_of(common, struct pt_iommu_amdv1, amdpt.common)->iommu; +} + +static inline int amdv1pt_iommu_set_prot(struct pt_common *common, + struct pt_write_attrs *attrs, + unsigned int iommu_prot) +{ + u64 pte = 0; + + if (pt_feature(common, PT_FEAT_AMDV1_FORCE_COHERENCE)) + pte |= AMDV1PT_FMT_FC; + if (iommu_prot & IOMMU_READ) + pte |= AMDV1PT_FMT_IR; + if (iommu_prot & IOMMU_WRITE) + pte |= AMDV1PT_FMT_IW; + + /* + * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to + * control this. For now if the tables use sme_set then so do the ptes. 
+ */ + if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + pte = __sme_set(pte); + + attrs->descriptor_bits = pte; + return 0; +} +#define pt_iommu_set_prot amdv1pt_iommu_set_prot + +static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table, + const struct pt_iommu_amdv1_cfg *cfg) +{ + struct pt_amdv1 *table = &iommu_table->amdpt; + unsigned int max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2; + + if (cfg->starting_level == 0 || cfg->starting_level > PT_MAX_TOP_LEVEL) + return -EINVAL; + + if (!pt_feature(&table->common, PT_FEAT_DYNAMIC_TOP) && + cfg->starting_level != PT_MAX_TOP_LEVEL) + max_vasz_lg2 = PT_GRANULE_LG2SZ + + (PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64))) * + (cfg->starting_level + 1); + + table->common.max_vasz_lg2 = + min(max_vasz_lg2, cfg->common.hw_max_vasz_lg2); + table->common.max_oasz_lg2 = + min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2); + pt_top_set_level(&table->common, cfg->starting_level); + return 0; +} +#define pt_iommu_fmt_init amdv1pt_iommu_fmt_init + +#ifndef PT_FMT_VARIANT +static inline void +amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table, + const struct pt_range *top_range, + struct pt_iommu_amdv1_hw_info *info) +{ + info->host_pt_root = virt_to_phys(top_range->top_table); + PT_WARN_ON(info->host_pt_root & ~PT_TOP_PHYS_MASK); + info->mode = top_range->top_level + 1; +} +#define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info +#endif + +#if defined(GENERIC_PT_KUNIT) +static const struct pt_iommu_amdv1_cfg amdv1_kunit_fmt_cfgs[] = { + /* Matches what io_pgtable does */ + [0] = { .starting_level = 2 }, +}; +#define kunit_fmt_cfgs amdv1_kunit_fmt_cfgs +enum { KUNIT_FMT_FEATURES = 0 }; +#endif + +#endif diff --git a/drivers/iommu/generic_pt/fmt/defs_amdv1.h b/drivers/iommu/generic_pt/fmt/defs_amdv1.h new file mode 100644 index 000000000000..0b9614ca6d10 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/defs_amdv1.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + */ +#ifndef __GENERIC_PT_FMT_DEFS_AMDV1_H +#define __GENERIC_PT_FMT_DEFS_AMDV1_H + +#include <linux/generic_pt/common.h> +#include <linux/types.h> + +typedef u64 pt_vaddr_t; +typedef u64 pt_oaddr_t; + +struct amdv1pt_write_attrs { + u64 descriptor_bits; + gfp_t gfp; +}; +#define pt_write_attrs amdv1pt_write_attrs + +#endif diff --git a/drivers/iommu/generic_pt/fmt/defs_vtdss.h b/drivers/iommu/generic_pt/fmt/defs_vtdss.h new file mode 100644 index 000000000000..4a239bcaae2a --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/defs_vtdss.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + * + */ +#ifndef __GENERIC_PT_FMT_DEFS_VTDSS_H +#define __GENERIC_PT_FMT_DEFS_VTDSS_H + +#include <linux/generic_pt/common.h> +#include <linux/types.h> + +typedef u64 pt_vaddr_t; +typedef u64 pt_oaddr_t; + +struct vtdss_pt_write_attrs { + u64 descriptor_bits; + gfp_t gfp; +}; +#define pt_write_attrs vtdss_pt_write_attrs + +#endif diff --git a/drivers/iommu/generic_pt/fmt/defs_x86_64.h b/drivers/iommu/generic_pt/fmt/defs_x86_64.h new file mode 100644 index 000000000000..6f589e1f55d3 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/defs_x86_64.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + */ +#ifndef __GENERIC_PT_FMT_DEFS_X86_64_H +#define __GENERIC_PT_FMT_DEFS_X86_64_H + +#include <linux/generic_pt/common.h> +#include <linux/types.h> + +typedef u64 
pt_vaddr_t; +typedef u64 pt_oaddr_t; + +struct x86_64_pt_write_attrs { + u64 descriptor_bits; + gfp_t gfp; +}; +#define pt_write_attrs x86_64_pt_write_attrs + +#endif diff --git a/drivers/iommu/generic_pt/fmt/iommu_amdv1.c b/drivers/iommu/generic_pt/fmt/iommu_amdv1.c new file mode 100644 index 000000000000..72a2337d0c55 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_amdv1.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#define PT_FMT amdv1 +#define PT_SUPPORTED_FEATURES \ + (BIT(PT_FEAT_FULL_VA) | BIT(PT_FEAT_DYNAMIC_TOP) | \ + BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \ + BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \ + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE)) +#define PT_FORCE_ENABLED_FEATURES \ + (BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \ + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE)) + +#include "iommu_template.h" diff --git a/drivers/iommu/generic_pt/fmt/iommu_mock.c b/drivers/iommu/generic_pt/fmt/iommu_mock.c new file mode 100644 index 000000000000..74e597cba9d9 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_mock.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#define AMDV1_IOMMUFD_SELFTEST 1 +#define PT_FMT amdv1 +#define PT_FMT_VARIANT mock +#define PT_SUPPORTED_FEATURES 0 + +#include "iommu_template.h" diff --git a/drivers/iommu/generic_pt/fmt/iommu_template.h b/drivers/iommu/generic_pt/fmt/iommu_template.h new file mode 100644 index 000000000000..d28e86abdf2e --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_template.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Template to build the iommu module and kunit from the format and + * implementation headers. + * + * The format should have: + * #define PT_FMT <name> + * #define PT_SUPPORTED_FEATURES (BIT(PT_FEAT_xx) | BIT(PT_FEAT_yy)) + * And optionally: + * #define PT_FORCE_ENABLED_FEATURES .. + * #define PT_FMT_VARIANT <suffix> + */ +#include <linux/args.h> +#include <linux/stringify.h> + +#ifdef PT_FMT_VARIANT +#define PTPFX_RAW \ + CONCATENATE(CONCATENATE(PT_FMT, _), PT_FMT_VARIANT) +#else +#define PTPFX_RAW PT_FMT +#endif + +#define PTPFX CONCATENATE(PTPFX_RAW, _) + +#define _PT_FMT_H PT_FMT.h +#define PT_FMT_H __stringify(_PT_FMT_H) + +#define _PT_DEFS_H CONCATENATE(defs_, _PT_FMT_H) +#define PT_DEFS_H __stringify(_PT_DEFS_H) + +#include <linux/generic_pt/common.h> +#include PT_DEFS_H +#include "../pt_defs.h" +#include PT_FMT_H +#include "../pt_common.h" + +#ifndef GENERIC_PT_KUNIT +#include "../iommu_pt.h" +#else +/* + * The makefile will compile the .c file twice, once with GENERIC_PT_KUNIT set + * which means we are building the kunit module.
+ */ +#include "../kunit_generic_pt.h" +#include "../kunit_iommu_pt.h" +#endif diff --git a/drivers/iommu/generic_pt/fmt/iommu_vtdss.c b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c new file mode 100644 index 000000000000..f551711e2a33 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ +#define PT_FMT vtdss +#define PT_SUPPORTED_FEATURES \ + (BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_VTDSS_FORCE_COHERENCE) | \ + BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) | BIT(PT_FEAT_DMA_INCOHERENT)) + +#include "iommu_template.h" diff --git a/drivers/iommu/generic_pt/fmt/iommu_x86_64.c b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c new file mode 100644 index 000000000000..5472660c2d71 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#define PT_FMT x86_64 +#define PT_SUPPORTED_FEATURES \ + (BIT(PT_FEAT_SIGN_EXTEND) | BIT(PT_FEAT_FLUSH_RANGE) | \ + BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \ + BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES) | BIT(PT_FEAT_DMA_INCOHERENT)) + +#include "iommu_template.h" diff --git a/drivers/iommu/generic_pt/fmt/vtdss.h b/drivers/iommu/generic_pt/fmt/vtdss.h new file mode 100644 index 000000000000..f5f8981edde7 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/vtdss.h @@ -0,0 +1,285 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + * + * Intel VT-d Second Stage 5/4 level page table + * + * This is described in + * Section "3.7 Second-Stage Translation" + * Section "9.8 Second-Stage Paging Entries" + * + * Of the "Intel Virtualization Technology for Directed I/O Architecture + * Specification".
+ * + * The named levels in the spec map to the pts->level as: + * Table/SS-PTE - 0 + * Directory/SS-PDE - 1 + * Directory Ptr/SS-PDPTE - 2 + * PML4/SS-PML4E - 3 + * PML5/SS-PML5E - 4 + */ +#ifndef __GENERIC_PT_FMT_VTDSS_H +#define __GENERIC_PT_FMT_VTDSS_H + +#include "defs_vtdss.h" +#include "../pt_defs.h" + +#include <linux/bitfield.h> +#include <linux/container_of.h> +#include <linux/log2.h> + +enum { + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, + PT_MAX_VA_ADDRESS_LG2 = 57, + PT_ITEM_WORD_SIZE = sizeof(u64), + PT_MAX_TOP_LEVEL = 4, + PT_GRANULE_LG2SZ = 12, + PT_TABLEMEM_LG2SZ = 12, + + /* SSPTPTR is 4k aligned and limited by HAW */ + PT_TOP_PHYS_MASK = GENMASK_ULL(63, 12), +}; + +/* Shared descriptor bits */ +enum { + VTDSS_FMT_R = BIT(0), + VTDSS_FMT_W = BIT(1), + VTDSS_FMT_A = BIT(8), + VTDSS_FMT_D = BIT(9), + VTDSS_FMT_SNP = BIT(11), + VTDSS_FMT_OA = GENMASK_ULL(51, 12), +}; + +/* PDPTE/PDE */ +enum { + VTDSS_FMT_PS = BIT(7), +}; + +#define common_to_vtdss_pt(common_ptr) \ + container_of_const(common_ptr, struct pt_vtdss, common) +#define to_vtdss_pt(pts) common_to_vtdss_pt((pts)->range->common) + +static inline pt_oaddr_t vtdss_pt_table_pa(const struct pt_state *pts) +{ + return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry), + PT_TABLEMEM_LG2SZ); +} +#define pt_table_pa vtdss_pt_table_pa + +static inline pt_oaddr_t vtdss_pt_entry_oa(const struct pt_state *pts) +{ + return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry), + PT_GRANULE_LG2SZ); +} +#define pt_entry_oa vtdss_pt_entry_oa + +static inline bool vtdss_pt_can_have_leaf(const struct pt_state *pts) +{ + return pts->level <= 2; +} +#define pt_can_have_leaf vtdss_pt_can_have_leaf + +static inline unsigned int vtdss_pt_num_items_lg2(const struct pt_state *pts) +{ + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); +} +#define pt_num_items_lg2 vtdss_pt_num_items_lg2 + +static inline enum pt_entry_type vtdss_pt_load_entry_raw(struct pt_state *pts) +{ + const u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + pts->entry = entry = READ_ONCE(tablep[pts->index]); + if (!entry) + return PT_ENTRY_EMPTY; + if (pts->level == 0 || + (vtdss_pt_can_have_leaf(pts) && (pts->entry & VTDSS_FMT_PS))) + return PT_ENTRY_OA; + return PT_ENTRY_TABLE; +} +#define pt_load_entry_raw vtdss_pt_load_entry_raw + +static inline void +vtdss_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs) +{ + u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) + return; + + entry = FIELD_PREP(VTDSS_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) | + attrs->descriptor_bits; + if (pts->level != 0) + entry |= VTDSS_FMT_PS; + + WRITE_ONCE(tablep[pts->index], entry); + pts->entry = entry; +} +#define pt_install_leaf_entry vtdss_pt_install_leaf_entry + +static inline bool vtdss_pt_install_table(struct pt_state *pts, + pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs) +{ + u64 entry; + + entry = VTDSS_FMT_R | VTDSS_FMT_W | + FIELD_PREP(VTDSS_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ)); + return pt_table_install64(pts, entry); +} +#define pt_install_table vtdss_pt_install_table + +static inline void vtdss_pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + attrs->descriptor_bits = pts->entry & + (VTDSS_FMT_R | VTDSS_FMT_W | VTDSS_FMT_SNP); +} +#define pt_attr_from_entry vtdss_pt_attr_from_entry + +static inline bool vtdss_pt_entry_is_write_dirty(const struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + + 
return READ_ONCE(*tablep) & VTDSS_FMT_D; +} +#define pt_entry_is_write_dirty vtdss_pt_entry_is_write_dirty + +static inline void vtdss_pt_entry_make_write_clean(struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + + WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)VTDSS_FMT_D); +} +#define pt_entry_make_write_clean vtdss_pt_entry_make_write_clean + +static inline bool vtdss_pt_entry_make_write_dirty(struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 new = pts->entry | VTDSS_FMT_D; + + return try_cmpxchg64(tablep, &pts->entry, new); +} +#define pt_entry_make_write_dirty vtdss_pt_entry_make_write_dirty + +static inline unsigned int vtdss_pt_max_sw_bit(struct pt_common *common) +{ + return 10; +} +#define pt_max_sw_bit vtdss_pt_max_sw_bit + +static inline u64 vtdss_pt_sw_bit(unsigned int bitnr) +{ + if (__builtin_constant_p(bitnr) && bitnr > 10) + BUILD_BUG(); + + /* Bits marked Ignored in the specification */ + switch (bitnr) { + case 0: + return BIT(10); + case 1 ... 9: + return BIT_ULL((bitnr - 1) + 52); + case 10: + return BIT_ULL(63); + /* Some bits in 9-3 are available in some entries */ + default: + PT_WARN_ON(true); + return 0; + } +} +#define pt_sw_bit vtdss_pt_sw_bit + +/* --- iommu */ +#include <linux/generic_pt/iommu.h> +#include <linux/iommu.h> + +#define pt_iommu_table pt_iommu_vtdss + +/* The common struct is in the per-format common struct */ +static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) +{ + return &container_of(iommu_table, struct pt_iommu_table, iommu) + ->vtdss_pt.common; +} + +static inline struct pt_iommu *iommu_from_common(struct pt_common *common) +{ + return &container_of(common, struct pt_iommu_table, vtdss_pt.common) + ->iommu; +} + +static inline int vtdss_pt_iommu_set_prot(struct pt_common *common, + struct pt_write_attrs *attrs, + unsigned int iommu_prot) +{ + u64 pte = 0; + + /* + * VTDSS does not have a present bit, so we tell if any entry is present + * by checking for R or W. 
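+	 *
+	 * E.g. a read-only mapping produces a descriptor with VTDSS_FMT_R set
+	 * and VTDSS_FMT_W clear, while an all-zero entry decodes as
+	 * PT_ENTRY_EMPTY in vtdss_pt_load_entry_raw().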
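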
+	 */
+	if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE)))
+		return -EINVAL;
+
+	if (iommu_prot & IOMMU_READ)
+		pte |= VTDSS_FMT_R;
+	if (iommu_prot & IOMMU_WRITE)
+		pte |= VTDSS_FMT_W;
+	if (pt_feature(common, PT_FEAT_VTDSS_FORCE_COHERENCE))
+		pte |= VTDSS_FMT_SNP;
+
+	if (pt_feature(common, PT_FEAT_VTDSS_FORCE_WRITEABLE) &&
+	    !(iommu_prot & IOMMU_WRITE)) {
+		pr_err_ratelimited(
+			"Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
+		return -EINVAL;
+	}
+
+	attrs->descriptor_bits = pte;
+	return 0;
+}
+#define pt_iommu_set_prot vtdss_pt_iommu_set_prot
+
+static inline int vtdss_pt_iommu_fmt_init(struct pt_iommu_vtdss *iommu_table,
+					  const struct pt_iommu_vtdss_cfg *cfg)
+{
+	struct pt_vtdss *table = &iommu_table->vtdss_pt;
+
+	if (cfg->top_level > 4 || cfg->top_level < 2)
+		return -EOPNOTSUPP;
+
+	pt_top_set_level(&table->common, cfg->top_level);
+	return 0;
+}
+#define pt_iommu_fmt_init vtdss_pt_iommu_fmt_init
+
+static inline void
+vtdss_pt_iommu_fmt_hw_info(struct pt_iommu_vtdss *table,
+			   const struct pt_range *top_range,
+			   struct pt_iommu_vtdss_hw_info *info)
+{
+	info->ssptptr = virt_to_phys(top_range->top_table);
+	PT_WARN_ON(info->ssptptr & ~PT_TOP_PHYS_MASK);
+	/*
+	 * top_level = 2: 3 level table, aw = 1
+	 * top_level = 3: 4 level table, aw = 2
+	 * top_level = 4: 5 level table, aw = 3
+	 */
+	info->aw = top_range->top_level - 1;
+}
+#define pt_iommu_fmt_hw_info vtdss_pt_iommu_fmt_hw_info
+
+#if defined(GENERIC_PT_KUNIT)
+static const struct pt_iommu_vtdss_cfg vtdss_kunit_fmt_cfgs[] = {
+	[0] = { .common.hw_max_vasz_lg2 = 39, .top_level = 2 },
+	[1] = { .common.hw_max_vasz_lg2 = 48, .top_level = 3 },
+	[2] = { .common.hw_max_vasz_lg2 = 57, .top_level = 4 },
+};
+#define kunit_fmt_cfgs vtdss_kunit_fmt_cfgs
+enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) };
+#endif
+#endif
diff --git a/drivers/iommu/generic_pt/fmt/x86_64.h b/drivers/iommu/generic_pt/fmt/x86_64.h
new file mode 100644
index 000000000000..210748d9d6e8
--- /dev/null
+++ b/drivers/iommu/generic_pt/fmt/x86_64.h
@@ -0,0 +1,279 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ *
+ * x86 page table. Supports the 4 and 5 level variations.
+ *
+ * The 4 and 5 level version is described in:
+ * Section "4.4 4-Level Paging and 5-Level Paging" of the Intel Software
+ * Developer's Manual Volume 3
+ *
+ * Section "9.7 First-Stage Paging Entries" of the "Intel Virtualization
+ * Technology for Directed I/O Architecture Specification"
+ *
+ * Section "2.2.6 I/O Page Tables for Guest Translations" of the "AMD I/O
+ * Virtualization Technology (IOMMU) Specification"
+ *
+ * It is used by x86 CPUs, AMD and VT-d IOMMU HW.
+ *
+ * Note the 3 level format is very similar and could almost be implemented
+ * here. The reserved/ignored layout is different and there are functional bit
+ * differences.
+ *
+ * This format uses PT_FEAT_SIGN_EXTEND to have an upper/non-canonical/lower
+ * split. PT_FEAT_SIGN_EXTEND is optional as AMD IOMMU sometimes uses non-sign
+ * extended addressing with this page table format.
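+ *
+ * For instance, with PT_FEAT_SIGN_EXTEND on a 4 level (48 bit) table the
+ * usable IOVA splits into 0 - 0x7fffffffffff and 0xffff800000000000 -
+ * 0xffffffffffffffff, mirroring the CPU's canonical addressing; without the
+ * feature the table decodes one flat 0 - 0xffffffffffff range.
+ * (Illustrative of the feature, with a 48-bit example width.)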
+ * + * The named levels in the spec map to the pts->level as: + * Table/PTE - 0 + * Directory/PDE - 1 + * Directory Ptr/PDPTE - 2 + * PML4/PML4E - 3 + * PML5/PML5E - 4 + */ +#ifndef __GENERIC_PT_FMT_X86_64_H +#define __GENERIC_PT_FMT_X86_64_H + +#include "defs_x86_64.h" +#include "../pt_defs.h" + +#include <linux/bitfield.h> +#include <linux/container_of.h> +#include <linux/log2.h> +#include <linux/mem_encrypt.h> + +enum { + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, + PT_MAX_VA_ADDRESS_LG2 = 57, + PT_ITEM_WORD_SIZE = sizeof(u64), + PT_MAX_TOP_LEVEL = 4, + PT_GRANULE_LG2SZ = 12, + PT_TABLEMEM_LG2SZ = 12, + + /* + * For AMD the GCR3 Base only has these bits. For VT-d FSPTPTR is 4k + * aligned and is limited by the architected HAW + */ + PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12), +}; + +/* Shared descriptor bits */ +enum { + X86_64_FMT_P = BIT(0), + X86_64_FMT_RW = BIT(1), + X86_64_FMT_U = BIT(2), + X86_64_FMT_A = BIT(5), + X86_64_FMT_D = BIT(6), + X86_64_FMT_OA = GENMASK_ULL(51, 12), + X86_64_FMT_XD = BIT_ULL(63), +}; + +/* PDPTE/PDE */ +enum { + X86_64_FMT_PS = BIT(7), +}; + +static inline pt_oaddr_t x86_64_pt_table_pa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry), + PT_TABLEMEM_LG2SZ); +} +#define pt_table_pa x86_64_pt_table_pa + +static inline pt_oaddr_t x86_64_pt_entry_oa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry), + PT_GRANULE_LG2SZ); +} +#define pt_entry_oa x86_64_pt_entry_oa + +static inline bool x86_64_pt_can_have_leaf(const struct pt_state *pts) +{ + return pts->level <= 2; +} +#define pt_can_have_leaf x86_64_pt_can_have_leaf + +static inline unsigned int x86_64_pt_num_items_lg2(const struct pt_state *pts) +{ + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); +} +#define pt_num_items_lg2 x86_64_pt_num_items_lg2 + +static inline enum pt_entry_type x86_64_pt_load_entry_raw(struct pt_state *pts) +{ + const u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + pts->entry = entry = READ_ONCE(tablep[pts->index]); + if (!(entry & X86_64_FMT_P)) + return PT_ENTRY_EMPTY; + if (pts->level == 0 || + (x86_64_pt_can_have_leaf(pts) && (entry & X86_64_FMT_PS))) + return PT_ENTRY_OA; + return PT_ENTRY_TABLE; +} +#define pt_load_entry_raw x86_64_pt_load_entry_raw + +static inline void +x86_64_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs) +{ + u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) + return; + + entry = X86_64_FMT_P | + FIELD_PREP(X86_64_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) | + attrs->descriptor_bits; + if (pts->level != 0) + entry |= X86_64_FMT_PS; + + WRITE_ONCE(tablep[pts->index], entry); + pts->entry = entry; +} +#define pt_install_leaf_entry x86_64_pt_install_leaf_entry + +static inline bool x86_64_pt_install_table(struct pt_state *pts, + pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs) +{ + u64 entry; + + entry = X86_64_FMT_P | X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A | + FIELD_PREP(X86_64_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ)); + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + entry = __sme_set(entry); + return pt_table_install64(pts, entry); +} +#define pt_install_table x86_64_pt_install_table + +static inline void 
x86_64_pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + attrs->descriptor_bits = pts->entry & + (X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A | + X86_64_FMT_D | X86_64_FMT_XD); +} +#define pt_attr_from_entry x86_64_pt_attr_from_entry + +static inline unsigned int x86_64_pt_max_sw_bit(struct pt_common *common) +{ + return 12; +} +#define pt_max_sw_bit x86_64_pt_max_sw_bit + +static inline u64 x86_64_pt_sw_bit(unsigned int bitnr) +{ + if (__builtin_constant_p(bitnr) && bitnr > 12) + BUILD_BUG(); + + /* Bits marked Ignored/AVL in the specification */ + switch (bitnr) { + case 0: + return BIT(9); + case 1: + return BIT(11); + case 2 ... 12: + return BIT_ULL((bitnr - 2) + 52); + /* Some bits in 8,6,4,3 are available in some entries */ + default: + PT_WARN_ON(true); + return 0; + } +} +#define pt_sw_bit x86_64_pt_sw_bit + +/* --- iommu */ +#include <linux/generic_pt/iommu.h> +#include <linux/iommu.h> + +#define pt_iommu_table pt_iommu_x86_64 + +/* The common struct is in the per-format common struct */ +static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) +{ + return &container_of(iommu_table, struct pt_iommu_table, iommu) + ->x86_64_pt.common; +} + +static inline struct pt_iommu *iommu_from_common(struct pt_common *common) +{ + return &container_of(common, struct pt_iommu_table, x86_64_pt.common) + ->iommu; +} + +static inline int x86_64_pt_iommu_set_prot(struct pt_common *common, + struct pt_write_attrs *attrs, + unsigned int iommu_prot) +{ + u64 pte; + + pte = X86_64_FMT_U | X86_64_FMT_A; + if (iommu_prot & IOMMU_WRITE) + pte |= X86_64_FMT_RW | X86_64_FMT_D; + + /* + * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to + * control this. For now if the tables use sme_set then so do the ptes. 
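+	 *
+	 * With SME active __sme_set() ORs sme_me_mask (the C-bit) into the
+	 * descriptor, so e.g. a writable leaf ends up as
+	 * P | RW | U | A | D | C-bit | OA and the DMA targets encrypted
+	 * memory. (Illustrative composition of the final installed entry.)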
+ */ + if (pt_feature(common, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + pte = __sme_set(pte); + + attrs->descriptor_bits = pte; + return 0; +} +#define pt_iommu_set_prot x86_64_pt_iommu_set_prot + +static inline int +x86_64_pt_iommu_fmt_init(struct pt_iommu_x86_64 *iommu_table, + const struct pt_iommu_x86_64_cfg *cfg) +{ + struct pt_x86_64 *table = &iommu_table->x86_64_pt; + + if (cfg->top_level < 3 || cfg->top_level > 4) + return -EOPNOTSUPP; + + pt_top_set_level(&table->common, cfg->top_level); + + table->common.max_oasz_lg2 = + min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2); + return 0; +} +#define pt_iommu_fmt_init x86_64_pt_iommu_fmt_init + +static inline void +x86_64_pt_iommu_fmt_hw_info(struct pt_iommu_x86_64 *table, + const struct pt_range *top_range, + struct pt_iommu_x86_64_hw_info *info) +{ + info->gcr3_pt = virt_to_phys(top_range->top_table); + PT_WARN_ON(info->gcr3_pt & ~PT_TOP_PHYS_MASK); + info->levels = top_range->top_level + 1; +} +#define pt_iommu_fmt_hw_info x86_64_pt_iommu_fmt_hw_info + +#if defined(GENERIC_PT_KUNIT) +static const struct pt_iommu_x86_64_cfg x86_64_kunit_fmt_cfgs[] = { + [0] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND), + .common.hw_max_vasz_lg2 = 48, .top_level = 3 }, + [1] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND), + .common.hw_max_vasz_lg2 = 57, .top_level = 4 }, + /* AMD IOMMU PASID 0 formats with no SIGN_EXTEND */ + [2] = { .common.hw_max_vasz_lg2 = 47, .top_level = 3 }, + [3] = { .common.hw_max_vasz_lg2 = 56, .top_level = 4}, +}; +#define kunit_fmt_cfgs x86_64_kunit_fmt_cfgs +enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_SIGN_EXTEND)}; +#endif +#endif diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h new file mode 100644 index 000000000000..97aeda1ad01c --- /dev/null +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -0,0 +1,1289 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * "Templated C code" for implementing the iommu operations for page tables. + * This is compiled multiple times, over all the page table formats to pick up + * the per-format definitions. 
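+ *
+ * For example, when compiled for the x86_64 format the DOMAIN_NS() macro
+ * below names each operation after its format, so DOMAIN_NS(map_pages)
+ * becomes (roughly) pt_iommu_x86_64_map_pages(), and every format gets its
+ * own copy of this code.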
+ */
+#ifndef __GENERIC_PT_IOMMU_PT_H
+#define __GENERIC_PT_IOMMU_PT_H
+
+#include "pt_iter.h"
+
+#include <linux/export.h>
+#include <linux/iommu.h>
+#include "../iommu-pages.h"
+#include <linux/cleanup.h>
+#include <linux/dma-mapping.h>
+
+enum {
+	SW_BIT_CACHE_FLUSH_DONE = 0,
+};
+
+static void flush_writes_range(const struct pt_state *pts,
+			       unsigned int start_index, unsigned int end_index)
+{
+	if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT))
+		iommu_pages_flush_incoherent(
+			iommu_from_common(pts->range->common)->iommu_device,
+			pts->table, start_index * PT_ITEM_WORD_SIZE,
+			(end_index - start_index) * PT_ITEM_WORD_SIZE);
+}
+
+static void flush_writes_item(const struct pt_state *pts)
+{
+	if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT))
+		iommu_pages_flush_incoherent(
+			iommu_from_common(pts->range->common)->iommu_device,
+			pts->table, pts->index * PT_ITEM_WORD_SIZE,
+			PT_ITEM_WORD_SIZE);
+}
+
+static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather,
+			       struct pt_iommu *iommu_table, pt_vaddr_t iova,
+			       pt_vaddr_t len,
+			       struct iommu_pages_list *free_list)
+{
+	struct pt_common *common = common_from_iommu(iommu_table);
+
+	if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
+		iommu_pages_stop_incoherent_list(free_list,
+						 iommu_table->iommu_device);
+
+	if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) &&
+	    iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) {
+		iommu_iotlb_sync(&iommu_table->domain, iotlb_gather);
+		/*
+		 * Note that the sync frees the gather's free list, so we must
+		 * not have any pages on that list that are covered by iova/len
+		 */
+	} else if (pt_feature(common, PT_FEAT_FLUSH_RANGE)) {
+		iommu_iotlb_gather_add_range(iotlb_gather, iova, len);
+	}
+
+	iommu_pages_list_splice(free_list, &iotlb_gather->freelist);
+}
+
+#define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op)
+
+static int make_range_ul(struct pt_common *common, struct pt_range *range,
+			 unsigned long iova, unsigned long len)
+{
+	unsigned long last;
+
+	if (unlikely(len == 0))
+		return -EINVAL;
+
+	if (check_add_overflow(iova, len - 1, &last))
+		return -EOVERFLOW;
+
+	*range = pt_make_range(common, iova, last);
+	if (sizeof(iova) > sizeof(range->va)) {
+		if (unlikely(range->va != iova || range->last_va != last))
+			return -EOVERFLOW;
+	}
+	return 0;
+}
+
+static __maybe_unused int make_range_u64(struct pt_common *common,
+					 struct pt_range *range, u64 iova,
+					 u64 len)
+{
+	if (unlikely(iova > ULONG_MAX || len > ULONG_MAX))
+		return -EOVERFLOW;
+	return make_range_ul(common, range, iova, len);
+}
+
+/*
+ * Some APIs use unsigned long, while others use dma_addr_t as the type.
+ * Dispatch to the correct validation based on the type.
+ */
+#define make_range_no_check(common, range, iova, len)                      \
+	({                                                                 \
+		int ret;                                                   \
+		if (sizeof(iova) > sizeof(unsigned long) ||                \
+		    sizeof(len) > sizeof(unsigned long))                   \
+			ret = make_range_u64(common, range, iova, len);    \
+		else                                                       \
+			ret = make_range_ul(common, range, iova, len);     \
+		ret;                                                       \
+	})
+
+#define make_range(common, range, iova, len)                               \
+	({                                                                 \
+		int ret = make_range_no_check(common, range, iova, len);   \
+		if (!ret)                                                  \
+			ret = pt_check_range(range);                       \
+		ret;                                                       \
+	})
+
+static inline unsigned int compute_best_pgsize(struct pt_state *pts,
+					       pt_oaddr_t oa)
+{
+	struct pt_iommu *iommu_table = iommu_from_common(pts->range->common);
+
+	if (!pt_can_have_leaf(pts))
+		return 0;
+
+	/*
+	 * The page size is limited by the domain's bitmap. This allows the
+	 * core code to reduce the supported page sizes by changing the bitmap.
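+	 *
+	 * For example, if pt_possible_sizes() reports 4K/2M/1G here but the
+	 * domain's bitmap was cut down to 4K|2M, a 1G-aligned request is
+	 * still broken into 2M leaves. (Illustrative sizes.)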
+ */ + return pt_compute_best_pgsize(pt_possible_sizes(pts) & + iommu_table->domain.pgsize_bitmap, + pts->range->va, pts->range->last_va, oa); +} + +static __always_inline int __do_iova_to_phys(struct pt_range *range, void *arg, + unsigned int level, + struct pt_table_p *table, + pt_level_fn_t descend_fn) +{ + struct pt_state pts = pt_init(range, level, table); + pt_oaddr_t *res = arg; + + switch (pt_load_single_entry(&pts)) { + case PT_ENTRY_EMPTY: + return -ENOENT; + case PT_ENTRY_TABLE: + return pt_descend(&pts, arg, descend_fn); + case PT_ENTRY_OA: + *res = pt_entry_oa_exact(&pts); + return 0; + } + return -ENOENT; +} +PT_MAKE_LEVELS(__iova_to_phys, __do_iova_to_phys); + +/** + * iova_to_phys() - Return the output address for the given IOVA + * @domain: Table to query + * @iova: IO virtual address to query + * + * Determine the output address from the given IOVA. @iova may have any + * alignment, the returned physical will be adjusted with any sub page offset. + * + * Context: The caller must hold a read range lock that includes @iova. + * + * Return: 0 if there is no translation for the given iova. + */ +phys_addr_t DOMAIN_NS(iova_to_phys)(struct iommu_domain *domain, + dma_addr_t iova) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_range range; + pt_oaddr_t res; + int ret; + + ret = make_range(common_from_iommu(iommu_table), &range, iova, 1); + if (ret) + return ret; + + ret = pt_walk_range(&range, __iova_to_phys, &res); + /* PHYS_ADDR_MAX would be a better error code */ + if (ret) + return 0; + return res; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU"); + +struct pt_iommu_dirty_args { + struct iommu_dirty_bitmap *dirty; + unsigned int flags; +}; + +static void record_dirty(struct pt_state *pts, + struct pt_iommu_dirty_args *dirty, + unsigned int num_contig_lg2) +{ + pt_vaddr_t dirty_len; + + if (num_contig_lg2 != ilog2(1)) { + unsigned int index = pts->index; + unsigned int end_index = log2_set_mod_max_t( + unsigned int, pts->index, num_contig_lg2); + + /* Adjust for being contained inside a contiguous page */ + end_index = min(end_index, pts->end_index); + dirty_len = (end_index - index) * + log2_to_int(pt_table_item_lg2sz(pts)); + } else { + dirty_len = log2_to_int(pt_table_item_lg2sz(pts)); + } + + if (dirty->dirty->bitmap) + iova_bitmap_set(dirty->dirty->bitmap, pts->range->va, + dirty_len); + + if (!(dirty->flags & IOMMU_DIRTY_NO_CLEAR)) { + /* + * No write log required because DMA incoherence and atomic + * dirty tracking bits can't work together + */ + pt_entry_make_write_clean(pts); + iommu_iotlb_gather_add_range(dirty->dirty->gather, + pts->range->va, dirty_len); + } +} + +static inline int __read_and_clear_dirty(struct pt_range *range, void *arg, + unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_dirty_args *dirty = arg; + int ret; + + for_each_pt_level_entry(&pts) { + if (pts.type == PT_ENTRY_TABLE) { + ret = pt_descend(&pts, arg, __read_and_clear_dirty); + if (ret) + return ret; + continue; + } + if (pts.type == PT_ENTRY_OA && pt_entry_is_write_dirty(&pts)) + record_dirty(&pts, dirty, + pt_entry_num_contig_lg2(&pts)); + } + return 0; +} + +/** + * read_and_clear_dirty() - Manipulate the HW set write dirty state + * @domain: Domain to manipulate + * @iova: IO virtual address to start + * @size: Length of the IOVA + * @flags: A bitmap of IOMMU_DIRTY_NO_CLEAR + * @dirty: Place to store the dirty bits + * + * Iterate over all the 
entries in the mapped range and record their write dirty
+ * status in iommu_dirty_bitmap. If IOMMU_DIRTY_NO_CLEAR is specified then
+ * the entries will be left dirty, otherwise they are returned to being not
+ * write dirty.
+ *
+ * Context: The caller must hold a read range lock that includes @iova.
+ *
+ * Returns: -ERRNO on failure, 0 on success.
+ */
+int DOMAIN_NS(read_and_clear_dirty)(struct iommu_domain *domain,
+				    unsigned long iova, size_t size,
+				    unsigned long flags,
+				    struct iommu_dirty_bitmap *dirty)
+{
+	struct pt_iommu *iommu_table =
+		container_of(domain, struct pt_iommu, domain);
+	struct pt_iommu_dirty_args dirty_args = {
+		.dirty = dirty,
+		.flags = flags,
+	};
+	struct pt_range range;
+	int ret;
+
+#if !IS_ENABLED(CONFIG_IOMMUFD_DRIVER) || !defined(pt_entry_is_write_dirty)
+	return -EOPNOTSUPP;
+#endif
+
+	ret = make_range(common_from_iommu(iommu_table), &range, iova, size);
+	if (ret)
+		return ret;
+
+	ret = pt_walk_range(&range, __read_and_clear_dirty, &dirty_args);
+	PT_WARN_ON(ret);
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(read_and_clear_dirty), "GENERIC_PT_IOMMU");
+
+static inline int __set_dirty(struct pt_range *range, void *arg,
+			      unsigned int level, struct pt_table_p *table)
+{
+	struct pt_state pts = pt_init(range, level, table);
+
+	switch (pt_load_single_entry(&pts)) {
+	case PT_ENTRY_EMPTY:
+		return -ENOENT;
+	case PT_ENTRY_TABLE:
+		return pt_descend(&pts, arg, __set_dirty);
+	case PT_ENTRY_OA:
+		if (!pt_entry_make_write_dirty(&pts))
+			return -EAGAIN;
+		return 0;
+	}
+	return -ENOENT;
+}
+
+static int __maybe_unused NS(set_dirty)(struct pt_iommu *iommu_table,
+					dma_addr_t iova)
+{
+	struct pt_range range;
+	int ret;
+
+	ret = make_range(common_from_iommu(iommu_table), &range, iova, 1);
+	if (ret)
+		return ret;
+
+	/*
+	 * Note: There is no locking here yet; if the test suite races this it
+	 * can crash. It should use RCU locking eventually.
+	 */
+	return pt_walk_range(&range, __set_dirty, NULL);
+}
+
+struct pt_iommu_collect_args {
+	struct iommu_pages_list free_list;
+	/* Fail if any OAs are within the range */
+	u8 check_mapped : 1;
+};
+
+static int __collect_tables(struct pt_range *range, void *arg,
+			    unsigned int level, struct pt_table_p *table)
+{
+	struct pt_state pts = pt_init(range, level, table);
+	struct pt_iommu_collect_args *collect = arg;
+	int ret;
+
+	if (!collect->check_mapped && !pt_can_have_table(&pts))
+		return 0;
+
+	for_each_pt_level_entry(&pts) {
+		if (pts.type == PT_ENTRY_TABLE) {
+			iommu_pages_list_add(&collect->free_list,
+					     pts.table_lower);
+			ret = pt_descend(&pts, arg, __collect_tables);
+			if (ret)
+				return ret;
+			continue;
+		}
+		if (pts.type == PT_ENTRY_OA && collect->check_mapped)
+			return -EADDRINUSE;
+	}
+	return 0;
+}
+
+enum alloc_mode { ALLOC_NORMAL, ALLOC_DEFER_COHERENT_FLUSH };
+
+/* Allocate a table; the empty table will be ready to be installed. */
+static inline struct pt_table_p *_table_alloc(struct pt_common *common,
+					      size_t lg2sz, gfp_t gfp,
+					      enum alloc_mode mode)
+{
+	struct pt_iommu *iommu_table = iommu_from_common(common);
+	struct pt_table_p *table_mem;
+
+	table_mem = iommu_alloc_pages_node_sz(iommu_table->nid, gfp,
+					      log2_to_int(lg2sz));
+	if (!table_mem)
+		return ERR_PTR(-ENOMEM);
+	if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) &&
+	    mode == ALLOC_NORMAL) {
+		int ret = iommu_pages_start_incoherent(
+			table_mem, iommu_table->iommu_device);
+		if (ret) {
+			iommu_free_pages(table_mem);
+			return ERR_PTR(ret);
+		}
+	}
+	return table_mem;
+}
+
+static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
+						 uintptr_t top_of_table,
+						 gfp_t gfp,
+						 enum alloc_mode mode)
+{
+	/*
+	 * Top doesn't need the free list or otherwise, so it technically
+	 * doesn't need to use iommu pages. Use the API anyhow as the top is
+	 * usually not smaller than PAGE_SIZE to keep things simple.
+	 */
+	return _table_alloc(common, pt_top_memsize_lg2(common, top_of_table),
+			    gfp, mode);
+}
+
+/* Allocate an interior table */
+static inline struct pt_table_p *table_alloc(const struct pt_state *parent_pts,
+					     gfp_t gfp, enum alloc_mode mode)
+{
+	struct pt_state child_pts =
+		pt_init(parent_pts->range, parent_pts->level - 1, NULL);
+
+	return _table_alloc(parent_pts->range->common,
+			    pt_num_items_lg2(&child_pts) +
+				    ilog2(PT_ITEM_WORD_SIZE),
+			    gfp, mode);
+}
+
+static inline int pt_iommu_new_table(struct pt_state *pts,
+				     struct pt_write_attrs *attrs)
+{
+	struct pt_table_p *table_mem;
+	phys_addr_t phys;
+
+	/* Given PA/VA/length can't be represented */
+	if (PT_WARN_ON(!pt_can_have_table(pts)))
+		return -ENXIO;
+
+	table_mem = table_alloc(pts, attrs->gfp, ALLOC_NORMAL);
+	if (IS_ERR(table_mem))
+		return PTR_ERR(table_mem);
+
+	phys = virt_to_phys(table_mem);
+	if (!pt_install_table(pts, phys, attrs)) {
+		iommu_pages_free_incoherent(
+			table_mem,
+			iommu_from_common(pts->range->common)->iommu_device);
+		return -EAGAIN;
+	}
+
+	if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT)) {
+		flush_writes_item(pts);
+		pt_set_sw_bit_release(pts, SW_BIT_CACHE_FLUSH_DONE);
+	}
+
+	if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) {
+		/*
+		 * The underlying table can't store the physical table address.
+		 * This can happen when kunit testing tables outside their
+		 * normal environment where a CPU might be limited.
+		 */
+		pt_load_single_entry(pts);
+		if (PT_WARN_ON(pt_table_pa(pts) != phys)) {
+			pt_clear_entries(pts, ilog2(1));
+			iommu_pages_free_incoherent(
+				table_mem, iommu_from_common(pts->range->common)
+						   ->iommu_device);
+			return -EINVAL;
+		}
+	}
+
+	pts->table_lower = table_mem;
+	return 0;
+}
+
+struct pt_iommu_map_args {
+	struct iommu_iotlb_gather *iotlb_gather;
+	struct pt_write_attrs attrs;
+	pt_oaddr_t oa;
+	unsigned int leaf_pgsize_lg2;
+	unsigned int leaf_level;
+};
+
+/*
+ * This will recursively check any tables in the block to validate they are
+ * empty and then free them through the gather.
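+ *
+ * For example, overwriting a region that still holds a (now empty) PTE
+ * table with a 2M leaf: the table item is cleared and the table's pages are
+ * queued on the gather's freelist. (Illustrative sizes.)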
+ */ +static int clear_contig(const struct pt_state *start_pts, + struct iommu_iotlb_gather *iotlb_gather, + unsigned int step, unsigned int pgsize_lg2) +{ + struct pt_iommu *iommu_table = + iommu_from_common(start_pts->range->common); + struct pt_range range = *start_pts->range; + struct pt_state pts = + pt_init(&range, start_pts->level, start_pts->table); + struct pt_iommu_collect_args collect = { .check_mapped = true }; + int ret; + + pts.index = start_pts->index; + pts.end_index = start_pts->index + step; + for (; _pt_iter_load(&pts); pt_next_entry(&pts)) { + if (pts.type == PT_ENTRY_TABLE) { + collect.free_list = + IOMMU_PAGES_LIST_INIT(collect.free_list); + ret = pt_walk_descend_all(&pts, __collect_tables, + &collect); + if (ret) + return ret; + + /* + * The table item must be cleared before we can update + * the gather + */ + pt_clear_entries(&pts, ilog2(1)); + flush_writes_item(&pts); + + iommu_pages_list_add(&collect.free_list, + pt_table_ptr(&pts)); + gather_range_pages( + iotlb_gather, iommu_table, range.va, + log2_to_int(pt_table_item_lg2sz(&pts)), + &collect.free_list); + } else if (pts.type != PT_ENTRY_EMPTY) { + return -EADDRINUSE; + } + } + return 0; +} + +static int __map_range_leaf(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_map_args *map = arg; + unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2; + unsigned int start_index; + pt_oaddr_t oa = map->oa; + unsigned int step; + bool need_contig; + int ret = 0; + + PT_WARN_ON(map->leaf_level != level); + PT_WARN_ON(!pt_can_have_leaf(&pts)); + + step = log2_to_int_t(unsigned int, + leaf_pgsize_lg2 - pt_table_item_lg2sz(&pts)); + need_contig = leaf_pgsize_lg2 != pt_table_item_lg2sz(&pts); + + _pt_iter_first(&pts); + start_index = pts.index; + do { + pts.type = pt_load_entry_raw(&pts); + if (pts.type != PT_ENTRY_EMPTY || need_contig) { + if (pts.index != start_index) + pt_index_to_va(&pts); + ret = clear_contig(&pts, map->iotlb_gather, step, + leaf_pgsize_lg2); + if (ret) + break; + } + + if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) { + pt_index_to_va(&pts); + PT_WARN_ON(compute_best_pgsize(&pts, oa) != + leaf_pgsize_lg2); + } + pt_install_leaf_entry(&pts, oa, leaf_pgsize_lg2, &map->attrs); + + oa += log2_to_int(leaf_pgsize_lg2); + pts.index += step; + } while (pts.index < pts.end_index); + + flush_writes_range(&pts, start_index, pts.index); + + map->oa = oa; + return ret; +} + +static int __map_range(struct pt_range *range, void *arg, unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_map_args *map = arg; + int ret; + + PT_WARN_ON(map->leaf_level == level); + PT_WARN_ON(!pt_can_have_table(&pts)); + + _pt_iter_first(&pts); + + /* Descend to a child table */ + do { + pts.type = pt_load_entry_raw(&pts); + + if (pts.type != PT_ENTRY_TABLE) { + if (pts.type != PT_ENTRY_EMPTY) + return -EADDRINUSE; + ret = pt_iommu_new_table(&pts, &map->attrs); + if (ret) { + /* + * Racing with another thread installing a table + */ + if (ret == -EAGAIN) + continue; + return ret; + } + } else { + pts.table_lower = pt_table_ptr(&pts); + /* + * Racing with a shared pt_iommu_new_table()? The other + * thread is still flushing the cache, so we have to + * also flush it to ensure that when our thread's map + * completes all the table items leading to our mapping + * are visible. 
+ * + * This requires the pt_set_bit_release() to be a + * release of the cache flush so that this can acquire + * visibility at the iommu. + */ + if (pts_feature(&pts, PT_FEAT_DMA_INCOHERENT) && + !pt_test_sw_bit_acquire(&pts, + SW_BIT_CACHE_FLUSH_DONE)) + flush_writes_item(&pts); + } + + /* + * The already present table can possibly be shared with another + * concurrent map. + */ + if (map->leaf_level == level - 1) + ret = pt_descend(&pts, arg, __map_range_leaf); + else + ret = pt_descend(&pts, arg, __map_range); + if (ret) + return ret; + + pts.index++; + pt_index_to_va(&pts); + if (pts.index >= pts.end_index) + break; + } while (true); + return 0; +} + +/* + * Fast path for the easy case of mapping a 4k page to an already allocated + * table. This is a common workload. If it returns EAGAIN run the full algorithm + * instead. + */ +static __always_inline int __do_map_single_page(struct pt_range *range, + void *arg, unsigned int level, + struct pt_table_p *table, + pt_level_fn_t descend_fn) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_map_args *map = arg; + + pts.type = pt_load_single_entry(&pts); + if (level == 0) { + if (pts.type != PT_ENTRY_EMPTY) + return -EADDRINUSE; + pt_install_leaf_entry(&pts, map->oa, PAGE_SHIFT, + &map->attrs); + /* No flush, not used when incoherent */ + map->oa += PAGE_SIZE; + return 0; + } + if (pts.type == PT_ENTRY_TABLE) + return pt_descend(&pts, arg, descend_fn); + /* Something else, use the slow path */ + return -EAGAIN; +} +PT_MAKE_LEVELS(__map_single_page, __do_map_single_page); + +/* + * Add a table to the top, increasing the top level as much as necessary to + * encompass range. + */ +static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range, + struct pt_iommu_map_args *map) +{ + struct iommu_pages_list free_list = IOMMU_PAGES_LIST_INIT(free_list); + struct pt_common *common = common_from_iommu(iommu_table); + uintptr_t top_of_table = READ_ONCE(common->top_of_table); + uintptr_t new_top_of_table = top_of_table; + struct pt_table_p *table_mem; + unsigned int new_level; + spinlock_t *domain_lock; + unsigned long flags; + int ret; + + while (true) { + struct pt_range top_range = + _pt_top_range(common, new_top_of_table); + struct pt_state pts = pt_init_top(&top_range); + + top_range.va = range->va; + top_range.last_va = range->last_va; + + if (!pt_check_range(&top_range) && + map->leaf_level <= pts.level) { + new_level = pts.level; + break; + } + + pts.level++; + if (pts.level > PT_MAX_TOP_LEVEL || + pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2) { + ret = -ERANGE; + goto err_free; + } + + table_mem = + table_alloc_top(common, _pt_top_set(NULL, pts.level), + map->attrs.gfp, ALLOC_DEFER_COHERENT_FLUSH); + if (IS_ERR(table_mem)) { + ret = PTR_ERR(table_mem); + goto err_free; + } + iommu_pages_list_add(&free_list, table_mem); + + /* The new table links to the lower table always at index 0 */ + top_range.va = 0; + top_range.top_level = pts.level; + pts.table_lower = pts.table; + pts.table = table_mem; + pt_load_single_entry(&pts); + PT_WARN_ON(pts.index != 0); + pt_install_table(&pts, virt_to_phys(pts.table_lower), + &map->attrs); + new_top_of_table = _pt_top_set(pts.table, pts.level); + } + + /* + * Avoid double flushing, flush it once after all pt_install_table() + */ + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) { + ret = iommu_pages_start_incoherent_list( + &free_list, iommu_table->iommu_device); + if (ret) + goto err_free; + } + + /* + * top_of_table is write locked by the spinlock, but readers 
can use + * READ_ONCE() to get the value. Since we encode both the level and the + * pointer in one quanta the lockless reader will always see something + * valid. The HW must be updated to the new level under the spinlock + * before top_of_table is updated so that concurrent readers don't map + * into the new level until it is fully functional. If another thread + * already updated it while we were working then throw everything away + * and try again. + */ + domain_lock = iommu_table->driver_ops->get_top_lock(iommu_table); + spin_lock_irqsave(domain_lock, flags); + if (common->top_of_table != top_of_table || + top_of_table == new_top_of_table) { + spin_unlock_irqrestore(domain_lock, flags); + ret = -EAGAIN; + goto err_free; + } + + /* + * We do not issue any flushes for change_top on the expectation that + * any walk cache will not become a problem by adding another layer to + * the tree. Misses will rewalk from the updated top pointer, hits + * continue to be correct. Negative caching is fine too since all the + * new IOVA added by the new top is non-present. + */ + iommu_table->driver_ops->change_top( + iommu_table, virt_to_phys(table_mem), new_level); + WRITE_ONCE(common->top_of_table, new_top_of_table); + spin_unlock_irqrestore(domain_lock, flags); + return 0; + +err_free: + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) + iommu_pages_stop_incoherent_list(&free_list, + iommu_table->iommu_device); + iommu_put_pages_list(&free_list); + return ret; +} + +static int check_map_range(struct pt_iommu *iommu_table, struct pt_range *range, + struct pt_iommu_map_args *map) +{ + struct pt_common *common = common_from_iommu(iommu_table); + int ret; + + do { + ret = pt_check_range(range); + if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP)) + return ret; + + if (!ret && map->leaf_level <= range->top_level) + break; + + ret = increase_top(iommu_table, range, map); + if (ret && ret != -EAGAIN) + return ret; + + /* Reload the new top */ + *range = pt_make_range(common, range->va, range->last_va); + } while (ret); + PT_WARN_ON(pt_check_range(range)); + return 0; +} + +static int do_map(struct pt_range *range, struct pt_common *common, + bool single_page, struct pt_iommu_map_args *map) +{ + /* + * The __map_single_page() fast path does not support DMA_INCOHERENT + * flushing to keep its .text small. + */ + if (single_page && !pt_feature(common, PT_FEAT_DMA_INCOHERENT)) { + int ret; + + ret = pt_walk_range(range, __map_single_page, map); + if (ret != -EAGAIN) + return ret; + /* EAGAIN falls through to the full path */ + } + + if (map->leaf_level == range->top_level) + return pt_walk_range(range, __map_range_leaf, map); + return pt_walk_range(range, __map_range, map); +} + +/** + * map_pages() - Install translation for an IOVA range + * @domain: Domain to manipulate + * @iova: IO virtual address to start + * @paddr: Physical/Output address to start + * @pgsize: Length of each page + * @pgcount: Length of the range in pgsize units starting from @iova + * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO + * @gfp: GFP flags for any memory allocations + * @mapped: Total bytes successfully mapped + * + * The range starting at IOVA will have paddr installed into it. The caller + * must specify a valid pgsize and pgcount to segment the range into compatible + * blocks. + * + * On error the caller will probably want to invoke unmap on the range from iova + * up to the amount indicated by @mapped to return the table back to an + * unchanged state. 
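+ *
+ * For example, a 2MiB-aligned range can be requested either as
+ * pgsize = SZ_2M, pgcount = 1 or as pgsize = SZ_4K, pgcount = 512; on
+ * success @mapped grows by SZ_2M in both cases. (Illustrative sizes.)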
+ *
+ * Context: The caller must hold a write range lock that includes the whole
+ * range.
+ *
+ * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA that
+ * were mapped are added to @mapped; @mapped is not zeroed first.
+ */
+int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova,
+			 phys_addr_t paddr, size_t pgsize, size_t pgcount,
+			 int prot, gfp_t gfp, size_t *mapped)
+{
+	struct pt_iommu *iommu_table =
+		container_of(domain, struct pt_iommu, domain);
+	pt_vaddr_t pgsize_bitmap = iommu_table->domain.pgsize_bitmap;
+	struct pt_common *common = common_from_iommu(iommu_table);
+	struct iommu_iotlb_gather iotlb_gather;
+	pt_vaddr_t len = pgsize * pgcount;
+	struct pt_iommu_map_args map = {
+		.iotlb_gather = &iotlb_gather,
+		.oa = paddr,
+		.leaf_pgsize_lg2 = vaffs(pgsize),
+	};
+	bool single_page = false;
+	struct pt_range range;
+	int ret;
+
+	iommu_iotlb_gather_init(&iotlb_gather);
+
+	if (WARN_ON(!(prot & (IOMMU_READ | IOMMU_WRITE))))
+		return -EINVAL;
+
+	/* Check the paddr doesn't exceed what the table can store */
+	if ((sizeof(pt_oaddr_t) < sizeof(paddr) &&
+	     (pt_vaddr_t)paddr > PT_VADDR_MAX) ||
+	    (common->max_oasz_lg2 != PT_VADDR_MAX_LG2 &&
+	     oalog2_div(paddr, common->max_oasz_lg2)))
+		return -ERANGE;
+
+	ret = pt_iommu_set_prot(common, &map.attrs, prot);
+	if (ret)
+		return ret;
+	map.attrs.gfp = gfp;
+
+	ret = make_range_no_check(common, &range, iova, len);
+	if (ret)
+		return ret;
+
+	/* Calculate target page size and level for the leaves */
+	if (pt_has_system_page_size(common) && pgsize == PAGE_SIZE &&
+	    pgcount == 1) {
+		PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE));
+		if (log2_mod(iova | paddr, PAGE_SHIFT))
+			return -ENXIO;
+		map.leaf_pgsize_lg2 = PAGE_SHIFT;
+		map.leaf_level = 0;
+		single_page = true;
+	} else {
+		map.leaf_pgsize_lg2 = pt_compute_best_pgsize(
+			pgsize_bitmap, range.va, range.last_va, paddr);
+		if (!map.leaf_pgsize_lg2)
+			return -ENXIO;
+		map.leaf_level =
+			pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2);
+	}
+
+	ret = check_map_range(iommu_table, &range, &map);
+	if (ret)
+		return ret;
+
+	PT_WARN_ON(map.leaf_level > range.top_level);
+
+	ret = do_map(&range, common, single_page, &map);
+
+	/*
+	 * Table levels were freed and replaced with large items; flush any
+	 * walk cache that may refer to the freed levels.
+	 */
+	if (!iommu_pages_list_empty(&iotlb_gather.freelist))
+		iommu_iotlb_sync(&iommu_table->domain, &iotlb_gather);
+
+	/* Bytes successfully mapped */
+	PT_WARN_ON(!ret && map.oa - paddr != len);
+	*mapped += map.oa - paddr;
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), "GENERIC_PT_IOMMU");
+
+struct pt_unmap_args {
+	struct iommu_pages_list free_list;
+	pt_vaddr_t unmapped;
+};
+
+static __maybe_unused int __unmap_range(struct pt_range *range, void *arg,
+					unsigned int level,
+					struct pt_table_p *table)
+{
+	struct pt_state pts = pt_init(range, level, table);
+	struct pt_unmap_args *unmap = arg;
+	unsigned int num_oas = 0;
+	unsigned int start_index;
+	int ret = 0;
+
+	_pt_iter_first(&pts);
+	start_index = pts.index;
+	pts.type = pt_load_entry_raw(&pts);
+	/*
+	 * A starting index is in the middle of a contiguous entry
+	 *
+	 * The IOMMU API does not require drivers to support unmapping parts of
+	 * large pages. Long ago VFIO would try to split maps but the current
+	 * version never does.
+	 *
+	 * Instead when unmap reaches a partial unmap of the start of a large
+	 * IOPTE it should remove the entire IOPTE and return that size to the
+	 * caller.
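+	 *
+	 * For example, a 4K unmap request aimed at the head of a 2M leaf
+	 * removes the whole leaf and accounts 2M as unmapped. (Illustrative
+	 * sizes.)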
+	 */
+	if (pts.type == PT_ENTRY_OA) {
+		if (log2_mod(range->va, pt_entry_oa_lg2sz(&pts)))
+			return -EINVAL;
+		/* Micro optimization */
+		goto start_oa;
+	}
+
+	do {
+		if (pts.type != PT_ENTRY_OA) {
+			bool fully_covered;
+
+			if (pts.type != PT_ENTRY_TABLE) {
+				ret = -EINVAL;
+				break;
+			}
+
+			if (pts.index != start_index)
+				pt_index_to_va(&pts);
+			pts.table_lower = pt_table_ptr(&pts);
+
+			fully_covered = pt_entry_fully_covered(
+				&pts, pt_table_item_lg2sz(&pts));
+
+			ret = pt_descend(&pts, arg, __unmap_range);
+			if (ret)
+				break;
+
+			/*
+			 * If the unmapping range fully covers the table then we
+			 * can free it as well. The clear is delayed until we
+			 * succeed in clearing the lower table levels.
+			 */
+			if (fully_covered) {
+				iommu_pages_list_add(&unmap->free_list,
+						     pts.table_lower);
+				pt_clear_entries(&pts, ilog2(1));
+			}
+			pts.index++;
+		} else {
+			unsigned int num_contig_lg2;
+start_oa:
+			/*
+			 * If the caller requested a last address that falls
+			 * within a single entry then the entire entry is
+			 * unmapped and the length returned will be larger than
+			 * requested.
+			 */
+			num_contig_lg2 = pt_entry_num_contig_lg2(&pts);
+			pt_clear_entries(&pts, num_contig_lg2);
+			num_oas += log2_to_int(num_contig_lg2);
+			pts.index += log2_to_int(num_contig_lg2);
+		}
+		if (pts.index >= pts.end_index)
+			break;
+		pts.type = pt_load_entry_raw(&pts);
+	} while (true);
+
+	unmap->unmapped += log2_mul(num_oas, pt_table_item_lg2sz(&pts));
+	flush_writes_range(&pts, start_index, pts.index);
+
+	return ret;
+}
+
+/**
+ * unmap_pages() - Make a range of IOVA empty/not present
+ * @domain: Domain to manipulate
+ * @iova: IO virtual address to start
+ * @pgsize: Length of each page
+ * @pgcount: Length of the range in pgsize units starting from @iova
+ * @iotlb_gather: Gather struct that must be flushed on return
+ *
+ * unmap_pages() will remove a translation created by map_pages(). It cannot
+ * subdivide a mapping created by map_pages(), so it should be called with IOVA
+ * ranges that match those passed to map_pages(). The IOVA range can aggregate
+ * contiguous map_pages() calls so long as no individual range is split.
+ *
+ * Context: The caller must hold a write range lock that includes
+ * the whole range.
+ *
+ * Returns: Number of bytes of VA unmapped. iova + res will be the point
+ * unmapping stopped.
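+ *
+ * A typical caller then drains the gather, e.g. (sketch only, locking and
+ * sizes are up to the caller):
+ *
+ *	struct iommu_iotlb_gather gather;
+ *
+ *	iommu_iotlb_gather_init(&gather);
+ *	unmapped = unmap_pages(domain, iova, SZ_4K, 512, &gather);
+ *	iommu_iotlb_sync(domain, &gather);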
+ */ +size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *iotlb_gather) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_unmap_args unmap = { .free_list = IOMMU_PAGES_LIST_INIT( + unmap.free_list) }; + pt_vaddr_t len = pgsize * pgcount; + struct pt_range range; + int ret; + + ret = make_range(common_from_iommu(iommu_table), &range, iova, len); + if (ret) + return 0; + + pt_walk_range(&range, __unmap_range, &unmap); + + gather_range_pages(iotlb_gather, iommu_table, iova, len, + &unmap.free_list); + + return unmap.unmapped; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unmap_pages), "GENERIC_PT_IOMMU"); + +static void NS(get_info)(struct pt_iommu *iommu_table, + struct pt_iommu_info *info) +{ + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range range = pt_top_range(common); + struct pt_state pts = pt_init_top(&range); + pt_vaddr_t pgsize_bitmap = 0; + + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) { + for (pts.level = 0; pts.level <= PT_MAX_TOP_LEVEL; + pts.level++) { + if (pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2) + break; + pgsize_bitmap |= pt_possible_sizes(&pts); + } + } else { + for (pts.level = 0; pts.level <= range.top_level; pts.level++) + pgsize_bitmap |= pt_possible_sizes(&pts); + } + + /* Hide page sizes larger than the maximum OA */ + info->pgsize_bitmap = oalog2_mod(pgsize_bitmap, common->max_oasz_lg2); +} + +static void NS(deinit)(struct pt_iommu *iommu_table) +{ + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range range = pt_all_range(common); + struct pt_iommu_collect_args collect = { + .free_list = IOMMU_PAGES_LIST_INIT(collect.free_list), + }; + + iommu_pages_list_add(&collect.free_list, range.top_table); + pt_walk_range(&range, __collect_tables, &collect); + + /* + * The driver has to already have fenced the HW access to the page table + * and invalidated any caching referring to this memory. + */ + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) + iommu_pages_stop_incoherent_list(&collect.free_list, + iommu_table->iommu_device); + iommu_put_pages_list(&collect.free_list); +} + +static const struct pt_iommu_ops NS(ops) = { +#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \ + IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty) + .set_dirty = NS(set_dirty), +#endif + .get_info = NS(get_info), + .deinit = NS(deinit), +}; + +static int pt_init_common(struct pt_common *common) +{ + struct pt_range top_range = pt_top_range(common); + + if (PT_WARN_ON(top_range.top_level > PT_MAX_TOP_LEVEL)) + return -EINVAL; + + if (top_range.top_level == PT_MAX_TOP_LEVEL || + common->max_vasz_lg2 == top_range.max_vasz_lg2) + common->features &= ~BIT(PT_FEAT_DYNAMIC_TOP); + + if (top_range.max_vasz_lg2 == PT_VADDR_MAX_LG2) + common->features |= BIT(PT_FEAT_FULL_VA); + + /* Requested features must match features compiled into this format */ + if ((common->features & ~(unsigned int)PT_SUPPORTED_FEATURES) || + (!IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) && + (common->features & PT_FORCE_ENABLED_FEATURES) != + PT_FORCE_ENABLED_FEATURES)) + return -EOPNOTSUPP; + + /* + * Check if the top level of the page table is too small to hold the + * specified maxvasz. 
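+	 *
+	 * For example, a fixed 4 level x86_64 table (top_level = 3) decodes
+	 * at most 48 bits of VA, so max_vasz_lg2 = 57 without
+	 * PT_FEAT_DYNAMIC_TOP is rejected with -EOPNOTSUPP. (Illustrative of
+	 * the check below.)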
+ */ + if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP) && + top_range.top_level != PT_MAX_TOP_LEVEL) { + struct pt_state pts = { .range = &top_range, + .level = top_range.top_level }; + + if (common->max_vasz_lg2 > + pt_num_items_lg2(&pts) + pt_table_item_lg2sz(&pts)) + return -EOPNOTSUPP; + } + + if (common->max_oasz_lg2 == 0) + common->max_oasz_lg2 = pt_max_oa_lg2(common); + else + common->max_oasz_lg2 = min(common->max_oasz_lg2, + pt_max_oa_lg2(common)); + return 0; +} + +static int pt_iommu_init_domain(struct pt_iommu *iommu_table, + struct iommu_domain *domain) +{ + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_iommu_info info; + struct pt_range range; + + NS(get_info)(iommu_table, &info); + + domain->type = __IOMMU_DOMAIN_PAGING; + domain->pgsize_bitmap = info.pgsize_bitmap; + + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) + range = _pt_top_range(common, + _pt_top_set(NULL, PT_MAX_TOP_LEVEL)); + else + range = pt_top_range(common); + + /* A 64-bit high address space table on a 32-bit system cannot work. */ + domain->geometry.aperture_start = (unsigned long)range.va; + if ((pt_vaddr_t)domain->geometry.aperture_start != range.va) + return -EOVERFLOW; + + /* + * The aperture is limited to what the API can do after considering all + * the different types dma_addr_t/unsigned long/pt_vaddr_t that are used + * to store a VA. Set the aperture to something that is valid for all + * cases. Saturate instead of truncate the end if the types are smaller + * than the top range. aperture_end should be called aperture_last. + */ + domain->geometry.aperture_end = (unsigned long)range.last_va; + if ((pt_vaddr_t)domain->geometry.aperture_end != range.last_va) { + domain->geometry.aperture_end = ULONG_MAX; + domain->pgsize_bitmap &= ULONG_MAX; + } + domain->geometry.force_aperture = true; + + return 0; +} + +static void pt_iommu_zero(struct pt_iommu_table *fmt_table) +{ + struct pt_iommu *iommu_table = &fmt_table->iommu; + struct pt_iommu cfg = *iommu_table; + + static_assert(offsetof(struct pt_iommu_table, iommu.domain) == 0); + memset_after(fmt_table, 0, iommu.domain); + + /* The caller can initialize some of these values */ + iommu_table->iommu_device = cfg.iommu_device; + iommu_table->driver_ops = cfg.driver_ops; + iommu_table->nid = cfg.nid; +} + +#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg) +#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init) + +int pt_iommu_init(struct pt_iommu_table *fmt_table, + const struct pt_iommu_table_cfg *cfg, gfp_t gfp) +{ + struct pt_iommu *iommu_table = &fmt_table->iommu; + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_table_p *table_mem; + int ret; + + if (cfg->common.hw_max_vasz_lg2 > PT_MAX_VA_ADDRESS_LG2 || + !cfg->common.hw_max_vasz_lg2 || !cfg->common.hw_max_oasz_lg2) + return -EINVAL; + + pt_iommu_zero(fmt_table); + common->features = cfg->common.features; + common->max_vasz_lg2 = cfg->common.hw_max_vasz_lg2; + common->max_oasz_lg2 = cfg->common.hw_max_oasz_lg2; + ret = pt_iommu_fmt_init(fmt_table, cfg); + if (ret) + return ret; + + if (cfg->common.hw_max_oasz_lg2 > pt_max_oa_lg2(common)) + return -EINVAL; + + ret = pt_init_common(common); + if (ret) + return ret; + + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) && + WARN_ON(!iommu_table->driver_ops || + !iommu_table->driver_ops->change_top || + !iommu_table->driver_ops->get_top_lock)) + return -EINVAL; + + if (pt_feature(common, PT_FEAT_SIGN_EXTEND) && + (pt_feature(common, PT_FEAT_FULL_VA) || + pt_feature(common, 
PT_FEAT_DYNAMIC_TOP))) + return -EINVAL; + + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) && + WARN_ON(!iommu_table->iommu_device)) + return -EINVAL; + + ret = pt_iommu_init_domain(iommu_table, &iommu_table->domain); + if (ret) + return ret; + + table_mem = table_alloc_top(common, common->top_of_table, gfp, + ALLOC_NORMAL); + if (IS_ERR(table_mem)) + return PTR_ERR(table_mem); + pt_top_set(common, table_mem, pt_top_get_level(common)); + + /* Must be last, see pt_iommu_deinit() */ + iommu_table->ops = &NS(ops); + return 0; +} +EXPORT_SYMBOL_NS_GPL(pt_iommu_init, "GENERIC_PT_IOMMU"); + +#ifdef pt_iommu_fmt_hw_info +#define pt_iommu_table_hw_info CONCATENATE(pt_iommu_table, _hw_info) +#define pt_iommu_hw_info CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), hw_info) +void pt_iommu_hw_info(struct pt_iommu_table *fmt_table, + struct pt_iommu_table_hw_info *info) +{ + struct pt_iommu *iommu_table = &fmt_table->iommu; + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range top_range = pt_top_range(common); + + pt_iommu_fmt_hw_info(fmt_table, &top_range, info); +} +EXPORT_SYMBOL_NS_GPL(pt_iommu_hw_info, "GENERIC_PT_IOMMU"); +#endif + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IOMMU Page table implementation for " __stringify(PTPFX_RAW)); +MODULE_IMPORT_NS("GENERIC_PT"); +/* For iommu_dirty_bitmap_record() */ +MODULE_IMPORT_NS("IOMMUFD"); + +#endif /* __GENERIC_PT_IOMMU_PT_H */ diff --git a/drivers/iommu/generic_pt/kunit_generic_pt.h b/drivers/iommu/generic_pt/kunit_generic_pt.h new file mode 100644 index 000000000000..68278bf15cfe --- /dev/null +++ b/drivers/iommu/generic_pt/kunit_generic_pt.h @@ -0,0 +1,823 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Test the format API directly. + * + */ +#include "kunit_iommu.h" +#include "pt_iter.h" + +static void do_map(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa, + pt_vaddr_t len) +{ + struct kunit_iommu_priv *priv = test->priv; + int ret; + + KUNIT_ASSERT_EQ(test, len, (size_t)len); + + ret = iommu_map(&priv->domain, va, pa, len, IOMMU_READ | IOMMU_WRITE, + GFP_KERNEL); + KUNIT_ASSERT_NO_ERRNO_FN(test, "map_pages", ret); +} + +#define KUNIT_ASSERT_PT_LOAD(test, pts, entry) \ + ({ \ + pt_load_entry(pts); \ + KUNIT_ASSERT_EQ(test, (pts)->type, entry); \ + }) + +struct check_levels_arg { + struct kunit *test; + void *fn_arg; + void (*fn)(struct kunit *test, struct pt_state *pts, void *arg); +}; + +static int __check_all_levels(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct check_levels_arg *chk = arg; + struct kunit *test = chk->test; + int ret; + + _pt_iter_first(&pts); + + + /* + * If we were able to use the full VA space this should always be the + * last index in each table. 
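+	 *
+	 * E.g. with 512-item tables, walking to the top VA should land on
+	 * index 511 at every level; the sign-extended top is the exception
+	 * handled below. (Illustrative item count.)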
+ */ + if (!(IS_32BIT && range->max_vasz_lg2 > 32)) { + if (pt_feature(range->common, PT_FEAT_SIGN_EXTEND) && + pts.level == pts.range->top_level) + KUNIT_ASSERT_EQ(test, pts.index, + log2_to_int(range->max_vasz_lg2 - 1 - + pt_table_item_lg2sz(&pts)) - + 1); + else + KUNIT_ASSERT_EQ(test, pts.index, + log2_to_int(pt_table_oa_lg2sz(&pts) - + pt_table_item_lg2sz(&pts)) - + 1); + } + + if (pt_can_have_table(&pts)) { + pt_load_single_entry(&pts); + KUNIT_ASSERT_EQ(test, pts.type, PT_ENTRY_TABLE); + ret = pt_descend(&pts, arg, __check_all_levels); + KUNIT_ASSERT_EQ(test, ret, 0); + + /* Index 0 is used by the test */ + if (IS_32BIT && !pts.index) + return 0; + KUNIT_ASSERT_NE(chk->test, pts.index, 0); + } + + /* + * A format should not create a table with only one entry, at least this + * test approach won't work. + */ + KUNIT_ASSERT_GT(chk->test, pts.end_index, 1); + + /* + * For increase top we end up using index 0 for the original top's tree, + * so use index 1 for testing instead. + */ + pts.index = 0; + pt_index_to_va(&pts); + pt_load_single_entry(&pts); + if (pts.type == PT_ENTRY_TABLE && pts.end_index > 2) { + pts.index = 1; + pt_index_to_va(&pts); + } + (*chk->fn)(chk->test, &pts, chk->fn_arg); + return 0; +} + +/* + * Call fn for each level in the table with a pts setup to index 0 in a table + * for that level. This allows writing tests that run on every level. + * The test can use every index in the table except the last one. + */ +static void check_all_levels(struct kunit *test, + void (*fn)(struct kunit *test, + struct pt_state *pts, void *arg), + void *fn_arg) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range range = pt_top_range(priv->common); + struct check_levels_arg chk = { + .test = test, + .fn = fn, + .fn_arg = fn_arg, + }; + int ret; + + if (pt_feature(priv->common, PT_FEAT_DYNAMIC_TOP) && + priv->common->max_vasz_lg2 > range.max_vasz_lg2) + range.last_va = fvalog2_set_mod_max(range.va, + priv->common->max_vasz_lg2); + + /* + * Map a page at the highest VA, this will populate all the levels so we + * can then iterate over them. Index 0 will be used for testing. + */ + if (IS_32BIT && range.max_vasz_lg2 > 32) + range.last_va = (u32)range.last_va; + range.va = range.last_va - (priv->smallest_pgsz - 1); + do_map(test, range.va, 0, priv->smallest_pgsz); + + range = pt_make_range(priv->common, range.va, range.last_va); + ret = pt_walk_range(&range, __check_all_levels, &chk); + KUNIT_ASSERT_EQ(test, ret, 0); +} + +static void test_init(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + + /* Fixture does the setup */ + KUNIT_ASSERT_NE(test, priv->info.pgsize_bitmap, 0); +} + +/* + * Basic check that the log2_* functions are working, especially at the integer + * limits. 
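+ *
+ * Note the asserts below pin down the conventions: ffs_t()/ffz_t() return
+ * 0-based bit numbers (ffs_t(u32, 1) == 0) while fls_t() counts like the
+ * kernel's fls() (fls_t(u32, U32_MAX) == 32).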
+ */
+static void test_bitops(struct kunit *test)
+{
+	int i;
+
+	KUNIT_ASSERT_EQ(test, fls_t(u32, 0), 0);
+	KUNIT_ASSERT_EQ(test, fls_t(u32, 1), 1);
+	KUNIT_ASSERT_EQ(test, fls_t(u32, BIT(2)), 3);
+	KUNIT_ASSERT_EQ(test, fls_t(u32, U32_MAX), 32);
+
+	KUNIT_ASSERT_EQ(test, fls_t(u64, 0), 0);
+	KUNIT_ASSERT_EQ(test, fls_t(u64, 1), 1);
+	KUNIT_ASSERT_EQ(test, fls_t(u64, BIT(2)), 3);
+	KUNIT_ASSERT_EQ(test, fls_t(u64, U64_MAX), 64);
+
+	KUNIT_ASSERT_EQ(test, ffs_t(u32, 1), 0);
+	KUNIT_ASSERT_EQ(test, ffs_t(u32, BIT(2)), 2);
+	KUNIT_ASSERT_EQ(test, ffs_t(u32, BIT(31)), 31);
+
+	KUNIT_ASSERT_EQ(test, ffs_t(u64, 1), 0);
+	KUNIT_ASSERT_EQ(test, ffs_t(u64, BIT(2)), 2);
+	KUNIT_ASSERT_EQ(test, ffs_t(u64, BIT_ULL(63)), 63);
+
+	for (i = 0; i != 31; i++)
+		KUNIT_ASSERT_EQ(test, ffz_t(u32, BIT(i) - 1), i);
+
+	for (i = 0; i != 63; i++)
+		KUNIT_ASSERT_EQ(test, ffz_t(u64, BIT_ULL(i) - 1), i);
+
+	for (i = 0; i != 32; i++) {
+		u64 val = get_random_u64();
+
+		KUNIT_ASSERT_EQ(test, log2_mod_t(u32, val, ffs_t(u32, val)), 0);
+		KUNIT_ASSERT_EQ(test, log2_mod_t(u64, val, ffs_t(u64, val)), 0);
+
+		KUNIT_ASSERT_EQ(test, log2_mod_t(u32, val, ffz_t(u32, val)),
+				log2_to_max_int_t(u32, ffz_t(u32, val)));
+		KUNIT_ASSERT_EQ(test, log2_mod_t(u64, val, ffz_t(u64, val)),
+				log2_to_max_int_t(u64, ffz_t(u64, val)));
+	}
+}
+
+static unsigned int ref_best_pgsize(pt_vaddr_t pgsz_bitmap, pt_vaddr_t va,
+				    pt_vaddr_t last_va, pt_oaddr_t oa)
+{
+	pt_vaddr_t pgsz_lg2;
+
+	/* Brute force the constraints described in pt_compute_best_pgsize() */
+	for (pgsz_lg2 = PT_VADDR_MAX_LG2 - 1; pgsz_lg2 != 0; pgsz_lg2--) {
+		if ((pgsz_bitmap & log2_to_int(pgsz_lg2)) &&
+		    log2_mod(va, pgsz_lg2) == 0 &&
+		    oalog2_mod(oa, pgsz_lg2) == 0 &&
+		    va + log2_to_int(pgsz_lg2) - 1 <= last_va &&
+		    log2_div_eq(va, va + log2_to_int(pgsz_lg2) - 1, pgsz_lg2) &&
+		    oalog2_div_eq(oa, oa + log2_to_int(pgsz_lg2) - 1, pgsz_lg2))
+			return pgsz_lg2;
+	}
+	return 0;
+}
+
+/* Check that the bit logic in pt_compute_best_pgsize() works.
*/ +static void test_best_pgsize(struct kunit *test) +{ + unsigned int a_lg2; + unsigned int b_lg2; + unsigned int c_lg2; + + /* Try random prefixes with every suffix combination */ + for (a_lg2 = 1; a_lg2 != 10; a_lg2++) { + for (b_lg2 = 1; b_lg2 != 10; b_lg2++) { + for (c_lg2 = 1; c_lg2 != 10; c_lg2++) { + pt_vaddr_t pgsz_bitmap = get_random_u64(); + pt_vaddr_t va = get_random_u64() << a_lg2; + pt_oaddr_t oa = get_random_u64() << b_lg2; + pt_vaddr_t last_va = log2_set_mod_max( + get_random_u64(), c_lg2); + + if (va > last_va) + swap(va, last_va); + KUNIT_ASSERT_EQ( + test, + pt_compute_best_pgsize(pgsz_bitmap, va, + last_va, oa), + ref_best_pgsize(pgsz_bitmap, va, + last_va, oa)); + } + } + } + + /* 0 prefix, every suffix */ + for (c_lg2 = 1; c_lg2 != PT_VADDR_MAX_LG2 - 1; c_lg2++) { + pt_vaddr_t pgsz_bitmap = get_random_u64(); + pt_vaddr_t va = 0; + pt_oaddr_t oa = 0; + pt_vaddr_t last_va = log2_set_mod_max(0, c_lg2); + + KUNIT_ASSERT_EQ(test, + pt_compute_best_pgsize(pgsz_bitmap, va, last_va, + oa), + ref_best_pgsize(pgsz_bitmap, va, last_va, oa)); + } + + /* 1's prefix, every suffix */ + for (a_lg2 = 1; a_lg2 != 10; a_lg2++) { + for (b_lg2 = 1; b_lg2 != 10; b_lg2++) { + for (c_lg2 = 1; c_lg2 != 10; c_lg2++) { + pt_vaddr_t pgsz_bitmap = get_random_u64(); + pt_vaddr_t va = PT_VADDR_MAX << a_lg2; + pt_oaddr_t oa = PT_VADDR_MAX << b_lg2; + pt_vaddr_t last_va = PT_VADDR_MAX; + + KUNIT_ASSERT_EQ( + test, + pt_compute_best_pgsize(pgsz_bitmap, va, + last_va, oa), + ref_best_pgsize(pgsz_bitmap, va, + last_va, oa)); + } + } + } + + /* pgsize_bitmap is always 0 */ + for (a_lg2 = 1; a_lg2 != 10; a_lg2++) { + for (b_lg2 = 1; b_lg2 != 10; b_lg2++) { + for (c_lg2 = 1; c_lg2 != 10; c_lg2++) { + pt_vaddr_t pgsz_bitmap = 0; + pt_vaddr_t va = get_random_u64() << a_lg2; + pt_oaddr_t oa = get_random_u64() << b_lg2; + pt_vaddr_t last_va = log2_set_mod_max( + get_random_u64(), c_lg2); + + if (va > last_va) + swap(va, last_va); + KUNIT_ASSERT_EQ( + test, + pt_compute_best_pgsize(pgsz_bitmap, va, + last_va, oa), + 0); + } + } + } + + if (sizeof(pt_vaddr_t) <= 4) + return; + + /* over 32 bit page sizes */ + for (a_lg2 = 32; a_lg2 != 42; a_lg2++) { + for (b_lg2 = 32; b_lg2 != 42; b_lg2++) { + for (c_lg2 = 32; c_lg2 != 42; c_lg2++) { + pt_vaddr_t pgsz_bitmap = get_random_u64(); + pt_vaddr_t va = get_random_u64() << a_lg2; + pt_oaddr_t oa = get_random_u64() << b_lg2; + pt_vaddr_t last_va = log2_set_mod_max( + get_random_u64(), c_lg2); + + if (va > last_va) + swap(va, last_va); + KUNIT_ASSERT_EQ( + test, + pt_compute_best_pgsize(pgsz_bitmap, va, + last_va, oa), + ref_best_pgsize(pgsz_bitmap, va, + last_va, oa)); + } + } + } +} + +/* + * Check that pt_install_table() and pt_table_pa() match + */ +static void test_lvl_table_ptr(struct kunit *test, struct pt_state *pts, + void *arg) +{ + struct kunit_iommu_priv *priv = test->priv; + pt_oaddr_t paddr = + log2_set_mod(priv->test_oa, 0, priv->smallest_pgsz_lg2); + struct pt_write_attrs attrs = {}; + + if (!pt_can_have_table(pts)) + return; + + KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot", + pt_iommu_set_prot(pts->range->common, &attrs, + IOMMU_READ)); + + pt_load_single_entry(pts); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY); + + KUNIT_ASSERT_TRUE(test, pt_install_table(pts, paddr, &attrs)); + + /* A second install should pass because install updates pts->entry. 
+	 */
+	KUNIT_ASSERT_EQ(test, pt_install_table(pts, paddr, &attrs), true);
+
+	KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_TABLE);
+	KUNIT_ASSERT_EQ(test, pt_table_pa(pts), paddr);
+
+	pt_clear_entries(pts, ilog2(1));
+	KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
+}
+
+static void test_table_ptr(struct kunit *test)
+{
+	check_all_levels(test, test_lvl_table_ptr, NULL);
+}
+
+struct lvl_radix_arg {
+	pt_vaddr_t vbits;
+};
+
+/*
+ * Check pt_table_oa_lg2sz() and pt_table_item_lg2sz(): together the levels
+ * need to decode a contiguous run of VA bits that covers the entire
+ * advertised VA space.
+ */
+static void test_lvl_radix(struct kunit *test, struct pt_state *pts, void *arg)
+{
+	unsigned int table_lg2sz = pt_table_oa_lg2sz(pts);
+	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+	struct lvl_radix_arg *radix = arg;
+
+	/* Every bit below us is decoded */
+	KUNIT_ASSERT_EQ(test, log2_set_mod_max(0, isz_lg2), radix->vbits);
+
+	/* We are not decoding bits someone else is */
+	KUNIT_ASSERT_EQ(test, log2_div(radix->vbits, isz_lg2), 0);
+
+	/* Can't decode past the pt_vaddr_t size */
+	KUNIT_ASSERT_LE(test, table_lg2sz, PT_VADDR_MAX_LG2);
+	KUNIT_ASSERT_EQ(test, fvalog2_div(table_lg2sz, PT_MAX_VA_ADDRESS_LG2),
+			0);
+
+	radix->vbits = fvalog2_set_mod_max(0, table_lg2sz);
+}
+
+static void test_max_va(struct kunit *test)
+{
+	struct kunit_iommu_priv *priv = test->priv;
+	struct pt_range range = pt_top_range(priv->common);
+
+	KUNIT_ASSERT_GE(test, priv->common->max_vasz_lg2, range.max_vasz_lg2);
+}
+
+static void test_table_radix(struct kunit *test)
+{
+	struct kunit_iommu_priv *priv = test->priv;
+	struct lvl_radix_arg radix = { .vbits = priv->smallest_pgsz - 1 };
+	struct pt_range range;
+
+	check_all_levels(test, test_lvl_radix, &radix);
+
+	range = pt_top_range(priv->common);
+	if (range.max_vasz_lg2 == PT_VADDR_MAX_LG2) {
+		KUNIT_ASSERT_EQ(test, radix.vbits, PT_VADDR_MAX);
+	} else {
+		if (!IS_32BIT)
+			KUNIT_ASSERT_EQ(test,
+					log2_set_mod_max(0, range.max_vasz_lg2),
+					radix.vbits);
+		KUNIT_ASSERT_EQ(test, log2_div(radix.vbits, range.max_vasz_lg2),
+				0);
+	}
+}
+
+static unsigned int safe_pt_num_items_lg2(const struct pt_state *pts)
+{
+	struct pt_range top_range = pt_top_range(pts->range->common);
+	struct pt_state top_pts = pt_init_top(&top_range);
+
+	/*
+	 * Avoid calling pt_num_items_lg2() on the top; instead we can derive
+	 * the size of the top table from the top range.
+	 */
+	if (pts->level == top_range.top_level)
+		return ilog2(pt_range_to_end_index(&top_pts));
+	return pt_num_items_lg2(pts);
+}
+
+static void test_lvl_possible_sizes(struct kunit *test, struct pt_state *pts,
+				    void *arg)
+{
+	unsigned int num_items_lg2 = safe_pt_num_items_lg2(pts);
+	pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
+	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
+
+	if (!pt_can_have_leaf(pts)) {
+		KUNIT_ASSERT_EQ(test, pgsize_bitmap, 0);
+		return;
+	}
+
+	/* No bits for sizes that would be outside this table */
+	KUNIT_ASSERT_EQ(test, log2_mod(pgsize_bitmap, isz_lg2), 0);
+	KUNIT_ASSERT_EQ(
+		test, fvalog2_div(pgsize_bitmap, num_items_lg2 + isz_lg2), 0);
+
+	/*
+	 * Non-contiguous entries must be supported. AMDv1 has a HW bug where
+	 * it does not support them on one of the levels.
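+	 * The magic bitmap value tested below matches what the affected
+	 * AMDv1 level reports: only contiguous sizes, with no single-item
+	 * bit set.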
+ */ + if ((u64)pgsize_bitmap != 0xff0000000000ULL || + strcmp(__stringify(PTPFX_RAW), "amdv1") != 0) + KUNIT_ASSERT_TRUE(test, pgsize_bitmap & log2_to_int(isz_lg2)); + else + KUNIT_ASSERT_NE(test, pgsize_bitmap, 0); + + /* A contiguous entry should not span the whole table */ + if (num_items_lg2 + isz_lg2 != PT_VADDR_MAX_LG2) + KUNIT_ASSERT_FALSE( + test, + pgsize_bitmap & log2_to_int(num_items_lg2 + isz_lg2)); +} + +static void test_entry_possible_sizes(struct kunit *test) +{ + check_all_levels(test, test_lvl_possible_sizes, NULL); +} + +static void sweep_all_pgsizes(struct kunit *test, struct pt_state *pts, + struct pt_write_attrs *attrs, + pt_oaddr_t test_oaddr) +{ + pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts); + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + unsigned int len_lg2; + + if (pts->index != 0) + return; + + for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2 - 1; len_lg2++) { + struct pt_state sub_pts = *pts; + pt_oaddr_t oaddr; + + if (!(pgsize_bitmap & log2_to_int(len_lg2))) + continue; + + oaddr = log2_set_mod(test_oaddr, 0, len_lg2); + pt_install_leaf_entry(pts, oaddr, len_lg2, attrs); + /* Verify that every contiguous item translates correctly */ + for (sub_pts.index = 0; + sub_pts.index != log2_to_int(len_lg2 - isz_lg2); + sub_pts.index++) { + KUNIT_ASSERT_PT_LOAD(test, &sub_pts, PT_ENTRY_OA); + KUNIT_ASSERT_EQ(test, pt_item_oa(&sub_pts), + oaddr + sub_pts.index * + oalog2_mul(1, isz_lg2)); + KUNIT_ASSERT_EQ(test, pt_entry_oa(&sub_pts), oaddr); + KUNIT_ASSERT_EQ(test, pt_entry_num_contig_lg2(&sub_pts), + len_lg2 - isz_lg2); + } + + pt_clear_entries(pts, len_lg2 - isz_lg2); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY); + } +} + +/* + * Check that pt_install_leaf_entry() and pt_entry_oa() match. + * Check that pt_clear_entries() works. 
+ */ +static void test_lvl_entry_oa(struct kunit *test, struct pt_state *pts, + void *arg) +{ + unsigned int max_oa_lg2 = pts->range->common->max_oasz_lg2; + struct kunit_iommu_priv *priv = test->priv; + struct pt_write_attrs attrs = {}; + + if (!pt_can_have_leaf(pts)) + return; + + KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot", + pt_iommu_set_prot(pts->range->common, &attrs, + IOMMU_READ)); + + sweep_all_pgsizes(test, pts, &attrs, priv->test_oa); + + /* Check that the table can store the boundary OAs */ + sweep_all_pgsizes(test, pts, &attrs, 0); + if (max_oa_lg2 == PT_OADDR_MAX_LG2) + sweep_all_pgsizes(test, pts, &attrs, PT_OADDR_MAX); + else + sweep_all_pgsizes(test, pts, &attrs, + oalog2_to_max_int(max_oa_lg2)); +} + +static void test_entry_oa(struct kunit *test) +{ + check_all_levels(test, test_lvl_entry_oa, NULL); +} + +/* Test pt_attr_from_entry() */ +static void test_lvl_attr_from_entry(struct kunit *test, struct pt_state *pts, + void *arg) +{ + pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts); + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + struct kunit_iommu_priv *priv = test->priv; + unsigned int len_lg2; + unsigned int prot; + + if (!pt_can_have_leaf(pts)) + return; + + for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2; len_lg2++) { + if (!(pgsize_bitmap & log2_to_int(len_lg2))) + continue; + for (prot = 0; prot <= (IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE | + IOMMU_NOEXEC | IOMMU_MMIO); + prot++) { + pt_oaddr_t oaddr; + struct pt_write_attrs attrs = {}; + u64 good_entry; + + /* + * If the format doesn't support this combination of + * prot bits skip it + */ + if (pt_iommu_set_prot(pts->range->common, &attrs, + prot)) { + /* But RW has to be supported */ + KUNIT_ASSERT_NE(test, prot, + IOMMU_READ | IOMMU_WRITE); + continue; + } + + oaddr = log2_set_mod(priv->test_oa, 0, len_lg2); + pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA); + + good_entry = pts->entry; + + memset(&attrs, 0, sizeof(attrs)); + pt_attr_from_entry(pts, &attrs); + + pt_clear_entries(pts, len_lg2 - isz_lg2); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY); + + pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA); + + /* + * The descriptor produced by pt_attr_from_entry() + * produce an identical entry value when re-written + */ + KUNIT_ASSERT_EQ(test, good_entry, pts->entry); + + pt_clear_entries(pts, len_lg2 - isz_lg2); + } + } +} + +static void test_attr_from_entry(struct kunit *test) +{ + check_all_levels(test, test_lvl_attr_from_entry, NULL); +} + +static void test_lvl_dirty(struct kunit *test, struct pt_state *pts, void *arg) +{ + pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts); + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + struct kunit_iommu_priv *priv = test->priv; + unsigned int start_idx = pts->index; + struct pt_write_attrs attrs = {}; + unsigned int len_lg2; + + if (!pt_can_have_leaf(pts)) + return; + + KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot", + pt_iommu_set_prot(pts->range->common, &attrs, + IOMMU_READ | IOMMU_WRITE)); + + for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2; len_lg2++) { + pt_oaddr_t oaddr; + unsigned int i; + + if (!(pgsize_bitmap & log2_to_int(len_lg2))) + continue; + + oaddr = log2_set_mod(priv->test_oa, 0, len_lg2); + pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA); + + pt_load_entry(pts); + pt_entry_make_write_clean(pts); + pt_load_entry(pts); + KUNIT_ASSERT_FALSE(test, pt_entry_is_write_dirty(pts)); + + for (i = 
0; i != log2_to_int(len_lg2 - isz_lg2); i++) { + /* dirty every contiguous entry */ + pts->index = start_idx + i; + pt_load_entry(pts); + KUNIT_ASSERT_TRUE(test, pt_entry_make_write_dirty(pts)); + pts->index = start_idx; + pt_load_entry(pts); + KUNIT_ASSERT_TRUE(test, pt_entry_is_write_dirty(pts)); + + pt_entry_make_write_clean(pts); + pt_load_entry(pts); + KUNIT_ASSERT_FALSE(test, pt_entry_is_write_dirty(pts)); + } + + pt_clear_entries(pts, len_lg2 - isz_lg2); + } +} + +static __maybe_unused void test_dirty(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + + if (!pt_dirty_supported(priv->common)) + kunit_skip(test, + "Page table features do not support dirty tracking"); + + check_all_levels(test, test_lvl_dirty, NULL); +} + +static void test_lvl_sw_bit_leaf(struct kunit *test, struct pt_state *pts, + void *arg) +{ + struct kunit_iommu_priv *priv = test->priv; + pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts); + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + struct pt_write_attrs attrs = {}; + unsigned int len_lg2; + + if (!pt_can_have_leaf(pts)) + return; + if (pts->index != 0) + return; + + KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot", + pt_iommu_set_prot(pts->range->common, &attrs, + IOMMU_READ)); + + for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2 - 1; len_lg2++) { + pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, len_lg2); + struct pt_write_attrs new_attrs = {}; + unsigned int bitnr; + + if (!(pgsize_bitmap & log2_to_int(len_lg2))) + continue; + + pt_install_leaf_entry(pts, paddr, len_lg2, &attrs); + + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); + bitnr++) + KUNIT_ASSERT_FALSE(test, + pt_test_sw_bit_acquire(pts, bitnr)); + + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); + bitnr++) { + KUNIT_ASSERT_FALSE(test, + pt_test_sw_bit_acquire(pts, bitnr)); + pt_set_sw_bit_release(pts, bitnr); + KUNIT_ASSERT_TRUE(test, + pt_test_sw_bit_acquire(pts, bitnr)); + } + + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); + bitnr++) + KUNIT_ASSERT_TRUE(test, + pt_test_sw_bit_acquire(pts, bitnr)); + + KUNIT_ASSERT_EQ(test, pt_item_oa(pts), paddr); + + /* SW bits didn't leak into the attrs */ + pt_attr_from_entry(pts, &new_attrs); + KUNIT_ASSERT_MEMEQ(test, &new_attrs, &attrs, sizeof(attrs)); + + pt_clear_entries(pts, len_lg2 - isz_lg2); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY); + } +} + +static __maybe_unused void test_sw_bit_leaf(struct kunit *test) +{ + check_all_levels(test, test_lvl_sw_bit_leaf, NULL); +} + +static void test_lvl_sw_bit_table(struct kunit *test, struct pt_state *pts, + void *arg) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_write_attrs attrs = {}; + pt_oaddr_t paddr = + log2_set_mod(priv->test_oa, 0, priv->smallest_pgsz_lg2); + unsigned int bitnr; + + if (!pt_can_have_leaf(pts)) + return; + if (pts->index != 0) + return; + + KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot", + pt_iommu_set_prot(pts->range->common, &attrs, + IOMMU_READ)); + + KUNIT_ASSERT_TRUE(test, pt_install_table(pts, paddr, &attrs)); + + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++) + KUNIT_ASSERT_FALSE(test, pt_test_sw_bit_acquire(pts, bitnr)); + + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++) { + KUNIT_ASSERT_FALSE(test, pt_test_sw_bit_acquire(pts, bitnr)); + pt_set_sw_bit_release(pts, bitnr); + KUNIT_ASSERT_TRUE(test, pt_test_sw_bit_acquire(pts, bitnr)); + } + + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++) + KUNIT_ASSERT_TRUE(test, 
pt_test_sw_bit_acquire(pts, bitnr)); + + KUNIT_ASSERT_EQ(test, pt_table_pa(pts), paddr); + + pt_clear_entries(pts, ilog2(1)); + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY); +} + +static __maybe_unused void test_sw_bit_table(struct kunit *test) +{ + check_all_levels(test, test_lvl_sw_bit_table, NULL); +} + +static struct kunit_case generic_pt_test_cases[] = { + KUNIT_CASE_FMT(test_init), + KUNIT_CASE_FMT(test_bitops), + KUNIT_CASE_FMT(test_best_pgsize), + KUNIT_CASE_FMT(test_table_ptr), + KUNIT_CASE_FMT(test_max_va), + KUNIT_CASE_FMT(test_table_radix), + KUNIT_CASE_FMT(test_entry_possible_sizes), + KUNIT_CASE_FMT(test_entry_oa), + KUNIT_CASE_FMT(test_attr_from_entry), +#ifdef pt_entry_is_write_dirty + KUNIT_CASE_FMT(test_dirty), +#endif +#ifdef pt_sw_bit + KUNIT_CASE_FMT(test_sw_bit_leaf), + KUNIT_CASE_FMT(test_sw_bit_table), +#endif + {}, +}; + +static int pt_kunit_generic_pt_init(struct kunit *test) +{ + struct kunit_iommu_priv *priv; + int ret; + + priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + ret = pt_kunit_priv_init(test, priv); + if (ret) { + kunit_kfree(test, priv); + return ret; + } + test->priv = priv; + return 0; +} + +static void pt_kunit_generic_pt_exit(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + + if (!test->priv) + return; + + pt_iommu_deinit(priv->iommu); + kunit_kfree(test, test->priv); +} + +static struct kunit_suite NS(generic_pt_suite) = { + .name = __stringify(NS(fmt_test)), + .init = pt_kunit_generic_pt_init, + .exit = pt_kunit_generic_pt_exit, + .test_cases = generic_pt_test_cases, +}; +kunit_test_suites(&NS(generic_pt_suite)); diff --git a/drivers/iommu/generic_pt/kunit_iommu.h b/drivers/iommu/generic_pt/kunit_iommu.h new file mode 100644 index 000000000000..22c9e4c4dd97 --- /dev/null +++ b/drivers/iommu/generic_pt/kunit_iommu.h @@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#ifndef __GENERIC_PT_KUNIT_IOMMU_H +#define __GENERIC_PT_KUNIT_IOMMU_H + +#define GENERIC_PT_KUNIT 1 +#include <kunit/device.h> +#include <kunit/test.h> +#include "../iommu-pages.h" +#include "pt_iter.h" + +#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg) +#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init) +int pt_iommu_init(struct pt_iommu_table *fmt_table, + const struct pt_iommu_table_cfg *cfg, gfp_t gfp); + +/* The format can provide a list of configurations it would like to test */ +#ifdef kunit_fmt_cfgs +static const void *kunit_pt_gen_params_cfg(struct kunit *test, const void *prev, + char *desc) +{ + uintptr_t cfg_id = (uintptr_t)prev; + + cfg_id++; + if (cfg_id >= ARRAY_SIZE(kunit_fmt_cfgs) + 1) + return NULL; + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "%s_cfg_%u", + __stringify(PTPFX_RAW), (unsigned int)(cfg_id - 1)); + return (void *)cfg_id; +} +#define KUNIT_CASE_FMT(test_name) \ + KUNIT_CASE_PARAM(test_name, kunit_pt_gen_params_cfg) +#else +#define KUNIT_CASE_FMT(test_name) KUNIT_CASE(test_name) +#endif + +#define KUNIT_ASSERT_NO_ERRNO(test, ret) \ + KUNIT_ASSERT_EQ_MSG(test, ret, 0, KUNIT_SUBSUBTEST_INDENT "errno %pe", \ + ERR_PTR(ret)) + +#define KUNIT_ASSERT_NO_ERRNO_FN(test, fn, ret) \ + KUNIT_ASSERT_EQ_MSG(test, ret, 0, \ + KUNIT_SUBSUBTEST_INDENT "errno %pe from %s", \ + ERR_PTR(ret), fn) + +/* + * When the test is run on a 32 bit system unsigned long can be 32 bits. This + * cause the iommu op signatures to be restricted to 32 bits. 
The test therefore
+ * has to be careful not to create any VAs over the 32 bit limit. Reduce the
+ * scope of the testing accordingly, as the main purpose of checking a full
+ * 32 bit build is to look for 32-bit-isms in the core code. Run the test on
+ * i386 with X86_PAE=y to get full coverage when dma_addr_t & phys_addr_t are
+ * 8 bytes.
+ */
+#define IS_32BIT (sizeof(unsigned long) == 4)
+
+struct kunit_iommu_priv {
+	union {
+		struct iommu_domain domain;
+		struct pt_iommu_table fmt_table;
+	};
+	spinlock_t top_lock;
+	struct device *dummy_dev;
+	struct pt_iommu *iommu;
+	struct pt_common *common;
+	struct pt_iommu_table_cfg cfg;
+	struct pt_iommu_info info;
+	unsigned int smallest_pgsz_lg2;
+	pt_vaddr_t smallest_pgsz;
+	unsigned int largest_pgsz_lg2;
+	pt_oaddr_t test_oa;
+	pt_vaddr_t safe_pgsize_bitmap;
+	unsigned long orig_nr_secondary_pagetable;
+};
+PT_IOMMU_CHECK_DOMAIN(struct kunit_iommu_priv, fmt_table.iommu, domain);
+
+static void pt_kunit_iotlb_sync(struct iommu_domain *domain,
+				struct iommu_iotlb_gather *gather)
+{
+	iommu_put_pages_list(&gather->freelist);
+}
+
+#define IOMMU_PT_DOMAIN_OPS1(x) IOMMU_PT_DOMAIN_OPS(x)
+static const struct iommu_domain_ops kunit_pt_ops = {
+	IOMMU_PT_DOMAIN_OPS1(PTPFX_RAW),
+	.iotlb_sync = &pt_kunit_iotlb_sync,
+};
+
+static void pt_kunit_change_top(struct pt_iommu *iommu_table,
+				phys_addr_t top_paddr, unsigned int top_level)
+{
+}
+
+static spinlock_t *pt_kunit_get_top_lock(struct pt_iommu *iommu_table)
+{
+	struct kunit_iommu_priv *priv = container_of(
+		iommu_table, struct kunit_iommu_priv, fmt_table.iommu);
+
+	return &priv->top_lock;
+}
+
+static const struct pt_iommu_driver_ops pt_kunit_driver_ops = {
+	.change_top = &pt_kunit_change_top,
+	.get_top_lock = &pt_kunit_get_top_lock,
+};
+
+static int pt_kunit_priv_init(struct kunit *test, struct kunit_iommu_priv *priv)
+{
+	unsigned int va_lg2sz;
+	int ret;
+
+	/* Enough so the memory allocator works */
+	priv->dummy_dev = kunit_device_register(test, "pt_kunit_dev");
+	if (IS_ERR(priv->dummy_dev))
+		return PTR_ERR(priv->dummy_dev);
+	set_dev_node(priv->dummy_dev, NUMA_NO_NODE);
+
+	spin_lock_init(&priv->top_lock);
+
+#ifdef kunit_fmt_cfgs
+	priv->cfg = kunit_fmt_cfgs[((uintptr_t)test->param_value) - 1];
+	/*
+	 * The format can set a list of features that the kunit_fmt_cfgs
+	 * controls; other features default to on.
+	 */
+	priv->cfg.common.features |= PT_SUPPORTED_FEATURES &
+				     (~KUNIT_FMT_FEATURES);
+#else
+	priv->cfg.common.features = PT_SUPPORTED_FEATURES;
+#endif
+
+	/* Defaults, for the kunit */
+	if (!priv->cfg.common.hw_max_vasz_lg2)
+		priv->cfg.common.hw_max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2;
+	if (!priv->cfg.common.hw_max_oasz_lg2)
+		priv->cfg.common.hw_max_oasz_lg2 = pt_max_oa_lg2(NULL);
+
+	priv->fmt_table.iommu.nid = NUMA_NO_NODE;
+	priv->fmt_table.iommu.driver_ops = &pt_kunit_driver_ops;
+	priv->fmt_table.iommu.iommu_device = priv->dummy_dev;
+	priv->domain.ops = &kunit_pt_ops;
+	ret = pt_iommu_init(&priv->fmt_table, &priv->cfg, GFP_KERNEL);
+	if (ret) {
+		if (ret == -EOVERFLOW)
+			kunit_skip(
+				test,
+				"This configuration cannot be tested on 32 bit");
+		return ret;
+	}
+
+	priv->iommu = &priv->fmt_table.iommu;
+	priv->common = common_from_iommu(&priv->fmt_table.iommu);
+	priv->iommu->ops->get_info(priv->iommu, &priv->info);
+
+	/*
+	 * size_t is used to pass the mapping length; it can be 32 bit, so
+	 * truncate the pagesizes so we don't use large sizes.
+ */ + priv->info.pgsize_bitmap = (size_t)priv->info.pgsize_bitmap; + + priv->smallest_pgsz_lg2 = vaffs(priv->info.pgsize_bitmap); + priv->smallest_pgsz = log2_to_int(priv->smallest_pgsz_lg2); + priv->largest_pgsz_lg2 = + vafls((dma_addr_t)priv->info.pgsize_bitmap) - 1; + + priv->test_oa = + oalog2_mod(0x74a71445deadbeef, priv->common->max_oasz_lg2); + + /* + * We run out of VA space if the mappings get too big, make something + * smaller that can safely pass through dma_addr_t API. + */ + va_lg2sz = priv->common->max_vasz_lg2; + if (IS_32BIT && va_lg2sz > 32) + va_lg2sz = 32; + priv->safe_pgsize_bitmap = + log2_mod(priv->info.pgsize_bitmap, va_lg2sz - 1); + + return 0; +} + +#endif diff --git a/drivers/iommu/generic_pt/kunit_iommu_pt.h b/drivers/iommu/generic_pt/kunit_iommu_pt.h new file mode 100644 index 000000000000..e8a63c8ea850 --- /dev/null +++ b/drivers/iommu/generic_pt/kunit_iommu_pt.h @@ -0,0 +1,487 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ +#include "kunit_iommu.h" +#include "pt_iter.h" +#include <linux/generic_pt/iommu.h> +#include <linux/iommu.h> + +static void do_map(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa, + pt_vaddr_t len); + +struct count_valids { + u64 per_size[PT_VADDR_MAX_LG2]; +}; + +static int __count_valids(struct pt_range *range, void *arg, unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct count_valids *valids = arg; + + for_each_pt_level_entry(&pts) { + if (pts.type == PT_ENTRY_TABLE) { + pt_descend(&pts, arg, __count_valids); + continue; + } + if (pts.type == PT_ENTRY_OA) { + valids->per_size[pt_entry_oa_lg2sz(&pts)]++; + continue; + } + } + return 0; +} + +/* + * Number of valid table entries. This counts contiguous entries as a single + * valid. 
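+ * For example, assuming a format where a 2M entry is built from 512
+ * contiguous 4K items, such an entry bumps per_size[21] once instead of
+ * counting 512 times.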
+ */ +static unsigned int count_valids(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range range = pt_top_range(priv->common); + struct count_valids valids = {}; + u64 total = 0; + unsigned int i; + + KUNIT_ASSERT_NO_ERRNO(test, + pt_walk_range(&range, __count_valids, &valids)); + + for (i = 0; i != ARRAY_SIZE(valids.per_size); i++) + total += valids.per_size[i]; + return total; +} + +/* Only a single page size is present, count the number of valid entries */ +static unsigned int count_valids_single(struct kunit *test, pt_vaddr_t pgsz) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range range = pt_top_range(priv->common); + struct count_valids valids = {}; + u64 total = 0; + unsigned int i; + + KUNIT_ASSERT_NO_ERRNO(test, + pt_walk_range(&range, __count_valids, &valids)); + + for (i = 0; i != ARRAY_SIZE(valids.per_size); i++) { + if ((1ULL << i) == pgsz) + total = valids.per_size[i]; + else + KUNIT_ASSERT_EQ(test, valids.per_size[i], 0); + } + return total; +} + +static void do_unmap(struct kunit *test, pt_vaddr_t va, pt_vaddr_t len) +{ + struct kunit_iommu_priv *priv = test->priv; + size_t ret; + + ret = iommu_unmap(&priv->domain, va, len); + KUNIT_ASSERT_EQ(test, ret, len); +} + +static void check_iova(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa, + pt_vaddr_t len) +{ + struct kunit_iommu_priv *priv = test->priv; + pt_vaddr_t pfn = log2_div(va, priv->smallest_pgsz_lg2); + pt_vaddr_t end_pfn = pfn + log2_div(len, priv->smallest_pgsz_lg2); + + for (; pfn != end_pfn; pfn++) { + phys_addr_t res = iommu_iova_to_phys(&priv->domain, + pfn * priv->smallest_pgsz); + + KUNIT_ASSERT_EQ(test, res, (phys_addr_t)pa); + if (res != pa) + break; + pa += priv->smallest_pgsz; + } +} + +static void test_increase_level(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_common *common = priv->common; + + if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP)) + kunit_skip(test, "PT_FEAT_DYNAMIC_TOP not set for this format"); + + if (IS_32BIT) + kunit_skip(test, "Unable to test on 32bit"); + + KUNIT_ASSERT_GT(test, common->max_vasz_lg2, + pt_top_range(common).max_vasz_lg2); + + /* Add every possible level to the max */ + while (common->max_vasz_lg2 != pt_top_range(common).max_vasz_lg2) { + struct pt_range top_range = pt_top_range(common); + + if (top_range.va == 0) + do_map(test, top_range.last_va + 1, 0, + priv->smallest_pgsz); + else + do_map(test, top_range.va - priv->smallest_pgsz, 0, + priv->smallest_pgsz); + + KUNIT_ASSERT_EQ(test, pt_top_range(common).top_level, + top_range.top_level + 1); + KUNIT_ASSERT_GE(test, common->max_vasz_lg2, + pt_top_range(common).max_vasz_lg2); + } +} + +static void test_map_simple(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range range = pt_top_range(priv->common); + struct count_valids valids = {}; + pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap; + unsigned int pgsz_lg2; + pt_vaddr_t cur_va; + + /* Map every reported page size */ + cur_va = range.va + priv->smallest_pgsz * 256; + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2); + u64 len = log2_to_int(pgsz_lg2); + + if (!(pgsize_bitmap & len)) + continue; + + cur_va = ALIGN(cur_va, len); + do_map(test, cur_va, paddr, len); + if (len <= SZ_2G) + check_iova(test, cur_va, paddr, len); + cur_va += len; + } + + /* The read interface reports that every page size was created */ + range = pt_top_range(priv->common); + 
KUNIT_ASSERT_NO_ERRNO(test, + pt_walk_range(&range, __count_valids, &valids)); + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + if (pgsize_bitmap & (1ULL << pgsz_lg2)) + KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 1); + else + KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 0); + } + + /* Unmap works */ + range = pt_top_range(priv->common); + cur_va = range.va + priv->smallest_pgsz * 256; + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + u64 len = log2_to_int(pgsz_lg2); + + if (!(pgsize_bitmap & len)) + continue; + cur_va = ALIGN(cur_va, len); + do_unmap(test, cur_va, len); + cur_va += len; + } + KUNIT_ASSERT_EQ(test, count_valids(test), 0); +} + +/* + * Test to convert a table pointer into an OA by mapping something small, + * unmapping it so as to leave behind a table pointer, then mapping something + * larger that will convert the table into an OA. + */ +static void test_map_table_to_oa(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + pt_vaddr_t limited_pgbitmap = + priv->info.pgsize_bitmap % (IS_32BIT ? SZ_2G : SZ_16G); + struct pt_range range = pt_top_range(priv->common); + unsigned int pgsz_lg2; + pt_vaddr_t max_pgsize; + pt_vaddr_t cur_va; + + max_pgsize = 1ULL << (vafls(limited_pgbitmap) - 1); + KUNIT_ASSERT_TRUE(test, priv->info.pgsize_bitmap & max_pgsize); + + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2); + u64 len = log2_to_int(pgsz_lg2); + pt_vaddr_t offset; + + if (!(priv->info.pgsize_bitmap & len)) + continue; + if (len > max_pgsize) + break; + + cur_va = ALIGN(range.va + priv->smallest_pgsz * 256, + max_pgsize); + for (offset = 0; offset != max_pgsize; offset += len) + do_map(test, cur_va + offset, paddr + offset, len); + check_iova(test, cur_va, paddr, max_pgsize); + KUNIT_ASSERT_EQ(test, count_valids_single(test, len), + log2_div(max_pgsize, pgsz_lg2)); + + if (len == max_pgsize) { + do_unmap(test, cur_va, max_pgsize); + } else { + do_unmap(test, cur_va, max_pgsize / 2); + for (offset = max_pgsize / 2; offset != max_pgsize; + offset += len) + do_unmap(test, cur_va + offset, len); + } + + KUNIT_ASSERT_EQ(test, count_valids(test), 0); + } +} + +/* + * Test unmapping a small page at the start of a large page. This always unmaps + * the large page. 
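+ * E.g. with a 4K base_len inside a 2M next_len mapping, the iommu_unmap()
+ * below is expected to return 2M: the format cannot split a large entry,
+ * so the whole entry is removed.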
+ */
+static void test_unmap_split(struct kunit *test)
+{
+	struct kunit_iommu_priv *priv = test->priv;
+	struct pt_range top_range = pt_top_range(priv->common);
+	pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap;
+	unsigned int pgsz_lg2;
+	unsigned int count = 0;
+
+	for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
+		pt_vaddr_t base_len = log2_to_int(pgsz_lg2);
+		unsigned int next_pgsz_lg2;
+
+		if (!(pgsize_bitmap & base_len))
+			continue;
+
+		for (next_pgsz_lg2 = pgsz_lg2 + 1;
+		     next_pgsz_lg2 != PT_VADDR_MAX_LG2; next_pgsz_lg2++) {
+			pt_vaddr_t next_len = log2_to_int(next_pgsz_lg2);
+			pt_vaddr_t vaddr = top_range.va;
+			pt_oaddr_t paddr = 0;
+			size_t gnmapped;
+
+			if (!(pgsize_bitmap & next_len))
+				continue;
+
+			do_map(test, vaddr, paddr, next_len);
+			gnmapped = iommu_unmap(&priv->domain, vaddr, base_len);
+			KUNIT_ASSERT_EQ(test, gnmapped, next_len);
+
+			/* Make sure unmap doesn't keep going */
+			do_map(test, vaddr, paddr, next_len);
+			do_map(test, vaddr + next_len, paddr, next_len);
+			gnmapped = iommu_unmap(&priv->domain, vaddr, base_len);
+			KUNIT_ASSERT_EQ(test, gnmapped, next_len);
+			gnmapped = iommu_unmap(&priv->domain, vaddr + next_len,
+					       next_len);
+			KUNIT_ASSERT_EQ(test, gnmapped, next_len);
+
+			count++;
+		}
+	}
+
+	if (count == 0)
+		kunit_skip(test, "Test needs two page sizes");
+}
+
+static void unmap_collisions(struct kunit *test, struct maple_tree *mt,
+			     pt_vaddr_t start, pt_vaddr_t last)
+{
+	struct kunit_iommu_priv *priv = test->priv;
+	MA_STATE(mas, mt, start, last);
+	void *entry;
+
+	mtree_lock(mt);
+	mas_for_each(&mas, entry, last) {
+		pt_vaddr_t mas_start = mas.index;
+		pt_vaddr_t len = (mas.last - mas_start) + 1;
+		pt_oaddr_t paddr;
+
+		mas_erase(&mas);
+		mas_pause(&mas);
+		mtree_unlock(mt);
+
+		paddr = oalog2_mod(mas_start, priv->common->max_oasz_lg2);
+		check_iova(test, mas_start, paddr, len);
+		do_unmap(test, mas_start, len);
+		mtree_lock(mt);
+	}
+	mtree_unlock(mt);
+}
+
+static void clamp_range(struct kunit *test, struct pt_range *range)
+{
+	struct kunit_iommu_priv *priv = test->priv;
+
+	if (range->last_va - range->va > SZ_1G)
+		range->last_va = range->va + SZ_1G;
+	KUNIT_ASSERT_NE(test, range->last_va, PT_VADDR_MAX);
+	if (range->va <= MAPLE_RESERVED_RANGE)
+		range->va = ALIGN(MAPLE_RESERVED_RANGE, priv->smallest_pgsz);
+}
+
+/*
+ * Randomly map and unmap ranges that can use large physical pages. If a random
+ * range overlaps with existing ranges then unmap them. This hits all the
+ * special cases.
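+ * The maple tree mirrors every mapping the test currently has live, so a
+ * colliding overmap (-EADDRINUSE) can be resolved by unmapping exactly
+ * the overlapping ranges and retrying.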
+ */ +static void test_random_map(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range upper_range = pt_upper_range(priv->common); + struct pt_range top_range = pt_top_range(priv->common); + struct maple_tree mt; + unsigned int iter; + + mt_init(&mt); + + /* + * Shrink the range so randomization is more likely to have + * intersections + */ + clamp_range(test, &top_range); + clamp_range(test, &upper_range); + + for (iter = 0; iter != 1000; iter++) { + struct pt_range *range = &top_range; + pt_oaddr_t paddr; + pt_vaddr_t start; + pt_vaddr_t end; + int ret; + + if (pt_feature(priv->common, PT_FEAT_SIGN_EXTEND) && + ULONG_MAX >= PT_VADDR_MAX && get_random_u32_inclusive(0, 1)) + range = &upper_range; + + start = get_random_u32_below( + min(U32_MAX, range->last_va - range->va)); + end = get_random_u32_below( + min(U32_MAX, range->last_va - start)); + + start = ALIGN_DOWN(start, priv->smallest_pgsz); + end = ALIGN(end, priv->smallest_pgsz); + start += range->va; + end += start; + if (start < range->va || end > range->last_va + 1 || + start >= end) + continue; + + /* Try overmapping to test the failure handling */ + paddr = oalog2_mod(start, priv->common->max_oasz_lg2); + ret = iommu_map(&priv->domain, start, paddr, end - start, + IOMMU_READ | IOMMU_WRITE, GFP_KERNEL); + if (ret) { + KUNIT_ASSERT_EQ(test, ret, -EADDRINUSE); + unmap_collisions(test, &mt, start, end - 1); + do_map(test, start, paddr, end - start); + } + + KUNIT_ASSERT_NO_ERRNO_FN(test, "mtree_insert_range", + mtree_insert_range(&mt, start, end - 1, + XA_ZERO_ENTRY, + GFP_KERNEL)); + + check_iova(test, start, paddr, end - start); + if (iter % 100) + cond_resched(); + } + + unmap_collisions(test, &mt, 0, PT_VADDR_MAX); + KUNIT_ASSERT_EQ(test, count_valids(test), 0); + + mtree_destroy(&mt); +} + +/* See https://lore.kernel.org/r/b9b18a03-63a2-4065-a27e-d92dd5c860bc@amd.com */ +static void test_pgsize_boundary(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range top_range = pt_top_range(priv->common); + + if (top_range.va != 0 || top_range.last_va < 0xfef9ffff || + priv->smallest_pgsz != SZ_4K) + kunit_skip(test, "Format does not have the required range"); + + do_map(test, 0xfef80000, 0x208b95d000, 0xfef9ffff - 0xfef80000 + 1); +} + +/* See https://lore.kernel.org/r/20250826143816.38686-1-eugkoira@amazon.com */ +static void test_mixed(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range top_range = pt_top_range(priv->common); + u64 start = 0x3fe400ULL << 12; + u64 end = 0x4c0600ULL << 12; + pt_vaddr_t len = end - start; + pt_oaddr_t oa = start; + + if (top_range.last_va <= start || sizeof(unsigned long) == 4) + kunit_skip(test, "range is too small"); + if ((priv->safe_pgsize_bitmap & GENMASK(30, 21)) != (BIT(30) | BIT(21))) + kunit_skip(test, "incompatible psize"); + + do_map(test, start, oa, len); + /* 14 2M, 3 1G, 3 2M */ + KUNIT_ASSERT_EQ(test, count_valids(test), 20); + check_iova(test, start, oa, len); +} + +static struct kunit_case iommu_test_cases[] = { + KUNIT_CASE_FMT(test_increase_level), + KUNIT_CASE_FMT(test_map_simple), + KUNIT_CASE_FMT(test_map_table_to_oa), + KUNIT_CASE_FMT(test_unmap_split), + KUNIT_CASE_FMT(test_random_map), + KUNIT_CASE_FMT(test_pgsize_boundary), + KUNIT_CASE_FMT(test_mixed), + {}, +}; + +static int pt_kunit_iommu_init(struct kunit *test) +{ + struct kunit_iommu_priv *priv; + int ret; + + priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + 
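+	/*
+	 * Snapshot the secondary page table counter so the exit hook can
+	 * verify that the format freed all of its table memory.
+	 */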
priv->orig_nr_secondary_pagetable = + global_node_page_state(NR_SECONDARY_PAGETABLE); + ret = pt_kunit_priv_init(test, priv); + if (ret) { + kunit_kfree(test, priv); + return ret; + } + test->priv = priv; + return 0; +} + +static void pt_kunit_iommu_exit(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + + if (!test->priv) + return; + + pt_iommu_deinit(priv->iommu); + /* + * Look for memory leaks, assumes kunit is running isolated and nothing + * else is using secondary page tables. + */ + KUNIT_ASSERT_EQ(test, priv->orig_nr_secondary_pagetable, + global_node_page_state(NR_SECONDARY_PAGETABLE)); + kunit_kfree(test, test->priv); +} + +static struct kunit_suite NS(iommu_suite) = { + .name = __stringify(NS(iommu_test)), + .init = pt_kunit_iommu_init, + .exit = pt_kunit_iommu_exit, + .test_cases = iommu_test_cases, +}; +kunit_test_suites(&NS(iommu_suite)); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Kunit for generic page table"); +MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); diff --git a/drivers/iommu/generic_pt/pt_common.h b/drivers/iommu/generic_pt/pt_common.h new file mode 100644 index 000000000000..e1123d35c907 --- /dev/null +++ b/drivers/iommu/generic_pt/pt_common.h @@ -0,0 +1,389 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * This header is included after the format. It contains definitions + * that build on the format definitions to create the basic format API. + * + * The format API is listed here, with kdocs. The functions without bodies are + * implemented in the format using the pattern: + * static inline FMTpt_XXX(..) {..} + * #define pt_XXX FMTpt_XXX + * + * If the format doesn't implement a function then pt_fmt_defaults.h can provide + * a generic version. + * + * The routines marked "@pts: Entry to query" operate on the entire contiguous + * entry and can be called with a pts->index pointing to any sub item that makes + * up that entry. + * + * The header order is: + * pt_defs.h + * FMT.h + * pt_common.h + */ +#ifndef __GENERIC_PT_PT_COMMON_H +#define __GENERIC_PT_PT_COMMON_H + +#include "pt_defs.h" +#include "pt_fmt_defaults.h" + +/** + * pt_attr_from_entry() - Convert the permission bits back to attrs + * @pts: Entry to convert from + * @attrs: Resulting attrs + * + * Fill in the attrs with the permission bits encoded in the current leaf entry. + * The attrs should be usable with pt_install_leaf_entry() to reconstruct the + * same entry. + */ +static inline void pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs); + +/** + * pt_can_have_leaf() - True if the current level can have an OA entry + * @pts: The current level + * + * True if the current level can support pt_install_leaf_entry(). A leaf + * entry produce an OA. + */ +static inline bool pt_can_have_leaf(const struct pt_state *pts); + +/** + * pt_can_have_table() - True if the current level can have a lower table + * @pts: The current level + * + * Every level except 0 is allowed to have a lower table. + */ +static inline bool pt_can_have_table(const struct pt_state *pts) +{ + /* No further tables at level 0 */ + return pts->level > 0; +} + +/** + * pt_clear_entries() - Make entries empty (non-present) + * @pts: Starting table index + * @num_contig_lg2: Number of contiguous items to clear + * + * Clear a run of entries. A cleared entry will load back as PT_ENTRY_EMPTY + * and does not have any effect on table walking. The starting index must be + * aligned to num_contig_lg2. 
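+ *
+ * A minimal usage sketch, clearing the single (non-contiguous) entry
+ * that pts currently points at:
+ *
+ *	pt_load_single_entry(pts);
+ *	pt_clear_entries(pts, ilog2(1));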
+ */ +static inline void pt_clear_entries(struct pt_state *pts, + unsigned int num_contig_lg2); + +/** + * pt_entry_make_write_dirty() - Make an entry dirty + * @pts: Table entry to change + * + * Make pt_entry_is_write_dirty() return true for this entry. This can be called + * asynchronously with any other table manipulation under a RCU lock and must + * not corrupt the table. + */ +static inline bool pt_entry_make_write_dirty(struct pt_state *pts); + +/** + * pt_entry_make_write_clean() - Make the entry write clean + * @pts: Table entry to change + * + * Modify the entry so that pt_entry_is_write_dirty() == false. The HW will + * eventually be notified of this change via a TLB flush, which is the point + * that the HW must become synchronized. Any "write dirty" prior to the TLB + * flush can be lost, but once the TLB flush completes all writes must make + * their entries write dirty. + * + * The format should alter the entry in a way that is compatible with any + * concurrent update from HW. The entire contiguous entry is changed. + */ +static inline void pt_entry_make_write_clean(struct pt_state *pts); + +/** + * pt_entry_is_write_dirty() - True if the entry has been written to + * @pts: Entry to query + * + * "write dirty" means that the HW has written to the OA translated + * by this entry. If the entry is contiguous then the consolidated + * "write dirty" for all the items must be returned. + */ +static inline bool pt_entry_is_write_dirty(const struct pt_state *pts); + +/** + * pt_dirty_supported() - True if the page table supports dirty tracking + * @common: Page table to query + */ +static inline bool pt_dirty_supported(struct pt_common *common); + +/** + * pt_entry_num_contig_lg2() - Number of contiguous items for this leaf entry + * @pts: Entry to query + * + * Return the number of contiguous items this leaf entry spans. If the entry + * is single item it returns ilog2(1). + */ +static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts); + +/** + * pt_entry_oa() - Output Address for this leaf entry + * @pts: Entry to query + * + * Return the output address for the start of the entry. If the entry + * is contiguous this returns the same value for each sub-item. I.e.:: + * + * log2_mod(pt_entry_oa(), pt_entry_oa_lg2sz()) == 0 + * + * See pt_item_oa(). The format should implement one of these two functions + * depending on how it stores the OAs in the table. + */ +static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts); + +/** + * pt_entry_oa_lg2sz() - Return the size of an OA entry + * @pts: Entry to query + * + * If the entry is not contiguous this returns pt_table_item_lg2sz(), otherwise + * it returns the total VA/OA size of the entire contiguous entry. + */ +static inline unsigned int pt_entry_oa_lg2sz(const struct pt_state *pts) +{ + return pt_entry_num_contig_lg2(pts) + pt_table_item_lg2sz(pts); +} + +/** + * pt_entry_oa_exact() - Return the complete OA for an entry + * @pts: Entry to query + * + * During iteration the first entry could have a VA with an offset from the + * natural start of the entry. Return the exact OA including the pts's VA + * offset. + */ +static inline pt_oaddr_t pt_entry_oa_exact(const struct pt_state *pts) +{ + return _pt_entry_oa_fast(pts) | + log2_mod(pts->range->va, pt_entry_oa_lg2sz(pts)); +} + +/** + * pt_full_va_prefix() - The top bits of the VA + * @common: Page table to query + * + * This is usually 0, but some formats have their VA space going downward from + * PT_VADDR_MAX, and will return that instead. 
This value must always be + * adjusted by struct pt_common max_vasz_lg2. + */ +static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common); + +/** + * pt_has_system_page_size() - True if level 0 can install a PAGE_SHIFT entry + * @common: Page table to query + * + * If true the caller can use, at level 0, pt_install_leaf_entry(PAGE_SHIFT). + * This is useful to create optimized paths for common cases of PAGE_SIZE + * mappings. + */ +static inline bool pt_has_system_page_size(const struct pt_common *common); + +/** + * pt_install_leaf_entry() - Write a leaf entry to the table + * @pts: Table index to change + * @oa: Output Address for this leaf + * @oasz_lg2: Size in VA/OA for this leaf + * @attrs: Attributes to modify the entry + * + * A leaf OA entry will return PT_ENTRY_OA from pt_load_entry(). It translates + * the VA indicated by pts to the given OA. + * + * For a single item non-contiguous entry oasz_lg2 is pt_table_item_lg2sz(). + * For contiguous it is pt_table_item_lg2sz() + num_contig_lg2. + * + * This must not be called if pt_can_have_leaf() == false. Contiguous sizes + * not indicated by pt_possible_sizes() must not be specified. + */ +static inline void pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs); + +/** + * pt_install_table() - Write a table entry to the table + * @pts: Table index to change + * @table_pa: CPU physical address of the lower table's memory + * @attrs: Attributes to modify the table index + * + * A table entry will return PT_ENTRY_TABLE from pt_load_entry(). The table_pa + * is the table at pts->level - 1. This is done by cmpxchg so pts must have the + * current entry loaded. The pts is updated with the installed entry. + * + * This must not be called if pt_can_have_table() == false. + * + * Returns: true if the table was installed successfully. + */ +static inline bool pt_install_table(struct pt_state *pts, pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs); + +/** + * pt_item_oa() - Output Address for this leaf item + * @pts: Item to query + * + * Return the output address for this item. If the item is part of a contiguous + * entry it returns the value of the OA for this individual sub item. + * + * See pt_entry_oa(). The format should implement one of these two functions + * depending on how it stores the OA's in the table. + */ +static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts); + +/** + * pt_load_entry_raw() - Read from the location pts points at into the pts + * @pts: Table index to load + * + * Return the type of entry that was loaded. pts->entry will be filled in with + * the entry's content. See pt_load_entry() + */ +static inline enum pt_entry_type pt_load_entry_raw(struct pt_state *pts); + +/** + * pt_max_oa_lg2() - Return the maximum OA the table format can hold + * @common: Page table to query + * + * The value oalog2_to_max_int(pt_max_oa_lg2()) is the MAX for the + * OA. This is the absolute maximum address the table can hold. struct pt_common + * max_oasz_lg2 sets a lower dynamic maximum based on HW capability. + */ +static inline unsigned int +pt_max_oa_lg2(const struct pt_common *common); + +/** + * pt_num_items_lg2() - Return the number of items in this table level + * @pts: The current level + * + * The number of items in a table level defines the number of bits this level + * decodes from the VA. This function is not called for the top level, + * so it does not need to compute a special value for the top case. 
The + * result for the top is based on pt_common max_vasz_lg2. + * + * The value is used as part of determining the table indexes via the + * equation:: + * + * log2_mod(log2_div(VA, pt_table_item_lg2sz()), pt_num_items_lg2()) + */ +static inline unsigned int pt_num_items_lg2(const struct pt_state *pts); + +/** + * pt_pgsz_lg2_to_level - Return the level that maps the page size + * @common: Page table to query + * @pgsize_lg2: Log2 page size + * + * Returns the table level that will map the given page size. The page + * size must be part of the pt_possible_sizes() for some level. + */ +static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common, + unsigned int pgsize_lg2); + +/** + * pt_possible_sizes() - Return a bitmap of possible output sizes at this level + * @pts: The current level + * + * Each level has a list of possible output sizes that can be installed as + * leaf entries. If pt_can_have_leaf() is false returns zero. + * + * Otherwise the bit in position pt_table_item_lg2sz() should be set indicating + * that a non-contiguous single item leaf entry is supported. The following + * pt_num_items_lg2() number of bits can be set indicating contiguous entries + * are supported. Bit pt_table_item_lg2sz() + pt_num_items_lg2() must not be + * set, contiguous entries cannot span the entire table. + * + * The OR of pt_possible_sizes() of all levels is the typical bitmask of all + * supported sizes in the entire table. + */ +static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts); + +/** + * pt_table_item_lg2sz() - Size of a single item entry in this table level + * @pts: The current level + * + * The size of the item specifies how much VA and OA a single item occupies. + * + * See pt_entry_oa_lg2sz() for the same value including the effect of contiguous + * entries. + */ +static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts); + +/** + * pt_table_oa_lg2sz() - Return the VA/OA size of the entire table + * @pts: The current level + * + * Return the size of VA decoded by the entire table level. + */ +static inline unsigned int pt_table_oa_lg2sz(const struct pt_state *pts) +{ + if (pts->range->top_level == pts->level) + return pts->range->max_vasz_lg2; + return min_t(unsigned int, pts->range->common->max_vasz_lg2, + pt_num_items_lg2(pts) + pt_table_item_lg2sz(pts)); +} + +/** + * pt_table_pa() - Return the CPU physical address of the table entry + * @pts: Entry to query + * + * This is only ever called on PT_ENTRY_TABLE entries. Must return the same + * value passed to pt_install_table(). + */ +static inline pt_oaddr_t pt_table_pa(const struct pt_state *pts); + +/** + * pt_table_ptr() - Return a CPU pointer for a table item + * @pts: Entry to query + * + * Same as pt_table_pa() but returns a CPU pointer. + */ +static inline struct pt_table_p *pt_table_ptr(const struct pt_state *pts) +{ + return __va(pt_table_pa(pts)); +} + +/** + * pt_max_sw_bit() - Return the maximum software bit usable for any level and + * entry + * @common: Page table + * + * The swbit can be passed as bitnr to the other sw_bit functions. + */ +static inline unsigned int pt_max_sw_bit(struct pt_common *common); + +/** + * pt_test_sw_bit_acquire() - Read a software bit in an item + * @pts: Entry to read + * @bitnr: Bit to read + * + * Software bits are ignored by HW and can be used for any purpose by the + * software. This does a test bit and acquire operation. 
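+ *
+ * Intended pairing sketch: a writer publishes data guarded by the bit
+ * with pt_set_sw_bit_release(); a reader that then observes
+ * pt_test_sw_bit_acquire() == true is guaranteed to also observe that
+ * data.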
+ */ +static inline bool pt_test_sw_bit_acquire(struct pt_state *pts, + unsigned int bitnr); + +/** + * pt_set_sw_bit_release() - Set a software bit in an item + * @pts: Entry to set + * @bitnr: Bit to set + * + * Software bits are ignored by HW and can be used for any purpose by the + * software. This does a set bit and release operation. + */ +static inline void pt_set_sw_bit_release(struct pt_state *pts, + unsigned int bitnr); + +/** + * pt_load_entry() - Read from the location pts points at into the pts + * @pts: Table index to load + * + * Set the type of entry that was loaded. pts->entry and pts->table_lower + * will be filled in with the entry's content. + */ +static inline void pt_load_entry(struct pt_state *pts) +{ + pts->type = pt_load_entry_raw(pts); + if (pts->type == PT_ENTRY_TABLE) + pts->table_lower = pt_table_ptr(pts); +} +#endif diff --git a/drivers/iommu/generic_pt/pt_defs.h b/drivers/iommu/generic_pt/pt_defs.h new file mode 100644 index 000000000000..c25544d72f97 --- /dev/null +++ b/drivers/iommu/generic_pt/pt_defs.h @@ -0,0 +1,332 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * This header is included before the format. It contains definitions + * that are required to compile the format. The header order is: + * pt_defs.h + * fmt_XX.h + * pt_common.h + */ +#ifndef __GENERIC_PT_DEFS_H +#define __GENERIC_PT_DEFS_H + +#include <linux/generic_pt/common.h> + +#include <linux/types.h> +#include <linux/atomic.h> +#include <linux/bits.h> +#include <linux/limits.h> +#include <linux/bug.h> +#include <linux/kconfig.h> +#include "pt_log2.h" + +/* Header self-compile default defines */ +#ifndef pt_write_attrs +typedef u64 pt_vaddr_t; +typedef u64 pt_oaddr_t; +#endif + +struct pt_table_p; + +enum { + PT_VADDR_MAX = sizeof(pt_vaddr_t) == 8 ? U64_MAX : U32_MAX, + PT_VADDR_MAX_LG2 = sizeof(pt_vaddr_t) == 8 ? 64 : 32, + PT_OADDR_MAX = sizeof(pt_oaddr_t) == 8 ? U64_MAX : U32_MAX, + PT_OADDR_MAX_LG2 = sizeof(pt_oaddr_t) == 8 ? 64 : 32, +}; + +/* + * The format instantiation can have features wired off or on to optimize the + * code gen. Supported features are just a reflection of what the current set of + * kernel users want to use. + */ +#ifndef PT_SUPPORTED_FEATURES +#define PT_SUPPORTED_FEATURES 0 +#endif + +/* + * When in debug mode we compile all formats with all features. This allows the + * kunit to test the full matrix. SIGN_EXTEND can't co-exist with DYNAMIC_TOP or + * FULL_VA. DMA_INCOHERENT requires a SW bit that not all formats have + */ +#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) +enum { + PT_ORIG_SUPPORTED_FEATURES = PT_SUPPORTED_FEATURES, + PT_DEBUG_SUPPORTED_FEATURES = + UINT_MAX & + ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_DMA_INCOHERENT) ? + 0 : + BIT(PT_FEAT_DMA_INCOHERENT))) & + ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_SIGN_EXTEND)) ? + BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_FULL_VA) : + BIT(PT_FEAT_SIGN_EXTEND)), +}; +#undef PT_SUPPORTED_FEATURES +#define PT_SUPPORTED_FEATURES PT_DEBUG_SUPPORTED_FEATURES +#endif + +#ifndef PT_FORCE_ENABLED_FEATURES +#define PT_FORCE_ENABLED_FEATURES 0 +#endif + +/** + * DOC: Generic Page Table Language + * + * Language used in Generic Page Table + * VA + * The input address to the page table, often the virtual address. + * OA + * The output address from the page table, often the physical address. + * leaf + * An entry that results in an output address. + * start/end + * An half-open range, e.g. [0,0) refers to no VA. 
+ * start/last + * An inclusive closed range, e.g. [0,0] refers to the VA 0 + * common + * The generic page table container struct pt_common + * level + * Level 0 is always a table of only leaves with no futher table pointers. + * Increasing levels increase the size of the table items. The least + * significant VA bits used to index page tables are used to index the Level + * 0 table. The various labels for table levels used by HW descriptions are + * not used. + * top_level + * The inclusive highest level of the table. A two-level table + * has a top level of 1. + * table + * A linear array of translation items for that level. + * index + * The position in a table of an element: item = table[index] + * item + * A single index in a table + * entry + * A single logical element in a table. If contiguous pages are not + * supported then item and entry are the same thing, otherwise entry refers + * to all the items that comprise a single contiguous translation. + * item/entry_size + * The number of bytes of VA the table index translates for. + * If the item is a table entry then the next table covers + * this size. If the entry translates to an output address then the + * full OA is: OA | (VA % entry_size) + * contig_count + * The number of consecutive items fused into a single entry. + * item_size * contig_count is the size of that entry's translation. + * lg2 + * Indicates the value is encoded as log2, i.e. 1<<x is the actual value. + * Normally the compiler is fine to optimize divide and mod with log2 values + * automatically when inlining, however if the values are not constant + * expressions it can't. So we do it by hand; we want to avoid 64-bit + * divmod. + */ + +/* Returned by pt_load_entry() and for_each_pt_level_entry() */ +enum pt_entry_type { + PT_ENTRY_EMPTY, + /* Entry is valid and points to a lower table level */ + PT_ENTRY_TABLE, + /* Entry is valid and returns an output address */ + PT_ENTRY_OA, +}; + +struct pt_range { + struct pt_common *common; + struct pt_table_p *top_table; + pt_vaddr_t va; + pt_vaddr_t last_va; + u8 top_level; + u8 max_vasz_lg2; +}; + +/* + * Similar to xa_state, this records information about an in-progress parse at a + * single level. + */ +struct pt_state { + struct pt_range *range; + struct pt_table_p *table; + struct pt_table_p *table_lower; + u64 entry; + enum pt_entry_type type; + unsigned short index; + unsigned short end_index; + u8 level; +}; + +#define pt_cur_table(pts, type) ((type *)((pts)->table)) + +/* + * Try to install a new table pointer. The locking methodology requires this to + * be atomic (multiple threads can race to install a pointer). The losing + * threads will fail the atomic and return false. They should free any memory + * and reparse the table level again. + */ +#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64) +static inline bool pt_table_install64(struct pt_state *pts, u64 table_entry) +{ + u64 *entryp = pt_cur_table(pts, u64) + pts->index; + u64 old_entry = pts->entry; + bool ret; + + /* + * Ensure the zero'd table content itself is visible before its PTE can + * be. release is a NOP on !SMP, but the HW is still doing an acquire. 
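+	 * Hence the explicit dma_wmb() on !SMP builds: it keeps the table
+	 * memory ordered before the PTE store as seen by the HW walker,
+	 * even though the CPU-side release barrier compiles away.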
+ */ + if (!IS_ENABLED(CONFIG_SMP)) + dma_wmb(); + ret = try_cmpxchg64_release(entryp, &old_entry, table_entry); + if (ret) + pts->entry = table_entry; + return ret; +} +#endif + +static inline bool pt_table_install32(struct pt_state *pts, u32 table_entry) +{ + u32 *entryp = pt_cur_table(pts, u32) + pts->index; + u32 old_entry = pts->entry; + bool ret; + + /* + * Ensure the zero'd table content itself is visible before its PTE can + * be. release is a NOP on !SMP, but the HW is still doing an acquire. + */ + if (!IS_ENABLED(CONFIG_SMP)) + dma_wmb(); + ret = try_cmpxchg_release(entryp, &old_entry, table_entry); + if (ret) + pts->entry = table_entry; + return ret; +} + +#define PT_SUPPORTED_FEATURE(feature_nr) (PT_SUPPORTED_FEATURES & BIT(feature_nr)) + +static inline bool pt_feature(const struct pt_common *common, + unsigned int feature_nr) +{ + if (PT_FORCE_ENABLED_FEATURES & BIT(feature_nr)) + return true; + if (!PT_SUPPORTED_FEATURE(feature_nr)) + return false; + return common->features & BIT(feature_nr); +} + +static inline bool pts_feature(const struct pt_state *pts, + unsigned int feature_nr) +{ + return pt_feature(pts->range->common, feature_nr); +} + +/* + * PT_WARN_ON is used for invariants that the kunit should be checking can't + * happen. + */ +#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) +#define PT_WARN_ON WARN_ON +#else +static inline bool PT_WARN_ON(bool condition) +{ + return false; +} +#endif + +/* These all work on the VA type */ +#define log2_to_int(a_lg2) log2_to_int_t(pt_vaddr_t, a_lg2) +#define log2_to_max_int(a_lg2) log2_to_max_int_t(pt_vaddr_t, a_lg2) +#define log2_div(a, b_lg2) log2_div_t(pt_vaddr_t, a, b_lg2) +#define log2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_vaddr_t, a, b, c_lg2) +#define log2_mod(a, b_lg2) log2_mod_t(pt_vaddr_t, a, b_lg2) +#define log2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_vaddr_t, a, b_lg2) +#define log2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_vaddr_t, a, val, b_lg2) +#define log2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_vaddr_t, a, b_lg2) +#define log2_mul(a, b_lg2) log2_mul_t(pt_vaddr_t, a, b_lg2) +#define vaffs(a) ffs_t(pt_vaddr_t, a) +#define vafls(a) fls_t(pt_vaddr_t, a) +#define vaffz(a) ffz_t(pt_vaddr_t, a) + +/* + * The full VA (fva) versions permit the lg2 value to be == PT_VADDR_MAX_LG2 and + * generate a useful defined result. The non-fva versions will malfunction at + * this extreme. 
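+ *
+ * E.g. fvalog2_div(a, PT_VADDR_MAX_LG2) returns 0, while the plain
+ * log2_div() would shift by the full bit width of pt_vaddr_t, which is
+ * undefined in C.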
+ */ +static inline pt_vaddr_t fvalog2_div(pt_vaddr_t a, unsigned int b_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) + return 0; + return log2_div_t(pt_vaddr_t, a, b_lg2); +} + +static inline pt_vaddr_t fvalog2_mod(pt_vaddr_t a, unsigned int b_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) + return a; + return log2_mod_t(pt_vaddr_t, a, b_lg2); +} + +static inline bool fvalog2_div_eq(pt_vaddr_t a, pt_vaddr_t b, + unsigned int c_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && c_lg2 == PT_VADDR_MAX_LG2) + return true; + return log2_div_eq_t(pt_vaddr_t, a, b, c_lg2); +} + +static inline pt_vaddr_t fvalog2_set_mod(pt_vaddr_t a, pt_vaddr_t val, + unsigned int b_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) + return val; + return log2_set_mod_t(pt_vaddr_t, a, val, b_lg2); +} + +static inline pt_vaddr_t fvalog2_set_mod_max(pt_vaddr_t a, unsigned int b_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) + return PT_VADDR_MAX; + return log2_set_mod_max_t(pt_vaddr_t, a, b_lg2); +} + +/* These all work on the OA type */ +#define oalog2_to_int(a_lg2) log2_to_int_t(pt_oaddr_t, a_lg2) +#define oalog2_to_max_int(a_lg2) log2_to_max_int_t(pt_oaddr_t, a_lg2) +#define oalog2_div(a, b_lg2) log2_div_t(pt_oaddr_t, a, b_lg2) +#define oalog2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_oaddr_t, a, b, c_lg2) +#define oalog2_mod(a, b_lg2) log2_mod_t(pt_oaddr_t, a, b_lg2) +#define oalog2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_oaddr_t, a, b_lg2) +#define oalog2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_oaddr_t, a, val, b_lg2) +#define oalog2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_oaddr_t, a, b_lg2) +#define oalog2_mul(a, b_lg2) log2_mul_t(pt_oaddr_t, a, b_lg2) +#define oaffs(a) ffs_t(pt_oaddr_t, a) +#define oafls(a) fls_t(pt_oaddr_t, a) +#define oaffz(a) ffz_t(pt_oaddr_t, a) + +static inline uintptr_t _pt_top_set(struct pt_table_p *table_mem, + unsigned int top_level) +{ + return top_level | (uintptr_t)table_mem; +} + +static inline void pt_top_set(struct pt_common *common, + struct pt_table_p *table_mem, + unsigned int top_level) +{ + WRITE_ONCE(common->top_of_table, _pt_top_set(table_mem, top_level)); +} + +static inline void pt_top_set_level(struct pt_common *common, + unsigned int top_level) +{ + pt_top_set(common, NULL, top_level); +} + +static inline unsigned int pt_top_get_level(const struct pt_common *common) +{ + return READ_ONCE(common->top_of_table) % (1 << PT_TOP_LEVEL_BITS); +} + +static inline bool pt_check_install_leaf_args(struct pt_state *pts, + pt_oaddr_t oa, + unsigned int oasz_lg2); + +#endif diff --git a/drivers/iommu/generic_pt/pt_fmt_defaults.h b/drivers/iommu/generic_pt/pt_fmt_defaults.h new file mode 100644 index 000000000000..69fb7c2314ca --- /dev/null +++ b/drivers/iommu/generic_pt/pt_fmt_defaults.h @@ -0,0 +1,295 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Default definitions for formats that don't define these functions. + */ +#ifndef __GENERIC_PT_PT_FMT_DEFAULTS_H +#define __GENERIC_PT_PT_FMT_DEFAULTS_H + +#include "pt_defs.h" +#include <linux/log2.h> + +/* Header self-compile default defines */ +#ifndef pt_load_entry_raw +#include "fmt/amdv1.h" +#endif + +/* + * The format must provide PT_GRANULE_LG2SZ, PT_TABLEMEM_LG2SZ, and + * PT_ITEM_WORD_SIZE. They must be the same at every level excluding the top. 
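+ *
+ * For instance, assuming a 4K granule (PT_GRANULE_LG2SZ == 12), 4K table
+ * memory (PT_TABLEMEM_LG2SZ == 12) and 8 byte items, each level decodes
+ * 9 VA bits and the default below evaluates to 12 + 9 * level.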
+ */ +#ifndef pt_table_item_lg2sz +static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts) +{ + return PT_GRANULE_LG2SZ + + (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE)) * pts->level; +} +#endif + +#ifndef pt_pgsz_lg2_to_level +static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common, + unsigned int pgsize_lg2) +{ + return ((unsigned int)(pgsize_lg2 - PT_GRANULE_LG2SZ)) / + (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE)); +} +#endif + +/* + * If not supplied by the format then contiguous pages are not supported. + * + * If contiguous pages are supported then the format must also provide + * pt_contig_count_lg2() if it supports a single contiguous size per level, + * or pt_possible_sizes() if it supports multiple sizes per level. + */ +#ifndef pt_entry_num_contig_lg2 +static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts) +{ + return ilog2(1); +} + +/* + * Return the log2 number of contiguous OA items forming an entry at this table + * level + */ +static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts) +{ + return ilog2(1); +} +#endif + +/* If not supplied by the format then dirty tracking is not supported */ +#ifndef pt_entry_is_write_dirty +static inline bool pt_entry_is_write_dirty(const struct pt_state *pts) +{ + return false; +} + +static inline void pt_entry_make_write_clean(struct pt_state *pts) +{ +} + +static inline bool pt_dirty_supported(struct pt_common *common) +{ + return false; +} +#else +/* If not supplied then dirty tracking is always enabled */ +#ifndef pt_dirty_supported +static inline bool pt_dirty_supported(struct pt_common *common) +{ + return true; +} +#endif +#endif + +#ifndef pt_entry_make_write_dirty +static inline bool pt_entry_make_write_dirty(struct pt_state *pts) +{ + return false; +} +#endif + +/* + * Format supplies either: + * pt_entry_oa - OA is at the start of a contiguous entry + * or + * pt_item_oa - OA is adjusted for every item in a contiguous entry + * + * Build the missing one. + * + * The internal helper _pt_entry_oa_fast() allows generating + * an efficient pt_entry_oa_exact(); it doesn't care which + * option is selected. + */ +#ifdef pt_entry_oa +static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts) +{ + return pt_entry_oa(pts) | + log2_mul(pts->index, pt_table_item_lg2sz(pts)); +} +#define _pt_entry_oa_fast pt_entry_oa +#endif + +#ifdef pt_item_oa +static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts) +{ + return log2_set_mod(pt_item_oa(pts), 0, + pt_entry_num_contig_lg2(pts) + + pt_table_item_lg2sz(pts)); +} +#define _pt_entry_oa_fast pt_item_oa +#endif + +/* + * If not supplied by the format then use the constant + * PT_MAX_OUTPUT_ADDRESS_LG2.
+ */ +#ifndef pt_max_oa_lg2 +static inline unsigned int +pt_max_oa_lg2(const struct pt_common *common) +{ + return PT_MAX_OUTPUT_ADDRESS_LG2; +} +#endif + +#ifndef pt_has_system_page_size +static inline bool pt_has_system_page_size(const struct pt_common *common) +{ + return PT_GRANULE_LG2SZ == PAGE_SHIFT; +} +#endif + +/* + * If not supplied by the format then assume only one contiguous size determined + * by pt_contig_count_lg2() + */ +#ifndef pt_possible_sizes +static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts); + +/* Return a bitmap of possible leaf page sizes at this level */ +static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + if (!pt_can_have_leaf(pts)) + return 0; + return log2_to_int(isz_lg2) | + log2_to_int(pt_contig_count_lg2(pts) + isz_lg2); +} +#endif + +/* If not supplied by the format then use 0. */ +#ifndef pt_full_va_prefix +static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common) +{ + return 0; +} +#endif + +/* If not supplied by the format then zero fill using PT_ITEM_WORD_SIZE */ +#ifndef pt_clear_entries +static inline void pt_clear_entries64(struct pt_state *pts, + unsigned int num_contig_lg2) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 *end = tablep + log2_to_int(num_contig_lg2); + + PT_WARN_ON(log2_mod(pts->index, num_contig_lg2)); + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, 0); +} + +static inline void pt_clear_entries32(struct pt_state *pts, + unsigned int num_contig_lg2) +{ + u32 *tablep = pt_cur_table(pts, u32) + pts->index; + u32 *end = tablep + log2_to_int(num_contig_lg2); + + PT_WARN_ON(log2_mod(pts->index, num_contig_lg2)); + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, 0); +} + +static inline void pt_clear_entries(struct pt_state *pts, + unsigned int num_contig_lg2) +{ + if (PT_ITEM_WORD_SIZE == sizeof(u32)) + pt_clear_entries32(pts, num_contig_lg2); + else + pt_clear_entries64(pts, num_contig_lg2); +} +#define pt_clear_entries pt_clear_entries +#endif + +/* If not supplied then SW bits are not supported */ +#ifdef pt_sw_bit +static inline bool pt_test_sw_bit_acquire(struct pt_state *pts, + unsigned int bitnr) +{ + /* Acquire, pairs with pt_set_sw_bit_release() */ + smp_mb(); + /* For a contiguous entry the sw bit is only stored in the first item. 
*/ + return pts->entry & pt_sw_bit(bitnr); +} +#define pt_test_sw_bit_acquire pt_test_sw_bit_acquire + +static inline void pt_set_sw_bit_release(struct pt_state *pts, + unsigned int bitnr) +{ +#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64) + if (PT_ITEM_WORD_SIZE == sizeof(u64)) { + u64 *entryp = pt_cur_table(pts, u64) + pts->index; + u64 old_entry = pts->entry; + u64 new_entry; + + do { + new_entry = old_entry | pt_sw_bit(bitnr); + } while (!try_cmpxchg64_release(entryp, &old_entry, new_entry)); + pts->entry = new_entry; + return; + } +#endif + if (PT_ITEM_WORD_SIZE == sizeof(u32)) { + u32 *entryp = pt_cur_table(pts, u32) + pts->index; + u32 old_entry = pts->entry; + u32 new_entry; + + do { + new_entry = old_entry | pt_sw_bit(bitnr); + } while (!try_cmpxchg_release(entryp, &old_entry, new_entry)); + pts->entry = new_entry; + } else + BUILD_BUG(); +} +#define pt_set_sw_bit_release pt_set_sw_bit_release +#else +static inline unsigned int pt_max_sw_bit(struct pt_common *common) +{ + return 0; +} + +extern void __pt_no_sw_bit(void); +static inline bool pt_test_sw_bit_acquire(struct pt_state *pts, + unsigned int bitnr) +{ + __pt_no_sw_bit(); + return false; +} + +static inline void pt_set_sw_bit_release(struct pt_state *pts, + unsigned int bitnr) +{ + __pt_no_sw_bit(); +} +#endif + +/* + * Format can call in the pt_install_leaf_entry() to check the arguments are all + * aligned correctly. + */ +static inline bool pt_check_install_leaf_args(struct pt_state *pts, + pt_oaddr_t oa, + unsigned int oasz_lg2) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + if (PT_WARN_ON(oalog2_mod(oa, oasz_lg2))) + return false; + +#ifdef pt_possible_sizes + if (PT_WARN_ON(isz_lg2 > oasz_lg2 || + oasz_lg2 > isz_lg2 + pt_num_items_lg2(pts))) + return false; +#else + if (PT_WARN_ON(oasz_lg2 != isz_lg2 && + oasz_lg2 != isz_lg2 + pt_contig_count_lg2(pts))) + return false; +#endif + + if (PT_WARN_ON(oalog2_mod(pts->index, oasz_lg2 - isz_lg2))) + return false; + return true; +} + +#endif diff --git a/drivers/iommu/generic_pt/pt_iter.h b/drivers/iommu/generic_pt/pt_iter.h new file mode 100644 index 000000000000..c0d8617cce29 --- /dev/null +++ b/drivers/iommu/generic_pt/pt_iter.h @@ -0,0 +1,636 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Iterators for Generic Page Table + */ +#ifndef __GENERIC_PT_PT_ITER_H +#define __GENERIC_PT_PT_ITER_H + +#include "pt_common.h" + +#include <linux/errno.h> + +/* + * Use to mangle symbols so that backtraces and the symbol table are + * understandable. Any non-inlined function should get mangled like this. + */ +#define NS(fn) CONCATENATE(PTPFX, fn) + +/** + * pt_check_range() - Validate the range can be iterated + * @range: Range to validate + * + * Check that VA and last_va fall within the permitted range of VAs. If the + * format is using PT_FEAT_SIGN_EXTEND then this also checks the sign extension + * is correct. + */ +static inline int pt_check_range(struct pt_range *range) +{ + pt_vaddr_t prefix; + + PT_WARN_ON(!range->max_vasz_lg2); + + if (pt_feature(range->common, PT_FEAT_SIGN_EXTEND)) { + PT_WARN_ON(range->common->max_vasz_lg2 != range->max_vasz_lg2); + prefix = fvalog2_div(range->va, range->max_vasz_lg2 - 1) ? 
+ PT_VADDR_MAX : + 0; + } else { + prefix = pt_full_va_prefix(range->common); + } + + if (!fvalog2_div_eq(range->va, prefix, range->max_vasz_lg2) || + !fvalog2_div_eq(range->last_va, prefix, range->max_vasz_lg2)) + return -ERANGE; + return 0; +} + +/** + * pt_index_to_va() - Update range->va to the current pts->index + * @pts: Iteration State + * + * Adjust range->va to match the current index. This is done in a lazy manner + * since computing the VA takes several instructions and is rarely required. + */ +static inline void pt_index_to_va(struct pt_state *pts) +{ + pt_vaddr_t lower_va; + + lower_va = log2_mul(pts->index, pt_table_item_lg2sz(pts)); + pts->range->va = fvalog2_set_mod(pts->range->va, lower_va, + pt_table_oa_lg2sz(pts)); +} + +/* + * Add index_count_lg2 number of entries to pts's VA and index. The VA will be + * adjusted to the end of the contiguous block if it is currently in the middle. + */ +static inline void _pt_advance(struct pt_state *pts, + unsigned int index_count_lg2) +{ + pts->index = log2_set_mod(pts->index + log2_to_int(index_count_lg2), 0, + index_count_lg2); +} + +/** + * pt_entry_fully_covered() - Check if the item or entry is entirely contained + * within pts->range + * @pts: Iteration State + * @oasz_lg2: The size of the item to check, pt_table_item_lg2sz() or + * pt_entry_oa_lg2sz() + * + * Return: true if the item is fully enclosed by the pts->range. + */ +static inline bool pt_entry_fully_covered(const struct pt_state *pts, + unsigned int oasz_lg2) +{ + struct pt_range *range = pts->range; + + /* Range begins at the start of the entry */ + if (log2_mod(pts->range->va, oasz_lg2)) + return false; + + /* Range ends past the end of the entry */ + if (!log2_div_eq(range->va, range->last_va, oasz_lg2)) + return true; + + /* Range ends at the end of the entry */ + return log2_mod_eq_max(range->last_va, oasz_lg2); +} + +/** + * pt_range_to_index() - Starting index for an iteration + * @pts: Iteration State + * + * Return: the starting index for the iteration in pts. + */ +static inline unsigned int pt_range_to_index(const struct pt_state *pts) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + PT_WARN_ON(pts->level > pts->range->top_level); + if (pts->range->top_level == pts->level) + return log2_div(fvalog2_mod(pts->range->va, + pts->range->max_vasz_lg2), + isz_lg2); + return log2_mod(log2_div(pts->range->va, isz_lg2), + pt_num_items_lg2(pts)); +} + +/** + * pt_range_to_end_index() - Ending index for an iteration + * @pts: Iteration State + * + * Return: one past the last index for the iteration in pts.
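+ *
+ * As an illustrative example, assuming 4K items and 512 item tables at a
+ * non-top level, a range with va == 0x203000 and last_va == 0x205fff
+ * gives pt_range_to_index() == 3 and pt_range_to_end_index() == 6, so an
+ * iteration visits indexes 3, 4 and 5.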
+ */ +static inline unsigned int pt_range_to_end_index(const struct pt_state *pts) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + struct pt_range *range = pts->range; + unsigned int num_entries_lg2; + + if (range->va == range->last_va) + return pts->index + 1; + + if (pts->range->top_level == pts->level) + return log2_div(fvalog2_mod(pts->range->last_va, + pts->range->max_vasz_lg2), + isz_lg2) + + 1; + + num_entries_lg2 = pt_num_items_lg2(pts); + + /* last_va falls within this table */ + if (log2_div_eq(range->va, range->last_va, num_entries_lg2 + isz_lg2)) + return log2_mod(log2_div(pts->range->last_va, isz_lg2), + num_entries_lg2) + + 1; + + return log2_to_int(num_entries_lg2); +} + +static inline void _pt_iter_first(struct pt_state *pts) +{ + pts->index = pt_range_to_index(pts); + pts->end_index = pt_range_to_end_index(pts); + PT_WARN_ON(pts->index > pts->end_index); +} + +static inline bool _pt_iter_load(struct pt_state *pts) +{ + if (pts->index >= pts->end_index) + return false; + pt_load_entry(pts); + return true; +} + +/** + * pt_next_entry() - Advance pts to the next entry + * @pts: Iteration State + * + * Update pts to go to the next index at this level. If pts is pointing at a + * contiguous entry then the index may advance by more than one. + */ +static inline void pt_next_entry(struct pt_state *pts) +{ + if (pts->type == PT_ENTRY_OA && + !__builtin_constant_p(pt_entry_num_contig_lg2(pts) == 0)) + _pt_advance(pts, pt_entry_num_contig_lg2(pts)); + else + pts->index++; + pt_index_to_va(pts); +} + +/** + * for_each_pt_level_entry() - For loop wrapper over entries in the range + * @pts: Iteration State + * + * This is the basic iteration primitive. It iterates over all the entries in + * pts->range that fall within the pts's current table level. Each step does + * pt_load_entry(pts). + */ +#define for_each_pt_level_entry(pts) \ + for (_pt_iter_first(pts); _pt_iter_load(pts); pt_next_entry(pts)) + +/** + * pt_load_single_entry() - Version of pt_load_entry() usable within a walker + * @pts: Iteration State + * + * Alternative to for_each_pt_level_entry() if the walker function uses only a + * single entry. + */ +static inline enum pt_entry_type pt_load_single_entry(struct pt_state *pts) +{ + pts->index = pt_range_to_index(pts); + pt_load_entry(pts); + return pts->type; +} + +static __always_inline struct pt_range _pt_top_range(struct pt_common *common, + uintptr_t top_of_table) +{ + struct pt_range range = { + .common = common, + .top_table = + (struct pt_table_p *)(top_of_table & + ~(uintptr_t)PT_TOP_LEVEL_MASK), + .top_level = top_of_table % (1 << PT_TOP_LEVEL_BITS), + }; + struct pt_state pts = { .range = &range, .level = range.top_level }; + unsigned int max_vasz_lg2; + + max_vasz_lg2 = common->max_vasz_lg2; + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) && + pts.level != PT_MAX_TOP_LEVEL) + max_vasz_lg2 = min_t(unsigned int, common->max_vasz_lg2, + pt_num_items_lg2(&pts) + + pt_table_item_lg2sz(&pts)); + + /* + * The top range will default to the lower region only with sign extend. + */ + range.max_vasz_lg2 = max_vasz_lg2; + if (pt_feature(common, PT_FEAT_SIGN_EXTEND)) + max_vasz_lg2--; + + range.va = fvalog2_set_mod(pt_full_va_prefix(common), 0, max_vasz_lg2); + range.last_va = + fvalog2_set_mod_max(pt_full_va_prefix(common), max_vasz_lg2); + return range; +} + +/** + * pt_top_range() - Return a range that spans part of the top level + * @common: Table + * + * For PT_FEAT_SIGN_EXTEND this will return the lower range, and cover half the + * total page table.
Otherwise it returns the entire page table. + */ +static __always_inline struct pt_range pt_top_range(struct pt_common *common) +{ + /* + * The top pointer can change without locking. We capture the value and + * its level here and are safe to walk it so long as both values are + * captured without tearing. + */ + return _pt_top_range(common, READ_ONCE(common->top_of_table)); +} + +/** + * pt_all_range() - Return a range that spans the entire page table + * @common: Table + * + * The returned range spans the whole page table. Due to how PT_FEAT_SIGN_EXTEND + * is supported, range->va and range->last_va will be incorrect during the + * iteration and must not be accessed. + */ +static inline struct pt_range pt_all_range(struct pt_common *common) +{ + struct pt_range range = pt_top_range(common); + + if (!pt_feature(common, PT_FEAT_SIGN_EXTEND)) + return range; + + /* + * Pretend the table is linear from 0 without a sign extension. This + * generates the correct indexes for iteration. + */ + range.last_va = fvalog2_set_mod_max(0, range.max_vasz_lg2); + return range; +} + +/** + * pt_upper_range() - Return a range that spans part of the top level + * @common: Table + * + * For PT_FEAT_SIGN_EXTEND this will return the upper range, and cover half the + * total page table. Otherwise it returns the entire page table. + */ +static inline struct pt_range pt_upper_range(struct pt_common *common) +{ + struct pt_range range = pt_top_range(common); + + if (!pt_feature(common, PT_FEAT_SIGN_EXTEND)) + return range; + + range.va = fvalog2_set_mod(PT_VADDR_MAX, 0, range.max_vasz_lg2 - 1); + range.last_va = PT_VADDR_MAX; + return range; +} + +/** + * pt_make_range() - Return a range that spans part of the table + * @common: Table + * @va: Start address + * @last_va: Last address + * + * The caller must validate the range with pt_check_range() before using it. + */ +static __always_inline struct pt_range +pt_make_range(struct pt_common *common, pt_vaddr_t va, pt_vaddr_t last_va) +{ + struct pt_range range = + _pt_top_range(common, READ_ONCE(common->top_of_table)); + + range.va = va; + range.last_va = last_va; + + return range; +} + +/* + * Span a slice of the table starting at a lower table level from an active + * walk. + */ +static __always_inline struct pt_range +pt_make_child_range(const struct pt_range *parent, pt_vaddr_t va, + pt_vaddr_t last_va) +{ + struct pt_range range = *parent; + + range.va = va; + range.last_va = last_va; + + PT_WARN_ON(last_va < va); + PT_WARN_ON(pt_check_range(&range)); + + return range; +} + +/** + * pt_init() - Initialize a pt_state on the stack + * @range: Range pointer to embed in the state + * @level: Table level for the state + * @table: Pointer to the table memory at level + * + * Helper to initialize the on-stack pt_state from walker arguments. + */ +static __always_inline struct pt_state +pt_init(struct pt_range *range, unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = { + .range = range, + .table = table, + .level = level, + }; + return pts; +} + +/** + * pt_init_top() - Initialize a pt_state on the stack + * @range: Range pointer to embed in the state + * + * The pt_state points to the topmost level.
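+ *
+ * A sketch of typical use together with pt_top_range():
+ *
+ *   struct pt_range range = pt_top_range(common);
+ *   struct pt_state pts = pt_init_top(&range);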
+ */ +static __always_inline struct pt_state pt_init_top(struct pt_range *range) +{ + return pt_init(range, range->top_level, range->top_table); +} + +typedef int (*pt_level_fn_t)(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table); + +/** + * pt_descend() - Recursively invoke the walker for the lower level + * @pts: Iteration State + * @arg: Value to pass to the function + * @fn: Walker function to call + * + * pts must point to a table item. Invoke fn as a walker on the table + * pts points to. + */ +static __always_inline int pt_descend(struct pt_state *pts, void *arg, + pt_level_fn_t fn) +{ + int ret; + + if (PT_WARN_ON(!pts->table_lower)) + return -EINVAL; + + ret = (*fn)(pts->range, arg, pts->level - 1, pts->table_lower); + return ret; +} + +/** + * pt_walk_range() - Walk over a VA range + * @range: Range pointer + * @fn: Walker function to call + * @arg: Value to pass to the function + * + * Walk over a VA range. The caller should have done a validity check, at + * least calling pt_check_range(), when building range. The walk will + * start at the topmost table. + */ +static __always_inline int pt_walk_range(struct pt_range *range, + pt_level_fn_t fn, void *arg) +{ + return fn(range, arg, range->top_level, range->top_table); +} + +/* + * pt_walk_descend() - Recursively invoke the walker for a slice of a lower + * level + * @pts: Iteration State + * @va: Start address + * @last_va: Last address + * @fn: Walker function to call + * @arg: Value to pass to the function + * + * With pts pointing at a table item this will descend and walk a slice of the + * lower table. The caller must ensure that va/last_va are within the table + * item. This creates a new walk and does not alter pts or pts->range. + */ +static __always_inline int pt_walk_descend(const struct pt_state *pts, + pt_vaddr_t va, pt_vaddr_t last_va, + pt_level_fn_t fn, void *arg) +{ + struct pt_range range = pt_make_child_range(pts->range, va, last_va); + + if (PT_WARN_ON(!pt_can_have_table(pts)) || + PT_WARN_ON(!pts->table_lower)) + return -EINVAL; + + return fn(&range, arg, pts->level - 1, pts->table_lower); +} + +/* + * pt_walk_descend_all() - Recursively invoke the walker for a table item + * @parent_pts: Iteration State + * @fn: Walker function to call + * @arg: Value to pass to the function + * + * With pts pointing at a table item this will descend and walk the entire lower + * table. This creates a new walk and does not alter pts or pts->range. + */ +static __always_inline int +pt_walk_descend_all(const struct pt_state *parent_pts, pt_level_fn_t fn, + void *arg) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(parent_pts); + + return pt_walk_descend(parent_pts, + log2_set_mod(parent_pts->range->va, 0, isz_lg2), + log2_set_mod_max(parent_pts->range->va, isz_lg2), + fn, arg); +} + +/** + * pt_range_slice() - Return a range that spans indexes + * @pts: Iteration State + * @start_index: Starting index within pts + * @end_index: Ending index within pts + * + * Create a range that spans an index range of the current table level the + * pt_state points at.
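+ *
+ * For example, assuming 2M items and 512 item tables,
+ * pt_range_slice(pts, 2, 4) spans the items at indexes 2 and 3: the
+ * table's base VA + 4M through base VA + 8M - 1. end_index is exclusive.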
+ */ +static inline struct pt_range pt_range_slice(const struct pt_state *pts, + unsigned int start_index, + unsigned int end_index) +{ + unsigned int table_lg2sz = pt_table_oa_lg2sz(pts); + pt_vaddr_t last_va; + pt_vaddr_t va; + + va = fvalog2_set_mod(pts->range->va, + log2_mul(start_index, pt_table_item_lg2sz(pts)), + table_lg2sz); + last_va = fvalog2_set_mod( + pts->range->va, + log2_mul(end_index, pt_table_item_lg2sz(pts)) - 1, table_lg2sz); + return pt_make_child_range(pts->range, va, last_va); +} + +/** + * pt_top_memsize_lg2() + * @common: Table + * @top_of_table: Top of table value from _pt_top_set() + * + * Compute the allocation size of the top table. For PT_FEAT_DYNAMIC_TOP this + * will compute the top size assuming the table will grow. + */ +static inline unsigned int pt_top_memsize_lg2(struct pt_common *common, + uintptr_t top_of_table) +{ + struct pt_range range = _pt_top_range(common, top_of_table); + struct pt_state pts = pt_init_top(&range); + unsigned int num_items_lg2; + + num_items_lg2 = common->max_vasz_lg2 - pt_table_item_lg2sz(&pts); + if (range.top_level != PT_MAX_TOP_LEVEL && + pt_feature(common, PT_FEAT_DYNAMIC_TOP)) + num_items_lg2 = min(num_items_lg2, pt_num_items_lg2(&pts)); + + /* Round up the allocation size to the minimum alignment */ + return max(ffs_t(u64, PT_TOP_PHYS_MASK), + num_items_lg2 + ilog2(PT_ITEM_WORD_SIZE)); +} + +/** + * pt_compute_best_pgsize() - Determine the best page size for leaf entries + * @pgsz_bitmap: Permitted page sizes + * @va: Starting virtual address for the leaf entry + * @last_va: Last virtual address for the leaf entry, sets the max page size + * @oa: Starting output address for the leaf entry + * + * Compute the largest page size for va, last_va, and oa together and return it + * in lg2. The largest page size depends on the format's supported page sizes at + * this level, and the relative alignment of the VA and OA addresses. 0 means + * the OA cannot be stored with the provided pgsz_bitmap. + */ +static inline unsigned int pt_compute_best_pgsize(pt_vaddr_t pgsz_bitmap, + pt_vaddr_t va, + pt_vaddr_t last_va, + pt_oaddr_t oa) +{ + unsigned int best_pgsz_lg2; + unsigned int pgsz_lg2; + pt_vaddr_t len = last_va - va + 1; + pt_vaddr_t mask; + + if (PT_WARN_ON(va >= last_va)) + return 0; + + /* + * Given a VA/OA pair the best page size is the largest page size + * where: + * + * 1) VA and OA start at the page. Bitwise this is the count of least + * significant 0 bits. + * This also implies that last_va/oa has the same prefix as va/oa. + */ + mask = va | oa; + + /* + * 2) The page size is not larger than the last_va (length). Since page + * sizes are always power of two this can't be larger than the + * largest power of two factor of the length. 
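+ *
+ * As a worked example, va == 0x400000, oa == 0x200000 and
+ * len == 0x400000 give mask == 0x600000, so vaffs(mask) == 21 and the
+ * best page size is 2M, limited here by the OA alignment.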
+ */ + mask |= log2_to_int(vafls(len) - 1); + + best_pgsz_lg2 = vaffs(mask); + + /* Choose the highest bit <= best_pgsz_lg2 */ + if (best_pgsz_lg2 < PT_VADDR_MAX_LG2 - 1) + pgsz_bitmap = log2_mod(pgsz_bitmap, best_pgsz_lg2 + 1); + + pgsz_lg2 = vafls(pgsz_bitmap); + if (!pgsz_lg2) + return 0; + + pgsz_lg2--; + + PT_WARN_ON(log2_mod(va, pgsz_lg2) != 0); + PT_WARN_ON(oalog2_mod(oa, pgsz_lg2) != 0); + PT_WARN_ON(va + log2_to_int(pgsz_lg2) - 1 > last_va); + PT_WARN_ON(!log2_div_eq(va, va + log2_to_int(pgsz_lg2) - 1, pgsz_lg2)); + PT_WARN_ON( + !oalog2_div_eq(oa, oa + log2_to_int(pgsz_lg2) - 1, pgsz_lg2)); + return pgsz_lg2; +} + +#define _PT_MAKE_CALL_LEVEL(fn) \ + static __always_inline int fn(struct pt_range *range, void *arg, \ + unsigned int level, \ + struct pt_table_p *table) \ + { \ + static_assert(PT_MAX_TOP_LEVEL <= 5); \ + if (level == 0) \ + return CONCATENATE(fn, 0)(range, arg, 0, table); \ + if (level == 1 || PT_MAX_TOP_LEVEL == 1) \ + return CONCATENATE(fn, 1)(range, arg, 1, table); \ + if (level == 2 || PT_MAX_TOP_LEVEL == 2) \ + return CONCATENATE(fn, 2)(range, arg, 2, table); \ + if (level == 3 || PT_MAX_TOP_LEVEL == 3) \ + return CONCATENATE(fn, 3)(range, arg, 3, table); \ + if (level == 4 || PT_MAX_TOP_LEVEL == 4) \ + return CONCATENATE(fn, 4)(range, arg, 4, table); \ + return CONCATENATE(fn, 5)(range, arg, 5, table); \ + } + +static inline int __pt_make_level_fn_err(struct pt_range *range, void *arg, + unsigned int unused_level, + struct pt_table_p *table) +{ + static_assert(PT_MAX_TOP_LEVEL <= 5); + return -EPROTOTYPE; +} + +#define __PT_MAKE_LEVEL_FN(fn, level, descend_fn, do_fn) \ + static inline int fn(struct pt_range *range, void *arg, \ + unsigned int unused_level, \ + struct pt_table_p *table) \ + { \ + return do_fn(range, arg, level, table, descend_fn); \ + } + +/** + * PT_MAKE_LEVELS() - Build an unwound walker + * @fn: Name of the walker function + * @do_fn: Function to call at each level + * + * This builds a function call tree that can be fully inlined. + * The caller must provide a function body in an __always_inline function:: + * + * static __always_inline int do_fn(struct pt_range *range, void *arg, + * unsigned int level, struct pt_table_p *table, + * pt_level_fn_t descend_fn) + * + * An inline function will be created for each table level that calls do_fn with + * a compile time constant for level and a pointer to the next lower function. + * This generates an optimally inlined walk where each of the functions sees a + * constant level and can codegen the exact constants/etc for that level. + * + * Note this can produce a lot of code! 
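+ *
+ * A minimal sketch of a counting walker (do_count and the PT_ENTRY_TABLE
+ * handling shown are hypothetical)::
+ *
+ *  static __always_inline int do_count(struct pt_range *range, void *arg,
+ *          unsigned int level, struct pt_table_p *table,
+ *          pt_level_fn_t descend_fn)
+ *  {
+ *      struct pt_state pts = pt_init(range, level, table);
+ *      int ret;
+ *
+ *      for_each_pt_level_entry(&pts) {
+ *          if (pts.type == PT_ENTRY_TABLE) {
+ *              ret = pt_descend(&pts, arg, descend_fn);
+ *              if (ret)
+ *                  return ret;
+ *          } else if (pts.type == PT_ENTRY_OA) {
+ *              (*(pt_vaddr_t *)arg)++;
+ *          }
+ *      }
+ *      return 0;
+ *  }
+ *  PT_MAKE_LEVELS(count_fn, do_count);
+ *
+ * after which pt_walk_range(&range, count_fn, &count) runs the fully
+ * inlined walk.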
+ */ +#define PT_MAKE_LEVELS(fn, do_fn) \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 0), 0, __pt_make_level_fn_err, \ + do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 1), 1, CONCATENATE(fn, 0), do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 2), 2, CONCATENATE(fn, 1), do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 3), 3, CONCATENATE(fn, 2), do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 4), 4, CONCATENATE(fn, 3), do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 5), 5, CONCATENATE(fn, 4), do_fn); \ + _PT_MAKE_CALL_LEVEL(fn) + +#endif diff --git a/drivers/iommu/generic_pt/pt_log2.h b/drivers/iommu/generic_pt/pt_log2.h new file mode 100644 index 000000000000..6dbbed119238 --- /dev/null +++ b/drivers/iommu/generic_pt/pt_log2.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Helper macros for working with log2 values + * + */ +#ifndef __GENERIC_PT_LOG2_H +#define __GENERIC_PT_LOG2_H +#include <linux/bitops.h> +#include <linux/limits.h> + +/* Compute a */ +#define log2_to_int_t(type, a_lg2) ((type)(((type)1) << (a_lg2))) +static_assert(log2_to_int_t(unsigned int, 0) == 1); + +/* Compute a - 1 (aka all low bits set) */ +#define log2_to_max_int_t(type, a_lg2) ((type)(log2_to_int_t(type, a_lg2) - 1)) + +/* Compute a / b */ +#define log2_div_t(type, a, b_lg2) ((type)(((type)a) >> (b_lg2))) +static_assert(log2_div_t(unsigned int, 4, 2) == 1); + +/* + * Compute: + * a / c == b / c + * aka the high bits are equal + */ +#define log2_div_eq_t(type, a, b, c_lg2) \ + (log2_div_t(type, (a) ^ (b), c_lg2) == 0) +static_assert(log2_div_eq_t(unsigned int, 1, 1, 2)); + +/* Compute a % b */ +#define log2_mod_t(type, a, b_lg2) \ + ((type)(((type)a) & log2_to_max_int_t(type, b_lg2))) +static_assert(log2_mod_t(unsigned int, 1, 2) == 1); + +/* + * Compute: + * a % b == b - 1 + * aka the low bits are all 1s + */ +#define log2_mod_eq_max_t(type, a, b_lg2) \ + (log2_mod_t(type, a, b_lg2) == log2_to_max_int_t(type, b_lg2)) +static_assert(log2_mod_eq_max_t(unsigned int, 3, 2)); + +/* + * Return a value such that: + * a / b == ret / b + * ret % b == val + * aka set the low bits to val. val must be < b + */ +#define log2_set_mod_t(type, a, val, b_lg2) \ + ((((type)(a)) & (~log2_to_max_int_t(type, b_lg2))) | ((type)(val))) +static_assert(log2_set_mod_t(unsigned int, 3, 1, 2) == 1); + +/* Return a value such that: + * a / b == ret / b + * ret % b == b - 1 + * aka set the low bits to all 1s + */ +#define log2_set_mod_max_t(type, a, b_lg2) \ + (((type)(a)) | log2_to_max_int_t(type, b_lg2)) +static_assert(log2_set_mod_max_t(unsigned int, 2, 2) == 3); + +/* Compute a * b */ +#define log2_mul_t(type, a, b_lg2) ((type)(((type)a) << (b_lg2))) +static_assert(log2_mul_t(unsigned int, 2, 2) == 8); + +#define _dispatch_sz(type, fn, a) \ + (sizeof(type) == 4 ? 
fn##32((u32)a) : fn##64(a)) + +/* + * Return the highest value such that: + * fls_t(u32, 0) == 0 + * fls_t(u32, 1) == 1 + * a >= log2_to_int(ret - 1) + * aka find last set bit + */ +static inline unsigned int fls32(u32 a) +{ + return fls(a); +} +#define fls_t(type, a) _dispatch_sz(type, fls, a) + +/* + * Return the highest value such that: + * ffs_t(u32, 0) == UNDEFINED + * ffs_t(u32, 1) == 0 + * log2_mod(a, ret) == 0 + * aka find first set bit + */ +static inline unsigned int __ffs32(u32 a) +{ + return __ffs(a); +} +#define ffs_t(type, a) _dispatch_sz(type, __ffs, a) + +/* + * Return the highest value such that: + * ffz_t(u32, U32_MAX) == UNDEFINED + * ffz_t(u32, 0) == 0 + * ffz_t(u32, 1) == 1 + * log2_mod(a, ret) == log2_to_max_int(ret) + * aka find first zero bit + */ +static inline unsigned int ffz32(u32 a) +{ + return ffz(a); +} +static inline unsigned int ffz64(u64 a) +{ + if (sizeof(u64) == sizeof(unsigned long)) + return ffz(a); + + if ((u32)a == U32_MAX) + return ffz32(a >> 32) + 32; + return ffz32(a); +} +#define ffz_t(type, a) _dispatch_sz(type, ffz, a) + +#endif diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index 8a5c17b97310..0961ac805944 100644 --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -130,7 +130,7 @@ static int __init hyperv_prepare_irq_remapping(void) x86_init.hyper.msi_ext_dest_id()) return -ENODEV; - if (hv_root_partition) { + if (hv_root_partition()) { name = "HYPERV-ROOT-IR"; ops = &hyperv_root_ir_domain_ops; } else { @@ -151,7 +151,7 @@ static int __init hyperv_prepare_irq_remapping(void) return -ENOMEM; } - if (hv_root_partition) + if (hv_root_partition()) return 0; /* The rest is only relevant to guests */ /* @@ -164,8 +164,8 @@ static int __init hyperv_prepare_irq_remapping(void) * max cpu affinity for IOAPIC irqs. Scan cpu 0-255 and set cpu * into ioapic_max_cpumask if its APIC ID is less than 256.
*/ - for (i = min_t(unsigned int, num_possible_cpus() - 1, 255); i >= 0; i--) - if (cpu_physical_id(i) < 256) + for (i = min_t(unsigned int, nr_cpu_ids - 1, 255); i >= 0; i--) + if (cpu_possible(i) && cpu_physical_id(i) < 256) cpumask_set_cpu(i, &ioapic_max_cpumask); return 0; @@ -193,15 +193,13 @@ struct hyperv_root_ir_data { static void hyperv_root_ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg) { - u64 status; - u32 vector; - struct irq_cfg *cfg; - int ioapic_id; - const struct cpumask *affinity; - int cpu; - struct hv_interrupt_entry entry; struct hyperv_root_ir_data *data = irq_data->chip_data; + struct hv_interrupt_entry entry; + const struct cpumask *affinity; struct IO_APIC_route_entry e; + struct irq_cfg *cfg; + int cpu, ioapic_id; + u32 vector; cfg = irqd_cfg(irq_data); affinity = irq_data_get_effective_affinity_mask(irq_data); @@ -214,23 +212,16 @@ hyperv_root_ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg) && data->entry.ioapic_rte.as_uint64) { entry = data->entry; - status = hv_unmap_ioapic_interrupt(ioapic_id, &entry); - - if (status != HV_STATUS_SUCCESS) - pr_debug("%s: unexpected unmap status %lld\n", __func__, status); + (void)hv_unmap_ioapic_interrupt(ioapic_id, &entry); data->entry.ioapic_rte.as_uint64 = 0; data->entry.source = 0; /* Invalid source */ } - status = hv_map_ioapic_interrupt(ioapic_id, data->is_level, cpu, - vector, &entry); - - if (status != HV_STATUS_SUCCESS) { - pr_err("%s: map hypercall failed, status %lld\n", __func__, status); + if (hv_map_ioapic_interrupt(ioapic_id, data->is_level, cpu, + vector, &entry)) return; - } data->entry = entry; @@ -322,10 +313,10 @@ static void hyperv_root_irq_remapping_free(struct irq_domain *domain, data = irq_data->chip_data; e = &data->entry; - if (e->source == HV_DEVICE_TYPE_IOAPIC - && e->ioapic_rte.as_uint64) - hv_unmap_ioapic_interrupt(data->ioapic_id, - &data->entry); + if (e->source == HV_DEVICE_TYPE_IOAPIC && + e->ioapic_rte.as_uint64) + (void)hv_unmap_ioapic_interrupt(data->ioapic_id, + &data->entry); kfree(data); } diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index f52fb39c968e..5471f814e073 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -12,9 +12,13 @@ config DMAR_DEBUG config INTEL_IOMMU bool "Support for Intel IOMMU using DMA Remapping Devices" depends on PCI_MSI && ACPI && X86 - select DMA_OPS select IOMMU_API + select GENERIC_PT + select IOMMU_PT + select IOMMU_PT_X86_64 + select IOMMU_PT_VTDSS select IOMMU_IOVA + select IOMMU_IOPF select IOMMUFD_DRIVER if IOMMUFD select NEED_DMA_MAP_STATE select DMAR_TABLE @@ -51,7 +55,6 @@ config INTEL_IOMMU_SVM depends on X86_64 select MMU_NOTIFIER select IOMMU_SVA - select IOMMU_IOPF help Shared Virtual Memory (SVM) provides a facility for devices to access DMA resources through process address space by @@ -67,7 +70,7 @@ config INTEL_IOMMU_DEFAULT_ON config INTEL_IOMMU_FLOPPY_WA def_bool y - depends on X86 + depends on X86 && BLK_DEV_FD help Floppy disk drivers are known to bypass DMA API calls thereby failing to work when IOMMU is enabled. 
This diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile index c8beb0281559..ada651c4a01b 100644 --- a/drivers/iommu/intel/Makefile +++ b/drivers/iommu/intel/Makefile @@ -1,11 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_DMAR_TABLE) += dmar.o -obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o cache.o -obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o +obj-y += iommu.o pasid.o nested.o cache.o prq.o +obj-$(CONFIG_DMAR_TABLE) += dmar.o trace.o obj-$(CONFIG_DMAR_PERF) += perf.o obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o -ifdef CONFIG_INTEL_IOMMU obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o -endif obj-$(CONFIG_INTEL_IOMMU_PERF_EVENTS) += perfmon.o diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c index e8418cdd8331..265e7290256b 100644 --- a/drivers/iommu/intel/cache.c +++ b/drivers/iommu/intel/cache.c @@ -40,13 +40,13 @@ static bool cache_tage_match(struct cache_tag *tag, u16 domain_id, } /* Assign a cache tag with specified type to domain. */ -static int cache_tag_assign(struct dmar_domain *domain, u16 did, - struct device *dev, ioasid_t pasid, - enum cache_tag_type type) +int cache_tag_assign(struct dmar_domain *domain, u16 did, struct device *dev, + ioasid_t pasid, enum cache_tag_type type) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; struct cache_tag *tag, *temp; + struct list_head *prev; unsigned long flags; tag = kzalloc(sizeof(*tag), GFP_KERNEL); @@ -65,6 +65,7 @@ static int cache_tag_assign(struct dmar_domain *domain, u16 did, tag->dev = iommu->iommu.dev; spin_lock_irqsave(&domain->cache_lock, flags); + prev = &domain->cache_tags; list_for_each_entry(temp, &domain->cache_tags, node) { if (cache_tage_match(temp, did, iommu, dev, pasid, type)) { temp->users++; @@ -73,8 +74,15 @@ static int cache_tag_assign(struct dmar_domain *domain, u16 did, trace_cache_tag_assign(temp); return 0; } + if (temp->iommu == iommu) + prev = &temp->node; } - list_add_tail(&tag->node, &domain->cache_tags); + /* + * Link cache tags of same iommu unit together, so corresponding + * flush ops can be batched for iommu unit. + */ + list_add(&tag->node, prev); + spin_unlock_irqrestore(&domain->cache_lock, flags); trace_cache_tag_assign(tag); @@ -105,12 +113,35 @@ static void cache_tag_unassign(struct dmar_domain *domain, u16 did, spin_unlock_irqrestore(&domain->cache_lock, flags); } +/* domain->qi_batch will be freed in iommu_free_domain() path. 
*/ +static int domain_qi_batch_alloc(struct dmar_domain *domain) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&domain->cache_lock, flags); + if (domain->qi_batch) + goto out_unlock; + + domain->qi_batch = kzalloc(sizeof(*domain->qi_batch), GFP_ATOMIC); + if (!domain->qi_batch) + ret = -ENOMEM; +out_unlock: + spin_unlock_irqrestore(&domain->cache_lock, flags); + + return ret; +} + static int __cache_tag_assign_domain(struct dmar_domain *domain, u16 did, struct device *dev, ioasid_t pasid) { struct device_domain_info *info = dev_iommu_priv_get(dev); int ret; + ret = domain_qi_batch_alloc(domain); + if (ret) + return ret; + ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_IOTLB); if (ret || !info->ats_enabled) return ret; @@ -139,6 +170,10 @@ static int __cache_tag_assign_parent_domain(struct dmar_domain *domain, u16 did, struct device_domain_info *info = dev_iommu_priv_get(dev); int ret; + ret = domain_qi_batch_alloc(domain); + if (ret) + return ret; + ret = cache_tag_assign(domain, did, dev, pasid, CACHE_TAG_NESTING_IOTLB); if (ret || !info->ats_enabled) return ret; @@ -245,7 +280,8 @@ static unsigned long calculate_psi_aligned_address(unsigned long start, * shared_bits are all equal in both pfn and end_pfn. */ shared_bits = ~(pfn ^ end_pfn) & ~bitmask; - mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG; + mask = shared_bits ? __ffs(shared_bits) : MAX_AGAW_PFN_WIDTH; + aligned_pages = 1UL << mask; } *_pages = aligned_pages; @@ -254,6 +290,138 @@ static unsigned long calculate_psi_aligned_address(unsigned long start, return ALIGN_DOWN(start, VTD_PAGE_SIZE << mask); } +static void qi_batch_flush_descs(struct intel_iommu *iommu, struct qi_batch *batch) +{ + if (!iommu || !batch->index) + return; + + qi_submit_sync(iommu, batch->descs, batch->index, 0); + + /* Reset the index value and clean the whole batch buffer. */ + memset(batch, 0, sizeof(*batch)); +} + +static void qi_batch_increment_index(struct intel_iommu *iommu, struct qi_batch *batch) +{ + if (++batch->index == QI_MAX_BATCHED_DESC_COUNT) + qi_batch_flush_descs(iommu, batch); +} + +static void qi_batch_add_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type, + struct qi_batch *batch) +{ + qi_desc_iotlb(iommu, did, addr, size_order, type, &batch->descs[batch->index]); + qi_batch_increment_index(iommu, batch); +} + +static void qi_batch_add_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u16 qdep, u64 addr, unsigned int mask, + struct qi_batch *batch) +{ + /* + * According to VT-d spec, software is recommended to not submit any Device-TLB + * invalidation requests while address remapping hardware is disabled. + */ + if (!(iommu->gcmd & DMA_GCMD_TE)) + return; + + qi_desc_dev_iotlb(sid, pfsid, qdep, addr, mask, &batch->descs[batch->index]); + qi_batch_increment_index(iommu, batch); +} + +static void qi_batch_add_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, + u64 addr, unsigned long npages, bool ih, + struct qi_batch *batch) +{ + /* + * npages == -1 means a PASID-selective invalidation, otherwise, + * a positive value for Page-selective-within-PASID invalidation. + * 0 is not a valid input. 
+ */ + if (!npages) + return; + + qi_desc_piotlb(did, pasid, addr, npages, ih, &batch->descs[batch->index]); + qi_batch_increment_index(iommu, batch); +} + +static void qi_batch_add_pasid_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u32 pasid, u16 qdep, u64 addr, + unsigned int size_order, struct qi_batch *batch) +{ + /* + * According to VT-d spec, software is recommended to not submit any + * Device-TLB invalidation requests while address remapping hardware + * is disabled. + */ + if (!(iommu->gcmd & DMA_GCMD_TE)) + return; + + qi_desc_dev_iotlb_pasid(sid, pfsid, pasid, qdep, addr, size_order, + &batch->descs[batch->index]); + qi_batch_increment_index(iommu, batch); +} + +static void cache_tag_flush_iotlb(struct dmar_domain *domain, struct cache_tag *tag, + unsigned long addr, unsigned long pages, + unsigned long mask, int ih) +{ + struct intel_iommu *iommu = tag->iommu; + u64 type = DMA_TLB_PSI_FLUSH; + + if (intel_domain_is_fs_paging(domain)) { + qi_batch_add_piotlb(iommu, tag->domain_id, tag->pasid, addr, + pages, ih, domain->qi_batch); + return; + } + + /* + * Fallback to domain selective flush if no PSI support or the size + * is too big. + */ + if (!cap_pgsel_inv(iommu->cap) || + mask > cap_max_amask_val(iommu->cap) || pages == -1) { + addr = 0; + mask = 0; + ih = 0; + type = DMA_TLB_DSI_FLUSH; + } + + if (ecap_qis(iommu->ecap)) + qi_batch_add_iotlb(iommu, tag->domain_id, addr | ih, mask, type, + domain->qi_batch); + else + __iommu_flush_iotlb(iommu, tag->domain_id, addr | ih, mask, type); +} + +static void cache_tag_flush_devtlb_psi(struct dmar_domain *domain, struct cache_tag *tag, + unsigned long addr, unsigned long mask) +{ + struct intel_iommu *iommu = tag->iommu; + struct device_domain_info *info; + u16 sid; + + info = dev_iommu_priv_get(tag->dev); + sid = PCI_DEVID(info->bus, info->devfn); + + if (tag->pasid == IOMMU_NO_PASID) { + qi_batch_add_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, + addr, mask, domain->qi_batch); + if (info->dtlb_extra_inval) + qi_batch_add_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, + addr, mask, domain->qi_batch); + return; + } + + qi_batch_add_pasid_dev_iotlb(iommu, sid, info->pfsid, tag->pasid, + info->ats_qdep, addr, mask, domain->qi_batch); + if (info->dtlb_extra_inval) + qi_batch_add_pasid_dev_iotlb(iommu, sid, info->pfsid, tag->pasid, + info->ats_qdep, addr, mask, + domain->qi_batch); +} + /* * Invalidates a range of IOVA from @start (inclusive) to @end (inclusive) * when the memory mappings in the target domain have been modified. 
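+ *
+ * The cache tag list is kept sorted so that tags of the same iommu unit
+ * are adjacent; the queued invalidation descriptors accumulated in
+ * domain->qi_batch are submitted with qi_batch_flush_descs() each time
+ * the walk moves on to a different iommu, and once more at the end.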
@@ -261,38 +429,29 @@ static unsigned long calculate_psi_aligned_address(unsigned long start, void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, unsigned long end, int ih) { + struct intel_iommu *iommu = NULL; unsigned long pages, mask, addr; struct cache_tag *tag; unsigned long flags; - addr = calculate_psi_aligned_address(start, end, &pages, &mask); + if (start == 0 && end == ULONG_MAX) { + addr = 0; + pages = -1; + mask = MAX_AGAW_PFN_WIDTH; + } else { + addr = calculate_psi_aligned_address(start, end, &pages, &mask); + } spin_lock_irqsave(&domain->cache_lock, flags); list_for_each_entry(tag, &domain->cache_tags, node) { - struct intel_iommu *iommu = tag->iommu; - struct device_domain_info *info; - u16 sid; + if (iommu && iommu != tag->iommu) + qi_batch_flush_descs(iommu, domain->qi_batch); + iommu = tag->iommu; switch (tag->type) { case CACHE_TAG_IOTLB: case CACHE_TAG_NESTING_IOTLB: - if (domain->use_first_level) { - qi_flush_piotlb(iommu, tag->domain_id, - tag->pasid, addr, pages, ih); - } else { - /* - * Fallback to domain selective flush if no - * PSI support or the size is too big. - */ - if (!cap_pgsel_inv(iommu->cap) || - mask > cap_max_amask_val(iommu->cap)) - iommu->flush.flush_iotlb(iommu, tag->domain_id, - 0, 0, DMA_TLB_DSI_FLUSH); - else - iommu->flush.flush_iotlb(iommu, tag->domain_id, - addr | ih, mask, - DMA_TLB_PSI_FLUSH); - } + cache_tag_flush_iotlb(domain, tag, addr, pages, mask, ih); break; case CACHE_TAG_NESTING_DEVTLB: /* @@ -306,23 +465,13 @@ void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, mask = MAX_AGAW_PFN_WIDTH; fallthrough; case CACHE_TAG_DEVTLB: - info = dev_iommu_priv_get(tag->dev); - sid = PCI_DEVID(info->bus, info->devfn); - - if (tag->pasid == IOMMU_NO_PASID) - qi_flush_dev_iotlb(iommu, sid, info->pfsid, - info->ats_qdep, addr, mask); - else - qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid, - tag->pasid, info->ats_qdep, - addr, mask); - - quirk_extra_dev_tlb_flush(info, addr, mask, tag->pasid, info->ats_qdep); + cache_tag_flush_devtlb_psi(domain, tag, addr, mask); break; } trace_cache_tag_flush_range(tag, start, end, addr, pages, mask); } + qi_batch_flush_descs(iommu, domain->qi_batch); spin_unlock_irqrestore(&domain->cache_lock, flags); } @@ -332,40 +481,7 @@ void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, */ void cache_tag_flush_all(struct dmar_domain *domain) { - struct cache_tag *tag; - unsigned long flags; - - spin_lock_irqsave(&domain->cache_lock, flags); - list_for_each_entry(tag, &domain->cache_tags, node) { - struct intel_iommu *iommu = tag->iommu; - struct device_domain_info *info; - u16 sid; - - switch (tag->type) { - case CACHE_TAG_IOTLB: - case CACHE_TAG_NESTING_IOTLB: - if (domain->use_first_level) - qi_flush_piotlb(iommu, tag->domain_id, - tag->pasid, 0, -1, 0); - else - iommu->flush.flush_iotlb(iommu, tag->domain_id, - 0, 0, DMA_TLB_DSI_FLUSH); - break; - case CACHE_TAG_DEVTLB: - case CACHE_TAG_NESTING_DEVTLB: - info = dev_iommu_priv_get(tag->dev); - sid = PCI_DEVID(info->bus, info->devfn); - - qi_flush_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, - 0, MAX_AGAW_PFN_WIDTH); - quirk_extra_dev_tlb_flush(info, 0, MAX_AGAW_PFN_WIDTH, - IOMMU_NO_PASID, info->ats_qdep); - break; - } - - trace_cache_tag_flush_all(tag); - } - spin_unlock_irqrestore(&domain->cache_lock, flags); + cache_tag_flush_range(domain, 0, ULONG_MAX, 0); } /* @@ -382,6 +498,7 @@ void cache_tag_flush_all(struct dmar_domain *domain) void cache_tag_flush_range_np(struct dmar_domain *domain, 
unsigned long start, unsigned long end) { + struct intel_iommu *iommu = NULL; unsigned long pages, mask, addr; struct cache_tag *tag; unsigned long flags; @@ -390,30 +507,22 @@ void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, spin_lock_irqsave(&domain->cache_lock, flags); list_for_each_entry(tag, &domain->cache_tags, node) { - struct intel_iommu *iommu = tag->iommu; + if (iommu && iommu != tag->iommu) + qi_batch_flush_descs(iommu, domain->qi_batch); + iommu = tag->iommu; - if (!cap_caching_mode(iommu->cap) || domain->use_first_level) { + if (!cap_caching_mode(iommu->cap) || + intel_domain_is_fs_paging(domain)) { iommu_flush_write_buffer(iommu); continue; } if (tag->type == CACHE_TAG_IOTLB || - tag->type == CACHE_TAG_NESTING_IOTLB) { - /* - * Fallback to domain selective flush if no - * PSI support or the size is too big. - */ - if (!cap_pgsel_inv(iommu->cap) || - mask > cap_max_amask_val(iommu->cap)) - iommu->flush.flush_iotlb(iommu, tag->domain_id, - 0, 0, DMA_TLB_DSI_FLUSH); - else - iommu->flush.flush_iotlb(iommu, tag->domain_id, - addr, mask, - DMA_TLB_PSI_FLUSH); - } + tag->type == CACHE_TAG_NESTING_IOTLB) + cache_tag_flush_iotlb(domain, tag, addr, pages, mask, 0); trace_cache_tag_flush_range_np(tag, start, end, addr, pages, mask); } + qi_batch_flush_descs(iommu, domain->qi_batch); spin_unlock_irqrestore(&domain->cache_lock, flags); } diff --git a/drivers/iommu/intel/cap_audit.c b/drivers/iommu/intel/cap_audit.c deleted file mode 100644 index 9862dc20b35e..000000000000 --- a/drivers/iommu/intel/cap_audit.c +++ /dev/null @@ -1,217 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * cap_audit.c - audit iommu capabilities for boot time and hot plug - * - * Copyright (C) 2021 Intel Corporation - * - * Author: Kyung Min Park <kyung.min.park@intel.com> - * Lu Baolu <baolu.lu@linux.intel.com> - */ - -#define pr_fmt(fmt) "DMAR: " fmt - -#include "iommu.h" -#include "cap_audit.h" - -static u64 intel_iommu_cap_sanity; -static u64 intel_iommu_ecap_sanity; - -static inline void check_irq_capabilities(struct intel_iommu *a, - struct intel_iommu *b) -{ - CHECK_FEATURE_MISMATCH(a, b, cap, pi_support, CAP_PI_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, eim_support, ECAP_EIM_MASK); -} - -static inline void check_dmar_capabilities(struct intel_iommu *a, - struct intel_iommu *b) -{ - MINIMAL_FEATURE_IOMMU(b, cap, CAP_MAMV_MASK); - MINIMAL_FEATURE_IOMMU(b, cap, CAP_NFR_MASK); - MINIMAL_FEATURE_IOMMU(b, cap, CAP_SLLPS_MASK); - MINIMAL_FEATURE_IOMMU(b, cap, CAP_FRO_MASK); - MINIMAL_FEATURE_IOMMU(b, cap, CAP_MGAW_MASK); - MINIMAL_FEATURE_IOMMU(b, cap, CAP_SAGAW_MASK); - MINIMAL_FEATURE_IOMMU(b, cap, CAP_NDOMS_MASK); - MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_PSS_MASK); - MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_MHMV_MASK); - MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_IRO_MASK); - - CHECK_FEATURE_MISMATCH(a, b, cap, fl5lp_support, CAP_FL5LP_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, fl1gp_support, CAP_FL1GP_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, read_drain, CAP_RD_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, write_drain, CAP_WD_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, pgsel_inv, CAP_PSI_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, zlr, CAP_ZLR_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, caching_mode, CAP_CM_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, phmr, CAP_PHMR_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, plmr, CAP_PLMR_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, rwbf, CAP_RWBF_MASK); - CHECK_FEATURE_MISMATCH(a, b, cap, afl, CAP_AFL_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, rps, 
ECAP_RPS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, smpwc, ECAP_SMPWC_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, flts, ECAP_FLTS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, slts, ECAP_SLTS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, nwfs, ECAP_NWFS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, slads, ECAP_SLADS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, smts, ECAP_SMTS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, pds, ECAP_PDS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, dit, ECAP_DIT_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, pasid, ECAP_PASID_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, eafs, ECAP_EAFS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, srs, ECAP_SRS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, ers, ECAP_ERS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, prs, ECAP_PRS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, nest, ECAP_NEST_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, mts, ECAP_MTS_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, sc_support, ECAP_SC_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, pass_through, ECAP_PT_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, dev_iotlb_support, ECAP_DT_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, qis, ECAP_QI_MASK); - CHECK_FEATURE_MISMATCH(a, b, ecap, coherent, ECAP_C_MASK); -} - -static int cap_audit_hotplug(struct intel_iommu *iommu, enum cap_audit_type type) -{ - bool mismatch = false; - u64 old_cap = intel_iommu_cap_sanity; - u64 old_ecap = intel_iommu_ecap_sanity; - - if (type == CAP_AUDIT_HOTPLUG_IRQR) { - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, pi_support, CAP_PI_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, eim_support, ECAP_EIM_MASK); - goto out; - } - - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, fl5lp_support, CAP_FL5LP_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, fl1gp_support, CAP_FL1GP_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, read_drain, CAP_RD_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, write_drain, CAP_WD_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, pgsel_inv, CAP_PSI_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, zlr, CAP_ZLR_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, caching_mode, CAP_CM_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, phmr, CAP_PHMR_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, plmr, CAP_PLMR_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, rwbf, CAP_RWBF_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, afl, CAP_AFL_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, rps, ECAP_RPS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, smpwc, ECAP_SMPWC_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, flts, ECAP_FLTS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, slts, ECAP_SLTS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, nwfs, ECAP_NWFS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, slads, ECAP_SLADS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, smts, ECAP_SMTS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pds, ECAP_PDS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, dit, ECAP_DIT_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pasid, ECAP_PASID_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, eafs, ECAP_EAFS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, srs, ECAP_SRS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, ers, ECAP_ERS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, prs, ECAP_PRS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, nest, ECAP_NEST_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, mts, ECAP_MTS_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, 
sc_support, ECAP_SC_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pass_through, ECAP_PT_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, dev_iotlb_support, ECAP_DT_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, qis, ECAP_QI_MASK); - CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, coherent, ECAP_C_MASK); - - /* Abort hot plug if the hot plug iommu feature is smaller than global */ - MINIMAL_FEATURE_HOTPLUG(iommu, cap, max_amask_val, CAP_MAMV_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, cap, num_fault_regs, CAP_NFR_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, cap, super_page_val, CAP_SLLPS_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, cap, fault_reg_offset, CAP_FRO_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, cap, mgaw, CAP_MGAW_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, cap, sagaw, CAP_SAGAW_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, cap, ndoms, CAP_NDOMS_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, ecap, pss, ECAP_PSS_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, ecap, max_handle_mask, ECAP_MHMV_MASK, mismatch); - MINIMAL_FEATURE_HOTPLUG(iommu, ecap, iotlb_offset, ECAP_IRO_MASK, mismatch); - -out: - if (mismatch) { - intel_iommu_cap_sanity = old_cap; - intel_iommu_ecap_sanity = old_ecap; - return -EFAULT; - } - - return 0; -} - -static int cap_audit_static(struct intel_iommu *iommu, enum cap_audit_type type) -{ - struct dmar_drhd_unit *d; - struct intel_iommu *i; - int rc = 0; - - rcu_read_lock(); - if (list_empty(&dmar_drhd_units)) - goto out; - - for_each_active_iommu(i, d) { - if (!iommu) { - intel_iommu_ecap_sanity = i->ecap; - intel_iommu_cap_sanity = i->cap; - iommu = i; - continue; - } - - if (type == CAP_AUDIT_STATIC_DMAR) - check_dmar_capabilities(iommu, i); - else - check_irq_capabilities(iommu, i); - } - - /* - * If the system is sane to support scalable mode, either SL or FL - * should be sane. 
- */ - if (intel_cap_smts_sanity() && - !intel_cap_flts_sanity() && !intel_cap_slts_sanity()) - rc = -EOPNOTSUPP; - -out: - rcu_read_unlock(); - return rc; -} - -int intel_cap_audit(enum cap_audit_type type, struct intel_iommu *iommu) -{ - switch (type) { - case CAP_AUDIT_STATIC_DMAR: - case CAP_AUDIT_STATIC_IRQR: - return cap_audit_static(iommu, type); - case CAP_AUDIT_HOTPLUG_DMAR: - case CAP_AUDIT_HOTPLUG_IRQR: - return cap_audit_hotplug(iommu, type); - default: - break; - } - - return -EFAULT; -} - -bool intel_cap_smts_sanity(void) -{ - return ecap_smts(intel_iommu_ecap_sanity); -} - -bool intel_cap_pasid_sanity(void) -{ - return ecap_pasid(intel_iommu_ecap_sanity); -} - -bool intel_cap_nest_sanity(void) -{ - return ecap_nest(intel_iommu_ecap_sanity); -} - -bool intel_cap_flts_sanity(void) -{ - return ecap_flts(intel_iommu_ecap_sanity); -} - -bool intel_cap_slts_sanity(void) -{ - return ecap_slts(intel_iommu_ecap_sanity); -} diff --git a/drivers/iommu/intel/cap_audit.h b/drivers/iommu/intel/cap_audit.h deleted file mode 100644 index d07b75938961..000000000000 --- a/drivers/iommu/intel/cap_audit.h +++ /dev/null @@ -1,131 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * cap_audit.h - audit iommu capabilities header - * - * Copyright (C) 2021 Intel Corporation - * - * Author: Kyung Min Park <kyung.min.park@intel.com> - */ - -/* - * Capability Register Mask - */ -#define CAP_FL5LP_MASK BIT_ULL(60) -#define CAP_PI_MASK BIT_ULL(59) -#define CAP_FL1GP_MASK BIT_ULL(56) -#define CAP_RD_MASK BIT_ULL(55) -#define CAP_WD_MASK BIT_ULL(54) -#define CAP_MAMV_MASK GENMASK_ULL(53, 48) -#define CAP_NFR_MASK GENMASK_ULL(47, 40) -#define CAP_PSI_MASK BIT_ULL(39) -#define CAP_SLLPS_MASK GENMASK_ULL(37, 34) -#define CAP_FRO_MASK GENMASK_ULL(33, 24) -#define CAP_ZLR_MASK BIT_ULL(22) -#define CAP_MGAW_MASK GENMASK_ULL(21, 16) -#define CAP_SAGAW_MASK GENMASK_ULL(12, 8) -#define CAP_CM_MASK BIT_ULL(7) -#define CAP_PHMR_MASK BIT_ULL(6) -#define CAP_PLMR_MASK BIT_ULL(5) -#define CAP_RWBF_MASK BIT_ULL(4) -#define CAP_AFL_MASK BIT_ULL(3) -#define CAP_NDOMS_MASK GENMASK_ULL(2, 0) - -/* - * Extended Capability Register Mask - */ -#define ECAP_RPS_MASK BIT_ULL(49) -#define ECAP_SMPWC_MASK BIT_ULL(48) -#define ECAP_FLTS_MASK BIT_ULL(47) -#define ECAP_SLTS_MASK BIT_ULL(46) -#define ECAP_SLADS_MASK BIT_ULL(45) -#define ECAP_VCS_MASK BIT_ULL(44) -#define ECAP_SMTS_MASK BIT_ULL(43) -#define ECAP_PDS_MASK BIT_ULL(42) -#define ECAP_DIT_MASK BIT_ULL(41) -#define ECAP_PASID_MASK BIT_ULL(40) -#define ECAP_PSS_MASK GENMASK_ULL(39, 35) -#define ECAP_EAFS_MASK BIT_ULL(34) -#define ECAP_NWFS_MASK BIT_ULL(33) -#define ECAP_SRS_MASK BIT_ULL(31) -#define ECAP_ERS_MASK BIT_ULL(30) -#define ECAP_PRS_MASK BIT_ULL(29) -#define ECAP_NEST_MASK BIT_ULL(26) -#define ECAP_MTS_MASK BIT_ULL(25) -#define ECAP_MHMV_MASK GENMASK_ULL(23, 20) -#define ECAP_IRO_MASK GENMASK_ULL(17, 8) -#define ECAP_SC_MASK BIT_ULL(7) -#define ECAP_PT_MASK BIT_ULL(6) -#define ECAP_EIM_MASK BIT_ULL(4) -#define ECAP_DT_MASK BIT_ULL(2) -#define ECAP_QI_MASK BIT_ULL(1) -#define ECAP_C_MASK BIT_ULL(0) - -/* - * u64 intel_iommu_cap_sanity, intel_iommu_ecap_sanity will be adjusted as each - * IOMMU gets audited. 
- */ -#define DO_CHECK_FEATURE_MISMATCH(a, b, cap, feature, MASK) \ -do { \ - if (cap##_##feature(a) != cap##_##feature(b)) { \ - intel_iommu_##cap##_sanity &= ~(MASK); \ - pr_info("IOMMU feature %s inconsistent", #feature); \ - } \ -} while (0) - -#define CHECK_FEATURE_MISMATCH(a, b, cap, feature, MASK) \ - DO_CHECK_FEATURE_MISMATCH((a)->cap, (b)->cap, cap, feature, MASK) - -#define CHECK_FEATURE_MISMATCH_HOTPLUG(b, cap, feature, MASK) \ -do { \ - if (cap##_##feature(intel_iommu_##cap##_sanity)) \ - DO_CHECK_FEATURE_MISMATCH(intel_iommu_##cap##_sanity, \ - (b)->cap, cap, feature, MASK); \ -} while (0) - -#define MINIMAL_FEATURE_IOMMU(iommu, cap, MASK) \ -do { \ - u64 min_feature = intel_iommu_##cap##_sanity & (MASK); \ - min_feature = min_t(u64, min_feature, (iommu)->cap & (MASK)); \ - intel_iommu_##cap##_sanity = (intel_iommu_##cap##_sanity & ~(MASK)) | \ - min_feature; \ -} while (0) - -#define MINIMAL_FEATURE_HOTPLUG(iommu, cap, feature, MASK, mismatch) \ -do { \ - if ((intel_iommu_##cap##_sanity & (MASK)) > \ - (cap##_##feature((iommu)->cap))) \ - mismatch = true; \ - else \ - (iommu)->cap = ((iommu)->cap & ~(MASK)) | \ - (intel_iommu_##cap##_sanity & (MASK)); \ -} while (0) - -enum cap_audit_type { - CAP_AUDIT_STATIC_DMAR, - CAP_AUDIT_STATIC_IRQR, - CAP_AUDIT_HOTPLUG_DMAR, - CAP_AUDIT_HOTPLUG_IRQR, -}; - -bool intel_cap_smts_sanity(void); -bool intel_cap_pasid_sanity(void); -bool intel_cap_nest_sanity(void); -bool intel_cap_flts_sanity(void); -bool intel_cap_slts_sanity(void); - -static inline bool scalable_mode_support(void) -{ - return (intel_iommu_sm && intel_cap_smts_sanity()); -} - -static inline bool pasid_mode_support(void) -{ - return scalable_mode_support() && intel_cap_pasid_sanity(); -} - -static inline bool nested_mode_support(void) -{ - return scalable_mode_support() && intel_cap_nest_sanity(); -} - -int intel_cap_audit(enum cap_audit_type type, struct intel_iommu *iommu); diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index affbf4a1558d..617fd81a80f0 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -62,8 +62,6 @@ static const struct iommu_regset iommu_regs_64[] = { IOMMU_REGSET_ENTRY(CAP), IOMMU_REGSET_ENTRY(ECAP), IOMMU_REGSET_ENTRY(RTADDR), - IOMMU_REGSET_ENTRY(CCMD), - IOMMU_REGSET_ENTRY(AFLOG), IOMMU_REGSET_ENTRY(PHMBASE), IOMMU_REGSET_ENTRY(PHMLIMIT), IOMMU_REGSET_ENTRY(IQH), @@ -435,8 +433,21 @@ static int domain_translation_struct_show(struct seq_file *m, } pgd &= VTD_PAGE_MASK; } else { /* legacy mode */ - pgd = context->lo & VTD_PAGE_MASK; - agaw = context->hi & 7; + u8 tt = (u8)(context->lo & GENMASK_ULL(3, 2)) >> 2; + + /* + * According to Translation Type(TT), + * get the page table pointer(SSPTPTR). 
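For reference, the Translation Type field occupies bits 3:2 of the low qword of a legacy-mode context entry; the extraction performed below could equally live in a small helper (context_get_tt() is a hypothetical name, not a driver symbol):

	/* Context entry, low qword, bits 3:2: Translation Type (TT). */
	static u8 context_get_tt(const struct context_entry *context)
	{
		return (context->lo & GENMASK_ULL(3, 2)) >> 2;
	}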
+ */ + switch (tt) { + case CONTEXT_TT_MULTI_LEVEL: + case CONTEXT_TT_DEV_IOTLB: + pgd = context->lo & VTD_PAGE_MASK; + agaw = context->hi & 7; + break; + default: + goto iommu_unlock; + } } seq_printf(m, "Device %04x:%02x:%02x.%x ", @@ -648,17 +659,11 @@ DEFINE_SHOW_ATTRIBUTE(ir_translation_struct); static void latency_show_one(struct seq_file *m, struct intel_iommu *iommu, struct dmar_drhd_unit *drhd) { - int ret; - seq_printf(m, "IOMMU: %s Register Base Address: %llx\n", iommu->name, drhd->reg_base_addr); - ret = dmar_latency_snapshot(iommu, debug_buf, DEBUG_BUFFER_SIZE); - if (ret < 0) - seq_puts(m, "Failed to get latency snapshot"); - else - seq_puts(m, debug_buf); - seq_puts(m, "\n"); + dmar_latency_snapshot(iommu, debug_buf, DEBUG_BUFFER_SIZE); + seq_printf(m, "%s\n", debug_buf); } static int latency_show(struct seq_file *m, void *v) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 304e84949ca7..ec975c73cfe6 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -935,14 +935,11 @@ void __init detect_intel_iommu(void) pci_request_acs(); } -#ifdef CONFIG_X86 if (!ret) { x86_init.iommu.iommu_init = intel_iommu_init; x86_platform.iommu_shutdown = intel_iommu_shutdown; } -#endif - if (dmar_tbl) { acpi_put_table(dmar_tbl); dmar_tbl = NULL; @@ -1060,7 +1057,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) err = iommu->seq_id; goto error; } - sprintf(iommu->name, "dmar%d", iommu->seq_id); + snprintf(iommu->name, sizeof(iommu->name), "dmar%d", iommu->seq_id); err = map_iommu(iommu, drhd); if (err) { @@ -1099,6 +1096,9 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) spin_lock_init(&iommu->device_rbtree_lock); mutex_init(&iommu->iopf_lock); iommu->node = NUMA_NO_NODE; + spin_lock_init(&iommu->lock); + ida_init(&iommu->domain_ida); + mutex_init(&iommu->did_lock); ver = readl(iommu->reg + DMAR_VER_REG); pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n", @@ -1187,7 +1187,7 @@ static void free_iommu(struct intel_iommu *iommu) } if (iommu->qi) { - iommu_free_page(iommu->qi->desc); + iommu_free_pages(iommu->qi->desc); kfree(iommu->qi->desc_status); kfree(iommu->qi); } @@ -1195,6 +1195,7 @@ static void free_iommu(struct intel_iommu *iommu) if (iommu->reg) unmap_iommu(iommu); + ida_destroy(&iommu->domain_ida); ida_free(&dmar_seq_ids, iommu->seq_id); kfree(iommu); } @@ -1204,9 +1205,7 @@ static void free_iommu(struct intel_iommu *iommu) */ static inline void reclaim_free_desc(struct q_inval *qi) { - while (qi->desc_status[qi->free_tail] == QI_DONE || - qi->desc_status[qi->free_tail] == QI_ABORT) { - qi->desc_status[qi->free_tail] = QI_FREE; + while (qi->desc_status[qi->free_tail] == QI_FREE && qi->free_tail != qi->free_head) { qi->free_tail = (qi->free_tail + 1) % QI_LENGTH; qi->free_cnt++; } @@ -1446,7 +1445,7 @@ restart: */ writel(qi->free_head << shift, iommu->reg + DMAR_IQT_REG); - while (qi->desc_status[wait_index] != QI_DONE) { + while (READ_ONCE(qi->desc_status[wait_index]) != QI_DONE) { /* * We will leave the interrupts disabled, to prevent interrupt * context to queue another cmd while a cmd is already submitted @@ -1463,8 +1462,16 @@ restart: raw_spin_lock(&qi->q_lock); } - for (i = 0; i < count; i++) - qi->desc_status[(index + i) % QI_LENGTH] = QI_DONE; + /* + * The reclaim code can free descriptors from multiple submissions + * starting from the tail of the queue. 
When count == 0, the + * status of the standalone wait descriptor at the tail of the queue + * must be set to QI_FREE to allow the reclaim code to proceed. + * It is also possible that descriptors from one of the previous + * submissions have to be reclaimed by a subsequent submission. + */ + for (i = 0; i <= count; i++) + qi->desc_status[(index + i) % QI_LENGTH] = QI_FREE; reclaim_free_desc(qi); raw_spin_unlock_irqrestore(&qi->q_lock, flags); @@ -1520,24 +1527,9 @@ void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type) { - u8 dw = 0, dr = 0; - struct qi_desc desc; - int ih = 0; - - if (cap_write_drain(iommu->cap)) - dw = 1; - - if (cap_read_drain(iommu->cap)) - dr = 1; - - desc.qw0 = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw) - | QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE; - desc.qw1 = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih) - | QI_IOTLB_AM(size_order); - desc.qw2 = 0; - desc.qw3 = 0; + qi_desc_iotlb(iommu, did, addr, size_order, type, &desc); qi_submit_sync(iommu, &desc, 1, 0); } @@ -1555,20 +1547,7 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, if (!(iommu->gcmd & DMA_GCMD_TE)) return; - if (mask) { - addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1; - desc.qw1 = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE; - } else - desc.qw1 = QI_DEV_IOTLB_ADDR(addr); - - if (qdep >= QI_DEV_IOTLB_MAX_INVS) - qdep = 0; - - desc.qw0 = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) | - QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid); - desc.qw2 = 0; - desc.qw3 = 0; - + qi_desc_dev_iotlb(sid, pfsid, qdep, addr, mask, &desc); qi_submit_sync(iommu, &desc, 1, 0); } @@ -1588,28 +1567,7 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, return; } - if (npages == -1) { - desc.qw0 = QI_EIOTLB_PASID(pasid) | - QI_EIOTLB_DID(did) | - QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | - QI_EIOTLB_TYPE; - desc.qw1 = 0; - } else { - int mask = ilog2(__roundup_pow_of_two(npages)); - unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask)); - - if (WARN_ON_ONCE(!IS_ALIGNED(addr, align))) - addr = ALIGN_DOWN(addr, align); - - desc.qw0 = QI_EIOTLB_PASID(pasid) | - QI_EIOTLB_DID(did) | - QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | - QI_EIOTLB_TYPE; - desc.qw1 = QI_EIOTLB_ADDR(addr) | - QI_EIOTLB_IH(ih) | - QI_EIOTLB_AM(mask); - } - + qi_desc_piotlb(did, pasid, addr, npages, ih, &desc); qi_submit_sync(iommu, &desc, 1, 0); } @@ -1617,7 +1575,6 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, u32 pasid, u16 qdep, u64 addr, unsigned int size_order) { - unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1); struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; /* @@ -1629,40 +1586,9 @@ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, if (!(iommu->gcmd & DMA_GCMD_TE)) return; - desc.qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) | - QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE | - QI_DEV_IOTLB_PFSID(pfsid); - - /* - * If S bit is 0, we only flush a single page. If S bit is set, - * The least significant zero bit indicates the invalidation address - * range. VT-d spec 6.5.2.6. - * e.g. address bit 12[0] indicates 8KB, 13[0] indicates 16KB. - * size order = 0 is PAGE_SIZE 4KB - * Max Invs Pending (MIP) is set to 0 for now until we have DIT in - * ECAP. 
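The address/size encoding described here, now centralized in the qi_desc_dev_iotlb_pasid() helper, is subtle enough to isolate. A sketch of just the arithmetic, assuming 4KiB base pages; qi_encode_dev_eiotlb_addr() is a hypothetical name:

	static u64 qi_encode_dev_eiotlb_addr(u64 addr, unsigned int size_order)
	{
		u64 qw = QI_DEV_EIOTLB_ADDR(addr);

		if (size_order) {
			/* Set the bits below the size bit so that no stray
			 * zero shrinks the range (VT-d spec 6.5.2.6)...
			 */
			qw |= GENMASK_ULL(size_order + VTD_PAGE_SHIFT - 1,
					  VTD_PAGE_SHIFT);
			/* ...clear the size bit itself... */
			qw &= ~(1ULL << (VTD_PAGE_SHIFT + size_order - 1));
			/* ...and set S to mark a multi-page invalidation. */
			qw |= QI_DEV_EIOTLB_SIZE;
		}
		return qw;
	}

For size_order == 2, bits 13:12 are first set and bit 13 is then cleared again, so the least significant zero bit (13) encodes a 16KiB range, matching the examples in the comment above.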
- */ - if (!IS_ALIGNED(addr, VTD_PAGE_SIZE << size_order)) - pr_warn_ratelimited("Invalidate non-aligned address %llx, order %d\n", - addr, size_order); - - /* Take page address */ - desc.qw1 = QI_DEV_EIOTLB_ADDR(addr); - - if (size_order) { - /* - * Existing 0s in address below size_order may be the least - * significant bit, we must set them to 1s to avoid having - * smaller size than desired. - */ - desc.qw1 |= GENMASK_ULL(size_order + VTD_PAGE_SHIFT - 1, - VTD_PAGE_SHIFT); - /* Clear size_order bit to indicate size */ - desc.qw1 &= ~mask; - /* Set the S bit to indicate flushing more than 1 page */ - desc.qw1 |= QI_DEV_EIOTLB_SIZE; - } - + qi_desc_dev_iotlb_pasid(sid, pfsid, pasid, + qdep, addr, size_order, + &desc); qi_submit_sync(iommu, &desc, 1, 0); } @@ -1756,7 +1682,6 @@ int dmar_enable_qi(struct intel_iommu *iommu) { struct q_inval *qi; void *desc; - int order; if (!ecap_qis(iommu->ecap)) return -ENOENT; @@ -1777,8 +1702,9 @@ int dmar_enable_qi(struct intel_iommu *iommu) * Need two pages to accommodate 256 descriptors of 256 bits each * if the remapping hardware supports scalable mode translation. */ - order = ecap_smts(iommu->ecap) ? 1 : 0; - desc = iommu_alloc_pages_node(iommu->node, GFP_ATOMIC, order); + desc = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, + ecap_smts(iommu->ecap) ? SZ_8K : + SZ_4K); if (!desc) { kfree(qi); iommu->qi = NULL; @@ -1789,7 +1715,7 @@ int dmar_enable_qi(struct intel_iommu *iommu) qi->desc_status = kcalloc(QI_LENGTH, sizeof(int), GFP_ATOMIC); if (!qi->desc_status) { - iommu_free_page(qi->desc); + iommu_free_pages(qi->desc); kfree(qi); iommu->qi = NULL; return -ENOMEM; @@ -1970,19 +1896,6 @@ void dmar_msi_write(int irq, struct msi_msg *msg) raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } -void dmar_msi_read(int irq, struct msi_msg *msg) -{ - struct intel_iommu *iommu = irq_get_handler_data(irq); - int reg = dmar_msi_reg(iommu, irq); - unsigned long flag; - - raw_spin_lock_irqsave(&iommu->register_lock, flag); - msg->data = readl(iommu->reg + reg + 4); - msg->address_lo = readl(iommu->reg + reg + 8); - msg->address_hi = readl(iommu->reg + reg + 12); - raw_spin_unlock_irqrestore(&iommu->register_lock, flag); -} - static int dmar_fault_do_one(struct intel_iommu *iommu, int type, u8 fault_reason, u32 pasid, u16 source_id, unsigned long long addr) @@ -2131,6 +2044,7 @@ int enable_drhd_fault_handling(unsigned int cpu) /* * Enable fault control interrupt. 
*/ + guard(rwsem_read)(&dmar_global_lock); for_each_iommu(iommu, drhd) { u32 fault_status; int ret; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index fd11a080380c..134302fbcd92 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -29,13 +29,12 @@ #include "../irq_remapping.h" #include "../iommu-pages.h" #include "pasid.h" -#include "cap_audit.h" #include "perfmon.h" #define ROOT_SIZE VTD_PAGE_SIZE #define CONTEXT_SIZE VTD_PAGE_SIZE -#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) +#define IS_GFX_DEVICE(pdev) pci_is_display(pdev) #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) @@ -46,18 +45,13 @@ #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 -#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) -#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) - -/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR - to match. That way, we can use 'unsigned long' for PFNs with impunity. */ -#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ - __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) -#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) - static void __init check_tylersburg_isoch(void); +static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable); static int rwbf_quirk; +#define rwbf_required(iommu) (rwbf_quirk || cap_rwbf((iommu)->cap)) + /* * set to 1 to panic kernel if can't successfully enable VT-d * (used when kernel is launched w/ TXT) @@ -167,15 +161,6 @@ static void device_rbtree_remove(struct device_domain_info *info) spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); } -/* - * This domain is a statically identity mapping domain. - * 1. This domain creats a static 1:1 mapping to all usable memory. - * 2. It maps to each iommu if successful. - * 3. Each iommu mapps to this domain if successful. - */ -static struct dmar_domain *si_domain; -static int hw_pass_through = 1; - struct dmar_rmrr_unit { struct list_head list; /* list of rmrr units */ struct acpi_dmar_header *hdr; /* ACPI header */ @@ -225,7 +210,6 @@ static int disable_igfx_iommu; #define IDENTMAP_AZALIA 4 const struct iommu_ops intel_iommu_ops; -static const struct iommu_dirty_ops intel_dirty_ops; static bool translation_pre_enabled(struct intel_iommu *iommu) { @@ -293,18 +277,6 @@ static int __init intel_iommu_setup(char *str) } __setup("intel_iommu=", intel_iommu_setup); -static int domain_type_is_si(struct dmar_domain *domain) -{ - return domain->domain.type == IOMMU_DOMAIN_IDENTITY; -} - -static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) -{ - int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; - - return !(addr_width < BITS_PER_LONG && pfn >> addr_width); -} - /* * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. 
* Refer to 11.4.2 of the VT-d spec for the encoding of each bit of @@ -366,135 +338,6 @@ static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); } -static void domain_update_iommu_coherency(struct dmar_domain *domain) -{ - struct iommu_domain_info *info; - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - bool found = false; - unsigned long i; - - domain->iommu_coherency = true; - xa_for_each(&domain->iommu_array, i, info) { - found = true; - if (!iommu_paging_structure_coherency(info->iommu)) { - domain->iommu_coherency = false; - break; - } - } - if (found) - return; - - /* No hardware attached; use lowest common denominator */ - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { - if (!iommu_paging_structure_coherency(iommu)) { - domain->iommu_coherency = false; - break; - } - } - rcu_read_unlock(); -} - -static int domain_update_iommu_superpage(struct dmar_domain *domain, - struct intel_iommu *skip) -{ - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - int mask = 0x3; - - if (!intel_iommu_superpage) - return 0; - - /* set iommu_superpage to the smallest common denominator */ - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { - if (iommu != skip) { - if (domain && domain->use_first_level) { - if (!cap_fl1gp_support(iommu->cap)) - mask = 0x1; - } else { - mask &= cap_super_page_val(iommu->cap); - } - - if (!mask) - break; - } - } - rcu_read_unlock(); - - return fls(mask); -} - -static int domain_update_device_node(struct dmar_domain *domain) -{ - struct device_domain_info *info; - int nid = NUMA_NO_NODE; - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - list_for_each_entry(info, &domain->devices, link) { - /* - * There could possibly be multiple device numa nodes as devices - * within the same domain may sit behind different IOMMUs. There - * isn't perfect answer in such situation, so we select first - * come first served policy. - */ - nid = dev_to_node(info->dev); - if (nid != NUMA_NO_NODE) - break; - } - spin_unlock_irqrestore(&domain->lock, flags); - - return nid; -} - -/* Return the super pagesize bitmap if supported. */ -static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) -{ - unsigned long bitmap = 0; - - /* - * 1-level super page supports page size of 2MiB, 2-level super page - * supports page size of both 2MiB and 1GiB. - */ - if (domain->iommu_superpage == 1) - bitmap |= SZ_2M; - else if (domain->iommu_superpage == 2) - bitmap |= SZ_2M | SZ_1G; - - return bitmap; -} - -/* Some capabilities may be different across iommus */ -void domain_update_iommu_cap(struct dmar_domain *domain) -{ - domain_update_iommu_coherency(domain); - domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); - - /* - * If RHSA is missing, we should default to the device numa domain - * as fall back. - */ - if (domain->nid == NUMA_NO_NODE) - domain->nid = domain_update_device_node(domain); - - /* - * First-level translation restricts the input-address to a - * canonical address (i.e., address bits 63:N have the same - * value as address bit [N-1], where N is 48-bits with 4-level - * paging and 57-bits with 5-level paging). Hence, skip bit - * [N-1]. 
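A worked example of the rule above, using the __DOMAIN_MAX_ADDR() definition removed earlier in this file: with a 48-bit GAW, second-level translation addresses all 48 bits, while first-level translation gives up bit 47 to sign extension:

	u64 sl_end = __DOMAIN_MAX_ADDR(48);	/* 0x0000ffffffffffff */
	u64 fl_end = __DOMAIN_MAX_ADDR(47);	/* 0x00007fffffffffff */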
- */ - if (domain->use_first_level) - domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); - else - domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); - - domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); - domain_update_iotlb(domain); -} - struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, u8 devfn, int alloc) { @@ -524,7 +367,8 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, if (!alloc) return NULL; - context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC); + context = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, + SZ_4K); if (!context) return NULL; @@ -680,13 +524,6 @@ out: return iommu; } -static void domain_flush_cache(struct dmar_domain *domain, - void *addr, int size) -{ - if (!domain->iommu_coherency) - clflush_cache_range(addr, size); -} - static void free_context_table(struct intel_iommu *iommu) { struct context_entry *context; @@ -698,17 +535,17 @@ static void free_context_table(struct intel_iommu *iommu) for (i = 0; i < ROOT_ENTRY_NR; i++) { context = iommu_context_addr(iommu, i, 0, 0); if (context) - iommu_free_page(context); + iommu_free_pages(context); if (!sm_supported(iommu)) continue; context = iommu_context_addr(iommu, i, 0x80, 0); if (context) - iommu_free_page(context); + iommu_free_pages(context); } - iommu_free_page(iommu->root_entry); + iommu_free_pages(iommu->root_entry); iommu->root_entry = NULL; } @@ -722,14 +559,15 @@ static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, while (1) { offset = pfn_level_offset(pfn, level); pte = &parent[offset]; - if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { - pr_info("PTE not present at level %d\n", level); - break; - } pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); - if (level == 1) + if (!dma_pte_present(pte)) { + pr_info("page table not present at level %d\n", level - 1); + break; + } + + if (level == 1 || dma_pte_superpage(pte)) break; parent = phys_to_virt(dma_pte_addr(pte)); @@ -752,11 +590,11 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); /* root entry dump */ - rt_entry = &iommu->root_entry[bus]; - if (!rt_entry) { - pr_info("root table entry is not present\n"); + if (!iommu->root_entry) { + pr_info("root table is not present\n"); return; } + rt_entry = &iommu->root_entry[bus]; if (sm_supported(iommu)) pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", @@ -767,7 +605,7 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, /* context entry dump */ ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); if (!ctx_entry) { - pr_info("context table entry is not present\n"); + pr_info("context table is not present\n"); return; } @@ -776,17 +614,23 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, /* legacy mode does not require PASID entries */ if (!sm_supported(iommu)) { + if (!context_present(ctx_entry)) { + pr_info("legacy mode page table is not present\n"); + return; + } level = agaw_to_level(ctx_entry->hi & 7); pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); goto pgtable_walk; } - /* get the pointer to pasid directory entry */ - dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); - if (!dir) { - pr_info("pasid directory entry is not present\n"); + if (!context_present(ctx_entry)) { + pr_info("pasid directory table is not present\n"); return; } + + /* get the pointer to pasid directory entry */ + 
dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); + /* For request-without-pasid, get the pasid from context entry */ if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID) pasid = IOMMU_NO_PASID; @@ -798,7 +642,7 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, /* get the pointer to the pasid table entry */ entries = get_pasid_table_from_pde(pde); if (!entries) { - pr_info("pasid table entry is not present\n"); + pr_info("pasid table is not present\n"); return; } index = pasid & PASID_PTE_MASK; @@ -806,6 +650,11 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, for (i = 0; i < ARRAY_SIZE(pte->val); i++) pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); + if (!pasid_pte_is_present(pte)) { + pr_info("scalable mode page table is not present\n"); + return; + } + if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) { level = pte->val[2] & BIT_ULL(2) ? 5 : 4; pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK); @@ -819,286 +668,12 @@ pgtable_walk: } #endif -static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, - unsigned long pfn, int *target_level, - gfp_t gfp) -{ - struct dma_pte *parent, *pte; - int level = agaw_to_level(domain->agaw); - int offset; - - if (!domain_pfn_supported(domain, pfn)) - /* Address beyond IOMMU's addressing capabilities. */ - return NULL; - - parent = domain->pgd; - - while (1) { - void *tmp_page; - - offset = pfn_level_offset(pfn, level); - pte = &parent[offset]; - if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte))) - break; - if (level == *target_level) - break; - - if (!dma_pte_present(pte)) { - uint64_t pteval, tmp; - - tmp_page = iommu_alloc_page_node(domain->nid, gfp); - - if (!tmp_page) - return NULL; - - domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); - pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; - if (domain->use_first_level) - pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; - - tmp = 0ULL; - if (!try_cmpxchg64(&pte->val, &tmp, pteval)) - /* Someone else set it while we were thinking; use theirs. 
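try_cmpxchg64() publishes the freshly allocated table only if the slot is still empty and, on failure, rewrites the expected value with whatever the winner installed. The same semantics in miniature, as a self-contained sketch rather than driver code:

	u64 slot = 0, expected = 0, newval = 0xabcd;

	if (try_cmpxchg64(&slot, &expected, newval)) {
		/* we won: slot now holds newval */
	} else {
		/* we lost: expected now holds the other writer's value */
	}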
*/ - iommu_free_page(tmp_page); - else - domain_flush_cache(domain, pte, sizeof(*pte)); - } - if (level == 1) - break; - - parent = phys_to_virt(dma_pte_addr(pte)); - level--; - } - - if (!*target_level) - *target_level = level; - - return pte; -} - -/* return address's pte at specific level */ -static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, - unsigned long pfn, - int level, int *large_page) -{ - struct dma_pte *parent, *pte; - int total = agaw_to_level(domain->agaw); - int offset; - - parent = domain->pgd; - while (level <= total) { - offset = pfn_level_offset(pfn, total); - pte = &parent[offset]; - if (level == total) - return pte; - - if (!dma_pte_present(pte)) { - *large_page = total; - break; - } - - if (dma_pte_superpage(pte)) { - *large_page = total; - return pte; - } - - parent = phys_to_virt(dma_pte_addr(pte)); - total--; - } - return NULL; -} - -/* clear last level pte, a tlb flush should be followed */ -static void dma_pte_clear_range(struct dmar_domain *domain, - unsigned long start_pfn, - unsigned long last_pfn) -{ - unsigned int large_page; - struct dma_pte *first_pte, *pte; - - if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || - WARN_ON(start_pfn > last_pfn)) - return; - - /* we don't need lock here; nobody else touches the iova range */ - do { - large_page = 1; - first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); - if (!pte) { - start_pfn = align_to_level(start_pfn + 1, large_page + 1); - continue; - } - do { - dma_clear_pte(pte); - start_pfn += lvl_to_nr_pages(large_page); - pte++; - } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); - - domain_flush_cache(domain, first_pte, - (void *)pte - (void *)first_pte); - - } while (start_pfn && start_pfn <= last_pfn); -} - -static void dma_pte_free_level(struct dmar_domain *domain, int level, - int retain_level, struct dma_pte *pte, - unsigned long pfn, unsigned long start_pfn, - unsigned long last_pfn) -{ - pfn = max(start_pfn, pfn); - pte = &pte[pfn_level_offset(pfn, level)]; - - do { - unsigned long level_pfn; - struct dma_pte *level_pte; - - if (!dma_pte_present(pte) || dma_pte_superpage(pte)) - goto next; - - level_pfn = pfn & level_mask(level); - level_pte = phys_to_virt(dma_pte_addr(pte)); - - if (level > 2) { - dma_pte_free_level(domain, level - 1, retain_level, - level_pte, level_pfn, start_pfn, - last_pfn); - } - - /* - * Free the page table if we're below the level we want to - * retain and the range covers the entire table. - */ - if (level < retain_level && !(start_pfn > level_pfn || - last_pfn < level_pfn + level_size(level) - 1)) { - dma_clear_pte(pte); - domain_flush_cache(domain, pte, sizeof(*pte)); - iommu_free_page(level_pte); - } -next: - pfn += level_size(level); - } while (!first_pte_in_page(++pte) && pfn <= last_pfn); -} - -/* - * clear last level (leaf) ptes and free page table pages below the - * level we wish to keep intact. - */ -static void dma_pte_free_pagetable(struct dmar_domain *domain, - unsigned long start_pfn, - unsigned long last_pfn, - int retain_level) -{ - dma_pte_clear_range(domain, start_pfn, last_pfn); - - /* We don't need lock here; nobody else touches the iova range */ - dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level, - domain->pgd, 0, start_pfn, last_pfn); - - /* free pgd */ - if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { - iommu_free_page(domain->pgd); - domain->pgd = NULL; - } -} - -/* When a page at a given level is being unlinked from its parent, we don't - need to *modify* it at all. 
All we need to do is make a list of all the - pages which can be freed just as soon as we've flushed the IOTLB and we - know the hardware page-walk will no longer touch them. - The 'pte' argument is the *parent* PTE, pointing to the page that is to - be freed. */ -static void dma_pte_list_pagetables(struct dmar_domain *domain, - int level, struct dma_pte *pte, - struct list_head *freelist) -{ - struct page *pg; - - pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT); - list_add_tail(&pg->lru, freelist); - - if (level == 1) - return; - - pte = page_address(pg); - do { - if (dma_pte_present(pte) && !dma_pte_superpage(pte)) - dma_pte_list_pagetables(domain, level - 1, pte, freelist); - pte++; - } while (!first_pte_in_page(pte)); -} - -static void dma_pte_clear_level(struct dmar_domain *domain, int level, - struct dma_pte *pte, unsigned long pfn, - unsigned long start_pfn, unsigned long last_pfn, - struct list_head *freelist) -{ - struct dma_pte *first_pte = NULL, *last_pte = NULL; - - pfn = max(start_pfn, pfn); - pte = &pte[pfn_level_offset(pfn, level)]; - - do { - unsigned long level_pfn = pfn & level_mask(level); - - if (!dma_pte_present(pte)) - goto next; - - /* If range covers entire pagetable, free it */ - if (start_pfn <= level_pfn && - last_pfn >= level_pfn + level_size(level) - 1) { - /* These suborbinate page tables are going away entirely. Don't - bother to clear them; we're just going to *free* them. */ - if (level > 1 && !dma_pte_superpage(pte)) - dma_pte_list_pagetables(domain, level - 1, pte, freelist); - - dma_clear_pte(pte); - if (!first_pte) - first_pte = pte; - last_pte = pte; - } else if (level > 1) { - /* Recurse down into a level that isn't *entirely* obsolete */ - dma_pte_clear_level(domain, level - 1, - phys_to_virt(dma_pte_addr(pte)), - level_pfn, start_pfn, last_pfn, - freelist); - } -next: - pfn = level_pfn + level_size(level); - } while (!first_pte_in_page(++pte) && pfn <= last_pfn); - - if (first_pte) - domain_flush_cache(domain, first_pte, - (void *)++last_pte - (void *)first_pte); -} - -/* We can't just free the pages because the IOMMU may still be walking - the page tables, and may have cached the intermediate levels. The - pages can only be freed after the IOTLB flush has been done. 
*/ -static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn, - unsigned long last_pfn, struct list_head *freelist) -{ - if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) || - WARN_ON(start_pfn > last_pfn)) - return; - - /* we don't need lock here; nobody else touches the iova range */ - dma_pte_clear_level(domain, agaw_to_level(domain->agaw), - domain->pgd, 0, start_pfn, last_pfn, freelist); - - /* free pgd */ - if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { - struct page *pgd_page = virt_to_page(domain->pgd); - list_add_tail(&pgd_page->lru, freelist); - domain->pgd = NULL; - } -} - /* iommu handling */ static int iommu_alloc_root_entry(struct intel_iommu *iommu) { struct root_entry *root; - root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC); + root = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, SZ_4K); if (!root) { pr_err("Allocating root entry for %s failed\n", iommu->name); @@ -1199,9 +774,8 @@ static void __iommu_flush_context(struct intel_iommu *iommu, raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } -/* return value determine if we need a write buffer flush */ -static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, - u64 addr, unsigned int size_order, u64 type) +void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type) { int tlb_offset = ecap_iotlb_offset(iommu->ecap); u64 val = 0, val_iva = 0; @@ -1270,32 +844,6 @@ domain_lookup_dev_info(struct dmar_domain *domain, return NULL; } -void domain_update_iotlb(struct dmar_domain *domain) -{ - struct dev_pasid_info *dev_pasid; - struct device_domain_info *info; - bool has_iotlb_device = false; - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - list_for_each_entry(info, &domain->devices, link) { - if (info->ats_enabled) { - has_iotlb_device = true; - break; - } - } - - list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { - info = dev_iommu_priv_get(dev_pasid->dev); - if (info->ats_enabled) { - has_iotlb_device = true; - break; - } - } - domain->has_iotlb_device = has_iotlb_device; - spin_unlock_irqrestore(&domain->lock, flags); -} - /* * The extra devTLB flush quirk impacts those QAT devices with PCI device * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() @@ -1314,64 +862,59 @@ static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) return true; } -static void iommu_enable_pci_caps(struct device_domain_info *info) +static void iommu_enable_pci_ats(struct device_domain_info *info) { struct pci_dev *pdev; - if (!dev_is_pci(info->dev)) + if (!info->ats_supported) return; pdev = to_pci_dev(info->dev); + if (!pci_ats_page_aligned(pdev)) + return; - /* The PCIe spec, in its wisdom, declares that the behaviour of - the device if you enable PASID support after ATS support is - undefined. So always enable PASID support on devices which - have it, even if we can't yet know if we're ever going to - use it. 
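The hunks below split that single enable path into per-capability helpers, but an ordering constraint survives in the dependencies: PRI needs ATS, and both come up before and go down after page-fault handling. A sketch of the pairing using the new helpers (the example_* wrappers are hypothetical):

	static void example_enable_iopf_path(struct device_domain_info *info)
	{
		iommu_enable_pci_ats(info);	/* no-op without ATS support */
		iommu_enable_pci_pri(info);	/* requires ats_enabled */
	}

	static void example_disable_iopf_path(struct device_domain_info *info)
	{
		iommu_disable_pci_pri(info);	/* reverse order on teardown */
		iommu_disable_pci_ats(info);
	}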
*/ - if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) - info->pasid_enabled = 1; - - if (info->ats_supported && pci_ats_page_aligned(pdev) && - !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { + if (!pci_enable_ats(pdev, VTD_PAGE_SHIFT)) info->ats_enabled = 1; - domain_update_iotlb(info->domain); - } } -static void iommu_disable_pci_caps(struct device_domain_info *info) +static void iommu_disable_pci_ats(struct device_domain_info *info) +{ + if (!info->ats_enabled) + return; + + pci_disable_ats(to_pci_dev(info->dev)); + info->ats_enabled = 0; +} + +static void iommu_enable_pci_pri(struct device_domain_info *info) { struct pci_dev *pdev; - if (!dev_is_pci(info->dev)) + if (!info->ats_enabled || !info->pri_supported) return; pdev = to_pci_dev(info->dev); + /* PASID is required in PRG Response Message. */ + if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) + return; - if (info->ats_enabled) { - pci_disable_ats(pdev); - info->ats_enabled = 0; - domain_update_iotlb(info->domain); - } + if (pci_reset_pri(pdev)) + return; - if (info->pasid_enabled) { - pci_disable_pasid(pdev); - info->pasid_enabled = 0; - } + if (!pci_enable_pri(pdev, PRQ_DEPTH)) + info->pri_enabled = 1; } -static void __iommu_flush_dev_iotlb(struct device_domain_info *info, - u64 addr, unsigned int mask) +static void iommu_disable_pci_pri(struct device_domain_info *info) { - u16 sid, qdep; - - if (!info || !info->ats_enabled) + if (!info->pri_enabled) return; - sid = info->bus << 8 | info->devfn; - qdep = info->ats_qdep; - qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, - qdep, addr, mask); - quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep); + if (WARN_ON(info->iopf_refcount)) + iopf_queue_remove_device(info->iommu->iopf_queue, info->dev); + + pci_disable_pri(to_pci_dev(info->dev)); + info->pri_enabled = 0; } static void intel_flush_iotlb_all(struct iommu_domain *domain) @@ -1435,52 +978,13 @@ static void iommu_disable_translation(struct intel_iommu *iommu) raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } -static int iommu_init_domains(struct intel_iommu *iommu) -{ - u32 ndomains; - - ndomains = cap_ndoms(iommu->cap); - pr_debug("%s: Number of Domains supported <%d>\n", - iommu->name, ndomains); - - spin_lock_init(&iommu->lock); - - iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL); - if (!iommu->domain_ids) - return -ENOMEM; - - /* - * If Caching mode is set, then invalid translations are tagged - * with domain-id 0, hence we need to pre-allocate it. We also - * use domain-id 0 as a marker for non-allocated domain-id, so - * make sure it is not used for a real domain. - */ - set_bit(0, iommu->domain_ids); - - /* - * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid - * entry for first-level or pass-through translation modes should - * be programmed with a domain id different from those used for - * second-level or nested translation. We reserve a domain id for - * this purpose. - */ - if (sm_supported(iommu)) - set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); - - return 0; -} - static void disable_dmar_iommu(struct intel_iommu *iommu) { - if (!iommu->domain_ids) - return; - /* * All iommu domains must have been detached from the devices, * hence there should be no domain IDs in use. 
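Throughout this series the per-IOMMU domain-ID bitmap becomes an IDA, so the assertion below can simply test for emptiness. The allocate/free pair, as used by domain_attach_iommu() and domain_detach_iommu() further down (error handling elided; IDA_START_DID keeps the hardware-reserved low IDs out of the allocator):

	int did = ida_alloc_range(&iommu->domain_ida, IDA_START_DID,
				  cap_ndoms(iommu->cap) - 1, GFP_KERNEL);
	if (did < 0)
		return did;
	/* ...the domain uses 'did' while attached to this IOMMU... */
	ida_free(&iommu->domain_ida, did);

Once every domain has detached, ida_is_empty() must hold, which is exactly what the rewritten WARN_ON checks.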
*/ - if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap)) - > NUM_RESERVED_DID)) + if (WARN_ON(!ida_is_empty(&iommu->domain_ida))) return; if (iommu->gcmd & DMA_GCMD_TE) @@ -1489,11 +993,6 @@ static void disable_dmar_iommu(struct intel_iommu *iommu) static void free_dmar_iommu(struct intel_iommu *iommu) { - if (iommu->domain_ids) { - bitmap_free(iommu->domain_ids); - iommu->domain_ids = NULL; - } - if (iommu->copied_tables) { bitmap_free(iommu->copied_tables); iommu->copied_tables = NULL; @@ -1502,58 +1001,30 @@ static void free_dmar_iommu(struct intel_iommu *iommu) /* free context mapping */ free_context_table(iommu); -#ifdef CONFIG_INTEL_IOMMU_SVM - if (pasid_supported(iommu)) { - if (ecap_prs(iommu->ecap)) - intel_svm_finish_prq(iommu); - } -#endif + if (ecap_prs(iommu->ecap)) + intel_iommu_finish_prq(iommu); } /* * Check and return whether first level is used by default for * DMA translation. */ -static bool first_level_by_default(unsigned int type) +static bool first_level_by_default(struct intel_iommu *iommu) { /* Only SL is available in legacy mode */ - if (!scalable_mode_support()) + if (!sm_supported(iommu)) return false; /* Only level (either FL or SL) is available, just use it */ - if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) - return intel_cap_flts_sanity(); - - /* Both levels are available, decide it based on domain type */ - return type != IOMMU_DOMAIN_UNMANAGED; -} - -static struct dmar_domain *alloc_domain(unsigned int type) -{ - struct dmar_domain *domain; + if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap)) + return ecap_flts(iommu->ecap); - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) - return NULL; - - domain->nid = NUMA_NO_NODE; - if (first_level_by_default(type)) - domain->use_first_level = true; - domain->has_iotlb_device = false; - INIT_LIST_HEAD(&domain->devices); - INIT_LIST_HEAD(&domain->dev_pasids); - INIT_LIST_HEAD(&domain->cache_tags); - spin_lock_init(&domain->lock); - spin_lock_init(&domain->cache_lock); - xa_init(&domain->iommu_array); - - return domain; + return true; } int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) { struct iommu_domain_info *info, *curr; - unsigned long ndomains; int num, ret = -ENOSPC; if (domain->domain.type == IOMMU_DOMAIN_SVA) @@ -1563,41 +1034,36 @@ int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) if (!info) return -ENOMEM; - spin_lock(&iommu->lock); + guard(mutex)(&iommu->did_lock); curr = xa_load(&domain->iommu_array, iommu->seq_id); if (curr) { curr->refcnt++; - spin_unlock(&iommu->lock); kfree(info); return 0; } - ndomains = cap_ndoms(iommu->cap); - num = find_first_zero_bit(iommu->domain_ids, ndomains); - if (num >= ndomains) { + num = ida_alloc_range(&iommu->domain_ida, IDA_START_DID, + cap_ndoms(iommu->cap) - 1, GFP_KERNEL); + if (num < 0) { pr_err("%s: No free domain ids\n", iommu->name); goto err_unlock; } - set_bit(num, iommu->domain_ids); info->refcnt = 1; info->did = num; info->iommu = iommu; curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id, - NULL, info, GFP_ATOMIC); + NULL, info, GFP_KERNEL); if (curr) { ret = xa_err(curr) ? 
: -EBUSY; goto err_clear; } - domain_update_iommu_cap(domain); - spin_unlock(&iommu->lock); return 0; err_clear: - clear_bit(info->did, iommu->domain_ids); + ida_free(&iommu->domain_ida, info->did); err_unlock: - spin_unlock(&iommu->lock); kfree(info); return ret; } @@ -1609,45 +1075,68 @@ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) if (domain->domain.type == IOMMU_DOMAIN_SVA) return; - spin_lock(&iommu->lock); + guard(mutex)(&iommu->did_lock); info = xa_load(&domain->iommu_array, iommu->seq_id); if (--info->refcnt == 0) { - clear_bit(info->did, iommu->domain_ids); + ida_free(&iommu->domain_ida, info->did); xa_erase(&domain->iommu_array, iommu->seq_id); - domain->nid = NUMA_NO_NODE; - domain_update_iommu_cap(domain); kfree(info); } - spin_unlock(&iommu->lock); } -static int guestwidth_to_adjustwidth(int gaw) +/* + * For kdump cases, old valid entries may be cached due to the + * in-flight DMA and copied pgtable, but there is no unmapping + * behaviour for them, thus we need an explicit cache flush for + * the newly-mapped device. For kdump, at this point, the device + * is supposed to finish reset at its driver probe stage, so no + * in-flight DMA will exist, and we don't need to worry anymore + * hereafter. + */ +static void copied_context_tear_down(struct intel_iommu *iommu, + struct context_entry *context, + u8 bus, u8 devfn) { - int agaw; - int r = (gaw - 12) % 9; + u16 did_old; - if (r == 0) - agaw = gaw; - else - agaw = gaw + 9 - r; - if (agaw > 64) - agaw = 64; - return agaw; -} + if (!context_copied(iommu, bus, devfn)) + return; -static void domain_exit(struct dmar_domain *domain) -{ - if (domain->pgd) { - LIST_HEAD(freelist); + assert_spin_locked(&iommu->lock); + + did_old = context_domain_id(context); + context_clear_entry(context); - domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); - iommu_put_pages_list(&freelist); + if (did_old < cap_ndoms(iommu->cap)) { + iommu->flush.flush_context(iommu, did_old, + PCI_DEVID(bus, devfn), + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + iommu->flush.flush_iotlb(iommu, did_old, 0, 0, + DMA_TLB_DSI_FLUSH); } - if (WARN_ON(!list_empty(&domain->devices))) - return; + clear_context_copied(iommu, bus, devfn); +} - kfree(domain); +/* + * It's a non-present to present mapping. If hardware doesn't cache + * non-present entry we only need to flush the write-buffer. 
If it + * _does_ cache non-present entries, then it does so in the special + * domain #0, which we have to flush: + */ +static void context_present_cache_flush(struct intel_iommu *iommu, u16 did, + u8 bus, u8 devfn) +{ + if (cap_caching_mode(iommu->cap)) { + iommu->flush.flush_context(iommu, 0, + PCI_DEVID(bus, devfn), + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); + } else { + iommu_flush_write_buffer(iommu); + } } static int domain_context_mapping_one(struct dmar_domain *domain, @@ -1658,12 +1147,14 @@ static int domain_context_mapping_one(struct dmar_domain *domain, domain_lookup_dev_info(domain, iommu, bus, devfn); u16 did = domain_id_iommu(domain, iommu); int translation = CONTEXT_TT_MULTI_LEVEL; - struct dma_pte *pgd = domain->pgd; + struct pt_iommu_vtdss_hw_info pt_info; struct context_entry *context; - int agaw, ret; + int ret; - if (hw_pass_through && domain_type_is_si(domain)) - translation = CONTEXT_TT_PASS_THROUGH; + if (WARN_ON(!intel_domain_is_ss_paging(domain))) + return -EINVAL; + + pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info); pr_debug("Set context mapping for %02x:%02x.%d\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); @@ -1678,83 +1169,23 @@ static int domain_context_mapping_one(struct dmar_domain *domain, if (context_present(context) && !context_copied(iommu, bus, devfn)) goto out_unlock; - /* - * For kdump cases, old valid entries may be cached due to the - * in-flight DMA and copied pgtable, but there is no unmapping - * behaviour for them, thus we need an explicit cache flush for - * the newly-mapped device. For kdump, at this point, the device - * is supposed to finish reset at its driver probe stage, so no - * in-flight DMA will exist, and we don't need to worry anymore - * hereafter. - */ - if (context_copied(iommu, bus, devfn)) { - u16 did_old = context_domain_id(context); - - if (did_old < cap_ndoms(iommu->cap)) { - iommu->flush.flush_context(iommu, did_old, - (((u16)bus) << 8) | devfn, - DMA_CCMD_MASK_NOBIT, - DMA_CCMD_DEVICE_INVL); - iommu->flush.flush_iotlb(iommu, did_old, 0, 0, - DMA_TLB_DSI_FLUSH); - } - - clear_context_copied(iommu, bus, devfn); - } - + copied_context_tear_down(iommu, context, bus, devfn); context_clear_entry(context); context_set_domain_id(context, did); - if (translation != CONTEXT_TT_PASS_THROUGH) { - /* - * Skip top levels of page tables for iommu which has - * less agaw than default. Unnecessary for PT mode. - */ - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - ret = -ENOMEM; - pgd = phys_to_virt(dma_pte_addr(pgd)); - if (!dma_pte_present(pgd)) - goto out_unlock; - } - - if (info && info->ats_supported) - translation = CONTEXT_TT_DEV_IOTLB; - else - translation = CONTEXT_TT_MULTI_LEVEL; - - context_set_address_root(context, virt_to_phys(pgd)); - context_set_address_width(context, agaw); - } else { - /* - * In pass through mode, AW must be programmed to - * indicate the largest AGAW value supported by - * hardware. And ASR is ignored by hardware. 
- */ - context_set_address_width(context, iommu->msagaw); - } + if (info && info->ats_supported) + translation = CONTEXT_TT_DEV_IOTLB; + else + translation = CONTEXT_TT_MULTI_LEVEL; + context_set_address_root(context, pt_info.ssptptr); + context_set_address_width(context, pt_info.aw); context_set_translation_type(context, translation); context_set_fault_enable(context); context_set_present(context); if (!ecap_coherent(iommu->ecap)) clflush_cache_range(context, sizeof(*context)); - - /* - * It's a non-present to present mapping. If hardware doesn't cache - * non-present entry we only need to flush the write-buffer. If the - * _does_ cache non-present entries, then it does so in the special - * domain #0, which we have to flush: - */ - if (cap_caching_mode(iommu->cap)) { - iommu->flush.flush_context(iommu, 0, - (((u16)bus) << 8) | devfn, - DMA_CCMD_MASK_NOBIT, - DMA_CCMD_DEVICE_INVL); - iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); - } else { - iommu_flush_write_buffer(iommu); - } - + context_present_cache_flush(iommu, did, bus, devfn); ret = 0; out_unlock: @@ -1780,177 +1211,17 @@ domain_context_mapping(struct dmar_domain *domain, struct device *dev) struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; u8 bus = info->bus, devfn = info->devfn; + int ret; if (!dev_is_pci(dev)) return domain_context_mapping_one(domain, iommu, bus, devfn); - return pci_for_each_dma_alias(to_pci_dev(dev), - domain_context_mapping_cb, domain); -} - -/* Return largest possible superpage level for a given mapping */ -static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn, - unsigned long phy_pfn, unsigned long pages) -{ - int support, level = 1; - unsigned long pfnmerge; - - support = domain->iommu_superpage; - - /* To use a large page, the virtual *and* physical addresses - must be aligned to 2MiB/1GiB/etc. Lower bits set in either - of them will mean we have to use smaller pages. So just - merge them and check both at once. */ - pfnmerge = iov_pfn | phy_pfn; - - while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { - pages >>= VTD_STRIDE_SHIFT; - if (!pages) - break; - pfnmerge >>= VTD_STRIDE_SHIFT; - level++; - support--; - } - return level; -} - -/* - * Ensure that old small page tables are removed to make room for superpage(s). - * We're going to add new large pages, so make sure we don't remove their parent - * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared. 
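The level computation in the removed hardware_largepage_caps() rests on one trick worth keeping: OR the IOVA and physical PFNs together so a single alignment test covers both sides. In isolation (a condensed sketch of the removed logic, where 'support' is the domain's superpage level count and 'pages' the mapping length):

	unsigned long pfnmerge = iov_pfn | phy_pfn;
	int level = 1;

	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
		pages >>= VTD_STRIDE_SHIFT;
		if (!pages)	/* not enough pages left for a larger page */
			break;
		pfnmerge >>= VTD_STRIDE_SHIFT;
		level++;
		support--;
	}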
- */ -static void switch_to_super_page(struct dmar_domain *domain, - unsigned long start_pfn, - unsigned long end_pfn, int level) -{ - unsigned long lvl_pages = lvl_to_nr_pages(level); - struct dma_pte *pte = NULL; - - while (start_pfn <= end_pfn) { - if (!pte) - pte = pfn_to_dma_pte(domain, start_pfn, &level, - GFP_ATOMIC); - - if (dma_pte_present(pte)) { - dma_pte_free_pagetable(domain, start_pfn, - start_pfn + lvl_pages - 1, - level + 1); - - cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT, - end_pfn << VTD_PAGE_SHIFT, 0); - } - - pte++; - start_pfn += lvl_pages; - if (first_pte_in_page(pte)) - pte = NULL; - } -} - -static int -__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, - unsigned long phys_pfn, unsigned long nr_pages, int prot, - gfp_t gfp) -{ - struct dma_pte *first_pte = NULL, *pte = NULL; - unsigned int largepage_lvl = 0; - unsigned long lvl_pages = 0; - phys_addr_t pteval; - u64 attr; - - if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1))) - return -EINVAL; - - if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) - return -EINVAL; - - if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) { - pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); - return -EINVAL; - } - - attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); - attr |= DMA_FL_PTE_PRESENT; - if (domain->use_first_level) { - attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; - if (prot & DMA_PTE_WRITE) - attr |= DMA_FL_PTE_DIRTY; - } - - domain->has_mappings = true; - - pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; - - while (nr_pages > 0) { - uint64_t tmp; - - if (!pte) { - largepage_lvl = hardware_largepage_caps(domain, iov_pfn, - phys_pfn, nr_pages); - - pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl, - gfp); - if (!pte) - return -ENOMEM; - first_pte = pte; - - lvl_pages = lvl_to_nr_pages(largepage_lvl); - - /* It is large page*/ - if (largepage_lvl > 1) { - unsigned long end_pfn; - unsigned long pages_to_remove; - - pteval |= DMA_PTE_LARGE_PAGE; - pages_to_remove = min_t(unsigned long, nr_pages, - nr_pte_to_next_page(pte) * lvl_pages); - end_pfn = iov_pfn + pages_to_remove - 1; - switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); - } else { - pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; - } - - } - /* We don't need lock here, nobody else - * touches the iova range - */ - tmp = 0ULL; - if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) { - static int dumps = 5; - pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n", - iov_pfn, tmp, (unsigned long long)pteval); - if (dumps) { - dumps--; - debug_dma_dump_mappings(NULL); - } - WARN_ON(1); - } + ret = pci_for_each_dma_alias(to_pci_dev(dev), + domain_context_mapping_cb, domain); + if (ret) + return ret; - nr_pages -= lvl_pages; - iov_pfn += lvl_pages; - phys_pfn += lvl_pages; - pteval += lvl_pages * VTD_PAGE_SIZE; - - /* If the next PTE would be the first in a new page, then we - * need to flush the cache on the entries we've just written. - * And then we'll need to recalculate 'pte', so clear it and - * let it get set again in the if (!pte) block above. - * - * If we're done (!nr_pages) we need to flush the cache too. - * - * Also if we've been setting superpages, we may need to - * recalculate 'pte' and switch back to smaller pages for the - * end of the mapping, if the trailing size is not enough to - * use another superpage (i.e. nr_pages < lvl_pages). 
- */ - pte++; - if (!nr_pages || first_pte_in_page(pte) || - (largepage_lvl > 1 && nr_pages < lvl_pages)) { - domain_flush_cache(domain, first_pte, - (void *)pte - (void *)first_pte); - pte = NULL; - } - } + iommu_enable_pci_ats(info); return 0; } @@ -1959,7 +1230,7 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 { struct intel_iommu *iommu = info->iommu; struct context_entry *context; - u16 did_old; + u16 did; spin_lock(&iommu->lock); context = iommu_context_addr(iommu, bus, devfn, 0); @@ -1968,138 +1239,74 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 return; } - did_old = context_domain_id(context); - + did = context_domain_id(context); context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); spin_unlock(&iommu->lock); - iommu->flush.flush_context(iommu, - did_old, - (((u16)bus) << 8) | devfn, - DMA_CCMD_MASK_NOBIT, - DMA_CCMD_DEVICE_INVL); - - iommu->flush.flush_iotlb(iommu, - did_old, - 0, - 0, - DMA_TLB_DSI_FLUSH); - - __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); + intel_context_flush_no_pasid(info, context, did); } -static int domain_setup_first_level(struct intel_iommu *iommu, - struct dmar_domain *domain, - struct device *dev, - u32 pasid) +int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev, + ioasid_t pasid, u16 did, phys_addr_t fsptptr, + int flags, struct iommu_domain *old) { - struct dma_pte *pgd = domain->pgd; - int agaw, level; - int flags = 0; - - /* - * Skip top levels of page tables for iommu which has - * less agaw than default. Unnecessary for PT mode. - */ - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - pgd = phys_to_virt(dma_pte_addr(pgd)); - if (!dma_pte_present(pgd)) - return -ENOMEM; - } - - level = agaw_to_level(agaw); - if (level != 4 && level != 5) - return -EINVAL; - - if (level == 5) - flags |= PASID_FLAG_FL5LP; - - if (domain->force_snooping) - flags |= PASID_FLAG_PAGE_SNOOP; - - return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, - domain_id_iommu(domain, iommu), - flags); + if (!old) + return intel_pasid_setup_first_level(iommu, dev, fsptptr, pasid, + did, flags); + return intel_pasid_replace_first_level(iommu, dev, fsptptr, pasid, did, + iommu_domain_did(old, iommu), + flags); } -static bool dev_is_real_dma_subdevice(struct device *dev) +static int domain_setup_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { - return dev && dev_is_pci(dev) && - pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); + if (!old) + return intel_pasid_setup_second_level(iommu, domain, + dev, pasid); + return intel_pasid_replace_second_level(iommu, domain, dev, + iommu_domain_did(old, iommu), + pasid); } -static int iommu_domain_identity_map(struct dmar_domain *domain, - unsigned long first_vpfn, - unsigned long last_vpfn) +static int domain_setup_passthrough(struct intel_iommu *iommu, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { - /* - * RMRR range might have overlap with physical memory range, - * clear it first - */ - dma_pte_clear_range(domain, first_vpfn, last_vpfn); - - return __domain_mapping(domain, first_vpfn, - first_vpfn, last_vpfn - first_vpfn + 1, - DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL); + if (!old) + return intel_pasid_setup_pass_through(iommu, dev, pasid); + return intel_pasid_replace_pass_through(iommu, dev, + iommu_domain_did(old, iommu), + pasid); } -static int md_domain_init(struct 
dmar_domain *domain, int guest_width); - -static int __init si_domain_init(int hw) +static int domain_setup_first_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, + u32 pasid, struct iommu_domain *old) { - struct dmar_rmrr_unit *rmrr; - struct device *dev; - int i, nid, ret; - - si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); - if (!si_domain) - return -EFAULT; + struct pt_iommu_x86_64_hw_info pt_info; + unsigned int flags = 0; - if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { - domain_exit(si_domain); - si_domain = NULL; - return -EFAULT; - } - - if (hw) - return 0; - - for_each_online_node(nid) { - unsigned long start_pfn, end_pfn; - int i; - - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { - ret = iommu_domain_identity_map(si_domain, - mm_to_dma_pfn_start(start_pfn), - mm_to_dma_pfn_end(end_pfn)); - if (ret) - return ret; - } - } + pt_iommu_x86_64_hw_info(&domain->fspt, &pt_info); + if (WARN_ON(pt_info.levels != 4 && pt_info.levels != 5)) + return -EINVAL; - /* - * Identity map the RMRRs so that devices with RMRRs could also use - * the si_domain. - */ - for_each_rmrr_units(rmrr) { - for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, - i, dev) { - unsigned long long start = rmrr->base_address; - unsigned long long end = rmrr->end_address; + if (pt_info.levels == 5) + flags |= PASID_FLAG_FL5LP; - if (WARN_ON(end < start || - end >> agaw_to_width(si_domain->agaw))) - continue; + if (domain->force_snooping) + flags |= PASID_FLAG_PAGE_SNOOP; - ret = iommu_domain_identity_map(si_domain, - mm_to_dma_pfn_start(start >> PAGE_SHIFT), - mm_to_dma_pfn_end(end >> PAGE_SHIFT)); - if (ret) - return ret; - } - } + if (!(domain->fspt.x86_64_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))) + flags |= PASID_FLAG_PWSNP; - return 0; + return __domain_setup_first_level(iommu, dev, pasid, + domain_id_iommu(domain, iommu), + pt_info.gcr3_pt, flags, old); } static int dmar_domain_attach_device(struct dmar_domain *domain, @@ -2115,6 +1322,7 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, return ret; info->domain = domain; + info->domain_attached = true; spin_lock_irqsave(&domain->lock, flags); list_add(&info->link, &domain->devices); spin_unlock_irqrestore(&domain->lock, flags); @@ -2124,19 +1332,18 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, if (!sm_supported(iommu)) ret = domain_context_mapping(domain, dev); - else if (hw_pass_through && domain_type_is_si(domain)) - ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); - else if (domain->use_first_level) - ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID); - else - ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID); + else if (intel_domain_is_fs_paging(domain)) + ret = domain_setup_first_level(iommu, domain, dev, + IOMMU_NO_PASID, NULL); + else if (intel_domain_is_ss_paging(domain)) + ret = domain_setup_second_level(iommu, domain, dev, + IOMMU_NO_PASID, NULL); + else if (WARN_ON(true)) + ret = -EINVAL; if (ret) goto out_block_translation; - if (sm_supported(info->iommu) || !domain_type_is_si(info->domain)) - iommu_enable_pci_caps(info); - ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID); if (ret) goto out_block_translation; @@ -2177,19 +1384,18 @@ static bool device_rmrr_is_relaxable(struct device *dev) return false; } -/* - * Return the required default domain type for a specific device. 
- *
- * @dev: the device in query
- * @startup: true if this is during early boot
- *
- * Returns:
- *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
- *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
- *  - 0: both identity and dynamic domains work for this device
- */
 static int device_def_domain_type(struct device *dev)
 {
+	struct device_domain_info *info = dev_iommu_priv_get(dev);
+	struct intel_iommu *iommu = info->iommu;
+
+	/*
+	 * Hardware does not support the passthrough translation mode.
+	 * Always use a dynamic mapping domain.
+	 */
+	if (!ecap_pass_through(iommu->ecap))
+		return IOMMU_DOMAIN_DMA;
+
 	if (dev_is_pci(dev)) {
 		struct pci_dev *pdev = to_pci_dev(dev);
 
@@ -2287,7 +1493,8 @@ static int copy_context_table(struct intel_iommu *iommu,
 		if (!old_ce)
 			goto out;
 
-		new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
+		new_ce = iommu_alloc_pages_node_sz(iommu->node,
+						   GFP_KERNEL, SZ_4K);
 		if (!new_ce)
 			goto out_unmap;
 
@@ -2302,7 +1509,7 @@ static int copy_context_table(struct intel_iommu *iommu,
 
 		did = context_domain_id(&ce);
 		if (did >= 0 && did < cap_ndoms(iommu->cap))
-			set_bit(did, iommu->domain_ids);
+			ida_alloc_range(&iommu->domain_ida, did, did, GFP_KERNEL);
 
 		set_context_copied(iommu, bus, devfn);
 		new_ce[idx] = ce;
@@ -2410,10 +1617,6 @@ static int __init init_dmars(void)
 	struct intel_iommu *iommu;
 	int ret;
 
-	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
-	if (ret)
-		goto free_iommu;
-
 	for_each_iommu(iommu, drhd) {
 		if (drhd->ignored) {
 			iommu_disable_translation(iommu);
@@ -2433,11 +1636,6 @@ static int __init init_dmars(void)
 		}
 
 		intel_iommu_init_qi(iommu);
-
-		ret = iommu_init_domains(iommu);
-		if (ret)
-			goto free_iommu;
-
 		init_translation_status(iommu);
 
 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
@@ -2480,8 +1678,6 @@ static int __init init_dmars(void)
 			}
 		}
 
-		if (!ecap_pass_through(iommu->ecap))
-			hw_pass_through = 0;
 		intel_svm_check(iommu);
 	}
 
@@ -2497,10 +1693,6 @@ static int __init init_dmars(void)
 
 	check_tylersburg_isoch();
 
-	ret = si_domain_init(hw_pass_through);
-	if (ret)
-		goto free_iommu;
-
 	/*
 	 * for each drhd
 	 *   enable fault log
@@ -2521,19 +1713,18 @@ static int __init init_dmars(void)
 
 		iommu_flush_write_buffer(iommu);
 
-#ifdef CONFIG_INTEL_IOMMU_SVM
-	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
+		if (ecap_prs(iommu->ecap)) {
 			/*
 			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
 			 * could cause possible lock race condition. 
*/ up_write(&dmar_global_lock); - ret = intel_svm_enable_prq(iommu); + ret = intel_iommu_enable_prq(iommu); down_write(&dmar_global_lock); if (ret) goto free_iommu; } -#endif + ret = dmar_set_interrupt(iommu); if (ret) goto free_iommu; @@ -2546,10 +1737,6 @@ free_iommu: disable_dmar_iommu(iommu); free_dmar_iommu(iommu); } - if (si_domain) { - domain_exit(si_domain); - si_domain = NULL; - } return ret; } @@ -2638,7 +1825,7 @@ static void iommu_flush_all(void) } } -static int iommu_suspend(void) +static int iommu_suspend(void *data) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu = NULL; @@ -2665,7 +1852,7 @@ static int iommu_suspend(void) return 0; } -static void iommu_resume(void) +static void iommu_resume(void *data) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu = NULL; @@ -2696,14 +1883,18 @@ static void iommu_resume(void) } } -static struct syscore_ops iommu_syscore_ops = { +static const struct syscore_ops iommu_syscore_ops = { .resume = iommu_resume, .suspend = iommu_suspend, }; +static struct syscore iommu_syscore = { + .ops = &iommu_syscore_ops, +}; + static void __init init_iommu_pm_ops(void) { - register_syscore_ops(&iommu_syscore_ops); + register_syscore(&iommu_syscore); } #else @@ -2917,25 +2108,8 @@ int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) static int intel_iommu_add(struct dmar_drhd_unit *dmaru) { - int sp, ret; struct intel_iommu *iommu = dmaru->iommu; - - ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); - if (ret) - goto out; - - if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { - pr_warn("%s: Doesn't support hardware pass through.\n", - iommu->name); - return -ENXIO; - } - - sp = domain_update_iommu_superpage(NULL, iommu) - 1; - if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { - pr_warn("%s: Doesn't support large page.\n", - iommu->name); - return -ENXIO; - } + int ret; /* * Disable translation if already enabled prior to OS handover. 
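The up_write()/down_write() pair around intel_iommu_enable_prq() in the hunk above exists only because dmar_alloc_hwirq() must not run under dmar_global_lock. A minimal user-space sketch of the same drop-and-retake pattern, assuming a pthread rwlock as a stand-in for dmar_global_lock (enable_prq_stub() and init_one_iommu() are hypothetical names, not driver code):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t global_lock = PTHREAD_RWLOCK_INITIALIZER;

/*
 * Stand-in for intel_iommu_enable_prq()/dmar_alloc_hwirq(): it takes
 * unrelated locks internally, so it must not run under global_lock.
 */
static int enable_prq_stub(void)
{
	return 0;
}

static int init_one_iommu(void)
{
	int ret;

	pthread_rwlock_wrlock(&global_lock);
	/* ... configuration protected by global_lock ... */

	pthread_rwlock_unlock(&global_lock);	/* drop before the nested call */
	ret = enable_prq_stub();
	pthread_rwlock_wrlock(&global_lock);	/* re-take before continuing */

	pthread_rwlock_unlock(&global_lock);
	return ret;
}

int main(void)
{
	printf("init: %d\n", init_one_iommu());
	return 0;
}

The caveat, as in the kernel path, is that anything observed before the lock was dropped may have changed by the time it is re-taken and must be revalidated.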
@@ -2943,9 +2117,7 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) if (iommu->gcmd & DMA_GCMD_TE) iommu_disable_translation(iommu); - ret = iommu_init_domains(iommu); - if (ret == 0) - ret = iommu_alloc_root_entry(iommu); + ret = iommu_alloc_root_entry(iommu); if (ret) goto out; @@ -2963,13 +2135,12 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) intel_iommu_init_qi(iommu); iommu_flush_write_buffer(iommu); -#ifdef CONFIG_INTEL_IOMMU_SVM - if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { - ret = intel_svm_enable_prq(iommu); + if (ecap_prs(iommu->ecap)) { + ret = intel_iommu_enable_prq(iommu); if (ret) goto disable_iommu; } -#endif + ret = dmar_set_interrupt(iommu); if (ret) goto disable_iommu; @@ -3037,7 +2208,6 @@ static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) struct device *tmp; int i; - dev = pci_physfn(dev); rcu_read_lock(); list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { @@ -3054,15 +2224,16 @@ out: return satcu; } -static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) +static bool dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) { - int i, ret = 1; - struct pci_bus *bus; struct pci_dev *bridge = NULL; - struct device *tmp; - struct acpi_dmar_atsr *atsr; struct dmar_atsr_unit *atsru; struct dmar_satc_unit *satcu; + struct acpi_dmar_atsr *atsr; + bool supported = true; + struct pci_bus *bus; + struct device *tmp; + int i; dev = pci_physfn(dev); satcu = dmar_find_matched_satc_unit(dev); @@ -3080,11 +2251,11 @@ static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) bridge = bus->self; /* If it's an integrated device, allow ATS */ if (!bridge) - return 1; + return true; /* Connected via non-PCIe: no ATS */ if (!pci_is_pcie(bridge) || pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) - return 0; + return false; /* If we found the root port, look it up in the ATSR */ if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) break; @@ -3103,11 +2274,11 @@ static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) if (atsru->include_all) goto out; } - ret = 0; + supported = false; out: rcu_read_unlock(); - return ret; + return supported; } int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) @@ -3180,43 +2351,6 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) return 0; } -static int intel_iommu_memory_notifier(struct notifier_block *nb, - unsigned long val, void *v) -{ - struct memory_notify *mhp = v; - unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn); - unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn + - mhp->nr_pages - 1); - - switch (val) { - case MEM_GOING_ONLINE: - if (iommu_domain_identity_map(si_domain, - start_vpfn, last_vpfn)) { - pr_warn("Failed to build identity map for [%lx-%lx]\n", - start_vpfn, last_vpfn); - return NOTIFY_BAD; - } - break; - - case MEM_OFFLINE: - case MEM_CANCEL_ONLINE: - { - LIST_HEAD(freelist); - - domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); - iommu_put_pages_list(&freelist); - } - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block intel_iommu_memory_nb = { - .notifier_call = intel_iommu_memory_notifier, - .priority = 0 -}; - static void intel_disable_iommus(void) { struct intel_iommu *iommu = NULL; @@ -3234,16 +2368,19 @@ void intel_iommu_shutdown(void) if (no_iommu || dmar_disabled) return; - down_write(&dmar_global_lock); + /* + * All other CPUs were brought down, hotplug interrupts were disabled, + * no lock and RCU 
checking needed anymore
+	 */
+	list_for_each_entry(drhd, &dmar_drhd_units, list) {
+		iommu = drhd->iommu;
 
-	/* Disable PMRs explicitly here. */
-	for_each_iommu(iommu, drhd)
+		/* Disable PMRs explicitly here. */
 		iommu_disable_protect_mem_regions(iommu);
 
-	/* Make sure the IOMMUs are switched off */
-	intel_disable_iommus();
-
-	up_write(&dmar_global_lock);
+		/* Make sure the IOMMUs are switched off */
+		iommu_disable_translation(iommu);
+	}
 }
 
 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
@@ -3299,9 +2436,14 @@ static ssize_t domains_used_show(struct device *dev,
 				 struct device_attribute *attr,
 				 char *buf)
 {
 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
-	return sysfs_emit(buf, "%d\n",
-			  bitmap_weight(iommu->domain_ids,
-					cap_ndoms(iommu->cap)));
+	unsigned int count = 0;
+	int id;
+
+	for (id = 0; id < cap_ndoms(iommu->cap); id++)
+		if (ida_exists(&iommu->domain_ida, id))
+			count++;
+
+	return sysfs_emit(buf, "%d\n", count);
 }
 static DEVICE_ATTR_RO(domains_used);
@@ -3376,6 +2518,7 @@ static int __init probe_acpi_namespace_devices(void)
 			if (dev->bus != &acpi_bus_type)
 				continue;
+			up_read(&dmar_global_lock);
 			adev = to_acpi_device(dev);
 			mutex_lock(&adev->physical_node_lock);
 			list_for_each_entry(pn,
@@ -3385,6 +2528,7 @@ static int __init probe_acpi_namespace_devices(void)
 					break;
 			}
 			mutex_unlock(&adev->physical_node_lock);
+			down_read(&dmar_global_lock);
 			if (ret)
 				return ret;
@@ -3502,23 +2646,25 @@ int __init intel_iommu_init(void)
 	 * the virtual and physical IOMMU page-tables.
 	 */
 	if (cap_caching_mode(iommu->cap) &&
-	    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
+	    !first_level_by_default(iommu)) {
 		pr_info_once("IOMMU batching disallowed due to virtualization\n");
 		iommu_set_dma_strict();
 	}
 	iommu_device_sysfs_add(&iommu->iommu, NULL,
 			       intel_iommu_groups,
 			       "%s", iommu->name);
+	/*
+	 * The iommu device probe is protected by the iommu_probe_device_lock.
+	 * Release the dmar_global_lock before entering the device probe path
+	 * to avoid unnecessary lock order splat.
+	 */
+	up_read(&dmar_global_lock);
 	iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
+	down_read(&dmar_global_lock);
 	iommu_pmu_register(iommu);
 	}
-	up_read(&dmar_global_lock);
-
-	if (si_domain && !hw_pass_through)
-		register_memory_notifier(&intel_iommu_memory_nb);
-
 	down_read(&dmar_global_lock);
 	if (probe_acpi_namespace_devices())
 		pr_warn("ACPI name space devices didn't probe correctly\n");
@@ -3559,11 +2705,14 @@ static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *op
  */
 static void domain_context_clear(struct device_domain_info *info)
 {
-	if (!dev_is_pci(info->dev))
+	if (!dev_is_pci(info->dev)) {
 		domain_context_clear_one(info, info->bus, info->devfn);
+		return;
+	}
 
 	pci_for_each_dma_alias(to_pci_dev(info->dev),
 			       &domain_context_clear_one_cb, info);
+	iommu_disable_pci_ats(info);
 }
 
 /*
@@ -3577,7 +2726,13 @@ void device_block_translation(struct device *dev)
 	struct intel_iommu *iommu = info->iommu;
 	unsigned long flags;
 
-	iommu_disable_pci_caps(info);
+	/* Device in DMA blocking state. Nothing to do. */
+	if (!info->domain_attached)
+		return;
+
+	if (info->domain)
+		cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
+
 	if (!dev_is_real_dma_subdevice(dev)) {
 		if (sm_supported(iommu))
 			intel_pasid_tear_down_entry(iommu, dev,
@@ -3586,6 +2741,9 @@ void device_block_translation(struct device *dev)
 			domain_context_clear(info);
 	}
 
+	/* Device now in DMA blocking state. 
*/ + info->domain_attached = false; + if (!info->domain) return; @@ -3593,335 +2751,410 @@ void device_block_translation(struct device *dev) list_del(&info->link); spin_unlock_irqrestore(&info->domain->lock, flags); - cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID); domain_detach_iommu(info->domain, iommu); info->domain = NULL; } -static int md_domain_init(struct dmar_domain *domain, int guest_width) -{ - int adjust_width; - - /* calculate AGAW */ - domain->gaw = guest_width; - adjust_width = guestwidth_to_adjustwidth(guest_width); - domain->agaw = width_to_agaw(adjust_width); - - domain->iommu_coherency = false; - domain->iommu_superpage = 0; - domain->max_addr = 0; - - /* always allocate the top pgd */ - domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC); - if (!domain->pgd) - return -ENOMEM; - domain_flush_cache(domain, domain->pgd, PAGE_SIZE); - return 0; -} - static int blocking_domain_attach_dev(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { + struct device_domain_info *info = dev_iommu_priv_get(dev); + + iopf_for_domain_remove(info->domain ? &info->domain->domain : NULL, dev); device_block_translation(dev); return 0; } +static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old); + static struct iommu_domain blocking_domain = { .type = IOMMU_DOMAIN_BLOCKED, .ops = &(const struct iommu_domain_ops) { .attach_dev = blocking_domain_attach_dev, + .set_dev_pasid = blocking_domain_set_dev_pasid, } }; -static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) +static struct dmar_domain *paging_domain_alloc(void) { - struct dmar_domain *dmar_domain; - struct iommu_domain *domain; + struct dmar_domain *domain; - switch (type) { - case IOMMU_DOMAIN_DMA: - case IOMMU_DOMAIN_UNMANAGED: - dmar_domain = alloc_domain(type); - if (!dmar_domain) { - pr_err("Can't allocate dmar_domain\n"); - return NULL; - } - if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { - pr_err("Domain initialization failed\n"); - domain_exit(dmar_domain); - return NULL; - } + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return ERR_PTR(-ENOMEM); - domain = &dmar_domain->domain; - domain->geometry.aperture_start = 0; - domain->geometry.aperture_end = - __DOMAIN_MAX_ADDR(dmar_domain->gaw); - domain->geometry.force_aperture = true; + INIT_LIST_HEAD(&domain->devices); + INIT_LIST_HEAD(&domain->dev_pasids); + INIT_LIST_HEAD(&domain->cache_tags); + spin_lock_init(&domain->lock); + spin_lock_init(&domain->cache_lock); + xa_init(&domain->iommu_array); + INIT_LIST_HEAD(&domain->s1_domains); + spin_lock_init(&domain->s1_lock); - return domain; - case IOMMU_DOMAIN_IDENTITY: - return &si_domain->domain; - default: - return NULL; + return domain; +} + +static unsigned int compute_vasz_lg2_fs(struct intel_iommu *iommu, + unsigned int *top_level) +{ + unsigned int mgaw = cap_mgaw(iommu->cap); + + /* + * Spec 3.6 First-Stage Translation: + * + * Software must limit addresses to less than the minimum of MGAW + * and the lower canonical address width implied by FSPM (i.e., + * 47-bit when FSPM is 4-level and 56-bit when FSPM is 5-level). 
+ */ + if (mgaw > 48 && cap_fl5lp_support(iommu->cap)) { + *top_level = 4; + return min(57, mgaw); } - return NULL; + /* Four level is always supported */ + *top_level = 3; + return min(48, mgaw); } static struct iommu_domain * -intel_iommu_domain_alloc_user(struct device *dev, u32 flags, - struct iommu_domain *parent, - const struct iommu_user_data *user_data) +intel_iommu_domain_alloc_first_stage(struct device *dev, + struct intel_iommu *iommu, u32 flags) { - struct device_domain_info *info = dev_iommu_priv_get(dev); - bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; - bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; - struct intel_iommu *iommu = info->iommu; + struct pt_iommu_x86_64_cfg cfg = {}; struct dmar_domain *dmar_domain; - struct iommu_domain *domain; - - /* Must be NESTING domain */ - if (parent) { - if (!nested_supported(iommu) || flags) - return ERR_PTR(-EOPNOTSUPP); - return intel_nested_domain_alloc(parent, user_data); - } + int ret; - if (flags & - (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) - return ERR_PTR(-EOPNOTSUPP); - if (nested_parent && !nested_supported(iommu)) + if (flags & ~IOMMU_HWPT_ALLOC_PASID) return ERR_PTR(-EOPNOTSUPP); - if (user_data || (dirty_tracking && !ssads_supported(iommu))) + + /* Only SL is available in legacy mode */ + if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) return ERR_PTR(-EOPNOTSUPP); + dmar_domain = paging_domain_alloc(); + if (IS_ERR(dmar_domain)) + return ERR_CAST(dmar_domain); + + cfg.common.hw_max_vasz_lg2 = + compute_vasz_lg2_fs(iommu, &cfg.top_level); + cfg.common.hw_max_oasz_lg2 = 52; + cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) | + BIT(PT_FEAT_FLUSH_RANGE); + /* First stage always uses scalable mode */ + if (!ecap_smpwc(iommu->ecap)) + cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT); + dmar_domain->iommu.iommu_device = dev; + dmar_domain->iommu.nid = dev_to_node(dev); + dmar_domain->domain.ops = &intel_fs_paging_domain_ops; /* - * domain_alloc_user op needs to fully initialize a domain before - * return, so uses iommu_domain_alloc() here for simple. + * iotlb sync for map is only needed for legacy implementations that + * explicitly require flushing internal write buffers to ensure memory + * coherence. 
*/ - domain = iommu_domain_alloc(dev->bus); - if (!domain) - return ERR_PTR(-ENOMEM); + if (rwbf_required(iommu)) + dmar_domain->iotlb_sync_map = true; - dmar_domain = to_dmar_domain(domain); - - if (nested_parent) { - dmar_domain->nested_parent = true; - INIT_LIST_HEAD(&dmar_domain->s1_domains); - spin_lock_init(&dmar_domain->s1_lock); + ret = pt_iommu_x86_64_init(&dmar_domain->fspt, &cfg, GFP_KERNEL); + if (ret) { + kfree(dmar_domain); + return ERR_PTR(ret); } - if (dirty_tracking) { - if (dmar_domain->use_first_level) { - iommu_domain_free(domain); - return ERR_PTR(-EOPNOTSUPP); - } - domain->dirty_ops = &intel_dirty_ops; - } + if (!cap_fl1gp_support(iommu->cap)) + dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G; + if (!intel_iommu_superpage) + dmar_domain->domain.pgsize_bitmap = SZ_4K; - return domain; + return &dmar_domain->domain; } -static void intel_iommu_domain_free(struct iommu_domain *domain) +static unsigned int compute_vasz_lg2_ss(struct intel_iommu *iommu, + unsigned int *top_level) { - struct dmar_domain *dmar_domain = to_dmar_domain(domain); + unsigned int sagaw = cap_sagaw(iommu->cap); + unsigned int mgaw = cap_mgaw(iommu->cap); - WARN_ON(dmar_domain->nested_parent && - !list_empty(&dmar_domain->s1_domains)); - if (domain != &si_domain->domain) - domain_exit(dmar_domain); + /* + * Find the largest table size that both the mgaw and sagaw support. + * This sets the valid range of IOVA and the top starting level. + * Some HW may only support a 4 or 5 level walk but must limit IOVA to + * 3 levels. + */ + if (mgaw > 48 && sagaw >= BIT(3)) { + *top_level = 4; + return min(57, mgaw); + } else if (mgaw > 39 && sagaw >= BIT(2)) { + *top_level = 3 + ffs(sagaw >> 3); + return min(48, mgaw); + } else if (mgaw > 30 && sagaw >= BIT(1)) { + *top_level = 2 + ffs(sagaw >> 2); + return min(39, mgaw); + } + return 0; } -int prepare_domain_attach_device(struct iommu_domain *domain, - struct device *dev) +static const struct iommu_dirty_ops intel_second_stage_dirty_ops = { + IOMMU_PT_DIRTY_OPS(vtdss), + .set_dirty_tracking = intel_iommu_set_dirty_tracking, +}; + +static struct iommu_domain * +intel_iommu_domain_alloc_second_stage(struct device *dev, + struct intel_iommu *iommu, u32 flags) { - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct intel_iommu *iommu = info->iommu; - int addr_width; + struct pt_iommu_vtdss_cfg cfg = {}; + struct dmar_domain *dmar_domain; + unsigned int sslps; + int ret; - if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) - return -EINVAL; + if (flags & + (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_PASID))) + return ERR_PTR(-EOPNOTSUPP); - if (domain->dirty_ops && !ssads_supported(iommu)) - return -EINVAL; + if (((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && + !nested_supported(iommu)) || + ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && + !ssads_supported(iommu))) + return ERR_PTR(-EOPNOTSUPP); - /* check if this iommu agaw is sufficient for max mapped address */ - addr_width = agaw_to_width(iommu->agaw); - if (addr_width > cap_mgaw(iommu->cap)) - addr_width = cap_mgaw(iommu->cap); + /* Legacy mode always supports second stage */ + if (sm_supported(iommu) && !ecap_slts(iommu->ecap)) + return ERR_PTR(-EOPNOTSUPP); - if (dmar_domain->max_addr > (1LL << addr_width)) - return -EINVAL; - dmar_domain->gaw = addr_width; + dmar_domain = paging_domain_alloc(); + if (IS_ERR(dmar_domain)) + return ERR_CAST(dmar_domain); + + 
cfg.common.hw_max_vasz_lg2 = compute_vasz_lg2_ss(iommu, &cfg.top_level); + cfg.common.hw_max_oasz_lg2 = 52; + cfg.common.features = BIT(PT_FEAT_FLUSH_RANGE); /* - * Knock out extra levels of page tables if necessary + * Read-only mapping is disallowed on the domain which serves as the + * parent in a nested configuration, due to HW errata + * (ERRATA_772415_SPR17) */ - while (iommu->agaw < dmar_domain->agaw) { - struct dma_pte *pte; + if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT) + cfg.common.features |= BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE); - pte = dmar_domain->pgd; - if (dma_pte_present(pte)) { - dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); - iommu_free_page(pte); - } - dmar_domain->agaw--; + if (!iommu_paging_structure_coherency(iommu)) + cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT); + dmar_domain->iommu.iommu_device = dev; + dmar_domain->iommu.nid = dev_to_node(dev); + dmar_domain->domain.ops = &intel_ss_paging_domain_ops; + dmar_domain->nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; + + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + dmar_domain->domain.dirty_ops = &intel_second_stage_dirty_ops; + + ret = pt_iommu_vtdss_init(&dmar_domain->sspt, &cfg, GFP_KERNEL); + if (ret) { + kfree(dmar_domain); + return ERR_PTR(ret); } - if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && - context_copied(iommu, info->bus, info->devfn)) - return intel_pasid_setup_sm_context(dev); + /* Adjust the supported page sizes to HW capability */ + sslps = cap_super_page_val(iommu->cap); + if (!(sslps & BIT(0))) + dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_2M; + if (!(sslps & BIT(1))) + dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G; + if (!intel_iommu_superpage) + dmar_domain->domain.pgsize_bitmap = SZ_4K; - return 0; + /* + * Besides the internal write buffer flush, the caching mode used for + * legacy nested translation (which utilizes shadowing page tables) + * also requires iotlb sync on map. + */ + if (rwbf_required(iommu) || cap_caching_mode(iommu->cap)) + dmar_domain->iotlb_sync_map = true; + + return &dmar_domain->domain; } -static int intel_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) +static struct iommu_domain * +intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, + const struct iommu_user_data *user_data) { struct device_domain_info *info = dev_iommu_priv_get(dev); - int ret; - - if (info->domain) - device_block_translation(dev); + struct intel_iommu *iommu = info->iommu; + struct iommu_domain *domain; - ret = prepare_domain_attach_device(domain, dev); - if (ret) - return ret; + if (user_data) + return ERR_PTR(-EOPNOTSUPP); - return dmar_domain_attach_device(to_dmar_domain(domain), dev); + /* Prefer first stage if possible by default. 
*/ + domain = intel_iommu_domain_alloc_first_stage(dev, iommu, flags); + if (domain != ERR_PTR(-EOPNOTSUPP)) + return domain; + return intel_iommu_domain_alloc_second_stage(dev, iommu, flags); } -static int intel_iommu_map(struct iommu_domain *domain, - unsigned long iova, phys_addr_t hpa, - size_t size, int iommu_prot, gfp_t gfp) +static void intel_iommu_domain_free(struct iommu_domain *domain) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); - u64 max_addr; - int prot = 0; - - if (iommu_prot & IOMMU_READ) - prot |= DMA_PTE_READ; - if (iommu_prot & IOMMU_WRITE) - prot |= DMA_PTE_WRITE; - if (dmar_domain->set_pte_snp) - prot |= DMA_PTE_SNP; - - max_addr = iova + size; - if (dmar_domain->max_addr < max_addr) { - u64 end; - - /* check if minimum agaw is sufficient for mapped address */ - end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1; - if (end < max_addr) { - pr_err("%s: iommu width (%d) is not " - "sufficient for the mapped address (%llx)\n", - __func__, dmar_domain->gaw, max_addr); - return -EFAULT; - } - dmar_domain->max_addr = max_addr; - } - /* Round up size to next multiple of PAGE_SIZE, if it and - the low bits of hpa would take us onto the next page */ - size = aligned_nrpages(hpa, size); - return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, - hpa >> VTD_PAGE_SHIFT, size, prot, gfp); + + if (WARN_ON(dmar_domain->nested_parent && + !list_empty(&dmar_domain->s1_domains))) + return; + + if (WARN_ON(!list_empty(&dmar_domain->devices))) + return; + + pt_iommu_deinit(&dmar_domain->iommu); + + kfree(dmar_domain->qi_batch); + kfree(dmar_domain); } -static int intel_iommu_map_pages(struct iommu_domain *domain, - unsigned long iova, phys_addr_t paddr, - size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped) +static int paging_domain_compatible_first_stage(struct dmar_domain *dmar_domain, + struct intel_iommu *iommu) { - unsigned long pgshift = __ffs(pgsize); - size_t size = pgcount << pgshift; - int ret; + if (WARN_ON(dmar_domain->domain.dirty_ops || + dmar_domain->nested_parent)) + return -EINVAL; - if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) + /* Only SL is available in legacy mode */ + if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) return -EINVAL; - if (!IS_ALIGNED(iova | paddr, pgsize)) + if (!ecap_smpwc(iommu->ecap) && + !(dmar_domain->fspt.x86_64_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))) return -EINVAL; - ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); - if (!ret && mapped) - *mapped = size; + /* Supports the number of table levels */ + if (!cap_fl5lp_support(iommu->cap) && + dmar_domain->fspt.x86_64_pt.common.max_vasz_lg2 > 48) + return -EINVAL; - return ret; + /* Same page size support */ + if (!cap_fl1gp_support(iommu->cap) && + (dmar_domain->domain.pgsize_bitmap & SZ_1G)) + return -EINVAL; + + /* iotlb sync on map requirement */ + if ((rwbf_required(iommu)) && !dmar_domain->iotlb_sync_map) + return -EINVAL; + + return 0; } -static size_t intel_iommu_unmap(struct iommu_domain *domain, - unsigned long iova, size_t size, - struct iommu_iotlb_gather *gather) +static int +paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain, + struct intel_iommu *iommu) { - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - unsigned long start_pfn, last_pfn; - int level = 0; + unsigned int vasz_lg2 = dmar_domain->sspt.vtdss_pt.common.max_vasz_lg2; + unsigned int sslps = cap_super_page_val(iommu->cap); + struct pt_iommu_vtdss_hw_info pt_info; - /* Cope with horrid API which requires us to unmap more 
than the - size argument if it happens to be a large-page mapping. */ - if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, - &level, GFP_ATOMIC))) - return 0; + pt_iommu_vtdss_hw_info(&dmar_domain->sspt, &pt_info); - if (size < VTD_PAGE_SIZE << level_to_offset_bits(level)) - size = VTD_PAGE_SIZE << level_to_offset_bits(level); + if (dmar_domain->domain.dirty_ops && !ssads_supported(iommu)) + return -EINVAL; + if (dmar_domain->nested_parent && !nested_supported(iommu)) + return -EINVAL; + + /* Legacy mode always supports second stage */ + if (sm_supported(iommu) && !ecap_slts(iommu->ecap)) + return -EINVAL; - start_pfn = iova >> VTD_PAGE_SHIFT; - last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT; + if (!iommu_paging_structure_coherency(iommu) && + !(dmar_domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))) + return -EINVAL; - domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist); + /* Address width falls within the capability */ + if (cap_mgaw(iommu->cap) < vasz_lg2) + return -EINVAL; - if (dmar_domain->max_addr == iova + size) - dmar_domain->max_addr = iova; + /* Page table level is supported. */ + if (!(cap_sagaw(iommu->cap) & BIT(pt_info.aw))) + return -EINVAL; + + /* Same page size support */ + if (!(sslps & BIT(0)) && (dmar_domain->domain.pgsize_bitmap & SZ_2M)) + return -EINVAL; + if (!(sslps & BIT(1)) && (dmar_domain->domain.pgsize_bitmap & SZ_1G)) + return -EINVAL; + + /* iotlb sync on map requirement */ + if ((rwbf_required(iommu) || cap_caching_mode(iommu->cap)) && + !dmar_domain->iotlb_sync_map) + return -EINVAL; /* - * We do not use page-selective IOTLB invalidation in flush queue, - * so there is no need to track page and sync iotlb. + * FIXME this is locked wrong, it needs to be under the + * dmar_domain->lock */ - if (!iommu_iotlb_gather_queued(gather)) - iommu_iotlb_gather_add_page(domain, gather, iova, size); + if ((dmar_domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_VTDSS_FORCE_COHERENCE)) && + !ecap_sc_support(iommu->ecap)) + return -EINVAL; + return 0; +} - return size; +int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + int ret = -EINVAL; + + if (intel_domain_is_fs_paging(dmar_domain)) + ret = paging_domain_compatible_first_stage(dmar_domain, iommu); + else if (intel_domain_is_ss_paging(dmar_domain)) + ret = paging_domain_compatible_second_stage(dmar_domain, iommu); + else if (WARN_ON(true)) + ret = -EINVAL; + if (ret) + return ret; + + if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && + context_copied(iommu, info->bus, info->devfn)) + return intel_pasid_setup_sm_context(dev); + + return 0; } -static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, - unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) +static int intel_iommu_attach_device(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) { - unsigned long pgshift = __ffs(pgsize); - size_t size = pgcount << pgshift; + int ret; + + device_block_translation(dev); - return intel_iommu_unmap(domain, iova, size, gather); + ret = paging_domain_compatible(domain, dev); + if (ret) + return ret; + + ret = iopf_for_domain_set(domain, dev); + if (ret) + return ret; + + ret = dmar_domain_attach_device(to_dmar_domain(domain), dev); + if (ret) + iopf_for_domain_remove(domain, dev); + + return ret; } 
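The compute_vasz_lg2_ss() helper in the hunks above picks the deepest second-stage walk that SAGAW advertises while capping the usable IOVA width at MGAW; as its comment notes, hardware may support a deeper walk than MGAW can fill, in which case the walk depth still comes from SAGAW while the width comes from MGAW. A standalone sketch of the same decode, assuming the usual SAGAW encoding (BIT(1)/BIT(2)/BIT(3) advertise 3-/4-/5-level walks); vasz_lg2_ss() here is a hypothetical user-space mirror, not the driver function:

#include <stdio.h>
#include <strings.h>	/* ffs() */

#define BIT(n)	(1u << (n))

/* Mirrors compute_vasz_lg2_ss(): returns the usable IOVA width in bits
 * and reports the starting (top) table level, 0-indexed from the leaf,
 * so *top_level == 3 means a 4-level walk. */
static unsigned int vasz_lg2_ss(unsigned int mgaw, unsigned int sagaw,
				unsigned int *top_level)
{
	if (mgaw > 48 && sagaw >= BIT(3)) {
		*top_level = 4;				/* 5-level walk */
		return mgaw < 57 ? mgaw : 57;
	} else if (mgaw > 39 && sagaw >= BIT(2)) {
		*top_level = 3 + ffs(sagaw >> 3);	/* 4- or 5-level walk */
		return mgaw < 48 ? mgaw : 48;
	} else if (mgaw > 30 && sagaw >= BIT(1)) {
		*top_level = 2 + ffs(sagaw >> 2);	/* 3- or 4-level walk */
		return mgaw < 39 ? mgaw : 39;
	}
	return 0;					/* no usable geometry */
}

int main(void)
{
	unsigned int top, width;

	/* MGAW=48 with 3- and 4-level support: 4-level walk, 48-bit IOVA */
	width = vasz_lg2_ss(48, BIT(1) | BIT(2), &top);
	printf("width=%u top_level=%u\n", width, top);

	/* MGAW=39 with 4-level-only support: 4-level walk, 39-bit IOVA */
	width = vasz_lg2_ss(39, BIT(2), &top);
	printf("width=%u top_level=%u\n", width, top);
	return 0;
}

The second case is the one the driver comment calls out: the hardware can only do a 4-level walk, but MGAW confines the IOVA space to the 3-level (39-bit) range.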
static void intel_iommu_tlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *gather) { cache_tag_flush_range(to_dmar_domain(domain), gather->start, - gather->end, list_empty(&gather->freelist)); + gather->end, + iommu_pages_list_empty(&gather->freelist)); iommu_put_pages_list(&gather->freelist); } -static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, - dma_addr_t iova) -{ - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct dma_pte *pte; - int level = 0; - u64 phys = 0; - - pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level, - GFP_ATOMIC); - if (pte && dma_pte_present(pte)) - phys = dma_pte_addr(pte) + - (iova & (BIT_MASK(level_to_offset_bits(level) + - VTD_PAGE_SHIFT) - 1)); - - return phys; -} - static bool domain_support_force_snooping(struct dmar_domain *domain) { struct device_domain_info *info; @@ -3938,44 +3171,41 @@ static bool domain_support_force_snooping(struct dmar_domain *domain) return support; } -static void domain_set_force_snooping(struct dmar_domain *domain) +static bool intel_iommu_enforce_cache_coherency_fs(struct iommu_domain *domain) { + struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct device_domain_info *info; - assert_spin_locked(&domain->lock); - /* - * Second level page table supports per-PTE snoop control. The - * iommu_map() interface will handle this by setting SNP bit. - */ - if (!domain->use_first_level) { - domain->set_pte_snp = true; - return; - } + guard(spinlock_irqsave)(&dmar_domain->lock); + + if (dmar_domain->force_snooping) + return true; - list_for_each_entry(info, &domain->devices, link) + if (!domain_support_force_snooping(dmar_domain)) + return false; + + dmar_domain->force_snooping = true; + list_for_each_entry(info, &dmar_domain->devices, link) intel_pasid_setup_page_snoop_control(info->iommu, info->dev, IOMMU_NO_PASID); + return true; } -static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) +static bool intel_iommu_enforce_cache_coherency_ss(struct iommu_domain *domain) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); - unsigned long flags; - - if (dmar_domain->force_snooping) - return true; - spin_lock_irqsave(&dmar_domain->lock, flags); - if (!domain_support_force_snooping(dmar_domain) || - (!dmar_domain->use_first_level && dmar_domain->has_mappings)) { - spin_unlock_irqrestore(&dmar_domain->lock, flags); + guard(spinlock_irqsave)(&dmar_domain->lock); + if (!domain_support_force_snooping(dmar_domain)) return false; - } - domain_set_force_snooping(dmar_domain); + /* + * Second level page table supports per-PTE snoop control. The + * iommu_map() interface will handle this by setting SNP bit. 
+ */ + dmar_domain->sspt.vtdss_pt.common.features |= + BIT(PT_FEAT_VTDSS_FORCE_COHERENCE); dmar_domain->force_snooping = true; - spin_unlock_irqrestore(&dmar_domain->lock, flags); - return true; } @@ -4053,13 +3283,14 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) } if (info->ats_supported && ecap_prs(iommu->ecap) && - pci_pri_supported(pdev)) + ecap_pds(iommu->ecap) && pci_pri_supported(pdev)) info->pri_supported = 1; } } dev_iommu_priv_set(dev, info); if (pdev && pci_ats_supported(pdev)) { + pci_prepare_ats(pdev, VTD_PAGE_SHIFT); ret = device_rbtree_insert(iommu, info); if (ret) goto free; @@ -4092,11 +3323,48 @@ free: return ERR_PTR(ret); } +static void intel_iommu_probe_finalize(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + + /* + * The PCIe spec, in its wisdom, declares that the behaviour of the + * device is undefined if you enable PASID support after ATS support. + * So always enable PASID support on devices which have it, even if + * we can't yet know if we're ever going to use it. + */ + if (info->pasid_supported && + !pci_enable_pasid(to_pci_dev(dev), info->pasid_supported & ~1)) + info->pasid_enabled = 1; + + if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { + iommu_enable_pci_ats(info); + /* Assign a DEVTLB cache tag to the default domain. */ + if (info->ats_enabled && info->domain) { + u16 did = domain_id_iommu(info->domain, iommu); + + if (cache_tag_assign(info->domain, did, dev, + IOMMU_NO_PASID, CACHE_TAG_DEVTLB)) + iommu_disable_pci_ats(info); + } + } + iommu_enable_pci_pri(info); +} + static void intel_iommu_release_device(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; + iommu_disable_pci_pri(info); + iommu_disable_pci_ats(info); + + if (info->pasid_enabled) { + pci_disable_pasid(to_pci_dev(dev)); + info->pasid_enabled = 0; + } + mutex_lock(&iommu->iopf_lock); if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev))) device_rbtree_remove(info); @@ -4109,7 +3377,6 @@ static void intel_iommu_release_device(struct device *dev) intel_pasid_free_table(dev); intel_iommu_debugfs_remove_dev(info); kfree(info); - set_dma_ops(dev, NULL); } static void intel_iommu_get_resv_regions(struct device *device, @@ -4178,132 +3445,44 @@ static struct iommu_group *intel_iommu_device_group(struct device *dev) return generic_device_group(dev); } -static int intel_iommu_enable_sva(struct device *dev) -{ - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu; - - if (!info || dmar_disabled) - return -EINVAL; - - iommu = info->iommu; - if (!iommu) - return -EINVAL; - - if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) - return -ENODEV; - - if (!info->pasid_enabled || !info->ats_enabled) - return -EINVAL; - - /* - * Devices having device-specific I/O fault handling should not - * support PCI/PRI. The IOMMU side has no means to check the - * capability of device-specific IOPF. Therefore, IOMMU can only - * default that if the device driver enables SVA on a non-PRI - * device, it will handle IOPF in its own way. - */ - if (!info->pri_supported) - return 0; - - /* Devices supporting PRI should have it enabled. */ - if (!info->pri_enabled) - return -EINVAL; - - return 0; -} - -static int intel_iommu_enable_iopf(struct device *dev) +int intel_iommu_enable_iopf(struct device *dev) { - struct pci_dev *pdev = dev_is_pci(dev) ? 
to_pci_dev(dev) : NULL; struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu; + struct intel_iommu *iommu = info->iommu; int ret; - if (!pdev || !info || !info->ats_enabled || !info->pri_supported) + if (!info->pri_enabled) return -ENODEV; - if (info->pri_enabled) - return -EBUSY; - - iommu = info->iommu; - if (!iommu) - return -EINVAL; - - /* PASID is required in PRG Response Message. */ - if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) - return -EINVAL; - - ret = pci_reset_pri(pdev); - if (ret) - return ret; + /* pri_enabled is protected by the group mutex. */ + iommu_group_mutex_assert(dev); + if (info->iopf_refcount) { + info->iopf_refcount++; + return 0; + } ret = iopf_queue_add_device(iommu->iopf_queue, dev); if (ret) return ret; - ret = pci_enable_pri(pdev, PRQ_DEPTH); - if (ret) { - iopf_queue_remove_device(iommu->iopf_queue, dev); - return ret; - } - - info->pri_enabled = 1; + info->iopf_refcount = 1; return 0; } -static int intel_iommu_disable_iopf(struct device *dev) +void intel_iommu_disable_iopf(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; - if (!info->pri_enabled) - return -EINVAL; - - /* - * PCIe spec states that by clearing PRI enable bit, the Page - * Request Interface will not issue new page requests, but has - * outstanding page requests that have been transmitted or are - * queued for transmission. This is supposed to be called after - * the device driver has stopped DMA, all PASIDs have been - * unbound and the outstanding PRQs have been drained. - */ - pci_disable_pri(to_pci_dev(dev)); - info->pri_enabled = 0; - iopf_queue_remove_device(iommu->iopf_queue, dev); - - return 0; -} - -static int -intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) -{ - switch (feat) { - case IOMMU_DEV_FEAT_IOPF: - return intel_iommu_enable_iopf(dev); - - case IOMMU_DEV_FEAT_SVA: - return intel_iommu_enable_sva(dev); - - default: - return -ENODEV; - } -} - -static int -intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) -{ - switch (feat) { - case IOMMU_DEV_FEAT_IOPF: - return intel_iommu_disable_iopf(dev); + if (WARN_ON(!info->pri_enabled || !info->iopf_refcount)) + return; - case IOMMU_DEV_FEAT_SVA: - return 0; + iommu_group_mutex_assert(dev); + if (--info->iopf_refcount) + return; - default: - return -ENODEV; - } + iopf_queue_remove_device(iommu->iopf_queue, dev); } static bool intel_iommu_is_attach_deferred(struct device *dev) @@ -4333,20 +3512,31 @@ static bool risky_device(struct pci_dev *pdev) static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) { - cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + + if (dmar_domain->iotlb_sync_map) + cache_tag_flush_range_np(dmar_domain, iova, iova + size - 1); return 0; } -static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, - struct iommu_domain *domain) +void domain_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) { struct device_domain_info *info = dev_iommu_priv_get(dev); - struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct dev_pasid_info *curr, *dev_pasid = NULL; struct intel_iommu *iommu = info->iommu; + struct dmar_domain *dmar_domain; unsigned long flags; + if (!domain) + return; + + /* Identity domain has no meta data for pasid. 
*/ + if (domain->type == IOMMU_DOMAIN_IDENTITY) + return; + + dmar_domain = to_dmar_domain(domain); spin_lock_irqsave(&dmar_domain->lock, flags); list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { if (curr->dev == dev && curr->pasid == pasid) { @@ -4355,27 +3545,79 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, break; } } - WARN_ON_ONCE(!dev_pasid); spin_unlock_irqrestore(&dmar_domain->lock, flags); cache_tag_unassign_domain(dmar_domain, dev, pasid); domain_detach_iommu(dmar_domain, iommu); - intel_iommu_debugfs_remove_dev_pasid(dev_pasid); + if (!WARN_ON_ONCE(!dev_pasid)) { + intel_iommu_debugfs_remove_dev_pasid(dev_pasid); + kfree(dev_pasid); + } +} + +static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + intel_pasid_tear_down_entry(info->iommu, dev, pasid, false); + iopf_for_domain_remove(old, dev); + domain_remove_dev_pasid(old, dev, pasid); + + return 0; +} + +struct dev_pasid_info * +domain_add_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + struct dev_pasid_info *dev_pasid; + unsigned long flags; + int ret; + + dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); + if (!dev_pasid) + return ERR_PTR(-ENOMEM); + + ret = domain_attach_iommu(dmar_domain, iommu); + if (ret) + goto out_free; + + ret = cache_tag_assign_domain(dmar_domain, dev, pasid); + if (ret) + goto out_detach_iommu; + + dev_pasid->dev = dev; + dev_pasid->pasid = pasid; + spin_lock_irqsave(&dmar_domain->lock, flags); + list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); + spin_unlock_irqrestore(&dmar_domain->lock, flags); + + return dev_pasid; +out_detach_iommu: + domain_detach_iommu(dmar_domain, iommu); +out_free: kfree(dev_pasid); - intel_pasid_tear_down_entry(iommu, dev, pasid, false); - intel_drain_pasid_prq(dev, pasid); + return ERR_PTR(ret); } static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; struct dev_pasid_info *dev_pasid; - unsigned long flags; int ret; + if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING))) + return -EINVAL; + if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) return -EOPNOTSUPP; @@ -4385,58 +3627,54 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, if (context_copied(iommu, info->bus, info->devfn)) return -EBUSY; - ret = prepare_domain_attach_device(domain, dev); + ret = paging_domain_compatible(domain, dev); if (ret) return ret; - dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); - if (!dev_pasid) - return -ENOMEM; + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); + if (IS_ERR(dev_pasid)) + return PTR_ERR(dev_pasid); - ret = domain_attach_iommu(dmar_domain, iommu); + ret = iopf_for_domain_replace(domain, old, dev); if (ret) - goto out_free; + goto out_remove_dev_pasid; - ret = cache_tag_assign_domain(dmar_domain, dev, pasid); - if (ret) - goto out_detach_iommu; - - if (domain_type_is_si(dmar_domain)) - ret = intel_pasid_setup_pass_through(iommu, dev, pasid); - else if 
(dmar_domain->use_first_level) + if (intel_domain_is_fs_paging(dmar_domain)) ret = domain_setup_first_level(iommu, dmar_domain, - dev, pasid); - else - ret = intel_pasid_setup_second_level(iommu, dmar_domain, - dev, pasid); + dev, pasid, old); + else if (intel_domain_is_ss_paging(dmar_domain)) + ret = domain_setup_second_level(iommu, dmar_domain, + dev, pasid, old); + else if (WARN_ON(true)) + ret = -EINVAL; + if (ret) - goto out_unassign_tag; + goto out_unwind_iopf; - dev_pasid->dev = dev; - dev_pasid->pasid = pasid; - spin_lock_irqsave(&dmar_domain->lock, flags); - list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); - spin_unlock_irqrestore(&dmar_domain->lock, flags); + domain_remove_dev_pasid(old, dev, pasid); - if (domain->type & __IOMMU_DOMAIN_PAGING) - intel_iommu_debugfs_create_dev_pasid(dev_pasid); + intel_iommu_debugfs_create_dev_pasid(dev_pasid); return 0; -out_unassign_tag: - cache_tag_unassign_domain(dmar_domain, dev, pasid); -out_detach_iommu: - domain_detach_iommu(dmar_domain, iommu); -out_free: - kfree(dev_pasid); + +out_unwind_iopf: + iopf_for_domain_replace(old, domain, dev); +out_remove_dev_pasid: + domain_remove_dev_pasid(domain, dev, pasid); return ret; } -static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) +static void *intel_iommu_hw_info(struct device *dev, u32 *length, + enum iommu_hw_info_type *type) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; struct iommu_hw_info_vtd *vtd; + if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && + *type != IOMMU_HW_INFO_TYPE_INTEL_VTD) + return ERR_PTR(-EOPNOTSUPP); + vtd = kzalloc(sizeof(*vtd), GFP_KERNEL); if (!vtd) return ERR_PTR(-ENOMEM); @@ -4530,82 +3768,163 @@ err_unwind: return ret; } -static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain, - unsigned long iova, size_t size, - unsigned long flags, - struct iommu_dirty_bitmap *dirty) +static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn) { - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - unsigned long end = iova + size - 1; - unsigned long pgsize; + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct context_entry *context; + + spin_lock(&iommu->lock); + context = iommu_context_addr(iommu, bus, devfn, 1); + if (!context) { + spin_unlock(&iommu->lock); + return -ENOMEM; + } + + if (context_present(context) && !context_copied(iommu, bus, devfn)) { + spin_unlock(&iommu->lock); + return 0; + } + + copied_context_tear_down(iommu, context, bus, devfn); + context_clear_entry(context); + context_set_domain_id(context, FLPT_DEFAULT_DID); /* - * IOMMUFD core calls into a dirty tracking disabled domain without an - * IOVA bitmap set in order to clean dirty bits in all PTEs that might - * have occurred when we stopped dirty tracking. This ensures that we - * never inherit dirtied bits from a previous cycle. + * In pass through mode, AW must be programmed to indicate the largest + * AGAW value supported by hardware. And ASR is ignored by hardware. 
*/ - if (!dmar_domain->dirty_tracking && dirty->bitmap) - return -EINVAL; + context_set_address_width(context, iommu->msagaw); + context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH); + context_set_fault_enable(context); + context_set_present(context); + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(context, sizeof(*context)); + context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn); + spin_unlock(&iommu->lock); + + return 0; +} - do { - struct dma_pte *pte; - int lvl = 0; +static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data) +{ + struct device *dev = data; - pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl, - GFP_ATOMIC); - pgsize = level_size(lvl) << VTD_PAGE_SHIFT; - if (!pte || !dma_pte_present(pte)) { - iova += pgsize; - continue; - } + return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff); +} + +static int device_setup_pass_through(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); - if (dma_sl_pte_test_and_clear_dirty(pte, flags)) - iommu_dirty_bitmap_record(dirty, iova, pgsize); - iova += pgsize; - } while (iova < end); + if (!dev_is_pci(dev)) + return context_setup_pass_through(dev, info->bus, info->devfn); + + return pci_for_each_dma_alias(to_pci_dev(dev), + context_setup_pass_through_cb, dev); +} +static int identity_domain_attach_dev(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + int ret; + + device_block_translation(dev); + + if (dev_is_real_dma_subdevice(dev)) + return 0; + + /* + * No PRI support with the global identity domain. No need to enable or + * disable PRI in this path as the iommu has been put in the blocking + * state. 
+ */ + if (sm_supported(iommu)) + ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); + else + ret = device_setup_pass_through(dev); + + if (!ret) + info->domain_attached = true; + + return ret; +} + +static int identity_domain_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + int ret; + + if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) + return -EOPNOTSUPP; + + ret = iopf_for_domain_replace(domain, old, dev); + if (ret) + return ret; + + ret = domain_setup_passthrough(iommu, dev, pasid, old); + if (ret) { + iopf_for_domain_replace(old, domain, dev); + return ret; + } + + domain_remove_dev_pasid(old, dev, pasid); return 0; } -static const struct iommu_dirty_ops intel_dirty_ops = { - .set_dirty_tracking = intel_iommu_set_dirty_tracking, - .read_and_clear_dirty = intel_iommu_read_and_clear_dirty, +static struct iommu_domain identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = identity_domain_attach_dev, + .set_dev_pasid = identity_domain_set_dev_pasid, + }, +}; + +const struct iommu_domain_ops intel_fs_paging_domain_ops = { + IOMMU_PT_DOMAIN_OPS(x86_64), + .attach_dev = intel_iommu_attach_device, + .set_dev_pasid = intel_iommu_set_dev_pasid, + .iotlb_sync_map = intel_iommu_iotlb_sync_map, + .flush_iotlb_all = intel_flush_iotlb_all, + .iotlb_sync = intel_iommu_tlb_sync, + .free = intel_iommu_domain_free, + .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_fs, +}; + +const struct iommu_domain_ops intel_ss_paging_domain_ops = { + IOMMU_PT_DOMAIN_OPS(vtdss), + .attach_dev = intel_iommu_attach_device, + .set_dev_pasid = intel_iommu_set_dev_pasid, + .iotlb_sync_map = intel_iommu_iotlb_sync_map, + .flush_iotlb_all = intel_flush_iotlb_all, + .iotlb_sync = intel_iommu_tlb_sync, + .free = intel_iommu_domain_free, + .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_ss, }; const struct iommu_ops intel_iommu_ops = { .blocked_domain = &blocking_domain, .release_domain = &blocking_domain, + .identity_domain = &identity_domain, .capable = intel_iommu_capable, .hw_info = intel_iommu_hw_info, - .domain_alloc = intel_iommu_domain_alloc, - .domain_alloc_user = intel_iommu_domain_alloc_user, + .domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags, .domain_alloc_sva = intel_svm_domain_alloc, + .domain_alloc_nested = intel_iommu_domain_alloc_nested, .probe_device = intel_iommu_probe_device, + .probe_finalize = intel_iommu_probe_finalize, .release_device = intel_iommu_release_device, .get_resv_regions = intel_iommu_get_resv_regions, .device_group = intel_iommu_device_group, - .dev_enable_feat = intel_iommu_dev_enable_feat, - .dev_disable_feat = intel_iommu_dev_disable_feat, .is_attach_deferred = intel_iommu_is_attach_deferred, .def_domain_type = device_def_domain_type, - .remove_dev_pasid = intel_iommu_remove_dev_pasid, - .pgsize_bitmap = SZ_4K, -#ifdef CONFIG_INTEL_IOMMU_SVM - .page_response = intel_svm_page_response, -#endif - .default_domain_ops = &(const struct iommu_domain_ops) { - .attach_dev = intel_iommu_attach_device, - .set_dev_pasid = intel_iommu_set_dev_pasid, - .map_pages = intel_iommu_map_pages, - .unmap_pages = intel_iommu_unmap_pages, - .iotlb_sync_map = intel_iommu_iotlb_sync_map, - .flush_iotlb_all = intel_flush_iotlb_all, - .iotlb_sync = intel_iommu_tlb_sync, - .iova_to_phys = intel_iommu_iova_to_phys, - 
.free = intel_iommu_domain_free, - .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, - } + .page_response = intel_iommu_page_response, }; static void quirk_iommu_igfx(struct pci_dev *dev) @@ -4626,6 +3945,9 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); +/* QM57/QS57 integrated gfx malfunctions with dmar */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_iommu_igfx); + /* Broadwell igfx malfunctions with dmar */ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); @@ -4703,7 +4025,6 @@ static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) } } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); @@ -4897,3 +4218,5 @@ err: return ret; } + +MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index eaf015b4353b..25c5e22096d4 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -22,8 +22,9 @@ #include <linux/bitfield.h> #include <linux/xarray.h> #include <linux/perf_event.h> +#include <linux/pci.h> +#include <linux/generic_pt/iommu.h> -#include <asm/cacheflush.h> #include <asm/iommu.h> #include <uapi/linux/iommufd.h> @@ -49,7 +50,6 @@ #define DMA_FL_PTE_US BIT_ULL(2) #define DMA_FL_PTE_ACCESS BIT_ULL(5) #define DMA_FL_PTE_DIRTY BIT_ULL(6) -#define DMA_FL_PTE_XD BIT_ULL(63) #define DMA_SL_PTE_DIRTY_BIT 9 #define DMA_SL_PTE_DIRTY BIT_ULL(DMA_SL_PTE_DIRTY_BIT) @@ -77,7 +77,6 @@ #define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data register */ #define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr register */ #define DMAR_FEUADDR_REG 0x44 /* Upper address register */ -#define DMAR_AFLOG_REG 0x58 /* Advanced Fault control */ #define DMAR_PMEN_REG 0x64 /* Enable Protected Memory Region */ #define DMAR_PLMBASE_REG 0x68 /* PMRR Low addr */ #define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */ @@ -173,8 +172,6 @@ #define cap_pgsel_inv(c) (((c) >> 39) & 1) #define cap_super_page_val(c) (((c) >> 34) & 0xf) -#define cap_super_offset(c) (((find_first_bit(&cap_super_page_val(c), 4)) \ - * OFFSET_STRIDE) + 21) #define cap_fault_reg_offset(c) ((((c) >> 24) & 0x3ff) * 16) #define cap_max_fault_reg_offset(c) \ @@ -462,7 +459,6 @@ enum { #define QI_PGRP_PASID(pasid) (((u64)(pasid)) << 32) /* Page group response descriptor QW1 */ -#define QI_PGRP_LPIG(x) (((u64)(x)) << 2) #define QI_PGRP_IDX(idx) (((u64)(idx)) << 3) @@ -493,14 +489,13 @@ struct q_inval { /* Page Request Queue depth */ #define PRQ_ORDER 4 -#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20) -#define PRQ_DEPTH ((0x1000 << PRQ_ORDER) >> 5) +#define PRQ_SIZE (SZ_4K << PRQ_ORDER) +#define PRQ_RING_MASK (PRQ_SIZE - 0x20) +#define PRQ_DEPTH (PRQ_SIZE >> 5) struct dmar_pci_notify_info; #ifdef CONFIG_IRQ_REMAP -/* 1MB - maximum possible interrupt remapping table size */ -#define INTR_REMAP_PAGE_ORDER 8 #define INTR_REMAP_TABLE_REG_SIZE 0xf #define INTR_REMAP_TABLE_REG_SIZE_MASK 0xf @@ -542,7 +537,8 @@ enum { #define pasid_supported(iommu) (sm_supported(iommu) && \ 
ecap_pasid((iommu)->ecap)) #define ssads_supported(iommu) (sm_supported(iommu) && \ - ecap_slads((iommu)->ecap)) + ecap_slads((iommu)->ecap) && \ + ecap_smpwc(iommu->ecap)) #define nested_supported(iommu) (sm_supported(iommu) && \ ecap_nest((iommu)->ecap)) @@ -585,23 +581,36 @@ struct iommu_domain_info { * to VT-d spec, section 9.3 */ }; +/* + * We start simply by using a fixed size for the batched descriptors. This + * size is currently sufficient for our needs. Future improvements could + * involve dynamically allocating the batch buffer based on actual demand, + * allowing us to adjust the batch size for optimal performance in different + * scenarios. + */ +#define QI_MAX_BATCHED_DESC_COUNT 16 +struct qi_batch { + struct qi_desc descs[QI_MAX_BATCHED_DESC_COUNT]; + unsigned int index; +}; + struct dmar_domain { - int nid; /* node id */ + union { + struct iommu_domain domain; + struct pt_iommu iommu; + /* First stage page table */ + struct pt_iommu_x86_64 fspt; + /* Second stage page table */ + struct pt_iommu_vtdss sspt; + }; + struct xarray iommu_array; /* Attached IOMMU array */ - u8 has_iotlb_device: 1; - u8 iommu_coherency: 1; /* indicate coherency of iommu access */ - u8 force_snooping : 1; /* Create IOPTEs with snoop control */ - u8 set_pte_snp:1; - u8 use_first_level:1; /* DMA translation for the domain goes - * through the first level page table, - * otherwise, goes through the second - * level. - */ + u8 force_snooping:1; /* Create PASID entry with snoop control */ u8 dirty_tracking:1; /* Dirty tracking is enabled */ u8 nested_parent:1; /* Has other domains nested on it */ - u8 has_mappings:1; /* Has mappings configured through - * iommu_map() interface. + u8 iotlb_sync_map:1; /* Need to flush IOTLB cache or write + * buffer when creating mappings. */ spinlock_t lock; /* Protect device tracking lists */ @@ -610,27 +619,11 @@ struct dmar_domain { spinlock_t cache_lock; /* Protect the cache tag list */ struct list_head cache_tags; /* Cache tag list */ + struct qi_batch *qi_batch; /* Batched QI descriptors */ - int iommu_superpage;/* Level of superpages supported: - 0 == 4KiB (no superpages), 1 == 2MiB, - 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ union { /* DMA remapping domain */ struct { - /* virtual address */ - struct dma_pte *pgd; - /* max guest address width */ - int gaw; - /* - * adjusted guest address width: - * 0: level 2 30-bit - * 1: level 3 39-bit - * 2: level 4 48-bit - * 3: level 5 57-bit - */ - int agaw; - /* maximum mapped address */ - u64 max_addr; /* Protect the s1_domains list */ spinlock_t s1_lock; /* Track s1_domains nested on this domain */ @@ -641,8 +634,6 @@ struct dmar_domain { struct { /* parent page table which the user domain is nested on */ struct dmar_domain *s2_domain; - /* user page table pointer (in GPA) */ - unsigned long s1_pgtbl; /* page table attributes */ struct iommu_hwpt_vtd_s1 s1_cfg; /* link to parent domain siblings */ @@ -654,10 +645,10 @@ struct dmar_domain { struct mmu_notifier notifier; }; }; - - struct iommu_domain domain; /* generic domain data structure for - iommu core */ }; +PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, sspt.iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, fspt.iommu, domain); /* * In theory, the VT-d 4.0 spec can support up to 2 ^ 16 counters. 
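struct qi_batch in the hunk above implements exactly the fixed-size batching its comment describes: up to 16 invalidation descriptors are accumulated and submitted together. A hypothetical user-space sketch of the accumulate-then-flush pattern (batch_add(), batch_flush() and the reduced qi_desc are stand-ins; only the 16-entry array and index mirror the driver):

#include <stdio.h>

#define QI_MAX_BATCHED_DESC_COUNT 16

struct qi_desc {
	unsigned long long qw0, qw1;	/* reduced stand-in for the real descriptor */
};

struct qi_batch {
	struct qi_desc descs[QI_MAX_BATCHED_DESC_COUNT];
	unsigned int index;
};

static void batch_flush(struct qi_batch *b)
{
	if (!b->index)
		return;
	/* stand-in for submitting the queued descriptors to hardware */
	printf("submitting %u descriptors\n", b->index);
	b->index = 0;
}

static void batch_add(struct qi_batch *b, struct qi_desc d)
{
	b->descs[b->index++] = d;
	if (b->index == QI_MAX_BATCHED_DESC_COUNT)
		batch_flush(b);		/* array full: submit and restart */
}

int main(void)
{
	struct qi_batch b = { .index = 0 };

	for (int i = 0; i < 20; i++)
		batch_add(&b, (struct qi_desc){ .qw0 = (unsigned long long)i });
	batch_flush(&b);		/* drain the 4 leftover descriptors */
	return 0;
}

Batching amortizes the submission cost over several descriptors; the driver comment leaves dynamically sizing the batch as a possible future refinement.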
@@ -688,8 +679,6 @@ struct iommu_pmu { DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX); struct perf_event *event_list[IOMMU_PMU_IDX_MAX]; unsigned char irq_name[16]; - struct hlist_node cpuhp_node; - int cpu; }; #define IOMMU_IRQ_ID_OFFSET_PRQ (DMAR_UNITS_SUPPORTED) @@ -710,22 +699,22 @@ struct intel_iommu { int msagaw; /* max sagaw of this iommu */ unsigned int irq, pr_irq, perf_irq; u16 segment; /* PCI segment# */ - unsigned char name[13]; /* Device Name */ + unsigned char name[16]; /* Device Name */ #ifdef CONFIG_INTEL_IOMMU - unsigned long *domain_ids; /* bitmap of domains */ + /* mutex to protect domain_ida */ + struct mutex did_lock; + struct ida domain_ida; /* domain id allocator */ unsigned long *copied_tables; /* bitmap of copied tables */ spinlock_t lock; /* protect context, domain ids */ struct root_entry *root_entry; /* virtual address */ struct iommu_flush flush; #endif -#ifdef CONFIG_INTEL_IOMMU_SVM struct page_req_dsc *prq; unsigned char prq_name[16]; /* Name for PRQ interrupt */ unsigned long prq_seq_number; struct completion prq_complete; -#endif struct iopf_queue *iopf_queue; unsigned char iopfq_name[16]; /* Synchronization between fault report and iommu device release. */ @@ -766,7 +755,9 @@ struct device_domain_info { u8 ats_supported:1; u8 ats_enabled:1; u8 dtlb_extra_inval:1; /* Quirk for devices need extra flush */ + u8 domain_attached:1; /* Device has domain attached */ u8 ats_qdep; + unsigned int iopf_refcount; struct device *dev; /* it's NULL for PCIe-to-PCI bridge */ struct intel_iommu *iommu; /* IOMMU used by this device */ struct dmar_domain *domain; /* pointer to domain */ @@ -800,6 +791,24 @@ static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom) return container_of(dom, struct dmar_domain, domain); } +/* + * Domain ID 0 and 1 are reserved: + * + * If Caching mode is set, then invalid translations are tagged + * with domain-id 0, hence we need to pre-allocate it. We also + * use domain-id 0 as a marker for non-allocated domain-id, so + * make sure it is not used for a real domain. + * + * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid + * entry for first-level or pass-through translation modes should + * be programmed with a domain id different from those used for + * second-level or nested translation. We reserve a domain id for + * this purpose. This domain id is also used for identity domain + * in legacy mode. 
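 * (Editor's illustration, not part of the patch: with the new IDA
 *  allocator this reservation is simply the lower bound of the
 *  allocation range, e.g.
 *
 *	did = ida_alloc_range(&iommu->domain_ida, IDA_START_DID,
 *			      cap_ndoms(iommu->cap) - 1, GFP_KERNEL);
 *
 *  where ida_alloc_range() returns the smallest free ID in
 *  [IDA_START_DID, cap_ndoms() - 1] or a negative errno. A sketch of
 *  the idea, not necessarily the exact call site in this series.)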
+ */ +#define FLPT_DEFAULT_DID 1 +#define IDA_START_DID 2 + /* Retrieve the domain ID which has allocated to the domain */ static inline u16 domain_id_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) @@ -810,6 +819,21 @@ domain_id_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) return info->did; } +static inline u16 +iommu_domain_did(struct iommu_domain *domain, struct intel_iommu *iommu) +{ + if (domain->type == IOMMU_DOMAIN_SVA || + domain->type == IOMMU_DOMAIN_IDENTITY) + return FLPT_DEFAULT_DID; + return domain_id_iommu(to_dmar_domain(domain), iommu); +} + +static inline bool dev_is_real_dma_subdevice(struct device *dev) +{ + return dev && dev_is_pci(dev) && + pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); +} + /* * 0: readable * 1: writable @@ -823,19 +847,13 @@ struct dma_pte { u64 val; }; -static inline void dma_clear_pte(struct dma_pte *pte) -{ - pte->val = 0; -} - static inline u64 dma_pte_addr(struct dma_pte *pte) { #ifdef CONFIG_64BIT - return pte->val & VTD_PAGE_MASK & (~DMA_FL_PTE_XD); + return pte->val & VTD_PAGE_MASK; #else /* Must have a full atomic 64-bit read */ - return __cmpxchg64(&pte->val, 0ULL, 0ULL) & - VTD_PAGE_MASK & (~DMA_FL_PTE_XD); + return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK; #endif } @@ -844,32 +862,11 @@ static inline bool dma_pte_present(struct dma_pte *pte) return (pte->val & 3) != 0; } -static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte, - unsigned long flags) -{ - if (flags & IOMMU_DIRTY_NO_CLEAR) - return (pte->val & DMA_SL_PTE_DIRTY) != 0; - - return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT, - (unsigned long *)&pte->val); -} - static inline bool dma_pte_superpage(struct dma_pte *pte) { return (pte->val & DMA_PTE_LARGE_PAGE); } -static inline bool first_pte_in_page(struct dma_pte *pte) -{ - return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE); -} - -static inline int nr_pte_to_next_page(struct dma_pte *pte) -{ - return first_pte_in_page(pte) ? BIT_ULL(VTD_STRIDE_SHIFT) : - (struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte; -} - static inline bool context_present(struct context_entry *context) { return (context->lo & 1); @@ -885,11 +882,6 @@ static inline int agaw_to_level(int agaw) return agaw + 2; } -static inline int agaw_to_width(int agaw) -{ - return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); -} - static inline int width_to_agaw(int width) { return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); @@ -905,44 +897,6 @@ static inline int pfn_level_offset(u64 pfn, int level) return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; } -static inline u64 level_mask(int level) -{ - return -1ULL << level_to_offset_bits(level); -} - -static inline u64 level_size(int level) -{ - return 1ULL << level_to_offset_bits(level); -} - -static inline u64 align_to_level(u64 pfn, int level) -{ - return (pfn + level_size(level) - 1) & level_mask(level); -} - -static inline unsigned long lvl_to_nr_pages(unsigned int lvl) -{ - return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); -} - -/* VT-d pages must always be _smaller_ than MM pages. Otherwise things - are never going to work. 
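 * (Editor's example: with 64KiB kernel pages, PAGE_SHIFT - VTD_PAGE_SHIFT
 * is 4, so each mm pfn fans out to sixteen 4KiB VT-d pfns; with 4KiB
 * kernel pages the shift is zero and the conversion is the identity.)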
*/ -static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn) -{ - return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); -} -static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn) -{ - return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1; -} -static inline unsigned long page_to_dma_pfn(struct page *pg) -{ - return mm_to_dma_pfn_start(page_to_pfn(pg)); -} -static inline unsigned long virt_to_dma_pfn(void *p) -{ - return page_to_dma_pfn(virt_to_page(p)); -} static inline void context_set_present(struct context_entry *context) { @@ -1047,6 +1001,15 @@ static inline void context_set_sm_pre(struct context_entry *context) context->lo |= BIT_ULL(4); } +/* + * Clear the PRE(Page Request Enable) field of a scalable mode context + * entry. + */ +static inline void context_clear_sm_pre(struct context_entry *context) +{ + context->lo &= ~BIT_ULL(4); +} + /* Returns a number of VTD pages, but aligned to MM page size */ static inline unsigned long aligned_nrpages(unsigned long host_addr, size_t size) { @@ -1060,6 +1023,115 @@ static inline unsigned long nrpages_to_size(unsigned long npages) return npages << VTD_PAGE_SHIFT; } +static inline void qi_desc_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type, + struct qi_desc *desc) +{ + u8 dw = 0, dr = 0; + int ih = addr & 1; + + if (cap_write_drain(iommu->cap)) + dw = 1; + + if (cap_read_drain(iommu->cap)) + dr = 1; + + desc->qw0 = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw) + | QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE; + desc->qw1 = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih) + | QI_IOTLB_AM(size_order); + desc->qw2 = 0; + desc->qw3 = 0; +} + +static inline void qi_desc_dev_iotlb(u16 sid, u16 pfsid, u16 qdep, u64 addr, + unsigned int mask, struct qi_desc *desc) +{ + if (mask) { + addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1; + desc->qw1 = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE; + } else { + desc->qw1 = QI_DEV_IOTLB_ADDR(addr); + } + + if (qdep >= QI_DEV_IOTLB_MAX_INVS) + qdep = 0; + + desc->qw0 = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) | + QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid); + desc->qw2 = 0; + desc->qw3 = 0; +} + +static inline void qi_desc_piotlb(u16 did, u32 pasid, u64 addr, + unsigned long npages, bool ih, + struct qi_desc *desc) +{ + if (npages == -1) { + desc->qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | + QI_EIOTLB_TYPE; + desc->qw1 = 0; + } else { + int mask = ilog2(__roundup_pow_of_two(npages)); + unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask)); + + if (WARN_ON_ONCE(!IS_ALIGNED(addr, align))) + addr = ALIGN_DOWN(addr, align); + + desc->qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | + QI_EIOTLB_TYPE; + desc->qw1 = QI_EIOTLB_ADDR(addr) | + QI_EIOTLB_IH(ih) | + QI_EIOTLB_AM(mask); + } +} + +static inline void qi_desc_dev_iotlb_pasid(u16 sid, u16 pfsid, u32 pasid, + u16 qdep, u64 addr, + unsigned int size_order, + struct qi_desc *desc) +{ + unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1); + + desc->qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) | + QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE | + QI_DEV_IOTLB_PFSID(pfsid); + + /* + * If S bit is 0, we only flush a single page. If S bit is set, + * The least significant zero bit indicates the invalidation address + * range. VT-d spec 6.5.2.6. + * e.g. address bit 12[0] indicates 8KB, 13[0] indicates 16KB. 
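 * (Worked example, editor's addition: for size_order = 2 the code below
 * first sets address bits [13:12], then clears bit 13, leaving bit 12
 * set and bit 13 zero. The least significant zero bit is therefore
 * bit 13, which encodes a 2^(13+1) byte = 16KB invalidation range.)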
+ * size order = 0 is PAGE_SIZE 4KB + * Max Invs Pending (MIP) is set to 0 for now until we have DIT in + * ECAP. + */ + if (!IS_ALIGNED(addr, VTD_PAGE_SIZE << size_order)) + pr_warn_ratelimited("Invalidate non-aligned address %llx, order %d\n", + addr, size_order); + + /* Take page address */ + desc->qw1 = QI_DEV_EIOTLB_ADDR(addr); + + if (size_order) { + /* + * Existing 0s in address below size_order may be the least + * significant bit, we must set them to 1s to avoid having + * smaller size than desired. + */ + desc->qw1 |= GENMASK_ULL(size_order + VTD_PAGE_SHIFT - 1, + VTD_PAGE_SHIFT); + /* Clear size_order bit to indicate size */ + desc->qw1 &= ~mask; + /* Set the S bit to indicate flushing more than 1 page */ + desc->qw1 |= QI_DEV_EIOTLB_SIZE; + } +} + /* Convert value to context PASID directory size field coding. */ #define context_pdts(pds) (((pds) & 0x7) << 9) @@ -1091,25 +1163,37 @@ void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu, int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, unsigned int count, unsigned long options); + +void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type); /* * Options used in qi_submit_sync: * QI_OPT_WAIT_DRAIN - Wait for PRQ drain completion, spec 6.5.2.8. */ #define QI_OPT_WAIT_DRAIN BIT(0) -void domain_update_iotlb(struct dmar_domain *domain); int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void device_block_translation(struct device *dev); -int prepare_domain_attach_device(struct iommu_domain *domain, - struct device *dev); -void domain_update_iommu_cap(struct dmar_domain *domain); +int paging_domain_compatible(struct iommu_domain *domain, struct device *dev); + +struct dev_pasid_info * +domain_add_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); +void domain_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); + +int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev, + ioasid_t pasid, u16 did, phys_addr_t fsptptr, + int flags, struct iommu_domain *old); int dmar_ir_support(void); void iommu_flush_write_buffer(struct intel_iommu *iommu); -struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, - const struct iommu_user_data *user_data); +struct iommu_domain * +intel_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, + u32 flags, + const struct iommu_user_data *user_data); struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid); enum cache_tag_type { @@ -1135,6 +1219,8 @@ struct cache_tag { unsigned int users; }; +int cache_tag_assign(struct dmar_domain *domain, u16 did, struct device *dev, + ioasid_t pasid, enum cache_tag_type type); int cache_tag_assign_domain(struct dmar_domain *domain, struct device *dev, ioasid_t pasid); void cache_tag_unassign_domain(struct dmar_domain *domain, @@ -1145,18 +1231,57 @@ void cache_tag_flush_all(struct dmar_domain *domain); void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, unsigned long end); +void intel_context_flush_no_pasid(struct device_domain_info *info, + struct context_entry *context, u16 did); + +int intel_iommu_enable_prq(struct intel_iommu *iommu); +int intel_iommu_finish_prq(struct intel_iommu *iommu); +void intel_iommu_page_response(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg); +void 
intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid); + +int intel_iommu_enable_iopf(struct device *dev); +void intel_iommu_disable_iopf(struct device *dev); + +static inline int iopf_for_domain_set(struct iommu_domain *domain, + struct device *dev) +{ + if (!domain || !domain->iopf_handler) + return 0; + + return intel_iommu_enable_iopf(dev); +} + +static inline void iopf_for_domain_remove(struct iommu_domain *domain, + struct device *dev) +{ + if (!domain || !domain->iopf_handler) + return; + + intel_iommu_disable_iopf(dev); +} + +static inline int iopf_for_domain_replace(struct iommu_domain *new, + struct iommu_domain *old, + struct device *dev) +{ + int ret; + + ret = iopf_for_domain_set(new, dev); + if (ret) + return ret; + + iopf_for_domain_remove(old, dev); + + return 0; +} + #ifdef CONFIG_INTEL_IOMMU_SVM void intel_svm_check(struct intel_iommu *iommu); -int intel_svm_enable_prq(struct intel_iommu *iommu); -int intel_svm_finish_prq(struct intel_iommu *iommu); -void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, - struct iommu_page_response *msg); struct iommu_domain *intel_svm_domain_alloc(struct device *dev, struct mm_struct *mm); -void intel_drain_pasid_prq(struct device *dev, u32 pasid); #else static inline void intel_svm_check(struct intel_iommu *iommu) {} -static inline void intel_drain_pasid_prq(struct device *dev, u32 pasid) {} static inline struct iommu_domain *intel_svm_domain_alloc(struct device *dev, struct mm_struct *mm) { @@ -1183,6 +1308,18 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, u8 devfn, int alloc); extern const struct iommu_ops intel_iommu_ops; +extern const struct iommu_domain_ops intel_fs_paging_domain_ops; +extern const struct iommu_domain_ops intel_ss_paging_domain_ops; + +static inline bool intel_domain_is_fs_paging(struct dmar_domain *domain) +{ + return domain->domain.ops == &intel_fs_paging_domain_ops; +} + +static inline bool intel_domain_is_ss_paging(struct dmar_domain *domain) +{ + return domain->domain.ops == &intel_ss_paging_domain_ops; +} #ifdef CONFIG_INTEL_IOMMU extern int intel_iommu_sm; diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index e4a70886678c..4f9b01dc91e8 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -10,6 +10,7 @@ #include <linux/hpet.h> #include <linux/pci.h> #include <linux/irq.h> +#include <linux/irqchip/irq-msi-lib.h> #include <linux/acpi.h> #include <linux/irqdomain.h> #include <linux/crash_dump.h> @@ -24,12 +25,6 @@ #include "iommu.h" #include "../irq_remapping.h" #include "../iommu-pages.h" -#include "cap_audit.h" - -enum irq_mode { - IRQ_REMAPPING, - IRQ_POSTING, -}; struct ioapic_scope { struct intel_iommu *iommu; @@ -50,8 +45,8 @@ struct irq_2_iommu { u16 irte_index; u16 sub_handle; u8 irte_mask; - enum irq_mode mode; bool posted_msi; + bool posted_vcpu; }; struct intel_ir_data { @@ -139,7 +134,6 @@ static int alloc_irte(struct intel_iommu *iommu, irq_iommu->irte_index = index; irq_iommu->sub_handle = 0; irq_iommu->irte_mask = mask; - irq_iommu->mode = IRQ_REMAPPING; } raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); @@ -194,8 +188,6 @@ static int modify_irte(struct irq_2_iommu *irq_iommu, rc = qi_flush_iec(iommu, index, 0); - /* Update iommu mode according to the IRTE mode */ - irq_iommu->mode = irte->pst ? 
IRQ_POSTING : IRQ_REMAPPING; raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); return rc; @@ -312,7 +304,7 @@ static int set_ioapic_sid(struct irte *irte, int apic) for (i = 0; i < MAX_IO_APICS; i++) { if (ir_ioapic[i].iommu && ir_ioapic[i].id == apic) { - sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn; + sid = PCI_DEVID(ir_ioapic[i].bus, ir_ioapic[i].devfn); break; } } @@ -337,7 +329,7 @@ static int set_hpet_sid(struct irte *irte, u8 id) for (i = 0; i < MAX_HPET_TBS; i++) { if (ir_hpet[i].iommu && ir_hpet[i].id == id) { - sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn; + sid = PCI_DEVID(ir_hpet[i].bus, ir_hpet[i].devfn); break; } } @@ -527,8 +519,14 @@ static void iommu_enable_irq_remapping(struct intel_iommu *iommu) static int intel_setup_irq_remapping(struct intel_iommu *iommu) { + struct irq_domain_info info = { + .ops = &intel_ir_domain_ops, + .parent = arch_get_ir_parent_domain(), + .domain_flags = IRQ_DOMAIN_FLAG_ISOLATED_MSI, + .size = INTR_REMAP_TABLE_ENTRIES, + .host_data = iommu, + }; struct ir_table *ir_table; - struct fwnode_handle *fn; unsigned long *bitmap; void *ir_table_base; @@ -539,11 +537,11 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) if (!ir_table) return -ENOMEM; - ir_table_base = iommu_alloc_pages_node(iommu->node, GFP_KERNEL, - INTR_REMAP_PAGE_ORDER); + /* 1MB - maximum possible interrupt remapping table size */ + ir_table_base = + iommu_alloc_pages_node_sz(iommu->node, GFP_KERNEL, SZ_1M); if (!ir_table_base) { - pr_err("IR%d: failed to allocate pages of order %d\n", - iommu->seq_id, INTR_REMAP_PAGE_ORDER); + pr_err("IR%d: failed to allocate 1M of pages\n", iommu->seq_id); goto out_free_table; } @@ -553,25 +551,16 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) goto out_free_pages; } - fn = irq_domain_alloc_named_id_fwnode("INTEL-IR", iommu->seq_id); - if (!fn) + info.fwnode = irq_domain_alloc_named_id_fwnode("INTEL-IR", iommu->seq_id); + if (!info.fwnode) goto out_free_bitmap; - iommu->ir_domain = - irq_domain_create_hierarchy(arch_get_ir_parent_domain(), - 0, INTR_REMAP_TABLE_ENTRIES, - fn, &intel_ir_domain_ops, - iommu); + iommu->ir_domain = msi_create_parent_irq_domain(&info, &dmar_msi_parent_ops); if (!iommu->ir_domain) { pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id); goto out_free_fwnode; } - irq_domain_update_bus_token(iommu->ir_domain, DOMAIN_BUS_DMAR); - iommu->ir_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT | - IRQ_DOMAIN_FLAG_ISOLATED_MSI; - iommu->ir_domain->msi_parent_ops = &dmar_msi_parent_ops; - ir_table->base = ir_table_base; ir_table->bitmap = bitmap; iommu->ir_table = ir_table; @@ -597,8 +586,8 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) if (ir_pre_enabled(iommu)) { if (!is_kdump_kernel()) { - pr_warn("IRQ remapping was enabled on %s but we are not in kdump mode\n", - iommu->name); + pr_info_once("IRQ remapping was enabled on %s but we are not in kdump mode\n", + iommu->name); clear_ir_pre_enabled(iommu); iommu_disable_irq_remapping(iommu); } else if (iommu_load_old_irte(iommu)) @@ -617,11 +606,11 @@ out_free_ir_domain: irq_domain_remove(iommu->ir_domain); iommu->ir_domain = NULL; out_free_fwnode: - irq_domain_free_fwnode(fn); + irq_domain_free_fwnode(info.fwnode); out_free_bitmap: bitmap_free(bitmap); out_free_pages: - iommu_free_pages(ir_table_base, INTR_REMAP_PAGE_ORDER); + iommu_free_pages(ir_table_base); out_free_table: kfree(ir_table); @@ -642,7 +631,7 @@ static void intel_teardown_irq_remapping(struct intel_iommu *iommu) irq_domain_free_fwnode(fn); 
iommu->ir_domain = NULL; } - iommu_free_pages(iommu->ir_table->base, INTR_REMAP_PAGE_ORDER); + iommu_free_pages(iommu->ir_table->base); bitmap_free(iommu->ir_table->bitmap); kfree(iommu->ir_table); iommu->ir_table = NULL; @@ -727,9 +716,6 @@ static int __init intel_prepare_irq_remapping(void) if (dmar_table_init() < 0) return -ENODEV; - if (intel_cap_audit(CAP_AUDIT_STATIC_IRQR, NULL)) - return -ENODEV; - if (!dmar_ir_support()) return -ENODEV; @@ -1173,7 +1159,26 @@ static void intel_ir_reconfigure_irte_posted(struct irq_data *irqd) static inline void intel_ir_reconfigure_irte_posted(struct irq_data *irqd) {} #endif -static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force) +static void __intel_ir_reconfigure_irte(struct irq_data *irqd, bool force_host) +{ + struct intel_ir_data *ir_data = irqd->chip_data; + + /* + * Don't modify IRTEs for IRQs that are being posted to vCPUs if the + * host CPU affinity changes. + */ + if (ir_data->irq_2_iommu.posted_vcpu && !force_host) + return; + + ir_data->irq_2_iommu.posted_vcpu = false; + + if (ir_data->irq_2_iommu.posted_msi) + intel_ir_reconfigure_irte_posted(irqd); + else + modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry); +} + +static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force_host) { struct intel_ir_data *ir_data = irqd->chip_data; struct irte *irte = &ir_data->irte_entry; @@ -1186,10 +1191,7 @@ static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force) irte->vector = cfg->vector; irte->dest_id = IRTE_DEST(cfg->dest_apicid); - if (ir_data->irq_2_iommu.posted_msi) - intel_ir_reconfigure_irte_posted(irqd); - else if (force || ir_data->irq_2_iommu.mode == IRQ_REMAPPING) - modify_irte(&ir_data->irq_2_iommu, irte); + __intel_ir_reconfigure_irte(irqd, force_host); } /* @@ -1240,11 +1242,11 @@ static void intel_ir_compose_msi_msg(struct irq_data *irq_data, static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info) { struct intel_ir_data *ir_data = data->chip_data; - struct vcpu_data *vcpu_pi_info = info; + struct intel_iommu_pi_data *pi_data = info; /* stop posting interrupts, back to the default mode */ - if (!vcpu_pi_info) { - modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry); + if (!pi_data) { + __intel_ir_reconfigure_irte(data, true); } else { struct irte irte_pi; @@ -1261,12 +1263,13 @@ static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info) /* Update the posted mode fields */ irte_pi.p_pst = 1; irte_pi.p_urgent = 0; - irte_pi.p_vector = vcpu_pi_info->vector; - irte_pi.pda_l = (vcpu_pi_info->pi_desc_addr >> + irte_pi.p_vector = pi_data->vector; + irte_pi.pda_l = (pi_data->pi_desc_addr >> (32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT); - irte_pi.pda_h = (vcpu_pi_info->pi_desc_addr >> 32) & + irte_pi.pda_h = (pi_data->pi_desc_addr >> 32) & ~(-1UL << PDA_HIGH_BIT); + ir_data->irq_2_iommu.posted_vcpu = true; modify_irte(&ir_data->irq_2_iommu, &irte_pi); } @@ -1282,43 +1285,44 @@ static struct irq_chip intel_ir_chip = { }; /* - * With posted MSIs, all vectors are multiplexed into a single notification - * vector. Devices MSIs are then dispatched in a demux loop where - * EOIs can be coalesced as well. + * With posted MSIs, the MSI vectors are multiplexed into a single notification + * vector, and only the notification vector is sent to the APIC IRR. Device + * MSIs are then dispatched in a demux loop that harvests the MSIs from the + * CPU's Posted Interrupt Request bitmap. I.e. Posted MSIs never get sent to + * the APIC IRR, and thus do not need an EOI. 
The notification handler instead + * performs a single EOI after processing the PIR. * - * "INTEL-IR-POST" IRQ chip does not do EOI on ACK, thus the dummy irq_ack() - * function. Instead EOI is performed by the posted interrupt notification - * handler. + * Note! Pending SMP/CPU affinity changes, which are per MSI, must still be + * honored, only the APIC EOI is omitted. * * For the example below, 3 MSIs are coalesced into one CPU notification. Only - * one apic_eoi() is needed. + * one apic_eoi() is needed, but each MSI needs to process pending changes to + * its CPU affinity. * * __sysvec_posted_msi_notification() * irq_enter(); * handle_edge_irq() * irq_chip_ack_parent() - * dummy(); // No EOI + * irq_move_irq(); // No EOI * handle_irq_event() * driver_handler() * handle_edge_irq() * irq_chip_ack_parent() - * dummy(); // No EOI + * irq_move_irq(); // No EOI * handle_irq_event() * driver_handler() * handle_edge_irq() * irq_chip_ack_parent() - * dummy(); // No EOI + * irq_move_irq(); // No EOI * handle_irq_event() * driver_handler() * apic_eoi() * irq_exit() + * */ - -static void dummy_ack(struct irq_data *d) { } - static struct irq_chip intel_ir_chip_post_msi = { .name = "INTEL-IR-POST", - .irq_ack = dummy_ack, + .irq_ack = irq_move_irq, .irq_set_affinity = intel_ir_set_affinity, .irq_compose_msi_msg = intel_ir_compose_msi_msg, .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity, @@ -1352,12 +1356,11 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, case X86_IRQ_ALLOC_TYPE_IOAPIC: /* Set source-id of interrupt request */ set_ioapic_sid(irte, info->devid); - apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n", - info->devid, irte->present, irte->fpd, - irte->dst_mode, irte->redir_hint, - irte->trigger_mode, irte->dlvry_mode, - irte->avail, irte->vector, irte->dest_id, - irte->sid, irte->sq, irte->svt); + apic_pr_verbose("IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n", + info->devid, irte->present, irte->fpd, irte->dst_mode, + irte->redir_hint, irte->trigger_mode, irte->dlvry_mode, + irte->avail, irte->vector, irte->dest_id, irte->sid, + irte->sq, irte->svt); sub_handle = info->ioapic.pin; break; case X86_IRQ_ALLOC_TYPE_HPET: @@ -1464,7 +1467,6 @@ static int intel_irq_remapping_alloc(struct irq_domain *domain, else irq_data->chip = &intel_ir_chip; intel_irq_remapping_prepare_irte(ird, irq_cfg, info, index, i); - irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT); } return 0; @@ -1495,6 +1497,9 @@ static void intel_irq_remapping_deactivate(struct irq_domain *domain, struct intel_ir_data *data = irq_data->chip_data; struct irte entry; + WARN_ON_ONCE(data->irq_2_iommu.posted_vcpu); + data->irq_2_iommu.posted_vcpu = false; + memset(&entry, 0, sizeof(entry)); modify_irte(&data->irq_2_iommu, &entry); } @@ -1523,6 +1528,8 @@ static const struct irq_domain_ops intel_ir_domain_ops = { static const struct msi_parent_ops dmar_msi_parent_ops = { .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED | MSI_FLAG_MULTI_PCI_MSI, + .bus_select_token = DOMAIN_BUS_DMAR, + .bus_select_mask = MATCH_PCI_MSI, .prefix = "IR-", .init_dev_msi_info = msi_parent_init_dev_msi_info, }; @@ -1535,10 +1542,6 @@ static int dmar_ir_add(struct dmar_drhd_unit *dmaru, struct intel_iommu *iommu) int ret; int eim = x2apic_enabled(); - ret = 
intel_cap_audit(CAP_AUDIT_HOTPLUG_IRQR, iommu); - if (ret) - return ret; - if (eim && !ecap_eim_support(iommu->ecap)) { pr_info("DRHD %Lx: EIM not supported by DRHD, ecap %Lx\n", iommu->reg_phys, iommu->ecap); diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 16a2bcf5cfeb..a3fb8c193ca6 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -19,7 +19,7 @@ #include "pasid.h" static int intel_nested_attach_dev(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); @@ -27,20 +27,14 @@ static int intel_nested_attach_dev(struct iommu_domain *domain, unsigned long flags; int ret = 0; - if (info->domain) - device_block_translation(dev); - - if (iommu->agaw < dmar_domain->s2_domain->agaw) { - dev_err_ratelimited(dev, "Adjusted guest address width not compatible\n"); - return -ENODEV; - } + device_block_translation(dev); /* * Stage-1 domain cannot work alone, it is nested on a s2_domain. * The s2_domain will be used in nested translation, hence needs * to ensure the s2_domain is compatible with this IOMMU. */ - ret = prepare_domain_attach_device(&dmar_domain->s2_domain->domain, dev); + ret = paging_domain_compatible(&dmar_domain->s2_domain->domain, dev); if (ret) { dev_err_ratelimited(dev, "s2 domain is not compatible\n"); return ret; @@ -56,19 +50,24 @@ static int intel_nested_attach_dev(struct iommu_domain *domain, if (ret) goto detach_iommu; + ret = iopf_for_domain_set(domain, dev); + if (ret) + goto unassign_tag; + ret = intel_pasid_setup_nested(iommu, dev, IOMMU_NO_PASID, dmar_domain); if (ret) - goto unassign_tag; + goto disable_iopf; info->domain = dmar_domain; + info->domain_attached = true; spin_lock_irqsave(&dmar_domain->lock, flags); list_add(&info->link, &dmar_domain->devices); spin_unlock_irqrestore(&dmar_domain->lock, flags); - domain_update_iotlb(dmar_domain); - return 0; +disable_iopf: + iopf_for_domain_remove(domain, dev); unassign_tag: cache_tag_unassign_domain(dmar_domain, dev, IOMMU_NO_PASID); detach_iommu: @@ -85,6 +84,7 @@ static void intel_nested_domain_free(struct iommu_domain *domain) spin_lock(&s2_domain->s1_lock); list_del(&dmar_domain->s2_link); spin_unlock(&s2_domain->s1_lock); + kfree(dmar_domain->qi_batch); kfree(dmar_domain); } @@ -131,25 +131,87 @@ out: return ret; } +static int domain_setup_nested(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_nested(iommu, dev, pasid, domain); + return intel_pasid_replace_nested(iommu, dev, pasid, + iommu_domain_did(old, iommu), + domain); +} + +static int intel_nested_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + struct dev_pasid_info *dev_pasid; + int ret; + + if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) + return -EOPNOTSUPP; + + if (context_copied(iommu, info->bus, info->devfn)) + return -EBUSY; + + ret = paging_domain_compatible(&dmar_domain->s2_domain->domain, dev); + if (ret) + return ret; + + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); + if (IS_ERR(dev_pasid)) + return PTR_ERR(dev_pasid); + + ret = iopf_for_domain_replace(domain, old, dev); 
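	/*
	 * (Editor's note) iopf_for_domain_replace() is make-before-break:
	 * it enables the I/O page fault path for the incoming domain
	 * before disabling it for the old one, so a fault-capable device
	 * never loses its PRQ handling while the PASID entry is switched.
	 * On failure the arguments are swapped below to restore the old
	 * state.
	 */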
+ if (ret) + goto out_remove_dev_pasid; + + ret = domain_setup_nested(iommu, dmar_domain, dev, pasid, old); + if (ret) + goto out_unwind_iopf; + + domain_remove_dev_pasid(old, dev, pasid); + + return 0; + +out_unwind_iopf: + iopf_for_domain_replace(old, domain, dev); +out_remove_dev_pasid: + domain_remove_dev_pasid(domain, dev, pasid); + return ret; +} + static const struct iommu_domain_ops intel_nested_domain_ops = { .attach_dev = intel_nested_attach_dev, + .set_dev_pasid = intel_nested_set_dev_pasid, .free = intel_nested_domain_free, .cache_invalidate_user = intel_nested_cache_invalidate_user, }; -struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, - const struct iommu_user_data *user_data) +struct iommu_domain * +intel_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, + u32 flags, + const struct iommu_user_data *user_data) { + struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *s2_domain = to_dmar_domain(parent); + struct intel_iommu *iommu = info->iommu; struct iommu_hwpt_vtd_s1 vtd; struct dmar_domain *domain; int ret; + if (!nested_supported(iommu) || flags & ~IOMMU_HWPT_ALLOC_PASID) + return ERR_PTR(-EOPNOTSUPP); + /* Must be nested domain */ if (user_data->type != IOMMU_HWPT_DATA_VTD_S1) return ERR_PTR(-EOPNOTSUPP); - if (parent->ops != intel_iommu_ops.default_domain_ops || - !s2_domain->nested_parent) + if (!intel_domain_is_ss_paging(s2_domain) || !s2_domain->nested_parent) return ERR_PTR(-EINVAL); ret = iommu_copy_struct_from_user(&vtd, user_data, @@ -161,9 +223,7 @@ struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, if (!domain) return ERR_PTR(-ENOMEM); - domain->use_first_level = true; domain->s2_domain = s2_domain; - domain->s1_pgtbl = vtd.pgtbl_addr; domain->s1_cfg = vtd; domain->domain.ops = &intel_nested_domain_ops; domain->domain.type = IOMMU_DOMAIN_NESTED; diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index abce19e2ad6f..3e2255057079 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -60,14 +60,14 @@ int intel_pasid_alloc_table(struct device *dev) size = max_pasid >> (PASID_PDE_SHIFT - 3); order = size ? get_order(size) : 0; - dir = iommu_alloc_pages_node(info->iommu->node, GFP_KERNEL, order); + dir = iommu_alloc_pages_node_sz(info->iommu->node, GFP_KERNEL, + 1 << (order + PAGE_SHIFT)); if (!dir) { kfree(pasid_table); return -ENOMEM; } pasid_table->table = dir; - pasid_table->order = order; pasid_table->max_pasid = 1 << (order + PAGE_SHIFT + 3); info->pasid_table = pasid_table; @@ -97,10 +97,10 @@ void intel_pasid_free_table(struct device *dev) max_pde = pasid_table->max_pasid >> PASID_PDE_SHIFT; for (i = 0; i < max_pde; i++) { table = get_pasid_table_from_pde(&dir[i]); - iommu_free_page(table); + iommu_free_pages(table); } - iommu_free_pages(pasid_table->table, pasid_table->order); + iommu_free_pages(pasid_table->table); kfree(pasid_table); } @@ -146,7 +146,10 @@ static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid) retry: entries = get_pasid_table_from_pde(&dir[dir_index]); if (!entries) { - entries = iommu_alloc_page_node(info->iommu->node, GFP_ATOMIC); + u64 tmp; + + entries = iommu_alloc_pages_node_sz(info->iommu->node, + GFP_ATOMIC, SZ_4K); if (!entries) return NULL; @@ -156,9 +159,10 @@ retry: * clear. However, this entry might be populated by others * while we are preparing it. Use theirs with a retry. 
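 * (Editor's aside: losing the race is harmless -- the locally allocated
 * page is freed again and the winner's directory entry is picked up on
 * the retry, which is why the try_cmpxchg64() below loops back to the
 * "retry:" label.)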
*/ - if (cmpxchg64(&dir[dir_index].val, 0ULL, - (u64)virt_to_phys(entries) | PASID_PTE_PRESENT)) { - iommu_free_page(entries); + tmp = 0ULL; + if (!try_cmpxchg64(&dir[dir_index].val, &tmp, + (u64)virt_to_phys(entries) | PASID_PTE_PRESENT)) { + iommu_free_pages(entries); goto retry; } if (!ecap_coherent(info->iommu->ecap)) { @@ -217,7 +221,7 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu, if (pci_dev_is_disconnected(to_pci_dev(dev))) return; - sid = info->bus << 8 | info->devfn; + sid = PCI_DEVID(info->bus, info->devfn); qdep = info->ats_qdep; pfsid = info->pfsid; @@ -241,11 +245,31 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, spin_lock(&iommu->lock); pte = intel_pasid_get_entry(dev, pasid); - if (WARN_ON(!pte) || !pasid_pte_is_present(pte)) { + if (WARN_ON(!pte)) { spin_unlock(&iommu->lock); return; } + if (!pasid_pte_is_present(pte)) { + if (!pasid_pte_is_fault_disabled(pte)) { + WARN_ON(READ_ONCE(pte->val[0]) != 0); + spin_unlock(&iommu->lock); + return; + } + + /* + * When a PASID is used for SVA by a device, it's possible + * that the pasid entry is non-present with the Fault + * Processing Disabled bit set. Clear the pasid entry and + * drain the PRQ for the PASID before return. + */ + pasid_clear_entry(pte); + spin_unlock(&iommu->lock); + intel_iommu_drain_pasid_prq(dev, pasid); + + return; + } + did = pasid_get_domain_id(pte); pgtt = pasid_pte_get_pgtt(pte); intel_pasid_clear_entry(dev, pasid, fault_ignore); @@ -261,9 +285,9 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, else iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); - /* Device IOTLB doesn't need to be flushed in caching mode. */ - if (!cap_caching_mode(iommu->cap)) - devtlb_invalidation_with_pasid(iommu, dev, pasid); + devtlb_invalidation_with_pasid(iommu, dev, pasid); + if (!fault_ignore) + intel_iommu_drain_pasid_prq(dev, pasid); } /* @@ -286,12 +310,72 @@ static void pasid_flush_caches(struct intel_iommu *iommu, } /* + * This function is supposed to be used after caller updates the fields + * except for the SSADE and P bit of a pasid table entry. It does the + * below: + * - Flush cacheline if needed + * - Flush the caches per Table 28 ”Guidance to Software for Invalidations“ + * of VT-d spec 5.0. + */ +static void intel_pasid_flush_present(struct intel_iommu *iommu, + struct device *dev, + u32 pasid, u16 did, + struct pasid_entry *pte) +{ + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + /* + * VT-d spec 5.0 table28 states guides for cache invalidation: + * + * - PASID-selective-within-Domain PASID-cache invalidation + * - PASID-selective PASID-based IOTLB invalidation + * - If (pasid is RID_PASID) + * - Global Device-TLB invalidation to affected functions + * Else + * - PASID-based Device-TLB invalidation (with S=1 and + * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions + */ + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); + + devtlb_invalidation_with_pasid(iommu, dev, pasid); +} + +/* * Set up the scalable mode pasid table entry for first only * translation type. 
*/ -int intel_pasid_setup_first_level(struct intel_iommu *iommu, - struct device *dev, pgd_t *pgd, - u32 pasid, u16 did, int flags) +static void pasid_pte_config_first_level(struct intel_iommu *iommu, + struct pasid_entry *pte, + phys_addr_t fsptptr, u16 did, + int flags) +{ + lockdep_assert_held(&iommu->lock); + + pasid_clear_entry(pte); + + /* Setup the first level page table pointer: */ + pasid_set_flptr(pte, fsptptr); + + if (flags & PASID_FLAG_FL5LP) + pasid_set_flpm(pte, 1); + + if (flags & PASID_FLAG_PAGE_SNOOP) + pasid_set_pgsnp(pte); + + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, iommu->agaw); + pasid_set_page_snoop(pte, flags & PASID_FLAG_PWSNP); + + /* Setup Present and PASID Granular Transfer Type: */ + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); + pasid_set_present(pte); +} + +int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev, + phys_addr_t fsptptr, u32 pasid, u16 did, + int flags) { struct pasid_entry *pte; @@ -319,64 +403,93 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, return -EBUSY; } - pasid_clear_entry(pte); + pasid_pte_config_first_level(iommu, pte, fsptptr, did, flags); - /* Setup the first level page table pointer: */ - pasid_set_flptr(pte, (u64)__pa(pgd)); + spin_unlock(&iommu->lock); - if (flags & PASID_FLAG_FL5LP) - pasid_set_flpm(pte, 1); + pasid_flush_caches(iommu, pte, pasid, did); - if (flags & PASID_FLAG_PAGE_SNOOP) - pasid_set_pgsnp(pte); + return 0; +} - pasid_set_domain_id(pte, did); - pasid_set_address_width(pte, iommu->agaw); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); - pasid_set_nxe(pte); +int intel_pasid_replace_first_level(struct intel_iommu *iommu, + struct device *dev, phys_addr_t fsptptr, + u32 pasid, u16 did, u16 old_did, + int flags) +{ + struct pasid_entry *pte, new_pte; - /* Setup Present and PASID Granular Transfer Type: */ - pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); - pasid_set_present(pte); + if (!ecap_flts(iommu->ecap)) { + pr_err("No first level translation support on %s\n", + iommu->name); + return -EINVAL; + } + + if ((flags & PASID_FLAG_FL5LP) && !cap_fl5lp_support(iommu->cap)) { + pr_err("No 5-level paging support for first-level on %s\n", + iommu->name); + return -EINVAL; + } + + pasid_pte_config_first_level(iommu, &new_pte, fsptptr, did, flags); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; spin_unlock(&iommu->lock); - pasid_flush_caches(iommu, pte, pasid, did); + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); return 0; } /* - * Skip top levels of page tables for iommu which has less agaw - * than default. Unnecessary for PT mode. + * Set up the scalable mode pasid entry for second only translation type. 
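 * (Editor's summary: as with the first-level variants above, *setup*
 * expects a non-present PASID entry and returns -EBUSY if one is
 * already installed, while *replace* expects a present entry, verifies
 * the old domain ID, and returns -EINVAL or -ENODEV otherwise.)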
*/ -static int iommu_skip_agaw(struct dmar_domain *domain, - struct intel_iommu *iommu, - struct dma_pte **pgd) +static void pasid_pte_config_second_level(struct intel_iommu *iommu, + struct pasid_entry *pte, + struct dmar_domain *domain, u16 did) { - int agaw; + struct pt_iommu_vtdss_hw_info pt_info; - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - *pgd = phys_to_virt(dma_pte_addr(*pgd)); - if (!dma_pte_present(*pgd)) - return -EINVAL; - } + lockdep_assert_held(&iommu->lock); - return agaw; + pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info); + pasid_clear_entry(pte); + pasid_set_domain_id(pte, did); + pasid_set_slptr(pte, pt_info.ssptptr); + pasid_set_address_width(pte, pt_info.aw); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); + pasid_set_fault_enable(pte); + pasid_set_page_snoop(pte, !(domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))); + if (domain->dirty_tracking) + pasid_set_ssade(pte); + + pasid_set_present(pte); } -/* - * Set up the scalable mode pasid entry for second only translation type. - */ int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, u32 pasid) { struct pasid_entry *pte; - struct dma_pte *pgd; - u64 pgd_val; - int agaw; u16 did; + /* * If hardware advertises no support for second level * translation, return directly. @@ -387,16 +500,50 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return -EINVAL; } - pgd = domain->pgd; - agaw = iommu_skip_agaw(domain, iommu, &pgd); - if (agaw < 0) { - dev_err(dev, "Invalid domain page table\n"); + did = domain_id_iommu(domain, iommu); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EBUSY; + } + + pasid_pte_config_second_level(iommu, pte, domain, did); + spin_unlock(&iommu->lock); + + pasid_flush_caches(iommu, pte, pasid, did); + + return 0; +} + +int intel_pasid_replace_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, u16 old_did, + u32 pasid) +{ + struct pasid_entry *pte, new_pte; + u16 did; + + /* + * If hardware advertises no support for second level + * translation, return directly. 
+ */ + if (!ecap_slts(iommu->ecap)) { + pr_err("No second level translation support on %s\n", + iommu->name); return -EINVAL; } - pgd_val = virt_to_phys(pgd); did = domain_id_iommu(domain, iommu); + pasid_pte_config_second_level(iommu, &new_pte, domain, did); + spin_lock(&iommu->lock); pte = intel_pasid_get_entry(dev, pasid); if (!pte) { @@ -404,25 +551,18 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return -ENODEV; } - if (pasid_pte_is_present(pte)) { + if (!pasid_pte_is_present(pte)) { spin_unlock(&iommu->lock); - return -EBUSY; + return -EINVAL; } - pasid_clear_entry(pte); - pasid_set_domain_id(pte, did); - pasid_set_slptr(pte, pgd_val); - pasid_set_address_width(pte, agaw); - pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); - pasid_set_fault_enable(pte); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); - if (domain->dirty_tracking) - pasid_set_ssade(pte); + WARN_ON(old_did != pasid_get_domain_id(pte)); - pasid_set_present(pte); + *pte = new_pte; spin_unlock(&iommu->lock); - pasid_flush_caches(iommu, pte, pasid, did); + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); return 0; } @@ -491,9 +631,7 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); - /* Device IOTLB doesn't need to be flushed in caching mode. */ - if (!cap_caching_mode(iommu->cap)) - devtlb_invalidation_with_pasid(iommu, dev, pasid); + devtlb_invalidation_with_pasid(iommu, dev, pasid); return 0; } @@ -501,6 +639,20 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, /* * Set up the scalable mode pasid entry for passthrough translation type. */ +static void pasid_pte_config_pass_through(struct intel_iommu *iommu, + struct pasid_entry *pte, u16 did) +{ + lockdep_assert_held(&iommu->lock); + + pasid_clear_entry(pte); + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, iommu->agaw); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT); + pasid_set_fault_enable(pte); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + pasid_set_present(pte); +} + int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct device *dev, u32 pasid) { @@ -519,13 +671,7 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return -EBUSY; } - pasid_clear_entry(pte); - pasid_set_domain_id(pte, did); - pasid_set_address_width(pte, iommu->agaw); - pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT); - pasid_set_fault_enable(pte); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); - pasid_set_present(pte); + pasid_pte_config_pass_through(iommu, pte, did); spin_unlock(&iommu->lock); pasid_flush_caches(iommu, pte, pasid, did); @@ -533,6 +679,38 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return 0; } +int intel_pasid_replace_pass_through(struct intel_iommu *iommu, + struct device *dev, u16 old_did, + u32 pasid) +{ + struct pasid_entry *pte, new_pte; + u16 did = FLPT_DEFAULT_DID; + + pasid_pte_config_pass_through(iommu, &new_pte, did); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; + spin_unlock(&iommu->lock); + + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); + + return 0; +} + /* * Set the page snoop control 
for a pasid entry which has been set up. */ @@ -553,26 +731,50 @@ void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, did = pasid_get_domain_id(pte); spin_unlock(&iommu->lock); - if (!ecap_coherent(iommu->ecap)) - clflush_cache_range(pte, sizeof(*pte)); + intel_pasid_flush_present(iommu, dev, pasid, did, pte); +} - /* - * VT-d spec 3.4 table23 states guides for cache invalidation: - * - * - PASID-selective-within-Domain PASID-cache invalidation - * - PASID-selective PASID-based IOTLB invalidation - * - If (pasid is RID_PASID) - * - Global Device-TLB invalidation to affected functions - * Else - * - PASID-based Device-TLB invalidation (with S=1 and - * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions - */ - pasid_cache_invalidation_with_pasid(iommu, did, pasid); - qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); +static void pasid_pte_config_nestd(struct intel_iommu *iommu, + struct pasid_entry *pte, + struct iommu_hwpt_vtd_s1 *s1_cfg, + struct dmar_domain *s2_domain, + u16 did) +{ + struct pt_iommu_vtdss_hw_info pt_info; + + lockdep_assert_held(&iommu->lock); + + pt_iommu_vtdss_hw_info(&s2_domain->sspt, &pt_info); + + pasid_clear_entry(pte); - /* Device IOTLB doesn't need to be flushed in caching mode. */ - if (!cap_caching_mode(iommu->cap)) - devtlb_invalidation_with_pasid(iommu, dev, pasid); + if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL) + pasid_set_flpm(pte, 1); + + pasid_set_flptr(pte, s1_cfg->pgtbl_addr); + + if (s1_cfg->flags & IOMMU_VTD_S1_SRE) { + pasid_set_sre(pte); + if (s1_cfg->flags & IOMMU_VTD_S1_WPE) + pasid_set_wpe(pte); + } + + if (s1_cfg->flags & IOMMU_VTD_S1_EAFE) + pasid_set_eafe(pte); + + if (s2_domain->force_snooping) + pasid_set_pgsnp(pte); + + pasid_set_slptr(pte, pt_info.ssptptr); + pasid_set_fault_enable(pte); + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, pt_info.aw); + pasid_set_page_snoop(pte, !(s2_domain->sspt.vtdss_pt.common.features & + BIT(PT_FEAT_DMA_INCOHERENT))); + if (s2_domain->dirty_tracking) + pasid_set_ssade(pte); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); + pasid_set_present(pte); } /** @@ -590,10 +792,8 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, u32 pasid, struct dmar_domain *domain) { struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; - pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl; struct dmar_domain *s2_domain = domain->s2_domain; u16 did = domain_id_iommu(domain, iommu); - struct dma_pte *pgd = s2_domain->pgd; struct pasid_entry *pte; /* Address width should match the address width supported by hardware */ @@ -636,37 +836,73 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, return -EBUSY; } - pasid_clear_entry(pte); + pasid_pte_config_nestd(iommu, pte, s1_cfg, s2_domain, did); + spin_unlock(&iommu->lock); - if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL) - pasid_set_flpm(pte, 1); + pasid_flush_caches(iommu, pte, pasid, did); - pasid_set_flptr(pte, (uintptr_t)s1_gpgd); + return 0; +} - if (s1_cfg->flags & IOMMU_VTD_S1_SRE) { - pasid_set_sre(pte); - if (s1_cfg->flags & IOMMU_VTD_S1_WPE) - pasid_set_wpe(pte); +int intel_pasid_replace_nested(struct intel_iommu *iommu, + struct device *dev, u32 pasid, + u16 old_did, struct dmar_domain *domain) +{ + struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; + struct dmar_domain *s2_domain = domain->s2_domain; + u16 did = domain_id_iommu(domain, iommu); + struct pasid_entry *pte, new_pte; + + /* Address width should match the address width supported by hardware */ + switch 
(s1_cfg->addr_width) { + case ADDR_WIDTH_4LEVEL: + break; + case ADDR_WIDTH_5LEVEL: + if (!cap_fl5lp_support(iommu->cap)) { + dev_err_ratelimited(dev, + "5-level paging not supported\n"); + return -EINVAL; + } + break; + default: + dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n", + s1_cfg->addr_width); + return -EINVAL; } - if (s1_cfg->flags & IOMMU_VTD_S1_EAFE) - pasid_set_eafe(pte); + if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) { + pr_err_ratelimited("No supervisor request support on %s\n", + iommu->name); + return -EINVAL; + } - if (s2_domain->force_snooping) - pasid_set_pgsnp(pte); + if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) { + pr_err_ratelimited("No extended access flag support on %s\n", + iommu->name); + return -EINVAL; + } - pasid_set_slptr(pte, virt_to_phys(pgd)); - pasid_set_fault_enable(pte); - pasid_set_domain_id(pte, did); - pasid_set_address_width(pte, s2_domain->agaw); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); - if (s2_domain->dirty_tracking) - pasid_set_ssade(pte); - pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); - pasid_set_present(pte); + pasid_pte_config_nestd(iommu, &new_pte, s1_cfg, s2_domain, did); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; spin_unlock(&iommu->lock); - pasid_flush_caches(iommu, pte, pasid, did); + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); return 0; } @@ -681,6 +917,7 @@ static void device_pasid_table_teardown(struct device *dev, u8 bus, u8 devfn) struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; struct context_entry *context; + u16 did; spin_lock(&iommu->lock); context = iommu_context_addr(iommu, bus, devfn, false); @@ -689,28 +926,11 @@ static void device_pasid_table_teardown(struct device *dev, u8 bus, u8 devfn) return; } + did = context_domain_id(context); context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); spin_unlock(&iommu->lock); - - /* - * Cache invalidation for changes to a scalable-mode context table - * entry. - * - * Section 6.5.3.3 of the VT-d spec: - * - Device-selective context-cache invalidation; - * - Domain-selective PASID-cache invalidation to affected domains - * (can be skipped if all PASID entries were not-present); - * - Domain-selective IOTLB invalidation to affected domains; - * - Global Device-TLB invalidation to affected functions. - * - * The iommu has been parked in the blocking state. All domains have - * been detached from the device or PASID. The PASID and IOTLB caches - * have been invalidated during the domain detach path. 
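 * (Editor's note: this invalidation recipe is not lost -- it moves into
 * the intel_context_flush_no_pasid() helper added further down in this
 * file, which the teardown path now calls with the saved domain ID.)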
- */ - iommu->flush.flush_context(iommu, 0, PCI_DEVID(bus, devfn), - DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); - devtlb_invalidation_with_pasid(iommu, dev, IOMMU_NO_PASID); + intel_context_flush_no_pasid(info, context, did); } static int pci_pasid_table_teardown(struct pci_dev *pdev, u16 alias, void *data) @@ -768,10 +988,10 @@ static int context_entry_set_pasid_table(struct context_entry *context, if (info->ats_supported) context_set_sm_dte(context); - if (info->pri_supported) - context_set_sm_pre(context); if (info->pasid_supported) context_set_pasid(context); + if (info->pri_supported) + context_set_sm_pre(context); context_set_fault_enable(context); context_set_present(context); @@ -872,3 +1092,61 @@ int intel_pasid_setup_sm_context(struct device *dev) return pci_for_each_dma_alias(to_pci_dev(dev), pci_pasid_table_setup, dev); } + +/* + * Global Device-TLB invalidation following changes in a context entry which + * was present. + */ +static void __context_flush_dev_iotlb(struct device_domain_info *info) +{ + if (!info->ats_enabled) + return; + + qi_flush_dev_iotlb(info->iommu, PCI_DEVID(info->bus, info->devfn), + info->pfsid, info->ats_qdep, 0, MAX_AGAW_PFN_WIDTH); + + /* + * There is no guarantee that the device DMA is stopped when it reaches + * here. Therefore, always attempt the extra device TLB invalidation + * quirk. The impact on performance is acceptable since this is not a + * performance-critical path. + */ + quirk_extra_dev_tlb_flush(info, 0, MAX_AGAW_PFN_WIDTH, IOMMU_NO_PASID, + info->ats_qdep); +} + +/* + * Cache invalidations after change in a context table entry that was present + * according to the Spec 6.5.3.3 (Guidance to Software for Invalidations). + * This helper can only be used when IOMMU is working in the legacy mode or + * IOMMU is in scalable mode but all PASID table entries of the device are + * non-present. + */ +void intel_context_flush_no_pasid(struct device_domain_info *info, + struct context_entry *context, u16 did) +{ + struct intel_iommu *iommu = info->iommu; + + /* + * Device-selective context-cache invalidation. The Domain-ID field + * of the Context-cache Invalidate Descriptor is ignored by hardware + * when operating in scalable mode. Therefore the @did value doesn't + * matter in scalable mode. + */ + iommu->flush.flush_context(iommu, did, PCI_DEVID(info->bus, info->devfn), + DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); + + /* + * For legacy mode: + * - Domain-selective IOTLB invalidation + * - Global Device-TLB invalidation to all affected functions + */ + if (!sm_supported(iommu)) { + iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); + __context_flush_dev_iotlb(info); + + return; + } + + __context_flush_dev_iotlb(info); +} diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index da9978fef7ac..b4c85242dc79 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -22,15 +22,9 @@ #define is_pasid_enabled(entry) (((entry)->lo >> 3) & 0x1) #define get_pasid_dir_size(entry) (1 << ((((entry)->lo >> 9) & 0x7) + 7)) -/* - * Domain ID reserved for pasid entries programmed for first-level - * only and pass-through transfer modes. 
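 * (Editor's note: these reserved-DID definitions are not dropped;
 * FLPT_DEFAULT_DID moves to iommu.h, where it is joined by the new
 * IDA_START_DID lower bound for the domain-id allocator.)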
- */ -#define FLPT_DEFAULT_DID 1 -#define NUM_RESERVED_DID 2 - #define PASID_FLAG_NESTED BIT(1) #define PASID_FLAG_PAGE_SNOOP BIT(2) +#define PASID_FLAG_PWSNP BIT(2) /* * The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first- @@ -54,7 +48,6 @@ struct pasid_entry { /* The representative of a PASID table */ struct pasid_table { void *table; /* pasid table pointer */ - int order; /* page order of pasid table */ u32 max_pasid; /* max pasid */ }; @@ -80,6 +73,12 @@ static inline bool pasid_pte_is_present(struct pasid_entry *pte) return READ_ONCE(pte->val[0]) & PASID_PTE_PRESENT; } +/* Get FPD(Fault Processing Disable) bit of a PASID table entry */ +static inline bool pasid_pte_is_fault_disabled(struct pasid_entry *pte) +{ + return READ_ONCE(pte->val[0]) & PASID_PTE_FPD; +} + /* Get PGTT field of a PASID table entry */ static inline u16 pasid_pte_get_pgtt(struct pasid_entry *pte) { @@ -248,16 +247,6 @@ static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value) } /* - * Setup No Execute Enable bit (Bit 133) of a scalable mode PASID - * entry. It is required when XD bit of the first level page table - * entry is about to be set. - */ -static inline void pasid_set_nxe(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[2], 1 << 5, 1 << 5); -} - -/* * Setup the Page Snoop (PGSNP) field (Bit 88) of a scalable mode * PASID entry. */ @@ -300,9 +289,9 @@ extern unsigned int intel_pasid_max_id; int intel_pasid_alloc_table(struct device *dev); void intel_pasid_free_table(struct device *dev); struct pasid_table *intel_pasid_get_table(struct device *dev); -int intel_pasid_setup_first_level(struct intel_iommu *iommu, - struct device *dev, pgd_t *pgd, - u32 pasid, u16 did, int flags); +int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev, + phys_addr_t fsptptr, u32 pasid, u16 did, + int flags); int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, u32 pasid); @@ -313,6 +302,20 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct device *dev, u32 pasid); int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, u32 pasid, struct dmar_domain *domain); +int intel_pasid_replace_first_level(struct intel_iommu *iommu, + struct device *dev, phys_addr_t fsptptr, + u32 pasid, u16 did, u16 old_did, int flags); +int intel_pasid_replace_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, u16 old_did, + u32 pasid); +int intel_pasid_replace_pass_through(struct intel_iommu *iommu, + struct device *dev, u16 old_did, + u32 pasid); +int intel_pasid_replace_nested(struct intel_iommu *iommu, + struct device *dev, u32 pasid, + u16 old_did, struct dmar_domain *domain); + void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, u32 pasid, bool fault_ignore); diff --git a/drivers/iommu/intel/perf.c b/drivers/iommu/intel/perf.c index adc4de6bbd88..dceeadc3ee7c 100644 --- a/drivers/iommu/intel/perf.c +++ b/drivers/iommu/intel/perf.c @@ -113,7 +113,7 @@ static char *latency_type_names[] = { " svm_prq" }; -int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) +void dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) { struct latency_statistic *lstat = iommu->perf_statistic; unsigned long flags; @@ -122,7 +122,7 @@ int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) memset(str, 0, size); for (i = 0; i < COUNTS_NUM; i++) - bytes += snprintf(str + bytes, size - 
bytes, + bytes += scnprintf(str + bytes, size - bytes, "%s", latency_counter_names[i]); spin_lock_irqsave(&latency_lock, flags); @@ -130,7 +130,7 @@ int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) if (!dmar_latency_enabled(iommu, i)) continue; - bytes += snprintf(str + bytes, size - bytes, + bytes += scnprintf(str + bytes, size - bytes, "\n%s", latency_type_names[i]); for (j = 0; j < COUNTS_NUM; j++) { @@ -156,11 +156,9 @@ int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) break; } - bytes += snprintf(str + bytes, size - bytes, + bytes += scnprintf(str + bytes, size - bytes, "%12lld", val); } } spin_unlock_irqrestore(&latency_lock, flags); - - return bytes; } diff --git a/drivers/iommu/intel/perf.h b/drivers/iommu/intel/perf.h index df9a36942d64..1d4baad7e852 100644 --- a/drivers/iommu/intel/perf.h +++ b/drivers/iommu/intel/perf.h @@ -40,7 +40,7 @@ void dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type); bool dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type); void dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 latency); -int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size); +void dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size); #else static inline int dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type) @@ -64,9 +64,8 @@ dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 laten { } -static inline int +static inline void dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) { - return 0; } #endif /* CONFIG_DMAR_PERF */ diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c index 44083d01852d..75f493bcb353 100644 --- a/drivers/iommu/intel/perfmon.c +++ b/drivers/iommu/intel/perfmon.c @@ -34,28 +34,9 @@ static struct attribute_group iommu_pmu_events_attr_group = { .attrs = attrs_empty, }; -static cpumask_t iommu_pmu_cpu_mask; - -static ssize_t -cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - return cpumap_print_to_pagebuf(true, buf, &iommu_pmu_cpu_mask); -} -static DEVICE_ATTR_RO(cpumask); - -static struct attribute *iommu_pmu_cpumask_attrs[] = { - &dev_attr_cpumask.attr, - NULL -}; - -static struct attribute_group iommu_pmu_cpumask_attr_group = { - .attrs = iommu_pmu_cpumask_attrs, -}; - static const struct attribute_group *iommu_pmu_attr_groups[] = { &iommu_pmu_format_attr_group, &iommu_pmu_events_attr_group, - &iommu_pmu_cpumask_attr_group, NULL }; @@ -565,6 +546,7 @@ static int __iommu_pmu_register(struct intel_iommu *iommu) iommu_pmu->pmu.attr_groups = iommu_pmu_attr_groups; iommu_pmu->pmu.attr_update = iommu_pmu_attr_update; iommu_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; + iommu_pmu->pmu.scope = PERF_PMU_SCOPE_SYS_WIDE; iommu_pmu->pmu.module = THIS_MODULE; return perf_pmu_register(&iommu_pmu->pmu, iommu_pmu->pmu.name, -1); @@ -773,89 +755,6 @@ static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu) iommu->perf_irq = 0; } -static int iommu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node) -{ - struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node); - - if (cpumask_empty(&iommu_pmu_cpu_mask)) - cpumask_set_cpu(cpu, &iommu_pmu_cpu_mask); - - if (cpumask_test_cpu(cpu, &iommu_pmu_cpu_mask)) - iommu_pmu->cpu = cpu; - - return 0; -} - -static int iommu_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node) -{ - struct iommu_pmu *iommu_pmu = 
hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node); - int target = cpumask_first(&iommu_pmu_cpu_mask); - - /* - * The iommu_pmu_cpu_mask has been updated when offline the CPU - * for the first iommu_pmu. Migrate the other iommu_pmu to the - * new target. - */ - if (target < nr_cpu_ids && target != iommu_pmu->cpu) { - perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target); - iommu_pmu->cpu = target; - return 0; - } - - if (!cpumask_test_and_clear_cpu(cpu, &iommu_pmu_cpu_mask)) - return 0; - - target = cpumask_any_but(cpu_online_mask, cpu); - - if (target < nr_cpu_ids) - cpumask_set_cpu(target, &iommu_pmu_cpu_mask); - else - return 0; - - perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target); - iommu_pmu->cpu = target; - - return 0; -} - -static int nr_iommu_pmu; -static enum cpuhp_state iommu_cpuhp_slot; - -static int iommu_pmu_cpuhp_setup(struct iommu_pmu *iommu_pmu) -{ - int ret; - - if (!nr_iommu_pmu) { - ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, - "driver/iommu/intel/perfmon:online", - iommu_pmu_cpu_online, - iommu_pmu_cpu_offline); - if (ret < 0) - return ret; - iommu_cpuhp_slot = ret; - } - - ret = cpuhp_state_add_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node); - if (ret) { - if (!nr_iommu_pmu) - cpuhp_remove_multi_state(iommu_cpuhp_slot); - return ret; - } - nr_iommu_pmu++; - - return 0; -} - -static void iommu_pmu_cpuhp_free(struct iommu_pmu *iommu_pmu) -{ - cpuhp_state_remove_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node); - - if (--nr_iommu_pmu) - return; - - cpuhp_remove_multi_state(iommu_cpuhp_slot); -} - void iommu_pmu_register(struct intel_iommu *iommu) { struct iommu_pmu *iommu_pmu = iommu->pmu; @@ -866,17 +765,12 @@ void iommu_pmu_register(struct intel_iommu *iommu) if (__iommu_pmu_register(iommu)) goto err; - if (iommu_pmu_cpuhp_setup(iommu_pmu)) - goto unregister; - /* Set interrupt for overflow */ if (iommu_pmu_set_interrupt(iommu)) - goto cpuhp_free; + goto unregister; return; -cpuhp_free: - iommu_pmu_cpuhp_free(iommu_pmu); unregister: perf_pmu_unregister(&iommu_pmu->pmu); err: @@ -892,6 +786,5 @@ void iommu_pmu_unregister(struct intel_iommu *iommu) return; iommu_pmu_unset_interrupt(iommu); - iommu_pmu_cpuhp_free(iommu_pmu); perf_pmu_unregister(&iommu_pmu->pmu); } diff --git a/drivers/iommu/intel/prq.c b/drivers/iommu/intel/prq.c new file mode 100644 index 000000000000..ff63c228e6e1 --- /dev/null +++ b/drivers/iommu/intel/prq.c @@ -0,0 +1,396 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 Intel Corporation + * + * Originally split from drivers/iommu/intel/svm.c + */ + +#include <linux/pci.h> +#include <linux/pci-ats.h> + +#include "iommu.h" +#include "pasid.h" +#include "../iommu-pages.h" +#include "trace.h" + +/* Page request queue descriptor */ +struct page_req_dsc { + union { + struct { + u64 type:8; + u64 pasid_present:1; + u64 rsvd:7; + u64 rid:16; + u64 pasid:20; + u64 exe_req:1; + u64 pm_req:1; + u64 rsvd2:10; + }; + u64 qw_0; + }; + union { + struct { + u64 rd_req:1; + u64 wr_req:1; + u64 lpig:1; + u64 prg_index:9; + u64 addr:52; + }; + u64 qw_1; + }; + u64 qw_2; + u64 qw_3; +}; + +/** + * intel_iommu_drain_pasid_prq - Drain page requests and responses for a pasid + * @dev: target device + * @pasid: pasid for draining + * + * Drain all pending page requests and responses related to @pasid in both + * software and hardware. This is supposed to be called after the device + * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB + * and DevTLB have been invalidated. 
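
(Illustration, not part of the patch: the bitfield layout of struct page_req_dsc above can be decoded by hand from the two raw qwords. Field offsets follow the structure definition; prq_decode() and struct prq_decoded are hypothetical names.)

    #include <stdint.h>

    /* Decode a VT-d page request from its first two qwords (sketch). */
    struct prq_decoded {
        uint16_t rid;          /* qw_0 bits 16..31: requester ID */
        uint32_t pasid;        /* qw_0 bits 32..51: 20-bit PASID */
        uint64_t addr;         /* qw_1 bits 12..63: page-aligned fault address */
        int pasid_present;     /* qw_0 bit 8 */
        int lpig;              /* qw_1 bit 2: last request in group */
    };

    static struct prq_decoded prq_decode(uint64_t qw0, uint64_t qw1)
    {
        struct prq_decoded d;

        d.pasid_present = (qw0 >> 8) & 0x1;
        d.rid   = (qw0 >> 16) & 0xffff;
        d.pasid = (qw0 >> 32) & 0xfffff;
        d.lpig  = (qw1 >> 2) & 0x1;
        d.addr  = qw1 & ~0xfffull;  /* addr:52 starts at bit 12 == VTD_PAGE_SHIFT */
        return d;
    }
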
+ * + * It waits until all pending page requests for @pasid in the page fault + * queue are completed by the prq handling thread. Then follow the steps + * described in VT-d spec CH7.10 to drain all page requests and page + * responses pending in the hardware. + */ +void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid) +{ + struct device_domain_info *info; + struct dmar_domain *domain; + struct intel_iommu *iommu; + struct qi_desc desc[3]; + int head, tail; + u16 sid, did; + + info = dev_iommu_priv_get(dev); + if (!info->iopf_refcount) + return; + + iommu = info->iommu; + domain = info->domain; + sid = PCI_DEVID(info->bus, info->devfn); + did = domain ? domain_id_iommu(domain, iommu) : FLPT_DEFAULT_DID; + + /* + * Check and wait until all pending page requests in the queue are + * handled by the prq handling thread. + */ +prq_retry: + reinit_completion(&iommu->prq_complete); + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + while (head != tail) { + struct page_req_dsc *req; + + req = &iommu->prq[head / sizeof(*req)]; + if (req->rid != sid || + (req->pasid_present && pasid != req->pasid) || + (!req->pasid_present && pasid != IOMMU_NO_PASID)) { + head = (head + sizeof(*req)) & PRQ_RING_MASK; + continue; + } + + wait_for_completion(&iommu->prq_complete); + goto prq_retry; + } + + iopf_queue_flush_dev(dev); + + /* + * Perform steps described in VT-d spec CH7.10 to drain page + * requests and responses in hardware. + */ + memset(desc, 0, sizeof(desc)); + desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) | + QI_IWD_FENCE | + QI_IWD_TYPE; + if (pasid == IOMMU_NO_PASID) { + qi_desc_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH, &desc[1]); + qi_desc_dev_iotlb(sid, info->pfsid, info->ats_qdep, 0, + MAX_AGAW_PFN_WIDTH, &desc[2]); + } else { + qi_desc_piotlb(did, pasid, 0, -1, 0, &desc[1]); + qi_desc_dev_iotlb_pasid(sid, info->pfsid, pasid, info->ats_qdep, + 0, MAX_AGAW_PFN_WIDTH, &desc[2]); + } +qi_retry: + reinit_completion(&iommu->prq_complete); + qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN); + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { + wait_for_completion(&iommu->prq_complete); + goto qi_retry; + } +} + +static bool is_canonical_address(u64 addr) +{ + int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); + long saddr = (long)addr; + + return (((saddr << shift) >> shift) == saddr); +} + +static void handle_bad_prq_event(struct intel_iommu *iommu, + struct page_req_dsc *req, int result) +{ + struct qi_desc desc = { }; + + pr_err("%s: Invalid page request: %08llx %08llx\n", + iommu->name, ((unsigned long long *)req)[0], + ((unsigned long long *)req)[1]); + + if (!req->lpig) + return; + + desc.qw0 = QI_PGRP_PASID(req->pasid) | + QI_PGRP_DID(req->rid) | + QI_PGRP_PASID_P(req->pasid_present) | + QI_PGRP_RESP_CODE(result) | + QI_PGRP_RESP_TYPE; + desc.qw1 = QI_PGRP_IDX(req->prg_index); + + qi_submit_sync(iommu, &desc, 1, 0); +} + +static int prq_to_iommu_prot(struct page_req_dsc *req) +{ + int prot = 0; + + if (req->rd_req) + prot |= IOMMU_FAULT_PERM_READ; + if (req->wr_req) + prot |= IOMMU_FAULT_PERM_WRITE; + if (req->exe_req) + prot |= IOMMU_FAULT_PERM_EXEC; + if (req->pm_req) + prot |= IOMMU_FAULT_PERM_PRIV; + + return prot; +} + +static void intel_prq_report(struct intel_iommu *iommu, struct device *dev, + struct page_req_dsc *desc) +{ + struct iopf_fault event = { }; + + /* Fill in event data for device specific processing */ + event.fault.type = IOMMU_FAULT_PAGE_REQ; + event.fault.prm.addr = (u64)desc->addr << 
VTD_PAGE_SHIFT; + event.fault.prm.pasid = desc->pasid; + event.fault.prm.grpid = desc->prg_index; + event.fault.prm.perm = prq_to_iommu_prot(desc); + + if (desc->lpig) + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; + if (desc->pasid_present) { + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; + } + + iommu_report_device_fault(dev, &event); +} + +static irqreturn_t prq_event_thread(int irq, void *d) +{ + struct intel_iommu *iommu = d; + struct page_req_dsc *req; + int head, tail, handled; + struct device *dev; + u64 address; + + /* + * Clear PPR bit before reading head/tail registers, to ensure that + * we get a new interrupt if needed. + */ + writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG); + + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + handled = (head != tail); + while (head != tail) { + req = &iommu->prq[head / sizeof(*req)]; + address = (u64)req->addr << VTD_PAGE_SHIFT; + + if (unlikely(!is_canonical_address(address))) { + pr_err("IOMMU: %s: Address is not canonical\n", + iommu->name); +bad_req: + handle_bad_prq_event(iommu, req, QI_RESP_INVALID); + goto prq_advance; + } + + if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) { + pr_err("IOMMU: %s: Page request in Privilege Mode\n", + iommu->name); + goto bad_req; + } + + if (unlikely(req->exe_req && req->rd_req)) { + pr_err("IOMMU: %s: Execution request not supported\n", + iommu->name); + goto bad_req; + } + + /* Drop Stop Marker message. No need for a response. */ + if (unlikely(req->lpig && !req->rd_req && !req->wr_req)) + goto prq_advance; + + /* + * If prq is to be handled outside iommu driver via receiver of + * the fault notifiers, we skip the page response here. + */ + mutex_lock(&iommu->iopf_lock); + dev = device_rbtree_find(iommu, req->rid); + if (!dev) { + mutex_unlock(&iommu->iopf_lock); + goto bad_req; + } + + intel_prq_report(iommu, dev, req); + trace_prq_report(iommu, dev, req->qw_0, req->qw_1, + req->qw_2, req->qw_3, + iommu->prq_seq_number++); + mutex_unlock(&iommu->iopf_lock); +prq_advance: + head = (head + sizeof(*req)) & PRQ_RING_MASK; + } + + dmar_writeq(iommu->reg + DMAR_PQH_REG, tail); + + /* + * Clear the page request overflow bit and wake up all threads that + * are waiting for the completion of this handling. 
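
(Illustration, not part of the patch: the head/tail arithmetic above is a standard power-of-two byte ring. RING_BYTES, struct dsc and handle() are stand-ins for the real PRQ size, struct page_req_dsc and the per-request work.)

    #include <stdint.h>
    #include <stddef.h>

    #define RING_BYTES 4096u                /* power of two, like the real PRQ */
    #define RING_MASK  (RING_BYTES - 1)

    struct dsc { uint64_t qw[4]; };         /* 32 bytes, like page_req_dsc */

    /* Visit every descriptor between the software head and the hardware tail. */
    static unsigned int ring_walk(struct dsc *ring, unsigned int head,
                                  unsigned int tail, void (*handle)(struct dsc *))
    {
        while (head != tail) {
            handle(&ring[head / sizeof(struct dsc)]);
            head = (head + sizeof(struct dsc)) & RING_MASK;
        }
        return head;    /* the caller writes this back to the head register */
    }
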
+ */ + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { + pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n", + iommu->name); + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + if (head == tail) { + iopf_queue_discard_partial(iommu->iopf_queue); + writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); + pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared", + iommu->name); + } + } + + if (!completion_done(&iommu->prq_complete)) + complete(&iommu->prq_complete); + + return IRQ_RETVAL(handled); +} + +int intel_iommu_enable_prq(struct intel_iommu *iommu) +{ + struct iopf_queue *iopfq; + int irq, ret; + + iommu->prq = + iommu_alloc_pages_node_sz(iommu->node, GFP_KERNEL, PRQ_SIZE); + if (!iommu->prq) { + pr_warn("IOMMU: %s: Failed to allocate page request queue\n", + iommu->name); + return -ENOMEM; + } + + irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu); + if (irq <= 0) { + pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n", + iommu->name); + ret = -EINVAL; + goto free_prq; + } + iommu->pr_irq = irq; + + snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name), + "dmar%d-iopfq", iommu->seq_id); + iopfq = iopf_queue_alloc(iommu->iopfq_name); + if (!iopfq) { + pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name); + ret = -ENOMEM; + goto free_hwirq; + } + iommu->iopf_queue = iopfq; + + snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id); + + ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT, + iommu->prq_name, iommu); + if (ret) { + pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n", + iommu->name); + goto free_iopfq; + } + dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); + + init_completion(&iommu->prq_complete); + + return 0; + +free_iopfq: + iopf_queue_free(iommu->iopf_queue); + iommu->iopf_queue = NULL; +free_hwirq: + dmar_free_hwirq(irq); + iommu->pr_irq = 0; +free_prq: + iommu_free_pages(iommu->prq); + iommu->prq = NULL; + + return ret; +} + +int intel_iommu_finish_prq(struct intel_iommu *iommu) +{ + dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL); + + if (iommu->pr_irq) { + free_irq(iommu->pr_irq, iommu); + dmar_free_hwirq(iommu->pr_irq); + iommu->pr_irq = 0; + } + + if (iommu->iopf_queue) { + iopf_queue_free(iommu->iopf_queue); + iommu->iopf_queue = NULL; + } + + iommu_free_pages(iommu->prq); + iommu->prq = NULL; + + return 0; +} + +void intel_iommu_page_response(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + u8 bus = info->bus, devfn = info->devfn; + struct iommu_fault_page_request *prm; + struct qi_desc desc; + bool pasid_present; + u16 sid; + + prm = &evt->fault.prm; + sid = PCI_DEVID(bus, devfn); + pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + + desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) | + QI_PGRP_PASID_P(pasid_present) | + QI_PGRP_RESP_CODE(msg->code) | + QI_PGRP_RESP_TYPE; + desc.qw1 = QI_PGRP_IDX(prm->grpid); + desc.qw2 = 0; + desc.qw3 = 0; + + qi_submit_sync(iommu, &desc, 1, 0); +} diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 0e3a9b38bef2..71de7947971f 
100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -25,92 +25,6 @@ #include "../iommu-pages.h" #include "trace.h" -static irqreturn_t prq_event_thread(int irq, void *d); - -int intel_svm_enable_prq(struct intel_iommu *iommu) -{ - struct iopf_queue *iopfq; - int irq, ret; - - iommu->prq = iommu_alloc_pages_node(iommu->node, GFP_KERNEL, PRQ_ORDER); - if (!iommu->prq) { - pr_warn("IOMMU: %s: Failed to allocate page request queue\n", - iommu->name); - return -ENOMEM; - } - - irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu); - if (irq <= 0) { - pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n", - iommu->name); - ret = -EINVAL; - goto free_prq; - } - iommu->pr_irq = irq; - - snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name), - "dmar%d-iopfq", iommu->seq_id); - iopfq = iopf_queue_alloc(iommu->iopfq_name); - if (!iopfq) { - pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name); - ret = -ENOMEM; - goto free_hwirq; - } - iommu->iopf_queue = iopfq; - - snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id); - - ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT, - iommu->prq_name, iommu); - if (ret) { - pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n", - iommu->name); - goto free_iopfq; - } - dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); - - init_completion(&iommu->prq_complete); - - return 0; - -free_iopfq: - iopf_queue_free(iommu->iopf_queue); - iommu->iopf_queue = NULL; -free_hwirq: - dmar_free_hwirq(irq); - iommu->pr_irq = 0; -free_prq: - iommu_free_pages(iommu->prq, PRQ_ORDER); - iommu->prq = NULL; - - return ret; -} - -int intel_svm_finish_prq(struct intel_iommu *iommu) -{ - dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL); - - if (iommu->pr_irq) { - free_irq(iommu->pr_irq, iommu); - dmar_free_hwirq(iommu->pr_irq); - iommu->pr_irq = 0; - } - - if (iommu->iopf_queue) { - iopf_queue_free(iommu->iopf_queue); - iommu->iopf_queue = NULL; - } - - iommu_free_pages(iommu->prq, PRQ_ORDER); - iommu->prq = NULL; - - return 0; -} - void intel_svm_check(struct intel_iommu *iommu) { if (!pasid_supported(iommu)) @@ -184,7 +98,10 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) static void intel_mm_free_notifier(struct mmu_notifier *mn) { - kfree(container_of(mn, struct dmar_domain, notifier)); + struct dmar_domain *domain = container_of(mn, struct dmar_domain, notifier); + + kfree(domain->qi_batch); + kfree(domain); } static const struct mmu_notifier_ops intel_mmuops = { @@ -193,359 +110,81 @@ static const struct mmu_notifier_ops intel_mmuops = { .free_notifier = intel_mm_free_notifier, }; -static int intel_svm_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) +static int intel_iommu_sva_supported(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); - struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct intel_iommu *iommu = info->iommu; - struct mm_struct *mm = domain->mm; - struct dev_pasid_info *dev_pasid; - unsigned long sflags; - unsigned long flags; - int ret = 0; - - dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); - if (!dev_pasid) - return -ENOMEM; - - dev_pasid->dev = dev; - dev_pasid->pasid = pasid; - - ret = 
cache_tag_assign_domain(to_dmar_domain(domain), dev, pasid); - if (ret) - goto free_dev_pasid; - - /* Setup the pasid table: */ - sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; - ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, pasid, - FLPT_DEFAULT_DID, sflags); - if (ret) - goto unassign_tag; - - spin_lock_irqsave(&dmar_domain->lock, flags); - list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); - spin_unlock_irqrestore(&dmar_domain->lock, flags); - - return 0; - -unassign_tag: - cache_tag_unassign_domain(to_dmar_domain(domain), dev, pasid); -free_dev_pasid: - kfree(dev_pasid); - - return ret; -} - -/* Page request queue descriptor */ -struct page_req_dsc { - union { - struct { - u64 type:8; - u64 pasid_present:1; - u64 rsvd:7; - u64 rid:16; - u64 pasid:20; - u64 exe_req:1; - u64 pm_req:1; - u64 rsvd2:10; - }; - u64 qw_0; - }; - union { - struct { - u64 rd_req:1; - u64 wr_req:1; - u64 lpig:1; - u64 prg_index:9; - u64 addr:52; - }; - u64 qw_1; - }; - u64 qw_2; - u64 qw_3; -}; - -static bool is_canonical_address(u64 addr) -{ - int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); - long saddr = (long) addr; - - return (((saddr << shift) >> shift) == saddr); -} - -/** - * intel_drain_pasid_prq - Drain page requests and responses for a pasid - * @dev: target device - * @pasid: pasid for draining - * - * Drain all pending page requests and responses related to @pasid in both - * software and hardware. This is supposed to be called after the device - * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB - * and DevTLB have been invalidated. - * - * It waits until all pending page requests for @pasid in the page fault - * queue are completed by the prq handling thread. Then follow the steps - * described in VT-d spec CH7.10 to drain all page requests and page - * responses pending in the hardware. - */ -void intel_drain_pasid_prq(struct device *dev, u32 pasid) -{ - struct device_domain_info *info; - struct dmar_domain *domain; struct intel_iommu *iommu; - struct qi_desc desc[3]; - struct pci_dev *pdev; - int head, tail; - u16 sid, did; - int qdep; - - info = dev_iommu_priv_get(dev); - if (WARN_ON(!info || !dev_is_pci(dev))) - return; - if (!info->pri_enabled) - return; + if (!info || dmar_disabled) + return -EINVAL; iommu = info->iommu; - domain = info->domain; - pdev = to_pci_dev(dev); - sid = PCI_DEVID(info->bus, info->devfn); - did = domain_id_iommu(domain, iommu); - qdep = pci_ats_queue_depth(pdev); + if (!iommu) + return -EINVAL; - /* - * Check and wait until all pending page requests in the queue are - * handled by the prq handling thread. - */ -prq_retry: - reinit_completion(&iommu->prq_complete); - tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; - head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; - while (head != tail) { - struct page_req_dsc *req; - - req = &iommu->prq[head / sizeof(*req)]; - if (!req->pasid_present || req->pasid != pasid) { - head = (head + sizeof(*req)) & PRQ_RING_MASK; - continue; - } - - wait_for_completion(&iommu->prq_complete); - goto prq_retry; - } + if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) + return -ENODEV; - iopf_queue_flush_dev(dev); + if (!info->pasid_enabled || !info->ats_enabled) + return -EINVAL; /* - * Perform steps described in VT-d spec CH7.10 to drain page - * requests and responses in hardware. + * Devices having device-specific I/O fault handling should not + * support PCI/PRI. The IOMMU side has no means to check the + * capability of device-specific IOPF. 
Therefore, IOMMU can only + * default that if the device driver enables SVA on a non-PRI + * device, it will handle IOPF in its own way. */ - memset(desc, 0, sizeof(desc)); - desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) | - QI_IWD_FENCE | - QI_IWD_TYPE; - desc[1].qw0 = QI_EIOTLB_PASID(pasid) | - QI_EIOTLB_DID(did) | - QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | - QI_EIOTLB_TYPE; - desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) | - QI_DEV_EIOTLB_SID(sid) | - QI_DEV_EIOTLB_QDEP(qdep) | - QI_DEIOTLB_TYPE | - QI_DEV_IOTLB_PFSID(info->pfsid); -qi_retry: - reinit_completion(&iommu->prq_complete); - qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN); - if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { - wait_for_completion(&iommu->prq_complete); - goto qi_retry; - } -} - -static int prq_to_iommu_prot(struct page_req_dsc *req) -{ - int prot = 0; - - if (req->rd_req) - prot |= IOMMU_FAULT_PERM_READ; - if (req->wr_req) - prot |= IOMMU_FAULT_PERM_WRITE; - if (req->exe_req) - prot |= IOMMU_FAULT_PERM_EXEC; - if (req->pm_req) - prot |= IOMMU_FAULT_PERM_PRIV; - - return prot; -} - -static void intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, - struct page_req_dsc *desc) -{ - struct iopf_fault event = { }; - - /* Fill in event data for device specific processing */ - event.fault.type = IOMMU_FAULT_PAGE_REQ; - event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT; - event.fault.prm.pasid = desc->pasid; - event.fault.prm.grpid = desc->prg_index; - event.fault.prm.perm = prq_to_iommu_prot(desc); - - if (desc->lpig) - event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; - if (desc->pasid_present) { - event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; - } - - iommu_report_device_fault(dev, &event); -} - -static void handle_bad_prq_event(struct intel_iommu *iommu, - struct page_req_dsc *req, int result) -{ - struct qi_desc desc = { }; - - pr_err("%s: Invalid page request: %08llx %08llx\n", - iommu->name, ((unsigned long long *)req)[0], - ((unsigned long long *)req)[1]); - - if (!req->lpig) - return; + if (!info->pri_supported) + return 0; - desc.qw0 = QI_PGRP_PASID(req->pasid) | - QI_PGRP_DID(req->rid) | - QI_PGRP_PASID_P(req->pasid_present) | - QI_PGRP_RESP_CODE(result) | - QI_PGRP_RESP_TYPE; - desc.qw1 = QI_PGRP_IDX(req->prg_index) | - QI_PGRP_LPIG(req->lpig); + /* Devices supporting PRI should have it enabled. */ + if (!info->pri_enabled) + return -EINVAL; - qi_submit_sync(iommu, &desc, 1, 0); + return 0; } -static irqreturn_t prq_event_thread(int irq, void *d) +static int intel_svm_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { - struct intel_iommu *iommu = d; - struct page_req_dsc *req; - int head, tail, handled; - struct device *dev; - u64 address; + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct mm_struct *mm = domain->mm; + struct dev_pasid_info *dev_pasid; + unsigned long sflags; + int ret = 0; - /* - * Clear PPR bit before reading head/tail registers, to ensure that - * we get a new interrupt if needed. 
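
(Illustration, not part of the patch: the checks in intel_iommu_sva_supported() run underneath the generic SVA bind. A device driver consumes this path roughly as below; sva_bind_example() is hypothetical, while the iommu_sva_* calls are the core kernel API. Error handling is trimmed.)

    #include <linux/iommu.h>

    static int sva_bind_example(struct device *dev, struct mm_struct *mm)
    {
        struct iommu_sva *handle;
        u32 pasid;

        handle = iommu_sva_bind_device(dev, mm);   /* attaches an SVA domain */
        if (IS_ERR(handle))
            return PTR_ERR(handle);

        pasid = iommu_sva_get_pasid(handle);
        /* ... program @pasid into the device and issue tagged DMA on @mm ... */

        iommu_sva_unbind_device(handle);
        return 0;
    }
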
- */ - writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG); - - tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; - head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; - handled = (head != tail); - while (head != tail) { - req = &iommu->prq[head / sizeof(*req)]; - address = (u64)req->addr << VTD_PAGE_SHIFT; - - if (unlikely(!req->pasid_present)) { - pr_err("IOMMU: %s: Page request without PASID\n", - iommu->name); -bad_req: - handle_bad_prq_event(iommu, req, QI_RESP_INVALID); - goto prq_advance; - } - - if (unlikely(!is_canonical_address(address))) { - pr_err("IOMMU: %s: Address is not canonical\n", - iommu->name); - goto bad_req; - } - - if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) { - pr_err("IOMMU: %s: Page request in Privilege Mode\n", - iommu->name); - goto bad_req; - } - - if (unlikely(req->exe_req && req->rd_req)) { - pr_err("IOMMU: %s: Execution request not supported\n", - iommu->name); - goto bad_req; - } - - /* Drop Stop Marker message. No need for a response. */ - if (unlikely(req->lpig && !req->rd_req && !req->wr_req)) - goto prq_advance; - - /* - * If prq is to be handled outside iommu driver via receiver of - * the fault notifiers, we skip the page response here. - */ - mutex_lock(&iommu->iopf_lock); - dev = device_rbtree_find(iommu, req->rid); - if (!dev) { - mutex_unlock(&iommu->iopf_lock); - goto bad_req; - } - - intel_svm_prq_report(iommu, dev, req); - trace_prq_report(iommu, dev, req->qw_0, req->qw_1, - req->qw_2, req->qw_3, - iommu->prq_seq_number++); - mutex_unlock(&iommu->iopf_lock); -prq_advance: - head = (head + sizeof(*req)) & PRQ_RING_MASK; - } + ret = intel_iommu_sva_supported(dev); + if (ret) + return ret; - dmar_writeq(iommu->reg + DMAR_PQH_REG, tail); + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); + if (IS_ERR(dev_pasid)) + return PTR_ERR(dev_pasid); - /* - * Clear the page request overflow bit and wake up all threads that - * are waiting for the completion of this handling. - */ - if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { - pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n", - iommu->name); - head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; - tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; - if (head == tail) { - iopf_queue_discard_partial(iommu->iopf_queue); - writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); - pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared", - iommu->name); - } - } + ret = iopf_for_domain_replace(domain, old, dev); + if (ret) + goto out_remove_dev_pasid; - if (!completion_done(&iommu->prq_complete)) - complete(&iommu->prq_complete); + /* Setup the pasid table: */ + sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? 
PASID_FLAG_FL5LP : 0; + sflags |= PASID_FLAG_PWSNP; + ret = __domain_setup_first_level(iommu, dev, pasid, + FLPT_DEFAULT_DID, __pa(mm->pgd), + sflags, old); + if (ret) + goto out_unwind_iopf; - return IRQ_RETVAL(handled); -} + domain_remove_dev_pasid(old, dev, pasid); -void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, - struct iommu_page_response *msg) -{ - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu = info->iommu; - u8 bus = info->bus, devfn = info->devfn; - struct iommu_fault_page_request *prm; - struct qi_desc desc; - bool pasid_present; - bool last_page; - u16 sid; - - prm = &evt->fault.prm; - sid = PCI_DEVID(bus, devfn); - pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; - - desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) | - QI_PGRP_PASID_P(pasid_present) | - QI_PGRP_RESP_CODE(msg->code) | - QI_PGRP_RESP_TYPE; - desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page); - desc.qw2 = 0; - desc.qw3 = 0; - - qi_submit_sync(iommu, &desc, 1, 0); + return 0; +out_unwind_iopf: + iopf_for_domain_replace(old, domain, dev); +out_remove_dev_pasid: + domain_remove_dev_pasid(domain, dev, pasid); + return ret; } static void intel_svm_domain_free(struct iommu_domain *domain) @@ -567,12 +206,15 @@ struct iommu_domain *intel_svm_domain_alloc(struct device *dev, struct dmar_domain *domain; int ret; + ret = intel_iommu_sva_supported(dev); + if (ret) + return ERR_PTR(ret); + domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) return ERR_PTR(-ENOMEM); domain->domain.ops = &intel_svm_domain_ops; - domain->use_first_level = true; INIT_LIST_HEAD(&domain->dev_pasids); INIT_LIST_HEAD(&domain->cache_tags); spin_lock_init(&domain->cache_lock); diff --git a/drivers/iommu/intel/trace.h b/drivers/iommu/intel/trace.h index 9defdae6ebae..6311ba3f1691 100644 --- a/drivers/iommu/intel/trace.h +++ b/drivers/iommu/intel/trace.h @@ -130,11 +130,6 @@ DEFINE_EVENT(cache_tag_log, cache_tag_unassign, TP_ARGS(tag) ); -DEFINE_EVENT(cache_tag_log, cache_tag_flush_all, - TP_PROTO(struct cache_tag *tag), - TP_ARGS(tag) -); - DECLARE_EVENT_CLASS(cache_tag_flush, TP_PROTO(struct cache_tag *tag, unsigned long start, unsigned long end, unsigned long addr, unsigned long pages, unsigned long mask), diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 06d78fcc79fd..8b5926c1452e 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -59,30 +59,6 @@ void iopf_free_group(struct iopf_group *group) } EXPORT_SYMBOL_GPL(iopf_free_group); -static struct iommu_domain *get_domain_for_iopf(struct device *dev, - struct iommu_fault *fault) -{ - struct iommu_domain *domain; - - if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) { - domain = iommu_get_domain_for_dev_pasid(dev, fault->prm.pasid, 0); - if (IS_ERR(domain)) - domain = NULL; - } else { - domain = iommu_get_domain_for_dev(dev); - } - - if (!domain || !domain->iopf_handler) { - dev_warn_ratelimited(dev, - "iopf (pasid %d) without domain attached or handler installed\n", - fault->prm.pasid); - - return NULL; - } - - return domain; -} - /* Non-last request of a group. Postpone until the last one. 
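
(Illustration, not part of the patch: the postpone-until-last rule from the comment above, restated as standalone logic. queue_page_request() and handle_group() are hypothetical; the real code keeps partial faults on a per-device list and only the LAST_PAGE-flagged request triggers dispatch and, later, a single group response.)

    #include <linux/iommu.h>
    #include <linux/list.h>

    static int queue_page_request(struct list_head *partial, struct iopf_fault *f,
                                  int (*handle_group)(struct list_head *))
    {
        list_add(&f->list, partial);
        if (!(f->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE))
            return 0;                  /* postponed; no response needed yet */
        return handle_group(partial);  /* one response covers the whole group */
    }
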
*/ static int report_partial_fault(struct iommu_fault_param *fault_param, struct iommu_fault *fault) @@ -134,9 +110,64 @@ static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param, list_add(&group->pending_node, &iopf_param->faults); mutex_unlock(&iopf_param->lock); + group->fault_count = list_count_nodes(&group->faults); + return group; } +static struct iommu_attach_handle *find_fault_handler(struct device *dev, + struct iopf_fault *evt) +{ + struct iommu_fault *fault = &evt->fault; + struct iommu_attach_handle *attach_handle; + + if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) { + attach_handle = iommu_attach_handle_get(dev->iommu_group, + fault->prm.pasid, 0); + if (IS_ERR(attach_handle)) { + const struct iommu_ops *ops = dev_iommu_ops(dev); + + if (!ops->user_pasid_table) + return NULL; + /* + * The iommu driver for this device supports user- + * managed PASID table. Therefore page faults for + * any PASID should go through the NESTING domain + * attached to the device RID. + */ + attach_handle = iommu_attach_handle_get( + dev->iommu_group, IOMMU_NO_PASID, + IOMMU_DOMAIN_NESTED); + if (IS_ERR(attach_handle)) + return NULL; + } + } else { + attach_handle = iommu_attach_handle_get(dev->iommu_group, + IOMMU_NO_PASID, 0); + + if (IS_ERR(attach_handle)) + return NULL; + } + + if (!attach_handle->domain->iopf_handler) + return NULL; + + return attach_handle; +} + +static void iopf_error_response(struct device *dev, struct iopf_fault *evt) +{ + const struct iommu_ops *ops = dev_iommu_ops(dev); + struct iommu_fault *fault = &evt->fault; + struct iommu_page_response resp = { + .pasid = fault->prm.pasid, + .grpid = fault->prm.grpid, + .code = IOMMU_PAGE_RESP_INVALID + }; + + ops->page_response(dev, evt, &resp); +} + /** * iommu_report_device_fault() - Report fault event to device driver * @dev: the device @@ -175,23 +206,39 @@ static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param, * handling framework should guarantee that the iommu domain could only be * freed after the device has stopped generating page faults (or the iommu * hardware has been set to block the page faults) and the pending page faults - * have been flushed. + * have been flushed. In case no page fault handler is attached or no iopf params + * are setup, then the ops->page_response() is called to complete the evt. + * + * Returns 0 on success, or an error in case of a bad/failed iopf setup. 
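
(Illustration, not part of the patch: the caller-side contract, mirroring the Intel PRQ thread earlier in this diff. report_one_request() is hypothetical and the read-only permission is an arbitrary choice; on failure the core has already emitted an INVALID page response through ops->page_response().)

    #include <linux/iommu.h>

    static void report_one_request(struct device *dev, u64 addr, u32 pasid, bool last)
    {
        struct iopf_fault evt = {
            .fault = {
                .type = IOMMU_FAULT_PAGE_REQ,
                .prm = {
                    .addr  = addr,
                    .pasid = pasid,
                    .perm  = IOMMU_FAULT_PERM_READ,
                    .flags = IOMMU_FAULT_PAGE_REQUEST_PASID_VALID |
                             (last ? IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE : 0),
                },
            },
        };

        if (iommu_report_device_fault(dev, &evt))
            dev_dbg(dev, "page request not dispatched, error response sent\n");
    }
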
*/ -void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) +int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { + struct iommu_attach_handle *attach_handle; struct iommu_fault *fault = &evt->fault; struct iommu_fault_param *iopf_param; struct iopf_group abort_group = {}; struct iopf_group *group; + attach_handle = find_fault_handler(dev, evt); + if (!attach_handle) + goto err_bad_iopf; + + /* + * Something has gone wrong if a fault capable domain is attached but no + * iopf_param is setup + */ iopf_param = iopf_get_dev_fault_param(dev); if (WARN_ON(!iopf_param)) - return; + goto err_bad_iopf; if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { - report_partial_fault(iopf_param, fault); + int ret; + + ret = report_partial_fault(iopf_param, fault); iopf_put_dev_fault_param(iopf_param); /* A request that is not the last does not need to be ack'd */ + + return ret; } /* @@ -206,25 +253,33 @@ void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) if (group == &abort_group) goto err_abort; - group->domain = get_domain_for_iopf(dev, fault); - if (!group->domain) - goto err_abort; + group->attach_handle = attach_handle; /* * On success iopf_handler must call iopf_group_response() and * iopf_free_group() */ - if (group->domain->iopf_handler(group)) + if (group->attach_handle->domain->iopf_handler(group)) goto err_abort; - return; + return 0; err_abort: + dev_warn_ratelimited(dev, "iopf with pasid %d aborted\n", + fault->prm.pasid); iopf_group_response(group, IOMMU_PAGE_RESP_FAILURE); if (group == &abort_group) __iopf_free_group(group); else iopf_free_group(group); + + return 0; + +err_bad_iopf: + if (fault->type == IOMMU_FAULT_PAGE_REQ) + iopf_error_response(dev, evt); + + return -EINVAL; } EXPORT_SYMBOL_GPL(iommu_report_device_fault); @@ -423,6 +478,7 @@ void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) ops->page_response(dev, iopf, &resp); list_del_init(&group->pending_node); + iopf_free_group(group); } mutex_unlock(&fault_param->lock); diff --git a/drivers/iommu/io-pgtable-arm-selftests.c b/drivers/iommu/io-pgtable-arm-selftests.c new file mode 100644 index 000000000000..334e70350924 --- /dev/null +++ b/drivers/iommu/io-pgtable-arm-selftests.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * CPU-agnostic ARM page table allocator. 
+ * + * Copyright (C) 2014 ARM Limited + * + * Author: Will Deacon <will.deacon@arm.com> + */ + +#define pr_fmt(fmt) "arm-lpae io-pgtable: " fmt + +#include <kunit/device.h> +#include <kunit/test.h> +#include <linux/io-pgtable.h> +#include <linux/kernel.h> + +#include "io-pgtable-arm.h" + +static struct io_pgtable_cfg *cfg_cookie; + +static void dummy_tlb_flush_all(void *cookie) +{ + WARN_ON(cookie != cfg_cookie); +} + +static void dummy_tlb_flush(unsigned long iova, size_t size, + size_t granule, void *cookie) +{ + WARN_ON(cookie != cfg_cookie); + WARN_ON(!(size & cfg_cookie->pgsize_bitmap)); +} + +static void dummy_tlb_add_page(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t granule, + void *cookie) +{ + dummy_tlb_flush(iova, granule, granule, cookie); +} + +static const struct iommu_flush_ops dummy_tlb_ops = { + .tlb_flush_all = dummy_tlb_flush_all, + .tlb_flush_walk = dummy_tlb_flush, + .tlb_add_page = dummy_tlb_add_page, +}; + +#define __FAIL(test, i) ({ \ + KUNIT_FAIL(test, "test failed for fmt idx %d\n", (i)); \ + -EFAULT; \ +}) + +static int arm_lpae_run_tests(struct kunit *test, struct io_pgtable_cfg *cfg) +{ + static const enum io_pgtable_fmt fmts[] = { + ARM_64_LPAE_S1, + ARM_64_LPAE_S2, + }; + + int i, j; + unsigned long iova; + size_t size, mapped; + struct io_pgtable_ops *ops; + + for (i = 0; i < ARRAY_SIZE(fmts); ++i) { + cfg_cookie = cfg; + ops = alloc_io_pgtable_ops(fmts[i], cfg, cfg); + if (!ops) { + kunit_err(test, "failed to allocate io pgtable ops\n"); + return -ENOMEM; + } + + /* + * Initial sanity checks. + * Empty page tables shouldn't provide any translations. + */ + if (ops->iova_to_phys(ops, 42)) + return __FAIL(test, i); + + if (ops->iova_to_phys(ops, SZ_1G + 42)) + return __FAIL(test, i); + + if (ops->iova_to_phys(ops, SZ_2G + 42)) + return __FAIL(test, i); + + /* + * Distinct mappings of different granule sizes. + */ + iova = 0; + for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { + size = 1UL << j; + + if (ops->map_pages(ops, iova, iova, size, 1, + IOMMU_READ | IOMMU_WRITE | + IOMMU_NOEXEC | IOMMU_CACHE, + GFP_KERNEL, &mapped)) + return __FAIL(test, i); + + /* Overlapping mappings */ + if (!ops->map_pages(ops, iova, iova + size, size, 1, + IOMMU_READ | IOMMU_NOEXEC, + GFP_KERNEL, &mapped)) + return __FAIL(test, i); + + if (ops->iova_to_phys(ops, iova + 42) != (iova + 42)) + return __FAIL(test, i); + + iova += SZ_1G; + } + + /* Full unmap */ + iova = 0; + for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { + size = 1UL << j; + + if (ops->unmap_pages(ops, iova, size, 1, NULL) != size) + return __FAIL(test, i); + + if (ops->iova_to_phys(ops, iova + 42)) + return __FAIL(test, i); + + /* Remap full block */ + if (ops->map_pages(ops, iova, iova, size, 1, + IOMMU_WRITE, GFP_KERNEL, &mapped)) + return __FAIL(test, i); + + if (ops->iova_to_phys(ops, iova + 42) != (iova + 42)) + return __FAIL(test, i); + + iova += SZ_1G; + } + + /* + * Map/unmap the last largest supported page of the IAS; this can + * trigger corner cases in the concatenated page tables.
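
(Illustration, not part of the patch: the corner-case address spelled out. last_block_iova() is hypothetical and __builtin_clzll() stands in for the kernel's __fls(); for a 48-bit IAS and a 1 GiB largest block the result is 0xffffc0000000.)

    #include <stdint.h>

    static uint64_t last_block_iova(unsigned int ias, uint64_t pgsize_bitmap)
    {
        /* largest supported page: highest bit set in the bitmap */
        uint64_t size = 1ull << (63 - __builtin_clzll(pgsize_bitmap));

        return (1ull << ias) - size;   /* first byte of the final block */
    }
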
+ */ + mapped = 0; + size = 1UL << __fls(cfg->pgsize_bitmap); + iova = (1UL << cfg->ias) - size; + if (ops->map_pages(ops, iova, iova, size, 1, + IOMMU_READ | IOMMU_WRITE | + IOMMU_NOEXEC | IOMMU_CACHE, + GFP_KERNEL, &mapped)) + return __FAIL(test, i); + if (mapped != size) + return __FAIL(test, i); + if (ops->unmap_pages(ops, iova, size, 1, NULL) != size) + return __FAIL(test, i); + + free_io_pgtable_ops(ops); + } + + return 0; +} + +static void arm_lpae_do_selftests(struct kunit *test) +{ + static const unsigned long pgsize[] = { + SZ_4K | SZ_2M | SZ_1G, + SZ_16K | SZ_32M, + SZ_64K | SZ_512M, + }; + + static const unsigned int address_size[] = { + 32, 36, 40, 42, 44, 48, + }; + + int i, j, k, pass = 0, fail = 0; + struct device *dev; + struct io_pgtable_cfg cfg = { + .tlb = &dummy_tlb_ops, + .coherent_walk = true, + .quirks = IO_PGTABLE_QUIRK_NO_WARN, + }; + + dev = kunit_device_register(test, "io-pgtable-test"); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, dev); + if (IS_ERR_OR_NULL(dev)) + return; + + cfg.iommu_dev = dev; + + for (i = 0; i < ARRAY_SIZE(pgsize); ++i) { + for (j = 0; j < ARRAY_SIZE(address_size); ++j) { + /* Don't use ias > oas as it is not valid for stage-2. */ + for (k = 0; k <= j; ++k) { + cfg.pgsize_bitmap = pgsize[i]; + cfg.ias = address_size[k]; + cfg.oas = address_size[j]; + kunit_info(test, "pgsize_bitmap 0x%08lx, IAS %u OAS %u\n", + pgsize[i], cfg.ias, cfg.oas); + if (arm_lpae_run_tests(test, &cfg)) + fail++; + else + pass++; + } + } + } + + kunit_info(test, "completed with %d PASS %d FAIL\n", pass, fail); +} + +static struct kunit_case io_pgtable_arm_test_cases[] = { + KUNIT_CASE(arm_lpae_do_selftests), + {}, +}; + +static struct kunit_suite io_pgtable_arm_test = { + .name = "io-pgtable-arm-test", + .test_cases = io_pgtable_arm_test_cases, +}; + +kunit_test_suite(io_pgtable_arm_test); + +MODULE_DESCRIPTION("io-pgtable-arm library kunit tests"); +MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index 75f244a3e12d..523355e91a2c 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -166,7 +166,6 @@ struct arm_v7s_io_pgtable { arm_v7s_iopte *pgd; struct kmem_cache *l2_tables; - spinlock_t split_lock; }; static bool arm_v7s_pte_is_cont(arm_v7s_iopte pte, int lvl); @@ -363,25 +362,6 @@ static arm_v7s_iopte arm_v7s_prot_to_pte(int prot, int lvl, return pte; } -static int arm_v7s_pte_to_prot(arm_v7s_iopte pte, int lvl) -{ - int prot = IOMMU_READ; - arm_v7s_iopte attr = pte >> ARM_V7S_ATTR_SHIFT(lvl); - - if (!(attr & ARM_V7S_PTE_AP_RDONLY)) - prot |= IOMMU_WRITE; - if (!(attr & ARM_V7S_PTE_AP_UNPRIV)) - prot |= IOMMU_PRIV; - if ((attr & (ARM_V7S_TEX_MASK << ARM_V7S_TEX_SHIFT)) == 0) - prot |= IOMMU_MMIO; - else if (pte & ARM_V7S_ATTR_C) - prot |= IOMMU_CACHE; - if (pte & ARM_V7S_ATTR_XN(lvl)) - prot |= IOMMU_NOEXEC; - - return prot; -} - static arm_v7s_iopte arm_v7s_pte_to_cont(arm_v7s_iopte pte, int lvl) { if (lvl == 1) { @@ -398,23 +378,6 @@ static arm_v7s_iopte arm_v7s_pte_to_cont(arm_v7s_iopte pte, int lvl) return pte; } -static arm_v7s_iopte arm_v7s_cont_to_pte(arm_v7s_iopte pte, int lvl) -{ - if (lvl == 1) { - pte &= ~ARM_V7S_CONT_SECTION; - } else if (lvl == 2) { - arm_v7s_iopte xn = pte & BIT(ARM_V7S_CONT_PAGE_XN_SHIFT); - arm_v7s_iopte tex = pte & (ARM_V7S_CONT_PAGE_TEX_MASK << - ARM_V7S_CONT_PAGE_TEX_SHIFT); - - pte ^= xn | tex | ARM_V7S_PTE_TYPE_CONT_PAGE; - pte |= (xn >> ARM_V7S_CONT_PAGE_XN_SHIFT) | - (tex >> ARM_V7S_CONT_PAGE_TEX_SHIFT) | - ARM_V7S_PTE_TYPE_PAGE; - } 
- return pte; -} - static bool arm_v7s_pte_is_cont(arm_v7s_iopte pte, int lvl) { if (lvl == 1 && !ARM_V7S_PTE_IS_TABLE(pte, lvl)) @@ -552,9 +515,8 @@ static int arm_v7s_map_pages(struct io_pgtable_ops *ops, unsigned long iova, paddr >= (1ULL << data->iop.cfg.oas))) return -ERANGE; - /* If no access, then nothing to do */ if (!(prot & (IOMMU_READ | IOMMU_WRITE))) - return 0; + return -EINVAL; while (pgcount--) { ret = __arm_v7s_map(data, iova, paddr, pgsize, prot, 1, data->pgd, @@ -592,77 +554,6 @@ static void arm_v7s_free_pgtable(struct io_pgtable *iop) kfree(data); } -static arm_v7s_iopte arm_v7s_split_cont(struct arm_v7s_io_pgtable *data, - unsigned long iova, int idx, int lvl, - arm_v7s_iopte *ptep) -{ - struct io_pgtable *iop = &data->iop; - arm_v7s_iopte pte; - size_t size = ARM_V7S_BLOCK_SIZE(lvl); - int i; - - /* Check that we didn't lose a race to get the lock */ - pte = *ptep; - if (!arm_v7s_pte_is_cont(pte, lvl)) - return pte; - - ptep -= idx & (ARM_V7S_CONT_PAGES - 1); - pte = arm_v7s_cont_to_pte(pte, lvl); - for (i = 0; i < ARM_V7S_CONT_PAGES; i++) - ptep[i] = pte + i * size; - - __arm_v7s_pte_sync(ptep, ARM_V7S_CONT_PAGES, &iop->cfg); - - size *= ARM_V7S_CONT_PAGES; - io_pgtable_tlb_flush_walk(iop, iova, size, size); - return pte; -} - -static size_t arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data, - struct iommu_iotlb_gather *gather, - unsigned long iova, size_t size, - arm_v7s_iopte blk_pte, - arm_v7s_iopte *ptep) -{ - struct io_pgtable_cfg *cfg = &data->iop.cfg; - arm_v7s_iopte pte, *tablep; - int i, unmap_idx, num_entries, num_ptes; - - tablep = __arm_v7s_alloc_table(2, GFP_ATOMIC, data); - if (!tablep) - return 0; /* Bytes unmapped */ - - num_ptes = ARM_V7S_PTES_PER_LVL(2, cfg); - num_entries = size >> ARM_V7S_LVL_SHIFT(2); - unmap_idx = ARM_V7S_LVL_IDX(iova, 2, cfg); - - pte = arm_v7s_prot_to_pte(arm_v7s_pte_to_prot(blk_pte, 1), 2, cfg); - if (num_entries > 1) - pte = arm_v7s_pte_to_cont(pte, 2); - - for (i = 0; i < num_ptes; i += num_entries, pte += size) { - /* Unmap! */ - if (i == unmap_idx) - continue; - - __arm_v7s_set_pte(&tablep[i], pte, num_entries, cfg); - } - - pte = arm_v7s_install_table(tablep, ptep, blk_pte, cfg); - if (pte != blk_pte) { - __arm_v7s_free_table(tablep, 2, data); - - if (!ARM_V7S_PTE_IS_TABLE(pte, 1)) - return 0; - - tablep = iopte_deref(pte, 1, data); - return __arm_v7s_unmap(data, gather, iova, size, 2, tablep); - } - - io_pgtable_tlb_add_page(&data->iop, gather, iova, size); - return size; -} - static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, struct iommu_iotlb_gather *gather, unsigned long iova, size_t size, int lvl, @@ -695,11 +586,8 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, * case in a lock for the sake of correctness and be done with it. 
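
(Illustration, not part of the patch: with split-on-unmap removed, here and in the LPAE version further down, a caller that needs a hole in a large mapping must unmap the whole block and rebuild the remainder itself. punch_hole() and the 4K remap granularity are illustrative choices; all sizes are assumed 4K-aligned.)

    static int punch_hole(struct io_pgtable_ops *ops, unsigned long iova,
                          phys_addr_t paddr, size_t blk_size,
                          unsigned long hole, size_t hole_size, int prot)
    {
        size_t pre = hole - iova;
        size_t post = blk_size - pre - hole_size;
        size_t mapped;
        int ret = 0;

        /* 1. Remove the whole large PTE; a partial unmap now WARNs and fails. */
        if (ops->unmap_pages(ops, iova, blk_size, 1, NULL) != blk_size)
            return -EINVAL;

        /* 2. Re-create the surviving ranges from smaller pages. */
        if (pre)
            ret = ops->map_pages(ops, iova, paddr, SZ_4K, pre / SZ_4K,
                                 prot, GFP_KERNEL, &mapped);
        if (!ret && post)
            ret = ops->map_pages(ops, hole + hole_size, paddr + pre + hole_size,
                                 SZ_4K, post / SZ_4K, prot, GFP_KERNEL, &mapped);
        return ret;
    }
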
*/ if (num_entries <= 1 && arm_v7s_pte_is_cont(pte[0], lvl)) { - unsigned long flags; - - spin_lock_irqsave(&data->split_lock, flags); - pte[0] = arm_v7s_split_cont(data, iova, idx, lvl, ptep); - spin_unlock_irqrestore(&data->split_lock, flags); + WARN_ONCE(true, "Unmap of a partial large IOPTE is not allowed"); + return 0; } /* If the size matches this level, we're in the right place */ @@ -722,12 +610,8 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, } return size; } else if (lvl == 1 && !ARM_V7S_PTE_IS_TABLE(pte[0], lvl)) { - /* - * Insert a table at the next level to map the old region, - * minus the part we want to unmap - */ - return arm_v7s_split_blk_unmap(data, gather, iova, size, pte[0], - ptep); + WARN_ONCE(true, "Unmap of a partial large IOPTE is not allowed"); + return 0; } /* Keep on walkin' */ @@ -812,8 +696,6 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg, if (!data) return NULL; - spin_lock_init(&data->split_lock); - /* * ARM_MTK_TTBR_EXT extend the translation table base support larger * memory address. @@ -937,8 +819,8 @@ static int __init arm_v7s_do_selftests(void) .quirks = IO_PGTABLE_QUIRK_ARM_NS, .pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M, }; - unsigned int iova, size, iova_start; - unsigned int i, loopnr = 0; + unsigned int iova, size; + unsigned int i; size_t mapped; selftest_running = true; @@ -986,26 +868,6 @@ static int __init arm_v7s_do_selftests(void) return __FAIL(ops); iova += SZ_16M; - loopnr++; - } - - /* Partial unmap */ - i = 1; - size = 1UL << __ffs(cfg.pgsize_bitmap); - while (i < loopnr) { - iova_start = i * SZ_16M; - if (ops->unmap_pages(ops, iova_start + size, size, 1, NULL) != size) - return __FAIL(ops); - - /* Remap of partial unmap */ - if (ops->map_pages(ops, iova_start + size, size, size, 1, - IOMMU_READ, GFP_KERNEL, &mapped)) - return __FAIL(ops); - - if (ops->iova_to_phys(ops, iova_start + size + 42) - != (size + 42)) - return __FAIL(ops); - i++; } /* Full unmap */ diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 3d23b924cec1..e6626004b323 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -12,7 +12,6 @@ #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/io-pgtable.h> -#include <linux/kernel.h> #include <linux/sizes.h> #include <linux/slab.h> #include <linux/types.h> @@ -76,6 +75,7 @@ #define ARM_LPAE_PTE_NSTABLE (((arm_lpae_iopte)1) << 63) #define ARM_LPAE_PTE_XN (((arm_lpae_iopte)3) << 53) +#define ARM_LPAE_PTE_DBM (((arm_lpae_iopte)1) << 51) #define ARM_LPAE_PTE_AF (((arm_lpae_iopte)1) << 10) #define ARM_LPAE_PTE_SH_NS (((arm_lpae_iopte)0) << 8) #define ARM_LPAE_PTE_SH_OS (((arm_lpae_iopte)2) << 8) @@ -83,17 +83,16 @@ #define ARM_LPAE_PTE_NS (((arm_lpae_iopte)1) << 5) #define ARM_LPAE_PTE_VALID (((arm_lpae_iopte)1) << 0) -#define ARM_LPAE_PTE_ATTR_LO_MASK (((arm_lpae_iopte)0x3ff) << 2) -/* Ignore the contiguous bit for block splitting */ -#define ARM_LPAE_PTE_ATTR_HI_MASK (((arm_lpae_iopte)6) << 52) -#define ARM_LPAE_PTE_ATTR_MASK (ARM_LPAE_PTE_ATTR_LO_MASK | \ - ARM_LPAE_PTE_ATTR_HI_MASK) /* Software bit for solving coherency races */ #define ARM_LPAE_PTE_SW_SYNC (((arm_lpae_iopte)1) << 55) /* Stage-1 PTE */ #define ARM_LPAE_PTE_AP_UNPRIV (((arm_lpae_iopte)1) << 6) -#define ARM_LPAE_PTE_AP_RDONLY (((arm_lpae_iopte)2) << 6) +#define ARM_LPAE_PTE_AP_RDONLY_BIT 7 +#define ARM_LPAE_PTE_AP_RDONLY (((arm_lpae_iopte)1) << \ + ARM_LPAE_PTE_AP_RDONLY_BIT) +#define ARM_LPAE_PTE_AP_WR_CLEAN_MASK 
(ARM_LPAE_PTE_AP_RDONLY | \ + ARM_LPAE_PTE_DBM) #define ARM_LPAE_PTE_ATTRINDX_SHIFT 2 #define ARM_LPAE_PTE_nG (((arm_lpae_iopte)1) << 11) @@ -101,6 +100,18 @@ #define ARM_LPAE_PTE_HAP_FAULT (((arm_lpae_iopte)0) << 6) #define ARM_LPAE_PTE_HAP_READ (((arm_lpae_iopte)1) << 6) #define ARM_LPAE_PTE_HAP_WRITE (((arm_lpae_iopte)2) << 6) +/* + * For !FWB these bits encode: + * 1111 = Normal outer write-back cacheable / Inner Write-Back Cacheable, + * permit S1 to override + * 0101 = Normal Non-cacheable / Inner Non-cacheable + * 0001 = Device / Device-nGnRE + * For S2FWB these bits encode: + * 0110 = Force Normal Write-Back + * 0101 = Normal* is forced Normal-NC, Device unchanged + * 0001 = Force Device-nGnRE + */ +#define ARM_LPAE_PTE_MEMATTR_FWB_WB (((arm_lpae_iopte)0x6) << 2) #define ARM_LPAE_PTE_MEMATTR_OIWB (((arm_lpae_iopte)0xf) << 2) #define ARM_LPAE_PTE_MEMATTR_NC (((arm_lpae_iopte)0x5) << 2) #define ARM_LPAE_PTE_MEMATTR_DEV (((arm_lpae_iopte)0x1) << 2) @@ -137,7 +148,11 @@ #define iopte_type(pte) \ (((pte) >> ARM_LPAE_PTE_TYPE_SHIFT) & ARM_LPAE_PTE_TYPE_MASK) -#define iopte_prot(pte) ((pte) & ARM_LPAE_PTE_ATTR_MASK) +#define iopte_writeable_dirty(pte) \ + (((pte) & ARM_LPAE_PTE_AP_WR_CLEAN_MASK) == ARM_LPAE_PTE_DBM) + +#define iopte_set_writeable_clean(ptep) \ + set_bit(ARM_LPAE_PTE_AP_RDONLY_BIT, (unsigned long *)(ptep)) struct arm_lpae_io_pgtable { struct io_pgtable iop; @@ -160,6 +175,13 @@ static inline bool iopte_leaf(arm_lpae_iopte pte, int lvl, return iopte_type(pte) == ARM_LPAE_PTE_TYPE_BLOCK; } +static inline bool iopte_table(arm_lpae_iopte pte, int lvl) +{ + if (lvl == (ARM_LPAE_MAX_LEVELS - 1)) + return false; + return iopte_type(pte) == ARM_LPAE_PTE_TYPE_TABLE; +} + static arm_lpae_iopte paddr_to_iopte(phys_addr_t paddr, struct arm_lpae_io_pgtable *data) { @@ -181,7 +203,45 @@ static phys_addr_t iopte_to_paddr(arm_lpae_iopte pte, return (paddr | (paddr << (48 - 12))) & (ARM_LPAE_PTE_ADDR_MASK << 4); } -static bool selftest_running = false; +/* + * Convert an index returned by ARM_LPAE_PGD_IDX(), which can point into + * a concatenated PGD, into the maximum number of entries that can be + * mapped in the same table page. + */ +static inline int arm_lpae_max_entries(int i, struct arm_lpae_io_pgtable *data) +{ + int ptes_per_table = ARM_LPAE_PTES_PER_TABLE(data); + + return ptes_per_table - (i & (ptes_per_table - 1)); +} + +/* + * Check if concatenated PGDs are mandatory according to Arm DDI0487 (K.a): + * 1) R_DXBSH: For a 16KB granule and 48-bit input size, use level 1 instead of 0.
+ * 2) R_SRKBC: After de-ciphering the table for PA size and valid initial lookup + * a) 40 bits PA size with 4K: use level 1 instead of level 0 (2 tables for ias = oas) + * b) 40 bits PA size with 16K: use level 2 instead of level 1 (16 tables for ias = oas) + * c) 42 bits PA size with 4K: use level 1 instead of level 0 (8 tables for ias = oas) + * d) 48 bits PA size with 16K: use level 1 instead of level 0 (2 tables for ias = oas) + */ +static inline bool arm_lpae_concat_mandatory(struct io_pgtable_cfg *cfg, + struct arm_lpae_io_pgtable *data) +{ + unsigned int ias = cfg->ias; + unsigned int oas = cfg->oas; + + /* Covers 1 and 2.d */ + if ((ARM_LPAE_GRANULE(data) == SZ_16K) && (data->start_level == 0)) + return (oas == 48) || (ias == 48); + + /* Covers 2.a and 2.c */ + if ((ARM_LPAE_GRANULE(data) == SZ_4K) && (data->start_level == 0)) + return (oas == 40) || (oas == 42); + + /* Case 2.b */ + return (ARM_LPAE_GRANULE(data) == SZ_16K) && + (data->start_level == 1) && (oas == 40); +} static dma_addr_t __arm_lpae_dma_addr(void *pages) { @@ -193,16 +253,20 @@ static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp, void *cookie) { struct device *dev = cfg->iommu_dev; - int order = get_order(size); + size_t alloc_size; dma_addr_t dma; void *pages; - VM_BUG_ON((gfp & __GFP_HIGHMEM)); - + /* + * For very small starting-level translation tables the HW requires a + * minimum alignment of at least 64 to cover all cases. + */ + alloc_size = max(size, 64); if (cfg->alloc) - pages = cfg->alloc(cookie, size, gfp); + pages = cfg->alloc(cookie, alloc_size, gfp); else - pages = iommu_alloc_pages_node(dev_to_node(dev), gfp, order); + pages = iommu_alloc_pages_node_sz(dev_to_node(dev), gfp, + alloc_size); if (!pages) return NULL; @@ -230,7 +294,7 @@ out_free: if (cfg->free) cfg->free(cookie, pages, size); else - iommu_free_pages(pages, order); + iommu_free_pages(pages); return NULL; } @@ -246,7 +310,7 @@ static void __arm_lpae_free_pages(void *pages, size_t size, if (cfg->free) cfg->free(cookie, pages, size); else - iommu_free_pages(pages, get_order(size)); + iommu_free_pages(pages); } static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries, @@ -256,13 +320,13 @@ static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries, sizeof(*ptep) * num_entries, DMA_TO_DEVICE); } -static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg) +static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg, int num_entries) { + for (int i = 0; i < num_entries; i++) + ptep[i] = 0; - *ptep = 0; - - if (!cfg->coherent_walk) - __arm_lpae_sync_pte(ptep, 1, cfg); + if (!cfg->coherent_walk && num_entries) + __arm_lpae_sync_pte(ptep, num_entries, cfg); } static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, @@ -301,7 +365,7 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data, for (i = 0; i < num_entries; i++) if (iopte_leaf(ptep[i], lvl, data->iop.fmt)) { /* We require an unmap first */ - WARN_ON(!selftest_running); + WARN_ON(!(data->iop.cfg.quirks & IO_PGTABLE_QUIRK_NO_WARN)); return -EEXIST; } else if (iopte_type(ptep[i]) == ARM_LPAE_PTE_TYPE_TABLE) { /* @@ -372,7 +436,7 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova, /* If we can install a leaf entry at this level, then do so */ if (size == block_size) { - max_entries = ARM_LPAE_PTES_PER_TABLE(data) - map_idx_start; + max_entries = arm_lpae_max_entries(map_idx_start, data); num_entries = min_t(int, pgcount, max_entries); ret = arm_lpae_init_pte(data, 
iova, paddr, prot, lvl, num_entries, ptep); if (!ret) @@ -403,7 +467,7 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova, cptep = iopte_deref(pte, data); } else if (pte) { /* We require an unmap first */ - WARN_ON(!selftest_running); + WARN_ON(!(cfg->quirks & IO_PGTABLE_QUIRK_NO_WARN)); return -EEXIST; } @@ -422,6 +486,8 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, pte = ARM_LPAE_PTE_nG; if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ)) pte |= ARM_LPAE_PTE_AP_RDONLY; + else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_HD) + pte |= ARM_LPAE_PTE_DBM; if (!(prot & IOMMU_PRIV)) pte |= ARM_LPAE_PTE_AP_UNPRIV; } else { @@ -438,12 +504,16 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, */ if (data->iop.fmt == ARM_64_LPAE_S2 || data->iop.fmt == ARM_32_LPAE_S2) { - if (prot & IOMMU_MMIO) + if (prot & IOMMU_MMIO) { pte |= ARM_LPAE_PTE_MEMATTR_DEV; - else if (prot & IOMMU_CACHE) - pte |= ARM_LPAE_PTE_MEMATTR_OIWB; - else + } else if (prot & IOMMU_CACHE) { + if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_S2FWB) + pte |= ARM_LPAE_PTE_MEMATTR_FWB_WB; + else + pte |= ARM_LPAE_PTE_MEMATTR_OIWB; + } else { pte |= ARM_LPAE_PTE_MEMATTR_NC; + } } else { if (prot & IOMMU_MMIO) pte |= (ARM_LPAE_MAIR_ATTR_IDX_DEV @@ -495,9 +565,8 @@ static int arm_lpae_map_pages(struct io_pgtable_ops *ops, unsigned long iova, if (WARN_ON(iaext || paddr >> cfg->oas)) return -ERANGE; - /* If no access, then nothing to do */ if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) - return 0; + return -EINVAL; prot = arm_lpae_prot_to_pte(data, iommu_prot); ret = __arm_lpae_map(data, iova, paddr, pgsize, pgcount, prot, lvl, @@ -550,66 +619,6 @@ static void arm_lpae_free_pgtable(struct io_pgtable *iop) kfree(data); } -static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, - struct iommu_iotlb_gather *gather, - unsigned long iova, size_t size, - arm_lpae_iopte blk_pte, int lvl, - arm_lpae_iopte *ptep, size_t pgcount) -{ - struct io_pgtable_cfg *cfg = &data->iop.cfg; - arm_lpae_iopte pte, *tablep; - phys_addr_t blk_paddr; - size_t tablesz = ARM_LPAE_GRANULE(data); - size_t split_sz = ARM_LPAE_BLOCK_SIZE(lvl, data); - int ptes_per_table = ARM_LPAE_PTES_PER_TABLE(data); - int i, unmap_idx_start = -1, num_entries = 0, max_entries; - - if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS)) - return 0; - - tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg, data->iop.cookie); - if (!tablep) - return 0; /* Bytes unmapped */ - - if (size == split_sz) { - unmap_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data); - max_entries = ptes_per_table - unmap_idx_start; - num_entries = min_t(int, pgcount, max_entries); - } - - blk_paddr = iopte_to_paddr(blk_pte, data); - pte = iopte_prot(blk_pte); - - for (i = 0; i < ptes_per_table; i++, blk_paddr += split_sz) { - /* Unmap! */ - if (i >= unmap_idx_start && i < (unmap_idx_start + num_entries)) - continue; - - __arm_lpae_init_pte(data, blk_paddr, pte, lvl, 1, &tablep[i]); - } - - pte = arm_lpae_install_table(tablep, ptep, blk_pte, data); - if (pte != blk_pte) { - __arm_lpae_free_pages(tablep, tablesz, cfg, data->iop.cookie); - /* - * We may race against someone unmapping another part of this - * block, but anything else is invalid. We can't misinterpret - * a page entry here since we're never at the last level. 
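
(Illustration, not part of the patch: the dirty-tracking encoding behind iopte_writeable_dirty() and iopte_set_writeable_clean() from earlier in this file, restated standalone. PTE_DBM and PTE_RDONLY copy bit 51 and AP bit 7 from the defines above.)

    #include <stdint.h>
    #include <stdbool.h>

    #define PTE_DBM    (1ull << 51)   /* hardware may manage the dirty state */
    #define PTE_RDONLY (1ull << 7)    /* AP[2], cleared by hardware on a write */

    /*
     * DBM=0 RDONLY=1: read-only, never dirtied
     * DBM=1 RDONLY=1: writeable-clean (armed for tracking)
     * DBM=1 RDONLY=0: writeable-dirty (a DMA write has happened)
     */
    static bool pte_writeable_dirty(uint64_t pte)
    {
        return (pte & (PTE_DBM | PTE_RDONLY)) == PTE_DBM;
    }

    static void pte_set_writeable_clean(uint64_t *pte)
    {
        *pte |= PTE_RDONLY;   /* re-arm tracking for the next write */
    }
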
- */ - if (iopte_type(pte) != ARM_LPAE_PTE_TYPE_TABLE) - return 0; - - tablep = iopte_deref(pte, data); - } else if (unmap_idx_start >= 0) { - for (i = 0; i < num_entries; i++) - io_pgtable_tlb_add_page(&data->iop, gather, iova + i * size, size); - - return num_entries * size; - } - - return __arm_lpae_unmap(data, gather, iova, size, pgcount, lvl, tablep); -} - static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, struct iommu_iotlb_gather *gather, unsigned long iova, size_t size, size_t pgcount, @@ -626,42 +635,45 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, unmap_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data); ptep += unmap_idx_start; pte = READ_ONCE(*ptep); - if (WARN_ON(!pte)) - return 0; + if (!pte) { + WARN_ON(!(data->iop.cfg.quirks & IO_PGTABLE_QUIRK_NO_WARN)); + return -ENOENT; + } /* If the size matches this level, we're in the right place */ if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) { - max_entries = ARM_LPAE_PTES_PER_TABLE(data) - unmap_idx_start; + max_entries = arm_lpae_max_entries(unmap_idx_start, data); num_entries = min_t(int, pgcount, max_entries); - while (i < num_entries) { - pte = READ_ONCE(*ptep); - if (WARN_ON(!pte)) + /* Find and handle non-leaf entries */ + for (i = 0; i < num_entries; i++) { + pte = READ_ONCE(ptep[i]); + if (!pte) { + WARN_ON(!(data->iop.cfg.quirks & IO_PGTABLE_QUIRK_NO_WARN)); break; - - __arm_lpae_clear_pte(ptep, &iop->cfg); + } if (!iopte_leaf(pte, lvl, iop->fmt)) { + __arm_lpae_clear_pte(&ptep[i], &iop->cfg, 1); + /* Also flush any partial walks */ io_pgtable_tlb_flush_walk(iop, iova + i * size, size, ARM_LPAE_GRANULE(data)); __arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data)); - } else if (!iommu_iotlb_gather_queued(gather)) { - io_pgtable_tlb_add_page(iop, gather, iova + i * size, size); } - - ptep++; - i++; } + /* Clear the remaining entries */ + __arm_lpae_clear_pte(ptep, &iop->cfg, i); + + if (gather && !iommu_iotlb_gather_queued(gather)) + for (int j = 0; j < i; j++) + io_pgtable_tlb_add_page(iop, gather, iova + j * size, size); + return i * size; } else if (iopte_leaf(pte, lvl, iop->fmt)) { - /* - * Insert a table at the next level to map the old region, - * minus the part we want to unmap - */ - return arm_lpae_split_blk_unmap(data, gather, iova, size, pte, - lvl + 1, ptep, pgcount); + WARN_ONCE(true, "Unmap of a partial large IOPTE is not allowed"); + return 0; } /* Keep on walkin' */ @@ -690,40 +702,172 @@ static size_t arm_lpae_unmap_pages(struct io_pgtable_ops *ops, unsigned long iov data->start_level, ptep); } +struct io_pgtable_walk_data { + struct io_pgtable *iop; + void *data; + int (*visit)(struct io_pgtable_walk_data *walk_data, int lvl, + arm_lpae_iopte *ptep, size_t size); + unsigned long flags; + u64 addr; + const u64 end; +}; + +static int __arm_lpae_iopte_walk(struct arm_lpae_io_pgtable *data, + struct io_pgtable_walk_data *walk_data, + arm_lpae_iopte *ptep, + int lvl); + +struct iova_to_phys_data { + arm_lpae_iopte pte; + int lvl; +}; + +static int visit_iova_to_phys(struct io_pgtable_walk_data *walk_data, int lvl, + arm_lpae_iopte *ptep, size_t size) +{ + struct iova_to_phys_data *data = walk_data->data; + data->pte = *ptep; + data->lvl = lvl; + return 0; +} + static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova) { struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); - arm_lpae_iopte pte, *ptep = data->pgd; - int lvl = data->start_level; + struct iova_to_phys_data d; + struct io_pgtable_walk_data walk_data = { + .data = &d, + 
.visit = visit_iova_to_phys, + .addr = iova, + .end = iova + 1, + }; + int ret; + + ret = __arm_lpae_iopte_walk(data, &walk_data, data->pgd, data->start_level); + if (ret) + return 0; + + iova &= (ARM_LPAE_BLOCK_SIZE(d.lvl, data) - 1); + return iopte_to_paddr(d.pte, data) | iova; +} + +static int visit_pgtable_walk(struct io_pgtable_walk_data *walk_data, int lvl, + arm_lpae_iopte *ptep, size_t size) +{ + struct arm_lpae_io_pgtable_walk_data *data = walk_data->data; + data->ptes[lvl] = *ptep; + return 0; +} - do { - /* Valid IOPTE pointer? */ - if (!ptep) - return 0; +static int arm_lpae_pgtable_walk(struct io_pgtable_ops *ops, unsigned long iova, + void *wd) +{ + struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); + struct io_pgtable_walk_data walk_data = { + .data = wd, + .visit = visit_pgtable_walk, + .addr = iova, + .end = iova + 1, + }; - /* Grab the IOPTE we're interested in */ - ptep += ARM_LPAE_LVL_IDX(iova, lvl, data); - pte = READ_ONCE(*ptep); + return __arm_lpae_iopte_walk(data, &walk_data, data->pgd, data->start_level); +} - /* Valid entry? */ - if (!pte) - return 0; +static int io_pgtable_visit(struct arm_lpae_io_pgtable *data, + struct io_pgtable_walk_data *walk_data, + arm_lpae_iopte *ptep, int lvl) +{ + struct io_pgtable *iop = &data->iop; + arm_lpae_iopte pte = READ_ONCE(*ptep); - /* Leaf entry? */ - if (iopte_leaf(pte, lvl, data->iop.fmt)) - goto found_translation; + size_t size = ARM_LPAE_BLOCK_SIZE(lvl, data); + int ret = walk_data->visit(walk_data, lvl, ptep, size); + if (ret) + return ret; - /* Take it to the next level */ - ptep = iopte_deref(pte, data); - } while (++lvl < ARM_LPAE_MAX_LEVELS); + if (iopte_leaf(pte, lvl, iop->fmt)) { + walk_data->addr += size; + return 0; + } + + if (!iopte_table(pte, lvl)) { + return -EINVAL; + } + + ptep = iopte_deref(pte, data); + return __arm_lpae_iopte_walk(data, walk_data, ptep, lvl + 1); +} + +static int __arm_lpae_iopte_walk(struct arm_lpae_io_pgtable *data, + struct io_pgtable_walk_data *walk_data, + arm_lpae_iopte *ptep, + int lvl) +{ + u32 idx; + int max_entries, ret; + + if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS)) + return -EINVAL; + + if (lvl == data->start_level) + max_entries = ARM_LPAE_PGD_SIZE(data) / sizeof(arm_lpae_iopte); + else + max_entries = ARM_LPAE_PTES_PER_TABLE(data); + + for (idx = ARM_LPAE_LVL_IDX(walk_data->addr, lvl, data); + (idx < max_entries) && (walk_data->addr < walk_data->end); ++idx) { + ret = io_pgtable_visit(data, walk_data, ptep + idx, lvl); + if (ret) + return ret; + } + + return 0; +} + +static int visit_dirty(struct io_pgtable_walk_data *walk_data, int lvl, + arm_lpae_iopte *ptep, size_t size) +{ + struct iommu_dirty_bitmap *dirty = walk_data->data; + + if (!iopte_leaf(*ptep, lvl, walk_data->iop->fmt)) + return 0; + + if (iopte_writeable_dirty(*ptep)) { + iommu_dirty_bitmap_record(dirty, walk_data->addr, size); + if (!(walk_data->flags & IOMMU_DIRTY_NO_CLEAR)) + iopte_set_writeable_clean(ptep); + } - /* Ran out of page tables to walk */ return 0; +} + +static int arm_lpae_read_and_clear_dirty(struct io_pgtable_ops *ops, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); + struct io_pgtable_cfg *cfg = &data->iop.cfg; + struct io_pgtable_walk_data walk_data = { + .iop = &data->iop, + .data = dirty, + .visit = visit_dirty, + .flags = flags, + .addr = iova, + .end = iova + size, + }; + arm_lpae_iopte *ptep = data->pgd; + int lvl = data->start_level; + + if (WARN_ON(!size)) + 
return -EINVAL; + if (WARN_ON((iova + size - 1) & ~(BIT(cfg->ias) - 1))) + return -EINVAL; + if (data->iop.fmt != ARM_64_LPAE_S1) + return -EINVAL; -found_translation: - iova &= (ARM_LPAE_BLOCK_SIZE(lvl, data) - 1); - return iopte_to_paddr(pte, data) | iova; + return __arm_lpae_iopte_walk(data, &walk_data, ptep, lvl); } static void arm_lpae_restrict_pgsizes(struct io_pgtable_cfg *cfg) @@ -804,6 +948,8 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg) .map_pages = arm_lpae_map_pages, .unmap_pages = arm_lpae_unmap_pages, .iova_to_phys = arm_lpae_iova_to_phys, + .read_and_clear_dirty = arm_lpae_read_and_clear_dirty, + .pgtable_walk = arm_lpae_pgtable_walk, }; return data; @@ -819,7 +965,9 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_ARM_TTBR1 | - IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)) + IO_PGTABLE_QUIRK_ARM_OUTER_WBWA | + IO_PGTABLE_QUIRK_ARM_HD | + IO_PGTABLE_QUIRK_NO_WARN)) return NULL; data = arm_lpae_alloc_pgtable(cfg); @@ -920,26 +1068,20 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie) struct arm_lpae_io_pgtable *data; typeof(&cfg->arm_lpae_s2_cfg.vtcr) vtcr = &cfg->arm_lpae_s2_cfg.vtcr; - /* The NS quirk doesn't apply at stage 2 */ - if (cfg->quirks) + if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_S2FWB | + IO_PGTABLE_QUIRK_NO_WARN)) return NULL; data = arm_lpae_alloc_pgtable(cfg); if (!data) return NULL; - /* - * Concatenate PGDs at level 1 if possible in order to reduce - * the depth of the stage-2 walk. - */ - if (data->start_level == 0) { - unsigned long pgd_pages; - - pgd_pages = ARM_LPAE_PGD_SIZE(data) / sizeof(arm_lpae_iopte); - if (pgd_pages <= ARM_LPAE_S2_MAX_CONCAT_PAGES) { - data->pgd_bits += data->bits_per_level; - data->start_level++; - } + if (arm_lpae_concat_mandatory(cfg, data)) { + if (WARN_ON((ARM_LPAE_PGD_SIZE(data) / sizeof(arm_lpae_iopte)) > + ARM_LPAE_S2_MAX_CONCAT_PAGES)) + return NULL; + data->pgd_bits += data->bits_per_level; + data->start_level++; } /* VTCR */ @@ -1123,196 +1265,3 @@ struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns = { .alloc = arm_mali_lpae_alloc_pgtable, .free = arm_lpae_free_pgtable, }; - -#ifdef CONFIG_IOMMU_IO_PGTABLE_LPAE_SELFTEST - -static struct io_pgtable_cfg *cfg_cookie __initdata; - -static void __init dummy_tlb_flush_all(void *cookie) -{ - WARN_ON(cookie != cfg_cookie); -} - -static void __init dummy_tlb_flush(unsigned long iova, size_t size, - size_t granule, void *cookie) -{ - WARN_ON(cookie != cfg_cookie); - WARN_ON(!(size & cfg_cookie->pgsize_bitmap)); -} - -static void __init dummy_tlb_add_page(struct iommu_iotlb_gather *gather, - unsigned long iova, size_t granule, - void *cookie) -{ - dummy_tlb_flush(iova, granule, granule, cookie); -} - -static const struct iommu_flush_ops dummy_tlb_ops __initconst = { - .tlb_flush_all = dummy_tlb_flush_all, - .tlb_flush_walk = dummy_tlb_flush, - .tlb_add_page = dummy_tlb_add_page, -}; - -static void __init arm_lpae_dump_ops(struct io_pgtable_ops *ops) -{ - struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); - struct io_pgtable_cfg *cfg = &data->iop.cfg; - - pr_err("cfg: pgsize_bitmap 0x%lx, ias %u-bit\n", - cfg->pgsize_bitmap, cfg->ias); - pr_err("data: %d levels, 0x%zx pgd_size, %u pg_shift, %u bits_per_level, pgd @ %p\n", - ARM_LPAE_MAX_LEVELS - data->start_level, ARM_LPAE_PGD_SIZE(data), - ilog2(ARM_LPAE_GRANULE(data)), data->bits_per_level, data->pgd); -} - -#define __FAIL(ops, i) ({ \ - WARN(1, "selftest: test failed for fmt idx %d\n", (i)); \ - 
arm_lpae_dump_ops(ops); \ - selftest_running = false; \ - -EFAULT; \ -}) - -static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg) -{ - static const enum io_pgtable_fmt fmts[] __initconst = { - ARM_64_LPAE_S1, - ARM_64_LPAE_S2, - }; - - int i, j; - unsigned long iova; - size_t size, mapped; - struct io_pgtable_ops *ops; - - selftest_running = true; - - for (i = 0; i < ARRAY_SIZE(fmts); ++i) { - cfg_cookie = cfg; - ops = alloc_io_pgtable_ops(fmts[i], cfg, cfg); - if (!ops) { - pr_err("selftest: failed to allocate io pgtable ops\n"); - return -ENOMEM; - } - - /* - * Initial sanity checks. - * Empty page tables shouldn't provide any translations. - */ - if (ops->iova_to_phys(ops, 42)) - return __FAIL(ops, i); - - if (ops->iova_to_phys(ops, SZ_1G + 42)) - return __FAIL(ops, i); - - if (ops->iova_to_phys(ops, SZ_2G + 42)) - return __FAIL(ops, i); - - /* - * Distinct mappings of different granule sizes. - */ - iova = 0; - for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { - size = 1UL << j; - - if (ops->map_pages(ops, iova, iova, size, 1, - IOMMU_READ | IOMMU_WRITE | - IOMMU_NOEXEC | IOMMU_CACHE, - GFP_KERNEL, &mapped)) - return __FAIL(ops, i); - - /* Overlapping mappings */ - if (!ops->map_pages(ops, iova, iova + size, size, 1, - IOMMU_READ | IOMMU_NOEXEC, - GFP_KERNEL, &mapped)) - return __FAIL(ops, i); - - if (ops->iova_to_phys(ops, iova + 42) != (iova + 42)) - return __FAIL(ops, i); - - iova += SZ_1G; - } - - /* Partial unmap */ - size = 1UL << __ffs(cfg->pgsize_bitmap); - if (ops->unmap_pages(ops, SZ_1G + size, size, 1, NULL) != size) - return __FAIL(ops, i); - - /* Remap of partial unmap */ - if (ops->map_pages(ops, SZ_1G + size, size, size, 1, - IOMMU_READ, GFP_KERNEL, &mapped)) - return __FAIL(ops, i); - - if (ops->iova_to_phys(ops, SZ_1G + size + 42) != (size + 42)) - return __FAIL(ops, i); - - /* Full unmap */ - iova = 0; - for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { - size = 1UL << j; - - if (ops->unmap_pages(ops, iova, size, 1, NULL) != size) - return __FAIL(ops, i); - - if (ops->iova_to_phys(ops, iova + 42)) - return __FAIL(ops, i); - - /* Remap full block */ - if (ops->map_pages(ops, iova, iova, size, 1, - IOMMU_WRITE, GFP_KERNEL, &mapped)) - return __FAIL(ops, i); - - if (ops->iova_to_phys(ops, iova + 42) != (iova + 42)) - return __FAIL(ops, i); - - iova += SZ_1G; - } - - free_io_pgtable_ops(ops); - } - - selftest_running = false; - return 0; -} - -static int __init arm_lpae_do_selftests(void) -{ - static const unsigned long pgsize[] __initconst = { - SZ_4K | SZ_2M | SZ_1G, - SZ_16K | SZ_32M, - SZ_64K | SZ_512M, - }; - - static const unsigned int ias[] __initconst = { - 32, 36, 40, 42, 44, 48, - }; - - int i, j, pass = 0, fail = 0; - struct device dev; - struct io_pgtable_cfg cfg = { - .tlb = &dummy_tlb_ops, - .oas = 48, - .coherent_walk = true, - .iommu_dev = &dev, - }; - - /* __arm_lpae_alloc_pages() merely needs dev_to_node() to work */ - set_dev_node(&dev, NUMA_NO_NODE); - - for (i = 0; i < ARRAY_SIZE(pgsize); ++i) { - for (j = 0; j < ARRAY_SIZE(ias); ++j) { - cfg.pgsize_bitmap = pgsize[i]; - cfg.ias = ias[j]; - pr_info("selftest: pgsize_bitmap 0x%08lx, IAS %u\n", - pgsize[i], ias[j]); - if (arm_lpae_run_tests(&cfg)) - fail++; - else - pass++; - } - } - - pr_info("selftest: completed with %d PASS %d FAIL\n", pass, fail); - return fail ? 
-EFAULT : 0; -} -subsys_initcall(arm_lpae_do_selftests); -#endif diff --git a/drivers/iommu/io-pgtable-dart.c b/drivers/iommu/io-pgtable-dart.c index ad28031e1e93..54d287cc0dd1 100644 --- a/drivers/iommu/io-pgtable-dart.c +++ b/drivers/iommu/io-pgtable-dart.c @@ -27,8 +27,9 @@ #define DART1_MAX_ADDR_BITS 36 -#define DART_MAX_TABLES 4 -#define DART_LEVELS 2 +#define DART_MAX_TABLE_BITS 2 +#define DART_MAX_TABLES BIT(DART_MAX_TABLE_BITS) +#define DART_MAX_LEVELS 4 /* Includes TTBR level */ /* Struct accessors */ #define io_pgtable_to_data(x) \ @@ -68,6 +69,7 @@ struct dart_io_pgtable { struct io_pgtable iop; + int levels; int tbl_bits; int bits_per_level; @@ -107,14 +109,6 @@ static phys_addr_t iopte_to_paddr(dart_iopte pte, return paddr; } -static void *__dart_alloc_pages(size_t size, gfp_t gfp) -{ - int order = get_order(size); - - VM_BUG_ON((gfp & __GFP_HIGHMEM)); - return iommu_alloc_pages(gfp, order); -} - static int dart_init_pte(struct dart_io_pgtable *data, unsigned long iova, phys_addr_t paddr, dart_iopte prot, int num_entries, @@ -135,7 +129,6 @@ static int dart_init_pte(struct dart_io_pgtable *data, pte |= FIELD_PREP(APPLE_DART_PTE_SUBPAGE_START, 0); pte |= FIELD_PREP(APPLE_DART_PTE_SUBPAGE_END, 0xfff); - pte |= APPLE_DART1_PTE_PROT_SP_DIS; pte |= APPLE_DART_PTE_VALID; for (i = 0; i < num_entries; i++) @@ -165,44 +158,45 @@ static dart_iopte dart_install_table(dart_iopte *table, return old; } -static int dart_get_table(struct dart_io_pgtable *data, unsigned long iova) +static int dart_get_index(struct dart_io_pgtable *data, unsigned long iova, int level) { - return (iova >> (3 * data->bits_per_level + ilog2(sizeof(dart_iopte)))) & - ((1 << data->tbl_bits) - 1); + return (iova >> (level * data->bits_per_level + ilog2(sizeof(dart_iopte)))) & + ((1 << data->bits_per_level) - 1); } -static int dart_get_l1_index(struct dart_io_pgtable *data, unsigned long iova) -{ - - return (iova >> (2 * data->bits_per_level + ilog2(sizeof(dart_iopte)))) & - ((1 << data->bits_per_level) - 1); -} - -static int dart_get_l2_index(struct dart_io_pgtable *data, unsigned long iova) +static int dart_get_last_index(struct dart_io_pgtable *data, unsigned long iova) { return (iova >> (data->bits_per_level + ilog2(sizeof(dart_iopte)))) & ((1 << data->bits_per_level) - 1); } -static dart_iopte *dart_get_l2(struct dart_io_pgtable *data, unsigned long iova) +static dart_iopte *dart_get_last(struct dart_io_pgtable *data, unsigned long iova) { dart_iopte pte, *ptep; - int tbl = dart_get_table(data, iova); + int level = data->levels; + int tbl = dart_get_index(data, iova, level); + + if (tbl >= (1 << data->tbl_bits)) + return NULL; ptep = data->pgd[tbl]; if (!ptep) return NULL; - ptep += dart_get_l1_index(data, iova); - pte = READ_ONCE(*ptep); + while (--level > 1) { + ptep += dart_get_index(data, iova, level); + pte = READ_ONCE(*ptep); - /* Valid entry? */ - if (!pte) - return NULL; + /* Valid entry? 
*/ + if (!pte) + return NULL; - /* Deref to get level 2 table */ - return iopte_deref(pte, data); + /* Deref to get next level table */ + ptep = iopte_deref(pte, data); + } + + return ptep; } static dart_iopte dart_prot_to_pte(struct dart_io_pgtable *data, @@ -211,6 +205,7 @@ static dart_iopte dart_prot_to_pte(struct dart_io_pgtable *data, dart_iopte pte = 0; if (data->iop.fmt == APPLE_DART) { + pte |= APPLE_DART1_PTE_PROT_SP_DIS; if (!(prot & IOMMU_WRITE)) pte |= APPLE_DART1_PTE_PROT_NO_WRITE; if (!(prot & IOMMU_READ)) @@ -238,6 +233,7 @@ static int dart_map_pages(struct io_pgtable_ops *ops, unsigned long iova, int ret = 0, tbl, num_entries, max_entries, map_idx_start; dart_iopte pte, *cptep, *ptep; dart_iopte prot; + int level = data->levels; if (WARN_ON(pgsize != cfg->pgsize_bitmap)) return -EINVAL; @@ -245,35 +241,39 @@ static int dart_map_pages(struct io_pgtable_ops *ops, unsigned long iova, if (WARN_ON(paddr >> cfg->oas)) return -ERANGE; - /* If no access, then nothing to do */ if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) - return 0; + return -EINVAL; + + tbl = dart_get_index(data, iova, level); - tbl = dart_get_table(data, iova); + if (tbl >= (1 << data->tbl_bits)) + return -ENOMEM; ptep = data->pgd[tbl]; - ptep += dart_get_l1_index(data, iova); - pte = READ_ONCE(*ptep); + while (--level > 1) { + ptep += dart_get_index(data, iova, level); + pte = READ_ONCE(*ptep); - /* no L2 table present */ - if (!pte) { - cptep = __dart_alloc_pages(tblsz, gfp); - if (!cptep) - return -ENOMEM; + /* no table present */ + if (!pte) { + cptep = iommu_alloc_pages_sz(gfp, tblsz); + if (!cptep) + return -ENOMEM; - pte = dart_install_table(cptep, ptep, 0, data); - if (pte) - iommu_free_pages(cptep, get_order(tblsz)); + pte = dart_install_table(cptep, ptep, 0, data); + if (pte) + iommu_free_pages(cptep); - /* L2 table is present (now) */ - pte = READ_ONCE(*ptep); - } + /* L2 table is present (now) */ + pte = READ_ONCE(*ptep); + } - ptep = iopte_deref(pte, data); + ptep = iopte_deref(pte, data); + } /* install a leaf entries into L2 table */ prot = dart_prot_to_pte(data, iommu_prot); - map_idx_start = dart_get_l2_index(data, iova); + map_idx_start = dart_get_last_index(data, iova); max_entries = DART_PTES_PER_TABLE(data) - map_idx_start; num_entries = min_t(int, pgcount, max_entries); ptep += map_idx_start; @@ -302,13 +302,13 @@ static size_t dart_unmap_pages(struct io_pgtable_ops *ops, unsigned long iova, if (WARN_ON(pgsize != cfg->pgsize_bitmap || !pgcount)) return 0; - ptep = dart_get_l2(data, iova); + ptep = dart_get_last(data, iova); /* Valid L2 IOPTE pointer? */ if (WARN_ON(!ptep)) return 0; - unmap_idx_start = dart_get_l2_index(data, iova); + unmap_idx_start = dart_get_last_index(data, iova); ptep += unmap_idx_start; max_entries = DART_PTES_PER_TABLE(data) - unmap_idx_start; @@ -339,13 +339,13 @@ static phys_addr_t dart_iova_to_phys(struct io_pgtable_ops *ops, struct dart_io_pgtable *data = io_pgtable_ops_to_data(ops); dart_iopte pte, *ptep; - ptep = dart_get_l2(data, iova); + ptep = dart_get_last(data, iova); /* Valid L2 IOPTE pointer? 
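The single dart_get_index() above replaces the old fixed L1/L2 helpers by parameterising the shift with the level number. A standalone sketch of the arithmetic, assuming 8-byte PTEs (so ilog2(sizeof(dart_iopte)) == 3) and a 4K granule; level 1 is the leaf level, matching dart_get_last_index():

#include <stdio.h>

/* Each level consumes bits_per_level = pg_shift - 3 bits of the IOVA;
 * level 1 is the leaf, higher levels sit further up the address. */
static int dart_index(unsigned long long iova, int level, int pg_shift)
{
	int bits_per_level = pg_shift - 3;	/* 8-byte PTEs: ilog2(8) == 3 */

	return (iova >> (level * bits_per_level + 3)) &
	       ((1 << bits_per_level) - 1);
}

int main(void)
{
	unsigned long long iova = 0x12345678000ULL;

	/* 4K granule (pg_shift == 12): 9 IOVA bits per level */
	for (int level = 3; level >= 1; level--)
		printf("level %d index: %d\n", level, dart_index(iova, level, 12));
	return 0;
}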
*/ if (!ptep) return 0; - ptep += dart_get_l2_index(data, iova); + ptep += dart_get_last_index(data, iova); pte = READ_ONCE(*ptep); /* Found translation */ @@ -362,21 +362,37 @@ static struct dart_io_pgtable * dart_alloc_pgtable(struct io_pgtable_cfg *cfg) { struct dart_io_pgtable *data; - int tbl_bits, bits_per_level, va_bits, pg_shift; + int levels, max_tbl_bits, tbl_bits, bits_per_level, va_bits, pg_shift; + + /* + * Old 4K page DARTs can use up to 4 top-level tables. + * Newer ones only ever use a maximum of 1. + */ + if (cfg->pgsize_bitmap == SZ_4K) + max_tbl_bits = DART_MAX_TABLE_BITS; + else + max_tbl_bits = 0; pg_shift = __ffs(cfg->pgsize_bitmap); bits_per_level = pg_shift - ilog2(sizeof(dart_iopte)); va_bits = cfg->ias - pg_shift; - tbl_bits = max_t(int, 0, va_bits - (bits_per_level * DART_LEVELS)); - if ((1 << tbl_bits) > DART_MAX_TABLES) + levels = max_t(int, 2, (va_bits - max_tbl_bits + bits_per_level - 1) / bits_per_level); + + if (levels > (DART_MAX_LEVELS - 1)) + return NULL; + + tbl_bits = max_t(int, 0, va_bits - (bits_per_level * levels)); + + if (tbl_bits > max_tbl_bits) return NULL; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return NULL; + data->levels = levels + 1; /* Table level counts as one level */ data->tbl_bits = tbl_bits; data->bits_per_level = bits_per_level; @@ -412,9 +428,11 @@ apple_dart_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) return NULL; cfg->apple_dart_cfg.n_ttbrs = 1 << data->tbl_bits; + cfg->apple_dart_cfg.n_levels = data->levels; for (i = 0; i < cfg->apple_dart_cfg.n_ttbrs; ++i) { - data->pgd[i] = __dart_alloc_pages(DART_GRANULE(data), GFP_KERNEL); + data->pgd[i] = + iommu_alloc_pages_sz(GFP_KERNEL, DART_GRANULE(data)); if (!data->pgd[i]) goto out_free_data; cfg->apple_dart_cfg.ttbr[i] = virt_to_phys(data->pgd[i]); @@ -424,32 +442,37 @@ apple_dart_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) out_free_data: while (--i >= 0) { - iommu_free_pages(data->pgd[i], - get_order(DART_GRANULE(data))); + iommu_free_pages(data->pgd[i]); } kfree(data); return NULL; } -static void apple_dart_free_pgtable(struct io_pgtable *iop) +static void apple_dart_free_pgtables(struct dart_io_pgtable *data, dart_iopte *ptep, int level) { - struct dart_io_pgtable *data = io_pgtable_to_data(iop); - int order = get_order(DART_GRANULE(data)); - dart_iopte *ptep, *end; - int i; + dart_iopte *end; + dart_iopte *start = ptep; - for (i = 0; i < (1 << data->tbl_bits) && data->pgd[i]; ++i) { - ptep = data->pgd[i]; + if (level > 1) { end = (void *)ptep + DART_GRANULE(data); while (ptep != end) { dart_iopte pte = *ptep++; if (pte) - iommu_free_pages(iopte_deref(pte, data), order); + apple_dart_free_pgtables(data, iopte_deref(pte, data), level - 1); } - iommu_free_pages(data->pgd[i], order); } + iommu_free_pages(start); +} + +static void apple_dart_free_pgtable(struct io_pgtable *iop) +{ + struct dart_io_pgtable *data = io_pgtable_to_data(iop); + int i; + + for (i = 0; i < (1 << data->tbl_bits) && data->pgd[i]; ++i) + apple_dart_free_pgtables(data, data->pgd[i], data->levels - 1); kfree(data); } diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c index 8841c1487f00..843fec8e8a51 100644 --- a/drivers/iommu/io-pgtable.c +++ b/drivers/iommu/io-pgtable.c @@ -28,10 +28,6 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = { #ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S [ARM_V7S] = &io_pgtable_arm_v7s_init_fns, #endif -#ifdef CONFIG_AMD_IOMMU - [AMD_IOMMU_V1] = &io_pgtable_amd_iommu_v1_init_fns, - [AMD_IOMMU_V2] = &io_pgtable_amd_iommu_v2_init_fns, -#endif 
}; static int check_custom_allocator(enum io_pgtable_fmt fmt, diff --git a/drivers/iommu/iommu-pages.c b/drivers/iommu/iommu-pages.c new file mode 100644 index 000000000000..3bab175d8557 --- /dev/null +++ b/drivers/iommu/iommu-pages.c @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, Google LLC. + * Pasha Tatashin <pasha.tatashin@soleen.com> + */ +#include "iommu-pages.h" +#include <linux/dma-mapping.h> +#include <linux/gfp.h> +#include <linux/mm.h> + +#define IOPTDESC_MATCH(pg_elm, elm) \ + static_assert(offsetof(struct page, pg_elm) == \ + offsetof(struct ioptdesc, elm)) +IOPTDESC_MATCH(flags, __page_flags); +IOPTDESC_MATCH(lru, iopt_freelist_elm); /* Ensure bit 0 is clear */ +IOPTDESC_MATCH(mapping, __page_mapping); +IOPTDESC_MATCH(private, _private); +IOPTDESC_MATCH(page_type, __page_type); +IOPTDESC_MATCH(_refcount, __page_refcount); +#ifdef CONFIG_MEMCG +IOPTDESC_MATCH(memcg_data, memcg_data); +#endif +#undef IOPTDESC_MATCH +static_assert(sizeof(struct ioptdesc) <= sizeof(struct page)); + +static inline size_t ioptdesc_mem_size(struct ioptdesc *desc) +{ + return 1UL << (folio_order(ioptdesc_folio(desc)) + PAGE_SHIFT); +} + +/** + * iommu_alloc_pages_node_sz - Allocate a zeroed page of a given size from + * a specific NUMA node + * @nid: memory NUMA node id + * @gfp: buddy allocator flags + * @size: Memory size to allocate, rounded up to a power of 2 + * + * Returns the virtual address of the allocated page. The page must be freed + * either by calling iommu_free_pages() or via iommu_put_pages_list(). The + * returned allocation is roundup_pow_of_two(size) big, and is physically aligned + * to its size. + */ +void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size) +{ + struct ioptdesc *iopt; + unsigned long pgcnt; + struct folio *folio; + unsigned int order; + + /* This uses page_address() on the memory. */ + if (WARN_ON(gfp & __GFP_HIGHMEM)) + return NULL; + + /* + * Currently sub page allocations result in a full page being returned. + */ + order = get_order(size); + + /* + * __folio_alloc_node() does not handle NUMA_NO_NODE like + * alloc_pages_node() did. + */ + if (nid == NUMA_NO_NODE) + nid = numa_mem_id(); + + folio = __folio_alloc_node(gfp | __GFP_ZERO, order, nid); + if (unlikely(!folio)) + return NULL; + + iopt = folio_ioptdesc(folio); + iopt->incoherent = false; + + /* + * All page allocations that should be reported as "iommu-pagetables" + * to userspace must use one of the functions below. This includes + * allocations of page-tables and other per-iommu_domain configuration + * structures. + * + * This is necessary for the proper accounting as IOMMU state can be + * rather large, i.e. multiple gigabytes in size. + */ + pgcnt = 1UL << order; + mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, pgcnt); + lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, pgcnt); + + return folio_address(folio); +} +EXPORT_SYMBOL_GPL(iommu_alloc_pages_node_sz); + +static void __iommu_free_desc(struct ioptdesc *iopt) +{ + struct folio *folio = ioptdesc_folio(iopt); + const unsigned long pgcnt = folio_nr_pages(folio); + + if (IOMMU_PAGES_USE_DMA_API) + WARN_ON_ONCE(iopt->incoherent); + + mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, -pgcnt); + lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, -pgcnt); + folio_put(folio); +} + +/** + * iommu_free_pages - free pages + * @virt: virtual address of the page to be freed. + * + * The page must have been allocated by iommu_alloc_pages_node_sz() + */ +void iommu_free_pages(void *virt) +{ + if (!virt) + return; + __iommu_free_desc(virt_to_ioptdesc(virt)); +} +EXPORT_SYMBOL_GPL(iommu_free_pages);
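Since the kernel-doc above says the allocation ends up roundup_pow_of_two(size) big, the size-to-order step can be sanity-checked in isolation. A user-space sketch that re-derives get_order() under a 4K PAGE_SHIFT assumption (the real macro comes from asm/page.h; this copy is purely illustrative):

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumes 4K pages */

/* Smallest order such that (1UL << (order + PAGE_SHIFT)) >= size */
static int get_order(unsigned long size)
{
	int order = 0;

	size = (size - 1) >> PAGE_SHIFT;
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

int main(void)
{
	/* A 64-byte starting-level table still consumes one full 4K page */
	printf("get_order(64)   = %d\n", get_order(64));	/* 0 -> 4K  */
	printf("get_order(8192) = %d\n", get_order(8192));	/* 1 -> 8K  */
	printf("get_order(9000) = %d\n", get_order(9000));	/* 2 -> 16K */
	return 0;
}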
+ +/** + * iommu_put_pages_list - free a list of pages. + * @list: The list of pages to be freed + * + * Frees a list of pages allocated by iommu_alloc_pages_node_sz(). On return the + * passed list is invalid; the caller must use IOMMU_PAGES_LIST_INIT to reinit + * the list if it expects to use it again. + */ +void iommu_put_pages_list(struct iommu_pages_list *list) +{ + struct ioptdesc *iopt, *tmp; + + list_for_each_entry_safe(iopt, tmp, &list->pages, iopt_freelist_elm) + __iommu_free_desc(iopt); +} +EXPORT_SYMBOL_GPL(iommu_put_pages_list); + +/** + * iommu_pages_start_incoherent - Set up the page for cache-incoherent operation + * @virt: The page to setup + * @dma_dev: The iommu device + * + * For incoherent memory this will use the DMA API to manage the cache flushing + * on some arches. This is a lot of complexity compared to just calling + * arch_sync_dma_for_device(), but it is what the existing ARM iommu drivers + * have been doing. The DMA API requires keeping track of the DMA map and + * freeing it when required. This keeps track of the dma map inside the ioptdesc + * so that error paths are simple for the caller. + */ +int iommu_pages_start_incoherent(void *virt, struct device *dma_dev) +{ + struct ioptdesc *iopt = virt_to_ioptdesc(virt); + dma_addr_t dma; + + if (WARN_ON(iopt->incoherent)) + return -EINVAL; + + if (!IOMMU_PAGES_USE_DMA_API) { + iommu_pages_flush_incoherent(dma_dev, virt, 0, + ioptdesc_mem_size(iopt)); + } else { + dma = dma_map_single(dma_dev, virt, ioptdesc_mem_size(iopt), + DMA_TO_DEVICE); + if (dma_mapping_error(dma_dev, dma)) + return -EINVAL; + + /* + * The DMA API is not allowed to do anything other than DMA + * direct. It would be nice to also check + * dev_is_dma_coherent(dma_dev). + */ + if (WARN_ON(dma != virt_to_phys(virt))) { + dma_unmap_single(dma_dev, dma, ioptdesc_mem_size(iopt), + DMA_TO_DEVICE); + return -EOPNOTSUPP; + } + } + + iopt->incoherent = 1; + return 0; +} +EXPORT_SYMBOL_GPL(iommu_pages_start_incoherent); + +/** + * iommu_pages_start_incoherent_list - Make a list of pages incoherent + * @list: The list of pages to setup + * @dma_dev: The iommu device + * + * Perform iommu_pages_start_incoherent() across all of the list. + * + * If this fails the caller must call iommu_pages_stop_incoherent_list(). + */ +int iommu_pages_start_incoherent_list(struct iommu_pages_list *list, + struct device *dma_dev) +{ + struct ioptdesc *cur; + int ret; + + list_for_each_entry(cur, &list->pages, iopt_freelist_elm) { + if (WARN_ON(cur->incoherent)) + continue; + + ret = iommu_pages_start_incoherent( + folio_address(ioptdesc_folio(cur)), dma_dev); + if (ret) + return ret; + } + return 0; +} +EXPORT_SYMBOL_GPL(iommu_pages_start_incoherent_list); + +/** + * iommu_pages_stop_incoherent_list - Undo incoherence across a list + * @list: The list of pages to release + * @dma_dev: The iommu device + * + * Revert iommu_pages_start_incoherent() across all of the list. Pages that did + * not call or succeed iommu_pages_start_incoherent() will be ignored. + */ +#if IOMMU_PAGES_USE_DMA_API +void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list, + struct device *dma_dev) +{ + struct ioptdesc *cur; + + list_for_each_entry(cur, &list->pages, iopt_freelist_elm) { + struct folio *folio = ioptdesc_folio(cur); + + if (!cur->incoherent) + continue; + dma_unmap_single(dma_dev, virt_to_phys(folio_address(folio)), + ioptdesc_mem_size(cur), DMA_TO_DEVICE); + cur->incoherent = 0; + } +} +EXPORT_SYMBOL_GPL(iommu_pages_stop_incoherent_list);
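Taken together, the helpers above imply a specific call sequence for a driver whose page-table walker is not cache-coherent. A sketch of that flow in kernel style (the demo_* functions are hypothetical; only the iommu_pages_* and iommu_alloc/free calls are from this file):

/* Hypothetical setup of one non-coherent table (error handling trimmed) */
static void *demo_alloc_table(struct device *dma_dev)
{
	void *tbl;

	tbl = iommu_alloc_pages_node_sz(dev_to_node(dma_dev), GFP_KERNEL, SZ_4K);
	if (!tbl)
		return NULL;

	/* DMA-map (or just flush, on x86) so the walker sees the zeroes */
	if (iommu_pages_start_incoherent(tbl, dma_dev)) {
		iommu_free_pages(tbl);
		return NULL;
	}
	return tbl;
}

/* Hypothetical PTE update: push only the modified bytes to the walker */
static void demo_update_pte(struct device *dma_dev, u64 *tbl, int idx, u64 pte)
{
	tbl[idx] = pte;
	iommu_pages_flush_incoherent(dma_dev, tbl, idx * sizeof(*tbl),
				     sizeof(*tbl));
}

On teardown such a driver would undo the mapping with iommu_pages_stop_incoherent_list(), or free pages one at a time with iommu_pages_free_incoherent().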
+ +/** + * iommu_pages_free_incoherent - Free an incoherent page + * @virt: virtual address of the page to be freed. + * @dma_dev: The iommu device + * + * If the page is incoherent it is made coherent again, then freed. + */ +void iommu_pages_free_incoherent(void *virt, struct device *dma_dev) +{ + struct ioptdesc *iopt = virt_to_ioptdesc(virt); + + if (iopt->incoherent) { + dma_unmap_single(dma_dev, virt_to_phys(virt), + ioptdesc_mem_size(iopt), DMA_TO_DEVICE); + iopt->incoherent = 0; + } + __iommu_free_desc(iopt); +} +EXPORT_SYMBOL_GPL(iommu_pages_free_incoherent); +#endif
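Between iommu_put_pages_list() above and the list helpers added in iommu-pages.h just below, the intended gather-then-free pattern looks roughly like this (a sketch; IOMMU_PAGES_LIST_INIT is the initializer named in the kernel-doc above, demo_teardown is hypothetical):

/* Hypothetical teardown: queue tables, flush the IOTLB, then free */
static void demo_teardown(void *tbl_a, void *tbl_b)
{
	struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist);

	/* Unhook the tables from the live page table, then queue them */
	iommu_pages_list_add(&freelist, tbl_a);
	iommu_pages_list_add(&freelist, tbl_b);

	/* ... invalidate the IOTLB here, while the memory is still held ... */

	/* Release everything in one pass */
	iommu_put_pages_list(&freelist);
}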
diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h index 82ebf0033081..ae9da4f571f6 100644 --- a/drivers/iommu/iommu-pages.h +++ b/drivers/iommu/iommu-pages.h @@ -7,180 +7,142 @@ #ifndef __IOMMU_PAGES_H #define __IOMMU_PAGES_H -#include <linux/vmstat.h> -#include <linux/gfp.h> -#include <linux/mm.h> - -/* - * All page allocations that should be reported to as "iommu-pagetables" to - * userspace must use one of the functions below. This includes allocations of - * page-tables and other per-iommu_domain configuration structures. - * - * This is necessary for the proper accounting as IOMMU state can be rather - * large, i.e. multiple gigabytes in size. - */ +#include <linux/iommu.h> /** - * __iommu_alloc_account - account for newly allocated page. - * @page: head struct page of the page. - * @order: order of the page + * struct ioptdesc - Memory descriptor for IOMMU page tables + * @iopt_freelist_elm: List element for a struct iommu_pages_list + * + * This struct overlays struct page for now. Do not modify without a good + * understanding of the issues. */ -static inline void __iommu_alloc_account(struct page *page, int order) +struct ioptdesc { + unsigned long __page_flags; + + struct list_head iopt_freelist_elm; + unsigned long __page_mapping; + union { + u8 incoherent; + pgoff_t __index; + }; + void *_private; + + unsigned int __page_type; + atomic_t __page_refcount; +#ifdef CONFIG_MEMCG + unsigned long memcg_data; +#endif +}; + +static inline struct ioptdesc *folio_ioptdesc(struct folio *folio) { - const long pgcnt = 1l << order; - - mod_node_page_state(page_pgdat(page), NR_IOMMU_PAGES, pgcnt); - mod_lruvec_page_state(page, NR_SECONDARY_PAGETABLE, pgcnt); + return (struct ioptdesc *)folio; } -/** - * __iommu_free_account - account a page that is about to be freed. - * @page: head struct page of the page. - * @order: order of the page - */ -static inline void __iommu_free_account(struct page *page, int order) +static inline struct folio *ioptdesc_folio(struct ioptdesc *iopt) { - const long pgcnt = 1l << order; - - mod_node_page_state(page_pgdat(page), NR_IOMMU_PAGES, -pgcnt); - mod_lruvec_page_state(page, NR_SECONDARY_PAGETABLE, -pgcnt); + return (struct folio *)iopt; } -/** - * __iommu_alloc_pages - allocate a zeroed page of a given order. - * @gfp: buddy allocator flags - * @order: page order - * - * returns the head struct page of the allocated page. - */ -static inline struct page *__iommu_alloc_pages(gfp_t gfp, int order) +static inline struct ioptdesc *virt_to_ioptdesc(void *virt) { - struct page *page; - - page = alloc_pages(gfp | __GFP_ZERO, order); - if (unlikely(!page)) - return NULL; - - __iommu_alloc_account(page, order); - - return page; + return folio_ioptdesc(virt_to_folio(virt)); } +void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size); +void iommu_free_pages(void *virt); +void iommu_put_pages_list(struct iommu_pages_list *list); + /** - * __iommu_free_pages - free page of a given order - * @page: head struct page of the page - * @order: page order + * iommu_pages_list_add - add the page to an iommu_pages_list + * @list: List to add the page to + * @virt: Address returned from iommu_alloc_pages_node_sz() */ -static inline void __iommu_free_pages(struct page *page, int order) +static inline void iommu_pages_list_add(struct iommu_pages_list *list, + void *virt) { - if (!page) - return; - - __iommu_free_account(page, order); - __free_pages(page, order); + list_add_tail(&virt_to_ioptdesc(virt)->iopt_freelist_elm, &list->pages); } /** - * iommu_alloc_pages_node - allocate a zeroed page of a given order from - * specific NUMA node. - * @nid: memory NUMA node id - * @gfp: buddy allocator flags - * @order: page order + * iommu_pages_list_splice - Put all the pages in list @from into list @to + * @from: Source list of pages + * @to: Destination list of pages * - * returns the virtual address of the allocated page + * @from must be re-initialized after calling this function if it is to be + * used again. */ -static inline void *iommu_alloc_pages_node(int nid, gfp_t gfp, int order) +static inline void iommu_pages_list_splice(struct iommu_pages_list *from, + struct iommu_pages_list *to) { - struct page *page = alloc_pages_node(nid, gfp | __GFP_ZERO, order); - - if (unlikely(!page)) - return NULL; - - __iommu_alloc_account(page, order); - - return page_address(page); + list_splice(&from->pages, &to->pages); } /** - * iommu_alloc_pages - allocate a zeroed page of a given order - * @gfp: buddy allocator flags - * @order: page order - * - * returns the virtual address of the allocated page + * iommu_pages_list_empty - True if the list is empty + * @list: List to check */ -static inline void *iommu_alloc_pages(gfp_t gfp, int order) +static inline bool iommu_pages_list_empty(struct iommu_pages_list *list) { - struct page *page = __iommu_alloc_pages(gfp, order); - - if (unlikely(!page)) - return NULL; - - return page_address(page); + return list_empty(&list->pages); } /** - * iommu_alloc_page_node - allocate a zeroed page at specific NUMA node. + * iommu_alloc_pages_sz - Allocate a zeroed page of a given size - * @nid: memory NUMA node id * @gfp: buddy allocator flags + * @size: Memory size to allocate, this is rounded up to a power of 2 * - * returns the virtual address of the allocated page + * Returns the virtual address of the allocated page.
*/ -static inline void *iommu_alloc_page_node(int nid, gfp_t gfp) +static inline void *iommu_alloc_pages_sz(gfp_t gfp, size_t size) { - return iommu_alloc_pages_node(nid, gfp, 0); + return iommu_alloc_pages_node_sz(NUMA_NO_NODE, gfp, size); } -/** - * iommu_alloc_page - allocate a zeroed page - * @gfp: buddy allocator flags - * - * returns the virtual address of the allocated page - */ -static inline void *iommu_alloc_page(gfp_t gfp) +int iommu_pages_start_incoherent(void *virt, struct device *dma_dev); +int iommu_pages_start_incoherent_list(struct iommu_pages_list *list, + struct device *dma_dev); + +#ifdef CONFIG_X86 +#define IOMMU_PAGES_USE_DMA_API 0 +#include <linux/cacheflush.h> + +static inline void iommu_pages_flush_incoherent(struct device *dma_dev, + void *virt, size_t offset, + size_t len) { - return iommu_alloc_pages(gfp, 0); + clflush_cache_range(virt + offset, len); } - -/** - * iommu_free_pages - free page of a given order - * @virt: virtual address of the page to be freed. - * @order: page order - */ -static inline void iommu_free_pages(void *virt, int order) +static inline void +iommu_pages_stop_incoherent_list(struct iommu_pages_list *list, + struct device *dma_dev) { - if (!virt) - return; - - __iommu_free_pages(virt_to_page(virt), order); + /* + * For performance leave the incoherent flag alone which turns this into + * a NOP. For X86 the rest of the stop/free flow ignores the flag. + */ } - -/** - * iommu_free_page - free page - * @virt: virtual address of the page to be freed. - */ -static inline void iommu_free_page(void *virt) +static inline void iommu_pages_free_incoherent(void *virt, + struct device *dma_dev) { - iommu_free_pages(virt, 0); + iommu_free_pages(virt); } +#else +#define IOMMU_PAGES_USE_DMA_API 1 +#include <linux/dma-mapping.h> -/** - * iommu_put_pages_list - free a list of pages. - * @page: the head of the lru list to be freed. - * - * There are no locking requirement for these pages, as they are going to be - * put on a free list as soon as refcount reaches 0. Pages are put on this LRU - * list once they are removed from the IOMMU page tables. However, they can - * still be access through debugfs. 
- */ -static inline void iommu_put_pages_list(struct list_head *page) +static inline void iommu_pages_flush_incoherent(struct device *dma_dev, + void *virt, size_t offset, + size_t len) { - while (!list_empty(page)) { - struct page *p = list_entry(page->prev, struct page, lru); - - list_del(&p->lru); - __iommu_free_account(p, 0); - put_page(p); - } + dma_sync_single_for_device(dma_dev, (uintptr_t)virt + offset, len, + DMA_TO_DEVICE); } +void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list, + struct device *dma_dev); +void iommu_pages_free_incoherent(void *virt, struct device *dma_dev); +#endif -#endif /* __IOMMU_PAGES_H */ +#endif /* __IOMMU_PAGES_H */ diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index 5f731d994803..c95394cd03a7 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -5,6 +5,7 @@ #define __LINUX_IOMMU_PRIV_H #include <linux/iommu.h> +#include <linux/msi.h> static inline const struct iommu_ops *dev_iommu_ops(struct device *dev) { @@ -17,8 +18,16 @@ static inline const struct iommu_ops *dev_iommu_ops(struct device *dev) return dev->iommu->iommu_dev->ops; } -int iommu_group_replace_domain(struct iommu_group *group, - struct iommu_domain *new_domain); +void dev_iommu_free(struct device *dev); + +const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode); + +static inline const struct iommu_ops *iommu_fwspec_ops(struct iommu_fwspec *fwspec) +{ + return iommu_ops_from_fwnode(fwspec ? fwspec->iommu_fwnode : NULL); +} + +void iommu_fwspec_free(struct device *dev); int iommu_device_register_bus(struct iommu_device *iommu, const struct iommu_ops *ops, @@ -28,4 +37,32 @@ void iommu_device_unregister_bus(struct iommu_device *iommu, const struct bus_type *bus, struct notifier_block *nb); +int iommu_mock_device_add(struct device *dev, struct iommu_device *iommu); + +struct iommu_attach_handle *iommu_attach_handle_get(struct iommu_group *group, + ioasid_t pasid, + unsigned int type); +int iommu_attach_group_handle(struct iommu_domain *domain, + struct iommu_group *group, + struct iommu_attach_handle *handle); +void iommu_detach_group_handle(struct iommu_domain *domain, + struct iommu_group *group); +int iommu_replace_group_handle(struct iommu_group *group, + struct iommu_domain *new_domain, + struct iommu_attach_handle *handle); + +#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER_CORE) && IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) +int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr); +#else /* !CONFIG_IOMMUFD_DRIVER_CORE || !CONFIG_IRQ_MSI_IOMMU */ +static inline int iommufd_sw_msi(struct iommu_domain *domain, + struct msi_desc *desc, phys_addr_t msi_addr) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_IOMMUFD_DRIVER_CORE && CONFIG_IRQ_MSI_IOMMU */ + +int iommu_replace_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_attach_handle *handle); #endif /* __LINUX_IOMMU_PRIV_H */ diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 18a35e798b72..d236aef80a8d 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -10,6 +10,10 @@ #include "iommu-priv.h" static DEFINE_MUTEX(iommu_sva_lock); +static bool iommu_sva_present; +static LIST_HEAD(iommu_sva_mms); +static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm); /* Allocate a PASID for the mm within range (inclusive) */ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct device *dev) @@ -40,8 +44,8 
@@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de return ERR_PTR(-ENOSPC); } iommu_mm->pasid = pasid; + iommu_mm->mm = mm; INIT_LIST_HEAD(&iommu_mm->sva_domains); - INIT_LIST_HEAD(&iommu_mm->sva_handles); /* * Make sure the write to mm->iommu_mm is not reordered in front of * initialization to iommu_mm fields. If it does, readers may see a @@ -62,18 +66,20 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de * reference is taken. Caller must call iommu_sva_unbind_device() * to release each reference. * - * iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA) must be called first, to - * initialize the required SVA features. - * * On error, returns an ERR_PTR value. */ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) { + struct iommu_group *group = dev->iommu_group; + struct iommu_attach_handle *attach_handle; struct iommu_mm_data *iommu_mm; struct iommu_domain *domain; struct iommu_sva *handle; int ret; + if (!group) + return ERR_PTR(-ENODEV); + mutex_lock(&iommu_sva_lock); /* Allocate mm->pasid if necessary. */ @@ -83,12 +89,22 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm goto out_unlock; } - list_for_each_entry(handle, &mm->iommu_mm->sva_handles, handle_item) { - if (handle->dev == dev) { - refcount_inc(&handle->users); - mutex_unlock(&iommu_sva_lock); - return handle; + /* A bond already exists, just take a reference. */ + attach_handle = iommu_attach_handle_get(group, iommu_mm->pasid, IOMMU_DOMAIN_SVA); + if (!IS_ERR(attach_handle)) { + handle = container_of(attach_handle, struct iommu_sva, handle); + if (attach_handle->domain->mm != mm) { + ret = -EBUSY; + goto out_unlock; } + refcount_inc(&handle->users); + mutex_unlock(&iommu_sva_lock); + return handle; + } + + if (PTR_ERR(attach_handle) != -ENOENT) { + ret = PTR_ERR(attach_handle); + goto out_unlock; } handle = kzalloc(sizeof(*handle), GFP_KERNEL); @@ -99,7 +115,8 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm /* Search for an existing domain.
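For context, the refcounting above makes repeated binds of the same (dev, mm) pair cheap; a sketch of the pairing from a hypothetical driver (demo_attach_process is illustrative, the iommu_sva_* calls are the public API):

static int demo_attach_process(struct device *dev, struct mm_struct *mm)
{
	struct iommu_sva *bond;
	u32 pasid;

	bond = iommu_sva_bind_device(dev, mm);
	if (IS_ERR(bond))
		return PTR_ERR(bond);

	pasid = iommu_sva_get_pasid(bond);
	/* ... program pasid into the device and run DMA tagged with it ... */

	iommu_sva_unbind_device(bond);	/* drops this caller's reference */
	return 0;
}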
*/ list_for_each_entry(domain, &mm->iommu_mm->sva_domains, next) { - ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid); + ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid, + &handle->handle); if (!ret) { domain->users++; goto out; @@ -113,18 +130,22 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm goto out_free_handle; } - ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid); + ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid, + &handle->handle); if (ret) goto out_free_domain; domain->users = 1; - list_add(&domain->next, &mm->iommu_mm->sva_domains); + if (list_empty(&iommu_mm->sva_domains)) { + if (list_empty(&iommu_sva_mms)) + iommu_sva_present = true; + list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms); + } + list_add(&domain->next, &iommu_mm->sva_domains); out: refcount_set(&handle->users, 1); - list_add(&handle->handle_item, &mm->iommu_mm->sva_handles); mutex_unlock(&iommu_sva_lock); handle->dev = dev; - handle->domain = domain; return handle; out_free_domain: @@ -147,7 +168,7 @@ EXPORT_SYMBOL_GPL(iommu_sva_bind_device); */ void iommu_sva_unbind_device(struct iommu_sva *handle) { - struct iommu_domain *domain = handle->domain; + struct iommu_domain *domain = handle->handle.domain; struct iommu_mm_data *iommu_mm = domain->mm->iommu_mm; struct device *dev = handle->dev; @@ -156,13 +177,19 @@ void iommu_sva_unbind_device(struct iommu_sva *handle) mutex_unlock(&iommu_sva_lock); return; } - list_del(&handle->handle_item); iommu_detach_device_pasid(domain, dev, iommu_mm->pasid); if (--domain->users == 0) { list_del(&domain->next); iommu_domain_free(domain); } + + if (list_empty(&iommu_mm->sva_domains)) { + list_del(&iommu_mm->mm_list_elm); + if (list_empty(&iommu_sva_mms)) + iommu_sva_present = false; + } + mutex_unlock(&iommu_sva_lock); kfree(handle); } @@ -170,7 +197,7 @@ EXPORT_SYMBOL_GPL(iommu_sva_unbind_device); u32 iommu_sva_get_pasid(struct iommu_sva *handle) { - struct iommu_domain *domain = handle->domain; + struct iommu_domain *domain = handle->handle.domain; return mm_get_enqcmd_pasid(domain->mm); } @@ -259,7 +286,8 @@ static void iommu_sva_handle_iopf(struct work_struct *work) if (status != IOMMU_PAGE_RESP_SUCCESS) break; - status = iommu_sva_handle_mm(&iopf->fault, group->domain->mm); + status = iommu_sva_handle_mm(&iopf->fault, + group->attach_handle->domain->mm); } iopf_group_response(group, status); @@ -277,23 +305,21 @@ static int iommu_sva_iopf_handler(struct iopf_group *group) return 0; } -struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, - struct mm_struct *mm) +static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm) { const struct iommu_ops *ops = dev_iommu_ops(dev); struct iommu_domain *domain; - if (ops->domain_alloc_sva) { - domain = ops->domain_alloc_sva(dev, mm); - if (IS_ERR(domain)) - return domain; - } else { - domain = ops->domain_alloc(IOMMU_DOMAIN_SVA); - if (!domain) - return ERR_PTR(-ENOMEM); - } + if (!ops->domain_alloc_sva) + return ERR_PTR(-EOPNOTSUPP); + + domain = ops->domain_alloc_sva(dev, mm); + if (IS_ERR(domain)) + return domain; domain->type = IOMMU_DOMAIN_SVA; + domain->cookie_type = IOMMU_COOKIE_SVA; mmgrab(mm); domain->mm = mm; domain->owner = ops; @@ -301,3 +327,15 @@ struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, return domain; } + +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) +{ + struct iommu_mm_data *iommu_mm; + + guard(mutex)(&iommu_sva_lock); + if 
(!iommu_sva_present) + return; + + list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm) + mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end); +} diff --git a/drivers/iommu/iommu-sysfs.c b/drivers/iommu/iommu-sysfs.c index cbe378c34ba3..170022c09536 100644 --- a/drivers/iommu/iommu-sysfs.c +++ b/drivers/iommu/iommu-sysfs.c @@ -34,7 +34,7 @@ static void release_device(struct device *dev) kfree(dev); } -static struct class iommu_class = { +static const struct class iommu_class = { .name = "iommu", .dev_release = release_device, .dev_groups = dev_groups, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 9df7cc75c1bc..2ca990dfbb88 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -18,6 +18,7 @@ #include <linux/errno.h> #include <linux/host1x_context_bus.h> #include <linux/iommu.h> +#include <linux/iommufd.h> #include <linux/idr.h> #include <linux/err.h> #include <linux/pci.h> @@ -32,6 +33,7 @@ #include <trace/events/iommu.h> #include <linux/sched/mm.h> #include <linux/msi.h> +#include <uapi/linux/iommufd.h> #include "dma-iommu.h" #include "iommu-priv.h" @@ -44,6 +46,9 @@ static unsigned int iommu_def_domain_type __read_mostly; static bool iommu_dma_strict __read_mostly = IS_ENABLED(CONFIG_IOMMU_DEFAULT_DMA_STRICT); static u32 iommu_cmd_line __read_mostly; +/* Tags used with xa_tag_pointer() in group->pasid_array */ +enum { IOMMU_PASID_ARRAY_DOMAIN = 0, IOMMU_PASID_ARRAY_HANDLE = 1 }; + struct iommu_group { struct kobject kobj; struct kobject *devices_kobj; @@ -90,15 +95,17 @@ static const char * const iommu_group_resv_type_string[] = { #define IOMMU_CMD_LINE_DMA_API BIT(0) #define IOMMU_CMD_LINE_STRICT BIT(1) +static int bus_iommu_probe(const struct bus_type *bus); static int iommu_bus_notifier(struct notifier_block *nb, unsigned long action, void *data); static void iommu_release_device(struct device *dev); -static struct iommu_domain * -__iommu_group_domain_alloc(struct iommu_group *group, unsigned int type); static int __iommu_attach_device(struct iommu_domain *domain, - struct device *dev); + struct device *dev, struct iommu_domain *old); static int __iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group); +static struct iommu_domain *__iommu_paging_domain_alloc_flags(struct device *dev, + unsigned int type, + unsigned int flags); enum { IOMMU_SET_DOMAIN_MUST_SUCCEED = 1 << 0, @@ -107,6 +114,7 @@ enum { static int __iommu_device_set_domain(struct iommu_group *group, struct device *dev, struct iommu_domain *new_domain, + struct iommu_domain *old_domain, unsigned int flags); static int __iommu_group_set_domain_internal(struct iommu_group *group, struct iommu_domain *new_domain, @@ -133,6 +141,8 @@ static struct group_device *iommu_group_alloc_device(struct iommu_group *group, struct device *dev); static void __iommu_group_free_device(struct iommu_group *group, struct group_device *grp_dev); +static void iommu_domain_init(struct iommu_domain *domain, unsigned int type, + const struct iommu_ops *ops); #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store) \ struct iommu_group_attribute iommu_group_attr_##_name = \ @@ -268,6 +278,8 @@ int iommu_device_register(struct iommu_device *iommu, err = bus_iommu_probe(iommu_buses[i]); if (err) iommu_device_unregister(iommu); + else + WRITE_ONCE(iommu->ready, true); return err; } EXPORT_SYMBOL_GPL(iommu_device_register); @@ -293,6 +305,7 @@ void iommu_device_unregister_bus(struct iommu_device *iommu, struct notifier_block *nb) { bus_unregister_notifier(bus, nb); + 
fwnode_remove_software_node(iommu->fwnode); iommu_device_unregister(iommu); } EXPORT_SYMBOL_GPL(iommu_device_unregister_bus); @@ -315,6 +328,12 @@ int iommu_device_register_bus(struct iommu_device *iommu, if (err) return err; + iommu->fwnode = fwnode_create_software_node(NULL, NULL); + if (IS_ERR(iommu->fwnode)) { + bus_unregister_notifier(bus, nb); + return PTR_ERR(iommu->fwnode); + } + spin_lock(&iommu_device_lock); list_add_tail(&iommu->list, &iommu_device_list); spin_unlock(&iommu_device_lock); @@ -324,9 +343,28 @@ int iommu_device_register_bus(struct iommu_device *iommu, iommu_device_unregister_bus(iommu, bus, nb); return err; } + WRITE_ONCE(iommu->ready, true); return 0; } EXPORT_SYMBOL_GPL(iommu_device_register_bus); + +int iommu_mock_device_add(struct device *dev, struct iommu_device *iommu) +{ + int rc; + + mutex_lock(&iommu_probe_device_lock); + rc = iommu_fwspec_init(dev, iommu->fwnode); + mutex_unlock(&iommu_probe_device_lock); + + if (rc) + return rc; + + rc = device_add(dev); + if (rc) + iommu_fwspec_free(dev); + return rc; +} +EXPORT_SYMBOL_GPL(iommu_mock_device_add); #endif static struct dev_iommu *dev_iommu_get(struct device *dev) @@ -347,7 +385,7 @@ static struct dev_iommu *dev_iommu_get(struct device *dev) return param; } -static void dev_iommu_free(struct device *dev) +void dev_iommu_free(struct device *dev) { struct dev_iommu *param = dev->iommu; @@ -399,14 +437,42 @@ EXPORT_SYMBOL_GPL(dev_iommu_priv_set); * Init the dev->iommu and dev->iommu_group in the struct device and get the * driver probed */ -static int iommu_init_device(struct device *dev, const struct iommu_ops *ops) +static int iommu_init_device(struct device *dev) { + const struct iommu_ops *ops; struct iommu_device *iommu_dev; struct iommu_group *group; int ret; if (!dev_iommu_get(dev)) return -ENOMEM; + /* + * For FDT-based systems and ACPI IORT/VIOT, the common firmware parsing + * is buried in the bus dma_configure path. Properly unpicking that is + * still a big job, so for now just invoke the whole thing. The device + * already having a driver bound means dma_configure has already run and + * found no IOMMU to wait for, so there's no point calling it again. + */ + if (!dev->iommu->fwspec && !dev->driver && dev->bus->dma_configure) { + mutex_unlock(&iommu_probe_device_lock); + dev->bus->dma_configure(dev); + mutex_lock(&iommu_probe_device_lock); + /* If another instance finished the job for us, skip it */ + if (!dev->iommu || dev->iommu_group) + return -ENODEV; + } + /* + * At this point, relevant devices either now have a fwspec which will + * match ops registered with a non-NULL fwnode, or we can reasonably + * assume that only one of Intel, AMD, s390, PAMU or legacy SMMUv2 can + * be present, and that any of their registered instances has suitable + * ops for probing, and thus cheekily co-opt the same mechanism. + */ + ops = iommu_fwspec_ops(dev->iommu->fwspec); + if (!ops) { + ret = -ENODEV; + goto err_free; + } if (!try_module_get(ops->owner)) { ret = -EINVAL; @@ -477,8 +543,21 @@ static void iommu_deinit_device(struct device *dev) * Regardless, if a delayed attach never occurred, then the release * should still avoid touching any hardware configuration either. 
*/ - if (!dev->iommu->attach_deferred && ops->release_domain) - ops->release_domain->ops->attach_dev(ops->release_domain, dev); + if (!dev->iommu->attach_deferred && ops->release_domain) { + struct iommu_domain *release_domain = ops->release_domain; + + /* + * If the device requires direct mappings then it should not + * be parked on a BLOCKED domain during release as that would + * break the direct mappings. + */ + if (dev->iommu->require_direct && ops->identity_domain && + release_domain == ops->blocked_domain) + release_domain = ops->identity_domain; + + release_domain->ops->attach_dev(release_domain, dev, + group->domain); + } if (ops->release_device) ops->release_device(dev); @@ -503,35 +582,27 @@ static void iommu_deinit_device(struct device *dev) dev->iommu_group = NULL; module_put(ops->owner); dev_iommu_free(dev); +#ifdef CONFIG_IOMMU_DMA + dev->dma_iommu = false; +#endif +} + +static struct iommu_domain *pasid_array_entry_to_domain(void *entry) +{ + if (xa_pointer_tag(entry) == IOMMU_PASID_ARRAY_DOMAIN) + return xa_untag_pointer(entry); + return ((struct iommu_attach_handle *)xa_untag_pointer(entry))->domain; } DEFINE_MUTEX(iommu_probe_device_lock); static int __iommu_probe_device(struct device *dev, struct list_head *group_list) { - const struct iommu_ops *ops; - struct iommu_fwspec *fwspec; struct iommu_group *group; struct group_device *gdev; int ret; /* - * For FDT-based systems and ACPI IORT/VIOT, drivers register IOMMU - * instances with non-NULL fwnodes, and client devices should have been - * identified with a fwspec by this point. Otherwise, we can currently - * assume that only one of Intel, AMD, s390, PAMU or legacy SMMUv2 can - * be present, and that any of their registered instances has suitable - * ops for probing, and thus cheekily co-opt the same mechanism. - */ - fwspec = dev_iommu_fwspec_get(dev); - if (fwspec && fwspec->ops) - ops = fwspec->ops; - else - ops = iommu_ops_from_fwnode(NULL); - - if (!ops) - return -ENODEV; - /* * Serialise to avoid races between IOMMU drivers registering in * parallel and/or the "replay" calls from ACPI/OF code via client * driver probe. Once the latter have been cleaned up we should @@ -544,9 +615,15 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list if (dev->iommu_group) return 0; - ret = iommu_init_device(dev, ops); + ret = iommu_init_device(dev); if (ret) return ret; + /* + * And if we do now see any replay calls, they would indicate someone + * misusing the dma_configure path outside bus code. 
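The pasid_array above stores two different pointer types in one XArray slot, distinguished by the tag bits. A sketch of the pattern with the stock xa_tag_pointer()/xa_pointer_tag()/xa_untag_pointer() helpers (the pack_*/demo_* names are illustrative):

/* Writer side: the tag records which type lives in the slot */
static void *pack_domain(struct iommu_domain *domain)
{
	return xa_tag_pointer(domain, IOMMU_PASID_ARRAY_DOMAIN);
}

static void *pack_handle(struct iommu_attach_handle *handle)
{
	return xa_tag_pointer(handle, IOMMU_PASID_ARRAY_HANDLE);
}

/* Reader side: one branch on the tag, as pasid_array_entry_to_domain()
 * does above */
static struct iommu_domain *demo_entry_to_domain(void *entry)
{
	if (xa_pointer_tag(entry) == IOMMU_PASID_ARRAY_DOMAIN)
		return xa_untag_pointer(entry);
	return ((struct iommu_attach_handle *)xa_untag_pointer(entry))->domain;
}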
+ */ + if (dev->driver) + dev_WARN(dev, "late IOMMU probe at driver bind, something fishy here!\n"); group = dev->iommu_group; gdev = iommu_group_alloc_device(group, dev); @@ -565,7 +642,8 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list if (group->default_domain) iommu_create_device_direct_mappings(group->default_domain, dev); if (group->domain) { - ret = __iommu_device_set_domain(group, dev, group->domain, 0); + ret = __iommu_device_set_domain(group, dev, group->domain, NULL, + 0); if (ret) goto err_remove_gdev; } else if (!group->default_domain && !group_list) { @@ -1147,10 +1225,6 @@ map_end: } } - - if (!list_empty(&mappings) && iommu_is_dma_domain(domain)) - iommu_flush_iotlb_all(domain); - out: iommu_put_resv_regions(dev, &mappings); @@ -1592,12 +1666,57 @@ struct iommu_group *fsl_mc_device_group(struct device *dev) } EXPORT_SYMBOL_GPL(fsl_mc_device_group); +static struct iommu_domain *__iommu_alloc_identity_domain(struct device *dev) +{ + const struct iommu_ops *ops = dev_iommu_ops(dev); + struct iommu_domain *domain; + + if (ops->identity_domain) + return ops->identity_domain; + + if (ops->domain_alloc_identity) { + domain = ops->domain_alloc_identity(dev); + if (IS_ERR(domain)) + return domain; + } else { + return ERR_PTR(-EOPNOTSUPP); + } + + iommu_domain_init(domain, IOMMU_DOMAIN_IDENTITY, ops); + return domain; +} + static struct iommu_domain * __iommu_group_alloc_default_domain(struct iommu_group *group, int req_type) { + struct device *dev = iommu_group_first_dev(group); + struct iommu_domain *dom; + if (group->default_domain && group->default_domain->type == req_type) return group->default_domain; - return __iommu_group_domain_alloc(group, req_type); + + /* + * When allocating the DMA API domain assume that the driver is going to + * use PASID and make sure the RID's domain is PASID compatible. + */ + if (req_type & __IOMMU_DOMAIN_PAGING) { + dom = __iommu_paging_domain_alloc_flags(dev, req_type, + dev->iommu->max_pasids ? IOMMU_HWPT_ALLOC_PASID : 0); + + /* + * If driver does not support PASID feature then + * try to allocate non-PASID domain + */ + if (PTR_ERR(dom) == -EOPNOTSUPP) + dom = __iommu_paging_domain_alloc_flags(dev, req_type, 0); + + return dom; + } + + if (req_type == IOMMU_DOMAIN_IDENTITY) + return __iommu_alloc_identity_domain(dev); + + return ERR_PTR(-EINVAL); } /* @@ -1714,7 +1833,7 @@ static int iommu_get_def_domain_type(struct iommu_group *group, group->id); /* - * Try to recover, drivers are allowed to force IDENITY or DMA, IDENTITY + * Try to recover, drivers are allowed to force IDENTITY or DMA, IDENTITY * takes precedence. */ if (type == IOMMU_DOMAIN_IDENTITY) @@ -1801,7 +1920,7 @@ static void iommu_group_do_probe_finalize(struct device *dev) ops->probe_finalize(dev); } -int bus_iommu_probe(const struct bus_type *bus) +static int bus_iommu_probe(const struct bus_type *bus) { struct iommu_group *group, *next; LIST_HEAD(group_list); @@ -1847,31 +1966,6 @@ int bus_iommu_probe(const struct bus_type *bus) } /** - * iommu_present() - make platform-specific assumptions about an IOMMU - * @bus: bus to check - * - * Do not use this function. You want device_iommu_mapped() instead. - * - * Return: true if some IOMMU is present and aware of devices on the given bus; - * in general it may not be the only IOMMU, and it may not have anything to do - * with whatever device you are ultimately interested in. 
- */ -bool iommu_present(const struct bus_type *bus) -{ - bool ret = false; - - for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) { - if (iommu_buses[i] == bus) { - spin_lock(&iommu_device_lock); - ret = !list_empty(&iommu_device_list); - spin_unlock(&iommu_device_lock); - } - } - return ret; -} -EXPORT_SYMBOL_GPL(iommu_present); - -/** * device_iommu_capable() - check for a general IOMMU capability * @dev: device to which the capability would be relevant, if available * @cap: IOMMU capability @@ -1933,110 +2027,87 @@ void iommu_set_fault_handler(struct iommu_domain *domain, iommu_fault_handler_t handler, void *token) { - BUG_ON(!domain); + if (WARN_ON(!domain || domain->cookie_type != IOMMU_COOKIE_NONE)) + return; + domain->cookie_type = IOMMU_COOKIE_FAULT_HANDLER; domain->handler = handler; domain->handler_token = token; } EXPORT_SYMBOL_GPL(iommu_set_fault_handler); -static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops, - struct device *dev, - unsigned int type) +static void iommu_domain_init(struct iommu_domain *domain, unsigned int type, + const struct iommu_ops *ops) { - struct iommu_domain *domain; - unsigned int alloc_type = type & IOMMU_DOMAIN_ALLOC_FLAGS; - - if (alloc_type == IOMMU_DOMAIN_IDENTITY && ops->identity_domain) - return ops->identity_domain; - else if (alloc_type == IOMMU_DOMAIN_BLOCKED && ops->blocked_domain) - return ops->blocked_domain; - else if (type & __IOMMU_DOMAIN_PAGING && ops->domain_alloc_paging) - domain = ops->domain_alloc_paging(dev); - else if (ops->domain_alloc) - domain = ops->domain_alloc(alloc_type); - else - return ERR_PTR(-EOPNOTSUPP); - - /* - * Many domain_alloc ops now return ERR_PTR, make things easier for the - * driver by accepting ERR_PTR from all domain_alloc ops instead of - * having two rules. - */ - if (IS_ERR(domain)) - return domain; - if (!domain) - return ERR_PTR(-ENOMEM); - domain->type = type; domain->owner = ops; - /* - * If not already set, assume all sizes by default; the driver - * may override this later - */ - if (!domain->pgsize_bitmap) - domain->pgsize_bitmap = ops->pgsize_bitmap; - if (!domain->ops) domain->ops = ops->default_domain_ops; - - if (iommu_is_dma_domain(domain)) { - int rc; - - rc = iommu_get_dma_cookie(domain); - if (rc) { - iommu_domain_free(domain); - return ERR_PTR(rc); - } - } - return domain; } static struct iommu_domain * -__iommu_group_domain_alloc(struct iommu_group *group, unsigned int type) +__iommu_paging_domain_alloc_flags(struct device *dev, unsigned int type, + unsigned int flags) { - struct device *dev = iommu_group_first_dev(group); + const struct iommu_ops *ops; + struct iommu_domain *domain; - return __iommu_domain_alloc(dev_iommu_ops(dev), dev, type); -} + if (!dev_has_iommu(dev)) + return ERR_PTR(-ENODEV); -static int __iommu_domain_alloc_dev(struct device *dev, void *data) -{ - const struct iommu_ops **ops = data; + ops = dev_iommu_ops(dev); - if (!dev_has_iommu(dev)) - return 0; + if (ops->domain_alloc_paging && !flags) + domain = ops->domain_alloc_paging(dev); + else if (ops->domain_alloc_paging_flags) + domain = ops->domain_alloc_paging_flags(dev, flags, NULL); +#if IS_ENABLED(CONFIG_FSL_PAMU) + else if (ops->domain_alloc && !flags) + domain = ops->domain_alloc(IOMMU_DOMAIN_UNMANAGED); +#endif + else + return ERR_PTR(-EOPNOTSUPP); - if (WARN_ONCE(*ops && *ops != dev_iommu_ops(dev), - "Multiple IOMMU drivers present for bus %s, which the public IOMMU API can't fully support yet. 
You will still need to disable one or more for this to work, sorry!\n", - dev_bus_name(dev))) - return -EBUSY; + if (IS_ERR(domain)) + return domain; + if (!domain) + return ERR_PTR(-ENOMEM); - *ops = dev_iommu_ops(dev); - return 0; + iommu_domain_init(domain, type, ops); + return domain; } -struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus) +/** + * iommu_paging_domain_alloc_flags() - Allocate a paging domain + * @dev: device for which the domain is allocated + * @flags: Bitmap of iommufd_hwpt_alloc_flags + * + * Allocate a paging domain which will be managed by a kernel driver. Return + * allocated domain if successful, or an ERR pointer for failure. + */ +struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev, + unsigned int flags) { - const struct iommu_ops *ops = NULL; - int err = bus_for_each_dev(bus, NULL, &ops, __iommu_domain_alloc_dev); - struct iommu_domain *domain; - - if (err || !ops) - return NULL; - - domain = __iommu_domain_alloc(ops, NULL, IOMMU_DOMAIN_UNMANAGED); - if (IS_ERR(domain)) - return NULL; - return domain; + return __iommu_paging_domain_alloc_flags(dev, + IOMMU_DOMAIN_UNMANAGED, flags); } -EXPORT_SYMBOL_GPL(iommu_domain_alloc); +EXPORT_SYMBOL_GPL(iommu_paging_domain_alloc_flags); void iommu_domain_free(struct iommu_domain *domain) { - if (domain->type == IOMMU_DOMAIN_SVA) + switch (domain->cookie_type) { + case IOMMU_COOKIE_DMA_IOVA: + iommu_put_dma_cookie(domain); + break; + case IOMMU_COOKIE_DMA_MSI: + iommu_put_msi_cookie(domain); + break; + case IOMMU_COOKIE_SVA: mmdrop(domain->mm); - iommu_put_dma_cookie(domain); + break; + default: + break; + } if (domain->ops->free) domain->ops->free(domain); } @@ -2059,14 +2130,14 @@ static void __iommu_group_set_core_domain(struct iommu_group *group) } static int __iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { int ret; if (unlikely(domain->ops->attach_dev == NULL)) return -ENODEV; - ret = domain->ops->attach_dev(domain, dev); + ret = domain->ops->attach_dev(domain, dev, old); if (ret) return ret; dev->iommu->attach_deferred = 0; @@ -2115,7 +2186,7 @@ EXPORT_SYMBOL_GPL(iommu_attach_device); int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) { if (dev->iommu && dev->iommu->attach_deferred) - return __iommu_attach_device(domain, dev); + return __iommu_attach_device(domain, dev, NULL); return 0; } @@ -2160,6 +2231,30 @@ struct iommu_domain *iommu_get_dma_domain(struct device *dev) return dev->iommu_group->default_domain; } +static void *iommu_make_pasid_array_entry(struct iommu_domain *domain, + struct iommu_attach_handle *handle) +{ + if (handle) { + handle->domain = domain; + return xa_tag_pointer(handle, IOMMU_PASID_ARRAY_HANDLE); + } + + return xa_tag_pointer(domain, IOMMU_PASID_ARRAY_DOMAIN); +} + +static bool domain_iommu_ops_compatible(const struct iommu_ops *ops, + struct iommu_domain *domain) +{ + if (domain->owner == ops) + return true; + + /* For static domains, owner isn't set. 
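	 * (e.g. ops->blocked_domain and ops->identity_domain are typically
	 * statically defined by the driver, so iommu_domain_init() never
	 * ran on them and ->owner was never filled in -- an illustrative
	 * note, not part of the original patch.)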
*/ + if (domain == ops->blocked_domain || domain == ops->identity_domain) + return true; + + return false; +} + static int __iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) { @@ -2170,7 +2265,8 @@ static int __iommu_attach_group(struct iommu_domain *domain, return -EBUSY; dev = iommu_group_first_dev(group); - if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain->owner) + if (!dev_has_iommu(dev) || + !domain_iommu_ops_compatible(dev_iommu_ops(dev), domain)) return -EINVAL; return __iommu_group_set_domain(group, domain); @@ -2200,35 +2296,10 @@ int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) } EXPORT_SYMBOL_GPL(iommu_attach_group); -/** - * iommu_group_replace_domain - replace the domain that a group is attached to - * @new_domain: new IOMMU domain to replace with - * @group: IOMMU group that will be attached to the new domain - * - * This API allows the group to switch domains without being forced to go to - * the blocking domain in-between. - * - * If the currently attached domain is a core domain (e.g. a default_domain), - * it will act just like the iommu_attach_group(). - */ -int iommu_group_replace_domain(struct iommu_group *group, - struct iommu_domain *new_domain) -{ - int ret; - - if (!new_domain) - return -EINVAL; - - mutex_lock(&group->mutex); - ret = __iommu_group_set_domain(group, new_domain); - mutex_unlock(&group->mutex); - return ret; -} -EXPORT_SYMBOL_NS_GPL(iommu_group_replace_domain, IOMMUFD_INTERNAL); - static int __iommu_device_set_domain(struct iommu_group *group, struct device *dev, struct iommu_domain *new_domain, + struct iommu_domain *old_domain, unsigned int flags) { int ret; @@ -2254,7 +2325,7 @@ static int __iommu_device_set_domain(struct iommu_group *group, dev->iommu->attach_deferred = 0; } - ret = __iommu_attach_device(new_domain, dev); + ret = __iommu_attach_device(new_domain, dev, old_domain); if (ret) { /* * If we have a blocking domain then try to attach that in hopes @@ -2264,7 +2335,8 @@ static int __iommu_device_set_domain(struct iommu_group *group, if ((flags & IOMMU_SET_DOMAIN_MUST_SUCCEED) && group->blocking_domain && group->blocking_domain != new_domain) - __iommu_attach_device(group->blocking_domain, dev); + __iommu_attach_device(group->blocking_domain, dev, + old_domain); return ret; } return 0; @@ -2311,7 +2383,7 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group, result = 0; for_each_group_device(group, gdev) { ret = __iommu_device_set_domain(group, gdev->dev, new_domain, - flags); + group->domain, flags); if (ret) { result = ret; /* @@ -2336,6 +2408,9 @@ err_revert: */ last_gdev = gdev; for_each_group_device(group, gdev) { + /* No need to revert the last gdev that failed to set domain */ + if (gdev == last_gdev) + break; /* * A NULL domain can happen only for first probe, in which case * we leave group->domain as NULL and let release clean @@ -2343,10 +2418,8 @@ err_revert: */ if (group->domain) WARN_ON(__iommu_device_set_domain( - group, gdev->dev, group->domain, + group, gdev->dev, group->domain, new_domain, IOMMU_SET_DOMAIN_MUST_SUCCEED)); - if (gdev == last_gdev) - break; } return ret; } @@ -2377,6 +2450,7 @@ static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova, unsigned int pgsize_idx, pgsize_idx_next; unsigned long pgsizes; size_t offset, pgsize, pgsize_next; + size_t offset_end; unsigned long addr_merge = paddr | iova; /* Page sizes supported by the hardware and small enough for @size */ @@ -2417,7 +2491,8 @@ static size_t 
iommu_pgsize(struct iommu_domain *domain, unsigned long iova, * If size is big enough to accommodate the larger page, reduce * the number of smaller pages. */ - if (offset + pgsize_next <= size) + if (!check_add_overflow(offset, pgsize_next, &offset_end) && + offset_end <= size) size = offset; out_set_count: @@ -2425,8 +2500,8 @@ out_set_count: return pgsize; } -static int __iommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { const struct iommu_domain_ops *ops = domain->ops; unsigned long orig_iova = iova; @@ -2435,12 +2510,19 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t orig_paddr = paddr; int ret = 0; + might_sleep_if(gfpflags_allow_blocking(gfp)); + if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) return -EINVAL; if (WARN_ON(!ops->map_pages || domain->pgsize_bitmap == 0UL)) return -ENODEV; + /* Discourage passing strange GFP flags */ + if (WARN_ON_ONCE(gfp & (__GFP_COMP | __GFP_DMA | __GFP_DMA32 | + __GFP_HIGHMEM))) + return -EINVAL; + /* find out the minimum page size supported */ min_pagesz = 1 << __ffs(domain->pgsize_bitmap); @@ -2488,31 +2570,27 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, return ret; } -int iommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +int iommu_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) { const struct iommu_domain_ops *ops = domain->ops; - int ret; - - might_sleep_if(gfpflags_allow_blocking(gfp)); - /* Discourage passing strange GFP flags */ - if (WARN_ON_ONCE(gfp & (__GFP_COMP | __GFP_DMA | __GFP_DMA32 | - __GFP_HIGHMEM))) - return -EINVAL; + if (!ops->iotlb_sync_map) + return 0; + return ops->iotlb_sync_map(domain, iova, size); +} - ret = __iommu_map(domain, iova, paddr, size, prot, gfp); - if (ret == 0 && ops->iotlb_sync_map) { - ret = ops->iotlb_sync_map(domain, iova, size); - if (ret) - goto out_err; - } +int iommu_map(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +{ + int ret; - return ret; + ret = iommu_map_nosync(domain, iova, paddr, size, prot, gfp); + if (ret) + return ret; -out_err: - /* undo mappings already done */ - iommu_unmap(domain, iova, size); + ret = iommu_sync_map(domain, iova, size); + if (ret) + iommu_unmap(domain, iova, size); return ret; } @@ -2572,6 +2650,20 @@ static size_t __iommu_unmap(struct iommu_domain *domain, return unmapped; } +/** + * iommu_unmap() - Remove mappings from a range of IOVA + * @domain: Domain to manipulate + * @iova: IO virtual address to start + * @size: Length of the range starting from @iova + * + * iommu_unmap() will remove a translation created by iommu_map(). It cannot + * subdivide a mapping created by iommu_map(), so it should be called with IOVA + * ranges that match what was passed to iommu_map(). The range can aggregate + * contiguous iommu_map() calls so long as no individual range is split. + * + * Returns: Number of bytes of IOVA unmapped. iova + res will be the point + * unmapping stopped. 
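+ *
+ * A minimal usage sketch (illustrative only, not part of this patch):
+ *
+ *	if (iommu_map(domain, iova, paddr, SZ_2M, IOMMU_READ | IOMMU_WRITE,
+ *		      GFP_KERNEL))
+ *		return -ENOMEM;
+ *	...
+ *	WARN_ON(iommu_unmap(domain, iova, SZ_2M) != SZ_2M);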
+ */ size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) { @@ -2586,6 +2678,25 @@ size_t iommu_unmap(struct iommu_domain *domain, } EXPORT_SYMBOL_GPL(iommu_unmap); +/** + * iommu_unmap_fast() - Remove mappings from a range of IOVA without IOTLB sync + * @domain: Domain to manipulate + * @iova: IO virtual address to start + * @size: Length of the range starting from @iova + * @iotlb_gather: range information for a pending IOTLB flush + * + * iommu_unmap_fast() will remove a translation created by iommu_map(). + * It can't subdivide a mapping created by iommu_map(), so it should be + * called with IOVA ranges that match what was passed to iommu_map(). The + * range can aggregate contiguous iommu_map() calls so long as no individual + * range is split. + * + * Basically iommu_unmap_fast() is the same as iommu_unmap() but for callers + * which manage the IOTLB flushing externally to perform a batched sync. + * + * Returns: Number of bytes of IOVA unmapped. iova + res will be the point + * unmapping stopped. + */ size_t iommu_unmap_fast(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather) @@ -2598,26 +2709,17 @@ ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg, unsigned int nents, int prot, gfp_t gfp) { - const struct iommu_domain_ops *ops = domain->ops; size_t len = 0, mapped = 0; phys_addr_t start; unsigned int i = 0; int ret; - might_sleep_if(gfpflags_allow_blocking(gfp)); - - /* Discourage passing strange GFP flags */ - if (WARN_ON_ONCE(gfp & (__GFP_COMP | __GFP_DMA | __GFP_DMA32 | - __GFP_HIGHMEM))) - return -EINVAL; - while (i <= nents) { phys_addr_t s_phys = sg_phys(sg); if (len && s_phys != start + len) { - ret = __iommu_map(domain, iova + mapped, start, + ret = iommu_map_nosync(domain, iova + mapped, start, len, prot, gfp); - if (ret) goto out_err; @@ -2640,11 +2742,10 @@ next: sg = sg_next(sg); } - if (ops->iotlb_sync_map) { - ret = ops->iotlb_sync_map(domain, iova, mapped); - if (ret) - goto out_err; - } + ret = iommu_sync_map(domain, iova, mapped); + if (ret) + goto out_err; + return mapped; out_err: @@ -2688,7 +2789,8 @@ int report_iommu_fault(struct iommu_domain *domain, struct device *dev, * if upper layers showed interest and installed a fault handler, * invoke it. 
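	 *
	 * Installing such a handler is an upper-layer affair; an illustrative
	 * sketch (not part of this patch):
	 *
	 *	static int my_fault_handler(struct iommu_domain *dom,
	 *				    struct device *dev, unsigned long iova,
	 *				    int flags, void *token)
	 *	{
	 *		dev_err(dev, "IOMMU fault at IOVA 0x%lx\n", iova);
	 *		return -ENOSYS;	/* unhandled, let the driver report it */
	 *	}
	 *
	 *	iommu_set_fault_handler(domain, my_fault_handler, NULL);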
*/ - if (domain->handler) + if (domain->cookie_type == IOMMU_COOKIE_FAULT_HANDLER && + domain->handler) ret = domain->handler(domain, dev, iova, flags, domain->handler_token); @@ -2709,16 +2811,6 @@ static int __init iommu_init(void) } core_initcall(iommu_init); -int iommu_enable_nesting(struct iommu_domain *domain) -{ - if (domain->type != IOMMU_DOMAIN_UNMANAGED) - return -EINVAL; - if (!domain->ops->enable_nesting) - return -EINVAL; - return domain->ops->enable_nesting(domain); -} -EXPORT_SYMBOL_GPL(iommu_enable_nesting); - int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirk) { @@ -2807,28 +2899,39 @@ bool iommu_default_passthrough(void) } EXPORT_SYMBOL_GPL(iommu_default_passthrough); -const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode) +static const struct iommu_device *iommu_from_fwnode(const struct fwnode_handle *fwnode) { - const struct iommu_ops *ops = NULL; - struct iommu_device *iommu; + const struct iommu_device *iommu, *ret = NULL; spin_lock(&iommu_device_lock); list_for_each_entry(iommu, &iommu_device_list, list) if (iommu->fwnode == fwnode) { - ops = iommu->ops; + ret = iommu; break; } spin_unlock(&iommu_device_lock); - return ops; + return ret; } -int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, - const struct iommu_ops *ops) +const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode) { + const struct iommu_device *iommu = iommu_from_fwnode(fwnode); + + return iommu ? iommu->ops : NULL; +} + +int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode) +{ + const struct iommu_device *iommu = iommu_from_fwnode(iommu_fwnode); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + if (!iommu) + return driver_deferred_probe_check_state(dev); + if (!dev->iommu && !READ_ONCE(iommu->ready)) + return -EPROBE_DEFER; + if (fwspec) - return ops == fwspec->ops ? 0 : -EINVAL; + return iommu->ops == iommu_fwspec_ops(fwspec) ? 0 : -EINVAL; if (!dev_iommu_get(dev)) return -ENOMEM; @@ -2838,9 +2941,8 @@ int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, if (!fwspec) return -ENOMEM; - of_node_get(to_of_node(iommu_fwnode)); + fwnode_handle_get(iommu_fwnode); fwspec->iommu_fwnode = iommu_fwnode; - fwspec->ops = ops; dev_iommu_fwspec_set(dev, fwspec); return 0; } @@ -2856,7 +2958,6 @@ void iommu_fwspec_free(struct device *dev) dev_iommu_fwspec_set(dev, NULL); } } -EXPORT_SYMBOL_GPL(iommu_fwspec_free); int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids) { @@ -2884,38 +2985,6 @@ int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids) } EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids); -/* - * Per device IOMMU features. - */ -int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat) -{ - if (dev_has_iommu(dev)) { - const struct iommu_ops *ops = dev_iommu_ops(dev); - - if (ops->dev_enable_feat) - return ops->dev_enable_feat(dev, feat); - } - - return -ENODEV; -} -EXPORT_SYMBOL_GPL(iommu_dev_enable_feature); - -/* - * The device drivers should do the necessary cleanups before calling this. 
- */ -int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat) -{ - if (dev_has_iommu(dev)) { - const struct iommu_ops *ops = dev_iommu_ops(dev); - - if (ops->dev_disable_feat) - return ops->dev_disable_feat(dev, feat); - } - - return -EBUSY; -} -EXPORT_SYMBOL_GPL(iommu_dev_disable_feature); - /** * iommu_setup_default_domain - Set the default_domain for the group * @group: Group to change @@ -2949,6 +3018,14 @@ static int iommu_setup_default_domain(struct iommu_group *group, if (group->default_domain == dom) return 0; + if (iommu_is_dma_domain(dom)) { + ret = iommu_get_dma_cookie(dom); + if (ret) { + iommu_domain_free(dom); + return ret; + } + } + /* * IOMMU_RESV_DIRECT and IOMMU_RESV_DIRECT_RELAXABLE regions must be * mapped before their device is attached, in order to guarantee @@ -3096,6 +3173,11 @@ int iommu_device_use_default_domain(struct device *dev) return 0; mutex_lock(&group->mutex); + /* We may race against bus_iommu_probe() finalising groups here */ + if (!group->default_domain) { + ret = -EPROBE_DEFER; + goto unlock_out; + } if (group->owner_cnt) { if (group->domain != group->default_domain || group->owner || !xa_empty(&group->pasid_array)) { @@ -3136,22 +3218,25 @@ void iommu_device_unuse_default_domain(struct device *dev) static int __iommu_group_alloc_blocking_domain(struct iommu_group *group) { + struct device *dev = iommu_group_first_dev(group); + const struct iommu_ops *ops = dev_iommu_ops(dev); struct iommu_domain *domain; if (group->blocking_domain) return 0; - domain = __iommu_group_domain_alloc(group, IOMMU_DOMAIN_BLOCKED); - if (IS_ERR(domain)) { - /* - * For drivers that do not yet understand IOMMU_DOMAIN_BLOCKED - * create an empty domain instead. - */ - domain = __iommu_group_domain_alloc(group, - IOMMU_DOMAIN_UNMANAGED); - if (IS_ERR(domain)) - return PTR_ERR(domain); + if (ops->blocked_domain) { + group->blocking_domain = ops->blocked_domain; + return 0; } + + /* + * For drivers that do not yet understand IOMMU_DOMAIN_BLOCKED create an + * empty PAGING domain instead. 
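+	 * (Drivers that do implement it typically expose a static domain,
+	 * roughly along these lines, and set ops->blocked_domain to it at
+	 * registration time -- an illustrative sketch, not from this patch:
+	 *
+	 *	static struct iommu_domain my_blocked_domain = {
+	 *		.type = IOMMU_DOMAIN_BLOCKED,
+	 *		.ops = &my_blocked_domain_ops,
+	 *	};
+	 *
+	 * which the ops->blocked_domain check above then returns directly.)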
+	 */
+	domain = iommu_paging_domain_alloc(dev);
+	if (IS_ERR(domain))
+		return PTR_ERR(domain);
 	group->blocking_domain = domain;
 	return 0;
 }
@@ -3308,16 +3393,30 @@ bool iommu_group_dma_owner_claimed(struct iommu_group *group)
 }
 EXPORT_SYMBOL_GPL(iommu_group_dma_owner_claimed);
 
+static void iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
+				   struct iommu_domain *domain)
+{
+	const struct iommu_ops *ops = dev_iommu_ops(dev);
+	struct iommu_domain *blocked_domain = ops->blocked_domain;
+
+	WARN_ON(blocked_domain->ops->set_dev_pasid(blocked_domain,
+						   dev, pasid, domain));
+}
+
 static int __iommu_set_group_pasid(struct iommu_domain *domain,
-				   struct iommu_group *group, ioasid_t pasid)
+				   struct iommu_group *group, ioasid_t pasid,
+				   struct iommu_domain *old)
 {
 	struct group_device *device, *last_gdev;
 	int ret;
 
 	for_each_group_device(group, device) {
-		ret = domain->ops->set_dev_pasid(domain, device->dev, pasid);
-		if (ret)
-			goto err_revert;
+		if (device->dev->iommu->max_pasids > 0) {
+			ret = domain->ops->set_dev_pasid(domain, device->dev,
+							 pasid, old);
+			if (ret)
+				goto err_revert;
+		}
 	}
 
 	return 0;
@@ -3325,11 +3424,20 @@ static int __iommu_set_group_pasid(struct iommu_domain *domain,
 err_revert:
 	last_gdev = device;
 	for_each_group_device(group, device) {
-		const struct iommu_ops *ops = dev_iommu_ops(device->dev);
-
 		if (device == last_gdev)
 			break;
-		ops->remove_dev_pasid(device->dev, pasid, domain);
+		if (device->dev->iommu->max_pasids > 0) {
+			/*
+			 * If there is no old domain, undo the devices/pasid
+			 * attachments that succeeded. Otherwise, roll them
+			 * back to the old domain; it is a driver bug to fail
+			 * attaching with a previously good domain.
+			 */
+			if (!old ||
+			    WARN_ON(old->ops->set_dev_pasid(old, device->dev,
+							    pasid, domain)))
+				iommu_remove_dev_pasid(device->dev, pasid, domain);
+		}
 	}
 	return ret;
 }
@@ -3339,11 +3447,10 @@ static void __iommu_remove_group_pasid(struct iommu_group *group,
 				       ioasid_t pasid,
 				       struct iommu_domain *domain)
 {
 	struct group_device *device;
-	const struct iommu_ops *ops;
 
 	for_each_group_device(group, device) {
-		ops = dev_iommu_ops(device->dev);
-		ops->remove_dev_pasid(device->dev, pasid, domain);
+		if (device->dev->iommu->max_pasids > 0)
+			iommu_remove_dev_pasid(device->dev, pasid, domain);
 	}
 }
 
@@ -3352,107 +3459,195 @@ static void __iommu_remove_group_pasid(struct iommu_group *group,
  * @domain: the iommu domain.
  * @dev: the attached device.
  * @pasid: the pasid of the device.
+ * @handle: the attach handle.
+ *
+ * If the caller intends to pass a valid handle, it must always provide a
+ * fresh one, to avoid racing with paths that hold a lockless reference to
+ * the handle.
 *
 * Return: 0 on success, or an error.
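 *
 * Typical pairing (illustrative sketch only; the handle must stay alive
 * for as long as the pasid remains attached):
 *
 *	static struct iommu_attach_handle handle;
 *
 *	rc = iommu_attach_device_pasid(domain, dev, pasid, &handle);
 *	if (rc)
 *		return rc;
 *	...
 *	iommu_detach_device_pasid(domain, dev, pasid);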
 */
 int iommu_attach_device_pasid(struct iommu_domain *domain,
-			      struct device *dev, ioasid_t pasid)
+			      struct device *dev, ioasid_t pasid,
+			      struct iommu_attach_handle *handle)
 {
 	/* Caller must be a probed driver on dev */
 	struct iommu_group *group = dev->iommu_group;
 	struct group_device *device;
-	void *curr;
+	const struct iommu_ops *ops;
+	void *entry;
 	int ret;
 
-	if (!domain->ops->set_dev_pasid)
-		return -EOPNOTSUPP;
-
 	if (!group)
 		return -ENODEV;
 
-	if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain->owner ||
+	ops = dev_iommu_ops(dev);
+
+	if (!domain->ops->set_dev_pasid ||
+	    !ops->blocked_domain ||
+	    !ops->blocked_domain->ops->set_dev_pasid)
+		return -EOPNOTSUPP;
+
+	if (!domain_iommu_ops_compatible(ops, domain) ||
 	    pasid == IOMMU_NO_PASID)
 		return -EINVAL;
 
 	mutex_lock(&group->mutex);
 	for_each_group_device(group, device) {
-		if (pasid >= device->dev->iommu->max_pasids) {
+		/*
+		 * Skip PASID validation for devices without PASID support
+		 * (max_pasids = 0). These devices cannot issue transactions
+		 * with PASID, so they don't affect the group's PASID usage.
+		 */
+		if ((device->dev->iommu->max_pasids > 0) &&
+		    (pasid >= device->dev->iommu->max_pasids)) {
			ret = -EINVAL;
 			goto out_unlock;
 		}
 	}
 
-	curr = xa_cmpxchg(&group->pasid_array, pasid, NULL, domain, GFP_KERNEL);
-	if (curr) {
-		ret = xa_err(curr) ? : -EBUSY;
+	entry = iommu_make_pasid_array_entry(domain, handle);
+
+	/*
+	 * An already-present entry is a failure case, so use xa_insert()
+	 * instead of xa_reserve().
+	 */
+	ret = xa_insert(&group->pasid_array, pasid, XA_ZERO_ENTRY, GFP_KERNEL);
+	if (ret)
+		goto out_unlock;
+
+	ret = __iommu_set_group_pasid(domain, group, pasid, NULL);
+	if (ret) {
+		xa_release(&group->pasid_array, pasid);
 		goto out_unlock;
 	}
 
-	ret = __iommu_set_group_pasid(domain, group, pasid);
-	if (ret)
-		xa_erase(&group->pasid_array, pasid);
+	/*
+	 * The xa_insert() above reserved the memory, and the group->mutex is
+	 * held, so this cannot fail. The new domain cannot be visible until
+	 * the operation succeeds, as we cannot tolerate PRIs becoming
+	 * concurrently queued and then failing attach.
+	 */
+	WARN_ON(xa_is_err(xa_store(&group->pasid_array,
+				   pasid, entry, GFP_KERNEL)));
+
 out_unlock:
 	mutex_unlock(&group->mutex);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(iommu_attach_device_pasid);
 
-/*
- * iommu_detach_device_pasid() - Detach the domain from pasid of device
- * @domain: the iommu domain.
+/**
+ * iommu_replace_device_pasid - Replace the domain that a specific pasid
+ *                              of the device is attached to
+ * @domain: the new iommu domain
 * @dev: the attached device.
 * @pasid: the pasid of the device.
+ * @handle: the attach handle.
 *
- * The @domain must have been attached to @pasid of the @dev with
- * iommu_attach_device_pasid().
+ * This API allows the pasid to switch domains. The @pasid must already be
+ * attached, otherwise this fails. On a failed replacement the pasid keeps
+ * its old configuration.
+ *
+ * If the caller intends to pass a valid handle, it must always provide a
+ * fresh one, to avoid racing with paths that hold a lockless reference to
+ * the handle.
+ *
+ * Return: 0 on success, or an error.
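+ *
+ * Sketch of a replace flow (illustrative only, not part of this patch):
+ *
+ *	static struct iommu_attach_handle new_handle;
+ *
+ *	rc = iommu_replace_device_pasid(new_domain, dev, pasid, &new_handle);
+ *	if (rc)
+ *		return rc;	/* the pasid keeps its old domain */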
 */
-void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev,
-			       ioasid_t pasid)
+int iommu_replace_device_pasid(struct iommu_domain *domain,
+			       struct device *dev, ioasid_t pasid,
+			       struct iommu_attach_handle *handle)
 {
 	/* Caller must be a probed driver on dev */
 	struct iommu_group *group = dev->iommu_group;
+	struct iommu_attach_handle *entry;
+	struct iommu_domain *curr_domain;
+	void *curr;
+	int ret;
+
+	if (!group)
+		return -ENODEV;
+
+	if (!domain->ops->set_dev_pasid)
+		return -EOPNOTSUPP;
+
+	if (!domain_iommu_ops_compatible(dev_iommu_ops(dev), domain) ||
+	    pasid == IOMMU_NO_PASID || !handle)
+		return -EINVAL;
 
 	mutex_lock(&group->mutex);
-	__iommu_remove_group_pasid(group, pasid, domain);
-	WARN_ON(xa_erase(&group->pasid_array, pasid) != domain);
+	entry = iommu_make_pasid_array_entry(domain, handle);
+	curr = xa_cmpxchg(&group->pasid_array, pasid, NULL,
+			  XA_ZERO_ENTRY, GFP_KERNEL);
+	if (xa_is_err(curr)) {
+		ret = xa_err(curr);
+		goto out_unlock;
+	}
+
+	/*
+	 * No domain (with or without handle) attached, hence not
+	 * a replace case.
+	 */
+	if (!curr) {
+		xa_release(&group->pasid_array, pasid);
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	/*
+	 * Reusing a handle is problematic as there are paths that refer to
+	 * the handle without holding the lock. To avoid races, reject any
+	 * caller that attempts it.
+	 */
+	if (curr == entry) {
+		WARN_ON(1);
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	curr_domain = pasid_array_entry_to_domain(curr);
+	ret = 0;
+
+	if (curr_domain != domain) {
+		ret = __iommu_set_group_pasid(domain, group,
+					      pasid, curr_domain);
+		if (ret)
+			goto out_unlock;
+	}
+
+	/*
+	 * The above xa_cmpxchg() reserved the memory, and the
+	 * group->mutex is held, so this cannot fail.
+	 */
+	WARN_ON(xa_is_err(xa_store(&group->pasid_array,
+				   pasid, entry, GFP_KERNEL)));
+
+out_unlock:
 	mutex_unlock(&group->mutex);
+	return ret;
 }
-EXPORT_SYMBOL_GPL(iommu_detach_device_pasid);
+EXPORT_SYMBOL_NS_GPL(iommu_replace_device_pasid, "IOMMUFD_INTERNAL");
 
 /*
- * iommu_get_domain_for_dev_pasid() - Retrieve domain for @pasid of @dev
- * @dev: the queried device
- * @pasid: the pasid of the device
- * @type: matched domain type, 0 for any match
- *
- * This is a variant of iommu_get_domain_for_dev(). It returns the existing
- * domain attached to pasid of a device. Callers must hold a lock around this
- * function, and both iommu_attach/detach_dev_pasid() whenever a domain of
- * type is being manipulated. This API does not internally resolve races with
- * attach/detach.
+ * iommu_detach_device_pasid() - Detach the domain from pasid of device
+ * @domain: the iommu domain.
+ * @dev: the attached device.
+ * @pasid: the pasid of the device.
 *
- * Return: attached domain on success, NULL otherwise.
+ * The @domain must have been attached to @pasid of the @dev with
+ * iommu_attach_device_pasid().
 */
-struct iommu_domain *iommu_get_domain_for_dev_pasid(struct device *dev,
-						    ioasid_t pasid,
-						    unsigned int type)
+void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev,
+			       ioasid_t pasid)
 {
 	/* Caller must be a probed driver on dev */
 	struct iommu_group *group = dev->iommu_group;
-	struct iommu_domain *domain;
-
-	if (!group)
-		return NULL;
-
-	xa_lock(&group->pasid_array);
-	domain = xa_load(&group->pasid_array, pasid);
-	if (type && domain && domain->type != type)
-		domain = ERR_PTR(-EBUSY);
-	xa_unlock(&group->pasid_array);
-	return domain;
+	mutex_lock(&group->mutex);
+	__iommu_remove_group_pasid(group, pasid, domain);
+	xa_erase(&group->pasid_array, pasid);
+	mutex_unlock(&group->mutex);
 }
-EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev_pasid);
+EXPORT_SYMBOL_GPL(iommu_detach_device_pasid);
 
 ioasid_t iommu_alloc_global_pasid(struct device *dev)
 {
@@ -3480,3 +3675,201 @@ void iommu_free_global_pasid(ioasid_t pasid)
 {
 	ida_free(&iommu_global_pasid_ida, pasid);
 }
 EXPORT_SYMBOL_GPL(iommu_free_global_pasid);
+
+/**
+ * iommu_attach_handle_get - Return the attach handle
+ * @group: the iommu group that domain was attached to
+ * @pasid: the pasid within the group
+ * @type: matched domain type, 0 for any match
+ *
+ * Return: the handle, ERR_PTR(-ENOENT) when no handle is present, or
+ * ERR_PTR(-EBUSY) on a domain type mismatch.
+ *
+ * Return the attach handle to the caller. The life cycle of an iommu attach
+ * handle is from the time when the domain is attached to the time when the
+ * domain is detached. Callers are required to synchronize the call of
+ * iommu_attach_handle_get() with domain attachment and detachment. The attach
+ * handle can only be used during its life cycle.
+ */
+struct iommu_attach_handle *
+iommu_attach_handle_get(struct iommu_group *group, ioasid_t pasid, unsigned int type)
+{
+	struct iommu_attach_handle *handle;
+	void *entry;
+
+	xa_lock(&group->pasid_array);
+	entry = xa_load(&group->pasid_array, pasid);
+	if (!entry || xa_pointer_tag(entry) != IOMMU_PASID_ARRAY_HANDLE) {
+		handle = ERR_PTR(-ENOENT);
+	} else {
+		handle = xa_untag_pointer(entry);
+		if (type && handle->domain->type != type)
+			handle = ERR_PTR(-EBUSY);
+	}
+	xa_unlock(&group->pasid_array);
+
+	return handle;
+}
+EXPORT_SYMBOL_NS_GPL(iommu_attach_handle_get, "IOMMUFD_INTERNAL");
+
+/**
+ * iommu_attach_group_handle - Attach an IOMMU domain to an IOMMU group
+ * @domain: IOMMU domain to attach
+ * @group: IOMMU group that will be attached
+ * @handle: attach handle
+ *
+ * Returns 0 on success and error code on failure.
+ *
+ * This is a variant of iommu_attach_group(). It allows the caller to provide
+ * an attach handle and use it when the domain is attached. This is currently
+ * used by IOMMUFD to deliver the I/O page faults.
+ *
+ * The caller must always provide a fresh handle, to avoid racing with paths
+ * that hold a lockless reference to the handle.
+ */
+int iommu_attach_group_handle(struct iommu_domain *domain,
+			      struct iommu_group *group,
+			      struct iommu_attach_handle *handle)
+{
+	void *entry;
+	int ret;
+
+	if (!handle)
+		return -EINVAL;
+
+	mutex_lock(&group->mutex);
+	entry = iommu_make_pasid_array_entry(domain, handle);
+	ret = xa_insert(&group->pasid_array,
+			IOMMU_NO_PASID, XA_ZERO_ENTRY, GFP_KERNEL);
+	if (ret)
+		goto out_unlock;
+
+	ret = __iommu_attach_group(domain, group);
+	if (ret) {
+		xa_release(&group->pasid_array, IOMMU_NO_PASID);
+		goto out_unlock;
+	}
+
+	/*
+	 * The xa_insert() above reserved the memory, and the group->mutex is
+	 * held, so this cannot fail.
+	 * The new domain cannot be visible until the operation succeeds, as
+	 * we cannot tolerate PRIs becoming concurrently queued and then
+	 * failing attach.
+	 */
+	WARN_ON(xa_is_err(xa_store(&group->pasid_array,
+				   IOMMU_NO_PASID, entry, GFP_KERNEL)));
+
+out_unlock:
+	mutex_unlock(&group->mutex);
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(iommu_attach_group_handle, "IOMMUFD_INTERNAL");
+
+/**
+ * iommu_detach_group_handle - Detach an IOMMU domain from an IOMMU group
+ * @domain: IOMMU domain to detach
+ * @group: IOMMU group that will be detached
+ *
+ * Detach the specified IOMMU domain from the specified IOMMU group.
+ * It must be used in conjunction with iommu_attach_group_handle().
+ */
+void iommu_detach_group_handle(struct iommu_domain *domain,
+			       struct iommu_group *group)
+{
+	mutex_lock(&group->mutex);
+	__iommu_group_set_core_domain(group);
+	xa_erase(&group->pasid_array, IOMMU_NO_PASID);
+	mutex_unlock(&group->mutex);
+}
+EXPORT_SYMBOL_NS_GPL(iommu_detach_group_handle, "IOMMUFD_INTERNAL");
+
+/**
+ * iommu_replace_group_handle - replace the domain that a group is attached to
+ * @group: IOMMU group that will be attached to the new domain
+ * @new_domain: new IOMMU domain to replace with
+ * @handle: attach handle
+ *
+ * This API allows the group to switch domains without being forced to go to
+ * the blocking domain in-between. It allows the caller to provide an attach
+ * handle for the new domain and use it when the domain is attached.
+ *
+ * If the currently attached domain is a core domain (e.g. a default_domain),
+ * it will act just like iommu_attach_group_handle().
+ *
+ * The caller must always provide a fresh handle, to avoid racing with paths
+ * that hold a lockless reference to the handle.
+ */
+int iommu_replace_group_handle(struct iommu_group *group,
+			       struct iommu_domain *new_domain,
+			       struct iommu_attach_handle *handle)
+{
+	void *curr, *entry;
+	int ret;
+
+	if (!new_domain || !handle)
+		return -EINVAL;
+
+	mutex_lock(&group->mutex);
+	entry = iommu_make_pasid_array_entry(new_domain, handle);
+	ret = xa_reserve(&group->pasid_array, IOMMU_NO_PASID, GFP_KERNEL);
+	if (ret)
+		goto err_unlock;
+
+	ret = __iommu_group_set_domain(group, new_domain);
+	if (ret)
+		goto err_release;
+
+	curr = xa_store(&group->pasid_array, IOMMU_NO_PASID, entry, GFP_KERNEL);
+	WARN_ON(xa_is_err(curr));
+
+	mutex_unlock(&group->mutex);
+
+	return 0;
+err_release:
+	xa_release(&group->pasid_array, IOMMU_NO_PASID);
+err_unlock:
+	mutex_unlock(&group->mutex);
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL");
+
+#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU)
+/**
+ * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain
+ * @desc: MSI descriptor, will store the MSI page
+ * @msi_addr: MSI target address to be mapped
+ *
+ * The implementation of sw_msi() should take msi_addr, map it to an IOVA in
+ * the domain, and call msi_desc_set_iommu_msi_iova() with the mapping
+ * information.
+ *
+ * Return: 0 on success or negative error code if the mapping failed.
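+ *
+ * Roughly, a sw_msi() implementation does the following (illustrative
+ * sketch only; my_map_msi_page() is a hypothetical helper, not a real API):
+ *
+ *	u64 msi_iova;
+ *	int rc = my_map_msi_page(domain, msi_addr, &msi_iova);
+ *
+ *	if (rc)
+ *		return rc;
+ *	msi_desc_set_iommu_msi_iova(desc, msi_iova, PAGE_SHIFT);
+ *	return 0;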
+ */ +int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) +{ + struct device *dev = msi_desc_to_dev(desc); + struct iommu_group *group = dev->iommu_group; + int ret = 0; + + if (!group) + return 0; + + mutex_lock(&group->mutex); + /* An IDENTITY domain must pass through */ + if (group->domain && group->domain->type != IOMMU_DOMAIN_IDENTITY) { + switch (group->domain->cookie_type) { + case IOMMU_COOKIE_DMA_MSI: + case IOMMU_COOKIE_DMA_IOVA: + ret = iommu_dma_sw_msi(group->domain, desc, msi_addr); + break; + case IOMMU_COOKIE_IOMMUFD: + ret = iommufd_sw_msi(group->domain, desc, msi_addr); + break; + default: + ret = -EOPNOTSUPP; + break; + } + } + mutex_unlock(&group->mutex); + return ret; +} +#endif /* CONFIG_IRQ_MSI_IOMMU */ diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig index 76656fe0470d..eae3f03629b0 100644 --- a/drivers/iommu/iommufd/Kconfig +++ b/drivers/iommu/iommufd/Kconfig @@ -1,4 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only +config IOMMUFD_DRIVER_CORE + bool + default (IOMMUFD_DRIVER || IOMMUFD) if IOMMUFD!=n + config IOMMUFD tristate "IOMMU Userspace API" select INTERVAL_TREE @@ -37,6 +41,7 @@ config IOMMUFD_TEST depends on DEBUG_KERNEL depends on FAULT_INJECTION depends on RUNTIME_TESTING_MENU + depends on IOMMU_PT_AMDV1 select IOMMUFD_DRIVER default n help diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index 34b446146961..71d692c9a8f4 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -1,14 +1,19 @@ # SPDX-License-Identifier: GPL-2.0-only iommufd-y := \ device.o \ + eventq.o \ hw_pagetable.o \ io_pagetable.o \ ioas.o \ main.o \ pages.o \ - vfio_compat.o + vfio_compat.o \ + viommu.o iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o obj-$(CONFIG_IOMMUFD) += iommufd.o obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o + +iommufd_driver-y := driver.o +obj-$(CONFIG_IOMMUFD_DRIVER_CORE) += iommufd_driver.o diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 873630c111c1..4c842368289f 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -1,12 +1,13 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ +#include <linux/iommu.h> #include <linux/iommufd.h> +#include <linux/pci-ats.h> #include <linux/slab.h> -#include <linux/iommu.h> #include <uapi/linux/iommufd.h> -#include "../iommu-priv.h" +#include "../iommu-priv.h" #include "io_pagetable.h" #include "iommufd_private.h" @@ -17,12 +18,17 @@ MODULE_PARM_DESC( "Allow IOMMUFD to bind to devices even if the platform cannot isolate " "the MSI interrupt window. 
	"Enabling this is a security weakness.");
 
+struct iommufd_attach {
+	struct iommufd_hw_pagetable *hwpt;
+	struct xarray device_array;
+};
+
 static void iommufd_group_release(struct kref *kref)
 {
 	struct iommufd_group *igroup =
 		container_of(kref, struct iommufd_group, ref);
 
-	WARN_ON(igroup->hwpt || !list_empty(&igroup->device_list));
+	WARN_ON(!xa_empty(&igroup->pasid_attach));
 
 	xa_cmpxchg(&igroup->ictx->groups, iommu_group_id(igroup->group), igroup,
 		   NULL, GFP_KERNEL);
@@ -89,7 +95,7 @@ static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx,
 	kref_init(&new_igroup->ref);
 	mutex_init(&new_igroup->lock);
-	INIT_LIST_HEAD(&new_igroup->device_list);
+	xa_init(&new_igroup->pasid_attach);
 	new_igroup->sw_msi_start = PHYS_ADDR_MAX;
 	/* group reference moves into new_igroup */
 	new_igroup->group = group;
@@ -131,6 +137,57 @@ static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx,
 	}
 }
 
+static void iommufd_device_remove_vdev(struct iommufd_device *idev)
+{
+	struct iommufd_vdevice *vdev;
+
+	mutex_lock(&idev->igroup->lock);
+	/* prevent new references from vdev */
+	idev->destroying = true;
+	/* vdev has been completely destroyed by userspace */
+	if (!idev->vdev)
+		goto out_unlock;
+
+	vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
+	/*
+	 * An ongoing vdev destroy ioctl has removed the vdev from the object
+	 * xarray, but has not finished iommufd_vdevice_destroy() yet, as it
+	 * needs the same mutex. Drop the lock, then wait on the wait_cnt
+	 * reference for the vdev destruction.
+	 */
+	if (IS_ERR(vdev))
+		goto out_unlock;
+
+	/* Should never happen */
+	if (WARN_ON(vdev != idev->vdev)) {
+		iommufd_put_object(idev->ictx, &vdev->obj);
+		goto out_unlock;
+	}
+
+	/*
+	 * vdev is still alive. Hold a users refcount to prevent racing with
+	 * userspace destruction, then use iommufd_object_tombstone_user() to
+	 * destroy it and leave a tombstone.
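+	 * (Editorial note, not from the patch: the tombstone keeps the
+	 * object ID allocated but permanently unreachable, so a
+	 * half-destroyed vdev can never be looked up again.)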
+ */ + refcount_inc(&vdev->obj.users); + iommufd_put_object(idev->ictx, &vdev->obj); + mutex_unlock(&idev->igroup->lock); + iommufd_object_tombstone_user(idev->ictx, &vdev->obj); + return; + +out_unlock: + mutex_unlock(&idev->igroup->lock); +} + +void iommufd_device_pre_destroy(struct iommufd_object *obj) +{ + struct iommufd_device *idev = + container_of(obj, struct iommufd_device, obj); + + /* Release the wait_cnt reference on this */ + iommufd_device_remove_vdev(idev); +} + void iommufd_device_destroy(struct iommufd_object *obj) { struct iommufd_device *idev = @@ -232,7 +289,7 @@ out_group_put: iommufd_put_group(igroup); return ERR_PTR(rc); } -EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, "IOMMUFD"); /** * iommufd_ctx_has_group - True if any device within the group is bound @@ -263,7 +320,7 @@ bool iommufd_ctx_has_group(struct iommufd_ctx *ictx, struct iommu_group *group) xa_unlock(&ictx->objects); return false; } -EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, "IOMMUFD"); /** * iommufd_device_unbind - Undo iommufd_device_bind() @@ -278,69 +335,97 @@ void iommufd_device_unbind(struct iommufd_device *idev) { iommufd_object_destroy_user(idev->ictx, &idev->obj); } -EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, "IOMMUFD"); struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev) { return idev->ictx; } -EXPORT_SYMBOL_NS_GPL(iommufd_device_to_ictx, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_device_to_ictx, "IOMMUFD"); u32 iommufd_device_to_id(struct iommufd_device *idev) { return idev->obj.id; } -EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, "IOMMUFD"); + +static unsigned int iommufd_group_device_num(struct iommufd_group *igroup, + ioasid_t pasid) +{ + struct iommufd_attach *attach; + struct iommufd_device *idev; + unsigned int count = 0; + unsigned long index; + + lockdep_assert_held(&igroup->lock); + attach = xa_load(&igroup->pasid_attach, pasid); + if (attach) + xa_for_each(&attach->device_array, index, idev) + count++; + return count; +} + +#ifdef CONFIG_IRQ_MSI_IOMMU static int iommufd_group_setup_msi(struct iommufd_group *igroup, struct iommufd_hwpt_paging *hwpt_paging) { - phys_addr_t sw_msi_start = igroup->sw_msi_start; - int rc; + struct iommufd_ctx *ictx = igroup->ictx; + struct iommufd_sw_msi_map *cur; + + if (igroup->sw_msi_start == PHYS_ADDR_MAX) + return 0; /* - * If the IOMMU driver gives a IOMMU_RESV_SW_MSI then it is asking us to - * call iommu_get_msi_cookie() on its behalf. This is necessary to setup - * the MSI window so iommu_dma_prepare_msi() can install pages into our - * domain after request_irq(). If it is not done interrupts will not - * work on this domain. - * - * FIXME: This is conceptually broken for iommufd since we want to allow - * userspace to change the domains, eg switch from an identity IOAS to a - * DMA IOAS. There is currently no way to create a MSI window that - * matches what the IRQ layer actually expects in a newly created - * domain. 
+ * Install all the MSI pages the device has been using into the domain */ - if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) { - rc = iommu_get_msi_cookie(hwpt_paging->common.domain, - sw_msi_start); + guard(mutex)(&ictx->sw_msi_lock); + list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) { + int rc; + + if (cur->sw_msi_start != igroup->sw_msi_start || + !test_bit(cur->id, igroup->required_sw_msi.bitmap)) + continue; + + rc = iommufd_sw_msi_install(ictx, hwpt_paging, cur); if (rc) return rc; - - /* - * iommu_get_msi_cookie() can only be called once per domain, - * it returns -EBUSY on later calls. - */ - hwpt_paging->msi_cookie = true; } return 0; } +#else +static inline int +iommufd_group_setup_msi(struct iommufd_group *igroup, + struct iommufd_hwpt_paging *hwpt_paging) +{ + return 0; +} +#endif -static int iommufd_hwpt_paging_attach(struct iommufd_hwpt_paging *hwpt_paging, - struct iommufd_device *idev) +static bool +iommufd_group_first_attach(struct iommufd_group *igroup, ioasid_t pasid) { + lockdep_assert_held(&igroup->lock); + return !xa_load(&igroup->pasid_attach, pasid); +} + +static int +iommufd_device_attach_reserved_iova(struct iommufd_device *idev, + struct iommufd_hwpt_paging *hwpt_paging) +{ + struct iommufd_group *igroup = idev->igroup; int rc; - lockdep_assert_held(&idev->igroup->lock); + lockdep_assert_held(&igroup->lock); rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt, idev->dev, - &idev->igroup->sw_msi_start); + &igroup->sw_msi_start); if (rc) return rc; - if (list_empty(&idev->igroup->device_list)) { - rc = iommufd_group_setup_msi(idev->igroup, hwpt_paging); + if (iommufd_group_first_attach(igroup, IOMMU_NO_PASID)) { + rc = iommufd_group_setup_msi(igroup, hwpt_paging); if (rc) { iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev); @@ -350,22 +435,216 @@ static int iommufd_hwpt_paging_attach(struct iommufd_hwpt_paging *hwpt_paging, return 0; } +/* The device attach/detach/replace helpers for attach_handle */ + +static bool iommufd_device_is_attached(struct iommufd_device *idev, + ioasid_t pasid) +{ + struct iommufd_attach *attach; + + attach = xa_load(&idev->igroup->pasid_attach, pasid); + return xa_load(&attach->device_array, idev->obj.id); +} + +static int iommufd_hwpt_pasid_compat(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev, + ioasid_t pasid) +{ + struct iommufd_group *igroup = idev->igroup; + + lockdep_assert_held(&igroup->lock); + + if (pasid == IOMMU_NO_PASID) { + unsigned long start = IOMMU_NO_PASID; + + if (!hwpt->pasid_compat && + xa_find_after(&igroup->pasid_attach, + &start, UINT_MAX, XA_PRESENT)) + return -EINVAL; + } else { + struct iommufd_attach *attach; + + if (!hwpt->pasid_compat) + return -EINVAL; + + attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID); + if (attach && attach->hwpt && !attach->hwpt->pasid_compat) + return -EINVAL; + } + + return 0; +} + +static bool iommufd_hwpt_compatible_device(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) +{ + struct pci_dev *pdev; + + if (!hwpt->fault || !dev_is_pci(idev->dev)) + return true; + + /* + * Once we turn on PCI/PRI support for VF, the response failure code + * should not be forwarded to the hardware due to PRI being a shared + * resource between PF and VFs. There is no coordination for this + * shared capability. This waits for a vPRI reset to recover. 
+ */ + pdev = to_pci_dev(idev->dev); + + return (!pdev->is_virtfn || !pci_pri_supported(pdev)); +} + +static int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev, + ioasid_t pasid) +{ + struct iommufd_attach_handle *handle; + int rc; + + if (!iommufd_hwpt_compatible_device(hwpt, idev)) + return -EINVAL; + + rc = iommufd_hwpt_pasid_compat(hwpt, idev, pasid); + if (rc) + return rc; + + handle = kzalloc(sizeof(*handle), GFP_KERNEL); + if (!handle) + return -ENOMEM; + + handle->idev = idev; + if (pasid == IOMMU_NO_PASID) + rc = iommu_attach_group_handle(hwpt->domain, idev->igroup->group, + &handle->handle); + else + rc = iommu_attach_device_pasid(hwpt->domain, idev->dev, pasid, + &handle->handle); + if (rc) + goto out_free_handle; + + return 0; + +out_free_handle: + kfree(handle); + return rc; +} + +static struct iommufd_attach_handle * +iommufd_device_get_attach_handle(struct iommufd_device *idev, ioasid_t pasid) +{ + struct iommu_attach_handle *handle; + + lockdep_assert_held(&idev->igroup->lock); + + handle = iommu_attach_handle_get(idev->igroup->group, pasid, 0); + if (IS_ERR(handle)) + return NULL; + return to_iommufd_handle(handle); +} + +static void iommufd_hwpt_detach_device(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev, + ioasid_t pasid) +{ + struct iommufd_attach_handle *handle; + + handle = iommufd_device_get_attach_handle(idev, pasid); + if (pasid == IOMMU_NO_PASID) + iommu_detach_group_handle(hwpt->domain, idev->igroup->group); + else + iommu_detach_device_pasid(hwpt->domain, idev->dev, pasid); + + iommufd_auto_response_faults(hwpt, handle); + kfree(handle); +} + +static int iommufd_hwpt_replace_device(struct iommufd_device *idev, + ioasid_t pasid, + struct iommufd_hw_pagetable *hwpt, + struct iommufd_hw_pagetable *old) +{ + struct iommufd_attach_handle *handle, *old_handle; + int rc; + + if (!iommufd_hwpt_compatible_device(hwpt, idev)) + return -EINVAL; + + rc = iommufd_hwpt_pasid_compat(hwpt, idev, pasid); + if (rc) + return rc; + + old_handle = iommufd_device_get_attach_handle(idev, pasid); + + handle = kzalloc(sizeof(*handle), GFP_KERNEL); + if (!handle) + return -ENOMEM; + + handle->idev = idev; + if (pasid == IOMMU_NO_PASID) + rc = iommu_replace_group_handle(idev->igroup->group, + hwpt->domain, &handle->handle); + else + rc = iommu_replace_device_pasid(hwpt->domain, idev->dev, + pasid, &handle->handle); + if (rc) + goto out_free_handle; + + iommufd_auto_response_faults(hwpt, old_handle); + kfree(old_handle); + + return 0; + +out_free_handle: + kfree(handle); + return rc; +} + int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) + struct iommufd_device *idev, ioasid_t pasid) { + struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt); + bool attach_resv = hwpt_paging && pasid == IOMMU_NO_PASID; + struct iommufd_group *igroup = idev->igroup; + struct iommufd_hw_pagetable *old_hwpt; + struct iommufd_attach *attach; int rc; - mutex_lock(&idev->igroup->lock); + mutex_lock(&igroup->lock); - if (idev->igroup->hwpt != NULL && idev->igroup->hwpt != hwpt) { - rc = -EINVAL; + attach = xa_cmpxchg(&igroup->pasid_attach, pasid, NULL, + XA_ZERO_ENTRY, GFP_KERNEL); + if (xa_is_err(attach)) { + rc = xa_err(attach); goto err_unlock; } - if (hwpt_is_paging(hwpt)) { - rc = iommufd_hwpt_paging_attach(to_hwpt_paging(hwpt), idev); + if (!attach) { + attach = kzalloc(sizeof(*attach), GFP_KERNEL); + if (!attach) { + rc = -ENOMEM; + goto err_release_pasid; + } + 
xa_init(&attach->device_array); + } + + old_hwpt = attach->hwpt; + + rc = xa_insert(&attach->device_array, idev->obj.id, XA_ZERO_ENTRY, + GFP_KERNEL); + if (rc) { + WARN_ON(rc == -EBUSY && !old_hwpt); + goto err_free_attach; + } + + if (old_hwpt && old_hwpt != hwpt) { + rc = -EINVAL; + goto err_release_devid; + } + + if (attach_resv) { + rc = iommufd_device_attach_reserved_iova(idev, hwpt_paging); if (rc) - goto err_unlock; + goto err_release_devid; } /* @@ -375,52 +654,76 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, * reserved regions are only updated during individual device * attachment. */ - if (list_empty(&idev->igroup->device_list)) { - rc = iommu_attach_group(hwpt->domain, idev->igroup->group); + if (iommufd_group_first_attach(igroup, pasid)) { + rc = iommufd_hwpt_attach_device(hwpt, idev, pasid); if (rc) goto err_unresv; - idev->igroup->hwpt = hwpt; + attach->hwpt = hwpt; + WARN_ON(xa_is_err(xa_store(&igroup->pasid_attach, pasid, attach, + GFP_KERNEL))); } refcount_inc(&hwpt->obj.users); - list_add_tail(&idev->group_item, &idev->igroup->device_list); - mutex_unlock(&idev->igroup->lock); + WARN_ON(xa_is_err(xa_store(&attach->device_array, idev->obj.id, + idev, GFP_KERNEL))); + mutex_unlock(&igroup->lock); return 0; err_unresv: - if (hwpt_is_paging(hwpt)) - iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt, - idev->dev); + if (attach_resv) + iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev); +err_release_devid: + xa_release(&attach->device_array, idev->obj.id); +err_free_attach: + if (iommufd_group_first_attach(igroup, pasid)) + kfree(attach); +err_release_pasid: + if (iommufd_group_first_attach(igroup, pasid)) + xa_release(&igroup->pasid_attach, pasid); err_unlock: - mutex_unlock(&idev->igroup->lock); + mutex_unlock(&igroup->lock); return rc; } struct iommufd_hw_pagetable * -iommufd_hw_pagetable_detach(struct iommufd_device *idev) +iommufd_hw_pagetable_detach(struct iommufd_device *idev, ioasid_t pasid) { - struct iommufd_hw_pagetable *hwpt = idev->igroup->hwpt; + struct iommufd_group *igroup = idev->igroup; + struct iommufd_hwpt_paging *hwpt_paging; + struct iommufd_hw_pagetable *hwpt; + struct iommufd_attach *attach; - mutex_lock(&idev->igroup->lock); - list_del(&idev->group_item); - if (list_empty(&idev->igroup->device_list)) { - iommu_detach_group(hwpt->domain, idev->igroup->group); - idev->igroup->hwpt = NULL; + mutex_lock(&igroup->lock); + attach = xa_load(&igroup->pasid_attach, pasid); + if (!attach) { + mutex_unlock(&igroup->lock); + return NULL; } - if (hwpt_is_paging(hwpt)) - iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt, - idev->dev); - mutex_unlock(&idev->igroup->lock); + + hwpt = attach->hwpt; + hwpt_paging = find_hwpt_paging(hwpt); + + xa_erase(&attach->device_array, idev->obj.id); + if (xa_empty(&attach->device_array)) { + iommufd_hwpt_detach_device(hwpt, idev, pasid); + xa_erase(&igroup->pasid_attach, pasid); + kfree(attach); + } + if (hwpt_paging && pasid == IOMMU_NO_PASID) + iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev); + mutex_unlock(&igroup->lock); + + iommufd_hw_pagetable_put(idev->ictx, hwpt); /* Caller must destroy hwpt */ return hwpt; } static struct iommufd_hw_pagetable * -iommufd_device_do_attach(struct iommufd_device *idev, +iommufd_device_do_attach(struct iommufd_device *idev, ioasid_t pasid, struct iommufd_hw_pagetable *hwpt) { int rc; - rc = iommufd_hw_pagetable_attach(hwpt, idev); + rc = iommufd_hw_pagetable_attach(hwpt, idev, pasid); if (rc) return ERR_PTR(rc); return 
NULL; @@ -430,27 +733,33 @@ static void iommufd_group_remove_reserved_iova(struct iommufd_group *igroup, struct iommufd_hwpt_paging *hwpt_paging) { + struct iommufd_attach *attach; struct iommufd_device *cur; + unsigned long index; lockdep_assert_held(&igroup->lock); - list_for_each_entry(cur, &igroup->device_list, group_item) + attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID); + xa_for_each(&attach->device_array, index, cur) iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, cur->dev); } static int -iommufd_group_do_replace_paging(struct iommufd_group *igroup, - struct iommufd_hwpt_paging *hwpt_paging) +iommufd_group_do_replace_reserved_iova(struct iommufd_group *igroup, + struct iommufd_hwpt_paging *hwpt_paging) { - struct iommufd_hw_pagetable *old_hwpt = igroup->hwpt; + struct iommufd_hwpt_paging *old_hwpt_paging; + struct iommufd_attach *attach; struct iommufd_device *cur; + unsigned long index; int rc; lockdep_assert_held(&igroup->lock); - if (!hwpt_is_paging(old_hwpt) || - hwpt_paging->ioas != to_hwpt_paging(old_hwpt)->ioas) { - list_for_each_entry(cur, &igroup->device_list, group_item) { + attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID); + old_hwpt_paging = find_hwpt_paging(attach->hwpt); + if (!old_hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas) { + xa_for_each(&attach->device_array, index, cur) { rc = iopt_table_enforce_dev_resv_regions( &hwpt_paging->ioas->iopt, cur->dev, NULL); if (rc) @@ -469,70 +778,81 @@ err_unresv: } static struct iommufd_hw_pagetable * -iommufd_device_do_replace(struct iommufd_device *idev, +iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid, struct iommufd_hw_pagetable *hwpt) { + struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt); + bool attach_resv = hwpt_paging && pasid == IOMMU_NO_PASID; + struct iommufd_hwpt_paging *old_hwpt_paging; struct iommufd_group *igroup = idev->igroup; struct iommufd_hw_pagetable *old_hwpt; + struct iommufd_attach *attach; unsigned int num_devices; int rc; - mutex_lock(&idev->igroup->lock); + mutex_lock(&igroup->lock); + + attach = xa_load(&igroup->pasid_attach, pasid); + if (!attach) { + rc = -EINVAL; + goto err_unlock; + } + + old_hwpt = attach->hwpt; - if (igroup->hwpt == NULL) { + WARN_ON(!old_hwpt || xa_empty(&attach->device_array)); + + if (!iommufd_device_is_attached(idev, pasid)) { rc = -EINVAL; goto err_unlock; } - if (hwpt == igroup->hwpt) { - mutex_unlock(&idev->igroup->lock); + if (hwpt == old_hwpt) { + mutex_unlock(&igroup->lock); return NULL; } - old_hwpt = igroup->hwpt; - if (hwpt_is_paging(hwpt)) { - rc = iommufd_group_do_replace_paging(igroup, - to_hwpt_paging(hwpt)); + if (attach_resv) { + rc = iommufd_group_do_replace_reserved_iova(igroup, hwpt_paging); if (rc) goto err_unlock; } - rc = iommu_group_replace_domain(igroup->group, hwpt->domain); + rc = iommufd_hwpt_replace_device(idev, pasid, hwpt, old_hwpt); if (rc) goto err_unresv; - if (hwpt_is_paging(old_hwpt) && - (!hwpt_is_paging(hwpt) || - to_hwpt_paging(hwpt)->ioas != to_hwpt_paging(old_hwpt)->ioas)) - iommufd_group_remove_reserved_iova(igroup, - to_hwpt_paging(old_hwpt)); + old_hwpt_paging = find_hwpt_paging(old_hwpt); + if (old_hwpt_paging && pasid == IOMMU_NO_PASID && + (!hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas)) + iommufd_group_remove_reserved_iova(igroup, old_hwpt_paging); - igroup->hwpt = hwpt; + attach->hwpt = hwpt; - num_devices = list_count_nodes(&igroup->device_list); + num_devices = iommufd_group_device_num(igroup, pasid); /* - * Move the refcounts held by the 
device_list to the new hwpt. Retain a + * Move the refcounts held by the device_array to the new hwpt. Retain a * refcount for this thread as the caller will free it. */ refcount_add(num_devices, &hwpt->obj.users); if (num_devices > 1) WARN_ON(refcount_sub_and_test(num_devices - 1, &old_hwpt->obj.users)); - mutex_unlock(&idev->igroup->lock); + mutex_unlock(&igroup->lock); /* Caller must destroy old_hwpt */ return old_hwpt; err_unresv: - if (hwpt_is_paging(hwpt)) - iommufd_group_remove_reserved_iova(igroup, - to_hwpt_paging(old_hwpt)); + if (attach_resv) + iommufd_group_remove_reserved_iova(igroup, hwpt_paging); err_unlock: - mutex_unlock(&idev->igroup->lock); + mutex_unlock(&igroup->lock); return ERR_PTR(rc); } typedef struct iommufd_hw_pagetable *(*attach_fn)( - struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt); + struct iommufd_device *idev, ioasid_t pasid, + struct iommufd_hw_pagetable *hwpt); /* * When automatically managing the domains we search for a compatible domain in @@ -540,7 +860,7 @@ typedef struct iommufd_hw_pagetable *(*attach_fn)( * Automatic domain selection will never pick a manually created domain. */ static struct iommufd_hw_pagetable * -iommufd_device_auto_get_domain(struct iommufd_device *idev, +iommufd_device_auto_get_domain(struct iommufd_device *idev, ioasid_t pasid, struct iommufd_ioas *ioas, u32 *pt_id, attach_fn do_attach) { @@ -569,7 +889,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, hwpt = &hwpt_paging->common; if (!iommufd_lock_obj(&hwpt->obj)) continue; - destroy_hwpt = (*do_attach)(idev, hwpt); + destroy_hwpt = (*do_attach)(idev, pasid, hwpt); if (IS_ERR(destroy_hwpt)) { iommufd_put_object(idev->ictx, &hwpt->obj); /* @@ -587,8 +907,8 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, goto out_unlock; } - hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, 0, - immediate_attach, NULL); + hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, pasid, + 0, immediate_attach, NULL); if (IS_ERR(hwpt_paging)) { destroy_hwpt = ERR_CAST(hwpt_paging); goto out_unlock; @@ -596,7 +916,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, hwpt = &hwpt_paging->common; if (!immediate_attach) { - destroy_hwpt = (*do_attach)(idev, hwpt); + destroy_hwpt = (*do_attach)(idev, pasid, hwpt); if (IS_ERR(destroy_hwpt)) goto out_abort; } else { @@ -617,8 +937,9 @@ out_unlock: return destroy_hwpt; } -static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, - attach_fn do_attach) +static int iommufd_device_change_pt(struct iommufd_device *idev, + ioasid_t pasid, + u32 *pt_id, attach_fn do_attach) { struct iommufd_hw_pagetable *destroy_hwpt; struct iommufd_object *pt_obj; @@ -633,7 +954,7 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, struct iommufd_hw_pagetable *hwpt = container_of(pt_obj, struct iommufd_hw_pagetable, obj); - destroy_hwpt = (*do_attach)(idev, hwpt); + destroy_hwpt = (*do_attach)(idev, pasid, hwpt); if (IS_ERR(destroy_hwpt)) goto out_put_pt_obj; break; @@ -642,8 +963,8 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, struct iommufd_ioas *ioas = container_of(pt_obj, struct iommufd_ioas, obj); - destroy_hwpt = iommufd_device_auto_get_domain(idev, ioas, pt_id, - do_attach); + destroy_hwpt = iommufd_device_auto_get_domain(idev, pasid, ioas, + pt_id, do_attach); if (IS_ERR(destroy_hwpt)) goto out_put_pt_obj; break; @@ -665,22 +986,26 @@ out_put_pt_obj: } /** - * iommufd_device_attach - Connect a device to an 
iommu_domain + * iommufd_device_attach - Connect a device/pasid to an iommu_domain + * @idev: device to attach + * @pasid: pasid to attach + * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING + * Output the IOMMUFD_OBJ_HWPT_PAGING ID + * - * This connects the device to an iommu_domain, either automatically or manually - * selected. Once this completes the device could do DMA. + * This connects the device/pasid to an iommu_domain, either automatically + * or manually selected. Once this completes the device could do DMA with + * @pasid. @pasid is IOMMU_NO_PASID if this attach is for no pasid usage. * * The caller should return the resulting pt_id back to userspace. * This function is undone by calling iommufd_device_detach(). */ -int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) +int iommufd_device_attach(struct iommufd_device *idev, ioasid_t pasid, + u32 *pt_id) { int rc; - rc = iommufd_device_change_pt(idev, pt_id, &iommufd_device_do_attach); + rc = iommufd_device_change_pt(idev, pasid, pt_id, + &iommufd_device_do_attach); if (rc) return rc; @@ -691,11 +1016,12 @@ int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) refcount_inc(&idev->obj.users); return 0; } -EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, "IOMMUFD"); /** - * iommufd_device_replace - Change the device's iommu_domain + * iommufd_device_replace - Change the device/pasid's iommu_domain * @idev: device to change + * @pasid: pasid to change * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING * Output the IOMMUFD_OBJ_HWPT_PAGING ID * @@ -706,31 +1032,36 @@ EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD); * * If it fails then no change is made to the attachment. The iommu driver may * implement this so there is no disruption in translation. This can only be - * called if iommufd_device_attach() has already succeeded. + * called if iommufd_device_attach() has already succeeded. @pasid is + * IOMMU_NO_PASID for no pasid usage. */ -int iommufd_device_replace(struct iommufd_device *idev, u32 *pt_id) +int iommufd_device_replace(struct iommufd_device *idev, ioasid_t pasid, + u32 *pt_id) { - return iommufd_device_change_pt(idev, pt_id, + return iommufd_device_change_pt(idev, pasid, pt_id, &iommufd_device_do_replace); } -EXPORT_SYMBOL_NS_GPL(iommufd_device_replace, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_device_replace, "IOMMUFD"); /** - * iommufd_device_detach - Disconnect a device to an iommu_domain + * iommufd_device_detach - Disconnect a device/pasid from an iommu_domain + * @idev: device to detach + * @pasid: pasid to detach * * Undo iommufd_device_attach(). This disconnects the idev from the previously * attached pt_id. The device returns back to a blocked DMA translation. + * @pasid is IOMMU_NO_PASID for no pasid usage. 
*/ -void iommufd_device_detach(struct iommufd_device *idev) +void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid) { struct iommufd_hw_pagetable *hwpt; - hwpt = iommufd_hw_pagetable_detach(idev); - iommufd_hw_pagetable_put(idev->ictx, hwpt); + hwpt = iommufd_hw_pagetable_detach(idev, pasid); + if (!hwpt) + return; refcount_dec(&idev->obj.users); } -EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, "IOMMUFD"); /* * On success, it will refcount_inc() at a valid new_ioas and refcount_dec() at @@ -769,7 +1100,7 @@ static int iommufd_access_change_ioas(struct iommufd_access *access, } if (cur_ioas) { - if (access->ops->unmap) { + if (!iommufd_access_is_internal(access) && access->ops->unmap) { mutex_unlock(&access->ioas_lock); access->ops->unmap(access->data, 0, ULONG_MAX); mutex_lock(&access->ioas_lock); @@ -805,7 +1136,39 @@ void iommufd_access_destroy_object(struct iommufd_object *obj) if (access->ioas) WARN_ON(iommufd_access_change_ioas(access, NULL)); mutex_unlock(&access->ioas_lock); - iommufd_ctx_put(access->ictx); + if (!iommufd_access_is_internal(access)) + iommufd_ctx_put(access->ictx); +} + +static struct iommufd_access *__iommufd_access_create(struct iommufd_ctx *ictx) +{ + struct iommufd_access *access; + + /* + * There is no uAPI for the access object, but to keep things symmetric + * use the object infrastructure anyhow. + */ + access = iommufd_object_alloc(ictx, access, IOMMUFD_OBJ_ACCESS); + if (IS_ERR(access)) + return access; + + /* The calling driver is a user until iommufd_access_destroy() */ + refcount_inc(&access->obj.users); + mutex_init(&access->ioas_lock); + return access; +} + +struct iommufd_access *iommufd_access_create_internal(struct iommufd_ctx *ictx) +{ + struct iommufd_access *access; + + access = __iommufd_access_create(ictx); + if (IS_ERR(access)) + return access; + access->iova_alignment = PAGE_SIZE; + + iommufd_object_finalize(ictx, &access->obj); + return access; } /** @@ -827,11 +1190,7 @@ iommufd_access_create(struct iommufd_ctx *ictx, { struct iommufd_access *access; - /* - * There is no uAPI for the access object, but to keep things symmetric - * use the object infrastructure anyhow. 
- */ - access = iommufd_object_alloc(ictx, access, IOMMUFD_OBJ_ACCESS); + access = __iommufd_access_create(ictx); if (IS_ERR(access)) return access; @@ -843,16 +1202,13 @@ iommufd_access_create(struct iommufd_ctx *ictx, else access->iova_alignment = 1; - /* The calling driver is a user until iommufd_access_destroy() */ - refcount_inc(&access->obj.users); access->ictx = ictx; iommufd_ctx_get(ictx); iommufd_object_finalize(ictx, &access->obj); *id = access->obj.id; - mutex_init(&access->ioas_lock); return access; } -EXPORT_SYMBOL_NS_GPL(iommufd_access_create, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_access_create, "IOMMUFD"); /** * iommufd_access_destroy - Destroy an iommufd_access @@ -864,7 +1220,7 @@ void iommufd_access_destroy(struct iommufd_access *access) { iommufd_object_destroy_user(access->ictx, &access->obj); } -EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, "IOMMUFD"); void iommufd_access_detach(struct iommufd_access *access) { @@ -876,7 +1232,7 @@ void iommufd_access_detach(struct iommufd_access *access) WARN_ON(iommufd_access_change_ioas(access, NULL)); mutex_unlock(&access->ioas_lock); } -EXPORT_SYMBOL_NS_GPL(iommufd_access_detach, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_access_detach, "IOMMUFD"); int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id) { @@ -892,7 +1248,23 @@ int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id) mutex_unlock(&access->ioas_lock); return rc; } -EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, "IOMMUFD"); + +int iommufd_access_attach_internal(struct iommufd_access *access, + struct iommufd_ioas *ioas) +{ + int rc; + + mutex_lock(&access->ioas_lock); + if (WARN_ON(access->ioas)) { + mutex_unlock(&access->ioas_lock); + return -EINVAL; + } + + rc = iommufd_access_change_ioas(access, ioas); + mutex_unlock(&access->ioas_lock); + return rc; +} int iommufd_access_replace(struct iommufd_access *access, u32 ioas_id) { @@ -907,7 +1279,7 @@ int iommufd_access_replace(struct iommufd_access *access, u32 ioas_id) mutex_unlock(&access->ioas_lock); return rc; } -EXPORT_SYMBOL_NS_GPL(iommufd_access_replace, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_access_replace, "IOMMUFD"); /** * iommufd_access_notify_unmap - Notify users of an iopt to stop using it @@ -935,7 +1307,8 @@ void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, xa_lock(&ioas->iopt.access_list); xa_for_each(&ioas->iopt.access_list, index, access) { - if (!iommufd_lock_obj(&access->obj)) + if (!iommufd_lock_obj(&access->obj) || + iommufd_access_is_internal(access)) continue; xa_unlock(&ioas->iopt.access_list); @@ -959,6 +1332,7 @@ void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, void iommufd_access_unpin_pages(struct iommufd_access *access, unsigned long iova, unsigned long length) { + bool internal = iommufd_access_is_internal(access); struct iopt_area_contig_iter iter; struct io_pagetable *iopt; unsigned long last_iova; @@ -985,12 +1359,13 @@ void iommufd_access_unpin_pages(struct iommufd_access *access, area, iopt_area_iova_to_index(area, iter.cur_iova), iopt_area_iova_to_index( area, - min(last_iova, iopt_area_last_iova(area)))); + min(last_iova, iopt_area_last_iova(area))), + internal); WARN_ON(!iopt_area_contig_done(&iter)); up_read(&iopt->iova_rwsem); mutex_unlock(&access->ioas_lock); } -EXPORT_SYMBOL_NS_GPL(iommufd_access_unpin_pages, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_access_unpin_pages, "IOMMUFD"); 
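The pin/unpin exports above form the in-kernel access API that emulated-device drivers build on. Below is a minimal consumer sketch, assuming the iommufd_access_ops layout, the IOMMUFD_ACCESS_RW_WRITE flag, and the create/attach/destroy helpers declared in include/linux/iommufd.h as of this series; the my_consumer_* names and the single-page transfer are illustrative only, not part of this patch:

#include <linux/err.h>
#include <linux/mm.h>
#include <linux/iommufd.h>

/* Called when an IOVA range holding our pins is about to be unmapped */
static void my_consumer_unmap(void *data, unsigned long iova,
			      unsigned long length)
{
	/* Drop every pin overlapping [iova, iova + length - 1] */
}

static const struct iommufd_access_ops my_consumer_ops = {
	.needs_pin_pages = 1,
	.unmap = my_consumer_unmap,
};

static int my_consumer_dma_one_page(struct iommufd_ctx *ictx, u32 ioas_id,
				    unsigned long iova)
{
	struct iommufd_access *access;
	struct page *pages[1];
	u32 access_id;
	int rc;

	access = iommufd_access_create(ictx, &my_consumer_ops, NULL,
				       &access_id);
	if (IS_ERR(access))
		return PTR_ERR(access);

	rc = iommufd_access_attach(access, ioas_id);
	if (rc)
		goto out_destroy;

	/* Pinning requires a page aligned iova when needs_pin_pages is set */
	rc = iommufd_access_pin_pages(access, iova, PAGE_SIZE, pages,
				      IOMMUFD_ACCESS_RW_WRITE);
	if (rc)
		goto out_destroy;

	/* ... issue DMA against pages[0] ... */

	iommufd_access_unpin_pages(access, iova, PAGE_SIZE);
out_destroy:
	iommufd_access_destroy(access);
	return rc;
}

The unmap callback is what iommufd_access_notify_unmap() above invokes, so a pinning consumer can release its pins before an area is torn down; a consumer that only uses iommufd_access_rw() can skip pinning entirely.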
static bool iopt_area_contig_is_aligned(struct iopt_area_contig_iter *iter) { @@ -1034,6 +1409,7 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, unsigned long length, struct page **out_pages, unsigned int flags) { + bool internal = iommufd_access_is_internal(access); struct iopt_area_contig_iter iter; struct io_pagetable *iopt; unsigned long last_iova; @@ -1042,7 +1418,8 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, /* Driver's ops don't support pin_pages */ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && - WARN_ON(access->iova_alignment != PAGE_SIZE || !access->ops->unmap)) + WARN_ON(access->iova_alignment != PAGE_SIZE || + (!internal && !access->ops->unmap))) return -EINVAL; if (!length) @@ -1076,7 +1453,7 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, } rc = iopt_area_add_access(area, index, last_index, out_pages, - flags); + flags, internal); if (rc) goto err_remove; out_pages += last_index - index + 1; @@ -1099,13 +1476,14 @@ err_remove: iopt_area_iova_to_index(area, iter.cur_iova), iopt_area_iova_to_index( area, min(last_iova, - iopt_area_last_iova(area)))); + iopt_area_last_iova(area))), + internal); } up_read(&iopt->iova_rwsem); mutex_unlock(&access->ioas_lock); return rc; } -EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, "IOMMUFD"); /** * iommufd_access_rw - Read or write data under the iova @@ -1126,7 +1504,7 @@ int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, struct io_pagetable *iopt; struct iopt_area *area; unsigned long last_iova; - int rc; + int rc = -EINVAL; if (!length) return -EINVAL; @@ -1169,10 +1547,11 @@ err_out: mutex_unlock(&access->ioas_lock); return rc; } -EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, "IOMMUFD"); int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) { + const u32 SUPPORTED_FLAGS = IOMMU_HW_INFO_FLAG_INPUT_TYPE; struct iommu_hw_info *cmd = ucmd->cmd; void __user *user_ptr = u64_to_user_ptr(cmd->data_uptr); const struct iommu_ops *ops; @@ -1182,9 +1561,15 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) void *data; int rc; - if (cmd->flags || cmd->__reserved) + if (cmd->flags & ~SUPPORTED_FLAGS) + return -EOPNOTSUPP; + if (cmd->__reserved[0] || cmd->__reserved[1] || cmd->__reserved[2]) return -EOPNOTSUPP; + /* Clear the type field since drivers don't support a random input */ + if (!(cmd->flags & IOMMU_HW_INFO_FLAG_INPUT_TYPE)) + cmd->in_data_type = IOMMU_HW_INFO_TYPE_DEFAULT; + idev = iommufd_get_device(ucmd, cmd->dev_id); if (IS_ERR(idev)) return PTR_ERR(idev); @@ -1203,7 +1588,7 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) */ if (WARN_ON_ONCE(cmd->out_data_type == IOMMU_HW_INFO_TYPE_NONE)) { - rc = -ENODEV; + rc = -EOPNOTSUPP; goto out_free; } } else { @@ -1239,6 +1624,36 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) if (device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING)) cmd->out_capabilities |= IOMMU_HW_CAP_DIRTY_TRACKING; + cmd->out_max_pasid_log2 = 0; + /* + * Currently, all iommu drivers enable PASID in the probe_device() + * op if the iommu and device support it. So the max_pasids stored in + * dev->iommu indicates both PASID support and enable status. A + * non-zero dev->iommu->max_pasids means PASID is supported and + * enabled. The iommufd only reports PASID capability to userspace + * if it's enabled. 
+ */ + if (idev->dev->iommu->max_pasids) { + cmd->out_max_pasid_log2 = ilog2(idev->dev->iommu->max_pasids); + + if (dev_is_pci(idev->dev)) { + struct pci_dev *pdev = to_pci_dev(idev->dev); + int ctrl; + + ctrl = pci_pasid_status(pdev); + + WARN_ON_ONCE(ctrl < 0 || + !(ctrl & PCI_PASID_CTRL_ENABLE)); + + if (ctrl & PCI_PASID_CTRL_EXEC) + cmd->out_capabilities |= + IOMMU_HW_CAP_PCI_PASID_EXEC; + if (ctrl & PCI_PASID_CTRL_PRIV) + cmd->out_capabilities |= + IOMMU_HW_CAP_PCI_PASID_PRIV; + } + } + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_free: kfree(data); diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c new file mode 100644 index 000000000000..21d4a35538f6 --- /dev/null +++ b/drivers/iommu/iommufd/driver.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ +#include "iommufd_private.h" + +/* Driver should use a per-structure helper in include/linux/iommufd.h */ +int _iommufd_object_depend(struct iommufd_object *obj_dependent, + struct iommufd_object *obj_depended) +{ + /* Reject self dependency that deadlocks */ + if (obj_dependent == obj_depended) + return -EINVAL; + /* Only support dependency between two objects of the same type */ + if (obj_dependent->type != obj_depended->type) + return -EINVAL; + + refcount_inc(&obj_depended->users); + return 0; +} +EXPORT_SYMBOL_NS_GPL(_iommufd_object_depend, "IOMMUFD"); + +/* Driver should use a per-structure helper in include/linux/iommufd.h */ +void _iommufd_object_undepend(struct iommufd_object *obj_dependent, + struct iommufd_object *obj_depended) +{ + if (WARN_ON_ONCE(obj_dependent == obj_depended || + obj_dependent->type != obj_depended->type)) + return; + + refcount_dec(&obj_depended->users); +} +EXPORT_SYMBOL_NS_GPL(_iommufd_object_undepend, "IOMMUFD"); + +/* + * Allocate an @offset to return to user space to use for an mmap() syscall + * + * Driver should use a per-structure helper in include/linux/iommufd.h + */ +int _iommufd_alloc_mmap(struct iommufd_ctx *ictx, struct iommufd_object *owner, + phys_addr_t mmio_addr, size_t length, + unsigned long *offset) +{ + struct iommufd_mmap *immap; + unsigned long startp; + int rc; + + if (!PAGE_ALIGNED(mmio_addr)) + return -EINVAL; + if (!length || !PAGE_ALIGNED(length)) + return -EINVAL; + + immap = kzalloc(sizeof(*immap), GFP_KERNEL); + if (!immap) + return -ENOMEM; + immap->owner = owner; + immap->length = length; + immap->mmio_addr = mmio_addr; + + /* Skip the first page to help the caller identify the returned offset */ + rc = mtree_alloc_range(&ictx->mt_mmap, &startp, immap, immap->length, + PAGE_SIZE, ULONG_MAX, GFP_KERNEL); + if (rc < 0) { + kfree(immap); + return rc; + } + + /* mmap() syscall will right-shift the offset in vma->vm_pgoff too */ + immap->vm_pgoff = startp >> PAGE_SHIFT; + *offset = startp; + return 0; +} +EXPORT_SYMBOL_NS_GPL(_iommufd_alloc_mmap, "IOMMUFD"); + +/* Driver should use a per-structure helper in include/linux/iommufd.h */ +void _iommufd_destroy_mmap(struct iommufd_ctx *ictx, + struct iommufd_object *owner, unsigned long offset) +{ + struct iommufd_mmap *immap; + + immap = mtree_erase(&ictx->mt_mmap, offset); + WARN_ON_ONCE(!immap || immap->owner != owner); + kfree(immap); +} +EXPORT_SYMBOL_NS_GPL(_iommufd_destroy_mmap, "IOMMUFD"); + +struct device *iommufd_vdevice_to_device(struct iommufd_vdevice *vdev) +{ + return vdev->idev->dev; +} +EXPORT_SYMBOL_NS_GPL(iommufd_vdevice_to_device, "IOMMUFD"); + +/* Caller should xa_lock(&viommu->vdevs) to protect the return 
value */ +struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, + unsigned long vdev_id) +{ + struct iommufd_vdevice *vdev; + + lockdep_assert_held(&viommu->vdevs.xa_lock); + + vdev = xa_load(&viommu->vdevs, vdev_id); + return vdev ? iommufd_vdevice_to_device(vdev) : NULL; +} +EXPORT_SYMBOL_NS_GPL(iommufd_viommu_find_dev, "IOMMUFD"); + +/* Return -ENOENT if the device is not associated with the vIOMMU */ +int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, + struct device *dev, unsigned long *vdev_id) +{ + struct iommufd_vdevice *vdev; + unsigned long index; + int rc = -ENOENT; + + if (WARN_ON_ONCE(!vdev_id)) + return -EINVAL; + + xa_lock(&viommu->vdevs); + xa_for_each(&viommu->vdevs, index, vdev) { + if (iommufd_vdevice_to_device(vdev) == dev) { + *vdev_id = vdev->virt_id; + rc = 0; + break; + } + } + xa_unlock(&viommu->vdevs); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_viommu_get_vdev_id, "IOMMUFD"); + +/* + * Typically called from a driver's threaded IRQ handler. + * The @type and @event_data must be defined in include/uapi/linux/iommufd.h + */ +int iommufd_viommu_report_event(struct iommufd_viommu *viommu, + enum iommu_veventq_type type, void *event_data, + size_t data_len) +{ + struct iommufd_veventq *veventq; + struct iommufd_vevent *vevent; + int rc = 0; + + if (WARN_ON_ONCE(!data_len || !event_data)) + return -EINVAL; + + down_read(&viommu->veventqs_rwsem); + + veventq = iommufd_viommu_find_veventq(viommu, type); + if (!veventq) { + rc = -EOPNOTSUPP; + goto out_unlock_veventqs; + } + + spin_lock(&veventq->common.lock); + if (veventq->num_events == veventq->depth) { + vevent = &veventq->lost_events_header; + goto out_set_header; + } + + vevent = kzalloc(struct_size(vevent, event_data, data_len), GFP_ATOMIC); + if (!vevent) { + rc = -ENOMEM; + vevent = &veventq->lost_events_header; + goto out_set_header; + } + vevent->data_len = data_len; + memcpy(vevent->event_data, event_data, data_len); + veventq->num_events++; + +out_set_header: + iommufd_vevent_handler(veventq, vevent); + spin_unlock(&veventq->common.lock); +out_unlock_veventqs: + up_read(&viommu->veventqs_rwsem); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_viommu_report_event, "IOMMUFD"); + +#ifdef CONFIG_IRQ_MSI_IOMMU +/* + * Get an iommufd_sw_msi_map for the msi physical address requested by the irq + * layer. The mapping to IOVA is global to the iommufd file descriptor; every + * domain that is attached to a device using the same MSI parameters will use + * the same IOVA. 
+ */ +static struct iommufd_sw_msi_map * +iommufd_sw_msi_get_map(struct iommufd_ctx *ictx, phys_addr_t msi_addr, + phys_addr_t sw_msi_start) +{ + struct iommufd_sw_msi_map *cur; + unsigned int max_pgoff = 0; + + lockdep_assert_held(&ictx->sw_msi_lock); + + list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) { + if (cur->sw_msi_start != sw_msi_start) + continue; + max_pgoff = max(max_pgoff, cur->pgoff + 1); + if (cur->msi_addr == msi_addr) + return cur; + } + + if (ictx->sw_msi_id >= + BITS_PER_BYTE * sizeof_field(struct iommufd_sw_msi_maps, bitmap)) + return ERR_PTR(-EOVERFLOW); + + cur = kzalloc(sizeof(*cur), GFP_KERNEL); + if (!cur) + return ERR_PTR(-ENOMEM); + + cur->sw_msi_start = sw_msi_start; + cur->msi_addr = msi_addr; + cur->pgoff = max_pgoff; + cur->id = ictx->sw_msi_id++; + list_add_tail(&cur->sw_msi_item, &ictx->sw_msi_list); + return cur; +} + +int iommufd_sw_msi_install(struct iommufd_ctx *ictx, + struct iommufd_hwpt_paging *hwpt_paging, + struct iommufd_sw_msi_map *msi_map) +{ + unsigned long iova; + + lockdep_assert_held(&ictx->sw_msi_lock); + + iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE; + if (!test_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap)) { + int rc; + + rc = iommu_map(hwpt_paging->common.domain, iova, + msi_map->msi_addr, PAGE_SIZE, + IOMMU_WRITE | IOMMU_READ | IOMMU_MMIO, + GFP_KERNEL_ACCOUNT); + if (rc) + return rc; + __set_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap); + } + return 0; +} +EXPORT_SYMBOL_NS_GPL(iommufd_sw_msi_install, "IOMMUFD_INTERNAL"); + +/* + * Called by the irq code if the platform translates the MSI address through the + * IOMMU. msi_addr is the physical address of the MSI page. iommufd will + * allocate an fd-global iova for the physical page that is the same on all + * domains and devices. + */ +int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr) +{ + struct device *dev = msi_desc_to_dev(desc); + struct iommufd_hwpt_paging *hwpt_paging; + struct iommu_attach_handle *raw_handle; + struct iommufd_attach_handle *handle; + struct iommufd_sw_msi_map *msi_map; + struct iommufd_ctx *ictx; + unsigned long iova; + int rc; + + /* + * It is safe to call iommu_attach_handle_get() here because the iommu + * core code invokes this under the group mutex which also prevents any + * change of the attach handle for the duration of this function. + */ + iommu_group_mutex_assert(dev); + + raw_handle = + iommu_attach_handle_get(dev->iommu_group, IOMMU_NO_PASID, 0); + if (IS_ERR(raw_handle)) + return 0; + hwpt_paging = find_hwpt_paging(domain->iommufd_hwpt); + + handle = to_iommufd_handle(raw_handle); + /* No IOMMU_RESV_SW_MSI means no change to the msi_msg */ + if (handle->idev->igroup->sw_msi_start == PHYS_ADDR_MAX) + return 0; + + ictx = handle->idev->ictx; + guard(mutex)(&ictx->sw_msi_lock); + /* + * The input msi_addr is the exact byte offset of the MSI doorbell; we + * assume the caller has checked that it is contained within an MMIO + * region that is safe to map at PAGE_SIZE. 
+ */ + msi_map = iommufd_sw_msi_get_map(handle->idev->ictx, + msi_addr & PAGE_MASK, + handle->idev->igroup->sw_msi_start); + if (IS_ERR(msi_map)) + return PTR_ERR(msi_map); + + rc = iommufd_sw_msi_install(ictx, hwpt_paging, msi_map); + if (rc) + return rc; + __set_bit(msi_map->id, handle->idev->igroup->required_sw_msi.bitmap); + + iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE; + msi_desc_set_iommu_msi_iova(desc, iova, PAGE_SHIFT); + return 0; +} +EXPORT_SYMBOL_NS_GPL(iommufd_sw_msi, "IOMMUFD"); +#endif + +MODULE_DESCRIPTION("iommufd code shared with builtin modules"); +MODULE_IMPORT_NS("IOMMUFD_INTERNAL"); +MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/iommufd/eventq.c b/drivers/iommu/iommufd/eventq.c new file mode 100644 index 000000000000..e23d9ee4fe38 --- /dev/null +++ b/drivers/iommu/iommufd/eventq.c @@ -0,0 +1,541 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2024 Intel Corporation + */ +#define pr_fmt(fmt) "iommufd: " fmt + +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/iommufd.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/poll.h> +#include <uapi/linux/iommufd.h> + +#include "../iommu-priv.h" +#include "iommufd_private.h" + +/* IOMMUFD_OBJ_FAULT Functions */ +void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, + struct iommufd_attach_handle *handle) +{ + struct iommufd_fault *fault = hwpt->fault; + struct iopf_group *group, *next; + struct list_head free_list; + unsigned long index; + + if (!fault || !handle) + return; + INIT_LIST_HEAD(&free_list); + + mutex_lock(&fault->mutex); + spin_lock(&fault->common.lock); + list_for_each_entry_safe(group, next, &fault->common.deliver, node) { + if (group->attach_handle != &handle->handle) + continue; + list_move(&group->node, &free_list); + } + spin_unlock(&fault->common.lock); + + list_for_each_entry_safe(group, next, &free_list, node) { + list_del(&group->node); + iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); + iopf_free_group(group); + } + + xa_for_each(&fault->response, index, group) { + if (group->attach_handle != &handle->handle) + continue; + xa_erase(&fault->response, index); + iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); + iopf_free_group(group); + } + mutex_unlock(&fault->mutex); +} + +void iommufd_fault_destroy(struct iommufd_object *obj) +{ + struct iommufd_eventq *eventq = + container_of(obj, struct iommufd_eventq, obj); + struct iommufd_fault *fault = eventq_to_fault(eventq); + struct iopf_group *group, *next; + unsigned long index; + + /* + * The iommufd object's reference count is zero at this point. + * We can be confident that no other threads are currently + * accessing this pointer. Therefore, acquiring the mutex here + * is unnecessary. 
+ */ + list_for_each_entry_safe(group, next, &fault->common.deliver, node) { + list_del(&group->node); + iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); + iopf_free_group(group); + } + xa_for_each(&fault->response, index, group) { + xa_erase(&fault->response, index); + iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); + iopf_free_group(group); + } + xa_destroy(&fault->response); + mutex_destroy(&fault->mutex); +} + +static void iommufd_compose_fault_message(struct iommu_fault *fault, + struct iommu_hwpt_pgfault *hwpt_fault, + struct iommufd_device *idev, + u32 cookie) +{ + hwpt_fault->flags = fault->prm.flags; + hwpt_fault->dev_id = idev->obj.id; + hwpt_fault->pasid = fault->prm.pasid; + hwpt_fault->grpid = fault->prm.grpid; + hwpt_fault->perm = fault->prm.perm; + hwpt_fault->addr = fault->prm.addr; + hwpt_fault->length = 0; + hwpt_fault->cookie = cookie; +} + +/* Fetch the first node out of the fault->deliver list */ +static struct iopf_group * +iommufd_fault_deliver_fetch(struct iommufd_fault *fault) +{ + struct list_head *list = &fault->common.deliver; + struct iopf_group *group = NULL; + + spin_lock(&fault->common.lock); + if (!list_empty(list)) { + group = list_first_entry(list, struct iopf_group, node); + list_del(&group->node); + } + spin_unlock(&fault->common.lock); + return group; +} + +/* Restore a node back to the head of the fault->deliver list */ +static void iommufd_fault_deliver_restore(struct iommufd_fault *fault, + struct iopf_group *group) +{ + spin_lock(&fault->common.lock); + list_add(&group->node, &fault->common.deliver); + spin_unlock(&fault->common.lock); +} + +static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf, + size_t count, loff_t *ppos) +{ + size_t fault_size = sizeof(struct iommu_hwpt_pgfault); + struct iommufd_eventq *eventq = filep->private_data; + struct iommufd_fault *fault = eventq_to_fault(eventq); + struct iommu_hwpt_pgfault data = {}; + struct iommufd_device *idev; + struct iopf_group *group; + struct iopf_fault *iopf; + size_t done = 0; + int rc = 0; + + if (*ppos || count % fault_size) + return -ESPIPE; + + mutex_lock(&fault->mutex); + while ((group = iommufd_fault_deliver_fetch(fault))) { + if (done >= count || + group->fault_count * fault_size > count - done) { + iommufd_fault_deliver_restore(fault, group); + break; + } + + rc = xa_alloc(&fault->response, &group->cookie, group, + xa_limit_32b, GFP_KERNEL); + if (rc) { + iommufd_fault_deliver_restore(fault, group); + break; + } + + idev = to_iommufd_handle(group->attach_handle)->idev; + list_for_each_entry(iopf, &group->faults, list) { + iommufd_compose_fault_message(&iopf->fault, + &data, idev, + group->cookie); + if (copy_to_user(buf + done, &data, fault_size)) { + xa_erase(&fault->response, group->cookie); + iommufd_fault_deliver_restore(fault, group); + rc = -EFAULT; + break; + } + done += fault_size; + } + } + mutex_unlock(&fault->mutex); + + return done == 0 ? 
rc : done; +} + +static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *buf, + size_t count, loff_t *ppos) +{ + size_t response_size = sizeof(struct iommu_hwpt_page_response); + struct iommufd_eventq *eventq = filep->private_data; + struct iommufd_fault *fault = eventq_to_fault(eventq); + struct iommu_hwpt_page_response response; + struct iopf_group *group; + size_t done = 0; + int rc = 0; + + if (*ppos || count % response_size) + return -ESPIPE; + + mutex_lock(&fault->mutex); + while (count > done) { + rc = copy_from_user(&response, buf + done, response_size); + if (rc) + break; + + static_assert((int)IOMMUFD_PAGE_RESP_SUCCESS == + (int)IOMMU_PAGE_RESP_SUCCESS); + static_assert((int)IOMMUFD_PAGE_RESP_INVALID == + (int)IOMMU_PAGE_RESP_INVALID); + if (response.code != IOMMUFD_PAGE_RESP_SUCCESS && + response.code != IOMMUFD_PAGE_RESP_INVALID) { + rc = -EINVAL; + break; + } + + group = xa_erase(&fault->response, response.cookie); + if (!group) { + rc = -EINVAL; + break; + } + + iopf_group_response(group, response.code); + iopf_free_group(group); + done += response_size; + } + mutex_unlock(&fault->mutex); + + return done == 0 ? rc : done; +} + +/* IOMMUFD_OBJ_VEVENTQ Functions */ + +void iommufd_veventq_abort(struct iommufd_object *obj) +{ + struct iommufd_eventq *eventq = + container_of(obj, struct iommufd_eventq, obj); + struct iommufd_veventq *veventq = eventq_to_veventq(eventq); + struct iommufd_viommu *viommu = veventq->viommu; + struct iommufd_vevent *cur, *next; + + lockdep_assert_held_write(&viommu->veventqs_rwsem); + + list_for_each_entry_safe(cur, next, &eventq->deliver, node) { + list_del(&cur->node); + if (cur != &veventq->lost_events_header) + kfree(cur); + } + + refcount_dec(&viommu->obj.users); + list_del(&veventq->node); +} + +void iommufd_veventq_destroy(struct iommufd_object *obj) +{ + struct iommufd_veventq *veventq = eventq_to_veventq( + container_of(obj, struct iommufd_eventq, obj)); + + down_write(&veventq->viommu->veventqs_rwsem); + iommufd_veventq_abort(obj); + up_write(&veventq->viommu->veventqs_rwsem); +} + +static struct iommufd_vevent * +iommufd_veventq_deliver_fetch(struct iommufd_veventq *veventq) +{ + struct iommufd_eventq *eventq = &veventq->common; + struct list_head *list = &eventq->deliver; + struct iommufd_vevent *vevent = NULL; + + spin_lock(&eventq->lock); + if (!list_empty(list)) { + struct iommufd_vevent *next; + + next = list_first_entry(list, struct iommufd_vevent, node); + /* Make a copy of the lost_events_header for copy_to_user */ + if (next == &veventq->lost_events_header) { + vevent = kzalloc(sizeof(*vevent), GFP_ATOMIC); + if (!vevent) + goto out_unlock; + } + list_del(&next->node); + if (vevent) + memcpy(vevent, next, sizeof(*vevent)); + else + vevent = next; + } +out_unlock: + spin_unlock(&eventq->lock); + return vevent; +} + +static void iommufd_veventq_deliver_restore(struct iommufd_veventq *veventq, + struct iommufd_vevent *vevent) +{ + struct iommufd_eventq *eventq = &veventq->common; + struct list_head *list = &eventq->deliver; + + spin_lock(&eventq->lock); + if (vevent_for_lost_events_header(vevent)) { + /* Remove the copy of the lost_events_header */ + kfree(vevent); + vevent = NULL; + /* An empty list needs the lost_events_header back */ + if (list_empty(list)) + vevent = &veventq->lost_events_header; + } + if (vevent) + list_add(&vevent->node, list); + spin_unlock(&eventq->lock); +} + +static ssize_t iommufd_veventq_fops_read(struct file *filep, char __user *buf, + size_t count, loff_t *ppos) +{ + struct 
iommufd_eventq *eventq = filep->private_data; + struct iommufd_veventq *veventq = eventq_to_veventq(eventq); + struct iommufd_vevent_header *hdr; + struct iommufd_vevent *cur; + size_t done = 0; + int rc = 0; + + if (*ppos) + return -ESPIPE; + + while ((cur = iommufd_veventq_deliver_fetch(veventq))) { + /* Validate the remaining bytes against the header size */ + if (done >= count || sizeof(*hdr) > count - done) { + iommufd_veventq_deliver_restore(veventq, cur); + break; + } + hdr = &cur->header; + + /* For a normal vEVENT, also validate against the full size */ + if (!vevent_for_lost_events_header(cur) && + sizeof(*hdr) + cur->data_len > count - done) { + iommufd_veventq_deliver_restore(veventq, cur); + break; + } + + if (copy_to_user(buf + done, hdr, sizeof(*hdr))) { + iommufd_veventq_deliver_restore(veventq, cur); + rc = -EFAULT; + break; + } + done += sizeof(*hdr); + + if (cur->data_len && + copy_to_user(buf + done, cur->event_data, cur->data_len)) { + iommufd_veventq_deliver_restore(veventq, cur); + rc = -EFAULT; + break; + } + spin_lock(&eventq->lock); + if (!vevent_for_lost_events_header(cur)) + veventq->num_events--; + spin_unlock(&eventq->lock); + done += cur->data_len; + kfree(cur); + } + + return done == 0 ? rc : done; +} + +/* Common Event Queue Functions */ + +static __poll_t iommufd_eventq_fops_poll(struct file *filep, + struct poll_table_struct *wait) +{ + struct iommufd_eventq *eventq = filep->private_data; + __poll_t pollflags = 0; + + if (eventq->obj.type == IOMMUFD_OBJ_FAULT) + pollflags |= EPOLLOUT; + + poll_wait(filep, &eventq->wait_queue, wait); + spin_lock(&eventq->lock); + if (!list_empty(&eventq->deliver)) + pollflags |= EPOLLIN | EPOLLRDNORM; + spin_unlock(&eventq->lock); + + return pollflags; +} + +static int iommufd_eventq_fops_release(struct inode *inode, struct file *filep) +{ + struct iommufd_eventq *eventq = filep->private_data; + + refcount_dec(&eventq->obj.users); + iommufd_ctx_put(eventq->ictx); + return 0; +} + +#define INIT_EVENTQ_FOPS(read_op, write_op) \ + ((const struct file_operations){ \ + .owner = THIS_MODULE, \ + .open = nonseekable_open, \ + .read = read_op, \ + .write = write_op, \ + .poll = iommufd_eventq_fops_poll, \ + .release = iommufd_eventq_fops_release, \ + }) + +static int iommufd_eventq_init(struct iommufd_eventq *eventq, char *name, + struct iommufd_ctx *ictx, + const struct file_operations *fops) +{ + struct file *filep; + + spin_lock_init(&eventq->lock); + INIT_LIST_HEAD(&eventq->deliver); + init_waitqueue_head(&eventq->wait_queue); + + /* The filep is fput() by the core code during failure */ + filep = anon_inode_getfile(name, fops, eventq, O_RDWR); + if (IS_ERR(filep)) + return PTR_ERR(filep); + + eventq->ictx = ictx; + iommufd_ctx_get(eventq->ictx); + eventq->filep = filep; + refcount_inc(&eventq->obj.users); + + return get_unused_fd_flags(O_CLOEXEC); +} + +static const struct file_operations iommufd_fault_fops = + INIT_EVENTQ_FOPS(iommufd_fault_fops_read, iommufd_fault_fops_write); + +int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) +{ + struct iommu_fault_alloc *cmd = ucmd->cmd; + struct iommufd_fault *fault; + int fdno; + int rc; + + if (cmd->flags) + return -EOPNOTSUPP; + + fault = __iommufd_object_alloc_ucmd(ucmd, fault, IOMMUFD_OBJ_FAULT, + common.obj); + if (IS_ERR(fault)) + return PTR_ERR(fault); + + xa_init_flags(&fault->response, XA_FLAGS_ALLOC1); + mutex_init(&fault->mutex); + + fdno = iommufd_eventq_init(&fault->common, "[iommufd-pgfault]", + ucmd->ictx, &iommufd_fault_fops); + if (fdno < 0) + return fdno; + 
cmd->out_fault_id = fault->common.obj.id; + cmd->out_fault_fd = fdno; + + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_put_fdno; + + fd_install(fdno, fault->common.filep); + + return 0; +out_put_fdno: + put_unused_fd(fdno); + return rc; +} + +int iommufd_fault_iopf_handler(struct iopf_group *group) +{ + struct iommufd_hw_pagetable *hwpt; + struct iommufd_fault *fault; + + hwpt = group->attach_handle->domain->iommufd_hwpt; + fault = hwpt->fault; + + spin_lock(&fault->common.lock); + list_add_tail(&group->node, &fault->common.deliver); + spin_unlock(&fault->common.lock); + + wake_up_interruptible(&fault->common.wait_queue); + + return 0; +} + +static const struct file_operations iommufd_veventq_fops = + INIT_EVENTQ_FOPS(iommufd_veventq_fops_read, NULL); + +int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd) +{ + struct iommu_veventq_alloc *cmd = ucmd->cmd; + struct iommufd_veventq *veventq; + struct iommufd_viommu *viommu; + int fdno; + int rc; + + if (cmd->flags || cmd->__reserved || + cmd->type == IOMMU_VEVENTQ_TYPE_DEFAULT) + return -EOPNOTSUPP; + if (!cmd->veventq_depth) + return -EINVAL; + + viommu = iommufd_get_viommu(ucmd, cmd->viommu_id); + if (IS_ERR(viommu)) + return PTR_ERR(viommu); + + down_write(&viommu->veventqs_rwsem); + + if (iommufd_viommu_find_veventq(viommu, cmd->type)) { + rc = -EEXIST; + goto out_unlock_veventqs; + } + + veventq = __iommufd_object_alloc(ucmd->ictx, veventq, + IOMMUFD_OBJ_VEVENTQ, common.obj); + if (IS_ERR(veventq)) { + rc = PTR_ERR(veventq); + goto out_unlock_veventqs; + } + + veventq->type = cmd->type; + veventq->viommu = viommu; + refcount_inc(&viommu->obj.users); + veventq->depth = cmd->veventq_depth; + list_add_tail(&veventq->node, &viommu->veventqs); + veventq->lost_events_header.header.flags = + IOMMU_VEVENTQ_FLAG_LOST_EVENTS; + + fdno = iommufd_eventq_init(&veventq->common, "[iommufd-viommu-event]", + ucmd->ictx, &iommufd_veventq_fops); + if (fdno < 0) { + rc = fdno; + goto out_abort; + } + + cmd->out_veventq_id = veventq->common.obj.id; + cmd->out_veventq_fd = fdno; + + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_put_fdno; + + iommufd_object_finalize(ucmd->ictx, &veventq->common.obj); + fd_install(fdno, veventq->common.filep); + goto out_unlock_veventqs; + +out_put_fdno: + put_unused_fd(fdno); +out_abort: + iommufd_object_abort_and_destroy(ucmd->ictx, &veventq->common.obj); +out_unlock_veventqs: + up_write(&viommu->veventqs_rwsem); + iommufd_put_object(ucmd->ictx, &viommu->obj); + return rc; +} diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 33d142f8057d..fe789c2dc0c9 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -8,6 +8,15 @@ #include "../iommu-priv.h" #include "iommufd_private.h" +static void __iommufd_hwpt_destroy(struct iommufd_hw_pagetable *hwpt) +{ + if (hwpt->domain) + iommu_domain_free(hwpt->domain); + + if (hwpt->fault) + refcount_dec(&hwpt->fault->common.obj.users); +} + void iommufd_hwpt_paging_destroy(struct iommufd_object *obj) { struct iommufd_hwpt_paging *hwpt_paging = @@ -22,9 +31,7 @@ void iommufd_hwpt_paging_destroy(struct iommufd_object *obj) hwpt_paging->common.domain); } - if (hwpt_paging->common.domain) - iommu_domain_free(hwpt_paging->common.domain); - + __iommufd_hwpt_destroy(&hwpt_paging->common); refcount_dec(&hwpt_paging->ioas->obj.users); } @@ -49,10 +56,11 @@ void iommufd_hwpt_nested_destroy(struct iommufd_object *obj) struct iommufd_hwpt_nested *hwpt_nested = 
container_of(obj, struct iommufd_hwpt_nested, common.obj); - if (hwpt_nested->common.domain) - iommu_domain_free(hwpt_nested->common.domain); - - refcount_dec(&hwpt_nested->parent->common.obj.users); + __iommufd_hwpt_destroy(&hwpt_nested->common); + if (hwpt_nested->viommu) + refcount_dec(&hwpt_nested->viommu->obj.users); + else + refcount_dec(&hwpt_nested->parent->common.obj.users); } void iommufd_hwpt_nested_abort(struct iommufd_object *obj) @@ -82,6 +90,7 @@ iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging) * @ictx: iommufd context * @ioas: IOAS to associate the domain with * @idev: Device to get an iommu_domain for + * @pasid: PASID to get an iommu_domain for * @flags: Flags from userspace * @immediate_attach: True if idev should be attached to the hwpt * @user_data: The user provided driver specific data describing the domain to @@ -97,12 +106,14 @@ iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging) */ struct iommufd_hwpt_paging * iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, - struct iommufd_device *idev, u32 flags, - bool immediate_attach, + struct iommufd_device *idev, ioasid_t pasid, + u32 flags, bool immediate_attach, const struct iommu_user_data *user_data) { const u32 valid_flags = IOMMU_HWPT_ALLOC_NEST_PARENT | - IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_FAULT_ID_VALID | + IOMMU_HWPT_ALLOC_PASID; const struct iommu_ops *ops = dev_iommu_ops(idev->dev); struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_hw_pagetable *hwpt; @@ -110,16 +121,23 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, lockdep_assert_held(&ioas->mutex); - if ((flags || user_data) && !ops->domain_alloc_user) + if ((flags || user_data) && !ops->domain_alloc_paging_flags) return ERR_PTR(-EOPNOTSUPP); if (flags & ~valid_flags) return ERR_PTR(-EOPNOTSUPP); + if ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && + !device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING)) + return ERR_PTR(-EOPNOTSUPP); + if ((flags & IOMMU_HWPT_FAULT_ID_VALID) && + (flags & IOMMU_HWPT_ALLOC_NEST_PARENT)) + return ERR_PTR(-EOPNOTSUPP); hwpt_paging = __iommufd_object_alloc( ictx, hwpt_paging, IOMMUFD_OBJ_HWPT_PAGING, common.obj); if (IS_ERR(hwpt_paging)) return ERR_CAST(hwpt_paging); hwpt = &hwpt_paging->common; + hwpt->pasid_compat = flags & IOMMU_HWPT_ALLOC_PASID; INIT_LIST_HEAD(&hwpt_paging->hwpt_item); /* Pairs with iommufd_hw_pagetable_destroy() */ @@ -127,9 +145,9 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, hwpt_paging->ioas = ioas; hwpt_paging->nest_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; - if (ops->domain_alloc_user) { - hwpt->domain = ops->domain_alloc_user(idev->dev, flags, NULL, - user_data); + if (ops->domain_alloc_paging_flags) { + hwpt->domain = ops->domain_alloc_paging_flags(idev->dev, + flags & ~IOMMU_HWPT_FAULT_ID_VALID, user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; @@ -137,12 +155,15 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, } hwpt->domain->owner = ops; } else { - hwpt->domain = iommu_domain_alloc(idev->dev->bus); - if (!hwpt->domain) { - rc = -ENOMEM; + hwpt->domain = iommu_paging_domain_alloc(idev->dev); + if (IS_ERR(hwpt->domain)) { + rc = PTR_ERR(hwpt->domain); + hwpt->domain = NULL; goto out_abort; } } + hwpt->domain->iommufd_hwpt = hwpt; + hwpt->domain->cookie_type = IOMMU_COOKIE_IOMMUFD; /* * Set the coherency mode before we do 
iopt_table_add_domain() as some @@ -171,7 +192,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, * sequence. Once those drivers are fixed this should be removed. */ if (immediate_attach) { - rc = iommufd_hw_pagetable_attach(hwpt, idev); + rc = iommufd_hw_pagetable_attach(hwpt, idev, pasid); if (rc) goto out_abort; } @@ -184,7 +205,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, out_detach: if (immediate_attach) - iommufd_hw_pagetable_detach(idev); + iommufd_hw_pagetable_detach(idev, pasid); out_abort: iommufd_object_abort_and_destroy(ictx, &hwpt->obj); return ERR_PTR(rc); @@ -213,9 +234,11 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt; int rc; - if (flags || !user_data->len || !ops->domain_alloc_user) + if ((flags & ~(IOMMU_HWPT_FAULT_ID_VALID | IOMMU_HWPT_ALLOC_PASID)) || + !user_data->len || !ops->domain_alloc_nested) return ERR_PTR(-EOPNOTSUPP); - if (parent->auto_domain || !parent->nest_parent) + if (parent->auto_domain || !parent->nest_parent || + parent->common.domain->owner != ops) return ERR_PTR(-EINVAL); hwpt_nested = __iommufd_object_alloc( @@ -223,21 +246,25 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, if (IS_ERR(hwpt_nested)) return ERR_CAST(hwpt_nested); hwpt = &hwpt_nested->common; + hwpt->pasid_compat = flags & IOMMU_HWPT_ALLOC_PASID; refcount_inc(&parent->common.obj.users); hwpt_nested->parent = parent; - hwpt->domain = ops->domain_alloc_user(idev->dev, flags, - parent->common.domain, user_data); + hwpt->domain = ops->domain_alloc_nested( + idev->dev, parent->common.domain, + flags & ~IOMMU_HWPT_FAULT_ID_VALID, user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; goto out_abort; } hwpt->domain->owner = ops; + hwpt->domain->iommufd_hwpt = hwpt; + hwpt->domain->cookie_type = IOMMU_COOKIE_IOMMUFD; if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { - rc = -EINVAL; + rc = -EOPNOTSUPP; goto out_abort; } return hwpt_nested; @@ -247,6 +274,63 @@ out_abort: return ERR_PTR(rc); } +/** + * iommufd_viommu_alloc_hwpt_nested() - Get a hwpt_nested for a vIOMMU + * @viommu: vIOMMU object to associate the hwpt_nested/domain with + * @flags: Flags from userspace + * @user_data: user_data pointer. Must be valid + * + * Allocate a new IOMMU_DOMAIN_NESTED for a vIOMMU and return it as a NESTED + * hw_pagetable. 
+ */ +static struct iommufd_hwpt_nested * +iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, + const struct iommu_user_data *user_data) +{ + struct iommufd_hwpt_nested *hwpt_nested; + struct iommufd_hw_pagetable *hwpt; + int rc; + + if (flags & ~(IOMMU_HWPT_FAULT_ID_VALID | IOMMU_HWPT_ALLOC_PASID)) + return ERR_PTR(-EOPNOTSUPP); + if (!user_data->len) + return ERR_PTR(-EOPNOTSUPP); + if (!viommu->ops || !viommu->ops->alloc_domain_nested) + return ERR_PTR(-EOPNOTSUPP); + + hwpt_nested = __iommufd_object_alloc( + viommu->ictx, hwpt_nested, IOMMUFD_OBJ_HWPT_NESTED, common.obj); + if (IS_ERR(hwpt_nested)) + return ERR_CAST(hwpt_nested); + hwpt = &hwpt_nested->common; + hwpt->pasid_compat = flags & IOMMU_HWPT_ALLOC_PASID; + + hwpt_nested->viommu = viommu; + refcount_inc(&viommu->obj.users); + hwpt_nested->parent = viommu->hwpt; + + hwpt->domain = viommu->ops->alloc_domain_nested( + viommu, flags & ~IOMMU_HWPT_FAULT_ID_VALID, user_data); + if (IS_ERR(hwpt->domain)) { + rc = PTR_ERR(hwpt->domain); + hwpt->domain = NULL; + goto out_abort; + } + hwpt->domain->iommufd_hwpt = hwpt; + hwpt->domain->owner = viommu->iommu_dev->ops; + hwpt->domain->cookie_type = IOMMU_COOKIE_IOMMUFD; + + if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { + rc = -EOPNOTSUPP; + goto out_abort; + } + return hwpt_nested; + +out_abort: + iommufd_object_abort_and_destroy(viommu->ictx, &hwpt->obj); + return ERR_PTR(rc); +} + int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) { struct iommu_hwpt_alloc *cmd = ucmd->cmd; @@ -283,8 +367,8 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) ioas = container_of(pt_obj, struct iommufd_ioas, obj); mutex_lock(&ioas->mutex); hwpt_paging = iommufd_hwpt_paging_alloc( - ucmd->ictx, ioas, idev, cmd->flags, false, - user_data.len ? &user_data : NULL); + ucmd->ictx, ioas, idev, IOMMU_NO_PASID, cmd->flags, + false, user_data.len ? 
&user_data : NULL); if (IS_ERR(hwpt_paging)) { rc = PTR_ERR(hwpt_paging); goto out_unlock; @@ -303,11 +387,41 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) goto out_unlock; } hwpt = &hwpt_nested->common; + } else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) { + struct iommufd_hwpt_nested *hwpt_nested; + struct iommufd_viommu *viommu; + + viommu = container_of(pt_obj, struct iommufd_viommu, obj); + if (viommu->iommu_dev != __iommu_get_iommu_dev(idev->dev)) { + rc = -EINVAL; + goto out_unlock; + } + hwpt_nested = iommufd_viommu_alloc_hwpt_nested( + viommu, cmd->flags, &user_data); + if (IS_ERR(hwpt_nested)) { + rc = PTR_ERR(hwpt_nested); + goto out_unlock; + } + hwpt = &hwpt_nested->common; } else { rc = -EINVAL; goto out_put_pt; } + if (cmd->flags & IOMMU_HWPT_FAULT_ID_VALID) { + struct iommufd_fault *fault; + + fault = iommufd_get_fault(ucmd, cmd->fault_id); + if (IS_ERR(fault)) { + rc = PTR_ERR(fault); + goto out_hwpt; + } + hwpt->fault = fault; + hwpt->domain->iopf_handler = iommufd_fault_iopf_handler; + refcount_inc(&fault->common.obj.users); + iommufd_put_object(ucmd->ictx, &fault->common.obj); + } + cmd->out_hwpt_id = hwpt->obj.id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); if (rc) @@ -384,7 +498,7 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd) .entry_len = cmd->entry_len, .entry_num = cmd->entry_num, }; - struct iommufd_hw_pagetable *hwpt; + struct iommufd_object *pt_obj; u32 done_num = 0; int rc; @@ -398,17 +512,40 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd) goto out; } - hwpt = iommufd_get_hwpt_nested(ucmd, cmd->hwpt_id); - if (IS_ERR(hwpt)) { - rc = PTR_ERR(hwpt); + pt_obj = iommufd_get_object(ucmd->ictx, cmd->hwpt_id, IOMMUFD_OBJ_ANY); + if (IS_ERR(pt_obj)) { + rc = PTR_ERR(pt_obj); goto out; } + if (pt_obj->type == IOMMUFD_OBJ_HWPT_NESTED) { + struct iommufd_hw_pagetable *hwpt = + container_of(pt_obj, struct iommufd_hw_pagetable, obj); + + if (!hwpt->domain->ops || + !hwpt->domain->ops->cache_invalidate_user) { + rc = -EOPNOTSUPP; + goto out_put_pt; + } + rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain, + &data_array); + } else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) { + struct iommufd_viommu *viommu = + container_of(pt_obj, struct iommufd_viommu, obj); + + if (!viommu->ops || !viommu->ops->cache_invalidate) { + rc = -EOPNOTSUPP; + goto out_put_pt; + } + rc = viommu->ops->cache_invalidate(viommu, &data_array); + } else { + rc = -EINVAL; + goto out_put_pt; + } - rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain, - &data_array); done_num = data_array.entry_num; - iommufd_put_object(ucmd->ictx, &hwpt->obj); +out_put_pt: + iommufd_put_object(ucmd->ictx, pt_obj); out: cmd->entry_num = done_num; if (iommufd_ucmd_respond(ucmd, sizeof(*cmd))) diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 05fd9d3abf1b..54cf4d856179 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -8,17 +8,19 @@ * The datastructure uses the iopt_pages to optimize the storage of the PFNs * between the domains and xarray. 
*/ +#include <linux/dma-buf.h> +#include <linux/err.h> +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/iommu.h> #include <linux/iommufd.h> #include <linux/lockdep.h> -#include <linux/iommu.h> #include <linux/sched/mm.h> -#include <linux/err.h> #include <linux/slab.h> -#include <linux/errno.h> #include <uapi/linux/iommufd.h> -#include "io_pagetable.h" #include "double_span.h" +#include "io_pagetable.h" struct iopt_pages_list { struct iopt_pages *pages; @@ -70,36 +72,45 @@ struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter) return iter->area; } -static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span, - unsigned long length, - unsigned long iova_alignment, - unsigned long page_offset) +static bool __alloc_iova_check_range(unsigned long *start, unsigned long last, + unsigned long length, + unsigned long iova_alignment, + unsigned long page_offset) { - if (span->is_used || span->last_hole - span->start_hole < length - 1) + unsigned long aligned_start; + + /* ALIGN_UP() */ + if (check_add_overflow(*start, iova_alignment - 1, &aligned_start)) return false; + aligned_start &= ~(iova_alignment - 1); + aligned_start |= page_offset; - span->start_hole = ALIGN(span->start_hole, iova_alignment) | - page_offset; - if (span->start_hole > span->last_hole || - span->last_hole - span->start_hole < length - 1) + if (aligned_start >= last || last - aligned_start < length - 1) return false; + *start = aligned_start; return true; } -static bool __alloc_iova_check_used(struct interval_tree_span_iter *span, +static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span, unsigned long length, unsigned long iova_alignment, unsigned long page_offset) { - if (span->is_hole || span->last_used - span->start_used < length - 1) + if (span->is_used) return false; + return __alloc_iova_check_range(&span->start_hole, span->last_hole, + length, iova_alignment, page_offset); +} - span->start_used = ALIGN(span->start_used, iova_alignment) | - page_offset; - if (span->start_used > span->last_used || - span->last_used - span->start_used < length - 1) +static bool __alloc_iova_check_used(struct interval_tree_span_iter *span, + unsigned long length, + unsigned long iova_alignment, + unsigned long page_offset) +{ + if (span->is_hole) return false; - return true; + return __alloc_iova_check_range(&span->start_used, span->last_used, + length, iova_alignment, page_offset); } /* @@ -107,11 +118,12 @@ static bool __alloc_iova_check_used(struct interval_tree_span_iter *span, * Does not return a 0 IOVA even if it is valid. */ static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova, - unsigned long uptr, unsigned long length) + unsigned long addr, unsigned long length) { - unsigned long page_offset = uptr % PAGE_SIZE; + unsigned long page_offset = addr % PAGE_SIZE; struct interval_tree_double_span_iter used_span; struct interval_tree_span_iter allowed_span; + unsigned long max_alignment = PAGE_SIZE; unsigned long iova_alignment; lockdep_assert_held(&iopt->iova_rwsem); @@ -121,15 +133,22 @@ static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova, return -EOVERFLOW; /* - * Keep alignment present in the uptr when building the IOVA, this + * Keep alignment present in addr when building the IOVA, which * increases the chance we can map a THP. 
*/ - if (!uptr) + if (!addr) iova_alignment = roundup_pow_of_two(length); else iova_alignment = min_t(unsigned long, roundup_pow_of_two(length), - 1UL << __ffs64(uptr)); + 1UL << __ffs64(addr)); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + max_alignment = HPAGE_SIZE; +#endif + /* Protect against ALIGN() overflow */ + if (iova_alignment >= max_alignment) + iova_alignment = max_alignment; if (iova_alignment < iopt->iova_alignment) return -EINVAL; @@ -240,6 +259,7 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt, int iommu_prot, unsigned int flags) { struct iopt_pages_list *elm; + unsigned long start; unsigned long iova; int rc = 0; @@ -259,9 +279,18 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt, /* Use the first entry to guess the ideal IOVA alignment */ elm = list_first_entry(pages_list, struct iopt_pages_list, next); - rc = iopt_alloc_iova( - iopt, dst_iova, - (uintptr_t)elm->pages->uptr + elm->start_byte, length); + switch (elm->pages->type) { + case IOPT_ADDRESS_USER: + start = elm->start_byte + (uintptr_t)elm->pages->uptr; + break; + case IOPT_ADDRESS_FILE: + start = elm->start_byte + elm->pages->start; + break; + case IOPT_ADDRESS_DMABUF: + start = elm->start_byte + elm->pages->dmabuf.start; + break; + } + rc = iopt_alloc_iova(iopt, dst_iova, start, length); if (rc) goto out_unlock; if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && @@ -376,6 +405,34 @@ out_unlock_domains: return rc; } +static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt, + struct iopt_pages *pages, unsigned long *iova, + unsigned long length, unsigned long start_byte, + int iommu_prot, unsigned int flags) +{ + struct iopt_pages_list elm = {}; + LIST_HEAD(pages_list); + int rc; + + elm.pages = pages; + elm.start_byte = start_byte; + if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM && + elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER) + elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM; + elm.length = length; + list_add(&elm.next, &pages_list); + + rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags); + if (rc) { + if (elm.area) + iopt_abort_area(elm.area); + if (elm.pages) + iopt_put_pages(elm.pages); + return rc; + } + return 0; +} + /** * iopt_map_user_pages() - Map a user VA to an iova in the io page table * @ictx: iommufd_ctx the iopt is part of @@ -400,29 +457,69 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned long length, int iommu_prot, unsigned int flags) { - struct iopt_pages_list elm = {}; - LIST_HEAD(pages_list); - int rc; + struct iopt_pages *pages; - elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE); - if (IS_ERR(elm.pages)) - return PTR_ERR(elm.pages); - if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM && - elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER) - elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM; - elm.start_byte = uptr - elm.pages->uptr; - elm.length = length; - list_add(&elm.next, &pages_list); + pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE); + if (IS_ERR(pages)) + return PTR_ERR(pages); - rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags); - if (rc) { - if (elm.area) - iopt_abort_area(elm.area); - if (elm.pages) - iopt_put_pages(elm.pages); - return rc; + return iopt_map_common(ictx, iopt, pages, iova, length, + uptr - pages->uptr, iommu_prot, flags); +} + +/** + * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file. 
+ * @ictx: iommufd_ctx the iopt is part of + * @iopt: io_pagetable to act on + * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains + * the chosen iova on output. Otherwise is the iova to map to on input + * @fd: fdno of a file to map + * @start: map file starting at this byte offset + * @length: Number of bytes to map + * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping + * @flags: IOPT_ALLOC_IOVA or zero + */ +int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, + unsigned long *iova, int fd, unsigned long start, + unsigned long length, int iommu_prot, + unsigned int flags) +{ + struct iopt_pages *pages; + struct dma_buf *dmabuf; + unsigned long start_byte; + unsigned long last; + + if (!length) + return -EINVAL; + if (check_add_overflow(start, length - 1, &last)) + return -EOVERFLOW; + + start_byte = start - ALIGN_DOWN(start, PAGE_SIZE); + dmabuf = dma_buf_get(fd); + if (!IS_ERR(dmabuf)) { + pages = iopt_alloc_dmabuf_pages(ictx, dmabuf, start_byte, start, + length, + iommu_prot & IOMMU_WRITE); + if (IS_ERR(pages)) { + dma_buf_put(dmabuf); + return PTR_ERR(pages); + } + } else { + struct file *file; + + file = fget(fd); + if (!file) + return -EBADF; + + pages = iopt_alloc_file_pages(file, start_byte, start, length, + iommu_prot & IOMMU_WRITE); + fput(file); + if (IS_ERR(pages)) + return PTR_ERR(pages); } - return 0; + + return iopt_map_common(ictx, iopt, pages, iova, length, + start_byte, iommu_prot, flags); } struct iova_bitmap_fn_arg { @@ -643,7 +740,8 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start, struct iopt_area *area; unsigned long unmapped_bytes = 0; unsigned int tries = 0; - int rc = -ENOENT; + /* If there are no mapped entries then success */ + int rc = 0; /* * The domains_rwsem must be held in read mode any time any area->pages @@ -664,6 +762,12 @@ again: goto out_unlock_iova; } + /* The area is locked by an object that has not been destroyed */ + if (area->num_locks) { + rc = -EBUSY; + goto out_unlock_iova; + } + if (area_first < start || area_last > last) { rc = -ENOENT; goto out_unlock_iova; @@ -688,8 +792,10 @@ again: iommufd_access_notify_unmap(iopt, area_first, length); /* Something is not responding to unmap requests. */ tries++; - if (WARN_ON(tries > 100)) - return -EDEADLOCK; + if (WARN_ON(tries > 100)) { + rc = -EDEADLOCK; + goto out_unmapped; + } goto again; } @@ -705,12 +811,11 @@ again: down_write(&iopt->iova_rwsem); } - if (unmapped_bytes) - rc = 0; out_unlock_iova: up_write(&iopt->iova_rwsem); up_read(&iopt->domains_rwsem); +out_unmapped: if (unmapped) *unmapped = unmapped_bytes; return rc; @@ -742,13 +847,8 @@ int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped) { - int rc; - - rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped); /* If the IOVAs are empty then unmap all succeeds */ - if (rc == -ENOENT) - return 0; - return rc; + return iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped); } /* The caller must always free all the nodes in the allowed_iova rb_root. 
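 *
 * A standalone sketch of the bounded retry in iopt_unmap_iova_range()
 * above (illustrative; the stub simply stops needing notifications after
 * three rounds): notify users, retry, and fail with a deadlock error
 * after 100 attempts rather than spinning forever:
 *
 *	#include <errno.h>
 *	#include <stdio.h>
 *
 *	static int uses_left = 3;	// pretend three users must let go
 *
 *	static int area_in_use(void) { return uses_left > 0; }
 *	static void notify_unmap(void) { if (uses_left) uses_left--; }
 *
 *	static int unmap_range(void)
 *	{
 *		unsigned int tries = 0;
 *
 *	again:
 *		if (area_in_use()) {
 *			notify_unmap();
 *			// something is not responding to unmap requests
 *			if (++tries > 100)
 *				return -EDEADLK;
 *			goto again;
 *		}
 *		return 0;	// nothing mapped any more: success
 *	}
 *
 *	int main(void) { printf("%d\n", unmap_range()); return 0; }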
*/ @@ -894,9 +994,15 @@ static void iopt_unfill_domain(struct io_pagetable *iopt, WARN_ON(!area->storage_domain); if (area->storage_domain == domain) area->storage_domain = storage_domain; + if (iopt_is_dmabuf(pages)) { + if (!iopt_dmabuf_revoked(pages)) + iopt_area_unmap_domain(area, domain); + iopt_dmabuf_untrack_domain(pages, area, domain); + } mutex_unlock(&pages->mutex); - iopt_area_unmap_domain(area, domain); + if (!iopt_is_dmabuf(pages)) + iopt_area_unmap_domain(area, domain); } return; } @@ -913,6 +1019,8 @@ static void iopt_unfill_domain(struct io_pagetable *iopt, WARN_ON(area->storage_domain != domain); area->storage_domain = NULL; iopt_area_unfill_domain(area, pages, domain); + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_domain(pages, area, domain); mutex_unlock(&pages->mutex); } } @@ -942,10 +1050,16 @@ static int iopt_fill_domain(struct io_pagetable *iopt, if (!pages) continue; - mutex_lock(&pages->mutex); + guard(mutex)(&pages->mutex); + if (iopt_is_dmabuf(pages)) { + rc = iopt_dmabuf_track_domain(pages, area, domain); + if (rc) + goto out_unfill; + } rc = iopt_area_fill_domain(area, domain); if (rc) { - mutex_unlock(&pages->mutex); + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_domain(pages, area, domain); goto out_unfill; } if (!area->storage_domain) { @@ -954,7 +1068,6 @@ static int iopt_fill_domain(struct io_pagetable *iopt, interval_tree_insert(&area->pages_node, &pages->domains_itree); } - mutex_unlock(&pages->mutex); } return 0; @@ -975,6 +1088,8 @@ out_unfill: area->storage_domain = NULL; } iopt_area_unfill_domain(area, pages, domain); + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_domain(pages, area, domain); mutex_unlock(&pages->mutex); } return rc; @@ -1185,6 +1300,10 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova) if (!pages || area->prevent_access) return -EBUSY; + /* Maintaining the domains_itree below is a bit complicated */ + if (iopt_is_dmabuf(pages)) + return -EOPNOTSUPP; + if (new_start & (alignment - 1) || iopt_area_start_byte(area, new_start) & (alignment - 1)) return -EINVAL; @@ -1355,8 +1474,7 @@ out_unlock: } void iopt_remove_access(struct io_pagetable *iopt, - struct iommufd_access *access, - u32 iopt_access_list_id) + struct iommufd_access *access, u32 iopt_access_list_id) { down_write(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index 0ec3509b7e33..14cd052fd320 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -5,9 +5,10 @@ #ifndef __IO_PAGETABLE_H #define __IO_PAGETABLE_H +#include <linux/dma-buf.h> #include <linux/interval_tree.h> -#include <linux/mutex.h> #include <linux/kref.h> +#include <linux/mutex.h> #include <linux/xarray.h> #include "iommufd_private.h" @@ -48,6 +49,7 @@ struct iopt_area { int iommu_prot; bool prevent_access : 1; unsigned int num_accesses; + unsigned int num_locks; }; struct iopt_allowed { @@ -68,6 +70,16 @@ void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain); +int iopt_dmabuf_track_domain(struct iopt_pages *pages, struct iopt_area *area, + struct iommu_domain *domain); +void iopt_dmabuf_untrack_domain(struct iopt_pages *pages, + struct iopt_area *area, + struct iommu_domain *domain); +int iopt_dmabuf_track_all_domains(struct iopt_area *area, + struct iopt_pages *pages); +void iopt_dmabuf_untrack_all_domains(struct iopt_area *area, + 
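/*
 * The iopt_fill_domain() hunk above switches to the kernel's scope-based
 * guard(mutex)(), which unlocks on every return path automatically. A
 * standalone approximation (illustrative; relies on the GCC/Clang
 * cleanup attribute and pthreads, with hypothetical names):
 *
 *	#include <pthread.h>
 *	#include <stdio.h>
 *
 *	static void unlock_cleanup(pthread_mutex_t **m)
 *	{
 *		pthread_mutex_unlock(*m);
 *	}
 *
 *	// lock now, unlock automatically when the scope is left
 *	#define guard_mutex(m) \
 *		pthread_mutex_t *guard_var_ \
 *			__attribute__((cleanup(unlock_cleanup))) = \
 *			(pthread_mutex_lock(m), (m))
 *
 *	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
 *
 *	static int fill(int fail)
 *	{
 *		guard_mutex(&lock);
 *		if (fail)
 *			return -1;	// unlocked here, no label needed
 *		return 0;		// ... and here
 *	}
 *
 *	int main(void) { printf("%d %d\n", fill(0), fill(1)); return 0; }
 */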
struct iopt_pages *pages); + static inline unsigned long iopt_area_index(struct iopt_area *area) { return area->pages_node.start; @@ -173,6 +185,27 @@ enum { IOPT_PAGES_ACCOUNT_NONE = 0, IOPT_PAGES_ACCOUNT_USER = 1, IOPT_PAGES_ACCOUNT_MM = 2, + IOPT_PAGES_ACCOUNT_MODE_NUM = 3, +}; + +enum iopt_address_type { + IOPT_ADDRESS_USER = 0, + IOPT_ADDRESS_FILE, + IOPT_ADDRESS_DMABUF, +}; + +struct iopt_pages_dmabuf_track { + struct iommu_domain *domain; + struct iopt_area *area; + struct list_head elm; +}; + +struct iopt_pages_dmabuf { + struct dma_buf_attachment *attach; + struct dma_buf_phys_vec phys; + /* Always PAGE_SIZE aligned */ + unsigned long start; + struct list_head tracker; }; /* @@ -195,7 +228,16 @@ struct iopt_pages { struct task_struct *source_task; struct mm_struct *source_mm; struct user_struct *source_user; - void __user *uptr; + enum iopt_address_type type; + union { + void __user *uptr; /* IOPT_ADDRESS_USER */ + struct { /* IOPT_ADDRESS_FILE */ + struct file *file; + unsigned long start; + }; + /* IOPT_ADDRESS_DMABUF */ + struct iopt_pages_dmabuf dmabuf; + }; bool writable:1; u8 account_mode; @@ -206,8 +248,32 @@ struct iopt_pages { struct rb_root_cached domains_itree; }; -struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, - bool writable); +static inline bool iopt_is_dmabuf(struct iopt_pages *pages) +{ + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return false; + return pages->type == IOPT_ADDRESS_DMABUF; +} + +static inline bool iopt_dmabuf_revoked(struct iopt_pages *pages) +{ + lockdep_assert_held(&pages->mutex); + if (iopt_is_dmabuf(pages)) + return pages->dmabuf.phys.len == 0; + return false; +} + +struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, + unsigned long length, bool writable); +struct iopt_pages *iopt_alloc_file_pages(struct file *file, + unsigned long start_byte, + unsigned long start, + unsigned long length, bool writable); +struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx, + struct dma_buf *dmabuf, + unsigned long start_byte, + unsigned long start, + unsigned long length, bool writable); void iopt_release_pages(struct kref *kref); static inline void iopt_put_pages(struct iopt_pages *pages) { @@ -223,9 +289,9 @@ void iopt_pages_unfill_xarray(struct iopt_pages *pages, unsigned long start, int iopt_area_add_access(struct iopt_area *area, unsigned long start, unsigned long last, struct page **out_pages, - unsigned int flags); + unsigned int flags, bool lock_area); void iopt_area_remove_access(struct iopt_area *area, unsigned long start, - unsigned long last); + unsigned long last, bool unlock_area); int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, void *data, unsigned long length, unsigned int flags); @@ -238,4 +304,9 @@ struct iopt_pages_access { unsigned int users; }; +struct pfn_reader_user; + +int iopt_pages_update_pinned(struct iopt_pages *pages, unsigned long npages, + bool inc, struct pfn_reader_user *user); + #endif diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index 742248276548..f4721afedadc 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -2,9 +2,10 @@ /* * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ +#include <linux/file.h> #include <linux/interval_tree.h> -#include <linux/iommufd.h> #include <linux/iommu.h> +#include <linux/iommufd.h> #include <uapi/linux/iommufd.h> #include "io_pagetable.h" @@ -51,7 +52,10 @@ int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd) rc = iommufd_ucmd_respond(ucmd, 
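/*
 * The struct iopt_pages change above turns the backing-store fields into
 * a union discriminated by enum iopt_address_type, matching the switch
 * added earlier in iopt_alloc_area_pages(). A standalone sketch of that
 * layout (illustrative; hypothetical names, C11 anonymous union):
 *
 *	#include <stdio.h>
 *
 *	enum addr_type { ADDR_USER, ADDR_FILE, ADDR_DMABUF };
 *
 *	struct pages {
 *		enum addr_type type;	// selects the union member below
 *		union {
 *			void *uptr;				// ADDR_USER
 *			struct { int fd; unsigned long start; } file;
 *			struct { unsigned long start; } dmabuf;
 *		};
 *	};
 *
 *	static unsigned long start_of(const struct pages *p,
 *				      unsigned long start_byte)
 *	{
 *		switch (p->type) {
 *		case ADDR_USER:   return start_byte + (unsigned long)p->uptr;
 *		case ADDR_FILE:   return start_byte + p->file.start;
 *		case ADDR_DMABUF: return start_byte + p->dmabuf.start;
 *		}
 *		return 0;
 *	}
 *
 *	int main(void)
 *	{
 *		struct pages p = { .type = ADDR_FILE,
 *				   .file = { .fd = 3, .start = 8192 } };
 *		printf("%lu\n", start_of(&p, 16));	// 8208
 *		return 0;
 *	}
 */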
sizeof(*cmd)); if (rc) goto out_table; + + down_read(&ucmd->ictx->ioas_creation_lock); iommufd_object_finalize(ucmd->ictx, &ioas->obj); + up_read(&ucmd->ictx->ioas_creation_lock); return 0; out_table: @@ -197,6 +201,46 @@ static int conv_iommu_prot(u32 map_flags) return iommu_prot; } +int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_map_file *cmd = ucmd->cmd; + unsigned long iova = cmd->iova; + struct iommufd_ioas *ioas; + unsigned int flags = 0; + int rc; + + if (cmd->flags & + ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE)) + return -EOPNOTSUPP; + + if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) + return -EOVERFLOW; + + if (!(cmd->flags & + (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))) + return -EINVAL; + + ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) + flags = IOPT_ALLOC_IOVA; + + rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, cmd->fd, + cmd->start, cmd->length, + conv_iommu_prot(cmd->flags), flags); + if (rc) + goto out_put; + + cmd->iova = iova; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); +out_put: + iommufd_put_object(ucmd->ictx, &ioas->obj); + return rc; +} + int iommufd_ioas_map(struct iommufd_ucmd *ucmd) { struct iommu_ioas_map *cmd = ucmd->cmd; @@ -213,6 +257,10 @@ int iommufd_ioas_map(struct iommufd_ucmd *ucmd) if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) return -EOVERFLOW; + if (!(cmd->flags & + (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))) + return -EINVAL; + ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id); if (IS_ERR(ioas)) return PTR_ERR(ioas); @@ -253,6 +301,10 @@ int iommufd_ioas_copy(struct iommufd_ucmd *ucmd) cmd->dst_iova >= ULONG_MAX) return -EOVERFLOW; + if (!(cmd->flags & + (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))) + return -EINVAL; + src_ioas = iommufd_get_ioas(ucmd->ictx, cmd->src_ioas_id); if (IS_ERR(src_ioas)) return PTR_ERR(src_ioas); @@ -309,6 +361,10 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd) &unmapped); if (rc) goto out_put; + if (!unmapped) { + rc = -ENOENT; + goto out_put; + } } cmd->length = unmapped; @@ -319,6 +375,215 @@ out_put: return rc; } +static void iommufd_release_all_iova_rwsem(struct iommufd_ctx *ictx, + struct xarray *ioas_list) +{ + struct iommufd_ioas *ioas; + unsigned long index; + + xa_for_each(ioas_list, index, ioas) { + up_write(&ioas->iopt.iova_rwsem); + refcount_dec(&ioas->obj.users); + } + up_write(&ictx->ioas_creation_lock); + xa_destroy(ioas_list); +} + +static int iommufd_take_all_iova_rwsem(struct iommufd_ctx *ictx, + struct xarray *ioas_list) +{ + struct iommufd_object *obj; + unsigned long index; + int rc; + + /* + * This is very ugly, it is done instead of adding a lock around + * pages->source_mm, which is a performance path for mdev, we just + * obtain the write side of all the iova_rwsems which also protects the + * pages->source_*. Due to copies we can't know which IOAS could read + * from the pages, so we just lock everything. This is the only place + * locks are nested and they are uniformly taken in ID order. + * + * ioas_creation_lock prevents new IOAS from being installed in the + * xarray while we do this, and also prevents more than one thread from + * holding nested locks. 
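 *
 * A standalone sketch of the deadlock-avoidance rule described above
 * (illustrative; pthreads, hypothetical names): nested locks are always
 * taken in ascending ID order, so no two holders can ever wait on each
 * other in opposite orders:
 *
 *	#include <pthread.h>
 *	#include <stdio.h>
 *
 *	#define N 4
 *	static pthread_mutex_t locks[N] = {	// indexed by object ID
 *		PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
 *		PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
 *	};
 *
 *	static void lock_all(void)
 *	{
 *		for (int i = 0; i < N; i++)	// uniform ascending order
 *			pthread_mutex_lock(&locks[i]);
 *	}
 *
 *	static void unlock_all(void)
 *	{
 *		for (int i = N - 1; i >= 0; i--)
 *			pthread_mutex_unlock(&locks[i]);
 *	}
 *
 *	int main(void)
 *	{
 *		lock_all();
 *		unlock_all();
 *		puts("ok");
 *		return 0;
 *	}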
+ */ + down_write(&ictx->ioas_creation_lock); + xa_lock(&ictx->objects); + xa_for_each(&ictx->objects, index, obj) { + struct iommufd_ioas *ioas; + + if (!obj || obj->type != IOMMUFD_OBJ_IOAS) + continue; + + if (!refcount_inc_not_zero(&obj->users)) + continue; + + xa_unlock(&ictx->objects); + + ioas = container_of(obj, struct iommufd_ioas, obj); + down_write_nest_lock(&ioas->iopt.iova_rwsem, + &ictx->ioas_creation_lock); + + rc = xa_err(xa_store(ioas_list, index, ioas, GFP_KERNEL)); + if (rc) { + iommufd_release_all_iova_rwsem(ictx, ioas_list); + return rc; + } + + xa_lock(&ictx->objects); + } + xa_unlock(&ictx->objects); + return 0; +} + +static bool need_charge_update(struct iopt_pages *pages) +{ + switch (pages->account_mode) { + case IOPT_PAGES_ACCOUNT_NONE: + return false; + case IOPT_PAGES_ACCOUNT_MM: + return pages->source_mm != current->mm; + case IOPT_PAGES_ACCOUNT_USER: + /* + * Update when mm changes because it also accounts + * in mm->pinned_vm. + */ + return (pages->source_user != current_user()) || + (pages->source_mm != current->mm); + } + return true; +} + +static int charge_current(unsigned long *npinned) +{ + struct iopt_pages tmp = { + .source_mm = current->mm, + .source_task = current->group_leader, + .source_user = current_user(), + }; + unsigned int account_mode; + int rc; + + for (account_mode = 0; account_mode != IOPT_PAGES_ACCOUNT_MODE_NUM; + account_mode++) { + if (!npinned[account_mode]) + continue; + + tmp.account_mode = account_mode; + rc = iopt_pages_update_pinned(&tmp, npinned[account_mode], true, + NULL); + if (rc) + goto err_undo; + } + return 0; + +err_undo: + while (account_mode != 0) { + account_mode--; + if (!npinned[account_mode]) + continue; + tmp.account_mode = account_mode; + iopt_pages_update_pinned(&tmp, npinned[account_mode], false, + NULL); + } + return rc; +} + +static void change_mm(struct iopt_pages *pages) +{ + struct task_struct *old_task = pages->source_task; + struct user_struct *old_user = pages->source_user; + struct mm_struct *old_mm = pages->source_mm; + + pages->source_mm = current->mm; + mmgrab(pages->source_mm); + mmdrop(old_mm); + + pages->source_task = current->group_leader; + get_task_struct(pages->source_task); + put_task_struct(old_task); + + pages->source_user = get_uid(current_user()); + free_uid(old_user); +} + +#define for_each_ioas_area(_xa, _index, _ioas, _area) \ + xa_for_each((_xa), (_index), (_ioas)) \ + for (_area = iopt_area_iter_first(&_ioas->iopt, 0, ULONG_MAX); \ + _area; \ + _area = iopt_area_iter_next(_area, 0, ULONG_MAX)) + +int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_change_process *cmd = ucmd->cmd; + struct iommufd_ctx *ictx = ucmd->ictx; + unsigned long all_npinned[IOPT_PAGES_ACCOUNT_MODE_NUM] = {}; + struct iommufd_ioas *ioas; + struct iopt_area *area; + struct iopt_pages *pages; + struct xarray ioas_list; + unsigned long index; + int rc; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + xa_init(&ioas_list); + rc = iommufd_take_all_iova_rwsem(ictx, &ioas_list); + if (rc) + return rc; + + for_each_ioas_area(&ioas_list, index, ioas, area) { + if (area->pages->type != IOPT_ADDRESS_FILE) { + rc = -EINVAL; + goto out; + } + } + + /* + * Count last_pinned pages, then clear it to avoid double counting + * if the same iopt_pages is visited multiple times in this loop. + * Since we are under all the locks, npinned == last_npinned, so we + * can easily restore last_npinned before we return. 
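 *
 * The charge_current() hunk below uses the kernel's usual partial-failure
 * rollback: on error, walk back over the modes already charged and undo
 * them in reverse. A standalone sketch of that shape (illustrative;
 * hypothetical names, mode 2 is made to fail):
 *
 *	#include <stdio.h>
 *
 *	#define MODES 3
 *
 *	static int charge(int mode, int undo)
 *	{
 *		if (!undo && mode == 2)	// pretend mode 2 cannot be charged
 *			return -1;
 *		printf("%s mode %d\n", undo ? "uncharge" : "charge", mode);
 *		return 0;
 *	}
 *
 *	static int charge_all(void)
 *	{
 *		int mode, rc;
 *
 *		for (mode = 0; mode != MODES; mode++) {
 *			rc = charge(mode, 0);
 *			if (rc)
 *				goto err_undo;
 *		}
 *		return 0;
 *
 *	err_undo:
 *		while (mode != 0) {	// undo only what succeeded
 *			mode--;
 *			charge(mode, 1);
 *		}
 *		return rc;
 *	}
 *
 *	int main(void) { return charge_all() ? 1 : 0; }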
+ */ + for_each_ioas_area(&ioas_list, index, ioas, area) { + pages = area->pages; + + if (need_charge_update(pages)) { + all_npinned[pages->account_mode] += pages->last_npinned; + pages->last_npinned = 0; + } + } + + rc = charge_current(all_npinned); + + if (rc) { + /* Charge failed. Fix last_npinned and bail. */ + for_each_ioas_area(&ioas_list, index, ioas, area) + area->pages->last_npinned = area->pages->npinned; + goto out; + } + + for_each_ioas_area(&ioas_list, index, ioas, area) { + pages = area->pages; + + /* Uncharge the old one (which also restores last_npinned) */ + if (need_charge_update(pages)) { + int r = iopt_pages_update_pinned(pages, pages->npinned, + false, NULL); + + if (WARN_ON(r)) + rc = r; + } + change_mm(pages); + } + +out: + iommufd_release_all_iova_rwsem(ictx, &ioas_list); + return rc; +} + int iommufd_option_rlimit_mode(struct iommu_option *cmd, struct iommufd_ctx *ictx) { diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 991f864d1f9b..eb6d1a70f673 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -4,24 +4,54 @@ #ifndef __IOMMUFD_PRIVATE_H #define __IOMMUFD_PRIVATE_H -#include <linux/rwsem.h> -#include <linux/xarray.h> -#include <linux/refcount.h> -#include <linux/uaccess.h> #include <linux/iommu.h> +#include <linux/iommufd.h> #include <linux/iova_bitmap.h> +#include <linux/maple_tree.h> +#include <linux/rwsem.h> +#include <linux/uaccess.h> +#include <linux/xarray.h> #include <uapi/linux/iommufd.h> +#include "../iommu-priv.h" + struct iommu_domain; struct iommu_group; struct iommu_option; struct iommufd_device; +struct dma_buf_attachment; +struct dma_buf_phys_vec; + +struct iommufd_sw_msi_map { + struct list_head sw_msi_item; + phys_addr_t sw_msi_start; + phys_addr_t msi_addr; + unsigned int pgoff; + unsigned int id; +}; + +/* Bitmap of struct iommufd_sw_msi_map::id */ +struct iommufd_sw_msi_maps { + DECLARE_BITMAP(bitmap, 64); +}; + +#ifdef CONFIG_IRQ_MSI_IOMMU +int iommufd_sw_msi_install(struct iommufd_ctx *ictx, + struct iommufd_hwpt_paging *hwpt_paging, + struct iommufd_sw_msi_map *msi_map); +#endif struct iommufd_ctx { struct file *file; struct xarray objects; struct xarray groups; wait_queue_head_t destroy_wait; + struct rw_semaphore ioas_creation_lock; + struct maple_tree mt_mmap; + + struct mutex sw_msi_lock; + struct list_head sw_msi_list; + unsigned int sw_msi_id; u8 account_mode; /* Compatibility with VFIO no iommu */ @@ -29,6 +59,18 @@ struct iommufd_ctx { struct iommufd_ioas *vfio_ioas; }; +/* Entry for iommufd_ctx::mt_mmap */ +struct iommufd_mmap { + struct iommufd_object *owner; + + /* Page-shifted start position in mt_mmap to validate vma->vm_pgoff */ + unsigned long vm_pgoff; + + /* Physical range for io_remap_pfn_range() */ + phys_addr_t mmio_addr; + size_t length; +}; + /* * The IOVA to PFN map. The map automatically copies the PFNs into multiple * domains and permits sharing of PFNs between io_pagetable instances. 
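The change_mm() hunk above follows the usual refcount-swap order: take the reference on the new task/mm/user before dropping the one on the old, so the pages never point at an object whose last reference is already gone. A standalone sketch of that ordering (illustrative; hypothetical names and a toy refcount):

	#include <assert.h>
	#include <stdio.h>

	struct thing { int refs; };

	static struct thing *get(struct thing *t) { t->refs++; return t; }
	static void put(struct thing *t) { assert(t->refs > 0); t->refs--; }

	static void swap_owner(struct thing **slot, struct thing *new)
	{
		struct thing *old = *slot;

		*slot = get(new);	/* take the new reference first */
		put(old);		/* then release the old one */
	}

	int main(void)
	{
		struct thing a = { .refs = 1 }, b = { .refs = 1 };
		struct thing *owner = &a;

		swap_owner(&owner, &b);
		printf("a=%d b=%d\n", a.refs, b.refs);	/* a=0 b=2 */
		return 0;
	}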
This @@ -67,6 +109,10 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned long *iova, void __user *uptr, unsigned long length, int iommu_prot, unsigned int flags); +int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, + unsigned long *iova, int fd, + unsigned long start, unsigned long length, + int iommu_prot, unsigned int flags); int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, unsigned long length, unsigned long *dst_iova, int iommu_prot, unsigned int flags); @@ -105,6 +151,7 @@ struct iommufd_ucmd { void __user *ubuffer; u32 user_size; void *cmd; + struct iommufd_object *new_obj; }; int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd, @@ -120,33 +167,11 @@ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd, return 0; } -enum iommufd_object_type { - IOMMUFD_OBJ_NONE, - IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, - IOMMUFD_OBJ_DEVICE, - IOMMUFD_OBJ_HWPT_PAGING, - IOMMUFD_OBJ_HWPT_NESTED, - IOMMUFD_OBJ_IOAS, - IOMMUFD_OBJ_ACCESS, -#ifdef CONFIG_IOMMUFD_TEST - IOMMUFD_OBJ_SELFTEST, -#endif - IOMMUFD_OBJ_MAX, -}; - -/* Base struct for all objects with a userspace ID handle. */ -struct iommufd_object { - refcount_t shortterm_users; - refcount_t users; - enum iommufd_object_type type; - unsigned int id; -}; - static inline bool iommufd_lock_obj(struct iommufd_object *obj) { if (!refcount_inc_not_zero(&obj->users)) return false; - if (!refcount_inc_not_zero(&obj->shortterm_users)) { + if (!refcount_inc_not_zero(&obj->wait_cnt)) { /* * If the caller doesn't already have a ref on obj this must be * called under the xa_lock. Otherwise the caller is holding a @@ -164,11 +189,11 @@ static inline void iommufd_put_object(struct iommufd_ctx *ictx, struct iommufd_object *obj) { /* - * Users first, then shortterm so that REMOVE_WAIT_SHORTTERM never sees - * a spurious !0 users with a 0 shortterm_users. + * Users first, then wait_cnt so that REMOVE_WAIT never sees a spurious + * !0 users with a 0 wait_cnt. */ refcount_dec(&obj->users); - if (refcount_dec_and_test(&obj->shortterm_users)) + if (refcount_dec_and_test(&obj->wait_cnt)) wake_up_interruptible_all(&ictx->destroy_wait); } @@ -179,7 +204,8 @@ void iommufd_object_finalize(struct iommufd_ctx *ictx, struct iommufd_object *obj); enum { - REMOVE_WAIT_SHORTTERM = 1, + REMOVE_WAIT = BIT(0), + REMOVE_OBJ_TOMBSTONE = BIT(1), }; int iommufd_object_remove(struct iommufd_ctx *ictx, struct iommufd_object *to_destroy, u32 id, @@ -187,15 +213,35 @@ int iommufd_object_remove(struct iommufd_ctx *ictx, /* * The caller holds a users refcount and wants to destroy the object. At this - * point the caller has no shortterm_users reference and at least the xarray - * will be holding one. + * point the caller has no wait_cnt reference and at least the xarray will be + * holding one. */ static inline void iommufd_object_destroy_user(struct iommufd_ctx *ictx, struct iommufd_object *obj) { int ret; - ret = iommufd_object_remove(ictx, obj, obj->id, REMOVE_WAIT_SHORTTERM); + ret = iommufd_object_remove(ictx, obj, obj->id, REMOVE_WAIT); + + /* + * If there is a bug and we couldn't destroy the object then we did put + * back the caller's users refcount and will eventually try to free it + * again during close. + */ + WARN_ON(ret); +} + +/* + * Similar to iommufd_object_destroy_user(), except that the object ID is left + * reserved/tombstoned. 
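 *
 * The iommufd_put_object() comment above fixes the ordering: ->users is
 * decremented before ->wait_cnt, so a destroyer that observed wait_cnt
 * reach zero can trust that users is also zero. A standalone sketch of
 * the two-counter put with C11 atomics (illustrative; hypothetical
 * names):
 *
 *	#include <stdatomic.h>
 *	#include <stdio.h>
 *
 *	struct obj {
 *		atomic_int users;	// general references
 *		atomic_int wait_cnt;	// destruction gate, biased by 1
 *	};
 *
 *	static void obj_put(struct obj *o)
 *	{
 *		atomic_fetch_sub(&o->users, 1);		// users first...
 *		if (atomic_fetch_sub(&o->wait_cnt, 1) == 1)	// ...then gate
 *			puts("last waiter gone; wake destroyer");
 *	}
 *
 *	int main(void)
 *	{
 *		struct obj o = { 1, 1 };
 *
 *		obj_put(&o);	// prints the wakeup message
 *		return 0;
 *	}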
+ */ +static inline void iommufd_object_tombstone_user(struct iommufd_ctx *ictx, + struct iommufd_object *obj) +{ + int ret; + + ret = iommufd_object_remove(ictx, obj, obj->id, + REMOVE_WAIT | REMOVE_OBJ_TOMBSTONE); /* * If there is a bug and we couldn't destroy the object then we did put @@ -222,6 +268,11 @@ iommufd_object_put_and_try_destroy(struct iommufd_ctx *ictx, iommufd_object_remove(ictx, obj, obj->id, 0); } +/* + * Callers of these normal object allocators must call iommufd_object_finalize() + * to finalize the object, or call iommufd_object_abort_and_destroy() to revert + * the allocation. + */ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, enum iommufd_object_type type); @@ -239,6 +290,26 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, __iommufd_object_alloc(ictx, ptr, type, obj) /* + * Callers of these _ucmd allocators should not call iommufd_object_finalize() + * or iommufd_object_abort_and_destroy(), as the core automatically does that. + */ +struct iommufd_object * +_iommufd_object_alloc_ucmd(struct iommufd_ucmd *ucmd, size_t size, + enum iommufd_object_type type); + +#define __iommufd_object_alloc_ucmd(ucmd, ptr, type, obj) \ + container_of(_iommufd_object_alloc_ucmd( \ + ucmd, \ + sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \ + offsetof(typeof(*(ptr)), \ + obj) != 0), \ + type), \ + typeof(*(ptr)), obj) + +#define iommufd_object_alloc_ucmd(ucmd, ptr, type) \ + __iommufd_object_alloc_ucmd(ucmd, ptr, type, obj) + +/* * The IO Address Space (IOAS) pagetable is a virtual page table backed by the * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The * mapping is copied into all of the associated domains and made available to @@ -262,8 +333,7 @@ struct iommufd_ioas { static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ctx *ictx, u32 id) { - return container_of(iommufd_get_object(ictx, id, - IOMMUFD_OBJ_IOAS), + return container_of(iommufd_get_object(ictx, id, IOMMUFD_OBJ_IOAS), struct iommufd_ioas, obj); } @@ -273,6 +343,8 @@ void iommufd_ioas_destroy(struct iommufd_object *obj); int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd); int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd); int iommufd_ioas_map(struct iommufd_ucmd *ucmd); +int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd); +int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd); int iommufd_ioas_copy(struct iommufd_ucmd *ucmd); int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd); int iommufd_ioas_option(struct iommufd_ucmd *ucmd); @@ -292,6 +364,8 @@ int iommufd_check_iova_range(struct io_pagetable *iopt, struct iommufd_hw_pagetable { struct iommufd_object obj; struct iommu_domain *domain; + struct iommufd_fault *fault; + bool pasid_compat : 1; }; struct iommufd_hwpt_paging { @@ -299,15 +373,16 @@ struct iommufd_hwpt_paging { struct iommufd_ioas *ioas; bool auto_domain : 1; bool enforce_cache_coherency : 1; - bool msi_cookie : 1; bool nest_parent : 1; /* Head at iommufd_ioas::hwpt_list */ struct list_head hwpt_item; + struct iommufd_sw_msi_maps present_sw_msi; }; struct iommufd_hwpt_nested { struct iommufd_hw_pagetable common; struct iommufd_hwpt_paging *parent; + struct iommufd_viommu *viommu; }; static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt) @@ -321,6 +396,25 @@ to_hwpt_paging(struct iommufd_hw_pagetable *hwpt) return container_of(hwpt, struct iommufd_hwpt_paging, common); } +static inline struct iommufd_hwpt_nested * +to_hwpt_nested(struct iommufd_hw_pagetable *hwpt) +{ + return 
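/*
 * The allocation macros above use BUILD_BUG_ON_ZERO() so the embedded
 * 'obj' member must sit at offset zero of its container, which makes the
 * container_of() conversion a plain cast. A standalone sketch of the
 * zero-width-bitfield trick behind it (illustrative; a GNU C extension,
 * hypothetical names):
 *
 *	#include <stddef.h>
 *	#include <stdio.h>
 *
 *	// evaluates to 0, or fails to compile when e is nonzero
 *	#define BUILD_BUG_ON_ZERO(e) ((int)(sizeof(struct { int:(-!!(e)); })))
 *
 *	struct obj { int id; };
 *	struct ioas { struct obj obj; int extra; };	// obj must be first
 *
 *	int main(void)
 *	{
 *		// would not compile if 'obj' were not the first member
 *		int z = BUILD_BUG_ON_ZERO(offsetof(struct ioas, obj) != 0);
 *
 *		printf("%d\n", z);
 *		return 0;
 *	}
 */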
container_of(hwpt, struct iommufd_hwpt_nested, common); +} + +static inline struct iommufd_hwpt_paging * +find_hwpt_paging(struct iommufd_hw_pagetable *hwpt) +{ + switch (hwpt->obj.type) { + case IOMMUFD_OBJ_HWPT_PAGING: + return to_hwpt_paging(hwpt); + case IOMMUFD_OBJ_HWPT_NESTED: + return to_hwpt_nested(hwpt)->parent; + default: + return NULL; + } +} + static inline struct iommufd_hwpt_paging * iommufd_get_hwpt_paging(struct iommufd_ucmd *ucmd, u32 id) { @@ -342,13 +436,13 @@ int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd); struct iommufd_hwpt_paging * iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, - struct iommufd_device *idev, u32 flags, - bool immediate_attach, + struct iommufd_device *idev, ioasid_t pasid, + u32 flags, bool immediate_attach, const struct iommu_user_data *user_data); int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev); + struct iommufd_device *idev, ioasid_t pasid); struct iommufd_hw_pagetable * -iommufd_hw_pagetable_detach(struct iommufd_device *idev); +iommufd_hw_pagetable_detach(struct iommufd_device *idev, ioasid_t pasid); void iommufd_hwpt_paging_destroy(struct iommufd_object *obj); void iommufd_hwpt_paging_abort(struct iommufd_object *obj); void iommufd_hwpt_nested_destroy(struct iommufd_object *obj); @@ -362,9 +456,8 @@ static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, if (hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING) { struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt); - lockdep_assert_not_held(&hwpt_paging->ioas->mutex); - if (hwpt_paging->auto_domain) { + lockdep_assert_not_held(&hwpt_paging->ioas->mutex); iommufd_object_put_and_try_destroy(ictx, &hwpt->obj); return; } @@ -372,13 +465,15 @@ static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, refcount_dec(&hwpt->obj.users); } +struct iommufd_attach; + struct iommufd_group { struct kref ref; struct mutex lock; struct iommufd_ctx *ictx; struct iommu_group *group; - struct iommufd_hw_pagetable *hwpt; - struct list_head device_list; + struct xarray pasid_attach; + struct iommufd_sw_msi_maps required_sw_msi; phys_addr_t sw_msi_start; }; @@ -395,6 +490,8 @@ struct iommufd_device { /* always the physical device */ struct device *dev; bool enforce_cache_coherency; + struct iommufd_vdevice *vdev; + bool destroying; }; static inline struct iommufd_device * @@ -405,9 +502,12 @@ iommufd_get_device(struct iommufd_ucmd *ucmd, u32 id) struct iommufd_device, obj); } +void iommufd_device_pre_destroy(struct iommufd_object *obj); void iommufd_device_destroy(struct iommufd_object *obj); int iommufd_get_hw_info(struct iommufd_ucmd *ucmd); +struct device *iommufd_global_device(void); + struct iommufd_access { struct iommufd_object obj; struct iommufd_ctx *ictx; @@ -422,10 +522,191 @@ struct iommufd_access { int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access); void iopt_remove_access(struct io_pagetable *iopt, - struct iommufd_access *access, - u32 iopt_access_list_id); + struct iommufd_access *access, u32 iopt_access_list_id); void iommufd_access_destroy_object(struct iommufd_object *obj); +/* iommufd_access for internal use */ +static inline bool iommufd_access_is_internal(struct iommufd_access *access) +{ + return !access->ictx; +} + +struct iommufd_access *iommufd_access_create_internal(struct iommufd_ctx *ictx); + +static inline void +iommufd_access_destroy_internal(struct iommufd_ctx *ictx, + struct iommufd_access *access) +{ + iommufd_object_destroy_user(ictx, 
&access->obj); +} + +int iommufd_access_attach_internal(struct iommufd_access *access, + struct iommufd_ioas *ioas); + +static inline void iommufd_access_detach_internal(struct iommufd_access *access) +{ + iommufd_access_detach(access); +} + +struct iommufd_eventq { + struct iommufd_object obj; + struct iommufd_ctx *ictx; + struct file *filep; + + spinlock_t lock; /* protects the deliver list */ + struct list_head deliver; + + struct wait_queue_head wait_queue; +}; + +struct iommufd_attach_handle { + struct iommu_attach_handle handle; + struct iommufd_device *idev; +}; + +/* Convert an iommu attach handle to iommufd handle. */ +#define to_iommufd_handle(hdl) container_of(hdl, struct iommufd_attach_handle, handle) + +/* + * An iommufd_fault object represents an interface to deliver I/O page faults + * to the user space. These objects are created/destroyed by the user space and + * associated with hardware page table objects during page-table allocation. + */ +struct iommufd_fault { + struct iommufd_eventq common; + struct mutex mutex; /* serializes response flows */ + struct xarray response; +}; + +static inline struct iommufd_fault * +eventq_to_fault(struct iommufd_eventq *eventq) +{ + return container_of(eventq, struct iommufd_fault, common); +} + +static inline struct iommufd_fault * +iommufd_get_fault(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_FAULT), + struct iommufd_fault, common.obj); +} + +int iommufd_fault_alloc(struct iommufd_ucmd *ucmd); +void iommufd_fault_destroy(struct iommufd_object *obj); +int iommufd_fault_iopf_handler(struct iopf_group *group); +void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, + struct iommufd_attach_handle *handle); + +/* An iommufd_vevent represents a vIOMMU event in an iommufd_veventq */ +struct iommufd_vevent { + struct iommufd_vevent_header header; + struct list_head node; /* for iommufd_eventq::deliver */ + ssize_t data_len; + u64 event_data[] __counted_by(data_len); +}; + +#define vevent_for_lost_events_header(vevent) \ + (vevent->header.flags & IOMMU_VEVENTQ_FLAG_LOST_EVENTS) + +/* + * An iommufd_veventq object represents an interface to deliver vIOMMU events to + * the user space. It is created/destroyed by the user space and associated with + * a vIOMMU object during the allocations. + */ +struct iommufd_veventq { + struct iommufd_eventq common; + struct iommufd_viommu *viommu; + struct list_head node; /* for iommufd_viommu::veventqs */ + + enum iommu_veventq_type type; + unsigned int depth; + + /* Use common.lock for protection */ + u32 num_events; + u32 sequence; + + /* Must be last as it ends in a flexible-array member. 
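 *
 * struct iommufd_vevent above ends in a flexible-array member, which is
 * why the lost_events_header must be the last field of its container. A
 * standalone sketch of allocating and filling such a struct
 * (illustrative; hypothetical names):
 *
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <string.h>
 *
 *	struct event {
 *		unsigned int len;	// number of bytes in data[]
 *		unsigned char data[];	// flexible array, must be last
 *	};
 *
 *	static struct event *event_new(const void *payload, unsigned int len)
 *	{
 *		struct event *ev = malloc(sizeof(*ev) + len);
 *
 *		if (!ev)
 *			return NULL;
 *		ev->len = len;
 *		memcpy(ev->data, payload, len);
 *		return ev;
 *	}
 *
 *	int main(void)
 *	{
 *		struct event *ev = event_new("hi", 2);
 *
 *		if (!ev)
 *			return 1;
 *		printf("%u\n", ev->len);
 *		free(ev);
 *		return 0;
 *	}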
*/ + struct iommufd_vevent lost_events_header; +}; + +static inline struct iommufd_veventq * +eventq_to_veventq(struct iommufd_eventq *eventq) +{ + return container_of(eventq, struct iommufd_veventq, common); +} + +static inline struct iommufd_veventq * +iommufd_get_veventq(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_VEVENTQ), + struct iommufd_veventq, common.obj); +} + +int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd); +void iommufd_veventq_destroy(struct iommufd_object *obj); +void iommufd_veventq_abort(struct iommufd_object *obj); + +static inline void iommufd_vevent_handler(struct iommufd_veventq *veventq, + struct iommufd_vevent *vevent) +{ + struct iommufd_eventq *eventq = &veventq->common; + + lockdep_assert_held(&eventq->lock); + + /* + * Remove the lost_events_header and add the new node at the same time. + * Note the new node can be lost_events_header, for a sequence update. + */ + if (list_is_last(&veventq->lost_events_header.node, &eventq->deliver)) + list_del(&veventq->lost_events_header.node); + list_add_tail(&vevent->node, &eventq->deliver); + vevent->header.sequence = veventq->sequence; + veventq->sequence = (veventq->sequence + 1) & INT_MAX; + + wake_up_interruptible(&eventq->wait_queue); +} + +static inline struct iommufd_viommu * +iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_VIOMMU), + struct iommufd_viommu, obj); +} + +static inline struct iommufd_veventq * +iommufd_viommu_find_veventq(struct iommufd_viommu *viommu, + enum iommu_veventq_type type) +{ + struct iommufd_veventq *veventq, *next; + + lockdep_assert_held(&viommu->veventqs_rwsem); + + list_for_each_entry_safe(veventq, next, &viommu->veventqs, node) { + if (veventq->type == type) + return veventq; + } + return NULL; +} + +int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd); +void iommufd_viommu_destroy(struct iommufd_object *obj); +int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd); +void iommufd_vdevice_destroy(struct iommufd_object *obj); +void iommufd_vdevice_abort(struct iommufd_object *obj); +int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd); +void iommufd_hw_queue_destroy(struct iommufd_object *obj); + +static inline struct iommufd_vdevice * +iommufd_get_vdevice(struct iommufd_ctx *ictx, u32 id) +{ + return container_of(iommufd_get_object(ictx, id, + IOMMUFD_OBJ_VDEVICE), + struct iommufd_vdevice, obj); +} + #ifdef CONFIG_IOMMUFD_TEST int iommufd_test(struct iommufd_ucmd *ucmd); void iommufd_selftest_destroy(struct iommufd_object *obj); @@ -436,6 +717,8 @@ bool iommufd_should_fail(void); int __init iommufd_test_init(void); void iommufd_test_exit(void); bool iommufd_selftest_is_mock_dev(struct device *dev); +int iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys); #else static inline void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, unsigned int ioas_id, @@ -457,5 +740,11 @@ static inline bool iommufd_selftest_is_mock_dev(struct device *dev) { return false; } +static inline int +iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys) +{ + return -EOPNOTSUPP; +} #endif #endif diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index e854d3f67205..73e73e1ec158 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -4,8 +4,8 @@ #ifndef 
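/*
 * iommufd_vevent_handler() above stamps each event with a sequence that
 * wraps at INT_MAX via a mask. A standalone sketch of the wrap
 * (illustrative; hypothetical names):
 *
 *	#include <limits.h>
 *	#include <stdio.h>
 *
 *	static unsigned int next_seq(unsigned int seq)
 *	{
 *		return (seq + 1) & INT_MAX;	// ... INT_MAX - 1, INT_MAX, 0 ...
 *	}
 *
 *	int main(void)
 *	{
 *		unsigned int seq = INT_MAX - 1;
 *
 *		seq = next_seq(seq);
 *		printf("%u\n", seq);	// INT_MAX
 *		seq = next_seq(seq);
 *		printf("%u\n", seq);	// wrapped to 0
 *		return 0;
 *	}
 */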
_UAPI_IOMMUFD_TEST_H #define _UAPI_IOMMUFD_TEST_H -#include <linux/types.h> #include <linux/iommufd.h> +#include <linux/types.h> enum { IOMMU_TEST_OP_ADD_RESERVED = 1, @@ -22,11 +22,29 @@ enum { IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS, IOMMU_TEST_OP_DIRTY, IOMMU_TEST_OP_MD_CHECK_IOTLB, + IOMMU_TEST_OP_TRIGGER_IOPF, + IOMMU_TEST_OP_DEV_CHECK_CACHE, + IOMMU_TEST_OP_TRIGGER_VEVENT, + IOMMU_TEST_OP_PASID_ATTACH, + IOMMU_TEST_OP_PASID_REPLACE, + IOMMU_TEST_OP_PASID_DETACH, + IOMMU_TEST_OP_PASID_CHECK_HWPT, + IOMMU_TEST_OP_DMABUF_GET, + IOMMU_TEST_OP_DMABUF_REVOKE, +}; + +enum { + MOCK_IOMMUPT_DEFAULT = 0, + MOCK_IOMMUPT_HUGE, + MOCK_IOMMUPT_AMDV1, }; +/* These values are true for MOCK_IOMMUPT_DEFAULT */ enum { MOCK_APERTURE_START = 1UL << 24, MOCK_APERTURE_LAST = (1UL << 31) - 1, + MOCK_PAGE_SIZE = 2048, + MOCK_HUGE_PAGE_SIZE = 512 * MOCK_PAGE_SIZE, }; enum { @@ -45,7 +63,7 @@ enum { enum { MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0, - MOCK_FLAGS_DEVICE_HUGE_IOVA = 1 << 1, + MOCK_FLAGS_DEVICE_PASID = 1 << 2, }; enum { @@ -53,6 +71,14 @@ enum { MOCK_NESTED_DOMAIN_IOTLB_NUM = 4, }; +enum { + MOCK_DEV_CACHE_ID_MAX = 3, + MOCK_DEV_CACHE_NUM = 4, +}; + +/* Reserved for special pasid replace test */ +#define IOMMU_TEST_PASID_RESERVED 1024 + struct iommu_test_cmd { __u32 size; __u32 op; @@ -127,11 +153,55 @@ struct iommu_test_cmd { __u32 id; __u32 iotlb; } check_iotlb; + struct { + __u32 dev_id; + __u32 pasid; + __u32 grpid; + __u32 perm; + __u64 addr; + } trigger_iopf; + struct { + __u32 id; + __u32 cache; + } check_dev_cache; + struct { + __u32 dev_id; + } trigger_vevent; + struct { + __u32 pasid; + __u32 pt_id; + /* @id is stdev_id */ + } pasid_attach; + struct { + __u32 pasid; + __u32 pt_id; + /* @id is stdev_id */ + } pasid_replace; + struct { + __u32 pasid; + /* @id is stdev_id */ + } pasid_detach; + struct { + __u32 pasid; + __u32 hwpt_id; + /* @id is stdev_id */ + } pasid_check; + struct { + __u32 length; + __u32 open_flags; + } dmabuf_get; + struct { + __s32 dmabuf_fd; + __u32 revoked; + } dmabuf_revoke; }; __u32 last; }; #define IOMMU_TEST_CMD _IO(IOMMUFD_TYPE, IOMMUFD_CMD_BASE + 32) +/* Mock device/iommu PASID width */ +#define MOCK_PASID_WIDTH 20 + /* Mock structs for IOMMU_DEVICE_GET_HW_INFO ioctl */ #define IOMMU_HW_INFO_TYPE_SELFTEST 0xfeedbeef #define IOMMU_HW_INFO_SELFTEST_REGVAL 0xdeadbeef @@ -144,6 +214,7 @@ struct iommu_test_hw_info { /* Should not be equal to any defined value in enum iommu_hwpt_data_type */ #define IOMMU_HWPT_DATA_SELFTEST 0xdead #define IOMMU_TEST_IOTLB_DEFAULT 0xbadbeef +#define IOMMU_TEST_DEV_CACHE_DEFAULT 0xbaddad /** * struct iommu_hwpt_selftest @@ -152,6 +223,7 @@ struct iommu_test_hw_info { */ struct iommu_hwpt_selftest { __u32 iotlb; + __u32 pagetable_type; }; /* Should not be equal to any defined value in enum iommu_hwpt_invalidate_data_type */ @@ -172,4 +244,51 @@ struct iommu_hwpt_invalidate_selftest { __u32 iotlb_id; }; +#define IOMMU_VIOMMU_TYPE_SELFTEST 0xdeadbeef + +/** + * struct iommu_viommu_selftest - vIOMMU data for Mock driver + * (IOMMU_VIOMMU_TYPE_SELFTEST) + * @in_data: Input random data from user space + * @out_data: Output data (matching @in_data) to user space + * @out_mmap_offset: The offset argument for mmap syscall + * @out_mmap_length: The length argument for mmap syscall + * + * Simply set @out_data=@in_data for a loopback test + */ +struct iommu_viommu_selftest { + __u32 in_data; + __u32 out_data; + __aligned_u64 out_mmap_offset; + __aligned_u64 out_mmap_length; +}; + +/* Should not be equal to any defined value in enum 
iommu_viommu_invalidate_data_type */ +#define IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST 0xdeadbeef +#define IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST_INVALID 0xdadbeef + +/** + * struct iommu_viommu_invalidate_selftest - Invalidation data for Mock VIOMMU + * (IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST) + * @flags: Invalidate flags + * @cache_id: Invalidate cache entry index + * + * If IOMMU_TEST_INVALIDATE_ALL is set in @flags, @cache_id will be ignored + */ +struct iommu_viommu_invalidate_selftest { +#define IOMMU_TEST_INVALIDATE_FLAG_ALL (1 << 0) + __u32 flags; + __u32 vdev_id; + __u32 cache_id; +}; + +#define IOMMU_VEVENTQ_TYPE_SELFTEST 0xbeefbeef + +struct iommu_viommu_event_selftest { + __u32 virt_id; +}; + +#define IOMMU_HW_QUEUE_TYPE_SELFTEST 0xdeadbeef +#define IOMMU_TEST_HW_QUEUE_MAX 2 + #endif diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index db8c46bee155..b5b67a9d3fb3 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -3,10 +3,10 @@ * Copyright (c) 2022, Oracle and/or its affiliates. * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved */ +#include <linux/highmem.h> #include <linux/iova_bitmap.h> #include <linux/mm.h> #include <linux/slab.h> -#include <linux/highmem.h> #define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE) @@ -35,6 +35,9 @@ struct iova_bitmap_map { /* base IOVA representing bit 0 of the first page */ unsigned long iova; + /* mapped length */ + unsigned long length; + /* page size order that each bit granules to */ unsigned long pgshift; @@ -113,9 +116,6 @@ struct iova_bitmap { /* length of the IOVA range for the whole bitmap */ size_t length; - - /* length of the IOVA range set ahead the pinned pages */ - unsigned long set_ahead_length; }; /* @@ -130,9 +130,8 @@ struct iova_bitmap { static unsigned long iova_bitmap_offset_to_index(struct iova_bitmap *bitmap, unsigned long iova) { - unsigned long pgsize = 1 << bitmap->mapped.pgshift; - - return iova / (BITS_PER_TYPE(*bitmap->bitmap) * pgsize); + return (iova >> bitmap->mapped.pgshift) / + BITS_PER_TYPE(*bitmap->bitmap); } /* @@ -156,6 +155,8 @@ static unsigned long iova_bitmap_mapped_iova(struct iova_bitmap *bitmap) return bitmap->iova + iova_bitmap_index_to_offset(bitmap, skip); } +static unsigned long iova_bitmap_mapped_length(struct iova_bitmap *bitmap); + /* * Pins the bitmap user pages for the current range window. * This is internal to IOVA bitmap and called when advancing the @@ -206,6 +207,7 @@ static int iova_bitmap_get(struct iova_bitmap *bitmap) * aligned. */ mapped->pgoff = offset_in_page(addr); + mapped->length = iova_bitmap_mapped_length(bitmap); return 0; } @@ -263,16 +265,13 @@ struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length, goto err; } - rc = iova_bitmap_get(bitmap); - if (rc) - goto err; return bitmap; err: iova_bitmap_free(bitmap); return ERR_PTR(rc); } -EXPORT_SYMBOL_NS_GPL(iova_bitmap_alloc, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iova_bitmap_alloc, "IOMMUFD"); /** * iova_bitmap_free() - Frees an IOVA bitmap object @@ -294,7 +293,7 @@ void iova_bitmap_free(struct iova_bitmap *bitmap) kfree(bitmap); } -EXPORT_SYMBOL_NS_GPL(iova_bitmap_free, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iova_bitmap_free, "IOMMUFD"); /* * Returns the remaining bitmap indexes from mapped_total_index to process for @@ -338,65 +337,34 @@ static unsigned long iova_bitmap_mapped_length(struct iova_bitmap *bitmap) } /* - * Returns true if there's not more data to iterate. 
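 *
 * The reworked iova_bitmap_offset_to_index() above maps an IOVA offset to
 * a u64 index in the bitmap: shift by the page order to get a bit number,
 * then divide by 64 bits per word. A standalone sketch of the arithmetic
 * (illustrative; 4 KiB pages assumed):
 *
 *	#include <stdio.h>
 *
 *	#define PGSHIFT 12	// assume each bit covers a 4 KiB page
 *	#define BITS_PER_U64 64
 *
 *	static unsigned long offset_to_index(unsigned long off)
 *	{
 *		// bit number first, then which 64-bit word holds it
 *		return (off >> PGSHIFT) / BITS_PER_U64;
 *	}
 *
 *	int main(void)
 *	{
 *		// last byte of 1 GiB of IOVA lands in word 4095
 *		printf("%lu\n", offset_to_index((1UL << 30) - 1));
 *		return 0;
 *	}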
+ * Returns true if [@iova..@iova+@length-1] is part of the mapped IOVA range. */ -static bool iova_bitmap_done(struct iova_bitmap *bitmap) +static bool iova_bitmap_mapped_range(struct iova_bitmap_map *mapped, + unsigned long iova, size_t length) { - return bitmap->mapped_base_index >= bitmap->mapped_total_index; -} - -static int iova_bitmap_set_ahead(struct iova_bitmap *bitmap, - size_t set_ahead_length) -{ - int ret = 0; - - while (set_ahead_length > 0 && !iova_bitmap_done(bitmap)) { - unsigned long length = iova_bitmap_mapped_length(bitmap); - unsigned long iova = iova_bitmap_mapped_iova(bitmap); - - ret = iova_bitmap_get(bitmap); - if (ret) - break; - - length = min(length, set_ahead_length); - iova_bitmap_set(bitmap, iova, length); - - set_ahead_length -= length; - bitmap->mapped_base_index += - iova_bitmap_offset_to_index(bitmap, length - 1) + 1; - iova_bitmap_put(bitmap); - } - - bitmap->set_ahead_length = 0; - return ret; + return mapped->npages && + (iova >= mapped->iova && + (iova + length - 1) <= (mapped->iova + mapped->length - 1)); } /* - * Advances to the next range, releases the current pinned + * Advances to a selected range, releases the current pinned * pages and pins the next set of bitmap pages. * Returns 0 on success or otherwise errno. */ -static int iova_bitmap_advance(struct iova_bitmap *bitmap) +static int iova_bitmap_advance_to(struct iova_bitmap *bitmap, + unsigned long iova) { - unsigned long iova = iova_bitmap_mapped_length(bitmap) - 1; - unsigned long count = iova_bitmap_offset_to_index(bitmap, iova) + 1; + unsigned long index; - bitmap->mapped_base_index += count; + index = iova_bitmap_offset_to_index(bitmap, iova - bitmap->iova); + if (index >= bitmap->mapped_total_index) + return -EINVAL; + bitmap->mapped_base_index = index; iova_bitmap_put(bitmap); - if (iova_bitmap_done(bitmap)) - return 0; - - /* Iterate, set and skip any bits requested for next iteration */ - if (bitmap->set_ahead_length) { - int ret; - ret = iova_bitmap_set_ahead(bitmap, bitmap->set_ahead_length); - if (ret) - return ret; - } - - /* When advancing the index we pin the next set of bitmap pages */ + /* Pin the next set of bitmap pages */ return iova_bitmap_get(bitmap); } @@ -416,19 +384,9 @@ static int iova_bitmap_advance(struct iova_bitmap *bitmap) int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, iova_bitmap_fn_t fn) { - int ret = 0; - - for (; !iova_bitmap_done(bitmap) && !ret; - ret = iova_bitmap_advance(bitmap)) { - ret = fn(bitmap, iova_bitmap_mapped_iova(bitmap), - iova_bitmap_mapped_length(bitmap), opaque); - if (ret) - break; - } - - return ret; + return fn(bitmap, bitmap->iova, bitmap->length, opaque); } -EXPORT_SYMBOL_NS_GPL(iova_bitmap_for_each, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iova_bitmap_for_each, "IOMMUFD"); /** * iova_bitmap_set() - Records an IOVA range in bitmap @@ -444,11 +402,24 @@ void iova_bitmap_set(struct iova_bitmap *bitmap, unsigned long iova, size_t length) { struct iova_bitmap_map *mapped = &bitmap->mapped; - unsigned long cur_bit = ((iova - mapped->iova) >> - mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; - unsigned long last_bit = (((iova + length - 1) - mapped->iova) >> - mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; - unsigned long last_page_idx = mapped->npages - 1; + unsigned long cur_bit, last_bit, last_page_idx; + +update_indexes: + if (unlikely(!iova_bitmap_mapped_range(mapped, iova, length))) { + /* + * The attempt to advance the base index to @iova + * may fail if it's out of bounds, or pinning the pages + * returns an error. 
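 *
 * A standalone sketch of the windowing this enables (illustrative; an
 * 8-unit window stands in for the pinned page window, hypothetical
 * names): a position outside the current window advances it, and a range
 * crossing the window end is split and continued, like the
 * update_indexes retry added to iova_bitmap_set() below:
 *
 *	#include <stdio.h>
 *
 *	#define WIN 8
 *	static unsigned long win_start;	// current window base
 *
 *	static int in_window(unsigned long pos)
 *	{
 *		return pos >= win_start && pos < win_start + WIN;
 *	}
 *
 *	static void bitmap_set_range(unsigned long pos, unsigned long len)
 *	{
 *		while (len) {
 *			unsigned long n;
 *
 *			if (!in_window(pos))
 *				win_start = pos;	// advance: pin new pages
 *			n = win_start + WIN - pos;	// room in this window
 *			if (n > len)
 *				n = len;
 *			printf("set [%lu, %lu)\n", pos, pos + n);
 *			pos += n;
 *			len -= n;
 *		}
 *	}
 *
 *	int main(void)
 *	{
 *		bitmap_set_range(3, 12);	// spans two windows
 *		return 0;
 *	}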
+ */ + if (iova_bitmap_advance_to(bitmap, iova)) + return; + } + + last_page_idx = mapped->npages - 1; + cur_bit = ((iova - mapped->iova) >> + mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; + last_bit = (((iova + length - 1) - mapped->iova) >> + mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; do { unsigned int page_idx = cur_bit / BITS_PER_PAGE; @@ -457,18 +428,19 @@ void iova_bitmap_set(struct iova_bitmap *bitmap, last_bit - cur_bit + 1); void *kaddr; - if (unlikely(page_idx > last_page_idx)) - break; + if (unlikely(page_idx > last_page_idx)) { + unsigned long left = + ((last_bit - cur_bit + 1) << mapped->pgshift); + + iova += (length - left); + length = left; + goto update_indexes; + } kaddr = kmap_local_page(mapped->pages[page_idx]); bitmap_set(kaddr, offset, nbits); kunmap_local(kaddr); cur_bit += nbits; } while (cur_bit <= last_bit); - - if (unlikely(cur_bit <= last_bit)) { - bitmap->set_ahead_length = - ((last_bit - cur_bit + 1) << bitmap->mapped.pgshift); - } } -EXPORT_SYMBOL_NS_GPL(iova_bitmap_set, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iova_bitmap_set, "IOMMUFD"); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 39b32932c61e..5cc4b08c25f5 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -8,21 +8,23 @@ */ #define pr_fmt(fmt) "iommufd: " fmt +#include <linux/bug.h> #include <linux/file.h> #include <linux/fs.h> -#include <linux/module.h> -#include <linux/slab.h> +#include <linux/iommufd.h> #include <linux/miscdevice.h> +#include <linux/module.h> #include <linux/mutex.h> -#include <linux/bug.h> +#include <linux/slab.h> #include <uapi/linux/iommufd.h> -#include <linux/iommufd.h> #include "io_pagetable.h" #include "iommufd_private.h" #include "iommufd_test.h" struct iommufd_object_ops { + size_t file_offset; + void (*pre_destroy)(struct iommufd_object *obj); void (*destroy)(struct iommufd_object *obj); void (*abort)(struct iommufd_object *obj); }; @@ -41,7 +43,7 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, return ERR_PTR(-ENOMEM); obj->type = type; /* Starts out bias'd by 1 until it is removed from the xarray */ - refcount_set(&obj->shortterm_users, 1); + refcount_set(&obj->wait_cnt, 1); refcount_set(&obj->users, 1); /* @@ -51,8 +53,8 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, * it anymore, so the caller must complete all errorable operations * before calling iommufd_object_finalize(). */ - rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, - xa_limit_31b, GFP_KERNEL_ACCOUNT); + rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, xa_limit_31b, + GFP_KERNEL_ACCOUNT); if (rc) goto out_free; return obj; @@ -61,6 +63,33 @@ out_free: return ERR_PTR(rc); } +struct iommufd_object *_iommufd_object_alloc_ucmd(struct iommufd_ucmd *ucmd, + size_t size, + enum iommufd_object_type type) +{ + struct iommufd_object *new_obj; + + /* Something is coded wrong if this is hit */ + if (WARN_ON(ucmd->new_obj)) + return ERR_PTR(-EBUSY); + + /* + * An abort op means that its caller needs to invoke it within a lock in + * the caller. So it doesn't work with _iommufd_object_alloc_ucmd() that + * will invoke the abort op in iommufd_object_abort_and_destroy(), which + * must be outside the caller's lock. + */ + if (WARN_ON(iommufd_object_ops[type].abort)) + return ERR_PTR(-EOPNOTSUPP); + + new_obj = _iommufd_object_alloc(ucmd->ictx, size, type); + if (IS_ERR(new_obj)) + return new_obj; + + ucmd->new_obj = new_obj; + return new_obj; +} + /* * Allow concurrent access to the object. 
* @@ -73,20 +102,30 @@ out_free: void iommufd_object_finalize(struct iommufd_ctx *ictx, struct iommufd_object *obj) { + XA_STATE(xas, &ictx->objects, obj->id); void *old; - old = xa_store(&ictx->objects, obj->id, obj, GFP_KERNEL); - /* obj->id was returned from xa_alloc() so the xa_store() cannot fail */ - WARN_ON(old); + xa_lock(&ictx->objects); + old = xas_store(&xas, obj); + xa_unlock(&ictx->objects); + /* obj->id was returned from xa_alloc() so the xas_store() cannot fail */ + WARN_ON(old != XA_ZERO_ENTRY); } /* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */ void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj) { + XA_STATE(xas, &ictx->objects, obj->id); void *old; - old = xa_erase(&ictx->objects, obj->id); - WARN_ON(old); + xa_lock(&ictx->objects); + old = xas_store(&xas, NULL); + xa_unlock(&ictx->objects); + WARN_ON(old != XA_ZERO_ENTRY); + + if (WARN_ON(!refcount_dec_and_test(&obj->users))) + return; + kfree(obj); } @@ -97,10 +136,30 @@ void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj) void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, struct iommufd_object *obj) { - if (iommufd_object_ops[obj->type].abort) - iommufd_object_ops[obj->type].abort(obj); + const struct iommufd_object_ops *ops = &iommufd_object_ops[obj->type]; + + if (ops->file_offset) { + struct file **filep = ((void *)obj) + ops->file_offset; + + /* + * A file should hold a users refcount while the file is open + * and put it back in its release. The file should hold a + * pointer to obj in their private data. Normal fput() is + * deferred to a workqueue and can get out of order with the + * following kfree(obj). Using the sync version ensures the + * release happens immediately. During abort we require the file + * refcount is one at this point - meaning the object alloc + * function cannot do anything to allow another thread to take a + * refcount prior to a guaranteed success. + */ + if (*filep) + __fput_sync(*filep); + } + + if (ops->abort) + ops->abort(obj); else - iommufd_object_ops[obj->type].destroy(obj); + ops->destroy(obj); iommufd_object_abort(ictx, obj); } @@ -121,20 +180,22 @@ struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, return obj; } -static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx, - struct iommufd_object *to_destroy) +static int iommufd_object_dec_wait(struct iommufd_ctx *ictx, + struct iommufd_object *to_destroy) { - if (refcount_dec_and_test(&to_destroy->shortterm_users)) + if (refcount_dec_and_test(&to_destroy->wait_cnt)) return 0; + if (iommufd_object_ops[to_destroy->type].pre_destroy) + iommufd_object_ops[to_destroy->type].pre_destroy(to_destroy); + if (wait_event_timeout(ictx->destroy_wait, - refcount_read(&to_destroy->shortterm_users) == - 0, - msecs_to_jiffies(10000))) + refcount_read(&to_destroy->wait_cnt) == 0, + msecs_to_jiffies(60000))) return 0; pr_crit("Time out waiting for iommufd object to become free\n"); - refcount_inc(&to_destroy->shortterm_users); + refcount_inc(&to_destroy->wait_cnt); return -EBUSY; } @@ -148,17 +209,18 @@ int iommufd_object_remove(struct iommufd_ctx *ictx, { struct iommufd_object *obj; XA_STATE(xas, &ictx->objects, id); - bool zerod_shortterm = false; + bool zerod_wait_cnt = false; int ret; /* - * The purpose of the shortterm_users is to ensure deterministic - * destruction of objects used by external drivers and destroyed by this - * function. 
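 *
 * The finalize/abort hunks above lean on the xarray's reserved-entry
 * pattern: the ID is allocated as a zero entry that lookups cannot see,
 * then either published or cleared. A standalone sketch with a plain
 * table and a sentinel (illustrative; hypothetical names):
 *
 *	#include <stdio.h>
 *
 *	#define SLOTS 8
 *	#define RESERVED ((void *)1)	// stand-in for XA_ZERO_ENTRY
 *
 *	static void *table[SLOTS];
 *
 *	static int alloc_id(void)	// like xa_alloc() of a zero entry
 *	{
 *		for (int i = 0; i < SLOTS; i++)
 *			if (!table[i]) {
 *				table[i] = RESERVED;	// taken, not visible
 *				return i;
 *			}
 *		return -1;
 *	}
 *
 *	static void *lookup(int id)
 *	{
 *		return table[id] == RESERVED ? NULL : table[id];
 *	}
 *
 *	static void finalize(int id, void *obj) { table[id] = obj; }
 *	static void abort_id(int id)            { table[id] = NULL; }
 *
 *	int main(void)
 *	{
 *		static int object;
 *		int id = alloc_id();
 *
 *		printf("%d\n", lookup(id) == NULL);	// 1: reserved, hidden
 *		finalize(id, &object);			// publish
 *		printf("%d\n", lookup(id) == &object);	// 1
 *		abort_id(alloc_id());	// failed alloc path frees the ID
 *		return 0;
 *	}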
Any temporary increment of the refcount must increment - * shortterm_users, such as during ioctl execution. + * The purpose of the wait_cnt is to ensure deterministic destruction + * of objects used by external drivers and destroyed by this function. + * Incrementing this wait_cnt should either be short lived, such as + * during ioctl execution, or be revoked and blocked during + * pre_destroy(), such as vdev holding the idev's refcount. */ - if (flags & REMOVE_WAIT_SHORTTERM) { - ret = iommufd_object_dec_wait_shortterm(ictx, to_destroy); + if (flags & REMOVE_WAIT) { + ret = iommufd_object_dec_wait(ictx, to_destroy); if (ret) { /* * We have a bug. Put back the callers reference and @@ -167,7 +229,7 @@ int iommufd_object_remove(struct iommufd_ctx *ictx, refcount_dec(&to_destroy->users); return ret; } - zerod_shortterm = true; + zerod_wait_cnt = true; } xa_lock(&ictx->objects); @@ -193,17 +255,17 @@ int iommufd_object_remove(struct iommufd_ctx *ictx, goto err_xa; } - xas_store(&xas, NULL); + xas_store(&xas, (flags & REMOVE_OBJ_TOMBSTONE) ? XA_ZERO_ENTRY : NULL); if (ictx->vfio_ioas == container_of(obj, struct iommufd_ioas, obj)) ictx->vfio_ioas = NULL; xa_unlock(&ictx->objects); /* - * Since users is zero any positive users_shortterm must be racing + * Since users is zero any positive wait_cnt must be racing * iommufd_put_object(), or we have a bug. */ - if (!zerod_shortterm) { - ret = iommufd_object_dec_wait_shortterm(ictx, obj); + if (!zerod_wait_cnt) { + ret = iommufd_object_dec_wait(ictx, obj); if (WARN_ON(ret)) return ret; } @@ -213,9 +275,9 @@ int iommufd_object_remove(struct iommufd_ctx *ictx, return 0; err_xa: - if (zerod_shortterm) { + if (zerod_wait_cnt) { /* Restore the xarray owned reference */ - refcount_set(&obj->shortterm_users, 1); + refcount_set(&obj->wait_cnt, 1); } xa_unlock(&ictx->objects); @@ -248,10 +310,14 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n"); } + init_rwsem(&ictx->ioas_creation_lock); xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); xa_init(&ictx->groups); ictx->file = filp; + mt_init_flags(&ictx->mt_mmap, MT_FLAGS_ALLOC_RANGE); init_waitqueue_head(&ictx->destroy_wait); + mutex_init(&ictx->sw_msi_lock); + INIT_LIST_HEAD(&ictx->sw_msi_list); filp->private_data = ictx; return 0; } @@ -259,6 +325,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) static int iommufd_fops_release(struct inode *inode, struct file *filp) { struct iommufd_ctx *ictx = filp->private_data; + struct iommufd_sw_msi_map *next; + struct iommufd_sw_msi_map *cur; struct iommufd_object *obj; /* @@ -273,20 +341,47 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp) while (!xa_empty(&ictx->objects)) { unsigned int destroyed = 0; unsigned long index; + bool empty = true; + /* + * We can't use xa_empty() to end the loop as the tombstones + * are stored as XA_ZERO_ENTRY in the xarray. However + * xa_for_each() automatically converts them to NULL and skips + * them causing xa_empty() to be kept false. Thus once + * xa_for_each() finds no further !NULL entries the loop is + * done. 
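Editorial sketch: the tombstone comment above (closed just below) leans on an xarray subtlety worth making concrete: a zero entry reads back as NULL through the normal lookup API yet still occupies its slot. A standalone illustration using the plain xarray API, not code from this patch:

	/* A reserved (zero) entry is invisible to readers but keeps the
	 * slot occupied, which is exactly why xa_empty() cannot be the
	 * release loop's exit test.
	 */
	static bool tombstone_demo(struct xarray *xa, unsigned long id)
	{
		if (xa_reserve(xa, id, GFP_KERNEL))
			return false;
		return xa_load(xa, id) == NULL &&	/* reads as NULL */
		       !xa_empty(xa);			/* yet not empty */
	}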
+ */ xa_for_each(&ictx->objects, index, obj) { + empty = false; if (!refcount_dec_if_one(&obj->users)) continue; + destroyed++; xa_erase(&ictx->objects, index); iommufd_object_ops[obj->type].destroy(obj); kfree(obj); } + + if (empty) + break; + /* Bug related to users refcount */ if (WARN_ON(!destroyed)) break; } + + /* + * There may be some tombstones left over from + * iommufd_object_tombstone_user() + */ + xa_destroy(&ictx->objects); + WARN_ON(!xa_empty(&ictx->groups)); + + mutex_destroy(&ictx->sw_msi_lock); + list_for_each_entry_safe(cur, next, &ictx->sw_msi_list, sw_msi_item) + kfree(cur); + kfree(ictx); return 0; } @@ -319,7 +414,9 @@ static int iommufd_option(struct iommufd_ucmd *ucmd) union ucmd_buffer { struct iommu_destroy destroy; + struct iommu_fault_alloc fault; struct iommu_hw_info info; + struct iommu_hw_queue_alloc hw_queue; struct iommu_hwpt_alloc hwpt; struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap; struct iommu_hwpt_invalidate cache; @@ -331,7 +428,10 @@ union ucmd_buffer { struct iommu_ioas_map map; struct iommu_ioas_unmap unmap; struct iommu_option option; + struct iommu_vdevice_alloc vdev; + struct iommu_veventq_alloc veventq; struct iommu_vfio_ioas vfio_ioas; + struct iommu_viommu_alloc viommu; #ifdef CONFIG_IOMMUFD_TEST struct iommu_test_cmd test; #endif @@ -355,8 +455,12 @@ struct iommufd_ioctl_op { } static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id), + IOCTL_OP(IOMMU_FAULT_QUEUE_ALLOC, iommufd_fault_alloc, + struct iommu_fault_alloc, out_fault_fd), IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info, __reserved), + IOCTL_OP(IOMMU_HW_QUEUE_ALLOC, iommufd_hw_queue_alloc_ioctl, + struct iommu_hw_queue_alloc, length), IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, __reserved), IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap, @@ -369,18 +473,26 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { struct iommu_ioas_alloc, out_ioas_id), IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas, struct iommu_ioas_allow_iovas, allowed_iovas), + IOCTL_OP(IOMMU_IOAS_CHANGE_PROCESS, iommufd_ioas_change_process, + struct iommu_ioas_change_process, __reserved), IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy, src_iova), IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges, struct iommu_ioas_iova_ranges, out_iova_alignment), - IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map, - iova), + IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map, iova), + IOCTL_OP(IOMMU_IOAS_MAP_FILE, iommufd_ioas_map_file, + struct iommu_ioas_map_file, iova), IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap, length), - IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, - val64), + IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, val64), + IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl, + struct iommu_vdevice_alloc, virt_id), + IOCTL_OP(IOMMU_VEVENTQ_ALLOC, iommufd_veventq_alloc, + struct iommu_veventq_alloc, out_veventq_fd), IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas, __reserved), + IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl, + struct iommu_viommu_alloc, out_viommu_id), #ifdef CONFIG_IOMMUFD_TEST IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last), #endif @@ -419,14 +531,91 @@ static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd, if (ret) return ret; ret = op->execute(&ucmd); 
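With the dispatcher hunk continuing just below, a handler that allocates through _iommufd_object_alloc_ucmd() no longer writes its own error unwind: the ioctl core finalizes the object on success and aborts it on failure. A hedged sketch of the resulting handler shape; my_alloc_ioctl() is illustrative, only the allocator call and the dispatcher behavior are from the patch:

	static int my_alloc_ioctl(struct iommufd_ucmd *ucmd)
	{
		struct iommufd_object *obj;

		obj = _iommufd_object_alloc_ucmd(ucmd, sizeof(*obj),
						 IOMMUFD_OBJ_IOAS);
		if (IS_ERR(obj))
			return PTR_ERR(obj);
		/* Errorable setup goes here. On a non-zero return the
		 * dispatcher calls iommufd_object_abort_and_destroy() on
		 * ucmd->new_obj; on zero it calls iommufd_object_finalize().
		 */
		return 0;
	}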
+ + if (ucmd.new_obj) { + if (ret) + iommufd_object_abort_and_destroy(ictx, ucmd.new_obj); + else + iommufd_object_finalize(ictx, ucmd.new_obj); + } return ret; } +static void iommufd_fops_vma_open(struct vm_area_struct *vma) +{ + struct iommufd_mmap *immap = vma->vm_private_data; + + refcount_inc(&immap->owner->users); +} + +static void iommufd_fops_vma_close(struct vm_area_struct *vma) +{ + struct iommufd_mmap *immap = vma->vm_private_data; + + refcount_dec(&immap->owner->users); +} + +static const struct vm_operations_struct iommufd_vma_ops = { + .open = iommufd_fops_vma_open, + .close = iommufd_fops_vma_close, +}; + +/* The vm_pgoff must be pre-allocated from mt_mmap, and given to user space */ +static int iommufd_fops_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct iommufd_ctx *ictx = filp->private_data; + size_t length = vma->vm_end - vma->vm_start; + struct iommufd_mmap *immap; + int rc; + + if (!PAGE_ALIGNED(length)) + return -EINVAL; + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + if (vma->vm_flags & VM_EXEC) + return -EPERM; + + mtree_lock(&ictx->mt_mmap); + /* vma->vm_pgoff carries a page-shifted start position to an immap */ + immap = mtree_load(&ictx->mt_mmap, vma->vm_pgoff << PAGE_SHIFT); + if (!immap || !refcount_inc_not_zero(&immap->owner->users)) { + mtree_unlock(&ictx->mt_mmap); + return -ENXIO; + } + mtree_unlock(&ictx->mt_mmap); + + /* + * mtree_load() returns the immap for any contained mmio_addr, so only + * allow the exact immap thing to be mapped + */ + if (vma->vm_pgoff != immap->vm_pgoff || length != immap->length) { + rc = -ENXIO; + goto err_refcount; + } + + vma->vm_pgoff = 0; + vma->vm_private_data = immap; + vma->vm_ops = &iommufd_vma_ops; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + rc = io_remap_pfn_range(vma, vma->vm_start, + immap->mmio_addr >> PAGE_SHIFT, length, + vma->vm_page_prot); + if (rc) + goto err_refcount; + return 0; + +err_refcount: + refcount_dec(&immap->owner->users); + return rc; +} + static const struct file_operations iommufd_fops = { .owner = THIS_MODULE, .open = iommufd_fops_open, .release = iommufd_fops_release, .unlocked_ioctl = iommufd_fops_ioctl, + .mmap = iommufd_fops_mmap, }; /** @@ -439,7 +628,7 @@ void iommufd_ctx_get(struct iommufd_ctx *ictx) { get_file(ictx->file); } -EXPORT_SYMBOL_NS_GPL(iommufd_ctx_get, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_get, "IOMMUFD"); /** * iommufd_ctx_from_file - Acquires a reference to the iommufd context @@ -459,7 +648,7 @@ struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) iommufd_ctx_get(ictx); return ictx; } -EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, "IOMMUFD"); /** * iommufd_ctx_from_fd - Acquires a reference to the iommufd context @@ -483,7 +672,7 @@ struct iommufd_ctx *iommufd_ctx_from_fd(int fd) /* fget is the same as iommufd_ctx_get() */ return file->private_data; } -EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_fd, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_fd, "IOMMUFD"); /** * iommufd_ctx_put - Put back a reference @@ -493,17 +682,28 @@ void iommufd_ctx_put(struct iommufd_ctx *ictx) { fput(ictx->file); } -EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, "IOMMUFD"); + +#define IOMMUFD_FILE_OFFSET(_struct, _filep, _obj) \ + .file_offset = (offsetof(_struct, _filep) + \ + BUILD_BUG_ON_ZERO(!__same_type( \ + struct file *, ((_struct *)NULL)->_filep)) + \ + BUILD_BUG_ON_ZERO(offsetof(_struct, _obj))) static const struct iommufd_object_ops 
iommufd_object_ops[] = { [IOMMUFD_OBJ_ACCESS] = { .destroy = iommufd_access_destroy_object, }, [IOMMUFD_OBJ_DEVICE] = { + .pre_destroy = iommufd_device_pre_destroy, .destroy = iommufd_device_destroy, }, - [IOMMUFD_OBJ_IOAS] = { - .destroy = iommufd_ioas_destroy, + [IOMMUFD_OBJ_FAULT] = { + .destroy = iommufd_fault_destroy, + IOMMUFD_FILE_OFFSET(struct iommufd_fault, common.filep, common.obj), + }, + [IOMMUFD_OBJ_HW_QUEUE] = { + .destroy = iommufd_hw_queue_destroy, }, [IOMMUFD_OBJ_HWPT_PAGING] = { .destroy = iommufd_hwpt_paging_destroy, @@ -513,6 +713,21 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { .destroy = iommufd_hwpt_nested_destroy, .abort = iommufd_hwpt_nested_abort, }, + [IOMMUFD_OBJ_IOAS] = { + .destroy = iommufd_ioas_destroy, + }, + [IOMMUFD_OBJ_VDEVICE] = { + .destroy = iommufd_vdevice_destroy, + .abort = iommufd_vdevice_abort, + }, + [IOMMUFD_OBJ_VEVENTQ] = { + .destroy = iommufd_veventq_destroy, + .abort = iommufd_veventq_abort, + IOMMUFD_FILE_OFFSET(struct iommufd_veventq, common.filep, common.obj), + }, + [IOMMUFD_OBJ_VIOMMU] = { + .destroy = iommufd_viommu_destroy, + }, #ifdef CONFIG_IOMMUFD_TEST [IOMMUFD_OBJ_SELFTEST] = { .destroy = iommufd_selftest_destroy, @@ -528,7 +743,6 @@ static struct miscdevice iommu_misc_dev = { .mode = 0660, }; - static struct miscdevice vfio_misc_dev = { .minor = VFIO_MINOR, .name = "vfio", @@ -537,6 +751,15 @@ static struct miscdevice vfio_misc_dev = { .mode = 0666, }; +/* + * Used only by DMABUF, returns a valid struct device to use as a dummy struct + * device for attachment. + */ +struct device *iommufd_global_device(void) +{ + return iommu_misc_dev.this_device; +} + static int __init iommufd_init(void) { int ret; @@ -578,7 +801,8 @@ module_exit(iommufd_exit); MODULE_ALIAS_MISCDEV(VFIO_MINOR); MODULE_ALIAS("devname:vfio/vfio"); #endif -MODULE_IMPORT_NS(IOMMUFD_INTERNAL); -MODULE_IMPORT_NS(IOMMUFD); +MODULE_IMPORT_NS("IOMMUFD_INTERNAL"); +MODULE_IMPORT_NS("IOMMUFD"); +MODULE_IMPORT_NS("DMA_BUF"); MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices"); MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 528f356238b3..dbe51ecb9a20 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -45,16 +45,20 @@ * last_iova + 1 can overflow. An iopt_pages index will always be much less than * ULONG_MAX so last_index + 1 cannot overflow. */ +#include <linux/dma-buf.h> +#include <linux/dma-resv.h> +#include <linux/file.h> +#include <linux/highmem.h> +#include <linux/iommu.h> +#include <linux/iommufd.h> +#include <linux/kthread.h> #include <linux/overflow.h> #include <linux/slab.h> -#include <linux/iommu.h> #include <linux/sched/mm.h> -#include <linux/highmem.h> -#include <linux/kthread.h> -#include <linux/iommufd.h> +#include <linux/vfio_pci_core.h> -#include "io_pagetable.h" #include "double_span.h" +#include "io_pagetable.h" #ifndef CONFIG_IOMMUFD_TEST #define TEMP_MEMORY_LIMIT 65536 @@ -257,6 +261,11 @@ static struct iopt_area *iopt_pages_find_domain_area(struct iopt_pages *pages, return container_of(node, struct iopt_area, pages_node); } +enum batch_kind { + BATCH_CPU_MEMORY = 0, + BATCH_MMIO, +}; + /* * A simple datastructure to hold a vector of PFNs, optimized for contiguous * PFNs. 
This is used as a temporary holding memory for shuttling pfns from one @@ -270,7 +279,9 @@ struct pfn_batch { unsigned int array_size; unsigned int end; unsigned int total_pfns; + enum batch_kind kind; }; +enum { MAX_NPFNS = type_max(typeof(((struct pfn_batch *)0)->npfns[0])) }; static void batch_clear(struct pfn_batch *batch) { @@ -346,27 +357,47 @@ static void batch_destroy(struct pfn_batch *batch, void *backup) kfree(batch->pfns); } -/* true if the pfn was added, false otherwise */ -static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn) +static bool batch_add_pfn_num(struct pfn_batch *batch, unsigned long pfn, + u32 nr, enum batch_kind kind) { - const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns)); + unsigned int end = batch->end; - if (batch->end && - pfn == batch->pfns[batch->end - 1] + batch->npfns[batch->end - 1] && - batch->npfns[batch->end - 1] != MAX_NPFNS) { - batch->npfns[batch->end - 1]++; - batch->total_pfns++; - return true; + if (batch->kind != kind) { + /* One kind per batch */ + if (batch->end != 0) + return false; + batch->kind = kind; } - if (batch->end == batch->array_size) + + if (end && pfn == batch->pfns[end - 1] + batch->npfns[end - 1] && + nr <= MAX_NPFNS - batch->npfns[end - 1]) { + batch->npfns[end - 1] += nr; + } else if (end < batch->array_size) { + batch->pfns[end] = pfn; + batch->npfns[end] = nr; + batch->end++; + } else { return false; - batch->total_pfns++; - batch->pfns[batch->end] = pfn; - batch->npfns[batch->end] = 1; - batch->end++; + } + + batch->total_pfns += nr; return true; } +static void batch_remove_pfn_num(struct pfn_batch *batch, unsigned long nr) +{ + batch->npfns[batch->end - 1] -= nr; + if (batch->npfns[batch->end - 1] == 0) + batch->end--; + batch->total_pfns -= nr; +} + +/* true if the pfn was added, false otherwise */ +static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn) +{ + return batch_add_pfn_num(batch, pfn, 1, BATCH_CPU_MEMORY); +} + /* * Fill the batch with pfns from the domain. When the batch is full, or it * reaches last_index, the function will return. 
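batch_add_pfn_num() above generalizes the old single-PFN append: slot i of pfns[]/npfns[] describes a run of npfns[i] contiguous PFNs starting at pfns[i], a run may only grow while its count stays within MAX_NPFNS, and a batch now carries exactly one kind (CPU memory or MMIO). A small decoder, for illustration only, to make the encoding concrete:

	/* Walk a pfn_batch and report each contiguous run it stores. */
	static void batch_dump(const struct pfn_batch *batch)
	{
		unsigned int i;

		for (i = 0; i != batch->end; i++)
			pr_debug("run %u: pfn %#lx, %u pages\n", i,
				 batch->pfns[i], batch->npfns[i]);
	}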
The caller should use @@ -477,6 +508,7 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, { bool disable_large_pages = area->iopt->disable_large_pages; unsigned long last_iova = iopt_area_last_iova(area); + int iommu_prot = area->iommu_prot; unsigned int page_offset = 0; unsigned long start_iova; unsigned long next_iova; @@ -484,6 +516,11 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, unsigned long iova; int rc; + if (batch->kind == BATCH_MMIO) { + iommu_prot &= ~IOMMU_CACHE; + iommu_prot |= IOMMU_MMIO; + } + /* The first index might be a partial page */ if (start_index == iopt_area_index(area)) page_offset = area->page_offset; @@ -497,11 +534,11 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, rc = batch_iommu_map_small( domain, iova, PFN_PHYS(batch->pfns[cur]) + page_offset, - next_iova - iova, area->iommu_prot); + next_iova - iova, iommu_prot); else rc = iommu_map(domain, iova, PFN_PHYS(batch->pfns[cur]) + page_offset, - next_iova - iova, area->iommu_prot, + next_iova - iova, iommu_prot, GFP_KERNEL_ACCOUNT); if (rc) goto err_unmap; @@ -622,6 +659,41 @@ static void batch_from_pages(struct pfn_batch *batch, struct page **pages, break; } +static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p, + unsigned long *offset_p, unsigned long npages) +{ + int rc = 0; + struct folio **folios = *folios_p; + unsigned long offset = *offset_p; + + while (npages) { + struct folio *folio = *folios; + unsigned long nr = folio_nr_pages(folio) - offset; + unsigned long pfn = page_to_pfn(folio_page(folio, offset)); + + nr = min(nr, npages); + npages -= nr; + + if (!batch_add_pfn_num(batch, pfn, nr, BATCH_CPU_MEMORY)) + break; + if (nr > 1) { + rc = folio_add_pins(folio, nr - 1); + if (rc) { + batch_remove_pfn_num(batch, nr); + goto out; + } + } + + folios++; + offset = 0; + } + +out: + *folios_p = folios; + *offset_p = offset; + return rc; +} + static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages, unsigned int first_page_off, size_t npages) { @@ -703,19 +775,32 @@ struct pfn_reader_user { * neither */ int locked; + + /* The following are only valid if file != NULL. */ + struct file *file; + struct folio **ufolios; + size_t ufolios_len; + unsigned long ufolios_offset; + struct folio **ufolios_next; }; static void pfn_reader_user_init(struct pfn_reader_user *user, struct iopt_pages *pages) { user->upages = NULL; + user->upages_len = 0; user->upages_start = 0; user->upages_end = 0; user->locked = -1; - user->gup_flags = FOLL_LONGTERM; if (pages->writable) user->gup_flags |= FOLL_WRITE; + + user->file = (pages->type == IOPT_ADDRESS_FILE) ? 
pages->file : NULL; + user->ufolios = NULL; + user->ufolios_len = 0; + user->ufolios_next = NULL; + user->ufolios_offset = 0; } static void pfn_reader_user_destroy(struct pfn_reader_user *user, @@ -724,13 +809,67 @@ static void pfn_reader_user_destroy(struct pfn_reader_user *user, if (user->locked != -1) { if (user->locked) mmap_read_unlock(pages->source_mm); - if (pages->source_mm != current->mm) + if (!user->file && pages->source_mm != current->mm) mmput(pages->source_mm); user->locked = -1; } kfree(user->upages); user->upages = NULL; + kfree(user->ufolios); + user->ufolios = NULL; +} + +static long pin_memfd_pages(struct pfn_reader_user *user, unsigned long start, + unsigned long npages) +{ + unsigned long i; + unsigned long offset; + unsigned long npages_out = 0; + struct page **upages = user->upages; + unsigned long end = start + (npages << PAGE_SHIFT) - 1; + long nfolios = user->ufolios_len / sizeof(*user->ufolios); + + /* + * todo: memfd_pin_folios should return the last pinned offset so + * we can compute npages pinned, and avoid looping over folios here + * if upages == NULL. + */ + nfolios = memfd_pin_folios(user->file, start, end, user->ufolios, + nfolios, &offset); + if (nfolios <= 0) + return nfolios; + + offset >>= PAGE_SHIFT; + user->ufolios_next = user->ufolios; + user->ufolios_offset = offset; + + for (i = 0; i < nfolios; i++) { + struct folio *folio = user->ufolios[i]; + unsigned long nr = folio_nr_pages(folio); + unsigned long npin = min(nr - offset, npages); + + npages -= npin; + npages_out += npin; + + if (upages) { + if (npin == 1) { + *upages++ = folio_page(folio, offset); + } else { + int rc = folio_add_pins(folio, npin - 1); + + if (rc) + return rc; + + while (npin--) + *upages++ = folio_page(folio, offset++); + } + } + + offset = 0; + } + + return npages_out; } static int pfn_reader_user_pin(struct pfn_reader_user *user, @@ -739,7 +878,9 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user, unsigned long last_index) { bool remote_mm = pages->source_mm != current->mm; - unsigned long npages; + unsigned long npages = last_index - start_index + 1; + unsigned long start; + unsigned long unum; uintptr_t uptr; long rc; @@ -747,40 +888,50 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user, WARN_ON(last_index < start_index)) return -EINVAL; - if (!user->upages) { + if (!user->file && !user->upages) { /* All undone in pfn_reader_destroy() */ - user->upages_len = - (last_index - start_index + 1) * sizeof(*user->upages); + user->upages_len = npages * sizeof(*user->upages); user->upages = temp_kmalloc(&user->upages_len, NULL, 0); if (!user->upages) return -ENOMEM; } + if (user->file && !user->ufolios) { + user->ufolios_len = npages * sizeof(*user->ufolios); + user->ufolios = temp_kmalloc(&user->ufolios_len, NULL, 0); + if (!user->ufolios) + return -ENOMEM; + } + if (user->locked == -1) { /* * The majority of usages will run the map task within the mm * providing the pages, so we can optimize into * get_user_pages_fast() */ - if (remote_mm) { + if (!user->file && remote_mm) { if (!mmget_not_zero(pages->source_mm)) return -EFAULT; } user->locked = 0; } - npages = min_t(unsigned long, last_index - start_index + 1, - user->upages_len / sizeof(*user->upages)); - + unum = user->file ? 
user->ufolios_len / sizeof(*user->ufolios) : + user->upages_len / sizeof(*user->upages); + npages = min_t(unsigned long, npages, unum); if (iommufd_should_fail()) return -EFAULT; - uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE); - if (!remote_mm) + if (user->file) { + start = pages->start + (start_index * PAGE_SIZE); + rc = pin_memfd_pages(user, start, npages); + } else if (!remote_mm) { + uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE); rc = pin_user_pages_fast(uptr, npages, user->gup_flags, user->upages); - else { + } else { + uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE); if (!user->locked) { mmap_read_lock(pages->source_mm); user->locked = 1; @@ -809,13 +960,14 @@ static int incr_user_locked_vm(struct iopt_pages *pages, unsigned long npages) lock_limit = task_rlimit(pages->source_task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + cur_pages = atomic_long_read(&pages->source_user->locked_vm); do { - cur_pages = atomic_long_read(&pages->source_user->locked_vm); new_pages = cur_pages + npages; if (new_pages > lock_limit) return -ENOMEM; - } while (atomic_long_cmpxchg(&pages->source_user->locked_vm, cur_pages, - new_pages) != cur_pages); + } while (!atomic_long_try_cmpxchg(&pages->source_user->locked_vm, + &cur_pages, new_pages)); return 0; } @@ -837,7 +989,8 @@ static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages, mmap_read_unlock(pages->source_mm); user->locked = 0; /* If we had the lock then we also have a get */ - } else if ((!user || !user->upages) && + + } else if ((!user || (!user->upages && !user->ufolios)) && pages->source_mm != current->mm) { if (!mmget_not_zero(pages->source_mm)) return -EINVAL; @@ -854,8 +1007,8 @@ static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages, return rc; } -static int do_update_pinned(struct iopt_pages *pages, unsigned long npages, - bool inc, struct pfn_reader_user *user) +int iopt_pages_update_pinned(struct iopt_pages *pages, unsigned long npages, + bool inc, struct pfn_reader_user *user) { int rc = 0; @@ -889,8 +1042,8 @@ static void update_unpinned(struct iopt_pages *pages) return; if (pages->npinned == pages->last_npinned) return; - do_update_pinned(pages, pages->last_npinned - pages->npinned, false, - NULL); + iopt_pages_update_pinned(pages, pages->last_npinned - pages->npinned, + false, NULL); } /* @@ -920,7 +1073,42 @@ static int pfn_reader_user_update_pinned(struct pfn_reader_user *user, npages = pages->npinned - pages->last_npinned; inc = true; } - return do_update_pinned(pages, npages, inc, user); + return iopt_pages_update_pinned(pages, npages, inc, user); +} + +struct pfn_reader_dmabuf { + struct dma_buf_phys_vec phys; + unsigned long start_offset; +}; + +static int pfn_reader_dmabuf_init(struct pfn_reader_dmabuf *dmabuf, + struct iopt_pages *pages) +{ + /* Callers must not get here if the dmabuf was already revoked */ + if (WARN_ON(iopt_dmabuf_revoked(pages))) + return -EINVAL; + + dmabuf->phys = pages->dmabuf.phys; + dmabuf->start_offset = pages->dmabuf.start; + return 0; +} + +static int pfn_reader_fill_dmabuf(struct pfn_reader_dmabuf *dmabuf, + struct pfn_batch *batch, + unsigned long start_index, + unsigned long last_index) +{ + unsigned long start = dmabuf->start_offset + start_index * PAGE_SIZE; + + /* + * start/last_index and start are all PAGE_SIZE aligned, the batch is + * always filled using page size aligned PFNs just like the other types. 
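Both pin_memfd_pages() above and batch_from_folios() earlier rely on the same accounting rule: memfd_pin_folios() takes a single pin per folio, while release happens per page, so folio_add_pins() tops the folio up to one pin per page actually used. The rule in isolation (sketch; assumes the folio already holds one pin from memfd_pin_folios()):

	static int pin_whole_span(struct folio *folio, unsigned long npages)
	{
		/* One pin exists; add npages - 1 so per-page unpin balances. */
		if (npages > 1)
			return folio_add_pins(folio, npages - 1);
		return 0;
	}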
+ * If the dmabuf has been sliced on a sub page offset then the common + * batch to domain code will adjust it before mapping to the domain. + */ + batch_add_pfn_num(batch, PHYS_PFN(dmabuf->phys.paddr + start), + last_index - start_index + 1, BATCH_MMIO); + return 0; } /* @@ -941,7 +1129,10 @@ struct pfn_reader { unsigned long batch_end_index; unsigned long last_index; - struct pfn_reader_user user; + union { + struct pfn_reader_user user; + struct pfn_reader_dmabuf dmabuf; + }; }; static int pfn_reader_update_pinned(struct pfn_reader *pfns) @@ -977,6 +1168,8 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns) { struct interval_tree_double_span_iter *span = &pfns->span; unsigned long start_index = pfns->batch_end_index; + struct pfn_reader_user *user; + unsigned long npages; struct iopt_area *area; int rc; @@ -1007,18 +1200,29 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns) return 0; } - if (start_index >= pfns->user.upages_end) { - rc = pfn_reader_user_pin(&pfns->user, pfns->pages, start_index, + if (iopt_is_dmabuf(pfns->pages)) + return pfn_reader_fill_dmabuf(&pfns->dmabuf, &pfns->batch, + start_index, span->last_hole); + + user = &pfns->user; + if (start_index >= user->upages_end) { + rc = pfn_reader_user_pin(user, pfns->pages, start_index, span->last_hole); if (rc) return rc; } - batch_from_pages(&pfns->batch, - pfns->user.upages + - (start_index - pfns->user.upages_start), - pfns->user.upages_end - start_index); - return 0; + npages = user->upages_end - start_index; + start_index -= user->upages_start; + rc = 0; + + if (!user->file) + batch_from_pages(&pfns->batch, user->upages + start_index, + npages); + else + rc = batch_from_folios(&pfns->batch, &user->ufolios_next, + &user->ufolios_offset, npages); + return rc; } static bool pfn_reader_done(struct pfn_reader *pfns) @@ -1070,7 +1274,10 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages, pfns->batch_start_index = start_index; pfns->batch_end_index = start_index; pfns->last_index = last_index; - pfn_reader_user_init(&pfns->user, pages); + if (iopt_is_dmabuf(pages)) + pfn_reader_dmabuf_init(&pfns->dmabuf, pages); + else + pfn_reader_user_init(&pfns->user, pages); rc = batch_init(&pfns->batch, last_index - start_index + 1); if (rc) return rc; @@ -1091,16 +1298,29 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages, static void pfn_reader_release_pins(struct pfn_reader *pfns) { struct iopt_pages *pages = pfns->pages; + struct pfn_reader_user *user; - if (pfns->user.upages_end > pfns->batch_end_index) { - size_t npages = pfns->user.upages_end - pfns->batch_end_index; + if (iopt_is_dmabuf(pages)) + return; + user = &pfns->user; + if (user->upages_end > pfns->batch_end_index) { /* Any pages not transferred to the batch are just unpinned */ - unpin_user_pages(pfns->user.upages + (pfns->batch_end_index - - pfns->user.upages_start), - npages); + + unsigned long npages = user->upages_end - pfns->batch_end_index; + unsigned long start_index = pfns->batch_end_index - + user->upages_start; + + if (!user->file) { + unpin_user_pages(user->upages + start_index, npages); + } else { + long n = user->ufolios_len / sizeof(*user->ufolios); + + unpin_folios(user->ufolios_next, + user->ufolios + n - user->ufolios_next); + } iopt_pages_sub_npinned(pages, npages); - pfns->user.upages_end = pfns->batch_end_index; + user->upages_end = pfns->batch_end_index; } if (pfns->batch_start_index != pfns->batch_end_index) { pfn_reader_unpin(pfns); @@ -1113,7 +1333,8 @@ static void 
pfn_reader_destroy(struct pfn_reader *pfns) struct iopt_pages *pages = pfns->pages; pfn_reader_release_pins(pfns); - pfn_reader_user_destroy(&pfns->user, pfns->pages); + if (!iopt_is_dmabuf(pfns->pages)) + pfn_reader_user_destroy(&pfns->user, pfns->pages); batch_destroy(&pfns->batch, NULL); WARN_ON(pages->last_npinned != pages->npinned); } @@ -1138,11 +1359,10 @@ static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages, return 0; } -struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, - bool writable) +static struct iopt_pages *iopt_alloc_pages(unsigned long start_byte, + unsigned long length, bool writable) { struct iopt_pages *pages; - unsigned long end; /* * The iommu API uses size_t as the length, and protect the DIV_ROUND_UP @@ -1151,9 +1371,6 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, if (length > SIZE_MAX - PAGE_SIZE || length == 0) return ERR_PTR(-EINVAL); - if (check_add_overflow((unsigned long)uptr, length, &end)) - return ERR_PTR(-EOVERFLOW); - pages = kzalloc(sizeof(*pages), GFP_KERNEL_ACCOUNT); if (!pages) return ERR_PTR(-ENOMEM); @@ -1163,8 +1380,7 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, mutex_init(&pages->mutex); pages->source_mm = current->mm; mmgrab(pages->source_mm); - pages->uptr = (void __user *)ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE); - pages->npages = DIV_ROUND_UP(length + (uptr - pages->uptr), PAGE_SIZE); + pages->npages = DIV_ROUND_UP(length + start_byte, PAGE_SIZE); pages->access_itree = RB_ROOT_CACHED; pages->domains_itree = RB_ROOT_CACHED; pages->writable = writable; @@ -1178,6 +1394,253 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, return pages; } +struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, + unsigned long length, bool writable) +{ + struct iopt_pages *pages; + unsigned long end; + void __user *uptr_down = + (void __user *)ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE); + + if (check_add_overflow((unsigned long)uptr, length, &end)) + return ERR_PTR(-EOVERFLOW); + + pages = iopt_alloc_pages(uptr - uptr_down, length, writable); + if (IS_ERR(pages)) + return pages; + pages->uptr = uptr_down; + pages->type = IOPT_ADDRESS_USER; + return pages; +} + +struct iopt_pages *iopt_alloc_file_pages(struct file *file, + unsigned long start_byte, + unsigned long start, + unsigned long length, bool writable) + +{ + struct iopt_pages *pages; + + pages = iopt_alloc_pages(start_byte, length, writable); + if (IS_ERR(pages)) + return pages; + pages->file = get_file(file); + pages->start = start - start_byte; + pages->type = IOPT_ADDRESS_FILE; + return pages; +} + +static void iopt_revoke_notify(struct dma_buf_attachment *attach) +{ + struct iopt_pages *pages = attach->importer_priv; + struct iopt_pages_dmabuf_track *track; + + guard(mutex)(&pages->mutex); + if (iopt_dmabuf_revoked(pages)) + return; + + list_for_each_entry(track, &pages->dmabuf.tracker, elm) { + struct iopt_area *area = track->area; + + iopt_area_unmap_domain_range(area, track->domain, + iopt_area_index(area), + iopt_area_last_index(area)); + } + pages->dmabuf.phys.len = 0; +} + +static struct dma_buf_attach_ops iopt_dmabuf_attach_revoke_ops = { + .allow_peer2peer = true, + .move_notify = iopt_revoke_notify, +}; + +/* + * iommufd and vfio have a circular dependency. Future work for a phys + * based private interconnect will remove this. 
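sym_vfio_pci_dma_buf_iommufd_map() just below breaks the iommufd/vfio module cycle by resolving the vfio symbol at run time instead of linking against it. The general symbol_get() idiom, sketched as a fragment with a hypothetical export some_exported_func (the real helper additionally tries the selftest hook first):

	typeof(&some_exported_func) fn;
	int rc;

	fn = symbol_get(some_exported_func);	/* pins the owning module */
	if (!fn)
		return -EOPNOTSUPP;		/* exporter not loaded */
	rc = fn(arg);
	symbol_put(some_exported_func);		/* drops the module pin */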
+ */ +static int +sym_vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys) +{ + typeof(&vfio_pci_dma_buf_iommufd_map) fn; + int rc; + + rc = iommufd_test_dma_buf_iommufd_map(attachment, phys); + if (rc != -EOPNOTSUPP) + return rc; + + if (!IS_ENABLED(CONFIG_VFIO_PCI_DMABUF)) + return -EOPNOTSUPP; + + fn = symbol_get(vfio_pci_dma_buf_iommufd_map); + if (!fn) + return -EOPNOTSUPP; + rc = fn(attachment, phys); + symbol_put(vfio_pci_dma_buf_iommufd_map); + return rc; +} + +static int iopt_map_dmabuf(struct iommufd_ctx *ictx, struct iopt_pages *pages, + struct dma_buf *dmabuf) +{ + struct dma_buf_attachment *attach; + int rc; + + attach = dma_buf_dynamic_attach(dmabuf, iommufd_global_device(), + &iopt_dmabuf_attach_revoke_ops, pages); + if (IS_ERR(attach)) + return PTR_ERR(attach); + + dma_resv_lock(dmabuf->resv, NULL); + /* + * Lock ordering requires the mutex to be taken inside the reservation, + * make sure lockdep sees this. + */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + mutex_lock(&pages->mutex); + mutex_unlock(&pages->mutex); + } + + rc = sym_vfio_pci_dma_buf_iommufd_map(attach, &pages->dmabuf.phys); + if (rc) + goto err_detach; + + dma_resv_unlock(dmabuf->resv); + + /* On success iopt_release_pages() will detach and put the dmabuf. */ + pages->dmabuf.attach = attach; + return 0; + +err_detach: + dma_resv_unlock(dmabuf->resv); + dma_buf_detach(dmabuf, attach); + return rc; +} + +struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx, + struct dma_buf *dmabuf, + unsigned long start_byte, + unsigned long start, + unsigned long length, bool writable) +{ + static struct lock_class_key pages_dmabuf_mutex_key; + struct iopt_pages *pages; + int rc; + + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return ERR_PTR(-EOPNOTSUPP); + + if (dmabuf->size <= (start + length - 1) || + length / PAGE_SIZE >= MAX_NPFNS) + return ERR_PTR(-EINVAL); + + pages = iopt_alloc_pages(start_byte, length, writable); + if (IS_ERR(pages)) + return pages; + + /* + * The mmap_lock can be held when obtaining the dmabuf reservation lock + * which creates a locking cycle with the pages mutex which is held + * while obtaining the mmap_lock. This locking path is not present for + * IOPT_ADDRESS_DMABUF so split the lock class. + */ + lockdep_set_class(&pages->mutex, &pages_dmabuf_mutex_key); + + /* dmabuf does not use pinned page accounting. 
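The lockdep_set_class() call in iopt_alloc_dmabuf_pages() above is the standard cure for this kind of apparent lock-order cycle: dmabuf-backed pages take the dma_resv lock then pages->mutex, user-backed pages take pages->mutex then the mmap_lock, and the two orders never mix on one instance, so splitting the class keeps lockdep from fusing them. The idiom in isolation (fragment, sketch only):

	static struct lock_class_key dmabuf_pages_key;	/* one key per ordering */

	mutex_init(&pages->mutex);			/* default class */
	lockdep_set_class(&pages->mutex, &dmabuf_pages_key);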
*/ + pages->account_mode = IOPT_PAGES_ACCOUNT_NONE; + pages->type = IOPT_ADDRESS_DMABUF; + pages->dmabuf.start = start - start_byte; + INIT_LIST_HEAD(&pages->dmabuf.tracker); + + rc = iopt_map_dmabuf(ictx, pages, dmabuf); + if (rc) { + iopt_put_pages(pages); + return ERR_PTR(rc); + } + + return pages; +} + +int iopt_dmabuf_track_domain(struct iopt_pages *pages, struct iopt_area *area, + struct iommu_domain *domain) +{ + struct iopt_pages_dmabuf_track *track; + + lockdep_assert_held(&pages->mutex); + if (WARN_ON(!iopt_is_dmabuf(pages))) + return -EINVAL; + + list_for_each_entry(track, &pages->dmabuf.tracker, elm) + if (WARN_ON(track->domain == domain && track->area == area)) + return -EINVAL; + + track = kzalloc(sizeof(*track), GFP_KERNEL); + if (!track) + return -ENOMEM; + track->domain = domain; + track->area = area; + list_add_tail(&track->elm, &pages->dmabuf.tracker); + + return 0; +} + +void iopt_dmabuf_untrack_domain(struct iopt_pages *pages, + struct iopt_area *area, + struct iommu_domain *domain) +{ + struct iopt_pages_dmabuf_track *track; + + lockdep_assert_held(&pages->mutex); + WARN_ON(!iopt_is_dmabuf(pages)); + + list_for_each_entry(track, &pages->dmabuf.tracker, elm) { + if (track->domain == domain && track->area == area) { + list_del(&track->elm); + kfree(track); + return; + } + } + WARN_ON(true); +} + +int iopt_dmabuf_track_all_domains(struct iopt_area *area, + struct iopt_pages *pages) +{ + struct iopt_pages_dmabuf_track *track; + struct iommu_domain *domain; + unsigned long index; + int rc; + + list_for_each_entry(track, &pages->dmabuf.tracker, elm) + if (WARN_ON(track->area == area)) + return -EINVAL; + + xa_for_each(&area->iopt->domains, index, domain) { + rc = iopt_dmabuf_track_domain(pages, area, domain); + if (rc) + goto err_untrack; + } + return 0; +err_untrack: + iopt_dmabuf_untrack_all_domains(area, pages); + return rc; +} + +void iopt_dmabuf_untrack_all_domains(struct iopt_area *area, + struct iopt_pages *pages) +{ + struct iopt_pages_dmabuf_track *track; + struct iopt_pages_dmabuf_track *tmp; + + list_for_each_entry_safe(track, tmp, &pages->dmabuf.tracker, + elm) { + if (track->area == area) { + list_del(&track->elm); + kfree(track); + } + } +} + void iopt_release_pages(struct kref *kref) { struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref); @@ -1190,6 +1653,15 @@ void iopt_release_pages(struct kref *kref) mutex_destroy(&pages->mutex); put_task_struct(pages->source_task); free_uid(pages->source_user); + if (iopt_is_dmabuf(pages) && pages->dmabuf.attach) { + struct dma_buf *dmabuf = pages->dmabuf.attach->dmabuf; + + dma_buf_detach(dmabuf, pages->dmabuf.attach); + dma_buf_put(dmabuf); + WARN_ON(!list_empty(&pages->dmabuf.tracker)); + } else if (pages->type == IOPT_ADDRESS_FILE) { + fput(pages->file); + } kfree(pages); } @@ -1267,6 +1739,14 @@ static void __iopt_area_unfill_domain(struct iopt_area *area, lockdep_assert_held(&pages->mutex); + if (iopt_is_dmabuf(pages)) { + if (WARN_ON(iopt_dmabuf_revoked(pages))) + return; + iopt_area_unmap_domain_range(area, domain, start_index, + last_index); + return; + } + /* * For security we must not unpin something that is still DMA mapped, * so this must unmap any IOVA before we go ahead and unpin the pages. 
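After revocation the (area, domain) mappings are already gone, which is why the unfill path below treats a revoked dmabuf as unmap-only and the fill paths return early. iopt_dmabuf_revoked() itself lives in a header outside this diff; given that iopt_revoke_notify() above zeroes phys.len, its assumed shape would be:

	/* Assumption, not visible in this diff: a zero-length phys vec
	 * marks the attachment as revoked by the exporter.
	 */
	static inline bool iopt_dmabuf_revoked(struct iopt_pages *pages)
	{
		return iopt_is_dmabuf(pages) && pages->dmabuf.phys.len == 0;
	}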
@@ -1342,6 +1822,9 @@ void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain) void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, struct iommu_domain *domain) { + if (iopt_dmabuf_revoked(pages)) + return; + __iopt_area_unfill_domain(area, pages, domain, iopt_area_last_index(area)); } @@ -1362,6 +1845,9 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain) lockdep_assert_held(&area->pages->mutex); + if (iopt_dmabuf_revoked(area->pages)) + return 0; + rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area), iopt_area_last_index(area)); if (rc) @@ -1421,33 +1907,44 @@ int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages) return 0; mutex_lock(&pages->mutex); - rc = pfn_reader_first(&pfns, pages, iopt_area_index(area), - iopt_area_last_index(area)); - if (rc) - goto out_unlock; + if (iopt_is_dmabuf(pages)) { + rc = iopt_dmabuf_track_all_domains(area, pages); + if (rc) + goto out_unlock; + } - while (!pfn_reader_done(&pfns)) { - done_first_end_index = pfns.batch_end_index; - done_all_end_index = pfns.batch_start_index; - xa_for_each(&area->iopt->domains, index, domain) { - rc = batch_to_domain(&pfns.batch, domain, area, - pfns.batch_start_index); + if (!iopt_dmabuf_revoked(pages)) { + rc = pfn_reader_first(&pfns, pages, iopt_area_index(area), + iopt_area_last_index(area)); + if (rc) + goto out_untrack; + + while (!pfn_reader_done(&pfns)) { + done_first_end_index = pfns.batch_end_index; + done_all_end_index = pfns.batch_start_index; + xa_for_each(&area->iopt->domains, index, domain) { + rc = batch_to_domain(&pfns.batch, domain, area, + pfns.batch_start_index); + if (rc) + goto out_unmap; + } + done_all_end_index = done_first_end_index; + + rc = pfn_reader_next(&pfns); if (rc) goto out_unmap; } - done_all_end_index = done_first_end_index; - - rc = pfn_reader_next(&pfns); + rc = pfn_reader_update_pinned(&pfns); if (rc) goto out_unmap; + + pfn_reader_destroy(&pfns); } - rc = pfn_reader_update_pinned(&pfns); - if (rc) - goto out_unmap; area->storage_domain = xa_load(&area->iopt->domains, 0); interval_tree_insert(&area->pages_node, &pages->domains_itree); - goto out_destroy; + mutex_unlock(&pages->mutex); + return 0; out_unmap: pfn_reader_release_pins(&pfns); @@ -1474,8 +1971,10 @@ out_unmap: end_index); } } -out_destroy: pfn_reader_destroy(&pfns); +out_untrack: + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_all_domains(area, pages); out_unlock: mutex_unlock(&pages->mutex); return rc; @@ -1501,16 +2000,22 @@ void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages) if (!area->storage_domain) goto out_unlock; - xa_for_each(&iopt->domains, index, domain) - if (domain != area->storage_domain) + xa_for_each(&iopt->domains, index, domain) { + if (domain == area->storage_domain) + continue; + + if (!iopt_dmabuf_revoked(pages)) iopt_area_unmap_domain_range( area, domain, iopt_area_index(area), iopt_area_last_index(area)); + } if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) WARN_ON(RB_EMPTY_NODE(&area->pages_node.rb)); interval_tree_remove(&area->pages_node, &pages->domains_itree); iopt_area_unfill_domain(area, pages, area->storage_domain); + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_all_domains(area, pages); area->storage_domain = NULL; out_unlock: mutex_unlock(&pages->mutex); @@ -1629,11 +2134,11 @@ static int iopt_pages_fill_from_domain(struct iopt_pages *pages, return 0; } -static int iopt_pages_fill_from_mm(struct iopt_pages *pages, - struct pfn_reader_user *user, - 
unsigned long start_index, - unsigned long last_index, - struct page **out_pages) +static int iopt_pages_fill(struct iopt_pages *pages, + struct pfn_reader_user *user, + unsigned long start_index, + unsigned long last_index, + struct page **out_pages) { unsigned long cur_index = start_index; int rc; @@ -1707,8 +2212,8 @@ int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start_index, /* hole */ cur_pages = out_pages + (span.start_hole - start_index); - rc = iopt_pages_fill_from_mm(pages, &user, span.start_hole, - span.last_hole, cur_pages); + rc = iopt_pages_fill(pages, &user, span.start_hole, + span.last_hole, cur_pages); if (rc) goto out_clean_xa; rc = pages_to_xarray(&pages->pinned_pfns, span.start_hole, @@ -1788,6 +2293,10 @@ static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index, struct page *page = NULL; int rc; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(pages->type != IOPT_ADDRESS_USER)) + return -EINVAL; + if (!mmget_not_zero(pages->source_mm)) return iopt_pages_rw_slow(pages, index, index, offset, data, length, flags); @@ -1843,6 +2352,14 @@ int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable) return -EPERM; + if (iopt_is_dmabuf(pages)) + return -EINVAL; + + if (pages->type != IOPT_ADDRESS_USER) + return iopt_pages_rw_slow(pages, start_index, last_index, + start_byte % PAGE_SIZE, data, length, + flags); + if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) { if (start_index == last_index) return iopt_pages_rw_page(pages, start_index, @@ -1906,6 +2423,7 @@ iopt_pages_get_exact_access(struct iopt_pages *pages, unsigned long index, * @last_index: Inclusive last page index * @out_pages: Output list of struct page's representing the PFNs * @flags: IOMMUFD_ACCESS_RW_* flags + * @lock_area: Fail userspace munmap on this area * * Record that an in-kernel access will be accessing the pages, ensure they are * pinned, and return the PFNs as a simple list of 'struct page *'. @@ -1913,8 +2431,8 @@ iopt_pages_get_exact_access(struct iopt_pages *pages, unsigned long index, * This should be undone through a matching call to iopt_area_remove_access() */ int iopt_area_add_access(struct iopt_area *area, unsigned long start_index, - unsigned long last_index, struct page **out_pages, - unsigned int flags) + unsigned long last_index, struct page **out_pages, + unsigned int flags, bool lock_area) { struct iopt_pages *pages = area->pages; struct iopt_pages_access *access; @@ -1927,6 +2445,8 @@ int iopt_area_add_access(struct iopt_area *area, unsigned long start_index, access = iopt_pages_get_exact_access(pages, start_index, last_index); if (access) { area->num_accesses++; + if (lock_area) + area->num_locks++; access->users++; iopt_pages_fill_from_xarray(pages, start_index, last_index, out_pages); @@ -1948,6 +2468,8 @@ int iopt_area_add_access(struct iopt_area *area, unsigned long start_index, access->node.last = last_index; access->users = 1; area->num_accesses++; + if (lock_area) + area->num_locks++; interval_tree_insert(&access->node, &pages->access_itree); mutex_unlock(&pages->mutex); return 0; @@ -1964,12 +2486,13 @@ err_unlock: * @area: The source of PFNs * @start_index: First page index * @last_index: Inclusive last page index + * @unlock_area: Must match the matching iopt_area_add_access()'s lock_area * * Undo iopt_area_add_access() and unpin the pages if necessary. The caller * must stop using the PFNs before calling this. 
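The kdoc above and below pairs this with iopt_area_add_access(): the new lock_area/unlock_area flags must be symmetric so that area->num_locks, which makes userspace munmap fail on the area, stays balanced. An illustrative caller, not from the patch:

	rc = iopt_area_add_access(area, start_index, last_index, out_pages,
				  flags, true /* lock_area */);
	if (rc)
		return rc;
	/* ... use the pinned PFNs ... */
	iopt_area_remove_access(area, start_index, last_index,
				true /* must match lock_area */);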
*/ void iopt_area_remove_access(struct iopt_area *area, unsigned long start_index, - unsigned long last_index) + unsigned long last_index, bool unlock_area) { struct iopt_pages *pages = area->pages; struct iopt_pages_access *access; @@ -1980,6 +2503,10 @@ void iopt_area_remove_access(struct iopt_area *area, unsigned long start_index, goto out_unlock; WARN_ON(area->num_accesses == 0 || access->users == 0); + if (unlock_area) { + WARN_ON(area->num_locks == 0); + area->num_locks--; + } area->num_accesses--; access->users--; if (access->users) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 7a2199470f31..c4322fd26f93 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -3,14 +3,19 @@ * * Kernel side components to support tools/testing/selftests/iommu */ -#include <linux/slab.h> -#include <linux/iommu.h> -#include <linux/xarray.h> -#include <linux/file.h> #include <linux/anon_inodes.h> +#include <linux/debugfs.h> +#include <linux/dma-buf.h> +#include <linux/dma-resv.h> #include <linux/fault-inject.h> +#include <linux/file.h> +#include <linux/iommu.h> #include <linux/platform_device.h> +#include <linux/slab.h> +#include <linux/xarray.h> #include <uapi/linux/iommufd.h> +#include <linux/generic_pt/iommu.h> +#include "../iommu-pages.h" #include "../iommu-priv.h" #include "io_pagetable.h" @@ -40,23 +45,11 @@ static DEFINE_IDA(mock_dev_ida); enum { MOCK_DIRTY_TRACK = 1, - MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2, - MOCK_HUGE_PAGE_SIZE = 512 * MOCK_IO_PAGE_SIZE, - - /* - * Like a real page table alignment requires the low bits of the address - * to be zero. xarray also requires the high bit to be zero, so we store - * the pfns shifted. The upper bits are used for metadata. - */ - MOCK_PFN_MASK = ULONG_MAX / MOCK_IO_PAGE_SIZE, - - _MOCK_PFN_START = MOCK_PFN_MASK + 1, - MOCK_PFN_START_IOVA = _MOCK_PFN_START, - MOCK_PFN_LAST_IOVA = _MOCK_PFN_START, - MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1, - MOCK_PFN_HUGE_IOVA = _MOCK_PFN_START << 2, }; +static int mock_dev_enable_iopf(struct device *dev, struct iommu_domain *domain); +static void mock_dev_disable_iopf(struct device *dev, struct iommu_domain *domain); + /* * Syzkaller has trouble randomizing the correct iova to use since it is linked * to the map ioctl's output, and it has no idea about that. So, simplify things. 
@@ -120,27 +113,84 @@ void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, } struct mock_iommu_domain { + union { + struct iommu_domain domain; + struct pt_iommu iommu; + struct pt_iommu_amdv1 amdv1; + }; unsigned long flags; - struct iommu_domain domain; - struct xarray pfns; }; +PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, amdv1.iommu, domain); + +static inline struct mock_iommu_domain * +to_mock_domain(struct iommu_domain *domain) +{ + return container_of(domain, struct mock_iommu_domain, domain); +} struct mock_iommu_domain_nested { struct iommu_domain domain; - struct mock_iommu_domain *parent; + struct mock_viommu *mock_viommu; u32 iotlb[MOCK_NESTED_DOMAIN_IOTLB_NUM]; }; +static inline struct mock_iommu_domain_nested * +to_mock_nested(struct iommu_domain *domain) +{ + return container_of(domain, struct mock_iommu_domain_nested, domain); +} + +struct mock_viommu { + struct iommufd_viommu core; + struct mock_iommu_domain *s2_parent; + struct mock_hw_queue *hw_queue[IOMMU_TEST_HW_QUEUE_MAX]; + struct mutex queue_mutex; + + unsigned long mmap_offset; + u32 *page; /* Mmap page to test u32 type of in_data */ +}; + +static inline struct mock_viommu *to_mock_viommu(struct iommufd_viommu *viommu) +{ + return container_of(viommu, struct mock_viommu, core); +} + +struct mock_hw_queue { + struct iommufd_hw_queue core; + struct mock_viommu *mock_viommu; + struct mock_hw_queue *prev; + u16 index; +}; + +static inline struct mock_hw_queue * +to_mock_hw_queue(struct iommufd_hw_queue *hw_queue) +{ + return container_of(hw_queue, struct mock_hw_queue, core); +} + enum selftest_obj_type { TYPE_IDEV, }; struct mock_dev { struct device dev; + struct mock_viommu *viommu; + struct rw_semaphore viommu_rwsem; unsigned long flags; + unsigned long vdev_id; int id; + u32 cache[MOCK_DEV_CACHE_NUM]; + atomic_t pasid_1024_fake_error; + unsigned int iopf_refcount; + struct iommu_domain *domain; }; +static inline struct mock_dev *to_mock_dev(struct device *dev) +{ + return container_of(dev, struct mock_dev, dev); +} + struct selftest_obj { struct iommufd_object obj; enum selftest_obj_type type; @@ -154,19 +204,94 @@ struct selftest_obj { }; }; +static inline struct selftest_obj *to_selftest_obj(struct iommufd_object *obj) +{ + return container_of(obj, struct selftest_obj, obj); +} + static int mock_domain_nop_attach(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + struct mock_dev *mdev = to_mock_dev(dev); + struct mock_viommu *new_viommu = NULL; + unsigned long vdev_id = 0; + int rc; if (domain->dirty_ops && (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY)) return -EINVAL; + iommu_group_mutex_assert(dev); + if (domain->type == IOMMU_DOMAIN_NESTED) { + new_viommu = to_mock_nested(domain)->mock_viommu; + if (new_viommu) { + rc = iommufd_viommu_get_vdev_id(&new_viommu->core, dev, + &vdev_id); + if (rc) + return rc; + } + } + if (new_viommu != mdev->viommu) { + down_write(&mdev->viommu_rwsem); + mdev->viommu = new_viommu; + mdev->vdev_id = vdev_id; + up_write(&mdev->viommu_rwsem); + } + + rc = mock_dev_enable_iopf(dev, domain); + if (rc) + return rc; + + mock_dev_disable_iopf(dev, mdev->domain); + mdev->domain = domain; + + return 0; +} + +static int mock_domain_set_dev_pasid_nop(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + struct mock_dev *mdev = to_mock_dev(dev); + int rc; + 
+ /* + * Per the first attach with pasid 1024, set the + * mdev->pasid_1024_fake_error. Hence the second call of this op + * can fake an error to validate the error path of the core. This + * is helpful to test the case in which the iommu core needs to + * rollback to the old domain due to driver failure. e.g. replace. + * User should be careful about the third call of this op, it shall + * succeed since the mdev->pasid_1024_fake_error is cleared in the + * second call. + */ + if (pasid == 1024) { + if (domain->type == IOMMU_DOMAIN_BLOCKED) { + atomic_set(&mdev->pasid_1024_fake_error, 0); + } else if (atomic_read(&mdev->pasid_1024_fake_error)) { + /* + * Clear the flag, and fake an error to fail the + * replacement. + */ + atomic_set(&mdev->pasid_1024_fake_error, 0); + return -ENOMEM; + } else { + /* Set the flag to fake an error in next call */ + atomic_set(&mdev->pasid_1024_fake_error, 1); + } + } + + rc = mock_dev_enable_iopf(dev, domain); + if (rc) + return rc; + + mock_dev_disable_iopf(dev, old); + return 0; } static const struct iommu_domain_ops mock_blocking_ops = { .attach_dev = mock_domain_nop_attach, + .set_dev_pasid = mock_domain_set_dev_pasid_nop }; static struct iommu_domain mock_blocking_domain = { @@ -174,10 +299,15 @@ static struct iommu_domain mock_blocking_domain = { .ops = &mock_blocking_ops, }; -static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type) +static void *mock_domain_hw_info(struct device *dev, u32 *length, + enum iommu_hw_info_type *type) { struct iommu_test_hw_info *info; + if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && + *type != IOMMU_HW_INFO_TYPE_SELFTEST) + return ERR_PTR(-EOPNOTSUPP); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); @@ -192,8 +322,7 @@ static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type) static int mock_domain_set_dirty_tracking(struct iommu_domain *domain, bool enable) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); + struct mock_iommu_domain *mock = to_mock_domain(domain); unsigned long flags = mock->flags; if (enable && !domain->dirty_ops) @@ -209,309 +338,540 @@ static int mock_domain_set_dirty_tracking(struct iommu_domain *domain, return 0; } -static bool mock_test_and_clear_dirty(struct mock_iommu_domain *mock, - unsigned long iova, size_t page_size, - unsigned long flags) +static struct mock_iommu_domain_nested * +__mock_domain_alloc_nested(const struct iommu_user_data *user_data) { - unsigned long cur, end = iova + page_size - 1; - bool dirty = false; - void *ent, *old; - - for (cur = iova; cur < end; cur += MOCK_IO_PAGE_SIZE) { - ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE); - if (!ent || !(xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA)) - continue; + struct mock_iommu_domain_nested *mock_nested; + struct iommu_hwpt_selftest user_cfg; + int rc, i; - dirty = true; - /* Clear dirty */ - if (!(flags & IOMMU_DIRTY_NO_CLEAR)) { - unsigned long val; + if (user_data->type != IOMMU_HWPT_DATA_SELFTEST) + return ERR_PTR(-EOPNOTSUPP); - val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA; - old = xa_store(&mock->pfns, cur / MOCK_IO_PAGE_SIZE, - xa_mk_value(val), GFP_KERNEL); - WARN_ON_ONCE(ent != old); - } - } + rc = iommu_copy_struct_from_user(&user_cfg, user_data, + IOMMU_HWPT_DATA_SELFTEST, iotlb); + if (rc) + return ERR_PTR(rc); - return dirty; + mock_nested = kzalloc(sizeof(*mock_nested), GFP_KERNEL); + if (!mock_nested) + return ERR_PTR(-ENOMEM); + mock_nested->domain.ops = &domain_nested_ops; + mock_nested->domain.type = 
IOMMU_DOMAIN_NESTED; + for (i = 0; i < MOCK_NESTED_DOMAIN_IOTLB_NUM; i++) + mock_nested->iotlb[i] = user_cfg.iotlb; + return mock_nested; } -static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, - unsigned long iova, size_t size, - unsigned long flags, - struct iommu_dirty_bitmap *dirty) +static struct iommu_domain * +mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, + u32 flags, const struct iommu_user_data *user_data) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); - unsigned long end = iova + size; - void *ent; + struct mock_iommu_domain_nested *mock_nested; + struct mock_iommu_domain *mock_parent; - if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap) - return -EINVAL; + if (flags & ~IOMMU_HWPT_ALLOC_PASID) + return ERR_PTR(-EOPNOTSUPP); + if (!parent || !(parent->type & __IOMMU_DOMAIN_PAGING)) + return ERR_PTR(-EINVAL); - do { - unsigned long pgsize = MOCK_IO_PAGE_SIZE; - unsigned long head; + mock_parent = to_mock_domain(parent); + if (!mock_parent) + return ERR_PTR(-EINVAL); - ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); - if (!ent) { - iova += pgsize; - continue; - } + mock_nested = __mock_domain_alloc_nested(user_data); + if (IS_ERR(mock_nested)) + return ERR_CAST(mock_nested); + return &mock_nested->domain; +} - if (xa_to_value(ent) & MOCK_PFN_HUGE_IOVA) - pgsize = MOCK_HUGE_PAGE_SIZE; - head = iova & ~(pgsize - 1); +static void mock_domain_free(struct iommu_domain *domain) +{ + struct mock_iommu_domain *mock = to_mock_domain(domain); - /* Clear dirty */ - if (mock_test_and_clear_dirty(mock, head, pgsize, flags)) - iommu_dirty_bitmap_record(dirty, head, pgsize); - iova = head + pgsize; - } while (iova < end); + pt_iommu_deinit(&mock->iommu); + kfree(mock); +} - return 0; +static void mock_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) +{ + iommu_put_pages_list(&gather->freelist); } -const struct iommu_dirty_ops dirty_ops = { +static const struct iommu_domain_ops amdv1_mock_ops = { + IOMMU_PT_DOMAIN_OPS(amdv1_mock), + .free = mock_domain_free, + .attach_dev = mock_domain_nop_attach, + .set_dev_pasid = mock_domain_set_dev_pasid_nop, + .iotlb_sync = &mock_iotlb_sync, +}; + +static const struct iommu_domain_ops amdv1_mock_huge_ops = { + IOMMU_PT_DOMAIN_OPS(amdv1_mock), + .free = mock_domain_free, + .attach_dev = mock_domain_nop_attach, + .set_dev_pasid = mock_domain_set_dev_pasid_nop, + .iotlb_sync = &mock_iotlb_sync, +}; +#undef pt_iommu_amdv1_mock_map_pages + +static const struct iommu_dirty_ops amdv1_mock_dirty_ops = { + IOMMU_PT_DIRTY_OPS(amdv1_mock), .set_dirty_tracking = mock_domain_set_dirty_tracking, - .read_and_clear_dirty = mock_domain_read_and_clear_dirty, }; -static struct iommu_domain *mock_domain_alloc_paging(struct device *dev) +static const struct iommu_domain_ops amdv1_ops = { + IOMMU_PT_DOMAIN_OPS(amdv1), + .free = mock_domain_free, + .attach_dev = mock_domain_nop_attach, + .set_dev_pasid = mock_domain_set_dev_pasid_nop, + .iotlb_sync = &mock_iotlb_sync, +}; + +static const struct iommu_dirty_ops amdv1_dirty_ops = { + IOMMU_PT_DIRTY_OPS(amdv1), + .set_dirty_tracking = mock_domain_set_dirty_tracking, +}; + +static struct mock_iommu_domain * +mock_domain_alloc_pgtable(struct device *dev, + const struct iommu_hwpt_selftest *user_cfg, u32 flags) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); struct mock_iommu_domain *mock; + int rc; mock = kzalloc(sizeof(*mock), GFP_KERNEL); if (!mock) - return NULL; - 
mock->domain.geometry.aperture_start = MOCK_APERTURE_START; - mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; - mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; - if (dev && mdev->flags & MOCK_FLAGS_DEVICE_HUGE_IOVA) - mock->domain.pgsize_bitmap |= MOCK_HUGE_PAGE_SIZE; - mock->domain.ops = mock_ops.default_domain_ops; + return ERR_PTR(-ENOMEM); mock->domain.type = IOMMU_DOMAIN_UNMANAGED; - xa_init(&mock->pfns); - return &mock->domain; -} -static struct iommu_domain * -__mock_domain_alloc_nested(struct mock_iommu_domain *mock_parent, - const struct iommu_hwpt_selftest *user_cfg) -{ - struct mock_iommu_domain_nested *mock_nested; - int i; + mock->amdv1.iommu.nid = NUMA_NO_NODE; + + switch (user_cfg->pagetable_type) { + case MOCK_IOMMUPT_DEFAULT: + case MOCK_IOMMUPT_HUGE: { + struct pt_iommu_amdv1_cfg cfg = {}; + + /* The mock version has a 2k page size */ + cfg.common.hw_max_vasz_lg2 = 56; + cfg.common.hw_max_oasz_lg2 = 51; + cfg.starting_level = 2; + if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE) + mock->domain.ops = &amdv1_mock_huge_ops; + else + mock->domain.ops = &amdv1_mock_ops; + rc = pt_iommu_amdv1_mock_init(&mock->amdv1, &cfg, GFP_KERNEL); + if (rc) + goto err_free; + + /* + * In huge mode userspace should only provide huge pages, but + * we have to include PAGE_SIZE for the domain to be accepted + * by iommufd. + */ + if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE) + mock->domain.pgsize_bitmap = MOCK_HUGE_PAGE_SIZE | + PAGE_SIZE; + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + mock->domain.dirty_ops = &amdv1_mock_dirty_ops; + break; + } - mock_nested = kzalloc(sizeof(*mock_nested), GFP_KERNEL); - if (!mock_nested) - return ERR_PTR(-ENOMEM); - mock_nested->parent = mock_parent; - mock_nested->domain.ops = &domain_nested_ops; - mock_nested->domain.type = IOMMU_DOMAIN_NESTED; - for (i = 0; i < MOCK_NESTED_DOMAIN_IOTLB_NUM; i++) - mock_nested->iotlb[i] = user_cfg->iotlb; - return &mock_nested->domain; + case MOCK_IOMMUPT_AMDV1: { + struct pt_iommu_amdv1_cfg cfg = {}; + + cfg.common.hw_max_vasz_lg2 = 64; + cfg.common.hw_max_oasz_lg2 = 52; + cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) | + BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE); + cfg.starting_level = 2; + mock->domain.ops = &amdv1_ops; + rc = pt_iommu_amdv1_init(&mock->amdv1, &cfg, GFP_KERNEL); + if (rc) + goto err_free; + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + mock->domain.dirty_ops = &amdv1_dirty_ops; + break; + } + default: + rc = -EOPNOTSUPP; + goto err_free; + } + + /* + * Override the real aperture with the MOCK aperture for test purposes. 
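+ * The selftests expect a fixed IOVA window, so shrink the aperture + * reported by the page table code down to it. The WARN_ONs below check + * that the real aperture does cover the mock one.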
+ */ + if (user_cfg->pagetable_type == MOCK_IOMMUPT_DEFAULT) { + WARN_ON(mock->domain.geometry.aperture_start != 0); + WARN_ON(mock->domain.geometry.aperture_end < MOCK_APERTURE_LAST); + + mock->domain.geometry.aperture_start = MOCK_APERTURE_START; + mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; + } + + return mock; +err_free: + kfree(mock); + return ERR_PTR(rc); } static struct iommu_domain * -mock_domain_alloc_user(struct device *dev, u32 flags, - struct iommu_domain *parent, - const struct iommu_user_data *user_data) +mock_domain_alloc_paging_flags(struct device *dev, u32 flags, + const struct iommu_user_data *user_data) { - struct mock_iommu_domain *mock_parent; - struct iommu_hwpt_selftest user_cfg; + bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_ALLOC_PASID; + struct mock_dev *mdev = to_mock_dev(dev); + bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; + struct iommu_hwpt_selftest user_cfg = {}; + struct mock_iommu_domain *mock; int rc; - /* must be mock_domain */ - if (!parent) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); - bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; - bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; - struct iommu_domain *domain; - - if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT | - IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) - return ERR_PTR(-EOPNOTSUPP); - if (user_data || (has_dirty_flag && no_dirty_ops)) - return ERR_PTR(-EOPNOTSUPP); - domain = mock_domain_alloc_paging(dev); - if (!domain) - return ERR_PTR(-ENOMEM); - if (has_dirty_flag) - container_of(domain, struct mock_iommu_domain, domain) - ->domain.dirty_ops = &dirty_ops; - return domain; - } - - /* must be mock_domain_nested */ - if (user_data->type != IOMMU_HWPT_DATA_SELFTEST || flags) + if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops)) return ERR_PTR(-EOPNOTSUPP); - if (!parent || parent->ops != mock_ops.default_domain_ops) - return ERR_PTR(-EINVAL); - mock_parent = container_of(parent, struct mock_iommu_domain, domain); - if (!mock_parent) - return ERR_PTR(-EINVAL); + if (user_data && (user_data->type != IOMMU_HWPT_DATA_SELFTEST && + user_data->type != IOMMU_HWPT_DATA_NONE)) + return ERR_PTR(-EOPNOTSUPP); - rc = iommu_copy_struct_from_user(&user_cfg, user_data, - IOMMU_HWPT_DATA_SELFTEST, iotlb); - if (rc) - return ERR_PTR(rc); + if (user_data) { + rc = iommu_copy_struct_from_user( + &user_cfg, user_data, IOMMU_HWPT_DATA_SELFTEST, iotlb); + if (rc) + return ERR_PTR(rc); + } - return __mock_domain_alloc_nested(mock_parent, &user_cfg); + mock = mock_domain_alloc_pgtable(dev, &user_cfg, flags); + if (IS_ERR(mock)) + return ERR_CAST(mock); + return &mock->domain; } -static void mock_domain_free(struct iommu_domain *domain) +static bool mock_domain_capable(struct device *dev, enum iommu_cap cap) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); + struct mock_dev *mdev = to_mock_dev(dev); - WARN_ON(!xa_empty(&mock->pfns)); - kfree(mock); + switch (cap) { + case IOMMU_CAP_CACHE_COHERENCY: + return true; + case IOMMU_CAP_DIRTY_TRACKING: + return !(mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY); + default: + break; + } + + return false; } -static int mock_domain_map_pages(struct iommu_domain *domain, - unsigned long iova, phys_addr_t paddr, - size_t pgsize, size_t pgcount, int prot, - gfp_t gfp, size_t *mapped) +static struct iopf_queue *mock_iommu_iopf_queue; + 
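+/* + * A single mock IOMMU instance backs every mock device. Each mock vIOMMU + * takes a users reference; together with the completion this lets + * iommufd_test_wait_for_users() wait at module unload until all vIOMMU + * objects have been destroyed. + */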
+static struct mock_iommu_device { + struct iommu_device iommu_dev; + struct completion complete; + refcount_t users; +} mock_iommu; + +static struct iommu_device *mock_probe_device(struct device *dev) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); - unsigned long flags = MOCK_PFN_START_IOVA; - unsigned long start_iova = iova; + if (dev->bus != &iommufd_mock_bus_type.bus) + return ERR_PTR(-ENODEV); + return &mock_iommu.iommu_dev; +} - /* - * xarray does not reliably work with fault injection because it does a - * retry allocation, so put our own failure point. - */ - if (iommufd_should_fail()) - return -ENOENT; +static void mock_domain_page_response(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg) +{ +} - WARN_ON(iova % MOCK_IO_PAGE_SIZE); - WARN_ON(pgsize % MOCK_IO_PAGE_SIZE); - for (; pgcount; pgcount--) { - size_t cur; +static int mock_dev_enable_iopf(struct device *dev, struct iommu_domain *domain) +{ + struct mock_dev *mdev = to_mock_dev(dev); + int ret; - for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { - void *old; + if (!domain || !domain->iopf_handler) + return 0; - if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) - flags = MOCK_PFN_LAST_IOVA; - if (pgsize != MOCK_IO_PAGE_SIZE) { - flags |= MOCK_PFN_HUGE_IOVA; - } - old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE, - xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) | - flags), - gfp); - if (xa_is_err(old)) { - for (; start_iova != iova; - start_iova += MOCK_IO_PAGE_SIZE) - xa_erase(&mock->pfns, - start_iova / - MOCK_IO_PAGE_SIZE); - return xa_err(old); - } - WARN_ON(old); - iova += MOCK_IO_PAGE_SIZE; - paddr += MOCK_IO_PAGE_SIZE; - *mapped += MOCK_IO_PAGE_SIZE; - flags = 0; - } + if (!mock_iommu_iopf_queue) + return -ENODEV; + + if (mdev->iopf_refcount) { + mdev->iopf_refcount++; + return 0; } + + ret = iopf_queue_add_device(mock_iommu_iopf_queue, dev); + if (ret) + return ret; + + mdev->iopf_refcount = 1; + return 0; } -static size_t mock_domain_unmap_pages(struct iommu_domain *domain, - unsigned long iova, size_t pgsize, - size_t pgcount, - struct iommu_iotlb_gather *iotlb_gather) +static void mock_dev_disable_iopf(struct device *dev, struct iommu_domain *domain) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); - bool first = true; - size_t ret = 0; - void *ent; + struct mock_dev *mdev = to_mock_dev(dev); - WARN_ON(iova % MOCK_IO_PAGE_SIZE); - WARN_ON(pgsize % MOCK_IO_PAGE_SIZE); + if (!domain || !domain->iopf_handler) + return; - for (; pgcount; pgcount--) { - size_t cur; + if (--mdev->iopf_refcount) + return; - for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { - ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); + iopf_queue_remove_device(mock_iommu_iopf_queue, dev); +} - /* - * iommufd generates unmaps that must be a strict - * superset of the map's performend So every - * starting/ending IOVA should have been an iova passed - * to map. - * - * This simple logic doesn't work when the HUGE_PAGE is - * turned on since the core code will automatically - * switch between the two page sizes creating a break in - * the unmap calls. The break can land in the middle of - * contiguous IOVA. 
- */ - if (!(domain->pgsize_bitmap & MOCK_HUGE_PAGE_SIZE)) { - if (first) { - WARN_ON(ent && !(xa_to_value(ent) & - MOCK_PFN_START_IOVA)); - first = false; - } - if (pgcount == 1 && - cur + MOCK_IO_PAGE_SIZE == pgsize) - WARN_ON(ent && !(xa_to_value(ent) & - MOCK_PFN_LAST_IOVA)); - } +static void mock_viommu_destroy(struct iommufd_viommu *viommu) +{ + struct mock_iommu_device *mock_iommu = container_of( + viommu->iommu_dev, struct mock_iommu_device, iommu_dev); + struct mock_viommu *mock_viommu = to_mock_viommu(viommu); + + if (refcount_dec_and_test(&mock_iommu->users)) + complete(&mock_iommu->complete); + if (mock_viommu->mmap_offset) + iommufd_viommu_destroy_mmap(&mock_viommu->core, + mock_viommu->mmap_offset); + free_page((unsigned long)mock_viommu->page); + mutex_destroy(&mock_viommu->queue_mutex); + + /* iommufd core frees mock_viommu and viommu */ +} + +static struct iommu_domain * +mock_viommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, + const struct iommu_user_data *user_data) +{ + struct mock_viommu *mock_viommu = to_mock_viommu(viommu); + struct mock_iommu_domain_nested *mock_nested; + + if (flags & ~IOMMU_HWPT_ALLOC_PASID) + return ERR_PTR(-EOPNOTSUPP); + + mock_nested = __mock_domain_alloc_nested(user_data); + if (IS_ERR(mock_nested)) + return ERR_CAST(mock_nested); + mock_nested->mock_viommu = mock_viommu; + return &mock_nested->domain; +} + +static int mock_viommu_cache_invalidate(struct iommufd_viommu *viommu, + struct iommu_user_data_array *array) +{ + struct iommu_viommu_invalidate_selftest *cmds; + struct iommu_viommu_invalidate_selftest *cur; + struct iommu_viommu_invalidate_selftest *end; + int rc; + + /* A zero-length array is allowed to validate the array type */ + if (array->entry_num == 0 && + array->type == IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST) { + array->entry_num = 0; + return 0; + } + + cmds = kcalloc(array->entry_num, sizeof(*cmds), GFP_KERNEL); + if (!cmds) + return -ENOMEM; + cur = cmds; + end = cmds + array->entry_num; + + static_assert(sizeof(*cmds) == 3 * sizeof(u32)); + rc = iommu_copy_struct_from_full_user_array( + cmds, sizeof(*cmds), array, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST); + if (rc) + goto out; + + while (cur != end) { + struct mock_dev *mdev; + struct device *dev; + int i; - iova += MOCK_IO_PAGE_SIZE; - ret += MOCK_IO_PAGE_SIZE; + if (cur->flags & ~IOMMU_TEST_INVALIDATE_FLAG_ALL) { + rc = -EOPNOTSUPP; + goto out; + } + + if (cur->cache_id > MOCK_DEV_CACHE_ID_MAX) { + rc = -EINVAL; + goto out; + } + + xa_lock(&viommu->vdevs); + dev = iommufd_viommu_find_dev(viommu, + (unsigned long)cur->vdev_id); + if (!dev) { + xa_unlock(&viommu->vdevs); + rc = -EINVAL; + goto out; + } + mdev = container_of(dev, struct mock_dev, dev); + + if (cur->flags & IOMMU_TEST_INVALIDATE_FLAG_ALL) { + /* Invalidate all cache entries and ignore cache_id */ + for (i = 0; i < MOCK_DEV_CACHE_NUM; i++) + mdev->cache[i] = 0; + } else { + mdev->cache[cur->cache_id] = 0; } + xa_unlock(&viommu->vdevs); + + cur++; } - return ret; +out: + array->entry_num = cur - cmds; + kfree(cmds); + return rc; } -static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain, - dma_addr_t iova) +static size_t mock_viommu_get_hw_queue_size(struct iommufd_viommu *viommu, + enum iommu_hw_queue_type queue_type) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); - void *ent; + if (queue_type != IOMMU_HW_QUEUE_TYPE_SELFTEST) + return 0; + return HW_QUEUE_STRUCT_SIZE(struct mock_hw_queue, core); +} - WARN_ON(iova % 
MOCK_IO_PAGE_SIZE); - ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); - WARN_ON(!ent); - return (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE; +static void mock_hw_queue_destroy(struct iommufd_hw_queue *hw_queue) +{ + struct mock_hw_queue *mock_hw_queue = to_mock_hw_queue(hw_queue); + struct mock_viommu *mock_viommu = mock_hw_queue->mock_viommu; + + mutex_lock(&mock_viommu->queue_mutex); + mock_viommu->hw_queue[mock_hw_queue->index] = NULL; + if (mock_hw_queue->prev) + iommufd_hw_queue_undepend(mock_hw_queue, mock_hw_queue->prev, + core); + mutex_unlock(&mock_viommu->queue_mutex); } -static bool mock_domain_capable(struct device *dev, enum iommu_cap cap) +/* Test iommufd_hw_queue_depend/undepend() */ +static int mock_hw_queue_init_phys(struct iommufd_hw_queue *hw_queue, u32 index, + phys_addr_t base_addr_pa) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + struct mock_viommu *mock_viommu = to_mock_viommu(hw_queue->viommu); + struct mock_hw_queue *mock_hw_queue = to_mock_hw_queue(hw_queue); + struct mock_hw_queue *prev = NULL; + int rc = 0; - switch (cap) { - case IOMMU_CAP_CACHE_COHERENCY: - return true; - case IOMMU_CAP_DIRTY_TRACKING: - return !(mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY); - default: - break; + if (index >= IOMMU_TEST_HW_QUEUE_MAX) + return -EINVAL; + + mutex_lock(&mock_viommu->queue_mutex); + + if (mock_viommu->hw_queue[index]) { + rc = -EEXIST; + goto unlock; } - return false; + if (index) { + prev = mock_viommu->hw_queue[index - 1]; + if (!prev) { + rc = -EIO; + goto unlock; + } + } + + /* + * Test to catch a kernel bug if the core converted the physical address + * incorrectly. Let mock_domain_iova_to_phys() WARN_ON if it fails. + */ + if (base_addr_pa != iommu_iova_to_phys(&mock_viommu->s2_parent->domain, + hw_queue->base_addr)) { + rc = -EFAULT; + goto unlock; + } + + if (prev) { + rc = iommufd_hw_queue_depend(mock_hw_queue, prev, core); + if (rc) + goto unlock; + } + + mock_hw_queue->prev = prev; + mock_hw_queue->mock_viommu = mock_viommu; + mock_viommu->hw_queue[index] = mock_hw_queue; + + hw_queue->destroy = &mock_hw_queue_destroy; +unlock: + mutex_unlock(&mock_viommu->queue_mutex); + return rc; } -static struct iommu_device mock_iommu_device = { +static struct iommufd_viommu_ops mock_viommu_ops = { + .destroy = mock_viommu_destroy, + .alloc_domain_nested = mock_viommu_alloc_domain_nested, + .cache_invalidate = mock_viommu_cache_invalidate, + .get_hw_queue_size = mock_viommu_get_hw_queue_size, + .hw_queue_init_phys = mock_hw_queue_init_phys, }; -static struct iommu_device *mock_probe_device(struct device *dev) +static size_t mock_get_viommu_size(struct device *dev, + enum iommu_viommu_type viommu_type) { - if (dev->bus != &iommufd_mock_bus_type.bus) - return ERR_PTR(-ENODEV); - return &mock_iommu_device; + if (viommu_type != IOMMU_VIOMMU_TYPE_SELFTEST) + return 0; + return VIOMMU_STRUCT_SIZE(struct mock_viommu, core); +} + +static int mock_viommu_init(struct iommufd_viommu *viommu, + struct iommu_domain *parent_domain, + const struct iommu_user_data *user_data) +{ + struct mock_iommu_device *mock_iommu = container_of( + viommu->iommu_dev, struct mock_iommu_device, iommu_dev); + struct mock_viommu *mock_viommu = to_mock_viommu(viommu); + struct iommu_viommu_selftest data; + int rc; + + if (user_data) { + rc = iommu_copy_struct_from_user( + &data, user_data, IOMMU_VIOMMU_TYPE_SELFTEST, out_data); + if (rc) + return rc; + + /* Allocate two pages */ + mock_viommu->page = + (u32 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); + if 
(!mock_viommu->page) + return -ENOMEM; + + rc = iommufd_viommu_alloc_mmap(&mock_viommu->core, + __pa(mock_viommu->page), + PAGE_SIZE * 2, + &mock_viommu->mmap_offset); + if (rc) + goto err_free_page; + + /* For loopback tests on both the page and out_data */ + *mock_viommu->page = data.in_data; + data.out_data = data.in_data; + data.out_mmap_length = PAGE_SIZE * 2; + data.out_mmap_offset = mock_viommu->mmap_offset; + rc = iommu_copy_struct_to_user( + user_data, &data, IOMMU_VIOMMU_TYPE_SELFTEST, out_data); + if (rc) + goto err_destroy_mmap; + } + + refcount_inc(&mock_iommu->users); + mutex_init(&mock_viommu->queue_mutex); + mock_viommu->s2_parent = to_mock_domain(parent_domain); + + viommu->ops = &mock_viommu_ops; + return 0; + +err_destroy_mmap: + iommufd_viommu_destroy_mmap(&mock_viommu->core, + mock_viommu->mmap_offset); +err_free_page: + free_page((unsigned long)mock_viommu->page); + return rc; } static const struct iommu_ops mock_ops = { @@ -522,37 +882,28 @@ static const struct iommu_ops mock_ops = { .default_domain = &mock_blocking_domain, .blocked_domain = &mock_blocking_domain, .owner = THIS_MODULE, - .pgsize_bitmap = MOCK_IO_PAGE_SIZE, .hw_info = mock_domain_hw_info, - .domain_alloc_paging = mock_domain_alloc_paging, - .domain_alloc_user = mock_domain_alloc_user, + .domain_alloc_paging_flags = mock_domain_alloc_paging_flags, + .domain_alloc_nested = mock_domain_alloc_nested, .capable = mock_domain_capable, .device_group = generic_device_group, .probe_device = mock_probe_device, - .default_domain_ops = - &(struct iommu_domain_ops){ - .free = mock_domain_free, - .attach_dev = mock_domain_nop_attach, - .map_pages = mock_domain_map_pages, - .unmap_pages = mock_domain_unmap_pages, - .iova_to_phys = mock_domain_iova_to_phys, - }, + .page_response = mock_domain_page_response, + .user_pasid_table = true, + .get_viommu_size = mock_get_viommu_size, + .viommu_init = mock_viommu_init, }; static void mock_domain_free_nested(struct iommu_domain *domain) { - struct mock_iommu_domain_nested *mock_nested = - container_of(domain, struct mock_iommu_domain_nested, domain); - - kfree(mock_nested); + kfree(to_mock_nested(domain)); } static int mock_domain_cache_invalidate_user(struct iommu_domain *domain, struct iommu_user_data_array *array) { - struct mock_iommu_domain_nested *mock_nested = - container_of(domain, struct mock_iommu_domain_nested, domain); + struct mock_iommu_domain_nested *mock_nested = to_mock_nested(domain); struct iommu_hwpt_invalidate_selftest inv; u32 processed = 0; int i = 0, j; @@ -600,6 +951,7 @@ static struct iommu_domain_ops domain_nested_ops = { .free = mock_domain_free_nested, .attach_dev = mock_domain_nop_attach, .cache_invalidate_user = mock_domain_cache_invalidate_user, + .set_dev_pasid = mock_domain_set_dev_pasid_nop, }; static inline struct iommufd_hw_pagetable * @@ -623,11 +975,11 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, if (IS_ERR(hwpt)) return hwpt; if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED || - hwpt->domain->ops != mock_ops.default_domain_ops) { + hwpt->domain->owner != &mock_ops) { iommufd_put_object(ucmd->ictx, &hwpt->obj); return ERR_PTR(-EINVAL); } - *mock = container_of(hwpt->domain, struct mock_iommu_domain, domain); + *mock = to_mock_domain(hwpt->domain); return hwpt; } @@ -645,14 +997,13 @@ get_md_pagetable_nested(struct iommufd_ucmd *ucmd, u32 mockpt_id, iommufd_put_object(ucmd->ictx, &hwpt->obj); return ERR_PTR(-EINVAL); } - *mock_nested = container_of(hwpt->domain, - struct mock_iommu_domain_nested, domain); + *mock_nested = 
to_mock_nested(hwpt->domain); return hwpt; } static void mock_dev_release(struct device *dev) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + struct mock_dev *mdev = to_mock_dev(dev); ida_free(&mock_dev_ida, mdev->id); kfree(mdev); @@ -660,21 +1011,29 @@ static void mock_dev_release(struct device *dev) static struct mock_dev *mock_dev_create(unsigned long dev_flags) { + struct property_entry prop[] = { + PROPERTY_ENTRY_U32("pasid-num-bits", 0), + {}, + }; + const u32 valid_flags = MOCK_FLAGS_DEVICE_NO_DIRTY | + MOCK_FLAGS_DEVICE_PASID; struct mock_dev *mdev; - int rc; + int rc, i; - if (dev_flags & - ~(MOCK_FLAGS_DEVICE_NO_DIRTY | MOCK_FLAGS_DEVICE_HUGE_IOVA)) + if (dev_flags & ~valid_flags) return ERR_PTR(-EINVAL); mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); if (!mdev) return ERR_PTR(-ENOMEM); + init_rwsem(&mdev->viommu_rwsem); device_initialize(&mdev->dev); mdev->flags = dev_flags; mdev->dev.release = mock_dev_release; mdev->dev.bus = &iommufd_mock_bus_type.bus; + for (i = 0; i < MOCK_DEV_CACHE_NUM; i++) + mdev->cache[i] = IOMMU_TEST_DEV_CACHE_DEFAULT; rc = ida_alloc(&mock_dev_ida, GFP_KERNEL); if (rc < 0) @@ -685,7 +1044,16 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags) if (rc) goto err_put; - rc = device_add(&mdev->dev); + if (dev_flags & MOCK_FLAGS_DEVICE_PASID) + prop[0] = PROPERTY_ENTRY_U32("pasid-num-bits", MOCK_PASID_WIDTH); + + rc = device_create_managed_software_node(&mdev->dev, prop, NULL); + if (rc) { + dev_err(&mdev->dev, "add pasid-num-bits property failed, rc: %d", rc); + goto err_put; + } + + rc = iommu_mock_device_add(&mdev->dev, &mock_iommu.iommu_dev); if (rc) goto err_put; return mdev; @@ -740,7 +1108,7 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, } sobj->idev.idev = idev; - rc = iommufd_device_attach(idev, &pt_id); + rc = iommufd_device_attach(idev, IOMMU_NO_PASID, &pt_id); if (rc) goto out_unbind; @@ -755,7 +1123,7 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, return 0; out_detach: - iommufd_device_detach(idev); + iommufd_device_detach(idev, IOMMU_NO_PASID); out_unbind: iommufd_device_unbind(idev); out_mdev: @@ -765,39 +1133,49 @@ out_sobj: return rc; } -/* Replace the mock domain with a manually allocated hw_pagetable */ -static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd, - unsigned int device_id, u32 pt_id, - struct iommu_test_cmd *cmd) +static struct selftest_obj * +iommufd_test_get_selftest_obj(struct iommufd_ctx *ictx, u32 id) { struct iommufd_object *dev_obj; struct selftest_obj *sobj; - int rc; /* * Prefer to use the OBJ_SELFTEST because the destroy_rwsem will ensure * it doesn't race with detach, which is not allowed. 
*/ - dev_obj = - iommufd_get_object(ucmd->ictx, device_id, IOMMUFD_OBJ_SELFTEST); + dev_obj = iommufd_get_object(ictx, id, IOMMUFD_OBJ_SELFTEST); if (IS_ERR(dev_obj)) - return PTR_ERR(dev_obj); + return ERR_CAST(dev_obj); - sobj = container_of(dev_obj, struct selftest_obj, obj); + sobj = to_selftest_obj(dev_obj); if (sobj->type != TYPE_IDEV) { - rc = -EINVAL; - goto out_dev_obj; + iommufd_put_object(ictx, dev_obj); + return ERR_PTR(-EINVAL); } + return sobj; +} - rc = iommufd_device_replace(sobj->idev.idev, &pt_id); +/* Replace the mock domain with a manually allocated hw_pagetable */ +static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd, + unsigned int device_id, u32 pt_id, + struct iommu_test_cmd *cmd) +{ + struct selftest_obj *sobj; + int rc; + + sobj = iommufd_test_get_selftest_obj(ucmd->ictx, device_id); + if (IS_ERR(sobj)) + return PTR_ERR(sobj); + + rc = iommufd_device_replace(sobj->idev.idev, IOMMU_NO_PASID, &pt_id); if (rc) - goto out_dev_obj; + goto out_sobj; cmd->mock_domain_replace.pt_id = pt_id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); -out_dev_obj: - iommufd_put_object(ucmd->ictx, dev_obj); +out_sobj: + iommufd_put_object(ucmd->ictx, &sobj->obj); return rc; } @@ -826,23 +1204,25 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd, { struct iommufd_hw_pagetable *hwpt; struct mock_iommu_domain *mock; + unsigned int page_size; uintptr_t end; int rc; - if (iova % MOCK_IO_PAGE_SIZE || length % MOCK_IO_PAGE_SIZE || - (uintptr_t)uptr % MOCK_IO_PAGE_SIZE || - check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end)) - return -EINVAL; - hwpt = get_md_pagetable(ucmd, mockpt_id, &mock); if (IS_ERR(hwpt)) return PTR_ERR(hwpt); - for (; length; length -= MOCK_IO_PAGE_SIZE) { + page_size = 1 << __ffs(mock->domain.pgsize_bitmap); + if (iova % page_size || length % page_size || + (uintptr_t)uptr % page_size || + check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end)) + return -EINVAL; + + for (; length; length -= page_size) { struct page *pages[1]; + phys_addr_t io_phys; unsigned long pfn; long npages; - void *ent; npages = get_user_pages_fast((uintptr_t)uptr & PAGE_MASK, 1, 0, pages); @@ -857,15 +1237,14 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd, pfn = page_to_pfn(pages[0]); put_page(pages[0]); - ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); - if (!ent || - (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE != - pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) { + io_phys = mock->domain.ops->iova_to_phys(&mock->domain, iova); + if (io_phys != + pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) { rc = -EINVAL; goto out_put; } - iova += MOCK_IO_PAGE_SIZE; - uptr += MOCK_IO_PAGE_SIZE; + iova += page_size; + uptr += page_size; } rc = 0; @@ -909,9 +1288,8 @@ static int iommufd_test_md_check_refs(struct iommufd_ucmd *ucmd, return 0; } -static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd, - u32 mockpt_id, unsigned int iotlb_id, - u32 iotlb) +static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd, u32 mockpt_id, + unsigned int iotlb_id, u32 iotlb) { struct mock_iommu_domain_nested *mock_nested; struct iommufd_hw_pagetable *hwpt; @@ -921,8 +1299,7 @@ static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd, if (IS_ERR(hwpt)) return PTR_ERR(hwpt); - mock_nested = container_of(hwpt->domain, - struct mock_iommu_domain_nested, domain); + mock_nested = to_mock_nested(hwpt->domain); if (iotlb_id > MOCK_NESTED_DOMAIN_IOTLB_ID_MAX || mock_nested->iotlb[iotlb_id] != iotlb) @@ -931,6 +1308,24 @@ 
static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd, return rc; } +static int iommufd_test_dev_check_cache(struct iommufd_ucmd *ucmd, u32 idev_id, + unsigned int cache_id, u32 cache) +{ + struct iommufd_device *idev; + struct mock_dev *mdev; + int rc = 0; + + idev = iommufd_get_device(ucmd, idev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + mdev = container_of(idev->dev, struct mock_dev, dev); + + if (cache_id > MOCK_DEV_CACHE_ID_MAX || mdev->cache[cache_id] != cache) + rc = -EINVAL; + iommufd_put_object(ucmd->ictx, &idev->obj); + return rc; +} + struct selftest_access { struct iommufd_access *access; struct file *file; @@ -1167,7 +1562,7 @@ static int iommufd_test_access_pages(struct iommufd_ucmd *ucmd, int rc; /* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */ - if (length > 16*1024*1024) + if (length > 16 * 1024 * 1024) return -ENOMEM; if (flags & ~(MOCK_FLAGS_ACCESS_WRITE | MOCK_FLAGS_ACCESS_SYZ)) @@ -1184,7 +1579,7 @@ static int iommufd_test_access_pages(struct iommufd_ucmd *ucmd, if (flags & MOCK_FLAGS_ACCESS_SYZ) iova = iommufd_test_syz_conv_iova(staccess->access, - &cmd->access_pages.iova); + &cmd->access_pages.iova); npages = (ALIGN(iova + length, PAGE_SIZE) - ALIGN_DOWN(iova, PAGE_SIZE)) / @@ -1260,7 +1655,7 @@ static int iommufd_test_access_rw(struct iommufd_ucmd *ucmd, int rc; /* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */ - if (length > 16*1024*1024) + if (length > 16 * 1024 * 1024) return -ENOMEM; if (flags & ~(MOCK_ACCESS_RW_WRITE | MOCK_ACCESS_RW_SLOW_PATH | @@ -1286,7 +1681,7 @@ static int iommufd_test_access_rw(struct iommufd_ucmd *ucmd, if (flags & MOCK_FLAGS_ACCESS_SYZ) iova = iommufd_test_syz_conv_iova(staccess->access, - &cmd->access_rw.iova); + &cmd->access_rw.iova); rc = iommufd_access_rw(staccess->access, iova, tmp, length, flags); if (rc) @@ -1313,7 +1708,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, unsigned long page_size, void __user *uptr, u32 flags) { - unsigned long bitmap_size, i, max; + unsigned long i, max; struct iommu_test_cmd *cmd = ucmd->cmd; struct iommufd_hw_pagetable *hwpt; struct mock_iommu_domain *mock; @@ -1328,42 +1723,29 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, if (IS_ERR(hwpt)) return PTR_ERR(hwpt); - if (!(mock->flags & MOCK_DIRTY_TRACK)) { + if (!(mock->flags & MOCK_DIRTY_TRACK) || !mock->iommu.ops->set_dirty) { rc = -EINVAL; goto out_put; } max = length / page_size; - bitmap_size = max / BITS_PER_BYTE; - - tmp = kvzalloc(bitmap_size, GFP_KERNEL_ACCOUNT); + tmp = kvzalloc(DIV_ROUND_UP(max, BITS_PER_LONG) * sizeof(unsigned long), + GFP_KERNEL_ACCOUNT); if (!tmp) { rc = -ENOMEM; goto out_put; } - if (copy_from_user(tmp, uptr, bitmap_size)) { + if (copy_from_user(tmp, uptr, DIV_ROUND_UP(max, BITS_PER_BYTE))) { rc = -EFAULT; goto out_free; } for (i = 0; i < max; i++) { - unsigned long cur = iova + i * page_size; - void *ent, *old; - if (!test_bit(i, (unsigned long *)tmp)) continue; - - ent = xa_load(&mock->pfns, cur / page_size); - if (ent) { - unsigned long val; - - val = xa_to_value(ent) | MOCK_PFN_DIRTY_IOVA; - old = xa_store(&mock->pfns, cur / page_size, - xa_mk_value(val), GFP_KERNEL); - WARN_ON_ONCE(ent != old); - count++; - } + mock->iommu.ops->set_dirty(&mock->iommu, iova + i * page_size); + count++; } cmd->dirty.out_nr_dirty = count; @@ -1375,19 +1757,330 @@ out_put: return rc; } +static int iommufd_test_trigger_iopf(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + struct iopf_fault event = {}; 
+ struct iommufd_device *idev; + + idev = iommufd_get_device(ucmd, cmd->trigger_iopf.dev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + + event.fault.prm.flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; + if (cmd->trigger_iopf.pasid != IOMMU_NO_PASID) + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + event.fault.type = IOMMU_FAULT_PAGE_REQ; + event.fault.prm.addr = cmd->trigger_iopf.addr; + event.fault.prm.pasid = cmd->trigger_iopf.pasid; + event.fault.prm.grpid = cmd->trigger_iopf.grpid; + event.fault.prm.perm = cmd->trigger_iopf.perm; + + iommu_report_device_fault(idev->dev, &event); + iommufd_put_object(ucmd->ictx, &idev->obj); + + return 0; +} + +static int iommufd_test_trigger_vevent(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + struct iommu_viommu_event_selftest test = {}; + struct iommufd_device *idev; + struct mock_dev *mdev; + int rc = -ENOENT; + + idev = iommufd_get_device(ucmd, cmd->trigger_vevent.dev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + mdev = to_mock_dev(idev->dev); + + down_read(&mdev->viommu_rwsem); + if (!mdev->viommu || !mdev->vdev_id) + goto out_unlock; + + test.virt_id = mdev->vdev_id; + rc = iommufd_viommu_report_event(&mdev->viommu->core, + IOMMU_VEVENTQ_TYPE_SELFTEST, &test, + sizeof(test)); +out_unlock: + up_read(&mdev->viommu_rwsem); + iommufd_put_object(ucmd->ictx, &idev->obj); + + return rc; +} + +static inline struct iommufd_hw_pagetable * +iommufd_get_hwpt(struct iommufd_ucmd *ucmd, u32 id) +{ + struct iommufd_object *pt_obj; + + pt_obj = iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_ANY); + if (IS_ERR(pt_obj)) + return ERR_CAST(pt_obj); + + if (pt_obj->type != IOMMUFD_OBJ_HWPT_NESTED && + pt_obj->type != IOMMUFD_OBJ_HWPT_PAGING) { + iommufd_put_object(ucmd->ictx, pt_obj); + return ERR_PTR(-EINVAL); + } + + return container_of(pt_obj, struct iommufd_hw_pagetable, obj); +} + +static int iommufd_test_pasid_check_hwpt(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + u32 hwpt_id = cmd->pasid_check.hwpt_id; + struct iommu_domain *attached_domain; + struct iommu_attach_handle *handle; + struct iommufd_hw_pagetable *hwpt; + struct selftest_obj *sobj; + struct mock_dev *mdev; + int rc = 0; + + sobj = iommufd_test_get_selftest_obj(ucmd->ictx, cmd->id); + if (IS_ERR(sobj)) + return PTR_ERR(sobj); + + mdev = sobj->idev.mock_dev; + + handle = iommu_attach_handle_get(mdev->dev.iommu_group, + cmd->pasid_check.pasid, 0); + if (IS_ERR(handle)) + attached_domain = NULL; + else + attached_domain = handle->domain; + + /* hwpt_id == 0 means to check if pasid is detached */ + if (!hwpt_id) { + if (attached_domain) + rc = -EINVAL; + goto out_sobj; + } + + hwpt = iommufd_get_hwpt(ucmd, hwpt_id); + if (IS_ERR(hwpt)) { + rc = PTR_ERR(hwpt); + goto out_sobj; + } + + if (attached_domain != hwpt->domain) + rc = -EINVAL; + + iommufd_put_object(ucmd->ictx, &hwpt->obj); +out_sobj: + iommufd_put_object(ucmd->ictx, &sobj->obj); + return rc; +} + +static int iommufd_test_pasid_attach(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + struct selftest_obj *sobj; + int rc; + + sobj = iommufd_test_get_selftest_obj(ucmd->ictx, cmd->id); + if (IS_ERR(sobj)) + return PTR_ERR(sobj); + + rc = iommufd_device_attach(sobj->idev.idev, cmd->pasid_attach.pasid, + &cmd->pasid_attach.pt_id); + if (rc) + goto out_sobj; + + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + iommufd_device_detach(sobj->idev.idev, cmd->pasid_attach.pasid); + +out_sobj: + iommufd_put_object(ucmd->ictx, &sobj->obj); + return rc; +} + +static int 
iommufd_test_pasid_replace(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + struct selftest_obj *sobj; + int rc; + + sobj = iommufd_test_get_selftest_obj(ucmd->ictx, cmd->id); + if (IS_ERR(sobj)) + return PTR_ERR(sobj); + + rc = iommufd_device_replace(sobj->idev.idev, cmd->pasid_attach.pasid, + &cmd->pasid_attach.pt_id); + if (rc) + goto out_sobj; + + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + +out_sobj: + iommufd_put_object(ucmd->ictx, &sobj->obj); + return rc; +} + +static int iommufd_test_pasid_detach(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + struct selftest_obj *sobj; + + sobj = iommufd_test_get_selftest_obj(ucmd->ictx, cmd->id); + if (IS_ERR(sobj)) + return PTR_ERR(sobj); + + iommufd_device_detach(sobj->idev.idev, cmd->pasid_detach.pasid); + iommufd_put_object(ucmd->ictx, &sobj->obj); + return 0; +} + void iommufd_selftest_destroy(struct iommufd_object *obj) { - struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj); + struct selftest_obj *sobj = to_selftest_obj(obj); switch (sobj->type) { case TYPE_IDEV: - iommufd_device_detach(sobj->idev.idev); + iommufd_device_detach(sobj->idev.idev, IOMMU_NO_PASID); iommufd_device_unbind(sobj->idev.idev); mock_dev_destroy(sobj->idev.mock_dev); break; } } +struct iommufd_test_dma_buf { + void *memory; + size_t length; + bool revoked; +}; + +static int iommufd_test_dma_buf_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + return 0; +} + +static void iommufd_test_dma_buf_detach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ +} + +static struct sg_table * +iommufd_test_dma_buf_map(struct dma_buf_attachment *attachment, + enum dma_data_direction dir) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static void iommufd_test_dma_buf_unmap(struct dma_buf_attachment *attachment, + struct sg_table *sgt, + enum dma_data_direction dir) +{ +} + +static void iommufd_test_dma_buf_release(struct dma_buf *dmabuf) +{ + struct iommufd_test_dma_buf *priv = dmabuf->priv; + + kfree(priv->memory); + kfree(priv); +} + +static const struct dma_buf_ops iommufd_test_dmabuf_ops = { + .attach = iommufd_test_dma_buf_attach, + .detach = iommufd_test_dma_buf_detach, + .map_dma_buf = iommufd_test_dma_buf_map, + .release = iommufd_test_dma_buf_release, + .unmap_dma_buf = iommufd_test_dma_buf_unmap, +}; + +int iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys) +{ + struct iommufd_test_dma_buf *priv = attachment->dmabuf->priv; + + dma_resv_assert_held(attachment->dmabuf->resv); + + if (attachment->dmabuf->ops != &iommufd_test_dmabuf_ops) + return -EOPNOTSUPP; + + if (priv->revoked) + return -ENODEV; + + phys->paddr = virt_to_phys(priv->memory); + phys->len = priv->length; + return 0; +} + +static int iommufd_test_dmabuf_get(struct iommufd_ucmd *ucmd, + unsigned int open_flags, + size_t len) +{ + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct iommufd_test_dma_buf *priv; + struct dma_buf *dmabuf; + int rc; + + len = ALIGN(len, PAGE_SIZE); + if (len == 0 || len > PAGE_SIZE * 512) + return -EINVAL; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->length = len; + priv->memory = kzalloc(len, GFP_KERNEL); + if (!priv->memory) { + rc = -ENOMEM; + goto err_free; + } + + exp_info.ops = &iommufd_test_dmabuf_ops; + exp_info.size = len; + exp_info.flags = open_flags; + exp_info.priv = priv; + + dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(dmabuf)) { + rc = PTR_ERR(dmabuf); + goto err_free; + } + + return 
dma_buf_fd(dmabuf, open_flags); + +err_free: + kfree(priv->memory); + kfree(priv); + return rc; +} + +static int iommufd_test_dmabuf_revoke(struct iommufd_ucmd *ucmd, int fd, + bool revoked) +{ + struct iommufd_test_dma_buf *priv; + struct dma_buf *dmabuf; + int rc = 0; + + dmabuf = dma_buf_get(fd); + if (IS_ERR(dmabuf)) + return PTR_ERR(dmabuf); + + if (dmabuf->ops != &iommufd_test_dmabuf_ops) { + rc = -EOPNOTSUPP; + goto err_put; + } + + priv = dmabuf->priv; + dma_resv_lock(dmabuf->resv, NULL); + priv->revoked = revoked; + dma_buf_move_notify(dmabuf); + dma_resv_unlock(dmabuf->resv); + +err_put: + dma_buf_put(dmabuf); + return rc; +} + int iommufd_test(struct iommufd_ucmd *ucmd) { struct iommu_test_cmd *cmd = ucmd->cmd; @@ -1416,6 +2109,10 @@ int iommufd_test(struct iommufd_ucmd *ucmd) return iommufd_test_md_check_iotlb(ucmd, cmd->id, cmd->check_iotlb.id, cmd->check_iotlb.iotlb); + case IOMMU_TEST_OP_DEV_CHECK_CACHE: + return iommufd_test_dev_check_cache(ucmd, cmd->id, + cmd->check_dev_cache.id, + cmd->check_dev_cache.cache); case IOMMU_TEST_OP_CREATE_ACCESS: return iommufd_test_create_access(ucmd, cmd->id, cmd->create_access.flags); @@ -1450,6 +2147,25 @@ int iommufd_test(struct iommufd_ucmd *ucmd) cmd->dirty.page_size, u64_to_user_ptr(cmd->dirty.uptr), cmd->dirty.flags); + case IOMMU_TEST_OP_TRIGGER_IOPF: + return iommufd_test_trigger_iopf(ucmd, cmd); + case IOMMU_TEST_OP_TRIGGER_VEVENT: + return iommufd_test_trigger_vevent(ucmd, cmd); + case IOMMU_TEST_OP_PASID_ATTACH: + return iommufd_test_pasid_attach(ucmd, cmd); + case IOMMU_TEST_OP_PASID_REPLACE: + return iommufd_test_pasid_replace(ucmd, cmd); + case IOMMU_TEST_OP_PASID_DETACH: + return iommufd_test_pasid_detach(ucmd, cmd); + case IOMMU_TEST_OP_PASID_CHECK_HWPT: + return iommufd_test_pasid_check_hwpt(ucmd, cmd); + case IOMMU_TEST_OP_DMABUF_GET: + return iommufd_test_dmabuf_get(ucmd, cmd->dmabuf_get.open_flags, + cmd->dmabuf_get.length); + case IOMMU_TEST_OP_DMABUF_REVOKE: + return iommufd_test_dmabuf_revoke(ucmd, + cmd->dmabuf_revoke.dmabuf_fd, + cmd->dmabuf_revoke.revoked); default: return -EOPNOTSUPP; } @@ -1480,21 +2196,28 @@ int __init iommufd_test_init(void) if (rc) goto err_platform; - rc = iommu_device_sysfs_add(&mock_iommu_device, + rc = iommu_device_sysfs_add(&mock_iommu.iommu_dev, &selftest_iommu_dev->dev, NULL, "%s", dev_name(&selftest_iommu_dev->dev)); if (rc) goto err_bus; - rc = iommu_device_register_bus(&mock_iommu_device, &mock_ops, - &iommufd_mock_bus_type.bus, - &iommufd_mock_bus_type.nb); + rc = iommu_device_register_bus(&mock_iommu.iommu_dev, &mock_ops, + &iommufd_mock_bus_type.bus, + &iommufd_mock_bus_type.nb); if (rc) goto err_sysfs; + + refcount_set(&mock_iommu.users, 1); + init_completion(&mock_iommu.complete); + + mock_iommu_iopf_queue = iopf_queue_alloc("mock-iopfq"); + mock_iommu.iommu_dev.max_pasids = (1 << MOCK_PASID_WIDTH); + return 0; err_sysfs: - iommu_device_sysfs_remove(&mock_iommu_device); + iommu_device_sysfs_remove(&mock_iommu.iommu_dev); err_bus: bus_unregister(&iommufd_mock_bus_type.bus); err_platform: @@ -1504,13 +2227,37 @@ err_dbgfs: return rc; } +static void iommufd_test_wait_for_users(void) +{ + if (refcount_dec_and_test(&mock_iommu.users)) + return; + /* + * Time out waiting for the iommu device user count to become 0. + * + * Note that this is just an example here: since the selftest is built + * into the iommufd module, the iommu device is only unplugged when the + * module is unloaded, and the module cannot be unloaded while any + * iommufd FD is still open. So this WARN_ON is not expected to + * trigger. 
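+ * + * The users being waited for are the mock vIOMMU objects: each takes a + * reference in mock_viommu_init() and drops it in mock_viommu_destroy().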
+ */ + WARN_ON(!wait_for_completion_timeout(&mock_iommu.complete, + msecs_to_jiffies(10000))); +} + void iommufd_test_exit(void) { - iommu_device_sysfs_remove(&mock_iommu_device); - iommu_device_unregister_bus(&mock_iommu_device, + if (mock_iommu_iopf_queue) { + iopf_queue_free(mock_iommu_iopf_queue); + mock_iommu_iopf_queue = NULL; + } + + iommufd_test_wait_for_users(); + iommu_device_sysfs_remove(&mock_iommu.iommu_dev); + iommu_device_unregister_bus(&mock_iommu.iommu_dev, &iommufd_mock_bus_type.bus, &iommufd_mock_bus_type.nb); bus_unregister(&iommufd_mock_bus_type.bus); platform_device_unregister(selftest_iommu_dev); debugfs_remove_recursive(dbgfs_root); } + +MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c index a3ad5f0b6c59..a258ee2f4579 100644 --- a/drivers/iommu/iommufd/vfio_compat.c +++ b/drivers/iommu/iommufd/vfio_compat.c @@ -44,7 +44,7 @@ int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id) iommufd_put_object(ictx, &ioas->obj); return 0; } -EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, IOMMUFD_VFIO); +EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, "IOMMUFD_VFIO"); /** * iommufd_vfio_compat_set_no_iommu - Called when a no-iommu device is attached @@ -66,7 +66,7 @@ int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx) xa_unlock(&ictx->objects); return ret; } -EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_set_no_iommu, IOMMUFD_VFIO); +EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_set_no_iommu, "IOMMUFD_VFIO"); /** * iommufd_vfio_compat_ioas_create - Ensure the compat IOAS is created @@ -118,7 +118,7 @@ out_abort: iommufd_object_abort(ictx, &ioas->obj); return ret; } -EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create, IOMMUFD_VFIO); +EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create, "IOMMUFD_VFIO"); int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd) { @@ -291,12 +291,7 @@ static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx, case VFIO_DMA_CC_IOMMU: return iommufd_vfio_cc_iommu(ictx); - /* - * This is obsolete, and to be removed from VFIO. It was an incomplete - * idea that got merged. 
- * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/ - */ - case VFIO_TYPE1_NESTING_IOMMU: + case __VFIO_RESERVED_TYPE1_NESTING_IOMMU: return 0; /* diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c new file mode 100644 index 000000000000..462b457ffd0c --- /dev/null +++ b/drivers/iommu/iommufd/viommu.c @@ -0,0 +1,430 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ +#include "iommufd_private.h" + +void iommufd_viommu_destroy(struct iommufd_object *obj) +{ + struct iommufd_viommu *viommu = + container_of(obj, struct iommufd_viommu, obj); + + if (viommu->ops && viommu->ops->destroy) + viommu->ops->destroy(viommu); + refcount_dec(&viommu->hwpt->common.obj.users); + xa_destroy(&viommu->vdevs); +} + +int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) +{ + struct iommu_viommu_alloc *cmd = ucmd->cmd; + const struct iommu_user_data user_data = { + .type = cmd->type, + .uptr = u64_to_user_ptr(cmd->data_uptr), + .len = cmd->data_len, + }; + struct iommufd_hwpt_paging *hwpt_paging; + struct iommufd_viommu *viommu; + struct iommufd_device *idev; + const struct iommu_ops *ops; + size_t viommu_size; + int rc; + + if (cmd->flags || cmd->type == IOMMU_VIOMMU_TYPE_DEFAULT) + return -EOPNOTSUPP; + + idev = iommufd_get_device(ucmd, cmd->dev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + + ops = dev_iommu_ops(idev->dev); + if (!ops->get_viommu_size || !ops->viommu_init) { + rc = -EOPNOTSUPP; + goto out_put_idev; + } + + viommu_size = ops->get_viommu_size(idev->dev, cmd->type); + if (!viommu_size) { + rc = -EOPNOTSUPP; + goto out_put_idev; + } + + /* + * It is a driver bug to provide a viommu_size smaller than the core + * vIOMMU structure size + */ + if (WARN_ON_ONCE(viommu_size < sizeof(*viommu))) { + rc = -EOPNOTSUPP; + goto out_put_idev; + } + + hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id); + if (IS_ERR(hwpt_paging)) { + rc = PTR_ERR(hwpt_paging); + goto out_put_idev; + } + + if (!hwpt_paging->nest_parent) { + rc = -EINVAL; + goto out_put_hwpt; + } + + viommu = (struct iommufd_viommu *)_iommufd_object_alloc_ucmd( + ucmd, viommu_size, IOMMUFD_OBJ_VIOMMU); + if (IS_ERR(viommu)) { + rc = PTR_ERR(viommu); + goto out_put_hwpt; + } + + xa_init(&viommu->vdevs); + viommu->type = cmd->type; + viommu->ictx = ucmd->ictx; + viommu->hwpt = hwpt_paging; + refcount_inc(&viommu->hwpt->common.obj.users); + INIT_LIST_HEAD(&viommu->veventqs); + init_rwsem(&viommu->veventqs_rwsem); + /* + * In the most likely case the physical IOMMU is not hot-pluggable, so + * no reference is taken here. A hot-pluggable IOMMU instance (if one + * exists) is responsible for its own refcounting. + */ + viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev); + + rc = ops->viommu_init(viommu, hwpt_paging->common.domain, + user_data.len ? 
&user_data : NULL); + if (rc) + goto out_put_hwpt; + + /* It is a driver bug that viommu->ops isn't filled */ + if (WARN_ON_ONCE(!viommu->ops)) { + rc = -EOPNOTSUPP; + goto out_put_hwpt; + } + + cmd->out_viommu_id = viommu->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + +out_put_hwpt: + iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj); +out_put_idev: + iommufd_put_object(ucmd->ictx, &idev->obj); + return rc; +} + +void iommufd_vdevice_abort(struct iommufd_object *obj) +{ + struct iommufd_vdevice *vdev = + container_of(obj, struct iommufd_vdevice, obj); + struct iommufd_viommu *viommu = vdev->viommu; + struct iommufd_device *idev = vdev->idev; + + lockdep_assert_held(&idev->igroup->lock); + + if (vdev->destroy) + vdev->destroy(vdev); + /* xa_cmpxchg() is okay to fail here if the alloc path already failed its own xa_cmpxchg() */ + xa_cmpxchg(&viommu->vdevs, vdev->virt_id, vdev, NULL, GFP_KERNEL); + refcount_dec(&viommu->obj.users); + idev->vdev = NULL; +} + +void iommufd_vdevice_destroy(struct iommufd_object *obj) +{ + struct iommufd_vdevice *vdev = + container_of(obj, struct iommufd_vdevice, obj); + struct iommufd_device *idev = vdev->idev; + struct iommufd_ctx *ictx = idev->ictx; + + mutex_lock(&idev->igroup->lock); + iommufd_vdevice_abort(obj); + mutex_unlock(&idev->igroup->lock); + iommufd_put_object(ictx, &idev->obj); +} + +int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) +{ + struct iommu_vdevice_alloc *cmd = ucmd->cmd; + struct iommufd_vdevice *vdev, *curr; + size_t vdev_size = sizeof(*vdev); + struct iommufd_viommu *viommu; + struct iommufd_device *idev; + u64 virt_id = cmd->virt_id; + int rc = 0; + + /* virt_id indexes an xarray */ + if (virt_id > ULONG_MAX) + return -EINVAL; + + viommu = iommufd_get_viommu(ucmd, cmd->viommu_id); + if (IS_ERR(viommu)) + return PTR_ERR(viommu); + + idev = iommufd_get_device(ucmd, cmd->dev_id); + if (IS_ERR(idev)) { + rc = PTR_ERR(idev); + goto out_put_viommu; + } + + if (viommu->iommu_dev != __iommu_get_iommu_dev(idev->dev)) { + rc = -EINVAL; + goto out_put_idev; + } + + mutex_lock(&idev->igroup->lock); + if (idev->destroying) { + rc = -ENOENT; + goto out_unlock_igroup; + } + + if (idev->vdev) { + rc = -EEXIST; + goto out_unlock_igroup; + } + + if (viommu->ops && viommu->ops->vdevice_size) { + /* + * It is a driver bug for: + * - ops->vdevice_size smaller than the core structure size + * - not implementing a paired ops->vdevice_init op + */ + if (WARN_ON_ONCE(viommu->ops->vdevice_size < vdev_size || + !viommu->ops->vdevice_init)) { + rc = -EOPNOTSUPP; + goto out_unlock_igroup; + } + vdev_size = viommu->ops->vdevice_size; + } + + vdev = (struct iommufd_vdevice *)_iommufd_object_alloc( + ucmd->ictx, vdev_size, IOMMUFD_OBJ_VDEVICE); + if (IS_ERR(vdev)) { + rc = PTR_ERR(vdev); + goto out_unlock_igroup; + } + + vdev->virt_id = virt_id; + vdev->viommu = viommu; + refcount_inc(&viommu->obj.users); + /* + * A wait_cnt reference is held on the idev so long as we have the + * pointer. iommufd_device_pre_destroy() will revoke it before the idev + * is really destroyed. + */ + vdev->idev = idev; + + /* + * iommufd_device_destroy() delays freeing the idev until idev->vdev is + * NULL, which only happens once vdev destruction has finished. 
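+ * (idev->vdev is cleared by iommufd_vdevice_abort() under the igroup + * lock.)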
+ */ + idev->vdev = vdev; + + curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, GFP_KERNEL); + if (curr) { + rc = xa_err(curr) ?: -EEXIST; + goto out_abort; + } + + if (viommu->ops && viommu->ops->vdevice_init) { + rc = viommu->ops->vdevice_init(vdev); + if (rc) + goto out_abort; + } + + cmd->out_vdevice_id = vdev->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_abort; + iommufd_object_finalize(ucmd->ictx, &vdev->obj); + goto out_unlock_igroup; + +out_abort: + iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj); +out_unlock_igroup: + mutex_unlock(&idev->igroup->lock); +out_put_idev: + if (rc) + iommufd_put_object(ucmd->ictx, &idev->obj); +out_put_viommu: + iommufd_put_object(ucmd->ictx, &viommu->obj); + return rc; +} + +static void iommufd_hw_queue_destroy_access(struct iommufd_ctx *ictx, + struct iommufd_access *access, + u64 base_iova, size_t length) +{ + u64 aligned_iova = PAGE_ALIGN_DOWN(base_iova); + u64 offset = base_iova - aligned_iova; + + iommufd_access_unpin_pages(access, aligned_iova, + PAGE_ALIGN(length + offset)); + iommufd_access_detach_internal(access); + iommufd_access_destroy_internal(ictx, access); +} + +void iommufd_hw_queue_destroy(struct iommufd_object *obj) +{ + struct iommufd_hw_queue *hw_queue = + container_of(obj, struct iommufd_hw_queue, obj); + + if (hw_queue->destroy) + hw_queue->destroy(hw_queue); + if (hw_queue->access) + iommufd_hw_queue_destroy_access(hw_queue->viommu->ictx, + hw_queue->access, + hw_queue->base_addr, + hw_queue->length); + if (hw_queue->viommu) + refcount_dec(&hw_queue->viommu->obj.users); +} + +/* + * When the HW accesses the guest queue via physical addresses, the underlying + * physical pages of the guest queue must be contiguous. Also, for the security + * concern that IOMMUFD_CMD_IOAS_UNMAP could potentially remove the mappings of + * the guest queue from the nesting parent iopt while the HW is still accessing + * the guest queue memory physically, such a HW queue must require an access to + * pin the underlying pages and prevent that from happening. + */ +static struct iommufd_access * +iommufd_hw_queue_alloc_phys(struct iommu_hw_queue_alloc *cmd, + struct iommufd_viommu *viommu, phys_addr_t *base_pa) +{ + u64 aligned_iova = PAGE_ALIGN_DOWN(cmd->nesting_parent_iova); + u64 offset = cmd->nesting_parent_iova - aligned_iova; + struct iommufd_access *access; + struct page **pages; + size_t max_npages; + size_t length; + size_t i; + int rc; + + /* max_npages = DIV_ROUND_UP(offset + cmd->length, PAGE_SIZE) */ + if (check_add_overflow(offset, cmd->length, &length)) + return ERR_PTR(-ERANGE); + if (check_add_overflow(length, PAGE_SIZE - 1, &length)) + return ERR_PTR(-ERANGE); + max_npages = length / PAGE_SIZE; + /* length needs to be page aligned too */ + length = max_npages * PAGE_SIZE; + + /* + * Use kvcalloc() to avoid memory fragmentation for a large page array. 
+ * Set __GFP_NOWARN to avoid syzkaller blowups + */ + pages = kvcalloc(max_npages, sizeof(*pages), GFP_KERNEL | __GFP_NOWARN); + if (!pages) + return ERR_PTR(-ENOMEM); + + access = iommufd_access_create_internal(viommu->ictx); + if (IS_ERR(access)) { + rc = PTR_ERR(access); + goto out_free; + } + + rc = iommufd_access_attach_internal(access, viommu->hwpt->ioas); + if (rc) + goto out_destroy; + + rc = iommufd_access_pin_pages(access, aligned_iova, length, pages, 0); + if (rc) + goto out_detach; + + /* Validate if the underlying physical pages are contiguous */ + for (i = 1; i < max_npages; i++) { + if (page_to_pfn(pages[i]) == page_to_pfn(pages[i - 1]) + 1) + continue; + rc = -EFAULT; + goto out_unpin; + } + + *base_pa = (page_to_pfn(pages[0]) << PAGE_SHIFT) + offset; + kvfree(pages); + return access; + +out_unpin: + iommufd_access_unpin_pages(access, aligned_iova, length); +out_detach: + iommufd_access_detach_internal(access); +out_destroy: + iommufd_access_destroy_internal(viommu->ictx, access); +out_free: + kvfree(pages); + return ERR_PTR(rc); +} + +int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd) +{ + struct iommu_hw_queue_alloc *cmd = ucmd->cmd; + struct iommufd_hw_queue *hw_queue; + struct iommufd_viommu *viommu; + struct iommufd_access *access; + size_t hw_queue_size; + phys_addr_t base_pa; + u64 last; + int rc; + + if (cmd->flags || cmd->type == IOMMU_HW_QUEUE_TYPE_DEFAULT) + return -EOPNOTSUPP; + if (!cmd->length) + return -EINVAL; + if (check_add_overflow(cmd->nesting_parent_iova, cmd->length - 1, + &last)) + return -EOVERFLOW; + + viommu = iommufd_get_viommu(ucmd, cmd->viommu_id); + if (IS_ERR(viommu)) + return PTR_ERR(viommu); + + if (!viommu->ops || !viommu->ops->get_hw_queue_size || + !viommu->ops->hw_queue_init_phys) { + rc = -EOPNOTSUPP; + goto out_put_viommu; + } + + hw_queue_size = viommu->ops->get_hw_queue_size(viommu, cmd->type); + if (!hw_queue_size) { + rc = -EOPNOTSUPP; + goto out_put_viommu; + } + + /* + * It is a driver bug for providing a hw_queue_size smaller than the + * core HW queue structure size + */ + if (WARN_ON_ONCE(hw_queue_size < sizeof(*hw_queue))) { + rc = -EOPNOTSUPP; + goto out_put_viommu; + } + + hw_queue = (struct iommufd_hw_queue *)_iommufd_object_alloc_ucmd( + ucmd, hw_queue_size, IOMMUFD_OBJ_HW_QUEUE); + if (IS_ERR(hw_queue)) { + rc = PTR_ERR(hw_queue); + goto out_put_viommu; + } + + access = iommufd_hw_queue_alloc_phys(cmd, viommu, &base_pa); + if (IS_ERR(access)) { + rc = PTR_ERR(access); + goto out_put_viommu; + } + + hw_queue->viommu = viommu; + refcount_inc(&viommu->obj.users); + hw_queue->access = access; + hw_queue->type = cmd->type; + hw_queue->length = cmd->length; + hw_queue->base_addr = cmd->nesting_parent_iova; + + rc = viommu->ops->hw_queue_init_phys(hw_queue, cmd->index, base_pa); + if (rc) + goto out_put_viommu; + + cmd->out_hw_queue_id = hw_queue->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + +out_put_viommu: + iommufd_put_object(ucmd->ictx, &viommu->obj); + return rc; +} diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index d59d0ea2fd21..18f839721813 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -6,6 +6,7 @@ */ #include <linux/iova.h> +#include <linux/kmemleak.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/smp.h> @@ -506,7 +507,7 @@ __adjust_overlap_range(struct iova *iova, * reserve_iova - reserves an iova in the given range * @iovad: - iova domain pointer * @pfn_lo: - lower page frame address - * @pfn_hi:- higher pfn adderss + * @pfn_hi:- higher pfn 
address * This function allocates reserves the address range from pfn_lo to pfn_hi so * that this address is not dished out as part of alloc_iova. */ @@ -673,6 +674,11 @@ static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache) { struct iova_magazine *mag = rcache->depot; + /* + * As the mag->next pointer is moved to rcache->depot and reset via + * the mag->size assignment, mark it as a transient false positive. + */ + kmemleak_transient_leak(mag->next); rcache->depot = mag->next; mag->size = IOVA_MAG_SIZE; rcache->depot_size--; @@ -1000,4 +1006,5 @@ void iova_cache_put(void) EXPORT_SYMBOL_GPL(iova_cache_put); MODULE_AUTHOR("Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>"); +MODULE_DESCRIPTION("IOMMU I/O Virtual Address management"); MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index b657cc09605f..ca848288dbf2 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -430,7 +430,7 @@ static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain) * non-secure mode. */ domain->cfg.quirks = IO_PGTABLE_QUIRK_ARM_NS; - domain->cfg.pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K; + domain->cfg.pgsize_bitmap = domain->io_domain.pgsize_bitmap; domain->cfg.ias = 32; domain->cfg.oas = 40; domain->cfg.tlb = &ipmmu_flush_ops; @@ -571,6 +571,7 @@ static struct iommu_domain *ipmmu_domain_alloc_paging(struct device *dev) return NULL; mutex_init(&domain->mutex); + domain->io_domain.pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K; return &domain->io_domain; } @@ -589,7 +590,7 @@ static void ipmmu_domain_free(struct iommu_domain *io_domain) } static int ipmmu_attach_device(struct iommu_domain *io_domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct ipmmu_vmsa_device *mmu = to_ipmmu(dev); @@ -636,17 +637,17 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain, } static int ipmmu_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *io_domain = iommu_get_domain_for_dev(dev); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct ipmmu_vmsa_domain *domain; unsigned int i; - if (io_domain == identity_domain || !io_domain) + if (old == identity_domain || !old) return 0; - domain = to_vmsa_domain(io_domain); + domain = to_vmsa_domain(old); for (i = 0; i < fwspec->num_ids; ++i) ipmmu_utlb_disable(domain, fwspec->ids[i]); @@ -719,6 +720,8 @@ static int ipmmu_init_platform_device(struct device *dev, dev_iommu_priv_set(dev, platform_get_drvdata(ipmmu_pdev)); + put_device(&ipmmu_pdev->dev); + return 0; } @@ -804,8 +807,7 @@ static int ipmmu_init_arm_mapping(struct device *dev) if (!mmu->mapping) { struct dma_iommu_mapping *mapping; - mapping = arm_iommu_create_mapping(&platform_bus_type, - SZ_1G, SZ_2G); + mapping = arm_iommu_create_mapping(dev, SZ_1G, SZ_2G); if (IS_ERR(mapping)) { dev_err(mmu->dev, "failed to create ARM IOMMU mapping\n"); ret = PTR_ERR(mapping); @@ -883,7 +885,6 @@ static const struct iommu_ops ipmmu_ops = { */ .device_group = IS_ENABLED(CONFIG_ARM) && !IS_ENABLED(CONFIG_IOMMU_DMA) ? 
generic_device_group : generic_single_device_group, - .pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K, .of_xlate = ipmmu_of_xlate, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = ipmmu_attach_device, @@ -1082,31 +1083,25 @@ static int ipmmu_probe(struct platform_device *pdev) } } + platform_set_drvdata(pdev, mmu); /* * Register the IPMMU to the IOMMU subsystem in the following cases: * - R-Car Gen2 IPMMU (all devices registered) * - R-Car Gen3 IPMMU (leaf devices only - skip root IPMMU-MM device) */ - if (!mmu->features->has_cache_leaf_nodes || !ipmmu_is_root(mmu)) { - ret = iommu_device_sysfs_add(&mmu->iommu, &pdev->dev, NULL, - dev_name(&pdev->dev)); - if (ret) - return ret; - - ret = iommu_device_register(&mmu->iommu, &ipmmu_ops, &pdev->dev); - if (ret) - return ret; - } + if (mmu->features->has_cache_leaf_nodes && ipmmu_is_root(mmu)) + return 0; - /* - * We can't create the ARM mapping here as it requires the bus to have - * an IOMMU, which only happens when bus_set_iommu() is called in - * ipmmu_init() after the probe function returns. - */ + ret = iommu_device_sysfs_add(&mmu->iommu, &pdev->dev, NULL, "%s", + dev_name(&pdev->dev)); + if (ret) + return ret; - platform_set_drvdata(pdev, mmu); + ret = iommu_device_register(&mmu->iommu, &ipmmu_ops, &pdev->dev); + if (ret) + iommu_device_sysfs_remove(&mmu->iommu); - return 0; + return ret; } static void ipmmu_remove(struct platform_device *pdev) @@ -1160,6 +1155,6 @@ static struct platform_driver ipmmu_driver = { .pm = pm_sleep_ptr(&ipmmu_pm), }, .probe = ipmmu_probe, - .remove_new = ipmmu_remove, + .remove = ipmmu_remove, }; builtin_platform_driver(ipmmu_driver); diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 989e0869d805..819add75a665 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -312,6 +312,8 @@ static struct iommu_domain *msm_iommu_domain_alloc_paging(struct device *dev) INIT_LIST_HEAD(&priv->list_attached); + priv->domain.pgsize_bitmap = MSM_IOMMU_PGSIZES; + priv->domain.geometry.aperture_start = 0; priv->domain.geometry.aperture_end = (1ULL << 32) - 1; priv->domain.geometry.force_aperture = true; @@ -339,7 +341,7 @@ static int msm_iommu_domain_config(struct msm_priv *priv) spin_lock_init(&priv->pgtlock); priv->cfg = (struct io_pgtable_cfg) { - .pgsize_bitmap = msm_iommu_ops.pgsize_bitmap, + .pgsize_bitmap = priv->domain.pgsize_bitmap, .ias = 32, .oas = 32, .tlb = &msm_iommu_flush_ops, @@ -352,8 +354,6 @@ static int msm_iommu_domain_config(struct msm_priv *priv) return -EINVAL; } - msm_iommu_ops.pgsize_bitmap = priv->cfg.pgsize_bitmap; - return 0; } @@ -391,7 +391,8 @@ static struct iommu_device *msm_iommu_probe_device(struct device *dev) return &iommu->iommu; } -static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old) { int ret = 0; unsigned long flags; @@ -441,19 +442,19 @@ fail: } static int msm_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct msm_priv *priv; unsigned long flags; struct msm_iommu_dev *iommu; struct msm_iommu_ctx_dev *master; int ret = 0; - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; - priv = to_msm_priv(domain); + priv = to_msm_priv(old); free_io_pgtable_ops(priv->iop); spin_lock_irqsave(&msm_iommu_lock, flags); @@ 
-692,7 +693,6 @@ static struct iommu_ops msm_iommu_ops = { .domain_alloc_paging = msm_iommu_domain_alloc_paging, .probe_device = msm_iommu_probe_device, .device_group = generic_device_group, - .pgsize_bitmap = MSM_IOMMU_PGSIZES, .of_xlate = qcom_iommu_of_xlate, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = msm_iommu_attach_dev, @@ -725,47 +725,32 @@ static int msm_iommu_probe(struct platform_device *pdev) iommu->dev = &pdev->dev; INIT_LIST_HEAD(&iommu->ctx_list); - iommu->pclk = devm_clk_get(iommu->dev, "smmu_pclk"); + iommu->pclk = devm_clk_get_prepared(iommu->dev, "smmu_pclk"); if (IS_ERR(iommu->pclk)) return dev_err_probe(iommu->dev, PTR_ERR(iommu->pclk), "could not get smmu_pclk\n"); - ret = clk_prepare(iommu->pclk); - if (ret) - return dev_err_probe(iommu->dev, ret, - "could not prepare smmu_pclk\n"); - - iommu->clk = devm_clk_get(iommu->dev, "iommu_clk"); - if (IS_ERR(iommu->clk)) { - clk_unprepare(iommu->pclk); + iommu->clk = devm_clk_get_prepared(iommu->dev, "iommu_clk"); + if (IS_ERR(iommu->clk)) return dev_err_probe(iommu->dev, PTR_ERR(iommu->clk), "could not get iommu_clk\n"); - } - - ret = clk_prepare(iommu->clk); - if (ret) { - clk_unprepare(iommu->pclk); - return dev_err_probe(iommu->dev, ret, "could not prepare iommu_clk\n"); - } r = platform_get_resource(pdev, IORESOURCE_MEM, 0); iommu->base = devm_ioremap_resource(iommu->dev, r); if (IS_ERR(iommu->base)) { ret = dev_err_probe(iommu->dev, PTR_ERR(iommu->base), "could not get iommu base\n"); - goto fail; + return ret; } ioaddr = r->start; iommu->irq = platform_get_irq(pdev, 0); - if (iommu->irq < 0) { - ret = -ENODEV; - goto fail; - } + if (iommu->irq < 0) + return -ENODEV; ret = of_property_read_u32(iommu->dev->of_node, "qcom,ncb", &val); if (ret) { dev_err(iommu->dev, "could not get ncb\n"); - goto fail; + return ret; } iommu->ncb = val; @@ -780,8 +765,7 @@ static int msm_iommu_probe(struct platform_device *pdev) if (!par) { pr_err("Invalid PAR value detected\n"); - ret = -ENODEV; - goto fail; + return -ENODEV; } ret = devm_request_threaded_irq(iommu->dev, iommu->irq, NULL, @@ -791,7 +775,7 @@ static int msm_iommu_probe(struct platform_device *pdev) iommu); if (ret) { pr_err("Request IRQ %d failed with ret=%d\n", iommu->irq, ret); - goto fail; + return ret; } list_add(&iommu->dev_node, &qcom_iommu_devices); @@ -800,23 +784,19 @@ static int msm_iommu_probe(struct platform_device *pdev) "msm-smmu.%pa", &ioaddr); if (ret) { pr_err("Could not add msm-smmu at %pa to sysfs\n", &ioaddr); - goto fail; + return ret; } ret = iommu_device_register(&iommu->iommu, &msm_iommu_ops, &pdev->dev); if (ret) { pr_err("Could not register msm-smmu at %pa\n", &ioaddr); - goto fail; + return ret; } pr_info("device mapped at %p, irq %d with %d ctx banks\n", iommu->base, iommu->irq, iommu->ncb); return ret; -fail: - clk_unprepare(iommu->clk); - clk_unprepare(iommu->pclk); - return ret; } static const struct of_device_id msm_iommu_dt_match[] = { @@ -824,20 +804,11 @@ static const struct of_device_id msm_iommu_dt_match[] = { {} }; -static void msm_iommu_remove(struct platform_device *pdev) -{ - struct msm_iommu_dev *iommu = platform_get_drvdata(pdev); - - clk_unprepare(iommu->clk); - clk_unprepare(iommu->pclk); -} - static struct platform_driver msm_iommu_driver = { .driver = { .name = "msm_iommu", .of_match_table = msm_iommu_dt_match, }, .probe = msm_iommu_probe, - .remove_new = msm_iommu_remove, }; builtin_platform_driver(msm_iommu_driver); diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 
6a2707fe7a78..60fcd3d3b5eb 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -29,6 +29,7 @@ #include <linux/spinlock.h> #include <linux/soc/mediatek/infracfg.h> #include <linux/soc/mediatek/mtk_sip_svc.h> +#include <linux/string_choices.h> #include <asm/barrier.h> #include <soc/mediatek/smi.h> @@ -138,6 +139,7 @@ /* 2 bits: iommu type */ #define MTK_IOMMU_TYPE_MM (0x0 << 13) #define MTK_IOMMU_TYPE_INFRA (0x1 << 13) +#define MTK_IOMMU_TYPE_APU (0x2 << 13) #define MTK_IOMMU_TYPE_MASK (0x3 << 13) /* PM and clock always on. e.g. infra iommu */ #define PM_CLK_AO BIT(15) @@ -146,6 +148,7 @@ #define TF_PORT_TO_ADDR_MT8173 BIT(18) #define INT_ID_PORT_WIDTH_6 BIT(19) #define CFG_IFA_MASTER_IN_ATF BIT(20) +#define DL_WITH_MULTI_LARB BIT(21) #define MTK_IOMMU_HAS_FLAG_MASK(pdata, _x, mask) \ ((((pdata)->flags) & (mask)) == (_x)) @@ -171,6 +174,7 @@ enum mtk_iommu_plat { M4U_MT8183, M4U_MT8186, M4U_MT8188, + M4U_MT8189, M4U_MT8192, M4U_MT8195, M4U_MT8365, @@ -334,6 +338,8 @@ static int mtk_iommu_hw_init(const struct mtk_iommu_data *data, unsigned int ban */ #define MTK_IOMMU_4GB_MODE_REMAP_BASE 0x140000000UL +static LIST_HEAD(apulist); /* List the apu iommu HWs */ +static LIST_HEAD(infralist); /* List the iommu_infra HW */ static LIST_HEAD(m4ulist); /* List all the M4U HWs */ #define for_each_m4u(data, head) list_for_each_entry(data, head, list) @@ -349,6 +355,15 @@ static const struct mtk_iommu_iova_region single_domain[] = { #define MT8192_MULTI_REGION_NR (IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) ? \ MT8192_MULTI_REGION_NR_MAX : 1) +static const struct mtk_iommu_iova_region mt8189_multi_dom_apu[] = { + { .iova_base = 0x200000ULL, .size = SZ_512M}, /* APU SECURE */ +#if IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) + { .iova_base = SZ_1G, .size = 0xc0000000}, /* APU CODE */ + { .iova_base = 0x70000000ULL, .size = 0x12600000}, /* APU VLM */ + { .iova_base = SZ_4G, .size = SZ_4G * 3}, /* APU VPU */ +#endif +}; + static const struct mtk_iommu_iova_region mt8192_multi_dom[MT8192_MULTI_REGION_NR] = { { .iova_base = 0x0, .size = MTK_IOMMU_IOVA_SZ_4G}, /* 0 ~ 4G, */ #if IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) @@ -510,7 +525,7 @@ static irqreturn_t mtk_iommu_isr(int irq, void *dev_id) bank->parent_dev, "fault type=0x%x iova=0x%llx pa=0x%llx master=0x%x(larb=%d port=%d) layer=%d %s\n", int_state, fault_iova, fault_pa, regval, fault_larb, fault_port, - layer, write ? "write" : "read"); + layer, str_write_read(write)); } /* Interrupt clear */ @@ -602,7 +617,7 @@ static int mtk_iommu_config(struct mtk_iommu_data *data, struct device *dev, larb_mmu->bank[portid] = upper_32_bits(region->iova_base); dev_dbg(dev, "%s iommu for larb(%s) port 0x%lx region %d rgn-bank %d.\n", - enable ? "enable" : "disable", dev_name(larb_mmu->dev), + str_enable_disable(enable), dev_name(larb_mmu->dev), portid_msk, regionid, upper_32_bits(region->iova_base)); if (enable) @@ -630,8 +645,8 @@ static int mtk_iommu_config(struct mtk_iommu_data *data, struct device *dev, } if (ret) dev_err(dev, "%s iommu(%s) inframaster 0x%lx fail(%d).\n", - enable ? 
"enable" : "disable", - dev_name(data->dev), portid_msk, ret); + str_enable_disable(enable), dev_name(data->dev), + portid_msk, ret); } return ret; } @@ -647,7 +662,7 @@ static int mtk_iommu_domain_finalise(struct mtk_iommu_domain *dom, if (share_dom) { dom->iop = share_dom->iop; dom->cfg = share_dom->cfg; - dom->domain.pgsize_bitmap = share_dom->cfg.pgsize_bitmap; + dom->domain.pgsize_bitmap = share_dom->domain.pgsize_bitmap; goto update_iova_region; } @@ -655,7 +670,7 @@ static int mtk_iommu_domain_finalise(struct mtk_iommu_domain *dom, .quirks = IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_PERMS | IO_PGTABLE_QUIRK_ARM_MTK_EXT, - .pgsize_bitmap = mtk_iommu_ops.pgsize_bitmap, + .pgsize_bitmap = dom->domain.pgsize_bitmap, .ias = MTK_IOMMU_HAS_FLAG(data->plat_data, IOVA_34_EN) ? 34 : 32, .iommu_dev = data->dev, }; @@ -674,9 +689,6 @@ static int mtk_iommu_domain_finalise(struct mtk_iommu_domain *dom, return -ENOMEM; } - /* Update our support page sizes bitmap */ - dom->domain.pgsize_bitmap = dom->cfg.pgsize_bitmap; - data->share_dom = dom; update_iova_region: @@ -696,6 +708,7 @@ static struct iommu_domain *mtk_iommu_domain_alloc_paging(struct device *dev) if (!dom) return NULL; mutex_init(&dom->mutex); + dom->domain.pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M; return &dom->domain; } @@ -706,7 +719,7 @@ static void mtk_iommu_domain_free(struct iommu_domain *domain) } static int mtk_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct mtk_iommu_data *data = dev_iommu_priv_get(dev), *frstdata; struct mtk_iommu_domain *dom = to_mtk_domain(domain); @@ -774,12 +787,12 @@ err_unlock: } static int mtk_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct mtk_iommu_data *data = dev_iommu_priv_get(dev); - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; mtk_iommu_config(data, dev, false, 0); @@ -866,6 +879,7 @@ static struct iommu_device *mtk_iommu_probe_device(struct device *dev) struct mtk_iommu_data *data = dev_iommu_priv_get(dev); struct device_link *link; struct device *larbdev; + unsigned long larbid_msk = 0; unsigned int larbid, larbidx, i; if (!MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) @@ -873,30 +887,50 @@ static struct iommu_device *mtk_iommu_probe_device(struct device *dev) /* * Link the consumer device with the smi-larb device(supplier). - * The device that connects with each a larb is a independent HW. - * All the ports in each a device should be in the same larbs. + * w/DL_WITH_MULTI_LARB: the master may connect with multi larbs, + * we should create device link with each larb. + * w/o DL_WITH_MULTI_LARB: the master must connect with one larb, + * otherwise fail. */ larbid = MTK_M4U_TO_LARB(fwspec->ids[0]); if (larbid >= MTK_LARB_NR_MAX) return ERR_PTR(-EINVAL); + larbid_msk |= BIT(larbid); + for (i = 1; i < fwspec->num_ids; i++) { larbidx = MTK_M4U_TO_LARB(fwspec->ids[i]); - if (larbid != larbidx) { + if (MTK_IOMMU_HAS_FLAG(data->plat_data, DL_WITH_MULTI_LARB)) { + larbid_msk |= BIT(larbidx); + } else if (larbid != larbidx) { dev_err(dev, "Can only use one larb. 
Fail@larb%d-%d.\n", larbid, larbidx); return ERR_PTR(-EINVAL); } } - larbdev = data->larb_imu[larbid].dev; - if (!larbdev) - return ERR_PTR(-EINVAL); - link = device_link_add(dev, larbdev, - DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS); - if (!link) - dev_err(dev, "Unable to link %s\n", dev_name(larbdev)); + for_each_set_bit(larbid, &larbid_msk, 32) { + larbdev = data->larb_imu[larbid].dev; + if (!larbdev) + return ERR_PTR(-EINVAL); + + link = device_link_add(dev, larbdev, + DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS); + if (!link) { + dev_err(dev, "Unable to link %s\n", dev_name(larbdev)); + goto link_remove; + } + } + return &data->iommu; + +link_remove: + for_each_set_bit(i, &larbid_msk, larbid) { + larbdev = data->larb_imu[i].dev; + device_link_remove(dev, larbdev); + } + + return ERR_PTR(-ENODEV); } static void mtk_iommu_release_device(struct device *dev) @@ -904,11 +938,19 @@ static void mtk_iommu_release_device(struct device *dev) struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct mtk_iommu_data *data; struct device *larbdev; - unsigned int larbid; + unsigned int larbid, i; + unsigned long larbid_msk = 0; data = dev_iommu_priv_get(dev); - if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) { - larbid = MTK_M4U_TO_LARB(fwspec->ids[0]); + if (!MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) + return; + + for (i = 0; i < fwspec->num_ids; i++) { + larbid = MTK_M4U_TO_LARB(fwspec->ids[i]); + larbid_msk |= BIT(larbid); + } + + for_each_set_bit(larbid, &larbid_msk, 32) { larbdev = data->larb_imu[larbid].dev; device_link_remove(dev, larbdev); } @@ -975,6 +1017,8 @@ static int mtk_iommu_of_xlate(struct device *dev, return -EINVAL; dev_iommu_priv_set(dev, platform_get_drvdata(m4updev)); + + put_device(&m4updev->dev); } return iommu_fwspec_add_ids(dev, args->args, 1); @@ -1018,7 +1062,6 @@ static const struct iommu_ops mtk_iommu_ops = { .device_group = mtk_iommu_device_group, .of_xlate = mtk_iommu_of_xlate, .get_resv_regions = mtk_iommu_get_resv_regions, - .pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M, .owner = THIS_MODULE, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = mtk_iommu_attach_device, @@ -1213,16 +1256,19 @@ static int mtk_iommu_mm_dts_parse(struct device *dev, struct component_match **m } component_match_add(dev, match, component_compare_dev, &plarbdev->dev); - platform_device_put(plarbdev); } - if (!frst_avail_smicomm_node) - return -EINVAL; + if (!frst_avail_smicomm_node) { + ret = -EINVAL; + goto err_larbdev_put; + } pcommdev = of_find_device_by_node(frst_avail_smicomm_node); of_node_put(frst_avail_smicomm_node); - if (!pcommdev) - return -ENODEV; + if (!pcommdev) { + ret = -ENODEV; + goto err_larbdev_put; + } data->smicomm_dev = &pcommdev->dev; link = device_link_add(data->smicomm_dev, dev, @@ -1230,16 +1276,16 @@ static int mtk_iommu_mm_dts_parse(struct device *dev, struct component_match **m platform_device_put(pcommdev); if (!link) { dev_err(dev, "Unable to link %s.\n", dev_name(data->smicomm_dev)); - return -EINVAL; + ret = -EINVAL; + goto err_larbdev_put; } return 0; err_larbdev_put: - for (i = MTK_LARB_NR_MAX - 1; i >= 0; i--) { - if (!data->larb_imu[i].dev) - continue; + /* id mapping may not be linear, loop the whole array */ + for (i = 0; i < MTK_LARB_NR_MAX; i++) put_device(data->larb_imu[i].dev); - } + return ret; } @@ -1371,15 +1417,6 @@ static int mtk_iommu_probe(struct platform_device *pdev) platform_set_drvdata(pdev, data); mutex_init(&data->mutex); - ret = iommu_device_sysfs_add(&data->iommu, dev, NULL, - 
"mtk-iommu.%pa", &ioaddr); - if (ret) - goto out_link_remove; - - ret = iommu_device_register(&data->iommu, &mtk_iommu_ops, dev); - if (ret) - goto out_sysfs_remove; - if (MTK_IOMMU_HAS_FLAG(data->plat_data, SHARE_PGTABLE)) { list_add_tail(&data->list, data->plat_data->hw_list); data->hw_list = data->plat_data->hw_list; @@ -1389,21 +1426,34 @@ static int mtk_iommu_probe(struct platform_device *pdev) data->hw_list = &data->hw_list_head; } + ret = iommu_device_sysfs_add(&data->iommu, dev, NULL, + "mtk-iommu.%pa", &ioaddr); + if (ret) + goto out_list_del; + + ret = iommu_device_register(&data->iommu, &mtk_iommu_ops, dev); + if (ret) + goto out_sysfs_remove; + if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) { ret = component_master_add_with_match(dev, &mtk_iommu_com_ops, match); if (ret) - goto out_list_del; + goto out_device_unregister; } return ret; -out_list_del: - list_del(&data->list); +out_device_unregister: iommu_device_unregister(&data->iommu); out_sysfs_remove: iommu_device_sysfs_remove(&data->iommu); -out_link_remove: - if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) +out_list_del: + list_del(&data->list); + if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) { device_link_remove(data->smicomm_dev, dev); + + for (i = 0; i < MTK_LARB_NR_MAX; i++) + put_device(data->larb_imu[i].dev); + } out_runtime_disable: pm_runtime_disable(dev); return ret; @@ -1423,6 +1473,9 @@ static void mtk_iommu_remove(struct platform_device *pdev) if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) { device_link_remove(data->smicomm_dev, &pdev->dev); component_master_del(&pdev->dev, &mtk_iommu_com_ops); + + for (i = 0; i < MTK_LARB_NR_MAX; i++) + put_device(data->larb_imu[i].dev); } pm_runtime_disable(&pdev->dev); for (i = 0; i < data->plat_data->banks_num; i++) { @@ -1549,6 +1602,31 @@ static const struct mtk_iommu_plat_data mt6795_data = { .larbid_remap = {{0}, {1}, {2}, {3}, {4}}, /* Linear mapping. 
*/ }; +static const unsigned int mt8192_larb_region_msk[MT8192_MULTI_REGION_NR_MAX][MTK_LARB_NR_MAX] = { + [0] = {~0, ~0}, /* Region0: larb0/1 */ + [1] = {0, 0, 0, 0, ~0, ~0, 0, ~0}, /* Region1: larb4/5/7 */ + [2] = {0, 0, ~0, 0, 0, 0, 0, 0, /* Region2: larb2/9/11/13/14/16/17/18/19/20 */ + 0, ~0, 0, ~0, 0, ~(u32)(BIT(9) | BIT(10)), ~(u32)(BIT(4) | BIT(5)), 0, + ~0, ~0, ~0, ~0, ~0}, + [3] = {0}, + [4] = {[13] = BIT(9) | BIT(10)}, /* larb13 port9/10 */ + [5] = {[14] = BIT(4) | BIT(5)}, /* larb14 port4/5 */ +}; + +static const struct mtk_iommu_plat_data mt6893_data = { + .m4u_plat = M4U_MT8192, + .flags = HAS_BCLK | OUT_ORDER_WR_EN | HAS_SUB_COMM_2BITS | + WR_THROT_EN | IOVA_34_EN | SHARE_PGTABLE | MTK_IOMMU_TYPE_MM, + .inv_sel_reg = REG_MMU_INV_SEL_GEN2, + .banks_num = 1, + .banks_enable = {true}, + .iova_region = mt8192_multi_dom, + .iova_region_nr = ARRAY_SIZE(mt8192_multi_dom), + .iova_region_larb_msk = mt8192_larb_region_msk, + .larbid_remap = {{0}, {1}, {4, 5}, {7}, {2}, {9, 11, 19, 20}, + {0, 14, 16}, {0, 13, 18, 17}}, +}; + static const struct mtk_iommu_plat_data mt8167_data = { .m4u_plat = M4U_MT8167, .flags = RESET_AXI | HAS_LEGACY_IVRP_PADDR | MTK_IOMMU_TYPE_MM, @@ -1599,7 +1677,7 @@ static const unsigned int mt8186_larb_region_msk[MT8192_MULTI_REGION_NR_MAX][MTK static const struct mtk_iommu_plat_data mt8186_data_mm = { .m4u_plat = M4U_MT8186, .flags = HAS_BCLK | HAS_SUB_COMM_2BITS | OUT_ORDER_WR_EN | - WR_THROT_EN | IOVA_34_EN | MTK_IOMMU_TYPE_MM, + WR_THROT_EN | IOVA_34_EN | MTK_IOMMU_TYPE_MM | PGTABLE_PA_35_EN, .larbid_remap = {{0}, {1, MTK_INVALID_LARBID, 8}, {4}, {7}, {2}, {9, 11, 19, 20}, {MTK_INVALID_LARBID, 14, 16}, {MTK_INVALID_LARBID, 13, MTK_INVALID_LARBID, 17}}, @@ -1672,15 +1750,64 @@ static const struct mtk_iommu_plat_data mt8188_data_vpp = { 27, 28 /* ccu0 */, MTK_INVALID_LARBID}, {4, 6}}, }; -static const unsigned int mt8192_larb_region_msk[MT8192_MULTI_REGION_NR_MAX][MTK_LARB_NR_MAX] = { - [0] = {~0, ~0}, /* Region0: larb0/1 */ - [1] = {0, 0, 0, 0, ~0, ~0, 0, ~0}, /* Region1: larb4/5/7 */ - [2] = {0, 0, ~0, 0, 0, 0, 0, 0, /* Region2: larb2/9/11/13/14/16/17/18/19/20 */ - 0, ~0, 0, ~0, 0, ~(u32)(BIT(9) | BIT(10)), ~(u32)(BIT(4) | BIT(5)), 0, - ~0, ~0, ~0, ~0, ~0}, - [3] = {0}, - [4] = {[13] = BIT(9) | BIT(10)}, /* larb13 port9/10 */ - [5] = {[14] = BIT(4) | BIT(5)}, /* larb14 port4/5 */ +static const unsigned int mt8189_apu_region_msk[][MTK_LARB_NR_MAX] = { + [0] = {[0] = BIT(2)}, /* Region0: fake larb 0 APU_SECURE */ + [1] = {[0] = BIT(1)}, /* Region1: fake larb 0 APU_CODE */ + [2] = {[0] = BIT(3)}, /* Region2: fake larb 0 APU_VLM */ + [3] = {[0] = BIT(0)}, /* Region3: fake larb 0 APU_DATA */ +}; + +static const struct mtk_iommu_plat_data mt8189_data_apu = { + .m4u_plat = M4U_MT8189, + .flags = IOVA_34_EN | DCM_DISABLE | + MTK_IOMMU_TYPE_APU | PGTABLE_PA_35_EN, + .hw_list = &apulist, + .inv_sel_reg = REG_MMU_INV_SEL_GEN2, + .banks_num = 1, + .banks_enable = {true}, + .iova_region = mt8189_multi_dom_apu, + .iova_region_nr = ARRAY_SIZE(mt8189_multi_dom_apu), + .larbid_remap = {{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}}, + .iova_region_larb_msk = mt8189_apu_region_msk, +}; + +static const struct mtk_iommu_plat_data mt8189_data_infra = { + .m4u_plat = M4U_MT8189, + .flags = WR_THROT_EN | DCM_DISABLE | MTK_IOMMU_TYPE_INFRA | + CFG_IFA_MASTER_IN_ATF | SHARE_PGTABLE | PGTABLE_PA_35_EN, + .hw_list = &infralist, + .banks_num = 1, + .banks_enable = {true}, + .inv_sel_reg = REG_MMU_INV_SEL_GEN2, + .iova_region = single_domain, + .iova_region_nr = ARRAY_SIZE(single_domain), +}; + 
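
The per-SoC iova_region_larb_msk tables (mt8192's just above, mt8189's below) map each (IOVA region, larb) pair to a bitmask of that larb's ports which must be allocated from that region. A minimal sketch of the lookup such a table enables, under assumed names: mtk_region_for_port is illustrative only, not the driver's helper, and the u32 element type follows the mt8189 table (MTK_LARB_NR_MAX and BIT() come from the driver's existing headers).

	/* Hypothetical: find the IOVA region covering one larb port. */
	static int mtk_region_for_port(const u32 (*larb_msk)[MTK_LARB_NR_MAX],
				       unsigned int nr_regions,
				       unsigned int larbid, unsigned int portid)
	{
		unsigned int rgn;

		/* The first region whose mask covers this larb/port wins. */
		for (rgn = 0; rgn < nr_regions; rgn++)
			if (larb_msk[rgn][larbid] & BIT(portid))
				return rgn;

		return -EINVAL;	/* port not assigned to any IOVA region */
	}

A zero entry therefore simply means "this larb has no ports in this region", which is why sparse designated initializers such as [13] = BIT(9) | BIT(10) are enough to describe a region.
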
+static const u32 mt8189_larb_region_msk[MT8192_MULTI_REGION_NR_MAX][MTK_LARB_NR_MAX] = { + [0] = {~0, ~0, ~0, [22] = BIT(0)}, /* Region0: all ports for larb0/1/2 */ + [1] = {[3] = ~0, [4] = ~0}, /* Region1: all ports for larb4(3)/7(4) */ + [2] = {[5] = ~0, [6] = ~0, /* Region2: all ports for larb9(5)/11(6) */ + [7] = ~0, [8] = ~0, /* Region2: all ports for larb13(7)/14(8) */ + [9] = ~0, [10] = ~0, /* Region2: all ports for larb16(9)/17(10) */ + [11] = ~0, [12] = ~0, /* Region2: all ports for larb19(11)/20(12) */ + [21] = ~0}, /* Region2: larb21 fake GCE larb */ +}; + +static const struct mtk_iommu_plat_data mt8189_data_mm = { + .m4u_plat = M4U_MT8189, + .flags = HAS_BCLK | HAS_SUB_COMM_3BITS | OUT_ORDER_WR_EN | + WR_THROT_EN | IOVA_34_EN | MTK_IOMMU_TYPE_MM | + PGTABLE_PA_35_EN | DL_WITH_MULTI_LARB, + .hw_list = &m4ulist, + .inv_sel_reg = REG_MMU_INV_SEL_GEN2, + .banks_num = 5, + .banks_enable = {true, false, false, false, false}, + .iova_region = mt8192_multi_dom, + .iova_region_nr = ARRAY_SIZE(mt8192_multi_dom), + .iova_region_larb_msk = mt8189_larb_region_msk, + .larbid_remap = {{0}, {1}, {21/* GCE_D */, 21/* GCE_M */, 2}, + {19, 20, 9, 11}, {7}, {4}, + {13, 17}, {14, 16}}, }; static const struct mtk_iommu_plat_data mt8192_data = { @@ -1776,6 +1903,7 @@ static const struct of_device_id mtk_iommu_of_ids[] = { { .compatible = "mediatek,mt2712-m4u", .data = &mt2712_data}, { .compatible = "mediatek,mt6779-m4u", .data = &mt6779_data}, { .compatible = "mediatek,mt6795-m4u", .data = &mt6795_data}, + { .compatible = "mediatek,mt6893-iommu-mm", .data = &mt6893_data}, { .compatible = "mediatek,mt8167-m4u", .data = &mt8167_data}, { .compatible = "mediatek,mt8173-m4u", .data = &mt8173_data}, { .compatible = "mediatek,mt8183-m4u", .data = &mt8183_data}, @@ -1783,6 +1911,9 @@ static const struct of_device_id mtk_iommu_of_ids[] = { { .compatible = "mediatek,mt8188-iommu-infra", .data = &mt8188_data_infra}, { .compatible = "mediatek,mt8188-iommu-vdo", .data = &mt8188_data_vdo}, { .compatible = "mediatek,mt8188-iommu-vpp", .data = &mt8188_data_vpp}, + { .compatible = "mediatek,mt8189-iommu-apu", .data = &mt8189_data_apu}, + { .compatible = "mediatek,mt8189-iommu-infra", .data = &mt8189_data_infra}, + { .compatible = "mediatek,mt8189-iommu-mm", .data = &mt8189_data_mm}, { .compatible = "mediatek,mt8192-m4u", .data = &mt8192_data}, { .compatible = "mediatek,mt8195-iommu-infra", .data = &mt8195_data_infra}, { .compatible = "mediatek,mt8195-iommu-vdo", .data = &mt8195_data_vdo}, @@ -1794,7 +1925,7 @@ MODULE_DEVICE_TABLE(of, mtk_iommu_of_ids); static struct platform_driver mtk_iommu_driver = { .probe = mtk_iommu_probe, - .remove_new = mtk_iommu_remove, + .remove = mtk_iommu_remove, .driver = { .name = "mtk-iommu", .of_match_table = mtk_iommu_of_ids, diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index d6e4002200bd..c8d8eff5373d 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -25,12 +25,22 @@ #include <linux/platform_device.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/string_choices.h> #include <asm/barrier.h> -#include <asm/dma-iommu.h> #include <dt-bindings/memory/mtk-memory-port.h> #include <dt-bindings/memory/mt2701-larb-port.h> #include <soc/mediatek/smi.h> +#if defined(CONFIG_ARM) +#include <asm/dma-iommu.h> +#else +#define arm_iommu_create_mapping(...) NULL +#define arm_iommu_attach_device(...) 
-ENODEV +struct dma_iommu_mapping { + struct iommu_domain *domain; +}; +#endif + #define REG_MMU_PT_BASE_ADDR 0x000 #define F_ALL_INVLD 0x2 @@ -243,7 +253,7 @@ static void mtk_iommu_v1_config(struct mtk_iommu_v1_data *data, larb_mmu = &data->larb_imu[larbid]; dev_dbg(dev, "%s iommu port: %d\n", - enable ? "enable" : "disable", portid); + str_enable_disable(enable), portid); if (enable) larb_mmu->mmu |= MTK_SMI_MMU_EN(portid); @@ -278,6 +288,8 @@ static struct iommu_domain *mtk_iommu_v1_domain_alloc_paging(struct device *dev) if (!dom) return NULL; + dom->domain.pgsize_bitmap = MT2701_IOMMU_PAGE_SIZE; + return &dom->domain; } @@ -291,7 +303,9 @@ static void mtk_iommu_v1_domain_free(struct iommu_domain *domain) kfree(to_mtk_domain(domain)); } -static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device *dev) +static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) { struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev); struct mtk_iommu_v1_domain *dom = to_mtk_domain(domain); @@ -317,7 +331,8 @@ static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device } static int mtk_iommu_v1_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev); @@ -401,7 +416,6 @@ static const struct iommu_ops mtk_iommu_v1_ops; static int mtk_iommu_v1_create_mapping(struct device *dev, const struct of_phandle_args *args) { - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct mtk_iommu_v1_data *data; struct platform_device *m4updev; struct dma_iommu_mapping *mtk_mapping; @@ -413,14 +427,9 @@ static int mtk_iommu_v1_create_mapping(struct device *dev, return -EINVAL; } - if (!fwspec) { - ret = iommu_fwspec_init(dev, &args->np->fwnode, &mtk_iommu_v1_ops); - if (ret) - return ret; - fwspec = dev_iommu_fwspec_get(dev); - } else if (dev_iommu_fwspec_get(dev)->ops != &mtk_iommu_v1_ops) { - return -EINVAL; - } + ret = iommu_fwspec_init(dev, of_fwnode_handle(args->np)); + if (ret) + return ret; if (!dev_iommu_priv_get(dev)) { /* Get the m4u device */ @@ -429,6 +438,8 @@ static int mtk_iommu_v1_create_mapping(struct device *dev, return -EINVAL; dev_iommu_priv_set(dev, platform_get_drvdata(m4updev)); + + put_device(&m4updev->dev); } ret = iommu_fwspec_add_ids(dev, args->args, 1); @@ -439,8 +450,7 @@ static int mtk_iommu_v1_create_mapping(struct device *dev, mtk_mapping = data->mapping; if (!mtk_mapping) { /* MTK iommu support 4GB iova address space. */ - mtk_mapping = arm_iommu_create_mapping(&platform_bus_type, - 0, 1ULL << 32); + mtk_mapping = arm_iommu_create_mapping(dev, 0, 1ULL << 32); if (IS_ERR(mtk_mapping)) return PTR_ERR(mtk_mapping); @@ -452,22 +462,13 @@ static int mtk_iommu_v1_create_mapping(struct device *dev, static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev) { - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct iommu_fwspec *fwspec = NULL; struct of_phandle_args iommu_spec; struct mtk_iommu_v1_data *data; int err, idx = 0, larbid, larbidx; struct device_link *link; struct device *larbdev; - /* - * In the deferred case, free the existed fwspec. - * Always initialize the fwspec internally. 
- */ - if (fwspec) { - iommu_fwspec_free(dev); - fwspec = dev_iommu_fwspec_get(dev); - } - while (!of_parse_phandle_with_args(dev->of_node, "iommus", "#iommu-cells", idx, &iommu_spec)) { @@ -482,6 +483,9 @@ static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev) idx++; } + if (!fwspec) + return ERR_PTR(-ENODEV); + data = dev_iommu_priv_get(dev); /* Link the consumer device with the smi-larb device(supplier) */ @@ -512,14 +516,10 @@ static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev) static void mtk_iommu_v1_probe_finalize(struct device *dev) { - struct dma_iommu_mapping *mtk_mapping; - struct mtk_iommu_v1_data *data; + __maybe_unused struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev); int err; - data = dev_iommu_priv_get(dev); - mtk_mapping = data->mapping; - - err = arm_iommu_attach_device(dev, mtk_mapping); + err = arm_iommu_attach_device(dev, data->mapping); if (err) dev_err(dev, "Can't create IOMMU mapping - DMA-OPS will not work\n"); } @@ -585,7 +585,6 @@ static const struct iommu_ops mtk_iommu_v1_ops = { .probe_finalize = mtk_iommu_v1_probe_finalize, .release_device = mtk_iommu_v1_release_device, .device_group = generic_device_group, - .pgsize_bitmap = MT2701_IOMMU_PAGE_SIZE, .owner = THIS_MODULE, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = mtk_iommu_v1_attach_device, @@ -647,13 +646,18 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev) if (larb_nr < 0) return larb_nr; + if (larb_nr > MTK_LARB_NR_MAX) + return -EINVAL; + for (i = 0; i < larb_nr; i++) { struct device_node *larbnode; struct platform_device *plarbdev; larbnode = of_parse_phandle(dev->of_node, "mediatek,larbs", i); - if (!larbnode) - return -EINVAL; + if (!larbnode) { + ret = -EINVAL; + goto out_put_larbs; + } if (!of_device_is_available(larbnode)) { of_node_put(larbnode); @@ -663,11 +667,14 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev) plarbdev = of_find_device_by_node(larbnode); if (!plarbdev) { of_node_put(larbnode); - return -ENODEV; + ret = -ENODEV; + goto out_put_larbs; } if (!plarbdev->dev.driver) { of_node_put(larbnode); - return -EPROBE_DEFER; + put_device(&plarbdev->dev); + ret = -EPROBE_DEFER; + goto out_put_larbs; } data->larb_imu[i].dev = &plarbdev->dev; @@ -679,7 +686,7 @@ static int mtk_iommu_v1_probe(struct platform_device *pdev) ret = mtk_iommu_v1_hw_init(data); if (ret) - return ret; + goto out_put_larbs; ret = iommu_device_sysfs_add(&data->iommu, &pdev->dev, NULL, dev_name(&pdev->dev)); @@ -701,12 +708,17 @@ out_sysfs_remove: iommu_device_sysfs_remove(&data->iommu); out_clk_unprepare: clk_disable_unprepare(data->bclk); +out_put_larbs: + for (i = 0; i < MTK_LARB_NR_MAX; i++) + put_device(data->larb_imu[i].dev); + return ret; } static void mtk_iommu_v1_remove(struct platform_device *pdev) { struct mtk_iommu_v1_data *data = platform_get_drvdata(pdev); + int i; iommu_device_sysfs_remove(&data->iommu); iommu_device_unregister(&data->iommu); @@ -714,6 +726,9 @@ static void mtk_iommu_v1_remove(struct platform_device *pdev) clk_disable_unprepare(data->bclk); devm_free_irq(&pdev->dev, data->irq, data); component_master_del(&pdev->dev, &mtk_iommu_v1_com_ops); + + for (i = 0; i < MTK_LARB_NR_MAX; i++) + put_device(data->larb_imu[i].dev); } static int __maybe_unused mtk_iommu_v1_suspend(struct device *dev) @@ -752,7 +767,7 @@ static const struct dev_pm_ops mtk_iommu_v1_pm_ops = { static struct platform_driver mtk_iommu_v1_driver = { .probe = mtk_iommu_v1_probe, - .remove_new = mtk_iommu_v1_remove, + .remove = 
mtk_iommu_v1_remove, .driver = { .name = "mtk-iommu-v1", .of_match_table = mtk_iommu_v1_of_ids, diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 3afe0b48a48d..6b989a62def2 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -17,30 +17,23 @@ #include <linux/slab.h> #include <linux/fsl/mc.h> +#include "iommu-priv.h" + static int of_iommu_xlate(struct device *dev, struct of_phandle_args *iommu_spec) { const struct iommu_ops *ops; - struct fwnode_handle *fwnode = &iommu_spec->np->fwnode; int ret; - ops = iommu_ops_from_fwnode(fwnode); - if ((ops && !ops->of_xlate) || - !of_device_is_available(iommu_spec->np)) + if (!of_device_is_available(iommu_spec->np)) return -ENODEV; - ret = iommu_fwspec_init(dev, fwnode, ops); + ret = iommu_fwspec_init(dev, of_fwnode_handle(iommu_spec->np)); if (ret) return ret; - /* - * The otherwise-empty fwspec handily serves to indicate the specific - * IOMMU device we're waiting for, which will be useful if we ever get - * a proper probe-ordering dependency mechanism in future. - */ - if (!ops) - return driver_deferred_probe_check_state(dev); - if (!try_module_get(ops->owner)) + ops = iommu_ops_from_fwnode(&iommu_spec->np->fwnode); + if (!ops->of_xlate || !try_module_get(ops->owner)) return -ENODEV; ret = ops->of_xlate(dev, iommu_spec); @@ -105,6 +98,14 @@ static int of_iommu_configure_device(struct device_node *master_np, of_iommu_configure_dev(master_np, dev); } +static void of_pci_check_device_ats(struct device *dev, struct device_node *np) +{ + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + + if (fwspec && of_property_read_bool(np, "ats-supported")) + fwspec->flags |= IOMMU_FWSPEC_PCI_RC_ATS; +} + /* * Returns: * 0 on success, an iommu was configured @@ -115,7 +116,7 @@ static int of_iommu_configure_device(struct device_node *master_np, int of_iommu_configure(struct device *dev, struct device_node *master_np, const u32 *id) { - struct iommu_fwspec *fwspec; + bool dev_iommu_present; int err; if (!master_np) @@ -123,15 +124,11 @@ int of_iommu_configure(struct device *dev, struct device_node *master_np, /* Serialise to make dev->iommu stable under our potential fwspec */ mutex_lock(&iommu_probe_device_lock); - fwspec = dev_iommu_fwspec_get(dev); - if (fwspec) { - if (fwspec->ops) { - mutex_unlock(&iommu_probe_device_lock); - return 0; - } - /* In the deferred case, start again from scratch */ - iommu_fwspec_free(dev); + if (dev_iommu_fwspec_get(dev)) { + mutex_unlock(&iommu_probe_device_lock); + return 0; } + dev_iommu_present = dev->iommu; /* * We don't currently walk up the tree looking for a parent IOMMU. @@ -147,23 +144,28 @@ int of_iommu_configure(struct device *dev, struct device_node *master_np, pci_request_acs(); err = pci_for_each_dma_alias(to_pci_dev(dev), of_pci_iommu_init, &info); + of_pci_check_device_ats(dev, master_np); } else { err = of_iommu_configure_device(master_np, dev, id); } + + if (err && dev_iommu_present) + iommu_fwspec_free(dev); + else if (err && dev->iommu) + dev_iommu_free(dev); mutex_unlock(&iommu_probe_device_lock); - if (err == -ENODEV || err == -EPROBE_DEFER) - return err; - if (err) - goto err_log; + /* + * If we're not on the iommu_probe_device() path (as indicated by the + * initial dev->iommu) then try to simulate it. This should no longer + * happen unless of_dma_configure() is being misused outside bus code. 
+ */ + if (!err && dev->bus && !dev_iommu_present) + err = iommu_probe_device(dev); - err = iommu_probe_device(dev); - if (err) - goto err_log; - return 0; + if (err && err != -EPROBE_DEFER) + dev_dbg(dev, "Adding to IOMMU failed: %d\n", err); -err_log: - dev_dbg(dev, "Adding to IOMMU failed: %pe\n", ERR_PTR(err)); return err; } @@ -219,7 +221,7 @@ void of_iommu_get_resv_regions(struct device *dev, struct list_head *list) * that represent reservations in the IOVA space, which are regions that should * not be mapped. */ - if (of_find_property(it.node, "reg", NULL)) { + if (of_property_present(it.node, "reg")) { err = of_address_to_resource(it.node, 0, &phys); if (err < 0) { dev_err(dev, "failed to parse memory region %pOF: %d\n", diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index c9528065a59a..768973b7e511 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1123,29 +1123,15 @@ static int omap_iommu_dra7_get_dsp_system_cfg(struct platform_device *pdev, struct omap_iommu *obj) { struct device_node *np = pdev->dev.of_node; - int ret; if (!of_device_is_compatible(np, "ti,dra7-dsp-iommu")) return 0; - if (!of_property_read_bool(np, "ti,syscon-mmuconfig")) { - dev_err(&pdev->dev, "ti,syscon-mmuconfig property is missing\n"); - return -EINVAL; - } - - obj->syscfg = - syscon_regmap_lookup_by_phandle(np, "ti,syscon-mmuconfig"); - if (IS_ERR(obj->syscfg)) { - /* can fail with -EPROBE_DEFER */ - ret = PTR_ERR(obj->syscfg); - return ret; - } - - if (of_property_read_u32_index(np, "ti,syscon-mmuconfig", 1, - &obj->id)) { - dev_err(&pdev->dev, "couldn't get the IOMMU instance id within subsystem\n"); - return -EINVAL; - } + obj->syscfg = syscon_regmap_lookup_by_phandle_args(np, "ti,syscon-mmuconfig", + 1, &obj->id); + if (IS_ERR(obj->syscfg)) + return dev_err_probe(&pdev->dev, PTR_ERR(obj->syscfg), + "ti,syscon-mmuconfig property is missing\n"); if (obj->id != 0 && obj->id != 1) { dev_err(&pdev->dev, "invalid IOMMU instance id\n"); @@ -1230,25 +1216,24 @@ static int omap_iommu_probe(struct platform_device *pdev) if (err) return err; - err = iommu_device_register(&obj->iommu, &omap_iommu_ops, &pdev->dev); - if (err) - goto out_sysfs; obj->has_iommu_driver = true; } + err = iommu_device_register(&obj->iommu, &omap_iommu_ops, &pdev->dev); + if (err) + goto out_sysfs; + pm_runtime_enable(obj->dev); omap_iommu_debugfs_add(obj); dev_info(&pdev->dev, "%s registered\n", obj->name); - /* Re-probe bus to probe device attached to this IOMMU */ - bus_iommu_probe(&platform_bus_type); - return 0; out_sysfs: - iommu_device_sysfs_remove(&obj->iommu); + if (obj->has_iommu_driver) + iommu_device_sysfs_remove(&obj->iommu); return err; } @@ -1256,10 +1241,10 @@ static void omap_iommu_remove(struct platform_device *pdev) { struct omap_iommu *obj = platform_get_drvdata(pdev); - if (obj->has_iommu_driver) { + if (obj->has_iommu_driver) iommu_device_sysfs_remove(&obj->iommu); - iommu_device_unregister(&obj->iommu); - } + + iommu_device_unregister(&obj->iommu); omap_iommu_debugfs_remove(obj); @@ -1286,7 +1271,7 @@ static const struct of_device_id omap_iommu_of_match[] = { static struct platform_driver omap_iommu_driver = { .probe = omap_iommu_probe, - .remove_new = omap_iommu_remove, + .remove = omap_iommu_remove, .driver = { .name = "omap-iommu", .pm = &omap_iommu_pm_ops, @@ -1318,8 +1303,8 @@ static int omap_iommu_map(struct iommu_domain *domain, unsigned long da, struct omap_iommu_device *iommu; struct omap_iommu *oiommu; struct iotlb_entry e; + int ret = -EINVAL; int 
omap_pgsz; - u32 ret = -EINVAL; int i; omap_pgsz = bytes_to_iopgsz(bytes); @@ -1446,8 +1431,8 @@ static void omap_iommu_detach_fini(struct omap_iommu_domain *odomain) odomain->iommus = NULL; } -static int -omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int omap_iommu_attach_dev(struct iommu_domain *domain, + struct device *dev, struct iommu_domain *old) { struct omap_iommu_arch_data *arch_data = dev_iommu_priv_get(dev); struct omap_iommu_domain *omap_domain = to_omap_domain(domain); @@ -1551,15 +1536,15 @@ static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain, } static int omap_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct omap_iommu_domain *omap_domain; - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; - omap_domain = to_omap_domain(domain); + omap_domain = to_omap_domain(old); spin_lock(&omap_domain->lock); _omap_iommu_detach_dev(omap_domain, dev); spin_unlock(&omap_domain->lock); @@ -1585,6 +1570,8 @@ static struct iommu_domain *omap_iommu_domain_alloc_paging(struct device *dev) spin_lock_init(&omap_domain->lock); + omap_domain->domain.pgsize_bitmap = OMAP_IOMMU_PGSIZES; + omap_domain->domain.geometry.aperture_start = 0; omap_domain->domain.geometry.aperture_end = (1ULL << 32) - 1; omap_domain->domain.geometry.force_aperture = true; @@ -1681,23 +1668,20 @@ static struct iommu_device *omap_iommu_probe_device(struct device *dev) } pdev = of_find_device_by_node(np); + of_node_put(np); if (!pdev) { - of_node_put(np); kfree(arch_data); return ERR_PTR(-ENODEV); } oiommu = platform_get_drvdata(pdev); + put_device(&pdev->dev); if (!oiommu) { - of_node_put(np); kfree(arch_data); return ERR_PTR(-EINVAL); } tmp->iommu_dev = oiommu; - tmp->dev = &pdev->dev; - - of_node_put(np); } dev_iommu_priv_set(dev, arch_data); @@ -1723,13 +1707,19 @@ static void omap_iommu_release_device(struct device *dev) } +static int omap_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args) +{ + /* TODO: collect args->np to save re-parsing in probe above */ + return 0; +} + static const struct iommu_ops omap_iommu_ops = { .identity_domain = &omap_iommu_identity_domain, .domain_alloc_paging = omap_iommu_domain_alloc_paging, .probe_device = omap_iommu_probe_device, .release_device = omap_iommu_release_device, .device_group = generic_single_device_group, - .pgsize_bitmap = OMAP_IOMMU_PGSIZES, + .of_xlate = omap_iommu_of_xlate, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = omap_iommu_attach_dev, .map_pages = omap_iommu_map, diff --git a/drivers/iommu/omap-iommu.h b/drivers/iommu/omap-iommu.h index 27697109ec79..50b39be61abc 100644 --- a/drivers/iommu/omap-iommu.h +++ b/drivers/iommu/omap-iommu.h @@ -88,7 +88,6 @@ struct omap_iommu { /** * struct omap_iommu_arch_data - omap iommu private data * @iommu_dev: handle of the OMAP iommu device - * @dev: handle of the iommu device * * This is an omap iommu private data object, which binds an iommu user * to its iommu device. 
This object should be placed at the iommu user's @@ -97,7 +96,6 @@ struct omap_iommu { */ struct omap_iommu_arch_data { struct omap_iommu *iommu_dev; - struct device *dev; }; struct cr_regs { diff --git a/drivers/iommu/riscv/Kconfig b/drivers/iommu/riscv/Kconfig new file mode 100644 index 000000000000..c071816f59a6 --- /dev/null +++ b/drivers/iommu/riscv/Kconfig @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0-only +# RISC-V IOMMU support + +config RISCV_IOMMU + bool "RISC-V IOMMU Support" + depends on RISCV && 64BIT + default y + select IOMMU_API + help + Support for implementations of the RISC-V IOMMU architecture that + complements the RISC-V MMU capabilities, providing similar address + translation and protection functions for accesses from I/O devices. + + Say Y here if your SoC includes an IOMMU device implementing + the RISC-V IOMMU architecture. + +config RISCV_IOMMU_PCI + def_bool y if RISCV_IOMMU && PCI_MSI + help + Support for the PCIe implementation of RISC-V IOMMU architecture. diff --git a/drivers/iommu/riscv/Makefile b/drivers/iommu/riscv/Makefile new file mode 100644 index 000000000000..b5929f9f23e6 --- /dev/null +++ b/drivers/iommu/riscv/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-y += iommu.o iommu-platform.o +obj-$(CONFIG_RISCV_IOMMU_PCI) += iommu-pci.o diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h new file mode 100644 index 000000000000..98daf0e1a306 --- /dev/null +++ b/drivers/iommu/riscv/iommu-bits.h @@ -0,0 +1,784 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright © 2022-2024 Rivos Inc. + * Copyright © 2023 FORTH-ICS/CARV + * Copyright © 2023 RISC-V IOMMU Task Group + * + * RISC-V IOMMU - Register Layout and Data Structures. + * + * Based on the 'RISC-V IOMMU Architecture Specification', Version 1.0 + * Published at https://github.com/riscv-non-isa/riscv-iommu + * + */ + +#ifndef _RISCV_IOMMU_BITS_H_ +#define _RISCV_IOMMU_BITS_H_ + +#include <linux/types.h> +#include <linux/bitfield.h> +#include <linux/bits.h> + +/* + * Chapter 5: Memory Mapped register interface + */ + +/* Common field positions */ +#define RISCV_IOMMU_PPN_FIELD GENMASK_ULL(53, 10) +#define RISCV_IOMMU_QUEUE_LOG2SZ_FIELD GENMASK_ULL(4, 0) +#define RISCV_IOMMU_QUEUE_INDEX_FIELD GENMASK_ULL(31, 0) +#define RISCV_IOMMU_QUEUE_ENABLE BIT(0) +#define RISCV_IOMMU_QUEUE_INTR_ENABLE BIT(1) +#define RISCV_IOMMU_QUEUE_MEM_FAULT BIT(8) +#define RISCV_IOMMU_QUEUE_OVERFLOW BIT(9) +#define RISCV_IOMMU_QUEUE_ACTIVE BIT(16) +#define RISCV_IOMMU_QUEUE_BUSY BIT(17) + +#define RISCV_IOMMU_ATP_PPN_FIELD GENMASK_ULL(43, 0) +#define RISCV_IOMMU_ATP_MODE_FIELD GENMASK_ULL(63, 60) + +/* 5.3 IOMMU Capabilities (64bits) */ +#define RISCV_IOMMU_REG_CAPABILITIES 0x0000 +#define RISCV_IOMMU_CAPABILITIES_VERSION GENMASK_ULL(7, 0) +#define RISCV_IOMMU_CAPABILITIES_SV32 BIT_ULL(8) +#define RISCV_IOMMU_CAPABILITIES_SV39 BIT_ULL(9) +#define RISCV_IOMMU_CAPABILITIES_SV48 BIT_ULL(10) +#define RISCV_IOMMU_CAPABILITIES_SV57 BIT_ULL(11) +#define RISCV_IOMMU_CAPABILITIES_SVPBMT BIT_ULL(15) +#define RISCV_IOMMU_CAPABILITIES_SV32X4 BIT_ULL(16) +#define RISCV_IOMMU_CAPABILITIES_SV39X4 BIT_ULL(17) +#define RISCV_IOMMU_CAPABILITIES_SV48X4 BIT_ULL(18) +#define RISCV_IOMMU_CAPABILITIES_SV57X4 BIT_ULL(19) +#define RISCV_IOMMU_CAPABILITIES_AMO_MRIF BIT_ULL(21) +#define RISCV_IOMMU_CAPABILITIES_MSI_FLAT BIT_ULL(22) +#define RISCV_IOMMU_CAPABILITIES_MSI_MRIF BIT_ULL(23) +#define RISCV_IOMMU_CAPABILITIES_AMO_HWAD BIT_ULL(24) +#define RISCV_IOMMU_CAPABILITIES_ATS 
BIT_ULL(25) +#define RISCV_IOMMU_CAPABILITIES_T2GPA BIT_ULL(26) +#define RISCV_IOMMU_CAPABILITIES_END BIT_ULL(27) +#define RISCV_IOMMU_CAPABILITIES_IGS GENMASK_ULL(29, 28) +#define RISCV_IOMMU_CAPABILITIES_HPM BIT_ULL(30) +#define RISCV_IOMMU_CAPABILITIES_DBG BIT_ULL(31) +#define RISCV_IOMMU_CAPABILITIES_PAS GENMASK_ULL(37, 32) +#define RISCV_IOMMU_CAPABILITIES_PD8 BIT_ULL(38) +#define RISCV_IOMMU_CAPABILITIES_PD17 BIT_ULL(39) +#define RISCV_IOMMU_CAPABILITIES_PD20 BIT_ULL(40) + +/** + * enum riscv_iommu_igs_settings - Interrupt Generation Support Settings + * @RISCV_IOMMU_CAPABILITIES_IGS_MSI: IOMMU supports only MSI generation + * @RISCV_IOMMU_CAPABILITIES_IGS_WSI: IOMMU supports only Wired-Signaled interrupt + * @RISCV_IOMMU_CAPABILITIES_IGS_BOTH: IOMMU supports both MSI and WSI generation + * @RISCV_IOMMU_CAPABILITIES_IGS_RSRV: Reserved for standard use + */ +enum riscv_iommu_igs_settings { + RISCV_IOMMU_CAPABILITIES_IGS_MSI = 0, + RISCV_IOMMU_CAPABILITIES_IGS_WSI = 1, + RISCV_IOMMU_CAPABILITIES_IGS_BOTH = 2, + RISCV_IOMMU_CAPABILITIES_IGS_RSRV = 3 +}; + +/* 5.4 Features control register (32bits) */ +#define RISCV_IOMMU_REG_FCTL 0x0008 +#define RISCV_IOMMU_FCTL_BE BIT(0) +#define RISCV_IOMMU_FCTL_WSI BIT(1) +#define RISCV_IOMMU_FCTL_GXL BIT(2) + +/* 5.5 Device-directory-table pointer (64bits) */ +#define RISCV_IOMMU_REG_DDTP 0x0010 +#define RISCV_IOMMU_DDTP_IOMMU_MODE GENMASK_ULL(3, 0) +#define RISCV_IOMMU_DDTP_BUSY BIT_ULL(4) +#define RISCV_IOMMU_DDTP_PPN RISCV_IOMMU_PPN_FIELD + +/** + * enum riscv_iommu_ddtp_modes - IOMMU translation modes + * @RISCV_IOMMU_DDTP_IOMMU_MODE_OFF: No inbound transactions allowed + * @RISCV_IOMMU_DDTP_IOMMU_MODE_BARE: Pass-through mode + * @RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL: One-level DDT + * @RISCV_IOMMU_DDTP_IOMMU_MODE_2LVL: Two-level DDT + * @RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL: Three-level DDT + * @RISCV_IOMMU_DDTP_IOMMU_MODE_MAX: Max value allowed by specification + */ +enum riscv_iommu_ddtp_modes { + RISCV_IOMMU_DDTP_IOMMU_MODE_OFF = 0, + RISCV_IOMMU_DDTP_IOMMU_MODE_BARE = 1, + RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL = 2, + RISCV_IOMMU_DDTP_IOMMU_MODE_2LVL = 3, + RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL = 4, + RISCV_IOMMU_DDTP_IOMMU_MODE_MAX = 4 +}; + +/* 5.6 Command Queue Base (64bits) */ +#define RISCV_IOMMU_REG_CQB 0x0018 +#define RISCV_IOMMU_CQB_ENTRIES RISCV_IOMMU_QUEUE_LOG2SZ_FIELD +#define RISCV_IOMMU_CQB_PPN RISCV_IOMMU_PPN_FIELD + +/* 5.7 Command Queue head (32bits) */ +#define RISCV_IOMMU_REG_CQH 0x0020 +#define RISCV_IOMMU_CQH_INDEX RISCV_IOMMU_QUEUE_INDEX_FIELD + +/* 5.8 Command Queue tail (32bits) */ +#define RISCV_IOMMU_REG_CQT 0x0024 +#define RISCV_IOMMU_CQT_INDEX RISCV_IOMMU_QUEUE_INDEX_FIELD + +/* 5.9 Fault Queue Base (64bits) */ +#define RISCV_IOMMU_REG_FQB 0x0028 +#define RISCV_IOMMU_FQB_ENTRIES RISCV_IOMMU_QUEUE_LOG2SZ_FIELD +#define RISCV_IOMMU_FQB_PPN RISCV_IOMMU_PPN_FIELD + +/* 5.10 Fault Queue Head (32bits) */ +#define RISCV_IOMMU_REG_FQH 0x0030 +#define RISCV_IOMMU_FQH_INDEX RISCV_IOMMU_QUEUE_INDEX_FIELD + +/* 5.11 Fault Queue tail (32bits) */ +#define RISCV_IOMMU_REG_FQT 0x0034 +#define RISCV_IOMMU_FQT_INDEX RISCV_IOMMU_QUEUE_INDEX_FIELD + +/* 5.12 Page Request Queue base (64bits) */ +#define RISCV_IOMMU_REG_PQB 0x0038 +#define RISCV_IOMMU_PQB_ENTRIES RISCV_IOMMU_QUEUE_LOG2SZ_FIELD +#define RISCV_IOMMU_PQB_PPN RISCV_IOMMU_PPN_FIELD + +/* 5.13 Page Request Queue head (32bits) */ +#define RISCV_IOMMU_REG_PQH 0x0040 +#define RISCV_IOMMU_PQH_INDEX RISCV_IOMMU_QUEUE_INDEX_FIELD + +/* 5.14 Page Request Queue tail (32bits) */ +#define 
RISCV_IOMMU_REG_PQT 0x0044 +#define RISCV_IOMMU_PQT_INDEX_MASK RISCV_IOMMU_QUEUE_INDEX_FIELD + +/* 5.15 Command Queue CSR (32bits) */ +#define RISCV_IOMMU_REG_CQCSR 0x0048 +#define RISCV_IOMMU_CQCSR_CQEN RISCV_IOMMU_QUEUE_ENABLE +#define RISCV_IOMMU_CQCSR_CIE RISCV_IOMMU_QUEUE_INTR_ENABLE +#define RISCV_IOMMU_CQCSR_CQMF RISCV_IOMMU_QUEUE_MEM_FAULT +#define RISCV_IOMMU_CQCSR_CMD_TO BIT(9) +#define RISCV_IOMMU_CQCSR_CMD_ILL BIT(10) +#define RISCV_IOMMU_CQCSR_FENCE_W_IP BIT(11) +#define RISCV_IOMMU_CQCSR_CQON RISCV_IOMMU_QUEUE_ACTIVE +#define RISCV_IOMMU_CQCSR_BUSY RISCV_IOMMU_QUEUE_BUSY + +/* 5.16 Fault Queue CSR (32bits) */ +#define RISCV_IOMMU_REG_FQCSR 0x004C +#define RISCV_IOMMU_FQCSR_FQEN RISCV_IOMMU_QUEUE_ENABLE +#define RISCV_IOMMU_FQCSR_FIE RISCV_IOMMU_QUEUE_INTR_ENABLE +#define RISCV_IOMMU_FQCSR_FQMF RISCV_IOMMU_QUEUE_MEM_FAULT +#define RISCV_IOMMU_FQCSR_FQOF RISCV_IOMMU_QUEUE_OVERFLOW +#define RISCV_IOMMU_FQCSR_FQON RISCV_IOMMU_QUEUE_ACTIVE +#define RISCV_IOMMU_FQCSR_BUSY RISCV_IOMMU_QUEUE_BUSY + +/* 5.17 Page Request Queue CSR (32bits) */ +#define RISCV_IOMMU_REG_PQCSR 0x0050 +#define RISCV_IOMMU_PQCSR_PQEN RISCV_IOMMU_QUEUE_ENABLE +#define RISCV_IOMMU_PQCSR_PIE RISCV_IOMMU_QUEUE_INTR_ENABLE +#define RISCV_IOMMU_PQCSR_PQMF RISCV_IOMMU_QUEUE_MEM_FAULT +#define RISCV_IOMMU_PQCSR_PQOF RISCV_IOMMU_QUEUE_OVERFLOW +#define RISCV_IOMMU_PQCSR_PQON RISCV_IOMMU_QUEUE_ACTIVE +#define RISCV_IOMMU_PQCSR_BUSY RISCV_IOMMU_QUEUE_BUSY + +/* 5.18 Interrupt Pending Status (32bits) */ +#define RISCV_IOMMU_REG_IPSR 0x0054 + +#define RISCV_IOMMU_INTR_CQ 0 +#define RISCV_IOMMU_INTR_FQ 1 +#define RISCV_IOMMU_INTR_PM 2 +#define RISCV_IOMMU_INTR_PQ 3 +#define RISCV_IOMMU_INTR_COUNT 4 + +#define RISCV_IOMMU_IPSR_CIP BIT(RISCV_IOMMU_INTR_CQ) +#define RISCV_IOMMU_IPSR_FIP BIT(RISCV_IOMMU_INTR_FQ) +#define RISCV_IOMMU_IPSR_PMIP BIT(RISCV_IOMMU_INTR_PM) +#define RISCV_IOMMU_IPSR_PIP BIT(RISCV_IOMMU_INTR_PQ) + +/* 5.19 Performance monitoring counter overflow status (32bits) */ +#define RISCV_IOMMU_REG_IOCOUNTOVF 0x0058 +#define RISCV_IOMMU_IOCOUNTOVF_CY BIT(0) +#define RISCV_IOMMU_IOCOUNTOVF_HPM GENMASK_ULL(31, 1) + +/* 5.20 Performance monitoring counter inhibits (32bits) */ +#define RISCV_IOMMU_REG_IOCOUNTINH 0x005C +#define RISCV_IOMMU_IOCOUNTINH_CY BIT(0) +#define RISCV_IOMMU_IOCOUNTINH_HPM GENMASK(31, 1) + +/* 5.21 Performance monitoring cycles counter (64bits) */ +#define RISCV_IOMMU_REG_IOHPMCYCLES 0x0060 +#define RISCV_IOMMU_IOHPMCYCLES_COUNTER GENMASK_ULL(62, 0) +#define RISCV_IOMMU_IOHPMCYCLES_OF BIT_ULL(63) + +/* 5.22 Performance monitoring event counters (31 * 64bits) */ +#define RISCV_IOMMU_REG_IOHPMCTR_BASE 0x0068 +#define RISCV_IOMMU_REG_IOHPMCTR(_n) (RISCV_IOMMU_REG_IOHPMCTR_BASE + ((_n) * 0x8)) + +/* 5.23 Performance monitoring event selectors (31 * 64bits) */ +#define RISCV_IOMMU_REG_IOHPMEVT_BASE 0x0160 +#define RISCV_IOMMU_REG_IOHPMEVT(_n) (RISCV_IOMMU_REG_IOHPMEVT_BASE + ((_n) * 0x8)) +#define RISCV_IOMMU_IOHPMEVT_EVENTID GENMASK_ULL(14, 0) +#define RISCV_IOMMU_IOHPMEVT_DMASK BIT_ULL(15) +#define RISCV_IOMMU_IOHPMEVT_PID_PSCID GENMASK_ULL(35, 16) +#define RISCV_IOMMU_IOHPMEVT_DID_GSCID GENMASK_ULL(59, 36) +#define RISCV_IOMMU_IOHPMEVT_PV_PSCV BIT_ULL(60) +#define RISCV_IOMMU_IOHPMEVT_DV_GSCV BIT_ULL(61) +#define RISCV_IOMMU_IOHPMEVT_IDT BIT_ULL(62) +#define RISCV_IOMMU_IOHPMEVT_OF BIT_ULL(63) + +/* Number of defined performance-monitoring event selectors */ +#define RISCV_IOMMU_IOHPMEVT_CNT 31 + +/** + * enum riscv_iommu_hpmevent_id - Performance-monitoring event identifier + * + * 
@RISCV_IOMMU_HPMEVENT_INVALID: Invalid event, do not count + * @RISCV_IOMMU_HPMEVENT_URQ: Untranslated requests + * @RISCV_IOMMU_HPMEVENT_TRQ: Translated requests + * @RISCV_IOMMU_HPMEVENT_ATS_RQ: ATS translation requests + * @RISCV_IOMMU_HPMEVENT_TLB_MISS: TLB misses + * @RISCV_IOMMU_HPMEVENT_DD_WALK: Device directory walks + * @RISCV_IOMMU_HPMEVENT_PD_WALK: Process directory walks + * @RISCV_IOMMU_HPMEVENT_S_VS_WALKS: First-stage page table walks + * @RISCV_IOMMU_HPMEVENT_G_WALKS: Second-stage page table walks + * @RISCV_IOMMU_HPMEVENT_MAX: Value to denote maximum Event IDs + */ +enum riscv_iommu_hpmevent_id { + RISCV_IOMMU_HPMEVENT_INVALID = 0, + RISCV_IOMMU_HPMEVENT_URQ = 1, + RISCV_IOMMU_HPMEVENT_TRQ = 2, + RISCV_IOMMU_HPMEVENT_ATS_RQ = 3, + RISCV_IOMMU_HPMEVENT_TLB_MISS = 4, + RISCV_IOMMU_HPMEVENT_DD_WALK = 5, + RISCV_IOMMU_HPMEVENT_PD_WALK = 6, + RISCV_IOMMU_HPMEVENT_S_VS_WALKS = 7, + RISCV_IOMMU_HPMEVENT_G_WALKS = 8, + RISCV_IOMMU_HPMEVENT_MAX = 9 +}; + +/* 5.24 Translation request IOVA (64bits) */ +#define RISCV_IOMMU_REG_TR_REQ_IOVA 0x0258 +#define RISCV_IOMMU_TR_REQ_IOVA_VPN GENMASK_ULL(63, 12) + +/* 5.25 Translation request control (64bits) */ +#define RISCV_IOMMU_REG_TR_REQ_CTL 0x0260 +#define RISCV_IOMMU_TR_REQ_CTL_GO_BUSY BIT_ULL(0) +#define RISCV_IOMMU_TR_REQ_CTL_PRIV BIT_ULL(1) +#define RISCV_IOMMU_TR_REQ_CTL_EXE BIT_ULL(2) +#define RISCV_IOMMU_TR_REQ_CTL_NW BIT_ULL(3) +#define RISCV_IOMMU_TR_REQ_CTL_PID GENMASK_ULL(31, 12) +#define RISCV_IOMMU_TR_REQ_CTL_PV BIT_ULL(32) +#define RISCV_IOMMU_TR_REQ_CTL_DID GENMASK_ULL(63, 40) + +/* 5.26 Translation request response (64bits) */ +#define RISCV_IOMMU_REG_TR_RESPONSE 0x0268 +#define RISCV_IOMMU_TR_RESPONSE_FAULT BIT_ULL(0) +#define RISCV_IOMMU_TR_RESPONSE_PBMT GENMASK_ULL(8, 7) +#define RISCV_IOMMU_TR_RESPONSE_SZ BIT_ULL(9) +#define RISCV_IOMMU_TR_RESPONSE_PPN RISCV_IOMMU_PPN_FIELD + +/* 5.27 Interrupt cause to vector (64bits) */ +#define RISCV_IOMMU_REG_ICVEC 0x02F8 +#define RISCV_IOMMU_ICVEC_CIV GENMASK_ULL(3, 0) +#define RISCV_IOMMU_ICVEC_FIV GENMASK_ULL(7, 4) +#define RISCV_IOMMU_ICVEC_PMIV GENMASK_ULL(11, 8) +#define RISCV_IOMMU_ICVEC_PIV GENMASK_ULL(15, 12) + +/* 5.28 MSI Configuration table (32 * 64bits) */ +#define RISCV_IOMMU_REG_MSI_CFG_TBL 0x0300 +#define RISCV_IOMMU_REG_MSI_CFG_TBL_ADDR(_n) \ + (RISCV_IOMMU_REG_MSI_CFG_TBL + ((_n) * 0x10)) +#define RISCV_IOMMU_MSI_CFG_TBL_ADDR GENMASK_ULL(55, 2) +#define RISCV_IOMMU_REG_MSI_CFG_TBL_DATA(_n) \ + (RISCV_IOMMU_REG_MSI_CFG_TBL + ((_n) * 0x10) + 0x08) +#define RISCV_IOMMU_MSI_CFG_TBL_DATA GENMASK_ULL(31, 0) +#define RISCV_IOMMU_REG_MSI_CFG_TBL_CTRL(_n) \ + (RISCV_IOMMU_REG_MSI_CFG_TBL + ((_n) * 0x10) + 0x0C) +#define RISCV_IOMMU_MSI_CFG_TBL_CTRL_M BIT_ULL(0) + +#define RISCV_IOMMU_REG_SIZE 0x1000 + +/* + * Chapter 2: Data structures + */ + +/* + * Device Directory Table macros for non-leaf nodes + */ +#define RISCV_IOMMU_DDTE_V BIT_ULL(0) +#define RISCV_IOMMU_DDTE_PPN RISCV_IOMMU_PPN_FIELD + +/** + * struct riscv_iommu_dc - Device Context + * @tc: Translation Control + * @iohgatp: I/O Hypervisor guest address translation and protection + * (Second stage context) + * @ta: Translation Attributes + * @fsc: First stage context + * @msiptp: MSI page table pointer + * @msi_addr_mask: MSI address mask + * @msi_addr_pattern: MSI address pattern + * @_reserved: Reserved for future use, padding + * + * This structure is used for leaf nodes on the Device Directory Table, + * in case RISCV_IOMMU_CAPABILITIES_MSI_FLAT is not set, the bottom 4 fields + * are not present and are 
skipped with pointer arithmetic to avoid + * casting; check out riscv_iommu_get_dc(). + * See section 2.1 for more details. + */ +struct riscv_iommu_dc { + u64 tc; + u64 iohgatp; + u64 ta; + u64 fsc; + u64 msiptp; + u64 msi_addr_mask; + u64 msi_addr_pattern; + u64 _reserved; +}; + +/* Translation control fields */ +#define RISCV_IOMMU_DC_TC_V BIT_ULL(0) +#define RISCV_IOMMU_DC_TC_EN_ATS BIT_ULL(1) +#define RISCV_IOMMU_DC_TC_EN_PRI BIT_ULL(2) +#define RISCV_IOMMU_DC_TC_T2GPA BIT_ULL(3) +#define RISCV_IOMMU_DC_TC_DTF BIT_ULL(4) +#define RISCV_IOMMU_DC_TC_PDTV BIT_ULL(5) +#define RISCV_IOMMU_DC_TC_PRPR BIT_ULL(6) +#define RISCV_IOMMU_DC_TC_GADE BIT_ULL(7) +#define RISCV_IOMMU_DC_TC_SADE BIT_ULL(8) +#define RISCV_IOMMU_DC_TC_DPE BIT_ULL(9) +#define RISCV_IOMMU_DC_TC_SBE BIT_ULL(10) +#define RISCV_IOMMU_DC_TC_SXL BIT_ULL(11) + +/* Second-stage (aka G-stage) context fields */ +#define RISCV_IOMMU_DC_IOHGATP_PPN RISCV_IOMMU_ATP_PPN_FIELD +#define RISCV_IOMMU_DC_IOHGATP_GSCID GENMASK_ULL(59, 44) +#define RISCV_IOMMU_DC_IOHGATP_MODE RISCV_IOMMU_ATP_MODE_FIELD + +/** + * enum riscv_iommu_dc_iohgatp_modes - Guest address translation/protection modes + * @RISCV_IOMMU_DC_IOHGATP_MODE_BARE: No translation/protection + * @RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4: Sv32x4 (2-bit extension of Sv32), when fctl.GXL == 1 + * @RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4: Sv39x4 (2-bit extension of Sv39), when fctl.GXL == 0 + * @RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4: Sv48x4 (2-bit extension of Sv48), when fctl.GXL == 0 + * @RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4: Sv57x4 (2-bit extension of Sv57), when fctl.GXL == 0 + */ +enum riscv_iommu_dc_iohgatp_modes { + RISCV_IOMMU_DC_IOHGATP_MODE_BARE = 0, + RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4 = 8, + RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4 = 8, + RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4 = 9, + RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4 = 10 +}; + +/* Translation attributes fields */ +#define RISCV_IOMMU_DC_TA_PSCID GENMASK_ULL(31, 12) + +/* First-stage context fields */ +#define RISCV_IOMMU_DC_FSC_PPN RISCV_IOMMU_ATP_PPN_FIELD +#define RISCV_IOMMU_DC_FSC_MODE RISCV_IOMMU_ATP_MODE_FIELD + +/** + * enum riscv_iommu_dc_fsc_atp_modes - First stage address translation/protection modes + * @RISCV_IOMMU_DC_FSC_MODE_BARE: No translation/protection + * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32: Sv32, when dc.tc.SXL == 1 + * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39: Sv39, when dc.tc.SXL == 0 + * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48: Sv48, when dc.tc.SXL == 0 + * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57: Sv57, when dc.tc.SXL == 0 + * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8: 1lvl PDT, 8bit process ids + * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17: 2lvl PDT, 17bit process ids + * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20: 3lvl PDT, 20bit process ids + * + * FSC holds IOSATP when RISCV_IOMMU_DC_TC_PDTV is 0 and PDTP otherwise. + * IOSATP controls the first stage address translation (same as the satp register on + * the RISC-V MMU), and PDTP holds the process directory table, used to select a + * first stage page table based on a process id (for devices that support multiple + * process ids).
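+ *
+ * A composed first-stage context value then looks like the following
+ * (illustrative sketch only, assuming an Sv39 page table rooted at a
+ * hypothetical pgd_root):
+ *
+ *	fsc = FIELD_PREP(RISCV_IOMMU_DC_FSC_MODE,
+ *			 RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39) |
+ *	      FIELD_PREP(RISCV_IOMMU_DC_FSC_PPN, virt_to_pfn(pgd_root));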
+ */ +enum riscv_iommu_dc_fsc_atp_modes { + RISCV_IOMMU_DC_FSC_MODE_BARE = 0, + RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32 = 8, + RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 = 8, + RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48 = 9, + RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57 = 10, + RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8 = 1, + RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17 = 2, + RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20 = 3 +}; + +/* MSI page table pointer */ +#define RISCV_IOMMU_DC_MSIPTP_PPN RISCV_IOMMU_ATP_PPN_FIELD +#define RISCV_IOMMU_DC_MSIPTP_MODE RISCV_IOMMU_ATP_MODE_FIELD +#define RISCV_IOMMU_DC_MSIPTP_MODE_OFF 0 +#define RISCV_IOMMU_DC_MSIPTP_MODE_FLAT 1 + +/* MSI address mask */ +#define RISCV_IOMMU_DC_MSI_ADDR_MASK GENMASK_ULL(51, 0) + +/* MSI address pattern */ +#define RISCV_IOMMU_DC_MSI_PATTERN GENMASK_ULL(51, 0) + +/** + * struct riscv_iommu_pc - Process Context + * @ta: Translation Attributes + * @fsc: First stage context + * + * This structure is used for leaf nodes on the Process Directory Table. + * See section 2.3 for more details. + */ +struct riscv_iommu_pc { + u64 ta; + u64 fsc; +}; + +/* Translation attributes fields */ +#define RISCV_IOMMU_PC_TA_V BIT_ULL(0) +#define RISCV_IOMMU_PC_TA_ENS BIT_ULL(1) +#define RISCV_IOMMU_PC_TA_SUM BIT_ULL(2) +#define RISCV_IOMMU_PC_TA_PSCID GENMASK_ULL(31, 12) + +/* First stage context fields */ +#define RISCV_IOMMU_PC_FSC_PPN RISCV_IOMMU_ATP_PPN_FIELD +#define RISCV_IOMMU_PC_FSC_MODE RISCV_IOMMU_ATP_MODE_FIELD + +/* + * Chapter 3: In-memory queue interface + */ + +/** + * struct riscv_iommu_command - Generic IOMMU command structure + * @dword0: Includes the opcode and the function identifier + * @dword1: Opcode specific data + * + * The commands are interpreted as two 64bit fields, where the first + * 7bits of the first field are the opcode which also defines the + * command's format, followed by a 3bit field that specifies the + * function invoked by that command, and the rest is opcode-specific. + * This is a generic struct which will be populated differently + * according to each command. For more information on the commands and + * the command queue, see section 3.1. + */ +struct riscv_iommu_command { + u64 dword0; + u64 dword1; +}; + +/* Fields on dword0, common for all commands */ +#define RISCV_IOMMU_CMD_OPCODE GENMASK_ULL(6, 0) +#define RISCV_IOMMU_CMD_FUNC GENMASK_ULL(9, 7) + +/* 3.1.1 IOMMU Page-table cache invalidation */ +/* Fields on dword0 */ +#define RISCV_IOMMU_CMD_IOTINVAL_OPCODE 1 +#define RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA 0 +#define RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA 1 +#define RISCV_IOMMU_CMD_IOTINVAL_AV BIT_ULL(10) +#define RISCV_IOMMU_CMD_IOTINVAL_PSCID GENMASK_ULL(31, 12) +#define RISCV_IOMMU_CMD_IOTINVAL_PSCV BIT_ULL(32) +#define RISCV_IOMMU_CMD_IOTINVAL_GV BIT_ULL(33) +#define RISCV_IOMMU_CMD_IOTINVAL_GSCID GENMASK_ULL(59, 44) +/* dword1[61:10] is the 4K-aligned page address */ +#define RISCV_IOMMU_CMD_IOTINVAL_ADDR GENMASK_ULL(61, 10) + +/* 3.1.2 IOMMU Command Queue Fences */ +/* Fields on dword0 */ +#define RISCV_IOMMU_CMD_IOFENCE_OPCODE 2 +#define RISCV_IOMMU_CMD_IOFENCE_FUNC_C 0 +#define RISCV_IOMMU_CMD_IOFENCE_AV BIT_ULL(10) +#define RISCV_IOMMU_CMD_IOFENCE_WSI BIT_ULL(11) +#define RISCV_IOMMU_CMD_IOFENCE_PR BIT_ULL(12) +#define RISCV_IOMMU_CMD_IOFENCE_PW BIT_ULL(13) +#define RISCV_IOMMU_CMD_IOFENCE_DATA GENMASK_ULL(63, 32) +/* dword1 is the address, word-size aligned and shifted to the right by two bits.
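+ * In other words, dword1 = addr >> 2, exactly as
+ * riscv_iommu_cmd_iofence_set_av() computes it below.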
*/ + +/* 3.1.3 IOMMU Directory cache invalidation */ +/* Fields on dword0 */ +#define RISCV_IOMMU_CMD_IODIR_OPCODE 3 +#define RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT 0 +#define RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT 1 +#define RISCV_IOMMU_CMD_IODIR_PID GENMASK_ULL(31, 12) +#define RISCV_IOMMU_CMD_IODIR_DV BIT_ULL(33) +#define RISCV_IOMMU_CMD_IODIR_DID GENMASK_ULL(63, 40) +/* dword1 is reserved for standard use */ + +/* 3.1.4 IOMMU PCIe ATS */ +/* Fields on dword0 */ +#define RISCV_IOMMU_CMD_ATS_OPCODE 4 +#define RISCV_IOMMU_CMD_ATS_FUNC_INVAL 0 +#define RISCV_IOMMU_CMD_ATS_FUNC_PRGR 1 +#define RISCV_IOMMU_CMD_ATS_PID GENMASK_ULL(31, 12) +#define RISCV_IOMMU_CMD_ATS_PV BIT_ULL(32) +#define RISCV_IOMMU_CMD_ATS_DSV BIT_ULL(33) +#define RISCV_IOMMU_CMD_ATS_RID GENMASK_ULL(55, 40) +#define RISCV_IOMMU_CMD_ATS_DSEG GENMASK_ULL(63, 56) +/* dword1 is the ATS payload, two different payload types for INVAL and PRGR */ + +/* ATS.INVAL payload */ +#define RISCV_IOMMU_CMD_ATS_INVAL_G BIT_ULL(0) +/* Bits 1 - 10 are zeroed */ +#define RISCV_IOMMU_CMD_ATS_INVAL_S BIT_ULL(11) +#define RISCV_IOMMU_CMD_ATS_INVAL_UADDR GENMASK_ULL(63, 12) + +/* ATS.PRGR payload */ +/* Bits 0 - 31 are zeroed */ +#define RISCV_IOMMU_CMD_ATS_PRGR_PRG_INDEX GENMASK_ULL(40, 32) +/* Bits 41 - 43 are zeroed */ +#define RISCV_IOMMU_CMD_ATS_PRGR_RESP_CODE GENMASK_ULL(47, 44) +#define RISCV_IOMMU_CMD_ATS_PRGR_DST_ID GENMASK_ULL(63, 48) + +/** + * struct riscv_iommu_fq_record - Fault/Event Queue Record + * @hdr: Header, includes fault/event cause, PID/DID, transaction type etc. + * @_reserved: Low 32bits for custom use, high 32bits for standard use + * @iotval: Transaction-type/cause specific format + * @iotval2: Cause specific format + * + * The fault/event queue reports events and failures raised when + * processing transactions. Each record is a 32byte structure where + * the first dword has a fixed format for providing generic information + * about the fault/event, and two more dwords are there for + * fault/event-specific information. For more details see section + * 3.2.
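+ *
+ * Consumers decode the header with FIELD_GET(), as riscv_iommu_fault()
+ * in iommu.c does:
+ *
+ *	cause = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr);
+ *	devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr);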
+ */ +struct riscv_iommu_fq_record { + u64 hdr; + u64 _reserved; + u64 iotval; + u64 iotval2; +}; + +/* Fields on header */ +#define RISCV_IOMMU_FQ_HDR_CAUSE GENMASK_ULL(11, 0) +#define RISCV_IOMMU_FQ_HDR_PID GENMASK_ULL(31, 12) +#define RISCV_IOMMU_FQ_HDR_PV BIT_ULL(32) +#define RISCV_IOMMU_FQ_HDR_PRIV BIT_ULL(33) +#define RISCV_IOMMU_FQ_HDR_TTYP GENMASK_ULL(39, 34) +#define RISCV_IOMMU_FQ_HDR_DID GENMASK_ULL(63, 40) + +/** + * enum riscv_iommu_fq_causes - Fault/event cause values + * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT: Instruction access fault + * @RISCV_IOMMU_FQ_CAUSE_RD_ADDR_MISALIGNED: Read address misaligned + * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT: Read load fault + * @RISCV_IOMMU_FQ_CAUSE_WR_ADDR_MISALIGNED: Write/AMO address misaligned + * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT: Write/AMO access fault + * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT_S: Instruction page fault + * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S: Read page fault + * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S: Write/AMO page fault + * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT_VS: Instruction guest page fault + * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS: Read guest page fault + * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS: Write/AMO guest page fault + * @RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED: All inbound transactions disallowed + * @RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT: DDT entry load access fault + * @RISCV_IOMMU_FQ_CAUSE_DDT_INVALID: DDT entry invalid + * @RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED: DDT entry misconfigured + * @RISCV_IOMMU_FQ_CAUSE_TTYP_BLOCKED: Transaction type disallowed + * @RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT: MSI PTE load access fault + * @RISCV_IOMMU_FQ_CAUSE_MSI_INVALID: MSI PTE invalid + * @RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED: MSI PTE misconfigured + * @RISCV_IOMMU_FQ_CAUSE_MRIF_FAULT: MRIF access fault + * @RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT: PDT entry load access fault + * @RISCV_IOMMU_FQ_CAUSE_PDT_INVALID: PDT entry invalid + * @RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED: PDT entry misconfigured + * @RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED: DDT data corruption + * @RISCV_IOMMU_FQ_CAUSE_PDT_CORRUPTED: PDT data corruption + * @RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED: MSI page table data corruption + * @RISCV_IOMMU_FQ_CAUSE_MRIF_CORRUPTED: MRIF data corruption + * @RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR: Internal data path error + * @RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT: IOMMU MSI write access fault + * @RISCV_IOMMU_FQ_CAUSE_PT_CORRUPTED: First/second stage page table data corruption + * + * Values are from table 11 of the spec; encodings 275 - 2047 are reserved for standard + * use, and 2048 - 4095 for custom use.
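+ *
+ * The fault handler in this patch only reports these causes, see
+ * riscv_iommu_fault() below; e.g. RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED is
+ * what gets logged when an inbound transaction arrives while inbound
+ * DMA is disallowed altogether.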
+ */ +enum riscv_iommu_fq_causes { + RISCV_IOMMU_FQ_CAUSE_INST_FAULT = 1, + RISCV_IOMMU_FQ_CAUSE_RD_ADDR_MISALIGNED = 4, + RISCV_IOMMU_FQ_CAUSE_RD_FAULT = 5, + RISCV_IOMMU_FQ_CAUSE_WR_ADDR_MISALIGNED = 6, + RISCV_IOMMU_FQ_CAUSE_WR_FAULT = 7, + RISCV_IOMMU_FQ_CAUSE_INST_FAULT_S = 12, + RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S = 13, + RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S = 15, + RISCV_IOMMU_FQ_CAUSE_INST_FAULT_VS = 20, + RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS = 21, + RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS = 23, + RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED = 256, + RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT = 257, + RISCV_IOMMU_FQ_CAUSE_DDT_INVALID = 258, + RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED = 259, + RISCV_IOMMU_FQ_CAUSE_TTYP_BLOCKED = 260, + RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT = 261, + RISCV_IOMMU_FQ_CAUSE_MSI_INVALID = 262, + RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED = 263, + RISCV_IOMMU_FQ_CAUSE_MRIF_FAULT = 264, + RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT = 265, + RISCV_IOMMU_FQ_CAUSE_PDT_INVALID = 266, + RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED = 267, + RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED = 268, + RISCV_IOMMU_FQ_CAUSE_PDT_CORRUPTED = 269, + RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED = 270, + RISCV_IOMMU_FQ_CAUSE_MRIF_CORRUPTED = 271, + RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR = 272, + RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT = 273, + RISCV_IOMMU_FQ_CAUSE_PT_CORRUPTED = 274 +}; + +/** + * enum riscv_iommu_fq_ttypes - Fault/event transaction types + * @RISCV_IOMMU_FQ_TTYP_NONE: None. Fault not caused by an inbound transaction. + * @RISCV_IOMMU_FQ_TTYP_UADDR_INST_FETCH: Instruction fetch from untranslated address + * @RISCV_IOMMU_FQ_TTYP_UADDR_RD: Read from untranslated address + * @RISCV_IOMMU_FQ_TTYP_UADDR_WR: Write/AMO to untranslated address + * @RISCV_IOMMU_FQ_TTYP_TADDR_INST_FETCH: Instruction fetch from translated address + * @RISCV_IOMMU_FQ_TTYP_TADDR_RD: Read from translated address + * @RISCV_IOMMU_FQ_TTYP_TADDR_WR: Write/AMO to translated address + * @RISCV_IOMMU_FQ_TTYP_PCIE_ATS_REQ: PCIe ATS translation request + * @RISCV_IOMMU_FQ_TTYP_PCIE_MSG_REQ: PCIe message request + * + * Values are from table 12 of the spec; types 4 and 10 - 31 are reserved for standard use + * and 31 - 63 for custom use. + */ +enum riscv_iommu_fq_ttypes { + RISCV_IOMMU_FQ_TTYP_NONE = 0, + RISCV_IOMMU_FQ_TTYP_UADDR_INST_FETCH = 1, + RISCV_IOMMU_FQ_TTYP_UADDR_RD = 2, + RISCV_IOMMU_FQ_TTYP_UADDR_WR = 3, + RISCV_IOMMU_FQ_TTYP_TADDR_INST_FETCH = 5, + RISCV_IOMMU_FQ_TTYP_TADDR_RD = 6, + RISCV_IOMMU_FQ_TTYP_TADDR_WR = 7, + RISCV_IOMMU_FQ_TTYP_PCIE_ATS_REQ = 8, + RISCV_IOMMU_FQ_TTYP_PCIE_MSG_REQ = 9, +}; + +/** + * struct riscv_iommu_pq_record - PCIe Page Request record + * @hdr: Header, includes PID, DID etc. + * @payload: Holds the page address, request group and permission bits + * + * For more information on the PCIe Page Request queue, see chapter 3.3.
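+ *
+ * A minimal decoding sketch (with a hypothetical record pointer 'rec'):
+ *
+ *	iova = FIELD_GET(RISCV_IOMMU_PQ_PAYLOAD_ADDR, rec->payload) << PAGE_SHIFT;
+ *	prgi = FIELD_GET(RISCV_IOMMU_PQ_PAYLOAD_PRGI, rec->payload);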
+ */ +struct riscv_iommu_pq_record { + u64 hdr; + u64 payload; +}; + +/* Header fields */ +#define RISCV_IOMMU_PQ_HDR_PID GENMASK_ULL(31, 12) +#define RISCV_IOMMU_PQ_HDR_PV BIT_ULL(32) +#define RISCV_IOMMU_PQ_HDR_PRIV BIT_ULL(33) +#define RISCV_IOMMU_PQ_HDR_EXEC BIT_ULL(34) +#define RISCV_IOMMU_PQ_HDR_DID GENMASK_ULL(63, 40) + +/* Payload fields */ +#define RISCV_IOMMU_PQ_PAYLOAD_R BIT_ULL(0) +#define RISCV_IOMMU_PQ_PAYLOAD_W BIT_ULL(1) +#define RISCV_IOMMU_PQ_PAYLOAD_L BIT_ULL(2) +#define RISCV_IOMMU_PQ_PAYLOAD_RWL_MASK GENMASK_ULL(2, 0) +#define RISCV_IOMMU_PQ_PAYLOAD_PRGI GENMASK_ULL(11, 3) /* Page Request Group Index */ +#define RISCV_IOMMU_PQ_PAYLOAD_ADDR GENMASK_ULL(63, 12) + +/** + * struct riscv_iommu_msipte - MSI Page Table Entry + * @pte: MSI PTE + * @mrif_info: Memory-resident interrupt file info + * + * The MSI Page Table is used for virtualizing MSIs, so that when + * a device sends an MSI to a guest, the IOMMU can reroute it + * by translating the MSI address, either to a guest interrupt file + * or a memory resident interrupt file (MRIF). Note that this page table + * is an array of MSI PTEs, not a multi-level page table; each entry + * is a leaf entry. For more information, see the AIA spec, chapter 9.5. + * + * Also, in basic mode the mrif_info field is ignored by the IOMMU and can + * be used by software; any other reserved fields in the PTE must be zeroed + * out by software. + */ +struct riscv_iommu_msipte { + u64 pte; + u64 mrif_info; +}; + +/* Fields on pte */ +#define RISCV_IOMMU_MSIPTE_V BIT_ULL(0) +#define RISCV_IOMMU_MSIPTE_M GENMASK_ULL(2, 1) +#define RISCV_IOMMU_MSIPTE_MRIF_ADDR GENMASK_ULL(53, 7) /* When M == 1 (MRIF mode) */ +#define RISCV_IOMMU_MSIPTE_PPN RISCV_IOMMU_PPN_FIELD /* When M == 3 (basic mode) */ +#define RISCV_IOMMU_MSIPTE_C BIT_ULL(63) + +/* Fields on mrif_info */ +#define RISCV_IOMMU_MSIPTE_MRIF_NID GENMASK_ULL(9, 0) +#define RISCV_IOMMU_MSIPTE_MRIF_NPPN RISCV_IOMMU_PPN_FIELD +#define RISCV_IOMMU_MSIPTE_MRIF_NID_MSB BIT_ULL(60) + +/* Helper functions: command structure builders.
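+ *
+ * Typical usage, e.g. flushing all IOTLB entries for a single PSCID the
+ * way riscv_iommu_bond_unlink() in iommu.c does:
+ *
+ *	struct riscv_iommu_command cmd;
+ *
+ *	riscv_iommu_cmd_inval_vma(&cmd);
+ *	riscv_iommu_cmd_inval_set_pscid(&cmd, pscid);
+ *	riscv_iommu_cmd_send(iommu, &cmd);
+ *	riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT);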
*/ + +static inline void riscv_iommu_cmd_inval_vma(struct riscv_iommu_command *cmd) +{ + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOTINVAL_OPCODE) | + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA); + cmd->dword1 = 0; +} + +static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd, + u64 addr) +{ + cmd->dword1 = FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_ADDR, phys_to_pfn(addr)); + cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV; +} + +static inline void riscv_iommu_cmd_inval_set_pscid(struct riscv_iommu_command *cmd, + int pscid) +{ + cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_PSCID, pscid) | + RISCV_IOMMU_CMD_IOTINVAL_PSCV; +} + +static inline void riscv_iommu_cmd_inval_set_gscid(struct riscv_iommu_command *cmd, + int gscid) +{ + cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_GSCID, gscid) | + RISCV_IOMMU_CMD_IOTINVAL_GV; +} + +static inline void riscv_iommu_cmd_iofence(struct riscv_iommu_command *cmd) +{ + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) | + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) | + RISCV_IOMMU_CMD_IOFENCE_PR | RISCV_IOMMU_CMD_IOFENCE_PW; + cmd->dword1 = 0; +} + +static inline void riscv_iommu_cmd_iofence_set_av(struct riscv_iommu_command *cmd, + u64 addr, u32 data) +{ + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) | + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) | + FIELD_PREP(RISCV_IOMMU_CMD_IOFENCE_DATA, data) | + RISCV_IOMMU_CMD_IOFENCE_AV; + cmd->dword1 = addr >> 2; +} + +static inline void riscv_iommu_cmd_iodir_inval_ddt(struct riscv_iommu_command *cmd) +{ + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) | + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT); + cmd->dword1 = 0; +} + +static inline void riscv_iommu_cmd_iodir_inval_pdt(struct riscv_iommu_command *cmd) +{ + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) | + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT); + cmd->dword1 = 0; +} + +static inline void riscv_iommu_cmd_iodir_set_did(struct riscv_iommu_command *cmd, + unsigned int devid) +{ + cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_DID, devid) | + RISCV_IOMMU_CMD_IODIR_DV; +} + +static inline void riscv_iommu_cmd_iodir_set_pid(struct riscv_iommu_command *cmd, + unsigned int pasid) +{ + cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_PID, pasid); +} + +#endif /* _RISCV_IOMMU_BITS_H_ */ diff --git a/drivers/iommu/riscv/iommu-pci.c b/drivers/iommu/riscv/iommu-pci.c new file mode 100644 index 000000000000..d82d2b00904c --- /dev/null +++ b/drivers/iommu/riscv/iommu-pci.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright © 2022-2024 Rivos Inc. + * Copyright © 2023 FORTH-ICS/CARV + * + * RISCV IOMMU as a PCIe device + * + * Authors + * Tomasz Jeznach <tjeznach@rivosinc.com> + * Nick Kossifidis <mick@ics.forth.gr> + */ + +#include <linux/compiler.h> +#include <linux/init.h> +#include <linux/iommu.h> +#include <linux/kernel.h> +#include <linux/pci.h> + +#include "iommu-bits.h" +#include "iommu.h" + +/* QEMU RISC-V IOMMU implementation */ +#define PCI_DEVICE_ID_REDHAT_RISCV_IOMMU 0x0014 + +/* Rivos Inc. 
assigned PCI Vendor and Device IDs */ +#ifndef PCI_VENDOR_ID_RIVOS +#define PCI_VENDOR_ID_RIVOS 0x1efd +#endif + +#define PCI_DEVICE_ID_RIVOS_RISCV_IOMMU_GA 0x0008 + +static int riscv_iommu_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + struct device *dev = &pdev->dev; + struct riscv_iommu_device *iommu; + int rc, vec; + + rc = pcim_enable_device(pdev); + if (rc) + return rc; + + if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) + return -ENODEV; + + if (pci_resource_len(pdev, 0) < RISCV_IOMMU_REG_SIZE) + return -ENODEV; + + rc = pcim_iomap_regions(pdev, BIT(0), pci_name(pdev)); + if (rc) + return dev_err_probe(dev, rc, "pcim_iomap_regions failed\n"); + + iommu = devm_kzalloc(dev, sizeof(*iommu), GFP_KERNEL); + if (!iommu) + return -ENOMEM; + + iommu->dev = dev; + iommu->reg = pcim_iomap_table(pdev)[0]; + + pci_set_master(pdev); + dev_set_drvdata(dev, iommu); + + /* Check device reported capabilities / features. */ + iommu->caps = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAPABILITIES); + iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL); + + /* The PCI driver only uses MSIs, make sure the IOMMU supports this */ + switch (FIELD_GET(RISCV_IOMMU_CAPABILITIES_IGS, iommu->caps)) { + case RISCV_IOMMU_CAPABILITIES_IGS_MSI: + case RISCV_IOMMU_CAPABILITIES_IGS_BOTH: + break; + default: + return dev_err_probe(dev, -ENODEV, + "unable to use message-signaled interrupts\n"); + } + + /* Allocate and assign IRQ vectors for the various events */ + rc = pci_alloc_irq_vectors(pdev, 1, RISCV_IOMMU_INTR_COUNT, + PCI_IRQ_MSIX | PCI_IRQ_MSI); + if (rc <= 0) + return dev_err_probe(dev, -ENODEV, + "unable to allocate irq vectors\n"); + + iommu->irqs_count = rc; + for (vec = 0; vec < iommu->irqs_count; vec++) + iommu->irqs[vec] = msi_get_virq(dev, vec); + + /* Enable message-signaled interrupts, fctl.WSI */ + if (iommu->fctl & RISCV_IOMMU_FCTL_WSI) { + iommu->fctl ^= RISCV_IOMMU_FCTL_WSI; + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, iommu->fctl); + } + + return riscv_iommu_init(iommu); +} + +static void riscv_iommu_pci_remove(struct pci_dev *pdev) +{ + struct riscv_iommu_device *iommu = dev_get_drvdata(&pdev->dev); + + riscv_iommu_remove(iommu); +} + +static void riscv_iommu_pci_shutdown(struct pci_dev *pdev) +{ + struct riscv_iommu_device *iommu = dev_get_drvdata(&pdev->dev); + + riscv_iommu_disable(iommu); +} + +static const struct pci_device_id riscv_iommu_pci_tbl[] = { + {PCI_VDEVICE(REDHAT, PCI_DEVICE_ID_REDHAT_RISCV_IOMMU), 0}, + {PCI_VDEVICE(RIVOS, PCI_DEVICE_ID_RIVOS_RISCV_IOMMU_GA), 0}, + {0,} +}; + +static struct pci_driver riscv_iommu_pci_driver = { + .name = KBUILD_MODNAME, + .id_table = riscv_iommu_pci_tbl, + .probe = riscv_iommu_pci_probe, + .remove = riscv_iommu_pci_remove, + .shutdown = riscv_iommu_pci_shutdown, + .driver = { + .suppress_bind_attrs = true, + }, +}; + +builtin_pci_driver(riscv_iommu_pci_driver); diff --git a/drivers/iommu/riscv/iommu-platform.c b/drivers/iommu/riscv/iommu-platform.c new file mode 100644 index 000000000000..83a28c83f991 --- /dev/null +++ b/drivers/iommu/riscv/iommu-platform.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * RISC-V IOMMU as a platform device + * + * Copyright © 2023 FORTH-ICS/CARV + * Copyright © 2023-2024 Rivos Inc. 
+ * + * Authors + * Nick Kossifidis <mick@ics.forth.gr> + * Tomasz Jeznach <tjeznach@rivosinc.com> + */ + +#include <linux/acpi.h> +#include <linux/irqchip/riscv-imsic.h> +#include <linux/kernel.h> +#include <linux/msi.h> +#include <linux/of_irq.h> +#include <linux/of_platform.h> +#include <linux/platform_device.h> + +#include "iommu-bits.h" +#include "iommu.h" + +static void riscv_iommu_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg) +{ + struct device *dev = msi_desc_to_dev(desc); + struct riscv_iommu_device *iommu = dev_get_drvdata(dev); + u16 idx = desc->msi_index; + u64 addr; + + addr = ((u64)msg->address_hi << 32) | msg->address_lo; + + if (addr != (addr & RISCV_IOMMU_MSI_CFG_TBL_ADDR)) { + dev_err_once(dev, + "uh oh, the IOMMU can't send MSIs to 0x%llx, sending to 0x%llx instead\n", + addr, addr & RISCV_IOMMU_MSI_CFG_TBL_ADDR); + } + + addr &= RISCV_IOMMU_MSI_CFG_TBL_ADDR; + + riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_MSI_CFG_TBL_ADDR(idx), addr); + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_MSI_CFG_TBL_DATA(idx), msg->data); + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_MSI_CFG_TBL_CTRL(idx), 0); +} + +static int riscv_iommu_platform_probe(struct platform_device *pdev) +{ + enum riscv_iommu_igs_settings igs; + struct device *dev = &pdev->dev; + struct riscv_iommu_device *iommu = NULL; + struct irq_domain *msi_domain; + struct resource *res = NULL; + int vec, ret; + + iommu = devm_kzalloc(dev, sizeof(*iommu), GFP_KERNEL); + if (!iommu) + return -ENOMEM; + + iommu->dev = dev; + iommu->reg = devm_platform_get_and_ioremap_resource(pdev, 0, &res); + if (IS_ERR(iommu->reg)) + return dev_err_probe(dev, PTR_ERR(iommu->reg), + "could not map register region\n"); + + dev_set_drvdata(dev, iommu); + + /* Check device reported capabilities / features. 
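+ * The IGS field of CAPABILITIES drives the MSI vs. wire-signaled
+ * interrupt selection below, and FCTL.WSI is updated to match the
+ * chosen mode.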
*/ + iommu->caps = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAPABILITIES); + iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL); + + iommu->irqs_count = platform_irq_count(pdev); + if (iommu->irqs_count <= 0) + return dev_err_probe(dev, -ENODEV, + "no IRQ resources provided\n"); + if (iommu->irqs_count > RISCV_IOMMU_INTR_COUNT) + iommu->irqs_count = RISCV_IOMMU_INTR_COUNT; + + igs = FIELD_GET(RISCV_IOMMU_CAPABILITIES_IGS, iommu->caps); + switch (igs) { + case RISCV_IOMMU_CAPABILITIES_IGS_BOTH: + case RISCV_IOMMU_CAPABILITIES_IGS_MSI: + if (is_of_node(dev_fwnode(dev))) { + of_msi_configure(dev, to_of_node(dev->fwnode)); + } else { + msi_domain = irq_find_matching_fwnode(imsic_acpi_get_fwnode(dev), + DOMAIN_BUS_PLATFORM_MSI); + dev_set_msi_domain(dev, msi_domain); + } + + if (!dev_get_msi_domain(dev)) { + dev_warn(dev, "failed to find an MSI domain\n"); + goto msi_fail; + } + + ret = platform_device_msi_init_and_alloc_irqs(dev, iommu->irqs_count, + riscv_iommu_write_msi_msg); + if (ret) { + dev_warn(dev, "failed to allocate MSIs\n"); + goto msi_fail; + } + + for (vec = 0; vec < iommu->irqs_count; vec++) + iommu->irqs[vec] = msi_get_virq(dev, vec); + + /* Enable message-signaled interrupts, fctl.WSI */ + if (iommu->fctl & RISCV_IOMMU_FCTL_WSI) { + iommu->fctl ^= RISCV_IOMMU_FCTL_WSI; + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, iommu->fctl); + } + + dev_info(dev, "using MSIs\n"); + break; + +msi_fail: + if (igs != RISCV_IOMMU_CAPABILITIES_IGS_BOTH) { + return dev_err_probe(dev, -ENODEV, + "unable to use wire-signaled interrupts\n"); + } + + fallthrough; + + case RISCV_IOMMU_CAPABILITIES_IGS_WSI: + for (vec = 0; vec < iommu->irqs_count; vec++) + iommu->irqs[vec] = platform_get_irq(pdev, vec); + + /* Enable wire-signaled interrupts, fctl.WSI */ + if (!(iommu->fctl & RISCV_IOMMU_FCTL_WSI)) { + iommu->fctl |= RISCV_IOMMU_FCTL_WSI; + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, iommu->fctl); + } + dev_info(dev, "using wire-signaled interrupts\n"); + break; + default: + return dev_err_probe(dev, -ENODEV, "invalid IGS\n"); + } + + return riscv_iommu_init(iommu); +}; + +static void riscv_iommu_platform_remove(struct platform_device *pdev) +{ + struct riscv_iommu_device *iommu = dev_get_drvdata(&pdev->dev); + bool msi = !(iommu->fctl & RISCV_IOMMU_FCTL_WSI); + + riscv_iommu_remove(iommu); + + if (msi) + platform_device_msi_free_irqs_all(&pdev->dev); +}; + +static void riscv_iommu_platform_shutdown(struct platform_device *pdev) +{ + riscv_iommu_disable(dev_get_drvdata(&pdev->dev)); +}; + +static const struct of_device_id riscv_iommu_of_match[] = { + {.compatible = "riscv,iommu",}, + {}, +}; + +static const struct acpi_device_id riscv_iommu_acpi_match[] = { + { "RSCV0004", 0 }, + {} +}; +MODULE_DEVICE_TABLE(acpi, riscv_iommu_acpi_match); + +static struct platform_driver riscv_iommu_platform_driver = { + .probe = riscv_iommu_platform_probe, + .remove = riscv_iommu_platform_remove, + .shutdown = riscv_iommu_platform_shutdown, + .driver = { + .name = "riscv,iommu", + .of_match_table = riscv_iommu_of_match, + .suppress_bind_attrs = true, + .acpi_match_table = riscv_iommu_acpi_match, + }, +}; + +builtin_platform_driver(riscv_iommu_platform_driver); diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c new file mode 100644 index 000000000000..d9429097a2b5 --- /dev/null +++ b/drivers/iommu/riscv/iommu.c @@ -0,0 +1,1682 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * IOMMU API for RISC-V IOMMU implementations. + * + * Copyright © 2022-2024 Rivos Inc. 
+ * Copyright © 2023 FORTH-ICS/CARV + * + * Authors + * Tomasz Jeznach <tjeznach@rivosinc.com> + * Nick Kossifidis <mick@ics.forth.gr> + */ + +#define pr_fmt(fmt) "riscv-iommu: " fmt + +#include <linux/acpi.h> +#include <linux/acpi_rimt.h> +#include <linux/compiler.h> +#include <linux/crash_dump.h> +#include <linux/init.h> +#include <linux/iommu.h> +#include <linux/iopoll.h> +#include <linux/kernel.h> +#include <linux/pci.h> + +#include "../iommu-pages.h" +#include "iommu-bits.h" +#include "iommu.h" + +/* Timeouts in [us] */ +#define RISCV_IOMMU_QCSR_TIMEOUT 150000 +#define RISCV_IOMMU_QUEUE_TIMEOUT 150000 +#define RISCV_IOMMU_DDTP_TIMEOUT 10000000 +#define RISCV_IOMMU_IOTINVAL_TIMEOUT 90000000 + +/* Number of entries per CMD/FLT queue, should be <= INT_MAX */ +#define RISCV_IOMMU_DEF_CQ_COUNT 8192 +#define RISCV_IOMMU_DEF_FQ_COUNT 4096 + +/* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */ +#define phys_to_ppn(pa) (((pa) >> 2) & (((1ULL << 44) - 1) << 10)) +#define ppn_to_phys(pn) (((pn) << 2) & (((1ULL << 44) - 1) << 12)) + +#define dev_to_iommu(dev) \ + iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu) + +/* IOMMU PSCID allocation namespace. */ +static DEFINE_IDA(riscv_iommu_pscids); +#define RISCV_IOMMU_MAX_PSCID (BIT(20) - 1) + +/* Device resource-managed allocations */ +struct riscv_iommu_devres { + void *addr; +}; + +static void riscv_iommu_devres_pages_release(struct device *dev, void *res) +{ + struct riscv_iommu_devres *devres = res; + + iommu_free_pages(devres->addr); +} + +static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p) +{ + struct riscv_iommu_devres *devres = res; + struct riscv_iommu_devres *target = p; + + return devres->addr == target->addr; +} + +static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu, + unsigned int size) +{ + struct riscv_iommu_devres *devres; + void *addr; + + addr = iommu_alloc_pages_node_sz(dev_to_node(iommu->dev), + GFP_KERNEL_ACCOUNT, size); + if (unlikely(!addr)) + return NULL; + + devres = devres_alloc(riscv_iommu_devres_pages_release, + sizeof(struct riscv_iommu_devres), GFP_KERNEL); + + if (unlikely(!devres)) { + iommu_free_pages(addr); + return NULL; + } + + devres->addr = addr; + + devres_add(iommu->dev, devres); + + return addr; +} + +static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr) +{ + struct riscv_iommu_devres devres = { .addr = addr }; + + devres_release(iommu->dev, riscv_iommu_devres_pages_release, + riscv_iommu_devres_pages_match, &devres); +} + +/* + * Hardware queue allocation and management. + */ + +/* Setup queue base, control registers and default queue length */ +#define RISCV_IOMMU_QUEUE_INIT(q, name) do { \ + struct riscv_iommu_queue *_q = q; \ + _q->qid = RISCV_IOMMU_INTR_ ## name; \ + _q->qbr = RISCV_IOMMU_REG_ ## name ## B; \ + _q->qcr = RISCV_IOMMU_REG_ ## name ## CSR; \ + _q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\ +} while (0) + +/* Note: offsets are the same for all queues */ +#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB)) +#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB)) +#define Q_ITEM(q, index) ((q)->mask & (index)) +#define Q_IPSR(q) BIT((q)->qid) + +/* + * Discover queue ring buffer hardware configuration, allocate in-memory + * ring buffer or use fixed I/O memory location, configure queue base register. + * Must be called before hardware queue is enabled. 
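+ * Size discovery is WARL-based: writing all-ones to the LOG2SZ field and
+ * reading it back yields the largest queue size the hardware supports,
+ * and a non-zero PPN read back from the base register indicates a fixed
+ * buffer location that must be used as-is.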
+ * + * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT() + * @entry_size - queue single element size in bytes. + */ +static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu, + struct riscv_iommu_queue *queue, + size_t entry_size) +{ + unsigned int logsz; + u64 qb, rb; + + /* + * Use WARL base register property to discover maximum allowed + * number of entries and optional fixed IO address for queue location. + */ + riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD); + qb = riscv_iommu_readq(iommu, queue->qbr); + + /* + * Calculate and verify hardware supported queue length, as reported + * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1). + * Update queue size based on hardware supported value. + */ + logsz = ilog2(queue->mask); + if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb)) + logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb); + + /* + * Use WARL base register property to discover an optional fixed IO + * address for queue ring buffer location. Otherwise allocate contiguous + * system memory. + */ + if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) { + const size_t queue_size = entry_size << (logsz + 1); + + queue->phys = pfn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)); + queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size); + } else { + do { + const size_t queue_size = entry_size << (logsz + 1); + + queue->base = riscv_iommu_get_pages( + iommu, max(queue_size, SZ_4K)); + queue->phys = __pa(queue->base); + } while (!queue->base && logsz-- > 0); + } + + if (!queue->base) + return -ENOMEM; + + qb = phys_to_ppn(queue->phys) | + FIELD_PREP(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, logsz); + + /* Update base register and read back to verify hw accepted our write */ + riscv_iommu_writeq(iommu, queue->qbr, qb); + rb = riscv_iommu_readq(iommu, queue->qbr); + if (rb != qb) { + dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid); + return -ENODEV; + } + + /* Update actual queue mask */ + queue->mask = (2U << logsz) - 1; + + dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries", + queue->qid, logsz + 1); + + return 0; +} + +/* Check interrupt queue status, IPSR */ +static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data) +{ + struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; + + if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue)) + return IRQ_WAKE_THREAD; + + return IRQ_NONE; +} + +static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n) +{ + /* Reuse ICVEC.CIV mask for all interrupt vectors mapping. */ + return (iommu->icvec >> (n * 4)) & RISCV_IOMMU_ICVEC_CIV; +} + +/* + * Enable queue processing in the hardware, register interrupt handler. + * + * @queue - data structure, already allocated with riscv_iommu_queue_alloc() + * @irq_handler - threaded interrupt handler. 
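+ *
+ * A typical caller pairs each queue with its handler thread, e.g.
+ * (sketch only; the actual calls live in the IOMMU init path):
+ *
+ *	rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq,
+ *				      riscv_iommu_cmdq_process);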
+ */ +static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu, + struct riscv_iommu_queue *queue, + irq_handler_t irq_handler) +{ + const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)]; + u32 csr; + int rc; + + if (queue->iommu) + return -EBUSY; + + /* Polling not implemented */ + if (!irq) + return -ENODEV; + + queue->iommu = iommu; + rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler, + IRQF_ONESHOT | IRQF_SHARED, + dev_name(iommu->dev), queue); + if (rc) { + queue->iommu = NULL; + return rc; + } + + /* Empty queue before enabling it */ + if (queue->qid == RISCV_IOMMU_INTR_CQ) + riscv_iommu_writel(queue->iommu, Q_TAIL(queue), 0); + else + riscv_iommu_writel(queue->iommu, Q_HEAD(queue), 0); + + /* + * Enable queue with interrupts, clear any memory fault if any. + * Wait for the hardware to acknowledge request and activate queue + * processing. + * Note: All CSR bitfields are in the same offsets for all queues. + */ + riscv_iommu_writel(iommu, queue->qcr, + RISCV_IOMMU_QUEUE_ENABLE | + RISCV_IOMMU_QUEUE_INTR_ENABLE | + RISCV_IOMMU_QUEUE_MEM_FAULT); + + riscv_iommu_readl_timeout(iommu, queue->qcr, + csr, !(csr & RISCV_IOMMU_QUEUE_BUSY), + 10, RISCV_IOMMU_QCSR_TIMEOUT); + + if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE | + RISCV_IOMMU_QUEUE_BUSY | + RISCV_IOMMU_QUEUE_MEM_FAULT))) { + /* Best effort to stop and disable failing hardware queue. */ + riscv_iommu_writel(iommu, queue->qcr, 0); + free_irq(irq, queue); + queue->iommu = NULL; + dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid); + return -EBUSY; + } + + /* Clear any pending interrupt flag. */ + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); + + return 0; +} + +/* + * Disable queue. Wait for the hardware to acknowledge request and + * stop processing enqueued requests. Report errors but continue. + */ +static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue) +{ + struct riscv_iommu_device *iommu = queue->iommu; + u32 csr; + + if (!iommu) + return; + + free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue); + riscv_iommu_writel(iommu, queue->qcr, 0); + riscv_iommu_readl_timeout(iommu, queue->qcr, + csr, !(csr & RISCV_IOMMU_QUEUE_BUSY), + 10, RISCV_IOMMU_QCSR_TIMEOUT); + + if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY)) + dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n", + queue->qid, csr); + + queue->iommu = NULL; +} + +/* + * Returns number of available valid queue entries and the first item index. + * Update shadow producer index if necessary. + */ +static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue, + unsigned int *index) +{ + unsigned int head = atomic_read(&queue->head); + unsigned int tail = atomic_read(&queue->tail); + unsigned int last = Q_ITEM(queue, tail); + int available = (int)(tail - head); + + *index = head; + + if (available > 0) + return available; + + /* read hardware producer index, check reserved register bits are not set. */ + if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue), + tail, (tail & ~queue->mask) == 0, + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) { + dev_err_once(queue->iommu->dev, + "Hardware error: queue access timeout\n"); + return 0; + } + + if (tail == last) + return 0; + + /* update shadow producer index */ + return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head); +} + +/* + * Release processed queue entries, should match riscv_iommu_queue_consume() calls. 
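+ *
+ * Typical pairing, as in riscv_iommu_fltq_process():
+ *
+ *	do {
+ *		cnt = riscv_iommu_queue_consume(queue, &idx);
+ *		for (len = 0; len < cnt; idx++, len++)
+ *			riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]);
+ *		riscv_iommu_queue_release(queue, cnt);
+ *	} while (cnt > 0);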
*/ +static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count) +{ + const unsigned int head = atomic_add_return(count, &queue->head); + + riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head)); +} + +/* Return actual consumer index based on hardware reported queue head index. */ +static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue) +{ + const unsigned int cons = atomic_read(&queue->head); + const unsigned int last = Q_ITEM(queue, cons); + unsigned int head; + + if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head, + !(head & ~queue->mask), + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) + return cons; + + return cons + ((head - last) & queue->mask); +} + +/* Wait for submitted item to be processed. */ +static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue, + unsigned int index, + unsigned int timeout_us) +{ + unsigned int cons = atomic_read(&queue->head); + + /* Already processed by the consumer */ + if ((int)(cons - index) > 0) + return 0; + + /* Monitor consumer index */ + return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons, + (int)(cons - index) > 0, 0, timeout_us); +} + +/* Enqueue an entry into the queue ring buffer; callers that need completion + * wait for it separately via riscv_iommu_queue_wait(). + * + * Error handling for IOMMU hardware not responding in reasonable time + * will be added as a separate patch series along with other RAS features. + * For now, only report hardware failure and continue. + */ +static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue, + void *entry, size_t entry_size) +{ + unsigned int prod; + unsigned int head; + unsigned int tail; + unsigned long flags; + + /* Do not preempt submission flow. */ + local_irq_save(flags); + + /* 1. Allocate some space in the queue */ + prod = atomic_inc_return(&queue->prod) - 1; + head = atomic_read(&queue->head); + + /* 2. Wait for space availability. */ + if ((prod - head) > queue->mask) { + if (readx_poll_timeout(atomic_read, &queue->head, + head, (prod - head) < queue->mask, + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) + goto err_busy; + } else if ((prod - head) == queue->mask) { + const unsigned int last = Q_ITEM(queue, head); + + if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head, + !(head & ~queue->mask) && head != last, + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) + goto err_busy; + atomic_add((head - last) & queue->mask, &queue->head); + } + + /* 3. Store entry in the ring buffer */ + memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size); + + /* 4. Wait for all previous entries to be ready */ + if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail, + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) + goto err_busy; + + /* + * 5. Make sure the ring buffer update (whether in normal or I/O memory) is + * completed and visible before signaling the tail doorbell to fetch + * the next command. 'fence ow, ow' + */ + dma_wmb(); + riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1)); + + /* + * 6. Make sure the doorbell write to the device has finished before updating + * the shadow tail index in normal memory. 'fence o, w' + */ + mmiowb(); + atomic_inc(&queue->tail); + + /* 7.
Complete submission and restore local interrupts */ + local_irq_restore(flags); + + return prod; + +err_busy: + local_irq_restore(flags); + dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n"); + + return prod; +} + +/* + * IOMMU Command queue chapter 3.1 + */ + +/* Command queue interrupt handler thread function */ +static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data) +{ + const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; + unsigned int ctrl; + + /* Clear MF/CQ errors, complete error recovery to be implemented. */ + ctrl = riscv_iommu_readl(queue->iommu, queue->qcr); + if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO | + RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) { + riscv_iommu_writel(queue->iommu, queue->qcr, ctrl); + dev_warn(queue->iommu->dev, + "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n", + queue->qid, + !!(ctrl & RISCV_IOMMU_CQCSR_CQMF), + !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO), + !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL), + !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP)); + } + + /* Placeholder for command queue interrupt notifiers */ + + /* Clear command interrupt pending. */ + riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); + + return IRQ_HANDLED; +} + +/* Send command to the IOMMU command queue */ +static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu, + struct riscv_iommu_command *cmd) +{ + riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd)); +} + +/* Send IOFENCE.C command and wait for all scheduled commands to complete. */ +static void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu, + unsigned int timeout_us) +{ + struct riscv_iommu_command cmd; + unsigned int prod; + + riscv_iommu_cmd_iofence(&cmd); + prod = riscv_iommu_queue_send(&iommu->cmdq, &cmd, sizeof(cmd)); + + if (!timeout_us) + return; + + if (riscv_iommu_queue_wait(&iommu->cmdq, prod, timeout_us)) + dev_err_once(iommu->dev, + "Hardware error: command execution timeout\n"); +} + +/* + * IOMMU Fault/Event queue chapter 3.2 + */ + +static void riscv_iommu_fault(struct riscv_iommu_device *iommu, + struct riscv_iommu_fq_record *event) +{ + unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr); + unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr); + + /* Placeholder for future fault handling implementation, report only. */ + if (err) + dev_warn_ratelimited(iommu->dev, + "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n", + err, devid, event->iotval, event->iotval2); +} + +/* Fault queue interrupt handler thread function */ +static irqreturn_t riscv_iommu_fltq_process(int irq, void *data) +{ + struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; + struct riscv_iommu_device *iommu = queue->iommu; + struct riscv_iommu_fq_record *events; + unsigned int ctrl, idx; + int cnt, len; + + events = (struct riscv_iommu_fq_record *)queue->base; + + /* Clear fault interrupt pending and process all received fault events. */ + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); + + do { + cnt = riscv_iommu_queue_consume(queue, &idx); + for (len = 0; len < cnt; idx++, len++) + riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]); + riscv_iommu_queue_release(queue, cnt); + } while (cnt > 0); + + /* Clear MF/OF errors, complete error recovery to be implemented. 
*/ + ctrl = riscv_iommu_readl(iommu, queue->qcr); + if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) { + riscv_iommu_writel(iommu, queue->qcr, ctrl); + dev_warn(iommu->dev, + "Queue #%u error; memory fault:%d overflow:%d\n", + queue->qid, + !!(ctrl & RISCV_IOMMU_FQCSR_FQMF), + !!(ctrl & RISCV_IOMMU_FQCSR_FQOF)); + } + + return IRQ_HANDLED; +} + +/* Lookup and initialize device context info structure. */ +static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu, + unsigned int devid) +{ + const bool base_format = !(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT); + unsigned int depth; + unsigned long ddt, old, new; + void *ptr; + u8 ddi_bits[3] = { 0 }; + u64 *ddtp = NULL; + + /* Make sure the mode is valid */ + if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL || + iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL) + return NULL; + + /* + * Device id partitioning for base format: + * DDI[0]: bits 0 - 6 (1st level) (7 bits) + * DDI[1]: bits 7 - 15 (2nd level) (9 bits) + * DDI[2]: bits 16 - 23 (3rd level) (8 bits) + * + * For extended format: + * DDI[0]: bits 0 - 5 (1st level) (6 bits) + * DDI[1]: bits 6 - 14 (2nd level) (9 bits) + * DDI[2]: bits 15 - 23 (3rd level) (9 bits) + */ + if (base_format) { + ddi_bits[0] = 7; + ddi_bits[1] = 7 + 9; + ddi_bits[2] = 7 + 9 + 8; + } else { + ddi_bits[0] = 6; + ddi_bits[1] = 6 + 9; + ddi_bits[2] = 6 + 9 + 9; + } + + /* Make sure device id is within range */ + depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL; + if (devid >= (1 << ddi_bits[depth])) + return NULL; + + /* Get to the level of the non-leaf node that holds the device context */ + for (ddtp = iommu->ddt_root; depth-- > 0;) { + const int split = ddi_bits[depth]; + /* + * Each non-leaf node is 64bits wide and on each level + * nodes are indexed by DDI[depth]. + */ + ddtp += (devid >> split) & 0x1FF; + + /* + * Check if this node has been populated and if not + * allocate a new level and populate it. + */ + do { + ddt = READ_ONCE(*(unsigned long *)ddtp); + if (ddt & RISCV_IOMMU_DDTE_V) { + ddtp = __va(ppn_to_phys(ddt)); + break; + } + + ptr = riscv_iommu_get_pages(iommu, SZ_4K); + if (!ptr) + return NULL; + + new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V; + old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new); + + if (old == ddt) { + ddtp = (u64 *)ptr; + break; + } + + /* Race setting DDT detected, re-read and retry. */ + riscv_iommu_free_pages(iommu, ptr); + } while (1); + } + + /* + * Grab the node that matches DDI[depth], note that when using base + * format the device context is 4 * 64bits, and the extended format + * is 8 * 64bits, hence the (3 - base_format) below. + */ + ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format); + + return (struct riscv_iommu_dc *)ddtp; +} + +/* + * This is best effort IOMMU translation shutdown flow. + * Disable IOMMU without waiting for hardware response. 
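+ * Called from the PCI and platform .shutdown callbacks, see
+ * riscv_iommu_pci_shutdown() and riscv_iommu_platform_shutdown().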
+ */ +void riscv_iommu_disable(struct riscv_iommu_device *iommu) +{ + riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, + FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, + RISCV_IOMMU_DDTP_IOMMU_MODE_BARE)); + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQCSR, 0); + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FQCSR, 0); + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0); +} + +#define riscv_iommu_read_ddtp(iommu) ({ \ + u64 ddtp; \ + riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \ + !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \ + RISCV_IOMMU_DDTP_TIMEOUT); \ + ddtp; }) + +static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu) +{ + u64 ddtp; + unsigned int mode; + + ddtp = riscv_iommu_read_ddtp(iommu); + if (ddtp & RISCV_IOMMU_DDTP_BUSY) + return -EBUSY; + + /* + * It is optional for the hardware to report a fixed address for device + * directory root page when DDT.MODE is OFF or BARE. + */ + mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); + if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE || + mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) { + /* Use WARL to discover hardware fixed DDT PPN */ + riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, + FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode)); + ddtp = riscv_iommu_read_ddtp(iommu); + if (ddtp & RISCV_IOMMU_DDTP_BUSY) + return -EBUSY; + + iommu->ddt_phys = ppn_to_phys(ddtp); + if (iommu->ddt_phys) + iommu->ddt_root = devm_ioremap(iommu->dev, + iommu->ddt_phys, PAGE_SIZE); + if (iommu->ddt_root) + memset(iommu->ddt_root, 0, PAGE_SIZE); + } + + if (!iommu->ddt_root) { + iommu->ddt_root = riscv_iommu_get_pages(iommu, SZ_4K); + iommu->ddt_phys = __pa(iommu->ddt_root); + } + + if (!iommu->ddt_root) + return -ENOMEM; + + return 0; +} + +/* + * Discover supported DDT modes starting from requested value, + * configure DDTP register with accepted mode and root DDT address. + * Accepted iommu->ddt_mode is updated on success. + */ +static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu, + unsigned int ddtp_mode) +{ + struct device *dev = iommu->dev; + u64 ddtp, rq_ddtp; + unsigned int mode, rq_mode = ddtp_mode; + struct riscv_iommu_command cmd; + + ddtp = riscv_iommu_read_ddtp(iommu); + if (ddtp & RISCV_IOMMU_DDTP_BUSY) + return -EBUSY; + + /* Disallow state transition from xLVL to xLVL. */ + mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); + if (mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE && + mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF && + rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE && + rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) + return -EINVAL; + + do { + rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, rq_mode); + if (rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) + rq_ddtp |= phys_to_ppn(iommu->ddt_phys); + + riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp); + ddtp = riscv_iommu_read_ddtp(iommu); + if (ddtp & RISCV_IOMMU_DDTP_BUSY) { + dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n", + rq_mode, ddtp); + return -EBUSY; + } + + /* Verify IOMMU hardware accepts new DDTP config. */ + mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); + + if (rq_mode == mode) + break; + + /* Hardware mandatory DDTP mode has not been accepted. */ + if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) { + dev_err(dev, "DDTP update failed hw: %llx vs %llx\n", + ddtp, rq_ddtp); + return -EINVAL; + } + + /* + * Mode field is WARL, an IOMMU may support a subset of + * directory table levels in which case if we tried to set + * an unsupported number of levels we'll readback either + * a valid xLVL or off/bare. 
If we got off/bare, try again + * with a smaller xLVL. + */ + if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && + rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) { + dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode); + rq_mode--; + continue; + } + + /* + * We tried all supported modes and IOMMU hardware failed to + * accept new settings, something went very wrong since off/bare + * and at least one xLVL must be supported. + */ + dev_err(dev, "DDTP hw mode %u, failed to set %u\n", + mode, ddtp_mode); + return -EINVAL; + } while (1); + + iommu->ddt_mode = mode; + if (mode != ddtp_mode) + dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode); + + /* Invalidate device context cache */ + riscv_iommu_cmd_iodir_inval_ddt(&cmd); + riscv_iommu_cmd_send(iommu, &cmd); + + /* Invalidate address translation cache */ + riscv_iommu_cmd_inval_vma(&cmd); + riscv_iommu_cmd_send(iommu, &cmd); + + /* IOFENCE.C */ + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); + + return 0; +} + +/* This struct contains protection domain specific IOMMU driver data. */ +struct riscv_iommu_domain { + struct iommu_domain domain; + struct list_head bonds; + spinlock_t lock; /* protect bonds list updates. */ + int pscid; + bool amo_enabled; + int numa_node; + unsigned int pgd_mode; + unsigned long *pgd_root; +}; + +#define iommu_domain_to_riscv(iommu_domain) \ + container_of(iommu_domain, struct riscv_iommu_domain, domain) + +/* Private IOMMU data for managed devices, dev_iommu_priv_* */ +struct riscv_iommu_info { + struct riscv_iommu_domain *domain; +}; + +/* + * Linkage between an iommu_domain and attached devices. + * + * Protection domain requiring IOATC and DevATC translation cache invalidations, + * should be linked to attached devices using a riscv_iommu_bond structure. + * Devices should be linked to the domain before first use and unlinked after + * the translations from the referenced protection domain can no longer be used. + * Blocking and identity domains are not tracked here, as the IOMMU hardware + * does not cache negative and/or identity (BARE mode) translations, and DevATC + * is disabled for those protection domains. + * + * The device pointer and IOMMU data remain stable in the bond struct after + * _probe_device() where it's attached to the managed IOMMU, up to the + * completion of the _release_device() call. The release of the bond structure + * is synchronized with the device release. + */ +struct riscv_iommu_bond { + struct list_head list; + struct rcu_head rcu; + struct device *dev; +}; + +static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain, + struct device *dev) +{ + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_bond *bond; + struct list_head *bonds; + + bond = kzalloc(sizeof(*bond), GFP_KERNEL); + if (!bond) + return -ENOMEM; + bond->dev = dev; + + /* + * List of devices attached to the domain is arranged based on + * managed IOMMU device. + */ + + spin_lock(&domain->lock); + list_for_each(bonds, &domain->bonds) + if (dev_to_iommu(list_entry(bonds, struct riscv_iommu_bond, list)->dev) == iommu) + break; + list_add_rcu(&bond->list, bonds); + spin_unlock(&domain->lock); + + /* Synchronize with riscv_iommu_iotlb_inval() sequence. See comment below. 
*/ + smp_mb(); + + return 0; +} + +static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain, + struct device *dev) +{ + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_bond *bond, *found = NULL; + struct riscv_iommu_command cmd; + int count = 0; + + if (!domain) + return; + + spin_lock(&domain->lock); + list_for_each_entry(bond, &domain->bonds, list) { + if (found && count) + break; + else if (bond->dev == dev) + found = bond; + else if (dev_to_iommu(bond->dev) == iommu) + count++; + } + if (found) + list_del_rcu(&found->list); + spin_unlock(&domain->lock); + kfree_rcu(found, rcu); + + /* + * If this was the last bond between this domain and the IOMMU, + * invalidate all cached entries for the domain's PSCID. + */ + if (!count) { + riscv_iommu_cmd_inval_vma(&cmd); + riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid); + riscv_iommu_cmd_send(iommu, &cmd); + + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); + } +} + +/* + * Send IOTLB.INVAL for the whole address space for ranges larger than 2MB. + * This limit will be replaced with range invalidations, if supported by + * the hardware, once the RISC-V IOMMU architecture specification update + * for range invalidations is available. + */ +#define RISCV_IOMMU_IOTLB_INVAL_LIMIT (2 << 20) + +static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain, + unsigned long start, unsigned long end) +{ + struct riscv_iommu_bond *bond; + struct riscv_iommu_device *iommu, *prev; + struct riscv_iommu_command cmd; + unsigned long len = end - start + 1; + unsigned long iova; + + /* + * For each IOMMU linked with this protection domain (via bonds->dev), + * an IOTLB invalidation command will be submitted and executed. + * + * A possible race with the domain attach flow is handled by sequencing + * bond creation - riscv_iommu_bond_link(), and device directory + * update - riscv_iommu_iodir_update(). + * + * PTE Update / IOTLB Inval Device attach & directory update + * -------------------------- -------------------------- + * update page table entries add dev to the bond list + * FENCE RW,RW FENCE RW,RW + * For all IOMMUs: (can be empty) Update FSC/PSCID + * FENCE IOW,IOW FENCE IOW,IOW + * IOTLB.INVAL IODIR.INVAL + * IOFENCE.C + * + * If the bond list is not yet updated with the new device, the directory + * context will be configured with already valid page table content. If an + * IOMMU is linked to the protection domain, it will receive invalidation + * requests for updated page table entries. + */ + smp_mb(); + + rcu_read_lock(); + + prev = NULL; + list_for_each_entry_rcu(bond, &domain->bonds, list) { + iommu = dev_to_iommu(bond->dev); + + /* + * IOTLB invalidation request can be safely omitted if already sent + * to the IOMMU for the same PSCID, and with the domain->bonds list + * arranged based on the device's IOMMU, it's sufficient to check + * the last device the invalidation was sent to.
+ */ + if (iommu == prev) + continue; + + riscv_iommu_cmd_inval_vma(&cmd); + riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid); + if (len && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) { + for (iova = start; iova < end; iova += PAGE_SIZE) { + riscv_iommu_cmd_inval_set_addr(&cmd, iova); + riscv_iommu_cmd_send(iommu, &cmd); + } + } else { + riscv_iommu_cmd_send(iommu, &cmd); + } + prev = iommu; + } + + prev = NULL; + list_for_each_entry_rcu(bond, &domain->bonds, list) { + iommu = dev_to_iommu(bond->dev); + if (iommu == prev) + continue; + + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); + prev = iommu; + } + rcu_read_unlock(); +} + +#define RISCV_IOMMU_FSC_BARE 0 + +/* + * Update IODIR for the device. + * + * During the execution of riscv_iommu_probe_device(), IODIR entries are + * allocated for the device's identifiers. Device context invalidation + * becomes necessary only if one of the updated entries was previously + * marked as valid, given that invalid device context entries are not + * cached by the IOMMU hardware. + * In this implementation, updating a valid device context while the + * device is not quiesced might be disruptive, potentially causing + * interim translation faults. + */ +static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu, + struct device *dev, u64 fsc, u64 ta) +{ + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct riscv_iommu_dc *dc; + struct riscv_iommu_command cmd; + bool sync_required = false; + u64 tc; + int i; + + for (i = 0; i < fwspec->num_ids; i++) { + dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); + tc = READ_ONCE(dc->tc); + if (!(tc & RISCV_IOMMU_DC_TC_V)) + continue; + + WRITE_ONCE(dc->tc, tc & ~RISCV_IOMMU_DC_TC_V); + + /* Invalidate device context cached values */ + riscv_iommu_cmd_iodir_inval_ddt(&cmd); + riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]); + riscv_iommu_cmd_send(iommu, &cmd); + sync_required = true; + } + + if (sync_required) + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); + + /* + * For device context with DC_TC_PDTV = 0, translation attributes valid bit + * is stored as DC_TC_V bit (both sharing the same location at BIT(0)). + */ + for (i = 0; i < fwspec->num_ids; i++) { + dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); + tc = READ_ONCE(dc->tc); + tc |= ta & RISCV_IOMMU_DC_TC_V; + + WRITE_ONCE(dc->fsc, fsc); + WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID); + /* Update device context, write TC.V as the last step. */ + dma_wmb(); + WRITE_ONCE(dc->tc, tc); + + /* Invalidate device context after update */ + riscv_iommu_cmd_iodir_inval_ddt(&cmd); + riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]); + riscv_iommu_cmd_send(iommu, &cmd); + } + + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); +} + +/* + * IOVA page translation tree management. 
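+ *
+ * A sketch of the layout assumed by the walkers below (illustrative,
+ * Sv39 case; PT_SHIFT == 9 on rv64, so shift = PAGE_SHIFT + 9 * level):
+ *
+ *   level 2: IOVA bits 38..30 - 1 GiB leaf or pointer to level 1
+ *   level 1: IOVA bits 29..21 - 2 MiB leaf or pointer to level 0
+ *   level 0: IOVA bits 20..12 - 4 KiB leaf
+ *
+ * Sv48 and Sv57 stack levels 3 and 4 on top of the same scheme.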
+ */ + +static void riscv_iommu_iotlb_flush_all(struct iommu_domain *iommu_domain) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + + riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX); +} + +static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain, + struct iommu_iotlb_gather *gather) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + + riscv_iommu_iotlb_inval(domain, gather->start, gather->end); +} + +#define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t))) + +#define _io_pte_present(pte) ((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE)) +#define _io_pte_leaf(pte) ((pte) & _PAGE_LEAF) +#define _io_pte_none(pte) ((pte) == 0) +#define _io_pte_entry(pn, prot) ((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot)) + +static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain, + unsigned long pte, + struct iommu_pages_list *freelist) +{ + unsigned long *ptr; + int i; + + if (!_io_pte_present(pte) || _io_pte_leaf(pte)) + return; + + ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); + + /* Recursively free all sub page table pages */ + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = READ_ONCE(ptr[i]); + if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte) + riscv_iommu_pte_free(domain, pte, freelist); + } + + if (freelist) + iommu_pages_list_add(freelist, ptr); + else + iommu_free_pages(ptr); +} + +static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain, + unsigned long iova, size_t pgsize, + gfp_t gfp) +{ + unsigned long *ptr = domain->pgd_root; + unsigned long pte, old; + int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2; + void *addr; + + do { + const int shift = PAGE_SHIFT + PT_SHIFT * level; + + ptr += ((iova >> shift) & (PTRS_PER_PTE - 1)); + /* + * Note: returned entry might be a non-leaf if there was + * existing mapping with smaller granularity. Up to the caller + * to replace and invalidate. + */ + if (((size_t)1 << shift) == pgsize) + return ptr; +pte_retry: + pte = READ_ONCE(*ptr); + /* + * This is very likely incorrect as we should not be adding + * new mapping with smaller granularity on top + * of existing 2M/1G mapping. Fail. + */ + if (_io_pte_present(pte) && _io_pte_leaf(pte)) + return NULL; + /* + * Non-leaf entry is missing, allocate and try to add to the + * page table. This might race with other mappings, retry. 
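+		 *
+		 * Losing the race is benign (illustrative note): when the
+		 * cmpxchg_relaxed() below observes a value other than the one
+		 * read here, another mapper installed its table first; the
+		 * freshly allocated page is freed and the walk retries from
+		 * the re-read PTE.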
+ */ + if (_io_pte_none(pte)) { + addr = iommu_alloc_pages_node_sz(domain->numa_node, gfp, + SZ_4K); + if (!addr) + return NULL; + old = pte; + pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE); + if (cmpxchg_relaxed(ptr, old, pte) != old) { + iommu_free_pages(addr); + goto pte_retry; + } + } + ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); + } while (level-- > 0); + + return NULL; +} + +static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain, + unsigned long iova, size_t *pte_pgsize) +{ + unsigned long *ptr = domain->pgd_root; + unsigned long pte; + int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2; + + do { + const int shift = PAGE_SHIFT + PT_SHIFT * level; + + ptr += ((iova >> shift) & (PTRS_PER_PTE - 1)); + pte = READ_ONCE(*ptr); + if (_io_pte_present(pte) && _io_pte_leaf(pte)) { + *pte_pgsize = (size_t)1 << shift; + return ptr; + } + if (_io_pte_none(pte)) + return NULL; + ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); + } while (level-- > 0); + + return NULL; +} + +static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain, + unsigned long iova, phys_addr_t phys, + size_t pgsize, size_t pgcount, int prot, + gfp_t gfp, size_t *mapped) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + size_t size = 0; + unsigned long *ptr; + unsigned long pte, old, pte_prot; + int rc = 0; + struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist); + + if (!(prot & IOMMU_WRITE)) + pte_prot = _PAGE_BASE | _PAGE_READ; + else if (domain->amo_enabled) + pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE; + else + pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY; + + while (pgcount) { + ptr = riscv_iommu_pte_alloc(domain, iova, pgsize, gfp); + if (!ptr) { + rc = -ENOMEM; + break; + } + + old = READ_ONCE(*ptr); + pte = _io_pte_entry(phys_to_pfn(phys), pte_prot); + if (cmpxchg_relaxed(ptr, old, pte) != old) + continue; + + riscv_iommu_pte_free(domain, old, &freelist); + + size += pgsize; + iova += pgsize; + phys += pgsize; + --pgcount; + } + + *mapped = size; + + if (!iommu_pages_list_empty(&freelist)) { + /* + * In 1.0 spec version, the smallest scope we can use to + * invalidate all levels of page table (i.e. leaf and non-leaf) + * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0. + * This will be updated with hardware support for + * capability.NL (non-leaf) IOTINVAL command. + */ + riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX); + iommu_put_pages_list(&freelist); + } + + return rc; +} + +static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain, + unsigned long iova, size_t pgsize, + size_t pgcount, + struct iommu_iotlb_gather *gather) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + size_t size = pgcount << __ffs(pgsize); + unsigned long *ptr, old; + size_t unmapped = 0; + size_t pte_size; + + while (unmapped < size) { + ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size); + if (!ptr) + return unmapped; + + /* partial unmap is not allowed, fail. 
*/ + if (iova & (pte_size - 1)) + return unmapped; + + old = READ_ONCE(*ptr); + if (cmpxchg_relaxed(ptr, old, 0) != old) + continue; + + iommu_iotlb_gather_add_page(&domain->domain, gather, iova, + pte_size); + + iova += pte_size; + unmapped += pte_size; + } + + return unmapped; +} + +static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain, + dma_addr_t iova) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + size_t pte_size; + unsigned long *ptr; + + ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size); + if (!ptr) + return 0; + + return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1)); +} + +static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + const unsigned long pfn = virt_to_pfn(domain->pgd_root); + + WARN_ON(!list_empty(&domain->bonds)); + + if ((int)domain->pscid > 0) + ida_free(&riscv_iommu_pscids, domain->pscid); + + riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL); + kfree(domain); +} + +static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_mode) +{ + switch (pgd_mode) { + case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39: + return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39; + + case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48: + return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48; + + case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57: + return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57; + } + return false; +} + +static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain, + struct device *dev, + struct iommu_domain *old) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + u64 fsc, ta; + + if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode)) + return -ENODEV; + + fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) | + FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root)); + ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) | + RISCV_IOMMU_PC_TA_V; + + if (riscv_iommu_bond_link(domain, dev)) + return -ENOMEM; + + riscv_iommu_iodir_update(iommu, dev, fsc, ta); + riscv_iommu_bond_unlink(info->domain, dev); + info->domain = domain; + + return 0; +} + +static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = { + .attach_dev = riscv_iommu_attach_paging_domain, + .free = riscv_iommu_free_paging_domain, + .map_pages = riscv_iommu_map_pages, + .unmap_pages = riscv_iommu_unmap_pages, + .iova_to_phys = riscv_iommu_iova_to_phys, + .iotlb_sync = riscv_iommu_iotlb_sync, + .flush_iotlb_all = riscv_iommu_iotlb_flush_all, +}; + +static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev) +{ + struct riscv_iommu_domain *domain; + struct riscv_iommu_device *iommu; + unsigned int pgd_mode; + dma_addr_t va_mask; + int va_bits; + + iommu = dev_to_iommu(dev); + if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) { + pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57; + va_bits = 57; + } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) { + pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48; + va_bits = 48; + } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) { + pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39; + va_bits = 39; + } else { + dev_err(dev, "cannot find supported page table mode\n"); + return ERR_PTR(-ENODEV); + } + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return 
ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD_RCU(&domain->bonds); + spin_lock_init(&domain->lock); + domain->numa_node = dev_to_node(iommu->dev); + domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD); + domain->pgd_mode = pgd_mode; + domain->pgd_root = iommu_alloc_pages_node_sz(domain->numa_node, + GFP_KERNEL_ACCOUNT, SZ_4K); + if (!domain->pgd_root) { + kfree(domain); + return ERR_PTR(-ENOMEM); + } + + domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1, + RISCV_IOMMU_MAX_PSCID, GFP_KERNEL); + if (domain->pscid < 0) { + iommu_free_pages(domain->pgd_root); + kfree(domain); + return ERR_PTR(-ENOMEM); + } + + /* + * Note: RISC-V Privilege spec mandates that virtual addresses + * need to be sign-extended, so if (VA_BITS - 1) is set, all + * bits >= VA_BITS need to also be set or else we'll get a + * page fault. However the code that creates the mappings + * above us (e.g. iommu_dma_alloc_iova()) won't do that for us + * for now, so we'll end up with invalid virtual addresses + * to map. As a workaround until we get this sorted out + * limit the available virtual addresses to VA_BITS - 1. + */ + va_mask = DMA_BIT_MASK(va_bits - 1); + + domain->domain.geometry.aperture_start = 0; + domain->domain.geometry.aperture_end = va_mask; + domain->domain.geometry.force_aperture = true; + domain->domain.pgsize_bitmap = va_mask & (SZ_4K | SZ_2M | SZ_1G | SZ_512G); + + domain->domain.ops = &riscv_iommu_paging_domain_ops; + + return &domain->domain; +} + +static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain, + struct device *dev, + struct iommu_domain *old) +{ + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + + /* Make device context invalid, translation requests will fault w/ #258 */ + riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0); + riscv_iommu_bond_unlink(info->domain, dev); + info->domain = NULL; + + return 0; +} + +static struct iommu_domain riscv_iommu_blocking_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = riscv_iommu_attach_blocking_domain, + } +}; + +static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain, + struct device *dev, + struct iommu_domain *old) +{ + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + + riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V); + riscv_iommu_bond_unlink(info->domain, dev); + info->domain = NULL; + + return 0; +} + +static struct iommu_domain riscv_iommu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = riscv_iommu_attach_identity_domain, + } +}; + +static struct iommu_group *riscv_iommu_device_group(struct device *dev) +{ + if (dev_is_pci(dev)) + return pci_device_group(dev); + return generic_device_group(dev); +} + +static int riscv_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args) +{ + return iommu_fwspec_add_ids(dev, args->args, 1); +} + +static struct iommu_device *riscv_iommu_probe_device(struct device *dev) +{ + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct riscv_iommu_device *iommu; + struct riscv_iommu_info *info; + struct riscv_iommu_dc *dc; + u64 tc; + int i; + + if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids) + return ERR_PTR(-ENODEV); + + iommu = dev_get_drvdata(fwspec->iommu_fwnode->dev); + if (!iommu) + return ERR_PTR(-ENODEV); + + /* + * 
IOMMU hardware operating in fail-over BARE mode will provide + * identity translation for all connected devices anyway... + */ + if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) + return ERR_PTR(-ENODEV); + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + /* + * Allocate and pre-configure device context entries in + * the device directory. Do not mark the context valid yet. + */ + tc = 0; + if (iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD) + tc |= RISCV_IOMMU_DC_TC_SADE; + for (i = 0; i < fwspec->num_ids; i++) { + dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); + if (!dc) { + kfree(info); + return ERR_PTR(-ENODEV); + } + if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V) + dev_warn(dev, "already attached to IOMMU device directory\n"); + WRITE_ONCE(dc->tc, tc); + } + + dev_iommu_priv_set(dev, info); + + return &iommu->iommu; +} + +static void riscv_iommu_release_device(struct device *dev) +{ + struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + + kfree_rcu_mightsleep(info); +} + +static const struct iommu_ops riscv_iommu_ops = { + .of_xlate = riscv_iommu_of_xlate, + .identity_domain = &riscv_iommu_identity_domain, + .blocked_domain = &riscv_iommu_blocking_domain, + .release_domain = &riscv_iommu_blocking_domain, + .domain_alloc_paging = riscv_iommu_alloc_paging_domain, + .device_group = riscv_iommu_device_group, + .probe_device = riscv_iommu_probe_device, + .release_device = riscv_iommu_release_device, +}; + +static int riscv_iommu_init_check(struct riscv_iommu_device *iommu) +{ + u64 ddtp; + + /* + * Make sure the IOMMU is switched off or in pass-through mode during + * regular boot flow and disable translation when we boot into a kexec + * kernel and the previous kernel left them enabled. + */ + ddtp = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_DDTP); + if (ddtp & RISCV_IOMMU_DDTP_BUSY) + return -EBUSY; + + if (FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp) > + RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) { + if (!is_kdump_kernel()) + return -EBUSY; + riscv_iommu_disable(iommu); + } + + /* Configure accesses to in-memory data structures for CPU-native byte order. */ + if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) != + !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) { + if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_END)) + return -EINVAL; + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, + iommu->fctl ^ RISCV_IOMMU_FCTL_BE); + iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL); + if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) != + !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) + return -EINVAL; + } + + /* + * Distribute interrupt vectors, always use first vector for CIV. + * At least one interrupt is required. Read back and verify. 
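+	 *
+	 * For example (illustrative): with irqs_count == 2 the assignment
+	 * below yields CIV=0, FIV=1, PIV=0, PMIV=1, spreading the command,
+	 * fault and performance-monitoring queues over both vectors.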
+ */ + if (!iommu->irqs_count) + return -EINVAL; + + iommu->icvec = FIELD_PREP(RISCV_IOMMU_ICVEC_FIV, 1 % iommu->irqs_count) | + FIELD_PREP(RISCV_IOMMU_ICVEC_PIV, 2 % iommu->irqs_count) | + FIELD_PREP(RISCV_IOMMU_ICVEC_PMIV, 3 % iommu->irqs_count); + riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_ICVEC, iommu->icvec); + iommu->icvec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_ICVEC); + if (max(max(FIELD_GET(RISCV_IOMMU_ICVEC_CIV, iommu->icvec), + FIELD_GET(RISCV_IOMMU_ICVEC_FIV, iommu->icvec)), + max(FIELD_GET(RISCV_IOMMU_ICVEC_PIV, iommu->icvec), + FIELD_GET(RISCV_IOMMU_ICVEC_PMIV, iommu->icvec))) >= iommu->irqs_count) + return -EINVAL; + + return 0; +} + +void riscv_iommu_remove(struct riscv_iommu_device *iommu) +{ + iommu_device_unregister(&iommu->iommu); + iommu_device_sysfs_remove(&iommu->iommu); + riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF); + riscv_iommu_queue_disable(&iommu->cmdq); + riscv_iommu_queue_disable(&iommu->fltq); +} + +int riscv_iommu_init(struct riscv_iommu_device *iommu) +{ + int rc; + + RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ); + RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ); + + rc = riscv_iommu_init_check(iommu); + if (rc) + return dev_err_probe(iommu->dev, rc, "unexpected device state\n"); + + rc = riscv_iommu_iodir_alloc(iommu); + if (rc) + return rc; + + rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq, + sizeof(struct riscv_iommu_command)); + if (rc) + return rc; + + rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq, + sizeof(struct riscv_iommu_fq_record)); + if (rc) + return rc; + + rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process); + if (rc) + return rc; + + rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process); + if (rc) + goto err_queue_disable; + + rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX); + if (rc) + goto err_queue_disable; + + rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s", + dev_name(iommu->dev)); + if (rc) { + dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n"); + goto err_iodir_off; + } + + if (!acpi_disabled) { + rc = rimt_iommu_register(iommu->dev); + if (rc) { + dev_err_probe(iommu->dev, rc, "cannot register iommu with RIMT\n"); + goto err_remove_sysfs; + } + } + + rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev); + if (rc) { + dev_err_probe(iommu->dev, rc, "cannot register iommu interface\n"); + goto err_remove_sysfs; + } + + return 0; + +err_remove_sysfs: + iommu_device_sysfs_remove(&iommu->iommu); +err_iodir_off: + riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF); +err_queue_disable: + riscv_iommu_queue_disable(&iommu->fltq); + riscv_iommu_queue_disable(&iommu->cmdq); + return rc; +} diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h new file mode 100644 index 000000000000..46df79dd5495 --- /dev/null +++ b/drivers/iommu/riscv/iommu.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright © 2022-2024 Rivos Inc. 
+ * Copyright © 2023 FORTH-ICS/CARV + * + * Authors + * Tomasz Jeznach <tjeznach@rivosinc.com> + * Nick Kossifidis <mick@ics.forth.gr> + */ + +#ifndef _RISCV_IOMMU_H_ +#define _RISCV_IOMMU_H_ + +#include <linux/iommu.h> +#include <linux/types.h> +#include <linux/iopoll.h> + +#include "iommu-bits.h" + +struct riscv_iommu_device; + +struct riscv_iommu_queue { + atomic_t prod; /* unbounded producer allocation index */ + atomic_t head; /* unbounded shadow ring buffer consumer index */ + atomic_t tail; /* unbounded shadow ring buffer producer index */ + unsigned int mask; /* index mask, queue length - 1 */ + unsigned int irq; /* allocated interrupt number */ + struct riscv_iommu_device *iommu; /* iommu device handling the queue when active */ + void *base; /* ring buffer kernel pointer */ + dma_addr_t phys; /* ring buffer physical address */ + u16 qbr; /* base register offset, head and tail reference */ + u16 qcr; /* control and status register offset */ + u8 qid; /* queue identifier, same as RISCV_IOMMU_INTR_XX */ +}; + +struct riscv_iommu_device { + /* iommu core interface */ + struct iommu_device iommu; + + /* iommu hardware */ + struct device *dev; + + /* hardware control register space */ + void __iomem *reg; + + /* supported and enabled hardware capabilities */ + u64 caps; + u32 fctl; + + /* available interrupt numbers, MSI or WSI */ + unsigned int irqs[RISCV_IOMMU_INTR_COUNT]; + unsigned int irqs_count; + unsigned int icvec; + + /* hardware queues */ + struct riscv_iommu_queue cmdq; + struct riscv_iommu_queue fltq; + + /* device directory */ + unsigned int ddt_mode; + dma_addr_t ddt_phys; + u64 *ddt_root; +}; + +int riscv_iommu_init(struct riscv_iommu_device *iommu); +void riscv_iommu_remove(struct riscv_iommu_device *iommu); +void riscv_iommu_disable(struct riscv_iommu_device *iommu); + +#define riscv_iommu_readl(iommu, addr) \ + readl_relaxed((iommu)->reg + (addr)) + +#define riscv_iommu_readq(iommu, addr) \ + readq_relaxed((iommu)->reg + (addr)) + +#define riscv_iommu_writel(iommu, addr, val) \ + writel_relaxed((val), (iommu)->reg + (addr)) + +#define riscv_iommu_writeq(iommu, addr, val) \ + writeq_relaxed((val), (iommu)->reg + (addr)) + +#define riscv_iommu_readq_timeout(iommu, addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout(readq_relaxed, (iommu)->reg + (addr), val, cond, \ + delay_us, timeout_us) + +#define riscv_iommu_readl_timeout(iommu, addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout(readl_relaxed, (iommu)->reg + (addr), val, cond, \ + delay_us, timeout_us) + +#endif diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index 4b369419b32c..85f3667e797c 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -25,6 +25,7 @@ #include <linux/pm_runtime.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/string_choices.h> #include "iommu-pages.h" @@ -87,6 +88,7 @@ struct rk_iommu_domain { dma_addr_t dt_dma; spinlock_t iommus_lock; /* lock for iommus list */ spinlock_t dt_lock; /* lock for modifying page directory table */ + struct device *dma_dev; struct iommu_domain domain; }; @@ -122,7 +124,6 @@ struct rk_iommudata { struct rk_iommu *iommu; }; -static struct device *dma_dev; static const struct rk_iommu_ops *rk_ops; static struct iommu_domain rk_identity_domain; @@ -131,7 +132,7 @@ static inline void rk_table_flush(struct rk_iommu_domain *dom, dma_addr_t dma, { size_t size = count * sizeof(u32); /* count of u32 entry */ - dma_sync_single_for_device(dma_dev, dma, size, DMA_TO_DEVICE); 
+	dma_sync_single_for_device(dom->dma_dev, dma, size, DMA_TO_DEVICE);
 }
 
 static struct rk_iommu_domain *to_rk_domain(struct iommu_domain *dom)
@@ -611,7 +612,7 @@ static irqreturn_t rk_iommu_irq(int irq, void *dev_id)
 			dev_err(iommu->dev, "Page fault at %pad of type %s\n",
 				&iova,
-				(flags == IOMMU_FAULT_WRITE) ? "write" : "read");
+				str_write_read(flags == IOMMU_FAULT_WRITE));
 
 			log_iova(iommu, i, iova);
 
@@ -729,14 +730,15 @@ static u32 *rk_dte_get_page_table(struct rk_iommu_domain *rk_domain,
 	if (rk_dte_is_pt_valid(dte))
 		goto done;
 
-	page_table = iommu_alloc_page(GFP_ATOMIC | rk_ops->gfp_flags);
+	page_table = iommu_alloc_pages_sz(GFP_ATOMIC | rk_ops->gfp_flags,
+					  SPAGE_SIZE);
 	if (!page_table)
 		return ERR_PTR(-ENOMEM);
 
-	pt_dma = dma_map_single(dma_dev, page_table, SPAGE_SIZE, DMA_TO_DEVICE);
-	if (dma_mapping_error(dma_dev, pt_dma)) {
-		dev_err(dma_dev, "DMA mapping error while allocating page table\n");
-		iommu_free_page(page_table);
+	pt_dma = dma_map_single(rk_domain->dma_dev, page_table, SPAGE_SIZE, DMA_TO_DEVICE);
+	if (dma_mapping_error(rk_domain->dma_dev, pt_dma)) {
+		dev_err(rk_domain->dma_dev, "DMA mapping error while allocating page table\n");
+		iommu_free_pages(page_table);
 		return ERR_PTR(-ENOMEM);
 	}
 
@@ -958,7 +960,8 @@ out_disable_clocks:
 }
 
 static int rk_iommu_identity_attach(struct iommu_domain *identity_domain,
-				    struct device *dev)
+				    struct device *dev,
+				    struct iommu_domain *old)
 {
 	struct rk_iommu *iommu;
 	struct rk_iommu_domain *rk_domain;
@@ -1003,7 +1006,7 @@ static struct iommu_domain rk_identity_domain = {
 };
 
 static int rk_iommu_attach_device(struct iommu_domain *domain,
-				  struct device *dev)
+				  struct device *dev, struct iommu_domain *old)
 {
 	struct rk_iommu *iommu;
 	struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
@@ -1024,7 +1027,7 @@ static int rk_iommu_attach_device(struct iommu_domain *domain,
 	if (iommu->domain == domain)
 		return 0;
 
-	ret = rk_iommu_identity_attach(&rk_identity_domain, dev);
+	ret = rk_iommu_identity_attach(&rk_identity_domain, dev, old);
 	if (ret)
 		return ret;
 
@@ -1039,8 +1042,17 @@ static int rk_iommu_attach_device(struct iommu_domain *domain,
 		return 0;
 
 	ret = rk_iommu_enable(iommu);
-	if (ret)
-		WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev));
+	if (ret) {
+		/*
+		 * Note that rk_iommu_identity_attach() might fail before
+		 * physically attaching the dev to iommu->domain, in which case
+		 * the actual old domain for this revert should be
+		 * rk_identity_domain rather than iommu->domain. Since
+		 * rk_iommu_identity_attach() does not care about the old
+		 * domain argument for now, this is not a problem.
+		 */
+		WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev,
+						 iommu->domain));
+	}
 
 	pm_runtime_put(iommu->dev);
 
@@ -1050,9 +1062,7 @@ static int rk_iommu_attach_device(struct iommu_domain *domain,
 static struct iommu_domain *rk_iommu_domain_alloc_paging(struct device *dev)
 {
 	struct rk_iommu_domain *rk_domain;
-
-	if (!dma_dev)
-		return NULL;
+	struct rk_iommu *iommu;
 
 	rk_domain = kzalloc(sizeof(*rk_domain), GFP_KERNEL);
 	if (!rk_domain)
@@ -1063,14 +1073,17 @@ static struct iommu_domain *rk_iommu_domain_alloc_paging(struct device *dev)
 	 * Each level1 (dt) and level2 (pt) table has 1024 4-byte entries.
 	 * Allocate one 4 KiB page for each table.
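 	 * (Illustrative arithmetic: 1024 entries * 4 bytes == SPAGE_SIZE,
 	 * and 1024 DTEs * 1024 PTEs * SZ_4K spans the full 32-bit aperture
 	 * configured below.)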
*/ - rk_domain->dt = iommu_alloc_page(GFP_KERNEL | rk_ops->gfp_flags); + rk_domain->dt = iommu_alloc_pages_sz(GFP_KERNEL | rk_ops->gfp_flags, + SPAGE_SIZE); if (!rk_domain->dt) goto err_free_domain; - rk_domain->dt_dma = dma_map_single(dma_dev, rk_domain->dt, + iommu = rk_iommu_from_dev(dev); + rk_domain->dma_dev = iommu->dev; + rk_domain->dt_dma = dma_map_single(rk_domain->dma_dev, rk_domain->dt, SPAGE_SIZE, DMA_TO_DEVICE); - if (dma_mapping_error(dma_dev, rk_domain->dt_dma)) { - dev_err(dma_dev, "DMA map error for DT\n"); + if (dma_mapping_error(rk_domain->dma_dev, rk_domain->dt_dma)) { + dev_err(rk_domain->dma_dev, "DMA map error for DT\n"); goto err_free_dt; } @@ -1078,6 +1091,8 @@ static struct iommu_domain *rk_iommu_domain_alloc_paging(struct device *dev) spin_lock_init(&rk_domain->dt_lock); INIT_LIST_HEAD(&rk_domain->iommus); + rk_domain->domain.pgsize_bitmap = RK_IOMMU_PGSIZE_BITMAP; + rk_domain->domain.geometry.aperture_start = 0; rk_domain->domain.geometry.aperture_end = DMA_BIT_MASK(32); rk_domain->domain.geometry.force_aperture = true; @@ -1085,7 +1100,7 @@ static struct iommu_domain *rk_iommu_domain_alloc_paging(struct device *dev) return &rk_domain->domain; err_free_dt: - iommu_free_page(rk_domain->dt); + iommu_free_pages(rk_domain->dt); err_free_domain: kfree(rk_domain); @@ -1104,15 +1119,15 @@ static void rk_iommu_domain_free(struct iommu_domain *domain) if (rk_dte_is_pt_valid(dte)) { phys_addr_t pt_phys = rk_ops->pt_address(dte); u32 *page_table = phys_to_virt(pt_phys); - dma_unmap_single(dma_dev, pt_phys, + dma_unmap_single(rk_domain->dma_dev, pt_phys, SPAGE_SIZE, DMA_TO_DEVICE); - iommu_free_page(page_table); + iommu_free_pages(page_table); } } - dma_unmap_single(dma_dev, rk_domain->dt_dma, + dma_unmap_single(rk_domain->dma_dev, rk_domain->dt_dma, SPAGE_SIZE, DMA_TO_DEVICE); - iommu_free_page(rk_domain->dt); + iommu_free_pages(rk_domain->dt); kfree(rk_domain); } @@ -1147,14 +1162,13 @@ static int rk_iommu_of_xlate(struct device *dev, struct platform_device *iommu_dev; struct rk_iommudata *data; - data = devm_kzalloc(dma_dev, sizeof(*data), GFP_KERNEL); + iommu_dev = of_find_device_by_node(args->np); + + data = devm_kzalloc(&iommu_dev->dev, sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; - iommu_dev = of_find_device_by_node(args->np); - data->iommu = platform_get_drvdata(iommu_dev); - data->iommu->domain = &rk_identity_domain; dev_iommu_priv_set(dev, data); platform_device_put(iommu_dev); @@ -1168,7 +1182,6 @@ static const struct iommu_ops rk_iommu_ops = { .probe_device = rk_iommu_probe_device, .release_device = rk_iommu_release_device, .device_group = generic_single_device_group, - .pgsize_bitmap = RK_IOMMU_PGSIZE_BITMAP, .of_xlate = rk_iommu_of_xlate, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = rk_iommu_attach_device, @@ -1192,6 +1205,8 @@ static int rk_iommu_probe(struct platform_device *pdev) if (!iommu) return -ENOMEM; + iommu->domain = &rk_identity_domain; + platform_set_drvdata(pdev, iommu); iommu->dev = dev; iommu->num_mmu = 0; @@ -1255,22 +1270,6 @@ static int rk_iommu_probe(struct platform_device *pdev) if (err) return err; - err = iommu_device_sysfs_add(&iommu->iommu, dev, NULL, dev_name(dev)); - if (err) - goto err_unprepare_clocks; - - err = iommu_device_register(&iommu->iommu, &rk_iommu_ops, dev); - if (err) - goto err_remove_sysfs; - - /* - * Use the first registered IOMMU device for domain to use with DMA - * API, since a domain might not physically correspond to a single - * IOMMU device.. 
- */ - if (!dma_dev) - dma_dev = &pdev->dev; - pm_runtime_enable(dev); for (i = 0; i < iommu->num_irq; i++) { @@ -1289,12 +1288,19 @@ static int rk_iommu_probe(struct platform_device *pdev) dma_set_mask_and_coherent(dev, rk_ops->dma_bit_mask); + err = iommu_device_sysfs_add(&iommu->iommu, dev, NULL, dev_name(dev)); + if (err) + goto err_pm_disable; + + err = iommu_device_register(&iommu->iommu, &rk_iommu_ops, dev); + if (err) + goto err_remove_sysfs; + return 0; -err_pm_disable: - pm_runtime_disable(dev); err_remove_sysfs: iommu_device_sysfs_remove(&iommu->iommu); -err_unprepare_clocks: +err_pm_disable: + pm_runtime_disable(dev); clk_bulk_unprepare(iommu->num_clocks, iommu->clocks); return err; } diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index d8eaa7ea380b..fe679850af28 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -16,7 +16,7 @@ #include "dma-iommu.h" -static const struct iommu_ops s390_iommu_ops; +static const struct iommu_ops s390_iommu_ops, s390_iommu_rtr_ops; static struct kmem_cache *dma_region_table_cache; static struct kmem_cache *dma_page_table_cache; @@ -31,8 +31,21 @@ struct s390_domain { unsigned long *dma_table; spinlock_t list_lock; struct rcu_head rcu; + u8 origin_type; }; +static struct iommu_domain blocking_domain; + +static inline unsigned int calc_rfx(dma_addr_t ptr) +{ + return ((unsigned long)ptr >> ZPCI_RF_SHIFT) & ZPCI_INDEX_MASK; +} + +static inline unsigned int calc_rsx(dma_addr_t ptr) +{ + return ((unsigned long)ptr >> ZPCI_RS_SHIFT) & ZPCI_INDEX_MASK; +} + static inline unsigned int calc_rtx(dma_addr_t ptr) { return ((unsigned long)ptr >> ZPCI_RT_SHIFT) & ZPCI_INDEX_MASK; @@ -54,6 +67,20 @@ static inline void set_pt_pfaa(unsigned long *entry, phys_addr_t pfaa) *entry |= (pfaa & ZPCI_PTE_ADDR_MASK); } +static inline void set_rf_rso(unsigned long *entry, phys_addr_t rso) +{ + *entry &= ZPCI_RTE_FLAG_MASK; + *entry |= (rso & ZPCI_RTE_ADDR_MASK); + *entry |= ZPCI_TABLE_TYPE_RFX; +} + +static inline void set_rs_rto(unsigned long *entry, phys_addr_t rto) +{ + *entry &= ZPCI_RTE_FLAG_MASK; + *entry |= (rto & ZPCI_RTE_ADDR_MASK); + *entry |= ZPCI_TABLE_TYPE_RSX; +} + static inline void set_rt_sto(unsigned long *entry, phys_addr_t sto) { *entry &= ZPCI_RTE_FLAG_MASK; @@ -68,6 +95,22 @@ static inline void set_st_pto(unsigned long *entry, phys_addr_t pto) *entry |= ZPCI_TABLE_TYPE_SX; } +static inline void validate_rf_entry(unsigned long *entry) +{ + *entry &= ~ZPCI_TABLE_VALID_MASK; + *entry &= ~ZPCI_TABLE_OFFSET_MASK; + *entry |= ZPCI_TABLE_VALID; + *entry |= ZPCI_TABLE_LEN_RFX; +} + +static inline void validate_rs_entry(unsigned long *entry) +{ + *entry &= ~ZPCI_TABLE_VALID_MASK; + *entry &= ~ZPCI_TABLE_OFFSET_MASK; + *entry |= ZPCI_TABLE_VALID; + *entry |= ZPCI_TABLE_LEN_RSX; +} + static inline void validate_rt_entry(unsigned long *entry) { *entry &= ~ZPCI_TABLE_VALID_MASK; @@ -118,6 +161,22 @@ static inline int pt_entry_isvalid(unsigned long entry) return (entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID; } +static inline unsigned long *get_rf_rso(unsigned long entry) +{ + if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RFX) + return phys_to_virt(entry & ZPCI_RTE_ADDR_MASK); + else + return NULL; +} + +static inline unsigned long *get_rs_rto(unsigned long entry) +{ + if ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RSX) + return phys_to_virt(entry & ZPCI_RTE_ADDR_MASK); + else + return NULL; +} + static inline unsigned long *get_rt_sto(unsigned long entry) { if ((entry & ZPCI_TABLE_TYPE_MASK) == 
ZPCI_TABLE_TYPE_RTX)
@@ -189,18 +248,59 @@ static void dma_free_seg_table(unsigned long entry)
 	dma_free_cpu_table(sto);
 }
 
-static void dma_cleanup_tables(unsigned long *table)
+static void dma_free_rt_table(unsigned long entry)
 {
+	unsigned long *rto = get_rs_rto(entry);
 	int rtx;
 
-	if (!table)
+	for (rtx = 0; rtx < ZPCI_TABLE_ENTRIES; rtx++)
+		if (reg_entry_isvalid(rto[rtx]))
+			dma_free_seg_table(rto[rtx]);
+
+	dma_free_cpu_table(rto);
+}
+
+static void dma_free_rs_table(unsigned long entry)
+{
+	unsigned long *rso = get_rf_rso(entry);
+	int rsx;
+
+	for (rsx = 0; rsx < ZPCI_TABLE_ENTRIES; rsx++)
+		if (reg_entry_isvalid(rso[rsx]))
+			dma_free_rt_table(rso[rsx]);
+
+	dma_free_cpu_table(rso);
+}
+
+static void dma_cleanup_tables(struct s390_domain *domain)
+{
+	int rtx, rsx, rfx;
+
+	if (!domain->dma_table)
 		return;
 
-	for (rtx = 0; rtx < ZPCI_TABLE_ENTRIES; rtx++)
-		if (reg_entry_isvalid(table[rtx]))
-			dma_free_seg_table(table[rtx]);
+	switch (domain->origin_type) {
+	case ZPCI_TABLE_TYPE_RFX:
+		for (rfx = 0; rfx < ZPCI_TABLE_ENTRIES; rfx++)
+			if (reg_entry_isvalid(domain->dma_table[rfx]))
+				dma_free_rs_table(domain->dma_table[rfx]);
+		break;
+	case ZPCI_TABLE_TYPE_RSX:
+		for (rsx = 0; rsx < ZPCI_TABLE_ENTRIES; rsx++)
+			if (reg_entry_isvalid(domain->dma_table[rsx]))
+				dma_free_rt_table(domain->dma_table[rsx]);
+		break;
+	case ZPCI_TABLE_TYPE_RTX:
+		for (rtx = 0; rtx < ZPCI_TABLE_ENTRIES; rtx++)
+			if (reg_entry_isvalid(domain->dma_table[rtx]))
+				dma_free_seg_table(domain->dma_table[rtx]);
+		break;
+	default:
+		WARN_ONCE(1, "Invalid IOMMU table (%x)\n", domain->origin_type);
+		return;
+	}
 
-	dma_free_cpu_table(table);
+	dma_free_cpu_table(domain->dma_table);
 }
 
 static unsigned long *dma_alloc_page_table(gfp_t gfp)
@@ -216,6 +316,70 @@
 	return table;
 }
 
+static unsigned long *dma_walk_rs_table(unsigned long *rso,
+					dma_addr_t dma_addr, gfp_t gfp)
+{
+	unsigned int rsx = calc_rsx(dma_addr);
+	unsigned long old_rse, rse;
+	unsigned long *rsep, *rto;
+
+	rsep = &rso[rsx];
+	rse = READ_ONCE(*rsep);
+	if (reg_entry_isvalid(rse)) {
+		rto = get_rs_rto(rse);
+	} else {
+		rto = dma_alloc_cpu_table(gfp);
+		if (!rto)
+			return NULL;
+
+		set_rs_rto(&rse, virt_to_phys(rto));
+		validate_rs_entry(&rse);
+		entry_clr_protected(&rse);
+
+		old_rse = cmpxchg(rsep, ZPCI_TABLE_INVALID, rse);
+		if (old_rse != ZPCI_TABLE_INVALID) {
+			/* Someone else was faster, use theirs */
+			dma_free_cpu_table(rto);
+			rto = get_rs_rto(old_rse);
+		}
+	}
+	return rto;
+}
+
+static unsigned long *dma_walk_rf_table(unsigned long *rfo,
+					dma_addr_t dma_addr, gfp_t gfp)
+{
+	unsigned int rfx = calc_rfx(dma_addr);
+	unsigned long old_rfe, rfe;
+	unsigned long *rfep, *rso;
+
+	rfep = &rfo[rfx];
+	rfe = READ_ONCE(*rfep);
+	if (reg_entry_isvalid(rfe)) {
+		rso = get_rf_rso(rfe);
+	} else {
+		rso = dma_alloc_cpu_table(gfp);
+		if (!rso)
+			return NULL;
+
+		set_rf_rso(&rfe, virt_to_phys(rso));
+		validate_rf_entry(&rfe);
+		entry_clr_protected(&rfe);
+
+		old_rfe = cmpxchg(rfep, ZPCI_TABLE_INVALID, rfe);
+		if (old_rfe != ZPCI_TABLE_INVALID) {
+			/* Someone else was faster, use theirs */
+			dma_free_cpu_table(rso);
+			rso = get_rf_rso(old_rfe);
+		}
+	}
+
+	if (!rso)
+		return NULL;
+
+	return dma_walk_rs_table(rso, dma_addr, gfp);
+}
+
 static unsigned long *dma_get_seg_table_origin(unsigned long *rtep, gfp_t gfp)
 {
 	unsigned long old_rte, rte;
@@ -269,11 +433,31 @@ static unsigned long *dma_get_page_table_origin(unsigned long *step, gfp_t gfp)
 	return pto;
 }
 
-static unsigned long *dma_walk_cpu_trans(unsigned long
*rto, dma_addr_t dma_addr, gfp_t gfp) +static unsigned long *dma_walk_region_tables(struct s390_domain *domain, + dma_addr_t dma_addr, gfp_t gfp) { - unsigned long *sto, *pto; + switch (domain->origin_type) { + case ZPCI_TABLE_TYPE_RFX: + return dma_walk_rf_table(domain->dma_table, dma_addr, gfp); + case ZPCI_TABLE_TYPE_RSX: + return dma_walk_rs_table(domain->dma_table, dma_addr, gfp); + case ZPCI_TABLE_TYPE_RTX: + return domain->dma_table; + default: + return NULL; + } +} + +static unsigned long *dma_walk_cpu_trans(struct s390_domain *domain, + dma_addr_t dma_addr, gfp_t gfp) +{ + unsigned long *rto, *sto, *pto; unsigned int rtx, sx, px; + rto = dma_walk_region_tables(domain, dma_addr, gfp); + if (!rto) + return NULL; + rtx = calc_rtx(dma_addr); sto = dma_get_seg_table_origin(&rto[rtx], gfp); if (!sto) @@ -327,9 +511,25 @@ static bool s390_iommu_capable(struct device *dev, enum iommu_cap cap) } } +static inline u64 max_tbl_size(struct s390_domain *domain) +{ + switch (domain->origin_type) { + case ZPCI_TABLE_TYPE_RTX: + return ZPCI_TABLE_SIZE_RT - 1; + case ZPCI_TABLE_TYPE_RSX: + return ZPCI_TABLE_SIZE_RS - 1; + case ZPCI_TABLE_TYPE_RFX: + return U64_MAX; + default: + return 0; + } +} + static struct iommu_domain *s390_domain_alloc_paging(struct device *dev) { + struct zpci_dev *zdev = to_zpci_dev(dev); struct s390_domain *s390_domain; + u64 aperture_size; s390_domain = kzalloc(sizeof(*s390_domain), GFP_KERNEL); if (!s390_domain) @@ -340,9 +540,27 @@ static struct iommu_domain *s390_domain_alloc_paging(struct device *dev) kfree(s390_domain); return NULL; } + + aperture_size = min(s390_iommu_aperture, + zdev->end_dma - zdev->start_dma + 1); + if (aperture_size <= (ZPCI_TABLE_SIZE_RT - zdev->start_dma)) { + s390_domain->origin_type = ZPCI_TABLE_TYPE_RTX; + } else if (aperture_size <= (ZPCI_TABLE_SIZE_RS - zdev->start_dma) && + (zdev->dtsm & ZPCI_IOTA_DT_RS)) { + s390_domain->origin_type = ZPCI_TABLE_TYPE_RSX; + } else if (zdev->dtsm & ZPCI_IOTA_DT_RF) { + s390_domain->origin_type = ZPCI_TABLE_TYPE_RFX; + } else { + /* Assume RTX available */ + s390_domain->origin_type = ZPCI_TABLE_TYPE_RTX; + aperture_size = ZPCI_TABLE_SIZE_RT - zdev->start_dma; + } + zdev->end_dma = zdev->start_dma + aperture_size - 1; + + s390_domain->domain.pgsize_bitmap = SZ_4K; s390_domain->domain.geometry.force_aperture = true; s390_domain->domain.geometry.aperture_start = 0; - s390_domain->domain.geometry.aperture_end = ZPCI_TABLE_SIZE_RT - 1; + s390_domain->domain.geometry.aperture_end = max_tbl_size(s390_domain); spin_lock_init(&s390_domain->list_lock); INIT_LIST_HEAD_RCU(&s390_domain->devices); @@ -354,7 +572,7 @@ static void s390_iommu_rcu_free_domain(struct rcu_head *head) { struct s390_domain *s390_domain = container_of(head, struct s390_domain, rcu); - dma_cleanup_tables(s390_domain->dma_table); + dma_cleanup_tables(s390_domain); kfree(s390_domain); } @@ -369,24 +587,116 @@ static void s390_domain_free(struct iommu_domain *domain) call_rcu(&s390_domain->rcu, s390_iommu_rcu_free_domain); } -static void s390_iommu_detach_device(struct iommu_domain *domain, - struct device *dev) +static void zdev_s390_domain_update(struct zpci_dev *zdev, + struct iommu_domain *domain) +{ + unsigned long flags; + + spin_lock_irqsave(&zdev->dom_lock, flags); + zdev->s390_domain = domain; + spin_unlock_irqrestore(&zdev->dom_lock, flags); +} + +static u64 get_iota_region_flag(struct s390_domain *domain) +{ + switch (domain->origin_type) { + case ZPCI_TABLE_TYPE_RTX: + return ZPCI_IOTA_RTTO_FLAG; + case ZPCI_TABLE_TYPE_RSX: + return 
ZPCI_IOTA_RSTO_FLAG; + case ZPCI_TABLE_TYPE_RFX: + return ZPCI_IOTA_RFTO_FLAG; + default: + WARN_ONCE(1, "Invalid IOMMU table (%x)\n", domain->origin_type); + return 0; + } +} + +static bool reg_ioat_propagate_error(int cc, u8 status) +{ + /* + * If the device is in the error state the reset routine + * will register the IOAT of the newly set domain on re-enable + */ + if (cc == ZPCI_CC_ERR && status == ZPCI_PCI_ST_FUNC_NOT_AVAIL) + return false; + /* + * If the device was removed treat registration as success + * and let the subsequent error event trigger tear down. + */ + if (cc == ZPCI_CC_INVAL_HANDLE) + return false; + return cc != ZPCI_CC_OK; +} + +static int s390_iommu_domain_reg_ioat(struct zpci_dev *zdev, + struct iommu_domain *domain, u8 *status) +{ + struct s390_domain *s390_domain; + int rc = 0; + u64 iota; + + switch (domain->type) { + case IOMMU_DOMAIN_IDENTITY: + rc = zpci_register_ioat(zdev, 0, zdev->start_dma, + zdev->end_dma, 0, status); + break; + case IOMMU_DOMAIN_BLOCKED: + /* Nothing to do in this case */ + break; + default: + s390_domain = to_s390_domain(domain); + iota = virt_to_phys(s390_domain->dma_table) | + get_iota_region_flag(s390_domain); + rc = zpci_register_ioat(zdev, 0, zdev->start_dma, + zdev->end_dma, iota, status); + } + + return rc; +} + +int zpci_iommu_register_ioat(struct zpci_dev *zdev, u8 *status) +{ + unsigned long flags; + int rc; + + spin_lock_irqsave(&zdev->dom_lock, flags); + + rc = s390_iommu_domain_reg_ioat(zdev, zdev->s390_domain, status); + + spin_unlock_irqrestore(&zdev->dom_lock, flags); + + return rc; +} + +static int blocking_domain_attach_device(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) { - struct s390_domain *s390_domain = to_s390_domain(domain); struct zpci_dev *zdev = to_zpci_dev(dev); + struct s390_domain *s390_domain; unsigned long flags; - spin_lock_irqsave(&s390_domain->list_lock, flags); - list_del_rcu(&zdev->iommu_list); - spin_unlock_irqrestore(&s390_domain->list_lock, flags); + if (zdev->s390_domain->type == IOMMU_DOMAIN_BLOCKED) + return 0; + + s390_domain = to_s390_domain(zdev->s390_domain); + if (zdev->dma_table) { + spin_lock_irqsave(&s390_domain->list_lock, flags); + list_del_rcu(&zdev->iommu_list); + spin_unlock_irqrestore(&s390_domain->list_lock, flags); + } zpci_unregister_ioat(zdev, 0); - zdev->s390_domain = NULL; zdev->dma_table = NULL; + zdev_s390_domain_update(zdev, domain); + + return 0; } static int s390_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct s390_domain *s390_domain = to_s390_domain(domain); struct zpci_dev *zdev = to_zpci_dev(dev); @@ -401,20 +711,14 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, domain->geometry.aperture_end < zdev->start_dma)) return -EINVAL; - if (zdev->s390_domain) - s390_iommu_detach_device(&zdev->s390_domain->domain, dev); + blocking_domain_attach_device(&blocking_domain, dev, old); - cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(s390_domain->dma_table), &status); - /* - * If the device is undergoing error recovery the reset code - * will re-establish the new domain. 
- */ - if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL) + /* If we fail now DMA remains blocked via blocking domain */ + cc = s390_iommu_domain_reg_ioat(zdev, domain, &status); + if (reg_ioat_propagate_error(cc, status)) return -EIO; - zdev->dma_table = s390_domain->dma_table; - zdev->s390_domain = s390_domain; + zdev_s390_domain_update(zdev, domain); spin_lock_irqsave(&s390_domain->list_lock, flags); list_add_rcu(&zdev->iommu_list, &s390_domain->devices); @@ -428,6 +732,8 @@ static void s390_iommu_get_resv_regions(struct device *dev, { struct zpci_dev *zdev = to_zpci_dev(dev); struct iommu_resv_region *region; + u64 max_size, end_resv; + unsigned long flags; if (zdev->start_dma) { region = iommu_alloc_resv_region(0, zdev->start_dma, 0, @@ -437,10 +743,21 @@ static void s390_iommu_get_resv_regions(struct device *dev, list_add_tail(®ion->list, list); } - if (zdev->end_dma < ZPCI_TABLE_SIZE_RT - 1) { - region = iommu_alloc_resv_region(zdev->end_dma + 1, - ZPCI_TABLE_SIZE_RT - zdev->end_dma - 1, - 0, IOMMU_RESV_RESERVED, GFP_KERNEL); + spin_lock_irqsave(&zdev->dom_lock, flags); + if (zdev->s390_domain->type == IOMMU_DOMAIN_BLOCKED || + zdev->s390_domain->type == IOMMU_DOMAIN_IDENTITY) { + spin_unlock_irqrestore(&zdev->dom_lock, flags); + return; + } + + max_size = max_tbl_size(to_s390_domain(zdev->s390_domain)); + spin_unlock_irqrestore(&zdev->dom_lock, flags); + + if (zdev->end_dma < max_size) { + end_resv = max_size - zdev->end_dma; + region = iommu_alloc_resv_region(zdev->end_dma + 1, end_resv, + 0, IOMMU_RESV_RESERVED, + GFP_KERNEL); if (!region) return; list_add_tail(®ion->list, list); @@ -456,29 +773,17 @@ static struct iommu_device *s390_iommu_probe_device(struct device *dev) zdev = to_zpci_dev(dev); - if (zdev->start_dma > zdev->end_dma || - zdev->start_dma > ZPCI_TABLE_SIZE_RT - 1) + if (zdev->start_dma > zdev->end_dma) return ERR_PTR(-EINVAL); - if (zdev->end_dma > ZPCI_TABLE_SIZE_RT - 1) - zdev->end_dma = ZPCI_TABLE_SIZE_RT - 1; - if (zdev->tlb_refresh) dev->iommu->shadow_on_flush = 1; - return &zdev->iommu_dev; -} + /* Start with DMA blocked */ + spin_lock_init(&zdev->dom_lock); + zdev_s390_domain_update(zdev, &blocking_domain); -static void s390_iommu_release_device(struct device *dev) -{ - struct zpci_dev *zdev = to_zpci_dev(dev); - - /* - * release_device is expected to detach any domain currently attached - * to the device, but keep it attached to other devices in the group. 
- */ - if (zdev) - s390_iommu_detach_device(&zdev->s390_domain->domain, dev); + return &zdev->iommu_dev; } static int zpci_refresh_all(struct zpci_dev *zdev) @@ -560,8 +865,7 @@ static int s390_iommu_validate_trans(struct s390_domain *s390_domain, int rc; for (i = 0; i < nr_pages; i++) { - entry = dma_walk_cpu_trans(s390_domain->dma_table, dma_addr, - gfp); + entry = dma_walk_cpu_trans(s390_domain, dma_addr, gfp); if (unlikely(!entry)) { rc = -ENOMEM; goto undo_cpu_trans; @@ -576,8 +880,7 @@ static int s390_iommu_validate_trans(struct s390_domain *s390_domain, undo_cpu_trans: while (i-- > 0) { dma_addr -= PAGE_SIZE; - entry = dma_walk_cpu_trans(s390_domain->dma_table, - dma_addr, gfp); + entry = dma_walk_cpu_trans(s390_domain, dma_addr, gfp); if (!entry) break; dma_update_cpu_trans(entry, 0, ZPCI_PTE_INVALID); @@ -594,8 +897,7 @@ static int s390_iommu_invalidate_trans(struct s390_domain *s390_domain, int rc = 0; for (i = 0; i < nr_pages; i++) { - entry = dma_walk_cpu_trans(s390_domain->dma_table, dma_addr, - GFP_ATOMIC); + entry = dma_walk_cpu_trans(s390_domain, dma_addr, GFP_ATOMIC); if (unlikely(!entry)) { rc = -EINVAL; break; @@ -639,6 +941,51 @@ static int s390_iommu_map_pages(struct iommu_domain *domain, return rc; } +static unsigned long *get_rso_from_iova(struct s390_domain *domain, + dma_addr_t iova) +{ + unsigned long *rfo; + unsigned long rfe; + unsigned int rfx; + + switch (domain->origin_type) { + case ZPCI_TABLE_TYPE_RFX: + rfo = domain->dma_table; + rfx = calc_rfx(iova); + rfe = READ_ONCE(rfo[rfx]); + if (!reg_entry_isvalid(rfe)) + return NULL; + return get_rf_rso(rfe); + case ZPCI_TABLE_TYPE_RSX: + return domain->dma_table; + default: + return NULL; + } +} + +static unsigned long *get_rto_from_iova(struct s390_domain *domain, + dma_addr_t iova) +{ + unsigned long *rso; + unsigned long rse; + unsigned int rsx; + + switch (domain->origin_type) { + case ZPCI_TABLE_TYPE_RFX: + case ZPCI_TABLE_TYPE_RSX: + rso = get_rso_from_iova(domain, iova); + rsx = calc_rsx(iova); + rse = READ_ONCE(rso[rsx]); + if (!reg_entry_isvalid(rse)) + return NULL; + return get_rs_rto(rse); + case ZPCI_TABLE_TYPE_RTX: + return domain->dma_table; + default: + return NULL; + } +} + static phys_addr_t s390_iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { @@ -652,10 +999,13 @@ static phys_addr_t s390_iommu_iova_to_phys(struct iommu_domain *domain, iova > domain->geometry.aperture_end) return 0; + rto = get_rto_from_iova(s390_domain, iova); + if (!rto) + return 0; + rtx = calc_rtx(iova); sx = calc_sx(iova); px = calc_px(iova); - rto = s390_domain->dma_table; rte = READ_ONCE(rto[rtx]); if (reg_entry_isvalid(rte)) { @@ -697,14 +1047,20 @@ static size_t s390_iommu_unmap_pages(struct iommu_domain *domain, struct zpci_iommu_ctrs *zpci_get_iommu_ctrs(struct zpci_dev *zdev) { - if (!zdev || !zdev->s390_domain) + struct s390_domain *s390_domain; + + lockdep_assert_held(&zdev->dom_lock); + + if (zdev->s390_domain->type == IOMMU_DOMAIN_BLOCKED || + zdev->s390_domain->type == IOMMU_DOMAIN_IDENTITY) return NULL; - return &zdev->s390_domain->ctrs; + + s390_domain = to_s390_domain(zdev->s390_domain); + return &s390_domain->ctrs; } int zpci_init_iommu(struct zpci_dev *zdev) { - u64 aperture_size; int rc = 0; rc = iommu_device_sysfs_add(&zdev->iommu_dev, NULL, NULL, @@ -712,16 +1068,16 @@ int zpci_init_iommu(struct zpci_dev *zdev) if (rc) goto out_err; - rc = iommu_device_register(&zdev->iommu_dev, &s390_iommu_ops, NULL); + if (zdev->rtr_avail) { + rc = iommu_device_register(&zdev->iommu_dev, + 
&s390_iommu_rtr_ops, NULL); + } else { + rc = iommu_device_register(&zdev->iommu_dev, &s390_iommu_ops, + NULL); + } if (rc) goto out_sysfs; - zdev->start_dma = PAGE_ALIGN(zdev->start_dma); - aperture_size = min3(s390_iommu_aperture, - ZPCI_TABLE_SIZE_RT - zdev->start_dma, - zdev->end_dma - zdev->start_dma + 1); - zdev->end_dma = zdev->start_dma + aperture_size - 1; - return 0; out_sysfs: @@ -776,22 +1132,66 @@ static int __init s390_iommu_init(void) } subsys_initcall(s390_iommu_init); -static const struct iommu_ops s390_iommu_ops = { - .capable = s390_iommu_capable, - .domain_alloc_paging = s390_domain_alloc_paging, - .probe_device = s390_iommu_probe_device, - .release_device = s390_iommu_release_device, - .device_group = generic_device_group, - .pgsize_bitmap = SZ_4K, - .get_resv_regions = s390_iommu_get_resv_regions, - .default_domain_ops = &(const struct iommu_domain_ops) { - .attach_dev = s390_iommu_attach_device, - .map_pages = s390_iommu_map_pages, - .unmap_pages = s390_iommu_unmap_pages, - .flush_iotlb_all = s390_iommu_flush_iotlb_all, - .iotlb_sync = s390_iommu_iotlb_sync, - .iotlb_sync_map = s390_iommu_iotlb_sync_map, - .iova_to_phys = s390_iommu_iova_to_phys, - .free = s390_domain_free, +static int s390_attach_dev_identity(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) +{ + struct zpci_dev *zdev = to_zpci_dev(dev); + u8 status; + int cc; + + blocking_domain_attach_device(&blocking_domain, dev, old); + + /* If we fail now DMA remains blocked via blocking domain */ + cc = s390_iommu_domain_reg_ioat(zdev, domain, &status); + if (reg_ioat_propagate_error(cc, status)) + return -EIO; + + zdev_s390_domain_update(zdev, domain); + + return 0; +} + +static const struct iommu_domain_ops s390_identity_ops = { + .attach_dev = s390_attach_dev_identity, +}; + +static struct iommu_domain s390_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &s390_identity_ops, +}; + +static struct iommu_domain blocking_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = blocking_domain_attach_device, } }; + +#define S390_IOMMU_COMMON_OPS() \ + .blocked_domain = &blocking_domain, \ + .release_domain = &blocking_domain, \ + .capable = s390_iommu_capable, \ + .domain_alloc_paging = s390_domain_alloc_paging, \ + .probe_device = s390_iommu_probe_device, \ + .device_group = generic_device_group, \ + .get_resv_regions = s390_iommu_get_resv_regions, \ + .default_domain_ops = &(const struct iommu_domain_ops) { \ + .attach_dev = s390_iommu_attach_device, \ + .map_pages = s390_iommu_map_pages, \ + .unmap_pages = s390_iommu_unmap_pages, \ + .flush_iotlb_all = s390_iommu_flush_iotlb_all, \ + .iotlb_sync = s390_iommu_iotlb_sync, \ + .iotlb_sync_map = s390_iommu_iotlb_sync_map, \ + .iova_to_phys = s390_iommu_iova_to_phys, \ + .free = s390_domain_free, \ + } + +static const struct iommu_ops s390_iommu_ops = { + S390_IOMMU_COMMON_OPS() +}; + +static const struct iommu_ops s390_iommu_rtr_ops = { + .identity_domain = &s390_identity_domain, + S390_IOMMU_COMMON_OPS() +}; diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c index ba53571a8239..555d4505c747 100644 --- a/drivers/iommu/sprd-iommu.c +++ b/drivers/iommu/sprd-iommu.c @@ -143,6 +143,8 @@ static struct iommu_domain *sprd_iommu_domain_alloc_paging(struct device *dev) spin_lock_init(&dom->pgtlock); + dom->domain.pgsize_bitmap = SPRD_IOMMU_PAGE_SIZE; + dom->domain.geometry.aperture_start = 0; dom->domain.geometry.aperture_end = SZ_256M - 1; 
dom->domain.geometry.force_aperture = true; @@ -232,8 +234,8 @@ static void sprd_iommu_cleanup(struct sprd_iommu_domain *dom) pgt_size = sprd_iommu_pgt_size(&dom->domain); dma_free_coherent(dom->sdev->dev, pgt_size, dom->pgt_va, dom->pgt_pa); - dom->sdev = NULL; sprd_iommu_hw_en(dom->sdev, false); + dom->sdev = NULL; } static void sprd_iommu_domain_free(struct iommu_domain *domain) @@ -245,7 +247,8 @@ static void sprd_iommu_domain_free(struct iommu_domain *domain) } static int sprd_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct sprd_iommu_device *sdev = dev_iommu_priv_get(dev); struct sprd_iommu_domain *dom = to_sprd_domain(domain); @@ -410,7 +413,6 @@ static const struct iommu_ops sprd_iommu_ops = { .probe_device = sprd_iommu_probe_device, .device_group = generic_single_device_group, .of_xlate = sprd_iommu_of_xlate, - .pgsize_bitmap = SPRD_IOMMU_PAGE_SIZE, .owner = THIS_MODULE, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = sprd_iommu_attach_device, @@ -531,7 +533,7 @@ static struct platform_driver sprd_iommu_driver = { .suppress_bind_attrs = true, }, .probe = sprd_iommu_probe, - .remove_new = sprd_iommu_remove, + .remove = sprd_iommu_remove, }; module_platform_driver(sprd_iommu_driver); diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c index c519b991749d..90b26fe21817 100644 --- a/drivers/iommu/sun50i-iommu.c +++ b/drivers/iommu/sun50i-iommu.c @@ -452,6 +452,7 @@ static int sun50i_iommu_enable(struct sun50i_iommu *iommu) IOMMU_TLB_PREFETCH_MASTER_ENABLE(3) | IOMMU_TLB_PREFETCH_MASTER_ENABLE(4) | IOMMU_TLB_PREFETCH_MASTER_ENABLE(5)); + iommu_write(iommu, IOMMU_BYPASS_REG, 0); iommu_write(iommu, IOMMU_INT_ENABLE_REG, IOMMU_INT_MASK); iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(SUN50I_IOMMU_ACI_NONE), IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 0) | @@ -601,6 +602,14 @@ static int sun50i_iommu_map(struct iommu_domain *domain, unsigned long iova, u32 *page_table, *pte_addr; int ret = 0; + /* the IOMMU can only handle 32-bit addresses, both input and output */ + if ((uint64_t)paddr >> 32) { + ret = -EINVAL; + dev_warn_once(iommu->dev, + "attempt to map address beyond 4GB\n"); + goto out; + } + page_table = sun50i_dte_get_page_table(sun50i_domain, iova, gfp); if (IS_ERR(page_table)) { ret = PTR_ERR(page_table); @@ -681,12 +690,15 @@ sun50i_iommu_domain_alloc_paging(struct device *dev) if (!sun50i_domain) return NULL; - sun50i_domain->dt = iommu_alloc_pages(GFP_KERNEL, get_order(DT_SIZE)); + sun50i_domain->dt = + iommu_alloc_pages_sz(GFP_KERNEL | GFP_DMA32, DT_SIZE); if (!sun50i_domain->dt) goto err_free_domain; refcount_set(&sun50i_domain->refcnt, 1); + sun50i_domain->domain.pgsize_bitmap = SZ_4K; + sun50i_domain->domain.geometry.aperture_start = 0; sun50i_domain->domain.geometry.aperture_end = DMA_BIT_MASK(32); sun50i_domain->domain.geometry.force_aperture = true; @@ -703,7 +715,7 @@ static void sun50i_iommu_domain_free(struct iommu_domain *domain) { struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); - iommu_free_pages(sun50i_domain->dt, get_order(DT_SIZE)); + iommu_free_pages(sun50i_domain->dt); sun50i_domain->dt = NULL; kfree(sun50i_domain); @@ -759,7 +771,8 @@ static void sun50i_iommu_detach_domain(struct sun50i_iommu *iommu, } static int sun50i_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct sun50i_iommu *iommu = dev_iommu_priv_get(dev); struct 
diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c
index c519b991749d..90b26fe21817 100644
--- a/drivers/iommu/sun50i-iommu.c
+++ b/drivers/iommu/sun50i-iommu.c
@@ -452,6 +452,7 @@ static int sun50i_iommu_enable(struct sun50i_iommu *iommu)
 		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(3) |
 		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(4) |
 		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(5));
+	iommu_write(iommu, IOMMU_BYPASS_REG, 0);
 	iommu_write(iommu, IOMMU_INT_ENABLE_REG, IOMMU_INT_MASK);
 	iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(SUN50I_IOMMU_ACI_NONE),
 		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 0) |
@@ -601,6 +602,14 @@ static int sun50i_iommu_map(struct iommu_domain *domain, unsigned long iova,
 	u32 *page_table, *pte_addr;
 	int ret = 0;
 
+	/* the IOMMU can only handle 32-bit addresses, both input and output */
+	if ((uint64_t)paddr >> 32) {
+		ret = -EINVAL;
+		dev_warn_once(iommu->dev,
+			      "attempt to map address beyond 4GB\n");
+		goto out;
+	}
+
 	page_table = sun50i_dte_get_page_table(sun50i_domain, iova, gfp);
 	if (IS_ERR(page_table)) {
 		ret = PTR_ERR(page_table);
@@ -681,12 +690,15 @@ sun50i_iommu_domain_alloc_paging(struct device *dev)
 	if (!sun50i_domain)
 		return NULL;
 
-	sun50i_domain->dt = iommu_alloc_pages(GFP_KERNEL, get_order(DT_SIZE));
+	sun50i_domain->dt =
+		iommu_alloc_pages_sz(GFP_KERNEL | GFP_DMA32, DT_SIZE);
 	if (!sun50i_domain->dt)
 		goto err_free_domain;
 
 	refcount_set(&sun50i_domain->refcnt, 1);
 
+	sun50i_domain->domain.pgsize_bitmap = SZ_4K;
+
 	sun50i_domain->domain.geometry.aperture_start = 0;
 	sun50i_domain->domain.geometry.aperture_end = DMA_BIT_MASK(32);
 	sun50i_domain->domain.geometry.force_aperture = true;
@@ -703,7 +715,7 @@ static void sun50i_iommu_domain_free(struct iommu_domain *domain)
 {
 	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
 
-	iommu_free_pages(sun50i_domain->dt, get_order(DT_SIZE));
+	iommu_free_pages(sun50i_domain->dt);
 	sun50i_domain->dt = NULL;
 
 	kfree(sun50i_domain);
@@ -759,7 +771,8 @@ static void sun50i_iommu_detach_domain(struct sun50i_iommu *iommu,
 }
 
 static int sun50i_iommu_identity_attach(struct iommu_domain *identity_domain,
-					struct device *dev)
+					struct device *dev,
+					struct iommu_domain *old)
 {
 	struct sun50i_iommu *iommu = dev_iommu_priv_get(dev);
 	struct sun50i_iommu_domain *sun50i_domain;
@@ -785,7 +798,8 @@ static struct iommu_domain sun50i_iommu_identity_domain = {
 };
 
 static int sun50i_iommu_attach_device(struct iommu_domain *domain,
-				      struct device *dev)
+				      struct device *dev,
+				      struct iommu_domain *old)
 {
 	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
 	struct sun50i_iommu *iommu;
@@ -801,7 +815,7 @@ static int sun50i_iommu_attach_device(struct iommu_domain *domain,
 	if (iommu->domain == domain)
 		return 0;
 
-	sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev);
+	sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev, old);
 
 	sun50i_iommu_attach_domain(iommu, sun50i_domain);
 
@@ -827,12 +841,13 @@ static int sun50i_iommu_of_xlate(struct device *dev,
 
 	dev_iommu_priv_set(dev, platform_get_drvdata(iommu_pdev));
 
+	put_device(&iommu_pdev->dev);
+
 	return iommu_fwspec_add_ids(dev, &id, 1);
 }
 
 static const struct iommu_ops sun50i_iommu_ops = {
 	.identity_domain = &sun50i_iommu_identity_domain,
-	.pgsize_bitmap = SZ_4K,
 	.device_group = generic_single_device_group,
 	.domain_alloc_paging = sun50i_iommu_domain_alloc_paging,
 	.of_xlate = sun50i_iommu_of_xlate,
@@ -996,7 +1011,7 @@ static int sun50i_iommu_probe(struct platform_device *pdev)
 
 	iommu->pt_pool = kmem_cache_create(dev_name(&pdev->dev),
 					   PT_SIZE, PT_SIZE,
-					   SLAB_HWCACHE_ALIGN,
+					   SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA32,
 					   NULL);
 	if (!iommu->pt_pool)
 		return -ENOMEM;
@@ -1057,6 +1072,7 @@ err_free_cache:
 
 static const struct of_device_id sun50i_iommu_dt[] = {
 	{ .compatible = "allwinner,sun50i-h6-iommu", },
+	{ .compatible = "allwinner,sun50i-h616-iommu", },
 	{ /* sentinel */ },
};
 MODULE_DEVICE_TABLE(of, sun50i_iommu_dt);
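
The sun50i changes cluster around one hardware constraint: the table walker only emits 32-bit addresses, so the directory table is allocated with GFP_DMA32, the page-table kmem_cache gets SLAB_CACHE_DMA32, and mappings of physical addresses above 4 GiB are refused outright. A sketch of the slab half of that constraint, assuming a hypothetical driver with 1 KiB tables (MY_PT_SIZE and the cache name are invented):

	#include <linux/slab.h>

	#define MY_PT_SIZE 1024	/* hypothetical table size */

	/*
	 * Objects from this cache have physical addresses below 4 GiB, so
	 * they fit the hardware's 32-bit page-directory entry fields.
	 */
	static struct kmem_cache *my_create_pt_cache(void)
	{
		return kmem_cache_create("my-iommu-pt", MY_PT_SIZE, MY_PT_SIZE,
					 SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA32,
					 NULL);
	}
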
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
index f86c7ae91814..c391e7f2cde6 100644
--- a/drivers/iommu/tegra-smmu.c
+++ b/drivers/iommu/tegra-smmu.c
@@ -51,14 +51,17 @@ struct tegra_smmu {
 	struct iommu_device iommu;	/* IOMMU Core code handle */
 };
 
+struct tegra_pd;
+struct tegra_pt;
+
 struct tegra_smmu_as {
 	struct iommu_domain domain;
 	struct tegra_smmu *smmu;
 	unsigned int use_count;
 	spinlock_t lock;
 	u32 *count;
-	struct page **pts;
-	struct page *pd;
+	struct tegra_pt **pts;
+	struct tegra_pd *pd;
 	dma_addr_t pd_dma;
 	unsigned id;
 	u32 attr;
@@ -155,6 +158,14 @@ static inline u32 smmu_readl(struct tegra_smmu *smmu, unsigned long offset)
 #define SMMU_PDE_ATTR		(SMMU_PDE_READABLE | SMMU_PDE_WRITABLE | \
 				 SMMU_PDE_NONSECURE)
 
+struct tegra_pd {
+	u32 val[SMMU_NUM_PDE];
+};
+
+struct tegra_pt {
+	u32 val[SMMU_NUM_PTE];
+};
+
 static unsigned int iova_pd_index(unsigned long iova)
 {
 	return (iova >> SMMU_PDE_SHIFT) & (SMMU_NUM_PDE - 1);
@@ -284,7 +295,7 @@ static struct iommu_domain *tegra_smmu_domain_alloc_paging(struct device *dev)
 
 	as->attr = SMMU_PD_READABLE | SMMU_PD_WRITABLE | SMMU_PD_NONSECURE;
 
-	as->pd = __iommu_alloc_pages(GFP_KERNEL | __GFP_DMA, 0);
+	as->pd = iommu_alloc_pages_sz(GFP_KERNEL | __GFP_DMA, SMMU_SIZE_PD);
 	if (!as->pd) {
 		kfree(as);
 		return NULL;
@@ -292,7 +303,7 @@ static struct iommu_domain *tegra_smmu_domain_alloc_paging(struct device *dev)
 
 	as->count = kcalloc(SMMU_NUM_PDE, sizeof(u32), GFP_KERNEL);
 	if (!as->count) {
-		__iommu_free_pages(as->pd, 0);
+		iommu_free_pages(as->pd);
 		kfree(as);
 		return NULL;
 	}
@@ -300,13 +311,15 @@ static struct iommu_domain *tegra_smmu_domain_alloc_paging(struct device *dev)
 	as->pts = kcalloc(SMMU_NUM_PDE, sizeof(*as->pts), GFP_KERNEL);
 	if (!as->pts) {
 		kfree(as->count);
-		__iommu_free_pages(as->pd, 0);
+		iommu_free_pages(as->pd);
 		kfree(as);
 		return NULL;
 	}
 
 	spin_lock_init(&as->lock);
 
+	as->domain.pgsize_bitmap = SZ_4K;
+
 	/* setup aperture */
 	as->domain.geometry.aperture_start = 0;
 	as->domain.geometry.aperture_end = 0xffffffff;
@@ -417,8 +430,8 @@ static int tegra_smmu_as_prepare(struct tegra_smmu *smmu,
 		goto unlock;
 	}
 
-	as->pd_dma = dma_map_page(smmu->dev, as->pd, 0, SMMU_SIZE_PD,
-				  DMA_TO_DEVICE);
+	as->pd_dma =
+		dma_map_single(smmu->dev, as->pd, SMMU_SIZE_PD, DMA_TO_DEVICE);
 	if (dma_mapping_error(smmu->dev, as->pd_dma)) {
 		err = -ENOMEM;
 		goto unlock;
@@ -450,7 +463,7 @@ static int tegra_smmu_as_prepare(struct tegra_smmu *smmu,
 	return 0;
 
 err_unmap:
-	dma_unmap_page(smmu->dev, as->pd_dma, SMMU_SIZE_PD, DMA_TO_DEVICE);
+	dma_unmap_single(smmu->dev, as->pd_dma, SMMU_SIZE_PD, DMA_TO_DEVICE);
 unlock:
 	mutex_unlock(&smmu->lock);
 
@@ -469,7 +482,7 @@ static void tegra_smmu_as_unprepare(struct tegra_smmu *smmu,
 
 	tegra_smmu_free_asid(smmu, as->id);
 
-	dma_unmap_page(smmu->dev, as->pd_dma, SMMU_SIZE_PD, DMA_TO_DEVICE);
+	dma_unmap_single(smmu->dev, as->pd_dma, SMMU_SIZE_PD, DMA_TO_DEVICE);
 
 	as->smmu = NULL;
 
@@ -477,7 +490,7 @@ static void tegra_smmu_as_unprepare(struct tegra_smmu *smmu,
 }
 
 static int tegra_smmu_attach_dev(struct iommu_domain *domain,
-				 struct device *dev)
+				 struct device *dev, struct iommu_domain *old)
 {
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
 	struct tegra_smmu *smmu = dev_iommu_priv_get(dev);
@@ -511,9 +524,9 @@ disable:
 }
 
 static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain,
-				      struct device *dev)
+				      struct device *dev,
+				      struct iommu_domain *old)
 {
-	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
 	struct tegra_smmu_as *as;
 	struct tegra_smmu *smmu;
@@ -522,10 +535,10 @@ static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain,
 	if (!fwspec)
 		return -ENODEV;
 
-	if (domain == identity_domain || !domain)
+	if (old == identity_domain || !old)
 		return 0;
 
-	as = to_smmu_as(domain);
+	as = to_smmu_as(old);
 	smmu = as->smmu;
 	for (index = 0; index < fwspec->num_ids; index++) {
 		tegra_smmu_disable(smmu, fwspec->ids[index], as->id);
@@ -548,11 +561,11 @@ static void tegra_smmu_set_pde(struct tegra_smmu_as *as, unsigned long iova,
 {
 	unsigned int pd_index = iova_pd_index(iova);
 	struct tegra_smmu *smmu = as->smmu;
-	u32 *pd = page_address(as->pd);
+	u32 *pd = &as->pd->val[pd_index];
 	unsigned long offset = pd_index * sizeof(*pd);
 
 	/* Set the page directory entry first */
-	pd[pd_index] = value;
+	*pd = value;
 
 	/* The flush the page directory entry from caches */
 	dma_sync_single_range_for_device(smmu->dev, as->pd_dma, offset,
@@ -564,11 +577,9 @@ static void tegra_smmu_set_pde(struct tegra_smmu_as *as, unsigned long iova,
 	smmu_flush(smmu);
 }
 
-static u32 *tegra_smmu_pte_offset(struct page *pt_page, unsigned long iova)
+static u32 *tegra_smmu_pte_offset(struct tegra_pt *pt, unsigned long iova)
 {
-	u32 *pt = page_address(pt_page);
-
-	return pt + iova_pt_index(iova);
+	return &pt->val[iova_pt_index(iova)];
 }
 
 static u32 *tegra_smmu_pte_lookup(struct tegra_smmu_as *as, unsigned long iova,
@@ -576,21 +587,19 @@ static u32 *tegra_smmu_pte_lookup(struct tegra_smmu_as *as, unsigned long iova,
 {
 	unsigned int pd_index = iova_pd_index(iova);
 	struct tegra_smmu *smmu = as->smmu;
-	struct page *pt_page;
-	u32 *pd;
+	struct tegra_pt *pt;
 
-	pt_page = as->pts[pd_index];
-	if (!pt_page)
+	pt = as->pts[pd_index];
+	if (!pt)
 		return NULL;
 
-	pd = page_address(as->pd);
-	*dmap = smmu_pde_to_dma(smmu, pd[pd_index]);
+	*dmap = smmu_pde_to_dma(smmu, as->pd->val[pd_index]);
 
-	return tegra_smmu_pte_offset(pt_page, iova);
+	return tegra_smmu_pte_offset(pt, iova);
 }
 
 static u32 *as_get_pte(struct tegra_smmu_as *as, dma_addr_t iova,
-		       dma_addr_t *dmap, struct page *page)
+		       dma_addr_t *dmap, struct tegra_pt *pt)
 {
 	unsigned int pde = iova_pd_index(iova);
 	struct tegra_smmu *smmu = as->smmu;
@@ -598,30 +607,28 @@ static u32 *as_get_pte(struct tegra_smmu_as *as, dma_addr_t iova,
 	if (!as->pts[pde]) {
 		dma_addr_t dma;
 
-		dma = dma_map_page(smmu->dev, page, 0, SMMU_SIZE_PT,
-				   DMA_TO_DEVICE);
+		dma = dma_map_single(smmu->dev, pt, SMMU_SIZE_PT,
+				     DMA_TO_DEVICE);
 		if (dma_mapping_error(smmu->dev, dma)) {
-			__iommu_free_pages(page, 0);
+			iommu_free_pages(pt);
 			return NULL;
 		}
 
 		if (!smmu_dma_addr_valid(smmu, dma)) {
-			dma_unmap_page(smmu->dev, dma, SMMU_SIZE_PT,
-				       DMA_TO_DEVICE);
-			__iommu_free_pages(page, 0);
+			dma_unmap_single(smmu->dev, dma, SMMU_SIZE_PT,
+					 DMA_TO_DEVICE);
+			iommu_free_pages(pt);
 			return NULL;
 		}
 
-		as->pts[pde] = page;
+		as->pts[pde] = pt;
 
 		tegra_smmu_set_pde(as, iova, SMMU_MK_PDE(dma, SMMU_PDE_ATTR |
 							      SMMU_PDE_NEXT));
 
 		*dmap = dma;
 	} else {
-		u32 *pd = page_address(as->pd);
-
-		*dmap = smmu_pde_to_dma(smmu, pd[pde]);
+		*dmap = smmu_pde_to_dma(smmu, as->pd->val[pde]);
 	}
 
 	return tegra_smmu_pte_offset(as->pts[pde], iova);
@@ -637,7 +644,7 @@ static void tegra_smmu_pte_get_use(struct tegra_smmu_as *as, unsigned long iova)
 static void tegra_smmu_pte_put_use(struct tegra_smmu_as *as, unsigned long iova)
 {
 	unsigned int pde = iova_pd_index(iova);
-	struct page *page = as->pts[pde];
+	struct tegra_pt *pt = as->pts[pde];
 
 	/*
 	 * When no entries in this page table are used anymore, return the
@@ -645,13 +652,13 @@ static void tegra_smmu_pte_put_use(struct tegra_smmu_as *as, unsigned long iova)
 	 */
 	if (--as->count[pde] == 0) {
 		struct tegra_smmu *smmu = as->smmu;
-		u32 *pd = page_address(as->pd);
-		dma_addr_t pte_dma = smmu_pde_to_dma(smmu, pd[pde]);
+		dma_addr_t pte_dma = smmu_pde_to_dma(smmu, as->pd->val[pde]);
 
 		tegra_smmu_set_pde(as, iova, 0);
 
-		dma_unmap_page(smmu->dev, pte_dma, SMMU_SIZE_PT, DMA_TO_DEVICE);
-		__iommu_free_pages(page, 0);
+		dma_unmap_single(smmu->dev, pte_dma, SMMU_SIZE_PT,
+				 DMA_TO_DEVICE);
+		iommu_free_pages(pt);
 		as->pts[pde] = NULL;
 	}
 }
@@ -671,16 +678,16 @@ static void tegra_smmu_set_pte(struct tegra_smmu_as *as, unsigned long iova,
 	smmu_flush(smmu);
 }
 
-static struct page *as_get_pde_page(struct tegra_smmu_as *as,
-				    unsigned long iova, gfp_t gfp,
-				    unsigned long *flags)
+static struct tegra_pt *as_get_pde_page(struct tegra_smmu_as *as,
+					unsigned long iova, gfp_t gfp,
+					unsigned long *flags)
 {
 	unsigned int pde = iova_pd_index(iova);
-	struct page *page = as->pts[pde];
+	struct tegra_pt *pt = as->pts[pde];
 
 	/* at first check whether allocation needs to be done at all */
-	if (page)
-		return page;
+	if (pt)
+		return pt;
 
 	/*
 	 * In order to prevent exhaustion of the atomic memory pool, we
@@ -690,7 +697,7 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as,
 	if (gfpflags_allow_blocking(gfp))
 		spin_unlock_irqrestore(&as->lock, *flags);
 
-	page = __iommu_alloc_pages(gfp | __GFP_DMA, 0);
+	pt = iommu_alloc_pages_sz(gfp | __GFP_DMA, SMMU_SIZE_PT);
 
 	if (gfpflags_allow_blocking(gfp))
 		spin_lock_irqsave(&as->lock, *flags);
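
The signature change running through every driver in this diff is visible in the tegra hunks above: attach_dev now receives the previously attached domain as a third argument, so tegra_smmu_identity_attach() reads `old` instead of calling iommu_get_domain_for_dev() on itself. A sketch of how an identity attach consumes that argument (my_* names and helpers are hypothetical, not this driver's API):

	static int my_identity_attach(struct iommu_domain *identity_domain,
				      struct device *dev,
				      struct iommu_domain *old)
	{
		/*
		 * The core hands in whatever was attached before; nothing
		 * to tear down on first attach or if already in identity.
		 */
		if (!old || old == identity_domain)
			return 0;

		my_hw_detach(dev, to_my_domain(old));	/* hypothetical */
		return 0;
	}
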
@@ -701,13 +708,13 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as,
 	 * if allocation succeeded and the allocation failure isn't fatal.
 	 */
 	if (as->pts[pde]) {
-		if (page)
-			__iommu_free_pages(page, 0);
+		if (pt)
+			iommu_free_pages(pt);
 
-		page = as->pts[pde];
+		pt = as->pts[pde];
 	}
 
-	return page;
+	return pt;
 }
 
 static int
@@ -717,15 +724,15 @@ __tegra_smmu_map(struct iommu_domain *domain, unsigned long iova,
 {
 	struct tegra_smmu_as *as = to_smmu_as(domain);
 	dma_addr_t pte_dma;
-	struct page *page;
+	struct tegra_pt *pt;
 	u32 pte_attrs;
 	u32 *pte;
 
-	page = as_get_pde_page(as, iova, gfp, flags);
-	if (!page)
+	pt = as_get_pde_page(as, iova, gfp, flags);
+	if (!pt)
 		return -ENOMEM;
 
-	pte = as_get_pte(as, iova, &pte_dma, page);
+	pte = as_get_pte(as, iova, &pte_dma, pt);
 	if (!pte)
 		return -ENOMEM;
 
@@ -823,10 +830,9 @@ static struct tegra_smmu *tegra_smmu_find(struct device_node *np)
 		return NULL;
 
 	mc = platform_get_drvdata(pdev);
-	if (!mc) {
-		put_device(&pdev->dev);
+	put_device(&pdev->dev);
+	if (!mc)
 		return NULL;
-	}
 
 	return mc->smmu;
 }
@@ -837,7 +843,7 @@ static int tegra_smmu_configure(struct tegra_smmu *smmu, struct device *dev,
 	const struct iommu_ops *ops = smmu->iommu.ops;
 	int err;
 
-	err = iommu_fwspec_init(dev, &dev->of_node->fwnode, ops);
+	err = iommu_fwspec_init(dev, dev_fwnode(smmu->dev));
 	if (err < 0) {
 		dev_err(dev, "failed to initialize fwspec: %d\n", err);
 		return err;
@@ -846,7 +852,6 @@ static int tegra_smmu_configure(struct tegra_smmu *smmu, struct device *dev,
 	err = ops->of_xlate(dev, args);
 	if (err < 0) {
 		dev_err(dev, "failed to parse SW group ID: %d\n", err);
-		iommu_fwspec_free(dev);
 		return err;
 	}
 
@@ -998,7 +1003,6 @@ static const struct iommu_ops tegra_smmu_ops = {
 	.probe_device = tegra_smmu_probe_device,
 	.device_group = tegra_smmu_device_group,
 	.of_xlate = tegra_smmu_of_xlate,
-	.pgsize_bitmap = SZ_4K,
 	.default_domain_ops = &(const struct iommu_domain_ops) {
 		.attach_dev = tegra_smmu_attach_dev,
 		.map_pages = tegra_smmu_map,
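
What makes the dma_map_single() conversion in the tegra diff possible is the switch from struct page bookkeeping to the typed tegra_pd/tegra_pt wrappers: a typed allocation carries a kernel virtual address and an intrinsic size, so the old (page, offset, size) triple is no longer needed. Roughly, with invented my_* names and an assumed 1024-entry directory:

	#include <linux/dma-mapping.h>

	struct my_pd {
		u32 val[1024];	/* hypothetical 4 KiB directory */
	};

	static dma_addr_t my_map_pd(struct device *dev, struct my_pd *pd)
	{
		/* sizeof(*pd) replaces the page/offset/size triple */
		return dma_map_single(dev, pd, sizeof(*pd), DMA_TO_DEVICE);
	}
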
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index 36d680826b57..d314fa5cd847 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -48,6 +48,7 @@ struct viommu_dev {
 	u64			pgsize_bitmap;
 	u32			first_domain;
 	u32			last_domain;
+	u32			identity_domain_id;
 	/* Supported MAP flags */
 	u32			map_flags;
 	u32			probe_size;
@@ -62,7 +63,6 @@ struct viommu_mapping {
 struct viommu_domain {
 	struct iommu_domain	domain;
 	struct viommu_dev	*viommu;
-	struct mutex		mutex; /* protects viommu pointer */
 	unsigned int		id;
 	u32			map_flags;
 
@@ -70,7 +70,6 @@ struct viommu_domain {
 	struct rb_root_cached	mappings;
 
 	unsigned long		nr_endpoints;
-	bool			bypass;
 };
 
 struct viommu_endpoint {
@@ -97,6 +96,8 @@ struct viommu_event {
 	};
 };
 
+static struct viommu_domain viommu_identity_domain;
+
 #define to_viommu_domain(domain)	\
 	container_of(domain, struct viommu_domain, domain)
 
@@ -305,6 +306,22 @@ out_unlock:
 	return ret;
 }
 
+static int viommu_send_attach_req(struct viommu_dev *viommu, struct device *dev,
+				  struct virtio_iommu_req_attach *req)
+{
+	int ret;
+	unsigned int i;
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+	for (i = 0; i < fwspec->num_ids; i++) {
+		req->endpoint = cpu_to_le32(fwspec->ids[i]);
+		ret = viommu_send_req_sync(viommu, req, sizeof(*req));
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
 /*
  * viommu_add_mapping - add a mapping to the internal tree
  *
@@ -637,71 +654,45 @@ static void viommu_event_handler(struct virtqueue *vq)
 
 /* IOMMU API */
 
-static struct iommu_domain *viommu_domain_alloc(unsigned type)
+static struct iommu_domain *viommu_domain_alloc_paging(struct device *dev)
 {
+	struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
+	struct viommu_dev *viommu = vdev->viommu;
+	unsigned long viommu_page_size;
 	struct viommu_domain *vdomain;
-
-	if (type != IOMMU_DOMAIN_UNMANAGED &&
-	    type != IOMMU_DOMAIN_DMA &&
-	    type != IOMMU_DOMAIN_IDENTITY)
-		return NULL;
-
-	vdomain = kzalloc(sizeof(*vdomain), GFP_KERNEL);
-	if (!vdomain)
-		return NULL;
-
-	mutex_init(&vdomain->mutex);
-	spin_lock_init(&vdomain->mappings_lock);
-	vdomain->mappings = RB_ROOT_CACHED;
-
-	return &vdomain->domain;
-}
-
-static int viommu_domain_finalise(struct viommu_endpoint *vdev,
-				  struct iommu_domain *domain)
-{
 	int ret;
-	unsigned long viommu_page_size;
-	struct viommu_dev *viommu = vdev->viommu;
-	struct viommu_domain *vdomain = to_viommu_domain(domain);
 
 	viommu_page_size = 1UL << __ffs(viommu->pgsize_bitmap);
 	if (viommu_page_size > PAGE_SIZE) {
 		dev_err(vdev->dev,
 			"granule 0x%lx larger than system page size 0x%lx\n",
 			viommu_page_size, PAGE_SIZE);
-		return -ENODEV;
+		return ERR_PTR(-ENODEV);
 	}
 
-	ret = ida_alloc_range(&viommu->domain_ids, viommu->first_domain,
-			      viommu->last_domain, GFP_KERNEL);
-	if (ret < 0)
-		return ret;
+	vdomain = kzalloc(sizeof(*vdomain), GFP_KERNEL);
+	if (!vdomain)
+		return ERR_PTR(-ENOMEM);
 
-	vdomain->id = (unsigned int)ret;
+	spin_lock_init(&vdomain->mappings_lock);
+	vdomain->mappings = RB_ROOT_CACHED;
 
-	domain->pgsize_bitmap = viommu->pgsize_bitmap;
-	domain->geometry = viommu->geometry;
+	ret = ida_alloc_range(&viommu->domain_ids, viommu->first_domain,
+			      viommu->last_domain, GFP_KERNEL);
+	if (ret < 0) {
+		kfree(vdomain);
+		return ERR_PTR(ret);
+	}
 
-	vdomain->map_flags = viommu->map_flags;
-	vdomain->viommu = viommu;
+	vdomain->id = (unsigned int)ret;
 
-	if (domain->type == IOMMU_DOMAIN_IDENTITY) {
-		if (virtio_has_feature(viommu->vdev,
-				       VIRTIO_IOMMU_F_BYPASS_CONFIG)) {
-			vdomain->bypass = true;
-			return 0;
-		}
+	vdomain->domain.pgsize_bitmap = viommu->pgsize_bitmap;
+	vdomain->domain.geometry = viommu->geometry;
 
-		ret = viommu_domain_map_identity(vdev, vdomain);
-		if (ret) {
-			ida_free(&viommu->domain_ids, vdomain->id);
-			vdomain->viommu = NULL;
-			return ret;
-		}
-	}
+	vdomain->map_flags = viommu->map_flags;
+	vdomain->viommu = viommu;
 
-	return 0;
+	return &vdomain->domain;
 }
 
 static void viommu_domain_free(struct iommu_domain *domain)
@@ -717,29 +708,38 @@
 	kfree(vdomain);
 }
 
-static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+static struct iommu_domain *viommu_domain_alloc_identity(struct device *dev)
+{
+	struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
+	struct iommu_domain *domain;
+	int ret;
+
+	if (virtio_has_feature(vdev->viommu->vdev,
+			       VIRTIO_IOMMU_F_BYPASS_CONFIG))
+		return &viommu_identity_domain.domain;
+
+	domain = viommu_domain_alloc_paging(dev);
+	if (IS_ERR(domain))
+		return domain;
+
+	ret = viommu_domain_map_identity(vdev, to_viommu_domain(domain));
+	if (ret) {
+		viommu_domain_free(domain);
+		return ERR_PTR(ret);
+	}
+	return domain;
+}
+
+static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev,
+			     struct iommu_domain *old)
 {
-	int i;
 	int ret = 0;
 	struct virtio_iommu_req_attach req;
-	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
 	struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
 	struct viommu_domain *vdomain = to_viommu_domain(domain);
 
-	mutex_lock(&vdomain->mutex);
-	if (!vdomain->viommu) {
-		/*
-		 * Properly initialize the domain now that we know which viommu
-		 * owns it.
-		 */
-		ret = viommu_domain_finalise(vdev, domain);
-	} else if (vdomain->viommu != vdev->viommu) {
-		ret = -EINVAL;
-	}
-	mutex_unlock(&vdomain->mutex);
-
-	if (ret)
-		return ret;
+	if (vdomain->viommu != vdev->viommu)
+		return -EINVAL;
 
 	/*
 	 * In the virtio-iommu device, when attaching the endpoint to a new
@@ -761,16 +761,9 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
 		.domain = cpu_to_le32(vdomain->id),
 	};
 
-	if (vdomain->bypass)
-		req.flags |= cpu_to_le32(VIRTIO_IOMMU_ATTACH_F_BYPASS);
-
-	for (i = 0; i < fwspec->num_ids; i++) {
-		req.endpoint = cpu_to_le32(fwspec->ids[i]);
-
-		ret = viommu_send_req_sync(vdomain->viommu, &req, sizeof(req));
-		if (ret)
-			return ret;
-	}
+	ret = viommu_send_attach_req(vdomain->viommu, dev, &req);
+	if (ret)
+		return ret;
 
 	if (!vdomain->nr_endpoints) {
 		/*
@@ -788,6 +781,41 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
 	return 0;
 }
 
+static int viommu_attach_identity_domain(struct iommu_domain *domain,
+					 struct device *dev,
+					 struct iommu_domain *old)
+{
+	int ret = 0;
+	struct virtio_iommu_req_attach req;
+	struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
+	struct viommu_domain *vdomain = to_viommu_domain(domain);
+
+	req = (struct virtio_iommu_req_attach) {
+		.head.type = VIRTIO_IOMMU_T_ATTACH,
+		.domain = cpu_to_le32(vdev->viommu->identity_domain_id),
+		.flags = cpu_to_le32(VIRTIO_IOMMU_ATTACH_F_BYPASS),
+	};
+
+	ret = viommu_send_attach_req(vdev->viommu, dev, &req);
+	if (ret)
+		return ret;
+
+	if (vdev->vdomain)
+		vdev->vdomain->nr_endpoints--;
+	vdomain->nr_endpoints++;
+	vdev->vdomain = vdomain;
+	return 0;
+}
+
+static struct viommu_domain viommu_identity_domain = {
+	.domain = {
+		.type = IOMMU_DOMAIN_IDENTITY,
+		.ops = &(const struct iommu_domain_ops) {
+			.attach_dev = viommu_attach_identity_domain,
+		},
+	},
+};
+
 static void viommu_detach_dev(struct viommu_endpoint *vdev)
 {
 	int i;
@@ -972,8 +1000,7 @@ static void viommu_get_resv_regions(struct device *dev, struct list_head *head)
 	iommu_dma_get_resv_regions(dev, head);
 }
 
-static struct iommu_ops viommu_ops;
-static struct virtio_driver virtio_iommu_drv;
+static const struct bus_type *virtio_bus_type;
 
 static int viommu_match_node(struct device *dev, const void *data)
 {
@@ -982,8 +1009,9 @@ static int viommu_match_node(struct device *dev, const void *data)
 
 static struct viommu_dev *viommu_get_by_fwnode(struct fwnode_handle *fwnode)
 {
-	struct device *dev = driver_find_device(&virtio_iommu_drv.driver, NULL,
-						fwnode, viommu_match_node);
+	struct device *dev = bus_find_device(virtio_bus_type, NULL, fwnode,
+					     viommu_match_node);
+	put_device(dev);
 
 	return dev ? dev_to_virtio(dev)->priv : NULL;
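
The identity-domain rework above follows a two-tier shape: when the device offers VIRTIO_IOMMU_F_BYPASS_CONFIG, one static bypass domain serves every endpoint; otherwise identity is emulated with 1:1 mappings in an ordinary paging domain. A condensed sketch of that fallback pattern, with hypothetical my_* names in place of the driver's real helpers:

	static struct iommu_domain *my_alloc_identity(struct device *dev)
	{
		struct iommu_domain *domain;
		int ret;

		if (my_hw_has_bypass(dev))	/* hypothetical check */
			return &my_static_identity_domain;

		domain = my_alloc_paging(dev);
		if (IS_ERR(domain))
			return domain;

		ret = my_map_ram_1to1(dev, domain);	/* hypothetical */
		if (ret) {
			my_domain_free(domain);
			return ERR_PTR(ret);
		}
		return domain;
	}
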
@@ -1060,9 +1088,10 @@ static bool viommu_capable(struct device *dev, enum iommu_cap cap)
 	}
 }
 
-static struct iommu_ops viommu_ops = {
+static const struct iommu_ops viommu_ops = {
 	.capable		= viommu_capable,
-	.domain_alloc		= viommu_domain_alloc,
+	.domain_alloc_identity	= viommu_domain_alloc_identity,
+	.domain_alloc_paging	= viommu_domain_alloc_paging,
 	.probe_device		= viommu_probe_device,
 	.release_device		= viommu_release_device,
 	.device_group		= viommu_device_group,
@@ -1084,14 +1113,13 @@ static struct iommu_ops viommu_ops = {
 static int viommu_init_vqs(struct viommu_dev *viommu)
 {
 	struct virtio_device *vdev = dev_to_virtio(viommu->dev);
-	const char *names[] = { "request", "event" };
-	vq_callback_t *callbacks[] = {
-		NULL, /* No async requests */
-		viommu_event_handler,
+	struct virtqueue_info vqs_info[] = {
+		{ "request" },
+		{ "event", viommu_event_handler },
 	};
 
-	return virtio_find_vqs(vdev, VIOMMU_NR_VQS, viommu->vqs, callbacks,
-			       names, NULL);
+	return virtio_find_vqs(vdev, VIOMMU_NR_VQS, viommu->vqs,
+			       vqs_info, NULL);
 }
 
 static int viommu_fill_evtq(struct viommu_dev *viommu)
@@ -1134,6 +1162,9 @@ static int viommu_probe(struct virtio_device *vdev)
 	if (!viommu)
 		return -ENOMEM;
 
+	/* Borrow this for easy lookups later */
+	virtio_bus_type = dev->bus;
+
 	spin_lock_init(&viommu->request_lock);
 	ida_init(&viommu->domain_ids);
 	viommu->dev = dev;
@@ -1185,7 +1216,11 @@ static int viommu_probe(struct virtio_device *vdev)
 	if (virtio_has_feature(vdev, VIRTIO_IOMMU_F_MMIO))
 		viommu->map_flags |= VIRTIO_IOMMU_MAP_F_MMIO;
 
-	viommu_ops.pgsize_bitmap = viommu->pgsize_bitmap;
+	/* Reserve an ID to use as the bypass domain */
+	if (virtio_has_feature(viommu->vdev, VIRTIO_IOMMU_F_BYPASS_CONFIG)) {
+		viommu->identity_domain_id = viommu->first_domain;
+		viommu->first_domain++;
+	}
 
 	virtio_device_ready(vdev);
 
@@ -1199,10 +1234,10 @@
 	if (ret)
 		goto err_free_vqs;
 
-	iommu_device_register(&viommu->iommu, &viommu_ops, parent_dev);
-
-	vdev->priv = viommu;
+	vdev->priv = viommu;
+
+	iommu_device_register(&viommu->iommu, &viommu_ops, parent_dev);
+
 	dev_info(dev, "input address: %u bits\n",
 		 order_base_2(viommu->geometry.aperture_end));
 	dev_info(dev, "page mask: %#llx\n", viommu->pgsize_bitmap);
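
The final hunk reorders probe so that vdev->priv is assigned before iommu_device_register(): registration can invoke ->probe_device() before it returns, and that path looks the viommu up through vdev->priv, so publishing the pointer afterwards would leave a window where the lookup reads NULL. The safe ordering, reduced to a sketch with a hypothetical my_iommu wrapper:

	#include <linux/iommu.h>
	#include <linux/virtio.h>

	struct my_iommu {
		struct iommu_device iommu;
	};

	static const struct iommu_ops my_iommu_ops;

	static int my_register(struct virtio_device *vdev, struct my_iommu *mi)
	{
		vdev->priv = mi;	/* publish before registering */
		return iommu_device_register(&mi->iommu, &my_iommu_ops,
					     vdev->dev.parent);
	}
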
