Diffstat (limited to 'drivers/iommu/amd')
 drivers/iommu/amd/Kconfig           |    5
 drivers/iommu/amd/Makefile          |    2
 drivers/iommu/amd/amd_iommu.h       |    1
 drivers/iommu/amd/amd_iommu_types.h |  114
 drivers/iommu/amd/debugfs.c         |    2
 drivers/iommu/amd/init.c            |   15
 drivers/iommu/amd/io_pgtable.c      |  577
 drivers/iommu/amd/io_pgtable_v2.c   |  370
 drivers/iommu/amd/iommu.c           |  572
 9 files changed, 328 insertions(+), 1330 deletions(-)
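
The heart of this conversion is the struct protection_domain change in amd_iommu_types.h below: the driver-private struct amd_io_pgtable goes away and the generic-PT table objects are embedded in an anonymous union together with the iommu_domain handle, so the existing container_of()-style conversions keep working. The PT_IOMMU_CHECK_DOMAIN() lines assert that the iommu_domain member sits at the same offset in every view of the union. A minimal standalone sketch of that embedding pattern (the pt_iommu types here are simplified stand-ins, not the real generic-PT definitions):

#include <stddef.h>
#include <stdio.h>

/* Simplified stand-ins; only the embedding pattern is of interest here. */
struct iommu_domain   { int type; };
struct pt_iommu       { struct iommu_domain domain; };	/* generic table core */
struct pt_iommu_amdv1 { struct pt_iommu iommu; };	/* AMD v1 format view  */

struct protection_domain {
	union {
		struct iommu_domain domain;	/* handle used by the iommu core */
		struct pt_iommu iommu;		/* generic page-table object     */
		struct pt_iommu_amdv1 amdv1;	/* format-specific view          */
	};
	int id;
};

/*
 * Mirrors the driver's to_pdomain(): recovers the protection_domain from the
 * iommu_domain pointer handed in by the core. This only works because every
 * union member places the iommu_domain at the same offset, which is what
 * PT_IOMMU_CHECK_DOMAIN() verifies at build time in the real code.
 */
static struct protection_domain *to_pdomain(struct iommu_domain *dom)
{
	return (struct protection_domain *)((char *)dom -
			offsetof(struct protection_domain, domain));
}

int main(void)
{
	struct protection_domain pdom = { .id = 42 };

	printf("recovered id = %d\n", to_pdomain(&pdom.domain)->id);
	return 0;
}
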
diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig index ecef69c11144..f2acf471cb5d 100644 --- a/drivers/iommu/amd/Kconfig +++ b/drivers/iommu/amd/Kconfig @@ -11,10 +11,13 @@ config AMD_IOMMU select MMU_NOTIFIER select IOMMU_API select IOMMU_IOVA - select IOMMU_IO_PGTABLE select IOMMU_SVA select IOMMU_IOPF select IOMMUFD_DRIVER if IOMMUFD + select GENERIC_PT + select IOMMU_PT + select IOMMU_PT_AMDV1 + select IOMMU_PT_X86_64 depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE help With this option you can enable support for AMD IOMMU hardware in diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile index 59c04a67f398..5412a563c697 100644 --- a/drivers/iommu/amd/Makefile +++ b/drivers/iommu/amd/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o ppr.o pasid.o +obj-y += iommu.o init.o quirks.o ppr.o pasid.o obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 9b4b589a54b5..25044d28f28a 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -88,7 +88,6 @@ int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag); * the IOMMU used by this driver. */ void amd_iommu_flush_all_caches(struct amd_iommu *iommu); -void amd_iommu_update_and_flush_device_table(struct protection_domain *domain); void amd_iommu_domain_flush_pages(struct protection_domain *domain, u64 address, size_t size); void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data, diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index a698a2e7ce2a..78b1c44bd6b5 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -18,7 +18,7 @@ #include <linux/spinlock.h> #include <linux/pci.h> #include <linux/irqreturn.h> -#include <linux/io-pgtable.h> +#include <linux/generic_pt/iommu.h> /* * Maximum number of IOMMUs supported @@ -247,6 +247,10 @@ #define CMD_BUFFER_ENTRIES 512 #define MMIO_CMD_SIZE_SHIFT 56 #define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT) +#define MMIO_CMD_HEAD_MASK GENMASK_ULL(18, 4) /* Command buffer head ptr field [18:4] */ +#define MMIO_CMD_BUFFER_HEAD(x) FIELD_GET(MMIO_CMD_HEAD_MASK, (x)) +#define MMIO_CMD_TAIL_MASK GENMASK_ULL(18, 4) /* Command buffer tail ptr field [18:4] */ +#define MMIO_CMD_BUFFER_TAIL(x) FIELD_GET(MMIO_CMD_TAIL_MASK, (x)) /* constants for event buffer handling */ #define EVT_BUFFER_SIZE 8192 /* 512 entries */ @@ -337,76 +341,7 @@ #define GUEST_PGTABLE_4_LEVEL 0x00 #define GUEST_PGTABLE_5_LEVEL 0x01 -#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9)) -#define PM_LEVEL_SIZE(x) (((x) < 6) ? 
\ - ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \ - (0xffffffffffffffffULL)) -#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL) -#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL) -#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \ - IOMMU_PTE_PR | IOMMU_PTE_IR | IOMMU_PTE_IW) -#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL) - -#define PM_MAP_4k 0 #define PM_ADDR_MASK 0x000ffffffffff000ULL -#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \ - (~((1ULL << (12 + ((lvl) * 9))) - 1))) -#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr)) - -/* - * Returns the page table level to use for a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_LEVEL(pagesize) \ - ((__ffs(pagesize) - 12) / 9) -/* - * Returns the number of ptes to use for a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_PTE_COUNT(pagesize) \ - (1ULL << ((__ffs(pagesize) - 12) % 9)) - -/* - * Aligns a given io-virtual address to a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_ALIGN(address, pagesize) \ - ((address) & ~((pagesize) - 1)) -/* - * Creates an IOMMU PTE for an address and a given pagesize - * The PTE has no permission bits set - * Pagesize is expected to be a power-of-two larger than 4096 - */ -#define PAGE_SIZE_PTE(address, pagesize) \ - (((address) | ((pagesize) - 1)) & \ - (~(pagesize >> 1)) & PM_ADDR_MASK) - -/* - * Takes a PTE value with mode=0x07 and returns the page size it maps - */ -#define PTE_PAGE_SIZE(pte) \ - (1ULL << (1 + ffz(((pte) | 0xfffULL)))) - -/* - * Takes a page-table level and returns the default page-size for this level - */ -#define PTE_LEVEL_PAGE_SIZE(level) \ - (1ULL << (12 + (9 * (level)))) - -/* - * The IOPTE dirty bit - */ -#define IOMMU_PTE_HD_BIT (6) - -/* - * Bit value definition for I/O PTE fields - */ -#define IOMMU_PTE_PR BIT_ULL(0) -#define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT) -#define IOMMU_PTE_U BIT_ULL(59) -#define IOMMU_PTE_FC BIT_ULL(60) -#define IOMMU_PTE_IR BIT_ULL(61) -#define IOMMU_PTE_IW BIT_ULL(62) /* * Bit value definition for DTE fields @@ -436,12 +371,6 @@ /* DTE[128:179] | DTE[184:191] */ #define DTE_DATA2_INTR_MASK ~GENMASK_ULL(55, 52) -#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) -#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR) -#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD) -#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK)) -#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) - #define IOMMU_PROT_MASK 0x03 #define IOMMU_PROT_IR 0x01 #define IOMMU_PROT_IW 0x02 @@ -534,19 +463,6 @@ struct amd_irte_ops; #define AMD_IOMMU_FLAG_TRANS_PRE_ENABLED (1 << 0) -#define io_pgtable_to_data(x) \ - container_of((x), struct amd_io_pgtable, pgtbl) - -#define io_pgtable_ops_to_data(x) \ - io_pgtable_to_data(io_pgtable_ops_to_pgtable(x)) - -#define io_pgtable_ops_to_domain(x) \ - container_of(io_pgtable_ops_to_data(x), \ - struct protection_domain, iop) - -#define io_pgtable_cfg_to_data(x) \ - container_of((x), struct amd_io_pgtable, pgtbl.cfg) - struct gcr3_tbl_info { u64 *gcr3_tbl; /* Guest CR3 table */ int glx; /* Number of levels for GCR3 table */ @@ -554,14 +470,6 @@ struct gcr3_tbl_info { u16 domid; /* Per device domain ID */ }; -struct amd_io_pgtable { - seqcount_t seqcount; /* Protects root/mode update */ - struct io_pgtable pgtbl; - int mode; - u64 *root; - u64 *pgd; /* v2 pgtable pgd pointer */ -}; - enum protection_domain_mode { PD_MODE_NONE, PD_MODE_V1, @@ -589,10 +497,13 @@ struct 
pdom_iommu_info { * independent of their use. */ struct protection_domain { + union { + struct iommu_domain domain; + struct pt_iommu iommu; + struct pt_iommu_amdv1 amdv1; + struct pt_iommu_x86_64 amdv2; + }; struct list_head dev_list; /* List of all devices in this domain */ - struct iommu_domain domain; /* generic domain handle used by - iommu core code */ - struct amd_io_pgtable iop; spinlock_t lock; /* mostly used to lock the page table*/ u16 id; /* the domain id written to the device table */ enum protection_domain_mode pd_mode; /* Track page table type */ @@ -602,6 +513,9 @@ struct protection_domain { struct mmu_notifier mn; /* mmu notifier for the SVA domain */ struct list_head dev_data_list; /* List of pdom_dev_data */ }; +PT_IOMMU_CHECK_DOMAIN(struct protection_domain, iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv1.iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv2.iommu, domain); /* * This structure contains information about one PCI segment in the system. diff --git a/drivers/iommu/amd/debugfs.c b/drivers/iommu/amd/debugfs.c index 10fa217a7119..20b04996441d 100644 --- a/drivers/iommu/amd/debugfs.c +++ b/drivers/iommu/amd/debugfs.c @@ -37,7 +37,7 @@ static ssize_t iommu_mmio_write(struct file *filp, const char __user *ubuf, if (ret) return ret; - if (iommu->dbg_mmio_offset > iommu->mmio_phys_end - 4) { + if (iommu->dbg_mmio_offset > iommu->mmio_phys_end - sizeof(u64)) { iommu->dbg_mmio_offset = -1; return -EINVAL; } diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index f2991c11867c..4f4d4955269e 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1710,13 +1710,22 @@ static struct amd_iommu_pci_seg *__init alloc_pci_segment(u16 id, list_add_tail(&pci_seg->list, &amd_iommu_pci_seg_list); if (alloc_dev_table(pci_seg)) - return NULL; + goto err_free_pci_seg; if (alloc_alias_table(pci_seg)) - return NULL; + goto err_free_dev_table; if (alloc_rlookup_table(pci_seg)) - return NULL; + goto err_free_alias_table; return pci_seg; + +err_free_alias_table: + free_alias_table(pci_seg); +err_free_dev_table: + free_dev_table(pci_seg); +err_free_pci_seg: + list_del(&pci_seg->list); + kfree(pci_seg); + return NULL; } static struct amd_iommu_pci_seg *__init get_pci_segment(u16 id, diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c deleted file mode 100644 index 70c2f5b1631b..000000000000 --- a/drivers/iommu/amd/io_pgtable.c +++ /dev/null @@ -1,577 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * CPU-agnostic AMD IO page table allocator. - * - * Copyright (C) 2020 Advanced Micro Devices, Inc. 
- * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> - */ - -#define pr_fmt(fmt) "AMD-Vi: " fmt -#define dev_fmt(fmt) pr_fmt(fmt) - -#include <linux/atomic.h> -#include <linux/bitops.h> -#include <linux/io-pgtable.h> -#include <linux/kernel.h> -#include <linux/sizes.h> -#include <linux/slab.h> -#include <linux/types.h> -#include <linux/dma-mapping.h> -#include <linux/seqlock.h> - -#include <asm/barrier.h> - -#include "amd_iommu_types.h" -#include "amd_iommu.h" -#include "../iommu-pages.h" - -/* - * Helper function to get the first pte of a large mapping - */ -static u64 *first_pte_l7(u64 *pte, unsigned long *page_size, - unsigned long *count) -{ - unsigned long pte_mask, pg_size, cnt; - u64 *fpte; - - pg_size = PTE_PAGE_SIZE(*pte); - cnt = PAGE_SIZE_PTE_COUNT(pg_size); - pte_mask = ~((cnt << 3) - 1); - fpte = (u64 *)(((unsigned long)pte) & pte_mask); - - if (page_size) - *page_size = pg_size; - - if (count) - *count = cnt; - - return fpte; -} - -static void free_pt_lvl(u64 *pt, struct iommu_pages_list *freelist, int lvl) -{ - u64 *p; - int i; - - for (i = 0; i < 512; ++i) { - /* PTE present? */ - if (!IOMMU_PTE_PRESENT(pt[i])) - continue; - - /* Large PTE? */ - if (PM_PTE_LEVEL(pt[i]) == 0 || - PM_PTE_LEVEL(pt[i]) == 7) - continue; - - /* - * Free the next level. No need to look at l1 tables here since - * they can only contain leaf PTEs; just free them directly. - */ - p = IOMMU_PTE_PAGE(pt[i]); - if (lvl > 2) - free_pt_lvl(p, freelist, lvl - 1); - else - iommu_pages_list_add(freelist, p); - } - - iommu_pages_list_add(freelist, pt); -} - -static void free_sub_pt(u64 *root, int mode, struct iommu_pages_list *freelist) -{ - switch (mode) { - case PAGE_MODE_NONE: - case PAGE_MODE_7_LEVEL: - break; - case PAGE_MODE_1_LEVEL: - iommu_pages_list_add(freelist, root); - break; - case PAGE_MODE_2_LEVEL: - case PAGE_MODE_3_LEVEL: - case PAGE_MODE_4_LEVEL: - case PAGE_MODE_5_LEVEL: - case PAGE_MODE_6_LEVEL: - free_pt_lvl(root, freelist, mode); - break; - default: - BUG(); - } -} - -/* - * This function is used to add another level to an IO page table. Adding - * another level increases the size of the address space by 9 bits to a size up - * to 64 bits. 
- */ -static bool increase_address_space(struct amd_io_pgtable *pgtable, - unsigned long address, - unsigned int page_size_level, - gfp_t gfp) -{ - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; - struct protection_domain *domain = - container_of(pgtable, struct protection_domain, iop); - unsigned long flags; - bool ret = true; - u64 *pte; - - pte = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp, SZ_4K); - if (!pte) - return false; - - spin_lock_irqsave(&domain->lock, flags); - - if (address <= PM_LEVEL_SIZE(pgtable->mode) && - pgtable->mode - 1 >= page_size_level) - goto out; - - ret = false; - if (WARN_ON_ONCE(pgtable->mode == amd_iommu_hpt_level)) - goto out; - - *pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root)); - - write_seqcount_begin(&pgtable->seqcount); - pgtable->root = pte; - pgtable->mode += 1; - write_seqcount_end(&pgtable->seqcount); - - amd_iommu_update_and_flush_device_table(domain); - - pte = NULL; - ret = true; - -out: - spin_unlock_irqrestore(&domain->lock, flags); - iommu_free_pages(pte); - - return ret; -} - -static u64 *alloc_pte(struct amd_io_pgtable *pgtable, - unsigned long address, - unsigned long page_size, - u64 **pte_page, - gfp_t gfp, - bool *updated) -{ - unsigned long last_addr = address + (page_size - 1); - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; - unsigned int seqcount; - int level, end_lvl; - u64 *pte, *page; - - BUG_ON(!is_power_of_2(page_size)); - - while (last_addr > PM_LEVEL_SIZE(pgtable->mode) || - pgtable->mode - 1 < PAGE_SIZE_LEVEL(page_size)) { - /* - * Return an error if there is no memory to update the - * page-table. - */ - if (!increase_address_space(pgtable, last_addr, - PAGE_SIZE_LEVEL(page_size), gfp)) - return NULL; - } - - - do { - seqcount = read_seqcount_begin(&pgtable->seqcount); - - level = pgtable->mode - 1; - pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; - } while (read_seqcount_retry(&pgtable->seqcount, seqcount)); - - - address = PAGE_SIZE_ALIGN(address, page_size); - end_lvl = PAGE_SIZE_LEVEL(page_size); - - while (level > end_lvl) { - u64 __pte, __npte; - int pte_level; - - __pte = *pte; - pte_level = PM_PTE_LEVEL(__pte); - - /* - * If we replace a series of large PTEs, we need - * to tear down all of them. - */ - if (IOMMU_PTE_PRESENT(__pte) && - pte_level == PAGE_MODE_7_LEVEL) { - unsigned long count, i; - u64 *lpte; - - lpte = first_pte_l7(pte, NULL, &count); - - /* - * Unmap the replicated PTEs that still match the - * original large mapping - */ - for (i = 0; i < count; ++i) - cmpxchg64(&lpte[i], __pte, 0ULL); - - *updated = true; - continue; - } - - if (!IOMMU_PTE_PRESENT(__pte) || - pte_level == PAGE_MODE_NONE) { - page = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp, - SZ_4K); - - if (!page) - return NULL; - - __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page)); - - /* pte could have been changed somewhere. */ - if (!try_cmpxchg64(pte, &__pte, __npte)) - iommu_free_pages(page); - else if (IOMMU_PTE_PRESENT(__pte)) - *updated = true; - - continue; - } - - /* No level skipping support yet */ - if (pte_level != level) - return NULL; - - level -= 1; - - pte = IOMMU_PTE_PAGE(__pte); - - if (pte_page && level == end_lvl) - *pte_page = pte; - - pte = &pte[PM_LEVEL_INDEX(level, address)]; - } - - return pte; -} - -/* - * This function checks if there is a PTE for a given dma address. If - * there is one, it returns the pointer to it. 
- */ -static u64 *fetch_pte(struct amd_io_pgtable *pgtable, - unsigned long address, - unsigned long *page_size) -{ - int level; - unsigned int seqcount; - u64 *pte; - - *page_size = 0; - - if (address > PM_LEVEL_SIZE(pgtable->mode)) - return NULL; - - do { - seqcount = read_seqcount_begin(&pgtable->seqcount); - level = pgtable->mode - 1; - pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; - } while (read_seqcount_retry(&pgtable->seqcount, seqcount)); - - *page_size = PTE_LEVEL_PAGE_SIZE(level); - - while (level > 0) { - - /* Not Present */ - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; - - /* Large PTE */ - if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL || - PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE) - break; - - /* No level skipping support yet */ - if (PM_PTE_LEVEL(*pte) != level) - return NULL; - - level -= 1; - - /* Walk to the next level */ - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[PM_LEVEL_INDEX(level, address)]; - *page_size = PTE_LEVEL_PAGE_SIZE(level); - } - - /* - * If we have a series of large PTEs, make - * sure to return a pointer to the first one. - */ - if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL) - pte = first_pte_l7(pte, page_size, NULL); - - return pte; -} - -static void free_clear_pte(u64 *pte, u64 pteval, - struct iommu_pages_list *freelist) -{ - u64 *pt; - int mode; - - while (!try_cmpxchg64(pte, &pteval, 0)) - pr_warn("AMD-Vi: IOMMU pte changed since we read it\n"); - - if (!IOMMU_PTE_PRESENT(pteval)) - return; - - pt = IOMMU_PTE_PAGE(pteval); - mode = IOMMU_PTE_MODE(pteval); - - free_sub_pt(pt, mode, freelist); -} - -/* - * Generic mapping functions. It maps a physical address into a DMA - * address space. It allocates the page table pages if necessary. - * In the future it can be extended to a generic mapping function - * supporting all features of AMD IOMMU page tables like level skipping - * and full 64 bit address spaces. - */ -static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova, - phys_addr_t paddr, size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist); - bool updated = false; - u64 __pte, *pte; - int ret, i, count; - size_t size = pgcount << __ffs(pgsize); - unsigned long o_iova = iova; - - BUG_ON(!IS_ALIGNED(iova, pgsize)); - BUG_ON(!IS_ALIGNED(paddr, pgsize)); - - ret = -EINVAL; - if (!(prot & IOMMU_PROT_MASK)) - goto out; - - while (pgcount > 0) { - count = PAGE_SIZE_PTE_COUNT(pgsize); - pte = alloc_pte(pgtable, iova, pgsize, NULL, gfp, &updated); - - ret = -ENOMEM; - if (!pte) - goto out; - - for (i = 0; i < count; ++i) - free_clear_pte(&pte[i], pte[i], &freelist); - - if (!iommu_pages_list_empty(&freelist)) - updated = true; - - if (count > 1) { - __pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize); - __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC; - } else - __pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC; - - if (prot & IOMMU_PROT_IR) - __pte |= IOMMU_PTE_IR; - if (prot & IOMMU_PROT_IW) - __pte |= IOMMU_PTE_IW; - - for (i = 0; i < count; ++i) - pte[i] = __pte; - - iova += pgsize; - paddr += pgsize; - pgcount--; - if (mapped) - *mapped += pgsize; - } - - ret = 0; - -out: - if (updated) { - struct protection_domain *dom = io_pgtable_ops_to_domain(ops); - unsigned long flags; - - spin_lock_irqsave(&dom->lock, flags); - /* - * Flush domain TLB(s) and wait for completion. Any Device-Table - * Updates and flushing already happened in - * increase_address_space(). 
- */ - amd_iommu_domain_flush_pages(dom, o_iova, size); - spin_unlock_irqrestore(&dom->lock, flags); - } - - /* Everything flushed out, free pages now */ - iommu_put_pages_list(&freelist); - - return ret; -} - -static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops, - unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long long unmapped; - unsigned long unmap_size; - u64 *pte; - size_t size = pgcount << __ffs(pgsize); - - BUG_ON(!is_power_of_2(pgsize)); - - unmapped = 0; - - while (unmapped < size) { - pte = fetch_pte(pgtable, iova, &unmap_size); - if (pte) { - int i, count; - - count = PAGE_SIZE_PTE_COUNT(unmap_size); - for (i = 0; i < count; i++) - pte[i] = 0ULL; - } else { - return unmapped; - } - - iova = (iova & ~(unmap_size - 1)) + unmap_size; - unmapped += unmap_size; - } - - return unmapped; -} - -static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long offset_mask, pte_pgsize; - u64 *pte, __pte; - - pte = fetch_pte(pgtable, iova, &pte_pgsize); - - if (!pte || !IOMMU_PTE_PRESENT(*pte)) - return 0; - - offset_mask = pte_pgsize - 1; - __pte = __sme_clr(*pte & PM_ADDR_MASK); - - return (__pte & ~offset_mask) | (iova & offset_mask); -} - -static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size, - unsigned long flags) -{ - bool test_only = flags & IOMMU_DIRTY_NO_CLEAR; - bool dirty = false; - int i, count; - - /* - * 2.2.3.2 Host Dirty Support - * When a non-default page size is used , software must OR the - * Dirty bits in all of the replicated host PTEs used to map - * the page. The IOMMU does not guarantee the Dirty bits are - * set in all of the replicated PTEs. Any portion of the page - * may have been written even if the Dirty bit is set in only - * one of the replicated PTEs. - */ - count = PAGE_SIZE_PTE_COUNT(size); - for (i = 0; i < count && test_only; i++) { - if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) { - dirty = true; - break; - } - } - - for (i = 0; i < count && !test_only; i++) { - if (test_and_clear_bit(IOMMU_PTE_HD_BIT, - (unsigned long *)&ptep[i])) { - dirty = true; - } - } - - return dirty; -} - -static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops, - unsigned long iova, size_t size, - unsigned long flags, - struct iommu_dirty_bitmap *dirty) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long end = iova + size - 1; - - do { - unsigned long pgsize = 0; - u64 *ptep, pte; - - ptep = fetch_pte(pgtable, iova, &pgsize); - if (ptep) - pte = READ_ONCE(*ptep); - if (!ptep || !IOMMU_PTE_PRESENT(pte)) { - pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0); - iova += pgsize; - continue; - } - - /* - * Mark the whole IOVA range as dirty even if only one of - * the replicated PTEs were marked dirty. 
- */ - if (pte_test_and_clear_dirty(ptep, pgsize, flags)) - iommu_dirty_bitmap_record(dirty, iova, pgsize); - iova += pgsize; - } while (iova < end); - - return 0; -} - -/* - * ---------------------------------------------------- - */ -static void v1_free_pgtable(struct io_pgtable *iop) -{ - struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl); - struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist); - - if (pgtable->mode == PAGE_MODE_NONE) - return; - - /* Page-table is not visible to IOMMU anymore, so free it */ - BUG_ON(pgtable->mode < PAGE_MODE_NONE || - pgtable->mode > amd_iommu_hpt_level); - - free_sub_pt(pgtable->root, pgtable->mode, &freelist); - iommu_put_pages_list(&freelist); -} - -static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) -{ - struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); - - pgtable->root = - iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K); - if (!pgtable->root) - return NULL; - pgtable->mode = PAGE_MODE_3_LEVEL; - seqcount_init(&pgtable->seqcount); - - cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap; - cfg->ias = IOMMU_IN_ADDR_BIT_SIZE; - cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE; - - pgtable->pgtbl.ops.map_pages = iommu_v1_map_pages; - pgtable->pgtbl.ops.unmap_pages = iommu_v1_unmap_pages; - pgtable->pgtbl.ops.iova_to_phys = iommu_v1_iova_to_phys; - pgtable->pgtbl.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty; - - return &pgtable->pgtbl; -} - -struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = { - .alloc = v1_alloc_pgtable, - .free = v1_free_pgtable, -}; diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c deleted file mode 100644 index b47941353ccb..000000000000 --- a/drivers/iommu/amd/io_pgtable_v2.c +++ /dev/null @@ -1,370 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * CPU-agnostic AMD IO page table v2 allocator. - * - * Copyright (C) 2022, 2023 Advanced Micro Devices, Inc. 
- * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> - * Author: Vasant Hegde <vasant.hegde@amd.com> - */ - -#define pr_fmt(fmt) "AMD-Vi: " fmt -#define dev_fmt(fmt) pr_fmt(fmt) - -#include <linux/bitops.h> -#include <linux/io-pgtable.h> -#include <linux/kernel.h> - -#include <asm/barrier.h> - -#include "amd_iommu_types.h" -#include "amd_iommu.h" -#include "../iommu-pages.h" - -#define IOMMU_PAGE_PRESENT BIT_ULL(0) /* Is present */ -#define IOMMU_PAGE_RW BIT_ULL(1) /* Writeable */ -#define IOMMU_PAGE_USER BIT_ULL(2) /* Userspace addressable */ -#define IOMMU_PAGE_PWT BIT_ULL(3) /* Page write through */ -#define IOMMU_PAGE_PCD BIT_ULL(4) /* Page cache disabled */ -#define IOMMU_PAGE_ACCESS BIT_ULL(5) /* Was accessed (updated by IOMMU) */ -#define IOMMU_PAGE_DIRTY BIT_ULL(6) /* Was written to (updated by IOMMU) */ -#define IOMMU_PAGE_PSE BIT_ULL(7) /* Page Size Extensions */ -#define IOMMU_PAGE_NX BIT_ULL(63) /* No execute */ - -#define MAX_PTRS_PER_PAGE 512 - -#define IOMMU_PAGE_SIZE_2M BIT_ULL(21) -#define IOMMU_PAGE_SIZE_1G BIT_ULL(30) - - -static inline int get_pgtable_level(void) -{ - return amd_iommu_gpt_level; -} - -static inline bool is_large_pte(u64 pte) -{ - return (pte & IOMMU_PAGE_PSE); -} - -static inline u64 set_pgtable_attr(u64 *page) -{ - u64 prot; - - prot = IOMMU_PAGE_PRESENT | IOMMU_PAGE_RW | IOMMU_PAGE_USER; - prot |= IOMMU_PAGE_ACCESS; - - return (iommu_virt_to_phys(page) | prot); -} - -static inline void *get_pgtable_pte(u64 pte) -{ - return iommu_phys_to_virt(pte & PM_ADDR_MASK); -} - -static u64 set_pte_attr(u64 paddr, u64 pg_size, int prot) -{ - u64 pte; - - pte = __sme_set(paddr & PM_ADDR_MASK); - pte |= IOMMU_PAGE_PRESENT | IOMMU_PAGE_USER; - pte |= IOMMU_PAGE_ACCESS | IOMMU_PAGE_DIRTY; - - if (prot & IOMMU_PROT_IW) - pte |= IOMMU_PAGE_RW; - - /* Large page */ - if (pg_size == IOMMU_PAGE_SIZE_1G || pg_size == IOMMU_PAGE_SIZE_2M) - pte |= IOMMU_PAGE_PSE; - - return pte; -} - -static inline u64 get_alloc_page_size(u64 size) -{ - if (size >= IOMMU_PAGE_SIZE_1G) - return IOMMU_PAGE_SIZE_1G; - - if (size >= IOMMU_PAGE_SIZE_2M) - return IOMMU_PAGE_SIZE_2M; - - return PAGE_SIZE; -} - -static inline int page_size_to_level(u64 pg_size) -{ - if (pg_size == IOMMU_PAGE_SIZE_1G) - return PAGE_MODE_3_LEVEL; - if (pg_size == IOMMU_PAGE_SIZE_2M) - return PAGE_MODE_2_LEVEL; - - return PAGE_MODE_1_LEVEL; -} - -static void free_pgtable(u64 *pt, int level) -{ - u64 *p; - int i; - - for (i = 0; i < MAX_PTRS_PER_PAGE; i++) { - /* PTE present? */ - if (!IOMMU_PTE_PRESENT(pt[i])) - continue; - - if (is_large_pte(pt[i])) - continue; - - /* - * Free the next level. No need to look at l1 tables here since - * they can only contain leaf PTEs; just free them directly. 
- */ - p = get_pgtable_pte(pt[i]); - if (level > 2) - free_pgtable(p, level - 1); - else - iommu_free_pages(p); - } - - iommu_free_pages(pt); -} - -/* Allocate page table */ -static u64 *v2_alloc_pte(int nid, u64 *pgd, unsigned long iova, - unsigned long pg_size, gfp_t gfp, bool *updated) -{ - u64 *pte, *page; - int level, end_level; - - level = get_pgtable_level() - 1; - end_level = page_size_to_level(pg_size); - pte = &pgd[PM_LEVEL_INDEX(level, iova)]; - iova = PAGE_SIZE_ALIGN(iova, PAGE_SIZE); - - while (level >= end_level) { - u64 __pte, __npte; - - __pte = *pte; - - if (IOMMU_PTE_PRESENT(__pte) && is_large_pte(__pte)) { - /* Unmap large pte */ - cmpxchg64(pte, *pte, 0ULL); - *updated = true; - continue; - } - - if (!IOMMU_PTE_PRESENT(__pte)) { - page = iommu_alloc_pages_node_sz(nid, gfp, SZ_4K); - if (!page) - return NULL; - - __npte = set_pgtable_attr(page); - /* pte could have been changed somewhere. */ - if (!try_cmpxchg64(pte, &__pte, __npte)) - iommu_free_pages(page); - else if (IOMMU_PTE_PRESENT(__pte)) - *updated = true; - - continue; - } - - level -= 1; - pte = get_pgtable_pte(__pte); - pte = &pte[PM_LEVEL_INDEX(level, iova)]; - } - - /* Tear down existing pte entries */ - if (IOMMU_PTE_PRESENT(*pte)) { - u64 *__pte; - - *updated = true; - __pte = get_pgtable_pte(*pte); - cmpxchg64(pte, *pte, 0ULL); - if (pg_size == IOMMU_PAGE_SIZE_1G) - free_pgtable(__pte, end_level - 1); - else if (pg_size == IOMMU_PAGE_SIZE_2M) - iommu_free_pages(__pte); - } - - return pte; -} - -/* - * This function checks if there is a PTE for a given dma address. - * If there is one, it returns the pointer to it. - */ -static u64 *fetch_pte(struct amd_io_pgtable *pgtable, - unsigned long iova, unsigned long *page_size) -{ - u64 *pte; - int level; - - level = get_pgtable_level() - 1; - pte = &pgtable->pgd[PM_LEVEL_INDEX(level, iova)]; - /* Default page size is 4K */ - *page_size = PAGE_SIZE; - - while (level) { - /* Not present */ - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; - - /* Walk to the next level */ - pte = get_pgtable_pte(*pte); - pte = &pte[PM_LEVEL_INDEX(level - 1, iova)]; - - /* Large page */ - if (is_large_pte(*pte)) { - if (level == PAGE_MODE_3_LEVEL) - *page_size = IOMMU_PAGE_SIZE_1G; - else if (level == PAGE_MODE_2_LEVEL) - *page_size = IOMMU_PAGE_SIZE_2M; - else - return NULL; /* Wrongly set PSE bit in PTE */ - - break; - } - - level -= 1; - } - - return pte; -} - -static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova, - phys_addr_t paddr, size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; - u64 *pte; - unsigned long map_size; - unsigned long mapped_size = 0; - unsigned long o_iova = iova; - size_t size = pgcount << __ffs(pgsize); - int ret = 0; - bool updated = false; - - if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize) || !pgcount) - return -EINVAL; - - if (!(prot & IOMMU_PROT_MASK)) - return -EINVAL; - - while (mapped_size < size) { - map_size = get_alloc_page_size(pgsize); - pte = v2_alloc_pte(cfg->amd.nid, pgtable->pgd, - iova, map_size, gfp, &updated); - if (!pte) { - ret = -ENOMEM; - goto out; - } - - *pte = set_pte_attr(paddr, map_size, prot); - - iova += map_size; - paddr += map_size; - mapped_size += map_size; - } - -out: - if (updated) { - struct protection_domain *pdom = io_pgtable_ops_to_domain(ops); - unsigned long flags; - - spin_lock_irqsave(&pdom->lock, flags); - amd_iommu_domain_flush_pages(pdom, 
o_iova, size); - spin_unlock_irqrestore(&pdom->lock, flags); - } - - if (mapped) - *mapped += mapped_size; - - return ret; -} - -static unsigned long iommu_v2_unmap_pages(struct io_pgtable_ops *ops, - unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; - unsigned long unmap_size; - unsigned long unmapped = 0; - size_t size = pgcount << __ffs(pgsize); - u64 *pte; - - if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize || !pgcount)) - return 0; - - while (unmapped < size) { - pte = fetch_pte(pgtable, iova, &unmap_size); - if (!pte) - return unmapped; - - *pte = 0ULL; - - iova = (iova & ~(unmap_size - 1)) + unmap_size; - unmapped += unmap_size; - } - - return unmapped; -} - -static phys_addr_t iommu_v2_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long offset_mask, pte_pgsize; - u64 *pte, __pte; - - pte = fetch_pte(pgtable, iova, &pte_pgsize); - if (!pte || !IOMMU_PTE_PRESENT(*pte)) - return 0; - - offset_mask = pte_pgsize - 1; - __pte = __sme_clr(*pte & PM_ADDR_MASK); - - return (__pte & ~offset_mask) | (iova & offset_mask); -} - -/* - * ---------------------------------------------------- - */ -static void v2_free_pgtable(struct io_pgtable *iop) -{ - struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl); - - if (!pgtable || !pgtable->pgd) - return; - - /* Free page table */ - free_pgtable(pgtable->pgd, get_pgtable_level()); - pgtable->pgd = NULL; -} - -static struct io_pgtable *v2_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) -{ - struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); - int ias = IOMMU_IN_ADDR_BIT_SIZE; - - pgtable->pgd = iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K); - if (!pgtable->pgd) - return NULL; - - if (get_pgtable_level() == PAGE_MODE_5_LEVEL) - ias = 57; - - pgtable->pgtbl.ops.map_pages = iommu_v2_map_pages; - pgtable->pgtbl.ops.unmap_pages = iommu_v2_unmap_pages; - pgtable->pgtbl.ops.iova_to_phys = iommu_v2_iova_to_phys; - - cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES_V2; - cfg->ias = ias; - cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE; - - return &pgtable->pgtbl; -} - -struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns = { - .alloc = v2_alloc_pgtable, - .free = v2_free_pgtable, -}; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 2e1865daa1ce..9f1d56a5e145 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -30,7 +30,6 @@ #include <linux/msi.h> #include <linux/irqdomain.h> #include <linux/percpu.h> -#include <linux/io-pgtable.h> #include <linux/cc_platform.h> #include <asm/irq_remapping.h> #include <asm/io_apic.h> @@ -41,9 +40,9 @@ #include <asm/gart.h> #include <asm/dma.h> #include <uapi/linux/iommufd.h> +#include <linux/generic_pt/iommu.h> #include "amd_iommu.h" -#include "../dma-iommu.h" #include "../irq_remapping.h" #include "../iommu-pages.h" @@ -60,7 +59,6 @@ LIST_HEAD(hpet_map); LIST_HEAD(acpihid_map); const struct iommu_ops amd_iommu_ops; -static const struct iommu_dirty_ops amd_dirty_ops; int amd_iommu_max_glx_val = -1; @@ -70,15 +68,22 @@ int amd_iommu_max_glx_val = -1; */ DEFINE_IDA(pdom_ids); -static int amd_iommu_attach_device(struct iommu_domain *dom, - struct device *dev); +static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev, + struct iommu_domain *old); static void 
set_dte_entry(struct amd_iommu *iommu, - struct iommu_dev_data *dev_data); + struct iommu_dev_data *dev_data, + phys_addr_t top_paddr, unsigned int top_level); + +static void amd_iommu_change_top(struct pt_iommu *iommu_table, + phys_addr_t top_paddr, unsigned int top_level); static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid); static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid); +static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain); +static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable); /**************************************************************************** * @@ -1157,6 +1162,25 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) * ****************************************************************************/ +static void dump_command_buffer(struct amd_iommu *iommu) +{ + struct iommu_cmd *cmd; + u32 head, tail; + int i; + + head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + + pr_err("CMD Buffer head=%llu tail=%llu\n", MMIO_CMD_BUFFER_HEAD(head), + MMIO_CMD_BUFFER_TAIL(tail)); + + for (i = 0; i < CMD_BUFFER_ENTRIES; i++) { + cmd = (struct iommu_cmd *)(iommu->cmd_buf + i * sizeof(*cmd)); + pr_err("%3d: %08x %08x %08x %08x\n", i, cmd->data[0], cmd->data[1], cmd->data[2], + cmd->data[3]); + } +} + static int wait_on_sem(struct amd_iommu *iommu, u64 data) { int i = 0; @@ -1167,7 +1191,14 @@ static int wait_on_sem(struct amd_iommu *iommu, u64 data) } if (i == LOOP_TIMEOUT) { - pr_alert("Completion-Wait loop timed out\n"); + + pr_alert("IOMMU %04x:%02x:%02x.%01x: Completion-Wait loop timed out\n", + iommu->pci_seg->id, PCI_BUS_NUM(iommu->devid), + PCI_SLOT(iommu->devid), PCI_FUNC(iommu->devid)); + + if (amd_iommu_dump) + DO_ONCE_LITE(dump_command_buffer, iommu); + return -EIO; } @@ -1756,42 +1787,6 @@ static void dev_flush_pasid_all(struct iommu_dev_data *dev_data, CMD_INV_IOMMU_ALL_PAGES_ADDRESS); } -/* Flush the not present cache if it exists */ -static void domain_flush_np_cache(struct protection_domain *domain, - dma_addr_t iova, size_t size) -{ - if (unlikely(amd_iommu_np_cache)) { - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - amd_iommu_domain_flush_pages(domain, iova, size); - spin_unlock_irqrestore(&domain->lock, flags); - } -} - - -/* - * This function flushes the DTEs for all devices in domain - */ -void amd_iommu_update_and_flush_device_table(struct protection_domain *domain) -{ - struct iommu_dev_data *dev_data; - - lockdep_assert_held(&domain->lock); - - list_for_each_entry(dev_data, &domain->dev_list, list) { - struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev); - - set_dte_entry(iommu, dev_data); - clone_aliases(iommu, dev_data->dev); - } - - list_for_each_entry(dev_data, &domain->dev_list, list) - device_flush_dte(dev_data); - - domain_flush_complete(domain); -} - int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag) { struct iommu_dev_data *dev_data; @@ -2051,7 +2046,8 @@ static void set_dte_gcr3_table(struct amd_iommu *iommu, } static void set_dte_entry(struct amd_iommu *iommu, - struct iommu_dev_data *dev_data) + struct iommu_dev_data *dev_data, + phys_addr_t top_paddr, unsigned int top_level) { u16 domid; u32 old_domid; @@ -2060,19 +2056,36 @@ static void set_dte_entry(struct amd_iommu *iommu, struct protection_domain *domain = dev_data->domain; struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; struct dev_table_entry *dte = 
&get_dev_table(iommu)[dev_data->devid]; + struct pt_iommu_amdv1_hw_info pt_info; + + make_clear_dte(dev_data, dte, &new); if (gcr3_info && gcr3_info->gcr3_tbl) domid = dev_data->gcr3_info.domid; - else + else { domid = domain->id; - make_clear_dte(dev_data, dte, &new); - - if (domain->iop.mode != PAGE_MODE_NONE) - new.data[0] |= iommu_virt_to_phys(domain->iop.root); + if (domain->domain.type & __IOMMU_DOMAIN_PAGING) { + /* + * When updating the IO pagetable, the new top and level + * are provided as parameters. For other operations i.e. + * device attach, retrieve the current pagetable info + * via the IOMMU PT API. + */ + if (top_paddr) { + pt_info.host_pt_root = top_paddr; + pt_info.mode = top_level + 1; + } else { + WARN_ON(top_paddr || top_level); + pt_iommu_amdv1_hw_info(&domain->amdv1, + &pt_info); + } - new.data[0] |= (domain->iop.mode & DEV_ENTRY_MODE_MASK) - << DEV_ENTRY_MODE_SHIFT; + new.data[0] |= __sme_set(pt_info.host_pt_root) | + (pt_info.mode & DEV_ENTRY_MODE_MASK) + << DEV_ENTRY_MODE_SHIFT; + } + } new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW; @@ -2138,7 +2151,7 @@ static void dev_update_dte(struct iommu_dev_data *dev_data, bool set) struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); if (set) - set_dte_entry(iommu, dev_data); + set_dte_entry(iommu, dev_data, 0, 0); else clear_dte_entry(iommu, dev_data); @@ -2156,6 +2169,7 @@ static int init_gcr3_table(struct iommu_dev_data *dev_data, { struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); int max_pasids = dev_data->max_pasids; + struct pt_iommu_x86_64_hw_info pt_info; int ret = 0; /* @@ -2178,7 +2192,8 @@ static int init_gcr3_table(struct iommu_dev_data *dev_data, if (!pdom_is_v2_pgtbl_mode(pdom)) return ret; - ret = update_gcr3(dev_data, 0, iommu_virt_to_phys(pdom->iop.pgd), true); + pt_iommu_x86_64_hw_info(&pdom->amdv2, &pt_info); + ret = update_gcr3(dev_data, 0, __sme_set(pt_info.gcr3_pt), true); if (ret) free_gcr3_table(&dev_data->gcr3_info); @@ -2500,94 +2515,240 @@ struct protection_domain *protection_domain_alloc(void) return domain; } -static int pdom_setup_pgtable(struct protection_domain *domain, - struct device *dev) +static bool amd_iommu_hd_support(struct amd_iommu *iommu) +{ + if (amd_iommu_hatdis) + return false; + + return iommu && (iommu->features & FEATURE_HDSUP); +} + +static spinlock_t *amd_iommu_get_top_lock(struct pt_iommu *iommupt) { - struct io_pgtable_ops *pgtbl_ops; - enum io_pgtable_fmt fmt; + struct protection_domain *pdom = + container_of(iommupt, struct protection_domain, iommu); - switch (domain->pd_mode) { - case PD_MODE_V1: - fmt = AMD_IOMMU_V1; - break; - case PD_MODE_V2: - fmt = AMD_IOMMU_V2; - break; - case PD_MODE_NONE: - WARN_ON_ONCE(1); - return -EPERM; + return &pdom->lock; +} + +/* + * Update all HW references to the domain with a new pgtable configuration. 
+ */ +static void amd_iommu_change_top(struct pt_iommu *iommu_table, + phys_addr_t top_paddr, unsigned int top_level) +{ + struct protection_domain *pdom = + container_of(iommu_table, struct protection_domain, iommu); + struct iommu_dev_data *dev_data; + + lockdep_assert_held(&pdom->lock); + + /* Update the DTE for all devices attached to this domain */ + list_for_each_entry(dev_data, &pdom->dev_list, list) { + struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev); + + /* Update the HW references with the new level and top ptr */ + set_dte_entry(iommu, dev_data, top_paddr, top_level); + clone_aliases(iommu, dev_data->dev); } - domain->iop.pgtbl.cfg.amd.nid = dev_to_node(dev); - pgtbl_ops = alloc_io_pgtable_ops(fmt, &domain->iop.pgtbl.cfg, domain); - if (!pgtbl_ops) - return -ENOMEM; + list_for_each_entry(dev_data, &pdom->dev_list, list) + device_flush_dte(dev_data); + + domain_flush_complete(pdom); +} + +/* + * amd_iommu_iotlb_sync_map() is used to generate flushes for non-present to + * present (ie mapping) operations. It is a NOP if the IOMMU doesn't have non + * present caching (like hypervisor shadowing). + */ +static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom, + unsigned long iova, size_t size) +{ + struct protection_domain *domain = to_pdomain(dom); + unsigned long flags; + if (likely(!amd_iommu_np_cache)) + return 0; + + spin_lock_irqsave(&domain->lock, flags); + amd_iommu_domain_flush_pages(domain, iova, size); + spin_unlock_irqrestore(&domain->lock, flags); return 0; } -static inline u64 dma_max_address(enum protection_domain_mode pgtable) +static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) { - if (pgtable == PD_MODE_V1) - return PM_LEVEL_SIZE(amd_iommu_hpt_level); + struct protection_domain *dom = to_pdomain(domain); + unsigned long flags; - /* - * V2 with 4/5 level page table. Note that "2.2.6.5 AMD64 4-Kbyte Page - * Translation" shows that the V2 table sign extends the top of the - * address space creating a reserved region in the middle of the - * translation, just like the CPU does. Further Vasant says the docs are - * incomplete and this only applies to non-zero PASIDs. If the AMDv2 - * page table is assigned to the 0 PASID then there is no sign extension - * check. - * - * Since the IOMMU must have a fixed geometry, and the core code does - * not understand sign extended addressing, we have to chop off the high - * bit to get consistent behavior with attachments of the domain to any - * PASID. 
- */ - return ((1ULL << (PM_LEVEL_SHIFT(amd_iommu_gpt_level) - 1)) - 1); + spin_lock_irqsave(&dom->lock, flags); + amd_iommu_domain_flush_all(dom); + spin_unlock_irqrestore(&dom->lock, flags); } -static bool amd_iommu_hd_support(struct amd_iommu *iommu) +static void amd_iommu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) { - if (amd_iommu_hatdis) - return false; + struct protection_domain *dom = to_pdomain(domain); + unsigned long flags; - return iommu && (iommu->features & FEATURE_HDSUP); + spin_lock_irqsave(&dom->lock, flags); + amd_iommu_domain_flush_pages(dom, gather->start, + gather->end - gather->start + 1); + spin_unlock_irqrestore(&dom->lock, flags); + iommu_put_pages_list(&gather->freelist); } -static struct iommu_domain * -do_iommu_domain_alloc(struct device *dev, u32 flags, - enum protection_domain_mode pgtable) +static const struct pt_iommu_driver_ops amd_hw_driver_ops_v1 = { + .get_top_lock = amd_iommu_get_top_lock, + .change_top = amd_iommu_change_top, +}; + +static const struct iommu_domain_ops amdv1_ops = { + IOMMU_PT_DOMAIN_OPS(amdv1), + .iotlb_sync_map = amd_iommu_iotlb_sync_map, + .flush_iotlb_all = amd_iommu_flush_iotlb_all, + .iotlb_sync = amd_iommu_iotlb_sync, + .attach_dev = amd_iommu_attach_device, + .free = amd_iommu_domain_free, + .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, +}; + +static const struct iommu_dirty_ops amdv1_dirty_ops = { + IOMMU_PT_DIRTY_OPS(amdv1), + .set_dirty_tracking = amd_iommu_set_dirty_tracking, +}; + +static struct iommu_domain *amd_iommu_domain_alloc_paging_v1(struct device *dev, + u32 flags) { - bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; - struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); + struct pt_iommu_amdv1_cfg cfg = {}; struct protection_domain *domain; int ret; + if (amd_iommu_hatdis) + return ERR_PTR(-EOPNOTSUPP); + domain = protection_domain_alloc(); if (!domain) return ERR_PTR(-ENOMEM); - domain->pd_mode = pgtable; - ret = pdom_setup_pgtable(domain, dev); + domain->pd_mode = PD_MODE_V1; + domain->iommu.driver_ops = &amd_hw_driver_ops_v1; + domain->iommu.nid = dev_to_node(dev); + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + domain->domain.dirty_ops = &amdv1_dirty_ops; + + /* + * Someday FORCE_COHERENCE should be set by + * amd_iommu_enforce_cache_coherency() like VT-d does. + */ + cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) | + BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE); + + /* + * AMD's IOMMU can flush as many pages as necessary in a single flush. + * Unless we run in a virtual machine, which can be inferred according + * to whether "non-present cache" is on, it is probably best to prefer + * (potentially) too extensive TLB flushing (i.e., more misses) over + * multiple TLB flushes (i.e., more flushes). For virtual machines the + * hypervisor needs to synchronize the host IOMMU PTEs with those of + * the guest, and the trade-off is different: unnecessary TLB flushes + * should be avoided. 
+ */ + if (amd_iommu_np_cache) + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS); + else + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE); + + cfg.common.hw_max_vasz_lg2 = + min(64, (amd_iommu_hpt_level - 1) * 9 + 21); + cfg.common.hw_max_oasz_lg2 = 52; + cfg.starting_level = 2; + domain->domain.ops = &amdv1_ops; + + ret = pt_iommu_amdv1_init(&domain->amdv1, &cfg, GFP_KERNEL); if (ret) { - pdom_id_free(domain->id); - kfree(domain); + amd_iommu_domain_free(&domain->domain); return ERR_PTR(ret); } - domain->domain.geometry.aperture_start = 0; - domain->domain.geometry.aperture_end = dma_max_address(pgtable); - domain->domain.geometry.force_aperture = true; - domain->domain.pgsize_bitmap = domain->iop.pgtbl.cfg.pgsize_bitmap; + /* + * Narrow the supported page sizes to those selected by the kernel + * command line. + */ + domain->domain.pgsize_bitmap &= amd_iommu_pgsize_bitmap; + return &domain->domain; +} - domain->domain.type = IOMMU_DOMAIN_UNMANAGED; - domain->domain.ops = iommu->iommu.ops->default_domain_ops; +static const struct iommu_domain_ops amdv2_ops = { + IOMMU_PT_DOMAIN_OPS(x86_64), + .iotlb_sync_map = amd_iommu_iotlb_sync_map, + .flush_iotlb_all = amd_iommu_flush_iotlb_all, + .iotlb_sync = amd_iommu_iotlb_sync, + .attach_dev = amd_iommu_attach_device, + .free = amd_iommu_domain_free, + /* + * Note the AMDv2 page table format does not support a Force Coherency + * bit, so enforce_cache_coherency should not be set. However VFIO is + * not prepared to handle a case where some domains will support + * enforcement and others do not. VFIO and iommufd will have to be fixed + * before it can fully use the V2 page table. See the comment in + * iommufd_hwpt_paging_alloc(). For now leave things as they have + * historically been and lie about enforce_cache_coherencey. + */ + .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, +}; - if (dirty_tracking) - domain->domain.dirty_ops = &amd_dirty_ops; +static struct iommu_domain *amd_iommu_domain_alloc_paging_v2(struct device *dev, + u32 flags) +{ + struct pt_iommu_x86_64_cfg cfg = {}; + struct protection_domain *domain; + int ret; + if (!amd_iommu_v2_pgtbl_supported()) + return ERR_PTR(-EOPNOTSUPP); + + domain = protection_domain_alloc(); + if (!domain) + return ERR_PTR(-ENOMEM); + + domain->pd_mode = PD_MODE_V2; + domain->iommu.nid = dev_to_node(dev); + + cfg.common.features = BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES); + if (amd_iommu_np_cache) + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS); + else + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE); + + /* + * The v2 table behaves differently if it is attached to PASID 0 vs a + * non-zero PASID. On PASID 0 it has no sign extension and the full + * 57/48 bits decode the lower addresses. Otherwise it behaves like a + * normal sign extended x86 page table. Since we want the domain to work + * in both modes the top bit is removed and PT_FEAT_SIGN_EXTEND is not + * set which creates a table that is compatible in both modes. 
+ */ + if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) { + cfg.common.hw_max_vasz_lg2 = 56; + cfg.top_level = 4; + } else { + cfg.common.hw_max_vasz_lg2 = 47; + cfg.top_level = 3; + } + cfg.common.hw_max_oasz_lg2 = 52; + domain->domain.ops = &amdv2_ops; + + ret = pt_iommu_x86_64_init(&domain->amdv2, &cfg, GFP_KERNEL); + if (ret) { + amd_iommu_domain_free(&domain->domain); + return ERR_PTR(ret); + } return &domain->domain; } @@ -2608,15 +2769,27 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, /* Allocate domain with v1 page table for dirty tracking */ if (!amd_iommu_hd_support(iommu)) break; - return do_iommu_domain_alloc(dev, flags, PD_MODE_V1); + return amd_iommu_domain_alloc_paging_v1(dev, flags); case IOMMU_HWPT_ALLOC_PASID: /* Allocate domain with v2 page table if IOMMU supports PASID. */ if (!amd_iommu_pasid_supported()) break; - return do_iommu_domain_alloc(dev, flags, PD_MODE_V2); - case 0: + return amd_iommu_domain_alloc_paging_v2(dev, flags); + case 0: { + struct iommu_domain *ret; + /* If nothing specific is required use the kernel commandline default */ - return do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable); + if (amd_iommu_pgtable == PD_MODE_V1) { + ret = amd_iommu_domain_alloc_paging_v1(dev, flags); + if (ret != ERR_PTR(-EOPNOTSUPP)) + return ret; + return amd_iommu_domain_alloc_paging_v2(dev, flags); + } + ret = amd_iommu_domain_alloc_paging_v2(dev, flags); + if (ret != ERR_PTR(-EOPNOTSUPP)) + return ret; + return amd_iommu_domain_alloc_paging_v1(dev, flags); + } default: break; } @@ -2628,14 +2801,14 @@ void amd_iommu_domain_free(struct iommu_domain *dom) struct protection_domain *domain = to_pdomain(dom); WARN_ON(!list_empty(&domain->dev_list)); - if (domain->domain.type & __IOMMU_DOMAIN_PAGING) - free_io_pgtable_ops(&domain->iop.pgtbl.ops); + pt_iommu_deinit(&domain->iommu); pdom_id_free(domain->id); kfree(domain); } static int blocked_domain_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); @@ -2685,16 +2858,8 @@ void amd_iommu_init_identity_domain(void) protection_domain_init(&identity_domain); } -/* Same as blocked domain except it supports only ops->attach_dev() */ -static struct iommu_domain release_domain = { - .type = IOMMU_DOMAIN_BLOCKED, - .ops = &(const struct iommu_domain_ops) { - .attach_dev = blocked_domain_attach_device, - } -}; - -static int amd_iommu_attach_device(struct iommu_domain *dom, - struct device *dev) +static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev, + struct iommu_domain *old) { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct protection_domain *domain = to_pdomain(dom); @@ -2734,93 +2899,6 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, return ret; } -static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom, - unsigned long iova, size_t size) -{ - struct protection_domain *domain = to_pdomain(dom); - struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; - - if (ops->map_pages) - domain_flush_np_cache(domain, iova, size); - return 0; -} - -static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova, - phys_addr_t paddr, size_t pgsize, size_t pgcount, - int iommu_prot, gfp_t gfp, size_t *mapped) -{ - struct protection_domain *domain = to_pdomain(dom); - struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; - int prot = 0; - int ret = -EINVAL; - - if ((domain->pd_mode == PD_MODE_V1) && - (domain->iop.mode == 
PAGE_MODE_NONE)) - return -EINVAL; - - if (iommu_prot & IOMMU_READ) - prot |= IOMMU_PROT_IR; - if (iommu_prot & IOMMU_WRITE) - prot |= IOMMU_PROT_IW; - - if (ops->map_pages) { - ret = ops->map_pages(ops, iova, paddr, pgsize, - pgcount, prot, gfp, mapped); - } - - return ret; -} - -static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain, - struct iommu_iotlb_gather *gather, - unsigned long iova, size_t size) -{ - /* - * AMD's IOMMU can flush as many pages as necessary in a single flush. - * Unless we run in a virtual machine, which can be inferred according - * to whether "non-present cache" is on, it is probably best to prefer - * (potentially) too extensive TLB flushing (i.e., more misses) over - * mutliple TLB flushes (i.e., more flushes). For virtual machines the - * hypervisor needs to synchronize the host IOMMU PTEs with those of - * the guest, and the trade-off is different: unnecessary TLB flushes - * should be avoided. - */ - if (amd_iommu_np_cache && - iommu_iotlb_gather_is_disjoint(gather, iova, size)) - iommu_iotlb_sync(domain, gather); - - iommu_iotlb_gather_add_range(gather, iova, size); -} - -static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) -{ - struct protection_domain *domain = to_pdomain(dom); - struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; - size_t r; - - if ((domain->pd_mode == PD_MODE_V1) && - (domain->iop.mode == PAGE_MODE_NONE)) - return 0; - - r = (ops->unmap_pages) ? ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0; - - if (r) - amd_iommu_iotlb_gather_add_page(dom, gather, iova, r); - - return r; -} - -static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, - dma_addr_t iova) -{ - struct protection_domain *domain = to_pdomain(dom); - struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; - - return ops->iova_to_phys(ops, iova); -} - static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) { switch (cap) { @@ -2887,28 +2965,6 @@ static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, return 0; } -static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain, - unsigned long iova, size_t size, - unsigned long flags, - struct iommu_dirty_bitmap *dirty) -{ - struct protection_domain *pdomain = to_pdomain(domain); - struct io_pgtable_ops *ops = &pdomain->iop.pgtbl.ops; - unsigned long lflags; - - if (!ops || !ops->read_and_clear_dirty) - return -EOPNOTSUPP; - - spin_lock_irqsave(&pdomain->lock, lflags); - if (!pdomain->dirty_tracking && dirty->bitmap) { - spin_unlock_irqrestore(&pdomain->lock, lflags); - return -EINVAL; - } - spin_unlock_irqrestore(&pdomain->lock, lflags); - - return ops->read_and_clear_dirty(ops, iova, size, flags, dirty); -} - static void amd_iommu_get_resv_regions(struct device *dev, struct list_head *head) { @@ -2978,28 +3034,6 @@ static bool amd_iommu_is_attach_deferred(struct device *dev) return dev_data->defer_attach; } -static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) -{ - struct protection_domain *dom = to_pdomain(domain); - unsigned long flags; - - spin_lock_irqsave(&dom->lock, flags); - amd_iommu_domain_flush_all(dom); - spin_unlock_irqrestore(&dom->lock, flags); -} - -static void amd_iommu_iotlb_sync(struct iommu_domain *domain, - struct iommu_iotlb_gather *gather) -{ - struct protection_domain *dom = to_pdomain(domain); - unsigned long flags; - - spin_lock_irqsave(&dom->lock, flags); - amd_iommu_domain_flush_pages(dom, gather->start, - gather->end - 
gather->start + 1); - spin_unlock_irqrestore(&dom->lock, flags); -} - static int amd_iommu_def_domain_type(struct device *dev) { struct iommu_dev_data *dev_data; @@ -3034,15 +3068,10 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) return true; } -static const struct iommu_dirty_ops amd_dirty_ops = { - .set_dirty_tracking = amd_iommu_set_dirty_tracking, - .read_and_clear_dirty = amd_iommu_read_and_clear_dirty, -}; - const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, .blocked_domain = &blocked_domain, - .release_domain = &release_domain, + .release_domain = &blocked_domain, .identity_domain = &identity_domain.domain, .domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags, .domain_alloc_sva = amd_iommu_domain_alloc_sva, @@ -3053,17 +3082,6 @@ const struct iommu_ops amd_iommu_ops = { .is_attach_deferred = amd_iommu_is_attach_deferred, .def_domain_type = amd_iommu_def_domain_type, .page_response = amd_iommu_page_response, - .default_domain_ops = &(const struct iommu_domain_ops) { - .attach_dev = amd_iommu_attach_device, - .map_pages = amd_iommu_map_pages, - .unmap_pages = amd_iommu_unmap_pages, - .iotlb_sync_map = amd_iommu_iotlb_sync_map, - .iova_to_phys = amd_iommu_iova_to_phys, - .flush_iotlb_all = amd_iommu_flush_iotlb_all, - .iotlb_sync = amd_iommu_iotlb_sync, - .free = amd_iommu_domain_free, - .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, - } }; #ifdef CONFIG_IRQ_REMAP @@ -3354,7 +3372,7 @@ static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, struct irte_ga *irte) { - bool ret; + int ret; ret = __modify_irte_ga(iommu, devid, index, irte); if (ret) @@ -4072,3 +4090,5 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu) return 0; } #endif + +MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); |
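
A note on the command-buffer dump added above: the MMIO_CMD_BUFFER_HEAD()/MMIO_CMD_BUFFER_TAIL() helpers introduced in amd_iommu_types.h extract bits [18:4] of the head/tail pointer registers, which effectively converts the 16-byte-aligned byte offset into a command-entry index that dump_command_buffer() prints alongside the per-entry dump. A standalone illustration of the bit manipulation (GENMASK_ULL() and FIELD_GET() are re-implemented here in simplified form; the kernel versions come from <linux/bits.h> and <linux/bitfield.h>):

#include <stdint.h>
#include <stdio.h>

/* Simplified userspace stand-ins for the kernel macros. */
#define GENMASK_ULL(h, l)	((~0ULL << (l)) & (~0ULL >> (63 - (h))))
#define FIELD_GET(mask, reg)	(((reg) & (mask)) >> __builtin_ctzll(mask))

#define MMIO_CMD_HEAD_MASK	GENMASK_ULL(18, 4)	/* head ptr field [18:4] */
#define MMIO_CMD_BUFFER_HEAD(x)	FIELD_GET(MMIO_CMD_HEAD_MASK, (x))

int main(void)
{
	/*
	 * A head register value of 0x120 is a byte offset of 288 into the
	 * command buffer; with 16-byte commands that is entry index 18.
	 */
	uint64_t head_reg = 0x120;

	printf("head entry index = %llu\n",
	       (unsigned long long)MMIO_CMD_BUFFER_HEAD(head_reg));
	return 0;
}

With that register value the helper yields 18, matching the i-th entry printed by the dump loop at cmd_buf + i * sizeof(struct iommu_cmd).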
