diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-11-01 16:44:56 -1000 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-11-01 16:44:56 -1000 |
| commit | 463f46e114f74465cf8d01b124e7b74ad1ce2afd (patch) | |
| tree | 67ba1f82d7a3baba926eab2a896b0f4c4f3aced2 /drivers/iommu/intel/pasid.c | |
| parent | ff269e2cd5adce4ae14f883fc9c8803bc43ee1e9 (diff) | |
| parent | b2b67c997bf74453f3469d8b54e4859f190943bd (diff) | |
Merge tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd
Pull iommufd updates from Jason Gunthorpe:
"This brings three new iommufd capabilities:
- Dirty tracking for DMA.
AMD/ARM/Intel CPUs can now record if a DMA writes to a page in the
IOPTEs within the IO page table. This can be used to generate a
record of what memory is being dirtied by DMA activities during a
VM migration process. A VMM like qemu will combine the IOMMU dirty
bits with the CPU's dirty log to determine what memory to transfer.
VFIO already has a DMA dirty tracking framework that requires PCI
devices to implement tracking HW internally. The iommufd version
provides an alternative that the VMM can select, if available. The
two are designed to have very similar APIs.
- Userspace controlled attributes for hardware page tables
(HWPT/iommu_domain). There are currently a few generic attributes
for HWPTs (support dirty tracking, and parent of a nest). This is
an entry point for the userspace iommu driver to control the HW in
detail.
- Nested translation support for HWPTs. This is a 2D translation
scheme similar to the CPU where a DMA goes through a first stage to
determine an intermediate address which is then translated trough a
second stage to a physical address.
Like for CPU translation the first stage table would exist in VM
controlled memory and the second stage is in the kernel and matches
the VM's guest to physical map.
As every IOMMU has a unique set of parameter to describe the S1 IO
page table and its associated parameters the userspace IOMMU driver
has to marshal the information into the correct format.
This is 1/3 of the feature, it allows creating the nested
translation and binding it to VFIO devices, however the API to
support IOTLB and ATC invalidation of the stage 1 io page table,
and forwarding of IO faults are still in progress.
The series includes AMD and Intel support for dirty tracking. Intel
support for nested translation.
Along the way are a number of internal items:
- New iommu core items: ops->domain_alloc_user(),
ops->set_dirty_tracking, ops->read_and_clear_dirty(),
IOMMU_DOMAIN_NESTED, and iommu_copy_struct_from_user
- UAF fix in iopt_area_split()
- Spelling fixes and some test suite improvement"
* tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd: (52 commits)
iommufd: Organize the mock domain alloc functions closer to Joerg's tree
iommufd/selftest: Fix page-size check in iommufd_test_dirty()
iommufd: Add iopt_area_alloc()
iommufd: Fix missing update of domains_itree after splitting iopt_area
iommu/vt-d: Disallow read-only mappings to nest parent domain
iommu/vt-d: Add nested domain allocation
iommu/vt-d: Set the nested domain to a device
iommu/vt-d: Make domain attach helpers to be extern
iommu/vt-d: Add helper to setup pasid nested translation
iommu/vt-d: Add helper for nested domain allocation
iommu/vt-d: Extend dmar_domain to support nested domain
iommufd: Add data structure for Intel VT-d stage-1 domain allocation
iommu/vt-d: Enhance capability check for nested parent domain allocation
iommufd/selftest: Add coverage for IOMMU_HWPT_ALLOC with nested HWPTs
iommufd/selftest: Add nested domain allocation for mock domain
iommu: Add iommu_copy_struct_from_user helper
iommufd: Add a nested HW pagetable object
iommu: Pass in parent domain with user_data to domain_alloc_user op
iommufd: Share iommufd_hwpt_alloc with IOMMUFD_OBJ_HWPT_NESTED
iommufd: Derive iommufd_hwpt_paging from iommufd_hw_pagetable
...
Diffstat (limited to 'drivers/iommu/intel/pasid.c')
| -rw-r--r-- | drivers/iommu/intel/pasid.c | 221 |
1 files changed, 221 insertions, 0 deletions
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 8f92b92f3d2a..74e8e4c17e81 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -277,6 +277,11 @@ static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits) WRITE_ONCE(*ptr, (old & ~mask) | bits); } +static inline u64 pasid_get_bits(u64 *ptr) +{ + return READ_ONCE(*ptr); +} + /* * Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode * PASID entry. @@ -336,6 +341,45 @@ static inline void pasid_set_fault_enable(struct pasid_entry *pe) } /* + * Enable second level A/D bits by setting the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry. + */ +static inline void pasid_set_ssade(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 9, 1 << 9); +} + +/* + * Disable second level A/D bits by clearing the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry. + */ +static inline void pasid_clear_ssade(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 9, 0); +} + +/* + * Checks if second level A/D bits specifically the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry is set. + */ +static inline bool pasid_get_ssade(struct pasid_entry *pe) +{ + return pasid_get_bits(&pe->val[0]) & (1 << 9); +} + +/* + * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a + * scalable mode PASID entry. + */ +static inline void pasid_set_sre(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 0, 1); +} + +/* * Setup the WPE(Write Protect Enable) field (Bit 132) of a * scalable mode PASID entry. */ @@ -402,6 +446,15 @@ pasid_set_flpm(struct pasid_entry *pe, u64 value) pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2); } +/* + * Setup the Extended Access Flag Enable (EAFE) field (Bit 135) + * of a scalable mode PASID entry. + */ +static inline void pasid_set_eafe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7); +} + static void pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, u32 pasid) @@ -627,6 +680,8 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); pasid_set_fault_enable(pte); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + if (domain->dirty_tracking) + pasid_set_ssade(pte); pasid_set_present(pte); spin_unlock(&iommu->lock); @@ -637,6 +692,78 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, } /* + * Set up dirty tracking on a second only or nested translation type. + */ +int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, u32 pasid, + bool enabled) +{ + struct pasid_entry *pte; + u16 did, pgtt; + + spin_lock(&iommu->lock); + + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + dev_err_ratelimited( + dev, "Failed to get pasid entry of PASID %d\n", pasid); + return -ENODEV; + } + + did = domain_id_iommu(domain, iommu); + pgtt = pasid_pte_get_pgtt(pte); + if (pgtt != PASID_ENTRY_PGTT_SL_ONLY && + pgtt != PASID_ENTRY_PGTT_NESTED) { + spin_unlock(&iommu->lock); + dev_err_ratelimited( + dev, + "Dirty tracking not supported on translation type %d\n", + pgtt); + return -EOPNOTSUPP; + } + + if (pasid_get_ssade(pte) == enabled) { + spin_unlock(&iommu->lock); + return 0; + } + + if (enabled) + pasid_set_ssade(pte); + else + pasid_clear_ssade(pte); + spin_unlock(&iommu->lock); + + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + /* + * From VT-d spec table 25 "Guidance to Software for Invalidations": + * + * - PASID-selective-within-Domain PASID-cache invalidation + * If (PGTT=SS or Nested) + * - Domain-selective IOTLB invalidation + * Else + * - PASID-selective PASID-based IOTLB invalidation + * - If (pasid is RID_PASID) + * - Global Device-TLB invalidation to affected functions + * Else + * - PASID-based Device-TLB invalidation (with S=1 and + * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions + */ + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + + iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); + + /* Device IOTLB doesn't need to be flushed in caching mode. */ + if (!cap_caching_mode(iommu->cap)) + devtlb_invalidation_with_pasid(iommu, dev, pasid); + + return 0; +} + +/* * Set up the scalable mode pasid entry for passthrough translation type. */ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, @@ -713,3 +840,97 @@ void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, if (!cap_caching_mode(iommu->cap)) devtlb_invalidation_with_pasid(iommu, dev, pasid); } + +/** + * intel_pasid_setup_nested() - Set up PASID entry for nested translation. + * @iommu: IOMMU which the device belong to + * @dev: Device to be set up for translation + * @pasid: PASID to be programmed in the device PASID table + * @domain: User stage-1 domain nested on a stage-2 domain + * + * This is used for nested translation. The input domain should be + * nested type and nested on a parent with 'is_nested_parent' flag + * set. + */ +int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, + u32 pasid, struct dmar_domain *domain) +{ + struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; + pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl; + struct dmar_domain *s2_domain = domain->s2_domain; + u16 did = domain_id_iommu(domain, iommu); + struct dma_pte *pgd = s2_domain->pgd; + struct pasid_entry *pte; + + /* Address width should match the address width supported by hardware */ + switch (s1_cfg->addr_width) { + case ADDR_WIDTH_4LEVEL: + break; + case ADDR_WIDTH_5LEVEL: + if (!cap_fl5lp_support(iommu->cap)) { + dev_err_ratelimited(dev, + "5-level paging not supported\n"); + return -EINVAL; + } + break; + default: + dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n", + s1_cfg->addr_width); + return -EINVAL; + } + + if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) { + pr_err_ratelimited("No supervisor request support on %s\n", + iommu->name); + return -EINVAL; + } + + if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) { + pr_err_ratelimited("No extended access flag support on %s\n", + iommu->name); + return -EINVAL; + } + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + if (pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EBUSY; + } + + pasid_clear_entry(pte); + + if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL) + pasid_set_flpm(pte, 1); + + pasid_set_flptr(pte, (uintptr_t)s1_gpgd); + + if (s1_cfg->flags & IOMMU_VTD_S1_SRE) { + pasid_set_sre(pte); + if (s1_cfg->flags & IOMMU_VTD_S1_WPE) + pasid_set_wpe(pte); + } + + if (s1_cfg->flags & IOMMU_VTD_S1_EAFE) + pasid_set_eafe(pte); + + if (s2_domain->force_snooping) + pasid_set_pgsnp(pte); + + pasid_set_slptr(pte, virt_to_phys(pgd)); + pasid_set_fault_enable(pte); + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, s2_domain->agaw); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); + pasid_set_present(pte); + spin_unlock(&iommu->lock); + + pasid_flush_caches(iommu, pte, pasid, did); + + return 0; +} |
