diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-11-01 16:44:56 -1000 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-11-01 16:44:56 -1000 |
| commit | 463f46e114f74465cf8d01b124e7b74ad1ce2afd (patch) | |
| tree | 67ba1f82d7a3baba926eab2a896b0f4c4f3aced2 /include/linux | |
| parent | ff269e2cd5adce4ae14f883fc9c8803bc43ee1e9 (diff) | |
| parent | b2b67c997bf74453f3469d8b54e4859f190943bd (diff) | |
Merge tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd
Pull iommufd updates from Jason Gunthorpe:
"This brings three new iommufd capabilities:
- Dirty tracking for DMA.
AMD/ARM/Intel CPUs can now record if a DMA writes to a page in the
IOPTEs within the IO page table. This can be used to generate a
record of what memory is being dirtied by DMA activities during a
VM migration process. A VMM like qemu will combine the IOMMU dirty
bits with the CPU's dirty log to determine what memory to transfer.
VFIO already has a DMA dirty tracking framework that requires PCI
devices to implement tracking HW internally. The iommufd version
provides an alternative that the VMM can select, if available. The
two are designed to have very similar APIs.
- Userspace controlled attributes for hardware page tables
(HWPT/iommu_domain). There are currently a few generic attributes
for HWPTs (support dirty tracking, and parent of a nest). This is
an entry point for the userspace iommu driver to control the HW in
detail.
- Nested translation support for HWPTs. This is a 2D translation
scheme similar to the CPU where a DMA goes through a first stage to
determine an intermediate address which is then translated trough a
second stage to a physical address.
Like for CPU translation the first stage table would exist in VM
controlled memory and the second stage is in the kernel and matches
the VM's guest to physical map.
As every IOMMU has a unique set of parameter to describe the S1 IO
page table and its associated parameters the userspace IOMMU driver
has to marshal the information into the correct format.
This is 1/3 of the feature, it allows creating the nested
translation and binding it to VFIO devices, however the API to
support IOTLB and ATC invalidation of the stage 1 io page table,
and forwarding of IO faults are still in progress.
The series includes AMD and Intel support for dirty tracking. Intel
support for nested translation.
Along the way are a number of internal items:
- New iommu core items: ops->domain_alloc_user(),
ops->set_dirty_tracking, ops->read_and_clear_dirty(),
IOMMU_DOMAIN_NESTED, and iommu_copy_struct_from_user
- UAF fix in iopt_area_split()
- Spelling fixes and some test suite improvement"
* tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd: (52 commits)
iommufd: Organize the mock domain alloc functions closer to Joerg's tree
iommufd/selftest: Fix page-size check in iommufd_test_dirty()
iommufd: Add iopt_area_alloc()
iommufd: Fix missing update of domains_itree after splitting iopt_area
iommu/vt-d: Disallow read-only mappings to nest parent domain
iommu/vt-d: Add nested domain allocation
iommu/vt-d: Set the nested domain to a device
iommu/vt-d: Make domain attach helpers to be extern
iommu/vt-d: Add helper to setup pasid nested translation
iommu/vt-d: Add helper for nested domain allocation
iommu/vt-d: Extend dmar_domain to support nested domain
iommufd: Add data structure for Intel VT-d stage-1 domain allocation
iommu/vt-d: Enhance capability check for nested parent domain allocation
iommufd/selftest: Add coverage for IOMMU_HWPT_ALLOC with nested HWPTs
iommufd/selftest: Add nested domain allocation for mock domain
iommu: Add iommu_copy_struct_from_user helper
iommufd: Add a nested HW pagetable object
iommu: Pass in parent domain with user_data to domain_alloc_user op
iommufd: Share iommufd_hwpt_alloc with IOMMUFD_OBJ_HWPT_NESTED
iommufd: Derive iommufd_hwpt_paging from iommufd_hw_pagetable
...
Diffstat (limited to 'include/linux')
| -rw-r--r-- | include/linux/io-pgtable.h | 4 | ||||
| -rw-r--r-- | include/linux/iommu.h | 146 | ||||
| -rw-r--r-- | include/linux/iova_bitmap.h | 26 |
3 files changed, 175 insertions, 1 deletions
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 1b7a44b35616..25142a0e2fc2 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -166,6 +166,10 @@ struct io_pgtable_ops { struct iommu_iotlb_gather *gather); phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops, unsigned long iova); + int (*read_and_clear_dirty)(struct io_pgtable_ops *ops, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty); }; /** diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c50a769d569a..8fb1b41b4d15 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -13,6 +13,7 @@ #include <linux/errno.h> #include <linux/err.h> #include <linux/of.h> +#include <linux/iova_bitmap.h> #include <uapi/linux/iommu.h> #define IOMMU_READ (1 << 0) @@ -37,6 +38,7 @@ struct bus_type; struct device; struct iommu_domain; struct iommu_domain_ops; +struct iommu_dirty_ops; struct notifier_block; struct iommu_sva; struct iommu_fault_event; @@ -65,6 +67,9 @@ struct iommu_domain_geometry { #define __IOMMU_DOMAIN_SVA (1U << 4) /* Shared process address space */ +#define __IOMMU_DOMAIN_NESTED (1U << 6) /* User-managed address space nested + on a stage-2 translation */ + #define IOMMU_DOMAIN_ALLOC_FLAGS ~__IOMMU_DOMAIN_DMA_FQ /* * This are the possible domain-types @@ -91,10 +96,13 @@ struct iommu_domain_geometry { __IOMMU_DOMAIN_DMA_API | \ __IOMMU_DOMAIN_DMA_FQ) #define IOMMU_DOMAIN_SVA (__IOMMU_DOMAIN_SVA) +#define IOMMU_DOMAIN_NESTED (__IOMMU_DOMAIN_NESTED) struct iommu_domain { unsigned type; const struct iommu_domain_ops *ops; + const struct iommu_dirty_ops *dirty_ops; + unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; @@ -133,6 +141,7 @@ enum iommu_cap { * usefully support the non-strict DMA flush queue. */ IOMMU_CAP_DEFERRED_FLUSH, + IOMMU_CAP_DIRTY_TRACKING, /* IOMMU supports dirty tracking */ }; /* These are the possible reserved region types */ @@ -228,13 +237,109 @@ struct iommu_iotlb_gather { }; /** + * struct iommu_dirty_bitmap - Dirty IOVA bitmap state + * @bitmap: IOVA bitmap + * @gather: Range information for a pending IOTLB flush + */ +struct iommu_dirty_bitmap { + struct iova_bitmap *bitmap; + struct iommu_iotlb_gather *gather; +}; + +/* Read but do not clear any dirty bits */ +#define IOMMU_DIRTY_NO_CLEAR (1 << 0) + +/** + * struct iommu_dirty_ops - domain specific dirty tracking operations + * @set_dirty_tracking: Enable or Disable dirty tracking on the iommu domain + * @read_and_clear_dirty: Walk IOMMU page tables for dirtied PTEs marshalled + * into a bitmap, with a bit represented as a page. + * Reads the dirty PTE bits and clears it from IO + * pagetables. + */ +struct iommu_dirty_ops { + int (*set_dirty_tracking)(struct iommu_domain *domain, bool enabled); + int (*read_and_clear_dirty)(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty); +}; + +/** + * struct iommu_user_data - iommu driver specific user space data info + * @type: The data type of the user buffer + * @uptr: Pointer to the user buffer for copy_from_user() + * @len: The length of the user buffer in bytes + * + * A user space data is an uAPI that is defined in include/uapi/linux/iommufd.h + * @type, @uptr and @len should be just copied from an iommufd core uAPI struct. + */ +struct iommu_user_data { + unsigned int type; + void __user *uptr; + size_t len; +}; + +/** + * __iommu_copy_struct_from_user - Copy iommu driver specific user space data + * @dst_data: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @src_data: Pointer to a struct iommu_user_data for user space data info + * @data_type: The data type of the @dst_data. Must match with @src_data.type + * @data_len: Length of current user data structure, i.e. sizeof(struct _dst) + * @min_len: Initial length of user data structure for backward compatibility. + * This should be offsetofend using the last member in the user data + * struct that was initially added to include/uapi/linux/iommufd.h + */ +static inline int __iommu_copy_struct_from_user( + void *dst_data, const struct iommu_user_data *src_data, + unsigned int data_type, size_t data_len, size_t min_len) +{ + if (src_data->type != data_type) + return -EINVAL; + if (WARN_ON(!dst_data || !src_data)) + return -EINVAL; + if (src_data->len < min_len || data_len < src_data->len) + return -EINVAL; + return copy_struct_from_user(dst_data, data_len, src_data->uptr, + src_data->len); +} + +/** + * iommu_copy_struct_from_user - Copy iommu driver specific user space data + * @kdst: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @user_data: Pointer to a struct iommu_user_data for user space data info + * @data_type: The data type of the @kdst. Must match with @user_data->type + * @min_last: The last memember of the data structure @kdst points in the + * initial version. + * Return 0 for success, otherwise -error. + */ +#define iommu_copy_struct_from_user(kdst, user_data, data_type, min_last) \ + __iommu_copy_struct_from_user(kdst, user_data, data_type, \ + sizeof(*kdst), \ + offsetofend(typeof(*kdst), min_last)) + +/** * struct iommu_ops - iommu ops and capabilities * @capable: check capability * @hw_info: report iommu hardware information. The data buffer returned by this * op is allocated in the iommu driver and freed by the caller after * use. The information type is one of enum iommu_hw_info_type defined * in include/uapi/linux/iommufd.h. - * @domain_alloc: allocate iommu domain + * @domain_alloc: allocate and return an iommu domain if success. Otherwise + * NULL is returned. The domain is not fully initialized until + * the caller iommu_domain_alloc() returns. + * @domain_alloc_user: Allocate an iommu domain corresponding to the input + * parameters as defined in include/uapi/linux/iommufd.h. + * Unlike @domain_alloc, it is called only by IOMMUFD and + * must fully initialize the new domain before return. + * Upon success, if the @user_data is valid and the @parent + * points to a kernel-managed domain, the new domain must be + * IOMMU_DOMAIN_NESTED type; otherwise, the @parent must be + * NULL while the @user_data can be optionally provided, the + * new domain must support __IOMMU_DOMAIN_PAGING. + * Upon failure, ERR_PTR must be returned. * @probe_device: Add device to iommu driver handling * @release_device: Remove device from iommu driver handling * @probe_finalize: Do final setup work after the device is added to an IOMMU @@ -267,6 +372,9 @@ struct iommu_ops { /* Domain allocation and freeing by the iommu driver */ struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type); + struct iommu_domain *(*domain_alloc_user)( + struct device *dev, u32 flags, struct iommu_domain *parent, + const struct iommu_user_data *user_data); struct iommu_device *(*probe_device)(struct device *dev); void (*release_device)(struct device *dev); @@ -632,6 +740,28 @@ static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather) return gather && gather->queued; } +static inline void iommu_dirty_bitmap_init(struct iommu_dirty_bitmap *dirty, + struct iova_bitmap *bitmap, + struct iommu_iotlb_gather *gather) +{ + if (gather) + iommu_iotlb_gather_init(gather); + + dirty->bitmap = bitmap; + dirty->gather = gather; +} + +static inline void iommu_dirty_bitmap_record(struct iommu_dirty_bitmap *dirty, + unsigned long iova, + unsigned long length) +{ + if (dirty->bitmap) + iova_bitmap_set(dirty->bitmap, iova, length); + + if (dirty->gather) + iommu_iotlb_gather_add_range(dirty->gather, iova, length); +} + /* PCI device grouping function */ extern struct iommu_group *pci_device_group(struct device *dev); /* Generic device grouping function */ @@ -737,6 +867,8 @@ struct iommu_fwspec {}; struct iommu_device {}; struct iommu_fault_param {}; struct iommu_iotlb_gather {}; +struct iommu_dirty_bitmap {}; +struct iommu_dirty_ops {}; static inline bool iommu_present(const struct bus_type *bus) { @@ -969,6 +1101,18 @@ static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather) return false; } +static inline void iommu_dirty_bitmap_init(struct iommu_dirty_bitmap *dirty, + struct iova_bitmap *bitmap, + struct iommu_iotlb_gather *gather) +{ +} + +static inline void iommu_dirty_bitmap_record(struct iommu_dirty_bitmap *dirty, + unsigned long iova, + unsigned long length) +{ +} + static inline void iommu_device_unregister(struct iommu_device *iommu) { } diff --git a/include/linux/iova_bitmap.h b/include/linux/iova_bitmap.h index c006cf0a25f3..1c338f5e5b7a 100644 --- a/include/linux/iova_bitmap.h +++ b/include/linux/iova_bitmap.h @@ -7,6 +7,7 @@ #define _IOVA_BITMAP_H_ #include <linux/types.h> +#include <linux/errno.h> struct iova_bitmap; @@ -14,6 +15,7 @@ typedef int (*iova_bitmap_fn_t)(struct iova_bitmap *bitmap, unsigned long iova, size_t length, void *opaque); +#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length, unsigned long page_size, u64 __user *data); @@ -22,5 +24,29 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, iova_bitmap_fn_t fn); void iova_bitmap_set(struct iova_bitmap *bitmap, unsigned long iova, size_t length); +#else +static inline struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, + size_t length, + unsigned long page_size, + u64 __user *data) +{ + return NULL; +} + +static inline void iova_bitmap_free(struct iova_bitmap *bitmap) +{ +} + +static inline int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, + iova_bitmap_fn_t fn) +{ + return -EOPNOTSUPP; +} + +static inline void iova_bitmap_set(struct iova_bitmap *bitmap, + unsigned long iova, size_t length) +{ +} +#endif #endif |
