Diffstat (limited to 'drivers/iommu/iommufd')
-rw-r--r-- | drivers/iommu/iommufd/Kconfig           |   4
-rw-r--r-- | drivers/iommu/iommufd/Makefile          |   7
-rw-r--r-- | drivers/iommu/iommufd/device.c          |  95
-rw-r--r-- | drivers/iommu/iommufd/driver.c          |  53
-rw-r--r-- | drivers/iommu/iommufd/fault.c           | 462
-rw-r--r-- | drivers/iommu/iommufd/hw_pagetable.c    | 178
-rw-r--r-- | drivers/iommu/iommufd/io_pagetable.c    | 121
-rw-r--r-- | drivers/iommu/iommufd/io_pagetable.h    |  28
-rw-r--r-- | drivers/iommu/iommufd/ioas.c            | 269
-rw-r--r-- | drivers/iommu/iommufd/iommufd_private.h | 189
-rw-r--r-- | drivers/iommu/iommufd/iommufd_test.h    |  42
-rw-r--r-- | drivers/iommu/iommufd/iova_bitmap.c     | 136
-rw-r--r-- | drivers/iommu/iommufd/main.c            | 103
-rw-r--r-- | drivers/iommu/iommufd/pages.c           | 336
-rw-r--r-- | drivers/iommu/iommufd/selftest.c        | 476
-rw-r--r-- | drivers/iommu/iommufd/vfio_compat.c     |  13
-rw-r--r-- | drivers/iommu/iommufd/viommu.c          | 157
17 files changed, 2203 insertions, 466 deletions
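The largest addition is the I/O page fault queue in fault.c: userspace gets a file descriptor (iommu_fault_alloc::out_fault_fd), read()s struct iommu_hwpt_pgfault records from it, write()s struct iommu_hwpt_page_response records back, and can poll() for pending faults. The sketch below illustrates that flow from userspace. It is not part of this diff: the drain_faults() helper and the one-record-per-read loop are illustrative only, and the struct and response-code names are assumed to come from the uAPI header this series updates outside the directory shown here.

/*
 * Userspace sketch (not part of this commit): drain faults from the fd
 * returned in iommu_fault_alloc::out_fault_fd and complete each one.
 * Error handling and the fault-object allocation step are omitted; the
 * read/write/poll semantics follow iommufd_fault_fops_* in fault.c below.
 */
#include <poll.h>
#include <unistd.h>
#include <linux/iommufd.h>

static void drain_faults(int fault_fd)
{
	struct pollfd pfd = { .fd = fault_fd, .events = POLLIN };
	struct iommu_hwpt_pgfault ev;

	while (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
		/* reads must be a multiple of sizeof(ev); one record per read here */
		if (read(fault_fd, &ev, sizeof(ev)) != sizeof(ev))
			break;

		/* ... resolve the fault for ev.dev_id / ev.pasid / ev.addr ... */

		struct iommu_hwpt_page_response resp = {
			.cookie = ev.cookie,	/* cookie handed out by the kernel */
			.code = IOMMUFD_PAGE_RESP_SUCCESS,
		};
		if (write(fault_fd, &resp, sizeof(resp)) != sizeof(resp))
			break;
	}
}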
diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig index 76656fe0470d..0a07f9449fd9 100644 --- a/drivers/iommu/iommufd/Kconfig +++ b/drivers/iommu/iommufd/Kconfig @@ -1,4 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only +config IOMMUFD_DRIVER_CORE + tristate + default (IOMMUFD_DRIVER || IOMMUFD) if IOMMUFD!=n + config IOMMUFD tristate "IOMMU Userspace API" select INTERVAL_TREE diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index 34b446146961..cb784da6cddc 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -1,14 +1,19 @@ # SPDX-License-Identifier: GPL-2.0-only iommufd-y := \ device.o \ + fault.o \ hw_pagetable.o \ io_pagetable.o \ ioas.o \ main.o \ pages.o \ - vfio_compat.o + vfio_compat.o \ + viommu.o iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o obj-$(CONFIG_IOMMUFD) += iommufd.o obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o + +iommufd_driver-y := driver.o +obj-$(CONFIG_IOMMUFD_DRIVER_CORE) += iommufd_driver.o diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 873630c111c1..dfd0898fb6c1 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -1,12 +1,12 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ +#include <linux/iommu.h> #include <linux/iommufd.h> #include <linux/slab.h> -#include <linux/iommu.h> #include <uapi/linux/iommufd.h> -#include "../iommu-priv.h" +#include "../iommu-priv.h" #include "io_pagetable.h" #include "iommufd_private.h" @@ -215,6 +215,7 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, refcount_inc(&idev->obj.users); /* igroup refcount moves into iommufd_device */ idev->igroup = igroup; + mutex_init(&idev->iopf_lock); /* * If the caller fails after this success it must call @@ -232,7 +233,7 @@ out_group_put: iommufd_put_group(igroup); return ERR_PTR(rc); } -EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, "IOMMUFD"); /** * iommufd_ctx_has_group - True if any device within the group is bound @@ -263,7 +264,7 @@ bool iommufd_ctx_has_group(struct iommufd_ctx *ictx, struct iommu_group *group) xa_unlock(&ictx->objects); return false; } -EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, "IOMMUFD"); /** * iommufd_device_unbind - Undo iommufd_device_bind() @@ -278,19 +279,19 @@ void iommufd_device_unbind(struct iommufd_device *idev) { iommufd_object_destroy_user(idev->ictx, &idev->obj); } -EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, "IOMMUFD"); struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev) { return idev->ictx; } -EXPORT_SYMBOL_NS_GPL(iommufd_device_to_ictx, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_device_to_ictx, "IOMMUFD"); u32 iommufd_device_to_id(struct iommufd_device *idev) { return idev->obj.id; } -EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, "IOMMUFD"); static int iommufd_group_setup_msi(struct iommufd_group *igroup, struct iommufd_hwpt_paging *hwpt_paging) @@ -326,8 +327,9 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup, return 0; } -static int iommufd_hwpt_paging_attach(struct iommufd_hwpt_paging *hwpt_paging, - struct iommufd_device *idev) +static int +iommufd_device_attach_reserved_iova(struct iommufd_device *idev, + struct iommufd_hwpt_paging *hwpt_paging) { int rc; @@ -353,6 +355,7 @@ static int 
iommufd_hwpt_paging_attach(struct iommufd_hwpt_paging *hwpt_paging, int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev) { + struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt); int rc; mutex_lock(&idev->igroup->lock); @@ -362,8 +365,8 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, goto err_unlock; } - if (hwpt_is_paging(hwpt)) { - rc = iommufd_hwpt_paging_attach(to_hwpt_paging(hwpt), idev); + if (hwpt_paging) { + rc = iommufd_device_attach_reserved_iova(idev, hwpt_paging); if (rc) goto err_unlock; } @@ -376,7 +379,7 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, * attachment. */ if (list_empty(&idev->igroup->device_list)) { - rc = iommu_attach_group(hwpt->domain, idev->igroup->group); + rc = iommufd_hwpt_attach_device(hwpt, idev); if (rc) goto err_unresv; idev->igroup->hwpt = hwpt; @@ -386,9 +389,8 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, mutex_unlock(&idev->igroup->lock); return 0; err_unresv: - if (hwpt_is_paging(hwpt)) - iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt, - idev->dev); + if (hwpt_paging) + iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev); err_unlock: mutex_unlock(&idev->igroup->lock); return rc; @@ -398,16 +400,16 @@ struct iommufd_hw_pagetable * iommufd_hw_pagetable_detach(struct iommufd_device *idev) { struct iommufd_hw_pagetable *hwpt = idev->igroup->hwpt; + struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt); mutex_lock(&idev->igroup->lock); list_del(&idev->group_item); if (list_empty(&idev->igroup->device_list)) { - iommu_detach_group(hwpt->domain, idev->igroup->group); + iommufd_hwpt_detach_device(hwpt, idev); idev->igroup->hwpt = NULL; } - if (hwpt_is_paging(hwpt)) - iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt, - idev->dev); + if (hwpt_paging) + iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev); mutex_unlock(&idev->igroup->lock); /* Caller must destroy hwpt */ @@ -439,17 +441,17 @@ iommufd_group_remove_reserved_iova(struct iommufd_group *igroup, } static int -iommufd_group_do_replace_paging(struct iommufd_group *igroup, - struct iommufd_hwpt_paging *hwpt_paging) +iommufd_group_do_replace_reserved_iova(struct iommufd_group *igroup, + struct iommufd_hwpt_paging *hwpt_paging) { - struct iommufd_hw_pagetable *old_hwpt = igroup->hwpt; + struct iommufd_hwpt_paging *old_hwpt_paging; struct iommufd_device *cur; int rc; lockdep_assert_held(&igroup->lock); - if (!hwpt_is_paging(old_hwpt) || - hwpt_paging->ioas != to_hwpt_paging(old_hwpt)->ioas) { + old_hwpt_paging = find_hwpt_paging(igroup->hwpt); + if (!old_hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas) { list_for_each_entry(cur, &igroup->device_list, group_item) { rc = iopt_table_enforce_dev_resv_regions( &hwpt_paging->ioas->iopt, cur->dev, NULL); @@ -472,6 +474,8 @@ static struct iommufd_hw_pagetable * iommufd_device_do_replace(struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt) { + struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt); + struct iommufd_hwpt_paging *old_hwpt_paging; struct iommufd_group *igroup = idev->igroup; struct iommufd_hw_pagetable *old_hwpt; unsigned int num_devices; @@ -490,22 +494,20 @@ iommufd_device_do_replace(struct iommufd_device *idev, } old_hwpt = igroup->hwpt; - if (hwpt_is_paging(hwpt)) { - rc = iommufd_group_do_replace_paging(igroup, - to_hwpt_paging(hwpt)); + if (hwpt_paging) { + rc = iommufd_group_do_replace_reserved_iova(igroup, hwpt_paging); if (rc) goto 
err_unlock; } - rc = iommu_group_replace_domain(igroup->group, hwpt->domain); + rc = iommufd_hwpt_replace_device(idev, hwpt, old_hwpt); if (rc) goto err_unresv; - if (hwpt_is_paging(old_hwpt) && - (!hwpt_is_paging(hwpt) || - to_hwpt_paging(hwpt)->ioas != to_hwpt_paging(old_hwpt)->ioas)) - iommufd_group_remove_reserved_iova(igroup, - to_hwpt_paging(old_hwpt)); + old_hwpt_paging = find_hwpt_paging(old_hwpt); + if (old_hwpt_paging && + (!hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas)) + iommufd_group_remove_reserved_iova(igroup, old_hwpt_paging); igroup->hwpt = hwpt; @@ -523,9 +525,8 @@ iommufd_device_do_replace(struct iommufd_device *idev, /* Caller must destroy old_hwpt */ return old_hwpt; err_unresv: - if (hwpt_is_paging(hwpt)) - iommufd_group_remove_reserved_iova(igroup, - to_hwpt_paging(old_hwpt)); + if (hwpt_paging) + iommufd_group_remove_reserved_iova(igroup, hwpt_paging); err_unlock: mutex_unlock(&idev->igroup->lock); return ERR_PTR(rc); @@ -691,7 +692,7 @@ int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) refcount_inc(&idev->obj.users); return 0; } -EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, "IOMMUFD"); /** * iommufd_device_replace - Change the device's iommu_domain @@ -713,7 +714,7 @@ int iommufd_device_replace(struct iommufd_device *idev, u32 *pt_id) return iommufd_device_change_pt(idev, pt_id, &iommufd_device_do_replace); } -EXPORT_SYMBOL_NS_GPL(iommufd_device_replace, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_device_replace, "IOMMUFD"); /** * iommufd_device_detach - Disconnect a device to an iommu_domain @@ -730,7 +731,7 @@ void iommufd_device_detach(struct iommufd_device *idev) iommufd_hw_pagetable_put(idev->ictx, hwpt); refcount_dec(&idev->obj.users); } -EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, "IOMMUFD"); /* * On success, it will refcount_inc() at a valid new_ioas and refcount_dec() at @@ -852,7 +853,7 @@ iommufd_access_create(struct iommufd_ctx *ictx, mutex_init(&access->ioas_lock); return access; } -EXPORT_SYMBOL_NS_GPL(iommufd_access_create, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_access_create, "IOMMUFD"); /** * iommufd_access_destroy - Destroy an iommufd_access @@ -864,7 +865,7 @@ void iommufd_access_destroy(struct iommufd_access *access) { iommufd_object_destroy_user(access->ictx, &access->obj); } -EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, "IOMMUFD"); void iommufd_access_detach(struct iommufd_access *access) { @@ -876,7 +877,7 @@ void iommufd_access_detach(struct iommufd_access *access) WARN_ON(iommufd_access_change_ioas(access, NULL)); mutex_unlock(&access->ioas_lock); } -EXPORT_SYMBOL_NS_GPL(iommufd_access_detach, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_access_detach, "IOMMUFD"); int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id) { @@ -892,7 +893,7 @@ int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id) mutex_unlock(&access->ioas_lock); return rc; } -EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, "IOMMUFD"); int iommufd_access_replace(struct iommufd_access *access, u32 ioas_id) { @@ -907,7 +908,7 @@ int iommufd_access_replace(struct iommufd_access *access, u32 ioas_id) mutex_unlock(&access->ioas_lock); return rc; } -EXPORT_SYMBOL_NS_GPL(iommufd_access_replace, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_access_replace, "IOMMUFD"); /** * iommufd_access_notify_unmap - Notify users of an iopt 
to stop using it @@ -990,7 +991,7 @@ void iommufd_access_unpin_pages(struct iommufd_access *access, up_read(&iopt->iova_rwsem); mutex_unlock(&access->ioas_lock); } -EXPORT_SYMBOL_NS_GPL(iommufd_access_unpin_pages, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_access_unpin_pages, "IOMMUFD"); static bool iopt_area_contig_is_aligned(struct iopt_area_contig_iter *iter) { @@ -1105,7 +1106,7 @@ err_remove: mutex_unlock(&access->ioas_lock); return rc; } -EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, "IOMMUFD"); /** * iommufd_access_rw - Read or write data under the iova @@ -1169,7 +1170,7 @@ err_out: mutex_unlock(&access->ioas_lock); return rc; } -EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, "IOMMUFD"); int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) { diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c new file mode 100644 index 000000000000..2d98b04ff1cb --- /dev/null +++ b/drivers/iommu/iommufd/driver.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ +#include "iommufd_private.h" + +struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, + size_t size, + enum iommufd_object_type type) +{ + struct iommufd_object *obj; + int rc; + + obj = kzalloc(size, GFP_KERNEL_ACCOUNT); + if (!obj) + return ERR_PTR(-ENOMEM); + obj->type = type; + /* Starts out bias'd by 1 until it is removed from the xarray */ + refcount_set(&obj->shortterm_users, 1); + refcount_set(&obj->users, 1); + + /* + * Reserve an ID in the xarray but do not publish the pointer yet since + * the caller hasn't initialized it yet. Once the pointer is published + * in the xarray and visible to other threads we can't reliably destroy + * it anymore, so the caller must complete all errorable operations + * before calling iommufd_object_finalize(). + */ + rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, xa_limit_31b, + GFP_KERNEL_ACCOUNT); + if (rc) + goto out_free; + return obj; +out_free: + kfree(obj); + return ERR_PTR(rc); +} +EXPORT_SYMBOL_NS_GPL(_iommufd_object_alloc, "IOMMUFD"); + +/* Caller should xa_lock(&viommu->vdevs) to protect the return value */ +struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, + unsigned long vdev_id) +{ + struct iommufd_vdevice *vdev; + + lockdep_assert_held(&viommu->vdevs.xa_lock); + + vdev = xa_load(&viommu->vdevs, vdev_id); + return vdev ? 
vdev->dev : NULL; +} +EXPORT_SYMBOL_NS_GPL(iommufd_viommu_find_dev, "IOMMUFD"); + +MODULE_DESCRIPTION("iommufd code shared with builtin modules"); +MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c new file mode 100644 index 000000000000..d9a937450e55 --- /dev/null +++ b/drivers/iommu/iommufd/fault.c @@ -0,0 +1,462 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2024 Intel Corporation + */ +#define pr_fmt(fmt) "iommufd: " fmt + +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/iommufd.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/pci.h> +#include <linux/pci-ats.h> +#include <linux/poll.h> +#include <uapi/linux/iommufd.h> + +#include "../iommu-priv.h" +#include "iommufd_private.h" + +static int iommufd_fault_iopf_enable(struct iommufd_device *idev) +{ + struct device *dev = idev->dev; + int ret; + + /* + * Once we turn on PCI/PRI support for VF, the response failure code + * should not be forwarded to the hardware due to PRI being a shared + * resource between PF and VFs. There is no coordination for this + * shared capability. This waits for a vPRI reset to recover. + */ + if (dev_is_pci(dev)) { + struct pci_dev *pdev = to_pci_dev(dev); + + if (pdev->is_virtfn && pci_pri_supported(pdev)) + return -EINVAL; + } + + mutex_lock(&idev->iopf_lock); + /* Device iopf has already been on. */ + if (++idev->iopf_enabled > 1) { + mutex_unlock(&idev->iopf_lock); + return 0; + } + + ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_IOPF); + if (ret) + --idev->iopf_enabled; + mutex_unlock(&idev->iopf_lock); + + return ret; +} + +static void iommufd_fault_iopf_disable(struct iommufd_device *idev) +{ + mutex_lock(&idev->iopf_lock); + if (!WARN_ON(idev->iopf_enabled == 0)) { + if (--idev->iopf_enabled == 0) + iommu_dev_disable_feature(idev->dev, IOMMU_DEV_FEAT_IOPF); + } + mutex_unlock(&idev->iopf_lock); +} + +static int __fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) +{ + struct iommufd_attach_handle *handle; + int ret; + + handle = kzalloc(sizeof(*handle), GFP_KERNEL); + if (!handle) + return -ENOMEM; + + handle->idev = idev; + ret = iommu_attach_group_handle(hwpt->domain, idev->igroup->group, + &handle->handle); + if (ret) + kfree(handle); + + return ret; +} + +int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) +{ + int ret; + + if (!hwpt->fault) + return -EINVAL; + + ret = iommufd_fault_iopf_enable(idev); + if (ret) + return ret; + + ret = __fault_domain_attach_dev(hwpt, idev); + if (ret) + iommufd_fault_iopf_disable(idev); + + return ret; +} + +static void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, + struct iommufd_attach_handle *handle) +{ + struct iommufd_fault *fault = hwpt->fault; + struct iopf_group *group, *next; + struct list_head free_list; + unsigned long index; + + if (!fault) + return; + INIT_LIST_HEAD(&free_list); + + mutex_lock(&fault->mutex); + spin_lock(&fault->lock); + list_for_each_entry_safe(group, next, &fault->deliver, node) { + if (group->attach_handle != &handle->handle) + continue; + list_move(&group->node, &free_list); + } + spin_unlock(&fault->lock); + + list_for_each_entry_safe(group, next, &free_list, node) { + list_del(&group->node); + iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); + iopf_free_group(group); + } + + xa_for_each(&fault->response, index, group) { + if (group->attach_handle != &handle->handle) + continue; + 
xa_erase(&fault->response, index); + iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); + iopf_free_group(group); + } + mutex_unlock(&fault->mutex); +} + +static struct iommufd_attach_handle * +iommufd_device_get_attach_handle(struct iommufd_device *idev) +{ + struct iommu_attach_handle *handle; + + handle = iommu_attach_handle_get(idev->igroup->group, IOMMU_NO_PASID, 0); + if (IS_ERR(handle)) + return NULL; + + return to_iommufd_handle(handle); +} + +void iommufd_fault_domain_detach_dev(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) +{ + struct iommufd_attach_handle *handle; + + handle = iommufd_device_get_attach_handle(idev); + iommu_detach_group_handle(hwpt->domain, idev->igroup->group); + iommufd_auto_response_faults(hwpt, handle); + iommufd_fault_iopf_disable(idev); + kfree(handle); +} + +static int __fault_domain_replace_dev(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt, + struct iommufd_hw_pagetable *old) +{ + struct iommufd_attach_handle *handle, *curr = NULL; + int ret; + + if (old->fault) + curr = iommufd_device_get_attach_handle(idev); + + if (hwpt->fault) { + handle = kzalloc(sizeof(*handle), GFP_KERNEL); + if (!handle) + return -ENOMEM; + + handle->idev = idev; + ret = iommu_replace_group_handle(idev->igroup->group, + hwpt->domain, &handle->handle); + } else { + ret = iommu_replace_group_handle(idev->igroup->group, + hwpt->domain, NULL); + } + + if (!ret && curr) { + iommufd_auto_response_faults(old, curr); + kfree(curr); + } + + return ret; +} + +int iommufd_fault_domain_replace_dev(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt, + struct iommufd_hw_pagetable *old) +{ + bool iopf_off = !hwpt->fault && old->fault; + bool iopf_on = hwpt->fault && !old->fault; + int ret; + + if (iopf_on) { + ret = iommufd_fault_iopf_enable(idev); + if (ret) + return ret; + } + + ret = __fault_domain_replace_dev(idev, hwpt, old); + if (ret) { + if (iopf_on) + iommufd_fault_iopf_disable(idev); + return ret; + } + + if (iopf_off) + iommufd_fault_iopf_disable(idev); + + return 0; +} + +void iommufd_fault_destroy(struct iommufd_object *obj) +{ + struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj); + struct iopf_group *group, *next; + unsigned long index; + + /* + * The iommufd object's reference count is zero at this point. + * We can be confident that no other threads are currently + * accessing this pointer. Therefore, acquiring the mutex here + * is unnecessary. 
+ */ + list_for_each_entry_safe(group, next, &fault->deliver, node) { + list_del(&group->node); + iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); + iopf_free_group(group); + } + xa_for_each(&fault->response, index, group) { + xa_erase(&fault->response, index); + iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); + iopf_free_group(group); + } + xa_destroy(&fault->response); + mutex_destroy(&fault->mutex); +} + +static void iommufd_compose_fault_message(struct iommu_fault *fault, + struct iommu_hwpt_pgfault *hwpt_fault, + struct iommufd_device *idev, + u32 cookie) +{ + hwpt_fault->flags = fault->prm.flags; + hwpt_fault->dev_id = idev->obj.id; + hwpt_fault->pasid = fault->prm.pasid; + hwpt_fault->grpid = fault->prm.grpid; + hwpt_fault->perm = fault->prm.perm; + hwpt_fault->addr = fault->prm.addr; + hwpt_fault->length = 0; + hwpt_fault->cookie = cookie; +} + +static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf, + size_t count, loff_t *ppos) +{ + size_t fault_size = sizeof(struct iommu_hwpt_pgfault); + struct iommufd_fault *fault = filep->private_data; + struct iommu_hwpt_pgfault data = {}; + struct iommufd_device *idev; + struct iopf_group *group; + struct iopf_fault *iopf; + size_t done = 0; + int rc = 0; + + if (*ppos || count % fault_size) + return -ESPIPE; + + mutex_lock(&fault->mutex); + while ((group = iommufd_fault_deliver_fetch(fault))) { + if (done >= count || + group->fault_count * fault_size > count - done) { + iommufd_fault_deliver_restore(fault, group); + break; + } + + rc = xa_alloc(&fault->response, &group->cookie, group, + xa_limit_32b, GFP_KERNEL); + if (rc) { + iommufd_fault_deliver_restore(fault, group); + break; + } + + idev = to_iommufd_handle(group->attach_handle)->idev; + list_for_each_entry(iopf, &group->faults, list) { + iommufd_compose_fault_message(&iopf->fault, + &data, idev, + group->cookie); + if (copy_to_user(buf + done, &data, fault_size)) { + xa_erase(&fault->response, group->cookie); + iommufd_fault_deliver_restore(fault, group); + rc = -EFAULT; + break; + } + done += fault_size; + } + } + mutex_unlock(&fault->mutex); + + return done == 0 ? rc : done; +} + +static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *buf, + size_t count, loff_t *ppos) +{ + size_t response_size = sizeof(struct iommu_hwpt_page_response); + struct iommufd_fault *fault = filep->private_data; + struct iommu_hwpt_page_response response; + struct iopf_group *group; + size_t done = 0; + int rc = 0; + + if (*ppos || count % response_size) + return -ESPIPE; + + mutex_lock(&fault->mutex); + while (count > done) { + rc = copy_from_user(&response, buf + done, response_size); + if (rc) + break; + + static_assert((int)IOMMUFD_PAGE_RESP_SUCCESS == + (int)IOMMU_PAGE_RESP_SUCCESS); + static_assert((int)IOMMUFD_PAGE_RESP_INVALID == + (int)IOMMU_PAGE_RESP_INVALID); + if (response.code != IOMMUFD_PAGE_RESP_SUCCESS && + response.code != IOMMUFD_PAGE_RESP_INVALID) { + rc = -EINVAL; + break; + } + + group = xa_erase(&fault->response, response.cookie); + if (!group) { + rc = -EINVAL; + break; + } + + iopf_group_response(group, response.code); + iopf_free_group(group); + done += response_size; + } + mutex_unlock(&fault->mutex); + + return done == 0 ? 
rc : done; +} + +static __poll_t iommufd_fault_fops_poll(struct file *filep, + struct poll_table_struct *wait) +{ + struct iommufd_fault *fault = filep->private_data; + __poll_t pollflags = EPOLLOUT; + + poll_wait(filep, &fault->wait_queue, wait); + spin_lock(&fault->lock); + if (!list_empty(&fault->deliver)) + pollflags |= EPOLLIN | EPOLLRDNORM; + spin_unlock(&fault->lock); + + return pollflags; +} + +static int iommufd_fault_fops_release(struct inode *inode, struct file *filep) +{ + struct iommufd_fault *fault = filep->private_data; + + refcount_dec(&fault->obj.users); + iommufd_ctx_put(fault->ictx); + return 0; +} + +static const struct file_operations iommufd_fault_fops = { + .owner = THIS_MODULE, + .open = nonseekable_open, + .read = iommufd_fault_fops_read, + .write = iommufd_fault_fops_write, + .poll = iommufd_fault_fops_poll, + .release = iommufd_fault_fops_release, +}; + +int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) +{ + struct iommu_fault_alloc *cmd = ucmd->cmd; + struct iommufd_fault *fault; + struct file *filep; + int fdno; + int rc; + + if (cmd->flags) + return -EOPNOTSUPP; + + fault = iommufd_object_alloc(ucmd->ictx, fault, IOMMUFD_OBJ_FAULT); + if (IS_ERR(fault)) + return PTR_ERR(fault); + + fault->ictx = ucmd->ictx; + INIT_LIST_HEAD(&fault->deliver); + xa_init_flags(&fault->response, XA_FLAGS_ALLOC1); + mutex_init(&fault->mutex); + spin_lock_init(&fault->lock); + init_waitqueue_head(&fault->wait_queue); + + filep = anon_inode_getfile("[iommufd-pgfault]", &iommufd_fault_fops, + fault, O_RDWR); + if (IS_ERR(filep)) { + rc = PTR_ERR(filep); + goto out_abort; + } + + refcount_inc(&fault->obj.users); + iommufd_ctx_get(fault->ictx); + fault->filep = filep; + + fdno = get_unused_fd_flags(O_CLOEXEC); + if (fdno < 0) { + rc = fdno; + goto out_fput; + } + + cmd->out_fault_id = fault->obj.id; + cmd->out_fault_fd = fdno; + + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_put_fdno; + iommufd_object_finalize(ucmd->ictx, &fault->obj); + + fd_install(fdno, fault->filep); + + return 0; +out_put_fdno: + put_unused_fd(fdno); +out_fput: + fput(filep); +out_abort: + iommufd_object_abort_and_destroy(ucmd->ictx, &fault->obj); + + return rc; +} + +int iommufd_fault_iopf_handler(struct iopf_group *group) +{ + struct iommufd_hw_pagetable *hwpt; + struct iommufd_fault *fault; + + hwpt = group->attach_handle->domain->fault_data; + fault = hwpt->fault; + + spin_lock(&fault->lock); + list_add_tail(&group->node, &fault->deliver); + spin_unlock(&fault->lock); + + wake_up_interruptible(&fault->wait_queue); + + return 0; +} diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 33d142f8057d..598be26a14e2 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -8,6 +8,15 @@ #include "../iommu-priv.h" #include "iommufd_private.h" +static void __iommufd_hwpt_destroy(struct iommufd_hw_pagetable *hwpt) +{ + if (hwpt->domain) + iommu_domain_free(hwpt->domain); + + if (hwpt->fault) + refcount_dec(&hwpt->fault->obj.users); +} + void iommufd_hwpt_paging_destroy(struct iommufd_object *obj) { struct iommufd_hwpt_paging *hwpt_paging = @@ -22,9 +31,7 @@ void iommufd_hwpt_paging_destroy(struct iommufd_object *obj) hwpt_paging->common.domain); } - if (hwpt_paging->common.domain) - iommu_domain_free(hwpt_paging->common.domain); - + __iommufd_hwpt_destroy(&hwpt_paging->common); refcount_dec(&hwpt_paging->ioas->obj.users); } @@ -49,10 +56,11 @@ void iommufd_hwpt_nested_destroy(struct iommufd_object *obj) struct 
iommufd_hwpt_nested *hwpt_nested = container_of(obj, struct iommufd_hwpt_nested, common.obj); - if (hwpt_nested->common.domain) - iommu_domain_free(hwpt_nested->common.domain); - - refcount_dec(&hwpt_nested->parent->common.obj.users); + __iommufd_hwpt_destroy(&hwpt_nested->common); + if (hwpt_nested->viommu) + refcount_dec(&hwpt_nested->viommu->obj.users); + else + refcount_dec(&hwpt_nested->parent->common.obj.users); } void iommufd_hwpt_nested_abort(struct iommufd_object *obj) @@ -102,7 +110,8 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, const struct iommu_user_data *user_data) { const u32 valid_flags = IOMMU_HWPT_ALLOC_NEST_PARENT | - IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_FAULT_ID_VALID; const struct iommu_ops *ops = dev_iommu_ops(idev->dev); struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_hw_pagetable *hwpt; @@ -110,10 +119,13 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, lockdep_assert_held(&ioas->mutex); - if ((flags || user_data) && !ops->domain_alloc_user) + if ((flags || user_data) && !ops->domain_alloc_paging_flags) return ERR_PTR(-EOPNOTSUPP); if (flags & ~valid_flags) return ERR_PTR(-EOPNOTSUPP); + if ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && + !device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING)) + return ERR_PTR(-EOPNOTSUPP); hwpt_paging = __iommufd_object_alloc( ictx, hwpt_paging, IOMMUFD_OBJ_HWPT_PAGING, common.obj); @@ -127,9 +139,9 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, hwpt_paging->ioas = ioas; hwpt_paging->nest_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; - if (ops->domain_alloc_user) { - hwpt->domain = ops->domain_alloc_user(idev->dev, flags, NULL, - user_data); + if (ops->domain_alloc_paging_flags) { + hwpt->domain = ops->domain_alloc_paging_flags(idev->dev, + flags & ~IOMMU_HWPT_FAULT_ID_VALID, user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; @@ -137,9 +149,10 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, } hwpt->domain->owner = ops; } else { - hwpt->domain = iommu_domain_alloc(idev->dev->bus); - if (!hwpt->domain) { - rc = -ENOMEM; + hwpt->domain = iommu_paging_domain_alloc(idev->dev); + if (IS_ERR(hwpt->domain)) { + rc = PTR_ERR(hwpt->domain); + hwpt->domain = NULL; goto out_abort; } } @@ -213,9 +226,11 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt; int rc; - if (flags || !user_data->len || !ops->domain_alloc_user) + if ((flags & ~IOMMU_HWPT_FAULT_ID_VALID) || + !user_data->len || !ops->domain_alloc_nested) return ERR_PTR(-EOPNOTSUPP); - if (parent->auto_domain || !parent->nest_parent) + if (parent->auto_domain || !parent->nest_parent || + parent->common.domain->owner != ops) return ERR_PTR(-EINVAL); hwpt_nested = __iommufd_object_alloc( @@ -227,8 +242,9 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, refcount_inc(&parent->common.obj.users); hwpt_nested->parent = parent; - hwpt->domain = ops->domain_alloc_user(idev->dev, flags, - parent->common.domain, user_data); + hwpt->domain = ops->domain_alloc_nested( + idev->dev, parent->common.domain, + flags & ~IOMMU_HWPT_FAULT_ID_VALID, user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; @@ -247,6 +263,62 @@ out_abort: return ERR_PTR(rc); } +/** + * iommufd_viommu_alloc_hwpt_nested() - Get a hwpt_nested for a vIOMMU + * @viommu: vIOMMU ojbect to associate the hwpt_nested/domain with + * 
@flags: Flags from userspace + * @user_data: user_data pointer. Must be valid + * + * Allocate a new IOMMU_DOMAIN_NESTED for a vIOMMU and return it as a NESTED + * hw_pagetable. + */ +static struct iommufd_hwpt_nested * +iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, + const struct iommu_user_data *user_data) +{ + struct iommufd_hwpt_nested *hwpt_nested; + struct iommufd_hw_pagetable *hwpt; + int rc; + + if (flags & ~IOMMU_HWPT_FAULT_ID_VALID) + return ERR_PTR(-EOPNOTSUPP); + if (!user_data->len) + return ERR_PTR(-EOPNOTSUPP); + if (!viommu->ops || !viommu->ops->alloc_domain_nested) + return ERR_PTR(-EOPNOTSUPP); + + hwpt_nested = __iommufd_object_alloc( + viommu->ictx, hwpt_nested, IOMMUFD_OBJ_HWPT_NESTED, common.obj); + if (IS_ERR(hwpt_nested)) + return ERR_CAST(hwpt_nested); + hwpt = &hwpt_nested->common; + + hwpt_nested->viommu = viommu; + refcount_inc(&viommu->obj.users); + hwpt_nested->parent = viommu->hwpt; + + hwpt->domain = + viommu->ops->alloc_domain_nested(viommu, + flags & ~IOMMU_HWPT_FAULT_ID_VALID, + user_data); + if (IS_ERR(hwpt->domain)) { + rc = PTR_ERR(hwpt->domain); + hwpt->domain = NULL; + goto out_abort; + } + hwpt->domain->owner = viommu->iommu_dev->ops; + + if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { + rc = -EINVAL; + goto out_abort; + } + return hwpt_nested; + +out_abort: + iommufd_object_abort_and_destroy(viommu->ictx, &hwpt->obj); + return ERR_PTR(rc); +} + int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) { struct iommu_hwpt_alloc *cmd = ucmd->cmd; @@ -303,11 +375,42 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) goto out_unlock; } hwpt = &hwpt_nested->common; + } else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) { + struct iommufd_hwpt_nested *hwpt_nested; + struct iommufd_viommu *viommu; + + viommu = container_of(pt_obj, struct iommufd_viommu, obj); + if (viommu->iommu_dev != __iommu_get_iommu_dev(idev->dev)) { + rc = -EINVAL; + goto out_unlock; + } + hwpt_nested = iommufd_viommu_alloc_hwpt_nested( + viommu, cmd->flags, &user_data); + if (IS_ERR(hwpt_nested)) { + rc = PTR_ERR(hwpt_nested); + goto out_unlock; + } + hwpt = &hwpt_nested->common; } else { rc = -EINVAL; goto out_put_pt; } + if (cmd->flags & IOMMU_HWPT_FAULT_ID_VALID) { + struct iommufd_fault *fault; + + fault = iommufd_get_fault(ucmd, cmd->fault_id); + if (IS_ERR(fault)) { + rc = PTR_ERR(fault); + goto out_hwpt; + } + hwpt->fault = fault; + hwpt->domain->iopf_handler = iommufd_fault_iopf_handler; + hwpt->domain->fault_data = hwpt; + refcount_inc(&fault->obj.users); + iommufd_put_object(ucmd->ictx, &fault->obj); + } + cmd->out_hwpt_id = hwpt->obj.id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); if (rc) @@ -384,7 +487,7 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd) .entry_len = cmd->entry_len, .entry_num = cmd->entry_num, }; - struct iommufd_hw_pagetable *hwpt; + struct iommufd_object *pt_obj; u32 done_num = 0; int rc; @@ -398,17 +501,40 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd) goto out; } - hwpt = iommufd_get_hwpt_nested(ucmd, cmd->hwpt_id); - if (IS_ERR(hwpt)) { - rc = PTR_ERR(hwpt); + pt_obj = iommufd_get_object(ucmd->ictx, cmd->hwpt_id, IOMMUFD_OBJ_ANY); + if (IS_ERR(pt_obj)) { + rc = PTR_ERR(pt_obj); goto out; } + if (pt_obj->type == IOMMUFD_OBJ_HWPT_NESTED) { + struct iommufd_hw_pagetable *hwpt = + container_of(pt_obj, struct iommufd_hw_pagetable, obj); + + if (!hwpt->domain->ops || + !hwpt->domain->ops->cache_invalidate_user) { + rc = -EOPNOTSUPP; + goto out_put_pt; + } + rc = 
hwpt->domain->ops->cache_invalidate_user(hwpt->domain, + &data_array); + } else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) { + struct iommufd_viommu *viommu = + container_of(pt_obj, struct iommufd_viommu, obj); + + if (!viommu->ops || !viommu->ops->cache_invalidate) { + rc = -EOPNOTSUPP; + goto out_put_pt; + } + rc = viommu->ops->cache_invalidate(viommu, &data_array); + } else { + rc = -EINVAL; + goto out_put_pt; + } - rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain, - &data_array); done_num = data_array.entry_num; - iommufd_put_object(ucmd->ictx, &hwpt->obj); +out_put_pt: + iommufd_put_object(ucmd->ictx, pt_obj); out: cmd->entry_num = done_num; if (iommufd_ucmd_respond(ucmd, sizeof(*cmd))) diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 05fd9d3abf1b..8a790e597e12 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -8,17 +8,17 @@ * The datastructure uses the iopt_pages to optimize the storage of the PFNs * between the domains and xarray. */ +#include <linux/err.h> +#include <linux/errno.h> +#include <linux/iommu.h> #include <linux/iommufd.h> #include <linux/lockdep.h> -#include <linux/iommu.h> #include <linux/sched/mm.h> -#include <linux/err.h> #include <linux/slab.h> -#include <linux/errno.h> #include <uapi/linux/iommufd.h> -#include "io_pagetable.h" #include "double_span.h" +#include "io_pagetable.h" struct iopt_pages_list { struct iopt_pages *pages; @@ -107,11 +107,12 @@ static bool __alloc_iova_check_used(struct interval_tree_span_iter *span, * Does not return a 0 IOVA even if it is valid. */ static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova, - unsigned long uptr, unsigned long length) + unsigned long addr, unsigned long length) { - unsigned long page_offset = uptr % PAGE_SIZE; + unsigned long page_offset = addr % PAGE_SIZE; struct interval_tree_double_span_iter used_span; struct interval_tree_span_iter allowed_span; + unsigned long max_alignment = PAGE_SIZE; unsigned long iova_alignment; lockdep_assert_held(&iopt->iova_rwsem); @@ -121,15 +122,22 @@ static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova, return -EOVERFLOW; /* - * Keep alignment present in the uptr when building the IOVA, this + * Keep alignment present in addr when building the IOVA, which * increases the chance we can map a THP. 
*/ - if (!uptr) + if (!addr) iova_alignment = roundup_pow_of_two(length); else iova_alignment = min_t(unsigned long, roundup_pow_of_two(length), - 1UL << __ffs64(uptr)); + 1UL << __ffs64(addr)); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + max_alignment = HPAGE_SIZE; +#endif + /* Protect against ALIGN() overflow */ + if (iova_alignment >= max_alignment) + iova_alignment = max_alignment; if (iova_alignment < iopt->iova_alignment) return -EINVAL; @@ -240,6 +248,7 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt, int iommu_prot, unsigned int flags) { struct iopt_pages_list *elm; + unsigned long start; unsigned long iova; int rc = 0; @@ -259,9 +268,15 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt, /* Use the first entry to guess the ideal IOVA alignment */ elm = list_first_entry(pages_list, struct iopt_pages_list, next); - rc = iopt_alloc_iova( - iopt, dst_iova, - (uintptr_t)elm->pages->uptr + elm->start_byte, length); + switch (elm->pages->type) { + case IOPT_ADDRESS_USER: + start = elm->start_byte + (uintptr_t)elm->pages->uptr; + break; + case IOPT_ADDRESS_FILE: + start = elm->start_byte + elm->pages->start; + break; + } + rc = iopt_alloc_iova(iopt, dst_iova, start, length); if (rc) goto out_unlock; if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && @@ -376,6 +391,34 @@ out_unlock_domains: return rc; } +static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt, + struct iopt_pages *pages, unsigned long *iova, + unsigned long length, unsigned long start_byte, + int iommu_prot, unsigned int flags) +{ + struct iopt_pages_list elm = {}; + LIST_HEAD(pages_list); + int rc; + + elm.pages = pages; + elm.start_byte = start_byte; + if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM && + elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER) + elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM; + elm.length = length; + list_add(&elm.next, &pages_list); + + rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags); + if (rc) { + if (elm.area) + iopt_abort_area(elm.area); + if (elm.pages) + iopt_put_pages(elm.pages); + return rc; + } + return 0; +} + /** * iopt_map_user_pages() - Map a user VA to an iova in the io page table * @ictx: iommufd_ctx the iopt is part of @@ -400,29 +443,41 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned long length, int iommu_prot, unsigned int flags) { - struct iopt_pages_list elm = {}; - LIST_HEAD(pages_list); - int rc; + struct iopt_pages *pages; - elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE); - if (IS_ERR(elm.pages)) - return PTR_ERR(elm.pages); - if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM && - elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER) - elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM; - elm.start_byte = uptr - elm.pages->uptr; - elm.length = length; - list_add(&elm.next, &pages_list); + pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE); + if (IS_ERR(pages)) + return PTR_ERR(pages); - rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags); - if (rc) { - if (elm.area) - iopt_abort_area(elm.area); - if (elm.pages) - iopt_put_pages(elm.pages); - return rc; - } - return 0; + return iopt_map_common(ictx, iopt, pages, iova, length, + uptr - pages->uptr, iommu_prot, flags); +} + +/** + * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file. 
+ * @ictx: iommufd_ctx the iopt is part of + * @iopt: io_pagetable to act on + * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains + * the chosen iova on output. Otherwise is the iova to map to on input + * @file: file to map + * @start: map file starting at this byte offset + * @length: Number of bytes to map + * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping + * @flags: IOPT_ALLOC_IOVA or zero + */ +int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, + unsigned long *iova, struct file *file, + unsigned long start, unsigned long length, + int iommu_prot, unsigned int flags) +{ + struct iopt_pages *pages; + + pages = iopt_alloc_file_pages(file, start, length, + iommu_prot & IOMMU_WRITE); + if (IS_ERR(pages)) + return PTR_ERR(pages); + return iopt_map_common(ictx, iopt, pages, iova, length, + start - pages->start, iommu_prot, flags); } struct iova_bitmap_fn_arg { diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index 0ec3509b7e33..10c928a9a463 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -6,8 +6,8 @@ #define __IO_PAGETABLE_H #include <linux/interval_tree.h> -#include <linux/mutex.h> #include <linux/kref.h> +#include <linux/mutex.h> #include <linux/xarray.h> #include "iommufd_private.h" @@ -173,6 +173,12 @@ enum { IOPT_PAGES_ACCOUNT_NONE = 0, IOPT_PAGES_ACCOUNT_USER = 1, IOPT_PAGES_ACCOUNT_MM = 2, + IOPT_PAGES_ACCOUNT_MODE_NUM = 3, +}; + +enum iopt_address_type { + IOPT_ADDRESS_USER = 0, + IOPT_ADDRESS_FILE = 1, }; /* @@ -195,7 +201,14 @@ struct iopt_pages { struct task_struct *source_task; struct mm_struct *source_mm; struct user_struct *source_user; - void __user *uptr; + enum iopt_address_type type; + union { + void __user *uptr; /* IOPT_ADDRESS_USER */ + struct { /* IOPT_ADDRESS_FILE */ + struct file *file; + unsigned long start; + }; + }; bool writable:1; u8 account_mode; @@ -206,8 +219,10 @@ struct iopt_pages { struct rb_root_cached domains_itree; }; -struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, - bool writable); +struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, + unsigned long length, bool writable); +struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start, + unsigned long length, bool writable); void iopt_release_pages(struct kref *kref); static inline void iopt_put_pages(struct iopt_pages *pages) { @@ -238,4 +253,9 @@ struct iopt_pages_access { unsigned int users; }; +struct pfn_reader_user; + +int iopt_pages_update_pinned(struct iopt_pages *pages, unsigned long npages, + bool inc, struct pfn_reader_user *user); + #endif diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index 742248276548..1542c5fd10a8 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -2,9 +2,10 @@ /* * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ +#include <linux/file.h> #include <linux/interval_tree.h> -#include <linux/iommufd.h> #include <linux/iommu.h> +#include <linux/iommufd.h> #include <uapi/linux/iommufd.h> #include "io_pagetable.h" @@ -51,7 +52,10 @@ int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd) rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); if (rc) goto out_table; + + down_read(&ucmd->ictx->ioas_creation_lock); iommufd_object_finalize(ucmd->ictx, &ioas->obj); + up_read(&ucmd->ictx->ioas_creation_lock); return 0; out_table: @@ -197,6 +201,52 @@ static int conv_iommu_prot(u32 map_flags) return iommu_prot; } 
+int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_map_file *cmd = ucmd->cmd; + unsigned long iova = cmd->iova; + struct iommufd_ioas *ioas; + unsigned int flags = 0; + struct file *file; + int rc; + + if (cmd->flags & + ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE)) + return -EOPNOTSUPP; + + if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) + return -EOVERFLOW; + + if (!(cmd->flags & + (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))) + return -EINVAL; + + ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) + flags = IOPT_ALLOC_IOVA; + + file = fget(cmd->fd); + if (!file) + return -EBADF; + + rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, file, + cmd->start, cmd->length, + conv_iommu_prot(cmd->flags), flags); + if (rc) + goto out_put; + + cmd->iova = iova; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); +out_put: + iommufd_put_object(ucmd->ictx, &ioas->obj); + fput(file); + return rc; +} + int iommufd_ioas_map(struct iommufd_ucmd *ucmd) { struct iommu_ioas_map *cmd = ucmd->cmd; @@ -213,6 +263,10 @@ int iommufd_ioas_map(struct iommufd_ucmd *ucmd) if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) return -EOVERFLOW; + if (!(cmd->flags & + (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))) + return -EINVAL; + ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id); if (IS_ERR(ioas)) return PTR_ERR(ioas); @@ -253,6 +307,10 @@ int iommufd_ioas_copy(struct iommufd_ucmd *ucmd) cmd->dst_iova >= ULONG_MAX) return -EOVERFLOW; + if (!(cmd->flags & + (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))) + return -EINVAL; + src_ioas = iommufd_get_ioas(ucmd->ictx, cmd->src_ioas_id); if (IS_ERR(src_ioas)) return PTR_ERR(src_ioas); @@ -319,6 +377,215 @@ out_put: return rc; } +static void iommufd_release_all_iova_rwsem(struct iommufd_ctx *ictx, + struct xarray *ioas_list) +{ + struct iommufd_ioas *ioas; + unsigned long index; + + xa_for_each(ioas_list, index, ioas) { + up_write(&ioas->iopt.iova_rwsem); + refcount_dec(&ioas->obj.users); + } + up_write(&ictx->ioas_creation_lock); + xa_destroy(ioas_list); +} + +static int iommufd_take_all_iova_rwsem(struct iommufd_ctx *ictx, + struct xarray *ioas_list) +{ + struct iommufd_object *obj; + unsigned long index; + int rc; + + /* + * This is very ugly, it is done instead of adding a lock around + * pages->source_mm, which is a performance path for mdev, we just + * obtain the write side of all the iova_rwsems which also protects the + * pages->source_*. Due to copies we can't know which IOAS could read + * from the pages, so we just lock everything. This is the only place + * locks are nested and they are uniformly taken in ID order. + * + * ioas_creation_lock prevents new IOAS from being installed in the + * xarray while we do this, and also prevents more than one thread from + * holding nested locks. 
+ */ + down_write(&ictx->ioas_creation_lock); + xa_lock(&ictx->objects); + xa_for_each(&ictx->objects, index, obj) { + struct iommufd_ioas *ioas; + + if (!obj || obj->type != IOMMUFD_OBJ_IOAS) + continue; + + if (!refcount_inc_not_zero(&obj->users)) + continue; + + xa_unlock(&ictx->objects); + + ioas = container_of(obj, struct iommufd_ioas, obj); + down_write_nest_lock(&ioas->iopt.iova_rwsem, + &ictx->ioas_creation_lock); + + rc = xa_err(xa_store(ioas_list, index, ioas, GFP_KERNEL)); + if (rc) { + iommufd_release_all_iova_rwsem(ictx, ioas_list); + return rc; + } + + xa_lock(&ictx->objects); + } + xa_unlock(&ictx->objects); + return 0; +} + +static bool need_charge_update(struct iopt_pages *pages) +{ + switch (pages->account_mode) { + case IOPT_PAGES_ACCOUNT_NONE: + return false; + case IOPT_PAGES_ACCOUNT_MM: + return pages->source_mm != current->mm; + case IOPT_PAGES_ACCOUNT_USER: + /* + * Update when mm changes because it also accounts + * in mm->pinned_vm. + */ + return (pages->source_user != current_user()) || + (pages->source_mm != current->mm); + } + return true; +} + +static int charge_current(unsigned long *npinned) +{ + struct iopt_pages tmp = { + .source_mm = current->mm, + .source_task = current->group_leader, + .source_user = current_user(), + }; + unsigned int account_mode; + int rc; + + for (account_mode = 0; account_mode != IOPT_PAGES_ACCOUNT_MODE_NUM; + account_mode++) { + if (!npinned[account_mode]) + continue; + + tmp.account_mode = account_mode; + rc = iopt_pages_update_pinned(&tmp, npinned[account_mode], true, + NULL); + if (rc) + goto err_undo; + } + return 0; + +err_undo: + while (account_mode != 0) { + account_mode--; + if (!npinned[account_mode]) + continue; + tmp.account_mode = account_mode; + iopt_pages_update_pinned(&tmp, npinned[account_mode], false, + NULL); + } + return rc; +} + +static void change_mm(struct iopt_pages *pages) +{ + struct task_struct *old_task = pages->source_task; + struct user_struct *old_user = pages->source_user; + struct mm_struct *old_mm = pages->source_mm; + + pages->source_mm = current->mm; + mmgrab(pages->source_mm); + mmdrop(old_mm); + + pages->source_task = current->group_leader; + get_task_struct(pages->source_task); + put_task_struct(old_task); + + pages->source_user = get_uid(current_user()); + free_uid(old_user); +} + +#define for_each_ioas_area(_xa, _index, _ioas, _area) \ + xa_for_each((_xa), (_index), (_ioas)) \ + for (_area = iopt_area_iter_first(&_ioas->iopt, 0, ULONG_MAX); \ + _area; \ + _area = iopt_area_iter_next(_area, 0, ULONG_MAX)) + +int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_change_process *cmd = ucmd->cmd; + struct iommufd_ctx *ictx = ucmd->ictx; + unsigned long all_npinned[IOPT_PAGES_ACCOUNT_MODE_NUM] = {}; + struct iommufd_ioas *ioas; + struct iopt_area *area; + struct iopt_pages *pages; + struct xarray ioas_list; + unsigned long index; + int rc; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + xa_init(&ioas_list); + rc = iommufd_take_all_iova_rwsem(ictx, &ioas_list); + if (rc) + return rc; + + for_each_ioas_area(&ioas_list, index, ioas, area) { + if (area->pages->type != IOPT_ADDRESS_FILE) { + rc = -EINVAL; + goto out; + } + } + + /* + * Count last_pinned pages, then clear it to avoid double counting + * if the same iopt_pages is visited multiple times in this loop. + * Since we are under all the locks, npinned == last_npinned, so we + * can easily restore last_npinned before we return. 
+ */ + for_each_ioas_area(&ioas_list, index, ioas, area) { + pages = area->pages; + + if (need_charge_update(pages)) { + all_npinned[pages->account_mode] += pages->last_npinned; + pages->last_npinned = 0; + } + } + + rc = charge_current(all_npinned); + + if (rc) { + /* Charge failed. Fix last_npinned and bail. */ + for_each_ioas_area(&ioas_list, index, ioas, area) + area->pages->last_npinned = area->pages->npinned; + goto out; + } + + for_each_ioas_area(&ioas_list, index, ioas, area) { + pages = area->pages; + + /* Uncharge the old one (which also restores last_npinned) */ + if (need_charge_update(pages)) { + int r = iopt_pages_update_pinned(pages, pages->npinned, + false, NULL); + + if (WARN_ON(r)) + rc = r; + } + change_mm(pages); + } + +out: + iommufd_release_all_iova_rwsem(ictx, &ioas_list); + return rc; +} + int iommufd_option_rlimit_mode(struct iommu_option *cmd, struct iommufd_ctx *ictx) { diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 991f864d1f9b..0b1bafc7fd99 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -4,14 +4,16 @@ #ifndef __IOMMUFD_PRIVATE_H #define __IOMMUFD_PRIVATE_H -#include <linux/rwsem.h> -#include <linux/xarray.h> -#include <linux/refcount.h> -#include <linux/uaccess.h> #include <linux/iommu.h> +#include <linux/iommufd.h> #include <linux/iova_bitmap.h> +#include <linux/rwsem.h> +#include <linux/uaccess.h> +#include <linux/xarray.h> #include <uapi/linux/iommufd.h> +#include "../iommu-priv.h" + struct iommu_domain; struct iommu_group; struct iommu_option; @@ -22,6 +24,7 @@ struct iommufd_ctx { struct xarray objects; struct xarray groups; wait_queue_head_t destroy_wait; + struct rw_semaphore ioas_creation_lock; u8 account_mode; /* Compatibility with VFIO no iommu */ @@ -67,6 +70,10 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned long *iova, void __user *uptr, unsigned long length, int iommu_prot, unsigned int flags); +int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, + unsigned long *iova, struct file *file, + unsigned long start, unsigned long length, + int iommu_prot, unsigned int flags); int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, unsigned long length, unsigned long *dst_iova, int iommu_prot, unsigned int flags); @@ -120,28 +127,6 @@ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd, return 0; } -enum iommufd_object_type { - IOMMUFD_OBJ_NONE, - IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, - IOMMUFD_OBJ_DEVICE, - IOMMUFD_OBJ_HWPT_PAGING, - IOMMUFD_OBJ_HWPT_NESTED, - IOMMUFD_OBJ_IOAS, - IOMMUFD_OBJ_ACCESS, -#ifdef CONFIG_IOMMUFD_TEST - IOMMUFD_OBJ_SELFTEST, -#endif - IOMMUFD_OBJ_MAX, -}; - -/* Base struct for all objects with a userspace ID handle. 
*/ -struct iommufd_object { - refcount_t shortterm_users; - refcount_t users; - enum iommufd_object_type type; - unsigned int id; -}; - static inline bool iommufd_lock_obj(struct iommufd_object *obj) { if (!refcount_inc_not_zero(&obj->users)) @@ -222,10 +207,6 @@ iommufd_object_put_and_try_destroy(struct iommufd_ctx *ictx, iommufd_object_remove(ictx, obj, obj->id, 0); } -struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, - size_t size, - enum iommufd_object_type type); - #define __iommufd_object_alloc(ictx, ptr, type, obj) \ container_of(_iommufd_object_alloc( \ ictx, \ @@ -273,6 +254,8 @@ void iommufd_ioas_destroy(struct iommufd_object *obj); int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd); int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd); int iommufd_ioas_map(struct iommufd_ucmd *ucmd); +int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd); +int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd); int iommufd_ioas_copy(struct iommufd_ucmd *ucmd); int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd); int iommufd_ioas_option(struct iommufd_ucmd *ucmd); @@ -292,6 +275,7 @@ int iommufd_check_iova_range(struct io_pagetable *iopt, struct iommufd_hw_pagetable { struct iommufd_object obj; struct iommu_domain *domain; + struct iommufd_fault *fault; }; struct iommufd_hwpt_paging { @@ -308,6 +292,7 @@ struct iommufd_hwpt_paging { struct iommufd_hwpt_nested { struct iommufd_hw_pagetable common; struct iommufd_hwpt_paging *parent; + struct iommufd_viommu *viommu; }; static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt) @@ -321,6 +306,25 @@ to_hwpt_paging(struct iommufd_hw_pagetable *hwpt) return container_of(hwpt, struct iommufd_hwpt_paging, common); } +static inline struct iommufd_hwpt_nested * +to_hwpt_nested(struct iommufd_hw_pagetable *hwpt) +{ + return container_of(hwpt, struct iommufd_hwpt_nested, common); +} + +static inline struct iommufd_hwpt_paging * +find_hwpt_paging(struct iommufd_hw_pagetable *hwpt) +{ + switch (hwpt->obj.type) { + case IOMMUFD_OBJ_HWPT_PAGING: + return to_hwpt_paging(hwpt); + case IOMMUFD_OBJ_HWPT_NESTED: + return to_hwpt_nested(hwpt)->parent; + default: + return NULL; + } +} + static inline struct iommufd_hwpt_paging * iommufd_get_hwpt_paging(struct iommufd_ucmd *ucmd, u32 id) { @@ -395,6 +399,9 @@ struct iommufd_device { /* always the physical device */ struct device *dev; bool enforce_cache_coherency; + /* protect iopf_enabled counter */ + struct mutex iopf_lock; + unsigned int iopf_enabled; }; static inline struct iommufd_device * @@ -426,6 +433,128 @@ void iopt_remove_access(struct io_pagetable *iopt, u32 iopt_access_list_id); void iommufd_access_destroy_object(struct iommufd_object *obj); +/* + * An iommufd_fault object represents an interface to deliver I/O page faults + * to the user space. These objects are created/destroyed by the user space and + * associated with hardware page table objects during page-table allocation. 
+ */ +struct iommufd_fault { + struct iommufd_object obj; + struct iommufd_ctx *ictx; + struct file *filep; + + spinlock_t lock; /* protects the deliver list */ + struct list_head deliver; + struct mutex mutex; /* serializes response flows */ + struct xarray response; + + struct wait_queue_head wait_queue; +}; + +/* Fetch the first node out of the fault->deliver list */ +static inline struct iopf_group * +iommufd_fault_deliver_fetch(struct iommufd_fault *fault) +{ + struct list_head *list = &fault->deliver; + struct iopf_group *group = NULL; + + spin_lock(&fault->lock); + if (!list_empty(list)) { + group = list_first_entry(list, struct iopf_group, node); + list_del(&group->node); + } + spin_unlock(&fault->lock); + return group; +} + +/* Restore a node back to the head of the fault->deliver list */ +static inline void iommufd_fault_deliver_restore(struct iommufd_fault *fault, + struct iopf_group *group) +{ + spin_lock(&fault->lock); + list_add(&group->node, &fault->deliver); + spin_unlock(&fault->lock); +} + +struct iommufd_attach_handle { + struct iommu_attach_handle handle; + struct iommufd_device *idev; +}; + +/* Convert an iommu attach handle to iommufd handle. */ +#define to_iommufd_handle(hdl) container_of(hdl, struct iommufd_attach_handle, handle) + +static inline struct iommufd_fault * +iommufd_get_fault(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_FAULT), + struct iommufd_fault, obj); +} + +int iommufd_fault_alloc(struct iommufd_ucmd *ucmd); +void iommufd_fault_destroy(struct iommufd_object *obj); +int iommufd_fault_iopf_handler(struct iopf_group *group); + +int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev); +void iommufd_fault_domain_detach_dev(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev); +int iommufd_fault_domain_replace_dev(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt, + struct iommufd_hw_pagetable *old); + +static inline int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) +{ + if (hwpt->fault) + return iommufd_fault_domain_attach_dev(hwpt, idev); + + return iommu_attach_group(hwpt->domain, idev->igroup->group); +} + +static inline void iommufd_hwpt_detach_device(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) +{ + if (hwpt->fault) { + iommufd_fault_domain_detach_dev(hwpt, idev); + return; + } + + iommu_detach_group(hwpt->domain, idev->igroup->group); +} + +static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt, + struct iommufd_hw_pagetable *old) +{ + if (old->fault || hwpt->fault) + return iommufd_fault_domain_replace_dev(idev, hwpt, old); + + return iommu_group_replace_domain(idev->igroup->group, hwpt->domain); +} + +static inline struct iommufd_viommu * +iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_VIOMMU), + struct iommufd_viommu, obj); +} + +int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd); +void iommufd_viommu_destroy(struct iommufd_object *obj); +int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd); +void iommufd_vdevice_destroy(struct iommufd_object *obj); + +struct iommufd_vdevice { + struct iommufd_object obj; + struct iommufd_ctx *ictx; + struct iommufd_viommu *viommu; + struct device *dev; + u64 id; /* per-vIOMMU virtual ID */ +}; + #ifdef CONFIG_IOMMUFD_TEST int iommufd_test(struct iommufd_ucmd 
*ucmd); void iommufd_selftest_destroy(struct iommufd_object *obj); diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index e854d3f67205..a6b7a163f636 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -4,8 +4,8 @@ #ifndef _UAPI_IOMMUFD_TEST_H #define _UAPI_IOMMUFD_TEST_H -#include <linux/types.h> #include <linux/iommufd.h> +#include <linux/types.h> enum { IOMMU_TEST_OP_ADD_RESERVED = 1, @@ -22,6 +22,8 @@ enum { IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS, IOMMU_TEST_OP_DIRTY, IOMMU_TEST_OP_MD_CHECK_IOTLB, + IOMMU_TEST_OP_TRIGGER_IOPF, + IOMMU_TEST_OP_DEV_CHECK_CACHE, }; enum { @@ -53,6 +55,11 @@ enum { MOCK_NESTED_DOMAIN_IOTLB_NUM = 4, }; +enum { + MOCK_DEV_CACHE_ID_MAX = 3, + MOCK_DEV_CACHE_NUM = 4, +}; + struct iommu_test_cmd { __u32 size; __u32 op; @@ -127,6 +134,17 @@ struct iommu_test_cmd { __u32 id; __u32 iotlb; } check_iotlb; + struct { + __u32 dev_id; + __u32 pasid; + __u32 grpid; + __u32 perm; + __u64 addr; + } trigger_iopf; + struct { + __u32 id; + __u32 cache; + } check_dev_cache; }; __u32 last; }; @@ -144,6 +162,7 @@ struct iommu_test_hw_info { /* Should not be equal to any defined value in enum iommu_hwpt_data_type */ #define IOMMU_HWPT_DATA_SELFTEST 0xdead #define IOMMU_TEST_IOTLB_DEFAULT 0xbadbeef +#define IOMMU_TEST_DEV_CACHE_DEFAULT 0xbaddad /** * struct iommu_hwpt_selftest @@ -172,4 +191,25 @@ struct iommu_hwpt_invalidate_selftest { __u32 iotlb_id; }; +#define IOMMU_VIOMMU_TYPE_SELFTEST 0xdeadbeef + +/* Should not be equal to any defined value in enum iommu_viommu_invalidate_data_type */ +#define IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST 0xdeadbeef +#define IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST_INVALID 0xdadbeef + +/** + * struct iommu_viommu_invalidate_selftest - Invalidation data for Mock VIOMMU + * (IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST) + * @flags: Invalidate flags + * @cache_id: Invalidate cache entry index + * + * If IOMMU_TEST_INVALIDATE_ALL is set in @flags, @cache_id will be ignored + */ +struct iommu_viommu_invalidate_selftest { +#define IOMMU_TEST_INVALIDATE_FLAG_ALL (1 << 0) + __u32 flags; + __u32 vdev_id; + __u32 cache_id; +}; + #endif diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index db8c46bee155..39a86a4a1d3a 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -3,10 +3,10 @@ * Copyright (c) 2022, Oracle and/or its affiliates. * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved */ +#include <linux/highmem.h> #include <linux/iova_bitmap.h> #include <linux/mm.h> #include <linux/slab.h> -#include <linux/highmem.h> #define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE) @@ -35,6 +35,9 @@ struct iova_bitmap_map { /* base IOVA representing bit 0 of the first page */ unsigned long iova; + /* mapped length */ + unsigned long length; + /* page size order that each bit granules to */ unsigned long pgshift; @@ -113,9 +116,6 @@ struct iova_bitmap { /* length of the IOVA range for the whole bitmap */ size_t length; - - /* length of the IOVA range set ahead the pinned pages */ - unsigned long set_ahead_length; }; /* @@ -130,7 +130,7 @@ struct iova_bitmap { static unsigned long iova_bitmap_offset_to_index(struct iova_bitmap *bitmap, unsigned long iova) { - unsigned long pgsize = 1 << bitmap->mapped.pgshift; + unsigned long pgsize = 1UL << bitmap->mapped.pgshift; return iova / (BITS_PER_TYPE(*bitmap->bitmap) * pgsize); } @@ -156,6 +156,8 @@ static unsigned long iova_bitmap_mapped_iova(struct iova_bitmap *bitmap) return bitmap->iova + iova_bitmap_index_to_offset(bitmap, skip); } +static unsigned long iova_bitmap_mapped_length(struct iova_bitmap *bitmap); + /* * Pins the bitmap user pages for the current range window. * This is internal to IOVA bitmap and called when advancing the @@ -206,6 +208,7 @@ static int iova_bitmap_get(struct iova_bitmap *bitmap) * aligned. */ mapped->pgoff = offset_in_page(addr); + mapped->length = iova_bitmap_mapped_length(bitmap); return 0; } @@ -263,16 +266,13 @@ struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length, goto err; } - rc = iova_bitmap_get(bitmap); - if (rc) - goto err; return bitmap; err: iova_bitmap_free(bitmap); return ERR_PTR(rc); } -EXPORT_SYMBOL_NS_GPL(iova_bitmap_alloc, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iova_bitmap_alloc, "IOMMUFD"); /** * iova_bitmap_free() - Frees an IOVA bitmap object @@ -294,7 +294,7 @@ void iova_bitmap_free(struct iova_bitmap *bitmap) kfree(bitmap); } -EXPORT_SYMBOL_NS_GPL(iova_bitmap_free, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iova_bitmap_free, "IOMMUFD"); /* * Returns the remaining bitmap indexes from mapped_total_index to process for @@ -338,65 +338,34 @@ static unsigned long iova_bitmap_mapped_length(struct iova_bitmap *bitmap) } /* - * Returns true if there's not more data to iterate. + * Returns true if [@iova..@iova+@length-1] is part of the mapped IOVA range. 
*/ -static bool iova_bitmap_done(struct iova_bitmap *bitmap) +static bool iova_bitmap_mapped_range(struct iova_bitmap_map *mapped, + unsigned long iova, size_t length) { - return bitmap->mapped_base_index >= bitmap->mapped_total_index; -} - -static int iova_bitmap_set_ahead(struct iova_bitmap *bitmap, - size_t set_ahead_length) -{ - int ret = 0; - - while (set_ahead_length > 0 && !iova_bitmap_done(bitmap)) { - unsigned long length = iova_bitmap_mapped_length(bitmap); - unsigned long iova = iova_bitmap_mapped_iova(bitmap); - - ret = iova_bitmap_get(bitmap); - if (ret) - break; - - length = min(length, set_ahead_length); - iova_bitmap_set(bitmap, iova, length); - - set_ahead_length -= length; - bitmap->mapped_base_index += - iova_bitmap_offset_to_index(bitmap, length - 1) + 1; - iova_bitmap_put(bitmap); - } - - bitmap->set_ahead_length = 0; - return ret; + return mapped->npages && + (iova >= mapped->iova && + (iova + length - 1) <= (mapped->iova + mapped->length - 1)); } /* - * Advances to the next range, releases the current pinned + * Advances to a selected range, releases the current pinned * pages and pins the next set of bitmap pages. * Returns 0 on success or otherwise errno. */ -static int iova_bitmap_advance(struct iova_bitmap *bitmap) +static int iova_bitmap_advance_to(struct iova_bitmap *bitmap, + unsigned long iova) { - unsigned long iova = iova_bitmap_mapped_length(bitmap) - 1; - unsigned long count = iova_bitmap_offset_to_index(bitmap, iova) + 1; + unsigned long index; - bitmap->mapped_base_index += count; + index = iova_bitmap_offset_to_index(bitmap, iova - bitmap->iova); + if (index >= bitmap->mapped_total_index) + return -EINVAL; + bitmap->mapped_base_index = index; iova_bitmap_put(bitmap); - if (iova_bitmap_done(bitmap)) - return 0; - - /* Iterate, set and skip any bits requested for next iteration */ - if (bitmap->set_ahead_length) { - int ret; - ret = iova_bitmap_set_ahead(bitmap, bitmap->set_ahead_length); - if (ret) - return ret; - } - - /* When advancing the index we pin the next set of bitmap pages */ + /* Pin the next set of bitmap pages */ return iova_bitmap_get(bitmap); } @@ -416,19 +385,9 @@ static int iova_bitmap_advance(struct iova_bitmap *bitmap) int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, iova_bitmap_fn_t fn) { - int ret = 0; - - for (; !iova_bitmap_done(bitmap) && !ret; - ret = iova_bitmap_advance(bitmap)) { - ret = fn(bitmap, iova_bitmap_mapped_iova(bitmap), - iova_bitmap_mapped_length(bitmap), opaque); - if (ret) - break; - } - - return ret; + return fn(bitmap, bitmap->iova, bitmap->length, opaque); } -EXPORT_SYMBOL_NS_GPL(iova_bitmap_for_each, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iova_bitmap_for_each, "IOMMUFD"); /** * iova_bitmap_set() - Records an IOVA range in bitmap @@ -444,11 +403,25 @@ void iova_bitmap_set(struct iova_bitmap *bitmap, unsigned long iova, size_t length) { struct iova_bitmap_map *mapped = &bitmap->mapped; - unsigned long cur_bit = ((iova - mapped->iova) >> - mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; - unsigned long last_bit = (((iova + length - 1) - mapped->iova) >> - mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; - unsigned long last_page_idx = mapped->npages - 1; + unsigned long cur_bit, last_bit, last_page_idx; + +update_indexes: + if (unlikely(!iova_bitmap_mapped_range(mapped, iova, length))) { + + /* + * The attempt to advance the base index to @iova + * may fail if it's out of bounds, or pinning the pages + * returns an error. 
+ */ + if (iova_bitmap_advance_to(bitmap, iova)) + return; + } + + last_page_idx = mapped->npages - 1; + cur_bit = ((iova - mapped->iova) >> + mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; + last_bit = (((iova + length - 1) - mapped->iova) >> + mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; do { unsigned int page_idx = cur_bit / BITS_PER_PAGE; @@ -457,18 +430,19 @@ void iova_bitmap_set(struct iova_bitmap *bitmap, last_bit - cur_bit + 1); void *kaddr; - if (unlikely(page_idx > last_page_idx)) - break; + if (unlikely(page_idx > last_page_idx)) { + unsigned long left = + ((last_bit - cur_bit + 1) << mapped->pgshift); + + iova += (length - left); + length = left; + goto update_indexes; + } kaddr = kmap_local_page(mapped->pages[page_idx]); bitmap_set(kaddr, offset, nbits); kunmap_local(kaddr); cur_bit += nbits; } while (cur_bit <= last_bit); - - if (unlikely(cur_bit <= last_bit)) { - bitmap->set_ahead_length = - ((last_bit - cur_bit + 1) << bitmap->mapped.pgshift); - } } -EXPORT_SYMBOL_NS_GPL(iova_bitmap_set, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iova_bitmap_set, "IOMMUFD"); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 39b32932c61e..ccf616462a1c 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -8,15 +8,15 @@ */ #define pr_fmt(fmt) "iommufd: " fmt +#include <linux/bug.h> #include <linux/file.h> #include <linux/fs.h> -#include <linux/module.h> -#include <linux/slab.h> +#include <linux/iommufd.h> #include <linux/miscdevice.h> +#include <linux/module.h> #include <linux/mutex.h> -#include <linux/bug.h> +#include <linux/slab.h> #include <uapi/linux/iommufd.h> -#include <linux/iommufd.h> #include "io_pagetable.h" #include "iommufd_private.h" @@ -29,38 +29,6 @@ struct iommufd_object_ops { static const struct iommufd_object_ops iommufd_object_ops[]; static struct miscdevice vfio_misc_dev; -struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, - size_t size, - enum iommufd_object_type type) -{ - struct iommufd_object *obj; - int rc; - - obj = kzalloc(size, GFP_KERNEL_ACCOUNT); - if (!obj) - return ERR_PTR(-ENOMEM); - obj->type = type; - /* Starts out bias'd by 1 until it is removed from the xarray */ - refcount_set(&obj->shortterm_users, 1); - refcount_set(&obj->users, 1); - - /* - * Reserve an ID in the xarray but do not publish the pointer yet since - * the caller hasn't initialized it yet. Once the pointer is published - * in the xarray and visible to other threads we can't reliably destroy - * it anymore, so the caller must complete all errorable operations - * before calling iommufd_object_finalize(). - */ - rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, - xa_limit_31b, GFP_KERNEL_ACCOUNT); - if (rc) - goto out_free; - return obj; -out_free: - kfree(obj); - return ERR_PTR(rc); -} - /* * Allow concurrent access to the object. 
* @@ -73,20 +41,26 @@ out_free: void iommufd_object_finalize(struct iommufd_ctx *ictx, struct iommufd_object *obj) { + XA_STATE(xas, &ictx->objects, obj->id); void *old; - old = xa_store(&ictx->objects, obj->id, obj, GFP_KERNEL); - /* obj->id was returned from xa_alloc() so the xa_store() cannot fail */ - WARN_ON(old); + xa_lock(&ictx->objects); + old = xas_store(&xas, obj); + xa_unlock(&ictx->objects); + /* obj->id was returned from xa_alloc() so the xas_store() cannot fail */ + WARN_ON(old != XA_ZERO_ENTRY); } /* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */ void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj) { + XA_STATE(xas, &ictx->objects, obj->id); void *old; - old = xa_erase(&ictx->objects, obj->id); - WARN_ON(old); + xa_lock(&ictx->objects); + old = xas_store(&xas, NULL); + xa_unlock(&ictx->objects); + WARN_ON(old != XA_ZERO_ENTRY); kfree(obj); } @@ -130,7 +104,7 @@ static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx, if (wait_event_timeout(ictx->destroy_wait, refcount_read(&to_destroy->shortterm_users) == 0, - msecs_to_jiffies(10000))) + msecs_to_jiffies(60000))) return 0; pr_crit("Time out waiting for iommufd object to become free\n"); @@ -248,6 +222,7 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n"); } + init_rwsem(&ictx->ioas_creation_lock); xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); xa_init(&ictx->groups); ictx->file = filp; @@ -319,6 +294,7 @@ static int iommufd_option(struct iommufd_ucmd *ucmd) union ucmd_buffer { struct iommu_destroy destroy; + struct iommu_fault_alloc fault; struct iommu_hw_info info; struct iommu_hwpt_alloc hwpt; struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap; @@ -331,7 +307,9 @@ union ucmd_buffer { struct iommu_ioas_map map; struct iommu_ioas_unmap unmap; struct iommu_option option; + struct iommu_vdevice_alloc vdev; struct iommu_vfio_ioas vfio_ioas; + struct iommu_viommu_alloc viommu; #ifdef CONFIG_IOMMUFD_TEST struct iommu_test_cmd test; #endif @@ -355,6 +333,8 @@ struct iommufd_ioctl_op { } static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id), + IOCTL_OP(IOMMU_FAULT_QUEUE_ALLOC, iommufd_fault_alloc, + struct iommu_fault_alloc, out_fault_fd), IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info, __reserved), IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, @@ -369,18 +349,24 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { struct iommu_ioas_alloc, out_ioas_id), IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas, struct iommu_ioas_allow_iovas, allowed_iovas), + IOCTL_OP(IOMMU_IOAS_CHANGE_PROCESS, iommufd_ioas_change_process, + struct iommu_ioas_change_process, __reserved), IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy, src_iova), IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges, struct iommu_ioas_iova_ranges, out_iova_alignment), - IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map, - iova), + IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map, iova), + IOCTL_OP(IOMMU_IOAS_MAP_FILE, iommufd_ioas_map_file, + struct iommu_ioas_map_file, iova), IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap, length), - IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, - val64), + IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, val64), + 
IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl, + struct iommu_vdevice_alloc, virt_id), IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas, __reserved), + IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl, + struct iommu_viommu_alloc, out_viommu_id), #ifdef CONFIG_IOMMUFD_TEST IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last), #endif @@ -439,7 +425,7 @@ void iommufd_ctx_get(struct iommufd_ctx *ictx) { get_file(ictx->file); } -EXPORT_SYMBOL_NS_GPL(iommufd_ctx_get, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_get, "IOMMUFD"); /** * iommufd_ctx_from_file - Acquires a reference to the iommufd context @@ -459,7 +445,7 @@ struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) iommufd_ctx_get(ictx); return ictx; } -EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, "IOMMUFD"); /** * iommufd_ctx_from_fd - Acquires a reference to the iommufd context @@ -483,7 +469,7 @@ struct iommufd_ctx *iommufd_ctx_from_fd(int fd) /* fget is the same as iommufd_ctx_get() */ return file->private_data; } -EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_fd, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_fd, "IOMMUFD"); /** * iommufd_ctx_put - Put back a reference @@ -493,7 +479,7 @@ void iommufd_ctx_put(struct iommufd_ctx *ictx) { fput(ictx->file); } -EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD); +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, "IOMMUFD"); static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_ACCESS] = { @@ -502,8 +488,8 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_DEVICE] = { .destroy = iommufd_device_destroy, }, - [IOMMUFD_OBJ_IOAS] = { - .destroy = iommufd_ioas_destroy, + [IOMMUFD_OBJ_FAULT] = { + .destroy = iommufd_fault_destroy, }, [IOMMUFD_OBJ_HWPT_PAGING] = { .destroy = iommufd_hwpt_paging_destroy, @@ -513,6 +499,15 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { .destroy = iommufd_hwpt_nested_destroy, .abort = iommufd_hwpt_nested_abort, }, + [IOMMUFD_OBJ_IOAS] = { + .destroy = iommufd_ioas_destroy, + }, + [IOMMUFD_OBJ_VDEVICE] = { + .destroy = iommufd_vdevice_destroy, + }, + [IOMMUFD_OBJ_VIOMMU] = { + .destroy = iommufd_viommu_destroy, + }, #ifdef CONFIG_IOMMUFD_TEST [IOMMUFD_OBJ_SELFTEST] = { .destroy = iommufd_selftest_destroy, @@ -578,7 +573,7 @@ module_exit(iommufd_exit); MODULE_ALIAS_MISCDEV(VFIO_MINOR); MODULE_ALIAS("devname:vfio/vfio"); #endif -MODULE_IMPORT_NS(IOMMUFD_INTERNAL); -MODULE_IMPORT_NS(IOMMUFD); +MODULE_IMPORT_NS("IOMMUFD_INTERNAL"); +MODULE_IMPORT_NS("IOMMUFD"); MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices"); MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 528f356238b3..3427749bc5ce 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -45,16 +45,17 @@ * last_iova + 1 can overflow. An iopt_pages index will always be much less than * ULONG_MAX so last_index + 1 cannot overflow. 
*/ +#include <linux/file.h> +#include <linux/highmem.h> +#include <linux/iommu.h> +#include <linux/iommufd.h> +#include <linux/kthread.h> #include <linux/overflow.h> #include <linux/slab.h> -#include <linux/iommu.h> #include <linux/sched/mm.h> -#include <linux/highmem.h> -#include <linux/kthread.h> -#include <linux/iommufd.h> -#include "io_pagetable.h" #include "double_span.h" +#include "io_pagetable.h" #ifndef CONFIG_IOMMUFD_TEST #define TEMP_MEMORY_LIMIT 65536 @@ -346,27 +347,41 @@ static void batch_destroy(struct pfn_batch *batch, void *backup) kfree(batch->pfns); } -/* true if the pfn was added, false otherwise */ -static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn) +static bool batch_add_pfn_num(struct pfn_batch *batch, unsigned long pfn, + u32 nr) { const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns)); - - if (batch->end && - pfn == batch->pfns[batch->end - 1] + batch->npfns[batch->end - 1] && - batch->npfns[batch->end - 1] != MAX_NPFNS) { - batch->npfns[batch->end - 1]++; - batch->total_pfns++; - return true; - } - if (batch->end == batch->array_size) + unsigned int end = batch->end; + + if (end && pfn == batch->pfns[end - 1] + batch->npfns[end - 1] && + nr <= MAX_NPFNS - batch->npfns[end - 1]) { + batch->npfns[end - 1] += nr; + } else if (end < batch->array_size) { + batch->pfns[end] = pfn; + batch->npfns[end] = nr; + batch->end++; + } else { return false; - batch->total_pfns++; - batch->pfns[batch->end] = pfn; - batch->npfns[batch->end] = 1; - batch->end++; + } + + batch->total_pfns += nr; return true; } +static void batch_remove_pfn_num(struct pfn_batch *batch, unsigned long nr) +{ + batch->npfns[batch->end - 1] -= nr; + if (batch->npfns[batch->end - 1] == 0) + batch->end--; + batch->total_pfns -= nr; +} + +/* true if the pfn was added, false otherwise */ +static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn) +{ + return batch_add_pfn_num(batch, pfn, 1); +} + /* * Fill the batch with pfns from the domain. When the batch is full, or it * reaches last_index, the function will return. The caller should use @@ -622,6 +637,41 @@ static void batch_from_pages(struct pfn_batch *batch, struct page **pages, break; } +static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p, + unsigned long *offset_p, unsigned long npages) +{ + int rc = 0; + struct folio **folios = *folios_p; + unsigned long offset = *offset_p; + + while (npages) { + struct folio *folio = *folios; + unsigned long nr = folio_nr_pages(folio) - offset; + unsigned long pfn = page_to_pfn(folio_page(folio, offset)); + + nr = min(nr, npages); + npages -= nr; + + if (!batch_add_pfn_num(batch, pfn, nr)) + break; + if (nr > 1) { + rc = folio_add_pins(folio, nr - 1); + if (rc) { + batch_remove_pfn_num(batch, nr); + goto out; + } + } + + folios++; + offset = 0; + } + +out: + *folios_p = folios; + *offset_p = offset; + return rc; +} + static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages, unsigned int first_page_off, size_t npages) { @@ -703,19 +753,32 @@ struct pfn_reader_user { * neither */ int locked; + + /* The following are only valid if file != NULL. 
*/ + struct file *file; + struct folio **ufolios; + size_t ufolios_len; + unsigned long ufolios_offset; + struct folio **ufolios_next; }; static void pfn_reader_user_init(struct pfn_reader_user *user, struct iopt_pages *pages) { user->upages = NULL; + user->upages_len = 0; user->upages_start = 0; user->upages_end = 0; user->locked = -1; - user->gup_flags = FOLL_LONGTERM; if (pages->writable) user->gup_flags |= FOLL_WRITE; + + user->file = (pages->type == IOPT_ADDRESS_FILE) ? pages->file : NULL; + user->ufolios = NULL; + user->ufolios_len = 0; + user->ufolios_next = NULL; + user->ufolios_offset = 0; } static void pfn_reader_user_destroy(struct pfn_reader_user *user, @@ -724,13 +787,67 @@ static void pfn_reader_user_destroy(struct pfn_reader_user *user, if (user->locked != -1) { if (user->locked) mmap_read_unlock(pages->source_mm); - if (pages->source_mm != current->mm) + if (!user->file && pages->source_mm != current->mm) mmput(pages->source_mm); user->locked = -1; } kfree(user->upages); user->upages = NULL; + kfree(user->ufolios); + user->ufolios = NULL; +} + +static long pin_memfd_pages(struct pfn_reader_user *user, unsigned long start, + unsigned long npages) +{ + unsigned long i; + unsigned long offset; + unsigned long npages_out = 0; + struct page **upages = user->upages; + unsigned long end = start + (npages << PAGE_SHIFT) - 1; + long nfolios = user->ufolios_len / sizeof(*user->ufolios); + + /* + * todo: memfd_pin_folios should return the last pinned offset so + * we can compute npages pinned, and avoid looping over folios here + * if upages == NULL. + */ + nfolios = memfd_pin_folios(user->file, start, end, user->ufolios, + nfolios, &offset); + if (nfolios <= 0) + return nfolios; + + offset >>= PAGE_SHIFT; + user->ufolios_next = user->ufolios; + user->ufolios_offset = offset; + + for (i = 0; i < nfolios; i++) { + struct folio *folio = user->ufolios[i]; + unsigned long nr = folio_nr_pages(folio); + unsigned long npin = min(nr - offset, npages); + + npages -= npin; + npages_out += npin; + + if (upages) { + if (npin == 1) { + *upages++ = folio_page(folio, offset); + } else { + int rc = folio_add_pins(folio, npin - 1); + + if (rc) + return rc; + + while (npin--) + *upages++ = folio_page(folio, offset++); + } + } + + offset = 0; + } + + return npages_out; } static int pfn_reader_user_pin(struct pfn_reader_user *user, @@ -739,7 +856,9 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user, unsigned long last_index) { bool remote_mm = pages->source_mm != current->mm; - unsigned long npages; + unsigned long npages = last_index - start_index + 1; + unsigned long start; + unsigned long unum; uintptr_t uptr; long rc; @@ -747,40 +866,50 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user, WARN_ON(last_index < start_index)) return -EINVAL; - if (!user->upages) { + if (!user->file && !user->upages) { /* All undone in pfn_reader_destroy() */ - user->upages_len = - (last_index - start_index + 1) * sizeof(*user->upages); + user->upages_len = npages * sizeof(*user->upages); user->upages = temp_kmalloc(&user->upages_len, NULL, 0); if (!user->upages) return -ENOMEM; } + if (user->file && !user->ufolios) { + user->ufolios_len = npages * sizeof(*user->ufolios); + user->ufolios = temp_kmalloc(&user->ufolios_len, NULL, 0); + if (!user->ufolios) + return -ENOMEM; + } + if (user->locked == -1) { /* * The majority of usages will run the map task within the mm * providing the pages, so we can optimize into * get_user_pages_fast() */ - if (remote_mm) { + if (!user->file && remote_mm) { if 
(!mmget_not_zero(pages->source_mm)) return -EFAULT; } user->locked = 0; } - npages = min_t(unsigned long, last_index - start_index + 1, - user->upages_len / sizeof(*user->upages)); - + unum = user->file ? user->ufolios_len / sizeof(*user->ufolios) : + user->upages_len / sizeof(*user->upages); + npages = min_t(unsigned long, npages, unum); if (iommufd_should_fail()) return -EFAULT; - uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE); - if (!remote_mm) + if (user->file) { + start = pages->start + (start_index * PAGE_SIZE); + rc = pin_memfd_pages(user, start, npages); + } else if (!remote_mm) { + uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE); rc = pin_user_pages_fast(uptr, npages, user->gup_flags, user->upages); - else { + } else { + uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE); if (!user->locked) { mmap_read_lock(pages->source_mm); user->locked = 1; @@ -809,13 +938,14 @@ static int incr_user_locked_vm(struct iopt_pages *pages, unsigned long npages) lock_limit = task_rlimit(pages->source_task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + cur_pages = atomic_long_read(&pages->source_user->locked_vm); do { - cur_pages = atomic_long_read(&pages->source_user->locked_vm); new_pages = cur_pages + npages; if (new_pages > lock_limit) return -ENOMEM; - } while (atomic_long_cmpxchg(&pages->source_user->locked_vm, cur_pages, - new_pages) != cur_pages); + } while (!atomic_long_try_cmpxchg(&pages->source_user->locked_vm, + &cur_pages, new_pages)); return 0; } @@ -837,7 +967,8 @@ static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages, mmap_read_unlock(pages->source_mm); user->locked = 0; /* If we had the lock then we also have a get */ - } else if ((!user || !user->upages) && + + } else if ((!user || (!user->upages && !user->ufolios)) && pages->source_mm != current->mm) { if (!mmget_not_zero(pages->source_mm)) return -EINVAL; @@ -854,8 +985,8 @@ static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages, return rc; } -static int do_update_pinned(struct iopt_pages *pages, unsigned long npages, - bool inc, struct pfn_reader_user *user) +int iopt_pages_update_pinned(struct iopt_pages *pages, unsigned long npages, + bool inc, struct pfn_reader_user *user) { int rc = 0; @@ -889,8 +1020,8 @@ static void update_unpinned(struct iopt_pages *pages) return; if (pages->npinned == pages->last_npinned) return; - do_update_pinned(pages, pages->last_npinned - pages->npinned, false, - NULL); + iopt_pages_update_pinned(pages, pages->last_npinned - pages->npinned, + false, NULL); } /* @@ -920,7 +1051,7 @@ static int pfn_reader_user_update_pinned(struct pfn_reader_user *user, npages = pages->npinned - pages->last_npinned; inc = true; } - return do_update_pinned(pages, npages, inc, user); + return iopt_pages_update_pinned(pages, npages, inc, user); } /* @@ -977,6 +1108,8 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns) { struct interval_tree_double_span_iter *span = &pfns->span; unsigned long start_index = pfns->batch_end_index; + struct pfn_reader_user *user = &pfns->user; + unsigned long npages; struct iopt_area *area; int rc; @@ -1014,11 +1147,17 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns) return rc; } - batch_from_pages(&pfns->batch, - pfns->user.upages + - (start_index - pfns->user.upages_start), - pfns->user.upages_end - start_index); - return 0; + npages = user->upages_end - start_index; + start_index -= user->upages_start; + rc = 0; + + if (!user->file) + batch_from_pages(&pfns->batch, user->upages + start_index, + npages); + 
else + rc = batch_from_folios(&pfns->batch, &user->ufolios_next, + &user->ufolios_offset, npages); + return rc; } static bool pfn_reader_done(struct pfn_reader *pfns) @@ -1091,16 +1230,25 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages, static void pfn_reader_release_pins(struct pfn_reader *pfns) { struct iopt_pages *pages = pfns->pages; + struct pfn_reader_user *user = &pfns->user; - if (pfns->user.upages_end > pfns->batch_end_index) { - size_t npages = pfns->user.upages_end - pfns->batch_end_index; - + if (user->upages_end > pfns->batch_end_index) { /* Any pages not transferred to the batch are just unpinned */ - unpin_user_pages(pfns->user.upages + (pfns->batch_end_index - - pfns->user.upages_start), - npages); + + unsigned long npages = user->upages_end - pfns->batch_end_index; + unsigned long start_index = pfns->batch_end_index - + user->upages_start; + + if (!user->file) { + unpin_user_pages(user->upages + start_index, npages); + } else { + long n = user->ufolios_len / sizeof(*user->ufolios); + + unpin_folios(user->ufolios_next, + user->ufolios + n - user->ufolios_next); + } iopt_pages_sub_npinned(pages, npages); - pfns->user.upages_end = pfns->batch_end_index; + user->upages_end = pfns->batch_end_index; } if (pfns->batch_start_index != pfns->batch_end_index) { pfn_reader_unpin(pfns); @@ -1138,11 +1286,11 @@ static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages, return 0; } -struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, - bool writable) +static struct iopt_pages *iopt_alloc_pages(unsigned long start_byte, + unsigned long length, + bool writable) { struct iopt_pages *pages; - unsigned long end; /* * The iommu API uses size_t as the length, and protect the DIV_ROUND_UP @@ -1151,9 +1299,6 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, if (length > SIZE_MAX - PAGE_SIZE || length == 0) return ERR_PTR(-EINVAL); - if (check_add_overflow((unsigned long)uptr, length, &end)) - return ERR_PTR(-EOVERFLOW); - pages = kzalloc(sizeof(*pages), GFP_KERNEL_ACCOUNT); if (!pages) return ERR_PTR(-ENOMEM); @@ -1163,8 +1308,7 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, mutex_init(&pages->mutex); pages->source_mm = current->mm; mmgrab(pages->source_mm); - pages->uptr = (void __user *)ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE); - pages->npages = DIV_ROUND_UP(length + (uptr - pages->uptr), PAGE_SIZE); + pages->npages = DIV_ROUND_UP(length + start_byte, PAGE_SIZE); pages->access_itree = RB_ROOT_CACHED; pages->domains_itree = RB_ROOT_CACHED; pages->writable = writable; @@ -1178,6 +1322,45 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, return pages; } +struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, + unsigned long length, bool writable) +{ + struct iopt_pages *pages; + unsigned long end; + void __user *uptr_down = + (void __user *) ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE); + + if (check_add_overflow((unsigned long)uptr, length, &end)) + return ERR_PTR(-EOVERFLOW); + + pages = iopt_alloc_pages(uptr - uptr_down, length, writable); + if (IS_ERR(pages)) + return pages; + pages->uptr = uptr_down; + pages->type = IOPT_ADDRESS_USER; + return pages; +} + +struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start, + unsigned long length, bool writable) + +{ + struct iopt_pages *pages; + unsigned long start_down = ALIGN_DOWN(start, PAGE_SIZE); + unsigned long end; + + if (length && check_add_overflow(start, 
length - 1, &end)) + return ERR_PTR(-EOVERFLOW); + + pages = iopt_alloc_pages(start - start_down, length, writable); + if (IS_ERR(pages)) + return pages; + pages->file = get_file(file); + pages->start = start_down; + pages->type = IOPT_ADDRESS_FILE; + return pages; +} + void iopt_release_pages(struct kref *kref) { struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref); @@ -1190,6 +1373,8 @@ void iopt_release_pages(struct kref *kref) mutex_destroy(&pages->mutex); put_task_struct(pages->source_task); free_uid(pages->source_user); + if (pages->type == IOPT_ADDRESS_FILE) + fput(pages->file); kfree(pages); } @@ -1629,11 +1814,11 @@ static int iopt_pages_fill_from_domain(struct iopt_pages *pages, return 0; } -static int iopt_pages_fill_from_mm(struct iopt_pages *pages, - struct pfn_reader_user *user, - unsigned long start_index, - unsigned long last_index, - struct page **out_pages) +static int iopt_pages_fill(struct iopt_pages *pages, + struct pfn_reader_user *user, + unsigned long start_index, + unsigned long last_index, + struct page **out_pages) { unsigned long cur_index = start_index; int rc; @@ -1707,8 +1892,8 @@ int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start_index, /* hole */ cur_pages = out_pages + (span.start_hole - start_index); - rc = iopt_pages_fill_from_mm(pages, &user, span.start_hole, - span.last_hole, cur_pages); + rc = iopt_pages_fill(pages, &user, span.start_hole, + span.last_hole, cur_pages); if (rc) goto out_clean_xa; rc = pages_to_xarray(&pages->pinned_pfns, span.start_hole, @@ -1788,6 +1973,10 @@ static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index, struct page *page = NULL; int rc; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(pages->type != IOPT_ADDRESS_USER)) + return -EINVAL; + if (!mmget_not_zero(pages->source_mm)) return iopt_pages_rw_slow(pages, index, index, offset, data, length, flags); @@ -1843,6 +2032,15 @@ int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable) return -EPERM; + if (pages->type == IOPT_ADDRESS_FILE) + return iopt_pages_rw_slow(pages, start_index, last_index, + start_byte % PAGE_SIZE, data, length, + flags); + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(pages->type != IOPT_ADDRESS_USER)) + return -EINVAL; + if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) { if (start_index == last_index) return iopt_pages_rw_page(pages, start_index, diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 7a2199470f31..d40deb0a4f06 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -3,13 +3,14 @@ * * Kernel side components to support tools/testing/selftests/iommu */ -#include <linux/slab.h> -#include <linux/iommu.h> -#include <linux/xarray.h> -#include <linux/file.h> #include <linux/anon_inodes.h> +#include <linux/debugfs.h> #include <linux/fault-inject.h> +#include <linux/file.h> +#include <linux/iommu.h> #include <linux/platform_device.h> +#include <linux/slab.h> +#include <linux/xarray.h> #include <uapi/linux/iommufd.h> #include "../iommu-priv.h" @@ -125,12 +126,35 @@ struct mock_iommu_domain { struct xarray pfns; }; +static inline struct mock_iommu_domain * +to_mock_domain(struct iommu_domain *domain) +{ + return container_of(domain, struct mock_iommu_domain, domain); +} + struct mock_iommu_domain_nested { struct iommu_domain domain; + struct mock_viommu *mock_viommu; struct mock_iommu_domain *parent; u32 
iotlb[MOCK_NESTED_DOMAIN_IOTLB_NUM]; }; +static inline struct mock_iommu_domain_nested * +to_mock_nested(struct iommu_domain *domain) +{ + return container_of(domain, struct mock_iommu_domain_nested, domain); +} + +struct mock_viommu { + struct iommufd_viommu core; + struct mock_iommu_domain *s2_parent; +}; + +static inline struct mock_viommu *to_mock_viommu(struct iommufd_viommu *viommu) +{ + return container_of(viommu, struct mock_viommu, core); +} + enum selftest_obj_type { TYPE_IDEV, }; @@ -139,8 +163,14 @@ struct mock_dev { struct device dev; unsigned long flags; int id; + u32 cache[MOCK_DEV_CACHE_NUM]; }; +static inline struct mock_dev *to_mock_dev(struct device *dev) +{ + return container_of(dev, struct mock_dev, dev); +} + struct selftest_obj { struct iommufd_object obj; enum selftest_obj_type type; @@ -154,10 +184,15 @@ struct selftest_obj { }; }; +static inline struct selftest_obj *to_selftest_obj(struct iommufd_object *obj) +{ + return container_of(obj, struct selftest_obj, obj); +} + static int mock_domain_nop_attach(struct iommu_domain *domain, struct device *dev) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + struct mock_dev *mdev = to_mock_dev(dev); if (domain->dirty_ops && (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY)) return -EINVAL; @@ -192,8 +227,7 @@ static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type) static int mock_domain_set_dirty_tracking(struct iommu_domain *domain, bool enable) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); + struct mock_iommu_domain *mock = to_mock_domain(domain); unsigned long flags = mock->flags; if (enable && !domain->dirty_ops) @@ -242,8 +276,7 @@ static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, unsigned long flags, struct iommu_dirty_bitmap *dirty) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); + struct mock_iommu_domain *mock = to_mock_domain(domain); unsigned long end = iova + size; void *ent; @@ -266,107 +299,102 @@ static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, /* Clear dirty */ if (mock_test_and_clear_dirty(mock, head, pgsize, flags)) - iommu_dirty_bitmap_record(dirty, head, pgsize); - iova = head + pgsize; + iommu_dirty_bitmap_record(dirty, iova, pgsize); + iova += pgsize; } while (iova < end); return 0; } -const struct iommu_dirty_ops dirty_ops = { +static const struct iommu_dirty_ops dirty_ops = { .set_dirty_tracking = mock_domain_set_dirty_tracking, .read_and_clear_dirty = mock_domain_read_and_clear_dirty, }; -static struct iommu_domain *mock_domain_alloc_paging(struct device *dev) +static struct mock_iommu_domain_nested * +__mock_domain_alloc_nested(const struct iommu_user_data *user_data) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); - struct mock_iommu_domain *mock; + struct mock_iommu_domain_nested *mock_nested; + struct iommu_hwpt_selftest user_cfg; + int rc, i; - mock = kzalloc(sizeof(*mock), GFP_KERNEL); - if (!mock) - return NULL; - mock->domain.geometry.aperture_start = MOCK_APERTURE_START; - mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; - mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; - if (dev && mdev->flags & MOCK_FLAGS_DEVICE_HUGE_IOVA) - mock->domain.pgsize_bitmap |= MOCK_HUGE_PAGE_SIZE; - mock->domain.ops = mock_ops.default_domain_ops; - mock->domain.type = IOMMU_DOMAIN_UNMANAGED; - xa_init(&mock->pfns); - return &mock->domain; -} + if (user_data->type != IOMMU_HWPT_DATA_SELFTEST) + return 
ERR_PTR(-EOPNOTSUPP); -static struct iommu_domain * -__mock_domain_alloc_nested(struct mock_iommu_domain *mock_parent, - const struct iommu_hwpt_selftest *user_cfg) -{ - struct mock_iommu_domain_nested *mock_nested; - int i; + rc = iommu_copy_struct_from_user(&user_cfg, user_data, + IOMMU_HWPT_DATA_SELFTEST, iotlb); + if (rc) + return ERR_PTR(rc); mock_nested = kzalloc(sizeof(*mock_nested), GFP_KERNEL); if (!mock_nested) return ERR_PTR(-ENOMEM); - mock_nested->parent = mock_parent; mock_nested->domain.ops = &domain_nested_ops; mock_nested->domain.type = IOMMU_DOMAIN_NESTED; for (i = 0; i < MOCK_NESTED_DOMAIN_IOTLB_NUM; i++) - mock_nested->iotlb[i] = user_cfg->iotlb; - return &mock_nested->domain; + mock_nested->iotlb[i] = user_cfg.iotlb; + return mock_nested; } static struct iommu_domain * -mock_domain_alloc_user(struct device *dev, u32 flags, - struct iommu_domain *parent, - const struct iommu_user_data *user_data) +mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, + u32 flags, const struct iommu_user_data *user_data) { + struct mock_iommu_domain_nested *mock_nested; struct mock_iommu_domain *mock_parent; - struct iommu_hwpt_selftest user_cfg; - int rc; - - /* must be mock_domain */ - if (!parent) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); - bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; - bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; - struct iommu_domain *domain; - - if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT | - IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) - return ERR_PTR(-EOPNOTSUPP); - if (user_data || (has_dirty_flag && no_dirty_ops)) - return ERR_PTR(-EOPNOTSUPP); - domain = mock_domain_alloc_paging(dev); - if (!domain) - return ERR_PTR(-ENOMEM); - if (has_dirty_flag) - container_of(domain, struct mock_iommu_domain, domain) - ->domain.dirty_ops = &dirty_ops; - return domain; - } - /* must be mock_domain_nested */ - if (user_data->type != IOMMU_HWPT_DATA_SELFTEST || flags) + if (flags) return ERR_PTR(-EOPNOTSUPP); if (!parent || parent->ops != mock_ops.default_domain_ops) return ERR_PTR(-EINVAL); - mock_parent = container_of(parent, struct mock_iommu_domain, domain); + mock_parent = to_mock_domain(parent); if (!mock_parent) return ERR_PTR(-EINVAL); - rc = iommu_copy_struct_from_user(&user_cfg, user_data, - IOMMU_HWPT_DATA_SELFTEST, iotlb); - if (rc) - return ERR_PTR(rc); + mock_nested = __mock_domain_alloc_nested(user_data); + if (IS_ERR(mock_nested)) + return ERR_CAST(mock_nested); + mock_nested->parent = mock_parent; + return &mock_nested->domain; +} + +static struct iommu_domain * +mock_domain_alloc_paging_flags(struct device *dev, u32 flags, + const struct iommu_user_data *user_data) +{ + bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_NEST_PARENT; + struct mock_dev *mdev = to_mock_dev(dev); + bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; + struct mock_iommu_domain *mock; + + if (user_data) + return ERR_PTR(-EOPNOTSUPP); + if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops)) + return ERR_PTR(-EOPNOTSUPP); - return __mock_domain_alloc_nested(mock_parent, &user_cfg); + mock = kzalloc(sizeof(*mock), GFP_KERNEL); + if (!mock) + return ERR_PTR(-ENOMEM); + mock->domain.geometry.aperture_start = MOCK_APERTURE_START; + mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; + mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; + if (dev && mdev->flags & MOCK_FLAGS_DEVICE_HUGE_IOVA) + 
mock->domain.pgsize_bitmap |= MOCK_HUGE_PAGE_SIZE; + mock->domain.ops = mock_ops.default_domain_ops; + mock->domain.type = IOMMU_DOMAIN_UNMANAGED; + xa_init(&mock->pfns); + + if (has_dirty_flag) + mock->domain.dirty_ops = &dirty_ops; + return &mock->domain; } static void mock_domain_free(struct iommu_domain *domain) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); + struct mock_iommu_domain *mock = to_mock_domain(domain); WARN_ON(!xa_empty(&mock->pfns)); kfree(mock); @@ -377,8 +405,7 @@ static int mock_domain_map_pages(struct iommu_domain *domain, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); + struct mock_iommu_domain *mock = to_mock_domain(domain); unsigned long flags = MOCK_PFN_START_IOVA; unsigned long start_iova = iova; @@ -429,8 +456,7 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain, size_t pgcount, struct iommu_iotlb_gather *iotlb_gather) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); + struct mock_iommu_domain *mock = to_mock_domain(domain); bool first = true; size_t ret = 0; void *ent; @@ -478,8 +504,7 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain, static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); + struct mock_iommu_domain *mock = to_mock_domain(domain); void *ent; WARN_ON(iova % MOCK_IO_PAGE_SIZE); @@ -490,7 +515,7 @@ static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain, static bool mock_domain_capable(struct device *dev, enum iommu_cap cap) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + struct mock_dev *mdev = to_mock_dev(dev); switch (cap) { case IOMMU_CAP_CACHE_COHERENCY: @@ -504,14 +529,168 @@ static bool mock_domain_capable(struct device *dev, enum iommu_cap cap) return false; } -static struct iommu_device mock_iommu_device = { -}; +static struct iopf_queue *mock_iommu_iopf_queue; + +static struct mock_iommu_device { + struct iommu_device iommu_dev; + struct completion complete; + refcount_t users; +} mock_iommu; static struct iommu_device *mock_probe_device(struct device *dev) { if (dev->bus != &iommufd_mock_bus_type.bus) return ERR_PTR(-ENODEV); - return &mock_iommu_device; + return &mock_iommu.iommu_dev; +} + +static void mock_domain_page_response(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg) +{ +} + +static int mock_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) +{ + if (feat != IOMMU_DEV_FEAT_IOPF || !mock_iommu_iopf_queue) + return -ENODEV; + + return iopf_queue_add_device(mock_iommu_iopf_queue, dev); +} + +static int mock_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) +{ + if (feat != IOMMU_DEV_FEAT_IOPF || !mock_iommu_iopf_queue) + return -ENODEV; + + iopf_queue_remove_device(mock_iommu_iopf_queue, dev); + + return 0; +} + +static void mock_viommu_destroy(struct iommufd_viommu *viommu) +{ + struct mock_iommu_device *mock_iommu = container_of( + viommu->iommu_dev, struct mock_iommu_device, iommu_dev); + + if (refcount_dec_and_test(&mock_iommu->users)) + complete(&mock_iommu->complete); + + /* iommufd core frees mock_viommu and viommu */ +} + +static struct iommu_domain * +mock_viommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, + const struct 
iommu_user_data *user_data) +{ + struct mock_viommu *mock_viommu = to_mock_viommu(viommu); + struct mock_iommu_domain_nested *mock_nested; + + if (flags) + return ERR_PTR(-EOPNOTSUPP); + + mock_nested = __mock_domain_alloc_nested(user_data); + if (IS_ERR(mock_nested)) + return ERR_CAST(mock_nested); + mock_nested->mock_viommu = mock_viommu; + mock_nested->parent = mock_viommu->s2_parent; + return &mock_nested->domain; +} + +static int mock_viommu_cache_invalidate(struct iommufd_viommu *viommu, + struct iommu_user_data_array *array) +{ + struct iommu_viommu_invalidate_selftest *cmds; + struct iommu_viommu_invalidate_selftest *cur; + struct iommu_viommu_invalidate_selftest *end; + int rc; + + /* A zero-length array is allowed to validate the array type */ + if (array->entry_num == 0 && + array->type == IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST) { + array->entry_num = 0; + return 0; + } + + cmds = kcalloc(array->entry_num, sizeof(*cmds), GFP_KERNEL); + if (!cmds) + return -ENOMEM; + cur = cmds; + end = cmds + array->entry_num; + + static_assert(sizeof(*cmds) == 3 * sizeof(u32)); + rc = iommu_copy_struct_from_full_user_array( + cmds, sizeof(*cmds), array, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST); + if (rc) + goto out; + + while (cur != end) { + struct mock_dev *mdev; + struct device *dev; + int i; + + if (cur->flags & ~IOMMU_TEST_INVALIDATE_FLAG_ALL) { + rc = -EOPNOTSUPP; + goto out; + } + + if (cur->cache_id > MOCK_DEV_CACHE_ID_MAX) { + rc = -EINVAL; + goto out; + } + + xa_lock(&viommu->vdevs); + dev = iommufd_viommu_find_dev(viommu, + (unsigned long)cur->vdev_id); + if (!dev) { + xa_unlock(&viommu->vdevs); + rc = -EINVAL; + goto out; + } + mdev = container_of(dev, struct mock_dev, dev); + + if (cur->flags & IOMMU_TEST_INVALIDATE_FLAG_ALL) { + /* Invalidate all cache entries and ignore cache_id */ + for (i = 0; i < MOCK_DEV_CACHE_NUM; i++) + mdev->cache[i] = 0; + } else { + mdev->cache[cur->cache_id] = 0; + } + xa_unlock(&viommu->vdevs); + + cur++; + } +out: + array->entry_num = cur - cmds; + kfree(cmds); + return rc; +} + +static struct iommufd_viommu_ops mock_viommu_ops = { + .destroy = mock_viommu_destroy, + .alloc_domain_nested = mock_viommu_alloc_domain_nested, + .cache_invalidate = mock_viommu_cache_invalidate, +}; + +static struct iommufd_viommu *mock_viommu_alloc(struct device *dev, + struct iommu_domain *domain, + struct iommufd_ctx *ictx, + unsigned int viommu_type) +{ + struct mock_iommu_device *mock_iommu = + iommu_get_iommu_dev(dev, struct mock_iommu_device, iommu_dev); + struct mock_viommu *mock_viommu; + + if (viommu_type != IOMMU_VIOMMU_TYPE_SELFTEST) + return ERR_PTR(-EOPNOTSUPP); + + mock_viommu = iommufd_viommu_alloc(ictx, struct mock_viommu, core, + &mock_viommu_ops); + if (IS_ERR(mock_viommu)) + return ERR_CAST(mock_viommu); + + refcount_inc(&mock_iommu->users); + return &mock_viommu->core; } static const struct iommu_ops mock_ops = { @@ -524,11 +703,16 @@ static const struct iommu_ops mock_ops = { .owner = THIS_MODULE, .pgsize_bitmap = MOCK_IO_PAGE_SIZE, .hw_info = mock_domain_hw_info, - .domain_alloc_paging = mock_domain_alloc_paging, - .domain_alloc_user = mock_domain_alloc_user, + .domain_alloc_paging_flags = mock_domain_alloc_paging_flags, + .domain_alloc_nested = mock_domain_alloc_nested, .capable = mock_domain_capable, .device_group = generic_device_group, .probe_device = mock_probe_device, + .page_response = mock_domain_page_response, + .dev_enable_feat = mock_dev_enable_feat, + .dev_disable_feat = mock_dev_disable_feat, + .user_pasid_table = true, + .viommu_alloc = 
mock_viommu_alloc, .default_domain_ops = &(struct iommu_domain_ops){ .free = mock_domain_free, @@ -541,18 +725,14 @@ static const struct iommu_ops mock_ops = { static void mock_domain_free_nested(struct iommu_domain *domain) { - struct mock_iommu_domain_nested *mock_nested = - container_of(domain, struct mock_iommu_domain_nested, domain); - - kfree(mock_nested); + kfree(to_mock_nested(domain)); } static int mock_domain_cache_invalidate_user(struct iommu_domain *domain, struct iommu_user_data_array *array) { - struct mock_iommu_domain_nested *mock_nested = - container_of(domain, struct mock_iommu_domain_nested, domain); + struct mock_iommu_domain_nested *mock_nested = to_mock_nested(domain); struct iommu_hwpt_invalidate_selftest inv; u32 processed = 0; int i = 0, j; @@ -627,7 +807,7 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, iommufd_put_object(ucmd->ictx, &hwpt->obj); return ERR_PTR(-EINVAL); } - *mock = container_of(hwpt->domain, struct mock_iommu_domain, domain); + *mock = to_mock_domain(hwpt->domain); return hwpt; } @@ -645,14 +825,13 @@ get_md_pagetable_nested(struct iommufd_ucmd *ucmd, u32 mockpt_id, iommufd_put_object(ucmd->ictx, &hwpt->obj); return ERR_PTR(-EINVAL); } - *mock_nested = container_of(hwpt->domain, - struct mock_iommu_domain_nested, domain); + *mock_nested = to_mock_nested(hwpt->domain); return hwpt; } static void mock_dev_release(struct device *dev) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + struct mock_dev *mdev = to_mock_dev(dev); ida_free(&mock_dev_ida, mdev->id); kfree(mdev); @@ -661,7 +840,7 @@ static void mock_dev_release(struct device *dev) static struct mock_dev *mock_dev_create(unsigned long dev_flags) { struct mock_dev *mdev; - int rc; + int rc, i; if (dev_flags & ~(MOCK_FLAGS_DEVICE_NO_DIRTY | MOCK_FLAGS_DEVICE_HUGE_IOVA)) @@ -675,6 +854,8 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags) mdev->flags = dev_flags; mdev->dev.release = mock_dev_release; mdev->dev.bus = &iommufd_mock_bus_type.bus; + for (i = 0; i < MOCK_DEV_CACHE_NUM; i++) + mdev->cache[i] = IOMMU_TEST_DEV_CACHE_DEFAULT; rc = ida_alloc(&mock_dev_ida, GFP_KERNEL); if (rc < 0) @@ -783,7 +964,7 @@ static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd, if (IS_ERR(dev_obj)) return PTR_ERR(dev_obj); - sobj = container_of(dev_obj, struct selftest_obj, obj); + sobj = to_selftest_obj(dev_obj); if (sobj->type != TYPE_IDEV) { rc = -EINVAL; goto out_dev_obj; @@ -921,8 +1102,7 @@ static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd, if (IS_ERR(hwpt)) return PTR_ERR(hwpt); - mock_nested = container_of(hwpt->domain, - struct mock_iommu_domain_nested, domain); + mock_nested = to_mock_nested(hwpt->domain); if (iotlb_id > MOCK_NESTED_DOMAIN_IOTLB_ID_MAX || mock_nested->iotlb[iotlb_id] != iotlb) @@ -931,6 +1111,24 @@ static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd, return rc; } +static int iommufd_test_dev_check_cache(struct iommufd_ucmd *ucmd, u32 idev_id, + unsigned int cache_id, u32 cache) +{ + struct iommufd_device *idev; + struct mock_dev *mdev; + int rc = 0; + + idev = iommufd_get_device(ucmd, idev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + mdev = container_of(idev->dev, struct mock_dev, dev); + + if (cache_id > MOCK_DEV_CACHE_ID_MAX || mdev->cache[cache_id] != cache) + rc = -EINVAL; + iommufd_put_object(ucmd->ictx, &idev->obj); + return rc; +} + struct selftest_access { struct iommufd_access *access; struct file *file; @@ -1313,7 +1511,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd 
*ucmd, unsigned int mockpt_id, unsigned long page_size, void __user *uptr, u32 flags) { - unsigned long bitmap_size, i, max; + unsigned long i, max; struct iommu_test_cmd *cmd = ucmd->cmd; struct iommufd_hw_pagetable *hwpt; struct mock_iommu_domain *mock; @@ -1334,15 +1532,14 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, } max = length / page_size; - bitmap_size = max / BITS_PER_BYTE; - - tmp = kvzalloc(bitmap_size, GFP_KERNEL_ACCOUNT); + tmp = kvzalloc(DIV_ROUND_UP(max, BITS_PER_LONG) * sizeof(unsigned long), + GFP_KERNEL_ACCOUNT); if (!tmp) { rc = -ENOMEM; goto out_put; } - if (copy_from_user(tmp, uptr, bitmap_size)) { + if (copy_from_user(tmp, uptr,DIV_ROUND_UP(max, BITS_PER_BYTE))) { rc = -EFAULT; goto out_free; } @@ -1375,9 +1572,34 @@ out_put: return rc; } +static int iommufd_test_trigger_iopf(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + struct iopf_fault event = { }; + struct iommufd_device *idev; + + idev = iommufd_get_device(ucmd, cmd->trigger_iopf.dev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + + event.fault.prm.flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; + if (cmd->trigger_iopf.pasid != IOMMU_NO_PASID) + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + event.fault.type = IOMMU_FAULT_PAGE_REQ; + event.fault.prm.addr = cmd->trigger_iopf.addr; + event.fault.prm.pasid = cmd->trigger_iopf.pasid; + event.fault.prm.grpid = cmd->trigger_iopf.grpid; + event.fault.prm.perm = cmd->trigger_iopf.perm; + + iommu_report_device_fault(idev->dev, &event); + iommufd_put_object(ucmd->ictx, &idev->obj); + + return 0; +} + void iommufd_selftest_destroy(struct iommufd_object *obj) { - struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj); + struct selftest_obj *sobj = to_selftest_obj(obj); switch (sobj->type) { case TYPE_IDEV: @@ -1416,6 +1638,10 @@ int iommufd_test(struct iommufd_ucmd *ucmd) return iommufd_test_md_check_iotlb(ucmd, cmd->id, cmd->check_iotlb.id, cmd->check_iotlb.iotlb); + case IOMMU_TEST_OP_DEV_CHECK_CACHE: + return iommufd_test_dev_check_cache(ucmd, cmd->id, + cmd->check_dev_cache.id, + cmd->check_dev_cache.cache); case IOMMU_TEST_OP_CREATE_ACCESS: return iommufd_test_create_access(ucmd, cmd->id, cmd->create_access.flags); @@ -1450,6 +1676,8 @@ int iommufd_test(struct iommufd_ucmd *ucmd) cmd->dirty.page_size, u64_to_user_ptr(cmd->dirty.uptr), cmd->dirty.flags); + case IOMMU_TEST_OP_TRIGGER_IOPF: + return iommufd_test_trigger_iopf(ucmd, cmd); default: return -EOPNOTSUPP; } @@ -1480,21 +1708,27 @@ int __init iommufd_test_init(void) if (rc) goto err_platform; - rc = iommu_device_sysfs_add(&mock_iommu_device, + rc = iommu_device_sysfs_add(&mock_iommu.iommu_dev, &selftest_iommu_dev->dev, NULL, "%s", dev_name(&selftest_iommu_dev->dev)); if (rc) goto err_bus; - rc = iommu_device_register_bus(&mock_iommu_device, &mock_ops, + rc = iommu_device_register_bus(&mock_iommu.iommu_dev, &mock_ops, &iommufd_mock_bus_type.bus, &iommufd_mock_bus_type.nb); if (rc) goto err_sysfs; + + refcount_set(&mock_iommu.users, 1); + init_completion(&mock_iommu.complete); + + mock_iommu_iopf_queue = iopf_queue_alloc("mock-iopfq"); + return 0; err_sysfs: - iommu_device_sysfs_remove(&mock_iommu_device); + iommu_device_sysfs_remove(&mock_iommu.iommu_dev); err_bus: bus_unregister(&iommufd_mock_bus_type.bus); err_platform: @@ -1504,10 +1738,32 @@ err_dbgfs: return rc; } +static void iommufd_test_wait_for_users(void) +{ + if (refcount_dec_and_test(&mock_iommu.users)) + return; + /* + * Time out waiting for iommu device user 
count to become 0. + * + * Note that this is just making an example here, since the selftest is + * built into the iommufd module, i.e. it only unplugs the iommu device + * when unloading the module. So, it is expected that this WARN_ON will + * not trigger, as long as any iommufd FDs are open. + */ + WARN_ON(!wait_for_completion_timeout(&mock_iommu.complete, + msecs_to_jiffies(10000))); +} + void iommufd_test_exit(void) { - iommu_device_sysfs_remove(&mock_iommu_device); - iommu_device_unregister_bus(&mock_iommu_device, + if (mock_iommu_iopf_queue) { + iopf_queue_free(mock_iommu_iopf_queue); + mock_iommu_iopf_queue = NULL; + } + + iommufd_test_wait_for_users(); + iommu_device_sysfs_remove(&mock_iommu.iommu_dev); + iommu_device_unregister_bus(&mock_iommu.iommu_dev, &iommufd_mock_bus_type.bus, &iommufd_mock_bus_type.nb); bus_unregister(&iommufd_mock_bus_type.bus); diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c index a3ad5f0b6c59..a258ee2f4579 100644 --- a/drivers/iommu/iommufd/vfio_compat.c +++ b/drivers/iommu/iommufd/vfio_compat.c @@ -44,7 +44,7 @@ int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id) iommufd_put_object(ictx, &ioas->obj); return 0; } -EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, IOMMUFD_VFIO); +EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, "IOMMUFD_VFIO"); /** * iommufd_vfio_compat_set_no_iommu - Called when a no-iommu device is attached @@ -66,7 +66,7 @@ int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx) xa_unlock(&ictx->objects); return ret; } -EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_set_no_iommu, IOMMUFD_VFIO); +EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_set_no_iommu, "IOMMUFD_VFIO"); /** * iommufd_vfio_compat_ioas_create - Ensure the compat IOAS is created @@ -118,7 +118,7 @@ out_abort: iommufd_object_abort(ictx, &ioas->obj); return ret; } -EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create, IOMMUFD_VFIO); +EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create, "IOMMUFD_VFIO"); int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd) { @@ -291,12 +291,7 @@ static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx, case VFIO_DMA_CC_IOMMU: return iommufd_vfio_cc_iommu(ictx); - /* - * This is obsolete, and to be removed from VFIO. It was an incomplete - * idea that got merged. 
- * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/ - */ - case VFIO_TYPE1_NESTING_IOMMU: + case __VFIO_RESERVED_TYPE1_NESTING_IOMMU: return 0; /* diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c new file mode 100644 index 000000000000..69b88e8c7c26 --- /dev/null +++ b/drivers/iommu/iommufd/viommu.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ +#include "iommufd_private.h" + +void iommufd_viommu_destroy(struct iommufd_object *obj) +{ + struct iommufd_viommu *viommu = + container_of(obj, struct iommufd_viommu, obj); + + if (viommu->ops && viommu->ops->destroy) + viommu->ops->destroy(viommu); + refcount_dec(&viommu->hwpt->common.obj.users); + xa_destroy(&viommu->vdevs); +} + +int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) +{ + struct iommu_viommu_alloc *cmd = ucmd->cmd; + struct iommufd_hwpt_paging *hwpt_paging; + struct iommufd_viommu *viommu; + struct iommufd_device *idev; + const struct iommu_ops *ops; + int rc; + + if (cmd->flags || cmd->type == IOMMU_VIOMMU_TYPE_DEFAULT) + return -EOPNOTSUPP; + + idev = iommufd_get_device(ucmd, cmd->dev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + + ops = dev_iommu_ops(idev->dev); + if (!ops->viommu_alloc) { + rc = -EOPNOTSUPP; + goto out_put_idev; + } + + hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id); + if (IS_ERR(hwpt_paging)) { + rc = PTR_ERR(hwpt_paging); + goto out_put_idev; + } + + if (!hwpt_paging->nest_parent) { + rc = -EINVAL; + goto out_put_hwpt; + } + + viommu = ops->viommu_alloc(idev->dev, hwpt_paging->common.domain, + ucmd->ictx, cmd->type); + if (IS_ERR(viommu)) { + rc = PTR_ERR(viommu); + goto out_put_hwpt; + } + + xa_init(&viommu->vdevs); + viommu->type = cmd->type; + viommu->ictx = ucmd->ictx; + viommu->hwpt = hwpt_paging; + refcount_inc(&viommu->hwpt->common.obj.users); + /* + * It is the most likely case that a physical IOMMU is unpluggable. A + * pluggable IOMMU instance (if exists) is responsible for refcounting + * on its own. 
+ */ + viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev); + + cmd->out_viommu_id = viommu->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_abort; + iommufd_object_finalize(ucmd->ictx, &viommu->obj); + goto out_put_hwpt; + +out_abort: + iommufd_object_abort_and_destroy(ucmd->ictx, &viommu->obj); +out_put_hwpt: + iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj); +out_put_idev: + iommufd_put_object(ucmd->ictx, &idev->obj); + return rc; +} + +void iommufd_vdevice_destroy(struct iommufd_object *obj) +{ + struct iommufd_vdevice *vdev = + container_of(obj, struct iommufd_vdevice, obj); + struct iommufd_viommu *viommu = vdev->viommu; + + /* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */ + xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL); + refcount_dec(&viommu->obj.users); + put_device(vdev->dev); +} + +int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) +{ + struct iommu_vdevice_alloc *cmd = ucmd->cmd; + struct iommufd_vdevice *vdev, *curr; + struct iommufd_viommu *viommu; + struct iommufd_device *idev; + u64 virt_id = cmd->virt_id; + int rc = 0; + + /* virt_id indexes an xarray */ + if (virt_id > ULONG_MAX) + return -EINVAL; + + viommu = iommufd_get_viommu(ucmd, cmd->viommu_id); + if (IS_ERR(viommu)) + return PTR_ERR(viommu); + + idev = iommufd_get_device(ucmd, cmd->dev_id); + if (IS_ERR(idev)) { + rc = PTR_ERR(idev); + goto out_put_viommu; + } + + if (viommu->iommu_dev != __iommu_get_iommu_dev(idev->dev)) { + rc = -EINVAL; + goto out_put_idev; + } + + vdev = iommufd_object_alloc(ucmd->ictx, vdev, IOMMUFD_OBJ_VDEVICE); + if (IS_ERR(vdev)) { + rc = PTR_ERR(vdev); + goto out_put_idev; + } + + vdev->id = virt_id; + vdev->dev = idev->dev; + get_device(idev->dev); + vdev->viommu = viommu; + refcount_inc(&viommu->obj.users); + + curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, GFP_KERNEL); + if (curr) { + rc = xa_err(curr) ?: -EEXIST; + goto out_abort; + } + + cmd->out_vdevice_id = vdev->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_abort; + iommufd_object_finalize(ucmd->ictx, &vdev->obj); + goto out_put_idev; + +out_abort: + iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj); +out_put_idev: + iommufd_put_object(ucmd->ictx, &idev->obj); +out_put_viommu: + iommufd_put_object(ucmd->ictx, &viommu->obj); + return rc; +} |
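For context on how the new vIOMMU and vDEVICE objects added by viommu.c above are meant to be driven, the following is a minimal userspace sketch, not part of the patch: it assumes the IOMMU_VIOMMU_ALLOC and IOMMU_VDEVICE_ALLOC command names and struct layouts from include/uapi/linux/iommufd.h in this series, an already-open /dev/iommu file descriptor, a device previously bound to iommufd, and a nesting-parent paging HWPT; dev_id, hwpt_id and virt_id are placeholders. As the ioctl handlers above show, flags must be zero, type must not be IOMMU_VIOMMU_TYPE_DEFAULT, and virt_id must fit in an unsigned long because it indexes the vdevs xarray.

/*
 * Hypothetical usage sketch (not from the patch): allocate a vIOMMU on top
 * of a nesting-parent HWPT, then register a virtual device ID on it.
 * Struct and ioctl names are assumed to match include/uapi/linux/iommufd.h.
 */
#include <errno.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

static int alloc_viommu_and_vdevice(int iommufd, __u32 dev_id, __u32 hwpt_id,
                                    __u32 viommu_type, __u64 virt_id,
                                    __u32 *viommu_id, __u32 *vdevice_id)
{
        struct iommu_viommu_alloc viommu = {
                .size = sizeof(viommu),
                .flags = 0,             /* non-zero flags are rejected */
                .type = viommu_type,    /* must not be IOMMU_VIOMMU_TYPE_DEFAULT */
                .dev_id = dev_id,       /* device whose physical IOMMU backs the vIOMMU */
                .hwpt_id = hwpt_id,     /* must be a nesting-parent paging HWPT */
        };
        struct iommu_vdevice_alloc vdev = {
                .size = sizeof(vdev),
                .dev_id = dev_id,
                .virt_id = virt_id,     /* guest-visible ID, e.g. a vSID/vRID */
        };

        if (ioctl(iommufd, IOMMU_VIOMMU_ALLOC, &viommu))
                return -errno;

        vdev.viommu_id = viommu.out_viommu_id;
        if (ioctl(iommufd, IOMMU_VDEVICE_ALLOC, &vdev))
                return -errno;  /* the vIOMMU object remains and must be destroyed separately */

        *viommu_id = viommu.out_viommu_id;
        *vdevice_id = vdev.out_vdevice_id;
        return 0;
}

Following the refcounting in iommufd_viommu_destroy() and iommufd_vdevice_destroy(), teardown would go child before parent: destroy the vDEVICE, then the vIOMMU, then the nesting-parent HWPT (e.g. via IOMMU_DESTROY on each object ID).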