diff options
Diffstat (limited to 'drivers/vfio')
-rw-r--r-- | drivers/vfio/Kconfig | 31 | ||||
-rw-r--r-- | drivers/vfio/Makefile | 11 | ||||
-rw-r--r-- | drivers/vfio/container.c | 145 | ||||
-rw-r--r-- | drivers/vfio/fsl-mc/vfio_fsl_mc.c | 4 | ||||
-rw-r--r-- | drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c | 1 | ||||
-rw-r--r-- | drivers/vfio/group.c | 877 | ||||
-rw-r--r-- | drivers/vfio/iommufd.c | 158 | ||||
-rw-r--r-- | drivers/vfio/iova_bitmap.c | 33 | ||||
-rw-r--r-- | drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 162 | ||||
-rw-r--r-- | drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 2 | ||||
-rw-r--r-- | drivers/vfio/pci/mlx5/cmd.c | 413 | ||||
-rw-r--r-- | drivers/vfio/pci/mlx5/cmd.h | 96 | ||||
-rw-r--r-- | drivers/vfio/pci/mlx5/main.c | 787 | ||||
-rw-r--r-- | drivers/vfio/pci/vfio_pci.c | 3 | ||||
-rw-r--r-- | drivers/vfio/pci/vfio_pci_core.c | 25 | ||||
-rw-r--r-- | drivers/vfio/platform/vfio_amba.c | 4 | ||||
-rw-r--r-- | drivers/vfio/platform/vfio_platform.c | 4 | ||||
-rw-r--r-- | drivers/vfio/platform/vfio_platform_common.c | 3 | ||||
-rw-r--r-- | drivers/vfio/vfio.h | 133 | ||||
-rw-r--r-- | drivers/vfio/vfio_iommu_spapr_tce.c | 65 | ||||
-rw-r--r-- | drivers/vfio/vfio_main.c | 1126 | ||||
-rw-r--r-- | drivers/vfio/vfio_spapr_eeh.c | 107 | ||||
-rw-r--r-- | drivers/vfio/virqfd.c | 17 |
23 files changed, 2875 insertions, 1332 deletions
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 86c381ceb9a1..a8f544629467 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -2,8 +2,9 @@ menuconfig VFIO tristate "VFIO Non-Privileged userspace driver framework" select IOMMU_API - select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64) + depends on IOMMUFD || !IOMMUFD select INTERVAL_TREE + select VFIO_CONTAINER if IOMMUFD=n help VFIO provides a framework for secure userspace device drivers. See Documentation/driver-api/vfio.rst for more details. @@ -11,6 +12,18 @@ menuconfig VFIO If you don't know what to do here, say N. if VFIO +config VFIO_CONTAINER + bool "Support for the VFIO container /dev/vfio/vfio" + select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64) + default y + help + The VFIO container is the classic interface to VFIO for establishing + IOMMU mappings. If N is selected here then IOMMUFD must be used to + manage the mappings. + + Unless testing IOMMUFD say Y here. + +if VFIO_CONTAINER config VFIO_IOMMU_TYPE1 tristate default n @@ -20,16 +33,6 @@ config VFIO_IOMMU_SPAPR_TCE depends on SPAPR_TCE_IOMMU default VFIO -config VFIO_SPAPR_EEH - tristate - depends on EEH && VFIO_IOMMU_SPAPR_TCE - default VFIO - -config VFIO_VIRQFD - tristate - select EVENTFD - default n - config VFIO_NOIOMMU bool "VFIO No-IOMMU support" help @@ -43,6 +46,12 @@ config VFIO_NOIOMMU this mode since there is no IOMMU to provide DMA translation. If you don't know what to do here, say N. 
+endif + +config VFIO_VIRQFD + bool + select EVENTFD + default n source "drivers/vfio/pci/Kconfig" source "drivers/vfio/platform/Kconfig" diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index b693a1169286..70e7dcb302ef 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -1,16 +1,15 @@ # SPDX-License-Identifier: GPL-2.0 -vfio_virqfd-y := virqfd.o - obj-$(CONFIG_VFIO) += vfio.o vfio-y += vfio_main.o \ - iova_bitmap.o \ - container.o + group.o \ + iova_bitmap.o +vfio-$(CONFIG_IOMMUFD) += iommufd.o +vfio-$(CONFIG_VFIO_CONTAINER) += container.o +vfio-$(CONFIG_VFIO_VIRQFD) += virqfd.o -obj-$(CONFIG_VFIO_VIRQFD) += vfio_virqfd.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o -obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o obj-$(CONFIG_VFIO_PCI) += pci/ obj-$(CONFIG_VFIO_PLATFORM) += platform/ obj-$(CONFIG_VFIO_MDEV) += mdev/ diff --git a/drivers/vfio/container.c b/drivers/vfio/container.c index d74164abbf40..b7a9560ab25e 100644 --- a/drivers/vfio/container.c +++ b/drivers/vfio/container.c @@ -188,8 +188,9 @@ void vfio_device_container_unregister(struct vfio_device *device) device->group->container->iommu_data, device); } -long vfio_container_ioctl_check_extension(struct vfio_container *container, - unsigned long arg) +static long +vfio_container_ioctl_check_extension(struct vfio_container *container, + unsigned long arg) { struct vfio_iommu_driver *driver; long ret = 0; @@ -511,14 +512,15 @@ void vfio_group_detach_container(struct vfio_group *group) vfio_container_put(container); } -int vfio_device_assign_container(struct vfio_device *device) +int vfio_group_use_container(struct vfio_group *group) { - struct vfio_group *group = device->group; - lockdep_assert_held(&group->group_lock); - if (!group->container || !group->container->iommu_driver || - WARN_ON(!group->container_users)) + /* + * The container fd has been assigned with VFIO_GROUP_SET_CONTAINER but + * VFIO_SET_IOMMU 
hasn't been done yet. + */ + if (!group->container->iommu_driver) return -EINVAL; if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) @@ -529,122 +531,56 @@ int vfio_device_assign_container(struct vfio_device *device) return 0; } -void vfio_device_unassign_container(struct vfio_device *device) +void vfio_group_unuse_container(struct vfio_group *group) { - mutex_lock(&device->group->group_lock); - WARN_ON(device->group->container_users <= 1); - device->group->container_users--; - fput(device->group->opened_file); - mutex_unlock(&device->group->group_lock); + lockdep_assert_held(&group->group_lock); + + WARN_ON(group->container_users <= 1); + group->container_users--; + fput(group->opened_file); } -/* - * Pin contiguous user pages and return their associated host pages for local - * domain only. - * @device [in] : device - * @iova [in] : starting IOVA of user pages to be pinned. - * @npage [in] : count of pages to be pinned. This count should not - * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. - * @prot [in] : protection flags - * @pages[out] : array of host pages - * Return error or number of pages pinned. - * - * A driver may only call this function if the vfio_device was created - * by vfio_register_emulated_iommu_dev(). 
- */ -int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, - int npage, int prot, struct page **pages) +int vfio_device_container_pin_pages(struct vfio_device *device, + dma_addr_t iova, int npage, + int prot, struct page **pages) { - struct vfio_container *container; - struct vfio_group *group = device->group; - struct vfio_iommu_driver *driver; - int ret; - - if (!pages || !npage || !vfio_assert_device_open(device)) - return -EINVAL; + struct vfio_container *container = device->group->container; + struct iommu_group *iommu_group = device->group->iommu_group; + struct vfio_iommu_driver *driver = container->iommu_driver; if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) return -E2BIG; - /* group->container cannot change while a vfio device is open */ - container = group->container; - driver = container->iommu_driver; - if (likely(driver && driver->ops->pin_pages)) - ret = driver->ops->pin_pages(container->iommu_data, - group->iommu_group, iova, - npage, prot, pages); - else - ret = -ENOTTY; - - return ret; + if (unlikely(!driver || !driver->ops->pin_pages)) + return -ENOTTY; + return driver->ops->pin_pages(container->iommu_data, iommu_group, iova, + npage, prot, pages); } -EXPORT_SYMBOL(vfio_pin_pages); -/* - * Unpin contiguous host pages for local domain only. - * @device [in] : device - * @iova [in] : starting address of user pages to be unpinned. - * @npage [in] : count of pages to be unpinned. This count should not - * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. 
- */ -void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) +void vfio_device_container_unpin_pages(struct vfio_device *device, + dma_addr_t iova, int npage) { - struct vfio_container *container; - struct vfio_iommu_driver *driver; + struct vfio_container *container = device->group->container; if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES)) return; - if (WARN_ON(!vfio_assert_device_open(device))) - return; - - /* group->container cannot change while a vfio device is open */ - container = device->group->container; - driver = container->iommu_driver; - - driver->ops->unpin_pages(container->iommu_data, iova, npage); + container->iommu_driver->ops->unpin_pages(container->iommu_data, iova, + npage); } -EXPORT_SYMBOL(vfio_unpin_pages); -/* - * This interface allows the CPUs to perform some sort of virtual DMA on - * behalf of the device. - * - * CPUs read/write from/into a range of IOVAs pointing to user space memory - * into/from a kernel buffer. - * - * As the read/write of user space memory is conducted via the CPUs and is - * not a real device DMA, it is not necessary to pin the user space memory. - * - * @device [in] : VFIO device - * @iova [in] : base IOVA of a user space buffer - * @data [in] : pointer to kernel buffer - * @len [in] : kernel buffer length - * @write : indicate read or write - * Return error code on failure or 0 on success. 
- */ -int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data, - size_t len, bool write) +int vfio_device_container_dma_rw(struct vfio_device *device, + dma_addr_t iova, void *data, + size_t len, bool write) { - struct vfio_container *container; - struct vfio_iommu_driver *driver; - int ret = 0; - - if (!data || len <= 0 || !vfio_assert_device_open(device)) - return -EINVAL; - - /* group->container cannot change while a vfio device is open */ - container = device->group->container; - driver = container->iommu_driver; + struct vfio_container *container = device->group->container; + struct vfio_iommu_driver *driver = container->iommu_driver; - if (likely(driver && driver->ops->dma_rw)) - ret = driver->ops->dma_rw(container->iommu_data, - iova, data, len, write); - else - ret = -ENOTTY; - return ret; + if (unlikely(!driver || !driver->ops->dma_rw)) + return -ENOTTY; + return driver->ops->dma_rw(container->iommu_data, iova, data, len, + write); } -EXPORT_SYMBOL(vfio_dma_rw); int __init vfio_container_init(void) { @@ -678,3 +614,6 @@ void vfio_container_cleanup(void) misc_deregister(&vfio_dev); mutex_destroy(&vfio.iommu_drivers_lock); } + +MODULE_ALIAS_MISCDEV(VFIO_MINOR); +MODULE_ALIAS("devname:vfio/vfio"); diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index b16874e913e4..defeb8510ace 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -568,7 +568,6 @@ static void vfio_fsl_mc_release_dev(struct vfio_device *core_vdev) vfio_fsl_uninit_device(vdev); mutex_destroy(&vdev->igate); - vfio_free_device(core_vdev); } static int vfio_fsl_mc_remove(struct fsl_mc_device *mc_dev) @@ -592,6 +591,9 @@ static const struct vfio_device_ops vfio_fsl_mc_ops = { .read = vfio_fsl_mc_read, .write = vfio_fsl_mc_write, .mmap = vfio_fsl_mc_mmap, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static 
struct fsl_mc_driver vfio_fsl_mc_driver = { diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c index 7b428eac3d3e..64d01f3fb13d 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c @@ -8,7 +8,6 @@ #include <linux/slab.h> #include <linux/types.h> #include <linux/eventfd.h> -#include <linux/msi.h> #include "linux/fsl/mc.h" #include "vfio_fsl_mc_private.h" diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c new file mode 100644 index 000000000000..bb24b2f0271e --- /dev/null +++ b/drivers/vfio/group.c @@ -0,0 +1,877 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VFIO core + * + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson <alex.williamson@redhat.com> + * + * Derived from original vfio: + * Copyright 2010 Cisco Systems, Inc. All rights reserved. + * Author: Tom Lyon, pugs@cisco.com + */ + +#include <linux/vfio.h> +#include <linux/iommufd.h> +#include <linux/anon_inodes.h> +#include "vfio.h" + +static struct vfio { + struct class *class; + struct list_head group_list; + struct mutex group_lock; /* locks group_list */ + struct ida group_ida; + dev_t group_devt; +} vfio; + +static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, + char *buf) +{ + struct vfio_device *it, *device = ERR_PTR(-ENODEV); + + mutex_lock(&group->device_lock); + list_for_each_entry(it, &group->device_list, group_next) { + int ret; + + if (it->ops->match) { + ret = it->ops->match(it, buf); + if (ret < 0) { + device = ERR_PTR(ret); + break; + } + } else { + ret = !strcmp(dev_name(it->dev), buf); + } + + if (ret && vfio_device_try_get_registration(it)) { + device = it; + break; + } + } + mutex_unlock(&group->device_lock); + + return device; +} + +/* + * VFIO Group fd, /dev/vfio/$GROUP + */ +static bool vfio_group_has_iommu(struct vfio_group *group) +{ + lockdep_assert_held(&group->group_lock); + /* + * There can only be users if there is a 
container, and if there is a + * container there must be users. + */ + WARN_ON(!group->container != !group->container_users); + + return group->container || group->iommufd; +} + +/* + * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or + * if there was no container to unset. Since the ioctl is called on + * the group, we know that still exists, therefore the only valid + * transition here is 1->0. + */ +static int vfio_group_ioctl_unset_container(struct vfio_group *group) +{ + int ret = 0; + + mutex_lock(&group->group_lock); + if (!vfio_group_has_iommu(group)) { + ret = -EINVAL; + goto out_unlock; + } + if (group->container) { + if (group->container_users != 1) { + ret = -EBUSY; + goto out_unlock; + } + vfio_group_detach_container(group); + } + if (group->iommufd) { + iommufd_ctx_put(group->iommufd); + group->iommufd = NULL; + } + +out_unlock: + mutex_unlock(&group->group_lock); + return ret; +} + +static int vfio_group_ioctl_set_container(struct vfio_group *group, + int __user *arg) +{ + struct vfio_container *container; + struct iommufd_ctx *iommufd; + struct fd f; + int ret; + int fd; + + if (get_user(fd, arg)) + return -EFAULT; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + mutex_lock(&group->group_lock); + if (vfio_group_has_iommu(group)) { + ret = -EINVAL; + goto out_unlock; + } + if (!group->iommu_group) { + ret = -ENODEV; + goto out_unlock; + } + + container = vfio_container_from_file(f.file); + if (container) { + ret = vfio_container_attach_group(container, group); + goto out_unlock; + } + + iommufd = iommufd_ctx_from_file(f.file); + if (!IS_ERR(iommufd)) { + u32 ioas_id; + + ret = iommufd_vfio_compat_ioas_id(iommufd, &ioas_id); + if (ret) { + iommufd_ctx_put(iommufd); /* group->iommufd is NULL */ + goto out_unlock; + } + + group->iommufd = iommufd; + goto out_unlock; + } + + /* The FD passed is not recognized.
*/ + ret = -EBADFD; + +out_unlock: + mutex_unlock(&group->group_lock); + fdput(f); + return ret; +} + +static int vfio_device_group_open(struct vfio_device *device) +{ + int ret; + + mutex_lock(&device->group->group_lock); + if (!vfio_group_has_iommu(device->group)) { + ret = -EINVAL; + goto out_unlock; + } + + /* + * Here we pass the KVM pointer with the group under the lock. If the + * device driver will use it, it must obtain a reference and release it + * during close_device. + */ + ret = vfio_device_open(device, device->group->iommufd, + device->group->kvm); + +out_unlock: + mutex_unlock(&device->group->group_lock); + return ret; +} + +void vfio_device_group_close(struct vfio_device *device) +{ + mutex_lock(&device->group->group_lock); + vfio_device_close(device, device->group->iommufd); + mutex_unlock(&device->group->group_lock); +} + +static struct file *vfio_device_open_file(struct vfio_device *device) +{ + struct file *filep; + int ret; + + ret = vfio_device_group_open(device); + if (ret) + goto err_out; + + /* + * We can't use anon_inode_getfd() because we need to modify + * the f_mode flags directly to allow more than just ioctls + */ + filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, + device, O_RDWR); + if (IS_ERR(filep)) { + ret = PTR_ERR(filep); + goto err_close_device; + } + + /* + * TODO: add an anon_inode interface to do this. + * Appears to be missing by lack of need rather than + * explicitly prevented. Now there's need. 
*/ + filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE); + + if (device->group->type == VFIO_NO_IOMMU) + dev_warn(device->dev, "vfio-noiommu device opened by user " + "(%s:%d)\n", current->comm, task_pid_nr(current)); + /* + * On success the ref of device is moved to the file and + * put in vfio_device_fops_release() + */ + return filep; + +err_close_device: + vfio_device_group_close(device); +err_out: + return ERR_PTR(ret); +} + +static int vfio_group_ioctl_get_device_fd(struct vfio_group *group, + char __user *arg) +{ + struct vfio_device *device; + struct file *filep; + char *buf; + int fdno; + int ret; + + buf = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + device = vfio_device_get_from_name(group, buf); + kfree(buf); + if (IS_ERR(device)) + return PTR_ERR(device); + + fdno = get_unused_fd_flags(O_CLOEXEC); + if (fdno < 0) { + ret = fdno; + goto err_put_device; + } + + filep = vfio_device_open_file(device); + if (IS_ERR(filep)) { + ret = PTR_ERR(filep); + goto err_put_fdno; + } + + fd_install(fdno, filep); + return fdno; + +err_put_fdno: + put_unused_fd(fdno); +err_put_device: + vfio_device_put_registration(device); + return ret; +} + +static int vfio_group_ioctl_get_status(struct vfio_group *group, + struct vfio_group_status __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_group_status, flags); + struct vfio_group_status status; + + if (copy_from_user(&status, arg, minsz)) + return -EFAULT; + + if (status.argsz < minsz) + return -EINVAL; + + status.flags = 0; + + mutex_lock(&group->group_lock); + if (!group->iommu_group) { + mutex_unlock(&group->group_lock); + return -ENODEV; + } + + /* + * With the container FD the iommu_group_claim_dma_owner() is done + * during SET_CONTAINER but for IOMMUFD this is done during + * VFIO_GROUP_GET_DEVICE_FD. Meaning that with iommufd + * VFIO_GROUP_FLAGS_VIABLE could be set but GET_DEVICE_FD will fail due + * to viability.
+ */ + if (vfio_group_has_iommu(group)) + status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | + VFIO_GROUP_FLAGS_VIABLE; + else if (!iommu_group_dma_owner_claimed(group->iommu_group)) + status.flags |= VFIO_GROUP_FLAGS_VIABLE; + mutex_unlock(&group->group_lock); + + if (copy_to_user(arg, &status, minsz)) + return -EFAULT; + return 0; +} + +static long vfio_group_fops_unl_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + struct vfio_group *group = filep->private_data; + void __user *uarg = (void __user *)arg; + + switch (cmd) { + case VFIO_GROUP_GET_DEVICE_FD: + return vfio_group_ioctl_get_device_fd(group, uarg); + case VFIO_GROUP_GET_STATUS: + return vfio_group_ioctl_get_status(group, uarg); + case VFIO_GROUP_SET_CONTAINER: + return vfio_group_ioctl_set_container(group, uarg); + case VFIO_GROUP_UNSET_CONTAINER: + return vfio_group_ioctl_unset_container(group); + default: + return -ENOTTY; + } +} + +static int vfio_group_fops_open(struct inode *inode, struct file *filep) +{ + struct vfio_group *group = + container_of(inode->i_cdev, struct vfio_group, cdev); + int ret; + + mutex_lock(&group->group_lock); + + /* + * drivers can be zero if this races with vfio_device_remove_group(), it + * will be stable at 0 under the group rwsem + */ + if (refcount_read(&group->drivers) == 0) { + ret = -ENODEV; + goto out_unlock; + } + + if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) { + ret = -EPERM; + goto out_unlock; + } + + /* + * Do we need multiple instances of the group open? Seems not. 
+ */ + if (group->opened_file) { + ret = -EBUSY; + goto out_unlock; + } + group->opened_file = filep; + filep->private_data = group; + ret = 0; +out_unlock: + mutex_unlock(&group->group_lock); + return ret; +} + +static int vfio_group_fops_release(struct inode *inode, struct file *filep) +{ + struct vfio_group *group = filep->private_data; + + filep->private_data = NULL; + + mutex_lock(&group->group_lock); + /* + * Device FDs hold a group file reference, therefore the group release + * is only called when there are no open devices. + */ + WARN_ON(group->notifier.head); + if (group->container) + vfio_group_detach_container(group); + if (group->iommufd) { + iommufd_ctx_put(group->iommufd); + group->iommufd = NULL; + } + group->opened_file = NULL; + mutex_unlock(&group->group_lock); + return 0; +} + +static const struct file_operations vfio_group_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = vfio_group_fops_unl_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .open = vfio_group_fops_open, + .release = vfio_group_fops_release, +}; + +/* + * Group objects - create, release, get, put, search + */ +static struct vfio_group * +vfio_group_find_from_iommu(struct iommu_group *iommu_group) +{ + struct vfio_group *group; + + lockdep_assert_held(&vfio.group_lock); + + /* + * group->iommu_group from the vfio.group_list cannot be NULL + * under the vfio.group_lock. 
+ */ + list_for_each_entry(group, &vfio.group_list, vfio_next) { + if (group->iommu_group == iommu_group) + return group; + } + return NULL; +} + +static void vfio_group_release(struct device *dev) +{ + struct vfio_group *group = container_of(dev, struct vfio_group, dev); + + mutex_destroy(&group->device_lock); + mutex_destroy(&group->group_lock); + WARN_ON(group->iommu_group); + ida_free(&vfio.group_ida, MINOR(group->dev.devt)); + kfree(group); +} + +static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, + enum vfio_group_type type) +{ + struct vfio_group *group; + int minor; + + group = kzalloc(sizeof(*group), GFP_KERNEL); + if (!group) + return ERR_PTR(-ENOMEM); + + minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL); + if (minor < 0) { + kfree(group); + return ERR_PTR(minor); + } + + device_initialize(&group->dev); + group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor); + group->dev.class = vfio.class; + group->dev.release = vfio_group_release; + cdev_init(&group->cdev, &vfio_group_fops); + group->cdev.owner = THIS_MODULE; + + refcount_set(&group->drivers, 1); + mutex_init(&group->group_lock); + INIT_LIST_HEAD(&group->device_list); + mutex_init(&group->device_lock); + group->iommu_group = iommu_group; + /* put in vfio_group_release() */ + iommu_group_ref_get(iommu_group); + group->type = type; + BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); + + return group; +} + +static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, + enum vfio_group_type type) +{ + struct vfio_group *group; + struct vfio_group *ret; + int err; + + lockdep_assert_held(&vfio.group_lock); + + group = vfio_group_alloc(iommu_group, type); + if (IS_ERR(group)) + return group; + + err = dev_set_name(&group->dev, "%s%d", + group->type == VFIO_NO_IOMMU ? 
"noiommu-" : "", + iommu_group_id(iommu_group)); + if (err) { + ret = ERR_PTR(err); + goto err_put; + } + + err = cdev_device_add(&group->cdev, &group->dev); + if (err) { + ret = ERR_PTR(err); + goto err_put; + } + + list_add(&group->vfio_next, &vfio.group_list); + + return group; + +err_put: + put_device(&group->dev); + return ret; +} + +static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev, + enum vfio_group_type type) +{ + struct iommu_group *iommu_group; + struct vfio_group *group; + int ret; + + iommu_group = iommu_group_alloc(); + if (IS_ERR(iommu_group)) + return ERR_CAST(iommu_group); + + ret = iommu_group_set_name(iommu_group, "vfio-noiommu"); + if (ret) + goto out_put_group; + ret = iommu_group_add_device(iommu_group, dev); + if (ret) + goto out_put_group; + + mutex_lock(&vfio.group_lock); + group = vfio_create_group(iommu_group, type); + mutex_unlock(&vfio.group_lock); + if (IS_ERR(group)) { + ret = PTR_ERR(group); + goto out_remove_device; + } + iommu_group_put(iommu_group); + return group; + +out_remove_device: + iommu_group_remove_device(dev); +out_put_group: + iommu_group_put(iommu_group); + return ERR_PTR(ret); +} + +static bool vfio_group_has_device(struct vfio_group *group, struct device *dev) +{ + struct vfio_device *device; + + mutex_lock(&group->device_lock); + list_for_each_entry(device, &group->device_list, group_next) { + if (device->dev == dev) { + mutex_unlock(&group->device_lock); + return true; + } + } + mutex_unlock(&group->device_lock); + return false; +} + +static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) +{ + struct iommu_group *iommu_group; + struct vfio_group *group; + + iommu_group = iommu_group_get(dev); + if (!iommu_group && vfio_noiommu) { + /* + * With noiommu enabled, create an IOMMU group for devices that + * don't already have one, implying no IOMMU hardware/driver + * exists. Taint the kernel because we're about to give a DMA + * capable device to a user without IOMMU protection. 
+ */ + group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU); + if (!IS_ERR(group)) { + add_taint(TAINT_USER, LOCKDEP_STILL_OK); + dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n"); + } + return group; + } + + if (!iommu_group) + return ERR_PTR(-EINVAL); + + /* + * VFIO always sets IOMMU_CACHE because we offer no way for userspace to + * restore cache coherency. It has to be checked here because it is only + * valid for cases where we are using iommu groups. + */ + if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) { + iommu_group_put(iommu_group); + return ERR_PTR(-EINVAL); + } + + mutex_lock(&vfio.group_lock); + group = vfio_group_find_from_iommu(iommu_group); + if (group) { + if (WARN_ON(vfio_group_has_device(group, dev))) + group = ERR_PTR(-EINVAL); + else + refcount_inc(&group->drivers); + } else { + group = vfio_create_group(iommu_group, VFIO_IOMMU); + } + mutex_unlock(&vfio.group_lock); + + /* The vfio_group holds a reference to the iommu_group */ + iommu_group_put(iommu_group); + return group; +} + +int vfio_device_set_group(struct vfio_device *device, + enum vfio_group_type type) +{ + struct vfio_group *group; + + if (type == VFIO_IOMMU) + group = vfio_group_find_or_alloc(device->dev); + else + group = vfio_noiommu_group_alloc(device->dev, type); + + if (IS_ERR(group)) + return PTR_ERR(group); + + /* Our reference on group is moved to the device */ + device->group = group; + return 0; +} + +void vfio_device_remove_group(struct vfio_device *device) +{ + struct vfio_group *group = device->group; + struct iommu_group *iommu_group; + + if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) + iommu_group_remove_device(device->dev); + + /* Pairs with vfio_create_group() / vfio_group_get_from_iommu() */ + if (!refcount_dec_and_mutex_lock(&group->drivers, &vfio.group_lock)) + return; + list_del(&group->vfio_next); + + /* + * We could concurrently probe another driver in the group that might + * race 
vfio_device_remove_group() with vfio_get_group(), so we have to + * ensure that the sysfs is all cleaned up under lock otherwise the + * cdev_device_add() will fail due to the name already existing. + */ + cdev_device_del(&group->cdev, &group->dev); + + mutex_lock(&group->group_lock); + /* + * These data structures all have paired operations that can only be + * undone when the caller holds a live reference on the device. Since + * all pairs must be undone these WARN_ON's indicate some caller did not + * properly hold the group reference. + */ + WARN_ON(!list_empty(&group->device_list)); + WARN_ON(group->notifier.head); + + /* + * Revoke all users of group->iommu_group. At this point we know there + * are no devices active because we are unplugging the last one. Setting + * iommu_group to NULL blocks all new users. + */ + if (group->container) + vfio_group_detach_container(group); + iommu_group = group->iommu_group; + group->iommu_group = NULL; + mutex_unlock(&group->group_lock); + mutex_unlock(&vfio.group_lock); + + iommu_group_put(iommu_group); + put_device(&group->dev); +} + +void vfio_device_group_register(struct vfio_device *device) +{ + mutex_lock(&device->group->device_lock); + list_add(&device->group_next, &device->group->device_list); + mutex_unlock(&device->group->device_lock); +} + +void vfio_device_group_unregister(struct vfio_device *device) +{ + mutex_lock(&device->group->device_lock); + list_del(&device->group_next); + mutex_unlock(&device->group->device_lock); +} + +int vfio_device_group_use_iommu(struct vfio_device *device) +{ + struct vfio_group *group = device->group; + int ret = 0; + + lockdep_assert_held(&group->group_lock); + + if (WARN_ON(!group->container)) + return -EINVAL; + + ret = vfio_group_use_container(group); + if (ret) + return ret; + vfio_device_container_register(device); + return 0; +} + +void vfio_device_group_unuse_iommu(struct vfio_device *device) +{ + struct vfio_group *group = device->group; + +
lockdep_assert_held(&group->group_lock); + + if (WARN_ON(!group->container)) + return; + + vfio_device_container_unregister(device); + vfio_group_unuse_container(group); +} + +bool vfio_device_has_container(struct vfio_device *device) +{ + return device->group->container; +} + +/** + * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file + * @file: VFIO group file + * + * The returned iommu_group is valid as long as a ref is held on the file. This + * returns a reference on the group. This function is deprecated, only the SPAPR + * path in kvm should call it. + */ +struct iommu_group *vfio_file_iommu_group(struct file *file) +{ + struct vfio_group *group = file->private_data; + struct iommu_group *iommu_group = NULL; + + if (!IS_ENABLED(CONFIG_SPAPR_TCE_IOMMU)) + return NULL; + + if (!vfio_file_is_group(file)) + return NULL; + + mutex_lock(&group->group_lock); + if (group->iommu_group) { + iommu_group = group->iommu_group; + iommu_group_ref_get(iommu_group); + } + mutex_unlock(&group->group_lock); + return iommu_group; +} +EXPORT_SYMBOL_GPL(vfio_file_iommu_group); + +/** + * vfio_file_is_group - True if the file is usable with VFIO APIs + * @file: VFIO group file + */ +bool vfio_file_is_group(struct file *file) +{ + return file->f_op == &vfio_group_fops; +} +EXPORT_SYMBOL_GPL(vfio_file_is_group); + +/** + * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file + * is always CPU cache coherent + * @file: VFIO group file + * + * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop + * bit in DMA transactions. A return of false indicates that the user has + * rights to access additional instructions such as wbinvd on x86.
+ */ +bool vfio_file_enforced_coherent(struct file *file) +{ + struct vfio_group *group = file->private_data; + struct vfio_device *device; + bool ret = true; + + if (!vfio_file_is_group(file)) + return true; + + /* + * If the device does not have IOMMU_CAP_ENFORCE_CACHE_COHERENCY then + * any domain later attached to it will also not support it. If the cap + * is set then the iommu_domain eventually attached to the device/group + * must use a domain with enforce_cache_coherency(). + */ + mutex_lock(&group->device_lock); + list_for_each_entry(device, &group->device_list, group_next) { + if (!device_iommu_capable(device->dev, + IOMMU_CAP_ENFORCE_CACHE_COHERENCY)) { + ret = false; + break; + } + } + mutex_unlock(&group->device_lock); + return ret; +} +EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); + +/** + * vfio_file_set_kvm - Link a kvm with VFIO drivers + * @file: VFIO group file + * @kvm: KVM to link + * + * When a VFIO device is first opened the KVM will be available in + * device->kvm if one was associated with the group. + */ +void vfio_file_set_kvm(struct file *file, struct kvm *kvm) +{ + struct vfio_group *group = file->private_data; + + if (!vfio_file_is_group(file)) + return; + + mutex_lock(&group->group_lock); + group->kvm = kvm; + mutex_unlock(&group->group_lock); +} +EXPORT_SYMBOL_GPL(vfio_file_set_kvm); + +/** + * vfio_file_has_dev - True if the VFIO file is a handle for device + * @file: VFIO file to check + * @device: Device that must be part of the file + * + * Returns true if given file has permission to manipulate the given device. 
+ */ +bool vfio_file_has_dev(struct file *file, struct vfio_device *device) +{ + struct vfio_group *group = file->private_data; + + if (!vfio_file_is_group(file)) + return false; + + return group == device->group; +} +EXPORT_SYMBOL_GPL(vfio_file_has_dev); + +static char *vfio_devnode(const struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev)); +} + +int __init vfio_group_init(void) +{ + int ret; + + ida_init(&vfio.group_ida); + mutex_init(&vfio.group_lock); + INIT_LIST_HEAD(&vfio.group_list); + + ret = vfio_container_init(); + if (ret) + return ret; + + /* /dev/vfio/$GROUP */ + vfio.class = class_create(THIS_MODULE, "vfio"); + if (IS_ERR(vfio.class)) { + ret = PTR_ERR(vfio.class); + goto err_group_class; + } + + vfio.class->devnode = vfio_devnode; + + ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio"); + if (ret) + goto err_alloc_chrdev; + return 0; + +err_alloc_chrdev: + class_destroy(vfio.class); + vfio.class = NULL; +err_group_class: + vfio_container_cleanup(); + return ret; +} + +void vfio_group_cleanup(void) +{ + WARN_ON(!list_empty(&vfio.group_list)); + ida_destroy(&vfio.group_ida); + unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); + class_destroy(vfio.class); + vfio.class = NULL; + vfio_container_cleanup(); +} diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c new file mode 100644 index 000000000000..4f82a6fa7c6c --- /dev/null +++ b/drivers/vfio/iommufd.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#include <linux/vfio.h> +#include <linux/iommufd.h> + +#include "vfio.h" + +MODULE_IMPORT_NS(IOMMUFD); +MODULE_IMPORT_NS(IOMMUFD_VFIO); + +int vfio_iommufd_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx) +{ + u32 ioas_id; + u32 device_id; + int ret; + + lockdep_assert_held(&vdev->dev_set->lock); + + /* + * If the driver doesn't provide this op then it means the device does + * not do DMA at 
all. So nothing to do. + */ + if (!vdev->ops->bind_iommufd) + return 0; + + ret = vdev->ops->bind_iommufd(vdev, ictx, &device_id); + if (ret) + return ret; + + ret = iommufd_vfio_compat_ioas_id(ictx, &ioas_id); + if (ret) + goto err_unbind; + ret = vdev->ops->attach_ioas(vdev, &ioas_id); + if (ret) + goto err_unbind; + + /* + * The legacy path has no way to return the device id or the selected + * pt_id + */ + return 0; + +err_unbind: + if (vdev->ops->unbind_iommufd) + vdev->ops->unbind_iommufd(vdev); + return ret; +} + +void vfio_iommufd_unbind(struct vfio_device *vdev) +{ + lockdep_assert_held(&vdev->dev_set->lock); + + if (vdev->ops->unbind_iommufd) + vdev->ops->unbind_iommufd(vdev); +} + +/* + * The physical standard ops mean that the iommufd_device is bound to the + * physical device vdev->dev that was provided to vfio_init_group_dev(). Drivers + * using this ops set should call vfio_register_group_dev() + */ +int vfio_iommufd_physical_bind(struct vfio_device *vdev, + struct iommufd_ctx *ictx, u32 *out_device_id) +{ + struct iommufd_device *idev; + + idev = iommufd_device_bind(ictx, vdev->dev, out_device_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + vdev->iommufd_device = idev; + return 0; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_physical_bind); + +void vfio_iommufd_physical_unbind(struct vfio_device *vdev) +{ + lockdep_assert_held(&vdev->dev_set->lock); + + if (vdev->iommufd_attached) { + iommufd_device_detach(vdev->iommufd_device); + vdev->iommufd_attached = false; + } + iommufd_device_unbind(vdev->iommufd_device); + vdev->iommufd_device = NULL; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_physical_unbind); + +int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id) +{ + int rc; + + rc = iommufd_device_attach(vdev->iommufd_device, pt_id); + if (rc) + return rc; + vdev->iommufd_attached = true; + return 0; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_physical_attach_ioas); + +/* + * The emulated standard ops mean that vfio_device is going to use the + * "mdev 
path" and will call vfio_pin_pages()/vfio_dma_rw(). Drivers using this + * ops set should call vfio_register_emulated_iommu_dev(). + */ + +static void vfio_emulated_unmap(void *data, unsigned long iova, + unsigned long length) +{ + struct vfio_device *vdev = data; + + vdev->ops->dma_unmap(vdev, iova, length); +} + +static const struct iommufd_access_ops vfio_user_ops = { + .needs_pin_pages = 1, + .unmap = vfio_emulated_unmap, +}; + +int vfio_iommufd_emulated_bind(struct vfio_device *vdev, + struct iommufd_ctx *ictx, u32 *out_device_id) +{ + lockdep_assert_held(&vdev->dev_set->lock); + + vdev->iommufd_ictx = ictx; + iommufd_ctx_get(ictx); + return 0; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_bind); + +void vfio_iommufd_emulated_unbind(struct vfio_device *vdev) +{ + lockdep_assert_held(&vdev->dev_set->lock); + + if (vdev->iommufd_access) { + iommufd_access_destroy(vdev->iommufd_access); + vdev->iommufd_access = NULL; + } + iommufd_ctx_put(vdev->iommufd_ictx); + vdev->iommufd_ictx = NULL; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_unbind); + +int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id) +{ + struct iommufd_access *user; + + lockdep_assert_held(&vdev->dev_set->lock); + + user = iommufd_access_create(vdev->iommufd_ictx, *pt_id, &vfio_user_ops, + vdev); + if (IS_ERR(user)) + return PTR_ERR(user); + vdev->iommufd_access = user; + return 0; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_attach_ioas); diff --git a/drivers/vfio/iova_bitmap.c b/drivers/vfio/iova_bitmap.c index 6631e8befe1b..0848f920efb7 100644 --- a/drivers/vfio/iova_bitmap.c +++ b/drivers/vfio/iova_bitmap.c @@ -5,6 +5,7 @@ */ #include <linux/iova_bitmap.h> #include <linux/mm.h> +#include <linux/slab.h> #include <linux/highmem.h> #define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE) @@ -295,11 +296,13 @@ void iova_bitmap_free(struct iova_bitmap *bitmap) */ static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap) { - unsigned long remaining; + unsigned long 
remaining, bytes; + + bytes = (bitmap->mapped.npages << PAGE_SHIFT) - bitmap->mapped.pgoff; remaining = bitmap->mapped_total_index - bitmap->mapped_base_index; remaining = min_t(unsigned long, remaining, - (bitmap->mapped.npages << PAGE_SHIFT) / sizeof(*bitmap->bitmap)); + bytes / sizeof(*bitmap->bitmap)); return remaining; } @@ -394,29 +397,27 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, * Set the bits corresponding to the range [iova .. iova+length-1] in * the user bitmap. * - * Return: The number of bits set. */ void iova_bitmap_set(struct iova_bitmap *bitmap, unsigned long iova, size_t length) { struct iova_bitmap_map *mapped = &bitmap->mapped; - unsigned long offset = (iova - mapped->iova) >> mapped->pgshift; - unsigned long nbits = max_t(unsigned long, 1, length >> mapped->pgshift); - unsigned long page_idx = offset / BITS_PER_PAGE; - unsigned long page_offset = mapped->pgoff; - void *kaddr; - - offset = offset % BITS_PER_PAGE; + unsigned long cur_bit = ((iova - mapped->iova) >> + mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; + unsigned long last_bit = (((iova + length - 1) - mapped->iova) >> + mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; do { - unsigned long size = min(BITS_PER_PAGE - offset, nbits); + unsigned int page_idx = cur_bit / BITS_PER_PAGE; + unsigned int offset = cur_bit % BITS_PER_PAGE; + unsigned int nbits = min(BITS_PER_PAGE - offset, + last_bit - cur_bit + 1); + void *kaddr; kaddr = kmap_local_page(mapped->pages[page_idx]); - bitmap_set(kaddr + page_offset, offset, size); + bitmap_set(kaddr, offset, nbits); kunmap_local(kaddr); - page_offset = offset = 0; - nbits -= size; - page_idx++; - } while (nbits > 0); + cur_bit += nbits; + } while (cur_bit <= last_bit); } EXPORT_SYMBOL_GPL(iova_bitmap_set); diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 39eeca18a0f7..0bba3b05c6c7 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ 
b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -360,8 +360,8 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, u32 que_iso_state; int ret; - if (migf->total_length < QM_MATCH_SIZE) - return -EINVAL; + if (migf->total_length < QM_MATCH_SIZE || hisi_acc_vdev->match_done) + return 0; if (vf_data->acc_magic != ACC_DEV_MAGIC) { dev_err(dev, "failed to match ACC_DEV_MAGIC\n"); @@ -406,6 +406,7 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, } hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state; + hisi_acc_vdev->match_done = true; return 0; } @@ -493,10 +494,6 @@ static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev, struct device *dev = &vf_qm->pdev->dev; int ret; - ret = vf_qm_get_match_data(hisi_acc_vdev, vf_data); - if (ret) - return ret; - if (unlikely(qm_wait_dev_not_ready(vf_qm))) { /* Update state and return with match data */ vf_data->vf_qm_state = QM_NOT_READY; @@ -673,12 +670,6 @@ static int hisi_acc_vf_load_state(struct hisi_acc_vf_core_device *hisi_acc_vdev) struct hisi_acc_vf_migration_file *migf = hisi_acc_vdev->resuming_migf; int ret; - /* Check dev compatibility */ - ret = vf_qm_check_match(hisi_acc_vdev, migf); - if (ret) { - dev_err(dev, "failed to match the VF!\n"); - return ret; - } /* Recover data to VF */ ret = vf_qm_load_data(hisi_acc_vdev, migf); if (ret) { @@ -732,6 +723,10 @@ static ssize_t hisi_acc_vf_resume_write(struct file *filp, const char __user *bu *pos += len; done = len; migf->total_length += len; + + ret = vf_qm_check_match(migf->hisi_acc_vdev, migf); + if (ret) + done = -EFAULT; out_unlock: mutex_unlock(&migf->lock); return done; @@ -764,9 +759,58 @@ hisi_acc_vf_pci_resume(struct hisi_acc_vf_core_device *hisi_acc_vdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); + migf->hisi_acc_vdev = hisi_acc_vdev; return migf; } +static long hisi_acc_vf_precopy_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct 
hisi_acc_vf_migration_file *migf = filp->private_data; + struct hisi_acc_vf_core_device *hisi_acc_vdev = migf->hisi_acc_vdev; + loff_t *pos = &filp->f_pos; + struct vfio_precopy_info info; + unsigned long minsz; + int ret; + + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) + return -ENOTTY; + + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + if (info.argsz < minsz) + return -EINVAL; + + mutex_lock(&hisi_acc_vdev->state_mutex); + if (hisi_acc_vdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY) { + mutex_unlock(&hisi_acc_vdev->state_mutex); + return -EINVAL; + } + + mutex_lock(&migf->lock); + + if (migf->disabled) { + ret = -ENODEV; + goto out; + } + + if (*pos > migf->total_length) { + ret = -EINVAL; + goto out; + } + + info.dirty_bytes = 0; + info.initial_bytes = migf->total_length - *pos; + + ret = copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; +out: + mutex_unlock(&migf->lock); + mutex_unlock(&hisi_acc_vdev->state_mutex); + return ret; +} + static ssize_t hisi_acc_vf_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) { @@ -807,12 +851,14 @@ out_unlock: static const struct file_operations hisi_acc_vf_save_fops = { .owner = THIS_MODULE, .read = hisi_acc_vf_save_read, + .unlocked_ioctl = hisi_acc_vf_precopy_ioctl, + .compat_ioctl = compat_ptr_ioctl, .release = hisi_acc_vf_release_file, .llseek = no_llseek, }; static struct hisi_acc_vf_migration_file * -hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) +hisi_acc_open_saving_migf(struct hisi_acc_vf_core_device *hisi_acc_vdev) { struct hisi_acc_vf_migration_file *migf; int ret; @@ -832,8 +878,9 @@ hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); + migf->hisi_acc_vdev = hisi_acc_vdev; - ret = vf_qm_state_save(hisi_acc_vdev, migf); + ret = vf_qm_get_match_data(hisi_acc_vdev, &migf->vf_data); if (ret) { 
fput(migf->filp); return ERR_PTR(ret); @@ -842,6 +889,44 @@ hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) return migf; } +static struct hisi_acc_vf_migration_file * +hisi_acc_vf_pre_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) +{ + struct hisi_acc_vf_migration_file *migf; + + migf = hisi_acc_open_saving_migf(hisi_acc_vdev); + if (IS_ERR(migf)) + return migf; + + migf->total_length = QM_MATCH_SIZE; + return migf; +} + +static struct hisi_acc_vf_migration_file * +hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev, bool open) +{ + int ret; + struct hisi_acc_vf_migration_file *migf = NULL; + + if (open) { + /* + * Userspace didn't use PRECOPY support. Hence saving_migf + * is not opened yet. + */ + migf = hisi_acc_open_saving_migf(hisi_acc_vdev); + if (IS_ERR(migf)) + return migf; + } else { + migf = hisi_acc_vdev->saving_migf; + } + + ret = vf_qm_state_save(hisi_acc_vdev, migf); + if (ret) + return ERR_PTR(ret); + + return open ? migf : NULL; +} + static int hisi_acc_vf_stop_device(struct hisi_acc_vf_core_device *hisi_acc_vdev) { struct device *dev = &hisi_acc_vdev->vf_dev->dev; @@ -869,6 +954,31 @@ hisi_acc_vf_set_device_state(struct hisi_acc_vf_core_device *hisi_acc_vdev, u32 cur = hisi_acc_vdev->mig_state; int ret; + if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) { + struct hisi_acc_vf_migration_file *migf; + + migf = hisi_acc_vf_pre_copy(hisi_acc_vdev); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + hisi_acc_vdev->saving_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_STOP_COPY) { + struct hisi_acc_vf_migration_file *migf; + + ret = hisi_acc_vf_stop_device(hisi_acc_vdev); + if (ret) + return ERR_PTR(ret); + + migf = hisi_acc_vf_stop_copy(hisi_acc_vdev, false); + if (IS_ERR(migf)) + return ERR_CAST(migf); + + return NULL; + } + if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_STOP) { ret = 
hisi_acc_vf_stop_device(hisi_acc_vdev); if (ret) @@ -879,7 +989,7 @@ hisi_acc_vf_set_device_state(struct hisi_acc_vf_core_device *hisi_acc_vdev, if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { struct hisi_acc_vf_migration_file *migf; - migf = hisi_acc_vf_stop_copy(hisi_acc_vdev); + migf = hisi_acc_vf_stop_copy(hisi_acc_vdev, true); if (IS_ERR(migf)) return ERR_CAST(migf); get_file(migf->filp); @@ -911,6 +1021,11 @@ hisi_acc_vf_set_device_state(struct hisi_acc_vf_core_device *hisi_acc_vdev, return NULL; } + if (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) { + hisi_acc_vf_disable_fds(hisi_acc_vdev); + return NULL; + } + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING) { hisi_acc_vf_start_device(hisi_acc_vdev); return NULL; @@ -958,6 +1073,14 @@ hisi_acc_vfio_pci_set_device_state(struct vfio_device *vdev, } static int +hisi_acc_vfio_pci_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + *stop_copy_length = sizeof(struct acc_vf_data); + return 0; +} + +static int hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev, enum vfio_device_mig_state *curr_state) { @@ -1213,6 +1336,7 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev) static const struct vfio_migration_ops hisi_acc_vfio_pci_migrn_state_ops = { .migration_set_state = hisi_acc_vfio_pci_set_device_state, .migration_get_state = hisi_acc_vfio_pci_get_device_state, + .migration_get_data_size = hisi_acc_vfio_pci_get_data_size, }; static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev) @@ -1227,7 +1351,7 @@ static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev) hisi_acc_vdev->vf_dev = pdev; mutex_init(&hisi_acc_vdev->state_mutex); - core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY; + core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY; core_vdev->mig_ops = &hisi_acc_vfio_pci_migrn_state_ops; return 
vfio_pci_core_init_dev(core_vdev); @@ -1246,6 +1370,9 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .mmap = hisi_acc_vfio_pci_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { @@ -1261,6 +1388,9 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 67343325b320..dcabfeec6ca1 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -91,12 +91,14 @@ struct hisi_acc_vf_migration_file { struct mutex lock; bool disabled; + struct hisi_acc_vf_core_device *hisi_acc_vdev; struct acc_vf_data vf_data; size_t total_length; }; struct hisi_acc_vf_core_device { struct vfio_pci_core_device core_device; + u8 match_done:1; u8 deferred_reset:1; /* For migration state */ struct mutex state_mutex; diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index c604b70437a5..64e68d13cb98 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -14,18 +14,36 @@ _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) { + struct mlx5_vf_migration_file *migf = mvdev->saving_migf; u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {}; u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {}; + int err; 
lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; + /* + * In case PRE_COPY is used, saving_migf is exposed while the device is + * running. Make sure to run only once there is no active save command. + * Running both in parallel, might end-up with a failure in the save + * command once it will try to turn on 'tracking' on a suspended device. + */ + if (migf) { + err = wait_for_completion_interruptible(&migf->save_comp); + if (err) + return err; + } + MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA); MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(suspend_vhca_in, in, op_mod, op_mod); - return mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out); + err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out); + if (migf) + complete(&migf->save_comp); + + return err; } int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) @@ -45,23 +63,54 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) } int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, - size_t *state_size) + size_t *state_size, u8 query_flags) { u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {}; + bool inc = query_flags & MLX5VF_QUERY_INC; int ret; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; + /* + * In case PRE_COPY is used, saving_migf is exposed while device is + * running. Make sure to run only once there is no active save command. + * Running both in parallel, might end-up with a failure in the + * incremental query command on un-tracked vhca. 
+ */ + if (inc) { + ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp); + if (ret) + return ret; + if (mvdev->saving_migf->state == + MLX5_MIGF_STATE_PRE_COPY_ERROR) { + /* + * In case we had a PRE_COPY error, only query full + * image for final image + */ + if (!(query_flags & MLX5VF_QUERY_FINAL)) { + *state_size = 0; + complete(&mvdev->saving_migf->save_comp); + return 0; + } + query_flags &= ~MLX5VF_QUERY_INC; + } + } + MLX5_SET(query_vhca_migration_state_in, in, opcode, MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE); MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0); + MLX5_SET(query_vhca_migration_state_in, in, incremental, + query_flags & MLX5VF_QUERY_INC); ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in, out); + if (inc) + complete(&mvdev->saving_migf->save_comp); + if (ret) return ret; @@ -173,6 +222,11 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization)) mvdev->core_device.vdev.log_ops = log_ops; + if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) && + MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state)) + mvdev->core_device.vdev.migration_flags |= + VFIO_MIGRATION_PRE_COPY; + end: mlx5_vf_put_core_dev(mvdev->mdev); } @@ -210,11 +264,11 @@ err_exec: } static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, - struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf, struct mlx5_vhca_recv_buf *recv_buf, u32 *mkey) { - size_t npages = migf ? DIV_ROUND_UP(migf->total_length, PAGE_SIZE) : + size_t npages = buf ? 
DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) : recv_buf->npages; int err = 0, inlen; __be64 *mtt; @@ -232,10 +286,10 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, DIV_ROUND_UP(npages, 2)); mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); - if (migf) { + if (buf) { struct sg_dma_page_iter dma_iter; - for_each_sgtable_dma_page(&migf->table.sgt, &dma_iter, 0) + for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0) *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); } else { int i; @@ -255,35 +309,195 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, MLX5_SET(mkc, mkc, qpn, 0xffffff); MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2)); - MLX5_SET64(mkc, mkc, len, - migf ? migf->total_length : (npages * PAGE_SIZE)); + MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE); err = mlx5_core_create_mkey(mdev, mkey, in, inlen); kvfree(in); return err; } +static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev; + struct mlx5_core_dev *mdev = mvdev->mdev; + int ret; + + lockdep_assert_held(&mvdev->state_mutex); + if (mvdev->mdev_detach) + return -ENOTCONN; + + if (buf->dmaed || !buf->allocated_length) + return -EINVAL; + + ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); + if (ret) + return ret; + + ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey); + if (ret) + goto err; + + buf->dmaed = true; + + return 0; +err: + dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); + return ret; +} + +void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + struct mlx5_vf_migration_file *migf = buf->migf; + struct sg_page_iter sg_iter; + + lockdep_assert_held(&migf->mvdev->state_mutex); + WARN_ON(migf->mvdev->mdev_detach); + + if (buf->dmaed) { + mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey); + 
dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt, + buf->dma_dir, 0); + } + + /* Undo alloc_pages_bulk_array() */ + for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) + __free_page(sg_page_iter_page(&sg_iter)); + sg_free_append_table(&buf->table); + kfree(buf); +} + +struct mlx5_vhca_data_buffer * +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, + enum dma_data_direction dma_dir) +{ + struct mlx5_vhca_data_buffer *buf; + int ret; + + buf = kzalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) + return ERR_PTR(-ENOMEM); + + buf->dma_dir = dma_dir; + buf->migf = migf; + if (length) { + ret = mlx5vf_add_migration_pages(buf, + DIV_ROUND_UP_ULL(length, PAGE_SIZE)); + if (ret) + goto end; + + if (dma_dir != DMA_NONE) { + ret = mlx5vf_dma_data_buffer(buf); + if (ret) + goto end; + } + } + + return buf; +end: + mlx5vf_free_data_buffer(buf); + return ERR_PTR(ret); +} + +void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + spin_lock_irq(&buf->migf->list_lock); + list_add_tail(&buf->buf_elm, &buf->migf->avail_list); + spin_unlock_irq(&buf->migf->list_lock); +} + +struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir) +{ + struct mlx5_vhca_data_buffer *buf, *temp_buf; + struct list_head free_list; + + lockdep_assert_held(&migf->mvdev->state_mutex); + if (migf->mvdev->mdev_detach) + return ERR_PTR(-ENOTCONN); + + INIT_LIST_HEAD(&free_list); + + spin_lock_irq(&migf->list_lock); + list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) { + if (buf->dma_dir == dma_dir) { + list_del_init(&buf->buf_elm); + if (buf->allocated_length >= length) { + spin_unlock_irq(&migf->list_lock); + goto found; + } + /* + * Prevent holding redundant buffers. Put in a free + * list and call at the end not under the spin lock + * (&migf->list_lock) to mlx5vf_free_data_buffer which + * might sleep. 
+ */ + list_add(&buf->buf_elm, &free_list); + } + } + spin_unlock_irq(&migf->list_lock); + buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir); + +found: + while ((temp_buf = list_first_entry_or_null(&free_list, + struct mlx5_vhca_data_buffer, buf_elm))) { + list_del(&temp_buf->buf_elm); + mlx5vf_free_data_buffer(temp_buf); + } + + return buf; +} + void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) { struct mlx5vf_async_data *async_data = container_of(_work, struct mlx5vf_async_data, work); struct mlx5_vf_migration_file *migf = container_of(async_data, struct mlx5_vf_migration_file, async_data); - struct mlx5_core_dev *mdev = migf->mvdev->mdev; mutex_lock(&migf->lock); if (async_data->status) { - migf->is_err = true; + mlx5vf_put_data_buffer(async_data->buf); + if (async_data->header_buf) + mlx5vf_put_data_buffer(async_data->header_buf); + if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR) + migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR; + else + migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } mutex_unlock(&migf->lock); - - mlx5_core_destroy_mkey(mdev, async_data->mkey); - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); - mlx5_core_dealloc_pd(mdev, async_data->pdn); kvfree(async_data->out); + complete(&migf->save_comp); fput(migf->filp); } +static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, + size_t image_size) +{ + struct mlx5_vf_migration_file *migf = header_buf->migf; + struct mlx5_vf_migration_header header = {}; + unsigned long flags; + struct page *page; + u8 *to_buff; + + header.image_size = cpu_to_le64(image_size); + page = mlx5vf_get_migration_page(header_buf, 0); + if (!page) + return -EINVAL; + to_buff = kmap_local_page(page); + memcpy(to_buff, &header, sizeof(header)); + kunmap_local(to_buff); + header_buf->length = sizeof(header); + header_buf->header_image_size = image_size; + header_buf->start_pos = header_buf->migf->max_pos; + migf->max_pos += 
header_buf->length; + spin_lock_irqsave(&migf->list_lock, flags); + list_add_tail(&header_buf->buf_elm, &migf->buf_list); + spin_unlock_irqrestore(&migf->list_lock, flags); + return 0; +} + static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) { struct mlx5vf_async_data *async_data = container_of(context, @@ -292,67 +506,96 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { - WRITE_ONCE(migf->total_length, - MLX5_GET(save_vhca_state_out, async_data->out, - actual_image_size)); + size_t image_size; + unsigned long flags; + + image_size = MLX5_GET(save_vhca_state_out, async_data->out, + actual_image_size); + if (async_data->header_buf) { + status = add_buf_header(async_data->header_buf, image_size); + if (status) + goto err; + } + async_data->buf->length = image_size; + async_data->buf->start_pos = migf->max_pos; + migf->max_pos += async_data->buf->length; + spin_lock_irqsave(&migf->list_lock, flags); + list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); + spin_unlock_irqrestore(&migf->list_lock, flags); + migf->state = async_data->last_chunk ? 
+ MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY; wake_up_interruptible(&migf->poll_wait); } +err: /* * The error and the cleanup flows can't run from an * interrupt context */ + if (status == -EREMOTEIO) + status = MLX5_GET(save_vhca_state_out, async_data->out, status); async_data->status = status; queue_work(migf->mvdev->cb_wq, &async_data->work); } int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf) + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf, bool inc, + bool track) { u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; + struct mlx5_vhca_data_buffer *header_buf = NULL; struct mlx5vf_async_data *async_data; - struct mlx5_core_dev *mdev; - u32 pdn, mkey; int err; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; - mdev = mvdev->mdev; - err = mlx5_core_alloc_pd(mdev, &pdn); + err = wait_for_completion_interruptible(&migf->save_comp); if (err) return err; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, - 0); - if (err) - goto err_dma_map; - - err = _create_mkey(mdev, pdn, migf, NULL, &mkey); - if (err) - goto err_create_mkey; + if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) + /* + * In case we had a PRE_COPY error, SAVE is triggered only for + * the final image, read device full image. 
+ */ + inc = false; MLX5_SET(save_vhca_state_in, in, opcode, MLX5_CMD_OP_SAVE_VHCA_STATE); MLX5_SET(save_vhca_state_in, in, op_mod, 0); MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); - MLX5_SET(save_vhca_state_in, in, mkey, mkey); - MLX5_SET(save_vhca_state_in, in, size, migf->total_length); + MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey); + MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); + MLX5_SET(save_vhca_state_in, in, incremental, inc); + MLX5_SET(save_vhca_state_in, in, set_track, track); async_data = &migf->async_data; + async_data->buf = buf; + async_data->last_chunk = !track; async_data->out = kvzalloc(out_size, GFP_KERNEL); if (!async_data->out) { err = -ENOMEM; goto err_out; } - /* no data exists till the callback comes back */ - migf->total_length = 0; + if (MLX5VF_PRE_COPY_SUPP(mvdev)) { + header_buf = mlx5vf_get_data_buffer(migf, + sizeof(struct mlx5_vf_migration_header), DMA_NONE); + if (IS_ERR(header_buf)) { + err = PTR_ERR(header_buf); + goto err_free; + } + } + + if (async_data->last_chunk) + migf->state = MLX5_MIGF_STATE_SAVE_LAST; + + async_data->header_buf = header_buf; get_file(migf->filp); - async_data->mkey = mkey; - async_data->pdn = pdn; err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), async_data->out, out_size, mlx5vf_save_callback, @@ -363,68 +606,92 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, return 0; err_exec: + if (header_buf) + mlx5vf_put_data_buffer(header_buf); fput(migf->filp); +err_free: kvfree(async_data->out); err_out: - mlx5_core_destroy_mkey(mdev, mkey); -err_create_mkey: - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); -err_dma_map: - mlx5_core_dealloc_pd(mdev, pdn); + complete(&migf->save_comp); return err; } int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf) + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf) { - struct mlx5_core_dev *mdev; - u32 
out[MLX5_ST_SZ_DW(save_vhca_state_out)] = {}; - u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; - u32 pdn, mkey; + u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; int err; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; - mutex_lock(&migf->lock); - if (!migf->total_length) { - err = -EINVAL; - goto end; + if (!buf->dmaed) { + err = mlx5vf_dma_data_buffer(buf); + if (err) + return err; } - mdev = mvdev->mdev; - err = mlx5_core_alloc_pd(mdev, &pdn); - if (err) - goto end; - - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); - if (err) - goto err_reg; - - err = _create_mkey(mdev, pdn, migf, NULL, &mkey); - if (err) - goto err_mkey; - MLX5_SET(load_vhca_state_in, in, opcode, MLX5_CMD_OP_LOAD_VHCA_STATE); MLX5_SET(load_vhca_state_in, in, op_mod, 0); MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id); - MLX5_SET(load_vhca_state_in, in, mkey, mkey); - MLX5_SET(load_vhca_state_in, in, size, migf->total_length); + MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey); + MLX5_SET(load_vhca_state_in, in, size, buf->length); + return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out); +} - err = mlx5_cmd_exec_inout(mdev, load_vhca_state, in, out); +int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf) +{ + int err; - mlx5_core_destroy_mkey(mdev, mkey); -err_mkey: - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); -err_reg: - mlx5_core_dealloc_pd(mdev, pdn); -end: - mutex_unlock(&migf->lock); + lockdep_assert_held(&migf->mvdev->state_mutex); + if (migf->mvdev->mdev_detach) + return -ENOTCONN; + + err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn); return err; } +void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) +{ + lockdep_assert_held(&migf->mvdev->state_mutex); + if (migf->mvdev->mdev_detach) + return; + + mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn); +} + +void mlx5fv_cmd_clean_migf_resources(struct 
mlx5_vf_migration_file *migf) +{ + struct mlx5_vhca_data_buffer *entry; + + lockdep_assert_held(&migf->mvdev->state_mutex); + WARN_ON(migf->mvdev->mdev_detach); + + if (migf->buf) { + mlx5vf_free_data_buffer(migf->buf); + migf->buf = NULL; + } + + if (migf->buf_header) { + mlx5vf_free_data_buffer(migf->buf_header); + migf->buf_header = NULL; + } + + list_splice(&migf->avail_list, &migf->buf_list); + + while ((entry = list_first_entry_or_null(&migf->buf_list, + struct mlx5_vhca_data_buffer, buf_elm))) { + list_del(&entry->buf_elm); + mlx5vf_free_data_buffer(entry); + } + + mlx5vf_cmd_dealloc_pd(migf); +} + static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes, u32 req_nodes) { diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 921d5720a1e5..5483171d57ad 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -12,31 +12,74 @@ #include <linux/mlx5/cq.h> #include <linux/mlx5/qp.h> +#define MLX5VF_PRE_COPY_SUPP(mvdev) \ + ((mvdev)->core_device.vdev.migration_flags & VFIO_MIGRATION_PRE_COPY) + +enum mlx5_vf_migf_state { + MLX5_MIGF_STATE_ERROR = 1, + MLX5_MIGF_STATE_PRE_COPY_ERROR, + MLX5_MIGF_STATE_PRE_COPY, + MLX5_MIGF_STATE_SAVE_LAST, + MLX5_MIGF_STATE_COMPLETE, +}; + +enum mlx5_vf_load_state { + MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER, + MLX5_VF_LOAD_STATE_READ_HEADER, + MLX5_VF_LOAD_STATE_PREP_IMAGE, + MLX5_VF_LOAD_STATE_READ_IMAGE, + MLX5_VF_LOAD_STATE_LOAD_IMAGE, +}; + +struct mlx5_vf_migration_header { + __le64 image_size; + /* For future use in case we may need to change the kernel protocol */ + __le64 flags; +}; + +struct mlx5_vhca_data_buffer { + struct sg_append_table table; + loff_t start_pos; + u64 length; + u64 allocated_length; + u64 header_image_size; + u32 mkey; + enum dma_data_direction dma_dir; + u8 dmaed:1; + struct list_head buf_elm; + struct mlx5_vf_migration_file *migf; + /* Optimize mlx5vf_get_migration_page() for sequential access */ + struct scatterlist *last_offset_sg; + unsigned 
int sg_last_entry; + unsigned long last_offset; +}; + struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; + struct mlx5_vhca_data_buffer *buf; + struct mlx5_vhca_data_buffer *header_buf; int status; - u32 pdn; - u32 mkey; + u8 last_chunk:1; void *out; }; struct mlx5_vf_migration_file { struct file *filp; struct mutex lock; - u8 disabled:1; - u8 is_err:1; + enum mlx5_vf_migf_state state; - struct sg_append_table table; - size_t total_length; - size_t allocated_length; - - /* Optimize mlx5vf_get_migration_page() for sequential access */ - struct scatterlist *last_offset_sg; - unsigned int sg_last_entry; - unsigned long last_offset; + enum mlx5_vf_load_state load_state; + u32 pdn; + loff_t max_pos; + struct mlx5_vhca_data_buffer *buf; + struct mlx5_vhca_data_buffer *buf_header; + spinlock_t list_lock; + struct list_head buf_list; + struct list_head avail_list; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; + struct completion save_comp; struct mlx5_async_ctx async_ctx; struct mlx5vf_async_data async_data; }; @@ -113,19 +156,42 @@ struct mlx5vf_pci_core_device { struct mlx5_core_dev *mdev; }; +enum { + MLX5VF_QUERY_INC = (1UL << 0), + MLX5VF_QUERY_FINAL = (1UL << 1), +}; + int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, - size_t *state_size); + size_t *state_size, u8 query_flags); void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, const struct vfio_migration_ops *mig_ops, const struct vfio_log_ops *log_ops); void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf); + struct mlx5_vf_migration_file *migf, + struct 
mlx5_vhca_data_buffer *buf, bool inc, + bool track); int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf); + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf); +int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf); +void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf); +void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf); +struct mlx5_vhca_data_buffer * +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir); +void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf); +struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir); +void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf); +int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, + unsigned int npages); +struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, + unsigned long offset); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index fd6ccb8454a2..9feb89c6d939 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -32,8 +32,8 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) core_device); } -static struct page * -mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, +struct page * +mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, unsigned long offset) { unsigned long cur_offset = 0; @@ -41,20 +41,20 @@ mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, unsigned int i; /* All accesses are sequential */ - if (offset < migf->last_offset || !migf->last_offset_sg) { - migf->last_offset = 0; - migf->last_offset_sg 
= migf->table.sgt.sgl; - migf->sg_last_entry = 0; + if (offset < buf->last_offset || !buf->last_offset_sg) { + buf->last_offset = 0; + buf->last_offset_sg = buf->table.sgt.sgl; + buf->sg_last_entry = 0; } - cur_offset = migf->last_offset; + cur_offset = buf->last_offset; - for_each_sg(migf->last_offset_sg, sg, - migf->table.sgt.orig_nents - migf->sg_last_entry, i) { + for_each_sg(buf->last_offset_sg, sg, + buf->table.sgt.orig_nents - buf->sg_last_entry, i) { if (offset < sg->length + cur_offset) { - migf->last_offset_sg = sg; - migf->sg_last_entry += i; - migf->last_offset = cur_offset; + buf->last_offset_sg = sg; + buf->sg_last_entry += i; + buf->last_offset = cur_offset; return nth_page(sg_page(sg), (offset - cur_offset) / PAGE_SIZE); } @@ -63,8 +63,8 @@ mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, return NULL; } -static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf, - unsigned int npages) +int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, + unsigned int npages) { unsigned int to_alloc = npages; struct page **page_list; @@ -85,13 +85,13 @@ static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf, } to_alloc -= filled; ret = sg_alloc_append_table_from_pages( - &migf->table, page_list, filled, 0, + &buf->table, page_list, filled, 0, filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, GFP_KERNEL); if (ret) goto err; - migf->allocated_length += filled * PAGE_SIZE; + buf->allocated_length += filled * PAGE_SIZE; /* clean input for another bulk allocation */ memset(page_list, 0, filled * sizeof(*page_list)); to_fill = min_t(unsigned int, to_alloc, @@ -108,16 +108,8 @@ err: static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) { - struct sg_page_iter sg_iter; - mutex_lock(&migf->lock); - /* Undo alloc_pages_bulk_array() */ - for_each_sgtable_page(&migf->table.sgt, &sg_iter, 0) - __free_page(sg_page_iter_page(&sg_iter)); - sg_free_append_table(&migf->table); - migf->disabled = true; - 
migf->total_length = 0; - migf->allocated_length = 0; + migf->state = MLX5_MIGF_STATE_ERROR; migf->filp->f_pos = 0; mutex_unlock(&migf->lock); } @@ -132,10 +124,91 @@ static int mlx5vf_release_file(struct inode *inode, struct file *filp) return 0; } +static struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos, + bool *end_of_data) +{ + struct mlx5_vhca_data_buffer *buf; + bool found = false; + + *end_of_data = false; + spin_lock_irq(&migf->list_lock); + if (list_empty(&migf->buf_list)) { + *end_of_data = true; + goto end; + } + + buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer, + buf_elm); + if (pos >= buf->start_pos && + pos < buf->start_pos + buf->length) { + found = true; + goto end; + } + + /* + * As we use a stream based FD we may expect having the data always + * on first chunk + */ + migf->state = MLX5_MIGF_STATE_ERROR; + +end: + spin_unlock_irq(&migf->list_lock); + return found ? buf : NULL; +} + +static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, + char __user **buf, size_t *len, loff_t *pos) +{ + unsigned long offset; + ssize_t done = 0; + size_t copy_len; + + copy_len = min_t(size_t, + vhca_buf->start_pos + vhca_buf->length - *pos, *len); + while (copy_len) { + size_t page_offset; + struct page *page; + size_t page_len; + u8 *from_buff; + int ret; + + offset = *pos - vhca_buf->start_pos; + page_offset = offset % PAGE_SIZE; + offset -= page_offset; + page = mlx5vf_get_migration_page(vhca_buf, offset); + if (!page) + return -EINVAL; + page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset); + from_buff = kmap_local_page(page); + ret = copy_to_user(*buf, from_buff + page_offset, page_len); + kunmap_local(from_buff); + if (ret) + return -EFAULT; + *pos += page_len; + *len -= page_len; + *buf += page_len; + done += page_len; + copy_len -= page_len; + } + + if (*pos >= vhca_buf->start_pos + vhca_buf->length) { + spin_lock_irq(&vhca_buf->migf->list_lock); + 
list_del_init(&vhca_buf->buf_elm); + list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); + spin_unlock_irq(&vhca_buf->migf->list_lock); + } + + return done; +} + static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; + struct mlx5_vhca_data_buffer *vhca_buf; + bool first_loop_call = true; + bool end_of_data; ssize_t done = 0; if (pos) @@ -144,52 +217,56 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (!(filp->f_flags & O_NONBLOCK)) { if (wait_event_interruptible(migf->poll_wait, - READ_ONCE(migf->total_length) || migf->is_err)) + !list_empty(&migf->buf_list) || + migf->state == MLX5_MIGF_STATE_ERROR || + migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR || + migf->state == MLX5_MIGF_STATE_PRE_COPY || + migf->state == MLX5_MIGF_STATE_COMPLETE)) return -ERESTARTSYS; } mutex_lock(&migf->lock); - if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(migf->total_length)) { - done = -EAGAIN; - goto out_unlock; - } - if (*pos > migf->total_length) { - done = -EINVAL; - goto out_unlock; - } - if (migf->disabled || migf->is_err) { + if (migf->state == MLX5_MIGF_STATE_ERROR) { done = -ENODEV; goto out_unlock; } - len = min_t(size_t, migf->total_length - *pos, len); while (len) { - size_t page_offset; - struct page *page; - size_t page_len; - u8 *from_buff; - int ret; + ssize_t count; + + vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos, + &end_of_data); + if (first_loop_call) { + first_loop_call = false; + /* Temporary end of file as part of PRE_COPY */ + if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY || + migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) { + done = -ENOMSG; + goto out_unlock; + } + + if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) { + if (filp->f_flags & O_NONBLOCK) { + done = -EAGAIN; + goto out_unlock; + } + } + } + + if (end_of_data) + goto out_unlock; - page_offset = (*pos) % PAGE_SIZE; - 
page = mlx5vf_get_migration_page(migf, *pos - page_offset); - if (!page) { - if (done == 0) - done = -EINVAL; + if (!vhca_buf) { + done = -EINVAL; goto out_unlock; } - page_len = min_t(size_t, len, PAGE_SIZE - page_offset); - from_buff = kmap_local_page(page); - ret = copy_to_user(buf, from_buff + page_offset, page_len); - kunmap_local(from_buff); - if (ret) { - done = -EFAULT; + count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos); + if (count < 0) { + done = count; goto out_unlock; } - *pos += page_len; - len -= page_len; - done += page_len; - buf += page_len; + done += count; } out_unlock: @@ -206,27 +283,188 @@ static __poll_t mlx5vf_save_poll(struct file *filp, poll_wait(filp, &migf->poll_wait, wait); mutex_lock(&migf->lock); - if (migf->disabled || migf->is_err) + if (migf->state == MLX5_MIGF_STATE_ERROR) pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - else if (READ_ONCE(migf->total_length)) + else if (!list_empty(&migf->buf_list) || + migf->state == MLX5_MIGF_STATE_COMPLETE) pollflags = EPOLLIN | EPOLLRDNORM; mutex_unlock(&migf->lock); return pollflags; } +/* + * FD is exposed and user can use it after receiving an error. + * Mark migf in error, and wake the user. 
+ */ +static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) +{ + migf->state = MLX5_MIGF_STATE_ERROR; + wake_up_interruptible(&migf->poll_wait); +} + +static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct mlx5_vf_migration_file *migf = filp->private_data; + struct mlx5vf_pci_core_device *mvdev = migf->mvdev; + struct mlx5_vhca_data_buffer *buf; + struct vfio_precopy_info info = {}; + loff_t *pos = &filp->f_pos; + unsigned long minsz; + size_t inc_length = 0; + bool end_of_data; + int ret; + + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) + return -ENOTTY; + + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + mutex_lock(&mvdev->state_mutex); + if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY && + mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) { + ret = -EINVAL; + goto err_state_unlock; + } + + /* + * We can't issue a SAVE command when the device is suspended, so as + * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra + * bytes that can't be read. + */ + if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) { + /* + * Once the query returns it's guaranteed that there is no + * active SAVE command. + * As so, the other code below is safe with the proper locks. 
+ */ + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length, + MLX5VF_QUERY_INC); + if (ret) + goto err_state_unlock; + } + + mutex_lock(&migf->lock); + if (migf->state == MLX5_MIGF_STATE_ERROR) { + ret = -ENODEV; + goto err_migf_unlock; + } + + buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data); + if (buf) { + if (buf->start_pos == 0) { + info.initial_bytes = buf->header_image_size - *pos; + } else if (buf->start_pos == + sizeof(struct mlx5_vf_migration_header)) { + /* First data buffer following the header */ + info.initial_bytes = buf->start_pos + + buf->length - *pos; + } else { + info.dirty_bytes = buf->start_pos + buf->length - *pos; + } + } else { + if (!end_of_data) { + ret = -EINVAL; + goto err_migf_unlock; + } + + info.dirty_bytes = inc_length; + } + + if (!end_of_data || !inc_length) { + mutex_unlock(&migf->lock); + goto done; + } + + mutex_unlock(&migf->lock); + /* + * We finished transferring the current state and the device has a + * dirty state, save a new state to be ready for. 
+ */ + buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + mlx5vf_mark_err(migf); + goto err_state_unlock; + } + + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true); + if (ret) { + mlx5vf_mark_err(migf); + mlx5vf_put_data_buffer(buf); + goto err_state_unlock; + } + +done: + mlx5vf_state_mutex_unlock(mvdev); + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + return 0; + +err_migf_unlock: + mutex_unlock(&migf->lock); +err_state_unlock: + mlx5vf_state_mutex_unlock(mvdev); + return ret; +} + static const struct file_operations mlx5vf_save_fops = { .owner = THIS_MODULE, .read = mlx5vf_save_read, .poll = mlx5vf_save_poll, + .unlocked_ioctl = mlx5vf_precopy_ioctl, + .compat_ioctl = compat_ptr_ioctl, .release = mlx5vf_release_file, .llseek = no_llseek, }; +static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) +{ + struct mlx5_vf_migration_file *migf = mvdev->saving_migf; + struct mlx5_vhca_data_buffer *buf; + size_t length; + int ret; + + if (migf->state == MLX5_MIGF_STATE_ERROR) + return -ENODEV; + + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, + MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL); + if (ret) + goto err; + + buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto err; + } + + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false); + if (ret) + goto err_save; + + return 0; + +err_save: + mlx5vf_put_data_buffer(buf); +err: + mlx5vf_mark_err(migf); + return ret; +} + static struct mlx5_vf_migration_file * -mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) +mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) { struct mlx5_vf_migration_file *migf; + struct mlx5_vhca_data_buffer *buf; + size_t length; int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL); @@ -236,43 +474,211 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device 
*mvdev) migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf, O_RDONLY); if (IS_ERR(migf->filp)) { - int err = PTR_ERR(migf->filp); - - kfree(migf); - return ERR_PTR(err); + ret = PTR_ERR(migf->filp); + goto end; } + migf->mvdev = mvdev; + ret = mlx5vf_cmd_alloc_pd(migf); + if (ret) + goto out_free; + stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); init_waitqueue_head(&migf->poll_wait); + init_completion(&migf->save_comp); + /* + * save_comp is being used as a binary semaphore built from + * a completion. A normal mutex cannot be used because the lock is + * passed between kernel threads and lockdep can't model this. + */ + complete(&migf->save_comp); mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, - &migf->total_length); + INIT_LIST_HEAD(&migf->buf_list); + INIT_LIST_HEAD(&migf->avail_list); + spin_lock_init(&migf->list_lock); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0); if (ret) - goto out_free; + goto out_pd; - ret = mlx5vf_add_migration_pages( - migf, DIV_ROUND_UP_ULL(migf->total_length, PAGE_SIZE)); - if (ret) - goto out_free; + buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_pd; + } - migf->mvdev = mvdev; - ret = mlx5vf_cmd_save_vhca_state(mvdev, migf); + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track); if (ret) - goto out_free; + goto out_save; return migf; +out_save: + mlx5vf_free_data_buffer(buf); +out_pd: + mlx5vf_cmd_dealloc_pd(migf); out_free: fput(migf->filp); +end: + kfree(migf); return ERR_PTR(ret); } +static int +mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf, + const char __user **buf, size_t *len, + loff_t *pos, ssize_t *done) +{ + unsigned long offset; + size_t page_offset; + struct page *page; + size_t page_len; + u8 *to_buff; + int ret; + + offset = *pos - 
vhca_buf->start_pos; + page_offset = offset % PAGE_SIZE; + + page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset); + if (!page) + return -EINVAL; + page_len = min_t(size_t, *len, PAGE_SIZE - page_offset); + to_buff = kmap_local_page(page); + ret = copy_from_user(to_buff + page_offset, *buf, page_len); + kunmap_local(to_buff); + if (ret) + return -EFAULT; + + *pos += page_len; + *done += page_len; + *buf += page_len; + *len -= page_len; + vhca_buf->length += page_len; + return 0; +} + +static int +mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf, + loff_t requested_length, + const char __user **buf, size_t *len, + loff_t *pos, ssize_t *done) +{ + int ret; + + if (requested_length > MAX_MIGRATION_SIZE) + return -ENOMEM; + + if (vhca_buf->allocated_length < requested_length) { + ret = mlx5vf_add_migration_pages( + vhca_buf, + DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, + PAGE_SIZE)); + if (ret) + return ret; + } + + while (*len) { + ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos, + done); + if (ret) + return ret; + } + + return 0; +} + +static ssize_t +mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *vhca_buf, + size_t image_size, const char __user **buf, + size_t *len, loff_t *pos, ssize_t *done, + bool *has_work) +{ + size_t copy_len, to_copy; + int ret; + + to_copy = min_t(size_t, *len, image_size - vhca_buf->length); + copy_len = to_copy; + while (to_copy) { + ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos, + done); + if (ret) + return ret; + } + + *len -= copy_len; + if (vhca_buf->length == image_size) { + migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE; + migf->max_pos += image_size; + *has_work = true; + } + + return 0; +} + +static int +mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *vhca_buf, + const char __user **buf, + size_t *len, loff_t *pos, + ssize_t *done, bool *has_work) +{ + 
struct page *page; + size_t copy_len; + u8 *to_buff; + int ret; + + copy_len = min_t(size_t, *len, + sizeof(struct mlx5_vf_migration_header) - vhca_buf->length); + page = mlx5vf_get_migration_page(vhca_buf, 0); + if (!page) + return -EINVAL; + to_buff = kmap_local_page(page); + ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len); + if (ret) { + ret = -EFAULT; + goto end; + } + + *buf += copy_len; + *pos += copy_len; + *done += copy_len; + *len -= copy_len; + vhca_buf->length += copy_len; + if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) { + u64 flags; + + vhca_buf->header_image_size = le64_to_cpup((__le64 *)to_buff); + if (vhca_buf->header_image_size > MAX_MIGRATION_SIZE) { + ret = -ENOMEM; + goto end; + } + + flags = le64_to_cpup((__le64 *)(to_buff + + offsetof(struct mlx5_vf_migration_header, flags))); + if (flags) { + ret = -EOPNOTSUPP; + goto end; + } + + migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE; + migf->max_pos += vhca_buf->length; + *has_work = true; + } +end: + kunmap_local(to_buff); + return ret; +} + static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; + struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; + struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header; loff_t requested_length; + bool has_work = false; ssize_t done = 0; + int ret = 0; if (pos) return -ESPIPE; @@ -282,56 +688,83 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, check_add_overflow((loff_t)len, *pos, &requested_length)) return -EINVAL; - if (requested_length > MAX_MIGRATION_SIZE) - return -ENOMEM; - + mutex_lock(&migf->mvdev->state_mutex); mutex_lock(&migf->lock); - if (migf->disabled) { - done = -ENODEV; + if (migf->state == MLX5_MIGF_STATE_ERROR) { + ret = -ENODEV; goto out_unlock; } - if (migf->allocated_length < requested_length) { - done = mlx5vf_add_migration_pages( - migf, - 
DIV_ROUND_UP(requested_length - migf->allocated_length, - PAGE_SIZE)); - if (done) - goto out_unlock; - } - - while (len) { - size_t page_offset; - struct page *page; - size_t page_len; - u8 *to_buff; - int ret; - - page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(migf, *pos - page_offset); - if (!page) { - if (done == 0) - done = -EINVAL; - goto out_unlock; + while (len || has_work) { + has_work = false; + switch (migf->load_state) { + case MLX5_VF_LOAD_STATE_READ_HEADER: + ret = mlx5vf_resume_read_header(migf, vhca_buf_header, + &buf, &len, pos, + &done, &has_work); + if (ret) + goto out_unlock; + break; + case MLX5_VF_LOAD_STATE_PREP_IMAGE: + { + u64 size = vhca_buf_header->header_image_size; + + if (vhca_buf->allocated_length < size) { + mlx5vf_free_data_buffer(vhca_buf); + + migf->buf = mlx5vf_alloc_data_buffer(migf, + size, DMA_TO_DEVICE); + if (IS_ERR(migf->buf)) { + ret = PTR_ERR(migf->buf); + migf->buf = NULL; + goto out_unlock; + } + + vhca_buf = migf->buf; + } + + vhca_buf->start_pos = migf->max_pos; + migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE; + break; } + case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER: + ret = mlx5vf_resume_read_image_no_header(vhca_buf, + requested_length, + &buf, &len, pos, &done); + if (ret) + goto out_unlock; + break; + case MLX5_VF_LOAD_STATE_READ_IMAGE: + ret = mlx5vf_resume_read_image(migf, vhca_buf, + vhca_buf_header->header_image_size, + &buf, &len, pos, &done, &has_work); + if (ret) + goto out_unlock; + break; + case MLX5_VF_LOAD_STATE_LOAD_IMAGE: + ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf); + if (ret) + goto out_unlock; + migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; + + /* prep header buf for next image */ + vhca_buf_header->length = 0; + vhca_buf_header->header_image_size = 0; + /* prep data buf for next image */ + vhca_buf->length = 0; - page_len = min_t(size_t, len, PAGE_SIZE - page_offset); - to_buff = kmap_local_page(page); - ret = copy_from_user(to_buff + page_offset, 
buf, page_len); - kunmap_local(to_buff); - if (ret) { - done = -EFAULT; - goto out_unlock; + break; + default: + break; } - *pos += page_len; - len -= page_len; - done += page_len; - buf += page_len; - migf->total_length += page_len; } + out_unlock: + if (ret) + migf->state = MLX5_MIGF_STATE_ERROR; mutex_unlock(&migf->lock); - return done; + mlx5vf_state_mutex_unlock(migf->mvdev); + return ret ? ret : done; } static const struct file_operations mlx5vf_resume_fops = { @@ -345,6 +778,8 @@ static struct mlx5_vf_migration_file * mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) { struct mlx5_vf_migration_file *migf; + struct mlx5_vhca_data_buffer *buf; + int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL); if (!migf) @@ -353,20 +788,59 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf, O_WRONLY); if (IS_ERR(migf->filp)) { - int err = PTR_ERR(migf->filp); + ret = PTR_ERR(migf->filp); + goto end; + } - kfree(migf); - return ERR_PTR(err); + migf->mvdev = mvdev; + ret = mlx5vf_cmd_alloc_pd(migf); + if (ret) + goto out_free; + + buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_pd; + } + + migf->buf = buf; + if (MLX5VF_PRE_COPY_SUPP(mvdev)) { + buf = mlx5vf_alloc_data_buffer(migf, + sizeof(struct mlx5_vf_migration_header), DMA_NONE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_buf; + } + + migf->buf_header = buf; + migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; + } else { + /* Initial state will be to read the image */ + migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER; } + stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); + INIT_LIST_HEAD(&migf->buf_list); + INIT_LIST_HEAD(&migf->avail_list); + spin_lock_init(&migf->list_lock); return migf; +out_buf: + mlx5vf_free_data_buffer(migf->buf); +out_pd: + mlx5vf_cmd_dealloc_pd(migf); +out_free: + fput(migf->filp); 
+end: + kfree(migf); + return ERR_PTR(ret); } void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) { if (mvdev->resuming_migf) { mlx5vf_disable_fd(mvdev->resuming_migf); + mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf); fput(mvdev->resuming_migf->filp); mvdev->resuming_migf = NULL; } @@ -374,6 +848,7 @@ void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx); cancel_work_sync(&mvdev->saving_migf->async_data.work); mlx5vf_disable_fd(mvdev->saving_migf); + mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf); fput(mvdev->saving_migf->filp); mvdev->saving_migf = NULL; } @@ -402,7 +877,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return NULL; } - if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) { + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { ret = mlx5vf_cmd_suspend_vhca(mvdev, MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR); if (ret) @@ -410,7 +886,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return NULL; } - if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) { + if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) { ret = mlx5vf_cmd_resume_vhca(mvdev, MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR); if (ret) @@ -421,7 +898,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { struct mlx5_vf_migration_file *migf; - migf = mlx5vf_pci_save_device_data(mvdev); + migf = mlx5vf_pci_save_device_data(mvdev, false); if (IS_ERR(migf)) return ERR_CAST(migf); get_file(migf->filp); @@ -429,7 +906,10 @@ mlx5vf_pci_step_device_state_locked(struct 
mlx5vf_pci_core_device *mvdev, return migf->filp; } - if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP)) { + if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && + new == VFIO_DEVICE_STATE_RUNNING_P2P)) { mlx5vf_disable_fds(mvdev); return NULL; } @@ -446,14 +926,39 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, } if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { - ret = mlx5vf_cmd_load_vhca_state(mvdev, - mvdev->resuming_migf); - if (ret) - return ERR_PTR(ret); + if (!MLX5VF_PRE_COPY_SUPP(mvdev)) { + ret = mlx5vf_cmd_load_vhca_state(mvdev, + mvdev->resuming_migf, + mvdev->resuming_migf->buf); + if (ret) + return ERR_PTR(ret); + } mlx5vf_disable_fds(mvdev); return NULL; } + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) || + (cur == VFIO_DEVICE_STATE_RUNNING_P2P && + new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { + struct mlx5_vf_migration_file *migf; + + migf = mlx5vf_pci_save_device_data(mvdev, true); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + mvdev->saving_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) { + ret = mlx5vf_cmd_suspend_vhca(mvdev, + MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER); + if (ret) + return ERR_PTR(ret); + ret = mlx5vf_pci_save_device_inc_data(mvdev); + return ret ? 
ERR_PTR(ret) : NULL; + } + /* * vfio_mig_get_next_state() does not use arcs other than the above */ @@ -512,6 +1017,23 @@ mlx5vf_pci_set_device_state(struct vfio_device *vdev, return res; } +static int mlx5vf_pci_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + struct mlx5vf_pci_core_device *mvdev = container_of( + vdev, struct mlx5vf_pci_core_device, core_device.vdev); + size_t state_size; + int ret; + + mutex_lock(&mvdev->state_mutex); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, + &state_size, 0); + if (!ret) + *stop_copy_length = state_size; + mlx5vf_state_mutex_unlock(mvdev); + return ret; +} + static int mlx5vf_pci_get_device_state(struct vfio_device *vdev, enum vfio_device_mig_state *curr_state) { @@ -577,6 +1099,7 @@ static void mlx5vf_pci_close_device(struct vfio_device *core_vdev) static const struct vfio_migration_ops mlx5vf_pci_mig_ops = { .migration_set_state = mlx5vf_pci_set_device_state, .migration_get_state = mlx5vf_pci_get_device_state, + .migration_get_data_size = mlx5vf_pci_get_data_size, }; static const struct vfio_log_ops mlx5vf_pci_log_ops = { @@ -623,6 +1146,9 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static int mlx5vf_pci_probe(struct pci_dev *pdev, @@ -676,18 +1202,7 @@ static struct pci_driver mlx5vf_pci_driver = { .driver_managed_dma = true, }; -static void __exit mlx5vf_pci_cleanup(void) -{ - pci_unregister_driver(&mlx5vf_pci_driver); -} - -static int __init mlx5vf_pci_init(void) -{ - return pci_register_driver(&mlx5vf_pci_driver); -} - -module_init(mlx5vf_pci_init); -module_exit(mlx5vf_pci_cleanup); +module_pci_driver(mlx5vf_pci_driver); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>"); diff --git 
a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 1d4919edfbde..29091ee2e984 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -138,6 +138,9 @@ static const struct vfio_device_ops vfio_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index badc9d828cac..26a541cc64d1 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -27,6 +27,9 @@ #include <linux/vgaarb.h> #include <linux/nospec.h> #include <linux/sched/mm.h> +#if IS_ENABLED(CONFIG_EEH) +#include <asm/eeh.h> +#endif #include "vfio_pci_priv.h" @@ -686,7 +689,9 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) vdev->sriov_pf_core_dev->vf_token->users--; mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock); } - vfio_spapr_pci_eeh_release(vdev->pdev); +#if IS_ENABLED(CONFIG_EEH) + eeh_dev_release(vdev->pdev); +#endif vfio_pci_core_disable(vdev); mutex_lock(&vdev->igate); @@ -705,7 +710,9 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_close_device); void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev) { vfio_pci_probe_mmaps(vdev); - vfio_spapr_pci_eeh_open(vdev->pdev); +#if IS_ENABLED(CONFIG_EEH) + eeh_dev_open(vdev->pdev); +#endif if (vdev->sriov_pf_core_dev) { mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock); @@ -2109,7 +2116,6 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev) mutex_destroy(&vdev->vma_lock); kfree(vdev->region); kfree(vdev->pm_save); - vfio_free_device(core_vdev); } EXPORT_SYMBOL_GPL(vfio_pci_core_release_dev); @@ -2128,7 +2134,8 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) if 
(vdev->vdev.mig_ops) { if (!(vdev->vdev.mig_ops->migration_get_state && - vdev->vdev.mig_ops->migration_set_state) || + vdev->vdev.mig_ops->migration_set_state && + vdev->vdev.mig_ops->migration_get_data_size) || !(vdev->vdev.migration_flags & VFIO_MIGRATION_STOP_COPY)) return -EINVAL; } @@ -2488,12 +2495,12 @@ static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set) struct vfio_pci_core_device *cur; bool needs_reset = false; - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { - /* No VFIO device in the set can have an open device FD */ - if (cur->vdev.open_count) - return false; + /* No other VFIO device in the set can be open. */ + if (vfio_device_set_open_count(dev_set) > 1) + return false; + + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) needs_reset |= cur->needs_reset; - } return needs_reset; } diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index eaea63e5294c..83fe54015595 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -95,7 +95,6 @@ static void vfio_amba_release_dev(struct vfio_device *core_vdev) vfio_platform_release_common(vdev); kfree(vdev->name); - vfio_free_device(core_vdev); } static void vfio_amba_remove(struct amba_device *adev) @@ -117,6 +116,9 @@ static const struct vfio_device_ops vfio_amba_ops = { .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static const struct amba_id pl330_ids[] = { diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 82cedcebfd90..22a1efca32a8 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -83,7 +83,6 @@ static void vfio_platform_release_dev(struct vfio_device *core_vdev) container_of(core_vdev, struct 
vfio_platform_device, vdev); vfio_platform_release_common(vdev); - vfio_free_device(core_vdev); } static int vfio_platform_remove(struct platform_device *pdev) @@ -106,6 +105,9 @@ static const struct vfio_device_ops vfio_platform_ops = { .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static struct platform_driver vfio_platform_driver = { diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 55dc4f43c31e..1a0a238ffa35 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -72,12 +72,11 @@ static int vfio_platform_acpi_call_reset(struct vfio_platform_device *vdev, const char **extra_dbg) { #ifdef CONFIG_ACPI - struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; struct device *dev = vdev->device; acpi_handle handle = ACPI_HANDLE(dev); acpi_status acpi_ret; - acpi_ret = acpi_evaluate_object(handle, "_RST", NULL, &buffer); + acpi_ret = acpi_evaluate_object(handle, "_RST", NULL, NULL); if (ACPI_FAILURE(acpi_ret)) { if (extra_dbg) *extra_dbg = acpi_format_exception(acpi_ret); diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index bcad54bbab08..f8219a438bfb 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -6,14 +6,25 @@ #ifndef __VFIO_VFIO_H__ #define __VFIO_VFIO_H__ +#include <linux/file.h> #include <linux/device.h> #include <linux/cdev.h> #include <linux/module.h> +struct iommufd_ctx; struct iommu_group; struct vfio_device; struct vfio_container; +void vfio_device_put_registration(struct vfio_device *device); +bool vfio_device_try_get_registration(struct vfio_device *device); +int vfio_device_open(struct vfio_device *device, + struct iommufd_ctx *iommufd, struct kvm *kvm); +void vfio_device_close(struct vfio_device *device, + struct iommufd_ctx 
*iommufd); + +extern const struct file_operations vfio_device_fops; + enum vfio_group_type { /* * Physical device with IOMMU backing. @@ -54,14 +65,30 @@ struct vfio_group { struct list_head device_list; struct mutex device_lock; struct list_head vfio_next; +#if IS_ENABLED(CONFIG_VFIO_CONTAINER) struct list_head container_next; +#endif enum vfio_group_type type; struct mutex group_lock; struct kvm *kvm; struct file *opened_file; struct blocking_notifier_head notifier; + struct iommufd_ctx *iommufd; }; +int vfio_device_set_group(struct vfio_device *device, + enum vfio_group_type type); +void vfio_device_remove_group(struct vfio_device *device); +void vfio_device_group_register(struct vfio_device *device); +void vfio_device_group_unregister(struct vfio_device *device); +int vfio_device_group_use_iommu(struct vfio_device *device); +void vfio_device_group_unuse_iommu(struct vfio_device *device); +void vfio_device_group_close(struct vfio_device *device); +bool vfio_device_has_container(struct vfio_device *device); +int __init vfio_group_init(void); +void vfio_group_cleanup(void); + +#if IS_ENABLED(CONFIG_VFIO_CONTAINER) /* events for the backend driver notify callback */ enum vfio_iommu_notify_type { VFIO_IOMMU_CONTAINER_CLOSE = 0, @@ -109,20 +136,114 @@ struct vfio_iommu_driver { int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops); -bool vfio_assert_device_open(struct vfio_device *device); - struct vfio_container *vfio_container_from_file(struct file *filep); -int vfio_device_assign_container(struct vfio_device *device); -void vfio_device_unassign_container(struct vfio_device *device); +int vfio_group_use_container(struct vfio_group *group); +void vfio_group_unuse_container(struct vfio_group *group); int vfio_container_attach_group(struct vfio_container *container, struct vfio_group *group); void vfio_group_detach_container(struct vfio_group *group); void 
vfio_device_container_register(struct vfio_device *device); void vfio_device_container_unregister(struct vfio_device *device); -long vfio_container_ioctl_check_extension(struct vfio_container *container, - unsigned long arg); +int vfio_device_container_pin_pages(struct vfio_device *device, + dma_addr_t iova, int npage, + int prot, struct page **pages); +void vfio_device_container_unpin_pages(struct vfio_device *device, + dma_addr_t iova, int npage); +int vfio_device_container_dma_rw(struct vfio_device *device, + dma_addr_t iova, void *data, + size_t len, bool write); + int __init vfio_container_init(void); void vfio_container_cleanup(void); +#else +static inline struct vfio_container * +vfio_container_from_file(struct file *filep) +{ + return NULL; +} + +static inline int vfio_group_use_container(struct vfio_group *group) +{ + return -EOPNOTSUPP; +} + +static inline void vfio_group_unuse_container(struct vfio_group *group) +{ +} + +static inline int vfio_container_attach_group(struct vfio_container *container, + struct vfio_group *group) +{ + return -EOPNOTSUPP; +} + +static inline void vfio_group_detach_container(struct vfio_group *group) +{ +} + +static inline void vfio_device_container_register(struct vfio_device *device) +{ +} + +static inline void vfio_device_container_unregister(struct vfio_device *device) +{ +} + +static inline int vfio_device_container_pin_pages(struct vfio_device *device, + dma_addr_t iova, int npage, + int prot, struct page **pages) +{ + return -EOPNOTSUPP; +} + +static inline void vfio_device_container_unpin_pages(struct vfio_device *device, + dma_addr_t iova, int npage) +{ +} + +static inline int vfio_device_container_dma_rw(struct vfio_device *device, + dma_addr_t iova, void *data, + size_t len, bool write) +{ + return -EOPNOTSUPP; +} + +static inline int vfio_container_init(void) +{ + return 0; +} +static inline void vfio_container_cleanup(void) +{ +} +#endif + +#if IS_ENABLED(CONFIG_IOMMUFD) +int vfio_iommufd_bind(struct vfio_device 
*device, struct iommufd_ctx *ictx); +void vfio_iommufd_unbind(struct vfio_device *device); +#else +static inline int vfio_iommufd_bind(struct vfio_device *device, + struct iommufd_ctx *ictx) +{ + return -EOPNOTSUPP; +} + +static inline void vfio_iommufd_unbind(struct vfio_device *device) +{ +} +#endif + +#if IS_ENABLED(CONFIG_VFIO_VIRQFD) +int __init vfio_virqfd_init(void); +void vfio_virqfd_exit(void); +#else +static inline int __init vfio_virqfd_init(void) +{ + return 0; +} +static inline void vfio_virqfd_exit(void) +{ +} +#endif #ifdef CONFIG_VFIO_NOIOMMU extern bool vfio_noiommu __read_mostly; diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 169f07ac162d..60a50ce8701e 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -4,6 +4,7 @@ * * Copyright (C) 2013 IBM Corp. All rights reserved. * Author: Alexey Kardashevskiy <aik@ozlabs.ru> + * Copyright Gavin Shan, IBM Corporation 2014. * * Derived from original vfio_iommu_type1.c: * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 
@@ -773,6 +774,57 @@ static long tce_iommu_create_default_window(struct tce_container *container) return ret; } +static long vfio_spapr_ioctl_eeh_pe_op(struct iommu_group *group, + unsigned long arg) +{ + struct eeh_pe *pe; + struct vfio_eeh_pe_op op; + unsigned long minsz; + + pe = eeh_iommu_group_to_pe(group); + if (!pe) + return -ENODEV; + + minsz = offsetofend(struct vfio_eeh_pe_op, op); + if (copy_from_user(&op, (void __user *)arg, minsz)) + return -EFAULT; + if (op.argsz < minsz || op.flags) + return -EINVAL; + + switch (op.op) { + case VFIO_EEH_PE_DISABLE: + return eeh_pe_set_option(pe, EEH_OPT_DISABLE); + case VFIO_EEH_PE_ENABLE: + return eeh_pe_set_option(pe, EEH_OPT_ENABLE); + case VFIO_EEH_PE_UNFREEZE_IO: + return eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO); + case VFIO_EEH_PE_UNFREEZE_DMA: + return eeh_pe_set_option(pe, EEH_OPT_THAW_DMA); + case VFIO_EEH_PE_GET_STATE: + return eeh_pe_get_state(pe); + break; + case VFIO_EEH_PE_RESET_DEACTIVATE: + return eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true); + case VFIO_EEH_PE_RESET_HOT: + return eeh_pe_reset(pe, EEH_RESET_HOT, true); + case VFIO_EEH_PE_RESET_FUNDAMENTAL: + return eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true); + case VFIO_EEH_PE_CONFIGURE: + return eeh_pe_configure(pe); + case VFIO_EEH_PE_INJECT_ERR: + minsz = offsetofend(struct vfio_eeh_pe_op, err.mask); + if (op.argsz < minsz) + return -EINVAL; + if (copy_from_user(&op, (void __user *)arg, minsz)) + return -EFAULT; + + return eeh_pe_inject_err(pe, op.err.type, op.err.func, + op.err.addr, op.err.mask); + default: + return -EINVAL; + } +} + static long tce_iommu_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { @@ -785,14 +837,12 @@ static long tce_iommu_ioctl(void *iommu_data, switch (arg) { case VFIO_SPAPR_TCE_IOMMU: case VFIO_SPAPR_TCE_v2_IOMMU: - ret = 1; - break; + return 1; + case VFIO_EEH: + return eeh_enabled(); default: - ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg); - break; + return 0; } - - return (ret < 0) ? 
0 : ret; } /* @@ -1046,8 +1096,7 @@ static long tce_iommu_ioctl(void *iommu_data, ret = 0; list_for_each_entry(tcegrp, &container->group_list, next) { - ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp, - cmd, arg); + ret = vfio_spapr_ioctl_eeh_pe_op(tcegrp->grp, arg); if (ret) return ret; } diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 2d168793d4e1..5177bb061b17 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -13,8 +13,6 @@ #include <linux/cdev.h> #include <linux/compat.h> #include <linux/device.h> -#include <linux/file.h> -#include <linux/anon_inodes.h> #include <linux/fs.h> #include <linux/idr.h> #include <linux/iommu.h> @@ -35,6 +33,7 @@ #include <linux/pm_runtime.h> #include <linux/interval_tree.h> #include <linux/iova_bitmap.h> +#include <linux/iommufd.h> #include "vfio.h" #define DRIVER_VERSION "0.3" @@ -42,17 +41,11 @@ #define DRIVER_DESC "VFIO - User Level meta-driver" static struct vfio { - struct class *class; - struct list_head group_list; - struct mutex group_lock; /* locks group_list */ - struct ida group_ida; - dev_t group_devt; struct class *device_class; struct ida device_ida; } vfio; static DEFINE_XARRAY(vfio_device_set_xa); -static const struct file_operations vfio_group_fops; int vfio_assign_device_set(struct vfio_device *device, void *set_id) { @@ -125,208 +118,34 @@ static void vfio_release_device_set(struct vfio_device *device) xa_unlock(&vfio_device_set_xa); } -/* - * Group objects - create, release, get, put, search - */ -static struct vfio_group * -__vfio_group_get_from_iommu(struct iommu_group *iommu_group) -{ - struct vfio_group *group; - - /* - * group->iommu_group from the vfio.group_list cannot be NULL - * under the vfio.group_lock. 
- */ - list_for_each_entry(group, &vfio.group_list, vfio_next) { - if (group->iommu_group == iommu_group) { - refcount_inc(&group->drivers); - return group; - } - } - return NULL; -} - -static struct vfio_group * -vfio_group_get_from_iommu(struct iommu_group *iommu_group) -{ - struct vfio_group *group; - - mutex_lock(&vfio.group_lock); - group = __vfio_group_get_from_iommu(iommu_group); - mutex_unlock(&vfio.group_lock); - return group; -} - -static void vfio_group_release(struct device *dev) -{ - struct vfio_group *group = container_of(dev, struct vfio_group, dev); - - mutex_destroy(&group->device_lock); - mutex_destroy(&group->group_lock); - WARN_ON(group->iommu_group); - ida_free(&vfio.group_ida, MINOR(group->dev.devt)); - kfree(group); -} - -static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, - enum vfio_group_type type) -{ - struct vfio_group *group; - int minor; - - group = kzalloc(sizeof(*group), GFP_KERNEL); - if (!group) - return ERR_PTR(-ENOMEM); - - minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL); - if (minor < 0) { - kfree(group); - return ERR_PTR(minor); - } - - device_initialize(&group->dev); - group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor); - group->dev.class = vfio.class; - group->dev.release = vfio_group_release; - cdev_init(&group->cdev, &vfio_group_fops); - group->cdev.owner = THIS_MODULE; - - refcount_set(&group->drivers, 1); - mutex_init(&group->group_lock); - INIT_LIST_HEAD(&group->device_list); - mutex_init(&group->device_lock); - group->iommu_group = iommu_group; - /* put in vfio_group_release() */ - iommu_group_ref_get(iommu_group); - group->type = type; - BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); - - return group; -} - -static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, - enum vfio_group_type type) +unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set) { - struct vfio_group *group; - struct vfio_group *ret; - int err; - - group = 
vfio_group_alloc(iommu_group, type); - if (IS_ERR(group)) - return group; - - err = dev_set_name(&group->dev, "%s%d", - group->type == VFIO_NO_IOMMU ? "noiommu-" : "", - iommu_group_id(iommu_group)); - if (err) { - ret = ERR_PTR(err); - goto err_put; - } - - mutex_lock(&vfio.group_lock); - - /* Did we race creating this group? */ - ret = __vfio_group_get_from_iommu(iommu_group); - if (ret) - goto err_unlock; + struct vfio_device *cur; + unsigned int open_count = 0; - err = cdev_device_add(&group->cdev, &group->dev); - if (err) { - ret = ERR_PTR(err); - goto err_unlock; - } - - list_add(&group->vfio_next, &vfio.group_list); - - mutex_unlock(&vfio.group_lock); - return group; + lockdep_assert_held(&dev_set->lock); -err_unlock: - mutex_unlock(&vfio.group_lock); -err_put: - put_device(&group->dev); - return ret; -} - -static void vfio_device_remove_group(struct vfio_device *device) -{ - struct vfio_group *group = device->group; - struct iommu_group *iommu_group; - - if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) - iommu_group_remove_device(device->dev); - - /* Pairs with vfio_create_group() / vfio_group_get_from_iommu() */ - if (!refcount_dec_and_mutex_lock(&group->drivers, &vfio.group_lock)) - return; - list_del(&group->vfio_next); - - /* - * We could concurrently probe another driver in the group that might - * race vfio_device_remove_group() with vfio_get_group(), so we have to - * ensure that the sysfs is all cleaned up under lock otherwise the - * cdev_device_add() will fail due to the name aready existing. - */ - cdev_device_del(&group->cdev, &group->dev); - - mutex_lock(&group->group_lock); - /* - * These data structures all have paired operations that can only be - * undone when the caller holds a live reference on the device. Since - * all pairs must be undone these WARN_ON's indicate some caller did not - * properly hold the group reference. 
- */ - WARN_ON(!list_empty(&group->device_list)); - WARN_ON(group->notifier.head); - - /* - * Revoke all users of group->iommu_group. At this point we know there - * are no devices active because we are unplugging the last one. Setting - * iommu_group to NULL blocks all new users. - */ - if (group->container) - vfio_group_detach_container(group); - iommu_group = group->iommu_group; - group->iommu_group = NULL; - mutex_unlock(&group->group_lock); - mutex_unlock(&vfio.group_lock); - - iommu_group_put(iommu_group); - put_device(&group->dev); + list_for_each_entry(cur, &dev_set->device_list, dev_set_list) + open_count += cur->open_count; + return open_count; } +EXPORT_SYMBOL_GPL(vfio_device_set_open_count); /* * Device objects - create, release, get, put, search */ /* Device reference always implies a group reference */ -static void vfio_device_put_registration(struct vfio_device *device) +void vfio_device_put_registration(struct vfio_device *device) { if (refcount_dec_and_test(&device->refcount)) complete(&device->comp); } -static bool vfio_device_try_get_registration(struct vfio_device *device) +bool vfio_device_try_get_registration(struct vfio_device *device) { return refcount_inc_not_zero(&device->refcount); } -static struct vfio_device *vfio_group_get_device(struct vfio_group *group, - struct device *dev) -{ - struct vfio_device *device; - - mutex_lock(&group->device_lock); - list_for_each_entry(device, &group->device_list, group_next) { - if (device->dev == dev && - vfio_device_try_get_registration(device)) { - mutex_unlock(&group->device_lock); - return device; - } - } - mutex_unlock(&group->device_lock); - return NULL; -} - /* * VFIO driver API */ @@ -339,15 +158,15 @@ static void vfio_device_release(struct device *dev) vfio_release_device_set(device); ida_free(&vfio.device_ida, device->index); - /* - * kvfree() cannot be done here due to a life cycle mess in - * vfio-ccw. 
Before the ccw part is fixed all drivers are - * required to support @release and call vfio_free_device() - * from there. - */ - device->ops->release(device); + if (device->ops->release) + device->ops->release(device); + + kvfree(device); } +static int vfio_init_device(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops); + /* * Allocate and initialize vfio_device so it can be registered to vfio * core. @@ -386,11 +205,9 @@ EXPORT_SYMBOL_GPL(_vfio_alloc_device); /* * Initialize a vfio_device so it can be registered to vfio core. - * - * Only vfio-ccw driver should call this interface. */ -int vfio_init_device(struct vfio_device *device, struct device *dev, - const struct vfio_device_ops *ops) +static int vfio_init_device(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops) { int ret; @@ -422,107 +239,16 @@ out_uninit: ida_free(&vfio.device_ida, device->index); return ret; } -EXPORT_SYMBOL_GPL(vfio_init_device); - -/* - * The helper called by driver @release callback to free the device - * structure. Drivers which don't have private data to clean can - * simply use this helper as its @release. 
- */ -void vfio_free_device(struct vfio_device *device) -{ - kvfree(device); -} -EXPORT_SYMBOL_GPL(vfio_free_device); - -static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev, - enum vfio_group_type type) -{ - struct iommu_group *iommu_group; - struct vfio_group *group; - int ret; - - iommu_group = iommu_group_alloc(); - if (IS_ERR(iommu_group)) - return ERR_CAST(iommu_group); - - ret = iommu_group_set_name(iommu_group, "vfio-noiommu"); - if (ret) - goto out_put_group; - ret = iommu_group_add_device(iommu_group, dev); - if (ret) - goto out_put_group; - - group = vfio_create_group(iommu_group, type); - if (IS_ERR(group)) { - ret = PTR_ERR(group); - goto out_remove_device; - } - iommu_group_put(iommu_group); - return group; - -out_remove_device: - iommu_group_remove_device(dev); -out_put_group: - iommu_group_put(iommu_group); - return ERR_PTR(ret); -} - -static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) -{ - struct iommu_group *iommu_group; - struct vfio_group *group; - - iommu_group = iommu_group_get(dev); - if (!iommu_group && vfio_noiommu) { - /* - * With noiommu enabled, create an IOMMU group for devices that - * don't already have one, implying no IOMMU hardware/driver - * exists. Taint the kernel because we're about to give a DMA - * capable device to a user without IOMMU protection. - */ - group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU); - if (!IS_ERR(group)) { - add_taint(TAINT_USER, LOCKDEP_STILL_OK); - dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n"); - } - return group; - } - - if (!iommu_group) - return ERR_PTR(-EINVAL); - - /* - * VFIO always sets IOMMU_CACHE because we offer no way for userspace to - * restore cache coherency. It has to be checked here because it is only - * valid for cases where we are using iommu groups. 
- */ - if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) { - iommu_group_put(iommu_group); - return ERR_PTR(-EINVAL); - } - - group = vfio_group_get_from_iommu(iommu_group); - if (!group) - group = vfio_create_group(iommu_group, VFIO_IOMMU); - - /* The vfio_group holds a reference to the iommu_group */ - iommu_group_put(iommu_group); - return group; -} static int __vfio_register_dev(struct vfio_device *device, - struct vfio_group *group) + enum vfio_group_type type) { - struct vfio_device *existing_device; int ret; - /* - * In all cases group is the output of one of the group allocation - * functions and we have group->drivers incremented for us. - */ - if (IS_ERR(group)) - return PTR_ERR(group); + if (WARN_ON(device->ops->bind_iommufd && + (!device->ops->unbind_iommufd || + !device->ops->attach_ioas))) + return -EINVAL; /* * If the driver doesn't specify a set then the device is added to a @@ -531,25 +257,13 @@ static int __vfio_register_dev(struct vfio_device *device, if (!device->dev_set) vfio_assign_device_set(device, device); - existing_device = vfio_group_get_device(group, device->dev); - if (existing_device) { - /* - * group->iommu_group is non-NULL because we hold the drivers - * refcount. 
- */ - dev_WARN(device->dev, "Device already exists on group %d\n", - iommu_group_id(group->iommu_group)); - vfio_device_put_registration(existing_device); - ret = -EBUSY; - goto err_out; - } - - /* Our reference on group is moved to the device */ - device->group = group; - ret = dev_set_name(&device->device, "vfio%d", device->index); if (ret) - goto err_out; + return ret; + + ret = vfio_device_set_group(device, type); + if (ret) + return ret; ret = device_add(&device->device); if (ret) @@ -558,9 +272,7 @@ static int __vfio_register_dev(struct vfio_device *device, /* Refcounting can't start until the driver calls register */ refcount_set(&device->refcount, 1); - mutex_lock(&group->device_lock); - list_add(&device->group_next, &group->device_list); - mutex_unlock(&group->device_lock); + vfio_device_group_register(device); return 0; err_out: @@ -570,8 +282,7 @@ err_out: int vfio_register_group_dev(struct vfio_device *device) { - return __vfio_register_dev(device, - vfio_group_find_or_alloc(device->dev)); + return __vfio_register_dev(device, VFIO_IOMMU); } EXPORT_SYMBOL_GPL(vfio_register_group_dev); @@ -581,46 +292,15 @@ EXPORT_SYMBOL_GPL(vfio_register_group_dev); */ int vfio_register_emulated_iommu_dev(struct vfio_device *device) { - return __vfio_register_dev(device, - vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU)); + return __vfio_register_dev(device, VFIO_EMULATED_IOMMU); } EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev); -static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, - char *buf) -{ - struct vfio_device *it, *device = ERR_PTR(-ENODEV); - - mutex_lock(&group->device_lock); - list_for_each_entry(it, &group->device_list, group_next) { - int ret; - - if (it->ops->match) { - ret = it->ops->match(it, buf); - if (ret < 0) { - device = ERR_PTR(ret); - break; - } - } else { - ret = !strcmp(dev_name(it->dev), buf); - } - - if (ret && vfio_device_try_get_registration(it)) { - device = it; - break; - } - } - 
mutex_unlock(&group->device_lock); - - return device; -} - /* * Decrement the device reference count and wait for the device to be * removed. Open file descriptors for the device... */ void vfio_unregister_group_dev(struct vfio_device *device) { - struct vfio_group *group = device->group; unsigned int i = 0; bool interrupted = false; long rc; @@ -648,332 +328,101 @@ void vfio_unregister_group_dev(struct vfio_device *device) } } - mutex_lock(&group->device_lock); - list_del(&device->group_next); - mutex_unlock(&group->device_lock); + vfio_device_group_unregister(device); /* Balances device_add in register path */ device_del(&device->device); + /* Balances vfio_device_set_group in register path */ vfio_device_remove_group(device); } EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); -/* - * VFIO Group fd, /dev/vfio/$GROUP - */ -/* - * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or - * if there was no container to unset. Since the ioctl is called on - * the group, we know that still exists, therefore the only valid - * transition here is 1->0. 
- */ -static int vfio_group_ioctl_unset_container(struct vfio_group *group) -{ - int ret = 0; - - mutex_lock(&group->group_lock); - if (!group->container) { - ret = -EINVAL; - goto out_unlock; - } - if (group->container_users != 1) { - ret = -EBUSY; - goto out_unlock; - } - vfio_group_detach_container(group); - -out_unlock: - mutex_unlock(&group->group_lock); - return ret; -} - -static int vfio_group_ioctl_set_container(struct vfio_group *group, - int __user *arg) -{ - struct vfio_container *container; - struct fd f; - int ret; - int fd; - - if (get_user(fd, arg)) - return -EFAULT; - - f = fdget(fd); - if (!f.file) - return -EBADF; - - mutex_lock(&group->group_lock); - if (group->container || WARN_ON(group->container_users)) { - ret = -EINVAL; - goto out_unlock; - } - if (!group->iommu_group) { - ret = -ENODEV; - goto out_unlock; - } - - container = vfio_container_from_file(f.file); - ret = -EINVAL; - if (container) { - ret = vfio_container_attach_group(container, group); - goto out_unlock; - } - -out_unlock: - mutex_unlock(&group->group_lock); - fdput(f); - return ret; -} - -static const struct file_operations vfio_device_fops; - /* true if the vfio_device has open_device() called but not close_device() */ -bool vfio_assert_device_open(struct vfio_device *device) +static bool vfio_assert_device_open(struct vfio_device *device) { return !WARN_ON_ONCE(!READ_ONCE(device->open_count)); } -static struct file *vfio_device_open(struct vfio_device *device) +static int vfio_device_first_open(struct vfio_device *device, + struct iommufd_ctx *iommufd, struct kvm *kvm) { - struct file *filep; int ret; - mutex_lock(&device->group->group_lock); - ret = vfio_device_assign_container(device); - mutex_unlock(&device->group->group_lock); - if (ret) - return ERR_PTR(ret); - - if (!try_module_get(device->dev->driver->owner)) { - ret = -ENODEV; - goto err_unassign_container; - } + lockdep_assert_held(&device->dev_set->lock); - mutex_lock(&device->dev_set->lock); - device->open_count++; 
- if (device->open_count == 1) { - /* - * Here we pass the KVM pointer with the group under the read - * lock. If the device driver will use it, it must obtain a - * reference and release it during close_device. - */ - mutex_lock(&device->group->group_lock); - device->kvm = device->group->kvm; + if (!try_module_get(device->dev->driver->owner)) + return -ENODEV; - if (device->ops->open_device) { - ret = device->ops->open_device(device); - if (ret) - goto err_undo_count; - } - vfio_device_container_register(device); - mutex_unlock(&device->group->group_lock); - } - mutex_unlock(&device->dev_set->lock); + if (iommufd) + ret = vfio_iommufd_bind(device, iommufd); + else + ret = vfio_device_group_use_iommu(device); + if (ret) + goto err_module_put; - /* - * We can't use anon_inode_getfd() because we need to modify - * the f_mode flags directly to allow more than just ioctls - */ - filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, - device, O_RDWR); - if (IS_ERR(filep)) { - ret = PTR_ERR(filep); - goto err_close_device; + device->kvm = kvm; + if (device->ops->open_device) { + ret = device->ops->open_device(device); + if (ret) + goto err_unuse_iommu; } + return 0; - /* - * TODO: add an anon_inode interface to do this. - * Appears to be missing by lack of need rather than - * explicitly prevented. Now there's need. 
- */ - filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE); - - if (device->group->type == VFIO_NO_IOMMU) - dev_warn(device->dev, "vfio-noiommu device opened by user " - "(%s:%d)\n", current->comm, task_pid_nr(current)); - /* - * On success the ref of device is moved to the file and - * put in vfio_device_fops_release() - */ - return filep; - -err_close_device: - mutex_lock(&device->dev_set->lock); - mutex_lock(&device->group->group_lock); - if (device->open_count == 1 && device->ops->close_device) { - device->ops->close_device(device); - - vfio_device_container_unregister(device); - } -err_undo_count: - mutex_unlock(&device->group->group_lock); - device->open_count--; - if (device->open_count == 0 && device->kvm) - device->kvm = NULL; - mutex_unlock(&device->dev_set->lock); +err_unuse_iommu: + device->kvm = NULL; + if (iommufd) + vfio_iommufd_unbind(device); + else + vfio_device_group_unuse_iommu(device); +err_module_put: module_put(device->dev->driver->owner); -err_unassign_container: - vfio_device_unassign_container(device); - return ERR_PTR(ret); -} - -static int vfio_group_ioctl_get_device_fd(struct vfio_group *group, - char __user *arg) -{ - struct vfio_device *device; - struct file *filep; - char *buf; - int fdno; - int ret; - - buf = strndup_user(arg, PAGE_SIZE); - if (IS_ERR(buf)) - return PTR_ERR(buf); - - device = vfio_device_get_from_name(group, buf); - kfree(buf); - if (IS_ERR(device)) - return PTR_ERR(device); - - fdno = get_unused_fd_flags(O_CLOEXEC); - if (fdno < 0) { - ret = fdno; - goto err_put_device; - } - - filep = vfio_device_open(device); - if (IS_ERR(filep)) { - ret = PTR_ERR(filep); - goto err_put_fdno; - } - - fd_install(fdno, filep); - return fdno; - -err_put_fdno: - put_unused_fd(fdno); -err_put_device: - vfio_device_put_registration(device); return ret; } -static int vfio_group_ioctl_get_status(struct vfio_group *group, - struct vfio_group_status __user *arg) -{ - unsigned long minsz = offsetofend(struct vfio_group_status, flags); - struct 
vfio_group_status status; - - if (copy_from_user(&status, arg, minsz)) - return -EFAULT; - - if (status.argsz < minsz) - return -EINVAL; - - status.flags = 0; - - mutex_lock(&group->group_lock); - if (!group->iommu_group) { - mutex_unlock(&group->group_lock); - return -ENODEV; - } - - if (group->container) - status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | - VFIO_GROUP_FLAGS_VIABLE; - else if (!iommu_group_dma_owner_claimed(group->iommu_group)) - status.flags |= VFIO_GROUP_FLAGS_VIABLE; - mutex_unlock(&group->group_lock); - - if (copy_to_user(arg, &status, minsz)) - return -EFAULT; - return 0; -} - -static long vfio_group_fops_unl_ioctl(struct file *filep, - unsigned int cmd, unsigned long arg) +static void vfio_device_last_close(struct vfio_device *device, + struct iommufd_ctx *iommufd) { - struct vfio_group *group = filep->private_data; - void __user *uarg = (void __user *)arg; + lockdep_assert_held(&device->dev_set->lock); - switch (cmd) { - case VFIO_GROUP_GET_DEVICE_FD: - return vfio_group_ioctl_get_device_fd(group, uarg); - case VFIO_GROUP_GET_STATUS: - return vfio_group_ioctl_get_status(group, uarg); - case VFIO_GROUP_SET_CONTAINER: - return vfio_group_ioctl_set_container(group, uarg); - case VFIO_GROUP_UNSET_CONTAINER: - return vfio_group_ioctl_unset_container(group); - default: - return -ENOTTY; - } + if (device->ops->close_device) + device->ops->close_device(device); + device->kvm = NULL; + if (iommufd) + vfio_iommufd_unbind(device); + else + vfio_device_group_unuse_iommu(device); + module_put(device->dev->driver->owner); } -static int vfio_group_fops_open(struct inode *inode, struct file *filep) +int vfio_device_open(struct vfio_device *device, + struct iommufd_ctx *iommufd, struct kvm *kvm) { - struct vfio_group *group = - container_of(inode->i_cdev, struct vfio_group, cdev); - int ret; - - mutex_lock(&group->group_lock); - - /* - * drivers can be zero if this races with vfio_device_remove_group(), it - * will be stable at 0 under the group rwsem - */ - 
if (refcount_read(&group->drivers) == 0) { - ret = -ENODEV; - goto out_unlock; - } + int ret = 0; - if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) { - ret = -EPERM; - goto out_unlock; + mutex_lock(&device->dev_set->lock); + device->open_count++; + if (device->open_count == 1) { + ret = vfio_device_first_open(device, iommufd, kvm); + if (ret) + device->open_count--; } + mutex_unlock(&device->dev_set->lock); - /* - * Do we need multiple instances of the group open? Seems not. - */ - if (group->opened_file) { - ret = -EBUSY; - goto out_unlock; - } - group->opened_file = filep; - filep->private_data = group; - ret = 0; -out_unlock: - mutex_unlock(&group->group_lock); return ret; } -static int vfio_group_fops_release(struct inode *inode, struct file *filep) +void vfio_device_close(struct vfio_device *device, + struct iommufd_ctx *iommufd) { - struct vfio_group *group = filep->private_data; - - filep->private_data = NULL; - - mutex_lock(&group->group_lock); - /* - * Device FDs hold a group file reference, therefore the group release - * is only called when there are no open devices. - */ - WARN_ON(group->notifier.head); - if (group->container) - vfio_group_detach_container(group); - group->opened_file = NULL; - mutex_unlock(&group->group_lock); - return 0; + mutex_lock(&device->dev_set->lock); + vfio_assert_device_open(device); + if (device->open_count == 1) + vfio_device_last_close(device, iommufd); + device->open_count--; + mutex_unlock(&device->dev_set->lock); } -static const struct file_operations vfio_group_fops = { - .owner = THIS_MODULE, - .unlocked_ioctl = vfio_group_fops_unl_ioctl, - .compat_ioctl = compat_ptr_ioctl, - .open = vfio_group_fops_open, - .release = vfio_group_fops_release, -}; - /* * Wrapper around pm_runtime_resume_and_get(). * Return error code on failure or 0 on success. 
@@ -1014,22 +463,7 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) { struct vfio_device *device = filep->private_data; - mutex_lock(&device->dev_set->lock); - vfio_assert_device_open(device); - mutex_lock(&device->group->group_lock); - if (device->open_count == 1 && device->ops->close_device) - device->ops->close_device(device); - - vfio_device_container_unregister(device); - mutex_unlock(&device->group->group_lock); - device->open_count--; - if (device->open_count == 0) - device->kvm = NULL; - mutex_unlock(&device->dev_set->lock); - - module_put(device->dev->driver->owner); - - vfio_device_unassign_container(device); + vfio_device_group_close(device); vfio_device_put_registration(device); @@ -1056,7 +490,7 @@ int vfio_mig_get_next_state(struct vfio_device *device, enum vfio_device_mig_state new_fsm, enum vfio_device_mig_state *next_fsm) { - enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 }; + enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 }; /* * The coding in this table requires the driver to implement the * following FSM arcs: @@ -1071,30 +505,65 @@ int vfio_mig_get_next_state(struct vfio_device *device, * RUNNING_P2P -> RUNNING * RUNNING_P2P -> STOP * STOP -> RUNNING_P2P - * Without P2P the driver must implement: + * + * If precopy is supported then the driver must support these additional + * FSM arcs: + * RUNNING -> PRE_COPY + * PRE_COPY -> RUNNING + * PRE_COPY -> STOP_COPY + * However, if precopy and P2P are supported together then the driver + * must support these additional arcs beyond the P2P arcs above: + * PRE_COPY -> RUNNING + * PRE_COPY -> PRE_COPY_P2P + * PRE_COPY_P2P -> PRE_COPY + * PRE_COPY_P2P -> RUNNING_P2P + * PRE_COPY_P2P -> STOP_COPY + * RUNNING -> PRE_COPY + * RUNNING_P2P -> PRE_COPY_P2P + * + * Without P2P and precopy the driver must implement: * RUNNING -> STOP * STOP -> RUNNING * * The coding will step through multiple states for some combination * transitions; if all 
optional features are supported, this means the * following ones: + * PRE_COPY -> PRE_COPY_P2P -> STOP_COPY + * PRE_COPY -> RUNNING -> RUNNING_P2P + * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP + * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING + * PRE_COPY_P2P -> RUNNING_P2P -> RUNNING + * PRE_COPY_P2P -> RUNNING_P2P -> STOP + * PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING * RESUMING -> STOP -> RUNNING_P2P + * RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P * RESUMING -> STOP -> RUNNING_P2P -> RUNNING + * RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY * RESUMING -> STOP -> STOP_COPY + * RUNNING -> RUNNING_P2P -> PRE_COPY_P2P * RUNNING -> RUNNING_P2P -> STOP * RUNNING -> RUNNING_P2P -> STOP -> RESUMING * RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY + * RUNNING_P2P -> RUNNING -> PRE_COPY * RUNNING_P2P -> STOP -> RESUMING * RUNNING_P2P -> STOP -> STOP_COPY + * STOP -> RUNNING_P2P -> PRE_COPY_P2P * STOP -> RUNNING_P2P -> RUNNING + * STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY * STOP_COPY -> STOP -> RESUMING * STOP_COPY -> STOP -> RUNNING_P2P * STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING + * + * The following transitions are blocked: + * STOP_COPY -> PRE_COPY + * STOP_COPY -> PRE_COPY_P2P */ static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = { [VFIO_DEVICE_STATE_STOP] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, @@ -1103,14 +572,38 @@ int vfio_mig_get_next_state(struct vfio_device *device, [VFIO_DEVICE_STATE_RUNNING] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, + 
[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, }, + [VFIO_DEVICE_STATE_PRE_COPY] = { + [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, + [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P, + [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, + }, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = { + [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, + [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, + [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, + }, [VFIO_DEVICE_STATE_STOP_COPY] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP, @@ -1119,6 +612,8 @@ int vfio_mig_get_next_state(struct vfio_device *device, [VFIO_DEVICE_STATE_RESUMING] = { 
[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP, @@ -1127,6 +622,8 @@ int vfio_mig_get_next_state(struct vfio_device *device, [VFIO_DEVICE_STATE_RUNNING_P2P] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, @@ -1135,6 +632,8 @@ int vfio_mig_get_next_state(struct vfio_device *device, [VFIO_DEVICE_STATE_ERROR] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR, @@ -1145,6 +644,11 @@ int vfio_mig_get_next_state(struct vfio_device *device, static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = { [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY, [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY, + [VFIO_DEVICE_STATE_PRE_COPY] = + VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY | + VFIO_MIGRATION_P2P | + VFIO_MIGRATION_PRE_COPY, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY, [VFIO_DEVICE_STATE_RESUMING] = 
VFIO_MIGRATION_STOP_COPY, [VFIO_DEVICE_STATE_RUNNING_P2P] = @@ -1256,6 +760,34 @@ out_copy: return 0; } +static int +vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device, + u32 flags, void __user *arg, + size_t argsz) +{ + struct vfio_device_feature_mig_data_size data_size = {}; + unsigned long stop_copy_length; + int ret; + + if (!device->mig_ops) + return -ENOTTY; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, + sizeof(data_size)); + if (ret != 1) + return ret; + + ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length); + if (ret) + return ret; + + data_size.stop_copy_length = stop_copy_length; + if (copy_to_user(arg, &data_size, sizeof(data_size))) + return -EFAULT; + + return 0; +} + static int vfio_ioctl_device_feature_migration(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz) @@ -1483,6 +1015,10 @@ static int vfio_ioctl_device_feature(struct vfio_device *device, return vfio_ioctl_device_feature_logging_report( device, feature.flags, arg->data, feature.argsz - minsz); + case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE: + return vfio_ioctl_device_feature_migration_data_size( + device, feature.flags, arg->data, + feature.argsz - minsz); default: if (unlikely(!device->ops->device_feature)) return -EINVAL; @@ -1552,7 +1088,7 @@ static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) return device->ops->mmap(device, vma); } -static const struct file_operations vfio_device_fops = { +const struct file_operations vfio_device_fops = { .owner = THIS_MODULE, .release = vfio_device_fops_release, .read = vfio_device_fops_read, @@ -1562,118 +1098,6 @@ static const struct file_operations vfio_device_fops = { .mmap = vfio_device_fops_mmap, }; -/** - * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file - * @file: VFIO group file - * - * The returned iommu_group is valid as long as a ref is held on the file. This - * returns a reference on the group. 
This function is deprecated, only the SPAPR - * path in kvm should call it. - */ -struct iommu_group *vfio_file_iommu_group(struct file *file) -{ - struct vfio_group *group = file->private_data; - struct iommu_group *iommu_group = NULL; - - if (!IS_ENABLED(CONFIG_SPAPR_TCE_IOMMU)) - return NULL; - - if (!vfio_file_is_group(file)) - return NULL; - - mutex_lock(&group->group_lock); - if (group->iommu_group) { - iommu_group = group->iommu_group; - iommu_group_ref_get(iommu_group); - } - mutex_unlock(&group->group_lock); - return iommu_group; -} -EXPORT_SYMBOL_GPL(vfio_file_iommu_group); - -/** - * vfio_file_is_group - True if the file is usable with VFIO aPIS - * @file: VFIO group file - */ -bool vfio_file_is_group(struct file *file) -{ - return file->f_op == &vfio_group_fops; -} -EXPORT_SYMBOL_GPL(vfio_file_is_group); - -/** - * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file - * is always CPU cache coherent - * @file: VFIO group file - * - * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop - * bit in DMA transactions. A return of false indicates that the user has - * rights to access additional instructions such as wbinvd on x86. - */ -bool vfio_file_enforced_coherent(struct file *file) -{ - struct vfio_group *group = file->private_data; - bool ret; - - if (!vfio_file_is_group(file)) - return true; - - mutex_lock(&group->group_lock); - if (group->container) { - ret = vfio_container_ioctl_check_extension(group->container, - VFIO_DMA_CC_IOMMU); - } else { - /* - * Since the coherency state is determined only once a container - * is attached the user must do so before they can prove they - * have permission. 
- */ - ret = true; - } - mutex_unlock(&group->group_lock); - return ret; -} -EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); - -/** - * vfio_file_set_kvm - Link a kvm with VFIO drivers - * @file: VFIO group file - * @kvm: KVM to link - * - * When a VFIO device is first opened the KVM will be available in - * device->kvm if one was associated with the group. - */ -void vfio_file_set_kvm(struct file *file, struct kvm *kvm) -{ - struct vfio_group *group = file->private_data; - - if (!vfio_file_is_group(file)) - return; - - mutex_lock(&group->group_lock); - group->kvm = kvm; - mutex_unlock(&group->group_lock); -} -EXPORT_SYMBOL_GPL(vfio_file_set_kvm); - -/** - * vfio_file_has_dev - True if the VFIO file is a handle for device - * @file: VFIO file to check - * @device: Device that must be part of the file - * - * Returns true if given file has permission to manipulate the given device. - */ -bool vfio_file_has_dev(struct file *file, struct vfio_device *device) -{ - struct vfio_group *group = file->private_data; - - if (!vfio_file_is_group(file)) - return false; - - return group == device->group; -} -EXPORT_SYMBOL_GPL(vfio_file_has_dev); - /* * Sub-module support */ @@ -1794,34 +1218,139 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); /* - * Module/class support + * Pin contiguous user pages and return their associated host pages for local + * domain only. + * @device [in] : device + * @iova [in] : starting IOVA of user pages to be pinned. + * @npage [in] : count of pages to be pinned. This count should not + * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. + * @prot [in] : protection flags + * @pages[out] : array of host pages + * Return error or number of pages pinned. + * + * A driver may only call this function if the vfio_device was created + * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages(). 
*/ -static char *vfio_devnode(struct device *dev, umode_t *mode) +int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, + int npage, int prot, struct page **pages) { - return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev)); + /* group->container cannot change while a vfio device is open */ + if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device))) + return -EINVAL; + if (vfio_device_has_container(device)) + return vfio_device_container_pin_pages(device, iova, + npage, prot, pages); + if (device->iommufd_access) { + int ret; + + if (iova > ULONG_MAX) + return -EINVAL; + /* + * VFIO ignores the sub page offset, npages is from the start of + * a PAGE_SIZE chunk of IOVA. The caller is expected to recover + * the sub page offset by doing: + * pages[0] + (iova % PAGE_SIZE) + */ + ret = iommufd_access_pin_pages( + device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE), + npage * PAGE_SIZE, pages, + (prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0); + if (ret) + return ret; + return npage; + } + return -EINVAL; +} +EXPORT_SYMBOL(vfio_pin_pages); + +/* + * Unpin contiguous host pages for local domain only. + * @device [in] : device + * @iova [in] : starting address of user pages to be unpinned. + * @npage [in] : count of pages to be unpinned. This count should not + * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. + */ +void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) +{ + if (WARN_ON(!vfio_assert_device_open(device))) + return; + + if (vfio_device_has_container(device)) { + vfio_device_container_unpin_pages(device, iova, npage); + return; + } + if (device->iommufd_access) { + if (WARN_ON(iova > ULONG_MAX)) + return; + iommufd_access_unpin_pages(device->iommufd_access, + ALIGN_DOWN(iova, PAGE_SIZE), + npage * PAGE_SIZE); + return; + } } +EXPORT_SYMBOL(vfio_unpin_pages); +/* + * This interface allows the CPUs to perform some sort of virtual DMA on + * behalf of the device. 
+ * + * CPUs read/write from/into a range of IOVAs pointing to user space memory + * into/from a kernel buffer. + * + * As the read/write of user space memory is conducted via the CPUs and is + * not a real device DMA, it is not necessary to pin the user space memory. + * + * @device [in] : VFIO device + * @iova [in] : base IOVA of a user space buffer + * @data [in] : pointer to kernel buffer + * @len [in] : kernel buffer length + * @write : indicate read or write + * Return error code on failure or 0 on success. + */ +int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data, + size_t len, bool write) +{ + if (!data || len <= 0 || !vfio_assert_device_open(device)) + return -EINVAL; + + if (vfio_device_has_container(device)) + return vfio_device_container_dma_rw(device, iova, + data, len, write); + + if (device->iommufd_access) { + unsigned int flags = 0; + + if (iova > ULONG_MAX) + return -EINVAL; + + /* VFIO historically tries to auto-detect a kthread */ + if (!current->mm) + flags |= IOMMUFD_ACCESS_RW_KTHREAD; + if (write) + flags |= IOMMUFD_ACCESS_RW_WRITE; + return iommufd_access_rw(device->iommufd_access, iova, data, + len, flags); + } + return -EINVAL; +} +EXPORT_SYMBOL(vfio_dma_rw); + +/* + * Module/class support + */ static int __init vfio_init(void) { int ret; - ida_init(&vfio.group_ida); ida_init(&vfio.device_ida); - mutex_init(&vfio.group_lock); - INIT_LIST_HEAD(&vfio.group_list); - ret = vfio_container_init(); + ret = vfio_group_init(); if (ret) return ret; - /* /dev/vfio/$GROUP */ - vfio.class = class_create(THIS_MODULE, "vfio"); - if (IS_ERR(vfio.class)) { - ret = PTR_ERR(vfio.class); - goto err_group_class; - } - - vfio.class->devnode = vfio_devnode; + ret = vfio_virqfd_init(); + if (ret) + goto err_virqfd; /* /sys/class/vfio-dev/vfioX */ vfio.device_class = class_create(THIS_MODULE, "vfio-dev"); @@ -1830,36 +1359,23 @@ static int __init vfio_init(void) goto err_dev_class; } - ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 
1, "vfio"); - if (ret) - goto err_alloc_chrdev; - pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); return 0; -err_alloc_chrdev: - class_destroy(vfio.device_class); - vfio.device_class = NULL; err_dev_class: - class_destroy(vfio.class); - vfio.class = NULL; -err_group_class: - vfio_container_cleanup(); + vfio_virqfd_exit(); +err_virqfd: + vfio_group_cleanup(); return ret; } static void __exit vfio_cleanup(void) { - WARN_ON(!list_empty(&vfio.group_list)); - ida_destroy(&vfio.device_ida); - ida_destroy(&vfio.group_ida); - unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); class_destroy(vfio.device_class); vfio.device_class = NULL; - class_destroy(vfio.class); - vfio_container_cleanup(); - vfio.class = NULL; + vfio_virqfd_exit(); + vfio_group_cleanup(); xa_destroy(&vfio_device_set_xa); } @@ -1870,6 +1386,4 @@ MODULE_VERSION(DRIVER_VERSION); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); -MODULE_ALIAS_MISCDEV(VFIO_MINOR); -MODULE_ALIAS("devname:vfio/vfio"); MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce"); diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c deleted file mode 100644 index 67f55ac1d459..000000000000 --- a/drivers/vfio/vfio_spapr_eeh.c +++ /dev/null @@ -1,107 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * EEH functionality support for VFIO devices. The feature is only - * available on sPAPR compatible platforms. - * - * Copyright Gavin Shan, IBM Corporation 2014. 
- */ - -#include <linux/module.h> -#include <linux/uaccess.h> -#include <linux/vfio.h> -#include <asm/eeh.h> - -#define DRIVER_VERSION "0.1" -#define DRIVER_AUTHOR "Gavin Shan, IBM Corporation" -#define DRIVER_DESC "VFIO IOMMU SPAPR EEH" - -/* We might build address mapping here for "fast" path later */ -void vfio_spapr_pci_eeh_open(struct pci_dev *pdev) -{ - eeh_dev_open(pdev); -} -EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_open); - -void vfio_spapr_pci_eeh_release(struct pci_dev *pdev) -{ - eeh_dev_release(pdev); -} -EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_release); - -long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, - unsigned int cmd, unsigned long arg) -{ - struct eeh_pe *pe; - struct vfio_eeh_pe_op op; - unsigned long minsz; - long ret = -EINVAL; - - switch (cmd) { - case VFIO_CHECK_EXTENSION: - if (arg == VFIO_EEH) - ret = eeh_enabled() ? 1 : 0; - else - ret = 0; - break; - case VFIO_EEH_PE_OP: - pe = eeh_iommu_group_to_pe(group); - if (!pe) - return -ENODEV; - - minsz = offsetofend(struct vfio_eeh_pe_op, op); - if (copy_from_user(&op, (void __user *)arg, minsz)) - return -EFAULT; - if (op.argsz < minsz || op.flags) - return -EINVAL; - - switch (op.op) { - case VFIO_EEH_PE_DISABLE: - ret = eeh_pe_set_option(pe, EEH_OPT_DISABLE); - break; - case VFIO_EEH_PE_ENABLE: - ret = eeh_pe_set_option(pe, EEH_OPT_ENABLE); - break; - case VFIO_EEH_PE_UNFREEZE_IO: - ret = eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO); - break; - case VFIO_EEH_PE_UNFREEZE_DMA: - ret = eeh_pe_set_option(pe, EEH_OPT_THAW_DMA); - break; - case VFIO_EEH_PE_GET_STATE: - ret = eeh_pe_get_state(pe); - break; - case VFIO_EEH_PE_RESET_DEACTIVATE: - ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true); - break; - case VFIO_EEH_PE_RESET_HOT: - ret = eeh_pe_reset(pe, EEH_RESET_HOT, true); - break; - case VFIO_EEH_PE_RESET_FUNDAMENTAL: - ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true); - break; - case VFIO_EEH_PE_CONFIGURE: - ret = eeh_pe_configure(pe); - break; - case VFIO_EEH_PE_INJECT_ERR: - minsz = 
offsetofend(struct vfio_eeh_pe_op, err.mask); - if (op.argsz < minsz) - return -EINVAL; - if (copy_from_user(&op, (void __user *)arg, minsz)) - return -EFAULT; - - ret = eeh_pe_inject_err(pe, op.err.type, op.err.func, - op.err.addr, op.err.mask); - break; - default: - ret = -EINVAL; - } - } - - return ret; -} -EXPORT_SYMBOL_GPL(vfio_spapr_iommu_eeh_ioctl); - -MODULE_VERSION(DRIVER_VERSION); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR(DRIVER_AUTHOR); -MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c index 414e98d82b02..497a17b37865 100644 --- a/drivers/vfio/virqfd.c +++ b/drivers/vfio/virqfd.c @@ -12,15 +12,12 @@ #include <linux/file.h> #include <linux/module.h> #include <linux/slab.h> - -#define DRIVER_VERSION "0.1" -#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" -#define DRIVER_DESC "IRQFD support for VFIO bus drivers" +#include "vfio.h" static struct workqueue_struct *vfio_irqfd_cleanup_wq; static DEFINE_SPINLOCK(virqfd_lock); -static int __init vfio_virqfd_init(void) +int __init vfio_virqfd_init(void) { vfio_irqfd_cleanup_wq = create_singlethread_workqueue("vfio-irqfd-cleanup"); @@ -30,7 +27,7 @@ static int __init vfio_virqfd_init(void) return 0; } -static void __exit vfio_virqfd_exit(void) +void vfio_virqfd_exit(void) { destroy_workqueue(vfio_irqfd_cleanup_wq); } @@ -216,11 +213,3 @@ void vfio_virqfd_disable(struct virqfd **pvirqfd) flush_workqueue(vfio_irqfd_cleanup_wq); } EXPORT_SYMBOL_GPL(vfio_virqfd_disable); - -module_init(vfio_virqfd_init); -module_exit(vfio_virqfd_exit); - -MODULE_VERSION(DRIVER_VERSION); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR(DRIVER_AUTHOR); -MODULE_DESCRIPTION(DRIVER_DESC); |