Diffstat (limited to 'drivers/vfio')
-rw-r--r--  drivers/vfio/cdx/Makefile | 2
-rw-r--r--  drivers/vfio/cdx/intr.c | 217
-rw-r--r--  drivers/vfio/cdx/main.c | 65
-rw-r--r--  drivers/vfio/cdx/private.h | 18
-rw-r--r--  drivers/vfio/device_cdev.c | 7
-rw-r--r--  drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c | 11
-rw-r--r--  drivers/vfio/group.c | 17
-rw-r--r--  drivers/vfio/iommufd.c | 4
-rw-r--r--  drivers/vfio/mdev/mdev_core.c | 4
-rw-r--r--  drivers/vfio/mdev/mdev_driver.c | 4
-rw-r--r--  drivers/vfio/mdev/mdev_private.h | 5
-rw-r--r--  drivers/vfio/mdev/mdev_sysfs.c | 2
-rw-r--r--  drivers/vfio/pci/Kconfig | 4
-rw-r--r--  drivers/vfio/pci/Makefile | 4
-rw-r--r--  drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 316
-rw-r--r--  drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 25
-rw-r--r--  drivers/vfio/pci/mlx5/cmd.c | 218
-rw-r--r--  drivers/vfio/pci/mlx5/cmd.h | 11
-rw-r--r--  drivers/vfio/pci/mlx5/main.c | 187
-rw-r--r--  drivers/vfio/pci/nvgrace-gpu/Kconfig | 10
-rw-r--r--  drivers/vfio/pci/nvgrace-gpu/Makefile | 3
-rw-r--r--  drivers/vfio/pci/nvgrace-gpu/main.c | 1015
-rw-r--r--  drivers/vfio/pci/pds/dirty.c | 7
-rw-r--r--  drivers/vfio/pci/pds/lm.c | 15
-rw-r--r--  drivers/vfio/pci/pds/lm.h | 1
-rw-r--r--  drivers/vfio/pci/pds/pci_drv.c | 29
-rw-r--r--  drivers/vfio/pci/pds/vfio_dev.c | 45
-rw-r--r--  drivers/vfio/pci/pds/vfio_dev.h | 8
-rw-r--r--  drivers/vfio/pci/qat/Kconfig | 12
-rw-r--r--  drivers/vfio/pci/qat/Makefile | 3
-rw-r--r--  drivers/vfio/pci/qat/main.c | 700
-rw-r--r--  drivers/vfio/pci/vfio_pci.c | 2
-rw-r--r--  drivers/vfio/pci/vfio_pci_config.c | 71
-rw-r--r--  drivers/vfio/pci/vfio_pci_core.c | 452
-rw-r--r--  drivers/vfio/pci/vfio_pci_intrs.c | 225
-rw-r--r--  drivers/vfio/pci/vfio_pci_rdwr.c | 164
-rw-r--r--  drivers/vfio/pci/virtio/Kconfig | 42
-rw-r--r--  drivers/vfio/pci/virtio/Makefile | 3
-rw-r--r--  drivers/vfio/pci/virtio/common.h | 127
-rw-r--r--  drivers/vfio/pci/virtio/legacy_io.c | 418
-rw-r--r--  drivers/vfio/pci/virtio/main.c | 496
-rw-r--r--  drivers/vfio/pci/virtio/migrate.c | 1337
-rw-r--r--  drivers/vfio/platform/vfio_amba.c | 7
-rw-r--r--  drivers/vfio/platform/vfio_platform.c | 3
-rw-r--r--  drivers/vfio/platform/vfio_platform_common.c | 10
-rw-r--r--  drivers/vfio/platform/vfio_platform_irq.c | 105
-rw-r--r--  drivers/vfio/vfio.h | 2
-rw-r--r--  drivers/vfio/vfio_iommu_spapr_tce.c | 13
-rw-r--r--  drivers/vfio/vfio_iommu_type1.c | 45
-rw-r--r--  drivers/vfio/vfio_main.c | 50
-rw-r--r--  drivers/vfio/virqfd.c | 41
51 files changed, 5246 insertions, 1336 deletions
diff --git a/drivers/vfio/cdx/Makefile b/drivers/vfio/cdx/Makefile
index cd4a2e6fe609..df92b320122a 100644
--- a/drivers/vfio/cdx/Makefile
+++ b/drivers/vfio/cdx/Makefile
@@ -5,4 +5,4 @@
obj-$(CONFIG_VFIO_CDX) += vfio-cdx.o
-vfio-cdx-objs := main.o
+vfio-cdx-objs := main.o intr.o
diff --git a/drivers/vfio/cdx/intr.c b/drivers/vfio/cdx/intr.c
new file mode 100644
index 000000000000..986fa2a45fa4
--- /dev/null
+++ b/drivers/vfio/cdx/intr.c
@@ -0,0 +1,217 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
+ */
+
+#include <linux/vfio.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/eventfd.h>
+#include <linux/msi.h>
+#include <linux/interrupt.h>
+
+#include "linux/cdx/cdx_bus.h"
+#include "private.h"
+
+static irqreturn_t vfio_cdx_msihandler(int irq_no, void *arg)
+{
+ struct eventfd_ctx *trigger = arg;
+
+ eventfd_signal(trigger);
+ return IRQ_HANDLED;
+}
+
+static int vfio_cdx_msi_enable(struct vfio_cdx_device *vdev, int nvec)
+{
+ struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev);
+ struct device *dev = vdev->vdev.dev;
+ int msi_idx, ret;
+
+ vdev->cdx_irqs = kcalloc(nvec, sizeof(struct vfio_cdx_irq), GFP_KERNEL);
+ if (!vdev->cdx_irqs)
+ return -ENOMEM;
+
+ ret = cdx_enable_msi(cdx_dev);
+ if (ret) {
+ kfree(vdev->cdx_irqs);
+ return ret;
+ }
+
+ /* Allocate cdx MSIs */
+ ret = msi_domain_alloc_irqs(dev, MSI_DEFAULT_DOMAIN, nvec);
+ if (ret) {
+ cdx_disable_msi(cdx_dev);
+ kfree(vdev->cdx_irqs);
+ return ret;
+ }
+
+ for (msi_idx = 0; msi_idx < nvec; msi_idx++)
+ vdev->cdx_irqs[msi_idx].irq_no = msi_get_virq(dev, msi_idx);
+
+ vdev->msi_count = nvec;
+ vdev->config_msi = 1;
+
+ return 0;
+}
+
+static int vfio_cdx_msi_set_vector_signal(struct vfio_cdx_device *vdev,
+ int vector, int fd)
+{
+ struct eventfd_ctx *trigger;
+ int irq_no, ret;
+
+ if (vector < 0 || vector >= vdev->msi_count)
+ return -EINVAL;
+
+ irq_no = vdev->cdx_irqs[vector].irq_no;
+
+ if (vdev->cdx_irqs[vector].trigger) {
+ free_irq(irq_no, vdev->cdx_irqs[vector].trigger);
+ kfree(vdev->cdx_irqs[vector].name);
+ eventfd_ctx_put(vdev->cdx_irqs[vector].trigger);
+ vdev->cdx_irqs[vector].trigger = NULL;
+ }
+
+ if (fd < 0)
+ return 0;
+
+ vdev->cdx_irqs[vector].name = kasprintf(GFP_KERNEL, "vfio-msi[%d](%s)",
+ vector, dev_name(vdev->vdev.dev));
+ if (!vdev->cdx_irqs[vector].name)
+ return -ENOMEM;
+
+ trigger = eventfd_ctx_fdget(fd);
+ if (IS_ERR(trigger)) {
+ kfree(vdev->cdx_irqs[vector].name);
+ return PTR_ERR(trigger);
+ }
+
+ ret = request_irq(irq_no, vfio_cdx_msihandler, 0,
+ vdev->cdx_irqs[vector].name, trigger);
+ if (ret) {
+ kfree(vdev->cdx_irqs[vector].name);
+ eventfd_ctx_put(trigger);
+ return ret;
+ }
+
+ vdev->cdx_irqs[vector].trigger = trigger;
+
+ return 0;
+}
+
+static int vfio_cdx_msi_set_block(struct vfio_cdx_device *vdev,
+ unsigned int start, unsigned int count,
+ int32_t *fds)
+{
+ int i, j, ret = 0;
+
+ if (start >= vdev->msi_count || start + count > vdev->msi_count)
+ return -EINVAL;
+
+ for (i = 0, j = start; i < count && !ret; i++, j++) {
+ int fd = fds ? fds[i] : -1;
+
+ ret = vfio_cdx_msi_set_vector_signal(vdev, j, fd);
+ }
+
+ if (ret) {
+ for (--j; j >= (int)start; j--)
+ vfio_cdx_msi_set_vector_signal(vdev, j, -1);
+ }
+
+ return ret;
+}
+
+static void vfio_cdx_msi_disable(struct vfio_cdx_device *vdev)
+{
+ struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev);
+ struct device *dev = vdev->vdev.dev;
+
+ vfio_cdx_msi_set_block(vdev, 0, vdev->msi_count, NULL);
+
+ if (!vdev->config_msi)
+ return;
+
+ msi_domain_free_irqs_all(dev, MSI_DEFAULT_DOMAIN);
+ cdx_disable_msi(cdx_dev);
+ kfree(vdev->cdx_irqs);
+
+ vdev->cdx_irqs = NULL;
+ vdev->msi_count = 0;
+ vdev->config_msi = 0;
+}
+
+static int vfio_cdx_set_msi_trigger(struct vfio_cdx_device *vdev,
+ unsigned int index, unsigned int start,
+ unsigned int count, u32 flags,
+ void *data)
+{
+ struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev);
+ int i;
+
+ if (start + count > cdx_dev->num_msi)
+ return -EINVAL;
+
+ if (!count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
+ vfio_cdx_msi_disable(vdev);
+ return 0;
+ }
+
+ if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+ s32 *fds = data;
+ int ret;
+
+ if (vdev->config_msi)
+ return vfio_cdx_msi_set_block(vdev, start, count,
+ fds);
+ ret = vfio_cdx_msi_enable(vdev, cdx_dev->num_msi);
+ if (ret)
+ return ret;
+
+ ret = vfio_cdx_msi_set_block(vdev, start, count, fds);
+ if (ret)
+ vfio_cdx_msi_disable(vdev);
+
+ return ret;
+ }
+
+ for (i = start; i < start + count; i++) {
+ if (!vdev->cdx_irqs[i].trigger)
+ continue;
+ if (flags & VFIO_IRQ_SET_DATA_NONE) {
+ eventfd_signal(vdev->cdx_irqs[i].trigger);
+ } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+ u8 *bools = data;
+
+ if (bools[i - start])
+ eventfd_signal(vdev->cdx_irqs[i].trigger);
+ }
+ }
+
+ return 0;
+}
+
+int vfio_cdx_set_irqs_ioctl(struct vfio_cdx_device *vdev,
+ u32 flags, unsigned int index,
+ unsigned int start, unsigned int count,
+ void *data)
+{
+ if (flags & VFIO_IRQ_SET_ACTION_TRIGGER)
+ return vfio_cdx_set_msi_trigger(vdev, index, start,
+ count, flags, data);
+ else
+ return -EINVAL;
+}
+
+/* Free All IRQs for the given device */
+void vfio_cdx_irqs_cleanup(struct vfio_cdx_device *vdev)
+{
+ /*
+ * Device does not support any interrupt or the interrupts
+ * were not configured
+ */
+ if (!vdev->cdx_irqs)
+ return;
+
+ vfio_cdx_set_msi_trigger(vdev, 0, 0, 0, VFIO_IRQ_SET_DATA_NONE, NULL);
+}
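Once a vector's eventfd is installed, vfio_cdx_msihandler() simply signals it from hard-irq context, so userspace observes interrupts as counter increments on that eventfd. A minimal consumption-side sketch under that assumption (the fd is the eventfd registered through VFIO_DEVICE_SET_IRQS; see the setup sketch after the main.c hunks below):

#include <poll.h>
#include <stdint.h>
#include <unistd.h>

/* Illustrative only: block until the MSI fires, then consume the count. */
static uint64_t example_wait_for_msi(int event_fd)
{
	struct pollfd pfd = { .fd = event_fd, .events = POLLIN };
	uint64_t count = 0;

	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
		read(event_fd, &count, sizeof(count));	/* reading resets the counter */

	return count;
}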
diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c
index 9cff8d75789e..5dd5f5ad7686 100644
--- a/drivers/vfio/cdx/main.c
+++ b/drivers/vfio/cdx/main.c
@@ -61,6 +61,7 @@ static void vfio_cdx_close_device(struct vfio_device *core_vdev)
kfree(vdev->regions);
cdx_dev_reset(core_vdev->dev);
+ vfio_cdx_irqs_cleanup(vdev);
}
static int vfio_cdx_bm_ctrl(struct vfio_device *core_vdev, u32 flags,
@@ -123,7 +124,7 @@ static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev,
info.flags |= VFIO_DEVICE_FLAGS_RESET;
info.num_regions = cdx_dev->res_count;
- info.num_irqs = 0;
+ info.num_irqs = cdx_dev->num_msi ? 1 : 0;
return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
}
@@ -152,6 +153,62 @@ static int vfio_cdx_ioctl_get_region_info(struct vfio_cdx_device *vdev,
return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
}
+static int vfio_cdx_ioctl_get_irq_info(struct vfio_cdx_device *vdev,
+ struct vfio_irq_info __user *arg)
+{
+ unsigned long minsz = offsetofend(struct vfio_irq_info, count);
+ struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev);
+ struct vfio_irq_info info;
+
+ if (copy_from_user(&info, arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ if (info.index >= 1)
+ return -EINVAL;
+
+ if (!cdx_dev->num_msi)
+ return -EINVAL;
+
+ info.flags = VFIO_IRQ_INFO_EVENTFD | VFIO_IRQ_INFO_NORESIZE;
+ info.count = cdx_dev->num_msi;
+
+ return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
+}
+
+static int vfio_cdx_ioctl_set_irqs(struct vfio_cdx_device *vdev,
+ struct vfio_irq_set __user *arg)
+{
+ unsigned long minsz = offsetofend(struct vfio_irq_set, count);
+ struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev);
+ struct vfio_irq_set hdr;
+ size_t data_size = 0;
+ u8 *data = NULL;
+ int ret = 0;
+
+ if (copy_from_user(&hdr, arg, minsz))
+ return -EFAULT;
+
+ ret = vfio_set_irqs_validate_and_prepare(&hdr, cdx_dev->num_msi,
+ 1, &data_size);
+ if (ret)
+ return ret;
+
+ if (data_size) {
+ data = memdup_user(arg->data, data_size);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+ }
+
+ ret = vfio_cdx_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
+ hdr.start, hdr.count, data);
+ kfree(data);
+
+ return ret;
+}
+
static long vfio_cdx_ioctl(struct vfio_device *core_vdev,
unsigned int cmd, unsigned long arg)
{
@@ -164,6 +221,10 @@ static long vfio_cdx_ioctl(struct vfio_device *core_vdev,
return vfio_cdx_ioctl_get_info(vdev, uarg);
case VFIO_DEVICE_GET_REGION_INFO:
return vfio_cdx_ioctl_get_region_info(vdev, uarg);
+ case VFIO_DEVICE_GET_IRQ_INFO:
+ return vfio_cdx_ioctl_get_irq_info(vdev, uarg);
+ case VFIO_DEVICE_SET_IRQS:
+ return vfio_cdx_ioctl_set_irqs(vdev, uarg);
case VFIO_DEVICE_RESET:
return cdx_dev_reset(core_vdev->dev);
default:
@@ -286,4 +347,4 @@ module_driver(vfio_cdx_driver, cdx_driver_register, cdx_driver_unregister);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("VFIO for CDX devices - User Level meta-driver");
-MODULE_IMPORT_NS(CDX_BUS);
+MODULE_IMPORT_NS("CDX_BUS");
diff --git a/drivers/vfio/cdx/private.h b/drivers/vfio/cdx/private.h
index 8e9d25913728..dc56729b3114 100644
--- a/drivers/vfio/cdx/private.h
+++ b/drivers/vfio/cdx/private.h
@@ -13,6 +13,14 @@ static inline u64 vfio_cdx_index_to_offset(u32 index)
return ((u64)(index) << VFIO_CDX_OFFSET_SHIFT);
}
+struct vfio_cdx_irq {
+ u32 flags;
+ u32 count;
+ int irq_no;
+ struct eventfd_ctx *trigger;
+ char *name;
+};
+
struct vfio_cdx_region {
u32 flags;
u32 type;
@@ -23,8 +31,18 @@ struct vfio_cdx_region {
struct vfio_cdx_device {
struct vfio_device vdev;
struct vfio_cdx_region *regions;
+ struct vfio_cdx_irq *cdx_irqs;
u32 flags;
#define BME_SUPPORT BIT(0)
+ u32 msi_count;
+ u8 config_msi;
};
+int vfio_cdx_set_irqs_ioctl(struct vfio_cdx_device *vdev,
+ u32 flags, unsigned int index,
+ unsigned int start, unsigned int count,
+ void *data);
+
+void vfio_cdx_irqs_cleanup(struct vfio_cdx_device *vdev);
+
#endif /* VFIO_CDX_PRIVATE_H */
diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
index e75da0a70d1f..bb1817bd4ff3 100644
--- a/drivers/vfio/device_cdev.c
+++ b/drivers/vfio/device_cdev.c
@@ -39,6 +39,13 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
filep->private_data = df;
+ /*
+ * Use the pseudo fs inode on the device to link all mmaps
+ * to the same address space, allowing us to unmap all vmas
+ * associated to this device using unmap_mapping_range().
+ */
+ filep->f_mapping = device->inode->i_mapping;
+
return 0;
err_put_registration:
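Sharing the device inode's i_mapping across every open file is what allows all userspace mappings of the device to be torn down in one call later on. A minimal sketch of that use, assuming a vfio_device carrying the 'inode' member referenced above (the helper name is illustrative, not part of the patch):

#include <linux/mm.h>
#include <linux/vfio.h>

/* Illustrative only: revoke every existing userspace mapping of the device. */
static void example_zap_device_mappings(struct vfio_device *device)
{
	/* holebegin = 0, holelen = 0 covers the whole address_space */
	unmap_mapping_range(device->inode->i_mapping, 0, 0, true);
}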
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
index d62fbfff20b8..7e7988c4258f 100644
--- a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
+++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
@@ -108,10 +108,10 @@ static int vfio_fsl_mc_set_irq_trigger(struct vfio_fsl_mc_device *vdev,
void *data)
{
struct fsl_mc_device *mc_dev = vdev->mc_dev;
- int ret, hwirq;
struct vfio_fsl_mc_irq *irq;
struct device *cont_dev = fsl_mc_cont_dev(&mc_dev->dev);
struct fsl_mc_device *mc_cont = to_fsl_mc_device(cont_dev);
+ int ret;
if (!count && (flags & VFIO_IRQ_SET_DATA_NONE))
return vfio_set_trigger(vdev, index, -1);
@@ -136,18 +136,17 @@ static int vfio_fsl_mc_set_irq_trigger(struct vfio_fsl_mc_device *vdev,
return vfio_set_trigger(vdev, index, fd);
}
- hwirq = vdev->mc_dev->irqs[index]->virq;
-
irq = &vdev->mc_irqs[index];
if (flags & VFIO_IRQ_SET_DATA_NONE) {
- vfio_fsl_mc_irq_handler(hwirq, irq);
+ if (irq->trigger)
+ eventfd_signal(irq->trigger);
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
u8 trigger = *(u8 *)data;
- if (trigger)
- vfio_fsl_mc_irq_handler(hwirq, irq);
+ if (trigger && irq->trigger)
+ eventfd_signal(irq->trigger);
}
return 0;
diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
index 610a429c6191..49559605177e 100644
--- a/drivers/vfio/group.c
+++ b/drivers/vfio/group.c
@@ -104,15 +104,14 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group,
{
struct vfio_container *container;
struct iommufd_ctx *iommufd;
- struct fd f;
int ret;
int fd;
if (get_user(fd, arg))
return -EFAULT;
- f = fdget(fd);
- if (!f.file)
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return -EBADF;
mutex_lock(&group->group_lock);
@@ -125,13 +124,13 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group,
goto out_unlock;
}
- container = vfio_container_from_file(f.file);
+ container = vfio_container_from_file(fd_file(f));
if (container) {
ret = vfio_container_attach_group(container, group);
goto out_unlock;
}
- iommufd = iommufd_ctx_from_file(f.file);
+ iommufd = iommufd_ctx_from_file(fd_file(f));
if (!IS_ERR(iommufd)) {
if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) &&
group->type == VFIO_NO_IOMMU)
@@ -153,7 +152,6 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group,
out_unlock:
mutex_unlock(&group->group_lock);
- fdput(f);
return ret;
}
@@ -286,6 +284,13 @@ static struct file *vfio_device_open_file(struct vfio_device *device)
*/
filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE);
+ /*
+ * Use the pseudo fs inode on the device to link all mmaps
+ * to the same address space, allowing us to unmap all vmas
+ * associated to this device using unmap_mapping_range().
+ */
+ filep->f_mapping = device->inode->i_mapping;
+
if (device->group->type == VFIO_NO_IOMMU)
dev_warn(device->dev, "vfio-noiommu device opened by user "
"(%s:%d)\n", current->comm, task_pid_nr(current));
diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
index 82eba6966fa5..516294fd901b 100644
--- a/drivers/vfio/iommufd.c
+++ b/drivers/vfio/iommufd.c
@@ -7,8 +7,8 @@
#include "vfio.h"
-MODULE_IMPORT_NS(IOMMUFD);
-MODULE_IMPORT_NS(IOMMUFD_VFIO);
+MODULE_IMPORT_NS("IOMMUFD");
+MODULE_IMPORT_NS("IOMMUFD_VFIO");
bool vfio_iommufd_device_has_compat_ioas(struct vfio_device *vdev,
struct iommufd_ctx *ictx)
diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index ed4737de4528..f2e686f8f1ef 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -76,7 +76,7 @@ int mdev_register_parent(struct mdev_parent *parent, struct device *dev,
if (ret)
return ret;
- ret = class_compat_create_link(mdev_bus_compat_class, dev, NULL);
+ ret = class_compat_create_link(mdev_bus_compat_class, dev);
if (ret)
dev_warn(dev, "Failed to create compatibility class link\n");
@@ -98,7 +98,7 @@ void mdev_unregister_parent(struct mdev_parent *parent)
dev_info(parent->dev, "MDEV: Unregistering\n");
down_write(&parent->unreg_sem);
- class_compat_remove_link(mdev_bus_compat_class, parent->dev, NULL);
+ class_compat_remove_link(mdev_bus_compat_class, parent->dev);
device_for_each_child(parent->dev, NULL, mdev_device_remove_cb);
parent_remove_sysfs_files(parent);
up_write(&parent->unreg_sem);
diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c
index 7825d83a55f8..ad5b834806ff 100644
--- a/drivers/vfio/mdev/mdev_driver.c
+++ b/drivers/vfio/mdev/mdev_driver.c
@@ -31,7 +31,7 @@ static void mdev_remove(struct device *dev)
drv->remove(to_mdev_device(dev));
}
-static int mdev_match(struct device *dev, struct device_driver *drv)
+static int mdev_match(struct device *dev, const struct device_driver *drv)
{
/*
* No drivers automatically match. Drivers are only bound by explicit
@@ -40,7 +40,7 @@ static int mdev_match(struct device *dev, struct device_driver *drv)
return 0;
}
-struct bus_type mdev_bus_type = {
+const struct bus_type mdev_bus_type = {
.name = "mdev",
.probe = mdev_probe,
.remove = mdev_remove,
diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
index af457b27f607..5f61acd0fe42 100644
--- a/drivers/vfio/mdev/mdev_private.h
+++ b/drivers/vfio/mdev/mdev_private.h
@@ -10,10 +10,7 @@
#ifndef MDEV_PRIVATE_H
#define MDEV_PRIVATE_H
-int mdev_bus_register(void);
-void mdev_bus_unregister(void);
-
-extern struct bus_type mdev_bus_type;
+extern const struct bus_type mdev_bus_type;
extern const struct attribute_group *mdev_device_groups[];
#define to_mdev_type_attr(_attr) \
diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c
index 9d2738e10c0b..e44bb44c581e 100644
--- a/drivers/vfio/mdev/mdev_sysfs.c
+++ b/drivers/vfio/mdev/mdev_sysfs.c
@@ -160,7 +160,7 @@ static void mdev_type_release(struct kobject *kobj)
put_device(type->parent->dev);
}
-static struct kobj_type mdev_type_ktype = {
+static const struct kobj_type mdev_type_ktype = {
.sysfs_ops = &mdev_type_sysfs_ops,
.release = mdev_type_release,
.default_groups = mdev_type_groups,
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 18c397df566d..bf50ffa10bde 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -67,4 +67,8 @@ source "drivers/vfio/pci/pds/Kconfig"
source "drivers/vfio/pci/virtio/Kconfig"
+source "drivers/vfio/pci/nvgrace-gpu/Kconfig"
+
+source "drivers/vfio/pci/qat/Kconfig"
+
endmenu
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 046139a4eca5..cf00c0a7e55c 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -15,3 +15,7 @@ obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/
obj-$(CONFIG_PDS_VFIO_PCI) += pds/
obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/
+
+obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu/
+
+obj-$(CONFIG_QAT_VFIO_PCI) += qat/
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index 4d27465c8f1a..451c639299eb 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -486,31 +486,11 @@ static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
return 0;
}
-static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev,
- struct hisi_acc_vf_migration_file *migf)
+static int vf_qm_read_data(struct hisi_qm *vf_qm, struct acc_vf_data *vf_data)
{
- struct acc_vf_data *vf_data = &migf->vf_data;
- struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
struct device *dev = &vf_qm->pdev->dev;
int ret;
- if (unlikely(qm_wait_dev_not_ready(vf_qm))) {
- /* Update state and return with match data */
- vf_data->vf_qm_state = QM_NOT_READY;
- hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
- migf->total_length = QM_MATCH_SIZE;
- return 0;
- }
-
- vf_data->vf_qm_state = QM_READY;
- hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
-
- ret = vf_qm_cache_wb(vf_qm);
- if (ret) {
- dev_err(dev, "failed to writeback QM Cache!\n");
- return ret;
- }
-
ret = qm_get_regs(vf_qm, vf_data);
if (ret)
return -EINVAL;
@@ -536,6 +516,38 @@ static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev,
return -EINVAL;
}
+ return 0;
+}
+
+static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+ struct hisi_acc_vf_migration_file *migf)
+{
+ struct acc_vf_data *vf_data = &migf->vf_data;
+ struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+ struct device *dev = &vf_qm->pdev->dev;
+ int ret;
+
+ if (unlikely(qm_wait_dev_not_ready(vf_qm))) {
+ /* Update state and return with match data */
+ vf_data->vf_qm_state = QM_NOT_READY;
+ hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
+ migf->total_length = QM_MATCH_SIZE;
+ return 0;
+ }
+
+ vf_data->vf_qm_state = QM_READY;
+ hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
+
+ ret = vf_qm_cache_wb(vf_qm);
+ if (ret) {
+ dev_err(dev, "failed to writeback QM Cache!\n");
+ return ret;
+ }
+
+ ret = vf_qm_read_data(vf_qm, vf_data);
+ if (ret)
+ return -EINVAL;
+
migf->total_length = sizeof(struct acc_vf_data);
return 0;
}
@@ -615,40 +627,48 @@ static void hisi_acc_vf_disable_fd(struct hisi_acc_vf_migration_file *migf)
mutex_unlock(&migf->lock);
}
+static void
+hisi_acc_debug_migf_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+ struct hisi_acc_vf_migration_file *src_migf)
+{
+ struct hisi_acc_vf_migration_file *dst_migf = hisi_acc_vdev->debug_migf;
+
+ if (!dst_migf)
+ return;
+
+ dst_migf->total_length = src_migf->total_length;
+ memcpy(&dst_migf->vf_data, &src_migf->vf_data,
+ sizeof(struct acc_vf_data));
+}
+
static void hisi_acc_vf_disable_fds(struct hisi_acc_vf_core_device *hisi_acc_vdev)
{
if (hisi_acc_vdev->resuming_migf) {
+ hisi_acc_debug_migf_copy(hisi_acc_vdev, hisi_acc_vdev->resuming_migf);
hisi_acc_vf_disable_fd(hisi_acc_vdev->resuming_migf);
fput(hisi_acc_vdev->resuming_migf->filp);
hisi_acc_vdev->resuming_migf = NULL;
}
if (hisi_acc_vdev->saving_migf) {
+ hisi_acc_debug_migf_copy(hisi_acc_vdev, hisi_acc_vdev->saving_migf);
hisi_acc_vf_disable_fd(hisi_acc_vdev->saving_migf);
fput(hisi_acc_vdev->saving_migf->filp);
hisi_acc_vdev->saving_migf = NULL;
}
}
-/*
- * This function is called in all state_mutex unlock cases to
- * handle a 'deferred_reset' if exists.
- */
-static void
-hisi_acc_vf_state_mutex_unlock(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+static struct hisi_acc_vf_core_device *hisi_acc_get_vf_dev(struct vfio_device *vdev)
{
-again:
- spin_lock(&hisi_acc_vdev->reset_lock);
- if (hisi_acc_vdev->deferred_reset) {
- hisi_acc_vdev->deferred_reset = false;
- spin_unlock(&hisi_acc_vdev->reset_lock);
- hisi_acc_vdev->vf_qm_state = QM_NOT_READY;
- hisi_acc_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
- hisi_acc_vf_disable_fds(hisi_acc_vdev);
- goto again;
- }
- mutex_unlock(&hisi_acc_vdev->state_mutex);
- spin_unlock(&hisi_acc_vdev->reset_lock);
+ return container_of(vdev, struct hisi_acc_vf_core_device,
+ core_device.vdev);
+}
+
+static void hisi_acc_vf_reset(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+ hisi_acc_vdev->vf_qm_state = QM_NOT_READY;
+ hisi_acc_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+ hisi_acc_vf_disable_fds(hisi_acc_vdev);
}
static void hisi_acc_vf_start_device(struct hisi_acc_vf_core_device *hisi_acc_vdev)
@@ -737,7 +757,6 @@ static const struct file_operations hisi_acc_vf_resume_fops = {
.owner = THIS_MODULE,
.write = hisi_acc_vf_resume_write,
.release = hisi_acc_vf_release_file,
- .llseek = no_llseek,
};
static struct hisi_acc_vf_migration_file *
@@ -804,8 +823,10 @@ static long hisi_acc_vf_precopy_ioctl(struct file *filp,
info.dirty_bytes = 0;
info.initial_bytes = migf->total_length - *pos;
+ mutex_unlock(&migf->lock);
+ mutex_unlock(&hisi_acc_vdev->state_mutex);
- ret = copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
+ return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
out:
mutex_unlock(&migf->lock);
mutex_unlock(&hisi_acc_vdev->state_mutex);
@@ -857,7 +878,6 @@ static const struct file_operations hisi_acc_vf_save_fops = {
.unlocked_ioctl = hisi_acc_vf_precopy_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.release = hisi_acc_vf_release_file,
- .llseek = no_llseek,
};
static struct hisi_acc_vf_migration_file *
@@ -1045,8 +1065,7 @@ static struct file *
hisi_acc_vfio_pci_set_device_state(struct vfio_device *vdev,
enum vfio_device_mig_state new_state)
{
- struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(vdev,
- struct hisi_acc_vf_core_device, core_device.vdev);
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev);
enum vfio_device_mig_state next_state;
struct file *res = NULL;
int ret;
@@ -1071,7 +1090,7 @@ hisi_acc_vfio_pci_set_device_state(struct vfio_device *vdev,
break;
}
}
- hisi_acc_vf_state_mutex_unlock(hisi_acc_vdev);
+ mutex_unlock(&hisi_acc_vdev->state_mutex);
return res;
}
@@ -1087,12 +1106,11 @@ static int
hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev,
enum vfio_device_mig_state *curr_state)
{
- struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(vdev,
- struct hisi_acc_vf_core_device, core_device.vdev);
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev);
mutex_lock(&hisi_acc_vdev->state_mutex);
*curr_state = hisi_acc_vdev->mig_state;
- hisi_acc_vf_state_mutex_unlock(hisi_acc_vdev);
+ mutex_unlock(&hisi_acc_vdev->state_mutex);
return 0;
}
@@ -1104,21 +1122,9 @@ static void hisi_acc_vf_pci_aer_reset_done(struct pci_dev *pdev)
VFIO_MIGRATION_STOP_COPY)
return;
- /*
- * As the higher VFIO layers are holding locks across reset and using
- * those same locks with the mm_lock we need to prevent ABBA deadlock
- * with the state_mutex and mm_lock.
- * In case the state_mutex was taken already we defer the cleanup work
- * to the unlock flow of the other running context.
- */
- spin_lock(&hisi_acc_vdev->reset_lock);
- hisi_acc_vdev->deferred_reset = true;
- if (!mutex_trylock(&hisi_acc_vdev->state_mutex)) {
- spin_unlock(&hisi_acc_vdev->reset_lock);
- return;
- }
- spin_unlock(&hisi_acc_vdev->reset_lock);
- hisi_acc_vf_state_mutex_unlock(hisi_acc_vdev);
+ mutex_lock(&hisi_acc_vdev->state_mutex);
+ hisi_acc_vf_reset(hisi_acc_vdev);
+ mutex_unlock(&hisi_acc_vdev->state_mutex);
}
static int hisi_acc_vf_qm_init(struct hisi_acc_vf_core_device *hisi_acc_vdev)
@@ -1302,10 +1308,132 @@ static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int
return vfio_pci_core_ioctl(core_vdev, cmd, arg);
}
+static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev)
+{
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev);
+ struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+ int ret;
+
+ lockdep_assert_held(&hisi_acc_vdev->open_mutex);
+ /*
+ * When the device is not opened, the io_base is not mapped.
+ * The driver cannot perform device read and write operations.
+ */
+ if (!hisi_acc_vdev->dev_opened) {
+ seq_puts(seq, "device not opened!\n");
+ return -EINVAL;
+ }
+
+ ret = qm_wait_dev_not_ready(vf_qm);
+ if (ret) {
+ seq_puts(seq, "VF device not ready!\n");
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static int hisi_acc_vf_debug_cmd(struct seq_file *seq, void *data)
+{
+ struct device *vf_dev = seq->private;
+ struct vfio_pci_core_device *core_device = dev_get_drvdata(vf_dev);
+ struct vfio_device *vdev = &core_device->vdev;
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev);
+ struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+ u64 value;
+ int ret;
+
+ mutex_lock(&hisi_acc_vdev->open_mutex);
+ ret = hisi_acc_vf_debug_check(seq, vdev);
+ if (ret) {
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
+ return ret;
+ }
+
+ value = readl(vf_qm->io_base + QM_MB_CMD_SEND_BASE);
+ if (value == QM_MB_CMD_NOT_READY) {
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
+ seq_puts(seq, "mailbox cmd channel not ready!\n");
+ return -EINVAL;
+ }
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
+ seq_puts(seq, "mailbox cmd channel ready!\n");
+
+ return 0;
+}
+
+static int hisi_acc_vf_dev_read(struct seq_file *seq, void *data)
+{
+ struct device *vf_dev = seq->private;
+ struct vfio_pci_core_device *core_device = dev_get_drvdata(vf_dev);
+ struct vfio_device *vdev = &core_device->vdev;
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev);
+ size_t vf_data_sz = offsetofend(struct acc_vf_data, padding);
+ struct acc_vf_data *vf_data;
+ int ret;
+
+ mutex_lock(&hisi_acc_vdev->open_mutex);
+ ret = hisi_acc_vf_debug_check(seq, vdev);
+ if (ret) {
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
+ return ret;
+ }
+
+ mutex_lock(&hisi_acc_vdev->state_mutex);
+ vf_data = kzalloc(sizeof(*vf_data), GFP_KERNEL);
+ if (!vf_data) {
+ ret = -ENOMEM;
+ goto mutex_release;
+ }
+
+ vf_data->vf_qm_state = hisi_acc_vdev->vf_qm_state;
+ ret = vf_qm_read_data(&hisi_acc_vdev->vf_qm, vf_data);
+ if (ret)
+ goto migf_err;
+
+ seq_hex_dump(seq, "Dev Data:", DUMP_PREFIX_OFFSET, 16, 1,
+ (const void *)vf_data, vf_data_sz, false);
+
+ seq_printf(seq,
+ "guest driver load: %u\n"
+ "data size: %lu\n",
+ hisi_acc_vdev->vf_qm_state,
+ sizeof(struct acc_vf_data));
+
+migf_err:
+ kfree(vf_data);
+mutex_release:
+ mutex_unlock(&hisi_acc_vdev->state_mutex);
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
+
+ return ret;
+}
+
+static int hisi_acc_vf_migf_read(struct seq_file *seq, void *data)
+{
+ struct device *vf_dev = seq->private;
+ struct vfio_pci_core_device *core_device = dev_get_drvdata(vf_dev);
+ struct vfio_device *vdev = &core_device->vdev;
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev);
+ size_t vf_data_sz = offsetofend(struct acc_vf_data, padding);
+ struct hisi_acc_vf_migration_file *debug_migf = hisi_acc_vdev->debug_migf;
+
+ /* Check whether the live migration operation has been performed */
+ if (debug_migf->total_length < QM_MATCH_SIZE) {
+ seq_puts(seq, "device not migrated!\n");
+ return -EAGAIN;
+ }
+
+ seq_hex_dump(seq, "Mig Data:", DUMP_PREFIX_OFFSET, 16, 1,
+ (const void *)&debug_migf->vf_data, vf_data_sz, false);
+ seq_printf(seq, "migrate data length: %lu\n", debug_migf->total_length);
+
+ return 0;
+}
+
static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev)
{
- struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev,
- struct hisi_acc_vf_core_device, core_device.vdev);
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(core_vdev);
struct vfio_pci_core_device *vdev = &hisi_acc_vdev->core_device;
int ret;
@@ -1314,12 +1442,16 @@ static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev)
return ret;
if (core_vdev->mig_ops) {
+ mutex_lock(&hisi_acc_vdev->open_mutex);
ret = hisi_acc_vf_qm_init(hisi_acc_vdev);
if (ret) {
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
vfio_pci_core_disable(vdev);
return ret;
}
hisi_acc_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+ hisi_acc_vdev->dev_opened = true;
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
}
vfio_pci_core_finish_enable(vdev);
@@ -1328,11 +1460,13 @@ static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev)
static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev)
{
- struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev,
- struct hisi_acc_vf_core_device, core_device.vdev);
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(core_vdev);
struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+ mutex_lock(&hisi_acc_vdev->open_mutex);
+ hisi_acc_vdev->dev_opened = false;
iounmap(vf_qm->io_base);
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
vfio_pci_core_close_device(core_vdev);
}
@@ -1344,8 +1478,7 @@ static const struct vfio_migration_ops hisi_acc_vfio_pci_migrn_state_ops = {
static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev)
{
- struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev,
- struct hisi_acc_vf_core_device, core_device.vdev);
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(core_vdev);
struct pci_dev *pdev = to_pci_dev(core_vdev->dev);
struct hisi_qm *pf_qm = hisi_acc_get_pf_qm(pdev);
@@ -1353,6 +1486,7 @@ static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev)
hisi_acc_vdev->pf_qm = pf_qm;
hisi_acc_vdev->vf_dev = pdev;
mutex_init(&hisi_acc_vdev->state_mutex);
+ mutex_init(&hisi_acc_vdev->open_mutex);
core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY;
core_vdev->mig_ops = &hisi_acc_vfio_pci_migrn_state_ops;
@@ -1398,6 +1532,47 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
.detach_ioas = vfio_iommufd_physical_detach_ioas,
};
+static void hisi_acc_vfio_debug_init(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+ struct vfio_device *vdev = &hisi_acc_vdev->core_device.vdev;
+ struct hisi_acc_vf_migration_file *migf;
+ struct dentry *vfio_dev_migration;
+ struct dentry *vfio_hisi_acc;
+ struct device *dev = vdev->dev;
+
+ if (!debugfs_initialized() ||
+ !IS_ENABLED(CONFIG_VFIO_DEBUGFS))
+ return;
+
+ if (vdev->ops != &hisi_acc_vfio_pci_migrn_ops)
+ return;
+
+ vfio_dev_migration = debugfs_lookup("migration", vdev->debug_root);
+ if (!vfio_dev_migration) {
+ dev_err(dev, "failed to lookup migration debugfs file!\n");
+ return;
+ }
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL);
+ if (!migf)
+ return;
+ hisi_acc_vdev->debug_migf = migf;
+
+ vfio_hisi_acc = debugfs_create_dir("hisi_acc", vfio_dev_migration);
+ debugfs_create_devm_seqfile(dev, "dev_data", vfio_hisi_acc,
+ hisi_acc_vf_dev_read);
+ debugfs_create_devm_seqfile(dev, "migf_data", vfio_hisi_acc,
+ hisi_acc_vf_migf_read);
+ debugfs_create_devm_seqfile(dev, "cmd_state", vfio_hisi_acc,
+ hisi_acc_vf_debug_cmd);
+}
+
+static void hisi_acc_vf_debugfs_exit(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+ kfree(hisi_acc_vdev->debug_migf);
+ hisi_acc_vdev->debug_migf = NULL;
+}
+
static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct hisi_acc_vf_core_device *hisi_acc_vdev;
@@ -1424,6 +1599,8 @@ static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device
ret = vfio_pci_core_register_device(&hisi_acc_vdev->core_device);
if (ret)
goto out_put_vdev;
+
+ hisi_acc_vfio_debug_init(hisi_acc_vdev);
return 0;
out_put_vdev:
@@ -1436,6 +1613,7 @@ static void hisi_acc_vfio_pci_remove(struct pci_dev *pdev)
struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_drvdata(pdev);
vfio_pci_core_unregister_device(&hisi_acc_vdev->core_device);
+ hisi_acc_vf_debugfs_exit(hisi_acc_vdev);
vfio_put_device(&hisi_acc_vdev->core_device.vdev);
}
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
index dcabfeec6ca1..245d7537b2bc 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
@@ -32,6 +32,7 @@
#define QM_SQC_VFT_BASE_MASK_V2 GENMASK(15, 0)
#define QM_SQC_VFT_NUM_SHIFT_V2 45
#define QM_SQC_VFT_NUM_MASK_V2 GENMASK(9, 0)
+#define QM_MB_CMD_NOT_READY 0xffffffff
/* RW regs */
#define QM_REGS_MAX_LEN 7
@@ -98,8 +99,15 @@ struct hisi_acc_vf_migration_file {
struct hisi_acc_vf_core_device {
struct vfio_pci_core_device core_device;
- u8 match_done:1;
- u8 deferred_reset:1;
+ u8 match_done;
+ /*
+ * io_base is only valid when dev_opened is true,
+ * which is protected by open_mutex.
+ */
+ bool dev_opened;
+ /* Ensure the accuracy of dev_opened operation */
+ struct mutex open_mutex;
+
/* For migration state */
struct mutex state_mutex;
enum vfio_device_mig_state mig_state;
@@ -107,11 +115,20 @@ struct hisi_acc_vf_core_device {
struct pci_dev *vf_dev;
struct hisi_qm *pf_qm;
struct hisi_qm vf_qm;
+ /*
+ * vf_qm_state represents the QM_VF_STATE register value.
+ * It is set by Guest driver for the ACC VF dev indicating
+ * the driver has loaded and configured the dev correctly.
+ */
u32 vf_qm_state;
int vf_id;
- /* For reset handler */
- spinlock_t reset_lock;
struct hisi_acc_vf_migration_file *resuming_migf;
struct hisi_acc_vf_migration_file *saving_migf;
+
+ /*
+ * It holds migration data corresponding to the last migration
+ * and is used by the debugfs interface to report it.
+ */
+ struct hisi_acc_vf_migration_file *debug_migf;
};
#endif /* HISI_ACC_VFIO_PCI_H */
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index efd1d252cdc9..11eda6b207f1 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -108,8 +108,9 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
if (ret)
return ret;
- if (mvdev->saving_migf->state ==
- MLX5_MIGF_STATE_PRE_COPY_ERROR) {
+ /* Upon cleanup, ignore previous pre_copy error state */
+ if (mvdev->saving_migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR &&
+ !(query_flags & MLX5VF_QUERY_CLEANUP)) {
/*
* In case we had a PRE_COPY error, only query full
* image for final image
@@ -121,6 +122,11 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
}
query_flags &= ~MLX5VF_QUERY_INC;
}
+ /* Block incremental query which is state-dependent */
+ if (mvdev->saving_migf->state == MLX5_MIGF_STATE_ERROR) {
+ complete(&mvdev->saving_migf->save_comp);
+ return -ENODEV;
+ }
}
MLX5_SET(query_vhca_migration_state_in, in, opcode,
@@ -149,6 +155,12 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
return 0;
}
+static void set_tracker_change_event(struct mlx5vf_pci_core_device *mvdev)
+{
+ mvdev->tracker.object_changed = true;
+ complete(&mvdev->tracker_comp);
+}
+
static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
{
/* Mark the tracker under an error and wake it up if it's running */
@@ -189,7 +201,7 @@ void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
/* Must be done outside the lock to let it progress */
set_tracker_error(mvdev);
mutex_lock(&mvdev->state_mutex);
- mlx5vf_disable_fds(mvdev);
+ mlx5vf_disable_fds(mvdev, NULL);
_mlx5vf_free_page_tracker_resources(mvdev);
mlx5vf_state_mutex_unlock(mvdev);
}
@@ -221,6 +233,10 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
if (!MLX5_CAP_GEN(mvdev->mdev, migration))
goto end;
+ if (!(MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
+ MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state)))
+ goto end;
+
mvdev->vf_id = pci_iov_vf_id(pdev);
if (mvdev->vf_id < 0)
goto end;
@@ -250,17 +266,14 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
mvdev->migrate_cap = 1;
mvdev->core_device.vdev.migration_flags =
VFIO_MIGRATION_STOP_COPY |
- VFIO_MIGRATION_P2P;
+ VFIO_MIGRATION_P2P |
+ VFIO_MIGRATION_PRE_COPY;
+
mvdev->core_device.vdev.mig_ops = mig_ops;
init_completion(&mvdev->tracker_comp);
if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
mvdev->core_device.vdev.log_ops = log_ops;
- if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
- MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
- mvdev->core_device.vdev.migration_flags |=
- VFIO_MIGRATION_PRE_COPY;
-
if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks))
mvdev->chunk_mode = 1;
@@ -395,13 +408,61 @@ void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
buf->dma_dir, 0);
}
- /* Undo alloc_pages_bulk_array() */
+ /* Undo alloc_pages_bulk() */
for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
__free_page(sg_page_iter_page(&sg_iter));
sg_free_append_table(&buf->table);
kfree(buf);
}
+static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
+ unsigned int npages)
+{
+ unsigned int to_alloc = npages;
+ struct page **page_list;
+ unsigned long filled;
+ unsigned int to_fill;
+ int ret;
+ int i;
+
+ to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
+ page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
+ if (!page_list)
+ return -ENOMEM;
+
+ do {
+ filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, to_fill,
+ page_list);
+ if (!filled) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ to_alloc -= filled;
+ ret = sg_alloc_append_table_from_pages(
+ &buf->table, page_list, filled, 0,
+ filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
+ GFP_KERNEL_ACCOUNT);
+
+ if (ret)
+ goto err_append;
+ buf->allocated_length += filled * PAGE_SIZE;
+ /* clean input for another bulk allocation */
+ memset(page_list, 0, filled * sizeof(*page_list));
+ to_fill = min_t(unsigned int, to_alloc,
+ PAGE_SIZE / sizeof(*page_list));
+ } while (to_alloc > 0);
+
+ kvfree(page_list);
+ return 0;
+
+err_append:
+ for (i = filled - 1; i >= 0; i--)
+ __free_page(page_list[i]);
+err:
+ kvfree(page_list);
+ return ret;
+}
+
struct mlx5_vhca_data_buffer *
mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
size_t length,
@@ -608,8 +669,13 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
err:
/* The error flow can't run from an interrupt context */
- if (status == -EREMOTEIO)
+ if (status == -EREMOTEIO) {
status = MLX5_GET(save_vhca_state_out, async_data->out, status);
+ /* Failed in FW, print cmd out failure details */
+ mlx5_cmd_out_err(migf->mvdev->mdev, MLX5_CMD_OP_SAVE_VHCA_STATE, 0,
+ async_data->out);
+ }
+
async_data->status = status;
queue_work(migf->mvdev->cb_wq, &async_data->work);
}
@@ -623,6 +689,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
struct mlx5_vhca_data_buffer *header_buf = NULL;
struct mlx5vf_async_data *async_data;
+ bool pre_copy_cleanup = false;
int err;
lockdep_assert_held(&mvdev->state_mutex);
@@ -633,6 +700,10 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
if (err)
return err;
+ if ((migf->state == MLX5_MIGF_STATE_PRE_COPY ||
+ migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) && !track && !inc)
+ pre_copy_cleanup = true;
+
if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
/*
* In case we had a PRE_COPY error, SAVE is triggered only for
@@ -651,29 +722,27 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
async_data = &migf->async_data;
async_data->buf = buf;
- async_data->stop_copy_chunk = !track;
+ async_data->stop_copy_chunk = (!track && !pre_copy_cleanup);
async_data->out = kvzalloc(out_size, GFP_KERNEL);
if (!async_data->out) {
err = -ENOMEM;
goto err_out;
}
- if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
- if (async_data->stop_copy_chunk) {
- u8 header_idx = buf->stop_copy_chunk_num ?
- buf->stop_copy_chunk_num - 1 : 0;
+ if (async_data->stop_copy_chunk) {
+ u8 header_idx = buf->stop_copy_chunk_num ?
+ buf->stop_copy_chunk_num - 1 : 0;
- header_buf = migf->buf_header[header_idx];
- migf->buf_header[header_idx] = NULL;
- }
+ header_buf = migf->buf_header[header_idx];
+ migf->buf_header[header_idx] = NULL;
+ }
- if (!header_buf) {
- header_buf = mlx5vf_get_data_buffer(migf,
- sizeof(struct mlx5_vf_migration_header), DMA_NONE);
- if (IS_ERR(header_buf)) {
- err = PTR_ERR(header_buf);
- goto err_free;
- }
+ if (!header_buf) {
+ header_buf = mlx5vf_get_data_buffer(migf,
+ sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ if (IS_ERR(header_buf)) {
+ err = PTR_ERR(header_buf);
+ goto err_free;
}
}
@@ -900,6 +969,29 @@ static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
}
+static int mlx5vf_cmd_query_tracker(struct mlx5_core_dev *mdev,
+ struct mlx5_vhca_page_tracker *tracker)
+{
+ u32 out[MLX5_ST_SZ_DW(query_page_track_obj_out)] = {};
+ u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
+ void *obj_context;
+ void *cmd_hdr;
+ int err;
+
+ cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker->id);
+
+ err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+ if (err)
+ return err;
+
+ obj_context = MLX5_ADDR_OF(query_page_track_obj_out, out, obj_context);
+ tracker->status = MLX5_GET(page_track, obj_context, state);
+ return 0;
+}
+
static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
struct mlx5_vhca_cq_buf *buf, int nent,
int cqe_size)
@@ -957,9 +1049,11 @@ static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
struct mlx5vf_pci_core_device *mvdev = container_of(
tracker, struct mlx5vf_pci_core_device, tracker);
+ struct mlx5_eqe_obj_change *object;
struct mlx5_eqe *eqe = data;
u8 event_type = (u8)type;
u8 queue_type;
+ u32 obj_id;
int qp_num;
switch (event_type) {
@@ -975,6 +1069,12 @@ static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
break;
set_tracker_error(mvdev);
break;
+ case MLX5_EVENT_TYPE_OBJECT_CHANGE:
+ object = &eqe->data.obj_change;
+ obj_id = be32_to_cpu(object->obj_id);
+ if (obj_id == tracker->id)
+ set_tracker_change_event(mvdev);
+ break;
default:
break;
}
@@ -1242,7 +1342,7 @@ static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
{
int i;
- /* Undo alloc_pages_bulk_array() */
+ /* Undo alloc_pages_bulk() */
for (i = 0; i < recv_buf->npages; i++)
__free_page(recv_buf->page_list[i]);
@@ -1261,9 +1361,9 @@ static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
return -ENOMEM;
for (;;) {
- filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT,
- npages - done,
- recv_buf->page_list + done);
+ filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT,
+ npages - done,
+ recv_buf->page_list + done);
if (!filled)
goto err;
@@ -1417,7 +1517,8 @@ int mlx5vf_start_page_tracker(struct vfio_device *vdev,
struct mlx5_vhca_qp *host_qp;
struct mlx5_vhca_qp *fw_qp;
struct mlx5_core_dev *mdev;
- u32 max_msg_size = PAGE_SIZE;
+ u32 log_max_msg_size;
+ u32 max_msg_size;
u64 rq_size = SZ_2M;
u32 max_recv_wr;
int err;
@@ -1434,6 +1535,12 @@ int mlx5vf_start_page_tracker(struct vfio_device *vdev,
}
mdev = mvdev->mdev;
+ log_max_msg_size = MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_msg_size);
+ max_msg_size = (1ULL << log_max_msg_size);
+ /* The RQ must hold at least 4 WQEs/messages for successful QP creation */
+ if (rq_size < 4 * max_msg_size)
+ rq_size = 4 * max_msg_size;
+
memset(tracker, 0, sizeof(*tracker));
tracker->uar = mlx5_get_uars_page(mdev);
if (IS_ERR(tracker->uar)) {
@@ -1523,25 +1630,41 @@ set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
{
u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
u32 nent = size / entry_size;
+ u32 nent_in_page;
+ u32 nent_to_set;
struct page *page;
+ u32 page_offset;
+ u32 page_index;
+ u32 buf_offset;
+ void *kaddr;
u64 addr;
u64 *buf;
int i;
- if (WARN_ON(index >= qp->recv_buf.npages ||
+ buf_offset = index * qp->max_msg_size;
+ if (WARN_ON(buf_offset + size >= qp->recv_buf.npages * PAGE_SIZE ||
(nent > qp->max_msg_size / entry_size)))
return;
- page = qp->recv_buf.page_list[index];
- buf = kmap_local_page(page);
- for (i = 0; i < nent; i++) {
- addr = MLX5_GET(page_track_report_entry, buf + i,
- dirty_address_low);
- addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
- dirty_address_high) << 32;
- iova_bitmap_set(dirty, addr, qp->tracked_page_size);
- }
- kunmap_local(buf);
+ do {
+ page_index = buf_offset / PAGE_SIZE;
+ page_offset = buf_offset % PAGE_SIZE;
+ nent_in_page = (PAGE_SIZE - page_offset) / entry_size;
+ page = qp->recv_buf.page_list[page_index];
+ kaddr = kmap_local_page(page);
+ buf = kaddr + page_offset;
+ nent_to_set = min(nent, nent_in_page);
+ for (i = 0; i < nent_to_set; i++) {
+ addr = MLX5_GET(page_track_report_entry, buf + i,
+ dirty_address_low);
+ addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
+ dirty_address_high) << 32;
+ iova_bitmap_set(dirty, addr, qp->tracked_page_size);
+ }
+ kunmap_local(kaddr);
+ buf_offset += (nent_to_set * entry_size);
+ nent -= nent_to_set;
+ } while (nent);
}
static void
@@ -1634,6 +1757,11 @@ int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
goto end;
}
+ if (tracker->is_err) {
+ err = -EIO;
+ goto end;
+ }
+
mdev = mvdev->mdev;
err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
MLX5_PAGE_TRACK_STATE_REPORTING);
@@ -1652,6 +1780,12 @@ int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
dirty, &tracker->status);
if (poll_err == CQ_EMPTY) {
wait_for_completion(&mvdev->tracker_comp);
+ if (tracker->object_changed) {
+ tracker->object_changed = false;
+ err = mlx5vf_cmd_query_tracker(mdev, tracker);
+ if (err)
+ goto end;
+ }
continue;
}
}
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index f2c7227fa683..df421dc6de04 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -13,9 +13,6 @@
#include <linux/mlx5/cq.h>
#include <linux/mlx5/qp.h>
-#define MLX5VF_PRE_COPY_SUPP(mvdev) \
- ((mvdev)->core_device.vdev.migration_flags & VFIO_MIGRATION_PRE_COPY)
-
enum mlx5_vf_migf_state {
MLX5_MIGF_STATE_ERROR = 1,
MLX5_MIGF_STATE_PRE_COPY_ERROR,
@@ -25,7 +22,6 @@ enum mlx5_vf_migf_state {
};
enum mlx5_vf_load_state {
- MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER,
MLX5_VF_LOAD_STATE_READ_HEADER,
MLX5_VF_LOAD_STATE_PREP_HEADER_DATA,
MLX5_VF_LOAD_STATE_READ_HEADER_DATA,
@@ -162,6 +158,7 @@ struct mlx5_vhca_page_tracker {
u32 id;
u32 pdn;
u8 is_err:1;
+ u8 object_changed:1;
struct mlx5_uars_page *uar;
struct mlx5_vhca_cq cq;
struct mlx5_vhca_qp *host_qp;
@@ -196,6 +193,7 @@ struct mlx5vf_pci_core_device {
enum {
MLX5VF_QUERY_INC = (1UL << 0),
MLX5VF_QUERY_FINAL = (1UL << 1),
+ MLX5VF_QUERY_CLEANUP = (1UL << 2),
};
int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
@@ -226,12 +224,11 @@ struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
size_t length, enum dma_data_direction dma_dir);
void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf);
-int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
- unsigned int npages);
struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
unsigned long offset);
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev);
-void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev);
+void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
+ enum mlx5_vf_migf_state *last_save_state);
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work);
void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
u8 chunk_num, size_t next_required_umem_size);
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index fe09a8c8af95..709543e7eb04 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -65,50 +65,6 @@ mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
return NULL;
}
-int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
- unsigned int npages)
-{
- unsigned int to_alloc = npages;
- struct page **page_list;
- unsigned long filled;
- unsigned int to_fill;
- int ret;
-
- to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
- page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
- if (!page_list)
- return -ENOMEM;
-
- do {
- filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
- page_list);
- if (!filled) {
- ret = -ENOMEM;
- goto err;
- }
- to_alloc -= filled;
- ret = sg_alloc_append_table_from_pages(
- &buf->table, page_list, filled, 0,
- filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
- GFP_KERNEL_ACCOUNT);
-
- if (ret)
- goto err;
- buf->allocated_length += filled * PAGE_SIZE;
- /* clean input for another bulk allocation */
- memset(page_list, 0, filled * sizeof(*page_list));
- to_fill = min_t(unsigned int, to_alloc,
- PAGE_SIZE / sizeof(*page_list));
- } while (to_alloc > 0);
-
- kvfree(page_list);
- return 0;
-
-err:
- kvfree(page_list);
- return ret;
-}
-
static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
mutex_lock(&migf->lock);
@@ -631,7 +587,6 @@ static const struct file_operations mlx5vf_save_fops = {
.unlocked_ioctl = mlx5vf_precopy_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.release = mlx5vf_release_file,
- .llseek = no_llseek,
};
static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
@@ -685,14 +640,11 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
O_RDONLY);
if (IS_ERR(migf->filp)) {
ret = PTR_ERR(migf->filp);
- goto end;
+ kfree(migf);
+ return ERR_PTR(ret);
}
migf->mvdev = mvdev;
- ret = mlx5vf_cmd_alloc_pd(migf);
- if (ret)
- goto out_free;
-
stream_open(migf->filp->f_inode, migf->filp);
mutex_init(&migf->lock);
init_waitqueue_head(&migf->poll_wait);
@@ -708,6 +660,11 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
INIT_LIST_HEAD(&migf->buf_list);
INIT_LIST_HEAD(&migf->avail_list);
spin_lock_init(&migf->list_lock);
+
+ ret = mlx5vf_cmd_alloc_pd(migf);
+ if (ret)
+ goto out;
+
ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
if (ret)
goto out_pd;
@@ -737,10 +694,8 @@ out_save:
mlx5vf_free_data_buffer(buf);
out_pd:
mlx5fv_cmd_clean_migf_resources(migf);
-out_free:
+out:
fput(migf->filp);
-end:
- kfree(migf);
return ERR_PTR(ret);
}
@@ -777,36 +732,6 @@ mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
return 0;
}
-static int
-mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
- loff_t requested_length,
- const char __user **buf, size_t *len,
- loff_t *pos, ssize_t *done)
-{
- int ret;
-
- if (requested_length > MAX_LOAD_SIZE)
- return -ENOMEM;
-
- if (vhca_buf->allocated_length < requested_length) {
- ret = mlx5vf_add_migration_pages(
- vhca_buf,
- DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
- PAGE_SIZE));
- if (ret)
- return ret;
- }
-
- while (*len) {
- ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos,
- done);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
static ssize_t
mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
struct mlx5_vhca_data_buffer *vhca_buf,
@@ -1038,13 +963,6 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
break;
}
- case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER:
- ret = mlx5vf_resume_read_image_no_header(vhca_buf,
- requested_length,
- &buf, &len, pos, &done);
- if (ret)
- goto out_unlock;
- break;
case MLX5_VF_LOAD_STATE_READ_IMAGE:
ret = mlx5vf_resume_read_image(migf, vhca_buf,
migf->record_size,
@@ -1081,7 +999,6 @@ static const struct file_operations mlx5vf_resume_fops = {
.owner = THIS_MODULE,
.write = mlx5vf_resume_write,
.release = mlx5vf_release_file,
- .llseek = no_llseek,
};
static struct mlx5_vf_migration_file *
@@ -1099,13 +1016,19 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
O_WRONLY);
if (IS_ERR(migf->filp)) {
ret = PTR_ERR(migf->filp);
- goto end;
+ kfree(migf);
+ return ERR_PTR(ret);
}
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+ INIT_LIST_HEAD(&migf->buf_list);
+ INIT_LIST_HEAD(&migf->avail_list);
+ spin_lock_init(&migf->list_lock);
migf->mvdev = mvdev;
ret = mlx5vf_cmd_alloc_pd(migf);
if (ret)
- goto out_free;
+ goto out;
buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
if (IS_ERR(buf)) {
@@ -1114,39 +1037,28 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
}
migf->buf[0] = buf;
- if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
- buf = mlx5vf_alloc_data_buffer(migf,
- sizeof(struct mlx5_vf_migration_header), DMA_NONE);
- if (IS_ERR(buf)) {
- ret = PTR_ERR(buf);
- goto out_buf;
- }
-
- migf->buf_header[0] = buf;
- migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
- } else {
- /* Initial state will be to read the image */
- migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
+ buf = mlx5vf_alloc_data_buffer(migf,
+ sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_buf;
}
- stream_open(migf->filp->f_inode, migf->filp);
- mutex_init(&migf->lock);
- INIT_LIST_HEAD(&migf->buf_list);
- INIT_LIST_HEAD(&migf->avail_list);
- spin_lock_init(&migf->list_lock);
+ migf->buf_header[0] = buf;
+ migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
+
return migf;
out_buf:
mlx5vf_free_data_buffer(migf->buf[0]);
out_pd:
mlx5vf_cmd_dealloc_pd(migf);
-out_free:
+out:
fput(migf->filp);
-end:
- kfree(migf);
return ERR_PTR(ret);
}
-void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
+void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
+ enum mlx5_vf_migf_state *last_save_state)
{
if (mvdev->resuming_migf) {
mlx5vf_disable_fd(mvdev->resuming_migf);
@@ -1157,6 +1069,8 @@ void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
if (mvdev->saving_migf) {
mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
cancel_work_sync(&mvdev->saving_migf->async_data.work);
+ if (last_save_state)
+ *last_save_state = mvdev->saving_migf->state;
mlx5vf_disable_fd(mvdev->saving_migf);
wake_up_interruptible(&mvdev->saving_migf->poll_wait);
mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
@@ -1217,12 +1131,34 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
return migf->filp;
}
- if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
- (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
+ if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
+ mlx5vf_disable_fds(mvdev, NULL);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
(cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
- mlx5vf_disable_fds(mvdev);
- return NULL;
+ struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
+ struct mlx5_vhca_data_buffer *buf;
+ enum mlx5_vf_migf_state state;
+ size_t size;
+
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL,
+ MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP);
+ if (ret)
+ return ERR_PTR(ret);
+ buf = mlx5vf_get_data_buffer(migf, size, DMA_FROM_DEVICE);
+ if (IS_ERR(buf))
+ return ERR_CAST(buf);
+ /* pre_copy cleanup */
+ ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, false);
+ if (ret) {
+ mlx5vf_put_data_buffer(buf);
+ return ERR_PTR(ret);
+ }
+ mlx5vf_disable_fds(mvdev, &state);
+ return (state != MLX5_MIGF_STATE_ERROR) ? NULL : ERR_PTR(-EIO);
}
if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
@@ -1237,14 +1173,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
}
if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
- if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
- ret = mlx5vf_cmd_load_vhca_state(mvdev,
- mvdev->resuming_migf,
- mvdev->resuming_migf->buf[0]);
- if (ret)
- return ERR_PTR(ret);
- }
- mlx5vf_disable_fds(mvdev);
+ mlx5vf_disable_fds(mvdev, NULL);
return NULL;
}
@@ -1289,7 +1218,7 @@ again:
mvdev->deferred_reset = false;
spin_unlock(&mvdev->reset_lock);
mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
- mlx5vf_disable_fds(mvdev);
+ mlx5vf_disable_fds(mvdev, NULL);
goto again;
}
mutex_unlock(&mvdev->state_mutex);
@@ -1517,7 +1446,7 @@ static struct pci_driver mlx5vf_pci_driver = {
module_pci_driver(mlx5vf_pci_driver);
-MODULE_IMPORT_NS(IOMMUFD);
+MODULE_IMPORT_NS("IOMMUFD");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
diff --git a/drivers/vfio/pci/nvgrace-gpu/Kconfig b/drivers/vfio/pci/nvgrace-gpu/Kconfig
new file mode 100644
index 000000000000..a7f624b37e41
--- /dev/null
+++ b/drivers/vfio/pci/nvgrace-gpu/Kconfig
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config NVGRACE_GPU_VFIO_PCI
+ tristate "VFIO support for the GPU in the NVIDIA Grace Hopper Superchip"
+ depends on ARM64 || (COMPILE_TEST && 64BIT)
+ select VFIO_PCI_CORE
+ help
+ VFIO support for the GPU in the NVIDIA Grace Hopper Superchip is
+ required to assign the GPU device to userspace using KVM/qemu/etc.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/nvgrace-gpu/Makefile b/drivers/vfio/pci/nvgrace-gpu/Makefile
new file mode 100644
index 000000000000..3ca8c187897a
--- /dev/null
+++ b/drivers/vfio/pci/nvgrace-gpu/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu-vfio-pci.o
+nvgrace-gpu-vfio-pci-y := main.o
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
new file mode 100644
index 000000000000..e5ac39c4cc6b
--- /dev/null
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -0,0 +1,1015 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include <linux/sizes.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/delay.h>
+#include <linux/jiffies.h>
+
+/*
+ * The device memory usable by the workloads running in the VM is cached
+ * and exposed as a 64b device BAR (comprising the BAR4 and BAR5 regions)
+ * to the VM and is represented as usemem.
+ * Moreover, the VM GPU device driver needs a non-cacheable region to
+ * support the MIG feature. This region is also exposed as a 64b BAR
+ * (comprising the BAR2 and BAR3 regions) and is represented as resmem.
+ */
+#define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX
+#define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX
+
+/* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
+#define MEMBLK_SIZE SZ_512M
+
+#define DVSEC_BITMAP_OFFSET 0xA
+#define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0)
+
+#define GPU_CAP_DVSEC_REGISTER 3
+
+#define C2C_LINK_BAR0_OFFSET 0x1498
+#define HBM_TRAINING_BAR0_OFFSET 0x200BC
+#define STATUS_READY 0xFF
+
+#define POLL_QUANTUM_MS 1000
+#define POLL_TIMEOUT_MS (30 * 1000)
+
+/*
+ * The state of the two device memory regions - resmem and usemem - is
+ * saved as struct mem_region.
+ */
+struct mem_region {
+ phys_addr_t memphys; /* Base physical address of the region */
+ size_t memlength; /* Region size */
+ size_t bar_size; /* Reported region BAR size */
+ __le64 bar_val; /* Emulated BAR offset registers */
+ union {
+ void *memaddr;
+ void __iomem *ioaddr;
+ }; /* Base virtual address of the region */
+};
+
+struct nvgrace_gpu_pci_core_device {
+ struct vfio_pci_core_device core_device;
+ /* Cached and usable memory for the VM. */
+ struct mem_region usemem;
+ /* Non cached memory carved out from the end of device memory */
+ struct mem_region resmem;
+ /* Lock to control device memory kernel mapping */
+ struct mutex remap_lock;
+ bool has_mig_hw_bug;
+};
+
+static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
+{
+ struct nvgrace_gpu_pci_core_device *nvdev =
+ container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
+ core_device.vdev);
+
+ nvdev->resmem.bar_val = 0;
+ nvdev->usemem.bar_val = 0;
+}
+
+/* Choose the structure corresponding to the fake BAR with a given index. */
+static struct mem_region *
+nvgrace_gpu_memregion(int index,
+ struct nvgrace_gpu_pci_core_device *nvdev)
+{
+ if (index == USEMEM_REGION_INDEX)
+ return &nvdev->usemem;
+
+ if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX)
+ return &nvdev->resmem;
+
+ return NULL;
+}
+
+static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
+{
+ struct vfio_pci_core_device *vdev =
+ container_of(core_vdev, struct vfio_pci_core_device, vdev);
+ struct nvgrace_gpu_pci_core_device *nvdev =
+ container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
+ core_device.vdev);
+ int ret;
+
+ ret = vfio_pci_core_enable(vdev);
+ if (ret)
+ return ret;
+
+ if (nvdev->usemem.memlength) {
+ nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
+ mutex_init(&nvdev->remap_lock);
+ }
+
+ vfio_pci_core_finish_enable(vdev);
+
+ return 0;
+}
+
+static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
+{
+ struct nvgrace_gpu_pci_core_device *nvdev =
+ container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
+ core_device.vdev);
+
+ /* Unmap the mapping to the device memory cached region */
+ if (nvdev->usemem.memaddr) {
+ memunmap(nvdev->usemem.memaddr);
+ nvdev->usemem.memaddr = NULL;
+ }
+
+ /* Unmap the mapping to the device memory non-cached region */
+ if (nvdev->resmem.ioaddr) {
+ iounmap(nvdev->resmem.ioaddr);
+ nvdev->resmem.ioaddr = NULL;
+ }
+
+ mutex_destroy(&nvdev->remap_lock);
+
+ vfio_pci_core_close_device(core_vdev);
+}
+
+static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
+ struct vm_area_struct *vma)
+{
+ struct nvgrace_gpu_pci_core_device *nvdev =
+ container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
+ core_device.vdev);
+ struct mem_region *memregion;
+ unsigned long start_pfn;
+ u64 req_len, pgoff, end;
+ unsigned int index;
+ int ret = 0;
+
+ index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+
+ memregion = nvgrace_gpu_memregion(index, nvdev);
+ if (!memregion)
+ return vfio_pci_core_mmap(core_vdev, vma);
+
+ /*
+ * Request to mmap the BAR. Map to the CPU accessible memory on the
+ * GPU using the memory information gathered from the system ACPI
+ * tables.
+ */
+ pgoff = vma->vm_pgoff &
+ ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+ if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) ||
+ check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) ||
+ check_add_overflow(PFN_PHYS(pgoff), req_len, &end))
+ return -EOVERFLOW;
+
+ /*
+ * Check that the mapping request does not go beyond available device
+ * memory size
+ */
+ if (end > memregion->memlength)
+ return -EINVAL;
+
+ /*
+ * The carved out region of the device memory needs the NORMAL_NC
+ * property. Communicate as such to the hypervisor.
+ */
+ if (index == RESMEM_REGION_INDEX) {
+ /*
+ * The nvgrace-gpu module has no issues with uncontained
+ * failures on NORMAL_NC accesses. VM_ALLOW_ANY_UNCACHED is
+ * set to communicate to the KVM to S2 map as NORMAL_NC.
+ * This opens up guest usage of NORMAL_NC for this mapping.
+ */
+ vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED);
+
+ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+ }
+
+ /*
+ * Perform a PFN map to the memory and back the device BAR by the
+ * GPU memory.
+ *
+ * The available GPU memory size may not be power-of-2 aligned. The
+ * remainder is only backed by vfio_device_ops read/write handlers.
+ *
+ * During device reset, the GPU is safely disconnected from the CPU
+ * and accesses to the BAR return immediately, preventing a machine
+ * check.
+ */
+ ret = remap_pfn_range(vma, vma->vm_start, start_pfn,
+ req_len, vma->vm_page_prot);
+ if (ret)
+ return ret;
+
+ vma->vm_pgoff = start_pfn;
+
+ return 0;
+}
+
+static long
+nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
+ unsigned long arg)
+{
+ struct nvgrace_gpu_pci_core_device *nvdev =
+ container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
+ core_device.vdev);
+ unsigned long minsz = offsetofend(struct vfio_region_info, offset);
+ struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+ struct vfio_region_info_cap_sparse_mmap *sparse;
+ struct vfio_region_info info;
+ struct mem_region *memregion;
+ u32 size;
+ int ret;
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ /*
+ * Request to determine the BAR region information. Send the
+ * GPU memory information.
+ */
+ memregion = nvgrace_gpu_memregion(info.index, nvdev);
+ if (!memregion)
+ return vfio_pci_core_ioctl(core_vdev,
+ VFIO_DEVICE_GET_REGION_INFO, arg);
+
+ size = struct_size(sparse, areas, 1);
+
+ /*
+ * Set up the sparse mapping for the device memory. Only the
+ * available device memory on the hardware is shown as a
+ * mappable region.
+ */
+ sparse = kzalloc(size, GFP_KERNEL);
+ if (!sparse)
+ return -ENOMEM;
+
+ sparse->nr_areas = 1;
+ sparse->areas[0].offset = 0;
+ sparse->areas[0].size = memregion->memlength;
+ sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
+ sparse->header.version = 1;
+
+ ret = vfio_info_add_capability(&caps, &sparse->header, size);
+ kfree(sparse);
+ if (ret)
+ return ret;
+
+ info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+ /*
+ * The region memory size may not be power-of-2 aligned.
+ * Since the memory is exposed as a BAR, round the reported
+ * size up to the next power-of-2.
+ */
+ info.size = memregion->bar_size;
+ info.flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE |
+ VFIO_REGION_INFO_FLAG_MMAP;
+
+ if (caps.size) {
+ info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
+ if (info.argsz < sizeof(info) + caps.size) {
+ info.argsz = sizeof(info) + caps.size;
+ info.cap_offset = 0;
+ } else {
+ vfio_info_cap_shift(&caps, sizeof(info));
+ if (copy_to_user((void __user *)arg +
+ sizeof(info), caps.buf,
+ caps.size)) {
+ kfree(caps.buf);
+ return -EFAULT;
+ }
+ info.cap_offset = sizeof(info);
+ }
+ kfree(caps.buf);
+ }
+ return copy_to_user((void __user *)arg, &info, minsz) ?
+ -EFAULT : 0;
+}
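
For reference, the GET_REGION_INFO path above follows the standard VFIO capability-chain UAPI: userspace first learns the required argsz, then retries with room for the capabilities and walks the chain looking for the sparse-mmap areas. A minimal userspace sketch follows; the function name, fd handling and error paths are illustrative only and not part of this patch.

    /* Illustrative userspace sketch (not part of this patch): query a region
     * through VFIO_DEVICE_GET_REGION_INFO and walk its capability chain for
     * VFIO_REGION_INFO_CAP_SPARSE_MMAP.
     */
    #include <linux/vfio.h>
    #include <sys/ioctl.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int query_sparse_mmap(int device_fd, unsigned int index)
    {
        struct vfio_region_info *info;
        size_t argsz = sizeof(*info);

        info = calloc(1, argsz);
        if (!info)
            return -1;
        info->argsz = argsz;
        info->index = index;

        if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, info))
            goto err;

        if (info->argsz > argsz) {
            /* The kernel reported the size needed for the capability chain. */
            argsz = info->argsz;
            free(info);
            info = calloc(1, argsz);
            if (!info)
                return -1;
            info->argsz = argsz;
            info->index = index;
            if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, info))
                goto err;
        }

        printf("region %u: size 0x%llx offset 0x%llx\n", index,
               (unsigned long long)info->size, (unsigned long long)info->offset);

        if ((info->flags & VFIO_REGION_INFO_FLAG_CAPS) && info->cap_offset) {
            struct vfio_info_cap_header *hdr =
                (struct vfio_info_cap_header *)((char *)info + info->cap_offset);

            for (;;) {
                if (hdr->id == VFIO_REGION_INFO_CAP_SPARSE_MMAP) {
                    struct vfio_region_info_cap_sparse_mmap *sparse =
                        (struct vfio_region_info_cap_sparse_mmap *)hdr;
                    unsigned int i;

                    for (i = 0; i < sparse->nr_areas; i++)
                        printf("  mmap area %u: offset 0x%llx size 0x%llx\n", i,
                               (unsigned long long)sparse->areas[i].offset,
                               (unsigned long long)sparse->areas[i].size);
                }
                if (!hdr->next)
                    break;
                hdr = (struct vfio_info_cap_header *)((char *)info + hdr->next);
            }
        }
        free(info);
        return 0;
    err:
        free(info);
        return -1;
    }
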
+
+static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev,
+ unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case VFIO_DEVICE_GET_REGION_INFO:
+ return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg);
+ case VFIO_DEVICE_IOEVENTFD:
+ return -ENOTTY;
+ case VFIO_DEVICE_RESET:
+ nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
+ fallthrough;
+ default:
+ return vfio_pci_core_ioctl(core_vdev, cmd, arg);
+ }
+}
+
+static __le64
+nvgrace_gpu_get_read_value(size_t bar_size, u64 flags, __le64 val64)
+{
+ u64 tmp_val;
+
+ tmp_val = le64_to_cpu(val64);
+ tmp_val &= ~(bar_size - 1);
+ tmp_val |= flags;
+
+ return cpu_to_le64(tmp_val);
+}
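
A quick worked example of the helper above, assuming a made-up 64 GB fake BAR: a guest BAR-sizing write of all-ones reads back with the low bits masked to the BAR size and the 64-bit/prefetchable flags set, so the guest decodes the expected size. The sketch below is illustrative only and not part of this patch.

    /* Illustrative sketch (not part of this patch): how a guest BAR-sizing
     * write of all-ones reads back through the masking above. The 64 GB
     * bar_size is a made-up example value.
     */
    #include <stdio.h>
    #include <stdint.h>

    #define PCI_BASE_ADDRESS_MEM_TYPE_64  0x04
    #define PCI_BASE_ADDRESS_MEM_PREFETCH 0x08

    int main(void)
    {
        uint64_t bar_size = 64ULL << 30;        /* example: 64 GB fake BAR */
        uint64_t written = ~0ULL;               /* guest sizing write */
        uint64_t flags = PCI_BASE_ADDRESS_MEM_TYPE_64 |
                         PCI_BASE_ADDRESS_MEM_PREFETCH;
        uint64_t readback = (written & ~(bar_size - 1)) | flags;
        /* The guest derives the size by masking the flag bits and negating. */
        uint64_t decoded = ~(readback & ~0xFULL) + 1;

        printf("readback 0x%llx decoded size 0x%llx\n",
               (unsigned long long)readback, (unsigned long long)decoded);
        return 0;   /* prints readback 0xfffffff00000000c decoded size 0x1000000000 */
    }
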
+
+/*
+ * Both the usable (usemem) and the reserved (resmem) device memory regions
+ * are exposed as 64b fake device BARs in the VM. These fake BARs must
+ * respond to accesses at their respective PCI config space offsets.
+ *
+ * resmem BAR owns PCI_BASE_ADDRESS_2 & PCI_BASE_ADDRESS_3.
+ * usemem BAR owns PCI_BASE_ADDRESS_4 & PCI_BASE_ADDRESS_5.
+ */
+static ssize_t
+nvgrace_gpu_read_config_emu(struct vfio_device *core_vdev,
+ char __user *buf, size_t count, loff_t *ppos)
+{
+ struct nvgrace_gpu_pci_core_device *nvdev =
+ container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
+ core_device.vdev);
+ u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
+ struct mem_region *memregion = NULL;
+ __le64 val64;
+ size_t register_offset;
+ loff_t copy_offset;
+ size_t copy_count;
+ int ret;
+
+ ret = vfio_pci_core_read(core_vdev, buf, count, ppos);
+ if (ret < 0)
+ return ret;
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
+ sizeof(val64),
+ &copy_offset, &copy_count,
+ &register_offset))
+ memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev);
+ else if (vfio_pci_core_range_intersect_range(pos, count,
+ PCI_BASE_ADDRESS_4,
+ sizeof(val64),
+ &copy_offset, &copy_count,
+ &register_offset))
+ memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev);
+
+ if (memregion) {
+ val64 = nvgrace_gpu_get_read_value(memregion->bar_size,
+ PCI_BASE_ADDRESS_MEM_TYPE_64 |
+ PCI_BASE_ADDRESS_MEM_PREFETCH,
+ memregion->bar_val);
+ if (copy_to_user(buf + copy_offset,
+ (void *)&val64 + register_offset, copy_count)) {
+ /*
+ * The position has been incremented in
+ * vfio_pci_core_read. Reset the offset back to the
+ * starting position.
+ */
+ *ppos -= count;
+ return -EFAULT;
+ }
+ }
+
+ return count;
+}
+
+static ssize_t
+nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev,
+ const char __user *buf, size_t count, loff_t *ppos)
+{
+ struct nvgrace_gpu_pci_core_device *nvdev =
+ container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
+ core_device.vdev);
+ u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
+ struct mem_region *memregion = NULL;
+ size_t register_offset;
+ loff_t copy_offset;
+ size_t copy_count;
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
+ sizeof(u64), &copy_offset,
+ &copy_count, &register_offset))
+ memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev);
+ else if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_4,
+ sizeof(u64), &copy_offset,
+ &copy_count, &register_offset))
+ memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev);
+
+ if (memregion) {
+ if (copy_from_user((void *)&memregion->bar_val + register_offset,
+ buf + copy_offset, copy_count))
+ return -EFAULT;
+ *ppos += copy_count;
+ return copy_count;
+ }
+
+ return vfio_pci_core_write(core_vdev, buf, count, ppos);
+}
+
+/*
+ * Map the device memory into the kernel VA space on demand. This is needed
+ * because vfio does not require the userspace driver to access the vfio-pci
+ * BAR regions only through mmap; such accesses must also be supported through
+ * the vfio_device_ops read/write implementations.
+ *
+ * The usemem region is cacheable memory and hence is memremapped.
+ * The resmem region is non-cached and is mapped using ioremap_wc (NORMAL_NC).
+ */
+static int
+nvgrace_gpu_map_device_mem(int index,
+ struct nvgrace_gpu_pci_core_device *nvdev)
+{
+ struct mem_region *memregion;
+ int ret = 0;
+
+ memregion = nvgrace_gpu_memregion(index, nvdev);
+ if (!memregion)
+ return -EINVAL;
+
+ mutex_lock(&nvdev->remap_lock);
+
+ if (memregion->memaddr)
+ goto unlock;
+
+ if (index == USEMEM_REGION_INDEX)
+ memregion->memaddr = memremap(memregion->memphys,
+ memregion->memlength,
+ MEMREMAP_WB);
+ else
+ memregion->ioaddr = ioremap_wc(memregion->memphys,
+ memregion->memlength);
+
+ if (!memregion->memaddr)
+ ret = -ENOMEM;
+
+unlock:
+ mutex_unlock(&nvdev->remap_lock);
+
+ return ret;
+}
+
+/*
+ * Read the data from the device memory (mapped either through ioremap
+ * or memremap) into the user buffer.
+ */
+static int
+nvgrace_gpu_map_and_read(struct nvgrace_gpu_pci_core_device *nvdev,
+ char __user *buf, size_t mem_count, loff_t *ppos)
+{
+ unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
+ int ret;
+
+ if (!mem_count)
+ return 0;
+
+ /*
+ * Handle reads on the BAR regions. Map the target device memory
+ * physical address and copy into the requested read buffer.
+ */
+ ret = nvgrace_gpu_map_device_mem(index, nvdev);
+ if (ret)
+ return ret;
+
+ if (index == USEMEM_REGION_INDEX) {
+ if (copy_to_user(buf,
+ (u8 *)nvdev->usemem.memaddr + offset,
+ mem_count))
+ ret = -EFAULT;
+ } else {
+ /*
+ * The hardware ensures that the system does not crash when
+ * the device memory is accessed with the memory enable
+ * turned off. It synthesizes ~0 on such reads. So there is
+ * no need to check or support the disablement/enablement of
+ * BAR through PCI_COMMAND config space register. Pass
+ * test_mem flag as false.
+ */
+ ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
+ nvdev->resmem.ioaddr,
+ buf, offset, mem_count,
+ 0, 0, false);
+ }
+
+ return ret;
+}
+
+/*
+ * Read count bytes from the device memory at an offset. The actual device
+ * memory size (available) may not be a power-of-2. So the driver fakes
+ * the size to a power-of-2 (reported) when exposing it to a user space driver.
+ *
+ * Reads starting beyond the reported size generate -EINVAL; reads extending
+ * beyond the actual device size are filled with ~0; reads extending beyond
+ * the reported size are truncated.
+ */
+static ssize_t
+nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,
+ char __user *buf, size_t count, loff_t *ppos)
+{
+ u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
+ unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ struct mem_region *memregion;
+ size_t mem_count, i;
+ u8 val = 0xFF;
+ int ret;
+
+ /* No NULL check needed; the caller has already verified the region exists. */
+ memregion = nvgrace_gpu_memregion(index, nvdev);
+
+ if (offset >= memregion->bar_size)
+ return -EINVAL;
+
+ /* Clip the read request to the reported BAR size */
+ count = min(count, memregion->bar_size - (size_t)offset);
+
+ /*
+ * Determine how many bytes are actually read from the device memory.
+ * The portion of a read beyond the actual device memory size is filled
+ * with ~0, while anything beyond the reported size was truncated above.
+ */
+ if (offset >= memregion->memlength)
+ mem_count = 0;
+ else
+ mem_count = min(count, memregion->memlength - (size_t)offset);
+
+ ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
+ if (ret)
+ return ret;
+
+ /*
+ * Only the device memory present on the hardware is mapped, which may
+ * not be power-of-2 aligned. A read to an offset beyond the device memory
+ * size is filled with ~0.
+ */
+ for (i = mem_count; i < count; i++) {
+ ret = put_user(val, (unsigned char __user *)(buf + i));
+ if (ret)
+ return ret;
+ }
+
+ *ppos += count;
+ return count;
+}
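
To make the reported-versus-actual size handling above concrete, the sketch below replays the clipping logic with hypothetical numbers (3 GB of real memory reported as a 4 GB BAR); it is illustrative only and not part of this patch.

    /* Illustrative sketch (not part of this patch): the clipping done above for
     * a hypothetical region with 3 GB of real memory reported as a 4 GB BAR.
     */
    #include <stdio.h>
    #include <stdint.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
        uint64_t memlength = 3ULL << 30;        /* actual device memory */
        uint64_t bar_size = 4ULL << 30;         /* roundup_pow_of_two(memlength) */
        uint64_t offset = (3ULL << 30) - 512;   /* read straddling the boundary */
        uint64_t count = 4096, mem_count;

        if (offset >= bar_size)
            return 1;                           /* -EINVAL in the driver */

        count = MIN(count, bar_size - offset);  /* truncate at the reported size */
        /* Bytes backed by real memory; the rest of the buffer is filled with ~0. */
        mem_count = offset >= memlength ? 0 : MIN(count, memlength - offset);

        printf("count %llu mem_count %llu fill %llu\n",
               (unsigned long long)count, (unsigned long long)mem_count,
               (unsigned long long)(count - mem_count));
        return 0;   /* prints count 4096 mem_count 512 fill 3584 */
    }
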
+
+static ssize_t
+nvgrace_gpu_read(struct vfio_device *core_vdev,
+ char __user *buf, size_t count, loff_t *ppos)
+{
+ unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ struct nvgrace_gpu_pci_core_device *nvdev =
+ container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
+ core_device.vdev);
+
+ if (nvgrace_gpu_memregion(index, nvdev))
+ return nvgrace_gpu_read_mem(nvdev, buf, count, ppos);
+
+ if (index == VFIO_PCI_CONFIG_REGION_INDEX)
+ return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos);
+
+ return vfio_pci_core_read(core_vdev, buf, count, ppos);
+}
+
+/*
+ * Write the data to the device memory (mapped either through ioremap
+ * or memremap) from the user buffer.
+ */
+static int
+nvgrace_gpu_map_and_write(struct nvgrace_gpu_pci_core_device *nvdev,
+ const char __user *buf, size_t mem_count,
+ loff_t *ppos)
+{
+ unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+ int ret;
+
+ if (!mem_count)
+ return 0;
+
+ ret = nvgrace_gpu_map_device_mem(index, nvdev);
+ if (ret)
+ return ret;
+
+ if (index == USEMEM_REGION_INDEX) {
+ if (copy_from_user((u8 *)nvdev->usemem.memaddr + pos,
+ buf, mem_count))
+ return -EFAULT;
+ } else {
+ /*
+ * The hardware ensures that the system does not crash when
+ * the device memory is accessed with the memory enable
+ * turned off. It drops such writes. So there is no need to
+ * check or support the disablement/enablement of BAR
+ * through PCI_COMMAND config space register. Pass test_mem
+ * flag as false.
+ */
+ ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
+ nvdev->resmem.ioaddr,
+ (char __user *)buf, pos, mem_count,
+ 0, 0, true);
+ }
+
+ return ret;
+}
+
+/*
+ * Write count bytes to the device memory at a given offset. The actual device
+ * memory size (available) may not be a power-of-2. So the driver fakes the
+ * size to a power-of-2 (reported) when exposing it to a user space driver.
+ *
+ * Writes extending beyond the reported size are truncated; writes starting
+ * beyond the reported size generate -EINVAL.
+ */
+static ssize_t
+nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev,
+ size_t count, loff_t *ppos, const char __user *buf)
+{
+ unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
+ struct mem_region *memregion;
+ size_t mem_count;
+ int ret = 0;
+
+ /* No NULL check needed; the caller has already verified the region exists. */
+ memregion = nvgrace_gpu_memregion(index, nvdev);
+
+ if (offset >= memregion->bar_size)
+ return -EINVAL;
+
+ /* Clip the write request to the reported BAR size */
+ count = min(count, memregion->bar_size - (size_t)offset);
+
+ /*
+ * Determine how many bytes are actually written to the device memory.
+ * Do not write to offsets beyond the available size.
+ */
+ if (offset >= memregion->memlength)
+ goto exitfn;
+
+ /*
+ * Only the device memory present on the hardware is mapped, which may
+ * not be power-of-2 aligned. Drop access outside the available device
+ * memory on the hardware.
+ */
+ mem_count = min(count, memregion->memlength - (size_t)offset);
+
+ ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
+ if (ret)
+ return ret;
+
+exitfn:
+ *ppos += count;
+ return count;
+}
+
+static ssize_t
+nvgrace_gpu_write(struct vfio_device *core_vdev,
+ const char __user *buf, size_t count, loff_t *ppos)
+{
+ struct nvgrace_gpu_pci_core_device *nvdev =
+ container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
+ core_device.vdev);
+ unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+
+ if (nvgrace_gpu_memregion(index, nvdev))
+ return nvgrace_gpu_write_mem(nvdev, count, ppos, buf);
+
+ if (index == VFIO_PCI_CONFIG_REGION_INDEX)
+ return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos);
+
+ return vfio_pci_core_write(core_vdev, buf, count, ppos);
+}
+
+static const struct vfio_device_ops nvgrace_gpu_pci_ops = {
+ .name = "nvgrace-gpu-vfio-pci",
+ .init = vfio_pci_core_init_dev,
+ .release = vfio_pci_core_release_dev,
+ .open_device = nvgrace_gpu_open_device,
+ .close_device = nvgrace_gpu_close_device,
+ .ioctl = nvgrace_gpu_ioctl,
+ .device_feature = vfio_pci_core_ioctl_feature,
+ .read = nvgrace_gpu_read,
+ .write = nvgrace_gpu_write,
+ .mmap = nvgrace_gpu_mmap,
+ .request = vfio_pci_core_request,
+ .match = vfio_pci_core_match,
+ .bind_iommufd = vfio_iommufd_physical_bind,
+ .unbind_iommufd = vfio_iommufd_physical_unbind,
+ .attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
+};
+
+static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
+ .name = "nvgrace-gpu-vfio-pci-core",
+ .init = vfio_pci_core_init_dev,
+ .release = vfio_pci_core_release_dev,
+ .open_device = nvgrace_gpu_open_device,
+ .close_device = vfio_pci_core_close_device,
+ .ioctl = vfio_pci_core_ioctl,
+ .device_feature = vfio_pci_core_ioctl_feature,
+ .read = vfio_pci_core_read,
+ .write = vfio_pci_core_write,
+ .mmap = vfio_pci_core_mmap,
+ .request = vfio_pci_core_request,
+ .match = vfio_pci_core_match,
+ .bind_iommufd = vfio_iommufd_physical_bind,
+ .unbind_iommufd = vfio_iommufd_physical_unbind,
+ .attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
+};
+
+static int
+nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
+ u64 *pmemphys, u64 *pmemlength)
+{
+ int ret;
+
+ /*
+ * The memory information is present in the system ACPI tables as DSD
+ * properties nvidia,gpu-mem-base-pa and nvidia,gpu-mem-size.
+ */
+ ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-base-pa",
+ pmemphys);
+ if (ret)
+ return ret;
+
+ if (*pmemphys > type_max(phys_addr_t))
+ return -EOVERFLOW;
+
+ ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-size",
+ pmemlength);
+ if (ret)
+ return ret;
+
+ if (*pmemlength > type_max(size_t))
+ return -EOVERFLOW;
+
+ /*
+ * If the C2C link is not up due to an error, the coherent device
+ * memory size is returned as 0. Fail in such case.
+ */
+ if (*pmemlength == 0)
+ return -ENOMEM;
+
+ return ret;
+}
+
+static int
+nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
+ struct nvgrace_gpu_pci_core_device *nvdev,
+ u64 memphys, u64 memlength)
+{
+ int ret = 0;
+ u64 resmem_size = 0;
+
+ /*
+ * On Grace Hopper systems, the VM GPU device driver needs a non-cacheable
+ * region to support the MIG feature owing to a hardware bug. Since the
+ * device memory is mapped as NORMAL cached, carve out a region from the end
+ * with a different NORMAL_NC property (called reserved memory and
+ * represented as resmem). This region is then exposed as a 64b BAR
+ * (regions 2 and 3) to the VM, while the rest (termed usable memory and
+ * represented as usemem) is exposed as a cacheable 64b BAR (regions 4 and 5).
+ *
+ *                  devmem (memlength)
+ * |-------------------------------------------------|
+ * |                                           |
+ * usemem.memphys                              resmem.memphys
+ *
+ * This hardware bug is fixed on the Grace Blackwell platforms and the
+ * presence of the bug can be determined through nvdev->has_mig_hw_bug.
+ * Thus on systems with the hardware fix, there is no need to partition
+ * the GPU device memory and the entire memory is usable and mapped as
+ * NORMAL cached (i.e. resmem size is 0).
+ */
+ if (nvdev->has_mig_hw_bug)
+ resmem_size = SZ_1G;
+
+ nvdev->usemem.memphys = memphys;
+
+ /*
+ * The device memory exposed to the VM is added to the kernel by the
+ * VM driver module in chunks of memory block size. Note that only the
+ * usable memory (usemem) is added to the kernel for usage by the VM
+ * workloads.
+ */
+ if (check_sub_overflow(memlength, resmem_size,
+ &nvdev->usemem.memlength)) {
+ ret = -EOVERFLOW;
+ goto done;
+ }
+
+ /*
+ * The usemem region is exposed as a 64b BAR composed of regions 4 and 5.
+ * Calculate and save the BAR size for the region.
+ */
+ nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
+
+ /*
+ * If the hardware has the fix for MIG, there is no requirement
+ * for splitting the device memory to create RESMEM. The entire
+ * device memory is usable and will be USEMEM. Return here for
+ * such case.
+ */
+ if (!nvdev->has_mig_hw_bug)
+ goto done;
+
+ /*
+ * When the device memory is split to work around the MIG bug on
+ * Grace Hopper, the USEMEM part of the device memory has to be
+ * MEMBLK_SIZE aligned. This is a hardwired ABI value between the
+ * GPU FW and the VFIO driver. The VM device driver is also aware of
+ * it and uses the value in its calculation of the USEMEM size.
+ * Note that the device memory may not be 512M aligned.
+ */
+ nvdev->usemem.memlength = round_down(nvdev->usemem.memlength,
+ MEMBLK_SIZE);
+ if (nvdev->usemem.memlength == 0) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ if ((check_add_overflow(nvdev->usemem.memphys,
+ nvdev->usemem.memlength,
+ &nvdev->resmem.memphys)) ||
+ (check_sub_overflow(memlength, nvdev->usemem.memlength,
+ &nvdev->resmem.memlength))) {
+ ret = -EOVERFLOW;
+ goto done;
+ }
+
+ /*
+ * The resmem region is exposed as a 64b BAR composed of regions 2 and 3
+ * for Grace Hopper. Calculate and save the BAR size for the region.
+ */
+ nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
+done:
+ return ret;
+}
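
A worked example of the split computed above, using a hypothetical device memory size (real GH200 sizes differ): carve 1 GB of resmem from the end, round usemem down to the 512 MB memory block size, and report each region as the next power-of-2 BAR. Illustrative only, not part of this patch.

    /* Illustrative sketch (not part of this patch): the usemem/resmem split for
     * a hypothetical device memory size (real GH200 sizes differ).
     */
    #include <stdio.h>
    #include <stdint.h>

    #define SZ_512M (512ULL << 20)
    #define SZ_1G   (1ULL << 30)

    static uint64_t roundup_pow_of_two64(uint64_t x)
    {
        uint64_t p = 1;

        while (p < x)
            p <<= 1;
        return p;
    }

    int main(void)
    {
        uint64_t memlength = 95 * SZ_1G + (300ULL << 20);   /* hypothetical */
        uint64_t usemem, resmem;

        usemem = memlength - SZ_1G;         /* carve resmem out of the end */
        usemem &= ~(SZ_512M - 1);           /* round down to MEMBLK_SIZE */
        resmem = memlength - usemem;

        printf("usemem %llu MB (BAR %llu GB), resmem %llu MB (BAR %llu GB)\n",
               (unsigned long long)(usemem >> 20),
               (unsigned long long)(roundup_pow_of_two64(usemem) >> 30),
               (unsigned long long)(resmem >> 20),
               (unsigned long long)(roundup_pow_of_two64(resmem) >> 30));
        return 0;   /* usemem 96256 MB (BAR 128 GB), resmem 1324 MB (BAR 2 GB) */
    }
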
+
+static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
+{
+ int pcie_dvsec;
+ u16 dvsec_ctrl16;
+
+ pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA,
+ GPU_CAP_DVSEC_REGISTER);
+
+ if (pcie_dvsec) {
+ pci_read_config_word(pdev,
+ pcie_dvsec + DVSEC_BITMAP_OFFSET,
+ &dvsec_ctrl16);
+
+ if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM)
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * To reduce the system bootup time, the HBM training has
+ * been moved out of the UEFI on the Grace-Blackwell systems.
+ *
+ * The onus of checking whether the HBM training has completed
+ * thus falls on the module. The HBM training status can be
+ * determined from a BAR0 register.
+ *
+ * Similarly, another BAR0 register exposes the status of the
+ * CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
+ *
+ * Poll these registers for up to 30s. If the HBM training is
+ * not complete or if the C2C link is not ready, fail the probe.
+ *
+ * While the wait is not required on Grace Hopper systems, it
+ * is beneficial to make the check to ensure the device is in an
+ * expected state.
+ *
+ * Ensure that the BAR0 region is enabled before accessing the
+ * registers.
+ */
+static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev)
+{
+ unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
+ void __iomem *io;
+ int ret = -ETIME;
+
+ ret = pci_enable_device(pdev);
+ if (ret)
+ return ret;
+
+ ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME);
+ if (ret)
+ goto request_region_exit;
+
+ io = pci_iomap(pdev, 0, 0);
+ if (!io) {
+ ret = -ENOMEM;
+ goto iomap_exit;
+ }
+
+ do {
+ if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
+ (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) {
+ ret = 0;
+ goto reg_check_exit;
+ }
+ msleep(POLL_QUANTUM_MS);
+ } while (!time_after(jiffies, timeout));
+
+reg_check_exit:
+ pci_iounmap(pdev, io);
+iomap_exit:
+ pci_release_selected_regions(pdev, 1 << 0);
+request_region_exit:
+ pci_disable_device(pdev);
+ return ret;
+}
+
+static int nvgrace_gpu_probe(struct pci_dev *pdev,
+ const struct pci_device_id *id)
+{
+ const struct vfio_device_ops *ops = &nvgrace_gpu_pci_core_ops;
+ struct nvgrace_gpu_pci_core_device *nvdev;
+ u64 memphys, memlength;
+ int ret;
+
+ ret = nvgrace_gpu_wait_device_ready(pdev);
+ if (ret)
+ return ret;
+
+ ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
+ if (!ret)
+ ops = &nvgrace_gpu_pci_ops;
+
+ nvdev = vfio_alloc_device(nvgrace_gpu_pci_core_device, core_device.vdev,
+ &pdev->dev, ops);
+ if (IS_ERR(nvdev))
+ return PTR_ERR(nvdev);
+
+ dev_set_drvdata(&pdev->dev, &nvdev->core_device);
+
+ if (ops == &nvgrace_gpu_pci_ops) {
+ nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev);
+
+ /*
+ * Device memory properties are identified in the host ACPI
+ * table. Set the nvgrace_gpu_pci_core_device structure.
+ */
+ ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev,
+ memphys, memlength);
+ if (ret)
+ goto out_put_vdev;
+ }
+
+ ret = vfio_pci_core_register_device(&nvdev->core_device);
+ if (ret)
+ goto out_put_vdev;
+
+ return ret;
+
+out_put_vdev:
+ vfio_put_device(&nvdev->core_device.vdev);
+ return ret;
+}
+
+static void nvgrace_gpu_remove(struct pci_dev *pdev)
+{
+ struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
+
+ vfio_pci_core_unregister_device(core_device);
+ vfio_put_device(&core_device->vdev);
+}
+
+static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
+ /* GH200 120GB */
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) },
+ /* GH200 480GB */
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) },
+ /* GH200 SKU */
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) },
+ /* GB200 SKU */
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) },
+ {}
+};
+
+MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table);
+
+static struct pci_driver nvgrace_gpu_vfio_pci_driver = {
+ .name = KBUILD_MODNAME,
+ .id_table = nvgrace_gpu_vfio_pci_table,
+ .probe = nvgrace_gpu_probe,
+ .remove = nvgrace_gpu_remove,
+ .err_handler = &vfio_pci_core_err_handlers,
+ .driver_managed_dma = true,
+};
+
+module_pci_driver(nvgrace_gpu_vfio_pci_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ankit Agrawal <ankita@nvidia.com>");
+MODULE_AUTHOR("Aniket Agashe <aniketa@nvidia.com>");
+MODULE_DESCRIPTION("VFIO NVGRACE GPU PF - User Level driver for NVIDIA devices with CPU coherently accessible device memory");
diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c
index 8ddf4346fcd5..c51f5e4c3dd6 100644
--- a/drivers/vfio/pci/pds/dirty.c
+++ b/drivers/vfio/pci/pds/dirty.c
@@ -3,6 +3,7 @@
#include <linux/interval_tree.h>
#include <linux/vfio.h>
+#include <linux/vmalloc.h>
#include <linux/pds/pds_common.h>
#include <linux/pds/pds_core_if.h>
@@ -607,7 +608,7 @@ int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
mutex_lock(&pds_vfio->state_mutex);
err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length);
- pds_vfio_state_mutex_unlock(pds_vfio);
+ mutex_unlock(&pds_vfio->state_mutex);
return err;
}
@@ -624,7 +625,7 @@ int pds_vfio_dma_logging_start(struct vfio_device *vdev,
mutex_lock(&pds_vfio->state_mutex);
pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS);
err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size);
- pds_vfio_state_mutex_unlock(pds_vfio);
+ mutex_unlock(&pds_vfio->state_mutex);
return err;
}
@@ -637,7 +638,7 @@ int pds_vfio_dma_logging_stop(struct vfio_device *vdev)
mutex_lock(&pds_vfio->state_mutex);
pds_vfio_dirty_disable(pds_vfio, true);
- pds_vfio_state_mutex_unlock(pds_vfio);
+ mutex_unlock(&pds_vfio->state_mutex);
return 0;
}
diff --git a/drivers/vfio/pci/pds/lm.c b/drivers/vfio/pci/pds/lm.c
index 79fe2e66bb49..f2673d395236 100644
--- a/drivers/vfio/pci/pds/lm.c
+++ b/drivers/vfio/pci/pds/lm.c
@@ -92,8 +92,10 @@ static void pds_vfio_put_lm_file(struct pds_vfio_lm_file *lm_file)
{
mutex_lock(&lm_file->lock);
+ lm_file->disabled = true;
lm_file->size = 0;
lm_file->alloc_size = 0;
+ lm_file->filep->f_pos = 0;
/* Free scatter list of file pages */
sg_free_table(&lm_file->sg_table);
@@ -183,6 +185,12 @@ static ssize_t pds_vfio_save_read(struct file *filp, char __user *buf,
pos = &filp->f_pos;
mutex_lock(&lm_file->lock);
+
+ if (lm_file->disabled) {
+ done = -ENODEV;
+ goto out_unlock;
+ }
+
if (*pos > lm_file->size) {
done = -EINVAL;
goto out_unlock;
@@ -227,7 +235,6 @@ static const struct file_operations pds_vfio_save_fops = {
.owner = THIS_MODULE,
.read = pds_vfio_save_read,
.release = pds_vfio_release_file,
- .llseek = no_llseek,
};
static int pds_vfio_get_save_file(struct pds_vfio_pci_device *pds_vfio)
@@ -283,6 +290,11 @@ static ssize_t pds_vfio_restore_write(struct file *filp, const char __user *buf,
mutex_lock(&lm_file->lock);
+ if (lm_file->disabled) {
+ done = -ENODEV;
+ goto out_unlock;
+ }
+
while (len) {
size_t page_offset;
struct page *page;
@@ -321,7 +333,6 @@ static const struct file_operations pds_vfio_restore_fops = {
.owner = THIS_MODULE,
.write = pds_vfio_restore_write,
.release = pds_vfio_release_file,
- .llseek = no_llseek,
};
static int pds_vfio_get_restore_file(struct pds_vfio_pci_device *pds_vfio)
diff --git a/drivers/vfio/pci/pds/lm.h b/drivers/vfio/pci/pds/lm.h
index 13be893198b7..9511b1afc6a1 100644
--- a/drivers/vfio/pci/pds/lm.h
+++ b/drivers/vfio/pci/pds/lm.h
@@ -27,6 +27,7 @@ struct pds_vfio_lm_file {
struct scatterlist *last_offset_sg; /* Iterator */
unsigned int sg_last_entry;
unsigned long last_offset;
+ bool disabled;
};
struct pds_vfio_pci_device;
diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c
index a34dda516629..4923f1823126 100644
--- a/drivers/vfio/pci/pds/pci_drv.c
+++ b/drivers/vfio/pci/pds/pci_drv.c
@@ -21,16 +21,13 @@
static void pds_vfio_recovery(struct pds_vfio_pci_device *pds_vfio)
{
- bool deferred_reset_needed = false;
-
/*
* Documentation states that the kernel migration driver must not
* generate asynchronous device state transitions outside of
* manipulation by the user or the VFIO_DEVICE_RESET ioctl.
*
* Since recovery is an asynchronous event received from the device,
- * initiate a deferred reset. Issue a deferred reset in the following
- * situations:
+ * initiate a reset in the following situations:
* 1. Migration is in progress, which will cause the next step of
* the migration to fail.
* 2. If the device is in a state that will be set to
@@ -42,24 +39,8 @@ static void pds_vfio_recovery(struct pds_vfio_pci_device *pds_vfio)
pds_vfio->state != VFIO_DEVICE_STATE_ERROR) ||
(pds_vfio->state == VFIO_DEVICE_STATE_RUNNING &&
pds_vfio_dirty_is_enabled(pds_vfio)))
- deferred_reset_needed = true;
+ pds_vfio_reset(pds_vfio, VFIO_DEVICE_STATE_ERROR);
mutex_unlock(&pds_vfio->state_mutex);
-
- /*
- * On the next user initiated state transition, the device will
- * transition to the VFIO_DEVICE_STATE_ERROR. At this point it's the user's
- * responsibility to reset the device.
- *
- * If a VFIO_DEVICE_RESET is requested post recovery and before the next
- * state transition, then the deferred reset state will be set to
- * VFIO_DEVICE_STATE_RUNNING.
- */
- if (deferred_reset_needed) {
- mutex_lock(&pds_vfio->reset_mutex);
- pds_vfio->deferred_reset = true;
- pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_ERROR;
- mutex_unlock(&pds_vfio->reset_mutex);
- }
}
static int pds_vfio_pci_notify_handler(struct notifier_block *nb,
@@ -185,7 +166,9 @@ static void pds_vfio_pci_aer_reset_done(struct pci_dev *pdev)
{
struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev);
- pds_vfio_reset(pds_vfio);
+ mutex_lock(&pds_vfio->state_mutex);
+ pds_vfio_reset(pds_vfio, VFIO_DEVICE_STATE_RUNNING);
+ mutex_unlock(&pds_vfio->state_mutex);
}
static const struct pci_error_handlers pds_vfio_pci_err_handlers = {
@@ -204,7 +187,7 @@ static struct pci_driver pds_vfio_pci_driver = {
module_pci_driver(pds_vfio_pci_driver);
-MODULE_IMPORT_NS(IOMMUFD);
+MODULE_IMPORT_NS("IOMMUFD");
MODULE_DESCRIPTION(PDS_VFIO_DRV_DESCRIPTION);
MODULE_AUTHOR("Brett Creeley <brett.creeley@amd.com>");
MODULE_LICENSE("GPL");
diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c
index 4c351c59d05a..76a80ae7087b 100644
--- a/drivers/vfio/pci/pds/vfio_dev.c
+++ b/drivers/vfio/pci/pds/vfio_dev.c
@@ -26,37 +26,14 @@ struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev)
vfio_coredev);
}
-void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
+void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio,
+ enum vfio_device_mig_state state)
{
-again:
- mutex_lock(&pds_vfio->reset_mutex);
- if (pds_vfio->deferred_reset) {
- pds_vfio->deferred_reset = false;
- if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) {
- pds_vfio_put_restore_file(pds_vfio);
- pds_vfio_put_save_file(pds_vfio);
- pds_vfio_dirty_disable(pds_vfio, false);
- }
- pds_vfio->state = pds_vfio->deferred_reset_state;
- pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
- mutex_unlock(&pds_vfio->reset_mutex);
- goto again;
- }
- mutex_unlock(&pds_vfio->state_mutex);
- mutex_unlock(&pds_vfio->reset_mutex);
-}
-
-void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio)
-{
- mutex_lock(&pds_vfio->reset_mutex);
- pds_vfio->deferred_reset = true;
- pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
- if (!mutex_trylock(&pds_vfio->state_mutex)) {
- mutex_unlock(&pds_vfio->reset_mutex);
- return;
- }
- mutex_unlock(&pds_vfio->reset_mutex);
- pds_vfio_state_mutex_unlock(pds_vfio);
+ pds_vfio_put_restore_file(pds_vfio);
+ pds_vfio_put_save_file(pds_vfio);
+ if (state == VFIO_DEVICE_STATE_ERROR)
+ pds_vfio_dirty_disable(pds_vfio, false);
+ pds_vfio->state = state;
}
static struct file *
@@ -97,8 +74,7 @@ pds_vfio_set_device_state(struct vfio_device *vdev,
break;
}
}
- pds_vfio_state_mutex_unlock(pds_vfio);
- /* still waiting on a deferred_reset */
+ mutex_unlock(&pds_vfio->state_mutex);
if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR)
res = ERR_PTR(-EIO);
@@ -114,7 +90,7 @@ static int pds_vfio_get_device_state(struct vfio_device *vdev,
mutex_lock(&pds_vfio->state_mutex);
*current_state = pds_vfio->state;
- pds_vfio_state_mutex_unlock(pds_vfio);
+ mutex_unlock(&pds_vfio->state_mutex);
return 0;
}
@@ -156,7 +132,6 @@ static int pds_vfio_init_device(struct vfio_device *vdev)
pds_vfio->vf_id = vf_id;
mutex_init(&pds_vfio->state_mutex);
- mutex_init(&pds_vfio->reset_mutex);
vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
vdev->mig_ops = &pds_vfio_lm_ops;
@@ -178,7 +153,6 @@ static void pds_vfio_release_device(struct vfio_device *vdev)
vfio_coredev.vdev);
mutex_destroy(&pds_vfio->state_mutex);
- mutex_destroy(&pds_vfio->reset_mutex);
vfio_pci_core_release_dev(vdev);
}
@@ -194,7 +168,6 @@ static int pds_vfio_open_device(struct vfio_device *vdev)
return err;
pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
- pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
vfio_pci_core_finish_enable(&pds_vfio->vfio_coredev);
diff --git a/drivers/vfio/pci/pds/vfio_dev.h b/drivers/vfio/pci/pds/vfio_dev.h
index e7b01080a1ec..803d99d69c73 100644
--- a/drivers/vfio/pci/pds/vfio_dev.h
+++ b/drivers/vfio/pci/pds/vfio_dev.h
@@ -18,20 +18,16 @@ struct pds_vfio_pci_device {
struct pds_vfio_dirty dirty;
struct mutex state_mutex; /* protect migration state */
enum vfio_device_mig_state state;
- struct mutex reset_mutex; /* protect reset_done flow */
- u8 deferred_reset;
- enum vfio_device_mig_state deferred_reset_state;
struct notifier_block nb;
int vf_id;
u16 client_id;
};
-void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio);
-
const struct vfio_device_ops *pds_vfio_ops_info(void);
struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev);
-void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio,
+ enum vfio_device_mig_state state);
struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio);
struct device *pds_vfio_to_dev(struct pds_vfio_pci_device *pds_vfio);
diff --git a/drivers/vfio/pci/qat/Kconfig b/drivers/vfio/pci/qat/Kconfig
new file mode 100644
index 000000000000..bf52cfa4b595
--- /dev/null
+++ b/drivers/vfio/pci/qat/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config QAT_VFIO_PCI
+ tristate "VFIO support for QAT VF PCI devices"
+ select VFIO_PCI_CORE
+ depends on CRYPTO_DEV_QAT_4XXX
+ help
+ This provides migration support for Intel(R) QAT Virtual Functions
+ using the VFIO framework.
+
+ To compile this as a module, choose M here: the module
+ will be called qat_vfio_pci. If you don't know what to do here,
+ say N.
diff --git a/drivers/vfio/pci/qat/Makefile b/drivers/vfio/pci/qat/Makefile
new file mode 100644
index 000000000000..5fe5c4ec19d3
--- /dev/null
+++ b/drivers/vfio/pci/qat/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_QAT_VFIO_PCI) += qat_vfio_pci.o
+qat_vfio_pci-y := main.o
diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c
new file mode 100644
index 000000000000..845ed15b6771
--- /dev/null
+++ b/drivers/vfio/pci/qat/main.c
@@ -0,0 +1,700 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2024 Intel Corporation */
+
+#include <linux/anon_inodes.h>
+#include <linux/container_of.h>
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/sizes.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/qat/qat_mig_dev.h>
+
+/*
+ * The migration data of each Intel QAT VF device is encapsulated into a
+ * 4096 byte block. The data consists of two parts.
+ * The first is a pre-configured set of attributes of the VF being migrated,
+ * which are only set when it is created. This part can be migrated during the
+ * pre-copy stage and is used for a device compatibility check.
+ * The second is the VF state. This includes the required MMIO regions and
+ * the shadow states maintained by the QAT PF driver. This part can only be
+ * saved when the VF is fully quiesced and is migrated during the stop-copy
+ * stage.
+ * Both parts of the data are saved in hierarchical structures consisting of
+ * a preamble section and several raw state sections.
+ * When the pre-configured part of the migration data has been fully retrieved
+ * from user space, the preamble section is used to validate the correctness of
+ * the data blocks and check the version compatibility. The raw state sections
+ * are then used to do a device compatibility check.
+ * When the device transitions out of the RESUMING state, the VF states are
+ * extracted from the raw state sections of the VF state part of the migration
+ * data and then loaded into the device.
+ */
+
+struct qat_vf_migration_file {
+ struct file *filp;
+ /* protects migration region context */
+ struct mutex lock;
+ bool disabled;
+ struct qat_vf_core_device *qat_vdev;
+ ssize_t filled_size;
+};
+
+struct qat_vf_core_device {
+ struct vfio_pci_core_device core_device;
+ struct qat_mig_dev *mdev;
+ /* protects migration state */
+ struct mutex state_mutex;
+ enum vfio_device_mig_state mig_state;
+ struct qat_vf_migration_file *resuming_migf;
+ struct qat_vf_migration_file *saving_migf;
+};
+
+static int qat_vf_pci_open_device(struct vfio_device *core_vdev)
+{
+ struct qat_vf_core_device *qat_vdev =
+ container_of(core_vdev, struct qat_vf_core_device,
+ core_device.vdev);
+ struct vfio_pci_core_device *vdev = &qat_vdev->core_device;
+ int ret;
+
+ ret = vfio_pci_core_enable(vdev);
+ if (ret)
+ return ret;
+
+ ret = qat_vfmig_open(qat_vdev->mdev);
+ if (ret) {
+ vfio_pci_core_disable(vdev);
+ return ret;
+ }
+ qat_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+
+ vfio_pci_core_finish_enable(vdev);
+
+ return 0;
+}
+
+static void qat_vf_disable_fd(struct qat_vf_migration_file *migf)
+{
+ mutex_lock(&migf->lock);
+ migf->disabled = true;
+ migf->filp->f_pos = 0;
+ migf->filled_size = 0;
+ mutex_unlock(&migf->lock);
+}
+
+static void qat_vf_disable_fds(struct qat_vf_core_device *qat_vdev)
+{
+ if (qat_vdev->resuming_migf) {
+ qat_vf_disable_fd(qat_vdev->resuming_migf);
+ fput(qat_vdev->resuming_migf->filp);
+ qat_vdev->resuming_migf = NULL;
+ }
+
+ if (qat_vdev->saving_migf) {
+ qat_vf_disable_fd(qat_vdev->saving_migf);
+ fput(qat_vdev->saving_migf->filp);
+ qat_vdev->saving_migf = NULL;
+ }
+}
+
+static void qat_vf_pci_close_device(struct vfio_device *core_vdev)
+{
+ struct qat_vf_core_device *qat_vdev = container_of(core_vdev,
+ struct qat_vf_core_device, core_device.vdev);
+
+ qat_vfmig_close(qat_vdev->mdev);
+ qat_vf_disable_fds(qat_vdev);
+ vfio_pci_core_close_device(core_vdev);
+}
+
+static long qat_vf_precopy_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct qat_vf_migration_file *migf = filp->private_data;
+ struct qat_vf_core_device *qat_vdev = migf->qat_vdev;
+ struct qat_mig_dev *mig_dev = qat_vdev->mdev;
+ struct vfio_precopy_info info;
+ loff_t *pos = &filp->f_pos;
+ unsigned long minsz;
+ int ret = 0;
+
+ if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
+ return -ENOTTY;
+
+ minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ mutex_lock(&qat_vdev->state_mutex);
+ if (qat_vdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
+ qat_vdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
+ mutex_unlock(&qat_vdev->state_mutex);
+ return -EINVAL;
+ }
+
+ mutex_lock(&migf->lock);
+ if (migf->disabled) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (*pos > mig_dev->setup_size) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ info.dirty_bytes = 0;
+ info.initial_bytes = mig_dev->setup_size - *pos;
+
+out:
+ mutex_unlock(&migf->lock);
+ mutex_unlock(&qat_vdev->state_mutex);
+ if (ret)
+ return ret;
+ return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
+}
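
For reference, the precopy ioctl above is consumed by userspace roughly as in the following sketch: query initial_bytes with VFIO_MIG_GET_PRECOPY_INFO and then read() that much setup data from the saving fd. The function name and buffer size are illustrative and not part of this patch.

    /* Illustrative userspace sketch (not part of this patch): drain the
     * pre-copy setup data from the saving fd returned when entering PRE_COPY.
     * The buffer size is arbitrary.
     */
    #include <linux/vfio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <string.h>

    static int drain_precopy(int saving_fd)
    {
        struct vfio_precopy_info info;
        char buf[4096];

        memset(&info, 0, sizeof(info));
        info.argsz = sizeof(info);

        if (ioctl(saving_fd, VFIO_MIG_GET_PRECOPY_INFO, &info))
            return -1;

        /* initial_bytes is the setup data still readable at this offset. */
        while (info.initial_bytes) {
            ssize_t n = read(saving_fd, buf,
                             info.initial_bytes < sizeof(buf) ?
                             info.initial_bytes : sizeof(buf));

            if (n <= 0)
                return -1;
            info.initial_bytes -= n;
            /* ... forward the n bytes to the migration destination ... */
        }
        return 0;
    }
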
+
+static ssize_t qat_vf_save_read(struct file *filp, char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct qat_vf_migration_file *migf = filp->private_data;
+ struct qat_mig_dev *mig_dev = migf->qat_vdev->mdev;
+ ssize_t done = 0;
+ loff_t *offs;
+ int ret;
+
+ if (pos)
+ return -ESPIPE;
+ offs = &filp->f_pos;
+
+ mutex_lock(&migf->lock);
+ if (*offs > migf->filled_size || *offs < 0) {
+ done = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (migf->disabled) {
+ done = -ENODEV;
+ goto out_unlock;
+ }
+
+ len = min_t(size_t, migf->filled_size - *offs, len);
+ if (len) {
+ ret = copy_to_user(buf, mig_dev->state + *offs, len);
+ if (ret) {
+ done = -EFAULT;
+ goto out_unlock;
+ }
+ *offs += len;
+ done = len;
+ }
+
+out_unlock:
+ mutex_unlock(&migf->lock);
+ return done;
+}
+
+static int qat_vf_release_file(struct inode *inode, struct file *filp)
+{
+ struct qat_vf_migration_file *migf = filp->private_data;
+
+ qat_vf_disable_fd(migf);
+ mutex_destroy(&migf->lock);
+ kfree(migf);
+
+ return 0;
+}
+
+static const struct file_operations qat_vf_save_fops = {
+ .owner = THIS_MODULE,
+ .read = qat_vf_save_read,
+ .unlocked_ioctl = qat_vf_precopy_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+ .release = qat_vf_release_file,
+};
+
+static int qat_vf_save_state(struct qat_vf_core_device *qat_vdev,
+ struct qat_vf_migration_file *migf)
+{
+ int ret;
+
+ ret = qat_vfmig_save_state(qat_vdev->mdev);
+ if (ret)
+ return ret;
+ migf->filled_size = qat_vdev->mdev->state_size;
+
+ return 0;
+}
+
+static int qat_vf_save_setup(struct qat_vf_core_device *qat_vdev,
+ struct qat_vf_migration_file *migf)
+{
+ int ret;
+
+ ret = qat_vfmig_save_setup(qat_vdev->mdev);
+ if (ret)
+ return ret;
+ migf->filled_size = qat_vdev->mdev->setup_size;
+
+ return 0;
+}
+
+/*
+ * Allocate a file handle for user space and then save the migration data for
+ * the device being migrated. If this is called in the pre-copy stage, save the
+ * pre-configured device data. Otherwise, if this is called in the stop-copy
+ * stage, save the device state. In both cases, update the data size which can
+ * then be read from user space.
+ */
+static struct qat_vf_migration_file *
+qat_vf_save_device_data(struct qat_vf_core_device *qat_vdev, bool pre_copy)
+{
+ struct qat_vf_migration_file *migf;
+ int ret;
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL);
+ if (!migf)
+ return ERR_PTR(-ENOMEM);
+
+ migf->filp = anon_inode_getfile("qat_vf_mig", &qat_vf_save_fops,
+ migf, O_RDONLY);
+ ret = PTR_ERR_OR_ZERO(migf->filp);
+ if (ret) {
+ kfree(migf);
+ return ERR_PTR(ret);
+ }
+
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+
+ if (pre_copy)
+ ret = qat_vf_save_setup(qat_vdev, migf);
+ else
+ ret = qat_vf_save_state(qat_vdev, migf);
+ if (ret) {
+ fput(migf->filp);
+ return ERR_PTR(ret);
+ }
+
+ migf->qat_vdev = qat_vdev;
+
+ return migf;
+}
+
+static ssize_t qat_vf_resume_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct qat_vf_migration_file *migf = filp->private_data;
+ struct qat_mig_dev *mig_dev = migf->qat_vdev->mdev;
+ loff_t end, *offs;
+ ssize_t done = 0;
+ int ret;
+
+ if (pos)
+ return -ESPIPE;
+ offs = &filp->f_pos;
+
+ if (*offs < 0 ||
+ check_add_overflow(len, *offs, &end))
+ return -EOVERFLOW;
+
+ if (end > mig_dev->state_size)
+ return -ENOMEM;
+
+ mutex_lock(&migf->lock);
+ if (migf->disabled) {
+ done = -ENODEV;
+ goto out_unlock;
+ }
+
+ ret = copy_from_user(mig_dev->state + *offs, buf, len);
+ if (ret) {
+ done = -EFAULT;
+ goto out_unlock;
+ }
+ *offs += len;
+ migf->filled_size += len;
+
+ /*
+ * Load the pre-configured device data first to check if the target
+ * device is compatible with the source device.
+ */
+ ret = qat_vfmig_load_setup(mig_dev, migf->filled_size);
+ if (ret && ret != -EAGAIN) {
+ done = ret;
+ goto out_unlock;
+ }
+ done = len;
+
+out_unlock:
+ mutex_unlock(&migf->lock);
+ return done;
+}
+
+static const struct file_operations qat_vf_resume_fops = {
+ .owner = THIS_MODULE,
+ .write = qat_vf_resume_write,
+ .release = qat_vf_release_file,
+};
+
+static struct qat_vf_migration_file *
+qat_vf_resume_device_data(struct qat_vf_core_device *qat_vdev)
+{
+ struct qat_vf_migration_file *migf;
+ int ret;
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL);
+ if (!migf)
+ return ERR_PTR(-ENOMEM);
+
+ migf->filp = anon_inode_getfile("qat_vf_mig", &qat_vf_resume_fops, migf, O_WRONLY);
+ ret = PTR_ERR_OR_ZERO(migf->filp);
+ if (ret) {
+ kfree(migf);
+ return ERR_PTR(ret);
+ }
+
+ migf->qat_vdev = qat_vdev;
+ migf->filled_size = 0;
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+
+ return migf;
+}
+
+static int qat_vf_load_device_data(struct qat_vf_core_device *qat_vdev)
+{
+ return qat_vfmig_load_state(qat_vdev->mdev);
+}
+
+static struct file *qat_vf_pci_step_device_state(struct qat_vf_core_device *qat_vdev, u32 new)
+{
+ u32 cur = qat_vdev->mig_state;
+ int ret;
+
+ /*
+ * As the device is not capable of just stopping P2P DMAs, suspend the
+ * device completely once any of the P2P states is reached.
+ * While suspended, all of its MMIO registers can still be operated on
+ * correctly; jobs submitted through the rings are queued but not
+ * processed by the device. The MMIO state can be safely migrated to
+ * the target VF during the stop-copy stage and restored correctly on
+ * the target VF, after which all queued jobs can be resumed.
+ */
+ if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+ ret = qat_vfmig_suspend(qat_vdev->mdev);
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
+ qat_vfmig_resume(qat_vdev->mdev);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) ||
+ (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P))
+ return NULL;
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ struct qat_vf_migration_file *migf;
+
+ migf = qat_vf_save_device_data(qat_vdev, false);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ qat_vdev->saving_migf = migf;
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
+ struct qat_vf_migration_file *migf;
+
+ migf = qat_vf_resume_device_data(qat_vdev);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ qat_vdev->resuming_migf = migf;
+ return migf->filp;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
+ qat_vf_disable_fds(qat_vdev);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
+ (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+ struct qat_vf_migration_file *migf;
+
+ migf = qat_vf_save_device_data(qat_vdev, true);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ qat_vdev->saving_migf = migf;
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ struct qat_vf_migration_file *migf = qat_vdev->saving_migf;
+
+ if (!migf)
+ return ERR_PTR(-EINVAL);
+ ret = qat_vf_save_state(qat_vdev, migf);
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
+ ret = qat_vf_load_device_data(qat_vdev);
+ if (ret)
+ return ERR_PTR(ret);
+
+ qat_vf_disable_fds(qat_vdev);
+ return NULL;
+ }
+
+ /* vfio_mig_get_next_state() does not use arcs other than the above */
+ WARN_ON(true);
+ return ERR_PTR(-EINVAL);
+}
+
+static void qat_vf_reset_done(struct qat_vf_core_device *qat_vdev)
+{
+ qat_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+ qat_vfmig_reset(qat_vdev->mdev);
+ qat_vf_disable_fds(qat_vdev);
+}
+
+static struct file *qat_vf_pci_set_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state new_state)
+{
+ struct qat_vf_core_device *qat_vdev = container_of(vdev,
+ struct qat_vf_core_device, core_device.vdev);
+ enum vfio_device_mig_state next_state;
+ struct file *res = NULL;
+ int ret;
+
+ mutex_lock(&qat_vdev->state_mutex);
+ while (new_state != qat_vdev->mig_state) {
+ ret = vfio_mig_get_next_state(vdev, qat_vdev->mig_state,
+ new_state, &next_state);
+ if (ret) {
+ res = ERR_PTR(ret);
+ break;
+ }
+ res = qat_vf_pci_step_device_state(qat_vdev, next_state);
+ if (IS_ERR(res))
+ break;
+ qat_vdev->mig_state = next_state;
+ if (WARN_ON(res && new_state != qat_vdev->mig_state)) {
+ fput(res);
+ res = ERR_PTR(-EINVAL);
+ break;
+ }
+ }
+ mutex_unlock(&qat_vdev->state_mutex);
+
+ return res;
+}
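
For context, the loop above only walks the individual arcs computed by vfio_mig_get_next_state(); userspace requests the final state in one shot through the VFIO_DEVICE_FEATURE ioctl and, for states that open a data transfer session (STOP_COPY, PRE_COPY, RESUMING), receives the migration data fd that backs migf->filp above. A minimal userspace-side sketch under that assumption; the helper name and the omitted error handling are illustrative, not part of this patch:

/* Sketch: move an opened VFIO device fd to new_state and collect the
 * migration data fd (or -1 when no data transfer session is opened). */
#include <linux/vfio.h>
#include <string.h>
#include <sys/ioctl.h>

static int set_mig_state(int device_fd, __u32 new_state, int *data_fd)
{
	__u8 buf[sizeof(struct vfio_device_feature) +
		 sizeof(struct vfio_device_feature_mig_state)]
		 __attribute__((aligned(8))) = {};
	struct vfio_device_feature *feature = (void *)buf;
	struct vfio_device_feature_mig_state *mig = (void *)feature->data;

	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
	mig->device_state = new_state;	/* e.g. VFIO_DEVICE_STATE_STOP_COPY */

	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
		return -1;

	*data_fd = mig->data_fd;	/* -1 if no data transfer session */
	return 0;
}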
+
+static int qat_vf_pci_get_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state *curr_state)
+{
+ struct qat_vf_core_device *qat_vdev = container_of(vdev,
+ struct qat_vf_core_device, core_device.vdev);
+
+ mutex_lock(&qat_vdev->state_mutex);
+ *curr_state = qat_vdev->mig_state;
+ mutex_unlock(&qat_vdev->state_mutex);
+
+ return 0;
+}
+
+static int qat_vf_pci_get_data_size(struct vfio_device *vdev,
+ unsigned long *stop_copy_length)
+{
+ struct qat_vf_core_device *qat_vdev = container_of(vdev,
+ struct qat_vf_core_device, core_device.vdev);
+
+ mutex_lock(&qat_vdev->state_mutex);
+ *stop_copy_length = qat_vdev->mdev->state_size;
+ mutex_unlock(&qat_vdev->state_mutex);
+
+ return 0;
+}
+
+static const struct vfio_migration_ops qat_vf_pci_mig_ops = {
+ .migration_set_state = qat_vf_pci_set_device_state,
+ .migration_get_state = qat_vf_pci_get_device_state,
+ .migration_get_data_size = qat_vf_pci_get_data_size,
+};
+
+static void qat_vf_pci_release_dev(struct vfio_device *core_vdev)
+{
+ struct qat_vf_core_device *qat_vdev = container_of(core_vdev,
+ struct qat_vf_core_device, core_device.vdev);
+
+ qat_vfmig_cleanup(qat_vdev->mdev);
+ qat_vfmig_destroy(qat_vdev->mdev);
+ mutex_destroy(&qat_vdev->state_mutex);
+ vfio_pci_core_release_dev(core_vdev);
+}
+
+static int qat_vf_pci_init_dev(struct vfio_device *core_vdev)
+{
+ struct qat_vf_core_device *qat_vdev = container_of(core_vdev,
+ struct qat_vf_core_device, core_device.vdev);
+ struct qat_mig_dev *mdev;
+ struct pci_dev *parent;
+ int ret, vf_id;
+
+ core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P |
+ VFIO_MIGRATION_PRE_COPY;
+ core_vdev->mig_ops = &qat_vf_pci_mig_ops;
+
+ ret = vfio_pci_core_init_dev(core_vdev);
+ if (ret)
+ return ret;
+
+ mutex_init(&qat_vdev->state_mutex);
+
+ parent = pci_physfn(qat_vdev->core_device.pdev);
+ vf_id = pci_iov_vf_id(qat_vdev->core_device.pdev);
+ if (vf_id < 0) {
+ ret = -ENODEV;
+ goto err_rel;
+ }
+
+ mdev = qat_vfmig_create(parent, vf_id);
+ if (IS_ERR(mdev)) {
+ ret = PTR_ERR(mdev);
+ goto err_rel;
+ }
+
+ ret = qat_vfmig_init(mdev);
+ if (ret)
+ goto err_destroy;
+
+ qat_vdev->mdev = mdev;
+
+ return 0;
+
+err_destroy:
+ qat_vfmig_destroy(mdev);
+err_rel:
+ vfio_pci_core_release_dev(core_vdev);
+ return ret;
+}
+
+static const struct vfio_device_ops qat_vf_pci_ops = {
+ .name = "qat-vf-vfio-pci",
+ .init = qat_vf_pci_init_dev,
+ .release = qat_vf_pci_release_dev,
+ .open_device = qat_vf_pci_open_device,
+ .close_device = qat_vf_pci_close_device,
+ .ioctl = vfio_pci_core_ioctl,
+ .read = vfio_pci_core_read,
+ .write = vfio_pci_core_write,
+ .mmap = vfio_pci_core_mmap,
+ .request = vfio_pci_core_request,
+ .match = vfio_pci_core_match,
+ .bind_iommufd = vfio_iommufd_physical_bind,
+ .unbind_iommufd = vfio_iommufd_physical_unbind,
+ .attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
+};
+
+static struct qat_vf_core_device *qat_vf_drvdata(struct pci_dev *pdev)
+{
+ struct vfio_pci_core_device *core_device = pci_get_drvdata(pdev);
+
+ return container_of(core_device, struct qat_vf_core_device, core_device);
+}
+
+static void qat_vf_pci_aer_reset_done(struct pci_dev *pdev)
+{
+ struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev);
+
+ if (!qat_vdev->mdev)
+ return;
+
+ mutex_lock(&qat_vdev->state_mutex);
+ qat_vf_reset_done(qat_vdev);
+ mutex_unlock(&qat_vdev->state_mutex);
+}
+
+static int
+qat_vf_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ struct device *dev = &pdev->dev;
+ struct qat_vf_core_device *qat_vdev;
+ int ret;
+
+ qat_vdev = vfio_alloc_device(qat_vf_core_device, core_device.vdev, dev, &qat_vf_pci_ops);
+ if (IS_ERR(qat_vdev))
+ return PTR_ERR(qat_vdev);
+
+ pci_set_drvdata(pdev, &qat_vdev->core_device);
+ ret = vfio_pci_core_register_device(&qat_vdev->core_device);
+ if (ret)
+ goto out_put_device;
+
+ return 0;
+
+out_put_device:
+ vfio_put_device(&qat_vdev->core_device.vdev);
+ return ret;
+}
+
+static void qat_vf_vfio_pci_remove(struct pci_dev *pdev)
+{
+ struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev);
+
+ vfio_pci_core_unregister_device(&qat_vdev->core_device);
+ vfio_put_device(&qat_vdev->core_device.vdev);
+}
+
+static const struct pci_device_id qat_vf_vfio_pci_table[] = {
+ /* Intel QAT GEN4 4xxx VF device */
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4941) },
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4943) },
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4945) },
+ {}
+};
+MODULE_DEVICE_TABLE(pci, qat_vf_vfio_pci_table);
+
+static const struct pci_error_handlers qat_vf_err_handlers = {
+ .reset_done = qat_vf_pci_aer_reset_done,
+ .error_detected = vfio_pci_core_aer_err_detected,
+};
+
+static struct pci_driver qat_vf_vfio_pci_driver = {
+ .name = "qat_vfio_pci",
+ .id_table = qat_vf_vfio_pci_table,
+ .probe = qat_vf_vfio_pci_probe,
+ .remove = qat_vf_vfio_pci_remove,
+ .err_handler = &qat_vf_err_handlers,
+ .driver_managed_dma = true,
+};
+module_pci_driver(qat_vf_vfio_pci_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Xin Zeng <xin.zeng@intel.com>");
+MODULE_DESCRIPTION("QAT VFIO PCI - VFIO PCI driver with live migration support for Intel(R) QAT GEN4 device family");
+MODULE_IMPORT_NS("CRYPTO_QAT");
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index cb5b7f865d58..e727941f589d 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -71,6 +71,8 @@ static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev)
case PCI_DEVICE_ID_INTEL_QAT_C62X_VF:
case PCI_DEVICE_ID_INTEL_QAT_DH895XCC:
case PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF:
+ case PCI_DEVICE_ID_INTEL_DSA_SPR0:
+ case PCI_DEVICE_ID_INTEL_IAX_SPR0:
return true;
default:
return false;
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 7e2e62ab0869..94142581c98c 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -313,6 +313,10 @@ static int vfio_virt_config_read(struct vfio_pci_core_device *vdev, int pos,
return count;
}
+static struct perm_bits direct_ro_perms = {
+ .readfn = vfio_direct_config_read,
+};
+
/* Default capability regions to read-only, no-virtualization */
static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = {
[0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
@@ -507,13 +511,13 @@ static void vfio_bar_fixup(struct vfio_pci_core_device *vdev)
mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1);
mask |= PCI_ROM_ADDRESS_ENABLE;
*vbar &= cpu_to_le32((u32)mask);
- } else if (pdev->resource[PCI_ROM_RESOURCE].flags &
- IORESOURCE_ROM_SHADOW) {
- mask = ~(0x20000 - 1);
+ } else if (pdev->rom && pdev->romlen) {
+ mask = ~(roundup_pow_of_two(pdev->romlen) - 1);
mask |= PCI_ROM_ADDRESS_ENABLE;
*vbar &= cpu_to_le32((u32)mask);
- } else
+ } else {
*vbar = 0;
+ }
vdev->bardirty = false;
}
@@ -1385,11 +1389,12 @@ static int vfio_ext_cap_len(struct vfio_pci_core_device *vdev, u16 ecap, u16 epo
switch (ecap) {
case PCI_EXT_CAP_ID_VNDR:
- ret = pci_read_config_dword(pdev, epos + PCI_VSEC_HDR, &dword);
+ ret = pci_read_config_dword(pdev, epos + PCI_VNDR_HEADER,
+ &dword);
if (ret)
return pcibios_err_to_errno(ret);
- return dword >> PCI_VSEC_HDR_LEN_SHIFT;
+ return PCI_VNDR_HEADER_LEN(dword);
case PCI_EXT_CAP_ID_VC:
case PCI_EXT_CAP_ID_VC9:
case PCI_EXT_CAP_ID_MFVC:
@@ -1897,9 +1902,17 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_core_device *vdev, char __user
cap_start = *ppos;
} else {
if (*ppos >= PCI_CFG_SPACE_SIZE) {
- WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX);
+ /*
+ * We can get a cap_id that exceeds PCI_EXT_CAP_ID_MAX
+ * if we're hiding an unknown capability at the start
+ * of the extended capability list. Use default read-only
+ * access, which virtualizes the id and next values.
+ */
+ if (cap_id > PCI_EXT_CAP_ID_MAX)
+ perm = &direct_ro_perms;
+ else
+ perm = &ecap_perms[cap_id];
- perm = &ecap_perms[cap_id];
cap_start = vfio_find_cap_start(vdev, *ppos);
} else {
WARN_ON(cap_id > PCI_CAP_ID_MAX);
@@ -1966,3 +1979,45 @@ ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, char __user *buf,
return done;
}
+
+/**
+ * vfio_pci_core_range_intersect_range() - Determine overlap between a buffer
+ * and register offset ranges.
+ * @buf_start: start offset of the buffer
+ * @buf_cnt: number of buffer bytes
+ * @reg_start: start register offset
+ * @reg_cnt: number of register bytes
+ * @buf_offset: start offset of overlap in the buffer
+ * @intersect_count: number of overlapping bytes
+ * @register_offset: start offset of overlap in register
+ *
+ * Returns: true if there is overlap, false if not.
+ * The overlap start and size are returned through the function arguments.
+ */
+bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt,
+ loff_t reg_start, size_t reg_cnt,
+ loff_t *buf_offset,
+ size_t *intersect_count,
+ size_t *register_offset)
+{
+ if (buf_start <= reg_start &&
+ buf_start + buf_cnt > reg_start) {
+ *buf_offset = reg_start - buf_start;
+ *intersect_count = min_t(size_t, reg_cnt,
+ buf_start + buf_cnt - reg_start);
+ *register_offset = 0;
+ return true;
+ }
+
+ if (buf_start > reg_start &&
+ buf_start < reg_start + reg_cnt) {
+ *buf_offset = 0;
+ *intersect_count = min_t(size_t, buf_cnt,
+ reg_start + reg_cnt - buf_start);
+ *register_offset = buf_start - reg_start;
+ return true;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_range_intersect_range);
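
vfio_pci_core_range_intersect_range() is exported for variant drivers that overlay emulated config registers on top of vfio_pci_core_read()/write(); the virtio-vfio-pci legacy_io code later in this diff uses it exactly that way. A condensed kernel-style sketch of the pattern (the function name and the emulated ID value are illustrative, not part of this patch; it assumes the usual <linux/vfio_pci_core.h> and <linux/pci_regs.h> context):

/* Overlay an emulated 16-bit register on a read already served by
 * vfio_pci_core_read(). */
static ssize_t my_read_config(struct vfio_device *core_vdev,
			      char __user *buf, size_t count, loff_t *ppos)
{
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
	size_t copy_count, register_offset;
	loff_t copy_offset;
	__le16 val16;
	ssize_t ret;

	ret = vfio_pci_core_read(core_vdev, buf, count, ppos);
	if (ret < 0)
		return ret;

	/* Does the user buffer overlap PCI_DEVICE_ID (2 bytes at 0x02)? */
	if (vfio_pci_core_range_intersect_range(pos, count, PCI_DEVICE_ID,
						sizeof(val16), &copy_offset,
						&copy_count, &register_offset)) {
		val16 = cpu_to_le16(0x1234);	/* emulated device ID */
		if (copy_to_user(buf + copy_offset,
				 (void *)&val16 + register_offset, copy_count))
			return -EFAULT;
	}
	return count;
}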
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 1cbc990d42e0..586e49efb81b 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -20,6 +20,7 @@
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
+#include <linux/pfn_t.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/types.h>
@@ -57,11 +58,6 @@ struct vfio_pci_vf_token {
int users;
};
-struct vfio_pci_mmap_vma {
- struct vm_area_struct *vma;
- struct list_head vma_next;
-};
-
static inline bool vfio_vga_disabled(void)
{
#ifdef CONFIG_VFIO_PCI_VGA
@@ -778,25 +774,26 @@ static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
}
struct vfio_pci_fill_info {
- struct vfio_pci_dependent_device __user *devices;
- struct vfio_pci_dependent_device __user *devices_end;
struct vfio_device *vdev;
+ struct vfio_pci_dependent_device *devices;
+ int nr_devices;
u32 count;
u32 flags;
};
static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
{
- struct vfio_pci_dependent_device info = {
- .segment = pci_domain_nr(pdev->bus),
- .bus = pdev->bus->number,
- .devfn = pdev->devfn,
- };
+ struct vfio_pci_dependent_device *info;
struct vfio_pci_fill_info *fill = data;
- fill->count++;
- if (fill->devices >= fill->devices_end)
- return 0;
+ /* The topology changed since we counted devices */
+ if (fill->count >= fill->nr_devices)
+ return -EAGAIN;
+
+ info = &fill->devices[fill->count++];
+ info->segment = pci_domain_nr(pdev->bus);
+ info->bus = pdev->bus->number;
+ info->devfn = pdev->devfn;
if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) {
struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev);
@@ -809,19 +806,19 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
*/
vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
if (!vdev) {
- info.devid = VFIO_PCI_DEVID_NOT_OWNED;
+ info->devid = VFIO_PCI_DEVID_NOT_OWNED;
} else {
int id = vfio_iommufd_get_dev_id(vdev, iommufd);
if (id > 0)
- info.devid = id;
+ info->devid = id;
else if (id == -ENOENT)
- info.devid = VFIO_PCI_DEVID_OWNED;
+ info->devid = VFIO_PCI_DEVID_OWNED;
else
- info.devid = VFIO_PCI_DEVID_NOT_OWNED;
+ info->devid = VFIO_PCI_DEVID_NOT_OWNED;
}
/* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */
- if (info.devid == VFIO_PCI_DEVID_NOT_OWNED)
+ if (info->devid == VFIO_PCI_DEVID_NOT_OWNED)
fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
} else {
struct iommu_group *iommu_group;
@@ -830,13 +827,10 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
if (!iommu_group)
return -EPERM; /* Cannot reset non-isolated devices */
- info.group_id = iommu_group_id(iommu_group);
+ info->group_id = iommu_group_id(iommu_group);
iommu_group_put(iommu_group);
}
- if (copy_to_user(fill->devices, &info, sizeof(info)))
- return -EFAULT;
- fill->devices++;
return 0;
}
@@ -1060,31 +1054,27 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
info.flags = 0;
+ info.size = 0;
- /* Report the BAR size, not the ROM size */
- info.size = pci_resource_len(pdev, info.index);
- if (!info.size) {
- /* Shadow ROMs appear as PCI option ROMs */
- if (pdev->resource[PCI_ROM_RESOURCE].flags &
- IORESOURCE_ROM_SHADOW)
- info.size = 0x20000;
- else
- break;
- }
-
- /*
- * Is it really there? Enable memory decode for implicit access
- * in pci_map_rom().
- */
- cmd = vfio_pci_memory_lock_and_enable(vdev);
- io = pci_map_rom(pdev, &size);
- if (io) {
+ if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) {
+ /*
+ * Check ROM content is valid. Need to enable memory
+ * decode for ROM access in pci_map_rom().
+ */
+ cmd = vfio_pci_memory_lock_and_enable(vdev);
+ io = pci_map_rom(pdev, &size);
+ if (io) {
+ info.flags = VFIO_REGION_INFO_FLAG_READ;
+ /* Report the BAR size, not the ROM size. */
+ info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE);
+ pci_unmap_rom(pdev, io);
+ }
+ vfio_pci_memory_unlock_and_restore(vdev, cmd);
+ } else if (pdev->rom && pdev->romlen) {
info.flags = VFIO_REGION_INFO_FLAG_READ;
- pci_unmap_rom(pdev, io);
- } else {
- info.size = 0;
+ /* Report BAR size as power of two. */
+ info.size = roundup_pow_of_two(pdev->romlen);
}
- vfio_pci_memory_unlock_and_restore(vdev, cmd);
break;
}
@@ -1258,10 +1248,11 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
{
unsigned long minsz =
offsetofend(struct vfio_pci_hot_reset_info, count);
+ struct vfio_pci_dependent_device *devices = NULL;
struct vfio_pci_hot_reset_info hdr;
struct vfio_pci_fill_info fill = {};
bool slot = false;
- int ret = 0;
+ int ret, count = 0;
if (copy_from_user(&hdr, arg, minsz))
return -EFAULT;
@@ -1277,9 +1268,26 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
else if (pci_probe_reset_bus(vdev->pdev->bus))
return -ENODEV;
- fill.devices = arg->devices;
- fill.devices_end = arg->devices +
- (hdr.argsz - sizeof(hdr)) / sizeof(arg->devices[0]);
+ ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
+ &count, slot);
+ if (ret)
+ return ret;
+
+ if (WARN_ON(!count)) /* Should always be at least one */
+ return -ERANGE;
+
+ if (count > (hdr.argsz - sizeof(hdr)) / sizeof(*devices)) {
+ hdr.count = count;
+ ret = -ENOSPC;
+ goto header;
+ }
+
+ devices = kcalloc(count, sizeof(*devices), GFP_KERNEL);
+ if (!devices)
+ return -ENOMEM;
+
+ fill.devices = devices;
+ fill.nr_devices = count;
fill.vdev = &vdev->vdev;
if (vfio_device_cdev_opened(&vdev->vdev))
@@ -1291,21 +1299,28 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
&fill, slot);
mutex_unlock(&vdev->vdev.dev_set->lock);
if (ret)
- return ret;
+ goto out;
+
+ if (copy_to_user(arg->devices, devices,
+ sizeof(*devices) * fill.count)) {
+ ret = -EFAULT;
+ goto out;
+ }
hdr.count = fill.count;
hdr.flags = fill.flags;
- if (copy_to_user(arg, &hdr, minsz))
- return -EFAULT;
- if (fill.count > fill.devices - arg->devices)
- return -ENOSPC;
- return 0;
+header:
+ if (copy_to_user(arg, &hdr, minsz))
+ ret = -EFAULT;
+out:
+ kfree(devices);
+ return ret;
}
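
Because the reworked ioctl above now reports the full device count through hdr.count and fails with -ENOSPC when the supplied array is too small, userspace can size the array with the usual two-call pattern. A hedged userspace sketch, with error handling trimmed:

/* Sketch: probe for the dependent-device count, then fetch the array. */
#include <errno.h>
#include <linux/vfio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static struct vfio_pci_hot_reset_info *get_hot_reset_info(int device_fd)
{
	struct vfio_pci_hot_reset_info hdr = { .argsz = sizeof(hdr) };
	struct vfio_pci_hot_reset_info *info;
	size_t sz;

	/* First call: intentionally too small, kernel fills hdr.count. */
	if (ioctl(device_fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, &hdr) &&
	    errno != ENOSPC)
		return NULL;

	sz = sizeof(*info) +
	     hdr.count * sizeof(struct vfio_pci_dependent_device);
	info = calloc(1, sz);
	if (!info)
		return NULL;

	info->argsz = sz;
	if (ioctl(device_fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info)) {
		free(info);
		return NULL;
	}
	return info;	/* info->devices[0..info->count - 1] are valid */
}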
static int
vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
- int array_count, bool slot,
+ u32 array_count, bool slot,
struct vfio_pci_hot_reset __user *arg)
{
int32_t *group_fds;
@@ -1587,100 +1602,20 @@ ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *bu
}
EXPORT_SYMBOL_GPL(vfio_pci_core_write);
-/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */
-static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try)
+static void vfio_pci_zap_bars(struct vfio_pci_core_device *vdev)
{
- struct vfio_pci_mmap_vma *mmap_vma, *tmp;
-
- /*
- * Lock ordering:
- * vma_lock is nested under mmap_lock for vm_ops callback paths.
- * The memory_lock semaphore is used by both code paths calling
- * into this function to zap vmas and the vm_ops.fault callback
- * to protect the memory enable state of the device.
- *
- * When zapping vmas we need to maintain the mmap_lock => vma_lock
- * ordering, which requires using vma_lock to walk vma_list to
- * acquire an mm, then dropping vma_lock to get the mmap_lock and
- * reacquiring vma_lock. This logic is derived from similar
- * requirements in uverbs_user_mmap_disassociate().
- *
- * mmap_lock must always be the top-level lock when it is taken.
- * Therefore we can only hold the memory_lock write lock when
- * vma_list is empty, as we'd need to take mmap_lock to clear
- * entries. vma_list can only be guaranteed empty when holding
- * vma_lock, thus memory_lock is nested under vma_lock.
- *
- * This enables the vm_ops.fault callback to acquire vma_lock,
- * followed by memory_lock read lock, while already holding
- * mmap_lock without risk of deadlock.
- */
- while (1) {
- struct mm_struct *mm = NULL;
+ struct vfio_device *core_vdev = &vdev->vdev;
+ loff_t start = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX);
+ loff_t end = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX);
+ loff_t len = end - start;
- if (try) {
- if (!mutex_trylock(&vdev->vma_lock))
- return 0;
- } else {
- mutex_lock(&vdev->vma_lock);
- }
- while (!list_empty(&vdev->vma_list)) {
- mmap_vma = list_first_entry(&vdev->vma_list,
- struct vfio_pci_mmap_vma,
- vma_next);
- mm = mmap_vma->vma->vm_mm;
- if (mmget_not_zero(mm))
- break;
-
- list_del(&mmap_vma->vma_next);
- kfree(mmap_vma);
- mm = NULL;
- }
- if (!mm)
- return 1;
- mutex_unlock(&vdev->vma_lock);
-
- if (try) {
- if (!mmap_read_trylock(mm)) {
- mmput(mm);
- return 0;
- }
- } else {
- mmap_read_lock(mm);
- }
- if (try) {
- if (!mutex_trylock(&vdev->vma_lock)) {
- mmap_read_unlock(mm);
- mmput(mm);
- return 0;
- }
- } else {
- mutex_lock(&vdev->vma_lock);
- }
- list_for_each_entry_safe(mmap_vma, tmp,
- &vdev->vma_list, vma_next) {
- struct vm_area_struct *vma = mmap_vma->vma;
-
- if (vma->vm_mm != mm)
- continue;
-
- list_del(&mmap_vma->vma_next);
- kfree(mmap_vma);
-
- zap_vma_ptes(vma, vma->vm_start,
- vma->vm_end - vma->vm_start);
- }
- mutex_unlock(&vdev->vma_lock);
- mmap_read_unlock(mm);
- mmput(mm);
- }
+ unmap_mapping_range(core_vdev->inode->i_mapping, start, len, true);
}
void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev)
{
- vfio_pci_zap_and_vma_lock(vdev, false);
down_write(&vdev->memory_lock);
- mutex_unlock(&vdev->vma_lock);
+ vfio_pci_zap_bars(vdev);
}
u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev)
@@ -1702,100 +1637,83 @@ void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 c
up_write(&vdev->memory_lock);
}
-/* Caller holds vma_lock */
-static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev,
- struct vm_area_struct *vma)
-{
- struct vfio_pci_mmap_vma *mmap_vma;
-
- mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL_ACCOUNT);
- if (!mmap_vma)
- return -ENOMEM;
-
- mmap_vma->vma = vma;
- list_add(&mmap_vma->vma_next, &vdev->vma_list);
-
- return 0;
-}
-
-/*
- * Zap mmaps on open so that we can fault them in on access and therefore
- * our vma_list only tracks mappings accessed since last zap.
- */
-static void vfio_pci_mmap_open(struct vm_area_struct *vma)
-{
- zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
-}
-
-static void vfio_pci_mmap_close(struct vm_area_struct *vma)
+static unsigned long vma_to_pfn(struct vm_area_struct *vma)
{
struct vfio_pci_core_device *vdev = vma->vm_private_data;
- struct vfio_pci_mmap_vma *mmap_vma;
+ int index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+ u64 pgoff;
- mutex_lock(&vdev->vma_lock);
- list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
- if (mmap_vma->vma == vma) {
- list_del(&mmap_vma->vma_next);
- kfree(mmap_vma);
- break;
- }
- }
- mutex_unlock(&vdev->vma_lock);
+ pgoff = vma->vm_pgoff &
+ ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+ return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff;
}
-static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
+static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
+ unsigned int order)
{
struct vm_area_struct *vma = vmf->vma;
struct vfio_pci_core_device *vdev = vma->vm_private_data;
- struct vfio_pci_mmap_vma *mmap_vma;
- vm_fault_t ret = VM_FAULT_NOPAGE;
+ unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
+ vm_fault_t ret = VM_FAULT_SIGBUS;
- mutex_lock(&vdev->vma_lock);
- down_read(&vdev->memory_lock);
+ pfn = vma_to_pfn(vma) + pgoff;
- /*
- * Memory region cannot be accessed if the low power feature is engaged
- * or memory access is disabled.
- */
- if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) {
- ret = VM_FAULT_SIGBUS;
- goto up_out;
+ if (order && (pfn & ((1 << order) - 1) ||
+ vmf->address & ((PAGE_SIZE << order) - 1) ||
+ vmf->address + (PAGE_SIZE << order) > vma->vm_end)) {
+ ret = VM_FAULT_FALLBACK;
+ goto out;
}
- /*
- * We populate the whole vma on fault, so we need to test whether
- * the vma has already been mapped, such as for concurrent faults
- * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if
- * we ask it to fill the same range again.
- */
- list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
- if (mmap_vma->vma == vma)
- goto up_out;
- }
+ down_read(&vdev->memory_lock);
- if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
- vma->vm_end - vma->vm_start,
- vma->vm_page_prot)) {
- ret = VM_FAULT_SIGBUS;
- zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
- goto up_out;
- }
+ if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
+ goto out_unlock;
- if (__vfio_pci_add_vma(vdev, vma)) {
- ret = VM_FAULT_OOM;
- zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+ switch (order) {
+ case 0:
+ ret = vmf_insert_pfn(vma, vmf->address, pfn);
+ break;
+#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
+ case PMD_ORDER:
+ ret = vmf_insert_pfn_pmd(vmf,
+ __pfn_to_pfn_t(pfn, PFN_DEV), false);
+ break;
+#endif
+#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
+ case PUD_ORDER:
+ ret = vmf_insert_pfn_pud(vmf,
+ __pfn_to_pfn_t(pfn, PFN_DEV), false);
+ break;
+#endif
+ default:
+ ret = VM_FAULT_FALLBACK;
}
-up_out:
+out_unlock:
up_read(&vdev->memory_lock);
- mutex_unlock(&vdev->vma_lock);
+out:
+ dev_dbg_ratelimited(&vdev->pdev->dev,
+ "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n",
+ __func__, order,
+ vma->vm_pgoff >>
+ (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT),
+ pgoff, (unsigned int)ret);
+
return ret;
}
+static vm_fault_t vfio_pci_mmap_page_fault(struct vm_fault *vmf)
+{
+ return vfio_pci_mmap_huge_fault(vmf, 0);
+}
+
static const struct vm_operations_struct vfio_pci_mmap_ops = {
- .open = vfio_pci_mmap_open,
- .close = vfio_pci_mmap_close,
- .fault = vfio_pci_mmap_fault,
+ .fault = vfio_pci_mmap_page_fault,
+#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
+ .huge_fault = vfio_pci_mmap_huge_fault,
+#endif
};
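
The huge_fault path above only installs a PMD- or PUD-sized mapping when the BAR pfn, the faulting address, and the VMA bounds all line up at the requested order; anything else falls back to single-page faults. A stand-alone illustration of that eligibility test, assuming 4K base pages (the function and macro names are local to this example):

#include <stdbool.h>
#include <stdint.h>

#define EX_PAGE_SIZE 4096ULL	/* assumption: 4K base pages */

static bool can_insert_huge(uint64_t pfn, uint64_t addr,
			    uint64_t vma_end, unsigned int order)
{
	if (!order)
		return true;				/* base page always ok */
	if (pfn & ((1ULL << order) - 1))
		return false;				/* BAR not aligned */
	if (addr & ((EX_PAGE_SIZE << order) - 1))
		return false;				/* VA not aligned */
	if (addr + (EX_PAGE_SIZE << order) > vma_end)
		return false;				/* overruns the VMA */
	return true;
}

With 4K pages, order 9 corresponds to a 2MB PMD mapping: both the BAR physical address and the faulting user virtual address must be 2MB aligned, and the 2MB block must end within the VMA, otherwise the handler returns VM_FAULT_FALLBACK as shown above.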
int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma)
@@ -1857,13 +1775,31 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma
vma->vm_private_data = vdev;
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
- vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
+ vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
/*
- * See remap_pfn_range(), called from vfio_pci_fault() but we can't
- * change vm_flags within the fault handler. Set them now.
+ * Set vm_flags now, they should not be changed in the fault handler.
+ * We want the same flags and page protection (decrypted above) as
+ * io_remap_pfn_range() would set.
+ *
+ * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64,
+ * allowing KVM stage 2 device mapping attributes to use Normal-NC
+ * rather than DEVICE_nGnRE, which allows guest mappings
+ * supporting write-combining attributes (WC). ARM does not
+ * architecturally guarantee this is safe, and indeed some MMIO
+ * regions like the GICv2 VCPU interface can trigger uncontained
+ * faults if Normal-NC is used.
+ *
+ * To safely use VFIO in KVM the platform must guarantee full
+ * safety in the guest where no action taken against a MMIO
+ * mapping can trigger an uncontained failure. The assumption is
+ * that most VFIO PCI platforms support this for both mapping types,
+ * at least in common flows, based on some expectations of how
+ * PCI IP is integrated. Hence VM_ALLOW_ANY_UNCACHED is set in
+ * the VMA flags.
*/
- vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
+ vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
+ VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_ops = &vfio_pci_mmap_ops;
return 0;
@@ -2047,6 +1983,7 @@ static int vfio_pci_bus_notifier(struct notifier_block *nb,
pci_name(pdev));
pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
vdev->vdev.ops->name);
+ WARN_ON(!pdev->driver_override);
} else if (action == BUS_NOTIFY_BOUND_DRIVER &&
pdev->is_virtfn && physfn == vdev->pdev) {
struct pci_driver *drv = pci_dev_driver(pdev);
@@ -2161,8 +2098,6 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
mutex_init(&vdev->ioeventfds_lock);
INIT_LIST_HEAD(&vdev->dummy_resources_list);
INIT_LIST_HEAD(&vdev->ioeventfds_list);
- mutex_init(&vdev->vma_lock);
- INIT_LIST_HEAD(&vdev->vma_list);
INIT_LIST_HEAD(&vdev->sriov_pfs_item);
init_rwsem(&vdev->memory_lock);
xa_init(&vdev->ctx);
@@ -2178,7 +2113,6 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev)
mutex_destroy(&vdev->igate);
mutex_destroy(&vdev->ioeventfds_lock);
- mutex_destroy(&vdev->vma_lock);
kfree(vdev->region);
kfree(vdev->pm_save);
}
@@ -2456,26 +2390,15 @@ unwind:
return ret;
}
-/*
- * We need to get memory_lock for each device, but devices can share mmap_lock,
- * therefore we need to zap and hold the vma_lock for each device, and only then
- * get each memory_lock.
- */
static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
struct vfio_pci_group_info *groups,
struct iommufd_ctx *iommufd_ctx)
{
- struct vfio_pci_core_device *cur_mem;
- struct vfio_pci_core_device *cur_vma;
- struct vfio_pci_core_device *cur;
+ struct vfio_pci_core_device *vdev;
struct pci_dev *pdev;
- bool is_mem = true;
int ret;
mutex_lock(&dev_set->lock);
- cur_mem = list_first_entry(&dev_set->device_list,
- struct vfio_pci_core_device,
- vdev.dev_set_list);
pdev = vfio_pci_dev_set_resettable(dev_set);
if (!pdev) {
@@ -2492,7 +2415,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
if (ret)
goto err_unlock;
- list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
+ list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) {
bool owned;
/*
@@ -2516,38 +2439,38 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
* Otherwise, reset is not allowed.
*/
if (iommufd_ctx) {
- int devid = vfio_iommufd_get_dev_id(&cur_vma->vdev,
+ int devid = vfio_iommufd_get_dev_id(&vdev->vdev,
iommufd_ctx);
owned = (devid > 0 || devid == -ENOENT);
} else {
- owned = vfio_dev_in_groups(&cur_vma->vdev, groups);
+ owned = vfio_dev_in_groups(&vdev->vdev, groups);
}
if (!owned) {
ret = -EINVAL;
- goto err_undo;
+ break;
}
/*
- * Locking multiple devices is prone to deadlock, runaway and
- * unwind if we hit contention.
+ * Take the memory write lock for each device and zap BAR
+ * mappings to prevent the user accessing the device while in
+ * reset. Locking multiple devices is prone to deadlock,
+ * runaway and unwind if we hit contention.
*/
- if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) {
+ if (!down_write_trylock(&vdev->memory_lock)) {
ret = -EBUSY;
- goto err_undo;
+ break;
}
+
+ vfio_pci_zap_bars(vdev);
}
- cur_vma = NULL;
- list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) {
- if (!down_write_trylock(&cur_mem->memory_lock)) {
- ret = -EBUSY;
- goto err_undo;
- }
- mutex_unlock(&cur_mem->vma_lock);
+ if (!list_entry_is_head(vdev,
+ &dev_set->device_list, vdev.dev_set_list)) {
+ vdev = list_prev_entry(vdev, vdev.dev_set_list);
+ goto err_undo;
}
- cur_mem = NULL;
/*
* The pci_reset_bus() will reset all the devices in the bus.
@@ -2558,25 +2481,22 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
* cause the PCI config space reset without restoring the original
* state (saved locally in 'vdev->pm_save').
*/
- list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
- vfio_pci_set_power_state(cur, PCI_D0);
+ list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
+ vfio_pci_set_power_state(vdev, PCI_D0);
ret = pci_reset_bus(pdev);
+ vdev = list_last_entry(&dev_set->device_list,
+ struct vfio_pci_core_device, vdev.dev_set_list);
+
err_undo:
- list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
- if (cur == cur_mem)
- is_mem = false;
- if (cur == cur_vma)
- break;
- if (is_mem)
- up_write(&cur->memory_lock);
- else
- mutex_unlock(&cur->vma_lock);
- }
+ list_for_each_entry_from_reverse(vdev, &dev_set->device_list,
+ vdev.dev_set_list)
+ up_write(&vdev->memory_lock);
+
+ list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
+ pm_runtime_put(&vdev->pdev->dev);
- list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
- pm_runtime_put(&cur->pdev->dev);
err_unlock:
mutex_unlock(&dev_set->lock);
return ret;
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 237beac83809..8382c5834335 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -23,11 +23,12 @@
#include "vfio_pci_priv.h"
struct vfio_pci_irq_ctx {
- struct eventfd_ctx *trigger;
- struct virqfd *unmask;
- struct virqfd *mask;
- char *name;
- bool masked;
+ struct vfio_pci_core_device *vdev;
+ struct eventfd_ctx *trigger;
+ struct virqfd *unmask;
+ struct virqfd *mask;
+ char *name;
+ bool masked;
struct irq_bypass_producer producer;
};
@@ -84,28 +85,29 @@ vfio_irq_ctx_alloc(struct vfio_pci_core_device *vdev, unsigned long index)
/*
* INTx
*/
-static void vfio_send_intx_eventfd(void *opaque, void *unused)
+static void vfio_send_intx_eventfd(void *opaque, void *data)
{
struct vfio_pci_core_device *vdev = opaque;
if (likely(is_intx(vdev) && !vdev->virq_disabled)) {
- struct vfio_pci_irq_ctx *ctx;
+ struct vfio_pci_irq_ctx *ctx = data;
+ struct eventfd_ctx *trigger = READ_ONCE(ctx->trigger);
- ctx = vfio_irq_ctx_get(vdev, 0);
- if (WARN_ON_ONCE(!ctx))
- return;
- eventfd_signal(ctx->trigger);
+ if (likely(trigger))
+ eventfd_signal(trigger);
}
}
/* Returns true if the INTx vfio_pci_irq_ctx.masked value is changed. */
-bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
+static bool __vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
{
struct pci_dev *pdev = vdev->pdev;
struct vfio_pci_irq_ctx *ctx;
unsigned long flags;
bool masked_changed = false;
+ lockdep_assert_held(&vdev->igate);
+
spin_lock_irqsave(&vdev->irqlock, flags);
/*
@@ -143,17 +145,28 @@ out_unlock:
return masked_changed;
}
+bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
+{
+ bool mask_changed;
+
+ mutex_lock(&vdev->igate);
+ mask_changed = __vfio_pci_intx_mask(vdev);
+ mutex_unlock(&vdev->igate);
+
+ return mask_changed;
+}
+
/*
* If this is triggered by an eventfd, we can't call eventfd_signal
* or else we'll deadlock on the eventfd wait queue. Return >0 when
* a signal is necessary, which can then be handled via a work queue
* or directly depending on the caller.
*/
-static int vfio_pci_intx_unmask_handler(void *opaque, void *unused)
+static int vfio_pci_intx_unmask_handler(void *opaque, void *data)
{
struct vfio_pci_core_device *vdev = opaque;
struct pci_dev *pdev = vdev->pdev;
- struct vfio_pci_irq_ctx *ctx;
+ struct vfio_pci_irq_ctx *ctx = data;
unsigned long flags;
int ret = 0;
@@ -169,10 +182,6 @@ static int vfio_pci_intx_unmask_handler(void *opaque, void *unused)
goto out_unlock;
}
- ctx = vfio_irq_ctx_get(vdev, 0);
- if (WARN_ON_ONCE(!ctx))
- goto out_unlock;
-
if (ctx->masked && !vdev->virq_disabled) {
/*
* A pending interrupt here would immediately trigger,
@@ -194,23 +203,30 @@ out_unlock:
return ret;
}
+static void __vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev)
+{
+ struct vfio_pci_irq_ctx *ctx = vfio_irq_ctx_get(vdev, 0);
+
+ lockdep_assert_held(&vdev->igate);
+
+ if (vfio_pci_intx_unmask_handler(vdev, ctx) > 0)
+ vfio_send_intx_eventfd(vdev, ctx);
+}
+
void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev)
{
- if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0)
- vfio_send_intx_eventfd(vdev, NULL);
+ mutex_lock(&vdev->igate);
+ __vfio_pci_intx_unmask(vdev);
+ mutex_unlock(&vdev->igate);
}
static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
{
- struct vfio_pci_core_device *vdev = dev_id;
- struct vfio_pci_irq_ctx *ctx;
+ struct vfio_pci_irq_ctx *ctx = dev_id;
+ struct vfio_pci_core_device *vdev = ctx->vdev;
unsigned long flags;
int ret = IRQ_NONE;
- ctx = vfio_irq_ctx_get(vdev, 0);
- if (WARN_ON_ONCE(!ctx))
- return ret;
-
spin_lock_irqsave(&vdev->irqlock, flags);
if (!vdev->pci_2_3) {
@@ -226,102 +242,108 @@ static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
spin_unlock_irqrestore(&vdev->irqlock, flags);
if (ret == IRQ_HANDLED)
- vfio_send_intx_eventfd(vdev, NULL);
+ vfio_send_intx_eventfd(vdev, ctx);
return ret;
}
-static int vfio_intx_enable(struct vfio_pci_core_device *vdev)
+static int vfio_intx_enable(struct vfio_pci_core_device *vdev,
+ struct eventfd_ctx *trigger)
{
+ struct pci_dev *pdev = vdev->pdev;
struct vfio_pci_irq_ctx *ctx;
+ unsigned long irqflags;
+ char *name;
+ int ret;
if (!is_irq_none(vdev))
return -EINVAL;
- if (!vdev->pdev->irq)
+ if (!pdev->irq)
return -ENODEV;
+ name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", pci_name(pdev));
+ if (!name)
+ return -ENOMEM;
+
ctx = vfio_irq_ctx_alloc(vdev, 0);
- if (!ctx)
+ if (!ctx) {
+ kfree(name);
return -ENOMEM;
+ }
+
+ ctx->name = name;
+ ctx->trigger = trigger;
+ ctx->vdev = vdev;
/*
- * If the virtual interrupt is masked, restore it. Devices
- * supporting DisINTx can be masked at the hardware level
- * here, non-PCI-2.3 devices will have to wait until the
- * interrupt is enabled.
+ * Fill the initial masked state based on virq_disabled. After
+ * enable, changing the DisINTx bit in vconfig directly changes INTx
+ * masking. igate prevents races during setup; once running, masked
+ * is protected via irqlock.
+ *
+ * Devices supporting DisINTx also reflect the current mask state in
+ * the physical DisINTx bit, which is not affected during IRQ setup.
+ *
+ * Devices without DisINTx support require an exclusive interrupt.
+ * IRQ masking is performed at the IRQ chip. Again, igate protects
+ * against races during setup, and since IRQ handlers and irqfds are
+ * not yet active, masked is stable and can be used to conditionally
+ * auto-enable the IRQ.
+ *
+ * irq_type must be stable while the IRQ handler is registered,
+ * therefore it must be set before request_irq().
*/
ctx->masked = vdev->virq_disabled;
- if (vdev->pci_2_3)
- pci_intx(vdev->pdev, !ctx->masked);
+ if (vdev->pci_2_3) {
+ pci_intx(pdev, !ctx->masked);
+ irqflags = IRQF_SHARED;
+ } else {
+ irqflags = ctx->masked ? IRQF_NO_AUTOEN : 0;
+ }
vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX;
+ ret = request_irq(pdev->irq, vfio_intx_handler,
+ irqflags, ctx->name, ctx);
+ if (ret) {
+ vdev->irq_type = VFIO_PCI_NUM_IRQS;
+ kfree(name);
+ vfio_irq_ctx_free(vdev, ctx, 0);
+ return ret;
+ }
+
return 0;
}
-static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, int fd)
+static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev,
+ struct eventfd_ctx *trigger)
{
struct pci_dev *pdev = vdev->pdev;
- unsigned long irqflags = IRQF_SHARED;
struct vfio_pci_irq_ctx *ctx;
- struct eventfd_ctx *trigger;
- unsigned long flags;
- int ret;
+ struct eventfd_ctx *old;
ctx = vfio_irq_ctx_get(vdev, 0);
if (WARN_ON_ONCE(!ctx))
return -EINVAL;
- if (ctx->trigger) {
- free_irq(pdev->irq, vdev);
- kfree(ctx->name);
- eventfd_ctx_put(ctx->trigger);
- ctx->trigger = NULL;
- }
-
- if (fd < 0) /* Disable only */
- return 0;
-
- ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)",
- pci_name(pdev));
- if (!ctx->name)
- return -ENOMEM;
-
- trigger = eventfd_ctx_fdget(fd);
- if (IS_ERR(trigger)) {
- kfree(ctx->name);
- return PTR_ERR(trigger);
- }
-
- ctx->trigger = trigger;
+ old = ctx->trigger;
- if (!vdev->pci_2_3)
- irqflags = 0;
+ WRITE_ONCE(ctx->trigger, trigger);
- ret = request_irq(pdev->irq, vfio_intx_handler,
- irqflags, ctx->name, vdev);
- if (ret) {
- ctx->trigger = NULL;
- kfree(ctx->name);
- eventfd_ctx_put(trigger);
- return ret;
+ /* Releasing an old ctx requires synchronizing in-flight users */
+ if (old) {
+ synchronize_irq(pdev->irq);
+ vfio_virqfd_flush_thread(&ctx->unmask);
+ eventfd_ctx_put(old);
}
- /*
- * INTx disable will stick across the new irq setup,
- * disable_irq won't.
- */
- spin_lock_irqsave(&vdev->irqlock, flags);
- if (!vdev->pci_2_3 && ctx->masked)
- disable_irq_nosync(pdev->irq);
- spin_unlock_irqrestore(&vdev->irqlock, flags);
-
return 0;
}
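
The rework above publishes ctx->trigger with WRITE_ONCE() so the hard IRQ handler, which only samples it with READ_ONCE(), never takes a lock; retiring the old eventfd just requires waiting out in-flight handlers before dropping the reference. A generic kernel-style sketch of that publish/retire pattern (struct my_ctx and my_swap_trigger are illustrative, not part of this patch):

#include <linux/eventfd.h>
#include <linux/interrupt.h>

struct my_ctx {
	struct eventfd_ctx *trigger;	/* read by the hard IRQ handler */
};

/* Swap the trigger locklessly, then release the old one only after all
 * handlers that might still be using it have finished. */
static void my_swap_trigger(struct my_ctx *ctx, unsigned int irq,
			    struct eventfd_ctx *new_trigger)
{
	struct eventfd_ctx *old = ctx->trigger;

	WRITE_ONCE(ctx->trigger, new_trigger);	/* publish to the handler */
	if (old) {
		synchronize_irq(irq);		/* wait for in-flight handlers */
		eventfd_ctx_put(old);		/* safe to release now */
	}
}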
static void vfio_intx_disable(struct vfio_pci_core_device *vdev)
{
+ struct pci_dev *pdev = vdev->pdev;
struct vfio_pci_irq_ctx *ctx;
ctx = vfio_irq_ctx_get(vdev, 0);
@@ -329,10 +351,13 @@ static void vfio_intx_disable(struct vfio_pci_core_device *vdev)
if (ctx) {
vfio_virqfd_disable(&ctx->unmask);
vfio_virqfd_disable(&ctx->mask);
+ free_irq(pdev->irq, ctx);
+ if (ctx->trigger)
+ eventfd_ctx_put(ctx->trigger);
+ kfree(ctx->name);
+ vfio_irq_ctx_free(vdev, ctx, 0);
}
- vfio_intx_set_signal(vdev, -1);
vdev->irq_type = VFIO_PCI_NUM_IRQS;
- vfio_irq_ctx_free(vdev, ctx, 0);
}
/*
@@ -560,11 +585,11 @@ static int vfio_pci_set_intx_unmask(struct vfio_pci_core_device *vdev,
return -EINVAL;
if (flags & VFIO_IRQ_SET_DATA_NONE) {
- vfio_pci_intx_unmask(vdev);
+ __vfio_pci_intx_unmask(vdev);
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
uint8_t unmask = *(uint8_t *)data;
if (unmask)
- vfio_pci_intx_unmask(vdev);
+ __vfio_pci_intx_unmask(vdev);
} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
struct vfio_pci_irq_ctx *ctx = vfio_irq_ctx_get(vdev, 0);
int32_t fd = *(int32_t *)data;
@@ -574,7 +599,7 @@ static int vfio_pci_set_intx_unmask(struct vfio_pci_core_device *vdev,
if (fd >= 0)
return vfio_virqfd_enable((void *) vdev,
vfio_pci_intx_unmask_handler,
- vfio_send_intx_eventfd, NULL,
+ vfio_send_intx_eventfd, ctx,
&ctx->unmask, fd);
vfio_virqfd_disable(&ctx->unmask);
@@ -591,11 +616,11 @@ static int vfio_pci_set_intx_mask(struct vfio_pci_core_device *vdev,
return -EINVAL;
if (flags & VFIO_IRQ_SET_DATA_NONE) {
- vfio_pci_intx_mask(vdev);
+ __vfio_pci_intx_mask(vdev);
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
uint8_t mask = *(uint8_t *)data;
if (mask)
- vfio_pci_intx_mask(vdev);
+ __vfio_pci_intx_mask(vdev);
} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
return -ENOTTY; /* XXX implement me */
}
@@ -616,19 +641,23 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
return -EINVAL;
if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+ struct eventfd_ctx *trigger = NULL;
int32_t fd = *(int32_t *)data;
int ret;
- if (is_intx(vdev))
- return vfio_intx_set_signal(vdev, fd);
+ if (fd >= 0) {
+ trigger = eventfd_ctx_fdget(fd);
+ if (IS_ERR(trigger))
+ return PTR_ERR(trigger);
+ }
- ret = vfio_intx_enable(vdev);
- if (ret)
- return ret;
+ if (is_intx(vdev))
+ ret = vfio_intx_set_signal(vdev, trigger);
+ else
+ ret = vfio_intx_enable(vdev, trigger);
- ret = vfio_intx_set_signal(vdev, fd);
- if (ret)
- vfio_intx_disable(vdev);
+ if (ret && trigger)
+ eventfd_ctx_put(trigger);
return ret;
}
@@ -637,11 +666,11 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
return -EINVAL;
if (flags & VFIO_IRQ_SET_DATA_NONE) {
- vfio_send_intx_eventfd(vdev, NULL);
+ vfio_send_intx_eventfd(vdev, vfio_irq_ctx_get(vdev, 0));
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
uint8_t trigger = *(uint8_t *)data;
if (trigger)
- vfio_send_intx_eventfd(vdev, NULL);
+ vfio_send_intx_eventfd(vdev, vfio_irq_ctx_get(vdev, 0));
}
return 0;
}
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
index 07fea08ea8a2..6192788c8ba3 100644
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -16,6 +16,7 @@
#include <linux/io.h>
#include <linux/vfio.h>
#include <linux/vgaarb.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
#include "vfio_pci_priv.h"
@@ -61,9 +62,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_iowrite##size);
VFIO_IOWRITE(8)
VFIO_IOWRITE(16)
VFIO_IOWRITE(32)
-#ifdef iowrite64
VFIO_IOWRITE(64)
-#endif
#define VFIO_IOREAD(size) \
int vfio_pci_core_ioread##size(struct vfio_pci_core_device *vdev, \
@@ -89,6 +88,43 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_ioread##size);
VFIO_IOREAD(8)
VFIO_IOREAD(16)
VFIO_IOREAD(32)
+VFIO_IOREAD(64)
+
+#define VFIO_IORDWR(size) \
+static int vfio_pci_iordwr##size(struct vfio_pci_core_device *vdev,\
+ bool iswrite, bool test_mem, \
+ void __iomem *io, char __user *buf, \
+ loff_t off, size_t *filled) \
+{ \
+ u##size val; \
+ int ret; \
+ \
+ if (iswrite) { \
+ if (copy_from_user(&val, buf, sizeof(val))) \
+ return -EFAULT; \
+ \
+ ret = vfio_pci_core_iowrite##size(vdev, test_mem, \
+ val, io + off); \
+ if (ret) \
+ return ret; \
+ } else { \
+ ret = vfio_pci_core_ioread##size(vdev, test_mem, \
+ &val, io + off); \
+ if (ret) \
+ return ret; \
+ \
+ if (copy_to_user(buf, &val, sizeof(val))) \
+ return -EFAULT; \
+ } \
+ \
+ *filled = sizeof(val); \
+ return 0; \
+} \
+
+VFIO_IORDWR(8)
+VFIO_IORDWR(16)
+VFIO_IORDWR(32)
+VFIO_IORDWR(64)
/*
* Read or write from an __iomem region (MMIO or I/O port) with an excluded
@@ -96,10 +132,10 @@ VFIO_IOREAD(32)
* reads with -1. This is intended for handling MSI-X vector tables and
* leftover space for ROM BARs.
*/
-static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
- void __iomem *io, char __user *buf,
- loff_t off, size_t count, size_t x_start,
- size_t x_end, bool iswrite)
+ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
+ void __iomem *io, char __user *buf,
+ loff_t off, size_t count, size_t x_start,
+ size_t x_end, bool iswrite)
{
ssize_t done = 0;
int ret;
@@ -114,72 +150,31 @@ static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
else
fillable = 0;
+ if (fillable >= 8 && !(off % 8)) {
+ ret = vfio_pci_iordwr64(vdev, iswrite, test_mem,
+ io, buf, off, &filled);
+ if (ret)
+ return ret;
+
+ } else
if (fillable >= 4 && !(off % 4)) {
- u32 val;
-
- if (iswrite) {
- if (copy_from_user(&val, buf, 4))
- return -EFAULT;
-
- ret = vfio_pci_core_iowrite32(vdev, test_mem,
- val, io + off);
- if (ret)
- return ret;
- } else {
- ret = vfio_pci_core_ioread32(vdev, test_mem,
- &val, io + off);
- if (ret)
- return ret;
-
- if (copy_to_user(buf, &val, 4))
- return -EFAULT;
- }
+ ret = vfio_pci_iordwr32(vdev, iswrite, test_mem,
+ io, buf, off, &filled);
+ if (ret)
+ return ret;
- filled = 4;
} else if (fillable >= 2 && !(off % 2)) {
- u16 val;
-
- if (iswrite) {
- if (copy_from_user(&val, buf, 2))
- return -EFAULT;
-
- ret = vfio_pci_core_iowrite16(vdev, test_mem,
- val, io + off);
- if (ret)
- return ret;
- } else {
- ret = vfio_pci_core_ioread16(vdev, test_mem,
- &val, io + off);
- if (ret)
- return ret;
-
- if (copy_to_user(buf, &val, 2))
- return -EFAULT;
- }
+ ret = vfio_pci_iordwr16(vdev, iswrite, test_mem,
+ io, buf, off, &filled);
+ if (ret)
+ return ret;
- filled = 2;
} else if (fillable) {
- u8 val;
-
- if (iswrite) {
- if (copy_from_user(&val, buf, 1))
- return -EFAULT;
-
- ret = vfio_pci_core_iowrite8(vdev, test_mem,
- val, io + off);
- if (ret)
- return ret;
- } else {
- ret = vfio_pci_core_ioread8(vdev, test_mem,
- &val, io + off);
- if (ret)
- return ret;
-
- if (copy_to_user(buf, &val, 1))
- return -EFAULT;
- }
+ ret = vfio_pci_iordwr8(vdev, iswrite, test_mem,
+ io, buf, off, &filled);
+ if (ret)
+ return ret;
- filled = 1;
} else {
/* Fill reads with -1, drop writes */
filled = min(count, (size_t)(x_end - off));
@@ -201,6 +196,7 @@ static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
return done;
}
+EXPORT_SYMBOL_GPL(vfio_pci_core_do_io_rw);
int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar)
{
@@ -241,9 +237,8 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
if (pci_resource_start(pdev, bar))
end = pci_resource_len(pdev, bar);
- else if (bar == PCI_ROM_RESOURCE &&
- pdev->resource[bar].flags & IORESOURCE_ROM_SHADOW)
- end = 0x20000;
+ else if (bar == PCI_ROM_RESOURCE && pdev->rom && pdev->romlen)
+ end = roundup_pow_of_two(pdev->romlen);
else
return -EINVAL;
@@ -258,11 +253,14 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
* excluded range at the end of the actual ROM. This makes
* filling large ROM BARs much faster.
*/
- io = pci_map_rom(pdev, &x_start);
- if (!io) {
- done = -ENOMEM;
- goto out;
+ if (pci_resource_start(pdev, bar)) {
+ io = pci_map_rom(pdev, &x_start);
+ } else {
+ io = ioremap(pdev->rom, pdev->romlen);
+ x_start = pdev->romlen;
}
+ if (!io)
+ return -ENOMEM;
x_end = end;
} else {
int ret = vfio_pci_core_setup_barmap(vdev, bar);
@@ -279,14 +277,19 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
x_end = vdev->msix_offset + vdev->msix_size;
}
- done = do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos,
- count, x_start, x_end, iswrite);
+ done = vfio_pci_core_do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos,
+ count, x_start, x_end, iswrite);
if (done >= 0)
*ppos += done;
- if (bar == PCI_ROM_RESOURCE)
- pci_unmap_rom(pdev, io);
+ if (bar == PCI_ROM_RESOURCE) {
+ if (pci_resource_start(pdev, bar))
+ pci_unmap_rom(pdev, io);
+ else
+ iounmap(io);
+ }
+
out:
return done;
}
@@ -348,7 +351,8 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf,
* probing, so we don't currently worry about access in relation
* to the memory enable bit in the command register.
*/
- done = do_io_rw(vdev, false, iomem, buf, off, count, 0, 0, iswrite);
+ done = vfio_pci_core_do_io_rw(vdev, false, iomem, buf, off, count,
+ 0, 0, iswrite);
vga_put(vdev->pdev, rsrc);
@@ -377,12 +381,10 @@ static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd,
vfio_pci_core_iowrite32(ioeventfd->vdev, test_mem,
ioeventfd->data, ioeventfd->addr);
break;
-#ifdef iowrite64
case 8:
vfio_pci_core_iowrite64(ioeventfd->vdev, test_mem,
ioeventfd->data, ioeventfd->addr);
break;
-#endif
}
}
@@ -436,10 +438,8 @@ int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset,
pos >= vdev->msix_offset + vdev->msix_size))
return -EINVAL;
-#ifndef iowrite64
if (count == 8)
return -EINVAL;
-#endif
ret = vfio_pci_core_setup_barmap(vdev, bar);
if (ret)
diff --git a/drivers/vfio/pci/virtio/Kconfig b/drivers/vfio/pci/virtio/Kconfig
index bd80eca4a196..2770f7eb702c 100644
--- a/drivers/vfio/pci/virtio/Kconfig
+++ b/drivers/vfio/pci/virtio/Kconfig
@@ -1,15 +1,31 @@
# SPDX-License-Identifier: GPL-2.0-only
config VIRTIO_VFIO_PCI
- tristate "VFIO support for VIRTIO NET PCI devices"
- depends on VIRTIO_PCI && VIRTIO_PCI_ADMIN_LEGACY
- select VFIO_PCI_CORE
- help
- This provides support for exposing VIRTIO NET VF devices which support
- legacy IO access, using the VFIO framework that can work with a legacy
- virtio driver in the guest.
- Based on PCIe spec, VFs do not support I/O Space.
- As of that this driver emulates I/O BAR in software to let a VF be
- seen as a transitional device by its users and let it work with
- a legacy driver.
-
- If you don't know what to do here, say N.
+ tristate "VFIO support for VIRTIO NET PCI VF devices"
+ depends on VIRTIO_PCI
+ select VFIO_PCI_CORE
+ help
+ This provides migration support for VIRTIO NET PCI VF devices
+ using the VFIO framework. Migration support requires the
+ SR-IOV PF device to support specific VIRTIO extensions,
+ otherwise this driver provides no additional functionality
+ beyond vfio-pci.
+
+ Migration support in this driver relies on dirty page tracking
+ provided by the IOMMU hardware and exposed through IOMMUFD; any
+ other use case is not recommended.
+
+ If you don't know what to do here, say N.
+
+config VIRTIO_VFIO_PCI_ADMIN_LEGACY
+ bool "Legacy I/O support for VIRTIO NET PCI VF devices"
+ depends on VIRTIO_VFIO_PCI && VIRTIO_PCI_ADMIN_LEGACY
+ default y
+ help
+ This extends the virtio-vfio-pci driver to support legacy I/O
+ access, allowing use of legacy virtio drivers with VIRTIO NET
+ PCI VF devices. Legacy I/O support requires the SR-IOV PF
+ device to support and enable specific VIRTIO extensions,
+ otherwise this driver provides no additional functionality
+ beyond vfio-pci.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/virtio/Makefile b/drivers/vfio/pci/virtio/Makefile
index 7171105baf33..d9b0bb40d6b3 100644
--- a/drivers/vfio/pci/virtio/Makefile
+++ b/drivers/vfio/pci/virtio/Makefile
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio-vfio-pci.o
-virtio-vfio-pci-y := main.o
+virtio-vfio-pci-y := main.o migrate.o
+virtio-vfio-pci-$(CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY) += legacy_io.o
diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h
new file mode 100644
index 000000000000..c7d7e27af386
--- /dev/null
+++ b/drivers/vfio/pci/virtio/common.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef VIRTIO_VFIO_COMMON_H
+#define VIRTIO_VFIO_COMMON_H
+
+#include <linux/kernel.h>
+#include <linux/virtio.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/virtio_pci.h>
+
+enum virtiovf_migf_state {
+ VIRTIOVF_MIGF_STATE_ERROR = 1,
+ VIRTIOVF_MIGF_STATE_PRECOPY = 2,
+ VIRTIOVF_MIGF_STATE_COMPLETE = 3,
+};
+
+enum virtiovf_load_state {
+ VIRTIOVF_LOAD_STATE_READ_HEADER,
+ VIRTIOVF_LOAD_STATE_PREP_HEADER_DATA,
+ VIRTIOVF_LOAD_STATE_READ_HEADER_DATA,
+ VIRTIOVF_LOAD_STATE_PREP_CHUNK,
+ VIRTIOVF_LOAD_STATE_READ_CHUNK,
+ VIRTIOVF_LOAD_STATE_LOAD_CHUNK,
+};
+
+struct virtiovf_data_buffer {
+ struct sg_append_table table;
+ loff_t start_pos;
+ u64 length;
+ u64 allocated_length;
+ struct list_head buf_elm;
+ u8 include_header_object:1;
+ struct virtiovf_migration_file *migf;
+ /* Optimize virtiovf_get_migration_page() for sequential access */
+ struct scatterlist *last_offset_sg;
+ unsigned int sg_last_entry;
+ unsigned long last_offset;
+};
+
+enum virtiovf_migf_header_flags {
+ VIRTIOVF_MIGF_HEADER_FLAGS_TAG_MANDATORY = 0,
+ VIRTIOVF_MIGF_HEADER_FLAGS_TAG_OPTIONAL = 1 << 0,
+};
+
+enum virtiovf_migf_header_tag {
+ VIRTIOVF_MIGF_HEADER_TAG_DEVICE_DATA = 0,
+};
+
+struct virtiovf_migration_header {
+ __le64 record_size;
+ /* For future use, in case the kernel protocol needs to change */
+ __le32 flags; /* Use virtiovf_migf_header_flags */
+ __le32 tag; /* Use virtiovf_migf_header_tag */
+ __u8 data[]; /* Its size is given by record_size */
+};
+
+struct virtiovf_migration_file {
+ struct file *filp;
+ /* synchronize access to the file state */
+ struct mutex lock;
+ loff_t max_pos;
+ u64 pre_copy_initial_bytes;
+ struct ratelimit_state pre_copy_rl_state;
+ u64 record_size;
+ u32 record_tag;
+ u8 has_obj_id:1;
+ u32 obj_id;
+ enum virtiovf_migf_state state;
+ enum virtiovf_load_state load_state;
+ /* synchronize access to the lists */
+ spinlock_t list_lock;
+ struct list_head buf_list;
+ struct list_head avail_list;
+ struct virtiovf_data_buffer *buf;
+ struct virtiovf_data_buffer *buf_header;
+ struct virtiovf_pci_core_device *virtvdev;
+};
+
+struct virtiovf_pci_core_device {
+ struct vfio_pci_core_device core_device;
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+ u8 *bar0_virtual_buf;
+ /* synchronize access to the virtual buf */
+ struct mutex bar_mutex;
+ void __iomem *notify_addr;
+ u64 notify_offset;
+ __le32 pci_base_addr_0;
+ __le16 pci_cmd;
+ u8 bar0_virtual_buf_size;
+ u8 notify_bar;
+#endif
+
+ /* LM related */
+ u8 migrate_cap:1;
+ u8 deferred_reset:1;
+ /* protect migration state */
+ struct mutex state_mutex;
+ enum vfio_device_mig_state mig_state;
+ /* protect the reset_done flow */
+ spinlock_t reset_lock;
+ struct virtiovf_migration_file *resuming_migf;
+ struct virtiovf_migration_file *saving_migf;
+};
+
+void virtiovf_set_migratable(struct virtiovf_pci_core_device *virtvdev);
+void virtiovf_open_migration(struct virtiovf_pci_core_device *virtvdev);
+void virtiovf_close_migration(struct virtiovf_pci_core_device *virtvdev);
+void virtiovf_migration_reset_done(struct pci_dev *pdev);
+
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev);
+long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev,
+ unsigned int cmd, unsigned long arg);
+int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
+ unsigned int cmd, unsigned long arg);
+ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev,
+ const char __user *buf, size_t count,
+ loff_t *ppos);
+ssize_t virtiovf_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
+ size_t count, loff_t *ppos);
+bool virtiovf_support_legacy_io(struct pci_dev *pdev);
+int virtiovf_init_legacy_io(struct virtiovf_pci_core_device *virtvdev);
+void virtiovf_release_legacy_io(struct virtiovf_pci_core_device *virtvdev);
+void virtiovf_legacy_io_reset_done(struct pci_dev *pdev);
+#endif
+
+#endif /* VIRTIO_VFIO_COMMON_H */
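
Each record in the saved-state stream is framed by struct virtiovf_migration_header defined above: a little-endian record_size, flags and tag, followed by record_size bytes of payload; per the flags enum, an unrecognized tag may only be skipped when the OPTIONAL flag is set. A userspace-style sketch of walking such a stream (the parser and its struct mirror are illustrative, not part of the driver):

#include <endian.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct stream_header {			/* mirrors virtiovf_migration_header */
	uint64_t record_size;
	uint32_t flags;
	uint32_t tag;
} __attribute__((packed));

static int walk_records(const uint8_t *buf, size_t len)
{
	size_t off = 0;

	while (off + sizeof(struct stream_header) <= len) {
		const struct stream_header *hdr = (const void *)(buf + off);
		uint64_t size = le64toh(hdr->record_size);

		if (size > len - off - sizeof(*hdr))
			return -1;	/* truncated record */

		printf("tag %u, %llu payload bytes\n",
		       le32toh(hdr->tag), (unsigned long long)size);
		off += sizeof(*hdr) + size;
	}
	return 0;
}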
diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c
new file mode 100644
index 000000000000..20382ee15fac
--- /dev/null
+++ b/drivers/vfio/pci/virtio/legacy_io.c
@@ -0,0 +1,418 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/pm_runtime.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/virtio_pci.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_pci_admin.h>
+
+#include "common.h"
+
+static int
+virtiovf_issue_legacy_rw_cmd(struct virtiovf_pci_core_device *virtvdev,
+ loff_t pos, char __user *buf,
+ size_t count, bool read)
+{
+ bool msix_enabled =
+ (virtvdev->core_device.irq_type == VFIO_PCI_MSIX_IRQ_INDEX);
+ struct pci_dev *pdev = virtvdev->core_device.pdev;
+ u8 *bar0_buf = virtvdev->bar0_virtual_buf;
+ bool common;
+ u8 offset;
+ int ret;
+
+ common = pos < VIRTIO_PCI_CONFIG_OFF(msix_enabled);
+ /* offset within the relevant configuration area */
+ offset = common ? pos : pos - VIRTIO_PCI_CONFIG_OFF(msix_enabled);
+ mutex_lock(&virtvdev->bar_mutex);
+ if (read) {
+ if (common)
+ ret = virtio_pci_admin_legacy_common_io_read(pdev, offset,
+ count, bar0_buf + pos);
+ else
+ ret = virtio_pci_admin_legacy_device_io_read(pdev, offset,
+ count, bar0_buf + pos);
+ if (ret)
+ goto out;
+ if (copy_to_user(buf, bar0_buf + pos, count))
+ ret = -EFAULT;
+ } else {
+ if (copy_from_user(bar0_buf + pos, buf, count)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (common)
+ ret = virtio_pci_admin_legacy_common_io_write(pdev, offset,
+ count, bar0_buf + pos);
+ else
+ ret = virtio_pci_admin_legacy_device_io_write(pdev, offset,
+ count, bar0_buf + pos);
+ }
+out:
+ mutex_unlock(&virtvdev->bar_mutex);
+ return ret;
+}
+
+static int
+virtiovf_pci_bar0_rw(struct virtiovf_pci_core_device *virtvdev,
+ loff_t pos, char __user *buf,
+ size_t count, bool read)
+{
+ struct vfio_pci_core_device *core_device = &virtvdev->core_device;
+ struct pci_dev *pdev = core_device->pdev;
+ u16 queue_notify;
+ int ret;
+
+ if (!(le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO))
+ return -EIO;
+
+ if (pos + count > virtvdev->bar0_virtual_buf_size)
+ return -EINVAL;
+
+ ret = pm_runtime_resume_and_get(&pdev->dev);
+ if (ret) {
+ pci_info_ratelimited(pdev, "runtime resume failed %d\n", ret);
+ return -EIO;
+ }
+
+ switch (pos) {
+ case VIRTIO_PCI_QUEUE_NOTIFY:
+ if (count != sizeof(queue_notify)) {
+ ret = -EINVAL;
+ goto end;
+ }
+ if (read) {
+ ret = vfio_pci_core_ioread16(core_device, true, &queue_notify,
+ virtvdev->notify_addr);
+ if (ret)
+ goto end;
+ if (copy_to_user(buf, &queue_notify,
+ sizeof(queue_notify))) {
+ ret = -EFAULT;
+ goto end;
+ }
+ } else {
+ if (copy_from_user(&queue_notify, buf, count)) {
+ ret = -EFAULT;
+ goto end;
+ }
+ ret = vfio_pci_core_iowrite16(core_device, true, queue_notify,
+ virtvdev->notify_addr);
+ }
+ break;
+ default:
+ ret = virtiovf_issue_legacy_rw_cmd(virtvdev, pos, buf, count,
+ read);
+ }
+
+end:
+ pm_runtime_put(&pdev->dev);
+ return ret ? ret : count;
+}
+
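+/*
+ * Overlay selected PCI config space fields on top of the core read so the
+ * VF appears as a transitional virtio-net device: device ID, revision,
+ * subsystem IDs and an I/O BAR0 are emulated accordingly.
+ */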
+static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev,
+ char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+ size_t register_offset;
+ loff_t copy_offset;
+ size_t copy_count;
+ __le32 val32;
+ __le16 val16;
+ u8 val8;
+ int ret;
+
+ ret = vfio_pci_core_read(core_vdev, buf, count, ppos);
+ if (ret < 0)
+ return ret;
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_DEVICE_ID,
+ sizeof(val16), &copy_offset,
+ &copy_count, &register_offset)) {
+ val16 = cpu_to_le16(VIRTIO_TRANS_ID_NET);
+ if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, copy_count))
+ return -EFAULT;
+ }
+
+ if ((le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO) &&
+ vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND,
+ sizeof(val16), &copy_offset,
+ &copy_count, &register_offset)) {
+ if (copy_from_user((void *)&val16 + register_offset, buf + copy_offset,
+ copy_count))
+ return -EFAULT;
+ val16 |= cpu_to_le16(PCI_COMMAND_IO);
+ if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset,
+ copy_count))
+ return -EFAULT;
+ }
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_REVISION_ID,
+ sizeof(val8), &copy_offset,
+ &copy_count, &register_offset)) {
+ /* Transitional devices need to have revision 0 */
+ val8 = 0;
+ if (copy_to_user(buf + copy_offset, &val8, copy_count))
+ return -EFAULT;
+ }
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0,
+ sizeof(val32), &copy_offset,
+ &copy_count, &register_offset)) {
+ u32 bar_mask = ~(virtvdev->bar0_virtual_buf_size - 1);
+ u32 pci_base_addr_0 = le32_to_cpu(virtvdev->pci_base_addr_0);
+
+ val32 = cpu_to_le32((pci_base_addr_0 & bar_mask) | PCI_BASE_ADDRESS_SPACE_IO);
+ if (copy_to_user(buf + copy_offset, (void *)&val32 + register_offset, copy_count))
+ return -EFAULT;
+ }
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_ID,
+ sizeof(val16), &copy_offset,
+ &copy_count, &register_offset)) {
+ /*
+ * Transitional devices use the PCI subsystem device ID as the
+ * virtio device ID, just as the legacy driver always did.
+ */
+ val16 = cpu_to_le16(VIRTIO_ID_NET);
+ if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset,
+ copy_count))
+ return -EFAULT;
+ }
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID,
+ sizeof(val16), &copy_offset,
+ &copy_count, &register_offset)) {
+ val16 = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET);
+ if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset,
+ copy_count))
+ return -EFAULT;
+ }
+
+ return count;
+}
+
+ssize_t virtiovf_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+ if (!count)
+ return 0;
+
+ if (index == VFIO_PCI_CONFIG_REGION_INDEX)
+ return virtiovf_pci_read_config(core_vdev, buf, count, ppos);
+
+ if (index == VFIO_PCI_BAR0_REGION_INDEX)
+ return virtiovf_pci_bar0_rw(virtvdev, pos, buf, count, true);
+
+ return vfio_pci_core_read(core_vdev, buf, count, ppos);
+}
+
+static ssize_t virtiovf_pci_write_config(struct vfio_device *core_vdev,
+ const char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+ size_t register_offset;
+ loff_t copy_offset;
+ size_t copy_count;
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND,
+ sizeof(virtvdev->pci_cmd),
+ &copy_offset, &copy_count,
+ &register_offset)) {
+ if (copy_from_user((void *)&virtvdev->pci_cmd + register_offset,
+ buf + copy_offset,
+ copy_count))
+ return -EFAULT;
+ }
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0,
+ sizeof(virtvdev->pci_base_addr_0),
+ &copy_offset, &copy_count,
+ &register_offset)) {
+ if (copy_from_user((void *)&virtvdev->pci_base_addr_0 + register_offset,
+ buf + copy_offset,
+ copy_count))
+ return -EFAULT;
+ }
+
+ return vfio_pci_core_write(core_vdev, buf, count, ppos);
+}
+
+ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+ if (!count)
+ return 0;
+
+ if (index == VFIO_PCI_CONFIG_REGION_INDEX)
+ return virtiovf_pci_write_config(core_vdev, buf, count, ppos);
+
+ if (index == VFIO_PCI_BAR0_REGION_INDEX)
+ return virtiovf_pci_bar0_rw(virtvdev, pos, (char __user *)buf, count, false);
+
+ return vfio_pci_core_write(core_vdev, buf, count, ppos);
+}
+
+int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
+ unsigned int cmd, unsigned long arg)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ unsigned long minsz = offsetofend(struct vfio_region_info, offset);
+ void __user *uarg = (void __user *)arg;
+ struct vfio_region_info info = {};
+
+ if (copy_from_user(&info, uarg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ switch (info.index) {
+ case VFIO_PCI_BAR0_REGION_INDEX:
+ info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+ info.size = virtvdev->bar0_virtual_buf_size;
+ info.flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE;
+ return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0;
+ default:
+ return vfio_pci_core_ioctl(core_vdev, cmd, arg);
+ }
+}
+
+long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (cmd) {
+ case VFIO_DEVICE_GET_REGION_INFO:
+ return virtiovf_pci_ioctl_get_region_info(core_vdev, cmd, arg);
+ default:
+ return vfio_pci_core_ioctl(core_vdev, cmd, arg);
+ }
+}
+
+static int virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev)
+{
+ struct vfio_pci_core_device *core_device = &virtvdev->core_device;
+ int ret;
+
+ /*
+ * Set up the BAR where the 'notify' area lives so that vfio can use it
+ * as well. This lets us mmap it only once and reuse it whenever needed.
+ */
+ ret = vfio_pci_core_setup_barmap(core_device,
+ virtvdev->notify_bar);
+ if (ret)
+ return ret;
+
+ virtvdev->notify_addr = core_device->barmap[virtvdev->notify_bar] +
+ virtvdev->notify_offset;
+ return 0;
+}
+
+int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev)
+{
+ if (!virtvdev->bar0_virtual_buf)
+ return 0;
+
+ /*
+ * Upon close_device(), vfio_pci_core_disable() is called and tears down
+ * all previous mmaps, so the valid life cycle of the 'notify' address is
+ * per open/close.
+ */
+ return virtiovf_set_notify_addr(virtvdev);
+}
+
+static int virtiovf_get_device_config_size(unsigned short device)
+{
+ /* Network card */
+ return offsetofend(struct virtio_net_config, status);
+}
+
+static int virtiovf_read_notify_info(struct virtiovf_pci_core_device *virtvdev)
+{
+ u64 offset;
+ int ret;
+ u8 bar;
+
+ ret = virtio_pci_admin_legacy_io_notify_info(virtvdev->core_device.pdev,
+ VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_OWNER_MEM,
+ &bar, &offset);
+ if (ret)
+ return ret;
+
+ virtvdev->notify_bar = bar;
+ virtvdev->notify_offset = offset;
+ return 0;
+}
+
+static bool virtiovf_bar0_exists(struct pci_dev *pdev)
+{
+ struct resource *res = pdev->resource;
+
+ return res->flags;
+}
+
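+/*
+ * Legacy I/O can be emulated only when the device supports the legacy I/O
+ * admin commands and does not itself expose a BAR0.
+ */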
+bool virtiovf_support_legacy_io(struct pci_dev *pdev)
+{
+ return virtio_pci_admin_has_legacy_io(pdev) && !virtiovf_bar0_exists(pdev);
+}
+
+int virtiovf_init_legacy_io(struct virtiovf_pci_core_device *virtvdev)
+{
+ struct pci_dev *pdev = virtvdev->core_device.pdev;
+ int ret;
+
+ ret = virtiovf_read_notify_info(virtvdev);
+ if (ret)
+ return ret;
+
+ virtvdev->bar0_virtual_buf_size = VIRTIO_PCI_CONFIG_OFF(true) +
+ virtiovf_get_device_config_size(pdev->device);
+ BUILD_BUG_ON(!is_power_of_2(virtvdev->bar0_virtual_buf_size));
+ virtvdev->bar0_virtual_buf = kzalloc(virtvdev->bar0_virtual_buf_size,
+ GFP_KERNEL);
+ if (!virtvdev->bar0_virtual_buf)
+ return -ENOMEM;
+ mutex_init(&virtvdev->bar_mutex);
+ return 0;
+}
+
+void virtiovf_release_legacy_io(struct virtiovf_pci_core_device *virtvdev)
+{
+ kfree(virtvdev->bar0_virtual_buf);
+}
+
+void virtiovf_legacy_io_reset_done(struct pci_dev *pdev)
+{
+ struct virtiovf_pci_core_device *virtvdev = dev_get_drvdata(&pdev->dev);
+
+ virtvdev->pci_cmd = 0;
+}
diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c
index d5af683837d3..d534d48c4163 100644
--- a/drivers/vfio/pci/virtio/main.c
+++ b/drivers/vfio/pci/virtio/main.c
@@ -16,367 +16,12 @@
#include <linux/virtio_net.h>
#include <linux/virtio_pci_admin.h>
-struct virtiovf_pci_core_device {
- struct vfio_pci_core_device core_device;
- u8 *bar0_virtual_buf;
- /* synchronize access to the virtual buf */
- struct mutex bar_mutex;
- void __iomem *notify_addr;
- u64 notify_offset;
- __le32 pci_base_addr_0;
- __le16 pci_cmd;
- u8 bar0_virtual_buf_size;
- u8 notify_bar;
-};
-
-static int
-virtiovf_issue_legacy_rw_cmd(struct virtiovf_pci_core_device *virtvdev,
- loff_t pos, char __user *buf,
- size_t count, bool read)
-{
- bool msix_enabled =
- (virtvdev->core_device.irq_type == VFIO_PCI_MSIX_IRQ_INDEX);
- struct pci_dev *pdev = virtvdev->core_device.pdev;
- u8 *bar0_buf = virtvdev->bar0_virtual_buf;
- bool common;
- u8 offset;
- int ret;
-
- common = pos < VIRTIO_PCI_CONFIG_OFF(msix_enabled);
- /* offset within the relevant configuration area */
- offset = common ? pos : pos - VIRTIO_PCI_CONFIG_OFF(msix_enabled);
- mutex_lock(&virtvdev->bar_mutex);
- if (read) {
- if (common)
- ret = virtio_pci_admin_legacy_common_io_read(pdev, offset,
- count, bar0_buf + pos);
- else
- ret = virtio_pci_admin_legacy_device_io_read(pdev, offset,
- count, bar0_buf + pos);
- if (ret)
- goto out;
- if (copy_to_user(buf, bar0_buf + pos, count))
- ret = -EFAULT;
- } else {
- if (copy_from_user(bar0_buf + pos, buf, count)) {
- ret = -EFAULT;
- goto out;
- }
-
- if (common)
- ret = virtio_pci_admin_legacy_common_io_write(pdev, offset,
- count, bar0_buf + pos);
- else
- ret = virtio_pci_admin_legacy_device_io_write(pdev, offset,
- count, bar0_buf + pos);
- }
-out:
- mutex_unlock(&virtvdev->bar_mutex);
- return ret;
-}
-
-static int
-virtiovf_pci_bar0_rw(struct virtiovf_pci_core_device *virtvdev,
- loff_t pos, char __user *buf,
- size_t count, bool read)
-{
- struct vfio_pci_core_device *core_device = &virtvdev->core_device;
- struct pci_dev *pdev = core_device->pdev;
- u16 queue_notify;
- int ret;
-
- if (!(le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO))
- return -EIO;
-
- if (pos + count > virtvdev->bar0_virtual_buf_size)
- return -EINVAL;
-
- ret = pm_runtime_resume_and_get(&pdev->dev);
- if (ret) {
- pci_info_ratelimited(pdev, "runtime resume failed %d\n", ret);
- return -EIO;
- }
-
- switch (pos) {
- case VIRTIO_PCI_QUEUE_NOTIFY:
- if (count != sizeof(queue_notify)) {
- ret = -EINVAL;
- goto end;
- }
- if (read) {
- ret = vfio_pci_core_ioread16(core_device, true, &queue_notify,
- virtvdev->notify_addr);
- if (ret)
- goto end;
- if (copy_to_user(buf, &queue_notify,
- sizeof(queue_notify))) {
- ret = -EFAULT;
- goto end;
- }
- } else {
- if (copy_from_user(&queue_notify, buf, count)) {
- ret = -EFAULT;
- goto end;
- }
- ret = vfio_pci_core_iowrite16(core_device, true, queue_notify,
- virtvdev->notify_addr);
- }
- break;
- default:
- ret = virtiovf_issue_legacy_rw_cmd(virtvdev, pos, buf, count,
- read);
- }
-
-end:
- pm_runtime_put(&pdev->dev);
- return ret ? ret : count;
-}
-
-static bool range_intersect_range(loff_t range1_start, size_t count1,
- loff_t range2_start, size_t count2,
- loff_t *start_offset,
- size_t *intersect_count,
- size_t *register_offset)
-{
- if (range1_start <= range2_start &&
- range1_start + count1 > range2_start) {
- *start_offset = range2_start - range1_start;
- *intersect_count = min_t(size_t, count2,
- range1_start + count1 - range2_start);
- *register_offset = 0;
- return true;
- }
-
- if (range1_start > range2_start &&
- range1_start < range2_start + count2) {
- *start_offset = 0;
- *intersect_count = min_t(size_t, count1,
- range2_start + count2 - range1_start);
- *register_offset = range1_start - range2_start;
- return true;
- }
-
- return false;
-}
-
-static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev,
- char __user *buf, size_t count,
- loff_t *ppos)
-{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
- loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
- size_t register_offset;
- loff_t copy_offset;
- size_t copy_count;
- __le32 val32;
- __le16 val16;
- u8 val8;
- int ret;
-
- ret = vfio_pci_core_read(core_vdev, buf, count, ppos);
- if (ret < 0)
- return ret;
-
- if (range_intersect_range(pos, count, PCI_DEVICE_ID, sizeof(val16),
- &copy_offset, &copy_count, &register_offset)) {
- val16 = cpu_to_le16(VIRTIO_TRANS_ID_NET);
- if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, copy_count))
- return -EFAULT;
- }
-
- if ((le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO) &&
- range_intersect_range(pos, count, PCI_COMMAND, sizeof(val16),
- &copy_offset, &copy_count, &register_offset)) {
- if (copy_from_user((void *)&val16 + register_offset, buf + copy_offset,
- copy_count))
- return -EFAULT;
- val16 |= cpu_to_le16(PCI_COMMAND_IO);
- if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset,
- copy_count))
- return -EFAULT;
- }
-
- if (range_intersect_range(pos, count, PCI_REVISION_ID, sizeof(val8),
- &copy_offset, &copy_count, &register_offset)) {
- /* Transional needs to have revision 0 */
- val8 = 0;
- if (copy_to_user(buf + copy_offset, &val8, copy_count))
- return -EFAULT;
- }
-
- if (range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, sizeof(val32),
- &copy_offset, &copy_count, &register_offset)) {
- u32 bar_mask = ~(virtvdev->bar0_virtual_buf_size - 1);
- u32 pci_base_addr_0 = le32_to_cpu(virtvdev->pci_base_addr_0);
-
- val32 = cpu_to_le32((pci_base_addr_0 & bar_mask) | PCI_BASE_ADDRESS_SPACE_IO);
- if (copy_to_user(buf + copy_offset, (void *)&val32 + register_offset, copy_count))
- return -EFAULT;
- }
-
- if (range_intersect_range(pos, count, PCI_SUBSYSTEM_ID, sizeof(val16),
- &copy_offset, &copy_count, &register_offset)) {
- /*
- * Transitional devices use the PCI subsystem device id as
- * virtio device id, same as legacy driver always did.
- */
- val16 = cpu_to_le16(VIRTIO_ID_NET);
- if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset,
- copy_count))
- return -EFAULT;
- }
-
- if (range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID, sizeof(val16),
- &copy_offset, &copy_count, &register_offset)) {
- val16 = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET);
- if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset,
- copy_count))
- return -EFAULT;
- }
-
- return count;
-}
-
-static ssize_t
-virtiovf_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
- unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
- loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
-
- if (!count)
- return 0;
-
- if (index == VFIO_PCI_CONFIG_REGION_INDEX)
- return virtiovf_pci_read_config(core_vdev, buf, count, ppos);
-
- if (index == VFIO_PCI_BAR0_REGION_INDEX)
- return virtiovf_pci_bar0_rw(virtvdev, pos, buf, count, true);
-
- return vfio_pci_core_read(core_vdev, buf, count, ppos);
-}
-
-static ssize_t virtiovf_pci_write_config(struct vfio_device *core_vdev,
- const char __user *buf, size_t count,
- loff_t *ppos)
-{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
- loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
- size_t register_offset;
- loff_t copy_offset;
- size_t copy_count;
-
- if (range_intersect_range(pos, count, PCI_COMMAND, sizeof(virtvdev->pci_cmd),
- &copy_offset, &copy_count,
- &register_offset)) {
- if (copy_from_user((void *)&virtvdev->pci_cmd + register_offset,
- buf + copy_offset,
- copy_count))
- return -EFAULT;
- }
-
- if (range_intersect_range(pos, count, PCI_BASE_ADDRESS_0,
- sizeof(virtvdev->pci_base_addr_0),
- &copy_offset, &copy_count,
- &register_offset)) {
- if (copy_from_user((void *)&virtvdev->pci_base_addr_0 + register_offset,
- buf + copy_offset,
- copy_count))
- return -EFAULT;
- }
-
- return vfio_pci_core_write(core_vdev, buf, count, ppos);
-}
-
-static ssize_t
-virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
- unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
- loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
-
- if (!count)
- return 0;
-
- if (index == VFIO_PCI_CONFIG_REGION_INDEX)
- return virtiovf_pci_write_config(core_vdev, buf, count, ppos);
-
- if (index == VFIO_PCI_BAR0_REGION_INDEX)
- return virtiovf_pci_bar0_rw(virtvdev, pos, (char __user *)buf, count, false);
-
- return vfio_pci_core_write(core_vdev, buf, count, ppos);
-}
-
-static int
-virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
- unsigned int cmd, unsigned long arg)
-{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
- unsigned long minsz = offsetofend(struct vfio_region_info, offset);
- void __user *uarg = (void __user *)arg;
- struct vfio_region_info info = {};
-
- if (copy_from_user(&info, uarg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
-
- switch (info.index) {
- case VFIO_PCI_BAR0_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = virtvdev->bar0_virtual_buf_size;
- info.flags = VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE;
- return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0;
- default:
- return vfio_pci_core_ioctl(core_vdev, cmd, arg);
- }
-}
-
-static long
-virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
- unsigned long arg)
-{
- switch (cmd) {
- case VFIO_DEVICE_GET_REGION_INFO:
- return virtiovf_pci_ioctl_get_region_info(core_vdev, cmd, arg);
- default:
- return vfio_pci_core_ioctl(core_vdev, cmd, arg);
- }
-}
-
-static int
-virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev)
-{
- struct vfio_pci_core_device *core_device = &virtvdev->core_device;
- int ret;
-
- /*
- * Setup the BAR where the 'notify' exists to be used by vfio as well
- * This will let us mmap it only once and use it when needed.
- */
- ret = vfio_pci_core_setup_barmap(core_device,
- virtvdev->notify_bar);
- if (ret)
- return ret;
-
- virtvdev->notify_addr = core_device->barmap[virtvdev->notify_bar] +
- virtvdev->notify_offset;
- return 0;
-}
+#include "common.h"
static int virtiovf_pci_open_device(struct vfio_device *core_vdev)
{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ struct virtiovf_pci_core_device *virtvdev = container_of(core_vdev,
+ struct virtiovf_pci_core_device, core_device.vdev);
struct vfio_pci_core_device *vdev = &virtvdev->core_device;
int ret;
@@ -384,88 +29,84 @@ static int virtiovf_pci_open_device(struct vfio_device *core_vdev)
if (ret)
return ret;
- if (virtvdev->bar0_virtual_buf) {
- /*
- * Upon close_device() the vfio_pci_core_disable() is called
- * and will close all the previous mmaps, so it seems that the
- * valid life cycle for the 'notify' addr is per open/close.
- */
- ret = virtiovf_set_notify_addr(virtvdev);
- if (ret) {
- vfio_pci_core_disable(vdev);
- return ret;
- }
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+ ret = virtiovf_open_legacy_io(virtvdev);
+ if (ret) {
+ vfio_pci_core_disable(vdev);
+ return ret;
}
+#endif
+ virtiovf_open_migration(virtvdev);
vfio_pci_core_finish_enable(vdev);
return 0;
}
-static int virtiovf_get_device_config_size(unsigned short device)
-{
- /* Network card */
- return offsetofend(struct virtio_net_config, status);
-}
-
-static int virtiovf_read_notify_info(struct virtiovf_pci_core_device *virtvdev)
+static void virtiovf_pci_close_device(struct vfio_device *core_vdev)
{
- u64 offset;
- int ret;
- u8 bar;
+ struct virtiovf_pci_core_device *virtvdev = container_of(core_vdev,
+ struct virtiovf_pci_core_device, core_device.vdev);
- ret = virtio_pci_admin_legacy_io_notify_info(virtvdev->core_device.pdev,
- VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_OWNER_MEM,
- &bar, &offset);
- if (ret)
- return ret;
-
- virtvdev->notify_bar = bar;
- virtvdev->notify_offset = offset;
- return 0;
+ virtiovf_close_migration(virtvdev);
+ vfio_pci_core_close_device(core_vdev);
}
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
static int virtiovf_pci_init_device(struct vfio_device *core_vdev)
{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
- struct pci_dev *pdev;
+ struct virtiovf_pci_core_device *virtvdev = container_of(core_vdev,
+ struct virtiovf_pci_core_device, core_device.vdev);
int ret;
ret = vfio_pci_core_init_dev(core_vdev);
if (ret)
return ret;
- pdev = virtvdev->core_device.pdev;
- ret = virtiovf_read_notify_info(virtvdev);
- if (ret)
- return ret;
-
- virtvdev->bar0_virtual_buf_size = VIRTIO_PCI_CONFIG_OFF(true) +
- virtiovf_get_device_config_size(pdev->device);
- BUILD_BUG_ON(!is_power_of_2(virtvdev->bar0_virtual_buf_size));
- virtvdev->bar0_virtual_buf = kzalloc(virtvdev->bar0_virtual_buf_size,
- GFP_KERNEL);
- if (!virtvdev->bar0_virtual_buf)
- return -ENOMEM;
- mutex_init(&virtvdev->bar_mutex);
- return 0;
+ /*
+ * The vfio_device_ops.init() callback is set to virtiovf_pci_init_device()
+ * only when legacy I/O is supported, so initialize it here.
+ */
+ return virtiovf_init_legacy_io(virtvdev);
}
+#endif
static void virtiovf_pci_core_release_dev(struct vfio_device *core_vdev)
{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+ struct virtiovf_pci_core_device *virtvdev = container_of(core_vdev,
+ struct virtiovf_pci_core_device, core_device.vdev);
- kfree(virtvdev->bar0_virtual_buf);
+ virtiovf_release_legacy_io(virtvdev);
+#endif
vfio_pci_core_release_dev(core_vdev);
}
-static const struct vfio_device_ops virtiovf_vfio_pci_tran_ops = {
- .name = "virtio-vfio-pci-trans",
+static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = {
+ .name = "virtio-vfio-pci-lm",
+ .init = vfio_pci_core_init_dev,
+ .release = virtiovf_pci_core_release_dev,
+ .open_device = virtiovf_pci_open_device,
+ .close_device = virtiovf_pci_close_device,
+ .ioctl = vfio_pci_core_ioctl,
+ .device_feature = vfio_pci_core_ioctl_feature,
+ .read = vfio_pci_core_read,
+ .write = vfio_pci_core_write,
+ .mmap = vfio_pci_core_mmap,
+ .request = vfio_pci_core_request,
+ .match = vfio_pci_core_match,
+ .bind_iommufd = vfio_iommufd_physical_bind,
+ .unbind_iommufd = vfio_iommufd_physical_unbind,
+ .attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
+};
+
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = {
+ .name = "virtio-vfio-pci-trans-lm",
.init = virtiovf_pci_init_device,
.release = virtiovf_pci_core_release_dev,
.open_device = virtiovf_pci_open_device,
- .close_device = vfio_pci_core_close_device,
+ .close_device = virtiovf_pci_close_device,
.ioctl = virtiovf_vfio_pci_core_ioctl,
.device_feature = vfio_pci_core_ioctl_feature,
.read = virtiovf_pci_core_read,
@@ -478,6 +119,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_ops = {
.attach_ioas = vfio_iommufd_physical_attach_ioas,
.detach_ioas = vfio_iommufd_physical_detach_ioas,
};
+#endif
static const struct vfio_device_ops virtiovf_vfio_pci_ops = {
.name = "virtio-vfio-pci",
@@ -498,29 +140,34 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = {
.detach_ioas = vfio_iommufd_physical_detach_ioas,
};
-static bool virtiovf_bar0_exists(struct pci_dev *pdev)
-{
- struct resource *res = pdev->resource;
-
- return res->flags;
-}
-
static int virtiovf_pci_probe(struct pci_dev *pdev,
const struct pci_device_id *id)
{
const struct vfio_device_ops *ops = &virtiovf_vfio_pci_ops;
struct virtiovf_pci_core_device *virtvdev;
+ bool sup_legacy_io = false;
+ bool sup_lm = false;
int ret;
- if (pdev->is_virtfn && virtio_pci_admin_has_legacy_io(pdev) &&
- !virtiovf_bar0_exists(pdev))
- ops = &virtiovf_vfio_pci_tran_ops;
+ if (pdev->is_virtfn) {
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+ sup_legacy_io = virtiovf_support_legacy_io(pdev);
+ if (sup_legacy_io)
+ ops = &virtiovf_vfio_pci_tran_lm_ops;
+#endif
+ sup_lm = virtio_pci_admin_has_dev_parts(pdev);
+ if (sup_lm && !sup_legacy_io)
+ ops = &virtiovf_vfio_pci_lm_ops;
+ }
virtvdev = vfio_alloc_device(virtiovf_pci_core_device, core_device.vdev,
&pdev->dev, ops);
if (IS_ERR(virtvdev))
return PTR_ERR(virtvdev);
+ if (sup_lm)
+ virtiovf_set_migratable(virtvdev);
+
dev_set_drvdata(&pdev->dev, &virtvdev->core_device);
ret = vfio_pci_core_register_device(&virtvdev->core_device);
if (ret)
@@ -549,9 +196,10 @@ MODULE_DEVICE_TABLE(pci, virtiovf_pci_table);
static void virtiovf_pci_aer_reset_done(struct pci_dev *pdev)
{
- struct virtiovf_pci_core_device *virtvdev = dev_get_drvdata(&pdev->dev);
-
- virtvdev->pci_cmd = 0;
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+ virtiovf_legacy_io_reset_done(pdev);
+#endif
+ virtiovf_migration_reset_done(pdev);
}
static const struct pci_error_handlers virtiovf_err_handlers = {
diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c
new file mode 100644
index 000000000000..ba92bb4e9af9
--- /dev/null
+++ b/drivers/vfio/pci/virtio/migrate.c
@@ -0,0 +1,1337 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/pm_runtime.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/virtio_pci.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_pci_admin.h>
+#include <linux/anon_inodes.h>
+
+#include "common.h"
+
+/* Device specification max parts size */
+#define MAX_LOAD_SIZE (BIT_ULL(BITS_PER_TYPE \
+ (((struct virtio_admin_cmd_dev_parts_metadata_result *)0)->parts_size.size)) - 1)
+
+/* Initial target buffer size */
+#define VIRTIOVF_TARGET_INITIAL_BUF_SIZE SZ_1M
+
+static int
+virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
+ u32 ctx_size);
+
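+/*
+ * Look up the page backing @offset in the buffer's scatter list. Accesses
+ * are sequential, so the last visited SG entry is cached to avoid walking
+ * the whole table on every call.
+ */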
+static struct page *
+virtiovf_get_migration_page(struct virtiovf_data_buffer *buf,
+ unsigned long offset)
+{
+ unsigned long cur_offset = 0;
+ struct scatterlist *sg;
+ unsigned int i;
+
+ /* All accesses are sequential */
+ if (offset < buf->last_offset || !buf->last_offset_sg) {
+ buf->last_offset = 0;
+ buf->last_offset_sg = buf->table.sgt.sgl;
+ buf->sg_last_entry = 0;
+ }
+
+ cur_offset = buf->last_offset;
+
+ for_each_sg(buf->last_offset_sg, sg,
+ buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
+ if (offset < sg->length + cur_offset) {
+ buf->last_offset_sg = sg;
+ buf->sg_last_entry += i;
+ buf->last_offset = cur_offset;
+ return nth_page(sg_page(sg),
+ (offset - cur_offset) / PAGE_SIZE);
+ }
+ cur_offset += sg->length;
+ }
+ return NULL;
+}
+
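+/*
+ * Grow the data buffer by @npages pages, allocated in bulk and appended to
+ * the buffer's scatter-gather table.
+ */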
+static int virtiovf_add_migration_pages(struct virtiovf_data_buffer *buf,
+ unsigned int npages)
+{
+ unsigned int to_alloc = npages;
+ struct page **page_list;
+ unsigned long filled;
+ unsigned int to_fill;
+ int ret;
+ int i;
+
+ to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
+ page_list = kvcalloc(to_fill, sizeof(*page_list), GFP_KERNEL_ACCOUNT);
+ if (!page_list)
+ return -ENOMEM;
+
+ do {
+ filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, to_fill,
+ page_list);
+ if (!filled) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ to_alloc -= filled;
+ ret = sg_alloc_append_table_from_pages(&buf->table, page_list,
+ filled, 0, filled << PAGE_SHIFT, UINT_MAX,
+ SG_MAX_SINGLE_ALLOC, GFP_KERNEL_ACCOUNT);
+
+ if (ret)
+ goto err_append;
+ buf->allocated_length += filled * PAGE_SIZE;
+ /* clean input for another bulk allocation */
+ memset(page_list, 0, filled * sizeof(*page_list));
+ to_fill = min_t(unsigned int, to_alloc,
+ PAGE_SIZE / sizeof(*page_list));
+ } while (to_alloc > 0);
+
+ kvfree(page_list);
+ return 0;
+
+err_append:
+ for (i = filled - 1; i >= 0; i--)
+ __free_page(page_list[i]);
+err:
+ kvfree(page_list);
+ return ret;
+}
+
+static void virtiovf_free_data_buffer(struct virtiovf_data_buffer *buf)
+{
+ struct sg_page_iter sg_iter;
+
+ /* Undo alloc_pages_bulk() */
+ for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
+ __free_page(sg_page_iter_page(&sg_iter));
+ sg_free_append_table(&buf->table);
+ kfree(buf);
+}
+
+static struct virtiovf_data_buffer *
+virtiovf_alloc_data_buffer(struct virtiovf_migration_file *migf, size_t length)
+{
+ struct virtiovf_data_buffer *buf;
+ int ret;
+
+ buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
+ ret = virtiovf_add_migration_pages(buf,
+ DIV_ROUND_UP_ULL(length, PAGE_SIZE));
+ if (ret)
+ goto end;
+
+ buf->migf = migf;
+ return buf;
+end:
+ virtiovf_free_data_buffer(buf);
+ return ERR_PTR(ret);
+}
+
+static void virtiovf_put_data_buffer(struct virtiovf_data_buffer *buf)
+{
+ spin_lock_irq(&buf->migf->list_lock);
+ list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
+ spin_unlock_irq(&buf->migf->list_lock);
+}
+
+static int
+virtiovf_pci_alloc_obj_id(struct virtiovf_pci_core_device *virtvdev, u8 type,
+ u32 *obj_id)
+{
+ return virtio_pci_admin_obj_create(virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS, type, obj_id);
+}
+
+static void
+virtiovf_pci_free_obj_id(struct virtiovf_pci_core_device *virtvdev, u32 obj_id)
+{
+ virtio_pci_admin_obj_destroy(virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id);
+}
+
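+/*
+ * Reuse an available buffer that is large enough for @length, freeing any
+ * smaller ones encountered on the way; otherwise allocate a new buffer.
+ */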
+static struct virtiovf_data_buffer *
+virtiovf_get_data_buffer(struct virtiovf_migration_file *migf, size_t length)
+{
+ struct virtiovf_data_buffer *buf, *temp_buf;
+ struct list_head free_list;
+
+ INIT_LIST_HEAD(&free_list);
+
+ spin_lock_irq(&migf->list_lock);
+ list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
+ list_del_init(&buf->buf_elm);
+ if (buf->allocated_length >= length) {
+ spin_unlock_irq(&migf->list_lock);
+ goto found;
+ }
+ /*
+ * Prevent holding redundant buffers. Put them on a free
+ * list and release them at the end, outside the spin lock
+ * (&migf->list_lock), to minimize the time it is held.
+ */
+ list_add(&buf->buf_elm, &free_list);
+ }
+ spin_unlock_irq(&migf->list_lock);
+ buf = virtiovf_alloc_data_buffer(migf, length);
+
+found:
+ while ((temp_buf = list_first_entry_or_null(&free_list,
+ struct virtiovf_data_buffer, buf_elm))) {
+ list_del(&temp_buf->buf_elm);
+ virtiovf_free_data_buffer(temp_buf);
+ }
+
+ return buf;
+}
+
+static void virtiovf_clean_migf_resources(struct virtiovf_migration_file *migf)
+{
+ struct virtiovf_data_buffer *entry;
+
+ if (migf->buf) {
+ virtiovf_free_data_buffer(migf->buf);
+ migf->buf = NULL;
+ }
+
+ if (migf->buf_header) {
+ virtiovf_free_data_buffer(migf->buf_header);
+ migf->buf_header = NULL;
+ }
+
+ list_splice(&migf->avail_list, &migf->buf_list);
+
+ while ((entry = list_first_entry_or_null(&migf->buf_list,
+ struct virtiovf_data_buffer, buf_elm))) {
+ list_del(&entry->buf_elm);
+ virtiovf_free_data_buffer(entry);
+ }
+
+ if (migf->has_obj_id)
+ virtiovf_pci_free_obj_id(migf->virtvdev, migf->obj_id);
+}
+
+static void virtiovf_disable_fd(struct virtiovf_migration_file *migf)
+{
+ mutex_lock(&migf->lock);
+ migf->state = VIRTIOVF_MIGF_STATE_ERROR;
+ migf->filp->f_pos = 0;
+ mutex_unlock(&migf->lock);
+}
+
+static void virtiovf_disable_fds(struct virtiovf_pci_core_device *virtvdev)
+{
+ if (virtvdev->resuming_migf) {
+ virtiovf_disable_fd(virtvdev->resuming_migf);
+ virtiovf_clean_migf_resources(virtvdev->resuming_migf);
+ fput(virtvdev->resuming_migf->filp);
+ virtvdev->resuming_migf = NULL;
+ }
+ if (virtvdev->saving_migf) {
+ virtiovf_disable_fd(virtvdev->saving_migf);
+ virtiovf_clean_migf_resources(virtvdev->saving_migf);
+ fput(virtvdev->saving_migf->filp);
+ virtvdev->saving_migf = NULL;
+ }
+}
+
+/*
+ * This function is called in all state_mutex unlock cases to
+ * handle a 'deferred_reset', if one exists.
+ */
+static void virtiovf_state_mutex_unlock(struct virtiovf_pci_core_device *virtvdev)
+{
+again:
+ spin_lock(&virtvdev->reset_lock);
+ if (virtvdev->deferred_reset) {
+ virtvdev->deferred_reset = false;
+ spin_unlock(&virtvdev->reset_lock);
+ virtvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+ virtiovf_disable_fds(virtvdev);
+ goto again;
+ }
+ mutex_unlock(&virtvdev->state_mutex);
+ spin_unlock(&virtvdev->reset_lock);
+}
+
+void virtiovf_migration_reset_done(struct pci_dev *pdev)
+{
+ struct virtiovf_pci_core_device *virtvdev = dev_get_drvdata(&pdev->dev);
+
+ if (!virtvdev->migrate_cap)
+ return;
+
+ /*
+ * As the higher VFIO layers hold locks across reset and use those same
+ * locks together with the mm_lock, we need to prevent an ABBA deadlock
+ * between the state_mutex and the mm_lock.
+ * If the state_mutex was already taken, defer the cleanup work to the
+ * unlock flow of the other running context.
+ */
+ spin_lock(&virtvdev->reset_lock);
+ virtvdev->deferred_reset = true;
+ if (!mutex_trylock(&virtvdev->state_mutex)) {
+ spin_unlock(&virtvdev->reset_lock);
+ return;
+ }
+ spin_unlock(&virtvdev->reset_lock);
+ virtiovf_state_mutex_unlock(virtvdev);
+}
+
+static int virtiovf_release_file(struct inode *inode, struct file *filp)
+{
+ struct virtiovf_migration_file *migf = filp->private_data;
+
+ virtiovf_disable_fd(migf);
+ mutex_destroy(&migf->lock);
+ kfree(migf);
+ return 0;
+}
+
+static struct virtiovf_data_buffer *
+virtiovf_get_data_buff_from_pos(struct virtiovf_migration_file *migf,
+ loff_t pos, bool *end_of_data)
+{
+ struct virtiovf_data_buffer *buf;
+ bool found = false;
+
+ *end_of_data = false;
+ spin_lock_irq(&migf->list_lock);
+ if (list_empty(&migf->buf_list)) {
+ *end_of_data = true;
+ goto end;
+ }
+
+ buf = list_first_entry(&migf->buf_list, struct virtiovf_data_buffer,
+ buf_elm);
+ if (pos >= buf->start_pos &&
+ pos < buf->start_pos + buf->length) {
+ found = true;
+ goto end;
+ }
+
+ /*
+ * As this is a stream-based FD, the data is expected to always be
+ * in the first chunk.
+ */
+ migf->state = VIRTIOVF_MIGF_STATE_ERROR;
+
+end:
+ spin_unlock_irq(&migf->list_lock);
+ return found ? buf : NULL;
+}
+
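+/*
+ * Copy data from the buffer's pages to user space, page by page. Once the
+ * buffer has been fully consumed it is moved back to the available list
+ * for reuse.
+ */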
+static ssize_t virtiovf_buf_read(struct virtiovf_data_buffer *vhca_buf,
+ char __user **buf, size_t *len, loff_t *pos)
+{
+ unsigned long offset;
+ ssize_t done = 0;
+ size_t copy_len;
+
+ copy_len = min_t(size_t,
+ vhca_buf->start_pos + vhca_buf->length - *pos, *len);
+ while (copy_len) {
+ size_t page_offset;
+ struct page *page;
+ size_t page_len;
+ u8 *from_buff;
+ int ret;
+
+ offset = *pos - vhca_buf->start_pos;
+ page_offset = offset % PAGE_SIZE;
+ offset -= page_offset;
+ page = virtiovf_get_migration_page(vhca_buf, offset);
+ if (!page)
+ return -EINVAL;
+ page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
+ from_buff = kmap_local_page(page);
+ ret = copy_to_user(*buf, from_buff + page_offset, page_len);
+ kunmap_local(from_buff);
+ if (ret)
+ return -EFAULT;
+ *pos += page_len;
+ *len -= page_len;
+ *buf += page_len;
+ done += page_len;
+ copy_len -= page_len;
+ }
+
+ if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
+ spin_lock_irq(&vhca_buf->migf->list_lock);
+ list_del_init(&vhca_buf->buf_elm);
+ list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
+ spin_unlock_irq(&vhca_buf->migf->list_lock);
+ }
+
+ return done;
+}
+
+static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t len,
+ loff_t *pos)
+{
+ struct virtiovf_migration_file *migf = filp->private_data;
+ struct virtiovf_data_buffer *vhca_buf;
+ bool first_loop_call = true;
+ bool end_of_data;
+ ssize_t done = 0;
+
+ if (pos)
+ return -ESPIPE;
+ pos = &filp->f_pos;
+
+ mutex_lock(&migf->lock);
+ if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
+ done = -ENODEV;
+ goto out_unlock;
+ }
+
+ while (len) {
+ ssize_t count;
+
+ vhca_buf = virtiovf_get_data_buff_from_pos(migf, *pos, &end_of_data);
+ if (first_loop_call) {
+ first_loop_call = false;
+ /* Temporary end of file as part of PRE_COPY */
+ if (end_of_data && migf->state == VIRTIOVF_MIGF_STATE_PRECOPY) {
+ done = -ENOMSG;
+ goto out_unlock;
+ }
+ if (end_of_data && migf->state != VIRTIOVF_MIGF_STATE_COMPLETE) {
+ done = -EINVAL;
+ goto out_unlock;
+ }
+ }
+
+ if (end_of_data)
+ goto out_unlock;
+
+ if (!vhca_buf) {
+ done = -EINVAL;
+ goto out_unlock;
+ }
+
+ count = virtiovf_buf_read(vhca_buf, &buf, &len, pos);
+ if (count < 0) {
+ done = count;
+ goto out_unlock;
+ }
+ done += count;
+ }
+
+out_unlock:
+ mutex_unlock(&migf->lock);
+ return done;
+}
+
+static long virtiovf_precopy_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct virtiovf_migration_file *migf = filp->private_data;
+ struct virtiovf_pci_core_device *virtvdev = migf->virtvdev;
+ struct vfio_precopy_info info = {};
+ loff_t *pos = &filp->f_pos;
+ bool end_of_data = false;
+ unsigned long minsz;
+ u32 ctx_size = 0;
+ int ret;
+
+ if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
+ return -ENOTTY;
+
+ minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ mutex_lock(&virtvdev->state_mutex);
+ if (virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
+ virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
+ ret = -EINVAL;
+ goto err_state_unlock;
+ }
+
+ /*
+ * The virtio specification does not include a PRE_COPY concept.
+ * Since we can expect the data to remain the same for a certain period,
+ * we use a rate limiter mechanism before making a call to the device.
+ */
+ if (__ratelimit(&migf->pre_copy_rl_state)) {
+
+ ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id,
+ VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
+ &ctx_size);
+ if (ret)
+ goto err_state_unlock;
+ }
+
+ mutex_lock(&migf->lock);
+ if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
+ ret = -ENODEV;
+ goto err_migf_unlock;
+ }
+
+ if (migf->pre_copy_initial_bytes > *pos) {
+ info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
+ } else {
+ info.dirty_bytes = migf->max_pos - *pos;
+ if (!info.dirty_bytes)
+ end_of_data = true;
+ info.dirty_bytes += ctx_size;
+ }
+
+ if (!end_of_data || !ctx_size) {
+ mutex_unlock(&migf->lock);
+ goto done;
+ }
+
+ mutex_unlock(&migf->lock);
+ /*
+ * We finished transferring the current state and the device has a
+ * dirty state, read a new state.
+ */
+ ret = virtiovf_read_device_context_chunk(migf, ctx_size);
+ if (ret)
+ /*
+ * The machine is running and the context size could still grow, so there
+ * is no reason to mark the device state as VIRTIOVF_MIGF_STATE_ERROR.
+ */
+ goto err_state_unlock;
+
+done:
+ virtiovf_state_mutex_unlock(virtvdev);
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+ return 0;
+
+err_migf_unlock:
+ mutex_unlock(&migf->lock);
+err_state_unlock:
+ virtiovf_state_mutex_unlock(virtvdev);
+ return ret;
+}
+
+static const struct file_operations virtiovf_save_fops = {
+ .owner = THIS_MODULE,
+ .read = virtiovf_save_read,
+ .unlocked_ioctl = virtiovf_precopy_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+ .release = virtiovf_release_file,
+};
+
+static int
+virtiovf_add_buf_header(struct virtiovf_data_buffer *header_buf,
+ u32 data_size)
+{
+ struct virtiovf_migration_file *migf = header_buf->migf;
+ struct virtiovf_migration_header header = {};
+ struct page *page;
+ u8 *to_buff;
+
+ header.record_size = cpu_to_le64(data_size);
+ header.flags = cpu_to_le32(VIRTIOVF_MIGF_HEADER_FLAGS_TAG_MANDATORY);
+ header.tag = cpu_to_le32(VIRTIOVF_MIGF_HEADER_TAG_DEVICE_DATA);
+ page = virtiovf_get_migration_page(header_buf, 0);
+ if (!page)
+ return -EINVAL;
+ to_buff = kmap_local_page(page);
+ memcpy(to_buff, &header, sizeof(header));
+ kunmap_local(to_buff);
+ header_buf->length = sizeof(header);
+ header_buf->start_pos = header_buf->migf->max_pos;
+ migf->max_pos += header_buf->length;
+ spin_lock_irq(&migf->list_lock);
+ list_add_tail(&header_buf->buf_elm, &migf->buf_list);
+ spin_unlock_irq(&migf->list_lock);
+ return 0;
+}
+
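+/*
+ * Read the next device context chunk from the device into a data buffer,
+ * add a migration header record describing it, and queue both on the
+ * migration file's buffer list.
+ */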
+static int
+virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
+ u32 ctx_size)
+{
+ struct virtiovf_data_buffer *header_buf;
+ struct virtiovf_data_buffer *buf;
+ bool unmark_end = false;
+ struct scatterlist *sg;
+ unsigned int i;
+ u32 res_size;
+ int nent;
+ int ret;
+
+ buf = virtiovf_get_data_buffer(migf, ctx_size);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+
+ /* Find the number of SG entries needed to cover the requested size */
+ nent = sg_nents_for_len(buf->table.sgt.sgl, ctx_size);
+ if (nent <= 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Iterate to that SG entry and mark it as last (if it is not already)
+ * so that the underlying layers iterate only up to that entry.
+ */
+ for_each_sg(buf->table.sgt.sgl, sg, nent - 1, i)
+ ;
+
+ if (!sg_is_last(sg)) {
+ unmark_end = true;
+ sg_mark_end(sg);
+ }
+
+ ret = virtio_pci_admin_dev_parts_get(migf->virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS,
+ migf->obj_id,
+ VIRTIO_ADMIN_CMD_DEV_PARTS_GET_TYPE_ALL,
+ buf->table.sgt.sgl, &res_size);
+ /* Restore the original SG mark end */
+ if (unmark_end)
+ sg_unmark_end(sg);
+ if (ret)
+ goto out;
+
+ buf->length = res_size;
+ header_buf = virtiovf_get_data_buffer(migf,
+ sizeof(struct virtiovf_migration_header));
+ if (IS_ERR(header_buf)) {
+ ret = PTR_ERR(header_buf);
+ goto out;
+ }
+
+ ret = virtiovf_add_buf_header(header_buf, res_size);
+ if (ret)
+ goto out_header;
+
+ buf->start_pos = buf->migf->max_pos;
+ migf->max_pos += buf->length;
+ spin_lock(&migf->list_lock);
+ list_add_tail(&buf->buf_elm, &migf->buf_list);
+ spin_unlock_irq(&migf->list_lock);
+ return 0;
+
+out_header:
+ virtiovf_put_data_buffer(header_buf);
+out:
+ virtiovf_put_data_buffer(buf);
+ return ret;
+}
+
+static int
+virtiovf_pci_save_device_final_data(struct virtiovf_pci_core_device *virtvdev)
+{
+ struct virtiovf_migration_file *migf = virtvdev->saving_migf;
+ u32 ctx_size;
+ int ret;
+
+ if (migf->state == VIRTIOVF_MIGF_STATE_ERROR)
+ return -ENODEV;
+
+ ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id,
+ VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
+ &ctx_size);
+ if (ret)
+ goto err;
+
+ if (!ctx_size) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = virtiovf_read_device_context_chunk(migf, ctx_size);
+ if (ret)
+ goto err;
+
+ migf->state = VIRTIOVF_MIGF_STATE_COMPLETE;
+ return 0;
+
+err:
+ migf->state = VIRTIOVF_MIGF_STATE_ERROR;
+ return ret;
+}
+
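+/*
+ * Create the saving migration file: allocate a device parts GET object,
+ * read the initial device context and, for PRE_COPY, arm the rate limiter
+ * used to pace further device context size queries.
+ */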
+static struct virtiovf_migration_file *
+virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev,
+ bool pre_copy)
+{
+ struct virtiovf_migration_file *migf;
+ u32 ctx_size;
+ u32 obj_id;
+ int ret;
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
+ if (!migf)
+ return ERR_PTR(-ENOMEM);
+
+ migf->filp = anon_inode_getfile("virtiovf_mig", &virtiovf_save_fops, migf,
+ O_RDONLY);
+ if (IS_ERR(migf->filp)) {
+ ret = PTR_ERR(migf->filp);
+ kfree(migf);
+ return ERR_PTR(ret);
+ }
+
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+ INIT_LIST_HEAD(&migf->buf_list);
+ INIT_LIST_HEAD(&migf->avail_list);
+ spin_lock_init(&migf->list_lock);
+ migf->virtvdev = virtvdev;
+
+ lockdep_assert_held(&virtvdev->state_mutex);
+ ret = virtiovf_pci_alloc_obj_id(virtvdev, VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_GET,
+ &obj_id);
+ if (ret)
+ goto out;
+
+ migf->obj_id = obj_id;
+ /* Mark as having a valid obj id, which can even be 0 */
+ migf->has_obj_id = true;
+ ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id,
+ VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
+ &ctx_size);
+ if (ret)
+ goto out_clean;
+
+ if (!ctx_size) {
+ ret = -EINVAL;
+ goto out_clean;
+ }
+
+ ret = virtiovf_read_device_context_chunk(migf, ctx_size);
+ if (ret)
+ goto out_clean;
+
+ if (pre_copy) {
+ migf->pre_copy_initial_bytes = migf->max_pos;
+ /* Arbitrarily set the pre-copy rate limit to 1-second intervals */
+ ratelimit_state_init(&migf->pre_copy_rl_state, 1 * HZ, 1);
+ /* Prevent any rate messages upon its usage */
+ ratelimit_set_flags(&migf->pre_copy_rl_state,
+ RATELIMIT_MSG_ON_RELEASE);
+ migf->state = VIRTIOVF_MIGF_STATE_PRECOPY;
+ } else {
+ migf->state = VIRTIOVF_MIGF_STATE_COMPLETE;
+ }
+
+ return migf;
+
+out_clean:
+ virtiovf_clean_migf_resources(migf);
+out:
+ fput(migf->filp);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Set the required object header at the beginning of the buffer.
+ * The actual device parts data is written after the header.
+ */
+static int virtiovf_set_obj_cmd_header(struct virtiovf_data_buffer *vhca_buf)
+{
+ struct virtio_admin_cmd_resource_obj_cmd_hdr obj_hdr = {};
+ struct page *page;
+ u8 *to_buff;
+
+ obj_hdr.type = cpu_to_le16(VIRTIO_RESOURCE_OBJ_DEV_PARTS);
+ obj_hdr.id = cpu_to_le32(vhca_buf->migf->obj_id);
+ page = virtiovf_get_migration_page(vhca_buf, 0);
+ if (!page)
+ return -EINVAL;
+ to_buff = kmap_local_page(page);
+ memcpy(to_buff, &obj_hdr, sizeof(obj_hdr));
+ kunmap_local(to_buff);
+
+ /* Mark the buffer as including the header object data */
+ vhca_buf->include_header_object = 1;
+ return 0;
+}
+
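+/*
+ * Copy up to one page worth of user data into the migration buffer at the
+ * current position, skipping over the object command header when the
+ * buffer carries one.
+ */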
+static int
+virtiovf_append_page_to_mig_buf(struct virtiovf_data_buffer *vhca_buf,
+ const char __user **buf, size_t *len,
+ loff_t *pos, ssize_t *done)
+{
+ unsigned long offset;
+ size_t page_offset;
+ struct page *page;
+ size_t page_len;
+ u8 *to_buff;
+ int ret;
+
+ offset = *pos - vhca_buf->start_pos;
+
+ if (vhca_buf->include_header_object)
+ /* The buffer holds the object header; update the offset accordingly */
+ offset += sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr);
+
+ page_offset = offset % PAGE_SIZE;
+
+ page = virtiovf_get_migration_page(vhca_buf, offset - page_offset);
+ if (!page)
+ return -EINVAL;
+
+ page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
+ to_buff = kmap_local_page(page);
+ ret = copy_from_user(to_buff + page_offset, *buf, page_len);
+ kunmap_local(to_buff);
+ if (ret)
+ return -EFAULT;
+
+ *pos += page_len;
+ *done += page_len;
+ *buf += page_len;
+ *len -= page_len;
+ vhca_buf->length += page_len;
+ return 0;
+}
+
+static ssize_t
+virtiovf_resume_read_chunk(struct virtiovf_migration_file *migf,
+ struct virtiovf_data_buffer *vhca_buf,
+ size_t chunk_size, const char __user **buf,
+ size_t *len, loff_t *pos, ssize_t *done,
+ bool *has_work)
+{
+ size_t copy_len, to_copy;
+ int ret;
+
+ to_copy = min_t(size_t, *len, chunk_size - vhca_buf->length);
+ copy_len = to_copy;
+ while (to_copy) {
+ ret = virtiovf_append_page_to_mig_buf(vhca_buf, buf, &to_copy,
+ pos, done);
+ if (ret)
+ return ret;
+ }
+
+ *len -= copy_len;
+ if (vhca_buf->length == chunk_size) {
+ migf->load_state = VIRTIOVF_LOAD_STATE_LOAD_CHUNK;
+ migf->max_pos += chunk_size;
+ *has_work = true;
+ }
+
+ return 0;
+}
+
+static int
+virtiovf_resume_read_header_data(struct virtiovf_migration_file *migf,
+ struct virtiovf_data_buffer *vhca_buf,
+ const char __user **buf, size_t *len,
+ loff_t *pos, ssize_t *done)
+{
+ size_t copy_len, to_copy;
+ size_t required_data;
+ int ret;
+
+ required_data = migf->record_size - vhca_buf->length;
+ to_copy = min_t(size_t, *len, required_data);
+ copy_len = to_copy;
+ while (to_copy) {
+ ret = virtiovf_append_page_to_mig_buf(vhca_buf, buf, &to_copy,
+ pos, done);
+ if (ret)
+ return ret;
+ }
+
+ *len -= copy_len;
+ if (vhca_buf->length == migf->record_size) {
+ switch (migf->record_tag) {
+ default:
+ /* Optional tag */
+ break;
+ }
+
+ migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER;
+ migf->max_pos += migf->record_size;
+ vhca_buf->length = 0;
+ }
+
+ return 0;
+}
+
+static int
+virtiovf_resume_read_header(struct virtiovf_migration_file *migf,
+ struct virtiovf_data_buffer *vhca_buf,
+ const char __user **buf,
+ size_t *len, loff_t *pos,
+ ssize_t *done, bool *has_work)
+{
+ struct page *page;
+ size_t copy_len;
+ u8 *to_buff;
+ int ret;
+
+ copy_len = min_t(size_t, *len,
+ sizeof(struct virtiovf_migration_header) - vhca_buf->length);
+ page = virtiovf_get_migration_page(vhca_buf, 0);
+ if (!page)
+ return -EINVAL;
+ to_buff = kmap_local_page(page);
+ ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
+ if (ret) {
+ ret = -EFAULT;
+ goto end;
+ }
+
+ *buf += copy_len;
+ *pos += copy_len;
+ *done += copy_len;
+ *len -= copy_len;
+ vhca_buf->length += copy_len;
+ if (vhca_buf->length == sizeof(struct virtiovf_migration_header)) {
+ u64 record_size;
+ u32 flags;
+
+ record_size = le64_to_cpup((__le64 *)to_buff);
+ if (record_size > MAX_LOAD_SIZE) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ migf->record_size = record_size;
+ flags = le32_to_cpup((__le32 *)(to_buff +
+ offsetof(struct virtiovf_migration_header, flags)));
+ migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
+ offsetof(struct virtiovf_migration_header, tag)));
+ switch (migf->record_tag) {
+ case VIRTIOVF_MIGF_HEADER_TAG_DEVICE_DATA:
+ migf->load_state = VIRTIOVF_LOAD_STATE_PREP_CHUNK;
+ break;
+ default:
+ if (!(flags & VIRTIOVF_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
+ ret = -EOPNOTSUPP;
+ goto end;
+ }
+ /* We may read and skip this optional record data */
+ migf->load_state = VIRTIOVF_LOAD_STATE_PREP_HEADER_DATA;
+ }
+
+ migf->max_pos += vhca_buf->length;
+ vhca_buf->length = 0;
+ *has_work = true;
+ }
+end:
+ kunmap_local(to_buff);
+ return ret;
+}
+
+static ssize_t virtiovf_resume_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct virtiovf_migration_file *migf = filp->private_data;
+ struct virtiovf_data_buffer *vhca_buf = migf->buf;
+ struct virtiovf_data_buffer *vhca_buf_header = migf->buf_header;
+ unsigned int orig_length;
+ bool has_work = false;
+ ssize_t done = 0;
+ int ret = 0;
+
+ if (pos)
+ return -ESPIPE;
+
+ pos = &filp->f_pos;
+ if (*pos < vhca_buf->start_pos)
+ return -EINVAL;
+
+ mutex_lock(&migf->virtvdev->state_mutex);
+ mutex_lock(&migf->lock);
+ if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
+ done = -ENODEV;
+ goto out_unlock;
+ }
+
+ while (len || has_work) {
+ has_work = false;
+ switch (migf->load_state) {
+ case VIRTIOVF_LOAD_STATE_READ_HEADER:
+ ret = virtiovf_resume_read_header(migf, vhca_buf_header, &buf,
+ &len, pos, &done, &has_work);
+ if (ret)
+ goto out_unlock;
+ break;
+ case VIRTIOVF_LOAD_STATE_PREP_HEADER_DATA:
+ if (vhca_buf_header->allocated_length < migf->record_size) {
+ virtiovf_free_data_buffer(vhca_buf_header);
+
+ migf->buf_header = virtiovf_alloc_data_buffer(migf,
+ migf->record_size);
+ if (IS_ERR(migf->buf_header)) {
+ ret = PTR_ERR(migf->buf_header);
+ migf->buf_header = NULL;
+ goto out_unlock;
+ }
+
+ vhca_buf_header = migf->buf_header;
+ }
+
+ vhca_buf_header->start_pos = migf->max_pos;
+ migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER_DATA;
+ break;
+ case VIRTIOVF_LOAD_STATE_READ_HEADER_DATA:
+ ret = virtiovf_resume_read_header_data(migf, vhca_buf_header,
+ &buf, &len, pos, &done);
+ if (ret)
+ goto out_unlock;
+ break;
+ case VIRTIOVF_LOAD_STATE_PREP_CHUNK:
+ {
+ u32 cmd_size = migf->record_size +
+ sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr);
+
+ /*
+ * The DMA map/unmap is managed by the virtio layer; we just need to extend
+ * the SG pages to hold the extra required chunk data.
+ */
+ if (vhca_buf->allocated_length < cmd_size) {
+ ret = virtiovf_add_migration_pages(vhca_buf,
+ DIV_ROUND_UP_ULL(cmd_size - vhca_buf->allocated_length,
+ PAGE_SIZE));
+ if (ret)
+ goto out_unlock;
+ }
+
+ vhca_buf->start_pos = migf->max_pos;
+ migf->load_state = VIRTIOVF_LOAD_STATE_READ_CHUNK;
+ break;
+ }
+ case VIRTIOVF_LOAD_STATE_READ_CHUNK:
+ ret = virtiovf_resume_read_chunk(migf, vhca_buf, migf->record_size,
+ &buf, &len, pos, &done, &has_work);
+ if (ret)
+ goto out_unlock;
+ break;
+ case VIRTIOVF_LOAD_STATE_LOAD_CHUNK:
+ /* Mark the last SG entry and set its length */
+ sg_mark_end(vhca_buf->last_offset_sg);
+ orig_length = vhca_buf->last_offset_sg->length;
+ /* Length should include the resource object command header */
+ vhca_buf->last_offset_sg->length = vhca_buf->length +
+ sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr) -
+ vhca_buf->last_offset;
+ ret = virtio_pci_admin_dev_parts_set(migf->virtvdev->core_device.pdev,
+ vhca_buf->table.sgt.sgl);
+ /* Restore the original SG data */
+ vhca_buf->last_offset_sg->length = orig_length;
+ sg_unmark_end(vhca_buf->last_offset_sg);
+ if (ret)
+ goto out_unlock;
+ migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER;
+ /* Be ready to read the next chunk */
+ vhca_buf->length = 0;
+ break;
+ default:
+ break;
+ }
+ }
+
+out_unlock:
+ if (ret)
+ migf->state = VIRTIOVF_MIGF_STATE_ERROR;
+ mutex_unlock(&migf->lock);
+ virtiovf_state_mutex_unlock(migf->virtvdev);
+ return ret ? ret : done;
+}
+
+static const struct file_operations virtiovf_resume_fops = {
+ .owner = THIS_MODULE,
+ .write = virtiovf_resume_write,
+ .release = virtiovf_release_file,
+};
+
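+/*
+ * Create the resuming migration file: allocate a device parts SET object,
+ * an initial data buffer that starts with the object command header and a
+ * separate buffer used to parse the incoming migration stream headers.
+ */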
+static struct virtiovf_migration_file *
+virtiovf_pci_resume_device_data(struct virtiovf_pci_core_device *virtvdev)
+{
+ struct virtiovf_migration_file *migf;
+ struct virtiovf_data_buffer *buf;
+ u32 obj_id;
+ int ret;
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
+ if (!migf)
+ return ERR_PTR(-ENOMEM);
+
+ migf->filp = anon_inode_getfile("virtiovf_mig", &virtiovf_resume_fops, migf,
+ O_WRONLY);
+ if (IS_ERR(migf->filp)) {
+ ret = PTR_ERR(migf->filp);
+ kfree(migf);
+ return ERR_PTR(ret);
+ }
+
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+ INIT_LIST_HEAD(&migf->buf_list);
+ INIT_LIST_HEAD(&migf->avail_list);
+ spin_lock_init(&migf->list_lock);
+
+ buf = virtiovf_alloc_data_buffer(migf, VIRTIOVF_TARGET_INITIAL_BUF_SIZE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out;
+ }
+
+ migf->buf = buf;
+
+ buf = virtiovf_alloc_data_buffer(migf,
+ sizeof(struct virtiovf_migration_header));
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_clean;
+ }
+
+ migf->buf_header = buf;
+ migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER;
+
+ migf->virtvdev = virtvdev;
+ ret = virtiovf_pci_alloc_obj_id(virtvdev, VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_SET,
+ &obj_id);
+ if (ret)
+ goto out_clean;
+
+ migf->obj_id = obj_id;
+ /* Mark as having a valid obj id, which can even be 0 */
+ migf->has_obj_id = true;
+ ret = virtiovf_set_obj_cmd_header(migf->buf);
+ if (ret)
+ goto out_clean;
+
+ return migf;
+
+out_clean:
+ virtiovf_clean_migf_resources(migf);
+out:
+ fput(migf->filp);
+ return ERR_PTR(ret);
+}
+
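+/*
+ * Handle a single transition arc of the migration state machine: stopping
+ * and resuming the device is done via the admin mode-set command, while
+ * the copy arcs create or tear down the saving/resuming migration files.
+ */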
+static struct file *
+virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
+ u32 new)
+{
+ u32 cur = virtvdev->mig_state;
+ int ret;
+
+ if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
+ /* NOP */
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
+ /* NOP */
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+ ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev,
+ BIT(VIRTIO_ADMIN_CMD_DEV_MODE_F_STOPPED));
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
+ ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev, 0);
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ struct virtiovf_migration_file *migf;
+
+ migf = virtiovf_pci_save_device_data(virtvdev, false);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ virtvdev->saving_migf = migf;
+ return migf->filp;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
+ virtiovf_disable_fds(virtvdev);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
+ struct virtiovf_migration_file *migf;
+
+ migf = virtiovf_pci_resume_device_data(virtvdev);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ virtvdev->resuming_migf = migf;
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
+ virtiovf_disable_fds(virtvdev);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
+ (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
+ new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+ struct virtiovf_migration_file *migf;
+
+ migf = virtiovf_pci_save_device_data(virtvdev, true);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ virtvdev->saving_migf = migf;
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ ret = virtiovf_pci_save_device_final_data(virtvdev);
+ return ret ? ERR_PTR(ret) : NULL;
+ }
+
+ /*
+ * vfio_mig_get_next_state() does not use arcs other than the above
+ */
+ WARN_ON(true);
+ return ERR_PTR(-EINVAL);
+}
+
+static struct file *
+virtiovf_pci_set_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state new_state)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ enum vfio_device_mig_state next_state;
+ struct file *res = NULL;
+ int ret;
+
+ mutex_lock(&virtvdev->state_mutex);
+ while (new_state != virtvdev->mig_state) {
+ ret = vfio_mig_get_next_state(vdev, virtvdev->mig_state,
+ new_state, &next_state);
+ if (ret) {
+ res = ERR_PTR(ret);
+ break;
+ }
+ res = virtiovf_pci_step_device_state_locked(virtvdev, next_state);
+ if (IS_ERR(res))
+ break;
+ virtvdev->mig_state = next_state;
+ if (WARN_ON(res && new_state != virtvdev->mig_state)) {
+ fput(res);
+ res = ERR_PTR(-EINVAL);
+ break;
+ }
+ }
+ virtiovf_state_mutex_unlock(virtvdev);
+ return res;
+}
+
+static int virtiovf_pci_get_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state *curr_state)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ vdev, struct virtiovf_pci_core_device, core_device.vdev);
+
+ mutex_lock(&virtvdev->state_mutex);
+ *curr_state = virtvdev->mig_state;
+ virtiovf_state_mutex_unlock(virtvdev);
+ return 0;
+}
+
+static int virtiovf_pci_get_data_size(struct vfio_device *vdev,
+ unsigned long *stop_copy_length)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ bool obj_id_exists;
+ u32 res_size;
+ u32 obj_id;
+ int ret;
+
+ mutex_lock(&virtvdev->state_mutex);
+ obj_id_exists = virtvdev->saving_migf && virtvdev->saving_migf->has_obj_id;
+ if (!obj_id_exists) {
+ ret = virtiovf_pci_alloc_obj_id(virtvdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_GET,
+ &obj_id);
+ if (ret)
+ goto end;
+ } else {
+ obj_id = virtvdev->saving_migf->obj_id;
+ }
+
+ ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id,
+ VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
+ &res_size);
+ if (!ret)
+ *stop_copy_length = res_size;
+
+ /*
+ * We can't leave this obj_id alive if it didn't exist before; otherwise it might
+ * stay alive even without an active migration flow (e.g. the migration was cancelled).
+ */
+ if (!obj_id_exists)
+ virtiovf_pci_free_obj_id(virtvdev, obj_id);
+end:
+ virtiovf_state_mutex_unlock(virtvdev);
+ return ret;
+}
+
+static const struct vfio_migration_ops virtvdev_pci_mig_ops = {
+ .migration_set_state = virtiovf_pci_set_device_state,
+ .migration_get_state = virtiovf_pci_get_device_state,
+ .migration_get_data_size = virtiovf_pci_get_data_size,
+};
+
+void virtiovf_set_migratable(struct virtiovf_pci_core_device *virtvdev)
+{
+ virtvdev->migrate_cap = 1;
+ mutex_init(&virtvdev->state_mutex);
+ spin_lock_init(&virtvdev->reset_lock);
+ virtvdev->core_device.vdev.migration_flags =
+ VFIO_MIGRATION_STOP_COPY |
+ VFIO_MIGRATION_P2P |
+ VFIO_MIGRATION_PRE_COPY;
+ virtvdev->core_device.vdev.mig_ops = &virtvdev_pci_mig_ops;
+}
+
+void virtiovf_open_migration(struct virtiovf_pci_core_device *virtvdev)
+{
+ if (!virtvdev->migrate_cap)
+ return;
+
+ virtvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+}
+
+void virtiovf_close_migration(struct virtiovf_pci_core_device *virtvdev)
+{
+ if (!virtvdev->migrate_cap)
+ return;
+
+ virtiovf_disable_fds(virtvdev);
+}
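The state arcs implemented by virtiovf_pci_step_device_state_locked() are reached from userspace through the generic VFIO migration feature ioctl. The following is a minimal, hypothetical sketch, not part of this series (set_mig_state() is an illustrative helper name), of how a management application might request a transition and pick up the returned data-stream fd:

#include <sys/ioctl.h>
#include <linux/types.h>
#include <linux/vfio.h>

static int set_mig_state(int device_fd, enum vfio_device_mig_state state)
{
	/* vfio_device_feature header followed by the mig_state payload */
	__u64 buf[(sizeof(struct vfio_device_feature) +
		   sizeof(struct vfio_device_feature_mig_state) +
		   sizeof(__u64) - 1) / sizeof(__u64)] = {};
	struct vfio_device_feature *feature = (void *)buf;
	struct vfio_device_feature_mig_state *mig = (void *)feature->data;

	feature->argsz = sizeof(*feature) + sizeof(*mig);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
	mig->device_state = state;

	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
		return -1;

	/* >= 0 only for arcs that open a migration data stream */
	return mig->data_fd;
}

Under these assumptions, a save flow would call set_mig_state(fd, VFIO_DEVICE_STATE_STOP), then VFIO_DEVICE_STATE_STOP_COPY, and read() the returned fd until EOF; a restore flow writes the captured stream into the fd returned when entering VFIO_DEVICE_STATE_RESUMING, which is what virtiovf_resume_write() above services.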
diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c
index 6464b3939ebc..ff8ff8480968 100644
--- a/drivers/vfio/platform/vfio_amba.c
+++ b/drivers/vfio/platform/vfio_amba.c
@@ -122,19 +122,18 @@ static const struct vfio_device_ops vfio_amba_ops = {
.detach_ioas = vfio_iommufd_physical_detach_ioas,
};
-static const struct amba_id pl330_ids[] = {
+static const struct amba_id vfio_amba_ids[] = {
{ 0, 0 },
};
-MODULE_DEVICE_TABLE(amba, pl330_ids);
+MODULE_DEVICE_TABLE(amba, vfio_amba_ids);
static struct amba_driver vfio_amba_driver = {
.probe = vfio_amba_probe,
.remove = vfio_amba_remove,
- .id_table = pl330_ids,
+ .id_table = vfio_amba_ids,
.drv = {
.name = "vfio-amba",
- .owner = THIS_MODULE,
},
.driver_managed_dma = true,
};
diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c
index 8cf22fa65baa..512533501eb7 100644
--- a/drivers/vfio/platform/vfio_platform.c
+++ b/drivers/vfio/platform/vfio_platform.c
@@ -85,14 +85,13 @@ static void vfio_platform_release_dev(struct vfio_device *core_vdev)
vfio_platform_release_common(vdev);
}
-static int vfio_platform_remove(struct platform_device *pdev)
+static void vfio_platform_remove(struct platform_device *pdev)
{
struct vfio_platform_device *vdev = dev_get_drvdata(&pdev->dev);
vfio_unregister_group_dev(&vdev->vdev);
pm_runtime_disable(vdev->device);
vfio_put_device(&vdev->vdev);
- return 0;
}
static const struct vfio_device_ops vfio_platform_ops = {
diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c
index e53757d1d095..3bf1043cd795 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -388,6 +388,11 @@ static ssize_t vfio_platform_read_mmio(struct vfio_platform_region *reg,
{
unsigned int done = 0;
+ if (off >= reg->size)
+ return -EINVAL;
+
+ count = min_t(size_t, count, reg->size - off);
+
if (!reg->ioaddr) {
reg->ioaddr =
ioremap(reg->addr, reg->size);
@@ -467,6 +472,11 @@ static ssize_t vfio_platform_write_mmio(struct vfio_platform_region *reg,
{
unsigned int done = 0;
+ if (off >= reg->size)
+ return -EINVAL;
+
+ count = min_t(size_t, count, reg->size - off);
+
if (!reg->ioaddr) {
reg->ioaddr =
ioremap(reg->addr, reg->size);
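The two hunks above clamp MMIO region accesses to the region size on the kernel side: offsets at or past the end now fail with -EINVAL and over-long transfers are truncated. A hypothetical userspace counterpart, not from this series (read_region() is an illustrative helper), that discovers the region bounds first and stays within them could look like:

#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static ssize_t read_region(int device_fd, unsigned int index,
			   __u64 off, void *data, size_t count)
{
	struct vfio_region_info info = {
		.argsz = sizeof(info),
		.index = index,
	};

	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info))
		return -1;

	/* Mirror the kernel-side clamp: stay inside [0, info.size) */
	if (off >= info.size)
		return 0;
	if (count > info.size - off)
		count = info.size - off;

	return pread(device_fd, data, count, info.offset + off);
}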
diff --git a/drivers/vfio/platform/vfio_platform_irq.c b/drivers/vfio/platform/vfio_platform_irq.c

index 61a1bfb68ac7..ef41ecef83af 100644
--- a/drivers/vfio/platform/vfio_platform_irq.c
+++ b/drivers/vfio/platform/vfio_platform_irq.c
@@ -136,6 +136,16 @@ static int vfio_platform_set_irq_unmask(struct vfio_platform_device *vdev,
return 0;
}
+/*
+ * The trigger eventfd is guaranteed valid in the interrupt path
+ * and protected by the igate mutex when triggered via ioctl.
+ */
+static void vfio_send_eventfd(struct vfio_platform_irq *irq_ctx)
+{
+ if (likely(irq_ctx->trigger))
+ eventfd_signal(irq_ctx->trigger);
+}
+
static irqreturn_t vfio_automasked_irq_handler(int irq, void *dev_id)
{
struct vfio_platform_irq *irq_ctx = dev_id;
@@ -155,7 +165,7 @@ static irqreturn_t vfio_automasked_irq_handler(int irq, void *dev_id)
spin_unlock_irqrestore(&irq_ctx->lock, flags);
if (ret == IRQ_HANDLED)
- eventfd_signal(irq_ctx->trigger);
+ vfio_send_eventfd(irq_ctx);
return ret;
}
@@ -164,52 +174,40 @@ static irqreturn_t vfio_irq_handler(int irq, void *dev_id)
{
struct vfio_platform_irq *irq_ctx = dev_id;
- eventfd_signal(irq_ctx->trigger);
+ vfio_send_eventfd(irq_ctx);
return IRQ_HANDLED;
}
static int vfio_set_trigger(struct vfio_platform_device *vdev, int index,
- int fd, irq_handler_t handler)
+ int fd)
{
struct vfio_platform_irq *irq = &vdev->irqs[index];
struct eventfd_ctx *trigger;
- int ret;
if (irq->trigger) {
- irq_clear_status_flags(irq->hwirq, IRQ_NOAUTOEN);
- free_irq(irq->hwirq, irq);
- kfree(irq->name);
+ disable_irq(irq->hwirq);
eventfd_ctx_put(irq->trigger);
irq->trigger = NULL;
}
if (fd < 0) /* Disable only */
return 0;
- irq->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-irq[%d](%s)",
- irq->hwirq, vdev->name);
- if (!irq->name)
- return -ENOMEM;
trigger = eventfd_ctx_fdget(fd);
- if (IS_ERR(trigger)) {
- kfree(irq->name);
+ if (IS_ERR(trigger))
return PTR_ERR(trigger);
- }
irq->trigger = trigger;
- irq_set_status_flags(irq->hwirq, IRQ_NOAUTOEN);
- ret = request_irq(irq->hwirq, handler, 0, irq->name, irq);
- if (ret) {
- kfree(irq->name);
- eventfd_ctx_put(trigger);
- irq->trigger = NULL;
- return ret;
- }
-
- if (!irq->masked)
- enable_irq(irq->hwirq);
+ /*
+ * irq->masked effectively provides nested disables within the overall
+ * enable relative to trigger. Specifically, request_irq() is called
+ * with NO_AUTOEN, therefore the IRQ is initially disabled. The user
+ * may only further disable the IRQ with a MASK operation because
+ * irq->masked is initially false.
+ */
+ enable_irq(irq->hwirq);
return 0;
}
@@ -228,7 +226,7 @@ static int vfio_platform_set_irq_trigger(struct vfio_platform_device *vdev,
handler = vfio_irq_handler;
if (!count && (flags & VFIO_IRQ_SET_DATA_NONE))
- return vfio_set_trigger(vdev, index, -1, handler);
+ return vfio_set_trigger(vdev, index, -1);
if (start != 0 || count != 1)
return -EINVAL;
@@ -236,7 +234,7 @@ static int vfio_platform_set_irq_trigger(struct vfio_platform_device *vdev,
if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
int32_t fd = *(int32_t *)data;
- return vfio_set_trigger(vdev, index, fd, handler);
+ return vfio_set_trigger(vdev, index, fd);
}
if (flags & VFIO_IRQ_SET_DATA_NONE) {
@@ -260,6 +258,14 @@ int vfio_platform_set_irqs_ioctl(struct vfio_platform_device *vdev,
unsigned start, unsigned count, uint32_t flags,
void *data) = NULL;
+ /*
+ * For compatibility, errors from request_irq() are local to the
+ * SET_IRQS path and reflected in the name pointer. This allows,
+ * for example, polling mode fallback for an exclusive IRQ failure.
+ */
+ if (IS_ERR(vdev->irqs[index].name))
+ return PTR_ERR(vdev->irqs[index].name);
+
switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
case VFIO_IRQ_SET_ACTION_MASK:
func = vfio_platform_set_irq_mask;
@@ -280,7 +286,7 @@ int vfio_platform_set_irqs_ioctl(struct vfio_platform_device *vdev,
int vfio_platform_irq_init(struct vfio_platform_device *vdev)
{
- int cnt = 0, i;
+ int cnt = 0, i, ret = 0;
while (vdev->get_irq(vdev, cnt) >= 0)
cnt++;
@@ -292,37 +298,70 @@ int vfio_platform_irq_init(struct vfio_platform_device *vdev)
for (i = 0; i < cnt; i++) {
int hwirq = vdev->get_irq(vdev, i);
+ irq_handler_t handler = vfio_irq_handler;
- if (hwirq < 0)
+ if (hwirq < 0) {
+ ret = -EINVAL;
goto err;
+ }
spin_lock_init(&vdev->irqs[i].lock);
vdev->irqs[i].flags = VFIO_IRQ_INFO_EVENTFD;
- if (irq_get_trigger_type(hwirq) & IRQ_TYPE_LEVEL_MASK)
+ if (irq_get_trigger_type(hwirq) & IRQ_TYPE_LEVEL_MASK) {
vdev->irqs[i].flags |= VFIO_IRQ_INFO_MASKABLE
| VFIO_IRQ_INFO_AUTOMASKED;
+ handler = vfio_automasked_irq_handler;
+ }
vdev->irqs[i].count = 1;
vdev->irqs[i].hwirq = hwirq;
vdev->irqs[i].masked = false;
+ vdev->irqs[i].name = kasprintf(GFP_KERNEL_ACCOUNT,
+ "vfio-irq[%d](%s)", hwirq,
+ vdev->name);
+ if (!vdev->irqs[i].name) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = request_irq(hwirq, handler, IRQF_NO_AUTOEN,
+ vdev->irqs[i].name, &vdev->irqs[i]);
+ if (ret) {
+ kfree(vdev->irqs[i].name);
+ vdev->irqs[i].name = ERR_PTR(ret);
+ }
}
vdev->num_irqs = cnt;
return 0;
err:
+ for (--i; i >= 0; i--) {
+ if (!IS_ERR(vdev->irqs[i].name)) {
+ free_irq(vdev->irqs[i].hwirq, &vdev->irqs[i]);
+ kfree(vdev->irqs[i].name);
+ }
+ }
kfree(vdev->irqs);
- return -EINVAL;
+ return ret;
}
void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev)
{
int i;
- for (i = 0; i < vdev->num_irqs; i++)
- vfio_set_trigger(vdev, i, -1, NULL);
+ for (i = 0; i < vdev->num_irqs; i++) {
+ vfio_virqfd_disable(&vdev->irqs[i].mask);
+ vfio_virqfd_disable(&vdev->irqs[i].unmask);
+ if (!IS_ERR(vdev->irqs[i].name)) {
+ free_irq(vdev->irqs[i].hwirq, &vdev->irqs[i]);
+ if (vdev->irqs[i].trigger)
+ eventfd_ctx_put(vdev->irqs[i].trigger);
+ kfree(vdev->irqs[i].name);
+ }
+ }
vdev->num_irqs = 0;
kfree(vdev->irqs);
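With the rework above, request_irq() happens once at vfio_platform_irq_init() time with IRQF_NO_AUTOEN, and vfio_set_trigger() only swaps the eventfd and toggles the line. From userspace the trigger is still installed through VFIO_DEVICE_SET_IRQS; a minimal, hypothetical sketch (set_irq_trigger() is an illustrative helper, not part of this patch):

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int set_irq_trigger(int device_fd, unsigned int index, int efd)
{
	size_t argsz = sizeof(struct vfio_irq_set) + sizeof(__s32);
	struct vfio_irq_set *set = calloc(1, argsz);
	int ret;

	if (!set)
		return -1;

	set->argsz = argsz;
	set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
	set->index = index;
	set->start = 0;
	set->count = 1;
	/* the payload is the eventfd descriptor to signal on interrupt */
	memcpy(set->data, &efd, sizeof(__s32));

	ret = ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set);
	free(set);
	return ret;
}

The eventfd itself would come from eventfd(0, EFD_CLOEXEC), and tearing the trigger down (VFIO_IRQ_SET_DATA_NONE with count == 0) now maps to disable_irq() plus eventfd_ctx_put() rather than free_irq().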
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index bde84ad344e5..50128da18bca 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -434,7 +434,7 @@ static inline void vfio_virqfd_exit(void)
}
#endif
-#ifdef CONFIG_HAVE_KVM
+#if IS_ENABLED(CONFIG_KVM)
void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm);
void vfio_device_put_kvm(struct vfio_device *device);
#else
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index a94ec6225d31..5f9e7e477078 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -364,7 +364,6 @@ static void tce_iommu_release(void *iommu_data)
if (!tbl)
continue;
- tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
tce_iommu_free_table(container, tbl);
}
@@ -720,6 +719,8 @@ static long tce_iommu_remove_window(struct tce_container *container,
BUG_ON(!tbl->it_size);
+ tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+
/* Detach groups from IOMMUs */
list_for_each_entry(tcegrp, &container->group_list, next) {
table_group = iommu_group_get_iommudata(tcegrp->grp);
@@ -738,7 +739,6 @@ static long tce_iommu_remove_window(struct tce_container *container,
}
/* Free table */
- tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
tce_iommu_free_table(container, tbl);
container->tables[num] = NULL;
@@ -1197,9 +1197,14 @@ static void tce_iommu_release_ownership(struct tce_container *container,
return;
}
- for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
- if (container->tables[i])
+ for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+ if (container->tables[i]) {
+ tce_iommu_clear(container, container->tables[i],
+ container->tables[i]->it_offset,
+ container->tables[i]->it_size);
table_group->ops->unset_window(table_group, i);
+ }
+ }
}
static long tce_iommu_take_ownership(struct tce_container *container,
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index b2854d7939ce..50ebc9593c9d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -72,7 +72,6 @@ struct vfio_iommu {
uint64_t pgsize_bitmap;
uint64_t num_non_pinned_groups;
bool v2;
- bool nesting;
bool dirty_page_tracking;
struct list_head emulated_iommu_groups;
};
@@ -513,12 +512,10 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
unsigned long vaddr, unsigned long *pfn,
bool write_fault)
{
- pte_t *ptep;
- pte_t pte;
- spinlock_t *ptl;
+ struct follow_pfnmap_args args = { .vma = vma, .address = vaddr };
int ret;
- ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
+ ret = follow_pfnmap_start(&args);
if (ret) {
bool unlocked = false;
@@ -532,19 +529,17 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
if (ret)
return ret;
- ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
+ ret = follow_pfnmap_start(&args);
if (ret)
return ret;
}
- pte = ptep_get(ptep);
-
- if (write_fault && !pte_write(pte))
+ if (write_fault && !args.writable)
ret = -EFAULT;
else
- *pfn = pte_pfn(pte);
+ *pfn = args.pfn;
- pte_unmap_unlock(ptep, ptl);
+ follow_pfnmap_end(&args);
return ret;
}
@@ -567,18 +562,6 @@ static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM,
pages, NULL);
if (ret > 0) {
- int i;
-
- /*
- * The zero page is always resident, we don't need to pin it
- * and it falls into our invalid/reserved test so we don't
- * unpin in put_pfn(). Unpin all zero pages in the batch here.
- */
- for (i = 0 ; i < ret; i++) {
- if (unlikely(is_zero_pfn(page_to_pfn(pages[i]))))
- unpin_user_page(pages[i]);
- }
-
*pfn = page_to_pfn(pages[0]);
goto done;
}
@@ -2147,7 +2130,7 @@ static int vfio_iommu_domain_alloc(struct device *dev, void *data)
{
struct iommu_domain **domain = data;
- *domain = iommu_domain_alloc(dev->bus);
+ *domain = iommu_paging_domain_alloc(dev);
return 1; /* Don't iterate */
}
@@ -2204,16 +2187,11 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
* us a representative device for the IOMMU API call. We don't actually
* want to iterate beyond the first device (if any).
*/
- ret = -EIO;
iommu_group_for_each_dev(iommu_group, &domain->domain,
vfio_iommu_domain_alloc);
- if (!domain->domain)
+ if (IS_ERR(domain->domain)) {
+ ret = PTR_ERR(domain->domain);
goto out_free_domain;
-
- if (iommu->nesting) {
- ret = iommu_enable_nesting(domain->domain);
- if (ret)
- goto out_domain;
}
ret = iommu_attach_group(domain->domain, group->iommu_group);
@@ -2556,9 +2534,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
switch (arg) {
case VFIO_TYPE1_IOMMU:
break;
- case VFIO_TYPE1_NESTING_IOMMU:
- iommu->nesting = true;
- fallthrough;
+ case __VFIO_RESERVED_TYPE1_NESTING_IOMMU:
case VFIO_TYPE1v2_IOMMU:
iommu->v2 = true;
break;
@@ -2653,7 +2629,6 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
switch (arg) {
case VFIO_TYPE1_IOMMU:
case VFIO_TYPE1v2_IOMMU:
- case VFIO_TYPE1_NESTING_IOMMU:
case VFIO_UNMAP_ALL:
return 1;
case VFIO_UPDATE_VADDR:
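Since VFIO_TYPE1_NESTING_IOMMU is reduced to a reserved value, userspace that still requests it at VFIO_SET_IOMMU will fail, and VFIO_CHECK_EXTENSION no longer reports it. A hypothetical probe-then-select sketch (select_type1_iommu() is an illustrative helper; it assumes at least one group is already attached to the container):

#include <sys/ioctl.h>
#include <linux/vfio.h>

static int select_type1_iommu(int container_fd)
{
	/* Prefer the v2 type1 backend, fall back to v1 if needed. */
	if (ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU) > 0)
		return ioctl(container_fd, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);
	if (ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) > 0)
		return ioctl(container_fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
	return -1;
}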
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 1cc93aac99a2..1fd261efc582 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -16,14 +16,16 @@
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
-#ifdef CONFIG_HAVE_KVM
+#if IS_ENABLED(CONFIG_KVM)
#include <linux/kvm_host.h>
#endif
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
+#include <linux/mount.h>
#include <linux/mutex.h>
#include <linux/pci.h>
+#include <linux/pseudo_fs.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
@@ -43,9 +45,13 @@
#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC "VFIO - User Level meta-driver"
+#define VFIO_MAGIC 0x5646494f /* "VFIO" */
+
static struct vfio {
struct class *device_class;
struct ida device_ida;
+ struct vfsmount *vfs_mount;
+ int fs_count;
} vfio;
#ifdef CONFIG_VFIO_NOIOMMU
@@ -186,6 +192,8 @@ static void vfio_device_release(struct device *dev)
if (device->ops->release)
device->ops->release(device);
+ iput(device->inode);
+ simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
kvfree(device);
}
@@ -228,6 +236,34 @@ out_free:
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);
+static int vfio_fs_init_fs_context(struct fs_context *fc)
+{
+ return init_pseudo(fc, VFIO_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type vfio_fs_type = {
+ .name = "vfio",
+ .owner = THIS_MODULE,
+ .init_fs_context = vfio_fs_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+static struct inode *vfio_fs_inode_new(void)
+{
+ struct inode *inode;
+ int ret;
+
+ ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
+ if (ret)
+ return ERR_PTR(ret);
+
+ inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
+ if (IS_ERR(inode))
+ simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
+
+ return inode;
+}
+
/*
* Initialize a vfio_device so it can be registered to vfio core.
*/
@@ -246,6 +282,11 @@ static int vfio_init_device(struct vfio_device *device, struct device *dev,
init_completion(&device->comp);
device->dev = dev;
device->ops = ops;
+ device->inode = vfio_fs_inode_new();
+ if (IS_ERR(device->inode)) {
+ ret = PTR_ERR(device->inode);
+ goto out_inode;
+ }
if (ops->init) {
ret = ops->init(device);
@@ -260,6 +301,9 @@ static int vfio_init_device(struct vfio_device *device, struct device *dev,
return 0;
out_uninit:
+ iput(device->inode);
+ simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
+out_inode:
vfio_release_device_set(device);
ida_free(&vfio.device_ida, device->index);
return ret;
@@ -385,7 +429,7 @@ void vfio_unregister_group_dev(struct vfio_device *device)
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
-#ifdef CONFIG_HAVE_KVM
+#if IS_ENABLED(CONFIG_KVM)
void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
{
void (*pfn)(struct kvm *kvm);
@@ -1707,7 +1751,7 @@ static void __exit vfio_cleanup(void)
module_init(vfio_init);
module_exit(vfio_cleanup);
-MODULE_IMPORT_NS(IOMMUFD);
+MODULE_IMPORT_NS("IOMMUFD");
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c
index 29c564b7a6e1..aa2891f97508 100644
--- a/drivers/vfio/virqfd.c
+++ b/drivers/vfio/virqfd.c
@@ -101,12 +101,18 @@ static void virqfd_inject(struct work_struct *work)
virqfd->thread(virqfd->opaque, virqfd->data);
}
+static void virqfd_flush_inject(struct work_struct *work)
+{
+ struct virqfd *virqfd = container_of(work, struct virqfd, flush_inject);
+
+ flush_work(&virqfd->inject);
+}
+
int vfio_virqfd_enable(void *opaque,
int (*handler)(void *, void *),
void (*thread)(void *, void *),
void *data, struct virqfd **pvirqfd, int fd)
{
- struct fd irqfd;
struct eventfd_ctx *ctx;
struct virqfd *virqfd;
int ret = 0;
@@ -124,17 +130,18 @@ int vfio_virqfd_enable(void *opaque,
INIT_WORK(&virqfd->shutdown, virqfd_shutdown);
INIT_WORK(&virqfd->inject, virqfd_inject);
+ INIT_WORK(&virqfd->flush_inject, virqfd_flush_inject);
- irqfd = fdget(fd);
- if (!irqfd.file) {
+ CLASS(fd, irqfd)(fd);
+ if (fd_empty(irqfd)) {
ret = -EBADF;
goto err_fd;
}
- ctx = eventfd_ctx_fileget(irqfd.file);
+ ctx = eventfd_ctx_fileget(fd_file(irqfd));
if (IS_ERR(ctx)) {
ret = PTR_ERR(ctx);
- goto err_ctx;
+ goto err_fd;
}
virqfd->eventfd = ctx;
@@ -163,7 +170,7 @@ int vfio_virqfd_enable(void *opaque,
init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup);
init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc);
- events = vfs_poll(irqfd.file, &virqfd->pt);
+ events = vfs_poll(fd_file(irqfd), &virqfd->pt);
/*
* Check if there was an event already pending on the eventfd
@@ -173,18 +180,9 @@ int vfio_virqfd_enable(void *opaque,
if ((!handler || handler(opaque, data)) && thread)
schedule_work(&virqfd->inject);
}
-
- /*
- * Do not drop the file until the irqfd is fully initialized,
- * otherwise we might race against the EPOLLHUP.
- */
- fdput(irqfd);
-
return 0;
err_busy:
eventfd_ctx_put(ctx);
-err_ctx:
- fdput(irqfd);
err_fd:
kfree(virqfd);
@@ -213,3 +211,16 @@ void vfio_virqfd_disable(struct virqfd **pvirqfd)
flush_workqueue(vfio_irqfd_cleanup_wq);
}
EXPORT_SYMBOL_GPL(vfio_virqfd_disable);
+
+void vfio_virqfd_flush_thread(struct virqfd **pvirqfd)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&virqfd_lock, flags);
+ if (*pvirqfd && (*pvirqfd)->thread)
+ queue_work(vfio_irqfd_cleanup_wq, &(*pvirqfd)->flush_inject);
+ spin_unlock_irqrestore(&virqfd_lock, flags);
+
+ flush_workqueue(vfio_irqfd_cleanup_wq);
+}
+EXPORT_SYMBOL_GPL(vfio_virqfd_flush_thread);
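vfio_virqfd_flush_thread() gives in-kernel users a way to make sure a deferred injection has actually run before they reconfigure or tear anything down. A hypothetical driver-side sketch, not code from this series (the demo_* names and struct demo_ctx are illustrative), using only the interfaces exported by this file:

#include <linux/eventfd.h>
#include <linux/mutex.h>
#include <linux/vfio.h>

struct demo_ctx {
	struct mutex lock;
	struct eventfd_ctx *trigger;
	struct virqfd *virqfd;
};

/* Runs from the eventfd wait queue (atomic); non-zero defers to the thread. */
static int demo_handler(void *opaque, void *data)
{
	return 1;
}

/* Runs from a workqueue, so taking the mutex here is fine. */
static void demo_thread(void *opaque, void *data)
{
	struct demo_ctx *ctx = opaque;

	mutex_lock(&ctx->lock);
	eventfd_signal(ctx->trigger);
	mutex_unlock(&ctx->lock);
}

static int demo_wire(struct demo_ctx *ctx, int fd)
{
	return vfio_virqfd_enable(ctx, demo_handler, demo_thread,
				  NULL, &ctx->virqfd, fd);
}

static void demo_unwire(struct demo_ctx *ctx)
{
	/* Guarantee a queued demo_thread() has finished before teardown. */
	vfio_virqfd_flush_thread(&ctx->virqfd);
	vfio_virqfd_disable(&ctx->virqfd);
}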