Diffstat (limited to 'drivers/vfio')
-rw-r--r--  drivers/vfio/cdx/Makefile                          |    2
-rw-r--r--  drivers/vfio/cdx/intr.c                            |  217
-rw-r--r--  drivers/vfio/cdx/main.c                            |   65
-rw-r--r--  drivers/vfio/cdx/private.h                         |   18
-rw-r--r--  drivers/vfio/device_cdev.c                         |   67
-rw-r--r--  drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c             |    4
-rw-r--r--  drivers/vfio/group.c                               |   27
-rw-r--r--  drivers/vfio/iommufd.c                             |   64
-rw-r--r--  drivers/vfio/mdev/mdev_core.c                      |    4
-rw-r--r--  drivers/vfio/mdev/mdev_driver.c                    |    2
-rw-r--r--  drivers/vfio/mdev/mdev_private.h                   |    3
-rw-r--r--  drivers/vfio/mdev/mdev_sysfs.c                     |    2
-rw-r--r--  drivers/vfio/pci/Kconfig                           |    6
-rw-r--r--  drivers/vfio/pci/Makefile                          |    2
-rw-r--r--  drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c     |  371
-rw-r--r--  drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h     |   33
-rw-r--r--  drivers/vfio/pci/mlx5/cmd.c                        |  416
-rw-r--r--  drivers/vfio/pci/mlx5/cmd.h                        |   35
-rw-r--r--  drivers/vfio/pci/mlx5/main.c                       |  126
-rw-r--r--  drivers/vfio/pci/nvgrace-gpu/main.c                |  171
-rw-r--r--  drivers/vfio/pci/pds/dirty.c                       |    1
-rw-r--r--  drivers/vfio/pci/pds/lm.c                          |    2
-rw-r--r--  drivers/vfio/pci/pds/pci_drv.c                     |    2
-rw-r--r--  drivers/vfio/pci/qat/Kconfig                       |   12
-rw-r--r--  drivers/vfio/pci/qat/Makefile                      |    3
-rw-r--r--  drivers/vfio/pci/qat/main.c                        |  700
-rw-r--r--  drivers/vfio/pci/vfio_pci.c                        |    8
-rw-r--r--  drivers/vfio/pci/vfio_pci_config.c                 |   32
-rw-r--r--  drivers/vfio/pci/vfio_pci_core.c                   |  446
-rw-r--r--  drivers/vfio/pci/vfio_pci_igd.c                    |    6
-rw-r--r--  drivers/vfio/pci/vfio_pci_intrs.c                  |   63
-rw-r--r--  drivers/vfio/pci/vfio_pci_priv.h                   |    6
-rw-r--r--  drivers/vfio/pci/vfio_pci_rdwr.c                   |  148
-rw-r--r--  drivers/vfio/pci/virtio/Kconfig                    |   42
-rw-r--r--  drivers/vfio/pci/virtio/Makefile                   |    3
-rw-r--r--  drivers/vfio/pci/virtio/common.h                   |  127
-rw-r--r--  drivers/vfio/pci/virtio/legacy_io.c                |  420
-rw-r--r--  drivers/vfio/pci/virtio/main.c                     |  481
-rw-r--r--  drivers/vfio/pci/virtio/migrate.c                  | 1337
-rw-r--r--  drivers/vfio/platform/vfio_amba.c                  |    1
-rw-r--r--  drivers/vfio/platform/vfio_platform.c              |    2
-rw-r--r--  drivers/vfio/platform/vfio_platform_common.c       |   10
-rw-r--r--  drivers/vfio/vfio_iommu_spapr_tce.c                |   13
-rw-r--r--  drivers/vfio/vfio_iommu_type1.c                    |  205
-rw-r--r--  drivers/vfio/vfio_main.c                           |   46
-rw-r--r--  drivers/vfio/virqfd.c                              |   20
46 files changed, 4395 insertions(+), 1376 deletions(-)
diff --git a/drivers/vfio/cdx/Makefile b/drivers/vfio/cdx/Makefile
index cd4a2e6fe609..df92b320122a 100644
--- a/drivers/vfio/cdx/Makefile
+++ b/drivers/vfio/cdx/Makefile
@@ -5,4 +5,4 @@
obj-$(CONFIG_VFIO_CDX) += vfio-cdx.o
-vfio-cdx-objs := main.o
+vfio-cdx-objs := main.o intr.o
diff --git a/drivers/vfio/cdx/intr.c b/drivers/vfio/cdx/intr.c
new file mode 100644
index 000000000000..986fa2a45fa4
--- /dev/null
+++ b/drivers/vfio/cdx/intr.c
@@ -0,0 +1,217 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
+ */
+
+#include <linux/vfio.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/eventfd.h>
+#include <linux/msi.h>
+#include <linux/interrupt.h>
+
+#include "linux/cdx/cdx_bus.h"
+#include "private.h"
+
+static irqreturn_t vfio_cdx_msihandler(int irq_no, void *arg)
+{
+ struct eventfd_ctx *trigger = arg;
+
+ eventfd_signal(trigger);
+ return IRQ_HANDLED;
+}
+
+static int vfio_cdx_msi_enable(struct vfio_cdx_device *vdev, int nvec)
+{
+ struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev);
+ struct device *dev = vdev->vdev.dev;
+ int msi_idx, ret;
+
+ vdev->cdx_irqs = kcalloc(nvec, sizeof(struct vfio_cdx_irq), GFP_KERNEL);
+ if (!vdev->cdx_irqs)
+ return -ENOMEM;
+
+ ret = cdx_enable_msi(cdx_dev);
+ if (ret) {
+ kfree(vdev->cdx_irqs);
+ return ret;
+ }
+
+ /* Allocate cdx MSIs */
+ ret = msi_domain_alloc_irqs(dev, MSI_DEFAULT_DOMAIN, nvec);
+ if (ret) {
+ cdx_disable_msi(cdx_dev);
+ kfree(vdev->cdx_irqs);
+ return ret;
+ }
+
+ for (msi_idx = 0; msi_idx < nvec; msi_idx++)
+ vdev->cdx_irqs[msi_idx].irq_no = msi_get_virq(dev, msi_idx);
+
+ vdev->msi_count = nvec;
+ vdev->config_msi = 1;
+
+ return 0;
+}
+
+static int vfio_cdx_msi_set_vector_signal(struct vfio_cdx_device *vdev,
+ int vector, int fd)
+{
+ struct eventfd_ctx *trigger;
+ int irq_no, ret;
+
+ if (vector < 0 || vector >= vdev->msi_count)
+ return -EINVAL;
+
+ irq_no = vdev->cdx_irqs[vector].irq_no;
+
+ if (vdev->cdx_irqs[vector].trigger) {
+ free_irq(irq_no, vdev->cdx_irqs[vector].trigger);
+ kfree(vdev->cdx_irqs[vector].name);
+ eventfd_ctx_put(vdev->cdx_irqs[vector].trigger);
+ vdev->cdx_irqs[vector].trigger = NULL;
+ }
+
+ if (fd < 0)
+ return 0;
+
+ vdev->cdx_irqs[vector].name = kasprintf(GFP_KERNEL, "vfio-msi[%d](%s)",
+ vector, dev_name(vdev->vdev.dev));
+ if (!vdev->cdx_irqs[vector].name)
+ return -ENOMEM;
+
+ trigger = eventfd_ctx_fdget(fd);
+ if (IS_ERR(trigger)) {
+ kfree(vdev->cdx_irqs[vector].name);
+ return PTR_ERR(trigger);
+ }
+
+ ret = request_irq(irq_no, vfio_cdx_msihandler, 0,
+ vdev->cdx_irqs[vector].name, trigger);
+ if (ret) {
+ kfree(vdev->cdx_irqs[vector].name);
+ eventfd_ctx_put(trigger);
+ return ret;
+ }
+
+ vdev->cdx_irqs[vector].trigger = trigger;
+
+ return 0;
+}
+
+static int vfio_cdx_msi_set_block(struct vfio_cdx_device *vdev,
+ unsigned int start, unsigned int count,
+ int32_t *fds)
+{
+ int i, j, ret = 0;
+
+ if (start >= vdev->msi_count || start + count > vdev->msi_count)
+ return -EINVAL;
+
+ for (i = 0, j = start; i < count && !ret; i++, j++) {
+ int fd = fds ? fds[i] : -1;
+
+ ret = vfio_cdx_msi_set_vector_signal(vdev, j, fd);
+ }
+
+ if (ret) {
+ for (--j; j >= (int)start; j--)
+ vfio_cdx_msi_set_vector_signal(vdev, j, -1);
+ }
+
+ return ret;
+}
+
+static void vfio_cdx_msi_disable(struct vfio_cdx_device *vdev)
+{
+ struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev);
+ struct device *dev = vdev->vdev.dev;
+
+ vfio_cdx_msi_set_block(vdev, 0, vdev->msi_count, NULL);
+
+ if (!vdev->config_msi)
+ return;
+
+ msi_domain_free_irqs_all(dev, MSI_DEFAULT_DOMAIN);
+ cdx_disable_msi(cdx_dev);
+ kfree(vdev->cdx_irqs);
+
+ vdev->cdx_irqs = NULL;
+ vdev->msi_count = 0;
+ vdev->config_msi = 0;
+}
+
+static int vfio_cdx_set_msi_trigger(struct vfio_cdx_device *vdev,
+ unsigned int index, unsigned int start,
+ unsigned int count, u32 flags,
+ void *data)
+{
+ struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev);
+ int i;
+
+ if (start + count > cdx_dev->num_msi)
+ return -EINVAL;
+
+ if (!count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
+ vfio_cdx_msi_disable(vdev);
+ return 0;
+ }
+
+ if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+ s32 *fds = data;
+ int ret;
+
+ if (vdev->config_msi)
+ return vfio_cdx_msi_set_block(vdev, start, count,
+ fds);
+ ret = vfio_cdx_msi_enable(vdev, cdx_dev->num_msi);
+ if (ret)
+ return ret;
+
+ ret = vfio_cdx_msi_set_block(vdev, start, count, fds);
+ if (ret)
+ vfio_cdx_msi_disable(vdev);
+
+ return ret;
+ }
+
+ for (i = start; i < start + count; i++) {
+ if (!vdev->cdx_irqs[i].trigger)
+ continue;
+ if (flags & VFIO_IRQ_SET_DATA_NONE) {
+ eventfd_signal(vdev->cdx_irqs[i].trigger);
+ } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+ u8 *bools = data;
+
+ if (bools[i - start])
+ eventfd_signal(vdev->cdx_irqs[i].trigger);
+ }
+ }
+
+ return 0;
+}
+
+int vfio_cdx_set_irqs_ioctl(struct vfio_cdx_device *vdev,
+ u32 flags, unsigned int index,
+ unsigned int start, unsigned int count,
+ void *data)
+{
+ if (flags & VFIO_IRQ_SET_ACTION_TRIGGER)
+ return vfio_cdx_set_msi_trigger(vdev, index, start,
+ count, flags, data);
+ else
+ return -EINVAL;
+}
+
+/* Free All IRQs for the given device */
+void vfio_cdx_irqs_cleanup(struct vfio_cdx_device *vdev)
+{
+ /*
+ * Device does not support any interrupt or the interrupts
+ * were not configured
+ */
+ if (!vdev->cdx_irqs)
+ return;
+
+ vfio_cdx_set_msi_trigger(vdev, 0, 0, 0, VFIO_IRQ_SET_DATA_NONE, NULL);
+}
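
For context only (not part of the patch): a minimal userspace sketch of the VFIO_DEVICE_SET_IRQS call that ends up in vfio_cdx_set_msi_trigger() above. It assumes "device_fd" is an already-opened VFIO device fd and uses only the standard vfio_irq_set UAPI; index 0 is the single MSI index the driver now exposes.

#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int cdx_enable_msi_vector(int device_fd, unsigned int vector)
{
	char buf[sizeof(struct vfio_irq_set) + sizeof(int)];
	struct vfio_irq_set *irq_set = (struct vfio_irq_set *)buf;
	int efd = eventfd(0, EFD_CLOEXEC);

	if (efd < 0)
		return -1;

	irq_set->argsz = sizeof(buf);
	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
			 VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = 0;		/* only MSI index implemented above */
	irq_set->start = vector;
	irq_set->count = 1;
	memcpy(irq_set->data, &efd, sizeof(int));

	/* Routed to vfio_cdx_set_irqs_ioctl() -> vfio_cdx_set_msi_trigger() */
	if (ioctl(device_fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
		close(efd);
		return -1;
	}
	return efd;	/* poll/read this fd to consume interrupts */
}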
diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c
index 9cff8d75789e..5dd5f5ad7686 100644
--- a/drivers/vfio/cdx/main.c
+++ b/drivers/vfio/cdx/main.c
@@ -61,6 +61,7 @@ static void vfio_cdx_close_device(struct vfio_device *core_vdev)
kfree(vdev->regions);
cdx_dev_reset(core_vdev->dev);
+ vfio_cdx_irqs_cleanup(vdev);
}
static int vfio_cdx_bm_ctrl(struct vfio_device *core_vdev, u32 flags,
@@ -123,7 +124,7 @@ static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev,
info.flags |= VFIO_DEVICE_FLAGS_RESET;
info.num_regions = cdx_dev->res_count;
- info.num_irqs = 0;
+ info.num_irqs = cdx_dev->num_msi ? 1 : 0;
return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
}
@@ -152,6 +153,62 @@ static int vfio_cdx_ioctl_get_region_info(struct vfio_cdx_device *vdev,
return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
}
+static int vfio_cdx_ioctl_get_irq_info(struct vfio_cdx_device *vdev,
+ struct vfio_irq_info __user *arg)
+{
+ unsigned long minsz = offsetofend(struct vfio_irq_info, count);
+ struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev);
+ struct vfio_irq_info info;
+
+ if (copy_from_user(&info, arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ if (info.index >= 1)
+ return -EINVAL;
+
+ if (!cdx_dev->num_msi)
+ return -EINVAL;
+
+ info.flags = VFIO_IRQ_INFO_EVENTFD | VFIO_IRQ_INFO_NORESIZE;
+ info.count = cdx_dev->num_msi;
+
+ return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
+}
+
+static int vfio_cdx_ioctl_set_irqs(struct vfio_cdx_device *vdev,
+ struct vfio_irq_set __user *arg)
+{
+ unsigned long minsz = offsetofend(struct vfio_irq_set, count);
+ struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev);
+ struct vfio_irq_set hdr;
+ size_t data_size = 0;
+ u8 *data = NULL;
+ int ret = 0;
+
+ if (copy_from_user(&hdr, arg, minsz))
+ return -EFAULT;
+
+ ret = vfio_set_irqs_validate_and_prepare(&hdr, cdx_dev->num_msi,
+ 1, &data_size);
+ if (ret)
+ return ret;
+
+ if (data_size) {
+ data = memdup_user(arg->data, data_size);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+ }
+
+ ret = vfio_cdx_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
+ hdr.start, hdr.count, data);
+ kfree(data);
+
+ return ret;
+}
+
static long vfio_cdx_ioctl(struct vfio_device *core_vdev,
unsigned int cmd, unsigned long arg)
{
@@ -164,6 +221,10 @@ static long vfio_cdx_ioctl(struct vfio_device *core_vdev,
return vfio_cdx_ioctl_get_info(vdev, uarg);
case VFIO_DEVICE_GET_REGION_INFO:
return vfio_cdx_ioctl_get_region_info(vdev, uarg);
+ case VFIO_DEVICE_GET_IRQ_INFO:
+ return vfio_cdx_ioctl_get_irq_info(vdev, uarg);
+ case VFIO_DEVICE_SET_IRQS:
+ return vfio_cdx_ioctl_set_irqs(vdev, uarg);
case VFIO_DEVICE_RESET:
return cdx_dev_reset(core_vdev->dev);
default:
@@ -286,4 +347,4 @@ module_driver(vfio_cdx_driver, cdx_driver_register, cdx_driver_unregister);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("VFIO for CDX devices - User Level meta-driver");
-MODULE_IMPORT_NS(CDX_BUS);
+MODULE_IMPORT_NS("CDX_BUS");
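
And the query side added by vfio_cdx_ioctl_get_irq_info(), again a hedged sketch rather than anything from the patch: userspace can read the MSI vector count through VFIO_DEVICE_GET_IRQ_INFO before programming eventfds as in the sketch above.

#include <sys/ioctl.h>
#include <linux/vfio.h>

static int cdx_msi_count(int device_fd, unsigned int *count)
{
	struct vfio_irq_info info = {
		.argsz = sizeof(info),
		.index = 0,		/* only index 0 is implemented above */
	};

	if (ioctl(device_fd, VFIO_DEVICE_GET_IRQ_INFO, &info))
		return -1;

	*count = info.count;		/* cdx_dev->num_msi */
	return 0;
}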
diff --git a/drivers/vfio/cdx/private.h b/drivers/vfio/cdx/private.h
index 8e9d25913728..dc56729b3114 100644
--- a/drivers/vfio/cdx/private.h
+++ b/drivers/vfio/cdx/private.h
@@ -13,6 +13,14 @@ static inline u64 vfio_cdx_index_to_offset(u32 index)
return ((u64)(index) << VFIO_CDX_OFFSET_SHIFT);
}
+struct vfio_cdx_irq {
+ u32 flags;
+ u32 count;
+ int irq_no;
+ struct eventfd_ctx *trigger;
+ char *name;
+};
+
struct vfio_cdx_region {
u32 flags;
u32 type;
@@ -23,8 +31,18 @@ struct vfio_cdx_region {
struct vfio_cdx_device {
struct vfio_device vdev;
struct vfio_cdx_region *regions;
+ struct vfio_cdx_irq *cdx_irqs;
u32 flags;
#define BME_SUPPORT BIT(0)
+ u32 msi_count;
+ u8 config_msi;
};
+int vfio_cdx_set_irqs_ioctl(struct vfio_cdx_device *vdev,
+ u32 flags, unsigned int index,
+ unsigned int start, unsigned int count,
+ void *data);
+
+void vfio_cdx_irqs_cleanup(struct vfio_cdx_device *vdev);
+
#endif /* VFIO_CDX_PRIVATE_H */
diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
index e75da0a70d1f..281a8dc3ed49 100644
--- a/drivers/vfio/device_cdev.c
+++ b/drivers/vfio/device_cdev.c
@@ -39,6 +39,13 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
filep->private_data = df;
+ /*
+ * Use the pseudo fs inode on the device to link all mmaps
+ * to the same address space, allowing us to unmap all vmas
+ * associated to this device using unmap_mapping_range().
+ */
+ filep->f_mapping = device->inode->i_mapping;
+
return 0;
err_put_registration:
@@ -155,9 +162,9 @@ void vfio_df_unbind_iommufd(struct vfio_device_file *df)
int vfio_df_ioctl_attach_pt(struct vfio_device_file *df,
struct vfio_device_attach_iommufd_pt __user *arg)
{
- struct vfio_device *device = df->device;
struct vfio_device_attach_iommufd_pt attach;
- unsigned long minsz;
+ struct vfio_device *device = df->device;
+ unsigned long minsz, xend = 0;
int ret;
minsz = offsetofend(struct vfio_device_attach_iommufd_pt, pt_id);
@@ -165,11 +172,34 @@ int vfio_df_ioctl_attach_pt(struct vfio_device_file *df,
if (copy_from_user(&attach, arg, minsz))
return -EFAULT;
- if (attach.argsz < minsz || attach.flags)
+ if (attach.argsz < minsz)
return -EINVAL;
+ if (attach.flags & ~VFIO_DEVICE_ATTACH_PASID)
+ return -EINVAL;
+
+ if (attach.flags & VFIO_DEVICE_ATTACH_PASID) {
+ if (!device->ops->pasid_attach_ioas)
+ return -EOPNOTSUPP;
+ xend = offsetofend(struct vfio_device_attach_iommufd_pt, pasid);
+ }
+
+ if (xend) {
+ if (attach.argsz < xend)
+ return -EINVAL;
+
+ if (copy_from_user((void *)&attach + minsz,
+ (void __user *)arg + minsz, xend - minsz))
+ return -EFAULT;
+ }
+
mutex_lock(&device->dev_set->lock);
- ret = device->ops->attach_ioas(device, &attach.pt_id);
+ if (attach.flags & VFIO_DEVICE_ATTACH_PASID)
+ ret = device->ops->pasid_attach_ioas(device,
+ attach.pasid,
+ &attach.pt_id);
+ else
+ ret = device->ops->attach_ioas(device, &attach.pt_id);
if (ret)
goto out_unlock;
@@ -191,20 +221,41 @@ out_unlock:
int vfio_df_ioctl_detach_pt(struct vfio_device_file *df,
struct vfio_device_detach_iommufd_pt __user *arg)
{
- struct vfio_device *device = df->device;
struct vfio_device_detach_iommufd_pt detach;
- unsigned long minsz;
+ struct vfio_device *device = df->device;
+ unsigned long minsz, xend = 0;
minsz = offsetofend(struct vfio_device_detach_iommufd_pt, flags);
if (copy_from_user(&detach, arg, minsz))
return -EFAULT;
- if (detach.argsz < minsz || detach.flags)
+ if (detach.argsz < minsz)
+ return -EINVAL;
+
+ if (detach.flags & ~VFIO_DEVICE_DETACH_PASID)
return -EINVAL;
+ if (detach.flags & VFIO_DEVICE_DETACH_PASID) {
+ if (!device->ops->pasid_detach_ioas)
+ return -EOPNOTSUPP;
+ xend = offsetofend(struct vfio_device_detach_iommufd_pt, pasid);
+ }
+
+ if (xend) {
+ if (detach.argsz < xend)
+ return -EINVAL;
+
+ if (copy_from_user((void *)&detach + minsz,
+ (void __user *)arg + minsz, xend - minsz))
+ return -EFAULT;
+ }
+
mutex_lock(&device->dev_set->lock);
- device->ops->detach_ioas(device);
+ if (detach.flags & VFIO_DEVICE_DETACH_PASID)
+ device->ops->pasid_detach_ioas(device, detach.pasid);
+ else
+ device->ops->detach_ioas(device);
mutex_unlock(&device->dev_set->lock);
return 0;
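
Not part of the patch, but for illustration: a minimal userspace sketch of the extended attach path handled in vfio_df_ioctl_attach_pt() above. It assumes an iommufd-bound vfio cdev fd ("device_fd") and a page-table id already allocated through iommufd, and simply exercises the new flag and pasid field.

#include <sys/ioctl.h>
#include <linux/types.h>
#include <linux/vfio.h>

static int attach_ioas_to_pasid(int device_fd, __u32 pt_id, __u32 pasid)
{
	struct vfio_device_attach_iommufd_pt attach = {
		.argsz = sizeof(attach),
		.flags = VFIO_DEVICE_ATTACH_PASID,
		.pt_id = pt_id,
		.pasid = pasid,
	};

	/* Returns -EOPNOTSUPP if the driver has no pasid_attach_ioas op */
	return ioctl(device_fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach);
}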
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
index 82b2afa9b7e3..7e7988c4258f 100644
--- a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
+++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
@@ -108,10 +108,10 @@ static int vfio_fsl_mc_set_irq_trigger(struct vfio_fsl_mc_device *vdev,
void *data)
{
struct fsl_mc_device *mc_dev = vdev->mc_dev;
- int ret, hwirq;
struct vfio_fsl_mc_irq *irq;
struct device *cont_dev = fsl_mc_cont_dev(&mc_dev->dev);
struct fsl_mc_device *mc_cont = to_fsl_mc_device(cont_dev);
+ int ret;
if (!count && (flags & VFIO_IRQ_SET_DATA_NONE))
return vfio_set_trigger(vdev, index, -1);
@@ -136,8 +136,6 @@ static int vfio_fsl_mc_set_irq_trigger(struct vfio_fsl_mc_device *vdev,
return vfio_set_trigger(vdev, index, fd);
}
- hwirq = vdev->mc_dev->irqs[index]->virq;
-
irq = &vdev->mc_irqs[index];
if (flags & VFIO_IRQ_SET_DATA_NONE) {
diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
index 610a429c6191..c321d442f0da 100644
--- a/drivers/vfio/group.c
+++ b/drivers/vfio/group.c
@@ -104,15 +104,14 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group,
{
struct vfio_container *container;
struct iommufd_ctx *iommufd;
- struct fd f;
int ret;
int fd;
if (get_user(fd, arg))
return -EFAULT;
- f = fdget(fd);
- if (!f.file)
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return -EBADF;
mutex_lock(&group->group_lock);
@@ -125,13 +124,13 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group,
goto out_unlock;
}
- container = vfio_container_from_file(f.file);
+ container = vfio_container_from_file(fd_file(f));
if (container) {
ret = vfio_container_attach_group(container, group);
goto out_unlock;
}
- iommufd = iommufd_ctx_from_file(f.file);
+ iommufd = iommufd_ctx_from_file(fd_file(f));
if (!IS_ERR(iommufd)) {
if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) &&
group->type == VFIO_NO_IOMMU)
@@ -153,7 +152,6 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group,
out_unlock:
mutex_unlock(&group->group_lock);
- fdput(f);
return ret;
}
@@ -268,23 +266,18 @@ static struct file *vfio_device_open_file(struct vfio_device *device)
if (ret)
goto err_free;
- /*
- * We can't use anon_inode_getfd() because we need to modify
- * the f_mode flags directly to allow more than just ioctls
- */
- filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
- df, O_RDWR);
+ filep = anon_inode_getfile_fmode("[vfio-device]", &vfio_device_fops,
+ df, O_RDWR, FMODE_PREAD | FMODE_PWRITE);
if (IS_ERR(filep)) {
ret = PTR_ERR(filep);
goto err_close_device;
}
-
/*
- * TODO: add an anon_inode interface to do this.
- * Appears to be missing by lack of need rather than
- * explicitly prevented. Now there's need.
+ * Use the pseudo fs inode on the device to link all mmaps
+ * to the same address space, allowing us to unmap all vmas
+ * associated to this device using unmap_mapping_range().
*/
- filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE);
+ filep->f_mapping = device->inode->i_mapping;
if (device->group->type == VFIO_NO_IOMMU)
dev_warn(device->dev, "vfio-noiommu device opened by user "
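
The fdget()/fdput() conversion above relies on the kernel's scope-based cleanup guard for struct fd; a rough in-kernel sketch of the pattern (illustrative only, not taken from this patch):

#include <linux/cleanup.h>
#include <linux/err.h>
#include <linux/file.h>

static int example_use_of_fd_guard(int fd)
{
	CLASS(fd, f)(fd);	/* fdput() runs automatically at scope exit */

	if (fd_empty(f))
		return -EBADF;

	/* ... work with fd_file(f); no fdput() needed on any return path ... */
	return 0;
}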
diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
index 82eba6966fa5..c8c3a2d53f86 100644
--- a/drivers/vfio/iommufd.c
+++ b/drivers/vfio/iommufd.c
@@ -7,8 +7,8 @@
#include "vfio.h"
-MODULE_IMPORT_NS(IOMMUFD);
-MODULE_IMPORT_NS(IOMMUFD_VFIO);
+MODULE_IMPORT_NS("IOMMUFD");
+MODULE_IMPORT_NS("IOMMUFD_VFIO");
bool vfio_iommufd_device_has_compat_ioas(struct vfio_device *vdev,
struct iommufd_ctx *ictx)
@@ -119,16 +119,24 @@ int vfio_iommufd_physical_bind(struct vfio_device *vdev,
if (IS_ERR(idev))
return PTR_ERR(idev);
vdev->iommufd_device = idev;
+ ida_init(&vdev->pasids);
return 0;
}
EXPORT_SYMBOL_GPL(vfio_iommufd_physical_bind);
void vfio_iommufd_physical_unbind(struct vfio_device *vdev)
{
+ int pasid;
+
lockdep_assert_held(&vdev->dev_set->lock);
+ while ((pasid = ida_find_first(&vdev->pasids)) >= 0) {
+ iommufd_device_detach(vdev->iommufd_device, pasid);
+ ida_free(&vdev->pasids, pasid);
+ }
+
if (vdev->iommufd_attached) {
- iommufd_device_detach(vdev->iommufd_device);
+ iommufd_device_detach(vdev->iommufd_device, IOMMU_NO_PASID);
vdev->iommufd_attached = false;
}
iommufd_device_unbind(vdev->iommufd_device);
@@ -146,9 +154,11 @@ int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id)
return -EINVAL;
if (vdev->iommufd_attached)
- rc = iommufd_device_replace(vdev->iommufd_device, pt_id);
+ rc = iommufd_device_replace(vdev->iommufd_device,
+ IOMMU_NO_PASID, pt_id);
else
- rc = iommufd_device_attach(vdev->iommufd_device, pt_id);
+ rc = iommufd_device_attach(vdev->iommufd_device,
+ IOMMU_NO_PASID, pt_id);
if (rc)
return rc;
vdev->iommufd_attached = true;
@@ -163,11 +173,53 @@ void vfio_iommufd_physical_detach_ioas(struct vfio_device *vdev)
if (WARN_ON(!vdev->iommufd_device) || !vdev->iommufd_attached)
return;
- iommufd_device_detach(vdev->iommufd_device);
+ iommufd_device_detach(vdev->iommufd_device, IOMMU_NO_PASID);
vdev->iommufd_attached = false;
}
EXPORT_SYMBOL_GPL(vfio_iommufd_physical_detach_ioas);
+int vfio_iommufd_physical_pasid_attach_ioas(struct vfio_device *vdev,
+ u32 pasid, u32 *pt_id)
+{
+ int rc;
+
+ lockdep_assert_held(&vdev->dev_set->lock);
+
+ if (WARN_ON(!vdev->iommufd_device))
+ return -EINVAL;
+
+ if (ida_exists(&vdev->pasids, pasid))
+ return iommufd_device_replace(vdev->iommufd_device,
+ pasid, pt_id);
+
+ rc = ida_alloc_range(&vdev->pasids, pasid, pasid, GFP_KERNEL);
+ if (rc < 0)
+ return rc;
+
+ rc = iommufd_device_attach(vdev->iommufd_device, pasid, pt_id);
+ if (rc)
+ ida_free(&vdev->pasids, pasid);
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_physical_pasid_attach_ioas);
+
+void vfio_iommufd_physical_pasid_detach_ioas(struct vfio_device *vdev,
+ u32 pasid)
+{
+ lockdep_assert_held(&vdev->dev_set->lock);
+
+ if (WARN_ON(!vdev->iommufd_device))
+ return;
+
+ if (!ida_exists(&vdev->pasids, pasid))
+ return;
+
+ iommufd_device_detach(vdev->iommufd_device, pasid);
+ ida_free(&vdev->pasids, pasid);
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_physical_pasid_detach_ioas);
+
/*
* The emulated standard ops mean that vfio_device is going to use the
* "mdev path" and will call vfio_pin_pages()/vfio_dma_rw(). Drivers using this
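
A driver opts in to the PASID helpers exported above through two vfio_device_ops callbacks; the sketch below is illustrative only (the ops structure and driver name are hypothetical, the helper names are the ones exported in this patch).

#include <linux/vfio.h>

static const struct vfio_device_ops example_pasid_capable_ops = {
	.name			= "example-vfio-dev",	/* hypothetical */
	.bind_iommufd		= vfio_iommufd_physical_bind,
	.unbind_iommufd		= vfio_iommufd_physical_unbind,
	.attach_ioas		= vfio_iommufd_physical_attach_ioas,
	.detach_ioas		= vfio_iommufd_physical_detach_ioas,
	.pasid_attach_ioas	= vfio_iommufd_physical_pasid_attach_ioas,
	.pasid_detach_ioas	= vfio_iommufd_physical_pasid_detach_ioas,
	/* .open_device, .close_device, .read, .write, .mmap, .ioctl ... */
};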
diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index ed4737de4528..f2e686f8f1ef 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -76,7 +76,7 @@ int mdev_register_parent(struct mdev_parent *parent, struct device *dev,
if (ret)
return ret;
- ret = class_compat_create_link(mdev_bus_compat_class, dev, NULL);
+ ret = class_compat_create_link(mdev_bus_compat_class, dev);
if (ret)
dev_warn(dev, "Failed to create compatibility class link\n");
@@ -98,7 +98,7 @@ void mdev_unregister_parent(struct mdev_parent *parent)
dev_info(parent->dev, "MDEV: Unregistering\n");
down_write(&parent->unreg_sem);
- class_compat_remove_link(mdev_bus_compat_class, parent->dev, NULL);
+ class_compat_remove_link(mdev_bus_compat_class, parent->dev);
device_for_each_child(parent->dev, NULL, mdev_device_remove_cb);
parent_remove_sysfs_files(parent);
up_write(&parent->unreg_sem);
diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c
index b98322966b3e..ad5b834806ff 100644
--- a/drivers/vfio/mdev/mdev_driver.c
+++ b/drivers/vfio/mdev/mdev_driver.c
@@ -31,7 +31,7 @@ static void mdev_remove(struct device *dev)
drv->remove(to_mdev_device(dev));
}
-static int mdev_match(struct device *dev, struct device_driver *drv)
+static int mdev_match(struct device *dev, const struct device_driver *drv)
{
/*
* No drivers automatically match. Drivers are only bound by explicit
diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
index 63a1316b08b7..5f61acd0fe42 100644
--- a/drivers/vfio/mdev/mdev_private.h
+++ b/drivers/vfio/mdev/mdev_private.h
@@ -10,9 +10,6 @@
#ifndef MDEV_PRIVATE_H
#define MDEV_PRIVATE_H
-int mdev_bus_register(void);
-void mdev_bus_unregister(void);
-
extern const struct bus_type mdev_bus_type;
extern const struct attribute_group *mdev_device_groups[];
diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c
index 9d2738e10c0b..e44bb44c581e 100644
--- a/drivers/vfio/mdev/mdev_sysfs.c
+++ b/drivers/vfio/mdev/mdev_sysfs.c
@@ -160,7 +160,7 @@ static void mdev_type_release(struct kobject *kobj)
put_device(type->parent->dev);
}
-static struct kobj_type mdev_type_ktype = {
+static const struct kobj_type mdev_type_ktype = {
.sysfs_ops = &mdev_type_sysfs_ops,
.release = mdev_type_release,
.default_groups = mdev_type_groups,
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 15821a2d77d2..c3bcb6911c53 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -7,10 +7,6 @@ config VFIO_PCI_CORE
select VFIO_VIRQFD
select IRQ_BYPASS_MANAGER
-config VFIO_PCI_MMAP
- def_bool y if !S390
- depends on VFIO_PCI_CORE
-
config VFIO_PCI_INTX
def_bool y if !S390
depends on VFIO_PCI_CORE
@@ -69,4 +65,6 @@ source "drivers/vfio/pci/virtio/Kconfig"
source "drivers/vfio/pci/nvgrace-gpu/Kconfig"
+source "drivers/vfio/pci/qat/Kconfig"
+
endmenu
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index ce7a61f1d912..cf00c0a7e55c 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -17,3 +17,5 @@ obj-$(CONFIG_PDS_VFIO_PCI) += pds/
obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/
obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu/
+
+obj-$(CONFIG_QAT_VFIO_PCI) += qat/
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index 9a3e97108ace..2149f49aeec7 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -190,9 +190,10 @@ static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data)
int ret;
/* Check VF state */
- if (unlikely(hisi_qm_wait_mb_ready(qm))) {
+ ret = hisi_qm_wait_mb_ready(qm);
+ if (unlikely(ret)) {
dev_err(&qm->pdev->dev, "QM device is not ready to write\n");
- return -EBUSY;
+ return ret;
}
ret = qm_write_regs(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1);
@@ -325,13 +326,15 @@ static void qm_dev_cmd_init(struct hisi_qm *qm)
static int vf_qm_cache_wb(struct hisi_qm *qm)
{
unsigned int val;
+ int ret;
writel(0x1, qm->io_base + QM_CACHE_WB_START);
- if (readl_relaxed_poll_timeout(qm->io_base + QM_CACHE_WB_DONE,
+ ret = readl_relaxed_poll_timeout(qm->io_base + QM_CACHE_WB_DONE,
val, val & BIT(0), MB_POLL_PERIOD_US,
- MB_POLL_TIMEOUT_US)) {
+ MB_POLL_TIMEOUT_US);
+ if (ret) {
dev_err(&qm->pdev->dev, "vf QM writeback sqc cache fail\n");
- return -EINVAL;
+ return ret;
}
return 0;
@@ -350,6 +353,32 @@ static int vf_qm_func_stop(struct hisi_qm *qm)
return hisi_qm_mb(qm, QM_MB_CMD_PAUSE_QM, 0, 0, 0);
}
+static int vf_qm_version_check(struct acc_vf_data *vf_data, struct device *dev)
+{
+ switch (vf_data->acc_magic) {
+ case ACC_DEV_MAGIC_V2:
+ if (vf_data->major_ver != ACC_DRV_MAJOR_VER) {
+ dev_info(dev, "migration driver version<%u.%u> not match!\n",
+ vf_data->major_ver, vf_data->minor_ver);
+ return -EINVAL;
+ }
+ break;
+ case ACC_DEV_MAGIC_V1:
+ /* Correct dma address */
+ vf_data->eqe_dma = vf_data->qm_eqc_dw[QM_XQC_ADDR_HIGH];
+ vf_data->eqe_dma <<= QM_XQC_ADDR_OFFSET;
+ vf_data->eqe_dma |= vf_data->qm_eqc_dw[QM_XQC_ADDR_LOW];
+ vf_data->aeqe_dma = vf_data->qm_aeqc_dw[QM_XQC_ADDR_HIGH];
+ vf_data->aeqe_dma <<= QM_XQC_ADDR_OFFSET;
+ vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[QM_XQC_ADDR_LOW];
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev,
struct hisi_acc_vf_migration_file *migf)
{
@@ -363,9 +392,10 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev,
if (migf->total_length < QM_MATCH_SIZE || hisi_acc_vdev->match_done)
return 0;
- if (vf_data->acc_magic != ACC_DEV_MAGIC) {
+ ret = vf_qm_version_check(vf_data, dev);
+ if (ret) {
dev_err(dev, "failed to match ACC_DEV_MAGIC\n");
- return -EINVAL;
+ return ret;
}
if (vf_data->dev_id != hisi_acc_vdev->vf_dev->device) {
@@ -377,7 +407,7 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev,
ret = qm_get_vft(vf_qm, &vf_qm->qp_base);
if (ret <= 0) {
dev_err(dev, "failed to get vft qp nums\n");
- return -EINVAL;
+ return ret;
}
if (ret != vf_data->qp_num) {
@@ -399,13 +429,6 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev,
return -EINVAL;
}
- ret = qm_write_regs(vf_qm, QM_VF_STATE, &vf_data->vf_qm_state, 1);
- if (ret) {
- dev_err(dev, "failed to write QM_VF_STATE\n");
- return ret;
- }
-
- hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
hisi_acc_vdev->match_done = true;
return 0;
}
@@ -418,7 +441,9 @@ static int vf_qm_get_match_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
int vf_id = hisi_acc_vdev->vf_id;
int ret;
- vf_data->acc_magic = ACC_DEV_MAGIC;
+ vf_data->acc_magic = ACC_DEV_MAGIC_V2;
+ vf_data->major_ver = ACC_DRV_MAJOR_VER;
+ vf_data->minor_ver = ACC_DRV_MINOR_VER;
/* Save device id */
vf_data->dev_id = hisi_acc_vdev->vf_dev->device;
@@ -441,6 +466,19 @@ static int vf_qm_get_match_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
return 0;
}
+static void vf_qm_xeqc_save(struct hisi_qm *qm,
+ struct hisi_acc_vf_migration_file *migf)
+{
+ struct acc_vf_data *vf_data = &migf->vf_data;
+ u16 eq_head, aeq_head;
+
+ eq_head = vf_data->qm_eqc_dw[0] & 0xFFFF;
+ qm_db(qm, 0, QM_DOORBELL_CMD_EQ, eq_head, 0);
+
+ aeq_head = vf_data->qm_aeqc_dw[0] & 0xFFFF;
+ qm_db(qm, 0, QM_DOORBELL_CMD_AEQ, aeq_head, 0);
+}
+
static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
struct hisi_acc_vf_migration_file *migf)
{
@@ -456,6 +494,20 @@ static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
if (migf->total_length < sizeof(struct acc_vf_data))
return -EINVAL;
+ if (!vf_data->eqe_dma || !vf_data->aeqe_dma ||
+ !vf_data->sqc_dma || !vf_data->cqc_dma) {
+ dev_info(dev, "resume dma addr is NULL!\n");
+ hisi_acc_vdev->vf_qm_state = QM_NOT_READY;
+ return 0;
+ }
+
+ ret = qm_write_regs(qm, QM_VF_STATE, &vf_data->vf_qm_state, 1);
+ if (ret) {
+ dev_err(dev, "failed to write QM_VF_STATE\n");
+ return ret;
+ }
+ hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
+
qm->eqe_dma = vf_data->eqe_dma;
qm->aeqe_dma = vf_data->aeqe_dma;
qm->sqc_dma = vf_data->sqc_dma;
@@ -486,57 +538,65 @@ static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
return 0;
}
-static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev,
- struct hisi_acc_vf_migration_file *migf)
+static int vf_qm_read_data(struct hisi_qm *vf_qm, struct acc_vf_data *vf_data)
{
- struct acc_vf_data *vf_data = &migf->vf_data;
- struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
struct device *dev = &vf_qm->pdev->dev;
int ret;
- if (unlikely(qm_wait_dev_not_ready(vf_qm))) {
- /* Update state and return with match data */
- vf_data->vf_qm_state = QM_NOT_READY;
- hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
- migf->total_length = QM_MATCH_SIZE;
- return 0;
- }
-
- vf_data->vf_qm_state = QM_READY;
- hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
-
- ret = vf_qm_cache_wb(vf_qm);
- if (ret) {
- dev_err(dev, "failed to writeback QM Cache!\n");
- return ret;
- }
-
ret = qm_get_regs(vf_qm, vf_data);
if (ret)
- return -EINVAL;
+ return ret;
/* Every reg is 32 bit, the dma address is 64 bit. */
- vf_data->eqe_dma = vf_data->qm_eqc_dw[1];
+ vf_data->eqe_dma = vf_data->qm_eqc_dw[QM_XQC_ADDR_HIGH];
vf_data->eqe_dma <<= QM_XQC_ADDR_OFFSET;
- vf_data->eqe_dma |= vf_data->qm_eqc_dw[0];
- vf_data->aeqe_dma = vf_data->qm_aeqc_dw[1];
+ vf_data->eqe_dma |= vf_data->qm_eqc_dw[QM_XQC_ADDR_LOW];
+ vf_data->aeqe_dma = vf_data->qm_aeqc_dw[QM_XQC_ADDR_HIGH];
vf_data->aeqe_dma <<= QM_XQC_ADDR_OFFSET;
- vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[0];
+ vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[QM_XQC_ADDR_LOW];
/* Through SQC_BT/CQC_BT to get sqc and cqc address */
ret = qm_get_sqc(vf_qm, &vf_data->sqc_dma);
if (ret) {
dev_err(dev, "failed to read SQC addr!\n");
- return -EINVAL;
+ return ret;
}
ret = qm_get_cqc(vf_qm, &vf_data->cqc_dma);
if (ret) {
dev_err(dev, "failed to read CQC addr!\n");
- return -EINVAL;
+ return ret;
}
+ return 0;
+}
+
+static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+ struct hisi_acc_vf_migration_file *migf)
+{
+ struct acc_vf_data *vf_data = &migf->vf_data;
+ struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+ int ret;
+
+ if (unlikely(qm_wait_dev_not_ready(vf_qm))) {
+ /* Update state and return with match data */
+ vf_data->vf_qm_state = QM_NOT_READY;
+ hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
+ migf->total_length = QM_MATCH_SIZE;
+ return 0;
+ }
+
+ vf_data->vf_qm_state = QM_READY;
+ hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
+
+ ret = vf_qm_read_data(vf_qm, vf_data);
+ if (ret)
+ return ret;
+
migf->total_length = sizeof(struct acc_vf_data);
+ /* Save eqc and aeqc interrupt information */
+ vf_qm_xeqc_save(vf_qm, migf);
+
return 0;
}
@@ -615,21 +675,43 @@ static void hisi_acc_vf_disable_fd(struct hisi_acc_vf_migration_file *migf)
mutex_unlock(&migf->lock);
}
+static void
+hisi_acc_debug_migf_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+ struct hisi_acc_vf_migration_file *src_migf)
+{
+ struct hisi_acc_vf_migration_file *dst_migf = hisi_acc_vdev->debug_migf;
+
+ if (!dst_migf)
+ return;
+
+ dst_migf->total_length = src_migf->total_length;
+ memcpy(&dst_migf->vf_data, &src_migf->vf_data,
+ sizeof(struct acc_vf_data));
+}
+
static void hisi_acc_vf_disable_fds(struct hisi_acc_vf_core_device *hisi_acc_vdev)
{
if (hisi_acc_vdev->resuming_migf) {
+ hisi_acc_debug_migf_copy(hisi_acc_vdev, hisi_acc_vdev->resuming_migf);
hisi_acc_vf_disable_fd(hisi_acc_vdev->resuming_migf);
fput(hisi_acc_vdev->resuming_migf->filp);
hisi_acc_vdev->resuming_migf = NULL;
}
if (hisi_acc_vdev->saving_migf) {
+ hisi_acc_debug_migf_copy(hisi_acc_vdev, hisi_acc_vdev->saving_migf);
hisi_acc_vf_disable_fd(hisi_acc_vdev->saving_migf);
fput(hisi_acc_vdev->saving_migf->filp);
hisi_acc_vdev->saving_migf = NULL;
}
}
+static struct hisi_acc_vf_core_device *hisi_acc_get_vf_dev(struct vfio_device *vdev)
+{
+ return container_of(vdev, struct hisi_acc_vf_core_device,
+ core_device.vdev);
+}
+
static void hisi_acc_vf_reset(struct hisi_acc_vf_core_device *hisi_acc_vdev)
{
hisi_acc_vdev->vf_qm_state = QM_NOT_READY;
@@ -723,7 +805,6 @@ static const struct file_operations hisi_acc_vf_resume_fops = {
.owner = THIS_MODULE,
.write = hisi_acc_vf_resume_write,
.release = hisi_acc_vf_release_file,
- .llseek = no_llseek,
};
static struct hisi_acc_vf_migration_file *
@@ -845,7 +926,6 @@ static const struct file_operations hisi_acc_vf_save_fops = {
.unlocked_ioctl = hisi_acc_vf_precopy_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.release = hisi_acc_vf_release_file,
- .llseek = no_llseek,
};
static struct hisi_acc_vf_migration_file *
@@ -935,6 +1015,13 @@ static int hisi_acc_vf_stop_device(struct hisi_acc_vf_core_device *hisi_acc_vdev
dev_err(dev, "failed to check QM INT state!\n");
return ret;
}
+
+ ret = vf_qm_cache_wb(vf_qm);
+ if (ret) {
+ dev_err(dev, "failed to writeback QM cache!\n");
+ return ret;
+ }
+
return 0;
}
@@ -1033,8 +1120,7 @@ static struct file *
hisi_acc_vfio_pci_set_device_state(struct vfio_device *vdev,
enum vfio_device_mig_state new_state)
{
- struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(vdev,
- struct hisi_acc_vf_core_device, core_device.vdev);
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev);
enum vfio_device_mig_state next_state;
struct file *res = NULL;
int ret;
@@ -1075,8 +1161,7 @@ static int
hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev,
enum vfio_device_mig_state *curr_state)
{
- struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(vdev,
- struct hisi_acc_vf_core_device, core_device.vdev);
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev);
mutex_lock(&hisi_acc_vdev->state_mutex);
*curr_state = hisi_acc_vdev->mig_state;
@@ -1278,10 +1363,132 @@ static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int
return vfio_pci_core_ioctl(core_vdev, cmd, arg);
}
+static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev)
+{
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev);
+ struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+ int ret;
+
+ lockdep_assert_held(&hisi_acc_vdev->open_mutex);
+ /*
+ * When the device is not opened, the io_base is not mapped.
+ * The driver cannot perform device read and write operations.
+ */
+ if (!hisi_acc_vdev->dev_opened) {
+ seq_puts(seq, "device not opened!\n");
+ return -EINVAL;
+ }
+
+ ret = qm_wait_dev_not_ready(vf_qm);
+ if (ret) {
+ seq_puts(seq, "VF device not ready!\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+static int hisi_acc_vf_debug_cmd(struct seq_file *seq, void *data)
+{
+ struct device *vf_dev = seq->private;
+ struct vfio_pci_core_device *core_device = dev_get_drvdata(vf_dev);
+ struct vfio_device *vdev = &core_device->vdev;
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev);
+ struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+ u64 value;
+ int ret;
+
+ mutex_lock(&hisi_acc_vdev->open_mutex);
+ ret = hisi_acc_vf_debug_check(seq, vdev);
+ if (ret) {
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
+ return ret;
+ }
+
+ value = readl(vf_qm->io_base + QM_MB_CMD_SEND_BASE);
+ if (value == QM_MB_CMD_NOT_READY) {
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
+ seq_puts(seq, "mailbox cmd channel not ready!\n");
+ return -EINVAL;
+ }
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
+ seq_puts(seq, "mailbox cmd channel ready!\n");
+
+ return 0;
+}
+
+static int hisi_acc_vf_dev_read(struct seq_file *seq, void *data)
+{
+ struct device *vf_dev = seq->private;
+ struct vfio_pci_core_device *core_device = dev_get_drvdata(vf_dev);
+ struct vfio_device *vdev = &core_device->vdev;
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev);
+ size_t vf_data_sz = offsetofend(struct acc_vf_data, padding);
+ struct acc_vf_data *vf_data;
+ int ret;
+
+ mutex_lock(&hisi_acc_vdev->open_mutex);
+ ret = hisi_acc_vf_debug_check(seq, vdev);
+ if (ret) {
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
+ return ret;
+ }
+
+ mutex_lock(&hisi_acc_vdev->state_mutex);
+ vf_data = kzalloc(sizeof(*vf_data), GFP_KERNEL);
+ if (!vf_data) {
+ ret = -ENOMEM;
+ goto mutex_release;
+ }
+
+ vf_data->vf_qm_state = hisi_acc_vdev->vf_qm_state;
+ ret = vf_qm_read_data(&hisi_acc_vdev->vf_qm, vf_data);
+ if (ret)
+ goto migf_err;
+
+ seq_hex_dump(seq, "Dev Data:", DUMP_PREFIX_OFFSET, 16, 1,
+ (const void *)vf_data, vf_data_sz, false);
+
+ seq_printf(seq,
+ "guest driver load: %u\n"
+ "data size: %lu\n",
+ hisi_acc_vdev->vf_qm_state,
+ sizeof(struct acc_vf_data));
+
+migf_err:
+ kfree(vf_data);
+mutex_release:
+ mutex_unlock(&hisi_acc_vdev->state_mutex);
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
+
+ return ret;
+}
+
+static int hisi_acc_vf_migf_read(struct seq_file *seq, void *data)
+{
+ struct device *vf_dev = seq->private;
+ struct vfio_pci_core_device *core_device = dev_get_drvdata(vf_dev);
+ struct vfio_device *vdev = &core_device->vdev;
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev);
+ size_t vf_data_sz = offsetofend(struct acc_vf_data, padding);
+ struct hisi_acc_vf_migration_file *debug_migf = hisi_acc_vdev->debug_migf;
+
+ /* Check whether the live migration operation has been performed */
+ if (debug_migf->total_length < QM_MATCH_SIZE) {
+ seq_puts(seq, "device not migrated!\n");
+ return -EAGAIN;
+ }
+
+ seq_hex_dump(seq, "Mig Data:", DUMP_PREFIX_OFFSET, 16, 1,
+ (const void *)&debug_migf->vf_data, vf_data_sz, false);
+ seq_printf(seq, "migrate data length: %lu\n", debug_migf->total_length);
+
+ return 0;
+}
+
static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev)
{
- struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev,
- struct hisi_acc_vf_core_device, core_device.vdev);
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(core_vdev);
struct vfio_pci_core_device *vdev = &hisi_acc_vdev->core_device;
int ret;
@@ -1290,12 +1497,16 @@ static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev)
return ret;
if (core_vdev->mig_ops) {
+ mutex_lock(&hisi_acc_vdev->open_mutex);
ret = hisi_acc_vf_qm_init(hisi_acc_vdev);
if (ret) {
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
vfio_pci_core_disable(vdev);
return ret;
}
hisi_acc_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+ hisi_acc_vdev->dev_opened = true;
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
}
vfio_pci_core_finish_enable(vdev);
@@ -1304,11 +1515,14 @@ static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev)
static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev)
{
- struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev,
- struct hisi_acc_vf_core_device, core_device.vdev);
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(core_vdev);
struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+ hisi_acc_vf_disable_fds(hisi_acc_vdev);
+ mutex_lock(&hisi_acc_vdev->open_mutex);
+ hisi_acc_vdev->dev_opened = false;
iounmap(vf_qm->io_base);
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
vfio_pci_core_close_device(core_vdev);
}
@@ -1320,15 +1534,16 @@ static const struct vfio_migration_ops hisi_acc_vfio_pci_migrn_state_ops = {
static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev)
{
- struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev,
- struct hisi_acc_vf_core_device, core_device.vdev);
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(core_vdev);
struct pci_dev *pdev = to_pci_dev(core_vdev->dev);
struct hisi_qm *pf_qm = hisi_acc_get_pf_qm(pdev);
hisi_acc_vdev->vf_id = pci_iov_vf_id(pdev) + 1;
hisi_acc_vdev->pf_qm = pf_qm;
hisi_acc_vdev->vf_dev = pdev;
+ hisi_acc_vdev->vf_qm_state = QM_NOT_READY;
mutex_init(&hisi_acc_vdev->state_mutex);
+ mutex_init(&hisi_acc_vdev->open_mutex);
core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY;
core_vdev->mig_ops = &hisi_acc_vfio_pci_migrn_state_ops;
@@ -1374,6 +1589,47 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
.detach_ioas = vfio_iommufd_physical_detach_ioas,
};
+static void hisi_acc_vfio_debug_init(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+ struct vfio_device *vdev = &hisi_acc_vdev->core_device.vdev;
+ struct hisi_acc_vf_migration_file *migf;
+ struct dentry *vfio_dev_migration;
+ struct dentry *vfio_hisi_acc;
+ struct device *dev = vdev->dev;
+
+ if (!debugfs_initialized() ||
+ !IS_ENABLED(CONFIG_VFIO_DEBUGFS))
+ return;
+
+ if (vdev->ops != &hisi_acc_vfio_pci_migrn_ops)
+ return;
+
+ vfio_dev_migration = debugfs_lookup("migration", vdev->debug_root);
+ if (!vfio_dev_migration) {
+ dev_err(dev, "failed to lookup migration debugfs file!\n");
+ return;
+ }
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL);
+ if (!migf)
+ return;
+ hisi_acc_vdev->debug_migf = migf;
+
+ vfio_hisi_acc = debugfs_create_dir("hisi_acc", vfio_dev_migration);
+ debugfs_create_devm_seqfile(dev, "dev_data", vfio_hisi_acc,
+ hisi_acc_vf_dev_read);
+ debugfs_create_devm_seqfile(dev, "migf_data", vfio_hisi_acc,
+ hisi_acc_vf_migf_read);
+ debugfs_create_devm_seqfile(dev, "cmd_state", vfio_hisi_acc,
+ hisi_acc_vf_debug_cmd);
+}
+
+static void hisi_acc_vf_debugfs_exit(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+ kfree(hisi_acc_vdev->debug_migf);
+ hisi_acc_vdev->debug_migf = NULL;
+}
+
static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct hisi_acc_vf_core_device *hisi_acc_vdev;
@@ -1400,6 +1656,8 @@ static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device
ret = vfio_pci_core_register_device(&hisi_acc_vdev->core_device);
if (ret)
goto out_put_vdev;
+
+ hisi_acc_vfio_debug_init(hisi_acc_vdev);
return 0;
out_put_vdev:
@@ -1412,6 +1670,7 @@ static void hisi_acc_vfio_pci_remove(struct pci_dev *pdev)
struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_drvdata(pdev);
vfio_pci_core_unregister_device(&hisi_acc_vdev->core_device);
+ hisi_acc_vf_debugfs_exit(hisi_acc_vdev);
vfio_put_device(&hisi_acc_vdev->core_device.vdev);
}
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
index 5bab46602fad..91002ceeebc1 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
@@ -32,12 +32,16 @@
#define QM_SQC_VFT_BASE_MASK_V2 GENMASK(15, 0)
#define QM_SQC_VFT_NUM_SHIFT_V2 45
#define QM_SQC_VFT_NUM_MASK_V2 GENMASK(9, 0)
+#define QM_MB_CMD_NOT_READY 0xffffffff
/* RW regs */
#define QM_REGS_MAX_LEN 7
#define QM_REG_ADDR_OFFSET 0x0004
#define QM_XQC_ADDR_OFFSET 32U
+#define QM_XQC_ADDR_LOW 0x1
+#define QM_XQC_ADDR_HIGH 0x2
+
#define QM_VF_AEQ_INT_MASK 0x0004
#define QM_VF_EQ_INT_MASK 0x000c
#define QM_IFC_INT_SOURCE_V 0x0020
@@ -49,10 +53,15 @@
#define QM_EQC_DW0 0X8000
#define QM_AEQC_DW0 0X8020
+#define ACC_DRV_MAJOR_VER 1
+#define ACC_DRV_MINOR_VER 0
+
+#define ACC_DEV_MAGIC_V1 0XCDCDCDCDFEEDAACC
+#define ACC_DEV_MAGIC_V2 0xAACCFEEDDECADEDE
+
struct acc_vf_data {
#define QM_MATCH_SIZE offsetofend(struct acc_vf_data, qm_rsv_state)
/* QM match information */
-#define ACC_DEV_MAGIC 0XCDCDCDCDFEEDAACC
u64 acc_magic;
u32 qp_num;
u32 dev_id;
@@ -60,7 +69,9 @@ struct acc_vf_data {
u32 qp_base;
u32 vf_qm_state;
/* QM reserved match information */
- u32 qm_rsv_state[3];
+ u16 major_ver;
+ u16 minor_ver;
+ u32 qm_rsv_state[2];
/* QM RW regs */
u32 aeq_int_mask;
@@ -99,6 +110,13 @@ struct hisi_acc_vf_migration_file {
struct hisi_acc_vf_core_device {
struct vfio_pci_core_device core_device;
u8 match_done;
+ /*
+ * io_base is only valid when dev_opened is true,
+ * which is protected by open_mutex.
+ */
+ bool dev_opened;
+ /* Ensure the accuracy of dev_opened operation */
+ struct mutex open_mutex;
/* For migration state */
struct mutex state_mutex;
@@ -107,9 +125,20 @@ struct hisi_acc_vf_core_device {
struct pci_dev *vf_dev;
struct hisi_qm *pf_qm;
struct hisi_qm vf_qm;
+ /*
+ * vf_qm_state represents the QM_VF_STATE register value.
+ * It is set by Guest driver for the ACC VF dev indicating
+ * the driver has loaded and configured the dev correctly.
+ */
u32 vf_qm_state;
int vf_id;
struct hisi_acc_vf_migration_file *resuming_migf;
struct hisi_acc_vf_migration_file *saving_migf;
+
+ /*
+ * It holds migration data corresponding to the last migration
+ * and is used by the debugfs interface to report it.
+ */
+ struct hisi_acc_vf_migration_file *debug_migf;
};
#endif /* HISI_ACC_VFIO_PCI_H */
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 41a4b0cf4297..5b919a0b2524 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -313,40 +313,21 @@ err_exec:
return ret;
}
-static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
- struct mlx5_vhca_data_buffer *buf,
- struct mlx5_vhca_recv_buf *recv_buf,
- u32 *mkey)
+static u32 *alloc_mkey_in(u32 npages, u32 pdn)
{
- size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
- recv_buf->npages;
- int err = 0, inlen;
- __be64 *mtt;
+ int inlen;
void *mkc;
u32 *in;
inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
- sizeof(*mtt) * round_up(npages, 2);
+ sizeof(__be64) * round_up(npages, 2);
- in = kvzalloc(inlen, GFP_KERNEL);
+ in = kvzalloc(inlen, GFP_KERNEL_ACCOUNT);
if (!in)
- return -ENOMEM;
+ return NULL;
MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
DIV_ROUND_UP(npages, 2));
- mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
-
- if (buf) {
- struct sg_dma_page_iter dma_iter;
-
- for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
- *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
- } else {
- int i;
-
- for (i = 0; i < npages; i++)
- *mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
- }
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
@@ -360,8 +341,81 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
- err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
- kvfree(in);
+
+ return in;
+}
+
+static int create_mkey(struct mlx5_core_dev *mdev, u32 npages, u32 *mkey_in,
+ u32 *mkey)
+{
+ int inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
+ sizeof(__be64) * round_up(npages, 2);
+
+ return mlx5_core_create_mkey(mdev, mkey, mkey_in, inlen);
+}
+
+static void unregister_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
+ u32 *mkey_in, struct dma_iova_state *state,
+ enum dma_data_direction dir)
+{
+ dma_addr_t addr;
+ __be64 *mtt;
+ int i;
+
+ if (dma_use_iova(state)) {
+ dma_iova_destroy(mdev->device, state, npages * PAGE_SIZE, dir,
+ 0);
+ } else {
+ mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in,
+ klm_pas_mtt);
+ for (i = npages - 1; i >= 0; i--) {
+ addr = be64_to_cpu(mtt[i]);
+ dma_unmap_page(mdev->device, addr, PAGE_SIZE, dir);
+ }
+ }
+}
+
+static int register_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
+ struct page **page_list, u32 *mkey_in,
+ struct dma_iova_state *state,
+ enum dma_data_direction dir)
+{
+ dma_addr_t addr;
+ size_t mapped = 0;
+ __be64 *mtt;
+ int i, err;
+
+ mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);
+
+ if (dma_iova_try_alloc(mdev->device, state, 0, npages * PAGE_SIZE)) {
+ addr = state->addr;
+ for (i = 0; i < npages; i++) {
+ err = dma_iova_link(mdev->device, state,
+ page_to_phys(page_list[i]), mapped,
+ PAGE_SIZE, dir, 0);
+ if (err)
+ goto error;
+ *mtt++ = cpu_to_be64(addr);
+ addr += PAGE_SIZE;
+ mapped += PAGE_SIZE;
+ }
+ err = dma_iova_sync(mdev->device, state, 0, mapped);
+ if (err)
+ goto error;
+ } else {
+ for (i = 0; i < npages; i++) {
+ addr = dma_map_page(mdev->device, page_list[i], 0,
+ PAGE_SIZE, dir);
+ err = dma_mapping_error(mdev->device, addr);
+ if (err)
+ goto error;
+ *mtt++ = cpu_to_be64(addr);
+ }
+ }
+ return 0;
+
+error:
+ unregister_dma_pages(mdev, i, mkey_in, state, dir);
return err;
}
@@ -375,93 +429,97 @@ static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
if (mvdev->mdev_detach)
return -ENOTCONN;
- if (buf->dmaed || !buf->allocated_length)
+ if (buf->mkey_in || !buf->npages)
return -EINVAL;
- ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
- if (ret)
- return ret;
+ buf->mkey_in = alloc_mkey_in(buf->npages, buf->migf->pdn);
+ if (!buf->mkey_in)
+ return -ENOMEM;
- ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
+ ret = register_dma_pages(mdev, buf->npages, buf->page_list,
+ buf->mkey_in, &buf->state, buf->dma_dir);
if (ret)
- goto err;
+ goto err_register_dma;
- buf->dmaed = true;
+ ret = create_mkey(mdev, buf->npages, buf->mkey_in, &buf->mkey);
+ if (ret)
+ goto err_create_mkey;
return 0;
-err:
- dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
+
+err_create_mkey:
+ unregister_dma_pages(mdev, buf->npages, buf->mkey_in, &buf->state,
+ buf->dma_dir);
+err_register_dma:
+ kvfree(buf->mkey_in);
+ buf->mkey_in = NULL;
return ret;
}
+static void free_page_list(u32 npages, struct page **page_list)
+{
+ int i;
+
+ /* Undo alloc_pages_bulk() */
+ for (i = npages - 1; i >= 0; i--)
+ __free_page(page_list[i]);
+
+ kvfree(page_list);
+}
+
void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
- struct mlx5_vf_migration_file *migf = buf->migf;
- struct sg_page_iter sg_iter;
+ struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
+ struct mlx5_core_dev *mdev = mvdev->mdev;
- lockdep_assert_held(&migf->mvdev->state_mutex);
- WARN_ON(migf->mvdev->mdev_detach);
+ lockdep_assert_held(&mvdev->state_mutex);
+ WARN_ON(mvdev->mdev_detach);
- if (buf->dmaed) {
- mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
- dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
- buf->dma_dir, 0);
+ if (buf->mkey_in) {
+ mlx5_core_destroy_mkey(mdev, buf->mkey);
+ unregister_dma_pages(mdev, buf->npages, buf->mkey_in,
+ &buf->state, buf->dma_dir);
+ kvfree(buf->mkey_in);
}
- /* Undo alloc_pages_bulk_array() */
- for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
- __free_page(sg_page_iter_page(&sg_iter));
- sg_free_append_table(&buf->table);
+ free_page_list(buf->npages, buf->page_list);
kfree(buf);
}
-static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
- unsigned int npages)
+static int mlx5vf_add_pages(struct page ***page_list, unsigned int npages)
{
- unsigned int to_alloc = npages;
- struct page **page_list;
- unsigned long filled;
- unsigned int to_fill;
- int ret;
+ unsigned int filled, done = 0;
+ int i;
- to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
- page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
- if (!page_list)
+ *page_list =
+ kvcalloc(npages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
+ if (!*page_list)
return -ENOMEM;
- do {
- filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
- page_list);
- if (!filled) {
- ret = -ENOMEM;
+ for (;;) {
+ filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, npages - done,
+ *page_list + done);
+ if (!filled)
goto err;
- }
- to_alloc -= filled;
- ret = sg_alloc_append_table_from_pages(
- &buf->table, page_list, filled, 0,
- filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
- GFP_KERNEL_ACCOUNT);
- if (ret)
- goto err;
- buf->allocated_length += filled * PAGE_SIZE;
- /* clean input for another bulk allocation */
- memset(page_list, 0, filled * sizeof(*page_list));
- to_fill = min_t(unsigned int, to_alloc,
- PAGE_SIZE / sizeof(*page_list));
- } while (to_alloc > 0);
+ done += filled;
+ if (done == npages)
+ break;
+ }
- kvfree(page_list);
return 0;
err:
- kvfree(page_list);
- return ret;
+ for (i = 0; i < done; i++)
+ __free_page(*page_list[i]);
+
+ kvfree(*page_list);
+ *page_list = NULL;
+ return -ENOMEM;
}
struct mlx5_vhca_data_buffer *
-mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
- size_t length,
+mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
enum dma_data_direction dma_dir)
{
struct mlx5_vhca_data_buffer *buf;
@@ -473,12 +531,13 @@ mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
buf->dma_dir = dma_dir;
buf->migf = migf;
- if (length) {
- ret = mlx5vf_add_migration_pages(buf,
- DIV_ROUND_UP_ULL(length, PAGE_SIZE));
+ if (npages) {
+ ret = mlx5vf_add_pages(&buf->page_list, npages);
if (ret)
goto end;
+ buf->npages = npages;
+
if (dma_dir != DMA_NONE) {
ret = mlx5vf_dma_data_buffer(buf);
if (ret)
@@ -501,8 +560,8 @@ void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
}
struct mlx5_vhca_data_buffer *
-mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
- size_t length, enum dma_data_direction dma_dir)
+mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
+ enum dma_data_direction dma_dir)
{
struct mlx5_vhca_data_buffer *buf, *temp_buf;
struct list_head free_list;
@@ -517,7 +576,7 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
if (buf->dma_dir == dma_dir) {
list_del_init(&buf->buf_elm);
- if (buf->allocated_length >= length) {
+ if (buf->npages >= npages) {
spin_unlock_irq(&migf->list_lock);
goto found;
}
@@ -531,7 +590,7 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
}
}
spin_unlock_irq(&migf->list_lock);
- buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);
+ buf = mlx5vf_alloc_data_buffer(migf, npages, dma_dir);
found:
while ((temp_buf = list_first_entry_or_null(&free_list,
@@ -712,7 +771,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
MLX5_SET(save_vhca_state_in, in, op_mod, 0);
MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
- MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
+ MLX5_SET(save_vhca_state_in, in, size, buf->npages * PAGE_SIZE);
MLX5_SET(save_vhca_state_in, in, incremental, inc);
MLX5_SET(save_vhca_state_in, in, set_track, track);
@@ -734,8 +793,11 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
}
if (!header_buf) {
- header_buf = mlx5vf_get_data_buffer(migf,
- sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ header_buf = mlx5vf_get_data_buffer(
+ migf,
+ DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header),
+ PAGE_SIZE),
+ DMA_NONE);
if (IS_ERR(header_buf)) {
err = PTR_ERR(header_buf);
goto err_free;
@@ -779,7 +841,7 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
if (mvdev->mdev_detach)
return -ENOTCONN;
- if (!buf->dmaed) {
+ if (!buf->mkey_in) {
err = mlx5vf_dma_data_buffer(buf);
if (err)
return err;
@@ -1334,103 +1396,16 @@ static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
kfree(qp);
}
-static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
-{
- int i;
-
- /* Undo alloc_pages_bulk_array() */
- for (i = 0; i < recv_buf->npages; i++)
- __free_page(recv_buf->page_list[i]);
-
- kvfree(recv_buf->page_list);
-}
-
-static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
- unsigned int npages)
-{
- unsigned int filled = 0, done = 0;
- int i;
-
- recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
- GFP_KERNEL_ACCOUNT);
- if (!recv_buf->page_list)
- return -ENOMEM;
-
- for (;;) {
- filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT,
- npages - done,
- recv_buf->page_list + done);
- if (!filled)
- goto err;
-
- done += filled;
- if (done == npages)
- break;
- }
-
- recv_buf->npages = npages;
- return 0;
-
-err:
- for (i = 0; i < npages; i++) {
- if (recv_buf->page_list[i])
- __free_page(recv_buf->page_list[i]);
- }
-
- kvfree(recv_buf->page_list);
- return -ENOMEM;
-}
-
-static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
- struct mlx5_vhca_recv_buf *recv_buf)
-{
- int i, j;
-
- recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
- sizeof(*recv_buf->dma_addrs),
- GFP_KERNEL_ACCOUNT);
- if (!recv_buf->dma_addrs)
- return -ENOMEM;
-
- for (i = 0; i < recv_buf->npages; i++) {
- recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
- recv_buf->page_list[i],
- 0, PAGE_SIZE,
- DMA_FROM_DEVICE);
- if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
- goto error;
- }
- return 0;
-
-error:
- for (j = 0; j < i; j++)
- dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
- PAGE_SIZE, DMA_FROM_DEVICE);
-
- kvfree(recv_buf->dma_addrs);
- return -ENOMEM;
-}
-
-static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
- struct mlx5_vhca_recv_buf *recv_buf)
-{
- int i;
-
- for (i = 0; i < recv_buf->npages; i++)
- dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
- PAGE_SIZE, DMA_FROM_DEVICE);
-
- kvfree(recv_buf->dma_addrs);
-}
-
static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
struct mlx5_vhca_qp *qp)
{
struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
- unregister_dma_recv_pages(mdev, recv_buf);
- free_recv_pages(&qp->recv_buf);
+ unregister_dma_pages(mdev, recv_buf->npages, recv_buf->mkey_in,
+ &recv_buf->state, DMA_FROM_DEVICE);
+ kvfree(recv_buf->mkey_in);
+ free_page_list(recv_buf->npages, recv_buf->page_list);
}
static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
@@ -1441,24 +1416,38 @@ static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
int err;
- err = alloc_recv_pages(recv_buf, npages);
- if (err < 0)
+ err = mlx5vf_add_pages(&recv_buf->page_list, npages);
+ if (err)
return err;
- err = register_dma_recv_pages(mdev, recv_buf);
- if (err)
+ recv_buf->npages = npages;
+
+ recv_buf->mkey_in = alloc_mkey_in(npages, pdn);
+ if (!recv_buf->mkey_in) {
+ err = -ENOMEM;
goto end;
+ }
+
+ err = register_dma_pages(mdev, npages, recv_buf->page_list,
+ recv_buf->mkey_in, &recv_buf->state,
+ DMA_FROM_DEVICE);
+ if (err)
+ goto err_register_dma;
- err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
+ err = create_mkey(mdev, npages, recv_buf->mkey_in, &recv_buf->mkey);
if (err)
goto err_create_mkey;
return 0;
err_create_mkey:
- unregister_dma_recv_pages(mdev, recv_buf);
+ unregister_dma_pages(mdev, npages, recv_buf->mkey_in, &recv_buf->state,
+ DMA_FROM_DEVICE);
+err_register_dma:
+ kvfree(recv_buf->mkey_in);
+ recv_buf->mkey_in = NULL;
end:
- free_recv_pages(recv_buf);
+ free_page_list(npages, recv_buf->page_list);
return err;
}
@@ -1513,7 +1502,8 @@ int mlx5vf_start_page_tracker(struct vfio_device *vdev,
struct mlx5_vhca_qp *host_qp;
struct mlx5_vhca_qp *fw_qp;
struct mlx5_core_dev *mdev;
- u32 max_msg_size = PAGE_SIZE;
+ u32 log_max_msg_size;
+ u32 max_msg_size;
u64 rq_size = SZ_2M;
u32 max_recv_wr;
int err;
@@ -1530,6 +1520,12 @@ int mlx5vf_start_page_tracker(struct vfio_device *vdev,
}
mdev = mvdev->mdev;
+ log_max_msg_size = MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_msg_size);
+ max_msg_size = (1ULL << log_max_msg_size);
+ /* The RQ must hold at least 4 WQEs/messages for successful QP creation */
+ if (rq_size < 4 * max_msg_size)
+ rq_size = 4 * max_msg_size;
+
memset(tracker, 0, sizeof(*tracker));
tracker->uar = mlx5_get_uars_page(mdev);
if (IS_ERR(tracker->uar)) {
@@ -1619,25 +1615,41 @@ set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
{
u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
u32 nent = size / entry_size;
+ u32 nent_in_page;
+ u32 nent_to_set;
struct page *page;
+ u32 page_offset;
+ u32 page_index;
+ u32 buf_offset;
+ void *kaddr;
u64 addr;
u64 *buf;
int i;
- if (WARN_ON(index >= qp->recv_buf.npages ||
+ buf_offset = index * qp->max_msg_size;
+ if (WARN_ON(buf_offset + size >= qp->recv_buf.npages * PAGE_SIZE ||
(nent > qp->max_msg_size / entry_size)))
return;
- page = qp->recv_buf.page_list[index];
- buf = kmap_local_page(page);
- for (i = 0; i < nent; i++) {
- addr = MLX5_GET(page_track_report_entry, buf + i,
- dirty_address_low);
- addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
- dirty_address_high) << 32;
- iova_bitmap_set(dirty, addr, qp->tracked_page_size);
- }
- kunmap_local(buf);
+ do {
+ page_index = buf_offset / PAGE_SIZE;
+ page_offset = buf_offset % PAGE_SIZE;
+ nent_in_page = (PAGE_SIZE - page_offset) / entry_size;
+ page = qp->recv_buf.page_list[page_index];
+ kaddr = kmap_local_page(page);
+ buf = kaddr + page_offset;
+ nent_to_set = min(nent, nent_in_page);
+ for (i = 0; i < nent_to_set; i++) {
+ addr = MLX5_GET(page_track_report_entry, buf + i,
+ dirty_address_low);
+ addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
+ dirty_address_high) << 32;
+ iova_bitmap_set(dirty, addr, qp->tracked_page_size);
+ }
+ kunmap_local(kaddr);
+ buf_offset += (nent_to_set * entry_size);
+ nent -= nent_to_set;
+ } while (nent);
}
static void
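The reworked set_report_output() above walks a dirty-page report message across page boundaries instead of assuming it fits in one receive page. Below is a minimal standalone sketch of that chunking arithmetic; PAGE_SIZE and ENTRY_SIZE are stand-in constants for the example, not the driver's definitions.

#include <stdio.h>

#define PAGE_SIZE  4096u
#define ENTRY_SIZE 16u   /* stand-in for MLX5_ST_SZ_BYTES(page_track_report_entry) */

/* Walk 'nent' fixed-size entries starting at byte offset 'buf_offset' of a
 * buffer split into PAGE_SIZE pages, consuming them page by page. */
static void walk_entries(unsigned int buf_offset, unsigned int nent)
{
	while (nent) {
		unsigned int page_index   = buf_offset / PAGE_SIZE;
		unsigned int page_offset  = buf_offset % PAGE_SIZE;
		unsigned int nent_in_page = (PAGE_SIZE - page_offset) / ENTRY_SIZE;
		unsigned int nent_to_set  = nent < nent_in_page ? nent : nent_in_page;

		printf("page %u, offset %u: %u entries\n",
		       page_index, page_offset, nent_to_set);

		buf_offset += nent_to_set * ENTRY_SIZE;
		nent -= nent_to_set;
	}
}

int main(void)
{
	/* A message starting one entry before a page boundary spans two pages,
	 * mirroring the do/while loop in set_report_output(). */
	walk_entries(PAGE_SIZE - ENTRY_SIZE, 600);
	return 0;
}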
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index df421dc6de04..d7821b5ca772 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -53,20 +53,17 @@ struct mlx5_vf_migration_header {
};
struct mlx5_vhca_data_buffer {
- struct sg_append_table table;
+ struct page **page_list;
+ struct dma_iova_state state;
loff_t start_pos;
u64 length;
- u64 allocated_length;
+ u32 npages;
u32 mkey;
+ u32 *mkey_in;
enum dma_data_direction dma_dir;
- u8 dmaed:1;
u8 stop_copy_chunk_num;
struct list_head buf_elm;
struct mlx5_vf_migration_file *migf;
- /* Optimize mlx5vf_get_migration_page() for sequential access */
- struct scatterlist *last_offset_sg;
- unsigned int sg_last_entry;
- unsigned long last_offset;
};
struct mlx5vf_async_data {
@@ -133,8 +130,9 @@ struct mlx5_vhca_cq {
struct mlx5_vhca_recv_buf {
u32 npages;
struct page **page_list;
- dma_addr_t *dma_addrs;
+ struct dma_iova_state state;
u32 next_rq_offset;
+ u32 *mkey_in;
u32 mkey;
};
@@ -217,15 +215,24 @@ int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf);
void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf);
void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf);
struct mlx5_vhca_data_buffer *
-mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
- size_t length, enum dma_data_direction dma_dir);
+mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
+ enum dma_data_direction dma_dir);
void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf);
struct mlx5_vhca_data_buffer *
-mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
- size_t length, enum dma_data_direction dma_dir);
+mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
+ enum dma_data_direction dma_dir);
void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf);
-struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
- unsigned long offset);
+static inline struct page *
+mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
+ unsigned long offset)
+{
+ int page_entry = offset / PAGE_SIZE;
+
+ if (page_entry >= buf->npages)
+ return NULL;
+
+ return buf->page_list[page_entry];
+}
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev);
void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
enum mlx5_vf_migf_state *last_save_state);
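With mlx5vf_get_data_buffer()/mlx5vf_alloc_data_buffer() now taking a page count, callers convert byte lengths with DIV_ROUND_UP(size, PAGE_SIZE), and mlx5vf_get_migration_page() becomes a direct page_list lookup. A small standalone illustration of both conversions; the macro and constant are reproduced here only for the example.

#include <assert.h>

#define PAGE_SIZE 4096ul
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* Byte length -> page count, as done by the callers in main.c. */
	assert(DIV_ROUND_UP(1ul, PAGE_SIZE) == 1);
	assert(DIV_ROUND_UP(PAGE_SIZE, PAGE_SIZE) == 1);
	assert(DIV_ROUND_UP(PAGE_SIZE + 1, PAGE_SIZE) == 2);

	/* Byte offset -> page_list index, as done by the new inline
	 * mlx5vf_get_migration_page(); offsets beyond npages return NULL there. */
	unsigned long offset = 3 * PAGE_SIZE + 100;
	assert(offset / PAGE_SIZE == 3);
	return 0;
}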
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 61d9b0f9146d..93f894fe60d2 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -34,37 +34,6 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
core_device);
}
-struct page *
-mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
- unsigned long offset)
-{
- unsigned long cur_offset = 0;
- struct scatterlist *sg;
- unsigned int i;
-
- /* All accesses are sequential */
- if (offset < buf->last_offset || !buf->last_offset_sg) {
- buf->last_offset = 0;
- buf->last_offset_sg = buf->table.sgt.sgl;
- buf->sg_last_entry = 0;
- }
-
- cur_offset = buf->last_offset;
-
- for_each_sg(buf->last_offset_sg, sg,
- buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
- if (offset < sg->length + cur_offset) {
- buf->last_offset_sg = sg;
- buf->sg_last_entry += i;
- buf->last_offset = cur_offset;
- return nth_page(sg_page(sg),
- (offset - cur_offset) / PAGE_SIZE);
- }
- cur_offset += sg->length;
- }
- return NULL;
-}
-
static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
mutex_lock(&migf->lock);
@@ -308,6 +277,7 @@ static struct mlx5_vhca_data_buffer *
mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
u8 index, size_t required_length)
{
+ u32 npages = DIV_ROUND_UP(required_length, PAGE_SIZE);
struct mlx5_vhca_data_buffer *buf = migf->buf[index];
u8 chunk_num;
@@ -315,12 +285,11 @@ mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
chunk_num = buf->stop_copy_chunk_num;
buf->migf->buf[index] = NULL;
/* Checking whether the pre-allocated buffer can fit */
- if (buf->allocated_length >= required_length)
+ if (buf->npages >= npages)
return buf;
mlx5vf_put_data_buffer(buf);
- buf = mlx5vf_get_data_buffer(buf->migf, required_length,
- DMA_FROM_DEVICE);
+ buf = mlx5vf_get_data_buffer(buf->migf, npages, DMA_FROM_DEVICE);
if (IS_ERR(buf))
return buf;
@@ -373,7 +342,8 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
u8 *to_buff;
int ret;
- header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
+ header_buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(size, PAGE_SIZE),
+ DMA_NONE);
if (IS_ERR(header_buf))
return PTR_ERR(header_buf);
@@ -388,7 +358,7 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
to_buff = kmap_local_page(page);
memcpy(to_buff, &header, sizeof(header));
header_buf->length = sizeof(header);
- data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
+ data.stop_copy_size = cpu_to_le64(migf->buf[0]->npages * PAGE_SIZE);
memcpy(to_buff + sizeof(header), &data, sizeof(data));
header_buf->length += sizeof(data);
kunmap_local(to_buff);
@@ -437,15 +407,20 @@ static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
for (i = 0; i < num_chunks; i++) {
- buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
+ buf = mlx5vf_get_data_buffer(
+ migf, DIV_ROUND_UP(inc_state_size, PAGE_SIZE),
+ DMA_FROM_DEVICE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto err;
}
migf->buf[i] = buf;
- buf = mlx5vf_get_data_buffer(migf,
- sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ buf = mlx5vf_get_data_buffer(
+ migf,
+ DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header),
+ PAGE_SIZE),
+ DMA_NONE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto err;
@@ -553,7 +528,8 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
* We finished transferring the current state and the device has a
* dirty state, so save a new state to be ready for the next transfer.
*/

- buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
+ buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(inc_length, PAGE_SIZE),
+ DMA_FROM_DEVICE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
mlx5vf_mark_err(migf);
@@ -587,7 +563,6 @@ static const struct file_operations mlx5vf_save_fops = {
.unlocked_ioctl = mlx5vf_precopy_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.release = mlx5vf_release_file,
- .llseek = no_llseek,
};
static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
@@ -641,14 +616,11 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
O_RDONLY);
if (IS_ERR(migf->filp)) {
ret = PTR_ERR(migf->filp);
- goto end;
+ kfree(migf);
+ return ERR_PTR(ret);
}
migf->mvdev = mvdev;
- ret = mlx5vf_cmd_alloc_pd(migf);
- if (ret)
- goto out_free;
-
stream_open(migf->filp->f_inode, migf->filp);
mutex_init(&migf->lock);
init_waitqueue_head(&migf->poll_wait);
@@ -664,6 +636,11 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
INIT_LIST_HEAD(&migf->buf_list);
INIT_LIST_HEAD(&migf->avail_list);
spin_lock_init(&migf->list_lock);
+
+ ret = mlx5vf_cmd_alloc_pd(migf);
+ if (ret)
+ goto out;
+
ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
if (ret)
goto out_pd;
@@ -674,8 +651,8 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
if (track) {
/* leave the allocated buffer ready for the stop-copy phase */
- buf = mlx5vf_alloc_data_buffer(migf,
- migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
+ buf = mlx5vf_alloc_data_buffer(migf, migf->buf[0]->npages,
+ DMA_FROM_DEVICE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out_pd;
@@ -693,10 +670,8 @@ out_save:
mlx5vf_free_data_buffer(buf);
out_pd:
mlx5fv_cmd_clean_migf_resources(migf);
-out_free:
+out:
fput(migf->filp);
-end:
- kfree(migf);
return ERR_PTR(ret);
}
@@ -918,11 +893,14 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
goto out_unlock;
break;
case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
- if (vhca_buf_header->allocated_length < migf->record_size) {
+ {
+ u32 npages = DIV_ROUND_UP(migf->record_size, PAGE_SIZE);
+
+ if (vhca_buf_header->npages < npages) {
mlx5vf_free_data_buffer(vhca_buf_header);
- migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
- migf->record_size, DMA_NONE);
+ migf->buf_header[0] = mlx5vf_alloc_data_buffer(
+ migf, npages, DMA_NONE);
if (IS_ERR(migf->buf_header[0])) {
ret = PTR_ERR(migf->buf_header[0]);
migf->buf_header[0] = NULL;
@@ -935,6 +913,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
vhca_buf_header->start_pos = migf->max_pos;
migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
break;
+ }
case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
&buf, &len, pos, &done);
@@ -945,12 +924,13 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
{
u64 size = max(migf->record_size,
migf->stop_copy_prep_size);
+ u32 npages = DIV_ROUND_UP(size, PAGE_SIZE);
- if (vhca_buf->allocated_length < size) {
+ if (vhca_buf->npages < npages) {
mlx5vf_free_data_buffer(vhca_buf);
- migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
- size, DMA_TO_DEVICE);
+ migf->buf[0] = mlx5vf_alloc_data_buffer(
+ migf, npages, DMA_TO_DEVICE);
if (IS_ERR(migf->buf[0])) {
ret = PTR_ERR(migf->buf[0]);
migf->buf[0] = NULL;
@@ -1000,7 +980,6 @@ static const struct file_operations mlx5vf_resume_fops = {
.owner = THIS_MODULE,
.write = mlx5vf_resume_write,
.release = mlx5vf_release_file,
- .llseek = no_llseek,
};
static struct mlx5_vf_migration_file *
@@ -1018,13 +997,19 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
O_WRONLY);
if (IS_ERR(migf->filp)) {
ret = PTR_ERR(migf->filp);
- goto end;
+ kfree(migf);
+ return ERR_PTR(ret);
}
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+ INIT_LIST_HEAD(&migf->buf_list);
+ INIT_LIST_HEAD(&migf->avail_list);
+ spin_lock_init(&migf->list_lock);
migf->mvdev = mvdev;
ret = mlx5vf_cmd_alloc_pd(migf);
if (ret)
- goto out_free;
+ goto out;
buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
if (IS_ERR(buf)) {
@@ -1033,8 +1018,11 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
}
migf->buf[0] = buf;
- buf = mlx5vf_alloc_data_buffer(migf,
- sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ buf = mlx5vf_alloc_data_buffer(
+ migf,
+ DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header),
+ PAGE_SIZE),
+ DMA_NONE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out_buf;
@@ -1043,20 +1031,13 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
migf->buf_header[0] = buf;
migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
- stream_open(migf->filp->f_inode, migf->filp);
- mutex_init(&migf->lock);
- INIT_LIST_HEAD(&migf->buf_list);
- INIT_LIST_HEAD(&migf->avail_list);
- spin_lock_init(&migf->list_lock);
return migf;
out_buf:
mlx5vf_free_data_buffer(migf->buf[0]);
out_pd:
mlx5vf_cmd_dealloc_pd(migf);
-out_free:
+out:
fput(migf->filp);
-end:
- kfree(migf);
return ERR_PTR(ret);
}
@@ -1151,7 +1132,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP);
if (ret)
return ERR_PTR(ret);
- buf = mlx5vf_get_data_buffer(migf, size, DMA_FROM_DEVICE);
+ buf = mlx5vf_get_data_buffer(migf,
+ DIV_ROUND_UP(size, PAGE_SIZE), DMA_FROM_DEVICE);
if (IS_ERR(buf))
return ERR_CAST(buf);
/* pre_copy cleanup */
@@ -1449,7 +1431,7 @@ static struct pci_driver mlx5vf_pci_driver = {
module_pci_driver(mlx5vf_pci_driver);
-MODULE_IMPORT_NS(IOMMUFD);
+MODULE_IMPORT_NS("IOMMUFD");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index a7fd018aa548..e5ac39c4cc6b 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -5,6 +5,8 @@
#include <linux/sizes.h>
#include <linux/vfio_pci_core.h>
+#include <linux/delay.h>
+#include <linux/jiffies.h>
/*
* The device memory usable to the workloads running in the VM is cached
@@ -17,12 +19,21 @@
#define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX
#define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX
-/* Memory size expected as non cached and reserved by the VM driver */
-#define RESMEM_SIZE SZ_1G
-
/* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
#define MEMBLK_SIZE SZ_512M
+#define DVSEC_BITMAP_OFFSET 0xA
+#define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0)
+
+#define GPU_CAP_DVSEC_REGISTER 3
+
+#define C2C_LINK_BAR0_OFFSET 0x1498
+#define HBM_TRAINING_BAR0_OFFSET 0x200BC
+#define STATUS_READY 0xFF
+
+#define POLL_QUANTUM_MS 1000
+#define POLL_TIMEOUT_MS (30 * 1000)
+
/*
* The state of the two device memory region - resmem and usemem - is
* saved as struct mem_region.
@@ -46,6 +57,7 @@ struct nvgrace_gpu_pci_core_device {
struct mem_region resmem;
/* Lock to control device memory kernel mapping */
struct mutex remap_lock;
+ bool has_mig_hw_bug;
};
static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
@@ -66,7 +78,7 @@ nvgrace_gpu_memregion(int index,
if (index == USEMEM_REGION_INDEX)
return &nvdev->usemem;
- if (index == RESMEM_REGION_INDEX)
+ if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX)
return &nvdev->resmem;
return NULL;
@@ -751,40 +763,67 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
u64 memphys, u64 memlength)
{
int ret = 0;
+ u64 resmem_size = 0;
/*
- * The VM GPU device driver needs a non-cacheable region to support
- * the MIG feature. Since the device memory is mapped as NORMAL cached,
- * carve out a region from the end with a different NORMAL_NC
- * property (called as reserved memory and represented as resmem). This
- * region then is exposed as a 64b BAR (region 2 and 3) to the VM, while
- * exposing the rest (termed as usable memory and represented using usemem)
- * as cacheable 64b BAR (region 4 and 5).
+ * On Grace Hopper systems, the VM GPU device driver needs a non-cacheable
+ * region to support the MIG feature owing to a hardware bug. Since the
+ * device memory is mapped as NORMAL cached, carve out a region from the end
+ * with a different NORMAL_NC property (called reserved memory and
+ * represented as resmem). This region is then exposed as a 64b BAR
+ * (region 2 and 3) to the VM, while exposing the rest (termed usable
+ * memory and represented using usemem) as a cacheable 64b BAR (region 4 and 5).
*
* devmem (memlength)
* |-------------------------------------------------|
* | |
* usemem.memphys resmem.memphys
+ *
+ * This hardware bug is fixed on the Grace Blackwell platforms and the
+ * presence of the bug can be determined through nvdev->has_mig_hw_bug.
+ * Thus on systems with the hardware fix, there is no need to partition
+ * the GPU device memory and the entire memory is usable and mapped as
+ * NORMAL cached (i.e. resmem size is 0).
*/
+ if (nvdev->has_mig_hw_bug)
+ resmem_size = SZ_1G;
+
nvdev->usemem.memphys = memphys;
/*
* The device memory exposed to the VM is added to the kernel by the
- * VM driver module in chunks of memory block size. Only the usable
- * memory (usemem) is added to the kernel for usage by the VM
- * workloads. Make the usable memory size memblock aligned.
+ * VM driver module in chunks of memory block size. Note that only the
+ * usable memory (usemem) is added to the kernel for usage by the VM
+ * workloads.
*/
- if (check_sub_overflow(memlength, RESMEM_SIZE,
+ if (check_sub_overflow(memlength, resmem_size,
&nvdev->usemem.memlength)) {
ret = -EOVERFLOW;
goto done;
}
/*
- * The USEMEM part of the device memory has to be MEMBLK_SIZE
- * aligned. This is a hardwired ABI value between the GPU FW and
- * VFIO driver. The VM device driver is also aware of it and make
- * use of the value for its calculation to determine USEMEM size.
+ * The usemem region is exposed as a 64b BAR composed of region 4 and 5.
+ * Calculate and save the BAR size for the region.
+ */
+ nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
+
+ /*
+ * If the hardware has the fix for MIG, there is no requirement
+ * for splitting the device memory to create RESMEM. The entire
+ * device memory is usable and will be USEMEM. Return here in
+ * that case.
+ */
+ if (!nvdev->has_mig_hw_bug)
+ goto done;
+
+ /*
+ * When the device memory is split to work around the MIG bug on
+ * Grace Hopper, the USEMEM part of the device memory has to be
+ * MEMBLK_SIZE aligned. This is a hardwired ABI value between the
+ * GPU FW and VFIO driver. The VM device driver is also aware of it
+ * and makes use of the value in its calculation to determine the
+ * USEMEM size. Note that the device memory may not be 512M aligned.
*/
nvdev->usemem.memlength = round_down(nvdev->usemem.memlength,
MEMBLK_SIZE);
@@ -803,15 +842,93 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
}
/*
- * The memory regions are exposed as BARs. Calculate and save
- * the BAR size for them.
+ * The resmem region is exposed as a 64b BAR composed of region 2 and 3
+ * for Grace Hopper. Calculate and save the BAR size for the region.
*/
- nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
done:
return ret;
}
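Below is a standalone sketch of the carve-out arithmetic performed by nvgrace_gpu_init_nvdev_struct() above. The 96G memlength and the local roundup helper are illustrative assumptions for the example, not values or helpers taken from the driver.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SZ_512M (512ull << 20)
#define SZ_1G   (1ull << 30)

static uint64_t roundup_pow_of_two(uint64_t x)
{
	uint64_t p = 1;

	while (p < x)
		p <<= 1;
	return p;
}

int main(void)
{
	bool has_mig_hw_bug = true;          /* Grace Hopper case */
	uint64_t memlength = 96ull << 30;    /* assumed device memory size */
	uint64_t resmem_size = has_mig_hw_bug ? SZ_1G : 0;

	/* usemem is what remains after the carve-out, rounded down to the
	 * 512M memory block size; resmem gets the (unaligned) remainder. */
	uint64_t usemem = (memlength - resmem_size) / SZ_512M * SZ_512M;
	uint64_t resmem = memlength - usemem;

	printf("usemem %llu MiB (BAR %llu MiB), resmem %llu MiB (BAR %llu MiB)\n",
	       (unsigned long long)(usemem >> 20),
	       (unsigned long long)(roundup_pow_of_two(usemem) >> 20),
	       (unsigned long long)(resmem >> 20),
	       (unsigned long long)(roundup_pow_of_two(resmem) >> 20));
	return 0;
}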
+static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
+{
+ int pcie_dvsec;
+ u16 dvsec_ctrl16;
+
+ pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA,
+ GPU_CAP_DVSEC_REGISTER);
+
+ if (pcie_dvsec) {
+ pci_read_config_word(pdev,
+ pcie_dvsec + DVSEC_BITMAP_OFFSET,
+ &dvsec_ctrl16);
+
+ if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM)
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * To reduce the system bootup time, the HBM training has
+ * been moved out of the UEFI on the Grace-Blackwell systems.
+ *
+ * The onus of checking whether the HBM training has completed
+ * thus falls on the module. The HBM training status can be
+ * determined from a BAR0 register.
+ *
+ * Similarly, another BAR0 register exposes the status of the
+ * CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
+ *
+ * Poll these registers for up to 30s. If the HBM training is
+ * not complete or if the C2C link is not ready, fail the probe.
+ *
+ * While the wait is not required on Grace Hopper systems, it
+ * is beneficial to make the check to ensure the device is in an
+ * expected state.
+ *
+ * Ensure that the BAR0 region is enabled before accessing the
+ * registers.
+ */
+static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev)
+{
+ unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
+ void __iomem *io;
+ int ret = -ETIME;
+
+ ret = pci_enable_device(pdev);
+ if (ret)
+ return ret;
+
+ ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME);
+ if (ret)
+ goto request_region_exit;
+
+ io = pci_iomap(pdev, 0, 0);
+ if (!io) {
+ ret = -ENOMEM;
+ goto iomap_exit;
+ }
+
+ do {
+ if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
+ (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) {
+ ret = 0;
+ goto reg_check_exit;
+ }
+ msleep(POLL_QUANTUM_MS);
+ } while (!time_after(jiffies, timeout));
+
+reg_check_exit:
+ pci_iounmap(pdev, io);
+iomap_exit:
+ pci_release_selected_regions(pdev, 1 << 0);
+request_region_exit:
+ pci_disable_device(pdev);
+ return ret;
+}
+
static int nvgrace_gpu_probe(struct pci_dev *pdev,
const struct pci_device_id *id)
{
@@ -820,6 +937,10 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
u64 memphys, memlength;
int ret;
+ ret = nvgrace_gpu_wait_device_ready(pdev);
+ if (ret)
+ return ret;
+
ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
if (!ret)
ops = &nvgrace_gpu_pci_ops;
@@ -832,6 +953,8 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
dev_set_drvdata(&pdev->dev, &nvdev->core_device);
if (ops == &nvgrace_gpu_pci_ops) {
+ nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev);
+
/*
* Device memory properties are identified in the host ACPI
* table. Set the nvgrace_gpu_pci_core_device structure.
@@ -866,6 +989,10 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) },
/* GH200 480GB */
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) },
+ /* GH200 SKU */
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) },
+ /* GB200 SKU */
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) },
{}
};
diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c
index 68e8f006dfdb..c51f5e4c3dd6 100644
--- a/drivers/vfio/pci/pds/dirty.c
+++ b/drivers/vfio/pci/pds/dirty.c
@@ -3,6 +3,7 @@
#include <linux/interval_tree.h>
#include <linux/vfio.h>
+#include <linux/vmalloc.h>
#include <linux/pds/pds_common.h>
#include <linux/pds/pds_core_if.h>
diff --git a/drivers/vfio/pci/pds/lm.c b/drivers/vfio/pci/pds/lm.c
index 6b94cc0bf45b..f2673d395236 100644
--- a/drivers/vfio/pci/pds/lm.c
+++ b/drivers/vfio/pci/pds/lm.c
@@ -235,7 +235,6 @@ static const struct file_operations pds_vfio_save_fops = {
.owner = THIS_MODULE,
.read = pds_vfio_save_read,
.release = pds_vfio_release_file,
- .llseek = no_llseek,
};
static int pds_vfio_get_save_file(struct pds_vfio_pci_device *pds_vfio)
@@ -334,7 +333,6 @@ static const struct file_operations pds_vfio_restore_fops = {
.owner = THIS_MODULE,
.write = pds_vfio_restore_write,
.release = pds_vfio_release_file,
- .llseek = no_llseek,
};
static int pds_vfio_get_restore_file(struct pds_vfio_pci_device *pds_vfio)
diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c
index 16e93b11ab1b..4923f1823126 100644
--- a/drivers/vfio/pci/pds/pci_drv.c
+++ b/drivers/vfio/pci/pds/pci_drv.c
@@ -187,7 +187,7 @@ static struct pci_driver pds_vfio_pci_driver = {
module_pci_driver(pds_vfio_pci_driver);
-MODULE_IMPORT_NS(IOMMUFD);
+MODULE_IMPORT_NS("IOMMUFD");
MODULE_DESCRIPTION(PDS_VFIO_DRV_DESCRIPTION);
MODULE_AUTHOR("Brett Creeley <brett.creeley@amd.com>");
MODULE_LICENSE("GPL");
diff --git a/drivers/vfio/pci/qat/Kconfig b/drivers/vfio/pci/qat/Kconfig
new file mode 100644
index 000000000000..bf52cfa4b595
--- /dev/null
+++ b/drivers/vfio/pci/qat/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config QAT_VFIO_PCI
+ tristate "VFIO support for QAT VF PCI devices"
+ select VFIO_PCI_CORE
+ depends on CRYPTO_DEV_QAT_4XXX
+ help
+ This provides migration support for Intel(R) QAT Virtual Functions
+ using the VFIO framework.
+
+ To compile this as a module, choose M here: the module
+ will be called qat_vfio_pci. If you don't know what to do here,
+ say N.
diff --git a/drivers/vfio/pci/qat/Makefile b/drivers/vfio/pci/qat/Makefile
new file mode 100644
index 000000000000..5fe5c4ec19d3
--- /dev/null
+++ b/drivers/vfio/pci/qat/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_QAT_VFIO_PCI) += qat_vfio_pci.o
+qat_vfio_pci-y := main.o
diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c
new file mode 100644
index 000000000000..845ed15b6771
--- /dev/null
+++ b/drivers/vfio/pci/qat/main.c
@@ -0,0 +1,700 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2024 Intel Corporation */
+
+#include <linux/anon_inodes.h>
+#include <linux/container_of.h>
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/sizes.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/qat/qat_mig_dev.h>
+
+/*
+ * The migration data of each Intel QAT VF device is encapsulated into a
+ * 4096-byte block. The data consists of two parts.
+ * The first is a pre-configured set of attributes of the VF being migrated,
+ * which are only set when it is created. This part can be migrated during
+ * the pre-copy stage and is used for a device compatibility check.
+ * The second is the VF state. This includes the required MMIO regions and
+ * the shadow states maintained by the QAT PF driver. This part can only be
+ * saved when the VF is fully quiesced and is migrated during the stop-copy
+ * stage.
+ * Both parts of the data are saved in hierarchical structures consisting of
+ * a preamble section and several raw state sections.
+ * When the pre-configured part of the migration data is fully retrieved from
+ * user space, the preamble section is used to validate the correctness of
+ * the data blocks and to check the version compatibility. The raw state
+ * sections are then used to perform a device compatibility check.
+ * When the device transitions from the RESUMING state, the VF states are
+ * extracted from the raw state sections of the VF state part of the
+ * migration data and then loaded into the device.
+ */
+
+struct qat_vf_migration_file {
+ struct file *filp;
+ /* protects migration region context */
+ struct mutex lock;
+ bool disabled;
+ struct qat_vf_core_device *qat_vdev;
+ ssize_t filled_size;
+};
+
+struct qat_vf_core_device {
+ struct vfio_pci_core_device core_device;
+ struct qat_mig_dev *mdev;
+ /* protects migration state */
+ struct mutex state_mutex;
+ enum vfio_device_mig_state mig_state;
+ struct qat_vf_migration_file *resuming_migf;
+ struct qat_vf_migration_file *saving_migf;
+};
+
+static int qat_vf_pci_open_device(struct vfio_device *core_vdev)
+{
+ struct qat_vf_core_device *qat_vdev =
+ container_of(core_vdev, struct qat_vf_core_device,
+ core_device.vdev);
+ struct vfio_pci_core_device *vdev = &qat_vdev->core_device;
+ int ret;
+
+ ret = vfio_pci_core_enable(vdev);
+ if (ret)
+ return ret;
+
+ ret = qat_vfmig_open(qat_vdev->mdev);
+ if (ret) {
+ vfio_pci_core_disable(vdev);
+ return ret;
+ }
+ qat_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+
+ vfio_pci_core_finish_enable(vdev);
+
+ return 0;
+}
+
+static void qat_vf_disable_fd(struct qat_vf_migration_file *migf)
+{
+ mutex_lock(&migf->lock);
+ migf->disabled = true;
+ migf->filp->f_pos = 0;
+ migf->filled_size = 0;
+ mutex_unlock(&migf->lock);
+}
+
+static void qat_vf_disable_fds(struct qat_vf_core_device *qat_vdev)
+{
+ if (qat_vdev->resuming_migf) {
+ qat_vf_disable_fd(qat_vdev->resuming_migf);
+ fput(qat_vdev->resuming_migf->filp);
+ qat_vdev->resuming_migf = NULL;
+ }
+
+ if (qat_vdev->saving_migf) {
+ qat_vf_disable_fd(qat_vdev->saving_migf);
+ fput(qat_vdev->saving_migf->filp);
+ qat_vdev->saving_migf = NULL;
+ }
+}
+
+static void qat_vf_pci_close_device(struct vfio_device *core_vdev)
+{
+ struct qat_vf_core_device *qat_vdev = container_of(core_vdev,
+ struct qat_vf_core_device, core_device.vdev);
+
+ qat_vfmig_close(qat_vdev->mdev);
+ qat_vf_disable_fds(qat_vdev);
+ vfio_pci_core_close_device(core_vdev);
+}
+
+static long qat_vf_precopy_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct qat_vf_migration_file *migf = filp->private_data;
+ struct qat_vf_core_device *qat_vdev = migf->qat_vdev;
+ struct qat_mig_dev *mig_dev = qat_vdev->mdev;
+ struct vfio_precopy_info info;
+ loff_t *pos = &filp->f_pos;
+ unsigned long minsz;
+ int ret = 0;
+
+ if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
+ return -ENOTTY;
+
+ minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ mutex_lock(&qat_vdev->state_mutex);
+ if (qat_vdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
+ qat_vdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
+ mutex_unlock(&qat_vdev->state_mutex);
+ return -EINVAL;
+ }
+
+ mutex_lock(&migf->lock);
+ if (migf->disabled) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (*pos > mig_dev->setup_size) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ info.dirty_bytes = 0;
+ info.initial_bytes = mig_dev->setup_size - *pos;
+
+out:
+ mutex_unlock(&migf->lock);
+ mutex_unlock(&qat_vdev->state_mutex);
+ if (ret)
+ return ret;
+ return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
+}
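For context, the userspace counterpart of the pre-copy flow served by this ioctl queries the remaining setup bytes with VFIO_MIG_GET_PRECOPY_INFO (standard VFIO UAPI) and then drains the saving fd. This is a minimal sketch; mig_fd is assumed to be the data_fd returned when the device entered PRE_COPY, and error handling is reduced for brevity.

#include <errno.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/vfio.h>

static int drain_precopy(int mig_fd)
{
	struct vfio_precopy_info info = { .argsz = sizeof(info) };
	char buf[4096];
	ssize_t n;

	if (ioctl(mig_fd, VFIO_MIG_GET_PRECOPY_INFO, &info))
		return -errno;

	printf("initial_bytes=%llu dirty_bytes=%llu\n",
	       (unsigned long long)info.initial_bytes,
	       (unsigned long long)info.dirty_bytes);

	/* The QAT setup blob reports no dirty_bytes; read until EOF and
	 * forward the bytes to the migration destination. */
	while ((n = read(mig_fd, buf, sizeof(buf))) > 0)
		;

	return n < 0 ? -errno : 0;
}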
+
+static ssize_t qat_vf_save_read(struct file *filp, char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct qat_vf_migration_file *migf = filp->private_data;
+ struct qat_mig_dev *mig_dev = migf->qat_vdev->mdev;
+ ssize_t done = 0;
+ loff_t *offs;
+ int ret;
+
+ if (pos)
+ return -ESPIPE;
+ offs = &filp->f_pos;
+
+ mutex_lock(&migf->lock);
+ if (*offs > migf->filled_size || *offs < 0) {
+ done = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (migf->disabled) {
+ done = -ENODEV;
+ goto out_unlock;
+ }
+
+ len = min_t(size_t, migf->filled_size - *offs, len);
+ if (len) {
+ ret = copy_to_user(buf, mig_dev->state + *offs, len);
+ if (ret) {
+ done = -EFAULT;
+ goto out_unlock;
+ }
+ *offs += len;
+ done = len;
+ }
+
+out_unlock:
+ mutex_unlock(&migf->lock);
+ return done;
+}
+
+static int qat_vf_release_file(struct inode *inode, struct file *filp)
+{
+ struct qat_vf_migration_file *migf = filp->private_data;
+
+ qat_vf_disable_fd(migf);
+ mutex_destroy(&migf->lock);
+ kfree(migf);
+
+ return 0;
+}
+
+static const struct file_operations qat_vf_save_fops = {
+ .owner = THIS_MODULE,
+ .read = qat_vf_save_read,
+ .unlocked_ioctl = qat_vf_precopy_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+ .release = qat_vf_release_file,
+};
+
+static int qat_vf_save_state(struct qat_vf_core_device *qat_vdev,
+ struct qat_vf_migration_file *migf)
+{
+ int ret;
+
+ ret = qat_vfmig_save_state(qat_vdev->mdev);
+ if (ret)
+ return ret;
+ migf->filled_size = qat_vdev->mdev->state_size;
+
+ return 0;
+}
+
+static int qat_vf_save_setup(struct qat_vf_core_device *qat_vdev,
+ struct qat_vf_migration_file *migf)
+{
+ int ret;
+
+ ret = qat_vfmig_save_setup(qat_vdev->mdev);
+ if (ret)
+ return ret;
+ migf->filled_size = qat_vdev->mdev->setup_size;
+
+ return 0;
+}
+
+/*
+ * Allocate a file handler for user space and then save the migration data for
+ * the device being migrated. If this is called in the pre-copy stage, save the
+ * pre-configured device data. Otherwise, if this is called in the stop-copy
+ * stage, save the device state. In both cases, update the data size which can
+ * then be read from user space.
+ */
+static struct qat_vf_migration_file *
+qat_vf_save_device_data(struct qat_vf_core_device *qat_vdev, bool pre_copy)
+{
+ struct qat_vf_migration_file *migf;
+ int ret;
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL);
+ if (!migf)
+ return ERR_PTR(-ENOMEM);
+
+ migf->filp = anon_inode_getfile("qat_vf_mig", &qat_vf_save_fops,
+ migf, O_RDONLY);
+ ret = PTR_ERR_OR_ZERO(migf->filp);
+ if (ret) {
+ kfree(migf);
+ return ERR_PTR(ret);
+ }
+
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+
+ if (pre_copy)
+ ret = qat_vf_save_setup(qat_vdev, migf);
+ else
+ ret = qat_vf_save_state(qat_vdev, migf);
+ if (ret) {
+ fput(migf->filp);
+ return ERR_PTR(ret);
+ }
+
+ migf->qat_vdev = qat_vdev;
+
+ return migf;
+}
+
+static ssize_t qat_vf_resume_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct qat_vf_migration_file *migf = filp->private_data;
+ struct qat_mig_dev *mig_dev = migf->qat_vdev->mdev;
+ loff_t end, *offs;
+ ssize_t done = 0;
+ int ret;
+
+ if (pos)
+ return -ESPIPE;
+ offs = &filp->f_pos;
+
+ if (*offs < 0 ||
+ check_add_overflow(len, *offs, &end))
+ return -EOVERFLOW;
+
+ if (end > mig_dev->state_size)
+ return -ENOMEM;
+
+ mutex_lock(&migf->lock);
+ if (migf->disabled) {
+ done = -ENODEV;
+ goto out_unlock;
+ }
+
+ ret = copy_from_user(mig_dev->state + *offs, buf, len);
+ if (ret) {
+ done = -EFAULT;
+ goto out_unlock;
+ }
+ *offs += len;
+ migf->filled_size += len;
+
+ /*
+ * Load the pre-configured device data first to check if the target
+ * device is compatible with the source device.
+ */
+ ret = qat_vfmig_load_setup(mig_dev, migf->filled_size);
+ if (ret && ret != -EAGAIN) {
+ done = ret;
+ goto out_unlock;
+ }
+ done = len;
+
+out_unlock:
+ mutex_unlock(&migf->lock);
+ return done;
+}
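The matching userspace side of this resume path is a plain write() loop into the resuming fd; a minimal sketch, assuming resume_fd is the data_fd returned when the device entered RESUMING.

#include <errno.h>
#include <stddef.h>
#include <unistd.h>

static int load_saved_state(int resume_fd, const char *data, size_t len)
{
	/* qat_vf_resume_write() validates the running total against the
	 * device state_size, so just stream the saved bytes in order. */
	while (len) {
		ssize_t n = write(resume_fd, data, len);

		if (n < 0)
			return -errno;
		data += n;
		len -= n;
	}
	return 0;
}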
+
+static const struct file_operations qat_vf_resume_fops = {
+ .owner = THIS_MODULE,
+ .write = qat_vf_resume_write,
+ .release = qat_vf_release_file,
+};
+
+static struct qat_vf_migration_file *
+qat_vf_resume_device_data(struct qat_vf_core_device *qat_vdev)
+{
+ struct qat_vf_migration_file *migf;
+ int ret;
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL);
+ if (!migf)
+ return ERR_PTR(-ENOMEM);
+
+ migf->filp = anon_inode_getfile("qat_vf_mig", &qat_vf_resume_fops, migf, O_WRONLY);
+ ret = PTR_ERR_OR_ZERO(migf->filp);
+ if (ret) {
+ kfree(migf);
+ return ERR_PTR(ret);
+ }
+
+ migf->qat_vdev = qat_vdev;
+ migf->filled_size = 0;
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+
+ return migf;
+}
+
+static int qat_vf_load_device_data(struct qat_vf_core_device *qat_vdev)
+{
+ return qat_vfmig_load_state(qat_vdev->mdev);
+}
+
+static struct file *qat_vf_pci_step_device_state(struct qat_vf_core_device *qat_vdev, u32 new)
+{
+ u32 cur = qat_vdev->mig_state;
+ int ret;
+
+ /*
+ * As the device is not capable of just stopping P2P DMAs, suspend the
+ * device completely once any of the P2P states is reached.
+ * While it is suspended, all its MMIO registers can still be operated
+ * correctly; jobs submitted through the ring are queued but not
+ * processed by the device. The MMIO states can be safely migrated to
+ * the target VF during the stop-copy stage and restored correctly in
+ * the target VF. All queued jobs can then be resumed.
+ */
+ if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+ ret = qat_vfmig_suspend(qat_vdev->mdev);
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
+ qat_vfmig_resume(qat_vdev->mdev);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) ||
+ (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P))
+ return NULL;
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ struct qat_vf_migration_file *migf;
+
+ migf = qat_vf_save_device_data(qat_vdev, false);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ qat_vdev->saving_migf = migf;
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
+ struct qat_vf_migration_file *migf;
+
+ migf = qat_vf_resume_device_data(qat_vdev);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ qat_vdev->resuming_migf = migf;
+ return migf->filp;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
+ qat_vf_disable_fds(qat_vdev);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
+ (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+ struct qat_vf_migration_file *migf;
+
+ migf = qat_vf_save_device_data(qat_vdev, true);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ qat_vdev->saving_migf = migf;
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ struct qat_vf_migration_file *migf = qat_vdev->saving_migf;
+
+ if (!migf)
+ return ERR_PTR(-EINVAL);
+ ret = qat_vf_save_state(qat_vdev, migf);
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
+ ret = qat_vf_load_device_data(qat_vdev);
+ if (ret)
+ return ERR_PTR(ret);
+
+ qat_vf_disable_fds(qat_vdev);
+ return NULL;
+ }
+
+ /* vfio_mig_get_next_state() does not use arcs other than the above */
+ WARN_ON(true);
+ return ERR_PTR(-EINVAL);
+}
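The arcs above are not called directly by userspace; a VMM requests a target state through the standard VFIO migration UAPI and the kernel walks the intermediate arcs via vfio_mig_get_next_state(), landing in qat_vf_pci_step_device_state(). A hedged sketch of that request, assuming device_fd is an open VFIO device fd:

#include <errno.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Request a device_state change and collect the data_fd that some arcs
 * (STOP_COPY, RESUMING, PRE_COPY) return. */
static int set_mig_state(int device_fd, __u32 new_state, int *data_fd)
{
	__u64 buf[(sizeof(struct vfio_device_feature) +
		   sizeof(struct vfio_device_feature_mig_state) + 7) / 8] = { 0 };
	struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
	struct vfio_device_feature_mig_state *mig =
		(struct vfio_device_feature_mig_state *)feature->data;

	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
	mig->device_state = new_state;

	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
		return -errno;

	*data_fd = mig->data_fd;
	return 0;
}

/* e.g. set_mig_state(fd, VFIO_DEVICE_STATE_PRE_COPY, &save_fd) to start a
 * pre-copy save, then VFIO_DEVICE_STATE_STOP_COPY for the final phase. */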
+
+static void qat_vf_reset_done(struct qat_vf_core_device *qat_vdev)
+{
+ qat_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+ qat_vfmig_reset(qat_vdev->mdev);
+ qat_vf_disable_fds(qat_vdev);
+}
+
+static struct file *qat_vf_pci_set_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state new_state)
+{
+ struct qat_vf_core_device *qat_vdev = container_of(vdev,
+ struct qat_vf_core_device, core_device.vdev);
+ enum vfio_device_mig_state next_state;
+ struct file *res = NULL;
+ int ret;
+
+ mutex_lock(&qat_vdev->state_mutex);
+ while (new_state != qat_vdev->mig_state) {
+ ret = vfio_mig_get_next_state(vdev, qat_vdev->mig_state,
+ new_state, &next_state);
+ if (ret) {
+ res = ERR_PTR(ret);
+ break;
+ }
+ res = qat_vf_pci_step_device_state(qat_vdev, next_state);
+ if (IS_ERR(res))
+ break;
+ qat_vdev->mig_state = next_state;
+ if (WARN_ON(res && new_state != qat_vdev->mig_state)) {
+ fput(res);
+ res = ERR_PTR(-EINVAL);
+ break;
+ }
+ }
+ mutex_unlock(&qat_vdev->state_mutex);
+
+ return res;
+}
+
+static int qat_vf_pci_get_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state *curr_state)
+{
+ struct qat_vf_core_device *qat_vdev = container_of(vdev,
+ struct qat_vf_core_device, core_device.vdev);
+
+ mutex_lock(&qat_vdev->state_mutex);
+ *curr_state = qat_vdev->mig_state;
+ mutex_unlock(&qat_vdev->state_mutex);
+
+ return 0;
+}
+
+static int qat_vf_pci_get_data_size(struct vfio_device *vdev,
+ unsigned long *stop_copy_length)
+{
+ struct qat_vf_core_device *qat_vdev = container_of(vdev,
+ struct qat_vf_core_device, core_device.vdev);
+
+ mutex_lock(&qat_vdev->state_mutex);
+ *stop_copy_length = qat_vdev->mdev->state_size;
+ mutex_unlock(&qat_vdev->state_mutex);
+
+ return 0;
+}
+
+static const struct vfio_migration_ops qat_vf_pci_mig_ops = {
+ .migration_set_state = qat_vf_pci_set_device_state,
+ .migration_get_state = qat_vf_pci_get_device_state,
+ .migration_get_data_size = qat_vf_pci_get_data_size,
+};
+
+static void qat_vf_pci_release_dev(struct vfio_device *core_vdev)
+{
+ struct qat_vf_core_device *qat_vdev = container_of(core_vdev,
+ struct qat_vf_core_device, core_device.vdev);
+
+ qat_vfmig_cleanup(qat_vdev->mdev);
+ qat_vfmig_destroy(qat_vdev->mdev);
+ mutex_destroy(&qat_vdev->state_mutex);
+ vfio_pci_core_release_dev(core_vdev);
+}
+
+static int qat_vf_pci_init_dev(struct vfio_device *core_vdev)
+{
+ struct qat_vf_core_device *qat_vdev = container_of(core_vdev,
+ struct qat_vf_core_device, core_device.vdev);
+ struct qat_mig_dev *mdev;
+ struct pci_dev *parent;
+ int ret, vf_id;
+
+ core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P |
+ VFIO_MIGRATION_PRE_COPY;
+ core_vdev->mig_ops = &qat_vf_pci_mig_ops;
+
+ ret = vfio_pci_core_init_dev(core_vdev);
+ if (ret)
+ return ret;
+
+ mutex_init(&qat_vdev->state_mutex);
+
+ parent = pci_physfn(qat_vdev->core_device.pdev);
+ vf_id = pci_iov_vf_id(qat_vdev->core_device.pdev);
+ if (vf_id < 0) {
+ ret = -ENODEV;
+ goto err_rel;
+ }
+
+ mdev = qat_vfmig_create(parent, vf_id);
+ if (IS_ERR(mdev)) {
+ ret = PTR_ERR(mdev);
+ goto err_rel;
+ }
+
+ ret = qat_vfmig_init(mdev);
+ if (ret)
+ goto err_destroy;
+
+ qat_vdev->mdev = mdev;
+
+ return 0;
+
+err_destroy:
+ qat_vfmig_destroy(mdev);
+err_rel:
+ vfio_pci_core_release_dev(core_vdev);
+ return ret;
+}
+
+static const struct vfio_device_ops qat_vf_pci_ops = {
+ .name = "qat-vf-vfio-pci",
+ .init = qat_vf_pci_init_dev,
+ .release = qat_vf_pci_release_dev,
+ .open_device = qat_vf_pci_open_device,
+ .close_device = qat_vf_pci_close_device,
+ .ioctl = vfio_pci_core_ioctl,
+ .read = vfio_pci_core_read,
+ .write = vfio_pci_core_write,
+ .mmap = vfio_pci_core_mmap,
+ .request = vfio_pci_core_request,
+ .match = vfio_pci_core_match,
+ .bind_iommufd = vfio_iommufd_physical_bind,
+ .unbind_iommufd = vfio_iommufd_physical_unbind,
+ .attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
+};
+
+static struct qat_vf_core_device *qat_vf_drvdata(struct pci_dev *pdev)
+{
+ struct vfio_pci_core_device *core_device = pci_get_drvdata(pdev);
+
+ return container_of(core_device, struct qat_vf_core_device, core_device);
+}
+
+static void qat_vf_pci_aer_reset_done(struct pci_dev *pdev)
+{
+ struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev);
+
+ if (!qat_vdev->mdev)
+ return;
+
+ mutex_lock(&qat_vdev->state_mutex);
+ qat_vf_reset_done(qat_vdev);
+ mutex_unlock(&qat_vdev->state_mutex);
+}
+
+static int
+qat_vf_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ struct device *dev = &pdev->dev;
+ struct qat_vf_core_device *qat_vdev;
+ int ret;
+
+ qat_vdev = vfio_alloc_device(qat_vf_core_device, core_device.vdev, dev, &qat_vf_pci_ops);
+ if (IS_ERR(qat_vdev))
+ return PTR_ERR(qat_vdev);
+
+ pci_set_drvdata(pdev, &qat_vdev->core_device);
+ ret = vfio_pci_core_register_device(&qat_vdev->core_device);
+ if (ret)
+ goto out_put_device;
+
+ return 0;
+
+out_put_device:
+ vfio_put_device(&qat_vdev->core_device.vdev);
+ return ret;
+}
+
+static void qat_vf_vfio_pci_remove(struct pci_dev *pdev)
+{
+ struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev);
+
+ vfio_pci_core_unregister_device(&qat_vdev->core_device);
+ vfio_put_device(&qat_vdev->core_device.vdev);
+}
+
+static const struct pci_device_id qat_vf_vfio_pci_table[] = {
+ /* Intel QAT GEN4 4xxx VF device */
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4941) },
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4943) },
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4945) },
+ {}
+};
+MODULE_DEVICE_TABLE(pci, qat_vf_vfio_pci_table);
+
+static const struct pci_error_handlers qat_vf_err_handlers = {
+ .reset_done = qat_vf_pci_aer_reset_done,
+ .error_detected = vfio_pci_core_aer_err_detected,
+};
+
+static struct pci_driver qat_vf_vfio_pci_driver = {
+ .name = "qat_vfio_pci",
+ .id_table = qat_vf_vfio_pci_table,
+ .probe = qat_vf_vfio_pci_probe,
+ .remove = qat_vf_vfio_pci_remove,
+ .err_handler = &qat_vf_err_handlers,
+ .driver_managed_dma = true,
+};
+module_pci_driver(qat_vf_vfio_pci_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Xin Zeng <xin.zeng@intel.com>");
+MODULE_DESCRIPTION("QAT VFIO PCI - VFIO PCI driver with live migration support for Intel(R) QAT GEN4 device family");
+MODULE_IMPORT_NS("CRYPTO_QAT");
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index cb5b7f865d58..5ba39f7623bb 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -71,6 +71,8 @@ static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev)
case PCI_DEVICE_ID_INTEL_QAT_C62X_VF:
case PCI_DEVICE_ID_INTEL_QAT_DH895XCC:
case PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF:
+ case PCI_DEVICE_ID_INTEL_DSA_SPR0:
+ case PCI_DEVICE_ID_INTEL_IAX_SPR0:
return true;
default:
return false;
@@ -109,9 +111,7 @@ static int vfio_pci_open_device(struct vfio_device *core_vdev)
if (ret)
return ret;
- if (vfio_pci_is_vga(pdev) &&
- pdev->vendor == PCI_VENDOR_ID_INTEL &&
- IS_ENABLED(CONFIG_VFIO_PCI_IGD)) {
+ if (vfio_pci_is_intel_display(pdev)) {
ret = vfio_pci_igd_init(vdev);
if (ret && ret != -ENODEV) {
pci_warn(pdev, "Failed to setup Intel IGD regions\n");
@@ -142,6 +142,8 @@ static const struct vfio_device_ops vfio_pci_ops = {
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
.detach_ioas = vfio_iommufd_physical_detach_ioas,
+ .pasid_attach_ioas = vfio_iommufd_physical_pasid_attach_ioas,
+ .pasid_detach_ioas = vfio_iommufd_physical_pasid_detach_ioas,
};
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 97422aafaa7b..8f02f236b5b4 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -313,6 +313,10 @@ static int vfio_virt_config_read(struct vfio_pci_core_device *vdev, int pos,
return count;
}
+static struct perm_bits direct_ro_perms = {
+ .readfn = vfio_direct_config_read,
+};
+
/* Default capability regions to read-only, no-virtualization */
static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = {
[0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
@@ -507,13 +511,13 @@ static void vfio_bar_fixup(struct vfio_pci_core_device *vdev)
mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1);
mask |= PCI_ROM_ADDRESS_ENABLE;
*vbar &= cpu_to_le32((u32)mask);
- } else if (pdev->resource[PCI_ROM_RESOURCE].flags &
- IORESOURCE_ROM_SHADOW) {
- mask = ~(0x20000 - 1);
+ } else if (pdev->rom && pdev->romlen) {
+ mask = ~(roundup_pow_of_two(pdev->romlen) - 1);
mask |= PCI_ROM_ADDRESS_ENABLE;
*vbar &= cpu_to_le32((u32)mask);
- } else
+ } else {
*vbar = 0;
+ }
vdev->bardirty = false;
}
@@ -1385,11 +1389,12 @@ static int vfio_ext_cap_len(struct vfio_pci_core_device *vdev, u16 ecap, u16 epo
switch (ecap) {
case PCI_EXT_CAP_ID_VNDR:
- ret = pci_read_config_dword(pdev, epos + PCI_VSEC_HDR, &dword);
+ ret = pci_read_config_dword(pdev, epos + PCI_VNDR_HEADER,
+ &dword);
if (ret)
return pcibios_err_to_errno(ret);
- return dword >> PCI_VSEC_HDR_LEN_SHIFT;
+ return PCI_VNDR_HEADER_LEN(dword);
case PCI_EXT_CAP_ID_VC:
case PCI_EXT_CAP_ID_VC9:
case PCI_EXT_CAP_ID_MFVC:
@@ -1809,7 +1814,8 @@ int vfio_config_init(struct vfio_pci_core_device *vdev)
cpu_to_le16(PCI_COMMAND_MEMORY);
}
- if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || vdev->nointx)
+ if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || vdev->nointx ||
+ !vdev->pdev->irq || vdev->pdev->irq == IRQ_NOTCONNECTED)
vconfig[PCI_INTERRUPT_PIN] = 0;
ret = vfio_cap_init(vdev);
@@ -1897,9 +1903,17 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_core_device *vdev, char __user
cap_start = *ppos;
} else {
if (*ppos >= PCI_CFG_SPACE_SIZE) {
- WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX);
+ /*
+ * We can get a cap_id that exceeds PCI_EXT_CAP_ID_MAX
+ * if we're hiding an unknown capability at the start
+ * of the extended capability list. Use default, ro
+ * access, which will virtualize the id and next values.
+ */
+ if (cap_id > PCI_EXT_CAP_ID_MAX)
+ perm = &direct_ro_perms;
+ else
+ perm = &ecap_perms[cap_id];
- perm = &ecap_perms[cap_id];
cap_start = vfio_find_cap_start(vdev, *ppos);
} else {
WARN_ON(cap_id > PCI_CAP_ID_MAX);
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index d94d61b92c1a..6328c3a05bcd 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -20,6 +20,7 @@
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
+#include <linux/pfn_t.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/types.h>
@@ -57,11 +58,6 @@ struct vfio_pci_vf_token {
int users;
};
-struct vfio_pci_mmap_vma {
- struct vm_area_struct *vma;
- struct list_head vma_next;
-};
-
static inline bool vfio_vga_disabled(void)
{
#ifdef CONFIG_VFIO_PCI_VGA
@@ -120,7 +116,7 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev)
res = &vdev->pdev->resource[bar];
- if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
+ if (vdev->pdev->non_mappable_bars)
goto no_mmap;
if (!(res->flags & IORESOURCE_MEM))
@@ -731,15 +727,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_finish_enable);
static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type)
{
if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
- u8 pin;
-
- if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) ||
- vdev->nointx || vdev->pdev->is_virtfn)
- return 0;
-
- pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
-
- return pin ? 1 : 0;
+ return vdev->vconfig[PCI_INTERRUPT_PIN] ? 1 : 0;
} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
u8 pos;
u16 flags;
@@ -778,25 +766,26 @@ static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
}
struct vfio_pci_fill_info {
- struct vfio_pci_dependent_device __user *devices;
- struct vfio_pci_dependent_device __user *devices_end;
struct vfio_device *vdev;
+ struct vfio_pci_dependent_device *devices;
+ int nr_devices;
u32 count;
u32 flags;
};
static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
{
- struct vfio_pci_dependent_device info = {
- .segment = pci_domain_nr(pdev->bus),
- .bus = pdev->bus->number,
- .devfn = pdev->devfn,
- };
+ struct vfio_pci_dependent_device *info;
struct vfio_pci_fill_info *fill = data;
- fill->count++;
- if (fill->devices >= fill->devices_end)
- return 0;
+ /* The topology changed since we counted devices */
+ if (fill->count >= fill->nr_devices)
+ return -EAGAIN;
+
+ info = &fill->devices[fill->count++];
+ info->segment = pci_domain_nr(pdev->bus);
+ info->bus = pdev->bus->number;
+ info->devfn = pdev->devfn;
if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) {
struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev);
@@ -809,19 +798,19 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
*/
vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
if (!vdev) {
- info.devid = VFIO_PCI_DEVID_NOT_OWNED;
+ info->devid = VFIO_PCI_DEVID_NOT_OWNED;
} else {
int id = vfio_iommufd_get_dev_id(vdev, iommufd);
if (id > 0)
- info.devid = id;
+ info->devid = id;
else if (id == -ENOENT)
- info.devid = VFIO_PCI_DEVID_OWNED;
+ info->devid = VFIO_PCI_DEVID_OWNED;
else
- info.devid = VFIO_PCI_DEVID_NOT_OWNED;
+ info->devid = VFIO_PCI_DEVID_NOT_OWNED;
}
/* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */
- if (info.devid == VFIO_PCI_DEVID_NOT_OWNED)
+ if (info->devid == VFIO_PCI_DEVID_NOT_OWNED)
fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
} else {
struct iommu_group *iommu_group;
@@ -830,13 +819,10 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
if (!iommu_group)
return -EPERM; /* Cannot reset non-isolated devices */
- info.group_id = iommu_group_id(iommu_group);
+ info->group_id = iommu_group_id(iommu_group);
iommu_group_put(iommu_group);
}
- if (copy_to_user(fill->devices, &info, sizeof(info)))
- return -EFAULT;
- fill->devices++;
return 0;
}
@@ -1060,31 +1046,27 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
info.flags = 0;
+ info.size = 0;
- /* Report the BAR size, not the ROM size */
- info.size = pci_resource_len(pdev, info.index);
- if (!info.size) {
- /* Shadow ROMs appear as PCI option ROMs */
- if (pdev->resource[PCI_ROM_RESOURCE].flags &
- IORESOURCE_ROM_SHADOW)
- info.size = 0x20000;
- else
- break;
- }
-
- /*
- * Is it really there? Enable memory decode for implicit access
- * in pci_map_rom().
- */
- cmd = vfio_pci_memory_lock_and_enable(vdev);
- io = pci_map_rom(pdev, &size);
- if (io) {
+ if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) {
+ /*
+ * Check ROM content is valid. Need to enable memory
+ * decode for ROM access in pci_map_rom().
+ */
+ cmd = vfio_pci_memory_lock_and_enable(vdev);
+ io = pci_map_rom(pdev, &size);
+ if (io) {
+ info.flags = VFIO_REGION_INFO_FLAG_READ;
+ /* Report the BAR size, not the ROM size. */
+ info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE);
+ pci_unmap_rom(pdev, io);
+ }
+ vfio_pci_memory_unlock_and_restore(vdev, cmd);
+ } else if (pdev->rom && pdev->romlen) {
info.flags = VFIO_REGION_INFO_FLAG_READ;
- pci_unmap_rom(pdev, io);
- } else {
- info.size = 0;
+ /* Report BAR size as power of two. */
+ info.size = roundup_pow_of_two(pdev->romlen);
}
- vfio_pci_memory_unlock_and_restore(vdev, cmd);
break;
}
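From userspace, the effect of this change is visible through VFIO_DEVICE_GET_REGION_INFO on the ROM region index: a shadow ROM exposed via pdev->rom/romlen is now reported like a real option ROM BAR, readable and with a power-of-two size. A minimal sketch of that query, assuming device_fd is an open VFIO device fd:

#include <errno.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int query_rom_region(int device_fd)
{
	struct vfio_region_info info = {
		.argsz = sizeof(info),
		.index = VFIO_PCI_ROM_REGION_INDEX,
	};

	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info))
		return -errno;

	if (!(info.flags & VFIO_REGION_INFO_FLAG_READ) || !info.size) {
		printf("no readable ROM exposed\n");
		return 0;
	}

	printf("ROM: size 0x%llx at offset 0x%llx\n",
	       (unsigned long long)info.size,
	       (unsigned long long)info.offset);
	return 0;
}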
@@ -1258,10 +1240,11 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
{
unsigned long minsz =
offsetofend(struct vfio_pci_hot_reset_info, count);
+ struct vfio_pci_dependent_device *devices = NULL;
struct vfio_pci_hot_reset_info hdr;
struct vfio_pci_fill_info fill = {};
bool slot = false;
- int ret = 0;
+ int ret, count = 0;
if (copy_from_user(&hdr, arg, minsz))
return -EFAULT;
@@ -1277,9 +1260,26 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
else if (pci_probe_reset_bus(vdev->pdev->bus))
return -ENODEV;
- fill.devices = arg->devices;
- fill.devices_end = arg->devices +
- (hdr.argsz - sizeof(hdr)) / sizeof(arg->devices[0]);
+ ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
+ &count, slot);
+ if (ret)
+ return ret;
+
+ if (WARN_ON(!count)) /* Should always be at least one */
+ return -ERANGE;
+
+ if (count > (hdr.argsz - sizeof(hdr)) / sizeof(*devices)) {
+ hdr.count = count;
+ ret = -ENOSPC;
+ goto header;
+ }
+
+ devices = kcalloc(count, sizeof(*devices), GFP_KERNEL);
+ if (!devices)
+ return -ENOMEM;
+
+ fill.devices = devices;
+ fill.nr_devices = count;
fill.vdev = &vdev->vdev;
if (vfio_device_cdev_opened(&vdev->vdev))
@@ -1291,21 +1291,28 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
&fill, slot);
mutex_unlock(&vdev->vdev.dev_set->lock);
if (ret)
- return ret;
+ goto out;
+
+ if (copy_to_user(arg->devices, devices,
+ sizeof(*devices) * fill.count)) {
+ ret = -EFAULT;
+ goto out;
+ }
hdr.count = fill.count;
hdr.flags = fill.flags;
- if (copy_to_user(arg, &hdr, minsz))
- return -EFAULT;
- if (fill.count > fill.devices - arg->devices)
- return -ENOSPC;
- return 0;
+header:
+ if (copy_to_user(arg, &hdr, minsz))
+ ret = -EFAULT;
+out:
+ kfree(devices);
+ return ret;
}
static int
vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
- int array_count, bool slot,
+ u32 array_count, bool slot,
struct vfio_pci_hot_reset __user *arg)
{
int32_t *group_fds;
@@ -1587,100 +1594,20 @@ ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *bu
}
EXPORT_SYMBOL_GPL(vfio_pci_core_write);
-/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */
-static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try)
+static void vfio_pci_zap_bars(struct vfio_pci_core_device *vdev)
{
- struct vfio_pci_mmap_vma *mmap_vma, *tmp;
-
- /*
- * Lock ordering:
- * vma_lock is nested under mmap_lock for vm_ops callback paths.
- * The memory_lock semaphore is used by both code paths calling
- * into this function to zap vmas and the vm_ops.fault callback
- * to protect the memory enable state of the device.
- *
- * When zapping vmas we need to maintain the mmap_lock => vma_lock
- * ordering, which requires using vma_lock to walk vma_list to
- * acquire an mm, then dropping vma_lock to get the mmap_lock and
- * reacquiring vma_lock. This logic is derived from similar
- * requirements in uverbs_user_mmap_disassociate().
- *
- * mmap_lock must always be the top-level lock when it is taken.
- * Therefore we can only hold the memory_lock write lock when
- * vma_list is empty, as we'd need to take mmap_lock to clear
- * entries. vma_list can only be guaranteed empty when holding
- * vma_lock, thus memory_lock is nested under vma_lock.
- *
- * This enables the vm_ops.fault callback to acquire vma_lock,
- * followed by memory_lock read lock, while already holding
- * mmap_lock without risk of deadlock.
- */
- while (1) {
- struct mm_struct *mm = NULL;
-
- if (try) {
- if (!mutex_trylock(&vdev->vma_lock))
- return 0;
- } else {
- mutex_lock(&vdev->vma_lock);
- }
- while (!list_empty(&vdev->vma_list)) {
- mmap_vma = list_first_entry(&vdev->vma_list,
- struct vfio_pci_mmap_vma,
- vma_next);
- mm = mmap_vma->vma->vm_mm;
- if (mmget_not_zero(mm))
- break;
-
- list_del(&mmap_vma->vma_next);
- kfree(mmap_vma);
- mm = NULL;
- }
- if (!mm)
- return 1;
- mutex_unlock(&vdev->vma_lock);
-
- if (try) {
- if (!mmap_read_trylock(mm)) {
- mmput(mm);
- return 0;
- }
- } else {
- mmap_read_lock(mm);
- }
- if (try) {
- if (!mutex_trylock(&vdev->vma_lock)) {
- mmap_read_unlock(mm);
- mmput(mm);
- return 0;
- }
- } else {
- mutex_lock(&vdev->vma_lock);
- }
- list_for_each_entry_safe(mmap_vma, tmp,
- &vdev->vma_list, vma_next) {
- struct vm_area_struct *vma = mmap_vma->vma;
-
- if (vma->vm_mm != mm)
- continue;
+ struct vfio_device *core_vdev = &vdev->vdev;
+ loff_t start = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX);
+ loff_t end = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX);
+ loff_t len = end - start;
- list_del(&mmap_vma->vma_next);
- kfree(mmap_vma);
-
- zap_vma_ptes(vma, vma->vm_start,
- vma->vm_end - vma->vm_start);
- }
- mutex_unlock(&vdev->vma_lock);
- mmap_read_unlock(mm);
- mmput(mm);
- }
+ unmap_mapping_range(core_vdev->inode->i_mapping, start, len, true);
}
void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev)
{
- vfio_pci_zap_and_vma_lock(vdev, false);
down_write(&vdev->memory_lock);
- mutex_unlock(&vdev->vma_lock);
+ vfio_pci_zap_bars(vdev);
}
u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev)
@@ -1702,100 +1629,83 @@ void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 c
up_write(&vdev->memory_lock);
}
-/* Caller holds vma_lock */
-static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev,
- struct vm_area_struct *vma)
-{
- struct vfio_pci_mmap_vma *mmap_vma;
-
- mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL_ACCOUNT);
- if (!mmap_vma)
- return -ENOMEM;
-
- mmap_vma->vma = vma;
- list_add(&mmap_vma->vma_next, &vdev->vma_list);
-
- return 0;
-}
-
-/*
- * Zap mmaps on open so that we can fault them in on access and therefore
- * our vma_list only tracks mappings accessed since last zap.
- */
-static void vfio_pci_mmap_open(struct vm_area_struct *vma)
-{
- zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
-}
-
-static void vfio_pci_mmap_close(struct vm_area_struct *vma)
+static unsigned long vma_to_pfn(struct vm_area_struct *vma)
{
struct vfio_pci_core_device *vdev = vma->vm_private_data;
- struct vfio_pci_mmap_vma *mmap_vma;
+ int index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+ u64 pgoff;
- mutex_lock(&vdev->vma_lock);
- list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
- if (mmap_vma->vma == vma) {
- list_del(&mmap_vma->vma_next);
- kfree(mmap_vma);
- break;
- }
- }
- mutex_unlock(&vdev->vma_lock);
+ pgoff = vma->vm_pgoff &
+ ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+ return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff;
}
-static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
+static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
+ unsigned int order)
{
struct vm_area_struct *vma = vmf->vma;
struct vfio_pci_core_device *vdev = vma->vm_private_data;
- struct vfio_pci_mmap_vma *mmap_vma;
- vm_fault_t ret = VM_FAULT_NOPAGE;
-
- mutex_lock(&vdev->vma_lock);
- down_read(&vdev->memory_lock);
+ unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1);
+ unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+ unsigned long pfn = vma_to_pfn(vma) + pgoff;
+ vm_fault_t ret = VM_FAULT_SIGBUS;
- /*
- * Memory region cannot be accessed if the low power feature is engaged
- * or memory access is disabled.
- */
- if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) {
- ret = VM_FAULT_SIGBUS;
- goto up_out;
+ if (order && (addr < vma->vm_start ||
+ addr + (PAGE_SIZE << order) > vma->vm_end ||
+ pfn & ((1 << order) - 1))) {
+ ret = VM_FAULT_FALLBACK;
+ goto out;
}
- /*
- * We populate the whole vma on fault, so we need to test whether
- * the vma has already been mapped, such as for concurrent faults
- * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if
- * we ask it to fill the same range again.
- */
- list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
- if (mmap_vma->vma == vma)
- goto up_out;
- }
+ down_read(&vdev->memory_lock);
- if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
- vma->vm_end - vma->vm_start,
- vma->vm_page_prot)) {
- ret = VM_FAULT_SIGBUS;
- zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
- goto up_out;
- }
+ if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
+ goto out_unlock;
- if (__vfio_pci_add_vma(vdev, vma)) {
- ret = VM_FAULT_OOM;
- zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+ switch (order) {
+ case 0:
+ ret = vmf_insert_pfn(vma, vmf->address, pfn);
+ break;
+#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
+ case PMD_ORDER:
+ ret = vmf_insert_pfn_pmd(vmf,
+ __pfn_to_pfn_t(pfn, PFN_DEV), false);
+ break;
+#endif
+#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
+ case PUD_ORDER:
+ ret = vmf_insert_pfn_pud(vmf,
+ __pfn_to_pfn_t(pfn, PFN_DEV), false);
+ break;
+#endif
+ default:
+ ret = VM_FAULT_FALLBACK;
}
-up_out:
+out_unlock:
up_read(&vdev->memory_lock);
- mutex_unlock(&vdev->vma_lock);
+out:
+ dev_dbg_ratelimited(&vdev->pdev->dev,
+ "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n",
+ __func__, order,
+ vma->vm_pgoff >>
+ (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT),
+ pgoff, (unsigned int)ret);
+
return ret;
}
+static vm_fault_t vfio_pci_mmap_page_fault(struct vm_fault *vmf)
+{
+ return vfio_pci_mmap_huge_fault(vmf, 0);
+}
+
static const struct vm_operations_struct vfio_pci_mmap_ops = {
- .open = vfio_pci_mmap_open,
- .close = vfio_pci_mmap_close,
- .fault = vfio_pci_mmap_fault,
+ .fault = vfio_pci_mmap_page_fault,
+#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
+ .huge_fault = vfio_pci_mmap_huge_fault,
+#endif
};
int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma)
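
The new huge_fault path above installs a PMD/PUD-sized mapping only when the rounded fault address stays inside the VMA and the target PFN is naturally aligned to the order; otherwise it returns VM_FAULT_FALLBACK and the fault is retried at a smaller order. A standalone sketch of that check, not part of the patch, assuming 4K pages and PMD order 9 (x86-64-style 2 MiB mappings):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/*
 * Mirror of the order check in vfio_pci_mmap_huge_fault(): a mapping of
 * 2^order pages is only inserted when the aligned address fits in the
 * VMA and the PFN is naturally aligned; otherwise fall back.
 */
static bool can_insert_order(unsigned long fault_addr, unsigned long vm_start,
			     unsigned long vm_end, unsigned long base_pfn,
			     unsigned int order)
{
	unsigned long addr = fault_addr & ~((PAGE_SIZE << order) - 1);
	unsigned long pfn = base_pfn + ((addr - vm_start) >> PAGE_SHIFT);

	if (!order)
		return true;
	return addr >= vm_start &&
	       addr + (PAGE_SIZE << order) <= vm_end &&
	       !(pfn & ((1UL << order) - 1));
}

int main(void)
{
	/* 2 MiB BAR mapping at a 2 MiB-aligned PFN: a PMD mapping fits */
	printf("%d\n", can_insert_order(0x7f0000001000UL, 0x7f0000000000UL,
					0x7f0000200000UL, 0x100000UL, 9));
	/* Same VMA but an unaligned base PFN forces fallback to order 0 */
	printf("%d\n", can_insert_order(0x7f0000001000UL, 0x7f0000000000UL,
					0x7f0000200000UL, 0x100001UL, 9));
	return 0;
}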
@@ -1857,11 +1767,12 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma
vma->vm_private_data = vdev;
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
- vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
+ vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
/*
- * See remap_pfn_range(), called from vfio_pci_fault() but we can't
- * change vm_flags within the fault handler. Set them now.
+ * Set vm_flags now, they should not be changed in the fault handler.
+ * We want the same flags and page protection (decrypted above) as
+ * io_remap_pfn_range() would set.
*
* VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64,
* allowing KVM stage 2 device mapping attributes to use Normal-NC
@@ -2179,8 +2090,6 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
mutex_init(&vdev->ioeventfds_lock);
INIT_LIST_HEAD(&vdev->dummy_resources_list);
INIT_LIST_HEAD(&vdev->ioeventfds_list);
- mutex_init(&vdev->vma_lock);
- INIT_LIST_HEAD(&vdev->vma_list);
INIT_LIST_HEAD(&vdev->sriov_pfs_item);
init_rwsem(&vdev->memory_lock);
xa_init(&vdev->ctx);
@@ -2196,7 +2105,6 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev)
mutex_destroy(&vdev->igate);
mutex_destroy(&vdev->ioeventfds_lock);
- mutex_destroy(&vdev->vma_lock);
kfree(vdev->region);
kfree(vdev->pm_save);
}
@@ -2474,26 +2382,15 @@ unwind:
return ret;
}
-/*
- * We need to get memory_lock for each device, but devices can share mmap_lock,
- * therefore we need to zap and hold the vma_lock for each device, and only then
- * get each memory_lock.
- */
static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
struct vfio_pci_group_info *groups,
struct iommufd_ctx *iommufd_ctx)
{
- struct vfio_pci_core_device *cur_mem;
- struct vfio_pci_core_device *cur_vma;
- struct vfio_pci_core_device *cur;
+ struct vfio_pci_core_device *vdev;
struct pci_dev *pdev;
- bool is_mem = true;
int ret;
mutex_lock(&dev_set->lock);
- cur_mem = list_first_entry(&dev_set->device_list,
- struct vfio_pci_core_device,
- vdev.dev_set_list);
pdev = vfio_pci_dev_set_resettable(dev_set);
if (!pdev) {
@@ -2510,7 +2407,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
if (ret)
goto err_unlock;
- list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
+ list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) {
bool owned;
/*
@@ -2534,38 +2431,38 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
* Otherwise, reset is not allowed.
*/
if (iommufd_ctx) {
- int devid = vfio_iommufd_get_dev_id(&cur_vma->vdev,
+ int devid = vfio_iommufd_get_dev_id(&vdev->vdev,
iommufd_ctx);
owned = (devid > 0 || devid == -ENOENT);
} else {
- owned = vfio_dev_in_groups(&cur_vma->vdev, groups);
+ owned = vfio_dev_in_groups(&vdev->vdev, groups);
}
if (!owned) {
ret = -EINVAL;
- goto err_undo;
+ break;
}
/*
- * Locking multiple devices is prone to deadlock, runaway and
- * unwind if we hit contention.
+ * Take the memory write lock for each device and zap BAR
+ * mappings to prevent the user accessing the device while in
+	 * reset. Locking multiple devices is prone to deadlock, so
+	 * run away and unwind if we hit contention.
*/
- if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) {
+ if (!down_write_trylock(&vdev->memory_lock)) {
ret = -EBUSY;
- goto err_undo;
+ break;
}
+
+ vfio_pci_zap_bars(vdev);
}
- cur_vma = NULL;
- list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) {
- if (!down_write_trylock(&cur_mem->memory_lock)) {
- ret = -EBUSY;
- goto err_undo;
- }
- mutex_unlock(&cur_mem->vma_lock);
+ if (!list_entry_is_head(vdev,
+ &dev_set->device_list, vdev.dev_set_list)) {
+ vdev = list_prev_entry(vdev, vdev.dev_set_list);
+ goto err_undo;
}
- cur_mem = NULL;
/*
* The pci_reset_bus() will reset all the devices in the bus.
@@ -2576,25 +2473,22 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
* cause the PCI config space reset without restoring the original
* state (saved locally in 'vdev->pm_save').
*/
- list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
- vfio_pci_set_power_state(cur, PCI_D0);
+ list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
+ vfio_pci_set_power_state(vdev, PCI_D0);
ret = pci_reset_bus(pdev);
+ vdev = list_last_entry(&dev_set->device_list,
+ struct vfio_pci_core_device, vdev.dev_set_list);
+
err_undo:
- list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
- if (cur == cur_mem)
- is_mem = false;
- if (cur == cur_vma)
- break;
- if (is_mem)
- up_write(&cur->memory_lock);
- else
- mutex_unlock(&cur->vma_lock);
- }
+ list_for_each_entry_from_reverse(vdev, &dev_set->device_list,
+ vdev.dev_set_list)
+ up_write(&vdev->memory_lock);
+
+ list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
+ pm_runtime_put(&vdev->pdev->dev);
- list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
- pm_runtime_put(&cur->pdev->dev);
err_unlock:
mutex_unlock(&dev_set->lock);
return ret;
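
With the rework above, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO counts the affected devices up front and, when the user buffer is too small, fills hdr.count and fails with -ENOSPC instead of truncating the list. A hypothetical userspace sketch of the resulting two-call pattern (the open device fd is an assumption), not part of the patch:

#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static struct vfio_pci_hot_reset_info *get_hot_reset_info(int device_fd)
{
	struct vfio_pci_hot_reset_info *info, *tmp;
	size_t argsz = sizeof(*info);

	info = calloc(1, argsz);
	if (!info)
		return NULL;
	info->argsz = argsz;

	/* First call: header only; the kernel reports the count via -ENOSPC */
	if (ioctl(device_fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info) &&
	    errno != ENOSPC)
		goto err;

	argsz = sizeof(*info) +
		info->count * sizeof(struct vfio_pci_dependent_device);
	tmp = realloc(info, argsz);
	if (!tmp)
		goto err;
	info = tmp;
	memset(info, 0, argsz);
	info->argsz = argsz;

	/* Second call: buffer is large enough, devices[] gets filled in */
	if (ioctl(device_fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info))
		goto err;
	return info;
err:
	free(info);
	return NULL;
}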
diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c
index dd70e2431bd7..ef490a4545f4 100644
--- a/drivers/vfio/pci/vfio_pci_igd.c
+++ b/drivers/vfio/pci/vfio_pci_igd.c
@@ -435,6 +435,12 @@ static int vfio_pci_igd_cfg_init(struct vfio_pci_core_device *vdev)
return 0;
}
+bool vfio_pci_is_intel_display(struct pci_dev *pdev)
+{
+ return (pdev->vendor == PCI_VENDOR_ID_INTEL) &&
+ ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY);
+}
+
int vfio_pci_igd_init(struct vfio_pci_core_device *vdev)
{
int ret;
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index fb5392b749ff..565966351dfa 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -23,11 +23,12 @@
#include "vfio_pci_priv.h"
struct vfio_pci_irq_ctx {
- struct eventfd_ctx *trigger;
- struct virqfd *unmask;
- struct virqfd *mask;
- char *name;
- bool masked;
+ struct vfio_pci_core_device *vdev;
+ struct eventfd_ctx *trigger;
+ struct virqfd *unmask;
+ struct virqfd *mask;
+ char *name;
+ bool masked;
struct irq_bypass_producer producer;
};
@@ -84,19 +85,14 @@ vfio_irq_ctx_alloc(struct vfio_pci_core_device *vdev, unsigned long index)
/*
* INTx
*/
-static void vfio_send_intx_eventfd(void *opaque, void *unused)
+static void vfio_send_intx_eventfd(void *opaque, void *data)
{
struct vfio_pci_core_device *vdev = opaque;
if (likely(is_intx(vdev) && !vdev->virq_disabled)) {
- struct vfio_pci_irq_ctx *ctx;
- struct eventfd_ctx *trigger;
+ struct vfio_pci_irq_ctx *ctx = data;
+ struct eventfd_ctx *trigger = READ_ONCE(ctx->trigger);
- ctx = vfio_irq_ctx_get(vdev, 0);
- if (WARN_ON_ONCE(!ctx))
- return;
-
- trigger = READ_ONCE(ctx->trigger);
if (likely(trigger))
eventfd_signal(trigger);
}
@@ -166,11 +162,11 @@ bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev)
* a signal is necessary, which can then be handled via a work queue
* or directly depending on the caller.
*/
-static int vfio_pci_intx_unmask_handler(void *opaque, void *unused)
+static int vfio_pci_intx_unmask_handler(void *opaque, void *data)
{
struct vfio_pci_core_device *vdev = opaque;
struct pci_dev *pdev = vdev->pdev;
- struct vfio_pci_irq_ctx *ctx;
+ struct vfio_pci_irq_ctx *ctx = data;
unsigned long flags;
int ret = 0;
@@ -186,10 +182,6 @@ static int vfio_pci_intx_unmask_handler(void *opaque, void *unused)
goto out_unlock;
}
- ctx = vfio_irq_ctx_get(vdev, 0);
- if (WARN_ON_ONCE(!ctx))
- goto out_unlock;
-
if (ctx->masked && !vdev->virq_disabled) {
/*
* A pending interrupt here would immediately trigger,
@@ -213,10 +205,12 @@ out_unlock:
static void __vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev)
{
+ struct vfio_pci_irq_ctx *ctx = vfio_irq_ctx_get(vdev, 0);
+
lockdep_assert_held(&vdev->igate);
- if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0)
- vfio_send_intx_eventfd(vdev, NULL);
+ if (vfio_pci_intx_unmask_handler(vdev, ctx) > 0)
+ vfio_send_intx_eventfd(vdev, ctx);
}
void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev)
@@ -228,15 +222,11 @@ void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev)
static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
{
- struct vfio_pci_core_device *vdev = dev_id;
- struct vfio_pci_irq_ctx *ctx;
+ struct vfio_pci_irq_ctx *ctx = dev_id;
+ struct vfio_pci_core_device *vdev = ctx->vdev;
unsigned long flags;
int ret = IRQ_NONE;
- ctx = vfio_irq_ctx_get(vdev, 0);
- if (WARN_ON_ONCE(!ctx))
- return ret;
-
spin_lock_irqsave(&vdev->irqlock, flags);
if (!vdev->pci_2_3) {
@@ -252,7 +242,7 @@ static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
spin_unlock_irqrestore(&vdev->irqlock, flags);
if (ret == IRQ_HANDLED)
- vfio_send_intx_eventfd(vdev, NULL);
+ vfio_send_intx_eventfd(vdev, ctx);
return ret;
}
@@ -269,7 +259,7 @@ static int vfio_intx_enable(struct vfio_pci_core_device *vdev,
if (!is_irq_none(vdev))
return -EINVAL;
- if (!pdev->irq)
+ if (!pdev->irq || pdev->irq == IRQ_NOTCONNECTED)
return -ENODEV;
name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", pci_name(pdev));
@@ -277,11 +267,14 @@ static int vfio_intx_enable(struct vfio_pci_core_device *vdev,
return -ENOMEM;
ctx = vfio_irq_ctx_alloc(vdev, 0);
- if (!ctx)
+ if (!ctx) {
+ kfree(name);
return -ENOMEM;
+ }
ctx->name = name;
ctx->trigger = trigger;
+ ctx->vdev = vdev;
/*
* Fill the initial masked state based on virq_disabled. After
@@ -312,7 +305,7 @@ static int vfio_intx_enable(struct vfio_pci_core_device *vdev,
vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX;
ret = request_irq(pdev->irq, vfio_intx_handler,
- irqflags, ctx->name, vdev);
+ irqflags, ctx->name, ctx);
if (ret) {
vdev->irq_type = VFIO_PCI_NUM_IRQS;
kfree(name);
@@ -358,7 +351,7 @@ static void vfio_intx_disable(struct vfio_pci_core_device *vdev)
if (ctx) {
vfio_virqfd_disable(&ctx->unmask);
vfio_virqfd_disable(&ctx->mask);
- free_irq(pdev->irq, vdev);
+ free_irq(pdev->irq, ctx);
if (ctx->trigger)
eventfd_ctx_put(ctx->trigger);
kfree(ctx->name);
@@ -606,7 +599,7 @@ static int vfio_pci_set_intx_unmask(struct vfio_pci_core_device *vdev,
if (fd >= 0)
return vfio_virqfd_enable((void *) vdev,
vfio_pci_intx_unmask_handler,
- vfio_send_intx_eventfd, NULL,
+ vfio_send_intx_eventfd, ctx,
&ctx->unmask, fd);
vfio_virqfd_disable(&ctx->unmask);
@@ -673,11 +666,11 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
return -EINVAL;
if (flags & VFIO_IRQ_SET_DATA_NONE) {
- vfio_send_intx_eventfd(vdev, NULL);
+ vfio_send_intx_eventfd(vdev, vfio_irq_ctx_get(vdev, 0));
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
uint8_t trigger = *(uint8_t *)data;
if (trigger)
- vfio_send_intx_eventfd(vdev, NULL);
+ vfio_send_intx_eventfd(vdev, vfio_irq_ctx_get(vdev, 0));
}
return 0;
}
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index 5e4fa69aee16..a9972eacb293 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -67,8 +67,14 @@ void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev,
u16 cmd);
#ifdef CONFIG_VFIO_PCI_IGD
+bool vfio_pci_is_intel_display(struct pci_dev *pdev);
int vfio_pci_igd_init(struct vfio_pci_core_device *vdev);
#else
+static inline bool vfio_pci_is_intel_display(struct pci_dev *pdev)
+{
+ return false;
+}
+
static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev)
{
return -ENODEV;
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
index 03b8f7ada1ac..6192788c8ba3 100644
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -16,6 +16,7 @@
#include <linux/io.h>
#include <linux/vfio.h>
#include <linux/vgaarb.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
#include "vfio_pci_priv.h"
@@ -61,9 +62,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_iowrite##size);
VFIO_IOWRITE(8)
VFIO_IOWRITE(16)
VFIO_IOWRITE(32)
-#ifdef iowrite64
VFIO_IOWRITE(64)
-#endif
#define VFIO_IOREAD(size) \
int vfio_pci_core_ioread##size(struct vfio_pci_core_device *vdev, \
@@ -89,6 +88,43 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_ioread##size);
VFIO_IOREAD(8)
VFIO_IOREAD(16)
VFIO_IOREAD(32)
+VFIO_IOREAD(64)
+
+#define VFIO_IORDWR(size) \
+static int vfio_pci_iordwr##size(struct vfio_pci_core_device *vdev,\
+ bool iswrite, bool test_mem, \
+ void __iomem *io, char __user *buf, \
+ loff_t off, size_t *filled) \
+{ \
+ u##size val; \
+ int ret; \
+ \
+ if (iswrite) { \
+ if (copy_from_user(&val, buf, sizeof(val))) \
+ return -EFAULT; \
+ \
+ ret = vfio_pci_core_iowrite##size(vdev, test_mem, \
+ val, io + off); \
+ if (ret) \
+ return ret; \
+ } else { \
+ ret = vfio_pci_core_ioread##size(vdev, test_mem, \
+ &val, io + off); \
+ if (ret) \
+ return ret; \
+ \
+ if (copy_to_user(buf, &val, sizeof(val))) \
+ return -EFAULT; \
+ } \
+ \
+ *filled = sizeof(val); \
+ return 0; \
+} \
+
+VFIO_IORDWR(8)
+VFIO_IORDWR(16)
+VFIO_IORDWR(32)
+VFIO_IORDWR(64)
/*
* Read or write from an __iomem region (MMIO or I/O port) with an excluded
@@ -114,72 +150,31 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
else
fillable = 0;
+ if (fillable >= 8 && !(off % 8)) {
+ ret = vfio_pci_iordwr64(vdev, iswrite, test_mem,
+ io, buf, off, &filled);
+ if (ret)
+ return ret;
+
+ } else
if (fillable >= 4 && !(off % 4)) {
- u32 val;
-
- if (iswrite) {
- if (copy_from_user(&val, buf, 4))
- return -EFAULT;
-
- ret = vfio_pci_core_iowrite32(vdev, test_mem,
- val, io + off);
- if (ret)
- return ret;
- } else {
- ret = vfio_pci_core_ioread32(vdev, test_mem,
- &val, io + off);
- if (ret)
- return ret;
-
- if (copy_to_user(buf, &val, 4))
- return -EFAULT;
- }
+ ret = vfio_pci_iordwr32(vdev, iswrite, test_mem,
+ io, buf, off, &filled);
+ if (ret)
+ return ret;
- filled = 4;
} else if (fillable >= 2 && !(off % 2)) {
- u16 val;
-
- if (iswrite) {
- if (copy_from_user(&val, buf, 2))
- return -EFAULT;
-
- ret = vfio_pci_core_iowrite16(vdev, test_mem,
- val, io + off);
- if (ret)
- return ret;
- } else {
- ret = vfio_pci_core_ioread16(vdev, test_mem,
- &val, io + off);
- if (ret)
- return ret;
-
- if (copy_to_user(buf, &val, 2))
- return -EFAULT;
- }
+ ret = vfio_pci_iordwr16(vdev, iswrite, test_mem,
+ io, buf, off, &filled);
+ if (ret)
+ return ret;
- filled = 2;
} else if (fillable) {
- u8 val;
-
- if (iswrite) {
- if (copy_from_user(&val, buf, 1))
- return -EFAULT;
-
- ret = vfio_pci_core_iowrite8(vdev, test_mem,
- val, io + off);
- if (ret)
- return ret;
- } else {
- ret = vfio_pci_core_ioread8(vdev, test_mem,
- &val, io + off);
- if (ret)
- return ret;
-
- if (copy_to_user(buf, &val, 1))
- return -EFAULT;
- }
+ ret = vfio_pci_iordwr8(vdev, iswrite, test_mem,
+ io, buf, off, &filled);
+ if (ret)
+ return ret;
- filled = 1;
} else {
/* Fill reads with -1, drop writes */
filled = min(count, (size_t)(x_end - off));
@@ -242,9 +237,8 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
if (pci_resource_start(pdev, bar))
end = pci_resource_len(pdev, bar);
- else if (bar == PCI_ROM_RESOURCE &&
- pdev->resource[bar].flags & IORESOURCE_ROM_SHADOW)
- end = 0x20000;
+ else if (bar == PCI_ROM_RESOURCE && pdev->rom && pdev->romlen)
+ end = roundup_pow_of_two(pdev->romlen);
else
return -EINVAL;
@@ -259,11 +253,14 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
* excluded range at the end of the actual ROM. This makes
* filling large ROM BARs much faster.
*/
- io = pci_map_rom(pdev, &x_start);
- if (!io) {
- done = -ENOMEM;
- goto out;
+ if (pci_resource_start(pdev, bar)) {
+ io = pci_map_rom(pdev, &x_start);
+ } else {
+ io = ioremap(pdev->rom, pdev->romlen);
+ x_start = pdev->romlen;
}
+ if (!io)
+ return -ENOMEM;
x_end = end;
} else {
int ret = vfio_pci_core_setup_barmap(vdev, bar);
@@ -286,8 +283,13 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
if (done >= 0)
*ppos += done;
- if (bar == PCI_ROM_RESOURCE)
- pci_unmap_rom(pdev, io);
+ if (bar == PCI_ROM_RESOURCE) {
+ if (pci_resource_start(pdev, bar))
+ pci_unmap_rom(pdev, io);
+ else
+ iounmap(io);
+ }
+
out:
return done;
}
@@ -379,12 +381,10 @@ static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd,
vfio_pci_core_iowrite32(ioeventfd->vdev, test_mem,
ioeventfd->data, ioeventfd->addr);
break;
-#ifdef iowrite64
case 8:
vfio_pci_core_iowrite64(ioeventfd->vdev, test_mem,
ioeventfd->data, ioeventfd->addr);
break;
-#endif
}
}
@@ -438,10 +438,8 @@ int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset,
pos >= vdev->msix_offset + vdev->msix_size))
return -EINVAL;
-#ifndef iowrite64
if (count == 8)
return -EINVAL;
-#endif
ret = vfio_pci_core_setup_barmap(vdev, bar);
if (ret)
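
With iowrite64()/ioread64() always available through the lo-hi fallback header and the new vfio_pci_iordwr64() helper, a naturally aligned 8-byte access from userspace is now serviced as a single 64-bit MMIO operation rather than two 32-bit ones. An illustrative userspace sketch, not part of the patch; the device fd and the choice of BAR2 are assumptions:

#include <stdint.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/vfio.h>

static int read_bar2_u64(int device_fd, uint64_t *val)
{
	struct vfio_region_info info = {
		.argsz = sizeof(info),
		.index = VFIO_PCI_BAR2_REGION_INDEX,
	};

	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info))
		return -1;
	if (info.size < sizeof(*val) ||
	    !(info.flags & VFIO_REGION_INFO_FLAG_READ))
		return -1;

	/* An aligned 8-byte read at offset 0 takes the new 64-bit path */
	if (pread(device_fd, val, sizeof(*val), info.offset) != sizeof(*val))
		return -1;
	return 0;
}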
diff --git a/drivers/vfio/pci/virtio/Kconfig b/drivers/vfio/pci/virtio/Kconfig
index bd80eca4a196..33e04e65bec6 100644
--- a/drivers/vfio/pci/virtio/Kconfig
+++ b/drivers/vfio/pci/virtio/Kconfig
@@ -1,15 +1,31 @@
# SPDX-License-Identifier: GPL-2.0-only
config VIRTIO_VFIO_PCI
- tristate "VFIO support for VIRTIO NET PCI devices"
- depends on VIRTIO_PCI && VIRTIO_PCI_ADMIN_LEGACY
- select VFIO_PCI_CORE
- help
- This provides support for exposing VIRTIO NET VF devices which support
- legacy IO access, using the VFIO framework that can work with a legacy
- virtio driver in the guest.
- Based on PCIe spec, VFs do not support I/O Space.
- As of that this driver emulates I/O BAR in software to let a VF be
- seen as a transitional device by its users and let it work with
- a legacy driver.
-
- If you don't know what to do here, say N.
+ tristate "VFIO support for VIRTIO PCI VF devices"
+ depends on VIRTIO_PCI
+ select VFIO_PCI_CORE
+ help
+ This provides migration support for VIRTIO NET and BLOCK PCI VF
+ devices using the VFIO framework. Migration support requires the
+	  SR-IOV PF device to support specific VIRTIO extensions;
+	  otherwise this driver provides no additional functionality
+	  beyond vfio-pci.
+
+	  Migration support in this driver relies on dirty page tracking
+	  provided by the IOMMU hardware and exposed through IOMMUFD; any
+	  other use case is discouraged.
+
+ If you don't know what to do here, say N.
+
+config VIRTIO_VFIO_PCI_ADMIN_LEGACY
+ bool "Legacy I/O support for VIRTIO NET PCI VF devices"
+ depends on VIRTIO_VFIO_PCI && VIRTIO_PCI_ADMIN_LEGACY
+ default y
+ help
+ This extends the virtio-vfio-pci driver to support legacy I/O
+ access, allowing use of legacy virtio drivers with VIRTIO NET
+ PCI VF devices. Legacy I/O support requires the SR-IOV PF
+	  device to support and enable specific VIRTIO extensions;
+	  otherwise this driver provides no additional functionality
+ beyond vfio-pci.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/virtio/Makefile b/drivers/vfio/pci/virtio/Makefile
index 7171105baf33..d9b0bb40d6b3 100644
--- a/drivers/vfio/pci/virtio/Makefile
+++ b/drivers/vfio/pci/virtio/Makefile
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio-vfio-pci.o
-virtio-vfio-pci-y := main.o
+virtio-vfio-pci-y := main.o migrate.o
+virtio-vfio-pci-$(CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY) += legacy_io.o
diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h
new file mode 100644
index 000000000000..c7d7e27af386
--- /dev/null
+++ b/drivers/vfio/pci/virtio/common.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef VIRTIO_VFIO_COMMON_H
+#define VIRTIO_VFIO_COMMON_H
+
+#include <linux/kernel.h>
+#include <linux/virtio.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/virtio_pci.h>
+
+enum virtiovf_migf_state {
+ VIRTIOVF_MIGF_STATE_ERROR = 1,
+ VIRTIOVF_MIGF_STATE_PRECOPY = 2,
+ VIRTIOVF_MIGF_STATE_COMPLETE = 3,
+};
+
+enum virtiovf_load_state {
+ VIRTIOVF_LOAD_STATE_READ_HEADER,
+ VIRTIOVF_LOAD_STATE_PREP_HEADER_DATA,
+ VIRTIOVF_LOAD_STATE_READ_HEADER_DATA,
+ VIRTIOVF_LOAD_STATE_PREP_CHUNK,
+ VIRTIOVF_LOAD_STATE_READ_CHUNK,
+ VIRTIOVF_LOAD_STATE_LOAD_CHUNK,
+};
+
+struct virtiovf_data_buffer {
+ struct sg_append_table table;
+ loff_t start_pos;
+ u64 length;
+ u64 allocated_length;
+ struct list_head buf_elm;
+ u8 include_header_object:1;
+ struct virtiovf_migration_file *migf;
+ /* Optimize virtiovf_get_migration_page() for sequential access */
+ struct scatterlist *last_offset_sg;
+ unsigned int sg_last_entry;
+ unsigned long last_offset;
+};
+
+enum virtiovf_migf_header_flags {
+ VIRTIOVF_MIGF_HEADER_FLAGS_TAG_MANDATORY = 0,
+ VIRTIOVF_MIGF_HEADER_FLAGS_TAG_OPTIONAL = 1 << 0,
+};
+
+enum virtiovf_migf_header_tag {
+ VIRTIOVF_MIGF_HEADER_TAG_DEVICE_DATA = 0,
+};
+
+struct virtiovf_migration_header {
+ __le64 record_size;
+ /* For future use in case we may need to change the kernel protocol */
+ __le32 flags; /* Use virtiovf_migf_header_flags */
+ __le32 tag; /* Use virtiovf_migf_header_tag */
+ __u8 data[]; /* Its size is given in the record_size */
+};
+
+struct virtiovf_migration_file {
+ struct file *filp;
+ /* synchronize access to the file state */
+ struct mutex lock;
+ loff_t max_pos;
+ u64 pre_copy_initial_bytes;
+ struct ratelimit_state pre_copy_rl_state;
+ u64 record_size;
+ u32 record_tag;
+ u8 has_obj_id:1;
+ u32 obj_id;
+ enum virtiovf_migf_state state;
+ enum virtiovf_load_state load_state;
+ /* synchronize access to the lists */
+ spinlock_t list_lock;
+ struct list_head buf_list;
+ struct list_head avail_list;
+ struct virtiovf_data_buffer *buf;
+ struct virtiovf_data_buffer *buf_header;
+ struct virtiovf_pci_core_device *virtvdev;
+};
+
+struct virtiovf_pci_core_device {
+ struct vfio_pci_core_device core_device;
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+ u8 *bar0_virtual_buf;
+ /* synchronize access to the virtual buf */
+ struct mutex bar_mutex;
+ void __iomem *notify_addr;
+ u64 notify_offset;
+ __le32 pci_base_addr_0;
+ __le16 pci_cmd;
+ u8 bar0_virtual_buf_size;
+ u8 notify_bar;
+#endif
+
+ /* LM related */
+ u8 migrate_cap:1;
+ u8 deferred_reset:1;
+ /* protect migration state */
+ struct mutex state_mutex;
+ enum vfio_device_mig_state mig_state;
+ /* protect the reset_done flow */
+ spinlock_t reset_lock;
+ struct virtiovf_migration_file *resuming_migf;
+ struct virtiovf_migration_file *saving_migf;
+};
+
+void virtiovf_set_migratable(struct virtiovf_pci_core_device *virtvdev);
+void virtiovf_open_migration(struct virtiovf_pci_core_device *virtvdev);
+void virtiovf_close_migration(struct virtiovf_pci_core_device *virtvdev);
+void virtiovf_migration_reset_done(struct pci_dev *pdev);
+
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev);
+long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev,
+ unsigned int cmd, unsigned long arg);
+int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
+ unsigned int cmd, unsigned long arg);
+ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev,
+ const char __user *buf, size_t count,
+ loff_t *ppos);
+ssize_t virtiovf_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
+ size_t count, loff_t *ppos);
+bool virtiovf_support_legacy_io(struct pci_dev *pdev);
+int virtiovf_init_legacy_io(struct virtiovf_pci_core_device *virtvdev);
+void virtiovf_release_legacy_io(struct virtiovf_pci_core_device *virtvdev);
+void virtiovf_legacy_io_reset_done(struct pci_dev *pdev);
+#endif
+
+#endif /* VIRTIO_VFIO_COMMON_H */
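
The migration stream defined above is a sequence of records, each prefixed by struct virtiovf_migration_header: a little-endian record_size, flags and tag, followed by record_size bytes of data. A standalone sketch that builds and reads back one such header, not part of the patch; the local struct is an assumed mirror of the layout above:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

/* Local mirror of struct virtiovf_migration_header (assumed layout) */
struct migration_header {
	uint64_t record_size;	/* __le64 on the wire */
	uint32_t flags;		/* __le32, virtiovf_migf_header_flags */
	uint32_t tag;		/* __le32, virtiovf_migf_header_tag */
};

int main(void)
{
	/* A DEVICE_DATA record carrying 0x1000 bytes, as the save side emits it */
	struct migration_header hdr = {
		.record_size = htole64(0x1000),
		.flags = htole32(0),	/* ..._FLAGS_TAG_MANDATORY */
		.tag = htole32(0),	/* ..._TAG_DEVICE_DATA */
	};

	/* The resume side reads the header first, then record_size data bytes */
	printf("record_size=%llu flags=%u tag=%u\n",
	       (unsigned long long)le64toh(hdr.record_size),
	       le32toh(hdr.flags), le32toh(hdr.tag));
	return 0;
}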
diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c
new file mode 100644
index 000000000000..832af5ba267c
--- /dev/null
+++ b/drivers/vfio/pci/virtio/legacy_io.c
@@ -0,0 +1,420 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/pm_runtime.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/virtio_pci.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_pci_admin.h>
+
+#include "common.h"
+
+static int
+virtiovf_issue_legacy_rw_cmd(struct virtiovf_pci_core_device *virtvdev,
+ loff_t pos, char __user *buf,
+ size_t count, bool read)
+{
+ bool msix_enabled =
+ (virtvdev->core_device.irq_type == VFIO_PCI_MSIX_IRQ_INDEX);
+ struct pci_dev *pdev = virtvdev->core_device.pdev;
+ u8 *bar0_buf = virtvdev->bar0_virtual_buf;
+ bool common;
+ u8 offset;
+ int ret;
+
+ common = pos < VIRTIO_PCI_CONFIG_OFF(msix_enabled);
+ /* offset within the relevant configuration area */
+ offset = common ? pos : pos - VIRTIO_PCI_CONFIG_OFF(msix_enabled);
+ mutex_lock(&virtvdev->bar_mutex);
+ if (read) {
+ if (common)
+ ret = virtio_pci_admin_legacy_common_io_read(pdev, offset,
+ count, bar0_buf + pos);
+ else
+ ret = virtio_pci_admin_legacy_device_io_read(pdev, offset,
+ count, bar0_buf + pos);
+ if (ret)
+ goto out;
+ if (copy_to_user(buf, bar0_buf + pos, count))
+ ret = -EFAULT;
+ } else {
+ if (copy_from_user(bar0_buf + pos, buf, count)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (common)
+ ret = virtio_pci_admin_legacy_common_io_write(pdev, offset,
+ count, bar0_buf + pos);
+ else
+ ret = virtio_pci_admin_legacy_device_io_write(pdev, offset,
+ count, bar0_buf + pos);
+ }
+out:
+ mutex_unlock(&virtvdev->bar_mutex);
+ return ret;
+}
+
+static int
+virtiovf_pci_bar0_rw(struct virtiovf_pci_core_device *virtvdev,
+ loff_t pos, char __user *buf,
+ size_t count, bool read)
+{
+ struct vfio_pci_core_device *core_device = &virtvdev->core_device;
+ struct pci_dev *pdev = core_device->pdev;
+ u16 queue_notify;
+ int ret;
+
+ if (!(le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO))
+ return -EIO;
+
+ if (pos + count > virtvdev->bar0_virtual_buf_size)
+ return -EINVAL;
+
+ ret = pm_runtime_resume_and_get(&pdev->dev);
+ if (ret) {
+ pci_info_ratelimited(pdev, "runtime resume failed %d\n", ret);
+ return -EIO;
+ }
+
+ switch (pos) {
+ case VIRTIO_PCI_QUEUE_NOTIFY:
+ if (count != sizeof(queue_notify)) {
+ ret = -EINVAL;
+ goto end;
+ }
+ if (read) {
+ ret = vfio_pci_core_ioread16(core_device, true, &queue_notify,
+ virtvdev->notify_addr);
+ if (ret)
+ goto end;
+ if (copy_to_user(buf, &queue_notify,
+ sizeof(queue_notify))) {
+ ret = -EFAULT;
+ goto end;
+ }
+ } else {
+ if (copy_from_user(&queue_notify, buf, count)) {
+ ret = -EFAULT;
+ goto end;
+ }
+ ret = vfio_pci_core_iowrite16(core_device, true, queue_notify,
+ virtvdev->notify_addr);
+ }
+ break;
+ default:
+ ret = virtiovf_issue_legacy_rw_cmd(virtvdev, pos, buf, count,
+ read);
+ }
+
+end:
+ pm_runtime_put(&pdev->dev);
+ return ret ? ret : count;
+}
+
+static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev,
+ char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+ size_t register_offset;
+ loff_t copy_offset;
+ size_t copy_count;
+ __le32 val32;
+ __le16 val16;
+ u8 val8;
+ int ret;
+
+ ret = vfio_pci_core_read(core_vdev, buf, count, ppos);
+ if (ret < 0)
+ return ret;
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_DEVICE_ID,
+ sizeof(val16), &copy_offset,
+ &copy_count, &register_offset)) {
+ val16 = cpu_to_le16(VIRTIO_TRANS_ID_NET);
+ if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, copy_count))
+ return -EFAULT;
+ }
+
+ if ((le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO) &&
+ vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND,
+ sizeof(val16), &copy_offset,
+ &copy_count, &register_offset)) {
+ if (copy_from_user((void *)&val16 + register_offset, buf + copy_offset,
+ copy_count))
+ return -EFAULT;
+ val16 |= cpu_to_le16(PCI_COMMAND_IO);
+ if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset,
+ copy_count))
+ return -EFAULT;
+ }
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_REVISION_ID,
+ sizeof(val8), &copy_offset,
+ &copy_count, &register_offset)) {
+		/* Transitional devices need to have revision 0 */
+ val8 = 0;
+ if (copy_to_user(buf + copy_offset, &val8, copy_count))
+ return -EFAULT;
+ }
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0,
+ sizeof(val32), &copy_offset,
+ &copy_count, &register_offset)) {
+ u32 bar_mask = ~(virtvdev->bar0_virtual_buf_size - 1);
+ u32 pci_base_addr_0 = le32_to_cpu(virtvdev->pci_base_addr_0);
+
+ val32 = cpu_to_le32((pci_base_addr_0 & bar_mask) | PCI_BASE_ADDRESS_SPACE_IO);
+ if (copy_to_user(buf + copy_offset, (void *)&val32 + register_offset, copy_count))
+ return -EFAULT;
+ }
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_ID,
+ sizeof(val16), &copy_offset,
+ &copy_count, &register_offset)) {
+ /*
+ * Transitional devices use the PCI subsystem device id as
+ * virtio device id, same as legacy driver always did.
+ */
+ val16 = cpu_to_le16(VIRTIO_ID_NET);
+ if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset,
+ copy_count))
+ return -EFAULT;
+ }
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID,
+ sizeof(val16), &copy_offset,
+ &copy_count, &register_offset)) {
+ val16 = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET);
+ if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset,
+ copy_count))
+ return -EFAULT;
+ }
+
+ return count;
+}
+
+ssize_t virtiovf_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+ if (!count)
+ return 0;
+
+ if (index == VFIO_PCI_CONFIG_REGION_INDEX)
+ return virtiovf_pci_read_config(core_vdev, buf, count, ppos);
+
+ if (index == VFIO_PCI_BAR0_REGION_INDEX)
+ return virtiovf_pci_bar0_rw(virtvdev, pos, buf, count, true);
+
+ return vfio_pci_core_read(core_vdev, buf, count, ppos);
+}
+
+static ssize_t virtiovf_pci_write_config(struct vfio_device *core_vdev,
+ const char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+ size_t register_offset;
+ loff_t copy_offset;
+ size_t copy_count;
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND,
+ sizeof(virtvdev->pci_cmd),
+ &copy_offset, &copy_count,
+ &register_offset)) {
+ if (copy_from_user((void *)&virtvdev->pci_cmd + register_offset,
+ buf + copy_offset,
+ copy_count))
+ return -EFAULT;
+ }
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0,
+ sizeof(virtvdev->pci_base_addr_0),
+ &copy_offset, &copy_count,
+ &register_offset)) {
+ if (copy_from_user((void *)&virtvdev->pci_base_addr_0 + register_offset,
+ buf + copy_offset,
+ copy_count))
+ return -EFAULT;
+ }
+
+ return vfio_pci_core_write(core_vdev, buf, count, ppos);
+}
+
+ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+ if (!count)
+ return 0;
+
+ if (index == VFIO_PCI_CONFIG_REGION_INDEX)
+ return virtiovf_pci_write_config(core_vdev, buf, count, ppos);
+
+ if (index == VFIO_PCI_BAR0_REGION_INDEX)
+ return virtiovf_pci_bar0_rw(virtvdev, pos, (char __user *)buf, count, false);
+
+ return vfio_pci_core_write(core_vdev, buf, count, ppos);
+}
+
+int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
+ unsigned int cmd, unsigned long arg)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ unsigned long minsz = offsetofend(struct vfio_region_info, offset);
+ void __user *uarg = (void __user *)arg;
+ struct vfio_region_info info = {};
+
+ if (copy_from_user(&info, uarg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ switch (info.index) {
+ case VFIO_PCI_BAR0_REGION_INDEX:
+ info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+ info.size = virtvdev->bar0_virtual_buf_size;
+ info.flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE;
+ return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0;
+ default:
+ return vfio_pci_core_ioctl(core_vdev, cmd, arg);
+ }
+}
+
+long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (cmd) {
+ case VFIO_DEVICE_GET_REGION_INFO:
+ return virtiovf_pci_ioctl_get_region_info(core_vdev, cmd, arg);
+ default:
+ return vfio_pci_core_ioctl(core_vdev, cmd, arg);
+ }
+}
+
+static int virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev)
+{
+ struct vfio_pci_core_device *core_device = &virtvdev->core_device;
+ int ret;
+
+ /*
+	 * Set up the BAR where the 'notify' exists to be used by vfio as well.
+	 * This will let us mmap it only once and use it when needed.
+ */
+ ret = vfio_pci_core_setup_barmap(core_device,
+ virtvdev->notify_bar);
+ if (ret)
+ return ret;
+
+ virtvdev->notify_addr = core_device->barmap[virtvdev->notify_bar] +
+ virtvdev->notify_offset;
+ return 0;
+}
+
+int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev)
+{
+ if (!virtvdev->bar0_virtual_buf)
+ return 0;
+
+ /*
+ * Upon close_device() the vfio_pci_core_disable() is called
+ * and will close all the previous mmaps, so it seems that the
+ * valid life cycle for the 'notify' addr is per open/close.
+ */
+ return virtiovf_set_notify_addr(virtvdev);
+}
+
+static int virtiovf_get_device_config_size(unsigned short device)
+{
+ /* Network card */
+ return offsetofend(struct virtio_net_config, status);
+}
+
+static int virtiovf_read_notify_info(struct virtiovf_pci_core_device *virtvdev)
+{
+ u64 offset;
+ int ret;
+ u8 bar;
+
+ ret = virtio_pci_admin_legacy_io_notify_info(virtvdev->core_device.pdev,
+ VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_OWNER_MEM,
+ &bar, &offset);
+ if (ret)
+ return ret;
+
+ virtvdev->notify_bar = bar;
+ virtvdev->notify_offset = offset;
+ return 0;
+}
+
+static bool virtiovf_bar0_exists(struct pci_dev *pdev)
+{
+ struct resource *res = pdev->resource;
+
+ return res->flags;
+}
+
+bool virtiovf_support_legacy_io(struct pci_dev *pdev)
+{
+ /* For now, the legacy IO functionality is supported only for virtio-net */
+ return pdev->device == 0x1041 && virtio_pci_admin_has_legacy_io(pdev) &&
+ !virtiovf_bar0_exists(pdev);
+}
+
+int virtiovf_init_legacy_io(struct virtiovf_pci_core_device *virtvdev)
+{
+ struct pci_dev *pdev = virtvdev->core_device.pdev;
+ int ret;
+
+ ret = virtiovf_read_notify_info(virtvdev);
+ if (ret)
+ return ret;
+
+ virtvdev->bar0_virtual_buf_size = VIRTIO_PCI_CONFIG_OFF(true) +
+ virtiovf_get_device_config_size(pdev->device);
+ BUILD_BUG_ON(!is_power_of_2(virtvdev->bar0_virtual_buf_size));
+ virtvdev->bar0_virtual_buf = kzalloc(virtvdev->bar0_virtual_buf_size,
+ GFP_KERNEL);
+ if (!virtvdev->bar0_virtual_buf)
+ return -ENOMEM;
+ mutex_init(&virtvdev->bar_mutex);
+ return 0;
+}
+
+void virtiovf_release_legacy_io(struct virtiovf_pci_core_device *virtvdev)
+{
+ kfree(virtvdev->bar0_virtual_buf);
+}
+
+void virtiovf_legacy_io_reset_done(struct pci_dev *pdev)
+{
+ struct virtiovf_pci_core_device *virtvdev = dev_get_drvdata(&pdev->dev);
+
+ virtvdev->pci_cmd = 0;
+}
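
virtiovf_pci_bar0_rw() above dispatches accesses to the emulated legacy BAR0: the queue notify register is forwarded to the PF's notify area, while everything else is split into legacy common config versus device config around VIRTIO_PCI_CONFIG_OFF(msix_enabled) and tunneled through admin commands. A standalone sketch of that dispatch, not part of the patch; the offsets 16/20/24 are assumptions based on the legacy virtio-PCI register layout:

#include <stdbool.h>
#include <stdio.h>

#define QUEUE_NOTIFY_OFF 16	/* legacy VIRTIO_PCI_QUEUE_NOTIFY (assumed) */

/* VIRTIO_PCI_CONFIG_OFF(msix): 24 with MSI-X enabled, 20 without (assumed) */
static unsigned int config_off(bool msix_enabled)
{
	return msix_enabled ? 24 : 20;
}

/* Mirror of the split in virtiovf_pci_bar0_rw()/virtiovf_issue_legacy_rw_cmd() */
static const char *classify(unsigned int pos, bool msix_enabled)
{
	if (pos == QUEUE_NOTIFY_OFF)
		return "queue notify (forwarded to the PF notify BAR)";
	if (pos < config_off(msix_enabled))
		return "legacy common config (admin common_io command)";
	return "legacy device config (admin device_io command)";
}

int main(void)
{
	printf("%s\n", classify(16, true));
	printf("%s\n", classify(18, true));	/* device status register */
	printf("%s\n", classify(24, true));	/* first byte of virtio_net_config */
	return 0;
}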
diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c
index b5d3a8c5bbc9..515fe1b9f94d 100644
--- a/drivers/vfio/pci/virtio/main.c
+++ b/drivers/vfio/pci/virtio/main.c
@@ -16,347 +16,12 @@
#include <linux/virtio_net.h>
#include <linux/virtio_pci_admin.h>
-struct virtiovf_pci_core_device {
- struct vfio_pci_core_device core_device;
- u8 *bar0_virtual_buf;
- /* synchronize access to the virtual buf */
- struct mutex bar_mutex;
- void __iomem *notify_addr;
- u64 notify_offset;
- __le32 pci_base_addr_0;
- __le16 pci_cmd;
- u8 bar0_virtual_buf_size;
- u8 notify_bar;
-};
-
-static int
-virtiovf_issue_legacy_rw_cmd(struct virtiovf_pci_core_device *virtvdev,
- loff_t pos, char __user *buf,
- size_t count, bool read)
-{
- bool msix_enabled =
- (virtvdev->core_device.irq_type == VFIO_PCI_MSIX_IRQ_INDEX);
- struct pci_dev *pdev = virtvdev->core_device.pdev;
- u8 *bar0_buf = virtvdev->bar0_virtual_buf;
- bool common;
- u8 offset;
- int ret;
-
- common = pos < VIRTIO_PCI_CONFIG_OFF(msix_enabled);
- /* offset within the relevant configuration area */
- offset = common ? pos : pos - VIRTIO_PCI_CONFIG_OFF(msix_enabled);
- mutex_lock(&virtvdev->bar_mutex);
- if (read) {
- if (common)
- ret = virtio_pci_admin_legacy_common_io_read(pdev, offset,
- count, bar0_buf + pos);
- else
- ret = virtio_pci_admin_legacy_device_io_read(pdev, offset,
- count, bar0_buf + pos);
- if (ret)
- goto out;
- if (copy_to_user(buf, bar0_buf + pos, count))
- ret = -EFAULT;
- } else {
- if (copy_from_user(bar0_buf + pos, buf, count)) {
- ret = -EFAULT;
- goto out;
- }
-
- if (common)
- ret = virtio_pci_admin_legacy_common_io_write(pdev, offset,
- count, bar0_buf + pos);
- else
- ret = virtio_pci_admin_legacy_device_io_write(pdev, offset,
- count, bar0_buf + pos);
- }
-out:
- mutex_unlock(&virtvdev->bar_mutex);
- return ret;
-}
-
-static int
-virtiovf_pci_bar0_rw(struct virtiovf_pci_core_device *virtvdev,
- loff_t pos, char __user *buf,
- size_t count, bool read)
-{
- struct vfio_pci_core_device *core_device = &virtvdev->core_device;
- struct pci_dev *pdev = core_device->pdev;
- u16 queue_notify;
- int ret;
-
- if (!(le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO))
- return -EIO;
-
- if (pos + count > virtvdev->bar0_virtual_buf_size)
- return -EINVAL;
-
- ret = pm_runtime_resume_and_get(&pdev->dev);
- if (ret) {
- pci_info_ratelimited(pdev, "runtime resume failed %d\n", ret);
- return -EIO;
- }
-
- switch (pos) {
- case VIRTIO_PCI_QUEUE_NOTIFY:
- if (count != sizeof(queue_notify)) {
- ret = -EINVAL;
- goto end;
- }
- if (read) {
- ret = vfio_pci_core_ioread16(core_device, true, &queue_notify,
- virtvdev->notify_addr);
- if (ret)
- goto end;
- if (copy_to_user(buf, &queue_notify,
- sizeof(queue_notify))) {
- ret = -EFAULT;
- goto end;
- }
- } else {
- if (copy_from_user(&queue_notify, buf, count)) {
- ret = -EFAULT;
- goto end;
- }
- ret = vfio_pci_core_iowrite16(core_device, true, queue_notify,
- virtvdev->notify_addr);
- }
- break;
- default:
- ret = virtiovf_issue_legacy_rw_cmd(virtvdev, pos, buf, count,
- read);
- }
-
-end:
- pm_runtime_put(&pdev->dev);
- return ret ? ret : count;
-}
-
-static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev,
- char __user *buf, size_t count,
- loff_t *ppos)
-{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
- loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
- size_t register_offset;
- loff_t copy_offset;
- size_t copy_count;
- __le32 val32;
- __le16 val16;
- u8 val8;
- int ret;
-
- ret = vfio_pci_core_read(core_vdev, buf, count, ppos);
- if (ret < 0)
- return ret;
-
- if (vfio_pci_core_range_intersect_range(pos, count, PCI_DEVICE_ID,
- sizeof(val16), &copy_offset,
- &copy_count, &register_offset)) {
- val16 = cpu_to_le16(VIRTIO_TRANS_ID_NET);
- if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, copy_count))
- return -EFAULT;
- }
-
- if ((le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO) &&
- vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND,
- sizeof(val16), &copy_offset,
- &copy_count, &register_offset)) {
- if (copy_from_user((void *)&val16 + register_offset, buf + copy_offset,
- copy_count))
- return -EFAULT;
- val16 |= cpu_to_le16(PCI_COMMAND_IO);
- if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset,
- copy_count))
- return -EFAULT;
- }
-
- if (vfio_pci_core_range_intersect_range(pos, count, PCI_REVISION_ID,
- sizeof(val8), &copy_offset,
- &copy_count, &register_offset)) {
- /* Transional needs to have revision 0 */
- val8 = 0;
- if (copy_to_user(buf + copy_offset, &val8, copy_count))
- return -EFAULT;
- }
-
- if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0,
- sizeof(val32), &copy_offset,
- &copy_count, &register_offset)) {
- u32 bar_mask = ~(virtvdev->bar0_virtual_buf_size - 1);
- u32 pci_base_addr_0 = le32_to_cpu(virtvdev->pci_base_addr_0);
-
- val32 = cpu_to_le32((pci_base_addr_0 & bar_mask) | PCI_BASE_ADDRESS_SPACE_IO);
- if (copy_to_user(buf + copy_offset, (void *)&val32 + register_offset, copy_count))
- return -EFAULT;
- }
-
- if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_ID,
- sizeof(val16), &copy_offset,
- &copy_count, &register_offset)) {
- /*
- * Transitional devices use the PCI subsystem device id as
- * virtio device id, same as legacy driver always did.
- */
- val16 = cpu_to_le16(VIRTIO_ID_NET);
- if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset,
- copy_count))
- return -EFAULT;
- }
-
- if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID,
- sizeof(val16), &copy_offset,
- &copy_count, &register_offset)) {
- val16 = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET);
- if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset,
- copy_count))
- return -EFAULT;
- }
-
- return count;
-}
-
-static ssize_t
-virtiovf_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
- unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
- loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
-
- if (!count)
- return 0;
-
- if (index == VFIO_PCI_CONFIG_REGION_INDEX)
- return virtiovf_pci_read_config(core_vdev, buf, count, ppos);
-
- if (index == VFIO_PCI_BAR0_REGION_INDEX)
- return virtiovf_pci_bar0_rw(virtvdev, pos, buf, count, true);
-
- return vfio_pci_core_read(core_vdev, buf, count, ppos);
-}
-
-static ssize_t virtiovf_pci_write_config(struct vfio_device *core_vdev,
- const char __user *buf, size_t count,
- loff_t *ppos)
-{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
- loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
- size_t register_offset;
- loff_t copy_offset;
- size_t copy_count;
-
- if (vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND,
- sizeof(virtvdev->pci_cmd),
- &copy_offset, &copy_count,
- &register_offset)) {
- if (copy_from_user((void *)&virtvdev->pci_cmd + register_offset,
- buf + copy_offset,
- copy_count))
- return -EFAULT;
- }
-
- if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0,
- sizeof(virtvdev->pci_base_addr_0),
- &copy_offset, &copy_count,
- &register_offset)) {
- if (copy_from_user((void *)&virtvdev->pci_base_addr_0 + register_offset,
- buf + copy_offset,
- copy_count))
- return -EFAULT;
- }
-
- return vfio_pci_core_write(core_vdev, buf, count, ppos);
-}
-
-static ssize_t
-virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
- unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
- loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
-
- if (!count)
- return 0;
-
- if (index == VFIO_PCI_CONFIG_REGION_INDEX)
- return virtiovf_pci_write_config(core_vdev, buf, count, ppos);
-
- if (index == VFIO_PCI_BAR0_REGION_INDEX)
- return virtiovf_pci_bar0_rw(virtvdev, pos, (char __user *)buf, count, false);
-
- return vfio_pci_core_write(core_vdev, buf, count, ppos);
-}
-
-static int
-virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
- unsigned int cmd, unsigned long arg)
-{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
- unsigned long minsz = offsetofend(struct vfio_region_info, offset);
- void __user *uarg = (void __user *)arg;
- struct vfio_region_info info = {};
-
- if (copy_from_user(&info, uarg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
-
- switch (info.index) {
- case VFIO_PCI_BAR0_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = virtvdev->bar0_virtual_buf_size;
- info.flags = VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE;
- return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0;
- default:
- return vfio_pci_core_ioctl(core_vdev, cmd, arg);
- }
-}
-
-static long
-virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
- unsigned long arg)
-{
- switch (cmd) {
- case VFIO_DEVICE_GET_REGION_INFO:
- return virtiovf_pci_ioctl_get_region_info(core_vdev, cmd, arg);
- default:
- return vfio_pci_core_ioctl(core_vdev, cmd, arg);
- }
-}
-
-static int
-virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev)
-{
- struct vfio_pci_core_device *core_device = &virtvdev->core_device;
- int ret;
-
- /*
- * Setup the BAR where the 'notify' exists to be used by vfio as well
- * This will let us mmap it only once and use it when needed.
- */
- ret = vfio_pci_core_setup_barmap(core_device,
- virtvdev->notify_bar);
- if (ret)
- return ret;
-
- virtvdev->notify_addr = core_device->barmap[virtvdev->notify_bar] +
- virtvdev->notify_offset;
- return 0;
-}
+#include "common.h"
static int virtiovf_pci_open_device(struct vfio_device *core_vdev)
{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ struct virtiovf_pci_core_device *virtvdev = container_of(core_vdev,
+ struct virtiovf_pci_core_device, core_device.vdev);
struct vfio_pci_core_device *vdev = &virtvdev->core_device;
int ret;
@@ -364,88 +29,84 @@ static int virtiovf_pci_open_device(struct vfio_device *core_vdev)
if (ret)
return ret;
- if (virtvdev->bar0_virtual_buf) {
- /*
- * Upon close_device() the vfio_pci_core_disable() is called
- * and will close all the previous mmaps, so it seems that the
- * valid life cycle for the 'notify' addr is per open/close.
- */
- ret = virtiovf_set_notify_addr(virtvdev);
- if (ret) {
- vfio_pci_core_disable(vdev);
- return ret;
- }
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+ ret = virtiovf_open_legacy_io(virtvdev);
+ if (ret) {
+ vfio_pci_core_disable(vdev);
+ return ret;
}
+#endif
+ virtiovf_open_migration(virtvdev);
vfio_pci_core_finish_enable(vdev);
return 0;
}
-static int virtiovf_get_device_config_size(unsigned short device)
+static void virtiovf_pci_close_device(struct vfio_device *core_vdev)
{
- /* Network card */
- return offsetofend(struct virtio_net_config, status);
-}
-
-static int virtiovf_read_notify_info(struct virtiovf_pci_core_device *virtvdev)
-{
- u64 offset;
- int ret;
- u8 bar;
+ struct virtiovf_pci_core_device *virtvdev = container_of(core_vdev,
+ struct virtiovf_pci_core_device, core_device.vdev);
- ret = virtio_pci_admin_legacy_io_notify_info(virtvdev->core_device.pdev,
- VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_OWNER_MEM,
- &bar, &offset);
- if (ret)
- return ret;
-
- virtvdev->notify_bar = bar;
- virtvdev->notify_offset = offset;
- return 0;
+ virtiovf_close_migration(virtvdev);
+ vfio_pci_core_close_device(core_vdev);
}
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
static int virtiovf_pci_init_device(struct vfio_device *core_vdev)
{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
- struct pci_dev *pdev;
+ struct virtiovf_pci_core_device *virtvdev = container_of(core_vdev,
+ struct virtiovf_pci_core_device, core_device.vdev);
int ret;
ret = vfio_pci_core_init_dev(core_vdev);
if (ret)
return ret;
- pdev = virtvdev->core_device.pdev;
- ret = virtiovf_read_notify_info(virtvdev);
- if (ret)
- return ret;
-
- virtvdev->bar0_virtual_buf_size = VIRTIO_PCI_CONFIG_OFF(true) +
- virtiovf_get_device_config_size(pdev->device);
- BUILD_BUG_ON(!is_power_of_2(virtvdev->bar0_virtual_buf_size));
- virtvdev->bar0_virtual_buf = kzalloc(virtvdev->bar0_virtual_buf_size,
- GFP_KERNEL);
- if (!virtvdev->bar0_virtual_buf)
- return -ENOMEM;
- mutex_init(&virtvdev->bar_mutex);
- return 0;
+ /*
+ * The vfio_device_ops.init() callback is set to virtiovf_pci_init_device()
+ * only when legacy I/O is supported. Now, let's initialize it.
+ */
+ return virtiovf_init_legacy_io(virtvdev);
}
+#endif
static void virtiovf_pci_core_release_dev(struct vfio_device *core_vdev)
{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+ struct virtiovf_pci_core_device *virtvdev = container_of(core_vdev,
+ struct virtiovf_pci_core_device, core_device.vdev);
- kfree(virtvdev->bar0_virtual_buf);
+ virtiovf_release_legacy_io(virtvdev);
+#endif
vfio_pci_core_release_dev(core_vdev);
}
-static const struct vfio_device_ops virtiovf_vfio_pci_tran_ops = {
- .name = "virtio-vfio-pci-trans",
+static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = {
+ .name = "virtio-vfio-pci-lm",
+ .init = vfio_pci_core_init_dev,
+ .release = virtiovf_pci_core_release_dev,
+ .open_device = virtiovf_pci_open_device,
+ .close_device = virtiovf_pci_close_device,
+ .ioctl = vfio_pci_core_ioctl,
+ .device_feature = vfio_pci_core_ioctl_feature,
+ .read = vfio_pci_core_read,
+ .write = vfio_pci_core_write,
+ .mmap = vfio_pci_core_mmap,
+ .request = vfio_pci_core_request,
+ .match = vfio_pci_core_match,
+ .bind_iommufd = vfio_iommufd_physical_bind,
+ .unbind_iommufd = vfio_iommufd_physical_unbind,
+ .attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
+};
+
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = {
+ .name = "virtio-vfio-pci-trans-lm",
.init = virtiovf_pci_init_device,
.release = virtiovf_pci_core_release_dev,
.open_device = virtiovf_pci_open_device,
- .close_device = vfio_pci_core_close_device,
+ .close_device = virtiovf_pci_close_device,
.ioctl = virtiovf_vfio_pci_core_ioctl,
.device_feature = vfio_pci_core_ioctl_feature,
.read = virtiovf_pci_core_read,
@@ -458,6 +119,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_ops = {
.attach_ioas = vfio_iommufd_physical_attach_ioas,
.detach_ioas = vfio_iommufd_physical_detach_ioas,
};
+#endif
static const struct vfio_device_ops virtiovf_vfio_pci_ops = {
.name = "virtio-vfio-pci",
@@ -478,29 +140,34 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = {
.detach_ioas = vfio_iommufd_physical_detach_ioas,
};
-static bool virtiovf_bar0_exists(struct pci_dev *pdev)
-{
- struct resource *res = pdev->resource;
-
- return res->flags;
-}
-
static int virtiovf_pci_probe(struct pci_dev *pdev,
const struct pci_device_id *id)
{
const struct vfio_device_ops *ops = &virtiovf_vfio_pci_ops;
struct virtiovf_pci_core_device *virtvdev;
+ bool sup_legacy_io = false;
+ bool sup_lm = false;
int ret;
- if (pdev->is_virtfn && virtio_pci_admin_has_legacy_io(pdev) &&
- !virtiovf_bar0_exists(pdev))
- ops = &virtiovf_vfio_pci_tran_ops;
+ if (pdev->is_virtfn) {
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+ sup_legacy_io = virtiovf_support_legacy_io(pdev);
+ if (sup_legacy_io)
+ ops = &virtiovf_vfio_pci_tran_lm_ops;
+#endif
+ sup_lm = virtio_pci_admin_has_dev_parts(pdev);
+ if (sup_lm && !sup_legacy_io)
+ ops = &virtiovf_vfio_pci_lm_ops;
+ }
virtvdev = vfio_alloc_device(virtiovf_pci_core_device, core_device.vdev,
&pdev->dev, ops);
if (IS_ERR(virtvdev))
return PTR_ERR(virtvdev);
+ if (sup_lm)
+ virtiovf_set_migratable(virtvdev);
+
dev_set_drvdata(&pdev->dev, &virtvdev->core_device);
ret = vfio_pci_core_register_device(&virtvdev->core_device);
if (ret)
@@ -520,8 +187,9 @@ static void virtiovf_pci_remove(struct pci_dev *pdev)
}
static const struct pci_device_id virtiovf_pci_table[] = {
- /* Only virtio-net is supported/tested so far */
+ /* Only virtio-net and virtio-block are supported/tested so far */
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1041) },
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1042) },
{}
};
@@ -529,9 +197,10 @@ MODULE_DEVICE_TABLE(pci, virtiovf_pci_table);
static void virtiovf_pci_aer_reset_done(struct pci_dev *pdev)
{
- struct virtiovf_pci_core_device *virtvdev = dev_get_drvdata(&pdev->dev);
-
- virtvdev->pci_cmd = 0;
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+ virtiovf_legacy_io_reset_done(pdev);
+#endif
+ virtiovf_migration_reset_done(pdev);
}
static const struct pci_error_handlers virtiovf_err_handlers = {
@@ -553,4 +222,4 @@ module_pci_driver(virtiovf_pci_driver);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
- "VIRTIO VFIO PCI - User Level meta-driver for VIRTIO NET devices");
+ "VIRTIO VFIO PCI - User Level meta-driver for VIRTIO NET and BLOCK devices");
diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c
new file mode 100644
index 000000000000..ba92bb4e9af9
--- /dev/null
+++ b/drivers/vfio/pci/virtio/migrate.c
@@ -0,0 +1,1337 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/pm_runtime.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/virtio_pci.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_pci_admin.h>
+#include <linux/anon_inodes.h>
+
+#include "common.h"
+
+/* Device specification max parts size */
+#define MAX_LOAD_SIZE (BIT_ULL(BITS_PER_TYPE \
+ (((struct virtio_admin_cmd_dev_parts_metadata_result *)0)->parts_size.size)) - 1)
+
+/* Initial target buffer size */
+#define VIRTIOVF_TARGET_INITIAL_BUF_SIZE SZ_1M
+
+static int
+virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
+ u32 ctx_size);
+
+static struct page *
+virtiovf_get_migration_page(struct virtiovf_data_buffer *buf,
+ unsigned long offset)
+{
+ unsigned long cur_offset = 0;
+ struct scatterlist *sg;
+ unsigned int i;
+
+ /* All accesses are sequential */
+ if (offset < buf->last_offset || !buf->last_offset_sg) {
+ buf->last_offset = 0;
+ buf->last_offset_sg = buf->table.sgt.sgl;
+ buf->sg_last_entry = 0;
+ }
+
+ cur_offset = buf->last_offset;
+
+ for_each_sg(buf->last_offset_sg, sg,
+ buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
+ if (offset < sg->length + cur_offset) {
+ buf->last_offset_sg = sg;
+ buf->sg_last_entry += i;
+ buf->last_offset = cur_offset;
+ return nth_page(sg_page(sg),
+ (offset - cur_offset) / PAGE_SIZE);
+ }
+ cur_offset += sg->length;
+ }
+ return NULL;
+}
+
+static int virtiovf_add_migration_pages(struct virtiovf_data_buffer *buf,
+ unsigned int npages)
+{
+ unsigned int to_alloc = npages;
+ struct page **page_list;
+ unsigned long filled;
+ unsigned int to_fill;
+ int ret;
+ int i;
+
+ to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
+ page_list = kvcalloc(to_fill, sizeof(*page_list), GFP_KERNEL_ACCOUNT);
+ if (!page_list)
+ return -ENOMEM;
+
+ do {
+ filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, to_fill,
+ page_list);
+ if (!filled) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ to_alloc -= filled;
+ ret = sg_alloc_append_table_from_pages(&buf->table, page_list,
+ filled, 0, filled << PAGE_SHIFT, UINT_MAX,
+ SG_MAX_SINGLE_ALLOC, GFP_KERNEL_ACCOUNT);
+
+ if (ret)
+ goto err_append;
+ buf->allocated_length += filled * PAGE_SIZE;
+ /* clean input for another bulk allocation */
+ memset(page_list, 0, filled * sizeof(*page_list));
+ to_fill = min_t(unsigned int, to_alloc,
+ PAGE_SIZE / sizeof(*page_list));
+ } while (to_alloc > 0);
+
+ kvfree(page_list);
+ return 0;
+
+err_append:
+ for (i = filled - 1; i >= 0; i--)
+ __free_page(page_list[i]);
+err:
+ kvfree(page_list);
+ return ret;
+}
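+
+/*
+ * For example: with 4 KiB pages and 64-bit pointers, each round of the
+ * loop above fills at most PAGE_SIZE / sizeof(struct page *) == 512
+ * entries, so a single alloc_pages_bulk() call grows the buffer by up
+ * to 2 MiB before the pages are appended to the SG append table.
+ */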
+
+static void virtiovf_free_data_buffer(struct virtiovf_data_buffer *buf)
+{
+ struct sg_page_iter sg_iter;
+
+ /* Undo alloc_pages_bulk() */
+ for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
+ __free_page(sg_page_iter_page(&sg_iter));
+ sg_free_append_table(&buf->table);
+ kfree(buf);
+}
+
+static struct virtiovf_data_buffer *
+virtiovf_alloc_data_buffer(struct virtiovf_migration_file *migf, size_t length)
+{
+ struct virtiovf_data_buffer *buf;
+ int ret;
+
+ buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
+ ret = virtiovf_add_migration_pages(buf,
+ DIV_ROUND_UP_ULL(length, PAGE_SIZE));
+ if (ret)
+ goto end;
+
+ buf->migf = migf;
+ return buf;
+end:
+ virtiovf_free_data_buffer(buf);
+ return ERR_PTR(ret);
+}
+
+static void virtiovf_put_data_buffer(struct virtiovf_data_buffer *buf)
+{
+ spin_lock_irq(&buf->migf->list_lock);
+ list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
+ spin_unlock_irq(&buf->migf->list_lock);
+}
+
+static int
+virtiovf_pci_alloc_obj_id(struct virtiovf_pci_core_device *virtvdev, u8 type,
+ u32 *obj_id)
+{
+ return virtio_pci_admin_obj_create(virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS, type, obj_id);
+}
+
+static void
+virtiovf_pci_free_obj_id(struct virtiovf_pci_core_device *virtvdev, u32 obj_id)
+{
+ virtio_pci_admin_obj_destroy(virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id);
+}
+
+static struct virtiovf_data_buffer *
+virtiovf_get_data_buffer(struct virtiovf_migration_file *migf, size_t length)
+{
+ struct virtiovf_data_buffer *buf, *temp_buf;
+ struct list_head free_list;
+
+ INIT_LIST_HEAD(&free_list);
+
+ spin_lock_irq(&migf->list_lock);
+ list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
+ list_del_init(&buf->buf_elm);
+ if (buf->allocated_length >= length) {
+ spin_unlock_irq(&migf->list_lock);
+ goto found;
+ }
+ /*
+ * Prevent holding redundant buffers. Put them on a local free
+ * list and free them at the end, outside the spin lock
+ * (&migf->list_lock), to keep its critical section short.
+ */
+ list_add(&buf->buf_elm, &free_list);
+ }
+ spin_unlock_irq(&migf->list_lock);
+ buf = virtiovf_alloc_data_buffer(migf, length);
+
+found:
+ while ((temp_buf = list_first_entry_or_null(&free_list,
+ struct virtiovf_data_buffer, buf_elm))) {
+ list_del(&temp_buf->buf_elm);
+ virtiovf_free_data_buffer(temp_buf);
+ }
+
+ return buf;
+}
+
+static void virtiovf_clean_migf_resources(struct virtiovf_migration_file *migf)
+{
+ struct virtiovf_data_buffer *entry;
+
+ if (migf->buf) {
+ virtiovf_free_data_buffer(migf->buf);
+ migf->buf = NULL;
+ }
+
+ if (migf->buf_header) {
+ virtiovf_free_data_buffer(migf->buf_header);
+ migf->buf_header = NULL;
+ }
+
+ list_splice(&migf->avail_list, &migf->buf_list);
+
+ while ((entry = list_first_entry_or_null(&migf->buf_list,
+ struct virtiovf_data_buffer, buf_elm))) {
+ list_del(&entry->buf_elm);
+ virtiovf_free_data_buffer(entry);
+ }
+
+ if (migf->has_obj_id)
+ virtiovf_pci_free_obj_id(migf->virtvdev, migf->obj_id);
+}
+
+static void virtiovf_disable_fd(struct virtiovf_migration_file *migf)
+{
+ mutex_lock(&migf->lock);
+ migf->state = VIRTIOVF_MIGF_STATE_ERROR;
+ migf->filp->f_pos = 0;
+ mutex_unlock(&migf->lock);
+}
+
+static void virtiovf_disable_fds(struct virtiovf_pci_core_device *virtvdev)
+{
+ if (virtvdev->resuming_migf) {
+ virtiovf_disable_fd(virtvdev->resuming_migf);
+ virtiovf_clean_migf_resources(virtvdev->resuming_migf);
+ fput(virtvdev->resuming_migf->filp);
+ virtvdev->resuming_migf = NULL;
+ }
+ if (virtvdev->saving_migf) {
+ virtiovf_disable_fd(virtvdev->saving_migf);
+ virtiovf_clean_migf_resources(virtvdev->saving_migf);
+ fput(virtvdev->saving_migf->filp);
+ virtvdev->saving_migf = NULL;
+ }
+}
+
+/*
+ * This function is called in all state_mutex unlock cases to
+ * handle a 'deferred_reset' if one exists.
+ */
+static void virtiovf_state_mutex_unlock(struct virtiovf_pci_core_device *virtvdev)
+{
+again:
+ spin_lock(&virtvdev->reset_lock);
+ if (virtvdev->deferred_reset) {
+ virtvdev->deferred_reset = false;
+ spin_unlock(&virtvdev->reset_lock);
+ virtvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+ virtiovf_disable_fds(virtvdev);
+ goto again;
+ }
+ mutex_unlock(&virtvdev->state_mutex);
+ spin_unlock(&virtvdev->reset_lock);
+}
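+
+/*
+ * Expected caller pattern (a sketch; the flows in this file follow it):
+ * take state_mutex directly, but always release it through the helper
+ * above so that a deferred reset is not lost, e.g.:
+ *
+ *    mutex_lock(&virtvdev->state_mutex);
+ *    ...
+ *    virtiovf_state_mutex_unlock(virtvdev);
+ */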
+
+void virtiovf_migration_reset_done(struct pci_dev *pdev)
+{
+ struct virtiovf_pci_core_device *virtvdev = dev_get_drvdata(&pdev->dev);
+
+ if (!virtvdev->migrate_cap)
+ return;
+
+ /*
+ * As the higher VFIO layers are holding locks across reset and using
+ * those same locks with the mm_lock, we need to prevent an ABBA
+ * deadlock involving the state_mutex and mm_lock.
+ * If the state_mutex was already taken, we defer the cleanup work
+ * to the unlock flow of the other running context.
+ */
+ spin_lock(&virtvdev->reset_lock);
+ virtvdev->deferred_reset = true;
+ if (!mutex_trylock(&virtvdev->state_mutex)) {
+ spin_unlock(&virtvdev->reset_lock);
+ return;
+ }
+ spin_unlock(&virtvdev->reset_lock);
+ virtiovf_state_mutex_unlock(virtvdev);
+}
+
+static int virtiovf_release_file(struct inode *inode, struct file *filp)
+{
+ struct virtiovf_migration_file *migf = filp->private_data;
+
+ virtiovf_disable_fd(migf);
+ mutex_destroy(&migf->lock);
+ kfree(migf);
+ return 0;
+}
+
+static struct virtiovf_data_buffer *
+virtiovf_get_data_buff_from_pos(struct virtiovf_migration_file *migf,
+ loff_t pos, bool *end_of_data)
+{
+ struct virtiovf_data_buffer *buf;
+ bool found = false;
+
+ *end_of_data = false;
+ spin_lock_irq(&migf->list_lock);
+ if (list_empty(&migf->buf_list)) {
+ *end_of_data = true;
+ goto end;
+ }
+
+ buf = list_first_entry(&migf->buf_list, struct virtiovf_data_buffer,
+ buf_elm);
+ if (pos >= buf->start_pos &&
+ pos < buf->start_pos + buf->length) {
+ found = true;
+ goto end;
+ }
+
+ /*
+ * Since this is a stream-based FD, the data is always expected to be
+ * in the first chunk.
+ */
+ migf->state = VIRTIOVF_MIGF_STATE_ERROR;
+
+end:
+ spin_unlock_irq(&migf->list_lock);
+ return found ? buf : NULL;
+}
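+
+/*
+ * Note on the stream layout assumed above: the save FD is opened with
+ * stream_open(), so reads are strictly sequential.  Once a buffer is
+ * fully copied out, virtiovf_buf_read() moves it back to avail_list,
+ * which keeps the data for the current *pos at the head of buf_list.
+ */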
+
+static ssize_t virtiovf_buf_read(struct virtiovf_data_buffer *vhca_buf,
+ char __user **buf, size_t *len, loff_t *pos)
+{
+ unsigned long offset;
+ ssize_t done = 0;
+ size_t copy_len;
+
+ copy_len = min_t(size_t,
+ vhca_buf->start_pos + vhca_buf->length - *pos, *len);
+ while (copy_len) {
+ size_t page_offset;
+ struct page *page;
+ size_t page_len;
+ u8 *from_buff;
+ int ret;
+
+ offset = *pos - vhca_buf->start_pos;
+ page_offset = offset % PAGE_SIZE;
+ offset -= page_offset;
+ page = virtiovf_get_migration_page(vhca_buf, offset);
+ if (!page)
+ return -EINVAL;
+ page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
+ from_buff = kmap_local_page(page);
+ ret = copy_to_user(*buf, from_buff + page_offset, page_len);
+ kunmap_local(from_buff);
+ if (ret)
+ return -EFAULT;
+ *pos += page_len;
+ *len -= page_len;
+ *buf += page_len;
+ done += page_len;
+ copy_len -= page_len;
+ }
+
+ if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
+ spin_lock_irq(&vhca_buf->migf->list_lock);
+ list_del_init(&vhca_buf->buf_elm);
+ list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
+ spin_unlock_irq(&vhca_buf->migf->list_lock);
+ }
+
+ return done;
+}
+
+static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t len,
+ loff_t *pos)
+{
+ struct virtiovf_migration_file *migf = filp->private_data;
+ struct virtiovf_data_buffer *vhca_buf;
+ bool first_loop_call = true;
+ bool end_of_data;
+ ssize_t done = 0;
+
+ if (pos)
+ return -ESPIPE;
+ pos = &filp->f_pos;
+
+ mutex_lock(&migf->lock);
+ if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
+ done = -ENODEV;
+ goto out_unlock;
+ }
+
+ while (len) {
+ ssize_t count;
+
+ vhca_buf = virtiovf_get_data_buff_from_pos(migf, *pos, &end_of_data);
+ if (first_loop_call) {
+ first_loop_call = false;
+ /* Temporary end of file as part of PRE_COPY */
+ if (end_of_data && migf->state == VIRTIOVF_MIGF_STATE_PRECOPY) {
+ done = -ENOMSG;
+ goto out_unlock;
+ }
+ if (end_of_data && migf->state != VIRTIOVF_MIGF_STATE_COMPLETE) {
+ done = -EINVAL;
+ goto out_unlock;
+ }
+ }
+
+ if (end_of_data)
+ goto out_unlock;
+
+ if (!vhca_buf) {
+ done = -EINVAL;
+ goto out_unlock;
+ }
+
+ count = virtiovf_buf_read(vhca_buf, &buf, &len, pos);
+ if (count < 0) {
+ done = count;
+ goto out_unlock;
+ }
+ done += count;
+ }
+
+out_unlock:
+ mutex_unlock(&migf->lock);
+ return done;
+}
+
+static long virtiovf_precopy_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct virtiovf_migration_file *migf = filp->private_data;
+ struct virtiovf_pci_core_device *virtvdev = migf->virtvdev;
+ struct vfio_precopy_info info = {};
+ loff_t *pos = &filp->f_pos;
+ bool end_of_data = false;
+ unsigned long minsz;
+ u32 ctx_size = 0;
+ int ret;
+
+ if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
+ return -ENOTTY;
+
+ minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ mutex_lock(&virtvdev->state_mutex);
+ if (virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
+ virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
+ ret = -EINVAL;
+ goto err_state_unlock;
+ }
+
+ /*
+ * The virtio specification does not include a PRE_COPY concept.
+ * Since we can expect the data to remain the same for a certain period,
+ * we use a rate limiter mechanism before making a call to the device.
+ */
+ if (__ratelimit(&migf->pre_copy_rl_state)) {
+ ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id,
+ VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
+ &ctx_size);
+ if (ret)
+ goto err_state_unlock;
+ }
+
+ mutex_lock(&migf->lock);
+ if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
+ ret = -ENODEV;
+ goto err_migf_unlock;
+ }
+
+ if (migf->pre_copy_initial_bytes > *pos) {
+ info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
+ } else {
+ info.dirty_bytes = migf->max_pos - *pos;
+ if (!info.dirty_bytes)
+ end_of_data = true;
+ info.dirty_bytes += ctx_size;
+ }
+
+ if (!end_of_data || !ctx_size) {
+ mutex_unlock(&migf->lock);
+ goto done;
+ }
+
+ mutex_unlock(&migf->lock);
+ /*
+ * We finished transferring the current state and the device has a
+ * dirty state; read a new state.
+ */
+ ret = virtiovf_read_device_context_chunk(migf, ctx_size);
+ if (ret)
+ /*
+ * The machine is running and the context size could grow, so there is
+ * no reason to mark the device state as VIRTIOVF_MIGF_STATE_ERROR.
+ */
+ goto err_state_unlock;
+
+done:
+ virtiovf_state_mutex_unlock(virtvdev);
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+ return 0;
+
+err_migf_unlock:
+ mutex_unlock(&migf->lock);
+err_state_unlock:
+ virtiovf_state_mutex_unlock(virtvdev);
+ return ret;
+}
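+
+/*
+ * Userspace view (illustrative only, using the generic VFIO UAPI): while
+ * the device is in PRE_COPY, a VMM typically polls the data FD with
+ * VFIO_MIG_GET_PRECOPY_INFO to estimate the remaining work, e.g.:
+ *
+ *    struct vfio_precopy_info info = { .argsz = sizeof(info) };
+ *
+ *    if (!ioctl(data_fd, VFIO_MIG_GET_PRECOPY_INFO, &info))
+ *            remaining = info.initial_bytes + info.dirty_bytes;
+ */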
+
+static const struct file_operations virtiovf_save_fops = {
+ .owner = THIS_MODULE,
+ .read = virtiovf_save_read,
+ .unlocked_ioctl = virtiovf_precopy_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+ .release = virtiovf_release_file,
+};
+
+static int
+virtiovf_add_buf_header(struct virtiovf_data_buffer *header_buf,
+ u32 data_size)
+{
+ struct virtiovf_migration_file *migf = header_buf->migf;
+ struct virtiovf_migration_header header = {};
+ struct page *page;
+ u8 *to_buff;
+
+ header.record_size = cpu_to_le64(data_size);
+ header.flags = cpu_to_le32(VIRTIOVF_MIGF_HEADER_FLAGS_TAG_MANDATORY);
+ header.tag = cpu_to_le32(VIRTIOVF_MIGF_HEADER_TAG_DEVICE_DATA);
+ page = virtiovf_get_migration_page(header_buf, 0);
+ if (!page)
+ return -EINVAL;
+ to_buff = kmap_local_page(page);
+ memcpy(to_buff, &header, sizeof(header));
+ kunmap_local(to_buff);
+ header_buf->length = sizeof(header);
+ header_buf->start_pos = header_buf->migf->max_pos;
+ migf->max_pos += header_buf->length;
+ spin_lock_irq(&migf->list_lock);
+ list_add_tail(&header_buf->buf_elm, &migf->buf_list);
+ spin_unlock_irq(&migf->list_lock);
+ return 0;
+}
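+
+/*
+ * Resulting save-stream framing (illustrative): every record written to
+ * the save FD is a struct virtiovf_migration_header (record_size, flags,
+ * tag) followed by record_size bytes of device parts data, which is what
+ * virtiovf_resume_read_header() parses on the load side.
+ */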
+
+static int
+virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
+ u32 ctx_size)
+{
+ struct virtiovf_data_buffer *header_buf;
+ struct virtiovf_data_buffer *buf;
+ bool unmark_end = false;
+ struct scatterlist *sg;
+ unsigned int i;
+ u32 res_size;
+ int nent;
+ int ret;
+
+ buf = virtiovf_get_data_buffer(migf, ctx_size);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+
+ /* Find the total count of SG entries which satisfies the size */
+ nent = sg_nents_for_len(buf->table.sgt.sgl, ctx_size);
+ if (nent <= 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Iterate to that SG entry and mark it as last (if it's not already)
+ * so that the underlying layers iterate only up to that entry.
+ */
+ for_each_sg(buf->table.sgt.sgl, sg, nent - 1, i)
+ ;
+
+ if (!sg_is_last(sg)) {
+ unmark_end = true;
+ sg_mark_end(sg);
+ }
+
+ ret = virtio_pci_admin_dev_parts_get(migf->virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS,
+ migf->obj_id,
+ VIRTIO_ADMIN_CMD_DEV_PARTS_GET_TYPE_ALL,
+ buf->table.sgt.sgl, &res_size);
+ /* Restore the original SG mark end */
+ if (unmark_end)
+ sg_unmark_end(sg);
+ if (ret)
+ goto out;
+
+ buf->length = res_size;
+ header_buf = virtiovf_get_data_buffer(migf,
+ sizeof(struct virtiovf_migration_header));
+ if (IS_ERR(header_buf)) {
+ ret = PTR_ERR(header_buf);
+ goto out;
+ }
+
+ ret = virtiovf_add_buf_header(header_buf, res_size);
+ if (ret)
+ goto out_header;
+
+ buf->start_pos = buf->migf->max_pos;
+ migf->max_pos += buf->length;
+ spin_lock_irq(&migf->list_lock);
+ list_add_tail(&buf->buf_elm, &migf->buf_list);
+ spin_unlock_irq(&migf->list_lock);
+ return 0;
+
+out_header:
+ virtiovf_put_data_buffer(header_buf);
+out:
+ virtiovf_put_data_buffer(buf);
+ return ret;
+}
+
+static int
+virtiovf_pci_save_device_final_data(struct virtiovf_pci_core_device *virtvdev)
+{
+ struct virtiovf_migration_file *migf = virtvdev->saving_migf;
+ u32 ctx_size;
+ int ret;
+
+ if (migf->state == VIRTIOVF_MIGF_STATE_ERROR)
+ return -ENODEV;
+
+ ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id,
+ VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
+ &ctx_size);
+ if (ret)
+ goto err;
+
+ if (!ctx_size) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = virtiovf_read_device_context_chunk(migf, ctx_size);
+ if (ret)
+ goto err;
+
+ migf->state = VIRTIOVF_MIGF_STATE_COMPLETE;
+ return 0;
+
+err:
+ migf->state = VIRTIOVF_MIGF_STATE_ERROR;
+ return ret;
+}
+
+static struct virtiovf_migration_file *
+virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev,
+ bool pre_copy)
+{
+ struct virtiovf_migration_file *migf;
+ u32 ctx_size;
+ u32 obj_id;
+ int ret;
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
+ if (!migf)
+ return ERR_PTR(-ENOMEM);
+
+ migf->filp = anon_inode_getfile("virtiovf_mig", &virtiovf_save_fops, migf,
+ O_RDONLY);
+ if (IS_ERR(migf->filp)) {
+ ret = PTR_ERR(migf->filp);
+ kfree(migf);
+ return ERR_PTR(ret);
+ }
+
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+ INIT_LIST_HEAD(&migf->buf_list);
+ INIT_LIST_HEAD(&migf->avail_list);
+ spin_lock_init(&migf->list_lock);
+ migf->virtvdev = virtvdev;
+
+ lockdep_assert_held(&virtvdev->state_mutex);
+ ret = virtiovf_pci_alloc_obj_id(virtvdev, VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_GET,
+ &obj_id);
+ if (ret)
+ goto out;
+
+ migf->obj_id = obj_id;
+ /* Mark as having a valid obj id, which may even be 0 */
+ migf->has_obj_id = true;
+ ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id,
+ VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
+ &ctx_size);
+ if (ret)
+ goto out_clean;
+
+ if (!ctx_size) {
+ ret = -EINVAL;
+ goto out_clean;
+ }
+
+ ret = virtiovf_read_device_context_chunk(migf, ctx_size);
+ if (ret)
+ goto out_clean;
+
+ if (pre_copy) {
+ migf->pre_copy_initial_bytes = migf->max_pos;
+ /* Arbitrarily set the pre-copy rate limit to 1-second intervals */
+ ratelimit_state_init(&migf->pre_copy_rl_state, 1 * HZ, 1);
+ /* Prevent any rate messages upon its usage */
+ ratelimit_set_flags(&migf->pre_copy_rl_state,
+ RATELIMIT_MSG_ON_RELEASE);
+ migf->state = VIRTIOVF_MIGF_STATE_PRECOPY;
+ } else {
+ migf->state = VIRTIOVF_MIGF_STATE_COMPLETE;
+ }
+
+ return migf;
+
+out_clean:
+ virtiovf_clean_migf_resources(migf);
+out:
+ fput(migf->filp);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Set the required object header at the beginning of the buffer.
+ * The actual device parts data will be written after the header.
+ */
+static int virtiovf_set_obj_cmd_header(struct virtiovf_data_buffer *vhca_buf)
+{
+ struct virtio_admin_cmd_resource_obj_cmd_hdr obj_hdr = {};
+ struct page *page;
+ u8 *to_buff;
+
+ obj_hdr.type = cpu_to_le16(VIRTIO_RESOURCE_OBJ_DEV_PARTS);
+ obj_hdr.id = cpu_to_le32(vhca_buf->migf->obj_id);
+ page = virtiovf_get_migration_page(vhca_buf, 0);
+ if (!page)
+ return -EINVAL;
+ to_buff = kmap_local_page(page);
+ memcpy(to_buff, &obj_hdr, sizeof(obj_hdr));
+ kunmap_local(to_buff);
+
+ /* Mark the buffer as including the header object data */
+ vhca_buf->include_header_object = 1;
+ return 0;
+}
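+
+/*
+ * Resulting resume buffer layout (illustrative):
+ *
+ *    [ virtio_admin_cmd_resource_obj_cmd_hdr: type, id ]
+ *    [ device parts chunk copied from userspace        ]
+ *
+ * virtiovf_append_page_to_mig_buf() shifts its offsets by the header size
+ * when include_header_object is set, so the SG list handed to
+ * virtio_pci_admin_dev_parts_set() starts with this header.
+ */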
+
+static int
+virtiovf_append_page_to_mig_buf(struct virtiovf_data_buffer *vhca_buf,
+ const char __user **buf, size_t *len,
+ loff_t *pos, ssize_t *done)
+{
+ unsigned long offset;
+ size_t page_offset;
+ struct page *page;
+ size_t page_len;
+ u8 *to_buff;
+ int ret;
+
+ offset = *pos - vhca_buf->start_pos;
+
+ if (vhca_buf->include_header_object)
+ /* The buffer holds the object header; update the offset accordingly */
+ offset += sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr);
+
+ page_offset = offset % PAGE_SIZE;
+
+ page = virtiovf_get_migration_page(vhca_buf, offset - page_offset);
+ if (!page)
+ return -EINVAL;
+
+ page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
+ to_buff = kmap_local_page(page);
+ ret = copy_from_user(to_buff + page_offset, *buf, page_len);
+ kunmap_local(to_buff);
+ if (ret)
+ return -EFAULT;
+
+ *pos += page_len;
+ *done += page_len;
+ *buf += page_len;
+ *len -= page_len;
+ vhca_buf->length += page_len;
+ return 0;
+}
+
+static ssize_t
+virtiovf_resume_read_chunk(struct virtiovf_migration_file *migf,
+ struct virtiovf_data_buffer *vhca_buf,
+ size_t chunk_size, const char __user **buf,
+ size_t *len, loff_t *pos, ssize_t *done,
+ bool *has_work)
+{
+ size_t copy_len, to_copy;
+ int ret;
+
+ to_copy = min_t(size_t, *len, chunk_size - vhca_buf->length);
+ copy_len = to_copy;
+ while (to_copy) {
+ ret = virtiovf_append_page_to_mig_buf(vhca_buf, buf, &to_copy,
+ pos, done);
+ if (ret)
+ return ret;
+ }
+
+ *len -= copy_len;
+ if (vhca_buf->length == chunk_size) {
+ migf->load_state = VIRTIOVF_LOAD_STATE_LOAD_CHUNK;
+ migf->max_pos += chunk_size;
+ *has_work = true;
+ }
+
+ return 0;
+}
+
+static int
+virtiovf_resume_read_header_data(struct virtiovf_migration_file *migf,
+ struct virtiovf_data_buffer *vhca_buf,
+ const char __user **buf, size_t *len,
+ loff_t *pos, ssize_t *done)
+{
+ size_t copy_len, to_copy;
+ size_t required_data;
+ int ret;
+
+ required_data = migf->record_size - vhca_buf->length;
+ to_copy = min_t(size_t, *len, required_data);
+ copy_len = to_copy;
+ while (to_copy) {
+ ret = virtiovf_append_page_to_mig_buf(vhca_buf, buf, &to_copy,
+ pos, done);
+ if (ret)
+ return ret;
+ }
+
+ *len -= copy_len;
+ if (vhca_buf->length == migf->record_size) {
+ switch (migf->record_tag) {
+ default:
+ /* Optional tag */
+ break;
+ }
+
+ migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER;
+ migf->max_pos += migf->record_size;
+ vhca_buf->length = 0;
+ }
+
+ return 0;
+}
+
+static int
+virtiovf_resume_read_header(struct virtiovf_migration_file *migf,
+ struct virtiovf_data_buffer *vhca_buf,
+ const char __user **buf,
+ size_t *len, loff_t *pos,
+ ssize_t *done, bool *has_work)
+{
+ struct page *page;
+ size_t copy_len;
+ u8 *to_buff;
+ int ret;
+
+ copy_len = min_t(size_t, *len,
+ sizeof(struct virtiovf_migration_header) - vhca_buf->length);
+ page = virtiovf_get_migration_page(vhca_buf, 0);
+ if (!page)
+ return -EINVAL;
+ to_buff = kmap_local_page(page);
+ ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
+ if (ret) {
+ ret = -EFAULT;
+ goto end;
+ }
+
+ *buf += copy_len;
+ *pos += copy_len;
+ *done += copy_len;
+ *len -= copy_len;
+ vhca_buf->length += copy_len;
+ if (vhca_buf->length == sizeof(struct virtiovf_migration_header)) {
+ u64 record_size;
+ u32 flags;
+
+ record_size = le64_to_cpup((__le64 *)to_buff);
+ if (record_size > MAX_LOAD_SIZE) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ migf->record_size = record_size;
+ flags = le32_to_cpup((__le32 *)(to_buff +
+ offsetof(struct virtiovf_migration_header, flags)));
+ migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
+ offsetof(struct virtiovf_migration_header, tag)));
+ switch (migf->record_tag) {
+ case VIRTIOVF_MIGF_HEADER_TAG_DEVICE_DATA:
+ migf->load_state = VIRTIOVF_LOAD_STATE_PREP_CHUNK;
+ break;
+ default:
+ if (!(flags & VIRTIOVF_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
+ ret = -EOPNOTSUPP;
+ goto end;
+ }
+ /* We may read and skip this optional record data */
+ migf->load_state = VIRTIOVF_LOAD_STATE_PREP_HEADER_DATA;
+ }
+
+ migf->max_pos += vhca_buf->length;
+ vhca_buf->length = 0;
+ *has_work = true;
+ }
+end:
+ kunmap_local(to_buff);
+ return ret;
+}
+
+static ssize_t virtiovf_resume_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct virtiovf_migration_file *migf = filp->private_data;
+ struct virtiovf_data_buffer *vhca_buf = migf->buf;
+ struct virtiovf_data_buffer *vhca_buf_header = migf->buf_header;
+ unsigned int orig_length;
+ bool has_work = false;
+ ssize_t done = 0;
+ int ret = 0;
+
+ if (pos)
+ return -ESPIPE;
+
+ pos = &filp->f_pos;
+ if (*pos < vhca_buf->start_pos)
+ return -EINVAL;
+
+ mutex_lock(&migf->virtvdev->state_mutex);
+ mutex_lock(&migf->lock);
+ if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
+ done = -ENODEV;
+ goto out_unlock;
+ }
+
+ while (len || has_work) {
+ has_work = false;
+ switch (migf->load_state) {
+ case VIRTIOVF_LOAD_STATE_READ_HEADER:
+ ret = virtiovf_resume_read_header(migf, vhca_buf_header, &buf,
+ &len, pos, &done, &has_work);
+ if (ret)
+ goto out_unlock;
+ break;
+ case VIRTIOVF_LOAD_STATE_PREP_HEADER_DATA:
+ if (vhca_buf_header->allocated_length < migf->record_size) {
+ virtiovf_free_data_buffer(vhca_buf_header);
+
+ migf->buf_header = virtiovf_alloc_data_buffer(migf,
+ migf->record_size);
+ if (IS_ERR(migf->buf_header)) {
+ ret = PTR_ERR(migf->buf_header);
+ migf->buf_header = NULL;
+ goto out_unlock;
+ }
+
+ vhca_buf_header = migf->buf_header;
+ }
+
+ vhca_buf_header->start_pos = migf->max_pos;
+ migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER_DATA;
+ break;
+ case VIRTIOVF_LOAD_STATE_READ_HEADER_DATA:
+ ret = virtiovf_resume_read_header_data(migf, vhca_buf_header,
+ &buf, &len, pos, &done);
+ if (ret)
+ goto out_unlock;
+ break;
+ case VIRTIOVF_LOAD_STATE_PREP_CHUNK:
+ {
+ u32 cmd_size = migf->record_size +
+ sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr);
+
+ /*
+ * The DMA map/unmap is managed in the virtio layer; we just need to
+ * extend the SG pages to hold the extra required chunk data.
+ */
+ if (vhca_buf->allocated_length < cmd_size) {
+ ret = virtiovf_add_migration_pages(vhca_buf,
+ DIV_ROUND_UP_ULL(cmd_size - vhca_buf->allocated_length,
+ PAGE_SIZE));
+ if (ret)
+ goto out_unlock;
+ }
+
+ vhca_buf->start_pos = migf->max_pos;
+ migf->load_state = VIRTIOVF_LOAD_STATE_READ_CHUNK;
+ break;
+ }
+ case VIRTIOVF_LOAD_STATE_READ_CHUNK:
+ ret = virtiovf_resume_read_chunk(migf, vhca_buf, migf->record_size,
+ &buf, &len, pos, &done, &has_work);
+ if (ret)
+ goto out_unlock;
+ break;
+ case VIRTIOVF_LOAD_STATE_LOAD_CHUNK:
+ /* Mark the last SG entry and set its length */
+ sg_mark_end(vhca_buf->last_offset_sg);
+ orig_length = vhca_buf->last_offset_sg->length;
+ /* Length should include the resource object command header */
+ vhca_buf->last_offset_sg->length = vhca_buf->length +
+ sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr) -
+ vhca_buf->last_offset;
+ ret = virtio_pci_admin_dev_parts_set(migf->virtvdev->core_device.pdev,
+ vhca_buf->table.sgt.sgl);
+ /* Restore the original SG data */
+ vhca_buf->last_offset_sg->length = orig_length;
+ sg_unmark_end(vhca_buf->last_offset_sg);
+ if (ret)
+ goto out_unlock;
+ migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER;
+ /* Be ready to read the next chunk */
+ vhca_buf->length = 0;
+ break;
+ default:
+ break;
+ }
+ }
+
+out_unlock:
+ if (ret)
+ migf->state = VIRTIOVF_MIGF_STATE_ERROR;
+ mutex_unlock(&migf->lock);
+ virtiovf_state_mutex_unlock(migf->virtvdev);
+ return ret ? ret : done;
+}
+
+static const struct file_operations virtiovf_resume_fops = {
+ .owner = THIS_MODULE,
+ .write = virtiovf_resume_write,
+ .release = virtiovf_release_file,
+};
+
+static struct virtiovf_migration_file *
+virtiovf_pci_resume_device_data(struct virtiovf_pci_core_device *virtvdev)
+{
+ struct virtiovf_migration_file *migf;
+ struct virtiovf_data_buffer *buf;
+ u32 obj_id;
+ int ret;
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
+ if (!migf)
+ return ERR_PTR(-ENOMEM);
+
+ migf->filp = anon_inode_getfile("virtiovf_mig", &virtiovf_resume_fops, migf,
+ O_WRONLY);
+ if (IS_ERR(migf->filp)) {
+ ret = PTR_ERR(migf->filp);
+ kfree(migf);
+ return ERR_PTR(ret);
+ }
+
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+ INIT_LIST_HEAD(&migf->buf_list);
+ INIT_LIST_HEAD(&migf->avail_list);
+ spin_lock_init(&migf->list_lock);
+
+ buf = virtiovf_alloc_data_buffer(migf, VIRTIOVF_TARGET_INITIAL_BUF_SIZE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out;
+ }
+
+ migf->buf = buf;
+
+ buf = virtiovf_alloc_data_buffer(migf,
+ sizeof(struct virtiovf_migration_header));
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_clean;
+ }
+
+ migf->buf_header = buf;
+ migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER;
+
+ migf->virtvdev = virtvdev;
+ ret = virtiovf_pci_alloc_obj_id(virtvdev, VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_SET,
+ &obj_id);
+ if (ret)
+ goto out_clean;
+
+ migf->obj_id = obj_id;
+ /* Mark as having a valid obj id, which may even be 0 */
+ migf->has_obj_id = true;
+ ret = virtiovf_set_obj_cmd_header(migf->buf);
+ if (ret)
+ goto out_clean;
+
+ return migf;
+
+out_clean:
+ virtiovf_clean_migf_resources(migf);
+out:
+ fput(migf->filp);
+ return ERR_PTR(ret);
+}
+
+static struct file *
+virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
+ u32 new)
+{
+ u32 cur = virtvdev->mig_state;
+ int ret;
+
+ if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
+ /* NOP */
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
+ /* NOP */
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+ ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev,
+ BIT(VIRTIO_ADMIN_CMD_DEV_MODE_F_STOPPED));
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
+ ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev, 0);
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ struct virtiovf_migration_file *migf;
+
+ migf = virtiovf_pci_save_device_data(virtvdev, false);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ virtvdev->saving_migf = migf;
+ return migf->filp;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
+ virtiovf_disable_fds(virtvdev);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
+ struct virtiovf_migration_file *migf;
+
+ migf = virtiovf_pci_resume_device_data(virtvdev);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ virtvdev->resuming_migf = migf;
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
+ virtiovf_disable_fds(virtvdev);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
+ (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
+ new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+ struct virtiovf_migration_file *migf;
+
+ migf = virtiovf_pci_save_device_data(virtvdev, true);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ virtvdev->saving_migf = migf;
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ ret = virtiovf_pci_save_device_final_data(virtvdev);
+ return ret ? ERR_PTR(ret) : NULL;
+ }
+
+ /*
+ * vfio_mig_get_next_state() does not use arcs other than the above.
+ */
+ WARN_ON(true);
+ return ERR_PTR(-EINVAL);
+}
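+
+/*
+ * For example, a userspace request to move from RUNNING to STOP_COPY is
+ * decomposed by vfio_mig_get_next_state() into the individual arcs
+ * handled above:
+ *
+ *    RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
+ *
+ * with the save FD returned on the final STOP -> STOP_COPY step.
+ */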
+
+static struct file *
+virtiovf_pci_set_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state new_state)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ enum vfio_device_mig_state next_state;
+ struct file *res = NULL;
+ int ret;
+
+ mutex_lock(&virtvdev->state_mutex);
+ while (new_state != virtvdev->mig_state) {
+ ret = vfio_mig_get_next_state(vdev, virtvdev->mig_state,
+ new_state, &next_state);
+ if (ret) {
+ res = ERR_PTR(ret);
+ break;
+ }
+ res = virtiovf_pci_step_device_state_locked(virtvdev, next_state);
+ if (IS_ERR(res))
+ break;
+ virtvdev->mig_state = next_state;
+ if (WARN_ON(res && new_state != virtvdev->mig_state)) {
+ fput(res);
+ res = ERR_PTR(-EINVAL);
+ break;
+ }
+ }
+ virtiovf_state_mutex_unlock(virtvdev);
+ return res;
+}
+
+static int virtiovf_pci_get_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state *curr_state)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ vdev, struct virtiovf_pci_core_device, core_device.vdev);
+
+ mutex_lock(&virtvdev->state_mutex);
+ *curr_state = virtvdev->mig_state;
+ virtiovf_state_mutex_unlock(virtvdev);
+ return 0;
+}
+
+static int virtiovf_pci_get_data_size(struct vfio_device *vdev,
+ unsigned long *stop_copy_length)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ bool obj_id_exists;
+ u32 res_size;
+ u32 obj_id;
+ int ret;
+
+ mutex_lock(&virtvdev->state_mutex);
+ obj_id_exists = virtvdev->saving_migf && virtvdev->saving_migf->has_obj_id;
+ if (!obj_id_exists) {
+ ret = virtiovf_pci_alloc_obj_id(virtvdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_GET,
+ &obj_id);
+ if (ret)
+ goto end;
+ } else {
+ obj_id = virtvdev->saving_migf->obj_id;
+ }
+
+ ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id,
+ VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
+ &res_size);
+ if (!ret)
+ *stop_copy_length = res_size;
+
+ /*
+ * We can't leave this obj_id alive if it didn't exist before; otherwise it
+ * might stay alive even without an active migration flow (e.g. if the
+ * migration was cancelled).
+ */
+ if (!obj_id_exists)
+ virtiovf_pci_free_obj_id(virtvdev, obj_id);
+end:
+ virtiovf_state_mutex_unlock(virtvdev);
+ return ret;
+}
+
+static const struct vfio_migration_ops virtvdev_pci_mig_ops = {
+ .migration_set_state = virtiovf_pci_set_device_state,
+ .migration_get_state = virtiovf_pci_get_device_state,
+ .migration_get_data_size = virtiovf_pci_get_data_size,
+};
+
+void virtiovf_set_migratable(struct virtiovf_pci_core_device *virtvdev)
+{
+ virtvdev->migrate_cap = 1;
+ mutex_init(&virtvdev->state_mutex);
+ spin_lock_init(&virtvdev->reset_lock);
+ virtvdev->core_device.vdev.migration_flags =
+ VFIO_MIGRATION_STOP_COPY |
+ VFIO_MIGRATION_P2P |
+ VFIO_MIGRATION_PRE_COPY;
+ virtvdev->core_device.vdev.mig_ops = &virtvdev_pci_mig_ops;
+}
+
+void virtiovf_open_migration(struct virtiovf_pci_core_device *virtvdev)
+{
+ if (!virtvdev->migrate_cap)
+ return;
+
+ virtvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+}
+
+void virtiovf_close_migration(struct virtiovf_pci_core_device *virtvdev)
+{
+ if (!virtvdev->migrate_cap)
+ return;
+
+ virtiovf_disable_fds(virtvdev);
+}
diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c
index 485c6f9161a9..ff8ff8480968 100644
--- a/drivers/vfio/platform/vfio_amba.c
+++ b/drivers/vfio/platform/vfio_amba.c
@@ -134,7 +134,6 @@ static struct amba_driver vfio_amba_driver = {
.id_table = vfio_amba_ids,
.drv = {
.name = "vfio-amba",
- .owner = THIS_MODULE,
},
.driver_managed_dma = true,
};
diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c
index 42d1462c5e19..512533501eb7 100644
--- a/drivers/vfio/platform/vfio_platform.c
+++ b/drivers/vfio/platform/vfio_platform.c
@@ -112,7 +112,7 @@ static const struct vfio_device_ops vfio_platform_ops = {
static struct platform_driver vfio_platform_driver = {
.probe = vfio_platform_probe,
- .remove_new = vfio_platform_remove,
+ .remove = vfio_platform_remove,
.driver = {
.name = "vfio-platform",
},
diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c
index e53757d1d095..3bf1043cd795 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -388,6 +388,11 @@ static ssize_t vfio_platform_read_mmio(struct vfio_platform_region *reg,
{
unsigned int done = 0;
+ if (off >= reg->size)
+ return -EINVAL;
+
+ count = min_t(size_t, count, reg->size - off);
+
if (!reg->ioaddr) {
reg->ioaddr =
ioremap(reg->addr, reg->size);
@@ -467,6 +472,11 @@ static ssize_t vfio_platform_write_mmio(struct vfio_platform_region *reg,
{
unsigned int done = 0;
+ if (off >= reg->size)
+ return -EINVAL;
+
+ count = min_t(size_t, count, reg->size - off);
+
if (!reg->ioaddr) {
reg->ioaddr =
ioremap(reg->addr, reg->size);
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index a94ec6225d31..5f9e7e477078 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -364,7 +364,6 @@ static void tce_iommu_release(void *iommu_data)
if (!tbl)
continue;
- tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
tce_iommu_free_table(container, tbl);
}
@@ -720,6 +719,8 @@ static long tce_iommu_remove_window(struct tce_container *container,
BUG_ON(!tbl->it_size);
+ tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+
/* Detach groups from IOMMUs */
list_for_each_entry(tcegrp, &container->group_list, next) {
table_group = iommu_group_get_iommudata(tcegrp->grp);
@@ -738,7 +739,6 @@ static long tce_iommu_remove_window(struct tce_container *container,
}
/* Free table */
- tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
tce_iommu_free_table(container, tbl);
container->tables[num] = NULL;
@@ -1197,9 +1197,14 @@ static void tce_iommu_release_ownership(struct tce_container *container,
return;
}
- for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
- if (container->tables[i])
+ for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+ if (container->tables[i]) {
+ tce_iommu_clear(container, container->tables[i],
+ container->tables[i]->it_offset,
+ container->tables[i]->it_size);
table_group->ops->unset_window(table_group, i);
+ }
+ }
}
static long tce_iommu_take_ownership(struct tce_container *container,
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index b5c15fe8f9fc..1136d7ac6b59 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -72,7 +72,6 @@ struct vfio_iommu {
uint64_t pgsize_bitmap;
uint64_t num_non_pinned_groups;
bool v2;
- bool nesting;
bool dirty_page_tracking;
struct list_head emulated_iommu_groups;
};
@@ -81,7 +80,6 @@ struct vfio_domain {
struct iommu_domain *domain;
struct list_head next;
struct list_head group_list;
- bool fgsp : 1; /* Fine-grained super pages */
bool enforce_cache_coherency : 1;
};
@@ -104,9 +102,9 @@ struct vfio_dma {
struct vfio_batch {
struct page **pages; /* for pin_user_pages_remote */
struct page *fallback_page; /* if pages alloc fails */
- int capacity; /* length of pages array */
- int size; /* of batch currently */
- int offset; /* of next entry in pages */
+ unsigned int capacity; /* length of pages array */
+ unsigned int size; /* of batch currently */
+ unsigned int offset; /* of next entry in pages */
};
struct vfio_iommu_group {
@@ -294,7 +292,7 @@ static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
struct rb_node *p;
for (p = rb_prev(n); p; p = rb_prev(p)) {
- struct vfio_dma *dma = rb_entry(n,
+ struct vfio_dma *dma = rb_entry(p,
struct vfio_dma, node);
vfio_dma_bitmap_free(dma);
@@ -472,12 +470,12 @@ static int put_pfn(unsigned long pfn, int prot)
#define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
-static void vfio_batch_init(struct vfio_batch *batch)
+static void __vfio_batch_init(struct vfio_batch *batch, bool single)
{
batch->size = 0;
batch->offset = 0;
- if (unlikely(disable_hugepages))
+ if (single || unlikely(disable_hugepages))
goto fallback;
batch->pages = (struct page **) __get_free_page(GFP_KERNEL);
@@ -492,6 +490,16 @@ fallback:
batch->capacity = 1;
}
+static void vfio_batch_init(struct vfio_batch *batch)
+{
+ __vfio_batch_init(batch, false);
+}
+
+static void vfio_batch_init_single(struct vfio_batch *batch)
+{
+ __vfio_batch_init(batch, true);
+}
+
static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma)
{
while (batch->size) {
@@ -511,14 +519,12 @@ static void vfio_batch_fini(struct vfio_batch *batch)
static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
unsigned long vaddr, unsigned long *pfn,
- bool write_fault)
+ unsigned long *addr_mask, bool write_fault)
{
- pte_t *ptep;
- pte_t pte;
- spinlock_t *ptl;
+ struct follow_pfnmap_args args = { .vma = vma, .address = vaddr };
int ret;
- ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
+ ret = follow_pfnmap_start(&args);
if (ret) {
bool unlocked = false;
@@ -532,43 +538,51 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
if (ret)
return ret;
- ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
+ ret = follow_pfnmap_start(&args);
if (ret)
return ret;
}
- pte = ptep_get(ptep);
-
- if (write_fault && !pte_write(pte))
+ if (write_fault && !args.writable) {
ret = -EFAULT;
- else
- *pfn = pte_pfn(pte);
+ } else {
+ *pfn = args.pfn;
+ *addr_mask = args.addr_mask;
+ }
- pte_unmap_unlock(ptep, ptl);
+ follow_pfnmap_end(&args);
return ret;
}
/*
* Returns the positive number of pfns successfully obtained or a negative
- * error code.
+ * error code. The initial pfn is stored in the pfn arg. For page-backed
+ * pfns, the provided batch is also updated to indicate the filled pages and
+ * initial offset. For VM_PFNMAP pfns, only the returned number of pfns and
+ * returned initial pfn are provided; subsequent pfns are contiguous.
*/
-static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
- long npages, int prot, unsigned long *pfn,
- struct page **pages)
+static long vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
+ unsigned long npages, int prot, unsigned long *pfn,
+ struct vfio_batch *batch)
{
+ unsigned long pin_pages = min_t(unsigned long, npages, batch->capacity);
struct vm_area_struct *vma;
unsigned int flags = 0;
- int ret;
+ long ret;
if (prot & IOMMU_WRITE)
flags |= FOLL_WRITE;
mmap_read_lock(mm);
- ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM,
- pages, NULL);
+ ret = pin_user_pages_remote(mm, vaddr, pin_pages, flags | FOLL_LONGTERM,
+ batch->pages, NULL);
if (ret > 0) {
- *pfn = page_to_pfn(pages[0]);
+ *pfn = page_to_pfn(batch->pages[0]);
+ batch->size = ret;
+ batch->offset = 0;
goto done;
+ } else if (!ret) {
+ ret = -EFAULT;
}
vaddr = untagged_addr_remote(mm, vaddr);
@@ -577,15 +591,22 @@ retry:
vma = vma_lookup(mm, vaddr);
if (vma && vma->vm_flags & VM_PFNMAP) {
- ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE);
+ unsigned long addr_mask;
+
+ ret = follow_fault_pfn(vma, mm, vaddr, pfn, &addr_mask,
+ prot & IOMMU_WRITE);
if (ret == -EAGAIN)
goto retry;
if (!ret) {
- if (is_invalid_reserved_pfn(*pfn))
- ret = 1;
- else
+ if (is_invalid_reserved_pfn(*pfn)) {
+ unsigned long epfn;
+
+ epfn = (*pfn | (~addr_mask >> PAGE_SHIFT)) + 1;
+ ret = min_t(long, npages, epfn - *pfn);
+ } else {
ret = -EFAULT;
+ }
}
}
done:
@@ -599,7 +620,7 @@ done:
* first page and all consecutive pages with the same locking.
*/
static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
- long npage, unsigned long *pfn_base,
+ unsigned long npage, unsigned long *pfn_base,
unsigned long limit, struct vfio_batch *batch)
{
unsigned long pfn;
@@ -621,32 +642,42 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
*pfn_base = 0;
}
+ if (unlikely(disable_hugepages))
+ npage = 1;
+
while (npage) {
if (!batch->size) {
/* Empty batch, so refill it. */
- long req_pages = min_t(long, npage, batch->capacity);
-
- ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
- &pfn, batch->pages);
+ ret = vaddr_get_pfns(mm, vaddr, npage, dma->prot,
+ &pfn, batch);
if (ret < 0)
goto unpin_out;
- batch->size = ret;
- batch->offset = 0;
-
if (!*pfn_base) {
*pfn_base = pfn;
rsvd = is_invalid_reserved_pfn(*pfn_base);
}
+
+ /* Handle pfnmap */
+ if (!batch->size) {
+ if (pfn != *pfn_base + pinned || !rsvd)
+ goto out;
+
+ pinned += ret;
+ npage -= ret;
+ vaddr += (PAGE_SIZE * ret);
+ iova += (PAGE_SIZE * ret);
+ continue;
+ }
}
/*
- * pfn is preset for the first iteration of this inner loop and
- * updated at the end to handle a VM_PFNMAP pfn. In that case,
- * batch->pages isn't valid (there's no struct page), so allow
- * batch->pages to be touched only when there's more than one
- * pfn to check, which guarantees the pfns are from a
- * !VM_PFNMAP vma.
+ * pfn is preset for the first iteration of this inner loop
+ * because vaddr_get_pfns() needs to provide the initial pfn
+ * for pfnmaps. Therefore, to reduce redundancy, the next pfn
+ * is fetched at the end of the loop.
+ * A PageReserved() page could still qualify as page backed
+ * and rsvd here, and therefore continues to use the batch.
*/
while (true) {
if (pfn != *pfn_base + pinned ||
@@ -681,21 +712,12 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
pfn = page_to_pfn(batch->pages[batch->offset]);
}
-
- if (unlikely(disable_hugepages))
- break;
}
out:
ret = vfio_lock_acct(dma, lock_acct, false);
unpin_out:
- if (batch->size == 1 && !batch->offset) {
- /* May be a VM_PFNMAP pfn, which the batch can't remember. */
- put_pfn(pfn, dma->prot);
- batch->size = 0;
- }
-
if (ret < 0) {
if (pinned && !rsvd) {
for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
@@ -710,7 +732,7 @@ unpin_out:
}
static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
- unsigned long pfn, long npage,
+ unsigned long pfn, unsigned long npage,
bool do_accounting)
{
long unlocked = 0, locked = 0;
@@ -733,7 +755,7 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
unsigned long *pfn_base, bool do_accounting)
{
- struct page *pages[1];
+ struct vfio_batch batch;
struct mm_struct *mm;
int ret;
@@ -741,7 +763,9 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
if (!mmget_not_zero(mm))
return -ENODEV;
- ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
+ vfio_batch_init_single(&batch);
+
+ ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, &batch);
if (ret != 1)
goto out;
@@ -760,6 +784,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
}
out:
+ vfio_batch_fini(&batch);
mmput(mm);
return ret;
}
@@ -1069,8 +1094,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
* may require hardware cache flushing, try to find the
* largest contiguous physical memory chunk to unmap.
*/
- for (len = PAGE_SIZE;
- !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
+ for (len = PAGE_SIZE; iova + len < end; len += PAGE_SIZE) {
next = iommu_iova_to_phys(domain->domain, iova + len);
if (next != phys + len)
break;
@@ -1807,49 +1831,6 @@ unwind:
return ret;
}
-/*
- * We change our unmap behavior slightly depending on whether the IOMMU
- * supports fine-grained superpages. IOMMUs like AMD-Vi will use a superpage
- * for practically any contiguous power-of-two mapping we give it. This means
- * we don't need to look for contiguous chunks ourselves to make unmapping
- * more efficient. On IOMMUs with coarse-grained super pages, like Intel VT-d
- * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
- * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
- * hugetlbfs is in use.
- */
-static void vfio_test_domain_fgsp(struct vfio_domain *domain, struct list_head *regions)
-{
- int ret, order = get_order(PAGE_SIZE * 2);
- struct vfio_iova *region;
- struct page *pages;
- dma_addr_t start;
-
- pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
- if (!pages)
- return;
-
- list_for_each_entry(region, regions, list) {
- start = ALIGN(region->start, PAGE_SIZE * 2);
- if (start >= region->end || (region->end - start < PAGE_SIZE * 2))
- continue;
-
- ret = iommu_map(domain->domain, start, page_to_phys(pages), PAGE_SIZE * 2,
- IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE,
- GFP_KERNEL_ACCOUNT);
- if (!ret) {
- size_t unmapped = iommu_unmap(domain->domain, start, PAGE_SIZE);
-
- if (unmapped == PAGE_SIZE)
- iommu_unmap(domain->domain, start + PAGE_SIZE, PAGE_SIZE);
- else
- domain->fgsp = true;
- }
- break;
- }
-
- __free_pages(pages, order);
-}
-
static struct vfio_iommu_group *find_iommu_group(struct vfio_domain *domain,
struct iommu_group *iommu_group)
{
@@ -2135,7 +2116,7 @@ static int vfio_iommu_domain_alloc(struct device *dev, void *data)
{
struct iommu_domain **domain = data;
- *domain = iommu_domain_alloc(dev->bus);
+ *domain = iommu_paging_domain_alloc(dev);
return 1; /* Don't iterate */
}
@@ -2192,16 +2173,11 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
* us a representative device for the IOMMU API call. We don't actually
* want to iterate beyond the first device (if any).
*/
- ret = -EIO;
iommu_group_for_each_dev(iommu_group, &domain->domain,
vfio_iommu_domain_alloc);
- if (!domain->domain)
+ if (IS_ERR(domain->domain)) {
+ ret = PTR_ERR(domain->domain);
goto out_free_domain;
-
- if (iommu->nesting) {
- ret = iommu_enable_nesting(domain->domain);
- if (ret)
- goto out_domain;
}
ret = iommu_attach_group(domain->domain, group->iommu_group);
@@ -2293,8 +2269,6 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
}
}
- vfio_test_domain_fgsp(domain, &iova_copy);
-
/* replay mappings on new domains */
ret = vfio_iommu_replay(iommu, domain);
if (ret)
@@ -2544,9 +2518,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
switch (arg) {
case VFIO_TYPE1_IOMMU:
break;
- case VFIO_TYPE1_NESTING_IOMMU:
- iommu->nesting = true;
- fallthrough;
+ case __VFIO_RESERVED_TYPE1_NESTING_IOMMU:
case VFIO_TYPE1v2_IOMMU:
iommu->v2 = true;
break;
@@ -2641,7 +2613,6 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
switch (arg) {
case VFIO_TYPE1_IOMMU:
case VFIO_TYPE1v2_IOMMU:
- case VFIO_TYPE1_NESTING_IOMMU:
case VFIO_UNMAP_ALL:
return 1;
case VFIO_UPDATE_VADDR:
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index e97d796a54fb..1fd261efc582 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -22,8 +22,10 @@
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
+#include <linux/mount.h>
#include <linux/mutex.h>
#include <linux/pci.h>
+#include <linux/pseudo_fs.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
@@ -43,9 +45,13 @@
#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC "VFIO - User Level meta-driver"
+#define VFIO_MAGIC 0x5646494f /* "VFIO" */
+
static struct vfio {
struct class *device_class;
struct ida device_ida;
+ struct vfsmount *vfs_mount;
+ int fs_count;
} vfio;
#ifdef CONFIG_VFIO_NOIOMMU
@@ -186,6 +192,8 @@ static void vfio_device_release(struct device *dev)
if (device->ops->release)
device->ops->release(device);
+ iput(device->inode);
+ simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
kvfree(device);
}
@@ -228,6 +236,34 @@ out_free:
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);
+static int vfio_fs_init_fs_context(struct fs_context *fc)
+{
+ return init_pseudo(fc, VFIO_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type vfio_fs_type = {
+ .name = "vfio",
+ .owner = THIS_MODULE,
+ .init_fs_context = vfio_fs_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+static struct inode *vfio_fs_inode_new(void)
+{
+ struct inode *inode;
+ int ret;
+
+ ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
+ if (ret)
+ return ERR_PTR(ret);
+
+ inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
+ if (IS_ERR(inode))
+ simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
+
+ return inode;
+}
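+
+/*
+ * Each inode returned here pins the "vfio" pseudo filesystem; the pin is
+ * dropped by the iput()/simple_release_fs() pair in vfio_device_release()
+ * and in the vfio_init_device() error path.
+ */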
+
/*
* Initialize a vfio_device so it can be registered to vfio core.
*/
@@ -246,6 +282,11 @@ static int vfio_init_device(struct vfio_device *device, struct device *dev,
init_completion(&device->comp);
device->dev = dev;
device->ops = ops;
+ device->inode = vfio_fs_inode_new();
+ if (IS_ERR(device->inode)) {
+ ret = PTR_ERR(device->inode);
+ goto out_inode;
+ }
if (ops->init) {
ret = ops->init(device);
@@ -260,6 +301,9 @@ static int vfio_init_device(struct vfio_device *device, struct device *dev,
return 0;
out_uninit:
+ iput(device->inode);
+ simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
+out_inode:
vfio_release_device_set(device);
ida_free(&vfio.device_ida, device->index);
return ret;
@@ -1707,7 +1751,7 @@ static void __exit vfio_cleanup(void)
module_init(vfio_init);
module_exit(vfio_cleanup);
-MODULE_IMPORT_NS(IOMMUFD);
+MODULE_IMPORT_NS("IOMMUFD");
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c
index 532269133801..aa2891f97508 100644
--- a/drivers/vfio/virqfd.c
+++ b/drivers/vfio/virqfd.c
@@ -113,7 +113,6 @@ int vfio_virqfd_enable(void *opaque,
void (*thread)(void *, void *),
void *data, struct virqfd **pvirqfd, int fd)
{
- struct fd irqfd;
struct eventfd_ctx *ctx;
struct virqfd *virqfd;
int ret = 0;
@@ -133,16 +132,16 @@ int vfio_virqfd_enable(void *opaque,
INIT_WORK(&virqfd->inject, virqfd_inject);
INIT_WORK(&virqfd->flush_inject, virqfd_flush_inject);
- irqfd = fdget(fd);
- if (!irqfd.file) {
+ CLASS(fd, irqfd)(fd);
+ if (fd_empty(irqfd)) {
ret = -EBADF;
goto err_fd;
}
- ctx = eventfd_ctx_fileget(irqfd.file);
+ ctx = eventfd_ctx_fileget(fd_file(irqfd));
if (IS_ERR(ctx)) {
ret = PTR_ERR(ctx);
- goto err_ctx;
+ goto err_fd;
}
virqfd->eventfd = ctx;
@@ -171,7 +170,7 @@ int vfio_virqfd_enable(void *opaque,
init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup);
init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc);
- events = vfs_poll(irqfd.file, &virqfd->pt);
+ events = vfs_poll(fd_file(irqfd), &virqfd->pt);
/*
* Check if there was an event already pending on the eventfd
@@ -181,18 +180,9 @@ int vfio_virqfd_enable(void *opaque,
if ((!handler || handler(opaque, data)) && thread)
schedule_work(&virqfd->inject);
}
-
- /*
- * Do not drop the file until the irqfd is fully initialized,
- * otherwise we might race against the EPOLLHUP.
- */
- fdput(irqfd);
-
return 0;
err_busy:
eventfd_ctx_put(ctx);
-err_ctx:
- fdput(irqfd);
err_fd:
kfree(virqfd);