summaryrefslogtreecommitdiff
path: root/drivers/vfio/pci
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-08-30 20:36:01 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-08-30 20:36:01 -0700
commitec0e2dc81072300acbda5deca2e02f98485eafa9 (patch)
tree994ebae628855d50b49c0da483ac29142dd85ff8 /drivers/vfio/pci
parentb6f6167ea8a424d14b41c172fe7a5f49e164f221 (diff)
parent642265e22ecc7fe05c49cb8e1e0000a049df9857 (diff)
Merge tag 'vfio-v6.6-rc1' of https://github.com/awilliam/linux-vfio
Pull VFIO updates from Alex Williamson: - VFIO direct character device (cdev) interface support. This extracts the vfio device fd from the container and group model, and is intended to be the native uAPI for use with IOMMUFD (Yi Liu) - Enhancements to the PCI hot reset interface in support of cdev usage (Yi Liu) - Fix a potential race between registering and unregistering vfio files in the kvm-vfio interface and extend use of a lock to avoid extra drop and acquires (Dmitry Torokhov) - A new vfio-pci variant driver for the AMD/Pensando Distributed Services Card (PDS) Ethernet device, supporting live migration (Brett Creeley) - Cleanups to remove redundant owner setup in cdx and fsl bus drivers, and simplify driver init/exit in fsl code (Li Zetao) - Fix uninitialized hole in data structure and pad capability structures for alignment (Stefan Hajnoczi) * tag 'vfio-v6.6-rc1' of https://github.com/awilliam/linux-vfio: (53 commits) vfio/pds: Send type for SUSPEND_STATUS command vfio/pds: fix return value in pds_vfio_get_lm_file() pds_core: Fix function header descriptions vfio: align capability structures vfio/type1: fix cap_migration information leak vfio/fsl-mc: Use module_fsl_mc_driver macro to simplify the code vfio/cdx: Remove redundant initialization owner in vfio_cdx_driver vfio/pds: Add Kconfig and documentation vfio/pds: Add support for firmware recovery vfio/pds: Add support for dirty page tracking vfio/pds: Add VFIO live migration support vfio/pds: register with the pds_core PF pds_core: Require callers of register/unregister to pass PF drvdata vfio/pds: Initial support for pds VFIO driver vfio: Commonize combine_ranges for use in other VFIO drivers kvm/vfio: avoid bouncing the mutex when adding and deleting groups kvm/vfio: ensure kvg instance stays around in kvm_vfio_group_add() docs: vfio: Add vfio device cdev description vfio: Compile vfio_group infrastructure optionally vfio: Move the IOMMU_CAP_CACHE_COHERENCY check in __vfio_register_dev() ...
Diffstat (limited to 'drivers/vfio/pci')
-rw-r--r--drivers/vfio/pci/Kconfig2
-rw-r--r--drivers/vfio/pci/Makefile2
-rw-r--r--drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c2
-rw-r--r--drivers/vfio/pci/mlx5/cmd.c48
-rw-r--r--drivers/vfio/pci/mlx5/main.c1
-rw-r--r--drivers/vfio/pci/pds/Kconfig19
-rw-r--r--drivers/vfio/pci/pds/Makefile11
-rw-r--r--drivers/vfio/pci/pds/cmds.c510
-rw-r--r--drivers/vfio/pci/pds/cmds.h25
-rw-r--r--drivers/vfio/pci/pds/dirty.c564
-rw-r--r--drivers/vfio/pci/pds/dirty.h39
-rw-r--r--drivers/vfio/pci/pds/lm.c434
-rw-r--r--drivers/vfio/pci/pds/lm.h41
-rw-r--r--drivers/vfio/pci/pds/pci_drv.c209
-rw-r--r--drivers/vfio/pci/pds/pci_drv.h9
-rw-r--r--drivers/vfio/pci/pds/vfio_dev.c227
-rw-r--r--drivers/vfio/pci/pds/vfio_dev.h39
-rw-r--r--drivers/vfio/pci/vfio_pci.c1
-rw-r--r--drivers/vfio/pci/vfio_pci_core.c261
19 files changed, 2291 insertions, 153 deletions
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 86bb7835cf3c..8125e5f37832 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -63,4 +63,6 @@ source "drivers/vfio/pci/mlx5/Kconfig"
source "drivers/vfio/pci/hisilicon/Kconfig"
+source "drivers/vfio/pci/pds/Kconfig"
+
endmenu
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 24c524224da5..45167be462d8 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -11,3 +11,5 @@ obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5/
obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/
+
+obj-$(CONFIG_PDS_VFIO_PCI) += pds/
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index a117eaf21c14..b2f9778c8366 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -1373,6 +1373,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = {
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
};
static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
@@ -1391,6 +1392,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
};
static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index c82c1f4fc588..33574b04477d 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -732,52 +732,6 @@ void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
mlx5vf_cmd_dealloc_pd(migf);
}
-static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes,
- u32 req_nodes)
-{
- struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
- unsigned long min_gap;
- unsigned long curr_gap;
-
- /* Special shortcut when a single range is required */
- if (req_nodes == 1) {
- unsigned long last;
-
- curr = comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
- while (curr) {
- last = curr->last;
- prev = curr;
- curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
- if (prev != comb_start)
- interval_tree_remove(prev, root);
- }
- comb_start->last = last;
- return;
- }
-
- /* Combine ranges which have the smallest gap */
- while (cur_nodes > req_nodes) {
- prev = NULL;
- min_gap = ULONG_MAX;
- curr = interval_tree_iter_first(root, 0, ULONG_MAX);
- while (curr) {
- if (prev) {
- curr_gap = curr->start - prev->last;
- if (curr_gap < min_gap) {
- min_gap = curr_gap;
- comb_start = prev;
- comb_end = curr;
- }
- }
- prev = curr;
- curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
- }
- comb_start->last = comb_end->last;
- interval_tree_remove(comb_end, root);
- cur_nodes--;
- }
-}
-
static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
struct mlx5vf_pci_core_device *mvdev,
struct rb_root_cached *ranges, u32 nnodes)
@@ -800,7 +754,7 @@ static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
int i;
if (num_ranges > max_num_range) {
- combine_ranges(ranges, nnodes, max_num_range);
+ vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
num_ranges = max_num_range;
}
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index d95fd382814c..42ec574a8622 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -1320,6 +1320,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = {
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
};
static int mlx5vf_pci_probe(struct pci_dev *pdev,
diff --git a/drivers/vfio/pci/pds/Kconfig b/drivers/vfio/pci/pds/Kconfig
new file mode 100644
index 000000000000..407b3fd32733
--- /dev/null
+++ b/drivers/vfio/pci/pds/Kconfig
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2023 Advanced Micro Devices, Inc.
+
+config PDS_VFIO_PCI
+ tristate "VFIO support for PDS PCI devices"
+ depends on PDS_CORE
+ select VFIO_PCI_CORE
+ help
+ This provides generic PCI support for PDS devices using the VFIO
+ framework.
+
+ More specific information on this driver can be
+ found in
+ <file:Documentation/networking/device_drivers/ethernet/amd/pds_vfio_pci.rst>.
+
+ To compile this driver as a module, choose M here. The module
+ will be called pds-vfio-pci.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/pds/Makefile b/drivers/vfio/pci/pds/Makefile
new file mode 100644
index 000000000000..d5a06d81634f
--- /dev/null
+++ b/drivers/vfio/pci/pds/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2023 Advanced Micro Devices, Inc.
+
+obj-$(CONFIG_PDS_VFIO_PCI) += pds-vfio-pci.o
+
+pds-vfio-pci-y := \
+ cmds.o \
+ dirty.o \
+ lm.o \
+ pci_drv.o \
+ vfio_dev.o
diff --git a/drivers/vfio/pci/pds/cmds.c b/drivers/vfio/pci/pds/cmds.c
new file mode 100644
index 000000000000..36463ccc3df9
--- /dev/null
+++ b/drivers/vfio/pci/pds/cmds.c
@@ -0,0 +1,510 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#include <linux/io.h>
+#include <linux/types.h>
+#include <linux/delay.h>
+
+#include <linux/pds/pds_common.h>
+#include <linux/pds/pds_core_if.h>
+#include <linux/pds/pds_adminq.h>
+
+#include "vfio_dev.h"
+#include "cmds.h"
+
+#define SUSPEND_TIMEOUT_S 5
+#define SUSPEND_CHECK_INTERVAL_MS 1
+
+static int pds_vfio_client_adminq_cmd(struct pds_vfio_pci_device *pds_vfio,
+ union pds_core_adminq_cmd *req,
+ union pds_core_adminq_comp *resp,
+ bool fast_poll)
+{
+ struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio);
+ union pds_core_adminq_cmd cmd = {};
+ struct pdsc *pdsc;
+ int err;
+
+ /* Wrap the client request */
+ cmd.client_request.opcode = PDS_AQ_CMD_CLIENT_CMD;
+ cmd.client_request.client_id = cpu_to_le16(pds_vfio->client_id);
+ memcpy(cmd.client_request.client_cmd, req,
+ sizeof(cmd.client_request.client_cmd));
+
+ pdsc = pdsc_get_pf_struct(pdev);
+ if (IS_ERR(pdsc))
+ return PTR_ERR(pdsc);
+
+ err = pdsc_adminq_post(pdsc, &cmd, resp, fast_poll);
+ if (err && err != -EAGAIN)
+ dev_err(pds_vfio_to_dev(pds_vfio),
+ "client admin cmd failed: %pe\n", ERR_PTR(err));
+
+ return err;
+}
+
+int pds_vfio_register_client_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+ struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio);
+ char devname[PDS_DEVNAME_LEN];
+ struct pdsc *pdsc;
+ int ci;
+
+ snprintf(devname, sizeof(devname), "%s.%d-%u", PDS_VFIO_LM_DEV_NAME,
+ pci_domain_nr(pdev->bus),
+ PCI_DEVID(pdev->bus->number, pdev->devfn));
+
+ pdsc = pdsc_get_pf_struct(pdev);
+ if (IS_ERR(pdsc))
+ return PTR_ERR(pdsc);
+
+ ci = pds_client_register(pdsc, devname);
+ if (ci < 0)
+ return ci;
+
+ pds_vfio->client_id = ci;
+
+ return 0;
+}
+
+void pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+ struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio);
+ struct pdsc *pdsc;
+ int err;
+
+ pdsc = pdsc_get_pf_struct(pdev);
+ if (IS_ERR(pdsc))
+ return;
+
+ err = pds_client_unregister(pdsc, pds_vfio->client_id);
+ if (err)
+ dev_err(&pdev->dev, "unregister from DSC failed: %pe\n",
+ ERR_PTR(err));
+
+ pds_vfio->client_id = 0;
+}
+
+static int
+pds_vfio_suspend_wait_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type)
+{
+ union pds_core_adminq_cmd cmd = {
+ .lm_suspend_status = {
+ .opcode = PDS_LM_CMD_SUSPEND_STATUS,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ .type = type,
+ },
+ };
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_adminq_comp comp = {};
+ unsigned long time_limit;
+ unsigned long time_start;
+ unsigned long time_done;
+ int err;
+
+ time_start = jiffies;
+ time_limit = time_start + HZ * SUSPEND_TIMEOUT_S;
+ do {
+ err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, true);
+ if (err != -EAGAIN)
+ break;
+
+ msleep(SUSPEND_CHECK_INTERVAL_MS);
+ } while (time_before(jiffies, time_limit));
+
+ time_done = jiffies;
+ dev_dbg(dev, "%s: vf%u: Suspend comp received in %d msecs\n", __func__,
+ pds_vfio->vf_id, jiffies_to_msecs(time_done - time_start));
+
+ /* Check the results */
+ if (time_after_eq(time_done, time_limit)) {
+ dev_err(dev, "%s: vf%u: Suspend comp timeout\n", __func__,
+ pds_vfio->vf_id);
+ err = -ETIMEDOUT;
+ }
+
+ return err;
+}
+
+int pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type)
+{
+ union pds_core_adminq_cmd cmd = {
+ .lm_suspend = {
+ .opcode = PDS_LM_CMD_SUSPEND,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ .type = type,
+ },
+ };
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_adminq_comp comp = {};
+ int err;
+
+ dev_dbg(dev, "vf%u: Suspend device\n", pds_vfio->vf_id);
+
+ /*
+ * The initial suspend request to the firmware starts the device suspend
+ * operation and the firmware returns success if it's started
+ * successfully.
+ */
+ err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, true);
+ if (err) {
+ dev_err(dev, "vf%u: Suspend failed: %pe\n", pds_vfio->vf_id,
+ ERR_PTR(err));
+ return err;
+ }
+
+ /*
+ * The subsequent suspend status request(s) check if the firmware has
+ * completed the device suspend process.
+ */
+ return pds_vfio_suspend_wait_device_cmd(pds_vfio, type);
+}
+
+int pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type)
+{
+ union pds_core_adminq_cmd cmd = {
+ .lm_resume = {
+ .opcode = PDS_LM_CMD_RESUME,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ .type = type,
+ },
+ };
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_adminq_comp comp = {};
+
+ dev_dbg(dev, "vf%u: Resume device\n", pds_vfio->vf_id);
+
+ return pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, true);
+}
+
+int pds_vfio_get_lm_state_size_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size)
+{
+ union pds_core_adminq_cmd cmd = {
+ .lm_state_size = {
+ .opcode = PDS_LM_CMD_STATE_SIZE,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ },
+ };
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_adminq_comp comp = {};
+ int err;
+
+ dev_dbg(dev, "vf%u: Get migration status\n", pds_vfio->vf_id);
+
+ err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+ if (err)
+ return err;
+
+ *size = le64_to_cpu(comp.lm_state_size.size);
+ return 0;
+}
+
+static int pds_vfio_dma_map_lm_file(struct device *dev,
+ enum dma_data_direction dir,
+ struct pds_vfio_lm_file *lm_file)
+{
+ struct pds_lm_sg_elem *sgl, *sge;
+ struct scatterlist *sg;
+ dma_addr_t sgl_addr;
+ size_t sgl_size;
+ int err;
+ int i;
+
+ if (!lm_file)
+ return -EINVAL;
+
+ /* dma map file pages */
+ err = dma_map_sgtable(dev, &lm_file->sg_table, dir, 0);
+ if (err)
+ return err;
+
+ lm_file->num_sge = lm_file->sg_table.nents;
+
+ /* alloc sgl */
+ sgl_size = lm_file->num_sge * sizeof(struct pds_lm_sg_elem);
+ sgl = kzalloc(sgl_size, GFP_KERNEL);
+ if (!sgl) {
+ err = -ENOMEM;
+ goto out_unmap_sgtable;
+ }
+
+ /* fill sgl */
+ sge = sgl;
+ for_each_sgtable_dma_sg(&lm_file->sg_table, sg, i) {
+ sge->addr = cpu_to_le64(sg_dma_address(sg));
+ sge->len = cpu_to_le32(sg_dma_len(sg));
+ dev_dbg(dev, "addr = %llx, len = %u\n", sge->addr, sge->len);
+ sge++;
+ }
+
+ sgl_addr = dma_map_single(dev, sgl, sgl_size, DMA_TO_DEVICE);
+ if (dma_mapping_error(dev, sgl_addr)) {
+ err = -EIO;
+ goto out_free_sgl;
+ }
+
+ lm_file->sgl = sgl;
+ lm_file->sgl_addr = sgl_addr;
+
+ return 0;
+
+out_free_sgl:
+ kfree(sgl);
+out_unmap_sgtable:
+ lm_file->num_sge = 0;
+ dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0);
+ return err;
+}
+
+static void pds_vfio_dma_unmap_lm_file(struct device *dev,
+ enum dma_data_direction dir,
+ struct pds_vfio_lm_file *lm_file)
+{
+ if (!lm_file)
+ return;
+
+ /* free sgl */
+ if (lm_file->sgl) {
+ dma_unmap_single(dev, lm_file->sgl_addr,
+ lm_file->num_sge * sizeof(*lm_file->sgl),
+ DMA_TO_DEVICE);
+ kfree(lm_file->sgl);
+ lm_file->sgl = NULL;
+ lm_file->sgl_addr = DMA_MAPPING_ERROR;
+ lm_file->num_sge = 0;
+ }
+
+ /* dma unmap file pages */
+ dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0);
+}
+
+int pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+ union pds_core_adminq_cmd cmd = {
+ .lm_save = {
+ .opcode = PDS_LM_CMD_SAVE,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ },
+ };
+ struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio);
+ struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+ union pds_core_adminq_comp comp = {};
+ struct pds_vfio_lm_file *lm_file;
+ int err;
+
+ dev_dbg(&pdev->dev, "vf%u: Get migration state\n", pds_vfio->vf_id);
+
+ lm_file = pds_vfio->save_file;
+
+ err = pds_vfio_dma_map_lm_file(pdsc_dev, DMA_FROM_DEVICE, lm_file);
+ if (err) {
+ dev_err(&pdev->dev, "failed to map save migration file: %pe\n",
+ ERR_PTR(err));
+ return err;
+ }
+
+ cmd.lm_save.sgl_addr = cpu_to_le64(lm_file->sgl_addr);
+ cmd.lm_save.num_sge = cpu_to_le32(lm_file->num_sge);
+
+ err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+ if (err)
+ dev_err(&pdev->dev, "failed to get migration state: %pe\n",
+ ERR_PTR(err));
+
+ pds_vfio_dma_unmap_lm_file(pdsc_dev, DMA_FROM_DEVICE, lm_file);
+
+ return err;
+}
+
+int pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+ union pds_core_adminq_cmd cmd = {
+ .lm_restore = {
+ .opcode = PDS_LM_CMD_RESTORE,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ },
+ };
+ struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio);
+ struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+ union pds_core_adminq_comp comp = {};
+ struct pds_vfio_lm_file *lm_file;
+ int err;
+
+ dev_dbg(&pdev->dev, "vf%u: Set migration state\n", pds_vfio->vf_id);
+
+ lm_file = pds_vfio->restore_file;
+
+ err = pds_vfio_dma_map_lm_file(pdsc_dev, DMA_TO_DEVICE, lm_file);
+ if (err) {
+ dev_err(&pdev->dev,
+ "failed to map restore migration file: %pe\n",
+ ERR_PTR(err));
+ return err;
+ }
+
+ cmd.lm_restore.sgl_addr = cpu_to_le64(lm_file->sgl_addr);
+ cmd.lm_restore.num_sge = cpu_to_le32(lm_file->num_sge);
+
+ err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+ if (err)
+ dev_err(&pdev->dev, "failed to set migration state: %pe\n",
+ ERR_PTR(err));
+
+ pds_vfio_dma_unmap_lm_file(pdsc_dev, DMA_TO_DEVICE, lm_file);
+
+ return err;
+}
+
+void pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio,
+ enum pds_lm_host_vf_status vf_status)
+{
+ union pds_core_adminq_cmd cmd = {
+ .lm_host_vf_status = {
+ .opcode = PDS_LM_CMD_HOST_VF_STATUS,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ .status = vf_status,
+ },
+ };
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_adminq_comp comp = {};
+ int err;
+
+ dev_dbg(dev, "vf%u: Set host VF LM status: %u", pds_vfio->vf_id,
+ vf_status);
+ if (vf_status != PDS_LM_STA_IN_PROGRESS &&
+ vf_status != PDS_LM_STA_NONE) {
+ dev_warn(dev, "Invalid host VF migration status, %d\n",
+ vf_status);
+ return;
+ }
+
+ err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+ if (err)
+ dev_warn(dev, "failed to send host VF migration status: %pe\n",
+ ERR_PTR(err));
+}
+
+int pds_vfio_dirty_status_cmd(struct pds_vfio_pci_device *pds_vfio,
+ u64 regions_dma, u8 *max_regions, u8 *num_regions)
+{
+ union pds_core_adminq_cmd cmd = {
+ .lm_dirty_status = {
+ .opcode = PDS_LM_CMD_DIRTY_STATUS,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ },
+ };
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_adminq_comp comp = {};
+ int err;
+
+ dev_dbg(dev, "vf%u: Dirty status\n", pds_vfio->vf_id);
+
+ cmd.lm_dirty_status.regions_dma = cpu_to_le64(regions_dma);
+ cmd.lm_dirty_status.max_regions = *max_regions;
+
+ err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+ if (err) {
+ dev_err(dev, "failed to get dirty status: %pe\n", ERR_PTR(err));
+ return err;
+ }
+
+ /* only support seq_ack approach for now */
+ if (!(le32_to_cpu(comp.lm_dirty_status.bmp_type_mask) &
+ BIT(PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK))) {
+ dev_err(dev, "Dirty bitmap tracking SEQ_ACK not supported\n");
+ return -EOPNOTSUPP;
+ }
+
+ *num_regions = comp.lm_dirty_status.num_regions;
+ *max_regions = comp.lm_dirty_status.max_regions;
+
+ dev_dbg(dev,
+ "Page Tracking Status command successful, max_regions: %d, num_regions: %d, bmp_type: %s\n",
+ *max_regions, *num_regions, "PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK");
+
+ return 0;
+}
+
+int pds_vfio_dirty_enable_cmd(struct pds_vfio_pci_device *pds_vfio,
+ u64 regions_dma, u8 num_regions)
+{
+ union pds_core_adminq_cmd cmd = {
+ .lm_dirty_enable = {
+ .opcode = PDS_LM_CMD_DIRTY_ENABLE,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ .regions_dma = cpu_to_le64(regions_dma),
+ .bmp_type = PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK,
+ .num_regions = num_regions,
+ },
+ };
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_adminq_comp comp = {};
+ int err;
+
+ err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+ if (err) {
+ dev_err(dev, "failed dirty tracking enable: %pe\n",
+ ERR_PTR(err));
+ return err;
+ }
+
+ return 0;
+}
+
+int pds_vfio_dirty_disable_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+ union pds_core_adminq_cmd cmd = {
+ .lm_dirty_disable = {
+ .opcode = PDS_LM_CMD_DIRTY_DISABLE,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ },
+ };
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_adminq_comp comp = {};
+ int err;
+
+ err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+ if (err || comp.lm_dirty_status.num_regions != 0) {
+ /* in case num_regions is still non-zero after disable */
+ err = err ? err : -EIO;
+ dev_err(dev,
+ "failed dirty tracking disable: %pe, num_regions %d\n",
+ ERR_PTR(err), comp.lm_dirty_status.num_regions);
+ return err;
+ }
+
+ return 0;
+}
+
+int pds_vfio_dirty_seq_ack_cmd(struct pds_vfio_pci_device *pds_vfio,
+ u64 sgl_dma, u16 num_sge, u32 offset,
+ u32 total_len, bool read_seq)
+{
+ const char *cmd_type_str = read_seq ? "read_seq" : "write_ack";
+ union pds_core_adminq_cmd cmd = {
+ .lm_dirty_seq_ack = {
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ .len_bytes = cpu_to_le32(total_len),
+ .off_bytes = cpu_to_le32(offset),
+ .sgl_addr = cpu_to_le64(sgl_dma),
+ .num_sge = cpu_to_le16(num_sge),
+ },
+ };
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_adminq_comp comp = {};
+ int err;
+
+ if (read_seq)
+ cmd.lm_dirty_seq_ack.opcode = PDS_LM_CMD_DIRTY_READ_SEQ;
+ else
+ cmd.lm_dirty_seq_ack.opcode = PDS_LM_CMD_DIRTY_WRITE_ACK;
+
+ err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+ if (err) {
+ dev_err(dev, "failed cmd Page Tracking %s: %pe\n", cmd_type_str,
+ ERR_PTR(err));
+ return err;
+ }
+
+ return 0;
+}
diff --git a/drivers/vfio/pci/pds/cmds.h b/drivers/vfio/pci/pds/cmds.h
new file mode 100644
index 000000000000..95221100b954
--- /dev/null
+++ b/drivers/vfio/pci/pds/cmds.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _CMDS_H_
+#define _CMDS_H_
+
+int pds_vfio_register_client_cmd(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio);
+int pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type);
+int pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type);
+int pds_vfio_get_lm_state_size_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size);
+int pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio);
+int pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio,
+ enum pds_lm_host_vf_status vf_status);
+int pds_vfio_dirty_status_cmd(struct pds_vfio_pci_device *pds_vfio,
+ u64 regions_dma, u8 *max_regions,
+ u8 *num_regions);
+int pds_vfio_dirty_enable_cmd(struct pds_vfio_pci_device *pds_vfio,
+ u64 regions_dma, u8 num_regions);
+int pds_vfio_dirty_disable_cmd(struct pds_vfio_pci_device *pds_vfio);
+int pds_vfio_dirty_seq_ack_cmd(struct pds_vfio_pci_device *pds_vfio,
+ u64 sgl_dma, u16 num_sge, u32 offset,
+ u32 total_len, bool read_seq);
+#endif /* _CMDS_H_ */
diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c
new file mode 100644
index 000000000000..c937aa6f3954
--- /dev/null
+++ b/drivers/vfio/pci/pds/dirty.c
@@ -0,0 +1,564 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#include <linux/interval_tree.h>
+#include <linux/vfio.h>
+
+#include <linux/pds/pds_common.h>
+#include <linux/pds/pds_core_if.h>
+#include <linux/pds/pds_adminq.h>
+
+#include "vfio_dev.h"
+#include "cmds.h"
+#include "dirty.h"
+
+#define READ_SEQ true
+#define WRITE_ACK false
+
+bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio)
+{
+ return pds_vfio->dirty.is_enabled;
+}
+
+void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio)
+{
+ pds_vfio->dirty.is_enabled = true;
+}
+
+void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio)
+{
+ pds_vfio->dirty.is_enabled = false;
+}
+
+static void
+pds_vfio_print_guest_region_info(struct pds_vfio_pci_device *pds_vfio,
+ u8 max_regions)
+{
+ int len = max_regions * sizeof(struct pds_lm_dirty_region_info);
+ struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
+ struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+ struct pds_lm_dirty_region_info *region_info;
+ dma_addr_t regions_dma;
+ u8 num_regions;
+ int err;
+
+ region_info = kcalloc(max_regions,
+ sizeof(struct pds_lm_dirty_region_info),
+ GFP_KERNEL);
+ if (!region_info)
+ return;
+
+ regions_dma =
+ dma_map_single(pdsc_dev, region_info, len, DMA_FROM_DEVICE);
+ if (dma_mapping_error(pdsc_dev, regions_dma))
+ goto out_free_region_info;
+
+ err = pds_vfio_dirty_status_cmd(pds_vfio, regions_dma, &max_regions,
+ &num_regions);
+ dma_unmap_single(pdsc_dev, regions_dma, len, DMA_FROM_DEVICE);
+ if (err)
+ goto out_free_region_info;
+
+ for (unsigned int i = 0; i < num_regions; i++)
+ dev_dbg(&pdev->dev,
+ "region_info[%d]: dma_base 0x%llx page_count %u page_size_log2 %u\n",
+ i, le64_to_cpu(region_info[i].dma_base),
+ le32_to_cpu(region_info[i].page_count),
+ region_info[i].page_size_log2);
+
+out_free_region_info:
+ kfree(region_info);
+}
+
+static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_dirty *dirty,
+ unsigned long bytes)
+{
+ unsigned long *host_seq_bmp, *host_ack_bmp;
+
+ host_seq_bmp = vzalloc(bytes);
+ if (!host_seq_bmp)
+ return -ENOMEM;
+
+ host_ack_bmp = vzalloc(bytes);
+ if (!host_ack_bmp) {
+ bitmap_free(host_seq_bmp);
+ return -ENOMEM;
+ }
+
+ dirty->host_seq.bmp = host_seq_bmp;
+ dirty->host_ack.bmp = host_ack_bmp;
+
+ return 0;
+}
+
+static void pds_vfio_dirty_free_bitmaps(struct pds_vfio_dirty *dirty)
+{
+ vfree(dirty->host_seq.bmp);
+ vfree(dirty->host_ack.bmp);
+ dirty->host_seq.bmp = NULL;
+ dirty->host_ack.bmp = NULL;
+}
+
+static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio,
+ struct pds_vfio_bmp_info *bmp_info)
+{
+ struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
+ struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+
+ dma_unmap_single(pdsc_dev, bmp_info->sgl_addr,
+ bmp_info->num_sge * sizeof(struct pds_lm_sg_elem),
+ DMA_BIDIRECTIONAL);
+ kfree(bmp_info->sgl);
+
+ bmp_info->num_sge = 0;
+ bmp_info->sgl = NULL;
+ bmp_info->sgl_addr = 0;
+}
+
+static void pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio)
+{
+ if (pds_vfio->dirty.host_seq.sgl)
+ __pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_seq);
+ if (pds_vfio->dirty.host_ack.sgl)
+ __pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_ack);
+}
+
+static int __pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
+ struct pds_vfio_bmp_info *bmp_info,
+ u32 page_count)
+{
+ struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
+ struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+ struct pds_lm_sg_elem *sgl;
+ dma_addr_t sgl_addr;
+ size_t sgl_size;
+ u32 max_sge;
+
+ max_sge = DIV_ROUND_UP(page_count, PAGE_SIZE * 8);
+ sgl_size = max_sge * sizeof(struct pds_lm_sg_elem);
+
+ sgl = kzalloc(sgl_size, GFP_KERNEL);
+ if (!sgl)
+ return -ENOMEM;
+
+ sgl_addr = dma_map_single(pdsc_dev, sgl, sgl_size, DMA_BIDIRECTIONAL);
+ if (dma_mapping_error(pdsc_dev, sgl_addr)) {
+ kfree(sgl);
+ return -EIO;
+ }
+
+ bmp_info->sgl = sgl;
+ bmp_info->num_sge = max_sge;
+ bmp_info->sgl_addr = sgl_addr;
+
+ return 0;
+}
+
+static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
+ u32 page_count)
+{
+ struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
+ int err;
+
+ err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_seq,
+ page_count);
+ if (err)
+ return err;
+
+ err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_ack,
+ page_count);
+ if (err) {
+ __pds_vfio_dirty_free_sgl(pds_vfio, &dirty->host_seq);
+ return err;
+ }
+
+ return 0;
+}
+
+static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,
+ struct rb_root_cached *ranges, u32 nnodes,
+ u64 *page_size)
+{
+ struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
+ struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+ struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
+ u64 region_start, region_size, region_page_size;
+ struct pds_lm_dirty_region_info *region_info;
+ struct interval_tree_node *node = NULL;
+ u8 max_regions = 0, num_regions;
+ dma_addr_t regions_dma = 0;
+ u32 num_ranges = nnodes;
+ u32 page_count;
+ u16 len;
+ int err;
+
+ dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n",
+ pds_vfio->vf_id);
+
+ if (pds_vfio_dirty_is_enabled(pds_vfio))
+ return -EINVAL;
+
+ /* find if dirty tracking is disabled, i.e. num_regions == 0 */
+ err = pds_vfio_dirty_status_cmd(pds_vfio, 0, &max_regions,
+ &num_regions);
+ if (err < 0) {
+ dev_err(&pdev->dev, "Failed to get dirty status, err %pe\n",
+ ERR_PTR(err));
+ return err;
+ } else if (num_regions) {
+ dev_err(&pdev->dev,
+ "Dirty tracking already enabled for %d regions\n",
+ num_regions);
+ return -EEXIST;
+ } else if (!max_regions) {
+ dev_err(&pdev->dev,
+ "Device doesn't support dirty tracking, max_regions %d\n",
+ max_regions);
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * Only support 1 region for now. If there are any large gaps in the
+ * VM's address regions, then this would be a waste of memory as we are
+ * generating 2 bitmaps (ack/seq) from the min address to the max
+ * address of the VM's address regions. In the future, if we support
+ * more than one region in the device/driver we can split the bitmaps
+ * on the largest address region gaps. We can do this split up to the
+ * max_regions times returned from the dirty_status command.
+ */
+ max_regions = 1;
+ if (num_ranges > max_regions) {
+ vfio_combine_iova_ranges(ranges, nnodes, max_regions);
+ num_ranges = max_regions;
+ }
+
+ node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
+ if (!node)
+ return -EINVAL;
+
+ region_size = node->last - node->start + 1;
+ region_start = node->start;
+ region_page_size = *page_size;
+
+ len = sizeof(*region_info);
+ region_info = kzalloc(len, GFP_KERNEL);
+ if (!region_info)
+ return -ENOMEM;
+
+ page_count = DIV_ROUND_UP(region_size, region_page_size);
+
+ region_info->dma_base = cpu_to_le64(region_start);
+ region_info->page_count = cpu_to_le32(page_count);
+ region_info->page_size_log2 = ilog2(region_page_size);
+
+ regions_dma = dma_map_single(pdsc_dev, (void *)region_info, len,
+ DMA_BIDIRECTIONAL);
+ if (dma_mapping_error(pdsc_dev, regions_dma)) {
+ err = -ENOMEM;
+ goto out_free_region_info;
+ }
+
+ err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, max_regions);
+ dma_unmap_single(pdsc_dev, regions_dma, len, DMA_BIDIRECTIONAL);
+ if (err)
+ goto out_free_region_info;
+
+ /*
+ * page_count might be adjusted by the device,
+ * update it before freeing region_info DMA
+ */
+ page_count = le32_to_cpu(region_info->page_count);
+
+ dev_dbg(&pdev->dev,
+ "region_info: regions_dma 0x%llx dma_base 0x%llx page_count %u page_size_log2 %u\n",
+ regions_dma, region_start, page_count,
+ (u8)ilog2(region_page_size));
+
+ err = pds_vfio_dirty_alloc_bitmaps(dirty, page_count / BITS_PER_BYTE);
+ if (err) {
+ dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n",
+ ERR_PTR(err));
+ goto out_free_region_info;
+ }
+
+ err = pds_vfio_dirty_alloc_sgl(pds_vfio, page_count);
+ if (err) {
+ dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n",
+ ERR_PTR(err));
+ goto out_free_bitmaps;
+ }
+
+ dirty->region_start = region_start;
+ dirty->region_size = region_size;
+ dirty->region_page_size = region_page_size;
+ pds_vfio_dirty_set_enabled(pds_vfio);
+
+ pds_vfio_print_guest_region_info(pds_vfio, max_regions);
+
+ kfree(region_info);
+
+ return 0;
+
+out_free_bitmaps:
+ pds_vfio_dirty_free_bitmaps(dirty);
+out_free_region_info:
+ kfree(region_info);
+ return err;
+}
+
+void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool send_cmd)
+{
+ if (pds_vfio_dirty_is_enabled(pds_vfio)) {
+ pds_vfio_dirty_set_disabled(pds_vfio);
+ if (send_cmd)
+ pds_vfio_dirty_disable_cmd(pds_vfio);
+ pds_vfio_dirty_free_sgl(pds_vfio);
+ pds_vfio_dirty_free_bitmaps(&pds_vfio->dirty);
+ }
+
+ if (send_cmd)
+ pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE);
+}
+
+static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio,
+ struct pds_vfio_bmp_info *bmp_info,
+ u32 offset, u32 bmp_bytes, bool read_seq)
+{
+ const char *bmp_type_str = read_seq ? "read_seq" : "write_ack";
+ u8 dma_dir = read_seq ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+ struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
+ struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+ unsigned long long npages;
+ struct sg_table sg_table;
+ struct scatterlist *sg;
+ struct page **pages;
+ u32 page_offset;
+ const void *bmp;
+ size_t size;
+ u16 num_sge;
+ int err;
+ int i;
+
+ bmp = (void *)((u64)bmp_info->bmp + offset);
+ page_offset = offset_in_page(bmp);
+ bmp -= page_offset;
+
+ /*
+ * Start and end of bitmap section to seq/ack might not be page
+ * aligned, so use the page_offset to account for that so there
+ * will be enough pages to represent the bmp_bytes
+ */
+ npages = DIV_ROUND_UP_ULL(bmp_bytes + page_offset, PAGE_SIZE);
+ pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ for (unsigned long long i = 0; i < npages; i++) {
+ struct page *page = vmalloc_to_page(bmp);
+
+ if (!page) {
+ err = -EFAULT;
+ goto out_free_pages;
+ }
+
+ pages[i] = page;
+ bmp += PAGE_SIZE;
+ }
+
+ err = sg_alloc_table_from_pages(&sg_table, pages, npages, page_offset,
+ bmp_bytes, GFP_KERNEL);
+ if (err)
+ goto out_free_pages;
+
+ err = dma_map_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
+ if (err)
+ goto out_free_sg_table;
+
+ for_each_sgtable_dma_sg(&sg_table, sg, i) {
+ struct pds_lm_sg_elem *sg_elem = &bmp_info->sgl[i];
+
+ sg_elem->addr = cpu_to_le64(sg_dma_address(sg));
+ sg_elem->len = cpu_to_le32(sg_dma_len(sg));
+ }
+
+ num_sge = sg_table.nents;
+ size = num_sge * sizeof(struct pds_lm_sg_elem);
+ dma_sync_single_for_device(pdsc_dev, bmp_info->sgl_addr, size, dma_dir);
+ err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, bmp_info->sgl_addr, num_sge,
+ offset, bmp_bytes, read_seq);
+ if (err)
+ dev_err(&pdev->dev,
+ "Dirty bitmap %s failed offset %u bmp_bytes %u num_sge %u DMA 0x%llx: %pe\n",
+ bmp_type_str, offset, bmp_bytes,
+ num_sge, bmp_info->sgl_addr, ERR_PTR(err));
+ dma_sync_single_for_cpu(pdsc_dev, bmp_info->sgl_addr, size, dma_dir);
+
+ dma_unmap_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
+out_free_sg_table:
+ sg_free_table(&sg_table);
+out_free_pages:
+ kfree(pages);
+
+ return err;
+}
+
+static int pds_vfio_dirty_write_ack(struct pds_vfio_pci_device *pds_vfio,
+ u32 offset, u32 len)
+{
+ return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_ack,
+ offset, len, WRITE_ACK);
+}
+
+static int pds_vfio_dirty_read_seq(struct pds_vfio_pci_device *pds_vfio,
+ u32 offset, u32 len)
+{
+ return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_seq,
+ offset, len, READ_SEQ);
+}
+
+static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio,
+ struct iova_bitmap *dirty_bitmap,
+ u32 bmp_offset, u32 len_bytes)
+{
+ u64 page_size = pds_vfio->dirty.region_page_size;
+ u64 region_start = pds_vfio->dirty.region_start;
+ u32 bmp_offset_bit;
+ __le64 *seq, *ack;
+ int dword_count;
+
+ dword_count = len_bytes / sizeof(u64);
+ seq = (__le64 *)((u64)pds_vfio->dirty.host_seq.bmp + bmp_offset);
+ ack = (__le64 *)((u64)pds_vfio->dirty.host_ack.bmp + bmp_offset);
+ bmp_offset_bit = bmp_offset * 8;
+
+ for (int i = 0; i < dword_count; i++) {
+ u64 xor = le64_to_cpu(seq[i]) ^ le64_to_cpu(ack[i]);
+
+ /* prepare for next write_ack call */
+ ack[i] = seq[i];
+
+ for (u8 bit_i = 0; bit_i < BITS_PER_TYPE(u64); ++bit_i) {
+ if (xor & BIT(bit_i)) {
+ u64 abs_bit_i = bmp_offset_bit +
+ i * BITS_PER_TYPE(u64) + bit_i;
+ u64 addr = abs_bit_i * page_size + region_start;
+
+ iova_bitmap_set(dirty_bitmap, addr, page_size);
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio,
+ struct iova_bitmap *dirty_bitmap,
+ unsigned long iova, unsigned long length)
+{
+ struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
+ struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
+ u64 bmp_offset, bmp_bytes;
+ u64 bitmap_size, pages;
+ int err;
+
+ dev_dbg(dev, "vf%u: Get dirty page bitmap\n", pds_vfio->vf_id);
+
+ if (!pds_vfio_dirty_is_enabled(pds_vfio)) {
+ dev_err(dev, "vf%u: Sync failed, dirty tracking is disabled\n",
+ pds_vfio->vf_id);
+ return -EINVAL;
+ }
+
+ pages = DIV_ROUND_UP(length, pds_vfio->dirty.region_page_size);
+ bitmap_size =
+ round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE;
+
+ dev_dbg(dev,
+ "vf%u: iova 0x%lx length %lu page_size %llu pages %llu bitmap_size %llu\n",
+ pds_vfio->vf_id, iova, length, pds_vfio->dirty.region_page_size,
+ pages, bitmap_size);
+
+ if (!length || ((dirty->region_start + iova + length) >
+ (dirty->region_start + dirty->region_size))) {
+ dev_err(dev, "Invalid iova 0x%lx and/or length 0x%lx to sync\n",
+ iova, length);
+ return -EINVAL;
+ }
+
+ /* bitmap is modified in 64 bit chunks */
+ bmp_bytes = ALIGN(DIV_ROUND_UP(length / dirty->region_page_size,
+ sizeof(u64)),
+ sizeof(u64));
+ if (bmp_bytes != bitmap_size) {
+ dev_err(dev,
+ "Calculated bitmap bytes %llu not equal to bitmap size %llu\n",
+ bmp_bytes, bitmap_size);
+ return -EINVAL;
+ }
+
+ bmp_offset = DIV_ROUND_UP(iova / dirty->region_page_size, sizeof(u64));
+
+ dev_dbg(dev,
+ "Syncing dirty bitmap, iova 0x%lx length 0x%lx, bmp_offset %llu bmp_bytes %llu\n",
+ iova, length, bmp_offset, bmp_bytes);
+
+ err = pds_vfio_dirty_read_seq(pds_vfio, bmp_offset, bmp_bytes);
+ if (err)
+ return err;
+
+ err = pds_vfio_dirty_process_bitmaps(pds_vfio, dirty_bitmap, bmp_offset,
+ bmp_bytes);
+ if (err)
+ return err;
+
+ err = pds_vfio_dirty_write_ack(pds_vfio, bmp_offset, bmp_bytes);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
+ unsigned long length, struct iova_bitmap *dirty)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(vdev, struct pds_vfio_pci_device,
+ vfio_coredev.vdev);
+ int err;
+
+ mutex_lock(&pds_vfio->state_mutex);
+ err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length);
+ pds_vfio_state_mutex_unlock(pds_vfio);
+
+ return err;
+}
+
+int pds_vfio_dma_logging_start(struct vfio_device *vdev,
+ struct rb_root_cached *ranges, u32 nnodes,
+ u64 *page_size)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(vdev, struct pds_vfio_pci_device,
+ vfio_coredev.vdev);
+ int err;
+
+ mutex_lock(&pds_vfio->state_mutex);
+ pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS);
+ err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size);
+ pds_vfio_state_mutex_unlock(pds_vfio);
+
+ return err;
+}
+
+int pds_vfio_dma_logging_stop(struct vfio_device *vdev)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(vdev, struct pds_vfio_pci_device,
+ vfio_coredev.vdev);
+
+ mutex_lock(&pds_vfio->state_mutex);
+ pds_vfio_dirty_disable(pds_vfio, true);
+ pds_vfio_state_mutex_unlock(pds_vfio);
+
+ return 0;
+}
diff --git a/drivers/vfio/pci/pds/dirty.h b/drivers/vfio/pci/pds/dirty.h
new file mode 100644
index 000000000000..f78da25d75ca
--- /dev/null
+++ b/drivers/vfio/pci/pds/dirty.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _DIRTY_H_
+#define _DIRTY_H_
+
+struct pds_vfio_bmp_info {
+ unsigned long *bmp;
+ u32 bmp_bytes;
+ struct pds_lm_sg_elem *sgl;
+ dma_addr_t sgl_addr;
+ u16 num_sge;
+};
+
+struct pds_vfio_dirty {
+ struct pds_vfio_bmp_info host_seq;
+ struct pds_vfio_bmp_info host_ack;
+ u64 region_size;
+ u64 region_start;
+ u64 region_page_size;
+ bool is_enabled;
+};
+
+struct pds_vfio_pci_device;
+
+bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio,
+ bool send_cmd);
+
+int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
+ unsigned long length,
+ struct iova_bitmap *dirty);
+int pds_vfio_dma_logging_start(struct vfio_device *vdev,
+ struct rb_root_cached *ranges, u32 nnodes,
+ u64 *page_size);
+int pds_vfio_dma_logging_stop(struct vfio_device *vdev);
+#endif /* _DIRTY_H_ */
diff --git a/drivers/vfio/pci/pds/lm.c b/drivers/vfio/pci/pds/lm.c
new file mode 100644
index 000000000000..79fe2e66bb49
--- /dev/null
+++ b/drivers/vfio/pci/pds/lm.c
@@ -0,0 +1,434 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/vfio.h>
+#include <linux/vfio_pci_core.h>
+
+#include "vfio_dev.h"
+#include "cmds.h"
+
+static struct pds_vfio_lm_file *
+pds_vfio_get_lm_file(const struct file_operations *fops, int flags, u64 size)
+{
+ struct pds_vfio_lm_file *lm_file = NULL;
+ unsigned long long npages;
+ struct page **pages;
+ void *page_mem;
+ const void *p;
+
+ if (!size)
+ return NULL;
+
+ /* Alloc file structure */
+ lm_file = kzalloc(sizeof(*lm_file), GFP_KERNEL);
+ if (!lm_file)
+ return NULL;
+
+ /* Create file */
+ lm_file->filep =
+ anon_inode_getfile("pds_vfio_lm", fops, lm_file, flags);
+ if (IS_ERR(lm_file->filep))
+ goto out_free_file;
+
+ stream_open(lm_file->filep->f_inode, lm_file->filep);
+ mutex_init(&lm_file->lock);
+
+ /* prevent file from being released before we are done with it */
+ get_file(lm_file->filep);
+
+ /* Allocate memory for file pages */
+ npages = DIV_ROUND_UP_ULL(size, PAGE_SIZE);
+ pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ goto out_put_file;
+
+ page_mem = kvzalloc(ALIGN(size, PAGE_SIZE), GFP_KERNEL);
+ if (!page_mem)
+ goto out_free_pages_array;
+
+ p = page_mem - offset_in_page(page_mem);
+ for (unsigned long long i = 0; i < npages; i++) {
+ if (is_vmalloc_addr(p))
+ pages[i] = vmalloc_to_page(p);
+ else
+ pages[i] = kmap_to_page((void *)p);
+ if (!pages[i])
+ goto out_free_page_mem;
+
+ p += PAGE_SIZE;
+ }
+
+ /* Create scatterlist of file pages to use for DMA mapping later */
+ if (sg_alloc_table_from_pages(&lm_file->sg_table, pages, npages, 0,
+ size, GFP_KERNEL))
+ goto out_free_page_mem;
+
+ lm_file->size = size;
+ lm_file->pages = pages;
+ lm_file->npages = npages;
+ lm_file->page_mem = page_mem;
+ lm_file->alloc_size = npages * PAGE_SIZE;
+
+ return lm_file;
+
+out_free_page_mem:
+ kvfree(page_mem);
+out_free_pages_array:
+ kfree(pages);
+out_put_file:
+ fput(lm_file->filep);
+ mutex_destroy(&lm_file->lock);
+out_free_file:
+ kfree(lm_file);
+
+ return NULL;
+}
+
+static void pds_vfio_put_lm_file(struct pds_vfio_lm_file *lm_file)
+{
+ mutex_lock(&lm_file->lock);
+
+ lm_file->size = 0;
+ lm_file->alloc_size = 0;
+
+ /* Free scatter list of file pages */
+ sg_free_table(&lm_file->sg_table);
+
+ kvfree(lm_file->page_mem);
+ lm_file->page_mem = NULL;
+ kfree(lm_file->pages);
+ lm_file->pages = NULL;
+
+ mutex_unlock(&lm_file->lock);
+
+ /* allow file to be released since we are done with it */
+ fput(lm_file->filep);
+}
+
+void pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio)
+{
+ if (!pds_vfio->save_file)
+ return;
+
+ pds_vfio_put_lm_file(pds_vfio->save_file);
+ pds_vfio->save_file = NULL;
+}
+
+void pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio)
+{
+ if (!pds_vfio->restore_file)
+ return;
+
+ pds_vfio_put_lm_file(pds_vfio->restore_file);
+ pds_vfio->restore_file = NULL;
+}
+
+static struct page *pds_vfio_get_file_page(struct pds_vfio_lm_file *lm_file,
+ unsigned long offset)
+{
+ unsigned long cur_offset = 0;
+ struct scatterlist *sg;
+ unsigned int i;
+
+ /* All accesses are sequential */
+ if (offset < lm_file->last_offset || !lm_file->last_offset_sg) {
+ lm_file->last_offset = 0;
+ lm_file->last_offset_sg = lm_file->sg_table.sgl;
+ lm_file->sg_last_entry = 0;
+ }
+
+ cur_offset = lm_file->last_offset;
+
+ for_each_sg(lm_file->last_offset_sg, sg,
+ lm_file->sg_table.orig_nents - lm_file->sg_last_entry, i) {
+ if (offset < sg->length + cur_offset) {
+ lm_file->last_offset_sg = sg;
+ lm_file->sg_last_entry += i;
+ lm_file->last_offset = cur_offset;
+ return nth_page(sg_page(sg),
+ (offset - cur_offset) / PAGE_SIZE);
+ }
+ cur_offset += sg->length;
+ }
+
+ return NULL;
+}
+
+static int pds_vfio_release_file(struct inode *inode, struct file *filp)
+{
+ struct pds_vfio_lm_file *lm_file = filp->private_data;
+
+ mutex_lock(&lm_file->lock);
+ lm_file->filep->f_pos = 0;
+ lm_file->size = 0;
+ mutex_unlock(&lm_file->lock);
+ mutex_destroy(&lm_file->lock);
+ kfree(lm_file);
+
+ return 0;
+}
+
+static ssize_t pds_vfio_save_read(struct file *filp, char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct pds_vfio_lm_file *lm_file = filp->private_data;
+ ssize_t done = 0;
+
+ if (pos)
+ return -ESPIPE;
+ pos = &filp->f_pos;
+
+ mutex_lock(&lm_file->lock);
+ if (*pos > lm_file->size) {
+ done = -EINVAL;
+ goto out_unlock;
+ }
+
+ len = min_t(size_t, lm_file->size - *pos, len);
+ while (len) {
+ size_t page_offset;
+ struct page *page;
+ size_t page_len;
+ u8 *from_buff;
+ int err;
+
+ page_offset = (*pos) % PAGE_SIZE;
+ page = pds_vfio_get_file_page(lm_file, *pos - page_offset);
+ if (!page) {
+ if (done == 0)
+ done = -EINVAL;
+ goto out_unlock;
+ }
+
+ page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
+ from_buff = kmap_local_page(page);
+ err = copy_to_user(buf, from_buff + page_offset, page_len);
+ kunmap_local(from_buff);
+ if (err) {
+ done = -EFAULT;
+ goto out_unlock;
+ }
+ *pos += page_len;
+ len -= page_len;
+ done += page_len;
+ buf += page_len;
+ }
+
+out_unlock:
+ mutex_unlock(&lm_file->lock);
+ return done;
+}
+
+static const struct file_operations pds_vfio_save_fops = {
+ .owner = THIS_MODULE,
+ .read = pds_vfio_save_read,
+ .release = pds_vfio_release_file,
+ .llseek = no_llseek,
+};
+
+static int pds_vfio_get_save_file(struct pds_vfio_pci_device *pds_vfio)
+{
+ struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
+ struct pds_vfio_lm_file *lm_file;
+ u64 size;
+ int err;
+
+ /* Get live migration state size in this state */
+ err = pds_vfio_get_lm_state_size_cmd(pds_vfio, &size);
+ if (err) {
+ dev_err(dev, "failed to get save status: %pe\n", ERR_PTR(err));
+ return err;
+ }
+
+ dev_dbg(dev, "save status, size = %lld\n", size);
+
+ if (!size) {
+ dev_err(dev, "invalid state size\n");
+ return -EIO;
+ }
+
+ lm_file = pds_vfio_get_lm_file(&pds_vfio_save_fops, O_RDONLY, size);
+ if (!lm_file) {
+ dev_err(dev, "failed to create save file\n");
+ return -ENOENT;
+ }
+
+ dev_dbg(dev, "size = %lld, alloc_size = %lld, npages = %lld\n",
+ lm_file->size, lm_file->alloc_size, lm_file->npages);
+
+ pds_vfio->save_file = lm_file;
+
+ return 0;
+}
+
+static ssize_t pds_vfio_restore_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct pds_vfio_lm_file *lm_file = filp->private_data;
+ loff_t requested_length;
+ ssize_t done = 0;
+
+ if (pos)
+ return -ESPIPE;
+
+ pos = &filp->f_pos;
+
+ if (*pos < 0 ||
+ check_add_overflow((loff_t)len, *pos, &requested_length))
+ return -EINVAL;
+
+ mutex_lock(&lm_file->lock);
+
+ while (len) {
+ size_t page_offset;
+ struct page *page;
+ size_t page_len;
+ u8 *to_buff;
+ int err;
+
+ page_offset = (*pos) % PAGE_SIZE;
+ page = pds_vfio_get_file_page(lm_file, *pos - page_offset);
+ if (!page) {
+ if (done == 0)
+ done = -EINVAL;
+ goto out_unlock;
+ }
+
+ page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
+ to_buff = kmap_local_page(page);
+ err = copy_from_user(to_buff + page_offset, buf, page_len);
+ kunmap_local(to_buff);
+ if (err) {
+ done = -EFAULT;
+ goto out_unlock;
+ }
+ *pos += page_len;
+ len -= page_len;
+ done += page_len;
+ buf += page_len;
+ lm_file->size += page_len;
+ }
+out_unlock:
+ mutex_unlock(&lm_file->lock);
+ return done;
+}
+
+static const struct file_operations pds_vfio_restore_fops = {
+ .owner = THIS_MODULE,
+ .write = pds_vfio_restore_write,
+ .release = pds_vfio_release_file,
+ .llseek = no_llseek,
+};
+
+static int pds_vfio_get_restore_file(struct pds_vfio_pci_device *pds_vfio)
+{
+ struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
+ struct pds_vfio_lm_file *lm_file;
+ u64 size;
+
+ size = sizeof(union pds_lm_dev_state);
+ dev_dbg(dev, "restore status, size = %lld\n", size);
+
+ if (!size) {
+ dev_err(dev, "invalid state size");
+ return -EIO;
+ }
+
+ lm_file = pds_vfio_get_lm_file(&pds_vfio_restore_fops, O_WRONLY, size);
+ if (!lm_file) {
+ dev_err(dev, "failed to create restore file");
+ return -ENOENT;
+ }
+ pds_vfio->restore_file = lm_file;
+
+ return 0;
+}
+
+struct file *
+pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio,
+ enum vfio_device_mig_state next)
+{
+ enum vfio_device_mig_state cur = pds_vfio->state;
+ int err;
+
+ if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_STOP_COPY) {
+ err = pds_vfio_get_save_file(pds_vfio);
+ if (err)
+ return ERR_PTR(err);
+
+ err = pds_vfio_get_lm_state_cmd(pds_vfio);
+ if (err) {
+ pds_vfio_put_save_file(pds_vfio);
+ return ERR_PTR(err);
+ }
+
+ return pds_vfio->save_file->filep;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP_COPY && next == VFIO_DEVICE_STATE_STOP) {
+ pds_vfio_put_save_file(pds_vfio);
+ pds_vfio_dirty_disable(pds_vfio, true);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RESUMING) {
+ err = pds_vfio_get_restore_file(pds_vfio);
+ if (err)
+ return ERR_PTR(err);
+
+ return pds_vfio->restore_file->filep;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RESUMING && next == VFIO_DEVICE_STATE_STOP) {
+ err = pds_vfio_set_lm_state_cmd(pds_vfio);
+ if (err)
+ return ERR_PTR(err);
+
+ pds_vfio_put_restore_file(pds_vfio);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RUNNING && next == VFIO_DEVICE_STATE_RUNNING_P2P) {
+ pds_vfio_send_host_vf_lm_status_cmd(pds_vfio,
+ PDS_LM_STA_IN_PROGRESS);
+ err = pds_vfio_suspend_device_cmd(pds_vfio,
+ PDS_LM_SUSPEND_RESUME_TYPE_P2P);
+ if (err)
+ return ERR_PTR(err);
+
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_RUNNING) {
+ err = pds_vfio_resume_device_cmd(pds_vfio,
+ PDS_LM_SUSPEND_RESUME_TYPE_FULL);
+ if (err)
+ return ERR_PTR(err);
+
+ pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RUNNING_P2P) {
+ err = pds_vfio_resume_device_cmd(pds_vfio,
+ PDS_LM_SUSPEND_RESUME_TYPE_P2P);
+ if (err)
+ return ERR_PTR(err);
+
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_STOP) {
+ err = pds_vfio_suspend_device_cmd(pds_vfio,
+ PDS_LM_SUSPEND_RESUME_TYPE_FULL);
+ if (err)
+ return ERR_PTR(err);
+ return NULL;
+ }
+
+ return ERR_PTR(-EINVAL);
+}
diff --git a/drivers/vfio/pci/pds/lm.h b/drivers/vfio/pci/pds/lm.h
new file mode 100644
index 000000000000..13be893198b7
--- /dev/null
+++ b/drivers/vfio/pci/pds/lm.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _LM_H_
+#define _LM_H_
+
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/scatterlist.h>
+#include <linux/types.h>
+
+#include <linux/pds/pds_common.h>
+#include <linux/pds/pds_adminq.h>
+
+struct pds_vfio_lm_file {
+ struct file *filep;
+ struct mutex lock; /* protect live migration data file */
+ u64 size; /* Size with valid data */
+ u64 alloc_size; /* Total allocated size. Always >= len */
+ void *page_mem; /* memory allocated for pages */
+ struct page **pages; /* Backing pages for file */
+ unsigned long long npages;
+ struct sg_table sg_table; /* SG table for backing pages */
+ struct pds_lm_sg_elem *sgl; /* DMA mapping */
+ dma_addr_t sgl_addr;
+ u16 num_sge;
+ struct scatterlist *last_offset_sg; /* Iterator */
+ unsigned int sg_last_entry;
+ unsigned long last_offset;
+};
+
+struct pds_vfio_pci_device;
+
+struct file *
+pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio,
+ enum vfio_device_mig_state next);
+
+void pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio);
+
+#endif /* _LM_H_ */
diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c
new file mode 100644
index 000000000000..ab4b5958e413
--- /dev/null
+++ b/drivers/vfio/pci/pds/pci_drv.c
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/types.h>
+#include <linux/vfio.h>
+
+#include <linux/pds/pds_common.h>
+#include <linux/pds/pds_core_if.h>
+#include <linux/pds/pds_adminq.h>
+
+#include "vfio_dev.h"
+#include "pci_drv.h"
+#include "cmds.h"
+
+#define PDS_VFIO_DRV_DESCRIPTION "AMD/Pensando VFIO Device Driver"
+#define PCI_VENDOR_ID_PENSANDO 0x1dd8
+
+static void pds_vfio_recovery(struct pds_vfio_pci_device *pds_vfio)
+{
+ bool deferred_reset_needed = false;
+
+ /*
+ * Documentation states that the kernel migration driver must not
+ * generate asynchronous device state transitions outside of
+ * manipulation by the user or the VFIO_DEVICE_RESET ioctl.
+ *
+ * Since recovery is an asynchronous event received from the device,
+ * initiate a deferred reset. Issue a deferred reset in the following
+ * situations:
+ * 1. Migration is in progress, which will cause the next step of
+ * the migration to fail.
+ * 2. If the device is in a state that will be set to
+ * VFIO_DEVICE_STATE_RUNNING on the next action (i.e. VM is
+ * shutdown and device is in VFIO_DEVICE_STATE_STOP).
+ */
+ mutex_lock(&pds_vfio->state_mutex);
+ if ((pds_vfio->state != VFIO_DEVICE_STATE_RUNNING &&
+ pds_vfio->state != VFIO_DEVICE_STATE_ERROR) ||
+ (pds_vfio->state == VFIO_DEVICE_STATE_RUNNING &&
+ pds_vfio_dirty_is_enabled(pds_vfio)))
+ deferred_reset_needed = true;
+ mutex_unlock(&pds_vfio->state_mutex);
+
+ /*
+ * On the next user initiated state transition, the device will
+ * transition to the VFIO_DEVICE_STATE_ERROR. At this point it's the user's
+ * responsibility to reset the device.
+ *
+ * If a VFIO_DEVICE_RESET is requested post recovery and before the next
+ * state transition, then the deferred reset state will be set to
+ * VFIO_DEVICE_STATE_RUNNING.
+ */
+ if (deferred_reset_needed) {
+ spin_lock(&pds_vfio->reset_lock);
+ pds_vfio->deferred_reset = true;
+ pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_ERROR;
+ spin_unlock(&pds_vfio->reset_lock);
+ }
+}
+
+static int pds_vfio_pci_notify_handler(struct notifier_block *nb,
+ unsigned long ecode, void *data)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(nb, struct pds_vfio_pci_device, nb);
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_notifyq_comp *event = data;
+
+ dev_dbg(dev, "%s: event code %lu\n", __func__, ecode);
+
+ /*
+ * We don't need to do anything for RESET state==0 as there is no notify
+ * or feedback mechanism available, and it is possible that we won't
+ * even see a state==0 event since the pds_core recovery is pending.
+ *
+ * Any requests from VFIO while state==0 will fail, which will return
+ * error and may cause migration to fail.
+ */
+ if (ecode == PDS_EVENT_RESET) {
+ dev_info(dev, "%s: PDS_EVENT_RESET event received, state==%d\n",
+ __func__, event->reset.state);
+ /*
+ * pds_core device finished recovery and sent us the
+ * notification (state == 1) to allow us to recover
+ */
+ if (event->reset.state == 1)
+ pds_vfio_recovery(pds_vfio);
+ }
+
+ return 0;
+}
+
+static int
+pds_vfio_pci_register_event_handler(struct pds_vfio_pci_device *pds_vfio)
+{
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ struct notifier_block *nb = &pds_vfio->nb;
+ int err;
+
+ if (!nb->notifier_call) {
+ nb->notifier_call = pds_vfio_pci_notify_handler;
+ err = pdsc_register_notify(nb);
+ if (err) {
+ nb->notifier_call = NULL;
+ dev_err(dev,
+ "failed to register pds event handler: %pe\n",
+ ERR_PTR(err));
+ return -EINVAL;
+ }
+ dev_dbg(dev, "pds event handler registered\n");
+ }
+
+ return 0;
+}
+
+static void
+pds_vfio_pci_unregister_event_handler(struct pds_vfio_pci_device *pds_vfio)
+{
+ if (pds_vfio->nb.notifier_call) {
+ pdsc_unregister_notify(&pds_vfio->nb);
+ pds_vfio->nb.notifier_call = NULL;
+ }
+}
+
+static int pds_vfio_pci_probe(struct pci_dev *pdev,
+ const struct pci_device_id *id)
+{
+ struct pds_vfio_pci_device *pds_vfio;
+ int err;
+
+ pds_vfio = vfio_alloc_device(pds_vfio_pci_device, vfio_coredev.vdev,
+ &pdev->dev, pds_vfio_ops_info());
+ if (IS_ERR(pds_vfio))
+ return PTR_ERR(pds_vfio);
+
+ dev_set_drvdata(&pdev->dev, &pds_vfio->vfio_coredev);
+
+ err = vfio_pci_core_register_device(&pds_vfio->vfio_coredev);
+ if (err)
+ goto out_put_vdev;
+
+ err = pds_vfio_register_client_cmd(pds_vfio);
+ if (err) {
+ dev_err(&pdev->dev, "failed to register as client: %pe\n",
+ ERR_PTR(err));
+ goto out_unregister_coredev;
+ }
+
+ err = pds_vfio_pci_register_event_handler(pds_vfio);
+ if (err)
+ goto out_unregister_client;
+
+ return 0;
+
+out_unregister_client:
+ pds_vfio_unregister_client_cmd(pds_vfio);
+out_unregister_coredev:
+ vfio_pci_core_unregister_device(&pds_vfio->vfio_coredev);
+out_put_vdev:
+ vfio_put_device(&pds_vfio->vfio_coredev.vdev);
+ return err;
+}
+
+static void pds_vfio_pci_remove(struct pci_dev *pdev)
+{
+ struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev);
+
+ pds_vfio_pci_unregister_event_handler(pds_vfio);
+ pds_vfio_unregister_client_cmd(pds_vfio);
+ vfio_pci_core_unregister_device(&pds_vfio->vfio_coredev);
+ vfio_put_device(&pds_vfio->vfio_coredev.vdev);
+}
+
+static const struct pci_device_id pds_vfio_pci_table[] = {
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_PENSANDO, 0x1003) }, /* Ethernet VF */
+ { 0, }
+};
+MODULE_DEVICE_TABLE(pci, pds_vfio_pci_table);
+
+static void pds_vfio_pci_aer_reset_done(struct pci_dev *pdev)
+{
+ struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev);
+
+ pds_vfio_reset(pds_vfio);
+}
+
+static const struct pci_error_handlers pds_vfio_pci_err_handlers = {
+ .reset_done = pds_vfio_pci_aer_reset_done,
+ .error_detected = vfio_pci_core_aer_err_detected,
+};
+
+static struct pci_driver pds_vfio_pci_driver = {
+ .name = KBUILD_MODNAME,
+ .id_table = pds_vfio_pci_table,
+ .probe = pds_vfio_pci_probe,
+ .remove = pds_vfio_pci_remove,
+ .err_handler = &pds_vfio_pci_err_handlers,
+ .driver_managed_dma = true,
+};
+
+module_pci_driver(pds_vfio_pci_driver);
+
+MODULE_DESCRIPTION(PDS_VFIO_DRV_DESCRIPTION);
+MODULE_AUTHOR("Brett Creeley <brett.creeley@amd.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/vfio/pci/pds/pci_drv.h b/drivers/vfio/pci/pds/pci_drv.h
new file mode 100644
index 000000000000..e79bed12ed14
--- /dev/null
+++ b/drivers/vfio/pci/pds/pci_drv.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _PCI_DRV_H
+#define _PCI_DRV_H
+
+#include <linux/pci.h>
+
+#endif /* _PCI_DRV_H */
diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c
new file mode 100644
index 000000000000..b46174f5eb09
--- /dev/null
+++ b/drivers/vfio/pci/pds/vfio_dev.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#include <linux/vfio.h>
+#include <linux/vfio_pci_core.h>
+
+#include "lm.h"
+#include "dirty.h"
+#include "vfio_dev.h"
+
+struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio)
+{
+ return pds_vfio->vfio_coredev.pdev;
+}
+
+struct device *pds_vfio_to_dev(struct pds_vfio_pci_device *pds_vfio)
+{
+ return &pds_vfio_to_pci_dev(pds_vfio)->dev;
+}
+
+struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev)
+{
+ struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
+
+ return container_of(core_device, struct pds_vfio_pci_device,
+ vfio_coredev);
+}
+
+void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
+{
+again:
+ spin_lock(&pds_vfio->reset_lock);
+ if (pds_vfio->deferred_reset) {
+ pds_vfio->deferred_reset = false;
+ if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) {
+ pds_vfio_put_restore_file(pds_vfio);
+ pds_vfio_put_save_file(pds_vfio);
+ pds_vfio_dirty_disable(pds_vfio, false);
+ }
+ pds_vfio->state = pds_vfio->deferred_reset_state;
+ pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
+ spin_unlock(&pds_vfio->reset_lock);
+ goto again;
+ }
+ mutex_unlock(&pds_vfio->state_mutex);
+ spin_unlock(&pds_vfio->reset_lock);
+}
+
+void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio)
+{
+ spin_lock(&pds_vfio->reset_lock);
+ pds_vfio->deferred_reset = true;
+ pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
+ if (!mutex_trylock(&pds_vfio->state_mutex)) {
+ spin_unlock(&pds_vfio->reset_lock);
+ return;
+ }
+ spin_unlock(&pds_vfio->reset_lock);
+ pds_vfio_state_mutex_unlock(pds_vfio);
+}
+
+static struct file *
+pds_vfio_set_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state new_state)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(vdev, struct pds_vfio_pci_device,
+ vfio_coredev.vdev);
+ struct file *res = NULL;
+
+ mutex_lock(&pds_vfio->state_mutex);
+ /*
+ * only way to transition out of VFIO_DEVICE_STATE_ERROR is via
+ * VFIO_DEVICE_RESET, so prevent the state machine from running since
+ * vfio_mig_get_next_state() will throw a WARN_ON() when transitioning
+ * from VFIO_DEVICE_STATE_ERROR to any other state
+ */
+ while (pds_vfio->state != VFIO_DEVICE_STATE_ERROR &&
+ new_state != pds_vfio->state) {
+ enum vfio_device_mig_state next_state;
+
+ int err = vfio_mig_get_next_state(vdev, pds_vfio->state,
+ new_state, &next_state);
+ if (err) {
+ res = ERR_PTR(err);
+ break;
+ }
+
+ res = pds_vfio_step_device_state_locked(pds_vfio, next_state);
+ if (IS_ERR(res))
+ break;
+
+ pds_vfio->state = next_state;
+
+ if (WARN_ON(res && new_state != pds_vfio->state)) {
+ res = ERR_PTR(-EINVAL);
+ break;
+ }
+ }
+ pds_vfio_state_mutex_unlock(pds_vfio);
+ /* still waiting on a deferred_reset */
+ if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR)
+ res = ERR_PTR(-EIO);
+
+ return res;
+}
+
+static int pds_vfio_get_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state *current_state)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(vdev, struct pds_vfio_pci_device,
+ vfio_coredev.vdev);
+
+ mutex_lock(&pds_vfio->state_mutex);
+ *current_state = pds_vfio->state;
+ pds_vfio_state_mutex_unlock(pds_vfio);
+ return 0;
+}
+
+static int pds_vfio_get_device_state_size(struct vfio_device *vdev,
+ unsigned long *stop_copy_length)
+{
+ *stop_copy_length = PDS_LM_DEVICE_STATE_LENGTH;
+ return 0;
+}
+
+static const struct vfio_migration_ops pds_vfio_lm_ops = {
+ .migration_set_state = pds_vfio_set_device_state,
+ .migration_get_state = pds_vfio_get_device_state,
+ .migration_get_data_size = pds_vfio_get_device_state_size
+};
+
+static const struct vfio_log_ops pds_vfio_log_ops = {
+ .log_start = pds_vfio_dma_logging_start,
+ .log_stop = pds_vfio_dma_logging_stop,
+ .log_read_and_clear = pds_vfio_dma_logging_report,
+};
+
+static int pds_vfio_init_device(struct vfio_device *vdev)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(vdev, struct pds_vfio_pci_device,
+ vfio_coredev.vdev);
+ struct pci_dev *pdev = to_pci_dev(vdev->dev);
+ int err, vf_id, pci_id;
+
+ vf_id = pci_iov_vf_id(pdev);
+ if (vf_id < 0)
+ return vf_id;
+
+ err = vfio_pci_core_init_dev(vdev);
+ if (err)
+ return err;
+
+ pds_vfio->vf_id = vf_id;
+
+ vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
+ vdev->mig_ops = &pds_vfio_lm_ops;
+ vdev->log_ops = &pds_vfio_log_ops;
+
+ pci_id = PCI_DEVID(pdev->bus->number, pdev->devfn);
+ dev_dbg(&pdev->dev,
+ "%s: PF %#04x VF %#04x vf_id %d domain %d pds_vfio %p\n",
+ __func__, pci_dev_id(pdev->physfn), pci_id, vf_id,
+ pci_domain_nr(pdev->bus), pds_vfio);
+
+ return 0;
+}
+
+static int pds_vfio_open_device(struct vfio_device *vdev)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(vdev, struct pds_vfio_pci_device,
+ vfio_coredev.vdev);
+ int err;
+
+ err = vfio_pci_core_enable(&pds_vfio->vfio_coredev);
+ if (err)
+ return err;
+
+ mutex_init(&pds_vfio->state_mutex);
+ pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
+ pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
+
+ vfio_pci_core_finish_enable(&pds_vfio->vfio_coredev);
+
+ return 0;
+}
+
+static void pds_vfio_close_device(struct vfio_device *vdev)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(vdev, struct pds_vfio_pci_device,
+ vfio_coredev.vdev);
+
+ mutex_lock(&pds_vfio->state_mutex);
+ pds_vfio_put_restore_file(pds_vfio);
+ pds_vfio_put_save_file(pds_vfio);
+ pds_vfio_dirty_disable(pds_vfio, true);
+ mutex_unlock(&pds_vfio->state_mutex);
+ mutex_destroy(&pds_vfio->state_mutex);
+ vfio_pci_core_close_device(vdev);
+}
+
+static const struct vfio_device_ops pds_vfio_ops = {
+ .name = "pds-vfio",
+ .init = pds_vfio_init_device,
+ .release = vfio_pci_core_release_dev,
+ .open_device = pds_vfio_open_device,
+ .close_device = pds_vfio_close_device,
+ .ioctl = vfio_pci_core_ioctl,
+ .device_feature = vfio_pci_core_ioctl_feature,
+ .read = vfio_pci_core_read,
+ .write = vfio_pci_core_write,
+ .mmap = vfio_pci_core_mmap,
+ .request = vfio_pci_core_request,
+ .match = vfio_pci_core_match,
+ .bind_iommufd = vfio_iommufd_physical_bind,
+ .unbind_iommufd = vfio_iommufd_physical_unbind,
+ .attach_ioas = vfio_iommufd_physical_attach_ioas,
+};
+
+const struct vfio_device_ops *pds_vfio_ops_info(void)
+{
+ return &pds_vfio_ops;
+}
diff --git a/drivers/vfio/pci/pds/vfio_dev.h b/drivers/vfio/pci/pds/vfio_dev.h
new file mode 100644
index 000000000000..b8f2d667608f
--- /dev/null
+++ b/drivers/vfio/pci/pds/vfio_dev.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _VFIO_DEV_H_
+#define _VFIO_DEV_H_
+
+#include <linux/pci.h>
+#include <linux/vfio_pci_core.h>
+
+#include "dirty.h"
+#include "lm.h"
+
+struct pds_vfio_pci_device {
+ struct vfio_pci_core_device vfio_coredev;
+
+ struct pds_vfio_lm_file *save_file;
+ struct pds_vfio_lm_file *restore_file;
+ struct pds_vfio_dirty dirty;
+ struct mutex state_mutex; /* protect migration state */
+ enum vfio_device_mig_state state;
+ spinlock_t reset_lock; /* protect reset_done flow */
+ u8 deferred_reset;
+ enum vfio_device_mig_state deferred_reset_state;
+ struct notifier_block nb;
+
+ int vf_id;
+ u16 client_id;
+};
+
+void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio);
+
+const struct vfio_device_ops *pds_vfio_ops_info(void);
+struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev);
+void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio);
+
+struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio);
+struct device *pds_vfio_to_dev(struct pds_vfio_pci_device *pds_vfio);
+
+#endif /* _VFIO_DEV_H_ */
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 29091ee2e984..cb5b7f865d58 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -141,6 +141,7 @@ static const struct vfio_device_ops vfio_pci_ops = {
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
};
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 20d7b69ea6ff..1929103ee59a 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -27,6 +27,7 @@
#include <linux/vgaarb.h>
#include <linux/nospec.h>
#include <linux/sched/mm.h>
+#include <linux/iommufd.h>
#if IS_ENABLED(CONFIG_EEH)
#include <asm/eeh.h>
#endif
@@ -180,7 +181,8 @@ no_mmap:
struct vfio_pci_group_info;
static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
- struct vfio_pci_group_info *groups);
+ struct vfio_pci_group_info *groups,
+ struct iommufd_ctx *iommufd_ctx);
/*
* INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
@@ -776,29 +778,65 @@ static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
}
struct vfio_pci_fill_info {
- int max;
- int cur;
- struct vfio_pci_dependent_device *devices;
+ struct vfio_pci_dependent_device __user *devices;
+ struct vfio_pci_dependent_device __user *devices_end;
+ struct vfio_device *vdev;
+ u32 count;
+ u32 flags;
};
static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
{
+ struct vfio_pci_dependent_device info = {
+ .segment = pci_domain_nr(pdev->bus),
+ .bus = pdev->bus->number,
+ .devfn = pdev->devfn,
+ };
struct vfio_pci_fill_info *fill = data;
- struct iommu_group *iommu_group;
- if (fill->cur == fill->max)
- return -EAGAIN; /* Something changed, try again */
+ fill->count++;
+ if (fill->devices >= fill->devices_end)
+ return 0;
+
+ if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) {
+ struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev);
+ struct vfio_device_set *dev_set = fill->vdev->dev_set;
+ struct vfio_device *vdev;
+
+ /*
+ * hot-reset requires all affected devices be represented in
+ * the dev_set.
+ */
+ vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
+ if (!vdev) {
+ info.devid = VFIO_PCI_DEVID_NOT_OWNED;
+ } else {
+ int id = vfio_iommufd_get_dev_id(vdev, iommufd);
+
+ if (id > 0)
+ info.devid = id;
+ else if (id == -ENOENT)
+ info.devid = VFIO_PCI_DEVID_OWNED;
+ else
+ info.devid = VFIO_PCI_DEVID_NOT_OWNED;
+ }
+ /* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */
+ if (info.devid == VFIO_PCI_DEVID_NOT_OWNED)
+ fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
+ } else {
+ struct iommu_group *iommu_group;
+
+ iommu_group = iommu_group_get(&pdev->dev);
+ if (!iommu_group)
+ return -EPERM; /* Cannot reset non-isolated devices */
- iommu_group = iommu_group_get(&pdev->dev);
- if (!iommu_group)
- return -EPERM; /* Cannot reset non-isolated devices */
+ info.group_id = iommu_group_id(iommu_group);
+ iommu_group_put(iommu_group);
+ }
- fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
- fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
- fill->devices[fill->cur].bus = pdev->bus->number;
- fill->devices[fill->cur].devfn = pdev->devfn;
- fill->cur++;
- iommu_group_put(iommu_group);
+ if (copy_to_user(fill->devices, &info, sizeof(info)))
+ return -EFAULT;
+ fill->devices++;
return 0;
}
@@ -920,24 +958,17 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev,
struct vfio_device_info __user *arg)
{
unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs);
- struct vfio_device_info info;
+ struct vfio_device_info info = {};
struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
- unsigned long capsz;
int ret;
- /* For backward compatibility, cannot require this */
- capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
-
if (copy_from_user(&info, arg, minsz))
return -EFAULT;
if (info.argsz < minsz)
return -EINVAL;
- if (info.argsz >= capsz) {
- minsz = capsz;
- info.cap_offset = 0;
- }
+ minsz = min_t(size_t, info.argsz, sizeof(info));
info.flags = VFIO_DEVICE_FLAGS_PCI;
@@ -1228,8 +1259,7 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
unsigned long minsz =
offsetofend(struct vfio_pci_hot_reset_info, count);
struct vfio_pci_hot_reset_info hdr;
- struct vfio_pci_fill_info fill = { 0 };
- struct vfio_pci_dependent_device *devices = NULL;
+ struct vfio_pci_fill_info fill = {};
bool slot = false;
int ret = 0;
@@ -1247,78 +1277,42 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
else if (pci_probe_reset_bus(vdev->pdev->bus))
return -ENODEV;
- /* How many devices are affected? */
- ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
- &fill.max, slot);
- if (ret)
- return ret;
+ fill.devices = arg->devices;
+ fill.devices_end = arg->devices +
+ (hdr.argsz - sizeof(hdr)) / sizeof(arg->devices[0]);
+ fill.vdev = &vdev->vdev;
- WARN_ON(!fill.max); /* Should always be at least one */
-
- /*
- * If there's enough space, fill it now, otherwise return -ENOSPC and
- * the number of devices affected.
- */
- if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
- ret = -ENOSPC;
- hdr.count = fill.max;
- goto reset_info_exit;
- }
-
- devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
- if (!devices)
- return -ENOMEM;
-
- fill.devices = devices;
+ if (vfio_device_cdev_opened(&vdev->vdev))
+ fill.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID |
+ VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
+ mutex_lock(&vdev->vdev.dev_set->lock);
ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs,
&fill, slot);
+ mutex_unlock(&vdev->vdev.dev_set->lock);
+ if (ret)
+ return ret;
- /*
- * If a device was removed between counting and filling, we may come up
- * short of fill.max. If a device was added, we'll have a return of
- * -EAGAIN above.
- */
- if (!ret)
- hdr.count = fill.cur;
-
-reset_info_exit:
+ hdr.count = fill.count;
+ hdr.flags = fill.flags;
if (copy_to_user(arg, &hdr, minsz))
- ret = -EFAULT;
-
- if (!ret) {
- if (copy_to_user(&arg->devices, devices,
- hdr.count * sizeof(*devices)))
- ret = -EFAULT;
- }
+ return -EFAULT;
- kfree(devices);
- return ret;
+ if (fill.count > fill.devices - arg->devices)
+ return -ENOSPC;
+ return 0;
}
-static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
- struct vfio_pci_hot_reset __user *arg)
+static int
+vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
+ int array_count, bool slot,
+ struct vfio_pci_hot_reset __user *arg)
{
- unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count);
- struct vfio_pci_hot_reset hdr;
int32_t *group_fds;
struct file **files;
struct vfio_pci_group_info info;
- bool slot = false;
int file_idx, count = 0, ret = 0;
- if (copy_from_user(&hdr, arg, minsz))
- return -EFAULT;
-
- if (hdr.argsz < minsz || hdr.flags)
- return -EINVAL;
-
- /* Can we do a slot or bus reset or neither? */
- if (!pci_probe_reset_slot(vdev->pdev->slot))
- slot = true;
- else if (pci_probe_reset_bus(vdev->pdev->bus))
- return -ENODEV;
-
/*
* We can't let userspace give us an arbitrarily large buffer to copy,
* so verify how many we think there could be. Note groups can have
@@ -1329,12 +1323,11 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
if (ret)
return ret;
- /* Somewhere between 1 and count is OK */
- if (!hdr.count || hdr.count > count)
+ if (array_count > count)
return -EINVAL;
- group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
- files = kcalloc(hdr.count, sizeof(*files), GFP_KERNEL);
+ group_fds = kcalloc(array_count, sizeof(*group_fds), GFP_KERNEL);
+ files = kcalloc(array_count, sizeof(*files), GFP_KERNEL);
if (!group_fds || !files) {
kfree(group_fds);
kfree(files);
@@ -1342,18 +1335,17 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
}
if (copy_from_user(group_fds, arg->group_fds,
- hdr.count * sizeof(*group_fds))) {
+ array_count * sizeof(*group_fds))) {
kfree(group_fds);
kfree(files);
return -EFAULT;
}
/*
- * For each group_fd, get the group through the vfio external user
- * interface and store the group and iommu ID. This ensures the group
- * is held across the reset.
+ * Get the group file for each fd to ensure the group is held across
+ * the reset
*/
- for (file_idx = 0; file_idx < hdr.count; file_idx++) {
+ for (file_idx = 0; file_idx < array_count; file_idx++) {
struct file *file = fget(group_fds[file_idx]);
if (!file) {
@@ -1377,10 +1369,10 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
if (ret)
goto hot_reset_release;
- info.count = hdr.count;
+ info.count = array_count;
info.files = files;
- ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info);
+ ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL);
hot_reset_release:
for (file_idx--; file_idx >= 0; file_idx--)
@@ -1390,6 +1382,36 @@ hot_reset_release:
return ret;
}
+static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
+ struct vfio_pci_hot_reset __user *arg)
+{
+ unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count);
+ struct vfio_pci_hot_reset hdr;
+ bool slot = false;
+
+ if (copy_from_user(&hdr, arg, minsz))
+ return -EFAULT;
+
+ if (hdr.argsz < minsz || hdr.flags)
+ return -EINVAL;
+
+ /* zero-length array is only for cdev opened devices */
+ if (!!hdr.count == vfio_device_cdev_opened(&vdev->vdev))
+ return -EINVAL;
+
+ /* Can we do a slot or bus reset or neither? */
+ if (!pci_probe_reset_slot(vdev->pdev->slot))
+ slot = true;
+ else if (pci_probe_reset_bus(vdev->pdev->bus))
+ return -ENODEV;
+
+ if (hdr.count)
+ return vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg);
+
+ return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL,
+ vfio_iommufd_device_ictx(&vdev->vdev));
+}
+
static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
struct vfio_device_ioeventfd __user *arg)
{
@@ -2355,13 +2377,16 @@ const struct pci_error_handlers vfio_pci_core_err_handlers = {
};
EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers);
-static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev,
+static bool vfio_dev_in_groups(struct vfio_device *vdev,
struct vfio_pci_group_info *groups)
{
unsigned int i;
+ if (!groups)
+ return false;
+
for (i = 0; i < groups->count; i++)
- if (vfio_file_has_dev(groups->files[i], &vdev->vdev))
+ if (vfio_file_has_dev(groups->files[i], vdev))
return true;
return false;
}
@@ -2369,12 +2394,8 @@ static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev,
static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data)
{
struct vfio_device_set *dev_set = data;
- struct vfio_device *cur;
- list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
- if (cur->dev == &pdev->dev)
- return 0;
- return -EBUSY;
+ return vfio_find_device_in_devset(dev_set, &pdev->dev) ? 0 : -ENODEV;
}
/*
@@ -2441,7 +2462,8 @@ unwind:
* get each memory_lock.
*/
static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
- struct vfio_pci_group_info *groups)
+ struct vfio_pci_group_info *groups,
+ struct iommufd_ctx *iommufd_ctx)
{
struct vfio_pci_core_device *cur_mem;
struct vfio_pci_core_device *cur_vma;
@@ -2471,11 +2493,38 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
goto err_unlock;
list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
+ bool owned;
+
/*
- * Test whether all the affected devices are contained by the
- * set of groups provided by the user.
+ * Test whether all the affected devices can be reset by the
+ * user.
+ *
+ * If called from a group opened device and the user provides
+ * a set of groups, all the devices in the dev_set should be
+ * contained by the set of groups provided by the user.
+ *
+ * If called from a cdev opened device and the user provides
+ * a zero-length array, all the devices in the dev_set must
+ * be bound to the same iommufd_ctx as the input iommufd_ctx.
+ * If there is any device that has not been bound to any
+ * iommufd_ctx yet, check if its iommu_group has any device
+ * bound to the input iommufd_ctx. Such devices can be
+ * considered owned by the input iommufd_ctx as the device
+ * cannot be owned by another iommufd_ctx when its iommu_group
+ * is owned.
+ *
+ * Otherwise, reset is not allowed.
*/
- if (!vfio_dev_in_groups(cur_vma, groups)) {
+ if (iommufd_ctx) {
+ int devid = vfio_iommufd_get_dev_id(&cur_vma->vdev,
+ iommufd_ctx);
+
+ owned = (devid > 0 || devid == -ENOENT);
+ } else {
+ owned = vfio_dev_in_groups(&cur_vma->vdev, groups);
+ }
+
+ if (!owned) {
ret = -EINVAL;
goto err_undo;
}