Diffstat (limited to 'drivers/vfio/pci')
27 files changed, 3806 insertions, 1193 deletions
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 15821a2d77d2..c3bcb6911c53 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -7,10 +7,6 @@ config VFIO_PCI_CORE select VFIO_VIRQFD select IRQ_BYPASS_MANAGER -config VFIO_PCI_MMAP - def_bool y if !S390 - depends on VFIO_PCI_CORE - config VFIO_PCI_INTX def_bool y if !S390 depends on VFIO_PCI_CORE @@ -69,4 +65,6 @@ source "drivers/vfio/pci/virtio/Kconfig" source "drivers/vfio/pci/nvgrace-gpu/Kconfig" +source "drivers/vfio/pci/qat/Kconfig" + endmenu diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index ce7a61f1d912..cf00c0a7e55c 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -17,3 +17,5 @@ obj-$(CONFIG_PDS_VFIO_PCI) += pds/ obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/ obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu/ + +obj-$(CONFIG_QAT_VFIO_PCI) += qat/ diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 9a3e97108ace..2149f49aeec7 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -190,9 +190,10 @@ static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) int ret; /* Check VF state */ - if (unlikely(hisi_qm_wait_mb_ready(qm))) { + ret = hisi_qm_wait_mb_ready(qm); + if (unlikely(ret)) { dev_err(&qm->pdev->dev, "QM device is not ready to write\n"); - return -EBUSY; + return ret; } ret = qm_write_regs(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1); @@ -325,13 +326,15 @@ static void qm_dev_cmd_init(struct hisi_qm *qm) static int vf_qm_cache_wb(struct hisi_qm *qm) { unsigned int val; + int ret; writel(0x1, qm->io_base + QM_CACHE_WB_START); - if (readl_relaxed_poll_timeout(qm->io_base + QM_CACHE_WB_DONE, + ret = readl_relaxed_poll_timeout(qm->io_base + QM_CACHE_WB_DONE, val, val & BIT(0), MB_POLL_PERIOD_US, - MB_POLL_TIMEOUT_US)) { + MB_POLL_TIMEOUT_US); + if (ret) { dev_err(&qm->pdev->dev, "vf QM writeback sqc cache fail\n"); - return -EINVAL; + return ret; } return 0; @@ -350,6 +353,32 @@ static int vf_qm_func_stop(struct hisi_qm *qm) return hisi_qm_mb(qm, QM_MB_CMD_PAUSE_QM, 0, 0, 0); } +static int vf_qm_version_check(struct acc_vf_data *vf_data, struct device *dev) +{ + switch (vf_data->acc_magic) { + case ACC_DEV_MAGIC_V2: + if (vf_data->major_ver != ACC_DRV_MAJOR_VER) { + dev_info(dev, "migration driver version<%u.%u> not match!\n", + vf_data->major_ver, vf_data->minor_ver); + return -EINVAL; + } + break; + case ACC_DEV_MAGIC_V1: + /* Correct dma address */ + vf_data->eqe_dma = vf_data->qm_eqc_dw[QM_XQC_ADDR_HIGH]; + vf_data->eqe_dma <<= QM_XQC_ADDR_OFFSET; + vf_data->eqe_dma |= vf_data->qm_eqc_dw[QM_XQC_ADDR_LOW]; + vf_data->aeqe_dma = vf_data->qm_aeqc_dw[QM_XQC_ADDR_HIGH]; + vf_data->aeqe_dma <<= QM_XQC_ADDR_OFFSET; + vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[QM_XQC_ADDR_LOW]; + break; + default: + return -EINVAL; + } + + return 0; +} + static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, struct hisi_acc_vf_migration_file *migf) { @@ -363,9 +392,10 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, if (migf->total_length < QM_MATCH_SIZE || hisi_acc_vdev->match_done) return 0; - if (vf_data->acc_magic != ACC_DEV_MAGIC) { + ret = vf_qm_version_check(vf_data, dev); + if (ret) { dev_err(dev, "failed to match ACC_DEV_MAGIC\n"); - return -EINVAL; + return ret; } if (vf_data->dev_id != hisi_acc_vdev->vf_dev->device) { @@ -377,7 +407,7 @@ static int 
vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, ret = qm_get_vft(vf_qm, &vf_qm->qp_base); if (ret <= 0) { dev_err(dev, "failed to get vft qp nums\n"); - return -EINVAL; + return ret; } if (ret != vf_data->qp_num) { @@ -399,13 +429,6 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, return -EINVAL; } - ret = qm_write_regs(vf_qm, QM_VF_STATE, &vf_data->vf_qm_state, 1); - if (ret) { - dev_err(dev, "failed to write QM_VF_STATE\n"); - return ret; - } - - hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state; hisi_acc_vdev->match_done = true; return 0; } @@ -418,7 +441,9 @@ static int vf_qm_get_match_data(struct hisi_acc_vf_core_device *hisi_acc_vdev, int vf_id = hisi_acc_vdev->vf_id; int ret; - vf_data->acc_magic = ACC_DEV_MAGIC; + vf_data->acc_magic = ACC_DEV_MAGIC_V2; + vf_data->major_ver = ACC_DRV_MAJOR_VER; + vf_data->minor_ver = ACC_DRV_MINOR_VER; /* Save device id */ vf_data->dev_id = hisi_acc_vdev->vf_dev->device; @@ -441,6 +466,19 @@ static int vf_qm_get_match_data(struct hisi_acc_vf_core_device *hisi_acc_vdev, return 0; } +static void vf_qm_xeqc_save(struct hisi_qm *qm, + struct hisi_acc_vf_migration_file *migf) +{ + struct acc_vf_data *vf_data = &migf->vf_data; + u16 eq_head, aeq_head; + + eq_head = vf_data->qm_eqc_dw[0] & 0xFFFF; + qm_db(qm, 0, QM_DOORBELL_CMD_EQ, eq_head, 0); + + aeq_head = vf_data->qm_aeqc_dw[0] & 0xFFFF; + qm_db(qm, 0, QM_DOORBELL_CMD_AEQ, aeq_head, 0); +} + static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev, struct hisi_acc_vf_migration_file *migf) { @@ -456,6 +494,20 @@ static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev, if (migf->total_length < sizeof(struct acc_vf_data)) return -EINVAL; + if (!vf_data->eqe_dma || !vf_data->aeqe_dma || + !vf_data->sqc_dma || !vf_data->cqc_dma) { + dev_info(dev, "resume dma addr is NULL!\n"); + hisi_acc_vdev->vf_qm_state = QM_NOT_READY; + return 0; + } + + ret = qm_write_regs(qm, QM_VF_STATE, &vf_data->vf_qm_state, 1); + if (ret) { + dev_err(dev, "failed to write QM_VF_STATE\n"); + return ret; + } + hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state; + qm->eqe_dma = vf_data->eqe_dma; qm->aeqe_dma = vf_data->aeqe_dma; qm->sqc_dma = vf_data->sqc_dma; @@ -486,57 +538,65 @@ static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev, return 0; } -static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev, - struct hisi_acc_vf_migration_file *migf) +static int vf_qm_read_data(struct hisi_qm *vf_qm, struct acc_vf_data *vf_data) { - struct acc_vf_data *vf_data = &migf->vf_data; - struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm; struct device *dev = &vf_qm->pdev->dev; int ret; - if (unlikely(qm_wait_dev_not_ready(vf_qm))) { - /* Update state and return with match data */ - vf_data->vf_qm_state = QM_NOT_READY; - hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state; - migf->total_length = QM_MATCH_SIZE; - return 0; - } - - vf_data->vf_qm_state = QM_READY; - hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state; - - ret = vf_qm_cache_wb(vf_qm); - if (ret) { - dev_err(dev, "failed to writeback QM Cache!\n"); - return ret; - } - ret = qm_get_regs(vf_qm, vf_data); if (ret) - return -EINVAL; + return ret; /* Every reg is 32 bit, the dma address is 64 bit. 
*/ - vf_data->eqe_dma = vf_data->qm_eqc_dw[1]; + vf_data->eqe_dma = vf_data->qm_eqc_dw[QM_XQC_ADDR_HIGH]; vf_data->eqe_dma <<= QM_XQC_ADDR_OFFSET; - vf_data->eqe_dma |= vf_data->qm_eqc_dw[0]; - vf_data->aeqe_dma = vf_data->qm_aeqc_dw[1]; + vf_data->eqe_dma |= vf_data->qm_eqc_dw[QM_XQC_ADDR_LOW]; + vf_data->aeqe_dma = vf_data->qm_aeqc_dw[QM_XQC_ADDR_HIGH]; vf_data->aeqe_dma <<= QM_XQC_ADDR_OFFSET; - vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[0]; + vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[QM_XQC_ADDR_LOW]; /* Through SQC_BT/CQC_BT to get sqc and cqc address */ ret = qm_get_sqc(vf_qm, &vf_data->sqc_dma); if (ret) { dev_err(dev, "failed to read SQC addr!\n"); - return -EINVAL; + return ret; } ret = qm_get_cqc(vf_qm, &vf_data->cqc_dma); if (ret) { dev_err(dev, "failed to read CQC addr!\n"); - return -EINVAL; + return ret; } + return 0; +} + +static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev, + struct hisi_acc_vf_migration_file *migf) +{ + struct acc_vf_data *vf_data = &migf->vf_data; + struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm; + int ret; + + if (unlikely(qm_wait_dev_not_ready(vf_qm))) { + /* Update state and return with match data */ + vf_data->vf_qm_state = QM_NOT_READY; + hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state; + migf->total_length = QM_MATCH_SIZE; + return 0; + } + + vf_data->vf_qm_state = QM_READY; + hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state; + + ret = vf_qm_read_data(vf_qm, vf_data); + if (ret) + return ret; + migf->total_length = sizeof(struct acc_vf_data); + /* Save eqc and aeqc interrupt information */ + vf_qm_xeqc_save(vf_qm, migf); + return 0; } @@ -615,21 +675,43 @@ static void hisi_acc_vf_disable_fd(struct hisi_acc_vf_migration_file *migf) mutex_unlock(&migf->lock); } +static void +hisi_acc_debug_migf_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev, + struct hisi_acc_vf_migration_file *src_migf) +{ + struct hisi_acc_vf_migration_file *dst_migf = hisi_acc_vdev->debug_migf; + + if (!dst_migf) + return; + + dst_migf->total_length = src_migf->total_length; + memcpy(&dst_migf->vf_data, &src_migf->vf_data, + sizeof(struct acc_vf_data)); +} + static void hisi_acc_vf_disable_fds(struct hisi_acc_vf_core_device *hisi_acc_vdev) { if (hisi_acc_vdev->resuming_migf) { + hisi_acc_debug_migf_copy(hisi_acc_vdev, hisi_acc_vdev->resuming_migf); hisi_acc_vf_disable_fd(hisi_acc_vdev->resuming_migf); fput(hisi_acc_vdev->resuming_migf->filp); hisi_acc_vdev->resuming_migf = NULL; } if (hisi_acc_vdev->saving_migf) { + hisi_acc_debug_migf_copy(hisi_acc_vdev, hisi_acc_vdev->saving_migf); hisi_acc_vf_disable_fd(hisi_acc_vdev->saving_migf); fput(hisi_acc_vdev->saving_migf->filp); hisi_acc_vdev->saving_migf = NULL; } } +static struct hisi_acc_vf_core_device *hisi_acc_get_vf_dev(struct vfio_device *vdev) +{ + return container_of(vdev, struct hisi_acc_vf_core_device, + core_device.vdev); +} + static void hisi_acc_vf_reset(struct hisi_acc_vf_core_device *hisi_acc_vdev) { hisi_acc_vdev->vf_qm_state = QM_NOT_READY; @@ -723,7 +805,6 @@ static const struct file_operations hisi_acc_vf_resume_fops = { .owner = THIS_MODULE, .write = hisi_acc_vf_resume_write, .release = hisi_acc_vf_release_file, - .llseek = no_llseek, }; static struct hisi_acc_vf_migration_file * @@ -845,7 +926,6 @@ static const struct file_operations hisi_acc_vf_save_fops = { .unlocked_ioctl = hisi_acc_vf_precopy_ioctl, .compat_ioctl = compat_ptr_ioctl, .release = hisi_acc_vf_release_file, - .llseek = no_llseek, }; static struct hisi_acc_vf_migration_file * @@ -935,6 +1015,13 @@ static int 
hisi_acc_vf_stop_device(struct hisi_acc_vf_core_device *hisi_acc_vdev dev_err(dev, "failed to check QM INT state!\n"); return ret; } + + ret = vf_qm_cache_wb(vf_qm); + if (ret) { + dev_err(dev, "failed to writeback QM cache!\n"); + return ret; + } + return 0; } @@ -1033,8 +1120,7 @@ static struct file * hisi_acc_vfio_pci_set_device_state(struct vfio_device *vdev, enum vfio_device_mig_state new_state) { - struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(vdev, - struct hisi_acc_vf_core_device, core_device.vdev); + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev); enum vfio_device_mig_state next_state; struct file *res = NULL; int ret; @@ -1075,8 +1161,7 @@ static int hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev, enum vfio_device_mig_state *curr_state) { - struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(vdev, - struct hisi_acc_vf_core_device, core_device.vdev); + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev); mutex_lock(&hisi_acc_vdev->state_mutex); *curr_state = hisi_acc_vdev->mig_state; @@ -1278,10 +1363,132 @@ static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int return vfio_pci_core_ioctl(core_vdev, cmd, arg); } +static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev) +{ + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev); + struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm; + int ret; + + lockdep_assert_held(&hisi_acc_vdev->open_mutex); + /* + * When the device is not opened, the io_base is not mapped. + * The driver cannot perform device read and write operations. + */ + if (!hisi_acc_vdev->dev_opened) { + seq_puts(seq, "device not opened!\n"); + return -EINVAL; + } + + ret = qm_wait_dev_not_ready(vf_qm); + if (ret) { + seq_puts(seq, "VF device not ready!\n"); + return ret; + } + + return 0; +} + +static int hisi_acc_vf_debug_cmd(struct seq_file *seq, void *data) +{ + struct device *vf_dev = seq->private; + struct vfio_pci_core_device *core_device = dev_get_drvdata(vf_dev); + struct vfio_device *vdev = &core_device->vdev; + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev); + struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm; + u64 value; + int ret; + + mutex_lock(&hisi_acc_vdev->open_mutex); + ret = hisi_acc_vf_debug_check(seq, vdev); + if (ret) { + mutex_unlock(&hisi_acc_vdev->open_mutex); + return ret; + } + + value = readl(vf_qm->io_base + QM_MB_CMD_SEND_BASE); + if (value == QM_MB_CMD_NOT_READY) { + mutex_unlock(&hisi_acc_vdev->open_mutex); + seq_puts(seq, "mailbox cmd channel not ready!\n"); + return -EINVAL; + } + mutex_unlock(&hisi_acc_vdev->open_mutex); + seq_puts(seq, "mailbox cmd channel ready!\n"); + + return 0; +} + +static int hisi_acc_vf_dev_read(struct seq_file *seq, void *data) +{ + struct device *vf_dev = seq->private; + struct vfio_pci_core_device *core_device = dev_get_drvdata(vf_dev); + struct vfio_device *vdev = &core_device->vdev; + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev); + size_t vf_data_sz = offsetofend(struct acc_vf_data, padding); + struct acc_vf_data *vf_data; + int ret; + + mutex_lock(&hisi_acc_vdev->open_mutex); + ret = hisi_acc_vf_debug_check(seq, vdev); + if (ret) { + mutex_unlock(&hisi_acc_vdev->open_mutex); + return ret; + } + + mutex_lock(&hisi_acc_vdev->state_mutex); + vf_data = kzalloc(sizeof(*vf_data), GFP_KERNEL); + if (!vf_data) { + ret = -ENOMEM; + goto mutex_release; + } + + vf_data->vf_qm_state = hisi_acc_vdev->vf_qm_state; + 
ret = vf_qm_read_data(&hisi_acc_vdev->vf_qm, vf_data); + if (ret) + goto migf_err; + + seq_hex_dump(seq, "Dev Data:", DUMP_PREFIX_OFFSET, 16, 1, + (const void *)vf_data, vf_data_sz, false); + + seq_printf(seq, + "guest driver load: %u\n" + "data size: %lu\n", + hisi_acc_vdev->vf_qm_state, + sizeof(struct acc_vf_data)); + +migf_err: + kfree(vf_data); +mutex_release: + mutex_unlock(&hisi_acc_vdev->state_mutex); + mutex_unlock(&hisi_acc_vdev->open_mutex); + + return ret; +} + +static int hisi_acc_vf_migf_read(struct seq_file *seq, void *data) +{ + struct device *vf_dev = seq->private; + struct vfio_pci_core_device *core_device = dev_get_drvdata(vf_dev); + struct vfio_device *vdev = &core_device->vdev; + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev); + size_t vf_data_sz = offsetofend(struct acc_vf_data, padding); + struct hisi_acc_vf_migration_file *debug_migf = hisi_acc_vdev->debug_migf; + + /* Check whether the live migration operation has been performed */ + if (debug_migf->total_length < QM_MATCH_SIZE) { + seq_puts(seq, "device not migrated!\n"); + return -EAGAIN; + } + + seq_hex_dump(seq, "Mig Data:", DUMP_PREFIX_OFFSET, 16, 1, + (const void *)&debug_migf->vf_data, vf_data_sz, false); + seq_printf(seq, "migrate data length: %lu\n", debug_migf->total_length); + + return 0; +} + static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev) { - struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev, - struct hisi_acc_vf_core_device, core_device.vdev); + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(core_vdev); struct vfio_pci_core_device *vdev = &hisi_acc_vdev->core_device; int ret; @@ -1290,12 +1497,16 @@ static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev) return ret; if (core_vdev->mig_ops) { + mutex_lock(&hisi_acc_vdev->open_mutex); ret = hisi_acc_vf_qm_init(hisi_acc_vdev); if (ret) { + mutex_unlock(&hisi_acc_vdev->open_mutex); vfio_pci_core_disable(vdev); return ret; } hisi_acc_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; + hisi_acc_vdev->dev_opened = true; + mutex_unlock(&hisi_acc_vdev->open_mutex); } vfio_pci_core_finish_enable(vdev); @@ -1304,11 +1515,14 @@ static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev) static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev) { - struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev, - struct hisi_acc_vf_core_device, core_device.vdev); + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(core_vdev); struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm; + hisi_acc_vf_disable_fds(hisi_acc_vdev); + mutex_lock(&hisi_acc_vdev->open_mutex); + hisi_acc_vdev->dev_opened = false; iounmap(vf_qm->io_base); + mutex_unlock(&hisi_acc_vdev->open_mutex); vfio_pci_core_close_device(core_vdev); } @@ -1320,15 +1534,16 @@ static const struct vfio_migration_ops hisi_acc_vfio_pci_migrn_state_ops = { static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev) { - struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev, - struct hisi_acc_vf_core_device, core_device.vdev); + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(core_vdev); struct pci_dev *pdev = to_pci_dev(core_vdev->dev); struct hisi_qm *pf_qm = hisi_acc_get_pf_qm(pdev); hisi_acc_vdev->vf_id = pci_iov_vf_id(pdev) + 1; hisi_acc_vdev->pf_qm = pf_qm; hisi_acc_vdev->vf_dev = pdev; + hisi_acc_vdev->vf_qm_state = QM_NOT_READY; mutex_init(&hisi_acc_vdev->state_mutex); + mutex_init(&hisi_acc_vdev->open_mutex); 
core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY; core_vdev->mig_ops = &hisi_acc_vfio_pci_migrn_state_ops; @@ -1374,6 +1589,47 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .detach_ioas = vfio_iommufd_physical_detach_ioas, }; +static void hisi_acc_vfio_debug_init(struct hisi_acc_vf_core_device *hisi_acc_vdev) +{ + struct vfio_device *vdev = &hisi_acc_vdev->core_device.vdev; + struct hisi_acc_vf_migration_file *migf; + struct dentry *vfio_dev_migration; + struct dentry *vfio_hisi_acc; + struct device *dev = vdev->dev; + + if (!debugfs_initialized() || + !IS_ENABLED(CONFIG_VFIO_DEBUGFS)) + return; + + if (vdev->ops != &hisi_acc_vfio_pci_migrn_ops) + return; + + vfio_dev_migration = debugfs_lookup("migration", vdev->debug_root); + if (!vfio_dev_migration) { + dev_err(dev, "failed to lookup migration debugfs file!\n"); + return; + } + + migf = kzalloc(sizeof(*migf), GFP_KERNEL); + if (!migf) + return; + hisi_acc_vdev->debug_migf = migf; + + vfio_hisi_acc = debugfs_create_dir("hisi_acc", vfio_dev_migration); + debugfs_create_devm_seqfile(dev, "dev_data", vfio_hisi_acc, + hisi_acc_vf_dev_read); + debugfs_create_devm_seqfile(dev, "migf_data", vfio_hisi_acc, + hisi_acc_vf_migf_read); + debugfs_create_devm_seqfile(dev, "cmd_state", vfio_hisi_acc, + hisi_acc_vf_debug_cmd); +} + +static void hisi_acc_vf_debugfs_exit(struct hisi_acc_vf_core_device *hisi_acc_vdev) +{ + kfree(hisi_acc_vdev->debug_migf); + hisi_acc_vdev->debug_migf = NULL; +} + static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct hisi_acc_vf_core_device *hisi_acc_vdev; @@ -1400,6 +1656,8 @@ static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device ret = vfio_pci_core_register_device(&hisi_acc_vdev->core_device); if (ret) goto out_put_vdev; + + hisi_acc_vfio_debug_init(hisi_acc_vdev); return 0; out_put_vdev: @@ -1412,6 +1670,7 @@ static void hisi_acc_vfio_pci_remove(struct pci_dev *pdev) struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_drvdata(pdev); vfio_pci_core_unregister_device(&hisi_acc_vdev->core_device); + hisi_acc_vf_debugfs_exit(hisi_acc_vdev); vfio_put_device(&hisi_acc_vdev->core_device.vdev); } diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 5bab46602fad..91002ceeebc1 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -32,12 +32,16 @@ #define QM_SQC_VFT_BASE_MASK_V2 GENMASK(15, 0) #define QM_SQC_VFT_NUM_SHIFT_V2 45 #define QM_SQC_VFT_NUM_MASK_V2 GENMASK(9, 0) +#define QM_MB_CMD_NOT_READY 0xffffffff /* RW regs */ #define QM_REGS_MAX_LEN 7 #define QM_REG_ADDR_OFFSET 0x0004 #define QM_XQC_ADDR_OFFSET 32U +#define QM_XQC_ADDR_LOW 0x1 +#define QM_XQC_ADDR_HIGH 0x2 + #define QM_VF_AEQ_INT_MASK 0x0004 #define QM_VF_EQ_INT_MASK 0x000c #define QM_IFC_INT_SOURCE_V 0x0020 @@ -49,10 +53,15 @@ #define QM_EQC_DW0 0X8000 #define QM_AEQC_DW0 0X8020 +#define ACC_DRV_MAJOR_VER 1 +#define ACC_DRV_MINOR_VER 0 + +#define ACC_DEV_MAGIC_V1 0XCDCDCDCDFEEDAACC +#define ACC_DEV_MAGIC_V2 0xAACCFEEDDECADEDE + struct acc_vf_data { #define QM_MATCH_SIZE offsetofend(struct acc_vf_data, qm_rsv_state) /* QM match information */ -#define ACC_DEV_MAGIC 0XCDCDCDCDFEEDAACC u64 acc_magic; u32 qp_num; u32 dev_id; @@ -60,7 +69,9 @@ struct acc_vf_data { u32 qp_base; u32 vf_qm_state; /* QM reserved match information */ - u32 qm_rsv_state[3]; + u16 major_ver; + u16 minor_ver; + u32 qm_rsv_state[2]; /* QM RW regs */ 
u32 aeq_int_mask; @@ -99,6 +110,13 @@ struct hisi_acc_vf_migration_file { struct hisi_acc_vf_core_device { struct vfio_pci_core_device core_device; u8 match_done; + /* + * io_base is only valid when dev_opened is true, + * which is protected by open_mutex. + */ + bool dev_opened; + /* Ensure the accuracy of dev_opened operation */ + struct mutex open_mutex; /* For migration state */ struct mutex state_mutex; @@ -107,9 +125,20 @@ struct hisi_acc_vf_core_device { struct pci_dev *vf_dev; struct hisi_qm *pf_qm; struct hisi_qm vf_qm; + /* + * vf_qm_state represents the QM_VF_STATE register value. + * It is set by Guest driver for the ACC VF dev indicating + * the driver has loaded and configured the dev correctly. + */ u32 vf_qm_state; int vf_id; struct hisi_acc_vf_migration_file *resuming_migf; struct hisi_acc_vf_migration_file *saving_migf; + + /* + * It holds migration data corresponding to the last migration + * and is used by the debugfs interface to report it. + */ + struct hisi_acc_vf_migration_file *debug_migf; }; #endif /* HISI_ACC_VFIO_PCI_H */ diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 41a4b0cf4297..5b919a0b2524 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -313,40 +313,21 @@ err_exec: return ret; } -static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, - struct mlx5_vhca_data_buffer *buf, - struct mlx5_vhca_recv_buf *recv_buf, - u32 *mkey) +static u32 *alloc_mkey_in(u32 npages, u32 pdn) { - size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) : - recv_buf->npages; - int err = 0, inlen; - __be64 *mtt; + int inlen; void *mkc; u32 *in; inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + - sizeof(*mtt) * round_up(npages, 2); + sizeof(__be64) * round_up(npages, 2); - in = kvzalloc(inlen, GFP_KERNEL); + in = kvzalloc(inlen, GFP_KERNEL_ACCOUNT); if (!in) - return -ENOMEM; + return NULL; MLX5_SET(create_mkey_in, in, translations_octword_actual_size, DIV_ROUND_UP(npages, 2)); - mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); - - if (buf) { - struct sg_dma_page_iter dma_iter; - - for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0) - *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); - } else { - int i; - - for (i = 0; i < npages; i++) - *mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]); - } mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); @@ -360,8 +341,81 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2)); MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE); - err = mlx5_core_create_mkey(mdev, mkey, in, inlen); - kvfree(in); + + return in; +} + +static int create_mkey(struct mlx5_core_dev *mdev, u32 npages, u32 *mkey_in, + u32 *mkey) +{ + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + + sizeof(__be64) * round_up(npages, 2); + + return mlx5_core_create_mkey(mdev, mkey, mkey_in, inlen); +} + +static void unregister_dma_pages(struct mlx5_core_dev *mdev, u32 npages, + u32 *mkey_in, struct dma_iova_state *state, + enum dma_data_direction dir) +{ + dma_addr_t addr; + __be64 *mtt; + int i; + + if (dma_use_iova(state)) { + dma_iova_destroy(mdev->device, state, npages * PAGE_SIZE, dir, + 0); + } else { + mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, + klm_pas_mtt); + for (i = npages - 1; i >= 0; i--) { + addr = be64_to_cpu(mtt[i]); + dma_unmap_page(mdev->device, addr, PAGE_SIZE, dir); + } + } 
+} + +static int register_dma_pages(struct mlx5_core_dev *mdev, u32 npages, + struct page **page_list, u32 *mkey_in, + struct dma_iova_state *state, + enum dma_data_direction dir) +{ + dma_addr_t addr; + size_t mapped = 0; + __be64 *mtt; + int i, err; + + mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt); + + if (dma_iova_try_alloc(mdev->device, state, 0, npages * PAGE_SIZE)) { + addr = state->addr; + for (i = 0; i < npages; i++) { + err = dma_iova_link(mdev->device, state, + page_to_phys(page_list[i]), mapped, + PAGE_SIZE, dir, 0); + if (err) + goto error; + *mtt++ = cpu_to_be64(addr); + addr += PAGE_SIZE; + mapped += PAGE_SIZE; + } + err = dma_iova_sync(mdev->device, state, 0, mapped); + if (err) + goto error; + } else { + for (i = 0; i < npages; i++) { + addr = dma_map_page(mdev->device, page_list[i], 0, + PAGE_SIZE, dir); + err = dma_mapping_error(mdev->device, addr); + if (err) + goto error; + *mtt++ = cpu_to_be64(addr); + } + } + return 0; + +error: + unregister_dma_pages(mdev, i, mkey_in, state, dir); return err; } @@ -375,93 +429,97 @@ static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) if (mvdev->mdev_detach) return -ENOTCONN; - if (buf->dmaed || !buf->allocated_length) + if (buf->mkey_in || !buf->npages) return -EINVAL; - ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); - if (ret) - return ret; + buf->mkey_in = alloc_mkey_in(buf->npages, buf->migf->pdn); + if (!buf->mkey_in) + return -ENOMEM; - ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey); + ret = register_dma_pages(mdev, buf->npages, buf->page_list, + buf->mkey_in, &buf->state, buf->dma_dir); if (ret) - goto err; + goto err_register_dma; - buf->dmaed = true; + ret = create_mkey(mdev, buf->npages, buf->mkey_in, &buf->mkey); + if (ret) + goto err_create_mkey; return 0; -err: - dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); + +err_create_mkey: + unregister_dma_pages(mdev, buf->npages, buf->mkey_in, &buf->state, + buf->dma_dir); +err_register_dma: + kvfree(buf->mkey_in); + buf->mkey_in = NULL; return ret; } +static void free_page_list(u32 npages, struct page **page_list) +{ + int i; + + /* Undo alloc_pages_bulk() */ + for (i = npages - 1; i >= 0; i--) + __free_page(page_list[i]); + + kvfree(page_list); +} + void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) { - struct mlx5_vf_migration_file *migf = buf->migf; - struct sg_page_iter sg_iter; + struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev; + struct mlx5_core_dev *mdev = mvdev->mdev; - lockdep_assert_held(&migf->mvdev->state_mutex); - WARN_ON(migf->mvdev->mdev_detach); + lockdep_assert_held(&mvdev->state_mutex); + WARN_ON(mvdev->mdev_detach); - if (buf->dmaed) { - mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey); - dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt, - buf->dma_dir, 0); + if (buf->mkey_in) { + mlx5_core_destroy_mkey(mdev, buf->mkey); + unregister_dma_pages(mdev, buf->npages, buf->mkey_in, + &buf->state, buf->dma_dir); + kvfree(buf->mkey_in); } - /* Undo alloc_pages_bulk_array() */ - for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) - __free_page(sg_page_iter_page(&sg_iter)); - sg_free_append_table(&buf->table); + free_page_list(buf->npages, buf->page_list); kfree(buf); } -static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, - unsigned int npages) +static int mlx5vf_add_pages(struct page ***page_list, unsigned int npages) { - unsigned int to_alloc = npages; - struct page **page_list; - unsigned long filled; - unsigned 
int to_fill; - int ret; + unsigned int filled, done = 0; + int i; - to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list)); - page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT); - if (!page_list) + *page_list = + kvcalloc(npages, sizeof(struct page *), GFP_KERNEL_ACCOUNT); + if (!*page_list) return -ENOMEM; - do { - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, - page_list); - if (!filled) { - ret = -ENOMEM; + for (;;) { + filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, npages - done, + *page_list + done); + if (!filled) goto err; - } - to_alloc -= filled; - ret = sg_alloc_append_table_from_pages( - &buf->table, page_list, filled, 0, - filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, - GFP_KERNEL_ACCOUNT); - if (ret) - goto err; - buf->allocated_length += filled * PAGE_SIZE; - /* clean input for another bulk allocation */ - memset(page_list, 0, filled * sizeof(*page_list)); - to_fill = min_t(unsigned int, to_alloc, - PAGE_SIZE / sizeof(*page_list)); - } while (to_alloc > 0); + done += filled; + if (done == npages) + break; + } - kvfree(page_list); return 0; err: - kvfree(page_list); - return ret; + for (i = 0; i < done; i++) + __free_page(*page_list[i]); + + kvfree(*page_list); + *page_list = NULL; + return -ENOMEM; } struct mlx5_vhca_data_buffer * -mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, - size_t length, +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages, enum dma_data_direction dma_dir) { struct mlx5_vhca_data_buffer *buf; @@ -473,12 +531,13 @@ mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, buf->dma_dir = dma_dir; buf->migf = migf; - if (length) { - ret = mlx5vf_add_migration_pages(buf, - DIV_ROUND_UP_ULL(length, PAGE_SIZE)); + if (npages) { + ret = mlx5vf_add_pages(&buf->page_list, npages); if (ret) goto end; + buf->npages = npages; + if (dma_dir != DMA_NONE) { ret = mlx5vf_dma_data_buffer(buf); if (ret) @@ -501,8 +560,8 @@ void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf) } struct mlx5_vhca_data_buffer * -mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, - size_t length, enum dma_data_direction dma_dir) +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages, + enum dma_data_direction dma_dir) { struct mlx5_vhca_data_buffer *buf, *temp_buf; struct list_head free_list; @@ -517,7 +576,7 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) { if (buf->dma_dir == dma_dir) { list_del_init(&buf->buf_elm); - if (buf->allocated_length >= length) { + if (buf->npages >= npages) { spin_unlock_irq(&migf->list_lock); goto found; } @@ -531,7 +590,7 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, } } spin_unlock_irq(&migf->list_lock); - buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir); + buf = mlx5vf_alloc_data_buffer(migf, npages, dma_dir); found: while ((temp_buf = list_first_entry_or_null(&free_list, @@ -712,7 +771,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, MLX5_SET(save_vhca_state_in, in, op_mod, 0); MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey); - MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); + MLX5_SET(save_vhca_state_in, in, size, buf->npages * PAGE_SIZE); MLX5_SET(save_vhca_state_in, in, incremental, inc); MLX5_SET(save_vhca_state_in, in, set_track, track); @@ -734,8 +793,11 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device 
*mvdev, } if (!header_buf) { - header_buf = mlx5vf_get_data_buffer(migf, - sizeof(struct mlx5_vf_migration_header), DMA_NONE); + header_buf = mlx5vf_get_data_buffer( + migf, + DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header), + PAGE_SIZE), + DMA_NONE); if (IS_ERR(header_buf)) { err = PTR_ERR(header_buf); goto err_free; @@ -779,7 +841,7 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, if (mvdev->mdev_detach) return -ENOTCONN; - if (!buf->dmaed) { + if (!buf->mkey_in) { err = mlx5vf_dma_data_buffer(buf); if (err) return err; @@ -1334,103 +1396,16 @@ static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev, kfree(qp); } -static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf) -{ - int i; - - /* Undo alloc_pages_bulk_array() */ - for (i = 0; i < recv_buf->npages; i++) - __free_page(recv_buf->page_list[i]); - - kvfree(recv_buf->page_list); -} - -static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, - unsigned int npages) -{ - unsigned int filled = 0, done = 0; - int i; - - recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list), - GFP_KERNEL_ACCOUNT); - if (!recv_buf->page_list) - return -ENOMEM; - - for (;;) { - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, - npages - done, - recv_buf->page_list + done); - if (!filled) - goto err; - - done += filled; - if (done == npages) - break; - } - - recv_buf->npages = npages; - return 0; - -err: - for (i = 0; i < npages; i++) { - if (recv_buf->page_list[i]) - __free_page(recv_buf->page_list[i]); - } - - kvfree(recv_buf->page_list); - return -ENOMEM; -} - -static int register_dma_recv_pages(struct mlx5_core_dev *mdev, - struct mlx5_vhca_recv_buf *recv_buf) -{ - int i, j; - - recv_buf->dma_addrs = kvcalloc(recv_buf->npages, - sizeof(*recv_buf->dma_addrs), - GFP_KERNEL_ACCOUNT); - if (!recv_buf->dma_addrs) - return -ENOMEM; - - for (i = 0; i < recv_buf->npages; i++) { - recv_buf->dma_addrs[i] = dma_map_page(mdev->device, - recv_buf->page_list[i], - 0, PAGE_SIZE, - DMA_FROM_DEVICE); - if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i])) - goto error; - } - return 0; - -error: - for (j = 0; j < i; j++) - dma_unmap_single(mdev->device, recv_buf->dma_addrs[j], - PAGE_SIZE, DMA_FROM_DEVICE); - - kvfree(recv_buf->dma_addrs); - return -ENOMEM; -} - -static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev, - struct mlx5_vhca_recv_buf *recv_buf) -{ - int i; - - for (i = 0; i < recv_buf->npages; i++) - dma_unmap_single(mdev->device, recv_buf->dma_addrs[i], - PAGE_SIZE, DMA_FROM_DEVICE); - - kvfree(recv_buf->dma_addrs); -} - static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev, struct mlx5_vhca_qp *qp) { struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; mlx5_core_destroy_mkey(mdev, recv_buf->mkey); - unregister_dma_recv_pages(mdev, recv_buf); - free_recv_pages(&qp->recv_buf); + unregister_dma_pages(mdev, recv_buf->npages, recv_buf->mkey_in, + &recv_buf->state, DMA_FROM_DEVICE); + kvfree(recv_buf->mkey_in); + free_page_list(recv_buf->npages, recv_buf->page_list); } static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev, @@ -1441,24 +1416,38 @@ static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev, struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; int err; - err = alloc_recv_pages(recv_buf, npages); - if (err < 0) + err = mlx5vf_add_pages(&recv_buf->page_list, npages); + if (err) return err; - err = register_dma_recv_pages(mdev, recv_buf); - if (err) + recv_buf->npages = npages; + + recv_buf->mkey_in = alloc_mkey_in(npages, pdn); + if 
(!recv_buf->mkey_in) { + err = -ENOMEM; goto end; + } + + err = register_dma_pages(mdev, npages, recv_buf->page_list, + recv_buf->mkey_in, &recv_buf->state, + DMA_FROM_DEVICE); + if (err) + goto err_register_dma; - err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey); + err = create_mkey(mdev, npages, recv_buf->mkey_in, &recv_buf->mkey); if (err) goto err_create_mkey; return 0; err_create_mkey: - unregister_dma_recv_pages(mdev, recv_buf); + unregister_dma_pages(mdev, npages, recv_buf->mkey_in, &recv_buf->state, + DMA_FROM_DEVICE); +err_register_dma: + kvfree(recv_buf->mkey_in); + recv_buf->mkey_in = NULL; end: - free_recv_pages(recv_buf); + free_page_list(npages, recv_buf->page_list); return err; } @@ -1513,7 +1502,8 @@ int mlx5vf_start_page_tracker(struct vfio_device *vdev, struct mlx5_vhca_qp *host_qp; struct mlx5_vhca_qp *fw_qp; struct mlx5_core_dev *mdev; - u32 max_msg_size = PAGE_SIZE; + u32 log_max_msg_size; + u32 max_msg_size; u64 rq_size = SZ_2M; u32 max_recv_wr; int err; @@ -1530,6 +1520,12 @@ int mlx5vf_start_page_tracker(struct vfio_device *vdev, } mdev = mvdev->mdev; + log_max_msg_size = MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_msg_size); + max_msg_size = (1ULL << log_max_msg_size); + /* The RQ must hold at least 4 WQEs/messages for successful QP creation */ + if (rq_size < 4 * max_msg_size) + rq_size = 4 * max_msg_size; + memset(tracker, 0, sizeof(*tracker)); tracker->uar = mlx5_get_uars_page(mdev); if (IS_ERR(tracker->uar)) { @@ -1619,25 +1615,41 @@ set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp, { u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry); u32 nent = size / entry_size; + u32 nent_in_page; + u32 nent_to_set; struct page *page; + u32 page_offset; + u32 page_index; + u32 buf_offset; + void *kaddr; u64 addr; u64 *buf; int i; - if (WARN_ON(index >= qp->recv_buf.npages || + buf_offset = index * qp->max_msg_size; + if (WARN_ON(buf_offset + size >= qp->recv_buf.npages * PAGE_SIZE || (nent > qp->max_msg_size / entry_size))) return; - page = qp->recv_buf.page_list[index]; - buf = kmap_local_page(page); - for (i = 0; i < nent; i++) { - addr = MLX5_GET(page_track_report_entry, buf + i, - dirty_address_low); - addr |= (u64)MLX5_GET(page_track_report_entry, buf + i, - dirty_address_high) << 32; - iova_bitmap_set(dirty, addr, qp->tracked_page_size); - } - kunmap_local(buf); + do { + page_index = buf_offset / PAGE_SIZE; + page_offset = buf_offset % PAGE_SIZE; + nent_in_page = (PAGE_SIZE - page_offset) / entry_size; + page = qp->recv_buf.page_list[page_index]; + kaddr = kmap_local_page(page); + buf = kaddr + page_offset; + nent_to_set = min(nent, nent_in_page); + for (i = 0; i < nent_to_set; i++) { + addr = MLX5_GET(page_track_report_entry, buf + i, + dirty_address_low); + addr |= (u64)MLX5_GET(page_track_report_entry, buf + i, + dirty_address_high) << 32; + iova_bitmap_set(dirty, addr, qp->tracked_page_size); + } + kunmap_local(kaddr); + buf_offset += (nent_to_set * entry_size); + nent -= nent_to_set; + } while (nent); } static void diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index df421dc6de04..d7821b5ca772 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -53,20 +53,17 @@ struct mlx5_vf_migration_header { }; struct mlx5_vhca_data_buffer { - struct sg_append_table table; + struct page **page_list; + struct dma_iova_state state; loff_t start_pos; u64 length; - u64 allocated_length; + u32 npages; u32 mkey; + u32 *mkey_in; enum dma_data_direction dma_dir; - u8 dmaed:1; u8 
stop_copy_chunk_num; struct list_head buf_elm; struct mlx5_vf_migration_file *migf; - /* Optimize mlx5vf_get_migration_page() for sequential access */ - struct scatterlist *last_offset_sg; - unsigned int sg_last_entry; - unsigned long last_offset; }; struct mlx5vf_async_data { @@ -133,8 +130,9 @@ struct mlx5_vhca_cq { struct mlx5_vhca_recv_buf { u32 npages; struct page **page_list; - dma_addr_t *dma_addrs; + struct dma_iova_state state; u32 next_rq_offset; + u32 *mkey_in; u32 mkey; }; @@ -217,15 +215,24 @@ int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf); void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf); void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf); struct mlx5_vhca_data_buffer * -mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, - size_t length, enum dma_data_direction dma_dir); +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages, + enum dma_data_direction dma_dir); void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf); struct mlx5_vhca_data_buffer * -mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, - size_t length, enum dma_data_direction dma_dir); +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages, + enum dma_data_direction dma_dir); void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf); -struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, - unsigned long offset); +static inline struct page * +mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, + unsigned long offset) +{ + int page_entry = offset / PAGE_SIZE; + + if (page_entry >= buf->npages) + return NULL; + + return buf->page_list[page_entry]; +} void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev, enum mlx5_vf_migf_state *last_save_state); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 61d9b0f9146d..93f894fe60d2 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -34,37 +34,6 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) core_device); } -struct page * -mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, - unsigned long offset) -{ - unsigned long cur_offset = 0; - struct scatterlist *sg; - unsigned int i; - - /* All accesses are sequential */ - if (offset < buf->last_offset || !buf->last_offset_sg) { - buf->last_offset = 0; - buf->last_offset_sg = buf->table.sgt.sgl; - buf->sg_last_entry = 0; - } - - cur_offset = buf->last_offset; - - for_each_sg(buf->last_offset_sg, sg, - buf->table.sgt.orig_nents - buf->sg_last_entry, i) { - if (offset < sg->length + cur_offset) { - buf->last_offset_sg = sg; - buf->sg_last_entry += i; - buf->last_offset = cur_offset; - return nth_page(sg_page(sg), - (offset - cur_offset) / PAGE_SIZE); - } - cur_offset += sg->length; - } - return NULL; -} - static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) { mutex_lock(&migf->lock); @@ -308,6 +277,7 @@ static struct mlx5_vhca_data_buffer * mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf, u8 index, size_t required_length) { + u32 npages = DIV_ROUND_UP(required_length, PAGE_SIZE); struct mlx5_vhca_data_buffer *buf = migf->buf[index]; u8 chunk_num; @@ -315,12 +285,11 @@ mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf, chunk_num = buf->stop_copy_chunk_num; buf->migf->buf[index] = NULL; /* Checking whether the pre-allocated buffer can fit */ - if 
(buf->allocated_length >= required_length) + if (buf->npages >= npages) return buf; mlx5vf_put_data_buffer(buf); - buf = mlx5vf_get_data_buffer(buf->migf, required_length, - DMA_FROM_DEVICE); + buf = mlx5vf_get_data_buffer(buf->migf, npages, DMA_FROM_DEVICE); if (IS_ERR(buf)) return buf; @@ -373,7 +342,8 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf, u8 *to_buff; int ret; - header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE); + header_buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(size, PAGE_SIZE), + DMA_NONE); if (IS_ERR(header_buf)) return PTR_ERR(header_buf); @@ -388,7 +358,7 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf, to_buff = kmap_local_page(page); memcpy(to_buff, &header, sizeof(header)); header_buf->length = sizeof(header); - data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length); + data.stop_copy_size = cpu_to_le64(migf->buf[0]->npages * PAGE_SIZE); memcpy(to_buff + sizeof(header), &data, sizeof(data)); header_buf->length += sizeof(data); kunmap_local(to_buff); @@ -437,15 +407,20 @@ static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev, num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1; for (i = 0; i < num_chunks; i++) { - buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE); + buf = mlx5vf_get_data_buffer( + migf, DIV_ROUND_UP(inc_state_size, PAGE_SIZE), + DMA_FROM_DEVICE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto err; } migf->buf[i] = buf; - buf = mlx5vf_get_data_buffer(migf, - sizeof(struct mlx5_vf_migration_header), DMA_NONE); + buf = mlx5vf_get_data_buffer( + migf, + DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header), + PAGE_SIZE), + DMA_NONE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto err; @@ -553,7 +528,8 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, * We finished transferring the current state and the device has a * dirty state, save a new state to be ready for. 
*/ - buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE); + buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(inc_length, PAGE_SIZE), + DMA_FROM_DEVICE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); mlx5vf_mark_err(migf); @@ -587,7 +563,6 @@ static const struct file_operations mlx5vf_save_fops = { .unlocked_ioctl = mlx5vf_precopy_ioctl, .compat_ioctl = compat_ptr_ioctl, .release = mlx5vf_release_file, - .llseek = no_llseek, }; static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) @@ -641,14 +616,11 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) O_RDONLY); if (IS_ERR(migf->filp)) { ret = PTR_ERR(migf->filp); - goto end; + kfree(migf); + return ERR_PTR(ret); } migf->mvdev = mvdev; - ret = mlx5vf_cmd_alloc_pd(migf); - if (ret) - goto out_free; - stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); init_waitqueue_head(&migf->poll_wait); @@ -664,6 +636,11 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) INIT_LIST_HEAD(&migf->buf_list); INIT_LIST_HEAD(&migf->avail_list); spin_lock_init(&migf->list_lock); + + ret = mlx5vf_cmd_alloc_pd(migf); + if (ret) + goto out; + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0); if (ret) goto out_pd; @@ -674,8 +651,8 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) if (track) { /* leave the allocated buffer ready for the stop-copy phase */ - buf = mlx5vf_alloc_data_buffer(migf, - migf->buf[0]->allocated_length, DMA_FROM_DEVICE); + buf = mlx5vf_alloc_data_buffer(migf, migf->buf[0]->npages, + DMA_FROM_DEVICE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out_pd; @@ -693,10 +670,8 @@ out_save: mlx5vf_free_data_buffer(buf); out_pd: mlx5fv_cmd_clean_migf_resources(migf); -out_free: +out: fput(migf->filp); -end: - kfree(migf); return ERR_PTR(ret); } @@ -918,11 +893,14 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, goto out_unlock; break; case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA: - if (vhca_buf_header->allocated_length < migf->record_size) { + { + u32 npages = DIV_ROUND_UP(migf->record_size, PAGE_SIZE); + + if (vhca_buf_header->npages < npages) { mlx5vf_free_data_buffer(vhca_buf_header); - migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf, - migf->record_size, DMA_NONE); + migf->buf_header[0] = mlx5vf_alloc_data_buffer( + migf, npages, DMA_NONE); if (IS_ERR(migf->buf_header[0])) { ret = PTR_ERR(migf->buf_header[0]); migf->buf_header[0] = NULL; @@ -935,6 +913,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, vhca_buf_header->start_pos = migf->max_pos; migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA; break; + } case MLX5_VF_LOAD_STATE_READ_HEADER_DATA: ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header, &buf, &len, pos, &done); @@ -945,12 +924,13 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, { u64 size = max(migf->record_size, migf->stop_copy_prep_size); + u32 npages = DIV_ROUND_UP(size, PAGE_SIZE); - if (vhca_buf->allocated_length < size) { + if (vhca_buf->npages < npages) { mlx5vf_free_data_buffer(vhca_buf); - migf->buf[0] = mlx5vf_alloc_data_buffer(migf, - size, DMA_TO_DEVICE); + migf->buf[0] = mlx5vf_alloc_data_buffer( + migf, npages, DMA_TO_DEVICE); if (IS_ERR(migf->buf[0])) { ret = PTR_ERR(migf->buf[0]); migf->buf[0] = NULL; @@ -1000,7 +980,6 @@ static const struct file_operations mlx5vf_resume_fops = { .owner = THIS_MODULE, .write = mlx5vf_resume_write, .release = 
mlx5vf_release_file, - .llseek = no_llseek, }; static struct mlx5_vf_migration_file * @@ -1018,13 +997,19 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) O_WRONLY); if (IS_ERR(migf->filp)) { ret = PTR_ERR(migf->filp); - goto end; + kfree(migf); + return ERR_PTR(ret); } + stream_open(migf->filp->f_inode, migf->filp); + mutex_init(&migf->lock); + INIT_LIST_HEAD(&migf->buf_list); + INIT_LIST_HEAD(&migf->avail_list); + spin_lock_init(&migf->list_lock); migf->mvdev = mvdev; ret = mlx5vf_cmd_alloc_pd(migf); if (ret) - goto out_free; + goto out; buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE); if (IS_ERR(buf)) { @@ -1033,8 +1018,11 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) } migf->buf[0] = buf; - buf = mlx5vf_alloc_data_buffer(migf, - sizeof(struct mlx5_vf_migration_header), DMA_NONE); + buf = mlx5vf_alloc_data_buffer( + migf, + DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header), + PAGE_SIZE), + DMA_NONE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out_buf; @@ -1043,20 +1031,13 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) migf->buf_header[0] = buf; migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; - stream_open(migf->filp->f_inode, migf->filp); - mutex_init(&migf->lock); - INIT_LIST_HEAD(&migf->buf_list); - INIT_LIST_HEAD(&migf->avail_list); - spin_lock_init(&migf->list_lock); return migf; out_buf: mlx5vf_free_data_buffer(migf->buf[0]); out_pd: mlx5vf_cmd_dealloc_pd(migf); -out_free: +out: fput(migf->filp); -end: - kfree(migf); return ERR_PTR(ret); } @@ -1151,7 +1132,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP); if (ret) return ERR_PTR(ret); - buf = mlx5vf_get_data_buffer(migf, size, DMA_FROM_DEVICE); + buf = mlx5vf_get_data_buffer(migf, + DIV_ROUND_UP(size, PAGE_SIZE), DMA_FROM_DEVICE); if (IS_ERR(buf)) return ERR_CAST(buf); /* pre_copy cleanup */ @@ -1449,7 +1431,7 @@ static struct pci_driver mlx5vf_pci_driver = { module_pci_driver(mlx5vf_pci_driver); -MODULE_IMPORT_NS(IOMMUFD); +MODULE_IMPORT_NS("IOMMUFD"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>"); MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>"); diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index a7fd018aa548..e5ac39c4cc6b 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -5,6 +5,8 @@ #include <linux/sizes.h> #include <linux/vfio_pci_core.h> +#include <linux/delay.h> +#include <linux/jiffies.h> /* * The device memory usable to the workloads running in the VM is cached @@ -17,12 +19,21 @@ #define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX #define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX -/* Memory size expected as non cached and reserved by the VM driver */ -#define RESMEM_SIZE SZ_1G - /* A hardwired and constant ABI value between the GPU FW and VFIO driver. */ #define MEMBLK_SIZE SZ_512M +#define DVSEC_BITMAP_OFFSET 0xA +#define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0) + +#define GPU_CAP_DVSEC_REGISTER 3 + +#define C2C_LINK_BAR0_OFFSET 0x1498 +#define HBM_TRAINING_BAR0_OFFSET 0x200BC +#define STATUS_READY 0xFF + +#define POLL_QUANTUM_MS 1000 +#define POLL_TIMEOUT_MS (30 * 1000) + /* * The state of the two device memory region - resmem and usemem - is * saved as struct mem_region. 
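The nvgrace-gpu hunks that follow rework how the GPU device memory is split between a cacheable usemem BAR and, only on parts with the MIG hardware bug, a non-cacheable resmem BAR. A minimal standalone sketch of that sizing arithmetic, assuming the resmem length is simply the remainder left after aligning usemem (the helper name split_devmem() is invented for illustration, not driver code):

	/* Sketch of the usemem/resmem sizing the hunks below introduce. */
	#include <stdbool.h>
	#include <stdint.h>

	#define SZ_512M		(512ULL << 20)
	#define SZ_1G		(1ULL << 30)
	#define MEMBLK_SIZE	SZ_512M	/* GPU FW <-> VFIO ABI block size */

	/* Returns the usemem length; *resmem_len gets the carved-out rest. */
	static uint64_t split_devmem(uint64_t memlength, bool has_mig_hw_bug,
				     uint64_t *resmem_len)
	{
		/* Only buggy (Grace Hopper) parts need a non-cacheable carve-out;
		 * the driver additionally rejects memlength < resmem_size via
		 * check_sub_overflow(). */
		uint64_t resmem_size = has_mig_hw_bug ? SZ_1G : 0;
		uint64_t usemem = memlength - resmem_size;

		/* usemem must be memory-block aligned when the split exists. */
		if (has_mig_hw_bug)
			usemem &= ~(MEMBLK_SIZE - 1);	/* round_down */

		*resmem_len = memlength - usemem;
		return usemem;
	}

Each region is then exposed as a 64b BAR whose size is roundup_pow_of_two() of its length, as the hunks below compute.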
@@ -46,6 +57,7 @@ struct nvgrace_gpu_pci_core_device { struct mem_region resmem; /* Lock to control device memory kernel mapping */ struct mutex remap_lock; + bool has_mig_hw_bug; }; static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) @@ -66,7 +78,7 @@ nvgrace_gpu_memregion(int index, if (index == USEMEM_REGION_INDEX) return &nvdev->usemem; - if (index == RESMEM_REGION_INDEX) + if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX) return &nvdev->resmem; return NULL; @@ -751,40 +763,67 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, u64 memphys, u64 memlength) { int ret = 0; + u64 resmem_size = 0; /* - * The VM GPU device driver needs a non-cacheable region to support - * the MIG feature. Since the device memory is mapped as NORMAL cached, - * carve out a region from the end with a different NORMAL_NC - * property (called as reserved memory and represented as resmem). This - * region then is exposed as a 64b BAR (region 2 and 3) to the VM, while - * exposing the rest (termed as usable memory and represented using usemem) - * as cacheable 64b BAR (region 4 and 5). + * On Grace Hopper systems, the VM GPU device driver needs a non-cacheable + * region to support the MIG feature owing to a hardware bug. Since the + * device memory is mapped as NORMAL cached, carve out a region from the end + * with a different NORMAL_NC property (called reserved memory and + * represented as resmem). This region is then exposed as a 64b BAR + * (region 2 and 3) to the VM, while exposing the rest (termed as usable + * memory and represented using usemem) as a cacheable 64b BAR (region 4 and 5). * * devmem (memlength) * |-------------------------------------------------| * | | * usemem.memphys resmem.memphys + * + * This hardware bug is fixed on the Grace Blackwell platforms and the + * presence of the bug can be determined through nvdev->has_mig_hw_bug. + * Thus on systems with the hardware fix, there is no need to partition + * the GPU device memory and the entire memory is usable and mapped as + * NORMAL cached (i.e. resmem size is 0). */ + if (nvdev->has_mig_hw_bug) + resmem_size = SZ_1G; + nvdev->usemem.memphys = memphys; /* * The device memory exposed to the VM is added to the kernel by the - * VM driver module in chunks of memory block size. Only the usable - * memory (usemem) is added to the kernel for usage by the VM - * workloads. Make the usable memory size memblock aligned. + * VM driver module in chunks of memory block size. Note that only the + * usable memory (usemem) is added to the kernel for usage by the VM + * workloads. */ - if (check_sub_overflow(memlength, RESMEM_SIZE, + if (check_sub_overflow(memlength, resmem_size, &nvdev->usemem.memlength)) { ret = -EOVERFLOW; goto done; } /* - * The USEMEM part of the device memory has to be MEMBLK_SIZE - * aligned. This is a hardwired ABI value between the GPU FW and - * VFIO driver. The VM device driver is also aware of it and make - * use of the value for its calculation to determine USEMEM size. + * The usemem region is exposed as a 64b BAR composed of region 4 and 5. + * Calculate and save the BAR size for the region. + */ + nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength); + + /* + * If the hardware has the fix for MIG, there is no requirement + * for splitting the device memory to create RESMEM. The entire + * device memory is usable and will be USEMEM. Return here in + * such a case.
+ */ + if (!nvdev->has_mig_hw_bug) + goto done; + + /* + * When the device memory is split to work around the MIG bug on + * Grace Hopper, the USEMEM part of the device memory has to be + * MEMBLK_SIZE aligned. This is a hardwired ABI value between the + * GPU FW and VFIO driver. The VM device driver is also aware of it + * and makes use of the value for its calculation to determine USEMEM + * size. Note that the device memory may not be 512M aligned. */ nvdev->usemem.memlength = round_down(nvdev->usemem.memlength, MEMBLK_SIZE); @@ -803,15 +842,93 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, } /* - * The memory regions are exposed as BARs. Calculate and save - * the BAR size for them. + * The resmem region is exposed as a 64b BAR composed of region 2 and 3 + * for Grace Hopper. Calculate and save the BAR size for the region. */ - nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength); nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength); done: return ret; } +static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev) +{ + int pcie_dvsec; + u16 dvsec_ctrl16; + + pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA, + GPU_CAP_DVSEC_REGISTER); + + if (pcie_dvsec) { + pci_read_config_word(pdev, + pcie_dvsec + DVSEC_BITMAP_OFFSET, + &dvsec_ctrl16); + + if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM) + return false; + } + + return true; +} + +/* + * To reduce the system bootup time, the HBM training has + * been moved out of the UEFI on the Grace-Blackwell systems. + * + * The onus of checking whether the HBM training has completed + * thus falls on the module. The HBM training status can be + * determined from a BAR0 register. + * + * Similarly, another BAR0 register exposes the status of the + * CPU-GPU chip-to-chip (C2C) cache coherent interconnect. + * + * Poll these registers and check for 30s. If the HBM training is + * not complete or if the C2C link is not ready, fail the probe. + * + * While the wait is not required on Grace Hopper systems, it + * is beneficial to make the check to ensure the device is in an + * expected state. + * + * Ensure that the BAR0 region is enabled before accessing the + * registers.
+static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev) +{ + unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); + void __iomem *io; + int ret; + + ret = pci_enable_device(pdev); + if (ret) + return ret; + + ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME); + if (ret) + goto request_region_exit; + + io = pci_iomap(pdev, 0, 0); + if (!io) { + ret = -ENOMEM; + goto iomap_exit; + } + + do { + if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && + (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) { + ret = 0; + goto reg_check_exit; + } + msleep(POLL_QUANTUM_MS); + } while (!time_after(jiffies, timeout)); + + /* Timed out without both registers reading STATUS_READY */ + ret = -ETIME; + +reg_check_exit: + pci_iounmap(pdev, io); +iomap_exit: + pci_release_selected_regions(pdev, 1 << 0); +request_region_exit: + pci_disable_device(pdev); + return ret; +} + static int nvgrace_gpu_probe(struct pci_dev *pdev, const struct pci_device_id *id) { @@ -820,6 +937,10 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, u64 memphys, memlength; int ret; + ret = nvgrace_gpu_wait_device_ready(pdev); + if (ret) + return ret; + ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength); if (!ret) ops = &nvgrace_gpu_pci_ops; @@ -832,6 +953,8 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, dev_set_drvdata(&pdev->dev, &nvdev->core_device); if (ops == &nvgrace_gpu_pci_ops) { + nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev); + /* * Device memory properties are identified in the host ACPI * table. Set the nvgrace_gpu_pci_core_device structure. @@ -866,6 +989,10 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) }, /* GH200 480GB */ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) }, + /* GH200 SKU */ + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) }, + /* GB200 SKU */ + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) }, {} }; diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c index 68e8f006dfdb..c51f5e4c3dd6 100644 --- a/drivers/vfio/pci/pds/dirty.c +++ b/drivers/vfio/pci/pds/dirty.c @@ -3,6 +3,7 @@ #include <linux/interval_tree.h> #include <linux/vfio.h> +#include <linux/vmalloc.h> #include <linux/pds/pds_common.h> #include <linux/pds/pds_core_if.h> diff --git a/drivers/vfio/pci/pds/lm.c b/drivers/vfio/pci/pds/lm.c index 6b94cc0bf45b..f2673d395236 100644 --- a/drivers/vfio/pci/pds/lm.c +++ b/drivers/vfio/pci/pds/lm.c @@ -235,7 +235,6 @@ static const struct file_operations pds_vfio_save_fops = { .owner = THIS_MODULE, .read = pds_vfio_save_read, .release = pds_vfio_release_file, - .llseek = no_llseek, }; static int pds_vfio_get_save_file(struct pds_vfio_pci_device *pds_vfio) @@ -334,7 +333,6 @@ static const struct file_operations pds_vfio_restore_fops = { .owner = THIS_MODULE, .write = pds_vfio_restore_write, .release = pds_vfio_release_file, - .llseek = no_llseek, }; static int pds_vfio_get_restore_file(struct pds_vfio_pci_device *pds_vfio) diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c index 16e93b11ab1b..4923f1823126 100644 --- a/drivers/vfio/pci/pds/pci_drv.c +++ b/drivers/vfio/pci/pds/pci_drv.c @@ -187,7 +187,7 @@ static struct pci_driver pds_vfio_pci_driver = { module_pci_driver(pds_vfio_pci_driver); -MODULE_IMPORT_NS(IOMMUFD); +MODULE_IMPORT_NS("IOMMUFD"); MODULE_DESCRIPTION(PDS_VFIO_DRV_DESCRIPTION); MODULE_AUTHOR("Brett Creeley <brett.creeley@amd.com>"); MODULE_LICENSE("GPL"); diff --git 
a/drivers/vfio/pci/qat/Kconfig b/drivers/vfio/pci/qat/Kconfig new file mode 100644 index 000000000000..bf52cfa4b595 --- /dev/null +++ b/drivers/vfio/pci/qat/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only +config QAT_VFIO_PCI + tristate "VFIO support for QAT VF PCI devices" + select VFIO_PCI_CORE + depends on CRYPTO_DEV_QAT_4XXX + help + This provides migration support for Intel(R) QAT Virtual Function + devices using the VFIO framework. + + To compile this as a module, choose M here: the module + will be called qat_vfio_pci. If you don't know what to do here, + say N. diff --git a/drivers/vfio/pci/qat/Makefile b/drivers/vfio/pci/qat/Makefile new file mode 100644 index 000000000000..5fe5c4ec19d3 --- /dev/null +++ b/drivers/vfio/pci/qat/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_QAT_VFIO_PCI) += qat_vfio_pci.o +qat_vfio_pci-y := main.o diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c new file mode 100644 index 000000000000..845ed15b6771 --- /dev/null +++ b/drivers/vfio/pci/qat/main.c @@ -0,0 +1,700 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2024 Intel Corporation */ + +#include <linux/anon_inodes.h> +#include <linux/container_of.h> +#include <linux/device.h> +#include <linux/file.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/pci.h> +#include <linux/sizes.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/vfio_pci_core.h> +#include <linux/qat/qat_mig_dev.h> + +/* + * The migration data of each Intel QAT VF device is encapsulated in a + * 4096-byte block. The data consists of two parts. + * The first is a pre-configured set of attributes of the VF being migrated, + * which are only set when it is created. This can be migrated during the + * pre-copy stage and used for a device compatibility check. + * The second is the VF state. This includes the required MMIO regions and + * the shadow states maintained by the QAT PF driver. This part can only be + * saved when the VF is fully quiesced, and is migrated during the stop-copy + * stage. + * Both parts of the data are saved in hierarchical structures including + * a preamble section and several raw state sections. + * When the pre-configured part of the migration data is fully retrieved from + * user space, the preamble section is used to validate the correctness of + * the data blocks and check the version compatibility. The raw state sections + * are then used to do a device compatibility check. + * When the device transitions from the RESUMING state, the VF states are + * extracted from the raw state sections of the VF state part of the migration + * data and then loaded into the device.
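+ *
+ * As an illustration only (hypothetical framing; the exact layout is
+ * owned by the QAT PF driver), the hierarchy described above can be
+ * pictured as:
+ *
+ *   [ preamble: magic / version / total size ]
+ *   [ raw state section 0 ]
+ *   [ raw state section 1 ] ...
+ *
+ * where the preamble is validated before any raw section is consumed.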
+ */ + +struct qat_vf_migration_file { + struct file *filp; + /* protects migration region context */ + struct mutex lock; + bool disabled; + struct qat_vf_core_device *qat_vdev; + ssize_t filled_size; +}; + +struct qat_vf_core_device { + struct vfio_pci_core_device core_device; + struct qat_mig_dev *mdev; + /* protects migration state */ + struct mutex state_mutex; + enum vfio_device_mig_state mig_state; + struct qat_vf_migration_file *resuming_migf; + struct qat_vf_migration_file *saving_migf; +}; + +static int qat_vf_pci_open_device(struct vfio_device *core_vdev) +{ + struct qat_vf_core_device *qat_vdev = + container_of(core_vdev, struct qat_vf_core_device, + core_device.vdev); + struct vfio_pci_core_device *vdev = &qat_vdev->core_device; + int ret; + + ret = vfio_pci_core_enable(vdev); + if (ret) + return ret; + + ret = qat_vfmig_open(qat_vdev->mdev); + if (ret) { + vfio_pci_core_disable(vdev); + return ret; + } + qat_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; + + vfio_pci_core_finish_enable(vdev); + + return 0; +} + +static void qat_vf_disable_fd(struct qat_vf_migration_file *migf) +{ + mutex_lock(&migf->lock); + migf->disabled = true; + migf->filp->f_pos = 0; + migf->filled_size = 0; + mutex_unlock(&migf->lock); +} + +static void qat_vf_disable_fds(struct qat_vf_core_device *qat_vdev) +{ + if (qat_vdev->resuming_migf) { + qat_vf_disable_fd(qat_vdev->resuming_migf); + fput(qat_vdev->resuming_migf->filp); + qat_vdev->resuming_migf = NULL; + } + + if (qat_vdev->saving_migf) { + qat_vf_disable_fd(qat_vdev->saving_migf); + fput(qat_vdev->saving_migf->filp); + qat_vdev->saving_migf = NULL; + } +} + +static void qat_vf_pci_close_device(struct vfio_device *core_vdev) +{ + struct qat_vf_core_device *qat_vdev = container_of(core_vdev, + struct qat_vf_core_device, core_device.vdev); + + qat_vfmig_close(qat_vdev->mdev); + qat_vf_disable_fds(qat_vdev); + vfio_pci_core_close_device(core_vdev); +} + +static long qat_vf_precopy_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct qat_vf_migration_file *migf = filp->private_data; + struct qat_vf_core_device *qat_vdev = migf->qat_vdev; + struct qat_mig_dev *mig_dev = qat_vdev->mdev; + struct vfio_precopy_info info; + loff_t *pos = &filp->f_pos; + unsigned long minsz; + int ret = 0; + + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) + return -ENOTTY; + + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + if (info.argsz < minsz) + return -EINVAL; + + mutex_lock(&qat_vdev->state_mutex); + if (qat_vdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY && + qat_vdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) { + mutex_unlock(&qat_vdev->state_mutex); + return -EINVAL; + } + + mutex_lock(&migf->lock); + if (migf->disabled) { + ret = -ENODEV; + goto out; + } + + if (*pos > mig_dev->setup_size) { + ret = -EINVAL; + goto out; + } + + info.dirty_bytes = 0; + info.initial_bytes = mig_dev->setup_size - *pos; + +out: + mutex_unlock(&migf->lock); + mutex_unlock(&qat_vdev->state_mutex); + if (ret) + return ret; + return copy_to_user((void __user *)arg, &info, minsz) ? 
-EFAULT : 0; +} + +static ssize_t qat_vf_save_read(struct file *filp, char __user *buf, + size_t len, loff_t *pos) +{ + struct qat_vf_migration_file *migf = filp->private_data; + struct qat_mig_dev *mig_dev = migf->qat_vdev->mdev; + ssize_t done = 0; + loff_t *offs; + int ret; + + if (pos) + return -ESPIPE; + offs = &filp->f_pos; + + mutex_lock(&migf->lock); + if (*offs > migf->filled_size || *offs < 0) { + done = -EINVAL; + goto out_unlock; + } + + if (migf->disabled) { + done = -ENODEV; + goto out_unlock; + } + + len = min_t(size_t, migf->filled_size - *offs, len); + if (len) { + ret = copy_to_user(buf, mig_dev->state + *offs, len); + if (ret) { + done = -EFAULT; + goto out_unlock; + } + *offs += len; + done = len; + } + +out_unlock: + mutex_unlock(&migf->lock); + return done; +} + +static int qat_vf_release_file(struct inode *inode, struct file *filp) +{ + struct qat_vf_migration_file *migf = filp->private_data; + + qat_vf_disable_fd(migf); + mutex_destroy(&migf->lock); + kfree(migf); + + return 0; +} + +static const struct file_operations qat_vf_save_fops = { + .owner = THIS_MODULE, + .read = qat_vf_save_read, + .unlocked_ioctl = qat_vf_precopy_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .release = qat_vf_release_file, +}; + +static int qat_vf_save_state(struct qat_vf_core_device *qat_vdev, + struct qat_vf_migration_file *migf) +{ + int ret; + + ret = qat_vfmig_save_state(qat_vdev->mdev); + if (ret) + return ret; + migf->filled_size = qat_vdev->mdev->state_size; + + return 0; +} + +static int qat_vf_save_setup(struct qat_vf_core_device *qat_vdev, + struct qat_vf_migration_file *migf) +{ + int ret; + + ret = qat_vfmig_save_setup(qat_vdev->mdev); + if (ret) + return ret; + migf->filled_size = qat_vdev->mdev->setup_size; + + return 0; +} + +/* + * Allocate a file handle for user space and then save the migration data for + * the device being migrated. If this is called in the pre-copy stage, save the + * pre-configured device data. Otherwise, if this is called in the stop-copy + * stage, save the device state. In both cases, update the size of the data + * that can then be read from user space. + */
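As a usage sketch, a minimal user-space consumer of the save FD created by the function below could drain the pre-copy data as follows; drain_precopy() is a hypothetical helper, while VFIO_MIG_GET_PRECOPY_INFO and struct vfio_precopy_info are the uAPI served by qat_vf_precopy_ioctl() above:

#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Hypothetical user-space helper, not part of this patch */
static int drain_precopy(int save_fd, void *buf, size_t bufsz)
{
	struct vfio_precopy_info info = { .argsz = sizeof(info) };

	if (ioctl(save_fd, VFIO_MIG_GET_PRECOPY_INFO, &info) < 0)
		return -1;

	/* The QAT driver reports dirty_bytes == 0 in pre-copy */
	while (info.initial_bytes) {
		ssize_t n = read(save_fd, buf, bufsz);

		if (n <= 0)
			return -1;
		info.initial_bytes -= n;
	}

	return 0;
}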
+static struct qat_vf_migration_file * +qat_vf_save_device_data(struct qat_vf_core_device *qat_vdev, bool pre_copy) +{ + struct qat_vf_migration_file *migf; + int ret; + + migf = kzalloc(sizeof(*migf), GFP_KERNEL); + if (!migf) + return ERR_PTR(-ENOMEM); + + migf->filp = anon_inode_getfile("qat_vf_mig", &qat_vf_save_fops, + migf, O_RDONLY); + ret = PTR_ERR_OR_ZERO(migf->filp); + if (ret) { + kfree(migf); + return ERR_PTR(ret); + } + + stream_open(migf->filp->f_inode, migf->filp); + mutex_init(&migf->lock); + + if (pre_copy) + ret = qat_vf_save_setup(qat_vdev, migf); + else + ret = qat_vf_save_state(qat_vdev, migf); + if (ret) { + fput(migf->filp); + return ERR_PTR(ret); + } + + migf->qat_vdev = qat_vdev; + + return migf; +} + +static ssize_t qat_vf_resume_write(struct file *filp, const char __user *buf, + size_t len, loff_t *pos) +{ + struct qat_vf_migration_file *migf = filp->private_data; + struct qat_mig_dev *mig_dev = migf->qat_vdev->mdev; + loff_t end, *offs; + ssize_t done = 0; + int ret; + + if (pos) + return -ESPIPE; + offs = &filp->f_pos; + + if (*offs < 0 || + check_add_overflow(len, *offs, &end)) + return -EOVERFLOW; + + if (end > mig_dev->state_size) + return -ENOMEM; + + mutex_lock(&migf->lock); + if (migf->disabled) { + done = -ENODEV; + goto out_unlock; + } + + ret = copy_from_user(mig_dev->state + *offs, buf, len); + if (ret) { + done = -EFAULT; + goto out_unlock; + } + *offs += len; + migf->filled_size += len; + + /* + * Load the pre-configured device data first to check if the target + * device is compatible with the source device. + */ + ret = qat_vfmig_load_setup(mig_dev, migf->filled_size); + if (ret && ret != -EAGAIN) { + done = ret; + goto out_unlock; + } + done = len; + +out_unlock: + mutex_unlock(&migf->lock); + return done; +} + +static const struct file_operations qat_vf_resume_fops = { + .owner = THIS_MODULE, + .write = qat_vf_resume_write, + .release = qat_vf_release_file, +}; + +static struct qat_vf_migration_file * +qat_vf_resume_device_data(struct qat_vf_core_device *qat_vdev) +{ + struct qat_vf_migration_file *migf; + int ret; + + migf = kzalloc(sizeof(*migf), GFP_KERNEL); + if (!migf) + return ERR_PTR(-ENOMEM); + + migf->filp = anon_inode_getfile("qat_vf_mig", &qat_vf_resume_fops, migf, O_WRONLY); + ret = PTR_ERR_OR_ZERO(migf->filp); + if (ret) { + kfree(migf); + return ERR_PTR(ret); + } + + migf->qat_vdev = qat_vdev; + migf->filled_size = 0; + stream_open(migf->filp->f_inode, migf->filp); + mutex_init(&migf->lock); + + return migf; +} + +static int qat_vf_load_device_data(struct qat_vf_core_device *qat_vdev) +{ + return qat_vfmig_load_state(qat_vdev->mdev); +} + +static struct file *qat_vf_pci_step_device_state(struct qat_vf_core_device *qat_vdev, u32 new) +{ + u32 cur = qat_vdev->mig_state; + int ret; + + /* + * As the device is not capable of just stopping P2P DMAs, suspend the + * device completely once any of the P2P states are reached. + * When it is suspended, all its MMIO registers can still be accessed + * correctly; jobs submitted through the rings are queued but are not + * processed by the device. The MMIO states can be safely migrated to + * the target VF during the stop-copy stage and restored correctly in + * the target VF. All queued jobs can then be resumed.
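+ *
+ * In summary, the P2P arcs handled below are:
+ *
+ *   RUNNING      -> RUNNING_P2P  : qat_vfmig_suspend()
+ *   PRE_COPY     -> PRE_COPY_P2P : qat_vfmig_suspend()
+ *   RUNNING_P2P  -> RUNNING      : qat_vfmig_resume()
+ *   PRE_COPY_P2P -> PRE_COPY     : qat_vfmig_resume()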
+ */ + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { + ret = qat_vfmig_suspend(qat_vdev->mdev); + if (ret) + return ERR_PTR(ret); + return NULL; + } + + if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) { + qat_vfmig_resume(qat_vdev->mdev); + return NULL; + } + + if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) || + (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P)) + return NULL; + + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { + struct qat_vf_migration_file *migf; + + migf = qat_vf_save_device_data(qat_vdev, false); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + qat_vdev->saving_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) { + struct qat_vf_migration_file *migf; + + migf = qat_vf_resume_device_data(qat_vdev); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + qat_vdev->resuming_migf = migf; + return migf->filp; + } + + if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_RUNNING_P2P)) { + qat_vf_disable_fds(qat_vdev); + return NULL; + } + + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) || + (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { + struct qat_vf_migration_file *migf; + + migf = qat_vf_save_device_data(qat_vdev, true); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + qat_vdev->saving_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) { + struct qat_vf_migration_file *migf = qat_vdev->saving_migf; + + if (!migf) + return ERR_PTR(-EINVAL); + ret = qat_vf_save_state(qat_vdev, migf); + if (ret) + return ERR_PTR(ret); + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { + ret = qat_vf_load_device_data(qat_vdev); + if (ret) + return ERR_PTR(ret); + + qat_vf_disable_fds(qat_vdev); + return NULL; + } + + /* vfio_mig_get_next_state() does not use arcs other than the above */ + WARN_ON(true); + return ERR_PTR(-EINVAL); +} + +static void qat_vf_reset_done(struct qat_vf_core_device *qat_vdev) +{ + qat_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; + qat_vfmig_reset(qat_vdev->mdev); + qat_vf_disable_fds(qat_vdev); +} + +static struct file *qat_vf_pci_set_device_state(struct vfio_device *vdev, + enum vfio_device_mig_state new_state) +{ + struct qat_vf_core_device *qat_vdev = container_of(vdev, + struct qat_vf_core_device, core_device.vdev); + enum vfio_device_mig_state next_state; + struct file *res = NULL; + int ret; + + mutex_lock(&qat_vdev->state_mutex); + while (new_state != qat_vdev->mig_state) { + ret = vfio_mig_get_next_state(vdev, qat_vdev->mig_state, + new_state, &next_state); + if (ret) { + res = ERR_PTR(ret); + break; + } + res = qat_vf_pci_step_device_state(qat_vdev, next_state); + if (IS_ERR(res)) + break; + qat_vdev->mig_state = next_state; + if (WARN_ON(res && new_state != qat_vdev->mig_state)) { + fput(res); + res = ERR_PTR(-EINVAL); + break; + } + } + mutex_unlock(&qat_vdev->state_mutex); + + return 
res; +} + +static int qat_vf_pci_get_device_state(struct vfio_device *vdev, + enum vfio_device_mig_state *curr_state) +{ + struct qat_vf_core_device *qat_vdev = container_of(vdev, + struct qat_vf_core_device, core_device.vdev); + + mutex_lock(&qat_vdev->state_mutex); + *curr_state = qat_vdev->mig_state; + mutex_unlock(&qat_vdev->state_mutex); + + return 0; +} + +static int qat_vf_pci_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + struct qat_vf_core_device *qat_vdev = container_of(vdev, + struct qat_vf_core_device, core_device.vdev); + + mutex_lock(&qat_vdev->state_mutex); + *stop_copy_length = qat_vdev->mdev->state_size; + mutex_unlock(&qat_vdev->state_mutex); + + return 0; +} + +static const struct vfio_migration_ops qat_vf_pci_mig_ops = { + .migration_set_state = qat_vf_pci_set_device_state, + .migration_get_state = qat_vf_pci_get_device_state, + .migration_get_data_size = qat_vf_pci_get_data_size, +}; + +static void qat_vf_pci_release_dev(struct vfio_device *core_vdev) +{ + struct qat_vf_core_device *qat_vdev = container_of(core_vdev, + struct qat_vf_core_device, core_device.vdev); + + qat_vfmig_cleanup(qat_vdev->mdev); + qat_vfmig_destroy(qat_vdev->mdev); + mutex_destroy(&qat_vdev->state_mutex); + vfio_pci_core_release_dev(core_vdev); +} + +static int qat_vf_pci_init_dev(struct vfio_device *core_vdev) +{ + struct qat_vf_core_device *qat_vdev = container_of(core_vdev, + struct qat_vf_core_device, core_device.vdev); + struct qat_mig_dev *mdev; + struct pci_dev *parent; + int ret, vf_id; + + core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P | + VFIO_MIGRATION_PRE_COPY; + core_vdev->mig_ops = &qat_vf_pci_mig_ops; + + ret = vfio_pci_core_init_dev(core_vdev); + if (ret) + return ret; + + mutex_init(&qat_vdev->state_mutex); + + parent = pci_physfn(qat_vdev->core_device.pdev); + vf_id = pci_iov_vf_id(qat_vdev->core_device.pdev); + if (vf_id < 0) { + ret = -ENODEV; + goto err_rel; + } + + mdev = qat_vfmig_create(parent, vf_id); + if (IS_ERR(mdev)) { + ret = PTR_ERR(mdev); + goto err_rel; + } + + ret = qat_vfmig_init(mdev); + if (ret) + goto err_destroy; + + qat_vdev->mdev = mdev; + + return 0; + +err_destroy: + qat_vfmig_destroy(mdev); +err_rel: + vfio_pci_core_release_dev(core_vdev); + return ret; +} + +static const struct vfio_device_ops qat_vf_pci_ops = { + .name = "qat-vf-vfio-pci", + .init = qat_vf_pci_init_dev, + .release = qat_vf_pci_release_dev, + .open_device = qat_vf_pci_open_device, + .close_device = qat_vf_pci_close_device, + .ioctl = vfio_pci_core_ioctl, + .read = vfio_pci_core_read, + .write = vfio_pci_core_write, + .mmap = vfio_pci_core_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, +}; + +static struct qat_vf_core_device *qat_vf_drvdata(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *core_device = pci_get_drvdata(pdev); + + return container_of(core_device, struct qat_vf_core_device, core_device); +} + +static void qat_vf_pci_aer_reset_done(struct pci_dev *pdev) +{ + struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev); + + if (!qat_vdev->mdev) + return; + + mutex_lock(&qat_vdev->state_mutex); + qat_vf_reset_done(qat_vdev); + mutex_unlock(&qat_vdev->state_mutex); +} + +static int +qat_vf_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct device *dev = 
&pdev->dev; + struct qat_vf_core_device *qat_vdev; + int ret; + + qat_vdev = vfio_alloc_device(qat_vf_core_device, core_device.vdev, dev, &qat_vf_pci_ops); + if (IS_ERR(qat_vdev)) + return PTR_ERR(qat_vdev); + + pci_set_drvdata(pdev, &qat_vdev->core_device); + ret = vfio_pci_core_register_device(&qat_vdev->core_device); + if (ret) + goto out_put_device; + + return 0; + +out_put_device: + vfio_put_device(&qat_vdev->core_device.vdev); + return ret; +} + +static void qat_vf_vfio_pci_remove(struct pci_dev *pdev) +{ + struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev); + + vfio_pci_core_unregister_device(&qat_vdev->core_device); + vfio_put_device(&qat_vdev->core_device.vdev); +} + +static const struct pci_device_id qat_vf_vfio_pci_table[] = { + /* Intel QAT GEN4 4xxx VF device */ + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4941) }, + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4943) }, + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4945) }, + {} +}; +MODULE_DEVICE_TABLE(pci, qat_vf_vfio_pci_table); + +static const struct pci_error_handlers qat_vf_err_handlers = { + .reset_done = qat_vf_pci_aer_reset_done, + .error_detected = vfio_pci_core_aer_err_detected, +}; + +static struct pci_driver qat_vf_vfio_pci_driver = { + .name = "qat_vfio_pci", + .id_table = qat_vf_vfio_pci_table, + .probe = qat_vf_vfio_pci_probe, + .remove = qat_vf_vfio_pci_remove, + .err_handler = &qat_vf_err_handlers, + .driver_managed_dma = true, +}; +module_pci_driver(qat_vf_vfio_pci_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Xin Zeng <xin.zeng@intel.com>"); +MODULE_DESCRIPTION("QAT VFIO PCI - VFIO PCI driver with live migration support for Intel(R) QAT GEN4 device family"); +MODULE_IMPORT_NS("CRYPTO_QAT"); diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index cb5b7f865d58..5ba39f7623bb 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -71,6 +71,8 @@ static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev) case PCI_DEVICE_ID_INTEL_QAT_C62X_VF: case PCI_DEVICE_ID_INTEL_QAT_DH895XCC: case PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF: + case PCI_DEVICE_ID_INTEL_DSA_SPR0: + case PCI_DEVICE_ID_INTEL_IAX_SPR0: return true; default: return false; @@ -109,9 +111,7 @@ static int vfio_pci_open_device(struct vfio_device *core_vdev) if (ret) return ret; - if (vfio_pci_is_vga(pdev) && - pdev->vendor == PCI_VENDOR_ID_INTEL && - IS_ENABLED(CONFIG_VFIO_PCI_IGD)) { + if (vfio_pci_is_intel_display(pdev)) { ret = vfio_pci_igd_init(vdev); if (ret && ret != -ENODEV) { pci_warn(pdev, "Failed to setup Intel IGD regions\n"); @@ -142,6 +142,8 @@ static const struct vfio_device_ops vfio_pci_ops = { .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, .detach_ioas = vfio_iommufd_physical_detach_ioas, + .pasid_attach_ioas = vfio_iommufd_physical_pasid_attach_ioas, + .pasid_detach_ioas = vfio_iommufd_physical_pasid_detach_ioas, }; static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 97422aafaa7b..8f02f236b5b4 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -313,6 +313,10 @@ static int vfio_virt_config_read(struct vfio_pci_core_device *vdev, int pos, return count; } +static struct perm_bits direct_ro_perms = { + .readfn = vfio_direct_config_read, +}; + /* Default capability regions to read-only, no-virtualization */ static struct perm_bits 
cap_perms[PCI_CAP_ID_MAX + 1] = { [0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } @@ -507,13 +511,13 @@ static void vfio_bar_fixup(struct vfio_pci_core_device *vdev) mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1); mask |= PCI_ROM_ADDRESS_ENABLE; *vbar &= cpu_to_le32((u32)mask); - } else if (pdev->resource[PCI_ROM_RESOURCE].flags & - IORESOURCE_ROM_SHADOW) { - mask = ~(0x20000 - 1); + } else if (pdev->rom && pdev->romlen) { + mask = ~(roundup_pow_of_two(pdev->romlen) - 1); mask |= PCI_ROM_ADDRESS_ENABLE; *vbar &= cpu_to_le32((u32)mask); - } else + } else { *vbar = 0; + } vdev->bardirty = false; } @@ -1385,11 +1389,12 @@ static int vfio_ext_cap_len(struct vfio_pci_core_device *vdev, u16 ecap, u16 epo switch (ecap) { case PCI_EXT_CAP_ID_VNDR: - ret = pci_read_config_dword(pdev, epos + PCI_VSEC_HDR, &dword); + ret = pci_read_config_dword(pdev, epos + PCI_VNDR_HEADER, + &dword); if (ret) return pcibios_err_to_errno(ret); - return dword >> PCI_VSEC_HDR_LEN_SHIFT; + return PCI_VNDR_HEADER_LEN(dword); case PCI_EXT_CAP_ID_VC: case PCI_EXT_CAP_ID_VC9: case PCI_EXT_CAP_ID_MFVC: @@ -1809,7 +1814,8 @@ int vfio_config_init(struct vfio_pci_core_device *vdev) cpu_to_le16(PCI_COMMAND_MEMORY); } - if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || vdev->nointx) + if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || vdev->nointx || + !vdev->pdev->irq || vdev->pdev->irq == IRQ_NOTCONNECTED) vconfig[PCI_INTERRUPT_PIN] = 0; ret = vfio_cap_init(vdev); @@ -1897,9 +1903,17 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_core_device *vdev, char __user cap_start = *ppos; } else { if (*ppos >= PCI_CFG_SPACE_SIZE) { - WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX); + /* + * We can get a cap_id that exceeds PCI_EXT_CAP_ID_MAX + * if we're hiding an unknown capability at the start + * of the extended capability list. Use default, ro + * access, which will virtualize the id and next values. + */ + if (cap_id > PCI_EXT_CAP_ID_MAX) + perm = &direct_ro_perms; + else + perm = &ecap_perms[cap_id]; - perm = &ecap_perms[cap_id]; cap_start = vfio_find_cap_start(vdev, *ppos); } else { WARN_ON(cap_id > PCI_CAP_ID_MAX); diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index d94d61b92c1a..6328c3a05bcd 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -20,6 +20,7 @@ #include <linux/mutex.h> #include <linux/notifier.h> #include <linux/pci.h> +#include <linux/pfn_t.h> #include <linux/pm_runtime.h> #include <linux/slab.h> #include <linux/types.h> @@ -57,11 +58,6 @@ struct vfio_pci_vf_token { int users; }; -struct vfio_pci_mmap_vma { - struct vm_area_struct *vma; - struct list_head vma_next; -}; - static inline bool vfio_vga_disabled(void) { #ifdef CONFIG_VFIO_PCI_VGA @@ -120,7 +116,7 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev) res = &vdev->pdev->resource[bar]; - if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP)) + if (vdev->pdev->non_mappable_bars) goto no_mmap; if (!(res->flags & IORESOURCE_MEM)) @@ -731,15 +727,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_finish_enable); static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type) { if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { - u8 pin; - - if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || - vdev->nointx || vdev->pdev->is_virtfn) - return 0; - - pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin); - - return pin ? 1 : 0; + return vdev->vconfig[PCI_INTERRUPT_PIN] ? 
1 : 0; } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { u8 pos; u16 flags; @@ -778,25 +766,26 @@ static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) } struct vfio_pci_fill_info { - struct vfio_pci_dependent_device __user *devices; - struct vfio_pci_dependent_device __user *devices_end; struct vfio_device *vdev; + struct vfio_pci_dependent_device *devices; + int nr_devices; u32 count; u32 flags; }; static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) { - struct vfio_pci_dependent_device info = { - .segment = pci_domain_nr(pdev->bus), - .bus = pdev->bus->number, - .devfn = pdev->devfn, - }; + struct vfio_pci_dependent_device *info; struct vfio_pci_fill_info *fill = data; - fill->count++; - if (fill->devices >= fill->devices_end) - return 0; + /* The topology changed since we counted devices */ + if (fill->count >= fill->nr_devices) + return -EAGAIN; + + info = &fill->devices[fill->count++]; + info->segment = pci_domain_nr(pdev->bus); + info->bus = pdev->bus->number; + info->devfn = pdev->devfn; if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) { struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev); @@ -809,19 +798,19 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) */ vdev = vfio_find_device_in_devset(dev_set, &pdev->dev); if (!vdev) { - info.devid = VFIO_PCI_DEVID_NOT_OWNED; + info->devid = VFIO_PCI_DEVID_NOT_OWNED; } else { int id = vfio_iommufd_get_dev_id(vdev, iommufd); if (id > 0) - info.devid = id; + info->devid = id; else if (id == -ENOENT) - info.devid = VFIO_PCI_DEVID_OWNED; + info->devid = VFIO_PCI_DEVID_OWNED; else - info.devid = VFIO_PCI_DEVID_NOT_OWNED; + info->devid = VFIO_PCI_DEVID_NOT_OWNED; } /* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */ - if (info.devid == VFIO_PCI_DEVID_NOT_OWNED) + if (info->devid == VFIO_PCI_DEVID_NOT_OWNED) fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED; } else { struct iommu_group *iommu_group; @@ -830,13 +819,10 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) if (!iommu_group) return -EPERM; /* Cannot reset non-isolated devices */ - info.group_id = iommu_group_id(iommu_group); + info->group_id = iommu_group_id(iommu_group); iommu_group_put(iommu_group); } - if (copy_to_user(fill->devices, &info, sizeof(info))) - return -EFAULT; - fill->devices++; return 0; } @@ -1060,31 +1046,27 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); info.flags = 0; + info.size = 0; - /* Report the BAR size, not the ROM size */ - info.size = pci_resource_len(pdev, info.index); - if (!info.size) { - /* Shadow ROMs appear as PCI option ROMs */ - if (pdev->resource[PCI_ROM_RESOURCE].flags & - IORESOURCE_ROM_SHADOW) - info.size = 0x20000; - else - break; - } - - /* - * Is it really there? Enable memory decode for implicit access - * in pci_map_rom(). - */ - cmd = vfio_pci_memory_lock_and_enable(vdev); - io = pci_map_rom(pdev, &size); - if (io) { + if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { + /* + * Check ROM content is valid. Need to enable memory + * decode for ROM access in pci_map_rom(). + */ + cmd = vfio_pci_memory_lock_and_enable(vdev); + io = pci_map_rom(pdev, &size); + if (io) { + info.flags = VFIO_REGION_INFO_FLAG_READ; + /* Report the BAR size, not the ROM size. 
*/ + info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE); + pci_unmap_rom(pdev, io); + } + vfio_pci_memory_unlock_and_restore(vdev, cmd); + } else if (pdev->rom && pdev->romlen) { info.flags = VFIO_REGION_INFO_FLAG_READ; - pci_unmap_rom(pdev, io); - } else { - info.size = 0; + /* Report BAR size as power of two. */ + info.size = roundup_pow_of_two(pdev->romlen); } - vfio_pci_memory_unlock_and_restore(vdev, cmd); break; } @@ -1258,10 +1240,11 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( { unsigned long minsz = offsetofend(struct vfio_pci_hot_reset_info, count); + struct vfio_pci_dependent_device *devices = NULL; struct vfio_pci_hot_reset_info hdr; struct vfio_pci_fill_info fill = {}; bool slot = false; - int ret = 0; + int ret, count = 0; if (copy_from_user(&hdr, arg, minsz)) return -EFAULT; @@ -1277,9 +1260,26 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( else if (pci_probe_reset_bus(vdev->pdev->bus)) return -ENODEV; - fill.devices = arg->devices; - fill.devices_end = arg->devices + - (hdr.argsz - sizeof(hdr)) / sizeof(arg->devices[0]); + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, + &count, slot); + if (ret) + return ret; + + if (WARN_ON(!count)) /* Should always be at least one */ + return -ERANGE; + + if (count > (hdr.argsz - sizeof(hdr)) / sizeof(*devices)) { + hdr.count = count; + ret = -ENOSPC; + goto header; + } + + devices = kcalloc(count, sizeof(*devices), GFP_KERNEL); + if (!devices) + return -ENOMEM; + + fill.devices = devices; + fill.nr_devices = count; fill.vdev = &vdev->vdev; if (vfio_device_cdev_opened(&vdev->vdev)) @@ -1291,21 +1291,28 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( &fill, slot); mutex_unlock(&vdev->vdev.dev_set->lock); if (ret) - return ret; + goto out; + + if (copy_to_user(arg->devices, devices, + sizeof(*devices) * fill.count)) { + ret = -EFAULT; + goto out; + } hdr.count = fill.count; hdr.flags = fill.flags; - if (copy_to_user(arg, &hdr, minsz)) - return -EFAULT; - if (fill.count > fill.devices - arg->devices) - return -ENOSPC; - return 0; +header: + if (copy_to_user(arg, &hdr, minsz)) + ret = -EFAULT; +out: + kfree(devices); + return ret; } static int vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev, - int array_count, bool slot, + u32 array_count, bool slot, struct vfio_pci_hot_reset __user *arg) { int32_t *group_fds; @@ -1587,100 +1594,20 @@ ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *bu } EXPORT_SYMBOL_GPL(vfio_pci_core_write); -/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ -static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try) +static void vfio_pci_zap_bars(struct vfio_pci_core_device *vdev) { - struct vfio_pci_mmap_vma *mmap_vma, *tmp; - - /* - * Lock ordering: - * vma_lock is nested under mmap_lock for vm_ops callback paths. - * The memory_lock semaphore is used by both code paths calling - * into this function to zap vmas and the vm_ops.fault callback - * to protect the memory enable state of the device. - * - * When zapping vmas we need to maintain the mmap_lock => vma_lock - * ordering, which requires using vma_lock to walk vma_list to - * acquire an mm, then dropping vma_lock to get the mmap_lock and - * reacquiring vma_lock. This logic is derived from similar - * requirements in uverbs_user_mmap_disassociate(). - * - * mmap_lock must always be the top-level lock when it is taken. 
- * Therefore we can only hold the memory_lock write lock when - * vma_list is empty, as we'd need to take mmap_lock to clear - * entries. vma_list can only be guaranteed empty when holding - * vma_lock, thus memory_lock is nested under vma_lock. - * - * This enables the vm_ops.fault callback to acquire vma_lock, - * followed by memory_lock read lock, while already holding - * mmap_lock without risk of deadlock. - */ - while (1) { - struct mm_struct *mm = NULL; - - if (try) { - if (!mutex_trylock(&vdev->vma_lock)) - return 0; - } else { - mutex_lock(&vdev->vma_lock); - } - while (!list_empty(&vdev->vma_list)) { - mmap_vma = list_first_entry(&vdev->vma_list, - struct vfio_pci_mmap_vma, - vma_next); - mm = mmap_vma->vma->vm_mm; - if (mmget_not_zero(mm)) - break; - - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - mm = NULL; - } - if (!mm) - return 1; - mutex_unlock(&vdev->vma_lock); - - if (try) { - if (!mmap_read_trylock(mm)) { - mmput(mm); - return 0; - } - } else { - mmap_read_lock(mm); - } - if (try) { - if (!mutex_trylock(&vdev->vma_lock)) { - mmap_read_unlock(mm); - mmput(mm); - return 0; - } - } else { - mutex_lock(&vdev->vma_lock); - } - list_for_each_entry_safe(mmap_vma, tmp, - &vdev->vma_list, vma_next) { - struct vm_area_struct *vma = mmap_vma->vma; - - if (vma->vm_mm != mm) - continue; + struct vfio_device *core_vdev = &vdev->vdev; + loff_t start = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX); + loff_t end = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX); + loff_t len = end - start; - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - - zap_vma_ptes(vma, vma->vm_start, - vma->vm_end - vma->vm_start); - } - mutex_unlock(&vdev->vma_lock); - mmap_read_unlock(mm); - mmput(mm); - } + unmap_mapping_range(core_vdev->inode->i_mapping, start, len, true); } void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev) { - vfio_pci_zap_and_vma_lock(vdev, false); down_write(&vdev->memory_lock); - mutex_unlock(&vdev->vma_lock); + vfio_pci_zap_bars(vdev); } u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev) @@ -1702,100 +1629,83 @@ void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 c up_write(&vdev->memory_lock); } -/* Caller holds vma_lock */ -static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev, - struct vm_area_struct *vma) -{ - struct vfio_pci_mmap_vma *mmap_vma; - - mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL_ACCOUNT); - if (!mmap_vma) - return -ENOMEM; - - mmap_vma->vma = vma; - list_add(&mmap_vma->vma_next, &vdev->vma_list); - - return 0; -} - -/* - * Zap mmaps on open so that we can fault them in on access and therefore - * our vma_list only tracks mappings accessed since last zap. 
- */ -static void vfio_pci_mmap_open(struct vm_area_struct *vma) -{ - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); -} - -static void vfio_pci_mmap_close(struct vm_area_struct *vma) +static unsigned long vma_to_pfn(struct vm_area_struct *vma) { struct vfio_pci_core_device *vdev = vma->vm_private_data; - struct vfio_pci_mmap_vma *mmap_vma; + int index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + u64 pgoff; - mutex_lock(&vdev->vma_lock); - list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { - if (mmap_vma->vma == vma) { - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - break; - } - } - mutex_unlock(&vdev->vma_lock); + pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff; } -static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) +static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, + unsigned int order) { struct vm_area_struct *vma = vmf->vma; struct vfio_pci_core_device *vdev = vma->vm_private_data; - struct vfio_pci_mmap_vma *mmap_vma; - vm_fault_t ret = VM_FAULT_NOPAGE; - - mutex_lock(&vdev->vma_lock); - down_read(&vdev->memory_lock); + unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1); + unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + unsigned long pfn = vma_to_pfn(vma) + pgoff; + vm_fault_t ret = VM_FAULT_SIGBUS; - /* - * Memory region cannot be accessed if the low power feature is engaged - * or memory access is disabled. - */ - if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) { - ret = VM_FAULT_SIGBUS; - goto up_out; + if (order && (addr < vma->vm_start || + addr + (PAGE_SIZE << order) > vma->vm_end || + pfn & ((1 << order) - 1))) { + ret = VM_FAULT_FALLBACK; + goto out; } - /* - * We populate the whole vma on fault, so we need to test whether - * the vma has already been mapped, such as for concurrent faults - * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if - * we ask it to fill the same range again. 
- */ - list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { - if (mmap_vma->vma == vma) - goto up_out; - } + down_read(&vdev->memory_lock); - if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, - vma->vm_end - vma->vm_start, - vma->vm_page_prot)) { - ret = VM_FAULT_SIGBUS; - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); - goto up_out; - } + if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) + goto out_unlock; - if (__vfio_pci_add_vma(vdev, vma)) { - ret = VM_FAULT_OOM; - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); + switch (order) { + case 0: + ret = vmf_insert_pfn(vma, vmf->address, pfn); + break; +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP + case PMD_ORDER: + ret = vmf_insert_pfn_pmd(vmf, + __pfn_to_pfn_t(pfn, PFN_DEV), false); + break; +#endif +#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP + case PUD_ORDER: + ret = vmf_insert_pfn_pud(vmf, + __pfn_to_pfn_t(pfn, PFN_DEV), false); + break; +#endif + default: + ret = VM_FAULT_FALLBACK; } -up_out: +out_unlock: up_read(&vdev->memory_lock); - mutex_unlock(&vdev->vma_lock); +out: + dev_dbg_ratelimited(&vdev->pdev->dev, + "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n", + __func__, order, + vma->vm_pgoff >> + (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT), + pgoff, (unsigned int)ret); + return ret; } +static vm_fault_t vfio_pci_mmap_page_fault(struct vm_fault *vmf) +{ + return vfio_pci_mmap_huge_fault(vmf, 0); +} + static const struct vm_operations_struct vfio_pci_mmap_ops = { - .open = vfio_pci_mmap_open, - .close = vfio_pci_mmap_close, - .fault = vfio_pci_mmap_fault, + .fault = vfio_pci_mmap_page_fault, +#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP + .huge_fault = vfio_pci_mmap_huge_fault, +#endif }; int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) @@ -1857,11 +1767,12 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma vma->vm_private_data = vdev; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); /* - * See remap_pfn_range(), called from vfio_pci_fault() but we can't - * change vm_flags within the fault handler. Set them now. + * Set vm_flags now, they should not be changed in the fault handler. + * We want the same flags and page protection (decrypted above) as + * io_remap_pfn_range() would set. * * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64, * allowing KVM stage 2 device mapping attributes to use Normal-NC @@ -2179,8 +2090,6 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) mutex_init(&vdev->ioeventfds_lock); INIT_LIST_HEAD(&vdev->dummy_resources_list); INIT_LIST_HEAD(&vdev->ioeventfds_list); - mutex_init(&vdev->vma_lock); - INIT_LIST_HEAD(&vdev->vma_list); INIT_LIST_HEAD(&vdev->sriov_pfs_item); init_rwsem(&vdev->memory_lock); xa_init(&vdev->ctx); @@ -2196,7 +2105,6 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev) mutex_destroy(&vdev->igate); mutex_destroy(&vdev->ioeventfds_lock); - mutex_destroy(&vdev->vma_lock); kfree(vdev->region); kfree(vdev->pm_save); } @@ -2474,26 +2382,15 @@ unwind: return ret; } -/* - * We need to get memory_lock for each device, but devices can share mmap_lock, - * therefore we need to zap and hold the vma_lock for each device, and only then - * get each memory_lock. 
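The fallback condition in vfio_pci_mmap_huge_fault() above is pure alignment arithmetic; restated as a self-contained sketch (hypothetical helper, same semantics as the order check in the patch):

static bool huge_fault_fits(unsigned long addr, unsigned long pfn,
			    unsigned int order,
			    unsigned long vm_start, unsigned long vm_end)
{
	/* The order-sized, aligned range must lie entirely inside the VMA... */
	if (addr < vm_start || addr + (PAGE_SIZE << order) > vm_end)
		return false;

	/* ...and the PFN must be naturally aligned for the mapping size */
	return !(pfn & ((1UL << order) - 1));
}

Order-0 faults skip this test entirely, as in the patch, since a single page always fits and vmf_insert_pfn() handles it directly.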
- */ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, struct vfio_pci_group_info *groups, struct iommufd_ctx *iommufd_ctx) { - struct vfio_pci_core_device *cur_mem; - struct vfio_pci_core_device *cur_vma; - struct vfio_pci_core_device *cur; + struct vfio_pci_core_device *vdev; struct pci_dev *pdev; - bool is_mem = true; int ret; mutex_lock(&dev_set->lock); - cur_mem = list_first_entry(&dev_set->device_list, - struct vfio_pci_core_device, - vdev.dev_set_list); pdev = vfio_pci_dev_set_resettable(dev_set); if (!pdev) { @@ -2510,7 +2407,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, if (ret) goto err_unlock; - list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) { + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) { bool owned; /* @@ -2534,38 +2431,38 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, * Otherwise, reset is not allowed. */ if (iommufd_ctx) { - int devid = vfio_iommufd_get_dev_id(&cur_vma->vdev, + int devid = vfio_iommufd_get_dev_id(&vdev->vdev, iommufd_ctx); owned = (devid > 0 || devid == -ENOENT); } else { - owned = vfio_dev_in_groups(&cur_vma->vdev, groups); + owned = vfio_dev_in_groups(&vdev->vdev, groups); } if (!owned) { ret = -EINVAL; - goto err_undo; + break; } /* - * Locking multiple devices is prone to deadlock, runaway and - * unwind if we hit contention. + * Take the memory write lock for each device and zap BAR + * mappings to prevent the user accessing the device while in + * reset. Locking multiple devices is prone to deadlock, + * runaway and unwind if we hit contention. */ - if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) { + if (!down_write_trylock(&vdev->memory_lock)) { ret = -EBUSY; - goto err_undo; + break; } + + vfio_pci_zap_bars(vdev); } - cur_vma = NULL; - list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) { - if (!down_write_trylock(&cur_mem->memory_lock)) { - ret = -EBUSY; - goto err_undo; - } - mutex_unlock(&cur_mem->vma_lock); + if (!list_entry_is_head(vdev, + &dev_set->device_list, vdev.dev_set_list)) { + vdev = list_prev_entry(vdev, vdev.dev_set_list); + goto err_undo; } - cur_mem = NULL; /* * The pci_reset_bus() will reset all the devices in the bus. @@ -2576,25 +2473,22 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, * cause the PCI config space reset without restoring the original * state (saved locally in 'vdev->pm_save'). 
*/ - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) - vfio_pci_set_power_state(cur, PCI_D0); + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) + vfio_pci_set_power_state(vdev, PCI_D0); ret = pci_reset_bus(pdev); + vdev = list_last_entry(&dev_set->device_list, + struct vfio_pci_core_device, vdev.dev_set_list); + err_undo: - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { - if (cur == cur_mem) - is_mem = false; - if (cur == cur_vma) - break; - if (is_mem) - up_write(&cur->memory_lock); - else - mutex_unlock(&cur->vma_lock); - } + list_for_each_entry_from_reverse(vdev, &dev_set->device_list, + vdev.dev_set_list) + up_write(&vdev->memory_lock); + + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) + pm_runtime_put(&vdev->pdev->dev); - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) - pm_runtime_put(&cur->pdev->dev); err_unlock: mutex_unlock(&dev_set->lock); return ret; diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c index dd70e2431bd7..ef490a4545f4 100644 --- a/drivers/vfio/pci/vfio_pci_igd.c +++ b/drivers/vfio/pci/vfio_pci_igd.c @@ -435,6 +435,12 @@ static int vfio_pci_igd_cfg_init(struct vfio_pci_core_device *vdev) return 0; } +bool vfio_pci_is_intel_display(struct pci_dev *pdev) +{ + return (pdev->vendor == PCI_VENDOR_ID_INTEL) && + ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY); +} + int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) { int ret; diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index fb5392b749ff..565966351dfa 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -23,11 +23,12 @@ #include "vfio_pci_priv.h" struct vfio_pci_irq_ctx { - struct eventfd_ctx *trigger; - struct virqfd *unmask; - struct virqfd *mask; - char *name; - bool masked; + struct vfio_pci_core_device *vdev; + struct eventfd_ctx *trigger; + struct virqfd *unmask; + struct virqfd *mask; + char *name; + bool masked; struct irq_bypass_producer producer; }; @@ -84,19 +85,14 @@ vfio_irq_ctx_alloc(struct vfio_pci_core_device *vdev, unsigned long index) /* * INTx */ -static void vfio_send_intx_eventfd(void *opaque, void *unused) +static void vfio_send_intx_eventfd(void *opaque, void *data) { struct vfio_pci_core_device *vdev = opaque; if (likely(is_intx(vdev) && !vdev->virq_disabled)) { - struct vfio_pci_irq_ctx *ctx; - struct eventfd_ctx *trigger; + struct vfio_pci_irq_ctx *ctx = data; + struct eventfd_ctx *trigger = READ_ONCE(ctx->trigger); - ctx = vfio_irq_ctx_get(vdev, 0); - if (WARN_ON_ONCE(!ctx)) - return; - - trigger = READ_ONCE(ctx->trigger); if (likely(trigger)) eventfd_signal(trigger); } @@ -166,11 +162,11 @@ bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) * a signal is necessary, which can then be handled via a work queue * or directly depending on the caller. 
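The hot reset rework above replaces the vma_lock juggling with a simpler lock-all-or-unwind pattern: write-trylock each device's memory_lock, zap its BAR mappings, and on contention release the locks already taken in reverse order. Condensed into a sketch (hypothetical helper using the same list conventions):

static int lock_all_or_unwind(struct vfio_device_set *dev_set)
{
	struct vfio_pci_core_device *vdev;

	list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) {
		if (!down_write_trylock(&vdev->memory_lock))
			goto unwind;
		vfio_pci_zap_bars(vdev);
	}

	return 0;

unwind:
	/* vdev itself was not locked; release everything before it */
	list_for_each_entry_continue_reverse(vdev, &dev_set->device_list,
					     vdev.dev_set_list)
		up_write(&vdev->memory_lock);

	return -EBUSY;
}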
*/ -static int vfio_pci_intx_unmask_handler(void *opaque, void *unused) +static int vfio_pci_intx_unmask_handler(void *opaque, void *data) { struct vfio_pci_core_device *vdev = opaque; struct pci_dev *pdev = vdev->pdev; - struct vfio_pci_irq_ctx *ctx; + struct vfio_pci_irq_ctx *ctx = data; unsigned long flags; int ret = 0; @@ -186,10 +182,6 @@ static int vfio_pci_intx_unmask_handler(void *opaque, void *unused) goto out_unlock; } - ctx = vfio_irq_ctx_get(vdev, 0); - if (WARN_ON_ONCE(!ctx)) - goto out_unlock; - if (ctx->masked && !vdev->virq_disabled) { /* * A pending interrupt here would immediately trigger, @@ -213,10 +205,12 @@ out_unlock: static void __vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev) { + struct vfio_pci_irq_ctx *ctx = vfio_irq_ctx_get(vdev, 0); + lockdep_assert_held(&vdev->igate); - if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0) - vfio_send_intx_eventfd(vdev, NULL); + if (vfio_pci_intx_unmask_handler(vdev, ctx) > 0) + vfio_send_intx_eventfd(vdev, ctx); } void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev) @@ -228,15 +222,11 @@ void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev) static irqreturn_t vfio_intx_handler(int irq, void *dev_id) { - struct vfio_pci_core_device *vdev = dev_id; - struct vfio_pci_irq_ctx *ctx; + struct vfio_pci_irq_ctx *ctx = dev_id; + struct vfio_pci_core_device *vdev = ctx->vdev; unsigned long flags; int ret = IRQ_NONE; - ctx = vfio_irq_ctx_get(vdev, 0); - if (WARN_ON_ONCE(!ctx)) - return ret; - spin_lock_irqsave(&vdev->irqlock, flags); if (!vdev->pci_2_3) { @@ -252,7 +242,7 @@ static irqreturn_t vfio_intx_handler(int irq, void *dev_id) spin_unlock_irqrestore(&vdev->irqlock, flags); if (ret == IRQ_HANDLED) - vfio_send_intx_eventfd(vdev, NULL); + vfio_send_intx_eventfd(vdev, ctx); return ret; } @@ -269,7 +259,7 @@ static int vfio_intx_enable(struct vfio_pci_core_device *vdev, if (!is_irq_none(vdev)) return -EINVAL; - if (!pdev->irq) + if (!pdev->irq || pdev->irq == IRQ_NOTCONNECTED) return -ENODEV; name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", pci_name(pdev)); @@ -277,11 +267,14 @@ static int vfio_intx_enable(struct vfio_pci_core_device *vdev, return -ENOMEM; ctx = vfio_irq_ctx_alloc(vdev, 0); - if (!ctx) + if (!ctx) { + kfree(name); return -ENOMEM; + } ctx->name = name; ctx->trigger = trigger; + ctx->vdev = vdev; /* * Fill the initial masked state based on virq_disabled. 
After @@ -312,7 +305,7 @@ static int vfio_intx_enable(struct vfio_pci_core_device *vdev, vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX; ret = request_irq(pdev->irq, vfio_intx_handler, - irqflags, ctx->name, vdev); + irqflags, ctx->name, ctx); if (ret) { vdev->irq_type = VFIO_PCI_NUM_IRQS; kfree(name); @@ -358,7 +351,7 @@ static void vfio_intx_disable(struct vfio_pci_core_device *vdev) if (ctx) { vfio_virqfd_disable(&ctx->unmask); vfio_virqfd_disable(&ctx->mask); - free_irq(pdev->irq, vdev); + free_irq(pdev->irq, ctx); if (ctx->trigger) eventfd_ctx_put(ctx->trigger); kfree(ctx->name); @@ -606,7 +599,7 @@ static int vfio_pci_set_intx_unmask(struct vfio_pci_core_device *vdev, if (fd >= 0) return vfio_virqfd_enable((void *) vdev, vfio_pci_intx_unmask_handler, - vfio_send_intx_eventfd, NULL, + vfio_send_intx_eventfd, ctx, &ctx->unmask, fd); vfio_virqfd_disable(&ctx->unmask); @@ -673,11 +666,11 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev, return -EINVAL; if (flags & VFIO_IRQ_SET_DATA_NONE) { - vfio_send_intx_eventfd(vdev, NULL); + vfio_send_intx_eventfd(vdev, vfio_irq_ctx_get(vdev, 0)); } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { uint8_t trigger = *(uint8_t *)data; if (trigger) - vfio_send_intx_eventfd(vdev, NULL); + vfio_send_intx_eventfd(vdev, vfio_irq_ctx_get(vdev, 0)); } return 0; } diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 5e4fa69aee16..a9972eacb293 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -67,8 +67,14 @@ void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd); #ifdef CONFIG_VFIO_PCI_IGD +bool vfio_pci_is_intel_display(struct pci_dev *pdev); int vfio_pci_igd_init(struct vfio_pci_core_device *vdev); #else +static inline bool vfio_pci_is_intel_display(struct pci_dev *pdev) +{ + return false; +} + static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) { return -ENODEV; diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 03b8f7ada1ac..6192788c8ba3 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -16,6 +16,7 @@ #include <linux/io.h> #include <linux/vfio.h> #include <linux/vgaarb.h> +#include <linux/io-64-nonatomic-lo-hi.h> #include "vfio_pci_priv.h" @@ -61,9 +62,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_iowrite##size); VFIO_IOWRITE(8) VFIO_IOWRITE(16) VFIO_IOWRITE(32) -#ifdef iowrite64 VFIO_IOWRITE(64) -#endif #define VFIO_IOREAD(size) \ int vfio_pci_core_ioread##size(struct vfio_pci_core_device *vdev, \ @@ -89,6 +88,43 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_ioread##size); VFIO_IOREAD(8) VFIO_IOREAD(16) VFIO_IOREAD(32) +VFIO_IOREAD(64) + +#define VFIO_IORDWR(size) \ +static int vfio_pci_iordwr##size(struct vfio_pci_core_device *vdev,\ + bool iswrite, bool test_mem, \ + void __iomem *io, char __user *buf, \ + loff_t off, size_t *filled) \ +{ \ + u##size val; \ + int ret; \ + \ + if (iswrite) { \ + if (copy_from_user(&val, buf, sizeof(val))) \ + return -EFAULT; \ + \ + ret = vfio_pci_core_iowrite##size(vdev, test_mem, \ + val, io + off); \ + if (ret) \ + return ret; \ + } else { \ + ret = vfio_pci_core_ioread##size(vdev, test_mem, \ + &val, io + off); \ + if (ret) \ + return ret; \ + \ + if (copy_to_user(buf, &val, sizeof(val))) \ + return -EFAULT; \ + } \ + \ + *filled = sizeof(val); \ + return 0; \ +} \ + +VFIO_IORDWR(8) +VFIO_IORDWR(16) +VFIO_IORDWR(32) +VFIO_IORDWR(64) /* * Read or write from an __iomem region (MMIO or I/O port) with an excluded @@ -114,72 +150,31 @@ 
ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, else fillable = 0; + if (fillable >= 8 && !(off % 8)) { + ret = vfio_pci_iordwr64(vdev, iswrite, test_mem, + io, buf, off, &filled); + if (ret) + return ret; + + } else if (fillable >= 4 && !(off % 4)) { - u32 val; - - if (iswrite) { - if (copy_from_user(&val, buf, 4)) - return -EFAULT; - - ret = vfio_pci_core_iowrite32(vdev, test_mem, - val, io + off); - if (ret) - return ret; - } else { - ret = vfio_pci_core_ioread32(vdev, test_mem, - &val, io + off); - if (ret) - return ret; - - if (copy_to_user(buf, &val, 4)) - return -EFAULT; - } + ret = vfio_pci_iordwr32(vdev, iswrite, test_mem, + io, buf, off, &filled); + if (ret) + return ret; - filled = 4; } else if (fillable >= 2 && !(off % 2)) { - u16 val; - - if (iswrite) { - if (copy_from_user(&val, buf, 2)) - return -EFAULT; - - ret = vfio_pci_core_iowrite16(vdev, test_mem, - val, io + off); - if (ret) - return ret; - } else { - ret = vfio_pci_core_ioread16(vdev, test_mem, - &val, io + off); - if (ret) - return ret; - - if (copy_to_user(buf, &val, 2)) - return -EFAULT; - } + ret = vfio_pci_iordwr16(vdev, iswrite, test_mem, + io, buf, off, &filled); + if (ret) + return ret; - filled = 2; } else if (fillable) { - u8 val; - - if (iswrite) { - if (copy_from_user(&val, buf, 1)) - return -EFAULT; - - ret = vfio_pci_core_iowrite8(vdev, test_mem, - val, io + off); - if (ret) - return ret; - } else { - ret = vfio_pci_core_ioread8(vdev, test_mem, - &val, io + off); - if (ret) - return ret; - - if (copy_to_user(buf, &val, 1)) - return -EFAULT; - } + ret = vfio_pci_iordwr8(vdev, iswrite, test_mem, + io, buf, off, &filled); + if (ret) + return ret; - filled = 1; } else { /* Fill reads with -1, drop writes */ filled = min(count, (size_t)(x_end - off)); @@ -242,9 +237,8 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, if (pci_resource_start(pdev, bar)) end = pci_resource_len(pdev, bar); - else if (bar == PCI_ROM_RESOURCE && - pdev->resource[bar].flags & IORESOURCE_ROM_SHADOW) - end = 0x20000; + else if (bar == PCI_ROM_RESOURCE && pdev->rom && pdev->romlen) + end = roundup_pow_of_two(pdev->romlen); else return -EINVAL; @@ -259,11 +253,14 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, * excluded range at the end of the actual ROM. This makes * filling large ROM BARs much faster. 
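The do_io_rw() dispatch above always picks the widest naturally aligned access that still fits in the fillable span; the selection rule, restated as a standalone sketch (hypothetical helper):

static size_t vfio_rw_width(loff_t off, size_t fillable)
{
	if (fillable >= 8 && !(off % 8))
		return 8;
	if (fillable >= 4 && !(off % 4))
		return 4;
	if (fillable >= 2 && !(off % 2))
		return 2;

	return fillable ? 1 : 0;	/* 0: nothing fillable at this offset */
}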
*/ - io = pci_map_rom(pdev, &x_start); - if (!io) { - done = -ENOMEM; - goto out; + if (pci_resource_start(pdev, bar)) { + io = pci_map_rom(pdev, &x_start); + } else { + io = ioremap(pdev->rom, pdev->romlen); + x_start = pdev->romlen; } + if (!io) + return -ENOMEM; x_end = end; } else { int ret = vfio_pci_core_setup_barmap(vdev, bar); @@ -286,8 +283,13 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, if (done >= 0) *ppos += done; - if (bar == PCI_ROM_RESOURCE) - pci_unmap_rom(pdev, io); + if (bar == PCI_ROM_RESOURCE) { + if (pci_resource_start(pdev, bar)) + pci_unmap_rom(pdev, io); + else + iounmap(io); + } + out: return done; } @@ -379,12 +381,10 @@ static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd, vfio_pci_core_iowrite32(ioeventfd->vdev, test_mem, ioeventfd->data, ioeventfd->addr); break; -#ifdef iowrite64 case 8: vfio_pci_core_iowrite64(ioeventfd->vdev, test_mem, ioeventfd->data, ioeventfd->addr); break; -#endif } } @@ -438,10 +438,8 @@ int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, pos >= vdev->msix_offset + vdev->msix_size)) return -EINVAL; -#ifndef iowrite64 if (count == 8) return -EINVAL; -#endif ret = vfio_pci_core_setup_barmap(vdev, bar); if (ret) diff --git a/drivers/vfio/pci/virtio/Kconfig b/drivers/vfio/pci/virtio/Kconfig index bd80eca4a196..33e04e65bec6 100644 --- a/drivers/vfio/pci/virtio/Kconfig +++ b/drivers/vfio/pci/virtio/Kconfig @@ -1,15 +1,31 @@ # SPDX-License-Identifier: GPL-2.0-only config VIRTIO_VFIO_PCI - tristate "VFIO support for VIRTIO NET PCI devices" - depends on VIRTIO_PCI && VIRTIO_PCI_ADMIN_LEGACY - select VFIO_PCI_CORE - help - This provides support for exposing VIRTIO NET VF devices which support - legacy IO access, using the VFIO framework that can work with a legacy - virtio driver in the guest. - Based on PCIe spec, VFs do not support I/O Space. - As of that this driver emulates I/O BAR in software to let a VF be - seen as a transitional device by its users and let it work with - a legacy driver. - - If you don't know what to do here, say N. + tristate "VFIO support for VIRTIO PCI VF devices" + depends on VIRTIO_PCI + select VFIO_PCI_CORE + help + This provides migration support for VIRTIO NET and BLOCK PCI VF + devices using the VFIO framework. Migration support requires the + SR-IOV PF device to support specific VIRTIO extensions; + otherwise this driver provides no additional functionality + beyond vfio-pci. + + Migration support in this driver relies on dirty page tracking + provided by the IOMMU hardware and exposed through IOMMUFD; any + other use case is not recommended. + + If you don't know what to do here, say N. + +config VIRTIO_VFIO_PCI_ADMIN_LEGACY + bool "Legacy I/O support for VIRTIO NET PCI VF devices" + depends on VIRTIO_VFIO_PCI && VIRTIO_PCI_ADMIN_LEGACY + default y + help + This extends the virtio-vfio-pci driver to support legacy I/O + access, allowing use of legacy virtio drivers with VIRTIO NET + PCI VF devices. Legacy I/O support requires the SR-IOV PF + device to support and enable specific VIRTIO extensions; + otherwise this driver provides no additional functionality + beyond vfio-pci. + + If you don't know what to do here, say N.
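As a reading aid for the rdwr consolidation above: the VFIO_IORDWR() macro added to vfio_pci_rdwr.c stamps out one helper per access width, so vfio_pci_core_do_io_rw() no longer open-codes each size. Expanding VFIO_IORDWR(16) by hand gives roughly the following (an illustrative expansion of the macro body shown in the hunk, not generated kernel code):

	static int vfio_pci_iordwr16(struct vfio_pci_core_device *vdev,
				     bool iswrite, bool test_mem,
				     void __iomem *io, char __user *buf,
				     loff_t off, size_t *filled)
	{
		u16 val;
		int ret;

		if (iswrite) {
			/* user memory -> device: copy in, then 16-bit I/O write */
			if (copy_from_user(&val, buf, sizeof(val)))
				return -EFAULT;
			ret = vfio_pci_core_iowrite16(vdev, test_mem, val, io + off);
			if (ret)
				return ret;
		} else {
			/* device -> user memory: 16-bit I/O read, then copy out */
			ret = vfio_pci_core_ioread16(vdev, test_mem, &val, io + off);
			if (ret)
				return ret;
			if (copy_to_user(buf, &val, sizeof(val)))
				return -EFAULT;
		}

		*filled = sizeof(val);	/* caller advances by the access width */
		return 0;
	}

vfio_pci_core_do_io_rw() then picks the widest such helper that both the remaining fillable span and the offset alignment allow, trying 8 bytes first, then 4, 2 and finally 1.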
diff --git a/drivers/vfio/pci/virtio/Makefile b/drivers/vfio/pci/virtio/Makefile index 7171105baf33..d9b0bb40d6b3 100644 --- a/drivers/vfio/pci/virtio/Makefile +++ b/drivers/vfio/pci/virtio/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio-vfio-pci.o -virtio-vfio-pci-y := main.o +virtio-vfio-pci-y := main.o migrate.o +virtio-vfio-pci-$(CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY) += legacy_io.o diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h new file mode 100644 index 000000000000..c7d7e27af386 --- /dev/null +++ b/drivers/vfio/pci/virtio/common.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef VIRTIO_VFIO_COMMON_H +#define VIRTIO_VFIO_COMMON_H + +#include <linux/kernel.h> +#include <linux/virtio.h> +#include <linux/vfio_pci_core.h> +#include <linux/virtio_pci.h> + +enum virtiovf_migf_state { + VIRTIOVF_MIGF_STATE_ERROR = 1, + VIRTIOVF_MIGF_STATE_PRECOPY = 2, + VIRTIOVF_MIGF_STATE_COMPLETE = 3, +}; + +enum virtiovf_load_state { + VIRTIOVF_LOAD_STATE_READ_HEADER, + VIRTIOVF_LOAD_STATE_PREP_HEADER_DATA, + VIRTIOVF_LOAD_STATE_READ_HEADER_DATA, + VIRTIOVF_LOAD_STATE_PREP_CHUNK, + VIRTIOVF_LOAD_STATE_READ_CHUNK, + VIRTIOVF_LOAD_STATE_LOAD_CHUNK, +}; + +struct virtiovf_data_buffer { + struct sg_append_table table; + loff_t start_pos; + u64 length; + u64 allocated_length; + struct list_head buf_elm; + u8 include_header_object:1; + struct virtiovf_migration_file *migf; + /* Optimize virtiovf_get_migration_page() for sequential access */ + struct scatterlist *last_offset_sg; + unsigned int sg_last_entry; + unsigned long last_offset; +}; + +enum virtiovf_migf_header_flags { + VIRTIOVF_MIGF_HEADER_FLAGS_TAG_MANDATORY = 0, + VIRTIOVF_MIGF_HEADER_FLAGS_TAG_OPTIONAL = 1 << 0, +}; + +enum virtiovf_migf_header_tag { + VIRTIOVF_MIGF_HEADER_TAG_DEVICE_DATA = 0, +}; + +struct virtiovf_migration_header { + __le64 record_size; + /* For future use in case we may need to change the kernel protocol */ + __le32 flags; /* Use virtiovf_migf_header_flags */ + __le32 tag; /* Use virtiovf_migf_header_tag */ + __u8 data[]; /* Its size is given by record_size */ +}; + +struct virtiovf_migration_file { + struct file *filp; + /* synchronize access to the file state */ + struct mutex lock; + loff_t max_pos; + u64 pre_copy_initial_bytes; + struct ratelimit_state pre_copy_rl_state; + u64 record_size; + u32 record_tag; + u8 has_obj_id:1; + u32 obj_id; + enum virtiovf_migf_state state; + enum virtiovf_load_state load_state; + /* synchronize access to the lists */ + spinlock_t list_lock; + struct list_head buf_list; + struct list_head avail_list; + struct virtiovf_data_buffer *buf; + struct virtiovf_data_buffer *buf_header; + struct virtiovf_pci_core_device *virtvdev; +}; + +struct virtiovf_pci_core_device { + struct vfio_pci_core_device core_device; +#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY + u8 *bar0_virtual_buf; + /* synchronize access to the virtual buf */ + struct mutex bar_mutex; + void __iomem *notify_addr; + u64 notify_offset; + __le32 pci_base_addr_0; + __le16 pci_cmd; + u8 bar0_virtual_buf_size; + u8 notify_bar; +#endif + + /* LM related */ + u8 migrate_cap:1; + u8 deferred_reset:1; + /* protect migration state */ + struct mutex state_mutex; + enum vfio_device_mig_state mig_state; + /* protect the reset_done flow */ + spinlock_t reset_lock; + struct virtiovf_migration_file *resuming_migf; + struct virtiovf_migration_file *saving_migf; +}; + +void virtiovf_set_migratable(struct virtiovf_pci_core_device *virtvdev);
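+/*
+ * Layout sketch (illustrative, derived from the definitions above): the
+ * save/resume stream produced and consumed through these files is a
+ * sequence of self-describing records, each a 16-byte
+ * struct virtiovf_migration_header followed by record_size bytes of
+ * payload:
+ *
+ *   offset 0  : __le64 record_size
+ *   offset 8  : __le32 flags  (virtiovf_migf_header_flags)
+ *   offset 12 : __le32 tag    (virtiovf_migf_header_tag)
+ *   offset 16 : data[record_size]
+ *
+ * A reader that meets an unknown tag may skip it only when the
+ * TAG_OPTIONAL flag is set; an unknown mandatory tag fails the load.
+ */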
+void virtiovf_open_migration(struct virtiovf_pci_core_device *virtvdev); +void virtiovf_close_migration(struct virtiovf_pci_core_device *virtvdev); +void virtiovf_migration_reset_done(struct pci_dev *pdev); + +#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY +int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev); +long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, + unsigned int cmd, unsigned long arg); +int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + unsigned int cmd, unsigned long arg); +ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, + const char __user *buf, size_t count, + loff_t *ppos); +ssize_t virtiovf_pci_core_read(struct vfio_device *core_vdev, char __user *buf, + size_t count, loff_t *ppos); +bool virtiovf_support_legacy_io(struct pci_dev *pdev); +int virtiovf_init_legacy_io(struct virtiovf_pci_core_device *virtvdev); +void virtiovf_release_legacy_io(struct virtiovf_pci_core_device *virtvdev); +void virtiovf_legacy_io_reset_done(struct pci_dev *pdev); +#endif + +#endif /* VIRTIO_VFIO_COMMON_H */ diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c new file mode 100644 index 000000000000..832af5ba267c --- /dev/null +++ b/drivers/vfio/pci/virtio/legacy_io.c @@ -0,0 +1,420 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include <linux/device.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/pci.h> +#include <linux/pm_runtime.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/vfio.h> +#include <linux/vfio_pci_core.h> +#include <linux/virtio_pci.h> +#include <linux/virtio_net.h> +#include <linux/virtio_pci_admin.h> + +#include "common.h" + +static int +virtiovf_issue_legacy_rw_cmd(struct virtiovf_pci_core_device *virtvdev, + loff_t pos, char __user *buf, + size_t count, bool read) +{ + bool msix_enabled = + (virtvdev->core_device.irq_type == VFIO_PCI_MSIX_IRQ_INDEX); + struct pci_dev *pdev = virtvdev->core_device.pdev; + u8 *bar0_buf = virtvdev->bar0_virtual_buf; + bool common; + u8 offset; + int ret; + + common = pos < VIRTIO_PCI_CONFIG_OFF(msix_enabled); + /* offset within the relevant configuration area */ + offset = common ? 
pos : pos - VIRTIO_PCI_CONFIG_OFF(msix_enabled); + mutex_lock(&virtvdev->bar_mutex); + if (read) { + if (common) + ret = virtio_pci_admin_legacy_common_io_read(pdev, offset, + count, bar0_buf + pos); + else + ret = virtio_pci_admin_legacy_device_io_read(pdev, offset, + count, bar0_buf + pos); + if (ret) + goto out; + if (copy_to_user(buf, bar0_buf + pos, count)) + ret = -EFAULT; + } else { + if (copy_from_user(bar0_buf + pos, buf, count)) { + ret = -EFAULT; + goto out; + } + + if (common) + ret = virtio_pci_admin_legacy_common_io_write(pdev, offset, + count, bar0_buf + pos); + else + ret = virtio_pci_admin_legacy_device_io_write(pdev, offset, + count, bar0_buf + pos); + } +out: + mutex_unlock(&virtvdev->bar_mutex); + return ret; +} + +static int +virtiovf_pci_bar0_rw(struct virtiovf_pci_core_device *virtvdev, + loff_t pos, char __user *buf, + size_t count, bool read) +{ + struct vfio_pci_core_device *core_device = &virtvdev->core_device; + struct pci_dev *pdev = core_device->pdev; + u16 queue_notify; + int ret; + + if (!(le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO)) + return -EIO; + + if (pos + count > virtvdev->bar0_virtual_buf_size) + return -EINVAL; + + ret = pm_runtime_resume_and_get(&pdev->dev); + if (ret) { + pci_info_ratelimited(pdev, "runtime resume failed %d\n", ret); + return -EIO; + } + + switch (pos) { + case VIRTIO_PCI_QUEUE_NOTIFY: + if (count != sizeof(queue_notify)) { + ret = -EINVAL; + goto end; + } + if (read) { + ret = vfio_pci_core_ioread16(core_device, true, &queue_notify, + virtvdev->notify_addr); + if (ret) + goto end; + if (copy_to_user(buf, &queue_notify, + sizeof(queue_notify))) { + ret = -EFAULT; + goto end; + } + } else { + if (copy_from_user(&queue_notify, buf, count)) { + ret = -EFAULT; + goto end; + } + ret = vfio_pci_core_iowrite16(core_device, true, queue_notify, + virtvdev->notify_addr); + } + break; + default: + ret = virtiovf_issue_legacy_rw_cmd(virtvdev, pos, buf, count, + read); + } + +end: + pm_runtime_put(&pdev->dev); + return ret ? 
ret : count; +} + +static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev, + char __user *buf, size_t count, + loff_t *ppos) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + core_vdev, struct virtiovf_pci_core_device, core_device.vdev); + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + size_t register_offset; + loff_t copy_offset; + size_t copy_count; + __le32 val32; + __le16 val16; + u8 val8; + int ret; + + ret = vfio_pci_core_read(core_vdev, buf, count, ppos); + if (ret < 0) + return ret; + + if (vfio_pci_core_range_intersect_range(pos, count, PCI_DEVICE_ID, + sizeof(val16), &copy_offset, + &copy_count, &register_offset)) { + val16 = cpu_to_le16(VIRTIO_TRANS_ID_NET); + if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, copy_count)) + return -EFAULT; + } + + if ((le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO) && + vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND, + sizeof(val16), &copy_offset, + &copy_count, &register_offset)) { + if (copy_from_user((void *)&val16 + register_offset, buf + copy_offset, + copy_count)) + return -EFAULT; + val16 |= cpu_to_le16(PCI_COMMAND_IO); + if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, + copy_count)) + return -EFAULT; + } + + if (vfio_pci_core_range_intersect_range(pos, count, PCI_REVISION_ID, + sizeof(val8), &copy_offset, + &copy_count, &register_offset)) { + /* Transitional devices need to have revision 0 */ + val8 = 0; + if (copy_to_user(buf + copy_offset, &val8, copy_count)) + return -EFAULT; + } + + if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, + sizeof(val32), &copy_offset, + &copy_count, &register_offset)) { + u32 bar_mask = ~(virtvdev->bar0_virtual_buf_size - 1); + u32 pci_base_addr_0 = le32_to_cpu(virtvdev->pci_base_addr_0); + + val32 = cpu_to_le32((pci_base_addr_0 & bar_mask) | PCI_BASE_ADDRESS_SPACE_IO); + if (copy_to_user(buf + copy_offset, (void *)&val32 + register_offset, copy_count)) + return -EFAULT; + } + + if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_ID, + sizeof(val16), &copy_offset, + &copy_count, &register_offset)) { + /* + * Transitional devices use the PCI subsystem device id as + * the virtio device id, just as the legacy driver always did.
+ */ + val16 = cpu_to_le16(VIRTIO_ID_NET); + if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, + copy_count)) + return -EFAULT; + } + + if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID, + sizeof(val16), &copy_offset, + &copy_count, &register_offset)) { + val16 = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET); + if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, + copy_count)) + return -EFAULT; + } + + return count; +} + +ssize_t virtiovf_pci_core_read(struct vfio_device *core_vdev, char __user *buf, + size_t count, loff_t *ppos) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + core_vdev, struct virtiovf_pci_core_device, core_device.vdev); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + + if (!count) + return 0; + + if (index == VFIO_PCI_CONFIG_REGION_INDEX) + return virtiovf_pci_read_config(core_vdev, buf, count, ppos); + + if (index == VFIO_PCI_BAR0_REGION_INDEX) + return virtiovf_pci_bar0_rw(virtvdev, pos, buf, count, true); + + return vfio_pci_core_read(core_vdev, buf, count, ppos); +} + +static ssize_t virtiovf_pci_write_config(struct vfio_device *core_vdev, + const char __user *buf, size_t count, + loff_t *ppos) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + core_vdev, struct virtiovf_pci_core_device, core_device.vdev); + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + size_t register_offset; + loff_t copy_offset; + size_t copy_count; + + if (vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND, + sizeof(virtvdev->pci_cmd), + &copy_offset, &copy_count, + &register_offset)) { + if (copy_from_user((void *)&virtvdev->pci_cmd + register_offset, + buf + copy_offset, + copy_count)) + return -EFAULT; + } + + if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, + sizeof(virtvdev->pci_base_addr_0), + &copy_offset, &copy_count, + &register_offset)) { + if (copy_from_user((void *)&virtvdev->pci_base_addr_0 + register_offset, + buf + copy_offset, + copy_count)) + return -EFAULT; + } + + return vfio_pci_core_write(core_vdev, buf, count, ppos); +} + +ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + core_vdev, struct virtiovf_pci_core_device, core_device.vdev); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + + if (!count) + return 0; + + if (index == VFIO_PCI_CONFIG_REGION_INDEX) + return virtiovf_pci_write_config(core_vdev, buf, count, ppos); + + if (index == VFIO_PCI_BAR0_REGION_INDEX) + return virtiovf_pci_bar0_rw(virtvdev, pos, (char __user *)buf, count, false); + + return vfio_pci_core_write(core_vdev, buf, count, ppos); +} + +int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + unsigned int cmd, unsigned long arg) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + core_vdev, struct virtiovf_pci_core_device, core_device.vdev); + unsigned long minsz = offsetofend(struct vfio_region_info, offset); + void __user *uarg = (void __user *)arg; + struct vfio_region_info info = {}; + + if (copy_from_user(&info, uarg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + switch (info.index) { + case VFIO_PCI_BAR0_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = virtvdev->bar0_virtual_buf_size; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + return copy_to_user(uarg,
&info, minsz) ? -EFAULT : 0; + default: + return vfio_pci_core_ioctl(core_vdev, cmd, arg); + } +} + +long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case VFIO_DEVICE_GET_REGION_INFO: + return virtiovf_pci_ioctl_get_region_info(core_vdev, cmd, arg); + default: + return vfio_pci_core_ioctl(core_vdev, cmd, arg); + } +} + +static int virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev) +{ + struct vfio_pci_core_device *core_device = &virtvdev->core_device; + int ret; + + /* + * Set up the BAR where the 'notify' area exists so that vfio can use it as well. + * This will let us mmap it only once and use it when needed. + */ + ret = vfio_pci_core_setup_barmap(core_device, + virtvdev->notify_bar); + if (ret) + return ret; + + virtvdev->notify_addr = core_device->barmap[virtvdev->notify_bar] + + virtvdev->notify_offset; + return 0; +} + +int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev) +{ + if (!virtvdev->bar0_virtual_buf) + return 0; + + /* + * Upon close_device() the vfio_pci_core_disable() is called + * and will close all the previous mmaps, so it seems that the + * valid life cycle for the 'notify' addr is per open/close. + */ + return virtiovf_set_notify_addr(virtvdev); +} + +static int virtiovf_get_device_config_size(unsigned short device) +{ + /* Network card */ + return offsetofend(struct virtio_net_config, status); +} + +static int virtiovf_read_notify_info(struct virtiovf_pci_core_device *virtvdev) +{ + u64 offset; + int ret; + u8 bar; + + ret = virtio_pci_admin_legacy_io_notify_info(virtvdev->core_device.pdev, + VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_OWNER_MEM, + &bar, &offset); + if (ret) + return ret; + + virtvdev->notify_bar = bar; + virtvdev->notify_offset = offset; + return 0; +} + +static bool virtiovf_bar0_exists(struct pci_dev *pdev) +{ + struct resource *res = pdev->resource; + + return res->flags; +} + +bool virtiovf_support_legacy_io(struct pci_dev *pdev) +{ + /* For now, the legacy IO functionality is supported only for virtio-net */ + return pdev->device == 0x1041 && virtio_pci_admin_has_legacy_io(pdev) && + !virtiovf_bar0_exists(pdev); +} + +int virtiovf_init_legacy_io(struct virtiovf_pci_core_device *virtvdev) +{ + struct pci_dev *pdev = virtvdev->core_device.pdev; + int ret; + + ret = virtiovf_read_notify_info(virtvdev); + if (ret) + return ret; + + virtvdev->bar0_virtual_buf_size = VIRTIO_PCI_CONFIG_OFF(true) + + virtiovf_get_device_config_size(pdev->device); + BUILD_BUG_ON(!is_power_of_2(virtvdev->bar0_virtual_buf_size)); + virtvdev->bar0_virtual_buf = kzalloc(virtvdev->bar0_virtual_buf_size, + GFP_KERNEL); + if (!virtvdev->bar0_virtual_buf) + return -ENOMEM; + mutex_init(&virtvdev->bar_mutex); + return 0; +} + +void virtiovf_release_legacy_io(struct virtiovf_pci_core_device *virtvdev) +{ + kfree(virtvdev->bar0_virtual_buf); +} + +void virtiovf_legacy_io_reset_done(struct pci_dev *pdev) +{ + struct virtiovf_pci_core_device *virtvdev = dev_get_drvdata(&pdev->dev); + + virtvdev->pci_cmd = 0; +} diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index b5d3a8c5bbc9..515fe1b9f94d 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -16,347 +16,12 @@ #include <linux/virtio_net.h> #include <linux/virtio_pci_admin.h> -struct virtiovf_pci_core_device { - struct vfio_pci_core_device core_device; - u8 *bar0_virtual_buf; - /* synchronize access to the virtual buf */ - struct mutex bar_mutex; - void __iomem *notify_addr; - u64
notify_offset; - __le32 pci_base_addr_0; - __le16 pci_cmd; - u8 bar0_virtual_buf_size; - u8 notify_bar; -}; - -static int -virtiovf_issue_legacy_rw_cmd(struct virtiovf_pci_core_device *virtvdev, - loff_t pos, char __user *buf, - size_t count, bool read) -{ - bool msix_enabled = - (virtvdev->core_device.irq_type == VFIO_PCI_MSIX_IRQ_INDEX); - struct pci_dev *pdev = virtvdev->core_device.pdev; - u8 *bar0_buf = virtvdev->bar0_virtual_buf; - bool common; - u8 offset; - int ret; - - common = pos < VIRTIO_PCI_CONFIG_OFF(msix_enabled); - /* offset within the relevant configuration area */ - offset = common ? pos : pos - VIRTIO_PCI_CONFIG_OFF(msix_enabled); - mutex_lock(&virtvdev->bar_mutex); - if (read) { - if (common) - ret = virtio_pci_admin_legacy_common_io_read(pdev, offset, - count, bar0_buf + pos); - else - ret = virtio_pci_admin_legacy_device_io_read(pdev, offset, - count, bar0_buf + pos); - if (ret) - goto out; - if (copy_to_user(buf, bar0_buf + pos, count)) - ret = -EFAULT; - } else { - if (copy_from_user(bar0_buf + pos, buf, count)) { - ret = -EFAULT; - goto out; - } - - if (common) - ret = virtio_pci_admin_legacy_common_io_write(pdev, offset, - count, bar0_buf + pos); - else - ret = virtio_pci_admin_legacy_device_io_write(pdev, offset, - count, bar0_buf + pos); - } -out: - mutex_unlock(&virtvdev->bar_mutex); - return ret; -} - -static int -virtiovf_pci_bar0_rw(struct virtiovf_pci_core_device *virtvdev, - loff_t pos, char __user *buf, - size_t count, bool read) -{ - struct vfio_pci_core_device *core_device = &virtvdev->core_device; - struct pci_dev *pdev = core_device->pdev; - u16 queue_notify; - int ret; - - if (!(le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO)) - return -EIO; - - if (pos + count > virtvdev->bar0_virtual_buf_size) - return -EINVAL; - - ret = pm_runtime_resume_and_get(&pdev->dev); - if (ret) { - pci_info_ratelimited(pdev, "runtime resume failed %d\n", ret); - return -EIO; - } - - switch (pos) { - case VIRTIO_PCI_QUEUE_NOTIFY: - if (count != sizeof(queue_notify)) { - ret = -EINVAL; - goto end; - } - if (read) { - ret = vfio_pci_core_ioread16(core_device, true, &queue_notify, - virtvdev->notify_addr); - if (ret) - goto end; - if (copy_to_user(buf, &queue_notify, - sizeof(queue_notify))) { - ret = -EFAULT; - goto end; - } - } else { - if (copy_from_user(&queue_notify, buf, count)) { - ret = -EFAULT; - goto end; - } - ret = vfio_pci_core_iowrite16(core_device, true, queue_notify, - virtvdev->notify_addr); - } - break; - default: - ret = virtiovf_issue_legacy_rw_cmd(virtvdev, pos, buf, count, - read); - } - -end: - pm_runtime_put(&pdev->dev); - return ret ? 
ret : count; -} - -static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev, - char __user *buf, size_t count, - loff_t *ppos) -{ - struct virtiovf_pci_core_device *virtvdev = container_of( - core_vdev, struct virtiovf_pci_core_device, core_device.vdev); - loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - size_t register_offset; - loff_t copy_offset; - size_t copy_count; - __le32 val32; - __le16 val16; - u8 val8; - int ret; - - ret = vfio_pci_core_read(core_vdev, buf, count, ppos); - if (ret < 0) - return ret; - - if (vfio_pci_core_range_intersect_range(pos, count, PCI_DEVICE_ID, - sizeof(val16), &copy_offset, - &copy_count, &register_offset)) { - val16 = cpu_to_le16(VIRTIO_TRANS_ID_NET); - if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, copy_count)) - return -EFAULT; - } - - if ((le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO) && - vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND, - sizeof(val16), &copy_offset, - &copy_count, &register_offset)) { - if (copy_from_user((void *)&val16 + register_offset, buf + copy_offset, - copy_count)) - return -EFAULT; - val16 |= cpu_to_le16(PCI_COMMAND_IO); - if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, - copy_count)) - return -EFAULT; - } - - if (vfio_pci_core_range_intersect_range(pos, count, PCI_REVISION_ID, - sizeof(val8), &copy_offset, - &copy_count, &register_offset)) { - /* Transional needs to have revision 0 */ - val8 = 0; - if (copy_to_user(buf + copy_offset, &val8, copy_count)) - return -EFAULT; - } - - if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, - sizeof(val32), &copy_offset, - &copy_count, &register_offset)) { - u32 bar_mask = ~(virtvdev->bar0_virtual_buf_size - 1); - u32 pci_base_addr_0 = le32_to_cpu(virtvdev->pci_base_addr_0); - - val32 = cpu_to_le32((pci_base_addr_0 & bar_mask) | PCI_BASE_ADDRESS_SPACE_IO); - if (copy_to_user(buf + copy_offset, (void *)&val32 + register_offset, copy_count)) - return -EFAULT; - } - - if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_ID, - sizeof(val16), &copy_offset, - &copy_count, &register_offset)) { - /* - * Transitional devices use the PCI subsystem device id as - * virtio device id, same as legacy driver always did.
- */ - val16 = cpu_to_le16(VIRTIO_ID_NET); - if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, - copy_count)) - return -EFAULT; - } - - if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID, - sizeof(val16), &copy_offset, - &copy_count, &register_offset)) { - val16 = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET); - if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, - copy_count)) - return -EFAULT; - } - - return count; -} - -static ssize_t -virtiovf_pci_core_read(struct vfio_device *core_vdev, char __user *buf, - size_t count, loff_t *ppos) -{ - struct virtiovf_pci_core_device *virtvdev = container_of( - core_vdev, struct virtiovf_pci_core_device, core_device.vdev); - unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); - loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - - if (!count) - return 0; - - if (index == VFIO_PCI_CONFIG_REGION_INDEX) - return virtiovf_pci_read_config(core_vdev, buf, count, ppos); - - if (index == VFIO_PCI_BAR0_REGION_INDEX) - return virtiovf_pci_bar0_rw(virtvdev, pos, buf, count, true); - - return vfio_pci_core_read(core_vdev, buf, count, ppos); -} - -static ssize_t virtiovf_pci_write_config(struct vfio_device *core_vdev, - const char __user *buf, size_t count, - loff_t *ppos) -{ - struct virtiovf_pci_core_device *virtvdev = container_of( - core_vdev, struct virtiovf_pci_core_device, core_device.vdev); - loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - size_t register_offset; - loff_t copy_offset; - size_t copy_count; - - if (vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND, - sizeof(virtvdev->pci_cmd), - &copy_offset, &copy_count, - &register_offset)) { - if (copy_from_user((void *)&virtvdev->pci_cmd + register_offset, - buf + copy_offset, - copy_count)) - return -EFAULT; - } - - if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, - sizeof(virtvdev->pci_base_addr_0), - &copy_offset, &copy_count, - &register_offset)) { - if (copy_from_user((void *)&virtvdev->pci_base_addr_0 + register_offset, - buf + copy_offset, - copy_count)) - return -EFAULT; - } - - return vfio_pci_core_write(core_vdev, buf, count, ppos); -} - -static ssize_t -virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct virtiovf_pci_core_device *virtvdev = container_of( - core_vdev, struct virtiovf_pci_core_device, core_device.vdev); - unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); - loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - - if (!count) - return 0; - - if (index == VFIO_PCI_CONFIG_REGION_INDEX) - return virtiovf_pci_write_config(core_vdev, buf, count, ppos); - - if (index == VFIO_PCI_BAR0_REGION_INDEX) - return virtiovf_pci_bar0_rw(virtvdev, pos, (char __user *)buf, count, false); - - return vfio_pci_core_write(core_vdev, buf, count, ppos); -} - -static int -virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg) -{ - struct virtiovf_pci_core_device *virtvdev = container_of( - core_vdev, struct virtiovf_pci_core_device, core_device.vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); - void __user *uarg = (void __user *)arg; - struct vfio_region_info info = {}; - - if (copy_from_user(&info, uarg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { - case VFIO_PCI_BAR0_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = virtvdev->bar0_virtual_buf_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; -
return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0; - default: - return vfio_pci_core_ioctl(core_vdev, cmd, arg); - } -} - -static long -virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, - unsigned long arg) -{ - switch (cmd) { - case VFIO_DEVICE_GET_REGION_INFO: - return virtiovf_pci_ioctl_get_region_info(core_vdev, cmd, arg); - default: - return vfio_pci_core_ioctl(core_vdev, cmd, arg); - } -} - -static int -virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev) -{ - struct vfio_pci_core_device *core_device = &virtvdev->core_device; - int ret; - - /* - * Setup the BAR where the 'notify' exists to be used by vfio as well - * This will let us mmap it only once and use it when needed. - */ - ret = vfio_pci_core_setup_barmap(core_device, - virtvdev->notify_bar); - if (ret) - return ret; - - virtvdev->notify_addr = core_device->barmap[virtvdev->notify_bar] + - virtvdev->notify_offset; - return 0; -} +#include "common.h" static int virtiovf_pci_open_device(struct vfio_device *core_vdev) { - struct virtiovf_pci_core_device *virtvdev = container_of( - core_vdev, struct virtiovf_pci_core_device, core_device.vdev); + struct virtiovf_pci_core_device *virtvdev = container_of(core_vdev, + struct virtiovf_pci_core_device, core_device.vdev); struct vfio_pci_core_device *vdev = &virtvdev->core_device; int ret; @@ -364,88 +29,84 @@ static int virtiovf_pci_open_device(struct vfio_device *core_vdev) if (ret) return ret; - if (virtvdev->bar0_virtual_buf) { - /* - * Upon close_device() the vfio_pci_core_disable() is called - * and will close all the previous mmaps, so it seems that the - * valid life cycle for the 'notify' addr is per open/close. - */ - ret = virtiovf_set_notify_addr(virtvdev); - if (ret) { - vfio_pci_core_disable(vdev); - return ret; - } +#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY + ret = virtiovf_open_legacy_io(virtvdev); + if (ret) { + vfio_pci_core_disable(vdev); + return ret; } +#endif + virtiovf_open_migration(virtvdev); vfio_pci_core_finish_enable(vdev); return 0; } -static int virtiovf_get_device_config_size(unsigned short device) +static void virtiovf_pci_close_device(struct vfio_device *core_vdev) { - /* Network card */ - return offsetofend(struct virtio_net_config, status); -} - -static int virtiovf_read_notify_info(struct virtiovf_pci_core_device *virtvdev) -{ - u64 offset; - int ret; - u8 bar; + struct virtiovf_pci_core_device *virtvdev = container_of(core_vdev, + struct virtiovf_pci_core_device, core_device.vdev); - ret = virtio_pci_admin_legacy_io_notify_info(virtvdev->core_device.pdev, - VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_OWNER_MEM, - &bar, &offset); - if (ret) - return ret; - - virtvdev->notify_bar = bar; - virtvdev->notify_offset = offset; - return 0; + virtiovf_close_migration(virtvdev); + vfio_pci_core_close_device(core_vdev); } +#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY static int virtiovf_pci_init_device(struct vfio_device *core_vdev) { - struct virtiovf_pci_core_device *virtvdev = container_of( - core_vdev, struct virtiovf_pci_core_device, core_device.vdev); - struct pci_dev *pdev; + struct virtiovf_pci_core_device *virtvdev = container_of(core_vdev, + struct virtiovf_pci_core_device, core_device.vdev); int ret; ret = vfio_pci_core_init_dev(core_vdev); if (ret) return ret; - pdev = virtvdev->core_device.pdev; - ret = virtiovf_read_notify_info(virtvdev); - if (ret) - return ret; - - virtvdev->bar0_virtual_buf_size = VIRTIO_PCI_CONFIG_OFF(true) + - virtiovf_get_device_config_size(pdev->device); - 
BUILD_BUG_ON(!is_power_of_2(virtvdev->bar0_virtual_buf_size)); - virtvdev->bar0_virtual_buf = kzalloc(virtvdev->bar0_virtual_buf_size, - GFP_KERNEL); - if (!virtvdev->bar0_virtual_buf) - return -ENOMEM; - mutex_init(&virtvdev->bar_mutex); - return 0; + /* + * The vfio_device_ops.init() callback is set to virtiovf_pci_init_device() + * only when legacy I/O is supported. Now, let's initialize it. + */ + return virtiovf_init_legacy_io(virtvdev); } +#endif static void virtiovf_pci_core_release_dev(struct vfio_device *core_vdev) { - struct virtiovf_pci_core_device *virtvdev = container_of( - core_vdev, struct virtiovf_pci_core_device, core_device.vdev); +#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY + struct virtiovf_pci_core_device *virtvdev = container_of(core_vdev, + struct virtiovf_pci_core_device, core_device.vdev); - kfree(virtvdev->bar0_virtual_buf); + virtiovf_release_legacy_io(virtvdev); +#endif vfio_pci_core_release_dev(core_vdev); } -static const struct vfio_device_ops virtiovf_vfio_pci_tran_ops = { - .name = "virtio-vfio-pci-trans", +static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = { + .name = "virtio-vfio-pci-lm", + .init = vfio_pci_core_init_dev, + .release = virtiovf_pci_core_release_dev, + .open_device = virtiovf_pci_open_device, + .close_device = virtiovf_pci_close_device, + .ioctl = vfio_pci_core_ioctl, + .device_feature = vfio_pci_core_ioctl_feature, + .read = vfio_pci_core_read, + .write = vfio_pci_core_write, + .mmap = vfio_pci_core_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, +}; + +#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY +static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = { + .name = "virtio-vfio-pci-trans-lm", .init = virtiovf_pci_init_device, .release = virtiovf_pci_core_release_dev, .open_device = virtiovf_pci_open_device, - .close_device = vfio_pci_core_close_device, + .close_device = virtiovf_pci_close_device, .ioctl = virtiovf_vfio_pci_core_ioctl, .device_feature = vfio_pci_core_ioctl_feature, .read = virtiovf_pci_core_read, @@ -458,6 +119,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_ops = { .attach_ioas = vfio_iommufd_physical_attach_ioas, .detach_ioas = vfio_iommufd_physical_detach_ioas, }; +#endif static const struct vfio_device_ops virtiovf_vfio_pci_ops = { .name = "virtio-vfio-pci", @@ -478,29 +140,34 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = { .detach_ioas = vfio_iommufd_physical_detach_ioas, }; -static bool virtiovf_bar0_exists(struct pci_dev *pdev) -{ - struct resource *res = pdev->resource; - - return res->flags; -} - static int virtiovf_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { const struct vfio_device_ops *ops = &virtiovf_vfio_pci_ops; struct virtiovf_pci_core_device *virtvdev; + bool sup_legacy_io = false; + bool sup_lm = false; int ret; - if (pdev->is_virtfn && virtio_pci_admin_has_legacy_io(pdev) && - !virtiovf_bar0_exists(pdev)) - ops = &virtiovf_vfio_pci_tran_ops; + if (pdev->is_virtfn) { +#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY + sup_legacy_io = virtiovf_support_legacy_io(pdev); + if (sup_legacy_io) + ops = &virtiovf_vfio_pci_tran_lm_ops; +#endif + sup_lm = virtio_pci_admin_has_dev_parts(pdev); + if (sup_lm && !sup_legacy_io) + ops = &virtiovf_vfio_pci_lm_ops; + } virtvdev = vfio_alloc_device(virtiovf_pci_core_device, 
core_device.vdev, &pdev->dev, ops); if (IS_ERR(virtvdev)) return PTR_ERR(virtvdev); + if (sup_lm) + virtiovf_set_migratable(virtvdev); + dev_set_drvdata(&pdev->dev, &virtvdev->core_device); ret = vfio_pci_core_register_device(&virtvdev->core_device); if (ret) @@ -520,8 +187,9 @@ static void virtiovf_pci_remove(struct pci_dev *pdev) } static const struct pci_device_id virtiovf_pci_table[] = { - /* Only virtio-net is supported/tested so far */ + /* Only virtio-net and virtio-block are supported/tested so far */ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1041) }, + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1042) }, {} }; @@ -529,9 +197,10 @@ MODULE_DEVICE_TABLE(pci, virtiovf_pci_table); static void virtiovf_pci_aer_reset_done(struct pci_dev *pdev) { - struct virtiovf_pci_core_device *virtvdev = dev_get_drvdata(&pdev->dev); - - virtvdev->pci_cmd = 0; +#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY + virtiovf_legacy_io_reset_done(pdev); +#endif + virtiovf_migration_reset_done(pdev); } static const struct pci_error_handlers virtiovf_err_handlers = { @@ -553,4 +222,4 @@ module_pci_driver(virtiovf_pci_driver); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>"); MODULE_DESCRIPTION( - "VIRTIO VFIO PCI - User Level meta-driver for VIRTIO NET devices"); + "VIRTIO VFIO PCI - User Level meta-driver for VIRTIO NET and BLOCK devices"); diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c new file mode 100644 index 000000000000..ba92bb4e9af9 --- /dev/null +++ b/drivers/vfio/pci/virtio/migrate.c @@ -0,0 +1,1337 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include <linux/device.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/pci.h> +#include <linux/pm_runtime.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/vfio.h> +#include <linux/vfio_pci_core.h> +#include <linux/virtio_pci.h> +#include <linux/virtio_net.h> +#include <linux/virtio_pci_admin.h> +#include <linux/anon_inodes.h> + +#include "common.h" + +/* Device specification max parts size */ +#define MAX_LOAD_SIZE (BIT_ULL(BITS_PER_TYPE \ + (((struct virtio_admin_cmd_dev_parts_metadata_result *)0)->parts_size.size)) - 1) + +/* Initial target buffer size */ +#define VIRTIOVF_TARGET_INITIAL_BUF_SIZE SZ_1M + +static int +virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf, + u32 ctx_size); + +static struct page * +virtiovf_get_migration_page(struct virtiovf_data_buffer *buf, + unsigned long offset) +{ + unsigned long cur_offset = 0; + struct scatterlist *sg; + unsigned int i; + + /* All accesses are sequential */ + if (offset < buf->last_offset || !buf->last_offset_sg) { + buf->last_offset = 0; + buf->last_offset_sg = buf->table.sgt.sgl; + buf->sg_last_entry = 0; + } + + cur_offset = buf->last_offset; + + for_each_sg(buf->last_offset_sg, sg, + buf->table.sgt.orig_nents - buf->sg_last_entry, i) { + if (offset < sg->length + cur_offset) { + buf->last_offset_sg = sg; + buf->sg_last_entry += i; + buf->last_offset = cur_offset; + return nth_page(sg_page(sg), + (offset - cur_offset) / PAGE_SIZE); + } + cur_offset += sg->length; + } + return NULL; +} + +static int virtiovf_add_migration_pages(struct virtiovf_data_buffer *buf, + unsigned int npages) +{ + unsigned int to_alloc = npages; + struct page **page_list; + unsigned long filled; + unsigned int to_fill; + int ret; + int i; + + to_fill = min_t(unsigned int, 
npages, PAGE_SIZE / sizeof(*page_list)); + page_list = kvcalloc(to_fill, sizeof(*page_list), GFP_KERNEL_ACCOUNT); + if (!page_list) + return -ENOMEM; + + do { + filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, to_fill, + page_list); + if (!filled) { + ret = -ENOMEM; + goto err; + } + to_alloc -= filled; + ret = sg_alloc_append_table_from_pages(&buf->table, page_list, + filled, 0, filled << PAGE_SHIFT, UINT_MAX, + SG_MAX_SINGLE_ALLOC, GFP_KERNEL_ACCOUNT); + + if (ret) + goto err_append; + buf->allocated_length += filled * PAGE_SIZE; + /* clean input for another bulk allocation */ + memset(page_list, 0, filled * sizeof(*page_list)); + to_fill = min_t(unsigned int, to_alloc, + PAGE_SIZE / sizeof(*page_list)); + } while (to_alloc > 0); + + kvfree(page_list); + return 0; + +err_append: + for (i = filled - 1; i >= 0; i--) + __free_page(page_list[i]); +err: + kvfree(page_list); + return ret; +} + +static void virtiovf_free_data_buffer(struct virtiovf_data_buffer *buf) +{ + struct sg_page_iter sg_iter; + + /* Undo alloc_pages_bulk() */ + for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) + __free_page(sg_page_iter_page(&sg_iter)); + sg_free_append_table(&buf->table); + kfree(buf); +} + +static struct virtiovf_data_buffer * +virtiovf_alloc_data_buffer(struct virtiovf_migration_file *migf, size_t length) +{ + struct virtiovf_data_buffer *buf; + int ret; + + buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); + if (!buf) + return ERR_PTR(-ENOMEM); + + ret = virtiovf_add_migration_pages(buf, + DIV_ROUND_UP_ULL(length, PAGE_SIZE)); + if (ret) + goto end; + + buf->migf = migf; + return buf; +end: + virtiovf_free_data_buffer(buf); + return ERR_PTR(ret); +} + +static void virtiovf_put_data_buffer(struct virtiovf_data_buffer *buf) +{ + spin_lock_irq(&buf->migf->list_lock); + list_add_tail(&buf->buf_elm, &buf->migf->avail_list); + spin_unlock_irq(&buf->migf->list_lock); +} + +static int +virtiovf_pci_alloc_obj_id(struct virtiovf_pci_core_device *virtvdev, u8 type, + u32 *obj_id) +{ + return virtio_pci_admin_obj_create(virtvdev->core_device.pdev, + VIRTIO_RESOURCE_OBJ_DEV_PARTS, type, obj_id); +} + +static void +virtiovf_pci_free_obj_id(struct virtiovf_pci_core_device *virtvdev, u32 obj_id) +{ + virtio_pci_admin_obj_destroy(virtvdev->core_device.pdev, + VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id); +} + +static struct virtiovf_data_buffer * +virtiovf_get_data_buffer(struct virtiovf_migration_file *migf, size_t length) +{ + struct virtiovf_data_buffer *buf, *temp_buf; + struct list_head free_list; + + INIT_LIST_HEAD(&free_list); + + spin_lock_irq(&migf->list_lock); + list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) { + list_del_init(&buf->buf_elm); + if (buf->allocated_length >= length) { + spin_unlock_irq(&migf->list_lock); + goto found; + } + /* + * Prevent holding redundant buffers: move them to a local free + * list and free them after dropping the spin lock + * (&migf->list_lock), to keep its critical section short.
+ */ + list_add(&buf->buf_elm, &free_list); + } + spin_unlock_irq(&migf->list_lock); + buf = virtiovf_alloc_data_buffer(migf, length); + +found: + while ((temp_buf = list_first_entry_or_null(&free_list, + struct virtiovf_data_buffer, buf_elm))) { + list_del(&temp_buf->buf_elm); + virtiovf_free_data_buffer(temp_buf); + } + + return buf; +} + +static void virtiovf_clean_migf_resources(struct virtiovf_migration_file *migf) +{ + struct virtiovf_data_buffer *entry; + + if (migf->buf) { + virtiovf_free_data_buffer(migf->buf); + migf->buf = NULL; + } + + if (migf->buf_header) { + virtiovf_free_data_buffer(migf->buf_header); + migf->buf_header = NULL; + } + + list_splice(&migf->avail_list, &migf->buf_list); + + while ((entry = list_first_entry_or_null(&migf->buf_list, + struct virtiovf_data_buffer, buf_elm))) { + list_del(&entry->buf_elm); + virtiovf_free_data_buffer(entry); + } + + if (migf->has_obj_id) + virtiovf_pci_free_obj_id(migf->virtvdev, migf->obj_id); +} + +static void virtiovf_disable_fd(struct virtiovf_migration_file *migf) +{ + mutex_lock(&migf->lock); + migf->state = VIRTIOVF_MIGF_STATE_ERROR; + migf->filp->f_pos = 0; + mutex_unlock(&migf->lock); +} + +static void virtiovf_disable_fds(struct virtiovf_pci_core_device *virtvdev) +{ + if (virtvdev->resuming_migf) { + virtiovf_disable_fd(virtvdev->resuming_migf); + virtiovf_clean_migf_resources(virtvdev->resuming_migf); + fput(virtvdev->resuming_migf->filp); + virtvdev->resuming_migf = NULL; + } + if (virtvdev->saving_migf) { + virtiovf_disable_fd(virtvdev->saving_migf); + virtiovf_clean_migf_resources(virtvdev->saving_migf); + fput(virtvdev->saving_migf->filp); + virtvdev->saving_migf = NULL; + } +} + +/* + * This function is called in all state_mutex unlock cases to + * handle a 'deferred_reset', if one exists. + */ +static void virtiovf_state_mutex_unlock(struct virtiovf_pci_core_device *virtvdev) +{ +again: + spin_lock(&virtvdev->reset_lock); + if (virtvdev->deferred_reset) { + virtvdev->deferred_reset = false; + spin_unlock(&virtvdev->reset_lock); + virtvdev->mig_state = VFIO_DEVICE_STATE_RUNNING; + virtiovf_disable_fds(virtvdev); + goto again; + } + mutex_unlock(&virtvdev->state_mutex); + spin_unlock(&virtvdev->reset_lock); +} + +void virtiovf_migration_reset_done(struct pci_dev *pdev) +{ + struct virtiovf_pci_core_device *virtvdev = dev_get_drvdata(&pdev->dev); + + if (!virtvdev->migrate_cap) + return; + + /* + * As the higher VFIO layers are holding locks across reset and using + * those same locks with the mm_lock, we need to prevent an ABBA + * deadlock between the state_mutex and the mm_lock. + * If the state_mutex was already taken, we defer the cleanup work + * to the unlock flow of the other running context.
+ */ + spin_lock(&virtvdev->reset_lock); + virtvdev->deferred_reset = true; + if (!mutex_trylock(&virtvdev->state_mutex)) { + spin_unlock(&virtvdev->reset_lock); + return; + } + spin_unlock(&virtvdev->reset_lock); + virtiovf_state_mutex_unlock(virtvdev); +} + +static int virtiovf_release_file(struct inode *inode, struct file *filp) +{ + struct virtiovf_migration_file *migf = filp->private_data; + + virtiovf_disable_fd(migf); + mutex_destroy(&migf->lock); + kfree(migf); + return 0; +} + +static struct virtiovf_data_buffer * +virtiovf_get_data_buff_from_pos(struct virtiovf_migration_file *migf, + loff_t pos, bool *end_of_data) +{ + struct virtiovf_data_buffer *buf; + bool found = false; + + *end_of_data = false; + spin_lock_irq(&migf->list_lock); + if (list_empty(&migf->buf_list)) { + *end_of_data = true; + goto end; + } + + buf = list_first_entry(&migf->buf_list, struct virtiovf_data_buffer, + buf_elm); + if (pos >= buf->start_pos && + pos < buf->start_pos + buf->length) { + found = true; + goto end; + } + + /* + * Since this is a stream-based FD, the data is expected to always be + * in the first chunk. + */ + migf->state = VIRTIOVF_MIGF_STATE_ERROR; + +end: + spin_unlock_irq(&migf->list_lock); + return found ? buf : NULL; +} + +static ssize_t virtiovf_buf_read(struct virtiovf_data_buffer *vhca_buf, + char __user **buf, size_t *len, loff_t *pos) +{ + unsigned long offset; + ssize_t done = 0; + size_t copy_len; + + copy_len = min_t(size_t, + vhca_buf->start_pos + vhca_buf->length - *pos, *len); + while (copy_len) { + size_t page_offset; + struct page *page; + size_t page_len; + u8 *from_buff; + int ret; + + offset = *pos - vhca_buf->start_pos; + page_offset = offset % PAGE_SIZE; + offset -= page_offset; + page = virtiovf_get_migration_page(vhca_buf, offset); + if (!page) + return -EINVAL; + page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset); + from_buff = kmap_local_page(page); + ret = copy_to_user(*buf, from_buff + page_offset, page_len); + kunmap_local(from_buff); + if (ret) + return -EFAULT; + *pos += page_len; + *len -= page_len; + *buf += page_len; + done += page_len; + copy_len -= page_len; + } + + if (*pos >= vhca_buf->start_pos + vhca_buf->length) { + spin_lock_irq(&vhca_buf->migf->list_lock); + list_del_init(&vhca_buf->buf_elm); + list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); + spin_unlock_irq(&vhca_buf->migf->list_lock); + } + + return done; +} + +static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t len, + loff_t *pos) +{ + struct virtiovf_migration_file *migf = filp->private_data; + struct virtiovf_data_buffer *vhca_buf; + bool first_loop_call = true; + bool end_of_data; + ssize_t done = 0; + + if (pos) + return -ESPIPE; + pos = &filp->f_pos; + + mutex_lock(&migf->lock); + if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) { + done = -ENODEV; + goto out_unlock; + } + + while (len) { + ssize_t count; + + vhca_buf = virtiovf_get_data_buff_from_pos(migf, *pos, &end_of_data); + if (first_loop_call) { + first_loop_call = false; + /* Temporary end of file as part of PRE_COPY */ + if (end_of_data && migf->state == VIRTIOVF_MIGF_STATE_PRECOPY) { + done = -ENOMSG; + goto out_unlock; + } + if (end_of_data && migf->state != VIRTIOVF_MIGF_STATE_COMPLETE) { + done = -EINVAL; + goto out_unlock; + } + } + + if (end_of_data) + goto out_unlock; + + if (!vhca_buf) { + done = -EINVAL; + goto out_unlock; + } + + count = virtiovf_buf_read(vhca_buf, &buf, &len, pos); + if (count < 0) { + done = count; + goto out_unlock; + } + done += count; + } + +out_unlock:
mutex_unlock(&migf->lock); + return done; +} + +static long virtiovf_precopy_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct virtiovf_migration_file *migf = filp->private_data; + struct virtiovf_pci_core_device *virtvdev = migf->virtvdev; + struct vfio_precopy_info info = {}; + loff_t *pos = &filp->f_pos; + bool end_of_data = false; + unsigned long minsz; + u32 ctx_size = 0; + int ret; + + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) + return -ENOTTY; + + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + mutex_lock(&virtvdev->state_mutex); + if (virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY && + virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) { + ret = -EINVAL; + goto err_state_unlock; + } + + /* + * The virtio specification does not include a PRE_COPY concept. + * Since we can expect the data to remain the same for a certain period, + * we use a rate limiter mechanism before making a call to the device. + */ + if (__ratelimit(&migf->pre_copy_rl_state)) { + + ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev, + VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id, + VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE, + &ctx_size); + if (ret) + goto err_state_unlock; + } + + mutex_lock(&migf->lock); + if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) { + ret = -ENODEV; + goto err_migf_unlock; + } + + if (migf->pre_copy_initial_bytes > *pos) { + info.initial_bytes = migf->pre_copy_initial_bytes - *pos; + } else { + info.dirty_bytes = migf->max_pos - *pos; + if (!info.dirty_bytes) + end_of_data = true; + info.dirty_bytes += ctx_size; + } + + if (!end_of_data || !ctx_size) { + mutex_unlock(&migf->lock); + goto done; + } + + mutex_unlock(&migf->lock); + /* + * We finished transferring the current state and the device has a + * dirty state; read a new state. + */ + ret = virtiovf_read_device_context_chunk(migf, ctx_size); + if (ret) + /* + * The machine is running, and the context size could still grow, so + * there is no reason to mark the device state as VIRTIOVF_MIGF_STATE_ERROR.
+ */ + goto err_state_unlock; + +done: + virtiovf_state_mutex_unlock(virtvdev); + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + return 0; + +err_migf_unlock: + mutex_unlock(&migf->lock); +err_state_unlock: + virtiovf_state_mutex_unlock(virtvdev); + return ret; +} + +static const struct file_operations virtiovf_save_fops = { + .owner = THIS_MODULE, + .read = virtiovf_save_read, + .unlocked_ioctl = virtiovf_precopy_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .release = virtiovf_release_file, +}; + +static int +virtiovf_add_buf_header(struct virtiovf_data_buffer *header_buf, + u32 data_size) +{ + struct virtiovf_migration_file *migf = header_buf->migf; + struct virtiovf_migration_header header = {}; + struct page *page; + u8 *to_buff; + + header.record_size = cpu_to_le64(data_size); + header.flags = cpu_to_le32(VIRTIOVF_MIGF_HEADER_FLAGS_TAG_MANDATORY); + header.tag = cpu_to_le32(VIRTIOVF_MIGF_HEADER_TAG_DEVICE_DATA); + page = virtiovf_get_migration_page(header_buf, 0); + if (!page) + return -EINVAL; + to_buff = kmap_local_page(page); + memcpy(to_buff, &header, sizeof(header)); + kunmap_local(to_buff); + header_buf->length = sizeof(header); + header_buf->start_pos = header_buf->migf->max_pos; + migf->max_pos += header_buf->length; + spin_lock_irq(&migf->list_lock); + list_add_tail(&header_buf->buf_elm, &migf->buf_list); + spin_unlock_irq(&migf->list_lock); + return 0; +} + +static int +virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf, + u32 ctx_size) +{ + struct virtiovf_data_buffer *header_buf; + struct virtiovf_data_buffer *buf; + bool unmark_end = false; + struct scatterlist *sg; + unsigned int i; + u32 res_size; + int nent; + int ret; + + buf = virtiovf_get_data_buffer(migf, ctx_size); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + /* Find the total count of SG entries which satisfy the size */ + nent = sg_nents_for_len(buf->table.sgt.sgl, ctx_size); + if (nent <= 0) { + ret = -EINVAL; + goto out; + } + + /* + * Iterate to that SG entry and mark it as last (if it's not already) + * to let the underlying layers iterate only up to that entry.
+ */ + for_each_sg(buf->table.sgt.sgl, sg, nent - 1, i) + ; + + if (!sg_is_last(sg)) { + unmark_end = true; + sg_mark_end(sg); + } + + ret = virtio_pci_admin_dev_parts_get(migf->virtvdev->core_device.pdev, + VIRTIO_RESOURCE_OBJ_DEV_PARTS, + migf->obj_id, + VIRTIO_ADMIN_CMD_DEV_PARTS_GET_TYPE_ALL, + buf->table.sgt.sgl, &res_size); + /* Restore the original SG mark end */ + if (unmark_end) + sg_unmark_end(sg); + if (ret) + goto out; + + buf->length = res_size; + header_buf = virtiovf_get_data_buffer(migf, + sizeof(struct virtiovf_migration_header)); + if (IS_ERR(header_buf)) { + ret = PTR_ERR(header_buf); + goto out; + } + + ret = virtiovf_add_buf_header(header_buf, res_size); + if (ret) + goto out_header; + + buf->start_pos = buf->migf->max_pos; + migf->max_pos += buf->length; + spin_lock_irq(&migf->list_lock); + list_add_tail(&buf->buf_elm, &migf->buf_list); + spin_unlock_irq(&migf->list_lock); + return 0; + +out_header: + virtiovf_put_data_buffer(header_buf); +out: + virtiovf_put_data_buffer(buf); + return ret; +} + +static int +virtiovf_pci_save_device_final_data(struct virtiovf_pci_core_device *virtvdev) +{ + struct virtiovf_migration_file *migf = virtvdev->saving_migf; + u32 ctx_size; + int ret; + + if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) + return -ENODEV; + + ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev, + VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id, + VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE, + &ctx_size); + if (ret) + goto err; + + if (!ctx_size) { + ret = -EINVAL; + goto err; + } + + ret = virtiovf_read_device_context_chunk(migf, ctx_size); + if (ret) + goto err; + + migf->state = VIRTIOVF_MIGF_STATE_COMPLETE; + return 0; + +err: + migf->state = VIRTIOVF_MIGF_STATE_ERROR; + return ret; +} + +static struct virtiovf_migration_file * +virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev, + bool pre_copy) +{ + struct virtiovf_migration_file *migf; + u32 ctx_size; + u32 obj_id; + int ret; + + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); + if (!migf) + return ERR_PTR(-ENOMEM); + + migf->filp = anon_inode_getfile("virtiovf_mig", &virtiovf_save_fops, migf, + O_RDONLY); + if (IS_ERR(migf->filp)) { + ret = PTR_ERR(migf->filp); + kfree(migf); + return ERR_PTR(ret); + } + + stream_open(migf->filp->f_inode, migf->filp); + mutex_init(&migf->lock); + INIT_LIST_HEAD(&migf->buf_list); + INIT_LIST_HEAD(&migf->avail_list); + spin_lock_init(&migf->list_lock); + migf->virtvdev = virtvdev; + + lockdep_assert_held(&virtvdev->state_mutex); + ret = virtiovf_pci_alloc_obj_id(virtvdev, VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_GET, + &obj_id); + if (ret) + goto out; + + migf->obj_id = obj_id; + /* Mark as having a valid obj id, which can even be 0 */ + migf->has_obj_id = true; + ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev, + VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id, + VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE, + &ctx_size); + if (ret) + goto out_clean; + + if (!ctx_size) { + ret = -EINVAL; + goto out_clean; + } + + ret = virtiovf_read_device_context_chunk(migf, ctx_size); + if (ret) + goto out_clean; + + if (pre_copy) { + migf->pre_copy_initial_bytes = migf->max_pos; + /* Arbitrarily set the pre-copy rate limit to 1-second intervals */ + ratelimit_state_init(&migf->pre_copy_rl_state, 1 * HZ, 1); + /* Prevent any rate messages upon its usage */ + ratelimit_set_flags(&migf->pre_copy_rl_state, + RATELIMIT_MSG_ON_RELEASE); + migf->state = VIRTIOVF_MIGF_STATE_PRECOPY; + } else { + migf->state = VIRTIOVF_MIGF_STATE_COMPLETE; + } +
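+	/*
+	 * Worked example (assumed numbers, for illustration only): if the first
+	 * device context read above is 0x3000 bytes, virtiovf_read_device_context_chunk()
+	 * stages the 16-byte migration header at stream position 0 and the data
+	 * at position 16, leaving max_pos = 0x3010. For a PRE_COPY save,
+	 * pre_copy_initial_bytes is therefore 0x3010, and VFIO_MIG_GET_PRECOPY_INFO
+	 * reports initial_bytes = 0x3010 - *pos until userspace drains that first
+	 * chunk; only afterwards do later context reads show up as dirty_bytes.
+	 */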
return migf; + +out_clean: + virtiovf_clean_migf_resources(migf); +out: + fput(migf->filp); + return ERR_PTR(ret); +} + +/* + * Set the required object header at the beginning of the buffer. + * The actual device parts data is written after the header. + */ +static int virtiovf_set_obj_cmd_header(struct virtiovf_data_buffer *vhca_buf) +{ + struct virtio_admin_cmd_resource_obj_cmd_hdr obj_hdr = {}; + struct page *page; + u8 *to_buff; + + obj_hdr.type = cpu_to_le16(VIRTIO_RESOURCE_OBJ_DEV_PARTS); + obj_hdr.id = cpu_to_le32(vhca_buf->migf->obj_id); + page = virtiovf_get_migration_page(vhca_buf, 0); + if (!page) + return -EINVAL; + to_buff = kmap_local_page(page); + memcpy(to_buff, &obj_hdr, sizeof(obj_hdr)); + kunmap_local(to_buff); + + /* Mark the buffer as including the header object data */ + vhca_buf->include_header_object = 1; + return 0; +} + +static int +virtiovf_append_page_to_mig_buf(struct virtiovf_data_buffer *vhca_buf, + const char __user **buf, size_t *len, + loff_t *pos, ssize_t *done) +{ + unsigned long offset; + size_t page_offset; + struct page *page; + size_t page_len; + u8 *to_buff; + int ret; + + offset = *pos - vhca_buf->start_pos; + + if (vhca_buf->include_header_object) + /* The buffer holds the object header, update the offset accordingly */ + offset += sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr); + + page_offset = offset % PAGE_SIZE; + + page = virtiovf_get_migration_page(vhca_buf, offset - page_offset); + if (!page) + return -EINVAL; + + page_len = min_t(size_t, *len, PAGE_SIZE - page_offset); + to_buff = kmap_local_page(page); + ret = copy_from_user(to_buff + page_offset, *buf, page_len); + kunmap_local(to_buff); + if (ret) + return -EFAULT; + + *pos += page_len; + *done += page_len; + *buf += page_len; + *len -= page_len; + vhca_buf->length += page_len; + return 0; +} + +static ssize_t +virtiovf_resume_read_chunk(struct virtiovf_migration_file *migf, + struct virtiovf_data_buffer *vhca_buf, + size_t chunk_size, const char __user **buf, + size_t *len, loff_t *pos, ssize_t *done, + bool *has_work) +{ + size_t copy_len, to_copy; + int ret; + + to_copy = min_t(size_t, *len, chunk_size - vhca_buf->length); + copy_len = to_copy; + while (to_copy) { + ret = virtiovf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, + pos, done); + if (ret) + return ret; + } + + *len -= copy_len; + if (vhca_buf->length == chunk_size) { + migf->load_state = VIRTIOVF_LOAD_STATE_LOAD_CHUNK; + migf->max_pos += chunk_size; + *has_work = true; + } + + return 0; +} + +static int +virtiovf_resume_read_header_data(struct virtiovf_migration_file *migf, + struct virtiovf_data_buffer *vhca_buf, + const char __user **buf, size_t *len, + loff_t *pos, ssize_t *done) +{ + size_t copy_len, to_copy; + size_t required_data; + int ret; + + required_data = migf->record_size - vhca_buf->length; + to_copy = min_t(size_t, *len, required_data); + copy_len = to_copy; + while (to_copy) { + ret = virtiovf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, + pos, done); + if (ret) + return ret; + } + + *len -= copy_len; + if (vhca_buf->length == migf->record_size) { + switch (migf->record_tag) { + default: + /* Optional tag */ + break; + } + + migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER; + migf->max_pos += migf->record_size; + vhca_buf->length = 0; + } + + return 0; +} + +static int +virtiovf_resume_read_header(struct virtiovf_migration_file *migf, + struct virtiovf_data_buffer *vhca_buf, + const char __user **buf, + size_t *len, loff_t *pos, + ssize_t *done, bool *has_work) +{ + struct page
*page; + size_t copy_len; + u8 *to_buff; + int ret; + + copy_len = min_t(size_t, *len, + sizeof(struct virtiovf_migration_header) - vhca_buf->length); + page = virtiovf_get_migration_page(vhca_buf, 0); + if (!page) + return -EINVAL; + to_buff = kmap_local_page(page); + ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len); + if (ret) { + ret = -EFAULT; + goto end; + } + + *buf += copy_len; + *pos += copy_len; + *done += copy_len; + *len -= copy_len; + vhca_buf->length += copy_len; + if (vhca_buf->length == sizeof(struct virtiovf_migration_header)) { + u64 record_size; + u32 flags; + + record_size = le64_to_cpup((__le64 *)to_buff); + if (record_size > MAX_LOAD_SIZE) { + ret = -ENOMEM; + goto end; + } + + migf->record_size = record_size; + flags = le32_to_cpup((__le32 *)(to_buff + + offsetof(struct virtiovf_migration_header, flags))); + migf->record_tag = le32_to_cpup((__le32 *)(to_buff + + offsetof(struct virtiovf_migration_header, tag))); + switch (migf->record_tag) { + case VIRTIOVF_MIGF_HEADER_TAG_DEVICE_DATA: + migf->load_state = VIRTIOVF_LOAD_STATE_PREP_CHUNK; + break; + default: + if (!(flags & VIRTIOVF_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) { + ret = -EOPNOTSUPP; + goto end; + } + /* We may read and skip this optional record data */ + migf->load_state = VIRTIOVF_LOAD_STATE_PREP_HEADER_DATA; + } + + migf->max_pos += vhca_buf->length; + vhca_buf->length = 0; + *has_work = true; + } +end: + kunmap_local(to_buff); + return ret; +} + +static ssize_t virtiovf_resume_write(struct file *filp, const char __user *buf, + size_t len, loff_t *pos) +{ + struct virtiovf_migration_file *migf = filp->private_data; + struct virtiovf_data_buffer *vhca_buf = migf->buf; + struct virtiovf_data_buffer *vhca_buf_header = migf->buf_header; + unsigned int orig_length; + bool has_work = false; + ssize_t done = 0; + int ret = 0; + + if (pos) + return -ESPIPE; + + pos = &filp->f_pos; + if (*pos < vhca_buf->start_pos) + return -EINVAL; + + mutex_lock(&migf->virtvdev->state_mutex); + mutex_lock(&migf->lock); + if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) { + done = -ENODEV; + goto out_unlock; + } + + while (len || has_work) { + has_work = false; + switch (migf->load_state) { + case VIRTIOVF_LOAD_STATE_READ_HEADER: + ret = virtiovf_resume_read_header(migf, vhca_buf_header, &buf, + &len, pos, &done, &has_work); + if (ret) + goto out_unlock; + break; + case VIRTIOVF_LOAD_STATE_PREP_HEADER_DATA: + if (vhca_buf_header->allocated_length < migf->record_size) { + virtiovf_free_data_buffer(vhca_buf_header); + + migf->buf_header = virtiovf_alloc_data_buffer(migf, + migf->record_size); + if (IS_ERR(migf->buf_header)) { + ret = PTR_ERR(migf->buf_header); + migf->buf_header = NULL; + goto out_unlock; + } + + vhca_buf_header = migf->buf_header; + } + + vhca_buf_header->start_pos = migf->max_pos; + migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER_DATA; + break; + case VIRTIOVF_LOAD_STATE_READ_HEADER_DATA: + ret = virtiovf_resume_read_header_data(migf, vhca_buf_header, + &buf, &len, pos, &done); + if (ret) + goto out_unlock; + break; + case VIRTIOVF_LOAD_STATE_PREP_CHUNK: + { + u32 cmd_size = migf->record_size + + sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr); + + /* + * The DMA map/unmap is managed in virtio layer, we just need to extend + * the SG pages to hold the extra required chunk data. 
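+			 * Note that growth is one-way here: allocated_length only
+			 * increases, so a buffer sized for the largest chunk seen so
+			 * far is simply reused for smaller follow-up chunks.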
+			 */
+			if (vhca_buf->allocated_length < cmd_size) {
+				ret = virtiovf_add_migration_pages(vhca_buf,
+					DIV_ROUND_UP_ULL(cmd_size - vhca_buf->allocated_length,
+							 PAGE_SIZE));
+				if (ret)
+					goto out_unlock;
+			}
+
+			vhca_buf->start_pos = migf->max_pos;
+			migf->load_state = VIRTIOVF_LOAD_STATE_READ_CHUNK;
+			break;
+		}
+		case VIRTIOVF_LOAD_STATE_READ_CHUNK:
+			ret = virtiovf_resume_read_chunk(migf, vhca_buf, migf->record_size,
+							 &buf, &len, pos, &done, &has_work);
+			if (ret)
+				goto out_unlock;
+			break;
+		case VIRTIOVF_LOAD_STATE_LOAD_CHUNK:
+			/* Mark the last SG entry and set its length */
+			sg_mark_end(vhca_buf->last_offset_sg);
+			orig_length = vhca_buf->last_offset_sg->length;
+			/* Length should include the resource object command header */
+			vhca_buf->last_offset_sg->length = vhca_buf->length +
+					sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr) -
+					vhca_buf->last_offset;
+			ret = virtio_pci_admin_dev_parts_set(migf->virtvdev->core_device.pdev,
+							     vhca_buf->table.sgt.sgl);
+			/* Restore the original SG data */
+			vhca_buf->last_offset_sg->length = orig_length;
+			sg_unmark_end(vhca_buf->last_offset_sg);
+			if (ret)
+				goto out_unlock;
+			migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER;
+			/* Be ready to read the next chunk */
+			vhca_buf->length = 0;
+			break;
+		default:
+			break;
+		}
+	}
+
+out_unlock:
+	if (ret)
+		migf->state = VIRTIOVF_MIGF_STATE_ERROR;
+	mutex_unlock(&migf->lock);
+	virtiovf_state_mutex_unlock(migf->virtvdev);
+	return ret ? ret : done;
+}
+
+static const struct file_operations virtiovf_resume_fops = {
+	.owner = THIS_MODULE,
+	.write = virtiovf_resume_write,
+	.release = virtiovf_release_file,
+};
+
+static struct virtiovf_migration_file *
+virtiovf_pci_resume_device_data(struct virtiovf_pci_core_device *virtvdev)
+{
+	struct virtiovf_migration_file *migf;
+	struct virtiovf_data_buffer *buf;
+	u32 obj_id;
+	int ret;
+
+	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
+	if (!migf)
+		return ERR_PTR(-ENOMEM);
+
+	migf->filp = anon_inode_getfile("virtiovf_mig", &virtiovf_resume_fops, migf,
+					O_WRONLY);
+	if (IS_ERR(migf->filp)) {
+		ret = PTR_ERR(migf->filp);
+		kfree(migf);
+		return ERR_PTR(ret);
+	}
+
+	stream_open(migf->filp->f_inode, migf->filp);
+	mutex_init(&migf->lock);
+	INIT_LIST_HEAD(&migf->buf_list);
+	INIT_LIST_HEAD(&migf->avail_list);
+	spin_lock_init(&migf->list_lock);
+
+	buf = virtiovf_alloc_data_buffer(migf, VIRTIOVF_TARGET_INITIAL_BUF_SIZE);
+	if (IS_ERR(buf)) {
+		ret = PTR_ERR(buf);
+		goto out;
+	}
+
+	migf->buf = buf;
+
+	buf = virtiovf_alloc_data_buffer(migf,
+				sizeof(struct virtiovf_migration_header));
+	if (IS_ERR(buf)) {
+		ret = PTR_ERR(buf);
+		goto out_clean;
+	}
+
+	migf->buf_header = buf;
+	migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER;
+
+	migf->virtvdev = virtvdev;
+	ret = virtiovf_pci_alloc_obj_id(virtvdev, VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_SET,
+					&obj_id);
+	if (ret)
+		goto out_clean;
+
+	migf->obj_id = obj_id;
+	/* Mark as having a valid obj id, which may even be 0 */
+	migf->has_obj_id = true;
+	ret = virtiovf_set_obj_cmd_header(migf->buf);
+	if (ret)
+		goto out_clean;
+
+	return migf;
+
+out_clean:
+	virtiovf_clean_migf_resources(migf);
+out:
+	fput(migf->filp);
+	return ERR_PTR(ret);
+}
+
+static struct file *
+virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
+				      u32 new)
+{
+	u32 cur = virtvdev->mig_state;
+	int ret;
+
+	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
+		/* NOP */
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
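+		/*
+		 * Also a NOP: STOP and RUNNING_P2P are equivalent for this
+		 * device, as both keep it in the admin STOPPED mode, so
+		 * moving between them requires no device command.
+		 */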
/* NOP */ + return NULL; + } + + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { + ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev, + BIT(VIRTIO_ADMIN_CMD_DEV_MODE_F_STOPPED)); + if (ret) + return ERR_PTR(ret); + return NULL; + } + + if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) { + ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev, 0); + if (ret) + return ERR_PTR(ret); + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { + struct virtiovf_migration_file *migf; + + migf = virtiovf_pci_save_device_data(virtvdev, false); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + virtvdev->saving_migf = migf; + return migf->filp; + } + + if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_RUNNING_P2P)) { + virtiovf_disable_fds(virtvdev); + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) { + struct virtiovf_migration_file *migf; + + migf = virtiovf_pci_resume_device_data(virtvdev); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + virtvdev->resuming_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { + virtiovf_disable_fds(virtvdev); + return NULL; + } + + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) || + (cur == VFIO_DEVICE_STATE_RUNNING_P2P && + new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { + struct virtiovf_migration_file *migf; + + migf = virtiovf_pci_save_device_data(virtvdev, true); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + virtvdev->saving_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) { + ret = virtiovf_pci_save_device_final_data(virtvdev); + return ret ? 
ERR_PTR(ret) : NULL;
+	}
+
+	/*
+	 * vfio_mig_get_next_state() does not use arcs other than the above
+	 */
+	WARN_ON(true);
+	return ERR_PTR(-EINVAL);
+}
+
+static struct file *
+virtiovf_pci_set_device_state(struct vfio_device *vdev,
+			      enum vfio_device_mig_state new_state)
+{
+	struct virtiovf_pci_core_device *virtvdev = container_of(
+		vdev, struct virtiovf_pci_core_device, core_device.vdev);
+	enum vfio_device_mig_state next_state;
+	struct file *res = NULL;
+	int ret;
+
+	mutex_lock(&virtvdev->state_mutex);
+	while (new_state != virtvdev->mig_state) {
+		ret = vfio_mig_get_next_state(vdev, virtvdev->mig_state,
+					      new_state, &next_state);
+		if (ret) {
+			res = ERR_PTR(ret);
+			break;
+		}
+		res = virtiovf_pci_step_device_state_locked(virtvdev, next_state);
+		if (IS_ERR(res))
+			break;
+		virtvdev->mig_state = next_state;
+		if (WARN_ON(res && new_state != virtvdev->mig_state)) {
+			fput(res);
+			res = ERR_PTR(-EINVAL);
+			break;
+		}
+	}
+	virtiovf_state_mutex_unlock(virtvdev);
+	return res;
+}
+
+static int virtiovf_pci_get_device_state(struct vfio_device *vdev,
+					 enum vfio_device_mig_state *curr_state)
+{
+	struct virtiovf_pci_core_device *virtvdev = container_of(
+		vdev, struct virtiovf_pci_core_device, core_device.vdev);
+
+	mutex_lock(&virtvdev->state_mutex);
+	*curr_state = virtvdev->mig_state;
+	virtiovf_state_mutex_unlock(virtvdev);
+	return 0;
+}
+
+static int virtiovf_pci_get_data_size(struct vfio_device *vdev,
+				      unsigned long *stop_copy_length)
+{
+	struct virtiovf_pci_core_device *virtvdev = container_of(
+		vdev, struct virtiovf_pci_core_device, core_device.vdev);
+	bool obj_id_exists;
+	u32 res_size;
+	u32 obj_id;
+	int ret;
+
+	mutex_lock(&virtvdev->state_mutex);
+	obj_id_exists = virtvdev->saving_migf && virtvdev->saving_migf->has_obj_id;
+	if (!obj_id_exists) {
+		ret = virtiovf_pci_alloc_obj_id(virtvdev,
+						VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_GET,
+						&obj_id);
+		if (ret)
+			goto end;
+	} else {
+		obj_id = virtvdev->saving_migf->obj_id;
+	}
+
+	ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
+				VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id,
+				VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
+				&res_size);
+	if (!ret)
+		*stop_copy_length = res_size;
+
+	/*
+	 * We can't leave this obj_id alive if it didn't exist before; otherwise
+	 * it might stay alive even without an active migration flow (e.g. if
+	 * the migration was cancelled).
+	 */
+	if (!obj_id_exists)
+		virtiovf_pci_free_obj_id(virtvdev, obj_id);
+end:
+	virtiovf_state_mutex_unlock(virtvdev);
+	return ret;
+}
+
+static const struct vfio_migration_ops virtvdev_pci_mig_ops = {
+	.migration_set_state = virtiovf_pci_set_device_state,
+	.migration_get_state = virtiovf_pci_get_device_state,
+	.migration_get_data_size = virtiovf_pci_get_data_size,
+};
+
+void virtiovf_set_migratable(struct virtiovf_pci_core_device *virtvdev)
+{
+	virtvdev->migrate_cap = 1;
+	mutex_init(&virtvdev->state_mutex);
+	spin_lock_init(&virtvdev->reset_lock);
+	virtvdev->core_device.vdev.migration_flags =
+		VFIO_MIGRATION_STOP_COPY |
+		VFIO_MIGRATION_P2P |
+		VFIO_MIGRATION_PRE_COPY;
+	virtvdev->core_device.vdev.mig_ops = &virtvdev_pci_mig_ops;
+}
+
+void virtiovf_open_migration(struct virtiovf_pci_core_device *virtvdev)
+{
+	if (!virtvdev->migrate_cap)
+		return;
+
+	virtvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+}
+
+void virtiovf_close_migration(struct virtiovf_pci_core_device *virtvdev)
+{
+	if (!virtvdev->migrate_cap)
+		return;
+
+	virtiovf_disable_fds(virtvdev);
+}
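For context, the state arcs above are reached from userspace through the VFIO
migration v2 UAPI. A minimal sketch of that ioctl path, assuming "device_fd"
is an already-open VFIO device fd and with error handling elided (illustrative
only, not part of the patch):

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	/* Request a migration state transition; returns the data_fd, or -1. */
	static int set_mig_state(int device_fd, uint32_t new_state)
	{
		uint8_t buf[sizeof(struct vfio_device_feature) +
			    sizeof(struct vfio_device_feature_mig_state)]
			    __attribute__((aligned(8))) = {};
		struct vfio_device_feature *feature = (void *)buf;
		struct vfio_device_feature_mig_state *mig = (void *)feature->data;

		feature->argsz = sizeof(buf);
		feature->flags = VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
		mig->device_state = new_state;
		if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
			return -1;
		/* The kernel returns a data_fd for saving/resuming states */
		return mig->data_fd;
	}

Stepping RUNNING -> STOP -> STOP_COPY this way yields a saving fd whose read()
side streams the device context captured above, while a STOP -> RESUMING
transition yields the fd consumed by virtiovf_resume_write().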