From cd48ebc5c4f2e94830b238f035ebf04f1c3a4433 Mon Sep 17 00:00:00 2001 From: Shang XiaoJing Date: Thu, 22 Sep 2022 20:35:07 +0800 Subject: vfio/mlx5: Switch to use module_pci_driver() macro Since pci provides the helper macro module_pci_driver(), we may replace the module_init/exit with it. Signed-off-by: Shang XiaoJing Reviewed-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220922123507.11222-1-shangxiaojing@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/main.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index fd6ccb8454a2..457138b92f13 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -676,18 +676,7 @@ static struct pci_driver mlx5vf_pci_driver = { .driver_managed_dma = true, }; -static void __exit mlx5vf_pci_cleanup(void) -{ - pci_unregister_driver(&mlx5vf_pci_driver); -} - -static int __init mlx5vf_pci_init(void) -{ - return pci_register_driver(&mlx5vf_pci_driver); -} - -module_init(mlx5vf_pci_init); -module_exit(mlx5vf_pci_cleanup); +module_pci_driver(mlx5vf_pci_driver); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Max Gurtovoy "); -- cgit From 913447d06f032a9e9c84870bec0b1adb8c588f29 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 4 Nov 2022 15:20:07 +0100 Subject: vfio: Remove vfio_free_device With the "mess" sorted out, we should be able to inline the vfio_free_device call introduced by commit cb9ff3f3b84c ("vfio: Add helpers for unifying vfio_device life cycle") and remove them from driver release callbacks. Signed-off-by: Eric Farman Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Reviewed-by: Tony Krowiak # vfio-ap part Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221104142007.1314999-8-farman@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index badc9d828cac..9be2d5be5d95 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2109,7 +2109,6 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev) mutex_destroy(&vdev->vma_lock); kfree(vdev->region); kfree(vdev->pm_save); - vfio_free_device(core_vdev); } EXPORT_SYMBOL_GPL(vfio_pci_core_release_dev); -- cgit From 4e016f969529f2aec0545e90119e7eb3cb124c46 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 6 Nov 2022 19:46:18 +0200 Subject: vfio: Add an option to get migration data size Add an option to get migration data size by introducing a new migration feature named VFIO_DEVICE_FEATURE_MIG_DATA_SIZE. Upon VFIO_DEVICE_FEATURE_GET the estimated data length that will be required to complete STOP_COPY is returned. This option may better enable user space to consider before moving to STOP_COPY whether it can meet the downtime SLA based on the returned data. The patch also includes the implementation for mlx5 and hisi for this new option to make it feature complete for the existing drivers in this area. Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Reviewed-by: Longfang Liu Link: https://lore.kernel.org/r/20221106174630.25909-2-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 9 +++++++++ drivers/vfio/pci/mlx5/main.c | 18 ++++++++++++++++++ drivers/vfio/pci/vfio_pci_core.c | 3 ++- 3 files changed, 29 insertions(+), 1 deletion(-) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 39eeca18a0f7..0c0c0c7f0521 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -957,6 +957,14 @@ hisi_acc_vfio_pci_set_device_state(struct vfio_device *vdev, return res; } +static int +hisi_acc_vfio_pci_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + *stop_copy_length = sizeof(struct acc_vf_data); + return 0; +} + static int hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev, enum vfio_device_mig_state *curr_state) @@ -1213,6 +1221,7 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev) static const struct vfio_migration_ops hisi_acc_vfio_pci_migrn_state_ops = { .migration_set_state = hisi_acc_vfio_pci_set_device_state, .migration_get_state = hisi_acc_vfio_pci_get_device_state, + .migration_get_data_size = hisi_acc_vfio_pci_get_data_size, }; static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev) diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 457138b92f13..6e9cf2aacc52 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -512,6 +512,23 @@ mlx5vf_pci_set_device_state(struct vfio_device *vdev, return res; } +static int mlx5vf_pci_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + struct mlx5vf_pci_core_device *mvdev = container_of( + vdev, struct mlx5vf_pci_core_device, core_device.vdev); + size_t state_size; + int ret; + + mutex_lock(&mvdev->state_mutex); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, + &state_size); + if (!ret) + *stop_copy_length = state_size; + mlx5vf_state_mutex_unlock(mvdev); + return ret; +} + static int mlx5vf_pci_get_device_state(struct vfio_device *vdev, enum vfio_device_mig_state *curr_state) { @@ -577,6 +594,7 @@ static void mlx5vf_pci_close_device(struct vfio_device *core_vdev) static const struct vfio_migration_ops mlx5vf_pci_mig_ops = { .migration_set_state = mlx5vf_pci_set_device_state, .migration_get_state = mlx5vf_pci_get_device_state, + .migration_get_data_size = mlx5vf_pci_get_data_size, }; static const struct vfio_log_ops mlx5vf_pci_log_ops = { diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 9be2d5be5d95..189d4930c276 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2127,7 +2127,8 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) if (vdev->vdev.mig_ops) { if (!(vdev->vdev.mig_ops->migration_get_state && - vdev->vdev.mig_ops->migration_set_state) || + vdev->vdev.mig_ops->migration_set_state && + vdev->vdev.mig_ops->migration_get_data_size) || !(vdev->vdev.migration_flags & VFIO_MIGRATION_STOP_COPY)) return -EINVAL; } -- cgit From 2f5d8cef45c30edcf3972d345f606df563d3a48e Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 6 Nov 2022 19:46:19 +0200 Subject: vfio/mlx5: Fix a typo in mlx5vf_cmd_load_vhca_state() Fix a typo in mlx5vf_cmd_load_vhca_state() to use the 'load' memory layout. As in/out sizes are equal for save and load commands there wasn't any functional issue. Fixes: f1d98f346ee3 ("vfio/mlx5: Expose migration commands over mlx5 device") Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20221106174630.25909-3-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index c604b70437a5..0848bc905d3e 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -378,8 +378,8 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf) { struct mlx5_core_dev *mdev; - u32 out[MLX5_ST_SZ_DW(save_vhca_state_out)] = {}; - u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; + u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; u32 pdn, mkey; int err; -- cgit From 8f8bcc8c720c360885639de66fe69756febed824 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 5 Dec 2022 11:29:16 -0400 Subject: vfio/pci: Move all the SPAPR PCI specific logic to vfio_pci_core.ko The vfio_spapr_pci_eeh_open/release() functions are one line wrappers around an arch function. Just call them directly. This eliminates some weird exported symbols that don't need to exist. Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/1-v5-fc5346cacfd4+4c482-vfio_modules_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 189d4930c276..56501e7ef564 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -27,6 +27,9 @@ #include #include #include +#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) +#include +#endif #include "vfio_pci_priv.h" @@ -686,7 +689,9 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) vdev->sriov_pf_core_dev->vf_token->users--; mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock); } - vfio_spapr_pci_eeh_release(vdev->pdev); +#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) + eeh_dev_release(vdev->pdev); +#endif vfio_pci_core_disable(vdev); mutex_lock(&vdev->igate); @@ -705,7 +710,9 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_close_device); void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev) { vfio_pci_probe_mmaps(vdev); - vfio_spapr_pci_eeh_open(vdev->pdev); +#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) + eeh_dev_open(vdev->pdev); +#endif if (vdev->sriov_pf_core_dev) { mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock); -- cgit From 20601c45a0fa20bbb5545f4dd69f4f18448f4973 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 5 Dec 2022 11:29:19 -0400 Subject: vfio: Remove CONFIG_VFIO_SPAPR_EEH We don't need a kconfig symbol for this, just directly test CONFIG_EEH in the few places that need it. Reviewed-by: Christoph Hellwig Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v5-fc5346cacfd4+4c482-vfio_modules_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 56501e7ef564..f9365a5bc961 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -27,7 +27,7 @@ #include #include #include -#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) +#if IS_ENABLED(CONFIG_EEH) #include #endif @@ -689,7 +689,7 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) vdev->sriov_pf_core_dev->vf_token->users--; mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock); } -#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) +#if IS_ENABLED(CONFIG_EEH) eeh_dev_release(vdev->pdev); #endif vfio_pci_core_disable(vdev); @@ -710,7 +710,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_close_device); void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev) { vfio_pci_probe_mmaps(vdev); -#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) +#if IS_ENABLED(CONFIG_EEH) eeh_dev_open(vdev->pdev); #endif -- cgit From 0e7caa65d707b93fbb4322c6313f739fa9103dfa Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:27 +0200 Subject: vfio/mlx5: Enforce a single SAVE command at a time Enforce a single SAVE command at a time. As the SAVE command is an asynchronous one, we must enforce running only a single command at a time. This will preserve ordering between multiple calls and protect from races on the migration file data structure. This is a must for the next patches from the series where as part of PRE_COPY we may have multiple images to be saved and multiple SAVE commands may be issued from different flows. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-4-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 6 ++++++ drivers/vfio/pci/mlx5/cmd.h | 1 + drivers/vfio/pci/mlx5/main.c | 7 +++++++ 3 files changed, 14 insertions(+) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 0848bc905d3e..55ee8036f59c 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -281,6 +281,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); mlx5_core_dealloc_pd(mdev, async_data->pdn); kvfree(async_data->out); + complete(&migf->save_comp); fput(migf->filp); } @@ -321,6 +322,10 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, return -ENOTCONN; mdev = mvdev->mdev; + err = wait_for_completion_interruptible(&migf->save_comp); + if (err) + return err; + err = mlx5_core_alloc_pd(mdev, &pdn); if (err) return err; @@ -371,6 +376,7 @@ err_create_mkey: dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); err_dma_map: mlx5_core_dealloc_pd(mdev, pdn); + complete(&migf->save_comp); return err; } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 921d5720a1e5..8ffa7699872c 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -37,6 +37,7 @@ struct mlx5_vf_migration_file { unsigned long last_offset; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; + struct completion save_comp; struct mlx5_async_ctx async_ctx; struct mlx5vf_async_data async_data; }; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 6e9cf2aacc52..0d71ebb2a972 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -245,6 +245,13 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); init_waitqueue_head(&migf->poll_wait); + init_completion(&migf->save_comp); + /* + * save_comp is being used as a binary semaphore built from + * a completion. A normal mutex cannot be used because the lock is + * passed between kernel threads and lockdep can't model this. + */ + complete(&migf->save_comp); mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, -- cgit From 9945a67ea4b30657dd998c7fbbea1b3950747168 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:28 +0200 Subject: vfio/mlx5: Refactor PD usage This patch refactors PD usage such as its life cycle will be as of the migration file instead of allocating/destroying it upon each SAVE/LOAD command. This is a preparation step towards the PRE_COPY series where multiple images will be SAVED/LOADED and a single PD can be simply reused. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-5-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 53 +++++++++++++++++++++++++++++--------------- drivers/vfio/pci/mlx5/cmd.h | 5 ++++- drivers/vfio/pci/mlx5/main.c | 44 ++++++++++++++++++++++++++---------- 3 files changed, 71 insertions(+), 31 deletions(-) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 55ee8036f59c..a97eac49e3d6 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -279,7 +279,6 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mlx5_core_destroy_mkey(mdev, async_data->mkey); dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); - mlx5_core_dealloc_pd(mdev, async_data->pdn); kvfree(async_data->out); complete(&migf->save_comp); fput(migf->filp); @@ -314,7 +313,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; struct mlx5vf_async_data *async_data; struct mlx5_core_dev *mdev; - u32 pdn, mkey; + u32 mkey; int err; lockdep_assert_held(&mvdev->state_mutex); @@ -326,16 +325,12 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, if (err) return err; - err = mlx5_core_alloc_pd(mdev, &pdn); - if (err) - return err; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); if (err) goto err_dma_map; - err = _create_mkey(mdev, pdn, migf, NULL, &mkey); + err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey); if (err) goto err_create_mkey; @@ -357,7 +352,6 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, migf->total_length = 0; get_file(migf->filp); async_data->mkey = mkey; - async_data->pdn = pdn; err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), async_data->out, out_size, mlx5vf_save_callback, @@ -375,7 +369,6 @@ err_out: err_create_mkey: dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); err_dma_map: - mlx5_core_dealloc_pd(mdev, pdn); complete(&migf->save_comp); return err; } @@ -386,7 +379,7 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_core_dev *mdev; u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; - u32 pdn, mkey; + u32 mkey; int err; lockdep_assert_held(&mvdev->state_mutex); @@ -400,15 +393,11 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, } mdev = mvdev->mdev; - err = mlx5_core_alloc_pd(mdev, &pdn); - if (err) - goto end; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); if (err) - goto err_reg; + goto end; - err = _create_mkey(mdev, pdn, migf, NULL, &mkey); + err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey); if (err) goto err_mkey; @@ -424,13 +413,41 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, mlx5_core_destroy_mkey(mdev, mkey); err_mkey: dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); -err_reg: - mlx5_core_dealloc_pd(mdev, pdn); end: mutex_unlock(&migf->lock); return err; } +int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf) +{ + int err; + + lockdep_assert_held(&migf->mvdev->state_mutex); + if (migf->mvdev->mdev_detach) + return -ENOTCONN; + + err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn); + return err; +} + +void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) +{ + lockdep_assert_held(&migf->mvdev->state_mutex); + if (migf->mvdev->mdev_detach) + return; + + mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn); +} + +void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) +{ + lockdep_assert_held(&migf->mvdev->state_mutex); + + WARN_ON(migf->mvdev->mdev_detach); + + mlx5vf_cmd_dealloc_pd(migf); +} + static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes, u32 req_nodes) { diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 8ffa7699872c..ba760f956d53 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -16,7 +16,6 @@ struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; int status; - u32 pdn; u32 mkey; void *out; }; @@ -27,6 +26,7 @@ struct mlx5_vf_migration_file { u8 disabled:1; u8 is_err:1; + u32 pdn; struct sg_append_table table; size_t total_length; size_t allocated_length; @@ -127,6 +127,9 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf); int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf); +int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf); +void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf); +void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 0d71ebb2a972..1916f7c1468c 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -236,12 +236,15 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf, O_RDONLY); if (IS_ERR(migf->filp)) { - int err = PTR_ERR(migf->filp); - - kfree(migf); - return ERR_PTR(err); + ret = PTR_ERR(migf->filp); + goto end; } + migf->mvdev = mvdev; + ret = mlx5vf_cmd_alloc_pd(migf); + if (ret) + goto out_free; + stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); init_waitqueue_head(&migf->poll_wait); @@ -257,20 +260,25 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &migf->total_length); if (ret) - goto out_free; + goto out_pd; ret = mlx5vf_add_migration_pages( migf, DIV_ROUND_UP_ULL(migf->total_length, PAGE_SIZE)); if (ret) - goto out_free; + goto out_pd; - migf->mvdev = mvdev; ret = mlx5vf_cmd_save_vhca_state(mvdev, migf); if (ret) - goto out_free; + goto out_save; return migf; +out_save: + mlx5vf_disable_fd(migf); +out_pd: + mlx5vf_cmd_dealloc_pd(migf); out_free: fput(migf->filp); +end: + kfree(migf); return ERR_PTR(ret); } @@ -352,6 +360,7 @@ static struct mlx5_vf_migration_file * mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) { struct mlx5_vf_migration_file *migf; + int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL); if (!migf) @@ -360,20 +369,30 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf, O_WRONLY); if (IS_ERR(migf->filp)) { - int err = PTR_ERR(migf->filp); - - kfree(migf); - return ERR_PTR(err); + ret = PTR_ERR(migf->filp); + goto end; } + + migf->mvdev = mvdev; + ret = mlx5vf_cmd_alloc_pd(migf); + if (ret) + goto out_free; + stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); return migf; +out_free: + fput(migf->filp); +end: + kfree(migf); + return ERR_PTR(ret); } void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) { if (mvdev->resuming_migf) { mlx5vf_disable_fd(mvdev->resuming_migf); + mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf); fput(mvdev->resuming_migf->filp); mvdev->resuming_migf = NULL; } @@ -381,6 +400,7 @@ void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx); cancel_work_sync(&mvdev->saving_migf->async_data.work); mlx5vf_disable_fd(mvdev->saving_migf); + mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf); fput(mvdev->saving_migf->filp); mvdev->saving_migf = NULL; } -- cgit From 91454f8b9bf4ce6be1d9a0b4de401bc3c6313a95 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:29 +0200 Subject: vfio/mlx5: Refactor MKEY usage This patch refactors MKEY usage such as its life cycle will be as of the migration file instead of allocating/destroying it upon each SAVE/LOAD command. This is a preparation step towards the PRE_COPY series where multiple images will be SAVED/LOADED. We achieve it by having a new struct named mlx5_vhca_data_buffer which holds the mkey and its related stuff as of sg_append_table, allocated_length, etc. The above fields were taken out from the migration file main struct, into mlx5_vhca_data_buffer dedicated struct with the proper helpers in place. For now we have a single mlx5_vhca_data_buffer per migration file. However, in coming patches we'll have multiple of them to support multiple images. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-6-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 162 +++++++++++++++++++++++++++---------------- drivers/vfio/pci/mlx5/cmd.h | 37 +++++++--- drivers/vfio/pci/mlx5/main.c | 92 +++++++++++++----------- 3 files changed, 178 insertions(+), 113 deletions(-) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index a97eac49e3d6..ed4c472d2eae 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -210,11 +210,11 @@ err_exec: } static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, - struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf, struct mlx5_vhca_recv_buf *recv_buf, u32 *mkey) { - size_t npages = migf ? DIV_ROUND_UP(migf->total_length, PAGE_SIZE) : + size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) : recv_buf->npages; int err = 0, inlen; __be64 *mtt; @@ -232,10 +232,10 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, DIV_ROUND_UP(npages, 2)); mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); - if (migf) { + if (buf) { struct sg_dma_page_iter dma_iter; - for_each_sgtable_dma_page(&migf->table.sgt, &dma_iter, 0) + for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0) *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); } else { int i; @@ -255,20 +255,99 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, MLX5_SET(mkc, mkc, qpn, 0xffffff); MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2)); - MLX5_SET64(mkc, mkc, len, - migf ? migf->total_length : (npages * PAGE_SIZE)); + MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE); err = mlx5_core_create_mkey(mdev, mkey, in, inlen); kvfree(in); return err; } +static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev; + struct mlx5_core_dev *mdev = mvdev->mdev; + int ret; + + lockdep_assert_held(&mvdev->state_mutex); + if (mvdev->mdev_detach) + return -ENOTCONN; + + if (buf->dmaed || !buf->allocated_length) + return -EINVAL; + + ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); + if (ret) + return ret; + + ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey); + if (ret) + goto err; + + buf->dmaed = true; + + return 0; +err: + dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); + return ret; +} + +void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + struct mlx5_vf_migration_file *migf = buf->migf; + struct sg_page_iter sg_iter; + + lockdep_assert_held(&migf->mvdev->state_mutex); + WARN_ON(migf->mvdev->mdev_detach); + + if (buf->dmaed) { + mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey); + dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt, + buf->dma_dir, 0); + } + + /* Undo alloc_pages_bulk_array() */ + for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) + __free_page(sg_page_iter_page(&sg_iter)); + sg_free_append_table(&buf->table); + kfree(buf); +} + +struct mlx5_vhca_data_buffer * +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, + enum dma_data_direction dma_dir) +{ + struct mlx5_vhca_data_buffer *buf; + int ret; + + buf = kzalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) + return ERR_PTR(-ENOMEM); + + buf->dma_dir = dma_dir; + buf->migf = migf; + if (length) { + ret = mlx5vf_add_migration_pages(buf, + DIV_ROUND_UP_ULL(length, PAGE_SIZE)); + if (ret) + goto end; + + ret = mlx5vf_dma_data_buffer(buf); + if (ret) + goto end; + } + + return buf; +end: + mlx5vf_free_data_buffer(buf); + return ERR_PTR(ret); +} + void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) { struct mlx5vf_async_data *async_data = container_of(_work, struct mlx5vf_async_data, work); struct mlx5_vf_migration_file *migf = container_of(async_data, struct mlx5_vf_migration_file, async_data); - struct mlx5_core_dev *mdev = migf->mvdev->mdev; mutex_lock(&migf->lock); if (async_data->status) { @@ -276,9 +355,6 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) wake_up_interruptible(&migf->poll_wait); } mutex_unlock(&migf->lock); - - mlx5_core_destroy_mkey(mdev, async_data->mkey); - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); kvfree(async_data->out); complete(&migf->save_comp); fput(migf->filp); @@ -292,7 +368,7 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { - WRITE_ONCE(migf->total_length, + WRITE_ONCE(migf->buf->length, MLX5_GET(save_vhca_state_out, async_data->out, actual_image_size)); wake_up_interruptible(&migf->poll_wait); @@ -307,39 +383,28 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) } int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf) + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf) { u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; struct mlx5vf_async_data *async_data; - struct mlx5_core_dev *mdev; - u32 mkey; int err; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; - mdev = mvdev->mdev; err = wait_for_completion_interruptible(&migf->save_comp); if (err) return err; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, - 0); - if (err) - goto err_dma_map; - - err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey); - if (err) - goto err_create_mkey; - MLX5_SET(save_vhca_state_in, in, opcode, MLX5_CMD_OP_SAVE_VHCA_STATE); MLX5_SET(save_vhca_state_in, in, op_mod, 0); MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); - MLX5_SET(save_vhca_state_in, in, mkey, mkey); - MLX5_SET(save_vhca_state_in, in, size, migf->total_length); + MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey); + MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); async_data = &migf->async_data; async_data->out = kvzalloc(out_size, GFP_KERNEL); @@ -348,10 +413,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, goto err_out; } - /* no data exists till the callback comes back */ - migf->total_length = 0; get_file(migf->filp); - async_data->mkey = mkey; err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), async_data->out, out_size, mlx5vf_save_callback, @@ -365,57 +427,33 @@ err_exec: fput(migf->filp); kvfree(async_data->out); err_out: - mlx5_core_destroy_mkey(mdev, mkey); -err_create_mkey: - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); -err_dma_map: complete(&migf->save_comp); return err; } int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf) + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf) { - struct mlx5_core_dev *mdev; u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; - u32 mkey; int err; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; - mutex_lock(&migf->lock); - if (!migf->total_length) { - err = -EINVAL; - goto end; - } - - mdev = mvdev->mdev; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); + err = mlx5vf_dma_data_buffer(buf); if (err) - goto end; - - err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey); - if (err) - goto err_mkey; + return err; MLX5_SET(load_vhca_state_in, in, opcode, MLX5_CMD_OP_LOAD_VHCA_STATE); MLX5_SET(load_vhca_state_in, in, op_mod, 0); MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id); - MLX5_SET(load_vhca_state_in, in, mkey, mkey); - MLX5_SET(load_vhca_state_in, in, size, migf->total_length); - - err = mlx5_cmd_exec_inout(mdev, load_vhca_state, in, out); - - mlx5_core_destroy_mkey(mdev, mkey); -err_mkey: - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); -end: - mutex_unlock(&migf->lock); - return err; + MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey); + MLX5_SET(load_vhca_state_in, in, size, buf->length); + return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out); } int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf) @@ -445,6 +483,10 @@ void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) WARN_ON(migf->mvdev->mdev_detach); + if (migf->buf) { + mlx5vf_free_data_buffer(migf->buf); + migf->buf = NULL; + } mlx5vf_cmd_dealloc_pd(migf); } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index ba760f956d53..b0f08dfc8120 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -12,11 +12,25 @@ #include #include +struct mlx5_vhca_data_buffer { + struct sg_append_table table; + loff_t start_pos; + u64 length; + u64 allocated_length; + u32 mkey; + enum dma_data_direction dma_dir; + u8 dmaed:1; + struct mlx5_vf_migration_file *migf; + /* Optimize mlx5vf_get_migration_page() for sequential access */ + struct scatterlist *last_offset_sg; + unsigned int sg_last_entry; + unsigned long last_offset; +}; + struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; int status; - u32 mkey; void *out; }; @@ -27,14 +41,7 @@ struct mlx5_vf_migration_file { u8 is_err:1; u32 pdn; - struct sg_append_table table; - size_t total_length; - size_t allocated_length; - - /* Optimize mlx5vf_get_migration_page() for sequential access */ - struct scatterlist *last_offset_sg; - unsigned int sg_last_entry; - unsigned long last_offset; + struct mlx5_vhca_data_buffer *buf; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; struct completion save_comp; @@ -124,12 +131,20 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf); + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf); int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf); + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf); int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf); void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf); void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf); +struct mlx5_vhca_data_buffer * +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir); +void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf); +int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, + unsigned int npages); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 1916f7c1468c..5f694fce854c 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -33,7 +33,7 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) } static struct page * -mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, +mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, unsigned long offset) { unsigned long cur_offset = 0; @@ -41,20 +41,20 @@ mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, unsigned int i; /* All accesses are sequential */ - if (offset < migf->last_offset || !migf->last_offset_sg) { - migf->last_offset = 0; - migf->last_offset_sg = migf->table.sgt.sgl; - migf->sg_last_entry = 0; + if (offset < buf->last_offset || !buf->last_offset_sg) { + buf->last_offset = 0; + buf->last_offset_sg = buf->table.sgt.sgl; + buf->sg_last_entry = 0; } - cur_offset = migf->last_offset; + cur_offset = buf->last_offset; - for_each_sg(migf->last_offset_sg, sg, - migf->table.sgt.orig_nents - migf->sg_last_entry, i) { + for_each_sg(buf->last_offset_sg, sg, + buf->table.sgt.orig_nents - buf->sg_last_entry, i) { if (offset < sg->length + cur_offset) { - migf->last_offset_sg = sg; - migf->sg_last_entry += i; - migf->last_offset = cur_offset; + buf->last_offset_sg = sg; + buf->sg_last_entry += i; + buf->last_offset = cur_offset; return nth_page(sg_page(sg), (offset - cur_offset) / PAGE_SIZE); } @@ -63,8 +63,8 @@ mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, return NULL; } -static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf, - unsigned int npages) +int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, + unsigned int npages) { unsigned int to_alloc = npages; struct page **page_list; @@ -85,13 +85,13 @@ static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf, } to_alloc -= filled; ret = sg_alloc_append_table_from_pages( - &migf->table, page_list, filled, 0, + &buf->table, page_list, filled, 0, filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, GFP_KERNEL); if (ret) goto err; - migf->allocated_length += filled * PAGE_SIZE; + buf->allocated_length += filled * PAGE_SIZE; /* clean input for another bulk allocation */ memset(page_list, 0, filled * sizeof(*page_list)); to_fill = min_t(unsigned int, to_alloc, @@ -108,16 +108,8 @@ err: static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) { - struct sg_page_iter sg_iter; - mutex_lock(&migf->lock); - /* Undo alloc_pages_bulk_array() */ - for_each_sgtable_page(&migf->table.sgt, &sg_iter, 0) - __free_page(sg_page_iter_page(&sg_iter)); - sg_free_append_table(&migf->table); migf->disabled = true; - migf->total_length = 0; - migf->allocated_length = 0; migf->filp->f_pos = 0; mutex_unlock(&migf->lock); } @@ -136,6 +128,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; + struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; ssize_t done = 0; if (pos) @@ -144,16 +137,16 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (!(filp->f_flags & O_NONBLOCK)) { if (wait_event_interruptible(migf->poll_wait, - READ_ONCE(migf->total_length) || migf->is_err)) + READ_ONCE(vhca_buf->length) || migf->is_err)) return -ERESTARTSYS; } mutex_lock(&migf->lock); - if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(migf->total_length)) { + if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(vhca_buf->length)) { done = -EAGAIN; goto out_unlock; } - if (*pos > migf->total_length) { + if (*pos > vhca_buf->length) { done = -EINVAL; goto out_unlock; } @@ -162,7 +155,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, goto out_unlock; } - len = min_t(size_t, migf->total_length - *pos, len); + len = min_t(size_t, vhca_buf->length - *pos, len); while (len) { size_t page_offset; struct page *page; @@ -171,7 +164,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, int ret; page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(migf, *pos - page_offset); + page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); if (!page) { if (done == 0) done = -EINVAL; @@ -208,7 +201,7 @@ static __poll_t mlx5vf_save_poll(struct file *filp, mutex_lock(&migf->lock); if (migf->disabled || migf->is_err) pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - else if (READ_ONCE(migf->total_length)) + else if (READ_ONCE(migf->buf->length)) pollflags = EPOLLIN | EPOLLRDNORM; mutex_unlock(&migf->lock); @@ -227,6 +220,8 @@ static struct mlx5_vf_migration_file * mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) { struct mlx5_vf_migration_file *migf; + struct mlx5_vhca_data_buffer *buf; + size_t length; int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL); @@ -257,22 +252,23 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) complete(&migf->save_comp); mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, - &migf->total_length); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length); if (ret) goto out_pd; - ret = mlx5vf_add_migration_pages( - migf, DIV_ROUND_UP_ULL(migf->total_length, PAGE_SIZE)); - if (ret) + buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); goto out_pd; + } - ret = mlx5vf_cmd_save_vhca_state(mvdev, migf); + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf); if (ret) goto out_save; + migf->buf = buf; return migf; out_save: - mlx5vf_disable_fd(migf); + mlx5vf_free_data_buffer(buf); out_pd: mlx5vf_cmd_dealloc_pd(migf); out_free: @@ -286,6 +282,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; + struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; loff_t requested_length; ssize_t done = 0; @@ -306,10 +303,10 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, goto out_unlock; } - if (migf->allocated_length < requested_length) { + if (vhca_buf->allocated_length < requested_length) { done = mlx5vf_add_migration_pages( - migf, - DIV_ROUND_UP(requested_length - migf->allocated_length, + vhca_buf, + DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, PAGE_SIZE)); if (done) goto out_unlock; @@ -323,7 +320,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, int ret; page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(migf, *pos - page_offset); + page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); if (!page) { if (done == 0) done = -EINVAL; @@ -342,7 +339,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, len -= page_len; done += page_len; buf += page_len; - migf->total_length += page_len; + vhca_buf->length += page_len; } out_unlock: mutex_unlock(&migf->lock); @@ -360,6 +357,7 @@ static struct mlx5_vf_migration_file * mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) { struct mlx5_vf_migration_file *migf; + struct mlx5_vhca_data_buffer *buf; int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL); @@ -378,9 +376,18 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) if (ret) goto out_free; + buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_pd; + } + + migf->buf = buf; stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); return migf; +out_pd: + mlx5vf_cmd_dealloc_pd(migf); out_free: fput(migf->filp); end: @@ -474,7 +481,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { ret = mlx5vf_cmd_load_vhca_state(mvdev, - mvdev->resuming_migf); + mvdev->resuming_migf, + mvdev->resuming_migf->buf); if (ret) return ERR_PTR(ret); mlx5vf_disable_fds(mvdev); -- cgit From 8b599d143419669e57da3881d8293f17809688d7 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:30 +0200 Subject: vfio/mlx5: Refactor migration file state Refactor migration file state to be an emum which is mutual exclusive. As of that dropped the 'disabled' state as 'error' is the same from functional point of view. Next patches from the series will extend this enum for other relevant states. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-7-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 2 +- drivers/vfio/pci/mlx5/cmd.h | 7 +++++-- drivers/vfio/pci/mlx5/main.c | 11 ++++++----- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index ed4c472d2eae..fcba12326185 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -351,7 +351,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mutex_lock(&migf->lock); if (async_data->status) { - migf->is_err = true; + migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } mutex_unlock(&migf->lock); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index b0f08dfc8120..14403e654e4e 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -12,6 +12,10 @@ #include #include +enum mlx5_vf_migf_state { + MLX5_MIGF_STATE_ERROR = 1, +}; + struct mlx5_vhca_data_buffer { struct sg_append_table table; loff_t start_pos; @@ -37,8 +41,7 @@ struct mlx5vf_async_data { struct mlx5_vf_migration_file { struct file *filp; struct mutex lock; - u8 disabled:1; - u8 is_err:1; + enum mlx5_vf_migf_state state; u32 pdn; struct mlx5_vhca_data_buffer *buf; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 5f694fce854c..d95646c2f010 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -109,7 +109,7 @@ err: static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) { mutex_lock(&migf->lock); - migf->disabled = true; + migf->state = MLX5_MIGF_STATE_ERROR; migf->filp->f_pos = 0; mutex_unlock(&migf->lock); } @@ -137,7 +137,8 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (!(filp->f_flags & O_NONBLOCK)) { if (wait_event_interruptible(migf->poll_wait, - READ_ONCE(vhca_buf->length) || migf->is_err)) + READ_ONCE(vhca_buf->length) || + migf->state == MLX5_MIGF_STATE_ERROR)) return -ERESTARTSYS; } @@ -150,7 +151,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, done = -EINVAL; goto out_unlock; } - if (migf->disabled || migf->is_err) { + if (migf->state == MLX5_MIGF_STATE_ERROR) { done = -ENODEV; goto out_unlock; } @@ -199,7 +200,7 @@ static __poll_t mlx5vf_save_poll(struct file *filp, poll_wait(filp, &migf->poll_wait, wait); mutex_lock(&migf->lock); - if (migf->disabled || migf->is_err) + if (migf->state == MLX5_MIGF_STATE_ERROR) pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; else if (READ_ONCE(migf->buf->length)) pollflags = EPOLLIN | EPOLLRDNORM; @@ -298,7 +299,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, return -ENOMEM; mutex_lock(&migf->lock); - if (migf->disabled) { + if (migf->state == MLX5_MIGF_STATE_ERROR) { done = -ENODEV; goto out_unlock; } -- cgit From c668878381b5702f867ec7f43ee3b74259c6ea03 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:31 +0200 Subject: vfio/mlx5: Refactor to use queue based data chunks Refactor to use queue based data chunks on the migration file. The SAVE command adds a chunk to the tail of the queue while the read() API finds the required chunk and returns its data. In case the queue is empty but the state of the migration file is MLX5_MIGF_STATE_COMPLETE, read() may not be blocked but will return 0 to indicate end of file. This is a step towards maintaining multiple images and their meta data (i.e. headers) on the migration file as part of next patches from the series. Note: At that point, we still use a single chunk on the migration file but becomes ready to support multiple. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-8-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 24 +++++-- drivers/vfio/pci/mlx5/cmd.h | 5 ++ drivers/vfio/pci/mlx5/main.c | 145 +++++++++++++++++++++++++++++++++---------- 3 files changed, 136 insertions(+), 38 deletions(-) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index fcba12326185..0e36b4c8c816 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -351,6 +351,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mutex_lock(&migf->lock); if (async_data->status) { + migf->buf = async_data->buf; migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } @@ -368,9 +369,15 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { - WRITE_ONCE(migf->buf->length, - MLX5_GET(save_vhca_state_out, async_data->out, - actual_image_size)); + unsigned long flags; + + async_data->buf->length = + MLX5_GET(save_vhca_state_out, async_data->out, + actual_image_size); + spin_lock_irqsave(&migf->list_lock, flags); + list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); + spin_unlock_irqrestore(&migf->list_lock, flags); + migf->state = MLX5_MIGF_STATE_COMPLETE; wake_up_interruptible(&migf->poll_wait); } @@ -407,6 +414,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); async_data = &migf->async_data; + async_data->buf = buf; async_data->out = kvzalloc(out_size, GFP_KERNEL); if (!async_data->out) { err = -ENOMEM; @@ -479,14 +487,22 @@ void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) { - lockdep_assert_held(&migf->mvdev->state_mutex); + struct mlx5_vhca_data_buffer *entry; + lockdep_assert_held(&migf->mvdev->state_mutex); WARN_ON(migf->mvdev->mdev_detach); if (migf->buf) { mlx5vf_free_data_buffer(migf->buf); migf->buf = NULL; } + + while ((entry = list_first_entry_or_null(&migf->buf_list, + struct mlx5_vhca_data_buffer, buf_elm))) { + list_del(&entry->buf_elm); + mlx5vf_free_data_buffer(entry); + } + mlx5vf_cmd_dealloc_pd(migf); } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 14403e654e4e..6e594689566e 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -14,6 +14,7 @@ enum mlx5_vf_migf_state { MLX5_MIGF_STATE_ERROR = 1, + MLX5_MIGF_STATE_COMPLETE, }; struct mlx5_vhca_data_buffer { @@ -24,6 +25,7 @@ struct mlx5_vhca_data_buffer { u32 mkey; enum dma_data_direction dma_dir; u8 dmaed:1; + struct list_head buf_elm; struct mlx5_vf_migration_file *migf; /* Optimize mlx5vf_get_migration_page() for sequential access */ struct scatterlist *last_offset_sg; @@ -34,6 +36,7 @@ struct mlx5_vhca_data_buffer { struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; + struct mlx5_vhca_data_buffer *buf; int status; void *out; }; @@ -45,6 +48,8 @@ struct mlx5_vf_migration_file { u32 pdn; struct mlx5_vhca_data_buffer *buf; + spinlock_t list_lock; + struct list_head buf_list; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; struct completion save_comp; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index d95646c2f010..ca16425811c4 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -124,11 +124,90 @@ static int mlx5vf_release_file(struct inode *inode, struct file *filp) return 0; } +static struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos, + bool *end_of_data) +{ + struct mlx5_vhca_data_buffer *buf; + bool found = false; + + *end_of_data = false; + spin_lock_irq(&migf->list_lock); + if (list_empty(&migf->buf_list)) { + *end_of_data = true; + goto end; + } + + buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer, + buf_elm); + if (pos >= buf->start_pos && + pos < buf->start_pos + buf->length) { + found = true; + goto end; + } + + /* + * As we use a stream based FD we may expect having the data always + * on first chunk + */ + migf->state = MLX5_MIGF_STATE_ERROR; + +end: + spin_unlock_irq(&migf->list_lock); + return found ? buf : NULL; +} + +static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, + char __user **buf, size_t *len, loff_t *pos) +{ + unsigned long offset; + ssize_t done = 0; + size_t copy_len; + + copy_len = min_t(size_t, + vhca_buf->start_pos + vhca_buf->length - *pos, *len); + while (copy_len) { + size_t page_offset; + struct page *page; + size_t page_len; + u8 *from_buff; + int ret; + + offset = *pos - vhca_buf->start_pos; + page_offset = offset % PAGE_SIZE; + offset -= page_offset; + page = mlx5vf_get_migration_page(vhca_buf, offset); + if (!page) + return -EINVAL; + page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset); + from_buff = kmap_local_page(page); + ret = copy_to_user(*buf, from_buff + page_offset, page_len); + kunmap_local(from_buff); + if (ret) + return -EFAULT; + *pos += page_len; + *len -= page_len; + *buf += page_len; + done += page_len; + copy_len -= page_len; + } + + if (*pos >= vhca_buf->start_pos + vhca_buf->length) { + spin_lock_irq(&vhca_buf->migf->list_lock); + list_del_init(&vhca_buf->buf_elm); + spin_unlock_irq(&vhca_buf->migf->list_lock); + } + + return done; +} + static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; - struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; + struct mlx5_vhca_data_buffer *vhca_buf; + bool first_loop_call = true; + bool end_of_data; ssize_t done = 0; if (pos) @@ -137,53 +216,47 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (!(filp->f_flags & O_NONBLOCK)) { if (wait_event_interruptible(migf->poll_wait, - READ_ONCE(vhca_buf->length) || - migf->state == MLX5_MIGF_STATE_ERROR)) + !list_empty(&migf->buf_list) || + migf->state == MLX5_MIGF_STATE_ERROR || + migf->state == MLX5_MIGF_STATE_COMPLETE)) return -ERESTARTSYS; } mutex_lock(&migf->lock); - if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(vhca_buf->length)) { - done = -EAGAIN; - goto out_unlock; - } - if (*pos > vhca_buf->length) { - done = -EINVAL; - goto out_unlock; - } if (migf->state == MLX5_MIGF_STATE_ERROR) { done = -ENODEV; goto out_unlock; } - len = min_t(size_t, vhca_buf->length - *pos, len); while (len) { - size_t page_offset; - struct page *page; - size_t page_len; - u8 *from_buff; - int ret; + ssize_t count; + + vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos, + &end_of_data); + if (first_loop_call) { + first_loop_call = false; + if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) { + if (filp->f_flags & O_NONBLOCK) { + done = -EAGAIN; + goto out_unlock; + } + } + } - page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); - if (!page) { - if (done == 0) - done = -EINVAL; + if (end_of_data) + goto out_unlock; + + if (!vhca_buf) { + done = -EINVAL; goto out_unlock; } - page_len = min_t(size_t, len, PAGE_SIZE - page_offset); - from_buff = kmap_local_page(page); - ret = copy_to_user(buf, from_buff + page_offset, page_len); - kunmap_local(from_buff); - if (ret) { - done = -EFAULT; + count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos); + if (count < 0) { + done = count; goto out_unlock; } - *pos += page_len; - len -= page_len; - done += page_len; - buf += page_len; + done += count; } out_unlock: @@ -202,7 +275,8 @@ static __poll_t mlx5vf_save_poll(struct file *filp, mutex_lock(&migf->lock); if (migf->state == MLX5_MIGF_STATE_ERROR) pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - else if (READ_ONCE(migf->buf->length)) + else if (!list_empty(&migf->buf_list) || + migf->state == MLX5_MIGF_STATE_COMPLETE) pollflags = EPOLLIN | EPOLLRDNORM; mutex_unlock(&migf->lock); @@ -253,6 +327,8 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) complete(&migf->save_comp); mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); + INIT_LIST_HEAD(&migf->buf_list); + spin_lock_init(&migf->list_lock); ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length); if (ret) goto out_pd; @@ -266,7 +342,6 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf); if (ret) goto out_save; - migf->buf = buf; return migf; out_save: mlx5vf_free_data_buffer(buf); @@ -386,6 +461,8 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) migf->buf = buf; stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); + INIT_LIST_HEAD(&migf->buf_list); + spin_lock_init(&migf->list_lock); return migf; out_pd: mlx5vf_cmd_dealloc_pd(migf); -- cgit From 3319d287f4c04b9deece8ea00e27a70bbe32941b Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:32 +0200 Subject: vfio/mlx5: Introduce device transitions of PRE_COPY In order to support PRE_COPY, mlx5 driver is transferring multiple states (images) of the device. e.g.: the source VF can save and transfer multiple states, and the target VF will load them by that order. The device is saving three kinds of states: 1) Initial state - when the device moves to PRE_COPY state. 2) Middle state - during PRE_COPY phase via VFIO_MIG_GET_PRECOPY_INFO. There can be multiple states of this type. 3) Final state - when the device moves to STOP_COPY state. After moving to PRE_COPY state, user is holding the saving migf FD and can use it. For example: user can start transferring data via read() callback. Also, user can switch from PRE_COPY to STOP_COPY whenever he sees it fits. This will invoke saving of final state. This means that mlx5 VFIO device can be switched to STOP_COPY without transferring any data in PRE_COPY state. Therefore, when the device moves to STOP_COPY, mlx5 will store the final state on a dedicated queue entry on the list. Co-developed-by: Shay Drory Signed-off-by: Shay Drory Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-9-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 96 ++++++++++++++++++++++++++++++++++++++++---- drivers/vfio/pci/mlx5/cmd.h | 16 +++++++- drivers/vfio/pci/mlx5/main.c | 90 +++++++++++++++++++++++++++++++++++++---- 3 files changed, 184 insertions(+), 18 deletions(-) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 0e36b4c8c816..5fcece201d4c 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -14,18 +14,36 @@ _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) { + struct mlx5_vf_migration_file *migf = mvdev->saving_migf; u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {}; u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {}; + int err; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; + /* + * In case PRE_COPY is used, saving_migf is exposed while the device is + * running. Make sure to run only once there is no active save command. + * Running both in parallel, might end-up with a failure in the save + * command once it will try to turn on 'tracking' on a suspended device. + */ + if (migf) { + err = wait_for_completion_interruptible(&migf->save_comp); + if (err) + return err; + } + MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA); MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(suspend_vhca_in, in, op_mod, op_mod); - return mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out); + err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out); + if (migf) + complete(&migf->save_comp); + + return err; } int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) @@ -45,7 +63,7 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) } int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, - size_t *state_size) + size_t *state_size, u8 query_flags) { u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {}; @@ -59,6 +77,8 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE); MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0); + MLX5_SET(query_vhca_migration_state_in, in, incremental, + query_flags & MLX5VF_QUERY_INC); ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in, out); @@ -342,6 +362,56 @@ end: return ERR_PTR(ret); } +void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + spin_lock_irq(&buf->migf->list_lock); + list_add_tail(&buf->buf_elm, &buf->migf->avail_list); + spin_unlock_irq(&buf->migf->list_lock); +} + +struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir) +{ + struct mlx5_vhca_data_buffer *buf, *temp_buf; + struct list_head free_list; + + lockdep_assert_held(&migf->mvdev->state_mutex); + if (migf->mvdev->mdev_detach) + return ERR_PTR(-ENOTCONN); + + INIT_LIST_HEAD(&free_list); + + spin_lock_irq(&migf->list_lock); + list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) { + if (buf->dma_dir == dma_dir) { + list_del_init(&buf->buf_elm); + if (buf->allocated_length >= length) { + spin_unlock_irq(&migf->list_lock); + goto found; + } + /* + * Prevent holding redundant buffers. Put in a free + * list and call at the end not under the spin lock + * (&migf->list_lock) to mlx5vf_free_data_buffer which + * might sleep. + */ + list_add(&buf->buf_elm, &free_list); + } + } + spin_unlock_irq(&migf->list_lock); + buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir); + +found: + while ((temp_buf = list_first_entry_or_null(&free_list, + struct mlx5_vhca_data_buffer, buf_elm))) { + list_del(&temp_buf->buf_elm); + mlx5vf_free_data_buffer(temp_buf); + } + + return buf; +} + void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) { struct mlx5vf_async_data *async_data = container_of(_work, @@ -351,7 +421,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mutex_lock(&migf->lock); if (async_data->status) { - migf->buf = async_data->buf; + mlx5vf_put_data_buffer(async_data->buf); migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } @@ -369,15 +439,19 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { + size_t image_size; unsigned long flags; - async_data->buf->length = - MLX5_GET(save_vhca_state_out, async_data->out, - actual_image_size); + image_size = MLX5_GET(save_vhca_state_out, async_data->out, + actual_image_size); + async_data->buf->length = image_size; + async_data->buf->start_pos = migf->max_pos; + migf->max_pos += async_data->buf->length; spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); spin_unlock_irqrestore(&migf->list_lock, flags); - migf->state = MLX5_MIGF_STATE_COMPLETE; + if (async_data->last_chunk) + migf->state = MLX5_MIGF_STATE_COMPLETE; wake_up_interruptible(&migf->poll_wait); } @@ -391,7 +465,8 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf, - struct mlx5_vhca_data_buffer *buf) + struct mlx5_vhca_data_buffer *buf, bool inc, + bool track) { u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; @@ -412,9 +487,12 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey); MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); + MLX5_SET(save_vhca_state_in, in, incremental, inc); + MLX5_SET(save_vhca_state_in, in, set_track, track); async_data = &migf->async_data; async_data->buf = buf; + async_data->last_chunk = !track; async_data->out = kvzalloc(out_size, GFP_KERNEL); if (!async_data->out) { err = -ENOMEM; @@ -497,6 +575,8 @@ void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) migf->buf = NULL; } + list_splice(&migf->avail_list, &migf->buf_list); + while ((entry = list_first_entry_or_null(&migf->buf_list, struct mlx5_vhca_data_buffer, buf_elm))) { list_del(&entry->buf_elm); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 6e594689566e..34e61c7aa23d 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -38,6 +38,7 @@ struct mlx5vf_async_data { struct work_struct work; struct mlx5_vhca_data_buffer *buf; int status; + u8 last_chunk:1; void *out; }; @@ -47,9 +48,11 @@ struct mlx5_vf_migration_file { enum mlx5_vf_migf_state state; u32 pdn; + loff_t max_pos; struct mlx5_vhca_data_buffer *buf; spinlock_t list_lock; struct list_head buf_list; + struct list_head avail_list; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; struct completion save_comp; @@ -129,10 +132,14 @@ struct mlx5vf_pci_core_device { struct mlx5_core_dev *mdev; }; +enum { + MLX5VF_QUERY_INC = (1UL << 0), +}; + int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, - size_t *state_size); + size_t *state_size, u8 query_flags); void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, const struct vfio_migration_ops *mig_ops, const struct vfio_log_ops *log_ops); @@ -140,7 +147,8 @@ void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf, - struct mlx5_vhca_data_buffer *buf); + struct mlx5_vhca_data_buffer *buf, bool inc, + bool track); int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf, struct mlx5_vhca_data_buffer *buf); @@ -151,6 +159,10 @@ struct mlx5_vhca_data_buffer * mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, size_t length, enum dma_data_direction dma_dir); void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf); +struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir); +void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf); int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, unsigned int npages); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index ca16425811c4..9cabba456044 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -195,6 +195,7 @@ static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, if (*pos >= vhca_buf->start_pos + vhca_buf->length) { spin_lock_irq(&vhca_buf->migf->list_lock); list_del_init(&vhca_buf->buf_elm); + list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); spin_unlock_irq(&vhca_buf->migf->list_lock); } @@ -283,6 +284,16 @@ static __poll_t mlx5vf_save_poll(struct file *filp, return pollflags; } +/* + * FD is exposed and user can use it after receiving an error. + * Mark migf in error, and wake the user. + */ +static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) +{ + migf->state = MLX5_MIGF_STATE_ERROR; + wake_up_interruptible(&migf->poll_wait); +} + static const struct file_operations mlx5vf_save_fops = { .owner = THIS_MODULE, .read = mlx5vf_save_read, @@ -291,8 +302,42 @@ static const struct file_operations mlx5vf_save_fops = { .llseek = no_llseek, }; +static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) +{ + struct mlx5_vf_migration_file *migf = mvdev->saving_migf; + struct mlx5_vhca_data_buffer *buf; + size_t length; + int ret; + + if (migf->state == MLX5_MIGF_STATE_ERROR) + return -ENODEV; + + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, + MLX5VF_QUERY_INC); + if (ret) + goto err; + + buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto err; + } + + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false); + if (ret) + goto err_save; + + return 0; + +err_save: + mlx5vf_put_data_buffer(buf); +err: + mlx5vf_mark_err(migf); + return ret; +} + static struct mlx5_vf_migration_file * -mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) +mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) { struct mlx5_vf_migration_file *migf; struct mlx5_vhca_data_buffer *buf; @@ -328,8 +373,9 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); INIT_LIST_HEAD(&migf->buf_list); + INIT_LIST_HEAD(&migf->avail_list); spin_lock_init(&migf->list_lock); - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0); if (ret) goto out_pd; @@ -339,7 +385,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) goto out_pd; } - ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf); + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track); if (ret) goto out_save; return migf; @@ -462,6 +508,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); INIT_LIST_HEAD(&migf->buf_list); + INIT_LIST_HEAD(&migf->avail_list); spin_lock_init(&migf->list_lock); return migf; out_pd: @@ -514,7 +561,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return NULL; } - if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) { + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { ret = mlx5vf_cmd_suspend_vhca(mvdev, MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR); if (ret) @@ -522,7 +570,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return NULL; } - if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) { + if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) { ret = mlx5vf_cmd_resume_vhca(mvdev, MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR); if (ret) @@ -533,7 +582,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { struct mlx5_vf_migration_file *migf; - migf = mlx5vf_pci_save_device_data(mvdev); + migf = mlx5vf_pci_save_device_data(mvdev, false); if (IS_ERR(migf)) return ERR_CAST(migf); get_file(migf->filp); @@ -541,7 +590,10 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return migf->filp; } - if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP)) { + if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && + new == VFIO_DEVICE_STATE_RUNNING_P2P)) { mlx5vf_disable_fds(mvdev); return NULL; } @@ -567,6 +619,28 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return NULL; } + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) || + (cur == VFIO_DEVICE_STATE_RUNNING_P2P && + new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { + struct mlx5_vf_migration_file *migf; + + migf = mlx5vf_pci_save_device_data(mvdev, true); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + mvdev->saving_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) { + ret = mlx5vf_cmd_suspend_vhca(mvdev, + MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER); + if (ret) + return ERR_PTR(ret); + ret = mlx5vf_pci_save_device_inc_data(mvdev); + return ret ? ERR_PTR(ret) : NULL; + } + /* * vfio_mig_get_next_state() does not use arcs other than the above */ @@ -635,7 +709,7 @@ static int mlx5vf_pci_get_data_size(struct vfio_device *vdev, mutex_lock(&mvdev->state_mutex); ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, - &state_size); + &state_size, 0); if (!ret) *stop_copy_length = state_size; mlx5vf_state_mutex_unlock(mvdev); -- cgit From 0c9a38fee8b210a8dfd3f177526daac567ec9265 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:33 +0200 Subject: vfio/mlx5: Introduce SW headers for migration states As mentioned in the previous patches, mlx5 is transferring multiple states when the PRE_COPY protocol is used. This states mechanism requires the target VM to know the states' size in order to execute multiple loads. Therefore, add SW header, with the needed information, for each saved state the source VM is transferring to the target VM. This patch implements the source VM handling of the headers, following patch will implement the target VM handling of the headers. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-10-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 56 +++++++++++++++++++++++++++++++++++++++++--- drivers/vfio/pci/mlx5/cmd.h | 13 ++++++++++ drivers/vfio/pci/mlx5/main.c | 2 +- 3 files changed, 67 insertions(+), 4 deletions(-) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 5fcece201d4c..160fa38fc78d 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -351,9 +351,11 @@ mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, if (ret) goto end; - ret = mlx5vf_dma_data_buffer(buf); - if (ret) - goto end; + if (dma_dir != DMA_NONE) { + ret = mlx5vf_dma_data_buffer(buf); + if (ret) + goto end; + } } return buf; @@ -422,6 +424,8 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mutex_lock(&migf->lock); if (async_data->status) { mlx5vf_put_data_buffer(async_data->buf); + if (async_data->header_buf) + mlx5vf_put_data_buffer(async_data->header_buf); migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } @@ -431,6 +435,32 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) fput(migf->filp); } +static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, + size_t image_size) +{ + struct mlx5_vf_migration_file *migf = header_buf->migf; + struct mlx5_vf_migration_header header = {}; + unsigned long flags; + struct page *page; + u8 *to_buff; + + header.image_size = cpu_to_le64(image_size); + page = mlx5vf_get_migration_page(header_buf, 0); + if (!page) + return -EINVAL; + to_buff = kmap_local_page(page); + memcpy(to_buff, &header, sizeof(header)); + kunmap_local(to_buff); + header_buf->length = sizeof(header); + header_buf->header_image_size = image_size; + header_buf->start_pos = header_buf->migf->max_pos; + migf->max_pos += header_buf->length; + spin_lock_irqsave(&migf->list_lock, flags); + list_add_tail(&header_buf->buf_elm, &migf->buf_list); + spin_unlock_irqrestore(&migf->list_lock, flags); + return 0; +} + static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) { struct mlx5vf_async_data *async_data = container_of(context, @@ -444,6 +474,11 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) image_size = MLX5_GET(save_vhca_state_out, async_data->out, actual_image_size); + if (async_data->header_buf) { + status = add_buf_header(async_data->header_buf, image_size); + if (status) + goto err; + } async_data->buf->length = image_size; async_data->buf->start_pos = migf->max_pos; migf->max_pos += async_data->buf->length; @@ -455,6 +490,7 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) wake_up_interruptible(&migf->poll_wait); } +err: /* * The error and the cleanup flows can't run from an * interrupt context @@ -470,6 +506,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, { u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; + struct mlx5_vhca_data_buffer *header_buf = NULL; struct mlx5vf_async_data *async_data; int err; @@ -499,6 +536,16 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, goto err_out; } + if (MLX5VF_PRE_COPY_SUPP(mvdev)) { + header_buf = mlx5vf_get_data_buffer(migf, + sizeof(struct mlx5_vf_migration_header), DMA_NONE); + if (IS_ERR(header_buf)) { + err = PTR_ERR(header_buf); + goto err_free; + } + } + + async_data->header_buf = header_buf; get_file(migf->filp); err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), async_data->out, @@ -510,7 +557,10 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, return 0; err_exec: + if (header_buf) + mlx5vf_put_data_buffer(header_buf); fput(migf->filp); +err_free: kvfree(async_data->out); err_out: complete(&migf->save_comp); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 34e61c7aa23d..3e36ccca820a 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -12,16 +12,26 @@ #include #include +#define MLX5VF_PRE_COPY_SUPP(mvdev) \ + ((mvdev)->core_device.vdev.migration_flags & VFIO_MIGRATION_PRE_COPY) + enum mlx5_vf_migf_state { MLX5_MIGF_STATE_ERROR = 1, MLX5_MIGF_STATE_COMPLETE, }; +struct mlx5_vf_migration_header { + __le64 image_size; + /* For future use in case we may need to change the kernel protocol */ + __le64 flags; +}; + struct mlx5_vhca_data_buffer { struct sg_append_table table; loff_t start_pos; u64 length; u64 allocated_length; + u64 header_image_size; u32 mkey; enum dma_data_direction dma_dir; u8 dmaed:1; @@ -37,6 +47,7 @@ struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; struct mlx5_vhca_data_buffer *buf; + struct mlx5_vhca_data_buffer *header_buf; int status; u8 last_chunk:1; void *out; @@ -165,6 +176,8 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf); int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, unsigned int npages); +struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, + unsigned long offset); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 9cabba456044..9a36e36ec33b 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -32,7 +32,7 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) core_device); } -static struct page * +struct page * mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, unsigned long offset) { -- cgit From 0dce165b1adf8d7f67030bb257e00107db8022de Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:34 +0200 Subject: vfio/mlx5: Introduce vfio precopy ioctl implementation vfio precopy ioctl returns an estimation of data available for transferring from the device. Whenever a user is using VFIO_MIG_GET_PRECOPY_INFO, track the current state of the device, and if needed, append the dirty data to the transfer FD data. This is done by saving a middle state. As mlx5 runs the SAVE command asynchronously, make sure to query for incremental data only once there is no active save command. Running both in parallel, might end-up with a failure in the incremental query command on un-tracked vhca. Also, a middle state will be saved only after the previous state has finished its SAVE command and has been fully transferred, this prevents endless use resources. Co-developed-by: Shay Drory Signed-off-by: Shay Drory Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-11-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 16 +++++++ drivers/vfio/pci/mlx5/main.c | 111 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 160fa38fc78d..12e74ecebe64 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -67,12 +67,25 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, { u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {}; + bool inc = query_flags & MLX5VF_QUERY_INC; int ret; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; + /* + * In case PRE_COPY is used, saving_migf is exposed while device is + * running. Make sure to run only once there is no active save command. + * Running both in parallel, might end-up with a failure in the + * incremental query command on un-tracked vhca. + */ + if (inc) { + ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp); + if (ret) + return ret; + } + MLX5_SET(query_vhca_migration_state_in, in, opcode, MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE); MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id); @@ -82,6 +95,9 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in, out); + if (inc) + complete(&mvdev->saving_migf->save_comp); + if (ret) return ret; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 9a36e36ec33b..2c8ac763057c 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -294,10 +294,121 @@ static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) wake_up_interruptible(&migf->poll_wait); } +static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct mlx5_vf_migration_file *migf = filp->private_data; + struct mlx5vf_pci_core_device *mvdev = migf->mvdev; + struct mlx5_vhca_data_buffer *buf; + struct vfio_precopy_info info = {}; + loff_t *pos = &filp->f_pos; + unsigned long minsz; + size_t inc_length = 0; + bool end_of_data; + int ret; + + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) + return -ENOTTY; + + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + mutex_lock(&mvdev->state_mutex); + if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY && + mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) { + ret = -EINVAL; + goto err_state_unlock; + } + + /* + * We can't issue a SAVE command when the device is suspended, so as + * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra + * bytes that can't be read. + */ + if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) { + /* + * Once the query returns it's guaranteed that there is no + * active SAVE command. + * As so, the other code below is safe with the proper locks. + */ + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length, + MLX5VF_QUERY_INC); + if (ret) + goto err_state_unlock; + } + + mutex_lock(&migf->lock); + if (migf->state == MLX5_MIGF_STATE_ERROR) { + ret = -ENODEV; + goto err_migf_unlock; + } + + buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data); + if (buf) { + if (buf->start_pos == 0) { + info.initial_bytes = buf->header_image_size - *pos; + } else if (buf->start_pos == + sizeof(struct mlx5_vf_migration_header)) { + /* First data buffer following the header */ + info.initial_bytes = buf->start_pos + + buf->length - *pos; + } else { + info.dirty_bytes = buf->start_pos + buf->length - *pos; + } + } else { + if (!end_of_data) { + ret = -EINVAL; + goto err_migf_unlock; + } + + info.dirty_bytes = inc_length; + } + + if (!end_of_data || !inc_length) { + mutex_unlock(&migf->lock); + goto done; + } + + mutex_unlock(&migf->lock); + /* + * We finished transferring the current state and the device has a + * dirty state, save a new state to be ready for. + */ + buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + mlx5vf_mark_err(migf); + goto err_state_unlock; + } + + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true); + if (ret) { + mlx5vf_mark_err(migf); + mlx5vf_put_data_buffer(buf); + goto err_state_unlock; + } + +done: + mlx5vf_state_mutex_unlock(mvdev); + return copy_to_user((void __user *)arg, &info, minsz); +err_migf_unlock: + mutex_unlock(&migf->lock); +err_state_unlock: + mlx5vf_state_mutex_unlock(mvdev); + return ret; +} + static const struct file_operations mlx5vf_save_fops = { .owner = THIS_MODULE, .read = mlx5vf_save_read, .poll = mlx5vf_save_poll, + .unlocked_ioctl = mlx5vf_precopy_ioctl, + .compat_ioctl = compat_ptr_ioctl, .release = mlx5vf_release_file, .llseek = no_llseek, }; -- cgit From 81156c27271c4a6c594e492c8d119fbacfc99f36 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:35 +0200 Subject: vfio/mlx5: Consider temporary end of stream as part of PRE_COPY During PRE_COPY the migration data FD may have a temporary "end of stream" that is reached when the initial_bytes were read and no other dirty data exists yet. For instance, this may indicate that the device is idle and not currently dirtying any internal state. When read() is done on this temporary end of stream the kernel driver should return ENOMSG from read(). Userspace can wait for more data or consider moving to STOP_COPY. To not block the user upon read() and let it get ENOMSG we add a new state named MLX5_MIGF_STATE_PRE_COPY on the migration file. In addition, we add the MLX5_MIGF_STATE_SAVE_LAST state to block the read() once we call the last SAVE upon moving to STOP_COPY. Any further error will be marked with MLX5_MIGF_STATE_ERROR and the user won't be blocked. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-12-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 7 +++++-- drivers/vfio/pci/mlx5/cmd.h | 2 ++ drivers/vfio/pci/mlx5/main.c | 7 +++++++ 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 12e74ecebe64..f6293da033cc 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -501,8 +501,8 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); spin_unlock_irqrestore(&migf->list_lock, flags); - if (async_data->last_chunk) - migf->state = MLX5_MIGF_STATE_COMPLETE; + migf->state = async_data->last_chunk ? + MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY; wake_up_interruptible(&migf->poll_wait); } @@ -561,6 +561,9 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, } } + if (async_data->last_chunk) + migf->state = MLX5_MIGF_STATE_SAVE_LAST; + async_data->header_buf = header_buf; get_file(migf->filp); err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 3e36ccca820a..d048f23977dd 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -17,6 +17,8 @@ enum mlx5_vf_migf_state { MLX5_MIGF_STATE_ERROR = 1, + MLX5_MIGF_STATE_PRE_COPY, + MLX5_MIGF_STATE_SAVE_LAST, MLX5_MIGF_STATE_COMPLETE, }; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 2c8ac763057c..44b1543c751c 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -219,6 +219,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (wait_event_interruptible(migf->poll_wait, !list_empty(&migf->buf_list) || migf->state == MLX5_MIGF_STATE_ERROR || + migf->state == MLX5_MIGF_STATE_PRE_COPY || migf->state == MLX5_MIGF_STATE_COMPLETE)) return -ERESTARTSYS; } @@ -236,6 +237,12 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, &end_of_data); if (first_loop_call) { first_loop_call = false; + /* Temporary end of file as part of PRE_COPY */ + if (end_of_data && migf->state == MLX5_MIGF_STATE_PRE_COPY) { + done = -ENOMSG; + goto out_unlock; + } + if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) { if (filp->f_flags & O_NONBLOCK) { done = -EAGAIN; -- cgit From 34e2f27143d1b373f088e805f7e11cdf778f791d Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:36 +0200 Subject: vfio/mlx5: Introduce multiple loads In order to support PRE_COPY, mlx5 driver transfers multiple states (images) of the device. e.g.: the source VF can save and transfer multiple states, and the target VF will load them by that order. This patch implements the changes for the target VF to decompose the header for each state and to write and load multiple states. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-13-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 13 +- drivers/vfio/pci/mlx5/cmd.h | 10 ++ drivers/vfio/pci/mlx5/main.c | 279 ++++++++++++++++++++++++++++++++++++------- 3 files changed, 257 insertions(+), 45 deletions(-) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index f6293da033cc..993749818d90 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -598,9 +598,11 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, if (mvdev->mdev_detach) return -ENOTCONN; - err = mlx5vf_dma_data_buffer(buf); - if (err) - return err; + if (!buf->dmaed) { + err = mlx5vf_dma_data_buffer(buf); + if (err) + return err; + } MLX5_SET(load_vhca_state_in, in, opcode, MLX5_CMD_OP_LOAD_VHCA_STATE); @@ -644,6 +646,11 @@ void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) migf->buf = NULL; } + if (migf->buf_header) { + mlx5vf_free_data_buffer(migf->buf_header); + migf->buf_header = NULL; + } + list_splice(&migf->avail_list, &migf->buf_list); while ((entry = list_first_entry_or_null(&migf->buf_list, diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index d048f23977dd..7729eac8c78c 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -22,6 +22,14 @@ enum mlx5_vf_migf_state { MLX5_MIGF_STATE_COMPLETE, }; +enum mlx5_vf_load_state { + MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER, + MLX5_VF_LOAD_STATE_READ_HEADER, + MLX5_VF_LOAD_STATE_PREP_IMAGE, + MLX5_VF_LOAD_STATE_READ_IMAGE, + MLX5_VF_LOAD_STATE_LOAD_IMAGE, +}; + struct mlx5_vf_migration_header { __le64 image_size; /* For future use in case we may need to change the kernel protocol */ @@ -60,9 +68,11 @@ struct mlx5_vf_migration_file { struct mutex lock; enum mlx5_vf_migf_state state; + enum mlx5_vf_load_state load_state; u32 pdn; loff_t max_pos; struct mlx5_vhca_data_buffer *buf; + struct mlx5_vhca_data_buffer *buf_header; spinlock_t list_lock; struct list_head buf_list; struct list_head avail_list; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 44b1543c751c..5a669b73994a 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -518,13 +518,162 @@ end: return ERR_PTR(ret); } +static int +mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf, + const char __user **buf, size_t *len, + loff_t *pos, ssize_t *done) +{ + unsigned long offset; + size_t page_offset; + struct page *page; + size_t page_len; + u8 *to_buff; + int ret; + + offset = *pos - vhca_buf->start_pos; + page_offset = offset % PAGE_SIZE; + + page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset); + if (!page) + return -EINVAL; + page_len = min_t(size_t, *len, PAGE_SIZE - page_offset); + to_buff = kmap_local_page(page); + ret = copy_from_user(to_buff + page_offset, *buf, page_len); + kunmap_local(to_buff); + if (ret) + return -EFAULT; + + *pos += page_len; + *done += page_len; + *buf += page_len; + *len -= page_len; + vhca_buf->length += page_len; + return 0; +} + +static int +mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf, + loff_t requested_length, + const char __user **buf, size_t *len, + loff_t *pos, ssize_t *done) +{ + int ret; + + if (requested_length > MAX_MIGRATION_SIZE) + return -ENOMEM; + + if (vhca_buf->allocated_length < requested_length) { + ret = mlx5vf_add_migration_pages( + vhca_buf, + DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, + PAGE_SIZE)); + if (ret) + return ret; + } + + while (*len) { + ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos, + done); + if (ret) + return ret; + } + + return 0; +} + +static ssize_t +mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *vhca_buf, + size_t image_size, const char __user **buf, + size_t *len, loff_t *pos, ssize_t *done, + bool *has_work) +{ + size_t copy_len, to_copy; + int ret; + + to_copy = min_t(size_t, *len, image_size - vhca_buf->length); + copy_len = to_copy; + while (to_copy) { + ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos, + done); + if (ret) + return ret; + } + + *len -= copy_len; + if (vhca_buf->length == image_size) { + migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE; + migf->max_pos += image_size; + *has_work = true; + } + + return 0; +} + +static int +mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *vhca_buf, + const char __user **buf, + size_t *len, loff_t *pos, + ssize_t *done, bool *has_work) +{ + struct page *page; + size_t copy_len; + u8 *to_buff; + int ret; + + copy_len = min_t(size_t, *len, + sizeof(struct mlx5_vf_migration_header) - vhca_buf->length); + page = mlx5vf_get_migration_page(vhca_buf, 0); + if (!page) + return -EINVAL; + to_buff = kmap_local_page(page); + ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len); + if (ret) { + ret = -EFAULT; + goto end; + } + + *buf += copy_len; + *pos += copy_len; + *done += copy_len; + *len -= copy_len; + vhca_buf->length += copy_len; + if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) { + u64 flags; + + vhca_buf->header_image_size = le64_to_cpup((__le64 *)to_buff); + if (vhca_buf->header_image_size > MAX_MIGRATION_SIZE) { + ret = -ENOMEM; + goto end; + } + + flags = le64_to_cpup((__le64 *)(to_buff + + offsetof(struct mlx5_vf_migration_header, flags))); + if (flags) { + ret = -EOPNOTSUPP; + goto end; + } + + migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE; + migf->max_pos += vhca_buf->length; + *has_work = true; + } +end: + kunmap_local(to_buff); + return ret; +} + static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; + struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header; loff_t requested_length; + bool has_work = false; ssize_t done = 0; + int ret = 0; if (pos) return -ESPIPE; @@ -534,56 +683,83 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, check_add_overflow((loff_t)len, *pos, &requested_length)) return -EINVAL; - if (requested_length > MAX_MIGRATION_SIZE) - return -ENOMEM; - + mutex_lock(&migf->mvdev->state_mutex); mutex_lock(&migf->lock); if (migf->state == MLX5_MIGF_STATE_ERROR) { - done = -ENODEV; + ret = -ENODEV; goto out_unlock; } - if (vhca_buf->allocated_length < requested_length) { - done = mlx5vf_add_migration_pages( - vhca_buf, - DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, - PAGE_SIZE)); - if (done) - goto out_unlock; - } + while (len || has_work) { + has_work = false; + switch (migf->load_state) { + case MLX5_VF_LOAD_STATE_READ_HEADER: + ret = mlx5vf_resume_read_header(migf, vhca_buf_header, + &buf, &len, pos, + &done, &has_work); + if (ret) + goto out_unlock; + break; + case MLX5_VF_LOAD_STATE_PREP_IMAGE: + { + u64 size = vhca_buf_header->header_image_size; + + if (vhca_buf->allocated_length < size) { + mlx5vf_free_data_buffer(vhca_buf); + + migf->buf = mlx5vf_alloc_data_buffer(migf, + size, DMA_TO_DEVICE); + if (IS_ERR(migf->buf)) { + ret = PTR_ERR(migf->buf); + migf->buf = NULL; + goto out_unlock; + } - while (len) { - size_t page_offset; - struct page *page; - size_t page_len; - u8 *to_buff; - int ret; + vhca_buf = migf->buf; + } - page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); - if (!page) { - if (done == 0) - done = -EINVAL; - goto out_unlock; + vhca_buf->start_pos = migf->max_pos; + migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE; + break; } + case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER: + ret = mlx5vf_resume_read_image_no_header(vhca_buf, + requested_length, + &buf, &len, pos, &done); + if (ret) + goto out_unlock; + break; + case MLX5_VF_LOAD_STATE_READ_IMAGE: + ret = mlx5vf_resume_read_image(migf, vhca_buf, + vhca_buf_header->header_image_size, + &buf, &len, pos, &done, &has_work); + if (ret) + goto out_unlock; + break; + case MLX5_VF_LOAD_STATE_LOAD_IMAGE: + ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf); + if (ret) + goto out_unlock; + migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; - page_len = min_t(size_t, len, PAGE_SIZE - page_offset); - to_buff = kmap_local_page(page); - ret = copy_from_user(to_buff + page_offset, buf, page_len); - kunmap_local(to_buff); - if (ret) { - done = -EFAULT; - goto out_unlock; + /* prep header buf for next image */ + vhca_buf_header->length = 0; + vhca_buf_header->header_image_size = 0; + /* prep data buf for next image */ + vhca_buf->length = 0; + + break; + default: + break; } - *pos += page_len; - len -= page_len; - done += page_len; - buf += page_len; - vhca_buf->length += page_len; } + out_unlock: + if (ret) + migf->state = MLX5_MIGF_STATE_ERROR; mutex_unlock(&migf->lock); - return done; + mlx5vf_state_mutex_unlock(migf->mvdev); + return ret ? ret : done; } static const struct file_operations mlx5vf_resume_fops = { @@ -623,12 +799,29 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) } migf->buf = buf; + if (MLX5VF_PRE_COPY_SUPP(mvdev)) { + buf = mlx5vf_alloc_data_buffer(migf, + sizeof(struct mlx5_vf_migration_header), DMA_NONE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_buf; + } + + migf->buf_header = buf; + migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; + } else { + /* Initial state will be to read the image */ + migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER; + } + stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); INIT_LIST_HEAD(&migf->buf_list); INIT_LIST_HEAD(&migf->avail_list); spin_lock_init(&migf->list_lock); return migf; +out_buf: + mlx5vf_free_data_buffer(buf); out_pd: mlx5vf_cmd_dealloc_pd(migf); out_free: @@ -728,11 +921,13 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, } if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { - ret = mlx5vf_cmd_load_vhca_state(mvdev, - mvdev->resuming_migf, - mvdev->resuming_migf->buf); - if (ret) - return ERR_PTR(ret); + if (!MLX5VF_PRE_COPY_SUPP(mvdev)) { + ret = mlx5vf_cmd_load_vhca_state(mvdev, + mvdev->resuming_migf, + mvdev->resuming_migf->buf); + if (ret) + return ERR_PTR(ret); + } mlx5vf_disable_fds(mvdev); return NULL; } -- cgit From d6e18a4bec431c181a60d32876c6c89955b2a4f8 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Dec 2022 10:34:37 +0200 Subject: vfio/mlx5: Fallback to STOP_COPY upon specific PRE_COPY error Before a SAVE command is issued, a QUERY command is issued in order to know the device data size. In case PRE_COPY is used, the above commands are issued while the device is running. Thus, it is possible that between the QUERY and the SAVE commands the state of the device will be changed significantly and thus the SAVE will fail. Currently, if a SAVE command is failing, the driver will fail the migration. In the above case, don't fail the migration, but don't allow for new SAVEs to be executed while the device is in a RUNNING state. Once the device will be moved to STOP_COPY, SAVE can be executed again and the full device state will be read. Signed-off-by: Shay Drory Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-14-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 27 ++++++++++++++++++++++++++- drivers/vfio/pci/mlx5/cmd.h | 2 ++ drivers/vfio/pci/mlx5/main.c | 6 ++++-- 3 files changed, 32 insertions(+), 3 deletions(-) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 993749818d90..01ef695e9441 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -84,6 +84,19 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp); if (ret) return ret; + if (mvdev->saving_migf->state == + MLX5_MIGF_STATE_PRE_COPY_ERROR) { + /* + * In case we had a PRE_COPY error, only query full + * image for final image + */ + if (!(query_flags & MLX5VF_QUERY_FINAL)) { + *state_size = 0; + complete(&mvdev->saving_migf->save_comp); + return 0; + } + query_flags &= ~MLX5VF_QUERY_INC; + } } MLX5_SET(query_vhca_migration_state_in, in, opcode, @@ -442,7 +455,10 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mlx5vf_put_data_buffer(async_data->buf); if (async_data->header_buf) mlx5vf_put_data_buffer(async_data->header_buf); - migf->state = MLX5_MIGF_STATE_ERROR; + if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR) + migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR; + else + migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } mutex_unlock(&migf->lock); @@ -511,6 +527,8 @@ err: * The error and the cleanup flows can't run from an * interrupt context */ + if (status == -EREMOTEIO) + status = MLX5_GET(save_vhca_state_out, async_data->out, status); async_data->status = status; queue_work(migf->mvdev->cb_wq, &async_data->work); } @@ -534,6 +552,13 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, if (err) return err; + if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) + /* + * In case we had a PRE_COPY error, SAVE is triggered only for + * the final image, read device full image. + */ + inc = false; + MLX5_SET(save_vhca_state_in, in, opcode, MLX5_CMD_OP_SAVE_VHCA_STATE); MLX5_SET(save_vhca_state_in, in, op_mod, 0); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 7729eac8c78c..5483171d57ad 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -17,6 +17,7 @@ enum mlx5_vf_migf_state { MLX5_MIGF_STATE_ERROR = 1, + MLX5_MIGF_STATE_PRE_COPY_ERROR, MLX5_MIGF_STATE_PRE_COPY, MLX5_MIGF_STATE_SAVE_LAST, MLX5_MIGF_STATE_COMPLETE, @@ -157,6 +158,7 @@ struct mlx5vf_pci_core_device { enum { MLX5VF_QUERY_INC = (1UL << 0), + MLX5VF_QUERY_FINAL = (1UL << 1), }; int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 5a669b73994a..cd90eb86128c 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -219,6 +219,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (wait_event_interruptible(migf->poll_wait, !list_empty(&migf->buf_list) || migf->state == MLX5_MIGF_STATE_ERROR || + migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR || migf->state == MLX5_MIGF_STATE_PRE_COPY || migf->state == MLX5_MIGF_STATE_COMPLETE)) return -ERESTARTSYS; @@ -238,7 +239,8 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (first_loop_call) { first_loop_call = false; /* Temporary end of file as part of PRE_COPY */ - if (end_of_data && migf->state == MLX5_MIGF_STATE_PRE_COPY) { + if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY || + migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) { done = -ENOMSG; goto out_unlock; } @@ -431,7 +433,7 @@ static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) return -ENODEV; ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, - MLX5VF_QUERY_INC); + MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL); if (ret) goto err; -- cgit From ccc2a52e464d1c983efede1a6d44728c151cb2ed Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Dec 2022 10:34:38 +0200 Subject: vfio/mlx5: Enable MIGRATION_PRE_COPY flag Now that everything has been set up for MIGRATION_PRE_COPY, enable it. Signed-off-by: Shay Drory Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-15-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 01ef695e9441..64e68d13cb98 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -222,6 +222,11 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization)) mvdev->core_device.vdev.log_ops = log_ops; + if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) && + MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state)) + mvdev->core_device.vdev.migration_flags |= + VFIO_MIGRATION_PRE_COPY; + end: mlx5_vf_put_core_dev(mvdev->mdev); } -- cgit From 64ffbbb1e948876dd9044c32a3ab8a790662b9bb Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 23 Nov 2022 11:32:33 +0000 Subject: hisi_acc_vfio_pci: Add support for precopy IOCTL PRECOPY IOCTL in the case of HiSiIicon ACC driver can be used to perform the device compatibility check earlier during migration. Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20221123113236.896-2-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 52 ++++++++++++++++++++++++++ drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 1 + 2 files changed, 53 insertions(+) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 0c0c0c7f0521..f3b74a06edb6 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -764,9 +764,58 @@ hisi_acc_vf_pci_resume(struct hisi_acc_vf_core_device *hisi_acc_vdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); + migf->hisi_acc_vdev = hisi_acc_vdev; return migf; } +static long hisi_acc_vf_precopy_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct hisi_acc_vf_migration_file *migf = filp->private_data; + struct hisi_acc_vf_core_device *hisi_acc_vdev = migf->hisi_acc_vdev; + loff_t *pos = &filp->f_pos; + struct vfio_precopy_info info; + unsigned long minsz; + int ret; + + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) + return -ENOTTY; + + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + if (info.argsz < minsz) + return -EINVAL; + + mutex_lock(&hisi_acc_vdev->state_mutex); + if (hisi_acc_vdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY) { + mutex_unlock(&hisi_acc_vdev->state_mutex); + return -EINVAL; + } + + mutex_lock(&migf->lock); + + if (migf->disabled) { + ret = -ENODEV; + goto out; + } + + if (*pos > migf->total_length) { + ret = -EINVAL; + goto out; + } + + info.dirty_bytes = 0; + info.initial_bytes = migf->total_length - *pos; + + ret = copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; +out: + mutex_unlock(&migf->lock); + mutex_unlock(&hisi_acc_vdev->state_mutex); + return ret; +} + static ssize_t hisi_acc_vf_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) { @@ -807,6 +856,8 @@ out_unlock: static const struct file_operations hisi_acc_vf_save_fops = { .owner = THIS_MODULE, .read = hisi_acc_vf_save_read, + .unlocked_ioctl = hisi_acc_vf_precopy_ioctl, + .compat_ioctl = compat_ptr_ioctl, .release = hisi_acc_vf_release_file, .llseek = no_llseek, }; @@ -832,6 +883,7 @@ hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); + migf->hisi_acc_vdev = hisi_acc_vdev; ret = vf_qm_state_save(hisi_acc_vdev, migf); if (ret) { diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 67343325b320..11d51345f5b5 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -91,6 +91,7 @@ struct hisi_acc_vf_migration_file { struct mutex lock; bool disabled; + struct hisi_acc_vf_core_device *hisi_acc_vdev; struct acc_vf_data vf_data; size_t total_length; }; -- cgit From d9a871e4a143047d1d84a606772af319f11516f9 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 23 Nov 2022 11:32:34 +0000 Subject: hisi_acc_vfio_pci: Introduce support for PRE_COPY state transitions The saving_migf is open in PRE_COPY state if it is supported and reads initial device match data. hisi_acc_vf_stop_copy() is refactored to make use of common code. Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20221123113236.896-3-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 74 ++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index f3b74a06edb6..c8658636a84c 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -863,7 +863,7 @@ static const struct file_operations hisi_acc_vf_save_fops = { }; static struct hisi_acc_vf_migration_file * -hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) +hisi_acc_open_saving_migf(struct hisi_acc_vf_core_device *hisi_acc_vdev) { struct hisi_acc_vf_migration_file *migf; int ret; @@ -885,7 +885,7 @@ hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) mutex_init(&migf->lock); migf->hisi_acc_vdev = hisi_acc_vdev; - ret = vf_qm_state_save(hisi_acc_vdev, migf); + ret = vf_qm_get_match_data(hisi_acc_vdev, &migf->vf_data); if (ret) { fput(migf->filp); return ERR_PTR(ret); @@ -894,6 +894,44 @@ hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) return migf; } +static struct hisi_acc_vf_migration_file * +hisi_acc_vf_pre_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) +{ + struct hisi_acc_vf_migration_file *migf; + + migf = hisi_acc_open_saving_migf(hisi_acc_vdev); + if (IS_ERR(migf)) + return migf; + + migf->total_length = QM_MATCH_SIZE; + return migf; +} + +static struct hisi_acc_vf_migration_file * +hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev, bool open) +{ + int ret; + struct hisi_acc_vf_migration_file *migf = NULL; + + if (open) { + /* + * Userspace didn't use PRECOPY support. Hence saving_migf + * is not opened yet. + */ + migf = hisi_acc_open_saving_migf(hisi_acc_vdev); + if (IS_ERR(migf)) + return migf; + } else { + migf = hisi_acc_vdev->saving_migf; + } + + ret = vf_qm_state_save(hisi_acc_vdev, migf); + if (ret) + return ERR_PTR(ret); + + return open ? migf : NULL; +} + static int hisi_acc_vf_stop_device(struct hisi_acc_vf_core_device *hisi_acc_vdev) { struct device *dev = &hisi_acc_vdev->vf_dev->dev; @@ -921,6 +959,31 @@ hisi_acc_vf_set_device_state(struct hisi_acc_vf_core_device *hisi_acc_vdev, u32 cur = hisi_acc_vdev->mig_state; int ret; + if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) { + struct hisi_acc_vf_migration_file *migf; + + migf = hisi_acc_vf_pre_copy(hisi_acc_vdev); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + hisi_acc_vdev->saving_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_STOP_COPY) { + struct hisi_acc_vf_migration_file *migf; + + ret = hisi_acc_vf_stop_device(hisi_acc_vdev); + if (ret) + return ERR_PTR(ret); + + migf = hisi_acc_vf_stop_copy(hisi_acc_vdev, false); + if (IS_ERR(migf)) + return ERR_CAST(migf); + + return NULL; + } + if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_STOP) { ret = hisi_acc_vf_stop_device(hisi_acc_vdev); if (ret) @@ -931,7 +994,7 @@ hisi_acc_vf_set_device_state(struct hisi_acc_vf_core_device *hisi_acc_vdev, if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { struct hisi_acc_vf_migration_file *migf; - migf = hisi_acc_vf_stop_copy(hisi_acc_vdev); + migf = hisi_acc_vf_stop_copy(hisi_acc_vdev, true); if (IS_ERR(migf)) return ERR_CAST(migf); get_file(migf->filp); @@ -963,6 +1026,11 @@ hisi_acc_vf_set_device_state(struct hisi_acc_vf_core_device *hisi_acc_vdev, return NULL; } + if (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) { + hisi_acc_vf_disable_fds(hisi_acc_vdev); + return NULL; + } + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING) { hisi_acc_vf_start_device(hisi_acc_vdev); return NULL; -- cgit From 190125adcad4c5850fd74ecd697e20a446b74ed8 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 23 Nov 2022 11:32:35 +0000 Subject: hisi_acc_vfio_pci: Move the dev compatibility tests for early check Instead of waiting till data transfer is complete to perform dev compatibility, do it as soon as we have enough data to perform the check. This will be useful when we enable the support for PRE_COPY. Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20221123113236.896-4-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 19 +++++++------------ drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 1 + 2 files changed, 8 insertions(+), 12 deletions(-) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index c8658636a84c..9a51f41e1d2a 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -360,8 +360,8 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, u32 que_iso_state; int ret; - if (migf->total_length < QM_MATCH_SIZE) - return -EINVAL; + if (migf->total_length < QM_MATCH_SIZE || hisi_acc_vdev->match_done) + return 0; if (vf_data->acc_magic != ACC_DEV_MAGIC) { dev_err(dev, "failed to match ACC_DEV_MAGIC\n"); @@ -406,6 +406,7 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, } hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state; + hisi_acc_vdev->match_done = true; return 0; } @@ -493,10 +494,6 @@ static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev, struct device *dev = &vf_qm->pdev->dev; int ret; - ret = vf_qm_get_match_data(hisi_acc_vdev, vf_data); - if (ret) - return ret; - if (unlikely(qm_wait_dev_not_ready(vf_qm))) { /* Update state and return with match data */ vf_data->vf_qm_state = QM_NOT_READY; @@ -673,12 +670,6 @@ static int hisi_acc_vf_load_state(struct hisi_acc_vf_core_device *hisi_acc_vdev) struct hisi_acc_vf_migration_file *migf = hisi_acc_vdev->resuming_migf; int ret; - /* Check dev compatibility */ - ret = vf_qm_check_match(hisi_acc_vdev, migf); - if (ret) { - dev_err(dev, "failed to match the VF!\n"); - return ret; - } /* Recover data to VF */ ret = vf_qm_load_data(hisi_acc_vdev, migf); if (ret) { @@ -732,6 +723,10 @@ static ssize_t hisi_acc_vf_resume_write(struct file *filp, const char __user *bu *pos += len; done = len; migf->total_length += len; + + ret = vf_qm_check_match(migf->hisi_acc_vdev, migf); + if (ret) + done = -EFAULT; out_unlock: mutex_unlock(&migf->lock); return done; diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 11d51345f5b5..dcabfeec6ca1 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -98,6 +98,7 @@ struct hisi_acc_vf_migration_file { struct hisi_acc_vf_core_device { struct vfio_pci_core_device core_device; + u8 match_done:1; u8 deferred_reset:1; /* For migration state */ struct mutex state_mutex; -- cgit From f2240b4441cc5bd87820371ee79ad7c9125e76b6 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 23 Nov 2022 11:32:36 +0000 Subject: hisi_acc_vfio_pci: Enable PRE_COPY flag Now that we have everything to support the PRE_COPY state, enable it. Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20221123113236.896-5-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 9a51f41e1d2a..51941bb4f31f 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1351,7 +1351,7 @@ static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev) hisi_acc_vdev->vf_dev = pdev; mutex_init(&hisi_acc_vdev->state_mutex); - core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY; + core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY; core_vdev->mig_ops = &hisi_acc_vfio_pci_migrn_state_ops; return vfio_pci_core_init_dev(core_vdev); -- cgit From fe3dd71db2b81c202bc80532bbe0e07238a45ed9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 8 Dec 2022 19:01:26 +0300 Subject: vfio/mlx5: fix error code in mlx5vf_precopy_ioctl() The copy_to_user() function returns the number of bytes remaining to be copied but we want to return a negative error code here. Fixes: 0dce165b1adf ("vfio/mlx5: Introduce vfio precopy ioctl implementation") Signed-off-by: Dan Carpenter Reviewed-by: Yishai Hadas Link: https://lore.kernel.org/r/Y5IKVknlf5Z5NPtU@kili Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index cd90eb86128c..94f7a0fd10e8 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -404,7 +404,10 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, done: mlx5vf_state_mutex_unlock(mvdev); - return copy_to_user((void __user *)arg, &info, minsz); + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + return 0; + err_migf_unlock: mutex_unlock(&migf->lock); err_state_unlock: -- cgit From 70be6f322860d322ebcd120cf0c05402ead5c6de Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 8 Dec 2022 19:02:17 +0300 Subject: vfio/mlx5: error pointer dereference in error handling This code frees the wrong "buf" variable and results in an error pointer dereference. Fixes: 34e2f27143d1 ("vfio/mlx5: Introduce multiple loads") Signed-off-by: Dan Carpenter Reviewed-by: Yishai Hadas Link: https://lore.kernel.org/r/Y5IKia5SaiVxYmG5@kili Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 94f7a0fd10e8..031ac8cc215d 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -826,7 +826,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) spin_lock_init(&migf->list_lock); return migf; out_buf: - mlx5vf_free_data_buffer(buf); + mlx5vf_free_data_buffer(migf->buf); out_pd: mlx5vf_cmd_dealloc_pd(migf); out_free: -- cgit