diff options
author | Jason Gunthorpe <jgg@mellanox.com> | 2019-10-09 13:09:34 -0300 |
---|---|---|
committer | Jason Gunthorpe <jgg@mellanox.com> | 2019-10-28 16:41:14 -0300 |
commit | 09689703d29a3b75c510c198c3aca85d7d8b50c7 (patch) | |
tree | 378ec2883f70d270bbc1dc2068ede78979163c39 /drivers/infiniband/hw/mlx5/mr.c | |
parent | d561987f34f263dd176ccd8fb782cb153d72f441 (diff) |
RDMA/mlx5: Do not race with mlx5_ib_invalidate_range during create and destroy
For creation, as soon as the umem_odp is created the notifier can be
called, however the underlying MR may not have been setup yet. This would
cause problems if mlx5_ib_invalidate_range() runs. There is some
confusing/ulocked/racy code that might by trying to solve this, but
without locks it isn't going to work right.
Instead trivially solve the problem by short-circuiting the invalidation
if there are not yet any DMA mapped pages. By definition there is nothing
to invalidate in this case.
The create code will have the umem fully setup before anything is DMA
mapped, and npages is fully locked by the umem_mutex.
For destroy, invalidate the entire MR at the HW to stop DMA then DMA unmap
the pages before destroying the MR. This drives npages to zero and
prevents similar racing with invalidate while the MR is undergoing
destruction.
Arguably it would be better if the umem was created after the MR and
destroyed before, but that would require a big rework of the MR code.
Fixes: 6aec21f6a832 ("IB/mlx5: Page faults handling infrastructure")
Link: https://lore.kernel.org/r/20191009160934.3143-15-jgg@ziepe.ca
Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Diffstat (limited to 'drivers/infiniband/hw/mlx5/mr.c')
-rw-r--r-- | drivers/infiniband/hw/mlx5/mr.c | 74 |
1 files changed, 24 insertions, 50 deletions
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 1e91f61efa8a..199f7959aaa5 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -50,7 +50,6 @@ enum { static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static int mr_cache_max_order(struct mlx5_ib_dev *dev); -static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) { @@ -495,7 +494,7 @@ void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) c = order2idx(dev, mr->order); WARN_ON(c < 0 || c >= MAX_MR_CACHE_ENTRIES); - if (unreg_umr(dev, mr)) { + if (mlx5_mr_cache_invalidate(mr)) { mr->allocated_from_cache = false; destroy_mkey(dev, mr); ent = &cache->ent[c]; @@ -1333,22 +1332,29 @@ error: return ERR_PTR(err); } -static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +/** + * mlx5_mr_cache_invalidate - Fence all DMA on the MR + * @mr: The MR to fence + * + * Upon return the NIC will not be doing any DMA to the pages under the MR, + * and any DMA inprogress will be completed. Failure of this function + * indicates the HW has failed catastrophically. + */ +int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr) { - struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_umr_wr umrwr = {}; - if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + if (mr->dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) return 0; umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR | MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; umrwr.wr.opcode = MLX5_IB_WR_UMR; - umrwr.pd = dev->umrc.pd; + umrwr.pd = mr->dev->umrc.pd; umrwr.mkey = mr->mmkey.key; umrwr.ignore_free_state = 1; - return mlx5_ib_post_send_wait(dev, &umrwr); + return mlx5_ib_post_send_wait(mr->dev, &umrwr); } static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, @@ -1432,7 +1438,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, * UMR can't be used - MKey needs to be replaced. */ if (mr->allocated_from_cache) - err = unreg_umr(dev, mr); + err = mlx5_mr_cache_invalidate(mr); else err = destroy_mkey(dev, mr); if (err) @@ -1561,52 +1567,20 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) int npages = mr->npages; struct ib_umem *umem = mr->umem; - if (is_odp_mr(mr)) { - struct ib_umem_odp *umem_odp = to_ib_umem_odp(umem); - - /* Prevent new page faults and - * prefetch requests from succeeding - */ - xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); - - /* Wait for all running page-fault handlers to finish. */ - synchronize_srcu(&dev->odp_srcu); - - /* dequeue pending prefetch requests for the mr */ - if (atomic_read(&mr->num_deferred_work)) { - flush_workqueue(system_unbound_wq); - WARN_ON(atomic_read(&mr->num_deferred_work)); - } - - /* Destroy all page mappings */ - mlx5_ib_invalidate_range(umem_odp, ib_umem_start(umem_odp), - ib_umem_end(umem_odp)); - - /* - * We kill the umem before the MR for ODP, - * so that there will not be any invalidations in - * flight, looking at the *mr struct. - */ - ib_umem_odp_release(umem_odp); - atomic_sub(npages, &dev->mdev->priv.reg_pages); - - /* Avoid double-freeing the umem. */ - umem = NULL; - } + /* Stop all DMA */ + if (is_odp_mr(mr)) + mlx5_ib_fence_odp_mr(mr); + else + clean_mr(dev, mr); - clean_mr(dev, mr); + if (mr->allocated_from_cache) + mlx5_mr_cache_free(dev, mr); + else + kfree(mr); - /* - * We should unregister the DMA address from the HCA before - * remove the DMA mapping. - */ - mlx5_mr_cache_free(dev, mr); ib_umem_release(umem); - if (umem) - atomic_sub(npages, &dev->mdev->priv.reg_pages); + atomic_sub(npages, &dev->mdev->priv.reg_pages); - if (!mr->allocated_from_cache) - kfree(mr); } int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) |