Diffstat (limited to 'drivers/infiniband/hw/mlx5/umr.c')
 drivers/infiniband/hw/mlx5/umr.c | 307
 1 file changed, 283 insertions(+), 24 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c
index 5be4426a2884..7ef35cddce81 100644
--- a/drivers/infiniband/hw/mlx5/umr.c
+++ b/drivers/infiniband/hw/mlx5/umr.c
@@ -32,13 +32,15 @@ static __be64 get_umr_disable_mr_mask(void)
return cpu_to_be64(result);
}
-static __be64 get_umr_update_translation_mask(void)
+static __be64 get_umr_update_translation_mask(struct mlx5_ib_dev *dev)
{
u64 result;
result = MLX5_MKEY_MASK_LEN |
MLX5_MKEY_MASK_PAGE_SIZE |
MLX5_MKEY_MASK_START_ADDR;
+ if (MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5))
+ result |= MLX5_MKEY_MASK_PAGE_SIZE_5;
return cpu_to_be64(result);
}
@@ -654,9 +656,12 @@ static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev,
flags & MLX5_IB_UPD_XLT_ENABLE || flags & MLX5_IB_UPD_XLT_ADDR;
if (update_translation) {
- wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask();
+ wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask(dev);
if (!mr->ibmr.length)
MLX5_SET(mkc, &wqe->mkey_seg, length64, 1);
+ if (flags & MLX5_IB_UPD_XLT_KEEP_PGSZ)
+ wqe->ctrl_seg.mkey_mask &=
+ cpu_to_be64(~MLX5_MKEY_MASK_PAGE_SIZE);
}
wqe->ctrl_seg.xlt_octowords =
@@ -664,46 +669,78 @@ static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev,
wqe->data_seg.byte_count = cpu_to_be32(sg->length);
}
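
/*
 * Illustrative sketch, not part of the patch: the net effect of
 * MLX5_IB_UPD_XLT_KEEP_PGSZ on the translation mask built above. The helper
 * name below is hypothetical and only demonstrates the bit arithmetic; with
 * the flag set, the page-size bit is masked out so the UMR leaves the mkey's
 * current page size untouched while still updating length and start address.
 */
static __be64 example_translation_mask(struct mlx5_ib_dev *dev, bool keep_pgsz)
{
	u64 mask = MLX5_MKEY_MASK_LEN | MLX5_MKEY_MASK_PAGE_SIZE |
		   MLX5_MKEY_MASK_START_ADDR;

	if (MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5))
		mask |= MLX5_MKEY_MASK_PAGE_SIZE_5;

	if (keep_pgsz)
		/* Mirrors mlx5r_umr_final_update_xlt(): drop only PAGE_SIZE */
		mask &= ~(u64)MLX5_MKEY_MASK_PAGE_SIZE;

	return cpu_to_be64(mask);
}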
+static void
+_mlx5r_umr_init_wqe(struct mlx5_ib_mr *mr, struct mlx5r_umr_wqe *wqe,
+ struct ib_sge *sg, unsigned int flags,
+ unsigned int page_shift, bool dd)
+{
+ struct mlx5_ib_dev *dev = mr_to_mdev(mr);
+
+ mlx5r_umr_set_update_xlt_ctrl_seg(&wqe->ctrl_seg, flags, sg);
+ mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe->mkey_seg, mr, page_shift);
+ if (dd) /* Use the data direct internal kernel PD */
+ MLX5_SET(mkc, &wqe->mkey_seg, pd, dev->ddr.pdn);
+ mlx5r_umr_set_update_xlt_data_seg(&wqe->data_seg, sg);
+}
+
static int
-_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd)
+_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd,
+ size_t start_block, size_t nblocks)
{
size_t ent_size = dd ? sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt);
struct mlx5_ib_dev *dev = mr_to_mdev(mr);
struct device *ddev = &dev->mdev->pdev->dev;
struct mlx5r_umr_wqe wqe = {};
+ size_t processed_blocks = 0;
struct ib_block_iter biter;
+ size_t cur_block_idx = 0;
struct mlx5_ksm *cur_ksm;
struct mlx5_mtt *cur_mtt;
size_t orig_sg_length;
+ size_t total_blocks;
size_t final_size;
void *curr_entry;
struct ib_sge sg;
void *entry;
- u64 offset = 0;
+ u64 offset;
int err = 0;
- entry = mlx5r_umr_create_xlt(dev, &sg,
- ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift),
- ent_size, flags);
+ total_blocks = ib_umem_num_dma_blocks(mr->umem, 1UL << mr->page_shift);
+ if (start_block > total_blocks)
+ return -EINVAL;
+
+ /* nblocks 0 means update all blocks starting from start_block */
+ if (nblocks)
+ total_blocks = nblocks;
+
+ entry = mlx5r_umr_create_xlt(dev, &sg, total_blocks, ent_size, flags);
if (!entry)
return -ENOMEM;
orig_sg_length = sg.length;
- mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg);
- mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr,
- mr->page_shift);
- if (dd) {
- /* Use the data direct internal kernel PD */
- MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn);
+
+ _mlx5r_umr_init_wqe(mr, &wqe, &sg, flags, mr->page_shift, dd);
+
+ /* Set initial translation offset to start_block */
+ offset = (u64)start_block * ent_size;
+ mlx5r_umr_update_offset(&wqe.ctrl_seg, offset);
+
+ if (dd)
cur_ksm = entry;
- } else {
+ else
cur_mtt = entry;
- }
-
- mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg);
curr_entry = entry;
+
rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) {
+ if (cur_block_idx < start_block) {
+ cur_block_idx++;
+ continue;
+ }
+
+ if (nblocks && processed_blocks >= nblocks)
+ break;
+
if (curr_entry == entry + sg.length) {
dma_sync_single_for_device(ddev, sg.addr, sg.length,
DMA_TO_DEVICE);
@@ -725,6 +762,11 @@ _mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd)
if (dd) {
cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter));
cur_ksm->key = cpu_to_be32(dev->ddr.mkey);
+ if (mr->umem->is_dmabuf &&
+ (flags & MLX5_IB_UPD_XLT_ZAP)) {
+ cur_ksm->va = 0;
+ cur_ksm->key = 0;
+ }
cur_ksm++;
curr_entry = cur_ksm;
} else {
@@ -736,6 +778,8 @@ _mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd)
cur_mtt++;
curr_entry = cur_mtt;
}
+
+ processed_blocks++;
}
final_size = curr_entry - entry;
@@ -752,13 +796,32 @@ err:
return err;
}
-int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags)
+int mlx5r_umr_update_data_direct_ksm_pas_range(struct mlx5_ib_mr *mr,
+ unsigned int flags,
+ size_t start_block,
+ size_t nblocks)
{
/* No invalidation flow is expected */
- if (WARN_ON(!mr->umem->is_dmabuf) || (flags & MLX5_IB_UPD_XLT_ZAP))
+ if (WARN_ON(!mr->umem->is_dmabuf) || ((flags & MLX5_IB_UPD_XLT_ZAP) &&
+ !(flags & MLX5_IB_UPD_XLT_KEEP_PGSZ)))
return -EINVAL;
- return _mlx5r_umr_update_mr_pas(mr, flags, true);
+ return _mlx5r_umr_update_mr_pas(mr, flags, true, start_block, nblocks);
+}
+
+int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr,
+ unsigned int flags)
+{
+ return mlx5r_umr_update_data_direct_ksm_pas_range(mr, flags, 0, 0);
+}
+
+int mlx5r_umr_update_mr_pas_range(struct mlx5_ib_mr *mr, unsigned int flags,
+ size_t start_block, size_t nblocks)
+{
+ if (WARN_ON(mr->umem->is_odp))
+ return -EINVAL;
+
+ return _mlx5r_umr_update_mr_pas(mr, flags, false, start_block, nblocks);
}
/*
@@ -768,10 +831,7 @@ int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int fla
*/
int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
{
- if (WARN_ON(mr->umem->is_odp))
- return -EINVAL;
-
- return _mlx5r_umr_update_mr_pas(mr, flags, false);
+ return mlx5r_umr_update_mr_pas_range(mr, flags, 0, 0);
}
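
/*
 * Illustrative sketch, not part of the patch: the new *_range() entry points
 * allow refreshing only a window of an MR's translation entries. Block
 * indices are in units of 1 << mr->page_shift, and nblocks == 0 means "all
 * blocks starting from start_block". The function name and the flag choice
 * below are assumptions made for the example; the flags resemble what the
 * dmabuf update path uses for partial reloads.
 */
static int example_reload_window(struct mlx5_ib_mr *mr, size_t start_block,
				 size_t nblocks)
{
	unsigned int flags = MLX5_IB_UPD_XLT_ATOMIC |
			     MLX5_IB_UPD_XLT_KEEP_PGSZ;

	/* Data direct MRs use the KSM path, everything else uses MTT */
	if (mr->data_direct)
		return mlx5r_umr_update_data_direct_ksm_pas_range(mr, flags,
								  start_block,
								  nblocks);

	return mlx5r_umr_update_mr_pas_range(mr, flags, start_block, nblocks);
}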
static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
@@ -864,3 +924,202 @@ int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
mlx5r_umr_unmap_free_xlt(dev, xlt, &sg);
return err;
}
+
+/*
+ * Update only the page-size (log_page_size) field of an existing memory key
+ * using UMR. This is useful when the MR's physical layout stays the same
+ * but the optimal page shift has changed (e.g. dmabuf after pages are
+ * pinned and the HW can switch from 4K to huge-page alignment).
+ */
+int mlx5r_umr_update_mr_page_shift(struct mlx5_ib_mr *mr,
+ unsigned int page_shift,
+ bool dd)
+{
+ struct mlx5_ib_dev *dev = mr_to_mdev(mr);
+ struct mlx5r_umr_wqe wqe = {};
+ int err;
+
+ /* Build UMR wqe: the translation mask also covers length and start
+ * address, which are re-set below to their current values, so only
+ * log_page_size effectively changes.
+ */
+ wqe.ctrl_seg.mkey_mask = get_umr_update_translation_mask(dev);
+
+ /* MR must be free while page size is modified */
+ wqe.ctrl_seg.flags = MLX5_UMR_CHECK_FREE | MLX5_UMR_INLINE;
+
+ /* Fill mkey segment with the new page size, keep the rest unchanged */
+ MLX5_SET(mkc, &wqe.mkey_seg, log_page_size, page_shift);
+
+ if (dd)
+ MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn);
+ else
+ MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(mr->ibmr.pd)->pdn);
+
+ MLX5_SET64(mkc, &wqe.mkey_seg, start_addr, mr->ibmr.iova);
+ MLX5_SET64(mkc, &wqe.mkey_seg, len, mr->ibmr.length);
+ MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
+ MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
+ mlx5_mkey_variant(mr->mmkey.key));
+
+ err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
+ if (!err)
+ mr->page_shift = page_shift;
+
+ return err;
+}
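
/*
 * Illustrative sketch, not part of the patch: once the PAS entries already
 * describe the buffer at the coarser granularity, a hypothetical caller could
 * switch the mkey from 4K to 2M pages in a single UMR without re-writing any
 * translation entries.
 */
static int example_switch_to_2m_pages(struct mlx5_ib_mr *mr)
{
	const unsigned int two_mb_shift = 21;	/* log2(2 MiB) */

	/* mr->data_direct selects the data direct kernel PD vs. the MR's PD */
	return mlx5r_umr_update_mr_page_shift(mr, two_mb_shift,
					      mr->data_direct);
}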
+
+static inline int
+_mlx5r_dmabuf_umr_update_pas(struct mlx5_ib_mr *mr, unsigned int flags,
+ size_t start_block, size_t nblocks, bool dd)
+{
+ if (dd)
+ return mlx5r_umr_update_data_direct_ksm_pas_range(mr, flags,
+ start_block,
+ nblocks);
+ else
+ return mlx5r_umr_update_mr_pas_range(mr, flags, start_block,
+ nblocks);
+}
+
+/**
+ * _mlx5r_umr_zap_mkey - make an mkey non-present
+ * @mr: The memory region to update
+ * @flags: Translation table update flags
+ * @page_shift: The new (optimized) page shift the caller intends to use
+ * @nblocks: Output; set to the number of entries that were zapped
+ * @dd: True for a data direct (KSM) MR, false for a regular (MTT) MR
+ *
+ * Make the mkey non-present by zapping (zeroing out) its first N translation
+ * entries, where N is derived from the largest page size supported by the
+ * device and the MR length, and then updating the mkey's page size to that
+ * largest value. This leaves the MR completely non-present and safe for
+ * further updates, which is useful when updating the page size of a dmabuf
+ * MR on a page fault.
+ *
+ * Return: 0 on success, with *nblocks set to the number of entries that were
+ * zapped (0 means the whole mkey is covered). On error, returns a negative
+ * error code.
+ */
+static int _mlx5r_umr_zap_mkey(struct mlx5_ib_mr *mr,
+ unsigned int flags,
+ unsigned int page_shift,
+ size_t *nblocks,
+ bool dd)
+{
+ unsigned int old_page_shift = mr->page_shift;
+ struct mlx5_ib_dev *dev = mr_to_mdev(mr);
+ unsigned int max_page_shift;
+ size_t page_shift_nblocks;
+ unsigned int max_log_size;
+ int access_mode;
+ int err;
+
+ access_mode = dd ? MLX5_MKC_ACCESS_MODE_KSM : MLX5_MKC_ACCESS_MODE_MTT;
+ flags |= MLX5_IB_UPD_XLT_KEEP_PGSZ | MLX5_IB_UPD_XLT_ZAP |
+ MLX5_IB_UPD_XLT_ATOMIC;
+ max_log_size = get_max_log_entity_size_cap(dev, access_mode);
+ max_page_shift = order_base_2(mr->ibmr.length);
+ max_page_shift = min(max(max_page_shift, page_shift), max_log_size);
+ /* Count the blocks in units of max_page_shift; we will zap exactly this
+ * many to make the whole MR non-present.
+ * The number of entries is aligned up so that the resulting byte offset
+ * into the XLT (entries * entry size) stays MLX5_UMR_FLEX_ALIGNMENT
+ * aligned, since it may later be used as the starting offset.
+ */
+ *nblocks = ib_umem_num_dma_blocks(mr->umem, 1UL << max_page_shift);
+ if (dd)
+ *nblocks = ALIGN(*nblocks, MLX5_UMR_KSM_NUM_ENTRIES_ALIGNMENT);
+ else
+ *nblocks = ALIGN(*nblocks, MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT);
+ page_shift_nblocks = ib_umem_num_dma_blocks(mr->umem,
+ 1UL << page_shift);
+ /* If the number of blocks at the max possible page shift is not smaller
+ * than the number of blocks at the new page size, simply go over all of
+ * the mkey's entries (*nblocks == 0 means the whole range).
+ */
+ if (*nblocks >= page_shift_nblocks)
+ *nblocks = 0;
+
+ /* Make the first nblocks entries non-present without changing
+ * page size yet.
+ */
+ if (*nblocks)
+ mr->page_shift = max_page_shift;
+ err = _mlx5r_dmabuf_umr_update_pas(mr, flags, 0, *nblocks, dd);
+ if (err) {
+ mr->page_shift = old_page_shift;
+ return err;
+ }
+
+ /* Change page size to the max page size now that the MR is completely
+ * non-present.
+ */
+ if (*nblocks) {
+ err = mlx5r_umr_update_mr_page_shift(mr, max_page_shift, dd);
+ if (err) {
+ mr->page_shift = old_page_shift;
+ return err;
+ }
+ }
+
+ return 0;
+}
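
/*
 * Worked example with assumed numbers (not part of the patch): consider a
 * 6 MiB dmabuf MR faulted in at page_shift = 12 (4K), with an assumed device
 * cap of max_log_size = 31 and a KSM alignment of 4 entries:
 *
 *   max_page_shift     = min(max(order_base_2(6 MiB), 12), 31)
 *                      = min(max(23, 12), 31) = 23
 *   *nblocks           = ALIGN(blocks at 1 << 23, 4) = ALIGN(1, 4) = 4
 *   page_shift_nblocks = blocks at 1 << 12 = 1536
 *
 * Since 4 < 1536, only the first 4 entries are zapped and the mkey page size
 * is raised to 2^23, which is enough to make the whole 6 MiB range
 * non-present, instead of zeroing all 1536 4K entries.
 */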
+
+/**
+ * mlx5r_umr_dmabuf_update_pgsz - Safely update DMABUF MR page size and its
+ * entries accordingly
+ * @mr: The memory region to update
+ * @xlt_flags: Translation table update flags
+ * @page_shift: The new (optimized) page shift to use
+ *
+ * This function updates the page size and mkey translation entries for a DMABUF
+ * MR in a safe, multi-step process to avoid exposing partially updated mappings
+ * The update is performed in 5 steps:
+ * 1. Make the first X entries non-present, while X is calculated to be
+ * minimal according to a large page shift that can be used to cover the
+ * MR length.
+ * 2. Update the page size to the large supported page size
+ * 3. Load the remaining N-X entries according to the (optimized) page_shift
+ * 4. Update the page size according to the (optimized) page_shift
+ * 5. Load the first X entries with the correct translations
+ *
+ * This ensures that at no point is the MR accessible with a partially updated
+ * translation table, maintaining correctness and preventing access to stale or
+ * inconsistent mappings.
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+int mlx5r_umr_dmabuf_update_pgsz(struct mlx5_ib_mr *mr, u32 xlt_flags,
+ unsigned int page_shift)
+{
+ unsigned int old_page_shift = mr->page_shift;
+ size_t zapped_blocks;
+ size_t total_blocks;
+ int err;
+
+ err = _mlx5r_umr_zap_mkey(mr, xlt_flags, page_shift, &zapped_blocks,
+ mr->data_direct);
+ if (err)
+ return err;
+
+ /* _mlx5r_umr_zap_mkey already enables the mkey */
+ xlt_flags &= ~MLX5_IB_UPD_XLT_ENABLE;
+ mr->page_shift = page_shift;
+ total_blocks = ib_umem_num_dma_blocks(mr->umem, 1UL << mr->page_shift);
+ if (zapped_blocks && zapped_blocks < total_blocks) {
+ /* Update PAS according to the new page size but don't update
+ * the page size in the mkey yet.
+ */
+ err = _mlx5r_dmabuf_umr_update_pas(
+ mr,
+ xlt_flags | MLX5_IB_UPD_XLT_KEEP_PGSZ,
+ zapped_blocks,
+ total_blocks - zapped_blocks,
+ mr->data_direct);
+ if (err)
+ goto err;
+ }
+
+ err = mlx5r_umr_update_mr_page_shift(mr, mr->page_shift,
+ mr->data_direct);
+ if (err)
+ goto err;
+ err = _mlx5r_dmabuf_umr_update_pas(mr, xlt_flags, 0, zapped_blocks,
+ mr->data_direct);
+ if (err)
+ goto err;
+
+ return 0;
+err:
+ mr->page_shift = old_page_shift;
+ return err;
+}
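
/*
 * Illustrative sketch, not part of the patch: roughly how a dmabuf page-fault
 * path might drive mlx5r_umr_dmabuf_update_pgsz(). The wrapper function, the
 * pgsz_bitmap argument and the flag choice are assumptions made for the
 * example; ib_umem_find_best_pgsz() is the generic RDMA core helper.
 */
static int example_dmabuf_fault_resize(struct mlx5_ib_mr *mr,
				       unsigned long pgsz_bitmap)
{
	unsigned long page_size;

	/* Pick the largest page size the pinned layout now supports */
	page_size = ib_umem_find_best_pgsz(mr->umem, pgsz_bitmap,
					   mr->ibmr.iova);
	if (!page_size)
		return -EINVAL;

	if (order_base_2(page_size) == mr->page_shift)
		return 0;	/* already optimal, nothing to do */

	/* Zap, bump the page size, then safely re-load the translations */
	return mlx5r_umr_dmabuf_update_pgsz(mr, MLX5_IB_UPD_XLT_ATOMIC,
					    order_base_2(page_size));
}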