Diffstat (limited to 'drivers/infiniband/hw/mlx5/umr.c')
-rw-r--r--	drivers/infiniband/hw/mlx5/umr.c | 307
1 file changed, 283 insertions, 24 deletions
diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c
index 5be4426a2884..7ef35cddce81 100644
--- a/drivers/infiniband/hw/mlx5/umr.c
+++ b/drivers/infiniband/hw/mlx5/umr.c
@@ -32,13 +32,15 @@ static __be64 get_umr_disable_mr_mask(void)
 	return cpu_to_be64(result);
 }
 
-static __be64 get_umr_update_translation_mask(void)
+static __be64 get_umr_update_translation_mask(struct mlx5_ib_dev *dev)
 {
 	u64 result;
 
 	result = MLX5_MKEY_MASK_LEN |
 		 MLX5_MKEY_MASK_PAGE_SIZE |
 		 MLX5_MKEY_MASK_START_ADDR;
+	if (MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5))
+		result |= MLX5_MKEY_MASK_PAGE_SIZE_5;
 
 	return cpu_to_be64(result);
 }
@@ -654,9 +656,12 @@ static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev,
 		flags & MLX5_IB_UPD_XLT_ENABLE || flags & MLX5_IB_UPD_XLT_ADDR;
 
 	if (update_translation) {
-		wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask();
+		wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask(dev);
 		if (!mr->ibmr.length)
 			MLX5_SET(mkc, &wqe->mkey_seg, length64, 1);
+		if (flags & MLX5_IB_UPD_XLT_KEEP_PGSZ)
+			wqe->ctrl_seg.mkey_mask &=
+				cpu_to_be64(~MLX5_MKEY_MASK_PAGE_SIZE);
 	}
 
 	wqe->ctrl_seg.xlt_octowords =
@@ -664,46 +669,78 @@ static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev,
 	wqe->data_seg.byte_count = cpu_to_be32(sg->length);
 }
 
+static void
+_mlx5r_umr_init_wqe(struct mlx5_ib_mr *mr, struct mlx5r_umr_wqe *wqe,
+		    struct ib_sge *sg, unsigned int flags,
+		    unsigned int page_shift, bool dd)
+{
+	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
+
+	mlx5r_umr_set_update_xlt_ctrl_seg(&wqe->ctrl_seg, flags, sg);
+	mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe->mkey_seg, mr, page_shift);
+	if (dd) /* Use the data direct internal kernel PD */
+		MLX5_SET(mkc, &wqe->mkey_seg, pd, dev->ddr.pdn);
+	mlx5r_umr_set_update_xlt_data_seg(&wqe->data_seg, sg);
+}
+
 static int
-_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd)
+_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd,
+			 size_t start_block, size_t nblocks)
 {
 	size_t ent_size = dd ? sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt);
 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
 	struct device *ddev = &dev->mdev->pdev->dev;
 	struct mlx5r_umr_wqe wqe = {};
+	size_t processed_blocks = 0;
 	struct ib_block_iter biter;
+	size_t cur_block_idx = 0;
 	struct mlx5_ksm *cur_ksm;
 	struct mlx5_mtt *cur_mtt;
 	size_t orig_sg_length;
+	size_t total_blocks;
 	size_t final_size;
 	void *curr_entry;
 	struct ib_sge sg;
 	void *entry;
-	u64 offset = 0;
+	u64 offset;
 	int err = 0;
 
-	entry = mlx5r_umr_create_xlt(dev, &sg,
-				     ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift),
-				     ent_size, flags);
+	total_blocks = ib_umem_num_dma_blocks(mr->umem, 1UL << mr->page_shift);
+	if (start_block > total_blocks)
+		return -EINVAL;
+
+	/* nblocks 0 means update all blocks starting from start_block */
+	if (nblocks)
+		total_blocks = nblocks;
+
+	entry = mlx5r_umr_create_xlt(dev, &sg, total_blocks, ent_size, flags);
 	if (!entry)
 		return -ENOMEM;
 
 	orig_sg_length = sg.length;
-	mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg);
-	mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr,
-					  mr->page_shift);
-	if (dd) {
-		/* Use the data direct internal kernel PD */
-		MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn);
+
+	_mlx5r_umr_init_wqe(mr, &wqe, &sg, flags, mr->page_shift, dd);
+
+	/* Set initial translation offset to start_block */
+	offset = (u64)start_block * ent_size;
+	mlx5r_umr_update_offset(&wqe.ctrl_seg, offset);
+
+	if (dd)
 		cur_ksm = entry;
-	} else {
+	else
 		cur_mtt = entry;
-	}
-
-	mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg);
 
 	curr_entry = entry;
+
 	rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) {
+		if (cur_block_idx < start_block) {
+			cur_block_idx++;
+			continue;
+		}
+
+		if (nblocks && processed_blocks >= nblocks)
+			break;
+
 		if (curr_entry == entry + sg.length) {
 			dma_sync_single_for_device(ddev, sg.addr, sg.length,
 						   DMA_TO_DEVICE);
@@ -725,6 +762,11 @@ _mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd)
 		if (dd) {
 			cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter));
 			cur_ksm->key = cpu_to_be32(dev->ddr.mkey);
+			if (mr->umem->is_dmabuf &&
+			    (flags & MLX5_IB_UPD_XLT_ZAP)) {
+				cur_ksm->va = 0;
+				cur_ksm->key = 0;
+			}
 			cur_ksm++;
 			curr_entry = cur_ksm;
 		} else {
@@ -736,6 +778,8 @@ _mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd)
 			cur_mtt++;
 			curr_entry = cur_mtt;
 		}
+
+		processed_blocks++;
 	}
 
 	final_size = curr_entry - entry;
@@ -752,13 +796,32 @@ err:
 	return err;
 }
 
-int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags)
+int mlx5r_umr_update_data_direct_ksm_pas_range(struct mlx5_ib_mr *mr,
+					       unsigned int flags,
+					       size_t start_block,
+					       size_t nblocks)
 {
 	/* No invalidation flow is expected */
-	if (WARN_ON(!mr->umem->is_dmabuf) || (flags & MLX5_IB_UPD_XLT_ZAP))
+	if (WARN_ON(!mr->umem->is_dmabuf) || ((flags & MLX5_IB_UPD_XLT_ZAP) &&
+	    !(flags & MLX5_IB_UPD_XLT_KEEP_PGSZ)))
 		return -EINVAL;
 
-	return _mlx5r_umr_update_mr_pas(mr, flags, true);
+	return _mlx5r_umr_update_mr_pas(mr, flags, true, start_block, nblocks);
+}
+
+int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr,
+					 unsigned int flags)
+{
+	return mlx5r_umr_update_data_direct_ksm_pas_range(mr, flags, 0, 0);
+}
+
+int mlx5r_umr_update_mr_pas_range(struct mlx5_ib_mr *mr, unsigned int flags,
+				  size_t start_block, size_t nblocks)
+{
+	if (WARN_ON(mr->umem->is_odp))
+		return -EINVAL;
+
+	return _mlx5r_umr_update_mr_pas(mr, flags, false, start_block, nblocks);
 }
 
 /*
@@ -768,10 +831,7 @@ int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int fla
  */
 int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
 {
-	if (WARN_ON(mr->umem->is_odp))
-		return -EINVAL;
-
-	return _mlx5r_umr_update_mr_pas(mr, flags, false);
+	return mlx5r_umr_update_mr_pas_range(mr, flags, 0, 0);
 }
 
 static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
@@ -864,3 +924,202 @@ int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 	mlx5r_umr_unmap_free_xlt(dev, xlt, &sg);
 	return err;
 }
+
+/*
+ * Update only the page-size (log_page_size) field of an existing memory key
+ * using UMR. This is useful when the MR's physical layout stays the same
+ * but the optimal page shift has changed (e.g. dmabuf after pages are
+ * pinned and the HW can switch from 4K to huge-page alignment).
+ */
+int mlx5r_umr_update_mr_page_shift(struct mlx5_ib_mr *mr,
+				   unsigned int page_shift,
+				   bool dd)
+{
+	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
+	struct mlx5r_umr_wqe wqe = {};
+	int err;
+
+	/* Build UMR wqe: we touch only PAGE_SIZE, so use the dedicated mask */
+	wqe.ctrl_seg.mkey_mask = get_umr_update_translation_mask(dev);
+
+	/* MR must be free while page size is modified */
+	wqe.ctrl_seg.flags = MLX5_UMR_CHECK_FREE | MLX5_UMR_INLINE;
+
+	/* Fill mkey segment with the new page size, keep the rest unchanged */
+	MLX5_SET(mkc, &wqe.mkey_seg, log_page_size, page_shift);
+
+	if (dd)
+		MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn);
+	else
+		MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(mr->ibmr.pd)->pdn);
+
+	MLX5_SET64(mkc, &wqe.mkey_seg, start_addr, mr->ibmr.iova);
+	MLX5_SET64(mkc, &wqe.mkey_seg, len, mr->ibmr.length);
+	MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
+	MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
+		 mlx5_mkey_variant(mr->mmkey.key));
+
+	err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
+	if (!err)
+		mr->page_shift = page_shift;
+
+	return err;
+}
+
+static inline int
+_mlx5r_dmabuf_umr_update_pas(struct mlx5_ib_mr *mr, unsigned int flags,
+			     size_t start_block, size_t nblocks, bool dd)
+{
+	if (dd)
+		return mlx5r_umr_update_data_direct_ksm_pas_range(mr, flags,
+								  start_block,
+								  nblocks);
+	else
+		return mlx5r_umr_update_mr_pas_range(mr, flags, start_block,
+						     nblocks);
+}
+
+/**
+ * This function makes an mkey non-present by zapping (zeroing out) its first
+ * N translation entries, where N is determined by the largest page size
+ * supported by the device and the MR length.
+ * It then updates the mkey's page size to the largest possible value, ensuring
+ * the MR is completely non-present and safe for further updates.
+ * It is useful when updating the page size of a dmabuf MR on a page fault.
+ *
+ * Return: 0 on success, with the number of zapped entries returned in
+ *         @nblocks. On error, returns a negative error code.
+ */
+static int _mlx5r_umr_zap_mkey(struct mlx5_ib_mr *mr,
+			       unsigned int flags,
+			       unsigned int page_shift,
+			       size_t *nblocks,
+			       bool dd)
+{
+	unsigned int old_page_shift = mr->page_shift;
+	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
+	unsigned int max_page_shift;
+	size_t page_shift_nblocks;
+	unsigned int max_log_size;
+	int access_mode;
+	int err;
+
+	access_mode = dd ? MLX5_MKC_ACCESS_MODE_KSM : MLX5_MKC_ACCESS_MODE_MTT;
+	flags |= MLX5_IB_UPD_XLT_KEEP_PGSZ | MLX5_IB_UPD_XLT_ZAP |
+		 MLX5_IB_UPD_XLT_ATOMIC;
+	max_log_size = get_max_log_entity_size_cap(dev, access_mode);
+	max_page_shift = order_base_2(mr->ibmr.length);
+	max_page_shift = min(max(max_page_shift, page_shift), max_log_size);
+	/* Count blocks in units of max_page_shift; we will zap exactly this
+	 * many to make the whole MR non-present.
+	 * Block size must be aligned to MLX5_UMR_FLEX_ALIGNMENT since it may
+	 * be used as offset into the XLT later on.
+	 */
+	*nblocks = ib_umem_num_dma_blocks(mr->umem, 1UL << max_page_shift);
+	if (dd)
+		*nblocks = ALIGN(*nblocks, MLX5_UMR_KSM_NUM_ENTRIES_ALIGNMENT);
+	else
+		*nblocks = ALIGN(*nblocks, MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT);
+	page_shift_nblocks = ib_umem_num_dma_blocks(mr->umem,
+						    1UL << page_shift);
+	/* If the number of blocks at the max possible page shift is greater
+	 * than the number of blocks at the new page size, simply update all
+	 * of the mkey's entries.
+	 */
+	if (*nblocks >= page_shift_nblocks)
+		*nblocks = 0;
+
+	/* Make the first nblocks entries non-present without changing
+	 * page size yet.
+	 */
+	if (*nblocks)
+		mr->page_shift = max_page_shift;
+	err = _mlx5r_dmabuf_umr_update_pas(mr, flags, 0, *nblocks, dd);
+	if (err) {
+		mr->page_shift = old_page_shift;
+		return err;
+	}
+
+	/* Change page size to the max page size now that the MR is completely
+	 * non-present.
+	 */
+	if (*nblocks) {
+		err = mlx5r_umr_update_mr_page_shift(mr, max_page_shift, dd);
+		if (err) {
+			mr->page_shift = old_page_shift;
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * mlx5r_umr_dmabuf_update_pgsz - Safely update DMABUF MR page size and its
+ *                                entries accordingly
+ * @mr: The memory region to update
+ * @xlt_flags: Translation table update flags
+ * @page_shift: The new (optimized) page shift to use
+ *
+ * This function updates the page size and mkey translation entries for a DMABUF
+ * MR in a safe, multi-step process to avoid exposing partially updated mappings.
+ * The update is performed in 5 steps:
+ *     1. Make the first X entries non-present, where X is calculated to be
+ *        minimal according to a large page shift that can be used to cover the
+ *        MR length.
+ *     2. Update the page size to the largest supported page size
+ *     3. Load the remaining N-X entries according to the (optimized) page_shift
+ *     4. Update the page size according to the (optimized) page_shift
+ *     5. Load the first X entries with the correct translations
+ *
+ * This ensures that at no point is the MR accessible with a partially updated
+ * translation table, maintaining correctness and preventing access to stale or
+ * inconsistent mappings.
+ *
+ * Returns 0 on success or a negative error code on failure.
+ */
+int mlx5r_umr_dmabuf_update_pgsz(struct mlx5_ib_mr *mr, u32 xlt_flags,
+				 unsigned int page_shift)
+{
+	unsigned int old_page_shift = mr->page_shift;
+	size_t zapped_blocks;
+	size_t total_blocks;
+	int err;
+
+	err = _mlx5r_umr_zap_mkey(mr, xlt_flags, page_shift, &zapped_blocks,
+				  mr->data_direct);
+	if (err)
+		return err;
+
+	/* _mlx5r_umr_zap_mkey already enables the mkey */
+	xlt_flags &= ~MLX5_IB_UPD_XLT_ENABLE;
+	mr->page_shift = page_shift;
+	total_blocks = ib_umem_num_dma_blocks(mr->umem, 1UL << mr->page_shift);
+	if (zapped_blocks && zapped_blocks < total_blocks) {
+		/* Update PAS according to the new page size but don't update
+		 * the page size in the mkey yet.
+		 */
+		err = _mlx5r_dmabuf_umr_update_pas(
+			mr,
+			xlt_flags | MLX5_IB_UPD_XLT_KEEP_PGSZ,
+			zapped_blocks,
+			total_blocks - zapped_blocks,
+			mr->data_direct);
+		if (err)
+			goto err;
+	}
+
+	err = mlx5r_umr_update_mr_page_shift(mr, mr->page_shift,
+					     mr->data_direct);
+	if (err)
+		goto err;
+	err = _mlx5r_dmabuf_umr_update_pas(mr, xlt_flags, 0, zapped_blocks,
+					   mr->data_direct);
+	if (err)
+		goto err;
+
+	return 0;
+err:
+	mr->page_shift = old_page_shift;
+	return err;
+}