Diffstat (limited to 'drivers/infiniband/hw/mlx5/mr.c')
 drivers/infiniband/hw/mlx5/mr.c | 491 ++++++++++++++++++++++++++++++---------
 1 file changed, 376 insertions(+), 115 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index a8ee2ca1f4a1..753faa9ad06a 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -43,18 +43,22 @@
#include "dm.h"
#include "mlx5_ib.h"
#include "umr.h"
+#include "data_direct.h"
enum {
MAX_PENDING_REG_MR = 8,
};
+#define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4
#define MLX5_UMR_ALIGN 2048
static void
create_mkey_callback(int status, struct mlx5_async_work *context);
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
u64 iova, int access_flags,
- unsigned int page_size, bool populate);
+ unsigned int page_size, bool populate,
+ int access_mode);
+static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr);
static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
struct ib_pd *pd)
@@ -211,9 +215,9 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
push_mkey_locked(ent, mkey_out->mkey);
+ ent->pending--;
/* If we are doing fill_to_high_water then keep going. */
queue_adjust_cache_locked(ent);
- ent->pending--;
spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
kfree(mkey_out);
}
@@ -246,6 +250,7 @@ static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
MLX5_SET(mkc, mkc, access_mode_4_2,
(ent->rb_key.access_mode >> 2) & 0x7);
+ MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats);
MLX5_SET(mkc, mkc, translations_octword_size,
get_mkc_octo_size(ent->rb_key.access_mode,
@@ -526,6 +531,21 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
}
}
+static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
+{
+ u32 mkey;
+
+ spin_lock_irq(&ent->mkeys_queue.lock);
+ while (ent->mkeys_queue.ci) {
+ mkey = pop_mkey_locked(ent);
+ spin_unlock_irq(&ent->mkeys_queue.lock);
+ mlx5_core_destroy_mkey(dev->mdev, mkey);
+ spin_lock_irq(&ent->mkeys_queue.lock);
+ }
+ ent->tmp_cleanup_scheduled = false;
+ spin_unlock_irq(&ent->mkeys_queue.lock);
+}
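
mlx5_core_destroy_mkey() issues a firmware command that can sleep, which is why clean_keys() drops the mkeys_queue spinlock around each destroy and re-reads the queue count after retaking it. A minimal userspace sketch of that drop-the-lock-around-the-slow-call pattern (the queue and slow_destroy() names are invented, purely illustrative):

	#include <pthread.h>
	#include <stdio.h>

	static pthread_spinlock_t lock;
	static int queue[8] = { 11, 22, 33 };
	static int count = 3;

	/* stand-in for mlx5_core_destroy_mkey(): too slow to call under the lock */
	static void slow_destroy(int key)
	{
		printf("destroying %d\n", key);
	}

	int main(void)
	{
		int key;

		pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
		pthread_spin_lock(&lock);
		while (count) {			/* re-check after every lock drop */
			key = queue[--count];	/* pop while holding the lock */
			pthread_spin_unlock(&lock);
			slow_destroy(key);	/* blocking step runs unlocked */
			pthread_spin_lock(&lock);
		}
		pthread_spin_unlock(&lock);
		pthread_spin_destroy(&lock);
		return 0;
	}
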
+
static void __cache_work_func(struct mlx5_cache_ent *ent)
{
struct mlx5_ib_dev *dev = ent->dev;
@@ -597,7 +617,11 @@ static void delayed_cache_work_func(struct work_struct *work)
struct mlx5_cache_ent *ent;
ent = container_of(work, struct mlx5_cache_ent, dwork.work);
- __cache_work_func(ent);
+ /* temp entries are never filled, only cleaned */
+ if (ent->is_tmp)
+ clean_keys(ent->dev, ent);
+ else
+ __cache_work_func(ent);
}
static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
@@ -641,10 +665,8 @@ static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
new = &((*new)->rb_left);
if (cmp < 0)
new = &((*new)->rb_right);
- if (cmp == 0) {
- mutex_unlock(&cache->rb_lock);
+ if (cmp == 0)
return -EEXIST;
- }
}
/* Add new node and rebalance tree. */
@@ -660,6 +682,7 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
{
struct rb_node *node = dev->cache.rb_root.rb_node;
struct mlx5_cache_ent *cur, *smallest = NULL;
+ u64 ndescs_limit;
int cmp;
/*
@@ -678,10 +701,18 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
return cur;
}
+ /*
+ * Limit the usage of mkeys larger than twice the required size while
+ * also allowing the usage of smallest cache entry for small MRs.
+ */
+ ndescs_limit = max_t(u64, rb_key.ndescs * 2,
+ MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS);
+
return (smallest &&
smallest->rb_key.access_mode == rb_key.access_mode &&
smallest->rb_key.access_flags == rb_key.access_flags &&
- smallest->rb_key.ats == rb_key.ats) ?
+ smallest->rb_key.ats == rb_key.ats &&
+ smallest->rb_key.ndescs <= ndescs_limit) ?
smallest :
NULL;
}
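
In effect, an MR that needs N descriptors may now be served only by a cache entry holding at most max(2 * N, MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS) descriptors, so small MRs can still reuse the smallest persistent entry while large MRs never get an mkey more than twice their size. A standalone sketch of the bound (not part of the patch):

	#include <stdio.h>

	#define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4

	/* mirrors the ndescs_limit computation in mkey_cache_ent_from_rb_key() */
	static unsigned long long ndescs_limit(unsigned long long ndescs)
	{
		unsigned long long limit = ndescs * 2;

		return limit > MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS ?
		       limit : MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS;
	}

	int main(void)
	{
		unsigned long long reqs[] = { 1, 3, 4, 100, 4096 };
		int i;

		for (i = 0; i < 5; i++)
			printf("request %llu descs -> reuse entries with up to %llu descs\n",
			       reqs[i], ndescs_limit(reqs[i]));
		return 0;
	}
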
@@ -719,6 +750,8 @@ static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
}
mr->mmkey.cache_ent = ent;
mr->mmkey.type = MLX5_MKEY_MR;
+ mr->mmkey.rb_key = ent->rb_key;
+ mr->mmkey.cacheable = true;
init_waitqueue_head(&mr->mmkey.wait);
return mr;
}
@@ -764,21 +797,6 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
return _mlx5_mr_cache_alloc(dev, ent, access_flags);
}
-static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
-{
- u32 mkey;
-
- cancel_delayed_work(&ent->dwork);
- spin_lock_irq(&ent->mkeys_queue.lock);
- while (ent->mkeys_queue.ci) {
- mkey = pop_mkey_locked(ent);
- spin_unlock_irq(&ent->mkeys_queue.lock);
- mlx5_core_destroy_mkey(dev->mdev, mkey);
- spin_lock_irq(&ent->mkeys_queue.lock);
- }
- spin_unlock_irq(&ent->mkeys_queue.lock);
-}
-
static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
{
if (!mlx5_debugfs_root || dev->is_rep)
@@ -891,10 +909,6 @@ mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
ent->limit = 0;
mlx5_mkey_cache_debugfs_add_ent(dev, ent);
- } else {
- mod_delayed_work(ent->dev->cache.wq,
- &ent->dev->cache.remove_ent_dwork,
- msecs_to_jiffies(30 * 1000));
}
return ent;
@@ -905,35 +919,6 @@ mkeys_err:
return ERR_PTR(ret);
}
-static void remove_ent_work_func(struct work_struct *work)
-{
- struct mlx5_mkey_cache *cache;
- struct mlx5_cache_ent *ent;
- struct rb_node *cur;
-
- cache = container_of(work, struct mlx5_mkey_cache,
- remove_ent_dwork.work);
- mutex_lock(&cache->rb_lock);
- cur = rb_last(&cache->rb_root);
- while (cur) {
- ent = rb_entry(cur, struct mlx5_cache_ent, node);
- cur = rb_prev(cur);
- mutex_unlock(&cache->rb_lock);
-
- spin_lock_irq(&ent->mkeys_queue.lock);
- if (!ent->is_tmp) {
- spin_unlock_irq(&ent->mkeys_queue.lock);
- mutex_lock(&cache->rb_lock);
- continue;
- }
- spin_unlock_irq(&ent->mkeys_queue.lock);
-
- clean_keys(ent->dev, ent);
- mutex_lock(&cache->rb_lock);
- }
- mutex_unlock(&cache->rb_lock);
-}
-
int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
{
struct mlx5_mkey_cache *cache = &dev->cache;
@@ -949,7 +934,6 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
mutex_init(&dev->slow_path_mutex);
mutex_init(&dev->cache.rb_lock);
dev->cache.rb_root = RB_ROOT;
- INIT_DELAYED_WORK(&dev->cache.remove_ent_dwork, remove_ent_work_func);
cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
if (!cache->wq) {
mlx5_ib_warn(dev, "failed to create work queue\n");
@@ -961,7 +945,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
mlx5_mkey_cache_debugfs_init(dev);
mutex_lock(&cache->rb_lock);
for (i = 0; i <= mkey_cache_max_order(dev); i++) {
- rb_key.ndescs = 1 << (i + 2);
+ rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i;
ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
if (IS_ERR(ent)) {
ret = PTR_ERR(ent);
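
The persistent entries keep the same geometric sizing as before, since MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i equals the old 1 << (i + 2); the loop is only rewritten in terms of the new constant. A quick throwaway check (illustrative only):

	#include <assert.h>
	#include <stdio.h>

	#define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4

	int main(void)
	{
		int i;

		for (i = 0; i < 20; i++) {
			unsigned int new_sz = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i;
			unsigned int old_sz = 1u << (i + 2);

			assert(new_sz == old_sz);
			printf("entry %2d: %u descriptors\n", i, new_sz);
		}
		return 0;
	}
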
@@ -1000,7 +984,6 @@ void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
return;
mutex_lock(&dev->cache.rb_lock);
- cancel_delayed_work(&dev->cache.remove_ent_dwork);
for (node = rb_first(root); node; node = rb_next(node)) {
ent = rb_entry(node, struct mlx5_cache_ent, node);
spin_lock_irq(&ent->mkeys_queue.lock);
@@ -1061,6 +1044,7 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
MLX5_SET(mkc, mkc, length64, 1);
set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
pd);
+ MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
if (err)
@@ -1125,12 +1109,10 @@ static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
struct ib_umem *umem, u64 iova,
- int access_flags)
+ int access_flags, int access_mode)
{
- struct mlx5r_cache_rb_key rb_key = {
- .access_mode = MLX5_MKC_ACCESS_MODE_MTT,
- };
struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct mlx5r_cache_rb_key rb_key = {};
struct mlx5_cache_ent *ent;
struct mlx5_ib_mr *mr;
unsigned int page_size;
@@ -1138,11 +1120,11 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
if (umem->is_dmabuf)
page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
else
- page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
- 0, iova);
+ page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);
if (WARN_ON(!page_size))
return ERR_PTR(-EINVAL);
+ rb_key.access_mode = access_mode;
rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
@@ -1153,11 +1135,12 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
*/
if (!ent) {
mutex_lock(&dev->slow_path_mutex);
- mr = reg_create(pd, umem, iova, access_flags, page_size, false);
+ mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode);
mutex_unlock(&dev->slow_path_mutex);
if (IS_ERR(mr))
return mr;
mr->mmkey.rb_key = rb_key;
+ mr->mmkey.cacheable = true;
return mr;
}
@@ -1173,13 +1156,71 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
return mr;
}
+static struct ib_mr *
+reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags,
+ u32 crossed_lkey)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING;
+ struct mlx5_ib_mr *mr;
+ void *mkc;
+ int inlen;
+ u32 *in;
+ int err;
+
+ if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+ in = kvzalloc(inlen, GFP_KERNEL);
+ if (!in) {
+ err = -ENOMEM;
+ goto err_1;
+ }
+
+ mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+ MLX5_SET(mkc, mkc, crossing_target_vhca_id,
+ MLX5_CAP_GEN(dev->mdev, vhca_id));
+ MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey);
+ MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
+ MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
+
+ /* for this crossing mkey IOVA should be 0 and len should be IOVA + len */
+ set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd);
+ MLX5_SET64(mkc, mkc, len, iova + length);
+
+ MLX5_SET(mkc, mkc, free, 0);
+ MLX5_SET(mkc, mkc, umr_en, 0);
+ err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
+ if (err)
+ goto err_2;
+
+ mr->mmkey.type = MLX5_MKEY_MR;
+ set_mr_fields(dev, mr, length, access_flags, iova);
+ mr->ibmr.pd = pd;
+ kvfree(in);
+ mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key);
+
+ return &mr->ibmr;
+err_2:
+ kvfree(in);
+err_1:
+ kfree(mr);
+ return ERR_PTR(err);
+}
+
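
To unpack the terse comment above set_mkc_access_pd_addr_fields(): the crossing mkey is programmed with start address 0 and length iova + length, so the caller's original virtual addresses still land inside the mkey's range without any offset translation. A small arithmetic check with made-up values:

	#include <assert.h>
	#include <inttypes.h>
	#include <stdio.h>

	int main(void)
	{
		/* illustrative numbers only */
		uint64_t iova = 0x7f0000001000ULL, length = 0x2000;
		uint64_t mkey_start = 0;		/* set_mkc_access_pd_addr_fields(..., 0, pd) */
		uint64_t mkey_len = iova + length;	/* MLX5_SET64(mkc, mkc, len, iova + length) */
		uint64_t va;

		for (va = iova; va < iova + length; va += 0x1000)
			assert(va >= mkey_start && va - mkey_start < mkey_len);

		printf("VAs [0x%" PRIx64 ", 0x%" PRIx64 ") are covered by the crossing mkey\n",
		       iova, iova + length);
		return 0;
	}
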
/*
* If ibmr is NULL it will be allocated by reg_create.
* Else, the given ibmr will be used.
*/
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
u64 iova, int access_flags,
- unsigned int page_size, bool populate)
+ unsigned int page_size, bool populate,
+ int access_mode)
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct mlx5_ib_mr *mr;
@@ -1188,7 +1229,9 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
int inlen;
u32 *in;
int err;
- bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
+ bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) &&
+ (access_mode == MLX5_MKC_ACCESS_MODE_MTT);
+ bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
if (!page_size)
return ERR_PTR(-EINVAL);
@@ -1211,7 +1254,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
}
pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
if (populate) {
- if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) {
+ if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) {
err = -EINVAL;
goto err_2;
}
@@ -1227,14 +1270,22 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
populate ? pd : dev->umrc.pd);
+ /* In case a data direct flow, overwrite the pdn field by its internal kernel PD */
+ if (umem->is_dmabuf && ksm_mode)
+ MLX5_SET(mkc, mkc, pd, dev->ddr.pdn);
+
MLX5_SET(mkc, mkc, free, !populate);
- MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
+ MLX5_SET(mkc, mkc, access_mode_1_0, access_mode);
MLX5_SET(mkc, mkc, umr_en, 1);
MLX5_SET64(mkc, mkc, len, umem->length);
MLX5_SET(mkc, mkc, bsf_octword_size, 0);
- MLX5_SET(mkc, mkc, translations_octword_size,
- get_octo_len(iova, umem->length, mr->page_shift));
+ if (ksm_mode)
+ MLX5_SET(mkc, mkc, translations_octword_size,
+ get_octo_len(iova, umem->length, mr->page_shift) * 2);
+ else
+ MLX5_SET(mkc, mkc, translations_octword_size,
+ get_octo_len(iova, umem->length, mr->page_shift));
MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
if (mlx5_umem_needs_ats(dev, umem, access_flags))
MLX5_SET(mkc, mkc, ma_translation_mode, 1);
@@ -1371,13 +1422,15 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
if (xlt_with_umr) {
- mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
+ mr = alloc_cacheable_mr(pd, umem, iova, access_flags,
+ MLX5_MKC_ACCESS_MODE_MTT);
} else {
- unsigned int page_size = mlx5_umem_find_best_pgsz(
- umem, mkc, log_page_size, 0, iova);
+ unsigned int page_size =
+ mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);
mutex_lock(&dev->slow_path_mutex);
- mr = reg_create(pd, umem, iova, access_flags, page_size, true);
+ mr = reg_create(pd, umem, iova, access_flags, page_size,
+ true, MLX5_MKC_ACCESS_MODE_MTT);
mutex_unlock(&dev->slow_path_mutex);
}
if (IS_ERR(mr)) {
@@ -1440,7 +1493,8 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
if (IS_ERR(odp))
return ERR_CAST(odp);
- mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags);
+ mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags,
+ MLX5_MKC_ACCESS_MODE_MTT);
if (IS_ERR(mr)) {
ib_umem_release(&odp->umem);
return ERR_CAST(mr);
@@ -1468,6 +1522,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct ib_umem *umem;
+ int err;
if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
return ERR_PTR(-EOPNOTSUPP);
@@ -1475,6 +1530,10 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
start, iova, length, access_flags);
+ err = mlx5r_umr_resource_init(dev);
+ if (err)
+ return ERR_PTR(err);
+
if (access_flags & IB_ACCESS_ON_DEMAND)
return create_user_odp_mr(pd, start, length, iova, access_flags,
udata);
@@ -1491,7 +1550,7 @@ static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
- if (!umem_dmabuf->sgt)
+ if (!umem_dmabuf->sgt || !mr)
return;
mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
@@ -1503,31 +1562,31 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
.move_notify = mlx5_ib_dmabuf_invalidate_cb,
};
-struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
- u64 length, u64 virt_addr,
- int fd, int access_flags,
- struct ib_udata *udata)
+static struct ib_mr *
+reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
+ u64 offset, u64 length, u64 virt_addr,
+ int fd, int access_flags, int access_mode)
{
+ bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct mlx5_ib_mr *mr = NULL;
struct ib_umem_dmabuf *umem_dmabuf;
int err;
- if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
- !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
- return ERR_PTR(-EOPNOTSUPP);
-
- mlx5_ib_dbg(dev,
- "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
- offset, virt_addr, length, fd, access_flags);
+ err = mlx5r_umr_resource_init(dev);
+ if (err)
+ return ERR_PTR(err);
- /* dmabuf requires xlt update via umr to work. */
- if (!mlx5r_umr_can_load_pas(dev, length))
- return ERR_PTR(-EINVAL);
+ if (!pinned_mode)
+ umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev,
+ offset, length, fd,
+ access_flags,
+ &mlx5_ib_dmabuf_attach_ops);
+ else
+ umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev,
+ dma_device, offset, length,
+ fd, access_flags);
- umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd,
- access_flags,
- &mlx5_ib_dmabuf_attach_ops);
if (IS_ERR(umem_dmabuf)) {
mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
PTR_ERR(umem_dmabuf));
@@ -1535,7 +1594,7 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
}
mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
- access_flags);
+ access_flags, access_mode);
if (IS_ERR(mr)) {
ib_umem_release(&umem_dmabuf->umem);
return ERR_CAST(mr);
@@ -1545,9 +1604,13 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
umem_dmabuf->private = mr;
- err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
- if (err)
- goto err_dereg_mr;
+ if (!pinned_mode) {
+ err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
+ if (err)
+ goto err_dereg_mr;
+ } else {
+ mr->data_direct = true;
+ }
err = mlx5_ib_init_dmabuf_mr(mr);
if (err)
@@ -1555,10 +1618,101 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
return &mr->ibmr;
err_dereg_mr:
- mlx5_ib_dereg_mr(&mr->ibmr, NULL);
+ __mlx5_ib_dereg_mr(&mr->ibmr);
return ERR_PTR(err);
}
+static struct ib_mr *
+reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset,
+ u64 length, u64 virt_addr,
+ int fd, int access_flags)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct mlx5_data_direct_dev *data_direct_dev;
+ struct ib_mr *crossing_mr;
+ struct ib_mr *crossed_mr;
+ int ret = 0;
+
+ /* As of HW behaviour the IOVA must be page aligned in KSM mode */
+ if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ mutex_lock(&dev->data_direct_lock);
+ data_direct_dev = dev->data_direct_dev;
+ if (!data_direct_dev) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ /* The device's 'data direct mkey' was created without RO flags to
+ * simplify things and allow for a single mkey per device.
+ * Since RO is not a must, mask it out accordingly.
+ */
+ access_flags &= ~IB_ACCESS_RELAXED_ORDERING;
+ crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev,
+ offset, length, virt_addr, fd,
+ access_flags, MLX5_MKC_ACCESS_MODE_KSM);
+ if (IS_ERR(crossed_mr)) {
+ ret = PTR_ERR(crossed_mr);
+ goto end;
+ }
+
+ mutex_lock(&dev->slow_path_mutex);
+ crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags,
+ crossed_mr->lkey);
+ mutex_unlock(&dev->slow_path_mutex);
+ if (IS_ERR(crossing_mr)) {
+ __mlx5_ib_dereg_mr(crossed_mr);
+ ret = PTR_ERR(crossing_mr);
+ goto end;
+ }
+
+ list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list);
+ to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr);
+ to_mmr(crossing_mr)->data_direct = true;
+end:
+ mutex_unlock(&dev->data_direct_lock);
+ return ret ? ERR_PTR(ret) : crossing_mr;
+}
+
+struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
+ u64 length, u64 virt_addr,
+ int fd, int access_flags,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ int mlx5_access_flags = 0;
+ int err;
+
+ if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
+ !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) {
+ err = uverbs_get_flags32(&mlx5_access_flags, attrs,
+ MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
+ MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT);
+ if (err)
+ return ERR_PTR(err);
+ }
+
+ mlx5_ib_dbg(dev,
+ "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n",
+ offset, virt_addr, length, fd, access_flags, mlx5_access_flags);
+
+ /* dmabuf requires xlt update via umr to work. */
+ if (!mlx5r_umr_can_load_pas(dev, length))
+ return ERR_PTR(-EINVAL);
+
+ if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT)
+ return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr,
+ fd, access_flags);
+
+ return reg_user_mr_dmabuf(pd, pd->device->dma_device,
+ offset, length, virt_addr,
+ fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT);
+}
+
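
For context, the new MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS attribute is what lets userspace opt into the data-direct path. A rough usage sketch based on the rdma-core helper that accompanied this series; the mlx5dv_reg_dmabuf_mr() name, argument order and MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT flag are recalled assumptions and should be verified against the installed mlx5dv headers:

	#include <infiniband/verbs.h>
	#include <infiniband/mlx5dv.h>

	/* assumes pd, dmabuf_fd, length and iova were set up elsewhere */
	static struct ibv_mr *reg_data_direct(struct ibv_pd *pd, int dmabuf_fd,
					      size_t length, uint64_t iova)
	{
		/* the trailing mlx5_access argument selects the data-direct flow
		 * added by this patch; passing 0 keeps the regular MTT dmabuf path */
		return mlx5dv_reg_dmabuf_mr(pd, 0 /* offset */, length, iova,
					    dmabuf_fd,
					    IBV_ACCESS_LOCAL_WRITE |
					    IBV_ACCESS_REMOTE_READ |
					    IBV_ACCESS_REMOTE_WRITE,
					    MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT);
	}
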
/*
* True if the change in access flags can be done via UMR, only some access
* flags can be updated.
@@ -1570,7 +1724,8 @@ static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
unsigned int diffs = current_access_flags ^ target_access_flags;
if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING))
+ IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING |
+ IB_ACCESS_REMOTE_ATOMIC))
return false;
return mlx5r_umr_can_reconfig(dev, current_access_flags,
target_access_flags);
@@ -1589,8 +1744,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
return false;
- *page_size =
- mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
+ *page_size = mlx5_umem_mkc_find_best_pgsz(dev, new_umem, iova);
if (WARN_ON(!*page_size))
return false;
return (mr->mmkey.cache_ent->rb_key.ndescs) >=
@@ -1653,7 +1807,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
struct mlx5_ib_mr *mr = to_mmr(ib_mr);
int err;
- if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
+ if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct)
return ERR_PTR(-EOPNOTSUPP);
mlx5_ib_dbg(
@@ -1781,7 +1935,8 @@ err:
static void
mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
{
- if (!mr->umem && mr->descs) {
+ if (!mr->umem && !mr->data_direct &&
+ mr->ibmr.type != IB_MR_TYPE_DM && mr->descs) {
struct ib_device *device = mr->ibmr.device;
int size = mr->max_descs * mr->desc_size;
struct mlx5_ib_dev *dev = to_mdev(device);
@@ -1835,7 +1990,86 @@ end:
return ret;
}
-int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr)
+{
+ struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
+ struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
+ int err;
+
+ lockdep_assert_held(&dev->data_direct_lock);
+ mr->revoked = true;
+ err = mlx5r_umr_revoke_mr(mr);
+ if (WARN_ON(err))
+ return err;
+
+ ib_umem_dmabuf_revoke(umem_dmabuf);
+ return 0;
+}
+
+void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev)
+{
+ struct mlx5_ib_mr *mr, *next;
+
+ lockdep_assert_held(&dev->data_direct_lock);
+
+ list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) {
+ list_del(&mr->dd_node);
+ mlx5_ib_revoke_data_direct_mr(mr);
+ }
+}
+
+static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
+{
+ struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
+ struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
+ bool is_odp = is_odp_mr(mr);
+ bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
+ !to_ib_umem_dmabuf(mr->umem)->pinned;
+ int ret = 0;
+
+ if (is_odp)
+ mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);
+
+ if (is_odp_dma_buf)
+ dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv, NULL);
+
+ if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) {
+ ent = mr->mmkey.cache_ent;
+ /* upon storing to a clean temp entry - schedule its cleanup */
+ spin_lock_irq(&ent->mkeys_queue.lock);
+ if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
+ mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
+ msecs_to_jiffies(30 * 1000));
+ ent->tmp_cleanup_scheduled = true;
+ }
+ spin_unlock_irq(&ent->mkeys_queue.lock);
+ goto out;
+ }
+
+ if (ent) {
+ spin_lock_irq(&ent->mkeys_queue.lock);
+ ent->in_use--;
+ mr->mmkey.cache_ent = NULL;
+ spin_unlock_irq(&ent->mkeys_queue.lock);
+ }
+ ret = destroy_mkey(dev, mr);
+out:
+ if (is_odp) {
+ if (!ret)
+ to_ib_umem_odp(mr->umem)->private = NULL;
+ mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex);
+ }
+
+ if (is_odp_dma_buf) {
+ if (!ret)
+ to_ib_umem_dmabuf(mr->umem)->private = NULL;
+ dma_resv_unlock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
+ }
+
+ return ret;
+}
+
+static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr)
{
struct mlx5_ib_mr *mr = to_mmr(ibmr);
struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
@@ -1880,16 +2114,9 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
}
/* Stop DMA */
- if (mr->umem && mlx5r_umr_can_load_pas(dev, mr->umem->length))
- if (mlx5r_umr_revoke_mr(mr) ||
- cache_ent_find_and_store(dev, mr))
- mr->mmkey.cache_ent = NULL;
-
- if (!mr->mmkey.cache_ent) {
- rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
- if (rc)
- return rc;
- }
+ rc = mlx5_revoke_mr(mr);
+ if (rc)
+ return rc;
if (mr->umem) {
bool is_odp = is_odp_mr(mr);
@@ -1909,9 +2136,40 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
return 0;
}
+static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_mr *mr)
+{
+ struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr;
+ int ret;
+
+ ret = __mlx5_ib_dereg_mr(&mr->ibmr);
+ if (ret)
+ return ret;
+
+ mutex_lock(&dev->data_direct_lock);
+ if (!dd_crossed_mr->revoked)
+ list_del(&dd_crossed_mr->dd_node);
+
+ ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr);
+ mutex_unlock(&dev->data_direct_lock);
+ return ret;
+}
+
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+{
+ struct mlx5_ib_mr *mr = to_mmr(ibmr);
+ struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
+
+ if (mr->data_direct)
+ return dereg_crossing_data_direct_mr(dev, mr);
+
+ return __mlx5_ib_dereg_mr(ibmr);
+}
+
static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
int access_mode, int page_shift)
{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
void *mkc;
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
@@ -1924,6 +2182,9 @@ static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
MLX5_SET(mkc, mkc, umr_en, 1);
MLX5_SET(mkc, mkc, log_page_size, page_shift);
+ if (access_mode == MLX5_MKC_ACCESS_MODE_PA ||
+ access_mode == MLX5_MKC_ACCESS_MODE_MTT)
+ MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
}
static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,