Diffstat (limited to 'drivers/infiniband/hw/mlx5')
-rw-r--r--  drivers/infiniband/hw/mlx5/Makefile   |   1
-rw-r--r--  drivers/infiniband/hw/mlx5/counters.c |  30
-rw-r--r--  drivers/infiniband/hw/mlx5/counters.h |  13
-rw-r--r--  drivers/infiniband/hw/mlx5/cq.c       |  19
-rw-r--r--  drivers/infiniband/hw/mlx5/devx.c     |   6
-rw-r--r--  drivers/infiniband/hw/mlx5/dm.c       |   2
-rw-r--r--  drivers/infiniband/hw/mlx5/dmah.c     |  54
-rw-r--r--  drivers/infiniband/hw/mlx5/dmah.h     |  23
-rw-r--r--  drivers/infiniband/hw/mlx5/fs.c       | 121
-rw-r--r--  drivers/infiniband/hw/mlx5/fs.h       |   8
-rw-r--r--  drivers/infiniband/hw/mlx5/ib_rep.c   |   3
-rw-r--r--  drivers/infiniband/hw/mlx5/main.c     |  13
-rw-r--r--  drivers/infiniband/hw/mlx5/mlx5_ib.h  |  99
-rw-r--r--  drivers/infiniband/hw/mlx5/mr.c       | 116
-rw-r--r--  drivers/infiniband/hw/mlx5/odp.c      |  32
-rw-r--r--  drivers/infiniband/hw/mlx5/umr.c      | 307
-rw-r--r--  drivers/infiniband/hw/mlx5/umr.h      |  13
17 files changed, 694 insertions(+), 166 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile
index 11878ddf7cc7..dd7bb377f491 100644
--- a/drivers/infiniband/hw/mlx5/Makefile
+++ b/drivers/infiniband/hw/mlx5/Makefile
@@ -8,6 +8,7 @@ mlx5_ib-y := ah.o \
cq.o \
data_direct.o \
dm.o \
+ dmah.o \
doorbell.o \
fs.o \
gsi.o \
diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c
index a506fafd2b15..e042e0719ead 100644
--- a/drivers/infiniband/hw/mlx5/counters.c
+++ b/drivers/infiniband/hw/mlx5/counters.c
@@ -16,6 +16,18 @@ struct mlx5_ib_counter {
u32 type;
};
+struct mlx5_rdma_counter {
+ struct rdma_counter rdma_counter;
+
+ struct mlx5_fc *fc[MLX5_IB_OPCOUNTER_MAX];
+ struct xarray qpn_opfc_xa;
+};
+
+static struct mlx5_rdma_counter *to_mcounter(struct rdma_counter *counter)
+{
+ return container_of(counter, struct mlx5_rdma_counter, rdma_counter);
+}
+
#define INIT_Q_COUNTER(_name) \
{ .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)}
@@ -602,7 +614,7 @@ static int mlx5_ib_counter_dealloc(struct rdma_counter *counter)
return 0;
WARN_ON(!xa_empty(&mcounter->qpn_opfc_xa));
- mlx5r_fs_destroy_fcs(dev, counter);
+ mlx5r_fs_destroy_fcs(dev, mcounter->fc);
MLX5_SET(dealloc_q_counter_in, in, opcode,
MLX5_CMD_OP_DEALLOC_Q_COUNTER);
MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter->id);
@@ -612,6 +624,7 @@ static int mlx5_ib_counter_dealloc(struct rdma_counter *counter)
static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
struct ib_qp *qp, u32 port)
{
+ struct mlx5_rdma_counter *mcounter = to_mcounter(counter);
struct mlx5_ib_dev *dev = to_mdev(qp->device);
bool new = false;
int err;
@@ -635,7 +648,11 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
if (err)
goto fail_set_counter;
- err = mlx5r_fs_bind_op_fc(qp, counter, port);
+ if (!counter->mode.bind_opcnt)
+ return 0;
+
+ err = mlx5r_fs_bind_op_fc(qp, mcounter->fc, &mcounter->qpn_opfc_xa,
+ port);
if (err)
goto fail_bind_op_fc;
@@ -655,9 +672,12 @@ fail_set_counter:
static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp, u32 port)
{
struct rdma_counter *counter = qp->counter;
+ struct mlx5_rdma_counter *mcounter;
int err;
- mlx5r_fs_unbind_op_fc(qp, counter);
+ mcounter = to_mcounter(counter);
+
+ mlx5r_fs_unbind_op_fc(qp, &mcounter->qpn_opfc_xa);
err = mlx5_ib_qp_set_counter(qp, NULL);
if (err)
@@ -666,7 +686,9 @@ static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp, u32 port)
return 0;
fail_set_counter:
- mlx5r_fs_bind_op_fc(qp, counter, port);
+ if (counter->mode.bind_opcnt)
+ mlx5r_fs_bind_op_fc(qp, mcounter->fc,
+ &mcounter->qpn_opfc_xa, port);
return err;
}
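
The counters.c hunks above move struct mlx5_rdma_counter out of the shared header and make to_mcounter() a file-local helper. Below is a minimal, self-contained user-space sketch of the container_of idiom that helper relies on; the types and field names are illustrative stand-ins, not the driver's real definitions.

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct rdma_counter { int id; };

struct mlx5_rdma_counter {
	struct rdma_counter rdma_counter;	/* embedded generic part */
	void *fc[4];				/* driver-private state */
};

static struct mlx5_rdma_counter *to_mcounter(struct rdma_counter *counter)
{
	return container_of(counter, struct mlx5_rdma_counter, rdma_counter);
}

int main(void)
{
	struct mlx5_rdma_counter mc = { .rdma_counter = { .id = 7 } };

	/* A pointer to the embedded member round-trips to the wrapper. */
	printf("%d\n", to_mcounter(&mc.rdma_counter)->rdma_counter.id);
	return 0;
}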
diff --git a/drivers/infiniband/hw/mlx5/counters.h b/drivers/infiniband/hw/mlx5/counters.h
index bd03cee42014..a04e7dd59455 100644
--- a/drivers/infiniband/hw/mlx5/counters.h
+++ b/drivers/infiniband/hw/mlx5/counters.h
@@ -8,19 +8,6 @@
#include "mlx5_ib.h"
-struct mlx5_rdma_counter {
- struct rdma_counter rdma_counter;
-
- struct mlx5_fc *fc[MLX5_IB_OPCOUNTER_MAX];
- struct xarray qpn_opfc_xa;
-};
-
-static inline struct mlx5_rdma_counter *
-to_mcounter(struct rdma_counter *counter)
-{
- return container_of(counter, struct mlx5_rdma_counter, rdma_counter);
-}
-
int mlx5_ib_counters_init(struct mlx5_ib_dev *dev);
void mlx5_ib_counters_cleanup(struct mlx5_ib_dev *dev);
void mlx5_ib_counters_clear_description(struct ib_counters *counters);
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 1aa5311b03e9..9c8003a78334 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -1055,20 +1055,31 @@ err_cqb:
return err;
}
-int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
+int mlx5_ib_pre_destroy_cq(struct ib_cq *cq)
{
struct mlx5_ib_dev *dev = to_mdev(cq->device);
struct mlx5_ib_cq *mcq = to_mcq(cq);
+
+ return mlx5_core_destroy_cq(dev->mdev, &mcq->mcq);
+}
+
+void mlx5_ib_post_destroy_cq(struct ib_cq *cq)
+{
+ destroy_cq_kernel(to_mdev(cq->device), to_mcq(cq));
+}
+
+int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
+{
int ret;
- ret = mlx5_core_destroy_cq(dev->mdev, &mcq->mcq);
+ ret = mlx5_ib_pre_destroy_cq(cq);
if (ret)
return ret;
if (udata)
- destroy_cq_user(mcq, udata);
+ destroy_cq_user(to_mcq(cq), udata);
else
- destroy_cq_kernel(dev, mcq);
+ mlx5_ib_post_destroy_cq(cq);
return 0;
}
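
The cq.c change splits CQ teardown into a fallible pre-phase (destroy the HW CQ) and an infallible post-phase (free SW state), and re-expresses mlx5_ib_destroy_cq() in terms of both. Here is a hedged user-space analogue of that two-phase destroy pattern; all names and the failure condition are made up for illustration.

#include <stdio.h>
#include <stdlib.h>

struct cq { int hw_alive; int *sw_state; };

static int pre_destroy(struct cq *cq)
{
	if (!cq->hw_alive)
		return -1;	/* e.g. the HW destroy command failed */
	cq->hw_alive = 0;	/* after this, no new completions arrive */
	return 0;
}

static void post_destroy(struct cq *cq)
{
	free(cq->sw_state);	/* phase 2 cannot fail */
	cq->sw_state = NULL;
}

int main(void)
{
	struct cq cq = { .hw_alive = 1, .sw_state = malloc(64) };

	if (pre_destroy(&cq))	/* abort the whole destroy on failure */
		return 1;
	/* the caller may drain or quiesce between the two phases */
	post_destroy(&cq);
	puts("destroyed");
	return 0;
}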
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 843dcd312242..028d9f031dde 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -159,7 +159,7 @@ int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user, u64 req_ucaps)
uctx = MLX5_ADDR_OF(create_uctx_in, in, uctx);
if (is_user &&
(MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RAW_TX) &&
- capable(CAP_NET_RAW))
+ rdma_dev_has_raw_cap(&dev->ib_dev))
cap |= MLX5_UCTX_CAP_RAW_TX;
if (is_user &&
(MLX5_CAP_GEN(dev->mdev, uctx_cap) &
@@ -1393,6 +1393,10 @@ static int devx_handle_mkey_create(struct mlx5_ib_dev *dev,
}
MLX5_SET(create_mkey_in, in, mkey_umem_valid, 1);
+ /* TPH is not allowed to bypass the regular kernel's verbs flow */
+ MLX5_SET(mkc, mkc, pcie_tph_en, 0);
+ MLX5_SET(mkc, mkc, pcie_tph_steering_tag_index,
+ MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX);
return 0;
}
diff --git a/drivers/infiniband/hw/mlx5/dm.c b/drivers/infiniband/hw/mlx5/dm.c
index b4c97fb62abf..9ded2b7c1e31 100644
--- a/drivers/infiniband/hw/mlx5/dm.c
+++ b/drivers/infiniband/hw/mlx5/dm.c
@@ -282,7 +282,7 @@ static struct ib_dm *handle_alloc_dm_memic(struct ib_ucontext *ctx,
int err;
u64 address;
- if (!MLX5_CAP_DEV_MEM(dm_db->dev, memic))
+ if (!dm_db || !MLX5_CAP_DEV_MEM(dm_db->dev, memic))
return ERR_PTR(-EOPNOTSUPP);
dm = kzalloc(sizeof(*dm), GFP_KERNEL);
diff --git a/drivers/infiniband/hw/mlx5/dmah.c b/drivers/infiniband/hw/mlx5/dmah.c
new file mode 100644
index 000000000000..362a88992ffa
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/dmah.c
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include <rdma/uverbs_std_types.h>
+#include <linux/pci-tph.h>
+#include "dmah.h"
+
+#define UVERBS_MODULE_NAME mlx5_ib
+#include <rdma/uverbs_named_ioctl.h>
+
+static int mlx5_ib_alloc_dmah(struct ib_dmah *ibdmah,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct mlx5_core_dev *mdev = to_mdev(ibdmah->device)->mdev;
+ struct mlx5_ib_dmah *dmah = to_mdmah(ibdmah);
+ u16 st_bits = BIT(IB_DMAH_CPU_ID_EXISTS) |
+ BIT(IB_DMAH_MEM_TYPE_EXISTS);
+ int err;
+
+ /* PH is a must for TPH following PCIe spec 6.2-1.0 */
+ if (!(ibdmah->valid_fields & BIT(IB_DMAH_PH_EXISTS)))
+ return -EINVAL;
+
+ /* ST is optional; however, partial data for it is not allowed */
+ if (ibdmah->valid_fields & st_bits) {
+ if ((ibdmah->valid_fields & st_bits) != st_bits)
+ return -EINVAL;
+ err = mlx5_st_alloc_index(mdev, ibdmah->mem_type,
+ ibdmah->cpu_id, &dmah->st_index);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int mlx5_ib_dealloc_dmah(struct ib_dmah *ibdmah,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct mlx5_ib_dmah *dmah = to_mdmah(ibdmah);
+ struct mlx5_core_dev *mdev = to_mdev(ibdmah->device)->mdev;
+
+ if (ibdmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS))
+ return mlx5_st_dealloc_index(mdev, dmah->st_index);
+
+ return 0;
+}
+
+const struct ib_device_ops mlx5_ib_dev_dmah_ops = {
+ .alloc_dmah = mlx5_ib_alloc_dmah,
+ .dealloc_dmah = mlx5_ib_dealloc_dmah,
+};
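
The new alloc handler enforces "PH mandatory, ST all-or-nothing" with bitmask checks. A runnable sketch of that validation rule follows; the bit values are stand-ins for the IB_DMAH_*_EXISTS enums, not the real constants.

#include <stdio.h>

#define PH_EXISTS	(1u << 0)
#define CPU_ID_EXISTS	(1u << 1)
#define MEM_TYPE_EXISTS	(1u << 2)

static int validate_dmah_fields(unsigned int valid_fields)
{
	unsigned int st_bits = CPU_ID_EXISTS | MEM_TYPE_EXISTS;

	if (!(valid_fields & PH_EXISTS))
		return -1;			/* PH is mandatory */

	/* ST is optional, but partial data for it is rejected */
	if ((valid_fields & st_bits) && (valid_fields & st_bits) != st_bits)
		return -1;
	return 0;
}

int main(void)
{
	printf("%d\n", validate_dmah_fields(PH_EXISTS));		/* 0 */
	printf("%d\n", validate_dmah_fields(PH_EXISTS | CPU_ID_EXISTS)); /* -1 */
	printf("%d\n", validate_dmah_fields(PH_EXISTS | CPU_ID_EXISTS |
					    MEM_TYPE_EXISTS));		/* 0 */
	return 0;
}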
diff --git a/drivers/infiniband/hw/mlx5/dmah.h b/drivers/infiniband/hw/mlx5/dmah.h
new file mode 100644
index 000000000000..68de72b4744a
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/dmah.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#ifndef _MLX5_IB_DMAH_H
+#define _MLX5_IB_DMAH_H
+
+#include "mlx5_ib.h"
+
+extern const struct ib_device_ops mlx5_ib_dev_dmah_ops;
+
+struct mlx5_ib_dmah {
+ struct ib_dmah ibdmah;
+ u16 st_index;
+};
+
+static inline struct mlx5_ib_dmah *to_mdmah(struct ib_dmah *ibdmah)
+{
+ return container_of(ibdmah, struct mlx5_ib_dmah, ibdmah);
+}
+
+#endif /* _MLX5_IB_DMAH_H */
diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c
index 680627f1de33..b0f7663c24c1 100644
--- a/drivers/infiniband/hw/mlx5/fs.c
+++ b/drivers/infiniband/hw/mlx5/fs.c
@@ -1012,14 +1012,14 @@ static int get_per_qp_prio(struct mlx5_ib_dev *dev,
return 0;
}
-static struct mlx5_per_qp_opfc *
-get_per_qp_opfc(struct mlx5_rdma_counter *mcounter, u32 qp_num, bool *new)
+static struct mlx5_per_qp_opfc *get_per_qp_opfc(struct xarray *qpn_opfc_xa,
+ u32 qp_num, bool *new)
{
struct mlx5_per_qp_opfc *per_qp_opfc;
*new = false;
- per_qp_opfc = xa_load(&mcounter->qpn_opfc_xa, qp_num);
+ per_qp_opfc = xa_load(qpn_opfc_xa, qp_num);
if (per_qp_opfc)
return per_qp_opfc;
per_qp_opfc = kzalloc(sizeof(*per_qp_opfc), GFP_KERNEL);
@@ -1032,7 +1032,8 @@ get_per_qp_opfc(struct mlx5_rdma_counter *mcounter, u32 qp_num, bool *new)
}
static int add_op_fc_rules(struct mlx5_ib_dev *dev,
- struct mlx5_rdma_counter *mcounter,
+ struct mlx5_fc *fc_arr[MLX5_IB_OPCOUNTER_MAX],
+ struct xarray *qpn_opfc_xa,
struct mlx5_per_qp_opfc *per_qp_opfc,
struct mlx5_ib_flow_prio *prio,
enum mlx5_ib_optional_counter_type type,
@@ -1055,7 +1056,7 @@ static int add_op_fc_rules(struct mlx5_ib_dev *dev,
return 0;
}
- opfc->fc = mcounter->fc[type];
+ opfc->fc = fc_arr[type];
spec = kcalloc(MAX_OPFC_RULES, sizeof(*spec), GFP_KERNEL);
if (!spec) {
@@ -1148,8 +1149,7 @@ static int add_op_fc_rules(struct mlx5_ib_dev *dev,
}
prio->refcount += spec_num;
- err = xa_err(xa_store(&mcounter->qpn_opfc_xa, qp_num, per_qp_opfc,
- GFP_KERNEL));
+ err = xa_err(xa_store(qpn_opfc_xa, qp_num, per_qp_opfc, GFP_KERNEL));
if (err)
goto del_rules;
@@ -1168,8 +1168,9 @@ null_fc:
return err;
}
-static bool is_fc_shared_and_in_use(struct mlx5_rdma_counter *mcounter,
- u32 type, struct mlx5_fc **fc)
+static bool
+is_fc_shared_and_in_use(struct mlx5_fc *fc_arr[MLX5_IB_OPCOUNTER_MAX], u32 type,
+ struct mlx5_fc **fc)
{
u32 shared_fc_type;
@@ -1190,7 +1191,7 @@ static bool is_fc_shared_and_in_use(struct mlx5_rdma_counter *mcounter,
return false;
}
- *fc = mcounter->fc[shared_fc_type];
+ *fc = fc_arr[shared_fc_type];
if (!(*fc))
return false;
@@ -1198,24 +1199,23 @@ static bool is_fc_shared_and_in_use(struct mlx5_rdma_counter *mcounter,
}
void mlx5r_fs_destroy_fcs(struct mlx5_ib_dev *dev,
- struct rdma_counter *counter)
+ struct mlx5_fc *fc_arr[MLX5_IB_OPCOUNTER_MAX])
{
- struct mlx5_rdma_counter *mcounter = to_mcounter(counter);
struct mlx5_fc *in_use_fc;
int i;
for (i = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP;
i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; i++) {
- if (!mcounter->fc[i])
+ if (!fc_arr[i])
continue;
- if (is_fc_shared_and_in_use(mcounter, i, &in_use_fc)) {
- mcounter->fc[i] = NULL;
+ if (is_fc_shared_and_in_use(fc_arr, i, &in_use_fc)) {
+ fc_arr[i] = NULL;
continue;
}
- mlx5_fc_destroy(dev->mdev, mcounter->fc[i]);
- mcounter->fc[i] = NULL;
+ mlx5_fc_destroy(dev->mdev, fc_arr[i]);
+ fc_arr[i] = NULL;
}
}
@@ -1359,16 +1359,15 @@ void mlx5_ib_fs_remove_op_fc(struct mlx5_ib_dev *dev,
put_per_qp_prio(dev, type);
}
-void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct rdma_counter *counter)
+void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct xarray *qpn_opfc_xa)
{
- struct mlx5_rdma_counter *mcounter = to_mcounter(counter);
- struct mlx5_ib_dev *dev = to_mdev(counter->device);
+ struct mlx5_ib_dev *dev = to_mdev(qp->device);
struct mlx5_per_qp_opfc *per_qp_opfc;
struct mlx5_ib_op_fc *in_use_opfc;
struct mlx5_ib_flow_prio *prio;
int i, j;
- per_qp_opfc = xa_load(&mcounter->qpn_opfc_xa, qp->qp_num);
+ per_qp_opfc = xa_load(qpn_opfc_xa, qp->qp_num);
if (!per_qp_opfc)
return;
@@ -1394,13 +1393,13 @@ void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct rdma_counter *counter)
}
kfree(per_qp_opfc);
- xa_erase(&mcounter->qpn_opfc_xa, qp->qp_num);
+ xa_erase(qpn_opfc_xa, qp->qp_num);
}
-int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter,
- u32 port)
+int mlx5r_fs_bind_op_fc(struct ib_qp *qp,
+ struct mlx5_fc *fc_arr[MLX5_IB_OPCOUNTER_MAX],
+ struct xarray *qpn_opfc_xa, u32 port)
{
- struct mlx5_rdma_counter *mcounter = to_mcounter(counter);
struct mlx5_ib_dev *dev = to_mdev(qp->device);
struct mlx5_per_qp_opfc *per_qp_opfc;
struct mlx5_ib_flow_prio *prio;
@@ -1410,9 +1409,6 @@ int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter,
int i, err, per_qp_type;
bool new;
- if (!counter->mode.bind_opcnt)
- return 0;
-
cnts = &dev->port[port - 1].cnts;
for (i = 0; i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES; i++) {
@@ -1424,23 +1420,22 @@ int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter,
prio = get_opfc_prio(dev, per_qp_type);
WARN_ON(!prio->flow_table);
- if (is_fc_shared_and_in_use(mcounter, per_qp_type, &in_use_fc))
- mcounter->fc[per_qp_type] = in_use_fc;
+ if (is_fc_shared_and_in_use(fc_arr, per_qp_type, &in_use_fc))
+ fc_arr[per_qp_type] = in_use_fc;
- if (!mcounter->fc[per_qp_type]) {
- mcounter->fc[per_qp_type] = mlx5_fc_create(dev->mdev,
- false);
- if (IS_ERR(mcounter->fc[per_qp_type]))
- return PTR_ERR(mcounter->fc[per_qp_type]);
+ if (!fc_arr[per_qp_type]) {
+ fc_arr[per_qp_type] = mlx5_fc_create(dev->mdev, false);
+ if (IS_ERR(fc_arr[per_qp_type]))
+ return PTR_ERR(fc_arr[per_qp_type]);
}
- per_qp_opfc = get_per_qp_opfc(mcounter, qp->qp_num, &new);
+ per_qp_opfc = get_per_qp_opfc(qpn_opfc_xa, qp->qp_num, &new);
if (!per_qp_opfc) {
err = -ENOMEM;
goto free_fc;
}
- err = add_op_fc_rules(dev, mcounter, per_qp_opfc, prio,
- per_qp_type, qp->qp_num, port);
+ err = add_op_fc_rules(dev, fc_arr, qpn_opfc_xa, per_qp_opfc,
+ prio, per_qp_type, qp->qp_num, port);
if (err)
goto del_rules;
}
@@ -1448,12 +1443,12 @@ int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter,
return 0;
del_rules:
- mlx5r_fs_unbind_op_fc(qp, counter);
+ mlx5r_fs_unbind_op_fc(qp, qpn_opfc_xa);
if (new)
kfree(per_qp_opfc);
free_fc:
- if (xa_empty(&mcounter->qpn_opfc_xa))
- mlx5r_fs_destroy_fcs(dev, counter);
+ if (xa_empty(qpn_opfc_xa))
+ mlx5r_fs_destroy_fcs(dev, fc_arr);
return err;
}
@@ -1966,7 +1961,8 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority,
break;
case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX:
case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX:
- if (ib_port == 0 || user_priority > MLX5_RDMA_TRANSPORT_BYPASS_PRIO)
+ if (ib_port == 0 ||
+ user_priority >= MLX5_RDMA_TRANSPORT_BYPASS_PRIO)
return ERR_PTR(-EINVAL);
ret = mlx5_ib_fill_transport_ns_info(dev, ns_type, &flags,
&vport_idx, &vport,
@@ -2016,10 +2012,10 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority,
prio = &dev->flow_db->rdma_tx[priority];
break;
case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX:
- prio = &dev->flow_db->rdma_transport_rx[ib_port - 1];
+ prio = &dev->flow_db->rdma_transport_rx[priority][ib_port - 1];
break;
case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX:
- prio = &dev->flow_db->rdma_transport_tx[ib_port - 1];
+ prio = &dev->flow_db->rdma_transport_tx[priority][ib_port - 1];
break;
default:
	return ERR_PTR(-EINVAL);
}
@@ -2458,7 +2454,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
struct mlx5_ib_dev *dev;
u32 flags;
- if (!capable(CAP_NET_RAW))
+ if (!rdma_uattrs_has_raw_cap(attrs))
return -EPERM;
fs_matcher = uverbs_attr_get_obj(attrs,
@@ -2989,7 +2985,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE)(
u32 ft_id;
int err;
- if (!capable(CAP_NET_RAW))
+ if (!rdma_dev_has_raw_cap(&dev->ib_dev))
return -EPERM;
err = uverbs_get_const(&ib_uapi_ft_type, attrs,
@@ -3466,31 +3462,40 @@ static const struct ib_device_ops flow_ops = {
int mlx5_ib_fs_init(struct mlx5_ib_dev *dev)
{
+ int i, j;
+
dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL);
if (!dev->flow_db)
return -ENOMEM;
- dev->flow_db->rdma_transport_rx = kcalloc(dev->num_ports,
- sizeof(struct mlx5_ib_flow_prio),
- GFP_KERNEL);
- if (!dev->flow_db->rdma_transport_rx)
- goto free_flow_db;
+ for (i = 0; i < MLX5_RDMA_TRANSPORT_BYPASS_PRIO; i++) {
+ dev->flow_db->rdma_transport_rx[i] =
+ kcalloc(dev->num_ports,
+ sizeof(struct mlx5_ib_flow_prio), GFP_KERNEL);
+ if (!dev->flow_db->rdma_transport_rx[i])
+ goto free_rdma_transport_rx;
+ }
- dev->flow_db->rdma_transport_tx = kcalloc(dev->num_ports,
- sizeof(struct mlx5_ib_flow_prio),
- GFP_KERNEL);
- if (!dev->flow_db->rdma_transport_tx)
- goto free_rdma_transport_rx;
+ for (j = 0; j < MLX5_RDMA_TRANSPORT_BYPASS_PRIO; j++) {
+ dev->flow_db->rdma_transport_tx[j] =
+ kcalloc(dev->num_ports,
+ sizeof(struct mlx5_ib_flow_prio), GFP_KERNEL);
+ if (!dev->flow_db->rdma_transport_tx[j])
+ goto free_rdma_transport_tx;
+ }
mutex_init(&dev->flow_db->lock);
ib_set_device_ops(&dev->ib_dev, &flow_ops);
return 0;
+free_rdma_transport_tx:
+ while (j--)
+ kfree(dev->flow_db->rdma_transport_tx[j]);
free_rdma_transport_rx:
- kfree(dev->flow_db->rdma_transport_rx);
-free_flow_db:
+ while (i--)
+ kfree(dev->flow_db->rdma_transport_rx[i]);
kfree(dev->flow_db);
return -ENOMEM;
}
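
fs.c now keeps one flow-prio array per RDMA_TRANSPORT priority, each allocated per port, and unwinds partial allocations with the classic while (i--) pattern. A compilable sketch of that unwind idiom, with stand-in names and sizes:

#include <stdio.h>
#include <stdlib.h>

#define NPRIO 4	/* stand-in for MLX5_RDMA_TRANSPORT_BYPASS_PRIO */

static int alloc_prios(void *tbl[NPRIO], size_t per_port, int num_ports)
{
	int i;

	for (i = 0; i < NPRIO; i++) {
		tbl[i] = calloc(num_ports, per_port);
		if (!tbl[i])
			goto unwind;
	}
	return 0;

unwind:
	while (i--)		/* frees only what was already allocated */
		free(tbl[i]);
	return -1;
}

int main(void)
{
	void *tbl[NPRIO];

	if (!alloc_prios(tbl, 64, 2)) {
		puts("allocated");
		for (int i = 0; i < NPRIO; i++)
			free(tbl[i]);
	}
	return 0;
}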
diff --git a/drivers/infiniband/hw/mlx5/fs.h b/drivers/infiniband/hw/mlx5/fs.h
index 2ebe86e5be10..7abba0e2837c 100644
--- a/drivers/infiniband/hw/mlx5/fs.h
+++ b/drivers/infiniband/hw/mlx5/fs.h
@@ -13,6 +13,8 @@ void mlx5_ib_fs_cleanup_anchor(struct mlx5_ib_dev *dev);
static inline void mlx5_ib_fs_cleanup(struct mlx5_ib_dev *dev)
{
+ int i;
+
/* When a steering anchor is created, a special flow table is also
* created for the user to reference. Since the user can reference it,
* the kernel cannot trust that when the user destroys the steering
@@ -25,8 +27,10 @@ static inline void mlx5_ib_fs_cleanup(struct mlx5_ib_dev *dev)
* is a safe assumption that all references are gone.
*/
mlx5_ib_fs_cleanup_anchor(dev);
- kfree(dev->flow_db->rdma_transport_tx);
- kfree(dev->flow_db->rdma_transport_rx);
+ for (i = 0; i < MLX5_RDMA_TRANSPORT_BYPASS_PRIO; i++)
+ kfree(dev->flow_db->rdma_transport_tx[i]);
+ for (i = 0; i < MLX5_RDMA_TRANSPORT_BYPASS_PRIO; i++)
+ kfree(dev->flow_db->rdma_transport_rx[i]);
kfree(dev->flow_db);
}
#endif /* _MLX5_IB_FS_H */
diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index 49af1cfbe6d1..cc8859d3c2f5 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -88,7 +88,8 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
else
return mlx5_ib_set_vport_rep(lag_master, rep, vport_index);
- ibdev = ib_alloc_device(mlx5_ib_dev, ib_dev);
+ ibdev = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev,
+ mlx5_core_net(lag_master));
if (!ibdev)
return -ENOMEM;
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index df6557ddbdfc..d456e4fde3e1 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -50,6 +50,7 @@
#include <rdma/ib_ucaps.h>
#include "macsec.h"
#include "data_direct.h"
+#include "dmah.h"
#define UVERBS_MODULE_NAME mlx5_ib
#include <rdma/uverbs_named_ioctl.h>
@@ -4190,7 +4191,9 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
.modify_port = mlx5_ib_modify_port,
.modify_qp = mlx5_ib_modify_qp,
.modify_srq = mlx5_ib_modify_srq,
+ .pre_destroy_cq = mlx5_ib_pre_destroy_cq,
.poll_cq = mlx5_ib_poll_cq,
+ .post_destroy_cq = mlx5_ib_post_destroy_cq,
.post_recv = mlx5_ib_post_recv_nodrain,
.post_send = mlx5_ib_post_send_nodrain,
.post_srq_recv = mlx5_ib_post_srq_recv,
@@ -4212,6 +4215,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah),
INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs),
INIT_RDMA_OBJ_SIZE(ib_cq, mlx5_ib_cq, ibcq),
+ INIT_RDMA_OBJ_SIZE(ib_dmah, mlx5_ib_dmah, ibdmah),
INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd),
INIT_RDMA_OBJ_SIZE(ib_qp, mlx5_ib_qp, ibqp),
INIT_RDMA_OBJ_SIZE(ib_srq, mlx5_ib_srq, ibsrq),
@@ -4339,6 +4343,9 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM)
ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dm_ops);
+ if (mdev->st)
+ ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dmah_ops);
+
ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops);
if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS))
@@ -4824,7 +4831,8 @@ static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent,
!MLX5_CAP_GEN_2(mparent->mdev, multiplane_qp_ud))
return ERR_PTR(-EOPNOTSUPP);
- mplane = ib_alloc_device(mlx5_ib_dev, ib_dev);
+ mplane = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev,
+ mlx5_core_net(mparent->mdev));
if (!mplane)
return ERR_PTR(-ENOMEM);
@@ -4938,7 +4946,8 @@ static int mlx5r_probe(struct auxiliary_device *adev,
num_ports = max(MLX5_CAP_GEN(mdev, num_ports),
MLX5_CAP_GEN(mdev, num_vhca_ports));
- dev = ib_alloc_device(mlx5_ib_dev, ib_dev);
+ dev = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev,
+ mlx5_core_net(mdev));
if (!dev)
return -ENOMEM;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index fde859d207ae..7ffc7ee92cf0 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -104,19 +104,6 @@ unsigned long __mlx5_umem_find_best_quantized_pgoff(
__mlx5_bit_sz(typ, page_offset_fld), 0, scale, \
page_offset_quantized)
-static inline unsigned long
-mlx5_umem_dmabuf_find_best_pgsz(struct ib_umem_dmabuf *umem_dmabuf)
-{
- /*
- * mkeys used for dmabuf are fixed at PAGE_SIZE because we must be able
- * to hold any sgl after a move operation. Ideally the mkc page size
- * could be changed at runtime to be optimal, but right now the driver
- * cannot do that.
- */
- return ib_umem_find_best_pgsz(&umem_dmabuf->umem, PAGE_SIZE,
- umem_dmabuf->umem.iova);
-}
-
enum {
MLX5_IB_MMAP_OFFSET_START = 9,
MLX5_IB_MMAP_OFFSET_END = 255,
@@ -320,8 +307,8 @@ struct mlx5_ib_flow_db {
struct mlx5_ib_flow_prio rdma_tx[MLX5_IB_NUM_FLOW_FT];
struct mlx5_ib_flow_prio opfcs[MLX5_IB_OPCOUNTER_MAX];
struct mlx5_flow_table *lag_demux_ft;
- struct mlx5_ib_flow_prio *rdma_transport_rx;
- struct mlx5_ib_flow_prio *rdma_transport_tx;
+ struct mlx5_ib_flow_prio *rdma_transport_rx[MLX5_RDMA_TRANSPORT_BYPASS_PRIO];
+ struct mlx5_ib_flow_prio *rdma_transport_tx[MLX5_RDMA_TRANSPORT_BYPASS_PRIO];
/* Protect flow steering bypass flow tables
* when add/del flow rules.
* only single add/removal of flow steering rule could be done
@@ -352,6 +339,7 @@ struct mlx5_ib_flow_db {
#define MLX5_IB_UPD_XLT_ACCESS BIT(5)
#define MLX5_IB_UPD_XLT_INDIRECT BIT(6)
#define MLX5_IB_UPD_XLT_DOWNGRADE BIT(7)
+#define MLX5_IB_UPD_XLT_KEEP_PGSZ BIT(8)
/* Private QP creation flags to be passed in ib_qp_init_attr.create_flags.
*
@@ -650,8 +638,13 @@ enum mlx5_mkey_type {
MLX5_MKEY_IMPLICIT_CHILD,
};
+/* Used for non-existent ph value */
+#define MLX5_IB_NO_PH 0xff
+
struct mlx5r_cache_rb_key {
u8 ats:1;
+ u8 ph;
+ u16 st_index;
unsigned int access_mode;
unsigned int access_flags;
unsigned int ndescs;
@@ -739,6 +732,8 @@ struct mlx5_ib_mr {
struct mlx5_ib_mr *dd_crossed_mr;
struct list_head dd_node;
u8 revoked :1;
+ /* Indicates previous dmabuf page fault occurred */
+ u8 dmabuf_faulted:1;
struct mlx5_ib_mkey null_mmkey;
};
};
@@ -899,13 +894,14 @@ void mlx5_ib_fs_remove_op_fc(struct mlx5_ib_dev *dev,
struct mlx5_ib_op_fc *opfc,
enum mlx5_ib_optional_counter_type type);
-int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter,
- u32 port);
+int mlx5r_fs_bind_op_fc(struct ib_qp *qp,
+ struct mlx5_fc *fc_arr[MLX5_IB_OPCOUNTER_MAX],
+ struct xarray *qpn_opfc_xa, u32 port);
-void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct rdma_counter *counter);
+void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct xarray *qpn_opfc_xa);
void mlx5r_fs_destroy_fcs(struct mlx5_ib_dev *dev,
- struct rdma_counter *counter);
+ struct mlx5_fc *fc_arr[MLX5_IB_OPCOUNTER_MAX]);
struct mlx5_ib_multiport_info;
@@ -1372,16 +1368,20 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
struct uverbs_attr_bundle *attrs);
int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+int mlx5_ib_pre_destroy_cq(struct ib_cq *cq);
+void mlx5_ib_post_destroy_cq(struct ib_cq *cq);
int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc);
struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int access_flags,
+ struct ib_dmah *dmah,
struct ib_udata *udata);
struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start,
u64 length, u64 virt_addr,
int fd, int access_flags,
+ struct ib_dmah *dmah,
struct uverbs_attr_bundle *attrs);
int mlx5_ib_advise_mr(struct ib_pd *pd,
enum ib_uverbs_advise_mr_advice advice,
@@ -1748,20 +1748,71 @@ static inline u32 smi_to_native_portnum(struct mlx5_ib_dev *dev, u32 port)
return (port - 1) / dev->num_ports + 1;
}
+static inline unsigned int get_max_log_entity_size_cap(struct mlx5_ib_dev *dev,
+ int access_mode)
+{
+ int max_log_size = 0;
+
+ if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
+ max_log_size =
+ MLX5_CAP_GEN_2(dev->mdev, max_mkey_log_entity_size_mtt);
+ else if (access_mode == MLX5_MKC_ACCESS_MODE_KSM)
+ max_log_size = MLX5_CAP_GEN_2(
+ dev->mdev, max_mkey_log_entity_size_fixed_buffer);
+
+ if (!max_log_size ||
+ (max_log_size > 31 &&
+ !MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5)))
+ max_log_size = 31;
+
+ return max_log_size;
+}
+
+static inline unsigned int get_min_log_entity_size_cap(struct mlx5_ib_dev *dev,
+ int access_mode)
+{
+ int min_log_size = 0;
+
+ if (access_mode == MLX5_MKC_ACCESS_MODE_KSM &&
+ MLX5_CAP_GEN_2(dev->mdev,
+ min_mkey_log_entity_size_fixed_buffer_valid))
+ min_log_size = MLX5_CAP_GEN_2(
+ dev->mdev, min_mkey_log_entity_size_fixed_buffer);
+ else
+ min_log_size =
+ MLX5_CAP_GEN_2(dev->mdev, log_min_mkey_entity_size);
+
+ min_log_size = max(min_log_size, MLX5_ADAPTER_PAGE_SHIFT);
+ return min_log_size;
+}
+
/*
* For mkc users, instead of a page_offset the command has a start_iova which
* specifies both the page_offset and the on-the-wire IOVA
*/
static __always_inline unsigned long
mlx5_umem_mkc_find_best_pgsz(struct mlx5_ib_dev *dev, struct ib_umem *umem,
- u64 iova)
+ u64 iova, int access_mode)
{
- int page_size_bits =
- MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5) ? 6 : 5;
- unsigned long bitmap =
- __mlx5_log_page_size_to_bitmap(page_size_bits, 0);
+ unsigned int max_log_entity_size_cap, min_log_entity_size_cap;
+ unsigned long bitmap;
+
+ max_log_entity_size_cap = get_max_log_entity_size_cap(dev, access_mode);
+ min_log_entity_size_cap = get_min_log_entity_size_cap(dev, access_mode);
+
+ bitmap = GENMASK_ULL(max_log_entity_size_cap, min_log_entity_size_cap);
return ib_umem_find_best_pgsz(umem, bitmap, iova);
}
+static inline unsigned long
+mlx5_umem_dmabuf_find_best_pgsz(struct ib_umem_dmabuf *umem_dmabuf,
+ int access_mode)
+{
+ return mlx5_umem_mkc_find_best_pgsz(to_mdev(umem_dmabuf->umem.ibdev),
+ &umem_dmabuf->umem,
+ umem_dmabuf->umem.iova,
+ access_mode);
+}
+
#endif /* MLX5_IB_H */
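
The new helpers in mlx5_ib.h derive a bitmap of candidate mkey page sizes from the device's min/max log-entity-size caps and hand it to ib_umem_find_best_pgsz(). A standalone sketch of the bitmap arithmetic, assuming caps of 12 (4 KiB) and 21 (2 MiB):

#include <stdio.h>
#include <stdint.h>

/* Same contract as the kernel macro: bits l..h set. */
#define GENMASK_ULL(h, l) \
	((~0ULL << (l)) & (~0ULL >> (63 - (h))))

int main(void)
{
	unsigned int min_log = 12, max_log = 21;	/* assumed caps */
	uint64_t bitmap = GENMASK_ULL(max_log, min_log);

	/* Each set bit k is a candidate page size of 2^k bytes;
	 * ib_umem_find_best_pgsz() then picks the largest one the
	 * umem layout and iova actually permit. */
	for (unsigned int k = min_log; k <= max_log; k++)
		if (bitmap & (1ULL << k))
			printf("candidate: %llu bytes\n",
			       (unsigned long long)1 << k);
	return 0;
}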
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index bd35e75d9ce5..1317f2cb38a4 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -44,6 +44,7 @@
#include "mlx5_ib.h"
#include "umr.h"
#include "data_direct.h"
+#include "dmah.h"
enum {
MAX_PENDING_REG_MR = 8,
@@ -57,7 +58,7 @@ create_mkey_callback(int status, struct mlx5_async_work *context);
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
u64 iova, int access_flags,
unsigned long page_size, bool populate,
- int access_mode);
+ int access_mode, u16 st_index, u8 ph);
static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr);
static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
@@ -256,6 +257,14 @@ static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
get_mkc_octo_size(ent->rb_key.access_mode,
ent->rb_key.ndescs));
MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
+
+ if (ent->rb_key.ph != MLX5_IB_NO_PH) {
+ MLX5_SET(mkc, mkc, pcie_tph_en, 1);
+ MLX5_SET(mkc, mkc, pcie_tph_ph, ent->rb_key.ph);
+ if (ent->rb_key.st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX)
+ MLX5_SET(mkc, mkc, pcie_tph_steering_tag_index,
+ ent->rb_key.st_index);
+ }
}
/* Asynchronously schedule new MRs to be populated in the cache. */
@@ -641,6 +650,14 @@ static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
if (res)
return res;
+ res = key1.st_index - key2.st_index;
+ if (res)
+ return res;
+
+ res = key1.ph - key2.ph;
+ if (res)
+ return res;
+
/*
* keep ndescs the last in the compare table since the find function
* searches for an exact match on all properties and only closest
@@ -712,6 +729,8 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
smallest->rb_key.access_mode == rb_key.access_mode &&
smallest->rb_key.access_flags == rb_key.access_flags &&
smallest->rb_key.ats == rb_key.ats &&
+ smallest->rb_key.st_index == rb_key.st_index &&
+ smallest->rb_key.ph == rb_key.ph &&
smallest->rb_key.ndescs <= ndescs_limit) ?
smallest :
NULL;
@@ -786,7 +805,8 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
struct mlx5r_cache_rb_key rb_key = {
.ndescs = ndescs,
.access_mode = access_mode,
- .access_flags = get_unchangeable_access_flags(dev, access_flags)
+ .access_flags = get_unchangeable_access_flags(dev, access_flags),
+ .ph = MLX5_IB_NO_PH,
};
struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key);
@@ -943,6 +963,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
struct rb_root *root = &dev->cache.rb_root;
struct mlx5r_cache_rb_key rb_key = {
.access_mode = MLX5_MKC_ACCESS_MODE_MTT,
+ .ph = MLX5_IB_NO_PH,
};
struct mlx5_cache_ent *ent;
struct rb_node *node;
@@ -1119,7 +1140,8 @@ static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
struct ib_umem *umem, u64 iova,
- int access_flags, int access_mode)
+ int access_flags, int access_mode,
+ u16 st_index, u8 ph)
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct mlx5r_cache_rb_key rb_key = {};
@@ -1130,7 +1152,8 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
if (umem->is_dmabuf)
page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
else
- page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);
+ page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova,
+ access_mode);
if (WARN_ON(!page_size))
return ERR_PTR(-EINVAL);
@@ -1138,6 +1161,8 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
+ rb_key.st_index = st_index;
+ rb_key.ph = ph;
ent = mkey_cache_ent_from_rb_key(dev, rb_key);
/*
* If the MR can't come from the cache then synchronously create an uncached
@@ -1145,7 +1170,8 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
*/
if (!ent) {
mutex_lock(&dev->slow_path_mutex);
- mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode);
+ mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode,
+ st_index, ph);
mutex_unlock(&dev->slow_path_mutex);
if (IS_ERR(mr))
return mr;
@@ -1230,7 +1256,7 @@ err_1:
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
u64 iova, int access_flags,
unsigned long page_size, bool populate,
- int access_mode)
+ int access_mode, u16 st_index, u8 ph)
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct mlx5_ib_mr *mr;
@@ -1240,7 +1266,8 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
u32 *in;
int err;
bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) &&
- (access_mode == MLX5_MKC_ACCESS_MODE_MTT);
+ (access_mode == MLX5_MKC_ACCESS_MODE_MTT) &&
+ (ph == MLX5_IB_NO_PH);
bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
if (!page_size)
@@ -1304,6 +1331,13 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
get_octo_len(iova, umem->length, mr->page_shift));
}
+ if (ph != MLX5_IB_NO_PH) {
+ MLX5_SET(mkc, mkc, pcie_tph_en, 1);
+ MLX5_SET(mkc, mkc, pcie_tph_ph, ph);
+ if (st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX)
+ MLX5_SET(mkc, mkc, pcie_tph_steering_tag_index, st_index);
+ }
+
err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
if (err) {
mlx5_ib_warn(dev, "create mkey failed\n");
@@ -1423,24 +1457,37 @@ struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
}
static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
- u64 iova, int access_flags)
+ u64 iova, int access_flags,
+ struct ib_dmah *dmah)
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct mlx5_ib_mr *mr = NULL;
bool xlt_with_umr;
+ u16 st_index = MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX;
+ u8 ph = MLX5_IB_NO_PH;
int err;
+ if (dmah) {
+ struct mlx5_ib_dmah *mdmah = to_mdmah(dmah);
+
+ ph = dmah->ph;
+ if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS))
+ st_index = mdmah->st_index;
+ }
+
xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
if (xlt_with_umr) {
mr = alloc_cacheable_mr(pd, umem, iova, access_flags,
- MLX5_MKC_ACCESS_MODE_MTT);
+ MLX5_MKC_ACCESS_MODE_MTT,
+ st_index, ph);
} else {
- unsigned long page_size =
- mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);
+ unsigned long page_size = mlx5_umem_mkc_find_best_pgsz(
+ dev, umem, iova, MLX5_MKC_ACCESS_MODE_MTT);
mutex_lock(&dev->slow_path_mutex);
mr = reg_create(pd, umem, iova, access_flags, page_size,
- true, MLX5_MKC_ACCESS_MODE_MTT);
+ true, MLX5_MKC_ACCESS_MODE_MTT,
+ st_index, ph);
mutex_unlock(&dev->slow_path_mutex);
}
if (IS_ERR(mr)) {
@@ -1504,7 +1551,9 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
return ERR_CAST(odp);
mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags,
- MLX5_MKC_ACCESS_MODE_MTT);
+ MLX5_MKC_ACCESS_MODE_MTT,
+ MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX,
+ MLX5_IB_NO_PH);
if (IS_ERR(mr)) {
ib_umem_release(&odp->umem);
return ERR_CAST(mr);
@@ -1528,13 +1577,15 @@ err_dereg_mr:
struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 iova, int access_flags,
+ struct ib_dmah *dmah,
struct ib_udata *udata)
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct ib_umem *umem;
int err;
- if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
+ if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
+ ((access_flags & IB_ACCESS_ON_DEMAND) && dmah))
return ERR_PTR(-EOPNOTSUPP);
mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
@@ -1550,7 +1601,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
if (IS_ERR(umem))
return ERR_CAST(umem);
- return create_real_mr(pd, umem, iova, access_flags);
+ return create_real_mr(pd, umem, iova, access_flags, dmah);
}
static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
@@ -1575,12 +1626,15 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
static struct ib_mr *
reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
u64 offset, u64 length, u64 virt_addr,
- int fd, int access_flags, int access_mode)
+ int fd, int access_flags, int access_mode,
+ struct ib_dmah *dmah)
{
bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct mlx5_ib_mr *mr = NULL;
struct ib_umem_dmabuf *umem_dmabuf;
+ u16 st_index = MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX;
+ u8 ph = MLX5_IB_NO_PH;
int err;
err = mlx5r_umr_resource_init(dev);
@@ -1603,8 +1657,17 @@ reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
return ERR_CAST(umem_dmabuf);
}
+ if (dmah) {
+ struct mlx5_ib_dmah *mdmah = to_mdmah(dmah);
+
+ ph = dmah->ph;
+ if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS))
+ st_index = mdmah->st_index;
+ }
+
mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
- access_flags, access_mode);
+ access_flags, access_mode,
+ st_index, ph);
if (IS_ERR(mr)) {
ib_umem_release(&umem_dmabuf->umem);
return ERR_CAST(mr);
@@ -1661,7 +1724,8 @@ reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset,
access_flags &= ~IB_ACCESS_RELAXED_ORDERING;
crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev,
offset, length, virt_addr, fd,
- access_flags, MLX5_MKC_ACCESS_MODE_KSM);
+ access_flags, MLX5_MKC_ACCESS_MODE_KSM,
+ NULL);
if (IS_ERR(crossed_mr)) {
ret = PTR_ERR(crossed_mr);
goto end;
@@ -1688,6 +1752,7 @@ end:
struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
u64 length, u64 virt_addr,
int fd, int access_flags,
+ struct ib_dmah *dmah,
struct uverbs_attr_bundle *attrs)
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
@@ -1720,7 +1785,8 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
return reg_user_mr_dmabuf(pd, pd->device->dma_device,
offset, length, virt_addr,
- fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT);
+ fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT,
+ dmah);
}
/*
@@ -1754,7 +1820,8 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
return false;
- *page_size = mlx5_umem_mkc_find_best_pgsz(dev, new_umem, iova);
+ *page_size = mlx5_umem_mkc_find_best_pgsz(
+ dev, new_umem, iova, mr->mmkey.cache_ent->rb_key.access_mode);
if (WARN_ON(!*page_size))
return false;
return (mr->mmkey.cache_ent->rb_key.ndescs) >=
@@ -1817,7 +1884,8 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
struct mlx5_ib_mr *mr = to_mmr(ib_mr);
int err;
- if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct)
+ if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct ||
+ mr->mmkey.rb_key.ph != MLX5_IB_NO_PH)
return ERR_PTR(-EOPNOTSUPP);
mlx5_ib_dbg(
@@ -1861,7 +1929,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
return create_real_mr(new_pd, umem, mr->ibmr.iova,
- new_access_flags);
+ new_access_flags, NULL);
}
/*
@@ -1892,7 +1960,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
}
return NULL;
}
- return create_real_mr(new_pd, new_umem, iova, new_access_flags);
+ return create_real_mr(new_pd, new_umem, iova, new_access_flags, NULL);
}
/*
@@ -1901,7 +1969,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
*/
recreate:
return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
- new_access_flags, udata);
+ new_access_flags, NULL, udata);
}
static int
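
mr.c extends the mkey-cache rb-tree key with the TPH fields (st_index, ph) and, as the in-tree comment notes, keeps ndescs as the final comparison so the closest-match search can relax only that field. A compilable sketch of that ordered-comparator idea, with simplified stand-in types:

#include <stdio.h>

struct key { int ats, mode, flags, st_index, ph, ndescs; };

static int key_cmp(const struct key *a, const struct key *b)
{
	int res;

	if ((res = a->ats - b->ats))
		return res;
	if ((res = a->mode - b->mode))
		return res;
	if ((res = a->flags - b->flags))
		return res;
	if ((res = a->st_index - b->st_index))
		return res;
	if ((res = a->ph - b->ph))
		return res;
	/* ndescs last: exact match is required on everything above,
	 * while the lookup may accept the closest ndescs. */
	return a->ndescs - b->ndescs;
}

int main(void)
{
	struct key a = { .ph = 0xff, .ndescs = 8 };
	struct key b = { .ph = 0xff, .ndescs = 16 };

	printf("%d\n", key_cmp(&a, &b) < 0);	/* 1: differs only in ndescs */
	return 0;
}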
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index f6abd64f07f7..0e8ae85af5a6 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -836,9 +836,13 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt,
u32 *bytes_mapped, u32 flags)
{
struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
+ int access_mode = mr->data_direct ? MLX5_MKC_ACCESS_MODE_KSM :
+ MLX5_MKC_ACCESS_MODE_MTT;
+ unsigned int old_page_shift = mr->page_shift;
+ unsigned int page_shift;
+ unsigned long page_size;
u32 xlt_flags = 0;
int err;
- unsigned long page_size;
if (flags & MLX5_PF_FLAGS_ENABLE)
xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
@@ -850,20 +854,33 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt,
return err;
}
- page_size = mlx5_umem_dmabuf_find_best_pgsz(umem_dmabuf);
+ page_size = mlx5_umem_dmabuf_find_best_pgsz(umem_dmabuf, access_mode);
if (!page_size) {
ib_umem_dmabuf_unmap_pages(umem_dmabuf);
err = -EINVAL;
} else {
- if (mr->data_direct)
- err = mlx5r_umr_update_data_direct_ksm_pas(mr, xlt_flags);
- else
- err = mlx5r_umr_update_mr_pas(mr, xlt_flags);
+ page_shift = order_base_2(page_size);
+ if (page_shift != mr->page_shift && mr->dmabuf_faulted) {
+ err = mlx5r_umr_dmabuf_update_pgsz(mr, xlt_flags,
+ page_shift);
+ } else {
+ mr->page_shift = page_shift;
+ if (mr->data_direct)
+ err = mlx5r_umr_update_data_direct_ksm_pas(
+ mr, xlt_flags);
+ else
+ err = mlx5r_umr_update_mr_pas(mr,
+ xlt_flags);
+ }
}
dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
- if (err)
+ if (err) {
+ mr->page_shift = old_page_shift;
return err;
+ }
+
+ mr->dmabuf_faulted = 1;
if (bytes_mapped)
*bytes_mapped += bcnt;
@@ -1866,6 +1883,7 @@ int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev)
struct mlx5r_cache_rb_key rb_key = {
.access_mode = MLX5_MKC_ACCESS_MODE_KSM,
.ndescs = mlx5_imr_ksm_entries,
+ .ph = MLX5_IB_NO_PH,
};
struct mlx5_cache_ent *ent;
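
odp.c's dmabuf fault path now distinguishes a first fault (adopt the computed page shift and write the PAS directly) from a re-fault where the best page shift changed (go through the multi-step page-size update). A runnable sketch of just that decision; names and the return strings are illustrative.

#include <stdio.h>
#include <stdbool.h>

struct mr { unsigned int page_shift; bool dmabuf_faulted; };

static const char *choose_path(struct mr *mr, unsigned int new_shift)
{
	if (new_shift != mr->page_shift && mr->dmabuf_faulted)
		return "umr_dmabuf_update_pgsz";	/* safe multi-step remap */
	mr->page_shift = new_shift;		/* first fault: adopt shift */
	return "umr_update_mr_pas";
}

int main(void)
{
	struct mr mr = { .page_shift = 12, .dmabuf_faulted = false };

	puts(choose_path(&mr, 21));	/* first fault -> plain PAS update */
	mr.dmabuf_faulted = true;
	puts(choose_path(&mr, 12));	/* shift changed -> multi-step remap */
	return 0;
}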
diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c
index 5be4426a2884..7ef35cddce81 100644
--- a/drivers/infiniband/hw/mlx5/umr.c
+++ b/drivers/infiniband/hw/mlx5/umr.c
@@ -32,13 +32,15 @@ static __be64 get_umr_disable_mr_mask(void)
return cpu_to_be64(result);
}
-static __be64 get_umr_update_translation_mask(void)
+static __be64 get_umr_update_translation_mask(struct mlx5_ib_dev *dev)
{
u64 result;
result = MLX5_MKEY_MASK_LEN |
MLX5_MKEY_MASK_PAGE_SIZE |
MLX5_MKEY_MASK_START_ADDR;
+ if (MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5))
+ result |= MLX5_MKEY_MASK_PAGE_SIZE_5;
return cpu_to_be64(result);
}
@@ -654,9 +656,12 @@ static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev,
flags & MLX5_IB_UPD_XLT_ENABLE || flags & MLX5_IB_UPD_XLT_ADDR;
if (update_translation) {
- wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask();
+ wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask(dev);
if (!mr->ibmr.length)
MLX5_SET(mkc, &wqe->mkey_seg, length64, 1);
+ if (flags & MLX5_IB_UPD_XLT_KEEP_PGSZ)
+ wqe->ctrl_seg.mkey_mask &=
+ cpu_to_be64(~MLX5_MKEY_MASK_PAGE_SIZE);
}
wqe->ctrl_seg.xlt_octowords =
@@ -664,46 +669,78 @@ static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev,
wqe->data_seg.byte_count = cpu_to_be32(sg->length);
}
+static void
+_mlx5r_umr_init_wqe(struct mlx5_ib_mr *mr, struct mlx5r_umr_wqe *wqe,
+ struct ib_sge *sg, unsigned int flags,
+ unsigned int page_shift, bool dd)
+{
+ struct mlx5_ib_dev *dev = mr_to_mdev(mr);
+
+ mlx5r_umr_set_update_xlt_ctrl_seg(&wqe->ctrl_seg, flags, sg);
+ mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe->mkey_seg, mr, page_shift);
+ if (dd) /* Use the data direct internal kernel PD */
+ MLX5_SET(mkc, &wqe->mkey_seg, pd, dev->ddr.pdn);
+ mlx5r_umr_set_update_xlt_data_seg(&wqe->data_seg, sg);
+}
+
static int
-_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd)
+_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd,
+ size_t start_block, size_t nblocks)
{
size_t ent_size = dd ? sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt);
struct mlx5_ib_dev *dev = mr_to_mdev(mr);
struct device *ddev = &dev->mdev->pdev->dev;
struct mlx5r_umr_wqe wqe = {};
+ size_t processed_blocks = 0;
struct ib_block_iter biter;
+ size_t cur_block_idx = 0;
struct mlx5_ksm *cur_ksm;
struct mlx5_mtt *cur_mtt;
size_t orig_sg_length;
+ size_t total_blocks;
size_t final_size;
void *curr_entry;
struct ib_sge sg;
void *entry;
- u64 offset = 0;
+ u64 offset;
int err = 0;
- entry = mlx5r_umr_create_xlt(dev, &sg,
- ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift),
- ent_size, flags);
+ total_blocks = ib_umem_num_dma_blocks(mr->umem, 1UL << mr->page_shift);
+ if (start_block > total_blocks)
+ return -EINVAL;
+
+ /* nblocks 0 means update all blocks starting from start_block */
+ if (nblocks)
+ total_blocks = nblocks;
+
+ entry = mlx5r_umr_create_xlt(dev, &sg, total_blocks, ent_size, flags);
if (!entry)
return -ENOMEM;
orig_sg_length = sg.length;
- mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg);
- mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr,
- mr->page_shift);
- if (dd) {
- /* Use the data direct internal kernel PD */
- MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn);
+
+ _mlx5r_umr_init_wqe(mr, &wqe, &sg, flags, mr->page_shift, dd);
+
+ /* Set initial translation offset to start_block */
+ offset = (u64)start_block * ent_size;
+ mlx5r_umr_update_offset(&wqe.ctrl_seg, offset);
+
+ if (dd)
cur_ksm = entry;
- } else {
+ else
cur_mtt = entry;
- }
-
- mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg);
curr_entry = entry;
+
rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) {
+ if (cur_block_idx < start_block) {
+ cur_block_idx++;
+ continue;
+ }
+
+ if (nblocks && processed_blocks >= nblocks)
+ break;
+
if (curr_entry == entry + sg.length) {
dma_sync_single_for_device(ddev, sg.addr, sg.length,
DMA_TO_DEVICE);
@@ -725,6 +762,11 @@ _mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd)
if (dd) {
cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter));
cur_ksm->key = cpu_to_be32(dev->ddr.mkey);
+ if (mr->umem->is_dmabuf &&
+ (flags & MLX5_IB_UPD_XLT_ZAP)) {
+ cur_ksm->va = 0;
+ cur_ksm->key = 0;
+ }
cur_ksm++;
curr_entry = cur_ksm;
} else {
@@ -736,6 +778,8 @@ _mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd)
cur_mtt++;
curr_entry = cur_mtt;
}
+
+ processed_blocks++;
}
final_size = curr_entry - entry;
@@ -752,13 +796,32 @@ err:
return err;
}
-int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags)
+int mlx5r_umr_update_data_direct_ksm_pas_range(struct mlx5_ib_mr *mr,
+ unsigned int flags,
+ size_t start_block,
+ size_t nblocks)
{
/* No invalidation flow is expected */
- if (WARN_ON(!mr->umem->is_dmabuf) || (flags & MLX5_IB_UPD_XLT_ZAP))
+ if (WARN_ON(!mr->umem->is_dmabuf) || ((flags & MLX5_IB_UPD_XLT_ZAP) &&
+ !(flags & MLX5_IB_UPD_XLT_KEEP_PGSZ)))
return -EINVAL;
- return _mlx5r_umr_update_mr_pas(mr, flags, true);
+ return _mlx5r_umr_update_mr_pas(mr, flags, true, start_block, nblocks);
+}
+
+int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr,
+ unsigned int flags)
+{
+ return mlx5r_umr_update_data_direct_ksm_pas_range(mr, flags, 0, 0);
+}
+
+int mlx5r_umr_update_mr_pas_range(struct mlx5_ib_mr *mr, unsigned int flags,
+ size_t start_block, size_t nblocks)
+{
+ if (WARN_ON(mr->umem->is_odp))
+ return -EINVAL;
+
+ return _mlx5r_umr_update_mr_pas(mr, flags, false, start_block, nblocks);
}
/*
@@ -768,10 +831,7 @@ int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int fla
*/
int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
{
- if (WARN_ON(mr->umem->is_odp))
- return -EINVAL;
-
- return _mlx5r_umr_update_mr_pas(mr, flags, false);
+ return mlx5r_umr_update_mr_pas_range(mr, flags, 0, 0);
}
static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
@@ -864,3 +924,202 @@ int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
mlx5r_umr_unmap_free_xlt(dev, xlt, &sg);
return err;
}
+
+/*
+ * Update only the page-size (log_page_size) field of an existing memory key
+ * using UMR. This is useful when the MR's physical layout stays the same
+ * but the optimal page shift has changed (e.g. dmabuf after pages are
+ * pinned and the HW can switch from 4K to huge-page alignment).
+ */
+int mlx5r_umr_update_mr_page_shift(struct mlx5_ib_mr *mr,
+ unsigned int page_shift,
+ bool dd)
+{
+ struct mlx5_ib_dev *dev = mr_to_mdev(mr);
+ struct mlx5r_umr_wqe wqe = {};
+ int err;
+
+ /* Build the UMR WQE. The translation mask also covers LEN and
+ * START_ADDR, but those are rewritten below with their current
+ * values, so effectively only the page size changes.
+ */
+ wqe.ctrl_seg.mkey_mask = get_umr_update_translation_mask(dev);
+
+ /* MR must be free while page size is modified */
+ wqe.ctrl_seg.flags = MLX5_UMR_CHECK_FREE | MLX5_UMR_INLINE;
+
+ /* Fill mkey segment with the new page size, keep the rest unchanged */
+ MLX5_SET(mkc, &wqe.mkey_seg, log_page_size, page_shift);
+
+ if (dd)
+ MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn);
+ else
+ MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(mr->ibmr.pd)->pdn);
+
+ MLX5_SET64(mkc, &wqe.mkey_seg, start_addr, mr->ibmr.iova);
+ MLX5_SET64(mkc, &wqe.mkey_seg, len, mr->ibmr.length);
+ MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
+ MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
+ mlx5_mkey_variant(mr->mmkey.key));
+
+ err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
+ if (!err)
+ mr->page_shift = page_shift;
+
+ return err;
+}
+
+static inline int
+_mlx5r_dmabuf_umr_update_pas(struct mlx5_ib_mr *mr, unsigned int flags,
+ size_t start_block, size_t nblocks, bool dd)
+{
+ if (dd)
+ return mlx5r_umr_update_data_direct_ksm_pas_range(mr, flags,
+ start_block,
+ nblocks);
+ else
+ return mlx5r_umr_update_mr_pas_range(mr, flags, start_block,
+ nblocks);
+}
+
+/**
+ * This function makes an mkey non-present by zapping (zeroing out) the first
+ * N translation entries, where N is determined by the largest page size
+ * supported by the device and the MR length.
+ * It then updates the mkey's page size to the largest possible value, ensuring
+ * the MR is completely non-present and safe for further updates.
+ * This is useful when updating the page size of a dmabuf MR on a page fault.
+ *
+ * Return: 0 on success, with *nblocks set to the number of entries that
+ * were zapped; a negative error code on failure.
+ */
+static int _mlx5r_umr_zap_mkey(struct mlx5_ib_mr *mr,
+ unsigned int flags,
+ unsigned int page_shift,
+ size_t *nblocks,
+ bool dd)
+{
+ unsigned int old_page_shift = mr->page_shift;
+ struct mlx5_ib_dev *dev = mr_to_mdev(mr);
+ unsigned int max_page_shift;
+ size_t page_shift_nblocks;
+ unsigned int max_log_size;
+ int access_mode;
+ int err;
+
+ access_mode = dd ? MLX5_MKC_ACCESS_MODE_KSM : MLX5_MKC_ACCESS_MODE_MTT;
+ flags |= MLX5_IB_UPD_XLT_KEEP_PGSZ | MLX5_IB_UPD_XLT_ZAP |
+ MLX5_IB_UPD_XLT_ATOMIC;
+ max_log_size = get_max_log_entity_size_cap(dev, access_mode);
+ max_page_shift = order_base_2(mr->ibmr.length);
+ max_page_shift = min(max(max_page_shift, page_shift), max_log_size);
+ /* Count blocks in units of max_page_shift; we will zap exactly this
+ * many entries to make the whole MR non-present.
+ * The count must be aligned so the resulting XLT offset stays
+ * MLX5_UMR_FLEX_ALIGNMENT-aligned, since it may later be used as an
+ * offset into the XLT.
+ */
+ *nblocks = ib_umem_num_dma_blocks(mr->umem, 1UL << max_page_shift);
+ if (dd)
+ *nblocks = ALIGN(*nblocks, MLX5_UMR_KSM_NUM_ENTRIES_ALIGNMENT);
+ else
+ *nblocks = ALIGN(*nblocks, MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT);
+ page_shift_nblocks = ib_umem_num_dma_blocks(mr->umem,
+ 1UL << page_shift);
+ /* If zapping at the max possible page shift would touch at least as
+ * many entries as a full update at the new page size, just update all
+ * of the mkey's entries instead (nblocks == 0 means "all blocks").
+ */
+ if (*nblocks >= page_shift_nblocks)
+ *nblocks = 0;
+
+ /* Make the first nblocks entries non-present without changing
+ * page size yet.
+ */
+ if (*nblocks)
+ mr->page_shift = max_page_shift;
+ err = _mlx5r_dmabuf_umr_update_pas(mr, flags, 0, *nblocks, dd);
+ if (err) {
+ mr->page_shift = old_page_shift;
+ return err;
+ }
+
+ /* Change page size to the max page size now that the MR is completely
+ * non-present.
+ */
+ if (*nblocks) {
+ err = mlx5r_umr_update_mr_page_shift(mr, max_page_shift, dd);
+ if (err) {
+ mr->page_shift = old_page_shift;
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * mlx5r_umr_dmabuf_update_pgsz - Safely update DMABUF MR page size and its
+ * entries accordingly
+ * @mr: The memory region to update
+ * @xlt_flags: Translation table update flags
+ * @page_shift: The new (optimized) page shift to use
+ *
+ * This function updates the page size and mkey translation entries for a
+ * DMABUF MR in a safe, multi-step process to avoid exposing partially
+ * updated mappings. The update is performed in 5 steps:
+ * 1. Make the first X entries non-present, while X is calculated to be
+ * minimal according to a large page shift that can be used to cover the
+ * MR length.
+ * 2. Update the page size to the large supported page size
+ * 3. Load the remaining N-X entries according to the (optimized) page_shift
+ * 4. Update the page size according to the (optimized) page_shift
+ * 5. Load the first X entries with the correct translations
+ *
+ * This ensures that at no point is the MR accessible with a partially updated
+ * translation table, maintaining correctness and preventing access to stale or
+ * inconsistent mappings.
+ *
+ * Returns 0 on success or a negative error code on failure.
+ */
+int mlx5r_umr_dmabuf_update_pgsz(struct mlx5_ib_mr *mr, u32 xlt_flags,
+ unsigned int page_shift)
+{
+ unsigned int old_page_shift = mr->page_shift;
+ size_t zapped_blocks;
+ size_t total_blocks;
+ int err;
+
+ err = _mlx5r_umr_zap_mkey(mr, xlt_flags, page_shift, &zapped_blocks,
+ mr->data_direct);
+ if (err)
+ return err;
+
+ /* _mlx5r_umr_zap_mkey already enables the mkey */
+ xlt_flags &= ~MLX5_IB_UPD_XLT_ENABLE;
+ mr->page_shift = page_shift;
+ total_blocks = ib_umem_num_dma_blocks(mr->umem, 1UL << mr->page_shift);
+ if (zapped_blocks && zapped_blocks < total_blocks) {
+ /* Update PAS according to the new page size but don't update
+ * the page size in the mkey yet.
+ */
+ err = _mlx5r_dmabuf_umr_update_pas(
+ mr,
+ xlt_flags | MLX5_IB_UPD_XLT_KEEP_PGSZ,
+ zapped_blocks,
+ total_blocks - zapped_blocks,
+ mr->data_direct);
+ if (err)
+ goto err;
+ }
+
+ err = mlx5r_umr_update_mr_page_shift(mr, mr->page_shift,
+ mr->data_direct);
+ if (err)
+ goto err;
+ err = _mlx5r_dmabuf_umr_update_pas(mr, xlt_flags, 0, zapped_blocks,
+ mr->data_direct);
+ if (err)
+ goto err;
+
+ return 0;
+err:
+ mr->page_shift = old_page_shift;
+ return err;
+}
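
To make the zap sizing in _mlx5r_umr_zap_mkey concrete, here is a standalone arithmetic sketch with assumed numbers (an 8 MiB MR, new page_shift 16, device cap 21); the real code additionally aligns the count to the UMR entry alignment and accounts for the iova offset.

#include <stdio.h>

/* ceil(len / 2^shift), ignoring iova alignment for simplicity */
static unsigned long nblocks(unsigned long len, unsigned int shift)
{
	return (len + (1UL << shift) - 1) >> shift;
}

int main(void)
{
	unsigned long len = 8UL << 20;		/* 8 MiB MR */
	unsigned int page_shift = 16;		/* new mapping: 64 KiB */
	unsigned int max_page_shift = 21;	/* min(max(23, 16), cap 21) */

	unsigned long zap = nblocks(len, max_page_shift);	/* 4 */
	unsigned long at_new = nblocks(len, page_shift);	/* 128 */

	if (zap >= at_new)
		zap = 0;	/* cheaper to rewrite the whole table */

	/* Zapping 4 huge-page-sized entries hides the whole 8 MiB range,
	 * so the page size can then be switched safely. */
	printf("zap %lu entries, then load %lu\n", zap, at_new);
	return 0;
}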
diff --git a/drivers/infiniband/hw/mlx5/umr.h b/drivers/infiniband/hw/mlx5/umr.h
index 4a02c9b5aad8..e9361f0140e7 100644
--- a/drivers/infiniband/hw/mlx5/umr.h
+++ b/drivers/infiniband/hw/mlx5/umr.h
@@ -94,9 +94,20 @@ struct mlx5r_umr_wqe {
int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr);
int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd,
int access_flags);
-int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags);
+int mlx5r_umr_update_data_direct_ksm_pas_range(struct mlx5_ib_mr *mr,
+ unsigned int flags,
+ size_t start_block,
+ size_t nblocks);
int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags);
+int mlx5r_umr_update_mr_pas_range(struct mlx5_ib_mr *mr, unsigned int flags,
+ size_t start_block, size_t nblocks);
+int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags);
int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
int page_shift, int flags);
+int mlx5r_umr_update_mr_page_shift(struct mlx5_ib_mr *mr,
+ unsigned int page_shift,
+ bool dd);
+int mlx5r_umr_dmabuf_update_pgsz(struct mlx5_ib_mr *mr, u32 xlt_flags,
+ unsigned int page_shift);
#endif /* _MLX5_IB_UMR_H */