Diffstat (limited to 'drivers/infiniband/hw/mlx5')
 drivers/infiniband/hw/mlx5/Makefile   |   1
 drivers/infiniband/hw/mlx5/counters.c |  30
 drivers/infiniband/hw/mlx5/counters.h |  13
 drivers/infiniband/hw/mlx5/cq.c       |  19
 drivers/infiniband/hw/mlx5/devx.c     |   6
 drivers/infiniband/hw/mlx5/dm.c       |   2
 drivers/infiniband/hw/mlx5/dmah.c     |  54
 drivers/infiniband/hw/mlx5/dmah.h     |  23
 drivers/infiniband/hw/mlx5/fs.c       | 121
 drivers/infiniband/hw/mlx5/fs.h       |   8
 drivers/infiniband/hw/mlx5/ib_rep.c   |   3
 drivers/infiniband/hw/mlx5/main.c     |  13
 drivers/infiniband/hw/mlx5/mlx5_ib.h  |  99
 drivers/infiniband/hw/mlx5/mr.c       | 116
 drivers/infiniband/hw/mlx5/odp.c      |  32
 drivers/infiniband/hw/mlx5/umr.c      | 307
 drivers/infiniband/hw/mlx5/umr.h      |  13
17 files changed, 694 insertions, 166 deletions
diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 11878ddf7cc7..dd7bb377f491 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -8,6 +8,7 @@ mlx5_ib-y := ah.o \ cq.o \ data_direct.o \ dm.o \ + dmah.o \ doorbell.o \ fs.o \ gsi.o \ diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index a506fafd2b15..e042e0719ead 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -16,6 +16,18 @@ struct mlx5_ib_counter { u32 type; }; +struct mlx5_rdma_counter { + struct rdma_counter rdma_counter; + + struct mlx5_fc *fc[MLX5_IB_OPCOUNTER_MAX]; + struct xarray qpn_opfc_xa; +}; + +static struct mlx5_rdma_counter *to_mcounter(struct rdma_counter *counter) +{ + return container_of(counter, struct mlx5_rdma_counter, rdma_counter); +} + #define INIT_Q_COUNTER(_name) \ { .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)} @@ -602,7 +614,7 @@ static int mlx5_ib_counter_dealloc(struct rdma_counter *counter) return 0; WARN_ON(!xa_empty(&mcounter->qpn_opfc_xa)); - mlx5r_fs_destroy_fcs(dev, counter); + mlx5r_fs_destroy_fcs(dev, mcounter->fc); MLX5_SET(dealloc_q_counter_in, in, opcode, MLX5_CMD_OP_DEALLOC_Q_COUNTER); MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter->id); @@ -612,6 +624,7 @@ static int mlx5_ib_counter_dealloc(struct rdma_counter *counter) static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, struct ib_qp *qp, u32 port) { + struct mlx5_rdma_counter *mcounter = to_mcounter(counter); struct mlx5_ib_dev *dev = to_mdev(qp->device); bool new = false; int err; @@ -635,7 +648,11 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, if (err) goto fail_set_counter; - err = mlx5r_fs_bind_op_fc(qp, counter, port); + if (!counter->mode.bind_opcnt) + return 0; + + err = mlx5r_fs_bind_op_fc(qp, mcounter->fc, &mcounter->qpn_opfc_xa, + port); if (err) goto fail_bind_op_fc; @@ -655,9 +672,12 @@ fail_set_counter: static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp, u32 port) { struct rdma_counter *counter = qp->counter; + struct mlx5_rdma_counter *mcounter; int err; - mlx5r_fs_unbind_op_fc(qp, counter); + mcounter = to_mcounter(counter); + + mlx5r_fs_unbind_op_fc(qp, &mcounter->qpn_opfc_xa); err = mlx5_ib_qp_set_counter(qp, NULL); if (err) @@ -666,7 +686,9 @@ static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp, u32 port) return 0; fail_set_counter: - mlx5r_fs_bind_op_fc(qp, counter, port); + if (counter->mode.bind_opcnt) + mlx5r_fs_bind_op_fc(qp, mcounter->fc, + &mcounter->qpn_opfc_xa, port); return err; } diff --git a/drivers/infiniband/hw/mlx5/counters.h b/drivers/infiniband/hw/mlx5/counters.h index bd03cee42014..a04e7dd59455 100644 --- a/drivers/infiniband/hw/mlx5/counters.h +++ b/drivers/infiniband/hw/mlx5/counters.h @@ -8,19 +8,6 @@ #include "mlx5_ib.h" -struct mlx5_rdma_counter { - struct rdma_counter rdma_counter; - - struct mlx5_fc *fc[MLX5_IB_OPCOUNTER_MAX]; - struct xarray qpn_opfc_xa; -}; - -static inline struct mlx5_rdma_counter * -to_mcounter(struct rdma_counter *counter) -{ - return container_of(counter, struct mlx5_rdma_counter, rdma_counter); -} - int mlx5_ib_counters_init(struct mlx5_ib_dev *dev); void mlx5_ib_counters_cleanup(struct mlx5_ib_dev *dev); void mlx5_ib_counters_clear_description(struct ib_counters *counters); diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 1aa5311b03e9..9c8003a78334 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ 
b/drivers/infiniband/hw/mlx5/cq.c @@ -1055,20 +1055,31 @@ err_cqb: return err; } -int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) +int mlx5_ib_pre_destroy_cq(struct ib_cq *cq) { struct mlx5_ib_dev *dev = to_mdev(cq->device); struct mlx5_ib_cq *mcq = to_mcq(cq); + + return mlx5_core_destroy_cq(dev->mdev, &mcq->mcq); +} + +void mlx5_ib_post_destroy_cq(struct ib_cq *cq) +{ + destroy_cq_kernel(to_mdev(cq->device), to_mcq(cq)); +} + +int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) +{ int ret; - ret = mlx5_core_destroy_cq(dev->mdev, &mcq->mcq); + ret = mlx5_ib_pre_destroy_cq(cq); if (ret) return ret; if (udata) - destroy_cq_user(mcq, udata); + destroy_cq_user(to_mcq(cq), udata); else - destroy_cq_kernel(dev, mcq); + mlx5_ib_post_destroy_cq(cq); return 0; } diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 843dcd312242..028d9f031dde 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -159,7 +159,7 @@ int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user, u64 req_ucaps) uctx = MLX5_ADDR_OF(create_uctx_in, in, uctx); if (is_user && (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RAW_TX) && - capable(CAP_NET_RAW)) + rdma_dev_has_raw_cap(&dev->ib_dev)) cap |= MLX5_UCTX_CAP_RAW_TX; if (is_user && (MLX5_CAP_GEN(dev->mdev, uctx_cap) & @@ -1393,6 +1393,10 @@ static int devx_handle_mkey_create(struct mlx5_ib_dev *dev, } MLX5_SET(create_mkey_in, in, mkey_umem_valid, 1); + /* TPH is not allowed to bypass the regular kernel's verbs flow */ + MLX5_SET(mkc, mkc, pcie_tph_en, 0); + MLX5_SET(mkc, mkc, pcie_tph_steering_tag_index, + MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX); return 0; } diff --git a/drivers/infiniband/hw/mlx5/dm.c b/drivers/infiniband/hw/mlx5/dm.c index b4c97fb62abf..9ded2b7c1e31 100644 --- a/drivers/infiniband/hw/mlx5/dm.c +++ b/drivers/infiniband/hw/mlx5/dm.c @@ -282,7 +282,7 @@ static struct ib_dm *handle_alloc_dm_memic(struct ib_ucontext *ctx, int err; u64 address; - if (!MLX5_CAP_DEV_MEM(dm_db->dev, memic)) + if (!dm_db || !MLX5_CAP_DEV_MEM(dm_db->dev, memic)) return ERR_PTR(-EOPNOTSUPP); dm = kzalloc(sizeof(*dm), GFP_KERNEL); diff --git a/drivers/infiniband/hw/mlx5/dmah.c b/drivers/infiniband/hw/mlx5/dmah.c new file mode 100644 index 000000000000..362a88992ffa --- /dev/null +++ b/drivers/infiniband/hw/mlx5/dmah.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved + */ + +#include <rdma/uverbs_std_types.h> +#include <linux/pci-tph.h> +#include "dmah.h" + +#define UVERBS_MODULE_NAME mlx5_ib +#include <rdma/uverbs_named_ioctl.h> + +static int mlx5_ib_alloc_dmah(struct ib_dmah *ibdmah, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_core_dev *mdev = to_mdev(ibdmah->device)->mdev; + struct mlx5_ib_dmah *dmah = to_mdmah(ibdmah); + u16 st_bits = BIT(IB_DMAH_CPU_ID_EXISTS) | + BIT(IB_DMAH_MEM_TYPE_EXISTS); + int err; + + /* PH is a must for TPH following PCIe spec 6.2-1.0 */ + if (!(ibdmah->valid_fields & BIT(IB_DMAH_PH_EXISTS))) + return -EINVAL; + + /* ST is optional; however, partial data for it is not allowed */ + if (ibdmah->valid_fields & st_bits) { + if ((ibdmah->valid_fields & st_bits) != st_bits) + return -EINVAL; + err = mlx5_st_alloc_index(mdev, ibdmah->mem_type, + ibdmah->cpu_id, &dmah->st_index); + if (err) + return err; + } + + return 0; +} + +static int mlx5_ib_dealloc_dmah(struct ib_dmah *ibdmah, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_dmah *dmah = to_mdmah(ibdmah); + struct mlx5_core_dev *mdev = to_mdev(ibdmah->device)->mdev; + + if (ibdmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS)) + return mlx5_st_dealloc_index(mdev, dmah->st_index); + + return 0; +} + +const struct ib_device_ops mlx5_ib_dev_dmah_ops = { + .alloc_dmah = mlx5_ib_alloc_dmah, + .dealloc_dmah = mlx5_ib_dealloc_dmah, +}; diff --git a/drivers/infiniband/hw/mlx5/dmah.h b/drivers/infiniband/hw/mlx5/dmah.h new file mode 100644 index 000000000000..68de72b4744a --- /dev/null +++ b/drivers/infiniband/hw/mlx5/dmah.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#ifndef _MLX5_IB_DMAH_H +#define _MLX5_IB_DMAH_H + +#include "mlx5_ib.h" + +extern const struct ib_device_ops mlx5_ib_dev_dmah_ops; + +struct mlx5_ib_dmah { + struct ib_dmah ibdmah; + u16 st_index; +}; + +static inline struct mlx5_ib_dmah *to_mdmah(struct ib_dmah *ibdmah) +{ + return container_of(ibdmah, struct mlx5_ib_dmah, ibdmah); +} + +#endif /* _MLX5_IB_DMAH_H */ diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index 680627f1de33..b0f7663c24c1 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -1012,14 +1012,14 @@ static int get_per_qp_prio(struct mlx5_ib_dev *dev, return 0; } -static struct mlx5_per_qp_opfc * -get_per_qp_opfc(struct mlx5_rdma_counter *mcounter, u32 qp_num, bool *new) +static struct mlx5_per_qp_opfc *get_per_qp_opfc(struct xarray *qpn_opfc_xa, + u32 qp_num, bool *new) { struct mlx5_per_qp_opfc *per_qp_opfc; *new = false; - per_qp_opfc = xa_load(&mcounter->qpn_opfc_xa, qp_num); + per_qp_opfc = xa_load(qpn_opfc_xa, qp_num); if (per_qp_opfc) return per_qp_opfc; per_qp_opfc = kzalloc(sizeof(*per_qp_opfc), GFP_KERNEL); @@ -1032,7 +1032,8 @@ get_per_qp_opfc(struct mlx5_rdma_counter *mcounter, u32 qp_num, bool *new) } static int add_op_fc_rules(struct mlx5_ib_dev *dev, - struct mlx5_rdma_counter *mcounter, + struct mlx5_fc *fc_arr[MLX5_IB_OPCOUNTER_MAX], + struct xarray *qpn_opfc_xa, struct mlx5_per_qp_opfc *per_qp_opfc, struct mlx5_ib_flow_prio *prio, enum mlx5_ib_optional_counter_type type, @@ -1055,7 +1056,7 @@ static int add_op_fc_rules(struct mlx5_ib_dev *dev, return 0; } - opfc->fc = mcounter->fc[type]; + opfc->fc = fc_arr[type]; spec = kcalloc(MAX_OPFC_RULES, sizeof(*spec), GFP_KERNEL); if (!spec) { @@ -1148,8 +1149,7 @@ static int add_op_fc_rules(struct mlx5_ib_dev *dev, } 
prio->refcount += spec_num; - err = xa_err(xa_store(&mcounter->qpn_opfc_xa, qp_num, per_qp_opfc, - GFP_KERNEL)); + err = xa_err(xa_store(qpn_opfc_xa, qp_num, per_qp_opfc, GFP_KERNEL)); if (err) goto del_rules; @@ -1168,8 +1168,9 @@ null_fc: return err; } -static bool is_fc_shared_and_in_use(struct mlx5_rdma_counter *mcounter, - u32 type, struct mlx5_fc **fc) +static bool +is_fc_shared_and_in_use(struct mlx5_fc *fc_arr[MLX5_IB_OPCOUNTER_MAX], u32 type, + struct mlx5_fc **fc) { u32 shared_fc_type; @@ -1190,7 +1191,7 @@ static bool is_fc_shared_and_in_use(struct mlx5_rdma_counter *mcounter, return false; } - *fc = mcounter->fc[shared_fc_type]; + *fc = fc_arr[shared_fc_type]; if (!(*fc)) return false; @@ -1198,24 +1199,23 @@ static bool is_fc_shared_and_in_use(struct mlx5_rdma_counter *mcounter, } void mlx5r_fs_destroy_fcs(struct mlx5_ib_dev *dev, - struct rdma_counter *counter) + struct mlx5_fc *fc_arr[MLX5_IB_OPCOUNTER_MAX]) { - struct mlx5_rdma_counter *mcounter = to_mcounter(counter); struct mlx5_fc *in_use_fc; int i; for (i = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP; i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; i++) { - if (!mcounter->fc[i]) + if (!fc_arr[i]) continue; - if (is_fc_shared_and_in_use(mcounter, i, &in_use_fc)) { - mcounter->fc[i] = NULL; + if (is_fc_shared_and_in_use(fc_arr, i, &in_use_fc)) { + fc_arr[i] = NULL; continue; } - mlx5_fc_destroy(dev->mdev, mcounter->fc[i]); - mcounter->fc[i] = NULL; + mlx5_fc_destroy(dev->mdev, fc_arr[i]); + fc_arr[i] = NULL; } } @@ -1359,16 +1359,15 @@ void mlx5_ib_fs_remove_op_fc(struct mlx5_ib_dev *dev, put_per_qp_prio(dev, type); } -void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct rdma_counter *counter) +void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct xarray *qpn_opfc_xa) { - struct mlx5_rdma_counter *mcounter = to_mcounter(counter); - struct mlx5_ib_dev *dev = to_mdev(counter->device); + struct mlx5_ib_dev *dev = to_mdev(qp->device); struct mlx5_per_qp_opfc *per_qp_opfc; struct mlx5_ib_op_fc *in_use_opfc; struct mlx5_ib_flow_prio *prio; int i, j; - per_qp_opfc = xa_load(&mcounter->qpn_opfc_xa, qp->qp_num); + per_qp_opfc = xa_load(qpn_opfc_xa, qp->qp_num); if (!per_qp_opfc) return; @@ -1394,13 +1393,13 @@ void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct rdma_counter *counter) } kfree(per_qp_opfc); - xa_erase(&mcounter->qpn_opfc_xa, qp->qp_num); + xa_erase(qpn_opfc_xa, qp->qp_num); } -int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter, - u32 port) +int mlx5r_fs_bind_op_fc(struct ib_qp *qp, + struct mlx5_fc *fc_arr[MLX5_IB_OPCOUNTER_MAX], + struct xarray *qpn_opfc_xa, u32 port) { - struct mlx5_rdma_counter *mcounter = to_mcounter(counter); struct mlx5_ib_dev *dev = to_mdev(qp->device); struct mlx5_per_qp_opfc *per_qp_opfc; struct mlx5_ib_flow_prio *prio; @@ -1410,9 +1409,6 @@ int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter, int i, err, per_qp_type; bool new; - if (!counter->mode.bind_opcnt) - return 0; - cnts = &dev->port[port - 1].cnts; for (i = 0; i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES; i++) { @@ -1424,23 +1420,22 @@ int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter, prio = get_opfc_prio(dev, per_qp_type); WARN_ON(!prio->flow_table); - if (is_fc_shared_and_in_use(mcounter, per_qp_type, &in_use_fc)) - mcounter->fc[per_qp_type] = in_use_fc; + if (is_fc_shared_and_in_use(fc_arr, per_qp_type, &in_use_fc)) + fc_arr[per_qp_type] = in_use_fc; - if (!mcounter->fc[per_qp_type]) { - mcounter->fc[per_qp_type] = mlx5_fc_create(dev->mdev, - false); - if 
(IS_ERR(mcounter->fc[per_qp_type])) - return PTR_ERR(mcounter->fc[per_qp_type]); + if (!fc_arr[per_qp_type]) { + fc_arr[per_qp_type] = mlx5_fc_create(dev->mdev, false); + if (IS_ERR(fc_arr[per_qp_type])) + return PTR_ERR(fc_arr[per_qp_type]); } - per_qp_opfc = get_per_qp_opfc(mcounter, qp->qp_num, &new); + per_qp_opfc = get_per_qp_opfc(qpn_opfc_xa, qp->qp_num, &new); if (!per_qp_opfc) { err = -ENOMEM; goto free_fc; } - err = add_op_fc_rules(dev, mcounter, per_qp_opfc, prio, - per_qp_type, qp->qp_num, port); + err = add_op_fc_rules(dev, fc_arr, qpn_opfc_xa, per_qp_opfc, + prio, per_qp_type, qp->qp_num, port); if (err) goto del_rules; } @@ -1448,12 +1443,12 @@ int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter, return 0; del_rules: - mlx5r_fs_unbind_op_fc(qp, counter); + mlx5r_fs_unbind_op_fc(qp, qpn_opfc_xa); if (new) kfree(per_qp_opfc); free_fc: - if (xa_empty(&mcounter->qpn_opfc_xa)) - mlx5r_fs_destroy_fcs(dev, counter); + if (xa_empty(qpn_opfc_xa)) + mlx5r_fs_destroy_fcs(dev, fc_arr); return err; } @@ -1966,7 +1961,8 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, break; case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX: case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX: - if (ib_port == 0 || user_priority > MLX5_RDMA_TRANSPORT_BYPASS_PRIO) + if (ib_port == 0 || + user_priority >= MLX5_RDMA_TRANSPORT_BYPASS_PRIO) return ERR_PTR(-EINVAL); ret = mlx5_ib_fill_transport_ns_info(dev, ns_type, &flags, &vport_idx, &vport, @@ -2016,10 +2012,10 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, prio = &dev->flow_db->rdma_tx[priority]; break; case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX: - prio = &dev->flow_db->rdma_transport_rx[ib_port - 1]; + prio = &dev->flow_db->rdma_transport_rx[priority][ib_port - 1]; break; case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX: - prio = &dev->flow_db->rdma_transport_tx[ib_port - 1]; + prio = &dev->flow_db->rdma_transport_tx[priority][ib_port - 1]; break; default: return ERR_PTR(-EINVAL); } @@ -2458,7 +2454,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( struct mlx5_ib_dev *dev; u32 flags; - if (!capable(CAP_NET_RAW)) + if (!rdma_uattrs_has_raw_cap(attrs)) return -EPERM; fs_matcher = uverbs_attr_get_obj(attrs, @@ -2989,7 +2985,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE)( u32 ft_id; int err; - if (!capable(CAP_NET_RAW)) + if (!rdma_dev_has_raw_cap(&dev->ib_dev)) return -EPERM; err = uverbs_get_const(&ib_uapi_ft_type, attrs, @@ -3466,31 +3462,40 @@ static const struct ib_device_ops flow_ops = { int mlx5_ib_fs_init(struct mlx5_ib_dev *dev) { + int i, j; + dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL); if (!dev->flow_db) return -ENOMEM; - dev->flow_db->rdma_transport_rx = kcalloc(dev->num_ports, - sizeof(struct mlx5_ib_flow_prio), - GFP_KERNEL); - if (!dev->flow_db->rdma_transport_rx) - goto free_flow_db; + for (i = 0; i < MLX5_RDMA_TRANSPORT_BYPASS_PRIO; i++) { + dev->flow_db->rdma_transport_rx[i] = + kcalloc(dev->num_ports, + sizeof(struct mlx5_ib_flow_prio), GFP_KERNEL); + if (!dev->flow_db->rdma_transport_rx[i]) + goto free_rdma_transport_rx; + } - dev->flow_db->rdma_transport_tx = kcalloc(dev->num_ports, - sizeof(struct mlx5_ib_flow_prio), - GFP_KERNEL); - if (!dev->flow_db->rdma_transport_tx) - goto free_rdma_transport_rx; + for (j = 0; j < MLX5_RDMA_TRANSPORT_BYPASS_PRIO; j++) { + dev->flow_db->rdma_transport_tx[j] = + kcalloc(dev->num_ports, + sizeof(struct mlx5_ib_flow_prio), GFP_KERNEL); + if (!dev->flow_db->rdma_transport_tx[j]) + goto free_rdma_transport_tx; + } 
mutex_init(&dev->flow_db->lock); ib_set_device_ops(&dev->ib_dev, &flow_ops); return 0; +free_rdma_transport_tx: + while (j--) + kfree(dev->flow_db->rdma_transport_tx[j]); free_rdma_transport_rx: - kfree(dev->flow_db->rdma_transport_rx); -free_flow_db: + while (i--) + kfree(dev->flow_db->rdma_transport_rx[i]); kfree(dev->flow_db); return -ENOMEM; } diff --git a/drivers/infiniband/hw/mlx5/fs.h b/drivers/infiniband/hw/mlx5/fs.h index 2ebe86e5be10..7abba0e2837c 100644 --- a/drivers/infiniband/hw/mlx5/fs.h +++ b/drivers/infiniband/hw/mlx5/fs.h @@ -13,6 +13,8 @@ void mlx5_ib_fs_cleanup_anchor(struct mlx5_ib_dev *dev); static inline void mlx5_ib_fs_cleanup(struct mlx5_ib_dev *dev) { + int i; + /* When a steering anchor is created, a special flow table is also * created for the user to reference. Since the user can reference it, * the kernel cannot trust that when the user destroys the steering @@ -25,8 +27,10 @@ static inline void mlx5_ib_fs_cleanup(struct mlx5_ib_dev *dev) * is a safe assumption that all references are gone. */ mlx5_ib_fs_cleanup_anchor(dev); - kfree(dev->flow_db->rdma_transport_tx); - kfree(dev->flow_db->rdma_transport_rx); + for (i = 0; i < MLX5_RDMA_TRANSPORT_BYPASS_PRIO; i++) + kfree(dev->flow_db->rdma_transport_tx[i]); + for (i = 0; i < MLX5_RDMA_TRANSPORT_BYPASS_PRIO; i++) + kfree(dev->flow_db->rdma_transport_rx[i]); kfree(dev->flow_db); } #endif /* _MLX5_IB_FS_H */ diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index 49af1cfbe6d1..cc8859d3c2f5 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -88,7 +88,8 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) else return mlx5_ib_set_vport_rep(lag_master, rep, vport_index); - ibdev = ib_alloc_device(mlx5_ib_dev, ib_dev); + ibdev = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev, + mlx5_core_net(lag_master)); if (!ibdev) return -ENOMEM; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index df6557ddbdfc..d456e4fde3e1 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -50,6 +50,7 @@ #include <rdma/ib_ucaps.h> #include "macsec.h" #include "data_direct.h" +#include "dmah.h" #define UVERBS_MODULE_NAME mlx5_ib #include <rdma/uverbs_named_ioctl.h> @@ -4190,7 +4191,9 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .modify_port = mlx5_ib_modify_port, .modify_qp = mlx5_ib_modify_qp, .modify_srq = mlx5_ib_modify_srq, + .pre_destroy_cq = mlx5_ib_pre_destroy_cq, .poll_cq = mlx5_ib_poll_cq, + .post_destroy_cq = mlx5_ib_post_destroy_cq, .post_recv = mlx5_ib_post_recv_nodrain, .post_send = mlx5_ib_post_send_nodrain, .post_srq_recv = mlx5_ib_post_srq_recv, @@ -4212,6 +4215,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs), INIT_RDMA_OBJ_SIZE(ib_cq, mlx5_ib_cq, ibcq), + INIT_RDMA_OBJ_SIZE(ib_dmah, mlx5_ib_dmah, ibdmah), INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_qp, mlx5_ib_qp, ibqp), INIT_RDMA_OBJ_SIZE(ib_srq, mlx5_ib_srq, ibsrq), @@ -4339,6 +4343,9 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM) ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dm_ops); + if (mdev->st) + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dmah_ops); + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops); if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)) @@ -4824,7 +4831,8 @@ static struct ib_device 
*mlx5_ib_add_sub_dev(struct ib_device *parent, !MLX5_CAP_GEN_2(mparent->mdev, multiplane_qp_ud)) return ERR_PTR(-EOPNOTSUPP); - mplane = ib_alloc_device(mlx5_ib_dev, ib_dev); + mplane = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev, + mlx5_core_net(mparent->mdev)); if (!mplane) return ERR_PTR(-ENOMEM); @@ -4938,7 +4946,8 @@ static int mlx5r_probe(struct auxiliary_device *adev, num_ports = max(MLX5_CAP_GEN(mdev, num_ports), MLX5_CAP_GEN(mdev, num_vhca_ports)); - dev = ib_alloc_device(mlx5_ib_dev, ib_dev); + dev = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev, + mlx5_core_net(mdev)); if (!dev) return -ENOMEM; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index fde859d207ae..7ffc7ee92cf0 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -104,19 +104,6 @@ unsigned long __mlx5_umem_find_best_quantized_pgoff( __mlx5_bit_sz(typ, page_offset_fld), 0, scale, \ page_offset_quantized) -static inline unsigned long -mlx5_umem_dmabuf_find_best_pgsz(struct ib_umem_dmabuf *umem_dmabuf) -{ - /* - * mkeys used for dmabuf are fixed at PAGE_SIZE because we must be able - * to hold any sgl after a move operation. Ideally the mkc page size - * could be changed at runtime to be optimal, but right now the driver - * cannot do that. - */ - return ib_umem_find_best_pgsz(&umem_dmabuf->umem, PAGE_SIZE, - umem_dmabuf->umem.iova); -} - enum { MLX5_IB_MMAP_OFFSET_START = 9, MLX5_IB_MMAP_OFFSET_END = 255, @@ -320,8 +307,8 @@ struct mlx5_ib_flow_db { struct mlx5_ib_flow_prio rdma_tx[MLX5_IB_NUM_FLOW_FT]; struct mlx5_ib_flow_prio opfcs[MLX5_IB_OPCOUNTER_MAX]; struct mlx5_flow_table *lag_demux_ft; - struct mlx5_ib_flow_prio *rdma_transport_rx; - struct mlx5_ib_flow_prio *rdma_transport_tx; + struct mlx5_ib_flow_prio *rdma_transport_rx[MLX5_RDMA_TRANSPORT_BYPASS_PRIO]; + struct mlx5_ib_flow_prio *rdma_transport_tx[MLX5_RDMA_TRANSPORT_BYPASS_PRIO]; /* Protect flow steering bypass flow tables * when add/del flow rules. * only single add/removal of flow steering rule could be done @@ -352,6 +339,7 @@ struct mlx5_ib_flow_db { #define MLX5_IB_UPD_XLT_ACCESS BIT(5) #define MLX5_IB_UPD_XLT_INDIRECT BIT(6) #define MLX5_IB_UPD_XLT_DOWNGRADE BIT(7) +#define MLX5_IB_UPD_XLT_KEEP_PGSZ BIT(8) /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags. 
* @@ -650,8 +638,13 @@ enum mlx5_mkey_type { MLX5_MKEY_IMPLICIT_CHILD, }; +/* Used for non-existent ph value */ +#define MLX5_IB_NO_PH 0xff + struct mlx5r_cache_rb_key { u8 ats:1; + u8 ph; + u16 st_index; unsigned int access_mode; unsigned int access_flags; unsigned int ndescs; @@ -739,6 +732,8 @@ struct mlx5_ib_mr { struct mlx5_ib_mr *dd_crossed_mr; struct list_head dd_node; u8 revoked :1; + /* Indicates previous dmabuf page fault occurred */ + u8 dmabuf_faulted:1; struct mlx5_ib_mkey null_mmkey; }; }; @@ -899,13 +894,14 @@ void mlx5_ib_fs_remove_op_fc(struct mlx5_ib_dev *dev, struct mlx5_ib_op_fc *opfc, enum mlx5_ib_optional_counter_type type); -int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter, - u32 port); +int mlx5r_fs_bind_op_fc(struct ib_qp *qp, + struct mlx5_fc *fc_arr[MLX5_IB_OPCOUNTER_MAX], + struct xarray *qpn_opfc_xa, u32 port); -void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct rdma_counter *counter); +void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct xarray *qpn_opfc_xa); void mlx5r_fs_destroy_fcs(struct mlx5_ib_dev *dev, - struct rdma_counter *counter); + struct mlx5_fc *fc_arr[MLX5_IB_OPCOUNTER_MAX]); struct mlx5_ib_multiport_info; @@ -1372,16 +1368,20 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct uverbs_attr_bundle *attrs); int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int mlx5_ib_pre_destroy_cq(struct ib_cq *cq); +void mlx5_ib_post_destroy_cq(struct ib_cq *cq); int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, + struct ib_dmah *dmah, struct ib_udata *udata); struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, + struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs); int mlx5_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, @@ -1748,20 +1748,71 @@ static inline u32 smi_to_native_portnum(struct mlx5_ib_dev *dev, u32 port) return (port - 1) / dev->num_ports + 1; } +static inline unsigned int get_max_log_entity_size_cap(struct mlx5_ib_dev *dev, + int access_mode) +{ + int max_log_size = 0; + + if (access_mode == MLX5_MKC_ACCESS_MODE_MTT) + max_log_size = + MLX5_CAP_GEN_2(dev->mdev, max_mkey_log_entity_size_mtt); + else if (access_mode == MLX5_MKC_ACCESS_MODE_KSM) + max_log_size = MLX5_CAP_GEN_2( + dev->mdev, max_mkey_log_entity_size_fixed_buffer); + + if (!max_log_size || + (max_log_size > 31 && + !MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5))) + max_log_size = 31; + + return max_log_size; +} + +static inline unsigned int get_min_log_entity_size_cap(struct mlx5_ib_dev *dev, + int access_mode) +{ + int min_log_size = 0; + + if (access_mode == MLX5_MKC_ACCESS_MODE_KSM && + MLX5_CAP_GEN_2(dev->mdev, + min_mkey_log_entity_size_fixed_buffer_valid)) + min_log_size = MLX5_CAP_GEN_2( + dev->mdev, min_mkey_log_entity_size_fixed_buffer); + else + min_log_size = + MLX5_CAP_GEN_2(dev->mdev, log_min_mkey_entity_size); + + min_log_size = max(min_log_size, MLX5_ADAPTER_PAGE_SHIFT); + return min_log_size; +} + /* * For mkc users, instead of a page_offset the command has a start_iova which * specifies 
both the page_offset and the on-the-wire IOVA */ static __always_inline unsigned long mlx5_umem_mkc_find_best_pgsz(struct mlx5_ib_dev *dev, struct ib_umem *umem, - u64 iova) + u64 iova, int access_mode) { - int page_size_bits = - MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5) ? 6 : 5; - unsigned long bitmap = - __mlx5_log_page_size_to_bitmap(page_size_bits, 0); + unsigned int max_log_entity_size_cap, min_log_entity_size_cap; + unsigned long bitmap; + + max_log_entity_size_cap = get_max_log_entity_size_cap(dev, access_mode); + min_log_entity_size_cap = get_min_log_entity_size_cap(dev, access_mode); + + bitmap = GENMASK_ULL(max_log_entity_size_cap, min_log_entity_size_cap); return ib_umem_find_best_pgsz(umem, bitmap, iova); } +static inline unsigned long +mlx5_umem_dmabuf_find_best_pgsz(struct ib_umem_dmabuf *umem_dmabuf, + int access_mode) +{ + return mlx5_umem_mkc_find_best_pgsz(to_mdev(umem_dmabuf->umem.ibdev), + &umem_dmabuf->umem, + umem_dmabuf->umem.iova, + access_mode); +} + #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index bd35e75d9ce5..1317f2cb38a4 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -44,6 +44,7 @@ #include "mlx5_ib.h" #include "umr.h" #include "data_direct.h" +#include "dmah.h" enum { MAX_PENDING_REG_MR = 8, @@ -57,7 +58,7 @@ create_mkey_callback(int status, struct mlx5_async_work *context); static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags, unsigned long page_size, bool populate, - int access_mode); + int access_mode, u16 st_index, u8 ph); static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr); static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr, @@ -256,6 +257,14 @@ static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc) get_mkc_octo_size(ent->rb_key.access_mode, ent->rb_key.ndescs)); MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); + + if (ent->rb_key.ph != MLX5_IB_NO_PH) { + MLX5_SET(mkc, mkc, pcie_tph_en, 1); + MLX5_SET(mkc, mkc, pcie_tph_ph, ent->rb_key.ph); + if (ent->rb_key.st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX) + MLX5_SET(mkc, mkc, pcie_tph_steering_tag_index, + ent->rb_key.st_index); + } } /* Asynchronously schedule new MRs to be populated in the cache. */ @@ -641,6 +650,14 @@ static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1, if (res) return res; + res = key1.st_index - key2.st_index; + if (res) + return res; + + res = key1.ph - key2.ph; + if (res) + return res; + /* * keep ndescs the last in the compare table since the find function * searches for an exact match on all properties and only closest @@ -712,6 +729,8 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev, smallest->rb_key.access_mode == rb_key.access_mode && smallest->rb_key.access_flags == rb_key.access_flags && smallest->rb_key.ats == rb_key.ats && + smallest->rb_key.st_index == rb_key.st_index && + smallest->rb_key.ph == rb_key.ph && smallest->rb_key.ndescs <= ndescs_limit) ? 
smallest : NULL; @@ -786,7 +805,8 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, struct mlx5r_cache_rb_key rb_key = { .ndescs = ndescs, .access_mode = access_mode, - .access_flags = get_unchangeable_access_flags(dev, access_flags) + .access_flags = get_unchangeable_access_flags(dev, access_flags), + .ph = MLX5_IB_NO_PH, }; struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key); @@ -943,6 +963,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) struct rb_root *root = &dev->cache.rb_root; struct mlx5r_cache_rb_key rb_key = { .access_mode = MLX5_MKC_ACCESS_MODE_MTT, + .ph = MLX5_IB_NO_PH, }; struct mlx5_cache_ent *ent; struct rb_node *node; @@ -1119,7 +1140,8 @@ static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem, static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, struct ib_umem *umem, u64 iova, - int access_flags, int access_mode) + int access_flags, int access_mode, + u16 st_index, u8 ph) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5r_cache_rb_key rb_key = {}; @@ -1130,7 +1152,8 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, if (umem->is_dmabuf) page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova); else - page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova); + page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova, + access_mode); if (WARN_ON(!page_size)) return ERR_PTR(-EINVAL); @@ -1138,6 +1161,8 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size); rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags); rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags); + rb_key.st_index = st_index; + rb_key.ph = ph; ent = mkey_cache_ent_from_rb_key(dev, rb_key); /* * If the MR can't come from the cache then synchronously create an uncached @@ -1145,7 +1170,8 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, */ if (!ent) { mutex_lock(&dev->slow_path_mutex); - mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode); + mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode, + st_index, ph); mutex_unlock(&dev->slow_path_mutex); if (IS_ERR(mr)) return mr; @@ -1230,7 +1256,7 @@ err_1: static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags, unsigned long page_size, bool populate, - int access_mode) + int access_mode, u16 st_index, u8 ph) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr; @@ -1240,7 +1266,8 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u32 *in; int err; bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) && - (access_mode == MLX5_MKC_ACCESS_MODE_MTT); + (access_mode == MLX5_MKC_ACCESS_MODE_MTT) && + (ph == MLX5_IB_NO_PH); bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM); if (!page_size) @@ -1304,6 +1331,13 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, get_octo_len(iova, umem->length, mr->page_shift)); } + if (ph != MLX5_IB_NO_PH) { + MLX5_SET(mkc, mkc, pcie_tph_en, 1); + MLX5_SET(mkc, mkc, pcie_tph_ph, ph); + if (st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX) + MLX5_SET(mkc, mkc, pcie_tph_steering_tag_index, st_index); + } + err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); if (err) { mlx5_ib_warn(dev, "create mkey failed\n"); @@ -1423,24 +1457,37 @@ struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, } static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem 
*umem, - u64 iova, int access_flags) + u64 iova, int access_flags, + struct ib_dmah *dmah) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr = NULL; bool xlt_with_umr; + u16 st_index = MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX; + u8 ph = MLX5_IB_NO_PH; int err; + if (dmah) { + struct mlx5_ib_dmah *mdmah = to_mdmah(dmah); + + ph = dmah->ph; + if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS)) + st_index = mdmah->st_index; + } + xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length); if (xlt_with_umr) { mr = alloc_cacheable_mr(pd, umem, iova, access_flags, - MLX5_MKC_ACCESS_MODE_MTT); + MLX5_MKC_ACCESS_MODE_MTT, + st_index, ph); } else { - unsigned long page_size = - mlx5_umem_mkc_find_best_pgsz(dev, umem, iova); + unsigned long page_size = mlx5_umem_mkc_find_best_pgsz( + dev, umem, iova, MLX5_MKC_ACCESS_MODE_MTT); mutex_lock(&dev->slow_path_mutex); mr = reg_create(pd, umem, iova, access_flags, page_size, - true, MLX5_MKC_ACCESS_MODE_MTT); + true, MLX5_MKC_ACCESS_MODE_MTT, + st_index, ph); mutex_unlock(&dev->slow_path_mutex); } if (IS_ERR(mr)) { @@ -1504,7 +1551,9 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, return ERR_CAST(odp); mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags, - MLX5_MKC_ACCESS_MODE_MTT); + MLX5_MKC_ACCESS_MODE_MTT, + MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX, + MLX5_IB_NO_PH); if (IS_ERR(mr)) { ib_umem_release(&odp->umem); return ERR_CAST(mr); @@ -1528,13 +1577,15 @@ err_dereg_mr: struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 iova, int access_flags, + struct ib_dmah *dmah, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct ib_umem *umem; int err; - if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || + ((access_flags & IB_ACCESS_ON_DEMAND) && dmah)) return ERR_PTR(-EOPNOTSUPP); mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", @@ -1550,7 +1601,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, umem = ib_umem_get(&dev->ib_dev, start, length, access_flags); if (IS_ERR(umem)) return ERR_CAST(umem); - return create_real_mr(pd, umem, iova, access_flags); + return create_real_mr(pd, umem, iova, access_flags, dmah); } static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) @@ -1575,12 +1626,15 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { static struct ib_mr * reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device, u64 offset, u64 length, u64 virt_addr, - int fd, int access_flags, int access_mode) + int fd, int access_flags, int access_mode, + struct ib_dmah *dmah) { bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM); struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr = NULL; struct ib_umem_dmabuf *umem_dmabuf; + u16 st_index = MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX; + u8 ph = MLX5_IB_NO_PH; int err; err = mlx5r_umr_resource_init(dev); @@ -1603,8 +1657,17 @@ reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device, return ERR_CAST(umem_dmabuf); } + if (dmah) { + struct mlx5_ib_dmah *mdmah = to_mdmah(dmah); + + ph = dmah->ph; + if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS)) + st_index = mdmah->st_index; + } + mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr, - access_flags, access_mode); + access_flags, access_mode, + st_index, ph); if (IS_ERR(mr)) { ib_umem_release(&umem_dmabuf->umem); return ERR_CAST(mr); @@ -1661,7 +1724,8 @@ 
reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset, access_flags &= ~IB_ACCESS_RELAXED_ORDERING; crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev, offset, length, virt_addr, fd, - access_flags, MLX5_MKC_ACCESS_MODE_KSM); + access_flags, MLX5_MKC_ACCESS_MODE_KSM, + NULL); if (IS_ERR(crossed_mr)) { ret = PTR_ERR(crossed_mr); goto end; @@ -1688,6 +1752,7 @@ end: struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, u64 length, u64 virt_addr, int fd, int access_flags, + struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs) { struct mlx5_ib_dev *dev = to_mdev(pd->device); @@ -1720,7 +1785,8 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, return reg_user_mr_dmabuf(pd, pd->device->dma_device, offset, length, virt_addr, - fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT); + fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT, + dmah); } /* @@ -1754,7 +1820,8 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, if (!mlx5r_umr_can_load_pas(dev, new_umem->length)) return false; - *page_size = mlx5_umem_mkc_find_best_pgsz(dev, new_umem, iova); + *page_size = mlx5_umem_mkc_find_best_pgsz( + dev, new_umem, iova, mr->mmkey.cache_ent->rb_key.access_mode); if (WARN_ON(!*page_size)) return false; return (mr->mmkey.cache_ent->rb_key.ndescs) >= @@ -1817,7 +1884,8 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, struct mlx5_ib_mr *mr = to_mmr(ib_mr); int err; - if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct) + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct || + mr->mmkey.rb_key.ph != MLX5_IB_NO_PH) return ERR_PTR(-EOPNOTSUPP); mlx5_ib_dbg( @@ -1861,7 +1929,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages); return create_real_mr(new_pd, umem, mr->ibmr.iova, - new_access_flags); + new_access_flags, NULL); } /* @@ -1892,7 +1960,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, } return NULL; } - return create_real_mr(new_pd, new_umem, iova, new_access_flags); + return create_real_mr(new_pd, new_umem, iova, new_access_flags, NULL); } /* @@ -1901,7 +1969,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, */ recreate: return mlx5_ib_reg_user_mr(new_pd, start, length, iova, - new_access_flags, udata); + new_access_flags, NULL, udata); } static int diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index f6abd64f07f7..0e8ae85af5a6 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -836,9 +836,13 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, u32 *bytes_mapped, u32 flags) { struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem); + int access_mode = mr->data_direct ? 
MLX5_MKC_ACCESS_MODE_KSM : + MLX5_MKC_ACCESS_MODE_MTT; + unsigned int old_page_shift = mr->page_shift; + unsigned int page_shift; + unsigned long page_size; u32 xlt_flags = 0; int err; - unsigned long page_size; if (flags & MLX5_PF_FLAGS_ENABLE) xlt_flags |= MLX5_IB_UPD_XLT_ENABLE; @@ -850,20 +854,33 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, return err; } - page_size = mlx5_umem_dmabuf_find_best_pgsz(umem_dmabuf); + page_size = mlx5_umem_dmabuf_find_best_pgsz(umem_dmabuf, access_mode); if (!page_size) { ib_umem_dmabuf_unmap_pages(umem_dmabuf); err = -EINVAL; } else { - if (mr->data_direct) - err = mlx5r_umr_update_data_direct_ksm_pas(mr, xlt_flags); - else - err = mlx5r_umr_update_mr_pas(mr, xlt_flags); + page_shift = order_base_2(page_size); + if (page_shift != mr->page_shift && mr->dmabuf_faulted) { + err = mlx5r_umr_dmabuf_update_pgsz(mr, xlt_flags, + page_shift); + } else { + mr->page_shift = page_shift; + if (mr->data_direct) + err = mlx5r_umr_update_data_direct_ksm_pas( + mr, xlt_flags); + else + err = mlx5r_umr_update_mr_pas(mr, + xlt_flags); + } } dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); - if (err) + if (err) { + mr->page_shift = old_page_shift; return err; + } + + mr->dmabuf_faulted = 1; if (bytes_mapped) *bytes_mapped += bcnt; @@ -1866,6 +1883,7 @@ int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev) struct mlx5r_cache_rb_key rb_key = { .access_mode = MLX5_MKC_ACCESS_MODE_KSM, .ndescs = mlx5_imr_ksm_entries, + .ph = MLX5_IB_NO_PH, }; struct mlx5_cache_ent *ent; diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index 5be4426a2884..7ef35cddce81 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -32,13 +32,15 @@ static __be64 get_umr_disable_mr_mask(void) return cpu_to_be64(result); } -static __be64 get_umr_update_translation_mask(void) +static __be64 get_umr_update_translation_mask(struct mlx5_ib_dev *dev) { u64 result; result = MLX5_MKEY_MASK_LEN | MLX5_MKEY_MASK_PAGE_SIZE | MLX5_MKEY_MASK_START_ADDR; + if (MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5)) + result |= MLX5_MKEY_MASK_PAGE_SIZE_5; return cpu_to_be64(result); } @@ -654,9 +656,12 @@ static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev, flags & MLX5_IB_UPD_XLT_ENABLE || flags & MLX5_IB_UPD_XLT_ADDR; if (update_translation) { - wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask(); + wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask(dev); if (!mr->ibmr.length) MLX5_SET(mkc, &wqe->mkey_seg, length64, 1); + if (flags & MLX5_IB_UPD_XLT_KEEP_PGSZ) + wqe->ctrl_seg.mkey_mask &= + cpu_to_be64(~MLX5_MKEY_MASK_PAGE_SIZE); } wqe->ctrl_seg.xlt_octowords = @@ -664,46 +669,78 @@ static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev, wqe->data_seg.byte_count = cpu_to_be32(sg->length); } +static void +_mlx5r_umr_init_wqe(struct mlx5_ib_mr *mr, struct mlx5r_umr_wqe *wqe, + struct ib_sge *sg, unsigned int flags, + unsigned int page_shift, bool dd) +{ + struct mlx5_ib_dev *dev = mr_to_mdev(mr); + + mlx5r_umr_set_update_xlt_ctrl_seg(&wqe->ctrl_seg, flags, sg); + mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe->mkey_seg, mr, page_shift); + if (dd) /* Use the data direct internal kernel PD */ + MLX5_SET(mkc, &wqe->mkey_seg, pd, dev->ddr.pdn); + mlx5r_umr_set_update_xlt_data_seg(&wqe->data_seg, sg); +} + static int -_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd) +_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd, + size_t start_block, size_t nblocks) 
{ size_t ent_size = dd ? sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt); struct mlx5_ib_dev *dev = mr_to_mdev(mr); struct device *ddev = &dev->mdev->pdev->dev; struct mlx5r_umr_wqe wqe = {}; + size_t processed_blocks = 0; struct ib_block_iter biter; + size_t cur_block_idx = 0; struct mlx5_ksm *cur_ksm; struct mlx5_mtt *cur_mtt; size_t orig_sg_length; + size_t total_blocks; size_t final_size; void *curr_entry; struct ib_sge sg; void *entry; - u64 offset = 0; + u64 offset; int err = 0; - entry = mlx5r_umr_create_xlt(dev, &sg, - ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift), - ent_size, flags); + total_blocks = ib_umem_num_dma_blocks(mr->umem, 1UL << mr->page_shift); + if (start_block > total_blocks) + return -EINVAL; + + /* nblocks 0 means update all blocks starting from start_block */ + if (nblocks) + total_blocks = nblocks; + + entry = mlx5r_umr_create_xlt(dev, &sg, total_blocks, ent_size, flags); if (!entry) return -ENOMEM; orig_sg_length = sg.length; - mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg); - mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr, - mr->page_shift); - if (dd) { - /* Use the data direct internal kernel PD */ - MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn); + + _mlx5r_umr_init_wqe(mr, &wqe, &sg, flags, mr->page_shift, dd); + + /* Set initial translation offset to start_block */ + offset = (u64)start_block * ent_size; + mlx5r_umr_update_offset(&wqe.ctrl_seg, offset); + + if (dd) cur_ksm = entry; - } else { + else cur_mtt = entry; - } - - mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg); curr_entry = entry; + rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) { + if (cur_block_idx < start_block) { + cur_block_idx++; + continue; + } + + if (nblocks && processed_blocks >= nblocks) + break; + if (curr_entry == entry + sg.length) { dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); @@ -725,6 +762,11 @@ _mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd) if (dd) { cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter)); cur_ksm->key = cpu_to_be32(dev->ddr.mkey); + if (mr->umem->is_dmabuf && + (flags & MLX5_IB_UPD_XLT_ZAP)) { + cur_ksm->va = 0; + cur_ksm->key = 0; + } cur_ksm++; curr_entry = cur_ksm; } else { @@ -736,6 +778,8 @@ _mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd) cur_mtt++; curr_entry = cur_mtt; } + + processed_blocks++; } final_size = curr_entry - entry; @@ -752,13 +796,32 @@ err: return err; } -int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags) +int mlx5r_umr_update_data_direct_ksm_pas_range(struct mlx5_ib_mr *mr, + unsigned int flags, + size_t start_block, + size_t nblocks) { /* No invalidation flow is expected */ - if (WARN_ON(!mr->umem->is_dmabuf) || (flags & MLX5_IB_UPD_XLT_ZAP)) + if (WARN_ON(!mr->umem->is_dmabuf) || ((flags & MLX5_IB_UPD_XLT_ZAP) && + !(flags & MLX5_IB_UPD_XLT_KEEP_PGSZ))) return -EINVAL; - return _mlx5r_umr_update_mr_pas(mr, flags, true); + return _mlx5r_umr_update_mr_pas(mr, flags, true, start_block, nblocks); +} + +int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, + unsigned int flags) +{ + return mlx5r_umr_update_data_direct_ksm_pas_range(mr, flags, 0, 0); +} + +int mlx5r_umr_update_mr_pas_range(struct mlx5_ib_mr *mr, unsigned int flags, + size_t start_block, size_t nblocks) +{ + if (WARN_ON(mr->umem->is_odp)) + return -EINVAL; + + return _mlx5r_umr_update_mr_pas(mr, flags, false, start_block, nblocks); } /* @@ -768,10 +831,7 @@ int 
mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int fla */ int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) { - if (WARN_ON(mr->umem->is_odp)) - return -EINVAL; - - return _mlx5r_umr_update_mr_pas(mr, flags, false); + return mlx5r_umr_update_mr_pas_range(mr, flags, 0, 0); } static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) @@ -864,3 +924,202 @@ int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, mlx5r_umr_unmap_free_xlt(dev, xlt, &sg); return err; } + +/* + * Update only the page-size (log_page_size) field of an existing memory key + * using UMR. This is useful when the MR's physical layout stays the same + * but the optimal page shift has changed (e.g. dmabuf after pages are + * pinned and the HW can switch from 4K to huge-page alignment). + */ +int mlx5r_umr_update_mr_page_shift(struct mlx5_ib_mr *mr, + unsigned int page_shift, + bool dd) +{ + struct mlx5_ib_dev *dev = mr_to_mdev(mr); + struct mlx5r_umr_wqe wqe = {}; + int err; + + /* Build UMR wqe: we touch only PAGE_SIZE, so use the dedicated mask */ + wqe.ctrl_seg.mkey_mask = get_umr_update_translation_mask(dev); + + /* MR must be free while page size is modified */ + wqe.ctrl_seg.flags = MLX5_UMR_CHECK_FREE | MLX5_UMR_INLINE; + + /* Fill mkey segment with the new page size, keep the rest unchanged */ + MLX5_SET(mkc, &wqe.mkey_seg, log_page_size, page_shift); + + if (dd) + MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn); + else + MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(mr->ibmr.pd)->pdn); + + MLX5_SET64(mkc, &wqe.mkey_seg, start_addr, mr->ibmr.iova); + MLX5_SET64(mkc, &wqe.mkey_seg, len, mr->ibmr.length); + MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff); + MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0, + mlx5_mkey_variant(mr->mmkey.key)); + + err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false); + if (!err) + mr->page_shift = page_shift; + + return err; +} + +static inline int +_mlx5r_dmabuf_umr_update_pas(struct mlx5_ib_mr *mr, unsigned int flags, + size_t start_block, size_t nblocks, bool dd) +{ + if (dd) + return mlx5r_umr_update_data_direct_ksm_pas_range(mr, flags, + start_block, + nblocks); + else + return mlx5r_umr_update_mr_pas_range(mr, flags, start_block, + nblocks); +} + +/** + * This function makes an mkey non-present by zapping the translation entries of + * the mkey by zapping (zeroing out) the first N entries, where N is determined + * by the largest page size supported by the device and the MR length. + * It then updates the mkey's page size to the largest possible value, ensuring + * the MR is completely non-present and safe for further updates. + * It is useful to update the page size of a dmabuf MR on a page fault. + * + * Return: On success, returns the number of entries that were zapped. + * On error, returns a negative error code. + */ +static int _mlx5r_umr_zap_mkey(struct mlx5_ib_mr *mr, + unsigned int flags, + unsigned int page_shift, + size_t *nblocks, + bool dd) +{ + unsigned int old_page_shift = mr->page_shift; + struct mlx5_ib_dev *dev = mr_to_mdev(mr); + unsigned int max_page_shift; + size_t page_shift_nblocks; + unsigned int max_log_size; + int access_mode; + int err; + + access_mode = dd ? 
MLX5_MKC_ACCESS_MODE_KSM : MLX5_MKC_ACCESS_MODE_MTT; + flags |= MLX5_IB_UPD_XLT_KEEP_PGSZ | MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ATOMIC; + max_log_size = get_max_log_entity_size_cap(dev, access_mode); + max_page_shift = order_base_2(mr->ibmr.length); + max_page_shift = min(max(max_page_shift, page_shift), max_log_size); + /* Count blocks in units of max_page_shift, we will zap exactly this + * many to make the whole MR non-present. + * Block size must be aligned to MLX5_UMR_FLEX_ALIGNMENT since it may + * be used as offset into the XLT later on. + */ + *nblocks = ib_umem_num_dma_blocks(mr->umem, 1UL << max_page_shift); + if (dd) + *nblocks = ALIGN(*nblocks, MLX5_UMR_KSM_NUM_ENTRIES_ALIGNMENT); + else + *nblocks = ALIGN(*nblocks, MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT); + page_shift_nblocks = ib_umem_num_dma_blocks(mr->umem, + 1UL << page_shift); + /* If the number of blocks at max possible page shift is greater than + * the number of blocks at the new page size, we should just go over the + * whole mkey entries. + */ + if (*nblocks >= page_shift_nblocks) + *nblocks = 0; + + /* Make the first nblocks entries non-present without changing + * page size yet. + */ + if (*nblocks) + mr->page_shift = max_page_shift; + err = _mlx5r_dmabuf_umr_update_pas(mr, flags, 0, *nblocks, dd); + if (err) { + mr->page_shift = old_page_shift; + return err; + } + + /* Change page size to the max page size now that the MR is completely + * non-present. + */ + if (*nblocks) { + err = mlx5r_umr_update_mr_page_shift(mr, max_page_shift, dd); + if (err) { + mr->page_shift = old_page_shift; + return err; + } + } + + return 0; +} + +/** + * mlx5r_umr_dmabuf_update_pgsz - Safely update DMABUF MR page size and its + * entries accordingly + * @mr: The memory region to update + * @xlt_flags: Translation table update flags + * @page_shift: The new (optimized) page shift to use + * + * This function updates the page size and mkey translation entries for a DMABUF + * MR in a safe, multi-step process to avoid exposing partially updated mappings + * The update is performed in 5 steps: + * 1. Make the first X entries non-present, while X is calculated to be + * minimal according to a large page shift that can be used to cover the + * MR length. + * 2. Update the page size to the large supported page size + * 3. Load the remaining N-X entries according to the (optimized) page_shift + * 4. Update the page size according to the (optimized) page_shift + * 5. Load the first X entries with the correct translations + * + * This ensures that at no point is the MR accessible with a partially updated + * translation table, maintaining correctness and preventing access to stale or + * inconsistent mappings. + * + * Returns 0 on success or a negative error code on failure. + */ +int mlx5r_umr_dmabuf_update_pgsz(struct mlx5_ib_mr *mr, u32 xlt_flags, + unsigned int page_shift) +{ + unsigned int old_page_shift = mr->page_shift; + size_t zapped_blocks; + size_t total_blocks; + int err; + + err = _mlx5r_umr_zap_mkey(mr, xlt_flags, page_shift, &zapped_blocks, + mr->data_direct); + if (err) + return err; + + /* _mlx5r_umr_zap_mkey already enables the mkey */ + xlt_flags &= ~MLX5_IB_UPD_XLT_ENABLE; + mr->page_shift = page_shift; + total_blocks = ib_umem_num_dma_blocks(mr->umem, 1UL << mr->page_shift); + if (zapped_blocks && zapped_blocks < total_blocks) { + /* Update PAS according to the new page size but don't update + * the page size in the mkey yet. 
+ */ + err = _mlx5r_dmabuf_umr_update_pas( + mr, + xlt_flags | MLX5_IB_UPD_XLT_KEEP_PGSZ, + zapped_blocks, + total_blocks - zapped_blocks, + mr->data_direct); + if (err) + goto err; + } + + err = mlx5r_umr_update_mr_page_shift(mr, mr->page_shift, + mr->data_direct); + if (err) + goto err; + err = _mlx5r_dmabuf_umr_update_pas(mr, xlt_flags, 0, zapped_blocks, + mr->data_direct); + if (err) + goto err; + + return 0; +err: + mr->page_shift = old_page_shift; + return err; +} diff --git a/drivers/infiniband/hw/mlx5/umr.h b/drivers/infiniband/hw/mlx5/umr.h index 4a02c9b5aad8..e9361f0140e7 100644 --- a/drivers/infiniband/hw/mlx5/umr.h +++ b/drivers/infiniband/hw/mlx5/umr.h @@ -94,9 +94,20 @@ struct mlx5r_umr_wqe { int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr); int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd, int access_flags); -int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags); +int mlx5r_umr_update_data_direct_ksm_pas_range(struct mlx5_ib_mr *mr, + unsigned int flags, + size_t start_block, + size_t nblocks); int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags); +int mlx5r_umr_update_mr_pas_range(struct mlx5_ib_mr *mr, unsigned int flags, + size_t start_block, size_t nblocks); +int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags); int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, int page_shift, int flags); +int mlx5r_umr_update_mr_page_shift(struct mlx5_ib_mr *mr, + unsigned int page_shift, + bool dd); +int mlx5r_umr_dmabuf_update_pgsz(struct mlx5_ib_mr *mr, u32 xlt_flags, + unsigned int page_shift); #endif /* _MLX5_IB_UMR_H */ |