Diffstat (limited to 'drivers/infiniband/hw/mlx5')
-rw-r--r--  drivers/infiniband/hw/mlx5/Makefile          3
-rw-r--r--  drivers/infiniband/hw/mlx5/ah.c             13
-rw-r--r--  drivers/infiniband/hw/mlx5/cmd.c            33
-rw-r--r--  drivers/infiniband/hw/mlx5/cmd.h             4
-rw-r--r--  drivers/infiniband/hw/mlx5/cong.c            6
-rw-r--r--  drivers/infiniband/hw/mlx5/counters.c      207
-rw-r--r--  drivers/infiniband/hw/mlx5/counters.h       15
-rw-r--r--  drivers/infiniband/hw/mlx5/cq.c             33
-rw-r--r--  drivers/infiniband/hw/mlx5/data_direct.c   227
-rw-r--r--  drivers/infiniband/hw/mlx5/data_direct.h    23
-rw-r--r--  drivers/infiniband/hw/mlx5/devx.c          138
-rw-r--r--  drivers/infiniband/hw/mlx5/devx.h            9
-rw-r--r--  drivers/infiniband/hw/mlx5/fs.c            730
-rw-r--r--  drivers/infiniband/hw/mlx5/fs.h             17
-rw-r--r--  drivers/infiniband/hw/mlx5/ib_rep.c         22
-rw-r--r--  drivers/infiniband/hw/mlx5/mad.c            78
-rw-r--r--  drivers/infiniband/hw/mlx5/main.c          850
-rw-r--r--  drivers/infiniband/hw/mlx5/mem.c           200
-rw-r--r--  drivers/infiniband/hw/mlx5/mlx5_ib.h       144
-rw-r--r--  drivers/infiniband/hw/mlx5/mr.c            533
-rw-r--r--  drivers/infiniband/hw/mlx5/odp.c           543
-rw-r--r--  drivers/infiniband/hw/mlx5/qp.c             95
-rw-r--r--  drivers/infiniband/hw/mlx5/qp.h              1
-rw-r--r--  drivers/infiniband/hw/mlx5/qpc.c            43
-rw-r--r--  drivers/infiniband/hw/mlx5/restrack.c       38
-rw-r--r--  drivers/infiniband/hw/mlx5/srq.c            17
-rw-r--r--  drivers/infiniband/hw/mlx5/std_types.c      76
-rw-r--r--  drivers/infiniband/hw/mlx5/umr.c           232
-rw-r--r--  drivers/infiniband/hw/mlx5/umr.h             4
-rw-r--r--  drivers/infiniband/hw/mlx5/wr.c              2
30 files changed, 3416 insertions, 920 deletions
diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile
index 72a526236c2e..11878ddf7cc7 100644
--- a/drivers/infiniband/hw/mlx5/Makefile
+++ b/drivers/infiniband/hw/mlx5/Makefile
@@ -6,8 +6,10 @@ mlx5_ib-y := ah.o \
cong.o \
counters.o \
cq.o \
+ data_direct.o \
dm.o \
doorbell.o \
+ fs.o \
gsi.o \
ib_virt.o \
mad.o \
@@ -25,7 +27,6 @@ mlx5_ib-y := ah.o \
mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o
mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o
mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o \
- fs.o \
qos.o \
std_types.o
mlx5_ib-$(CONFIG_MLX5_MACSEC) += macsec.o
diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c
index 505bc47fd575..531a57f9ee7e 100644
--- a/drivers/infiniband/hw/mlx5/ah.c
+++ b/drivers/infiniband/hw/mlx5/ah.c
@@ -50,11 +50,12 @@ static __be16 mlx5_ah_get_udp_sport(const struct mlx5_ib_dev *dev,
return sport;
}
-static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah,
+static int create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah,
struct rdma_ah_init_attr *init_attr)
{
struct rdma_ah_attr *ah_attr = init_attr->ah_attr;
enum ib_gid_type gid_type;
+ int rate_val;
if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) {
const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr);
@@ -67,7 +68,10 @@ static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah,
ah->av.tclass = grh->traffic_class;
}
- ah->av.stat_rate_sl = (rdma_ah_get_static_rate(ah_attr) << 4);
+ rate_val = mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah_attr));
+ if (rate_val < 0)
+ return rate_val;
+ ah->av.stat_rate_sl = rate_val << 4;
if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) {
if (init_attr->xmit_slave)
@@ -88,6 +92,8 @@ static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah,
ah->av.fl_mlid = rdma_ah_get_path_bits(ah_attr) & 0x7f;
ah->av.stat_rate_sl |= (rdma_ah_get_sl(ah_attr) & 0xf);
}
+
+ return 0;
}
int mlx5_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr,
@@ -120,8 +126,7 @@ int mlx5_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr,
return err;
}
- create_ib_ah(dev, ah, init_attr);
- return 0;
+ return create_ib_ah(dev, ah, init_attr);
}
int mlx5_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr)
diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c
index 1d0c8d5e745b..7c08e3008927 100644
--- a/drivers/infiniband/hw/mlx5/cmd.c
+++ b/drivers/infiniband/hw/mlx5/cmd.c
@@ -177,7 +177,7 @@ int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid)
return mlx5_cmd_exec_in(dev, dealloc_xrcd, in);
}
-int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb,
+int mlx5_cmd_mad_ifc(struct mlx5_ib_dev *dev, const void *inb, void *outb,
u16 opmod, u8 port)
{
int outlen = MLX5_ST_SZ_BYTES(mad_ifc_out);
@@ -195,12 +195,18 @@ int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb,
MLX5_SET(mad_ifc_in, in, opcode, MLX5_CMD_OP_MAD_IFC);
MLX5_SET(mad_ifc_in, in, op_mod, opmod);
- MLX5_SET(mad_ifc_in, in, port, port);
+ if (dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) {
+ MLX5_SET(mad_ifc_in, in, plane_index, port);
+ MLX5_SET(mad_ifc_in, in, port,
+ smi_to_native_portnum(dev, port));
+ } else {
+ MLX5_SET(mad_ifc_in, in, port, port);
+ }
data = MLX5_ADDR_OF(mad_ifc_in, in, mad);
memcpy(data, inb, MLX5_FLD_SZ_BYTES(mad_ifc_in, mad));
- err = mlx5_cmd_exec_inout(dev, mad_ifc, in, out);
+ err = mlx5_cmd_exec_inout(dev->mdev, mad_ifc, in, out);
if (err)
goto out;
@@ -239,3 +245,24 @@ int mlx5_cmd_uar_dealloc(struct mlx5_core_dev *dev, u32 uarn, u16 uid)
MLX5_SET(dealloc_uar_in, in, uid, uid);
return mlx5_cmd_exec_in(dev, dealloc_uar, in);
}
+
+int mlx5_cmd_query_vuid(struct mlx5_core_dev *dev, bool data_direct,
+ char *out_vuid)
+{
+ u8 out[MLX5_ST_SZ_BYTES(query_vuid_out) +
+ MLX5_ST_SZ_BYTES(array1024_auto)] = {};
+ u8 in[MLX5_ST_SZ_BYTES(query_vuid_in)] = {};
+ char *vuid;
+ int err;
+
+ MLX5_SET(query_vuid_in, in, opcode, MLX5_CMD_OPCODE_QUERY_VUID);
+ MLX5_SET(query_vuid_in, in, vhca_id, MLX5_CAP_GEN(dev, vhca_id));
+ MLX5_SET(query_vuid_in, in, data_direct, data_direct);
+ err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+ if (err)
+ return err;
+
+ vuid = MLX5_ADDR_OF(query_vuid_out, out, vuid);
+ memcpy(out_vuid, vuid, MLX5_ST_SZ_BYTES(array1024_auto));
+ return 0;
+}
diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h
index 93a971a40d11..e6c88b6ebd0d 100644
--- a/drivers/infiniband/hw/mlx5/cmd.h
+++ b/drivers/infiniband/hw/mlx5/cmd.h
@@ -54,8 +54,10 @@ int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid,
u32 qpn, u16 uid);
int mlx5_cmd_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn, u16 uid);
int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid);
-int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb,
+int mlx5_cmd_mad_ifc(struct mlx5_ib_dev *dev, const void *inb, void *outb,
u16 opmod, u8 port);
int mlx5_cmd_uar_alloc(struct mlx5_core_dev *dev, u32 *uarn, u16 uid);
int mlx5_cmd_uar_dealloc(struct mlx5_core_dev *dev, u32 uarn, u16 uid);
+int mlx5_cmd_query_vuid(struct mlx5_core_dev *dev, bool data_direct,
+ char *out_vuid);
#endif /* MLX5_IB_CMD_H */
diff --git a/drivers/infiniband/hw/mlx5/cong.c b/drivers/infiniband/hw/mlx5/cong.c
index f87531318feb..a78a067e3ce7 100644
--- a/drivers/infiniband/hw/mlx5/cong.c
+++ b/drivers/infiniband/hw/mlx5/cong.c
@@ -458,6 +458,12 @@ void mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u32 port_num)
dbg_cc_params->root = debugfs_create_dir("cc_params", mlx5_debugfs_get_dev_root(mdev));
for (i = 0; i < MLX5_IB_DBG_CC_MAX; i++) {
+ if ((i == MLX5_IB_DBG_CC_GENERAL_RTT_RESP_DSCP_VALID ||
+ i == MLX5_IB_DBG_CC_GENERAL_RTT_RESP_DSCP))
+ if (!MLX5_CAP_GEN(mdev, roce) ||
+ !MLX5_CAP_ROCE(mdev, roce_cc_general))
+ continue;
+
dbg_cc_params->params[i].offset = i;
dbg_cc_params->params[i].dev = dev;
dbg_cc_params->params[i].port_num = port_num;
diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c
index 8300ce622835..b847084dcd99 100644
--- a/drivers/infiniband/hw/mlx5/counters.c
+++ b/drivers/infiniband/hw/mlx5/counters.c
@@ -83,6 +83,8 @@ static const struct mlx5_ib_counter extended_err_cnts[] = {
INIT_Q_COUNTER(resp_remote_access_errors),
INIT_Q_COUNTER(resp_cqe_flush_error),
INIT_Q_COUNTER(req_cqe_flush_error),
+ INIT_Q_COUNTER(req_transport_retries_exceeded),
+ INIT_Q_COUNTER(req_rnr_retries_exceeded),
};
static const struct mlx5_ib_counter roce_accl_cnts[] = {
@@ -102,6 +104,8 @@ static const struct mlx5_ib_counter vport_extended_err_cnts[] = {
INIT_VPORT_Q_COUNTER(resp_remote_access_errors),
INIT_VPORT_Q_COUNTER(resp_cqe_flush_error),
INIT_VPORT_Q_COUNTER(req_cqe_flush_error),
+ INIT_VPORT_Q_COUNTER(req_transport_retries_exceeded),
+ INIT_VPORT_Q_COUNTER(req_rnr_retries_exceeded),
};
static const struct mlx5_ib_counter vport_roce_accl_cnts[] = {
@@ -136,6 +140,13 @@ static const struct mlx5_ib_counter rdmatx_cnp_op_cnts[] = {
INIT_OP_COUNTER(cc_tx_cnp_pkts, CC_TX_CNP_PKTS),
};
+static const struct mlx5_ib_counter packets_op_cnts[] = {
+ INIT_OP_COUNTER(rdma_tx_packets, RDMA_TX_PACKETS),
+ INIT_OP_COUNTER(rdma_tx_bytes, RDMA_TX_BYTES),
+ INIT_OP_COUNTER(rdma_rx_packets, RDMA_RX_PACKETS),
+ INIT_OP_COUNTER(rdma_rx_bytes, RDMA_RX_BYTES),
+};
+
static int mlx5_ib_read_counters(struct ib_counters *counters,
struct ib_counters_read_attr *read_attr,
struct uverbs_attr_bundle *attrs)
@@ -423,6 +434,52 @@ done:
return num_counters;
}
+static bool is_rdma_bytes_counter(u32 type)
+{
+ if (type == MLX5_IB_OPCOUNTER_RDMA_TX_BYTES ||
+ type == MLX5_IB_OPCOUNTER_RDMA_RX_BYTES ||
+ type == MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP ||
+ type == MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP)
+ return true;
+
+ return false;
+}
+
+static int do_per_qp_get_op_stat(struct rdma_counter *counter)
+{
+ struct mlx5_ib_dev *dev = to_mdev(counter->device);
+ const struct mlx5_ib_counters *cnts = get_counters(dev, counter->port);
+ struct mlx5_rdma_counter *mcounter = to_mcounter(counter);
+ int i, ret, index, num_hw_counters;
+ u64 packets = 0, bytes = 0;
+
+ for (i = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP;
+ i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; i++) {
+ if (!mcounter->fc[i])
+ continue;
+
+ ret = mlx5_fc_query(dev->mdev, mcounter->fc[i],
+ &packets, &bytes);
+ if (ret)
+ return ret;
+
+ num_hw_counters = cnts->num_q_counters +
+ cnts->num_cong_counters +
+ cnts->num_ext_ppcnt_counters;
+
+ index = i - MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP +
+ num_hw_counters;
+
+ if (is_rdma_bytes_counter(i))
+ counter->stats->value[index] = bytes;
+ else
+ counter->stats->value[index] = packets;
+
+ clear_bit(index, counter->stats->is_disabled);
+ }
+ return 0;
+}
+
static int do_get_op_stat(struct ib_device *ibdev,
struct rdma_hw_stats *stats,
u32 port_num, int index)
@@ -430,7 +487,7 @@ static int do_get_op_stat(struct ib_device *ibdev,
struct mlx5_ib_dev *dev = to_mdev(ibdev);
const struct mlx5_ib_counters *cnts;
const struct mlx5_ib_op_fc *opfcs;
- u64 packets = 0, bytes;
+ u64 packets, bytes;
u32 type;
int ret;
@@ -449,8 +506,11 @@ static int do_get_op_stat(struct ib_device *ibdev,
if (ret)
return ret;
+ if (is_rdma_bytes_counter(type))
+ stats->value[index] = bytes;
+ else
+ stats->value[index] = packets;
out:
- stats->value[index] = packets;
return index;
}
@@ -519,19 +579,30 @@ static int mlx5_ib_counter_update_stats(struct rdma_counter *counter)
{
struct mlx5_ib_dev *dev = to_mdev(counter->device);
const struct mlx5_ib_counters *cnts = get_counters(dev, counter->port);
+ int ret;
+
+ ret = mlx5_ib_query_q_counters(dev->mdev, cnts, counter->stats,
+ counter->id);
+ if (ret)
+ return ret;
+
+ if (!counter->mode.bind_opcnt)
+ return 0;
- return mlx5_ib_query_q_counters(dev->mdev, cnts,
- counter->stats, counter->id);
+ return do_per_qp_get_op_stat(counter);
}
static int mlx5_ib_counter_dealloc(struct rdma_counter *counter)
{
+ struct mlx5_rdma_counter *mcounter = to_mcounter(counter);
struct mlx5_ib_dev *dev = to_mdev(counter->device);
u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {};
if (!counter->id)
return 0;
+ WARN_ON(!xa_empty(&mcounter->qpn_opfc_xa));
+ mlx5r_fs_destroy_fcs(dev, counter);
MLX5_SET(dealloc_q_counter_in, in, opcode,
MLX5_CMD_OP_DEALLOC_Q_COUNTER);
MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter->id);
@@ -539,9 +610,10 @@ static int mlx5_ib_counter_dealloc(struct rdma_counter *counter)
}
static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
- struct ib_qp *qp)
+ struct ib_qp *qp, u32 port)
{
struct mlx5_ib_dev *dev = to_mdev(qp->device);
+ bool new = false;
int err;
if (!counter->id) {
@@ -556,24 +628,46 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
return err;
counter->id =
MLX5_GET(alloc_q_counter_out, out, counter_set_id);
+ new = true;
}
err = mlx5_ib_qp_set_counter(qp, counter);
if (err)
goto fail_set_counter;
+ err = mlx5r_fs_bind_op_fc(qp, counter, port);
+ if (err)
+ goto fail_bind_op_fc;
+
return 0;
+fail_bind_op_fc:
+ mlx5_ib_qp_set_counter(qp, NULL);
fail_set_counter:
- mlx5_ib_counter_dealloc(counter);
- counter->id = 0;
+ if (new) {
+ mlx5_ib_counter_dealloc(counter);
+ counter->id = 0;
+ }
return err;
}
-static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp)
+static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp, u32 port)
{
- return mlx5_ib_qp_set_counter(qp, NULL);
+ struct rdma_counter *counter = qp->counter;
+ int err;
+
+ mlx5r_fs_unbind_op_fc(qp, counter);
+
+ err = mlx5_ib_qp_set_counter(qp, NULL);
+ if (err)
+ goto fail_set_counter;
+
+ return 0;
+
+fail_set_counter:
+ mlx5r_fs_bind_op_fc(qp, counter, port);
+ return err;
}
static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev,
@@ -673,6 +767,12 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev,
descs[j].priv = &rdmatx_cnp_op_cnts[i].type;
}
}
+
+ for (i = 0; i < ARRAY_SIZE(packets_op_cnts); i++, j++) {
+ descs[j].name = packets_op_cnts[i].name;
+ descs[j].flags |= IB_STAT_FLAG_OPTIONAL;
+ descs[j].priv = &packets_op_cnts[i].type;
+ }
}
@@ -723,6 +823,8 @@ static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev,
num_op_counters = ARRAY_SIZE(basic_op_cnts);
+ num_op_counters += ARRAY_SIZE(packets_op_cnts);
+
if (MLX5_CAP_FLOWTABLE(dev->mdev,
ft_field_support_2_nic_receive_rdma.bth_opcode))
num_op_counters += ARRAY_SIZE(rdmarx_cnp_op_cnts);
@@ -752,10 +854,58 @@ err:
return -ENOMEM;
}
+/*
+ * Checks whether the given flow counter type shares its flow counter with
+ * another type and, if so, whether that other type's counter was already
+ * created. If both conditions are met, return true and set *opfc to the
+ * shared counter; otherwise return false.
+ */
+bool mlx5r_is_opfc_shared_and_in_use(struct mlx5_ib_op_fc *opfcs, u32 type,
+ struct mlx5_ib_op_fc **opfc)
+{
+ u32 shared_fc_type;
+
+ switch (type) {
+ case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS:
+ shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_BYTES;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES:
+ shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS:
+ shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_BYTES;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES:
+ shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP:
+ shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP:
+ shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP:
+ shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP:
+ shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP;
+ break;
+ default:
+ return false;
+ }
+
+ *opfc = &opfcs[shared_fc_type];
+ if (!(*opfc)->fc)
+ return false;
+
+ return true;
+}
+
static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev)
{
u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {};
int num_cnt_ports = dev->num_ports;
+ struct mlx5_ib_op_fc *in_use_opfc;
int i, j;
if (is_mdev_switchdev_mode(dev->mdev))
@@ -777,11 +927,15 @@ static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev)
if (!dev->port[i].cnts.opfcs[j].fc)
continue;
- if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS))
- mlx5_ib_fs_remove_op_fc(dev,
- &dev->port[i].cnts.opfcs[j], j);
+ if (mlx5r_is_opfc_shared_and_in_use(
+ dev->port[i].cnts.opfcs, j, &in_use_opfc))
+ goto skip;
+
+ mlx5_ib_fs_remove_op_fc(dev,
+ &dev->port[i].cnts.opfcs[j], j);
mlx5_fc_destroy(dev->mdev,
dev->port[i].cnts.opfcs[j].fc);
+skip:
dev->port[i].cnts.opfcs[j].fc = NULL;
}
}
@@ -975,8 +1129,8 @@ static int mlx5_ib_modify_stat(struct ib_device *device, u32 port,
unsigned int index, bool enable)
{
struct mlx5_ib_dev *dev = to_mdev(device);
+ struct mlx5_ib_op_fc *opfc, *in_use_opfc;
struct mlx5_ib_counters *cnts;
- struct mlx5_ib_op_fc *opfc;
u32 num_hw_counters, type;
int ret;
@@ -1000,6 +1154,13 @@ static int mlx5_ib_modify_stat(struct ib_device *device, u32 port,
if (opfc->fc)
return -EEXIST;
+ if (mlx5r_is_opfc_shared_and_in_use(cnts->opfcs, type,
+ &in_use_opfc)) {
+ opfc->fc = in_use_opfc->fc;
+ opfc->rule[0] = in_use_opfc->rule[0];
+ return 0;
+ }
+
opfc->fc = mlx5_fc_create(dev->mdev, false);
if (IS_ERR(opfc->fc))
return PTR_ERR(opfc->fc);
@@ -1015,12 +1176,23 @@ static int mlx5_ib_modify_stat(struct ib_device *device, u32 port,
if (!opfc->fc)
return -EINVAL;
+ if (mlx5r_is_opfc_shared_and_in_use(cnts->opfcs, type, &in_use_opfc))
+ goto out;
+
mlx5_ib_fs_remove_op_fc(dev, opfc, type);
mlx5_fc_destroy(dev->mdev, opfc->fc);
+out:
opfc->fc = NULL;
return 0;
}
+static void mlx5_ib_counter_init(struct rdma_counter *counter)
+{
+ struct mlx5_rdma_counter *mcounter = to_mcounter(counter);
+
+ xa_init(&mcounter->qpn_opfc_xa);
+}
+
static const struct ib_device_ops hw_stats_ops = {
.alloc_hw_port_stats = mlx5_ib_alloc_hw_port_stats,
.get_hw_stats = mlx5_ib_get_hw_stats,
@@ -1029,8 +1201,10 @@ static const struct ib_device_ops hw_stats_ops = {
.counter_dealloc = mlx5_ib_counter_dealloc,
.counter_alloc_stats = mlx5_ib_counter_alloc_stats,
.counter_update_stats = mlx5_ib_counter_update_stats,
- .modify_hw_stat = IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) ?
- mlx5_ib_modify_stat : NULL,
+ .modify_hw_stat = mlx5_ib_modify_stat,
+ .counter_init = mlx5_ib_counter_init,
+
+ INIT_RDMA_OBJ_SIZE(rdma_counter, mlx5_rdma_counter, rdma_counter),
};
static const struct ib_device_ops hw_switchdev_vport_op = {
@@ -1045,6 +1219,9 @@ static const struct ib_device_ops hw_switchdev_stats_ops = {
.counter_dealloc = mlx5_ib_counter_dealloc,
.counter_alloc_stats = mlx5_ib_counter_alloc_stats,
.counter_update_stats = mlx5_ib_counter_update_stats,
+ .counter_init = mlx5_ib_counter_init,
+
+ INIT_RDMA_OBJ_SIZE(rdma_counter, mlx5_rdma_counter, rdma_counter),
};
static const struct ib_device_ops counters_ops = {
diff --git a/drivers/infiniband/hw/mlx5/counters.h b/drivers/infiniband/hw/mlx5/counters.h
index 6bcaaa52e2b2..bd03cee42014 100644
--- a/drivers/infiniband/hw/mlx5/counters.h
+++ b/drivers/infiniband/hw/mlx5/counters.h
@@ -8,10 +8,25 @@
#include "mlx5_ib.h"
+struct mlx5_rdma_counter {
+ struct rdma_counter rdma_counter;
+
+ struct mlx5_fc *fc[MLX5_IB_OPCOUNTER_MAX];
+ struct xarray qpn_opfc_xa;
+};
+
+static inline struct mlx5_rdma_counter *
+to_mcounter(struct rdma_counter *counter)
+{
+ return container_of(counter, struct mlx5_rdma_counter, rdma_counter);
+}
+
int mlx5_ib_counters_init(struct mlx5_ib_dev *dev);
void mlx5_ib_counters_cleanup(struct mlx5_ib_dev *dev);
void mlx5_ib_counters_clear_description(struct ib_counters *counters);
int mlx5_ib_flow_counters_set_data(struct ib_counters *ibcounters,
struct mlx5_ib_create_flow *ucmd);
u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u32 port_num);
+bool mlx5r_is_opfc_shared_and_in_use(struct mlx5_ib_op_fc *opfcs, u32 type,
+ struct mlx5_ib_op_fc **opfc);
#endif /* _MLX5_IB_COUNTERS_H */
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 9773d2a3d97f..1aa5311b03e9 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -38,6 +38,9 @@
#include "srq.h"
#include "qp.h"
+#define UVERBS_MODULE_NAME mlx5_ib
+#include <rdma/uverbs_named_ioctl.h>
+
static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe)
{
struct ib_cq *ibcq = &to_mibcq(cq)->ibcq;
@@ -487,7 +490,7 @@ repoll:
}
qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff;
- if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) {
+ if (!*cur_qp || (qpn != (*cur_qp)->trans_qp.base.mqp.qpn)) {
/* We do not have to take the QP table lock here,
* because CQs will be locked while QPs are removed
* from the table.
@@ -714,7 +717,8 @@ static int mini_cqe_res_format_to_hw(struct mlx5_ib_dev *dev, u8 format)
static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
struct mlx5_ib_cq *cq, int entries, u32 **cqb,
- int *cqe_size, int *index, int *inlen)
+ int *cqe_size, int *index, int *inlen,
+ struct uverbs_attr_bundle *attrs)
{
struct mlx5_ib_create_cq ucmd = {};
unsigned long page_size;
@@ -788,7 +792,11 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
order_base_2(page_size) - MLX5_ADAPTER_PAGE_SHIFT);
MLX5_SET(cqc, cqc, page_offset, page_offset_quantized);
- if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX) {
+ if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_CREATE_CQ_UAR_INDEX)) {
+ err = uverbs_copy_from(index, attrs, MLX5_IB_ATTR_CREATE_CQ_UAR_INDEX);
+ if (err)
+ goto err_cqb;
+ } else if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX) {
*index = ucmd.uar_page_index;
} else if (context->bfregi.lib_uar_dyn) {
err = -EINVAL;
@@ -942,8 +950,9 @@ static void notify_soft_wc_handler(struct work_struct *work)
}
int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
- struct ib_udata *udata)
+ struct uverbs_attr_bundle *attrs)
{
+ struct ib_udata *udata = &attrs->driver_udata;
struct ib_device *ibdev = ibcq->device;
int entries = attr->cqe;
int vector = attr->comp_vector;
@@ -980,7 +989,7 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
if (udata) {
err = create_cq_user(dev, udata, cq, entries, &cqb, &cqe_size,
- &index, &inlen);
+ &index, &inlen, attrs);
if (err)
return err;
} else {
@@ -1442,3 +1451,17 @@ int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc)
return 0;
}
+
+ADD_UVERBS_ATTRIBUTES_SIMPLE(
+ mlx5_ib_cq_create,
+ UVERBS_OBJECT_CQ,
+ UVERBS_METHOD_CQ_CREATE,
+ UVERBS_ATTR_PTR_IN(
+ MLX5_IB_ATTR_CREATE_CQ_UAR_INDEX,
+ UVERBS_ATTR_TYPE(u32),
+ UA_OPTIONAL));
+
+const struct uapi_definition mlx5_ib_create_cq_defs[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_CQ, &mlx5_ib_cq_create),
+ {},
+};
diff --git a/drivers/infiniband/hw/mlx5/data_direct.c b/drivers/infiniband/hw/mlx5/data_direct.c
new file mode 100644
index 000000000000..b9ba84afaae2
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/data_direct.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include "mlx5_ib.h"
+#include "data_direct.h"
+
+static LIST_HEAD(mlx5_data_direct_dev_list);
+static LIST_HEAD(mlx5_data_direct_reg_list);
+
+/*
+ * This mutex should be held when accessing either of the above lists
+ */
+static DEFINE_MUTEX(mlx5_data_direct_mutex);
+
+struct mlx5_data_direct_registration {
+ struct mlx5_ib_dev *ibdev;
+ char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1];
+ struct list_head list;
+};
+
+static const struct pci_device_id mlx5_data_direct_pci_table[] = {
+ { PCI_VDEVICE(MELLANOX, 0x2100) }, /* ConnectX-8 Data Direct */
+ { 0, }
+};
+
+static int mlx5_data_direct_vpd_get_vuid(struct mlx5_data_direct_dev *dev)
+{
+ struct pci_dev *pdev = dev->pdev;
+ unsigned int vpd_size, kw_len;
+ u8 *vpd_data;
+ int start;
+ int ret;
+
+ vpd_data = pci_vpd_alloc(pdev, &vpd_size);
+ if (IS_ERR(vpd_data)) {
+ pci_err(pdev, "Unable to read VPD, err=%ld\n", PTR_ERR(vpd_data));
+ return PTR_ERR(vpd_data);
+ }
+
+ start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, "VU", &kw_len);
+ if (start < 0) {
+ ret = start;
+ pci_err(pdev, "VU keyword not found, err=%d\n", ret);
+ goto end;
+ }
+
+ dev->vuid = kmemdup_nul(vpd_data + start, kw_len, GFP_KERNEL);
+ ret = dev->vuid ? 0 : -ENOMEM;
+
+end:
+ kfree(vpd_data);
+ return ret;
+}
+
+static void mlx5_data_direct_shutdown(struct pci_dev *pdev)
+{
+ pci_disable_device(pdev);
+}
+
+static int mlx5_data_direct_set_dma_caps(struct pci_dev *pdev)
+{
+ int err;
+
+ err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
+ if (err) {
+ dev_warn(&pdev->dev,
+ "Warning: couldn't set 64-bit PCI DMA mask, err=%d\n", err);
+ err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+ if (err) {
+ dev_err(&pdev->dev, "Can't set PCI DMA mask, err=%d\n", err);
+ return err;
+ }
+ }
+
+ dma_set_max_seg_size(&pdev->dev, SZ_2G);
+ return 0;
+}
+
+int mlx5_data_direct_ib_reg(struct mlx5_ib_dev *ibdev, char *vuid)
+{
+ struct mlx5_data_direct_registration *reg;
+ struct mlx5_data_direct_dev *dev;
+
+ reg = kzalloc(sizeof(*reg), GFP_KERNEL);
+ if (!reg)
+ return -ENOMEM;
+
+ reg->ibdev = ibdev;
+ strcpy(reg->vuid, vuid);
+
+ mutex_lock(&mlx5_data_direct_mutex);
+ list_for_each_entry(dev, &mlx5_data_direct_dev_list, list) {
+ if (strcmp(dev->vuid, vuid) == 0) {
+ mlx5_ib_data_direct_bind(ibdev, dev);
+ break;
+ }
+ }
+
+ /* Add the registration to its global list, to be used upon bind/unbind
+ * of its affiliated data direct device
+ */
+ list_add_tail(&reg->list, &mlx5_data_direct_reg_list);
+ mutex_unlock(&mlx5_data_direct_mutex);
+ return 0;
+}
+
+void mlx5_data_direct_ib_unreg(struct mlx5_ib_dev *ibdev)
+{
+ struct mlx5_data_direct_registration *reg;
+
+ mutex_lock(&mlx5_data_direct_mutex);
+ list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) {
+ if (reg->ibdev == ibdev) {
+ list_del(&reg->list);
+ kfree(reg);
+ goto end;
+ }
+ }
+
+ WARN_ON(true);
+end:
+ mutex_unlock(&mlx5_data_direct_mutex);
+}
+
+static void mlx5_data_direct_dev_reg(struct mlx5_data_direct_dev *dev)
+{
+ struct mlx5_data_direct_registration *reg;
+
+ mutex_lock(&mlx5_data_direct_mutex);
+ list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) {
+ if (strcmp(dev->vuid, reg->vuid) == 0)
+ mlx5_ib_data_direct_bind(reg->ibdev, dev);
+ }
+
+ /* Add the data direct device to the global list, further IB devices may
+ * use it later as well
+ */
+ list_add_tail(&dev->list, &mlx5_data_direct_dev_list);
+ mutex_unlock(&mlx5_data_direct_mutex);
+}
+
+static void mlx5_data_direct_dev_unreg(struct mlx5_data_direct_dev *dev)
+{
+ struct mlx5_data_direct_registration *reg;
+
+ mutex_lock(&mlx5_data_direct_mutex);
+ /* Prevent any further affiliations */
+ list_del(&dev->list);
+ list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) {
+ if (strcmp(dev->vuid, reg->vuid) == 0)
+ mlx5_ib_data_direct_unbind(reg->ibdev);
+ }
+ mutex_unlock(&mlx5_data_direct_mutex);
+}
+
+static int mlx5_data_direct_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ struct mlx5_data_direct_dev *dev;
+ int err;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+
+ dev->device = &pdev->dev;
+ dev->pdev = pdev;
+
+ pci_set_drvdata(dev->pdev, dev);
+ err = pci_enable_device(pdev);
+ if (err) {
+ dev_err(dev->device, "Cannot enable PCI device, err=%d\n", err);
+ goto err;
+ }
+
+ pci_set_master(pdev);
+ err = mlx5_data_direct_set_dma_caps(pdev);
+ if (err)
+ goto err_disable;
+
+ if (pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP32) &&
+ pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP64) &&
+ pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP128))
+ dev_dbg(dev->device, "Enabling pci atomics failed\n");
+
+ err = mlx5_data_direct_vpd_get_vuid(dev);
+ if (err)
+ goto err_disable;
+
+ mlx5_data_direct_dev_reg(dev);
+ return 0;
+
+err_disable:
+ pci_disable_device(pdev);
+err:
+ kfree(dev);
+ return err;
+}
+
+static void mlx5_data_direct_remove(struct pci_dev *pdev)
+{
+ struct mlx5_data_direct_dev *dev = pci_get_drvdata(pdev);
+
+ mlx5_data_direct_dev_unreg(dev);
+ pci_disable_device(pdev);
+ kfree(dev->vuid);
+ kfree(dev);
+}
+
+static struct pci_driver mlx5_data_direct_driver = {
+ .name = KBUILD_MODNAME,
+ .id_table = mlx5_data_direct_pci_table,
+ .probe = mlx5_data_direct_probe,
+ .remove = mlx5_data_direct_remove,
+ .shutdown = mlx5_data_direct_shutdown,
+};
+
+int mlx5_data_direct_driver_register(void)
+{
+ return pci_register_driver(&mlx5_data_direct_driver);
+}
+
+void mlx5_data_direct_driver_unregister(void)
+{
+ pci_unregister_driver(&mlx5_data_direct_driver);
+}
diff --git a/drivers/infiniband/hw/mlx5/data_direct.h b/drivers/infiniband/hw/mlx5/data_direct.h
new file mode 100644
index 000000000000..2fd2bdbe8f69
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/data_direct.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#ifndef _MLX5_IB_DATA_DIRECT_H
+#define _MLX5_IB_DATA_DIRECT_H
+
+struct mlx5_ib_dev;
+
+struct mlx5_data_direct_dev {
+ struct device *device;
+ struct pci_dev *pdev;
+ char *vuid;
+ struct list_head list;
+};
+
+int mlx5_data_direct_ib_reg(struct mlx5_ib_dev *ibdev, char *vuid);
+void mlx5_data_direct_ib_unreg(struct mlx5_ib_dev *ibdev);
+int mlx5_data_direct_driver_register(void);
+void mlx5_data_direct_driver_unregister(void);
+
+#endif
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 869369cb5b5f..2479da8620ca 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -13,6 +13,7 @@
#include <rdma/uverbs_std_types.h>
#include <linux/mlx5/driver.h>
#include <linux/mlx5/fs.h>
+#include <rdma/ib_ucaps.h>
#include "mlx5_ib.h"
#include "devx.h"
#include "qp.h"
@@ -27,6 +28,19 @@ enum devx_obj_flags {
DEVX_OBJ_FLAGS_INDIRECT_MKEY = 1 << 0,
DEVX_OBJ_FLAGS_DCT = 1 << 1,
DEVX_OBJ_FLAGS_CQ = 1 << 2,
+ DEVX_OBJ_FLAGS_HW_FREED = 1 << 3,
+};
+
+#define MAX_ASYNC_CMDS 8
+
+struct mlx5_async_cmd {
+ struct ib_uobject *uobject;
+ void *in;
+ int in_size;
+ u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)];
+ int err;
+ struct mlx5_async_work cb_work;
+ struct completion comp;
};
struct devx_async_data {
@@ -109,7 +123,27 @@ devx_ufile2uctx(const struct uverbs_attr_bundle *attrs)
return to_mucontext(ib_uverbs_get_ucontext(attrs));
}
-int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user)
+static int set_uctx_ucaps(struct mlx5_ib_dev *dev, u64 req_ucaps, u32 *cap)
+{
+ if (UCAP_ENABLED(req_ucaps, RDMA_UCAP_MLX5_CTRL_LOCAL)) {
+ if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL)
+ *cap |= MLX5_UCTX_CAP_RDMA_CTRL;
+ else
+ return -EOPNOTSUPP;
+ }
+
+ if (UCAP_ENABLED(req_ucaps, RDMA_UCAP_MLX5_CTRL_OTHER_VHCA)) {
+ if (MLX5_CAP_GEN(dev->mdev, uctx_cap) &
+ MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA)
+ *cap |= MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA;
+ else
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user, u64 req_ucaps)
{
u32 in[MLX5_ST_SZ_DW(create_uctx_in)] = {};
u32 out[MLX5_ST_SZ_DW(create_uctx_out)] = {};
@@ -123,14 +157,22 @@ int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user)
return -EINVAL;
uctx = MLX5_ADDR_OF(create_uctx_in, in, uctx);
- if (is_user && capable(CAP_NET_RAW) &&
- (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RAW_TX))
+ if (is_user &&
+ (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RAW_TX) &&
+ capable(CAP_NET_RAW))
cap |= MLX5_UCTX_CAP_RAW_TX;
- if (is_user && capable(CAP_SYS_RAWIO) &&
+ if (is_user &&
(MLX5_CAP_GEN(dev->mdev, uctx_cap) &
- MLX5_UCTX_CAP_INTERNAL_DEV_RES))
+ MLX5_UCTX_CAP_INTERNAL_DEV_RES) &&
+ capable(CAP_SYS_RAWIO))
cap |= MLX5_UCTX_CAP_INTERNAL_DEV_RES;
+ if (req_ucaps) {
+ err = set_uctx_ucaps(dev, req_ucaps, &cap);
+ if (err)
+ return err;
+ }
+
MLX5_SET(create_uctx_in, in, opcode, MLX5_CMD_OP_CREATE_UCTX);
MLX5_SET(uctx, uctx, cap, cap);
@@ -1405,7 +1447,9 @@ static int devx_obj_cleanup(struct ib_uobject *uobject,
*/
mlx5r_deref_wait_odp_mkey(&obj->mkey);
- if (obj->flags & DEVX_OBJ_FLAGS_DCT)
+ if (obj->flags & DEVX_OBJ_FLAGS_HW_FREED)
+ ret = 0;
+ else if (obj->flags & DEVX_OBJ_FLAGS_DCT)
ret = mlx5_core_destroy_dct(obj->ib_dev, &obj->core_dct);
else if (obj->flags & DEVX_OBJ_FLAGS_CQ)
ret = mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq);
@@ -2558,7 +2602,7 @@ int mlx5_ib_devx_init(struct mlx5_ib_dev *dev)
struct mlx5_devx_event_table *table = &dev->devx_event_table;
int uid;
- uid = mlx5_ib_devx_create(dev, false);
+ uid = mlx5_ib_devx_create(dev, false, 0);
if (uid > 0) {
dev->devx_whitelist_uid = uid;
xa_init(&table->event_xa);
@@ -2595,6 +2639,82 @@ void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev)
}
}
+static void devx_async_destroy_cb(int status, struct mlx5_async_work *context)
+{
+ struct mlx5_async_cmd *devx_out = container_of(context,
+ struct mlx5_async_cmd, cb_work);
+ struct devx_obj *obj = devx_out->uobject->object;
+
+ if (!status)
+ obj->flags |= DEVX_OBJ_FLAGS_HW_FREED;
+
+ complete(&devx_out->comp);
+}
+
+static void devx_async_destroy(struct mlx5_ib_dev *dev,
+ struct mlx5_async_cmd *cmd)
+{
+ init_completion(&cmd->comp);
+ cmd->err = mlx5_cmd_exec_cb(&dev->async_ctx, cmd->in, cmd->in_size,
+ &cmd->out, sizeof(cmd->out),
+ devx_async_destroy_cb, &cmd->cb_work);
+}
+
+static void devx_wait_async_destroy(struct mlx5_async_cmd *cmd)
+{
+ if (!cmd->err)
+ wait_for_completion(&cmd->comp);
+ atomic_set(&cmd->uobject->usecnt, 0);
+}
+
+void mlx5_ib_ufile_hw_cleanup(struct ib_uverbs_file *ufile)
+{
+ struct mlx5_async_cmd async_cmd[MAX_ASYNC_CMDS];
+ struct ib_ucontext *ucontext = ufile->ucontext;
+ struct ib_device *device = ucontext->device;
+ struct mlx5_ib_dev *dev = to_mdev(device);
+ struct ib_uobject *uobject;
+ struct devx_obj *obj;
+ int head = 0;
+ int tail = 0;
+
+ list_for_each_entry(uobject, &ufile->uobjects, list) {
+ WARN_ON(uverbs_try_lock_object(uobject, UVERBS_LOOKUP_WRITE));
+
+ /*
+ * Currently we only support QP destruction. If other object
+ * types are to be destroyed, type synchronization must be added
+ * to the cleanup algorithm, along with pre/post FW cleanup
+ * handling for the new types if needed.
+ */
+ if (uobj_get_object_id(uobject) != MLX5_IB_OBJECT_DEVX_OBJ ||
+ (get_dec_obj_type(uobject->object, MLX5_EVENT_TYPE_MAX) !=
+ MLX5_OBJ_TYPE_QP)) {
+ atomic_set(&uobject->usecnt, 0);
+ continue;
+ }
+
+ obj = uobject->object;
+
+ async_cmd[tail % MAX_ASYNC_CMDS].in = obj->dinbox;
+ async_cmd[tail % MAX_ASYNC_CMDS].in_size = obj->dinlen;
+ async_cmd[tail % MAX_ASYNC_CMDS].uobject = uobject;
+
+ devx_async_destroy(dev, &async_cmd[tail % MAX_ASYNC_CMDS]);
+ tail++;
+
+ if (tail - head == MAX_ASYNC_CMDS) {
+ devx_wait_async_destroy(&async_cmd[head % MAX_ASYNC_CMDS]);
+ head++;
+ }
+ }
+
+ while (head != tail) {
+ devx_wait_async_destroy(&async_cmd[head % MAX_ASYNC_CMDS]);
+ head++;
+ }
+}
+
static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf,
size_t count, loff_t *pos)
{
@@ -2673,7 +2793,6 @@ static const struct file_operations devx_async_cmd_event_fops = {
.read = devx_async_cmd_event_read,
.poll = devx_async_cmd_event_poll,
.release = uverbs_uobject_fd_release,
- .llseek = no_llseek,
};
static ssize_t devx_async_event_read(struct file *filp, char __user *buf,
@@ -2788,7 +2907,6 @@ static const struct file_operations devx_async_event_fops = {
.read = devx_async_event_read,
.poll = devx_async_event_poll,
.release = uverbs_uobject_fd_release,
- .llseek = no_llseek,
};
static void devx_async_cmd_event_destroy_uobj(struct ib_uobject *uobj,
@@ -2949,7 +3067,7 @@ DECLARE_UVERBS_NAMED_METHOD(
MLX5_IB_METHOD_DEVX_OBJ_MODIFY,
UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE,
UVERBS_IDR_ANY_OBJECT,
- UVERBS_ACCESS_WRITE,
+ UVERBS_ACCESS_READ,
UA_MANDATORY),
UVERBS_ATTR_PTR_IN(
MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN,
diff --git a/drivers/infiniband/hw/mlx5/devx.h b/drivers/infiniband/hw/mlx5/devx.h
index ee2213275fd6..ee9e7d3af93f 100644
--- a/drivers/infiniband/hw/mlx5/devx.h
+++ b/drivers/infiniband/hw/mlx5/devx.h
@@ -24,12 +24,14 @@ struct devx_obj {
struct list_head event_sub; /* holds devx_event_subscription entries */
};
#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
-int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user);
+int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user, u64 req_ucaps);
void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid);
int mlx5_ib_devx_init(struct mlx5_ib_dev *dev);
void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev);
+void mlx5_ib_ufile_hw_cleanup(struct ib_uverbs_file *ufile);
#else
-static inline int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user)
+static inline int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user,
+ u64 req_ucaps)
{
return -EOPNOTSUPP;
}
@@ -41,5 +43,8 @@ static inline int mlx5_ib_devx_init(struct mlx5_ib_dev *dev)
static inline void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev)
{
}
+static inline void mlx5_ib_ufile_hw_cleanup(struct ib_uverbs_file *ufile)
+{
+}
#endif
#endif /* _MLX5_IB_DEVX_H */
diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c
index 520034acf73a..680627f1de33 100644
--- a/drivers/infiniband/hw/mlx5/fs.c
+++ b/drivers/infiniband/hw/mlx5/fs.c
@@ -12,6 +12,7 @@
#include <rdma/mlx5_user_ioctl_verbs.h>
#include <rdma/ib_hdrs.h>
#include <rdma/ib_umem.h>
+#include <rdma/ib_ucaps.h>
#include <linux/mlx5/driver.h>
#include <linux/mlx5/fs.h>
#include <linux/mlx5/fs_helpers.h>
@@ -32,6 +33,11 @@ enum {
MATCH_CRITERIA_ENABLE_MISC2_BIT
};
+
+struct mlx5_per_qp_opfc {
+ struct mlx5_ib_op_fc opfcs[MLX5_IB_OPCOUNTER_MAX];
+};
+
#define HEADER_IS_ZERO(match_criteria, headers) \
!(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \
0, MLX5_FLD_SZ_BYTES(fte_match_param, headers))) \
@@ -678,7 +684,7 @@ enum flow_table_type {
#define MLX5_FS_MAX_TYPES 6
#define MLX5_FS_MAX_ENTRIES BIT(16)
-static bool mlx5_ib_shared_ft_allowed(struct ib_device *device)
+static bool __maybe_unused mlx5_ib_shared_ft_allowed(struct ib_device *device)
{
struct mlx5_ib_dev *dev = to_mdev(device);
@@ -690,7 +696,7 @@ static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev,
struct mlx5_ib_flow_prio *prio,
int priority,
int num_entries, int num_groups,
- u32 flags)
+ u32 flags, u16 vport)
{
struct mlx5_flow_table_attr ft_attr = {};
struct mlx5_flow_table *ft;
@@ -698,6 +704,7 @@ static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev,
ft_attr.prio = priority;
ft_attr.max_fte = num_entries;
ft_attr.flags = flags;
+ ft_attr.vport = vport;
ft_attr.autogroup.max_num_groups = num_groups;
ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
if (IS_ERR(ft))
@@ -792,18 +799,25 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
ft = prio->flow_table;
if (!ft)
return _get_prio(dev, ns, prio, priority, max_table_size,
- num_groups, flags);
+ num_groups, flags, 0);
return prio;
}
enum {
+ RDMA_RX_ECN_OPCOUNTER_PER_QP_PRIO,
+ RDMA_RX_CNP_OPCOUNTER_PER_QP_PRIO,
+ RDMA_RX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO,
RDMA_RX_ECN_OPCOUNTER_PRIO,
RDMA_RX_CNP_OPCOUNTER_PRIO,
+ RDMA_RX_PKTS_BYTES_OPCOUNTER_PRIO,
};
enum {
+ RDMA_TX_CNP_OPCOUNTER_PER_QP_PRIO,
+ RDMA_TX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO,
RDMA_TX_CNP_OPCOUNTER_PRIO,
+ RDMA_TX_PKTS_BYTES_OPCOUNTER_PRIO,
};
static int set_vhca_port_spec(struct mlx5_ib_dev *dev, u32 port_num,
@@ -867,6 +881,344 @@ static int set_cnp_spec(struct mlx5_ib_dev *dev, u32 port_num,
return 0;
}
+/* Returns the prio we should use for the given optional counter type;
+ * for the bytes types we use the corresponding packets type's prio, since
+ * they share the same resources.
+ */
+static struct mlx5_ib_flow_prio *get_opfc_prio(struct mlx5_ib_dev *dev,
+ u32 type)
+{
+ u32 prio_type;
+
+ switch (type) {
+ case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES:
+ prio_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES:
+ prio_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP:
+ prio_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP:
+ prio_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP;
+ break;
+ default:
+ prio_type = type;
+ }
+
+ return &dev->flow_db->opfcs[prio_type];
+}
+
+static void put_per_qp_prio(struct mlx5_ib_dev *dev,
+ enum mlx5_ib_optional_counter_type type)
+{
+ enum mlx5_ib_optional_counter_type per_qp_type;
+ struct mlx5_ib_flow_prio *prio;
+
+ switch (type) {
+ case MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS:
+ per_qp_type = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP;
+ break;
+ case MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS:
+ per_qp_type = MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS_PER_QP;
+ break;
+ case MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS:
+ per_qp_type = MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS_PER_QP;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS:
+ per_qp_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES:
+ per_qp_type = MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS:
+ per_qp_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES:
+ per_qp_type = MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP;
+ break;
+ default:
+ return;
+ }
+
+ prio = get_opfc_prio(dev, per_qp_type);
+ put_flow_table(dev, prio, true);
+}
+
+static int get_per_qp_prio(struct mlx5_ib_dev *dev,
+ enum mlx5_ib_optional_counter_type type)
+{
+ enum mlx5_ib_optional_counter_type per_qp_type;
+ enum mlx5_flow_namespace_type fn_type;
+ struct mlx5_flow_namespace *ns;
+ struct mlx5_ib_flow_prio *prio;
+ int priority;
+
+ switch (type) {
+ case MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS:
+ fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS;
+ priority = RDMA_RX_ECN_OPCOUNTER_PER_QP_PRIO;
+ per_qp_type = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP;
+ break;
+ case MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS:
+ fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS;
+ priority = RDMA_RX_CNP_OPCOUNTER_PER_QP_PRIO;
+ per_qp_type = MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS_PER_QP;
+ break;
+ case MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS:
+ fn_type = MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS;
+ priority = RDMA_TX_CNP_OPCOUNTER_PER_QP_PRIO;
+ per_qp_type = MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS_PER_QP;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS:
+ fn_type = MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS;
+ priority = RDMA_TX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO;
+ per_qp_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES:
+ fn_type = MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS;
+ priority = RDMA_TX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO;
+ per_qp_type = MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS:
+ fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS;
+ priority = RDMA_RX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO;
+ per_qp_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES:
+ fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS;
+ priority = RDMA_RX_PKTS_BYTES_OPCOUNTER_PER_QP_PRIO;
+ per_qp_type = MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ ns = mlx5_get_flow_namespace(dev->mdev, fn_type);
+ if (!ns)
+ return -EOPNOTSUPP;
+
+ prio = get_opfc_prio(dev, per_qp_type);
+ if (prio->flow_table)
+ return 0;
+
+ prio = _get_prio(dev, ns, prio, priority, MLX5_FS_MAX_POOL_SIZE, 1, 0, 0);
+ if (IS_ERR(prio))
+ return PTR_ERR(prio);
+
+ prio->refcount = 1;
+
+ return 0;
+}
+
+static struct mlx5_per_qp_opfc *
+get_per_qp_opfc(struct mlx5_rdma_counter *mcounter, u32 qp_num, bool *new)
+{
+ struct mlx5_per_qp_opfc *per_qp_opfc;
+
+ *new = false;
+
+ per_qp_opfc = xa_load(&mcounter->qpn_opfc_xa, qp_num);
+ if (per_qp_opfc)
+ return per_qp_opfc;
+ per_qp_opfc = kzalloc(sizeof(*per_qp_opfc), GFP_KERNEL);
+
+ if (!per_qp_opfc)
+ return NULL;
+
+ *new = true;
+ return per_qp_opfc;
+}
+
+static int add_op_fc_rules(struct mlx5_ib_dev *dev,
+ struct mlx5_rdma_counter *mcounter,
+ struct mlx5_per_qp_opfc *per_qp_opfc,
+ struct mlx5_ib_flow_prio *prio,
+ enum mlx5_ib_optional_counter_type type,
+ u32 qp_num, u32 port_num)
+{
+ struct mlx5_ib_op_fc *opfc = &per_qp_opfc->opfcs[type], *in_use_opfc;
+ struct mlx5_flow_act flow_act = {};
+ struct mlx5_flow_destination dst;
+ struct mlx5_flow_spec *spec;
+ int i, err, spec_num;
+ bool is_tx;
+
+ if (opfc->fc)
+ return -EEXIST;
+
+ if (mlx5r_is_opfc_shared_and_in_use(per_qp_opfc->opfcs, type,
+ &in_use_opfc)) {
+ opfc->fc = in_use_opfc->fc;
+ opfc->rule[0] = in_use_opfc->rule[0];
+ return 0;
+ }
+
+ opfc->fc = mcounter->fc[type];
+
+ spec = kcalloc(MAX_OPFC_RULES, sizeof(*spec), GFP_KERNEL);
+ if (!spec) {
+ err = -ENOMEM;
+ goto null_fc;
+ }
+
+ switch (type) {
+ case MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP:
+ if (set_ecn_ce_spec(dev, port_num, &spec[0],
+ MLX5_FS_IPV4_VERSION) ||
+ set_ecn_ce_spec(dev, port_num, &spec[1],
+ MLX5_FS_IPV6_VERSION)) {
+ err = -EOPNOTSUPP;
+ goto free_spec;
+ }
+ spec_num = 2;
+ is_tx = false;
+
+ MLX5_SET_TO_ONES(fte_match_param, spec[1].match_criteria,
+ misc_parameters.bth_dst_qp);
+ MLX5_SET(fte_match_param, spec[1].match_value,
+ misc_parameters.bth_dst_qp, qp_num);
+ spec[1].match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS;
+ break;
+ case MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS_PER_QP:
+ if (!MLX5_CAP_FLOWTABLE(
+ dev->mdev,
+ ft_field_support_2_nic_receive_rdma.bth_opcode) ||
+ set_cnp_spec(dev, port_num, &spec[0])) {
+ err = -EOPNOTSUPP;
+ goto free_spec;
+ }
+ spec_num = 1;
+ is_tx = false;
+ break;
+ case MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS_PER_QP:
+ if (!MLX5_CAP_FLOWTABLE(
+ dev->mdev,
+ ft_field_support_2_nic_transmit_rdma.bth_opcode) ||
+ set_cnp_spec(dev, port_num, &spec[0])) {
+ err = -EOPNOTSUPP;
+ goto free_spec;
+ }
+ spec_num = 1;
+ is_tx = true;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP:
+ case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP:
+ spec_num = 1;
+ is_tx = true;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP:
+ case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP:
+ spec_num = 1;
+ is_tx = false;
+ break;
+ default:
+ err = -EINVAL;
+ goto free_spec;
+ }
+
+ if (is_tx) {
+ MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
+ misc_parameters.source_sqn);
+ MLX5_SET(fte_match_param, spec->match_value,
+ misc_parameters.source_sqn, qp_num);
+ } else {
+ MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
+ misc_parameters.bth_dst_qp);
+ MLX5_SET(fte_match_param, spec->match_value,
+ misc_parameters.bth_dst_qp, qp_num);
+ }
+
+ spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS;
+
+ dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+ dst.counter = opfc->fc;
+
+ flow_act.action =
+ MLX5_FLOW_CONTEXT_ACTION_COUNT | MLX5_FLOW_CONTEXT_ACTION_ALLOW;
+
+ for (i = 0; i < spec_num; i++) {
+ opfc->rule[i] = mlx5_add_flow_rules(prio->flow_table, &spec[i],
+ &flow_act, &dst, 1);
+ if (IS_ERR(opfc->rule[i])) {
+ err = PTR_ERR(opfc->rule[i]);
+ goto del_rules;
+ }
+ }
+ prio->refcount += spec_num;
+
+ err = xa_err(xa_store(&mcounter->qpn_opfc_xa, qp_num, per_qp_opfc,
+ GFP_KERNEL));
+ if (err)
+ goto del_rules;
+
+ kfree(spec);
+
+ return 0;
+
+del_rules:
+ while (i--)
+ mlx5_del_flow_rules(opfc->rule[i]);
+ put_flow_table(dev, prio, false);
+free_spec:
+ kfree(spec);
+null_fc:
+ opfc->fc = NULL;
+ return err;
+}
+
+static bool is_fc_shared_and_in_use(struct mlx5_rdma_counter *mcounter,
+ u32 type, struct mlx5_fc **fc)
+{
+ u32 shared_fc_type;
+
+ switch (type) {
+ case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP:
+ shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP:
+ shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP:
+ shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP;
+ break;
+ case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP:
+ shared_fc_type = MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP;
+ break;
+ default:
+ return false;
+ }
+
+ *fc = mcounter->fc[shared_fc_type];
+ if (!(*fc))
+ return false;
+
+ return true;
+}
+
+void mlx5r_fs_destroy_fcs(struct mlx5_ib_dev *dev,
+ struct rdma_counter *counter)
+{
+ struct mlx5_rdma_counter *mcounter = to_mcounter(counter);
+ struct mlx5_fc *in_use_fc;
+ int i;
+
+ for (i = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP;
+ i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; i++) {
+ if (!mcounter->fc[i])
+ continue;
+
+ if (is_fc_shared_and_in_use(mcounter, i, &in_use_fc)) {
+ mcounter->fc[i] = NULL;
+ continue;
+ }
+
+ mlx5_fc_destroy(dev->mdev, mcounter->fc[i]);
+ mcounter->fc[i] = NULL;
+ }
+}
+
int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num,
struct mlx5_ib_op_fc *opfc,
enum mlx5_ib_optional_counter_type type)
@@ -921,6 +1273,20 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num,
priority = RDMA_TX_CNP_OPCOUNTER_PRIO;
break;
+ case MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS:
+ case MLX5_IB_OPCOUNTER_RDMA_TX_BYTES:
+ spec_num = 1;
+ fn_type = MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS;
+ priority = RDMA_TX_PKTS_BYTES_OPCOUNTER_PRIO;
+ break;
+
+ case MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS:
+ case MLX5_IB_OPCOUNTER_RDMA_RX_BYTES:
+ spec_num = 1;
+ fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS;
+ priority = RDMA_RX_PKTS_BYTES_OPCOUNTER_PRIO;
+ break;
+
default:
err = -EOPNOTSUPP;
goto free;
@@ -932,18 +1298,22 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num,
goto free;
}
- prio = &dev->flow_db->opfcs[type];
+ prio = get_opfc_prio(dev, type);
if (!prio->flow_table) {
+ err = get_per_qp_prio(dev, type);
+ if (err)
+ goto free;
+
prio = _get_prio(dev, ns, prio, priority,
- dev->num_ports * MAX_OPFC_RULES, 1, 0);
+ dev->num_ports * MAX_OPFC_RULES, 1, 0, 0);
if (IS_ERR(prio)) {
err = PTR_ERR(prio);
- goto free;
+ goto put_prio;
}
}
dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
- dst.counter_id = mlx5_fc_id(opfc->fc);
+ dst.counter = opfc->fc;
flow_act.action =
MLX5_FLOW_CONTEXT_ACTION_COUNT | MLX5_FLOW_CONTEXT_ACTION_ALLOW;
@@ -965,6 +1335,8 @@ del_rules:
for (i -= 1; i >= 0; i--)
mlx5_del_flow_rules(opfc->rule[i]);
put_flow_table(dev, prio, false);
+put_prio:
+ put_per_qp_prio(dev, type);
free:
kfree(spec);
return err;
@@ -974,12 +1346,115 @@ void mlx5_ib_fs_remove_op_fc(struct mlx5_ib_dev *dev,
struct mlx5_ib_op_fc *opfc,
enum mlx5_ib_optional_counter_type type)
{
+ struct mlx5_ib_flow_prio *prio;
int i;
+ prio = get_opfc_prio(dev, type);
+
for (i = 0; i < MAX_OPFC_RULES && opfc->rule[i]; i++) {
mlx5_del_flow_rules(opfc->rule[i]);
- put_flow_table(dev, &dev->flow_db->opfcs[type], true);
+ put_flow_table(dev, prio, true);
+ }
+
+ put_per_qp_prio(dev, type);
+}
+
+void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct rdma_counter *counter)
+{
+ struct mlx5_rdma_counter *mcounter = to_mcounter(counter);
+ struct mlx5_ib_dev *dev = to_mdev(counter->device);
+ struct mlx5_per_qp_opfc *per_qp_opfc;
+ struct mlx5_ib_op_fc *in_use_opfc;
+ struct mlx5_ib_flow_prio *prio;
+ int i, j;
+
+ per_qp_opfc = xa_load(&mcounter->qpn_opfc_xa, qp->qp_num);
+ if (!per_qp_opfc)
+ return;
+
+ for (i = MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP;
+ i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP; i++) {
+ if (!per_qp_opfc->opfcs[i].fc)
+ continue;
+
+ if (mlx5r_is_opfc_shared_and_in_use(per_qp_opfc->opfcs, i,
+ &in_use_opfc)) {
+ per_qp_opfc->opfcs[i].fc = NULL;
+ continue;
+ }
+
+ for (j = 0; j < MAX_OPFC_RULES; j++) {
+ if (!per_qp_opfc->opfcs[i].rule[j])
+ continue;
+ mlx5_del_flow_rules(per_qp_opfc->opfcs[i].rule[j]);
+ prio = get_opfc_prio(dev, i);
+ put_flow_table(dev, prio, true);
+ }
+ per_qp_opfc->opfcs[i].fc = NULL;
+ }
+
+ kfree(per_qp_opfc);
+ xa_erase(&mcounter->qpn_opfc_xa, qp->qp_num);
+}
+
+int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter,
+ u32 port)
+{
+ struct mlx5_rdma_counter *mcounter = to_mcounter(counter);
+ struct mlx5_ib_dev *dev = to_mdev(qp->device);
+ struct mlx5_per_qp_opfc *per_qp_opfc;
+ struct mlx5_ib_flow_prio *prio;
+ struct mlx5_ib_counters *cnts;
+ struct mlx5_ib_op_fc *opfc;
+ struct mlx5_fc *in_use_fc;
+ int i, err, per_qp_type;
+ bool new;
+
+ if (!counter->mode.bind_opcnt)
+ return 0;
+
+ cnts = &dev->port[port - 1].cnts;
+
+ for (i = 0; i <= MLX5_IB_OPCOUNTER_RDMA_RX_BYTES; i++) {
+ opfc = &cnts->opfcs[i];
+ if (!opfc->fc)
+ continue;
+
+ per_qp_type = i + MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP;
+ prio = get_opfc_prio(dev, per_qp_type);
+ WARN_ON(!prio->flow_table);
+
+ if (is_fc_shared_and_in_use(mcounter, per_qp_type, &in_use_fc))
+ mcounter->fc[per_qp_type] = in_use_fc;
+
+ if (!mcounter->fc[per_qp_type]) {
+ mcounter->fc[per_qp_type] = mlx5_fc_create(dev->mdev,
+ false);
+ if (IS_ERR(mcounter->fc[per_qp_type]))
+ return PTR_ERR(mcounter->fc[per_qp_type]);
+ }
+
+ per_qp_opfc = get_per_qp_opfc(mcounter, qp->qp_num, &new);
+ if (!per_qp_opfc) {
+ err = -ENOMEM;
+ goto free_fc;
+ }
+ err = add_op_fc_rules(dev, mcounter, per_qp_opfc, prio,
+ per_qp_type, qp->qp_num, port);
+ if (err)
+ goto del_rules;
}
+
+ return 0;
+
+del_rules:
+ mlx5r_fs_unbind_op_fc(qp, counter);
+ if (new)
+ kfree(per_qp_opfc);
+free_fc:
+ if (xa_empty(&mcounter->qpn_opfc_xa))
+ mlx5r_fs_destroy_fcs(dev, counter);
+ return err;
}
static void set_underlay_qp(struct mlx5_ib_dev *dev,
@@ -1113,8 +1588,8 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
handler->ibcounters = flow_act.counters;
dest_arr[dest_num].type =
MLX5_FLOW_DESTINATION_TYPE_COUNTER;
- dest_arr[dest_num].counter_id =
- mlx5_fc_id(mcounters->hw_cntrs_hndl);
+ dest_arr[dest_num].counter =
+ mcounters->hw_cntrs_hndl;
dest_num++;
}
@@ -1170,11 +1645,6 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0, NULL);
}
-enum {
- LEFTOVERS_MC,
- LEFTOVERS_UC,
-};
-
static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev,
struct mlx5_ib_flow_prio *ft_prio,
struct ib_flow_attr *flow_attr,
@@ -1184,43 +1654,32 @@ static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *de
struct mlx5_ib_flow_handler *handler = NULL;
static struct {
- struct ib_flow_attr flow_attr;
struct ib_flow_spec_eth eth_flow;
- } leftovers_specs[] = {
- [LEFTOVERS_MC] = {
- .flow_attr = {
- .num_of_specs = 1,
- .size = sizeof(leftovers_specs[0])
- },
- .eth_flow = {
- .type = IB_FLOW_SPEC_ETH,
- .size = sizeof(struct ib_flow_spec_eth),
- .mask = {.dst_mac = {0x1} },
- .val = {.dst_mac = {0x1} }
- }
- },
- [LEFTOVERS_UC] = {
- .flow_attr = {
- .num_of_specs = 1,
- .size = sizeof(leftovers_specs[0])
- },
- .eth_flow = {
- .type = IB_FLOW_SPEC_ETH,
- .size = sizeof(struct ib_flow_spec_eth),
- .mask = {.dst_mac = {0x1} },
- .val = {.dst_mac = {} }
- }
- }
- };
+ struct ib_flow_attr flow_attr;
+ } leftovers_wc = { .flow_attr = { .num_of_specs = 1,
+ .size = sizeof(leftovers_wc) },
+ .eth_flow = {
+ .type = IB_FLOW_SPEC_ETH,
+ .size = sizeof(struct ib_flow_spec_eth),
+ .mask = { .dst_mac = { 0x1 } },
+ .val = { .dst_mac = { 0x1 } } } };
- handler = create_flow_rule(dev, ft_prio,
- &leftovers_specs[LEFTOVERS_MC].flow_attr,
- dst);
+ static struct {
+ struct ib_flow_spec_eth eth_flow;
+ struct ib_flow_attr flow_attr;
+ } leftovers_uc = { .flow_attr = { .num_of_specs = 1,
+ .size = sizeof(leftovers_uc) },
+ .eth_flow = {
+ .type = IB_FLOW_SPEC_ETH,
+ .size = sizeof(struct ib_flow_spec_eth),
+ .mask = { .dst_mac = { 0x1 } },
+ .val = { .dst_mac = {} } } };
+
+ handler = create_flow_rule(dev, ft_prio, &leftovers_wc.flow_attr, dst);
if (!IS_ERR(handler) &&
flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) {
handler_ucast = create_flow_rule(dev, ft_prio,
- &leftovers_specs[LEFTOVERS_UC].flow_attr,
- dst);
+ &leftovers_uc.flow_attr, dst);
if (IS_ERR(handler_ucast)) {
mlx5_del_flow_rules(handler->rule);
ft_prio->refcount--;
@@ -1413,17 +1872,51 @@ free_ucmd:
return ERR_PTR(err);
}
+static int mlx5_ib_fill_transport_ns_info(struct mlx5_ib_dev *dev,
+ enum mlx5_flow_namespace_type type,
+ u32 *flags, u16 *vport_idx,
+ u16 *vport,
+ struct mlx5_core_dev **ft_mdev,
+ u32 ib_port)
+{
+ struct mlx5_core_dev *esw_mdev;
+
+ if (!is_mdev_switchdev_mode(dev->mdev))
+ return 0;
+
+ if (!MLX5_CAP_ADV_RDMA(dev->mdev, rdma_transport_manager))
+ return -EOPNOTSUPP;
+
+ if (!dev->port[ib_port - 1].rep)
+ return -EINVAL;
+
+ esw_mdev = mlx5_eswitch_get_core_dev(dev->port[ib_port - 1].rep->esw);
+ if (esw_mdev != dev->mdev)
+ return -EOPNOTSUPP;
+
+ *flags |= MLX5_FLOW_TABLE_OTHER_VPORT;
+ *ft_mdev = esw_mdev;
+ *vport = dev->port[ib_port - 1].rep->vport;
+ *vport_idx = dev->port[ib_port - 1].rep->vport_index;
+
+ return 0;
+}
+
static struct mlx5_ib_flow_prio *
_get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority,
enum mlx5_flow_namespace_type ns_type,
- bool mcast)
+ bool mcast, u32 ib_port)
{
+ struct mlx5_core_dev *ft_mdev = dev->mdev;
struct mlx5_flow_namespace *ns = NULL;
struct mlx5_ib_flow_prio *prio = NULL;
int max_table_size = 0;
+ u16 vport_idx = 0;
bool esw_encap;
u32 flags = 0;
+ u16 vport = 0;
int priority;
+ int ret;
if (mcast)
priority = MLX5_IB_FLOW_MCAST_PRIO;
@@ -1471,13 +1964,38 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority,
MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev, log_max_ft_size));
priority = user_priority;
break;
+ case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX:
+ case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX:
+ if (ib_port == 0 || user_priority > MLX5_RDMA_TRANSPORT_BYPASS_PRIO)
+ return ERR_PTR(-EINVAL);
+ ret = mlx5_ib_fill_transport_ns_info(dev, ns_type, &flags,
+ &vport_idx, &vport,
+ &ft_mdev, ib_port);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX)
+ max_table_size =
+ BIT(MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_RX(
+ ft_mdev, log_max_ft_size));
+ else
+ max_table_size =
+ BIT(MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_TX(
+ ft_mdev, log_max_ft_size));
+ priority = user_priority;
+ break;
default:
break;
}
max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES);
- ns = mlx5_get_flow_namespace(dev->mdev, ns_type);
+ if (ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX ||
+ ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX)
+ ns = mlx5_get_flow_vport_namespace(ft_mdev, ns_type, vport_idx);
+ else
+ ns = mlx5_get_flow_namespace(ft_mdev, ns_type);
+
if (!ns)
return ERR_PTR(-EOPNOTSUPP);
@@ -1497,6 +2015,12 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority,
case MLX5_FLOW_NAMESPACE_RDMA_TX:
prio = &dev->flow_db->rdma_tx[priority];
break;
+ case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX:
+ prio = &dev->flow_db->rdma_transport_rx[ib_port - 1];
+ break;
+ case MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX:
+ prio = &dev->flow_db->rdma_transport_tx[ib_port - 1];
+ break;
default: return ERR_PTR(-EINVAL);
}
@@ -1507,7 +2031,7 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority,
return prio;
return _get_prio(dev, ns, prio, priority, max_table_size,
- MLX5_FS_MAX_TYPES, flags);
+ MLX5_FS_MAX_TYPES, flags, vport);
}
static struct mlx5_ib_flow_handler *
@@ -1603,7 +2127,7 @@ static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher,
static struct mlx5_ib_flow_handler *raw_fs_rule_add(
struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher,
struct mlx5_flow_context *flow_context, struct mlx5_flow_act *flow_act,
- u32 counter_id, void *cmd_in, int inlen, int dest_id, int dest_type)
+ struct mlx5_fc *counter, void *cmd_in, int inlen, int dest_id, int dest_type)
{
struct mlx5_flow_destination *dst;
struct mlx5_ib_flow_prio *ft_prio;
@@ -1626,7 +2150,8 @@ static struct mlx5_ib_flow_handler *raw_fs_rule_add(
mutex_lock(&dev->flow_db->lock);
ft_prio = _get_flow_table(dev, fs_matcher->priority,
- fs_matcher->ns_type, mcast);
+ fs_matcher->ns_type, mcast,
+ fs_matcher->ib_port);
if (IS_ERR(ft_prio)) {
err = PTR_ERR(ft_prio);
goto unlock;
@@ -1652,8 +2177,12 @@ static struct mlx5_ib_flow_handler *raw_fs_rule_add(
}
if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+ if (WARN_ON(!counter)) {
+ err = -EINVAL;
+ goto unlock;
+ }
dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
- dst[dst_num].counter_id = counter_id;
+ dst[dst_num].counter = counter;
dst_num++;
}
@@ -1738,6 +2267,12 @@ mlx5_ib_ft_type_to_namespace(enum mlx5_ib_uapi_flow_table_type table_type,
case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TX:
*namespace = MLX5_FLOW_NAMESPACE_RDMA_TX;
break;
+ case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TRANSPORT_RX:
+ *namespace = MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX;
+ break;
+ case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TRANSPORT_TX:
+ *namespace = MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX;
+ break;
default:
return -EINVAL;
}
@@ -1827,7 +2362,8 @@ static int get_dests(struct uverbs_attr_bundle *attrs,
return -EINVAL;
/* Allow only DEVX object or QP as dest when inserting to RDMA_RX */
- if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) &&
+ if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX ||
+ fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX) &&
((!dest_devx && !dest_qp) || (dest_devx && dest_qp)))
return -EINVAL;
@@ -1844,7 +2380,8 @@ static int get_dests(struct uverbs_attr_bundle *attrs,
return -EINVAL;
/* Allow only flow table as dest when inserting to FDB or RDMA_RX */
if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB_BYPASS ||
- fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) &&
+ fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX ||
+ fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX) &&
*dest_type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE)
return -EINVAL;
} else if (dest_qp) {
@@ -1865,20 +2402,23 @@ static int get_dests(struct uverbs_attr_bundle *attrs,
*dest_id = mqp->raw_packet_qp.rq.tirn;
*dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR;
} else if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS ||
- fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX) &&
+ fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX ||
+ fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX) &&
!(*flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP)) {
*dest_type = MLX5_FLOW_DESTINATION_TYPE_PORT;
}
if (*dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR &&
(fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS ||
- fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX))
+ fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX ||
+ fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX))
return -EINVAL;
return 0;
}
-static bool is_flow_counter(void *obj, u32 offset, u32 *counter_id)
+static bool
+is_flow_counter(void *obj, u32 offset, u32 *counter_id, u32 *fc_bulk_size)
{
struct devx_obj *devx_obj = obj;
u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode);
@@ -1888,6 +2428,7 @@ static bool is_flow_counter(void *obj, u32 offset, u32 *counter_id)
if (offset && offset >= devx_obj->flow_counter_bulk_size)
return false;
+ *fc_bulk_size = devx_obj->flow_counter_bulk_size;
*counter_id = MLX5_GET(dealloc_flow_counter_in,
devx_obj->dinbox,
flow_counter_id);
@@ -1904,13 +2445,13 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
{
struct mlx5_flow_context flow_context = {.flow_tag =
MLX5_FS_DEFAULT_FLOW_TAG};
- u32 *offset_attr, offset = 0, counter_id = 0;
int dest_id, dest_type = -1, inlen, len, ret, i;
struct mlx5_ib_flow_handler *flow_handler;
struct mlx5_ib_flow_matcher *fs_matcher;
struct ib_uobject **arr_flow_actions;
struct ib_uflow_resources *uflow_res;
struct mlx5_flow_act flow_act = {};
+ struct mlx5_fc *counter = NULL;
struct ib_qp *qp = NULL;
void *devx_obj, *cmd_in;
struct ib_uobject *uobj;
@@ -1937,6 +2478,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
len = uverbs_attr_get_uobjs_arr(attrs,
MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &arr_flow_actions);
if (len) {
+ u32 *offset_attr, fc_bulk_size, offset = 0, counter_id = 0;
devx_obj = arr_flow_actions[0]->object;
if (uverbs_attr_is_valid(attrs,
@@ -1956,8 +2498,11 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
offset = *offset_attr;
}
- if (!is_flow_counter(devx_obj, offset, &counter_id))
+ if (!is_flow_counter(devx_obj, offset, &counter_id, &fc_bulk_size))
return -EINVAL;
+ counter = mlx5_fc_local_create(counter_id, offset, fc_bulk_size);
+ if (IS_ERR(counter))
+ return PTR_ERR(counter);
flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
}
@@ -1968,8 +2513,10 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE);
uflow_res = flow_resources_alloc(MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS);
- if (!uflow_res)
- return -ENOMEM;
+ if (!uflow_res) {
+ ret = -ENOMEM;
+ goto destroy_counter;
+ }
len = uverbs_attr_get_uobjs_arr(attrs,
MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, &arr_flow_actions);
@@ -1996,7 +2543,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
flow_handler =
raw_fs_rule_add(dev, fs_matcher, &flow_context, &flow_act,
- counter_id, cmd_in, inlen, dest_id, dest_type);
+ counter, cmd_in, inlen, dest_id, dest_type);
if (IS_ERR(flow_handler)) {
ret = PTR_ERR(flow_handler);
goto err_out;
@@ -2007,6 +2554,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
return 0;
err_out:
ib_uverbs_flow_resources_free(uflow_res);
+destroy_counter:
+ if (counter)
+ mlx5_fc_local_destroy(counter);
return ret;
}
@@ -2338,6 +2888,15 @@ static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs,
return 0;
}
+static bool verify_context_caps(struct mlx5_ib_dev *dev, u64 enabled_caps)
+{
+ if (is_mdev_switchdev_mode(dev->mdev))
+ return UCAP_ENABLED(enabled_caps,
+ RDMA_UCAP_MLX5_CTRL_OTHER_VHCA);
+
+ return UCAP_ENABLED(enabled_caps, RDMA_UCAP_MLX5_CTRL_LOCAL);
+}
+
static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)(
struct uverbs_attr_bundle *attrs)
{
@@ -2386,6 +2945,26 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)(
goto end;
}
+ if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_IB_PORT)) {
+ err = uverbs_copy_from(&obj->ib_port, attrs,
+ MLX5_IB_ATTR_FLOW_MATCHER_IB_PORT);
+ if (err)
+ goto end;
+ if (!rdma_is_port_valid(&dev->ib_dev, obj->ib_port)) {
+ err = -EINVAL;
+ goto end;
+ }
+ if (obj->ns_type != MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX &&
+ obj->ns_type != MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX) {
+ err = -EINVAL;
+ goto end;
+ }
+ if (!verify_context_caps(dev, uobj->context->enabled_caps)) {
+ err = -EOPNOTSUPP;
+ goto end;
+ }
+ }
+
uobj->object = obj;
obj->mdev = dev->mdev;
atomic_set(&obj->usecnt, 0);
@@ -2433,7 +3012,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE)(
mutex_lock(&dev->flow_db->lock);
- ft_prio = _get_flow_table(dev, priority, ns_type, 0);
+ ft_prio = _get_flow_table(dev, priority, ns_type, 0, 0);
if (IS_ERR(ft_prio)) {
err = PTR_ERR(ft_prio);
goto free_obj;
@@ -2819,7 +3398,10 @@ DECLARE_UVERBS_NAMED_METHOD(
UA_OPTIONAL),
UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE,
enum mlx5_ib_uapi_flow_table_type,
- UA_OPTIONAL));
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_FLOW_MATCHER_IB_PORT,
+ UVERBS_ATTR_TYPE(u32),
+ UA_OPTIONAL));
DECLARE_UVERBS_NAMED_METHOD_DESTROY(
MLX5_IB_METHOD_FLOW_MATCHER_DESTROY,
@@ -2889,8 +3471,26 @@ int mlx5_ib_fs_init(struct mlx5_ib_dev *dev)
if (!dev->flow_db)
return -ENOMEM;
+ dev->flow_db->rdma_transport_rx = kcalloc(dev->num_ports,
+ sizeof(struct mlx5_ib_flow_prio),
+ GFP_KERNEL);
+ if (!dev->flow_db->rdma_transport_rx)
+ goto free_flow_db;
+
+ dev->flow_db->rdma_transport_tx = kcalloc(dev->num_ports,
+ sizeof(struct mlx5_ib_flow_prio),
+ GFP_KERNEL);
+ if (!dev->flow_db->rdma_transport_tx)
+ goto free_rdma_transport_rx;
+
mutex_init(&dev->flow_db->lock);
ib_set_device_ops(&dev->ib_dev, &flow_ops);
return 0;
+
+free_rdma_transport_rx:
+ kfree(dev->flow_db->rdma_transport_rx);
+free_flow_db:
+ kfree(dev->flow_db);
+ return -ENOMEM;
}
diff --git a/drivers/infiniband/hw/mlx5/fs.h b/drivers/infiniband/hw/mlx5/fs.h
index b9734904f5f0..2ebe86e5be10 100644
--- a/drivers/infiniband/hw/mlx5/fs.h
+++ b/drivers/infiniband/hw/mlx5/fs.h
@@ -8,23 +8,8 @@
#include "mlx5_ib.h"
-#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
int mlx5_ib_fs_init(struct mlx5_ib_dev *dev);
void mlx5_ib_fs_cleanup_anchor(struct mlx5_ib_dev *dev);
-#else
-static inline int mlx5_ib_fs_init(struct mlx5_ib_dev *dev)
-{
- dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL);
-
- if (!dev->flow_db)
- return -ENOMEM;
-
- mutex_init(&dev->flow_db->lock);
- return 0;
-}
-
-inline void mlx5_ib_fs_cleanup_anchor(struct mlx5_ib_dev *dev) {}
-#endif
static inline void mlx5_ib_fs_cleanup(struct mlx5_ib_dev *dev)
{
@@ -40,6 +25,8 @@ static inline void mlx5_ib_fs_cleanup(struct mlx5_ib_dev *dev)
* is a safe assumption that all references are gone.
*/
mlx5_ib_fs_cleanup_anchor(dev);
+ kfree(dev->flow_db->rdma_transport_tx);
+ kfree(dev->flow_db->rdma_transport_rx);
kfree(dev->flow_db);
}
#endif /* _MLX5_IB_FS_H */
diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index c7a4ee896121..49af1cfbe6d1 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -13,6 +13,7 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev,
int vport_index)
{
struct mlx5_ib_dev *ibdev;
+ struct net_device *ndev;
ibdev = mlx5_eswitch_uplink_get_proto_dev(dev->priv.eswitch, REP_IB);
if (!ibdev)
@@ -20,12 +21,9 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev,
ibdev->port[vport_index].rep = rep;
rep->rep_data[REP_IB].priv = ibdev;
- write_lock(&ibdev->port[vport_index].roce.netdev_lock);
- ibdev->port[vport_index].roce.netdev =
- mlx5_ib_get_rep_netdev(rep->esw, rep->vport);
- write_unlock(&ibdev->port[vport_index].roce.netdev_lock);
+ ndev = mlx5_ib_get_rep_netdev(rep->esw, rep->vport);
- return 0;
+ return ib_device_set_netdev(&ibdev->ib_dev, ndev, vport_index + 1);
}
static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev);
@@ -104,10 +102,15 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
ibdev->is_rep = true;
vport_index = rep->vport_index;
ibdev->port[vport_index].rep = rep;
- ibdev->port[vport_index].roce.netdev =
- mlx5_ib_get_rep_netdev(lag_master->priv.eswitch, rep->vport);
ibdev->mdev = lag_master;
ibdev->num_ports = num_ports;
+ ibdev->ib_dev.phys_port_cnt = num_ports;
+ ret = ib_device_set_netdev(&ibdev->ib_dev,
+ mlx5_ib_get_rep_netdev(lag_master->priv.eswitch,
+ rep->vport),
+ vport_index + 1);
+ if (ret)
+ goto fail_add;
ret = __mlx5_ib_add(ibdev, profile);
if (ret)
@@ -160,9 +163,8 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep)
}
port = &dev->port[vport_index];
- write_lock(&port->roce.netdev_lock);
- port->roce.netdev = NULL;
- write_unlock(&port->roce.netdev_lock);
+
+ ib_device_set_netdev(&dev->ib_dev, NULL, vport_index + 1);
rep->rep_data[REP_IB].priv = NULL;
port->rep = NULL;
diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c
index 0c3c4e64812c..2453ae4384a7 100644
--- a/drivers/infiniband/hw/mlx5/mad.c
+++ b/drivers/infiniband/hw/mlx5/mad.c
@@ -69,7 +69,7 @@ static int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey,
if (ignore_bkey || !in_wc)
op_modifier |= 0x2;
- return mlx5_cmd_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier,
+ return mlx5_cmd_mad_ifc(dev, in_mad, response_mad, op_modifier,
port);
}
@@ -147,8 +147,39 @@ static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt,
vl_15_dropped);
}
-static int query_ib_ppcnt(struct mlx5_core_dev *dev, u8 port_num, void *out,
- size_t sz)
+static void pma_cnt_ext_assign_ppcnt(struct ib_pma_portcounters_ext *cnt_ext,
+ void *out)
+{
+ void *out_pma = MLX5_ADDR_OF(ppcnt_reg, out,
+ counter_set);
+
+#define MLX5_GET_EXT_CNTR(counter_name) \
+ MLX5_GET64(ib_ext_port_cntrs_grp_data_layout, \
+ out_pma, counter_name##_high)
+
+ cnt_ext->port_xmit_data =
+ cpu_to_be64(MLX5_GET_EXT_CNTR(port_xmit_data) >> 2);
+ cnt_ext->port_rcv_data =
+ cpu_to_be64(MLX5_GET_EXT_CNTR(port_rcv_data) >> 2);
+
+ cnt_ext->port_xmit_packets =
+ cpu_to_be64(MLX5_GET_EXT_CNTR(port_xmit_pkts));
+ cnt_ext->port_rcv_packets =
+ cpu_to_be64(MLX5_GET_EXT_CNTR(port_rcv_pkts));
+
+ cnt_ext->port_unicast_xmit_packets =
+ cpu_to_be64(MLX5_GET_EXT_CNTR(port_unicast_xmit_pkts));
+ cnt_ext->port_unicast_rcv_packets =
+ cpu_to_be64(MLX5_GET_EXT_CNTR(port_unicast_rcv_pkts));
+
+ cnt_ext->port_multicast_xmit_packets =
+ cpu_to_be64(MLX5_GET_EXT_CNTR(port_multicast_xmit_pkts));
+ cnt_ext->port_multicast_rcv_packets =
+ cpu_to_be64(MLX5_GET_EXT_CNTR(port_multicast_rcv_pkts));
+}
+
+static int query_ib_ppcnt(struct mlx5_core_dev *dev, u8 port_num, u8 plane_num,
+ void *out, size_t sz, bool ext)
{
u32 *in;
int err;
@@ -160,8 +191,14 @@ static int query_ib_ppcnt(struct mlx5_core_dev *dev, u8 port_num, void *out,
}
MLX5_SET(ppcnt_reg, in, local_port, port_num);
-
- MLX5_SET(ppcnt_reg, in, grp, MLX5_INFINIBAND_PORT_COUNTERS_GROUP);
+ MLX5_SET(ppcnt_reg, in, plane_ind, plane_num);
+
+ if (ext)
+ MLX5_SET(ppcnt_reg, in, grp,
+ MLX5_INFINIBAND_EXTENDED_PORT_COUNTERS_GROUP);
+ else
+ MLX5_SET(ppcnt_reg, in, grp,
+ MLX5_INFINIBAND_PORT_COUNTERS_GROUP);
err = mlx5_core_access_reg(dev, in, sz, out,
sz, MLX5_REG_PPCNT, 0, 0);
@@ -188,7 +225,9 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num,
mdev = dev->mdev;
mdev_port_num = 1;
}
- if (MLX5_CAP_GEN(dev->mdev, num_ports) == 1) {
+ if (MLX5_CAP_GEN(dev->mdev, num_ports) == 1 &&
+ !mlx5_core_mp_enabled(mdev) &&
+ dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI) {
/* set local port to one for Function-Per-Port HCA. */
mdev = dev->mdev;
mdev_port_num = 1;
@@ -207,7 +246,8 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num,
if (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT) {
struct ib_pma_portcounters_ext *pma_cnt_ext =
(struct ib_pma_portcounters_ext *)(out_mad->data + 40);
- int sz = MLX5_ST_SZ_BYTES(query_vport_counter_out);
+ int sz = max(MLX5_ST_SZ_BYTES(query_vport_counter_out),
+ MLX5_ST_SZ_BYTES(ppcnt_reg));
out_cnt = kvzalloc(sz, GFP_KERNEL);
if (!out_cnt) {
@@ -215,10 +255,18 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num,
goto done;
}
- err = mlx5_core_query_vport_counter(mdev, 0, 0, mdev_port_num,
- out_cnt);
- if (!err)
- pma_cnt_ext_assign(pma_cnt_ext, out_cnt);
+ if (dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) {
+ err = query_ib_ppcnt(mdev, mdev_port_num,
+ port_num, out_cnt, sz, 1);
+ if (!err)
+ pma_cnt_ext_assign_ppcnt(pma_cnt_ext, out_cnt);
+ } else {
+ err = mlx5_core_query_vport_counter(mdev, 0, 0,
+ mdev_port_num,
+ out_cnt);
+ if (!err)
+ pma_cnt_ext_assign(pma_cnt_ext, out_cnt);
+ }
} else {
struct ib_pma_portcounters *pma_cnt =
(struct ib_pma_portcounters *)(out_mad->data + 40);
@@ -230,7 +278,13 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num,
goto done;
}
- err = query_ib_ppcnt(mdev, mdev_port_num, out_cnt, sz);
+ if (dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI)
+ err = query_ib_ppcnt(mdev, mdev_port_num, port_num,
+ out_cnt, sz, 0);
+ else
+ err = query_ib_ppcnt(mdev, mdev_port_num, 0,
+ out_cnt, sz, 0);
+
if (!err)
pma_cnt_assign(pma_cnt, out_cnt);
}
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index c2b557e64290..ce7610740412 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -47,7 +47,9 @@
#include <rdma/uverbs_ioctl.h>
#include <rdma/mlx5_user_ioctl_verbs.h>
#include <rdma/mlx5_user_ioctl_cmds.h>
+#include <rdma/ib_ucaps.h>
#include "macsec.h"
+#include "data_direct.h"
#define UVERBS_MODULE_NAME mlx5_ib
#include <rdma/uverbs_named_ioctl.h>
@@ -146,16 +148,52 @@ static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev,
if (upper && port->rep->vport == MLX5_VPORT_UPLINK)
continue;
-
- read_lock(&port->roce.netdev_lock);
- rep_ndev = mlx5_ib_get_rep_netdev(port->rep->esw,
- port->rep->vport);
- if (rep_ndev == ndev) {
- read_unlock(&port->roce.netdev_lock);
+ rep_ndev = ib_device_get_netdev(&dev->ib_dev, i + 1);
+ if (rep_ndev && rep_ndev == ndev) {
+ dev_put(rep_ndev);
*port_num = i + 1;
return &port->roce;
}
- read_unlock(&port->roce.netdev_lock);
+
+ dev_put(rep_ndev);
+ }
+
+ return NULL;
+}
+
+static bool mlx5_netdev_send_event(struct mlx5_ib_dev *dev,
+ struct net_device *ndev,
+ struct net_device *upper,
+ struct net_device *ib_ndev)
+{
+ if (!dev->ib_active)
+ return false;
+
+ /* Event is about our upper device */
+ if (upper == ndev)
+ return true;
+
+ /* RDMA device is not in lag and not in switchdev */
+ if (!dev->is_rep && !upper && ndev == ib_ndev)
+ return true;
+
+ /* RDMA device is in switchdev */
+ if (dev->is_rep && ndev == ib_ndev)
+ return true;
+
+ return false;
+}
+
+static struct net_device *mlx5_ib_get_rep_uplink_netdev(struct mlx5_ib_dev *ibdev)
+{
+ struct mlx5_ib_port *port;
+ int i;
+
+ for (i = 0; i < ibdev->num_ports; i++) {
+ port = &ibdev->port[i];
+ if (port->rep && port->rep->vport == MLX5_VPORT_UPLINK) {
+ return ib_device_get_netdev(&ibdev->ib_dev, i + 1);
+ }
}
return NULL;
@@ -167,6 +205,7 @@ static int mlx5_netdev_event(struct notifier_block *this,
struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb);
struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
u32 port_num = roce->native_port_num;
+ struct net_device *ib_ndev = NULL;
struct mlx5_core_dev *mdev;
struct mlx5_ib_dev *ibdev;
@@ -180,47 +219,67 @@ static int mlx5_netdev_event(struct notifier_block *this,
/* Should already be registered during the load */
if (ibdev->is_rep)
break;
- write_lock(&roce->netdev_lock);
+
+ ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num);
+ /* Exit if already registered */
+ if (ib_ndev)
+ goto put_ndev;
+
if (ndev->dev.parent == mdev->device)
- roce->netdev = ndev;
- write_unlock(&roce->netdev_lock);
+ ib_device_set_netdev(&ibdev->ib_dev, ndev, port_num);
break;
case NETDEV_UNREGISTER:
/* In case of reps, ib device goes away before the netdevs */
- write_lock(&roce->netdev_lock);
- if (roce->netdev == ndev)
- roce->netdev = NULL;
- write_unlock(&roce->netdev_lock);
- break;
+ if (ibdev->is_rep)
+ break;
+ ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num);
+ if (ib_ndev == ndev)
+ ib_device_set_netdev(&ibdev->ib_dev, NULL, port_num);
+ goto put_ndev;
case NETDEV_CHANGE:
case NETDEV_UP:
case NETDEV_DOWN: {
- struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(mdev);
struct net_device *upper = NULL;
- if (lag_ndev) {
- upper = netdev_master_upper_dev_get(lag_ndev);
- dev_put(lag_ndev);
+ if (!netif_is_lag_master(ndev) && !netif_is_lag_port(ndev) &&
+ !mlx5_core_mp_enabled(mdev))
+ return NOTIFY_DONE;
+
+ if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) {
+ struct net_device *lag_ndev;
+
+ if (mlx5_lag_is_roce(mdev))
+ lag_ndev = ib_device_get_netdev(&ibdev->ib_dev, 1);
+ else /* sriov lag */
+ lag_ndev = mlx5_ib_get_rep_uplink_netdev(ibdev);
+
+ if (lag_ndev) {
+ upper = netdev_master_upper_dev_get(lag_ndev);
+ dev_put(lag_ndev);
+ } else {
+ goto done;
+ }
}
if (ibdev->is_rep)
roce = mlx5_get_rep_roce(ibdev, ndev, upper, &port_num);
if (!roce)
return NOTIFY_DONE;
- if ((upper == ndev ||
- ((!upper || ibdev->is_rep) && ndev == roce->netdev)) &&
- ibdev->ib_active) {
+
+ ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num);
+
+ if (mlx5_netdev_send_event(ibdev, ndev, upper, ib_ndev)) {
struct ib_event ibev = { };
enum ib_port_state port_state;
if (get_port_state(&ibdev->ib_dev, port_num,
&port_state))
- goto done;
+ goto put_ndev;
if (roce->last_port_state == port_state)
- goto done;
+ goto put_ndev;
roce->last_port_state = port_state;
ibev.device = &ibdev->ib_dev;
@@ -229,7 +288,7 @@ static int mlx5_netdev_event(struct notifier_block *this,
else if (port_state == IB_PORT_ACTIVE)
ibev.event = IB_EVENT_PORT_ACTIVE;
else
- goto done;
+ goto put_ndev;
ibev.element.port_num = port_num;
ib_dispatch_event(&ibev);
@@ -240,39 +299,13 @@ static int mlx5_netdev_event(struct notifier_block *this,
default:
break;
}
+put_ndev:
+ dev_put(ib_ndev);
done:
mlx5_ib_put_native_port_mdev(ibdev, port_num);
return NOTIFY_DONE;
}
-static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
- u32 port_num)
-{
- struct mlx5_ib_dev *ibdev = to_mdev(device);
- struct net_device *ndev;
- struct mlx5_core_dev *mdev;
-
- mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL);
- if (!mdev)
- return NULL;
-
- ndev = mlx5_lag_get_roce_netdev(mdev);
- if (ndev)
- goto out;
-
- /* Ensure ndev does not disappear before we invoke dev_hold()
- */
- read_lock(&ibdev->port[port_num - 1].roce.netdev_lock);
- ndev = ibdev->port[port_num - 1].roce.netdev;
- if (ndev)
- dev_hold(ndev);
- read_unlock(&ibdev->port[port_num - 1].roce.netdev_lock);
-
-out:
- mlx5_ib_put_native_port_mdev(ibdev, port_num);
- return ndev;
-}
-
struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev,
u32 ib_port_num,
u32 *native_port_num)
@@ -283,6 +316,14 @@ struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev,
struct mlx5_ib_multiport_info *mpi;
struct mlx5_ib_port *port;
+ if (ibdev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) {
+ if (native_port_num)
+ *native_port_num = smi_to_native_portnum(ibdev,
+ ib_port_num);
+ return ibdev->mdev;
+ }
+
if (!mlx5_core_mp_enabled(ibdev->mdev) ||
ll != IB_LINK_LAYER_ETHERNET) {
if (native_port_num)
@@ -444,6 +485,10 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed,
*active_width = IB_WIDTH_2X;
*active_speed = IB_SPEED_NDR;
break;
+ case MLX5E_PROT_MASK(MLX5E_200GAUI_1_200GBASE_CR1_KR1):
+ *active_width = IB_WIDTH_1X;
+ *active_speed = IB_SPEED_XDR;
+ break;
case MLX5E_PROT_MASK(MLX5E_400GAUI_8_400GBASE_CR8):
*active_width = IB_WIDTH_8X;
*active_speed = IB_SPEED_HDR;
@@ -452,10 +497,18 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed,
*active_width = IB_WIDTH_4X;
*active_speed = IB_SPEED_NDR;
break;
+ case MLX5E_PROT_MASK(MLX5E_400GAUI_2_400GBASE_CR2_KR2):
+ *active_width = IB_WIDTH_2X;
+ *active_speed = IB_SPEED_XDR;
+ break;
case MLX5E_PROT_MASK(MLX5E_800GAUI_8_800GBASE_CR8_KR8):
*active_width = IB_WIDTH_8X;
*active_speed = IB_SPEED_NDR;
break;
+ case MLX5E_PROT_MASK(MLX5E_800GAUI_4_800GBASE_CR4_KR4):
+ *active_width = IB_WIDTH_4X;
+ *active_speed = IB_SPEED_XDR;
+ break;
default:
return -EINVAL;
}
@@ -504,10 +557,10 @@ static int mlx5_query_port_roce(struct ib_device *device, u32 port_num,
*/
if (dev->is_rep)
err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
- 1);
+ 1, 0);
else
err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
- mdev_port_num);
+ mdev_port_num, 0);
if (err)
goto out;
ext = !!MLX5_GET_ETH_PROTO(ptys_reg, out, true, eth_proto_capability);
@@ -539,11 +592,11 @@ static int mlx5_query_port_roce(struct ib_device *device, u32 port_num,
if (!put_mdev)
goto out;
- ndev = mlx5_ib_get_netdev(device, port_num);
+ ndev = ib_device_get_netdev(device, port_num);
if (!ndev)
goto out;
- if (dev->lag_active) {
+ if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) {
rcu_read_lock();
upper = netdev_master_upper_dev_get_rcu(ndev);
if (upper) {
@@ -1146,6 +1199,14 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE;
resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT;
+
+ if (MLX5_CAP_GEN_2(mdev, dp_ordering_force) &&
+ (MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_xrc) ||
+ MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_dc) ||
+ MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_rc) ||
+ MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_ud) ||
+ MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_uc)))
+ resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_OOO_DP;
}
if (offsetofend(typeof(resp), sw_parsing_caps) <= uhw_outlen) {
@@ -1334,11 +1395,11 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port,
struct mlx5_ib_dev *dev = to_mdev(ibdev);
struct mlx5_core_dev *mdev = dev->mdev;
struct mlx5_hca_vport_context *rep;
+ u8 vl_hw_cap, plane_index = 0;
u16 max_mtu;
u16 oper_mtu;
int err;
u16 ib_link_width_oper;
- u8 vl_hw_cap;
rep = kzalloc(sizeof(*rep), GFP_KERNEL);
if (!rep) {
@@ -1348,6 +1409,11 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port,
/* props being zeroed by the caller, avoid zeroing it here */
+ if (ibdev->type == RDMA_DEVICE_TYPE_SMI) {
+ plane_index = port;
+ port = smi_to_native_portnum(dev, port);
+ }
+
err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep);
if (err)
goto out;
@@ -1358,7 +1424,14 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port,
props->sm_sl = rep->sm_sl;
props->state = rep->vport_state;
props->phys_state = rep->port_physical_state;
- props->port_cap_flags = rep->cap_mask1;
+
+ props->port_cap_flags = rep->cap_mask1;
+ if (dev->num_plane) {
+ props->port_cap_flags |= IB_PORT_SM_DISABLED;
+ props->port_cap_flags &= ~IB_PORT_SM;
+ } else if (ibdev->type == RDMA_DEVICE_TYPE_SMI)
+ props->port_cap_flags &= ~IB_PORT_CM_SUP;
+
props->gid_tbl_len = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg);
props->pkey_tbl_len = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size));
@@ -1371,7 +1444,7 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port,
props->port_cap_flags2 = rep->cap_mask2;
err = mlx5_query_ib_port_oper(mdev, &ib_link_width_oper,
- &props->active_speed, port);
+ &props->active_speed, port, plane_index);
if (err)
goto out;
@@ -1811,7 +1884,7 @@ static int set_ucontext_resp(struct ib_ucontext *uctx,
}
resp->qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
- if (dev->wc_support)
+ if (mlx5_wc_support_get(dev->mdev))
resp->bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev,
log_bf_reg_size);
resp->cache_line_size = cache_line_size();
@@ -1874,6 +1947,12 @@ static int set_ucontext_resp(struct ib_ucontext *uctx,
return 0;
}
+static bool uctx_rdma_ctrl_is_enabled(u64 enabled_caps)
+{
+ return UCAP_ENABLED(enabled_caps, RDMA_UCAP_MLX5_CTRL_LOCAL) ||
+ UCAP_ENABLED(enabled_caps, RDMA_UCAP_MLX5_CTRL_OTHER_VHCA);
+}
+
static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
struct ib_udata *udata)
{
@@ -1916,10 +1995,17 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
return -EINVAL;
if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
- err = mlx5_ib_devx_create(dev, true);
+ err = mlx5_ib_devx_create(dev, true, uctx->enabled_caps);
if (err < 0)
goto out_ctx;
context->devx_uid = err;
+
+ if (uctx_rdma_ctrl_is_enabled(uctx->enabled_caps)) {
+ err = mlx5_cmd_add_privileged_uid(dev->mdev,
+ context->devx_uid);
+ if (err)
+ goto out_devx;
+ }
}
lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
@@ -1934,7 +2020,7 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
/* updates req->total_num_bfregs */
err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi);
if (err)
- goto out_devx;
+ goto out_ucap;
mutex_init(&bfregi->lock);
bfregi->lib_uar_4k = lib_uar_4k;
@@ -1942,7 +2028,7 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
GFP_KERNEL);
if (!bfregi->count) {
err = -ENOMEM;
- goto out_devx;
+ goto out_ucap;
}
bfregi->sys_pages = kcalloc(bfregi->num_sys_pages,
@@ -2006,6 +2092,11 @@ out_sys_pages:
out_count:
kfree(bfregi->count);
+out_ucap:
+ if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX &&
+ uctx_rdma_ctrl_is_enabled(uctx->enabled_caps))
+ mlx5_cmd_remove_privileged_uid(dev->mdev, context->devx_uid);
+
out_devx:
if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX)
mlx5_ib_devx_destroy(dev, context->devx_uid);
@@ -2050,8 +2141,12 @@ static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
kfree(bfregi->sys_pages);
kfree(bfregi->count);
- if (context->devx_uid)
+ if (context->devx_uid) {
+ if (uctx_rdma_ctrl_is_enabled(ibcontext->enabled_caps))
+ mlx5_cmd_remove_privileged_uid(dev->mdev,
+ context->devx_uid);
mlx5_ib_devx_destroy(dev, context->devx_uid);
+ }
}
static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev,
@@ -2338,7 +2433,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm
switch (command) {
case MLX5_IB_MMAP_WC_PAGE:
case MLX5_IB_MMAP_ALLOC_WC:
- if (!dev->wc_support)
+ if (!mlx5_wc_support_get(dev->mdev))
return -EPERM;
fallthrough;
case MLX5_IB_MMAP_NC_PAGE:
@@ -2777,6 +2872,23 @@ static int mlx5_ib_event_slave_port(struct notifier_block *nb,
return NOTIFY_OK;
}
+static int mlx5_ib_get_plane_num(struct mlx5_core_dev *mdev, u8 *num_plane)
+{
+ struct mlx5_hca_vport_context vport_ctx;
+ int err;
+
+ *num_plane = 0;
+ if (!MLX5_CAP_GEN(mdev, ib_virt) || !MLX5_CAP_GEN_2(mdev, multiplane))
+ return 0;
+
+ err = mlx5_query_hca_vport_context(mdev, 0, 1, 0, &vport_ctx);
+ if (err)
+ return err;
+
+ *num_plane = vport_ctx.num_plane;
+ return 0;
+}
+
static int set_has_smi_cap(struct mlx5_ib_dev *dev)
{
struct mlx5_hca_vport_context vport_ctx;
@@ -2787,10 +2899,15 @@ static int set_has_smi_cap(struct mlx5_ib_dev *dev)
return 0;
for (port = 1; port <= dev->num_ports; port++) {
- if (!MLX5_CAP_GEN(dev->mdev, ib_virt)) {
+ if (dev->num_plane) {
+ dev->port_caps[port - 1].has_smi = false;
+ continue;
+ } else if (!MLX5_CAP_GEN(dev->mdev, ib_virt) ||
+ dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) {
dev->port_caps[port - 1].has_smi = true;
continue;
}
+
err = mlx5_query_hca_vport_context(dev->mdev, 0, port, 0,
&vport_ctx);
if (err) {
@@ -2824,37 +2941,72 @@ static u8 mlx5_get_umr_fence(u8 umr_fence_cap)
}
}
-static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev)
+int mlx5_ib_dev_res_cq_init(struct mlx5_ib_dev *dev)
{
struct mlx5_ib_resources *devr = &dev->devr;
- struct ib_srq_init_attr attr;
- struct ib_device *ibdev;
struct ib_cq_init_attr cq_attr = {.cqe = 1};
- int port;
+ struct ib_device *ibdev;
+ struct ib_pd *pd;
+ struct ib_cq *cq;
int ret = 0;
- ibdev = &dev->ib_dev;
- if (!MLX5_CAP_GEN(dev->mdev, xrc))
- return -EOPNOTSUPP;
+ /*
+ * devr->c0 is set once, never changed until device unload.
+ * Avoid taking the mutex if initialization is already done.
+ */
+ if (devr->c0)
+ return 0;
- devr->p0 = ib_alloc_pd(ibdev, 0);
- if (IS_ERR(devr->p0))
- return PTR_ERR(devr->p0);
+ mutex_lock(&devr->cq_lock);
+ if (devr->c0)
+ goto unlock;
- devr->c0 = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr);
- if (IS_ERR(devr->c0)) {
- ret = PTR_ERR(devr->c0);
- goto error1;
+ ibdev = &dev->ib_dev;
+ pd = ib_alloc_pd(ibdev, 0);
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
+ mlx5_ib_err(dev, "Couldn't allocate PD for res init, err=%d\n", ret);
+ goto unlock;
}
- ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn0, 0);
- if (ret)
- goto error2;
+ cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr);
+ if (IS_ERR(cq)) {
+ ret = PTR_ERR(cq);
+ mlx5_ib_err(dev, "Couldn't create CQ for res init, err=%d\n", ret);
+ ib_dealloc_pd(pd);
+ goto unlock;
+ }
- ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn1, 0);
+ devr->p0 = pd;
+ devr->c0 = cq;
+
+unlock:
+ mutex_unlock(&devr->cq_lock);
+ return ret;
+}
+
+int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev)
+{
+ struct mlx5_ib_resources *devr = &dev->devr;
+ struct ib_srq_init_attr attr;
+ struct ib_srq *s0, *s1;
+ int ret = 0;
+
+ /*
+ * devr->s1 is set once, never changed until device unload.
+ * Avoid taking the mutex if initialization is already done.
+ */
+ if (devr->s1)
+ return 0;
+
+ mutex_lock(&devr->srq_lock);
+ if (devr->s1)
+ goto unlock;
+
+ ret = mlx5_ib_dev_res_cq_init(dev);
if (ret)
- goto error3;
+ goto unlock;
memset(&attr, 0, sizeof(attr));
attr.attr.max_sge = 1;
@@ -2862,10 +3014,11 @@ static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev)
attr.srq_type = IB_SRQT_XRC;
attr.ext.cq = devr->c0;
- devr->s0 = ib_create_srq(devr->p0, &attr);
- if (IS_ERR(devr->s0)) {
- ret = PTR_ERR(devr->s0);
- goto err_create;
+ s0 = ib_create_srq(devr->p0, &attr);
+ if (IS_ERR(s0)) {
+ ret = PTR_ERR(s0);
+ mlx5_ib_err(dev, "Couldn't create SRQ 0 for res init, err=%d\n", ret);
+ goto unlock;
}
memset(&attr, 0, sizeof(attr));
@@ -2873,51 +3026,116 @@ static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev)
attr.attr.max_wr = 1;
attr.srq_type = IB_SRQT_BASIC;
- devr->s1 = ib_create_srq(devr->p0, &attr);
- if (IS_ERR(devr->s1)) {
- ret = PTR_ERR(devr->s1);
- goto error6;
+ s1 = ib_create_srq(devr->p0, &attr);
+ if (IS_ERR(s1)) {
+ ret = PTR_ERR(s1);
+ mlx5_ib_err(dev, "Couldn't create SRQ 1 for res init, err=%d\n", ret);
+ ib_destroy_srq(s0);
+ goto unlock;
}
- for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
- INIT_WORK(&devr->ports[port].pkey_change_work,
- pkey_change_handler);
-
- return 0;
+ devr->s0 = s0;
+ devr->s1 = s1;
-error6:
- ib_destroy_srq(devr->s0);
-err_create:
- mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0);
-error3:
- mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0);
-error2:
- ib_destroy_cq(devr->c0);
-error1:
- ib_dealloc_pd(devr->p0);
+unlock:
+ mutex_unlock(&devr->srq_lock);
return ret;
}
-static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev)
+static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev)
{
struct mlx5_ib_resources *devr = &dev->devr;
- int port;
+ int ret;
- /*
- * Make sure no change P_Key work items are still executing.
- *
- * At this stage, the mlx5_ib_event should be unregistered
- * and it ensures that no new works are added.
- */
- for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
- cancel_work_sync(&devr->ports[port].pkey_change_work);
+ if (!MLX5_CAP_GEN(dev->mdev, xrc))
+ return -EOPNOTSUPP;
+
+ ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn0, 0);
+ if (ret)
+ return ret;
+
+ ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn1, 0);
+ if (ret) {
+ mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0);
+ return ret;
+ }
+
+ mutex_init(&devr->cq_lock);
+ mutex_init(&devr->srq_lock);
- ib_destroy_srq(devr->s1);
- ib_destroy_srq(devr->s0);
+ return 0;
+}
+
+static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev)
+{
+ struct mlx5_ib_resources *devr = &dev->devr;
+
+ /* After s0/s1 init, they are not unset during the device lifetime. */
+ if (devr->s1) {
+ ib_destroy_srq(devr->s1);
+ ib_destroy_srq(devr->s0);
+ }
mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0);
mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0);
- ib_destroy_cq(devr->c0);
- ib_dealloc_pd(devr->p0);
+ /* After p0/c0 init, they are not unset during the device lifetime. */
+ if (devr->c0) {
+ ib_destroy_cq(devr->c0);
+ ib_dealloc_pd(devr->p0);
+ }
+ mutex_destroy(&devr->cq_lock);
+ mutex_destroy(&devr->srq_lock);
+}
+
+static int
+mlx5_ib_create_data_direct_resources(struct mlx5_ib_dev *dev)
+{
+ int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+ struct mlx5_core_dev *mdev = dev->mdev;
+ void *mkc;
+ u32 mkey;
+ u32 pdn;
+ u32 *in;
+ int err;
+
+ err = mlx5_core_alloc_pd(mdev, &pdn);
+ if (err)
+ return err;
+
+ in = kvzalloc(inlen, GFP_KERNEL);
+ if (!in) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ MLX5_SET(create_mkey_in, in, data_direct, 1);
+ mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+ MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
+ MLX5_SET(mkc, mkc, lw, 1);
+ MLX5_SET(mkc, mkc, lr, 1);
+ MLX5_SET(mkc, mkc, rw, 1);
+ MLX5_SET(mkc, mkc, rr, 1);
+ MLX5_SET(mkc, mkc, a, 1);
+ MLX5_SET(mkc, mkc, pd, pdn);
+ MLX5_SET(mkc, mkc, length64, 1);
+ MLX5_SET(mkc, mkc, qpn, 0xffffff);
+ err = mlx5_core_create_mkey(mdev, &mkey, in, inlen);
+ kvfree(in);
+ if (err)
+ goto err;
+
+ dev->ddr.mkey = mkey;
+ dev->ddr.pdn = pdn;
+ return 0;
+
+err:
+ mlx5_core_dealloc_pd(mdev, pdn);
+ return err;
+}
+
+static void
+mlx5_ib_free_data_direct_resources(struct mlx5_ib_dev *dev)
+{
+ mlx5_core_destroy_mkey(dev->mdev, dev->ddr.mkey);
+ mlx5_core_dealloc_pd(dev->mdev, dev->ddr.pdn);
}
static u32 get_core_cap_flags(struct ib_device *ibdev,
@@ -2933,6 +3151,13 @@ static u32 get_core_cap_flags(struct ib_device *ibdev,
if (rep->grh_required)
ret |= RDMA_CORE_CAP_IB_GRH_REQUIRED;
+ if (dev->num_plane)
+ return ret | RDMA_CORE_CAP_PROT_IB | RDMA_CORE_CAP_IB_MAD |
+ RDMA_CORE_CAP_IB_CM | RDMA_CORE_CAP_IB_SA |
+ RDMA_CORE_CAP_AF_IB;
+ else if (ibdev->type == RDMA_DEVICE_TYPE_SMI)
+ return ret | RDMA_CORE_CAP_IB_MAD | RDMA_CORE_CAP_IB_SMI;
+
if (ll == IB_LINK_LAYER_INFINIBAND)
return ret | RDMA_CORE_PORT_IBA_IB;
@@ -2968,6 +3193,9 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u32 port_num,
return err;
if (ll == IB_LINK_LAYER_INFINIBAND) {
+ if (ibdev->type == RDMA_DEVICE_TYPE_SMI)
+ port_num = smi_to_native_portnum(dev, port_num);
+
err = mlx5_query_hca_vport_context(dev->mdev, 0, port_num, 0,
&rep);
if (err)
@@ -3010,6 +3238,67 @@ static void get_dev_fw_str(struct ib_device *ibdev, char *str)
fw_rev_sub(dev->mdev));
}
+static int lag_event(struct notifier_block *nb, unsigned long event, void *data)
+{
+ struct mlx5_ib_dev *dev = container_of(nb, struct mlx5_ib_dev,
+ lag_events);
+ struct mlx5_core_dev *mdev = dev->mdev;
+ struct ib_device *ibdev = &dev->ib_dev;
+ struct net_device *old_ndev = NULL;
+ struct mlx5_ib_port *port;
+ struct net_device *ndev;
+ u32 portnum = 0;
+ int ret = 0;
+ int i;
+
+ switch (event) {
+ case MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE:
+ ndev = data;
+ if (ndev) {
+ if (!mlx5_lag_is_roce(mdev)) {
+ // sriov lag
+ for (i = 0; i < dev->num_ports; i++) {
+ port = &dev->port[i];
+ if (port->rep && port->rep->vport ==
+ MLX5_VPORT_UPLINK) {
+ portnum = i;
+ break;
+ }
+ }
+ }
+ old_ndev = ib_device_get_netdev(ibdev, portnum + 1);
+ ret = ib_device_set_netdev(ibdev, ndev, portnum + 1);
+ if (ret)
+ goto out;
+
+ if (old_ndev)
+ roce_del_all_netdev_gids(ibdev, portnum + 1,
+ old_ndev);
+ rdma_roce_rescan_port(ibdev, portnum + 1);
+ }
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+out:
+ dev_put(old_ndev);
+ return notifier_from_errno(ret);
+}
+
+static void mlx5e_lag_event_register(struct mlx5_ib_dev *dev)
+{
+ dev->lag_events.notifier_call = lag_event;
+ blocking_notifier_chain_register(&dev->mdev->priv.lag_nh,
+ &dev->lag_events);
+}
+
+static void mlx5e_lag_event_unregister(struct mlx5_ib_dev *dev)
+{
+ blocking_notifier_chain_unregister(&dev->mdev->priv.lag_nh,
+ &dev->lag_events);
+}
+
static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
{
struct mlx5_core_dev *mdev = dev->mdev;
@@ -3031,6 +3320,7 @@ static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
goto err_destroy_vport_lag;
}
+ mlx5e_lag_event_register(dev);
dev->flow_db->lag_demux_ft = ft;
dev->lag_ports = mlx5_lag_get_num_ports(mdev);
dev->lag_active = true;
@@ -3048,6 +3338,7 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev)
if (dev->lag_active) {
dev->lag_active = false;
+ mlx5e_lag_event_unregister(dev);
mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft);
dev->flow_db->lag_demux_ft = NULL;
@@ -3306,6 +3597,41 @@ unbind:
return false;
}
+static int mlx5_ib_data_direct_init(struct mlx5_ib_dev *dev)
+{
+ char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1] = {};
+ int ret;
+
+ if (!MLX5_CAP_GEN(dev->mdev, data_direct) ||
+ !MLX5_CAP_GEN_2(dev->mdev, query_vuid))
+ return 0;
+
+ ret = mlx5_cmd_query_vuid(dev->mdev, true, vuid);
+ if (ret)
+ return ret;
+
+ ret = mlx5_ib_create_data_direct_resources(dev);
+ if (ret)
+ return ret;
+
+ INIT_LIST_HEAD(&dev->data_direct_mr_list);
+ ret = mlx5_data_direct_ib_reg(dev, vuid);
+ if (ret)
+ mlx5_ib_free_data_direct_resources(dev);
+
+ return ret;
+}
+
+static void mlx5_ib_data_direct_cleanup(struct mlx5_ib_dev *dev)
+{
+ if (!MLX5_CAP_GEN(dev->mdev, data_direct) ||
+ !MLX5_CAP_GEN_2(dev->mdev, query_vuid))
+ return;
+
+ mlx5_data_direct_ib_unreg(dev);
+ mlx5_ib_free_data_direct_resources(dev);
+}
+
static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev)
{
u32 port_num = mlx5_core_native_port_num(dev->mdev) - 1;
@@ -3352,7 +3678,8 @@ static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev)
list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list,
list) {
if (dev->sys_image_guid == mpi->sys_image_guid &&
- (mlx5_core_native_port_num(mpi->mdev) - 1) == i) {
+ (mlx5_core_native_port_num(mpi->mdev) - 1) == i &&
+ mlx5_core_same_coredev_type(dev->mdev, mpi->mdev)) {
bound = mlx5_ib_bind_slave_port(dev, mpi);
}
@@ -3613,7 +3940,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_UAR_OBJ_ALLOC)(
alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC)
return -EOPNOTSUPP;
- if (!to_mdev(c->ibucontext.device)->wc_support &&
+ if (!mlx5_wc_support_get(to_mdev(c->ibucontext.device)->mdev) &&
alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF)
return -EOPNOTSUPP;
@@ -3682,14 +4009,24 @@ ADD_UVERBS_ATTRIBUTES_SIMPLE(
dump_fill_mkey),
UA_MANDATORY));
+ADD_UVERBS_ATTRIBUTES_SIMPLE(
+ mlx5_ib_reg_dmabuf_mr,
+ UVERBS_OBJECT_MR,
+ UVERBS_METHOD_REG_DMABUF_MR,
+ UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
+ enum mlx5_ib_uapi_reg_dmabuf_flags,
+ UA_OPTIONAL));
+
static const struct uapi_definition mlx5_ib_defs[] = {
UAPI_DEF_CHAIN(mlx5_ib_devx_defs),
UAPI_DEF_CHAIN(mlx5_ib_flow_defs),
UAPI_DEF_CHAIN(mlx5_ib_qos_defs),
UAPI_DEF_CHAIN(mlx5_ib_std_types_defs),
UAPI_DEF_CHAIN(mlx5_ib_dm_defs),
+ UAPI_DEF_CHAIN(mlx5_ib_create_cq_defs),
UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DEVICE, &mlx5_ib_query_context),
+ UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_MR, &mlx5_ib_reg_dmabuf_mr),
UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR,
UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)),
UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR),
@@ -3698,6 +4035,7 @@ static const struct uapi_definition mlx5_ib_defs[] = {
static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
{
+ mlx5_ib_data_direct_cleanup(dev);
mlx5_ib_cleanup_multiport_master(dev);
WARN_ON(!xa_empty(&dev->odp_mkeys));
mutex_destroy(&dev->cap_mask_mutex);
@@ -3713,13 +4051,11 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
dev->ib_dev.node_type = RDMA_NODE_IB_CA;
dev->ib_dev.local_dma_lkey = 0 /* not supported for now */;
- dev->ib_dev.phys_port_cnt = dev->num_ports;
dev->ib_dev.dev.parent = mdev->device;
dev->ib_dev.lag_flags = RDMA_LAG_FLAGS_HASH_ALL_SLAVES;
for (i = 0; i < dev->num_ports; i++) {
spin_lock_init(&dev->port[i].mp.mpi_lock);
- rwlock_init(&dev->port[i].roce.netdev_lock);
dev->port[i].roce.dev = dev;
dev->port[i].roce.native_port_num = i + 1;
dev->port[i].roce.last_port_state = IB_PORT_DOWN;
@@ -3751,6 +4087,7 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
dev->ib_dev.num_comp_vectors = mlx5_comp_vectors_max(mdev);
mutex_init(&dev->cap_mask_mutex);
+ mutex_init(&dev->data_direct_lock);
INIT_LIST_HEAD(&dev->qp_list);
spin_lock_init(&dev->reset_flow_resource_lock);
xa_init(&dev->odp_mkeys);
@@ -3759,25 +4096,22 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
spin_lock_init(&dev->dm.lock);
dev->dm.dev = mdev;
+ err = mlx5_ib_data_direct_init(dev);
+ if (err)
+ goto err_mp;
+
return 0;
-err:
- mlx5r_macsec_dealloc_gids(dev);
err_mp:
mlx5_ib_cleanup_multiport_master(dev);
+err:
+ mlx5r_macsec_dealloc_gids(dev);
return err;
}
-static int mlx5_ib_enable_driver(struct ib_device *dev)
-{
- struct mlx5_ib_dev *mdev = to_mdev(dev);
- int ret;
-
- ret = mlx5_ib_test_wc(mdev);
- mlx5_ib_dbg(mdev, "Write-Combining %s",
- mdev->wc_support ? "supported" : "not supported");
-
- return ret;
-}
+static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent,
+ enum rdma_nl_dev_type type,
+ const char *name);
+static void mlx5_ib_del_sub_dev(struct ib_device *sub_dev);
static const struct ib_device_ops mlx5_ib_dev_ops = {
.owner = THIS_MODULE,
@@ -3785,6 +4119,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION,
.add_gid = mlx5_ib_add_gid,
+ .add_sub_dev = mlx5_ib_add_sub_dev,
.alloc_mr = mlx5_ib_alloc_mr,
.alloc_mr_integrity = mlx5_ib_alloc_mr_integrity,
.alloc_pd = mlx5_ib_alloc_pd,
@@ -3799,6 +4134,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
.dealloc_pd = mlx5_ib_dealloc_pd,
.dealloc_ucontext = mlx5_ib_dealloc_ucontext,
.del_gid = mlx5_ib_del_gid,
+ .del_sub_dev = mlx5_ib_del_sub_dev,
.dereg_mr = mlx5_ib_dereg_mr,
.destroy_ah = mlx5_ib_destroy_ah,
.destroy_cq = mlx5_ib_destroy_cq,
@@ -3809,7 +4145,6 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
.drain_rq = mlx5_ib_drain_rq,
.drain_sq = mlx5_ib_drain_sq,
.device_group = &mlx5_attr_group,
- .enable_driver = mlx5_ib_enable_driver,
.get_dev_fw_str = get_dev_fw_str,
.get_dma_mr = mlx5_ib_get_dma_mr,
.get_link_layer = mlx5_ib_port_link_layer,
@@ -3839,6 +4174,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
.req_notify_cq = mlx5_ib_arm_cq,
.rereg_user_mr = mlx5_ib_rereg_user_mr,
.resize_cq = mlx5_ib_resize_cq,
+ .ufile_hw_cleanup = mlx5_ib_ufile_hw_cleanup,
INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah),
INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs),
@@ -3900,8 +4236,47 @@ static int mlx5_ib_init_var_table(struct mlx5_ib_dev *dev)
return (var_table->bitmap) ? 0 : -ENOMEM;
}
+static void mlx5_ib_cleanup_ucaps(struct mlx5_ib_dev *dev)
+{
+ if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL)
+ ib_remove_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL);
+
+ if (MLX5_CAP_GEN(dev->mdev, uctx_cap) &
+ MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA)
+ ib_remove_ucap(RDMA_UCAP_MLX5_CTRL_OTHER_VHCA);
+}
+
+static int mlx5_ib_init_ucaps(struct mlx5_ib_dev *dev)
+{
+ int ret;
+
+ if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL) {
+ ret = ib_create_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL);
+ if (ret)
+ return ret;
+ }
+
+ if (MLX5_CAP_GEN(dev->mdev, uctx_cap) &
+ MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA) {
+ ret = ib_create_ucap(RDMA_UCAP_MLX5_CTRL_OTHER_VHCA);
+ if (ret)
+ goto remove_local;
+ }
+
+ return 0;
+
+remove_local:
+ if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL)
+ ib_remove_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL);
+ return ret;
+}
+
static void mlx5_ib_stage_caps_cleanup(struct mlx5_ib_dev *dev)
{
+ if (MLX5_CAP_GEN_2_64(dev->mdev, general_obj_types_127_64) &
+ MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL)
+ mlx5_ib_cleanup_ucaps(dev);
+
bitmap_free(dev->var_table.bitmap);
}
@@ -3952,6 +4327,13 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
return err;
}
+ if (MLX5_CAP_GEN_2_64(dev->mdev, general_obj_types_127_64) &
+ MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL) {
+ err = mlx5_ib_init_ucaps(dev);
+ if (err)
+ return err;
+ }
+
dev->ib_dev.use_cq_dim = true;
return 0;
@@ -3985,7 +4367,6 @@ static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = {
.create_wq = mlx5_ib_create_wq,
.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table,
.destroy_wq = mlx5_ib_destroy_wq,
- .get_netdev = mlx5_ib_get_netdev,
.modify_wq = mlx5_ib_modify_wq,
INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mlx5_ib_rwq_ind_table,
@@ -4053,17 +4434,6 @@ static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev)
mlx5_core_native_port_num(dev->mdev) - 1);
}
-static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev)
-{
- dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev);
- return PTR_ERR_OR_ZERO(dev->mdev->priv.uar);
-}
-
-static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev)
-{
- mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
-}
-
static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev)
{
int err;
@@ -4089,7 +4459,10 @@ static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev)
{
const char *name;
- if (!mlx5_lag_is_active(dev->mdev))
+ if (dev->sub_dev_name) {
+ name = dev->sub_dev_name;
+ ib_mark_name_assigned_by_user(&dev->ib_dev);
+ } else if (!mlx5_lag_is_active(dev->mdev))
name = "mlx5_%d";
else
name = "mlx5_bond_%d";
@@ -4100,6 +4473,7 @@ static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
{
mlx5_mkey_cache_cleanup(dev);
mlx5r_umr_resource_cleanup(dev);
+ mlx5r_umr_cleanup(dev);
}
static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev)
@@ -4111,7 +4485,7 @@ static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
{
int ret;
- ret = mlx5r_umr_resource_init(dev);
+ ret = mlx5r_umr_init(dev);
if (ret)
return ret;
@@ -4166,6 +4540,13 @@ static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev)
static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev)
{
+ struct mlx5_ib_resources *devr = &dev->devr;
+ int port;
+
+ for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
+ INIT_WORK(&devr->ports[port].pkey_change_work,
+ pkey_change_handler);
+
dev->mdev_events.notifier_call = mlx5_ib_event;
mlx5_notifier_register(dev->mdev, &dev->mdev_events);
@@ -4176,8 +4557,30 @@ static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev)
static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev)
{
+ struct mlx5_ib_resources *devr = &dev->devr;
+ int port;
+
mlx5r_macsec_event_unregister(dev);
mlx5_notifier_unregister(dev->mdev, &dev->mdev_events);
+
+ for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
+ cancel_work_sync(&devr->ports[port].pkey_change_work);
+}
+
+void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev,
+ struct mlx5_data_direct_dev *dev)
+{
+ mutex_lock(&ibdev->data_direct_lock);
+ ibdev->data_direct_dev = dev;
+ mutex_unlock(&ibdev->data_direct_lock);
+}
+
+void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev)
+{
+ mutex_lock(&ibdev->data_direct_lock);
+ mlx5_ib_revoke_data_direct_mrs(ibdev);
+ ibdev->data_direct_dev = NULL;
+ mutex_unlock(&ibdev->data_direct_lock);
}
void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
@@ -4251,9 +4654,6 @@ static const struct mlx5_ib_profile pf_profile = {
STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
mlx5_ib_dev_res_init,
mlx5_ib_dev_res_cleanup),
- STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
- mlx5_ib_stage_dev_notifier_init,
- mlx5_ib_stage_dev_notifier_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_ODP,
mlx5_ib_odp_init_one,
mlx5_ib_odp_cleanup_one),
@@ -4263,9 +4663,6 @@ static const struct mlx5_ib_profile pf_profile = {
STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
mlx5_ib_stage_cong_debugfs_init,
mlx5_ib_stage_cong_debugfs_cleanup),
- STAGE_CREATE(MLX5_IB_STAGE_UAR,
- mlx5_ib_stage_uar_init,
- mlx5_ib_stage_uar_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_BFREG,
mlx5_ib_stage_bfrag_init,
mlx5_ib_stage_bfrag_cleanup),
@@ -4278,6 +4675,9 @@ static const struct mlx5_ib_profile pf_profile = {
STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
mlx5_ib_stage_ib_reg_init,
mlx5_ib_stage_ib_reg_cleanup),
+ STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
+ mlx5_ib_stage_dev_notifier_init,
+ mlx5_ib_stage_dev_notifier_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
mlx5_ib_stage_post_ib_reg_umr_init,
NULL),
@@ -4314,18 +4714,12 @@ const struct mlx5_ib_profile raw_eth_profile = {
STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
mlx5_ib_dev_res_init,
mlx5_ib_dev_res_cleanup),
- STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
- mlx5_ib_stage_dev_notifier_init,
- mlx5_ib_stage_dev_notifier_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
mlx5_ib_counters_init,
mlx5_ib_counters_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
mlx5_ib_stage_cong_debugfs_init,
mlx5_ib_stage_cong_debugfs_cleanup),
- STAGE_CREATE(MLX5_IB_STAGE_UAR,
- mlx5_ib_stage_uar_init,
- mlx5_ib_stage_uar_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_BFREG,
mlx5_ib_stage_bfrag_init,
mlx5_ib_stage_bfrag_cleanup),
@@ -4338,6 +4732,9 @@ const struct mlx5_ib_profile raw_eth_profile = {
STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
mlx5_ib_stage_ib_reg_init,
mlx5_ib_stage_ib_reg_cleanup),
+ STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
+ mlx5_ib_stage_dev_notifier_init,
+ mlx5_ib_stage_dev_notifier_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
mlx5_ib_stage_post_ib_reg_umr_init,
NULL),
@@ -4349,6 +4746,90 @@ const struct mlx5_ib_profile raw_eth_profile = {
NULL),
};
+static const struct mlx5_ib_profile plane_profile = {
+ STAGE_CREATE(MLX5_IB_STAGE_INIT,
+ mlx5_ib_stage_init_init,
+ mlx5_ib_stage_init_cleanup),
+ STAGE_CREATE(MLX5_IB_STAGE_CAPS,
+ mlx5_ib_stage_caps_init,
+ mlx5_ib_stage_caps_cleanup),
+ STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
+ mlx5_ib_stage_non_default_cb,
+ NULL),
+ STAGE_CREATE(MLX5_IB_STAGE_QP,
+ mlx5_init_qp_table,
+ mlx5_cleanup_qp_table),
+ STAGE_CREATE(MLX5_IB_STAGE_SRQ,
+ mlx5_init_srq_table,
+ mlx5_cleanup_srq_table),
+ STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
+ mlx5_ib_dev_res_init,
+ mlx5_ib_dev_res_cleanup),
+ STAGE_CREATE(MLX5_IB_STAGE_BFREG,
+ mlx5_ib_stage_bfrag_init,
+ mlx5_ib_stage_bfrag_cleanup),
+ STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
+ mlx5_ib_stage_ib_reg_init,
+ mlx5_ib_stage_ib_reg_cleanup),
+};
+
+static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent,
+ enum rdma_nl_dev_type type,
+ const char *name)
+{
+ struct mlx5_ib_dev *mparent = to_mdev(parent), *mplane;
+ enum rdma_link_layer ll;
+ int ret;
+
+ if (mparent->smi_dev)
+ return ERR_PTR(-EEXIST);
+
+ ll = mlx5_port_type_cap_to_rdma_ll(MLX5_CAP_GEN(mparent->mdev,
+ port_type));
+ if (type != RDMA_DEVICE_TYPE_SMI || !mparent->num_plane ||
+ ll != IB_LINK_LAYER_INFINIBAND ||
+ !MLX5_CAP_GEN_2(mparent->mdev, multiplane_qp_ud))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ mplane = ib_alloc_device(mlx5_ib_dev, ib_dev);
+ if (!mplane)
+ return ERR_PTR(-ENOMEM);
+
+ mplane->port = kcalloc(mparent->num_plane * mparent->num_ports,
+ sizeof(*mplane->port), GFP_KERNEL);
+ if (!mplane->port) {
+ ret = -ENOMEM;
+ goto fail_kcalloc;
+ }
+
+ mplane->ib_dev.type = type;
+ mplane->mdev = mparent->mdev;
+ mplane->num_ports = mparent->num_plane;
+ mplane->sub_dev_name = name;
+ mplane->ib_dev.phys_port_cnt = mplane->num_ports;
+
+ ret = __mlx5_ib_add(mplane, &plane_profile);
+ if (ret)
+ goto fail_ib_add;
+
+ mparent->smi_dev = mplane;
+ return &mplane->ib_dev;
+
+fail_ib_add:
+ kfree(mplane->port);
+fail_kcalloc:
+ ib_dealloc_device(&mplane->ib_dev);
+ return ERR_PTR(ret);
+}
+
+static void mlx5_ib_del_sub_dev(struct ib_device *sub_dev)
+{
+ struct mlx5_ib_dev *mdev = to_mdev(sub_dev);
+
+ to_mdev(sub_dev->parent)->smi_dev = NULL;
+ __mlx5_ib_remove(mdev, mdev->profile, MLX5_IB_STAGE_MAX);
+}
+
static int mlx5r_mp_probe(struct auxiliary_device *adev,
const struct auxiliary_device_id *id)
{
@@ -4373,7 +4854,8 @@ static int mlx5r_mp_probe(struct auxiliary_device *adev,
mutex_lock(&mlx5_ib_multiport_mutex);
list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) {
- if (dev->sys_image_guid == mpi->sys_image_guid)
+ if (dev->sys_image_guid == mpi->sys_image_guid &&
+ mlx5_core_same_coredev_type(dev->mdev, mpi->mdev))
bound = mlx5_ib_bind_slave_port(dev, mpi);
if (bound) {
@@ -4426,15 +4908,23 @@ static int mlx5r_probe(struct auxiliary_device *adev,
dev = ib_alloc_device(mlx5_ib_dev, ib_dev);
if (!dev)
return -ENOMEM;
+
+ if (ll == IB_LINK_LAYER_INFINIBAND) {
+ ret = mlx5_ib_get_plane_num(mdev, &dev->num_plane);
+ if (ret)
+ goto fail;
+ }
+
dev->port = kcalloc(num_ports, sizeof(*dev->port),
GFP_KERNEL);
if (!dev->port) {
- ib_dealloc_device(&dev->ib_dev);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto fail;
}
dev->mdev = mdev;
dev->num_ports = num_ports;
+ dev->ib_dev.phys_port_cnt = num_ports;
if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_get_roce_state(mdev))
profile = &raw_eth_profile;
@@ -4442,14 +4932,17 @@ static int mlx5r_probe(struct auxiliary_device *adev,
profile = &pf_profile;
ret = __mlx5_ib_add(dev, profile);
- if (ret) {
- kfree(dev->port);
- ib_dealloc_device(&dev->ib_dev);
- return ret;
- }
+ if (ret)
+ goto fail_ib_add;
auxiliary_set_drvdata(adev, dev);
return 0;
+
+fail_ib_add:
+ kfree(dev->port);
+fail:
+ ib_dealloc_device(&dev->ib_dev);
+ return ret;
}
static void mlx5r_remove(struct auxiliary_device *adev)
@@ -4509,17 +5002,23 @@ static int __init mlx5_ib_init(void)
ret = mlx5r_rep_init();
if (ret)
goto rep_err;
+ ret = mlx5_data_direct_driver_register();
+ if (ret)
+ goto dd_err;
ret = auxiliary_driver_register(&mlx5r_mp_driver);
if (ret)
goto mp_err;
ret = auxiliary_driver_register(&mlx5r_driver);
if (ret)
goto drv_err;
+
return 0;
drv_err:
auxiliary_driver_unregister(&mlx5r_mp_driver);
mp_err:
+ mlx5_data_direct_driver_unregister();
+dd_err:
mlx5r_rep_cleanup();
rep_err:
mlx5_ib_qp_event_cleanup();
@@ -4531,6 +5030,7 @@ qp_event_err:
static void __exit mlx5_ib_cleanup(void)
{
+ mlx5_data_direct_driver_unregister();
auxiliary_driver_unregister(&mlx5r_driver);
auxiliary_driver_unregister(&mlx5r_mp_driver);
mlx5r_rep_cleanup();
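
The module init/exit and probe changes above consolidate error handling into the usual register-in-order, unwind-in-reverse pattern. A minimal sketch of that pattern, with hypothetical step_a()/step_b()/step_c() helpers standing in for the data-direct, multiport and main auxiliary driver registrations (not the driver's actual code):

static int step_a(void) { return 0; }	/* hypothetical stand-in */
static void undo_step_a(void) { }
static int step_b(void) { return 0; }	/* hypothetical stand-in */
static void undo_step_b(void) { }
static int step_c(void) { return 0; }	/* hypothetical stand-in */

static int example_init(void)
{
	int ret;

	ret = step_a();
	if (ret)
		return ret;
	ret = step_b();
	if (ret)
		goto a_err;
	ret = step_c();
	if (ret)
		goto b_err;
	return 0;

b_err:
	undo_step_b();
a_err:
	undo_step_a();
	return ret;
}

Each failure label undoes only what was already set up, mirroring the dd_err/mp_err/drv_err chain added to mlx5_ib_init().
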
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index 96ffbbaf0a73..af321f6ef7f5 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -32,7 +32,6 @@
#include <rdma/ib_umem_odp.h>
#include "mlx5_ib.h"
-#include <linux/jiffies.h>
/*
* Fill in a physical address list. ib_umem_num_dma_blocks() entries will be
@@ -94,202 +93,3 @@ unsigned long __mlx5_umem_find_best_quantized_pgoff(
return 0;
return page_size;
}
-
-#define WR_ID_BF 0xBF
-#define WR_ID_END 0xBAD
-#define TEST_WC_NUM_WQES 255
-#define TEST_WC_POLLING_MAX_TIME_JIFFIES msecs_to_jiffies(100)
-static int post_send_nop(struct mlx5_ib_dev *dev, struct ib_qp *ibqp, u64 wr_id,
- bool signaled)
-{
- struct mlx5_ib_qp *qp = to_mqp(ibqp);
- struct mlx5_wqe_ctrl_seg *ctrl;
- struct mlx5_bf *bf = &qp->bf;
- __be32 mmio_wqe[16] = {};
- unsigned long flags;
- unsigned int idx;
- int i;
-
- if (unlikely(dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR))
- return -EIO;
-
- spin_lock_irqsave(&qp->sq.lock, flags);
-
- idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1);
- ctrl = mlx5_frag_buf_get_wqe(&qp->sq.fbc, idx);
-
- memset(ctrl, 0, sizeof(struct mlx5_wqe_ctrl_seg));
- ctrl->fm_ce_se = signaled ? MLX5_WQE_CTRL_CQ_UPDATE : 0;
- ctrl->opmod_idx_opcode =
- cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | MLX5_OPCODE_NOP);
- ctrl->qpn_ds = cpu_to_be32((sizeof(struct mlx5_wqe_ctrl_seg) / 16) |
- (qp->trans_qp.base.mqp.qpn << 8));
-
- qp->sq.wrid[idx] = wr_id;
- qp->sq.w_list[idx].opcode = MLX5_OPCODE_NOP;
- qp->sq.wqe_head[idx] = qp->sq.head + 1;
- qp->sq.cur_post += DIV_ROUND_UP(sizeof(struct mlx5_wqe_ctrl_seg),
- MLX5_SEND_WQE_BB);
- qp->sq.w_list[idx].next = qp->sq.cur_post;
- qp->sq.head++;
-
- memcpy(mmio_wqe, ctrl, sizeof(*ctrl));
- ((struct mlx5_wqe_ctrl_seg *)&mmio_wqe)->fm_ce_se |=
- MLX5_WQE_CTRL_CQ_UPDATE;
-
- /* Make sure that descriptors are written before
- * updating doorbell record and ringing the doorbell
- */
- wmb();
-
- qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post);
-
- /* Make sure doorbell record is visible to the HCA before
- * we hit doorbell
- */
- wmb();
- for (i = 0; i < 8; i++)
- mlx5_write64(&mmio_wqe[i * 2],
- bf->bfreg->map + bf->offset + i * 8);
- io_stop_wc();
-
- bf->offset ^= bf->buf_size;
-
- spin_unlock_irqrestore(&qp->sq.lock, flags);
-
- return 0;
-}
-
-static int test_wc_poll_cq_result(struct mlx5_ib_dev *dev, struct ib_cq *cq)
-{
- int ret;
- struct ib_wc wc = {};
- unsigned long end = jiffies + TEST_WC_POLLING_MAX_TIME_JIFFIES;
-
- do {
- ret = ib_poll_cq(cq, 1, &wc);
- if (ret < 0 || wc.status)
- return ret < 0 ? ret : -EINVAL;
- if (ret)
- break;
- } while (!time_after(jiffies, end));
-
- if (!ret)
- return -ETIMEDOUT;
-
- if (wc.wr_id != WR_ID_BF)
- ret = 0;
-
- return ret;
-}
-
-static int test_wc_do_send(struct mlx5_ib_dev *dev, struct ib_qp *qp)
-{
- int err, i;
-
- for (i = 0; i < TEST_WC_NUM_WQES; i++) {
- err = post_send_nop(dev, qp, WR_ID_BF, false);
- if (err)
- return err;
- }
-
- return post_send_nop(dev, qp, WR_ID_END, true);
-}
-
-int mlx5_ib_test_wc(struct mlx5_ib_dev *dev)
-{
- struct ib_cq_init_attr cq_attr = { .cqe = TEST_WC_NUM_WQES + 1 };
- int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
- struct ib_qp_init_attr qp_init_attr = {
- .cap = { .max_send_wr = TEST_WC_NUM_WQES },
- .qp_type = IB_QPT_UD,
- .sq_sig_type = IB_SIGNAL_REQ_WR,
- .create_flags = MLX5_IB_QP_CREATE_WC_TEST,
- };
- struct ib_qp_attr qp_attr = { .port_num = 1 };
- struct ib_device *ibdev = &dev->ib_dev;
- struct ib_qp *qp;
- struct ib_cq *cq;
- struct ib_pd *pd;
- int ret;
-
- if (!MLX5_CAP_GEN(dev->mdev, bf))
- return 0;
-
- if (!dev->mdev->roce.roce_en &&
- port_type_cap == MLX5_CAP_PORT_TYPE_ETH) {
- if (mlx5_core_is_pf(dev->mdev))
- dev->wc_support = arch_can_pci_mmap_wc();
- return 0;
- }
-
- ret = mlx5_alloc_bfreg(dev->mdev, &dev->wc_bfreg, true, false);
- if (ret)
- goto print_err;
-
- if (!dev->wc_bfreg.wc)
- goto out1;
-
- pd = ib_alloc_pd(ibdev, 0);
- if (IS_ERR(pd)) {
- ret = PTR_ERR(pd);
- goto out1;
- }
-
- cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr);
- if (IS_ERR(cq)) {
- ret = PTR_ERR(cq);
- goto out2;
- }
-
- qp_init_attr.recv_cq = cq;
- qp_init_attr.send_cq = cq;
- qp = ib_create_qp(pd, &qp_init_attr);
- if (IS_ERR(qp)) {
- ret = PTR_ERR(qp);
- goto out3;
- }
-
- qp_attr.qp_state = IB_QPS_INIT;
- ret = ib_modify_qp(qp, &qp_attr,
- IB_QP_STATE | IB_QP_PORT | IB_QP_PKEY_INDEX |
- IB_QP_QKEY);
- if (ret)
- goto out4;
-
- qp_attr.qp_state = IB_QPS_RTR;
- ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
- if (ret)
- goto out4;
-
- qp_attr.qp_state = IB_QPS_RTS;
- ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN);
- if (ret)
- goto out4;
-
- ret = test_wc_do_send(dev, qp);
- if (ret < 0)
- goto out4;
-
- ret = test_wc_poll_cq_result(dev, cq);
- if (ret > 0) {
- dev->wc_support = true;
- ret = 0;
- }
-
-out4:
- ib_destroy_qp(qp);
-out3:
- ib_destroy_cq(cq);
-out2:
- ib_dealloc_pd(pd);
-out1:
- mlx5_free_bfreg(dev->mdev, &dev->wc_bfreg);
-print_err:
- if (ret)
- mlx5_ib_err(
- dev,
- "Error %d while trying to test write-combining support\n",
- ret);
- return ret;
-}
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index bbe79b86c717..fde859d207ae 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -63,17 +63,6 @@ __mlx5_log_page_size_to_bitmap(unsigned int log_pgsz_bits,
return GENMASK(largest_pg_shift, pgsz_shift);
}
-/*
- * For mkc users, instead of a page_offset the command has a start_iova which
- * specifies both the page_offset and the on-the-wire IOVA
- */
-#define mlx5_umem_find_best_pgsz(umem, typ, log_pgsz_fld, pgsz_shift, iova) \
- ib_umem_find_best_pgsz(umem, \
- __mlx5_log_page_size_to_bitmap( \
- __mlx5_bit_sz(typ, log_pgsz_fld), \
- pgsz_shift), \
- iova)
-
static __always_inline unsigned long
__mlx5_page_offset_to_bitmask(unsigned int page_offset_bits,
unsigned int offset_shift)
@@ -115,6 +104,19 @@ unsigned long __mlx5_umem_find_best_quantized_pgoff(
__mlx5_bit_sz(typ, page_offset_fld), 0, scale, \
page_offset_quantized)
+static inline unsigned long
+mlx5_umem_dmabuf_find_best_pgsz(struct ib_umem_dmabuf *umem_dmabuf)
+{
+ /*
+ * mkeys used for dmabuf are fixed at PAGE_SIZE because we must be able
+ * to hold any sgl after a move operation. Ideally the mkc page size
+ * could be changed at runtime to be optimal, but right now the driver
+ * cannot do that.
+ */
+ return ib_umem_find_best_pgsz(&umem_dmabuf->umem, PAGE_SIZE,
+ umem_dmabuf->umem.iova);
+}
+
enum {
MLX5_IB_MMAP_OFFSET_START = 9,
MLX5_IB_MMAP_OFFSET_END = 255,
@@ -274,6 +276,7 @@ struct mlx5_ib_flow_matcher {
struct mlx5_core_dev *mdev;
atomic_t usecnt;
u8 match_criteria_enable;
+ u32 ib_port;
};
struct mlx5_ib_steering_anchor {
@@ -291,6 +294,18 @@ enum mlx5_ib_optional_counter_type {
MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS,
MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS,
MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS,
+ MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS,
+ MLX5_IB_OPCOUNTER_RDMA_TX_BYTES,
+ MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS,
+ MLX5_IB_OPCOUNTER_RDMA_RX_BYTES,
+
+ MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS_PER_QP,
+ MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS_PER_QP,
+ MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS_PER_QP,
+ MLX5_IB_OPCOUNTER_RDMA_TX_PACKETS_PER_QP,
+ MLX5_IB_OPCOUNTER_RDMA_TX_BYTES_PER_QP,
+ MLX5_IB_OPCOUNTER_RDMA_RX_PACKETS_PER_QP,
+ MLX5_IB_OPCOUNTER_RDMA_RX_BYTES_PER_QP,
MLX5_IB_OPCOUNTER_MAX,
};
@@ -305,6 +320,8 @@ struct mlx5_ib_flow_db {
struct mlx5_ib_flow_prio rdma_tx[MLX5_IB_NUM_FLOW_FT];
struct mlx5_ib_flow_prio opfcs[MLX5_IB_OPCOUNTER_MAX];
struct mlx5_flow_table *lag_demux_ft;
+ struct mlx5_ib_flow_prio *rdma_transport_rx;
+ struct mlx5_ib_flow_prio *rdma_transport_tx;
/* Protect flow steering bypass flow tables
* when add/del flow rules.
* only single add/removal of flow steering rule could be done
@@ -334,6 +351,7 @@ struct mlx5_ib_flow_db {
#define MLX5_IB_UPD_XLT_PD BIT(4)
#define MLX5_IB_UPD_XLT_ACCESS BIT(5)
#define MLX5_IB_UPD_XLT_INDIRECT BIT(6)
+#define MLX5_IB_UPD_XLT_DOWNGRADE BIT(7)
/* Private QP creation flags to be passed in ib_qp_init_attr.create_flags.
*
@@ -341,7 +359,6 @@ struct mlx5_ib_flow_db {
* rely on the range reserved for that use in the ib_qp_create_flags enum.
*/
#define MLX5_IB_QP_CREATE_SQPN_QP1 IB_QP_CREATE_RESERVED_START
-#define MLX5_IB_QP_CREATE_WC_TEST (IB_QP_CREATE_RESERVED_START << 1)
struct wr_list {
u16 opcode;
@@ -520,6 +537,7 @@ struct mlx5_ib_qp {
struct mlx5_bf bf;
u8 has_rq:1;
u8 is_rss:1;
+ u8 is_ooo_rq:1;
/* only for user space QPs. For kernel
* we have it from the bf object
@@ -628,6 +646,8 @@ enum mlx5_mkey_type {
MLX5_MKEY_MR = 1,
MLX5_MKEY_MW,
MLX5_MKEY_INDIRECT_DEVX,
+ MLX5_MKEY_NULL,
+ MLX5_MKEY_IMPLICIT_CHILD,
};
struct mlx5r_cache_rb_key {
@@ -643,9 +663,10 @@ struct mlx5_ib_mkey {
unsigned int ndescs;
struct wait_queue_head wait;
refcount_t usecount;
- /* User Mkey must hold either a rb_key or a cache_ent. */
+ /* Cacheable user Mkey must hold either a rb_key or a cache_ent. */
struct mlx5r_cache_rb_key rb_key;
struct mlx5_cache_ent *cache_ent;
+ u8 cacheable : 1;
};
#define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)
@@ -664,11 +685,19 @@ struct mlx5_ib_mkey {
#define mlx5_update_odp_stats(mr, counter_name, value) \
atomic64_add(value, &((mr)->odp_stats.counter_name))
+#define mlx5_update_odp_stats_with_handled(mr, counter_name, value) \
+ do { \
+ mlx5_update_odp_stats(mr, counter_name, value); \
+ atomic64_add(1, &((mr)->odp_stats.counter_name##_handled)); \
+ } while (0)
+
struct mlx5_ib_mr {
struct ib_mr ibmr;
struct mlx5_ib_mkey mmkey;
struct ib_umem *umem;
+ /* The mr is data direct related */
+ u8 data_direct :1;
union {
/* Used only by kernel MRs (umem == NULL) */
@@ -706,6 +735,11 @@ struct mlx5_ib_mr {
} odp_destroy;
struct ib_odp_counters odp_stats;
bool is_odp_implicit;
+	/* The affiliated data direct crossed mr */
+ struct mlx5_ib_mr *dd_crossed_mr;
+ struct list_head dd_node;
+ u8 revoked :1;
+ struct mlx5_ib_mkey null_mmkey;
};
};
};
@@ -751,6 +785,8 @@ struct umr_common {
*/
struct mutex lock;
unsigned int state;
+	/* Protects against repeated UMR QP creation */
+ struct mutex init_lock;
};
#define NUM_MKEYS_PER_PAGE \
@@ -781,6 +817,7 @@ struct mlx5_cache_ent {
u8 is_tmp:1;
u8 disabled:1;
u8 fill_to_high_water:1;
+ u8 tmp_cleanup_scheduled:1;
/*
* - limit is the low water mark for stored mkeys, 2* limit is the
@@ -812,7 +849,6 @@ struct mlx5_mkey_cache {
struct mutex rb_lock;
struct dentry *fs_root;
unsigned long last_add;
- struct delayed_work remove_ent_dwork;
};
struct mlx5_ib_port_resources {
@@ -820,13 +856,20 @@ struct mlx5_ib_port_resources {
struct work_struct pkey_change_work;
};
+struct mlx5_data_direct_resources {
+ u32 pdn;
+ u32 mkey;
+};
+
struct mlx5_ib_resources {
struct ib_cq *c0;
+ struct mutex cq_lock;
u32 xrcdn0;
u32 xrcdn1;
struct ib_pd *p0;
struct ib_srq *s0;
struct ib_srq *s1;
+ struct mutex srq_lock;
struct mlx5_ib_port_resources ports[2];
};
@@ -856,6 +899,14 @@ void mlx5_ib_fs_remove_op_fc(struct mlx5_ib_dev *dev,
struct mlx5_ib_op_fc *opfc,
enum mlx5_ib_optional_counter_type type);
+int mlx5r_fs_bind_op_fc(struct ib_qp *qp, struct rdma_counter *counter,
+ u32 port);
+
+void mlx5r_fs_unbind_op_fc(struct ib_qp *qp, struct rdma_counter *counter);
+
+void mlx5r_fs_destroy_fcs(struct mlx5_ib_dev *dev,
+ struct rdma_counter *counter);
+
struct mlx5_ib_multiport_info;
struct mlx5_ib_multiport {
@@ -868,8 +919,6 @@ struct mlx5_roce {
/* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL
* netdev pointer
*/
- rwlock_t netdev_lock;
- struct net_device *netdev;
struct notifier_block nb;
struct netdev_net_notifier nn;
struct notifier_block mdev_nb;
@@ -954,15 +1003,14 @@ enum mlx5_ib_stages {
MLX5_IB_STAGE_QP,
MLX5_IB_STAGE_SRQ,
MLX5_IB_STAGE_DEVICE_RESOURCES,
- MLX5_IB_STAGE_DEVICE_NOTIFIER,
MLX5_IB_STAGE_ODP,
MLX5_IB_STAGE_COUNTERS,
MLX5_IB_STAGE_CONG_DEBUGFS,
- MLX5_IB_STAGE_UAR,
MLX5_IB_STAGE_BFREG,
MLX5_IB_STAGE_PRE_IB_REG_UMR,
MLX5_IB_STAGE_WHITELIST_UID,
MLX5_IB_STAGE_IB_REG,
+ MLX5_IB_STAGE_DEVICE_NOTIFIER,
MLX5_IB_STAGE_POST_IB_REG_UMR,
MLX5_IB_STAGE_DELAY_DROP,
MLX5_IB_STAGE_RESTRACK,
@@ -1114,7 +1162,11 @@ struct mlx5_macsec {
struct mlx5_ib_dev {
struct ib_device ib_dev;
struct mlx5_core_dev *mdev;
+ struct mlx5_data_direct_dev *data_direct_dev;
+ /* protect accessing data_direct_dev */
+ struct mutex data_direct_lock;
struct notifier_block mdev_events;
+ struct notifier_block lag_events;
int num_ports;
/* serialize update of capability mask
*/
@@ -1122,7 +1174,6 @@ struct mlx5_ib_dev {
u8 ib_active:1;
u8 is_rep:1;
u8 lag_active:1;
- u8 wc_support:1;
u8 fill_delay;
struct umr_common umrc;
/* sync used page count stats
@@ -1145,10 +1196,10 @@ struct mlx5_ib_dev {
/* protect resources needed as part of reset flow */
spinlock_t reset_flow_resource_lock;
struct list_head qp_list;
+ struct list_head data_direct_mr_list;
/* Array with num_ports elements */
struct mlx5_ib_port *port;
struct mlx5_sq_bfreg bfreg;
- struct mlx5_sq_bfreg wc_bfreg;
struct mlx5_sq_bfreg fp_bfreg;
struct mlx5_ib_delay_drop delay_drop;
const struct mlx5_ib_profile *profile;
@@ -1170,10 +1221,15 @@ struct mlx5_ib_dev {
u16 pkey_table_len;
u8 lag_ports;
struct mlx5_special_mkeys mkeys;
+ struct mlx5_data_direct_resources ddr;
#ifdef CONFIG_MLX5_MACSEC
struct mlx5_macsec macsec;
#endif
+
+ u8 num_plane;
+ struct mlx5_ib_dev *smi_dev;
+ const char *sub_dev_name;
};
static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
@@ -1272,6 +1328,8 @@ to_mmmap(struct rdma_user_mmap_entry *rdma_entry)
struct mlx5_user_mmap_entry, rdma_entry);
}
+int mlx5_ib_dev_res_cq_init(struct mlx5_ib_dev *dev);
+int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev);
int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt,
struct mlx5_db *db);
void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db);
@@ -1311,7 +1369,7 @@ int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer,
size_t buflen, size_t *bc);
int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
- struct ib_udata *udata);
+ struct uverbs_attr_bundle *attrs);
int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
@@ -1324,7 +1382,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start,
u64 length, u64 virt_addr,
int fd, int access_flags,
- struct ib_udata *udata);
+ struct uverbs_attr_bundle *attrs);
int mlx5_ib_advise_mr(struct ib_pd *pd,
enum ib_uverbs_advise_mr_advice advice,
u32 flags,
@@ -1335,7 +1393,6 @@ int mlx5_ib_alloc_mw(struct ib_mw *mw, struct ib_udata *udata);
int mlx5_ib_dealloc_mw(struct ib_mw *mw);
struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
int access_flags);
-void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr);
void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr);
struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
u64 length, u64 virt_addr, int access_flags,
@@ -1377,7 +1434,6 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u32 port,
struct ib_port_attr *props);
void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas,
u64 access_flags);
-void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev);
void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev);
@@ -1405,6 +1461,10 @@ int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);
struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
struct ib_dm_mr_attr *attr,
struct uverbs_attr_bundle *attrs);
+void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev,
+ struct mlx5_data_direct_dev *dev);
+void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev);
+void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev);
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
@@ -1413,8 +1473,8 @@ void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
int __init mlx5_ib_odp_init(void);
void mlx5_ib_odp_cleanup(void);
int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev);
-void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
- struct mlx5_ib_mr *mr, int flags);
+int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+ struct mlx5_ib_mr *mr, int flags);
int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
enum ib_uverbs_advise_mr_advice advice,
@@ -1435,8 +1495,11 @@ static inline int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev)
{
return 0;
}
-static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
- struct mlx5_ib_mr *mr, int flags) {}
+static inline int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+ struct mlx5_ib_mr *mr, int flags)
+{
+ return -EOPNOTSUPP;
+}
static inline int
mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
@@ -1512,6 +1575,7 @@ extern const struct uapi_definition mlx5_ib_devx_defs[];
extern const struct uapi_definition mlx5_ib_flow_defs[];
extern const struct uapi_definition mlx5_ib_qos_defs[];
extern const struct uapi_definition mlx5_ib_std_types_defs[];
+extern const struct uapi_definition mlx5_ib_create_cq_defs[];
static inline int is_qp1(enum ib_qp_type qp_type)
{
@@ -1612,8 +1676,6 @@ static inline void mlx5r_deref_wait_odp_mkey(struct mlx5_ib_mkey *mmkey)
wait_event(mmkey->wait, refcount_read(&mmkey->usecount) == 0);
}
-int mlx5_ib_test_wc(struct mlx5_ib_dev *dev);
-
static inline bool mlx5_ib_lag_should_assign_affinity(struct mlx5_ib_dev *dev)
{
/*
@@ -1680,4 +1742,26 @@ static inline bool mlx5_umem_needs_ats(struct mlx5_ib_dev *dev,
int set_roce_addr(struct mlx5_ib_dev *dev, u32 port_num,
unsigned int index, const union ib_gid *gid,
const struct ib_gid_attr *attr);
+
+static inline u32 smi_to_native_portnum(struct mlx5_ib_dev *dev, u32 port)
+{
+ return (port - 1) / dev->num_ports + 1;
+}
+
+/*
+ * For mkc users, instead of a page_offset the command has a start_iova which
+ * specifies both the page_offset and the on-the-wire IOVA
+ */
+static __always_inline unsigned long
+mlx5_umem_mkc_find_best_pgsz(struct mlx5_ib_dev *dev, struct ib_umem *umem,
+ u64 iova)
+{
+ int page_size_bits =
+ MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5) ? 6 : 5;
+ unsigned long bitmap =
+ __mlx5_log_page_size_to_bitmap(page_size_bits, 0);
+
+ return ib_umem_find_best_pgsz(umem, bitmap, iova);
+}
+
#endif /* MLX5_IB_H */
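
A small sketch of how the page-size bitmap used by mlx5_umem_mkc_find_best_pgsz() above could be built: the umr_log_entity_size_5 capability widens the mkc log_page_size field from 5 to 6 bits, so one candidate bit is set per representable page shift (the clamp to BITS_PER_LONG - 1 is an assumption for the sketch, not taken from this header):

#include <linux/bits.h>
#include <linux/minmax.h>
#include <linux/types.h>

/* Illustrative only: bit N set means a 2^N byte page size is a candidate. */
static unsigned long example_mkc_pgsz_bitmap(bool log_pgsz_field_is_6_bits)
{
	unsigned int field_bits = log_pgsz_field_is_6_bits ? 6 : 5;
	unsigned int largest_shift =
		min_t(unsigned int, (1U << field_bits) - 1, BITS_PER_LONG - 1);

	return GENMASK(largest_shift, 0);
}

ib_umem_find_best_pgsz() then selects the largest set bit that is still compatible with the umem layout and the requested IOVA.
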
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index a8ee2ca1f4a1..57f9bc2a4a3a 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -43,18 +43,22 @@
#include "dm.h"
#include "mlx5_ib.h"
#include "umr.h"
+#include "data_direct.h"
enum {
MAX_PENDING_REG_MR = 8,
};
+#define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4
#define MLX5_UMR_ALIGN 2048
static void
create_mkey_callback(int status, struct mlx5_async_work *context);
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
u64 iova, int access_flags,
- unsigned int page_size, bool populate);
+ unsigned long page_size, bool populate,
+ int access_mode);
+static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr);
static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
struct ib_pd *pd)
@@ -211,9 +215,9 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
push_mkey_locked(ent, mkey_out->mkey);
+ ent->pending--;
/* If we are doing fill_to_high_water then keep going. */
queue_adjust_cache_locked(ent);
- ent->pending--;
spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
kfree(mkey_out);
}
@@ -246,6 +250,7 @@ static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
MLX5_SET(mkc, mkc, access_mode_4_2,
(ent->rb_key.access_mode >> 2) & 0x7);
+ MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats);
MLX5_SET(mkc, mkc, translations_octword_size,
get_mkc_octo_size(ent->rb_key.access_mode,
@@ -520,12 +525,27 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
ent->fill_to_high_water = false;
if (ent->pending)
queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
- msecs_to_jiffies(1000));
+ secs_to_jiffies(1));
else
mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
}
}
+static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
+{
+ u32 mkey;
+
+ spin_lock_irq(&ent->mkeys_queue.lock);
+ while (ent->mkeys_queue.ci) {
+ mkey = pop_mkey_locked(ent);
+ spin_unlock_irq(&ent->mkeys_queue.lock);
+ mlx5_core_destroy_mkey(dev->mdev, mkey);
+ spin_lock_irq(&ent->mkeys_queue.lock);
+ }
+ ent->tmp_cleanup_scheduled = false;
+ spin_unlock_irq(&ent->mkeys_queue.lock);
+}
+
static void __cache_work_func(struct mlx5_cache_ent *ent)
{
struct mlx5_ib_dev *dev = ent->dev;
@@ -556,7 +576,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
"add keys command failed, err %d\n",
err);
queue_delayed_work(cache->wq, &ent->dwork,
- msecs_to_jiffies(1000));
+ secs_to_jiffies(1));
}
}
} else if (ent->mkeys_queue.ci > 2 * ent->limit) {
@@ -597,7 +617,11 @@ static void delayed_cache_work_func(struct work_struct *work)
struct mlx5_cache_ent *ent;
ent = container_of(work, struct mlx5_cache_ent, dwork.work);
- __cache_work_func(ent);
+ /* temp entries are never filled, only cleaned */
+ if (ent->is_tmp)
+ clean_keys(ent->dev, ent);
+ else
+ __cache_work_func(ent);
}
static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
@@ -641,10 +665,8 @@ static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
new = &((*new)->rb_left);
if (cmp < 0)
new = &((*new)->rb_right);
- if (cmp == 0) {
- mutex_unlock(&cache->rb_lock);
+ if (cmp == 0)
return -EEXIST;
- }
}
/* Add new node and rebalance tree. */
@@ -660,6 +682,7 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
{
struct rb_node *node = dev->cache.rb_root.rb_node;
struct mlx5_cache_ent *cur, *smallest = NULL;
+ u64 ndescs_limit;
int cmp;
/*
@@ -678,17 +701,24 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
return cur;
}
+ /*
+	 * Limit the use of mkeys larger than twice the required size, while
+	 * still allowing the smallest cache entry to be used for small MRs.
+ */
+ ndescs_limit = max_t(u64, rb_key.ndescs * 2,
+ MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS);
+
return (smallest &&
smallest->rb_key.access_mode == rb_key.access_mode &&
smallest->rb_key.access_flags == rb_key.access_flags &&
- smallest->rb_key.ats == rb_key.ats) ?
+ smallest->rb_key.ats == rb_key.ats &&
+ smallest->rb_key.ndescs <= ndescs_limit) ?
smallest :
NULL;
}
static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
- struct mlx5_cache_ent *ent,
- int access_flags)
+ struct mlx5_cache_ent *ent)
{
struct mlx5_ib_mr *mr;
int err;
@@ -719,6 +749,8 @@ static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
}
mr->mmkey.cache_ent = ent;
mr->mmkey.type = MLX5_MKEY_MR;
+ mr->mmkey.rb_key = ent->rb_key;
+ mr->mmkey.cacheable = true;
init_waitqueue_head(&mr->mmkey.wait);
return mr;
}
@@ -761,22 +793,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
if (!ent)
return ERR_PTR(-EOPNOTSUPP);
- return _mlx5_mr_cache_alloc(dev, ent, access_flags);
-}
-
-static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
-{
- u32 mkey;
-
- cancel_delayed_work(&ent->dwork);
- spin_lock_irq(&ent->mkeys_queue.lock);
- while (ent->mkeys_queue.ci) {
- mkey = pop_mkey_locked(ent);
- spin_unlock_irq(&ent->mkeys_queue.lock);
- mlx5_core_destroy_mkey(dev->mdev, mkey);
- spin_lock_irq(&ent->mkeys_queue.lock);
- }
- spin_unlock_irq(&ent->mkeys_queue.lock);
+ return _mlx5_mr_cache_alloc(dev, ent);
}
static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
@@ -821,7 +838,7 @@ static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
static void delay_time_func(struct timer_list *t)
{
- struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
+ struct mlx5_ib_dev *dev = timer_container_of(dev, t, delay_timer);
WRITE_ONCE(dev->fill_delay, 0);
}
@@ -891,10 +908,6 @@ mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
ent->limit = 0;
mlx5_mkey_cache_debugfs_add_ent(dev, ent);
- } else {
- mod_delayed_work(ent->dev->cache.wq,
- &ent->dev->cache.remove_ent_dwork,
- msecs_to_jiffies(30 * 1000));
}
return ent;
@@ -905,33 +918,23 @@ mkeys_err:
return ERR_PTR(ret);
}
-static void remove_ent_work_func(struct work_struct *work)
+static void mlx5r_destroy_cache_entries(struct mlx5_ib_dev *dev)
{
- struct mlx5_mkey_cache *cache;
+ struct rb_root *root = &dev->cache.rb_root;
struct mlx5_cache_ent *ent;
- struct rb_node *cur;
-
- cache = container_of(work, struct mlx5_mkey_cache,
- remove_ent_dwork.work);
- mutex_lock(&cache->rb_lock);
- cur = rb_last(&cache->rb_root);
- while (cur) {
- ent = rb_entry(cur, struct mlx5_cache_ent, node);
- cur = rb_prev(cur);
- mutex_unlock(&cache->rb_lock);
-
- spin_lock_irq(&ent->mkeys_queue.lock);
- if (!ent->is_tmp) {
- spin_unlock_irq(&ent->mkeys_queue.lock);
- mutex_lock(&cache->rb_lock);
- continue;
- }
- spin_unlock_irq(&ent->mkeys_queue.lock);
+ struct rb_node *node;
- clean_keys(ent->dev, ent);
- mutex_lock(&cache->rb_lock);
+ mutex_lock(&dev->cache.rb_lock);
+ node = rb_first(root);
+ while (node) {
+ ent = rb_entry(node, struct mlx5_cache_ent, node);
+ node = rb_next(node);
+ clean_keys(dev, ent);
+ rb_erase(&ent->node, root);
+ mlx5r_mkeys_uninit(ent);
+ kfree(ent);
}
- mutex_unlock(&cache->rb_lock);
+ mutex_unlock(&dev->cache.rb_lock);
}
int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
@@ -949,7 +952,6 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
mutex_init(&dev->slow_path_mutex);
mutex_init(&dev->cache.rb_lock);
dev->cache.rb_root = RB_ROOT;
- INIT_DELAYED_WORK(&dev->cache.remove_ent_dwork, remove_ent_work_func);
cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
if (!cache->wq) {
mlx5_ib_warn(dev, "failed to create work queue\n");
@@ -961,7 +963,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
mlx5_mkey_cache_debugfs_init(dev);
mutex_lock(&cache->rb_lock);
for (i = 0; i <= mkey_cache_max_order(dev); i++) {
- rb_key.ndescs = 1 << (i + 2);
+ rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i;
ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
if (IS_ERR(ent)) {
ret = PTR_ERR(ent);
@@ -986,6 +988,8 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
err:
mutex_unlock(&cache->rb_lock);
mlx5_mkey_cache_debugfs_cleanup(dev);
+ mlx5r_destroy_cache_entries(dev);
+ destroy_workqueue(cache->wq);
mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
return ret;
}
@@ -1000,7 +1004,6 @@ void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
return;
mutex_lock(&dev->cache.rb_lock);
- cancel_delayed_work(&dev->cache.remove_ent_dwork);
for (node = rb_first(root); node; node = rb_next(node)) {
ent = rb_entry(node, struct mlx5_cache_ent, node);
spin_lock_irq(&ent->mkeys_queue.lock);
@@ -1020,20 +1023,10 @@ void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
/* At this point all entries are disabled and have no concurrent work. */
- mutex_lock(&dev->cache.rb_lock);
- node = rb_first(root);
- while (node) {
- ent = rb_entry(node, struct mlx5_cache_ent, node);
- node = rb_next(node);
- clean_keys(dev, ent);
- rb_erase(&ent->node, root);
- mlx5r_mkeys_uninit(ent);
- kfree(ent);
- }
- mutex_unlock(&dev->cache.rb_lock);
+ mlx5r_destroy_cache_entries(dev);
destroy_workqueue(dev->cache.wq);
- del_timer_sync(&dev->delay_timer);
+ timer_delete_sync(&dev->delay_timer);
}
struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
@@ -1061,6 +1054,7 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
MLX5_SET(mkc, mkc, length64, 1);
set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
pd);
+ MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
if (err)
@@ -1125,24 +1119,22 @@ static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
struct ib_umem *umem, u64 iova,
- int access_flags)
+ int access_flags, int access_mode)
{
- struct mlx5r_cache_rb_key rb_key = {
- .access_mode = MLX5_MKC_ACCESS_MODE_MTT,
- };
struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct mlx5r_cache_rb_key rb_key = {};
struct mlx5_cache_ent *ent;
struct mlx5_ib_mr *mr;
- unsigned int page_size;
+ unsigned long page_size;
if (umem->is_dmabuf)
page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
else
- page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
- 0, iova);
+ page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);
if (WARN_ON(!page_size))
return ERR_PTR(-EINVAL);
+ rb_key.access_mode = access_mode;
rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
@@ -1153,15 +1145,16 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
*/
if (!ent) {
mutex_lock(&dev->slow_path_mutex);
- mr = reg_create(pd, umem, iova, access_flags, page_size, false);
+ mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode);
mutex_unlock(&dev->slow_path_mutex);
if (IS_ERR(mr))
return mr;
mr->mmkey.rb_key = rb_key;
+ mr->mmkey.cacheable = true;
return mr;
}
- mr = _mlx5_mr_cache_alloc(dev, ent, access_flags);
+ mr = _mlx5_mr_cache_alloc(dev, ent);
if (IS_ERR(mr))
return mr;
@@ -1173,13 +1166,71 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
return mr;
}
+static struct ib_mr *
+reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags,
+ u32 crossed_lkey)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING;
+ struct mlx5_ib_mr *mr;
+ void *mkc;
+ int inlen;
+ u32 *in;
+ int err;
+
+ if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+ in = kvzalloc(inlen, GFP_KERNEL);
+ if (!in) {
+ err = -ENOMEM;
+ goto err_1;
+ }
+
+ mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+ MLX5_SET(mkc, mkc, crossing_target_vhca_id,
+ MLX5_CAP_GEN(dev->mdev, vhca_id));
+ MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey);
+ MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
+ MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
+
+	/* For this crossing mkey the IOVA should be 0 and len should be IOVA + len */
+ set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd);
+ MLX5_SET64(mkc, mkc, len, iova + length);
+
+ MLX5_SET(mkc, mkc, free, 0);
+ MLX5_SET(mkc, mkc, umr_en, 0);
+ err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
+ if (err)
+ goto err_2;
+
+ mr->mmkey.type = MLX5_MKEY_MR;
+ set_mr_fields(dev, mr, length, access_flags, iova);
+ mr->ibmr.pd = pd;
+ kvfree(in);
+ mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key);
+
+ return &mr->ibmr;
+err_2:
+ kvfree(in);
+err_1:
+ kfree(mr);
+ return ERR_PTR(err);
+}
+
/*
* If ibmr is NULL it will be allocated by reg_create.
* Else, the given ibmr will be used.
*/
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
u64 iova, int access_flags,
- unsigned int page_size, bool populate)
+ unsigned long page_size, bool populate,
+ int access_mode)
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct mlx5_ib_mr *mr;
@@ -1188,7 +1239,9 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
int inlen;
u32 *in;
int err;
- bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
+ bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) &&
+ (access_mode == MLX5_MKC_ACCESS_MODE_MTT);
+ bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
if (!page_size)
return ERR_PTR(-EINVAL);
@@ -1211,7 +1264,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
}
pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
if (populate) {
- if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) {
+ if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) {
err = -EINVAL;
goto err_2;
}
@@ -1227,14 +1280,22 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
populate ? pd : dev->umrc.pd);
+	/* In case of a data direct flow, overwrite the pdn field with its internal kernel PD */
+ if (umem->is_dmabuf && ksm_mode)
+ MLX5_SET(mkc, mkc, pd, dev->ddr.pdn);
+
MLX5_SET(mkc, mkc, free, !populate);
- MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
+ MLX5_SET(mkc, mkc, access_mode_1_0, access_mode);
MLX5_SET(mkc, mkc, umr_en, 1);
MLX5_SET64(mkc, mkc, len, umem->length);
MLX5_SET(mkc, mkc, bsf_octword_size, 0);
- MLX5_SET(mkc, mkc, translations_octword_size,
- get_octo_len(iova, umem->length, mr->page_shift));
+ if (ksm_mode)
+ MLX5_SET(mkc, mkc, translations_octword_size,
+ get_octo_len(iova, umem->length, mr->page_shift) * 2);
+ else
+ MLX5_SET(mkc, mkc, translations_octword_size,
+ get_octo_len(iova, umem->length, mr->page_shift));
MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
if (mlx5_umem_needs_ats(dev, umem, access_flags))
MLX5_SET(mkc, mkc, ma_translation_mode, 1);
@@ -1371,13 +1432,15 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
if (xlt_with_umr) {
- mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
+ mr = alloc_cacheable_mr(pd, umem, iova, access_flags,
+ MLX5_MKC_ACCESS_MODE_MTT);
} else {
- unsigned int page_size = mlx5_umem_find_best_pgsz(
- umem, mkc, log_page_size, 0, iova);
+ unsigned long page_size =
+ mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);
mutex_lock(&dev->slow_path_mutex);
- mr = reg_create(pd, umem, iova, access_flags, page_size, true);
+ mr = reg_create(pd, umem, iova, access_flags, page_size,
+ true, MLX5_MKC_ACCESS_MODE_MTT);
mutex_unlock(&dev->slow_path_mutex);
}
if (IS_ERR(mr)) {
@@ -1440,7 +1503,8 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
if (IS_ERR(odp))
return ERR_CAST(odp);
- mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags);
+ mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags,
+ MLX5_MKC_ACCESS_MODE_MTT);
if (IS_ERR(mr)) {
ib_umem_release(&odp->umem);
return ERR_CAST(mr);
@@ -1468,6 +1532,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct ib_umem *umem;
+ int err;
if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
return ERR_PTR(-EOPNOTSUPP);
@@ -1475,6 +1540,10 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
start, iova, length, access_flags);
+ err = mlx5r_umr_resource_init(dev);
+ if (err)
+ return ERR_PTR(err);
+
if (access_flags & IB_ACCESS_ON_DEMAND)
return create_user_odp_mr(pd, start, length, iova, access_flags,
udata);
@@ -1491,7 +1560,7 @@ static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
- if (!umem_dmabuf->sgt)
+ if (!umem_dmabuf->sgt || !mr)
return;
mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
@@ -1503,31 +1572,31 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
.move_notify = mlx5_ib_dmabuf_invalidate_cb,
};
-struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
- u64 length, u64 virt_addr,
- int fd, int access_flags,
- struct ib_udata *udata)
+static struct ib_mr *
+reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
+ u64 offset, u64 length, u64 virt_addr,
+ int fd, int access_flags, int access_mode)
{
+ bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct mlx5_ib_mr *mr = NULL;
struct ib_umem_dmabuf *umem_dmabuf;
int err;
- if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
- !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
- return ERR_PTR(-EOPNOTSUPP);
-
- mlx5_ib_dbg(dev,
- "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
- offset, virt_addr, length, fd, access_flags);
+ err = mlx5r_umr_resource_init(dev);
+ if (err)
+ return ERR_PTR(err);
- /* dmabuf requires xlt update via umr to work. */
- if (!mlx5r_umr_can_load_pas(dev, length))
- return ERR_PTR(-EINVAL);
+ if (!pinned_mode)
+ umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev,
+ offset, length, fd,
+ access_flags,
+ &mlx5_ib_dmabuf_attach_ops);
+ else
+ umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev,
+ dma_device, offset, length,
+ fd, access_flags);
- umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd,
- access_flags,
- &mlx5_ib_dmabuf_attach_ops);
if (IS_ERR(umem_dmabuf)) {
mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
PTR_ERR(umem_dmabuf));
@@ -1535,7 +1604,7 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
}
mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
- access_flags);
+ access_flags, access_mode);
if (IS_ERR(mr)) {
ib_umem_release(&umem_dmabuf->umem);
return ERR_CAST(mr);
@@ -1545,9 +1614,13 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
umem_dmabuf->private = mr;
- err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
- if (err)
- goto err_dereg_mr;
+ if (!pinned_mode) {
+ err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
+ if (err)
+ goto err_dereg_mr;
+ } else {
+ mr->data_direct = true;
+ }
err = mlx5_ib_init_dmabuf_mr(mr);
if (err)
@@ -1555,10 +1628,101 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
return &mr->ibmr;
err_dereg_mr:
- mlx5_ib_dereg_mr(&mr->ibmr, NULL);
+ __mlx5_ib_dereg_mr(&mr->ibmr);
return ERR_PTR(err);
}
+static struct ib_mr *
+reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset,
+ u64 length, u64 virt_addr,
+ int fd, int access_flags)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct mlx5_data_direct_dev *data_direct_dev;
+ struct ib_mr *crossing_mr;
+ struct ib_mr *crossed_mr;
+ int ret = 0;
+
+	/* Due to HW behaviour, the IOVA must be page aligned in KSM mode */
+ if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ mutex_lock(&dev->data_direct_lock);
+ data_direct_dev = dev->data_direct_dev;
+ if (!data_direct_dev) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ /* The device's 'data direct mkey' was created without RO flags to
+ * simplify things and allow for a single mkey per device.
+ * Since RO is not a must, mask it out accordingly.
+ */
+ access_flags &= ~IB_ACCESS_RELAXED_ORDERING;
+ crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev,
+ offset, length, virt_addr, fd,
+ access_flags, MLX5_MKC_ACCESS_MODE_KSM);
+ if (IS_ERR(crossed_mr)) {
+ ret = PTR_ERR(crossed_mr);
+ goto end;
+ }
+
+ mutex_lock(&dev->slow_path_mutex);
+ crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags,
+ crossed_mr->lkey);
+ mutex_unlock(&dev->slow_path_mutex);
+ if (IS_ERR(crossing_mr)) {
+ __mlx5_ib_dereg_mr(crossed_mr);
+ ret = PTR_ERR(crossing_mr);
+ goto end;
+ }
+
+ list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list);
+ to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr);
+ to_mmr(crossing_mr)->data_direct = true;
+end:
+ mutex_unlock(&dev->data_direct_lock);
+ return ret ? ERR_PTR(ret) : crossing_mr;
+}
+
+struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
+ u64 length, u64 virt_addr,
+ int fd, int access_flags,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ int mlx5_access_flags = 0;
+ int err;
+
+ if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
+ !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) {
+ err = uverbs_get_flags32(&mlx5_access_flags, attrs,
+ MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
+ MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT);
+ if (err)
+ return ERR_PTR(err);
+ }
+
+ mlx5_ib_dbg(dev,
+ "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n",
+ offset, virt_addr, length, fd, access_flags, mlx5_access_flags);
+
+ /* dmabuf requires xlt update via umr to work. */
+ if (!mlx5r_umr_can_load_pas(dev, length))
+ return ERR_PTR(-EINVAL);
+
+ if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT)
+ return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr,
+ fd, access_flags);
+
+ return reg_user_mr_dmabuf(pd, pd->device->dma_device,
+ offset, length, virt_addr,
+ fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT);
+}
+
/*
* True if the change in access flags can be done via UMR, only some access
* flags can be updated.
@@ -1570,7 +1734,8 @@ static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
unsigned int diffs = current_access_flags ^ target_access_flags;
if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING))
+ IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING |
+ IB_ACCESS_REMOTE_ATOMIC))
return false;
return mlx5r_umr_can_reconfig(dev, current_access_flags,
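
The hunk above adds IB_ACCESS_REMOTE_ATOMIC to the set of access flags whose changes may be applied through UMR. A minimal sketch of the mask test itself, without the mlx5r_umr_can_reconfig() capability check (illustrative only):

#include <rdma/ib_verbs.h>

#define EXAMPLE_UMR_UPDATABLE_ACCESS (IB_ACCESS_LOCAL_WRITE |		\
				      IB_ACCESS_REMOTE_WRITE |		\
				      IB_ACCESS_REMOTE_READ |		\
				      IB_ACCESS_RELAXED_ORDERING |	\
				      IB_ACCESS_REMOTE_ATOMIC)

static bool example_access_diff_is_umr_updatable(unsigned int cur,
						 unsigned int target)
{
	/* Only bits inside the updatable set may differ. */
	return !((cur ^ target) & ~EXAMPLE_UMR_UPDATABLE_ACCESS);
}
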
target_access_flags);
@@ -1589,8 +1754,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
return false;
- *page_size =
- mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
+ *page_size = mlx5_umem_mkc_find_best_pgsz(dev, new_umem, iova);
if (WARN_ON(!*page_size))
return false;
return (mr->mmkey.cache_ent->rb_key.ndescs) >=
@@ -1653,7 +1817,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
struct mlx5_ib_mr *mr = to_mmr(ib_mr);
int err;
- if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
+ if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct)
return ERR_PTR(-EOPNOTSUPP);
mlx5_ib_dbg(
@@ -1781,7 +1945,8 @@ err:
static void
mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
{
- if (!mr->umem && mr->descs) {
+ if (!mr->umem && !mr->data_direct &&
+ mr->ibmr.type != IB_MR_TYPE_DM && mr->descs) {
struct ib_device *device = mr->ibmr.device;
int size = mr->max_descs * mr->desc_size;
struct mlx5_ib_dev *dev = to_mdev(device);
@@ -1802,7 +1967,6 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
if (mr->mmkey.cache_ent) {
spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
- mr->mmkey.cache_ent->in_use--;
goto end;
}
@@ -1835,7 +1999,89 @@ end:
return ret;
}
-int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr)
+{
+ struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
+ struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
+ int err;
+
+ lockdep_assert_held(&dev->data_direct_lock);
+ mr->revoked = true;
+ err = mlx5r_umr_revoke_mr(mr);
+ if (WARN_ON(err))
+ return err;
+
+ ib_umem_dmabuf_revoke(umem_dmabuf);
+ return 0;
+}
+
+void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev)
+{
+ struct mlx5_ib_mr *mr, *next;
+
+ lockdep_assert_held(&dev->data_direct_lock);
+
+ list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) {
+ list_del(&mr->dd_node);
+ mlx5_ib_revoke_data_direct_mr(mr);
+ }
+}
+
+static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
+{
+ struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
+ struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
+ bool is_odp = is_odp_mr(mr);
+ bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
+ !to_ib_umem_dmabuf(mr->umem)->pinned;
+ bool from_cache = !!ent;
+ int ret = 0;
+
+ if (is_odp)
+ mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);
+
+ if (is_odp_dma_buf)
+ dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv, NULL);
+
+ if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) {
+ ent = mr->mmkey.cache_ent;
+ /* upon storing to a clean temp entry - schedule its cleanup */
+ spin_lock_irq(&ent->mkeys_queue.lock);
+ if (from_cache)
+ ent->in_use--;
+ if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
+ mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
+ secs_to_jiffies(30));
+ ent->tmp_cleanup_scheduled = true;
+ }
+ spin_unlock_irq(&ent->mkeys_queue.lock);
+ goto out;
+ }
+
+ if (ent) {
+ spin_lock_irq(&ent->mkeys_queue.lock);
+ ent->in_use--;
+ mr->mmkey.cache_ent = NULL;
+ spin_unlock_irq(&ent->mkeys_queue.lock);
+ }
+ ret = destroy_mkey(dev, mr);
+out:
+ if (is_odp) {
+ if (!ret)
+ to_ib_umem_odp(mr->umem)->private = NULL;
+ mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex);
+ }
+
+ if (is_odp_dma_buf) {
+ if (!ret)
+ to_ib_umem_dmabuf(mr->umem)->private = NULL;
+ dma_resv_unlock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
+ }
+
+ return ret;
+}
+
+static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr)
{
struct mlx5_ib_mr *mr = to_mmr(ibmr);
struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
@@ -1880,16 +2126,9 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
}
/* Stop DMA */
- if (mr->umem && mlx5r_umr_can_load_pas(dev, mr->umem->length))
- if (mlx5r_umr_revoke_mr(mr) ||
- cache_ent_find_and_store(dev, mr))
- mr->mmkey.cache_ent = NULL;
-
- if (!mr->mmkey.cache_ent) {
- rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
- if (rc)
- return rc;
- }
+ rc = mlx5_revoke_mr(mr);
+ if (rc)
+ return rc;
if (mr->umem) {
bool is_odp = is_odp_mr(mr);
@@ -1909,9 +2148,40 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
return 0;
}
+static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_mr *mr)
+{
+ struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr;
+ int ret;
+
+ ret = __mlx5_ib_dereg_mr(&mr->ibmr);
+ if (ret)
+ return ret;
+
+ mutex_lock(&dev->data_direct_lock);
+ if (!dd_crossed_mr->revoked)
+ list_del(&dd_crossed_mr->dd_node);
+
+ ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr);
+ mutex_unlock(&dev->data_direct_lock);
+ return ret;
+}
+
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+{
+ struct mlx5_ib_mr *mr = to_mmr(ibmr);
+ struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
+
+ if (mr->data_direct)
+ return dereg_crossing_data_direct_mr(dev, mr);
+
+ return __mlx5_ib_dereg_mr(ibmr);
+}
+
static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
int access_mode, int page_shift)
{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
void *mkc;
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
@@ -1924,6 +2194,9 @@ static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
MLX5_SET(mkc, mkc, umr_en, 1);
MLX5_SET(mkc, mkc, log_page_size, page_shift);
+ if (access_mode == MLX5_MKC_ACCESS_MODE_PA ||
+ access_mode == MLX5_MKC_ACCESS_MODE_MTT)
+ MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
}
static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
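
To summarize the data-direct flow added in this file: registration produces a pair of mkeys, sketched below as a plain struct (illustrative only; the driver itself keeps the relationship via mr->dd_crossed_mr and the per-device data_direct_mr_list):

#include <rdma/ib_verbs.h>

/* Illustrative pairing of the two MRs created by the data-direct path. */
struct data_direct_pair_example {
	/* KSM-mode dmabuf MR on the data-direct function; owns the pages. */
	struct ib_mr *crossed_mr;
	/*
	 * Crossing-VHCA mkey on the main function; its mkc points at
	 * crossed_mr->lkey via crossing_target_vhca_id, and this is the
	 * ib_mr handed back to the caller.
	 */
	struct ib_mr *crossing_mr;
};

Deregistration (dereg_crossing_data_direct_mr) tears the pair down in the reverse order under data_direct_lock.
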
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 4a04cbc5b78a..eaa2f9f5f3a9 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -34,6 +34,9 @@
#include <linux/kernel.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
+#include <linux/hmm.h>
+#include <linux/hmm-dma.h>
+#include <linux/pci-p2pdma.h>
#include "mlx5_ib.h"
#include "cmd.h"
@@ -45,7 +48,7 @@
/* Contains the details of a pagefault. */
struct mlx5_pagefault {
u32 bytes_committed;
- u32 token;
+ u64 token;
u8 event_subtype;
u8 type;
union {
@@ -74,6 +77,14 @@ struct mlx5_pagefault {
u32 rdma_op_len;
u64 rdma_va;
} rdma;
+ struct {
+ u64 va;
+ u32 mkey;
+ u32 fault_byte_count;
+ u32 prefetch_before_byte_count;
+ u32 prefetch_after_byte_count;
+ u8 flags;
+ } memory;
};
struct mlx5_ib_pf_eq *eq;
@@ -99,13 +110,20 @@ static u64 mlx5_imr_ksm_entries;
static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
struct mlx5_ib_mr *imr, int flags)
{
+ struct mlx5_core_dev *dev = mr_to_mdev(imr)->mdev;
struct mlx5_klm *end = pklm + nentries;
+ int step = MLX5_CAP_ODP(dev, mem_page_fault) ? MLX5_IMR_MTT_SIZE : 0;
+ __be32 key = MLX5_CAP_ODP(dev, mem_page_fault) ?
+ cpu_to_be32(imr->null_mmkey.key) :
+ mr_to_mdev(imr)->mkeys.null_mkey;
+ u64 va =
+ MLX5_CAP_ODP(dev, mem_page_fault) ? idx * MLX5_IMR_MTT_SIZE : 0;
if (flags & MLX5_IB_UPD_XLT_ZAP) {
- for (; pklm != end; pklm++, idx++) {
+ for (; pklm != end; pklm++, idx++, va += step) {
pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
- pklm->key = mr_to_mdev(imr)->mkeys.null_mkey;
- pklm->va = 0;
+ pklm->key = key;
+ pklm->va = cpu_to_be64(va);
}
return;
}
@@ -129,7 +147,7 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
*/
lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
- for (; pklm != end; pklm++, idx++) {
+ for (; pklm != end; pklm++, idx++, va += step) {
struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
@@ -137,47 +155,56 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
pklm->key = cpu_to_be32(mtt->ibmr.lkey);
pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
} else {
- pklm->key = mr_to_mdev(imr)->mkeys.null_mkey;
- pklm->va = 0;
+ pklm->key = key;
+ pklm->va = cpu_to_be64(va);
}
}
}
-static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
-{
- u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
-
- if (umem_dma & ODP_READ_ALLOWED_BIT)
- mtt_entry |= MLX5_IB_MTT_READ;
- if (umem_dma & ODP_WRITE_ALLOWED_BIT)
- mtt_entry |= MLX5_IB_MTT_WRITE;
-
- return mtt_entry;
-}
-
-static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
- struct mlx5_ib_mr *mr, int flags)
+static int populate_mtt(__be64 *pas, size_t start, size_t nentries,
+ struct mlx5_ib_mr *mr, int flags)
{
struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
- dma_addr_t pa;
+ bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE;
+ struct pci_p2pdma_map_state p2pdma_state = {};
+ struct ib_device *dev = odp->umem.ibdev;
size_t i;
if (flags & MLX5_IB_UPD_XLT_ZAP)
- return;
+ return 0;
for (i = 0; i < nentries; i++) {
- pa = odp->dma_list[idx + i];
- pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
+ unsigned long pfn = odp->map.pfn_list[start + i];
+ dma_addr_t dma_addr;
+
+ pfn = odp->map.pfn_list[start + i];
+ if (!(pfn & HMM_PFN_VALID))
+ /* ODP initialization */
+ continue;
+
+ dma_addr = hmm_dma_map_pfn(dev->dma_device, &odp->map,
+ start + i, &p2pdma_state);
+ if (ib_dma_mapping_error(dev, dma_addr))
+ return -EFAULT;
+
+ dma_addr |= MLX5_IB_MTT_READ;
+ if ((pfn & HMM_PFN_WRITE) && !downgrade)
+ dma_addr |= MLX5_IB_MTT_WRITE;
+
+ pas[i] = cpu_to_be64(dma_addr);
+ odp->npages++;
}
+ return 0;
}
-void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
- struct mlx5_ib_mr *mr, int flags)
+int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+ struct mlx5_ib_mr *mr, int flags)
{
if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
populate_klm(xlt, idx, nentries, mr, flags);
+ return 0;
} else {
- populate_mtt(xlt, idx, nentries, mr, flags);
+ return populate_mtt(xlt, idx, nentries, mr, flags);
}
}
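
A minimal sketch of the per-entry translation now performed by populate_mtt() above, stripped of the p2pdma mapping state and error handling (illustrative only):

#include <linux/hmm.h>
#include "mlx5_ib.h"	/* MLX5_IB_MTT_READ / MLX5_IB_MTT_WRITE */

static __be64 example_pfn_to_mtt(dma_addr_t dma_addr, unsigned long pfn,
				 bool downgrade)
{
	u64 mtt = dma_addr | MLX5_IB_MTT_READ;	/* read access is always set */

	/* Grant write access only for writable pfns when not downgrading. */
	if ((pfn & HMM_PFN_WRITE) && !downgrade)
		mtt |= MLX5_IB_MTT_WRITE;

	return cpu_to_be64(mtt);
}
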
@@ -213,10 +240,28 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
struct mlx5_ib_mr *imr = mr->parent;
+ /*
+	 * If userspace is racing to free the parent implicit ODP MR then we can
+	 * lose the race with parent destruction. In this case
+ * mlx5_ib_free_odp_mr() will free everything in the implicit_children
+ * xarray so NOP is fine. This child MR cannot be destroyed here because
+ * we are under its umem_mutex.
+ */
if (!refcount_inc_not_zero(&imr->mmkey.usecount))
return;
- xa_erase(&imr->implicit_children, idx);
+ xa_lock(&imr->implicit_children);
+ if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_KERNEL) !=
+ mr) {
+ xa_unlock(&imr->implicit_children);
+ mlx5r_deref_odp_mkey(&imr->mmkey);
+ return;
+ }
+
+ if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault))
+ __xa_erase(&mr_to_mdev(mr)->odp_mkeys,
+ mlx5_base_mkey(mr->mmkey.key));
+ xa_unlock(&imr->implicit_children);
/* Freeing a MR is a sleeping operation, so bounce to a work queue */
INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
@@ -250,6 +295,8 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
if (!umem_odp->npages)
goto out;
mr = umem_odp->private;
+ if (!mr)
+ goto out;
start = max_t(u64, ib_umem_start(umem_odp), range->start);
end = min_t(u64, ib_umem_end(umem_odp), range->end);
@@ -268,15 +315,11 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
* estimate the cost of another UMR vs. the cost of bigger
* UMR.
*/
- if (umem_odp->dma_list[idx] &
- (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
+ if (umem_odp->map.pfn_list[idx] & HMM_PFN_VALID) {
if (!in_block) {
blk_start_idx = idx;
in_block = 1;
}
-
- /* Count page invalidations */
- invalidations += idx - blk_start_idx + 1;
} else {
u64 umr_offset = idx & umr_block_mask;
@@ -286,16 +329,21 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
MLX5_IB_UPD_XLT_ZAP |
MLX5_IB_UPD_XLT_ATOMIC);
in_block = 0;
+ /* Count page invalidations */
+ invalidations += idx - blk_start_idx + 1;
}
}
}
- if (in_block)
+ if (in_block) {
mlx5r_umr_update_xlt(mr, blk_start_idx,
idx - blk_start_idx + 1, 0,
MLX5_IB_UPD_XLT_ZAP |
MLX5_IB_UPD_XLT_ATOMIC);
+ /* Count page invalidations */
+ invalidations += idx - blk_start_idx + 1;
+ }
- mlx5_update_odp_stats(mr, invalidations, invalidations);
+ mlx5_update_odp_stats_with_handled(mr, invalidations, invalidations);
/*
* We are now sure that the device will not access the
@@ -332,46 +380,46 @@ static void internal_fill_odp_caps(struct mlx5_ib_dev *dev)
else
dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);
- if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, ud_odp_caps.send))
caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
- if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, ud_odp_caps.srq_receive))
caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
- if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.send))
caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;
- if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.receive))
caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;
- if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.write))
caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;
- if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.read))
caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;
- if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.atomic))
caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
- if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.srq_receive))
caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
- if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.send))
caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND;
- if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.receive))
caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV;
- if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.write))
caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE;
- if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.read))
caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ;
- if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.atomic))
caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
- if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive))
+ if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.srq_receive))
caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
@@ -388,13 +436,29 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
pfault->wqe.wq_num : pfault->token;
u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {};
+ void *info;
int err;
MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);
- MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type);
- MLX5_SET(page_fault_resume_in, in, token, pfault->token);
- MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
- MLX5_SET(page_fault_resume_in, in, error, !!error);
+
+ if (pfault->event_subtype == MLX5_PFAULT_SUBTYPE_MEMORY) {
+ info = MLX5_ADDR_OF(page_fault_resume_in, in,
+ page_fault_info.mem_page_fault_info);
+ MLX5_SET(mem_page_fault_info, info, fault_token_31_0,
+ pfault->token & 0xffffffff);
+ MLX5_SET(mem_page_fault_info, info, fault_token_47_32,
+ (pfault->token >> 32) & 0xffff);
+ MLX5_SET(mem_page_fault_info, info, error, !!error);
+ } else {
+ info = MLX5_ADDR_OF(page_fault_resume_in, in,
+ page_fault_info.trans_page_fault_info);
+ MLX5_SET(trans_page_fault_info, info, page_fault_type,
+ pfault->type);
+ MLX5_SET(trans_page_fault_info, info, fault_token,
+ pfault->token);
+ MLX5_SET(trans_page_fault_info, info, wq_number, wq_num);
+ MLX5_SET(trans_page_fault_info, info, error, !!error);
+ }
err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in);
if (err)
@@ -466,8 +530,18 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
refcount_inc(&ret->mmkey.usecount);
goto out_lock;
}
- xa_unlock(&imr->implicit_children);
+ if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) {
+ ret = __xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
+ &mr->mmkey, GFP_KERNEL);
+ if (xa_is_err(ret)) {
+ ret = ERR_PTR(xa_err(ret));
+ __xa_erase(&imr->implicit_children, idx);
+ goto out_lock;
+ }
+ mr->mmkey.type = MLX5_MKEY_IMPLICIT_CHILD;
+ }
+ xa_unlock(&imr->implicit_children);
mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
return mr;
@@ -478,6 +552,57 @@ out_mr:
return ret;
}
+/*
+ * When using memory scheme ODP, implicit MRs can't use the reserved null mkey
+ * and each implicit MR needs its own private null mkey to receive page
+ * faults on.
+ * The null mkey is created with properties that trigger a page fault on
+ * every access and with all relevant access flags set.
+ */
+static int alloc_implicit_mr_null_mkey(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_mr *imr,
+ struct mlx5_ib_pd *pd)
+{
+ size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + 64;
+ void *mkc;
+ u32 *in;
+ int err;
+
+ in = kzalloc(inlen, GFP_KERNEL);
+ if (!in)
+ return -ENOMEM;
+
+ MLX5_SET(create_mkey_in, in, translations_octword_actual_size, 4);
+ MLX5_SET(create_mkey_in, in, pg_access, 1);
+
+ mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+ MLX5_SET(mkc, mkc, a, 1);
+ MLX5_SET(mkc, mkc, rw, 1);
+ MLX5_SET(mkc, mkc, rr, 1);
+ MLX5_SET(mkc, mkc, lw, 1);
+ MLX5_SET(mkc, mkc, lr, 1);
+ MLX5_SET(mkc, mkc, free, 0);
+ MLX5_SET(mkc, mkc, umr_en, 0);
+ MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
+
+ MLX5_SET(mkc, mkc, translations_octword_size, 4);
+ MLX5_SET(mkc, mkc, log_page_size, 61);
+ MLX5_SET(mkc, mkc, length64, 1);
+ MLX5_SET(mkc, mkc, pd, pd->pdn);
+ MLX5_SET64(mkc, mkc, start_addr, 0);
+ MLX5_SET(mkc, mkc, qpn, 0xffffff);
+
+ err = mlx5_core_create_mkey(dev->mdev, &imr->null_mmkey.key, in, inlen);
+ if (err)
+ goto free_in;
+
+ imr->null_mmkey.type = MLX5_MKEY_NULL;
+
+free_in:
+ kfree(in);
+ return err;
+}
+
struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
int access_flags)
{
@@ -510,6 +635,16 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
imr->is_odp_implicit = true;
xa_init(&imr->implicit_children);
+ if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) {
+ err = alloc_implicit_mr_null_mkey(dev, imr, pd);
+ if (err)
+ goto out_mr;
+
+ err = mlx5r_store_odp_mkey(dev, &imr->null_mmkey);
+ if (err)
+ goto out_mr;
+ }
+
err = mlx5r_umr_update_xlt(imr, 0,
mlx5_imr_ksm_entries,
MLX5_KSM_PAGE_SHIFT,
@@ -544,6 +679,14 @@ void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr)
xa_erase(&mr->implicit_children, idx);
mlx5_ib_dereg_mr(&mtt->ibmr, NULL);
}
+
+ if (mr->null_mmkey.key) {
+ xa_erase(&mr_to_mdev(mr)->odp_mkeys,
+ mlx5_base_mkey(mr->null_mmkey.key));
+
+ mlx5_core_destroy_mkey(mr_to_mdev(mr)->mdev,
+ mr->null_mmkey.key);
+ }
}
#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
@@ -555,7 +698,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
{
int page_shift, ret, np;
bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
- u64 access_mask;
+ u64 access_mask = 0;
u64 start_idx;
bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;
@@ -563,12 +706,14 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
if (flags & MLX5_PF_FLAGS_ENABLE)
xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
+ if (flags & MLX5_PF_FLAGS_DOWNGRADE)
+ xlt_flags |= MLX5_IB_UPD_XLT_DOWNGRADE;
+
page_shift = odp->page_shift;
start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
- access_mask = ODP_READ_ALLOWED_BIT;
if (odp->umem.writable && !downgrade)
- access_mask |= ODP_WRITE_ALLOWED_BIT;
+ access_mask |= HMM_PFN_WRITE;
np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
if (np < 0)
@@ -693,7 +838,7 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt,
struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
u32 xlt_flags = 0;
int err;
- unsigned int page_size;
+ unsigned long page_size;
if (flags & MLX5_PF_FLAGS_ENABLE)
xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
@@ -705,14 +850,15 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt,
return err;
}
- page_size = mlx5_umem_find_best_pgsz(&umem_dmabuf->umem, mkc,
- log_page_size, 0,
- umem_dmabuf->umem.iova);
- if (unlikely(page_size < PAGE_SIZE)) {
+ page_size = mlx5_umem_dmabuf_find_best_pgsz(umem_dmabuf);
+ if (!page_size) {
ib_umem_dmabuf_unmap_pages(umem_dmabuf);
err = -EINVAL;
} else {
- err = mlx5r_umr_update_mr_pas(mr, xlt_flags);
+ if (mr->data_direct)
+ err = mlx5r_umr_update_data_direct_ksm_pas(mr, xlt_flags);
+ else
+ err = mlx5r_umr_update_mr_pas(mr, xlt_flags);
}
dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
@@ -735,24 +881,31 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt,
* >0: Number of pages mapped
*/
static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
- u32 *bytes_mapped, u32 flags)
+ u32 *bytes_mapped, u32 flags, bool permissive_fault)
{
struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
- if (unlikely(io_virt < mr->ibmr.iova))
+ if (unlikely(io_virt < mr->ibmr.iova) && !permissive_fault)
return -EFAULT;
if (mr->umem->is_dmabuf)
return pagefault_dmabuf_mr(mr, bcnt, bytes_mapped, flags);
if (!odp->is_implicit_odp) {
+ u64 offset = io_virt < mr->ibmr.iova ? 0 : io_virt - mr->ibmr.iova;
u64 user_va;
- if (check_add_overflow(io_virt - mr->ibmr.iova,
- (u64)odp->umem.address, &user_va))
+ if (check_add_overflow(offset, (u64)odp->umem.address,
+ &user_va))
return -EFAULT;
- if (unlikely(user_va >= ib_umem_end(odp) ||
- ib_umem_end(odp) - user_va < bcnt))
+
+ if (permissive_fault) {
+ if (user_va < ib_umem_start(odp))
+ user_va = ib_umem_start(odp);
+ if ((user_va + bcnt) > ib_umem_end(odp))
+ bcnt = ib_umem_end(odp) - user_va;
+ } else if (unlikely(user_va >= ib_umem_end(odp) ||
+ ib_umem_end(odp) - user_va < bcnt))
return -EFAULT;
return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped,
flags);
@@ -799,11 +952,31 @@ static bool mkey_is_eq(struct mlx5_ib_mkey *mmkey, u32 key)
return mmkey->key == key;
}
+static struct mlx5_ib_mkey *find_odp_mkey(struct mlx5_ib_dev *dev, u32 key)
+{
+ struct mlx5_ib_mkey *mmkey;
+
+ xa_lock(&dev->odp_mkeys);
+ mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
+ if (!mmkey) {
+ mmkey = ERR_PTR(-ENOENT);
+ goto out;
+ }
+ if (!mkey_is_eq(mmkey, key)) {
+ mmkey = ERR_PTR(-EFAULT);
+ goto out;
+ }
+ refcount_inc(&mmkey->usecount);
+out:
+ xa_unlock(&dev->odp_mkeys);
+
+ return mmkey;
+}
+
/*
* Handle a single data segment in a page-fault WQE or RDMA region.
*
- * Returns number of OS pages retrieved on success. The caller may continue to
- * the next data segment.
+ * Returns zero on success. The caller may continue to the next data segment.
* Can return the following error codes:
* -EAGAIN to designate a temporary error. The caller will abort handling the
* page fault and resolve it.
@@ -816,7 +989,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
u32 *bytes_committed,
u32 *bytes_mapped)
{
- int npages = 0, ret, i, outlen, cur_outlen = 0, depth = 0;
+ int ret, i, outlen, cur_outlen = 0, depth = 0, pages_in_range;
struct pf_frame *head = NULL, *frame;
struct mlx5_ib_mkey *mmkey;
struct mlx5_ib_mr *mr;
@@ -826,32 +999,24 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
io_virt += *bytes_committed;
bcnt -= *bytes_committed;
-
next_mr:
- xa_lock(&dev->odp_mkeys);
- mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
- if (!mmkey) {
- xa_unlock(&dev->odp_mkeys);
- mlx5_ib_dbg(
- dev,
- "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
- key);
- if (bytes_mapped)
- *bytes_mapped += bcnt;
- /*
- * The user could specify a SGL with multiple lkeys and only
- * some of them are ODP. Treat the non-ODP ones as fully
- * faulted.
- */
- ret = 0;
- goto end;
- }
- refcount_inc(&mmkey->usecount);
- xa_unlock(&dev->odp_mkeys);
-
- if (!mkey_is_eq(mmkey, key)) {
- mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
- ret = -EFAULT;
+ mmkey = find_odp_mkey(dev, key);
+ if (IS_ERR(mmkey)) {
+ ret = PTR_ERR(mmkey);
+ if (ret == -ENOENT) {
+ mlx5_ib_dbg(
+ dev,
+ "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
+ key);
+ if (bytes_mapped)
+ *bytes_mapped += bcnt;
+ /*
+ * The user could specify a SGL with multiple lkeys and
+ * only some of them are ODP. Treat the non-ODP ones as
+ * fully faulted.
+ */
+ ret = 0;
+ }
goto end;
}
@@ -859,13 +1024,20 @@ next_mr:
case MLX5_MKEY_MR:
mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
- ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0);
+ pages_in_range = (ALIGN(io_virt + bcnt, PAGE_SIZE) -
+ (io_virt & PAGE_MASK)) >>
+ PAGE_SHIFT;
+ ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0, false);
if (ret < 0)
goto end;
- mlx5_update_odp_stats(mr, faults, ret);
+ mlx5_update_odp_stats_with_handled(mr, faults, ret);
+
+ if (ret < pages_in_range) {
+ ret = -EFAULT;
+ goto end;
+ }
- npages += ret;
ret = 0;
break;
@@ -946,7 +1118,7 @@ next_mr:
}
end:
- if (mmkey)
+ if (!IS_ERR(mmkey))
mlx5r_deref_odp_mkey(mmkey);
while (head) {
frame = head;
@@ -956,7 +1128,7 @@ end:
kfree(out);
*bytes_committed = 0;
- return ret ? ret : npages;
+ return ret;
}
/*
@@ -975,8 +1147,7 @@ end:
* the committed bytes).
* @receive_queue: receive WQE end of sg list
*
- * Returns the number of pages loaded if positive, zero for an empty WQE, or a
- * negative error code.
+ * Returns zero for success or a negative error code.
*/
static int pagefault_data_segments(struct mlx5_ib_dev *dev,
struct mlx5_pagefault *pfault,
@@ -984,7 +1155,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev,
void *wqe_end, u32 *bytes_mapped,
u32 *total_wqe_bytes, bool receive_queue)
{
- int ret = 0, npages = 0;
+ int ret = 0;
u64 io_virt;
__be32 key;
u32 byte_count;
@@ -1041,10 +1212,9 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev,
bytes_mapped);
if (ret < 0)
break;
- npages += ret;
}
- return ret < 0 ? ret : npages;
+ return ret;
}
/*
@@ -1268,7 +1438,7 @@ read_user:
if (ret)
mlx5_ib_err(
dev,
- "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n",
+ "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %llx\n",
ret, wqe_index, pfault->token);
resolve_page_fault:
@@ -1280,12 +1450,6 @@ resolve_page_fault:
free_page((unsigned long)wqe_start);
}
-static int pages_in_range(u64 address, u32 length)
-{
- return (ALIGN(address + length, PAGE_SIZE) -
- (address & PAGE_MASK)) >> PAGE_SHIFT;
-}
-
static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
struct mlx5_pagefault *pfault)
{
@@ -1324,16 +1488,16 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
if (ret == -EAGAIN) {
/* We're racing with an invalidation, don't prefetch */
prefetch_activated = 0;
- } else if (ret < 0 || pages_in_range(address, length) > ret) {
+ } else if (ret < 0) {
mlx5_ib_page_fault_resume(dev, pfault, 1);
if (ret != -ENOENT)
- mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n",
+ mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%llx, type: 0x%x\n",
ret, pfault->token, pfault->type);
return;
}
mlx5_ib_page_fault_resume(dev, pfault, 0);
- mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n",
+ mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%llx, type: 0x%x, prefetch_activated: %d\n",
pfault->token, pfault->type,
prefetch_activated);
@@ -1349,12 +1513,80 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
prefetch_len,
&bytes_committed, NULL);
if (ret < 0 && ret != -EAGAIN) {
- mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
+ mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%llx, address: 0x%.16llx, length = 0x%.16x\n",
ret, pfault->token, address, prefetch_len);
}
}
}
+#define MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST BIT(7)
+static void mlx5_ib_mr_memory_pfault_handler(struct mlx5_ib_dev *dev,
+ struct mlx5_pagefault *pfault)
+{
+ u64 prefetch_va =
+ pfault->memory.va - pfault->memory.prefetch_before_byte_count;
+ size_t prefetch_size = pfault->memory.prefetch_before_byte_count +
+ pfault->memory.fault_byte_count +
+ pfault->memory.prefetch_after_byte_count;
+ struct mlx5_ib_mkey *mmkey;
+ struct mlx5_ib_mr *mr, *child_mr;
+ int ret = 0;
+
+ mmkey = find_odp_mkey(dev, pfault->memory.mkey);
+ if (IS_ERR(mmkey))
+ goto err;
+
+ switch (mmkey->type) {
+ case MLX5_MKEY_IMPLICIT_CHILD:
+ child_mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+ mr = child_mr->parent;
+ break;
+ case MLX5_MKEY_NULL:
+ mr = container_of(mmkey, struct mlx5_ib_mr, null_mmkey);
+ break;
+ default:
+ mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+ break;
+ }
+
+	/* If prefetch fails, handle only the demanded page fault */
+ ret = pagefault_mr(mr, prefetch_va, prefetch_size, NULL, 0, true);
+ if (ret < 0) {
+ ret = pagefault_mr(mr, pfault->memory.va,
+ pfault->memory.fault_byte_count, NULL, 0,
+ true);
+ if (ret < 0)
+ goto err;
+ }
+
+ mlx5_update_odp_stats_with_handled(mr, faults, ret);
+ mlx5r_deref_odp_mkey(mmkey);
+
+ if (pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST)
+ mlx5_ib_page_fault_resume(dev, pfault, 0);
+
+ mlx5_ib_dbg(
+ dev,
+ "PAGE FAULT completed %s. token 0x%llx, mkey: 0x%x, va: 0x%llx, byte_count: 0x%x\n",
+ pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST ?
+ "" :
+ "without resume cmd",
+ pfault->token, pfault->memory.mkey, pfault->memory.va,
+ pfault->memory.fault_byte_count);
+
+ return;
+
+err:
+ if (!IS_ERR(mmkey))
+ mlx5r_deref_odp_mkey(mmkey);
+ mlx5_ib_page_fault_resume(dev, pfault, 1);
+ mlx5_ib_dbg(
+ dev,
+ "PAGE FAULT error. token 0x%llx, mkey: 0x%x, va: 0x%llx, byte_count: 0x%x, err: %d\n",
+ pfault->token, pfault->memory.mkey, pfault->memory.va,
+ pfault->memory.fault_byte_count, ret);
+}
+
static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
{
u8 event_subtype = pfault->event_subtype;
@@ -1366,6 +1598,9 @@ static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfaul
case MLX5_PFAULT_SUBTYPE_RDMA:
mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
break;
+ case MLX5_PFAULT_SUBTYPE_MEMORY:
+ mlx5_ib_mr_memory_pfault_handler(dev, pfault);
+ break;
default:
mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
event_subtype);
@@ -1384,6 +1619,7 @@ static void mlx5_ib_eqe_pf_action(struct work_struct *work)
mempool_free(pfault, eq->pool);
}
+#define MEMORY_SCHEME_PAGE_FAULT_GRANULARITY 4096
static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
{
struct mlx5_eqe_page_fault *pf_eqe;
@@ -1400,15 +1636,12 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
pf_eqe = &eqe->data.page_fault;
pfault->event_subtype = eqe->sub_type;
- pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);
-
- mlx5_ib_dbg(eq->dev,
- "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
- eqe->sub_type, pfault->bytes_committed);
switch (eqe->sub_type) {
case MLX5_PFAULT_SUBTYPE_RDMA:
/* RDMA based event */
+ pfault->bytes_committed =
+ be32_to_cpu(pf_eqe->rdma.bytes_committed);
pfault->type =
be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
pfault->token =
@@ -1422,10 +1655,12 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
be32_to_cpu(pf_eqe->rdma.rdma_op_len);
pfault->rdma.rdma_va =
be64_to_cpu(pf_eqe->rdma.rdma_va);
- mlx5_ib_dbg(eq->dev,
- "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
- pfault->type, pfault->token,
- pfault->rdma.r_key);
+ mlx5_ib_dbg(
+ eq->dev,
+ "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x, type:0x%x, token: 0x%06llx, r_key: 0x%08x\n",
+ eqe->sub_type, pfault->bytes_committed,
+ pfault->type, pfault->token,
+ pfault->rdma.r_key);
mlx5_ib_dbg(eq->dev,
"PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
pfault->rdma.rdma_op_len,
@@ -1434,6 +1669,8 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
case MLX5_PFAULT_SUBTYPE_WQE:
/* WQE based event */
+ pfault->bytes_committed =
+ be32_to_cpu(pf_eqe->wqe.bytes_committed);
pfault->type =
(be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
pfault->token =
@@ -1445,11 +1682,47 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
be16_to_cpu(pf_eqe->wqe.wqe_index);
pfault->wqe.packet_size =
be16_to_cpu(pf_eqe->wqe.packet_length);
- mlx5_ib_dbg(eq->dev,
- "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
- pfault->type, pfault->token,
- pfault->wqe.wq_num,
- pfault->wqe.wqe_index);
+ mlx5_ib_dbg(
+ eq->dev,
+ "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x, type:0x%x, token: 0x%06llx, wq_num: 0x%06x, wqe_index: 0x%04x\n",
+ eqe->sub_type, pfault->bytes_committed,
+ pfault->type, pfault->token, pfault->wqe.wq_num,
+ pfault->wqe.wqe_index);
+ break;
+
+ case MLX5_PFAULT_SUBTYPE_MEMORY:
+ /* Memory based event */
+ pfault->bytes_committed = 0;
+ pfault->token =
+ be32_to_cpu(pf_eqe->memory.token31_0) |
+ ((u64)be16_to_cpu(pf_eqe->memory.token47_32)
+ << 32);
+ pfault->memory.va = be64_to_cpu(pf_eqe->memory.va);
+ pfault->memory.mkey = be32_to_cpu(pf_eqe->memory.mkey);
+ pfault->memory.fault_byte_count = (be32_to_cpu(
+ pf_eqe->memory.demand_fault_pages) >> 12) *
+ MEMORY_SCHEME_PAGE_FAULT_GRANULARITY;
+ pfault->memory.prefetch_before_byte_count =
+ be16_to_cpu(
+ pf_eqe->memory.pre_demand_fault_pages) *
+ MEMORY_SCHEME_PAGE_FAULT_GRANULARITY;
+ pfault->memory.prefetch_after_byte_count =
+ be16_to_cpu(
+ pf_eqe->memory.post_demand_fault_pages) *
+ MEMORY_SCHEME_PAGE_FAULT_GRANULARITY;
+ pfault->memory.flags = pf_eqe->memory.flags;
+ mlx5_ib_dbg(
+ eq->dev,
+ "PAGE_FAULT: subtype: 0x%02x, token: 0x%06llx, mkey: 0x%06x, fault_byte_count: 0x%06x, va: 0x%016llx, flags: 0x%02x\n",
+ eqe->sub_type, pfault->token,
+ pfault->memory.mkey,
+ pfault->memory.fault_byte_count,
+ pfault->memory.va, pfault->memory.flags);
+ mlx5_ib_dbg(
+ eq->dev,
+ "PAGE_FAULT: prefetch size: before: 0x%06x, after 0x%06x\n",
+ pfault->memory.prefetch_before_byte_count,
+ pfault->memory.prefetch_after_byte_count);
break;
default:
@@ -1712,7 +1985,7 @@ static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
for (i = 0; i < work->num_sge; ++i) {
ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
work->frags[i].length, &bytes_mapped,
- work->pf_flags);
+ work->pf_flags, false);
if (ret <= 0)
continue;
mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret);
@@ -1763,7 +2036,7 @@ static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
if (IS_ERR(mr))
return PTR_ERR(mr);
ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
- &bytes_mapped, pf_flags);
+ &bytes_mapped, pf_flags, false);
if (ret < 0) {
mlx5r_deref_odp_mkey(&mr->mmkey);
return ret;
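
A minimal userspace sketch of the byte-range arithmetic the new memory-scheme fault handler performs; the 4 KB granularity mirrors MEMORY_SCHEME_PAGE_FAULT_GRANULARITY above, while the EQE field values here are made up for illustration:

#include <stdio.h>
#include <stdint.h>

#define MEMORY_SCHEME_PAGE_FAULT_GRANULARITY 4096

int main(void)
{
	/* Hypothetical EQE contents */
	uint64_t va = 0x7f0000003000;		/* faulting virtual address */
	uint32_t demand_pages = 2;		/* pages that must be resolved */
	uint16_t pre_pages = 1, post_pages = 4;	/* optional prefetch hints */

	uint64_t prefetch_va = va -
		pre_pages * MEMORY_SCHEME_PAGE_FAULT_GRANULARITY;
	size_t prefetch_size = (pre_pages + demand_pages + post_pages) *
			       MEMORY_SCHEME_PAGE_FAULT_GRANULARITY;

	printf("prefetch range: va=0x%llx size=0x%zx\n",
	       (unsigned long long)prefetch_va, prefetch_size);
	printf("demand range:   va=0x%llx size=0x%x\n",
	       (unsigned long long)va,
	       (unsigned)(demand_pages * MEMORY_SCHEME_PAGE_FAULT_GRANULARITY));
	return 0;
}

The handler first attempts the whole prefetch window and only falls back to the demanded range on failure, which is why pagefault_mr() gains the permissive_fault argument that clamps the range to the MR boundaries instead of failing with -EFAULT.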
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 8115ab107149..88724d15705d 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -1107,8 +1107,6 @@ static int _create_kernel_qp(struct mlx5_ib_dev *dev,
if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR)
qp->bf.bfreg = &dev->fp_bfreg;
- else if (qp->flags & MLX5_IB_QP_CREATE_WC_TEST)
- qp->bf.bfreg = &dev->wc_bfreg;
else
qp->bf.bfreg = &dev->bfreg;
@@ -1962,7 +1960,7 @@ static int atomic_size_to_mode(int size_mask)
}
static int get_atomic_mode(struct mlx5_ib_dev *dev,
- enum ib_qp_type qp_type)
+ struct mlx5_ib_qp *qp)
{
u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
u8 atomic = MLX5_CAP_GEN(dev->mdev, atomic);
@@ -1972,7 +1970,7 @@ static int get_atomic_mode(struct mlx5_ib_dev *dev,
if (!atomic)
return -EOPNOTSUPP;
- if (qp_type == MLX5_IB_QPT_DCT)
+ if (qp->type == MLX5_IB_QPT_DCT)
atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc);
else
atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
@@ -1986,6 +1984,10 @@ static int get_atomic_mode(struct mlx5_ib_dev *dev,
atomic_operations & MLX5_ATOMIC_OPS_FETCH_ADD))
atomic_mode = MLX5_ATOMIC_MODE_IB_COMP;
+	/* OOO DP QPs do not support atomic operations larger than 8 bytes */
+ if (atomic_mode > MLX5_ATOMIC_MODE_8B && qp->is_ooo_rq)
+ atomic_mode = MLX5_ATOMIC_MODE_8B;
+
return atomic_mode;
}
@@ -2841,6 +2843,29 @@ static int check_valid_flow(struct mlx5_ib_dev *dev, struct ib_pd *pd,
return 0;
}
+static bool get_dp_ooo_cap(struct mlx5_core_dev *mdev, enum ib_qp_type qp_type)
+{
+ if (!MLX5_CAP_GEN_2(mdev, dp_ordering_force))
+ return false;
+
+ switch (qp_type) {
+ case IB_QPT_RC:
+ return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_rc);
+ case IB_QPT_XRC_INI:
+ case IB_QPT_XRC_TGT:
+ return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_xrc);
+ case IB_QPT_UC:
+ return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_uc);
+ case IB_QPT_UD:
+ return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_ud);
+ case MLX5_IB_QPT_DCI:
+ case MLX5_IB_QPT_DCT:
+ return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_dc);
+ default:
+ return false;
+ }
+}
+
static void process_vendor_flag(struct mlx5_ib_dev *dev, int *flags, int flag,
bool cond, struct mlx5_ib_qp *qp)
{
@@ -2959,14 +2984,6 @@ static void process_create_flag(struct mlx5_ib_dev *dev, int *flags, int flag,
return;
}
- if (flag == MLX5_IB_QP_CREATE_WC_TEST) {
- /*
- * Special case, if condition didn't meet, it won't be error,
- * just different in-kernel flow.
- */
- *flags &= ~MLX5_IB_QP_CREATE_WC_TEST;
- return;
- }
mlx5_ib_dbg(dev, "Verbs create QP flag 0x%X is not supported\n", flag);
}
@@ -3027,8 +3044,6 @@ static int process_create_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
IB_QP_CREATE_PCI_WRITE_END_PADDING,
MLX5_CAP_GEN(mdev, end_pad), qp);
- process_create_flag(dev, &create_flags, MLX5_IB_QP_CREATE_WC_TEST,
- qp_type != MLX5_IB_QPT_REG_UMR, qp);
process_create_flag(dev, &create_flags, MLX5_IB_QP_CREATE_SQPN_QP1,
true, qp);
@@ -3097,7 +3112,6 @@ static int create_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
switch (qp->type) {
case MLX5_IB_QPT_DCT:
err = create_dct(dev, pd, qp, params);
- rdma_restrack_no_track(&qp->ibqp.res);
break;
case MLX5_IB_QPT_DCI:
err = create_dci(dev, pd, qp, params);
@@ -3109,9 +3123,9 @@ static int create_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
err = mlx5_ib_create_gsi(pd, qp, params->attr);
break;
case MLX5_IB_QPT_HW_GSI:
- case MLX5_IB_QPT_REG_UMR:
rdma_restrack_no_track(&qp->ibqp.res);
fallthrough;
+ case MLX5_IB_QPT_REG_UMR:
default:
if (params->udata)
err = create_user_qp(dev, pd, qp, params);
@@ -3247,6 +3261,10 @@ int mlx5_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr,
enum ib_qp_type type;
int err;
+ err = mlx5_ib_dev_res_srq_init(dev);
+ if (err)
+ return err;
+
err = check_qp_type(dev, attr, &type);
if (err)
return err;
@@ -3374,7 +3392,7 @@ static int set_qpc_atomic_flags(struct mlx5_ib_qp *qp,
if (access_flags & IB_ACCESS_REMOTE_ATOMIC) {
int atomic_mode;
- atomic_mode = get_atomic_mode(dev, qp->type);
+ atomic_mode = get_atomic_mode(dev, qp);
if (atomic_mode < 0)
return -EOPNOTSUPP;
@@ -3429,11 +3447,11 @@ static int ib_to_mlx5_rate_map(u8 rate)
return 0;
}
-static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate)
+int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate)
{
u32 stat_rate_support;
- if (rate == IB_RATE_PORT_CURRENT)
+ if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS)
return 0;
if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_800_GBPS)
@@ -3578,7 +3596,7 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
sizeof(grh->dgid.raw));
}
- err = ib_rate_to_mlx5(dev, rdma_ah_get_static_rate(ah));
+ err = mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah));
if (err < 0)
return err;
MLX5_SET(ads, path, stat_rate, err);
@@ -4226,7 +4244,12 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
/* todo implement counter_index functionality */
- if (is_sqp(qp->type))
+ if (dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI && is_qp0(qp->type)) {
+ MLX5_SET(ads, pri_path, vhca_port_num,
+ smi_to_native_portnum(dev, qp->port));
+ if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR)
+ MLX5_SET(ads, pri_path, plane_index, qp->port);
+ } else if (is_sqp(qp->type))
MLX5_SET(ads, pri_path, vhca_port_num, qp->port);
if (attr_mask & IB_QP_PORT)
@@ -4272,14 +4295,14 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
MLX5_SET(qpc, qpc, retry_count, attr->retry_cnt);
if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic)
- MLX5_SET(qpc, qpc, log_sra_max, ilog2(attr->max_rd_atomic));
+ MLX5_SET(qpc, qpc, log_sra_max, fls(attr->max_rd_atomic - 1));
if (attr_mask & IB_QP_SQ_PSN)
MLX5_SET(qpc, qpc, next_send_psn, attr->sq_psn);
if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && attr->max_dest_rd_atomic)
MLX5_SET(qpc, qpc, log_rra_max,
- ilog2(attr->max_dest_rd_atomic));
+ fls(attr->max_dest_rd_atomic - 1));
if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
err = set_qpc_atomic_flags(qp, attr, attr_mask, qpc);
@@ -4320,6 +4343,11 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
if (qp->flags & MLX5_IB_QP_CREATE_SQPN_QP1)
MLX5_SET(qpc, qpc, deth_sqpn, 1);
+ if (qp->is_ooo_rq && cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
+ MLX5_SET(qpc, qpc, dp_ordering_1, 1);
+ MLX5_SET(qpc, qpc, dp_ordering_force, 1);
+ }
+
mlx5_cur = to_mlx5_state(cur_state);
mlx5_new = to_mlx5_state(new_state);
@@ -4535,7 +4563,7 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr,
if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) {
int atomic_mode;
- atomic_mode = get_atomic_mode(dev, MLX5_IB_QPT_DCT);
+ atomic_mode = get_atomic_mode(dev, qp);
if (atomic_mode < 0)
return -EOPNOTSUPP;
@@ -4551,6 +4579,8 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr,
set_id = mlx5_ib_get_counters_id(dev, attr->port_num - 1);
MLX5_SET(dctc, dctc, counter_set_id, set_id);
+
+ qp->port = attr->port_num;
} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
struct mlx5_ib_modify_qp_resp resp = {};
u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {};
@@ -4577,6 +4607,10 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr,
MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit);
if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE)
MLX5_SET(dctc, dctc, eth_prio, attr->ah_attr.sl & 0x7);
+ if (qp->is_ooo_rq) {
+ MLX5_SET(dctc, dctc, dp_ordering_1, 1);
+ MLX5_SET(dctc, dctc, dp_ordering_force, 1);
+ }
err = mlx5_core_create_dct(dev, &qp->dct.mdct, qp->dct.in,
MLX5_ST_SZ_BYTES(create_dct_in), out,
@@ -4610,10 +4644,6 @@ static bool mlx5_ib_modify_qp_allowed(struct mlx5_ib_dev *dev,
if (qp->type == IB_QPT_RAW_PACKET || qp->type == MLX5_IB_QPT_REG_UMR)
return true;
- /* Internal QP used for wc testing, with NOPs in wq */
- if (qp->flags & MLX5_IB_QP_CREATE_WC_TEST)
- return true;
-
return false;
}
@@ -4684,11 +4714,16 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
min(udata->inlen, sizeof(ucmd))))
return -EFAULT;
- if (ucmd.comp_mask ||
+ if (ucmd.comp_mask & ~MLX5_IB_MODIFY_QP_OOO_DP ||
memchr_inv(&ucmd.burst_info.reserved, 0,
sizeof(ucmd.burst_info.reserved)))
return -EOPNOTSUPP;
+ if (ucmd.comp_mask & MLX5_IB_MODIFY_QP_OOO_DP) {
+ if (!get_dp_ooo_cap(dev->mdev, qp->type))
+ return -EOPNOTSUPP;
+ qp->is_ooo_rq = 1;
+ }
}
if (qp->type == IB_QPT_GSI)
@@ -5041,7 +5076,7 @@ static int mlx5_ib_dct_query_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *mqp,
}
if (qp_attr_mask & IB_QP_PORT)
- qp_attr->port_num = MLX5_GET(dctc, dctc, port);
+ qp_attr->port_num = mqp->port;
if (qp_attr_mask & IB_QP_MIN_RNR_TIMER)
qp_attr->min_rnr_timer = MLX5_GET(dctc, dctc, min_rnr_nak);
if (qp_attr_mask & IB_QP_AV) {
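
One non-obvious qp.c change above is the switch from ilog2() to fls(x - 1) when encoding log_sra_max/log_rra_max. A small userspace sketch (fls32() is a stand-in for the kernel's fls(); ilog2(x) equals fls(x) - 1 for x > 0) shows the difference:

#include <stdio.h>

/* Userspace stand-in for the kernel's fls(): position of the highest set bit. */
static int fls32(unsigned int x)
{
	return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
	for (unsigned int max_rd_atomic = 1; max_rd_atomic <= 9; max_rd_atomic++)
		printf("max_rd_atomic=%u  ilog2()=%d (caps at %u)  fls(x-1)=%d (caps at %u)\n",
		       max_rd_atomic,
		       fls32(max_rd_atomic) - 1, 1u << (fls32(max_rd_atomic) - 1),
		       fls32(max_rd_atomic - 1), 1u << fls32(max_rd_atomic - 1));
	return 0;
}

With ilog2(), a request for e.g. 5 outstanding read/atomic operations is silently encoded as 4; fls(x - 1) rounds up to 8, so the encoded limit is never smaller than what was requested.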
diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h
index b6ee7c3ee1ca..2530e7730635 100644
--- a/drivers/infiniband/hw/mlx5/qp.h
+++ b/drivers/infiniband/hw/mlx5/qp.h
@@ -56,4 +56,5 @@ int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn);
int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter);
int mlx5_ib_qp_event_init(void);
void mlx5_ib_qp_event_cleanup(void);
+int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate);
#endif /* _MLX5_IB_QP_H */
diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c
index d9cf6982d645..146d03ae40bd 100644
--- a/drivers/infiniband/hw/mlx5/qpc.c
+++ b/drivers/infiniband/hw/mlx5/qpc.c
@@ -21,8 +21,10 @@ mlx5_get_rsc(struct mlx5_qp_table *table, u32 rsn)
spin_lock_irqsave(&table->lock, flags);
common = radix_tree_lookup(&table->tree, rsn);
- if (common)
+ if (common && !common->invalid)
refcount_inc(&common->refcount);
+ else
+ common = NULL;
spin_unlock_irqrestore(&table->lock, flags);
@@ -178,6 +180,18 @@ static int create_resource_common(struct mlx5_ib_dev *dev,
return 0;
}
+static void modify_resource_common_state(struct mlx5_ib_dev *dev,
+ struct mlx5_core_qp *qp,
+ bool invalid)
+{
+ struct mlx5_qp_table *table = &dev->qp_table;
+ unsigned long flags;
+
+ spin_lock_irqsave(&table->lock, flags);
+ qp->common.invalid = invalid;
+ spin_unlock_irqrestore(&table->lock, flags);
+}
+
static void destroy_resource_common(struct mlx5_ib_dev *dev,
struct mlx5_core_qp *qp)
{
@@ -249,7 +263,8 @@ int mlx5_qpc_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp,
if (err)
goto err_cmd;
- mlx5_debug_qp_add(dev->mdev, qp);
+ if (dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI)
+ mlx5_debug_qp_add(dev->mdev, qp);
return 0;
@@ -307,7 +322,8 @@ int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp)
{
u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
- mlx5_debug_qp_remove(dev->mdev, qp);
+ if (dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI)
+ mlx5_debug_qp_remove(dev->mdev, qp);
destroy_resource_common(dev, qp);
@@ -504,7 +520,9 @@ int mlx5_init_qp_table(struct mlx5_ib_dev *dev)
spin_lock_init(&table->lock);
INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
xa_init(&table->dct_xa);
- mlx5_qp_debugfs_init(dev->mdev);
+
+ if (dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI)
+ mlx5_qp_debugfs_init(dev->mdev);
table->nb.notifier_call = rsc_event_notifier;
mlx5_notifier_register(dev->mdev, &table->nb);
@@ -517,7 +535,8 @@ void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev)
struct mlx5_qp_table *table = &dev->qp_table;
mlx5_notifier_unregister(dev->mdev, &table->nb);
- mlx5_qp_debugfs_cleanup(dev->mdev);
+ if (dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI)
+ mlx5_qp_debugfs_cleanup(dev->mdev);
}
int mlx5_core_qp_query(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp,
@@ -604,8 +623,20 @@ err_destroy_rq:
int mlx5_core_destroy_rq_tracked(struct mlx5_ib_dev *dev,
struct mlx5_core_qp *rq)
{
+ int ret;
+
+	/* The rq destruction may be called again if it fails, hence we mark the
+	 * common resource as invalid and only destroy the resources once FW
+	 * destruction has completed successfully.
+	 */
+ modify_resource_common_state(dev, rq, true);
+ ret = destroy_rq_tracked(dev, rq->qpn, rq->uid);
+ if (ret) {
+ modify_resource_common_state(dev, rq, false);
+ return ret;
+ }
destroy_resource_common(dev, rq);
- return destroy_rq_tracked(dev, rq->qpn, rq->uid);
+ return 0;
}
static void destroy_sq_tracked(struct mlx5_ib_dev *dev, u32 sqn, u16 uid)
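
A minimal sketch of the ordering the retry-safe RQ teardown above relies on; tracked_res, fw_destroy() and destroy_common() are hypothetical stand-ins for the driver's structures and calls:

#include <stdbool.h>
#include <stdio.h>

struct tracked_res {
	bool invalid;	/* while set, lookups (cf. mlx5_get_rsc) skip the resource */
};

/* Stub for the firmware destroy command; return non-zero to simulate failure. */
static int fw_destroy(struct tracked_res *res) { (void)res; return 0; }
/* Stub for the final bookkeeping teardown (cf. destroy_resource_common). */
static void destroy_common(struct tracked_res *res) { (void)res; }

static int destroy_retry_safe(struct tracked_res *res)
{
	int err;

	res->invalid = true;		/* 1: hide from new lookups first */
	err = fw_destroy(res);		/* 2: attempt the firmware destroy */
	if (err) {
		res->invalid = false;	/* roll back so the caller can retry */
		return err;
	}
	destroy_common(res);		/* 3: drop tracking state only on success */
	return 0;
}

int main(void)
{
	struct tracked_res rq = { .invalid = false };

	printf("destroy: %d\n", destroy_retry_safe(&rq));
	return 0;
}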
diff --git a/drivers/infiniband/hw/mlx5/restrack.c b/drivers/infiniband/hw/mlx5/restrack.c
index 4ac429e72004..67841922c7b8 100644
--- a/drivers/infiniband/hw/mlx5/restrack.c
+++ b/drivers/infiniband/hw/mlx5/restrack.c
@@ -96,9 +96,18 @@ static int fill_stat_mr_entry(struct sk_buff *msg, struct ib_mr *ibmr)
atomic64_read(&mr->odp_stats.faults)))
goto err_table;
if (rdma_nl_stat_hwcounter_entry(
+ msg, "page_faults_handled",
+ atomic64_read(&mr->odp_stats.faults_handled)))
+ goto err_table;
+ if (rdma_nl_stat_hwcounter_entry(
msg, "page_invalidations",
atomic64_read(&mr->odp_stats.invalidations)))
goto err_table;
+ if (rdma_nl_stat_hwcounter_entry(
+ msg, "page_invalidations_handled",
+ atomic64_read(&mr->odp_stats.invalidations_handled)))
+ goto err_table;
+
if (rdma_nl_stat_hwcounter_entry(msg, "page_prefetch",
atomic64_read(&mr->odp_stats.prefetch)))
goto err_table;
@@ -156,6 +165,34 @@ static int fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ibcq)
return fill_res_raw(msg, dev, MLX5_SGMT_TYPE_PRM_QUERY_CQ, cq->mcq.cqn);
}
+static int fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ibqp)
+{
+ struct mlx5_ib_qp *qp = to_mqp(ibqp);
+ int ret;
+
+ if (qp->type < IB_QPT_DRIVER)
+ return 0;
+
+ switch (qp->type) {
+ case MLX5_IB_QPT_REG_UMR:
+ ret = nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUBTYPE,
+ "REG_UMR");
+ break;
+ case MLX5_IB_QPT_DCT:
+ ret = nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUBTYPE, "DCT");
+ break;
+ case MLX5_IB_QPT_DCI:
+ ret = nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUBTYPE, "DCI");
+ break;
+ default:
+ return 0;
+ }
+ if (ret)
+ return ret;
+
+ return nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, IB_QPT_DRIVER);
+}
+
static int fill_res_qp_entry_raw(struct sk_buff *msg, struct ib_qp *ibqp)
{
struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
@@ -168,6 +205,7 @@ static const struct ib_device_ops restrack_ops = {
.fill_res_cq_entry_raw = fill_res_cq_entry_raw,
.fill_res_mr_entry = fill_res_mr_entry,
.fill_res_mr_entry_raw = fill_res_mr_entry_raw,
+ .fill_res_qp_entry = fill_res_qp_entry,
.fill_res_qp_entry_raw = fill_res_qp_entry_raw,
.fill_stat_mr_entry = fill_stat_mr_entry,
};
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index a056ea835da5..bcb6b324af50 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -199,20 +199,27 @@ int mlx5_ib_create_srq(struct ib_srq *ib_srq,
int err;
struct mlx5_srq_attr in = {};
__u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
+ __u32 max_sge_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq) /
+ sizeof(struct mlx5_wqe_data_seg);
if (init_attr->srq_type != IB_SRQT_BASIC &&
init_attr->srq_type != IB_SRQT_XRC &&
init_attr->srq_type != IB_SRQT_TM)
return -EOPNOTSUPP;
- /* Sanity check SRQ size before proceeding */
- if (init_attr->attr.max_wr >= max_srq_wqes) {
- mlx5_ib_dbg(dev, "max_wr %d, cap %d\n",
- init_attr->attr.max_wr,
- max_srq_wqes);
+ /* Sanity check SRQ and sge size before proceeding */
+ if (init_attr->attr.max_wr >= max_srq_wqes ||
+ init_attr->attr.max_sge > max_sge_sz) {
+ mlx5_ib_dbg(dev, "max_wr %d,wr_cap %d,max_sge %d, sge_cap:%d\n",
+ init_attr->attr.max_wr, max_srq_wqes,
+ init_attr->attr.max_sge, max_sge_sz);
return -EINVAL;
}
+ err = mlx5_ib_dev_res_cq_init(dev);
+ if (err)
+ return err;
+
mutex_init(&srq->mutex);
spin_lock_init(&srq->lock);
srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1);
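
To illustrate the new SGE bound in mlx5_ib_create_srq(): max_wqe_sz_rq is reported in bytes and each receive scatter entry consumes one mlx5_wqe_data_seg (16 bytes on common ABIs), so the cap is simply their ratio. The 512-byte value below is a made-up example, not a real HCA capability:

#include <stdio.h>
#include <stdint.h>

/* Layout of one receive scatter entry (mirrors mlx5_wqe_data_seg). */
struct wqe_data_seg {
	uint32_t byte_count;
	uint32_t lkey;
	uint64_t addr;
};

int main(void)
{
	unsigned int max_wqe_sz_rq = 512;	/* hypothetical HCA cap, in bytes */
	unsigned int max_sge = max_wqe_sz_rq / sizeof(struct wqe_data_seg);

	printf("max_wqe_sz_rq=%u bytes -> at most %u SGEs per SRQ WQE\n",
	       max_wqe_sz_rq, max_sge);
	return 0;
}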
diff --git a/drivers/infiniband/hw/mlx5/std_types.c b/drivers/infiniband/hw/mlx5/std_types.c
index bbfcce3bdc84..bdb568411091 100644
--- a/drivers/infiniband/hw/mlx5/std_types.c
+++ b/drivers/infiniband/hw/mlx5/std_types.c
@@ -10,6 +10,7 @@
#include <linux/mlx5/eswitch.h>
#include <linux/mlx5/vport.h>
#include "mlx5_ib.h"
+#include "data_direct.h"
#define UVERBS_MODULE_NAME mlx5_ib
#include <rdma/uverbs_named_ioctl.h>
@@ -111,6 +112,23 @@ out:
return err;
}
+static int fill_multiport_info(struct mlx5_ib_dev *dev, u32 port_num,
+ struct mlx5_ib_uapi_query_port *info)
+{
+ struct mlx5_core_dev *mdev;
+
+ mdev = mlx5_ib_get_native_port_mdev(dev, port_num, NULL);
+ if (!mdev)
+ return -EINVAL;
+
+ info->vport_vhca_id = MLX5_CAP_GEN(mdev, vhca_id);
+ info->flags |= MLX5_IB_UAPI_QUERY_PORT_VPORT_VHCA_ID;
+
+ mlx5_ib_put_native_port_mdev(dev, port_num);
+
+ return 0;
+}
+
static int fill_switchdev_info(struct mlx5_ib_dev *dev, u32 port_num,
struct mlx5_ib_uapi_query_port *info)
{
@@ -177,12 +195,60 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_QUERY_PORT)(
ret = fill_switchdev_info(dev, port_num, &info);
if (ret)
return ret;
+ } else if (mlx5_core_mp_enabled(dev->mdev)) {
+ ret = fill_multiport_info(dev, port_num, &info);
+ if (ret)
+ return ret;
}
return uverbs_copy_to_struct_or_zero(attrs, MLX5_IB_ATTR_QUERY_PORT, &info,
sizeof(info));
}
+static int UVERBS_HANDLER(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct mlx5_data_direct_dev *data_direct_dev;
+ struct mlx5_ib_ucontext *c;
+ struct mlx5_ib_dev *dev;
+ int out_len = uverbs_attr_get_len(attrs,
+ MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH);
+ u32 dev_path_len;
+ char *dev_path;
+ int ret;
+
+ c = to_mucontext(ib_uverbs_get_ucontext(attrs));
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+ dev = to_mdev(c->ibucontext.device);
+ mutex_lock(&dev->data_direct_lock);
+ data_direct_dev = dev->data_direct_dev;
+ if (!data_direct_dev) {
+ ret = -ENODEV;
+ goto end;
+ }
+
+ dev_path = kobject_get_path(&data_direct_dev->device->kobj, GFP_KERNEL);
+ if (!dev_path) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ dev_path_len = strlen(dev_path) + 1;
+ if (dev_path_len > out_len) {
+ ret = -ENOSPC;
+ goto end;
+ }
+
+ ret = uverbs_copy_to(attrs, MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH, dev_path,
+ dev_path_len);
+ kfree(dev_path);
+
+end:
+ mutex_unlock(&dev->data_direct_lock);
+ return ret;
+}
+
DECLARE_UVERBS_NAMED_METHOD(
MLX5_IB_METHOD_QUERY_PORT,
UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_QUERY_PORT_PORT_NUM,
@@ -193,9 +259,17 @@ DECLARE_UVERBS_NAMED_METHOD(
reg_c0),
UA_MANDATORY));
+DECLARE_UVERBS_NAMED_METHOD(
+ MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH,
+ UVERBS_ATTR_PTR_OUT(
+ MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH,
+ UVERBS_ATTR_MIN_SIZE(0),
+ UA_MANDATORY));
+
ADD_UVERBS_METHODS(mlx5_ib_device,
UVERBS_OBJECT_DEVICE,
- &UVERBS_METHOD(MLX5_IB_METHOD_QUERY_PORT));
+ &UVERBS_METHOD(MLX5_IB_METHOD_QUERY_PORT),
+ &UVERBS_METHOD(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH));
DECLARE_UVERBS_NAMED_METHOD(
MLX5_IB_METHOD_PD_QUERY,
diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c
index e76142f6fa88..5be4426a2884 100644
--- a/drivers/infiniband/hw/mlx5/umr.c
+++ b/drivers/infiniband/hw/mlx5/umr.c
@@ -135,22 +135,28 @@ static int mlx5r_umr_qp_rst2rts(struct mlx5_ib_dev *dev, struct ib_qp *qp)
int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev)
{
struct ib_qp_init_attr init_attr = {};
- struct ib_pd *pd;
struct ib_cq *cq;
struct ib_qp *qp;
- int ret;
+ int ret = 0;
- pd = ib_alloc_pd(&dev->ib_dev, 0);
- if (IS_ERR(pd)) {
- mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
- return PTR_ERR(pd);
- }
+
+ /*
+ * UMR qp is set once, never changed until device unload.
+ * Avoid taking the mutex if initialization is already done.
+ */
+ if (dev->umrc.qp)
+ return 0;
+
+ mutex_lock(&dev->umrc.init_lock);
+ /* First user allocates the UMR resources. Skip if already allocated. */
+ if (dev->umrc.qp)
+ goto unlock;
cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
if (IS_ERR(cq)) {
mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
ret = PTR_ERR(cq);
- goto destroy_pd;
+ goto unlock;
}
init_attr.send_cq = cq;
@@ -160,7 +166,7 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev)
init_attr.cap.max_send_sge = 1;
init_attr.qp_type = MLX5_IB_QPT_REG_UMR;
init_attr.port_num = 1;
- qp = ib_create_qp(pd, &init_attr);
+ qp = ib_create_qp(dev->umrc.pd, &init_attr);
if (IS_ERR(qp)) {
mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
ret = PTR_ERR(qp);
@@ -171,22 +177,22 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev)
if (ret)
goto destroy_qp;
- dev->umrc.qp = qp;
dev->umrc.cq = cq;
- dev->umrc.pd = pd;
sema_init(&dev->umrc.sem, MAX_UMR_WR);
mutex_init(&dev->umrc.lock);
dev->umrc.state = MLX5_UMR_STATE_ACTIVE;
+ dev->umrc.qp = qp;
+ mutex_unlock(&dev->umrc.init_lock);
return 0;
destroy_qp:
ib_destroy_qp(qp);
destroy_cq:
ib_free_cq(cq);
-destroy_pd:
- ib_dealloc_pd(pd);
+unlock:
+ mutex_unlock(&dev->umrc.init_lock);
return ret;
}
@@ -194,36 +200,38 @@ void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev)
{
if (dev->umrc.state == MLX5_UMR_STATE_UNINIT)
return;
+ mutex_destroy(&dev->umrc.lock);
+	/* After device init, the UMR cq/qp are not unset for the device's lifetime. */
ib_destroy_qp(dev->umrc.qp);
ib_free_cq(dev->umrc.cq);
- ib_dealloc_pd(dev->umrc.pd);
}
-static int mlx5r_umr_recover(struct mlx5_ib_dev *dev)
+int mlx5r_umr_init(struct mlx5_ib_dev *dev)
{
- struct umr_common *umrc = &dev->umrc;
- struct ib_qp_attr attr;
- int err;
+ struct ib_pd *pd;
- attr.qp_state = IB_QPS_RESET;
- err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
- if (err) {
- mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
- goto err;
+ pd = ib_alloc_pd(&dev->ib_dev, 0);
+ if (IS_ERR(pd)) {
+ mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
+ return PTR_ERR(pd);
}
+ dev->umrc.pd = pd;
- err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
- if (err)
- goto err;
+ mutex_init(&dev->umrc.init_lock);
- umrc->state = MLX5_UMR_STATE_ACTIVE;
return 0;
+}
-err:
- umrc->state = MLX5_UMR_STATE_ERR;
- return err;
+void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev)
+{
+ if (!dev->umrc.pd)
+ return;
+
+ mutex_destroy(&dev->umrc.init_lock);
+ ib_dealloc_pd(dev->umrc.pd);
}
+
static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
struct mlx5r_umr_wqe *wqe, bool with_data)
{
@@ -270,6 +278,61 @@ out:
return err;
}
+static int mlx5r_umr_recover(struct mlx5_ib_dev *dev, u32 mkey,
+ struct mlx5r_umr_context *umr_context,
+ struct mlx5r_umr_wqe *wqe, bool with_data)
+{
+ struct umr_common *umrc = &dev->umrc;
+ struct ib_qp_attr attr;
+ int err;
+
+ mutex_lock(&umrc->lock);
+	/* Prevent any further WRs from being sent now */
+ if (umrc->state != MLX5_UMR_STATE_RECOVER) {
+ mlx5_ib_warn(dev, "UMR recovery encountered an unexpected state=%d\n",
+ umrc->state);
+ umrc->state = MLX5_UMR_STATE_RECOVER;
+ }
+ mutex_unlock(&umrc->lock);
+
+	/* Send a final/barrier WR (the failed one) and wait for its completion.
+	 * This ensures that all the previous WRs got a completion before
+	 * we set the QP state to RESET.
+	 */
+ err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context->cqe, wqe,
+ with_data);
+ if (err) {
+ mlx5_ib_warn(dev, "UMR recovery post send failed, err %d\n", err);
+ goto err;
+ }
+
+ /* Since the QP is in an error state, it will only receive
+ * IB_WC_WR_FLUSH_ERR. However, as it serves only as a barrier
+ * we don't care about its status.
+ */
+ wait_for_completion(&umr_context->done);
+
+ attr.qp_state = IB_QPS_RESET;
+ err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
+ if (err) {
+ mlx5_ib_warn(dev, "Couldn't modify UMR QP to RESET, err=%d\n", err);
+ goto err;
+ }
+
+ err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
+ if (err) {
+ mlx5_ib_warn(dev, "Couldn't modify UMR QP to RTS, err=%d\n", err);
+ goto err;
+ }
+
+ umrc->state = MLX5_UMR_STATE_ACTIVE;
+ return 0;
+
+err:
+ umrc->state = MLX5_UMR_STATE_ERR;
+ return err;
+}
+
static void mlx5r_umr_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct mlx5_ib_umr_context *context =
@@ -334,9 +397,7 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
mlx5_ib_warn(dev,
"reg umr failed (%u). Trying to recover and resubmit the flushed WQEs, mkey = %u\n",
umr_context.status, mkey);
- mutex_lock(&umrc->lock);
- err = mlx5r_umr_recover(dev);
- mutex_unlock(&umrc->lock);
+ err = mlx5r_umr_recover(dev, mkey, &umr_context, wqe, with_data);
if (err)
mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n",
err);
@@ -603,44 +664,47 @@ static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev,
wqe->data_seg.byte_count = cpu_to_be32(sg->length);
}
-/*
- * Send the DMA list to the HW for a normal MR using UMR.
- * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
- * flag may be used.
- */
-int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
+static int
+_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd)
{
+ size_t ent_size = dd ? sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt);
struct mlx5_ib_dev *dev = mr_to_mdev(mr);
struct device *ddev = &dev->mdev->pdev->dev;
struct mlx5r_umr_wqe wqe = {};
struct ib_block_iter biter;
+ struct mlx5_ksm *cur_ksm;
struct mlx5_mtt *cur_mtt;
size_t orig_sg_length;
- struct mlx5_mtt *mtt;
size_t final_size;
+ void *curr_entry;
struct ib_sge sg;
+ void *entry;
u64 offset = 0;
int err = 0;
- if (WARN_ON(mr->umem->is_odp))
- return -EINVAL;
-
- mtt = mlx5r_umr_create_xlt(
- dev, &sg, ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift),
- sizeof(*mtt), flags);
- if (!mtt)
+ entry = mlx5r_umr_create_xlt(dev, &sg,
+ ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift),
+ ent_size, flags);
+ if (!entry)
return -ENOMEM;
orig_sg_length = sg.length;
-
mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg);
mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr,
mr->page_shift);
+ if (dd) {
+ /* Use the data direct internal kernel PD */
+ MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn);
+ cur_ksm = entry;
+ } else {
+ cur_mtt = entry;
+ }
+
mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg);
- cur_mtt = mtt;
+ curr_entry = entry;
rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) {
- if (cur_mtt == (void *)mtt + sg.length) {
+ if (curr_entry == entry + sg.length) {
dma_sync_single_for_device(ddev, sg.addr, sg.length,
DMA_TO_DEVICE);
@@ -652,23 +716,31 @@ int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
DMA_TO_DEVICE);
offset += sg.length;
mlx5r_umr_update_offset(&wqe.ctrl_seg, offset);
-
- cur_mtt = mtt;
+ if (dd)
+ cur_ksm = entry;
+ else
+ cur_mtt = entry;
}
- cur_mtt->ptag =
- cpu_to_be64(rdma_block_iter_dma_address(&biter) |
- MLX5_IB_MTT_PRESENT);
-
- if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
- cur_mtt->ptag = 0;
-
- cur_mtt++;
+ if (dd) {
+ cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter));
+ cur_ksm->key = cpu_to_be32(dev->ddr.mkey);
+ cur_ksm++;
+ curr_entry = cur_ksm;
+ } else {
+ cur_mtt->ptag =
+ cpu_to_be64(rdma_block_iter_dma_address(&biter) |
+ MLX5_IB_MTT_PRESENT);
+ if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
+ cur_mtt->ptag = 0;
+ cur_mtt++;
+ curr_entry = cur_mtt;
+ }
}
- final_size = (void *)cur_mtt - (void *)mtt;
+ final_size = curr_entry - entry;
sg.length = ALIGN(final_size, MLX5_UMR_FLEX_ALIGNMENT);
- memset(cur_mtt, 0, sg.length - final_size);
+ memset(curr_entry, 0, sg.length - final_size);
mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags);
dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
@@ -676,10 +748,32 @@ int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
err:
sg.length = orig_sg_length;
- mlx5r_umr_unmap_free_xlt(dev, mtt, &sg);
+ mlx5r_umr_unmap_free_xlt(dev, entry, &sg);
return err;
}
+int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags)
+{
+ /* No invalidation flow is expected */
+ if (WARN_ON(!mr->umem->is_dmabuf) || (flags & MLX5_IB_UPD_XLT_ZAP))
+ return -EINVAL;
+
+ return _mlx5r_umr_update_mr_pas(mr, flags, true);
+}
+
+/*
+ * Send the DMA list to the HW for a normal MR using UMR.
+ * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
+ * flag may be used.
+ */
+int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
+{
+ if (WARN_ON(mr->umem->is_odp))
+ return -EINVAL;
+
+ return _mlx5r_umr_update_mr_pas(mr, flags, false);
+}
+
static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
{
return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
@@ -746,7 +840,17 @@ int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
size_to_map = npages * desc_size;
dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
DMA_TO_DEVICE);
- mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
+ /*
+ * npages is the maximum number of pages to map, but we
+ * can't guarantee that all pages are actually mapped.
+ *
+	 * For example, if a page is a p2p page of a type that is not
+	 * supported for mapping, the number of pages mapped will be less
+	 * than requested.
+ */
+ err = mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
+ if (err)
+ return err;
dma_sync_single_for_device(ddev, sg.addr, sg.length,
DMA_TO_DEVICE);
sg.length = ALIGN(size_to_map, MLX5_UMR_FLEX_ALIGNMENT);
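
The reworked mlx5r_umr_resource_init() above follows a lazy, first-user initialization pattern: an unlocked fast-path check of umrc.qp, a re-check under init_lock, and publishing the QP pointer only after everything else is set up. A stripped-down sketch of that control flow with generic names (strictly portable userspace code would use an atomic load for the unlocked check; the driver relies on the mutex-protected slow path):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct lazy_umr {
	pthread_mutex_t init_lock;
	void *cq;
	void *qp;	/* published last: non-NULL means fully initialized */
};

static void *alloc_obj(void)	/* stand-in for CQ/QP allocation */
{
	return malloc(1);
}

static int lazy_umr_init(struct lazy_umr *u)
{
	int ret = 0;

	if (u->qp)				/* fast path, no lock taken */
		return 0;

	pthread_mutex_lock(&u->init_lock);
	if (u->qp)				/* lost the race, already done */
		goto unlock;

	u->cq = alloc_obj();
	if (!u->cq) {
		ret = -1;
		goto unlock;
	}
	u->qp = alloc_obj();			/* publish qp as the last step */
	if (!u->qp) {
		free(u->cq);
		u->cq = NULL;
		ret = -1;
	}
unlock:
	pthread_mutex_unlock(&u->init_lock);
	return ret;
}

int main(void)
{
	struct lazy_umr u = { .init_lock = PTHREAD_MUTEX_INITIALIZER };

	printf("first init:  %d\n", lazy_umr_init(&u));
	printf("second init: %d\n", lazy_umr_init(&u));	/* hits the fast path */
	return 0;
}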
diff --git a/drivers/infiniband/hw/mlx5/umr.h b/drivers/infiniband/hw/mlx5/umr.h
index 3799bb758e49..4a02c9b5aad8 100644
--- a/drivers/infiniband/hw/mlx5/umr.h
+++ b/drivers/infiniband/hw/mlx5/umr.h
@@ -16,6 +16,9 @@
int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev);
void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev);
+int mlx5r_umr_init(struct mlx5_ib_dev *dev);
+void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev);
+
static inline bool mlx5r_umr_can_load_pas(struct mlx5_ib_dev *dev,
size_t length)
{
@@ -92,6 +95,7 @@ int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr);
int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd,
int access_flags);
int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags);
+int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags);
int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
int page_shift, int flags);
diff --git a/drivers/infiniband/hw/mlx5/wr.c b/drivers/infiniband/hw/mlx5/wr.c
index df1d1b0a3ef7..9947feb7fb8a 100644
--- a/drivers/infiniband/hw/mlx5/wr.c
+++ b/drivers/infiniband/hw/mlx5/wr.c
@@ -78,7 +78,7 @@ static void set_eth_seg(const struct ib_send_wr *wr, struct mlx5_ib_qp *qp,
*/
copysz = min_t(u64, *cur_edge - (void *)eseg->inline_hdr.start,
left);
- memcpy(eseg->inline_hdr.start, pdata, copysz);
+ memcpy(eseg->inline_hdr.data, pdata, copysz);
stride = ALIGN(sizeof(struct mlx5_wqe_eth_seg) -
sizeof(eseg->inline_hdr.start) + copysz, 16);
*size += stride / 16;