Diffstat (limited to 'drivers/infiniband')
-rw-r--r--  drivers/infiniband/Kconfig | 1
-rw-r--r--  drivers/infiniband/core/cm.c | 9
-rw-r--r--  drivers/infiniband/core/cma.c | 2
-rw-r--r--  drivers/infiniband/core/device.c | 4
-rw-r--r--  drivers/infiniband/core/restrack.c | 4
-rw-r--r--  drivers/infiniband/core/ucma.c | 2
-rw-r--r--  drivers/infiniband/core/umem.c | 8
-rw-r--r--  drivers/infiniband/core/verbs.c | 3
-rw-r--r--  drivers/infiniband/hw/Makefile | 1
-rw-r--r--  drivers/infiniband/hw/bng_re/Kconfig | 10
-rw-r--r--  drivers/infiniband/hw/bng_re/Makefile | 8
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_debugfs.c | 39
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_debugfs.h | 12
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_dev.c | 534
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_fw.c | 767
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_fw.h | 211
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_re.h | 85
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_res.c | 279
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_res.h | 215
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_sp.c | 131
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_sp.h | 47
-rw-r--r--  drivers/infiniband/hw/bng_re/bng_tlv.h | 128
-rw-r--r--  drivers/infiniband/hw/bnxt_re/bnxt_re.h | 2
-rw-r--r--  drivers/infiniband/hw/bnxt_re/debugfs.c | 128
-rw-r--r--  drivers/infiniband/hw/bnxt_re/debugfs.h | 19
-rw-r--r--  drivers/infiniband/hw/bnxt_re/ib_verbs.c | 8
-rw-r--r--  drivers/infiniband/hw/bnxt_re/main.c | 1
-rw-r--r--  drivers/infiniband/hw/bnxt_re/qplib_fp.c | 3
-rw-r--r--  drivers/infiniband/hw/bnxt_re/qplib_fp.h | 1
-rw-r--r--  drivers/infiniband/hw/bnxt_re/qplib_sp.c | 8
-rw-r--r--  drivers/infiniband/hw/bnxt_re/qplib_sp.h | 2
-rw-r--r--  drivers/infiniband/hw/cxgb4/mem.c | 2
-rw-r--r--  drivers/infiniband/hw/hfi1/init.c | 4
-rw-r--r--  drivers/infiniband/hw/hfi1/opfn.c | 4
-rw-r--r--  drivers/infiniband/hw/hns/Makefile | 4
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_ah.c | 1
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_bond.c | 1012
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_bond.h | 95
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_device.h | 16
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 141
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 20
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_main.c | 185
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_pd.c | 1
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_qp.c | 5
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_srq.c | 1
-rw-r--r--  drivers/infiniband/hw/irdma/cm.c | 2
-rw-r--r--  drivers/infiniband/hw/irdma/ctrl.c | 107
-rw-r--r--  drivers/infiniband/hw/irdma/hw.c | 3
-rw-r--r--  drivers/infiniband/hw/irdma/icrdma_if.c | 6
-rw-r--r--  drivers/infiniband/hw/irdma/ig3rdma_if.c | 4
-rw-r--r--  drivers/infiniband/hw/irdma/main.h | 3
-rw-r--r--  drivers/infiniband/hw/irdma/pble.c | 6
-rw-r--r--  drivers/infiniband/hw/irdma/puda.c | 20
-rw-r--r--  drivers/infiniband/hw/irdma/type.h | 5
-rw-r--r--  drivers/infiniband/hw/irdma/uk.c | 67
-rw-r--r--  drivers/infiniband/hw/irdma/user.h | 6
-rw-r--r--  drivers/infiniband/hw/irdma/utils.c | 58
-rw-r--r--  drivers/infiniband/hw/irdma/verbs.c | 49
-rw-r--r--  drivers/infiniband/hw/irdma/verbs.h | 3
-rw-r--r--  drivers/infiniband/hw/mlx4/cm.c | 2
-rw-r--r--  drivers/infiniband/hw/mlx5/devx.c | 14
-rw-r--r--  drivers/infiniband/hw/mlx5/fs.c | 65
-rw-r--r--  drivers/infiniband/hw/mlx5/ib_rep.c | 74
-rw-r--r--  drivers/infiniband/hw/mlx5/main.c | 4
-rw-r--r--  drivers/infiniband/hw/mlx5/odp.c | 93
-rw-r--r--  drivers/infiniband/hw/mlx5/qp.c | 5
-rw-r--r--  drivers/infiniband/sw/rdmavt/cq.c | 3
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_mr.c | 1
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_net.c | 49
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_odp.c | 1
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_qp.c | 49
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_srq.c | 7
-rw-r--r--  drivers/infiniband/sw/siw/siw_cm.c | 51
-rw-r--r--  drivers/infiniband/ulp/iser/iscsi_iser.c | 2
-rw-r--r--  drivers/infiniband/ulp/isert/ib_isert.c | 2
-rw-r--r--  drivers/infiniband/ulp/rtrs/rtrs-srv.c | 2
76 files changed, 4542 insertions(+), 384 deletions(-)
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index f0323f1d6f01..794b9778816b 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -80,6 +80,7 @@ config INFINIBAND_VIRT_DMA
if INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS
if !UML
source "drivers/infiniband/hw/bnxt_re/Kconfig"
+source "drivers/infiniband/hw/bng_re/Kconfig"
source "drivers/infiniband/hw/cxgb4/Kconfig"
source "drivers/infiniband/hw/efa/Kconfig"
source "drivers/infiniband/hw/erdma/Kconfig"
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 01bede8ba105..024df6ee239d 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -34,7 +34,6 @@ MODULE_AUTHOR("Sean Hefty");
MODULE_DESCRIPTION("InfiniBand CM");
MODULE_LICENSE("Dual BSD/GPL");
-#define CM_DESTROY_ID_WAIT_TIMEOUT 10000 /* msecs */
#define CM_DIRECT_RETRY_CTX ((void *) 1UL)
#define CM_MRA_SETTING 24 /* 4.096us * 2^24 = ~68.7 seconds */
@@ -1057,6 +1056,7 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
{
struct cm_id_private *cm_id_priv;
enum ib_cm_state old_state;
+ unsigned long timeout;
struct cm_work *work;
int ret;
@@ -1167,10 +1167,9 @@ retest:
xa_erase(&cm.local_id_table, cm_local_id(cm_id->local_id));
cm_deref_id(cm_id_priv);
+ timeout = msecs_to_jiffies((cm_id_priv->max_cm_retries * cm_id_priv->timeout_ms * 5) / 4);
do {
- ret = wait_for_completion_timeout(&cm_id_priv->comp,
- msecs_to_jiffies(
- CM_DESTROY_ID_WAIT_TIMEOUT));
+ ret = wait_for_completion_timeout(&cm_id_priv->comp, timeout);
if (!ret) /* timeout happened */
cm_destroy_id_wait_timeout(cm_id, old_state);
} while (!ret);
@@ -4518,7 +4517,7 @@ static int __init ib_cm_init(void)
get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);
INIT_LIST_HEAD(&cm.timewait_list);
- cm.wq = alloc_workqueue("ib_cm", 0, 1);
+ cm.wq = alloc_workqueue("ib_cm", WQ_PERCPU, 1);
if (!cm.wq) {
ret = -ENOMEM;
goto error2;
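
The cm.c hunk above replaces the fixed CM_DESTROY_ID_WAIT_TIMEOUT (10 s) with a bound derived from the id's own retry budget: in the worst case a message is retransmitted max_cm_retries times with timeout_ms per attempt, and the 5/4 factor adds 25% headroom. A minimal sketch of the computation; the formula is the patch's, the example numbers are not:

#include <linux/jiffies.h>

/* Worst-case destroy wait: retries * per-try timeout, plus 25% slack. */
static unsigned long cm_destroy_timeout(u8 max_cm_retries, u32 timeout_ms)
{
	/* e.g. 15 retries * 4000 ms * 5/4 = 75000 ms */
	return msecs_to_jiffies((max_cm_retries * timeout_ms * 5) / 4);
}
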
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 5b2d3ae3f9fc..95e89f5c147c 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -4475,6 +4475,8 @@ int rdma_connect_locked(struct rdma_cm_id *id,
container_of(id, struct rdma_id_private, id);
int ret;
+ lockdep_assert_held(&id_priv->handler_mutex);
+
if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT))
return -EINVAL;
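
rdma_connect_locked() exists for callers that already hold the id's handler_mutex, i.e. code running inside a CM event handler; the added lockdep_assert_held() turns that contract into a runtime check on lockdep-enabled kernels. A sketch of the calling context, with a hypothetical handler:

/* Hypothetical CM event handler: the RDMA CM core invokes handlers with
 * id_priv->handler_mutex held, so the _locked variant must be used here
 * (plain rdma_connect() would deadlock on the same mutex).
 */
static int my_cm_handler(struct rdma_cm_id *id, struct rdma_cm_event *ev)
{
	struct rdma_conn_param param = {};	/* illustrative, all defaults */

	if (ev->event == RDMA_CM_EVENT_ROUTE_RESOLVED)
		return rdma_connect_locked(id, &param);
	return 0;
}
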
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index b4f3c835844a..13e8a1714bbd 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -3021,7 +3021,7 @@ static int __init ib_core_init(void)
{
int ret = -ENOMEM;
- ib_wq = alloc_workqueue("infiniband", 0, 0);
+ ib_wq = alloc_workqueue("infiniband", WQ_PERCPU, 0);
if (!ib_wq)
return -ENOMEM;
@@ -3031,7 +3031,7 @@ static int __init ib_core_init(void)
goto err;
ib_comp_wq = alloc_workqueue("ib-comp-wq",
- WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+ WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS | WQ_PERCPU, 0);
if (!ib_comp_wq)
goto err_unbound;
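
The WQ_PERCPU additions in this file and in cm.c above look like part of the tree-wide conversion that makes workqueue placement explicit: call sites that relied on the historical per-CPU default now spell it out, ahead of the default flipping to unbound. A sketch of the distinction, assuming that conversion is the motivation here:

#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static int example_init(void)
{
	/* Explicitly per-CPU: work items run on the CPU that queued them
	 * (the old implicit default). Passing WQ_UNBOUND instead would
	 * let the scheduler place the work on any CPU.
	 */
	example_wq = alloc_workqueue("example_wq", WQ_PERCPU, 0);
	return example_wq ? 0 : -ENOMEM;
}
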
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index a7de6f403fca..b097cfcade1c 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -175,7 +175,7 @@ void rdma_restrack_new(struct rdma_restrack_entry *res,
EXPORT_SYMBOL(rdma_restrack_new);
/**
- * rdma_restrack_add() - add object to the reource tracking database
+ * rdma_restrack_add() - add object to the resource tracking database
* @res: resource entry
*/
void rdma_restrack_add(struct rdma_restrack_entry *res)
@@ -277,7 +277,7 @@ int rdma_restrack_put(struct rdma_restrack_entry *res)
EXPORT_SYMBOL(rdma_restrack_put);
/**
- * rdma_restrack_del() - delete object from the reource tracking database
+ * rdma_restrack_del() - delete object from the resource tracking database
* @res: resource entry
*/
void rdma_restrack_del(struct rdma_restrack_entry *res)
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index f86ece701db6..ec3be65a2b88 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -366,7 +366,7 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id,
if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
xa_lock(&ctx_table);
if (xa_load(&ctx_table, ctx->id) == ctx)
- queue_work(system_unbound_wq, &ctx->close_work);
+ queue_work(system_dfl_wq, &ctx->close_work);
xa_unlock(&ctx_table);
}
return 0;
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index c5b686394760..8137031c2a65 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -45,6 +45,8 @@
#include "uverbs.h"
+#define RESCHED_LOOP_CNT_THRESHOLD 0x1000
+
static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
{
bool make_dirty = umem->writable && dirty;
@@ -55,10 +57,14 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
ib_dma_unmap_sgtable_attrs(dev, &umem->sgt_append.sgt,
DMA_BIDIRECTIONAL, 0);
- for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i)
+ for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i) {
unpin_user_page_range_dirty_lock(sg_page(sg),
DIV_ROUND_UP(sg->length, PAGE_SIZE), make_dirty);
+ if (i && !(i % RESCHED_LOOP_CNT_THRESHOLD))
+ cond_resched();
+ }
+
sg_free_append_table(&umem->sgt_append);
}
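
The umem hunk bounds scheduling latency when tearing down very large memory registrations: the unpin loop can iterate over an enormous scatter table, so it now yields every 0x1000 entries. The throttled-cond_resched() pattern generalizes to any long kernel loop; a minimal sketch in which the loop body is a stand-in, not the umem code:

#include <linux/mm.h>
#include <linux/sched.h>

#define RESCHED_LOOP_CNT_THRESHOLD 0x1000

static void release_page_array(struct page **pages, unsigned long npages)
{
	unsigned long i;

	for (i = 0; i < npages; i++) {
		put_page(pages[i]);	/* stand-in for the real unpin */
		/* Yield periodically so a huge teardown cannot hog the
		 * CPU on non-preemptible kernels.
		 */
		if (i && !(i % RESCHED_LOOP_CNT_THRESHOLD))
			cond_resched();
	}
}
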
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 3a5f81402d2f..11b1a194de44 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -148,6 +148,7 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate)
case IB_RATE_400_GBPS: return 160;
case IB_RATE_600_GBPS: return 240;
case IB_RATE_800_GBPS: return 320;
+ case IB_RATE_1600_GBPS: return 640;
default: return -1;
}
}
@@ -178,6 +179,7 @@ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult)
case 160: return IB_RATE_400_GBPS;
case 240: return IB_RATE_600_GBPS;
case 320: return IB_RATE_800_GBPS;
+ case 640: return IB_RATE_1600_GBPS;
default: return IB_RATE_PORT_CURRENT;
}
}
@@ -208,6 +210,7 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
case IB_RATE_400_GBPS: return 425000;
case IB_RATE_600_GBPS: return 637500;
case IB_RATE_800_GBPS: return 850000;
+ case IB_RATE_1600_GBPS: return 1700000;
default: return -1;
}
}
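
The three IB_RATE_1600_GBPS cases follow the existing table's conventions: the multiplier is the nominal rate divided by the 2.5 Gb/s base lane (1600 / 2.5 = 640), and the Mb/s figure is the signaling rate, scaling linearly from the 400 Gb/s entry (4 x 425000 = 1700000). A compile-time sanity check sketched under those assumptions:

#include <linux/build_bug.h>

static void check_ib_rate_1600g(void)
{
	BUILD_BUG_ON(1600 * 10 / 25 != 640);	/* ib_rate_to_mult() */
	BUILD_BUG_ON(4 * 425000 != 1700000);	/* ib_rate_to_mbps() */
}
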
diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile
index b706dc0d0263..c42b22ac3303 100644
--- a/drivers/infiniband/hw/Makefile
+++ b/drivers/infiniband/hw/Makefile
@@ -13,5 +13,6 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/
obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns/
obj-$(CONFIG_INFINIBAND_QEDR) += qedr/
obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re/
+obj-$(CONFIG_INFINIBAND_BNG_RE) += bng_re/
obj-$(CONFIG_INFINIBAND_ERDMA) += erdma/
obj-$(CONFIG_INFINIBAND_IONIC) += ionic/
diff --git a/drivers/infiniband/hw/bng_re/Kconfig b/drivers/infiniband/hw/bng_re/Kconfig
new file mode 100644
index 000000000000..85845f72c64d
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/Kconfig
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config INFINIBAND_BNG_RE
+ tristate "Broadcom Next generation RoCE HCA support"
+ depends on 64BIT
+ depends on INET && DCB && BNGE
+ help
+ This driver supports Broadcom Next generation
+ 50/100/200/400/800 gigabit RoCE HCAs. The module
+ will be called bng_re. To compile this driver
+ as a module, choose M here.
diff --git a/drivers/infiniband/hw/bng_re/Makefile b/drivers/infiniband/hw/bng_re/Makefile
new file mode 100644
index 000000000000..c6aaaf853c77
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+ccflags-y := -I $(srctree)/drivers/net/ethernet/broadcom/bnge -I $(srctree)/drivers/infiniband/hw/bnxt_re
+
+obj-$(CONFIG_INFINIBAND_BNG_RE) += bng_re.o
+
+bng_re-y := bng_dev.o bng_fw.o \
+ bng_res.o bng_sp.o \
+ bng_debugfs.o
diff --git a/drivers/infiniband/hw/bng_re/bng_debugfs.c b/drivers/infiniband/hw/bng_re/bng_debugfs.c
new file mode 100644
index 000000000000..9ec5a8785250
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_debugfs.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2025 Broadcom.
+#include <linux/debugfs.h>
+#include <linux/pci.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "bng_res.h"
+#include "bng_fw.h"
+#include "bnge.h"
+#include "bnge_auxr.h"
+#include "bng_re.h"
+#include "bng_debugfs.h"
+
+static struct dentry *bng_re_debugfs_root;
+
+void bng_re_debugfs_add_pdev(struct bng_re_dev *rdev)
+{
+ struct pci_dev *pdev = rdev->aux_dev->pdev;
+
+ rdev->dbg_root =
+ debugfs_create_dir(dev_name(&pdev->dev), bng_re_debugfs_root);
+}
+
+void bng_re_debugfs_rem_pdev(struct bng_re_dev *rdev)
+{
+ debugfs_remove_recursive(rdev->dbg_root);
+ rdev->dbg_root = NULL;
+}
+
+void bng_re_register_debugfs(void)
+{
+ bng_re_debugfs_root = debugfs_create_dir("bng_re", NULL);
+}
+
+void bng_re_unregister_debugfs(void)
+{
+ debugfs_remove(bng_re_debugfs_root);
+}
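
So far the debugfs code only creates directories: a module-wide bng_re root at registration time and one per-PCI-function directory at device add. A later patch could hang attributes off rdev->dbg_root; the sketch below is hypothetical (no such file exists in this series) and only illustrates the intended layout:

/* Hypothetical attribute under the per-device directory; not part of
 * this patch. Exposes the CMDQ sequence counter read-only.
 */
static void bng_re_debugfs_add_attrs(struct bng_re_dev *rdev)
{
	debugfs_create_u32("cmdq_seq", 0444, rdev->dbg_root,
			   &rdev->rcfw.cmdq.seq_num);
}
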
diff --git a/drivers/infiniband/hw/bng_re/bng_debugfs.h b/drivers/infiniband/hw/bng_re/bng_debugfs.h
new file mode 100644
index 000000000000..baef71df4242
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_debugfs.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2025 Broadcom.
+
+#ifndef __BNG_RE_DEBUGFS__
+#define __BNG_RE_DEBUGFS__
+
+void bng_re_debugfs_add_pdev(struct bng_re_dev *rdev);
+void bng_re_debugfs_rem_pdev(struct bng_re_dev *rdev);
+
+void bng_re_register_debugfs(void);
+void bng_re_unregister_debugfs(void);
+#endif
diff --git a/drivers/infiniband/hw/bng_re/bng_dev.c b/drivers/infiniband/hw/bng_re/bng_dev.c
new file mode 100644
index 000000000000..d8f8d7f7075f
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_dev.c
@@ -0,0 +1,534 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2025 Broadcom.
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/auxiliary_bus.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "bng_res.h"
+#include "bng_sp.h"
+#include "bng_fw.h"
+#include "bnge.h"
+#include "bnge_auxr.h"
+#include "bng_re.h"
+#include "bnge_hwrm.h"
+#include "bng_debugfs.h"
+
+MODULE_AUTHOR("Siva Reddy Kallam <siva.kallam@broadcom.com>");
+MODULE_DESCRIPTION(BNG_RE_DESC);
+MODULE_LICENSE("Dual BSD/GPL");
+
+static struct bng_re_dev *bng_re_dev_add(struct auxiliary_device *adev,
+ struct bnge_auxr_dev *aux_dev)
+{
+ struct bng_re_dev *rdev;
+
+ /* Allocate bng_re_dev instance */
+ rdev = ib_alloc_device(bng_re_dev, ibdev);
+ if (!rdev) {
+ pr_err("%s: bng_re_dev allocation failure!\n", KBUILD_MODNAME);
+ return NULL;
+ }
+
+ /* Assign auxiliary device specific data */
+ rdev->netdev = aux_dev->net;
+ rdev->aux_dev = aux_dev;
+ rdev->adev = adev;
+ rdev->fn_id = rdev->aux_dev->pdev->devfn;
+
+ return rdev;
+}
+
+static int bng_re_register_netdev(struct bng_re_dev *rdev)
+{
+ struct bnge_auxr_dev *aux_dev;
+
+ aux_dev = rdev->aux_dev;
+ return bnge_register_dev(aux_dev, rdev->adev);
+}
+
+static void bng_re_destroy_chip_ctx(struct bng_re_dev *rdev)
+{
+ struct bng_re_chip_ctx *chip_ctx;
+
+ if (!rdev->chip_ctx)
+ return;
+
+ kfree(rdev->dev_attr);
+ rdev->dev_attr = NULL;
+
+ chip_ctx = rdev->chip_ctx;
+ rdev->chip_ctx = NULL;
+ rdev->rcfw.res = NULL;
+ rdev->bng_res.cctx = NULL;
+ rdev->bng_res.pdev = NULL;
+ kfree(chip_ctx);
+}
+
+static int bng_re_setup_chip_ctx(struct bng_re_dev *rdev)
+{
+ struct bng_re_chip_ctx *chip_ctx;
+ struct bnge_auxr_dev *aux_dev;
+ int rc = -ENOMEM;
+
+ aux_dev = rdev->aux_dev;
+ rdev->bng_res.pdev = aux_dev->pdev;
+ rdev->rcfw.res = &rdev->bng_res;
+ chip_ctx = kzalloc(sizeof(*chip_ctx), GFP_KERNEL);
+ if (!chip_ctx)
+ return -ENOMEM;
+ chip_ctx->chip_num = aux_dev->chip_num;
+ chip_ctx->hw_stats_size = aux_dev->hw_ring_stats_size;
+
+ rdev->chip_ctx = chip_ctx;
+ rdev->bng_res.cctx = rdev->chip_ctx;
+ rdev->dev_attr = kzalloc(sizeof(*rdev->dev_attr), GFP_KERNEL);
+ if (!rdev->dev_attr)
+ goto free_chip_ctx;
+ rdev->bng_res.dattr = rdev->dev_attr;
+
+ return 0;
+free_chip_ctx:
+ kfree(rdev->chip_ctx);
+ rdev->chip_ctx = NULL;
+ return rc;
+}
+
+static void bng_re_init_hwrm_hdr(struct input *hdr, u16 opcd)
+{
+ hdr->req_type = cpu_to_le16(opcd);
+ hdr->cmpl_ring = cpu_to_le16(-1);
+ hdr->target_id = cpu_to_le16(-1);
+}
+
+static void bng_re_fill_fw_msg(struct bnge_fw_msg *fw_msg, void *msg,
+ int msg_len, void *resp, int resp_max_len,
+ int timeout)
+{
+ fw_msg->msg = msg;
+ fw_msg->msg_len = msg_len;
+ fw_msg->resp = resp;
+ fw_msg->resp_max_len = resp_max_len;
+ fw_msg->timeout = timeout;
+}
+
+static int bng_re_net_ring_free(struct bng_re_dev *rdev,
+ u16 fw_ring_id, int type)
+{
+ struct bnge_auxr_dev *aux_dev = rdev->aux_dev;
+ struct hwrm_ring_free_input req = {};
+ struct hwrm_ring_free_output resp;
+ struct bnge_fw_msg fw_msg = {};
+ int rc = -EINVAL;
+
+ if (!aux_dev)
+ return rc;
+
+ bng_re_init_hwrm_hdr((void *)&req, HWRM_RING_FREE);
+ req.ring_type = type;
+ req.ring_id = cpu_to_le16(fw_ring_id);
+ bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
+ sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT);
+ rc = bnge_send_msg(aux_dev, &fw_msg);
+ if (rc)
+ ibdev_err(&rdev->ibdev, "Failed to free HW ring %d: %#x",
+ req.ring_id, rc);
+ return rc;
+}
+
+static int bng_re_net_ring_alloc(struct bng_re_dev *rdev,
+ struct bng_re_ring_attr *ring_attr,
+ u16 *fw_ring_id)
+{
+ struct bnge_auxr_dev *aux_dev = rdev->aux_dev;
+ struct hwrm_ring_alloc_input req = {};
+ struct hwrm_ring_alloc_output resp;
+ struct bnge_fw_msg fw_msg = {};
+ int rc = -EINVAL;
+
+ if (!aux_dev)
+ return rc;
+
+ bng_re_init_hwrm_hdr((void *)&req, HWRM_RING_ALLOC);
+ req.enables = 0;
+ req.page_tbl_addr = cpu_to_le64(ring_attr->dma_arr[0]);
+ if (ring_attr->pages > 1) {
+ /* Page size is in log2 units */
+ req.page_size = BNGE_PAGE_SHIFT;
+ req.page_tbl_depth = 1;
+ }
+ req.fbo = 0;
+ /* Association of ring index with doorbell index and MSIX number */
+ req.logical_id = cpu_to_le16(ring_attr->lrid);
+ req.length = cpu_to_le32(ring_attr->depth + 1);
+ req.ring_type = ring_attr->type;
+ req.int_mode = ring_attr->mode;
+ bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
+ sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT);
+ rc = bnge_send_msg(aux_dev, &fw_msg);
+ if (!rc)
+ *fw_ring_id = le16_to_cpu(resp.ring_id);
+
+ return rc;
+}
+
+static int bng_re_stats_ctx_free(struct bng_re_dev *rdev)
+{
+ struct bnge_auxr_dev *aux_dev = rdev->aux_dev;
+ struct hwrm_stat_ctx_free_input req = {};
+ struct hwrm_stat_ctx_free_output resp = {};
+ struct bnge_fw_msg fw_msg = {};
+ int rc = -EINVAL;
+
+ if (!aux_dev)
+ return rc;
+
+ bng_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_FREE);
+ req.stat_ctx_id = cpu_to_le32(rdev->stats_ctx.fw_id);
+ bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
+ sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT);
+ rc = bnge_send_msg(aux_dev, &fw_msg);
+ if (rc)
+ ibdev_err(&rdev->ibdev, "Failed to free HW stats context %#x",
+ rc);
+
+ return rc;
+}
+
+static int bng_re_stats_ctx_alloc(struct bng_re_dev *rdev)
+{
+ struct bnge_auxr_dev *aux_dev = rdev->aux_dev;
+ struct bng_re_stats *stats = &rdev->stats_ctx;
+ struct hwrm_stat_ctx_alloc_output resp = {};
+ struct hwrm_stat_ctx_alloc_input req = {};
+ struct bnge_fw_msg fw_msg = {};
+ int rc = -EINVAL;
+
+ stats->fw_id = BNGE_INVALID_STATS_CTX_ID;
+
+ if (!aux_dev)
+ return rc;
+
+ bng_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_ALLOC);
+ req.update_period_ms = cpu_to_le32(1000);
+ req.stats_dma_addr = cpu_to_le64(stats->dma_map);
+ req.stats_dma_length = cpu_to_le16(rdev->chip_ctx->hw_stats_size);
+ req.stat_ctx_flags = STAT_CTX_ALLOC_REQ_STAT_CTX_FLAGS_ROCE;
+ bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
+ sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT);
+ rc = bnge_send_msg(aux_dev, &fw_msg);
+ if (!rc)
+ stats->fw_id = le32_to_cpu(resp.stat_ctx_id);
+ return rc;
+}
+
+static void bng_re_query_hwrm_version(struct bng_re_dev *rdev)
+{
+ struct bnge_auxr_dev *aux_dev = rdev->aux_dev;
+ struct hwrm_ver_get_output ver_get_resp = {};
+ struct hwrm_ver_get_input ver_get_req = {};
+ struct bng_re_chip_ctx *cctx;
+ struct bnge_fw_msg fw_msg = {};
+ int rc;
+
+ bng_re_init_hwrm_hdr((void *)&ver_get_req, HWRM_VER_GET);
+ ver_get_req.hwrm_intf_maj = HWRM_VERSION_MAJOR;
+ ver_get_req.hwrm_intf_min = HWRM_VERSION_MINOR;
+ ver_get_req.hwrm_intf_upd = HWRM_VERSION_UPDATE;
+ bng_re_fill_fw_msg(&fw_msg, (void *)&ver_get_req, sizeof(ver_get_req),
+ (void *)&ver_get_resp, sizeof(ver_get_resp),
+ BNGE_DFLT_HWRM_CMD_TIMEOUT);
+ rc = bnge_send_msg(aux_dev, &fw_msg);
+ if (rc) {
+ ibdev_err(&rdev->ibdev, "Failed to query HW version, rc = 0x%x",
+ rc);
+ return;
+ }
+
+ cctx = rdev->chip_ctx;
+ cctx->hwrm_intf_ver =
+ (u64)le16_to_cpu(ver_get_resp.hwrm_intf_major) << 48 |
+ (u64)le16_to_cpu(ver_get_resp.hwrm_intf_minor) << 32 |
+ (u64)le16_to_cpu(ver_get_resp.hwrm_intf_build) << 16 |
+ le16_to_cpu(ver_get_resp.hwrm_intf_patch);
+
+ cctx->hwrm_cmd_max_timeout = le16_to_cpu(ver_get_resp.max_req_timeout);
+
+ if (!cctx->hwrm_cmd_max_timeout)
+ cctx->hwrm_cmd_max_timeout = BNG_ROCE_FW_MAX_TIMEOUT;
+}
+
+static void bng_re_dev_uninit(struct bng_re_dev *rdev)
+{
+ int rc;
+
+ bng_re_debugfs_rem_pdev(rdev);
+
+ if (test_and_clear_bit(BNG_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags)) {
+ rc = bng_re_deinit_rcfw(&rdev->rcfw);
+ if (rc)
+ ibdev_warn(&rdev->ibdev,
+ "Failed to deinitialize RCFW: %#x", rc);
+ bng_re_stats_ctx_free(rdev);
+ bng_re_free_stats_ctx_mem(rdev->bng_res.pdev, &rdev->stats_ctx);
+ bng_re_disable_rcfw_channel(&rdev->rcfw);
+ bng_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id,
+ RING_ALLOC_REQ_RING_TYPE_NQ);
+ bng_re_free_rcfw_channel(&rdev->rcfw);
+ }
+
+ kfree(rdev->nqr);
+ rdev->nqr = NULL;
+ bng_re_destroy_chip_ctx(rdev);
+ if (test_and_clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags))
+ bnge_unregister_dev(rdev->aux_dev);
+}
+
+static int bng_re_dev_init(struct bng_re_dev *rdev)
+{
+ struct bng_re_ring_attr rattr = {};
+ struct bng_re_creq_ctx *creq;
+ u32 db_offt;
+ int vid;
+ u8 type;
+ int rc;
+
+ /* Register the new RoCE device instance with the netdev driver */
+ rc = bng_re_register_netdev(rdev);
+ if (rc) {
+ ibdev_err(&rdev->ibdev,
+ "Failed to register with netdev: %#x\n", rc);
+ return -EINVAL;
+ }
+
+ set_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
+
+ if (rdev->aux_dev->auxr_info->msix_requested < BNG_RE_MIN_MSIX) {
+ ibdev_err(&rdev->ibdev,
+ "RoCE requires minimum 2 MSI-X vectors, but only %d reserved\n",
+ rdev->aux_dev->auxr_info->msix_requested);
+ bnge_unregister_dev(rdev->aux_dev);
+ clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
+ return -EINVAL;
+ }
+ ibdev_dbg(&rdev->ibdev, "Got %d MSI-X vectors\n",
+ rdev->aux_dev->auxr_info->msix_requested);
+
+ rc = bng_re_setup_chip_ctx(rdev);
+ if (rc) {
+ bnge_unregister_dev(rdev->aux_dev);
+ clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
+ ibdev_err(&rdev->ibdev, "Failed to get chip context\n");
+ return -EINVAL;
+ }
+
+ bng_re_query_hwrm_version(rdev);
+
+ rc = bng_re_alloc_fw_channel(&rdev->bng_res, &rdev->rcfw);
+ if (rc) {
+ ibdev_err(&rdev->ibdev,
+ "Failed to allocate RCFW Channel: %#x\n", rc);
+ goto fail;
+ }
+
+ /* Allocate nq record memory */
+ rdev->nqr = kzalloc(sizeof(*rdev->nqr), GFP_KERNEL);
+ if (!rdev->nqr) {
+ bng_re_destroy_chip_ctx(rdev);
+ bnge_unregister_dev(rdev->aux_dev);
+ clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
+ return -ENOMEM;
+ }
+
+ rdev->nqr->num_msix = rdev->aux_dev->auxr_info->msix_requested;
+ memcpy(rdev->nqr->msix_entries, rdev->aux_dev->msix_info,
+ sizeof(struct bnge_msix_info) * rdev->nqr->num_msix);
+
+ type = RING_ALLOC_REQ_RING_TYPE_NQ;
+ creq = &rdev->rcfw.creq;
+ rattr.dma_arr = creq->hwq.pbl[BNG_PBL_LVL_0].pg_map_arr;
+ rattr.pages = creq->hwq.pbl[creq->hwq.level].pg_count;
+ rattr.type = type;
+ rattr.mode = RING_ALLOC_REQ_INT_MODE_MSIX;
+ rattr.depth = BNG_FW_CREQE_MAX_CNT - 1;
+ rattr.lrid = rdev->nqr->msix_entries[BNG_RE_CREQ_NQ_IDX].ring_idx;
+ rc = bng_re_net_ring_alloc(rdev, &rattr, &creq->ring_id);
+ if (rc) {
+ ibdev_err(&rdev->ibdev, "Failed to allocate CREQ: %#x\n", rc);
+ goto free_rcfw;
+ }
+ db_offt = rdev->nqr->msix_entries[BNG_RE_CREQ_NQ_IDX].db_offset;
+ vid = rdev->nqr->msix_entries[BNG_RE_CREQ_NQ_IDX].vector;
+
+ rc = bng_re_enable_fw_channel(&rdev->rcfw,
+ vid, db_offt);
+ if (rc) {
+ ibdev_err(&rdev->ibdev, "Failed to enable RCFW channel: %#x\n",
+ rc);
+ goto free_ring;
+ }
+
+ rc = bng_re_get_dev_attr(&rdev->rcfw);
+ if (rc)
+ goto disable_rcfw;
+
+ bng_re_debugfs_add_pdev(rdev);
+ rc = bng_re_alloc_stats_ctx_mem(rdev->bng_res.pdev, rdev->chip_ctx,
+ &rdev->stats_ctx);
+ if (rc) {
+ ibdev_err(&rdev->ibdev,
+ "Failed to allocate stats context: %#x\n", rc);
+ goto disable_rcfw;
+ }
+
+ rc = bng_re_stats_ctx_alloc(rdev);
+ if (rc) {
+ ibdev_err(&rdev->ibdev,
+ "Failed to allocate QPLIB context: %#x\n", rc);
+ goto free_stats_ctx;
+ }
+
+ rc = bng_re_init_rcfw(&rdev->rcfw, &rdev->stats_ctx);
+ if (rc) {
+ ibdev_err(&rdev->ibdev,
+ "Failed to initialize RCFW: %#x\n", rc);
+ goto free_sctx;
+ }
+ set_bit(BNG_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags);
+
+ return 0;
+free_sctx:
+ bng_re_stats_ctx_free(rdev);
+free_stats_ctx:
+ bng_re_free_stats_ctx_mem(rdev->bng_res.pdev, &rdev->stats_ctx);
+disable_rcfw:
+ bng_re_disable_rcfw_channel(&rdev->rcfw);
+free_ring:
+ bng_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, type);
+free_rcfw:
+ bng_re_free_rcfw_channel(&rdev->rcfw);
+fail:
+ bng_re_dev_uninit(rdev);
+ return rc;
+}
+
+static int bng_re_add_device(struct auxiliary_device *adev)
+{
+ struct bnge_auxr_priv *auxr_priv =
+ container_of(adev, struct bnge_auxr_priv, aux_dev);
+ struct bng_re_en_dev_info *dev_info;
+ struct bng_re_dev *rdev;
+ int rc;
+
+ dev_info = auxiliary_get_drvdata(adev);
+
+ rdev = bng_re_dev_add(adev, auxr_priv->auxr_dev);
+ if (!rdev) {
+ rc = -ENOMEM;
+ goto exit;
+ }
+
+ dev_info->rdev = rdev;
+
+ rc = bng_re_dev_init(rdev);
+ if (rc)
+ goto re_dev_dealloc;
+
+ return 0;
+
+re_dev_dealloc:
+ ib_dealloc_device(&rdev->ibdev);
+exit:
+ return rc;
+}
+
+static void bng_re_remove_device(struct bng_re_dev *rdev,
+ struct auxiliary_device *aux_dev)
+{
+ bng_re_dev_uninit(rdev);
+ ib_dealloc_device(&rdev->ibdev);
+}
+
+static int bng_re_probe(struct auxiliary_device *adev,
+ const struct auxiliary_device_id *id)
+{
+ struct bnge_auxr_priv *aux_priv =
+ container_of(adev, struct bnge_auxr_priv, aux_dev);
+ struct bng_re_en_dev_info *en_info;
+ int rc;
+
+ en_info = kzalloc(sizeof(*en_info), GFP_KERNEL);
+ if (!en_info)
+ return -ENOMEM;
+
+ en_info->auxr_dev = aux_priv->auxr_dev;
+
+ auxiliary_set_drvdata(adev, en_info);
+
+ rc = bng_re_add_device(adev);
+ if (rc)
+ kfree(en_info);
+
+ return rc;
+}
+
+static void bng_re_remove(struct auxiliary_device *adev)
+{
+ struct bng_re_en_dev_info *dev_info = auxiliary_get_drvdata(adev);
+ struct bng_re_dev *rdev;
+
+ rdev = dev_info->rdev;
+
+ if (rdev)
+ bng_re_remove_device(rdev, adev);
+ kfree(dev_info);
+}
+
+static const struct auxiliary_device_id bng_re_id_table[] = {
+ { .name = BNG_RE_ADEV_NAME ".rdma", },
+ {},
+};
+
+MODULE_DEVICE_TABLE(auxiliary, bng_re_id_table);
+
+static struct auxiliary_driver bng_re_driver = {
+ .name = "rdma",
+ .probe = bng_re_probe,
+ .remove = bng_re_remove,
+ .id_table = bng_re_id_table,
+};
+
+static int __init bng_re_mod_init(void)
+{
+ int rc;
+
+ bng_re_register_debugfs();
+
+ rc = auxiliary_driver_register(&bng_re_driver);
+ if (rc) {
+ pr_err("%s: Failed to register auxiliary driver\n",
+ KBUILD_MODNAME);
+ goto unreg_debugfs;
+ }
+ return 0;
+unreg_debugfs:
+ bng_re_unregister_debugfs();
+ return rc;
+}
+
+static void __exit bng_re_mod_exit(void)
+{
+ auxiliary_driver_unregister(&bng_re_driver);
+ bng_re_unregister_debugfs();
+}
+
+module_init(bng_re_mod_init);
+module_exit(bng_re_mod_exit);
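
Every firmware interaction in this file has the same shape: initialize an HWRM header, wrap the request and response buffers in a bnge_fw_msg, and hand it to the bnge Ethernet driver with bnge_send_msg(). A condensed sketch of that pattern using the file's own helpers; the HWRM_FUNC_QCFG opcode is only an illustration, not a call this driver makes:

static int bng_re_hwrm_pattern(struct bng_re_dev *rdev)
{
	struct hwrm_func_qcfg_output resp = {};	/* illustrative opcode */
	struct hwrm_func_qcfg_input req = {};
	struct bnge_fw_msg fw_msg = {};

	bng_re_init_hwrm_hdr((void *)&req, HWRM_FUNC_QCFG);
	bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req),
			   (void *)&resp, sizeof(resp),
			   BNGE_DFLT_HWRM_CMD_TIMEOUT);
	return bnge_send_msg(rdev->aux_dev, &fw_msg);
}
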
diff --git a/drivers/infiniband/hw/bng_re/bng_fw.c b/drivers/infiniband/hw/bng_re/bng_fw.c
new file mode 100644
index 000000000000..7d9539113cf5
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_fw.c
@@ -0,0 +1,767 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2025 Broadcom.
+#include <linux/pci.h>
+
+#include "roce_hsi.h"
+#include "bng_res.h"
+#include "bng_fw.h"
+#include "bng_sp.h"
+
+/**
+ * bng_re_map_rc - map return type based on opcode
+ * @opcode: roce slow path opcode
+ *
+ * Case #1:
+ * Firmware-initiated error recovery is a safe state machine, and the
+ * driver can consider all underlying rdma resources freed.
+ * In this state, it is safe to return success for opcodes that destroy
+ * rdma resources (destroy qp, destroy cq, etc.).
+ *
+ * Case #2:
+ * If the driver detects a potential firmware stall, the state machine
+ * is not safe and the driver cannot assume that all underlying rdma
+ * resources have been freed.
+ * In this state, it is not safe to return success for opcodes that
+ * destroy rdma resources.
+ *
+ * The scope of this helper is case #1 only.
+ *
+ * Returns:
+ * 0 to communicate success to the caller.
+ * Non-zero error code to communicate failure to the caller.
+ */
+static int bng_re_map_rc(u8 opcode)
+{
+ switch (opcode) {
+ case CMDQ_BASE_OPCODE_DESTROY_QP:
+ case CMDQ_BASE_OPCODE_DESTROY_SRQ:
+ case CMDQ_BASE_OPCODE_DESTROY_CQ:
+ case CMDQ_BASE_OPCODE_DEALLOCATE_KEY:
+ case CMDQ_BASE_OPCODE_DEREGISTER_MR:
+ case CMDQ_BASE_OPCODE_DELETE_GID:
+ case CMDQ_BASE_OPCODE_DESTROY_QP1:
+ case CMDQ_BASE_OPCODE_DESTROY_AH:
+ case CMDQ_BASE_OPCODE_DEINITIALIZE_FW:
+ case CMDQ_BASE_OPCODE_MODIFY_ROCE_CC:
+ case CMDQ_BASE_OPCODE_SET_LINK_AGGR_MODE:
+ return 0;
+ default:
+ return -ETIMEDOUT;
+ }
+}
+
+void bng_re_free_rcfw_channel(struct bng_re_rcfw *rcfw)
+{
+ kfree(rcfw->crsqe_tbl);
+ bng_re_free_hwq(rcfw->res, &rcfw->cmdq.hwq);
+ bng_re_free_hwq(rcfw->res, &rcfw->creq.hwq);
+ rcfw->pdev = NULL;
+}
+
+int bng_re_alloc_fw_channel(struct bng_re_res *res,
+ struct bng_re_rcfw *rcfw)
+{
+ struct bng_re_hwq_attr hwq_attr = {};
+ struct bng_re_sg_info sginfo = {};
+ struct bng_re_cmdq_ctx *cmdq;
+ struct bng_re_creq_ctx *creq;
+
+ rcfw->pdev = res->pdev;
+ cmdq = &rcfw->cmdq;
+ creq = &rcfw->creq;
+ rcfw->res = res;
+
+ sginfo.pgsize = PAGE_SIZE;
+ sginfo.pgshft = PAGE_SHIFT;
+
+ hwq_attr.sginfo = &sginfo;
+ hwq_attr.res = rcfw->res;
+ hwq_attr.depth = BNG_FW_CREQE_MAX_CNT;
+ hwq_attr.stride = BNG_FW_CREQE_UNITS;
+ hwq_attr.type = BNG_HWQ_TYPE_QUEUE;
+
+ if (bng_re_alloc_init_hwq(&creq->hwq, &hwq_attr)) {
+ dev_err(&rcfw->pdev->dev,
+ "HW channel CREQ allocation failed\n");
+ goto fail;
+ }
+
+ rcfw->cmdq_depth = BNG_FW_CMDQE_MAX_CNT;
+
+ sginfo.pgsize = bng_fw_cmdqe_page_size(rcfw->cmdq_depth);
+ hwq_attr.depth = rcfw->cmdq_depth & 0x7FFFFFFF;
+ hwq_attr.stride = BNG_FW_CMDQE_UNITS;
+ hwq_attr.type = BNG_HWQ_TYPE_CTX;
+ if (bng_re_alloc_init_hwq(&cmdq->hwq, &hwq_attr)) {
+ dev_err(&rcfw->pdev->dev,
+ "HW channel CMDQ allocation failed\n");
+ goto fail;
+ }
+
+ rcfw->crsqe_tbl = kcalloc(cmdq->hwq.max_elements,
+ sizeof(*rcfw->crsqe_tbl), GFP_KERNEL);
+ if (!rcfw->crsqe_tbl)
+ goto fail;
+
+ spin_lock_init(&rcfw->tbl_lock);
+
+ rcfw->max_timeout = res->cctx->hwrm_cmd_max_timeout;
+ return 0;
+
+fail:
+ bng_re_free_rcfw_channel(rcfw);
+ return -ENOMEM;
+}
+
+static int bng_re_process_qp_event(struct bng_re_rcfw *rcfw,
+ struct creq_qp_event *qp_event,
+ u32 *num_wait)
+{
+ struct bng_re_hwq *hwq = &rcfw->cmdq.hwq;
+ struct bng_re_crsqe *crsqe;
+ u32 req_size;
+ u16 cookie;
+ bool is_waiter_alive;
+ struct pci_dev *pdev;
+ u32 wait_cmds = 0;
+ int rc = 0;
+
+ pdev = rcfw->pdev;
+ switch (qp_event->event) {
+ case CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION:
+ dev_err(&pdev->dev, "Received QP error notification\n");
+ break;
+ default:
+ /*
+ * Command response.
+ * cmdq->lock needs to be acquired to synchronize
+ * the command send and completion reaping. This function
+ * is always called with creq->lock held, so use
+ * the nested variant of spin_lock.
+ */
+
+ spin_lock_nested(&hwq->lock, SINGLE_DEPTH_NESTING);
+ cookie = le16_to_cpu(qp_event->cookie);
+ cookie &= BNG_FW_MAX_COOKIE_VALUE;
+ crsqe = &rcfw->crsqe_tbl[cookie];
+
+ if (WARN_ONCE(test_bit(FIRMWARE_STALL_DETECTED,
+ &rcfw->cmdq.flags),
+ "Unreponsive rcfw channel detected.!!")) {
+ dev_info(&pdev->dev,
+ "rcfw timedout: cookie = %#x, free_slots = %d",
+ cookie, crsqe->free_slots);
+ spin_unlock(&hwq->lock);
+ return rc;
+ }
+
+ if (crsqe->is_waiter_alive) {
+ if (crsqe->resp) {
+ memcpy(crsqe->resp, qp_event, sizeof(*qp_event));
+ /* Insert write memory barrier to ensure that
+ * response data is copied before clearing the
+ * flags
+ */
+ smp_wmb();
+ }
+ }
+
+ wait_cmds++;
+
+ req_size = crsqe->req_size;
+ is_waiter_alive = crsqe->is_waiter_alive;
+
+ crsqe->req_size = 0;
+ if (!is_waiter_alive)
+ crsqe->resp = NULL;
+
+ crsqe->is_in_used = false;
+
+ hwq->cons += req_size;
+
+ spin_unlock(&hwq->lock);
+ }
+ *num_wait += wait_cmds;
+ return rc;
+}
+
+/* function events */
+static int bng_re_process_func_event(struct bng_re_rcfw *rcfw,
+ struct creq_func_event *func_event)
+{
+ switch (func_event->event) {
+ case CREQ_FUNC_EVENT_EVENT_TX_WQE_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_TX_DATA_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_RX_WQE_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_RX_DATA_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_CQ_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_TQM_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_CFCQ_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_CFCS_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_CFCC_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_CFCM_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_TIM_ERROR:
+ case CREQ_FUNC_EVENT_EVENT_VF_COMM_REQUEST:
+ case CREQ_FUNC_EVENT_EVENT_RESOURCE_EXHAUSTED:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* CREQ Completion handlers */
+static void bng_re_service_creq(struct tasklet_struct *t)
+{
+ struct bng_re_rcfw *rcfw = from_tasklet(rcfw, t, creq.creq_tasklet);
+ struct bng_re_creq_ctx *creq = &rcfw->creq;
+ u32 type, budget = BNG_FW_CREQ_ENTRY_POLL_BUDGET;
+ struct bng_re_hwq *hwq = &creq->hwq;
+ struct creq_base *creqe;
+ u32 num_wakeup = 0;
+ u32 hw_polled = 0;
+
+ /* Service the CREQ until budget is over */
+ spin_lock_bh(&hwq->lock);
+ while (budget > 0) {
+ creqe = bng_re_get_qe(hwq, hwq->cons, NULL);
+ if (!BNG_FW_CREQ_CMP_VALID(creqe, creq->creq_db.dbinfo.flags))
+ break;
+ /* The valid test of the entry must be done first before
+ * reading any further.
+ */
+ dma_rmb();
+
+ type = creqe->type & CREQ_BASE_TYPE_MASK;
+ switch (type) {
+ case CREQ_BASE_TYPE_QP_EVENT:
+ bng_re_process_qp_event
+ (rcfw, (struct creq_qp_event *)creqe,
+ &num_wakeup);
+ creq->stats.creq_qp_event_processed++;
+ break;
+ case CREQ_BASE_TYPE_FUNC_EVENT:
+ if (!bng_re_process_func_event
+ (rcfw, (struct creq_func_event *)creqe))
+ creq->stats.creq_func_event_processed++;
+ else
+ dev_warn(&rcfw->pdev->dev,
+ "aeqe:%#x Not handled\n", type);
+ break;
+ default:
+ if (type != ASYNC_EVENT_CMPL_TYPE_HWRM_ASYNC_EVENT)
+ dev_warn(&rcfw->pdev->dev,
+ "creqe with event 0x%x not handled\n",
+ type);
+ break;
+ }
+ budget--;
+ hw_polled++;
+ bng_re_hwq_incr_cons(hwq->max_elements, &hwq->cons,
+ 1, &creq->creq_db.dbinfo.flags);
+ }
+
+ if (hw_polled)
+ bng_re_ring_nq_db(&creq->creq_db.dbinfo,
+ rcfw->res->cctx, true);
+ spin_unlock_bh(&hwq->lock);
+ if (num_wakeup)
+ wake_up_nr(&rcfw->cmdq.waitq, num_wakeup);
+}
+
+static int __send_message_basic_sanity(struct bng_re_rcfw *rcfw,
+ struct bng_re_cmdqmsg *msg,
+ u8 opcode)
+{
+ struct bng_re_cmdq_ctx *cmdq;
+
+ cmdq = &rcfw->cmdq;
+
+ if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags))
+ return -ETIMEDOUT;
+
+ if (test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) &&
+ opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) {
+ dev_err(&rcfw->pdev->dev, "RCFW already initialized!");
+ return -EINVAL;
+ }
+
+ if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) &&
+ (opcode != CMDQ_BASE_OPCODE_QUERY_FUNC &&
+ opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW &&
+ opcode != CMDQ_BASE_OPCODE_QUERY_VERSION)) {
+ dev_err(&rcfw->pdev->dev,
+ "RCFW not initialized, reject opcode 0x%x",
+ opcode);
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int __send_message(struct bng_re_rcfw *rcfw,
+ struct bng_re_cmdqmsg *msg, u8 opcode)
+{
+ u32 bsize, free_slots, required_slots;
+ struct bng_re_cmdq_ctx *cmdq;
+ struct bng_re_crsqe *crsqe;
+ struct bng_fw_cmdqe *cmdqe;
+ struct bng_re_hwq *hwq;
+ u32 sw_prod, cmdq_prod;
+ struct pci_dev *pdev;
+ u16 cookie;
+ u8 *preq;
+
+ cmdq = &rcfw->cmdq;
+ hwq = &cmdq->hwq;
+ pdev = rcfw->pdev;
+
+ /* Cmdq entries are 16-byte units; each request can consume one or
+ * more cmdqe
+ */
+ spin_lock_bh(&hwq->lock);
+ required_slots = bng_re_get_cmd_slots(msg->req);
+ free_slots = HWQ_FREE_SLOTS(hwq);
+ cookie = cmdq->seq_num & BNG_FW_MAX_COOKIE_VALUE;
+ crsqe = &rcfw->crsqe_tbl[cookie];
+
+ if (required_slots >= free_slots) {
+ dev_info_ratelimited(&pdev->dev,
+ "CMDQ is full req/free %d/%d!",
+ required_slots, free_slots);
+ spin_unlock_bh(&hwq->lock);
+ return -EAGAIN;
+ }
+ __set_cmdq_base_cookie(msg->req, msg->req_sz, cpu_to_le16(cookie));
+
+ bsize = bng_re_set_cmd_slots(msg->req);
+ crsqe->free_slots = free_slots;
+ crsqe->resp = (struct creq_qp_event *)msg->resp;
+ crsqe->is_waiter_alive = true;
+ crsqe->is_in_used = true;
+ crsqe->opcode = opcode;
+
+ crsqe->req_size = __get_cmdq_base_cmd_size(msg->req, msg->req_sz);
+ if (__get_cmdq_base_resp_size(msg->req, msg->req_sz) && msg->sb) {
+ struct bng_re_rcfw_sbuf *sbuf = msg->sb;
+
+ __set_cmdq_base_resp_addr(msg->req, msg->req_sz,
+ cpu_to_le64(sbuf->dma_addr));
+ __set_cmdq_base_resp_size(msg->req, msg->req_sz,
+ ALIGN(sbuf->size,
+ BNG_FW_CMDQE_UNITS) /
+ BNG_FW_CMDQE_UNITS);
+ }
+
+ preq = (u8 *)msg->req;
+ do {
+ /* Locate the next cmdq slot */
+ sw_prod = HWQ_CMP(hwq->prod, hwq);
+ cmdqe = bng_re_get_qe(hwq, sw_prod, NULL);
+ /* Copy a segment of the req cmd to the cmdq */
+ memset(cmdqe, 0, sizeof(*cmdqe));
+ memcpy(cmdqe, preq, min_t(u32, bsize, sizeof(*cmdqe)));
+ preq += min_t(u32, bsize, sizeof(*cmdqe));
+ bsize -= min_t(u32, bsize, sizeof(*cmdqe));
+ hwq->prod++;
+ } while (bsize > 0);
+ cmdq->seq_num++;
+
+ cmdq_prod = hwq->prod & 0xFFFF;
+ if (test_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags)) {
+ /* The very first doorbell write
+ * is required to set this flag
+ * which prompts the FW to reset
+ * its internal pointers
+ */
+ cmdq_prod |= BIT(FIRMWARE_FIRST_FLAG);
+ clear_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags);
+ }
+ /* ring CMDQ DB */
+ wmb();
+ writel(cmdq_prod, cmdq->cmdq_mbox.prod);
+ writel(BNG_FW_CMDQ_TRIG_VAL, cmdq->cmdq_mbox.db);
+ spin_unlock_bh(&hwq->lock);
+ /* Command submitted; the response arrives via the CREQ */
+ return 0;
+}
+
+/**
+ * __wait_for_resp - wait for the response without holding the CPU
+ * @rcfw: rcfw channel instance of rdev
+ * @cookie: cookie to track the command
+ *
+ * Wait for command completion in sleepable context.
+ *
+ * Returns:
+ * 0 if the command is completed by firmware.
+ * Non-zero error code otherwise.
+ */
+static int __wait_for_resp(struct bng_re_rcfw *rcfw, u16 cookie)
+{
+ struct bng_re_cmdq_ctx *cmdq;
+ struct bng_re_crsqe *crsqe;
+
+ cmdq = &rcfw->cmdq;
+ crsqe = &rcfw->crsqe_tbl[cookie];
+
+ do {
+ wait_event_timeout(cmdq->waitq,
+ !crsqe->is_in_used,
+ secs_to_jiffies(rcfw->max_timeout));
+
+ if (!crsqe->is_in_used)
+ return 0;
+
+ bng_re_service_creq(&rcfw->creq.creq_tasklet);
+
+ if (!crsqe->is_in_used)
+ return 0;
+ } while (true);
+}
+
+/**
+ * bng_re_rcfw_send_message - interface to send
+ * and complete rcfw command.
+ * @rcfw: rcfw channel instance of rdev
+ * @msg: message to send
+ *
+ * This function does not account for the shadow queue depth. It sends
+ * every command unconditionally as long as the send queue is not full.
+ *
+ * Returns:
+ * 0 if the command is completed by firmware.
+ * Non-zero if the command is not completed by firmware.
+ */
+int bng_re_rcfw_send_message(struct bng_re_rcfw *rcfw,
+ struct bng_re_cmdqmsg *msg)
+{
+ struct creq_qp_event *evnt = (struct creq_qp_event *)msg->resp;
+ struct bng_re_crsqe *crsqe;
+ u16 cookie;
+ int rc;
+ u8 opcode;
+
+ opcode = __get_cmdq_base_opcode(msg->req, msg->req_sz);
+
+ rc = __send_message_basic_sanity(rcfw, msg, opcode);
+ if (rc)
+ return rc == -ENXIO ? bng_re_map_rc(opcode) : rc;
+
+ rc = __send_message(rcfw, msg, opcode);
+ if (rc)
+ return rc;
+
+ cookie = le16_to_cpu(__get_cmdq_base_cookie(msg->req, msg->req_sz))
+ & BNG_FW_MAX_COOKIE_VALUE;
+
+ rc = __wait_for_resp(rcfw, cookie);
+
+ if (rc) {
+ spin_lock_bh(&rcfw->cmdq.hwq.lock);
+ crsqe = &rcfw->crsqe_tbl[cookie];
+ crsqe->is_waiter_alive = false;
+ if (rc == -ENODEV)
+ set_bit(FIRMWARE_STALL_DETECTED, &rcfw->cmdq.flags);
+ spin_unlock_bh(&rcfw->cmdq.hwq.lock);
+ return -ETIMEDOUT;
+ }
+
+ if (evnt->status) {
+ /* failed with status */
+ dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x status %#x\n",
+ cookie, opcode, evnt->status);
+ rc = -EIO;
+ }
+
+ return rc;
+}
+
+static int bng_re_map_cmdq_mbox(struct bng_re_rcfw *rcfw)
+{
+ struct bng_re_cmdq_mbox *mbox;
+ resource_size_t bar_reg;
+ struct pci_dev *pdev;
+
+ pdev = rcfw->pdev;
+ mbox = &rcfw->cmdq.cmdq_mbox;
+
+ mbox->reg.bar_id = BNG_FW_COMM_PCI_BAR_REGION;
+ mbox->reg.len = BNG_FW_COMM_SIZE;
+ mbox->reg.bar_base = pci_resource_start(pdev, mbox->reg.bar_id);
+ if (!mbox->reg.bar_base) {
+ dev_err(&pdev->dev,
+ "CMDQ BAR region %d resc start is 0!\n",
+ mbox->reg.bar_id);
+ return -ENOMEM;
+ }
+
+ bar_reg = mbox->reg.bar_base + BNG_FW_COMM_BASE_OFFSET;
+ mbox->reg.len = BNG_FW_COMM_SIZE;
+ mbox->reg.bar_reg = ioremap(bar_reg, mbox->reg.len);
+ if (!mbox->reg.bar_reg) {
+ dev_err(&pdev->dev,
+ "CMDQ BAR region %d mapping failed\n",
+ mbox->reg.bar_id);
+ return -ENOMEM;
+ }
+
+ mbox->prod = (void __iomem *)(mbox->reg.bar_reg +
+ BNG_FW_PF_VF_COMM_PROD_OFFSET);
+ mbox->db = (void __iomem *)(mbox->reg.bar_reg + BNG_FW_COMM_TRIG_OFFSET);
+ return 0;
+}
+
+static irqreturn_t bng_re_creq_irq(int irq, void *dev_instance)
+{
+ struct bng_re_rcfw *rcfw = dev_instance;
+ struct bng_re_creq_ctx *creq;
+ struct bng_re_hwq *hwq;
+ u32 sw_cons;
+
+ creq = &rcfw->creq;
+ hwq = &creq->hwq;
+ /* Prefetch the CREQ element */
+ sw_cons = HWQ_CMP(hwq->cons, hwq);
+ bng_re_get_qe(hwq, sw_cons, NULL);
+
+ tasklet_schedule(&creq->creq_tasklet);
+ return IRQ_HANDLED;
+}
+
+int bng_re_rcfw_start_irq(struct bng_re_rcfw *rcfw, int msix_vector,
+ bool need_init)
+{
+ struct bng_re_creq_ctx *creq;
+ struct bng_re_res *res;
+ int rc;
+
+ creq = &rcfw->creq;
+ res = rcfw->res;
+
+ if (creq->irq_handler_avail)
+ return -EFAULT;
+
+ creq->msix_vec = msix_vector;
+ if (need_init)
+ tasklet_setup(&creq->creq_tasklet, bng_re_service_creq);
+ else
+ tasklet_enable(&creq->creq_tasklet);
+
+ creq->irq_name = kasprintf(GFP_KERNEL, "bng_re-creq@pci:%s",
+ pci_name(res->pdev));
+ if (!creq->irq_name)
+ return -ENOMEM;
+ rc = request_irq(creq->msix_vec, bng_re_creq_irq, 0,
+ creq->irq_name, rcfw);
+ if (rc) {
+ kfree(creq->irq_name);
+ creq->irq_name = NULL;
+ tasklet_disable(&creq->creq_tasklet);
+ return rc;
+ }
+ creq->irq_handler_avail = true;
+
+ bng_re_ring_nq_db(&creq->creq_db.dbinfo, res->cctx, true);
+ atomic_inc(&rcfw->rcfw_intr_enabled);
+
+ return 0;
+}
+
+static int bng_re_map_creq_db(struct bng_re_rcfw *rcfw, u32 reg_offt)
+{
+ struct bng_re_creq_db *creq_db;
+ resource_size_t bar_reg;
+ struct pci_dev *pdev;
+
+ pdev = rcfw->pdev;
+ creq_db = &rcfw->creq.creq_db;
+
+ creq_db->dbinfo.flags = 0;
+ creq_db->reg.bar_id = BNG_FW_COMM_CONS_PCI_BAR_REGION;
+ creq_db->reg.bar_base = pci_resource_start(pdev, creq_db->reg.bar_id);
+ if (!creq_db->reg.bar_base)
+ dev_err(&pdev->dev,
+ "CREQ BAR region %d resc start is 0!",
+ creq_db->reg.bar_id);
+
+ bar_reg = creq_db->reg.bar_base + reg_offt;
+
+ creq_db->reg.len = BNG_FW_CREQ_DB_LEN;
+ creq_db->reg.bar_reg = ioremap(bar_reg, creq_db->reg.len);
+ if (!creq_db->reg.bar_reg) {
+ dev_err(&pdev->dev,
+ "CREQ BAR region %d mapping failed",
+ creq_db->reg.bar_id);
+ return -ENOMEM;
+ }
+ creq_db->dbinfo.db = creq_db->reg.bar_reg;
+ creq_db->dbinfo.hwq = &rcfw->creq.hwq;
+ creq_db->dbinfo.xid = rcfw->creq.ring_id;
+ return 0;
+}
+
+void bng_re_rcfw_stop_irq(struct bng_re_rcfw *rcfw, bool kill)
+{
+ struct bng_re_creq_ctx *creq;
+
+ creq = &rcfw->creq;
+
+ if (!creq->irq_handler_avail)
+ return;
+
+ creq->irq_handler_avail = false;
+ /* Mask h/w interrupts */
+ bng_re_ring_nq_db(&creq->creq_db.dbinfo, rcfw->res->cctx, false);
+ /* Sync with last running IRQ-handler */
+ synchronize_irq(creq->msix_vec);
+ free_irq(creq->msix_vec, rcfw);
+ kfree(creq->irq_name);
+ creq->irq_name = NULL;
+ atomic_set(&rcfw->rcfw_intr_enabled, 0);
+ if (kill)
+ tasklet_kill(&creq->creq_tasklet);
+ tasklet_disable(&creq->creq_tasklet);
+}
+
+void bng_re_disable_rcfw_channel(struct bng_re_rcfw *rcfw)
+{
+ struct bng_re_creq_ctx *creq;
+ struct bng_re_cmdq_ctx *cmdq;
+
+ creq = &rcfw->creq;
+ cmdq = &rcfw->cmdq;
+ /* Make sure the HW channel is stopped! */
+ bng_re_rcfw_stop_irq(rcfw, true);
+
+ iounmap(cmdq->cmdq_mbox.reg.bar_reg);
+ iounmap(creq->creq_db.reg.bar_reg);
+
+ cmdq->cmdq_mbox.reg.bar_reg = NULL;
+ creq->creq_db.reg.bar_reg = NULL;
+ creq->msix_vec = 0;
+}
+
+static void bng_re_start_rcfw(struct bng_re_rcfw *rcfw)
+{
+ struct bng_re_cmdq_ctx *cmdq;
+ struct bng_re_creq_ctx *creq;
+ struct bng_re_cmdq_mbox *mbox;
+ struct cmdq_init init = {0};
+
+ cmdq = &rcfw->cmdq;
+ creq = &rcfw->creq;
+ mbox = &cmdq->cmdq_mbox;
+
+ init.cmdq_pbl = cpu_to_le64(cmdq->hwq.pbl[BNG_PBL_LVL_0].pg_map_arr[0]);
+ init.cmdq_size_cmdq_lvl =
+ cpu_to_le16(((rcfw->cmdq_depth <<
+ CMDQ_INIT_CMDQ_SIZE_SFT) &
+ CMDQ_INIT_CMDQ_SIZE_MASK) |
+ ((cmdq->hwq.level <<
+ CMDQ_INIT_CMDQ_LVL_SFT) &
+ CMDQ_INIT_CMDQ_LVL_MASK));
+ init.creq_ring_id = cpu_to_le16(creq->ring_id);
+ /* Write to the mailbox register */
+ __iowrite32_copy(mbox->reg.bar_reg, &init, sizeof(init) / 4);
+}
+
+int bng_re_enable_fw_channel(struct bng_re_rcfw *rcfw,
+ int msix_vector,
+ int cp_bar_reg_off)
+{
+ struct bng_re_cmdq_ctx *cmdq;
+ int rc;
+
+ cmdq = &rcfw->cmdq;
+
+ /* Assign defaults */
+ cmdq->seq_num = 0;
+ set_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags);
+ init_waitqueue_head(&cmdq->waitq);
+
+ rc = bng_re_map_cmdq_mbox(rcfw);
+ if (rc)
+ return rc;
+
+ rc = bng_re_map_creq_db(rcfw, cp_bar_reg_off);
+ if (rc)
+ return rc;
+
+ rc = bng_re_rcfw_start_irq(rcfw, msix_vector, true);
+ if (rc) {
+ dev_err(&rcfw->pdev->dev,
+ "Failed to request IRQ for CREQ rc = 0x%x\n", rc);
+ bng_re_disable_rcfw_channel(rcfw);
+ return rc;
+ }
+
+ bng_re_start_rcfw(rcfw);
+ return 0;
+}
+
+int bng_re_deinit_rcfw(struct bng_re_rcfw *rcfw)
+{
+ struct creq_deinitialize_fw_resp resp = {};
+ struct cmdq_deinitialize_fw req = {};
+ struct bng_re_cmdqmsg msg = {};
+ int rc;
+
+ bng_re_rcfw_cmd_prep((struct cmdq_base *)&req,
+ CMDQ_BASE_OPCODE_DEINITIALIZE_FW,
+ sizeof(req));
+ bng_re_fill_cmdqmsg(&msg, &req, &resp, NULL,
+ sizeof(req), sizeof(resp), 0);
+ rc = bng_re_rcfw_send_message(rcfw, &msg);
+ if (rc)
+ return rc;
+
+ clear_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->cmdq.flags);
+ return 0;
+}
+
+static inline bool _is_hw_retx_supported(u16 dev_cap_flags)
+{
+ return dev_cap_flags &
+ (CREQ_QUERY_FUNC_RESP_SB_HW_REQUESTER_RETX_ENABLED |
+ CREQ_QUERY_FUNC_RESP_SB_HW_RESPONDER_RETX_ENABLED);
+}
+
+#define BNG_RE_HW_RETX(a) _is_hw_retx_supported((a))
+static inline bool _is_optimize_modify_qp_supported(u16 dev_cap_ext_flags2)
+{
+ return dev_cap_ext_flags2 &
+ CREQ_QUERY_FUNC_RESP_SB_OPTIMIZE_MODIFY_QP_SUPPORTED;
+}
+
+int bng_re_init_rcfw(struct bng_re_rcfw *rcfw,
+ struct bng_re_stats *stats_ctx)
+{
+ struct creq_initialize_fw_resp resp = {};
+ struct cmdq_initialize_fw req = {};
+ struct bng_re_cmdqmsg msg = {};
+ int rc;
+ u16 flags = 0;
+
+ bng_re_rcfw_cmd_prep((struct cmdq_base *)&req,
+ CMDQ_BASE_OPCODE_INITIALIZE_FW,
+ sizeof(req));
+ /* Supply (log-base-2-of-host-page-size - base-page-shift)
+ * to bono to adjust the doorbell page sizes.
+ */
+ req.log2_dbr_pg_size = cpu_to_le16(PAGE_SHIFT -
+ BNG_FW_DBR_BASE_PAGE_SHIFT);
+ if (BNG_RE_HW_RETX(rcfw->res->dattr->dev_cap_flags))
+ flags |= CMDQ_INITIALIZE_FW_FLAGS_HW_REQUESTER_RETX_SUPPORTED;
+ if (_is_optimize_modify_qp_supported(rcfw->res->dattr->dev_cap_flags2))
+ flags |= CMDQ_INITIALIZE_FW_FLAGS_OPTIMIZE_MODIFY_QP_SUPPORTED;
+ req.flags |= cpu_to_le16(flags);
+ req.stat_ctx_id = cpu_to_le32(stats_ctx->fw_id);
+ bng_re_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0);
+ rc = bng_re_rcfw_send_message(rcfw, &msg);
+ if (rc)
+ return rc;
+ set_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->cmdq.flags);
+ return 0;
+}
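
Two mechanisms in this file are worth calling out. Completion matching: __send_message() stamps each request with a cookie (seq_num masked to the CMDQ depth), and the CREQ tasklet uses that cookie to locate the waiter's crsqe slot, copy the event in, and wake it. Ring validity: BNG_FW_CREQ_CMP_VALID() does not compare producer and consumer indices; each CREQ entry carries a phase bit that hardware writes with the current epoch, and the driver's expected epoch flips each time the consumer wraps. A standalone sketch of that epoch technique on a toy ring (types and names hypothetical):

#include <linux/types.h>

struct toy_ring_ent { u8 valid; u32 data; };

struct toy_ring {
	struct toy_ring_ent *ent;
	u32 cons, size;
	u8 epoch;			/* expected phase of 'valid' */
};

static bool toy_poll_one(struct toy_ring *r, u32 *out)
{
	if ((r->ent[r->cons].valid & 1) != r->epoch)
		return false;		/* entry not yet written by HW */
	/* Read 'valid' before 'data'; the real driver uses dma_rmb(). */
	*out = r->ent[r->cons].data;
	if (++r->cons == r->size) {
		r->cons = 0;
		r->epoch ^= 1;		/* wrapped: expected phase flips */
	}
	return true;
}
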
diff --git a/drivers/infiniband/hw/bng_re/bng_fw.h b/drivers/infiniband/hw/bng_re/bng_fw.h
new file mode 100644
index 000000000000..c89c926ec2fc
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_fw.h
@@ -0,0 +1,211 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2025 Broadcom.
+
+#ifndef __BNG_FW_H__
+#define __BNG_FW_H__
+
+#include "bng_tlv.h"
+
+/* FW DB related */
+#define BNG_FW_CMDQ_TRIG_VAL 1
+#define BNG_FW_COMM_PCI_BAR_REGION 0
+#define BNG_FW_COMM_CONS_PCI_BAR_REGION 2
+#define BNG_FW_DBR_BASE_PAGE_SHIFT 12
+#define BNG_FW_COMM_SIZE 0x104
+#define BNG_FW_COMM_BASE_OFFSET 0x600
+#define BNG_FW_COMM_TRIG_OFFSET 0x100
+#define BNG_FW_PF_VF_COMM_PROD_OFFSET 0xc
+#define BNG_FW_CREQ_DB_LEN 8
+
+/* CREQ */
+#define BNG_FW_CREQE_MAX_CNT (64 * 1024)
+#define BNG_FW_CREQE_UNITS 16
+#define BNG_FW_CREQ_ENTRY_POLL_BUDGET 0x100
+#define BNG_FW_CREQ_CMP_VALID(hdr, pass) \
+ (!!((hdr)->v & CREQ_BASE_V) == \
+ !((pass) & BNG_RE_FLAG_EPOCH_CONS_MASK))
+
+/* CMDQ */
+struct bng_fw_cmdqe {
+ u8 data[16];
+};
+
+#define BNG_FW_CMDQE_MAX_CNT 8192
+#define BNG_FW_CMDQE_UNITS sizeof(struct bng_fw_cmdqe)
+#define BNG_FW_CMDQE_BYTES(depth) ((depth) * BNG_FW_CMDQE_UNITS)
+
+#define BNG_FW_MAX_COOKIE_VALUE (BNG_FW_CMDQE_MAX_CNT - 1)
+#define BNG_FW_CMD_IS_BLOCKING 0x8000
+
+/* Crsq buf is 1024-Byte */
+struct bng_re_crsbe {
+ u8 data[1024];
+};
+
+static inline u32 bng_fw_cmdqe_npages(u32 depth)
+{
+ u32 npages;
+
+ npages = BNG_FW_CMDQE_BYTES(depth) / PAGE_SIZE;
+ if (BNG_FW_CMDQE_BYTES(depth) % PAGE_SIZE)
+ npages++;
+ return npages;
+}
+
+static inline u32 bng_fw_cmdqe_page_size(u32 depth)
+{
+ return (bng_fw_cmdqe_npages(depth) * PAGE_SIZE);
+}
+
+struct bng_re_cmdq_mbox {
+ struct bng_re_reg_desc reg;
+ void __iomem *prod;
+ void __iomem *db;
+};
+
+/* HWQ */
+struct bng_re_cmdq_ctx {
+ struct bng_re_hwq hwq;
+ struct bng_re_cmdq_mbox cmdq_mbox;
+ unsigned long flags;
+#define FIRMWARE_INITIALIZED_FLAG (0)
+#define FIRMWARE_STALL_DETECTED (3)
+#define FIRMWARE_FIRST_FLAG (31)
+ wait_queue_head_t waitq;
+ u32 seq_num;
+};
+
+struct bng_re_creq_db {
+ struct bng_re_reg_desc reg;
+ struct bng_re_db_info dbinfo;
+};
+
+struct bng_re_creq_stat {
+ u64 creq_qp_event_processed;
+ u64 creq_func_event_processed;
+};
+
+struct bng_re_creq_ctx {
+ struct bng_re_hwq hwq;
+ struct bng_re_creq_db creq_db;
+ struct bng_re_creq_stat stats;
+ struct tasklet_struct creq_tasklet;
+ u16 ring_id;
+ int msix_vec;
+ bool irq_handler_avail;
+ char *irq_name;
+};
+
+struct bng_re_crsqe {
+ struct creq_qp_event *resp;
+ u32 req_size;
+ /* Free slots at the time of submission */
+ u32 free_slots;
+ u8 opcode;
+ bool is_waiter_alive;
+ bool is_in_used;
+};
+
+struct bng_re_rcfw_sbuf {
+ void *sb;
+ dma_addr_t dma_addr;
+ u32 size;
+};
+
+/* RoCE FW Communication Channels */
+struct bng_re_rcfw {
+ struct pci_dev *pdev;
+ struct bng_re_res *res;
+ struct bng_re_cmdq_ctx cmdq;
+ struct bng_re_creq_ctx creq;
+ struct bng_re_crsqe *crsqe_tbl;
+ /* To synchronize the qp-handle hash table */
+ spinlock_t tbl_lock;
+ u32 cmdq_depth;
+ /* cached from chip cctx for quick reference in slow path */
+ u16 max_timeout;
+ atomic_t rcfw_intr_enabled;
+};
+
+struct bng_re_cmdqmsg {
+ struct cmdq_base *req;
+ struct creq_base *resp;
+ void *sb;
+ u32 req_sz;
+ u32 res_sz;
+ u8 block;
+};
+
+static inline void bng_re_rcfw_cmd_prep(struct cmdq_base *req,
+ u8 opcode, u8 cmd_size)
+{
+ req->opcode = opcode;
+ req->cmd_size = cmd_size;
+}
+
+static inline void bng_re_fill_cmdqmsg(struct bng_re_cmdqmsg *msg,
+ void *req, void *resp, void *sb,
+ u32 req_sz, u32 res_sz, u8 block)
+{
+ msg->req = req;
+ msg->resp = resp;
+ msg->sb = sb;
+ msg->req_sz = req_sz;
+ msg->res_sz = res_sz;
+ msg->block = block;
+}
+
+/* Get the number of command units required for the req. The
+ * function returns correct value only if called before
+ * setting using bng_re_set_cmd_slots
+ */
+static inline u32 bng_re_get_cmd_slots(struct cmdq_base *req)
+{
+ u32 cmd_units = 0;
+
+ if (HAS_TLV_HEADER(req)) {
+ struct roce_tlv *tlv_req = (struct roce_tlv *)req;
+
+ cmd_units = tlv_req->total_size;
+ } else {
+ cmd_units = (req->cmd_size + BNG_FW_CMDQE_UNITS - 1) /
+ BNG_FW_CMDQE_UNITS;
+ }
+
+ return cmd_units;
+}
+
+static inline u32 bng_re_set_cmd_slots(struct cmdq_base *req)
+{
+ u32 cmd_byte = 0;
+
+ if (HAS_TLV_HEADER(req)) {
+ struct roce_tlv *tlv_req = (struct roce_tlv *)req;
+
+ cmd_byte = tlv_req->total_size * BNG_FW_CMDQE_UNITS;
+ } else {
+ cmd_byte = req->cmd_size;
+ req->cmd_size = (req->cmd_size + BNG_FW_CMDQE_UNITS - 1) /
+ BNG_FW_CMDQE_UNITS;
+ }
+
+ return cmd_byte;
+}
+
+void bng_re_free_rcfw_channel(struct bng_re_rcfw *rcfw);
+int bng_re_alloc_fw_channel(struct bng_re_res *res,
+ struct bng_re_rcfw *rcfw);
+int bng_re_enable_fw_channel(struct bng_re_rcfw *rcfw,
+ int msix_vector,
+ int cp_bar_reg_off);
+void bng_re_disable_rcfw_channel(struct bng_re_rcfw *rcfw);
+int bng_re_rcfw_start_irq(struct bng_re_rcfw *rcfw, int msix_vector,
+ bool need_init);
+void bng_re_rcfw_stop_irq(struct bng_re_rcfw *rcfw, bool kill);
+int bng_re_rcfw_send_message(struct bng_re_rcfw *rcfw,
+ struct bng_re_cmdqmsg *msg);
+int bng_re_init_rcfw(struct bng_re_rcfw *rcfw,
+ struct bng_re_stats *stats_ctx);
+int bng_re_deinit_rcfw(struct bng_re_rcfw *rcfw);
+#endif
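
The comment above bng_re_get_cmd_slots() matters: bng_re_set_cmd_slots() rewrites req->cmd_size from bytes into 16-byte cmdqe units, so the ordering between the two helpers is load-bearing. A worked example for a non-TLV request, assuming HAS_TLV_HEADER() is false for it:

static void cmd_slots_example(struct cmdq_base *req)
{
	req->cmd_size = 24;				/* bytes */
	WARN_ON(bng_re_get_cmd_slots(req) != 2);	/* DIV_ROUND_UP(24, 16) */
	WARN_ON(bng_re_set_cmd_slots(req) != 24);	/* returns byte size */
	WARN_ON(req->cmd_size != 2);			/* now in cmdqe units */
}
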
diff --git a/drivers/infiniband/hw/bng_re/bng_re.h b/drivers/infiniband/hw/bng_re/bng_re.h
new file mode 100644
index 000000000000..dae4862621a7
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_re.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2025 Broadcom.
+
+#ifndef __BNG_RE_H__
+#define __BNG_RE_H__
+
+#include "bng_res.h"
+
+#define BNG_RE_ADEV_NAME "bng_en"
+
+#define BNG_RE_DESC "Broadcom 800G RoCE Driver"
+
+#define rdev_to_dev(rdev) ((rdev) ? (&(rdev)->ibdev.dev) : NULL)
+
+#define BNG_RE_MIN_MSIX 2
+#define BNG_RE_MAX_MSIX BNGE_MAX_ROCE_MSIX
+
+#define BNG_RE_CREQ_NQ_IDX 0
+
+#define BNGE_INVALID_STATS_CTX_ID (-1)
+
+/* NQ specific structures */
+struct bng_re_nq_db {
+ struct bng_re_reg_desc reg;
+ struct bng_re_db_info dbinfo;
+};
+
+struct bng_re_nq {
+ struct pci_dev *pdev;
+ struct bng_re_res *res;
+ char *name;
+ struct bng_re_hwq hwq;
+ struct bng_re_nq_db nq_db;
+ u16 ring_id;
+ int msix_vec;
+ cpumask_t mask;
+ struct tasklet_struct nq_tasklet;
+ bool requested;
+ int budget;
+ u32 load;
+
+ struct workqueue_struct *cqn_wq;
+};
+
+struct bng_re_nq_record {
+ struct bnge_msix_info msix_entries[BNG_RE_MAX_MSIX];
+ struct bng_re_nq nq[BNG_RE_MAX_MSIX];
+ int num_msix;
+ /* serialize NQ access */
+ struct mutex load_lock;
+};
+
+struct bng_re_en_dev_info {
+ struct bng_re_dev *rdev;
+ struct bnge_auxr_dev *auxr_dev;
+};
+
+struct bng_re_ring_attr {
+ dma_addr_t *dma_arr;
+ int pages;
+ int type;
+ u32 depth;
+ u32 lrid; /* Logical ring id */
+ u8 mode;
+};
+
+struct bng_re_dev {
+ struct ib_device ibdev;
+ unsigned long flags;
+#define BNG_RE_FLAG_NETDEV_REGISTERED 0
+#define BNG_RE_FLAG_RCFW_CHANNEL_EN 1
+ struct net_device *netdev;
+ struct auxiliary_device *adev;
+ struct bnge_auxr_dev *aux_dev;
+ struct bng_re_chip_ctx *chip_ctx;
+ int fn_id;
+ struct bng_re_res bng_res;
+ struct bng_re_rcfw rcfw;
+ struct bng_re_nq_record *nqr;
+ /* Device Resources */
+ struct bng_re_dev_attr *dev_attr;
+ struct dentry *dbg_root;
+ struct bng_re_stats stats_ctx;
+};
+
+#endif
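
The BNG_RE_FLAG_* bits in rdev->flags double as teardown guards: bng_re_dev_init() sets a bit only once a stage has fully succeeded, and bng_re_dev_uninit() unwinds a stage only when test_and_clear_bit() returns true, which makes the same uninit path safe from both the probe error path and device removal. The idiom in isolation, as a minimal sketch:

/* Sketch of the set_bit / test_and_clear_bit guard idiom used with
 * BNG_RE_FLAG_*: teardown is idempotent and skips never-reached stages.
 */
static void bng_re_guarded_teardown(struct bng_re_dev *rdev)
{
	if (test_and_clear_bit(BNG_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags))
		bng_re_deinit_rcfw(&rdev->rcfw);
	if (test_and_clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags))
		bnge_unregister_dev(rdev->aux_dev);
}
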
diff --git a/drivers/infiniband/hw/bng_re/bng_res.c b/drivers/infiniband/hw/bng_re/bng_res.c
new file mode 100644
index 000000000000..c50823758b53
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_res.c
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2025 Broadcom.
+
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+#include <rdma/ib_umem.h>
+
+#include <linux/bnxt/hsi.h>
+#include "bng_res.h"
+#include "roce_hsi.h"
+
+/* Stats */
+void bng_re_free_stats_ctx_mem(struct pci_dev *pdev,
+ struct bng_re_stats *stats)
+{
+ if (stats->dma) {
+ dma_free_coherent(&pdev->dev, stats->size,
+ stats->dma, stats->dma_map);
+ }
+ memset(stats, 0, sizeof(*stats));
+ stats->fw_id = -1;
+}
+
+int bng_re_alloc_stats_ctx_mem(struct pci_dev *pdev,
+ struct bng_re_chip_ctx *cctx,
+ struct bng_re_stats *stats)
+{
+ memset(stats, 0, sizeof(*stats));
+ stats->fw_id = -1;
+ stats->size = cctx->hw_stats_size;
+ stats->dma = dma_alloc_coherent(&pdev->dev, stats->size,
+ &stats->dma_map, GFP_KERNEL);
+ if (!stats->dma)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void bng_free_pbl(struct bng_re_res *res, struct bng_re_pbl *pbl)
+{
+ struct pci_dev *pdev = res->pdev;
+ int i;
+
+ for (i = 0; i < pbl->pg_count; i++) {
+ if (pbl->pg_arr[i])
+ dma_free_coherent(&pdev->dev, pbl->pg_size,
+ (void *)((unsigned long)
+ pbl->pg_arr[i] &
+ PAGE_MASK),
+ pbl->pg_map_arr[i]);
+ else
+ dev_warn(&pdev->dev,
+ "PBL free pg_arr[%d] empty?!\n", i);
+ pbl->pg_arr[i] = NULL;
+ }
+
+ vfree(pbl->pg_arr);
+ pbl->pg_arr = NULL;
+ vfree(pbl->pg_map_arr);
+ pbl->pg_map_arr = NULL;
+ pbl->pg_count = 0;
+ pbl->pg_size = 0;
+}
+
+static int bng_alloc_pbl(struct bng_re_res *res,
+ struct bng_re_pbl *pbl,
+ struct bng_re_sg_info *sginfo)
+{
+ struct pci_dev *pdev = res->pdev;
+ u32 pages;
+ int i;
+
+ if (sginfo->nopte)
+ return 0;
+ pages = sginfo->npages;
+
+ /* page ptr arrays */
+ pbl->pg_arr = vmalloc_array(pages, sizeof(void *));
+ if (!pbl->pg_arr)
+ return -ENOMEM;
+
+ pbl->pg_map_arr = vmalloc_array(pages, sizeof(dma_addr_t));
+ if (!pbl->pg_map_arr) {
+ vfree(pbl->pg_arr);
+ pbl->pg_arr = NULL;
+ return -ENOMEM;
+ }
+ pbl->pg_count = 0;
+ pbl->pg_size = sginfo->pgsize;
+
+ for (i = 0; i < pages; i++) {
+ pbl->pg_arr[i] = dma_alloc_coherent(&pdev->dev,
+ pbl->pg_size,
+ &pbl->pg_map_arr[i],
+ GFP_KERNEL);
+ if (!pbl->pg_arr[i])
+ goto fail;
+ pbl->pg_count++;
+ }
+
+ return 0;
+fail:
+ bng_free_pbl(res, pbl);
+ return -ENOMEM;
+}
+
+void bng_re_free_hwq(struct bng_re_res *res,
+ struct bng_re_hwq *hwq)
+{
+ int i;
+
+ if (!hwq->max_elements)
+ return;
+ if (hwq->level >= BNG_PBL_LVL_MAX)
+ return;
+
+ for (i = 0; i < hwq->level + 1; i++)
+ bng_free_pbl(res, &hwq->pbl[i]);
+
+ hwq->level = BNG_PBL_LVL_MAX;
+ hwq->max_elements = 0;
+ hwq->element_size = 0;
+ hwq->prod = 0;
+ hwq->cons = 0;
+}
+
+/* All HWQs are power of 2 in size */
+int bng_re_alloc_init_hwq(struct bng_re_hwq *hwq,
+ struct bng_re_hwq_attr *hwq_attr)
+{
+ u32 npages, pg_size;
+ struct bng_re_sg_info sginfo = {};
+ u32 depth, stride, npbl, npde;
+ dma_addr_t *src_phys_ptr, **dst_virt_ptr;
+ struct bng_re_res *res;
+ struct pci_dev *pdev;
+ int i, rc, lvl;
+
+ res = hwq_attr->res;
+ pdev = res->pdev;
+ pg_size = hwq_attr->sginfo->pgsize;
+ hwq->level = BNG_PBL_LVL_MAX;
+
+ depth = roundup_pow_of_two(hwq_attr->depth);
+ stride = roundup_pow_of_two(hwq_attr->stride);
+
+ npages = (depth * stride) / pg_size;
+ if ((depth * stride) % pg_size)
+ npages++;
+ if (!npages)
+ return -EINVAL;
+ hwq_attr->sginfo->npages = npages;
+
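+ /*
+ * Choose the indirection depth from the page count: a single page
+ * maps directly at LVL_0; up to MAX_PBL_LVL_1_PGS (512) pages need
+ * one PBL level; anything larger needs a PDE page plus PBL pages
+ * plus PTE pages at LVL_2.
+ */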
+ if (npages == MAX_PBL_LVL_0_PGS && !hwq_attr->sginfo->nopte) {
+ /* This request is Level 0, map PTE */
+ rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_0], hwq_attr->sginfo);
+ if (rc)
+ goto fail;
+ hwq->level = BNG_PBL_LVL_0;
+ goto done;
+ }
+
+ if (npages >= MAX_PBL_LVL_0_PGS) {
+ if (npages > MAX_PBL_LVL_1_PGS) {
+ u32 flag = PTU_PTE_VALID;
+ /* 2 levels of indirection */
+ npbl = npages >> MAX_PBL_LVL_1_PGS_SHIFT;
+ if (npages % BIT(MAX_PBL_LVL_1_PGS_SHIFT))
+ npbl++;
+ npde = npbl >> MAX_PDL_LVL_SHIFT;
+ if (npbl % BIT(MAX_PDL_LVL_SHIFT))
+ npde++;
+ /* Alloc PDE pages */
+ sginfo.pgsize = npde * pg_size;
+ sginfo.npages = 1;
+ rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_0], &sginfo);
+ if (rc)
+ goto fail;
+
+ /* Alloc PBL pages */
+ sginfo.npages = npbl;
+ sginfo.pgsize = PAGE_SIZE;
+ rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_1], &sginfo);
+ if (rc)
+ goto fail;
+ /* Fill PDL with PBL page pointers */
+ dst_virt_ptr =
+ (dma_addr_t **)hwq->pbl[BNG_PBL_LVL_0].pg_arr;
+ src_phys_ptr = hwq->pbl[BNG_PBL_LVL_1].pg_map_arr;
+ for (i = 0; i < hwq->pbl[BNG_PBL_LVL_1].pg_count; i++)
+ dst_virt_ptr[0][i] = src_phys_ptr[i] | flag;
+
+ /* Alloc or init PTEs */
+ rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_2],
+ hwq_attr->sginfo);
+ if (rc)
+ goto fail;
+ hwq->level = BNG_PBL_LVL_2;
+ if (hwq_attr->sginfo->nopte)
+ goto done;
+ /* Fill PBLs with PTE pointers */
+ dst_virt_ptr =
+ (dma_addr_t **)hwq->pbl[BNG_PBL_LVL_1].pg_arr;
+ src_phys_ptr = hwq->pbl[BNG_PBL_LVL_2].pg_map_arr;
+ for (i = 0; i < hwq->pbl[BNG_PBL_LVL_2].pg_count; i++) {
+ dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] =
+ src_phys_ptr[i] | PTU_PTE_VALID;
+ }
+ if (hwq_attr->type == BNG_HWQ_TYPE_QUEUE) {
+ /* Find the last pg of the size */
+ i = hwq->pbl[BNG_PBL_LVL_2].pg_count;
+ dst_virt_ptr[PTR_PG(i - 1)][PTR_IDX(i - 1)] |=
+ PTU_PTE_LAST;
+ if (i > 1)
+ dst_virt_ptr[PTR_PG(i - 2)]
+ [PTR_IDX(i - 2)] |=
+ PTU_PTE_NEXT_TO_LAST;
+ }
+ } else { /* pages < 512: npbl = 1, npde = 0 */
+ u32 flag = PTU_PTE_VALID;
+
+ /* 1 level of indirection */
+ npbl = npages >> MAX_PBL_LVL_1_PGS_SHIFT;
+ if (npages % BIT(MAX_PBL_LVL_1_PGS_SHIFT))
+ npbl++;
+ sginfo.npages = npbl;
+ sginfo.pgsize = PAGE_SIZE;
+ /* Alloc PBL page */
+ rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_0], &sginfo);
+ if (rc)
+ goto fail;
+ /* Alloc or init PTEs */
+ rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_1],
+ hwq_attr->sginfo);
+ if (rc)
+ goto fail;
+ hwq->level = BNG_PBL_LVL_1;
+ if (hwq_attr->sginfo->nopte)
+ goto done;
+ /* Fill PBL with PTE pointers */
+ dst_virt_ptr =
+ (dma_addr_t **)hwq->pbl[BNG_PBL_LVL_0].pg_arr;
+ src_phys_ptr = hwq->pbl[BNG_PBL_LVL_1].pg_map_arr;
+ for (i = 0; i < hwq->pbl[BNG_PBL_LVL_1].pg_count; i++)
+ dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] =
+ src_phys_ptr[i] | flag;
+ if (hwq_attr->type == BNG_HWQ_TYPE_QUEUE) {
+ /* Find the last pg of the size */
+ i = hwq->pbl[BNG_PBL_LVL_1].pg_count;
+ dst_virt_ptr[PTR_PG(i - 1)][PTR_IDX(i - 1)] |=
+ PTU_PTE_LAST;
+ if (i > 1)
+ dst_virt_ptr[PTR_PG(i - 2)]
+ [PTR_IDX(i - 2)] |=
+ PTU_PTE_NEXT_TO_LAST;
+ }
+ }
+ }
+done:
+ hwq->prod = 0;
+ hwq->cons = 0;
+ hwq->pdev = pdev;
+ hwq->depth = hwq_attr->depth;
+ hwq->max_elements = hwq->depth;
+ hwq->element_size = stride;
+ hwq->qe_ppg = pg_size / stride;
+ /* For direct access to the elements */
+ lvl = hwq->level;
+ if (hwq_attr->sginfo->nopte && hwq->level)
+ lvl = hwq->level - 1;
+ hwq->pbl_ptr = hwq->pbl[lvl].pg_arr;
+ hwq->pbl_dma_ptr = hwq->pbl[lvl].pg_map_arr;
+ spin_lock_init(&hwq->lock);
+
+ return 0;
+fail:
+ bng_re_free_hwq(res, hwq);
+ return -ENOMEM;
+}
diff --git a/drivers/infiniband/hw/bng_re/bng_res.h b/drivers/infiniband/hw/bng_re/bng_res.h
new file mode 100644
index 000000000000..9997f86d6a0e
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_res.h
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2025 Broadcom.
+
+#ifndef __BNG_RES_H__
+#define __BNG_RES_H__
+
+#include "roce_hsi.h"
+
+#define BNG_ROCE_FW_MAX_TIMEOUT 60
+
+#define PTR_CNT_PER_PG (PAGE_SIZE / sizeof(void *))
+#define PTR_MAX_IDX_PER_PG (PTR_CNT_PER_PG - 1)
+#define PTR_PG(x) (((x) & ~PTR_MAX_IDX_PER_PG) / PTR_CNT_PER_PG)
+#define PTR_IDX(x) ((x) & PTR_MAX_IDX_PER_PG)
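+/*
+ * On a 64-bit kernel with 4K pages, PTR_CNT_PER_PG is 512; e.g. index
+ * 513 resolves to page 1, slot 1 (PTR_PG(513) == 1, PTR_IDX(513) == 1).
+ */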
+
+#define HWQ_CMP(idx, hwq) ((idx) & ((hwq)->max_elements - 1))
+#define HWQ_FREE_SLOTS(hwq) (hwq->max_elements - \
+ ((HWQ_CMP(hwq->prod, hwq)\
+ - HWQ_CMP(hwq->cons, hwq))\
+ & (hwq->max_elements - 1)))
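+/*
+ * Worked example: with max_elements = 8, prod = 10 and cons = 6,
+ * HWQ_CMP() masks the indices to 2 and 6, so 4 slots are in use and
+ * HWQ_FREE_SLOTS() returns 8 - ((2 - 6) & 7) = 4.
+ */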
+
+#define MAX_PBL_LVL_0_PGS 1
+#define MAX_PBL_LVL_1_PGS 512
+#define MAX_PBL_LVL_1_PGS_SHIFT 9
+#define MAX_PBL_LVL_1_PGS_FOR_LVL_2 256
+#define MAX_PBL_LVL_2_PGS (256 * 512)
+#define MAX_PDL_LVL_SHIFT 9
+
+#define BNG_RE_DBR_VALID (0x1UL << 26)
+#define BNG_RE_DBR_EPOCH_SHIFT 24
+#define BNG_RE_DBR_TOGGLE_SHIFT 25
+
+#define BNG_MAX_TQM_ALLOC_REQ 48
+
+struct bng_re_reg_desc {
+ u8 bar_id;
+ resource_size_t bar_base;
+ unsigned long offset;
+ void __iomem *bar_reg;
+ size_t len;
+};
+
+struct bng_re_db_info {
+ void __iomem *db;
+ void __iomem *priv_db;
+ struct bng_re_hwq *hwq;
+ u32 xid;
+ u32 max_slot;
+ u32 flags;
+ u8 toggle;
+};
+
+enum bng_re_db_info_flags_mask {
+ BNG_RE_FLAG_EPOCH_CONS_SHIFT = 0x0UL,
+ BNG_RE_FLAG_EPOCH_PROD_SHIFT = 0x1UL,
+ BNG_RE_FLAG_EPOCH_CONS_MASK = 0x1UL,
+ BNG_RE_FLAG_EPOCH_PROD_MASK = 0x2UL,
+};
+
+enum bng_re_db_epoch_flag_shift {
+ BNG_RE_DB_EPOCH_CONS_SHIFT = BNG_RE_DBR_EPOCH_SHIFT,
+ BNG_RE_DB_EPOCH_PROD_SHIFT = (BNG_RE_DBR_EPOCH_SHIFT - 1),
+};
+
+struct bng_re_chip_ctx {
+ u16 chip_num;
+ u16 hw_stats_size;
+ u64 hwrm_intf_ver;
+ u16 hwrm_cmd_max_timeout;
+};
+
+struct bng_re_pbl {
+ u32 pg_count;
+ u32 pg_size;
+ void **pg_arr;
+ dma_addr_t *pg_map_arr;
+};
+
+enum bng_re_pbl_lvl {
+ BNG_PBL_LVL_0,
+ BNG_PBL_LVL_1,
+ BNG_PBL_LVL_2,
+ BNG_PBL_LVL_MAX
+};
+
+enum bng_re_hwq_type {
+ BNG_HWQ_TYPE_CTX,
+ BNG_HWQ_TYPE_QUEUE
+};
+
+struct bng_re_sg_info {
+ u32 npages;
+ u32 pgshft;
+ u32 pgsize;
+ bool nopte;
+};
+
+struct bng_re_hwq_attr {
+ struct bng_re_res *res;
+ struct bng_re_sg_info *sginfo;
+ enum bng_re_hwq_type type;
+ u32 depth;
+ u32 stride;
+ u32 aux_stride;
+ u32 aux_depth;
+};
+
+struct bng_re_hwq {
+ struct pci_dev *pdev;
+ /* lock to protect hwq */
+ spinlock_t lock;
+ struct bng_re_pbl pbl[BNG_PBL_LVL_MAX + 1];
+ /* Valid values: 0, 1, 2 */
+ enum bng_re_pbl_lvl level;
+ /* PBL entries */
+ void **pbl_ptr;
+ /* PBL dma_addr */
+ dma_addr_t *pbl_dma_ptr;
+ u32 max_elements;
+ u32 depth;
+ u16 element_size;
+ u32 prod;
+ u32 cons;
+ /* queue entry per page */
+ u16 qe_ppg;
+};
+
+struct bng_re_stats {
+ dma_addr_t dma_map;
+ void *dma;
+ u32 size;
+ u32 fw_id;
+};
+
+struct bng_re_res {
+ struct pci_dev *pdev;
+ struct bng_re_chip_ctx *cctx;
+ struct bng_re_dev_attr *dattr;
+};
+
+static inline void *bng_re_get_qe(struct bng_re_hwq *hwq,
+ u32 indx, u64 *pg)
+{
+ u32 pg_num, pg_idx;
+
+ pg_num = (indx / hwq->qe_ppg);
+ pg_idx = (indx % hwq->qe_ppg);
+ if (pg)
+ *pg = (u64)&hwq->pbl_ptr[pg_num];
+ return (void *)(hwq->pbl_ptr[pg_num] + hwq->element_size * pg_idx);
+}
+
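+/*
+ * Doorbell header layout (see the BNG_RE_DBR_* defines above): the
+ * upper 32 bits carry the xid, RoCE path, doorbell type and the valid
+ * bit (bit 26 of that word); the lower 32 bits carry the ring index,
+ * the epoch bit at bit 24 and, for CQ arm doorbells, the toggle bit
+ * at bit 25.
+ */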
+#define BNG_RE_INIT_DBHDR(xid, type, indx, toggle) \
+ (((u64)(((xid) & DBC_DBC_XID_MASK) | DBC_DBC_PATH_ROCE | \
+ (type) | BNG_RE_DBR_VALID) << 32) | (indx) | \
+ (((u32)(toggle)) << (BNG_RE_DBR_TOGGLE_SHIFT)))
+
+static inline void bng_re_ring_db(struct bng_re_db_info *info,
+ u32 type)
+{
+ u64 key = 0;
+ u32 indx;
+ u8 toggle = 0;
+
+ if (type == DBC_DBC_TYPE_CQ_ARMALL ||
+ type == DBC_DBC_TYPE_CQ_ARMSE)
+ toggle = info->toggle;
+
+ indx = (info->hwq->cons & DBC_DBC_INDEX_MASK) |
+ ((info->flags & BNG_RE_FLAG_EPOCH_CONS_MASK) <<
+ BNG_RE_DB_EPOCH_CONS_SHIFT);
+
+ key = BNG_RE_INIT_DBHDR(info->xid, type, indx, toggle);
+ writeq(key, info->db);
+}
+
+static inline void bng_re_ring_nq_db(struct bng_re_db_info *info,
+ struct bng_re_chip_ctx *cctx,
+ bool arm)
+{
+ u32 type;
+
+ type = arm ? DBC_DBC_TYPE_NQ_ARM : DBC_DBC_TYPE_NQ;
+ bng_re_ring_db(info, type);
+}
+
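+/*
+ * The consumer index wraps modulo max_elements; each wrap flips the
+ * epoch bit reported through the doorbell flags. E.g. with
+ * max_elements = 4, cons = 3 and cnt = 2, cons becomes 1 and the
+ * epoch bit toggles.
+ */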
+static inline void bng_re_hwq_incr_cons(u32 max_elements, u32 *cons, u32 cnt,
+ u32 *dbinfo_flags)
+{
+ /* move cons and update toggle/epoch if wrap around */
+ *cons += cnt;
+ if (*cons >= max_elements) {
+ *cons %= max_elements;
+ *dbinfo_flags ^= 1UL << BNG_RE_FLAG_EPOCH_CONS_SHIFT;
+ }
+}
+
+static inline bool _is_max_srq_ext_supported(u16 dev_cap_ext_flags_2)
+{
+ return !!(dev_cap_ext_flags_2 & CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED);
+}
+
+void bng_re_free_hwq(struct bng_re_res *res,
+ struct bng_re_hwq *hwq);
+
+int bng_re_alloc_init_hwq(struct bng_re_hwq *hwq,
+ struct bng_re_hwq_attr *hwq_attr);
+
+void bng_re_free_stats_ctx_mem(struct pci_dev *pdev,
+ struct bng_re_stats *stats);
+
+int bng_re_alloc_stats_ctx_mem(struct pci_dev *pdev,
+ struct bng_re_chip_ctx *cctx,
+ struct bng_re_stats *stats);
+#endif
diff --git a/drivers/infiniband/hw/bng_re/bng_sp.c b/drivers/infiniband/hw/bng_re/bng_sp.c
new file mode 100644
index 000000000000..83099e05328d
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_sp.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2025 Broadcom.
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+
+#include "bng_res.h"
+#include "bng_fw.h"
+#include "bng_sp.h"
+#include "bng_tlv.h"
+
+static bool bng_re_is_atomic_cap(struct bng_re_rcfw *rcfw)
+{
+ u16 pcie_ctl2 = 0;
+
+ pcie_capability_read_word(rcfw->pdev, PCI_EXP_DEVCTL2, &pcie_ctl2);
+ return (pcie_ctl2 & PCI_EXP_DEVCTL2_ATOMIC_REQ);
+}
+
+static void bng_re_query_version(struct bng_re_rcfw *rcfw,
+ char *fw_ver)
+{
+ struct creq_query_version_resp resp = {};
+ struct bng_re_cmdqmsg msg = {};
+ struct cmdq_query_version req = {};
+ int rc;
+
+ bng_re_rcfw_cmd_prep((struct cmdq_base *)&req,
+ CMDQ_BASE_OPCODE_QUERY_VERSION,
+ sizeof(req));
+
+ bng_re_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0);
+ rc = bng_re_rcfw_send_message(rcfw, &msg);
+ if (rc)
+ return;
+ fw_ver[0] = resp.fw_maj;
+ fw_ver[1] = resp.fw_minor;
+ fw_ver[2] = resp.fw_bld;
+ fw_ver[3] = resp.fw_rsvd;
+}
+
+int bng_re_get_dev_attr(struct bng_re_rcfw *rcfw)
+{
+ struct bng_re_dev_attr *attr = rcfw->res->dattr;
+ struct creq_query_func_resp resp = {};
+ struct bng_re_cmdqmsg msg = {};
+ struct creq_query_func_resp_sb *sb;
+ struct bng_re_rcfw_sbuf sbuf;
+ struct cmdq_query_func req = {};
+ u8 *tqm_alloc;
+ int i, rc;
+ u32 temp;
+
+ bng_re_rcfw_cmd_prep((struct cmdq_base *)&req,
+ CMDQ_BASE_OPCODE_QUERY_FUNC,
+ sizeof(req));
+
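+ /*
+ * QUERY_FUNC returns its attributes in a DMA side buffer; resp_size
+ * is expressed in BNG_FW_CMDQE_UNITS-sized chunks, matching the
+ * ALIGN() of the buffer below.
+ */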
+ sbuf.size = ALIGN(sizeof(*sb), BNG_FW_CMDQE_UNITS);
+ sbuf.sb = dma_alloc_coherent(&rcfw->pdev->dev, sbuf.size,
+ &sbuf.dma_addr, GFP_KERNEL);
+ if (!sbuf.sb)
+ return -ENOMEM;
+ sb = sbuf.sb;
+ req.resp_size = sbuf.size / BNG_FW_CMDQE_UNITS;
+ bng_re_fill_cmdqmsg(&msg, &req, &resp, &sbuf, sizeof(req),
+ sizeof(resp), 0);
+ rc = bng_re_rcfw_send_message(rcfw, &msg);
+ if (rc)
+ goto bail;
+ /* Extract the context from the side buffer */
+ attr->max_qp = le32_to_cpu(sb->max_qp);
+ /* max_qp value reported by FW doesn't include the QP1 */
+ attr->max_qp += 1;
+ attr->max_qp_rd_atom =
+ sb->max_qp_rd_atom > BNG_RE_MAX_OUT_RD_ATOM ?
+ BNG_RE_MAX_OUT_RD_ATOM : sb->max_qp_rd_atom;
+ attr->max_qp_init_rd_atom =
+ sb->max_qp_init_rd_atom > BNG_RE_MAX_OUT_RD_ATOM ?
+ BNG_RE_MAX_OUT_RD_ATOM : sb->max_qp_init_rd_atom;
+ attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr) - 1;
+
+ /* Adjust for max_qp_wqes for variable wqe */
+ attr->max_qp_wqes = min_t(u32, attr->max_qp_wqes, BNG_VAR_MAX_WQE - 1);
+
+ attr->max_qp_sges = min_t(u32, sb->max_sge_var_wqe, BNG_VAR_MAX_SGE);
+ attr->max_cq = le32_to_cpu(sb->max_cq);
+ attr->max_cq_wqes = le32_to_cpu(sb->max_cqe);
+ attr->max_cq_sges = attr->max_qp_sges;
+ attr->max_mr = le32_to_cpu(sb->max_mr);
+ attr->max_mw = le32_to_cpu(sb->max_mw);
+
+ attr->max_mr_size = le64_to_cpu(sb->max_mr_size);
+ attr->max_pd = 64 * 1024;
+ attr->max_raw_ethy_qp = le32_to_cpu(sb->max_raw_eth_qp);
+ attr->max_ah = le32_to_cpu(sb->max_ah);
+
+ attr->max_srq = le16_to_cpu(sb->max_srq);
+ attr->max_srq_wqes = le32_to_cpu(sb->max_srq_wr) - 1;
+ attr->max_srq_sges = sb->max_srq_sge;
+ attr->max_pkey = 1;
+ attr->max_inline_data = le32_to_cpu(sb->max_inline_data);
+ /*
+ * Read the max GID count supported by HW. Each entry in the HW
+ * GID table consumes two entries in the kernel GID table, so the
+ * max_gid reported to the stack can be twice the HW value, capped
+ * at BNG_RE_NUM_GIDS_SUPPORTED (256).
+ */
+ attr->max_sgid = le32_to_cpu(sb->max_gid);
+ attr->max_sgid = min_t(u32, BNG_RE_NUM_GIDS_SUPPORTED, 2 * attr->max_sgid);
+ attr->dev_cap_flags = le16_to_cpu(sb->dev_cap_flags);
+ attr->dev_cap_flags2 = le16_to_cpu(sb->dev_cap_ext_flags_2);
+
+ if (_is_max_srq_ext_supported(attr->dev_cap_flags2))
+ attr->max_srq += le16_to_cpu(sb->max_srq_ext);
+
+ bng_re_query_version(rcfw, attr->fw_ver);
+ for (i = 0; i < BNG_MAX_TQM_ALLOC_REQ / 4; i++) {
+ temp = le32_to_cpu(sb->tqm_alloc_reqs[i]);
+ tqm_alloc = (u8 *)&temp;
+ attr->tqm_alloc_reqs[i * 4] = *tqm_alloc;
+ attr->tqm_alloc_reqs[i * 4 + 1] = *(++tqm_alloc);
+ attr->tqm_alloc_reqs[i * 4 + 2] = *(++tqm_alloc);
+ attr->tqm_alloc_reqs[i * 4 + 3] = *(++tqm_alloc);
+ }
+
+ attr->max_dpi = le32_to_cpu(sb->max_dpi);
+ attr->is_atomic = bng_re_is_atomic_cap(rcfw);
+bail:
+ dma_free_coherent(&rcfw->pdev->dev, sbuf.size,
+ sbuf.sb, sbuf.dma_addr);
+ return rc;
+}
diff --git a/drivers/infiniband/hw/bng_re/bng_sp.h b/drivers/infiniband/hw/bng_re/bng_sp.h
new file mode 100644
index 000000000000..e15190515ed1
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_sp.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2025 Broadcom.
+
+#ifndef __BNG_SP_H__
+#define __BNG_SP_H__
+
+#include "bng_fw.h"
+
+#define BNG_VAR_MAX_WQE 4352
+#define BNG_VAR_MAX_SGE 13
+
+struct bng_re_dev_attr {
+#define FW_VER_ARR_LEN 4
+ u8 fw_ver[FW_VER_ARR_LEN];
+#define BNG_RE_NUM_GIDS_SUPPORTED 256
+ u16 max_sgid;
+ u16 max_mrw;
+ u32 max_qp;
+#define BNG_RE_MAX_OUT_RD_ATOM 126
+ u32 max_qp_rd_atom;
+ u32 max_qp_init_rd_atom;
+ u32 max_qp_wqes;
+ u32 max_qp_sges;
+ u32 max_cq;
+ u32 max_cq_wqes;
+ u32 max_cq_sges;
+ u32 max_mr;
+ u64 max_mr_size;
+ u32 max_pd;
+ u32 max_mw;
+ u32 max_raw_ethy_qp;
+ u32 max_ah;
+ u32 max_srq;
+ u32 max_srq_wqes;
+ u32 max_srq_sges;
+ u32 max_pkey;
+ u32 max_inline_data;
+ u32 l2_db_size;
+ u8 tqm_alloc_reqs[BNG_MAX_TQM_ALLOC_REQ];
+ bool is_atomic;
+ u16 dev_cap_flags;
+ u16 dev_cap_flags2;
+ u32 max_dpi;
+};
+
+int bng_re_get_dev_attr(struct bng_re_rcfw *rcfw);
+#endif
diff --git a/drivers/infiniband/hw/bng_re/bng_tlv.h b/drivers/infiniband/hw/bng_re/bng_tlv.h
new file mode 100644
index 000000000000..278f4922962d
--- /dev/null
+++ b/drivers/infiniband/hw/bng_re/bng_tlv.h
@@ -0,0 +1,128 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+
+#ifndef __BNG_TLV_H__
+#define __BNG_TLV_H__
+
+#include "roce_hsi.h"
+
+struct roce_tlv {
+ struct tlv tlv;
+ u8 total_size; // in units of 16 byte chunks
+ u8 unused[7]; // for 16 byte alignment
+};
+
+/*
+ * TLV size in units of 16 byte chunks
+ */
+#define TLV_SIZE ((sizeof(struct roce_tlv) + 15) / 16)
+/*
+ * TLV length in bytes
+ */
+#define TLV_BYTES (TLV_SIZE * 16)
+
+#define HAS_TLV_HEADER(msg) (le16_to_cpu(((struct tlv *)(msg))->cmd_discr) == CMD_DISCR_TLV_ENCAP)
+#define GET_TLV_DATA(tlv) ((void *)&((uint8_t *)(tlv))[TLV_BYTES])
+
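+/*
+ * A TLV-encapsulated command is laid out as a struct roce_tlv header
+ * (TLV_BYTES, padded to a 16-byte boundary) followed by the real
+ * cmdq_base payload; the accessors below transparently reach through
+ * the header when one is present.
+ */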
+static inline u8 __get_cmdq_base_opcode(struct cmdq_base *req, u32 size)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ return ((struct cmdq_base *)GET_TLV_DATA(req))->opcode;
+ else
+ return req->opcode;
+}
+
+static inline void __set_cmdq_base_opcode(struct cmdq_base *req,
+ u32 size, u8 val)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ ((struct cmdq_base *)GET_TLV_DATA(req))->opcode = val;
+ else
+ req->opcode = val;
+}
+
+static inline __le16 __get_cmdq_base_cookie(struct cmdq_base *req, u32 size)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ return ((struct cmdq_base *)GET_TLV_DATA(req))->cookie;
+ else
+ return req->cookie;
+}
+
+static inline void __set_cmdq_base_cookie(struct cmdq_base *req,
+ u32 size, __le16 val)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ ((struct cmdq_base *)GET_TLV_DATA(req))->cookie = val;
+ else
+ req->cookie = val;
+}
+
+static inline __le64 __get_cmdq_base_resp_addr(struct cmdq_base *req, u32 size)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ return ((struct cmdq_base *)GET_TLV_DATA(req))->resp_addr;
+ else
+ return req->resp_addr;
+}
+
+static inline void __set_cmdq_base_resp_addr(struct cmdq_base *req,
+ u32 size, __le64 val)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ ((struct cmdq_base *)GET_TLV_DATA(req))->resp_addr = val;
+ else
+ req->resp_addr = val;
+}
+
+static inline u8 __get_cmdq_base_resp_size(struct cmdq_base *req, u32 size)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ return ((struct cmdq_base *)GET_TLV_DATA(req))->resp_size;
+ else
+ return req->resp_size;
+}
+
+static inline void __set_cmdq_base_resp_size(struct cmdq_base *req,
+ u32 size, u8 val)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ ((struct cmdq_base *)GET_TLV_DATA(req))->resp_size = val;
+ else
+ req->resp_size = val;
+}
+
+static inline u8 __get_cmdq_base_cmd_size(struct cmdq_base *req, u32 size)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ return ((struct roce_tlv *)(req))->total_size;
+ else
+ return req->cmd_size;
+}
+
+static inline void __set_cmdq_base_cmd_size(struct cmdq_base *req,
+ u32 size, u8 val)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ ((struct cmdq_base *)GET_TLV_DATA(req))->cmd_size = val;
+ else
+ req->cmd_size = val;
+}
+
+static inline __le16 __get_cmdq_base_flags(struct cmdq_base *req, u32 size)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ return ((struct cmdq_base *)GET_TLV_DATA(req))->flags;
+ else
+ return req->flags;
+}
+
+static inline void __set_cmdq_base_flags(struct cmdq_base *req,
+ u32 size, __le16 val)
+{
+ if (HAS_TLV_HEADER(req) && size > TLV_BYTES)
+ ((struct cmdq_base *)GET_TLV_DATA(req))->flags = val;
+ else
+ req->flags = val;
+}
+
+#endif /* __BNG_TLV_H__ */
diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
index 3485e495ac6a..3a7ce4729fcf 100644
--- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
@@ -224,6 +224,8 @@ struct bnxt_re_dev {
struct workqueue_struct *dcb_wq;
struct dentry *cc_config;
struct bnxt_re_dbg_cc_config_params *cc_config_params;
+ struct dentry *cq_coal_cfg;
+ struct bnxt_re_dbg_cq_coal_params *cq_coal_cfg_params;
#define BNXT_VPD_FLD_LEN 32
char board_partno[BNXT_VPD_FLD_LEN];
/* RoCE mirror */
diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.c b/drivers/infiniband/hw/bnxt_re/debugfs.c
index be5e9b5ca2f0..88817c86ae24 100644
--- a/drivers/infiniband/hw/bnxt_re/debugfs.c
+++ b/drivers/infiniband/hw/bnxt_re/debugfs.c
@@ -23,6 +23,14 @@
static struct dentry *bnxt_re_debugfs_root;
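+
+/*
+ * Per-device CQ coalescing debugfs file names; the order must match
+ * enum bnxt_re_cq_coal_types, since each file's index doubles as the
+ * field offset used by the show/write handlers below. As a usage
+ * sketch (the exact debugfs root path is setup-dependent):
+ * echo 1 > /sys/kernel/debug/bnxt_re/<dev>/cq_coal_cfg/enable
+ */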
+static const char * const bnxt_re_cq_coal_str[] = {
+ "buf_maxtime",
+ "normal_maxbuf",
+ "during_maxbuf",
+ "en_ring_idle_mode",
+ "enable",
+};
+
static const char * const bnxt_re_cc_gen0_name[] = {
"enable_cc",
"run_avg_weight_g",
@@ -349,6 +357,123 @@ static void bnxt_re_debugfs_add_info(struct bnxt_re_dev *rdev)
debugfs_create_file("info", 0400, rdev->dbg_root, rdev, &info_fops);
}
+static ssize_t cq_coal_cfg_write(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct seq_file *s = file->private_data;
+ struct bnxt_re_cq_coal_param *param = s->private;
+ struct bnxt_re_dev *rdev = param->rdev;
+ int offset = param->offset;
+ char lbuf[16] = { };
+ u32 val;
+
+ if (count >= sizeof(lbuf))
+ return -EINVAL;
+
+ if (copy_from_user(lbuf, buf, count))
+ return -EFAULT;
+
+ lbuf[sizeof(lbuf) - 1] = '\0';
+
+ if (kstrtou32(lbuf, 0, &val))
+ return -EINVAL;
+
+ switch (offset) {
+ case BNXT_RE_COAL_CQ_BUF_MAXTIME:
+ if (val < 1 || val > BNXT_QPLIB_CQ_COAL_MAX_BUF_MAXTIME)
+ return -EINVAL;
+ rdev->cq_coalescing.buf_maxtime = val;
+ break;
+ case BNXT_RE_COAL_CQ_NORMAL_MAXBUF:
+ if (val < 1 || val > BNXT_QPLIB_CQ_COAL_MAX_NORMAL_MAXBUF)
+ return -EINVAL;
+ rdev->cq_coalescing.normal_maxbuf = val;
+ break;
+ case BNXT_RE_COAL_CQ_DURING_MAXBUF:
+ if (val < 1 || val > BNXT_QPLIB_CQ_COAL_MAX_DURING_MAXBUF)
+ return -EINVAL;
+ rdev->cq_coalescing.during_maxbuf = val;
+ break;
+ case BNXT_RE_COAL_CQ_EN_RING_IDLE_MODE:
+ if (val > BNXT_QPLIB_CQ_COAL_MAX_EN_RING_IDLE_MODE)
+ return -EINVAL;
+ rdev->cq_coalescing.en_ring_idle_mode = val;
+ break;
+ case BNXT_RE_COAL_CQ_ENABLE:
+ if (val > 1)
+ return -EINVAL;
+ rdev->cq_coalescing.enable = val;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return count;
+}
+
+static int cq_coal_cfg_show(struct seq_file *s, void *unused)
+{
+ struct bnxt_re_cq_coal_param *param = s->private;
+ struct bnxt_re_dev *rdev = param->rdev;
+ int offset = param->offset;
+ u32 val = 0;
+
+ switch (offset) {
+ case BNXT_RE_COAL_CQ_BUF_MAXTIME:
+ val = rdev->cq_coalescing.buf_maxtime;
+ break;
+ case BNXT_RE_COAL_CQ_NORMAL_MAXBUF:
+ val = rdev->cq_coalescing.normal_maxbuf;
+ break;
+ case BNXT_RE_COAL_CQ_DURING_MAXBUF:
+ val = rdev->cq_coalescing.during_maxbuf;
+ break;
+ case BNXT_RE_COAL_CQ_EN_RING_IDLE_MODE:
+ val = rdev->cq_coalescing.en_ring_idle_mode;
+ break;
+ case BNXT_RE_COAL_CQ_ENABLE:
+ val = rdev->cq_coalescing.enable;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ seq_printf(s, "%u\n", val);
+ return 0;
+}
+DEFINE_SHOW_STORE_ATTRIBUTE(cq_coal_cfg);
+
+static void bnxt_re_cleanup_cq_coal_debugfs(struct bnxt_re_dev *rdev)
+{
+ debugfs_remove_recursive(rdev->cq_coal_cfg);
+ kfree(rdev->cq_coal_cfg_params);
+}
+
+static void bnxt_re_init_cq_coal_debugfs(struct bnxt_re_dev *rdev)
+{
+ struct bnxt_re_dbg_cq_coal_params *dbg_cq_coal_params;
+ int i;
+
+ if (!_is_cq_coalescing_supported(rdev->dev_attr->dev_cap_flags2))
+ return;
+
+ dbg_cq_coal_params = kzalloc(sizeof(*dbg_cq_coal_params), GFP_KERNEL);
+ if (!dbg_cq_coal_params)
+ return;
+
+ rdev->cq_coal_cfg = debugfs_create_dir("cq_coal_cfg", rdev->dbg_root);
+ rdev->cq_coal_cfg_params = dbg_cq_coal_params;
+
+ for (i = 0; i < BNXT_RE_COAL_CQ_MAX; i++) {
+ dbg_cq_coal_params->params[i].offset = i;
+ dbg_cq_coal_params->params[i].rdev = rdev;
+ debugfs_create_file(bnxt_re_cq_coal_str[i],
+ 0600, rdev->cq_coal_cfg,
+ &dbg_cq_coal_params->params[i],
+ &cq_coal_cfg_fops);
+ }
+}
+
void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev)
{
struct pci_dev *pdev = rdev->en_dev->pdev;
@@ -374,10 +499,13 @@ void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev)
rdev->cc_config, tmp_params,
&bnxt_re_cc_config_ops);
}
+
+ bnxt_re_init_cq_coal_debugfs(rdev);
}
void bnxt_re_debugfs_rem_pdev(struct bnxt_re_dev *rdev)
{
+ bnxt_re_cleanup_cq_coal_debugfs(rdev);
debugfs_remove_recursive(rdev->qp_debugfs);
debugfs_remove_recursive(rdev->cc_config);
kfree(rdev->cc_config_params);
diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.h b/drivers/infiniband/hw/bnxt_re/debugfs.h
index 8f101df4e838..98f4620ef245 100644
--- a/drivers/infiniband/hw/bnxt_re/debugfs.h
+++ b/drivers/infiniband/hw/bnxt_re/debugfs.h
@@ -33,4 +33,23 @@ struct bnxt_re_cc_param {
struct bnxt_re_dbg_cc_config_params {
struct bnxt_re_cc_param gen0_parms[BNXT_RE_CC_PARAM_GEN0];
};
+
+struct bnxt_re_cq_coal_param {
+ struct bnxt_re_dev *rdev;
+ u32 offset;
+};
+
+enum bnxt_re_cq_coal_types {
+ BNXT_RE_COAL_CQ_BUF_MAXTIME,
+ BNXT_RE_COAL_CQ_NORMAL_MAXBUF,
+ BNXT_RE_COAL_CQ_DURING_MAXBUF,
+ BNXT_RE_COAL_CQ_EN_RING_IDLE_MODE,
+ BNXT_RE_COAL_CQ_ENABLE,
+ BNXT_RE_COAL_CQ_MAX
+};
+
+struct bnxt_re_dbg_cq_coal_params {
+ struct bnxt_re_cq_coal_param params[BNXT_RE_COAL_CQ_MAX];
+};
#endif
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 84ce3fce2826..f19b55c13d58 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -601,7 +601,8 @@ static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd)
mr->qplib_mr.va = (u64)(unsigned long)fence->va;
mr->qplib_mr.total_size = BNXT_RE_FENCE_BYTES;
rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, NULL,
- BNXT_RE_FENCE_PBL_SIZE, PAGE_SIZE);
+ BNXT_RE_FENCE_PBL_SIZE, PAGE_SIZE,
+ _is_alloc_mr_unified(rdev->dev_attr->dev_cap_flags));
if (rc) {
ibdev_err(&rdev->ibdev, "Failed to register fence-MR\n");
goto fail;
@@ -4027,7 +4028,7 @@ struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *ib_pd, int mr_access_flags)
mr->qplib_mr.hwq.level = PBL_LVL_MAX;
mr->qplib_mr.total_size = -1; /* Infinte length */
rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, NULL, 0,
- PAGE_SIZE);
+ PAGE_SIZE, false);
if (rc)
goto fail_mr;
@@ -4257,7 +4258,8 @@ static struct ib_mr *__bnxt_re_user_reg_mr(struct ib_pd *ib_pd, u64 length, u64
umem_pgs = ib_umem_num_dma_blocks(umem, page_size);
rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, umem,
- umem_pgs, page_size);
+ umem_pgs, page_size,
+ _is_alloc_mr_unified(rdev->dev_attr->dev_cap_flags));
if (rc) {
ibdev_err(&rdev->ibdev, "Failed to register user MR - rc = %d\n", rc);
rc = -EIO;
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index b13810572c2e..73003ad25ee8 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -1453,6 +1453,7 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct auxiliary_device *adev,
atomic_set(&rdev->stats.res.pd_count, 0);
rdev->cosq[0] = 0xFFFF;
rdev->cosq[1] = 0xFFFF;
+ rdev->cq_coalescing.enable = 1;
rdev->cq_coalescing.buf_maxtime = BNXT_QPLIB_CQ_COAL_DEF_BUF_MAXTIME;
if (bnxt_re_chip_gen_p7(en_dev->chip_num)) {
rdev->cq_coalescing.normal_maxbuf = BNXT_QPLIB_CQ_COAL_DEF_NORMAL_MAXBUF_P7;
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
index ce90d3d834d4..c88f049136fc 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
@@ -2226,7 +2226,8 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
req.cq_handle = cpu_to_le64(cq->cq_handle);
req.cq_size = cpu_to_le32(cq->max_wqe);
- if (_is_cq_coalescing_supported(res->dattr->dev_cap_flags2)) {
+ if (_is_cq_coalescing_supported(res->dattr->dev_cap_flags2) &&
+ cq->coalescing->enable) {
req.flags |= cpu_to_le16(CMDQ_CREATE_CQ_FLAGS_COALESCING_VALID);
coalescing |= ((cq->coalescing->buf_maxtime <<
CMDQ_CREATE_CQ_BUF_MAXTIME_SFT) &
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
index b990d0c0ce1a..1b414a73b46d 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
@@ -395,6 +395,7 @@ struct bnxt_qplib_cq_coal_param {
u8 normal_maxbuf;
u8 during_maxbuf;
u8 en_ring_idle_mode;
+ u8 enable;
};
#define BNXT_QPLIB_CQ_COAL_DEF_BUF_MAXTIME 0x1
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
index 9ef581ed785c..408a34df2667 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
@@ -162,7 +162,7 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw)
attr->max_srq_wqes = le32_to_cpu(sb->max_srq_wr) - 1;
attr->max_srq_sges = sb->max_srq_sge;
attr->max_pkey = 1;
- attr->max_inline_data = le32_to_cpu(sb->max_inline_data);
+ attr->max_inline_data = attr->max_qp_sges * sizeof(struct sq_sge);
if (!bnxt_qplib_is_chip_gen_p7(rcfw->res->cctx))
attr->l2_db_size = (sb->l2_db_space_size + 1) *
(0x01 << RCFW_DBR_BASE_PAGE_SHIFT);
@@ -578,7 +578,7 @@ int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw,
}
int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
- struct ib_umem *umem, int num_pbls, u32 buf_pg_size)
+ struct ib_umem *umem, int num_pbls, u32 buf_pg_size, bool unified_mr)
{
struct bnxt_qplib_rcfw *rcfw = res->rcfw;
struct bnxt_qplib_hwq_attr hwq_attr = {};
@@ -640,7 +640,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
req.access = (mr->access_flags & BNXT_QPLIB_MR_ACCESS_MASK);
req.va = cpu_to_le64(mr->va);
req.key = cpu_to_le32(mr->lkey);
- if (_is_alloc_mr_unified(res->dattr->dev_cap_flags))
+ if (unified_mr)
req.key = cpu_to_le32(mr->pd->id);
req.flags = cpu_to_le16(mr->flags);
req.mr_size = cpu_to_le64(mr->total_size);
@@ -651,7 +651,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
if (rc)
goto fail;
- if (_is_alloc_mr_unified(res->dattr->dev_cap_flags)) {
+ if (unified_mr) {
mr->lkey = le32_to_cpu(resp.xid);
mr->rkey = mr->lkey;
}
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h
index 147b5d9c0313..5a45c55c6464 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h
@@ -341,7 +341,7 @@ int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res,
int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw,
bool block);
int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
- struct ib_umem *umem, int num_pbls, u32 buf_pg_size);
+ struct ib_umem *umem, int num_pbls, u32 buf_pg_size, bool unified_mr);
int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr);
int bnxt_qplib_alloc_fast_reg_mr(struct bnxt_qplib_res *res,
struct bnxt_qplib_mrw *mr, int max);
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index dcdfe250bdbe..adeed7447e7b 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -348,7 +348,7 @@ static int write_pbl(struct c4iw_rdev *rdev, __be64 *pbl,
{
int err;
- pr_debug("*pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n",
+ pr_debug("*pbl_addr 0x%x, pbl_base 0x%x, pbl_size %d\n",
pbl_addr, rdev->lldi.vr->pbl.start,
pbl_size);
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index b35f92e7d865..e4aef102dac0 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -745,8 +745,8 @@ static int create_workqueues(struct hfi1_devdata *dd)
ppd->hfi1_wq =
alloc_workqueue(
"hfi%d_%d",
- WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
- WQ_MEM_RECLAIM,
+ WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM |
+ WQ_PERCPU,
HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES,
dd->unit, pidx);
if (!ppd->hfi1_wq)
diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c
index 370a5a8eaa71..6e0e3458d202 100644
--- a/drivers/infiniband/hw/hfi1/opfn.c
+++ b/drivers/infiniband/hw/hfi1/opfn.c
@@ -305,8 +305,8 @@ void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1)
int opfn_init(void)
{
opfn_wq = alloc_workqueue("hfi_opfn",
- WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
- WQ_MEM_RECLAIM,
+ WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM |
+ WQ_PERCPU,
HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES);
if (!opfn_wq)
return -ENOMEM;
diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile
index baf592e6f21b..d07ef02c5231 100644
--- a/drivers/infiniband/hw/hns/Makefile
+++ b/drivers/infiniband/hw/hns/Makefile
@@ -4,11 +4,13 @@
#
ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3
+ccflags-y += -I $(srctree)/drivers/net/ethernet/hisilicon/hns3/hns3pf
+ccflags-y += -I $(srctree)/drivers/net/ethernet/hisilicon/hns3/hns3_common
ccflags-y += -I $(src)
hns-roce-hw-v2-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \
hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \
hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o hns_roce_restrack.o \
- hns_roce_debugfs.o hns_roce_hw_v2.o
+ hns_roce_debugfs.o hns_roce_hw_v2.o hns_roce_bond.o
obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns-roce-hw-v2.o
diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c
index 307c35888b30..0c1c32d23c88 100644
--- a/drivers/infiniband/hw/hns/hns_roce_ah.c
+++ b/drivers/infiniband/hw/hns/hns_roce_ah.c
@@ -30,7 +30,6 @@
* SOFTWARE.
*/
-#include <linux/pci.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
#include "hns_roce_device.h"
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c
new file mode 100644
index 000000000000..cc85f3ce1f3e
--- /dev/null
+++ b/drivers/infiniband/hw/hns/hns_roce_bond.c
@@ -0,0 +1,1012 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (c) 2025 Hisilicon Limited.
+ */
+
+#include <net/lag.h>
+#include <net/bonding.h>
+#include "hns_roce_device.h"
+#include "hns_roce_hw_v2.h"
+#include "hns_roce_bond.h"
+
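+/*
+ * Bond bookkeeping is per IO die, keyed by PCI bus number: each entry
+ * in this xarray is a hns_roce_die_info holding up to ROCE_BOND_NUM_MAX
+ * bond groups for the PFs on that bus.
+ */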
+static DEFINE_XARRAY(roce_bond_xa);
+
+static struct hns_roce_dev *hns_roce_get_hrdev_by_netdev(struct net_device *net_dev)
+{
+ struct ib_device *ibdev =
+ ib_device_get_by_netdev(net_dev, RDMA_DRIVER_HNS);
+
+ if (!ibdev)
+ return NULL;
+
+ return container_of(ibdev, struct hns_roce_dev, ib_dev);
+}
+
+static struct net_device *get_upper_dev_from_ndev(struct net_device *net_dev)
+{
+ struct net_device *upper_dev;
+
+ rcu_read_lock();
+ upper_dev = netdev_master_upper_dev_get_rcu(net_dev);
+ dev_hold(upper_dev);
+ rcu_read_unlock();
+
+ return upper_dev;
+}
+
+static int get_netdev_bond_slave_id(struct net_device *net_dev,
+ struct hns_roce_bond_group *bond_grp)
+{
+ int i;
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++)
+ if (net_dev == bond_grp->bond_func_info[i].net_dev)
+ return i;
+
+ return -ENOENT;
+}
+
+struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev,
+ u8 bus_num)
+{
+ struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num);
+ struct hns_roce_bond_group *bond_grp;
+ struct net_device *upper_dev = NULL;
+ int i;
+
+ if (!die_info)
+ return NULL;
+
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ bond_grp = die_info->bgrps[i];
+ if (!bond_grp)
+ continue;
+ if (get_netdev_bond_slave_id(net_dev, bond_grp) >= 0)
+ return bond_grp;
+ if (bond_grp->upper_dev) {
+ upper_dev = get_upper_dev_from_ndev(net_dev);
+ if (bond_grp->upper_dev == upper_dev) {
+ dev_put(upper_dev);
+ return bond_grp;
+ }
+ dev_put(upper_dev);
+ }
+ }
+
+ return NULL;
+}
+
+static int hns_roce_set_bond_netdev(struct hns_roce_bond_group *bond_grp,
+ struct hns_roce_dev *hr_dev)
+{
+ struct net_device *active_dev;
+ struct net_device *old_dev;
+ int i, ret = 0;
+
+ if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
+ rcu_read_lock();
+ active_dev =
+ bond_option_active_slave_get_rcu(netdev_priv(bond_grp->upper_dev));
+ rcu_read_unlock();
+ } else {
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ active_dev = bond_grp->bond_func_info[i].net_dev;
+ if (active_dev &&
+ ib_get_curr_port_state(active_dev) == IB_PORT_ACTIVE)
+ break;
+ }
+ /* no active slave found */
+ if (i == ROCE_BOND_FUNC_MAX)
+ active_dev = NULL;
+ }
+
+ if (!active_dev)
+ active_dev = get_hr_netdev(hr_dev, 0);
+
+ old_dev = ib_device_get_netdev(&hr_dev->ib_dev, 1);
+ if (old_dev == active_dev)
+ goto out;
+
+ ret = ib_device_set_netdev(&hr_dev->ib_dev, active_dev, 1);
+ if (ret) {
+ dev_err(hr_dev->dev, "failed to set netdev for bond.\n");
+ goto out;
+ }
+
+ if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
+ if (old_dev)
+ roce_del_all_netdev_gids(&hr_dev->ib_dev, 1, old_dev);
+ rdma_roce_rescan_port(&hr_dev->ib_dev, 1);
+ }
+out:
+ dev_put(old_dev);
+ return ret;
+}
+
+bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev)
+{
+ struct net_device *net_dev = get_hr_netdev(hr_dev, 0);
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num = get_hr_bus_num(hr_dev);
+
+ bond_grp = hns_roce_get_bond_grp(net_dev, bus_num);
+ if (bond_grp && bond_grp->bond_state != HNS_ROCE_BOND_NOT_BONDED &&
+ bond_grp->bond_state != HNS_ROCE_BOND_NOT_ATTACHED)
+ return true;
+
+ return false;
+}
+
+static void hns_roce_bond_get_active_slave(struct hns_roce_bond_group *bond_grp)
+{
+ struct net_device *net_dev;
+ u32 active_slave_map = 0;
+ u8 active_slave_num = 0;
+ bool active;
+ u8 i;
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ net_dev = bond_grp->bond_func_info[i].net_dev;
+ if (!net_dev || !(bond_grp->slave_map & (1U << i)))
+ continue;
+
+ active = (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) ?
+ net_lag_port_dev_txable(net_dev) :
+ (ib_get_curr_port_state(net_dev) == IB_PORT_ACTIVE);
+ if (active) {
+ active_slave_num++;
+ active_slave_map |= (1U << i);
+ }
+ }
+
+ bond_grp->active_slave_num = active_slave_num;
+ bond_grp->active_slave_map = active_slave_map;
+}
+
+static int hns_roce_recover_bond(struct hns_roce_bond_group *bond_grp,
+ struct hns_roce_dev *hr_dev)
+{
+ bond_grp->main_hr_dev = hr_dev;
+ hns_roce_bond_get_active_slave(bond_grp);
+
+ return hns_roce_cmd_bond(bond_grp, HNS_ROCE_SET_BOND);
+}
+
+static void hns_roce_slave_uninit(struct hns_roce_bond_group *bond_grp,
+ u8 func_idx)
+{
+ struct hnae3_handle *handle;
+
+ handle = bond_grp->bond_func_info[func_idx].handle;
+ if (handle && handle->priv)
+ hns_roce_bond_uninit_client(bond_grp, func_idx);
+}
+
+static struct hns_roce_dev
+ *hns_roce_slave_init(struct hns_roce_bond_group *bond_grp,
+ u8 func_idx, bool need_switch);
+
+static int switch_main_dev(struct hns_roce_bond_group *bond_grp,
+ u8 main_func_idx)
+{
+ struct hns_roce_dev *hr_dev;
+ struct net_device *net_dev;
+ u8 i;
+
+ bond_grp->main_hr_dev = NULL;
+ hns_roce_bond_uninit_client(bond_grp, main_func_idx);
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ net_dev = bond_grp->bond_func_info[i].net_dev;
+ if ((bond_grp->slave_map & (1U << i)) && net_dev) {
+ /* In case this slave is still being registered as
+ * a non-bonded PF, uninit it first and then re-init
+ * it as the main device.
+ */
+ hns_roce_slave_uninit(bond_grp, i);
+ hr_dev = hns_roce_slave_init(bond_grp, i, false);
+ if (hr_dev) {
+ bond_grp->main_hr_dev = hr_dev;
+ break;
+ }
+ }
+ }
+
+ if (!bond_grp->main_hr_dev)
+ return -ENODEV;
+
+ return 0;
+}
+
+static struct hns_roce_dev
+ *hns_roce_slave_init(struct hns_roce_bond_group *bond_grp,
+ u8 func_idx, bool need_switch)
+{
+ struct hns_roce_dev *hr_dev = NULL;
+ struct hnae3_handle *handle;
+ u8 main_func_idx;
+ int ret;
+
+ if (need_switch) {
+ main_func_idx = PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn);
+ if (func_idx == main_func_idx) {
+ ret = switch_main_dev(bond_grp, main_func_idx);
+ if (ret == -ENODEV)
+ return NULL;
+ }
+ }
+
+ handle = bond_grp->bond_func_info[func_idx].handle;
+ if (handle) {
+ if (handle->priv)
+ return handle->priv;
+ /* Prevent this device from being initialized as a bond device */
+ if (need_switch)
+ bond_grp->bond_func_info[func_idx].net_dev = NULL;
+ hr_dev = hns_roce_bond_init_client(bond_grp, func_idx);
+ if (!hr_dev)
+ BOND_ERR_LOG("failed to init slave %u.\n", func_idx);
+ }
+
+ return hr_dev;
+}
+
+static struct hns_roce_die_info *alloc_die_info(int bus_num)
+{
+ struct hns_roce_die_info *die_info;
+ int ret;
+
+ die_info = kzalloc(sizeof(*die_info), GFP_KERNEL);
+ if (!die_info)
+ return NULL;
+
+ /* Fully initialize before publishing in the xarray */
+ mutex_init(&die_info->die_mutex);
+
+ ret = xa_err(xa_store(&roce_bond_xa, bus_num, die_info, GFP_KERNEL));
+ if (ret) {
+ mutex_destroy(&die_info->die_mutex);
+ kfree(die_info);
+ return NULL;
+ }
+
+ return die_info;
+}
+
+static void dealloc_die_info(struct hns_roce_die_info *die_info, u8 bus_num)
+{
+ mutex_destroy(&die_info->die_mutex);
+ xa_erase(&roce_bond_xa, bus_num);
+ kfree(die_info);
+}
+
+static int alloc_bond_id(struct hns_roce_bond_group *bond_grp)
+{
+ u8 bus_num = bond_grp->bus_num;
+ struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num);
+ int i;
+
+ if (!die_info) {
+ die_info = alloc_die_info(bus_num);
+ if (!die_info)
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ if (die_info->bond_id_mask & BOND_ID(i))
+ continue;
+
+ die_info->bond_id_mask |= BOND_ID(i);
+ die_info->bgrps[i] = bond_grp;
+ bond_grp->bond_id = i;
+
+ return 0;
+ }
+
+ return -ENOSPC;
+}
+
+static int remove_bond_id(int bus_num, u8 bond_id)
+{
+ struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num);
+
+ if (bond_id >= ROCE_BOND_NUM_MAX)
+ return -EINVAL;
+
+ if (!die_info)
+ return -ENODEV;
+
+ die_info->bond_id_mask &= ~BOND_ID(bond_id);
+ die_info->bgrps[bond_id] = NULL;
+ if (!die_info->bond_id_mask)
+ dealloc_die_info(die_info, bus_num);
+
+ return 0;
+}
+
+static void hns_roce_set_bond(struct hns_roce_bond_group *bond_grp)
+{
+ struct hns_roce_dev *hr_dev;
+ int ret;
+ int i;
+
+ for (i = ROCE_BOND_FUNC_MAX - 1; i >= 0; i--) {
+ if (bond_grp->slave_map & (1 << i))
+ hns_roce_slave_uninit(bond_grp, i);
+ }
+
+ mutex_lock(&bond_grp->bond_mutex);
+ bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED;
+ mutex_unlock(&bond_grp->bond_mutex);
+ bond_grp->main_hr_dev = NULL;
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ if (bond_grp->slave_map & (1 << i)) {
+ hr_dev = hns_roce_slave_init(bond_grp, i, false);
+ if (hr_dev) {
+ bond_grp->main_hr_dev = hr_dev;
+ break;
+ }
+ }
+ }
+
+ if (!bond_grp->main_hr_dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ hns_roce_bond_get_active_slave(bond_grp);
+
+ ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_SET_BOND);
+
+out:
+ if (ret) {
+ BOND_ERR_LOG("failed to set RoCE bond, ret = %d.\n", ret);
+ hns_roce_cleanup_bond(bond_grp);
+ } else {
+ ibdev_info(&bond_grp->main_hr_dev->ib_dev,
+ "RoCE set bond finished!\n");
+ }
+}
+
+static void hns_roce_clear_bond(struct hns_roce_bond_group *bond_grp)
+{
+ u8 main_func_idx = PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn);
+ struct hns_roce_dev *hr_dev;
+ u8 i;
+
+ if (bond_grp->bond_state == HNS_ROCE_BOND_NOT_BONDED)
+ goto out;
+
+ bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED;
+ bond_grp->main_hr_dev = NULL;
+
+ hns_roce_slave_uninit(bond_grp, main_func_idx);
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ hr_dev = hns_roce_slave_init(bond_grp, i, false);
+ if (hr_dev)
+ bond_grp->main_hr_dev = hr_dev;
+ }
+
+out:
+ hns_roce_cleanup_bond(bond_grp);
+}
+
+static void hns_roce_slave_changestate(struct hns_roce_bond_group *bond_grp)
+{
+ int ret;
+
+ hns_roce_bond_get_active_slave(bond_grp);
+
+ ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND);
+
+ mutex_lock(&bond_grp->bond_mutex);
+ if (bond_grp->bond_state == HNS_ROCE_BOND_SLAVE_CHANGESTATE)
+ bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED;
+ mutex_unlock(&bond_grp->bond_mutex);
+
+ if (ret)
+ ibdev_err(&bond_grp->main_hr_dev->ib_dev,
+ "failed to change RoCE bond slave state, ret = %d.\n",
+ ret);
+ else
+ ibdev_info(&bond_grp->main_hr_dev->ib_dev,
+ "RoCE slave changestate finished!\n");
+}
+
+static void hns_roce_slave_change_num(struct hns_roce_bond_group *bond_grp)
+{
+ int ret;
+ u8 i;
+
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ if (bond_grp->slave_map & (1U << i)) {
+ if (i == PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn))
+ continue;
+ hns_roce_slave_uninit(bond_grp, i);
+ } else {
+ hns_roce_slave_init(bond_grp, i, true);
+ if (!bond_grp->main_hr_dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ bond_grp->bond_func_info[i].net_dev = NULL;
+ bond_grp->bond_func_info[i].handle = NULL;
+ }
+ }
+
+ hns_roce_bond_get_active_slave(bond_grp);
+
+ ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND);
+
+out:
+ if (ret) {
+ BOND_ERR_LOG("failed to change RoCE bond slave num, ret = %d.\n", ret);
+ hns_roce_cleanup_bond(bond_grp);
+ } else {
+ mutex_lock(&bond_grp->bond_mutex);
+ if (bond_grp->bond_state == HNS_ROCE_BOND_SLAVE_CHANGE_NUM)
+ bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED;
+ mutex_unlock(&bond_grp->bond_mutex);
+ ibdev_info(&bond_grp->main_hr_dev->ib_dev,
+ "RoCE slave change num finished!\n");
+ }
+}
+
+static void hns_roce_bond_info_update_nolock(struct hns_roce_bond_group *bond_grp,
+ struct net_device *upper_dev)
+{
+ struct hns_roce_v2_priv *priv;
+ struct hns_roce_dev *hr_dev;
+ struct net_device *net_dev;
+ int func_idx;
+
+ bond_grp->slave_map = 0;
+ rcu_read_lock();
+ for_each_netdev_in_bond_rcu(upper_dev, net_dev) {
+ func_idx = get_netdev_bond_slave_id(net_dev, bond_grp);
+ if (func_idx < 0) {
+ hr_dev = hns_roce_get_hrdev_by_netdev(net_dev);
+ if (!hr_dev)
+ continue;
+ func_idx = PCI_FUNC(hr_dev->pci_dev->devfn);
+ if (!bond_grp->bond_func_info[func_idx].net_dev) {
+ priv = hr_dev->priv;
+ bond_grp->bond_func_info[func_idx].net_dev =
+ net_dev;
+ bond_grp->bond_func_info[func_idx].handle =
+ priv->handle;
+ }
+ ib_device_put(&hr_dev->ib_dev);
+ }
+
+ bond_grp->slave_map |= (1 << func_idx);
+ }
+ rcu_read_unlock();
+}
+
+static bool is_dev_bond_supported(struct hns_roce_bond_group *bond_grp,
+ struct net_device *net_dev)
+{
+ struct hns_roce_dev *hr_dev = hns_roce_get_hrdev_by_netdev(net_dev);
+ bool ret = true;
+
+ if (!hr_dev) {
+ if (bond_grp &&
+ get_netdev_bond_slave_id(net_dev, bond_grp) >= 0)
+ return true;
+ else
+ return false;
+ }
+
+ if (!(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND)) {
+ ret = false;
+ goto out;
+ }
+
+ if (hr_dev->is_vf || pci_num_vf(hr_dev->pci_dev) > 0) {
+ ret = false;
+ goto out;
+ }
+
+ if (bond_grp->bus_num != get_hr_bus_num(hr_dev))
+ ret = false;
+
+out:
+ ib_device_put(&hr_dev->ib_dev);
+ return ret;
+}
+
+static bool check_slave_support(struct hns_roce_bond_group *bond_grp,
+ struct net_device *upper_dev)
+{
+ struct net_device *net_dev;
+ u8 slave_num = 0;
+
+ rcu_read_lock();
+ for_each_netdev_in_bond_rcu(upper_dev, net_dev) {
+ if (is_dev_bond_supported(bond_grp, net_dev)) {
+ slave_num++;
+ continue;
+ }
+ rcu_read_unlock();
+ return false;
+ }
+ rcu_read_unlock();
+
+ return (slave_num > 1 && slave_num <= ROCE_BOND_FUNC_MAX);
+}
+
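+/*
+ * Deferred worker that drives the bond state machine: depending on
+ * bond_state it sets up a new bond (NOT_BONDED), pushes a slave state
+ * change (SLAVE_CHANGESTATE) or a slave add/remove (SLAVE_CHANGE_NUM),
+ * and tears the bond down whenever the slaves stop being supported.
+ */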
+static void hns_roce_bond_work(struct work_struct *work)
+{
+ struct delayed_work *delayed_work = to_delayed_work(work);
+ struct hns_roce_bond_group *bond_grp =
+ container_of(delayed_work, struct hns_roce_bond_group,
+ bond_work);
+ enum hns_roce_bond_state bond_state;
+ bool bond_ready;
+
+ mutex_lock(&bond_grp->bond_mutex);
+ bond_ready = check_slave_support(bond_grp, bond_grp->upper_dev);
+ hns_roce_bond_info_update_nolock(bond_grp, bond_grp->upper_dev);
+ bond_state = bond_grp->bond_state;
+ bond_grp->bond_ready = bond_ready;
+ mutex_unlock(&bond_grp->bond_mutex);
+
+ ibdev_info(&bond_grp->main_hr_dev->ib_dev,
+ "bond work: bond_ready - %d, bond_state - %d.\n",
+ bond_ready, bond_state);
+
+ if (!bond_ready) {
+ hns_roce_clear_bond(bond_grp);
+ return;
+ }
+
+ switch (bond_state) {
+ case HNS_ROCE_BOND_NOT_BONDED:
+ hns_roce_set_bond(bond_grp);
+ /* In set_bond flow, we don't need to set bond netdev here as
+ * it has been done when bond_grp->main_hr_dev is registered.
+ */
+ return;
+ case HNS_ROCE_BOND_SLAVE_CHANGESTATE:
+ hns_roce_slave_changestate(bond_grp);
+ break;
+ case HNS_ROCE_BOND_SLAVE_CHANGE_NUM:
+ hns_roce_slave_change_num(bond_grp);
+ break;
+ default:
+ return;
+ }
+ hns_roce_set_bond_netdev(bond_grp, bond_grp->main_hr_dev);
+}
+
+static void hns_roce_attach_bond_grp(struct hns_roce_bond_group *bond_grp,
+ struct hns_roce_dev *hr_dev,
+ struct net_device *upper_dev)
+{
+ bond_grp->upper_dev = upper_dev;
+ bond_grp->main_hr_dev = hr_dev;
+ bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED;
+ bond_grp->bond_ready = false;
+}
+
+static void hns_roce_detach_bond_grp(struct hns_roce_bond_group *bond_grp)
+{
+ mutex_lock(&bond_grp->bond_mutex);
+
+ cancel_delayed_work(&bond_grp->bond_work);
+ bond_grp->upper_dev = NULL;
+ bond_grp->main_hr_dev = NULL;
+ bond_grp->bond_ready = false;
+ bond_grp->bond_state = HNS_ROCE_BOND_NOT_ATTACHED;
+ bond_grp->slave_map = 0;
+ memset(bond_grp->bond_func_info, 0, sizeof(bond_grp->bond_func_info));
+
+ mutex_unlock(&bond_grp->bond_mutex);
+}
+
+void hns_roce_cleanup_bond(struct hns_roce_bond_group *bond_grp)
+{
+ int ret;
+
+ ret = bond_grp->main_hr_dev ?
+ hns_roce_cmd_bond(bond_grp, HNS_ROCE_CLEAR_BOND) : -EIO;
+ if (ret)
+ BOND_ERR_LOG("failed to clear RoCE bond, ret = %d.\n", ret);
+ else
+ ibdev_info(&bond_grp->main_hr_dev->ib_dev,
+ "RoCE clear bond finished!\n");
+
+ hns_roce_detach_bond_grp(bond_grp);
+}
+
+static bool lowerstate_event_filter(struct hns_roce_bond_group *bond_grp,
+ struct net_device *net_dev)
+{
+ struct hns_roce_bond_group *bond_grp_tmp;
+
+ bond_grp_tmp = hns_roce_get_bond_grp(net_dev, bond_grp->bus_num);
+ return bond_grp_tmp == bond_grp;
+}
+
+static void lowerstate_event_setting(struct hns_roce_bond_group *bond_grp,
+ struct netdev_notifier_changelowerstate_info *info)
+{
+ mutex_lock(&bond_grp->bond_mutex);
+
+ if (bond_grp->bond_ready &&
+ bond_grp->bond_state == HNS_ROCE_BOND_IS_BONDED)
+ bond_grp->bond_state = HNS_ROCE_BOND_SLAVE_CHANGESTATE;
+
+ mutex_unlock(&bond_grp->bond_mutex);
+}
+
+static bool hns_roce_bond_lowerstate_event(struct hns_roce_bond_group *bond_grp,
+ struct netdev_notifier_changelowerstate_info *info)
+{
+ struct net_device *net_dev =
+ netdev_notifier_info_to_dev((struct netdev_notifier_info *)info);
+
+ if (!netif_is_lag_port(net_dev))
+ return false;
+
+ if (!lowerstate_event_filter(bond_grp, net_dev))
+ return false;
+
+ lowerstate_event_setting(bond_grp, info);
+
+ return true;
+}
+
+static bool is_bond_setting_supported(struct netdev_lag_upper_info *bond_info)
+{
+ if (!bond_info)
+ return false;
+
+ if (bond_info->tx_type != NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
+ bond_info->tx_type != NETDEV_LAG_TX_TYPE_HASH)
+ return false;
+
+ if (bond_info->tx_type == NETDEV_LAG_TX_TYPE_HASH &&
+ bond_info->hash_type > NETDEV_LAG_HASH_L23)
+ return false;
+
+ return true;
+}
+
+static void upper_event_setting(struct hns_roce_bond_group *bond_grp,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct netdev_lag_upper_info *bond_upper_info = NULL;
+ bool slave_inc = info->linking;
+
+ if (slave_inc)
+ bond_upper_info = info->upper_info;
+
+ if (bond_upper_info) {
+ bond_grp->tx_type = bond_upper_info->tx_type;
+ bond_grp->hash_type = bond_upper_info->hash_type;
+ }
+}
+
+static bool check_unlinking_bond_support(struct hns_roce_bond_group *bond_grp)
+{
+ struct net_device *net_dev;
+ u8 slave_num = 0;
+
+ rcu_read_lock();
+ for_each_netdev_in_bond_rcu(bond_grp->upper_dev, net_dev) {
+ if (get_netdev_bond_slave_id(net_dev, bond_grp) >= 0)
+ slave_num++;
+ }
+ rcu_read_unlock();
+
+ return (slave_num > 1);
+}
+
+static bool check_linking_bond_support(struct netdev_lag_upper_info *bond_info,
+ struct hns_roce_bond_group *bond_grp,
+ struct net_device *upper_dev)
+{
+ if (!is_bond_setting_supported(bond_info))
+ return false;
+
+ return check_slave_support(bond_grp, upper_dev);
+}
+
+static enum bond_support_type
+ check_bond_support(struct hns_roce_bond_group *bond_grp,
+ struct net_device *upper_dev,
+ struct netdev_notifier_changeupper_info *info)
+{
+ bool bond_grp_exist = false;
+ bool support;
+
+ if (upper_dev == bond_grp->upper_dev)
+ bond_grp_exist = true;
+
+ if (!info->linking && !bond_grp_exist)
+ return BOND_NOT_SUPPORT;
+
+ if (info->linking)
+ support = check_linking_bond_support(info->upper_info, bond_grp,
+ upper_dev);
+ else
+ support = check_unlinking_bond_support(bond_grp);
+
+ if (support)
+ return BOND_SUPPORT;
+
+ return bond_grp_exist ? BOND_EXISTING_NOT_SUPPORT : BOND_NOT_SUPPORT;
+}
+
+static bool upper_event_filter(struct netdev_notifier_changeupper_info *info,
+ struct hns_roce_bond_group *bond_grp,
+ struct net_device *net_dev)
+{
+ struct net_device *upper_dev = info->upper_dev;
+ struct hns_roce_bond_group *bond_grp_tmp;
+ struct hns_roce_dev *hr_dev;
+ bool ret = true;
+ u8 bus_num;
+
+ if (!info->linking ||
+ bond_grp->bond_state != HNS_ROCE_BOND_NOT_ATTACHED)
+ return bond_grp->upper_dev == upper_dev;
+
+ hr_dev = hns_roce_get_hrdev_by_netdev(net_dev);
+ if (!hr_dev)
+ return false;
+
+ bus_num = get_hr_bus_num(hr_dev);
+ if (bond_grp->bus_num != bus_num) {
+ ret = false;
+ goto out;
+ }
+
+ bond_grp_tmp = hns_roce_get_bond_grp(net_dev, bus_num);
+ if (bond_grp_tmp && bond_grp_tmp != bond_grp)
+ ret = false;
+out:
+ ib_device_put(&hr_dev->ib_dev);
+ return ret;
+}
+
+static bool hns_roce_bond_upper_event(struct hns_roce_bond_group *bond_grp,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct net_device *net_dev =
+ netdev_notifier_info_to_dev((struct netdev_notifier_info *)info);
+ struct net_device *upper_dev = info->upper_dev;
+ enum bond_support_type support = BOND_SUPPORT;
+ struct hns_roce_dev *hr_dev;
+ int slave_id;
+
+ if (!upper_dev || !netif_is_lag_master(upper_dev))
+ return false;
+
+ if (!upper_event_filter(info, bond_grp, net_dev))
+ return false;
+
+ mutex_lock(&bond_grp->bond_mutex);
+ support = check_bond_support(bond_grp, upper_dev, info);
+ if (support == BOND_NOT_SUPPORT) {
+ mutex_unlock(&bond_grp->bond_mutex);
+ return false;
+ }
+
+ if (bond_grp->bond_state == HNS_ROCE_BOND_NOT_ATTACHED) {
+ hr_dev = hns_roce_get_hrdev_by_netdev(net_dev);
+ if (!hr_dev) {
+ mutex_unlock(&bond_grp->bond_mutex);
+ return false;
+ }
+ hns_roce_attach_bond_grp(bond_grp, hr_dev, upper_dev);
+ ib_device_put(&hr_dev->ib_dev);
+ }
+
+ /* If the netdev is being unregistered, the RoCE instance
+ * must not be initialized on it.
+ */
+ if (net_dev->reg_state >= NETREG_UNREGISTERING) {
+ slave_id = get_netdev_bond_slave_id(net_dev, bond_grp);
+ if (slave_id >= 0) {
+ bond_grp->bond_func_info[slave_id].net_dev = NULL;
+ bond_grp->bond_func_info[slave_id].handle = NULL;
+ }
+ }
+
+ if (support == BOND_SUPPORT) {
+ bond_grp->bond_ready = true;
+ if (bond_grp->bond_state != HNS_ROCE_BOND_NOT_BONDED)
+ bond_grp->bond_state = HNS_ROCE_BOND_SLAVE_CHANGE_NUM;
+ }
+ mutex_unlock(&bond_grp->bond_mutex);
+ if (support == BOND_SUPPORT)
+ upper_event_setting(bond_grp, info);
+
+ return true;
+}
+
+static int hns_roce_bond_event(struct notifier_block *self,
+ unsigned long event, void *ptr)
+{
+ struct hns_roce_bond_group *bond_grp =
+ container_of(self, struct hns_roce_bond_group, bond_nb);
+ bool changed = false;
+
+ if (event == NETDEV_CHANGEUPPER)
+ changed = hns_roce_bond_upper_event(bond_grp, ptr);
+ if (event == NETDEV_CHANGELOWERSTATE)
+ changed = hns_roce_bond_lowerstate_event(bond_grp, ptr);
+
+ if (changed)
+ schedule_delayed_work(&bond_grp->bond_work, HZ);
+
+ return NOTIFY_DONE;
+}
+
+int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev)
+{
+ struct hns_roce_bond_group *bgrps[ROCE_BOND_NUM_MAX];
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num = get_hr_bus_num(hr_dev);
+ int ret;
+ int i;
+
+ if (xa_load(&roce_bond_xa, bus_num))
+ return 0;
+
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ bond_grp = kvzalloc(sizeof(*bond_grp), GFP_KERNEL);
+ if (!bond_grp) {
+ ret = -ENOMEM;
+ goto mem_err;
+ }
+
+ mutex_init(&bond_grp->bond_mutex);
+ INIT_DELAYED_WORK(&bond_grp->bond_work, hns_roce_bond_work);
+
+ bond_grp->bond_ready = false;
+ bond_grp->bond_state = HNS_ROCE_BOND_NOT_ATTACHED;
+ bond_grp->bus_num = bus_num;
+
+ ret = alloc_bond_id(bond_grp);
+ if (ret) {
+ dev_err(hr_dev->dev,
+ "failed to alloc bond ID, ret = %d.\n", ret);
+ goto alloc_id_err;
+ }
+
+ bond_grp->bond_nb.notifier_call = hns_roce_bond_event;
+ ret = register_netdevice_notifier(&bond_grp->bond_nb);
+ if (ret) {
+ ibdev_err(&hr_dev->ib_dev,
+ "failed to register bond nb, ret = %d.\n", ret);
+ goto register_nb_err;
+ }
+ bgrps[i] = bond_grp;
+ }
+
+ return 0;
+
+register_nb_err:
+ remove_bond_id(bond_grp->bus_num, bond_grp->bond_id);
+alloc_id_err:
+ mutex_destroy(&bond_grp->bond_mutex);
+ kvfree(bond_grp);
+mem_err:
+ for (i--; i >= 0; i--) {
+ unregister_netdevice_notifier(&bgrps[i]->bond_nb);
+ cancel_delayed_work_sync(&bgrps[i]->bond_work);
+ remove_bond_id(bgrps[i]->bus_num, bgrps[i]->bond_id);
+ mutex_destroy(&bgrps[i]->bond_mutex);
+ kvfree(bgrps[i]);
+ }
+ return ret;
+}
+
+void hns_roce_dealloc_bond_grp(void)
+{
+ struct hns_roce_bond_group *bond_grp;
+ struct hns_roce_die_info *die_info;
+ unsigned long id;
+ int i;
+
+ xa_for_each(&roce_bond_xa, id, die_info) {
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ bond_grp = die_info->bgrps[i];
+ if (!bond_grp)
+ continue;
+ unregister_netdevice_notifier(&bond_grp->bond_nb);
+ cancel_delayed_work_sync(&bond_grp->bond_work);
+ remove_bond_id(bond_grp->bus_num, bond_grp->bond_id);
+ mutex_destroy(&bond_grp->bond_mutex);
+ kvfree(bond_grp);
+ }
+ }
+}
+
+int hns_roce_bond_init(struct hns_roce_dev *hr_dev)
+{
+ struct net_device *net_dev = get_hr_netdev(hr_dev, 0);
+ struct hns_roce_v2_priv *priv = hr_dev->priv;
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num = get_hr_bus_num(hr_dev);
+ int ret;
+
+ bond_grp = hns_roce_get_bond_grp(net_dev, bus_num);
+
+ if (priv->handle->rinfo.reset_state == HNS_ROCE_STATE_RST_INIT) {
+ ret = hns_roce_recover_bond(bond_grp, hr_dev);
+ if (ret) {
+ dev_err(hr_dev->dev,
+ "failed to recover RoCE bond, ret = %d.\n", ret);
+ return ret;
+ }
+ }
+
+ return hns_roce_set_bond_netdev(bond_grp, hr_dev);
+}
+
+void hns_roce_bond_suspend(struct hnae3_handle *handle)
+{
+ u8 bus_num = handle->pdev->bus->number;
+ struct hns_roce_bond_group *bond_grp;
+ struct hns_roce_die_info *die_info;
+ int i;
+
+ die_info = xa_load(&roce_bond_xa, bus_num);
+ if (!die_info)
+ return;
+
+ mutex_lock(&die_info->die_mutex);
+
+ /*
+	 * Avoid duplicate processing when this function is
+	 * called multiple times.
+ */
+ if (die_info->suspend_cnt)
+ goto out;
+
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ bond_grp = die_info->bgrps[i];
+ if (!bond_grp)
+ continue;
+ unregister_netdevice_notifier(&bond_grp->bond_nb);
+ cancel_delayed_work_sync(&bond_grp->bond_work);
+ }
+
+out:
+ die_info->suspend_cnt++;
+ mutex_unlock(&die_info->die_mutex);
+}
+
+void hns_roce_bond_resume(struct hnae3_handle *handle)
+{
+ u8 bus_num = handle->pdev->bus->number;
+ struct hns_roce_bond_group *bond_grp;
+ struct hns_roce_die_info *die_info;
+ int i, ret;
+
+ die_info = xa_load(&roce_bond_xa, bus_num);
+ if (!die_info)
+ return;
+
+ mutex_lock(&die_info->die_mutex);
+
+ die_info->suspend_cnt--;
+ if (die_info->suspend_cnt)
+ goto out;
+
+ for (i = 0; i < ROCE_BOND_NUM_MAX; i++) {
+ bond_grp = die_info->bgrps[i];
+ if (!bond_grp)
+ continue;
+ ret = register_netdevice_notifier(&bond_grp->bond_nb);
+ if (ret)
+ dev_err(&handle->pdev->dev,
+ "failed to resume bond notifier(bus_num = %u, id = %u), ret = %d.\n",
+ bus_num, bond_grp->bond_id, ret);
+ }
+
+out:
+ mutex_unlock(&die_info->die_mutex);
+}
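
suspend_cnt makes the suspend/resume pair nestable: only the first suspend on
a die unregisters the notifiers and cancels the work, and only the matching
final resume re-registers them. The intended pairing, as assumed from the v2
callers later in this patch:

	hns_roce_bond_suspend(handle);
	/* teardown that must not race with bond netdev events */
	hns_roce_bond_resume(handle);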
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.h b/drivers/infiniband/hw/hns/hns_roce_bond.h
new file mode 100644
index 000000000000..98c295d78ca1
--- /dev/null
+++ b/drivers/infiniband/hw/hns/hns_roce_bond.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (c) 2025 Hisilicon Limited.
+ */
+
+#ifndef _HNS_ROCE_BOND_H
+#define _HNS_ROCE_BOND_H
+
+#include <linux/netdevice.h>
+#include <net/bonding.h>
+
+#define ROCE_BOND_FUNC_MAX 4
+#define ROCE_BOND_NUM_MAX 2
+
+#define BOND_ID(id) BIT(id)
+
+#define BOND_ERR_LOG(fmt, ...) \
+ pr_err("HNS RoCE Bonding: " fmt, ##__VA_ARGS__)
+
+enum {
+ BOND_MODE_1,
+ BOND_MODE_2_4,
+};
+
+enum hns_roce_bond_hashtype {
+ BOND_HASH_L2,
+ BOND_HASH_L34,
+ BOND_HASH_L23,
+};
+
+enum bond_support_type {
+ BOND_NOT_SUPPORT,
+ /*
+	 * bond_grp already exists, but is no longer
+	 * supported under the current conditions
+ */
+ BOND_EXISTING_NOT_SUPPORT,
+ BOND_SUPPORT,
+};
+
+enum hns_roce_bond_state {
+ HNS_ROCE_BOND_NOT_ATTACHED,
+ HNS_ROCE_BOND_NOT_BONDED,
+ HNS_ROCE_BOND_IS_BONDED,
+ HNS_ROCE_BOND_SLAVE_CHANGE_NUM,
+ HNS_ROCE_BOND_SLAVE_CHANGESTATE,
+};
+
+enum hns_roce_bond_cmd_type {
+ HNS_ROCE_SET_BOND,
+ HNS_ROCE_CHANGE_BOND,
+ HNS_ROCE_CLEAR_BOND,
+};
+
+struct hns_roce_func_info {
+ struct net_device *net_dev;
+ struct hnae3_handle *handle;
+};
+
+struct hns_roce_bond_group {
+ struct net_device *upper_dev;
+ struct hns_roce_dev *main_hr_dev;
+ u8 active_slave_num;
+ u32 slave_map;
+ u32 active_slave_map;
+ u8 bond_id;
+ u8 bus_num;
+ struct hns_roce_func_info bond_func_info[ROCE_BOND_FUNC_MAX];
+ bool bond_ready;
+ enum hns_roce_bond_state bond_state;
+ enum netdev_lag_tx_type tx_type;
+ enum netdev_lag_hash hash_type;
+ struct mutex bond_mutex;
+ struct notifier_block bond_nb;
+ struct delayed_work bond_work;
+};
+
+struct hns_roce_die_info {
+ u8 bond_id_mask;
+ struct hns_roce_bond_group *bgrps[ROCE_BOND_NUM_MAX];
+ struct mutex die_mutex;
+ u8 suspend_cnt;
+};
+
+struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev,
+ u8 bus_num);
+int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev);
+void hns_roce_dealloc_bond_grp(void);
+void hns_roce_cleanup_bond(struct hns_roce_bond_group *bond_grp);
+bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev);
+int hns_roce_bond_init(struct hns_roce_dev *hr_dev);
+void hns_roce_bond_suspend(struct hnae3_handle *handle);
+void hns_roce_bond_resume(struct hnae3_handle *handle);
+
+#endif
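
alloc_bond_id() and remove_bond_id() are referenced above but defined outside
this section. A minimal sketch (an assumption, not the driver's actual code)
of how an ID could be carved out of die_info->bond_id_mask with the BOND_ID()
helper, with the caller expected to serialize on die_mutex:

	static int alloc_bond_id_sketch(struct hns_roce_die_info *die_info,
					struct hns_roce_bond_group *bond_grp)
	{
		u8 id;

		for (id = 0; id < ROCE_BOND_NUM_MAX; id++) {
			if (die_info->bond_id_mask & BOND_ID(id))
				continue;
			die_info->bond_id_mask |= BOND_ID(id);
			die_info->bgrps[id] = bond_grp;
			bond_grp->bond_id = id;
			return 0;
		}
		return -ENOSPC;	/* all IDs on this die are in use */
	}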
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 06832c0ac055..318f18cf37aa 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -33,6 +33,7 @@
#ifndef _HNS_ROCE_DEVICE_H
#define _HNS_ROCE_DEVICE_H
+#include <linux/pci.h>
#include <rdma/ib_verbs.h>
#include <rdma/hns-abi.h>
#include "hns_roce_debugfs.h"
@@ -153,6 +154,7 @@ enum {
HNS_ROCE_CAP_FLAG_SDI_MODE = BIT(14),
HNS_ROCE_CAP_FLAG_STASH = BIT(17),
HNS_ROCE_CAP_FLAG_CQE_INLINE = BIT(19),
+ HNS_ROCE_CAP_FLAG_BOND = BIT(21),
HNS_ROCE_CAP_FLAG_SRQ_RECORD_DB = BIT(22),
};
@@ -177,6 +179,7 @@ enum hns_roce_instance_state {
HNS_ROCE_STATE_INIT,
HNS_ROCE_STATE_INITED,
HNS_ROCE_STATE_UNINIT,
+ HNS_ROCE_STATE_BOND_UNINIT,
};
enum {
@@ -1167,6 +1170,17 @@ static inline u8 get_tclass(const struct ib_global_route *grh)
grh->traffic_class >> DSCP_SHIFT : grh->traffic_class;
}
+static inline struct net_device *get_hr_netdev(struct hns_roce_dev *hr_dev,
+ u8 port)
+{
+ return hr_dev->iboe.netdevs[port];
+}
+
+static inline u8 get_hr_bus_num(struct hns_roce_dev *hr_dev)
+{
+ return hr_dev->pci_dev->bus->number;
+}
+
void hns_roce_init_uar_table(struct hns_roce_dev *dev);
int hns_roce_uar_alloc(struct hns_roce_dev *dev, struct hns_roce_uar *uar);
@@ -1293,7 +1307,7 @@ void hns_roce_flush_cqe(struct hns_roce_dev *hr_dev, u32 qpn);
void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type);
void hns_roce_handle_device_err(struct hns_roce_dev *hr_dev);
int hns_roce_init(struct hns_roce_dev *hr_dev);
-void hns_roce_exit(struct hns_roce_dev *hr_dev);
+void hns_roce_exit(struct hns_roce_dev *hr_dev, bool bond_cleanup);
int hns_roce_fill_res_cq_entry(struct sk_buff *msg, struct ib_cq *ib_cq);
int hns_roce_fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ib_cq);
int hns_roce_fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ib_qp);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 63052c0e7613..2d6ae89e525b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -43,11 +43,13 @@
#include <rdma/ib_umem.h>
#include <rdma/uverbs_ioctl.h>
+#include "hclge_main.h"
#include "hns_roce_common.h"
#include "hns_roce_device.h"
#include "hns_roce_cmd.h"
#include "hns_roce_hem.h"
#include "hns_roce_hw_v2.h"
+#include "hns_roce_bond.h"
#define CREATE_TRACE_POINTS
#include "hns_roce_trace.h"
@@ -1434,6 +1436,79 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
return ret;
}
+static enum hns_roce_opcode_type
+ get_bond_opcode(enum hns_roce_bond_cmd_type bond_type)
+{
+ switch (bond_type) {
+ case HNS_ROCE_SET_BOND:
+ return HNS_ROCE_OPC_SET_BOND_INFO;
+ case HNS_ROCE_CHANGE_BOND:
+ return HNS_ROCE_OPC_CHANGE_ACTIVE_PORT;
+ case HNS_ROCE_CLEAR_BOND:
+ return HNS_ROCE_OPC_CLEAR_BOND_INFO;
+ default:
+ WARN(true, "Invalid bond type %d!\n", bond_type);
+ return HNS_ROCE_OPC_SET_BOND_INFO;
+ }
+}
+
+static enum hns_roce_bond_hashtype
+ get_bond_hashtype(enum netdev_lag_hash netdev_hashtype)
+{
+ switch (netdev_hashtype) {
+ case NETDEV_LAG_HASH_L2:
+ return BOND_HASH_L2;
+ case NETDEV_LAG_HASH_L34:
+ return BOND_HASH_L34;
+ case NETDEV_LAG_HASH_L23:
+ return BOND_HASH_L23;
+ default:
+ WARN(true, "Invalid hash type %d!\n", netdev_hashtype);
+ return BOND_HASH_L2;
+ }
+}
+
+int hns_roce_cmd_bond(struct hns_roce_bond_group *bond_grp,
+ enum hns_roce_bond_cmd_type bond_type)
+{
+ enum hns_roce_opcode_type opcode = get_bond_opcode(bond_type);
+ struct hns_roce_bond_info *slave_info;
+ struct hns_roce_cmq_desc desc = {};
+ int ret;
+
+ slave_info = (struct hns_roce_bond_info *)desc.data;
+ hns_roce_cmq_setup_basic_desc(&desc, opcode, false);
+
+ slave_info->bond_id = cpu_to_le32(bond_grp->bond_id);
+ if (bond_type == HNS_ROCE_CLEAR_BOND)
+ goto out;
+
+ if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
+ slave_info->bond_mode = cpu_to_le32(BOND_MODE_1);
+ if (bond_grp->active_slave_num != 1)
+ ibdev_warn(&bond_grp->main_hr_dev->ib_dev,
+ "active slave cnt(%u) in Mode 1 is invalid.\n",
+ bond_grp->active_slave_num);
+ } else {
+ slave_info->bond_mode = cpu_to_le32(BOND_MODE_2_4);
+ slave_info->hash_policy =
+ cpu_to_le32(get_bond_hashtype(bond_grp->hash_type));
+ }
+
+ slave_info->active_slave_cnt = cpu_to_le32(bond_grp->active_slave_num);
+ slave_info->active_slave_mask = cpu_to_le32(bond_grp->active_slave_map);
+ slave_info->slave_mask = cpu_to_le32(bond_grp->slave_map);
+
+out:
+ ret = hns_roce_cmq_send(bond_grp->main_hr_dev, &desc, 1);
+ if (ret)
+ ibdev_err(&bond_grp->main_hr_dev->ib_dev,
+ "cmq bond type(%d) failed, ret = %d.\n",
+ bond_type, ret);
+
+ return ret;
+}
+
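A typical caller (a hedged example, not taken from this hunk; new_map is a
hypothetical name) refreshes the group's maps and then pushes them with the
matching command type:

	bond_grp->active_slave_map = new_map;
	bond_grp->active_slave_num = hweight32(new_map);
	ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND);
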
static int config_hem_ba_to_hw(struct hns_roce_dev *hr_dev,
dma_addr_t base_addr, u8 cmd, unsigned long tag)
{
@@ -2275,6 +2350,9 @@ static int hns_roce_query_caps(struct hns_roce_dev *hr_dev)
caps->flags |= le16_to_cpu(resp_d->cap_flags_ex) <<
HNS_ROCE_CAP_FLAGS_EX_SHIFT;
+ if (hr_dev->is_vf)
+ caps->flags &= ~HNS_ROCE_CAP_FLAG_BOND;
+
caps->num_cqs = 1 << hr_reg_read(resp_c, PF_CAPS_C_NUM_CQS);
caps->gid_table_len[0] = hr_reg_read(resp_c, PF_CAPS_C_MAX_GID);
caps->max_cqes = 1 << hr_reg_read(resp_c, PF_CAPS_C_CQ_DEPTH);
@@ -7067,7 +7145,7 @@ error_failed_kzalloc:
}
static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
- bool reset)
+ bool reset, bool bond_cleanup)
{
struct hns_roce_dev *hr_dev = handle->priv;
@@ -7079,7 +7157,7 @@ static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
hr_dev->state = HNS_ROCE_DEVICE_STATE_UNINIT;
hns_roce_handle_device_err(hr_dev);
- hns_roce_exit(hr_dev);
+ hns_roce_exit(hr_dev, bond_cleanup);
kfree(hr_dev->priv);
ib_dealloc_device(&hr_dev->ib_dev);
}
@@ -7130,12 +7208,51 @@ reset_chk_err:
static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
bool reset)
{
+ /* Suspend bond to avoid concurrency */
+ hns_roce_bond_suspend(handle);
+
if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED)
- return;
+ goto out;
handle->rinfo.instance_state = HNS_ROCE_STATE_UNINIT;
- __hns_roce_hw_v2_uninit_instance(handle, reset);
+ __hns_roce_hw_v2_uninit_instance(handle, reset, true);
+
+ handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
+
+out:
+ hns_roce_bond_resume(handle);
+}
+
+struct hns_roce_dev
+ *hns_roce_bond_init_client(struct hns_roce_bond_group *bond_grp,
+ int func_idx)
+{
+ struct hnae3_handle *handle;
+ int ret;
+
+ handle = bond_grp->bond_func_info[func_idx].handle;
+ if (!handle || !handle->client)
+ return NULL;
+
+ ret = hns_roce_hw_v2_init_instance(handle);
+ if (ret)
+ return NULL;
+
+ return handle->priv;
+}
+
+void hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp,
+ int func_idx)
+{
+ struct hnae3_handle *handle = bond_grp->bond_func_info[func_idx].handle;
+
+ if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED)
+ return;
+
+ handle->rinfo.instance_state = HNS_ROCE_STATE_BOND_UNINIT;
+
+ __hns_roce_hw_v2_uninit_instance(handle, false, false);
handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
}
@@ -7144,6 +7261,9 @@ static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle)
{
struct hns_roce_dev *hr_dev;
+ /* Suspend bond to avoid concurrency */
+ hns_roce_bond_suspend(handle);
+
if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) {
set_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state);
return 0;
@@ -7174,6 +7294,7 @@ static int hns_roce_hw_v2_reset_notify_init(struct hnae3_handle *handle)
if (test_and_clear_bit(HNS_ROCE_RST_DIRECT_RETURN,
&handle->rinfo.state)) {
handle->rinfo.reset_state = HNS_ROCE_STATE_RST_INITED;
+ hns_roce_bond_resume(handle);
return 0;
}
@@ -7193,6 +7314,7 @@ static int hns_roce_hw_v2_reset_notify_init(struct hnae3_handle *handle)
dev_info(dev, "reset done, RoCE client reinit finished.\n");
}
+ hns_roce_bond_resume(handle);
return ret;
}
@@ -7204,7 +7326,7 @@ static int hns_roce_hw_v2_reset_notify_uninit(struct hnae3_handle *handle)
handle->rinfo.reset_state = HNS_ROCE_STATE_RST_UNINIT;
dev_info(&handle->pdev->dev, "In reset process RoCE client uninit.\n");
msleep(HNS_ROCE_V2_HW_RST_UNINT_DELAY);
- __hns_roce_hw_v2_uninit_instance(handle, false);
+ __hns_roce_hw_v2_uninit_instance(handle, false, false);
return 0;
}
@@ -7240,6 +7362,14 @@ static void hns_roce_hw_v2_link_status_change(struct hnae3_handle *handle,
if (linkup || !hr_dev)
return;
+	/* For a bond device, the link status depends on the upper netdev,
+	 * and the upper device's link status reflects all of its slave
+	 * netdevs rather than a single one. So a bond device cannot derive
+	 * a correct link status from this path.
+	 */
+ if (hns_roce_get_bond_grp(netdev, get_hr_bus_num(hr_dev)))
+ return;
+
ib_dispatch_port_state_event(&hr_dev->ib_dev, netdev);
}
@@ -7264,6 +7394,7 @@ static int __init hns_roce_hw_v2_init(void)
static void __exit hns_roce_hw_v2_exit(void)
{
+ hns_roce_dealloc_bond_grp();
hnae3_unregister_client(&hns_roce_hw_v2_client);
hns_roce_cleanup_debugfs();
}
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index e64a04d6f85b..285fe0875fac 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -35,6 +35,7 @@
#include <linux/bitops.h>
#include "hnae3.h"
+#include "hns_roce_bond.h"
#define HNS_ROCE_V2_MAX_RC_INL_INN_SZ 32
#define HNS_ROCE_V2_MTT_ENTRY_SZ 64
@@ -228,6 +229,9 @@ enum hns_roce_opcode_type {
HNS_ROCE_OPC_CFG_GMV_BT = 0x8510,
HNS_ROCE_QUERY_RAM_ECC = 0x8513,
HNS_SWITCH_PARAMETER_CFG = 0x1033,
+ HNS_ROCE_OPC_SET_BOND_INFO = 0x8601,
+ HNS_ROCE_OPC_CLEAR_BOND_INFO = 0x8602,
+ HNS_ROCE_OPC_CHANGE_ACTIVE_PORT = 0x8603,
};
#define HNS_ROCE_OPC_POST_MB_TIMEOUT 35000
@@ -1465,7 +1469,23 @@ struct hns_roce_sccc_clr_done {
__le32 rsv[5];
};
+struct hns_roce_bond_info {
+ __le32 bond_id;
+ __le32 bond_mode;
+ __le32 active_slave_cnt;
+ __le32 active_slave_mask;
+ __le32 slave_mask;
+ __le32 hash_policy;
+};
+
+struct hns_roce_dev
+ *hns_roce_bond_init_client(struct hns_roce_bond_group *bond_grp,
+ int func_idx);
+void hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp,
+ int func_idx);
int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
+int hns_roce_cmd_bond(struct hns_roce_bond_group *bond_grp,
+ enum hns_roce_bond_cmd_type bond_type);
static inline void hns_roce_write64(struct hns_roce_dev *hr_dev, __le32 val[2],
void __iomem *dest)
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index f3607fe107a7..2f4864ab7d4e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -32,7 +32,6 @@
*/
#include <linux/acpi.h>
#include <linux/module.h>
-#include <linux/pci.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_smi.h>
#include <rdma/ib_user_verbs.h>
@@ -41,6 +40,7 @@
#include "hns_roce_device.h"
#include "hns_roce_hem.h"
#include "hns_roce_hw_v2.h"
+#include "hns_roce_bond.h"
static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u32 port,
const u8 *addr)
@@ -89,30 +89,75 @@ static int hns_roce_del_gid(const struct ib_gid_attr *attr, void **context)
return ret;
}
-static int handle_en_event(struct hns_roce_dev *hr_dev, u32 port,
- unsigned long event)
+static int hns_roce_get_port_state(struct hns_roce_dev *hr_dev, u32 port_num,
+ enum ib_port_state *state)
{
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num = get_hr_bus_num(hr_dev);
+ struct net_device *net_dev;
+
+ net_dev = ib_device_get_netdev(&hr_dev->ib_dev, port_num);
+ if (!net_dev)
+ return -ENODEV;
+
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) {
+ bond_grp = hns_roce_get_bond_grp(net_dev, bus_num);
+ if (bond_grp) {
+ *state = ib_get_curr_port_state(bond_grp->upper_dev);
+ goto out;
+ }
+ }
+
+ *state = ib_get_curr_port_state(net_dev);
+out:
+ dev_put(net_dev);
+ return 0;
+}
+
+static int handle_en_event(struct net_device *netdev,
+ struct hns_roce_dev *hr_dev,
+ u32 port, unsigned long event)
+{
+ struct ib_device *ibdev = &hr_dev->ib_dev;
struct device *dev = hr_dev->dev;
- struct net_device *netdev;
+ enum ib_port_state curr_state;
+ struct ib_event ibevent;
int ret = 0;
- netdev = hr_dev->iboe.netdevs[port];
if (!netdev) {
dev_err(dev, "can't find netdev on port(%u)!\n", port);
return -ENODEV;
}
switch (event) {
- case NETDEV_UP:
- case NETDEV_CHANGE:
case NETDEV_REGISTER:
case NETDEV_CHANGEADDR:
ret = hns_roce_set_mac(hr_dev, port, netdev->dev_addr);
break;
+ case NETDEV_UP:
+ case NETDEV_CHANGE:
+ ret = hns_roce_set_mac(hr_dev, port, netdev->dev_addr);
+ if (ret)
+ return ret;
+ fallthrough;
case NETDEV_DOWN:
- /*
- * In v1 engine, only support all ports closed together.
- */
+ if (!netif_is_lag_master(netdev))
+ break;
+ curr_state = ib_get_curr_port_state(netdev);
+
+ write_lock_irq(&ibdev->cache_lock);
+ if (ibdev->port_data[port].cache.last_port_state == curr_state) {
+ write_unlock_irq(&ibdev->cache_lock);
+ return 0;
+ }
+ ibdev->port_data[port].cache.last_port_state = curr_state;
+ write_unlock_irq(&ibdev->cache_lock);
+
+ ibevent.event = (curr_state == IB_PORT_DOWN) ?
+ IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE;
+ ibevent.device = ibdev;
+ ibevent.element.port_num = port + 1;
+ ib_dispatch_event(&ibevent);
break;
default:
dev_dbg(dev, "NETDEV event = 0x%x!\n", (u32)(event));
@@ -126,17 +171,25 @@ static int hns_roce_netdev_event(struct notifier_block *self,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct hns_roce_bond_group *bond_grp;
struct hns_roce_ib_iboe *iboe = NULL;
struct hns_roce_dev *hr_dev = NULL;
+ struct net_device *upper = NULL;
int ret;
u32 port;
hr_dev = container_of(self, struct hns_roce_dev, iboe.nb);
iboe = &hr_dev->iboe;
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) {
+ bond_grp = hns_roce_get_bond_grp(get_hr_netdev(hr_dev, 0),
+ get_hr_bus_num(hr_dev));
+ upper = bond_grp ? bond_grp->upper_dev : NULL;
+ }
for (port = 0; port < hr_dev->caps.num_ports; port++) {
- if (dev == iboe->netdevs[port]) {
- ret = handle_en_event(hr_dev, port, event);
+ if ((!upper && dev == iboe->netdevs[port]) ||
+ (upper && dev == upper)) {
+ ret = handle_en_event(dev, hr_dev, port, event);
if (ret)
return NOTIFY_DONE;
break;
@@ -148,12 +201,13 @@ static int hns_roce_netdev_event(struct notifier_block *self,
static int hns_roce_setup_mtu_mac(struct hns_roce_dev *hr_dev)
{
+ struct net_device *net_dev;
int ret;
u8 i;
for (i = 0; i < hr_dev->caps.num_ports; i++) {
- ret = hns_roce_set_mac(hr_dev, i,
- hr_dev->iboe.netdevs[i]->dev_addr);
+ net_dev = get_hr_netdev(hr_dev, i);
+ ret = hns_roce_set_mac(hr_dev, i, net_dev->dev_addr);
if (ret)
return ret;
}
@@ -221,9 +275,7 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num,
struct ib_port_attr *props)
{
struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
- struct device *dev = hr_dev->dev;
struct net_device *net_dev;
- unsigned long flags;
enum ib_mtu mtu;
u32 port;
int ret;
@@ -244,26 +296,26 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num,
if (ret)
ibdev_warn(ib_dev, "failed to get speed, ret = %d.\n", ret);
- spin_lock_irqsave(&hr_dev->iboe.lock, flags);
-
- net_dev = hr_dev->iboe.netdevs[port];
+ net_dev = ib_device_get_netdev(ib_dev, port_num);
if (!net_dev) {
- spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
- dev_err(dev, "find netdev %u failed!\n", port);
+ ibdev_err(ib_dev, "find netdev %u failed!\n", port);
return -EINVAL;
}
mtu = iboe_get_mtu(net_dev->mtu);
props->active_mtu = mtu ? min(props->max_mtu, mtu) : IB_MTU_256;
- props->state = netif_running(net_dev) && netif_carrier_ok(net_dev) ?
- IB_PORT_ACTIVE :
- IB_PORT_DOWN;
+
+ dev_put(net_dev);
+
+ ret = hns_roce_get_port_state(hr_dev, port_num, &props->state);
+ if (ret) {
+ ibdev_err(ib_dev, "failed to get port state.\n");
+ return ret;
+ }
+
props->phys_state = props->state == IB_PORT_ACTIVE ?
IB_PORT_PHYS_STATE_LINK_UP :
IB_PORT_PHYS_STATE_DISABLED;
-
- spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
-
return 0;
}
@@ -617,9 +669,40 @@ static int hns_roce_get_hw_stats(struct ib_device *device,
return num_counters;
}
-static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev)
+static void
+ hns_roce_unregister_bond_cleanup(struct hns_roce_dev *hr_dev,
+ struct hns_roce_bond_group *bond_grp)
+{
+ struct net_device *net_dev;
+ int i;
+
+	/* To avoid losing the other slave devices when main_hr_dev
+	 * is unregistered, re-initialize the remaining slaves before
+	 * cleaning up the bond resources.
+ */
+ bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED;
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) {
+ net_dev = bond_grp->bond_func_info[i].net_dev;
+ if (net_dev && net_dev != get_hr_netdev(hr_dev, 0))
+ hns_roce_bond_init_client(bond_grp, i);
+ }
+
+ hns_roce_cleanup_bond(bond_grp);
+}
+
+static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev,
+ bool bond_cleanup)
{
+ struct net_device *net_dev = get_hr_netdev(hr_dev, 0);
struct hns_roce_ib_iboe *iboe = &hr_dev->iboe;
+ struct hns_roce_bond_group *bond_grp;
+ u8 bus_num = get_hr_bus_num(hr_dev);
+
+ if (bond_cleanup && hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) {
+ bond_grp = hns_roce_get_bond_grp(net_dev, bus_num);
+ if (bond_grp)
+ hns_roce_unregister_bond_cleanup(hr_dev, bond_grp);
+ }
hr_dev->active = false;
unregister_netdevice_notifier(&iboe->nb);
@@ -708,11 +791,12 @@ static const struct ib_device_ops hns_roce_dev_restrack_ops = {
static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
{
- int ret;
struct hns_roce_ib_iboe *iboe = NULL;
- struct ib_device *ib_dev = NULL;
struct device *dev = hr_dev->dev;
+ struct ib_device *ib_dev = NULL;
+ struct net_device *net_dev;
unsigned int i;
+ int ret;
iboe = &hr_dev->iboe;
spin_lock_init(&iboe->lock);
@@ -747,17 +831,38 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_ops);
ib_set_device_ops(ib_dev, &hns_roce_dev_ops);
ib_set_device_ops(ib_dev, &hns_roce_dev_restrack_ops);
- for (i = 0; i < hr_dev->caps.num_ports; i++) {
- if (!hr_dev->iboe.netdevs[i])
- continue;
- ret = ib_device_set_netdev(ib_dev, hr_dev->iboe.netdevs[i],
- i + 1);
- if (ret)
+ dma_set_max_seg_size(dev, SZ_2G);
+
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) {
+ ret = hns_roce_alloc_bond_grp(hr_dev);
+ if (ret) {
+ dev_err(dev, "failed to alloc bond_grp for bus %u, ret = %d\n",
+ get_hr_bus_num(hr_dev), ret);
return ret;
+ }
+ }
+
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND &&
+ hns_roce_bond_is_active(hr_dev)) {
+ ret = hns_roce_bond_init(hr_dev);
+ if (ret) {
+ dev_err(dev, "failed to init bond!\n");
+ return ret;
+ }
+ ret = ib_register_device(ib_dev, "hns_bond_%d", dev);
+ } else {
+ for (i = 0; i < hr_dev->caps.num_ports; i++) {
+ net_dev = get_hr_netdev(hr_dev, i);
+ if (!net_dev)
+ continue;
+
+ ret = ib_device_set_netdev(ib_dev, net_dev, i + 1);
+ if (ret)
+ return ret;
+ }
+ ret = ib_register_device(ib_dev, "hns_%d", dev);
}
- dma_set_max_seg_size(dev, SZ_2G);
- ret = ib_register_device(ib_dev, "hns_%d", dev);
if (ret) {
dev_err(dev, "ib_register_device failed!\n");
return ret;
@@ -1157,10 +1262,10 @@ error_failed_alloc_dfx_cnt:
return ret;
}
-void hns_roce_exit(struct hns_roce_dev *hr_dev)
+void hns_roce_exit(struct hns_roce_dev *hr_dev, bool bond_cleanup)
{
hns_roce_unregister_debugfs(hr_dev);
- hns_roce_unregister_device(hr_dev);
+ hns_roce_unregister_device(hr_dev, bond_cleanup);
if (hr_dev->hw->hw_exit)
hr_dev->hw->hw_exit(hr_dev);
diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c
index d35cf59d0f43..225c3e328e0e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_pd.c
+++ b/drivers/infiniband/hw/hns/hns_roce_pd.c
@@ -30,7 +30,6 @@
* SOFTWARE.
*/
-#include <linux/pci.h>
#include "hns_roce_device.h"
void hns_roce_init_pd_table(struct hns_roce_dev *hr_dev)
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index bdd879ac12dd..d1640c5fbaab 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -31,7 +31,6 @@
* SOFTWARE.
*/
-#include <linux/pci.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_umem.h>
#include <rdma/uverbs_ioctl.h>
@@ -1348,11 +1347,13 @@ static int check_mtu_validate(struct hns_roce_dev *hr_dev,
struct hns_roce_qp *hr_qp,
struct ib_qp_attr *attr, int attr_mask)
{
+ struct net_device *net_dev;
enum ib_mtu active_mtu;
int p;
p = attr_mask & IB_QP_PORT ? (attr->port_num - 1) : hr_qp->port;
- active_mtu = iboe_get_mtu(hr_dev->iboe.netdevs[p]->mtu);
+ net_dev = get_hr_netdev(hr_dev, p);
+ active_mtu = iboe_get_mtu(net_dev->mtu);
if ((hr_dev->caps.max_mtu >= IB_MTU_2048 &&
attr->path_mtu > hr_dev->caps.max_mtu) ||
diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c
index 1090051f493b..8a6efb6b9c9e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_srq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_srq.c
@@ -3,7 +3,6 @@
* Copyright (c) 2018 Hisilicon Limited.
*/
-#include <linux/pci.h>
#include <rdma/ib_umem.h>
#include <rdma/uverbs_ioctl.h>
#include "hns_roce_device.h"
diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c
index c6a0a661d6e7..f4f4f92ba63a 100644
--- a/drivers/infiniband/hw/irdma/cm.c
+++ b/drivers/infiniband/hw/irdma/cm.c
@@ -3710,7 +3710,7 @@ int irdma_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
iwpd = iwqp->iwpd;
tagged_offset = (uintptr_t)iwqp->ietf_mem.va;
ibmr = irdma_reg_phys_mr(&iwpd->ibpd, iwqp->ietf_mem.pa, buf_len,
- IB_ACCESS_LOCAL_WRITE, &tagged_offset);
+ IB_ACCESS_LOCAL_WRITE, &tagged_offset, false);
if (IS_ERR(ibmr)) {
ret = -ENOMEM;
goto error;
diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c
index 4ef1c29032f7..ce5cf89c463c 100644
--- a/drivers/infiniband/hw/irdma/ctrl.c
+++ b/drivers/infiniband/hw/irdma/ctrl.c
@@ -2943,8 +2943,6 @@ static int irdma_sc_cq_create(struct irdma_sc_cq *cq, u64 scratch,
__le64 *wqe;
struct irdma_sc_cqp *cqp;
u64 hdr;
- struct irdma_sc_ceq *ceq;
- int ret_code = 0;
cqp = cq->dev->cqp;
if (cq->cq_uk.cq_id >= cqp->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].max_cnt)
@@ -2953,19 +2951,9 @@ static int irdma_sc_cq_create(struct irdma_sc_cq *cq, u64 scratch,
if (cq->ceq_id >= cq->dev->hmc_fpm_misc.max_ceqs)
return -EINVAL;
- ceq = cq->dev->ceq[cq->ceq_id];
- if (ceq && ceq->reg_cq)
- ret_code = irdma_sc_add_cq_ctx(ceq, cq);
-
- if (ret_code)
- return ret_code;
-
wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch);
- if (!wqe) {
- if (ceq && ceq->reg_cq)
- irdma_sc_remove_cq_ctx(ceq, cq);
+ if (!wqe)
return -ENOMEM;
- }
set_64bit_val(wqe, 0, cq->cq_uk.cq_size);
set_64bit_val(wqe, 8, (uintptr_t)cq >> 1);
@@ -3018,17 +3006,12 @@ int irdma_sc_cq_destroy(struct irdma_sc_cq *cq, u64 scratch, bool post_sq)
struct irdma_sc_cqp *cqp;
__le64 *wqe;
u64 hdr;
- struct irdma_sc_ceq *ceq;
cqp = cq->dev->cqp;
wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch);
if (!wqe)
return -ENOMEM;
- ceq = cq->dev->ceq[cq->ceq_id];
- if (ceq && ceq->reg_cq)
- irdma_sc_remove_cq_ctx(ceq, cq);
-
set_64bit_val(wqe, 0, cq->cq_uk.cq_size);
set_64bit_val(wqe, 8, (uintptr_t)cq >> 1);
set_64bit_val(wqe, 40, cq->shadow_area_pa);
@@ -3602,71 +3585,6 @@ static int irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf,
}
/**
- * irdma_sc_find_reg_cq - find cq ctx index
- * @ceq: ceq sc structure
- * @cq: cq sc structure
- */
-static u32 irdma_sc_find_reg_cq(struct irdma_sc_ceq *ceq,
- struct irdma_sc_cq *cq)
-{
- u32 i;
-
- for (i = 0; i < ceq->reg_cq_size; i++) {
- if (cq == ceq->reg_cq[i])
- return i;
- }
-
- return IRDMA_INVALID_CQ_IDX;
-}
-
-/**
- * irdma_sc_add_cq_ctx - add cq ctx tracking for ceq
- * @ceq: ceq sc structure
- * @cq: cq sc structure
- */
-int irdma_sc_add_cq_ctx(struct irdma_sc_ceq *ceq, struct irdma_sc_cq *cq)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&ceq->req_cq_lock, flags);
-
- if (ceq->reg_cq_size == ceq->elem_cnt) {
- spin_unlock_irqrestore(&ceq->req_cq_lock, flags);
- return -ENOMEM;
- }
-
- ceq->reg_cq[ceq->reg_cq_size++] = cq;
-
- spin_unlock_irqrestore(&ceq->req_cq_lock, flags);
-
- return 0;
-}
-
-/**
- * irdma_sc_remove_cq_ctx - remove cq ctx tracking for ceq
- * @ceq: ceq sc structure
- * @cq: cq sc structure
- */
-void irdma_sc_remove_cq_ctx(struct irdma_sc_ceq *ceq, struct irdma_sc_cq *cq)
-{
- unsigned long flags;
- u32 cq_ctx_idx;
-
- spin_lock_irqsave(&ceq->req_cq_lock, flags);
- cq_ctx_idx = irdma_sc_find_reg_cq(ceq, cq);
- if (cq_ctx_idx == IRDMA_INVALID_CQ_IDX)
- goto exit;
-
- ceq->reg_cq_size--;
- if (cq_ctx_idx != ceq->reg_cq_size)
- ceq->reg_cq[cq_ctx_idx] = ceq->reg_cq[ceq->reg_cq_size];
- ceq->reg_cq[ceq->reg_cq_size] = NULL;
-
-exit:
- spin_unlock_irqrestore(&ceq->req_cq_lock, flags);
-}
-
-/**
* irdma_sc_cqp_init - Initialize buffers for a control Queue Pair
* @cqp: IWARP control queue pair pointer
* @info: IWARP control queue pair init info pointer
@@ -3950,11 +3868,13 @@ int irdma_sc_cqp_destroy(struct irdma_sc_cqp *cqp)
*/
void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq)
{
+ unsigned long flags;
u64 temp_val;
u16 sw_cq_sel;
u8 arm_next_se;
u8 arm_seq_num;
+ spin_lock_irqsave(&ccq->dev->cqp_lock, flags);
get_64bit_val(ccq->cq_uk.shadow_area, 32, &temp_val);
sw_cq_sel = (u16)FIELD_GET(IRDMA_CQ_DBSA_SW_CQ_SELECT, temp_val);
arm_next_se = (u8)FIELD_GET(IRDMA_CQ_DBSA_ARM_NEXT_SE, temp_val);
@@ -3965,6 +3885,7 @@ void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq)
FIELD_PREP(IRDMA_CQ_DBSA_ARM_NEXT_SE, arm_next_se) |
FIELD_PREP(IRDMA_CQ_DBSA_ARM_NEXT, 1);
set_64bit_val(ccq->cq_uk.shadow_area, 32, temp_val);
+ spin_unlock_irqrestore(&ccq->dev->cqp_lock, flags);
dma_wmb(); /* make sure shadow area is updated before arming */
@@ -4387,9 +4308,6 @@ int irdma_sc_ceq_init(struct irdma_sc_ceq *ceq,
ceq->ceq_elem_pa = info->ceqe_pa;
ceq->virtual_map = info->virtual_map;
ceq->itr_no_expire = info->itr_no_expire;
- ceq->reg_cq = info->reg_cq;
- ceq->reg_cq_size = 0;
- spin_lock_init(&ceq->req_cq_lock);
ceq->pbl_chunk_size = (ceq->virtual_map ? info->pbl_chunk_size : 0);
ceq->first_pm_pbl_idx = (ceq->virtual_map ? info->first_pm_pbl_idx : 0);
ceq->pbl_list = (ceq->virtual_map ? info->pbl_list : NULL);
@@ -4472,9 +4390,6 @@ int irdma_sc_cceq_destroy_done(struct irdma_sc_ceq *ceq)
{
struct irdma_sc_cqp *cqp;
- if (ceq->reg_cq)
- irdma_sc_remove_cq_ctx(ceq, ceq->dev->ccq);
-
cqp = ceq->dev->cqp;
cqp->process_cqp_sds = irdma_update_sds_noccq;
@@ -4493,11 +4408,6 @@ int irdma_sc_cceq_create(struct irdma_sc_ceq *ceq, u64 scratch)
struct irdma_sc_dev *dev = ceq->dev;
dev->ccq->vsi_idx = ceq->vsi_idx;
- if (ceq->reg_cq) {
- ret_code = irdma_sc_add_cq_ctx(ceq, ceq->dev->ccq);
- if (ret_code)
- return ret_code;
- }
ret_code = irdma_sc_ceq_create(ceq, scratch, true);
if (!ret_code)
@@ -4562,7 +4472,6 @@ void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq)
struct irdma_sc_cq *temp_cq;
u8 polarity;
u32 cq_idx;
- unsigned long flags;
do {
cq_idx = 0;
@@ -4583,11 +4492,6 @@ void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq)
}
cq = temp_cq;
- if (ceq->reg_cq) {
- spin_lock_irqsave(&ceq->req_cq_lock, flags);
- cq_idx = irdma_sc_find_reg_cq(ceq, cq);
- spin_unlock_irqrestore(&ceq->req_cq_lock, flags);
- }
IRDMA_RING_MOVE_TAIL(ceq->ceq_ring);
if (!IRDMA_RING_CURRENT_TAIL(ceq->ceq_ring))
@@ -4731,7 +4635,8 @@ static int irdma_sc_aeq_destroy(struct irdma_sc_aeq *aeq, u64 scratch,
u64 hdr;
dev = aeq->dev;
- if (dev->privileged)
+
+ if (dev->hw_attrs.uk_attrs.hw_rev <= IRDMA_GEN_2)
writel(0, dev->hw_regs[IRDMA_PFINT_AEQCTL]);
cqp = dev->cqp;
diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c
index 7bad0e38786a..d1fc5726b979 100644
--- a/drivers/infiniband/hw/irdma/hw.c
+++ b/drivers/infiniband/hw/irdma/hw.c
@@ -2365,7 +2365,6 @@ static int irdma_cqp_manage_apbvt_cmd(struct irdma_device *iwdev,
cqp_info = &cqp_request->info;
info = &cqp_info->in.u.manage_apbvt_entry.info;
- memset(info, 0, sizeof(*info));
info->add = add_port;
info->port = accel_local_port;
cqp_info->cqp_cmd = IRDMA_OP_MANAGE_APBVT_ENTRY;
@@ -2474,7 +2473,6 @@ void irdma_manage_arp_cache(struct irdma_pci_f *rf,
if (action == IRDMA_ARP_ADD) {
cqp_info->cqp_cmd = IRDMA_OP_ADD_ARP_CACHE_ENTRY;
info = &cqp_info->in.u.add_arp_cache_entry.info;
- memset(info, 0, sizeof(*info));
info->arp_index = (u16)arp_index;
info->permanent = true;
ether_addr_copy(info->mac_addr, mac_addr);
@@ -2533,7 +2531,6 @@ int irdma_manage_qhash(struct irdma_device *iwdev, struct irdma_cm_info *cminfo,
cqp_info = &cqp_request->info;
info = &cqp_info->in.u.manage_qhash_table_entry.info;
- memset(info, 0, sizeof(*info));
info->vsi = &iwdev->vsi;
info->manage = mtype;
info->entry_type = etype;
diff --git a/drivers/infiniband/hw/irdma/icrdma_if.c b/drivers/infiniband/hw/irdma/icrdma_if.c
index 27b191f61caf..b49fd9cf2476 100644
--- a/drivers/infiniband/hw/irdma/icrdma_if.c
+++ b/drivers/infiniband/hw/irdma/icrdma_if.c
@@ -302,7 +302,8 @@ err_rt_init:
err_ctrl_init:
icrdma_deinit_interrupts(rf, cdev_info);
err_init_interrupts:
- kfree(iwdev->rf);
+ mutex_destroy(&rf->ah_tbl_lock);
+ kfree(rf);
ib_dealloc_device(&iwdev->ibdev);
return err;
@@ -319,6 +320,9 @@ static void icrdma_remove(struct auxiliary_device *aux_dev)
ice_rdma_update_vsi_filter(cdev_info, iwdev->vsi_num, false);
irdma_ib_unregister_device(iwdev);
icrdma_deinit_interrupts(iwdev->rf, cdev_info);
+ mutex_destroy(&iwdev->rf->ah_tbl_lock);
+
+ kfree(iwdev->rf);
pr_debug("INIT: Gen[%d] func[%d] device remove success\n",
rdma_ver, PCI_FUNC(cdev_info->pdev->devfn));
diff --git a/drivers/infiniband/hw/irdma/ig3rdma_if.c b/drivers/infiniband/hw/irdma/ig3rdma_if.c
index 1bb42eb298ba..e1d6670d9396 100644
--- a/drivers/infiniband/hw/irdma/ig3rdma_if.c
+++ b/drivers/infiniband/hw/irdma/ig3rdma_if.c
@@ -55,6 +55,7 @@ static int ig3rdma_vchnl_init(struct irdma_pci_f *rf,
ret = irdma_sc_vchnl_init(&rf->sc_dev, &virt_info);
if (ret) {
destroy_workqueue(rf->vchnl_wq);
+ mutex_destroy(&rf->sc_dev.vchnl_mutex);
return ret;
}
@@ -124,7 +125,9 @@ static void ig3rdma_decfg_rf(struct irdma_pci_f *rf)
{
struct irdma_hw *hw = &rf->hw;
+ mutex_destroy(&rf->ah_tbl_lock);
destroy_workqueue(rf->vchnl_wq);
+ mutex_destroy(&rf->sc_dev.vchnl_mutex);
kfree(hw->io_regs);
iounmap(hw->rdma_reg.addr);
}
@@ -149,6 +152,7 @@ static int ig3rdma_cfg_rf(struct irdma_pci_f *rf,
err = ig3rdma_cfg_regions(&rf->hw, cdev_info);
if (err) {
destroy_workqueue(rf->vchnl_wq);
+ mutex_destroy(&rf->sc_dev.vchnl_mutex);
return err;
}
diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h
index 886b30da188a..baab61e424a2 100644
--- a/drivers/infiniband/hw/irdma/main.h
+++ b/drivers/infiniband/hw/irdma/main.h
@@ -556,7 +556,7 @@ void irdma_copy_ip_htonl(__be32 *dst, u32 *src);
u16 irdma_get_vlan_ipv4(u32 *addr);
void irdma_get_vlan_mac_ipv6(u32 *addr, u16 *vlan_id, u8 *mac);
struct ib_mr *irdma_reg_phys_mr(struct ib_pd *ib_pd, u64 addr, u64 size,
- int acc, u64 *iova_start);
+ int acc, u64 *iova_start, bool dma_mr);
int irdma_upload_qp_context(struct irdma_qp *iwqp, bool freeze, bool raw);
void irdma_cqp_ce_handler(struct irdma_pci_f *rf, struct irdma_sc_cq *cq);
int irdma_ah_cqp_op(struct irdma_pci_f *rf, struct irdma_sc_ah *sc_ah, u8 cmd,
@@ -564,7 +564,6 @@ int irdma_ah_cqp_op(struct irdma_pci_f *rf, struct irdma_sc_ah *sc_ah, u8 cmd,
void (*callback_fcn)(struct irdma_cqp_request *cqp_request),
void *cb_param);
void irdma_gsi_ud_qp_ah_cb(struct irdma_cqp_request *cqp_request);
-bool irdma_cq_empty(struct irdma_cq *iwcq);
int irdma_inetaddr_event(struct notifier_block *notifier, unsigned long event,
void *ptr);
int irdma_inet6addr_event(struct notifier_block *notifier, unsigned long event,
diff --git a/drivers/infiniband/hw/irdma/pble.c b/drivers/infiniband/hw/irdma/pble.c
index fa6325adaede..28dfad7f940c 100644
--- a/drivers/infiniband/hw/irdma/pble.c
+++ b/drivers/infiniband/hw/irdma/pble.c
@@ -506,12 +506,14 @@ exit:
void irdma_free_pble(struct irdma_hmc_pble_rsrc *pble_rsrc,
struct irdma_pble_alloc *palloc)
{
- pble_rsrc->freedpbles += palloc->total_cnt;
-
if (palloc->level == PBLE_LEVEL_2)
free_lvl2(pble_rsrc, palloc);
else
irdma_prm_return_pbles(&pble_rsrc->pinfo,
&palloc->level1.chunkinfo);
+
+ mutex_lock(&pble_rsrc->pble_mutex_lock);
+ pble_rsrc->freedpbles += palloc->total_cnt;
pble_rsrc->stats_alloc_freed++;
+ mutex_unlock(&pble_rsrc->pble_mutex_lock);
}
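
Taking pble_mutex_lock around the counter updates closes a classic
lost-update race; without it, two concurrent frees may interleave roughly as
sketched below, losing one increment:

	/* CPU0                        CPU1
	 * tmp0 = freedpbles;          tmp1 = freedpbles;
	 * freedpbles = tmp0 + a;      freedpbles = tmp1 + b;   (a is lost)
	 */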
diff --git a/drivers/infiniband/hw/irdma/puda.c b/drivers/infiniband/hw/irdma/puda.c
index 694e5a9ed15d..cee47ddbd1b5 100644
--- a/drivers/infiniband/hw/irdma/puda.c
+++ b/drivers/infiniband/hw/irdma/puda.c
@@ -685,7 +685,6 @@ static int irdma_puda_qp_create(struct irdma_puda_rsrc *rsrc)
ukqp->rq_size = rsrc->rq_size;
IRDMA_RING_INIT(ukqp->sq_ring, ukqp->sq_size);
- IRDMA_RING_INIT(ukqp->initial_ring, ukqp->sq_size);
IRDMA_RING_INIT(ukqp->rq_ring, ukqp->rq_size);
ukqp->wqe_alloc_db = qp->pd->dev->wqe_alloc_db;
@@ -726,7 +725,6 @@ static int irdma_puda_cq_wqe(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq)
struct irdma_sc_cqp *cqp;
u64 hdr;
struct irdma_ccq_cqe_info compl_info;
- int status = 0;
cqp = dev->cqp;
wqe = irdma_sc_cqp_get_next_send_wqe(cqp, 0);
@@ -756,16 +754,8 @@ static int irdma_puda_cq_wqe(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq)
print_hex_dump_debug("PUDA: PUDA CREATE CQ", DUMP_PREFIX_OFFSET, 16,
8, wqe, IRDMA_CQP_WQE_SIZE * 8, false);
irdma_sc_cqp_post_sq(dev->cqp);
- status = irdma_sc_poll_for_cqp_op_done(dev->cqp, IRDMA_CQP_OP_CREATE_CQ,
- &compl_info);
- if (!status) {
- struct irdma_sc_ceq *ceq = dev->ceq[0];
-
- if (ceq && ceq->reg_cq)
- status = irdma_sc_add_cq_ctx(ceq, cq);
- }
-
- return status;
+ return irdma_sc_poll_for_cqp_op_done(dev->cqp, IRDMA_CQP_OP_CREATE_CQ,
+ &compl_info);
}
/**
@@ -897,23 +887,17 @@ void irdma_puda_dele_rsrc(struct irdma_sc_vsi *vsi, enum puda_rsrc_type type,
struct irdma_puda_buf *buf = NULL;
struct irdma_puda_buf *nextbuf = NULL;
struct irdma_virt_mem *vmem;
- struct irdma_sc_ceq *ceq;
- ceq = vsi->dev->ceq[0];
switch (type) {
case IRDMA_PUDA_RSRC_TYPE_ILQ:
rsrc = vsi->ilq;
vmem = &vsi->ilq_mem;
vsi->ilq = NULL;
- if (ceq && ceq->reg_cq)
- irdma_sc_remove_cq_ctx(ceq, &rsrc->cq);
break;
case IRDMA_PUDA_RSRC_TYPE_IEQ:
rsrc = vsi->ieq;
vmem = &vsi->ieq_mem;
vsi->ieq = NULL;
- if (ceq && ceq->reg_cq)
- irdma_sc_remove_cq_ctx(ceq, &rsrc->cq);
break;
default:
ibdev_dbg(to_ibdev(dev), "PUDA: error resource type = 0x%x\n",
diff --git a/drivers/infiniband/hw/irdma/type.h b/drivers/infiniband/hw/irdma/type.h
index c1b8f81ea283..cab4896640a1 100644
--- a/drivers/infiniband/hw/irdma/type.h
+++ b/drivers/infiniband/hw/irdma/type.h
@@ -492,9 +492,6 @@ struct irdma_sc_ceq {
u32 first_pm_pbl_idx;
u8 polarity;
u16 vsi_idx;
- struct irdma_sc_cq **reg_cq;
- u32 reg_cq_size;
- spinlock_t req_cq_lock; /* protect access to reg_cq array */
bool virtual_map:1;
bool tph_en:1;
bool itr_no_expire:1;
@@ -894,8 +891,6 @@ struct irdma_ceq_init_info {
u8 tph_val;
u16 vsi_idx;
u32 first_pm_pbl_idx;
- struct irdma_sc_cq **reg_cq;
- u32 reg_cq_idx;
};
struct irdma_aeq_init_info {
diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c
index ce1ae10c30fc..f0846b800913 100644
--- a/drivers/infiniband/hw/irdma/uk.c
+++ b/drivers/infiniband/hw/irdma/uk.c
@@ -114,33 +114,8 @@ void irdma_clr_wqes(struct irdma_qp_uk *qp, u32 qp_wqe_idx)
*/
void irdma_uk_qp_post_wr(struct irdma_qp_uk *qp)
{
- u64 temp;
- u32 hw_sq_tail;
- u32 sw_sq_head;
-
- /* valid bit is written and loads completed before reading shadow */
- mb();
-
- /* read the doorbell shadow area */
- get_64bit_val(qp->shadow_area, 0, &temp);
-
- hw_sq_tail = (u32)FIELD_GET(IRDMA_QP_DBSA_HW_SQ_TAIL, temp);
- sw_sq_head = IRDMA_RING_CURRENT_HEAD(qp->sq_ring);
- if (sw_sq_head != qp->initial_ring.head) {
- if (sw_sq_head != hw_sq_tail) {
- if (sw_sq_head > qp->initial_ring.head) {
- if (hw_sq_tail >= qp->initial_ring.head &&
- hw_sq_tail < sw_sq_head)
- writel(qp->qp_id, qp->wqe_alloc_db);
- } else {
- if (hw_sq_tail >= qp->initial_ring.head ||
- hw_sq_tail < sw_sq_head)
- writel(qp->qp_id, qp->wqe_alloc_db);
- }
- }
- }
-
- qp->initial_ring.head = qp->sq_ring.head;
+ dma_wmb();
+ writel(qp->qp_id, qp->wqe_alloc_db);
}
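
The rewrite drops the shadow-tail heuristic that tried to suppress doorbells
the hardware had already consumed, and instead always rings after a write
barrier. The only ordering the simplified path relies on is that all WQE
stores are visible before the doorbell write, which dma_wmb() provides on
coherent memory; sketched (not verbatim from this patch):

	set_64bit_val(wqe, 24, hdr);	/* last WQE word carries the valid bit */
	dma_wmb();			/* order WQE stores before the doorbell */
	writel(qp->qp_id, qp->wqe_alloc_db);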
/**
@@ -194,6 +169,7 @@ __le64 *irdma_qp_get_next_send_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx,
qp->sq_wrtrk_array[*wqe_idx].wrid = info->wr_id;
qp->sq_wrtrk_array[*wqe_idx].wr_len = total_size;
qp->sq_wrtrk_array[*wqe_idx].quanta = quanta;
+ qp->sq_wrtrk_array[*wqe_idx].signaled = info->signaled;
return wqe;
}
@@ -1137,6 +1113,27 @@ void irdma_uk_cq_request_notification(struct irdma_cq_uk *cq,
}
/**
+ * irdma_uk_cq_empty - Check if CQ is empty
+ * @cq: hw cq
+ */
+bool irdma_uk_cq_empty(struct irdma_cq_uk *cq)
+{
+ __le64 *cqe;
+ u8 polarity;
+ u64 qword3;
+
+ if (cq->avoid_mem_cflct)
+ cqe = IRDMA_GET_CURRENT_EXTENDED_CQ_ELEM(cq);
+ else
+ cqe = IRDMA_GET_CURRENT_CQ_ELEM(cq);
+
+ get_64bit_val(cqe, 24, &qword3);
+ polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3);
+
+ return polarity != cq->polarity;
+}
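+
Emptiness is inferred from the CQE valid bit: hardware writes the bit equal to
the current generation, and software flips cq->polarity each time the ring
wraps, so a leftover entry from the previous lap no longer matches. The
consumer side of that scheme, as it appears in the poll path:

	IRDMA_RING_MOVE_TAIL(cq->cq_ring);
	if (!IRDMA_RING_CURRENT_HEAD(cq->cq_ring))
		cq->polarity ^= 1;	/* new lap, new expected valid bit */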
+
+/**
* irdma_uk_cq_poll_cmpl - get cq completion info
* @cq: hw cq
* @info: cq poll information returned
@@ -1287,6 +1284,8 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq,
info->op_type = (u8)FIELD_GET(IRDMACQ_OP, qword3);
if (info->q_type == IRDMA_CQE_QTYPE_RQ && is_srq) {
+ unsigned long flags;
+
srq = qp->srq_uk;
get_64bit_val(cqe, 8, &info->wr_id);
@@ -1299,8 +1298,11 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq,
} else {
info->stag_invalid_set = false;
}
+ spin_lock_irqsave(srq->lock, flags);
IRDMA_RING_MOVE_TAIL(srq->srq_ring);
+ spin_unlock_irqrestore(srq->lock, flags);
pring = &srq->srq_ring;
+
} else if (info->q_type == IRDMA_CQE_QTYPE_RQ && !is_srq) {
u32 array_idx;
@@ -1355,6 +1357,10 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq,
info->wr_id = qp->sq_wrtrk_array[wqe_idx].wrid;
if (!info->comp_status)
info->bytes_xfered = qp->sq_wrtrk_array[wqe_idx].wr_len;
+ if (!qp->sq_wrtrk_array[wqe_idx].signaled) {
+ ret_code = -EFAULT;
+ goto exit;
+ }
info->op_type = (u8)FIELD_GET(IRDMACQ_OP, qword3);
IRDMA_RING_SET_TAIL(qp->sq_ring,
wqe_idx + qp->sq_wrtrk_array[wqe_idx].quanta);
@@ -1420,8 +1426,9 @@ exit:
IRDMA_RING_MOVE_TAIL(cq->cq_ring);
if (!cq->avoid_mem_cflct && ext_valid)
IRDMA_RING_MOVE_TAIL(cq->cq_ring);
- set_64bit_val(cq->shadow_area, 0,
- IRDMA_RING_CURRENT_HEAD(cq->cq_ring));
+ if (IRDMA_RING_CURRENT_HEAD(cq->cq_ring) & 0x3F || irdma_uk_cq_empty(cq))
+ set_64bit_val(cq->shadow_area, 0,
+ IRDMA_RING_CURRENT_HEAD(cq->cq_ring));
} else {
qword3 &= ~IRDMA_CQ_WQEIDX;
qword3 |= FIELD_PREP(IRDMA_CQ_WQEIDX, pring->tail);
@@ -1574,7 +1581,6 @@ static void irdma_setup_connection_wqes(struct irdma_qp_uk *qp,
qp->conn_wqes = move_cnt;
IRDMA_RING_MOVE_HEAD_BY_COUNT_NOCHECK(qp->sq_ring, move_cnt);
IRDMA_RING_MOVE_TAIL_BY_COUNT(qp->sq_ring, move_cnt);
- IRDMA_RING_MOVE_HEAD_BY_COUNT_NOCHECK(qp->initial_ring, move_cnt);
}
/**
@@ -1719,7 +1725,6 @@ int irdma_uk_qp_init(struct irdma_qp_uk *qp, struct irdma_qp_uk_init_info *info)
qp->max_sq_frag_cnt = info->max_sq_frag_cnt;
sq_ring_size = qp->sq_size << info->sq_shift;
IRDMA_RING_INIT(qp->sq_ring, sq_ring_size);
- IRDMA_RING_INIT(qp->initial_ring, sq_ring_size);
if (info->first_sq_wq) {
irdma_setup_connection_wqes(qp, info);
qp->swqe_polarity = 1;
diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h
index ab57f689827a..9eb7fd0b1cbf 100644
--- a/drivers/infiniband/hw/irdma/user.h
+++ b/drivers/infiniband/hw/irdma/user.h
@@ -429,6 +429,7 @@ struct irdma_wqe_uk_ops {
struct irdma_bind_window *op_info);
};
+bool irdma_uk_cq_empty(struct irdma_cq_uk *cq);
int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq,
struct irdma_cq_poll_info *info);
void irdma_uk_cq_request_notification(struct irdma_cq_uk *cq,
@@ -456,7 +457,6 @@ struct irdma_srq_uk {
struct irdma_uk_attrs *uk_attrs;
__le64 *shadow_area;
struct irdma_ring srq_ring;
- struct irdma_ring initial_ring;
u32 srq_id;
u32 srq_size;
u32 max_srq_frag_cnt;
@@ -465,6 +465,7 @@ struct irdma_srq_uk {
u8 wqe_size;
u8 wqe_size_multiplier;
u8 deferred_flag;
+ spinlock_t *lock;
};
struct irdma_srq_uk_init_info {
@@ -482,7 +483,8 @@ struct irdma_sq_uk_wr_trk_info {
u64 wrid;
u32 wr_len;
u16 quanta;
- u8 reserved[2];
+ u8 signaled;
+ u8 reserved[1];
};
struct irdma_qp_quanta {
diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c
index 8b94d87b0192..cc2a12f735d3 100644
--- a/drivers/infiniband/hw/irdma/utils.c
+++ b/drivers/infiniband/hw/irdma/utils.c
@@ -452,6 +452,7 @@ struct irdma_cqp_request *irdma_alloc_and_get_cqp_request(struct irdma_cqp *cqp,
cqp_request->waiting = wait;
refcount_set(&cqp_request->refcnt, 1);
memset(&cqp_request->compl_info, 0, sizeof(cqp_request->compl_info));
+ memset(&cqp_request->info, 0, sizeof(cqp_request->info));
return cqp_request;
}
@@ -1068,7 +1069,6 @@ int irdma_cqp_qp_create_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp)
cqp_info = &cqp_request->info;
qp_info = &cqp_request->info.in.u.qp_create.info;
- memset(qp_info, 0, sizeof(*qp_info));
qp_info->cq_num_valid = true;
qp_info->next_iwarp_state = IRDMA_QP_STATE_RTS;
cqp_info->cqp_cmd = IRDMA_OP_QP_CREATE;
@@ -1343,7 +1343,6 @@ int irdma_cqp_qp_destroy_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp)
return -ENOMEM;
cqp_info = &cqp_request->info;
- memset(cqp_info, 0, sizeof(*cqp_info));
cqp_info->cqp_cmd = IRDMA_OP_QP_DESTROY;
cqp_info->post_sq = 1;
cqp_info->in.u.qp_destroy.qp = qp;
@@ -1749,7 +1748,6 @@ int irdma_cqp_gather_stats_cmd(struct irdma_sc_dev *dev,
return -ENOMEM;
cqp_info = &cqp_request->info;
- memset(cqp_info, 0, sizeof(*cqp_info));
cqp_info->cqp_cmd = IRDMA_OP_STATS_GATHER;
cqp_info->post_sq = 1;
cqp_info->in.u.stats_gather.info = pestat->gather_info;
@@ -1789,7 +1787,6 @@ int irdma_cqp_stats_inst_cmd(struct irdma_sc_vsi *vsi, u8 cmd,
return -ENOMEM;
cqp_info = &cqp_request->info;
- memset(cqp_info, 0, sizeof(*cqp_info));
cqp_info->cqp_cmd = cmd;
cqp_info->post_sq = 1;
cqp_info->in.u.stats_manage.info = *stats_info;
@@ -1890,7 +1887,6 @@ int irdma_cqp_ws_node_cmd(struct irdma_sc_dev *dev, u8 cmd,
return -ENOMEM;
cqp_info = &cqp_request->info;
- memset(cqp_info, 0, sizeof(*cqp_info));
cqp_info->cqp_cmd = cmd;
cqp_info->post_sq = 1;
cqp_info->in.u.ws_node.info = *node_info;
@@ -2357,24 +2353,6 @@ void irdma_ib_qp_event(struct irdma_qp *iwqp, enum irdma_qp_event_type event)
iwqp->ibqp.event_handler(&ibevent, iwqp->ibqp.qp_context);
}
-bool irdma_cq_empty(struct irdma_cq *iwcq)
-{
- struct irdma_cq_uk *ukcq;
- u64 qword3;
- __le64 *cqe;
- u8 polarity;
-
- ukcq = &iwcq->sc_cq.cq_uk;
- if (ukcq->avoid_mem_cflct)
- cqe = IRDMA_GET_CURRENT_EXTENDED_CQ_ELEM(ukcq);
- else
- cqe = IRDMA_GET_CURRENT_CQ_ELEM(ukcq);
- get_64bit_val(cqe, 24, &qword3);
- polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3);
-
- return polarity != ukcq->polarity;
-}
-
void irdma_remove_cmpls_list(struct irdma_cq *iwcq)
{
struct irdma_cmpl_gen *cmpl_node;
@@ -2436,6 +2414,8 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp)
struct irdma_qp_uk *qp = &iwqp->sc_qp.qp_uk;
struct irdma_ring *sq_ring = &qp->sq_ring;
struct irdma_ring *rq_ring = &qp->rq_ring;
+ struct irdma_cq *iwscq = iwqp->iwscq;
+ struct irdma_cq *iwrcq = iwqp->iwrcq;
struct irdma_cmpl_gen *cmpl;
__le64 *sw_wqe;
u64 wqe_qword;
@@ -2443,8 +2423,8 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp)
bool compl_generated = false;
unsigned long flags1;
- spin_lock_irqsave(&iwqp->iwscq->lock, flags1);
- if (irdma_cq_empty(iwqp->iwscq)) {
+ spin_lock_irqsave(&iwscq->lock, flags1);
+ if (irdma_uk_cq_empty(&iwscq->sc_cq.cq_uk)) {
unsigned long flags2;
spin_lock_irqsave(&iwqp->lock, flags2);
@@ -2452,7 +2432,7 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp)
cmpl = kzalloc(sizeof(*cmpl), GFP_ATOMIC);
if (!cmpl) {
spin_unlock_irqrestore(&iwqp->lock, flags2);
- spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1);
+ spin_unlock_irqrestore(&iwscq->lock, flags1);
return;
}
@@ -2471,24 +2451,24 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp)
kfree(cmpl);
continue;
}
- ibdev_dbg(iwqp->iwscq->ibcq.device,
+ ibdev_dbg(iwscq->ibcq.device,
"DEV: %s: adding wr_id = 0x%llx SQ Completion to list qp_id=%d\n",
__func__, cmpl->cpi.wr_id, qp->qp_id);
- list_add_tail(&cmpl->list, &iwqp->iwscq->cmpl_generated);
+ list_add_tail(&cmpl->list, &iwscq->cmpl_generated);
compl_generated = true;
}
spin_unlock_irqrestore(&iwqp->lock, flags2);
- spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1);
+ spin_unlock_irqrestore(&iwscq->lock, flags1);
if (compl_generated)
- irdma_comp_handler(iwqp->iwscq);
+ irdma_comp_handler(iwscq);
} else {
- spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1);
+ spin_unlock_irqrestore(&iwscq->lock, flags1);
mod_delayed_work(iwqp->iwdev->cleanup_wq, &iwqp->dwork_flush,
msecs_to_jiffies(IRDMA_FLUSH_DELAY_MS));
}
- spin_lock_irqsave(&iwqp->iwrcq->lock, flags1);
- if (irdma_cq_empty(iwqp->iwrcq)) {
+ spin_lock_irqsave(&iwrcq->lock, flags1);
+ if (irdma_uk_cq_empty(&iwrcq->sc_cq.cq_uk)) {
unsigned long flags2;
spin_lock_irqsave(&iwqp->lock, flags2);
@@ -2496,7 +2476,7 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp)
cmpl = kzalloc(sizeof(*cmpl), GFP_ATOMIC);
if (!cmpl) {
spin_unlock_irqrestore(&iwqp->lock, flags2);
- spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1);
+ spin_unlock_irqrestore(&iwrcq->lock, flags1);
return;
}
@@ -2508,20 +2488,20 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp)
cmpl->cpi.q_type = IRDMA_CQE_QTYPE_RQ;
/* remove the RQ WR by moving RQ tail */
IRDMA_RING_SET_TAIL(*rq_ring, rq_ring->tail + 1);
- ibdev_dbg(iwqp->iwrcq->ibcq.device,
+ ibdev_dbg(iwrcq->ibcq.device,
"DEV: %s: adding wr_id = 0x%llx RQ Completion to list qp_id=%d, wqe_idx=%d\n",
__func__, cmpl->cpi.wr_id, qp->qp_id,
wqe_idx);
- list_add_tail(&cmpl->list, &iwqp->iwrcq->cmpl_generated);
+ list_add_tail(&cmpl->list, &iwrcq->cmpl_generated);
compl_generated = true;
}
spin_unlock_irqrestore(&iwqp->lock, flags2);
- spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1);
+ spin_unlock_irqrestore(&iwrcq->lock, flags1);
if (compl_generated)
- irdma_comp_handler(iwqp->iwrcq);
+ irdma_comp_handler(iwrcq);
} else {
- spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1);
+ spin_unlock_irqrestore(&iwrcq->lock, flags1);
mod_delayed_work(iwqp->iwdev->cleanup_wq, &iwqp->dwork_flush,
msecs_to_jiffies(IRDMA_FLUSH_DELAY_MS));
}
diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c
index c883c9ea5a83..6d9af41a2884 100644
--- a/drivers/infiniband/hw/irdma/verbs.c
+++ b/drivers/infiniband/hw/irdma/verbs.c
@@ -27,7 +27,8 @@ static int irdma_query_device(struct ib_device *ibdev,
irdma_fw_minor_ver(&rf->sc_dev);
props->device_cap_flags = IB_DEVICE_MEM_WINDOW |
IB_DEVICE_MEM_MGT_EXTENSIONS;
- props->kernel_cap_flags = IBK_LOCAL_DMA_LKEY;
+ if (hw_attrs->uk_attrs.hw_rev < IRDMA_GEN_3)
+ props->kernel_cap_flags = IBK_LOCAL_DMA_LKEY;
props->vendor_id = pcidev->vendor;
props->vendor_part_id = pcidev->device;
@@ -771,7 +772,6 @@ static int irdma_cqp_create_qp_cmd(struct irdma_qp *iwqp)
cqp_info = &cqp_request->info;
qp_info = &cqp_request->info.in.u.qp_create.info;
- memset(qp_info, 0, sizeof(*qp_info));
qp_info->mac_valid = true;
qp_info->cq_num_valid = true;
qp_info->next_iwarp_state = IRDMA_QP_STATE_IDLE;
@@ -2029,6 +2029,7 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries,
struct irdma_pci_f *rf;
struct irdma_cq_buf *cq_buf = NULL;
unsigned long flags;
+ u8 cqe_size;
int ret;
iwdev = to_iwdev(ibcq->device);
@@ -2045,7 +2046,7 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries,
return -EINVAL;
if (!iwcq->user_mode) {
- entries++;
+ entries += 2;
if (!iwcq->sc_cq.cq_uk.avoid_mem_cflct &&
dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2)
@@ -2053,6 +2054,10 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries,
if (entries & 1)
entries += 1; /* cq size must be an even number */
+
+ cqe_size = iwcq->sc_cq.cq_uk.avoid_mem_cflct ? 64 : 32;
+ if (entries * cqe_size == IRDMA_HW_PAGE_SIZE)
+ entries += 2;
}
info.cq_size = max(entries, 4);
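
The sizing tweaks reserve extra slack and avoid a kernel CQ that fills exactly
one hardware page: with 64-byte CQEs, 64 entries * 64 bytes == 4096 ==
IRDMA_HW_PAGE_SIZE, so such a request is bumped to 66 entries. The hunk does
not spell out the rationale; a plausible reading (an assumption) is that it
cooperates with the relaxed head & 0x3F gating of shadow-area updates added in
irdma_uk_cq_poll_cmpl above, which batches the update to every 64th CQE or an
empty CQ.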
@@ -2306,8 +2311,8 @@ static int irdma_setup_kmode_srq(struct irdma_device *iwdev,
ukinfo->srq_size = depth >> shift;
ukinfo->shadow_area = mem->va + ring_size;
- info->shadow_area_pa = info->srq_pa + ring_size;
info->srq_pa = mem->pa;
+ info->shadow_area_pa = info->srq_pa + ring_size;
return 0;
}
@@ -2384,6 +2389,7 @@ static int irdma_create_srq(struct ib_srq *ibsrq,
info.vsi = &iwdev->vsi;
info.pd = &iwpd->sc_pd;
+ iwsrq->sc_srq.srq_uk.lock = &iwsrq->lock;
err_code = irdma_sc_srq_init(&iwsrq->sc_srq, &info);
if (err_code)
goto free_dmem;
@@ -2483,6 +2489,7 @@ static int irdma_create_cq(struct ib_cq *ibcq,
int err_code;
int entries = attr->cqe;
bool cqe_64byte_ena;
+ u8 cqe_size;
err_code = cq_validate_flags(attr->flags, dev->hw_attrs.uk_attrs.hw_rev);
if (err_code)
@@ -2509,6 +2516,7 @@ static int irdma_create_cq(struct ib_cq *ibcq,
ukinfo->cq_id = cq_num;
cqe_64byte_ena = dev->hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_64_BYTE_CQE ?
true : false;
+ cqe_size = cqe_64byte_ena ? 64 : 32;
ukinfo->avoid_mem_cflct = cqe_64byte_ena;
iwcq->ibcq.cqe = info.cq_uk_init_info.cq_size;
if (attr->comp_vector < rf->ceqs_count)
@@ -2581,13 +2589,16 @@ static int irdma_create_cq(struct ib_cq *ibcq,
goto cq_free_rsrc;
}
- entries++;
+ entries += 2;
if (!cqe_64byte_ena && dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2)
entries *= 2;
if (entries & 1)
entries += 1; /* cq size must be an even number */
+ if (entries * cqe_size == IRDMA_HW_PAGE_SIZE)
+ entries += 2;
+
ukinfo->cq_size = entries;
if (cqe_64byte_ena)
@@ -3103,12 +3114,10 @@ static int irdma_hw_alloc_stag(struct irdma_device *iwdev,
cqp_info = &cqp_request->info;
info = &cqp_info->in.u.alloc_stag.info;
- memset(info, 0, sizeof(*info));
info->page_size = PAGE_SIZE;
info->stag_idx = iwmr->stag >> IRDMA_CQPSQ_STAG_IDX_S;
info->pd_id = iwpd->sc_pd.pd_id;
info->total_len = iwmr->len;
- info->all_memory = pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY;
info->remote_access = true;
cqp_info->cqp_cmd = IRDMA_OP_ALLOC_STAG;
cqp_info->post_sq = 1;
@@ -3119,7 +3128,7 @@ static int irdma_hw_alloc_stag(struct irdma_device *iwdev,
if (status)
return status;
- iwmr->is_hwreg = 1;
+ iwmr->is_hwreg = true;
return 0;
}
@@ -3253,7 +3262,6 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr,
cqp_info = &cqp_request->info;
stag_info = &cqp_info->in.u.mr_reg_non_shared.info;
- memset(stag_info, 0, sizeof(*stag_info));
stag_info->va = iwpbl->user_base;
stag_info->stag_idx = iwmr->stag >> IRDMA_CQPSQ_STAG_IDX_S;
stag_info->stag_key = (u8)iwmr->stag;
@@ -3263,7 +3271,7 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr,
if (iwdev->rf->sc_dev.hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_ATOMIC_OPS)
stag_info->remote_atomics_en = (access & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0;
stag_info->pd_id = iwpd->sc_pd.pd_id;
- stag_info->all_memory = pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY;
+ stag_info->all_memory = iwmr->dma_mr;
if (stag_info->access_rights & IRDMA_ACCESS_FLAGS_ZERO_BASED)
stag_info->addr_type = IRDMA_ADDR_TYPE_ZERO_BASED;
else
@@ -3290,7 +3298,7 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr,
irdma_put_cqp_request(&iwdev->rf->cqp, cqp_request);
if (!ret)
- iwmr->is_hwreg = 1;
+ iwmr->is_hwreg = true;
return ret;
}
@@ -3647,7 +3655,6 @@ static int irdma_hwdereg_mr(struct ib_mr *ib_mr)
cqp_info = &cqp_request->info;
info = &cqp_info->in.u.dealloc_stag.info;
- memset(info, 0, sizeof(*info));
info->pd_id = iwpd->sc_pd.pd_id;
info->stag_idx = ib_mr->rkey >> IRDMA_CQPSQ_STAG_IDX_S;
info->mr = true;
@@ -3663,7 +3670,7 @@ static int irdma_hwdereg_mr(struct ib_mr *ib_mr)
if (status)
return status;
- iwmr->is_hwreg = 0;
+ iwmr->is_hwreg = false;
return 0;
}
@@ -3786,9 +3793,10 @@ static struct ib_mr *irdma_rereg_user_mr(struct ib_mr *ib_mr, int flags,
* @size: size of memory to register
* @access: Access rights
* @iova_start: start of virtual address for physical buffers
+ * @dma_mr: Flag indicating whether this region is a PD DMA MR
*/
struct ib_mr *irdma_reg_phys_mr(struct ib_pd *pd, u64 addr, u64 size, int access,
- u64 *iova_start)
+ u64 *iova_start, bool dma_mr)
{
struct irdma_device *iwdev = to_iwdev(pd->device);
struct irdma_pbl *iwpbl;
@@ -3805,6 +3813,7 @@ struct ib_mr *irdma_reg_phys_mr(struct ib_pd *pd, u64 addr, u64 size, int access
iwpbl = &iwmr->iwpbl;
iwpbl->iwmr = iwmr;
iwmr->type = IRDMA_MEMREG_TYPE_MEM;
+ iwmr->dma_mr = dma_mr;
iwpbl->user_base = *iova_start;
stag = irdma_create_stag(iwdev);
if (!stag) {
@@ -3843,7 +3852,7 @@ static struct ib_mr *irdma_get_dma_mr(struct ib_pd *pd, int acc)
{
u64 kva = 0;
- return irdma_reg_phys_mr(pd, 0, 0, acc, &kva);
+ return irdma_reg_phys_mr(pd, 0, 0, acc, &kva, true);
}
/**
@@ -4078,7 +4087,7 @@ static int irdma_post_send(struct ib_qp *ibqp,
break;
case IB_WR_LOCAL_INV:
info.op_type = IRDMA_OP_TYPE_INV_STAG;
- info.local_fence = info.read_fence;
+ info.local_fence = true;
info.op.inv_local_stag.target_stag = ib_wr->ex.invalidate_rkey;
err = irdma_uk_stag_local_invalidate(ukqp, &info, true);
break;
@@ -4505,7 +4514,7 @@ static int irdma_req_notify_cq(struct ib_cq *ibcq,
}
if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
- (!irdma_cq_empty(iwcq) || !list_empty(&iwcq->cmpl_generated)))
+ (!irdma_uk_cq_empty(ukcq) || !list_empty(&iwcq->cmpl_generated)))
ret = 1;
spin_unlock_irqrestore(&iwcq->lock, flags);
@@ -5204,7 +5213,7 @@ static int irdma_create_user_ah(struct ib_ah *ibah,
struct irdma_ah *parent_ah;
int err;
- if (udata && udata->outlen < IRDMA_CREATE_AH_MIN_RESP_LEN)
+ if (udata->outlen < IRDMA_CREATE_AH_MIN_RESP_LEN)
return -EINVAL;
err = irdma_setup_ah(ibah, attr);
@@ -5500,7 +5509,9 @@ void irdma_ib_dealloc_device(struct ib_device *ibdev)
irdma_rt_deinit_hw(iwdev);
if (!iwdev->is_vport) {
irdma_ctrl_deinit_hw(iwdev->rf);
- if (iwdev->rf->vchnl_wq)
+ if (iwdev->rf->vchnl_wq) {
destroy_workqueue(iwdev->rf->vchnl_wq);
+ mutex_destroy(&iwdev->rf->sc_dev.vchnl_mutex);
+ }
}
}
diff --git a/drivers/infiniband/hw/irdma/verbs.h b/drivers/infiniband/hw/irdma/verbs.h
index ac8b38701835..aabbb3442098 100644
--- a/drivers/infiniband/hw/irdma/verbs.h
+++ b/drivers/infiniband/hw/irdma/verbs.h
@@ -111,7 +111,8 @@ struct irdma_mr {
};
struct ib_umem *region;
int access;
- u8 is_hwreg;
+ bool is_hwreg:1;
+ bool dma_mr:1;
u16 type;
u32 page_cnt;
u64 page_size;
diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c
index 12b481d138cf..03aacd526860 100644
--- a/drivers/infiniband/hw/mlx4/cm.c
+++ b/drivers/infiniband/hw/mlx4/cm.c
@@ -591,7 +591,7 @@ void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave)
int mlx4_ib_cm_init(void)
{
- cm_wq = alloc_workqueue("mlx4_ib_cm", 0, 0);
+ cm_wq = alloc_workqueue("mlx4_ib_cm", WQ_PERCPU, 0);
if (!cm_wq)
return -ENOMEM;
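
This hunk, and the later ones in rdmavt, iser and isert, belong to the tree-wide effort to make workqueue placement explicit: WQ_PERCPU pins the queue to the historical per-CPU behaviour so the implicit default can eventually move to unbound. A minimal usage sketch, assuming nothing beyond the flag itself:

    static struct workqueue_struct *example_wq;

    static int __init example_init(void)
    {
        /* Explicitly per-CPU; pass WQ_UNBOUND instead if the work
         * has no locality requirement. */
        example_wq = alloc_workqueue("example_wq", WQ_PERCPU, 0);
        if (!example_wq)
            return -ENOMEM;
        return 0;
    }
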
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 8b506417ad2f..d31d7f3005c6 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -1225,6 +1225,11 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din,
MLX5_GET(create_flow_table_in, in, other_vport));
MLX5_SET(destroy_flow_table_in, din, vport_number,
MLX5_GET(create_flow_table_in, in, vport_number));
+ MLX5_SET(destroy_flow_table_in, din, other_eswitch,
+ MLX5_GET(create_flow_table_in, in, other_eswitch));
+ MLX5_SET(destroy_flow_table_in, din, eswitch_owner_vhca_id,
+ MLX5_GET(create_flow_table_in, in,
+ eswitch_owner_vhca_id));
MLX5_SET(destroy_flow_table_in, din, table_type,
MLX5_GET(create_flow_table_in, in, table_type));
MLX5_SET(destroy_flow_table_in, din, table_id, *obj_id);
@@ -1237,6 +1242,11 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din,
MLX5_GET(create_flow_group_in, in, other_vport));
MLX5_SET(destroy_flow_group_in, din, vport_number,
MLX5_GET(create_flow_group_in, in, vport_number));
+ MLX5_SET(destroy_flow_group_in, din, other_eswitch,
+ MLX5_GET(create_flow_group_in, in, other_eswitch));
+ MLX5_SET(destroy_flow_group_in, din, eswitch_owner_vhca_id,
+ MLX5_GET(create_flow_group_in, in,
+ eswitch_owner_vhca_id));
MLX5_SET(destroy_flow_group_in, din, table_type,
MLX5_GET(create_flow_group_in, in, table_type));
MLX5_SET(destroy_flow_group_in, din, table_id,
@@ -1251,6 +1261,10 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din,
MLX5_GET(set_fte_in, in, other_vport));
MLX5_SET(delete_fte_in, din, vport_number,
MLX5_GET(set_fte_in, in, vport_number));
+ MLX5_SET(delete_fte_in, din, other_eswitch,
+ MLX5_GET(set_fte_in, in, other_eswitch));
+ MLX5_SET(delete_fte_in, din, eswitch_owner_vhca_id,
+ MLX5_GET(set_fte_in, in, eswitch_owner_vhca_id));
MLX5_SET(delete_fte_in, din, table_type,
MLX5_GET(set_fte_in, in, table_type));
MLX5_SET(delete_fte_in, din, table_id,
diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c
index b0f7663c24c1..d17823ce7f38 100644
--- a/drivers/infiniband/hw/mlx5/fs.c
+++ b/drivers/infiniband/hw/mlx5/fs.c
@@ -691,22 +691,13 @@ static bool __maybe_unused mlx5_ib_shared_ft_allowed(struct ib_device *device)
return MLX5_CAP_GEN(dev->mdev, shared_object_to_user_object_allowed);
}
-static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev,
- struct mlx5_flow_namespace *ns,
+static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns,
struct mlx5_ib_flow_prio *prio,
- int priority,
- int num_entries, int num_groups,
- u32 flags, u16 vport)
+ struct mlx5_flow_table_attr *ft_attr)
{
- struct mlx5_flow_table_attr ft_attr = {};
struct mlx5_flow_table *ft;
- ft_attr.prio = priority;
- ft_attr.max_fte = num_entries;
- ft_attr.flags = flags;
- ft_attr.vport = vport;
- ft_attr.autogroup.max_num_groups = num_groups;
- ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
+ ft = mlx5_create_auto_grouped_flow_table(ns, ft_attr);
if (IS_ERR(ft))
return ERR_CAST(ft);
@@ -720,6 +711,7 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
enum flow_table_type ft_type)
{
bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP;
+ struct mlx5_flow_table_attr ft_attr = {};
struct mlx5_flow_namespace *ns = NULL;
enum mlx5_flow_namespace_type fn_type;
struct mlx5_ib_flow_prio *prio;
@@ -797,11 +789,14 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
max_table_size = min_t(int, num_entries, max_table_size);
ft = prio->flow_table;
- if (!ft)
- return _get_prio(dev, ns, prio, priority, max_table_size,
- num_groups, flags, 0);
+ if (ft)
+ return prio;
- return prio;
+ ft_attr.prio = priority;
+ ft_attr.max_fte = max_table_size;
+ ft_attr.flags = flags;
+ ft_attr.autogroup.max_num_groups = num_groups;
+ return _get_prio(ns, prio, &ft_attr);
}
enum {
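
The _get_prio() refactor above replaces six positional parameters with a single struct mlx5_flow_table_attr, which is what lets the later hunks thread vport and esw_owner_vhca_id through without another signature change. The calling convention, sketched with designated initializers and the field names used in this patch:

    struct mlx5_flow_table_attr ft_attr = {
        .prio = priority,
        .max_fte = max_table_size,
        .flags = flags,
        .autogroup.max_num_groups = num_groups,
        /* later additions slot in without churn at every call site: */
        .vport = vport,
        .esw_owner_vhca_id = esw_owner_vhca_id,
    };

    prio = _get_prio(ns, prio, &ft_attr);
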
@@ -950,6 +945,7 @@ static int get_per_qp_prio(struct mlx5_ib_dev *dev,
enum mlx5_ib_optional_counter_type type)
{
enum mlx5_ib_optional_counter_type per_qp_type;
+ struct mlx5_flow_table_attr ft_attr = {};
enum mlx5_flow_namespace_type fn_type;
struct mlx5_flow_namespace *ns;
struct mlx5_ib_flow_prio *prio;
@@ -1003,7 +999,10 @@ static int get_per_qp_prio(struct mlx5_ib_dev *dev,
if (prio->flow_table)
return 0;
- prio = _get_prio(dev, ns, prio, priority, MLX5_FS_MAX_POOL_SIZE, 1, 0, 0);
+ ft_attr.prio = priority;
+ ft_attr.max_fte = MLX5_FS_MAX_POOL_SIZE;
+ ft_attr.autogroup.max_num_groups = 1;
+ prio = _get_prio(ns, prio, &ft_attr);
if (IS_ERR(prio))
return PTR_ERR(prio);
@@ -1223,6 +1222,7 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num,
struct mlx5_ib_op_fc *opfc,
enum mlx5_ib_optional_counter_type type)
{
+ struct mlx5_flow_table_attr ft_attr = {};
enum mlx5_flow_namespace_type fn_type;
int priority, i, err, spec_num;
struct mlx5_flow_act flow_act = {};
@@ -1304,8 +1304,10 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num,
if (err)
goto free;
- prio = _get_prio(dev, ns, prio, priority,
- dev->num_ports * MAX_OPFC_RULES, 1, 0, 0);
+ ft_attr.prio = priority;
+ ft_attr.max_fte = dev->num_ports * MAX_OPFC_RULES;
+ ft_attr.autogroup.max_num_groups = 1;
+ prio = _get_prio(ns, prio, &ft_attr);
if (IS_ERR(prio)) {
err = PTR_ERR(prio);
goto put_prio;
@@ -1872,7 +1874,7 @@ static int mlx5_ib_fill_transport_ns_info(struct mlx5_ib_dev *dev,
u32 *flags, u16 *vport_idx,
u16 *vport,
struct mlx5_core_dev **ft_mdev,
- u32 ib_port)
+ u32 ib_port, u16 *esw_owner_vhca_id)
{
struct mlx5_core_dev *esw_mdev;
@@ -1886,8 +1888,13 @@ static int mlx5_ib_fill_transport_ns_info(struct mlx5_ib_dev *dev,
return -EINVAL;
esw_mdev = mlx5_eswitch_get_core_dev(dev->port[ib_port - 1].rep->esw);
- if (esw_mdev != dev->mdev)
- return -EOPNOTSUPP;
+ if (esw_mdev != dev->mdev) {
+ if (!MLX5_CAP_ADV_RDMA(dev->mdev,
+ rdma_transport_manager_other_eswitch))
+ return -EOPNOTSUPP;
+ *flags |= MLX5_FLOW_TABLE_OTHER_ESWITCH;
+ *esw_owner_vhca_id = MLX5_CAP_GEN(esw_mdev, vhca_id);
+ }
*flags |= MLX5_FLOW_TABLE_OTHER_VPORT;
*ft_mdev = esw_mdev;
@@ -1903,8 +1910,10 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority,
bool mcast, u32 ib_port)
{
struct mlx5_core_dev *ft_mdev = dev->mdev;
+ struct mlx5_flow_table_attr ft_attr = {};
struct mlx5_flow_namespace *ns = NULL;
struct mlx5_ib_flow_prio *prio = NULL;
+ u16 esw_owner_vhca_id = 0;
int max_table_size = 0;
u16 vport_idx = 0;
bool esw_encap;
@@ -1966,7 +1975,8 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority,
return ERR_PTR(-EINVAL);
ret = mlx5_ib_fill_transport_ns_info(dev, ns_type, &flags,
&vport_idx, &vport,
- &ft_mdev, ib_port);
+ &ft_mdev, ib_port,
+ &esw_owner_vhca_id);
if (ret)
return ERR_PTR(ret);
@@ -2026,8 +2036,13 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority,
if (prio->flow_table)
return prio;
- return _get_prio(dev, ns, prio, priority, max_table_size,
- MLX5_FS_MAX_TYPES, flags, vport);
+ ft_attr.prio = priority;
+ ft_attr.max_fte = max_table_size;
+ ft_attr.flags = flags;
+ ft_attr.vport = vport;
+ ft_attr.esw_owner_vhca_id = esw_owner_vhca_id;
+ ft_attr.autogroup.max_num_groups = MLX5_FS_MAX_TYPES;
+ return _get_prio(ns, prio, &ft_attr);
}
static struct mlx5_ib_flow_handler *
diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index cc8859d3c2f5..bbecca405171 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -44,6 +44,63 @@ static void mlx5_ib_num_ports_update(struct mlx5_core_dev *dev, u32 *num_ports)
}
}
+static int mlx5_ib_set_owner_transport(struct mlx5_core_dev *cur_owner,
+ struct mlx5_core_dev *new_owner)
+{
+ int ret;
+
+ if (!MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_TX(cur_owner, ft_support) ||
+ !MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_RX(cur_owner, ft_support))
+ return 0;
+
+ if (!MLX5_CAP_ADV_RDMA(new_owner, rdma_transport_manager) ||
+ !MLX5_CAP_ADV_RDMA(new_owner, rdma_transport_manager_other_eswitch))
+ return 0;
+
+ ret = mlx5_fs_set_root_dev(cur_owner, new_owner,
+ FS_FT_RDMA_TRANSPORT_TX);
+ if (ret)
+ return ret;
+
+ ret = mlx5_fs_set_root_dev(cur_owner, new_owner,
+ FS_FT_RDMA_TRANSPORT_RX);
+ if (ret) {
+ mlx5_fs_set_root_dev(cur_owner, cur_owner,
+ FS_FT_RDMA_TRANSPORT_TX);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void mlx5_ib_release_transport(struct mlx5_core_dev *dev)
+{
+ struct mlx5_core_dev *peer_dev;
+ int i, ret;
+
+ mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) {
+ ret = mlx5_ib_set_owner_transport(peer_dev, peer_dev);
+ WARN_ON_ONCE(ret);
+ }
+}
+
+static int mlx5_ib_take_transport(struct mlx5_core_dev *dev)
+{
+ struct mlx5_core_dev *peer_dev;
+ int ret;
+ int i;
+
+ mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) {
+ ret = mlx5_ib_set_owner_transport(peer_dev, dev);
+ if (ret) {
+ mlx5_ib_release_transport(dev);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
static int
mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
{
@@ -88,10 +145,18 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
else
return mlx5_ib_set_vport_rep(lag_master, rep, vport_index);
+ if (mlx5_lag_is_shared_fdb(dev)) {
+ ret = mlx5_ib_take_transport(lag_master);
+ if (ret)
+ return ret;
+ }
+
ibdev = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev,
mlx5_core_net(lag_master));
- if (!ibdev)
- return -ENOMEM;
+ if (!ibdev) {
+ ret = -ENOMEM;
+ goto release_transport;
+ }
ibdev->port = kcalloc(num_ports, sizeof(*ibdev->port),
GFP_KERNEL);
@@ -127,6 +192,10 @@ fail_add:
kfree(ibdev->port);
fail_port:
ib_dealloc_device(&ibdev->ib_dev);
+release_transport:
+ if (mlx5_lag_is_shared_fdb(lag_master))
+ mlx5_ib_release_transport(lag_master);
+
return ret;
}
@@ -182,6 +251,7 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep)
esw = peer_mdev->priv.eswitch;
mlx5_eswitch_unregister_vport_reps(esw, REP_IB);
}
+ mlx5_ib_release_transport(mdev);
}
__mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
}
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 90daa58126f4..40284bbb45d6 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -511,6 +511,10 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed,
*active_width = IB_WIDTH_4X;
*active_speed = IB_SPEED_XDR;
break;
+ case MLX5E_PROT_MASK(MLX5E_1600TAUI_8_1600TBASE_CR8_KR8):
+ *active_width = IB_WIDTH_8X;
+ *active_speed = IB_SPEED_XDR;
+ break;
default:
return -EINVAL;
}
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 0e8ae85af5a6..e71ee3d52eb0 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -97,33 +97,28 @@ struct mlx5_pagefault {
* a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000
-#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
-#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
-#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
-#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
-#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))
-
-#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT
-
static u64 mlx5_imr_ksm_entries;
+static u64 mlx5_imr_mtt_entries;
+static u64 mlx5_imr_mtt_size;
+static u8 mlx5_imr_mtt_shift;
+static u8 mlx5_imr_ksm_page_shift;
-static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
+static void populate_ksm(struct mlx5_ksm *pksm, size_t idx, size_t nentries,
struct mlx5_ib_mr *imr, int flags)
{
struct mlx5_core_dev *dev = mr_to_mdev(imr)->mdev;
- struct mlx5_klm *end = pklm + nentries;
- int step = MLX5_CAP_ODP(dev, mem_page_fault) ? MLX5_IMR_MTT_SIZE : 0;
+ struct mlx5_ksm *end = pksm + nentries;
+ u64 step = MLX5_CAP_ODP(dev, mem_page_fault) ? mlx5_imr_mtt_size : 0;
__be32 key = MLX5_CAP_ODP(dev, mem_page_fault) ?
cpu_to_be32(imr->null_mmkey.key) :
mr_to_mdev(imr)->mkeys.null_mkey;
u64 va =
- MLX5_CAP_ODP(dev, mem_page_fault) ? idx * MLX5_IMR_MTT_SIZE : 0;
+ MLX5_CAP_ODP(dev, mem_page_fault) ? idx * mlx5_imr_mtt_size : 0;
if (flags & MLX5_IB_UPD_XLT_ZAP) {
- for (; pklm != end; pklm++, idx++, va += step) {
- pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
- pklm->key = key;
- pklm->va = cpu_to_be64(va);
+ for (; pksm != end; pksm++, idx++, va += step) {
+ pksm->key = key;
+ pksm->va = cpu_to_be64(va);
}
return;
}
@@ -147,16 +142,15 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
*/
lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
- for (; pklm != end; pklm++, idx++, va += step) {
+ for (; pksm != end; pksm++, idx++, va += step) {
struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
- pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
if (mtt) {
- pklm->key = cpu_to_be32(mtt->ibmr.lkey);
- pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
+ pksm->key = cpu_to_be32(mtt->ibmr.lkey);
+ pksm->va = cpu_to_be64(idx * mlx5_imr_mtt_size);
} else {
- pklm->key = key;
- pklm->va = cpu_to_be64(va);
+ pksm->key = key;
+ pksm->va = cpu_to_be64(va);
}
}
}
@@ -201,7 +195,7 @@ int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
struct mlx5_ib_mr *mr, int flags)
{
if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
- populate_klm(xlt, idx, nentries, mr, flags);
+ populate_ksm(xlt, idx, nentries, mr, flags);
return 0;
} else {
return populate_mtt(xlt, idx, nentries, mr, flags);
@@ -226,7 +220,7 @@ static void free_implicit_child_mr_work(struct work_struct *work)
mutex_lock(&odp_imr->umem_mutex);
mlx5r_umr_update_xlt(mr->parent,
- ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT, 1, 0,
+ ib_umem_start(odp) >> mlx5_imr_mtt_shift, 1, 0,
MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC);
mutex_unlock(&odp_imr->umem_mutex);
mlx5_ib_dereg_mr(&mr->ibmr, NULL);
@@ -237,7 +231,7 @@ static void free_implicit_child_mr_work(struct work_struct *work)
static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
{
struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
- unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
+ unsigned long idx = ib_umem_start(odp) >> mlx5_imr_mtt_shift;
struct mlx5_ib_mr *imr = mr->parent;
/*
@@ -265,7 +259,7 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
/* Freeing a MR is a sleeping operation, so bounce to a work queue */
INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
- queue_work(system_unbound_wq, &mr->odp_destroy.work);
+ queue_work(system_dfl_wq, &mr->odp_destroy.work);
}
static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
@@ -425,7 +419,10 @@ static void internal_fill_odp_caps(struct mlx5_ib_dev *dev)
if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
MLX5_CAP_GEN(dev->mdev, null_mkey) &&
MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
- !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
+ !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled) &&
+ mlx5_imr_ksm_entries != 0 &&
+ !(mlx5_imr_ksm_page_shift >
+ get_max_log_entity_size_cap(dev, MLX5_MKC_ACCESS_MODE_KSM)))
caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
}
@@ -476,14 +473,14 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
int err;
odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
- idx * MLX5_IMR_MTT_SIZE,
- MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
+ idx * mlx5_imr_mtt_size,
+ mlx5_imr_mtt_size, &mlx5_mn_ops);
if (IS_ERR(odp))
return ERR_CAST(odp);
mr = mlx5_mr_cache_alloc(dev, imr->access_flags,
MLX5_MKC_ACCESS_MODE_MTT,
- MLX5_IMR_MTT_ENTRIES);
+ mlx5_imr_mtt_entries);
if (IS_ERR(mr)) {
ib_umem_odp_release(odp);
return mr;
@@ -495,7 +492,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
mr->umem = &odp->umem;
mr->ibmr.lkey = mr->mmkey.key;
mr->ibmr.rkey = mr->mmkey.key;
- mr->ibmr.iova = idx * MLX5_IMR_MTT_SIZE;
+ mr->ibmr.iova = idx * mlx5_imr_mtt_size;
mr->parent = imr;
odp->private = mr;
@@ -506,7 +503,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
refcount_set(&mr->mmkey.usecount, 2);
err = mlx5r_umr_update_xlt(mr, 0,
- MLX5_IMR_MTT_ENTRIES,
+ mlx5_imr_mtt_entries,
PAGE_SHIFT,
MLX5_IB_UPD_XLT_ZAP |
MLX5_IB_UPD_XLT_ENABLE);
@@ -611,7 +608,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
struct mlx5_ib_mr *imr;
int err;
- if (!mlx5r_umr_can_load_pas(dev, MLX5_IMR_MTT_ENTRIES * PAGE_SIZE))
+ if (!mlx5r_umr_can_load_pas(dev, mlx5_imr_mtt_entries * PAGE_SIZE))
return ERR_PTR(-EOPNOTSUPP);
umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
@@ -647,7 +644,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
err = mlx5r_umr_update_xlt(imr, 0,
mlx5_imr_ksm_entries,
- MLX5_KSM_PAGE_SHIFT,
+ mlx5_imr_ksm_page_shift,
MLX5_IB_UPD_XLT_INDIRECT |
MLX5_IB_UPD_XLT_ZAP |
MLX5_IB_UPD_XLT_ENABLE);
@@ -750,20 +747,20 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
struct ib_umem_odp *odp_imr, u64 user_va,
size_t bcnt, u32 *bytes_mapped, u32 flags)
{
- unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
+ unsigned long end_idx = (user_va + bcnt - 1) >> mlx5_imr_mtt_shift;
unsigned long upd_start_idx = end_idx + 1;
unsigned long upd_len = 0;
unsigned long npages = 0;
int err;
int ret;
- if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
- mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt))
+ if (unlikely(user_va >= mlx5_imr_ksm_entries * mlx5_imr_mtt_size ||
+ mlx5_imr_ksm_entries * mlx5_imr_mtt_size - user_va < bcnt))
return -EFAULT;
/* Fault each child mr that intersects with our interval. */
while (bcnt) {
- unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT;
+ unsigned long idx = user_va >> mlx5_imr_mtt_shift;
struct ib_umem_odp *umem_odp;
struct mlx5_ib_mr *mtt;
u64 len;
@@ -1924,9 +1921,25 @@ void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
int mlx5_ib_odp_init(void)
{
+ u32 log_va_pages = ilog2(TASK_SIZE) - PAGE_SHIFT;
+ u8 mlx5_imr_mtt_bits;
+
+ /* 48 is the default ARM64 VA size and covers x86 4-level paging, which is 47 */
+ if (log_va_pages <= 48 - PAGE_SHIFT)
+ mlx5_imr_mtt_shift = 30;
+ /* 56 is x86-64, 5-level paging */
+ else if (log_va_pages <= 56 - PAGE_SHIFT)
+ mlx5_imr_mtt_shift = 34;
+ else
+ return 0;
+
+ mlx5_imr_mtt_size = BIT_ULL(mlx5_imr_mtt_shift);
+ mlx5_imr_mtt_bits = mlx5_imr_mtt_shift - PAGE_SHIFT;
+ mlx5_imr_mtt_entries = BIT_ULL(mlx5_imr_mtt_bits);
mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
- MLX5_IMR_MTT_BITS);
+ mlx5_imr_mtt_bits);
+ mlx5_imr_ksm_page_shift = mlx5_imr_mtt_shift;
return 0;
}
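
Worked through with 4 KiB pages (PAGE_SHIFT = 12): a 48-bit VA space gives log_va_pages = 36, which selects a 2^30-byte (1 GiB) child MR, while x86-64 5-level paging (up to a 56-bit task size) selects 2^34 bytes (16 GiB), keeping the KSM table bounded on huge address spaces. A standalone sketch of the selection logic:

    /* Standalone model of the child-MR size selection above. */
    #include <stdio.h>

    static unsigned int imr_mtt_shift(unsigned int log_va_pages,
                                      unsigned int page_shift)
    {
        if (log_va_pages <= 48 - page_shift)
            return 30;          /* 1 GiB child MRs */
        if (log_va_pages <= 56 - page_shift)
            return 34;          /* 16 GiB child MRs */
        return 0;               /* implicit ODP unsupported */
    }

    int main(void)
    {
        printf("48-bit VA -> shift %u\n", imr_mtt_shift(48 - 12, 12));
        printf("56-bit VA -> shift %u\n", imr_mtt_shift(56 - 12, 12));
        return 0;
    }
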
@@ -2093,6 +2106,6 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
destroy_prefetch_work(work);
return rc;
}
- queue_work(system_unbound_wq, &work->work);
+ queue_work(system_dfl_wq, &work->work);
return 0;
}
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 88724d15705d..69af20790481 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -3451,10 +3451,11 @@ int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate)
{
u32 stat_rate_support;
- if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS)
+ if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS ||
+ rate == IB_RATE_1600_GBPS)
return 0;
- if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_800_GBPS)
+ if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_1600_GBPS)
return -EINVAL;
stat_rate_support = MLX5_CAP_GEN(dev->mdev, stat_rate_support);
diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c
index 0ca2743f1075..e7835ca70e2b 100644
--- a/drivers/infiniband/sw/rdmavt/cq.c
+++ b/drivers/infiniband/sw/rdmavt/cq.c
@@ -518,7 +518,8 @@ int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
*/
int rvt_driver_cq_init(void)
{
- comp_vector_wq = alloc_workqueue("%s", WQ_HIGHPRI | WQ_CPU_INTENSIVE,
+ comp_vector_wq = alloc_workqueue("%s",
+ WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_PERCPU,
0, "rdmavt_cq");
if (!comp_vector_wq)
return -ENOMEM;
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index bcb97b3ea58a..b1df05238848 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -452,7 +452,6 @@ static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int leng
length -= bytes;
iova += bytes;
- page_offset = 0;
}
return 0;
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index ac0183a2ff7a..0195d361e5e3 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -20,6 +20,54 @@
static struct rxe_recv_sockets recv_sockets;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * lockdep can report false-positive circular dependencies
+ * when user-space socket API users or in-kernel users
+ * switch between a tcp and an rdma transport. Switching
+ * between siw and rxe may also cause problems, because
+ * by default sockets are classified only by family and
+ * not by ip protocol, and the application and the low
+ * level sockets may use different locks.
+ *
+ * Problems were seen with ksmbd.ko and cifs.ko when
+ * switching transports; use git blame for more details.
+ */
+static struct lock_class_key rxe_recv_sk_key[2];
+static struct lock_class_key rxe_recv_slock_key[2];
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+
+static inline void rxe_reclassify_recv_socket(struct socket *sock)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct sock *sk = sock->sk;
+
+ if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
+ return;
+
+ switch (sk->sk_family) {
+ case AF_INET:
+ sock_lock_init_class_and_name(sk,
+ "slock-AF_INET-RDMA-RXE-RECV",
+ &rxe_recv_slock_key[0],
+ "sk_lock-AF_INET-RDMA-RXE-RECV",
+ &rxe_recv_sk_key[0]);
+ break;
+ case AF_INET6:
+ sock_lock_init_class_and_name(sk,
+ "slock-AF_INET6-RDMA-RXE-RECV",
+ &rxe_recv_slock_key[1],
+ "sk_lock-AF_INET6-RDMA-RXE-RECV",
+ &rxe_recv_sk_key[1]);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+}
+
static struct dst_entry *rxe_find_route4(struct rxe_qp *qp,
struct net_device *ndev,
struct in_addr *saddr,
@@ -192,6 +240,7 @@ static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port,
err = udp_sock_create(net, &udp_cfg, &sock);
if (err < 0)
return ERR_PTR(err);
+ rxe_reclassify_recv_socket(sock);
tnl_cfg.encap_type = 1;
tnl_cfg.encap_rcv = rxe_udp_encap_recv;
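
The same helper recurs below for the rxe send socket and for siw's TCP sockets; only the key arrays and class-name strings differ. The primitive underneath is sock_lock_init_class_and_name(), which re-keys both the sk spinlock and the socket mutex so lockdep stops merging them with plain TCP/UDP sockets. A minimal sketch for a hypothetical protocol (the demo_* names are illustrative):

    #ifdef CONFIG_DEBUG_LOCK_ALLOC
    /* One static key per lock gives this user its own lockdep class. */
    static struct lock_class_key demo_sk_key;
    static struct lock_class_key demo_slock_key;

    static void demo_reclassify(struct sock *sk)
    {
        /* Only safe while the socket carries no traffic yet. */
        if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
            return;
        sock_lock_init_class_and_name(sk,
                                      "slock-AF_INET-DEMO", &demo_slock_key,
                                      "sk_lock-AF_INET-DEMO", &demo_sk_key);
    }
    #endif
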
diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
index f58e3ec6252f..ae71812bea82 100644
--- a/drivers/infiniband/sw/rxe/rxe_odp.c
+++ b/drivers/infiniband/sw/rxe/rxe_odp.c
@@ -358,7 +358,6 @@ int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
length -= bytes;
iova += bytes;
- page_offset = 0;
}
mutex_unlock(&umem_odp->umem_mutex);
diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
index 95f1c1c2949d..845bdd03ca28 100644
--- a/drivers/infiniband/sw/rxe/rxe_qp.c
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -15,6 +15,54 @@
#include "rxe_queue.h"
#include "rxe_task.h"
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * lockdep can report false-positive circular dependencies
+ * when user-space socket API users or in-kernel users
+ * switch between a tcp and an rdma transport. Switching
+ * between siw and rxe may also cause problems, because
+ * by default sockets are classified only by family and
+ * not by ip protocol, and the application and the low
+ * level sockets may use different locks.
+ *
+ * Problems were seen with ksmbd.ko and cifs.ko when
+ * switching transports; use git blame for more details.
+ */
+static struct lock_class_key rxe_send_sk_key[2];
+static struct lock_class_key rxe_send_slock_key[2];
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+
+static inline void rxe_reclassify_send_socket(struct socket *sock)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct sock *sk = sock->sk;
+
+ if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
+ return;
+
+ switch (sk->sk_family) {
+ case AF_INET:
+ sock_lock_init_class_and_name(sk,
+ "slock-AF_INET-RDMA-RXE-SEND",
+ &rxe_send_slock_key[0],
+ "sk_lock-AF_INET-RDMA-RXE-SEND",
+ &rxe_send_sk_key[0]);
+ break;
+ case AF_INET6:
+ sock_lock_init_class_and_name(sk,
+ "slock-AF_INET6-RDMA-RXE-SEND",
+ &rxe_send_slock_key[1],
+ "sk_lock-AF_INET6-RDMA-RXE-SEND",
+ &rxe_send_sk_key[1]);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+}
+
static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap,
int has_srq)
{
@@ -244,6 +292,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk);
if (err < 0)
return err;
+ rxe_reclassify_send_socket(qp->sk);
qp->sk->sk->sk_user_data = qp;
/* pick a source UDP port number for this QP based on
diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c
index 3661cb627d28..2a234f26ac10 100644
--- a/drivers/infiniband/sw/rxe/rxe_srq.c
+++ b/drivers/infiniband/sw/rxe/rxe_srq.c
@@ -171,7 +171,7 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
udata, mi, &srq->rq.producer_lock,
&srq->rq.consumer_lock);
if (err)
- goto err_free;
+ return err;
srq->rq.max_wr = attr->max_wr;
}
@@ -180,11 +180,6 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
srq->limit = attr->srq_limit;
return 0;
-
-err_free:
- rxe_queue_cleanup(q);
- srq->rq.queue = NULL;
- return err;
}
void rxe_srq_cleanup(struct rxe_pool_elem *elem)
diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c
index eb0bd4f79a85..1d3de8209bfa 100644
--- a/drivers/infiniband/sw/siw/siw_cm.c
+++ b/drivers/infiniband/sw/siw/siw_cm.c
@@ -39,6 +39,55 @@ static void siw_cm_llp_error_report(struct sock *s);
static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
int status);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * lockdep can report false-positive circular dependencies
+ * when user-space socket API users or in-kernel users
+ * switch between a tcp and an rdma transport. Switching
+ * between siw and rxe may also cause problems, because
+ * by default sockets are classified only by family and
+ * not by ip protocol, and the application and the low
+ * level sockets may use different locks.
+ *
+ * Problems were seen with ksmbd.ko and cifs.ko when
+ * switching transports; use git blame for more details.
+ */
+static struct lock_class_key siw_sk_key[2];
+static struct lock_class_key siw_slock_key[2];
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+
+static inline void siw_reclassify_socket(struct socket *sock)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct sock *sk = sock->sk;
+
+ if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
+ return;
+
+ switch (sk->sk_family) {
+ case AF_INET:
+ sock_lock_init_class_and_name(sk,
+ "slock-AF_INET-RDMA-SIW",
+ &siw_slock_key[0],
+ "sk_lock-AF_INET-RDMA-SIW",
+ &siw_sk_key[0]);
+ break;
+ case AF_INET6:
+ sock_lock_init_class_and_name(sk,
+ "slock-AF_INET6-RDMA-SIW",
+ &siw_slock_key[1],
+ "sk_lock-AF_INET6-RDMA-SIW",
+ &siw_sk_key[1]);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+}
+
static void siw_sk_assign_cm_upcalls(struct sock *sk)
{
struct siw_cep *cep = sk_to_cep(sk);
@@ -1394,6 +1443,7 @@ int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s);
if (rv < 0)
goto error;
+ siw_reclassify_socket(s);
/*
* NOTE: For simplification, connect() is called in blocking
@@ -1770,6 +1820,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog)
rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s);
if (rv < 0)
return rv;
+ siw_reclassify_socket(s);
/*
* Allow binding local port when still in TIME_WAIT from last close.
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 2e3c0516ce8f..dc531fad73de 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -1029,7 +1029,7 @@ static int __init iser_init(void)
mutex_init(&ig.connlist_mutex);
INIT_LIST_HEAD(&ig.connlist);
- release_wq = alloc_workqueue("release workqueue", 0, 0);
+ release_wq = alloc_workqueue("release workqueue", WQ_PERCPU, 0);
if (!release_wq) {
iser_err("failed to allocate release workqueue\n");
err = -ENOMEM;
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index 42977a5326ee..af811d060cc8 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -2613,7 +2613,7 @@ static struct iscsit_transport iser_target_transport = {
static int __init isert_init(void)
{
- isert_login_wq = alloc_workqueue("isert_login_wq", 0, 0);
+ isert_login_wq = alloc_workqueue("isert_login_wq", WQ_PERCPU, 0);
if (!isert_login_wq) {
isert_err("Unable to allocate isert_login_wq\n");
return -ENOMEM;
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
index ef4abdea3c2d..9ecc6343455d 100644
--- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c
+++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
@@ -1450,7 +1450,7 @@ err_free_chunks:
kfree(srv->chunks);
err_free_srv:
- kfree(srv);
+ put_device(&srv->dev);
return ERR_PTR(-ENOMEM);
}