diff options
Diffstat (limited to 'drivers/infiniband/hw')
152 files changed, 8944 insertions, 3657 deletions
diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile index 1211f4317a9f..aba96ca9bce5 100644 --- a/drivers/infiniband/hw/Makefile +++ b/drivers/infiniband/hw/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/ obj-$(CONFIG_INFINIBAND_VMWARE_PVRDMA) += vmw_pvrdma/ obj-$(CONFIG_INFINIBAND_USNIC) += usnic/ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/ -obj-$(CONFIG_INFINIBAND_HNS) += hns/ +obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns/ obj-$(CONFIG_INFINIBAND_QEDR) += qedr/ obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re/ obj-$(CONFIG_INFINIBAND_ERDMA) += erdma/ diff --git a/drivers/infiniband/hw/bnxt_re/Makefile b/drivers/infiniband/hw/bnxt_re/Makefile index ee9bb1be61ea..f63417d2ccc6 100644 --- a/drivers/infiniband/hw/bnxt_re/Makefile +++ b/drivers/infiniband/hw/bnxt_re/Makefile @@ -4,4 +4,5 @@ ccflags-y := -I $(srctree)/drivers/net/ethernet/broadcom/bnxt obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re.o bnxt_re-y := main.o ib_verbs.o \ qplib_res.o qplib_rcfw.o \ - qplib_sp.o qplib_fp.o hw_counters.o + qplib_sp.o qplib_fp.o hw_counters.o \ + debugfs.o diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 9dca451ed522..502a79136d4d 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -53,12 +53,6 @@ #define BNXT_RE_MAX_MR_SIZE_HIGH BIT_ULL(39) #define BNXT_RE_MAX_MR_SIZE BNXT_RE_MAX_MR_SIZE_HIGH -#define BNXT_RE_MAX_QPC_COUNT (64 * 1024) -#define BNXT_RE_MAX_MRW_COUNT (64 * 1024) -#define BNXT_RE_MAX_SRQC_COUNT (64 * 1024) -#define BNXT_RE_MAX_CQ_COUNT (64 * 1024) -#define BNXT_RE_MAX_MRW_COUNT_64K (64 * 1024) -#define BNXT_RE_MAX_MRW_COUNT_256K (256 * 1024) /* Number of MRs to reserve for PF, leaving remainder for VFs */ #define BNXT_RE_RESVD_MR_FOR_PF (32 * 1024) @@ -91,6 +85,15 @@ struct bnxt_re_ring_attr { u8 mode; }; +/* + * Data structure and defines to handle + * recovery + */ +#define BNXT_RE_PRE_RECOVERY_REMOVE 0x1 +#define BNXT_RE_COMPLETE_REMOVE 0x2 +#define BNXT_RE_POST_RECOVERY_INIT 0x4 +#define BNXT_RE_COMPLETE_INIT 0x8 + struct bnxt_re_sqp_entries { struct bnxt_qplib_sge sge; u64 wrid; @@ -107,8 +110,11 @@ struct bnxt_re_gsi_context { struct bnxt_re_sqp_entries *sqp_tbl; }; -#define BNXT_RE_MIN_MSIX 2 -#define BNXT_RE_MAX_MSIX 9 +struct bnxt_re_en_dev_info { + struct bnxt_en_dev *en_dev; + struct bnxt_re_dev *rdev; +}; + #define BNXT_RE_AEQ_IDX 0 #define BNXT_RE_NQ_IDX 1 #define BNXT_RE_GEN_P5_MAX_VF 64 @@ -131,12 +137,36 @@ struct bnxt_re_pacing { #define BNXT_RE_PACING_ALARM_TH_MULTIPLE 2 /* Multiple of pacing algo threshold */ /* Default do_pacing value when there is no congestion */ #define BNXT_RE_DBR_DO_PACING_NO_CONGESTION 0x7F /* 1 in 512 probability */ -#define BNXT_RE_DB_FIFO_ROOM_MASK 0x1FFF8000 -#define BNXT_RE_MAX_FIFO_DEPTH 0x2c00 -#define BNXT_RE_DB_FIFO_ROOM_SHIFT 15 + +#define BNXT_RE_MAX_FIFO_DEPTH_P5 0x2c00 +#define BNXT_RE_MAX_FIFO_DEPTH_P7 0x8000 + +#define BNXT_RE_MAX_FIFO_DEPTH(ctx) \ + (bnxt_qplib_is_chip_gen_p7((ctx)) ? \ + BNXT_RE_MAX_FIFO_DEPTH_P7 :\ + BNXT_RE_MAX_FIFO_DEPTH_P5) + #define BNXT_RE_GRC_FIFO_REG_BASE 0x2000 +#define BNXT_RE_MIN_MSIX 2 +#define BNXT_RE_MAX_MSIX BNXT_MAX_ROCE_MSIX +struct bnxt_re_nq_record { + struct bnxt_msix_entry msix_entries[BNXT_RE_MAX_MSIX]; + struct bnxt_qplib_nq nq[BNXT_RE_MAX_MSIX]; + int num_msix; + /* serialize NQ access */ + struct mutex load_lock; +}; + #define MAX_CQ_HASH_BITS (16) +#define MAX_SRQ_HASH_BITS (16) + +static inline bool bnxt_re_chip_gen_p7(u16 chip_num) +{ + return (chip_num == CHIP_NUM_58818 || + chip_num == CHIP_NUM_57608); +} + struct bnxt_re_dev { struct ib_device ibdev; struct list_head list; @@ -150,31 +180,28 @@ struct bnxt_re_dev { #define BNXT_RE_FLAG_ERR_DEVICE_DETACHED 17 #define BNXT_RE_FLAG_ISSUE_ROCE_STATS 29 struct net_device *netdev; - struct notifier_block nb; + struct auxiliary_device *adev; unsigned int version, major, minor; struct bnxt_qplib_chip_ctx *chip_ctx; struct bnxt_en_dev *en_dev; - int num_msix; int id; struct delayed_work worker; u8 cur_prio_map; - /* FP Notification Queue (CQ & SRQ) */ - struct tasklet_struct nq_task; - /* RCFW Channel */ struct bnxt_qplib_rcfw rcfw; - /* NQ */ - struct bnxt_qplib_nq nq[BNXT_RE_MAX_MSIX]; + /* NQ record */ + struct bnxt_re_nq_record *nqr; /* Device Resources */ - struct bnxt_qplib_dev_attr dev_attr; + struct bnxt_qplib_dev_attr *dev_attr; struct bnxt_qplib_ctx qplib_ctx; struct bnxt_qplib_res qplib_res; struct bnxt_qplib_dpi dpi_privileged; + struct bnxt_qplib_cq_coal_param cq_coalescing; struct mutex qp_lock; /* protect qp list */ struct list_head qp_list; @@ -192,6 +219,12 @@ struct bnxt_re_dev { struct work_struct dbq_fifo_check_work; struct delayed_work dbq_pacing_work; DECLARE_HASHTABLE(cq_hash, MAX_CQ_HASH_BITS); + DECLARE_HASHTABLE(srq_hash, MAX_SRQ_HASH_BITS); + struct dentry *dbg_root; + struct dentry *qp_debugfs; + unsigned long event_bitmap; + struct bnxt_qplib_cc_param cc_param; + struct workqueue_struct *dcb_wq; }; #define to_bnxt_re_dev(ptr, member) \ @@ -212,4 +245,29 @@ static inline struct device *rdev_to_dev(struct bnxt_re_dev *rdev) } extern const struct uapi_definition bnxt_re_uapi_defs[]; + +static inline void bnxt_re_set_pacing_dev_state(struct bnxt_re_dev *rdev) +{ + rdev->qplib_res.pacing_data->dev_err_state = + test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags); +} + +static inline int bnxt_re_read_context_allowed(struct bnxt_re_dev *rdev) +{ + if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx) || + rdev->rcfw.res->cctx->hwrm_intf_ver < HWRM_VERSION_READ_CTX) + return -EOPNOTSUPP; + return 0; +} + +#define BNXT_RE_CONTEXT_TYPE_QPC_SIZE_P5 1088 +#define BNXT_RE_CONTEXT_TYPE_CQ_SIZE_P5 128 +#define BNXT_RE_CONTEXT_TYPE_MRW_SIZE_P5 128 +#define BNXT_RE_CONTEXT_TYPE_SRQ_SIZE_P5 192 + +#define BNXT_RE_CONTEXT_TYPE_QPC_SIZE_P7 1088 +#define BNXT_RE_CONTEXT_TYPE_CQ_SIZE_P7 192 +#define BNXT_RE_CONTEXT_TYPE_MRW_SIZE_P7 192 +#define BNXT_RE_CONTEXT_TYPE_SRQ_SIZE_P7 192 + #endif diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.c b/drivers/infiniband/hw/bnxt_re/debugfs.c new file mode 100644 index 000000000000..7c47039044ef --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/debugfs.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright (c) 2024, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * Description: Debugfs component of the bnxt_re driver + */ + +#include <linux/debugfs.h> +#include <linux/pci.h> +#include <rdma/ib_addr.h> + +#include "bnxt_ulp.h" +#include "roce_hsi.h" +#include "qplib_res.h" +#include "qplib_sp.h" +#include "qplib_fp.h" +#include "qplib_rcfw.h" +#include "bnxt_re.h" +#include "ib_verbs.h" +#include "debugfs.h" + +static struct dentry *bnxt_re_debugfs_root; + +static inline const char *bnxt_re_qp_state_str(u8 state) +{ + switch (state) { + case CMDQ_MODIFY_QP_NEW_STATE_RESET: + return "RST"; + case CMDQ_MODIFY_QP_NEW_STATE_INIT: + return "INIT"; + case CMDQ_MODIFY_QP_NEW_STATE_RTR: + return "RTR"; + case CMDQ_MODIFY_QP_NEW_STATE_RTS: + return "RTS"; + case CMDQ_MODIFY_QP_NEW_STATE_SQE: + return "SQER"; + case CMDQ_MODIFY_QP_NEW_STATE_SQD: + return "SQD"; + case CMDQ_MODIFY_QP_NEW_STATE_ERR: + return "ERR"; + default: + return "Invalid QP state"; + } +} + +static inline const char *bnxt_re_qp_type_str(u8 type) +{ + switch (type) { + case CMDQ_CREATE_QP1_TYPE_GSI: return "QP1"; + case CMDQ_CREATE_QP_TYPE_GSI: return "QP1"; + case CMDQ_CREATE_QP_TYPE_RC: return "RC"; + case CMDQ_CREATE_QP_TYPE_UD: return "UD"; + case CMDQ_CREATE_QP_TYPE_RAW_ETHERTYPE: return "RAW_ETHERTYPE"; + default: return "Invalid transport type"; + } +} + +static ssize_t qp_info_read(struct file *filep, + char __user *buffer, + size_t count, loff_t *ppos) +{ + struct bnxt_re_qp *qp = filep->private_data; + char *buf; + int len; + + if (*ppos) + return 0; + + buf = kasprintf(GFP_KERNEL, + "QPN\t\t: %d\n" + "transport\t: %s\n" + "state\t\t: %s\n" + "mtu\t\t: %d\n" + "timeout\t\t: %d\n" + "remote QPN\t: %d\n", + qp->qplib_qp.id, + bnxt_re_qp_type_str(qp->qplib_qp.type), + bnxt_re_qp_state_str(qp->qplib_qp.state), + qp->qplib_qp.mtu, + qp->qplib_qp.timeout, + qp->qplib_qp.dest_qpn); + if (!buf) + return -ENOMEM; + if (count < strlen(buf)) { + kfree(buf); + return -ENOSPC; + } + len = simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); + kfree(buf); + return len; +} + +static const struct file_operations debugfs_qp_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = qp_info_read, +}; + +void bnxt_re_debug_add_qpinfo(struct bnxt_re_dev *rdev, struct bnxt_re_qp *qp) +{ + char resn[32]; + + sprintf(resn, "0x%x", qp->qplib_qp.id); + qp->dentry = debugfs_create_file(resn, 0400, rdev->qp_debugfs, qp, &debugfs_qp_fops); +} + +void bnxt_re_debug_rem_qpinfo(struct bnxt_re_dev *rdev, struct bnxt_re_qp *qp) +{ + debugfs_remove(qp->dentry); +} + +void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev) +{ + struct pci_dev *pdev = rdev->en_dev->pdev; + + rdev->dbg_root = debugfs_create_dir(dev_name(&pdev->dev), bnxt_re_debugfs_root); + + rdev->qp_debugfs = debugfs_create_dir("QPs", rdev->dbg_root); +} + +void bnxt_re_debugfs_rem_pdev(struct bnxt_re_dev *rdev) +{ + debugfs_remove_recursive(rdev->qp_debugfs); + + debugfs_remove_recursive(rdev->dbg_root); + rdev->dbg_root = NULL; +} + +void bnxt_re_register_debugfs(void) +{ + bnxt_re_debugfs_root = debugfs_create_dir("bnxt_re", NULL); +} + +void bnxt_re_unregister_debugfs(void) +{ + debugfs_remove(bnxt_re_debugfs_root); +} diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.h b/drivers/infiniband/hw/bnxt_re/debugfs.h new file mode 100644 index 000000000000..cd3be0a9ec7e --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/debugfs.h @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright (c) 2024, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * Description: Debugfs header + */ + +#ifndef __BNXT_RE_DEBUGFS__ +#define __BNXT_RE_DEBUGFS__ + +void bnxt_re_debug_add_qpinfo(struct bnxt_re_dev *rdev, struct bnxt_re_qp *qp); +void bnxt_re_debug_rem_qpinfo(struct bnxt_re_dev *rdev, struct bnxt_re_qp *qp); + +void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev); +void bnxt_re_debugfs_rem_pdev(struct bnxt_re_dev *rdev); + +void bnxt_re_register_debugfs(void); +void bnxt_re_unregister_debugfs(void); + +#endif diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.c b/drivers/infiniband/hw/bnxt_re/hw_counters.c index 128651c01595..f039aefcaf67 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.c +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.c @@ -37,18 +37,9 @@ * */ -#include <linux/interrupt.h> #include <linux/types.h> -#include <linux/spinlock.h> -#include <linux/sched.h> -#include <linux/slab.h> #include <linux/pci.h> -#include <linux/prefetch.h> -#include <linux/delay.h> -#include <rdma/ib_addr.h> - -#include "bnxt_ulp.h" #include "roce_hsi.h" #include "qplib_res.h" #include "qplib_sp.h" @@ -357,8 +348,8 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, goto done; } bnxt_re_copy_err_stats(rdev, stats, err_s); - if (_is_ext_stats_supported(rdev->dev_attr.dev_cap_flags) && - !rdev->is_virtfn) { + if (bnxt_ext_stats_supported(rdev->chip_ctx, rdev->dev_attr->dev_cap_flags, + rdev->is_virtfn)) { rc = bnxt_re_get_ext_stat(rdev, stats); if (rc) { clear_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, @@ -366,7 +357,7 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, goto done; } } - if (rdev->pacing.dbr_pacing) + if (rdev->pacing.dbr_pacing && bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) bnxt_re_copy_db_pacing_stats(rdev, stats); } diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index ce9c5bae83bf..6f5db32082dd 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -52,8 +52,6 @@ #include <rdma/uverbs_ioctl.h> #include <linux/hashtable.h> -#include "bnxt_ulp.h" - #include "roce_hsi.h" #include "qplib_res.h" #include "qplib_sp.h" @@ -62,6 +60,7 @@ #include "bnxt_re.h" #include "ib_verbs.h" +#include "debugfs.h" #include <rdma/uverbs_types.h> #include <rdma/uverbs_std_types.h> @@ -94,9 +93,9 @@ static int __from_ib_access_flags(int iflags) return qflags; }; -static enum ib_access_flags __to_ib_access_flags(int qflags) +static int __to_ib_access_flags(int qflags) { - enum ib_access_flags iflags = 0; + int iflags = 0; if (qflags & BNXT_QPLIB_ACCESS_LOCAL_WRITE) iflags |= IB_ACCESS_LOCAL_WRITE; @@ -113,7 +112,57 @@ static enum ib_access_flags __to_ib_access_flags(int qflags) if (qflags & BNXT_QPLIB_ACCESS_ON_DEMAND) iflags |= IB_ACCESS_ON_DEMAND; return iflags; -}; +} + +static u8 __qp_access_flags_from_ib(struct bnxt_qplib_chip_ctx *cctx, int iflags) +{ + u8 qflags = 0; + + if (!bnxt_qplib_is_chip_gen_p5_p7(cctx)) + /* For Wh+ */ + return (u8)__from_ib_access_flags(iflags); + + /* For P5, P7 and later chips */ + if (iflags & IB_ACCESS_LOCAL_WRITE) + qflags |= CMDQ_MODIFY_QP_ACCESS_LOCAL_WRITE; + if (iflags & IB_ACCESS_REMOTE_WRITE) + qflags |= CMDQ_MODIFY_QP_ACCESS_REMOTE_WRITE; + if (iflags & IB_ACCESS_REMOTE_READ) + qflags |= CMDQ_MODIFY_QP_ACCESS_REMOTE_READ; + if (iflags & IB_ACCESS_REMOTE_ATOMIC) + qflags |= CMDQ_MODIFY_QP_ACCESS_REMOTE_ATOMIC; + + return qflags; +} + +static int __qp_access_flags_to_ib(struct bnxt_qplib_chip_ctx *cctx, u8 qflags) +{ + int iflags = 0; + + if (!bnxt_qplib_is_chip_gen_p5_p7(cctx)) + /* For Wh+ */ + return __to_ib_access_flags(qflags); + + /* For P5, P7 and later chips */ + if (qflags & CMDQ_MODIFY_QP_ACCESS_LOCAL_WRITE) + iflags |= IB_ACCESS_LOCAL_WRITE; + if (qflags & CMDQ_MODIFY_QP_ACCESS_REMOTE_WRITE) + iflags |= IB_ACCESS_REMOTE_WRITE; + if (qflags & CMDQ_MODIFY_QP_ACCESS_REMOTE_READ) + iflags |= IB_ACCESS_REMOTE_READ; + if (qflags & CMDQ_MODIFY_QP_ACCESS_REMOTE_ATOMIC) + iflags |= IB_ACCESS_REMOTE_ATOMIC; + + return iflags; +} + +static void bnxt_re_check_and_set_relaxed_ordering(struct bnxt_re_dev *rdev, + struct bnxt_qplib_mrw *qplib_mr) +{ + if (_is_relaxed_ordering_supported(rdev->dev_attr->dev_cap_flags2) && + pcie_relaxed_ordering_enabled(rdev->en_dev->pdev)) + qplib_mr->flags |= CMDQ_REGISTER_MR_FLAGS_ENABLE_RO; +} static int bnxt_re_build_sgl(struct ib_sge *ib_sg_list, struct bnxt_qplib_sge *sg_list, int num) @@ -135,7 +184,7 @@ int bnxt_re_query_device(struct ib_device *ibdev, struct ib_udata *udata) { struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_qplib_dev_attr *dev_attr = rdev->dev_attr; memset(ib_attr, 0, sizeof(*ib_attr)); memcpy(&ib_attr->fw_ver, dev_attr->fw_ver, @@ -148,7 +197,7 @@ int bnxt_re_query_device(struct ib_device *ibdev, ib_attr->vendor_id = rdev->en_dev->pdev->vendor; ib_attr->vendor_part_id = rdev->en_dev->pdev->device; - ib_attr->hw_ver = rdev->en_dev->pdev->subsystem_device; + ib_attr->hw_ver = rdev->en_dev->pdev->revision; ib_attr->max_qp = dev_attr->max_qp; ib_attr->max_qp_wr = dev_attr->max_qp_wqes; ib_attr->device_cap_flags = @@ -203,12 +252,28 @@ int bnxt_re_query_device(struct ib_device *ibdev, return 0; } +int bnxt_re_modify_device(struct ib_device *ibdev, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + ibdev_dbg(ibdev, "Modify device with mask 0x%x", device_modify_mask); + + if (device_modify_mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (!(device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC)) + return 0; + + memcpy(ibdev->node_desc, device_modify->node_desc, IB_DEVICE_NODE_DESC_MAX); + return 0; +} + /* Port */ int bnxt_re_query_port(struct ib_device *ibdev, u32 port_num, struct ib_port_attr *port_attr) { struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_qplib_dev_attr *dev_attr = rdev->dev_attr; int rc; memset(port_attr, 0, sizeof(*port_attr)); @@ -266,8 +331,8 @@ void bnxt_re_query_fw_str(struct ib_device *ibdev, char *str) struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d.%d", - rdev->dev_attr.fw_ver[0], rdev->dev_attr.fw_ver[1], - rdev->dev_attr.fw_ver[2], rdev->dev_attr.fw_ver[3]); + rdev->dev_attr->fw_ver[0], rdev->dev_attr->fw_ver[1], + rdev->dev_attr->fw_ver[2], rdev->dev_attr->fw_ver[3]); } int bnxt_re_query_pkey(struct ib_device *ibdev, u32 port_num, @@ -517,15 +582,19 @@ static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd) mr->rdev = rdev; mr->qplib_mr.pd = &pd->qplib_pd; mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; - mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags); - rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); - if (rc) { - ibdev_err(&rdev->ibdev, "Failed to alloc fence-HW-MR\n"); - goto fail; - } + mr->qplib_mr.access_flags = __from_ib_access_flags(mr_access_flags); + if (!_is_alloc_mr_unified(rdev->dev_attr->dev_cap_flags)) { + rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); + if (rc) { + ibdev_err(&rdev->ibdev, "Failed to alloc fence-HW-MR\n"); + goto fail; + } - /* Register MR */ - mr->ib_mr.lkey = mr->qplib_mr.lkey; + /* Register MR */ + mr->ib_mr.lkey = mr->qplib_mr.lkey; + } else { + mr->qplib_mr.flags = CMDQ_REGISTER_MR_FLAGS_ALLOC_MR; + } mr->qplib_mr.va = (u64)(unsigned long)fence->va; mr->qplib_mr.total_size = BNXT_RE_FENCE_BYTES; rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, NULL, @@ -896,13 +965,13 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) unsigned int flags; int rc; + bnxt_re_debug_rem_qpinfo(rdev, qp); + bnxt_qplib_flush_cqn_wq(&qp->qplib_qp); rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp); - if (rc) { + if (rc) ibdev_err(&rdev->ibdev, "Failed to destroy HW QP"); - return rc; - } if (rdma_is_kernel_res(&qp->ib_qp.res)) { flags = bnxt_re_lock_cqs(qp); @@ -912,11 +981,8 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) bnxt_qplib_free_qp_res(&rdev->qplib_res, &qp->qplib_qp); - if (ib_qp->qp_type == IB_QPT_GSI && rdev->gsi_ctx.gsi_sqp) { - rc = bnxt_re_destroy_gsi_sqp(qp); - if (rc) - return rc; - } + if (ib_qp->qp_type == IB_QPT_GSI && rdev->gsi_ctx.gsi_sqp) + bnxt_re_destroy_gsi_sqp(qp); mutex_lock(&rdev->qp_lock); list_del(&qp->list); @@ -989,48 +1055,42 @@ static int bnxt_re_setup_swqe_size(struct bnxt_re_qp *qp, rdev = qp->rdev; qplqp = &qp->qplib_qp; sq = &qplqp->sq; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; align = sizeof(struct sq_send_hdr); ilsize = ALIGN(init_attr->cap.max_inline_data, align); - sq->wqe_size = bnxt_re_get_wqe_size(ilsize, sq->max_sge); - if (sq->wqe_size > bnxt_re_get_swqe_size(dev_attr->max_qp_sges)) - return -EINVAL; - /* For gen p4 and gen p5 backward compatibility mode - * wqe size is fixed to 128 bytes + /* For gen p4 and gen p5 fixed wqe compatibility mode + * wqe size is fixed to 128 bytes - ie 6 SGEs */ - if (sq->wqe_size < bnxt_re_get_swqe_size(dev_attr->max_qp_sges) && - qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) - sq->wqe_size = bnxt_re_get_swqe_size(dev_attr->max_qp_sges); + if (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) { + sq->wqe_size = bnxt_re_get_swqe_size(BNXT_STATIC_MAX_SGE); + sq->max_sge = BNXT_STATIC_MAX_SGE; + } else { + sq->wqe_size = bnxt_re_get_wqe_size(ilsize, sq->max_sge); + if (sq->wqe_size > bnxt_re_get_swqe_size(dev_attr->max_qp_sges)) + return -EINVAL; + } if (init_attr->cap.max_inline_data) { qplqp->max_inline_data = sq->wqe_size - sizeof(struct sq_send_hdr); init_attr->cap.max_inline_data = qplqp->max_inline_data; - if (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) - sq->max_sge = qplqp->max_inline_data / - sizeof(struct sq_sge); } return 0; } static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, - struct bnxt_re_qp *qp, struct ib_udata *udata) + struct bnxt_re_qp *qp, struct bnxt_re_ucontext *cntx, + struct bnxt_re_qp_req *ureq) { struct bnxt_qplib_qp *qplib_qp; - struct bnxt_re_ucontext *cntx; - struct bnxt_re_qp_req ureq; int bytes = 0, psn_sz; struct ib_umem *umem; int psn_nume; qplib_qp = &qp->qplib_qp; - cntx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, - ib_uctx); - if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) - return -EFAULT; bytes = (qplib_qp->sq.max_wqe * qplib_qp->sq.wqe_size); /* Consider mapping PSN search memory only for RC QPs. */ @@ -1038,15 +1098,20 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, psn_sz = bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx) ? sizeof(struct sq_psn_search_ext) : sizeof(struct sq_psn_search); - psn_nume = (qplib_qp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) ? - qplib_qp->sq.max_wqe : - ((qplib_qp->sq.max_wqe * qplib_qp->sq.wqe_size) / - sizeof(struct bnxt_qplib_sge)); + if (cntx && bnxt_re_is_var_size_supported(rdev, cntx)) { + psn_nume = ureq->sq_slots; + } else { + psn_nume = (qplib_qp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) ? + qplib_qp->sq.max_wqe : ((qplib_qp->sq.max_wqe * qplib_qp->sq.wqe_size) / + sizeof(struct bnxt_qplib_sge)); + } + if (_is_host_msn_table(rdev->qplib_res.dattr->dev_cap_flags2)) + psn_nume = roundup_pow_of_two(psn_nume); bytes += (psn_nume * psn_sz); } bytes = PAGE_ALIGN(bytes); - umem = ib_umem_get(&rdev->ibdev, ureq.qpsva, bytes, + umem = ib_umem_get(&rdev->ibdev, ureq->qpsva, bytes, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(umem)) return PTR_ERR(umem); @@ -1055,12 +1120,12 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, qplib_qp->sq.sg_info.umem = umem; qplib_qp->sq.sg_info.pgsize = PAGE_SIZE; qplib_qp->sq.sg_info.pgshft = PAGE_SHIFT; - qplib_qp->qp_handle = ureq.qp_handle; + qplib_qp->qp_handle = ureq->qp_handle; if (!qp->qplib_qp.srq) { bytes = (qplib_qp->rq.max_wqe * qplib_qp->rq.wqe_size); bytes = PAGE_ALIGN(bytes); - umem = ib_umem_get(&rdev->ibdev, ureq.qprva, bytes, + umem = ib_umem_get(&rdev->ibdev, ureq->qprva, bytes, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(umem)) goto rqfail; @@ -1156,6 +1221,7 @@ static struct bnxt_re_qp *bnxt_re_create_shadow_qp /* Shadow QP SQ depth should be same as QP1 RQ depth */ qp->qplib_qp.sq.wqe_size = bnxt_re_get_wqe_size(0, 6); qp->qplib_qp.sq.max_wqe = qp1_qp->rq.max_wqe; + qp->qplib_qp.sq.max_sw_wqe = qp1_qp->rq.max_wqe; qp->qplib_qp.sq.max_sge = 2; /* Q full delta can be 1 since it is internal QP */ qp->qplib_qp.sq.q_full_delta = 1; @@ -1167,6 +1233,7 @@ static struct bnxt_re_qp *bnxt_re_create_shadow_qp qp->qplib_qp.rq.wqe_size = bnxt_re_get_rwqe_size(6); qp->qplib_qp.rq.max_wqe = qp1_qp->rq.max_wqe; + qp->qplib_qp.rq.max_sw_wqe = qp1_qp->rq.max_wqe; qp->qplib_qp.rq.max_sge = qp1_qp->rq.max_sge; /* Q full delta can be 1 since it is internal QP */ qp->qplib_qp.rq.q_full_delta = 1; @@ -1208,7 +1275,7 @@ static int bnxt_re_init_rq_attr(struct bnxt_re_qp *qp, rdev = qp->rdev; qplqp = &qp->qplib_qp; rq = &qplqp->rq; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; if (init_attr->srq) { struct bnxt_re_srq *srq; @@ -1228,6 +1295,7 @@ static int bnxt_re_init_rq_attr(struct bnxt_re_qp *qp, */ entries = bnxt_re_init_depth(init_attr->cap.max_recv_wr + 1, uctx); rq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + 1); + rq->max_sw_wqe = rq->max_wqe; rq->q_full_delta = 0; rq->sg_info.pgsize = PAGE_SIZE; rq->sg_info.pgshft = PAGE_SHIFT; @@ -1244,7 +1312,7 @@ static void bnxt_re_adjust_gsi_rq_attr(struct bnxt_re_qp *qp) rdev = qp->rdev; qplqp = &qp->qplib_qp; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) { qplqp->rq.max_sge = dev_attr->max_qp_sges; @@ -1256,37 +1324,49 @@ static void bnxt_re_adjust_gsi_rq_attr(struct bnxt_re_qp *qp) static int bnxt_re_init_sq_attr(struct bnxt_re_qp *qp, struct ib_qp_init_attr *init_attr, - struct bnxt_re_ucontext *uctx) + struct bnxt_re_ucontext *uctx, + struct bnxt_re_qp_req *ureq) { struct bnxt_qplib_dev_attr *dev_attr; struct bnxt_qplib_qp *qplqp; struct bnxt_re_dev *rdev; struct bnxt_qplib_q *sq; + int diff = 0; int entries; - int diff; int rc; rdev = qp->rdev; qplqp = &qp->qplib_qp; sq = &qplqp->sq; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; sq->max_sge = init_attr->cap.max_send_sge; - if (sq->max_sge > dev_attr->max_qp_sges) { - sq->max_sge = dev_attr->max_qp_sges; - init_attr->cap.max_send_sge = sq->max_sge; - } + entries = init_attr->cap.max_send_wr; + if (uctx && qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) { + sq->max_wqe = ureq->sq_slots; + sq->max_sw_wqe = ureq->sq_slots; + sq->wqe_size = sizeof(struct sq_sge); + } else { + if (sq->max_sge > dev_attr->max_qp_sges) { + sq->max_sge = dev_attr->max_qp_sges; + init_attr->cap.max_send_sge = sq->max_sge; + } - rc = bnxt_re_setup_swqe_size(qp, init_attr); - if (rc) - return rc; + rc = bnxt_re_setup_swqe_size(qp, init_attr); + if (rc) + return rc; - entries = init_attr->cap.max_send_wr; - /* Allocate 128 + 1 more than what's provided */ - diff = (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) ? - 0 : BNXT_QPLIB_RESERVED_QP_WRS; - entries = bnxt_re_init_depth(entries + diff + 1, uctx); - sq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + diff + 1); + /* Allocate 128 + 1 more than what's provided */ + diff = (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) ? + 0 : BNXT_QPLIB_RESERVED_QP_WRS; + entries = bnxt_re_init_depth(entries + diff + 1, uctx); + sq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + diff + 1); + if (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) + sq->max_sw_wqe = bnxt_qplib_get_depth(sq, qplqp->wqe_mode, true); + else + sq->max_sw_wqe = sq->max_wqe; + + } sq->q_full_delta = diff + 1; /* * Reserving one slot for Phantom WQE. Application can @@ -1311,7 +1391,7 @@ static void bnxt_re_adjust_gsi_sq_attr(struct bnxt_re_qp *qp, rdev = qp->rdev; qplqp = &qp->qplib_qp; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) { entries = bnxt_re_init_depth(init_attr->cap.max_send_wr + 1, uctx); @@ -1349,10 +1429,10 @@ out: static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd, struct ib_qp_init_attr *init_attr, - struct ib_udata *udata) + struct bnxt_re_ucontext *uctx, + struct bnxt_re_qp_req *ureq) { struct bnxt_qplib_dev_attr *dev_attr; - struct bnxt_re_ucontext *uctx; struct bnxt_qplib_qp *qplqp; struct bnxt_re_dev *rdev; struct bnxt_re_cq *cq; @@ -1360,9 +1440,8 @@ static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd, rdev = qp->rdev; qplqp = &qp->qplib_qp; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; - uctx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, ib_uctx); /* Setup misc params */ ether_addr_copy(qplqp->smac, rdev->netdev->dev_addr); qplqp->pd = &pd->qplib_pd; @@ -1375,8 +1454,7 @@ static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd, goto out; } qplqp->type = (u8)qptype; - qplqp->wqe_mode = rdev->chip_ctx->modes.wqe_mode; - + qplqp->wqe_mode = bnxt_re_is_var_size_supported(rdev, uctx); if (init_attr->qp_type == IB_QPT_RC) { qplqp->max_rd_atomic = dev_attr->max_qp_rd_atom; qplqp->max_dest_rd_atomic = dev_attr->max_qp_init_rd_atom; @@ -1411,14 +1489,14 @@ static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd, bnxt_re_adjust_gsi_rq_attr(qp); /* Setup SQ */ - rc = bnxt_re_init_sq_attr(qp, init_attr, uctx); + rc = bnxt_re_init_sq_attr(qp, init_attr, uctx, ureq); if (rc) goto out; if (init_attr->qp_type == IB_QPT_GSI) bnxt_re_adjust_gsi_sq_attr(qp, init_attr, uctx); - if (udata) /* This will update DPI and qp_handle */ - rc = bnxt_re_init_user_qp(rdev, pd, qp, udata); + if (uctx) /* This will update DPI and qp_handle */ + rc = bnxt_re_init_user_qp(rdev, pd, qp, uctx, ureq); out: return rc; } @@ -1519,14 +1597,27 @@ static bool bnxt_re_test_qp_limits(struct bnxt_re_dev *rdev, int bnxt_re_create_qp(struct ib_qp *ib_qp, struct ib_qp_init_attr *qp_init_attr, struct ib_udata *udata) { - struct ib_pd *ib_pd = ib_qp->pd; - struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); - struct bnxt_re_dev *rdev = pd->rdev; - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; - struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + struct bnxt_qplib_dev_attr *dev_attr; + struct bnxt_re_ucontext *uctx; + struct bnxt_re_qp_req ureq; + struct bnxt_re_dev *rdev; + struct bnxt_re_pd *pd; + struct bnxt_re_qp *qp; + struct ib_pd *ib_pd; u32 active_qps; int rc; + ib_pd = ib_qp->pd; + pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + rdev = pd->rdev; + dev_attr = rdev->dev_attr; + qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + + uctx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, ib_uctx); + if (udata) + if (ib_copy_from_udata(&ureq, udata, min(udata->inlen, sizeof(ureq)))) + return -EFAULT; + rc = bnxt_re_test_qp_limits(rdev, qp_init_attr, dev_attr); if (!rc) { rc = -EINVAL; @@ -1534,7 +1625,7 @@ int bnxt_re_create_qp(struct ib_qp *ib_qp, struct ib_qp_init_attr *qp_init_attr, } qp->rdev = rdev; - rc = bnxt_re_init_qp_attr(qp, pd, qp_init_attr, udata); + rc = bnxt_re_init_qp_attr(qp, pd, qp_init_attr, uctx, &ureq); if (rc) goto fail; @@ -1585,6 +1676,7 @@ int bnxt_re_create_qp(struct ib_qp *ib_qp, struct ib_qp_init_attr *qp_init_attr, if (active_qps > rdev->stats.res.ud_qp_watermark) rdev->stats.res.ud_qp_watermark = active_qps; } + bnxt_re_debug_add_qpinfo(rdev, qp); return 0; qp_destroy: @@ -1685,6 +1777,10 @@ int bnxt_re_destroy_srq(struct ib_srq *ib_srq, struct ib_udata *udata) if (qplib_srq->cq) nq = qplib_srq->cq->nq; + if (rdev->chip_ctx->modes.toggle_bits & BNXT_QPLIB_SRQ_TOGGLE_BIT) { + free_page((unsigned long)srq->uctx_srq_page); + hash_del(&srq->hash_entry); + } bnxt_qplib_destroy_srq(&rdev->qplib_res, qplib_srq); ib_umem_release(srq->umem); atomic_dec(&rdev->stats.res.srq_count); @@ -1742,7 +1838,7 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq, ib_pd = ib_srq->pd; pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); rdev = pd->rdev; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; srq = container_of(ib_srq, struct bnxt_re_srq, ib_srq); if (srq_init_attr->attr.max_wr >= dev_attr->max_srq_wqes) { @@ -1773,8 +1869,10 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq, srq->qplib_srq.wqe_size = bnxt_re_get_rwqe_size(dev_attr->max_srq_sges); srq->qplib_srq.threshold = srq_init_attr->attr.srq_limit; srq->srq_limit = srq_init_attr->attr.srq_limit; - srq->qplib_srq.eventq_hw_ring_id = rdev->nq[0].ring_id; - nq = &rdev->nq[0]; + srq->qplib_srq.eventq_hw_ring_id = rdev->nqr->nq[0].ring_id; + srq->qplib_srq.sg_info.pgsize = PAGE_SIZE; + srq->qplib_srq.sg_info.pgshft = PAGE_SHIFT; + nq = &rdev->nqr->nq[0]; if (udata) { rc = bnxt_re_init_user_srq(rdev, pd, srq, udata); @@ -1789,9 +1887,18 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq, } if (udata) { - struct bnxt_re_srq_resp resp; + struct bnxt_re_srq_resp resp = {}; resp.srqid = srq->qplib_srq.id; + if (rdev->chip_ctx->modes.toggle_bits & BNXT_QPLIB_SRQ_TOGGLE_BIT) { + hash_add(rdev->srq_hash, &srq->hash_entry, srq->qplib_srq.id); + srq->uctx_srq_page = (void *)get_zeroed_page(GFP_KERNEL); + if (!srq->uctx_srq_page) { + rc = -ENOMEM; + goto fail; + } + resp.comp_mask |= BNXT_RE_SRQ_TOGGLE_PAGE_SUPPORT; + } rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); if (rc) { ibdev_err(&rdev->ibdev, "SRQ copy to udata failed!"); @@ -1937,7 +2044,7 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, { struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); struct bnxt_re_dev *rdev = qp->rdev; - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_qplib_dev_attr *dev_attr = rdev->dev_attr; enum ib_qp_state curr_qp_state, new_qp_state; int rc, entries; unsigned int flags; @@ -1991,12 +2098,10 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, if (qp_attr_mask & IB_QP_ACCESS_FLAGS) { qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_ACCESS; qp->qplib_qp.access = - __from_ib_access_flags(qp_attr->qp_access_flags); + __qp_access_flags_from_ib(qp->qplib_qp.cctx, + qp_attr->qp_access_flags); /* LOCAL_WRITE access must be set to allow RC receive */ - qp->qplib_qp.access |= BNXT_QPLIB_ACCESS_LOCAL_WRITE; - /* Temp: Set all params on QP as of now */ - qp->qplib_qp.access |= CMDQ_MODIFY_QP_ACCESS_REMOTE_WRITE; - qp->qplib_qp.access |= CMDQ_MODIFY_QP_ACCESS_REMOTE_READ; + qp->qplib_qp.access |= CMDQ_MODIFY_QP_ACCESS_LOCAL_WRITE; } if (qp_attr_mask & IB_QP_PKEY_INDEX) { qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PKEY; @@ -2030,7 +2135,7 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, qp->qplib_qp.ah.sgid_index = ctx->idx; qp->qplib_qp.ah.host_sgid_index = grh->sgid_index; qp->qplib_qp.ah.hop_limit = grh->hop_limit; - qp->qplib_qp.ah.traffic_class = grh->traffic_class; + qp->qplib_qp.ah.traffic_class = grh->traffic_class >> 2; qp->qplib_qp.ah.sl = rdma_ah_get_sl(&qp_attr->ah_attr); ether_addr_copy(qp->qplib_qp.ah.dmac, qp_attr->ah_attr.roce.dmac); @@ -2057,18 +2162,20 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, } } - if (qp_attr_mask & IB_QP_PATH_MTU) { - qp->qplib_qp.modify_flags |= - CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; - qp->qplib_qp.path_mtu = __from_ib_mtu(qp_attr->path_mtu); - qp->qplib_qp.mtu = ib_mtu_enum_to_int(qp_attr->path_mtu); - } else if (qp_attr->qp_state == IB_QPS_RTR) { - qp->qplib_qp.modify_flags |= - CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; - qp->qplib_qp.path_mtu = - __from_ib_mtu(iboe_get_mtu(rdev->netdev->mtu)); - qp->qplib_qp.mtu = - ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu)); + if (qp_attr->qp_state == IB_QPS_RTR) { + enum ib_mtu qpmtu; + + qpmtu = iboe_get_mtu(rdev->netdev->mtu); + if (qp_attr_mask & IB_QP_PATH_MTU) { + if (ib_mtu_enum_to_int(qp_attr->path_mtu) > + ib_mtu_enum_to_int(qpmtu)) + return -EINVAL; + qpmtu = qp_attr->path_mtu; + } + + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; + qp->qplib_qp.path_mtu = __from_ib_mtu(qpmtu); + qp->qplib_qp.mtu = ib_mtu_enum_to_int(qpmtu); } if (qp_attr_mask & IB_QP_TIMEOUT) { @@ -2155,6 +2262,7 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, entries = bnxt_re_init_depth(qp_attr->cap.max_recv_wr, uctx); qp->qplib_qp.rq.max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + 1); + qp->qplib_qp.rq.max_sw_wqe = qp->qplib_qp.rq.max_wqe; qp->qplib_qp.rq.q_full_delta = qp->qplib_qp.rq.max_wqe - qp_attr->cap.max_recv_wr; qp->qplib_qp.rq.max_sge = qp_attr->cap.max_recv_sge; @@ -2200,7 +2308,8 @@ int bnxt_re_query_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, qp_attr->qp_state = __to_ib_qp_state(qplib_qp->state); qp_attr->cur_qp_state = __to_ib_qp_state(qplib_qp->cur_qp_state); qp_attr->en_sqd_async_notify = qplib_qp->en_sqd_async_notify ? 1 : 0; - qp_attr->qp_access_flags = __to_ib_access_flags(qplib_qp->access); + qp_attr->qp_access_flags = __qp_access_flags_to_ib(qp->qplib_qp.cctx, + qplib_qp->access); qp_attr->pkey_index = qplib_qp->pkey_index; qp_attr->qkey = qplib_qp->qkey; qp_attr->ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; @@ -2216,6 +2325,7 @@ int bnxt_re_query_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, qp_attr->retry_cnt = qplib_qp->retry_cnt; qp_attr->rnr_retry = qplib_qp->rnr_retry; qp_attr->min_rnr_timer = qplib_qp->min_rnr_timer; + qp_attr->port_num = __to_ib_port_num(qplib_qp->port_id); qp_attr->rq_psn = qplib_qp->rq.psn; qp_attr->max_rd_atomic = qplib_qp->max_rd_atomic; qp_attr->sq_psn = qplib_qp->sq.psn; @@ -2479,7 +2589,7 @@ static int bnxt_re_build_send_wqe(struct bnxt_re_qp *qp, break; case IB_WR_SEND_WITH_IMM: wqe->type = BNXT_QPLIB_SWQE_TYPE_SEND_WITH_IMM; - wqe->send.imm_data = wr->ex.imm_data; + wqe->send.imm_data = be32_to_cpu(wr->ex.imm_data); break; case IB_WR_SEND_WITH_INV: wqe->type = BNXT_QPLIB_SWQE_TYPE_SEND_WITH_INV; @@ -2509,7 +2619,7 @@ static int bnxt_re_build_rdma_wqe(const struct ib_send_wr *wr, break; case IB_WR_RDMA_WRITE_WITH_IMM: wqe->type = BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE_WITH_IMM; - wqe->rdma.imm_data = wr->ex.imm_data; + wqe->rdma.imm_data = be32_to_cpu(wr->ex.imm_data); break; case IB_WR_RDMA_READ: wqe->type = BNXT_QPLIB_SWQE_TYPE_RDMA_READ; @@ -2712,7 +2822,8 @@ bad: wr = wr->next; } bnxt_qplib_post_send_db(&qp->qplib_qp); - bnxt_ud_qp_hw_stall_workaround(qp); + if (!bnxt_qplib_is_chip_gen_p5_p7(qp->rdev->chip_ctx)) + bnxt_ud_qp_hw_stall_workaround(qp); spin_unlock_irqrestore(&qp->sq_lock, flags); return rc; } @@ -2824,7 +2935,8 @@ bad: wr = wr->next; } bnxt_qplib_post_send_db(&qp->qplib_qp); - bnxt_ud_qp_hw_stall_workaround(qp); + if (!bnxt_qplib_is_chip_gen_p5_p7(qp->rdev->chip_ctx)) + bnxt_ud_qp_hw_stall_workaround(qp); spin_unlock_irqrestore(&qp->sq_lock, flags); return rc; @@ -2921,6 +3033,28 @@ int bnxt_re_post_recv(struct ib_qp *ib_qp, const struct ib_recv_wr *wr, return rc; } +static struct bnxt_qplib_nq *bnxt_re_get_nq(struct bnxt_re_dev *rdev) +{ + int min, indx; + + mutex_lock(&rdev->nqr->load_lock); + for (indx = 0, min = 0; indx < (rdev->nqr->num_msix - 1); indx++) { + if (rdev->nqr->nq[min].load > rdev->nqr->nq[indx].load) + min = indx; + } + rdev->nqr->nq[min].load++; + mutex_unlock(&rdev->nqr->load_lock); + + return &rdev->nqr->nq[min]; +} + +static void bnxt_re_put_nq(struct bnxt_re_dev *rdev, struct bnxt_qplib_nq *nq) +{ + mutex_lock(&rdev->nqr->load_lock); + nq->load--; + mutex_unlock(&rdev->nqr->load_lock); +} + /* Completion Queues */ int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) { @@ -2939,6 +3073,8 @@ int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) hash_del(&cq->hash_entry); } bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq); + + bnxt_re_put_nq(rdev, nq); ib_umem_release(cq->umem); atomic_dec(&rdev->stats.res.cq_count); @@ -2948,16 +3084,15 @@ int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) } int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { struct bnxt_re_cq *cq = container_of(ibcq, struct bnxt_re_cq, ib_cq); struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibcq->device, ibdev); + struct ib_udata *udata = &attrs->driver_udata; struct bnxt_re_ucontext *uctx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, ib_uctx); - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_qplib_dev_attr *dev_attr = rdev->dev_attr; struct bnxt_qplib_chip_ctx *cctx; - struct bnxt_qplib_nq *nq = NULL; - unsigned int nq_alloc_cnt; int cqe = attr->cqe; int rc, entries; u32 active_cqs; @@ -3008,15 +3143,10 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, cq->qplib_cq.dpi = &rdev->dpi_privileged; } - /* - * Allocating the NQ in a round robin fashion. nq_alloc_cnt is a - * used for getting the NQ index. - */ - nq_alloc_cnt = atomic_inc_return(&rdev->nq_alloc_cnt); - nq = &rdev->nq[nq_alloc_cnt % (rdev->num_msix - 1)]; cq->qplib_cq.max_wqe = entries; - cq->qplib_cq.cnq_hw_ring_id = nq->ring_id; - cq->qplib_cq.nq = nq; + cq->qplib_cq.coalescing = &rdev->cq_coalescing; + cq->qplib_cq.nq = bnxt_re_get_nq(rdev); + cq->qplib_cq.cnq_hw_ring_id = cq->qplib_cq.nq->ring_id; rc = bnxt_qplib_create_cq(&rdev->qplib_res, &cq->qplib_cq); if (rc) { @@ -3026,7 +3156,6 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, cq->ib_cq.cqe = entries; cq->cq_period = cq->qplib_cq.period; - nq->budget++; active_cqs = atomic_inc_return(&rdev->stats.res.cq_count); if (active_cqs > rdev->stats.res.cq_watermark) @@ -3097,7 +3226,7 @@ int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) cq = container_of(ibcq, struct bnxt_re_cq, ib_cq); rdev = cq->rdev; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; if (!ibcq->uobject) { ibdev_err(&rdev->ibdev, "Kernel CQ Resize not supported"); return -EOPNOTSUPP; @@ -3581,7 +3710,7 @@ static void bnxt_re_process_res_shadow_qp_wc(struct bnxt_re_qp *gsi_sqp, wc->byte_len = orig_cqe->length; wc->qp = &gsi_qp->ib_qp; - wc->ex.imm_data = orig_cqe->immdata; + wc->ex.imm_data = cpu_to_be32(orig_cqe->immdata); wc->src_qp = orig_cqe->src_qp; memcpy(wc->smac, orig_cqe->smac, ETH_ALEN); if (bnxt_re_is_vlan_pkt(orig_cqe, &vlan_id, &sl)) { @@ -3726,7 +3855,10 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc) (unsigned long)(cqe->qp_handle), struct bnxt_re_qp, qplib_qp); wc->qp = &qp->ib_qp; - wc->ex.imm_data = cqe->immdata; + if (cqe->flags & CQ_RES_RC_FLAGS_IMM) + wc->ex.imm_data = cpu_to_be32(cqe->immdata); + else + wc->ex.invalidate_rkey = cqe->invrkey; wc->src_qp = cqe->src_qp; memcpy(wc->smac, cqe->smac, ETH_ALEN); wc->port_num = 1; @@ -3844,9 +3976,12 @@ struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *ib_pd, int mr_access_flags) mr->rdev = rdev; mr->qplib_mr.pd = &pd->qplib_pd; - mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags); + mr->qplib_mr.access_flags = __from_ib_access_flags(mr_access_flags); mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; + if (mr_access_flags & IB_ACCESS_RELAXED_ORDERING) + bnxt_re_check_and_set_relaxed_ordering(rdev, &mr->qplib_mr); + /* Allocate and register 0 as the address */ rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); if (rc) @@ -3944,7 +4079,7 @@ struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type type, mr->rdev = rdev; mr->qplib_mr.pd = &pd->qplib_pd; - mr->qplib_mr.flags = BNXT_QPLIB_FR_PMR; + mr->qplib_mr.access_flags = BNXT_QPLIB_FR_PMR; mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); @@ -4061,21 +4196,28 @@ static struct ib_mr *__bnxt_re_user_reg_mr(struct ib_pd *ib_pd, u64 length, u64 mr->rdev = rdev; mr->qplib_mr.pd = &pd->qplib_pd; - mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags); + mr->qplib_mr.access_flags = __from_ib_access_flags(mr_access_flags); mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_MR; - rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); - if (rc) { - ibdev_err(&rdev->ibdev, "Failed to allocate MR rc = %d", rc); - rc = -EIO; - goto free_mr; + if (!_is_alloc_mr_unified(rdev->dev_attr->dev_cap_flags)) { + rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); + if (rc) { + ibdev_err(&rdev->ibdev, "Failed to allocate MR rc = %d", rc); + rc = -EIO; + goto free_mr; + } + /* The fixed portion of the rkey is the same as the lkey */ + mr->ib_mr.rkey = mr->qplib_mr.rkey; + } else { + mr->qplib_mr.flags = CMDQ_REGISTER_MR_FLAGS_ALLOC_MR; } - /* The fixed portion of the rkey is the same as the lkey */ - mr->ib_mr.rkey = mr->qplib_mr.rkey; mr->ib_umem = umem; mr->qplib_mr.va = virt_addr; mr->qplib_mr.total_size = length; + if (mr_access_flags & IB_ACCESS_RELAXED_ORDERING) + bnxt_re_check_and_set_relaxed_ordering(rdev, &mr->qplib_mr); + umem_pgs = ib_umem_num_dma_blocks(umem, page_size); rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, umem, umem_pgs, page_size); @@ -4121,7 +4263,8 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, struct ib_mr *bnxt_re_reg_user_mr_dmabuf(struct ib_pd *ib_pd, u64 start, u64 length, u64 virt_addr, int fd, - int mr_access_flags, struct ib_udata *udata) + int mr_access_flags, + struct uverbs_attr_bundle *attrs) { struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); struct bnxt_re_dev *rdev = pd->rdev; @@ -4148,7 +4291,7 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) struct bnxt_re_ucontext *uctx = container_of(ctx, struct bnxt_re_ucontext, ib_uctx); struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_qplib_dev_attr *dev_attr = rdev->dev_attr; struct bnxt_re_user_mmap_entry *entry; struct bnxt_re_uctx_resp resp = {}; struct bnxt_re_uctx_req ureq = {}; @@ -4186,9 +4329,6 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) resp.cqe_sz = sizeof(struct cq_base); resp.max_cqd = dev_attr->max_cq_wqes; - resp.comp_mask |= BNXT_RE_UCNTX_CMASK_HAVE_MODE; - resp.mode = rdev->chip_ctx->modes.wqe_mode; - if (rdev->chip_ctx->modes.db_push) resp.comp_mask |= BNXT_RE_UCNTX_CMASK_WC_DPI_ENABLED; @@ -4201,13 +4341,22 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) if (rdev->pacing.dbr_pacing) resp.comp_mask |= BNXT_RE_UCNTX_CMASK_DBR_PACING_ENABLED; + if (_is_host_msn_table(rdev->qplib_res.dattr->dev_cap_flags2)) + resp.comp_mask |= BNXT_RE_UCNTX_CMASK_MSN_TABLE_ENABLED; + if (udata->inlen >= sizeof(ureq)) { rc = ib_copy_from_udata(&ureq, udata, min(udata->inlen, sizeof(ureq))); if (rc) goto cfail; if (ureq.comp_mask & BNXT_RE_COMP_MASK_REQ_UCNTX_POW2_SUPPORT) { resp.comp_mask |= BNXT_RE_UCNTX_CMASK_POW2_DISABLED; - uctx->cmask |= BNXT_RE_UCNTX_CMASK_POW2_DISABLED; + uctx->cmask |= BNXT_RE_UCNTX_CAP_POW2_DISABLED; + } + if (ureq.comp_mask & BNXT_RE_COMP_MASK_REQ_UCNTX_VAR_WQE_SUPPORT) { + resp.comp_mask |= BNXT_RE_UCNTX_CMASK_HAVE_MODE; + resp.mode = rdev->chip_ctx->modes.wqe_mode; + if (resp.mode == BNXT_QPLIB_WQE_MODE_VARIABLE) + uctx->cmask |= BNXT_RE_UCNTX_CAP_VAR_WQE_ENABLED; } } @@ -4261,6 +4410,19 @@ static struct bnxt_re_cq *bnxt_re_search_for_cq(struct bnxt_re_dev *rdev, u32 cq return cq; } +static struct bnxt_re_srq *bnxt_re_search_for_srq(struct bnxt_re_dev *rdev, u32 srq_id) +{ + struct bnxt_re_srq *srq = NULL, *tmp_srq; + + hash_for_each_possible(rdev->srq_hash, tmp_srq, hash_entry, srq_id) { + if (tmp_srq->qplib_srq.id == srq_id) { + srq = tmp_srq; + break; + } + } + return srq; +} + /* Helper function to mmap the virtual memory from user app */ int bnxt_re_mmap(struct ib_ucontext *ib_uctx, struct vm_area_struct *vma) { @@ -4305,9 +4467,10 @@ int bnxt_re_mmap(struct ib_ucontext *ib_uctx, struct vm_area_struct *vma) case BNXT_RE_MMAP_TOGGLE_PAGE: /* Driver doesn't expect write access for user space */ if (vma->vm_flags & VM_WRITE) - return -EFAULT; - ret = vm_insert_page(vma, vma->vm_start, - virt_to_page((void *)bnxt_entry->mem_offset)); + ret = -EFAULT; + else + ret = vm_insert_page(vma, vma->vm_start, + virt_to_page((void *)bnxt_entry->mem_offset)); break; default: ret = -EINVAL; @@ -4489,12 +4652,13 @@ static int UVERBS_HANDLER(BNXT_RE_METHOD_GET_TOGGLE_MEM)(struct uverbs_attr_bund struct bnxt_re_ucontext *uctx; struct ib_ucontext *ib_uctx; struct bnxt_re_dev *rdev; + struct bnxt_re_srq *srq; + u32 length = PAGE_SIZE; struct bnxt_re_cq *cq; u64 mem_offset; + u32 offset = 0; u64 addr = 0; - u32 length; - u32 offset; - u32 cq_id; + u32 res_id; int err; ib_uctx = ib_uverbs_get_ucontext(attrs); @@ -4507,23 +4671,24 @@ static int UVERBS_HANDLER(BNXT_RE_METHOD_GET_TOGGLE_MEM)(struct uverbs_attr_bund uctx = container_of(ib_uctx, struct bnxt_re_ucontext, ib_uctx); rdev = uctx->rdev; + err = uverbs_copy_from(&res_id, attrs, BNXT_RE_TOGGLE_MEM_RES_ID); + if (err) + return err; switch (res_type) { case BNXT_RE_CQ_TOGGLE_MEM: - err = uverbs_copy_from(&cq_id, attrs, BNXT_RE_TOGGLE_MEM_RES_ID); - if (err) - return err; - - cq = bnxt_re_search_for_cq(rdev, cq_id); + cq = bnxt_re_search_for_cq(rdev, res_id); if (!cq) return -EINVAL; - length = PAGE_SIZE; addr = (u64)cq->uctx_cq_page; - mmap_flag = BNXT_RE_MMAP_TOGGLE_PAGE; - offset = 0; break; case BNXT_RE_SRQ_TOGGLE_MEM: + srq = bnxt_re_search_for_srq(rdev, res_id); + if (!srq) + return -EINVAL; + + addr = (u64)srq->uctx_srq_page; break; default: diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index b267d6d5975f..fbb16a411d6a 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -77,6 +77,8 @@ struct bnxt_re_srq { struct bnxt_qplib_srq qplib_srq; struct ib_umem *umem; spinlock_t lock; /* protect srq */ + void *uctx_srq_page; + struct hlist_node hash_entry; }; struct bnxt_re_qp { @@ -93,6 +95,7 @@ struct bnxt_re_qp { struct ib_ud_header qp1_hdr; struct bnxt_re_cq *scq; struct bnxt_re_cq *rcq; + struct dentry *dentry; }; struct bnxt_re_cq { @@ -171,15 +174,32 @@ static inline u16 bnxt_re_get_rwqe_size(int nsge) return sizeof(struct rq_wqe_hdr) + (nsge * sizeof(struct sq_sge)); } +enum { + BNXT_RE_UCNTX_CAP_POW2_DISABLED = 0x1ULL, + BNXT_RE_UCNTX_CAP_VAR_WQE_ENABLED = 0x2ULL, +}; + static inline u32 bnxt_re_init_depth(u32 ent, struct bnxt_re_ucontext *uctx) { - return uctx ? (uctx->cmask & BNXT_RE_UCNTX_CMASK_POW2_DISABLED) ? + return uctx ? (uctx->cmask & BNXT_RE_UCNTX_CAP_POW2_DISABLED) ? ent : roundup_pow_of_two(ent) : ent; } +static inline bool bnxt_re_is_var_size_supported(struct bnxt_re_dev *rdev, + struct bnxt_re_ucontext *uctx) +{ + if (uctx) + return uctx->cmask & BNXT_RE_UCNTX_CAP_VAR_WQE_ENABLED; + else + return rdev->chip_ctx->modes.wqe_mode; +} + int bnxt_re_query_device(struct ib_device *ibdev, struct ib_device_attr *ib_attr, struct ib_udata *udata); +int bnxt_re_modify_device(struct ib_device *ibdev, + int device_modify_mask, + struct ib_device_modify *device_modify); int bnxt_re_query_port(struct ib_device *ibdev, u32 port_num, struct ib_port_attr *port_attr); int bnxt_re_get_port_immutable(struct ib_device *ibdev, u32 port_num, @@ -221,7 +241,7 @@ int bnxt_re_post_send(struct ib_qp *qp, const struct ib_send_wr *send_wr, int bnxt_re_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr); int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); int bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int bnxt_re_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc); @@ -242,12 +262,16 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct ib_mr *bnxt_re_reg_user_mr_dmabuf(struct ib_pd *ib_pd, u64 start, u64 length, u64 virt_addr, int fd, int mr_access_flags, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata); void bnxt_re_dealloc_ucontext(struct ib_ucontext *context); int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); void bnxt_re_mmap_free(struct rdma_user_mmap_entry *rdma_entry); +static inline u32 __to_ib_port_num(u16 port_id) +{ + return (u32)port_id + 1; +} unsigned long bnxt_re_lock_cqs(struct bnxt_re_qp *qp); void bnxt_re_unlock_cqs(struct bnxt_re_qp *qp, unsigned long flags); diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 54b4d2f3a5d8..4659a2f73364 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -67,6 +67,7 @@ #include <rdma/bnxt_re-abi.h> #include "bnxt.h" #include "hw_counters.h" +#include "debugfs.h" static char version[] = BNXT_RE_DESC "\n"; @@ -78,16 +79,12 @@ MODULE_LICENSE("Dual BSD/GPL"); /* globals */ static DEFINE_MUTEX(bnxt_re_mutex); -static void bnxt_re_stop_irq(void *handle); -static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev); -static int bnxt_re_netdev_event(struct notifier_block *notifier, - unsigned long event, void *ptr); -static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev); -static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev); static int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev); static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len, u32 *offset); +static void bnxt_re_dispatch_event(struct ib_device *ibdev, struct ib_qp *qp, + u8 port_num, enum ib_event_type event); static void bnxt_re_set_db_offset(struct bnxt_re_dev *rdev) { struct bnxt_qplib_chip_ctx *cctx; @@ -129,18 +126,20 @@ static void bnxt_re_set_db_offset(struct bnxt_re_dev *rdev) } } -static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev, u8 mode) +static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev) { struct bnxt_qplib_chip_ctx *cctx; cctx = rdev->chip_ctx; - cctx->modes.wqe_mode = bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx) ? - mode : BNXT_QPLIB_WQE_MODE_STATIC; + cctx->modes.wqe_mode = bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx) ? + BNXT_QPLIB_WQE_MODE_VARIABLE : BNXT_QPLIB_WQE_MODE_STATIC; if (bnxt_re_hwrm_qcaps(rdev)) dev_err(rdev_to_dev(rdev), "Failed to query hwrm qcaps\n"); - if (bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx)) + if (bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx)) { cctx->modes.toggle_bits |= BNXT_QPLIB_CQ_TOGGLE_BIT; + cctx->modes.toggle_bits |= BNXT_QPLIB_SRQ_TOGGLE_BIT; + } } static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev) @@ -149,6 +148,10 @@ static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev) if (!rdev->chip_ctx) return; + + kfree(rdev->dev_attr); + rdev->dev_attr = NULL; + chip_ctx = rdev->chip_ctx; rdev->chip_ctx = NULL; rdev->rcfw.res = NULL; @@ -158,14 +161,15 @@ static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev) kfree(chip_ctx); } -static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev, u8 wqe_mode) +static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev) { struct bnxt_qplib_chip_ctx *chip_ctx; struct bnxt_en_dev *en_dev; - int rc; + int rc = -ENOMEM; en_dev = rdev->en_dev; + rdev->qplib_res.pdev = en_dev->pdev; chip_ctx = kzalloc(sizeof(*chip_ctx), GFP_KERNEL); if (!chip_ctx) return -ENOMEM; @@ -177,20 +181,31 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev, u8 wqe_mode) rdev->qplib_res.cctx = rdev->chip_ctx; rdev->rcfw.res = &rdev->qplib_res; - rdev->qplib_res.dattr = &rdev->dev_attr; + rdev->dev_attr = kzalloc(sizeof(*rdev->dev_attr), GFP_KERNEL); + if (!rdev->dev_attr) + goto free_chip_ctx; + rdev->qplib_res.dattr = rdev->dev_attr; rdev->qplib_res.is_vf = BNXT_EN_VF(en_dev); + rdev->qplib_res.en_dev = en_dev; - bnxt_re_set_drv_mode(rdev, wqe_mode); + bnxt_re_set_drv_mode(rdev); bnxt_re_set_db_offset(rdev); rc = bnxt_qplib_map_db_bar(&rdev->qplib_res); if (rc) - return rc; + goto free_dev_attr; if (bnxt_qplib_determine_atomics(en_dev->pdev)) ibdev_info(&rdev->ibdev, "platform doesn't support global atomics."); return 0; +free_dev_attr: + kfree(rdev->dev_attr); + rdev->dev_attr = NULL; +free_chip_ctx: + kfree(rdev->chip_ctx); + rdev->chip_ctx = NULL; + return rc; } /* SR-IOV helper functions */ @@ -212,7 +227,7 @@ static void bnxt_re_limit_pf_res(struct bnxt_re_dev *rdev) struct bnxt_qplib_ctx *ctx; int i; - attr = &rdev->dev_attr; + attr = rdev->dev_attr; ctx = &rdev->qplib_ctx; ctx->qpc_count = min_t(u32, BNXT_RE_MAX_QPC_COUNT, @@ -226,7 +241,7 @@ static void bnxt_re_limit_pf_res(struct bnxt_re_dev *rdev) if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) for (i = 0; i < MAX_TQM_ALLOC_REQ; i++) rdev->qplib_ctx.tqm_ctx.qcount[i] = - rdev->dev_attr.tqm_alloc_reqs[i]; + rdev->dev_attr->tqm_alloc_reqs[i]; } static void bnxt_re_limit_vf_res(struct bnxt_qplib_ctx *qplib_ctx, u32 num_vf) @@ -280,33 +295,163 @@ static void bnxt_re_set_resource_limits(struct bnxt_re_dev *rdev) static void bnxt_re_vf_res_config(struct bnxt_re_dev *rdev) { + /* + * Use the total VF count since the actual VF count may not be + * available at this point. + */ rdev->num_vfs = pci_sriov_get_totalvfs(rdev->en_dev->pdev); - if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) { - bnxt_re_set_resource_limits(rdev); - bnxt_qplib_set_func_resources(&rdev->qplib_res, &rdev->rcfw, - &rdev->qplib_ctx); + if (!rdev->num_vfs) + return; + + bnxt_re_set_resource_limits(rdev); + bnxt_qplib_set_func_resources(&rdev->qplib_res, &rdev->rcfw, + &rdev->qplib_ctx); +} + +struct bnxt_re_dcb_work { + struct work_struct work; + struct bnxt_re_dev *rdev; + struct hwrm_async_event_cmpl cmpl; +}; + +static bool bnxt_re_is_qp1_qp(struct bnxt_re_qp *qp) +{ + return qp->ib_qp.qp_type == IB_QPT_GSI; +} + +static struct bnxt_re_qp *bnxt_re_get_qp1_qp(struct bnxt_re_dev *rdev) +{ + struct bnxt_re_qp *qp; + + mutex_lock(&rdev->qp_lock); + list_for_each_entry(qp, &rdev->qp_list, list) { + if (bnxt_re_is_qp1_qp(qp)) { + mutex_unlock(&rdev->qp_lock); + return qp; + } } + mutex_unlock(&rdev->qp_lock); + return NULL; } -static void bnxt_re_shutdown(struct auxiliary_device *adev) +static int bnxt_re_update_qp1_tos_dscp(struct bnxt_re_dev *rdev) +{ + struct bnxt_re_qp *qp; + + if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) + return 0; + + qp = bnxt_re_get_qp1_qp(rdev); + if (!qp) + return 0; + + qp->qplib_qp.modify_flags = CMDQ_MODIFY_QP_MODIFY_MASK_TOS_DSCP; + qp->qplib_qp.tos_dscp = rdev->cc_param.qp1_tos_dscp; + + return bnxt_qplib_modify_qp(&rdev->qplib_res, &qp->qplib_qp); +} + +static void bnxt_re_init_dcb_wq(struct bnxt_re_dev *rdev) +{ + rdev->dcb_wq = create_singlethread_workqueue("bnxt_re_dcb_wq"); +} + +static void bnxt_re_uninit_dcb_wq(struct bnxt_re_dev *rdev) +{ + if (!rdev->dcb_wq) + return; + destroy_workqueue(rdev->dcb_wq); +} + +static void bnxt_re_dcb_wq_task(struct work_struct *work) +{ + struct bnxt_re_dcb_work *dcb_work = + container_of(work, struct bnxt_re_dcb_work, work); + struct bnxt_re_dev *rdev = dcb_work->rdev; + struct bnxt_qplib_cc_param *cc_param; + int rc; + + if (!rdev) + goto free_dcb; + + cc_param = &rdev->cc_param; + rc = bnxt_qplib_query_cc_param(&rdev->qplib_res, cc_param); + if (rc) { + ibdev_dbg(&rdev->ibdev, "Failed to query ccparam rc:%d", rc); + goto free_dcb; + } + if (cc_param->qp1_tos_dscp != cc_param->tos_dscp) { + cc_param->qp1_tos_dscp = cc_param->tos_dscp; + rc = bnxt_re_update_qp1_tos_dscp(rdev); + if (rc) { + ibdev_dbg(&rdev->ibdev, "%s: Failed to modify QP1 rc:%d", + __func__, rc); + goto free_dcb; + } + } + +free_dcb: + kfree(dcb_work); +} + +static void bnxt_re_async_notifier(void *handle, struct hwrm_async_event_cmpl *cmpl) { - struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev); + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(handle); + struct bnxt_re_dcb_work *dcb_work; + struct bnxt_re_dev *rdev; + u32 data1, data2; + u16 event_id; + rdev = en_info->rdev; if (!rdev) return; - ib_unregister_device(&rdev->ibdev); - bnxt_re_dev_uninit(rdev); + + event_id = le16_to_cpu(cmpl->event_id); + data1 = le32_to_cpu(cmpl->event_data1); + data2 = le32_to_cpu(cmpl->event_data2); + + ibdev_dbg(&rdev->ibdev, "Async event_id = %d data1 = %d data2 = %d", + event_id, data1, data2); + + switch (event_id) { + case ASYNC_EVENT_CMPL_EVENT_ID_DCB_CONFIG_CHANGE: + dcb_work = kzalloc(sizeof(*dcb_work), GFP_ATOMIC); + if (!dcb_work) + break; + + dcb_work->rdev = rdev; + memcpy(&dcb_work->cmpl, cmpl, sizeof(*cmpl)); + INIT_WORK(&dcb_work->work, bnxt_re_dcb_wq_task); + queue_work(rdev->dcb_wq, &dcb_work->work); + break; + default: + break; + } } -static void bnxt_re_stop_irq(void *handle) +static void bnxt_re_stop_irq(void *handle, bool reset) { - struct bnxt_re_dev *rdev = (struct bnxt_re_dev *)handle; - struct bnxt_qplib_rcfw *rcfw = &rdev->rcfw; + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(handle); + struct bnxt_qplib_rcfw *rcfw; + struct bnxt_re_dev *rdev; struct bnxt_qplib_nq *nq; int indx; - for (indx = BNXT_RE_NQ_IDX; indx < rdev->num_msix; indx++) { - nq = &rdev->nq[indx - 1]; + rdev = en_info->rdev; + if (!rdev) + return; + rcfw = &rdev->rcfw; + + if (reset) { + set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags); + set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags); + wake_up_all(&rdev->rcfw.cmdq.waitq); + bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, + IB_EVENT_DEVICE_FATAL); + } + + for (indx = BNXT_RE_NQ_IDX; indx < rdev->nqr->num_msix; indx++) { + nq = &rdev->nqr->nq[indx - 1]; bnxt_qplib_nq_stop_irq(nq, false); } @@ -315,12 +460,18 @@ static void bnxt_re_stop_irq(void *handle) static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) { - struct bnxt_re_dev *rdev = (struct bnxt_re_dev *)handle; - struct bnxt_msix_entry *msix_ent = rdev->en_dev->msix_entries; - struct bnxt_qplib_rcfw *rcfw = &rdev->rcfw; + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(handle); + struct bnxt_msix_entry *msix_ent; + struct bnxt_qplib_rcfw *rcfw; + struct bnxt_re_dev *rdev; struct bnxt_qplib_nq *nq; int indx, rc; + rdev = en_info->rdev; + if (!rdev) + return; + msix_ent = rdev->nqr->msix_entries; + rcfw = &rdev->rcfw; if (!ent) { /* Not setting the f/w timeout bit in rcfw. * During the driver unload the first command @@ -334,8 +485,8 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) /* Vectors may change after restart, so update with new vectors * in device sctructure. */ - for (indx = 0; indx < rdev->num_msix; indx++) - rdev->en_dev->msix_entries[indx].vector = ent[indx].vector; + for (indx = 0; indx < rdev->nqr->num_msix; indx++) + rdev->nqr->msix_entries[indx].vector = ent[indx].vector; rc = bnxt_qplib_rcfw_start_irq(rcfw, msix_ent[BNXT_RE_AEQ_IDX].vector, false); @@ -343,8 +494,8 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) ibdev_warn(&rdev->ibdev, "Failed to reinit CREQ\n"); return; } - for (indx = BNXT_RE_NQ_IDX ; indx < rdev->num_msix; indx++) { - nq = &rdev->nq[indx - 1]; + for (indx = BNXT_RE_NQ_IDX ; indx < rdev->nqr->num_msix; indx++) { + nq = &rdev->nqr->nq[indx - 1]; rc = bnxt_qplib_nq_start_irq(nq, indx - 1, msix_ent[indx].vector, false); if (rc) { @@ -356,6 +507,7 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) } static struct bnxt_ulp_ops bnxt_re_ulp_ops = { + .ulp_async_notifier = bnxt_re_async_notifier, .ulp_irq_stop = bnxt_re_stop_irq, .ulp_irq_restart = bnxt_re_start_irq }; @@ -365,14 +517,9 @@ static struct bnxt_ulp_ops bnxt_re_ulp_ops = { static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev) { struct bnxt_en_dev *en_dev; - int rc; en_dev = rdev->en_dev; - - rc = bnxt_register_dev(en_dev, &bnxt_re_ulp_ops, rdev); - if (!rc) - rdev->qplib_res.pdev = rdev->en_dev->pdev; - return rc; + return bnxt_register_dev(en_dev, &bnxt_re_ulp_ops, rdev->adev); } static void bnxt_re_init_hwrm_hdr(struct input *hdr, u16 opcd) @@ -423,6 +570,7 @@ int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev) struct hwrm_func_qcaps_input req = {}; struct bnxt_qplib_chip_ctx *cctx; struct bnxt_fw_msg fw_msg = {}; + u32 flags_ext2; int rc; cctx = rdev->chip_ctx; @@ -436,14 +584,15 @@ int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev) return rc; cctx->modes.db_push = le32_to_cpu(resp.flags) & FUNC_QCAPS_RESP_FLAGS_WCB_PUSH_MODE; - cctx->modes.dbr_pacing = - le32_to_cpu(resp.flags_ext2) & - FUNC_QCAPS_RESP_FLAGS_EXT2_DBR_PACING_EXT_SUPPORTED; + flags_ext2 = le32_to_cpu(resp.flags_ext2); + cctx->modes.dbr_pacing = flags_ext2 & FUNC_QCAPS_RESP_FLAGS_EXT2_DBR_PACING_EXT_SUPPORTED || + flags_ext2 & FUNC_QCAPS_RESP_FLAGS_EXT2_DBR_PACING_V0_SUPPORTED; return 0; } static int bnxt_re_hwrm_dbr_pacing_qcfg(struct bnxt_re_dev *rdev) { + struct bnxt_qplib_db_pacing_data *pacing_data = rdev->qplib_res.pacing_data; struct hwrm_func_dbr_pacing_qcfg_output resp = {}; struct hwrm_func_dbr_pacing_qcfg_input req = {}; struct bnxt_en_dev *en_dev = rdev->en_dev; @@ -465,6 +614,13 @@ static int bnxt_re_hwrm_dbr_pacing_qcfg(struct bnxt_re_dev *rdev) cctx->dbr_stat_db_fifo = le32_to_cpu(resp.dbr_stat_db_fifo_reg) & ~FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SPACE_MASK; + + pacing_data->fifo_max_depth = le32_to_cpu(resp.dbr_stat_db_max_fifo_depth); + if (!pacing_data->fifo_max_depth) + pacing_data->fifo_max_depth = BNXT_RE_MAX_FIFO_DEPTH(cctx); + pacing_data->fifo_room_mask = le32_to_cpu(resp.dbr_stat_db_fifo_reg_fifo_room_mask); + pacing_data->fifo_room_shift = resp.dbr_stat_db_fifo_reg_fifo_room_shift; + return 0; } @@ -479,24 +635,55 @@ static void bnxt_re_set_default_pacing_data(struct bnxt_re_dev *rdev) pacing_data->pacing_th * BNXT_RE_PACING_ALARM_TH_MULTIPLE; } -static void __wait_for_fifo_occupancy_below_th(struct bnxt_re_dev *rdev) +static u32 __get_fifo_occupancy(struct bnxt_re_dev *rdev) { + struct bnxt_qplib_db_pacing_data *pacing_data = rdev->qplib_res.pacing_data; u32 read_val, fifo_occup; + read_val = readl(rdev->en_dev->bar0 + rdev->pacing.dbr_db_fifo_reg_off); + fifo_occup = pacing_data->fifo_max_depth - + ((read_val & pacing_data->fifo_room_mask) >> + pacing_data->fifo_room_shift); + return fifo_occup; +} + +static bool is_dbr_fifo_full(struct bnxt_re_dev *rdev) +{ + u32 max_occup, fifo_occup; + + fifo_occup = __get_fifo_occupancy(rdev); + max_occup = BNXT_RE_MAX_FIFO_DEPTH(rdev->chip_ctx) - 1; + if (fifo_occup == max_occup) + return true; + + return false; +} + +static void __wait_for_fifo_occupancy_below_th(struct bnxt_re_dev *rdev) +{ + struct bnxt_qplib_db_pacing_data *pacing_data = rdev->qplib_res.pacing_data; + u32 retry_fifo_check = 1000; + u32 fifo_occup; + /* loop shouldn't run infintely as the occupancy usually goes * below pacing algo threshold as soon as pacing kicks in. */ while (1) { - read_val = readl(rdev->en_dev->bar0 + rdev->pacing.dbr_db_fifo_reg_off); - fifo_occup = BNXT_RE_MAX_FIFO_DEPTH - - ((read_val & BNXT_RE_DB_FIFO_ROOM_MASK) >> - BNXT_RE_DB_FIFO_ROOM_SHIFT); + fifo_occup = __get_fifo_occupancy(rdev); /* Fifo occupancy cannot be greater the MAX FIFO depth */ - if (fifo_occup > BNXT_RE_MAX_FIFO_DEPTH) + if (fifo_occup > pacing_data->fifo_max_depth) break; - if (fifo_occup < rdev->qplib_res.pacing_data->pacing_th) + if (fifo_occup < pacing_data->pacing_th) + break; + if (!retry_fifo_check--) { + dev_info_once(rdev_to_dev(rdev), + "%s: fifo_occup = 0x%xfifo_max_depth = 0x%x pacing_th = 0x%x\n", + __func__, fifo_occup, pacing_data->fifo_max_depth, + pacing_data->pacing_th); break; + } + } } @@ -546,16 +733,13 @@ static void bnxt_re_pacing_timer_exp(struct work_struct *work) struct bnxt_re_dev *rdev = container_of(work, struct bnxt_re_dev, dbq_pacing_work.work); struct bnxt_qplib_db_pacing_data *pacing_data; - u32 read_val, fifo_occup; + u32 fifo_occup; if (!mutex_trylock(&rdev->pacing.dbq_lock)) return; pacing_data = rdev->qplib_res.pacing_data; - read_val = readl(rdev->en_dev->bar0 + rdev->pacing.dbr_db_fifo_reg_off); - fifo_occup = BNXT_RE_MAX_FIFO_DEPTH - - ((read_val & BNXT_RE_DB_FIFO_ROOM_MASK) >> - BNXT_RE_DB_FIFO_ROOM_SHIFT); + fifo_occup = __get_fifo_occupancy(rdev); if (fifo_occup > pacing_data->pacing_th) goto restart_timer; @@ -594,7 +778,7 @@ void bnxt_re_pacing_alert(struct bnxt_re_dev *rdev) * Increase the alarm_th to max so that other user lib instances do not * keep alerting the driver. */ - pacing_data->alarm_th = BNXT_RE_MAX_FIFO_DEPTH; + pacing_data->alarm_th = pacing_data->fifo_max_depth; pacing_data->do_pacing = BNXT_RE_MAX_DBR_DO_PACING; cancel_work_sync(&rdev->dbq_fifo_check_work); schedule_work(&rdev->dbq_fifo_check_work); @@ -603,9 +787,6 @@ void bnxt_re_pacing_alert(struct bnxt_re_dev *rdev) static int bnxt_re_initialize_dbr_pacing(struct bnxt_re_dev *rdev) { - if (bnxt_re_hwrm_dbr_pacing_qcfg(rdev)) - return -EIO; - /* Allocate a page for app use */ rdev->pacing.dbr_page = (void *)__get_free_page(GFP_KERNEL); if (!rdev->pacing.dbr_page) @@ -614,6 +795,12 @@ static int bnxt_re_initialize_dbr_pacing(struct bnxt_re_dev *rdev) memset((u8 *)rdev->pacing.dbr_page, 0, PAGE_SIZE); rdev->qplib_res.pacing_data = (struct bnxt_qplib_db_pacing_data *)rdev->pacing.dbr_page; + if (bnxt_re_hwrm_dbr_pacing_qcfg(rdev)) { + free_page((u64)rdev->pacing.dbr_page); + rdev->pacing.dbr_page = NULL; + return -EIO; + } + /* MAP HW window 2 for reading db fifo depth */ writel(rdev->chip_ctx->dbr_stat_db_fifo & BNXT_GRC_BASE_MASK, rdev->en_dev->bar0 + BNXT_GRCPF_REG_WINDOW_BASE_OUT + 4); @@ -623,13 +810,16 @@ static int bnxt_re_initialize_dbr_pacing(struct bnxt_re_dev *rdev) rdev->pacing.dbr_bar_addr = pci_resource_start(rdev->qplib_res.pdev, 0) + rdev->pacing.dbr_db_fifo_reg_off; + if (is_dbr_fifo_full(rdev)) { + free_page((u64)rdev->pacing.dbr_page); + rdev->pacing.dbr_page = NULL; + return -EIO; + } + rdev->pacing.pacing_algo_th = BNXT_RE_PACING_ALGO_THRESHOLD; rdev->pacing.dbq_pacing_time = BNXT_RE_DBR_PACING_TIME; rdev->pacing.dbr_def_do_pacing = BNXT_RE_DBR_DO_PACING_NO_CONGESTION; rdev->pacing.do_pacing_save = rdev->pacing.dbr_def_do_pacing; - rdev->qplib_res.pacing_data->fifo_max_depth = BNXT_RE_MAX_FIFO_DEPTH; - rdev->qplib_res.pacing_data->fifo_room_mask = BNXT_RE_DB_FIFO_ROOM_MASK; - rdev->qplib_res.pacing_data->fifo_room_shift = BNXT_RE_DB_FIFO_ROOM_SHIFT; rdev->qplib_res.pacing_data->grc_reg_offset = rdev->pacing.dbr_db_fifo_reg_off; bnxt_re_set_default_pacing_data(rdev); /* Initialize worker for DBR Pacing */ @@ -779,17 +969,6 @@ static void bnxt_re_disassociate_ucontext(struct ib_ucontext *ibcontext) } /* Device */ - -static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev) -{ - struct ib_device *ibdev = - ib_device_get_by_netdev(netdev, RDMA_DRIVER_BNXT_RE); - if (!ibdev) - return NULL; - - return container_of(ibdev, struct bnxt_re_dev, ibdev); -} - static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { @@ -820,6 +999,253 @@ static const struct attribute_group bnxt_re_dev_attr_group = { .attrs = bnxt_re_attributes, }; +static int bnxt_re_fill_res_mr_entry(struct sk_buff *msg, struct ib_mr *ib_mr) +{ + struct bnxt_qplib_hwq *mr_hwq; + struct nlattr *table_attr; + struct bnxt_re_mr *mr; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + return -EMSGSIZE; + + mr = container_of(ib_mr, struct bnxt_re_mr, ib_mr); + mr_hwq = &mr->qplib_mr.hwq; + + if (rdma_nl_put_driver_u32(msg, "page_size", + mr_hwq->qe_ppg * mr_hwq->element_size)) + goto err; + if (rdma_nl_put_driver_u32(msg, "max_elements", mr_hwq->max_elements)) + goto err; + if (rdma_nl_put_driver_u32(msg, "element_size", mr_hwq->element_size)) + goto err; + if (rdma_nl_put_driver_u64_hex(msg, "hwq", (unsigned long)mr_hwq)) + goto err; + if (rdma_nl_put_driver_u64_hex(msg, "va", mr->qplib_mr.va)) + goto err; + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int bnxt_re_fill_res_mr_entry_raw(struct sk_buff *msg, struct ib_mr *ib_mr) +{ + struct bnxt_re_dev *rdev; + struct bnxt_re_mr *mr; + int err, len; + void *data; + + mr = container_of(ib_mr, struct bnxt_re_mr, ib_mr); + rdev = mr->rdev; + + err = bnxt_re_read_context_allowed(rdev); + if (err) + return err; + + len = bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx) ? BNXT_RE_CONTEXT_TYPE_MRW_SIZE_P7 : + BNXT_RE_CONTEXT_TYPE_MRW_SIZE_P5; + data = kzalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + err = bnxt_qplib_read_context(&rdev->rcfw, CMDQ_READ_CONTEXT_TYPE_MRW, + mr->qplib_mr.lkey, len, data); + if (!err) + err = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, len, data); + + kfree(data); + return err; +} + +static int bnxt_re_fill_res_cq_entry(struct sk_buff *msg, struct ib_cq *ib_cq) +{ + struct bnxt_qplib_hwq *cq_hwq; + struct nlattr *table_attr; + struct bnxt_re_cq *cq; + + cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq); + cq_hwq = &cq->qplib_cq.hwq; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + return -EMSGSIZE; + + if (rdma_nl_put_driver_u32(msg, "cq_depth", cq_hwq->depth)) + goto err; + if (rdma_nl_put_driver_u32(msg, "max_elements", cq_hwq->max_elements)) + goto err; + if (rdma_nl_put_driver_u32(msg, "element_size", cq_hwq->element_size)) + goto err; + if (rdma_nl_put_driver_u32(msg, "max_wqe", cq->qplib_cq.max_wqe)) + goto err; + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int bnxt_re_fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ib_cq) +{ + struct bnxt_re_dev *rdev; + struct bnxt_re_cq *cq; + int err, len; + void *data; + + cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq); + rdev = cq->rdev; + + err = bnxt_re_read_context_allowed(rdev); + if (err) + return err; + + len = bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx) ? BNXT_RE_CONTEXT_TYPE_CQ_SIZE_P7 : + BNXT_RE_CONTEXT_TYPE_CQ_SIZE_P5; + data = kzalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + err = bnxt_qplib_read_context(&rdev->rcfw, + CMDQ_READ_CONTEXT_TYPE_CQ, + cq->qplib_cq.id, len, data); + if (!err) + err = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, len, data); + + kfree(data); + return err; +} + +static int bnxt_re_fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ib_qp) +{ + struct bnxt_qplib_qp *qplib_qp; + struct nlattr *table_attr; + struct bnxt_re_qp *qp; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + return -EMSGSIZE; + + qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + qplib_qp = &qp->qplib_qp; + + if (rdma_nl_put_driver_u32(msg, "sq_max_wqe", qplib_qp->sq.max_wqe)) + goto err; + if (rdma_nl_put_driver_u32(msg, "sq_max_sge", qplib_qp->sq.max_sge)) + goto err; + if (rdma_nl_put_driver_u32(msg, "sq_wqe_size", qplib_qp->sq.wqe_size)) + goto err; + if (rdma_nl_put_driver_u32(msg, "sq_swq_start", qplib_qp->sq.swq_start)) + goto err; + if (rdma_nl_put_driver_u32(msg, "sq_swq_last", qplib_qp->sq.swq_last)) + goto err; + if (rdma_nl_put_driver_u32(msg, "rq_max_wqe", qplib_qp->rq.max_wqe)) + goto err; + if (rdma_nl_put_driver_u32(msg, "rq_max_sge", qplib_qp->rq.max_sge)) + goto err; + if (rdma_nl_put_driver_u32(msg, "rq_wqe_size", qplib_qp->rq.wqe_size)) + goto err; + if (rdma_nl_put_driver_u32(msg, "rq_swq_start", qplib_qp->rq.swq_start)) + goto err; + if (rdma_nl_put_driver_u32(msg, "rq_swq_last", qplib_qp->rq.swq_last)) + goto err; + if (rdma_nl_put_driver_u32(msg, "timeout", qplib_qp->timeout)) + goto err; + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int bnxt_re_fill_res_qp_entry_raw(struct sk_buff *msg, struct ib_qp *ibqp) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibqp->device, ibdev); + int err, len; + void *data; + + err = bnxt_re_read_context_allowed(rdev); + if (err) + return err; + + len = bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx) ? BNXT_RE_CONTEXT_TYPE_QPC_SIZE_P7 : + BNXT_RE_CONTEXT_TYPE_QPC_SIZE_P5; + data = kzalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + err = bnxt_qplib_read_context(&rdev->rcfw, CMDQ_READ_CONTEXT_TYPE_QPC, + ibqp->qp_num, len, data); + if (!err) + err = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, len, data); + + kfree(data); + return err; +} + +static int bnxt_re_fill_res_srq_entry(struct sk_buff *msg, struct ib_srq *ib_srq) +{ + struct nlattr *table_attr; + struct bnxt_re_srq *srq; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + return -EMSGSIZE; + + srq = container_of(ib_srq, struct bnxt_re_srq, ib_srq); + + if (rdma_nl_put_driver_u32_hex(msg, "wqe_size", srq->qplib_srq.wqe_size)) + goto err; + if (rdma_nl_put_driver_u32_hex(msg, "max_wqe", srq->qplib_srq.max_wqe)) + goto err; + if (rdma_nl_put_driver_u32_hex(msg, "max_sge", srq->qplib_srq.max_sge)) + goto err; + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int bnxt_re_fill_res_srq_entry_raw(struct sk_buff *msg, struct ib_srq *ib_srq) +{ + struct bnxt_re_dev *rdev; + struct bnxt_re_srq *srq; + int err, len; + void *data; + + srq = container_of(ib_srq, struct bnxt_re_srq, ib_srq); + rdev = srq->rdev; + + err = bnxt_re_read_context_allowed(rdev); + if (err) + return err; + + len = bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx) ? BNXT_RE_CONTEXT_TYPE_SRQ_SIZE_P7 : + BNXT_RE_CONTEXT_TYPE_SRQ_SIZE_P5; + + data = kzalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + err = bnxt_qplib_read_context(&rdev->rcfw, CMDQ_READ_CONTEXT_TYPE_SRQ, + srq->qplib_srq.id, len, data); + if (!err) + err = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, len, data); + + kfree(data); + return err; +} + static const struct ib_device_ops bnxt_re_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_BNXT_RE, @@ -861,6 +1287,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .post_srq_recv = bnxt_re_post_srq_recv, .query_ah = bnxt_re_query_ah, .query_device = bnxt_re_query_device, + .modify_device = bnxt_re_modify_device, .query_pkey = bnxt_re_query_pkey, .query_port = bnxt_re_query_port, .query_qp = bnxt_re_query_qp, @@ -877,6 +1304,17 @@ static const struct ib_device_ops bnxt_re_dev_ops = { INIT_RDMA_OBJ_SIZE(ib_ucontext, bnxt_re_ucontext, ib_uctx), }; +static const struct ib_device_ops restrack_ops = { + .fill_res_cq_entry = bnxt_re_fill_res_cq_entry, + .fill_res_cq_entry_raw = bnxt_re_fill_res_cq_entry_raw, + .fill_res_qp_entry = bnxt_re_fill_res_qp_entry, + .fill_res_qp_entry_raw = bnxt_re_fill_res_qp_entry_raw, + .fill_res_mr_entry = bnxt_re_fill_res_mr_entry, + .fill_res_mr_entry_raw = bnxt_re_fill_res_mr_entry_raw, + .fill_res_srq_entry = bnxt_re_fill_res_srq_entry, + .fill_res_srq_entry_raw = bnxt_re_fill_res_srq_entry_raw, +}; + static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) { struct ib_device *ibdev = &rdev->ibdev; @@ -890,7 +1328,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) addrconf_addr_eui48((u8 *)&ibdev->node_guid, rdev->netdev->dev_addr); - ibdev->num_comp_vectors = rdev->num_msix - 1; + ibdev->num_comp_vectors = rdev->nqr->num_msix - 1; ibdev->dev.parent = &rdev->en_dev->pdev->dev; ibdev->local_dma_lkey = BNXT_QPLIB_RSVD_LKEY; @@ -898,6 +1336,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) ibdev->driver_def = bnxt_re_uapi_defs; ib_set_device_ops(ibdev, &bnxt_re_dev_ops); + ib_set_device_ops(ibdev, &restrack_ops); ret = ib_device_set_netdev(&rdev->ibdev, rdev->netdev, 1); if (ret) return ret; @@ -907,7 +1346,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) return ib_register_device(ibdev, "bnxt_re%d", &rdev->en_dev->pdev->dev); } -static struct bnxt_re_dev *bnxt_re_dev_add(struct bnxt_aux_priv *aux_priv, +static struct bnxt_re_dev *bnxt_re_dev_add(struct auxiliary_device *adev, struct bnxt_en_dev *en_dev) { struct bnxt_re_dev *rdev; @@ -920,9 +1359,9 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct bnxt_aux_priv *aux_priv, return NULL; } /* Default values */ - rdev->nb.notifier_call = NULL; rdev->netdev = en_dev->net; rdev->en_dev = en_dev; + rdev->adev = adev; rdev->id = rdev->en_dev->pdev->devfn; INIT_LIST_HEAD(&rdev->qp_list); mutex_init(&rdev->qp_lock); @@ -936,6 +1375,15 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct bnxt_aux_priv *aux_priv, atomic_set(&rdev->stats.res.pd_count, 0); rdev->cosq[0] = 0xFFFF; rdev->cosq[1] = 0xFFFF; + rdev->cq_coalescing.buf_maxtime = BNXT_QPLIB_CQ_COAL_DEF_BUF_MAXTIME; + if (bnxt_re_chip_gen_p7(en_dev->chip_num)) { + rdev->cq_coalescing.normal_maxbuf = BNXT_QPLIB_CQ_COAL_DEF_NORMAL_MAXBUF_P7; + rdev->cq_coalescing.during_maxbuf = BNXT_QPLIB_CQ_COAL_DEF_DURING_MAXBUF_P7; + } else { + rdev->cq_coalescing.normal_maxbuf = BNXT_QPLIB_CQ_COAL_DEF_NORMAL_MAXBUF_P5; + rdev->cq_coalescing.during_maxbuf = BNXT_QPLIB_CQ_COAL_DEF_DURING_MAXBUF_P5; + } + rdev->cq_coalescing.en_ring_idle_mode = BNXT_QPLIB_CQ_COAL_DEF_EN_RING_IDLE_MODE; return rdev; } @@ -975,12 +1423,15 @@ static int bnxt_re_handle_unaffi_async_event(struct creq_func_event static int bnxt_re_handle_qp_async_event(struct creq_qp_event *qp_event, struct bnxt_re_qp *qp) { - struct bnxt_re_srq *srq = container_of(qp->qplib_qp.srq, struct bnxt_re_srq, - qplib_srq); struct creq_qp_error_notification *err_event; + struct bnxt_re_srq *srq = NULL; struct ib_event event = {}; unsigned int flags; + if (qp->qplib_qp.srq) + srq = container_of(qp->qplib_qp.srq, struct bnxt_re_srq, + qplib_srq); + if (qp->qplib_qp.state == CMDQ_MODIFY_QP_NEW_STATE_ERR && rdma_is_kernel_res(&qp->ib_qp.res)) { flags = bnxt_re_lock_cqs(qp); @@ -1208,15 +1659,9 @@ static int bnxt_re_cqn_handler(struct bnxt_qplib_nq *nq, { struct bnxt_re_cq *cq = container_of(handle, struct bnxt_re_cq, qplib_cq); - u32 *cq_ptr; - if (cq->ib_cq.comp_handler) { - if (cq->uctx_cq_page) { - cq_ptr = (u32 *)cq->uctx_cq_page; - *cq_ptr = cq->qplib_cq.toggle; - } + if (cq->ib_cq.comp_handler) (*cq->ib_cq.comp_handler)(&cq->ib_cq, cq->ib_cq.cq_context); - } return 0; } @@ -1225,8 +1670,8 @@ static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev) { int i; - for (i = 1; i < rdev->num_msix; i++) - bnxt_qplib_disable_nq(&rdev->nq[i - 1]); + for (i = 1; i < rdev->nqr->num_msix; i++) + bnxt_qplib_disable_nq(&rdev->nqr->nq[i - 1]); if (rdev->qplib_res.rcfw) bnxt_qplib_cleanup_res(&rdev->qplib_res); @@ -1240,10 +1685,12 @@ static int bnxt_re_init_res(struct bnxt_re_dev *rdev) bnxt_qplib_init_res(&rdev->qplib_res); - for (i = 1; i < rdev->num_msix ; i++) { - db_offt = rdev->en_dev->msix_entries[i].db_offset; - rc = bnxt_qplib_enable_nq(rdev->en_dev->pdev, &rdev->nq[i - 1], - i - 1, rdev->en_dev->msix_entries[i].vector, + mutex_init(&rdev->nqr->load_lock); + + for (i = 1; i < rdev->nqr->num_msix ; i++) { + db_offt = rdev->nqr->msix_entries[i].db_offset; + rc = bnxt_qplib_enable_nq(rdev->en_dev->pdev, &rdev->nqr->nq[i - 1], + i - 1, rdev->nqr->msix_entries[i].vector, db_offt, &bnxt_re_cqn_handler, &bnxt_re_srqn_handler); if (rc) { @@ -1256,20 +1703,22 @@ static int bnxt_re_init_res(struct bnxt_re_dev *rdev) return 0; fail: for (i = num_vec_enabled; i >= 0; i--) - bnxt_qplib_disable_nq(&rdev->nq[i]); + bnxt_qplib_disable_nq(&rdev->nqr->nq[i]); return rc; } static void bnxt_re_free_nq_res(struct bnxt_re_dev *rdev) { + struct bnxt_qplib_nq *nq; u8 type; int i; - for (i = 0; i < rdev->num_msix - 1; i++) { + for (i = 0; i < rdev->nqr->num_msix - 1; i++) { type = bnxt_qplib_get_ring_type(rdev->chip_ctx); - bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, type); - bnxt_qplib_free_nq(&rdev->nq[i]); - rdev->nq[i].res = NULL; + nq = &rdev->nqr->nq[i]; + bnxt_re_net_ring_free(rdev, nq->ring_id, type); + bnxt_qplib_free_nq(nq); + nq->res = NULL; } } @@ -1296,12 +1745,11 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) /* Configure and allocate resources for qplib */ rdev->qplib_res.rcfw = &rdev->rcfw; - rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr); + rc = bnxt_qplib_get_dev_attr(&rdev->rcfw); if (rc) goto fail; - rc = bnxt_qplib_alloc_res(&rdev->qplib_res, rdev->en_dev->pdev, - rdev->netdev, &rdev->dev_attr); + rc = bnxt_qplib_alloc_res(&rdev->qplib_res, rdev->netdev); if (rc) goto fail; @@ -1311,12 +1759,12 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) if (rc) goto dealloc_res; - for (i = 0; i < rdev->num_msix - 1; i++) { + for (i = 0; i < rdev->nqr->num_msix - 1; i++) { struct bnxt_qplib_nq *nq; - nq = &rdev->nq[i]; + nq = &rdev->nqr->nq[i]; nq->hwq.max_elements = BNXT_QPLIB_NQE_MAX_CNT; - rc = bnxt_qplib_alloc_nq(&rdev->qplib_res, &rdev->nq[i]); + rc = bnxt_qplib_alloc_nq(&rdev->qplib_res, nq); if (rc) { ibdev_err(&rdev->ibdev, "Alloc Failed NQ%d rc:%#x", i, rc); @@ -1324,17 +1772,17 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) } type = bnxt_qplib_get_ring_type(rdev->chip_ctx); rattr.dma_arr = nq->hwq.pbl[PBL_LVL_0].pg_map_arr; - rattr.pages = nq->hwq.pbl[rdev->nq[i].hwq.level].pg_count; + rattr.pages = nq->hwq.pbl[rdev->nqr->nq[i].hwq.level].pg_count; rattr.type = type; rattr.mode = RING_ALLOC_REQ_INT_MODE_MSIX; rattr.depth = BNXT_QPLIB_NQE_MAX_CNT - 1; - rattr.lrid = rdev->en_dev->msix_entries[i + 1].ring_idx; + rattr.lrid = rdev->nqr->msix_entries[i + 1].ring_idx; rc = bnxt_re_net_ring_alloc(rdev, &rattr, &nq->ring_id); if (rc) { ibdev_err(&rdev->ibdev, "Failed to allocate NQ fw id with rc = 0x%x", rc); - bnxt_qplib_free_nq(&rdev->nq[i]); + bnxt_qplib_free_nq(nq); goto free_nq; } num_vec_created++; @@ -1343,8 +1791,8 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) free_nq: for (i = num_vec_created - 1; i >= 0; i--) { type = bnxt_qplib_get_ring_type(rdev->chip_ctx); - bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, type); - bnxt_qplib_free_nq(&rdev->nq[i]); + bnxt_re_net_ring_free(rdev, rdev->nqr->nq[i].ring_id, type); + bnxt_qplib_free_nq(&rdev->nqr->nq[i]); } bnxt_qplib_dealloc_dpi(&rdev->qplib_res, &rdev->dpi_privileged); @@ -1384,11 +1832,8 @@ static bool bnxt_re_is_qp1_or_shadow_qp(struct bnxt_re_dev *rdev, static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) { - int mask = IB_QP_STATE; - struct ib_qp_attr qp_attr; struct bnxt_re_qp *qp; - qp_attr.qp_state = IB_QPS_ERR; mutex_lock(&rdev->qp_lock); list_for_each_entry(qp, &rdev->qp_list, list) { /* Modify the state of all QPs except QP1/Shadow QP */ @@ -1396,12 +1841,9 @@ static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) if (qp->qplib_qp.state != CMDQ_MODIFY_QP_NEW_STATE_RESET && qp->qplib_qp.state != - CMDQ_MODIFY_QP_NEW_STATE_ERR) { + CMDQ_MODIFY_QP_NEW_STATE_ERR) bnxt_re_dispatch_event(&rdev->ibdev, &qp->ib_qp, 1, IB_EVENT_QP_FATAL); - bnxt_re_modify_qp(&qp->ib_qp, &qp_attr, mask, - NULL); - } } } mutex_unlock(&rdev->qp_lock); @@ -1482,6 +1924,26 @@ static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev) return 0; } +static void bnxt_re_net_unregister_async_event(struct bnxt_re_dev *rdev) +{ + if (rdev->is_virtfn) + return; + + memset(&rdev->event_bitmap, 0, sizeof(rdev->event_bitmap)); + bnxt_register_async_events(rdev->en_dev, &rdev->event_bitmap, + ASYNC_EVENT_CMPL_EVENT_ID_DCB_CONFIG_CHANGE); +} + +static void bnxt_re_net_register_async_event(struct bnxt_re_dev *rdev) +{ + if (rdev->is_virtfn) + return; + + rdev->event_bitmap |= (1 << ASYNC_EVENT_CMPL_EVENT_ID_DCB_CONFIG_CHANGE); + bnxt_register_async_events(rdev->en_dev, &rdev->event_bitmap, + ASYNC_EVENT_CMPL_EVENT_ID_DCB_CONFIG_CHANGE); +} + static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev) { struct bnxt_en_dev *en_dev = rdev->en_dev; @@ -1539,11 +2001,31 @@ static int bnxt_re_ib_init(struct bnxt_re_dev *rdev) return rc; } -static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev) +static int bnxt_re_alloc_nqr_mem(struct bnxt_re_dev *rdev) +{ + rdev->nqr = kzalloc(sizeof(*rdev->nqr), GFP_KERNEL); + if (!rdev->nqr) + return -ENOMEM; + + return 0; +} + +static void bnxt_re_free_nqr_mem(struct bnxt_re_dev *rdev) +{ + kfree(rdev->nqr); + rdev->nqr = NULL; +} + +static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type) { u8 type; int rc; + bnxt_re_debugfs_rem_pdev(rdev); + + bnxt_re_net_unregister_async_event(rdev); + bnxt_re_uninit_dcb_wq(rdev); + if (test_and_clear_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags)) cancel_delayed_work_sync(&rdev->worker); @@ -1566,14 +2048,17 @@ static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev) bnxt_qplib_free_rcfw_channel(&rdev->rcfw); } - rdev->num_msix = 0; + rdev->nqr->num_msix = 0; if (rdev->pacing.dbr_pacing) bnxt_re_deinitialize_dbr_pacing(rdev); + bnxt_re_free_nqr_mem(rdev); bnxt_re_destroy_chip_ctx(rdev); - if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) - bnxt_unregister_dev(rdev->en_dev); + if (op_type == BNXT_RE_COMPLETE_REMOVE) { + if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) + bnxt_unregister_dev(rdev->en_dev); + } } /* worker thread for polling periodic events. Now used for QoS programming*/ @@ -1586,7 +2071,7 @@ static void bnxt_re_worker(struct work_struct *work) schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); } -static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) +static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) { struct bnxt_re_ring_attr rattr = {}; struct bnxt_qplib_creq_ctx *creq; @@ -1595,16 +2080,29 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) u8 type; int rc; - /* Registered a new RoCE device instance to netdev */ - rc = bnxt_re_register_netdev(rdev); - if (rc) { + if (op_type == BNXT_RE_COMPLETE_INIT) { + /* Registered a new RoCE device instance to netdev */ + rc = bnxt_re_register_netdev(rdev); + if (rc) { + ibdev_err(&rdev->ibdev, + "Failed to register with netedev: %#x\n", rc); + return -EINVAL; + } + } + set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + + if (rdev->en_dev->ulp_tbl->msix_requested < BNXT_RE_MIN_MSIX) { ibdev_err(&rdev->ibdev, - "Failed to register with netedev: %#x\n", rc); + "RoCE requires minimum 2 MSI-X vectors, but only %d reserved\n", + rdev->en_dev->ulp_tbl->msix_requested); + bnxt_unregister_dev(rdev->en_dev); + clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); return -EINVAL; } - set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + ibdev_dbg(&rdev->ibdev, "Got %d MSI-X vectors\n", + rdev->en_dev->ulp_tbl->msix_requested); - rc = bnxt_re_setup_chip_ctx(rdev, wqe_mode); + rc = bnxt_re_setup_chip_ctx(rdev); if (rc) { bnxt_unregister_dev(rdev->en_dev); clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); @@ -1612,27 +2110,27 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) return -EINVAL; } + rc = bnxt_re_alloc_nqr_mem(rdev); + if (rc) { + bnxt_re_destroy_chip_ctx(rdev); + bnxt_unregister_dev(rdev->en_dev); + clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + return rc; + } + rdev->nqr->num_msix = rdev->en_dev->ulp_tbl->msix_requested; + memcpy(rdev->nqr->msix_entries, rdev->en_dev->msix_entries, + sizeof(struct bnxt_msix_entry) * rdev->nqr->num_msix); + /* Check whether VF or PF */ bnxt_re_get_sriov_func_type(rdev); - if (!rdev->en_dev->ulp_tbl->msix_requested) { - ibdev_err(&rdev->ibdev, - "Failed to get MSI-X vectors: %#x\n", rc); - rc = -EINVAL; - goto fail; - } - ibdev_dbg(&rdev->ibdev, "Got %d MSI-X vectors\n", - rdev->en_dev->ulp_tbl->msix_requested); - rdev->num_msix = rdev->en_dev->ulp_tbl->msix_requested; - bnxt_re_query_hwrm_intf_version(rdev); /* Establish RCFW Communication Channel to initialize the context * memory for the function and all child VFs */ rc = bnxt_qplib_alloc_rcfw_channel(&rdev->qplib_res, &rdev->rcfw, - &rdev->qplib_ctx, - BNXT_RE_MAX_QPC_COUNT); + &rdev->qplib_ctx); if (rc) { ibdev_err(&rdev->ibdev, "Failed to allocate RCFW Channel: %#x\n", rc); @@ -1646,14 +2144,14 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) rattr.type = type; rattr.mode = RING_ALLOC_REQ_INT_MODE_MSIX; rattr.depth = BNXT_QPLIB_CREQE_MAX_CNT - 1; - rattr.lrid = rdev->en_dev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx; + rattr.lrid = rdev->nqr->msix_entries[BNXT_RE_AEQ_IDX].ring_idx; rc = bnxt_re_net_ring_alloc(rdev, &rattr, &creq->ring_id); if (rc) { ibdev_err(&rdev->ibdev, "Failed to allocate CREQ: %#x\n", rc); goto free_rcfw; } - db_offt = rdev->en_dev->msix_entries[BNXT_RE_AEQ_IDX].db_offset; - vid = rdev->en_dev->msix_entries[BNXT_RE_AEQ_IDX].vector; + db_offt = rdev->nqr->msix_entries[BNXT_RE_AEQ_IDX].db_offset; + vid = rdev->nqr->msix_entries[BNXT_RE_AEQ_IDX].vector; rc = bnxt_qplib_enable_rcfw_channel(&rdev->rcfw, vid, db_offt, &bnxt_re_aeq_handler); @@ -1673,7 +2171,7 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) rdev->pacing.dbr_pacing = false; } } - rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr); + rc = bnxt_qplib_get_dev_attr(&rdev->rcfw); if (rc) goto disable_rcfw; @@ -1722,6 +2220,11 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) set_bit(BNXT_RE_FLAG_RESOURCES_INITIALIZED, &rdev->flags); if (!rdev->is_virtfn) { + /* Query f/w defaults of CC params */ + rc = bnxt_qplib_query_cc_param(&rdev->qplib_res, &rdev->cc_param); + if (rc) + ibdev_warn(&rdev->ibdev, "Failed to query CC defaults\n"); + rc = bnxt_re_setup_qos(rdev); if (rc) ibdev_info(&rdev->ibdev, @@ -1730,13 +2233,18 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) INIT_DELAYED_WORK(&rdev->worker, bnxt_re_worker); set_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags); schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); - /* - * Use the total VF count since the actual VF count may not be - * available at this point. - */ - bnxt_re_vf_res_config(rdev); + + if (!(rdev->qplib_res.en_dev->flags & BNXT_EN_FLAG_ROCE_VF_RES_MGMT)) + bnxt_re_vf_res_config(rdev); } hash_init(rdev->cq_hash); + if (rdev->chip_ctx->modes.toggle_bits & BNXT_QPLIB_SRQ_TOGGLE_BIT) + hash_init(rdev->srq_hash); + + bnxt_re_debugfs_add_pdev(rdev); + + bnxt_re_init_dcb_wq(rdev); + bnxt_re_net_register_async_event(rdev); return 0; free_sctx: @@ -1751,50 +2259,11 @@ free_ring: free_rcfw: bnxt_qplib_free_rcfw_channel(&rdev->rcfw); fail: - bnxt_re_dev_uninit(rdev); + bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE); return rc; } -static int bnxt_re_add_device(struct auxiliary_device *adev, u8 wqe_mode) -{ - struct bnxt_aux_priv *aux_priv = - container_of(adev, struct bnxt_aux_priv, aux_dev); - struct bnxt_en_dev *en_dev; - struct bnxt_re_dev *rdev; - int rc; - - /* en_dev should never be NULL as long as adev and aux_dev are valid. */ - en_dev = aux_priv->edev; - - rdev = bnxt_re_dev_add(aux_priv, en_dev); - if (!rdev || !rdev_to_dev(rdev)) { - rc = -ENOMEM; - goto exit; - } - - rc = bnxt_re_dev_init(rdev, wqe_mode); - if (rc) - goto re_dev_dealloc; - - rc = bnxt_re_ib_init(rdev); - if (rc) { - pr_err("Failed to register with IB: %s", - aux_priv->aux_dev.name); - goto re_dev_uninit; - } - auxiliary_set_drvdata(adev, rdev); - - return 0; - -re_dev_uninit: - bnxt_re_dev_uninit(rdev); -re_dev_dealloc: - ib_dealloc_device(&rdev->ibdev); -exit: - return rc; -} - static void bnxt_re_setup_cc(struct bnxt_re_dev *rdev, bool enable) { struct bnxt_qplib_cc_param cc_param = {}; @@ -1809,142 +2278,139 @@ static void bnxt_re_setup_cc(struct bnxt_re_dev *rdev, bool enable) if (enable) { cc_param.enable = 1; - cc_param.cc_mode = CMDQ_MODIFY_ROCE_CC_CC_MODE_PROBABILISTIC_CC_MODE; + cc_param.tos_ecn = 1; } - cc_param.mask = (CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_CC_MODE | - CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ENABLE_CC | + cc_param.mask = (CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ENABLE_CC | CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TOS_ECN); if (bnxt_qplib_modify_cc(&rdev->qplib_res, &cc_param)) ibdev_err(&rdev->ibdev, "Failed to setup CC enable = %d\n", enable); } -/* - * "Notifier chain callback can be invoked for the same chain from - * different CPUs at the same time". - * - * For cases when the netdev is already present, our call to the - * register_netdevice_notifier() will actually get the rtnl_lock() - * before sending NETDEV_REGISTER and (if up) NETDEV_UP - * events. - * - * But for cases when the netdev is not already present, the notifier - * chain is subjected to be invoked from different CPUs simultaneously. - * - * This is protected by the netdev_mutex. - */ -static int bnxt_re_netdev_event(struct notifier_block *notifier, - unsigned long event, void *ptr) +static void bnxt_re_update_en_info_rdev(struct bnxt_re_dev *rdev, + struct bnxt_re_en_dev_info *en_info, + struct auxiliary_device *adev) { - struct net_device *real_dev, *netdev = netdev_notifier_info_to_dev(ptr); + /* Before updating the rdev pointer in bnxt_re_en_dev_info structure, + * take the rtnl lock to avoid accessing invalid rdev pointer from + * L2 ULP callbacks. This is applicable in all the places where rdev + * pointer is updated in bnxt_re_en_dev_info. + */ + rtnl_lock(); + en_info->rdev = rdev; + rtnl_unlock(); +} + +static int bnxt_re_add_device(struct auxiliary_device *adev, u8 op_type) +{ + struct bnxt_aux_priv *aux_priv = + container_of(adev, struct bnxt_aux_priv, aux_dev); + struct bnxt_re_en_dev_info *en_info; + struct bnxt_en_dev *en_dev; struct bnxt_re_dev *rdev; + int rc; + + en_info = auxiliary_get_drvdata(adev); + en_dev = en_info->en_dev; - real_dev = rdma_vlan_dev_real_dev(netdev); - if (!real_dev) - real_dev = netdev; - if (real_dev != netdev) + rdev = bnxt_re_dev_add(adev, en_dev); + if (!rdev || !rdev_to_dev(rdev)) { + rc = -ENOMEM; goto exit; + } - rdev = bnxt_re_from_netdev(real_dev); - if (!rdev) - return NOTIFY_DONE; + bnxt_re_update_en_info_rdev(rdev, en_info, adev); + rc = bnxt_re_dev_init(rdev, op_type); + if (rc) + goto re_dev_dealloc; - switch (event) { - case NETDEV_UP: - case NETDEV_DOWN: - case NETDEV_CHANGE: - bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, - netif_carrier_ok(real_dev) ? - IB_EVENT_PORT_ACTIVE : - IB_EVENT_PORT_ERR); - break; - default: - break; + rc = bnxt_re_ib_init(rdev); + if (rc) { + pr_err("Failed to register with IB: %s", + aux_priv->aux_dev.name); + goto re_dev_uninit; } - ib_device_put(&rdev->ibdev); + + bnxt_re_setup_cc(rdev, true); + + return 0; + +re_dev_uninit: + bnxt_re_update_en_info_rdev(NULL, en_info, adev); + bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE); +re_dev_dealloc: + ib_dealloc_device(&rdev->ibdev); exit: - return NOTIFY_DONE; + return rc; } #define BNXT_ADEV_NAME "bnxt_en" -static void bnxt_re_remove(struct auxiliary_device *adev) +static void bnxt_re_remove_device(struct bnxt_re_dev *rdev, u8 op_type, + struct auxiliary_device *aux_dev) { - struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev); - - if (!rdev) - return; - - mutex_lock(&bnxt_re_mutex); - if (rdev->nb.notifier_call) { - unregister_netdevice_notifier(&rdev->nb); - rdev->nb.notifier_call = NULL; - } else { - /* If notifier is null, we should have already done a - * clean up before coming here. - */ - goto skip_remove; - } bnxt_re_setup_cc(rdev, false); ib_unregister_device(&rdev->ibdev); - bnxt_re_dev_uninit(rdev); + bnxt_re_dev_uninit(rdev, op_type); ib_dealloc_device(&rdev->ibdev); -skip_remove: +} + +static void bnxt_re_remove(struct auxiliary_device *adev) +{ + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); + struct bnxt_re_dev *rdev; + + mutex_lock(&bnxt_re_mutex); + rdev = en_info->rdev; + + if (rdev) + bnxt_re_remove_device(rdev, BNXT_RE_COMPLETE_REMOVE, adev); + kfree(en_info); mutex_unlock(&bnxt_re_mutex); } static int bnxt_re_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { - struct bnxt_re_dev *rdev; + struct bnxt_aux_priv *aux_priv = + container_of(adev, struct bnxt_aux_priv, aux_dev); + struct bnxt_re_en_dev_info *en_info; + struct bnxt_en_dev *en_dev; int rc; + en_dev = aux_priv->edev; + mutex_lock(&bnxt_re_mutex); - rc = bnxt_re_add_device(adev, BNXT_QPLIB_WQE_MODE_STATIC); - if (rc) { + en_info = kzalloc(sizeof(*en_info), GFP_KERNEL); + if (!en_info) { mutex_unlock(&bnxt_re_mutex); - return rc; + return -ENOMEM; } + en_info->en_dev = en_dev; - rdev = auxiliary_get_drvdata(adev); + auxiliary_set_drvdata(adev, en_info); - rdev->nb.notifier_call = bnxt_re_netdev_event; - rc = register_netdevice_notifier(&rdev->nb); - if (rc) { - rdev->nb.notifier_call = NULL; - pr_err("%s: Cannot register to netdevice_notifier", - ROCE_DRV_MODULE_NAME); - goto err; - } - - bnxt_re_setup_cc(rdev, true); - mutex_unlock(&bnxt_re_mutex); - return 0; + rc = bnxt_re_add_device(adev, BNXT_RE_COMPLETE_INIT); + if (rc) + kfree(en_info); -err: mutex_unlock(&bnxt_re_mutex); - bnxt_re_remove(adev); return rc; } static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state) { - struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev); - - if (!rdev) - return 0; + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); + struct bnxt_en_dev *en_dev; + struct bnxt_re_dev *rdev; + rdev = en_info->rdev; + en_dev = en_info->en_dev; mutex_lock(&bnxt_re_mutex); - /* L2 driver may invoke this callback during device error/crash or device - * reset. Current RoCE driver doesn't recover the device in case of - * error. Handle the error by dispatching fatal events to all qps - * ie. by calling bnxt_re_dev_stop and release the MSIx vectors as - * L2 driver want to modify the MSIx table. - */ ibdev_info(&rdev->ibdev, "Handle device suspend call"); /* Check the current device state from bnxt_en_dev and move the @@ -1952,17 +2418,20 @@ static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state) * This prevents more commands to HW during clean-up, * in case the device is already in error. */ - if (test_bit(BNXT_STATE_FW_FATAL_COND, &rdev->en_dev->en_state)) + if (test_bit(BNXT_STATE_FW_FATAL_COND, &rdev->en_dev->en_state)) { set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags); + set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags); + wake_up_all(&rdev->rcfw.cmdq.waitq); + bnxt_re_dev_stop(rdev); + } - bnxt_re_dev_stop(rdev); - bnxt_re_stop_irq(rdev); - /* Move the device states to detached and avoid sending any more - * commands to HW - */ - set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags); - set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags); - wake_up_all(&rdev->rcfw.cmdq.waitq); + if (rdev->pacing.dbr_pacing) + bnxt_re_set_pacing_dev_state(rdev); + + ibdev_info(&rdev->ibdev, "%s: L2 driver notified to stop en_state 0x%lx", + __func__, en_dev->en_state); + bnxt_re_remove_device(rdev, BNXT_RE_PRE_RECOVERY_REMOVE, adev); + bnxt_re_update_en_info_rdev(NULL, en_info, adev); mutex_unlock(&bnxt_re_mutex); return 0; @@ -1970,25 +2439,28 @@ static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state) static int bnxt_re_resume(struct auxiliary_device *adev) { - struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev); - - if (!rdev) - return 0; + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); + struct bnxt_re_dev *rdev; mutex_lock(&bnxt_re_mutex); - /* L2 driver may invoke this callback during device recovery, resume. - * reset. Current RoCE driver doesn't recover the device in case of - * error. Handle the error by dispatching fatal events to all qps - * ie. by calling bnxt_re_dev_stop and release the MSIx vectors as - * L2 driver want to modify the MSIx table. - */ - - ibdev_info(&rdev->ibdev, "Handle device resume call"); + bnxt_re_add_device(adev, BNXT_RE_POST_RECOVERY_INIT); + rdev = en_info->rdev; + ibdev_info(&rdev->ibdev, "Device resume completed"); mutex_unlock(&bnxt_re_mutex); return 0; } +static void bnxt_re_shutdown(struct auxiliary_device *adev) +{ + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); + struct bnxt_re_dev *rdev; + + rdev = en_info->rdev; + ib_unregister_device(&rdev->ibdev); + bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE); +} + static const struct auxiliary_device_id bnxt_re_id_table[] = { { .name = BNXT_ADEV_NAME ".rdma", }, {}, @@ -2011,18 +2483,24 @@ static int __init bnxt_re_mod_init(void) int rc; pr_info("%s: %s", ROCE_DRV_MODULE_NAME, version); + bnxt_re_register_debugfs(); + rc = auxiliary_driver_register(&bnxt_re_driver); if (rc) { pr_err("%s: Failed to register auxiliary driver\n", ROCE_DRV_MODULE_NAME); - return rc; + goto err_debug; } return 0; +err_debug: + bnxt_re_unregister_debugfs(); + return rc; } static void __exit bnxt_re_mod_exit(void) { auxiliary_driver_unregister(&bnxt_re_driver); + bnxt_re_unregister_debugfs(); } module_init(bnxt_re_mod_init); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 439d0c7c5d0c..457eecb99f96 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -54,6 +54,10 @@ #include "qplib_rcfw.h" #include "qplib_sp.h" #include "qplib_fp.h" +#include <rdma/ib_addr.h> +#include "bnxt_ulp.h" +#include "bnxt_re.h" +#include "ib_verbs.h" static void __clean_cq(struct bnxt_qplib_cq *cq, u64 qp); @@ -323,6 +327,7 @@ static void bnxt_qplib_service_nq(struct tasklet_struct *t) case NQ_BASE_TYPE_CQ_NOTIFICATION: { struct nq_cn *nqcne = (struct nq_cn *)nqe; + struct bnxt_re_cq *cq_p; q_handle = le32_to_cpu(nqcne->cq_handle_low); q_handle |= (u64)le32_to_cpu(nqcne->cq_handle_high) @@ -333,6 +338,10 @@ static void bnxt_qplib_service_nq(struct tasklet_struct *t) cq->toggle = (le16_to_cpu(nqe->info10_type) & NQ_CN_TOGGLE_MASK) >> NQ_CN_TOGGLE_SFT; cq->dbinfo.toggle = cq->toggle; + cq_p = container_of(cq, struct bnxt_re_cq, qplib_cq); + if (cq_p->uctx_cq_page) + *((u32 *)cq_p->uctx_cq_page) = cq->toggle; + bnxt_qplib_armen_db(&cq->dbinfo, DBC_DBC_TYPE_CQ_ARMENA); spin_lock_bh(&cq->compl_lock); @@ -347,6 +356,7 @@ static void bnxt_qplib_service_nq(struct tasklet_struct *t) case NQ_BASE_TYPE_SRQ_EVENT: { struct bnxt_qplib_srq *srq; + struct bnxt_re_srq *srq_p; struct nq_srq_event *nqsrqe = (struct nq_srq_event *)nqe; @@ -354,6 +364,12 @@ static void bnxt_qplib_service_nq(struct tasklet_struct *t) q_handle |= (u64)le32_to_cpu(nqsrqe->srq_handle_high) << 32; srq = (struct bnxt_qplib_srq *)q_handle; + srq->toggle = (le16_to_cpu(nqe->info10_type) & NQ_CN_TOGGLE_MASK) + >> NQ_CN_TOGGLE_SFT; + srq->dbinfo.toggle = srq->toggle; + srq_p = container_of(srq, struct bnxt_re_srq, qplib_srq); + if (srq_p->uctx_srq_page) + *((u32 *)srq_p->uctx_srq_page) = srq->toggle; bnxt_qplib_armen_db(&srq->dbinfo, DBC_DBC_TYPE_SRQ_ARMENA); if (nq->srqn_handler(nq, @@ -540,6 +556,7 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, nq->pdev = pdev; nq->cqn_handler = cqn_handler; nq->srqn_handler = srqn_handler; + nq->load = 0; /* Have a task to schedule CQ notifiers in post send case */ nq->cqn_wq = create_singlethread_workqueue("bnxt_qplib_nq"); @@ -642,13 +659,6 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, rc = bnxt_qplib_alloc_init_hwq(&srq->hwq, &hwq_attr); if (rc) return rc; - - srq->swq = kcalloc(srq->hwq.max_elements, sizeof(*srq->swq), - GFP_KERNEL); - if (!srq->swq) { - rc = -ENOMEM; - goto fail; - } srq->dbinfo.flags = 0; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_CREATE_SRQ, @@ -677,9 +687,17 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, spin_lock_init(&srq->lock); srq->start_idx = 0; srq->last_idx = srq->hwq.max_elements - 1; - for (idx = 0; idx < srq->hwq.max_elements; idx++) - srq->swq[idx].next_idx = idx + 1; - srq->swq[srq->last_idx].next_idx = -1; + if (!srq->hwq.is_user) { + srq->swq = kcalloc(srq->hwq.max_elements, sizeof(*srq->swq), + GFP_KERNEL); + if (!srq->swq) { + rc = -ENOMEM; + goto fail; + } + for (idx = 0; idx < srq->hwq.max_elements; idx++) + srq->swq[idx].next_idx = idx + 1; + srq->swq[srq->last_idx].next_idx = -1; + } srq->id = le32_to_cpu(resp.xid); srq->dbinfo.hwq = &srq->hwq; @@ -809,13 +827,13 @@ static int bnxt_qplib_alloc_init_swq(struct bnxt_qplib_q *que) { int indx; - que->swq = kcalloc(que->max_wqe, sizeof(*que->swq), GFP_KERNEL); + que->swq = kcalloc(que->max_sw_wqe, sizeof(*que->swq), GFP_KERNEL); if (!que->swq) return -ENOMEM; que->swq_start = 0; - que->swq_last = que->max_wqe - 1; - for (indx = 0; indx < que->max_wqe; indx++) + que->swq_last = que->max_sw_wqe - 1; + for (indx = 0; indx < que->max_sw_wqe; indx++) que->swq[indx].next_idx = indx + 1; que->swq[que->swq_last].next_idx = 0; /* Make it circular */ que->swq_last = 0; @@ -851,7 +869,7 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.res = res; hwq_attr.sginfo = &sq->sg_info; hwq_attr.stride = sizeof(struct sq_sge); - hwq_attr.depth = bnxt_qplib_get_depth(sq); + hwq_attr.depth = bnxt_qplib_get_depth(sq, qp->wqe_mode, false); hwq_attr.type = HWQ_TYPE_QUEUE; rc = bnxt_qplib_alloc_init_hwq(&sq->hwq, &hwq_attr); if (rc) @@ -879,7 +897,7 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.res = res; hwq_attr.sginfo = &rq->sg_info; hwq_attr.stride = sizeof(struct sq_sge); - hwq_attr.depth = bnxt_qplib_get_depth(rq); + hwq_attr.depth = bnxt_qplib_get_depth(rq, qp->wqe_mode, false); hwq_attr.type = HWQ_TYPE_QUEUE; rc = bnxt_qplib_alloc_init_hwq(&rq->hwq, &hwq_attr); if (rc) @@ -983,9 +1001,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) u32 tbl_indx; u16 nsge; - if (res->dattr) - qp->dev_cap_flags = res->dattr->dev_cap_flags; - + qp->is_host_msn_tbl = _is_host_msn_table(res->dattr->dev_cap_flags2); sq->dbinfo.flags = 0; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_CREATE_QP, @@ -1002,7 +1018,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) sizeof(struct sq_psn_search_ext) : sizeof(struct sq_psn_search); - if (BNXT_RE_HW_RETX(qp->dev_cap_flags)) { + if (qp->is_host_msn_tbl) { psn_sz = sizeof(struct sq_msn_search); qp->msn = 0; } @@ -1011,12 +1027,18 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.res = res; hwq_attr.sginfo = &sq->sg_info; hwq_attr.stride = sizeof(struct sq_sge); - hwq_attr.depth = bnxt_qplib_get_depth(sq); + hwq_attr.depth = bnxt_qplib_get_depth(sq, qp->wqe_mode, true); hwq_attr.aux_stride = psn_sz; - hwq_attr.aux_depth = bnxt_qplib_set_sq_size(sq, qp->wqe_mode); + hwq_attr.aux_depth = psn_sz ? bnxt_qplib_set_sq_size(sq, qp->wqe_mode) + : 0; /* Update msn tbl size */ - if (BNXT_RE_HW_RETX(qp->dev_cap_flags) && psn_sz) { - hwq_attr.aux_depth = roundup_pow_of_two(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)); + if (qp->is_host_msn_tbl && psn_sz) { + if (qp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) + hwq_attr.aux_depth = + roundup_pow_of_two(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)); + else + hwq_attr.aux_depth = + roundup_pow_of_two(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)) / 2; qp->msn_tbl_sz = hwq_attr.aux_depth; qp->msn = 0; } @@ -1026,13 +1048,14 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) if (rc) return rc; - rc = bnxt_qplib_alloc_init_swq(sq); - if (rc) - goto fail_sq; - - if (psn_sz) - bnxt_qplib_init_psn_ptr(qp, psn_sz); + if (!sq->hwq.is_user) { + rc = bnxt_qplib_alloc_init_swq(sq); + if (rc) + goto fail_sq; + if (psn_sz) + bnxt_qplib_init_psn_ptr(qp, psn_sz); + } req.sq_size = cpu_to_le32(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)); pbl = &sq->hwq.pbl[PBL_LVL_0]; req.sq_pbl = cpu_to_le64(pbl->pg_map_arr[0]); @@ -1051,16 +1074,18 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.res = res; hwq_attr.sginfo = &rq->sg_info; hwq_attr.stride = sizeof(struct sq_sge); - hwq_attr.depth = bnxt_qplib_get_depth(rq); + hwq_attr.depth = bnxt_qplib_get_depth(rq, qp->wqe_mode, false); hwq_attr.aux_stride = 0; hwq_attr.aux_depth = 0; hwq_attr.type = HWQ_TYPE_QUEUE; rc = bnxt_qplib_alloc_init_hwq(&rq->hwq, &hwq_attr); if (rc) goto sq_swq; - rc = bnxt_qplib_alloc_init_swq(rq); - if (rc) - goto fail_rq; + if (!rq->hwq.is_user) { + rc = bnxt_qplib_alloc_init_swq(rq); + if (rc) + goto fail_rq; + } req.rq_size = cpu_to_le32(rq->max_wqe); pbl = &rq->hwq.pbl[PBL_LVL_0]; @@ -1156,9 +1181,11 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) rq->dbinfo.db = qp->dpi->dbr; rq->dbinfo.max_slot = bnxt_qplib_set_rq_max_slot(rq->wqe_size); } + spin_lock_bh(&rcfw->tbl_lock); tbl_indx = map_qp_id_to_tbl_indx(qp->id, rcfw); rcfw->qp_tbl[tbl_indx].qp_id = qp->id; rcfw->qp_tbl[tbl_indx].qp_handle = (void *)qp; + spin_unlock_bh(&rcfw->tbl_lock); return 0; fail: @@ -1190,8 +1217,6 @@ static void __modify_flags_from_init_state(struct bnxt_qplib_qp *qp) qp->path_mtu = CMDQ_MODIFY_QP_PATH_MTU_MTU_2048; } - qp->modify_flags &= - ~CMDQ_MODIFY_QP_MODIFY_MASK_VLAN_ID; /* Bono FW require the max_dest_rd_atomic to be >= 1 */ if (qp->max_dest_rd_atomic < 1) qp->max_dest_rd_atomic = 1; @@ -1265,12 +1290,56 @@ static void __filter_modify_flags(struct bnxt_qplib_qp *qp) } } +static void bnxt_set_mandatory_attributes(struct bnxt_qplib_res *res, + struct bnxt_qplib_qp *qp, + struct cmdq_modify_qp *req) +{ + u32 mandatory_flags = 0; + + if (qp->type == CMDQ_MODIFY_QP_QP_TYPE_RC) + mandatory_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_ACCESS; + + if (qp->cur_qp_state == CMDQ_MODIFY_QP_NEW_STATE_INIT && + qp->state == CMDQ_MODIFY_QP_NEW_STATE_RTR) { + if (qp->type == CMDQ_MODIFY_QP_QP_TYPE_RC && qp->srq) + req->flags = cpu_to_le16(CMDQ_MODIFY_QP_FLAGS_SRQ_USED); + mandatory_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PKEY; + } + + if (_is_min_rnr_in_rtr_rts_mandatory(res->dattr->dev_cap_flags2) && + (qp->cur_qp_state == CMDQ_MODIFY_QP_NEW_STATE_RTR && + qp->state == CMDQ_MODIFY_QP_NEW_STATE_RTS)) { + if (qp->type == CMDQ_MODIFY_QP_QP_TYPE_RC) + mandatory_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_MIN_RNR_TIMER; + } + + if (qp->type == CMDQ_MODIFY_QP_QP_TYPE_UD || + qp->type == CMDQ_MODIFY_QP_QP_TYPE_GSI) + mandatory_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_QKEY; + + qp->modify_flags |= mandatory_flags; + req->qp_type = qp->type; +} + +static bool is_optimized_state_transition(struct bnxt_qplib_qp *qp) +{ + if ((qp->cur_qp_state == CMDQ_MODIFY_QP_NEW_STATE_INIT && + qp->state == CMDQ_MODIFY_QP_NEW_STATE_RTR) || + (qp->cur_qp_state == CMDQ_MODIFY_QP_NEW_STATE_RTR && + qp->state == CMDQ_MODIFY_QP_NEW_STATE_RTS)) + return true; + + return false; +} + int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) { struct bnxt_qplib_rcfw *rcfw = res->rcfw; struct creq_modify_qp_resp resp = {}; struct bnxt_qplib_cmdqmsg msg = {}; struct cmdq_modify_qp req = {}; + u16 vlan_pcp_vlan_dei_vlan_id; u32 temp32[4]; u32 bmask; int rc; @@ -1281,6 +1350,12 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) /* Filter out the qp_attr_mask based on the state->new transition */ __filter_modify_flags(qp); + if (qp->modify_flags & CMDQ_MODIFY_QP_MODIFY_MASK_STATE) { + /* Set mandatory attributes for INIT -> RTR and RTR -> RTS transition */ + if (_is_optimize_modify_qp_supported(res->dattr->dev_cap_flags2) && + is_optimized_state_transition(qp)) + bnxt_set_mandatory_attributes(res, qp, &req); + } bmask = qp->modify_flags; req.modify_mask = cpu_to_le32(qp->modify_flags); req.qp_cid = cpu_to_le32(qp->id); @@ -1361,7 +1436,16 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_DEST_QP_ID) req.dest_qp_id = cpu_to_le32(qp->dest_qpn); - req.vlan_pcp_vlan_dei_vlan_id = cpu_to_le16(qp->vlan_id); + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_VLAN_ID) { + vlan_pcp_vlan_dei_vlan_id = + ((res->sgid_tbl.tbl[qp->ah.sgid_index].vlan_id << + CMDQ_MODIFY_QP_VLAN_ID_SFT) & + CMDQ_MODIFY_QP_VLAN_ID_MASK); + vlan_pcp_vlan_dei_vlan_id |= + ((qp->ah.sl << CMDQ_MODIFY_QP_VLAN_PCP_SFT) & + CMDQ_MODIFY_QP_VLAN_PCP_MASK); + req.vlan_pcp_vlan_dei_vlan_id = cpu_to_le16(vlan_pcp_vlan_dei_vlan_id); + } bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0); rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); @@ -1453,6 +1537,7 @@ int bnxt_qplib_query_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) qp->dest_qpn = le32_to_cpu(sb->dest_qp_id); memcpy(qp->smac, sb->src_mac, 6); qp->vlan_id = le16_to_cpu(sb->vlan_pcp_vlan_dei_vlan_id); + qp->port_id = le16_to_cpu(sb->port_id); bail: dma_free_coherent(&rcfw->pdev->dev, sbuf.size, sbuf.sb, sbuf.dma_addr); @@ -1515,9 +1600,11 @@ int bnxt_qplib_destroy_qp(struct bnxt_qplib_res *res, u32 tbl_indx; int rc; + spin_lock_bh(&rcfw->tbl_lock); tbl_indx = map_qp_id_to_tbl_indx(qp->id, rcfw); rcfw->qp_tbl[tbl_indx].qp_id = BNXT_QPLIB_QP_ID_INVALID; rcfw->qp_tbl[tbl_indx].qp_handle = NULL; + spin_unlock_bh(&rcfw->tbl_lock); bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_DESTROY_QP, @@ -1528,8 +1615,10 @@ int bnxt_qplib_destroy_qp(struct bnxt_qplib_res *res, sizeof(resp), 0); rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); if (rc) { + spin_lock_bh(&rcfw->tbl_lock); rcfw->qp_tbl[tbl_indx].qp_id = qp->id; rcfw->qp_tbl[tbl_indx].qp_handle = qp; + spin_unlock_bh(&rcfw->tbl_lock); return rc; } @@ -1636,7 +1725,7 @@ static void bnxt_qplib_fill_psn_search(struct bnxt_qplib_qp *qp, if (!swq->psn_search) return; /* Handle MSN differently on cap flags */ - if (BNXT_RE_HW_RETX(qp->dev_cap_flags)) { + if (qp->is_host_msn_tbl) { bnxt_qplib_fill_msn_search(qp, wqe, swq); return; } @@ -1818,7 +1907,7 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, } swq = bnxt_qplib_get_swqe(sq, &wqe_idx); - bnxt_qplib_pull_psn_buff(qp, sq, swq, BNXT_RE_HW_RETX(qp->dev_cap_flags)); + bnxt_qplib_pull_psn_buff(qp, sq, swq, qp->is_host_msn_tbl); idx = 0; swq->slot_idx = hwq->prod; @@ -2008,7 +2097,7 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, rc = -EINVAL; goto done; } - if (!BNXT_RE_HW_RETX(qp->dev_cap_flags) || msn_update) { + if (!qp->is_host_msn_tbl || msn_update) { swq->next_psn = sq->psn & BTH_PSN_MASK; bnxt_qplib_fill_psn_search(qp, wqe, swq); } @@ -2130,6 +2219,7 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) struct bnxt_qplib_cmdqmsg msg = {}; struct cmdq_create_cq req = {}; struct bnxt_qplib_pbl *pbl; + u32 coalescing = 0; u32 pg_sz_lvl; int rc; @@ -2156,6 +2246,25 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) req.dpi = cpu_to_le32(cq->dpi->dpi); req.cq_handle = cpu_to_le64(cq->cq_handle); req.cq_size = cpu_to_le32(cq->max_wqe); + + if (_is_cq_coalescing_supported(res->dattr->dev_cap_flags2)) { + req.flags |= cpu_to_le16(CMDQ_CREATE_CQ_FLAGS_COALESCING_VALID); + coalescing |= ((cq->coalescing->buf_maxtime << + CMDQ_CREATE_CQ_BUF_MAXTIME_SFT) & + CMDQ_CREATE_CQ_BUF_MAXTIME_MASK); + coalescing |= ((cq->coalescing->normal_maxbuf << + CMDQ_CREATE_CQ_NORMAL_MAXBUF_SFT) & + CMDQ_CREATE_CQ_NORMAL_MAXBUF_MASK); + coalescing |= ((cq->coalescing->during_maxbuf << + CMDQ_CREATE_CQ_DURING_MAXBUF_SFT) & + CMDQ_CREATE_CQ_DURING_MAXBUF_MASK); + if (cq->coalescing->en_ring_idle_mode) + coalescing |= CMDQ_CREATE_CQ_ENABLE_RING_IDLE_MODE; + else + coalescing &= ~CMDQ_CREATE_CQ_ENABLE_RING_IDLE_MODE; + req.coalescing = cpu_to_le32(coalescing); + } + pbl = &cq->hwq.pbl[PBL_LVL_0]; pg_sz_lvl = (bnxt_qplib_base_pg_size(&cq->hwq) << CMDQ_CREATE_CQ_PG_SIZE_SFT); @@ -2470,6 +2579,32 @@ out: return rc; } +static int bnxt_qplib_get_cqe_sq_cons(struct bnxt_qplib_q *sq, u32 cqe_slot) +{ + struct bnxt_qplib_hwq *sq_hwq; + struct bnxt_qplib_swq *swq; + int cqe_sq_cons = -1; + u32 start, last; + + sq_hwq = &sq->hwq; + + start = sq->swq_start; + last = sq->swq_last; + + while (last != start) { + swq = &sq->swq[last]; + if (swq->slot_idx == cqe_slot) { + cqe_sq_cons = swq->next_idx; + dev_err(&sq_hwq->pdev->dev, "%s: Found cons wqe = %d slot = %d\n", + __func__, cqe_sq_cons, cqe_slot); + break; + } + + last = swq->next_idx; + } + return cqe_sq_cons; +} + static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, struct cq_req *hwcqe, struct bnxt_qplib_cqe **pcqe, int *budget, @@ -2477,9 +2612,10 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, { struct bnxt_qplib_swq *swq; struct bnxt_qplib_cqe *cqe; + u32 cqe_sq_cons, slot_num; struct bnxt_qplib_qp *qp; struct bnxt_qplib_q *sq; - u32 cqe_sq_cons; + int cqe_cons; int rc = 0; qp = (struct bnxt_qplib_qp *)((unsigned long) @@ -2491,12 +2627,26 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, } sq = &qp->sq; - cqe_sq_cons = le16_to_cpu(hwcqe->sq_cons_idx) % sq->max_wqe; + cqe_sq_cons = le16_to_cpu(hwcqe->sq_cons_idx) % sq->max_sw_wqe; if (qp->sq.flushed) { dev_dbg(&cq->hwq.pdev->dev, "%s: QP in Flush QP = %p\n", __func__, qp); goto done; } + + if (__is_err_cqe_for_var_wqe(qp, hwcqe->status)) { + slot_num = le16_to_cpu(hwcqe->sq_cons_idx); + cqe_cons = bnxt_qplib_get_cqe_sq_cons(sq, slot_num); + if (cqe_cons < 0) { + dev_err(&cq->hwq.pdev->dev, "%s: Wrong SQ cons cqe_slot_indx = %d\n", + __func__, slot_num); + goto done; + } + cqe_sq_cons = cqe_cons; + dev_err(&cq->hwq.pdev->dev, "%s: cqe_sq_cons = %d swq_last = %d swq_start = %d\n", + __func__, cqe_sq_cons, sq->swq_last, sq->swq_start); + } + /* Require to walk the sq's swq to fabricate CQEs for all previously * signaled SWQEs due to CQE aggregation from the current sq cons * to the cqe_sq_cons @@ -2534,10 +2684,12 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, bnxt_qplib_add_flush_qp(qp); } else { /* Before we complete, do WA 9060 */ - if (do_wa9060(qp, cq, cq_cons, sq->swq_last, - cqe_sq_cons)) { - *lib_qp = qp; - goto out; + if (!bnxt_qplib_is_chip_gen_p5_p7(qp->cctx)) { + if (do_wa9060(qp, cq, cq_cons, sq->swq_last, + cqe_sq_cons)) { + *lib_qp = qp; + goto out; + } } if (swq->flags & SQ_SEND_FLAGS_SIGNAL_COMP) { cqe->status = CQ_REQ_STATUS_OK; @@ -2881,7 +3033,7 @@ static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq, cqe_cons = le16_to_cpu(hwcqe->sq_cons_idx); if (cqe_cons == 0xFFFF) goto do_rq; - cqe_cons %= sq->max_wqe; + cqe_cons %= sq->max_sw_wqe; if (qp->sq.flushed) { dev_dbg(&cq->hwq.pdev->dev, diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index 7fd4506b3584..0d9487c889ff 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -105,6 +105,7 @@ struct bnxt_qplib_srq { struct bnxt_qplib_sg_info sg_info; u16 eventq_hw_ring_id; spinlock_t lock; /* protect SRQE link list */ + u8 toggle; }; struct bnxt_qplib_sge { @@ -113,7 +114,6 @@ struct bnxt_qplib_sge { u32 size; }; -#define BNXT_QPLIB_QP_MAX_SGL 6 struct bnxt_qplib_swq { u64 wr_id; int next_idx; @@ -153,7 +153,7 @@ struct bnxt_qplib_swqe { #define BNXT_QPLIB_SWQE_FLAGS_UC_FENCE BIT(2) #define BNXT_QPLIB_SWQE_FLAGS_SOLICIT_EVENT BIT(3) #define BNXT_QPLIB_SWQE_FLAGS_INLINE BIT(4) - struct bnxt_qplib_sge sg_list[BNXT_QPLIB_QP_MAX_SGL]; + struct bnxt_qplib_sge sg_list[BNXT_VAR_MAX_SGE]; int num_sge; /* Max inline data is 96 bytes */ u32 inline_len; @@ -164,12 +164,12 @@ struct bnxt_qplib_swqe { /* Send, with imm, inval key */ struct { union { - __be32 imm_data; + u32 imm_data; u32 inv_key; }; u32 q_key; u32 dst_qp; - u16 avid; + u32 avid; } send; /* Send Raw Ethernet and QP1 */ @@ -182,7 +182,7 @@ struct bnxt_qplib_swqe { /* RDMA write, with imm, read */ struct { union { - __be32 imm_data; + u32 imm_data; u32 inv_key; }; u64 remote_va; @@ -251,6 +251,7 @@ struct bnxt_qplib_q { struct bnxt_qplib_db_info dbinfo; struct bnxt_qplib_sg_info sg_info; u32 max_wqe; + u32 max_sw_wqe; u16 wqe_size; u16 q_full_delta; u16 max_sge; @@ -297,6 +298,7 @@ struct bnxt_qplib_qp { u32 dest_qpn; u8 smac[6]; u16 vlan_id; + u16 port_id; u8 nw_type; struct bnxt_qplib_ah ah; @@ -340,7 +342,8 @@ struct bnxt_qplib_qp { struct list_head rq_flush; u32 msn; u32 msn_tbl_sz; - u16 dev_cap_flags; + bool is_host_msn_tbl; + u8 tos_dscp; }; #define BNXT_QPLIB_MAX_CQE_ENTRY_SIZE sizeof(struct cq_base) @@ -381,6 +384,25 @@ static inline bool bnxt_qplib_queue_full(struct bnxt_qplib_q *que, return avail <= slots; } +/* CQ coalescing parameters */ +struct bnxt_qplib_cq_coal_param { + u16 buf_maxtime; + u8 normal_maxbuf; + u8 during_maxbuf; + u8 en_ring_idle_mode; +}; + +#define BNXT_QPLIB_CQ_COAL_DEF_BUF_MAXTIME 0x1 +#define BNXT_QPLIB_CQ_COAL_DEF_NORMAL_MAXBUF_P7 0x8 +#define BNXT_QPLIB_CQ_COAL_DEF_DURING_MAXBUF_P7 0x8 +#define BNXT_QPLIB_CQ_COAL_DEF_NORMAL_MAXBUF_P5 0x1 +#define BNXT_QPLIB_CQ_COAL_DEF_DURING_MAXBUF_P5 0x1 +#define BNXT_QPLIB_CQ_COAL_DEF_EN_RING_IDLE_MODE 0x1 +#define BNXT_QPLIB_CQ_COAL_MAX_BUF_MAXTIME 0x1bf +#define BNXT_QPLIB_CQ_COAL_MAX_NORMAL_MAXBUF 0x1f +#define BNXT_QPLIB_CQ_COAL_MAX_DURING_MAXBUF 0x1f +#define BNXT_QPLIB_CQ_COAL_MAX_EN_RING_IDLE_MODE 0x1 + struct bnxt_qplib_cqe { u8 status; u8 type; @@ -389,7 +411,7 @@ struct bnxt_qplib_cqe { u16 cfa_meta; u64 wr_id; union { - __be32 immdata; + u32 immdata; u32 invrkey; }; u64 qp_handle; @@ -443,6 +465,7 @@ struct bnxt_qplib_cq { */ spinlock_t flush_lock; /* QP flush management */ u16 cnq_events; + struct bnxt_qplib_cq_coal_param *coalescing; }; #define BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE sizeof(struct xrrq_irrq) @@ -497,6 +520,7 @@ struct bnxt_qplib_nq { struct tasklet_struct nq_tasklet; bool requested; int budget; + u32 load; cqn_handler_t cqn_handler; srqn_handler_t srqn_handler; @@ -586,15 +610,22 @@ static inline void bnxt_qplib_swq_mod_start(struct bnxt_qplib_q *que, u32 idx) que->swq_start = que->swq[idx].next_idx; } -static inline u32 bnxt_qplib_get_depth(struct bnxt_qplib_q *que) +static inline u32 bnxt_qplib_get_depth(struct bnxt_qplib_q *que, u8 wqe_mode, bool is_sq) { - return (que->wqe_size * que->max_wqe) / sizeof(struct sq_sge); + u32 slots; + + /* Queue depth is the number of slots. */ + slots = (que->wqe_size * que->max_wqe) / sizeof(struct sq_sge); + /* For variable WQE mode, need to align the slots to 256 */ + if (wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE && is_sq) + slots = ALIGN(slots, BNXT_VAR_MAX_SLOT_ALIGN); + return slots; } static inline u32 bnxt_qplib_set_sq_size(struct bnxt_qplib_q *que, u8 wqe_mode) { return (wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) ? - que->max_wqe : bnxt_qplib_get_depth(que); + que->max_wqe : bnxt_qplib_get_depth(que, wqe_mode, true); } static inline u32 bnxt_qplib_set_sq_max_slot(u8 wqe_mode) @@ -641,4 +672,14 @@ static inline __le64 bnxt_re_update_msn_tbl(u32 st_idx, u32 npsn, u32 start_psn) (((start_psn) << SQ_MSN_SEARCH_START_PSN_SFT) & SQ_MSN_SEARCH_START_PSN_MASK)); } + +static inline bool __is_var_wqe(struct bnxt_qplib_qp *qp) +{ + return (qp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE); +} + +static inline bool __is_err_cqe_for_var_wqe(struct bnxt_qplib_qp *qp, u8 status) +{ + return (status != CQ_REQ_STATUS_OK) && __is_var_wqe(qp); +} #endif /* __BNXT_QPLIB_FP_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 3ffaef0c2651..d23074383428 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -290,7 +290,6 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct bnxt_qplib_hwq *hwq; u32 sw_prod, cmdq_prod; struct pci_dev *pdev; - unsigned long flags; u16 cookie; u8 *preq; @@ -301,7 +300,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, /* Cmdq are in 16-byte units, each request can consume 1 or more * cmdqe */ - spin_lock_irqsave(&hwq->lock, flags); + spin_lock_bh(&hwq->lock); required_slots = bnxt_qplib_get_cmd_slots(msg->req); free_slots = HWQ_FREE_SLOTS(hwq); cookie = cmdq->seq_num & RCFW_MAX_COOKIE_VALUE; @@ -311,7 +310,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, dev_info_ratelimited(&pdev->dev, "CMDQ is full req/free %d/%d!", required_slots, free_slots); - spin_unlock_irqrestore(&hwq->lock, flags); + spin_unlock_bh(&hwq->lock); return -EAGAIN; } if (msg->block) @@ -367,7 +366,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, wmb(); writel(cmdq_prod, cmdq->cmdq_mbox.prod); writel(RCFW_CMDQ_TRIG_VAL, cmdq->cmdq_mbox.db); - spin_unlock_irqrestore(&hwq->lock, flags); + spin_unlock_bh(&hwq->lock); /* Return the CREQ response pointer */ return 0; } @@ -425,7 +424,8 @@ static int __send_message_basic_sanity(struct bnxt_qplib_rcfw *rcfw, /* Prevent posting if f/w is not in a state to process */ if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags)) - return bnxt_qplib_map_rc(opcode); + return -ENXIO; + if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) return -ETIMEDOUT; @@ -486,7 +486,6 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, { struct creq_qp_event *evnt = (struct creq_qp_event *)msg->resp; struct bnxt_qplib_crsqe *crsqe; - unsigned long flags; u16 cookie; int rc; u8 opcode; @@ -495,7 +494,7 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, rc = __send_message_basic_sanity(rcfw, msg, opcode); if (rc) - return rc; + return rc == -ENXIO ? bnxt_qplib_map_rc(opcode) : rc; rc = __send_message(rcfw, msg, opcode); if (rc) @@ -512,12 +511,12 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, rc = __poll_for_resp(rcfw, cookie); if (rc) { - spin_lock_irqsave(&rcfw->cmdq.hwq.lock, flags); + spin_lock_bh(&rcfw->cmdq.hwq.lock); crsqe = &rcfw->crsqe_tbl[cookie]; crsqe->is_waiter_alive = false; if (rc == -ENODEV) set_bit(FIRMWARE_STALL_DETECTED, &rcfw->cmdq.flags); - spin_unlock_irqrestore(&rcfw->cmdq.hwq.lock, flags); + spin_unlock_bh(&rcfw->cmdq.hwq.lock); return -ETIMEDOUT; } @@ -525,7 +524,7 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, /* failed with status */ dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x status %#x\n", cookie, opcode, evnt->status); - rc = -EFAULT; + rc = -EIO; } return rc; @@ -628,7 +627,6 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, u16 cookie, blocked = 0; bool is_waiter_alive; struct pci_dev *pdev; - unsigned long flags; u32 wait_cmds = 0; int rc = 0; @@ -637,17 +635,21 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, case CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION: err_event = (struct creq_qp_error_notification *)qp_event; qp_id = le32_to_cpu(err_event->xid); + spin_lock(&rcfw->tbl_lock); tbl_indx = map_qp_id_to_tbl_indx(qp_id, rcfw); qp = rcfw->qp_tbl[tbl_indx].qp_handle; + if (!qp) { + spin_unlock(&rcfw->tbl_lock); + break; + } + bnxt_qplib_mark_qp_error(qp); + rc = rcfw->creq.aeq_handler(rcfw, qp_event, qp); + spin_unlock(&rcfw->tbl_lock); dev_dbg(&pdev->dev, "Received QP error notification\n"); dev_dbg(&pdev->dev, "qpid 0x%x, req_err=0x%x, resp_err=0x%x\n", qp_id, err_event->req_err_state_reason, err_event->res_err_state_reason); - if (!qp) - break; - bnxt_qplib_mark_qp_error(qp); - rc = rcfw->creq.aeq_handler(rcfw, qp_event, qp); break; default: /* @@ -659,8 +661,7 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, * */ - spin_lock_irqsave_nested(&hwq->lock, flags, - SINGLE_DEPTH_NESTING); + spin_lock_nested(&hwq->lock, SINGLE_DEPTH_NESTING); cookie = le16_to_cpu(qp_event->cookie); blocked = cookie & RCFW_CMD_IS_BLOCKING; cookie &= RCFW_MAX_COOKIE_VALUE; @@ -672,7 +673,7 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, dev_info(&pdev->dev, "rcfw timedout: cookie = %#x, free_slots = %d", cookie, crsqe->free_slots); - spin_unlock_irqrestore(&hwq->lock, flags); + spin_unlock(&hwq->lock); return rc; } @@ -720,7 +721,7 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, __destroy_timedout_ah(rcfw, (struct creq_create_ah_resp *) qp_event); - spin_unlock_irqrestore(&hwq->lock, flags); + spin_unlock(&hwq->lock); } *num_wait += wait_cmds; return rc; @@ -734,12 +735,11 @@ static void bnxt_qplib_service_creq(struct tasklet_struct *t) u32 type, budget = CREQ_ENTRY_POLL_BUDGET; struct bnxt_qplib_hwq *hwq = &creq->hwq; struct creq_base *creqe; - unsigned long flags; u32 num_wakeup = 0; u32 hw_polled = 0; /* Service the CREQ until budget is over */ - spin_lock_irqsave(&hwq->lock, flags); + spin_lock_bh(&hwq->lock); while (budget > 0) { creqe = bnxt_qplib_get_qe(hwq, hwq->cons, NULL); if (!CREQ_CMP_VALID(creqe, creq->creq_db.dbinfo.flags)) @@ -782,7 +782,7 @@ static void bnxt_qplib_service_creq(struct tasklet_struct *t) if (hw_polled) bnxt_qplib_ring_nq_db(&creq->creq_db.dbinfo, rcfw->res->cctx, true); - spin_unlock_irqrestore(&hwq->lock, flags); + spin_unlock_bh(&hwq->lock); if (num_wakeup) wake_up_nr(&rcfw->cmdq.waitq, num_wakeup); } @@ -832,6 +832,7 @@ int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, struct creq_initialize_fw_resp resp = {}; struct cmdq_initialize_fw req = {}; struct bnxt_qplib_cmdqmsg msg = {}; + u16 flags = 0; u8 pgsz, lvl; int rc; @@ -850,10 +851,8 @@ int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, * shall setup this area for VF. Skipping the * HW programming */ - if (is_virtfn) + if (is_virtfn || bnxt_qplib_is_chip_gen_p5_p7(rcfw->res->cctx)) goto skip_ctx_setup; - if (bnxt_qplib_is_chip_gen_p5_p7(rcfw->res->cctx)) - goto config_vf_res; lvl = ctx->qpc_tbl.level; pgsz = bnxt_qplib_base_pg_size(&ctx->qpc_tbl); @@ -897,16 +896,14 @@ int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, req.number_of_srq = cpu_to_le32(ctx->srqc_tbl.max_elements); req.number_of_cq = cpu_to_le32(ctx->cq_tbl.max_elements); -config_vf_res: - req.max_qp_per_vf = cpu_to_le32(ctx->vf_res.max_qp_per_vf); - req.max_mrw_per_vf = cpu_to_le32(ctx->vf_res.max_mrw_per_vf); - req.max_srq_per_vf = cpu_to_le32(ctx->vf_res.max_srq_per_vf); - req.max_cq_per_vf = cpu_to_le32(ctx->vf_res.max_cq_per_vf); - req.max_gid_per_vf = cpu_to_le32(ctx->vf_res.max_gid_per_vf); - skip_ctx_setup: if (BNXT_RE_HW_RETX(rcfw->res->dattr->dev_cap_flags)) - req.flags |= cpu_to_le16(CMDQ_INITIALIZE_FW_FLAGS_HW_REQUESTER_RETX_SUPPORTED); + flags |= CMDQ_INITIALIZE_FW_FLAGS_HW_REQUESTER_RETX_SUPPORTED; + if (_is_optimize_modify_qp_supported(rcfw->res->dattr->dev_cap_flags2)) + flags |= CMDQ_INITIALIZE_FW_FLAGS_OPTIMIZE_MODIFY_QP_SUPPORTED; + if (rcfw->res->en_dev->flags & BNXT_EN_FLAG_ROCE_VF_RES_MGMT) + flags |= CMDQ_INITIALIZE_FW_FLAGS_L2_VF_RESOURCE_MGMT; + req.flags |= cpu_to_le16(flags); req.stat_ctx_id = cpu_to_le32(ctx->stats.fw_id); bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0); rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); @@ -918,7 +915,6 @@ skip_ctx_setup: void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) { - kfree(rcfw->qp_tbl); kfree(rcfw->crsqe_tbl); bnxt_qplib_free_hwq(rcfw->res, &rcfw->cmdq.hwq); bnxt_qplib_free_hwq(rcfw->res, &rcfw->creq.hwq); @@ -927,8 +923,7 @@ void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res, struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_ctx *ctx, - int qp_tbl_sz) + struct bnxt_qplib_ctx *ctx) { struct bnxt_qplib_hwq_attr hwq_attr = {}; struct bnxt_qplib_sg_info sginfo = {}; @@ -972,12 +967,7 @@ int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res, if (!rcfw->crsqe_tbl) goto fail; - /* Allocate one extra to hold the QP1 entries */ - rcfw->qp_tbl_size = qp_tbl_sz + 1; - rcfw->qp_tbl = kcalloc(rcfw->qp_tbl_size, sizeof(struct bnxt_qplib_qp_node), - GFP_KERNEL); - if (!rcfw->qp_tbl) - goto fail; + spin_lock_init(&rcfw->tbl_lock); rcfw->max_timeout = res->cctx->hwrm_cmd_max_timeout; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 45996e60a0d0..ff873c5f1b25 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -131,6 +131,8 @@ static inline u32 bnxt_qplib_set_cmd_slots(struct cmdq_base *req) #define RCFW_CMD_IS_BLOCKING 0x8000 #define HWRM_VERSION_DEV_ATTR_MAX_DPI 0x1000A0000000DULL +/* HWRM version 1.10.3.18 */ +#define HWRM_VERSION_READ_CTX 0x1000A00030012 /* Crsq buf is 1024-Byte */ struct bnxt_qplib_crsbe { @@ -224,6 +226,8 @@ struct bnxt_qplib_rcfw { struct bnxt_qplib_crsqe *crsqe_tbl; int qp_tbl_size; struct bnxt_qplib_qp_node *qp_tbl; + /* To synchronize the qp-handle hash table */ + spinlock_t tbl_lock; u64 oos_prev; u32 init_oos_stats; u32 cmdq_depth; @@ -258,8 +262,7 @@ static inline void bnxt_qplib_fill_cmdqmsg(struct bnxt_qplib_cmdqmsg *msg, void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw); int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res, struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_ctx *ctx, - int qp_tbl_sz); + struct bnxt_qplib_ctx *ctx); void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill); void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw); int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector, @@ -281,9 +284,10 @@ int bnxt_qplib_deinit_rcfw(struct bnxt_qplib_rcfw *rcfw); int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, struct bnxt_qplib_ctx *ctx, int is_virtfn); void bnxt_qplib_mark_qp_error(void *qp_handle); + static inline u32 map_qp_id_to_tbl_indx(u32 qid, struct bnxt_qplib_rcfw *rcfw) { /* Last index of the qp_tbl is for QP1 ie. qp_tbl_size - 1*/ - return (qid == 1) ? rcfw->qp_tbl_size - 1 : qid % rcfw->qp_tbl_size - 2; + return (qid == 1) ? rcfw->qp_tbl_size - 1 : (qid % (rcfw->qp_tbl_size - 2)); } #endif /* __BNXT_QPLIB_RCFW_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index dfc943fab87b..6cd05207ffed 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -244,6 +244,8 @@ int bnxt_qplib_alloc_init_hwq(struct bnxt_qplib_hwq *hwq, sginfo.pgsize = npde * pg_size; sginfo.npages = 1; rc = __alloc_pbl(res, &hwq->pbl[PBL_LVL_0], &sginfo); + if (rc) + goto fail; /* Alloc PBL pages */ sginfo.npages = npbl; @@ -255,22 +257,9 @@ int bnxt_qplib_alloc_init_hwq(struct bnxt_qplib_hwq *hwq, dst_virt_ptr = (dma_addr_t **)hwq->pbl[PBL_LVL_0].pg_arr; src_phys_ptr = hwq->pbl[PBL_LVL_1].pg_map_arr; - if (hwq_attr->type == HWQ_TYPE_MR) { - /* For MR it is expected that we supply only 1 contigous - * page i.e only 1 entry in the PDL that will contain - * all the PBLs for the user supplied memory region - */ - for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count; - i++) - dst_virt_ptr[0][i] = src_phys_ptr[i] | - flag; - } else { - for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count; - i++) - dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] = - src_phys_ptr[i] | - PTU_PDE_VALID; - } + for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count; i++) + dst_virt_ptr[0][i] = src_phys_ptr[i] | flag; + /* Alloc or init PTEs */ rc = __alloc_pbl(res, &hwq->pbl[PBL_LVL_2], hwq_attr->sginfo); @@ -882,19 +871,27 @@ int bnxt_qplib_init_res(struct bnxt_qplib_res *res) void bnxt_qplib_free_res(struct bnxt_qplib_res *res) { + kfree(res->rcfw->qp_tbl); bnxt_qplib_free_sgid_tbl(res, &res->sgid_tbl); bnxt_qplib_free_pd_tbl(&res->pd_tbl); bnxt_qplib_free_dpi_tbl(res, &res->dpi_tbl); } -int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct pci_dev *pdev, - struct net_device *netdev, - struct bnxt_qplib_dev_attr *dev_attr) +int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct net_device *netdev) { + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct bnxt_qplib_dev_attr *dev_attr; int rc; - res->pdev = pdev; res->netdev = netdev; + dev_attr = res->dattr; + + /* Allocate one extra to hold the QP1 entries */ + rcfw->qp_tbl_size = max_t(u32, BNXT_RE_MAX_QPC_COUNT + 1, dev_attr->max_qp); + rcfw->qp_tbl = kcalloc(rcfw->qp_tbl_size, sizeof(struct bnxt_qplib_qp_node), + GFP_KERNEL); + if (!rcfw->qp_tbl) + return -ENOMEM; rc = bnxt_qplib_alloc_sgid_tbl(res, &res->sgid_tbl, dev_attr->max_sgid); if (rc) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 61628f7f1253..6a13927674b4 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -39,6 +39,8 @@ #ifndef __BNXT_QPLIB_RES_H__ #define __BNXT_QPLIB_RES_H__ +#include "bnxt_ulp.h" + extern const struct bnxt_qplib_gid bnxt_qplib_gid_zero; #define CHIP_NUM_57508 0x1750 @@ -47,6 +49,13 @@ extern const struct bnxt_qplib_gid bnxt_qplib_gid_zero; #define CHIP_NUM_58818 0xd818 #define CHIP_NUM_57608 0x1760 +#define BNXT_RE_MAX_QPC_COUNT (64 * 1024) +#define BNXT_RE_MAX_MRW_COUNT (64 * 1024) +#define BNXT_RE_MAX_SRQC_COUNT (64 * 1024) +#define BNXT_RE_MAX_CQ_COUNT (64 * 1024) +#define BNXT_RE_MAX_MRW_COUNT_64K (64 * 1024) +#define BNXT_RE_MAX_MRW_COUNT_256K (256 * 1024) + #define BNXT_QPLIB_DBR_VALID (0x1UL << 26) #define BNXT_QPLIB_DBR_EPOCH_SHIFT 24 #define BNXT_QPLIB_DBR_TOGGLE_SHIFT 25 @@ -82,6 +91,7 @@ struct bnxt_qplib_db_pacing_data { u32 fifo_room_mask; u32 fifo_room_shift; u32 grc_reg_offset; + u32 dev_err_state; }; #define BNXT_QPLIB_DBR_PF_DB_OFFSET 0x10000 @@ -301,6 +311,7 @@ struct bnxt_qplib_res { struct bnxt_qplib_chip_ctx *cctx; struct bnxt_qplib_dev_attr *dattr; struct net_device *netdev; + struct bnxt_en_dev *en_dev; struct bnxt_qplib_rcfw *rcfw; struct bnxt_qplib_pd_tbl pd_tbl; /* To protect the pd table bit map */ @@ -420,9 +431,7 @@ int bnxt_qplib_dealloc_dpi(struct bnxt_qplib_res *res, void bnxt_qplib_cleanup_res(struct bnxt_qplib_res *res); int bnxt_qplib_init_res(struct bnxt_qplib_res *res); void bnxt_qplib_free_res(struct bnxt_qplib_res *res); -int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct pci_dev *pdev, - struct net_device *netdev, - struct bnxt_qplib_dev_attr *dev_attr); +int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct net_device *netdev); void bnxt_qplib_free_ctx(struct bnxt_qplib_res *res, struct bnxt_qplib_ctx *ctx); int bnxt_qplib_alloc_ctx(struct bnxt_qplib_res *res, @@ -545,6 +554,14 @@ static inline bool _is_ext_stats_supported(u16 dev_cap_flags) CREQ_QUERY_FUNC_RESP_SB_EXT_STATS; } +static inline int bnxt_ext_stats_supported(struct bnxt_qplib_chip_ctx *ctx, + u16 flags, bool virtfn) +{ + /* ext stats supported if cap flag is set AND is a PF OR a Thor2 VF */ + return (_is_ext_stats_supported(flags) && + ((virtfn && bnxt_qplib_is_chip_gen_p7(ctx)) || (!virtfn))); +} + static inline bool _is_hw_retx_supported(u16 dev_cap_flags) { return dev_cap_flags & @@ -554,9 +571,45 @@ static inline bool _is_hw_retx_supported(u16 dev_cap_flags) #define BNXT_RE_HW_RETX(a) _is_hw_retx_supported((a)) +static inline bool _is_host_msn_table(u16 dev_cap_ext_flags2) +{ + return (dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_MASK) == + CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_HOST_MSN_TABLE; +} + static inline u8 bnxt_qplib_dbr_pacing_en(struct bnxt_qplib_chip_ctx *cctx) { return cctx->modes.dbr_pacing; } +static inline bool _is_alloc_mr_unified(u16 dev_cap_flags) +{ + return dev_cap_flags & CREQ_QUERY_FUNC_RESP_SB_MR_REGISTER_ALLOC; +} + +static inline bool _is_relaxed_ordering_supported(u16 dev_cap_ext_flags2) +{ + return dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_MEMORY_REGION_RO_SUPPORTED; +} + +static inline bool _is_optimize_modify_qp_supported(u16 dev_cap_ext_flags2) +{ + return dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_OPTIMIZE_MODIFY_QP_SUPPORTED; +} + +static inline bool _is_min_rnr_in_rtr_rts_mandatory(u16 dev_cap_ext_flags2) +{ + return !!(dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_MIN_RNR_RTR_RTS_OPT_SUPPORTED); +} + +static inline bool _is_cq_coalescing_supported(u16 dev_cap_ext_flags2) +{ + return dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_CQ_COALESCING_SUPPORTED; +} + +static inline bool _is_max_srq_ext_supported(u16 dev_cap_ext_flags_2) +{ + return !!(dev_cap_ext_flags_2 & CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED); +} + #endif /* __BNXT_QPLIB_RES_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 8beeedd15061..f231e886ad9d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -88,18 +88,20 @@ static void bnxt_qplib_query_version(struct bnxt_qplib_rcfw *rcfw, fw_ver[3] = resp.fw_rsvd; } -int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_dev_attr *attr) +int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw) { + struct bnxt_qplib_dev_attr *attr = rcfw->res->dattr; struct creq_query_func_resp resp = {}; struct bnxt_qplib_cmdqmsg msg = {}; struct creq_query_func_resp_sb *sb; struct bnxt_qplib_rcfw_sbuf sbuf; + struct bnxt_qplib_chip_ctx *cctx; struct cmdq_query_func req = {}; u8 *tqm_alloc; int i, rc; u32 temp; + cctx = rcfw->res->cctx; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_QUERY_FUNC, sizeof(req)); @@ -127,16 +129,25 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, attr->max_qp_init_rd_atom = sb->max_qp_init_rd_atom > BNXT_QPLIB_MAX_OUT_RD_ATOM ? BNXT_QPLIB_MAX_OUT_RD_ATOM : sb->max_qp_init_rd_atom; - attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr); - /* - * 128 WQEs needs to be reserved for the HW (8916). Prevent - * reporting the max number - */ - attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS + 1; - attr->max_qp_sges = bnxt_qplib_is_chip_gen_p5_p7(rcfw->res->cctx) ? - 6 : sb->max_sge; + attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr) - 1; + if (!bnxt_qplib_is_chip_gen_p5_p7(rcfw->res->cctx)) { + /* + * 128 WQEs needs to be reserved for the HW (8916). Prevent + * reporting the max number on legacy devices + */ + attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS + 1; + } + + /* Adjust for max_qp_wqes for variable wqe */ + if (cctx->modes.wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) + attr->max_qp_wqes = BNXT_VAR_MAX_WQE - 1; + + attr->max_qp_sges = cctx->modes.wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE ? + min_t(u32, sb->max_sge_var_wqe, BNXT_VAR_MAX_SGE) : 6; attr->max_cq = le32_to_cpu(sb->max_cq); attr->max_cq_wqes = le32_to_cpu(sb->max_cqe); + if (!bnxt_qplib_is_chip_gen_p7(rcfw->res->cctx)) + attr->max_cq_wqes = min_t(u32, BNXT_QPLIB_MAX_CQ_WQES, attr->max_cq_wqes); attr->max_cq_sges = attr->max_qp_sges; attr->max_mr = le32_to_cpu(sb->max_mr); attr->max_mw = le32_to_cpu(sb->max_mw); @@ -154,8 +165,19 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, if (!bnxt_qplib_is_chip_gen_p7(rcfw->res->cctx)) attr->l2_db_size = (sb->l2_db_space_size + 1) * (0x01 << RCFW_DBR_BASE_PAGE_SHIFT); - attr->max_sgid = BNXT_QPLIB_NUM_GIDS_SUPPORTED; + /* + * Read the max gid supported by HW. + * For each entry in HW GID in HW table, we consume 2 + * GID entries in the kernel GID table. So max_gid reported + * to stack can be up to twice the value reported by the HW, up to 256 gids. + */ + attr->max_sgid = le32_to_cpu(sb->max_gid); + attr->max_sgid = min_t(u32, BNXT_QPLIB_NUM_GIDS_SUPPORTED, 2 * attr->max_sgid); attr->dev_cap_flags = le16_to_cpu(sb->dev_cap_flags); + attr->dev_cap_flags2 = le16_to_cpu(sb->dev_cap_ext_flags_2); + + if (_is_max_srq_ext_supported(attr->dev_cap_flags2)) + attr->max_srq += le16_to_cpu(sb->max_srq_ext); bnxt_qplib_query_version(rcfw, attr->fw_ver); @@ -540,7 +562,7 @@ int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw) req.pd_id = cpu_to_le32(mrw->pd->id); req.mrw_flags = mrw->type; if ((mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR && - mrw->flags & BNXT_QPLIB_FR_PMR) || + mrw->access_flags & BNXT_QPLIB_FR_PMR) || mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2A || mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B) req.access = CMDQ_ALLOCATE_MRW_ACCESS_CONSUMER_OWNED_KEY; @@ -652,9 +674,12 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, req.log2_pbl_pg_size = cpu_to_le16(((ilog2(PAGE_SIZE) << CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_SFT) & CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_MASK)); - req.access = (mr->flags & 0xFFFF); + req.access = (mr->access_flags & 0xFFFF); req.va = cpu_to_le64(mr->va); req.key = cpu_to_le32(mr->lkey); + if (_is_alloc_mr_unified(res->dattr->dev_cap_flags)) + req.key = cpu_to_le32(mr->pd->id); + req.flags = cpu_to_le16(mr->flags); req.mr_size = cpu_to_le64(mr->total_size); bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), @@ -663,6 +688,11 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, if (rc) goto fail; + if (_is_alloc_mr_unified(res->dattr->dev_cap_flags)) { + mr->lkey = le32_to_cpu(resp.xid); + mr->rkey = mr->lkey; + } + return 0; fail: @@ -960,3 +990,151 @@ int bnxt_qplib_modify_cc(struct bnxt_qplib_res *res, rc = bnxt_qplib_rcfw_send_message(res->rcfw, &msg); return rc; } + +int bnxt_qplib_read_context(struct bnxt_qplib_rcfw *rcfw, u8 res_type, + u32 xid, u32 resp_size, void *resp_va) +{ + struct creq_read_context resp = {}; + struct bnxt_qplib_cmdqmsg msg = {}; + struct cmdq_read_context req = {}; + struct bnxt_qplib_rcfw_sbuf sbuf; + int rc; + + sbuf.size = resp_size; + sbuf.sb = dma_alloc_coherent(&rcfw->pdev->dev, sbuf.size, + &sbuf.dma_addr, GFP_KERNEL); + if (!sbuf.sb) + return -ENOMEM; + + bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, + CMDQ_BASE_OPCODE_READ_CONTEXT, sizeof(req)); + req.resp_addr = cpu_to_le64(sbuf.dma_addr); + req.resp_size = resp_size / BNXT_QPLIB_CMDQE_UNITS; + + req.xid = cpu_to_le32(xid); + req.type = res_type; + + bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, &sbuf, sizeof(req), + sizeof(resp), 0); + rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); + if (rc) + goto free_mem; + + memcpy(resp_va, sbuf.sb, resp_size); +free_mem: + dma_free_coherent(&rcfw->pdev->dev, sbuf.size, sbuf.sb, sbuf.dma_addr); + return rc; +} + +static void bnxt_qplib_read_cc_gen1(struct bnxt_qplib_cc_param_ext *cc_ext, + struct creq_query_roce_cc_gen1_resp_sb_tlv *sb) +{ + cc_ext->inact_th_hi = le16_to_cpu(sb->inactivity_th_hi); + cc_ext->min_delta_cnp = le16_to_cpu(sb->min_time_between_cnps); + cc_ext->init_cp = le16_to_cpu(sb->init_cp); + cc_ext->tr_update_mode = sb->tr_update_mode; + cc_ext->tr_update_cyls = sb->tr_update_cycles; + cc_ext->fr_rtt = sb->fr_num_rtts; + cc_ext->ai_rate_incr = sb->ai_rate_increase; + cc_ext->rr_rtt_th = le16_to_cpu(sb->reduction_relax_rtts_th); + cc_ext->ar_cr_th = le16_to_cpu(sb->additional_relax_cr_th); + cc_ext->cr_min_th = le16_to_cpu(sb->cr_min_th); + cc_ext->bw_avg_weight = sb->bw_avg_weight; + cc_ext->cr_factor = sb->actual_cr_factor; + cc_ext->cr_th_max_cp = le16_to_cpu(sb->max_cp_cr_th); + cc_ext->cp_bias_en = sb->cp_bias_en; + cc_ext->cp_bias = sb->cp_bias; + cc_ext->cnp_ecn = sb->cnp_ecn; + cc_ext->rtt_jitter_en = sb->rtt_jitter_en; + cc_ext->bytes_per_usec = le16_to_cpu(sb->link_bytes_per_usec); + cc_ext->cc_cr_reset_th = le16_to_cpu(sb->reset_cc_cr_th); + cc_ext->cr_width = sb->cr_width; + cc_ext->min_quota = sb->quota_period_min; + cc_ext->max_quota = sb->quota_period_max; + cc_ext->abs_max_quota = sb->quota_period_abs_max; + cc_ext->tr_lb = le16_to_cpu(sb->tr_lower_bound); + cc_ext->cr_prob_fac = sb->cr_prob_factor; + cc_ext->tr_prob_fac = sb->tr_prob_factor; + cc_ext->fair_cr_th = le16_to_cpu(sb->fairness_cr_th); + cc_ext->red_div = sb->red_div; + cc_ext->cnp_ratio_th = sb->cnp_ratio_th; + cc_ext->ai_ext_rtt = le16_to_cpu(sb->exp_ai_rtts); + cc_ext->exp_crcp_ratio = sb->exp_ai_cr_cp_ratio; + cc_ext->low_rate_en = sb->use_rate_table; + cc_ext->cpcr_update_th = le16_to_cpu(sb->cp_exp_update_th); + cc_ext->ai_rtt_th1 = le16_to_cpu(sb->high_exp_ai_rtts_th1); + cc_ext->ai_rtt_th2 = le16_to_cpu(sb->high_exp_ai_rtts_th2); + cc_ext->cf_rtt_th = le16_to_cpu(sb->actual_cr_cong_free_rtts_th); + cc_ext->sc_cr_th1 = le16_to_cpu(sb->severe_cong_cr_th1); + cc_ext->sc_cr_th2 = le16_to_cpu(sb->severe_cong_cr_th2); + cc_ext->l64B_per_rtt = le32_to_cpu(sb->link64B_per_rtt); + cc_ext->cc_ack_bytes = sb->cc_ack_bytes; + cc_ext->reduce_cf_rtt_th = le16_to_cpu(sb->reduce_init_cong_free_rtts_th); +} + +int bnxt_qplib_query_cc_param(struct bnxt_qplib_res *res, + struct bnxt_qplib_cc_param *cc_param) +{ + struct bnxt_qplib_tlv_query_rcc_sb *ext_sb; + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct creq_query_roce_cc_resp resp = {}; + struct creq_query_roce_cc_resp_sb *sb; + struct bnxt_qplib_cmdqmsg msg = {}; + struct cmdq_query_roce_cc req = {}; + struct bnxt_qplib_rcfw_sbuf sbuf; + size_t resp_size; + int rc; + + /* Query the parameters from chip */ + bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_QUERY_ROCE_CC, + sizeof(req)); + if (bnxt_qplib_is_chip_gen_p5_p7(res->cctx)) + resp_size = sizeof(*ext_sb); + else + resp_size = sizeof(*sb); + + sbuf.size = ALIGN(resp_size, BNXT_QPLIB_CMDQE_UNITS); + sbuf.sb = dma_alloc_coherent(&rcfw->pdev->dev, sbuf.size, + &sbuf.dma_addr, GFP_KERNEL); + if (!sbuf.sb) + return -ENOMEM; + + req.resp_size = sbuf.size / BNXT_QPLIB_CMDQE_UNITS; + bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, &sbuf, sizeof(req), + sizeof(resp), 0); + rc = bnxt_qplib_rcfw_send_message(res->rcfw, &msg); + if (rc) + goto out; + + ext_sb = sbuf.sb; + sb = bnxt_qplib_is_chip_gen_p5_p7(res->cctx) ? &ext_sb->base_sb : + (struct creq_query_roce_cc_resp_sb *)ext_sb; + + cc_param->enable = sb->enable_cc & CREQ_QUERY_ROCE_CC_RESP_SB_ENABLE_CC; + cc_param->tos_ecn = (sb->tos_dscp_tos_ecn & + CREQ_QUERY_ROCE_CC_RESP_SB_TOS_ECN_MASK) >> + CREQ_QUERY_ROCE_CC_RESP_SB_TOS_ECN_SFT; + cc_param->tos_dscp = (sb->tos_dscp_tos_ecn & + CREQ_QUERY_ROCE_CC_RESP_SB_TOS_DSCP_MASK) >> + CREQ_QUERY_ROCE_CC_RESP_SB_TOS_DSCP_SFT; + cc_param->alt_tos_dscp = sb->alt_tos_dscp; + cc_param->alt_vlan_pcp = sb->alt_vlan_pcp; + + cc_param->g = sb->g; + cc_param->nph_per_state = sb->num_phases_per_state; + cc_param->init_cr = le16_to_cpu(sb->init_cr); + cc_param->init_tr = le16_to_cpu(sb->init_tr); + cc_param->cc_mode = sb->cc_mode; + cc_param->inact_th = le16_to_cpu(sb->inactivity_th); + cc_param->rtt = le16_to_cpu(sb->rtt); + cc_param->tcp_cp = le16_to_cpu(sb->tcp_cp); + cc_param->time_pph = sb->time_per_phase; + cc_param->pkts_pph = sb->pkts_per_phase; + if (bnxt_qplib_is_chip_gen_p5_p7(res->cctx)) { + bnxt_qplib_read_cc_gen1(&cc_param->cc_ext, &ext_sb->gen1_sb); + cc_param->inact_th |= (cc_param->cc_ext.inact_th_hi & 0x3F) << 16; + } +out: + dma_free_coherent(&rcfw->pdev->dev, sbuf.size, sbuf.sb, sbuf.dma_addr); + return rc; +} diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index d33c78b96217..e626b05038a1 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -40,6 +40,7 @@ #ifndef __BNXT_QPLIB_SP_H__ #define __BNXT_QPLIB_SP_H__ +#include <rdma/bnxt_re-abi.h> #define BNXT_QPLIB_RESERVED_QP_WRS 128 struct bnxt_qplib_dev_attr { @@ -55,6 +56,7 @@ struct bnxt_qplib_dev_attr { u32 max_qp_wqes; u32 max_qp_sges; u32 max_cq; +#define BNXT_QPLIB_MAX_CQ_WQES 0xfffff u32 max_cq_wqes; u32 max_cq_sges; u32 max_mr; @@ -72,6 +74,7 @@ struct bnxt_qplib_dev_attr { u8 tqm_alloc_reqs[MAX_TQM_ALLOC_REQ]; bool is_atomic; u16 dev_cap_flags; + u16 dev_cap_flags2; u32 max_dpi; }; @@ -107,7 +110,7 @@ struct bnxt_qplib_ah { struct bnxt_qplib_mrw { struct bnxt_qplib_pd *pd; int type; - u32 flags; + u32 access_flags; #define BNXT_QPLIB_FR_PMR 0x80000000 u32 lkey; u32 rkey; @@ -115,6 +118,7 @@ struct bnxt_qplib_mrw { u64 va; u64 total_size; u32 npages; + u16 flags; u64 mr_handle; struct bnxt_qplib_hwq hwq; }; @@ -292,6 +296,7 @@ struct bnxt_qplib_cc_param_ext { struct bnxt_qplib_cc_param { u8 alt_vlan_pcp; + u8 qp1_tos_dscp; u16 alt_tos_dscp; u8 cc_mode; u8 enable; @@ -321,8 +326,7 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, int bnxt_qplib_update_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, struct bnxt_qplib_gid *gid, u16 gid_idx, const u8 *smac); -int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_dev_attr *attr); +int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw); int bnxt_qplib_set_func_resources(struct bnxt_qplib_res *res, struct bnxt_qplib_rcfw *rcfw, struct bnxt_qplib_ctx *ctx); @@ -349,5 +353,16 @@ int bnxt_qplib_qext_stat(struct bnxt_qplib_rcfw *rcfw, u32 fid, struct bnxt_qplib_ext_stat *estat); int bnxt_qplib_modify_cc(struct bnxt_qplib_res *res, struct bnxt_qplib_cc_param *cc_param); +int bnxt_qplib_read_context(struct bnxt_qplib_rcfw *rcfw, u8 type, u32 xid, + u32 resp_size, void *resp_va); +int bnxt_qplib_query_cc_param(struct bnxt_qplib_res *res, + struct bnxt_qplib_cc_param *cc_param); + +#define BNXT_VAR_MAX_WQE 4352 +#define BNXT_VAR_MAX_SLOT_ALIGN 256 +#define BNXT_VAR_MAX_SGE 13 +#define BNXT_RE_MAX_RQ_WQES 65536 + +#define BNXT_STATIC_MAX_SGE 6 #endif /* __BNXT_QPLIB_SP_H__*/ diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index 605c9463c408..7eceb3e9f4ce 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -216,6 +216,8 @@ struct cmdq_initialize_fw { __le16 flags; #define CMDQ_INITIALIZE_FW_FLAGS_MRAV_RESERVATION_SPLIT 0x1UL #define CMDQ_INITIALIZE_FW_FLAGS_HW_REQUESTER_RETX_SUPPORTED 0x2UL + #define CMDQ_INITIALIZE_FW_FLAGS_OPTIMIZE_MODIFY_QP_SUPPORTED 0x8UL + #define CMDQ_INITIALIZE_FW_FLAGS_L2_VF_RESOURCE_MGMT 0x10UL __le16 cookie; u8 resp_size; u8 reserved8; @@ -409,7 +411,7 @@ struct creq_deinitialize_fw_resp { u8 reserved48[6]; }; -/* cmdq_create_qp (size:768b/96B) */ +/* cmdq_create_qp (size:832b/104B) */ struct cmdq_create_qp { u8 opcode; #define CMDQ_CREATE_QP_OPCODE_CREATE_QP 0x1UL @@ -430,8 +432,11 @@ struct cmdq_create_qp { #define CMDQ_CREATE_QP_QP_FLAGS_OPTIMIZED_TRANSMIT_ENABLED 0x20UL #define CMDQ_CREATE_QP_QP_FLAGS_RESPONDER_UD_CQE_WITH_CFA 0x40UL #define CMDQ_CREATE_QP_QP_FLAGS_EXT_STATS_ENABLED 0x80UL + #define CMDQ_CREATE_QP_QP_FLAGS_EXPRESS_MODE_ENABLED 0x100UL + #define CMDQ_CREATE_QP_QP_FLAGS_STEERING_TAG_VALID 0x200UL + #define CMDQ_CREATE_QP_QP_FLAGS_RDMA_READ_OR_ATOMICS_USED 0x400UL #define CMDQ_CREATE_QP_QP_FLAGS_LAST \ - CMDQ_CREATE_QP_QP_FLAGS_EXT_STATS_ENABLED + CMDQ_CREATE_QP_QP_FLAGS_RDMA_READ_OR_ATOMICS_USED u8 type; #define CMDQ_CREATE_QP_TYPE_RC 0x2UL #define CMDQ_CREATE_QP_TYPE_UD 0x4UL @@ -492,6 +497,9 @@ struct cmdq_create_qp { __le64 rq_pbl; __le64 irrq_addr; __le64 orrq_addr; + __le32 request_xid; + __le16 steering_tag; + __le16 reserved16; }; /* creq_create_qp_resp (size:128b/16B) */ @@ -553,6 +561,7 @@ struct cmdq_modify_qp { #define CMDQ_MODIFY_QP_OPCODE_LAST CMDQ_MODIFY_QP_OPCODE_MODIFY_QP u8 cmd_size; __le16 flags; + #define CMDQ_MODIFY_QP_FLAGS_SRQ_USED 0x1UL __le16 cookie; u8 resp_size; u8 qp_type; @@ -972,13 +981,14 @@ struct creq_query_qp_extend_resp_sb_tlv { __le16 reserved_16; }; -/* cmdq_create_srq (size:384b/48B) */ +/* cmdq_create_srq (size:448b/56B) */ struct cmdq_create_srq { u8 opcode; #define CMDQ_CREATE_SRQ_OPCODE_CREATE_SRQ 0x5UL #define CMDQ_CREATE_SRQ_OPCODE_LAST CMDQ_CREATE_SRQ_OPCODE_CREATE_SRQ u8 cmd_size; __le16 flags; + #define CMDQ_CREATE_SRQ_FLAGS_STEERING_TAG_VALID 0x1UL __le16 cookie; u8 resp_size; u8 reserved8; @@ -1012,6 +1022,8 @@ struct cmdq_create_srq { __le32 dpi; __le32 pd_id; __le64 pbl; + __le16 steering_tag; + u8 reserved48[6]; }; /* creq_create_srq_resp (size:128b/16B) */ @@ -1118,7 +1130,7 @@ struct creq_query_srq_resp_sb { __le32 data[4]; }; -/* cmdq_create_cq (size:384b/48B) */ +/* cmdq_create_cq (size:448b/56B) */ struct cmdq_create_cq { u8 opcode; #define CMDQ_CREATE_CQ_OPCODE_CREATE_CQ 0x9UL @@ -1126,6 +1138,9 @@ struct cmdq_create_cq { u8 cmd_size; __le16 flags; #define CMDQ_CREATE_CQ_FLAGS_DISABLE_CQ_OVERFLOW_DETECTION 0x1UL + #define CMDQ_CREATE_CQ_FLAGS_STEERING_TAG_VALID 0x2UL + #define CMDQ_CREATE_CQ_FLAGS_INFINITE_CQ_MODE 0x4UL + #define CMDQ_CREATE_CQ_FLAGS_COALESCING_VALID 0x8UL __le16 cookie; u8 resp_size; u8 reserved8; @@ -1157,6 +1172,19 @@ struct cmdq_create_cq { __le32 dpi; __le32 cq_size; __le64 pbl; + __le16 steering_tag; + u8 reserved48[2]; + __le32 coalescing; + #define CMDQ_CREATE_CQ_BUF_MAXTIME_MASK 0x1ffUL + #define CMDQ_CREATE_CQ_BUF_MAXTIME_SFT 0 + #define CMDQ_CREATE_CQ_NORMAL_MAXBUF_MASK 0x3e00UL + #define CMDQ_CREATE_CQ_NORMAL_MAXBUF_SFT 9 + #define CMDQ_CREATE_CQ_DURING_MAXBUF_MASK 0x7c000UL + #define CMDQ_CREATE_CQ_DURING_MAXBUF_SFT 14 + #define CMDQ_CREATE_CQ_ENABLE_RING_IDLE_MODE 0x80000UL + #define CMDQ_CREATE_CQ_UNUSED12_MASK 0xfff00000UL + #define CMDQ_CREATE_CQ_UNUSED12_SFT 20 + __le64 reserved64; }; /* creq_create_cq_resp (size:128b/16B) */ @@ -1288,11 +1316,12 @@ struct cmdq_allocate_mrw { #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2A 0x3UL #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B 0x4UL #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_LAST CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B - #define CMDQ_ALLOCATE_MRW_UNUSED4_MASK 0xf0UL - #define CMDQ_ALLOCATE_MRW_UNUSED4_SFT 4 + #define CMDQ_ALLOCATE_MRW_STEERING_TAG_VALID 0x10UL + #define CMDQ_ALLOCATE_MRW_UNUSED4_MASK 0xe0UL + #define CMDQ_ALLOCATE_MRW_UNUSED4_SFT 5 u8 access; #define CMDQ_ALLOCATE_MRW_ACCESS_CONSUMER_OWNED_KEY 0x20UL - __le16 unused16; + __le16 steering_tag; __le32 pd_id; }; @@ -1359,14 +1388,16 @@ struct creq_deallocate_key_resp { __le32 bound_window_info; }; -/* cmdq_register_mr (size:384b/48B) */ +/* cmdq_register_mr (size:448b/56B) */ struct cmdq_register_mr { u8 opcode; #define CMDQ_REGISTER_MR_OPCODE_REGISTER_MR 0xfUL #define CMDQ_REGISTER_MR_OPCODE_LAST CMDQ_REGISTER_MR_OPCODE_REGISTER_MR u8 cmd_size; __le16 flags; - #define CMDQ_REGISTER_MR_FLAGS_ALLOC_MR 0x1UL + #define CMDQ_REGISTER_MR_FLAGS_ALLOC_MR 0x1UL + #define CMDQ_REGISTER_MR_FLAGS_STEERING_TAG_VALID 0x2UL + #define CMDQ_REGISTER_MR_FLAGS_ENABLE_RO 0x4UL __le16 cookie; u8 resp_size; u8 reserved8; @@ -1415,6 +1446,8 @@ struct cmdq_register_mr { __le64 pbl; __le64 va; __le64 mr_size; + __le16 steering_tag; + u8 reserved48[6]; }; /* creq_register_mr_resp (size:128b/16B) */ @@ -2157,8 +2190,38 @@ struct creq_query_func_resp_sb { __le32 tqm_alloc_reqs[12]; __le32 max_dpi; u8 max_sge_var_wqe; - u8 reserved_8; + u8 dev_cap_ext_flags; + #define CREQ_QUERY_FUNC_RESP_SB_ATOMIC_OPS_NOT_SUPPORTED 0x1UL + #define CREQ_QUERY_FUNC_RESP_SB_DRV_VERSION_RGTR_SUPPORTED 0x2UL + #define CREQ_QUERY_FUNC_RESP_SB_CREATE_QP_BATCH_SUPPORTED 0x4UL + #define CREQ_QUERY_FUNC_RESP_SB_DESTROY_QP_BATCH_SUPPORTED 0x8UL + #define CREQ_QUERY_FUNC_RESP_SB_ROCE_STATS_EXT_CTX_SUPPORTED 0x10UL + #define CREQ_QUERY_FUNC_RESP_SB_CREATE_SRQ_SGE_SUPPORTED 0x20UL + #define CREQ_QUERY_FUNC_RESP_SB_FIXED_SIZE_WQE_DISABLED 0x40UL + #define CREQ_QUERY_FUNC_RESP_SB_DCN_SUPPORTED 0x80UL __le16 max_inline_data_var_wqe; + __le32 start_qid; + u8 max_msn_table_size; + u8 reserved8_1; + __le16 dev_cap_ext_flags_2; + #define CREQ_QUERY_FUNC_RESP_SB_OPTIMIZE_MODIFY_QP_SUPPORTED 0x1UL + #define CREQ_QUERY_FUNC_RESP_SB_CHANGE_UDP_SRC_PORT_WQE_SUPPORTED 0x2UL + #define CREQ_QUERY_FUNC_RESP_SB_CQ_COALESCING_SUPPORTED 0x4UL + #define CREQ_QUERY_FUNC_RESP_SB_MEMORY_REGION_RO_SUPPORTED 0x8UL + #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_MASK 0x30UL + #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_SFT 4 + #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_HOST_PSN_TABLE (0x0UL << 4) + #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_HOST_MSN_TABLE (0x1UL << 4) + #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_IQM_MSN_TABLE (0x2UL << 4) + #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_LAST \ + CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_IQM_MSN_TABLE + #define CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED 0x40UL + #define CREQ_QUERY_FUNC_RESP_SB_MIN_RNR_RTR_RTS_OPT_SUPPORTED 0x1000UL + __le16 max_xp_qp_size; + __le16 create_qp_batch_size; + __le16 destroy_qp_batch_size; + __le16 max_srq_ext; + __le64 reserved64; }; /* cmdq_set_func_resources (size:448b/56B) */ @@ -2205,6 +2268,46 @@ struct creq_set_func_resources_resp { u8 reserved48[6]; }; +/* cmdq_read_context (size:192b/24B) */ +struct cmdq_read_context { + u8 opcode; + #define CMDQ_READ_CONTEXT_OPCODE_READ_CONTEXT 0x85UL + #define CMDQ_READ_CONTEXT_OPCODE_LAST CMDQ_READ_CONTEXT_OPCODE_READ_CONTEXT + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 xid; + u8 type; + #define CMDQ_READ_CONTEXT_TYPE_QPC 0x0UL + #define CMDQ_READ_CONTEXT_TYPE_CQ 0x1UL + #define CMDQ_READ_CONTEXT_TYPE_MRW 0x2UL + #define CMDQ_READ_CONTEXT_TYPE_SRQ 0x3UL + #define CMDQ_READ_CONTEXT_TYPE_LAST CMDQ_READ_CONTEXT_TYPE_SRQ + u8 unused_0[3]; +}; + +/* creq_read_context (size:128b/16B) */ +struct creq_read_context { + u8 type; + #define CREQ_READ_CONTEXT_TYPE_MASK 0x3fUL + #define CREQ_READ_CONTEXT_TYPE_SFT 0 + #define CREQ_READ_CONTEXT_TYPE_QP_EVENT 0x38UL + #define CREQ_READ_CONTEXT_TYPE_LAST CREQ_READ_CONTEXT_TYPE_QP_EVENT + u8 status; + __le16 cookie; + __le32 reserved32; + u8 v; + #define CREQ_READ_CONTEXT_V 0x1UL + u8 event; + #define CREQ_READ_CONTEXT_EVENT_READ_CONTEXT 0x85UL + #define CREQ_READ_CONTEXT_EVENT_LAST CREQ_READ_CONTEXT_EVENT_READ_CONTEXT + __le16 reserved16; + __le32 reserved_32; +}; + /* cmdq_map_tc_to_cos (size:192b/24B) */ struct cmdq_map_tc_to_cos { u8 opcode; diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 040ba2224f9f..8d753e6e0c71 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1222,6 +1222,8 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) int ret; ep = lookup_atid(t, atid); + if (!ep) + return -EINVAL; pr_debug("ep %p tid %u snd_isn %u rcv_isn %u\n", ep, tid, be32_to_cpu(req->snd_isn), be32_to_cpu(req->rcv_isn)); @@ -2084,7 +2086,7 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, err = -ENOMEM; if (n->dev->flags & IFF_LOOPBACK) { if (iptype == 4) - pdev = ip_dev_find(&init_net, *(__be32 *)peer_ip); + pdev = __ip_dev_find(&init_net, *(__be32 *)peer_ip, false); else if (IS_ENABLED(CONFIG_IPV6)) for_each_netdev(&init_net, pdev) { if (ipv6_chk_addr(&init_net, @@ -2099,12 +2101,12 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, err = -ENODEV; goto out; } + if (is_vlan_dev(pdev)) + pdev = vlan_dev_real_dev(pdev); ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, n, pdev, rt_tos2priority(tos)); - if (!ep->l2t) { - dev_put(pdev); + if (!ep->l2t) goto out; - } ep->mtu = pdev->mtu; ep->tx_chan = cxgb4_port_chan(pdev); ep->smac_idx = ((struct port_info *)netdev_priv(pdev))->smt_idx; @@ -2117,7 +2119,6 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, ep->rss_qid = cdev->rdev.lldi.rxq_ids[ cxgb4_port_idx(pdev) * step]; set_tcp_window(ep, (struct port_info *)netdev_priv(pdev)); - dev_put(pdev); } else { pdev = get_real_dev(n->dev); ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, @@ -2279,6 +2280,9 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) int ret = 0; ep = lookup_atid(t, atid); + if (!ep) + return -EINVAL; + la = (struct sockaddr_in *)&ep->com.local_addr; ra = (struct sockaddr_in *)&ep->com.remote_addr; la6 = (struct sockaddr_in6 *)&ep->com.local_addr; diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 7e2835dcbc1c..14ced7b667fa 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -995,8 +995,9 @@ int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) } int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; int vector = attr->comp_vector; @@ -1125,13 +1126,19 @@ int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, goto err_free_mm2; mm->key = uresp.key; - mm->addr = virt_to_phys(chp->cq.queue); + mm->addr = 0; + mm->vaddr = chp->cq.queue; + mm->dma_addr = chp->cq.dma_addr; mm->len = chp->cq.memsize; + insert_flag_to_mmap(&rhp->rdev, mm, mm->addr); insert_mmap(ucontext, mm); mm2->key = uresp.gts_key; mm2->addr = chp->cq.bar2_pa; mm2->len = PAGE_SIZE; + mm2->vaddr = NULL; + mm2->dma_addr = 0; + insert_flag_to_mmap(&rhp->rdev, mm2, mm2->addr); insert_mmap(ucontext, mm2); } diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 80970a1738f8..034b85c42255 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -1114,8 +1114,10 @@ static inline struct sk_buff *copy_gl_to_skb_pkt(const struct pkt_gl *gl, * The math here assumes sizeof cpl_pass_accept_req >= sizeof * cpl_rx_pkt. */ - skb = alloc_skb(gl->tot_len + sizeof(struct cpl_pass_accept_req) + - sizeof(struct rss_header) - pktshift, GFP_ATOMIC); + skb = alloc_skb(size_add(gl->tot_len, + sizeof(struct cpl_pass_accept_req) + + sizeof(struct rss_header)) - pktshift, + GFP_ATOMIC); if (unlikely(!skb)) return NULL; diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 50cb2259bf87..5b3007acaa1f 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -532,11 +532,21 @@ static inline struct c4iw_ucontext *to_c4iw_ucontext(struct ib_ucontext *c) return container_of(c, struct c4iw_ucontext, ibucontext); } +enum { + CXGB4_MMAP_BAR, + CXGB4_MMAP_BAR_WC, + CXGB4_MMAP_CONTIG, + CXGB4_MMAP_NON_CONTIG, +}; + struct c4iw_mm_entry { struct list_head entry; u64 addr; u32 key; + void *vaddr; + dma_addr_t dma_addr; unsigned len; + u8 mmap_flag; }; static inline struct c4iw_mm_entry *remove_mmap(struct c4iw_ucontext *ucontext, @@ -561,6 +571,32 @@ static inline struct c4iw_mm_entry *remove_mmap(struct c4iw_ucontext *ucontext, return NULL; } +static inline void insert_flag_to_mmap(struct c4iw_rdev *rdev, + struct c4iw_mm_entry *mm, u64 addr) +{ + if (addr >= pci_resource_start(rdev->lldi.pdev, 0) && + (addr < (pci_resource_start(rdev->lldi.pdev, 0) + + pci_resource_len(rdev->lldi.pdev, 0)))) + mm->mmap_flag = CXGB4_MMAP_BAR; + else if (addr >= pci_resource_start(rdev->lldi.pdev, 2) && + (addr < (pci_resource_start(rdev->lldi.pdev, 2) + + pci_resource_len(rdev->lldi.pdev, 2)))) { + if (addr >= rdev->oc_mw_pa) { + mm->mmap_flag = CXGB4_MMAP_BAR_WC; + } else { + if (is_t4(rdev->lldi.adapter_type)) + mm->mmap_flag = CXGB4_MMAP_BAR; + else + mm->mmap_flag = CXGB4_MMAP_BAR_WC; + } + } else { + if (addr) + mm->mmap_flag = CXGB4_MMAP_CONTIG; + else + mm->mmap_flag = CXGB4_MMAP_NON_CONTIG; + } +} + static inline void insert_mmap(struct c4iw_ucontext *ucontext, struct c4iw_mm_entry *mm) { @@ -930,15 +966,12 @@ void c4iw_id_table_free(struct c4iw_id_table *alloc); typedef int (*c4iw_handler_func)(struct c4iw_dev *dev, struct sk_buff *skb); -int c4iw_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new, - struct l2t_entry *l2t); void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qpid, struct c4iw_dev_ucontext *uctx); u32 c4iw_get_resource(struct c4iw_id_table *id_table); void c4iw_put_resource(struct c4iw_id_table *id_table, u32 entry); int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid, u32 nr_srqt); -int c4iw_init_ctrl_qp(struct c4iw_rdev *rdev); int c4iw_pblpool_create(struct c4iw_rdev *rdev); int c4iw_rqtpool_create(struct c4iw_rdev *rdev); int c4iw_ocqp_pool_create(struct c4iw_rdev *rdev); @@ -946,7 +979,6 @@ void c4iw_pblpool_destroy(struct c4iw_rdev *rdev); void c4iw_rqtpool_destroy(struct c4iw_rdev *rdev); void c4iw_ocqp_pool_destroy(struct c4iw_rdev *rdev); void c4iw_destroy_resource(struct c4iw_resource *rscp); -int c4iw_destroy_ctrl_qp(struct c4iw_rdev *rdev); void c4iw_register_device(struct work_struct *work); void c4iw_unregister_device(struct c4iw_dev *dev); int __init c4iw_cm_init(void); @@ -980,7 +1012,7 @@ int c4iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata); int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata); void c4iw_cq_rem_ref(struct c4iw_cq *chp); int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); int c4iw_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *attr, enum ib_srq_attr_mask srq_attr_mask, @@ -1008,8 +1040,6 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp); int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count); int c4iw_flush_sq(struct c4iw_qp *qhp); int c4iw_ev_handler(struct c4iw_dev *rnicp, u32 qid); -u16 c4iw_rqes_posted(struct c4iw_qp *qhp); -int c4iw_post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe); u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx); void c4iw_put_cqid(struct c4iw_rdev *rdev, u32 qid, struct c4iw_dev_ucontext *uctx); diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 246b739ddb2b..e059f92d90fd 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -113,6 +113,9 @@ static int c4iw_alloc_ucontext(struct ib_ucontext *ucontext, mm->key = uresp.status_page_key; mm->addr = virt_to_phys(rhp->rdev.status_page); mm->len = PAGE_SIZE; + mm->vaddr = NULL; + mm->dma_addr = 0; + insert_flag_to_mmap(&rhp->rdev, mm, mm->addr); insert_mmap(context, mm); } return 0; @@ -131,6 +134,11 @@ static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) struct c4iw_mm_entry *mm; struct c4iw_ucontext *ucontext; u64 addr; + u8 mmap_flag; + size_t size; + void *vaddr; + unsigned long vm_pgoff; + dma_addr_t dma_addr; pr_debug("pgoff 0x%lx key 0x%x len %d\n", vma->vm_pgoff, key, len); @@ -145,47 +153,38 @@ static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) if (!mm) return -EINVAL; addr = mm->addr; + vaddr = mm->vaddr; + dma_addr = mm->dma_addr; + size = mm->len; + mmap_flag = mm->mmap_flag; kfree(mm); - if ((addr >= pci_resource_start(rdev->lldi.pdev, 0)) && - (addr < (pci_resource_start(rdev->lldi.pdev, 0) + - pci_resource_len(rdev->lldi.pdev, 0)))) { - - /* - * MA_SYNC register... - */ - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + switch (mmap_flag) { + case CXGB4_MMAP_BAR: + ret = io_remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT, + len, + pgprot_noncached(vma->vm_page_prot)); + break; + case CXGB4_MMAP_BAR_WC: ret = io_remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT, - len, vma->vm_page_prot); - } else if ((addr >= pci_resource_start(rdev->lldi.pdev, 2)) && - (addr < (pci_resource_start(rdev->lldi.pdev, 2) + - pci_resource_len(rdev->lldi.pdev, 2)))) { - - /* - * Map user DB or OCQP memory... - */ - if (addr >= rdev->oc_mw_pa) - vma->vm_page_prot = t4_pgprot_wc(vma->vm_page_prot); - else { - if (!is_t4(rdev->lldi.adapter_type)) - vma->vm_page_prot = - t4_pgprot_wc(vma->vm_page_prot); - else - vma->vm_page_prot = - pgprot_noncached(vma->vm_page_prot); - } + len, t4_pgprot_wc(vma->vm_page_prot)); + break; + case CXGB4_MMAP_CONTIG: ret = io_remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT, len, vma->vm_page_prot); - } else { - - /* - * Map WQ or CQ contig dma memory... - */ - ret = remap_pfn_range(vma, vma->vm_start, - addr >> PAGE_SHIFT, - len, vma->vm_page_prot); + break; + case CXGB4_MMAP_NON_CONTIG: + vm_pgoff = vma->vm_pgoff; + vma->vm_pgoff = 0; + ret = dma_mmap_coherent(&rdev->lldi.pdev->dev, vma, + vaddr, dma_addr, size); + vma->vm_pgoff = vm_pgoff; + break; + default: + ret = -EINVAL; + break; } return ret; @@ -474,6 +473,7 @@ static const struct ib_device_ops c4iw_dev_ops = { .fill_res_cq_entry = c4iw_fill_res_cq_entry, .fill_res_cm_id_entry = c4iw_fill_res_cm_id_entry, .fill_res_mr_entry = c4iw_fill_res_mr_entry, + .fill_res_qp_entry = c4iw_fill_res_qp_entry, .get_dev_fw_str = get_dev_fw_str, .get_dma_mr = c4iw_get_dma_mr, .get_hw_stats = c4iw_get_mib, diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index d16d8eaa1415..955f061a55e9 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -1599,6 +1599,7 @@ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, int count; int rq_flushed = 0, sq_flushed; unsigned long flag; + struct ib_event ev; pr_debug("qhp %p rchp %p schp %p\n", qhp, rchp, schp); @@ -1607,6 +1608,13 @@ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, if (schp != rchp) spin_lock(&schp->lock); spin_lock(&qhp->lock); + if (qhp->srq && qhp->attr.state == C4IW_QP_STATE_ERROR && + qhp->ibqp.event_handler) { + ev.device = qhp->ibqp.device; + ev.element.qp = &qhp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qhp->ibqp.event_handler(&ev, qhp->ibqp.qp_context); + } if (qhp->wq.flushed) { spin_unlock(&qhp->lock); @@ -2281,24 +2289,39 @@ int c4iw_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *attrs, if (ret) goto err_free_ma_sync_key; sq_key_mm->key = uresp.sq_key; - sq_key_mm->addr = qhp->wq.sq.phys_addr; + sq_key_mm->addr = 0; + sq_key_mm->vaddr = qhp->wq.sq.queue; + sq_key_mm->dma_addr = qhp->wq.sq.dma_addr; sq_key_mm->len = PAGE_ALIGN(qhp->wq.sq.memsize); + insert_flag_to_mmap(&rhp->rdev, sq_key_mm, sq_key_mm->addr); insert_mmap(ucontext, sq_key_mm); if (!attrs->srq) { rq_key_mm->key = uresp.rq_key; - rq_key_mm->addr = virt_to_phys(qhp->wq.rq.queue); + rq_key_mm->addr = 0; + rq_key_mm->vaddr = qhp->wq.rq.queue; + rq_key_mm->dma_addr = qhp->wq.rq.dma_addr; rq_key_mm->len = PAGE_ALIGN(qhp->wq.rq.memsize); + insert_flag_to_mmap(&rhp->rdev, rq_key_mm, + rq_key_mm->addr); insert_mmap(ucontext, rq_key_mm); } sq_db_key_mm->key = uresp.sq_db_gts_key; sq_db_key_mm->addr = (u64)(unsigned long)qhp->wq.sq.bar2_pa; + sq_db_key_mm->vaddr = NULL; + sq_db_key_mm->dma_addr = 0; sq_db_key_mm->len = PAGE_SIZE; + insert_flag_to_mmap(&rhp->rdev, sq_db_key_mm, + sq_db_key_mm->addr); insert_mmap(ucontext, sq_db_key_mm); if (!attrs->srq) { rq_db_key_mm->key = uresp.rq_db_gts_key; rq_db_key_mm->addr = (u64)(unsigned long)qhp->wq.rq.bar2_pa; rq_db_key_mm->len = PAGE_SIZE; + rq_db_key_mm->vaddr = NULL; + rq_db_key_mm->dma_addr = 0; + insert_flag_to_mmap(&rhp->rdev, rq_db_key_mm, + rq_db_key_mm->addr); insert_mmap(ucontext, rq_db_key_mm); } if (ma_sync_key_mm) { @@ -2307,6 +2330,10 @@ int c4iw_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *attrs, (pci_resource_start(rhp->rdev.lldi.pdev, 0) + PCIE_MA_SYNC_A) & PAGE_MASK; ma_sync_key_mm->len = PAGE_SIZE; + ma_sync_key_mm->vaddr = NULL; + ma_sync_key_mm->dma_addr = 0; + insert_flag_to_mmap(&rhp->rdev, ma_sync_key_mm, + ma_sync_key_mm->addr); insert_mmap(ucontext, ma_sync_key_mm); } @@ -2761,12 +2788,19 @@ int c4iw_create_srq(struct ib_srq *ib_srq, struct ib_srq_init_attr *attrs, if (ret) goto err_free_srq_db_key_mm; srq_key_mm->key = uresp.srq_key; - srq_key_mm->addr = virt_to_phys(srq->wq.queue); + srq_key_mm->addr = 0; srq_key_mm->len = PAGE_ALIGN(srq->wq.memsize); + srq_key_mm->vaddr = srq->wq.queue; + srq_key_mm->dma_addr = srq->wq.dma_addr; + insert_flag_to_mmap(&rhp->rdev, srq_key_mm, srq_key_mm->addr); insert_mmap(ucontext, srq_key_mm); srq_db_key_mm->key = uresp.srq_db_gts_key; srq_db_key_mm->addr = (u64)(unsigned long)srq->wq.bar2_pa; srq_db_key_mm->len = PAGE_SIZE; + srq_db_key_mm->vaddr = NULL; + srq_db_key_mm->dma_addr = 0; + insert_flag_to_mmap(&rhp->rdev, srq_db_key_mm, + srq_db_key_mm->addr); insert_mmap(ucontext, srq_db_key_mm); } diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h index e2bdec32ae80..838182d0409c 100644 --- a/drivers/infiniband/hw/efa/efa.h +++ b/drivers/infiniband/hw/efa/efa.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_H_ @@ -57,14 +57,15 @@ struct efa_dev { u64 db_bar_addr; u64 db_bar_len; - int admin_msix_vector_idx; + u32 num_irq_vectors; + u32 admin_msix_vector_idx; struct efa_irq admin_irq; struct efa_stats stats; /* Array of completion EQs */ struct efa_eq *eqs; - unsigned int neqs; + u32 neqs; /* Only stores CQs with interrupts enabled */ struct xarray cqs_xa; @@ -160,14 +161,14 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, struct ib_udata *udata); int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); int efa_get_port_immutable(struct ib_device *ibdev, u32 port_num, struct ib_port_immutable *immutable); diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h index 7377c8a9f4d5..fe0b6aec7839 100644 --- a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h +++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h @@ -30,7 +30,8 @@ enum efa_admin_aq_opcode { EFA_ADMIN_DEALLOC_UAR = 17, EFA_ADMIN_CREATE_EQ = 18, EFA_ADMIN_DESTROY_EQ = 19, - EFA_ADMIN_MAX_OPCODE = 19, + EFA_ADMIN_ALLOC_MR = 20, + EFA_ADMIN_MAX_OPCODE = 20, }; enum efa_admin_aq_feature_id { @@ -110,7 +111,10 @@ struct efa_admin_create_qp_cmd { * virtual (IOVA returned by MR registration) * 1 : rq_virt - If set, RQ ring base address is * virtual (IOVA returned by MR registration) - * 7:2 : reserved - MBZ + * 2 : unsolicited_write_recv - If set, work requests + * will not be consumed for incoming RDMA write with + * immediate + * 7:3 : reserved - MBZ */ u8 flags; @@ -147,8 +151,11 @@ struct efa_admin_create_qp_cmd { /* UAR number */ u16 uar; + /* Requested service level for the QP, 0 is the default SL */ + u8 sl; + /* MBZ */ - u16 reserved; + u8 reserved; /* MBZ */ u32 reserved2; @@ -456,6 +463,41 @@ struct efa_admin_dereg_mr_resp { struct efa_admin_acq_common_desc acq_common_desc; }; +/* + * Allocation of MemoryRegion, required for QP working with Virtual + * Addresses in kernel verbs semantics, ready for fast registration use. + */ +struct efa_admin_alloc_mr_cmd { + /* Common Admin Queue descriptor */ + struct efa_admin_aq_common_desc aq_common_desc; + + /* Protection Domain */ + u16 pd; + + /* MBZ */ + u16 reserved1; + + /* Maximum number of pages this MR supports. */ + u32 max_pages; +}; + +struct efa_admin_alloc_mr_resp { + /* Common Admin Queue completion descriptor */ + struct efa_admin_acq_common_desc acq_common_desc; + + /* + * L_Key, to be used in conjunction with local buffer references in + * SQ and RQ WQE, or with virtual RQ/CQ rings + */ + u32 l_key; + + /* + * R_Key, to be used in RDMA messages to refer to remotely accessed + * memory region + */ + u32 r_key; +}; + struct efa_admin_create_cq_cmd { struct efa_admin_aq_common_desc aq_common_desc; @@ -480,8 +522,8 @@ struct efa_admin_create_cq_cmd { */ u8 cq_caps_2; - /* completion queue depth in # of entries. must be power of 2 */ - u16 cq_depth; + /* Sub completion queue depth in # of entries. must be power of 2 */ + u16 sub_cq_depth; /* EQ number assigned to this cq */ u16 eqn; @@ -516,8 +558,8 @@ struct efa_admin_create_cq_resp { u16 cq_idx; - /* actual cq depth in number of entries */ - u16 cq_actual_depth; + /* actual sub cq depth in number of entries */ + u16 sub_cq_actual_depth; /* CQ doorbell address, as offset to PCIe DB BAR */ u32 db_offset; @@ -575,6 +617,8 @@ struct efa_admin_basic_stats { u64 rx_pkts; u64 rx_drops; + + u64 qkey_viol; }; struct efa_admin_messages_stats { @@ -663,12 +707,26 @@ struct efa_admin_feature_device_attr_desc { * polling is supported * 3 : rdma_write - If set, RDMA Write is supported * on TX queues - * 31:4 : reserved - MBZ + * 4 : unsolicited_write_recv - If set, unsolicited + * write with imm. receive is supported + * 31:5 : reserved - MBZ */ u32 device_caps; /* Max RDMA transfer size in bytes */ u32 max_rdma_size; + + /* Unique global ID for an EFA device */ + u64 guid; + + /* The device maximum link speed in Gbit/sec */ + u16 max_link_speed_gbps; + + /* MBZ */ + u16 reserved0; + + /* MBZ */ + u32 reserved1; }; struct efa_admin_feature_queue_attr_desc { @@ -1009,6 +1067,7 @@ struct efa_admin_host_info { /* create_qp_cmd */ #define EFA_ADMIN_CREATE_QP_CMD_SQ_VIRT_MASK BIT(0) #define EFA_ADMIN_CREATE_QP_CMD_RQ_VIRT_MASK BIT(1) +#define EFA_ADMIN_CREATE_QP_CMD_UNSOLICITED_WRITE_RECV_MASK BIT(2) /* modify_qp_cmd */ #define EFA_ADMIN_MODIFY_QP_CMD_QP_STATE_MASK BIT(0) @@ -1044,10 +1103,10 @@ struct efa_admin_host_info { #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RNR_RETRY_MASK BIT(1) #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_DATA_POLLING_128_MASK BIT(2) #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_WRITE_MASK BIT(3) +#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_UNSOLICITED_WRITE_RECV_MASK BIT(4) /* create_eq_cmd */ #define EFA_ADMIN_CREATE_EQ_CMD_ENTRY_SIZE_WORDS_MASK GENMASK(4, 0) -#define EFA_ADMIN_CREATE_EQ_CMD_VIRT_MASK BIT(6) #define EFA_ADMIN_CREATE_EQ_CMD_COMPLETION_EVENTS_MASK BIT(0) /* host_info */ diff --git a/drivers/infiniband/hw/efa/efa_admin_defs.h b/drivers/infiniband/hw/efa/efa_admin_defs.h index 83f20c38a840..35700c93e639 100644 --- a/drivers/infiniband/hw/efa/efa_admin_defs.h +++ b/drivers/infiniband/hw/efa/efa_admin_defs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_ADMIN_H_ @@ -96,7 +96,7 @@ struct efa_admin_acq_entry { struct efa_admin_aenq_common_desc { u16 group; - u16 syndrom; + u16 syndrome; /* * 0 : phase diff --git a/drivers/infiniband/hw/efa/efa_com.c b/drivers/infiniband/hw/efa/efa_com.c index 16a24a05fc2a..bafd210dd43e 100644 --- a/drivers/infiniband/hw/efa/efa_com.c +++ b/drivers/infiniband/hw/efa/efa_com.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "efa_com.h" @@ -406,8 +406,8 @@ static struct efa_comp_ctx *efa_com_submit_admin_cmd(struct efa_com_admin_queue return comp_ctx; } -static void efa_com_handle_single_admin_completion(struct efa_com_admin_queue *aq, - struct efa_admin_acq_entry *cqe) +static int efa_com_handle_single_admin_completion(struct efa_com_admin_queue *aq, + struct efa_admin_acq_entry *cqe) { struct efa_comp_ctx *comp_ctx; u16 cmd_id; @@ -416,11 +416,11 @@ static void efa_com_handle_single_admin_completion(struct efa_com_admin_queue *a EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID); comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, false); - if (!comp_ctx) { + if (comp_ctx->status != EFA_CMD_SUBMITTED) { ibdev_err(aq->efa_dev, - "comp_ctx is NULL. Changing the admin queue running state\n"); - clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); - return; + "Received completion with unexpected command id[%d], sq producer: %d, sq consumer: %d, cq consumer: %d\n", + cmd_id, aq->sq.pc, aq->sq.cc, aq->cq.cc); + return -EINVAL; } comp_ctx->status = EFA_CMD_COMPLETED; @@ -428,14 +428,17 @@ static void efa_com_handle_single_admin_completion(struct efa_com_admin_queue *a if (!test_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state)) complete(&comp_ctx->wait_event); + + return 0; } static void efa_com_handle_admin_completion(struct efa_com_admin_queue *aq) { struct efa_admin_acq_entry *cqe; u16 queue_size_mask; - u16 comp_num = 0; + u16 comp_cmds = 0; u8 phase; + int err; u16 ci; queue_size_mask = aq->depth - 1; @@ -453,10 +456,12 @@ static void efa_com_handle_admin_completion(struct efa_com_admin_queue *aq) * phase bit was validated */ dma_rmb(); - efa_com_handle_single_admin_completion(aq, cqe); + err = efa_com_handle_single_admin_completion(aq, cqe); + if (!err) + comp_cmds++; + aq->cq.cc++; ci++; - comp_num++; if (ci == aq->depth) { ci = 0; phase = !phase; @@ -465,10 +470,9 @@ static void efa_com_handle_admin_completion(struct efa_com_admin_queue *aq) cqe = &aq->cq.entries[ci]; } - aq->cq.cc += comp_num; aq->cq.phase = phase; - aq->sq.cc += comp_num; - atomic64_add(comp_num, &aq->stats.completed_cmd); + aq->sq.cc += comp_cmds; + atomic64_add(comp_cmds, &aq->stats.completed_cmd); } static int efa_com_comp_status_to_errno(u8 comp_status) diff --git a/drivers/infiniband/hw/efa/efa_com.h b/drivers/infiniband/hw/efa/efa_com.h index 77282234ce68..4d9ca97e4296 100644 --- a/drivers/infiniband/hw/efa/efa_com.h +++ b/drivers/infiniband/hw/efa/efa_com.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_COM_H_ @@ -65,7 +65,7 @@ struct efa_com_admin_queue { u16 depth; struct efa_com_admin_cq cq; struct efa_com_admin_sq sq; - u16 msix_vector_idx; + u32 msix_vector_idx; unsigned long state; @@ -89,7 +89,7 @@ struct efa_com_aenq { struct efa_aenq_handlers *aenq_handlers; dma_addr_t dma_addr; u32 cc; /* consumer counter */ - u16 msix_vector_idx; + u32 msix_vector_idx; u16 depth; u8 phase; }; diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c index d3398c7b0bd0..c6b89c45fdc9 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.c +++ b/drivers/infiniband/hw/efa/efa_com_cmd.c @@ -31,6 +31,10 @@ int efa_com_create_qp(struct efa_com_dev *edev, create_qp_cmd.qp_alloc_size.recv_queue_depth = params->rq_depth; create_qp_cmd.uar = params->uarn; + create_qp_cmd.sl = params->sl; + + if (params->unsolicited_write_recv) + EFA_SET(&create_qp_cmd.flags, EFA_ADMIN_CREATE_QP_CMD_UNSOLICITED_WRITE_RECV, 1); err = efa_com_cmd_exec(aq, (struct efa_admin_aq_entry *)&create_qp_cmd, @@ -160,7 +164,7 @@ int efa_com_create_cq(struct efa_com_dev *edev, EFA_SET(&create_cmd.cq_caps_2, EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS, params->entry_size_in_bytes / 4); - create_cmd.cq_depth = params->cq_depth; + create_cmd.sub_cq_depth = params->sub_cq_depth; create_cmd.num_sub_cqs = params->num_sub_cqs; create_cmd.uar = params->uarn; if (params->interrupt_mode_enabled) { @@ -188,7 +192,7 @@ int efa_com_create_cq(struct efa_com_dev *edev, } result->cq_idx = cmd_completion.cq_idx; - result->actual_depth = params->cq_depth; + result->actual_depth = params->sub_cq_depth; result->db_off = cmd_completion.db_offset; result->db_valid = EFA_GET(&cmd_completion.flags, EFA_ADMIN_CREATE_CQ_RESP_DB_VALID); @@ -462,6 +466,8 @@ int efa_com_get_device_attr(struct efa_com_dev *edev, result->db_bar = resp.u.device_attr.db_bar; result->max_rdma_size = resp.u.device_attr.max_rdma_size; result->device_caps = resp.u.device_attr.device_caps; + result->guid = resp.u.device_attr.guid; + result->max_link_speed_gbps = resp.u.device_attr.max_link_speed_gbps; if (result->admin_api_version < 1) { ibdev_err_ratelimited( diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.h b/drivers/infiniband/hw/efa/efa_com_cmd.h index 720a99ba0f7d..5511355b700d 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.h +++ b/drivers/infiniband/hw/efa/efa_com_cmd.h @@ -27,6 +27,8 @@ struct efa_com_create_qp_params { u16 pd; u16 uarn; u8 qp_type; + u8 sl; + u8 unsolicited_write_recv : 1; }; struct efa_com_create_qp_result { @@ -70,7 +72,7 @@ struct efa_com_create_cq_params { /* cq physical base address in OS memory */ dma_addr_t dma_addr; /* completion queue depth in # of entries */ - u16 cq_depth; + u16 sub_cq_depth; u16 num_sub_cqs; u16 uarn; u16 eqn; @@ -111,6 +113,7 @@ struct efa_com_get_device_attr_result { u8 addr[EFA_GID_SIZE]; u64 page_size_cap; u64 max_mr_pages; + u64 guid; u32 mtu; u32 fw_version; u32 admin_api_version; @@ -139,6 +142,7 @@ struct efa_com_get_device_attr_result { u16 max_wr_rdma_sge; u16 max_tx_batch; u16 min_sq_depth; + u16 max_link_speed_gbps; u8 db_bar; }; diff --git a/drivers/infiniband/hw/efa/efa_io_defs.h b/drivers/infiniband/hw/efa/efa_io_defs.h index 2d8eb96eaa81..a4c9fd33da38 100644 --- a/drivers/infiniband/hw/efa/efa_io_defs.h +++ b/drivers/infiniband/hw/efa/efa_io_defs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_IO_H_ @@ -10,6 +10,7 @@ #define EFA_IO_TX_DESC_NUM_RDMA_BUFS 1 #define EFA_IO_TX_DESC_INLINE_MAX_SIZE 32 #define EFA_IO_TX_DESC_IMM_DATA_SIZE 4 +#define EFA_IO_TX_DESC_INLINE_PBL_SIZE 1 enum efa_io_queue_type { /* send queue (of a QP) */ @@ -25,6 +26,10 @@ enum efa_io_send_op_type { EFA_IO_RDMA_READ = 1, /* RDMA write */ EFA_IO_RDMA_WRITE = 2, + /* Fast MR registration */ + EFA_IO_FAST_REG = 3, + /* Fast MR invalidation */ + EFA_IO_FAST_INV = 4, }; enum efa_io_comp_status { @@ -34,15 +39,15 @@ enum efa_io_comp_status { EFA_IO_COMP_STATUS_FLUSHED = 1, /* Internal QP error */ EFA_IO_COMP_STATUS_LOCAL_ERROR_QP_INTERNAL_ERROR = 2, - /* Bad operation type */ - EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_OP_TYPE = 3, + /* Unsupported operation */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNSUPPORTED_OP = 3, /* Bad AH */ EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_AH = 4, /* LKEY not registered or does not match IOVA */ EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_LKEY = 5, /* Message too long */ EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH = 6, - /* Destination ENI is down or does not run EFA */ + /* RKEY not registered or does not match remote IOVA */ EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS = 7, /* Connection was reset by remote side */ EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT = 8, @@ -54,8 +59,17 @@ enum efa_io_comp_status { EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH = 11, /* Unexpected status returned by responder */ EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_STATUS = 12, - /* Unresponsive remote - detected locally */ + /* Unresponsive remote - was previously responsive */ EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE = 13, + /* No valid AH at remote side (required for RDMA operations) */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_UNKNOWN_PEER = 14, + /* Unreachable remote - never received a response */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE = 15, +}; + +enum efa_io_frwr_pbl_mode { + EFA_IO_FRWR_INLINE_PBL = 0, + EFA_IO_FRWR_DIRECT_PBL = 1, }; struct efa_io_tx_meta_desc { @@ -95,13 +109,13 @@ struct efa_io_tx_meta_desc { /* * If inline_msg bit is set, length of inline message in bytes, - * otherwise length of SGL (number of buffers). + * otherwise length of SGL (number of buffers). */ u16 length; /* - * immediate data: if has_imm is set, then this field is included - * within Tx message and reported in remote Rx completion. + * immediate data: if has_imm is set, then this field is included within + * Tx message and reported in remote Rx completion. */ u32 immediate_data; @@ -158,6 +172,63 @@ struct efa_io_rdma_req { struct efa_io_tx_buf_desc local_mem[1]; }; +struct efa_io_fast_mr_reg_req { + /* Updated local key of the MR after lkey/rkey increment */ + u32 lkey; + + /* + * permissions + * 0 : local_write_enable - Local write permissions: + * must be set for RQ buffers and buffers posted for + * RDMA Read requests + * 1 : remote_write_enable - Remote write + * permissions: must be set to enable RDMA write to + * the region + * 2 : remote_read_enable - Remote read permissions: + * must be set to enable RDMA read from the region + * 7:3 : reserved2 - MBZ + */ + u8 permissions; + + /* + * control flags + * 4:0 : phys_page_size_shift - page size is (1 << + * phys_page_size_shift) + * 6:5 : pbl_mode - enum efa_io_frwr_pbl_mode + * 7 : reserved - MBZ + */ + u8 flags; + + /* MBZ */ + u8 reserved[2]; + + /* IO Virtual Address associated with this MR */ + u64 iova; + + /* Memory region length, in bytes */ + u64 mr_length; + + /* Physical Buffer List, each element is page-aligned. */ + union { + /* + * Inline array of physical page addresses (optimization + * for short region activation). + */ + u64 inline_array[1]; + + /* points to PBL (Currently only direct) */ + u64 dma_addr; + } pbl; +}; + +struct efa_io_fast_mr_inv_req { + /* Local key of the MR to invalidate */ + u32 lkey; + + /* MBZ */ + u8 reserved[28]; +}; + /* * Tx WQE, composed of tx meta descriptors followed by either tx buffer * descriptors or inline data @@ -174,6 +245,12 @@ struct efa_io_tx_wqe { /* RDMA local and remote memory addresses */ struct efa_io_rdma_req rdma_req; + + /* Fast registration */ + struct efa_io_fast_mr_reg_req reg_mr_req; + + /* Fast invalidation */ + struct efa_io_fast_mr_inv_req inv_mr_req; } data; }; @@ -208,7 +285,7 @@ struct efa_io_rx_desc { struct efa_io_cdesc_common { /* * verbs-generated request ID, as provided in the completed tx or rx - * descriptor. + * descriptor. */ u16 req_id; @@ -221,7 +298,8 @@ struct efa_io_cdesc_common { * 3 : has_imm - indicates that immediate data is * present - for RX completions only * 6:4 : op_type - enum efa_io_send_op_type - * 7 : reserved31 - MBZ + * 7 : unsolicited - indicates that there is no + * matching request - for RDMA with imm. RX only */ u8 flags; @@ -291,6 +369,13 @@ struct efa_io_rx_cdesc_ex { /* tx_buf_desc */ #define EFA_IO_TX_BUF_DESC_LKEY_MASK GENMASK(23, 0) +/* fast_mr_reg_req */ +#define EFA_IO_FAST_MR_REG_REQ_LOCAL_WRITE_ENABLE_MASK BIT(0) +#define EFA_IO_FAST_MR_REG_REQ_REMOTE_WRITE_ENABLE_MASK BIT(1) +#define EFA_IO_FAST_MR_REG_REQ_REMOTE_READ_ENABLE_MASK BIT(2) +#define EFA_IO_FAST_MR_REG_REQ_PHYS_PAGE_SIZE_SHIFT_MASK GENMASK(4, 0) +#define EFA_IO_FAST_MR_REG_REQ_PBL_MODE_MASK GENMASK(6, 5) + /* rx_desc */ #define EFA_IO_RX_DESC_LKEY_MASK GENMASK(23, 0) #define EFA_IO_RX_DESC_FIRST_MASK BIT(30) @@ -301,5 +386,6 @@ struct efa_io_rx_cdesc_ex { #define EFA_IO_CDESC_COMMON_Q_TYPE_MASK GENMASK(2, 1) #define EFA_IO_CDESC_COMMON_HAS_IMM_MASK BIT(3) #define EFA_IO_CDESC_COMMON_OP_TYPE_MASK GENMASK(6, 4) +#define EFA_IO_CDESC_COMMON_UNSOLICITED_MASK BIT(7) #endif /* _EFA_IO_H_ */ diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index 7b1910a86216..4f03c0ec819f 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved. */ #include <linux/module.h> @@ -16,11 +16,13 @@ #define PCI_DEV_ID_EFA0_VF 0xefa0 #define PCI_DEV_ID_EFA1_VF 0xefa1 #define PCI_DEV_ID_EFA2_VF 0xefa2 +#define PCI_DEV_ID_EFA3_VF 0xefa3 static const struct pci_device_id efa_pci_tbl[] = { { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA0_VF) }, { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA1_VF) }, { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA2_VF) }, + { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA3_VF) }, { } }; @@ -139,8 +141,7 @@ static int efa_request_irq(struct efa_dev *dev, struct efa_irq *irq) return 0; } -static void efa_setup_comp_irq(struct efa_dev *dev, struct efa_eq *eq, - int vector) +static void efa_setup_comp_irq(struct efa_dev *dev, struct efa_eq *eq, u32 vector) { u32 cpu; @@ -190,15 +191,23 @@ static int efa_request_doorbell_bar(struct efa_dev *dev) { u8 db_bar_idx = dev->dev_attr.db_bar; struct pci_dev *pdev = dev->pdev; - int bars; + int pci_mem_bars; + int db_bar; int err; - if (!(BIT(db_bar_idx) & EFA_BASE_BAR_MASK)) { - bars = pci_select_bars(pdev, IORESOURCE_MEM) & BIT(db_bar_idx); + db_bar = BIT(db_bar_idx); + if (!(db_bar & EFA_BASE_BAR_MASK)) { + pci_mem_bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (db_bar & ~pci_mem_bars) { + dev_err(&pdev->dev, + "Doorbells BAR unavailable. Requested %#x, available %#x\n", + db_bar, pci_mem_bars); + return -ENODEV; + } - err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + err = pci_request_selected_regions(pdev, db_bar, DRV_MODULE_NAME); if (err) { - dev_err(&dev->pdev->dev, + dev_err(&pdev->dev, "pci_request_selected_regions for bar %d failed %d\n", db_bar_idx, err); return err; @@ -295,7 +304,7 @@ static void efa_destroy_eq(struct efa_dev *dev, struct efa_eq *eq) efa_free_irq(dev, &eq->irq); } -static int efa_create_eq(struct efa_dev *dev, struct efa_eq *eq, u8 msix_vec) +static int efa_create_eq(struct efa_dev *dev, struct efa_eq *eq, u32 msix_vec) { int err; @@ -318,19 +327,17 @@ err_free_comp_irq: static int efa_create_eqs(struct efa_dev *dev) { - unsigned int neqs = dev->dev_attr.max_eq; - int err; - int i; + u32 neqs = dev->dev_attr.max_eq; + int err, i; - neqs = min_t(unsigned int, neqs, num_online_cpus()); + neqs = min_t(u32, neqs, dev->num_irq_vectors - EFA_COMP_EQS_VEC_BASE); dev->neqs = neqs; dev->eqs = kcalloc(neqs, sizeof(*dev->eqs), GFP_KERNEL); if (!dev->eqs) return -ENOMEM; for (i = 0; i < neqs; i++) { - err = efa_create_eq(dev, &dev->eqs[i], - i + EFA_COMP_EQS_VEC_BASE); + err = efa_create_eq(dev, &dev->eqs[i], i + EFA_COMP_EQS_VEC_BASE); if (err) goto err_destroy_eqs; } @@ -429,6 +436,7 @@ static int efa_ib_device_add(struct efa_dev *dev) efa_set_host_info(dev); dev->ibdev.node_type = RDMA_NODE_UNSPECIFIED; + dev->ibdev.node_guid = dev->dev_attr.guid; dev->ibdev.phys_port_cnt = 1; dev->ibdev.num_comp_vectors = dev->neqs ?: 1; dev->ibdev.dev.parent = &pdev->dev; @@ -457,7 +465,6 @@ static void efa_ib_device_remove(struct efa_dev *dev) ibdev_info(&dev->ibdev, "Unregister ib device\n"); ib_unregister_device(&dev->ibdev); efa_destroy_eqs(dev); - efa_com_dev_reset(&dev->edev, EFA_REGS_RESET_NORMAL); efa_release_doorbell_bar(dev); } @@ -468,34 +475,30 @@ static void efa_disable_msix(struct efa_dev *dev) static int efa_enable_msix(struct efa_dev *dev) { - int msix_vecs, irq_num; + int max_vecs, num_vecs; /* * Reserve the max msix vectors we might need, one vector is reserved * for admin. */ - msix_vecs = min_t(int, pci_msix_vec_count(dev->pdev), - num_online_cpus() + 1); + max_vecs = min_t(int, pci_msix_vec_count(dev->pdev), + num_online_cpus() + 1); dev_dbg(&dev->pdev->dev, "Trying to enable MSI-X, vectors %d\n", - msix_vecs); + max_vecs); dev->admin_msix_vector_idx = EFA_MGMNT_MSIX_VEC_IDX; - irq_num = pci_alloc_irq_vectors(dev->pdev, msix_vecs, - msix_vecs, PCI_IRQ_MSIX); + num_vecs = pci_alloc_irq_vectors(dev->pdev, 1, + max_vecs, PCI_IRQ_MSIX); - if (irq_num < 0) { - dev_err(&dev->pdev->dev, "Failed to enable MSI-X. irq_num %d\n", - irq_num); + if (num_vecs < 0) { + dev_err(&dev->pdev->dev, "Failed to enable MSI-X. error %d\n", + num_vecs); return -ENOSPC; } - if (irq_num != msix_vecs) { - efa_disable_msix(dev); - dev_err(&dev->pdev->dev, - "Allocated %d MSI-X (out of %d requested)\n", - irq_num, msix_vecs); - return -ENOSPC; - } + dev_dbg(&dev->pdev->dev, "Allocated %d MSI-X vectors\n", num_vecs); + + dev->num_irq_vectors = num_vecs; return 0; } @@ -533,7 +536,7 @@ static struct efa_dev *efa_probe_device(struct pci_dev *pdev) { struct efa_com_dev *edev; struct efa_dev *dev; - int bars; + int pci_mem_bars; int err; err = pci_enable_device_mem(pdev); @@ -558,8 +561,14 @@ static struct efa_dev *efa_probe_device(struct pci_dev *pdev) dev->pdev = pdev; xa_init(&dev->cqs_xa); - bars = pci_select_bars(pdev, IORESOURCE_MEM) & EFA_BASE_BAR_MASK; - err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + pci_mem_bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (EFA_BASE_BAR_MASK & ~pci_mem_bars) { + dev_err(&pdev->dev, "BARs unavailable. Requested %#x, available %#x\n", + (int)EFA_BASE_BAR_MASK, pci_mem_bars); + err = -ENODEV; + goto err_ibdev_destroy; + } + err = pci_request_selected_regions(pdev, EFA_BASE_BAR_MASK, DRV_MODULE_NAME); if (err) { dev_err(&pdev->dev, "pci_request_selected_regions failed %d\n", err); @@ -628,12 +637,14 @@ err_disable_device: return ERR_PTR(err); } -static void efa_remove_device(struct pci_dev *pdev) +static void efa_remove_device(struct pci_dev *pdev, + enum efa_regs_reset_reason_types reset_reason) { struct efa_dev *dev = pci_get_drvdata(pdev); struct efa_com_dev *edev; edev = &dev->edev; + efa_com_dev_reset(edev, reset_reason); efa_com_admin_destroy(edev); efa_free_irq(dev, &dev->admin_irq); efa_disable_msix(dev); @@ -661,7 +672,7 @@ static int efa_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return 0; err_remove_device: - efa_remove_device(pdev); + efa_remove_device(pdev, EFA_REGS_RESET_INIT_ERR); return err; } @@ -670,7 +681,17 @@ static void efa_remove(struct pci_dev *pdev) struct efa_dev *dev = pci_get_drvdata(pdev); efa_ib_device_remove(dev); - efa_remove_device(pdev); + efa_remove_device(pdev, EFA_REGS_RESET_NORMAL); +} + +static void efa_shutdown(struct pci_dev *pdev) +{ + struct efa_dev *dev = pci_get_drvdata(pdev); + + efa_destroy_eqs(dev); + efa_com_dev_reset(&dev->edev, EFA_REGS_RESET_SHUTDOWN); + efa_free_irq(dev, &dev->admin_irq); + efa_disable_msix(dev); } static struct pci_driver efa_pci_driver = { @@ -678,6 +699,7 @@ static struct pci_driver efa_pci_driver = { .id_table = efa_pci_tbl, .probe = efa_probe, .remove = efa_remove, + .shutdown = efa_shutdown, }; module_pci_driver(efa_pci_driver); diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 2f412db2edcd..a8645a40730f 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -26,10 +26,6 @@ enum { EFA_MMAP_IO_NC, }; -#define EFA_AENQ_ENABLED_GROUPS \ - (BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \ - BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE)) - struct efa_user_mmap_entry { struct rdma_user_mmap_entry rdma_entry; u64 address; @@ -89,6 +85,8 @@ static const struct rdma_stat_desc efa_port_stats_descs[] = { EFA_DEFINE_PORT_STATS(EFA_STATS_STR) }; +#define EFA_DEFAULT_LINK_SPEED_GBPS 100 + #define EFA_CHUNK_PAYLOAD_SHIFT 12 #define EFA_CHUNK_PAYLOAD_SIZE BIT(EFA_CHUNK_PAYLOAD_SHIFT) #define EFA_CHUNK_PAYLOAD_PTR_SIZE 8 @@ -263,6 +261,9 @@ int efa_query_device(struct ib_device *ibdev, if (EFA_DEV_CAP(dev, RDMA_WRITE)) resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RDMA_WRITE; + if (EFA_DEV_CAP(dev, UNSOLICITED_WRITE_RECV)) + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_UNSOLICITED_WRITE_RECV; + if (dev->neqs) resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS; @@ -278,10 +279,47 @@ int efa_query_device(struct ib_device *ibdev, return 0; } +static void efa_link_gbps_to_speed_and_width(u16 gbps, + enum ib_port_speed *speed, + enum ib_port_width *width) +{ + if (gbps >= 400) { + *width = IB_WIDTH_8X; + *speed = IB_SPEED_HDR; + } else if (gbps >= 200) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_HDR; + } else if (gbps >= 120) { + *width = IB_WIDTH_12X; + *speed = IB_SPEED_FDR10; + } else if (gbps >= 100) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_EDR; + } else if (gbps >= 60) { + *width = IB_WIDTH_12X; + *speed = IB_SPEED_DDR; + } else if (gbps >= 50) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_HDR; + } else if (gbps >= 40) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_FDR10; + } else if (gbps >= 30) { + *width = IB_WIDTH_12X; + *speed = IB_SPEED_SDR; + } else { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_EDR; + } +} + int efa_query_port(struct ib_device *ibdev, u32 port, struct ib_port_attr *props) { struct efa_dev *dev = to_edev(ibdev); + enum ib_port_speed link_speed; + enum ib_port_width link_width; + u16 link_gbps; props->lmc = 1; @@ -289,8 +327,10 @@ int efa_query_port(struct ib_device *ibdev, u32 port, props->phys_state = IB_PORT_PHYS_STATE_LINK_UP; props->gid_tbl_len = 1; props->pkey_tbl_len = 1; - props->active_speed = IB_SPEED_EDR; - props->active_width = IB_WIDTH_4X; + link_gbps = dev->dev_attr.max_link_speed_gbps ?: EFA_DEFAULT_LINK_SPEED_GBPS; + efa_link_gbps_to_speed_and_width(link_gbps, &link_speed, &link_width); + props->active_speed = link_speed; + props->active_width = link_width; props->max_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu); props->active_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu); props->max_msg_sz = dev->dev_attr.mtu; @@ -521,7 +561,7 @@ static int qp_mmap_entries_setup(struct efa_qp *qp, address = dev->mem_bar_addr + resp->llq_desc_offset; length = PAGE_ALIGN(params->sq_ring_size_in_bytes + - (resp->llq_desc_offset & ~PAGE_MASK)); + offset_in_page(resp->llq_desc_offset)); qp->llq_desc_mmap_entry = efa_user_mmap_entry_insert(&ucontext->ibucontext, @@ -639,6 +679,7 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, struct efa_ibv_create_qp cmd = {}; struct efa_qp *qp = to_eqp(ibqp); struct efa_ucontext *ucontext; + u16 supported_efa_flags = 0; int err; ucontext = rdma_udata_to_drv_context(udata, struct efa_ucontext, @@ -676,13 +717,23 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, goto err_out; } - if (cmd.comp_mask) { + if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_98)) { ibdev_dbg(&dev->ibdev, "Incompatible ABI params, unknown fields in udata\n"); err = -EINVAL; goto err_out; } + if (EFA_DEV_CAP(dev, UNSOLICITED_WRITE_RECV)) + supported_efa_flags |= EFA_CREATE_QP_WITH_UNSOLICITED_WRITE_RECV; + + if (cmd.flags & ~supported_efa_flags) { + ibdev_dbg(&dev->ibdev, "Unsupported EFA QP create flags[%#x], supported[%#x]\n", + cmd.flags, supported_efa_flags); + err = -EOPNOTSUPP; + goto err_out; + } + create_qp_params.uarn = ucontext->uarn; create_qp_params.pd = to_epd(ibqp->pd)->pdn; @@ -722,6 +773,11 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, create_qp_params.rq_base_addr = qp->rq_dma_addr; } + create_qp_params.sl = cmd.sl; + + if (cmd.flags & EFA_CREATE_QP_WITH_UNSOLICITED_WRITE_RECV) + create_qp_params.unsolicited_write_recv = true; + err = efa_com_create_qp(&dev->edev, &create_qp_params, &create_qp_resp); if (err) @@ -1067,8 +1123,9 @@ static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq, } int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct efa_ucontext *ucontext = rdma_udata_to_drv_context( udata, struct efa_ucontext, ibucontext); struct efa_com_create_cq_params params = {}; @@ -1153,7 +1210,7 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, } params.uarn = cq->ucontext->uarn; - params.cq_depth = entries; + params.sub_cq_depth = entries; params.dma_addr = cq->dma_addr; params.entry_size_in_bytes = cmd.cq_entry_size; params.num_sub_cqs = cmd.num_sub_cqs; @@ -1670,14 +1727,14 @@ static int efa_register_mr(struct ib_pd *ibpd, struct efa_mr *mr, u64 start, struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { struct efa_dev *dev = to_edev(ibpd->device); struct ib_umem_dmabuf *umem_dmabuf; struct efa_mr *mr; int err; - mr = efa_alloc_mr(ibpd, access_flags, udata); + mr = efa_alloc_mr(ibpd, access_flags, &attrs->driver_udata); if (IS_ERR(mr)) { err = PTR_ERR(mr); goto err_out; diff --git a/drivers/infiniband/hw/erdma/Kconfig b/drivers/infiniband/hw/erdma/Kconfig index 169038e3ceb1..267fc1f3c42a 100644 --- a/drivers/infiniband/hw/erdma/Kconfig +++ b/drivers/infiniband/hw/erdma/Kconfig @@ -5,7 +5,7 @@ config INFINIBAND_ERDMA depends on INFINIBAND_ADDR_TRANS depends on INFINIBAND_USER_ACCESS help - This is a RDMA/iWarp driver for Alibaba Elastic RDMA Adapter(ERDMA), + This is a RDMA driver for Alibaba Elastic RDMA Adapter(ERDMA), which supports RDMA features in Alibaba cloud environment. To compile this driver as module, choose M here. The module will be diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index 5df401a30cb9..2a023b99f992 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -16,7 +16,7 @@ #include "erdma_hw.h" #define DRV_MODULE_NAME "erdma" -#define ERDMA_NODE_DESC "Elastic RDMA(iWARP) stack" +#define ERDMA_NODE_DESC "Elastic RDMA Adapter stack" struct erdma_eq { void *qbuf; @@ -33,7 +33,8 @@ struct erdma_eq { atomic64_t notify_num; void __iomem *db; - u64 *db_record; + u64 *dbrec; + dma_addr_t dbrec_dma; }; struct erdma_cmdq_sq { @@ -48,7 +49,8 @@ struct erdma_cmdq_sq { u16 wqebb_cnt; - u64 *db_record; + u64 *dbrec; + dma_addr_t dbrec_dma; }; struct erdma_cmdq_cq { @@ -61,7 +63,8 @@ struct erdma_cmdq_cq { u32 ci; u32 cmdsn; - u64 *db_record; + u64 *dbrec; + dma_addr_t dbrec_dma; atomic64_t armed_num; }; @@ -98,8 +101,6 @@ struct erdma_cmdq { struct erdma_comp_wait *wait_pool; spinlock_t lock; - bool use_event; - struct erdma_cmdq_sq sq; struct erdma_cmdq_cq cq; struct erdma_eq eq; @@ -145,6 +146,8 @@ struct erdma_devattr { u32 max_mr; u32 max_pd; u32 max_mw; + u32 max_gid; + u32 max_ah; u32 local_dma_key; }; @@ -174,12 +177,10 @@ struct erdma_resource_cb { enum { ERDMA_RES_TYPE_PD = 0, ERDMA_RES_TYPE_STAG_IDX = 1, - ERDMA_RES_CNT = 2, + ERDMA_RES_TYPE_AH = 2, + ERDMA_RES_CNT = 3, }; -#define ERDMA_EXTRA_BUFFER_SIZE ERDMA_DB_SIZE -#define WARPPED_BUFSIZE(size) ((size) + ERDMA_EXTRA_BUFFER_SIZE) - struct erdma_dev { struct ib_device ibdev; struct net_device *netdev; @@ -192,8 +193,6 @@ struct erdma_dev { u8 __iomem *func_bar; struct erdma_devattr attrs; - /* physical port state (only one port per device) */ - enum ib_port_state state; u32 mtu; /* cmdq and aeq use the same msix vector */ @@ -213,7 +212,9 @@ struct erdma_dev { atomic_t num_ctx; struct list_head cep_list; + struct dma_pool *db_pool; struct dma_pool *resp_pool; + enum erdma_proto_type proto; }; static inline void *get_queue_entry(void *qbuf, u32 idx, u32 depth, u32 shift) @@ -264,7 +265,7 @@ void erdma_cmdq_destroy(struct erdma_dev *dev); void erdma_cmdq_build_reqhdr(u64 *hdr, u32 mod, u32 op); int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, void *req, u32 req_size, - u64 *resp0, u64 *resp1); + u64 *resp0, u64 *resp1, bool sleepable); void erdma_cmdq_completion_handler(struct erdma_cmdq *cmdq); int erdma_ceqs_init(struct erdma_dev *dev); @@ -273,7 +274,8 @@ void notify_eq(struct erdma_eq *eq); void *get_next_valid_eqe(struct erdma_eq *eq); int erdma_aeq_init(struct erdma_dev *dev); -void erdma_aeq_destroy(struct erdma_dev *dev); +int erdma_eq_common_init(struct erdma_dev *dev, struct erdma_eq *eq, u32 depth); +void erdma_eq_destroy(struct erdma_dev *dev, struct erdma_eq *eq); void erdma_aeq_event_handler(struct erdma_dev *dev); void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb); diff --git a/drivers/infiniband/hw/erdma/erdma_cm.c b/drivers/infiniband/hw/erdma/erdma_cm.c index 771059a8eb7d..1b23c698ec25 100644 --- a/drivers/infiniband/hw/erdma/erdma_cm.c +++ b/drivers/infiniband/hw/erdma/erdma_cm.c @@ -567,7 +567,8 @@ reject_conn: static int erdma_proc_mpareply(struct erdma_cep *cep) { - struct erdma_qp_attrs qp_attrs; + enum erdma_qpa_mask_iwarp to_modify_attrs = 0; + struct erdma_mod_qp_params_iwarp params; struct erdma_qp *qp = cep->qp; struct mpa_rr *rep; int ret; @@ -597,26 +598,29 @@ static int erdma_proc_mpareply(struct erdma_cep *cep) return -EINVAL; } - memset(&qp_attrs, 0, sizeof(qp_attrs)); - qp_attrs.irq_size = cep->ird; - qp_attrs.orq_size = cep->ord; - qp_attrs.state = ERDMA_QP_STATE_RTS; + memset(¶ms, 0, sizeof(params)); + params.state = ERDMA_QPS_IWARP_RTS; + params.irq_size = cep->ird; + params.orq_size = cep->ord; down_write(&qp->state_lock); - if (qp->attrs.state > ERDMA_QP_STATE_RTR) { + if (qp->attrs.iwarp.state > ERDMA_QPS_IWARP_RTR) { ret = -EINVAL; up_write(&qp->state_lock); goto out_err; } - qp->attrs.qp_type = ERDMA_QP_ACTIVE; - if (__mpa_ext_cc(cep->mpa.ext_data.bits) != qp->attrs.cc) - qp->attrs.cc = COMPROMISE_CC; + to_modify_attrs = ERDMA_QPA_IWARP_STATE | ERDMA_QPA_IWARP_LLP_HANDLE | + ERDMA_QPA_IWARP_MPA | ERDMA_QPA_IWARP_IRD | + ERDMA_QPA_IWARP_ORD; - ret = erdma_modify_qp_internal(qp, &qp_attrs, - ERDMA_QP_ATTR_STATE | - ERDMA_QP_ATTR_LLP_HANDLE | - ERDMA_QP_ATTR_MPA); + params.qp_type = ERDMA_QP_ACTIVE; + if (__mpa_ext_cc(cep->mpa.ext_data.bits) != qp->attrs.cc) { + to_modify_attrs |= ERDMA_QPA_IWARP_CC; + params.cc = COMPROMISE_CC; + } + + ret = erdma_modify_qp_state_iwarp(qp, ¶ms, to_modify_attrs); up_write(&qp->state_lock); @@ -722,7 +726,7 @@ static int erdma_newconn_connected(struct erdma_cep *cep) __mpa_rr_set_revision(&cep->mpa.hdr.params.bits, MPA_REVISION_EXT_1); memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, MPA_KEY_SIZE); - cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie); + cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.iwarp.cookie); __mpa_ext_set_cc(&cep->mpa.ext_data.bits, cep->qp->attrs.cc); ret = erdma_send_mpareqrep(cep, cep->private_data, cep->pd_len); @@ -1126,10 +1130,11 @@ error_put_qp: int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) { - struct erdma_dev *dev = to_edev(id->device); struct erdma_cep *cep = (struct erdma_cep *)id->provider_data; + struct erdma_mod_qp_params_iwarp mod_qp_params; + enum erdma_qpa_mask_iwarp to_modify_attrs = 0; + struct erdma_dev *dev = to_edev(id->device); struct erdma_qp *qp; - struct erdma_qp_attrs qp_attrs; int ret; erdma_cep_set_inuse(cep); @@ -1156,7 +1161,7 @@ int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) erdma_qp_get(qp); down_write(&qp->state_lock); - if (qp->attrs.state > ERDMA_QP_STATE_RTR) { + if (qp->attrs.iwarp.state > ERDMA_QPS_IWARP_RTR) { ret = -EINVAL; up_write(&qp->state_lock); goto error; @@ -1181,11 +1186,11 @@ int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) cep->cm_id = id; id->add_ref(id); - memset(&qp_attrs, 0, sizeof(qp_attrs)); - qp_attrs.orq_size = params->ord; - qp_attrs.irq_size = params->ird; + memset(&mod_qp_params, 0, sizeof(mod_qp_params)); - qp_attrs.state = ERDMA_QP_STATE_RTS; + mod_qp_params.irq_size = params->ird; + mod_qp_params.orq_size = params->ord; + mod_qp_params.state = ERDMA_QPS_IWARP_RTS; /* Associate QP with CEP */ erdma_cep_get(cep); @@ -1194,19 +1199,21 @@ int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) cep->state = ERDMA_EPSTATE_RDMA_MODE; - qp->attrs.qp_type = ERDMA_QP_PASSIVE; - qp->attrs.pd_len = params->private_data_len; + mod_qp_params.qp_type = ERDMA_QP_PASSIVE; + mod_qp_params.pd_len = params->private_data_len; - if (qp->attrs.cc != __mpa_ext_cc(cep->mpa.ext_data.bits)) - qp->attrs.cc = COMPROMISE_CC; + to_modify_attrs = ERDMA_QPA_IWARP_STATE | ERDMA_QPA_IWARP_ORD | + ERDMA_QPA_IWARP_LLP_HANDLE | ERDMA_QPA_IWARP_IRD | + ERDMA_QPA_IWARP_MPA; + + if (qp->attrs.cc != __mpa_ext_cc(cep->mpa.ext_data.bits)) { + to_modify_attrs |= ERDMA_QPA_IWARP_CC; + mod_qp_params.cc = COMPROMISE_CC; + } /* move to rts */ - ret = erdma_modify_qp_internal(qp, &qp_attrs, - ERDMA_QP_ATTR_STATE | - ERDMA_QP_ATTR_ORD | - ERDMA_QP_ATTR_LLP_HANDLE | - ERDMA_QP_ATTR_IRD | - ERDMA_QP_ATTR_MPA); + ret = erdma_modify_qp_state_iwarp(qp, &mod_qp_params, to_modify_attrs); + up_write(&qp->state_lock); if (ret) @@ -1214,7 +1221,7 @@ int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) cep->mpa.ext_data.bits = 0; __mpa_ext_set_cc(&cep->mpa.ext_data.bits, qp->attrs.cc); - cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie); + cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.iwarp.cookie); ret = erdma_send_mpareqrep(cep, params->private_data, params->private_data_len); diff --git a/drivers/infiniband/hw/erdma/erdma_cmdq.c b/drivers/infiniband/hw/erdma/erdma_cmdq.c index a151a7bdd504..b867aefe83b2 100644 --- a/drivers/infiniband/hw/erdma/erdma_cmdq.c +++ b/drivers/infiniband/hw/erdma/erdma_cmdq.c @@ -14,7 +14,7 @@ static void arm_cmdq_cq(struct erdma_cmdq *cmdq) FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cmdq->cq.cmdsn) | FIELD_PREP(ERDMA_CQDB_IDX_MASK, cmdq->cq.cmdsn); - *cmdq->cq.db_record = db_data; + *cmdq->cq.dbrec = db_data; writeq(db_data, dev->func_bar + ERDMA_CMDQ_CQDB_REG); atomic64_inc(&cmdq->cq.armed_num); @@ -25,7 +25,7 @@ static void kick_cmdq_db(struct erdma_cmdq *cmdq) struct erdma_dev *dev = container_of(cmdq, struct erdma_dev, cmdq); u64 db_data = FIELD_PREP(ERDMA_CMD_HDR_WQEBB_INDEX_MASK, cmdq->sq.pi); - *cmdq->sq.db_record = db_data; + *cmdq->sq.dbrec = db_data; writeq(db_data, dev->func_bar + ERDMA_CMDQ_SQDB_REG); } @@ -89,20 +89,18 @@ static int erdma_cmdq_sq_init(struct erdma_dev *dev) { struct erdma_cmdq *cmdq = &dev->cmdq; struct erdma_cmdq_sq *sq = &cmdq->sq; - u32 buf_size; sq->wqebb_cnt = SQEBB_COUNT(ERDMA_CMDQ_SQE_SIZE); sq->depth = cmdq->max_outstandings * sq->wqebb_cnt; - buf_size = sq->depth << SQEBB_SHIFT; - - sq->qbuf = - dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), - &sq->qbuf_dma_addr, GFP_KERNEL); + sq->qbuf = dma_alloc_coherent(&dev->pdev->dev, sq->depth << SQEBB_SHIFT, + &sq->qbuf_dma_addr, GFP_KERNEL); if (!sq->qbuf) return -ENOMEM; - sq->db_record = (u64 *)(sq->qbuf + buf_size); + sq->dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &sq->dbrec_dma); + if (!sq->dbrec) + goto err_out; spin_lock_init(&sq->lock); @@ -111,30 +109,33 @@ static int erdma_cmdq_sq_init(struct erdma_dev *dev) erdma_reg_write32(dev, ERDMA_REGS_CMDQ_SQ_ADDR_L_REG, lower_32_bits(sq->qbuf_dma_addr)); erdma_reg_write32(dev, ERDMA_REGS_CMDQ_DEPTH_REG, sq->depth); - erdma_reg_write64(dev, ERDMA_CMDQ_SQ_DB_HOST_ADDR_REG, - sq->qbuf_dma_addr + buf_size); + erdma_reg_write64(dev, ERDMA_CMDQ_SQ_DB_HOST_ADDR_REG, sq->dbrec_dma); return 0; + +err_out: + dma_free_coherent(&dev->pdev->dev, sq->depth << SQEBB_SHIFT, + sq->qbuf, sq->qbuf_dma_addr); + + return -ENOMEM; } static int erdma_cmdq_cq_init(struct erdma_dev *dev) { struct erdma_cmdq *cmdq = &dev->cmdq; struct erdma_cmdq_cq *cq = &cmdq->cq; - u32 buf_size; cq->depth = cmdq->sq.depth; - buf_size = cq->depth << CQE_SHIFT; - - cq->qbuf = - dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), - &cq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); + cq->qbuf = dma_alloc_coherent(&dev->pdev->dev, cq->depth << CQE_SHIFT, + &cq->qbuf_dma_addr, GFP_KERNEL); if (!cq->qbuf) return -ENOMEM; spin_lock_init(&cq->lock); - cq->db_record = (u64 *)(cq->qbuf + buf_size); + cq->dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &cq->dbrec_dma); + if (!cq->dbrec) + goto err_out; atomic64_set(&cq->armed_num, 0); @@ -142,40 +143,35 @@ static int erdma_cmdq_cq_init(struct erdma_dev *dev) upper_32_bits(cq->qbuf_dma_addr)); erdma_reg_write32(dev, ERDMA_REGS_CMDQ_CQ_ADDR_L_REG, lower_32_bits(cq->qbuf_dma_addr)); - erdma_reg_write64(dev, ERDMA_CMDQ_CQ_DB_HOST_ADDR_REG, - cq->qbuf_dma_addr + buf_size); + erdma_reg_write64(dev, ERDMA_CMDQ_CQ_DB_HOST_ADDR_REG, cq->dbrec_dma); return 0; + +err_out: + dma_free_coherent(&dev->pdev->dev, cq->depth << CQE_SHIFT, cq->qbuf, + cq->qbuf_dma_addr); + + return -ENOMEM; } static int erdma_cmdq_eq_init(struct erdma_dev *dev) { struct erdma_cmdq *cmdq = &dev->cmdq; struct erdma_eq *eq = &cmdq->eq; - u32 buf_size; - - eq->depth = cmdq->max_outstandings; - buf_size = eq->depth << EQE_SHIFT; - - eq->qbuf = - dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), - &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); - if (!eq->qbuf) - return -ENOMEM; + int ret; - spin_lock_init(&eq->lock); - atomic64_set(&eq->event_num, 0); + ret = erdma_eq_common_init(dev, eq, cmdq->max_outstandings); + if (ret) + return ret; eq->db = dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG; - eq->db_record = (u64 *)(eq->qbuf + buf_size); erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_ADDR_H_REG, upper_32_bits(eq->qbuf_dma_addr)); erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_ADDR_L_REG, lower_32_bits(eq->qbuf_dma_addr)); erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_DEPTH_REG, eq->depth); - erdma_reg_write64(dev, ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG, - eq->qbuf_dma_addr + buf_size); + erdma_reg_write64(dev, ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG, eq->dbrec_dma); return 0; } @@ -186,7 +182,6 @@ int erdma_cmdq_init(struct erdma_dev *dev) int err; cmdq->max_outstandings = ERDMA_CMDQ_MAX_OUTSTANDING; - cmdq->use_event = false; sema_init(&cmdq->credits, cmdq->max_outstandings); @@ -211,24 +206,22 @@ int erdma_cmdq_init(struct erdma_dev *dev) return 0; err_destroy_cq: - dma_free_coherent(&dev->pdev->dev, - (cmdq->cq.depth << CQE_SHIFT) + - ERDMA_EXTRA_BUFFER_SIZE, + dma_free_coherent(&dev->pdev->dev, cmdq->cq.depth << CQE_SHIFT, cmdq->cq.qbuf, cmdq->cq.qbuf_dma_addr); + dma_pool_free(dev->db_pool, cmdq->cq.dbrec, cmdq->cq.dbrec_dma); + err_destroy_sq: - dma_free_coherent(&dev->pdev->dev, - (cmdq->sq.depth << SQEBB_SHIFT) + - ERDMA_EXTRA_BUFFER_SIZE, + dma_free_coherent(&dev->pdev->dev, cmdq->sq.depth << SQEBB_SHIFT, cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr); + dma_pool_free(dev->db_pool, cmdq->sq.dbrec, cmdq->sq.dbrec_dma); + return err; } void erdma_finish_cmdq_init(struct erdma_dev *dev) { - /* after device init successfully, change cmdq to event mode. */ - dev->cmdq.use_event = true; arm_cmdq_cq(&dev->cmdq); } @@ -238,18 +231,17 @@ void erdma_cmdq_destroy(struct erdma_dev *dev) clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state); - dma_free_coherent(&dev->pdev->dev, - (cmdq->eq.depth << EQE_SHIFT) + - ERDMA_EXTRA_BUFFER_SIZE, - cmdq->eq.qbuf, cmdq->eq.qbuf_dma_addr); - dma_free_coherent(&dev->pdev->dev, - (cmdq->sq.depth << SQEBB_SHIFT) + - ERDMA_EXTRA_BUFFER_SIZE, + erdma_eq_destroy(dev, &cmdq->eq); + + dma_free_coherent(&dev->pdev->dev, cmdq->sq.depth << SQEBB_SHIFT, cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr); - dma_free_coherent(&dev->pdev->dev, - (cmdq->cq.depth << CQE_SHIFT) + - ERDMA_EXTRA_BUFFER_SIZE, + + dma_pool_free(dev->db_pool, cmdq->sq.dbrec, cmdq->sq.dbrec_dma); + + dma_free_coherent(&dev->pdev->dev, cmdq->cq.depth << CQE_SHIFT, cmdq->cq.qbuf, cmdq->cq.qbuf_dma_addr); + + dma_pool_free(dev->db_pool, cmdq->cq.dbrec, cmdq->cq.dbrec_dma); } static void *get_next_valid_cmdq_cqe(struct erdma_cmdq *cmdq) @@ -317,8 +309,7 @@ static int erdma_poll_single_cmd_completion(struct erdma_cmdq *cmdq) /* Copy 16B comp data after cqe hdr to outer */ be32_to_cpu_array(comp_wait->comp_data, cqe + 2, 4); - if (cmdq->use_event) - complete(&comp_wait->wait_event); + complete(&comp_wait->wait_event); return 0; } @@ -337,9 +328,6 @@ static void erdma_polling_cmd_completions(struct erdma_cmdq *cmdq) if (erdma_poll_single_cmd_completion(cmdq)) break; - if (comp_num && cmdq->use_event) - arm_cmdq_cq(cmdq); - spin_unlock_irqrestore(&cmdq->cq.lock, flags); } @@ -347,8 +335,7 @@ void erdma_cmdq_completion_handler(struct erdma_cmdq *cmdq) { int got_event = 0; - if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state) || - !cmdq->use_event) + if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state)) return; while (get_next_valid_eqe(&cmdq->eq)) { @@ -359,6 +346,7 @@ void erdma_cmdq_completion_handler(struct erdma_cmdq *cmdq) if (got_event) { cmdq->cq.cmdsn++; erdma_polling_cmd_completions(cmdq); + arm_cmdq_cq(cmdq); } notify_eq(&cmdq->eq); @@ -377,7 +365,7 @@ static int erdma_poll_cmd_completion(struct erdma_comp_wait *comp_ctx, if (time_is_before_jiffies(comp_timeout)) return -ETIME; - msleep(20); + udelay(20); } return 0; @@ -408,7 +396,7 @@ void erdma_cmdq_build_reqhdr(u64 *hdr, u32 mod, u32 op) } int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, void *req, u32 req_size, - u64 *resp0, u64 *resp1) + u64 *resp0, u64 *resp1, bool sleepable) { struct erdma_comp_wait *comp_wait; int ret; @@ -416,7 +404,12 @@ int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, void *req, u32 req_size, if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state)) return -ENODEV; - down(&cmdq->credits); + if (!sleepable) { + while (down_trylock(&cmdq->credits)) + ; + } else { + down(&cmdq->credits); + } comp_wait = get_comp_wait(cmdq); if (IS_ERR(comp_wait)) { @@ -430,7 +423,7 @@ int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, void *req, u32 req_size, push_cmdq_sqe(cmdq, req, req_size, comp_wait); spin_unlock(&cmdq->sq.lock); - if (cmdq->use_event) + if (sleepable) ret = erdma_wait_cmd_completion(comp_wait, cmdq, ERDMA_CMDQ_TIMEOUT_MS); else diff --git a/drivers/infiniband/hw/erdma/erdma_cq.c b/drivers/infiniband/hw/erdma/erdma_cq.c index c1cb5568eab2..1f456327e63c 100644 --- a/drivers/infiniband/hw/erdma/erdma_cq.c +++ b/drivers/infiniband/hw/erdma/erdma_cq.c @@ -26,7 +26,7 @@ static void notify_cq(struct erdma_cq *cq, u8 solcitied) FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cq->kern_cq.cmdsn) | FIELD_PREP(ERDMA_CQDB_CI_MASK, cq->kern_cq.ci); - *cq->kern_cq.db_record = db_data; + *cq->kern_cq.dbrec = db_data; writeq(db_data, cq->kern_cq.db); } @@ -105,6 +105,22 @@ static const struct { { ERDMA_WC_RETRY_EXC_ERR, IB_WC_RETRY_EXC_ERR, ERDMA_WC_VENDOR_NO_ERR }, }; +static void erdma_process_ud_cqe(struct erdma_cqe *cqe, struct ib_wc *wc) +{ + u32 ud_info; + + wc->wc_flags |= (IB_WC_GRH | IB_WC_WITH_NETWORK_HDR_TYPE); + ud_info = be32_to_cpu(cqe->ud.info); + wc->network_hdr_type = FIELD_GET(ERDMA_CQE_NTYPE_MASK, ud_info); + if (wc->network_hdr_type == ERDMA_NETWORK_TYPE_IPV4) + wc->network_hdr_type = RDMA_NETWORK_IPV4; + else + wc->network_hdr_type = RDMA_NETWORK_IPV6; + wc->src_qp = FIELD_GET(ERDMA_CQE_SQPN_MASK, ud_info); + wc->sl = FIELD_GET(ERDMA_CQE_SL_MASK, ud_info); + wc->pkey_index = 0; +} + #define ERDMA_POLLCQ_NO_QP 1 static int erdma_poll_one_cqe(struct erdma_cq *cq, struct ib_wc *wc) @@ -168,6 +184,10 @@ static int erdma_poll_one_cqe(struct erdma_cq *cq, struct ib_wc *wc) wc->wc_flags |= IB_WC_WITH_INVALIDATE; } + if (erdma_device_rocev2(dev) && + (qp->ibqp.qp_type == IB_QPT_UD || qp->ibqp.qp_type == IB_QPT_GSI)) + erdma_process_ud_cqe(cqe, wc); + if (syndrome >= ERDMA_NUM_WC_STATUS) syndrome = ERDMA_WC_GENERAL_ERR; @@ -201,3 +221,48 @@ int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) return npolled; } + +void erdma_remove_cqes_of_qp(struct ib_cq *ibcq, u32 qpn) +{ + struct erdma_cq *cq = to_ecq(ibcq); + struct erdma_cqe *cqe, *dst_cqe; + u32 prev_cq_ci, cur_cq_ci; + u32 ncqe = 0, nqp_cqe = 0; + unsigned long flags; + u8 owner; + + spin_lock_irqsave(&cq->kern_cq.lock, flags); + + prev_cq_ci = cq->kern_cq.ci; + + while (ncqe < cq->depth && (cqe = get_next_valid_cqe(cq)) != NULL) { + ++cq->kern_cq.ci; + ++ncqe; + } + + while (ncqe > 0) { + cur_cq_ci = prev_cq_ci + ncqe - 1; + cqe = get_queue_entry(cq->kern_cq.qbuf, cur_cq_ci, cq->depth, + CQE_SHIFT); + + if (be32_to_cpu(cqe->qpn) == qpn) { + ++nqp_cqe; + } else if (nqp_cqe) { + dst_cqe = get_queue_entry(cq->kern_cq.qbuf, + cur_cq_ci + nqp_cqe, + cq->depth, CQE_SHIFT); + owner = FIELD_GET(ERDMA_CQE_HDR_OWNER_MASK, + be32_to_cpu(dst_cqe->hdr)); + cqe->hdr = cpu_to_be32( + (be32_to_cpu(cqe->hdr) & + ~ERDMA_CQE_HDR_OWNER_MASK) | + FIELD_PREP(ERDMA_CQE_HDR_OWNER_MASK, owner)); + memcpy(dst_cqe, cqe, sizeof(*cqe)); + } + + --ncqe; + } + + cq->kern_cq.ci = prev_cq_ci + nqp_cqe; + spin_unlock_irqrestore(&cq->kern_cq.lock, flags); +} diff --git a/drivers/infiniband/hw/erdma/erdma_eq.c b/drivers/infiniband/hw/erdma/erdma_eq.c index ea47cb21fdb8..6486234a2360 100644 --- a/drivers/infiniband/hw/erdma/erdma_eq.c +++ b/drivers/infiniband/hw/erdma/erdma_eq.c @@ -13,7 +13,7 @@ void notify_eq(struct erdma_eq *eq) u64 db_data = FIELD_PREP(ERDMA_EQDB_CI_MASK, eq->ci) | FIELD_PREP(ERDMA_EQDB_ARM_MASK, 1); - *eq->db_record = db_data; + *eq->dbrec = db_data; writeq(db_data, eq->db); atomic64_inc(&eq->notify_num); @@ -80,47 +80,62 @@ void erdma_aeq_event_handler(struct erdma_dev *dev) notify_eq(&dev->aeq); } -int erdma_aeq_init(struct erdma_dev *dev) +int erdma_eq_common_init(struct erdma_dev *dev, struct erdma_eq *eq, u32 depth) { - struct erdma_eq *eq = &dev->aeq; - u32 buf_size; - - eq->depth = ERDMA_DEFAULT_EQ_DEPTH; - buf_size = eq->depth << EQE_SHIFT; + u32 buf_size = depth << EQE_SHIFT; - eq->qbuf = - dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), - &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); + eq->qbuf = dma_alloc_coherent(&dev->pdev->dev, buf_size, + &eq->qbuf_dma_addr, GFP_KERNEL); if (!eq->qbuf) return -ENOMEM; + eq->dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &eq->dbrec_dma); + if (!eq->dbrec) + goto err_free_qbuf; + spin_lock_init(&eq->lock); atomic64_set(&eq->event_num, 0); atomic64_set(&eq->notify_num, 0); + eq->ci = 0; + eq->depth = depth; + + return 0; + +err_free_qbuf: + dma_free_coherent(&dev->pdev->dev, buf_size, eq->qbuf, + eq->qbuf_dma_addr); + + return -ENOMEM; +} + +void erdma_eq_destroy(struct erdma_dev *dev, struct erdma_eq *eq) +{ + dma_pool_free(dev->db_pool, eq->dbrec, eq->dbrec_dma); + dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, eq->qbuf, + eq->qbuf_dma_addr); +} + +int erdma_aeq_init(struct erdma_dev *dev) +{ + struct erdma_eq *eq = &dev->aeq; + int ret; + + ret = erdma_eq_common_init(dev, &dev->aeq, ERDMA_DEFAULT_EQ_DEPTH); + if (ret) + return ret; eq->db = dev->func_bar + ERDMA_REGS_AEQ_DB_REG; - eq->db_record = (u64 *)(eq->qbuf + buf_size); erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_H_REG, upper_32_bits(eq->qbuf_dma_addr)); erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_L_REG, lower_32_bits(eq->qbuf_dma_addr)); erdma_reg_write32(dev, ERDMA_REGS_AEQ_DEPTH_REG, eq->depth); - erdma_reg_write64(dev, ERDMA_AEQ_DB_HOST_ADDR_REG, - eq->qbuf_dma_addr + buf_size); + erdma_reg_write64(dev, ERDMA_AEQ_DB_HOST_ADDR_REG, eq->dbrec_dma); return 0; } -void erdma_aeq_destroy(struct erdma_dev *dev) -{ - struct erdma_eq *eq = &dev->aeq; - - dma_free_coherent(&dev->pdev->dev, - WARPPED_BUFSIZE(eq->depth << EQE_SHIFT), eq->qbuf, - eq->qbuf_dma_addr); -} - void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb) { struct erdma_dev *dev = ceq_cb->dev; @@ -209,7 +224,6 @@ static void erdma_free_ceq_irq(struct erdma_dev *dev, u16 ceqn) static int create_eq_cmd(struct erdma_dev *dev, u32 eqn, struct erdma_eq *eq) { struct erdma_cmdq_create_eq_req req; - dma_addr_t db_info_dma_addr; erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON, CMDQ_OPCODE_CREATE_EQ); @@ -219,39 +233,33 @@ static int create_eq_cmd(struct erdma_dev *dev, u32 eqn, struct erdma_eq *eq) req.qtype = ERDMA_EQ_TYPE_CEQ; /* Vector index is the same as EQN. */ req.vector_idx = eqn; - db_info_dma_addr = eq->qbuf_dma_addr + (eq->depth << EQE_SHIFT); - req.db_dma_addr_l = lower_32_bits(db_info_dma_addr); - req.db_dma_addr_h = upper_32_bits(db_info_dma_addr); + req.db_dma_addr_l = lower_32_bits(eq->dbrec_dma); + req.db_dma_addr_h = upper_32_bits(eq->dbrec_dma); - return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + false); } static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn) { struct erdma_eq *eq = &dev->ceqs[ceqn].eq; - u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT; int ret; - eq->qbuf = - dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), - &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); - if (!eq->qbuf) - return -ENOMEM; + ret = erdma_eq_common_init(dev, eq, ERDMA_DEFAULT_EQ_DEPTH); + if (ret) + return ret; - spin_lock_init(&eq->lock); - atomic64_set(&eq->event_num, 0); - atomic64_set(&eq->notify_num, 0); - - eq->depth = ERDMA_DEFAULT_EQ_DEPTH; eq->db = dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG + (ceqn + 1) * ERDMA_DB_SIZE; - eq->db_record = (u64 *)(eq->qbuf + buf_size); - eq->ci = 0; dev->ceqs[ceqn].dev = dev; + dev->ceqs[ceqn].ready = true; /* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */ ret = create_eq_cmd(dev, ceqn + 1, eq); - dev->ceqs[ceqn].ready = ret ? false : true; + if (ret) { + erdma_eq_destroy(dev, eq); + dev->ceqs[ceqn].ready = false; + } return ret; } @@ -259,7 +267,6 @@ static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn) static void erdma_ceq_uninit_one(struct erdma_dev *dev, u16 ceqn) { struct erdma_eq *eq = &dev->ceqs[ceqn].eq; - u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT; struct erdma_cmdq_destroy_eq_req req; int err; @@ -272,12 +279,12 @@ static void erdma_ceq_uninit_one(struct erdma_dev *dev, u16 ceqn) req.qtype = ERDMA_EQ_TYPE_CEQ; req.vector_idx = ceqn + 1; - err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + false); if (err) return; - dma_free_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), eq->qbuf, - eq->qbuf_dma_addr); + erdma_eq_destroy(dev, eq); } int erdma_ceqs_init(struct erdma_dev *dev) diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 3212a1222760..ea4db53901a4 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/types.h> +#include <linux/if_ether.h> /* PCIe device related definition. */ #define ERDMA_PCI_WIDTH 64 @@ -21,8 +22,21 @@ #define ERDMA_NUM_MSIX_VEC 32U #define ERDMA_MSIX_VECTOR_CMDQ 0 +/* RoCEv2 related */ +#define ERDMA_ROCEV2_GID_SIZE 16 +#define ERDMA_MAX_PKEYS 1 +#define ERDMA_DEFAULT_PKEY 0xFFFF + +/* erdma device protocol type */ +enum erdma_proto_type { + ERDMA_PROTO_IWARP = 0, + ERDMA_PROTO_ROCEV2 = 1, + ERDMA_PROTO_COUNT = 2, +}; + /* PCIe Bar0 Registers. */ #define ERDMA_REGS_VERSION_REG 0x0 +#define ERDMA_REGS_DEV_PROTO_REG 0xC #define ERDMA_REGS_DEV_CTRL_REG 0x10 #define ERDMA_REGS_DEV_ST_REG 0x14 #define ERDMA_REGS_NETDEV_MAC_L_REG 0x18 @@ -136,7 +150,11 @@ enum CMDQ_RDMA_OPCODE { CMDQ_OPCODE_DESTROY_CQ = 5, CMDQ_OPCODE_REFLUSH = 6, CMDQ_OPCODE_REG_MR = 8, - CMDQ_OPCODE_DEREG_MR = 9 + CMDQ_OPCODE_DEREG_MR = 9, + CMDQ_OPCODE_SET_GID = 14, + CMDQ_OPCODE_CREATE_AH = 15, + CMDQ_OPCODE_DESTROY_AH = 16, + CMDQ_OPCODE_QUERY_QP = 17, }; enum CMDQ_COMMON_OPCODE { @@ -240,7 +258,7 @@ struct erdma_cmdq_create_cq_req { u32 qbuf_addr_l; u32 qbuf_addr_h; u32 cfg1; - u64 cq_db_info_addr; + u64 cq_dbrec_dma; u32 first_page_offset; u32 cfg2; }; @@ -284,6 +302,36 @@ struct erdma_cmdq_dereg_mr_req { u32 cfg; }; +/* create_av cfg0 */ +#define ERDMA_CMD_CREATE_AV_FL_MASK GENMASK(19, 0) +#define ERDMA_CMD_CREATE_AV_NTYPE_MASK BIT(20) + +struct erdma_av_cfg { + u32 cfg0; + u8 traffic_class; + u8 hop_limit; + u8 sl; + u8 rsvd; + u16 udp_sport; + u16 sgid_index; + u8 dmac[ETH_ALEN]; + u8 padding[2]; + u8 dgid[ERDMA_ROCEV2_GID_SIZE]; +}; + +struct erdma_cmdq_create_ah_req { + u64 hdr; + u32 pdn; + u32 ahn; + struct erdma_av_cfg av_cfg; +}; + +struct erdma_cmdq_destroy_ah_req { + u64 hdr; + u32 pdn; + u32 ahn; +}; + /* modify qp cfg */ #define ERDMA_CMD_MODIFY_QP_STATE_MASK GENMASK(31, 24) #define ERDMA_CMD_MODIFY_QP_CC_MASK GENMASK(23, 20) @@ -301,6 +349,36 @@ struct erdma_cmdq_modify_qp_req { u32 recv_nxt; }; +/* modify qp cfg1 for roce device */ +#define ERDMA_CMD_MODIFY_QP_DQPN_MASK GENMASK(19, 0) + +struct erdma_cmdq_mod_qp_req_rocev2 { + u64 hdr; + u32 cfg0; + u32 cfg1; + u32 attr_mask; + u32 qkey; + u32 rq_psn; + u32 sq_psn; + struct erdma_av_cfg av_cfg; +}; + +/* query qp response mask */ +#define ERDMA_CMD_QUERY_QP_RESP_SQ_PSN_MASK GENMASK_ULL(23, 0) +#define ERDMA_CMD_QUERY_QP_RESP_RQ_PSN_MASK GENMASK_ULL(47, 24) +#define ERDMA_CMD_QUERY_QP_RESP_QP_STATE_MASK GENMASK_ULL(55, 48) +#define ERDMA_CMD_QUERY_QP_RESP_SQ_DRAINING_MASK GENMASK_ULL(56, 56) + +struct erdma_cmdq_query_qp_req_rocev2 { + u64 hdr; + u32 qpn; +}; + +enum erdma_qp_type { + ERDMA_QPT_RC = 0, + ERDMA_QPT_UD = 1, +}; + /* create qp cfg0 */ #define ERDMA_CMD_CREATE_QP_SQ_DEPTH_MASK GENMASK(31, 20) #define ERDMA_CMD_CREATE_QP_QPN_MASK GENMASK(19, 0) @@ -309,6 +387,9 @@ struct erdma_cmdq_modify_qp_req { #define ERDMA_CMD_CREATE_QP_RQ_DEPTH_MASK GENMASK(31, 20) #define ERDMA_CMD_CREATE_QP_PD_MASK GENMASK(19, 0) +/* create qp cfg2 */ +#define ERDMA_CMD_CREATE_QP_TYPE_MASK GENMASK(3, 0) + /* create qp cqn_mtt_cfg */ #define ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK GENMASK(31, 28) #define ERDMA_CMD_CREATE_QP_DB_CFG_MASK BIT(25) @@ -335,13 +416,14 @@ struct erdma_cmdq_create_qp_req { u64 rq_buf_addr; u32 sq_mtt_cfg; u32 rq_mtt_cfg; - u64 sq_db_info_dma_addr; - u64 rq_db_info_dma_addr; + u64 sq_dbrec_dma; + u64 rq_dbrec_dma; u64 sq_mtt_entry[3]; u64 rq_mtt_entry[3]; u32 db_cfg; + u32 cfg2; }; struct erdma_cmdq_destroy_qp_req { @@ -394,10 +476,33 @@ struct erdma_cmdq_query_stats_resp { u64 rx_pps_meter_drop_packets_cnt; }; +enum erdma_network_type { + ERDMA_NETWORK_TYPE_IPV4 = 0, + ERDMA_NETWORK_TYPE_IPV6 = 1, +}; + +enum erdma_set_gid_op { + ERDMA_SET_GID_OP_ADD = 0, + ERDMA_SET_GID_OP_DEL = 1, +}; + +/* set gid cfg */ +#define ERDMA_CMD_SET_GID_SGID_IDX_MASK GENMASK(15, 0) +#define ERDMA_CMD_SET_GID_NTYPE_MASK BIT(16) +#define ERDMA_CMD_SET_GID_OP_MASK BIT(31) + +struct erdma_cmdq_set_gid_req { + u64 hdr; + u32 cfg; + u8 gid[ERDMA_ROCEV2_GID_SIZE]; +}; + /* cap qword 0 definition */ +#define ERDMA_CMD_DEV_CAP_MAX_GID_MASK GENMASK_ULL(51, 48) #define ERDMA_CMD_DEV_CAP_MAX_CQE_MASK GENMASK_ULL(47, 40) #define ERDMA_CMD_DEV_CAP_FLAGS_MASK GENMASK_ULL(31, 24) #define ERDMA_CMD_DEV_CAP_MAX_RECV_WR_MASK GENMASK_ULL(23, 16) +#define ERDMA_CMD_DEV_CAP_MAX_AH_MASK GENMASK_ULL(15, 8) #define ERDMA_CMD_DEV_CAP_MAX_MR_SIZE_MASK GENMASK_ULL(7, 0) /* cap qword 1 definition */ @@ -426,6 +531,10 @@ enum { #define ERDMA_CQE_QTYPE_RQ 1 #define ERDMA_CQE_QTYPE_CMDQ 2 +#define ERDMA_CQE_NTYPE_MASK BIT(31) +#define ERDMA_CQE_SL_MASK GENMASK(27, 20) +#define ERDMA_CQE_SQPN_MASK GENMASK(19, 0) + struct erdma_cqe { __be32 hdr; __be32 qe_idx; @@ -435,7 +544,16 @@ struct erdma_cqe { __be32 inv_rkey; }; __be32 size; - __be32 rsvd[3]; + union { + struct { + __be32 rsvd[3]; + } rc; + + struct { + __be32 rsvd[2]; + __be32 info; + } ud; + }; }; struct erdma_sge { @@ -487,7 +605,7 @@ struct erdma_write_sqe { struct erdma_sge sgl[]; }; -struct erdma_send_sqe { +struct erdma_send_sqe_rc { __le64 hdr; union { __be32 imm_data; @@ -498,6 +616,17 @@ struct erdma_send_sqe { struct erdma_sge sgl[]; }; +struct erdma_send_sqe_ud { + __le64 hdr; + __be32 imm_data; + __le32 length; + __le32 qkey; + __le32 dst_qpn; + __le32 ahn; + __le32 rsvd; + struct erdma_sge sgl[]; +}; + struct erdma_readreq_sqe { __le64 hdr; __le32 invalid_stag; diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index 472939172f0c..f35b30235018 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -26,14 +26,6 @@ static int erdma_netdev_event(struct notifier_block *nb, unsigned long event, goto done; switch (event) { - case NETDEV_UP: - dev->state = IB_PORT_ACTIVE; - erdma_port_event(dev, IB_EVENT_PORT_ACTIVE); - break; - case NETDEV_DOWN: - dev->state = IB_PORT_DOWN; - erdma_port_event(dev, IB_EVENT_PORT_ERR); - break; case NETDEV_CHANGEMTU: if (dev->mtu != netdev->mtu) { erdma_set_mtu(dev, netdev->mtu); @@ -172,22 +164,34 @@ static int erdma_device_init(struct erdma_dev *dev, struct pci_dev *pdev) { int ret; + dev->proto = erdma_reg_read32(dev, ERDMA_REGS_DEV_PROTO_REG); + dev->resp_pool = dma_pool_create("erdma_resp_pool", &pdev->dev, ERDMA_HW_RESP_SIZE, ERDMA_HW_RESP_SIZE, 0); if (!dev->resp_pool) return -ENOMEM; + dev->db_pool = dma_pool_create("erdma_db_pool", &pdev->dev, + ERDMA_DB_SIZE, ERDMA_DB_SIZE, 0); + if (!dev->db_pool) { + ret = -ENOMEM; + goto destroy_resp_pool; + } + ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(ERDMA_PCI_WIDTH)); if (ret) - goto destroy_pool; + goto destroy_db_pool; dma_set_max_seg_size(&pdev->dev, UINT_MAX); return 0; -destroy_pool: +destroy_db_pool: + dma_pool_destroy(dev->db_pool); + +destroy_resp_pool: dma_pool_destroy(dev->resp_pool); return ret; @@ -195,6 +199,7 @@ destroy_pool: static void erdma_device_uninit(struct erdma_dev *dev) { + dma_pool_destroy(dev->db_pool); dma_pool_destroy(dev->resp_pool); } @@ -322,7 +327,7 @@ err_uninit_cmdq: erdma_cmdq_destroy(dev); err_uninit_aeq: - erdma_aeq_destroy(dev); + erdma_eq_destroy(dev, &dev->aeq); err_uninit_comm_irq: erdma_comm_irq_uninit(dev); @@ -355,7 +360,7 @@ static void erdma_remove_dev(struct pci_dev *pdev) erdma_ceqs_uninit(dev); erdma_hw_reset(dev); erdma_cmdq_destroy(dev); - erdma_aeq_destroy(dev); + erdma_eq_destroy(dev, &dev->aeq); erdma_comm_irq_uninit(dev); pci_free_irq_vectors(dev->pdev); erdma_device_uninit(dev); @@ -379,7 +384,7 @@ static int erdma_dev_attrs_init(struct erdma_dev *dev) CMDQ_OPCODE_QUERY_DEVICE); err = erdma_post_cmd_wait(&dev->cmdq, &req_hdr, sizeof(req_hdr), &cap0, - &cap1); + &cap1, true); if (err) return err; @@ -387,6 +392,8 @@ static int erdma_dev_attrs_init(struct erdma_dev *dev) dev->attrs.max_mr_size = 1ULL << ERDMA_GET_CAP(MAX_MR_SIZE, cap0); dev->attrs.max_mw = 1 << ERDMA_GET_CAP(MAX_MW, cap1); dev->attrs.max_recv_wr = 1 << ERDMA_GET_CAP(MAX_RECV_WR, cap0); + dev->attrs.max_gid = 1 << ERDMA_GET_CAP(MAX_GID, cap0); + dev->attrs.max_ah = 1 << ERDMA_GET_CAP(MAX_AH, cap0); dev->attrs.local_dma_key = ERDMA_GET_CAP(DMA_LOCAL_KEY, cap1); dev->attrs.cc = ERDMA_GET_CAP(DEFAULT_CC, cap1); dev->attrs.max_qp = ERDMA_NQP_PER_QBLOCK * ERDMA_GET_CAP(QBLOCK, cap1); @@ -404,12 +411,13 @@ static int erdma_dev_attrs_init(struct erdma_dev *dev) dev->res_cb[ERDMA_RES_TYPE_PD].max_cap = ERDMA_MAX_PD; dev->res_cb[ERDMA_RES_TYPE_STAG_IDX].max_cap = dev->attrs.max_mr; + dev->res_cb[ERDMA_RES_TYPE_AH].max_cap = dev->attrs.max_ah; erdma_cmdq_build_reqhdr(&req_hdr, CMDQ_SUBMOD_COMMON, CMDQ_OPCODE_QUERY_FW_INFO); err = erdma_post_cmd_wait(&dev->cmdq, &req_hdr, sizeof(req_hdr), &cap0, - &cap1); + &cap1, true); if (!err) dev->attrs.fw_version = FIELD_GET(ERDMA_CMD_INFO0_FW_VER_MASK, cap0); @@ -430,7 +438,8 @@ static int erdma_device_config(struct erdma_dev *dev) req.cfg = FIELD_PREP(ERDMA_CMD_CONFIG_DEVICE_PGSHIFT_MASK, PAGE_SHIFT) | FIELD_PREP(ERDMA_CMD_CONFIG_DEVICE_PS_EN_MASK, 1); - return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); } static int erdma_res_cb_init(struct erdma_dev *dev) @@ -463,6 +472,29 @@ static void erdma_res_cb_free(struct erdma_dev *dev) bitmap_free(dev->res_cb[i].bitmap); } +static const struct ib_device_ops erdma_device_ops_rocev2 = { + .get_link_layer = erdma_get_link_layer, + .add_gid = erdma_add_gid, + .del_gid = erdma_del_gid, + .query_pkey = erdma_query_pkey, + .create_ah = erdma_create_ah, + .destroy_ah = erdma_destroy_ah, + .query_ah = erdma_query_ah, + + INIT_RDMA_OBJ_SIZE(ib_ah, erdma_ah, ibah), +}; + +static const struct ib_device_ops erdma_device_ops_iwarp = { + .iw_accept = erdma_accept, + .iw_add_ref = erdma_qp_get_ref, + .iw_connect = erdma_connect, + .iw_create_listen = erdma_create_listen, + .iw_destroy_listen = erdma_destroy_listen, + .iw_get_qp = erdma_get_ibqp, + .iw_reject = erdma_reject, + .iw_rem_ref = erdma_qp_put_ref, +}; + static const struct ib_device_ops erdma_device_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_ERDMA, @@ -479,21 +511,13 @@ static const struct ib_device_ops erdma_device_ops = { .dereg_mr = erdma_dereg_mr, .destroy_cq = erdma_destroy_cq, .destroy_qp = erdma_destroy_qp, + .disassociate_ucontext = erdma_disassociate_ucontext, .get_dma_mr = erdma_get_dma_mr, .get_hw_stats = erdma_get_hw_stats, .get_port_immutable = erdma_get_port_immutable, - .iw_accept = erdma_accept, - .iw_add_ref = erdma_qp_get_ref, - .iw_connect = erdma_connect, - .iw_create_listen = erdma_create_listen, - .iw_destroy_listen = erdma_destroy_listen, - .iw_get_qp = erdma_get_ibqp, - .iw_reject = erdma_reject, - .iw_rem_ref = erdma_qp_put_ref, .map_mr_sg = erdma_map_mr_sg, .mmap = erdma_mmap, .mmap_free = erdma_mmap_free, - .modify_qp = erdma_modify_qp, .post_recv = erdma_post_recv, .post_send = erdma_post_send, .poll_cq = erdma_poll_cq, @@ -503,6 +527,7 @@ static const struct ib_device_ops erdma_device_ops = { .query_qp = erdma_query_qp, .req_notify_cq = erdma_req_notify_cq, .reg_user_mr = erdma_reg_user_mr, + .modify_qp = erdma_modify_qp, INIT_RDMA_OBJ_SIZE(ib_cq, erdma_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, erdma_pd, ibpd), @@ -525,7 +550,14 @@ static int erdma_ib_device_add(struct pci_dev *pdev) if (ret) return ret; - ibdev->node_type = RDMA_NODE_RNIC; + if (erdma_device_iwarp(dev)) { + ibdev->node_type = RDMA_NODE_RNIC; + ib_set_device_ops(ibdev, &erdma_device_ops_iwarp); + } else { + ibdev->node_type = RDMA_NODE_IB_CA; + ib_set_device_ops(ibdev, &erdma_device_ops_rocev2); + } + memcpy(ibdev->node_desc, ERDMA_NODE_DESC, sizeof(ERDMA_NODE_DESC)); /* diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c index 6d0330badd68..25f6c49aec77 100644 --- a/drivers/infiniband/hw/erdma/erdma_qp.c +++ b/drivers/infiniband/hw/erdma/erdma_qp.c @@ -11,20 +11,20 @@ void erdma_qp_llp_close(struct erdma_qp *qp) { - struct erdma_qp_attrs qp_attrs; + struct erdma_mod_qp_params_iwarp params; down_write(&qp->state_lock); - switch (qp->attrs.state) { - case ERDMA_QP_STATE_RTS: - case ERDMA_QP_STATE_RTR: - case ERDMA_QP_STATE_IDLE: - case ERDMA_QP_STATE_TERMINATE: - qp_attrs.state = ERDMA_QP_STATE_CLOSING; - erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE); + switch (qp->attrs.iwarp.state) { + case ERDMA_QPS_IWARP_RTS: + case ERDMA_QPS_IWARP_RTR: + case ERDMA_QPS_IWARP_IDLE: + case ERDMA_QPS_IWARP_TERMINATE: + params.state = ERDMA_QPS_IWARP_CLOSING; + erdma_modify_qp_state_iwarp(qp, ¶ms, ERDMA_QPA_IWARP_STATE); break; - case ERDMA_QP_STATE_CLOSING: - qp->attrs.state = ERDMA_QP_STATE_IDLE; + case ERDMA_QPS_IWARP_CLOSING: + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_IDLE; break; default: break; @@ -48,9 +48,10 @@ struct ib_qp *erdma_get_ibqp(struct ib_device *ibdev, int id) return NULL; } -static int erdma_modify_qp_state_to_rts(struct erdma_qp *qp, - struct erdma_qp_attrs *attrs, - enum erdma_qp_attr_mask mask) +static int +erdma_modify_qp_state_to_rts(struct erdma_qp *qp, + struct erdma_mod_qp_params_iwarp *params, + enum erdma_qpa_mask_iwarp mask) { int ret; struct erdma_dev *dev = qp->dev; @@ -59,12 +60,15 @@ static int erdma_modify_qp_state_to_rts(struct erdma_qp *qp, struct erdma_cep *cep = qp->cep; struct sockaddr_storage local_addr, remote_addr; - if (!(mask & ERDMA_QP_ATTR_LLP_HANDLE)) + if (!(mask & ERDMA_QPA_IWARP_LLP_HANDLE)) return -EINVAL; - if (!(mask & ERDMA_QP_ATTR_MPA)) + if (!(mask & ERDMA_QPA_IWARP_MPA)) return -EINVAL; + if (!(mask & ERDMA_QPA_IWARP_CC)) + params->cc = qp->attrs.cc; + ret = getname_local(cep->sock, &local_addr); if (ret < 0) return ret; @@ -73,18 +77,16 @@ static int erdma_modify_qp_state_to_rts(struct erdma_qp *qp, if (ret < 0) return ret; - qp->attrs.state = ERDMA_QP_STATE_RTS; - tp = tcp_sk(qp->cep->sock->sk); erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_MODIFY_QP); - req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, qp->attrs.state) | - FIELD_PREP(ERDMA_CMD_MODIFY_QP_CC_MASK, qp->attrs.cc) | + req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, params->state) | + FIELD_PREP(ERDMA_CMD_MODIFY_QP_CC_MASK, params->cc) | FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp)); - req.cookie = be32_to_cpu(qp->cep->mpa.ext_data.cookie); + req.cookie = be32_to_cpu(cep->mpa.ext_data.cookie); req.dip = to_sockaddr_in(remote_addr).sin_addr.s_addr; req.sip = to_sockaddr_in(local_addr).sin_addr.s_addr; req.dport = to_sockaddr_in(remote_addr).sin_port; @@ -92,33 +94,57 @@ static int erdma_modify_qp_state_to_rts(struct erdma_qp *qp, req.send_nxt = tp->snd_nxt; /* rsvd tcp seq for mpa-rsp in server. */ - if (qp->attrs.qp_type == ERDMA_QP_PASSIVE) - req.send_nxt += MPA_DEFAULT_HDR_LEN + qp->attrs.pd_len; + if (params->qp_type == ERDMA_QP_PASSIVE) + req.send_nxt += MPA_DEFAULT_HDR_LEN + params->pd_len; req.recv_nxt = tp->rcv_nxt; - return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); + if (ret) + return ret; + + if (mask & ERDMA_QPA_IWARP_IRD) + qp->attrs.irq_size = params->irq_size; + + if (mask & ERDMA_QPA_IWARP_ORD) + qp->attrs.orq_size = params->orq_size; + + if (mask & ERDMA_QPA_IWARP_CC) + qp->attrs.cc = params->cc; + + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_RTS; + + return 0; } -static int erdma_modify_qp_state_to_stop(struct erdma_qp *qp, - struct erdma_qp_attrs *attrs, - enum erdma_qp_attr_mask mask) +static int +erdma_modify_qp_state_to_stop(struct erdma_qp *qp, + struct erdma_mod_qp_params_iwarp *params, + enum erdma_qpa_mask_iwarp mask) { struct erdma_dev *dev = qp->dev; struct erdma_cmdq_modify_qp_req req; - - qp->attrs.state = attrs->state; + int ret; erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_MODIFY_QP); - req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, attrs->state) | + req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, params->state) | FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp)); - return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); + if (ret) + return ret; + + qp->attrs.iwarp.state = params->state; + + return 0; } -int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, - enum erdma_qp_attr_mask mask) +int erdma_modify_qp_state_iwarp(struct erdma_qp *qp, + struct erdma_mod_qp_params_iwarp *params, + int mask) { bool need_reflush = false; int drop_conn, ret = 0; @@ -126,31 +152,31 @@ int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, if (!mask) return 0; - if (!(mask & ERDMA_QP_ATTR_STATE)) + if (!(mask & ERDMA_QPA_IWARP_STATE)) return 0; - switch (qp->attrs.state) { - case ERDMA_QP_STATE_IDLE: - case ERDMA_QP_STATE_RTR: - if (attrs->state == ERDMA_QP_STATE_RTS) { - ret = erdma_modify_qp_state_to_rts(qp, attrs, mask); - } else if (attrs->state == ERDMA_QP_STATE_ERROR) { - qp->attrs.state = ERDMA_QP_STATE_ERROR; + switch (qp->attrs.iwarp.state) { + case ERDMA_QPS_IWARP_IDLE: + case ERDMA_QPS_IWARP_RTR: + if (params->state == ERDMA_QPS_IWARP_RTS) { + ret = erdma_modify_qp_state_to_rts(qp, params, mask); + } else if (params->state == ERDMA_QPS_IWARP_ERROR) { + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_ERROR; need_reflush = true; if (qp->cep) { erdma_cep_put(qp->cep); qp->cep = NULL; } - ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); + ret = erdma_modify_qp_state_to_stop(qp, params, mask); } break; - case ERDMA_QP_STATE_RTS: + case ERDMA_QPS_IWARP_RTS: drop_conn = 0; - if (attrs->state == ERDMA_QP_STATE_CLOSING || - attrs->state == ERDMA_QP_STATE_TERMINATE || - attrs->state == ERDMA_QP_STATE_ERROR) { - ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); + if (params->state == ERDMA_QPS_IWARP_CLOSING || + params->state == ERDMA_QPS_IWARP_TERMINATE || + params->state == ERDMA_QPS_IWARP_ERROR) { + ret = erdma_modify_qp_state_to_stop(qp, params, mask); drop_conn = 1; need_reflush = true; } @@ -159,17 +185,17 @@ int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, erdma_qp_cm_drop(qp); break; - case ERDMA_QP_STATE_TERMINATE: - if (attrs->state == ERDMA_QP_STATE_ERROR) - qp->attrs.state = ERDMA_QP_STATE_ERROR; + case ERDMA_QPS_IWARP_TERMINATE: + if (params->state == ERDMA_QPS_IWARP_ERROR) + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_ERROR; break; - case ERDMA_QP_STATE_CLOSING: - if (attrs->state == ERDMA_QP_STATE_IDLE) { - qp->attrs.state = ERDMA_QP_STATE_IDLE; - } else if (attrs->state == ERDMA_QP_STATE_ERROR) { - ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); - qp->attrs.state = ERDMA_QP_STATE_ERROR; - } else if (attrs->state != ERDMA_QP_STATE_CLOSING) { + case ERDMA_QPS_IWARP_CLOSING: + if (params->state == ERDMA_QPS_IWARP_IDLE) { + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_IDLE; + } else if (params->state == ERDMA_QPS_IWARP_ERROR) { + ret = erdma_modify_qp_state_to_stop(qp, params, mask); + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_ERROR; + } else if (params->state != ERDMA_QPS_IWARP_CLOSING) { return -ECONNABORTED; } break; @@ -186,6 +212,98 @@ int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, return ret; } +static int modify_qp_cmd_rocev2(struct erdma_qp *qp, + struct erdma_mod_qp_params_rocev2 *params, + enum erdma_qpa_mask_rocev2 attr_mask) +{ + struct erdma_cmdq_mod_qp_req_rocev2 req; + + memset(&req, 0, sizeof(req)); + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_MODIFY_QP); + + req.cfg0 = FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp)); + + if (attr_mask & ERDMA_QPA_ROCEV2_STATE) + req.cfg0 |= FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, + params->state); + + if (attr_mask & ERDMA_QPA_ROCEV2_DST_QPN) + req.cfg1 = FIELD_PREP(ERDMA_CMD_MODIFY_QP_DQPN_MASK, + params->dst_qpn); + + if (attr_mask & ERDMA_QPA_ROCEV2_QKEY) + req.qkey = params->qkey; + + if (attr_mask & ERDMA_QPA_ROCEV2_AV) + erdma_set_av_cfg(&req.av_cfg, ¶ms->av); + + if (attr_mask & ERDMA_QPA_ROCEV2_SQ_PSN) + req.sq_psn = params->sq_psn; + + if (attr_mask & ERDMA_QPA_ROCEV2_RQ_PSN) + req.rq_psn = params->rq_psn; + + req.attr_mask = attr_mask; + + return erdma_post_cmd_wait(&qp->dev->cmdq, &req, sizeof(req), NULL, + NULL, true); +} + +static void erdma_reset_qp(struct erdma_qp *qp) +{ + qp->kern_qp.sq_pi = 0; + qp->kern_qp.sq_ci = 0; + qp->kern_qp.rq_pi = 0; + qp->kern_qp.rq_ci = 0; + memset(qp->kern_qp.swr_tbl, 0, qp->attrs.sq_size * sizeof(u64)); + memset(qp->kern_qp.rwr_tbl, 0, qp->attrs.rq_size * sizeof(u64)); + memset(qp->kern_qp.sq_buf, 0, qp->attrs.sq_size << SQEBB_SHIFT); + memset(qp->kern_qp.rq_buf, 0, qp->attrs.rq_size << RQE_SHIFT); + erdma_remove_cqes_of_qp(&qp->scq->ibcq, QP_ID(qp)); + if (qp->rcq != qp->scq) + erdma_remove_cqes_of_qp(&qp->rcq->ibcq, QP_ID(qp)); +} + +int erdma_modify_qp_state_rocev2(struct erdma_qp *qp, + struct erdma_mod_qp_params_rocev2 *params, + int attr_mask) +{ + struct erdma_dev *dev = to_edev(qp->ibqp.device); + int ret; + + ret = modify_qp_cmd_rocev2(qp, params, attr_mask); + if (ret) + return ret; + + if (attr_mask & ERDMA_QPA_ROCEV2_STATE) + qp->attrs.rocev2.state = params->state; + + if (attr_mask & ERDMA_QPA_ROCEV2_QKEY) + qp->attrs.rocev2.qkey = params->qkey; + + if (attr_mask & ERDMA_QPA_ROCEV2_DST_QPN) + qp->attrs.rocev2.dst_qpn = params->dst_qpn; + + if (attr_mask & ERDMA_QPA_ROCEV2_AV) + memcpy(&qp->attrs.rocev2.av, ¶ms->av, + sizeof(struct erdma_av)); + + if (rdma_is_kernel_res(&qp->ibqp.res) && + params->state == ERDMA_QPS_ROCEV2_RESET) + erdma_reset_qp(qp); + + if (rdma_is_kernel_res(&qp->ibqp.res) && + params->state == ERDMA_QPS_ROCEV2_ERROR) { + qp->flags |= ERDMA_QP_IN_FLUSHING; + mod_delayed_work(dev->reflush_wq, &qp->reflush_dwork, + usecs_to_jiffies(100)); + } + + return 0; +} + static void erdma_qp_safe_free(struct kref *ref) { struct erdma_qp *qp = container_of(ref, struct erdma_qp, ref); @@ -282,17 +400,57 @@ static int fill_sgl(struct erdma_qp *qp, const struct ib_send_wr *send_wr, return 0; } +static void init_send_sqe_rc(struct erdma_qp *qp, struct erdma_send_sqe_rc *sqe, + const struct ib_send_wr *wr, u32 *hw_op) +{ + u32 op = ERDMA_OP_SEND; + + if (wr->opcode == IB_WR_SEND_WITH_IMM) { + op = ERDMA_OP_SEND_WITH_IMM; + sqe->imm_data = wr->ex.imm_data; + } else if (wr->opcode == IB_WR_SEND_WITH_INV) { + op = ERDMA_OP_SEND_WITH_INV; + sqe->invalid_stag = cpu_to_le32(wr->ex.invalidate_rkey); + } + + *hw_op = op; +} + +static void init_send_sqe_ud(struct erdma_qp *qp, struct erdma_send_sqe_ud *sqe, + const struct ib_send_wr *wr, u32 *hw_op) +{ + const struct ib_ud_wr *uwr = ud_wr(wr); + struct erdma_ah *ah = to_eah(uwr->ah); + u32 op = ERDMA_OP_SEND; + + if (wr->opcode == IB_WR_SEND_WITH_IMM) { + op = ERDMA_OP_SEND_WITH_IMM; + sqe->imm_data = wr->ex.imm_data; + } + + *hw_op = op; + + sqe->ahn = cpu_to_le32(ah->ahn); + sqe->dst_qpn = cpu_to_le32(uwr->remote_qpn); + /* Not allowed to send control qkey */ + if (uwr->remote_qkey & 0x80000000) + sqe->qkey = cpu_to_le32(qp->attrs.rocev2.qkey); + else + sqe->qkey = cpu_to_le32(uwr->remote_qkey); +} + static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, const struct ib_send_wr *send_wr) { u32 wqe_size, wqebb_cnt, hw_op, flags, sgl_offset; u32 idx = *pi & (qp->attrs.sq_size - 1); enum ib_wr_opcode op = send_wr->opcode; + struct erdma_send_sqe_rc *rc_send_sqe; + struct erdma_send_sqe_ud *ud_send_sqe; struct erdma_atomic_sqe *atomic_sqe; struct erdma_readreq_sqe *read_sqe; struct erdma_reg_mr_sqe *regmr_sge; struct erdma_write_sqe *write_sqe; - struct erdma_send_sqe *send_sqe; struct ib_rdma_wr *rdma_wr; struct erdma_sge *sge; __le32 *length_field; @@ -301,6 +459,10 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, u32 attrs; int ret; + if (qp->ibqp.qp_type != IB_QPT_RC && send_wr->opcode != IB_WR_SEND && + send_wr->opcode != IB_WR_SEND_WITH_IMM) + return -EINVAL; + entry = get_queue_entry(qp->kern_qp.sq_buf, idx, qp->attrs.sq_size, SQEBB_SHIFT); @@ -374,21 +536,20 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, case IB_WR_SEND: case IB_WR_SEND_WITH_IMM: case IB_WR_SEND_WITH_INV: - send_sqe = (struct erdma_send_sqe *)entry; - hw_op = ERDMA_OP_SEND; - if (op == IB_WR_SEND_WITH_IMM) { - hw_op = ERDMA_OP_SEND_WITH_IMM; - send_sqe->imm_data = send_wr->ex.imm_data; - } else if (op == IB_WR_SEND_WITH_INV) { - hw_op = ERDMA_OP_SEND_WITH_INV; - send_sqe->invalid_stag = - cpu_to_le32(send_wr->ex.invalidate_rkey); + if (qp->ibqp.qp_type == IB_QPT_RC) { + rc_send_sqe = (struct erdma_send_sqe_rc *)entry; + init_send_sqe_rc(qp, rc_send_sqe, send_wr, &hw_op); + length_field = &rc_send_sqe->length; + wqe_size = sizeof(struct erdma_send_sqe_rc); + } else { + ud_send_sqe = (struct erdma_send_sqe_ud *)entry; + init_send_sqe_ud(qp, ud_send_sqe, send_wr, &hw_op); + length_field = &ud_send_sqe->length; + wqe_size = sizeof(struct erdma_send_sqe_ud); } - wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op); - length_field = &send_sqe->length; - wqe_size = sizeof(struct erdma_send_sqe); - sgl_offset = wqe_size; + sgl_offset = wqe_size; + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op); break; case IB_WR_REG_MR: wqe_hdr |= @@ -492,7 +653,7 @@ static void kick_sq_db(struct erdma_qp *qp, u16 pi) u64 db_data = FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, QP_ID(qp)) | FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, pi); - *(u64 *)qp->kern_qp.sq_db_info = db_data; + *(u64 *)qp->kern_qp.sq_dbrec = db_data; writeq(db_data, qp->kern_qp.hw_sq_db); } @@ -557,7 +718,7 @@ static int erdma_post_recv_one(struct erdma_qp *qp, return -EINVAL; } - *(u64 *)qp->kern_qp.rq_db_info = *(u64 *)rqe; + *(u64 *)qp->kern_qp.rq_dbrec = *(u64 *)rqe; writeq(*(u64 *)rqe, qp->kern_qp.hw_rq_db); qp->kern_qp.rwr_tbl[qp->kern_qp.rq_pi & (qp->attrs.rq_size - 1)] = diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 23dfc01603f8..af36a8d2df22 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -55,6 +55,13 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp) ilog2(qp->attrs.rq_size)) | FIELD_PREP(ERDMA_CMD_CREATE_QP_PD_MASK, pd->pdn); + if (qp->ibqp.qp_type == IB_QPT_RC) + req.cfg2 = FIELD_PREP(ERDMA_CMD_CREATE_QP_TYPE_MASK, + ERDMA_QPT_RC); + else + req.cfg2 = FIELD_PREP(ERDMA_CMD_CREATE_QP_TYPE_MASK, + ERDMA_QPT_UD); + if (rdma_is_kernel_res(&qp->ibqp.res)) { u32 pgsz_range = ilog2(SZ_1M) - ERDMA_HW_PAGE_SHIFT; @@ -76,10 +83,8 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp) req.rq_buf_addr = qp->kern_qp.rq_buf_dma_addr; req.sq_buf_addr = qp->kern_qp.sq_buf_dma_addr; - req.sq_db_info_dma_addr = qp->kern_qp.sq_buf_dma_addr + - (qp->attrs.sq_size << SQEBB_SHIFT); - req.rq_db_info_dma_addr = qp->kern_qp.rq_buf_dma_addr + - (qp->attrs.rq_size << RQE_SHIFT); + req.sq_dbrec_dma = qp->kern_qp.sq_dbrec_dma; + req.rq_dbrec_dma = qp->kern_qp.rq_dbrec_dma; } else { user_qp = &qp->user_qp; req.sq_cqn_mtt_cfg = FIELD_PREP( @@ -107,8 +112,8 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp) assemble_qbuf_mtt_for_cmd(&user_qp->rq_mem, &req.rq_mtt_cfg, &req.rq_buf_addr, req.rq_mtt_entry); - req.sq_db_info_dma_addr = user_qp->sq_db_info_dma_addr; - req.rq_db_info_dma_addr = user_qp->rq_db_info_dma_addr; + req.sq_dbrec_dma = user_qp->sq_dbrec_dma; + req.rq_dbrec_dma = user_qp->rq_dbrec_dma; if (uctx->ext_db.enable) { req.sq_cqn_mtt_cfg |= @@ -121,10 +126,10 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp) } } - err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &resp0, - &resp1); - if (!err) - qp->attrs.cookie = + err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &resp0, &resp1, + true); + if (!err && erdma_device_iwarp(dev)) + qp->attrs.iwarp.cookie = FIELD_GET(ERDMA_CMDQ_CREATE_QP_RESP_COOKIE_MASK, resp0); return err; @@ -180,7 +185,8 @@ static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr) } post_cmd: - return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); } static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq) @@ -209,8 +215,7 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq) ERDMA_MR_MTT_0LEVEL); req.first_page_offset = 0; - req.cq_db_info_addr = - cq->kern_cq.qbuf_dma_addr + (cq->depth << CQE_SHIFT); + req.cq_dbrec_dma = cq->kern_cq.dbrec_dma; } else { mem = &cq->user_cq.qbuf_mem; req.cfg0 |= @@ -233,7 +238,7 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq) mem->mtt_nents); req.first_page_offset = mem->page_offset; - req.cq_db_info_addr = cq->user_cq.db_info_dma_addr; + req.cq_dbrec_dma = cq->user_cq.dbrec_dma; if (uctx->ext_db.enable) { req.cfg1 |= FIELD_PREP( @@ -243,7 +248,8 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq) } } - return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); } static int erdma_alloc_idx(struct erdma_resource_cb *res_cb) @@ -339,6 +345,11 @@ int erdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, attr->max_fast_reg_page_list_len = ERDMA_MAX_FRMR_PA; attr->page_size_cap = ERDMA_PAGE_SIZE_SUPPORT; + if (erdma_device_rocev2(dev)) { + attr->max_pkeys = ERDMA_MAX_PKEYS; + attr->max_ah = dev->attrs.max_ah; + } + if (dev->attrs.cap_flags & ERDMA_DEV_CAP_FLAGS_ATOMIC) attr->atomic_cap = IB_ATOMIC_GLOB; @@ -370,7 +381,14 @@ int erdma_query_port(struct ib_device *ibdev, u32 port, memset(attr, 0, sizeof(*attr)); - attr->gid_tbl_len = 1; + if (erdma_device_iwarp(dev)) { + attr->gid_tbl_len = 1; + } else { + attr->gid_tbl_len = dev->attrs.max_gid; + attr->ip_gids = true; + attr->pkey_tbl_len = ERDMA_MAX_PKEYS; + } + attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; attr->max_msg_sz = -1; @@ -380,14 +398,10 @@ int erdma_query_port(struct ib_device *ibdev, u32 port, ib_get_eth_speed(ibdev, port, &attr->active_speed, &attr->active_width); attr->max_mtu = ib_mtu_int_to_enum(ndev->mtu); attr->active_mtu = ib_mtu_int_to_enum(ndev->mtu); - if (netif_running(ndev) && netif_carrier_ok(ndev)) - dev->state = IB_PORT_ACTIVE; - else - dev->state = IB_PORT_DOWN; - attr->state = dev->state; + attr->state = ib_get_curr_port_state(ndev); out: - if (dev->state == IB_PORT_ACTIVE) + if (attr->state == IB_PORT_ACTIVE) attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; else attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; @@ -398,8 +412,18 @@ out: int erdma_get_port_immutable(struct ib_device *ibdev, u32 port, struct ib_port_immutable *port_immutable) { - port_immutable->gid_tbl_len = 1; - port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + struct erdma_dev *dev = to_edev(ibdev); + + if (erdma_device_iwarp(dev)) { + port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + port_immutable->gid_tbl_len = 1; + } else { + port_immutable->core_cap_flags = + RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + port_immutable->max_mad_size = IB_MGMT_MAD_SIZE; + port_immutable->gid_tbl_len = dev->attrs.max_gid; + port_immutable->pkey_tbl_len = ERDMA_MAX_PKEYS; + } return 0; } @@ -441,7 +465,8 @@ static void erdma_flush_worker(struct work_struct *work) req.qpn = QP_ID(qp); req.sq_pi = qp->kern_qp.sq_pi; req.rq_pi = qp->kern_qp.rq_pi; - erdma_post_cmd_wait(&qp->dev->cmdq, &req, sizeof(req), NULL, NULL); + erdma_post_cmd_wait(&qp->dev->cmdq, &req, sizeof(req), NULL, NULL, + true); } static int erdma_qp_validate_cap(struct erdma_dev *dev, @@ -462,7 +487,11 @@ static int erdma_qp_validate_cap(struct erdma_dev *dev, static int erdma_qp_validate_attr(struct erdma_dev *dev, struct ib_qp_init_attr *attrs) { - if (attrs->qp_type != IB_QPT_RC) + if (erdma_device_iwarp(dev) && attrs->qp_type != IB_QPT_RC) + return -EOPNOTSUPP; + + if (erdma_device_rocev2(dev) && attrs->qp_type != IB_QPT_RC && + attrs->qp_type != IB_QPT_UD && attrs->qp_type != IB_QPT_GSI) return -EOPNOTSUPP; if (attrs->srq) @@ -482,16 +511,24 @@ static void free_kernel_qp(struct erdma_qp *qp) vfree(qp->kern_qp.rwr_tbl); if (qp->kern_qp.sq_buf) - dma_free_coherent( - &dev->pdev->dev, - WARPPED_BUFSIZE(qp->attrs.sq_size << SQEBB_SHIFT), - qp->kern_qp.sq_buf, qp->kern_qp.sq_buf_dma_addr); + dma_free_coherent(&dev->pdev->dev, + qp->attrs.sq_size << SQEBB_SHIFT, + qp->kern_qp.sq_buf, + qp->kern_qp.sq_buf_dma_addr); + + if (qp->kern_qp.sq_dbrec) + dma_pool_free(dev->db_pool, qp->kern_qp.sq_dbrec, + qp->kern_qp.sq_dbrec_dma); if (qp->kern_qp.rq_buf) - dma_free_coherent( - &dev->pdev->dev, - WARPPED_BUFSIZE(qp->attrs.rq_size << RQE_SHIFT), - qp->kern_qp.rq_buf, qp->kern_qp.rq_buf_dma_addr); + dma_free_coherent(&dev->pdev->dev, + qp->attrs.rq_size << RQE_SHIFT, + qp->kern_qp.rq_buf, + qp->kern_qp.rq_buf_dma_addr); + + if (qp->kern_qp.rq_dbrec) + dma_pool_free(dev->db_pool, qp->kern_qp.rq_dbrec, + qp->kern_qp.rq_dbrec_dma); } static int init_kernel_qp(struct erdma_dev *dev, struct erdma_qp *qp, @@ -516,20 +553,27 @@ static int init_kernel_qp(struct erdma_dev *dev, struct erdma_qp *qp, if (!kqp->swr_tbl || !kqp->rwr_tbl) goto err_out; - size = (qp->attrs.sq_size << SQEBB_SHIFT) + ERDMA_EXTRA_BUFFER_SIZE; + size = qp->attrs.sq_size << SQEBB_SHIFT; kqp->sq_buf = dma_alloc_coherent(&dev->pdev->dev, size, &kqp->sq_buf_dma_addr, GFP_KERNEL); if (!kqp->sq_buf) goto err_out; - size = (qp->attrs.rq_size << RQE_SHIFT) + ERDMA_EXTRA_BUFFER_SIZE; + kqp->sq_dbrec = + dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &kqp->sq_dbrec_dma); + if (!kqp->sq_dbrec) + goto err_out; + + size = qp->attrs.rq_size << RQE_SHIFT; kqp->rq_buf = dma_alloc_coherent(&dev->pdev->dev, size, &kqp->rq_buf_dma_addr, GFP_KERNEL); if (!kqp->rq_buf) goto err_out; - kqp->sq_db_info = kqp->sq_buf + (qp->attrs.sq_size << SQEBB_SHIFT); - kqp->rq_db_info = kqp->rq_buf + (qp->attrs.rq_size << RQE_SHIFT); + kqp->rq_dbrec = + dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &kqp->rq_dbrec_dma); + if (!kqp->rq_dbrec) + goto err_out; return 0; @@ -864,9 +908,9 @@ erdma_unmap_user_dbrecords(struct erdma_ucontext *ctx, } static int init_user_qp(struct erdma_qp *qp, struct erdma_ucontext *uctx, - u64 va, u32 len, u64 db_info_va) + u64 va, u32 len, u64 dbrec_va) { - dma_addr_t db_info_dma_addr; + dma_addr_t dbrec_dma; u32 rq_offset; int ret; @@ -889,14 +933,14 @@ static int init_user_qp(struct erdma_qp *qp, struct erdma_ucontext *uctx, if (ret) goto put_sq_mtt; - ret = erdma_map_user_dbrecords(uctx, db_info_va, + ret = erdma_map_user_dbrecords(uctx, dbrec_va, &qp->user_qp.user_dbr_page, - &db_info_dma_addr); + &dbrec_dma); if (ret) goto put_rq_mtt; - qp->user_qp.sq_db_info_dma_addr = db_info_dma_addr; - qp->user_qp.rq_db_info_dma_addr = db_info_dma_addr + ERDMA_DB_SIZE; + qp->user_qp.sq_dbrec_dma = dbrec_dma; + qp->user_qp.rq_dbrec_dma = dbrec_dma + ERDMA_DB_SIZE; return 0; @@ -925,7 +969,8 @@ int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, udata, struct erdma_ucontext, ibucontext); struct erdma_ureq_create_qp ureq; struct erdma_uresp_create_qp uresp; - int ret; + void *old_entry; + int ret = 0; ret = erdma_qp_validate_cap(dev, attrs); if (ret) @@ -944,9 +989,16 @@ int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, kref_init(&qp->ref); init_completion(&qp->safe_free); - ret = xa_alloc_cyclic(&dev->qp_xa, &qp->ibqp.qp_num, qp, - XA_LIMIT(1, dev->attrs.max_qp - 1), - &dev->next_alloc_qpn, GFP_KERNEL); + if (qp->ibqp.qp_type == IB_QPT_GSI) { + old_entry = xa_store(&dev->qp_xa, 1, qp, GFP_KERNEL); + if (xa_is_err(old_entry)) + ret = xa_err(old_entry); + } else { + ret = xa_alloc_cyclic(&dev->qp_xa, &qp->ibqp.qp_num, qp, + XA_LIMIT(1, dev->attrs.max_qp - 1), + &dev->next_alloc_qpn, GFP_KERNEL); + } + if (ret < 0) { ret = -ENOMEM; goto err_out; @@ -983,7 +1035,12 @@ int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, qp->attrs.max_send_sge = attrs->cap.max_send_sge; qp->attrs.max_recv_sge = attrs->cap.max_recv_sge; - qp->attrs.state = ERDMA_QP_STATE_IDLE; + + if (erdma_device_iwarp(qp->dev)) + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_IDLE; + else + qp->attrs.rocev2.state = ERDMA_QPS_ROCEV2_RESET; + INIT_DELAYED_WORK(&qp->reflush_dwork, erdma_flush_worker); ret = create_qp_cmd(uctx, qp); @@ -1207,7 +1264,8 @@ int erdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) req.cfg = FIELD_PREP(ERDMA_CMD_MR_MPT_IDX_MASK, ibmr->lkey >> 8) | FIELD_PREP(ERDMA_CMD_MR_KEY_MASK, ibmr->lkey & 0xFF); - ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); if (ret) return ret; @@ -1232,14 +1290,16 @@ int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) CMDQ_OPCODE_DESTROY_CQ); req.cqn = cq->cqn; - err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); if (err) return err; if (rdma_is_kernel_res(&cq->ibcq.res)) { - dma_free_coherent(&dev->pdev->dev, - WARPPED_BUFSIZE(cq->depth << CQE_SHIFT), + dma_free_coherent(&dev->pdev->dev, cq->depth << CQE_SHIFT, cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr); + dma_pool_free(dev->db_pool, cq->kern_cq.dbrec, + cq->kern_cq.dbrec_dma); } else { erdma_unmap_user_dbrecords(ctx, &cq->user_cq.user_dbr_page); put_mtt_entries(dev, &cq->user_cq.qbuf_mem); @@ -1256,13 +1316,20 @@ int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) struct erdma_dev *dev = to_edev(ibqp->device); struct erdma_ucontext *ctx = rdma_udata_to_drv_context( udata, struct erdma_ucontext, ibucontext); - struct erdma_qp_attrs qp_attrs; - int err; struct erdma_cmdq_destroy_qp_req req; + union erdma_mod_qp_params params; + int err; down_write(&qp->state_lock); - qp_attrs.state = ERDMA_QP_STATE_ERROR; - erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE); + if (erdma_device_iwarp(dev)) { + params.iwarp.state = ERDMA_QPS_IWARP_ERROR; + erdma_modify_qp_state_iwarp(qp, ¶ms.iwarp, + ERDMA_QPA_IWARP_STATE); + } else { + params.rocev2.state = ERDMA_QPS_ROCEV2_ERROR; + erdma_modify_qp_state_rocev2(qp, ¶ms.rocev2, + ERDMA_QPA_ROCEV2_STATE); + } up_write(&qp->state_lock); cancel_delayed_work_sync(&qp->reflush_dwork); @@ -1271,7 +1338,8 @@ int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) CMDQ_OPCODE_DESTROY_QP); req.qpn = QP_ID(qp); - err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); if (err) return err; @@ -1279,16 +1347,7 @@ int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) wait_for_completion(&qp->safe_free); if (rdma_is_kernel_res(&qp->ibqp.res)) { - vfree(qp->kern_qp.swr_tbl); - vfree(qp->kern_qp.rwr_tbl); - dma_free_coherent( - &dev->pdev->dev, - WARPPED_BUFSIZE(qp->attrs.rq_size << RQE_SHIFT), - qp->kern_qp.rq_buf, qp->kern_qp.rq_buf_dma_addr); - dma_free_coherent( - &dev->pdev->dev, - WARPPED_BUFSIZE(qp->attrs.sq_size << SQEBB_SHIFT), - qp->kern_qp.sq_buf, qp->kern_qp.sq_buf_dma_addr); + free_kernel_qp(qp); } else { put_mtt_entries(dev, &qp->user_qp.sq_mem); put_mtt_entries(dev, &qp->user_qp.rq_mem); @@ -1378,7 +1437,8 @@ static int alloc_db_resources(struct erdma_dev *dev, struct erdma_ucontext *ctx, FIELD_PREP(ERDMA_CMD_EXT_DB_RQ_EN_MASK, 1) | FIELD_PREP(ERDMA_CMD_EXT_DB_SQ_EN_MASK, 1); - ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &val0, &val1); + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &val0, &val1, + true); if (ret) return ret; @@ -1413,7 +1473,8 @@ static void free_db_resources(struct erdma_dev *dev, struct erdma_ucontext *ctx) req.rdb_off = ctx->ext_db.rdb_off; req.cdb_off = ctx->ext_db.cdb_off; - ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); if (ret) ibdev_err_ratelimited(&dev->ibdev, "free db resources failed %d", ret); @@ -1502,49 +1563,248 @@ void erdma_dealloc_ucontext(struct ib_ucontext *ibctx) atomic_dec(&dev->num_ctx); } -static int ib_qp_state_to_erdma_qp_state[IB_QPS_ERR + 1] = { - [IB_QPS_RESET] = ERDMA_QP_STATE_IDLE, - [IB_QPS_INIT] = ERDMA_QP_STATE_IDLE, - [IB_QPS_RTR] = ERDMA_QP_STATE_RTR, - [IB_QPS_RTS] = ERDMA_QP_STATE_RTS, - [IB_QPS_SQD] = ERDMA_QP_STATE_CLOSING, - [IB_QPS_SQE] = ERDMA_QP_STATE_TERMINATE, - [IB_QPS_ERR] = ERDMA_QP_STATE_ERROR +static void erdma_attr_to_av(const struct rdma_ah_attr *ah_attr, + struct erdma_av *av, u16 sport) +{ + const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); + + av->port = rdma_ah_get_port_num(ah_attr); + av->sgid_index = grh->sgid_index; + av->hop_limit = grh->hop_limit; + av->traffic_class = grh->traffic_class; + av->sl = rdma_ah_get_sl(ah_attr); + + av->flow_label = grh->flow_label; + av->udp_sport = sport; + + ether_addr_copy(av->dmac, ah_attr->roce.dmac); + memcpy(av->dgid, grh->dgid.raw, ERDMA_ROCEV2_GID_SIZE); + + if (ipv6_addr_v4mapped((struct in6_addr *)&grh->dgid)) + av->ntype = ERDMA_NETWORK_TYPE_IPV4; + else + av->ntype = ERDMA_NETWORK_TYPE_IPV6; +} + +static void erdma_av_to_attr(struct erdma_av *av, struct rdma_ah_attr *ah_attr) +{ + ah_attr->type = RDMA_AH_ATTR_TYPE_ROCE; + + rdma_ah_set_sl(ah_attr, av->sl); + rdma_ah_set_port_num(ah_attr, av->port); + rdma_ah_set_ah_flags(ah_attr, IB_AH_GRH); + + rdma_ah_set_grh(ah_attr, NULL, av->flow_label, av->sgid_index, + av->hop_limit, av->traffic_class); + rdma_ah_set_dgid_raw(ah_attr, av->dgid); +} + +static int ib_qps_to_erdma_qps[ERDMA_PROTO_COUNT][IB_QPS_ERR + 1] = { + [ERDMA_PROTO_IWARP] = { + [IB_QPS_RESET] = ERDMA_QPS_IWARP_IDLE, + [IB_QPS_INIT] = ERDMA_QPS_IWARP_IDLE, + [IB_QPS_RTR] = ERDMA_QPS_IWARP_RTR, + [IB_QPS_RTS] = ERDMA_QPS_IWARP_RTS, + [IB_QPS_SQD] = ERDMA_QPS_IWARP_CLOSING, + [IB_QPS_SQE] = ERDMA_QPS_IWARP_TERMINATE, + [IB_QPS_ERR] = ERDMA_QPS_IWARP_ERROR, + }, + [ERDMA_PROTO_ROCEV2] = { + [IB_QPS_RESET] = ERDMA_QPS_ROCEV2_RESET, + [IB_QPS_INIT] = ERDMA_QPS_ROCEV2_INIT, + [IB_QPS_RTR] = ERDMA_QPS_ROCEV2_RTR, + [IB_QPS_RTS] = ERDMA_QPS_ROCEV2_RTS, + [IB_QPS_SQD] = ERDMA_QPS_ROCEV2_SQD, + [IB_QPS_SQE] = ERDMA_QPS_ROCEV2_SQE, + [IB_QPS_ERR] = ERDMA_QPS_ROCEV2_ERROR, + }, }; -int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, - struct ib_udata *udata) +static int erdma_qps_to_ib_qps[ERDMA_PROTO_COUNT][ERDMA_QPS_ROCEV2_COUNT] = { + [ERDMA_PROTO_IWARP] = { + [ERDMA_QPS_IWARP_IDLE] = IB_QPS_INIT, + [ERDMA_QPS_IWARP_RTR] = IB_QPS_RTR, + [ERDMA_QPS_IWARP_RTS] = IB_QPS_RTS, + [ERDMA_QPS_IWARP_CLOSING] = IB_QPS_ERR, + [ERDMA_QPS_IWARP_TERMINATE] = IB_QPS_ERR, + [ERDMA_QPS_IWARP_ERROR] = IB_QPS_ERR, + }, + [ERDMA_PROTO_ROCEV2] = { + [ERDMA_QPS_ROCEV2_RESET] = IB_QPS_RESET, + [ERDMA_QPS_ROCEV2_INIT] = IB_QPS_INIT, + [ERDMA_QPS_ROCEV2_RTR] = IB_QPS_RTR, + [ERDMA_QPS_ROCEV2_RTS] = IB_QPS_RTS, + [ERDMA_QPS_ROCEV2_SQD] = IB_QPS_SQD, + [ERDMA_QPS_ROCEV2_SQE] = IB_QPS_SQE, + [ERDMA_QPS_ROCEV2_ERROR] = IB_QPS_ERR, + }, +}; + +static inline enum erdma_qps_iwarp ib_to_iwarp_qps(enum ib_qp_state state) { - struct erdma_qp_attrs new_attrs; - enum erdma_qp_attr_mask erdma_attr_mask = 0; - struct erdma_qp *qp = to_eqp(ibqp); - int ret = 0; + return ib_qps_to_erdma_qps[ERDMA_PROTO_IWARP][state]; +} - if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS) - return -EOPNOTSUPP; +static inline enum erdma_qps_rocev2 ib_to_rocev2_qps(enum ib_qp_state state) +{ + return ib_qps_to_erdma_qps[ERDMA_PROTO_ROCEV2][state]; +} + +static inline enum ib_qp_state iwarp_to_ib_qps(enum erdma_qps_iwarp state) +{ + return erdma_qps_to_ib_qps[ERDMA_PROTO_IWARP][state]; +} + +static inline enum ib_qp_state rocev2_to_ib_qps(enum erdma_qps_rocev2 state) +{ + return erdma_qps_to_ib_qps[ERDMA_PROTO_ROCEV2][state]; +} + +static int erdma_check_qp_attrs(struct erdma_qp *qp, struct ib_qp_attr *attr, + int attr_mask) +{ + enum ib_qp_state cur_state, nxt_state; + struct erdma_dev *dev = qp->dev; + int ret = -EINVAL; + + if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS) { + ret = -EOPNOTSUPP; + goto out; + } + + if ((attr_mask & IB_QP_PORT) && + !rdma_is_port_valid(&dev->ibdev, attr->port_num)) + goto out; + + if (erdma_device_rocev2(dev)) { + cur_state = (attr_mask & IB_QP_CUR_STATE) ? + attr->cur_qp_state : + rocev2_to_ib_qps(qp->attrs.rocev2.state); + + nxt_state = (attr_mask & IB_QP_STATE) ? attr->qp_state : + cur_state; + + if (!ib_modify_qp_is_ok(cur_state, nxt_state, qp->ibqp.qp_type, + attr_mask)) + goto out; + + if ((attr_mask & IB_QP_AV) && + erdma_check_gid_attr( + rdma_ah_read_grh(&attr->ah_attr)->sgid_attr)) + goto out; + + if ((attr_mask & IB_QP_PKEY_INDEX) && + attr->pkey_index >= ERDMA_MAX_PKEYS) + goto out; + } + + return 0; + +out: + return ret; +} + +static void erdma_init_mod_qp_params_rocev2( + struct erdma_qp *qp, struct erdma_mod_qp_params_rocev2 *params, + int *erdma_attr_mask, struct ib_qp_attr *attr, int ib_attr_mask) +{ + enum erdma_qpa_mask_rocev2 to_modify_attrs = 0; + enum erdma_qps_rocev2 cur_state, nxt_state; + u16 udp_sport; + + if (ib_attr_mask & IB_QP_CUR_STATE) + cur_state = ib_to_rocev2_qps(attr->cur_qp_state); + else + cur_state = qp->attrs.rocev2.state; + + if (ib_attr_mask & IB_QP_STATE) + nxt_state = ib_to_rocev2_qps(attr->qp_state); + else + nxt_state = cur_state; + + to_modify_attrs |= ERDMA_QPA_ROCEV2_STATE; + params->state = nxt_state; - memset(&new_attrs, 0, sizeof(new_attrs)); + if (ib_attr_mask & IB_QP_QKEY) { + to_modify_attrs |= ERDMA_QPA_ROCEV2_QKEY; + params->qkey = attr->qkey; + } - if (attr_mask & IB_QP_STATE) { - new_attrs.state = ib_qp_state_to_erdma_qp_state[attr->qp_state]; + if (ib_attr_mask & IB_QP_SQ_PSN) { + to_modify_attrs |= ERDMA_QPA_ROCEV2_SQ_PSN; + params->sq_psn = attr->sq_psn; + } - erdma_attr_mask |= ERDMA_QP_ATTR_STATE; + if (ib_attr_mask & IB_QP_RQ_PSN) { + to_modify_attrs |= ERDMA_QPA_ROCEV2_RQ_PSN; + params->rq_psn = attr->rq_psn; } + if (ib_attr_mask & IB_QP_DEST_QPN) { + to_modify_attrs |= ERDMA_QPA_ROCEV2_DST_QPN; + params->dst_qpn = attr->dest_qp_num; + } + + if (ib_attr_mask & IB_QP_AV) { + to_modify_attrs |= ERDMA_QPA_ROCEV2_AV; + udp_sport = rdma_get_udp_sport(attr->ah_attr.grh.flow_label, + QP_ID(qp), params->dst_qpn); + erdma_attr_to_av(&attr->ah_attr, ¶ms->av, udp_sport); + } + + *erdma_attr_mask = to_modify_attrs; +} + +int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, + struct ib_udata *udata) +{ + struct erdma_qp *qp = to_eqp(ibqp); + union erdma_mod_qp_params params; + int ret = 0, erdma_attr_mask = 0; + down_write(&qp->state_lock); - ret = erdma_modify_qp_internal(qp, &new_attrs, erdma_attr_mask); + ret = erdma_check_qp_attrs(qp, attr, attr_mask); + if (ret) + goto out; - up_write(&qp->state_lock); + if (erdma_device_iwarp(qp->dev)) { + if (attr_mask & IB_QP_STATE) { + erdma_attr_mask |= ERDMA_QPA_IWARP_STATE; + params.iwarp.state = ib_to_iwarp_qps(attr->qp_state); + } + ret = erdma_modify_qp_state_iwarp(qp, ¶ms.iwarp, + erdma_attr_mask); + } else { + erdma_init_mod_qp_params_rocev2( + qp, ¶ms.rocev2, &erdma_attr_mask, attr, attr_mask); + + ret = erdma_modify_qp_state_rocev2(qp, ¶ms.rocev2, + erdma_attr_mask); + } + +out: + up_write(&qp->state_lock); return ret; } +static enum ib_qp_state query_qp_state(struct erdma_qp *qp) +{ + if (erdma_device_iwarp(qp->dev)) + return iwarp_to_ib_qps(qp->attrs.iwarp.state); + else + return rocev2_to_ib_qps(qp->attrs.rocev2.state); +} + int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { - struct erdma_qp *qp; + struct erdma_cmdq_query_qp_req_rocev2 req; struct erdma_dev *dev; + struct erdma_qp *qp; + u64 resp0, resp1; + int ret; if (ibqp && qp_attr && qp_init_attr) { qp = to_eqp(ibqp); @@ -1571,6 +1831,38 @@ int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, qp_init_attr->cap = qp_attr->cap; + if (erdma_device_rocev2(dev)) { + /* Query hardware to get some attributes */ + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_QUERY_QP); + req.qpn = QP_ID(qp); + + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &resp0, + &resp1, true); + if (ret) + return ret; + + qp_attr->sq_psn = + FIELD_GET(ERDMA_CMD_QUERY_QP_RESP_SQ_PSN_MASK, resp0); + qp_attr->rq_psn = + FIELD_GET(ERDMA_CMD_QUERY_QP_RESP_RQ_PSN_MASK, resp0); + qp_attr->qp_state = rocev2_to_ib_qps(FIELD_GET( + ERDMA_CMD_QUERY_QP_RESP_QP_STATE_MASK, resp0)); + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->sq_draining = FIELD_GET( + ERDMA_CMD_QUERY_QP_RESP_SQ_DRAINING_MASK, resp0); + + qp_attr->pkey_index = 0; + qp_attr->dest_qp_num = qp->attrs.rocev2.dst_qpn; + + if (qp->ibqp.qp_type == IB_QPT_RC) + erdma_av_to_attr(&qp->attrs.rocev2.av, + &qp_attr->ah_attr); + } else { + qp_attr->qp_state = query_qp_state(qp); + qp_attr->cur_qp_state = qp_attr->qp_state; + } + return 0; } @@ -1588,7 +1880,7 @@ static int erdma_init_user_cq(struct erdma_ucontext *ctx, struct erdma_cq *cq, ret = erdma_map_user_dbrecords(ctx, ureq->db_record_va, &cq->user_cq.user_dbr_page, - &cq->user_cq.db_info_dma_addr); + &cq->user_cq.dbrec_dma); if (ret) put_mtt_entries(dev, &cq->user_cq.qbuf_mem); @@ -1600,24 +1892,33 @@ static int erdma_init_kernel_cq(struct erdma_cq *cq) struct erdma_dev *dev = to_edev(cq->ibcq.device); cq->kern_cq.qbuf = - dma_alloc_coherent(&dev->pdev->dev, - WARPPED_BUFSIZE(cq->depth << CQE_SHIFT), + dma_alloc_coherent(&dev->pdev->dev, cq->depth << CQE_SHIFT, &cq->kern_cq.qbuf_dma_addr, GFP_KERNEL); if (!cq->kern_cq.qbuf) return -ENOMEM; - cq->kern_cq.db_record = - (u64 *)(cq->kern_cq.qbuf + (cq->depth << CQE_SHIFT)); + cq->kern_cq.dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, + &cq->kern_cq.dbrec_dma); + if (!cq->kern_cq.dbrec) + goto err_out; + spin_lock_init(&cq->kern_cq.lock); /* use default cqdb addr */ cq->kern_cq.db = dev->func_bar + ERDMA_BAR_CQDB_SPACE_OFFSET; return 0; + +err_out: + dma_free_coherent(&dev->pdev->dev, cq->depth << CQE_SHIFT, + cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr); + + return -ENOMEM; } int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct erdma_cq *cq = to_ecq(ibcq); struct erdma_dev *dev = to_edev(ibcq->device); unsigned int depth = attr->cqe; @@ -1676,9 +1977,10 @@ err_free_res: erdma_unmap_user_dbrecords(ctx, &cq->user_cq.user_dbr_page); put_mtt_entries(dev, &cq->user_cq.qbuf_mem); } else { - dma_free_coherent(&dev->pdev->dev, - WARPPED_BUFSIZE(depth << CQE_SHIFT), + dma_free_coherent(&dev->pdev->dev, depth << CQE_SHIFT, cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr); + dma_pool_free(dev->db_pool, cq->kern_cq.dbrec, + cq->kern_cq.dbrec_dma); } err_out_xa: @@ -1687,6 +1989,10 @@ err_out_xa: return ret; } +void erdma_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ +} + void erdma_set_mtu(struct erdma_dev *dev, u32 mtu) { struct erdma_cmdq_config_mtu_req req; @@ -1695,7 +2001,7 @@ void erdma_set_mtu(struct erdma_dev *dev, u32 mtu) CMDQ_OPCODE_CONF_MTU); req.mtu = mtu; - erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, true); } void erdma_port_event(struct erdma_dev *dev, enum ib_event_type reason) @@ -1765,7 +2071,8 @@ static int erdma_query_hw_stats(struct erdma_dev *dev, req.target_addr = dma_addr; req.target_length = ERDMA_HW_RESP_SIZE; - err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); if (err) goto out; @@ -1798,3 +2105,159 @@ int erdma_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, return stats->num_counters; } + +enum rdma_link_layer erdma_get_link_layer(struct ib_device *ibdev, u32 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +static int erdma_set_gid(struct erdma_dev *dev, u8 op, u32 idx, + const union ib_gid *gid) +{ + struct erdma_cmdq_set_gid_req req; + u8 ntype; + + req.cfg = FIELD_PREP(ERDMA_CMD_SET_GID_SGID_IDX_MASK, idx) | + FIELD_PREP(ERDMA_CMD_SET_GID_OP_MASK, op); + + if (op == ERDMA_SET_GID_OP_ADD) { + if (ipv6_addr_v4mapped((struct in6_addr *)gid)) + ntype = ERDMA_NETWORK_TYPE_IPV4; + else + ntype = ERDMA_NETWORK_TYPE_IPV6; + + req.cfg |= FIELD_PREP(ERDMA_CMD_SET_GID_NTYPE_MASK, ntype); + + memcpy(&req.gid, gid, ERDMA_ROCEV2_GID_SIZE); + } + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_SET_GID); + return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); +} + +int erdma_add_gid(const struct ib_gid_attr *attr, void **context) +{ + struct erdma_dev *dev = to_edev(attr->device); + int ret; + + ret = erdma_check_gid_attr(attr); + if (ret) + return ret; + + return erdma_set_gid(dev, ERDMA_SET_GID_OP_ADD, attr->index, + &attr->gid); +} + +int erdma_del_gid(const struct ib_gid_attr *attr, void **context) +{ + return erdma_set_gid(to_edev(attr->device), ERDMA_SET_GID_OP_DEL, + attr->index, NULL); +} + +int erdma_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey) +{ + if (index >= ERDMA_MAX_PKEYS) + return -EINVAL; + + *pkey = ERDMA_DEFAULT_PKEY; + return 0; +} + +void erdma_set_av_cfg(struct erdma_av_cfg *av_cfg, struct erdma_av *av) +{ + av_cfg->cfg0 = FIELD_PREP(ERDMA_CMD_CREATE_AV_FL_MASK, av->flow_label) | + FIELD_PREP(ERDMA_CMD_CREATE_AV_NTYPE_MASK, av->ntype); + + av_cfg->traffic_class = av->traffic_class; + av_cfg->hop_limit = av->hop_limit; + av_cfg->sl = av->sl; + + av_cfg->udp_sport = av->udp_sport; + av_cfg->sgid_index = av->sgid_index; + + ether_addr_copy(av_cfg->dmac, av->dmac); + memcpy(av_cfg->dgid, av->dgid, ERDMA_ROCEV2_GID_SIZE); +} + +int erdma_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata) +{ + const struct ib_global_route *grh = + rdma_ah_read_grh(init_attr->ah_attr); + struct erdma_dev *dev = to_edev(ibah->device); + struct erdma_pd *pd = to_epd(ibah->pd); + struct erdma_ah *ah = to_eah(ibah); + struct erdma_cmdq_create_ah_req req; + u32 udp_sport; + int ret; + + ret = erdma_check_gid_attr(grh->sgid_attr); + if (ret) + return ret; + + ret = erdma_alloc_idx(&dev->res_cb[ERDMA_RES_TYPE_AH]); + if (ret < 0) + return ret; + + ah->ahn = ret; + + if (grh->flow_label) + udp_sport = rdma_flow_label_to_udp_sport(grh->flow_label); + else + udp_sport = + IB_ROCE_UDP_ENCAP_VALID_PORT_MIN + (ah->ahn & 0x3FFF); + + erdma_attr_to_av(init_attr->ah_attr, &ah->av, udp_sport); + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_CREATE_AH); + + req.pdn = pd->pdn; + req.ahn = ah->ahn; + erdma_set_av_cfg(&req.av_cfg, &ah->av); + + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + init_attr->flags & RDMA_CREATE_AH_SLEEPABLE); + if (ret) { + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_AH], ah->ahn); + return ret; + } + + return 0; +} + +int erdma_destroy_ah(struct ib_ah *ibah, u32 flags) +{ + struct erdma_dev *dev = to_edev(ibah->device); + struct erdma_pd *pd = to_epd(ibah->pd); + struct erdma_ah *ah = to_eah(ibah); + struct erdma_cmdq_destroy_ah_req req; + int ret; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_DESTROY_AH); + + req.pdn = pd->pdn; + req.ahn = ah->ahn; + + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + flags & RDMA_DESTROY_AH_SLEEPABLE); + if (ret) + return ret; + + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_AH], ah->ahn); + + return 0; +} + +int erdma_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr) +{ + struct erdma_ah *ah = to_eah(ibah); + + memset(ah_attr, 0, sizeof(*ah_attr)); + erdma_av_to_attr(&ah->av, ah_attr); + + return 0; +} diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index db6018529ccc..f9408ccc8bad 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -136,12 +136,31 @@ struct erdma_user_dbrecords_page { int refcnt; }; +struct erdma_av { + u8 port; + u8 hop_limit; + u8 traffic_class; + u8 sl; + u8 sgid_index; + u16 udp_sport; + u32 flow_label; + u8 dmac[ETH_ALEN]; + u8 dgid[ERDMA_ROCEV2_GID_SIZE]; + enum erdma_network_type ntype; +}; + +struct erdma_ah { + struct ib_ah ibah; + struct erdma_av av; + u32 ahn; +}; + struct erdma_uqp { struct erdma_mem sq_mem; struct erdma_mem rq_mem; - dma_addr_t sq_db_info_dma_addr; - dma_addr_t rq_db_info_dma_addr; + dma_addr_t sq_dbrec_dma; + dma_addr_t rq_dbrec_dma; struct erdma_user_dbrecords_page *user_dbr_page; @@ -167,39 +186,100 @@ struct erdma_kqp { void *rq_buf; dma_addr_t rq_buf_dma_addr; - void *sq_db_info; - void *rq_db_info; + void *sq_dbrec; + void *rq_dbrec; + + dma_addr_t sq_dbrec_dma; + dma_addr_t rq_dbrec_dma; u8 sig_all; }; -enum erdma_qp_state { - ERDMA_QP_STATE_IDLE = 0, - ERDMA_QP_STATE_RTR = 1, - ERDMA_QP_STATE_RTS = 2, - ERDMA_QP_STATE_CLOSING = 3, - ERDMA_QP_STATE_TERMINATE = 4, - ERDMA_QP_STATE_ERROR = 5, - ERDMA_QP_STATE_UNDEF = 7, - ERDMA_QP_STATE_COUNT = 8 +enum erdma_qps_iwarp { + ERDMA_QPS_IWARP_IDLE = 0, + ERDMA_QPS_IWARP_RTR = 1, + ERDMA_QPS_IWARP_RTS = 2, + ERDMA_QPS_IWARP_CLOSING = 3, + ERDMA_QPS_IWARP_TERMINATE = 4, + ERDMA_QPS_IWARP_ERROR = 5, + ERDMA_QPS_IWARP_UNDEF = 6, + ERDMA_QPS_IWARP_COUNT = 7, +}; + +enum erdma_qpa_mask_iwarp { + ERDMA_QPA_IWARP_STATE = (1 << 0), + ERDMA_QPA_IWARP_LLP_HANDLE = (1 << 2), + ERDMA_QPA_IWARP_ORD = (1 << 3), + ERDMA_QPA_IWARP_IRD = (1 << 4), + ERDMA_QPA_IWARP_SQ_SIZE = (1 << 5), + ERDMA_QPA_IWARP_RQ_SIZE = (1 << 6), + ERDMA_QPA_IWARP_MPA = (1 << 7), + ERDMA_QPA_IWARP_CC = (1 << 8), }; -enum erdma_qp_attr_mask { - ERDMA_QP_ATTR_STATE = (1 << 0), - ERDMA_QP_ATTR_LLP_HANDLE = (1 << 2), - ERDMA_QP_ATTR_ORD = (1 << 3), - ERDMA_QP_ATTR_IRD = (1 << 4), - ERDMA_QP_ATTR_SQ_SIZE = (1 << 5), - ERDMA_QP_ATTR_RQ_SIZE = (1 << 6), - ERDMA_QP_ATTR_MPA = (1 << 7) +enum erdma_qps_rocev2 { + ERDMA_QPS_ROCEV2_RESET = 0, + ERDMA_QPS_ROCEV2_INIT = 1, + ERDMA_QPS_ROCEV2_RTR = 2, + ERDMA_QPS_ROCEV2_RTS = 3, + ERDMA_QPS_ROCEV2_SQD = 4, + ERDMA_QPS_ROCEV2_SQE = 5, + ERDMA_QPS_ROCEV2_ERROR = 6, + ERDMA_QPS_ROCEV2_COUNT = 7, +}; + +enum erdma_qpa_mask_rocev2 { + ERDMA_QPA_ROCEV2_STATE = (1 << 0), + ERDMA_QPA_ROCEV2_QKEY = (1 << 1), + ERDMA_QPA_ROCEV2_AV = (1 << 2), + ERDMA_QPA_ROCEV2_SQ_PSN = (1 << 3), + ERDMA_QPA_ROCEV2_RQ_PSN = (1 << 4), + ERDMA_QPA_ROCEV2_DST_QPN = (1 << 5), }; enum erdma_qp_flags { ERDMA_QP_IN_FLUSHING = (1 << 0), }; +#define ERDMA_QP_ACTIVE 0 +#define ERDMA_QP_PASSIVE 1 + +struct erdma_mod_qp_params_iwarp { + enum erdma_qps_iwarp state; + enum erdma_cc_alg cc; + u8 qp_type; + u8 pd_len; + u32 irq_size; + u32 orq_size; +}; + +struct erdma_qp_attrs_iwarp { + enum erdma_qps_iwarp state; + u32 cookie; +}; + +struct erdma_mod_qp_params_rocev2 { + enum erdma_qps_rocev2 state; + u32 qkey; + u32 sq_psn; + u32 rq_psn; + u32 dst_qpn; + struct erdma_av av; +}; + +union erdma_mod_qp_params { + struct erdma_mod_qp_params_iwarp iwarp; + struct erdma_mod_qp_params_rocev2 rocev2; +}; + +struct erdma_qp_attrs_rocev2 { + enum erdma_qps_rocev2 state; + u32 qkey; + u32 dst_qpn; + struct erdma_av av; +}; + struct erdma_qp_attrs { - enum erdma_qp_state state; enum erdma_cc_alg cc; /* Congestion control algorithm */ u32 sq_size; u32 rq_size; @@ -207,11 +287,10 @@ struct erdma_qp_attrs { u32 irq_size; u32 max_send_sge; u32 max_recv_sge; - u32 cookie; -#define ERDMA_QP_ACTIVE 0 -#define ERDMA_QP_PASSIVE 1 - u8 qp_type; - u8 pd_len; + union { + struct erdma_qp_attrs_iwarp iwarp; + struct erdma_qp_attrs_rocev2 rocev2; + }; }; struct erdma_qp { @@ -246,13 +325,14 @@ struct erdma_kcq_info { spinlock_t lock; u8 __iomem *db; - u64 *db_record; + u64 *dbrec; + dma_addr_t dbrec_dma; }; struct erdma_ucq_info { struct erdma_mem qbuf_mem; struct erdma_user_dbrecords_page *user_dbr_page; - dma_addr_t db_info_dma_addr; + dma_addr_t dbrec_dma; }; struct erdma_cq { @@ -282,11 +362,25 @@ static inline struct erdma_cq *find_cq_by_cqn(struct erdma_dev *dev, int id) void erdma_qp_get(struct erdma_qp *qp); void erdma_qp_put(struct erdma_qp *qp); -int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, - enum erdma_qp_attr_mask mask); +int erdma_modify_qp_state_iwarp(struct erdma_qp *qp, + struct erdma_mod_qp_params_iwarp *params, + int mask); +int erdma_modify_qp_state_rocev2(struct erdma_qp *qp, + struct erdma_mod_qp_params_rocev2 *params, + int attr_mask); void erdma_qp_llp_close(struct erdma_qp *qp); void erdma_qp_cm_drop(struct erdma_qp *qp); +static inline bool erdma_device_iwarp(struct erdma_dev *dev) +{ + return dev->proto == ERDMA_PROTO_IWARP; +} + +static inline bool erdma_device_rocev2(struct erdma_dev *dev) +{ + return dev->proto == ERDMA_PROTO_ROCEV2; +} + static inline struct erdma_ucontext *to_ectx(struct ib_ucontext *ibctx) { return container_of(ibctx, struct erdma_ucontext, ibucontext); @@ -312,6 +406,21 @@ static inline struct erdma_cq *to_ecq(struct ib_cq *ibcq) return container_of(ibcq, struct erdma_cq, ibcq); } +static inline struct erdma_ah *to_eah(struct ib_ah *ibah) +{ + return container_of(ibah, struct erdma_ah, ibah); +} + +static inline int erdma_check_gid_attr(const struct ib_gid_attr *attr) +{ + u8 ntype = rdma_gid_attr_network_type(attr); + + if (ntype != RDMA_NETWORK_IPV4 && ntype != RDMA_NETWORK_IPV6) + return -EINVAL; + + return 0; +} + static inline struct erdma_user_mmap_entry * to_emmap(struct rdma_user_mmap_entry *ibmmap) { @@ -325,7 +434,7 @@ int erdma_query_device(struct ib_device *dev, struct ib_device_attr *attr, int erdma_get_port_immutable(struct ib_device *dev, u32 port, struct ib_port_immutable *ib_port_immutable); int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *data); + struct uverbs_attr_bundle *attrs); int erdma_query_port(struct ib_device *dev, u32 port, struct ib_port_attr *attr); int erdma_query_gid(struct ib_device *dev, u32 port, int idx, @@ -340,6 +449,7 @@ int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, struct ib_udata *data); int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +void erdma_disassociate_ucontext(struct ib_ucontext *ibcontext); int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); struct ib_mr *erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, u64 virt, int access, struct ib_udata *udata); @@ -355,6 +465,7 @@ int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr, int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr); int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +void erdma_remove_cqes_of_qp(struct ib_cq *ibcq, u32 qpn); struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, u32 max_num_sg); int erdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, @@ -365,5 +476,15 @@ struct rdma_hw_stats *erdma_alloc_hw_port_stats(struct ib_device *device, u32 port_num); int erdma_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, u32 port, int index); +enum rdma_link_layer erdma_get_link_layer(struct ib_device *ibdev, + u32 port_num); +int erdma_add_gid(const struct ib_gid_attr *attr, void **context); +int erdma_del_gid(const struct ib_gid_attr *attr, void **context); +int erdma_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey); +void erdma_set_av_cfg(struct erdma_av_cfg *av_cfg, struct erdma_av *av); +int erdma_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata); +int erdma_destroy_ah(struct ib_ah *ibah, u32 flags); +int erdma_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); #endif diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 78f27f7b4203..a442eca498b8 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -251,7 +251,7 @@ struct flag_table { /* * CCE Error flags. */ -static struct flag_table cce_err_status_flags[] = { +static const struct flag_table cce_err_status_flags[] = { /* 0*/ FLAG_ENTRY0("CceCsrParityErr", CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK), /* 1*/ FLAG_ENTRY0("CceCsrReadBadAddrErr", @@ -341,7 +341,7 @@ static struct flag_table cce_err_status_flags[] = { * Misc Error flags */ #define MES(text) MISC_ERR_STATUS_MISC_##text##_ERR_SMASK -static struct flag_table misc_err_status_flags[] = { +static const struct flag_table misc_err_status_flags[] = { /* 0*/ FLAG_ENTRY0("CSR_PARITY", MES(CSR_PARITY)), /* 1*/ FLAG_ENTRY0("CSR_READ_BAD_ADDR", MES(CSR_READ_BAD_ADDR)), /* 2*/ FLAG_ENTRY0("CSR_WRITE_BAD_ADDR", MES(CSR_WRITE_BAD_ADDR)), @@ -360,7 +360,7 @@ static struct flag_table misc_err_status_flags[] = { /* * TXE PIO Error flags and consequences */ -static struct flag_table pio_err_status_flags[] = { +static const struct flag_table pio_err_status_flags[] = { /* 0*/ FLAG_ENTRY("PioWriteBadCtxt", SEC_WRITE_DROPPED, SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK), @@ -502,7 +502,7 @@ static struct flag_table pio_err_status_flags[] = { /* * TXE SDMA Error flags */ -static struct flag_table sdma_err_status_flags[] = { +static const struct flag_table sdma_err_status_flags[] = { /* 0*/ FLAG_ENTRY0("SDmaRpyTagErr", SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK), /* 1*/ FLAG_ENTRY0("SDmaCsrParityErr", @@ -530,7 +530,7 @@ static struct flag_table sdma_err_status_flags[] = { * TXE Egress Error flags */ #define SEES(text) SEND_EGRESS_ERR_STATUS_##text##_ERR_SMASK -static struct flag_table egress_err_status_flags[] = { +static const struct flag_table egress_err_status_flags[] = { /* 0*/ FLAG_ENTRY0("TxPktIntegrityMemCorErr", SEES(TX_PKT_INTEGRITY_MEM_COR)), /* 1*/ FLAG_ENTRY0("TxPktIntegrityMemUncErr", SEES(TX_PKT_INTEGRITY_MEM_UNC)), /* 2 reserved */ @@ -631,7 +631,7 @@ static struct flag_table egress_err_status_flags[] = { * TXE Egress Error Info flags */ #define SEEI(text) SEND_EGRESS_ERR_INFO_##text##_ERR_SMASK -static struct flag_table egress_err_info_flags[] = { +static const struct flag_table egress_err_info_flags[] = { /* 0*/ FLAG_ENTRY0("Reserved", 0ull), /* 1*/ FLAG_ENTRY0("VLErr", SEEI(VL)), /* 2*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)), @@ -680,7 +680,7 @@ static struct flag_table egress_err_info_flags[] = { * TXE Send error flags */ #define SES(name) SEND_ERR_STATUS_SEND_##name##_ERR_SMASK -static struct flag_table send_err_status_flags[] = { +static const struct flag_table send_err_status_flags[] = { /* 0*/ FLAG_ENTRY0("SendCsrParityErr", SES(CSR_PARITY)), /* 1*/ FLAG_ENTRY0("SendCsrReadBadAddrErr", SES(CSR_READ_BAD_ADDR)), /* 2*/ FLAG_ENTRY0("SendCsrWriteBadAddrErr", SES(CSR_WRITE_BAD_ADDR)) @@ -689,7 +689,7 @@ static struct flag_table send_err_status_flags[] = { /* * TXE Send Context Error flags and consequences */ -static struct flag_table sc_err_status_flags[] = { +static const struct flag_table sc_err_status_flags[] = { /* 0*/ FLAG_ENTRY("InconsistentSop", SEC_PACKET_DROPPED | SEC_SC_HALTED, SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK), @@ -712,7 +712,7 @@ static struct flag_table sc_err_status_flags[] = { * RXE Receive Error flags */ #define RXES(name) RCV_ERR_STATUS_RX_##name##_ERR_SMASK -static struct flag_table rxe_err_status_flags[] = { +static const struct flag_table rxe_err_status_flags[] = { /* 0*/ FLAG_ENTRY0("RxDmaCsrCorErr", RXES(DMA_CSR_COR)), /* 1*/ FLAG_ENTRY0("RxDcIntfParityErr", RXES(DC_INTF_PARITY)), /* 2*/ FLAG_ENTRY0("RxRcvHdrUncErr", RXES(RCV_HDR_UNC)), @@ -847,7 +847,7 @@ static struct flag_table rxe_err_status_flags[] = { * DCC Error Flags */ #define DCCE(name) DCC_ERR_FLG_##name##_SMASK -static struct flag_table dcc_err_flags[] = { +static const struct flag_table dcc_err_flags[] = { FLAG_ENTRY0("bad_l2_err", DCCE(BAD_L2_ERR)), FLAG_ENTRY0("bad_sc_err", DCCE(BAD_SC_ERR)), FLAG_ENTRY0("bad_mid_tail_err", DCCE(BAD_MID_TAIL_ERR)), @@ -900,7 +900,7 @@ static struct flag_table dcc_err_flags[] = { * LCB error flags */ #define LCBE(name) DC_LCB_ERR_FLG_##name##_SMASK -static struct flag_table lcb_err_flags[] = { +static const struct flag_table lcb_err_flags[] = { /* 0*/ FLAG_ENTRY0("CSR_PARITY_ERR", LCBE(CSR_PARITY_ERR)), /* 1*/ FLAG_ENTRY0("INVALID_CSR_ADDR", LCBE(INVALID_CSR_ADDR)), /* 2*/ FLAG_ENTRY0("RST_FOR_FAILED_DESKEW", LCBE(RST_FOR_FAILED_DESKEW)), @@ -943,7 +943,7 @@ static struct flag_table lcb_err_flags[] = { * DC8051 Error Flags */ #define D8E(name) DC_DC8051_ERR_FLG_##name##_SMASK -static struct flag_table dc8051_err_flags[] = { +static const struct flag_table dc8051_err_flags[] = { FLAG_ENTRY0("SET_BY_8051", D8E(SET_BY_8051)), FLAG_ENTRY0("LOST_8051_HEART_BEAT", D8E(LOST_8051_HEART_BEAT)), FLAG_ENTRY0("CRAM_MBE", D8E(CRAM_MBE)), @@ -962,7 +962,7 @@ static struct flag_table dc8051_err_flags[] = { * * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR field. */ -static struct flag_table dc8051_info_err_flags[] = { +static const struct flag_table dc8051_info_err_flags[] = { FLAG_ENTRY0("Spico ROM check failed", SPICO_ROM_FAILED), FLAG_ENTRY0("Unknown frame received", UNKNOWN_FRAME), FLAG_ENTRY0("Target BER not met", TARGET_BER_NOT_MET), @@ -986,7 +986,7 @@ static struct flag_table dc8051_info_err_flags[] = { * * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG field. */ -static struct flag_table dc8051_info_host_msg_flags[] = { +static const struct flag_table dc8051_info_host_msg_flags[] = { FLAG_ENTRY0("Host request done", 0x0001), FLAG_ENTRY0("BC PWR_MGM message", 0x0002), FLAG_ENTRY0("BC SMA message", 0x0004), @@ -5275,7 +5275,7 @@ done: * the buffer. End in '*' if the buffer is too short. */ static char *flag_string(char *buf, int buf_len, u64 flags, - struct flag_table *table, int table_size) + const struct flag_table *table, int table_size) { char extra[32]; char *p = buf; @@ -13235,7 +13235,7 @@ int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set) /* * Clear all interrupt sources on the chip. */ -void clear_all_interrupts(struct hfi1_devdata *dd) +static void clear_all_interrupts(struct hfi1_devdata *dd) { int i; diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index d861aa8fc640..8841db16bde7 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -1404,7 +1404,6 @@ irqreturn_t receive_context_interrupt_napi(int irq, void *data); int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set); void init_qsfp_int(struct hfi1_devdata *dd); -void clear_all_interrupts(struct hfi1_devdata *dd); void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr); void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr); void reset_interrupts(struct hfi1_devdata *dd); diff --git a/drivers/infiniband/hw/hfi1/fault.c b/drivers/infiniband/hw/hfi1/fault.c index 35d2382ee618..ec9ee59fcf0c 100644 --- a/drivers/infiniband/hw/hfi1/fault.c +++ b/drivers/infiniband/hw/hfi1/fault.c @@ -203,7 +203,6 @@ static const struct file_operations __fault_opcodes_fops = { .open = fault_opcodes_open, .read = fault_opcodes_read, .write = fault_opcodes_write, - .llseek = no_llseek }; void hfi1_fault_exit_debugfs(struct hfi1_ibdev *ibd) diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 4b3f1cb125fc..cb630551cf1a 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -2339,20 +2339,6 @@ static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd) dev_err(&(dd)->pcidev->dev, "%s: port %u: " fmt, \ rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), (port), ##__VA_ARGS__) -/* - * this is used for formatting hw error messages... - */ -struct hfi1_hwerror_msgs { - u64 mask; - const char *msg; - size_t sz; -}; - -/* in intr.c... */ -void hfi1_format_hwerrors(u64 hwerrs, - const struct hfi1_hwerror_msgs *hwerrmsgs, - size_t nhwerrmsgs, char *msg, size_t lmsg); - #define USER_OPCODE_CHECK_VAL 0xC0 #define USER_OPCODE_CHECK_MASK 0xC0 #define OPCODE_CHECK_VAL_DISABLED 0x0 @@ -2425,7 +2411,7 @@ static inline bool hfi1_need_drop(struct hfi1_devdata *dd) int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp); #define DD_DEV_ENTRY(dd) __string(dev, dev_name(&(dd)->pcidev->dev)) -#define DD_DEV_ASSIGN(dd) __assign_str(dev, dev_name(&(dd)->pcidev->dev)) +#define DD_DEV_ASSIGN(dd) __assign_str(dev) static inline void hfi1_update_ah_attr(struct ib_device *ibdev, struct rdma_ah_attr *attr) diff --git a/drivers/infiniband/hw/hfi1/intr.c b/drivers/infiniband/hw/hfi1/intr.c index 3737f632d62a..d8dd1a599631 100644 --- a/drivers/infiniband/hw/hfi1/intr.c +++ b/drivers/infiniband/hw/hfi1/intr.c @@ -47,37 +47,6 @@ static void add_full_mgmt_pkey(struct hfi1_pportdata *ppd) hfi1_event_pkey_change(ppd->dd, ppd->port); } -/** - * format_hwmsg - format a single hwerror message - * @msg: message buffer - * @msgl: length of message buffer - * @hwmsg: message to add to message buffer - */ -static void format_hwmsg(char *msg, size_t msgl, const char *hwmsg) -{ - strlcat(msg, "[", msgl); - strlcat(msg, hwmsg, msgl); - strlcat(msg, "]", msgl); -} - -/** - * hfi1_format_hwerrors - format hardware error messages for display - * @hwerrs: hardware errors bit vector - * @hwerrmsgs: hardware error descriptions - * @nhwerrmsgs: number of hwerrmsgs - * @msg: message buffer - * @msgl: message buffer length - */ -void hfi1_format_hwerrors(u64 hwerrs, const struct hfi1_hwerror_msgs *hwerrmsgs, - size_t nhwerrmsgs, char *msg, size_t msgl) -{ - int i; - - for (i = 0; i < nhwerrmsgs; i++) - if (hwerrs & hwerrmsgs[i].mask) - format_hwmsg(msg, msgl, hwerrmsgs[i].msg); -} - static void signal_ib_event(struct hfi1_pportdata *ppd, enum ib_event_type ev) { struct ib_event event; diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h index 49805a24bb0a..7259f4f55700 100644 --- a/drivers/infiniband/hw/hfi1/iowait.h +++ b/drivers/infiniband/hw/hfi1/iowait.h @@ -92,7 +92,7 @@ struct iowait_work { * * The lock field is used by waiters to record * the seqlock_t that guards the list head. - * Waiters explicity know that, but the destroy + * Waiters explicitly know that, but the destroy * code that unwaits QPs does not. */ struct iowait { diff --git a/drivers/infiniband/hw/hfi1/ipoib_main.c b/drivers/infiniband/hw/hfi1/ipoib_main.c index 5d814afdf7f3..7c9d5203002b 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_main.c +++ b/drivers/infiniband/hw/hfi1/ipoib_main.c @@ -21,36 +21,25 @@ static int hfi1_ipoib_dev_init(struct net_device *dev) struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); int ret; - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - ret = priv->netdev_ops->ndo_init(dev); if (ret) - goto out_ret; + return ret; ret = hfi1_netdev_add_data(priv->dd, qpn_from_mac(priv->netdev->dev_addr), dev); if (ret < 0) { priv->netdev_ops->ndo_uninit(dev); - goto out_ret; + return ret; } return 0; -out_ret: - free_percpu(dev->tstats); - dev->tstats = NULL; - return ret; } static void hfi1_ipoib_dev_uninit(struct net_device *dev) { struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); - free_percpu(dev->tstats); - dev->tstats = NULL; - hfi1_netdev_remove_data(priv->dd, qpn_from_mac(priv->netdev->dev_addr)); priv->netdev_ops->ndo_uninit(dev); @@ -107,7 +96,6 @@ static const struct net_device_ops hfi1_ipoib_netdev_ops = { .ndo_uninit = hfi1_ipoib_dev_uninit, .ndo_open = hfi1_ipoib_dev_open, .ndo_stop = hfi1_ipoib_dev_stop, - .ndo_get_stats64 = dev_get_tstats64, }; static int hfi1_ipoib_mcast_attach(struct net_device *dev, @@ -173,9 +161,6 @@ static void hfi1_ipoib_netdev_dtor(struct net_device *dev) hfi1_ipoib_txreq_deinit(priv); hfi1_ipoib_rxq_deinit(priv->netdev); - - free_percpu(dev->tstats); - dev->tstats = NULL; } static void hfi1_ipoib_set_id(struct net_device *dev, int id) @@ -234,6 +219,7 @@ static int hfi1_ipoib_setup_rn(struct ib_device *device, netdev->priv_destructor = hfi1_ipoib_netdev_dtor; netdev->needs_free_netdev = true; + netdev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; return 0; } diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index d4a6acad0e65..67a5c410fb5e 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -40,7 +40,7 @@ static unsigned long mmu_node_last(struct mmu_rb_node *node) } int hfi1_mmu_rb_register(void *ops_arg, - struct mmu_rb_ops *ops, + const struct mmu_rb_ops *ops, struct workqueue_struct *wq, struct mmu_rb_handler **handler) { diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h index 8e5d05454d70..3fa50dd64db6 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.h +++ b/drivers/infiniband/hw/hfi1/mmu_rb.h @@ -42,7 +42,7 @@ struct mmu_rb_handler { /* Begin on a new cachline boundary here */ struct rb_root_cached root ____cacheline_aligned_in_smp; void *ops_arg; - struct mmu_rb_ops *ops; + const struct mmu_rb_ops *ops; struct list_head lru_list; struct work_struct del_work; struct list_head del_list; @@ -51,7 +51,7 @@ struct mmu_rb_handler { }; int hfi1_mmu_rb_register(void *ops_arg, - struct mmu_rb_ops *ops, + const struct mmu_rb_ops *ops, struct workqueue_struct *wq, struct mmu_rb_handler **handler); void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler); diff --git a/drivers/infiniband/hw/hfi1/netdev.h b/drivers/infiniband/hw/hfi1/netdev.h index 8aa074670a9c..07c8f77c9181 100644 --- a/drivers/infiniband/hw/hfi1/netdev.h +++ b/drivers/infiniband/hw/hfi1/netdev.h @@ -49,7 +49,7 @@ struct hfi1_netdev_rxq { * When 0 receive queues will be freed. */ struct hfi1_netdev_rx { - struct net_device rx_napi; + struct net_device *rx_napi; struct hfi1_devdata *dd; struct hfi1_netdev_rxq *rxq; int num_rx_q; diff --git a/drivers/infiniband/hw/hfi1/netdev_rx.c b/drivers/infiniband/hw/hfi1/netdev_rx.c index 720d4c85c9c9..8608044203bb 100644 --- a/drivers/infiniband/hw/hfi1/netdev_rx.c +++ b/drivers/infiniband/hw/hfi1/netdev_rx.c @@ -188,7 +188,7 @@ static int hfi1_netdev_rxq_init(struct hfi1_netdev_rx *rx) int i; int rc; struct hfi1_devdata *dd = rx->dd; - struct net_device *dev = &rx->rx_napi; + struct net_device *dev = rx->rx_napi; rx->num_rx_q = dd->num_netdev_contexts; rx->rxq = kcalloc_node(rx->num_rx_q, sizeof(*rx->rxq), @@ -360,7 +360,11 @@ int hfi1_alloc_rx(struct hfi1_devdata *dd) if (!rx) return -ENOMEM; rx->dd = dd; - init_dummy_netdev(&rx->rx_napi); + rx->rx_napi = alloc_netdev_dummy(0); + if (!rx->rx_napi) { + kfree(rx); + return -ENOMEM; + } xa_init(&rx->dev_tbl); atomic_set(&rx->enabled, 0); @@ -374,6 +378,7 @@ void hfi1_free_rx(struct hfi1_devdata *dd) { if (dd->netdev_rx) { dd_dev_info(dd, "hfi1 rx freed\n"); + free_netdev(dd->netdev_rx->rx_napi); kfree(dd->netdev_rx); dd->netdev_rx = NULL; } diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index 119ec2f1382b..7133964749f8 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -1207,14 +1207,11 @@ retry: (u32)lnkctl2); /* only write to parent if target is not as high as ours */ if ((lnkctl2 & PCI_EXP_LNKCTL2_TLS) < target_vector) { - lnkctl2 &= ~PCI_EXP_LNKCTL2_TLS; - lnkctl2 |= target_vector; - dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__, - (u32)lnkctl2); - ret = pcie_capability_write_word(parent, - PCI_EXP_LNKCTL2, lnkctl2); + ret = pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL2, + PCI_EXP_LNKCTL2_TLS, + target_vector); if (ret) { - dd_dev_err(dd, "Unable to write to PCI config\n"); + dd_dev_err(dd, "Unable to change parent PCI target speed\n"); return_error = 1; goto done; } @@ -1223,22 +1220,11 @@ retry: } dd_dev_info(dd, "%s: setting target link speed\n", __func__); - ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2); + ret = pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL2, + PCI_EXP_LNKCTL2_TLS, + target_vector); if (ret) { - dd_dev_err(dd, "Unable to read from PCI config\n"); - return_error = 1; - goto done; - } - - dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__, - (u32)lnkctl2); - lnkctl2 &= ~PCI_EXP_LNKCTL2_TLS; - lnkctl2 |= target_vector; - dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__, - (u32)lnkctl2); - ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2); - if (ret) { - dd_dev_err(dd, "Unable to write to PCI config\n"); + dd_dev_err(dd, "Unable to change device PCI target speed\n"); return_error = 1; goto done; } diff --git a/drivers/infiniband/hw/hfi1/pin_system.c b/drivers/infiniband/hw/hfi1/pin_system.c index 384f722093e0..cce56134519b 100644 --- a/drivers/infiniband/hw/hfi1/pin_system.c +++ b/drivers/infiniband/hw/hfi1/pin_system.c @@ -26,7 +26,7 @@ static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, void *arg2, bool *stop); static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode); -static struct mmu_rb_ops sdma_rb_ops = { +static const struct mmu_rb_ops sdma_rb_ops = { .filter = sdma_rb_filter, .evict = sdma_rb_evict, .remove = sdma_rb_remove, diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c index d62ba5fdd80c..d94216c7d576 100644 --- a/drivers/infiniband/hw/hfi1/sysfs.c +++ b/drivers/infiniband/hw/hfi1/sysfs.c @@ -27,8 +27,8 @@ static struct hfi1_pportdata *hfi1_get_pportdata_kobj(struct kobject *kobj) * Congestion control table size followed by table entries */ static ssize_t cc_table_bin_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, - loff_t pos, size_t count) + const struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) { int ret; struct hfi1_pportdata *ppd = hfi1_get_pportdata_kobj(kobj); @@ -57,7 +57,7 @@ static ssize_t cc_table_bin_read(struct file *filp, struct kobject *kobj, return count; } -static BIN_ATTR_RO(cc_table_bin, PAGE_SIZE); +static const BIN_ATTR_RO(cc_table_bin, PAGE_SIZE); /* * Congestion settings: port control, control map and an array of 16 @@ -65,7 +65,7 @@ static BIN_ATTR_RO(cc_table_bin, PAGE_SIZE); * trigger threshold and the minimum injection rate delay. */ static ssize_t cc_setting_bin_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t count) { struct hfi1_pportdata *ppd = hfi1_get_pportdata_kobj(kobj); @@ -93,9 +93,9 @@ static ssize_t cc_setting_bin_read(struct file *filp, struct kobject *kobj, return count; } -static BIN_ATTR_RO(cc_setting_bin, PAGE_SIZE); +static const BIN_ATTR_RO(cc_setting_bin, PAGE_SIZE); -static struct bin_attribute *port_cc_bin_attributes[] = { +static const struct bin_attribute *const port_cc_bin_attributes[] = { &bin_attr_cc_setting_bin, &bin_attr_cc_table_bin, NULL @@ -134,7 +134,7 @@ static struct attribute *port_cc_attributes[] = { static const struct attribute_group port_cc_group = { .name = "CCMgtA", .attrs = port_cc_attributes, - .bin_attrs = port_cc_bin_attributes, + .bin_attrs_new = port_cc_bin_attributes, }; /* Start sc2vl */ diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 18b05ffb415a..c465966a1d9c 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -315,7 +315,7 @@ int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit) * This routine returns the receive context associated * with a a qp's qpn. * - * Returns the context. + * Return: the context. */ static struct hfi1_ctxtdata *qp_to_rcd(struct rvt_dev_info *rdi, struct rvt_qp *qp) @@ -710,7 +710,7 @@ void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp) * The exp_lock must be held. * * Return: - * On success: a value postive value between 0 and RXE_NUM_TID_FLOWS - 1 + * On success: a value positive value between 0 and RXE_NUM_TID_FLOWS - 1 * On failure: -EAGAIN */ static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last) @@ -1007,7 +1007,7 @@ static u32 tid_flush_pages(struct tid_rdma_pageset *list, * pages are tested two at a time, i, i + 1 for contiguous * pages and i - 1 and i contiguous pages. * - * If any condition is false, any accumlated pages are flushed and + * If any condition is false, any accumulated pages are flushed and * v0,v1 are emitted as separate PAGE_SIZE pagesets * * Otherwise, the current 8k is totaled for a future flush. @@ -1434,7 +1434,7 @@ static void kern_program_rcvarray(struct tid_rdma_flow *flow) * (5) computes a tidarray with formatted TID entries which can be sent * to the sender * (6) Reserves and programs HW flows. - * (7) It also manages queing the QP when TID/flow resources are not + * (7) It also manages queueing the QP when TID/flow resources are not * available. * * @req points to struct tid_rdma_request of which the segments are a part. The @@ -1604,7 +1604,7 @@ void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req) } /** - * hfi1_kern_exp_rcv_free_flows - free priviously allocated flow information + * hfi1_kern_exp_rcv_free_flows - free previously allocated flow information * @req: the tid rdma request to be cleaned */ static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req) @@ -2055,7 +2055,7 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, * req->clear_tail is advanced). However, when an earlier * request is received, this request will not be complete any * more (qp->s_tail_ack_queue is moved back, see below). - * Consequently, we need to update the TID flow info everytime + * Consequently, we need to update the TID flow info every time * a duplicate request is received. */ bth0 = be32_to_cpu(ohdr->bth[0]); @@ -2219,7 +2219,7 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet) /* * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ * (see hfi1_rc_rcv()) - * 2. Put TID RDMA READ REQ into the response queueu (s_ack_queue) + * 2. Put TID RDMA READ REQ into the response queue (s_ack_queue) * - Setup struct tid_rdma_req with request info * - Initialize struct tid_rdma_flow info; * - Copy TID entries; @@ -2439,7 +2439,7 @@ find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode) void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet) { - /* HANDLER FOR TID RDMA READ RESPONSE packet (Requestor side */ + /* HANDLER FOR TID RDMA READ RESPONSE packet (Requester side) */ /* * 1. Find matching SWQE @@ -3649,7 +3649,7 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet) * 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST * (see hfi1_rc_rcv()) * - Don't allow 0-length requests. - * 2. Put TID RDMA WRITE REQ into the response queueu (s_ack_queue) + * 2. Put TID RDMA WRITE REQ into the response queue (s_ack_queue) * - Setup struct tid_rdma_req with request info * - Prepare struct tid_rdma_flow array? * 3. Set the qp->s_ack_state as state diagram in design doc. @@ -4026,7 +4026,7 @@ unlock_r_lock: void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet) { - /* HANDLER FOR TID RDMA WRITE RESPONSE packet (Requestor side */ + /* HANDLER FOR TID RDMA WRITE RESPONSE packet (Requester side) */ /* * 1. Find matching SWQE @@ -5440,8 +5440,9 @@ static bool _hfi1_schedule_tid_send(struct rvt_qp *qp) * the two state machines can step on each other with respect to the * RVT_S_BUSY flag. * Therefore, a modified test is used. - * @return true if the second leg is scheduled; - * false if the second leg is not scheduled. + * + * Return: %true if the second leg is scheduled; + * %false if the second leg is not scheduled. */ bool hfi1_schedule_tid_send(struct rvt_qp *qp) { diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h index 75599d5168db..58304b91380f 100644 --- a/drivers/infiniband/hw/hfi1/trace_dbg.h +++ b/drivers/infiniband/hw/hfi1/trace_dbg.h @@ -33,7 +33,7 @@ DECLARE_EVENT_CLASS(hfi1_trace_template, TP_STRUCT__entry(__string(function, function) __vstring(msg, vaf->fmt, vaf->va) ), - TP_fast_assign(__assign_str(function, function); + TP_fast_assign(__assign_str(function); __assign_vstr(msg, vaf->fmt, vaf->va); ), TP_printk("(%s) %s", diff --git a/drivers/infiniband/hw/hfi1/trace_rx.h b/drivers/infiniband/hw/hfi1/trace_rx.h index e6904aa80c00..8d5e12fe88a5 100644 --- a/drivers/infiniband/hw/hfi1/trace_rx.h +++ b/drivers/infiniband/hw/hfi1/trace_rx.h @@ -90,7 +90,7 @@ TRACE_EVENT(hfi1_mmu_invalidate, TP_fast_assign( __entry->ctxt = ctxt; __entry->subctxt = subctxt; - __assign_str(type, type); + __assign_str(type); __entry->start = start; __entry->end = end; ), diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h index d129b8195959..e358f5b885fa 100644 --- a/drivers/infiniband/hw/hfi1/trace_tid.h +++ b/drivers/infiniband/hw/hfi1/trace_tid.h @@ -358,7 +358,7 @@ DECLARE_EVENT_CLASS(/* msg */ ), TP_fast_assign(/* assign */ __entry->qpn = qp ? qp->ibqp.qp_num : 0; - __assign_str(msg, msg); + __assign_str(msg); __entry->more = more; ), TP_printk(/* print */ @@ -651,7 +651,7 @@ DECLARE_EVENT_CLASS(/* tid_node */ TP_fast_assign(/* assign */ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)); __entry->qpn = qp->ibqp.qp_num; - __assign_str(msg, msg); + __assign_str(msg); __entry->index = index; __entry->base = base; __entry->map = map; diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h index c79856d4fdfb..c0ba6b0a2c4e 100644 --- a/drivers/infiniband/hw/hfi1/trace_tx.h +++ b/drivers/infiniband/hw/hfi1/trace_tx.h @@ -740,8 +740,8 @@ TRACE_EVENT(hfi1_sdma_state, __string(newstate, nstate) ), TP_fast_assign(DD_DEV_ASSIGN(sde->dd); - __assign_str(curstate, cstate); - __assign_str(newstate, nstate); + __assign_str(curstate); + __assign_str(newstate); ), TP_printk("[%s] current state %s new state %s", __get_str(dev), diff --git a/drivers/infiniband/hw/hns/Kconfig b/drivers/infiniband/hw/hns/Kconfig index ab3fbba70789..44cdb706fe27 100644 --- a/drivers/infiniband/hw/hns/Kconfig +++ b/drivers/infiniband/hw/hns/Kconfig @@ -1,21 +1,11 @@ # SPDX-License-Identifier: GPL-2.0-only -config INFINIBAND_HNS - tristate "HNS RoCE Driver" - depends on NET_VENDOR_HISILICON - depends on ARM64 || (COMPILE_TEST && 64BIT) - depends on (HNS_DSAF && HNS_ENET) || HNS3 - help - This is a RoCE/RDMA driver for the Hisilicon RoCE engine. - - To compile HIP08 driver as module, choose M here. - config INFINIBAND_HNS_HIP08 - bool "Hisilicon Hip08 Family RoCE support" - depends on INFINIBAND_HNS && PCI && HNS3 - depends on INFINIBAND_HNS=m || HNS3=y + tristate "Hisilicon Hip08 Family RoCE support" + depends on ARM64 || (COMPILE_TEST && 64BIT) + depends on PCI && HNS3 help RoCE driver support for Hisilicon RoCE engine in Hisilicon Hip08 SoC. The RoCE engine is a PCI device. - To compile this driver, choose Y here: if INFINIBAND_HNS is m, this - module will be called hns-roce-hw-v2. + To compile this driver, choose M here. This module will be called + hns-roce-hw-v2. diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile index be1e1cdbcfa8..7917af8e6380 100644 --- a/drivers/infiniband/hw/hns/Makefile +++ b/drivers/infiniband/hw/hns/Makefile @@ -5,12 +5,9 @@ ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3 -hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \ +hns-roce-hw-v2-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \ hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \ hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o hns_roce_restrack.o \ - hns_roce_debugfs.o + hns_roce_debugfs.o hns_roce_hw_v2.o -ifdef CONFIG_INFINIBAND_HNS_HIP08 -hns-roce-hw-v2-objs := hns_roce_hw_v2.o $(hns-roce-objs) -obj-$(CONFIG_INFINIBAND_HNS) += hns-roce-hw-v2.o -endif +obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns-roce-hw-v2.o diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index b4209b6aed8d..4fc5b9d5fea8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -59,11 +59,15 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, struct hns_roce_dev *hr_dev = to_hr_dev(ibah->device); struct hns_roce_ib_create_ah_resp resp = {}; struct hns_roce_ah *ah = to_hr_ah(ibah); - int ret = 0; - u32 max_sl; - - if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08 && udata) - return -EOPNOTSUPP; + u8 tclass = get_tclass(grh); + u8 priority = 0; + u8 tc_mode = 0; + int ret; + + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08 && udata) { + ret = -EOPNOTSUPP; + goto err_out; + } ah->av.port = rdma_ah_get_port_num(ah_attr); ah->av.gid_index = grh->sgid_index; @@ -74,15 +78,24 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, ah->av.hop_limit = grh->hop_limit; ah->av.flowlabel = grh->flow_label; ah->av.udp_sport = get_ah_udp_sport(ah_attr); - ah->av.tclass = get_tclass(grh); - - ah->av.sl = rdma_ah_get_sl(ah_attr); - max_sl = min_t(u32, MAX_SERVICE_LEVEL, hr_dev->caps.sl_num - 1); - if (unlikely(ah->av.sl > max_sl)) { - ibdev_err_ratelimited(&hr_dev->ib_dev, - "failed to set sl, sl (%u) shouldn't be larger than %u.\n", - ah->av.sl, max_sl); - return -EINVAL; + ah->av.tclass = tclass; + + ret = hr_dev->hw->get_dscp(hr_dev, tclass, &tc_mode, &priority); + if (ret == -EOPNOTSUPP) + ret = 0; + + if (ret && grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + goto err_out; + + if (tc_mode == HNAE3_TC_MAP_MODE_DSCP && + grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + ah->av.sl = priority; + else + ah->av.sl = rdma_ah_get_sl(ah_attr); + + if (!check_sl_valid(hr_dev, ah->av.sl)) { + ret = -EINVAL; + goto err_out; } memcpy(ah->av.dgid, grh->dgid.raw, HNS_ROCE_GID_SIZE); @@ -99,6 +112,8 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, } if (udata) { + resp.priority = ah->av.sl; + resp.tc_mode = tc_mode; memcpy(resp.dmac, ah_attr->roce.dmac, ETH_ALEN); ret = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c index 11a78ceae568..6ee911f6885b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_alloc.c +++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c @@ -153,8 +153,7 @@ int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, return total; } -int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, - int buf_cnt, struct ib_umem *umem, +int hns_roce_get_umem_bufs(dma_addr_t *bufs, int buf_cnt, struct ib_umem *umem, unsigned int page_shift) { struct ib_block_iter biter; @@ -176,8 +175,10 @@ void hns_roce_cleanup_bitmap(struct hns_roce_dev *hr_dev) if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_XRC) ida_destroy(&hr_dev->xrcd_ida.ida); - if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) { ida_destroy(&hr_dev->srq_table.srq_ida.ida); + xa_destroy(&hr_dev->srq_table.xa); + } hns_roce_cleanup_qp_table(hr_dev); hns_roce_cleanup_cq_table(hr_dev); ida_destroy(&hr_dev->mr_table.mtpt_ida.ida); diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.h b/drivers/infiniband/hw/hns/hns_roce_cmd.h index 052a3d60905a..11dbbabebdc9 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cmd.h +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.h @@ -108,6 +108,9 @@ enum { HNS_ROCE_CMD_QUERY_CEQC = 0x92, HNS_ROCE_CMD_DESTROY_CEQC = 0x93, + /* SCC CTX commands */ + HNS_ROCE_CMD_QUERY_SCCC = 0xa2, + /* SCC CTX BT commands */ HNS_ROCE_CMD_READ_SCCC_BT0 = 0xa4, HNS_ROCE_CMD_WRITE_SCCC_BT0 = 0xa5, diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 1b6d16af8c12..3a5c93c9fb3e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -133,14 +133,12 @@ static int alloc_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) struct hns_roce_cq_table *cq_table = &hr_dev->cq_table; struct ib_device *ibdev = &hr_dev->ib_dev; u64 mtts[MTT_MIN_COUNT] = {}; - dma_addr_t dma_handle; int ret; - ret = hns_roce_mtr_find(hr_dev, &hr_cq->mtr, 0, mtts, ARRAY_SIZE(mtts), - &dma_handle); - if (!ret) { + ret = hns_roce_mtr_find(hr_dev, &hr_cq->mtr, 0, mtts, ARRAY_SIZE(mtts)); + if (ret) { ibdev_err(ibdev, "failed to find CQ mtr, ret = %d.\n", ret); - return -EINVAL; + return ret; } /* Get CQC memory HEM(Hardware Entry Memory) table */ @@ -151,20 +149,21 @@ static int alloc_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) return ret; } - ret = xa_err(xa_store(&cq_table->array, hr_cq->cqn, hr_cq, GFP_KERNEL)); + ret = xa_err(xa_store_irq(&cq_table->array, hr_cq->cqn, hr_cq, GFP_KERNEL)); if (ret) { ibdev_err(ibdev, "failed to xa_store CQ, ret = %d.\n", ret); goto err_put; } - ret = hns_roce_create_cqc(hr_dev, hr_cq, mtts, dma_handle); + ret = hns_roce_create_cqc(hr_dev, hr_cq, mtts, + hns_roce_get_mtr_ba(&hr_cq->mtr)); if (ret) goto err_xa; return 0; err_xa: - xa_erase(&cq_table->array, hr_cq->cqn); + xa_erase_irq(&cq_table->array, hr_cq->cqn); err_put: hns_roce_table_put(hr_dev, &cq_table->table, hr_cq->cqn); @@ -180,10 +179,10 @@ static void free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_CQC, hr_cq->cqn); if (ret) - dev_err(dev, "DESTROY_CQ failed (%d) for CQN %06lx\n", ret, - hr_cq->cqn); + dev_err_ratelimited(dev, "DESTROY_CQ failed (%d) for CQN %06lx\n", + ret, hr_cq->cqn); - xa_erase(&cq_table->array, hr_cq->cqn); + xa_erase_irq(&cq_table->array, hr_cq->cqn); /* Waiting interrupt process procedure carried out */ synchronize_irq(hr_dev->eq_table.eq[hr_cq->vector].irq); @@ -354,9 +353,10 @@ static int set_cqe_size(struct hns_roce_cq *hr_cq, struct ib_udata *udata, } int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device); + struct ib_udata *udata = &attrs->driver_udata; struct hns_roce_ib_create_cq_resp resp = {}; struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq); struct ib_device *ibdev = &hr_dev->ib_dev; @@ -477,13 +477,6 @@ void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type) struct ib_event event; struct ib_cq *ibcq; - hr_cq = xa_load(&hr_dev->cq_table.array, - cqn & (hr_dev->caps.num_cqs - 1)); - if (!hr_cq) { - dev_warn(dev, "async event for bogus CQ 0x%06x\n", cqn); - return; - } - if (event_type != HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID && event_type != HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR && event_type != HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW) { @@ -492,7 +485,16 @@ void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type) return; } - refcount_inc(&hr_cq->refcount); + xa_lock(&hr_dev->cq_table.array); + hr_cq = xa_load(&hr_dev->cq_table.array, + cqn & (hr_dev->caps.num_cqs - 1)); + if (hr_cq) + refcount_inc(&hr_cq->refcount); + xa_unlock(&hr_dev->cq_table.array); + if (!hr_cq) { + dev_warn(dev, "async event for bogus CQ 0x%06x\n", cqn); + return; + } ibcq = &hr_cq->ib_cq; if (ibcq->event_handler) { @@ -535,4 +537,6 @@ void hns_roce_cleanup_cq_table(struct hns_roce_dev *hr_dev) for (i = 0; i < HNS_ROCE_CQ_BANK_NUM; i++) ida_destroy(&hr_dev->cq_table.bank[i].ida); + xa_destroy(&hr_dev->cq_table.array); + mutex_destroy(&hr_dev->cq_table.bank_mutex); } diff --git a/drivers/infiniband/hw/hns/hns_roce_debugfs.c b/drivers/infiniband/hw/hns/hns_roce_debugfs.c index e8febb40f645..b869cdc54118 100644 --- a/drivers/infiniband/hw/hns/hns_roce_debugfs.c +++ b/drivers/infiniband/hw/hns/hns_roce_debugfs.c @@ -5,6 +5,7 @@ #include <linux/debugfs.h> #include <linux/device.h> +#include <linux/pci.h> #include "hns_roce_device.h" @@ -86,7 +87,7 @@ void hns_roce_register_debugfs(struct hns_roce_dev *hr_dev) { struct hns_roce_dev_debugfs *dbgfs = &hr_dev->dbgfs; - dbgfs->root = debugfs_create_dir(dev_name(&hr_dev->ib_dev.dev), + dbgfs->root = debugfs_create_dir(pci_name(hr_dev->pci_dev), hns_roce_dbgfs_root); create_sw_stat_debugfs(hr_dev, dbgfs->root); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index b1fce5ddf631..560a1d9de408 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -83,6 +83,7 @@ #define MR_TYPE_DMA 0x03 #define HNS_ROCE_FRMR_MAX_PA 512 +#define HNS_ROCE_FRMR_ALIGN_SIZE 128 #define PKEY_ID 0xffff #define NODE_DESC_SIZE 64 @@ -91,6 +92,8 @@ /* Configure to HW for PAGE_SIZE larger than 4KB */ #define PG_SHIFT_OFFSET (PAGE_SHIFT - 12) +#define ATOMIC_WR_LEN 8 + #define HNS_ROCE_IDX_QUE_ENTRY_SZ 4 #define SRQ_DB_REG 0x230 @@ -100,6 +103,9 @@ #define CQ_BANKID_SHIFT 2 #define CQ_BANKID_MASK GENMASK(1, 0) +#define HNS_ROCE_MAX_CQ_COUNT 0xFFFF +#define HNS_ROCE_MAX_CQ_PERIOD 0xFFFF + enum { SERV_TYPE_RC, SERV_TYPE_UC, @@ -179,10 +185,14 @@ enum { #define HNS_ROCE_CMD_SUCCESS 1 +#define HNS_ROCE_MAX_HOP_NUM 3 /* The minimum page size is 4K for hardware */ #define HNS_HW_PAGE_SHIFT 12 #define HNS_HW_PAGE_SIZE (1 << HNS_HW_PAGE_SHIFT) +#define HNS_HW_MAX_PAGE_SHIFT 27 +#define HNS_HW_MAX_PAGE_SIZE (1 << HNS_HW_MAX_PAGE_SHIFT) + struct hns_roce_uar { u64 pfn; unsigned long index; @@ -269,6 +279,11 @@ struct hns_roce_hem_list { dma_addr_t root_ba; /* pointer to the root ba table */ }; +enum mtr_type { + MTR_DEFAULT = 0, + MTR_PBL, +}; + struct hns_roce_buf_attr { struct { size_t size; /* region size */ @@ -277,7 +292,10 @@ struct hns_roce_buf_attr { unsigned int region_count; /* valid region count */ unsigned int page_shift; /* buffer page shift */ unsigned int user_access; /* umem access flag */ + u64 iova; + enum mtr_type type; bool mtt_only; /* only alloc buffer-required MTT memory */ + bool adaptive; /* adaptive for page_shift and hopnum */ }; struct hns_roce_hem_cfg { @@ -471,12 +489,6 @@ struct hns_roce_bank { u32 next; /* Next ID to allocate. */ }; -struct hns_roce_idx_table { - u32 *spare_idx; - u32 head; - u32 tail; -}; - struct hns_roce_qp_table { struct hns_roce_hem_table qp_table; struct hns_roce_hem_table irrl_table; @@ -485,7 +497,7 @@ struct hns_roce_qp_table { struct mutex scc_mutex; struct hns_roce_bank bank[HNS_ROCE_QP_BANK_NUM]; struct mutex bank_mutex; - struct hns_roce_idx_table idx_table; + struct xarray dip_xa; }; struct hns_roce_cq_table { @@ -575,6 +587,7 @@ struct hns_roce_dev; enum { HNS_ROCE_FLUSH_FLAG = 0, + HNS_ROCE_STOP_FLUSH_FLAG = 1, }; struct hns_roce_work { @@ -585,6 +598,13 @@ struct hns_roce_work { u32 queue_num; }; +enum hns_roce_cong_type { + CONG_TYPE_DCQCN, + CONG_TYPE_LDCP, + CONG_TYPE_HC3, + CONG_TYPE_DIP, +}; + struct hns_roce_qp { struct ib_qp ibqp; struct hns_roce_wq rq; @@ -628,6 +648,11 @@ struct hns_roce_qp { struct list_head sq_node; /* all send qps are on a list */ struct hns_user_mmap_entry *dwqe_mmap_entry; u32 config; + enum hns_roce_cong_type cong_type; + u8 tc_mode; + u8 priority; + spinlock_t flush_lock; + struct hns_roce_dip *dip; }; struct hns_roce_ib_iboe { @@ -693,19 +718,13 @@ struct hns_roce_eq { int shift; int event_type; int sub_type; + struct work_struct work; }; struct hns_roce_eq_table { struct hns_roce_eq *eq; }; -enum cong_type { - CONG_TYPE_DCQCN, - CONG_TYPE_LDCP, - CONG_TYPE_HC3, - CONG_TYPE_DIP, -}; - struct hns_roce_caps { u64 fw_ver; u8 num_ports; @@ -835,7 +854,8 @@ struct hns_roce_caps { u16 default_aeq_period; u16 default_aeq_arm_st; u16 default_ceq_arm_st; - enum cong_type cong_type; + u8 cong_cap; + enum hns_roce_cong_type default_cong_type; }; enum hns_roce_device_state { @@ -912,8 +932,7 @@ struct hns_roce_hw { int (*rereg_write_mtpt)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr, int flags, void *mb_buf); - int (*frmr_write_mtpt)(struct hns_roce_dev *hr_dev, void *mb_buf, - struct hns_roce_mr *mr); + int (*frmr_write_mtpt)(void *mb_buf, struct hns_roce_mr *mr); int (*mw_write_mtpt)(void *mb_buf, struct hns_roce_mw *mw); void (*write_cqc)(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, void *mb_buf, u64 *mtts, @@ -936,8 +955,11 @@ struct hns_roce_hw { int (*query_qpc)(struct hns_roce_dev *hr_dev, u32 qpn, void *buffer); int (*query_mpt)(struct hns_roce_dev *hr_dev, u32 key, void *buffer); int (*query_srqc)(struct hns_roce_dev *hr_dev, u32 srqn, void *buffer); + int (*query_sccc)(struct hns_roce_dev *hr_dev, u32 qpn, void *buffer); int (*query_hw_counter)(struct hns_roce_dev *hr_dev, u64 *stats, u32 port, int *hw_counters); + int (*get_dscp)(struct hns_roce_dev *hr_dev, u8 dscp, + u8 *tc_mode, u8 *priority); const struct ib_device_ops *hns_roce_dev_ops; const struct ib_device_ops *hns_roce_dev_srq_ops; }; @@ -957,8 +979,6 @@ struct hns_roce_dev { enum hns_roce_device_state state; struct list_head qp_list; /* list of all qps on this dev */ spinlock_t qp_list_lock; /* protect qp_list */ - struct list_head dip_list; /* list of all dest ips on this dev */ - spinlock_t dip_list_lock; /* protect dip_list */ struct list_head pgdir_list; struct mutex pgdir_mutex; @@ -1152,8 +1172,13 @@ void hns_roce_cmd_use_polling(struct hns_roce_dev *hr_dev); /* hns roce hw need current block and next block addr from mtt */ #define MTT_MIN_COUNT 2 +static inline dma_addr_t hns_roce_get_mtr_ba(struct hns_roce_mtr *mtr) +{ + return mtr->hem_cfg.root_ba; +} + int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, - u32 offset, u64 *mtt_buf, int mtt_max, u64 *base_addr); + u32 offset, u64 *mtt_buf, int mtt_max); int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, struct hns_roce_buf_attr *buf_attr, unsigned int page_shift, struct ib_udata *udata, @@ -1211,7 +1236,7 @@ struct hns_roce_buf *hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, int buf_cnt, struct hns_roce_buf *buf, unsigned int page_shift); -int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, +int hns_roce_get_umem_bufs(dma_addr_t *bufs, int buf_cnt, struct ib_umem *umem, unsigned int page_shift); @@ -1244,7 +1269,7 @@ __be32 send_ieth(const struct ib_send_wr *wr); int to_hr_qp_type(int qp_type); int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int hns_roce_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata); int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt, @@ -1259,6 +1284,7 @@ void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn); void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type); void flush_cqe(struct hns_roce_dev *dev, struct hns_roce_qp *qp); void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type); +void hns_roce_flush_cqe(struct hns_roce_dev *hr_dev, u32 qpn); void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type); void hns_roce_handle_device_err(struct hns_roce_dev *hr_dev); int hns_roce_init(struct hns_roce_dev *hr_dev); @@ -1275,4 +1301,6 @@ struct hns_user_mmap_entry * hns_roce_user_mmap_entry_insert(struct ib_ucontext *ucontext, u64 address, size_t length, enum hns_roce_mmap_type mmap_type); +bool check_sl_valid(struct hns_roce_dev *hr_dev, u8 sl); + #endif /* _HNS_ROCE_DEVICE_H */ diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index c4ac06a33869..ca0798224e56 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -249,85 +249,48 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev, } static struct hns_roce_hem *hns_roce_alloc_hem(struct hns_roce_dev *hr_dev, - int npages, unsigned long hem_alloc_size, gfp_t gfp_mask) { - struct hns_roce_hem_chunk *chunk = NULL; struct hns_roce_hem *hem; - struct scatterlist *mem; int order; void *buf; WARN_ON(gfp_mask & __GFP_HIGHMEM); + order = get_order(hem_alloc_size); + if (PAGE_SIZE << order != hem_alloc_size) { + dev_err(hr_dev->dev, "invalid hem_alloc_size: %lu!\n", + hem_alloc_size); + return NULL; + } + hem = kmalloc(sizeof(*hem), gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); if (!hem) return NULL; - INIT_LIST_HEAD(&hem->chunk_list); - - order = get_order(hem_alloc_size); - - while (npages > 0) { - if (!chunk) { - chunk = kmalloc(sizeof(*chunk), - gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); - if (!chunk) - goto fail; - - sg_init_table(chunk->mem, HNS_ROCE_HEM_CHUNK_LEN); - chunk->npages = 0; - chunk->nsg = 0; - memset(chunk->buf, 0, sizeof(chunk->buf)); - list_add_tail(&chunk->list, &hem->chunk_list); - } + buf = dma_alloc_coherent(hr_dev->dev, hem_alloc_size, + &hem->dma, gfp_mask); + if (!buf) + goto fail; - while (1 << order > npages) - --order; - - /* - * Alloc memory one time. If failed, don't alloc small block - * memory, directly return fail. - */ - mem = &chunk->mem[chunk->npages]; - buf = dma_alloc_coherent(hr_dev->dev, PAGE_SIZE << order, - &sg_dma_address(mem), gfp_mask); - if (!buf) - goto fail; - - chunk->buf[chunk->npages] = buf; - sg_dma_len(mem) = PAGE_SIZE << order; - - ++chunk->npages; - ++chunk->nsg; - npages -= 1 << order; - } + hem->buf = buf; + hem->size = hem_alloc_size; return hem; fail: - hns_roce_free_hem(hr_dev, hem); + kfree(hem); return NULL; } void hns_roce_free_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem *hem) { - struct hns_roce_hem_chunk *chunk, *tmp; - int i; - if (!hem) return; - list_for_each_entry_safe(chunk, tmp, &hem->chunk_list, list) { - for (i = 0; i < chunk->npages; ++i) - dma_free_coherent(hr_dev->dev, - sg_dma_len(&chunk->mem[i]), - chunk->buf[i], - sg_dma_address(&chunk->mem[i])); - kfree(chunk); - } + dma_free_coherent(hr_dev->dev, hem->size, hem->buf, hem->dma); kfree(hem); } @@ -337,7 +300,7 @@ static int calc_hem_config(struct hns_roce_dev *hr_dev, struct hns_roce_hem_mhop *mhop, struct hns_roce_hem_index *index) { - struct ib_device *ibdev = &hr_dev->ib_dev; + struct device *dev = hr_dev->dev; unsigned long mhop_obj = obj; u32 l0_idx, l1_idx, l2_idx; u32 chunk_ba_num; @@ -368,14 +331,14 @@ static int calc_hem_config(struct hns_roce_dev *hr_dev, index->buf = l0_idx; break; default: - ibdev_err(ibdev, "table %u not support mhop.hop_num = %u!\n", - table->type, mhop->hop_num); + dev_err(dev, "table %u not support mhop.hop_num = %u!\n", + table->type, mhop->hop_num); return -EINVAL; } if (unlikely(index->buf >= table->num_hem)) { - ibdev_err(ibdev, "table %u exceed hem limt idx %llu, max %lu!\n", - table->type, index->buf, table->num_hem); + dev_err(dev, "table %u exceed hem limt idx %llu, max %lu!\n", + table->type, index->buf, table->num_hem); return -EINVAL; } @@ -415,7 +378,6 @@ static int alloc_mhop_hem(struct hns_roce_dev *hr_dev, { u32 bt_size = mhop->bt_chunk_size; struct device *dev = hr_dev->dev; - struct hns_roce_hem_iter iter; gfp_t flag; u64 bt_ba; u32 size; @@ -456,16 +418,15 @@ static int alloc_mhop_hem(struct hns_roce_dev *hr_dev, */ size = table->type < HEM_TYPE_MTT ? mhop->buf_chunk_size : bt_size; flag = GFP_KERNEL | __GFP_NOWARN; - table->hem[index->buf] = hns_roce_alloc_hem(hr_dev, size >> PAGE_SHIFT, - size, flag); + table->hem[index->buf] = hns_roce_alloc_hem(hr_dev, size, flag); if (!table->hem[index->buf]) { ret = -ENOMEM; goto err_alloc_hem; } index->inited |= HEM_INDEX_BUF; - hns_roce_hem_first(table->hem[index->buf], &iter); - bt_ba = hns_roce_hem_addr(&iter); + bt_ba = table->hem[index->buf]->dma; + if (table->type < HEM_TYPE_MTT) { if (mhop->hop_num == 2) *(table->bt_l1[index->l1] + mhop->l2_idx) = bt_ba; @@ -487,14 +448,14 @@ static int set_mhop_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem_mhop *mhop, struct hns_roce_hem_index *index) { - struct ib_device *ibdev = &hr_dev->ib_dev; + struct device *dev = hr_dev->dev; u32 step_idx; int ret = 0; if (index->inited & HEM_INDEX_L0) { ret = hr_dev->hw->set_hem(hr_dev, table, obj, 0); if (ret) { - ibdev_err(ibdev, "set HEM step 0 failed!\n"); + dev_err(dev, "set HEM step 0 failed!\n"); goto out; } } @@ -502,7 +463,7 @@ static int set_mhop_hem(struct hns_roce_dev *hr_dev, if (index->inited & HEM_INDEX_L1) { ret = hr_dev->hw->set_hem(hr_dev, table, obj, 1); if (ret) { - ibdev_err(ibdev, "set HEM step 1 failed!\n"); + dev_err(dev, "set HEM step 1 failed!\n"); goto out; } } @@ -514,7 +475,7 @@ static int set_mhop_hem(struct hns_roce_dev *hr_dev, step_idx = mhop->hop_num; ret = hr_dev->hw->set_hem(hr_dev, table, obj, step_idx); if (ret) - ibdev_err(ibdev, "set HEM step last failed!\n"); + dev_err(dev, "set HEM step last failed!\n"); } out: return ret; @@ -524,14 +485,14 @@ static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev, struct hns_roce_hem_table *table, unsigned long obj) { - struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_hem_index index = {}; struct hns_roce_hem_mhop mhop = {}; + struct device *dev = hr_dev->dev; int ret; ret = calc_hem_config(hr_dev, table, obj, &mhop, &index); if (ret) { - ibdev_err(ibdev, "calc hem config failed!\n"); + dev_err(dev, "calc hem config failed!\n"); return ret; } @@ -543,7 +504,7 @@ static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev, ret = alloc_mhop_hem(hr_dev, table, &mhop, &index); if (ret) { - ibdev_err(ibdev, "alloc mhop hem failed!\n"); + dev_err(dev, "alloc mhop hem failed!\n"); goto out; } @@ -551,7 +512,7 @@ static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev, if (table->type < HEM_TYPE_MTT) { ret = set_mhop_hem(hr_dev, table, obj, &mhop, &index); if (ret) { - ibdev_err(ibdev, "set HEM address to HW failed!\n"); + dev_err(dev, "set HEM address to HW failed!\n"); goto err_alloc; } } @@ -586,7 +547,6 @@ int hns_roce_table_get(struct hns_roce_dev *hr_dev, } table->hem[i] = hns_roce_alloc_hem(hr_dev, - table->table_chunk_size >> PAGE_SHIFT, table->table_chunk_size, GFP_KERNEL | __GFP_NOWARN); if (!table->hem[i]) { @@ -615,7 +575,7 @@ static void clear_mhop_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem_mhop *mhop, struct hns_roce_hem_index *index) { - struct ib_device *ibdev = &hr_dev->ib_dev; + struct device *dev = hr_dev->dev; u32 hop_num = mhop->hop_num; u32 chunk_ba_num; u32 step_idx; @@ -645,21 +605,21 @@ static void clear_mhop_hem(struct hns_roce_dev *hr_dev, ret = hr_dev->hw->clear_hem(hr_dev, table, obj, step_idx); if (ret) - ibdev_warn(ibdev, "failed to clear hop%u HEM, ret = %d.\n", - hop_num, ret); + dev_warn(dev, "failed to clear hop%u HEM, ret = %d.\n", + hop_num, ret); if (index->inited & HEM_INDEX_L1) { ret = hr_dev->hw->clear_hem(hr_dev, table, obj, 1); if (ret) - ibdev_warn(ibdev, "failed to clear HEM step 1, ret = %d.\n", - ret); + dev_warn(dev, "failed to clear HEM step 1, ret = %d.\n", + ret); } if (index->inited & HEM_INDEX_L0) { ret = hr_dev->hw->clear_hem(hr_dev, table, obj, 0); if (ret) - ibdev_warn(ibdev, "failed to clear HEM step 0, ret = %d.\n", - ret); + dev_warn(dev, "failed to clear HEM step 0, ret = %d.\n", + ret); } } } @@ -669,14 +629,14 @@ static void hns_roce_table_mhop_put(struct hns_roce_dev *hr_dev, unsigned long obj, int check_refcount) { - struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_hem_index index = {}; struct hns_roce_hem_mhop mhop = {}; + struct device *dev = hr_dev->dev; int ret; ret = calc_hem_config(hr_dev, table, obj, &mhop, &index); if (ret) { - ibdev_err(ibdev, "calc hem config failed!\n"); + dev_err(dev, "calc hem config failed!\n"); return; } @@ -712,8 +672,8 @@ void hns_roce_table_put(struct hns_roce_dev *hr_dev, ret = hr_dev->hw->clear_hem(hr_dev, table, obj, HEM_HOP_STEP_DIRECT); if (ret) - dev_warn(dev, "failed to clear HEM base address, ret = %d.\n", - ret); + dev_warn_ratelimited(dev, "failed to clear HEM base address, ret = %d.\n", + ret); hns_roce_free_hem(hr_dev, table->hem[i]); table->hem[i] = NULL; @@ -725,7 +685,6 @@ void *hns_roce_table_find(struct hns_roce_dev *hr_dev, struct hns_roce_hem_table *table, unsigned long obj, dma_addr_t *dma_handle) { - struct hns_roce_hem_chunk *chunk; struct hns_roce_hem_mhop mhop; struct hns_roce_hem *hem; unsigned long mhop_obj = obj; @@ -734,7 +693,6 @@ void *hns_roce_table_find(struct hns_roce_dev *hr_dev, int offset, dma_offset; void *addr = NULL; u32 hem_idx = 0; - int length; int i, j; mutex_lock(&table->mutex); @@ -767,23 +725,8 @@ void *hns_roce_table_find(struct hns_roce_dev *hr_dev, if (!hem) goto out; - list_for_each_entry(chunk, &hem->chunk_list, list) { - for (i = 0; i < chunk->npages; ++i) { - length = sg_dma_len(&chunk->mem[i]); - if (dma_handle && dma_offset >= 0) { - if (length > (u32)dma_offset) - *dma_handle = sg_dma_address( - &chunk->mem[i]) + dma_offset; - dma_offset -= length; - } - - if (length > (u32)offset) { - addr = chunk->buf[i] + offset; - goto out; - } - offset -= length; - } - } + *dma_handle = hem->dma + dma_offset; + addr = hem->buf + offset; out: mutex_unlock(&table->mutex); @@ -934,6 +877,7 @@ void hns_roce_cleanup_hem_table(struct hns_roce_dev *hr_dev, if (hns_roce_check_whether_mhop(hr_dev, table->type)) { hns_roce_cleanup_mhop_hem_table(hr_dev, table); + mutex_destroy(&table->mutex); return; } @@ -948,6 +892,7 @@ void hns_roce_cleanup_hem_table(struct hns_roce_dev *hr_dev, hns_roce_free_hem(hr_dev, table->hem[i]); } + mutex_destroy(&table->mutex); kfree(table->hem); } @@ -986,6 +931,7 @@ struct hns_roce_hem_item { size_t count; /* max ba numbers */ int start; /* start buf offset in this hem */ int end; /* end buf offset in this hem */ + bool exist_bt; }; /* All HEM items are linked in a tree structure */ @@ -1014,6 +960,7 @@ hem_list_alloc_item(struct hns_roce_dev *hr_dev, int start, int end, int count, } } + hem->exist_bt = exist_bt; hem->count = count; hem->start = start; hem->end = end; @@ -1024,34 +971,32 @@ hem_list_alloc_item(struct hns_roce_dev *hr_dev, int start, int end, int count, } static void hem_list_free_item(struct hns_roce_dev *hr_dev, - struct hns_roce_hem_item *hem, bool exist_bt) + struct hns_roce_hem_item *hem) { - if (exist_bt) + if (hem->exist_bt) dma_free_coherent(hr_dev->dev, hem->count * BA_BYTE_LEN, hem->addr, hem->dma_addr); kfree(hem); } static void hem_list_free_all(struct hns_roce_dev *hr_dev, - struct list_head *head, bool exist_bt) + struct list_head *head) { struct hns_roce_hem_item *hem, *temp_hem; list_for_each_entry_safe(hem, temp_hem, head, list) { list_del(&hem->list); - hem_list_free_item(hr_dev, hem, exist_bt); + hem_list_free_item(hr_dev, hem); } } -static void hem_list_link_bt(struct hns_roce_dev *hr_dev, void *base_addr, - u64 table_addr) +static void hem_list_link_bt(void *base_addr, u64 table_addr) { *(u64 *)(base_addr) = table_addr; } /* assign L0 table address to hem from root bt */ -static void hem_list_assign_bt(struct hns_roce_dev *hr_dev, - struct hns_roce_hem_item *hem, void *cpu_addr, +static void hem_list_assign_bt(struct hns_roce_hem_item *hem, void *cpu_addr, u64 phy_addr) { hem->addr = cpu_addr; @@ -1098,9 +1043,9 @@ static bool hem_list_is_bottom_bt(int hopnum, int bt_level) * @bt_level: base address table level * @unit: ba entries per bt page */ -static u32 hem_list_calc_ba_range(int hopnum, int bt_level, int unit) +static u64 hem_list_calc_ba_range(int hopnum, int bt_level, int unit) { - u32 step; + u64 step; int max; int i; @@ -1136,11 +1081,15 @@ int hns_roce_hem_list_calc_root_ba(const struct hns_roce_buf_region *regions, { struct hns_roce_buf_region *r; int total = 0; - int step; + u64 step; int i; for (i = 0; i < region_cnt; i++) { r = (struct hns_roce_buf_region *)®ions[i]; + /* when r->hopnum = 0, the region should not occupy root_ba. */ + if (!r->hopnum) + continue; + if (r->hopnum > 1) { step = hem_list_calc_ba_range(r->hopnum, 1, unit); if (step > 0) @@ -1167,7 +1116,7 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, int ret = 0; int max_ofs; int level; - u32 step; + u64 step; int end; if (hopnum <= 1) @@ -1191,10 +1140,12 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, /* config L1 bt to last bt and link them to corresponding parent */ for (level = 1; level < hopnum; level++) { - cur = hem_list_search_item(&mid_bt[level], offset); - if (cur) { - hem_ptrs[level] = cur; - continue; + if (!hem_list_is_bottom_bt(hopnum, level)) { + cur = hem_list_search_item(&mid_bt[level], offset); + if (cur) { + hem_ptrs[level] = cur; + continue; + } } step = hem_list_calc_ba_range(hopnum, level, unit); @@ -1204,7 +1155,7 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, } start_aligned = (distance / step) * step + r->offset; - end = min_t(int, start_aligned + step - 1, max_ofs); + end = min_t(u64, start_aligned + step - 1, max_ofs); cur = hem_list_alloc_item(hr_dev, start_aligned, end, unit, true); if (!cur) { @@ -1220,8 +1171,7 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, if (level > 1) { pre = hem_ptrs[level - 1]; step = (cur->start - pre->start) / step * BA_BYTE_LEN; - hem_list_link_bt(hr_dev, pre->addr + step, - cur->dma_addr); + hem_list_link_bt(pre->addr + step, cur->dma_addr); } } @@ -1233,7 +1183,7 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, err_exit: for (level = 1; level < hopnum; level++) - hem_list_free_all(hr_dev, &temp_list[level], true); + hem_list_free_all(hr_dev, &temp_list[level]); return ret; } @@ -1274,16 +1224,26 @@ static int alloc_fake_root_bt(struct hns_roce_dev *hr_dev, void *cpu_base, { struct hns_roce_hem_item *hem; + /* This is on the has_mtt branch, if r->hopnum + * is 0, there is no root_ba to reuse for the + * region's fake hem, so a dma_alloc request is + * necessary here. + */ hem = hem_list_alloc_item(hr_dev, r->offset, r->offset + r->count - 1, - r->count, false); + r->count, !r->hopnum); if (!hem) return -ENOMEM; - hem_list_assign_bt(hr_dev, hem, cpu_base, phy_base); + /* The root_ba can be reused only when r->hopnum > 0. */ + if (r->hopnum) + hem_list_assign_bt(hem, cpu_base, phy_base); list_add(&hem->list, branch_head); list_add(&hem->sibling, leaf_head); - return r->count; + /* If r->hopnum == 0, 0 is returned, + * so that the root_bt entry is not occupied. + */ + return r->hopnum ? r->count : 0; } static int setup_middle_bt(struct hns_roce_dev *hr_dev, void *cpu_base, @@ -1293,7 +1253,7 @@ static int setup_middle_bt(struct hns_roce_dev *hr_dev, void *cpu_base, struct hns_roce_hem_item *hem, *temp_hem; int total = 0; int offset; - int step; + u64 step; step = hem_list_calc_ba_range(r->hopnum, 1, unit); if (step < 1) @@ -1302,7 +1262,7 @@ static int setup_middle_bt(struct hns_roce_dev *hr_dev, void *cpu_base, /* if exist mid bt, link L1 to L0 */ list_for_each_entry_safe(hem, temp_hem, branch_head, list) { offset = (hem->start - r->offset) / step * BA_BYTE_LEN; - hem_list_link_bt(hr_dev, cpu_base + offset, hem->dma_addr); + hem_list_link_bt(cpu_base + offset, hem->dma_addr); total++; } @@ -1327,7 +1287,7 @@ setup_root_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem_list *hem_list, return -ENOMEM; total = 0; - for (i = 0; i < region_cnt && total < max_ba_num; i++) { + for (i = 0; i < region_cnt && total <= max_ba_num; i++) { r = ®ions[i]; if (!r->count) continue; @@ -1393,14 +1353,19 @@ static int hem_list_alloc_root_bt(struct hns_roce_dev *hr_dev, region_cnt); if (ret) { for (i = 0; i < region_cnt; i++) - hem_list_free_all(hr_dev, &head.branch[i], false); + hem_list_free_all(hr_dev, &head.branch[i]); - hem_list_free_all(hr_dev, &head.root, true); + hem_list_free_all(hr_dev, &head.root); } return ret; } +/* This is the bottom bt pages number of a 100G MR on 4K OS, assuming + * the bt page size is not expanded by cal_best_bt_pg_sz() + */ +#define RESCHED_LOOP_CNT_THRESHOLD_ON_4K 12800 + /* construct the base address table and link them by address hop config */ int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev, struct hns_roce_hem_list *hem_list, @@ -1409,6 +1374,7 @@ int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev, { const struct hns_roce_buf_region *r; int ofs, end; + int loop; int unit; int ret; int i; @@ -1426,7 +1392,10 @@ int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev, continue; end = r->offset + r->count; - for (ofs = r->offset; ofs < end; ofs += unit) { + for (ofs = r->offset, loop = 1; ofs < end; ofs += unit, loop++) { + if (!(loop % RESCHED_LOOP_CNT_THRESHOLD_ON_4K)) + cond_resched(); + ret = hem_list_alloc_mid_bt(hr_dev, r, unit, ofs, hem_list->mid_bt[i], &hem_list->btm_bt); @@ -1458,10 +1427,9 @@ void hns_roce_hem_list_release(struct hns_roce_dev *hr_dev, for (i = 0; i < HNS_ROCE_MAX_BT_REGION; i++) for (j = 0; j < HNS_ROCE_MAX_BT_LEVEL; j++) - hem_list_free_all(hr_dev, &hem_list->mid_bt[i][j], - j != 0); + hem_list_free_all(hr_dev, &hem_list->mid_bt[i][j]); - hem_list_free_all(hr_dev, &hem_list->root_bt, true); + hem_list_free_all(hr_dev, &hem_list->root_bt); INIT_LIST_HEAD(&hem_list->btm_bt); hem_list->root_ba = 0; } @@ -1484,9 +1452,14 @@ void *hns_roce_hem_list_find_mtt(struct hns_roce_dev *hr_dev, struct list_head *head = &hem_list->btm_bt; struct hns_roce_hem_item *hem, *temp_hem; void *cpu_base = NULL; + int loop = 1; int nr = 0; list_for_each_entry_safe(hem, temp_hem, head, sibling) { + if (!(loop % RESCHED_LOOP_CNT_THRESHOLD_ON_4K)) + cond_resched(); + loop++; + if (hem_list_page_is_in_range(hem, offset)) { nr = offset - hem->start; cpu_base = hem->addr + nr * BA_BYTE_LEN; diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.h b/drivers/infiniband/hw/hns/hns_roce_hem.h index 7d23d3c51da4..9c415b2541af 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.h +++ b/drivers/infiniband/hw/hns/hns_roce_hem.h @@ -56,41 +56,25 @@ enum { HEM_TYPE_TRRL, }; -#define HNS_ROCE_HEM_CHUNK_LEN \ - ((256 - sizeof(struct list_head) - 2 * sizeof(int)) / \ - (sizeof(struct scatterlist) + sizeof(void *))) - #define check_whether_bt_num_3(type, hop_num) \ - (type < HEM_TYPE_MTT && hop_num == 2) + ((type) < HEM_TYPE_MTT && (hop_num) == 2) #define check_whether_bt_num_2(type, hop_num) \ - ((type < HEM_TYPE_MTT && hop_num == 1) || \ - (type >= HEM_TYPE_MTT && hop_num == 2)) + (((type) < HEM_TYPE_MTT && (hop_num) == 1) || \ + ((type) >= HEM_TYPE_MTT && (hop_num) == 2)) #define check_whether_bt_num_1(type, hop_num) \ - ((type < HEM_TYPE_MTT && hop_num == HNS_ROCE_HOP_NUM_0) || \ - (type >= HEM_TYPE_MTT && hop_num == 1) || \ - (type >= HEM_TYPE_MTT && hop_num == HNS_ROCE_HOP_NUM_0)) - -struct hns_roce_hem_chunk { - struct list_head list; - int npages; - int nsg; - struct scatterlist mem[HNS_ROCE_HEM_CHUNK_LEN]; - void *buf[HNS_ROCE_HEM_CHUNK_LEN]; -}; + (((type) < HEM_TYPE_MTT && (hop_num) == HNS_ROCE_HOP_NUM_0) || \ + ((type) >= HEM_TYPE_MTT && (hop_num) == 1) || \ + ((type) >= HEM_TYPE_MTT && (hop_num) == HNS_ROCE_HOP_NUM_0)) struct hns_roce_hem { - struct list_head chunk_list; + void *buf; + dma_addr_t dma; + unsigned long size; refcount_t refcount; }; -struct hns_roce_hem_iter { - struct hns_roce_hem *hem; - struct hns_roce_hem_chunk *chunk; - int page_idx; -}; - struct hns_roce_hem_mhop { u32 hop_num; u32 buf_chunk_size; @@ -133,38 +117,4 @@ void *hns_roce_hem_list_find_mtt(struct hns_roce_dev *hr_dev, struct hns_roce_hem_list *hem_list, int offset, int *mtt_cnt); -static inline void hns_roce_hem_first(struct hns_roce_hem *hem, - struct hns_roce_hem_iter *iter) -{ - iter->hem = hem; - iter->chunk = list_empty(&hem->chunk_list) ? NULL : - list_entry(hem->chunk_list.next, - struct hns_roce_hem_chunk, list); - iter->page_idx = 0; -} - -static inline int hns_roce_hem_last(struct hns_roce_hem_iter *iter) -{ - return !iter->chunk; -} - -static inline void hns_roce_hem_next(struct hns_roce_hem_iter *iter) -{ - if (++iter->page_idx >= iter->chunk->nsg) { - if (iter->chunk->list.next == &iter->hem->chunk_list) { - iter->chunk = NULL; - return; - } - - iter->chunk = list_entry(iter->chunk->list.next, - struct hns_roce_hem_chunk, list); - iter->page_idx = 0; - } -} - -static inline dma_addr_t hns_roce_hem_addr(struct hns_roce_hem_iter *iter) -{ - return sg_dma_address(&iter->chunk->mem[iter->page_idx]); -} - #endif /* _HNS_ROCE_HEM_H */ diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 8206daea6767..160e8927d364 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -36,6 +36,7 @@ #include <linux/iopoll.h> #include <linux/kernel.h> #include <linux/types.h> +#include <linux/workqueue.h> #include <net/addrconf.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> @@ -372,19 +373,12 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr, static int check_send_valid(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) { - struct ib_device *ibdev = &hr_dev->ib_dev; - if (unlikely(hr_qp->state == IB_QPS_RESET || hr_qp->state == IB_QPS_INIT || - hr_qp->state == IB_QPS_RTR)) { - ibdev_err(ibdev, "failed to post WQE, QP state %u!\n", - hr_qp->state); + hr_qp->state == IB_QPS_RTR)) return -EINVAL; - } else if (unlikely(hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN)) { - ibdev_err(ibdev, "failed to post WQE, dev state %d!\n", - hr_dev->state); + else if (unlikely(hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN)) return -EIO; - } return 0; } @@ -443,10 +437,6 @@ static int fill_ud_av(struct hns_roce_v2_ud_send_wqe *ud_sq_wqe, hr_reg_write(ud_sq_wqe, UD_SEND_WQE_HOPLIMIT, ah->av.hop_limit); hr_reg_write(ud_sq_wqe, UD_SEND_WQE_TCLASS, ah->av.tclass); hr_reg_write(ud_sq_wqe, UD_SEND_WQE_FLOW_LABEL, ah->av.flowlabel); - - if (WARN_ON(ah->av.sl > MAX_SERVICE_LEVEL)) - return -EINVAL; - hr_reg_write(ud_sq_wqe, UD_SEND_WQE_SL, ah->av.sl); ud_sq_wqe->sgid_index = ah->av.gid_index; @@ -478,7 +468,7 @@ static inline int set_ud_wqe(struct hns_roce_qp *qp, valid_num_sge = calc_wr_sge_num(wr, &msg_len); ret = set_ud_opcode(ud_sq_wqe, wr); - if (WARN_ON(ret)) + if (WARN_ON_ONCE(ret)) return ret; ud_sq_wqe->msg_len = cpu_to_le32(msg_len); @@ -582,10 +572,10 @@ static inline int set_rc_wqe(struct hns_roce_qp *qp, rc_sq_wqe->msg_len = cpu_to_le32(msg_len); ret = set_rc_opcode(hr_dev, rc_sq_wqe, wr); - if (WARN_ON(ret)) + if (WARN_ON_ONCE(ret)) return ret; - hr_reg_write(rc_sq_wqe, RC_SEND_WQE_FENCE, + hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SO, (wr->send_flags & IB_SEND_FENCE) ? 1 : 0); hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SE, @@ -595,11 +585,16 @@ static inline int set_rc_wqe(struct hns_roce_qp *qp, (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0); if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || - wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) + wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + if (msg_len != ATOMIC_WR_LEN) + return -EINVAL; set_atomic_seg(wr, rc_sq_wqe, valid_num_sge); - else if (wr->opcode != IB_WR_REG_MR) + } else if (wr->opcode != IB_WR_REG_MR) { ret = set_rwqe_data_seg(&qp->ibqp, wr, rc_sq_wqe, &curr_idx, valid_num_sge); + if (ret) + return ret; + } /* * The pipeline can sequentially post all valid WQEs into WQ buffer, @@ -675,6 +670,10 @@ static void write_dwqe(struct hns_roce_dev *hr_dev, struct hns_roce_qp *qp, #define HNS_ROCE_SL_SHIFT 2 struct hns_roce_v2_rc_send_wqe *rc_sq_wqe = wqe; + if (unlikely(qp->state == IB_QPS_ERR)) { + flush_cqe(hr_dev, qp); + return; + } /* All kinds of DirectWQE have the same header field layout */ hr_reg_enable(rc_sq_wqe, RC_SEND_WQE_FLAG); hr_reg_write(rc_sq_wqe, RC_SEND_WQE_DB_SL_L, qp->sl); @@ -1273,19 +1272,42 @@ static int hns_roce_cmd_err_convert_errno(u16 desc_ret) return -EIO; } -static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, - struct hns_roce_cmq_desc *desc, int num) +static u32 hns_roce_cmdq_tx_timeout(u16 opcode, u32 tx_timeout) +{ + static const struct hns_roce_cmdq_tx_timeout_map cmdq_tx_timeout[] = { + {HNS_ROCE_OPC_POST_MB, HNS_ROCE_OPC_POST_MB_TIMEOUT}, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(cmdq_tx_timeout); i++) + if (cmdq_tx_timeout[i].opcode == opcode) + return cmdq_tx_timeout[i].tx_timeout; + + return tx_timeout; +} + +static void hns_roce_wait_csq_done(struct hns_roce_dev *hr_dev, u32 tx_timeout) +{ + u32 timeout = 0; + + do { + if (hns_roce_cmq_csq_done(hr_dev)) + break; + udelay(1); + } while (++timeout < tx_timeout); +} + +static int __hns_roce_cmq_send_one(struct hns_roce_dev *hr_dev, + struct hns_roce_cmq_desc *desc, + int num, u32 tx_timeout) { struct hns_roce_v2_priv *priv = hr_dev->priv; struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq; - u32 timeout = 0; u16 desc_ret; u32 tail; int ret; int i; - spin_lock_bh(&csq->lock); - tail = csq->head; for (i = 0; i < num; i++) { @@ -1299,27 +1321,17 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CMDS_CNT]); - do { - if (hns_roce_cmq_csq_done(hr_dev)) - break; - udelay(1); - } while (++timeout < priv->cmq.tx_timeout); - + hns_roce_wait_csq_done(hr_dev, tx_timeout); if (hns_roce_cmq_csq_done(hr_dev)) { ret = 0; for (i = 0; i < num; i++) { /* check the result of hardware write back */ - desc[i] = csq->desc[tail++]; + desc_ret = le16_to_cpu(csq->desc[tail++].retval); if (tail == csq->desc_num) tail = 0; - - desc_ret = le16_to_cpu(desc[i].retval); if (likely(desc_ret == CMD_EXEC_SUCCESS)) continue; - dev_err_ratelimited(hr_dev->dev, - "Cmdq IO error, opcode = 0x%x, return = 0x%x.\n", - desc->opcode, desc_ret); ret = hns_roce_cmd_err_convert_errno(desc_ret); } } else { @@ -1334,14 +1346,54 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, ret = -EAGAIN; } - spin_unlock_bh(&csq->lock); - if (ret) atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CMDS_ERR_CNT]); return ret; } +static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, + struct hns_roce_cmq_desc *desc, int num) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq; + u16 opcode = le16_to_cpu(desc->opcode); + u32 tx_timeout = hns_roce_cmdq_tx_timeout(opcode, priv->cmq.tx_timeout); + u8 try_cnt = HNS_ROCE_OPC_POST_MB_TRY_CNT; + u32 rsv_tail; + int ret; + int i; + + while (try_cnt) { + try_cnt--; + + spin_lock_bh(&csq->lock); + rsv_tail = csq->head; + ret = __hns_roce_cmq_send_one(hr_dev, desc, num, tx_timeout); + if (opcode == HNS_ROCE_OPC_POST_MB && ret == -ETIME && + try_cnt) { + spin_unlock_bh(&csq->lock); + mdelay(HNS_ROCE_OPC_POST_MB_RETRY_GAP_MSEC); + continue; + } + + for (i = 0; i < num; i++) { + desc[i] = csq->desc[rsv_tail++]; + if (rsv_tail == csq->desc_num) + rsv_tail = 0; + } + spin_unlock_bh(&csq->lock); + break; + } + + if (ret) + dev_err_ratelimited(hr_dev->dev, + "Cmdq IO error, opcode = 0x%x, return = %d.\n", + opcode, ret); + + return ret; +} + static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, struct hns_roce_cmq_desc *desc, int num) { @@ -1658,8 +1710,8 @@ static int hns_roce_hw_v2_query_counter(struct hns_roce_dev *hr_dev, for (i = 0; i < HNS_ROCE_HW_CNT_TOTAL && i < *num_counters; i++) { bd_idx = i / CNT_PER_DESC; - if (!(desc[bd_idx].flag & HNS_ROCE_CMD_FLAG_NEXT) && - bd_idx != HNS_ROCE_HW_CNT_TOTAL / CNT_PER_DESC) + if (bd_idx != HNS_ROCE_HW_CNT_TOTAL / CNT_PER_DESC && + !(desc[bd_idx].flag & cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT))) break; cnt_data = (__le64 *)&desc[bd_idx].data[0]; @@ -2105,7 +2157,7 @@ static void apply_func_caps(struct hns_roce_dev *hr_dev) caps->gmv_bt_num * (HNS_HW_PAGE_SIZE / caps->gmv_entry_sz)); - caps->gmv_entry_num = caps->gmv_bt_num * (PAGE_SIZE / + caps->gmv_entry_num = caps->gmv_bt_num * (HNS_HW_PAGE_SIZE / caps->gmv_entry_sz); } else { u32 func_num = max_t(u32, 1, hr_dev->func_num); @@ -2209,11 +2261,12 @@ static int hns_roce_query_caps(struct hns_roce_dev *hr_dev) caps->max_wqes = 1 << le16_to_cpu(resp_c->sq_depth); caps->num_srqs = 1 << hr_reg_read(resp_d, PF_CAPS_D_NUM_SRQS); - caps->cong_type = hr_reg_read(resp_d, PF_CAPS_D_CONG_TYPE); + caps->cong_cap = hr_reg_read(resp_d, PF_CAPS_D_CONG_CAP); caps->max_srq_wrs = 1 << le16_to_cpu(resp_d->srq_depth); caps->ceqe_depth = 1 << hr_reg_read(resp_d, PF_CAPS_D_CEQ_DEPTH); caps->num_comp_vectors = hr_reg_read(resp_d, PF_CAPS_D_NUM_CEQS); caps->aeqe_depth = 1 << hr_reg_read(resp_d, PF_CAPS_D_AEQ_DEPTH); + caps->default_cong_type = hr_reg_read(resp_d, PF_CAPS_D_DEFAULT_ALG); caps->reserved_pds = hr_reg_read(resp_d, PF_CAPS_D_RSV_PDS); caps->num_uars = 1 << hr_reg_read(resp_d, PF_CAPS_D_NUM_UARS); caps->reserved_qps = hr_reg_read(resp_d, PF_CAPS_D_RSV_QPS); @@ -2460,14 +2513,16 @@ static int set_llm_cfg_to_hw(struct hns_roce_dev *hr_dev, static struct hns_roce_link_table * alloc_link_table_buf(struct hns_roce_dev *hr_dev) { + u16 total_sl = hr_dev->caps.sl_num * hr_dev->func_num; struct hns_roce_v2_priv *priv = hr_dev->priv; struct hns_roce_link_table *link_tbl; u32 pg_shift, size, min_size; link_tbl = &priv->ext_llm; pg_shift = hr_dev->caps.llm_buf_pg_sz + PAGE_SHIFT; - size = hr_dev->caps.num_qps * HNS_ROCE_V2_EXT_LLM_ENTRY_SZ; - min_size = HNS_ROCE_EXT_LLM_MIN_PAGES(hr_dev->caps.sl_num) << pg_shift; + size = hr_dev->caps.num_qps * hr_dev->func_num * + HNS_ROCE_V2_EXT_LLM_ENTRY_SZ; + min_size = HNS_ROCE_EXT_LLM_MIN_PAGES(total_sl) << pg_shift; /* Alloc data table */ size = max(size, min_size); @@ -2534,20 +2589,19 @@ static void hns_roce_free_link_table(struct hns_roce_dev *hr_dev) free_link_table_buf(hr_dev, &priv->ext_llm); } -static void free_dip_list(struct hns_roce_dev *hr_dev) +static void free_dip_entry(struct hns_roce_dev *hr_dev) { struct hns_roce_dip *hr_dip; - struct hns_roce_dip *tmp; - unsigned long flags; + unsigned long idx; - spin_lock_irqsave(&hr_dev->dip_list_lock, flags); + xa_lock(&hr_dev->qp_table.dip_xa); - list_for_each_entry_safe(hr_dip, tmp, &hr_dev->dip_list, node) { - list_del(&hr_dip->node); + xa_for_each(&hr_dev->qp_table.dip_xa, idx, hr_dip) { + __xa_erase(&hr_dev->qp_table.dip_xa, hr_dip->dip_idx); kfree(hr_dip); } - spin_unlock_irqrestore(&hr_dev->dip_list_lock, flags); + xa_unlock(&hr_dev->qp_table.dip_xa); } static struct ib_pd *free_mr_init_pd(struct hns_roce_dev *hr_dev) @@ -2670,6 +2724,8 @@ static void free_mr_exit(struct hns_roce_dev *hr_dev) kfree(free_mr->rsv_pd); free_mr->rsv_pd = NULL; } + + mutex_destroy(&free_mr->mutex); } static int free_mr_alloc_res(struct hns_roce_dev *hr_dev) @@ -2747,8 +2803,8 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_INIT, IB_QPS_INIT, NULL); if (ret) { - ibdev_err(ibdev, "failed to modify qp to init, ret = %d.\n", - ret); + ibdev_err_ratelimited(ibdev, "failed to modify qp to init, ret = %d.\n", + ret); return ret; } @@ -2820,8 +2876,10 @@ static int free_mr_init(struct hns_roce_dev *hr_dev) mutex_init(&free_mr->mutex); ret = free_mr_alloc_res(hr_dev); - if (ret) + if (ret) { + mutex_destroy(&free_mr->mutex); return ret; + } ret = free_mr_modify_qp(hr_dev); if (ret) @@ -2942,13 +3000,16 @@ err_llm_init_failed: static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev) { + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) + free_mr_exit(hr_dev); + hns_roce_function_clear(hr_dev); if (!hr_dev->is_vf) hns_roce_free_link_table(hr_dev); if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP09) - free_dip_list(hr_dev); + free_dip_entry(hr_dev); } static int hns_roce_mbox_post(struct hns_roce_dev *hr_dev, @@ -3195,24 +3256,26 @@ static int set_mtpt_pbl(struct hns_roce_dev *hr_dev, u64 pages[HNS_ROCE_V2_MAX_INNER_MTPT_NUM] = { 0 }; struct ib_device *ibdev = &hr_dev->ib_dev; dma_addr_t pbl_ba; - int i, count; + int ret; + int i; - count = hns_roce_mtr_find(hr_dev, &mr->pbl_mtr, 0, pages, - min_t(int, ARRAY_SIZE(pages), mr->npages), - &pbl_ba); - if (count < 1) { - ibdev_err(ibdev, "failed to find PBL mtr, count = %d.\n", - count); - return -ENOBUFS; + ret = hns_roce_mtr_find(hr_dev, &mr->pbl_mtr, 0, pages, + min_t(int, ARRAY_SIZE(pages), mr->npages)); + if (ret) { + ibdev_err(ibdev, "failed to find PBL mtr, ret = %d.\n", ret); + return ret; } /* Aligned to the hardware address access unit */ - for (i = 0; i < count; i++) - pages[i] >>= 6; + for (i = 0; i < ARRAY_SIZE(pages); i++) + pages[i] >>= MPT_PBL_BUF_ADDR_S; + + pbl_ba = hns_roce_get_mtr_ba(&mr->pbl_mtr); mpt_entry->pbl_size = cpu_to_le32(mr->npages); - mpt_entry->pbl_ba_l = cpu_to_le32(pbl_ba >> 3); - hr_reg_write(mpt_entry, MPT_PBL_BA_H, upper_32_bits(pbl_ba >> 3)); + mpt_entry->pbl_ba_l = cpu_to_le32(pbl_ba >> MPT_PBL_BA_ADDR_S); + hr_reg_write(mpt_entry, MPT_PBL_BA_H, + upper_32_bits(pbl_ba >> MPT_PBL_BA_ADDR_S)); mpt_entry->pa0_l = cpu_to_le32(lower_32_bits(pages[0])); hr_reg_write(mpt_entry, MPT_PA0_H, upper_32_bits(pages[0])); @@ -3305,21 +3368,14 @@ static int hns_roce_v2_rereg_write_mtpt(struct hns_roce_dev *hr_dev, return ret; } -static int hns_roce_v2_frmr_write_mtpt(struct hns_roce_dev *hr_dev, - void *mb_buf, struct hns_roce_mr *mr) +static int hns_roce_v2_frmr_write_mtpt(void *mb_buf, struct hns_roce_mr *mr) { - struct ib_device *ibdev = &hr_dev->ib_dev; + dma_addr_t pbl_ba = hns_roce_get_mtr_ba(&mr->pbl_mtr); struct hns_roce_v2_mpt_entry *mpt_entry; - dma_addr_t pbl_ba = 0; mpt_entry = mb_buf; memset(mpt_entry, 0, sizeof(*mpt_entry)); - if (hns_roce_mtr_find(hr_dev, &mr->pbl_mtr, 0, NULL, 0, &pbl_ba) < 0) { - ibdev_err(ibdev, "failed to find frmr mtr.\n"); - return -ENOBUFS; - } - hr_reg_write(mpt_entry, MPT_ST, V2_MPT_ST_FREE); hr_reg_write(mpt_entry, MPT_PD, mr->pd); @@ -3339,8 +3395,10 @@ static int hns_roce_v2_frmr_write_mtpt(struct hns_roce_dev *hr_dev, mpt_entry->pbl_size = cpu_to_le32(mr->npages); - mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(pbl_ba >> 3)); - hr_reg_write(mpt_entry, MPT_PBL_BA_H, upper_32_bits(pbl_ba >> 3)); + mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(pbl_ba >> + MPT_PBL_BA_ADDR_S)); + hr_reg_write(mpt_entry, MPT_PBL_BA_H, + upper_32_bits(pbl_ba >> MPT_PBL_BA_ADDR_S)); return 0; } @@ -3391,8 +3449,8 @@ static int free_mr_post_send_lp_wqe(struct hns_roce_qp *hr_qp) ret = hns_roce_v2_post_send(&hr_qp->ibqp, send_wr, &bad_wr); if (ret) { - ibdev_err(ibdev, "failed to post wqe for free mr, ret = %d.\n", - ret); + ibdev_err_ratelimited(ibdev, "failed to post wqe for free mr, ret = %d.\n", + ret); return ret; } @@ -3431,9 +3489,9 @@ static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev) ret = free_mr_post_send_lp_wqe(hr_qp); if (ret) { - ibdev_err(ibdev, - "failed to send wqe (qp:0x%lx) for free mr, ret = %d.\n", - hr_qp->qpn, ret); + ibdev_err_ratelimited(ibdev, + "failed to send wqe (qp:0x%lx) for free mr, ret = %d.\n", + hr_qp->qpn, ret); break; } @@ -3444,16 +3502,16 @@ static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev) while (cqe_cnt) { npolled = hns_roce_v2_poll_cq(&free_mr->rsv_cq->ib_cq, cqe_cnt, wc); if (npolled < 0) { - ibdev_err(ibdev, - "failed to poll cqe for free mr, remain %d cqe.\n", - cqe_cnt); + ibdev_err_ratelimited(ibdev, + "failed to poll cqe for free mr, remain %d cqe.\n", + cqe_cnt); goto out; } if (time_after(jiffies, end)) { - ibdev_err(ibdev, - "failed to poll cqe for free mr and timeout, remain %d cqe.\n", - cqe_cnt); + ibdev_err_ratelimited(ibdev, + "failed to poll cqe for free mr and timeout, remain %d cqe.\n", + cqe_cnt); goto out; } cqe_cnt -= npolled; @@ -3586,14 +3644,14 @@ static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev, to_hr_hw_page_shift(hr_cq->mtr.hem_cfg.ba_pg_shift)); hr_reg_write(cq_context, CQC_CQE_BUF_PG_SZ, to_hr_hw_page_shift(hr_cq->mtr.hem_cfg.buf_pg_shift)); - hr_reg_write(cq_context, CQC_CQE_BA_L, dma_handle >> 3); - hr_reg_write(cq_context, CQC_CQE_BA_H, (dma_handle >> (32 + 3))); + hr_reg_write(cq_context, CQC_CQE_BA_L, dma_handle >> CQC_CQE_BA_L_S); + hr_reg_write(cq_context, CQC_CQE_BA_H, dma_handle >> CQC_CQE_BA_H_S); hr_reg_write_bool(cq_context, CQC_DB_RECORD_EN, hr_cq->flags & HNS_ROCE_CQ_FLAG_RECORD_DB); hr_reg_write(cq_context, CQC_CQE_DB_RECORD_ADDR_L, ((u32)hr_cq->db.dma) >> 1); hr_reg_write(cq_context, CQC_CQE_DB_RECORD_ADDR_H, - hr_cq->db.dma >> 32); + hr_cq->db.dma >> CQC_CQE_DB_RECORD_ADDR_H_S); hr_reg_write(cq_context, CQC_CQ_MAX_CNT, HNS_ROCE_V2_CQ_DEFAULT_BURST_NUM); hr_reg_write(cq_context, CQC_CQ_PERIOD, @@ -3715,8 +3773,9 @@ static void get_cqe_status(struct hns_roce_dev *hr_dev, struct hns_roce_qp *qp, wc->status == IB_WC_WR_FLUSH_ERR)) return; - ibdev_err(&hr_dev->ib_dev, "error cqe status 0x%x:\n", cqe_status); - print_hex_dump(KERN_ERR, "", DUMP_PREFIX_NONE, 16, 4, cqe, + ibdev_err_ratelimited(&hr_dev->ib_dev, "error cqe status 0x%x:\n", + cqe_status); + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_NONE, 16, 4, cqe, cq->cqe_size, false); wc->vendor_err = hr_reg_read(cqe, CQE_SUB_STATUS); @@ -4063,7 +4122,6 @@ static int hns_roce_v2_set_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem_table *table, int obj, u32 step_idx) { - struct hns_roce_hem_iter iter; struct hns_roce_hem_mhop mhop; struct hns_roce_hem *hem; unsigned long mhop_obj = obj; @@ -4100,12 +4158,8 @@ static int hns_roce_v2_set_hem(struct hns_roce_dev *hr_dev, if (check_whether_last_step(hop_num, step_idx)) { hem = table->hem[hem_idx]; - for (hns_roce_hem_first(hem, &iter); - !hns_roce_hem_last(&iter); hns_roce_hem_next(&iter)) { - bt_ba = hns_roce_hem_addr(&iter); - ret = set_hem_to_hw(hr_dev, obj, bt_ba, table->type, - step_idx); - } + + ret = set_hem_to_hw(hr_dev, obj, hem->dma, table->type, step_idx); } else { if (step_idx == 0) bt_ba = table->bt_l0_dma_addr[i]; @@ -4226,8 +4280,7 @@ static void set_access_flags(struct hns_roce_qp *hr_qp, } static void set_qpc_wqe_cnt(struct hns_roce_qp *hr_qp, - struct hns_roce_v2_qp_context *context, - struct hns_roce_v2_qp_context *qpc_mask) + struct hns_roce_v2_qp_context *context) { hr_reg_write(context, QPC_SGE_SHIFT, to_hr_hem_entries_shift(hr_qp->sge.sge_cnt, @@ -4249,7 +4302,6 @@ static inline int get_pdn(struct ib_pd *ib_pd) } static void modify_qp_reset_to_init(struct ib_qp *ibqp, - const struct ib_qp_attr *attr, struct hns_roce_v2_qp_context *context, struct hns_roce_v2_qp_context *qpc_mask) { @@ -4268,7 +4320,7 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, hr_reg_write(context, QPC_RQWS, ilog2(hr_qp->rq.max_gs)); - set_qpc_wqe_cnt(hr_qp, context, qpc_mask); + set_qpc_wqe_cnt(hr_qp, context); /* No VLAN need to set 0xFFF */ hr_reg_write(context, QPC_VLAN_ID, 0xfff); @@ -4309,7 +4361,6 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, } static void modify_qp_init_to_init(struct ib_qp *ibqp, - const struct ib_qp_attr *attr, struct hns_roce_v2_qp_context *context, struct hns_roce_v2_qp_context *qpc_mask) { @@ -4346,17 +4397,20 @@ static int config_qp_rq_buf(struct hns_roce_dev *hr_dev, { u64 mtts[MTT_MIN_COUNT] = { 0 }; u64 wqe_sge_ba; - int count; + int ret; /* Search qp buf's mtts */ - count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, hr_qp->rq.offset, mtts, - MTT_MIN_COUNT, &wqe_sge_ba); - if (hr_qp->rq.wqe_cnt && count < 1) { + ret = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, hr_qp->rq.offset, mtts, + MTT_MIN_COUNT); + if (hr_qp->rq.wqe_cnt && ret) { ibdev_err(&hr_dev->ib_dev, - "failed to find RQ WQE, QPN = 0x%lx.\n", hr_qp->qpn); - return -EINVAL; + "failed to find QP(0x%lx) RQ WQE buf, ret = %d.\n", + hr_qp->qpn, ret); + return ret; } + wqe_sge_ba = hns_roce_get_mtr_ba(&hr_qp->mtr); + context->wqe_sge_ba = cpu_to_le32(wqe_sge_ba >> 3); qpc_mask->wqe_sge_ba = 0; @@ -4400,12 +4454,14 @@ static int config_qp_rq_buf(struct hns_roce_dev *hr_dev, upper_32_bits(to_hr_hw_page_addr(mtts[0]))); hr_reg_clear(qpc_mask, QPC_RQ_CUR_BLK_ADDR_H); - context->rq_nxt_blk_addr = cpu_to_le32(to_hr_hw_page_addr(mtts[1])); - qpc_mask->rq_nxt_blk_addr = 0; - - hr_reg_write(context, QPC_RQ_NXT_BLK_ADDR_H, - upper_32_bits(to_hr_hw_page_addr(mtts[1]))); - hr_reg_clear(qpc_mask, QPC_RQ_NXT_BLK_ADDR_H); + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) { + context->rq_nxt_blk_addr = + cpu_to_le32(to_hr_hw_page_addr(mtts[1])); + qpc_mask->rq_nxt_blk_addr = 0; + hr_reg_write(context, QPC_RQ_NXT_BLK_ADDR_H, + upper_32_bits(to_hr_hw_page_addr(mtts[1]))); + hr_reg_clear(qpc_mask, QPC_RQ_NXT_BLK_ADDR_H); + } return 0; } @@ -4418,23 +4474,23 @@ static int config_qp_sq_buf(struct hns_roce_dev *hr_dev, struct ib_device *ibdev = &hr_dev->ib_dev; u64 sge_cur_blk = 0; u64 sq_cur_blk = 0; - int count; + int ret; /* search qp buf's mtts */ - count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, 0, &sq_cur_blk, 1, NULL); - if (count < 1) { - ibdev_err(ibdev, "failed to find QP(0x%lx) SQ buf.\n", - hr_qp->qpn); - return -EINVAL; + ret = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, hr_qp->sq.offset, + &sq_cur_blk, 1); + if (ret) { + ibdev_err(ibdev, "failed to find QP(0x%lx) SQ WQE buf, ret = %d.\n", + hr_qp->qpn, ret); + return ret; } if (hr_qp->sge.sge_cnt > 0) { - count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, - hr_qp->sge.offset, - &sge_cur_blk, 1, NULL); - if (count < 1) { - ibdev_err(ibdev, "failed to find QP(0x%lx) SGE buf.\n", - hr_qp->qpn); - return -EINVAL; + ret = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, + hr_qp->sge.offset, &sge_cur_blk, 1); + if (ret) { + ibdev_err(ibdev, "failed to find QP(0x%lx) SGE buf, ret = %d.\n", + hr_qp->qpn, ret); + return ret; } } @@ -4527,16 +4583,16 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, return -EINVAL; } - hr_reg_write(context, QPC_TRRL_BA_L, trrl_ba >> 4); + hr_reg_write(context, QPC_TRRL_BA_L, trrl_ba >> QPC_TRRL_BA_L_S); hr_reg_clear(qpc_mask, QPC_TRRL_BA_L); - context->trrl_ba = cpu_to_le32(trrl_ba >> (16 + 4)); + context->trrl_ba = cpu_to_le32(trrl_ba >> QPC_TRRL_BA_M_S); qpc_mask->trrl_ba = 0; - hr_reg_write(context, QPC_TRRL_BA_H, trrl_ba >> (32 + 16 + 4)); + hr_reg_write(context, QPC_TRRL_BA_H, trrl_ba >> QPC_TRRL_BA_H_S); hr_reg_clear(qpc_mask, QPC_TRRL_BA_H); - context->irrl_ba = cpu_to_le32(irrl_ba >> 6); + context->irrl_ba = cpu_to_le32(irrl_ba >> QPC_IRRL_BA_L_S); qpc_mask->irrl_ba = 0; - hr_reg_write(context, QPC_IRRL_BA_H, irrl_ba >> (32 + 6)); + hr_reg_write(context, QPC_IRRL_BA_H, irrl_ba >> QPC_IRRL_BA_H_S); hr_reg_clear(qpc_mask, QPC_IRRL_BA_H); hr_reg_enable(context, QPC_RMT_E2E); @@ -4598,8 +4654,9 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, hr_reg_clear(qpc_mask, QPC_TRRL_HEAD_MAX); hr_reg_clear(qpc_mask, QPC_TRRL_TAIL_MAX); +#define MAX_LP_SGEN 3 /* rocee send 2^lp_sgen_ini segs every time */ - hr_reg_write(context, QPC_LP_SGEN_INI, 3); + hr_reg_write(context, QPC_LP_SGEN_INI, MAX_LP_SGEN); hr_reg_clear(qpc_mask, QPC_LP_SGEN_INI); if (udata && ibqp->qp_type == IB_QPT_RC && @@ -4625,8 +4682,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, return 0; } -static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, - const struct ib_qp_attr *attr, int attr_mask, +static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, int attr_mask, struct hns_roce_v2_qp_context *context, struct hns_roce_v2_qp_context *qpc_mask) { @@ -4673,26 +4729,49 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, return 0; } +static int alloc_dip_entry(struct xarray *dip_xa, u32 qpn) +{ + struct hns_roce_dip *hr_dip; + int ret; + + hr_dip = xa_load(dip_xa, qpn); + if (hr_dip) + return 0; + + hr_dip = kzalloc(sizeof(*hr_dip), GFP_KERNEL); + if (!hr_dip) + return -ENOMEM; + + ret = xa_err(xa_store(dip_xa, qpn, hr_dip, GFP_KERNEL)); + if (ret) + kfree(hr_dip); + + return ret; +} + static int get_dip_ctx_idx(struct ib_qp *ibqp, const struct ib_qp_attr *attr, u32 *dip_idx) { const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr); struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); - u32 *spare_idx = hr_dev->qp_table.idx_table.spare_idx; - u32 *head = &hr_dev->qp_table.idx_table.head; - u32 *tail = &hr_dev->qp_table.idx_table.tail; + struct xarray *dip_xa = &hr_dev->qp_table.dip_xa; + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); struct hns_roce_dip *hr_dip; - unsigned long flags; + unsigned long idx; int ret = 0; - spin_lock_irqsave(&hr_dev->dip_list_lock, flags); + ret = alloc_dip_entry(dip_xa, ibqp->qp_num); + if (ret) + return ret; - spare_idx[*tail] = ibqp->qp_num; - *tail = (*tail == hr_dev->caps.num_qps - 1) ? 0 : (*tail + 1); + xa_lock(dip_xa); - list_for_each_entry(hr_dip, &hr_dev->dip_list, node) { - if (!memcmp(grh->dgid.raw, hr_dip->dgid, 16)) { + xa_for_each(dip_xa, idx, hr_dip) { + if (hr_dip->qp_cnt && + !memcmp(grh->dgid.raw, hr_dip->dgid, GID_LEN_V2)) { *dip_idx = hr_dip->dip_idx; + hr_dip->qp_cnt++; + hr_qp->dip = hr_dip; goto out; } } @@ -4700,19 +4779,24 @@ static int get_dip_ctx_idx(struct ib_qp *ibqp, const struct ib_qp_attr *attr, /* If no dgid is found, a new dip and a mapping between dgid and * dip_idx will be created. */ - hr_dip = kzalloc(sizeof(*hr_dip), GFP_ATOMIC); - if (!hr_dip) { - ret = -ENOMEM; - goto out; + xa_for_each(dip_xa, idx, hr_dip) { + if (hr_dip->qp_cnt) + continue; + + *dip_idx = idx; + memcpy(hr_dip->dgid, grh->dgid.raw, sizeof(grh->dgid.raw)); + hr_dip->dip_idx = idx; + hr_dip->qp_cnt++; + hr_qp->dip = hr_dip; + break; } - memcpy(hr_dip->dgid, grh->dgid.raw, sizeof(grh->dgid.raw)); - hr_dip->dip_idx = *dip_idx = spare_idx[*head]; - *head = (*head == hr_dev->caps.num_qps - 1) ? 0 : (*head + 1); - list_add_tail(&hr_dip->node, &hr_dev->dip_list); + /* This should never happen. */ + if (WARN_ON_ONCE(!hr_qp->dip)) + ret = -ENOSPC; out: - spin_unlock_irqrestore(&hr_dev->dip_list_lock, flags); + xa_unlock(dip_xa); return ret; } @@ -4744,13 +4828,10 @@ enum { static int check_cong_type(struct ib_qp *ibqp, struct hns_roce_congestion_algorithm *cong_alg) { - struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); - - if (ibqp->qp_type == IB_QPT_UD) - hr_dev->caps.cong_type = CONG_TYPE_DCQCN; + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); /* different congestion types match different configurations */ - switch (hr_dev->caps.cong_type) { + switch (hr_qp->cong_type) { case CONG_TYPE_DCQCN: cong_alg->alg_sel = CONG_DCQCN; cong_alg->alg_sub_sel = UNSUPPORT_CONG_LEVEL; @@ -4776,10 +4857,7 @@ static int check_cong_type(struct ib_qp *ibqp, cong_alg->wnd_mode_sel = WND_LIMIT; break; default: - ibdev_warn(&hr_dev->ib_dev, - "invalid type(%u) for congestion selection.\n", - hr_dev->caps.cong_type); - hr_dev->caps.cong_type = CONG_TYPE_DCQCN; + hr_qp->cong_type = CONG_TYPE_DCQCN; cong_alg->alg_sel = CONG_DCQCN; cong_alg->alg_sub_sel = UNSUPPORT_CONG_LEVEL; cong_alg->dip_vld = DIP_INVALID; @@ -4798,6 +4876,7 @@ static int fill_cong_field(struct ib_qp *ibqp, const struct ib_qp_attr *attr, struct hns_roce_congestion_algorithm cong_field; struct ib_device *ibdev = ibqp->device; struct hns_roce_dev *hr_dev = to_hr_dev(ibdev); + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); u32 dip_idx = 0; int ret; @@ -4810,7 +4889,7 @@ static int fill_cong_field(struct ib_qp *ibqp, const struct ib_qp_attr *attr, return ret; hr_reg_write(context, QPC_CONG_ALGO_TMPL_ID, hr_dev->cong_algo_tmpl_id + - hr_dev->caps.cong_type * HNS_ROCE_CONG_SIZE); + hr_qp->cong_type * HNS_ROCE_CONG_SIZE); hr_reg_clear(qpc_mask, QPC_CONG_ALGO_TMPL_ID); hr_reg_write(&context->ext, QPCEX_CONG_ALG_SEL, cong_field.alg_sel); hr_reg_clear(&qpc_mask->ext, QPCEX_CONG_ALG_SEL); @@ -4839,6 +4918,69 @@ static int fill_cong_field(struct ib_qp *ibqp, const struct ib_qp_attr *attr, return 0; } +static int hns_roce_hw_v2_get_dscp(struct hns_roce_dev *hr_dev, u8 dscp, + u8 *tc_mode, u8 *priority) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hnae3_handle *handle = priv->handle; + const struct hnae3_ae_ops *ops = handle->ae_algo->ops; + + if (!ops->get_dscp_prio) + return -EOPNOTSUPP; + + return ops->get_dscp_prio(handle, dscp, tc_mode, priority); +} + +bool check_sl_valid(struct hns_roce_dev *hr_dev, u8 sl) +{ + u32 max_sl; + + max_sl = min_t(u32, MAX_SERVICE_LEVEL, hr_dev->caps.sl_num - 1); + if (unlikely(sl > max_sl)) { + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to set SL(%u). Shouldn't be larger than %u.\n", + sl, max_sl); + return false; + } + + return true; +} + +static int hns_roce_set_sl(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, + struct hns_roce_v2_qp_context *context, + struct hns_roce_v2_qp_context *qpc_mask) +{ + const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr); + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + struct ib_device *ibdev = &hr_dev->ib_dev; + int ret; + + ret = hns_roce_hw_v2_get_dscp(hr_dev, get_tclass(&attr->ah_attr.grh), + &hr_qp->tc_mode, &hr_qp->priority); + if (ret && ret != -EOPNOTSUPP && + grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { + ibdev_err_ratelimited(ibdev, + "failed to get dscp, ret = %d.\n", ret); + return ret; + } + + if (hr_qp->tc_mode == HNAE3_TC_MAP_MODE_DSCP && + grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + hr_qp->sl = hr_qp->priority; + else + hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr); + + if (!check_sl_valid(hr_dev, hr_qp->sl)) + return -EINVAL; + + hr_reg_write(context, QPC_SL, hr_qp->sl); + hr_reg_clear(qpc_mask, QPC_SL); + + return 0; +} + static int hns_roce_v2_set_path(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, @@ -4854,25 +4996,18 @@ static int hns_roce_v2_set_path(struct ib_qp *ibqp, int is_roce_protocol; u16 vlan_id = 0xffff; bool is_udp = false; - u32 max_sl; u8 ib_port; u8 hr_port; int ret; - max_sl = min_t(u32, MAX_SERVICE_LEVEL, hr_dev->caps.sl_num - 1); - if (unlikely(sl > max_sl)) { - ibdev_err_ratelimited(ibdev, - "failed to fill QPC, sl (%u) shouldn't be larger than %u.\n", - sl, max_sl); - return -EINVAL; - } - /* * If free_mr_en of qp is set, it means that this qp comes from * free mr. This qp will perform the loopback operation. * In the loopback scenario, only sl needs to be set. */ if (hr_qp->free_mr_en) { + if (!check_sl_valid(hr_dev, sl)) + return -EINVAL; hr_reg_write(context, QPC_SL, sl); hr_reg_clear(qpc_mask, QPC_SL); hr_qp->sl = sl; @@ -4942,11 +5077,7 @@ static int hns_roce_v2_set_path(struct ib_qp *ibqp, memcpy(context->dgid, grh->dgid.raw, sizeof(grh->dgid.raw)); memset(qpc_mask->dgid, 0, sizeof(grh->dgid.raw)); - hr_qp->sl = sl; - hr_reg_write(context, QPC_SL, hr_qp->sl); - hr_reg_clear(qpc_mask, QPC_SL); - - return 0; + return hns_roce_set_sl(ibqp, attr, context, qpc_mask); } static bool check_qp_state(enum ib_qp_state cur_state, @@ -4986,22 +5117,19 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp, struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); int ret = 0; - if (!check_qp_state(cur_state, new_state)) { - ibdev_err(&hr_dev->ib_dev, "Illegal state for QP!\n"); + if (!check_qp_state(cur_state, new_state)) return -EINVAL; - } if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { memset(qpc_mask, 0, hr_dev->caps.qpc_sz); - modify_qp_reset_to_init(ibqp, attr, context, qpc_mask); + modify_qp_reset_to_init(ibqp, context, qpc_mask); } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) { - modify_qp_init_to_init(ibqp, attr, context, qpc_mask); + modify_qp_init_to_init(ibqp, context, qpc_mask); } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { ret = modify_qp_init_to_rtr(ibqp, attr, attr_mask, context, qpc_mask, udata); } else if (cur_state == IB_QPS_RTR && new_state == IB_QPS_RTS) { - ret = modify_qp_rtr_to_rts(ibqp, attr, attr_mask, context, - qpc_mask); + ret = modify_qp_rtr_to_rts(ibqp, attr_mask, context, qpc_mask); } return ret; @@ -5251,7 +5379,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, /* SW pass context to HW */ ret = hns_roce_v2_qp_modify(hr_dev, context, qpc_mask, hr_qp); if (ret) { - ibdev_err(ibdev, "failed to modify QP, ret = %d.\n", ret); + ibdev_err_ratelimited(ibdev, "failed to modify QP, ret = %d.\n", ret); goto out; } @@ -5328,6 +5456,30 @@ out: return ret; } +static int hns_roce_v2_query_sccc(struct hns_roce_dev *hr_dev, u32 qpn, + void *buffer) +{ + struct hns_roce_v2_scc_context *context; + struct hns_roce_cmd_mailbox *mailbox; + int ret; + + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, HNS_ROCE_CMD_QUERY_SCCC, + qpn); + if (ret) + goto out; + + context = mailbox->buf; + memcpy(buffer, context, sizeof(*context)); + +out: + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + return ret; +} + static u8 get_qp_timeout_attr(struct hns_roce_dev *hr_dev, struct hns_roce_v2_qp_context *context) { @@ -5365,7 +5517,9 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, ret = hns_roce_v2_query_qpc(hr_dev, hr_qp->qpn, &context); if (ret) { - ibdev_err(ibdev, "failed to query QPC, ret = %d.\n", ret); + ibdev_err_ratelimited(ibdev, + "failed to query QPC, ret = %d.\n", + ret); ret = -EINVAL; goto out; } @@ -5373,7 +5527,7 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, state = hr_reg_read(&context, QPC_QP_ST); tmp_qp_state = to_ib_qp_st((enum hns_roce_v2_qp_state)state); if (tmp_qp_state == -1) { - ibdev_err(ibdev, "Illegal ib_qp_state\n"); + ibdev_err_ratelimited(ibdev, "Illegal ib_qp_state\n"); ret = -EINVAL; goto out; } @@ -5466,9 +5620,9 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, NULL, 0, hr_qp->state, IB_QPS_RESET, udata); if (ret) - ibdev_err(ibdev, - "failed to modify QP to RST, ret = %d.\n", - ret); + ibdev_err_ratelimited(ibdev, + "failed to modify QP to RST, ret = %d.\n", + ret); } send_cq = hr_qp->ibqp.send_cq ? to_hr_cq(hr_qp->ibqp.send_cq) : NULL; @@ -5496,17 +5650,44 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, return ret; } +static void put_dip_ctx_idx(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp) +{ + struct hns_roce_dip *hr_dip = hr_qp->dip; + + if (!hr_dip) + return; + + xa_lock(&hr_dev->qp_table.dip_xa); + + hr_dip->qp_cnt--; + if (!hr_dip->qp_cnt) + memset(hr_dip->dgid, 0, GID_LEN_V2); + + xa_unlock(&hr_dev->qp_table.dip_xa); +} + int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + unsigned long flags; int ret; + /* Make sure flush_cqe() is completed */ + spin_lock_irqsave(&hr_qp->flush_lock, flags); + set_bit(HNS_ROCE_STOP_FLUSH_FLAG, &hr_qp->flush_flag); + spin_unlock_irqrestore(&hr_qp->flush_lock, flags); + flush_work(&hr_qp->flush_work.work); + + if (hr_qp->cong_type == CONG_TYPE_DIP) + put_dip_ctx_idx(hr_dev, hr_qp); + ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, udata); if (ret) - ibdev_err(&hr_dev->ib_dev, - "failed to destroy QP, QPN = 0x%06lx, ret = %d.\n", - hr_qp->qpn, ret); + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to destroy QP, QPN = 0x%06lx, ret = %d.\n", + hr_qp->qpn, ret); hns_roce_qp_destroy(hr_dev, hr_qp, udata); @@ -5581,18 +5762,20 @@ static int hns_roce_v2_write_srqc_index_queue(struct hns_roce_srq *srq, struct ib_device *ibdev = srq->ibsrq.device; struct hns_roce_dev *hr_dev = to_hr_dev(ibdev); u64 mtts_idx[MTT_MIN_COUNT] = {}; - dma_addr_t dma_handle_idx = 0; + dma_addr_t dma_handle_idx; int ret; /* Get physical address of idx que buf */ ret = hns_roce_mtr_find(hr_dev, &idx_que->mtr, 0, mtts_idx, - ARRAY_SIZE(mtts_idx), &dma_handle_idx); - if (ret < 1) { + ARRAY_SIZE(mtts_idx)); + if (ret) { ibdev_err(ibdev, "failed to find mtr for SRQ idx, ret = %d.\n", ret); - return -ENOBUFS; + return ret; } + dma_handle_idx = hns_roce_get_mtr_ba(&idx_que->mtr); + hr_reg_write(ctx, SRQC_IDX_HOP_NUM, to_hr_hem_hopnum(hr_dev->caps.idx_hop_num, srq->wqe_cnt)); @@ -5624,20 +5807,22 @@ static int hns_roce_v2_write_srqc(struct hns_roce_srq *srq, void *mb_buf) struct hns_roce_dev *hr_dev = to_hr_dev(ibdev); struct hns_roce_srq_context *ctx = mb_buf; u64 mtts_wqe[MTT_MIN_COUNT] = {}; - dma_addr_t dma_handle_wqe = 0; + dma_addr_t dma_handle_wqe; int ret; memset(ctx, 0, sizeof(*ctx)); /* Get the physical address of srq buf */ ret = hns_roce_mtr_find(hr_dev, &srq->buf_mtr, 0, mtts_wqe, - ARRAY_SIZE(mtts_wqe), &dma_handle_wqe); - if (ret < 1) { + ARRAY_SIZE(mtts_wqe)); + if (ret) { ibdev_err(ibdev, "failed to find mtr for SRQ WQE, ret = %d.\n", ret); - return -ENOBUFS; + return ret; } + dma_handle_wqe = hns_roce_get_mtr_ba(&srq->buf_mtr); + hr_reg_write(ctx, SRQC_SRQ_ST, 1); hr_reg_write_bool(ctx, SRQC_SRQ_TYPE, srq->ibsrq.srq_type == IB_SRQT_XRC); @@ -5785,7 +5970,7 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) dev_info(hr_dev->dev, "cq_period(%u) reached the upper limit, adjusted to 65.\n", cq_period); - cq_period = HNS_ROCE_MAX_CQ_PERIOD; + cq_period = HNS_ROCE_MAX_CQ_PERIOD_HIP08; } cq_period *= HNS_ROCE_CLOCK_ADJUST; } @@ -5796,9 +5981,9 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) HNS_ROCE_CMD_MODIFY_CQC, hr_cq->cqn); hns_roce_free_cmd_mailbox(hr_dev, mailbox); if (ret) - ibdev_err(&hr_dev->ib_dev, - "failed to process cmd when modifying CQ, ret = %d.\n", - ret); + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to process cmd when modifying CQ, ret = %d.\n", + ret); err_out: if (ret) @@ -5822,9 +6007,9 @@ static int hns_roce_v2_query_cqc(struct hns_roce_dev *hr_dev, u32 cqn, ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, HNS_ROCE_CMD_QUERY_CQC, cqn); if (ret) { - ibdev_err(&hr_dev->ib_dev, - "failed to process cmd when querying CQ, ret = %d.\n", - ret); + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to process cmd when querying CQ, ret = %d.\n", + ret); goto err_mailbox; } @@ -5865,11 +6050,10 @@ err_mailbox: return ret; } -static void hns_roce_irq_work_handle(struct work_struct *work) +static void dump_aeqe_log(struct hns_roce_work *irq_work) { - struct hns_roce_work *irq_work = - container_of(work, struct hns_roce_work, work); - struct ib_device *ibdev = &irq_work->hr_dev->ib_dev; + struct hns_roce_dev *hr_dev = irq_work->hr_dev; + struct ib_device *ibdev = &hr_dev->ib_dev; switch (irq_work->event_type) { case HNS_ROCE_EVENT_TYPE_PATH_MIG: @@ -5913,6 +6097,8 @@ static void hns_roce_irq_work_handle(struct work_struct *work) case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: ibdev_warn(ibdev, "DB overflow.\n"); break; + case HNS_ROCE_EVENT_TYPE_MB: + break; case HNS_ROCE_EVENT_TYPE_FLR: ibdev_warn(ibdev, "function level reset.\n"); break; @@ -5923,8 +6109,46 @@ static void hns_roce_irq_work_handle(struct work_struct *work) ibdev_err(ibdev, "invalid xrceth error.\n"); break; default: + ibdev_info(ibdev, "Undefined event %d.\n", + irq_work->event_type); break; } +} + +static void hns_roce_irq_work_handle(struct work_struct *work) +{ + struct hns_roce_work *irq_work = + container_of(work, struct hns_roce_work, work); + struct hns_roce_dev *hr_dev = irq_work->hr_dev; + int event_type = irq_work->event_type; + u32 queue_num = irq_work->queue_num; + + switch (event_type) { + case HNS_ROCE_EVENT_TYPE_PATH_MIG: + case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: + case HNS_ROCE_EVENT_TYPE_COMM_EST: + case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: + case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: + case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: + case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: + case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + case HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION: + case HNS_ROCE_EVENT_TYPE_INVALID_XRCETH: + hns_roce_qp_event(hr_dev, queue_num, event_type); + break; + case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: + case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR: + hns_roce_srq_event(hr_dev, queue_num, event_type); + break; + case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: + case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: + hns_roce_cq_event(hr_dev, queue_num, event_type); + break; + default: + break; + } + + dump_aeqe_log(irq_work); kfree(irq_work); } @@ -5985,14 +6209,14 @@ static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq) static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) { - struct device *dev = hr_dev->dev; struct hns_roce_aeqe *aeqe = next_aeqe_sw_v2(eq); irqreturn_t aeqe_found = IRQ_NONE; + int num_aeqes = 0; int event_type; u32 queue_num; int sub_type; - while (aeqe) { + while (aeqe && num_aeqes < HNS_AEQ_POLLING_BUDGET) { /* Make sure we read AEQ entry after we have checked the * ownership bit */ @@ -6003,25 +6227,12 @@ static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, queue_num = hr_reg_read(aeqe, AEQE_EVENT_QUEUE_NUM); switch (event_type) { - case HNS_ROCE_EVENT_TYPE_PATH_MIG: - case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: - case HNS_ROCE_EVENT_TYPE_COMM_EST: - case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: - case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: case HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION: case HNS_ROCE_EVENT_TYPE_INVALID_XRCETH: - hns_roce_qp_event(hr_dev, queue_num, event_type); - break; - case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: - case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR: - hns_roce_srq_event(hr_dev, queue_num, event_type); - break; - case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: - case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: - hns_roce_cq_event(hr_dev, queue_num, event_type); + hns_roce_flush_cqe(hr_dev, queue_num); break; case HNS_ROCE_EVENT_TYPE_MB: hns_roce_cmd_event(hr_dev, @@ -6029,12 +6240,7 @@ static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, aeqe->event.cmd.status, le64_to_cpu(aeqe->event.cmd.out_param)); break; - case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: - case HNS_ROCE_EVENT_TYPE_FLR: - break; default: - dev_err(dev, "unhandled event %d on EQ %d at idx %u.\n", - event_type, eq->eqn, eq->cons_index); break; } @@ -6048,6 +6254,7 @@ static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, hns_roce_v2_init_irq_work(hr_dev, eq, queue_num); aeqe = next_aeqe_sw_v2(eq); + ++num_aeqes; } update_eq_db(eq); @@ -6067,33 +6274,11 @@ static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq) !!(eq->cons_index & eq->entries)) ? ceqe : NULL; } -static irqreturn_t hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq) +static irqreturn_t hns_roce_v2_ceq_int(struct hns_roce_eq *eq) { - struct hns_roce_ceqe *ceqe = next_ceqe_sw_v2(eq); - irqreturn_t ceqe_found = IRQ_NONE; - u32 cqn; - - while (ceqe) { - /* Make sure we read CEQ entry after we have checked the - * ownership bit - */ - dma_rmb(); - - cqn = hr_reg_read(ceqe, CEQE_CQN); + queue_work(system_bh_wq, &eq->work); - hns_roce_cq_completion(hr_dev, cqn); - - ++eq->cons_index; - ceqe_found = IRQ_HANDLED; - atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CEQE_CNT]); - - ceqe = next_ceqe_sw_v2(eq); - } - - update_eq_db(eq); - - return IRQ_RETVAL(ceqe_found); + return IRQ_HANDLED; } static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) @@ -6104,7 +6289,7 @@ static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) if (eq->type_flag == HNS_ROCE_CEQ) /* Completion event interrupt */ - int_work = hns_roce_v2_ceq_int(hr_dev, eq); + int_work = hns_roce_v2_ceq_int(eq); else /* Asynchronous event interrupt */ int_work = hns_roce_v2_aeq_int(hr_dev, eq); @@ -6118,6 +6303,7 @@ static irqreturn_t abnormal_interrupt_basic(struct hns_roce_dev *hr_dev, struct pci_dev *pdev = hr_dev->pci_dev; struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); const struct hnae3_ae_ops *ops = ae_dev->ops; + enum hnae3_reset_type reset_type; irqreturn_t int_work = IRQ_NONE; u32 int_en; @@ -6129,10 +6315,12 @@ static irqreturn_t abnormal_interrupt_basic(struct hns_roce_dev *hr_dev, roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, 1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S); + reset_type = hr_dev->is_vf ? + HNAE3_VF_FUNC_RESET : HNAE3_FUNC_RESET; + /* Set reset level for reset_event() */ if (ops->set_default_reset_request) - ops->set_default_reset_request(ae_dev, - HNAE3_FUNC_RESET); + ops->set_default_reset_request(ae_dev, reset_type); if (ops->reset_event) ops->reset_event(pdev, NULL); @@ -6202,7 +6390,7 @@ static u64 fmea_get_ram_res_addr(u32 res_type, __le64 *data) res_type == ECC_RESOURCE_SCCC) return le64_to_cpu(*data); - return le64_to_cpu(*data) << PAGE_SHIFT; + return le64_to_cpu(*data) << HNS_HW_PAGE_SHIFT; } static int fmea_recover_others(struct hns_roce_dev *hr_dev, u32 res_type, @@ -6316,9 +6504,16 @@ static void hns_roce_v2_int_mask_enable(struct hns_roce_dev *hr_dev, roce_write(hr_dev, ROCEE_VF_ABN_INT_CFG_REG, enable_flag); } -static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev, u32 eqn) +static void free_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) +{ + hns_roce_mtr_destroy(hr_dev, &eq->mtr); +} + +static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) { struct device *dev = hr_dev->dev; + int eqn = eq->eqn; int ret; u8 cmd; @@ -6329,12 +6524,9 @@ static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev, u32 eqn) ret = hns_roce_destroy_hw_ctx(hr_dev, cmd, eqn & HNS_ROCE_V2_EQN_M); if (ret) - dev_err(dev, "[mailbox cmd] destroy eqc(%u) failed.\n", eqn); -} + dev_err(dev, "[mailbox cmd] destroy eqc(%d) failed.\n", eqn); -static void free_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) -{ - hns_roce_mtr_destroy(hr_dev, &eq->mtr); + free_eq_buf(hr_dev, eq); } static void init_eq_config(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) @@ -6353,7 +6545,7 @@ static int config_eqc(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq, u64 eqe_ba[MTT_MIN_COUNT] = { 0 }; struct hns_roce_eq_context *eqc; u64 bt_ba = 0; - int count; + int ret; eqc = mb_buf; memset(eqc, 0, sizeof(struct hns_roce_eq_context)); @@ -6361,13 +6553,15 @@ static int config_eqc(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq, init_eq_config(hr_dev, eq); /* if not multi-hop, eqe buffer only use one trunk */ - count = hns_roce_mtr_find(hr_dev, &eq->mtr, 0, eqe_ba, MTT_MIN_COUNT, - &bt_ba); - if (count < 1) { - dev_err(hr_dev->dev, "failed to find EQE mtr\n"); - return -ENOBUFS; + ret = hns_roce_mtr_find(hr_dev, &eq->mtr, 0, eqe_ba, + ARRAY_SIZE(eqe_ba)); + if (ret) { + dev_err(hr_dev->dev, "failed to find EQE mtr, ret = %d\n", ret); + return ret; } + bt_ba = hns_roce_get_mtr_ba(&eq->mtr); + hr_reg_write(eqc, EQC_EQ_ST, HNS_ROCE_V2_EQ_STATE_VALID); hr_reg_write(eqc, EQC_EQE_HOP_NUM, eq->hop_num); hr_reg_write(eqc, EQC_OVER_IGNORE, eq->over_ignore); @@ -6470,6 +6664,34 @@ free_cmd_mbox: return ret; } +static void hns_roce_ceq_work(struct work_struct *work) +{ + struct hns_roce_eq *eq = from_work(eq, work, work); + struct hns_roce_ceqe *ceqe = next_ceqe_sw_v2(eq); + struct hns_roce_dev *hr_dev = eq->hr_dev; + int ceqe_num = 0; + u32 cqn; + + while (ceqe && ceqe_num < hr_dev->caps.ceqe_depth) { + /* Make sure we read CEQ entry after we have checked the + * ownership bit + */ + dma_rmb(); + + cqn = hr_reg_read(ceqe, CEQE_CQN); + + hns_roce_cq_completion(hr_dev, cqn); + + ++eq->cons_index; + ++ceqe_num; + atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CEQE_CNT]); + + ceqe = next_ceqe_sw_v2(eq); + } + + update_eq_db(eq); +} + static int __hns_roce_request_irq(struct hns_roce_dev *hr_dev, int irq_num, int comp_num, int aeq_num, int other_num) { @@ -6501,21 +6723,24 @@ static int __hns_roce_request_irq(struct hns_roce_dev *hr_dev, int irq_num, j - other_num - aeq_num); for (j = 0; j < irq_num; j++) { - if (j < other_num) + if (j < other_num) { ret = request_irq(hr_dev->irq[j], hns_roce_v2_msix_interrupt_abn, 0, hr_dev->irq_names[j], hr_dev); - - else if (j < (other_num + comp_num)) + } else if (j < (other_num + comp_num)) { + INIT_WORK(&eq_table->eq[j - other_num].work, + hns_roce_ceq_work); ret = request_irq(eq_table->eq[j - other_num].irq, hns_roce_v2_msix_interrupt_eq, 0, hr_dev->irq_names[j + aeq_num], &eq_table->eq[j - other_num]); - else + } else { ret = request_irq(eq_table->eq[j - other_num].irq, hns_roce_v2_msix_interrupt_eq, 0, hr_dev->irq_names[j - comp_num], &eq_table->eq[j - other_num]); + } + if (ret) { dev_err(hr_dev->dev, "request irq error!\n"); goto err_request_failed; @@ -6525,12 +6750,16 @@ static int __hns_roce_request_irq(struct hns_roce_dev *hr_dev, int irq_num, return 0; err_request_failed: - for (j -= 1; j >= 0; j--) - if (j < other_num) + for (j -= 1; j >= 0; j--) { + if (j < other_num) { free_irq(hr_dev->irq[j], hr_dev); - else - free_irq(eq_table->eq[j - other_num].irq, - &eq_table->eq[j - other_num]); + continue; + } + free_irq(eq_table->eq[j - other_num].irq, + &eq_table->eq[j - other_num]); + if (j < other_num + comp_num) + cancel_work_sync(&eq_table->eq[j - other_num].work); + } err_kzalloc_failed: for (i -= 1; i >= 0; i--) @@ -6551,8 +6780,11 @@ static void __hns_roce_free_irq(struct hns_roce_dev *hr_dev) for (i = 0; i < hr_dev->caps.num_other_vectors; i++) free_irq(hr_dev->irq[i], hr_dev); - for (i = 0; i < eq_num; i++) + for (i = 0; i < eq_num; i++) { free_irq(hr_dev->eq_table.eq[i].irq, &hr_dev->eq_table.eq[i]); + if (i < hr_dev->caps.num_comp_vectors) + cancel_work_sync(&hr_dev->eq_table.eq[i].work); + } for (i = 0; i < irq_num; i++) kfree(hr_dev->irq_names[i]); @@ -6572,6 +6804,9 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev) int ret; int i; + if (hr_dev->caps.aeqe_depth < HNS_AEQ_POLLING_BUDGET) + return -EINVAL; + other_num = hr_dev->caps.num_other_vectors; comp_num = hr_dev->caps.num_comp_vectors; aeq_num = hr_dev->caps.num_aeq_vectors; @@ -6641,7 +6876,7 @@ err_request_irq_fail: err_create_eq_fail: for (i -= 1; i >= 0; i--) - free_eq_buf(hr_dev, &eq_table->eq[i]); + hns_roce_v2_destroy_eqc(hr_dev, &eq_table->eq[i]); kfree(eq_table->eq); return ret; @@ -6661,11 +6896,8 @@ static void hns_roce_v2_cleanup_eq_table(struct hns_roce_dev *hr_dev) __hns_roce_free_irq(hr_dev); destroy_workqueue(hr_dev->irq_workq); - for (i = 0; i < eq_num; i++) { - hns_roce_v2_destroy_eqc(hr_dev, i); - - free_eq_buf(hr_dev, &eq_table->eq[i]); - } + for (i = 0; i < eq_num; i++) + hns_roce_v2_destroy_eqc(hr_dev, &eq_table->eq[i]); kfree(eq_table->eq); } @@ -6714,7 +6946,9 @@ static const struct hns_roce_hw hns_roce_hw_v2 = { .query_qpc = hns_roce_v2_query_qpc, .query_mpt = hns_roce_v2_query_mpt, .query_srqc = hns_roce_v2_query_srqc, + .query_sccc = hns_roce_v2_query_sccc, .query_hw_counter = hns_roce_hw_v2_query_counter, + .get_dscp = hns_roce_hw_v2_get_dscp, .hns_roce_dev_ops = &hns_roce_v2_dev_ops, .hns_roce_dev_srq_ops = &hns_roce_v2_dev_srq_ops, }; @@ -6831,9 +7065,6 @@ static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, hr_dev->state = HNS_ROCE_DEVICE_STATE_UNINIT; hns_roce_handle_device_err(hr_dev); - if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) - free_mr_exit(hr_dev); - hns_roce_exit(hr_dev); kfree(hr_dev->priv); ib_dealloc_device(&hr_dev->ib_dev); @@ -6894,6 +7125,7 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT; } + static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle) { struct hns_roce_dev *hr_dev; @@ -6912,6 +7144,9 @@ static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle) hr_dev->active = false; hr_dev->dis_db = true; + + rdma_user_mmap_disassociate(&hr_dev->ib_dev); + hr_dev->state = HNS_ROCE_DEVICE_STATE_RST_DOWN; return 0; @@ -6982,9 +7217,22 @@ static int hns_roce_hw_v2_reset_notify(struct hnae3_handle *handle, return ret; } +static void hns_roce_hw_v2_link_status_change(struct hnae3_handle *handle, + bool linkup) +{ + struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; + struct net_device *netdev = handle->rinfo.netdev; + + if (linkup || !hr_dev) + return; + + ib_dispatch_port_state_event(&hr_dev->ib_dev, netdev); +} + static const struct hnae3_client_ops hns_roce_hw_v2_ops = { .init_instance = hns_roce_hw_v2_init_instance, .uninit_instance = hns_roce_hw_v2_uninit_instance, + .link_status_change = hns_roce_hw_v2_link_status_change, .reset_notify = hns_roce_hw_v2_reset_notify, }; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index cd97cbee682a..91a5665465ff 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -85,6 +85,11 @@ #define HNS_ROCE_V2_TABLE_CHUNK_SIZE (1 << 18) +/* budget must be smaller than aeqe_depth to guarantee that we update + * the ci before we polled all the entries in the EQ. + */ +#define HNS_AEQ_POLLING_BUDGET 64 + enum { HNS_ROCE_CMD_FLAG_IN = BIT(0), HNS_ROCE_CMD_FLAG_OUT = BIT(1), @@ -224,6 +229,14 @@ enum hns_roce_opcode_type { HNS_SWITCH_PARAMETER_CFG = 0x1033, }; +#define HNS_ROCE_OPC_POST_MB_TIMEOUT 35000 +#define HNS_ROCE_OPC_POST_MB_TRY_CNT 8 +#define HNS_ROCE_OPC_POST_MB_RETRY_GAP_MSEC 5 +struct hns_roce_cmdq_tx_timeout_map { + u16 opcode; + u32 tx_timeout; +}; + enum { TYPE_CRQ, TYPE_CSQ, @@ -276,6 +289,10 @@ struct hns_roce_v2_cq_context { __le32 byte_64_se_cqe_idx; }; +#define CQC_CQE_BA_L_S 3 +#define CQC_CQE_BA_H_S (32 + CQC_CQE_BA_L_S) +#define CQC_CQE_DB_RECORD_ADDR_H_S 32 + #define HNS_ROCE_V2_CQ_DEFAULT_BURST_NUM 0x0 #define HNS_ROCE_V2_CQ_DEFAULT_INTERVAL 0x0 @@ -447,6 +464,12 @@ struct hns_roce_v2_qp_context { struct hns_roce_v2_qp_context_ex ext; }; +#define QPC_TRRL_BA_L_S 4 +#define QPC_TRRL_BA_M_S (16 + QPC_TRRL_BA_L_S) +#define QPC_TRRL_BA_H_S (32 + QPC_TRRL_BA_M_S) +#define QPC_IRRL_BA_L_S 6 +#define QPC_IRRL_BA_H_S (32 + QPC_IRRL_BA_L_S) + #define QPC_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_v2_qp_context, h, l) #define QPC_TST QPC_FIELD_LOC(2, 0) @@ -646,6 +669,12 @@ struct hns_roce_v2_qp_context { #define QPCEX_SQ_RQ_NOT_FORBID_EN QPCEX_FIELD_LOC(23, 23) #define QPCEX_STASH QPCEX_FIELD_LOC(82, 82) +#define SCC_CONTEXT_SIZE 16 + +struct hns_roce_v2_scc_context { + __le32 data[SCC_CONTEXT_SIZE]; +}; + #define V2_QP_RWE_S 1 /* rdma write enable */ #define V2_QP_RRE_S 2 /* rdma read enable */ #define V2_QP_ATE_S 3 /* rdma atomic enable */ @@ -710,6 +739,9 @@ struct hns_roce_v2_mpt_entry { __le32 byte_64_buf_pa1; }; +#define MPT_PBL_BUF_ADDR_S 6 +#define MPT_PBL_BA_ADDR_S 3 + #define MPT_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_v2_mpt_entry, h, l) #define MPT_ST MPT_FIELD_LOC(1, 0) @@ -894,6 +926,7 @@ struct hns_roce_v2_rc_send_wqe { #define RC_SEND_WQE_OWNER RC_SEND_WQE_FIELD_LOC(7, 7) #define RC_SEND_WQE_CQE RC_SEND_WQE_FIELD_LOC(8, 8) #define RC_SEND_WQE_FENCE RC_SEND_WQE_FIELD_LOC(9, 9) +#define RC_SEND_WQE_SO RC_SEND_WQE_FIELD_LOC(10, 10) #define RC_SEND_WQE_SE RC_SEND_WQE_FIELD_LOC(11, 11) #define RC_SEND_WQE_INLINE RC_SEND_WQE_FIELD_LOC(12, 12) #define RC_SEND_WQE_WQE_INDEX RC_SEND_WQE_FIELD_LOC(30, 15) @@ -1214,12 +1247,13 @@ struct hns_roce_query_pf_caps_d { #define PF_CAPS_D_RQWQE_HOP_NUM PF_CAPS_D_FIELD_LOC(21, 20) #define PF_CAPS_D_EX_SGE_HOP_NUM PF_CAPS_D_FIELD_LOC(23, 22) #define PF_CAPS_D_SQWQE_HOP_NUM PF_CAPS_D_FIELD_LOC(25, 24) -#define PF_CAPS_D_CONG_TYPE PF_CAPS_D_FIELD_LOC(29, 26) +#define PF_CAPS_D_CONG_CAP PF_CAPS_D_FIELD_LOC(29, 26) #define PF_CAPS_D_CEQ_DEPTH PF_CAPS_D_FIELD_LOC(85, 64) #define PF_CAPS_D_NUM_CEQS PF_CAPS_D_FIELD_LOC(95, 86) #define PF_CAPS_D_AEQ_DEPTH PF_CAPS_D_FIELD_LOC(117, 96) #define PF_CAPS_D_AEQ_ARM_ST PF_CAPS_D_FIELD_LOC(119, 118) #define PF_CAPS_D_CEQ_ARM_ST PF_CAPS_D_FIELD_LOC(121, 120) +#define PF_CAPS_D_DEFAULT_ALG PF_CAPS_D_FIELD_LOC(127, 122) #define PF_CAPS_D_RSV_PDS PF_CAPS_D_FIELD_LOC(147, 128) #define PF_CAPS_D_NUM_UARS PF_CAPS_D_FIELD_LOC(155, 148) #define PF_CAPS_D_RSV_QPS PF_CAPS_D_FIELD_LOC(179, 160) @@ -1316,7 +1350,7 @@ struct hns_roce_v2_priv { struct hns_roce_dip { u8 dgid[GID_LEN_V2]; u32 dip_idx; - struct list_head node; /* all dips are on a list */ + u32 qp_cnt; }; struct fmea_ram_ecc { @@ -1327,7 +1361,7 @@ struct fmea_ram_ecc { /* only for RNR timeout issue of HIP08 */ #define HNS_ROCE_CLOCK_ADJUST 1000 -#define HNS_ROCE_MAX_CQ_PERIOD 65 +#define HNS_ROCE_MAX_CQ_PERIOD_HIP08 65 #define HNS_ROCE_MAX_EQ_PERIOD 65 #define HNS_ROCE_RNR_TIMER_10NS 1 #define HNS_ROCE_1US_CFG 999 diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index b55fe6911f9f..cf89a8db4f64 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -37,9 +37,11 @@ #include <rdma/ib_smi.h> #include <rdma/ib_user_verbs.h> #include <rdma/ib_cache.h> +#include "hnae3.h" #include "hns_roce_common.h" #include "hns_roce_device.h" #include "hns_roce_hem.h" +#include "hns_roce_hw_v2.h" static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u32 port, const u8 *addr) @@ -181,7 +183,7 @@ static int hns_roce_query_device(struct ib_device *ib_dev, IB_DEVICE_RC_RNR_NAK_GEN; props->max_send_sge = hr_dev->caps.max_sq_sg; props->max_recv_sge = hr_dev->caps.max_rq_sg; - props->max_sge_rd = 1; + props->max_sge_rd = hr_dev->caps.max_sq_sg; props->max_cq = hr_dev->caps.num_cqs; props->max_cqe = hr_dev->caps.max_cqes; props->max_mr = hr_dev->caps.num_mtpts; @@ -192,6 +194,12 @@ static int hns_roce_query_device(struct ib_device *ib_dev, IB_ATOMIC_HCA : IB_ATOMIC_NONE; props->max_pkeys = 1; props->local_ca_ack_delay = hr_dev->caps.local_ca_ack_delay; + props->max_ah = INT_MAX; + props->cq_caps.max_cq_moderation_period = HNS_ROCE_MAX_CQ_PERIOD; + props->cq_caps.max_cq_moderation_count = HNS_ROCE_MAX_CQ_COUNT; + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) + props->cq_caps.max_cq_moderation_period = HNS_ROCE_MAX_CQ_PERIOD_HIP08; + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) { props->max_srq = hr_dev->caps.num_srqs; props->max_srq_wr = hr_dev->caps.max_srq_wrs; @@ -394,6 +402,9 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, resp.config |= HNS_ROCE_RSP_CQE_INLINE_FLAGS; } + if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) + resp.congest_type = hr_dev->caps.cong_cap; + ret = hns_roce_uar_alloc(hr_dev, &context->uar); if (ret) goto error_out; @@ -418,6 +429,9 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, return 0; error_fail_copy_to_udata: + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB || + hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) + mutex_destroy(&context->page_mutex); hns_roce_dealloc_uar_entry(context); error_fail_uar_entry: @@ -434,6 +448,10 @@ static void hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext) struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext); struct hns_roce_dev *hr_dev = to_hr_dev(ibcontext->device); + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB || + hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) + mutex_destroy(&context->page_mutex); + hns_roce_dealloc_uar_entry(context); ida_free(&hr_dev->uar_ida.ida, (int)context->uar.logic_idx); @@ -448,6 +466,11 @@ static int hns_roce_mmap(struct ib_ucontext *uctx, struct vm_area_struct *vma) pgprot_t prot; int ret; + if (hr_dev->dis_db) { + atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_MMAP_ERR_CNT]); + return -EPERM; + } + rdma_entry = rdma_user_mmap_entry_get_pgoff(uctx, vma->vm_pgoff); if (!rdma_entry) { atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_MMAP_ERR_CNT]); @@ -922,6 +945,15 @@ err_unmap_dmpt: return ret; } +static void hns_roce_teardown_hca(struct hns_roce_dev *hr_dev) +{ + hns_roce_cleanup_bitmap(hr_dev); + + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB || + hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) + mutex_destroy(&hr_dev->pgdir_mutex); +} + /** * hns_roce_setup_hca - setup host channel adapter * @hr_dev: pointer to hns roce device @@ -970,6 +1002,10 @@ static int hns_roce_setup_hca(struct hns_roce_dev *hr_dev) err_uar_table_free: ida_destroy(&hr_dev->uar_ida.ida); + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB || + hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) + mutex_destroy(&hr_dev->pgdir_mutex); + return ret; } @@ -1099,8 +1135,6 @@ int hns_roce_init(struct hns_roce_dev *hr_dev) INIT_LIST_HEAD(&hr_dev->qp_list); spin_lock_init(&hr_dev->qp_list_lock); - INIT_LIST_HEAD(&hr_dev->dip_list); - spin_lock_init(&hr_dev->dip_list_lock); ret = hns_roce_register_device(hr_dev); if (ret) @@ -1115,7 +1149,7 @@ error_failed_register_device: hr_dev->hw->hw_exit(hr_dev); error_failed_engine_init: - hns_roce_cleanup_bitmap(hr_dev); + hns_roce_teardown_hca(hr_dev); error_failed_setup_hca: hns_roce_cleanup_hem(hr_dev); @@ -1145,7 +1179,7 @@ void hns_roce_exit(struct hns_roce_dev *hr_dev) if (hr_dev->hw->hw_exit) hr_dev->hw->hw_exit(hr_dev); - hns_roce_cleanup_bitmap(hr_dev); + hns_roce_teardown_hca(hr_dev); hns_roce_cleanup_hem(hr_dev); if (hr_dev->cmd_mod) diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 91cd580480fe..55b9283bfc6f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -32,6 +32,7 @@ */ #include <linux/vmalloc.h> +#include <linux/count_zeros.h> #include <rdma/ib_umem.h> #include <linux/math.h> #include "hns_roce_device.h" @@ -103,14 +104,21 @@ static int alloc_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr, buf_attr.user_access = mr->access; /* fast MR's buffer is alloced before mapping, not at creation */ buf_attr.mtt_only = is_fast; + buf_attr.iova = mr->iova; + /* pagesize and hopnum is fixed for fast MR */ + buf_attr.adaptive = !is_fast; + buf_attr.type = MTR_PBL; err = hns_roce_mtr_create(hr_dev, &mr->pbl_mtr, &buf_attr, hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT, udata, start); - if (err) + if (err) { ibdev_err(ibdev, "failed to alloc pbl mtr, ret = %d.\n", err); - else - mr->npages = mr->pbl_mtr.hem_cfg.buf_pg_count; + return err; + } + + mr->npages = mr->pbl_mtr.hem_cfg.buf_pg_count; + mr->pbl_hop_num = buf_attr.region[0].hopnum; return err; } @@ -130,8 +138,8 @@ static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr key_to_hw_index(mr->key) & (hr_dev->caps.num_mtpts - 1)); if (ret) - ibdev_warn(ibdev, "failed to destroy mpt, ret = %d.\n", - ret); + ibdev_warn_ratelimited(ibdev, "failed to destroy mpt, ret = %d.\n", + ret); } free_mr_pbl(hr_dev, mr); @@ -154,7 +162,7 @@ static int hns_roce_mr_enable(struct hns_roce_dev *hr_dev, if (mr->type != MR_TYPE_FRMR) ret = hr_dev->hw->write_mtpt(hr_dev, mailbox->buf, mr); else - ret = hr_dev->hw->frmr_write_mtpt(hr_dev, mailbox->buf, mr); + ret = hr_dev->hw->frmr_write_mtpt(mailbox->buf, mr); if (ret) { dev_err(dev, "failed to write mtpt, ret = %d.\n", ret); goto err_page; @@ -427,24 +435,30 @@ static int hns_roce_set_page(struct ib_mr *ibmr, u64 addr) } int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, - unsigned int *sg_offset) + unsigned int *sg_offset_p) { + unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device); struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_mr *mr = to_hr_mr(ibmr); struct hns_roce_mtr *mtr = &mr->pbl_mtr; - int ret = 0; + int ret, sg_num = 0; + + if (!IS_ALIGNED(sg_offset, HNS_ROCE_FRMR_ALIGN_SIZE) || + ibmr->page_size < HNS_HW_PAGE_SIZE || + ibmr->page_size > HNS_HW_MAX_PAGE_SIZE) + return sg_num; mr->npages = 0; mr->page_list = kvcalloc(mr->pbl_mtr.hem_cfg.buf_pg_count, sizeof(dma_addr_t), GFP_KERNEL); if (!mr->page_list) - return ret; + return sg_num; - ret = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, hns_roce_set_page); - if (ret < 1) { + sg_num = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset_p, hns_roce_set_page); + if (sg_num < 1) { ibdev_err(ibdev, "failed to store sg pages %u %u, cnt = %d.\n", - mr->npages, mr->pbl_mtr.hem_cfg.buf_pg_count, ret); + mr->npages, mr->pbl_mtr.hem_cfg.buf_pg_count, sg_num); goto err_page_list; } @@ -455,17 +469,16 @@ int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, ret = hns_roce_mtr_map(hr_dev, mtr, mr->page_list, mr->npages); if (ret) { ibdev_err(ibdev, "failed to map sg mtr, ret = %d.\n", ret); - ret = 0; + sg_num = 0; } else { mr->pbl_mtr.hem_cfg.buf_pg_shift = (u32)ilog2(ibmr->page_size); - ret = mr->npages; } err_page_list: kvfree(mr->page_list); mr->page_list = NULL; - return ret; + return sg_num; } static void hns_roce_mw_free(struct hns_roce_dev *hr_dev, @@ -695,7 +708,7 @@ static int mtr_alloc_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, mtr->umem = NULL; mtr->kmem = hns_roce_buf_alloc(hr_dev, total_size, buf_attr->page_shift, - mtr->hem_cfg.is_direct ? + !mtr_has_mtt(buf_attr) ? HNS_ROCE_BUF_DIRECT : 0); if (IS_ERR(mtr->kmem)) { ibdev_err(ibdev, "failed to alloc kmem, ret = %ld.\n", @@ -707,21 +720,48 @@ static int mtr_alloc_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, return 0; } -static int mtr_map_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, - int page_count, unsigned int page_shift) +static int cal_mtr_pg_cnt(struct hns_roce_mtr *mtr) +{ + struct hns_roce_buf_region *region; + int page_cnt = 0; + int i; + + for (i = 0; i < mtr->hem_cfg.region_count; i++) { + region = &mtr->hem_cfg.region[i]; + page_cnt += region->count; + } + + return page_cnt; +} + +static bool need_split_huge_page(struct hns_roce_mtr *mtr) +{ + /* When HEM buffer uses 0-level addressing, the page size is + * equal to the whole buffer size. If the current MTR has multiple + * regions, we split the buffer into small pages(4k, required by hns + * ROCEE). These pages will be used in multiple regions. + */ + return mtr->hem_cfg.is_direct && mtr->hem_cfg.region_count > 1; +} + +static int mtr_map_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr) { struct ib_device *ibdev = &hr_dev->ib_dev; + int page_count = cal_mtr_pg_cnt(mtr); + unsigned int page_shift; dma_addr_t *pages; int npage; int ret; + page_shift = need_split_huge_page(mtr) ? HNS_HW_PAGE_SHIFT : + mtr->hem_cfg.buf_pg_shift; /* alloc a tmp array to store buffer's dma address */ pages = kvcalloc(page_count, sizeof(dma_addr_t), GFP_KERNEL); if (!pages) return -ENOMEM; if (mtr->umem) - npage = hns_roce_get_umem_bufs(hr_dev, pages, page_count, + npage = hns_roce_get_umem_bufs(pages, page_count, mtr->umem, page_shift); else npage = hns_roce_get_kmem_bufs(hr_dev, pages, page_count, @@ -734,7 +774,7 @@ static int mtr_map_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, goto err_alloc_list; } - if (mtr->hem_cfg.is_direct && npage > 1) { + if (need_split_huge_page(mtr) && npage > 1) { ret = mtr_check_direct_pages(pages, npage, page_shift); if (ret) { ibdev_err(ibdev, "failed to check %s page: %d / %d.\n", @@ -774,11 +814,6 @@ int hns_roce_mtr_map(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, for (i = 0, mapped_cnt = 0; i < mtr->hem_cfg.region_count && mapped_cnt < page_cnt; i++) { r = &mtr->hem_cfg.region[i]; - /* if hopnum is 0, no need to map pages in this region */ - if (!r->hopnum) { - mapped_cnt += r->count; - continue; - } if (r->offset + r->count > page_cnt) { ret = -EINVAL; @@ -809,47 +844,53 @@ int hns_roce_mtr_map(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, return ret; } -int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, - u32 offset, u64 *mtt_buf, int mtt_max, u64 *base_addr) +static int hns_roce_get_direct_addr_mtt(struct hns_roce_hem_cfg *cfg, + u32 start_index, u64 *mtt_buf, + int mtt_cnt) { - struct hns_roce_hem_cfg *cfg = &mtr->hem_cfg; - int mtt_count, left; - u32 start_index; + int mtt_count; int total = 0; - __le64 *mtts; u32 npage; u64 addr; - if (!mtt_buf || mtt_max < 1) - goto done; - - /* no mtt memory in direct mode, so just return the buffer address */ - if (cfg->is_direct) { - start_index = offset >> HNS_HW_PAGE_SHIFT; - for (mtt_count = 0; mtt_count < cfg->region_count && - total < mtt_max; mtt_count++) { - npage = cfg->region[mtt_count].offset; - if (npage < start_index) - continue; + if (mtt_cnt > cfg->region_count) + return -EINVAL; - addr = cfg->root_ba + (npage << HNS_HW_PAGE_SHIFT); - mtt_buf[total] = addr; + for (mtt_count = 0; mtt_count < cfg->region_count && total < mtt_cnt; + mtt_count++) { + npage = cfg->region[mtt_count].offset; + if (npage < start_index) + continue; - total++; - } + addr = cfg->root_ba + (npage << HNS_HW_PAGE_SHIFT); + mtt_buf[total] = addr; - goto done; + total++; } - start_index = offset >> cfg->buf_pg_shift; - left = mtt_max; + if (!total) + return -ENOENT; + + return 0; +} + +static int hns_roce_get_mhop_mtt(struct hns_roce_dev *hr_dev, + struct hns_roce_mtr *mtr, u32 start_index, + u64 *mtt_buf, int mtt_cnt) +{ + int left = mtt_cnt; + int total = 0; + int mtt_count; + __le64 *mtts; + u32 npage; + while (left > 0) { mtt_count = 0; mtts = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list, start_index + total, &mtt_count); if (!mtts || !mtt_count) - goto done; + break; npage = min(mtt_count, left); left -= npage; @@ -857,69 +898,165 @@ int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, mtt_buf[total++] = le64_to_cpu(mtts[mtt_count]); } -done: - if (base_addr) - *base_addr = cfg->root_ba; + if (!total) + return -ENOENT; - return total; + return 0; +} + +int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, + u32 offset, u64 *mtt_buf, int mtt_max) +{ + struct hns_roce_hem_cfg *cfg = &mtr->hem_cfg; + u32 start_index; + int ret; + + if (!mtt_buf || mtt_max < 1) + return -EINVAL; + + /* no mtt memory in direct mode, so just return the buffer address */ + if (cfg->is_direct) { + start_index = offset >> HNS_HW_PAGE_SHIFT; + ret = hns_roce_get_direct_addr_mtt(cfg, start_index, + mtt_buf, mtt_max); + } else { + start_index = offset >> cfg->buf_pg_shift; + ret = hns_roce_get_mhop_mtt(hr_dev, mtr, start_index, + mtt_buf, mtt_max); + } + return ret; +} + +static int get_best_page_shift(struct hns_roce_dev *hr_dev, + struct hns_roce_mtr *mtr, + struct hns_roce_buf_attr *buf_attr) +{ + unsigned int page_sz; + + if (!buf_attr->adaptive || buf_attr->type != MTR_PBL || !mtr->umem) + return 0; + + page_sz = ib_umem_find_best_pgsz(mtr->umem, + hr_dev->caps.page_size_cap, + buf_attr->iova); + if (!page_sz) + return -EINVAL; + + buf_attr->page_shift = order_base_2(page_sz); + return 0; +} + +static int get_best_hop_num(struct hns_roce_dev *hr_dev, + struct hns_roce_mtr *mtr, + struct hns_roce_buf_attr *buf_attr, + unsigned int ba_pg_shift) +{ +#define INVALID_HOPNUM -1 +#define MIN_BA_CNT 1 + size_t buf_pg_sz = 1 << buf_attr->page_shift; + struct ib_device *ibdev = &hr_dev->ib_dev; + size_t ba_pg_sz = 1 << ba_pg_shift; + int hop_num = INVALID_HOPNUM; + size_t unit = MIN_BA_CNT; + size_t ba_cnt; + int j; + + if (!buf_attr->adaptive || buf_attr->type != MTR_PBL) + return 0; + + /* Caculating the number of buf pages, each buf page need a BA */ + if (mtr->umem) + ba_cnt = ib_umem_num_dma_blocks(mtr->umem, buf_pg_sz); + else + ba_cnt = DIV_ROUND_UP(buf_attr->region[0].size, buf_pg_sz); + + for (j = 0; j <= HNS_ROCE_MAX_HOP_NUM; j++) { + if (ba_cnt <= unit) { + hop_num = j; + break; + } + /* Number of BAs can be represented at per hop */ + unit *= ba_pg_sz / BA_BYTE_LEN; + } + + if (hop_num < 0) { + ibdev_err(ibdev, + "failed to calculate a valid hopnum.\n"); + return -EINVAL; + } + + buf_attr->region[0].hopnum = hop_num; + + return 0; +} + +static bool is_buf_attr_valid(struct hns_roce_dev *hr_dev, + struct hns_roce_buf_attr *attr) +{ + struct ib_device *ibdev = &hr_dev->ib_dev; + + if (attr->region_count > ARRAY_SIZE(attr->region) || + attr->region_count < 1 || attr->page_shift < HNS_HW_PAGE_SHIFT) { + ibdev_err(ibdev, + "invalid buf attr, region count %d, page shift %u.\n", + attr->region_count, attr->page_shift); + return false; + } + + return true; } static int mtr_init_buf_cfg(struct hns_roce_dev *hr_dev, - struct hns_roce_buf_attr *attr, - struct hns_roce_hem_cfg *cfg, - unsigned int *buf_page_shift, u64 unalinged_size) + struct hns_roce_mtr *mtr, + struct hns_roce_buf_attr *attr) { + struct hns_roce_hem_cfg *cfg = &mtr->hem_cfg; struct hns_roce_buf_region *r; - u64 first_region_padding; - int page_cnt, region_cnt; - unsigned int page_shift; + size_t buf_pg_sz; size_t buf_size; + int page_cnt, i; + u64 pgoff = 0; + + if (!is_buf_attr_valid(hr_dev, attr)) + return -EINVAL; /* If mtt is disabled, all pages must be within a continuous range */ cfg->is_direct = !mtr_has_mtt(attr); + cfg->region_count = attr->region_count; buf_size = mtr_bufs_size(attr); - if (cfg->is_direct) { - /* When HEM buffer uses 0-level addressing, the page size is - * equal to the whole buffer size, and we split the buffer into - * small pages which is used to check whether the adjacent - * units are in the continuous space and its size is fixed to - * 4K based on hns ROCEE's requirement. - */ - page_shift = HNS_HW_PAGE_SHIFT; - - /* The ROCEE requires the page size to be 4K * 2 ^ N. */ + if (need_split_huge_page(mtr)) { + buf_pg_sz = HNS_HW_PAGE_SIZE; cfg->buf_pg_count = 1; + /* The ROCEE requires the page size to be 4K * 2 ^ N. */ cfg->buf_pg_shift = HNS_HW_PAGE_SHIFT + order_base_2(DIV_ROUND_UP(buf_size, HNS_HW_PAGE_SIZE)); - first_region_padding = 0; } else { - page_shift = attr->page_shift; - cfg->buf_pg_count = DIV_ROUND_UP(buf_size + unalinged_size, - 1 << page_shift); - cfg->buf_pg_shift = page_shift; - first_region_padding = unalinged_size; + buf_pg_sz = 1 << attr->page_shift; + cfg->buf_pg_count = mtr->umem ? + ib_umem_num_dma_blocks(mtr->umem, buf_pg_sz) : + DIV_ROUND_UP(buf_size, buf_pg_sz); + cfg->buf_pg_shift = attr->page_shift; + pgoff = mtr->umem ? mtr->umem->address & ~PAGE_MASK : 0; } /* Convert buffer size to page index and page count for each region and * the buffer's offset needs to be appended to the first region. */ - for (page_cnt = 0, region_cnt = 0; region_cnt < attr->region_count && - region_cnt < ARRAY_SIZE(cfg->region); region_cnt++) { - r = &cfg->region[region_cnt]; + for (page_cnt = 0, i = 0; i < attr->region_count; i++) { + r = &cfg->region[i]; r->offset = page_cnt; - buf_size = hr_hw_page_align(attr->region[region_cnt].size + - first_region_padding); - r->count = DIV_ROUND_UP(buf_size, 1 << page_shift); - first_region_padding = 0; + buf_size = hr_hw_page_align(attr->region[i].size + pgoff); + if (attr->type == MTR_PBL && mtr->umem) + r->count = ib_umem_num_dma_blocks(mtr->umem, buf_pg_sz); + else + r->count = DIV_ROUND_UP(buf_size, buf_pg_sz); + + pgoff = 0; page_cnt += r->count; - r->hopnum = to_hr_hem_hopnum(attr->region[region_cnt].hopnum, - r->count); + r->hopnum = to_hr_hem_hopnum(attr->region[i].hopnum, r->count); } - cfg->region_count = region_cnt; - *buf_page_shift = page_shift; - - return page_cnt; + return 0; } static u64 cal_pages_per_l1ba(unsigned int ba_per_bt, unsigned int hopnum) @@ -1007,50 +1144,58 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, unsigned long user_addr) { struct ib_device *ibdev = &hr_dev->ib_dev; - unsigned int buf_page_shift = 0; - int buf_page_cnt; int ret; - buf_page_cnt = mtr_init_buf_cfg(hr_dev, buf_attr, &mtr->hem_cfg, - &buf_page_shift, - udata ? user_addr & ~PAGE_MASK : 0); - if (buf_page_cnt < 1 || buf_page_shift < HNS_HW_PAGE_SHIFT) { - ibdev_err(ibdev, "failed to init mtr cfg, count %d shift %u.\n", - buf_page_cnt, buf_page_shift); - return -EINVAL; - } - - ret = mtr_alloc_mtt(hr_dev, mtr, ba_page_shift); - if (ret) { - ibdev_err(ibdev, "failed to alloc mtr mtt, ret = %d.\n", ret); - return ret; - } - /* The caller has its own buffer list and invokes the hns_roce_mtr_map() * to finish the MTT configuration. */ if (buf_attr->mtt_only) { mtr->umem = NULL; mtr->kmem = NULL; - return 0; + } else { + ret = mtr_alloc_bufs(hr_dev, mtr, buf_attr, udata, user_addr); + if (ret) { + ibdev_err(ibdev, + "failed to alloc mtr bufs, ret = %d.\n", ret); + return ret; + } + + ret = get_best_page_shift(hr_dev, mtr, buf_attr); + if (ret) + goto err_init_buf; + + ret = get_best_hop_num(hr_dev, mtr, buf_attr, ba_page_shift); + if (ret) + goto err_init_buf; } - ret = mtr_alloc_bufs(hr_dev, mtr, buf_attr, udata, user_addr); + ret = mtr_init_buf_cfg(hr_dev, mtr, buf_attr); + if (ret) + goto err_init_buf; + + ret = mtr_alloc_mtt(hr_dev, mtr, ba_page_shift); if (ret) { - ibdev_err(ibdev, "failed to alloc mtr bufs, ret = %d.\n", ret); - goto err_alloc_mtt; + ibdev_err(ibdev, "failed to alloc mtr mtt, ret = %d.\n", ret); + goto err_init_buf; } + if (buf_attr->mtt_only) + return 0; + /* Write buffer's dma address to MTT */ - ret = mtr_map_bufs(hr_dev, mtr, buf_page_cnt, buf_page_shift); - if (ret) + ret = mtr_map_bufs(hr_dev, mtr); + if (ret) { ibdev_err(ibdev, "failed to map mtr bufs, ret = %d.\n", ret); - else - return 0; + goto err_alloc_mtt; + } + + return 0; - mtr_free_bufs(hr_dev, mtr); err_alloc_mtt: mtr_free_mtt(hr_dev, mtr); +err_init_buf: + mtr_free_bufs(hr_dev, mtr); + return ret; } diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 31b147210688..8901c142c1b6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -39,6 +39,25 @@ #include "hns_roce_device.h" #include "hns_roce_hem.h" +static struct hns_roce_qp *hns_roce_qp_lookup(struct hns_roce_dev *hr_dev, + u32 qpn) +{ + struct device *dev = hr_dev->dev; + struct hns_roce_qp *qp; + unsigned long flags; + + xa_lock_irqsave(&hr_dev->qp_table_xa, flags); + qp = __hns_roce_qp_lookup(hr_dev, qpn); + if (qp) + refcount_inc(&qp->refcount); + xa_unlock_irqrestore(&hr_dev->qp_table_xa, flags); + + if (!qp) + dev_warn(dev, "async event for bogus QP %08x\n", qpn); + + return qp; +} + static void flush_work_handle(struct work_struct *work) { struct hns_roce_work *flush_work = container_of(work, @@ -71,11 +90,18 @@ static void flush_work_handle(struct work_struct *work) void init_flush_work(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) { struct hns_roce_work *flush_work = &hr_qp->flush_work; + unsigned long flags; + + spin_lock_irqsave(&hr_qp->flush_lock, flags); + /* Exit directly after destroy_qp() */ + if (test_bit(HNS_ROCE_STOP_FLUSH_FLAG, &hr_qp->flush_flag)) { + spin_unlock_irqrestore(&hr_qp->flush_lock, flags); + return; + } - flush_work->hr_dev = hr_dev; - INIT_WORK(&flush_work->work, flush_work_handle); refcount_inc(&hr_qp->refcount); queue_work(hr_dev->irq_workq, &flush_work->work); + spin_unlock_irqrestore(&hr_qp->flush_lock, flags); } void flush_cqe(struct hns_roce_dev *dev, struct hns_roce_qp *qp) @@ -95,31 +121,28 @@ void flush_cqe(struct hns_roce_dev *dev, struct hns_roce_qp *qp) void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type) { - struct device *dev = hr_dev->dev; struct hns_roce_qp *qp; - xa_lock(&hr_dev->qp_table_xa); - qp = __hns_roce_qp_lookup(hr_dev, qpn); - if (qp) - refcount_inc(&qp->refcount); - xa_unlock(&hr_dev->qp_table_xa); - - if (!qp) { - dev_warn(dev, "async event for bogus QP %08x\n", qpn); + qp = hns_roce_qp_lookup(hr_dev, qpn); + if (!qp) return; - } - if (event_type == HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR || - event_type == HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR || - event_type == HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR || - event_type == HNS_ROCE_EVENT_TYPE_XRCD_VIOLATION || - event_type == HNS_ROCE_EVENT_TYPE_INVALID_XRCETH) { - qp->state = IB_QPS_ERR; + qp->event(qp, (enum hns_roce_event)event_type); - flush_cqe(hr_dev, qp); - } + if (refcount_dec_and_test(&qp->refcount)) + complete(&qp->free); +} - qp->event(qp, (enum hns_roce_event)event_type); +void hns_roce_flush_cqe(struct hns_roce_dev *hr_dev, u32 qpn) +{ + struct hns_roce_qp *qp; + + qp = hns_roce_qp_lookup(hr_dev, qpn); + if (!qp) + return; + + qp->state = IB_QPS_ERR; + flush_cqe(hr_dev, qp); if (refcount_dec_and_test(&qp->refcount)) complete(&qp->free); @@ -410,7 +433,8 @@ static void free_qpn(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) bankid = get_qp_bankid(hr_qp->qpn); - ida_free(&hr_dev->qp_table.bank[bankid].ida, hr_qp->qpn >> 3); + ida_free(&hr_dev->qp_table.bank[bankid].ida, + hr_qp->qpn / HNS_ROCE_QP_BANK_NUM); mutex_lock(&hr_dev->qp_table.bank_mutex); hr_dev->qp_table.bank[bankid].inuse--; @@ -531,13 +555,15 @@ static unsigned int get_sge_num_from_max_inl_data(bool is_ud_or_gsi, { unsigned int inline_sge; - inline_sge = roundup_pow_of_two(max_inline_data) / HNS_ROCE_SGE_SIZE; + if (!max_inline_data) + return 0; /* * if max_inline_data less than * HNS_ROCE_SGE_IN_WQE * HNS_ROCE_SGE_SIZE, * In addition to ud's mode, no need to extend sge. */ + inline_sge = roundup_pow_of_two(max_inline_data) / HNS_ROCE_SGE_SIZE; if (!is_ud_or_gsi && inline_sge <= HNS_ROCE_SGE_IN_WQE) inline_sge = 0; @@ -842,12 +868,14 @@ static int alloc_user_qp_db(struct hns_roce_dev *hr_dev, struct hns_roce_ib_create_qp *ucmd, struct hns_roce_ib_create_qp_resp *resp) { + bool has_sdb = user_qp_has_sdb(hr_dev, init_attr, udata, resp, ucmd); struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(udata, struct hns_roce_ucontext, ibucontext); + bool has_rdb = user_qp_has_rdb(hr_dev, init_attr, udata, resp); struct ib_device *ibdev = &hr_dev->ib_dev; int ret; - if (user_qp_has_sdb(hr_dev, init_attr, udata, resp, ucmd)) { + if (has_sdb) { ret = hns_roce_db_map_user(uctx, ucmd->sdb_addr, &hr_qp->sdb); if (ret) { ibdev_err(ibdev, @@ -858,7 +886,7 @@ static int alloc_user_qp_db(struct hns_roce_dev *hr_dev, hr_qp->en_flags |= HNS_ROCE_QP_CAP_SQ_RECORD_DB; } - if (user_qp_has_rdb(hr_dev, init_attr, udata, resp)) { + if (has_rdb) { ret = hns_roce_db_map_user(uctx, ucmd->db_addr, &hr_qp->rdb); if (ret) { ibdev_err(ibdev, @@ -872,7 +900,7 @@ static int alloc_user_qp_db(struct hns_roce_dev *hr_dev, return 0; err_sdb: - if (hr_qp->en_flags & HNS_ROCE_QP_CAP_SQ_RECORD_DB) + if (has_sdb) hns_roce_db_unmap_user(uctx, &hr_qp->sdb); err_out: return ret; @@ -1004,6 +1032,60 @@ static void free_kernel_wrid(struct hns_roce_qp *hr_qp) kfree(hr_qp->sq.wrid); } +static void default_congest_type(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp) +{ + if (hr_qp->ibqp.qp_type == IB_QPT_UD || + hr_qp->ibqp.qp_type == IB_QPT_GSI) + hr_qp->cong_type = CONG_TYPE_DCQCN; + else + hr_qp->cong_type = hr_dev->caps.default_cong_type; +} + +static int set_congest_type(struct hns_roce_qp *hr_qp, + struct hns_roce_ib_create_qp *ucmd) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device); + + switch (ucmd->cong_type_flags) { + case HNS_ROCE_CREATE_QP_FLAGS_DCQCN: + hr_qp->cong_type = CONG_TYPE_DCQCN; + break; + case HNS_ROCE_CREATE_QP_FLAGS_LDCP: + hr_qp->cong_type = CONG_TYPE_LDCP; + break; + case HNS_ROCE_CREATE_QP_FLAGS_HC3: + hr_qp->cong_type = CONG_TYPE_HC3; + break; + case HNS_ROCE_CREATE_QP_FLAGS_DIP: + hr_qp->cong_type = CONG_TYPE_DIP; + break; + default: + return -EINVAL; + } + + if (!test_bit(hr_qp->cong_type, (unsigned long *)&hr_dev->caps.cong_cap)) + return -EOPNOTSUPP; + + if (hr_qp->ibqp.qp_type == IB_QPT_UD && + hr_qp->cong_type != CONG_TYPE_DCQCN) + return -EOPNOTSUPP; + + return 0; +} + +static int set_congest_param(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + struct hns_roce_ib_create_qp *ucmd) +{ + if (ucmd->comp_mask & HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE) + return set_congest_type(hr_qp, ucmd); + + default_congest_type(hr_dev, hr_qp); + + return 0; +} + static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, @@ -1039,13 +1121,18 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, ibucontext); hr_qp->config = uctx->config; ret = set_user_sq_size(hr_dev, &init_attr->cap, hr_qp, ucmd); - if (ret) + if (ret) { ibdev_err(ibdev, "failed to set user SQ size, ret = %d.\n", ret); + return ret; + } + + ret = set_congest_param(hr_dev, hr_qp, ucmd); } else { if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) hr_qp->config = HNS_ROCE_EXSGE_FLAGS; + default_congest_type(hr_dev, hr_qp); ret = set_kernel_sq_size(hr_dev, &init_attr->cap, hr_qp); if (ret) ibdev_err(ibdev, @@ -1057,11 +1144,11 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, } static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, - struct ib_pd *ib_pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, struct hns_roce_qp *hr_qp) { + struct hns_roce_work *flush_work = &hr_qp->flush_work; struct hns_roce_ib_create_qp_resp resp = {}; struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_ib_create_qp ucmd = {}; @@ -1070,9 +1157,12 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, mutex_init(&hr_qp->mutex); spin_lock_init(&hr_qp->sq.lock); spin_lock_init(&hr_qp->rq.lock); + spin_lock_init(&hr_qp->flush_lock); hr_qp->state = IB_QPS_RESET; hr_qp->flush_flag = 0; + flush_work->hr_dev = hr_dev; + INIT_WORK(&flush_work->work, flush_work_handle); if (init_attr->create_flags) return -EOPNOTSUPP; @@ -1080,7 +1170,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, ret = set_qp_param(hr_dev, hr_qp, init_attr, udata, &ucmd); if (ret) { ibdev_err(ibdev, "failed to set QP param, ret = %d.\n", ret); - return ret; + goto err_out; } if (!udata) { @@ -1088,7 +1178,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, if (ret) { ibdev_err(ibdev, "failed to alloc wrid, ret = %d.\n", ret); - return ret; + goto err_out; } } @@ -1130,7 +1220,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, min(udata->outlen, sizeof(resp))); if (ret) { ibdev_err(ibdev, "copy qp resp failed!\n"); - goto err_store; + goto err_flow_ctrl; } } @@ -1159,6 +1249,8 @@ err_qpn: free_qp_buf(hr_dev, hr_qp); err_buf: free_kernel_wrid(hr_qp); +err_out: + mutex_destroy(&hr_qp->mutex); return ret; } @@ -1174,6 +1266,7 @@ void hns_roce_qp_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, free_qp_buf(hr_dev, hr_qp); free_kernel_wrid(hr_qp); free_qp_db(hr_dev, hr_qp, udata); + mutex_destroy(&hr_qp->mutex); } static int check_qp_type(struct hns_roce_dev *hr_dev, enum ib_qp_type type, @@ -1211,7 +1304,6 @@ int hns_roce_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *init_attr, struct ib_device *ibdev = qp->device; struct hns_roce_dev *hr_dev = to_hr_dev(ibdev); struct hns_roce_qp *hr_qp = to_hr_qp(qp); - struct ib_pd *pd = qp->pd; int ret; ret = check_qp_type(hr_dev, init_attr->qp_type, !!udata); @@ -1226,7 +1318,7 @@ int hns_roce_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *init_attr, hr_qp->phy_port = hr_dev->iboe.phy_port[hr_qp->port]; } - ret = hns_roce_create_qp_common(hr_dev, pd, init_attr, udata, hr_qp); + ret = hns_roce_create_qp_common(hr_dev, init_attr, udata, hr_qp); if (ret) ibdev_err(ibdev, "create QP type 0x%x failed(%d)\n", init_attr->qp_type, ret); @@ -1326,6 +1418,7 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_ib_modify_qp_resp resp = {}; struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); enum ib_qp_state cur_state, new_state; int ret = -EINVAL; @@ -1367,6 +1460,18 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, ret = hr_dev->hw->modify_qp(ibqp, attr, attr_mask, cur_state, new_state, udata); + if (ret) + goto out; + + if (udata && udata->outlen) { + resp.tc_mode = hr_qp->tc_mode; + resp.priority = hr_qp->sl; + ret = ib_copy_to_udata(udata, &resp, + min(udata->outlen, sizeof(resp))); + if (ret) + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to copy modify qp resp.\n"); + } out: mutex_unlock(&hr_qp->mutex); @@ -1383,19 +1488,19 @@ void hns_roce_lock_cqs(struct hns_roce_cq *send_cq, struct hns_roce_cq *recv_cq) __acquire(&send_cq->lock); __acquire(&recv_cq->lock); } else if (unlikely(send_cq != NULL && recv_cq == NULL)) { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); __acquire(&recv_cq->lock); } else if (unlikely(send_cq == NULL && recv_cq != NULL)) { - spin_lock_irq(&recv_cq->lock); + spin_lock(&recv_cq->lock); __acquire(&send_cq->lock); } else if (send_cq == recv_cq) { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); __acquire(&recv_cq->lock); } else if (send_cq->cqn < recv_cq->cqn) { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); } else { - spin_lock_irq(&recv_cq->lock); + spin_lock(&recv_cq->lock); spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); } } @@ -1415,13 +1520,13 @@ void hns_roce_unlock_cqs(struct hns_roce_cq *send_cq, spin_unlock(&recv_cq->lock); } else if (send_cq == recv_cq) { __release(&recv_cq->lock); - spin_unlock_irq(&send_cq->lock); + spin_unlock(&send_cq->lock); } else if (send_cq->cqn < recv_cq->cqn) { spin_unlock(&recv_cq->lock); - spin_unlock_irq(&send_cq->lock); + spin_unlock(&send_cq->lock); } else { spin_unlock(&send_cq->lock); - spin_unlock_irq(&recv_cq->lock); + spin_unlock(&recv_cq->lock); } } @@ -1469,14 +1574,10 @@ int hns_roce_init_qp_table(struct hns_roce_dev *hr_dev) unsigned int reserved_from_bot; unsigned int i; - qp_table->idx_table.spare_idx = kcalloc(hr_dev->caps.num_qps, - sizeof(u32), GFP_KERNEL); - if (!qp_table->idx_table.spare_idx) - return -ENOMEM; - mutex_init(&qp_table->scc_mutex); mutex_init(&qp_table->bank_mutex); xa_init(&hr_dev->qp_table_xa); + xa_init(&qp_table->dip_xa); reserved_from_bot = hr_dev->caps.reserved_qps; @@ -1501,5 +1602,8 @@ void hns_roce_cleanup_qp_table(struct hns_roce_dev *hr_dev) for (i = 0; i < HNS_ROCE_QP_BANK_NUM; i++) ida_destroy(&hr_dev->qp_table.bank[i].ida); - kfree(hr_dev->qp_table.idx_table.spare_idx); + xa_destroy(&hr_dev->qp_table.dip_xa); + xa_destroy(&hr_dev->qp_table_xa); + mutex_destroy(&hr_dev->qp_table.bank_mutex); + mutex_destroy(&hr_dev->qp_table.scc_mutex); } diff --git a/drivers/infiniband/hw/hns/hns_roce_restrack.c b/drivers/infiniband/hw/hns/hns_roce_restrack.c index f7f3c4cc7426..356d98816949 100644 --- a/drivers/infiniband/hw/hns/hns_roce_restrack.c +++ b/drivers/infiniband/hw/hns/hns_roce_restrack.c @@ -97,16 +97,33 @@ int hns_roce_fill_res_qp_entry_raw(struct sk_buff *msg, struct ib_qp *ib_qp) { struct hns_roce_dev *hr_dev = to_hr_dev(ib_qp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ib_qp); - struct hns_roce_v2_qp_context context; + struct hns_roce_full_qp_ctx { + struct hns_roce_v2_qp_context qpc; + struct hns_roce_v2_scc_context sccc; + } context = {}; int ret; if (!hr_dev->hw->query_qpc) return -EINVAL; - ret = hr_dev->hw->query_qpc(hr_dev, hr_qp->qpn, &context); + ret = hr_dev->hw->query_qpc(hr_dev, hr_qp->qpn, &context.qpc); if (ret) - return -EINVAL; + return ret; + + /* If SCC is disabled or the query fails, the queried SCCC will + * be all 0. + */ + if (!(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL) || + !hr_dev->hw->query_sccc) + goto out; + + ret = hr_dev->hw->query_sccc(hr_dev, hr_qp->qpn, &context.sccc); + if (ret) + ibdev_warn_ratelimited(&hr_dev->ib_dev, + "failed to query SCCC, ret = %d.\n", + ret); +out: ret = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, sizeof(context), &context); return ret; diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 4abae9477854..70c06ef65603 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -123,7 +123,7 @@ static int alloc_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) return ret; } - ret = xa_err(xa_store(&srq_table->xa, srq->srqn, srq, GFP_KERNEL)); + ret = xa_err(xa_store_irq(&srq_table->xa, srq->srqn, srq, GFP_KERNEL)); if (ret) { ibdev_err(ibdev, "failed to store SRQC, ret = %d.\n", ret); goto err_put; @@ -136,7 +136,7 @@ static int alloc_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) return 0; err_xa: - xa_erase(&srq_table->xa, srq->srqn); + xa_erase_irq(&srq_table->xa, srq->srqn); err_put: hns_roce_table_put(hr_dev, &srq_table->table, srq->srqn); @@ -151,10 +151,10 @@ static void free_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_SRQ, srq->srqn); if (ret) - dev_err(hr_dev->dev, "DESTROY_SRQ failed (%d) for SRQN %06lx\n", - ret, srq->srqn); + dev_err_ratelimited(hr_dev->dev, "DESTROY_SRQ failed (%d) for SRQN %06lx\n", + ret, srq->srqn); - xa_erase(&srq_table->xa, srq->srqn); + xa_erase_irq(&srq_table->xa, srq->srqn); if (refcount_dec_and_test(&srq->refcount)) complete(&srq->free); @@ -250,7 +250,7 @@ static void free_srq_wqe_buf(struct hns_roce_dev *hr_dev, hns_roce_mtr_destroy(hr_dev, &srq->buf_mtr); } -static int alloc_srq_wrid(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) +static int alloc_srq_wrid(struct hns_roce_srq *srq) { srq->wrid = kvmalloc_array(srq->wqe_cnt, sizeof(u64), GFP_KERNEL); if (!srq->wrid) @@ -297,7 +297,7 @@ static int set_srq_basic_param(struct hns_roce_srq *srq, max_sge = proc_srq_sge(hr_dev, srq, !!udata); if (attr->max_wr > hr_dev->caps.max_srq_wrs || - attr->max_sge > max_sge) { + attr->max_sge > max_sge || !attr->max_sge) { ibdev_err(&hr_dev->ib_dev, "invalid SRQ attr, depth = %u, sge = %u.\n", attr->max_wr, attr->max_sge); @@ -366,7 +366,7 @@ static int alloc_srq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq, goto err_idx; if (!udata) { - ret = alloc_srq_wrid(hr_dev, srq); + ret = alloc_srq_wrid(srq); if (ret) goto err_wqe_buf; } @@ -518,6 +518,7 @@ err_srq_db: err_srq_buf: free_srq_buf(hr_dev, srq); err_out: + mutex_destroy(&srq->mutex); atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_SRQ_CREATE_ERR_CNT]); return ret; @@ -532,6 +533,7 @@ int hns_roce_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) free_srqn(hr_dev, srq); free_srq_db(hr_dev, srq, udata); free_srq_buf(hr_dev, srq); + mutex_destroy(&srq->mutex); return 0; } diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index 1ee7a4e0d8d8..ce8d821bdad8 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -1985,7 +1985,8 @@ static int irdma_addr_resolve_neigh(struct irdma_device *iwdev, u32 src_ip, __be32 dst_ipaddr = htonl(dst_ip); __be32 src_ipaddr = htonl(src_ip); - rt = ip_route_output(&init_net, dst_ipaddr, src_ipaddr, 0, 0); + rt = ip_route_output(&init_net, dst_ipaddr, src_ipaddr, 0, 0, + RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) { ibdev_dbg(&iwdev->ibdev, "CM: ip_route_output fail\n"); return -EINVAL; @@ -3630,7 +3631,7 @@ void irdma_free_lsmm_rsrc(struct irdma_qp *iwqp) /** * irdma_accept - registered call for connection to be accepted * @cm_id: cm information for passive connection - * @conn_param: accpet parameters + * @conn_param: accept parameters */ int irdma_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) { diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index b65bc2ea542f..9f0ed6e84471 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -239,7 +239,7 @@ struct irdma_qv_info { struct irdma_qvlist_info { u32 num_vectors; - struct irdma_qv_info qv_info[]; + struct irdma_qv_info qv_info[] __counted_by(num_vectors); }; struct irdma_gen_ops { diff --git a/drivers/infiniband/hw/irdma/osdep.h b/drivers/infiniband/hw/irdma/osdep.h index e1e3d3ae72b7..ddf02a462efa 100644 --- a/drivers/infiniband/hw/irdma/osdep.h +++ b/drivers/infiniband/hw/irdma/osdep.h @@ -59,10 +59,6 @@ int irdma_cqp_sds_cmd(struct irdma_sc_dev *dev, int irdma_cqp_manage_hmc_fcn_cmd(struct irdma_sc_dev *dev, struct irdma_hmc_fcn_info *hmcfcninfo, u16 *pmf_idx); -int irdma_cqp_query_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id); -int irdma_cqp_commit_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id); int irdma_alloc_query_fpm_buf(struct irdma_sc_dev *dev, struct irdma_dma_mem *mem); void *irdma_remove_cqp_head(struct irdma_sc_dev *dev); diff --git a/drivers/infiniband/hw/irdma/protos.h b/drivers/infiniband/hw/irdma/protos.h index d7c8ea948bcd..c0c9441885d3 100644 --- a/drivers/infiniband/hw/irdma/protos.h +++ b/drivers/infiniband/hw/irdma/protos.h @@ -85,10 +85,6 @@ int irdma_process_cqp_cmd(struct irdma_sc_dev *dev, int irdma_process_bh(struct irdma_sc_dev *dev); int irdma_cqp_sds_cmd(struct irdma_sc_dev *dev, struct irdma_update_sds_info *info); -int irdma_cqp_query_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id); -int irdma_cqp_commit_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id); int irdma_alloc_query_fpm_buf(struct irdma_sc_dev *dev, struct irdma_dma_mem *mem); int irdma_cqp_manage_hmc_fcn_cmd(struct irdma_sc_dev *dev, diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 0422787592d8..0e594122baa7 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -320,9 +320,6 @@ int irdma_netdevice_event(struct notifier_block *notifier, unsigned long event, case NETDEV_DOWN: iwdev->iw_status = 0; fallthrough; - case NETDEV_UP: - irdma_port_ibevent(iwdev); - break; default: break; } @@ -972,74 +969,6 @@ void irdma_terminate_del_timer(struct irdma_sc_qp *qp) } /** - * irdma_cqp_query_fpm_val_cmd - send cqp command for fpm - * @dev: function device struct - * @val_mem: buffer for fpm - * @hmc_fn_id: function id for fpm - */ -int irdma_cqp_query_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id) -{ - struct irdma_cqp_request *cqp_request; - struct cqp_cmds_info *cqp_info; - struct irdma_pci_f *rf = dev_to_rf(dev); - int status; - - cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); - if (!cqp_request) - return -ENOMEM; - - cqp_info = &cqp_request->info; - cqp_request->param = NULL; - cqp_info->in.u.query_fpm_val.cqp = dev->cqp; - cqp_info->in.u.query_fpm_val.fpm_val_pa = val_mem->pa; - cqp_info->in.u.query_fpm_val.fpm_val_va = val_mem->va; - cqp_info->in.u.query_fpm_val.hmc_fn_id = hmc_fn_id; - cqp_info->cqp_cmd = IRDMA_OP_QUERY_FPM_VAL; - cqp_info->post_sq = 1; - cqp_info->in.u.query_fpm_val.scratch = (uintptr_t)cqp_request; - - status = irdma_handle_cqp_op(rf, cqp_request); - irdma_put_cqp_request(&rf->cqp, cqp_request); - - return status; -} - -/** - * irdma_cqp_commit_fpm_val_cmd - commit fpm values in hw - * @dev: hardware control device structure - * @val_mem: buffer with fpm values - * @hmc_fn_id: function id for fpm - */ -int irdma_cqp_commit_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id) -{ - struct irdma_cqp_request *cqp_request; - struct cqp_cmds_info *cqp_info; - struct irdma_pci_f *rf = dev_to_rf(dev); - int status; - - cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); - if (!cqp_request) - return -ENOMEM; - - cqp_info = &cqp_request->info; - cqp_request->param = NULL; - cqp_info->in.u.commit_fpm_val.cqp = dev->cqp; - cqp_info->in.u.commit_fpm_val.fpm_val_pa = val_mem->pa; - cqp_info->in.u.commit_fpm_val.fpm_val_va = val_mem->va; - cqp_info->in.u.commit_fpm_val.hmc_fn_id = hmc_fn_id; - cqp_info->cqp_cmd = IRDMA_OP_COMMIT_FPM_VAL; - cqp_info->post_sq = 1; - cqp_info->in.u.commit_fpm_val.scratch = (uintptr_t)cqp_request; - - status = irdma_handle_cqp_op(rf, cqp_request); - irdma_put_cqp_request(&rf->cqp, cqp_request); - - return status; -} - -/** * irdma_cqp_cq_create_cmd - create a cq for the cqp * @dev: device pointer * @cq: pointer to created cq diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 0b046c061742..eeb932e58730 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -719,7 +719,6 @@ static int irdma_setup_kmode_qp(struct irdma_device *iwdev, info->rq_pa + (ukinfo->rq_depth * IRDMA_QP_WQE_MIN_SIZE); ukinfo->sq_size = ukinfo->sq_depth >> ukinfo->sq_shift; ukinfo->rq_size = ukinfo->rq_depth >> ukinfo->rq_shift; - ukinfo->qp_id = iwqp->ibqp.qp_num; iwqp->max_send_wr = (ukinfo->sq_depth - IRDMA_SQ_RSVD) >> ukinfo->sq_shift; iwqp->max_recv_wr = (ukinfo->rq_depth - IRDMA_RQ_RSVD) >> ukinfo->rq_shift; @@ -944,7 +943,7 @@ static int irdma_create_qp(struct ib_qp *ibqp, iwqp->host_ctx.size = IRDMA_QP_CTX_SIZE; init_info.pd = &iwpd->sc_pd; - init_info.qp_uk_init_info.qp_id = iwqp->ibqp.qp_num; + init_info.qp_uk_init_info.qp_id = qp_num; if (!rdma_protocol_roce(&iwdev->ibdev, 1)) init_info.qp_uk_init_info.first_sq_wq = 1; iwqp->ctx_info.qp_compl_ctx = (uintptr_t)qp; @@ -1348,7 +1347,7 @@ int irdma_modify_qp_roce(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr->max_dest_rd_atomic > dev->hw_attrs.max_hw_ird) { ibdev_err(&iwdev->ibdev, "rd_atomic = %d, above max_hw_ird=%d\n", - attr->max_rd_atomic, + attr->max_dest_rd_atomic, dev->hw_attrs.max_hw_ird); return -EINVAL; } @@ -2036,14 +2035,15 @@ static inline int cq_validate_flags(u32 flags, u8 hw_rev) * irdma_create_cq - create cq * @ibcq: CQ allocated * @attr: attributes for cq - * @udata: user data + * @attrs: uverbs attribute bundle */ static int irdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { #define IRDMA_CREATE_CQ_MIN_REQ_LEN offsetofend(struct irdma_create_cq_req, user_cq_buf) #define IRDMA_CREATE_CQ_MIN_RESP_LEN offsetofend(struct irdma_create_cq_resp, cq_size) + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; struct irdma_device *iwdev = to_iwdev(ibdev); struct irdma_pci_f *rf = iwdev->rf; @@ -3085,7 +3085,7 @@ error: static struct ib_mr *irdma_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, u64 len, u64 virt, int fd, int access, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { struct irdma_device *iwdev = to_iwdev(pd->device); struct ib_umem_dmabuf *umem_dmabuf; diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index 83ebd070535a..f04a679d2871 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -6,26 +6,27 @@ #include "mana_ib.h" int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); + struct mana_ib_create_cq_resp resp = {}; + struct mana_ib_ucontext *mana_ucontext; struct ib_device *ibdev = ibcq->device; struct mana_ib_create_cq ucmd = {}; struct mana_ib_dev *mdev; - struct gdma_context *gc; + bool is_rnic_cq; + u32 doorbell; int err; mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); - gc = mdev->gdma_dev->gdma_context; - if (udata->inlen < sizeof(ucmd)) - return -EINVAL; + cq->comp_vector = attr->comp_vector % ibdev->num_comp_vectors; + cq->cq_handle = INVALID_MANA_HANDLE; - if (attr->comp_vector > gc->max_num_queues) + if (udata->inlen < offsetof(struct mana_ib_create_cq, flags)) return -EINVAL; - cq->comp_vector = attr->comp_vector; - err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); if (err) { ibdev_dbg(ibdev, @@ -33,42 +34,54 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, return err; } - if (attr->cqe > mdev->adapter_caps.max_qp_wr) { + is_rnic_cq = !!(ucmd.flags & MANA_IB_CREATE_RNIC_CQ); + + if (!is_rnic_cq && attr->cqe > mdev->adapter_caps.max_qp_wr) { ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe); return -EINVAL; } cq->cqe = attr->cqe; - cq->umem = ib_umem_get(ibdev, ucmd.buf_addr, cq->cqe * COMP_ENTRY_SIZE, - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(cq->umem)) { - err = PTR_ERR(cq->umem); - ibdev_dbg(ibdev, "Failed to get umem for create cq, err %d\n", - err); + err = mana_ib_create_queue(mdev, ucmd.buf_addr, cq->cqe * COMP_ENTRY_SIZE, &cq->queue); + if (err) { + ibdev_dbg(ibdev, "Failed to create queue for create cq, %d\n", err); return err; } - err = mana_ib_gd_create_dma_region(mdev, cq->umem, &cq->gdma_region); - if (err) { - ibdev_dbg(ibdev, - "Failed to create dma region for create cq, %d\n", - err); - goto err_release_umem; + mana_ucontext = rdma_udata_to_drv_context(udata, struct mana_ib_ucontext, + ibucontext); + doorbell = mana_ucontext->doorbell; + + if (is_rnic_cq) { + err = mana_ib_gd_create_cq(mdev, cq, doorbell); + if (err) { + ibdev_dbg(ibdev, "Failed to create RNIC cq, %d\n", err); + goto err_destroy_queue; + } + + err = mana_ib_install_cq_cb(mdev, cq); + if (err) { + ibdev_dbg(ibdev, "Failed to install cq callback, %d\n", err); + goto err_destroy_rnic_cq; + } } - ibdev_dbg(ibdev, - "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n", - err, cq->gdma_region); - - /* - * The CQ ID is not known at this time. The ID is generated at create_qp - */ - cq->id = INVALID_QUEUE_ID; + resp.cqid = cq->queue.id; + err = ib_copy_to_udata(udata, &resp, min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(&mdev->ib_dev, "Failed to copy to udata, %d\n", err); + goto err_remove_cq_cb; + } return 0; -err_release_umem: - ib_umem_release(cq->umem); +err_remove_cq_cb: + mana_ib_remove_cq_cb(mdev, cq); +err_destroy_rnic_cq: + mana_ib_gd_destroy_cq(mdev, cq); +err_destroy_queue: + mana_ib_destroy_queue(mdev, &cq->queue); + return err; } @@ -77,33 +90,57 @@ int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); struct ib_device *ibdev = ibcq->device; struct mana_ib_dev *mdev; - struct gdma_context *gc; - int err; mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); - gc = mdev->gdma_dev->gdma_context; - err = mana_ib_gd_destroy_dma_region(mdev, cq->gdma_region); - if (err) { - ibdev_dbg(ibdev, - "Failed to destroy dma region, %d\n", err); - return err; - } + mana_ib_remove_cq_cb(mdev, cq); - if (cq->id != INVALID_QUEUE_ID) { - kfree(gc->cq_table[cq->id]); - gc->cq_table[cq->id] = NULL; - } + /* Ignore return code as there is not much we can do about it. + * The error message is printed inside. + */ + mana_ib_gd_destroy_cq(mdev, cq); - ib_umem_release(cq->umem); + mana_ib_destroy_queue(mdev, &cq->queue); return 0; } -void mana_ib_cq_handler(void *ctx, struct gdma_queue *gdma_cq) +static void mana_ib_cq_handler(void *ctx, struct gdma_queue *gdma_cq) { struct mana_ib_cq *cq = ctx; if (cq->ibcq.comp_handler) cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); } + +int mana_ib_install_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) +{ + struct gdma_context *gc = mdev_to_gc(mdev); + struct gdma_queue *gdma_cq; + + if (cq->queue.id >= gc->max_num_cqs) + return -EINVAL; + /* Create CQ table entry */ + WARN_ON(gc->cq_table[cq->queue.id]); + gdma_cq = kzalloc(sizeof(*gdma_cq), GFP_KERNEL); + if (!gdma_cq) + return -ENOMEM; + + gdma_cq->cq.context = cq; + gdma_cq->type = GDMA_CQ; + gdma_cq->cq.callback = mana_ib_cq_handler; + gdma_cq->id = cq->queue.id; + gc->cq_table[cq->queue.id] = gdma_cq; + return 0; +} + +void mana_ib_remove_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) +{ + struct gdma_context *gc = mdev_to_gc(mdev); + + if (cq->queue.id >= gc->max_num_cqs || cq->queue.id == INVALID_QUEUE_ID) + return; + + kfree(gc->cq_table[cq->queue.id]); + gc->cq_table[cq->queue.id] = NULL; +} diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 6fa902ee80a6..3416a85f8738 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -5,16 +5,18 @@ #include "mana_ib.h" #include <net/mana/mana_auxiliary.h> +#include <net/addrconf.h> MODULE_DESCRIPTION("Microsoft Azure Network Adapter IB driver"); MODULE_LICENSE("GPL"); -MODULE_IMPORT_NS(NET_MANA); +MODULE_IMPORT_NS("NET_MANA"); static const struct ib_device_ops mana_ib_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_MANA, .uverbs_abi_ver = MANA_IB_UVERBS_ABI_VERSION, + .add_gid = mana_ib_gd_add_gid, .alloc_pd = mana_ib_alloc_pd, .alloc_ucontext = mana_ib_alloc_ucontext, .create_cq = mana_ib_create_cq, @@ -23,18 +25,21 @@ static const struct ib_device_ops mana_ib_dev_ops = { .create_wq = mana_ib_create_wq, .dealloc_pd = mana_ib_dealloc_pd, .dealloc_ucontext = mana_ib_dealloc_ucontext, + .del_gid = mana_ib_gd_del_gid, .dereg_mr = mana_ib_dereg_mr, .destroy_cq = mana_ib_destroy_cq, .destroy_qp = mana_ib_destroy_qp, .destroy_rwq_ind_table = mana_ib_destroy_rwq_ind_table, .destroy_wq = mana_ib_destroy_wq, .disassociate_ucontext = mana_ib_disassociate_ucontext, + .get_link_layer = mana_ib_get_link_layer, .get_port_immutable = mana_ib_get_port_immutable, .mmap = mana_ib_mmap, .modify_qp = mana_ib_modify_qp, .modify_wq = mana_ib_modify_wq, .query_device = mana_ib_query_device, .query_gid = mana_ib_query_gid, + .query_pkey = mana_ib_query_pkey, .query_port = mana_ib_query_port, .reg_user_mr = mana_ib_reg_user_mr, @@ -51,8 +56,10 @@ static int mana_ib_probe(struct auxiliary_device *adev, { struct mana_adev *madev = container_of(adev, struct mana_adev, adev); struct gdma_dev *mdev = madev->mdev; + struct net_device *ndev; struct mana_context *mc; struct mana_ib_dev *dev; + u8 mac_addr[ETH_ALEN]; int ret; mc = mdev->driver_data; @@ -74,9 +81,26 @@ static int mana_ib_probe(struct auxiliary_device *adev, * num_comp_vectors needs to set to the max MSIX index * when interrupts and event queues are implemented */ - dev->ib_dev.num_comp_vectors = 1; + dev->ib_dev.num_comp_vectors = mdev->gdma_context->max_num_queues; dev->ib_dev.dev.parent = mdev->gdma_context->dev; + rcu_read_lock(); /* required to get primary netdev */ + ndev = mana_get_primary_netdev_rcu(mc, 0); + if (!ndev) { + rcu_read_unlock(); + ret = -ENODEV; + ibdev_err(&dev->ib_dev, "Failed to get netdev for IB port 1"); + goto free_ib_device; + } + ether_addr_copy(mac_addr, ndev->dev_addr); + addrconf_addr_eui48((u8 *)&dev->ib_dev.node_guid, ndev->dev_addr); + ret = ib_device_set_netdev(&dev->ib_dev, ndev, 1); + rcu_read_unlock(); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to set ib netdev, ret %d", ret); + goto free_ib_device; + } + ret = mana_gd_register_device(&mdev->gdma_context->mana_ib); if (ret) { ibdev_err(&dev->ib_dev, "Failed to register device, ret %d", @@ -92,15 +116,38 @@ static int mana_ib_probe(struct auxiliary_device *adev, goto deregister_device; } + ret = mana_ib_create_eqs(dev); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to create EQs, ret %d", ret); + goto deregister_device; + } + + ret = mana_ib_gd_create_rnic_adapter(dev); + if (ret) + goto destroy_eqs; + + xa_init_flags(&dev->qp_table_wq, XA_FLAGS_LOCK_IRQ); + ret = mana_ib_gd_config_mac(dev, ADDR_OP_ADD, mac_addr); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to add Mac address, ret %d", + ret); + goto destroy_rnic; + } + ret = ib_register_device(&dev->ib_dev, "mana_%d", mdev->gdma_context->dev); if (ret) - goto deregister_device; + goto destroy_rnic; dev_set_drvdata(&adev->dev, dev); return 0; +destroy_rnic: + xa_destroy(&dev->qp_table_wq); + mana_ib_gd_destroy_rnic_adapter(dev); +destroy_eqs: + mana_ib_destroy_eqs(dev); deregister_device: mana_gd_deregister_device(dev->gdma_dev); free_ib_device: @@ -113,9 +160,10 @@ static void mana_ib_remove(struct auxiliary_device *adev) struct mana_ib_dev *dev = dev_get_drvdata(&adev->dev); ib_unregister_device(&dev->ib_dev); - + xa_destroy(&dev->qp_table_wq); + mana_ib_gd_destroy_rnic_adapter(dev); + mana_ib_destroy_eqs(dev); mana_gd_deregister_device(dev->gdma_dev); - ib_dealloc_device(&dev->ib_dev); } diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index faca092456fa..457cea6d9909 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -8,13 +8,10 @@ void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd, u32 port) { - struct gdma_dev *gd = &dev->gdma_dev->gdma_context->mana; struct mana_port_context *mpc; struct net_device *ndev; - struct mana_context *mc; - mc = gd->driver_data; - ndev = mc->ports[port]; + ndev = mana_ib_get_netdev(&dev->ib_dev, port); mpc = netdev_priv(ndev); mutex_lock(&pd->vport_mutex); @@ -31,14 +28,11 @@ void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd, int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd, u32 doorbell_id) { - struct gdma_dev *mdev = &dev->gdma_dev->gdma_context->mana; struct mana_port_context *mpc; - struct mana_context *mc; struct net_device *ndev; int err; - mc = mdev->driver_data; - ndev = mc->ports[port]; + ndev = mana_ib_get_netdev(&dev->ib_dev, port); mpc = netdev_priv(ndev); mutex_lock(&pd->vport_mutex); @@ -79,17 +73,17 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) struct gdma_create_pd_req req = {}; enum gdma_pd_flags flags = 0; struct mana_ib_dev *dev; - struct gdma_dev *mdev; + struct gdma_context *gc; int err; dev = container_of(ibdev, struct mana_ib_dev, ib_dev); - mdev = dev->gdma_dev; + gc = mdev_to_gc(dev); mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_PD, sizeof(req), sizeof(resp)); req.flags = flags; - err = mana_gd_send_request(mdev->gdma_context, sizeof(req), &req, + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); if (err || resp.hdr.status) { @@ -119,17 +113,17 @@ int mana_ib_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) struct gdma_destory_pd_resp resp = {}; struct gdma_destroy_pd_req req = {}; struct mana_ib_dev *dev; - struct gdma_dev *mdev; + struct gdma_context *gc; int err; dev = container_of(ibdev, struct mana_ib_dev, ib_dev); - mdev = dev->gdma_dev; + gc = mdev_to_gc(dev); mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_PD, sizeof(req), sizeof(resp)); req.pd_handle = pd->pd_handle; - err = mana_gd_send_request(mdev->gdma_context, sizeof(req), &req, + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); if (err || resp.hdr.status) { @@ -180,7 +174,7 @@ static int mana_gd_allocate_doorbell_page(struct gdma_context *gc, req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE; req.num_resources = 1; - req.alignment = 1; + req.alignment = PAGE_SIZE / MANA_PAGE_SIZE; /* Have GDMA start searching from 0 */ req.allocated_resources = 0; @@ -206,13 +200,11 @@ int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext, struct ib_device *ibdev = ibcontext->device; struct mana_ib_dev *mdev; struct gdma_context *gc; - struct gdma_dev *dev; int doorbell_page; int ret; mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); - dev = mdev->gdma_dev; - gc = dev->gdma_context; + gc = mdev_to_gc(mdev); /* Allocate a doorbell page index */ ret = mana_gd_allocate_doorbell_page(gc, &doorbell_page); @@ -238,13 +230,54 @@ void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) int ret; mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); - gc = mdev->gdma_dev->gdma_context; + gc = mdev_to_gc(mdev); ret = mana_gd_destroy_doorbell_page(gc, mana_ucontext->doorbell); if (ret) ibdev_dbg(ibdev, "Failed to destroy doorbell page %d\n", ret); } +int mana_ib_create_queue(struct mana_ib_dev *mdev, u64 addr, u32 size, + struct mana_ib_queue *queue) +{ + struct ib_umem *umem; + int err; + + queue->umem = NULL; + queue->id = INVALID_QUEUE_ID; + queue->gdma_region = GDMA_INVALID_DMA_REGION; + + umem = ib_umem_get(&mdev->ib_dev, addr, size, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + ibdev_dbg(&mdev->ib_dev, "Failed to get umem, %d\n", err); + return err; + } + + err = mana_ib_create_zero_offset_dma_region(mdev, umem, &queue->gdma_region); + if (err) { + ibdev_dbg(&mdev->ib_dev, "Failed to create dma region, %d\n", err); + goto free_umem; + } + queue->umem = umem; + + ibdev_dbg(&mdev->ib_dev, "created dma region 0x%llx\n", queue->gdma_region); + + return 0; +free_umem: + ib_umem_release(umem); + return err; +} + +void mana_ib_destroy_queue(struct mana_ib_dev *mdev, struct mana_ib_queue *queue) +{ + /* Ignore return code as there is not much we can do about it. + * The error message is printed inside. + */ + mana_ib_gd_destroy_dma_region(mdev, queue->gdma_region); + ib_umem_release(queue->umem); +} + static int mana_ib_gd_first_dma_region(struct mana_ib_dev *dev, struct gdma_context *gc, @@ -309,8 +342,8 @@ mana_ib_gd_add_dma_region(struct mana_ib_dev *dev, struct gdma_context *gc, return 0; } -int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, - mana_handle_t *gdma_region) +static int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, + mana_handle_t *gdma_region, unsigned long page_sz) { struct gdma_dma_region_add_pages_req *add_req = NULL; size_t num_pages_processed = 0, num_pages_to_handle; @@ -322,23 +355,14 @@ int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, size_t max_pgs_create_cmd; struct gdma_context *gc; size_t num_pages_total; - struct gdma_dev *mdev; - unsigned long page_sz; unsigned int tail = 0; u64 *page_addr_list; void *request_buf; int err; - mdev = dev->gdma_dev; - gc = mdev->gdma_context; + gc = mdev_to_gc(dev); hwc = gc->hwc.driver_data; - /* Hardware requires dma region to align to chosen page size */ - page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, 0); - if (!page_sz) { - ibdev_dbg(&dev->ib_dev, "failed to find page size.\n"); - return -ENOMEM; - } num_pages_total = ib_umem_num_dma_blocks(umem, page_sz); max_pgs_create_cmd = @@ -358,8 +382,8 @@ int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, sizeof(struct gdma_create_dma_region_resp)); create_req->length = umem->length; - create_req->offset_in_page = umem->address & (page_sz - 1); - create_req->gdma_page_type = order_base_2(page_sz) - PAGE_SHIFT; + create_req->offset_in_page = ib_umem_dma_offset(umem, page_sz); + create_req->gdma_page_type = order_base_2(page_sz) - MANA_PAGE_SHIFT; create_req->page_count = num_pages_total; ibdev_dbg(&dev->ib_dev, "size_dma_region %lu num_pages_total %lu\n", @@ -424,12 +448,39 @@ out: return err; } +int mana_ib_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, + mana_handle_t *gdma_region, u64 virt) +{ + unsigned long page_sz; + + page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, virt); + if (!page_sz) { + ibdev_dbg(&dev->ib_dev, "Failed to find page size.\n"); + return -EINVAL; + } + + return mana_ib_gd_create_dma_region(dev, umem, gdma_region, page_sz); +} + +int mana_ib_create_zero_offset_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, + mana_handle_t *gdma_region) +{ + unsigned long page_sz; + + /* Hardware requires dma region to align to chosen page size */ + page_sz = ib_umem_find_best_pgoff(umem, PAGE_SZ_BM, 0); + if (!page_sz) { + ibdev_dbg(&dev->ib_dev, "Failed to find page size.\n"); + return -EINVAL; + } + + return mana_ib_gd_create_dma_region(dev, umem, gdma_region, page_sz); +} + int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev, u64 gdma_region) { - struct gdma_dev *mdev = dev->gdma_dev; - struct gdma_context *gc; + struct gdma_context *gc = mdev_to_gc(dev); - gc = mdev->gdma_context; ibdev_dbg(&dev->ib_dev, "destroy dma region 0x%llx\n", gdma_region); return mana_gd_destroy_dma_region(gc, gdma_region); @@ -447,7 +498,7 @@ int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) int ret; mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); - gc = mdev->gdma_dev->gdma_context; + gc = mdev_to_gc(mdev); if (vma->vm_pgoff != 0) { ibdev_dbg(ibdev, "Unexpected vm_pgoff %lu\n", vma->vm_pgoff); @@ -460,13 +511,13 @@ int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) PAGE_SHIFT; prot = pgprot_writecombine(vma->vm_page_prot); - ret = rdma_user_mmap_io(ibcontext, vma, pfn, gc->db_page_size, prot, + ret = rdma_user_mmap_io(ibcontext, vma, pfn, PAGE_SIZE, prot, NULL); if (ret) ibdev_dbg(ibdev, "can't rdma_user_mmap_io ret %d\n", ret); else - ibdev_dbg(ibdev, "mapped I/O pfn 0x%llx page_size %u, ret %d\n", - pfn, gc->db_page_size, ret); + ibdev_dbg(ibdev, "mapped I/O pfn 0x%llx page_size %lu, ret %d\n", + pfn, PAGE_SIZE, ret); return ret; } @@ -474,11 +525,18 @@ int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num, struct ib_port_immutable *immutable) { - /* - * This version only support RAW_PACKET - * other values need to be filled for other types - */ + struct ib_port_attr attr; + int err; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET; + if (port_num == 1) + immutable->core_cap_flags |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; return 0; } @@ -489,14 +547,27 @@ int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct mana_ib_dev *dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + memset(props, 0, sizeof(*props)); + props->max_mr_size = MANA_IB_MAX_MR_SIZE; + props->page_size_cap = PAGE_SZ_BM; props->max_qp = dev->adapter_caps.max_qp_count; props->max_qp_wr = dev->adapter_caps.max_qp_wr; + props->device_cap_flags = IB_DEVICE_RC_RNR_NAK_GEN; + props->max_send_sge = dev->adapter_caps.max_send_sge_count; + props->max_recv_sge = dev->adapter_caps.max_recv_sge_count; + props->max_sge_rd = dev->adapter_caps.max_recv_sge_count; props->max_cq = dev->adapter_caps.max_cq_count; props->max_cqe = dev->adapter_caps.max_qp_wr; props->max_mr = dev->adapter_caps.max_mr_count; - props->max_mr_size = MANA_IB_MAX_MR_SIZE; - props->max_send_sge = dev->adapter_caps.max_send_sge_count; - props->max_recv_sge = dev->adapter_caps.max_recv_sge_count; + props->max_pd = dev->adapter_caps.max_pd_count; + props->max_qp_rd_atom = dev->adapter_caps.max_inbound_read_limit; + props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; + props->max_qp_init_rd_atom = dev->adapter_caps.max_outbound_read_limit; + props->atomic_cap = IB_ATOMIC_NONE; + props->masked_atomic_cap = IB_ATOMIC_NONE; + props->max_ah = INT_MAX; + props->max_pkeys = 1; + props->local_ca_ack_delay = MANA_CA_ACK_DELAY; return 0; } @@ -504,7 +575,42 @@ int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, int mana_ib_query_port(struct ib_device *ibdev, u32 port, struct ib_port_attr *props) { - /* This version doesn't return port properties */ + struct net_device *ndev = mana_ib_get_netdev(ibdev, port); + + if (!ndev) + return -EINVAL; + + memset(props, 0, sizeof(*props)); + props->max_mtu = IB_MTU_4096; + props->active_mtu = ib_mtu_int_to_enum(ndev->mtu); + + if (netif_carrier_ok(ndev) && netif_running(ndev)) { + props->state = IB_PORT_ACTIVE; + props->phys_state = IB_PORT_PHYS_STATE_LINK_UP; + } else { + props->state = IB_PORT_DOWN; + props->phys_state = IB_PORT_PHYS_STATE_DISABLED; + } + + props->active_width = IB_WIDTH_4X; + props->active_speed = IB_SPEED_EDR; + props->pkey_tbl_len = 1; + if (port == 1) + props->gid_tbl_len = 16; + + return 0; +} + +enum rdma_link_layer mana_ib_get_link_layer(struct ib_device *device, u32 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +int mana_ib_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey) +{ + if (index != 0) + return -EINVAL; + *pkey = IB_DEFAULT_PKEY_FULL; return 0; } @@ -531,7 +637,7 @@ int mana_ib_gd_query_adapter_caps(struct mana_ib_dev *dev) req.hdr.resp.msg_version = GDMA_MESSAGE_V3; req.hdr.dev_id = dev->gdma_dev->dev_id; - err = mana_gd_send_request(dev->gdma_dev->gdma_context, sizeof(req), + err = mana_gd_send_request(mdev_to_gc(dev), sizeof(req), &req, sizeof(resp), &resp); if (err) { @@ -560,3 +666,324 @@ int mana_ib_gd_query_adapter_caps(struct mana_ib_dev *dev) return 0; } + +static void +mana_ib_event_handler(void *ctx, struct gdma_queue *q, struct gdma_event *event) +{ + struct mana_ib_dev *mdev = (struct mana_ib_dev *)ctx; + struct mana_ib_qp *qp; + struct ib_event ev; + u32 qpn; + + switch (event->type) { + case GDMA_EQE_RNIC_QP_FATAL: + qpn = event->details[0]; + qp = mana_get_qp_ref(mdev, qpn); + if (!qp) + break; + if (qp->ibqp.event_handler) { + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_FATAL; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + mana_put_qp_ref(qp); + break; + default: + break; + } +} + +int mana_ib_create_eqs(struct mana_ib_dev *mdev) +{ + struct gdma_context *gc = mdev_to_gc(mdev); + struct gdma_queue_spec spec = {}; + int err, i; + + spec.type = GDMA_EQ; + spec.monitor_avl_buf = false; + spec.queue_size = EQ_SIZE; + spec.eq.callback = mana_ib_event_handler; + spec.eq.context = mdev; + spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE; + spec.eq.msix_index = 0; + + err = mana_gd_create_mana_eq(&gc->mana_ib, &spec, &mdev->fatal_err_eq); + if (err) + return err; + + mdev->eqs = kcalloc(mdev->ib_dev.num_comp_vectors, sizeof(struct gdma_queue *), + GFP_KERNEL); + if (!mdev->eqs) { + err = -ENOMEM; + goto destroy_fatal_eq; + } + spec.eq.callback = NULL; + for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++) { + spec.eq.msix_index = (i + 1) % gc->num_msix_usable; + err = mana_gd_create_mana_eq(mdev->gdma_dev, &spec, &mdev->eqs[i]); + if (err) + goto destroy_eqs; + } + + return 0; + +destroy_eqs: + while (i-- > 0) + mana_gd_destroy_queue(gc, mdev->eqs[i]); + kfree(mdev->eqs); +destroy_fatal_eq: + mana_gd_destroy_queue(gc, mdev->fatal_err_eq); + return err; +} + +void mana_ib_destroy_eqs(struct mana_ib_dev *mdev) +{ + struct gdma_context *gc = mdev_to_gc(mdev); + int i; + + mana_gd_destroy_queue(gc, mdev->fatal_err_eq); + + for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++) + mana_gd_destroy_queue(gc, mdev->eqs[i]); + + kfree(mdev->eqs); +} + +int mana_ib_gd_create_rnic_adapter(struct mana_ib_dev *mdev) +{ + struct mana_rnic_create_adapter_resp resp = {}; + struct mana_rnic_create_adapter_req req = {}; + struct gdma_context *gc = mdev_to_gc(mdev); + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_ADAPTER, sizeof(req), sizeof(resp)); + req.hdr.req.msg_version = GDMA_MESSAGE_V2; + req.hdr.dev_id = gc->mana_ib.dev_id; + req.notify_eq_id = mdev->fatal_err_eq->id; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create RNIC adapter err %d", err); + return err; + } + mdev->adapter_handle = resp.adapter; + + return 0; +} + +int mana_ib_gd_destroy_rnic_adapter(struct mana_ib_dev *mdev) +{ + struct mana_rnic_destroy_adapter_resp resp = {}; + struct mana_rnic_destroy_adapter_req req = {}; + struct gdma_context *gc; + int err; + + gc = mdev_to_gc(mdev); + mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_ADAPTER, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to destroy RNIC adapter err %d", err); + return err; + } + + return 0; +} + +int mana_ib_gd_add_gid(const struct ib_gid_attr *attr, void **context) +{ + struct mana_ib_dev *mdev = container_of(attr->device, struct mana_ib_dev, ib_dev); + enum rdma_network_type ntype = rdma_gid_attr_network_type(attr); + struct mana_rnic_config_addr_resp resp = {}; + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_rnic_config_addr_req req = {}; + int err; + + if (ntype != RDMA_NETWORK_IPV4 && ntype != RDMA_NETWORK_IPV6) { + ibdev_dbg(&mdev->ib_dev, "Unsupported rdma network type %d", ntype); + return -EINVAL; + } + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CONFIG_IP_ADDR, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + req.op = ADDR_OP_ADD; + req.sgid_type = (ntype == RDMA_NETWORK_IPV6) ? SGID_TYPE_IPV6 : SGID_TYPE_IPV4; + copy_in_reverse(req.ip_addr, attr->gid.raw, sizeof(union ib_gid)); + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to config IP addr err %d\n", err); + return err; + } + + return 0; +} + +int mana_ib_gd_del_gid(const struct ib_gid_attr *attr, void **context) +{ + struct mana_ib_dev *mdev = container_of(attr->device, struct mana_ib_dev, ib_dev); + enum rdma_network_type ntype = rdma_gid_attr_network_type(attr); + struct mana_rnic_config_addr_resp resp = {}; + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_rnic_config_addr_req req = {}; + int err; + + if (ntype != RDMA_NETWORK_IPV4 && ntype != RDMA_NETWORK_IPV6) { + ibdev_dbg(&mdev->ib_dev, "Unsupported rdma network type %d", ntype); + return -EINVAL; + } + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CONFIG_IP_ADDR, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + req.op = ADDR_OP_REMOVE; + req.sgid_type = (ntype == RDMA_NETWORK_IPV6) ? SGID_TYPE_IPV6 : SGID_TYPE_IPV4; + copy_in_reverse(req.ip_addr, attr->gid.raw, sizeof(union ib_gid)); + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to config IP addr err %d\n", err); + return err; + } + + return 0; +} + +int mana_ib_gd_config_mac(struct mana_ib_dev *mdev, enum mana_ib_addr_op op, u8 *mac) +{ + struct mana_rnic_config_mac_addr_resp resp = {}; + struct mana_rnic_config_mac_addr_req req = {}; + struct gdma_context *gc = mdev_to_gc(mdev); + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CONFIG_MAC_ADDR, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + req.op = op; + copy_in_reverse(req.mac_addr, mac, ETH_ALEN); + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to config Mac addr err %d", err); + return err; + } + + return 0; +} + +int mana_ib_gd_create_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq, u32 doorbell) +{ + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_rnic_create_cq_resp resp = {}; + struct mana_rnic_create_cq_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_CQ, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + req.gdma_region = cq->queue.gdma_region; + req.eq_id = mdev->eqs[cq->comp_vector]->id; + req.doorbell_page = doorbell; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create cq err %d", err); + return err; + } + + cq->queue.id = resp.cq_id; + cq->cq_handle = resp.cq_handle; + /* The GDMA region is now owned by the CQ handle */ + cq->queue.gdma_region = GDMA_INVALID_DMA_REGION; + + return 0; +} + +int mana_ib_gd_destroy_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) +{ + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_rnic_destroy_cq_resp resp = {}; + struct mana_rnic_destroy_cq_req req = {}; + int err; + + if (cq->cq_handle == INVALID_MANA_HANDLE) + return 0; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_CQ, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + req.cq_handle = cq->cq_handle; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to destroy cq err %d", err); + return err; + } + + return 0; +} + +int mana_ib_gd_create_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp, + struct ib_qp_init_attr *attr, u32 doorbell, u64 flags) +{ + struct mana_ib_cq *send_cq = container_of(qp->ibqp.send_cq, struct mana_ib_cq, ibcq); + struct mana_ib_cq *recv_cq = container_of(qp->ibqp.recv_cq, struct mana_ib_cq, ibcq); + struct mana_ib_pd *pd = container_of(qp->ibqp.pd, struct mana_ib_pd, ibpd); + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_rnic_create_qp_resp resp = {}; + struct mana_rnic_create_qp_req req = {}; + int err, i; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_RC_QP, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + req.pd_handle = pd->pd_handle; + req.send_cq_handle = send_cq->cq_handle; + req.recv_cq_handle = recv_cq->cq_handle; + for (i = 0; i < MANA_RC_QUEUE_TYPE_MAX; i++) + req.dma_region[i] = qp->rc_qp.queues[i].gdma_region; + req.doorbell_page = doorbell; + req.max_send_wr = attr->cap.max_send_wr; + req.max_recv_wr = attr->cap.max_recv_wr; + req.max_send_sge = attr->cap.max_send_sge; + req.max_recv_sge = attr->cap.max_recv_sge; + req.flags = flags; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create rc qp err %d", err); + return err; + } + qp->qp_handle = resp.rc_qp_handle; + for (i = 0; i < MANA_RC_QUEUE_TYPE_MAX; i++) { + qp->rc_qp.queues[i].id = resp.queue_ids[i]; + /* The GDMA regions are now owned by the RNIC QP handle */ + qp->rc_qp.queues[i].gdma_region = GDMA_INVALID_DMA_REGION; + } + return 0; +} + +int mana_ib_gd_destroy_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + struct mana_rnic_destroy_rc_qp_resp resp = {0}; + struct mana_rnic_destroy_rc_qp_req req = {0}; + struct gdma_context *gc = mdev_to_gc(mdev); + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_RC_QP, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + req.rc_qp_handle = qp->qp_handle; + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to destroy rc qp err %d", err); + return err; + } + return 0; +} diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 6bdc0f5498d5..b53a5b4de908 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -27,6 +27,11 @@ */ #define MANA_IB_MAX_MR 0xFFFFFFu +/* + * The CA timeout is approx. 260ms (4us * 2^(DELAY)) + */ +#define MANA_CA_ACK_DELAY 16 + struct mana_ib_adapter_caps { u32 max_sq_id; u32 max_rq_id; @@ -45,19 +50,27 @@ struct mana_ib_adapter_caps { u32 max_inline_data_size; }; +struct mana_ib_queue { + struct ib_umem *umem; + u64 gdma_region; + u64 id; +}; + struct mana_ib_dev { struct ib_device ib_dev; struct gdma_dev *gdma_dev; + mana_handle_t adapter_handle; + struct gdma_queue *fatal_err_eq; + struct gdma_queue **eqs; + struct xarray qp_table_wq; struct mana_ib_adapter_caps adapter_caps; }; struct mana_ib_wq { struct ib_wq ibwq; - struct ib_umem *umem; + struct mana_ib_queue queue; int wqe; u32 wq_buf_size; - u64 gdma_region; - u64 id; mana_handle_t rx_object; }; @@ -82,25 +95,39 @@ struct mana_ib_mr { struct mana_ib_cq { struct ib_cq ibcq; - struct ib_umem *umem; + struct mana_ib_queue queue; int cqe; - u64 gdma_region; - u64 id; u32 comp_vector; + mana_handle_t cq_handle; +}; + +enum mana_rc_queue_type { + MANA_RC_SEND_QUEUE_REQUESTER = 0, + MANA_RC_SEND_QUEUE_RESPONDER, + MANA_RC_SEND_QUEUE_FMR, + MANA_RC_RECV_QUEUE_REQUESTER, + MANA_RC_RECV_QUEUE_RESPONDER, + MANA_RC_QUEUE_TYPE_MAX, +}; + +struct mana_ib_rc_qp { + struct mana_ib_queue queues[MANA_RC_QUEUE_TYPE_MAX]; }; struct mana_ib_qp { struct ib_qp ibqp; - /* Work queue info */ - struct ib_umem *sq_umem; - int sqe; - u64 sq_gdma_region; - u64 sq_id; - mana_handle_t tx_object; + mana_handle_t qp_handle; + union { + struct mana_ib_queue raw_sq; + struct mana_ib_rc_qp rc_qp; + }; /* The port on the IB device, starting with 1 */ u32 port; + + refcount_t refcount; + struct completion free; }; struct mana_ib_ucontext { @@ -114,6 +141,15 @@ struct mana_ib_rwq_ind_table { enum mana_ib_command_code { MANA_IB_GET_ADAPTER_CAP = 0x30001, + MANA_IB_CREATE_ADAPTER = 0x30002, + MANA_IB_DESTROY_ADAPTER = 0x30003, + MANA_IB_CONFIG_IP_ADDR = 0x30004, + MANA_IB_CONFIG_MAC_ADDR = 0x30005, + MANA_IB_CREATE_CQ = 0x30008, + MANA_IB_DESTROY_CQ = 0x30009, + MANA_IB_CREATE_RC_QP = 0x3000a, + MANA_IB_DESTROY_RC_QP = 0x3000b, + MANA_IB_SET_QP_STATE = 0x3000d, }; struct mana_ib_query_adapter_caps_req { @@ -142,12 +178,220 @@ struct mana_ib_query_adapter_caps_resp { u32 max_inline_data_size; }; /* HW Data */ -int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, - mana_handle_t *gdma_region); +struct mana_rnic_create_adapter_req { + struct gdma_req_hdr hdr; + u32 notify_eq_id; + u32 reserved; + u64 feature_flags; +}; /*HW Data */ + +struct mana_rnic_create_adapter_resp { + struct gdma_resp_hdr hdr; + mana_handle_t adapter; +}; /* HW Data */ + +struct mana_rnic_destroy_adapter_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; +}; /*HW Data */ + +struct mana_rnic_destroy_adapter_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + +enum mana_ib_addr_op { + ADDR_OP_ADD = 1, + ADDR_OP_REMOVE = 2, +}; + +enum sgid_entry_type { + SGID_TYPE_IPV4 = 1, + SGID_TYPE_IPV6 = 2, +}; + +struct mana_rnic_config_addr_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + enum mana_ib_addr_op op; + enum sgid_entry_type sgid_type; + u8 ip_addr[16]; +}; /* HW Data */ + +struct mana_rnic_config_addr_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + +struct mana_rnic_config_mac_addr_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + enum mana_ib_addr_op op; + u8 mac_addr[ETH_ALEN]; + u8 reserved[6]; +}; /* HW Data */ + +struct mana_rnic_config_mac_addr_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + +struct mana_rnic_create_cq_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + u64 gdma_region; + u32 eq_id; + u32 doorbell_page; +}; /* HW Data */ + +struct mana_rnic_create_cq_resp { + struct gdma_resp_hdr hdr; + mana_handle_t cq_handle; + u32 cq_id; + u32 reserved; +}; /* HW Data */ + +struct mana_rnic_destroy_cq_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + mana_handle_t cq_handle; +}; /* HW Data */ + +struct mana_rnic_destroy_cq_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + +enum mana_rnic_create_rc_flags { + MANA_RC_FLAG_NO_FMR = 2, +}; + +struct mana_rnic_create_qp_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + mana_handle_t pd_handle; + mana_handle_t send_cq_handle; + mana_handle_t recv_cq_handle; + u64 dma_region[MANA_RC_QUEUE_TYPE_MAX]; + u64 deprecated[2]; + u64 flags; + u32 doorbell_page; + u32 max_send_wr; + u32 max_recv_wr; + u32 max_send_sge; + u32 max_recv_sge; + u32 reserved; +}; /* HW Data */ + +struct mana_rnic_create_qp_resp { + struct gdma_resp_hdr hdr; + mana_handle_t rc_qp_handle; + u32 queue_ids[MANA_RC_QUEUE_TYPE_MAX]; + u32 reserved; +}; /* HW Data*/ + +struct mana_rnic_destroy_rc_qp_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + mana_handle_t rc_qp_handle; +}; /* HW Data */ + +struct mana_rnic_destroy_rc_qp_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + +struct mana_ib_ah_attr { + u8 src_addr[16]; + u8 dest_addr[16]; + u8 src_mac[ETH_ALEN]; + u8 dest_mac[ETH_ALEN]; + u8 src_addr_type; + u8 dest_addr_type; + u8 hop_limit; + u8 traffic_class; + u16 src_port; + u16 dest_port; + u32 reserved; +}; + +struct mana_rnic_set_qp_state_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + mana_handle_t qp_handle; + u64 attr_mask; + u32 qp_state; + u32 path_mtu; + u32 rq_psn; + u32 sq_psn; + u32 dest_qpn; + u32 max_dest_rd_atomic; + u32 retry_cnt; + u32 rnr_retry; + u32 min_rnr_timer; + u32 reserved; + struct mana_ib_ah_attr ah_attr; +}; /* HW Data */ + +struct mana_rnic_set_qp_state_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + +static inline struct gdma_context *mdev_to_gc(struct mana_ib_dev *mdev) +{ + return mdev->gdma_dev->gdma_context; +} + +static inline struct mana_ib_qp *mana_get_qp_ref(struct mana_ib_dev *mdev, + uint32_t qid) +{ + struct mana_ib_qp *qp; + unsigned long flag; + + xa_lock_irqsave(&mdev->qp_table_wq, flag); + qp = xa_load(&mdev->qp_table_wq, qid); + if (qp) + refcount_inc(&qp->refcount); + xa_unlock_irqrestore(&mdev->qp_table_wq, flag); + return qp; +} + +static inline void mana_put_qp_ref(struct mana_ib_qp *qp) +{ + if (refcount_dec_and_test(&qp->refcount)) + complete(&qp->free); +} + +static inline struct net_device *mana_ib_get_netdev(struct ib_device *ibdev, u32 port) +{ + struct mana_ib_dev *mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_context *mc = gc->mana.driver_data; + + if (port < 1 || port > mc->num_ports) + return NULL; + return mc->ports[port - 1]; +} + +static inline void copy_in_reverse(u8 *dst, const u8 *src, u32 size) +{ + u32 i; + + for (i = 0; i < size; i++) + dst[size - 1 - i] = src[i]; +} + +int mana_ib_install_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq); +void mana_ib_remove_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq); + +int mana_ib_create_zero_offset_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, + mana_handle_t *gdma_region); + +int mana_ib_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, + mana_handle_t *gdma_region, u64 virt); int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev, mana_handle_t gdma_region); +int mana_ib_create_queue(struct mana_ib_dev *mdev, u64 addr, u32 size, + struct mana_ib_queue *queue); +void mana_ib_destroy_queue(struct mana_ib_dev *mdev, struct mana_ib_queue *queue); + struct ib_wq *mana_ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr, struct ib_udata *udata); @@ -185,7 +429,7 @@ void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd, u32 port); int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); @@ -211,5 +455,29 @@ void mana_ib_disassociate_ucontext(struct ib_ucontext *ibcontext); int mana_ib_gd_query_adapter_caps(struct mana_ib_dev *mdev); -void mana_ib_cq_handler(void *ctx, struct gdma_queue *gdma_cq); +int mana_ib_create_eqs(struct mana_ib_dev *mdev); + +void mana_ib_destroy_eqs(struct mana_ib_dev *mdev); + +int mana_ib_gd_create_rnic_adapter(struct mana_ib_dev *mdev); + +int mana_ib_gd_destroy_rnic_adapter(struct mana_ib_dev *mdev); + +int mana_ib_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey); + +enum rdma_link_layer mana_ib_get_link_layer(struct ib_device *device, u32 port_num); + +int mana_ib_gd_add_gid(const struct ib_gid_attr *attr, void **context); + +int mana_ib_gd_del_gid(const struct ib_gid_attr *attr, void **context); + +int mana_ib_gd_config_mac(struct mana_ib_dev *mdev, enum mana_ib_addr_op op, u8 *mac); + +int mana_ib_gd_create_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq, u32 doorbell); + +int mana_ib_gd_destroy_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq); + +int mana_ib_gd_create_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp, + struct ib_qp_init_attr *attr, u32 doorbell, u64 flags); +int mana_ib_gd_destroy_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp); #endif diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c index 351207c60eb6..887b09dd86e7 100644 --- a/drivers/infiniband/hw/mana/mr.c +++ b/drivers/infiniband/hw/mana/mr.c @@ -30,12 +30,9 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr, { struct gdma_create_mr_response resp = {}; struct gdma_create_mr_request req = {}; - struct gdma_dev *mdev = dev->gdma_dev; - struct gdma_context *gc; + struct gdma_context *gc = mdev_to_gc(dev); int err; - gc = mdev->gdma_context; - mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_MR, sizeof(req), sizeof(resp)); req.pd_handle = mr_params->pd_handle; @@ -77,12 +74,9 @@ static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, u64 mr_handle) { struct gdma_destroy_mr_response resp = {}; struct gdma_destroy_mr_request req = {}; - struct gdma_dev *mdev = dev->gdma_dev; - struct gdma_context *gc; + struct gdma_context *gc = mdev_to_gc(dev); int err; - gc = mdev->gdma_context; - mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_MR, sizeof(req), sizeof(resp)); @@ -118,6 +112,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, "start 0x%llx, iova 0x%llx length 0x%llx access_flags 0x%x", start, iova, length, access_flags); + access_flags &= ~IB_ACCESS_OPTIONAL; if (access_flags & ~VALID_MR_FLAGS) return ERR_PTR(-EINVAL); @@ -133,7 +128,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, goto err_free; } - err = mana_ib_gd_create_dma_region(dev, mr->umem, &dma_region_handle); + err = mana_ib_create_dma_region(dev, mr->umem, &dma_region_handle, iova); if (err) { ibdev_dbg(ibdev, "Failed create dma region for user-mr, %d\n", err); @@ -141,7 +136,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, } ibdev_dbg(ibdev, - "mana_ib_gd_create_dma_region ret %d gdma_region %llx\n", err, + "created dma region for user-mr 0x%llx\n", dma_region_handle); mr_params.pd_handle = pd->pd_handle; @@ -164,8 +159,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, return &mr->ibmr; err_dma_region: - mana_gd_destroy_dma_region(dev->gdma_dev->gdma_context, - dma_region_handle); + mana_gd_destroy_dma_region(mdev_to_gc(dev), dma_region_handle); err_umem: ib_umem_release(mr->umem); diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index 21ac9fcadf3f..73d67c853b6f 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -15,17 +15,13 @@ static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev, struct mana_port_context *mpc = netdev_priv(ndev); struct mana_cfg_rx_steer_req_v2 *req; struct mana_cfg_rx_steer_resp resp = {}; - mana_handle_t *req_indir_tab; struct gdma_context *gc; - struct gdma_dev *mdev; u32 req_buf_size; int i, err; - gc = dev->gdma_dev->gdma_context; - mdev = &gc->mana; + gc = mdev_to_gc(dev); - req_buf_size = - sizeof(*req) + sizeof(mana_handle_t) * MANA_INDIRECT_TABLE_SIZE; + req_buf_size = struct_size(req, indir_tab, MANA_INDIRECT_TABLE_DEF_SIZE); req = kzalloc(req_buf_size, GFP_KERNEL); if (!req) return -ENOMEM; @@ -39,27 +35,27 @@ static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev, req->rx_enable = 1; req->update_default_rxobj = 1; req->default_rxobj = default_rxobj; - req->hdr.dev_id = mdev->dev_id; + req->hdr.dev_id = gc->mana.dev_id; /* If there are more than 1 entries in indirection table, enable RSS */ if (log_ind_tbl_size) req->rss_enable = true; - req->num_indir_entries = MANA_INDIRECT_TABLE_SIZE; - req->indir_tab_offset = sizeof(*req); + req->num_indir_entries = MANA_INDIRECT_TABLE_DEF_SIZE; + req->indir_tab_offset = offsetof(struct mana_cfg_rx_steer_req_v2, + indir_tab); req->update_indir_tab = true; req->cqe_coalescing_enable = 1; - req_indir_tab = (mana_handle_t *)(req + 1); /* The ind table passed to the hardware must have - * MANA_INDIRECT_TABLE_SIZE entries. Adjust the verb + * MANA_INDIRECT_TABLE_DEF_SIZE entries. Adjust the verb * ind_table to MANA_INDIRECT_TABLE_SIZE if required */ ibdev_dbg(&dev->ib_dev, "ind table size %u\n", 1 << log_ind_tbl_size); - for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++) { - req_indir_tab[i] = ind_table[i % (1 << log_ind_tbl_size)]; + for (i = 0; i < MANA_INDIRECT_TABLE_DEF_SIZE; i++) { + req->indir_tab[i] = ind_table[i % (1 << log_ind_tbl_size)]; ibdev_dbg(&dev->ib_dev, "index %u handle 0x%llx\n", i, - req_indir_tab[i]); + req->indir_tab[i]); } req->update_hashkey = true; @@ -102,17 +98,12 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, struct ib_rwq_ind_table *ind_tbl = attr->rwq_ind_tbl; struct mana_ib_create_qp_rss_resp resp = {}; struct mana_ib_create_qp_rss ucmd = {}; - struct gdma_queue **gdma_cq_allocated; mana_handle_t *mana_ind_table; struct mana_port_context *mpc; - struct gdma_queue *gdma_cq; unsigned int ind_tbl_size; - struct mana_context *mc; struct net_device *ndev; - struct gdma_context *gc; struct mana_ib_cq *cq; struct mana_ib_wq *wq; - struct gdma_dev *gd; struct mana_eq *eq; struct ib_cq *ibcq; struct ib_wq *ibwq; @@ -120,10 +111,6 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, u32 port; int ret; - gc = mdev->gdma_dev->gdma_context; - gd = &gc->mana; - mc = gd->driver_data; - if (!udata || udata->inlen < sizeof(ucmd)) return -EINVAL; @@ -150,7 +137,7 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, } ind_tbl_size = 1 << ind_tbl->log_ind_tbl_size; - if (ind_tbl_size > MANA_INDIRECT_TABLE_SIZE) { + if (ind_tbl_size > MANA_INDIRECT_TABLE_DEF_SIZE) { ibdev_dbg(&mdev->ib_dev, "Indirect table size %d exceeding limit\n", ind_tbl_size); @@ -166,12 +153,12 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, /* IB ports start with 1, MANA start with 0 */ port = ucmd.port; - if (port < 1 || port > mc->num_ports) { + ndev = mana_ib_get_netdev(pd->device, port); + if (!ndev) { ibdev_dbg(&mdev->ib_dev, "Invalid port %u in creating qp\n", port); return -EINVAL; } - ndev = mc->ports[port - 1]; mpc = netdev_priv(ndev); ibdev_dbg(&mdev->ib_dev, "rx_hash_function %d port %d\n", @@ -184,13 +171,6 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, goto fail; } - gdma_cq_allocated = kcalloc(ind_tbl_size, sizeof(*gdma_cq_allocated), - GFP_KERNEL); - if (!gdma_cq_allocated) { - ret = -ENOMEM; - goto fail; - } - qp->port = port; for (i = 0; i < ind_tbl_size; i++) { @@ -203,13 +183,13 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, ibcq = ibwq->cq; cq = container_of(ibcq, struct mana_ib_cq, ibcq); - wq_spec.gdma_region = wq->gdma_region; + wq_spec.gdma_region = wq->queue.gdma_region; wq_spec.queue_size = wq->wq_buf_size; - cq_spec.gdma_region = cq->gdma_region; + cq_spec.gdma_region = cq->queue.gdma_region; cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE; cq_spec.modr_ctx_id = 0; - eq = &mc->eqs[cq->comp_vector % gc->max_num_queues]; + eq = &mpc->ac->eqs[cq->comp_vector]; cq_spec.attached_eq = eq->eq->id; ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ, @@ -221,35 +201,25 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, } /* The GDMA regions are now owned by the WQ object */ - wq->gdma_region = GDMA_INVALID_DMA_REGION; - cq->gdma_region = GDMA_INVALID_DMA_REGION; + wq->queue.gdma_region = GDMA_INVALID_DMA_REGION; + cq->queue.gdma_region = GDMA_INVALID_DMA_REGION; - wq->id = wq_spec.queue_index; - cq->id = cq_spec.queue_index; + wq->queue.id = wq_spec.queue_index; + cq->queue.id = cq_spec.queue_index; ibdev_dbg(&mdev->ib_dev, - "ret %d rx_object 0x%llx wq id %llu cq id %llu\n", - ret, wq->rx_object, wq->id, cq->id); + "rx_object 0x%llx wq id %llu cq id %llu\n", + wq->rx_object, wq->queue.id, cq->queue.id); - resp.entries[i].cqid = cq->id; - resp.entries[i].wqid = wq->id; + resp.entries[i].cqid = cq->queue.id; + resp.entries[i].wqid = wq->queue.id; mana_ind_table[i] = wq->rx_object; /* Create CQ table entry */ - WARN_ON(gc->cq_table[cq->id]); - gdma_cq = kzalloc(sizeof(*gdma_cq), GFP_KERNEL); - if (!gdma_cq) { - ret = -ENOMEM; + ret = mana_ib_install_cq_cb(mdev, cq); + if (ret) goto fail; - } - gdma_cq_allocated[i] = gdma_cq; - - gdma_cq->cq.context = cq; - gdma_cq->type = GDMA_CQ; - gdma_cq->cq.callback = mana_ib_cq_handler; - gdma_cq->id = cq->id; - gc->cq_table[cq->id] = gdma_cq; } resp.num_entries = i; @@ -269,7 +239,6 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, goto fail; } - kfree(gdma_cq_allocated); kfree(mana_ind_table); return 0; @@ -281,13 +250,10 @@ fail: wq = container_of(ibwq, struct mana_ib_wq, ibwq); cq = container_of(ibcq, struct mana_ib_cq, ibcq); - gc->cq_table[cq->id] = NULL; - kfree(gdma_cq_allocated[i]); - + mana_ib_remove_cq_cb(mdev, cq); mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object); } - kfree(gdma_cq_allocated); kfree(mana_ind_table); return ret; @@ -306,23 +272,17 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, struct mana_ib_ucontext *mana_ucontext = rdma_udata_to_drv_context(udata, struct mana_ib_ucontext, ibucontext); - struct gdma_dev *gd = &mdev->gdma_dev->gdma_context->mana; struct mana_ib_create_qp_resp resp = {}; struct mana_ib_create_qp ucmd = {}; - struct gdma_queue *gdma_cq = NULL; struct mana_obj_spec wq_spec = {}; struct mana_obj_spec cq_spec = {}; struct mana_port_context *mpc; - struct mana_context *mc; struct net_device *ndev; - struct ib_umem *umem; struct mana_eq *eq; int eq_vec; u32 port; int err; - mc = gd->driver_data; - if (!mana_ucontext || udata->inlen < sizeof(ucmd)) return -EINVAL; @@ -333,11 +293,6 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, return err; } - /* IB ports start with 1, MANA Ethernet ports start with 0 */ - port = ucmd.port; - if (port < 1 || port > mc->num_ports) - return -EINVAL; - if (attr->cap.max_send_wr > mdev->adapter_caps.max_qp_wr) { ibdev_dbg(&mdev->ib_dev, "Requested max_send_wr %d exceeding limit\n", @@ -352,11 +307,17 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, return -EINVAL; } - ndev = mc->ports[port - 1]; + port = ucmd.port; + ndev = mana_ib_get_netdev(ibpd->device, port); + if (!ndev) { + ibdev_dbg(&mdev->ib_dev, "Invalid port %u in creating qp\n", + port); + return -EINVAL; + } mpc = netdev_priv(ndev); ibdev_dbg(&mdev->ib_dev, "port %u ndev %p mpc %p\n", port, ndev, mpc); - err = mana_ib_cfg_vport(mdev, port - 1, pd, mana_ucontext->doorbell); + err = mana_ib_cfg_vport(mdev, port, pd, mana_ucontext->doorbell); if (err) return -ENODEV; @@ -365,77 +326,51 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, ibdev_dbg(&mdev->ib_dev, "ucmd sq_buf_addr 0x%llx port %u\n", ucmd.sq_buf_addr, ucmd.port); - umem = ib_umem_get(ibpd->device, ucmd.sq_buf_addr, ucmd.sq_buf_size, - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(umem)) { - err = PTR_ERR(umem); - ibdev_dbg(&mdev->ib_dev, - "Failed to get umem for create qp-raw, err %d\n", - err); - goto err_free_vport; - } - qp->sq_umem = umem; - - err = mana_ib_gd_create_dma_region(mdev, qp->sq_umem, - &qp->sq_gdma_region); + err = mana_ib_create_queue(mdev, ucmd.sq_buf_addr, ucmd.sq_buf_size, &qp->raw_sq); if (err) { ibdev_dbg(&mdev->ib_dev, - "Failed to create dma region for create qp-raw, %d\n", - err); - goto err_release_umem; + "Failed to create queue for create qp-raw, err %d\n", err); + goto err_free_vport; } - ibdev_dbg(&mdev->ib_dev, - "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n", - err, qp->sq_gdma_region); - /* Create a WQ on the same port handle used by the Ethernet */ - wq_spec.gdma_region = qp->sq_gdma_region; + wq_spec.gdma_region = qp->raw_sq.gdma_region; wq_spec.queue_size = ucmd.sq_buf_size; - cq_spec.gdma_region = send_cq->gdma_region; + cq_spec.gdma_region = send_cq->queue.gdma_region; cq_spec.queue_size = send_cq->cqe * COMP_ENTRY_SIZE; cq_spec.modr_ctx_id = 0; - eq_vec = send_cq->comp_vector % gd->gdma_context->max_num_queues; - eq = &mc->eqs[eq_vec]; + eq_vec = send_cq->comp_vector; + eq = &mpc->ac->eqs[eq_vec]; cq_spec.attached_eq = eq->eq->id; err = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_SQ, &wq_spec, - &cq_spec, &qp->tx_object); + &cq_spec, &qp->qp_handle); if (err) { ibdev_dbg(&mdev->ib_dev, "Failed to create wq for create raw-qp, err %d\n", err); - goto err_destroy_dma_region; + goto err_destroy_queue; } /* The GDMA regions are now owned by the WQ object */ - qp->sq_gdma_region = GDMA_INVALID_DMA_REGION; - send_cq->gdma_region = GDMA_INVALID_DMA_REGION; + qp->raw_sq.gdma_region = GDMA_INVALID_DMA_REGION; + send_cq->queue.gdma_region = GDMA_INVALID_DMA_REGION; - qp->sq_id = wq_spec.queue_index; - send_cq->id = cq_spec.queue_index; + qp->raw_sq.id = wq_spec.queue_index; + send_cq->queue.id = cq_spec.queue_index; /* Create CQ table entry */ - WARN_ON(gd->gdma_context->cq_table[send_cq->id]); - gdma_cq = kzalloc(sizeof(*gdma_cq), GFP_KERNEL); - if (!gdma_cq) { - err = -ENOMEM; + err = mana_ib_install_cq_cb(mdev, send_cq); + if (err) goto err_destroy_wq_obj; - } - - gdma_cq->cq.context = send_cq; - gdma_cq->type = GDMA_CQ; - gdma_cq->cq.callback = mana_ib_cq_handler; - gdma_cq->id = send_cq->id; - gd->gdma_context->cq_table[send_cq->id] = gdma_cq; ibdev_dbg(&mdev->ib_dev, - "ret %d qp->tx_object 0x%llx sq id %llu cq id %llu\n", err, - qp->tx_object, qp->sq_id, send_cq->id); + "qp->qp_handle 0x%llx sq id %llu cq id %llu\n", + qp->qp_handle, qp->raw_sq.id, send_cq->queue.id); - resp.sqid = qp->sq_id; - resp.cqid = send_cq->id; + resp.sqid = qp->raw_sq.id; + resp.cqid = send_cq->queue.id; resp.tx_vp_offset = pd->tx_vp_offset; err = ib_copy_to_udata(udata, &resp, sizeof(resp)); @@ -443,27 +378,115 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, ibdev_dbg(&mdev->ib_dev, "Failed copy udata for create qp-raw, %d\n", err); - goto err_release_gdma_cq; + goto err_remove_cq_cb; } return 0; -err_release_gdma_cq: - kfree(gdma_cq); - gd->gdma_context->cq_table[send_cq->id] = NULL; +err_remove_cq_cb: + mana_ib_remove_cq_cb(mdev, send_cq); err_destroy_wq_obj: - mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object); - -err_destroy_dma_region: - mana_ib_gd_destroy_dma_region(mdev, qp->sq_gdma_region); + mana_destroy_wq_obj(mpc, GDMA_SQ, qp->qp_handle); -err_release_umem: - ib_umem_release(umem); +err_destroy_queue: + mana_ib_destroy_queue(mdev, &qp->raw_sq); err_free_vport: - mana_ib_uncfg_vport(mdev, pd, port - 1); + mana_ib_uncfg_vport(mdev, pd, port); + + return err; +} + +static int mana_table_store_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp) +{ + refcount_set(&qp->refcount, 1); + init_completion(&qp->free); + return xa_insert_irq(&mdev->qp_table_wq, qp->ibqp.qp_num, qp, + GFP_KERNEL); +} + +static void mana_table_remove_qp(struct mana_ib_dev *mdev, + struct mana_ib_qp *qp) +{ + xa_erase_irq(&mdev->qp_table_wq, qp->ibqp.qp_num); + mana_put_qp_ref(qp); + wait_for_completion(&qp->free); +} + +static int mana_ib_create_rc_qp(struct ib_qp *ibqp, struct ib_pd *ibpd, + struct ib_qp_init_attr *attr, struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = container_of(ibpd->device, struct mana_ib_dev, ib_dev); + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + struct mana_ib_create_rc_qp_resp resp = {}; + struct mana_ib_ucontext *mana_ucontext; + struct mana_ib_create_rc_qp ucmd = {}; + int i, err, j; + u64 flags = 0; + u32 doorbell; + + if (!udata || udata->inlen < sizeof(ucmd)) + return -EINVAL; + + mana_ucontext = rdma_udata_to_drv_context(udata, struct mana_ib_ucontext, ibucontext); + doorbell = mana_ucontext->doorbell; + flags = MANA_RC_FLAG_NO_FMR; + err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (err) { + ibdev_dbg(&mdev->ib_dev, "Failed to copy from udata, %d\n", err); + return err; + } + for (i = 0, j = 0; i < MANA_RC_QUEUE_TYPE_MAX; ++i) { + /* skip FMR for user-level RC QPs */ + if (i == MANA_RC_SEND_QUEUE_FMR) { + qp->rc_qp.queues[i].id = INVALID_QUEUE_ID; + qp->rc_qp.queues[i].gdma_region = GDMA_INVALID_DMA_REGION; + continue; + } + err = mana_ib_create_queue(mdev, ucmd.queue_buf[j], ucmd.queue_size[j], + &qp->rc_qp.queues[i]); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create queue %d, err %d\n", i, err); + goto destroy_queues; + } + j++; + } + + err = mana_ib_gd_create_rc_qp(mdev, qp, attr, doorbell, flags); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create rc qp %d\n", err); + goto destroy_queues; + } + qp->ibqp.qp_num = qp->rc_qp.queues[MANA_RC_RECV_QUEUE_RESPONDER].id; + qp->port = attr->port_num; + + if (udata) { + for (i = 0, j = 0; i < MANA_RC_QUEUE_TYPE_MAX; ++i) { + if (i == MANA_RC_SEND_QUEUE_FMR) + continue; + resp.queue_id[j] = qp->rc_qp.queues[i].id; + j++; + } + err = ib_copy_to_udata(udata, &resp, min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(&mdev->ib_dev, "Failed to copy to udata, %d\n", err); + goto destroy_qp; + } + } + + err = mana_table_store_qp(mdev, qp); + if (err) + goto destroy_qp; + + return 0; + +destroy_qp: + mana_ib_gd_destroy_rc_qp(mdev, qp); +destroy_queues: + while (i-- > 0) + mana_ib_destroy_queue(mdev, &qp->rc_qp.queues[i]); return err; } @@ -478,8 +501,9 @@ int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, udata); return mana_ib_create_qp_raw(ibqp, ibqp->pd, attr, udata); + case IB_QPT_RC: + return mana_ib_create_rc_qp(ibqp, ibqp->pd, attr, udata); default: - /* Creating QP other than IB_QPT_RAW_PACKET is not supported */ ibdev_dbg(ibqp->device, "Creating QP type %u not supported\n", attr->qp_type); } @@ -487,11 +511,79 @@ int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, return -EINVAL; } +static int mana_ib_gd_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = container_of(ibqp->device, struct mana_ib_dev, ib_dev); + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + struct mana_rnic_set_qp_state_resp resp = {}; + struct mana_rnic_set_qp_state_req req = {}; + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_port_context *mpc; + struct net_device *ndev; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_SET_QP_STATE, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + req.qp_handle = qp->qp_handle; + req.qp_state = attr->qp_state; + req.attr_mask = attr_mask; + req.path_mtu = attr->path_mtu; + req.rq_psn = attr->rq_psn; + req.sq_psn = attr->sq_psn; + req.dest_qpn = attr->dest_qp_num; + req.max_dest_rd_atomic = attr->max_dest_rd_atomic; + req.retry_cnt = attr->retry_cnt; + req.rnr_retry = attr->rnr_retry; + req.min_rnr_timer = attr->min_rnr_timer; + if (attr_mask & IB_QP_AV) { + ndev = mana_ib_get_netdev(&mdev->ib_dev, ibqp->port); + if (!ndev) { + ibdev_dbg(&mdev->ib_dev, "Invalid port %u in QP %u\n", + ibqp->port, ibqp->qp_num); + return -EINVAL; + } + mpc = netdev_priv(ndev); + copy_in_reverse(req.ah_attr.src_mac, mpc->mac_addr, ETH_ALEN); + copy_in_reverse(req.ah_attr.dest_mac, attr->ah_attr.roce.dmac, ETH_ALEN); + copy_in_reverse(req.ah_attr.src_addr, attr->ah_attr.grh.sgid_attr->gid.raw, + sizeof(union ib_gid)); + copy_in_reverse(req.ah_attr.dest_addr, attr->ah_attr.grh.dgid.raw, + sizeof(union ib_gid)); + if (rdma_gid_attr_network_type(attr->ah_attr.grh.sgid_attr) == RDMA_NETWORK_IPV4) { + req.ah_attr.src_addr_type = SGID_TYPE_IPV4; + req.ah_attr.dest_addr_type = SGID_TYPE_IPV4; + } else { + req.ah_attr.src_addr_type = SGID_TYPE_IPV6; + req.ah_attr.dest_addr_type = SGID_TYPE_IPV6; + } + req.ah_attr.dest_port = ROCE_V2_UDP_DPORT; + req.ah_attr.src_port = rdma_get_udp_sport(attr->ah_attr.grh.flow_label, + ibqp->qp_num, attr->dest_qp_num); + req.ah_attr.traffic_class = attr->ah_attr.grh.traffic_class; + req.ah_attr.hop_limit = attr->ah_attr.grh.hop_limit; + } + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed modify qp err %d", err); + return err; + } + + return 0; +} + int mana_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { - /* modify_qp is not supported by this version of the driver */ - return -EOPNOTSUPP; + switch (ibqp->qp_type) { + case IB_QPT_RC: + return mana_ib_gd_modify_qp(ibqp, attr, attr_mask, udata); + default: + ibdev_dbg(ibqp->device, "Modify QP type %u not supported", ibqp->qp_type); + return -EOPNOTSUPP; + } } static int mana_ib_destroy_qp_rss(struct mana_ib_qp *qp, @@ -500,16 +592,13 @@ static int mana_ib_destroy_qp_rss(struct mana_ib_qp *qp, { struct mana_ib_dev *mdev = container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); - struct gdma_dev *gd = &mdev->gdma_dev->gdma_context->mana; struct mana_port_context *mpc; - struct mana_context *mc; struct net_device *ndev; struct mana_ib_wq *wq; struct ib_wq *ibwq; int i; - mc = gd->driver_data; - ndev = mc->ports[qp->port - 1]; + ndev = mana_ib_get_netdev(qp->ibqp.device, qp->port); mpc = netdev_priv(ndev); for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) { @@ -527,26 +616,38 @@ static int mana_ib_destroy_qp_raw(struct mana_ib_qp *qp, struct ib_udata *udata) { struct mana_ib_dev *mdev = container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); - struct gdma_dev *gd = &mdev->gdma_dev->gdma_context->mana; struct ib_pd *ibpd = qp->ibqp.pd; struct mana_port_context *mpc; - struct mana_context *mc; struct net_device *ndev; struct mana_ib_pd *pd; - mc = gd->driver_data; - ndev = mc->ports[qp->port - 1]; + ndev = mana_ib_get_netdev(qp->ibqp.device, qp->port); mpc = netdev_priv(ndev); pd = container_of(ibpd, struct mana_ib_pd, ibpd); - mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object); + mana_destroy_wq_obj(mpc, GDMA_SQ, qp->qp_handle); - if (qp->sq_umem) { - mana_ib_gd_destroy_dma_region(mdev, qp->sq_gdma_region); - ib_umem_release(qp->sq_umem); - } + mana_ib_destroy_queue(mdev, &qp->raw_sq); - mana_ib_uncfg_vport(mdev, pd, qp->port - 1); + mana_ib_uncfg_vport(mdev, pd, qp->port); + + return 0; +} + +static int mana_ib_destroy_rc_qp(struct mana_ib_qp *qp, struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = + container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); + int i; + + mana_table_remove_qp(mdev, qp); + + /* Ignore return code as there is not much we can do about it. + * The error message is printed inside. + */ + mana_ib_gd_destroy_rc_qp(mdev, qp); + for (i = 0; i < MANA_RC_QUEUE_TYPE_MAX; ++i) + mana_ib_destroy_queue(mdev, &qp->rc_qp.queues[i]); return 0; } @@ -562,7 +663,8 @@ int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) udata); return mana_ib_destroy_qp_raw(qp, udata); - + case IB_QPT_RC: + return mana_ib_destroy_rc_qp(qp, udata); default: ibdev_dbg(ibqp->device, "Unexpected QP type %u\n", ibqp->qp_type); diff --git a/drivers/infiniband/hw/mana/wq.c b/drivers/infiniband/hw/mana/wq.c index 372d361510e0..f959f4b9244f 100644 --- a/drivers/infiniband/hw/mana/wq.c +++ b/drivers/infiniband/hw/mana/wq.c @@ -13,7 +13,6 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd, container_of(pd->device, struct mana_ib_dev, ib_dev); struct mana_ib_create_wq ucmd = {}; struct mana_ib_wq *wq; - struct ib_umem *umem; int err; if (udata->inlen < sizeof(ucmd)) @@ -32,39 +31,18 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd, ibdev_dbg(&mdev->ib_dev, "ucmd wq_buf_addr 0x%llx\n", ucmd.wq_buf_addr); - umem = ib_umem_get(pd->device, ucmd.wq_buf_addr, ucmd.wq_buf_size, - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(umem)) { - err = PTR_ERR(umem); + err = mana_ib_create_queue(mdev, ucmd.wq_buf_addr, ucmd.wq_buf_size, &wq->queue); + if (err) { ibdev_dbg(&mdev->ib_dev, - "Failed to get umem for create wq, err %d\n", err); + "Failed to create queue for create wq, %d\n", err); goto err_free_wq; } - wq->umem = umem; wq->wqe = init_attr->max_wr; wq->wq_buf_size = ucmd.wq_buf_size; wq->rx_object = INVALID_MANA_HANDLE; - - err = mana_ib_gd_create_dma_region(mdev, wq->umem, &wq->gdma_region); - if (err) { - ibdev_dbg(&mdev->ib_dev, - "Failed to create dma region for create wq, %d\n", - err); - goto err_release_umem; - } - - ibdev_dbg(&mdev->ib_dev, - "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n", - err, wq->gdma_region); - - /* WQ ID is returned at wq_create time, doesn't know the value yet */ - return &wq->ibwq; -err_release_umem: - ib_umem_release(umem); - err_free_wq: kfree(wq); @@ -86,8 +64,7 @@ int mana_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata) mdev = container_of(ib_dev, struct mana_ib_dev, ib_dev); - mana_ib_gd_destroy_dma_region(mdev, wq->gdma_region); - ib_umem_release(wq->umem); + mana_ib_destroy_queue(mdev, &wq->queue); kfree(wq); diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index 111fa88a3be4..d7327735b8d0 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -829,7 +829,6 @@ void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev) int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev) { - char alias_wq_name[15]; int ret = 0; int i, j; union ib_gid gid; @@ -875,9 +874,8 @@ int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev) dev->sriov.alias_guid.ports_guid[i].parent = &dev->sriov.alias_guid; dev->sriov.alias_guid.ports_guid[i].port = i; - snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i); dev->sriov.alias_guid.ports_guid[i].wq = - alloc_ordered_workqueue(alias_wq_name, WQ_MEM_RECLAIM); + alloc_ordered_workqueue("alias_guid%d", WQ_MEM_RECLAIM, i); if (!dev->sriov.alias_guid.ports_guid[i].wq) { ret = -ENOMEM; goto err_thread; diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 4cd738aae53c..c592374f4a58 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -150,8 +150,12 @@ static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, return PTR_ERR(*umem); shift = mlx4_ib_umem_calc_optimal_mtt_size(*umem, 0, &n); - err = mlx4_mtt_init(dev->dev, n, shift, &buf->mtt); + if (shift < 0) { + err = shift; + goto err_buf; + } + err = mlx4_mtt_init(dev->dev, n, shift, &buf->mtt); if (err) goto err_buf; @@ -172,8 +176,9 @@ err_buf: #define CQ_CREATE_FLAGS_SUPPORTED IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; int vector = attr->comp_vector; diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index a37cfac5e23f..e6e132f10625 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -2158,7 +2158,6 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, struct mlx4_ib_demux_ctx *ctx, int port) { - char name[12]; int ret = 0; int i; @@ -2194,24 +2193,21 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, goto err_mcg; } - snprintf(name, sizeof(name), "mlx4_ibt%d", port); - ctx->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + ctx->wq = alloc_ordered_workqueue("mlx4_ibt%d", WQ_MEM_RECLAIM, port); if (!ctx->wq) { pr_err("Failed to create tunnelling WQ for port %d\n", port); ret = -ENOMEM; goto err_wq; } - snprintf(name, sizeof(name), "mlx4_ibwi%d", port); - ctx->wi_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + ctx->wi_wq = alloc_ordered_workqueue("mlx4_ibwi%d", WQ_MEM_RECLAIM, port); if (!ctx->wi_wq) { pr_err("Failed to create wire WQ for port %d\n", port); ret = -ENOMEM; goto err_wiwq; } - snprintf(name, sizeof(name), "mlx4_ibud%d", port); - ctx->ud_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + ctx->ud_wq = alloc_ordered_workqueue("mlx4_ibud%d", WQ_MEM_RECLAIM, port); if (!ctx->ud_wq) { pr_err("Failed to create up/down WQ for port %d\n", port); ret = -ENOMEM; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 529db874d67c..dd35e03402ab 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -351,7 +351,7 @@ static int mlx4_ib_del_gid(const struct ib_gid_attr *attr, void **context) struct mlx4_port_gid_table *port_gid_table; int ret = 0; int hw_update = 0; - struct gid_entry *gids; + struct gid_entry *gids = NULL; if (!rdma_cap_roce_gid_table(attr->device, attr->port_num)) return -EINVAL; @@ -389,10 +389,10 @@ static int mlx4_ib_del_gid(const struct ib_gid_attr *attr, void **context) } spin_unlock_bh(&iboe->lock); - if (!ret && hw_update) { + if (gids) ret = mlx4_ib_update_gids(gids, ibdev, attr->port_num); - kfree(gids); - } + + kfree(gids); return ret; } @@ -2341,37 +2341,38 @@ static void mlx4_ib_scan_netdev(struct mlx4_ib_dev *ibdev, iboe->netdevs[dev->dev_port] = event != NETDEV_UNREGISTER ? dev : NULL; - if (event == NETDEV_UP || event == NETDEV_DOWN) { - enum ib_port_state port_state; - struct ib_event ibev = { }; + spin_unlock_bh(&iboe->lock); - if (ib_get_cached_port_state(&ibdev->ib_dev, dev->dev_port + 1, - &port_state)) - goto iboe_out; + if (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER) + mlx4_ib_update_qps(ibdev, dev, dev->dev_port + 1); +} - if (event == NETDEV_UP && - (port_state != IB_PORT_ACTIVE || - iboe->last_port_state[dev->dev_port] != IB_PORT_DOWN)) - goto iboe_out; - if (event == NETDEV_DOWN && - (port_state != IB_PORT_DOWN || - iboe->last_port_state[dev->dev_port] != IB_PORT_ACTIVE)) - goto iboe_out; - iboe->last_port_state[dev->dev_port] = port_state; +static void mlx4_ib_port_event(struct ib_device *ibdev, struct net_device *ndev, + unsigned long event) +{ + struct mlx4_ib_dev *mlx4_ibdev = + container_of(ibdev, struct mlx4_ib_dev, ib_dev); + struct mlx4_ib_iboe *iboe = &mlx4_ibdev->iboe; - ibev.device = &ibdev->ib_dev; - ibev.element.port_num = dev->dev_port + 1; - ibev.event = event == NETDEV_UP ? IB_EVENT_PORT_ACTIVE : - IB_EVENT_PORT_ERR; - ib_dispatch_event(&ibev); - } + if (!net_eq(dev_net(ndev), &init_net)) + return; + + ASSERT_RTNL(); + + if (ndev->dev.parent != mlx4_ibdev->ib_dev.dev.parent) + return; + + spin_lock_bh(&iboe->lock); + + iboe->netdevs[ndev->dev_port] = event != NETDEV_UNREGISTER ? ndev : NULL; + + if (event == NETDEV_UP || event == NETDEV_DOWN) + ib_dispatch_port_state_event(&mlx4_ibdev->ib_dev, ndev); -iboe_out: spin_unlock_bh(&iboe->lock); - if (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER || - event == NETDEV_UP || event == NETDEV_CHANGE) - mlx4_ib_update_qps(ibdev, dev, dev->dev_port + 1); + if (event == NETDEV_UP || event == NETDEV_CHANGE) + mlx4_ib_update_qps(mlx4_ibdev, ndev, ndev->dev_port + 1); } static int mlx4_ib_netdev_event(struct notifier_block *this, @@ -2569,6 +2570,7 @@ static const struct ib_device_ops mlx4_ib_dev_ops = { .req_notify_cq = mlx4_ib_arm_cq, .rereg_user_mr = mlx4_ib_rereg_user_mr, .resize_cq = mlx4_ib_resize_cq, + .report_port_event = mlx4_ib_port_event, INIT_RDMA_OBJ_SIZE(ib_ah, mlx4_ib_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_cq, mlx4_ib_cq, ibcq), diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 41ca1114a995..f53b1846594c 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -667,6 +667,9 @@ struct mlx4_uverbs_ex_query_device { __u32 reserved; }; +/* 4k - 4G */ +#define MLX4_PAGE_SIZE_SUPPORTED ((unsigned long)GENMASK_ULL(31, 12)) + static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) { return container_of(ibdev, struct mlx4_ib_dev, ib_dev); @@ -767,7 +770,7 @@ int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); @@ -936,8 +939,19 @@ mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table) { return 0; } -int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va, - int *num_of_mtts); +static inline int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, + u64 start, + int *num_of_mtts) +{ + unsigned long pg_sz; + + pg_sz = ib_umem_find_best_pgsz(umem, MLX4_PAGE_SIZE_SUPPORTED, start); + if (!pg_sz) + return -EOPNOTSUPP; + + *num_of_mtts = ib_umem_num_dma_blocks(umem, pg_sz); + return order_base_2(pg_sz); +} int mlx4_ib_cm_init(void); void mlx4_ib_cm_destroy(void); diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index a40bf58bcdd3..e77645a673fb 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -87,286 +87,20 @@ err_free: return ERR_PTR(err); } -enum { - MLX4_MAX_MTT_SHIFT = 31 -}; - -static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev, - struct mlx4_mtt *mtt, - u64 mtt_size, u64 mtt_shift, u64 len, - u64 cur_start_addr, u64 *pages, - int *start_index, int *npages) -{ - u64 cur_end_addr = cur_start_addr + len; - u64 cur_end_addr_aligned = 0; - u64 mtt_entries; - int err = 0; - int k; - - len += (cur_start_addr & (mtt_size - 1ULL)); - cur_end_addr_aligned = round_up(cur_end_addr, mtt_size); - len += (cur_end_addr_aligned - cur_end_addr); - if (len & (mtt_size - 1ULL)) { - pr_warn("write_block: len %llx is not aligned to mtt_size %llx\n", - len, mtt_size); - return -EINVAL; - } - - mtt_entries = (len >> mtt_shift); - - /* - * Align the MTT start address to the mtt_size. - * Required to handle cases when the MR starts in the middle of an MTT - * record. Was not required in old code since the physical addresses - * provided by the dma subsystem were page aligned, which was also the - * MTT size. - */ - cur_start_addr = round_down(cur_start_addr, mtt_size); - /* A new block is started ... */ - for (k = 0; k < mtt_entries; ++k) { - pages[*npages] = cur_start_addr + (mtt_size * k); - (*npages)++; - /* - * Be friendly to mlx4_write_mtt() and pass it chunks of - * appropriate size. - */ - if (*npages == PAGE_SIZE / sizeof(u64)) { - err = mlx4_write_mtt(dev->dev, mtt, *start_index, - *npages, pages); - if (err) - return err; - - (*start_index) += *npages; - *npages = 0; - } - } - - return 0; -} - -static inline u64 alignment_of(u64 ptr) -{ - return ilog2(ptr & (~(ptr - 1))); -} - -static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start, - u64 current_block_end, - u64 block_shift) -{ - /* Check whether the alignment of the new block is aligned as well as - * the previous block. - * Block address must start with zeros till size of entity_size. - */ - if ((next_block_start & ((1ULL << block_shift) - 1ULL)) != 0) - /* - * It is not as well aligned as the previous block-reduce the - * mtt size accordingly. Here we take the last right bit which - * is 1. - */ - block_shift = alignment_of(next_block_start); - - /* - * Check whether the alignment of the end of previous block - is it - * aligned as well as the start of the block - */ - if (((current_block_end) & ((1ULL << block_shift) - 1ULL)) != 0) - /* - * It is not as well aligned as the start of the block - - * reduce the mtt size accordingly. - */ - block_shift = alignment_of(current_block_end); - - return block_shift; -} - int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, struct ib_umem *umem) { - u64 *pages; - u64 len = 0; - int err = 0; - u64 mtt_size; - u64 cur_start_addr = 0; - u64 mtt_shift; - int start_index = 0; - int npages = 0; - struct scatterlist *sg; - int i; - - pages = (u64 *) __get_free_page(GFP_KERNEL); - if (!pages) - return -ENOMEM; - - mtt_shift = mtt->page_shift; - mtt_size = 1ULL << mtt_shift; + struct ib_block_iter biter; + int err, i = 0; + u64 addr; - for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) { - if (cur_start_addr + len == sg_dma_address(sg)) { - /* still the same block */ - len += sg_dma_len(sg); - continue; - } - /* - * A new block is started ... - * If len is malaligned, write an extra mtt entry to cover the - * misaligned area (round up the division) - */ - err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size, - mtt_shift, len, - cur_start_addr, - pages, &start_index, - &npages); - if (err) - goto out; - - cur_start_addr = sg_dma_address(sg); - len = sg_dma_len(sg); - } - - /* Handle the last block */ - if (len > 0) { - /* - * If len is malaligned, write an extra mtt entry to cover - * the misaligned area (round up the division) - */ - err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size, - mtt_shift, len, - cur_start_addr, pages, - &start_index, &npages); + rdma_umem_for_each_dma_block(umem, &biter, BIT(mtt->page_shift)) { + addr = rdma_block_iter_dma_address(&biter); + err = mlx4_write_mtt(dev->dev, mtt, i++, 1, &addr); if (err) - goto out; - } - - if (npages) - err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages); - -out: - free_page((unsigned long) pages); - return err; -} - -/* - * Calculate optimal mtt size based on contiguous pages. - * Function will return also the number of pages that are not aligned to the - * calculated mtt_size to be added to total number of pages. For that we should - * check the first chunk length & last chunk length and if not aligned to - * mtt_size we should increment the non_aligned_pages number. All chunks in the - * middle already handled as part of mtt shift calculation for both their start - * & end addresses. - */ -int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va, - int *num_of_mtts) -{ - u64 block_shift = MLX4_MAX_MTT_SHIFT; - u64 min_shift = PAGE_SHIFT; - u64 last_block_aligned_end = 0; - u64 current_block_start = 0; - u64 first_block_start = 0; - u64 current_block_len = 0; - u64 last_block_end = 0; - struct scatterlist *sg; - u64 current_block_end; - u64 misalignment_bits; - u64 next_block_start; - u64 total_len = 0; - int i; - - *num_of_mtts = ib_umem_num_dma_blocks(umem, PAGE_SIZE); - - for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) { - /* - * Initialization - save the first chunk start as the - * current_block_start - block means contiguous pages. - */ - if (current_block_len == 0 && current_block_start == 0) { - current_block_start = sg_dma_address(sg); - first_block_start = current_block_start; - /* - * Find the bits that are different between the physical - * address and the virtual address for the start of the - * MR. - * umem_get aligned the start_va to a page boundary. - * Therefore, we need to align the start va to the same - * boundary. - * misalignment_bits is needed to handle the case of a - * single memory region. In this case, the rest of the - * logic will not reduce the block size. If we use a - * block size which is bigger than the alignment of the - * misalignment bits, we might use the virtual page - * number instead of the physical page number, resulting - * in access to the wrong data. - */ - misalignment_bits = - (start_va & (~(((u64)(PAGE_SIZE)) - 1ULL))) ^ - current_block_start; - block_shift = min(alignment_of(misalignment_bits), - block_shift); - } - - /* - * Go over the scatter entries and check if they continue the - * previous scatter entry. - */ - next_block_start = sg_dma_address(sg); - current_block_end = current_block_start + current_block_len; - /* If we have a split (non-contig.) between two blocks */ - if (current_block_end != next_block_start) { - block_shift = mlx4_ib_umem_calc_block_mtt - (next_block_start, - current_block_end, - block_shift); - - /* - * If we reached the minimum shift for 4k page we stop - * the loop. - */ - if (block_shift <= min_shift) - goto end; - - /* - * If not saved yet we are in first block - we save the - * length of first block to calculate the - * non_aligned_pages number at the end. - */ - total_len += current_block_len; - - /* Start a new block */ - current_block_start = next_block_start; - current_block_len = sg_dma_len(sg); - continue; - } - /* The scatter entry is another part of the current block, - * increase the block size. - * An entry in the scatter can be larger than 4k (page) as of - * dma mapping which merge some blocks together. - */ - current_block_len += sg_dma_len(sg); + return err; } - - /* Account for the last block in the total len */ - total_len += current_block_len; - /* Add to the first block the misalignment that it suffers from. */ - total_len += (first_block_start & ((1ULL << block_shift) - 1ULL)); - last_block_end = current_block_start + current_block_len; - last_block_aligned_end = round_up(last_block_end, 1ULL << block_shift); - total_len += (last_block_aligned_end - last_block_end); - - if (total_len & ((1ULL << block_shift) - 1ULL)) - pr_warn("misaligned total length detected (%llu, %llu)!", - total_len, block_shift); - - *num_of_mtts = total_len >> block_shift; -end: - if (block_shift < min_shift) { - /* - * If shift is less than the min we set a warning and return the - * min shift. - */ - pr_warn("umem_calc_optimal_mtt_size - unexpected shift %lld\n", block_shift); - - block_shift = min_shift; - } - return block_shift; + return 0; } static struct ib_umem *mlx4_get_umem_mr(struct ib_device *device, u64 start, @@ -424,6 +158,10 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start, &n); + if (shift < 0) { + err = shift; + goto err_umem; + } err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length, convert_access(access_flags), n, shift, &mr->mmr); diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 9d08aa99f3cb..50fd407103c7 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -925,8 +925,12 @@ static int create_rq(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, } shift = mlx4_ib_umem_calc_optimal_mtt_size(qp->umem, 0, &n); - err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt); + if (shift < 0) { + err = shift; + goto err_buf; + } + err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt); if (err) goto err_buf; @@ -1108,8 +1112,12 @@ static int create_qp_common(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, } shift = mlx4_ib_umem_calc_optimal_mtt_size(qp->umem, 0, &n); - err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt); + if (shift < 0) { + err = shift; + goto err_buf; + } + err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt); if (err) goto err_buf; diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 72a526236c2e..b38961f5058e 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -6,6 +6,7 @@ mlx5_ib-y := ah.o \ cong.o \ counters.o \ cq.o \ + data_direct.o \ dm.o \ doorbell.o \ gsi.o \ diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c index 505bc47fd575..531a57f9ee7e 100644 --- a/drivers/infiniband/hw/mlx5/ah.c +++ b/drivers/infiniband/hw/mlx5/ah.c @@ -50,11 +50,12 @@ static __be16 mlx5_ah_get_udp_sport(const struct mlx5_ib_dev *dev, return sport; } -static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, +static int create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, struct rdma_ah_init_attr *init_attr) { struct rdma_ah_attr *ah_attr = init_attr->ah_attr; enum ib_gid_type gid_type; + int rate_val; if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); @@ -67,7 +68,10 @@ static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, ah->av.tclass = grh->traffic_class; } - ah->av.stat_rate_sl = (rdma_ah_get_static_rate(ah_attr) << 4); + rate_val = mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah_attr)); + if (rate_val < 0) + return rate_val; + ah->av.stat_rate_sl = rate_val << 4; if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { if (init_attr->xmit_slave) @@ -88,6 +92,8 @@ static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, ah->av.fl_mlid = rdma_ah_get_path_bits(ah_attr) & 0x7f; ah->av.stat_rate_sl |= (rdma_ah_get_sl(ah_attr) & 0xf); } + + return 0; } int mlx5_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, @@ -120,8 +126,7 @@ int mlx5_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, return err; } - create_ib_ah(dev, ah, init_attr); - return 0; + return create_ib_ah(dev, ah, init_attr); } int mlx5_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr) diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 1d0c8d5e745b..7c08e3008927 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -177,7 +177,7 @@ int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid) return mlx5_cmd_exec_in(dev, dealloc_xrcd, in); } -int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, +int mlx5_cmd_mad_ifc(struct mlx5_ib_dev *dev, const void *inb, void *outb, u16 opmod, u8 port) { int outlen = MLX5_ST_SZ_BYTES(mad_ifc_out); @@ -195,12 +195,18 @@ int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, MLX5_SET(mad_ifc_in, in, opcode, MLX5_CMD_OP_MAD_IFC); MLX5_SET(mad_ifc_in, in, op_mod, opmod); - MLX5_SET(mad_ifc_in, in, port, port); + if (dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) { + MLX5_SET(mad_ifc_in, in, plane_index, port); + MLX5_SET(mad_ifc_in, in, port, + smi_to_native_portnum(dev, port)); + } else { + MLX5_SET(mad_ifc_in, in, port, port); + } data = MLX5_ADDR_OF(mad_ifc_in, in, mad); memcpy(data, inb, MLX5_FLD_SZ_BYTES(mad_ifc_in, mad)); - err = mlx5_cmd_exec_inout(dev, mad_ifc, in, out); + err = mlx5_cmd_exec_inout(dev->mdev, mad_ifc, in, out); if (err) goto out; @@ -239,3 +245,24 @@ int mlx5_cmd_uar_dealloc(struct mlx5_core_dev *dev, u32 uarn, u16 uid) MLX5_SET(dealloc_uar_in, in, uid, uid); return mlx5_cmd_exec_in(dev, dealloc_uar, in); } + +int mlx5_cmd_query_vuid(struct mlx5_core_dev *dev, bool data_direct, + char *out_vuid) +{ + u8 out[MLX5_ST_SZ_BYTES(query_vuid_out) + + MLX5_ST_SZ_BYTES(array1024_auto)] = {}; + u8 in[MLX5_ST_SZ_BYTES(query_vuid_in)] = {}; + char *vuid; + int err; + + MLX5_SET(query_vuid_in, in, opcode, MLX5_CMD_OPCODE_QUERY_VUID); + MLX5_SET(query_vuid_in, in, vhca_id, MLX5_CAP_GEN(dev, vhca_id)); + MLX5_SET(query_vuid_in, in, data_direct, data_direct); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + vuid = MLX5_ADDR_OF(query_vuid_out, out, vuid); + memcpy(out_vuid, vuid, MLX5_ST_SZ_BYTES(array1024_auto)); + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index 93a971a40d11..e6c88b6ebd0d 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -54,8 +54,10 @@ int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn, u16 uid); int mlx5_cmd_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn, u16 uid); int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid); -int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, +int mlx5_cmd_mad_ifc(struct mlx5_ib_dev *dev, const void *inb, void *outb, u16 opmod, u8 port); int mlx5_cmd_uar_alloc(struct mlx5_core_dev *dev, u32 *uarn, u16 uid); int mlx5_cmd_uar_dealloc(struct mlx5_core_dev *dev, u32 uarn, u16 uid); +int mlx5_cmd_query_vuid(struct mlx5_core_dev *dev, bool data_direct, + char *out_vuid); #endif /* MLX5_IB_CMD_H */ diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index 8300ce622835..81cfa74147a1 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -83,6 +83,8 @@ static const struct mlx5_ib_counter extended_err_cnts[] = { INIT_Q_COUNTER(resp_remote_access_errors), INIT_Q_COUNTER(resp_cqe_flush_error), INIT_Q_COUNTER(req_cqe_flush_error), + INIT_Q_COUNTER(req_transport_retries_exceeded), + INIT_Q_COUNTER(req_rnr_retries_exceeded), }; static const struct mlx5_ib_counter roce_accl_cnts[] = { @@ -102,6 +104,8 @@ static const struct mlx5_ib_counter vport_extended_err_cnts[] = { INIT_VPORT_Q_COUNTER(resp_remote_access_errors), INIT_VPORT_Q_COUNTER(resp_cqe_flush_error), INIT_VPORT_Q_COUNTER(req_cqe_flush_error), + INIT_VPORT_Q_COUNTER(req_transport_retries_exceeded), + INIT_VPORT_Q_COUNTER(req_rnr_retries_exceeded), }; static const struct mlx5_ib_counter vport_roce_accl_cnts[] = { @@ -542,6 +546,7 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, struct ib_qp *qp) { struct mlx5_ib_dev *dev = to_mdev(qp->device); + bool new = false; int err; if (!counter->id) { @@ -556,6 +561,7 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, return err; counter->id = MLX5_GET(alloc_q_counter_out, out, counter_set_id); + new = true; } err = mlx5_ib_qp_set_counter(qp, counter); @@ -565,8 +571,10 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, return 0; fail_set_counter: - mlx5_ib_counter_dealloc(counter); - counter->id = 0; + if (new) { + mlx5_ib_counter_dealloc(counter); + counter->id = 0; + } return err; } diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 9773d2a3d97f..4c54dc578069 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -38,6 +38,9 @@ #include "srq.h" #include "qp.h" +#define UVERBS_MODULE_NAME mlx5_ib +#include <rdma/uverbs_named_ioctl.h> + static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe) { struct ib_cq *ibcq = &to_mibcq(cq)->ibcq; @@ -714,7 +717,8 @@ static int mini_cqe_res_format_to_hw(struct mlx5_ib_dev *dev, u8 format) static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, struct mlx5_ib_cq *cq, int entries, u32 **cqb, - int *cqe_size, int *index, int *inlen) + int *cqe_size, int *index, int *inlen, + struct uverbs_attr_bundle *attrs) { struct mlx5_ib_create_cq ucmd = {}; unsigned long page_size; @@ -788,7 +792,11 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, order_base_2(page_size) - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET(cqc, cqc, page_offset, page_offset_quantized); - if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX) { + if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_CREATE_CQ_UAR_INDEX)) { + err = uverbs_copy_from(index, attrs, MLX5_IB_ATTR_CREATE_CQ_UAR_INDEX); + if (err) + goto err_cqb; + } else if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX) { *index = ucmd.uar_page_index; } else if (context->bfregi.lib_uar_dyn) { err = -EINVAL; @@ -942,8 +950,9 @@ static void notify_soft_wc_handler(struct work_struct *work) } int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; int vector = attr->comp_vector; @@ -980,7 +989,7 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, if (udata) { err = create_cq_user(dev, udata, cq, entries, &cqb, &cqe_size, - &index, &inlen); + &index, &inlen, attrs); if (err) return err; } else { @@ -1442,3 +1451,17 @@ int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc) return 0; } + +ADD_UVERBS_ATTRIBUTES_SIMPLE( + mlx5_ib_cq_create, + UVERBS_OBJECT_CQ, + UVERBS_METHOD_CQ_CREATE, + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_CREATE_CQ_UAR_INDEX, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL)); + +const struct uapi_definition mlx5_ib_create_cq_defs[] = { + UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_CQ, &mlx5_ib_cq_create), + {}, +}; diff --git a/drivers/infiniband/hw/mlx5/data_direct.c b/drivers/infiniband/hw/mlx5/data_direct.c new file mode 100644 index 000000000000..b9ba84afaae2 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/data_direct.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include "mlx5_ib.h" +#include "data_direct.h" + +static LIST_HEAD(mlx5_data_direct_dev_list); +static LIST_HEAD(mlx5_data_direct_reg_list); + +/* + * This mutex should be held when accessing either of the above lists + */ +static DEFINE_MUTEX(mlx5_data_direct_mutex); + +struct mlx5_data_direct_registration { + struct mlx5_ib_dev *ibdev; + char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1]; + struct list_head list; +}; + +static const struct pci_device_id mlx5_data_direct_pci_table[] = { + { PCI_VDEVICE(MELLANOX, 0x2100) }, /* ConnectX-8 Data Direct */ + { 0, } +}; + +static int mlx5_data_direct_vpd_get_vuid(struct mlx5_data_direct_dev *dev) +{ + struct pci_dev *pdev = dev->pdev; + unsigned int vpd_size, kw_len; + u8 *vpd_data; + int start; + int ret; + + vpd_data = pci_vpd_alloc(pdev, &vpd_size); + if (IS_ERR(vpd_data)) { + pci_err(pdev, "Unable to read VPD, err=%ld\n", PTR_ERR(vpd_data)); + return PTR_ERR(vpd_data); + } + + start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, "VU", &kw_len); + if (start < 0) { + ret = start; + pci_err(pdev, "VU keyword not found, err=%d\n", ret); + goto end; + } + + dev->vuid = kmemdup_nul(vpd_data + start, kw_len, GFP_KERNEL); + ret = dev->vuid ? 0 : -ENOMEM; + +end: + kfree(vpd_data); + return ret; +} + +static void mlx5_data_direct_shutdown(struct pci_dev *pdev) +{ + pci_disable_device(pdev); +} + +static int mlx5_data_direct_set_dma_caps(struct pci_dev *pdev) +{ + int err; + + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, + "Warning: couldn't set 64-bit PCI DMA mask, err=%d\n", err); + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set PCI DMA mask, err=%d\n", err); + return err; + } + } + + dma_set_max_seg_size(&pdev->dev, SZ_2G); + return 0; +} + +int mlx5_data_direct_ib_reg(struct mlx5_ib_dev *ibdev, char *vuid) +{ + struct mlx5_data_direct_registration *reg; + struct mlx5_data_direct_dev *dev; + + reg = kzalloc(sizeof(*reg), GFP_KERNEL); + if (!reg) + return -ENOMEM; + + reg->ibdev = ibdev; + strcpy(reg->vuid, vuid); + + mutex_lock(&mlx5_data_direct_mutex); + list_for_each_entry(dev, &mlx5_data_direct_dev_list, list) { + if (strcmp(dev->vuid, vuid) == 0) { + mlx5_ib_data_direct_bind(ibdev, dev); + break; + } + } + + /* Add the registration to its global list, to be used upon bind/unbind + * of its affiliated data direct device + */ + list_add_tail(®->list, &mlx5_data_direct_reg_list); + mutex_unlock(&mlx5_data_direct_mutex); + return 0; +} + +void mlx5_data_direct_ib_unreg(struct mlx5_ib_dev *ibdev) +{ + struct mlx5_data_direct_registration *reg; + + mutex_lock(&mlx5_data_direct_mutex); + list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) { + if (reg->ibdev == ibdev) { + list_del(®->list); + kfree(reg); + goto end; + } + } + + WARN_ON(true); +end: + mutex_unlock(&mlx5_data_direct_mutex); +} + +static void mlx5_data_direct_dev_reg(struct mlx5_data_direct_dev *dev) +{ + struct mlx5_data_direct_registration *reg; + + mutex_lock(&mlx5_data_direct_mutex); + list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) { + if (strcmp(dev->vuid, reg->vuid) == 0) + mlx5_ib_data_direct_bind(reg->ibdev, dev); + } + + /* Add the data direct device to the global list, further IB devices may + * use it later as well + */ + list_add_tail(&dev->list, &mlx5_data_direct_dev_list); + mutex_unlock(&mlx5_data_direct_mutex); +} + +static void mlx5_data_direct_dev_unreg(struct mlx5_data_direct_dev *dev) +{ + struct mlx5_data_direct_registration *reg; + + mutex_lock(&mlx5_data_direct_mutex); + /* Prevent any further affiliations */ + list_del(&dev->list); + list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) { + if (strcmp(dev->vuid, reg->vuid) == 0) + mlx5_ib_data_direct_unbind(reg->ibdev); + } + mutex_unlock(&mlx5_data_direct_mutex); +} + +static int mlx5_data_direct_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct mlx5_data_direct_dev *dev; + int err; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + dev->device = &pdev->dev; + dev->pdev = pdev; + + pci_set_drvdata(dev->pdev, dev); + err = pci_enable_device(pdev); + if (err) { + dev_err(dev->device, "Cannot enable PCI device, err=%d\n", err); + goto err; + } + + pci_set_master(pdev); + err = mlx5_data_direct_set_dma_caps(pdev); + if (err) + goto err_disable; + + if (pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP32) && + pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP64) && + pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP128)) + dev_dbg(dev->device, "Enabling pci atomics failed\n"); + + err = mlx5_data_direct_vpd_get_vuid(dev); + if (err) + goto err_disable; + + mlx5_data_direct_dev_reg(dev); + return 0; + +err_disable: + pci_disable_device(pdev); +err: + kfree(dev); + return err; +} + +static void mlx5_data_direct_remove(struct pci_dev *pdev) +{ + struct mlx5_data_direct_dev *dev = pci_get_drvdata(pdev); + + mlx5_data_direct_dev_unreg(dev); + pci_disable_device(pdev); + kfree(dev->vuid); + kfree(dev); +} + +static struct pci_driver mlx5_data_direct_driver = { + .name = KBUILD_MODNAME, + .id_table = mlx5_data_direct_pci_table, + .probe = mlx5_data_direct_probe, + .remove = mlx5_data_direct_remove, + .shutdown = mlx5_data_direct_shutdown, +}; + +int mlx5_data_direct_driver_register(void) +{ + return pci_register_driver(&mlx5_data_direct_driver); +} + +void mlx5_data_direct_driver_unregister(void) +{ + pci_unregister_driver(&mlx5_data_direct_driver); +} diff --git a/drivers/infiniband/hw/mlx5/data_direct.h b/drivers/infiniband/hw/mlx5/data_direct.h new file mode 100644 index 000000000000..2fd2bdbe8f69 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/data_direct.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#ifndef _MLX5_IB_DATA_DIRECT_H +#define _MLX5_IB_DATA_DIRECT_H + +struct mlx5_ib_dev; + +struct mlx5_data_direct_dev { + struct device *device; + struct pci_dev *pdev; + char *vuid; + struct list_head list; +}; + +int mlx5_data_direct_ib_reg(struct mlx5_ib_dev *ibdev, char *vuid); +void mlx5_data_direct_ib_unreg(struct mlx5_ib_dev *ibdev); +int mlx5_data_direct_driver_register(void); +void mlx5_data_direct_driver_unregister(void); + +#endif diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 253fea374a72..4186884c66e1 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -27,6 +27,19 @@ enum devx_obj_flags { DEVX_OBJ_FLAGS_INDIRECT_MKEY = 1 << 0, DEVX_OBJ_FLAGS_DCT = 1 << 1, DEVX_OBJ_FLAGS_CQ = 1 << 2, + DEVX_OBJ_FLAGS_HW_FREED = 1 << 3, +}; + +#define MAX_ASYNC_CMDS 8 + +struct mlx5_async_cmd { + struct ib_uobject *uobject; + void *in; + int in_size; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + int err; + struct mlx5_async_work cb_work; + struct completion comp; }; struct devx_async_data { @@ -1405,7 +1418,9 @@ static int devx_obj_cleanup(struct ib_uobject *uobject, */ mlx5r_deref_wait_odp_mkey(&obj->mkey); - if (obj->flags & DEVX_OBJ_FLAGS_DCT) + if (obj->flags & DEVX_OBJ_FLAGS_HW_FREED) + ret = 0; + else if (obj->flags & DEVX_OBJ_FLAGS_DCT) ret = mlx5_core_destroy_dct(obj->ib_dev, &obj->core_dct); else if (obj->flags & DEVX_OBJ_FLAGS_CQ) ret = mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq); @@ -2595,6 +2610,82 @@ void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev) } } +static void devx_async_destroy_cb(int status, struct mlx5_async_work *context) +{ + struct mlx5_async_cmd *devx_out = container_of(context, + struct mlx5_async_cmd, cb_work); + struct devx_obj *obj = devx_out->uobject->object; + + if (!status) + obj->flags |= DEVX_OBJ_FLAGS_HW_FREED; + + complete(&devx_out->comp); +} + +static void devx_async_destroy(struct mlx5_ib_dev *dev, + struct mlx5_async_cmd *cmd) +{ + init_completion(&cmd->comp); + cmd->err = mlx5_cmd_exec_cb(&dev->async_ctx, cmd->in, cmd->in_size, + &cmd->out, sizeof(cmd->out), + devx_async_destroy_cb, &cmd->cb_work); +} + +static void devx_wait_async_destroy(struct mlx5_async_cmd *cmd) +{ + if (!cmd->err) + wait_for_completion(&cmd->comp); + atomic_set(&cmd->uobject->usecnt, 0); +} + +void mlx5_ib_ufile_hw_cleanup(struct ib_uverbs_file *ufile) +{ + struct mlx5_async_cmd async_cmd[MAX_ASYNC_CMDS]; + struct ib_ucontext *ucontext = ufile->ucontext; + struct ib_device *device = ucontext->device; + struct mlx5_ib_dev *dev = to_mdev(device); + struct ib_uobject *uobject; + struct devx_obj *obj; + int head = 0; + int tail = 0; + + list_for_each_entry(uobject, &ufile->uobjects, list) { + WARN_ON(uverbs_try_lock_object(uobject, UVERBS_LOOKUP_WRITE)); + + /* + * Currently we only support QP destruction, if other objects + * are to be destroyed need to add type synchronization to the + * cleanup algorithm and handle pre/post FW cleanup for the + * new types if needed. + */ + if (uobj_get_object_id(uobject) != MLX5_IB_OBJECT_DEVX_OBJ || + (get_dec_obj_type(uobject->object, MLX5_EVENT_TYPE_MAX) != + MLX5_OBJ_TYPE_QP)) { + atomic_set(&uobject->usecnt, 0); + continue; + } + + obj = uobject->object; + + async_cmd[tail % MAX_ASYNC_CMDS].in = obj->dinbox; + async_cmd[tail % MAX_ASYNC_CMDS].in_size = obj->dinlen; + async_cmd[tail % MAX_ASYNC_CMDS].uobject = uobject; + + devx_async_destroy(dev, &async_cmd[tail % MAX_ASYNC_CMDS]); + tail++; + + if (tail - head == MAX_ASYNC_CMDS) { + devx_wait_async_destroy(&async_cmd[head % MAX_ASYNC_CMDS]); + head++; + } + } + + while (head != tail) { + devx_wait_async_destroy(&async_cmd[head % MAX_ASYNC_CMDS]); + head++; + } +} + static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf, size_t count, loff_t *pos) { @@ -2673,7 +2764,6 @@ static const struct file_operations devx_async_cmd_event_fops = { .read = devx_async_cmd_event_read, .poll = devx_async_cmd_event_poll, .release = uverbs_uobject_fd_release, - .llseek = no_llseek, }; static ssize_t devx_async_event_read(struct file *filp, char __user *buf, @@ -2788,7 +2878,6 @@ static const struct file_operations devx_async_event_fops = { .read = devx_async_event_read, .poll = devx_async_event_poll, .release = uverbs_uobject_fd_release, - .llseek = no_llseek, }; static void devx_async_cmd_event_destroy_uobj(struct ib_uobject *uobj, diff --git a/drivers/infiniband/hw/mlx5/devx.h b/drivers/infiniband/hw/mlx5/devx.h index ee2213275fd6..1344bf4c9d21 100644 --- a/drivers/infiniband/hw/mlx5/devx.h +++ b/drivers/infiniband/hw/mlx5/devx.h @@ -28,6 +28,7 @@ int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user); void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid); int mlx5_ib_devx_init(struct mlx5_ib_dev *dev); void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev); +void mlx5_ib_ufile_hw_cleanup(struct ib_uverbs_file *ufile); #else static inline int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user) { @@ -41,5 +42,8 @@ static inline int mlx5_ib_devx_init(struct mlx5_ib_dev *dev) static inline void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev) { } +static inline void mlx5_ib_ufile_hw_cleanup(struct ib_uverbs_file *ufile) +{ +} #endif #endif /* _MLX5_IB_DEVX_H */ diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index 520034acf73a..162814ae8cb4 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -943,7 +943,7 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, } dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dst.counter_id = mlx5_fc_id(opfc->fc); + dst.counter = opfc->fc; flow_act.action = MLX5_FLOW_CONTEXT_ACTION_COUNT | MLX5_FLOW_CONTEXT_ACTION_ALLOW; @@ -1113,8 +1113,8 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, handler->ibcounters = flow_act.counters; dest_arr[dest_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dest_arr[dest_num].counter_id = - mlx5_fc_id(mcounters->hw_cntrs_hndl); + dest_arr[dest_num].counter = + mcounters->hw_cntrs_hndl; dest_num++; } @@ -1603,7 +1603,7 @@ static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher, static struct mlx5_ib_flow_handler *raw_fs_rule_add( struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher, struct mlx5_flow_context *flow_context, struct mlx5_flow_act *flow_act, - u32 counter_id, void *cmd_in, int inlen, int dest_id, int dest_type) + struct mlx5_fc *counter, void *cmd_in, int inlen, int dest_id, int dest_type) { struct mlx5_flow_destination *dst; struct mlx5_ib_flow_prio *ft_prio; @@ -1652,8 +1652,12 @@ static struct mlx5_ib_flow_handler *raw_fs_rule_add( } if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + if (WARN_ON(!counter)) { + err = -EINVAL; + goto unlock; + } dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dst[dst_num].counter_id = counter_id; + dst[dst_num].counter = counter; dst_num++; } @@ -1878,7 +1882,8 @@ static int get_dests(struct uverbs_attr_bundle *attrs, return 0; } -static bool is_flow_counter(void *obj, u32 offset, u32 *counter_id) +static bool +is_flow_counter(void *obj, u32 offset, u32 *counter_id, u32 *fc_bulk_size) { struct devx_obj *devx_obj = obj; u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode); @@ -1888,6 +1893,7 @@ static bool is_flow_counter(void *obj, u32 offset, u32 *counter_id) if (offset && offset >= devx_obj->flow_counter_bulk_size) return false; + *fc_bulk_size = devx_obj->flow_counter_bulk_size; *counter_id = MLX5_GET(dealloc_flow_counter_in, devx_obj->dinbox, flow_counter_id); @@ -1904,13 +1910,13 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( { struct mlx5_flow_context flow_context = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; - u32 *offset_attr, offset = 0, counter_id = 0; int dest_id, dest_type = -1, inlen, len, ret, i; struct mlx5_ib_flow_handler *flow_handler; struct mlx5_ib_flow_matcher *fs_matcher; struct ib_uobject **arr_flow_actions; struct ib_uflow_resources *uflow_res; struct mlx5_flow_act flow_act = {}; + struct mlx5_fc *counter = NULL; struct ib_qp *qp = NULL; void *devx_obj, *cmd_in; struct ib_uobject *uobj; @@ -1937,6 +1943,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( len = uverbs_attr_get_uobjs_arr(attrs, MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &arr_flow_actions); if (len) { + u32 *offset_attr, fc_bulk_size, offset = 0, counter_id = 0; devx_obj = arr_flow_actions[0]->object; if (uverbs_attr_is_valid(attrs, @@ -1956,8 +1963,11 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( offset = *offset_attr; } - if (!is_flow_counter(devx_obj, offset, &counter_id)) + if (!is_flow_counter(devx_obj, offset, &counter_id, &fc_bulk_size)) return -EINVAL; + counter = mlx5_fc_local_create(counter_id, offset, fc_bulk_size); + if (IS_ERR(counter)) + return PTR_ERR(counter); flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; } @@ -1968,8 +1978,10 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); uflow_res = flow_resources_alloc(MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS); - if (!uflow_res) - return -ENOMEM; + if (!uflow_res) { + ret = -ENOMEM; + goto destroy_counter; + } len = uverbs_attr_get_uobjs_arr(attrs, MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, &arr_flow_actions); @@ -1996,7 +2008,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( flow_handler = raw_fs_rule_add(dev, fs_matcher, &flow_context, &flow_act, - counter_id, cmd_in, inlen, dest_id, dest_type); + counter, cmd_in, inlen, dest_id, dest_type); if (IS_ERR(flow_handler)) { ret = PTR_ERR(flow_handler); goto err_out; @@ -2007,6 +2019,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( return 0; err_out: ib_uverbs_flow_resources_free(uflow_res); +destroy_counter: + if (counter) + mlx5_fc_local_destroy(counter); return ret; } diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index c7a4ee896121..49af1cfbe6d1 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -13,6 +13,7 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, int vport_index) { struct mlx5_ib_dev *ibdev; + struct net_device *ndev; ibdev = mlx5_eswitch_uplink_get_proto_dev(dev->priv.eswitch, REP_IB); if (!ibdev) @@ -20,12 +21,9 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, ibdev->port[vport_index].rep = rep; rep->rep_data[REP_IB].priv = ibdev; - write_lock(&ibdev->port[vport_index].roce.netdev_lock); - ibdev->port[vport_index].roce.netdev = - mlx5_ib_get_rep_netdev(rep->esw, rep->vport); - write_unlock(&ibdev->port[vport_index].roce.netdev_lock); + ndev = mlx5_ib_get_rep_netdev(rep->esw, rep->vport); - return 0; + return ib_device_set_netdev(&ibdev->ib_dev, ndev, vport_index + 1); } static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev); @@ -104,10 +102,15 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) ibdev->is_rep = true; vport_index = rep->vport_index; ibdev->port[vport_index].rep = rep; - ibdev->port[vport_index].roce.netdev = - mlx5_ib_get_rep_netdev(lag_master->priv.eswitch, rep->vport); ibdev->mdev = lag_master; ibdev->num_ports = num_ports; + ibdev->ib_dev.phys_port_cnt = num_ports; + ret = ib_device_set_netdev(&ibdev->ib_dev, + mlx5_ib_get_rep_netdev(lag_master->priv.eswitch, + rep->vport), + vport_index + 1); + if (ret) + goto fail_add; ret = __mlx5_ib_add(ibdev, profile); if (ret) @@ -160,9 +163,8 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep) } port = &dev->port[vport_index]; - write_lock(&port->roce.netdev_lock); - port->roce.netdev = NULL; - write_unlock(&port->roce.netdev_lock); + + ib_device_set_netdev(&dev->ib_dev, NULL, vport_index + 1); rep->rep_data[REP_IB].priv = NULL; port->rep = NULL; diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 0c3c4e64812c..2453ae4384a7 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -69,7 +69,7 @@ static int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, if (ignore_bkey || !in_wc) op_modifier |= 0x2; - return mlx5_cmd_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier, + return mlx5_cmd_mad_ifc(dev, in_mad, response_mad, op_modifier, port); } @@ -147,8 +147,39 @@ static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt, vl_15_dropped); } -static int query_ib_ppcnt(struct mlx5_core_dev *dev, u8 port_num, void *out, - size_t sz) +static void pma_cnt_ext_assign_ppcnt(struct ib_pma_portcounters_ext *cnt_ext, + void *out) +{ + void *out_pma = MLX5_ADDR_OF(ppcnt_reg, out, + counter_set); + +#define MLX5_GET_EXT_CNTR(counter_name) \ + MLX5_GET64(ib_ext_port_cntrs_grp_data_layout, \ + out_pma, counter_name##_high) + + cnt_ext->port_xmit_data = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_xmit_data) >> 2); + cnt_ext->port_rcv_data = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_rcv_data) >> 2); + + cnt_ext->port_xmit_packets = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_xmit_pkts)); + cnt_ext->port_rcv_packets = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_rcv_pkts)); + + cnt_ext->port_unicast_xmit_packets = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_unicast_xmit_pkts)); + cnt_ext->port_unicast_rcv_packets = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_unicast_rcv_pkts)); + + cnt_ext->port_multicast_xmit_packets = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_multicast_xmit_pkts)); + cnt_ext->port_multicast_rcv_packets = + cpu_to_be64(MLX5_GET_EXT_CNTR(port_multicast_rcv_pkts)); +} + +static int query_ib_ppcnt(struct mlx5_core_dev *dev, u8 port_num, u8 plane_num, + void *out, size_t sz, bool ext) { u32 *in; int err; @@ -160,8 +191,14 @@ static int query_ib_ppcnt(struct mlx5_core_dev *dev, u8 port_num, void *out, } MLX5_SET(ppcnt_reg, in, local_port, port_num); - - MLX5_SET(ppcnt_reg, in, grp, MLX5_INFINIBAND_PORT_COUNTERS_GROUP); + MLX5_SET(ppcnt_reg, in, plane_ind, plane_num); + + if (ext) + MLX5_SET(ppcnt_reg, in, grp, + MLX5_INFINIBAND_EXTENDED_PORT_COUNTERS_GROUP); + else + MLX5_SET(ppcnt_reg, in, grp, + MLX5_INFINIBAND_PORT_COUNTERS_GROUP); err = mlx5_core_access_reg(dev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); @@ -188,7 +225,9 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num, mdev = dev->mdev; mdev_port_num = 1; } - if (MLX5_CAP_GEN(dev->mdev, num_ports) == 1) { + if (MLX5_CAP_GEN(dev->mdev, num_ports) == 1 && + !mlx5_core_mp_enabled(mdev) && + dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI) { /* set local port to one for Function-Per-Port HCA. */ mdev = dev->mdev; mdev_port_num = 1; @@ -207,7 +246,8 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num, if (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT) { struct ib_pma_portcounters_ext *pma_cnt_ext = (struct ib_pma_portcounters_ext *)(out_mad->data + 40); - int sz = MLX5_ST_SZ_BYTES(query_vport_counter_out); + int sz = max(MLX5_ST_SZ_BYTES(query_vport_counter_out), + MLX5_ST_SZ_BYTES(ppcnt_reg)); out_cnt = kvzalloc(sz, GFP_KERNEL); if (!out_cnt) { @@ -215,10 +255,18 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num, goto done; } - err = mlx5_core_query_vport_counter(mdev, 0, 0, mdev_port_num, - out_cnt); - if (!err) - pma_cnt_ext_assign(pma_cnt_ext, out_cnt); + if (dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) { + err = query_ib_ppcnt(mdev, mdev_port_num, + port_num, out_cnt, sz, 1); + if (!err) + pma_cnt_ext_assign_ppcnt(pma_cnt_ext, out_cnt); + } else { + err = mlx5_core_query_vport_counter(mdev, 0, 0, + mdev_port_num, + out_cnt); + if (!err) + pma_cnt_ext_assign(pma_cnt_ext, out_cnt); + } } else { struct ib_pma_portcounters *pma_cnt = (struct ib_pma_portcounters *)(out_mad->data + 40); @@ -230,7 +278,13 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num, goto done; } - err = query_ib_ppcnt(mdev, mdev_port_num, out_cnt, sz); + if (dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) + err = query_ib_ppcnt(mdev, mdev_port_num, port_num, + out_cnt, sz, 0); + else + err = query_ib_ppcnt(mdev, mdev_port_num, 0, + out_cnt, sz, 0); + if (!err) pma_cnt_assign(pma_cnt, out_cnt); } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index c2b557e64290..81849eb671a1 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -48,6 +48,7 @@ #include <rdma/mlx5_user_ioctl_verbs.h> #include <rdma/mlx5_user_ioctl_cmds.h> #include "macsec.h" +#include "data_direct.h" #define UVERBS_MODULE_NAME mlx5_ib #include <rdma/uverbs_named_ioctl.h> @@ -146,16 +147,52 @@ static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev, if (upper && port->rep->vport == MLX5_VPORT_UPLINK) continue; - - read_lock(&port->roce.netdev_lock); - rep_ndev = mlx5_ib_get_rep_netdev(port->rep->esw, - port->rep->vport); - if (rep_ndev == ndev) { - read_unlock(&port->roce.netdev_lock); + rep_ndev = ib_device_get_netdev(&dev->ib_dev, i + 1); + if (rep_ndev && rep_ndev == ndev) { + dev_put(rep_ndev); *port_num = i + 1; return &port->roce; } - read_unlock(&port->roce.netdev_lock); + + dev_put(rep_ndev); + } + + return NULL; +} + +static bool mlx5_netdev_send_event(struct mlx5_ib_dev *dev, + struct net_device *ndev, + struct net_device *upper, + struct net_device *ib_ndev) +{ + if (!dev->ib_active) + return false; + + /* Event is about our upper device */ + if (upper == ndev) + return true; + + /* RDMA device is not in lag and not in switchdev */ + if (!dev->is_rep && !upper && ndev == ib_ndev) + return true; + + /* RDMA devie is in switchdev */ + if (dev->is_rep && ndev == ib_ndev) + return true; + + return false; +} + +static struct net_device *mlx5_ib_get_rep_uplink_netdev(struct mlx5_ib_dev *ibdev) +{ + struct mlx5_ib_port *port; + int i; + + for (i = 0; i < ibdev->num_ports; i++) { + port = &ibdev->port[i]; + if (port->rep && port->rep->vport == MLX5_VPORT_UPLINK) { + return ib_device_get_netdev(&ibdev->ib_dev, i + 1); + } } return NULL; @@ -167,6 +204,7 @@ static int mlx5_netdev_event(struct notifier_block *this, struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb); struct net_device *ndev = netdev_notifier_info_to_dev(ptr); u32 port_num = roce->native_port_num; + struct net_device *ib_ndev = NULL; struct mlx5_core_dev *mdev; struct mlx5_ib_dev *ibdev; @@ -180,47 +218,67 @@ static int mlx5_netdev_event(struct notifier_block *this, /* Should already be registered during the load */ if (ibdev->is_rep) break; - write_lock(&roce->netdev_lock); + + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); + /* Exit if already registered */ + if (ib_ndev) + goto put_ndev; + if (ndev->dev.parent == mdev->device) - roce->netdev = ndev; - write_unlock(&roce->netdev_lock); + ib_device_set_netdev(&ibdev->ib_dev, ndev, port_num); break; case NETDEV_UNREGISTER: /* In case of reps, ib device goes away before the netdevs */ - write_lock(&roce->netdev_lock); - if (roce->netdev == ndev) - roce->netdev = NULL; - write_unlock(&roce->netdev_lock); - break; + if (ibdev->is_rep) + break; + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); + if (ib_ndev == ndev) + ib_device_set_netdev(&ibdev->ib_dev, NULL, port_num); + goto put_ndev; case NETDEV_CHANGE: case NETDEV_UP: case NETDEV_DOWN: { - struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(mdev); struct net_device *upper = NULL; - if (lag_ndev) { - upper = netdev_master_upper_dev_get(lag_ndev); - dev_put(lag_ndev); + if (!netif_is_lag_master(ndev) && !netif_is_lag_port(ndev) && + !mlx5_core_mp_enabled(mdev)) + return NOTIFY_DONE; + + if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) { + struct net_device *lag_ndev; + + if(mlx5_lag_is_roce(mdev)) + lag_ndev = ib_device_get_netdev(&ibdev->ib_dev, 1); + else /* sriov lag */ + lag_ndev = mlx5_ib_get_rep_uplink_netdev(ibdev); + + if (lag_ndev) { + upper = netdev_master_upper_dev_get(lag_ndev); + dev_put(lag_ndev); + } else { + goto done; + } } if (ibdev->is_rep) roce = mlx5_get_rep_roce(ibdev, ndev, upper, &port_num); if (!roce) return NOTIFY_DONE; - if ((upper == ndev || - ((!upper || ibdev->is_rep) && ndev == roce->netdev)) && - ibdev->ib_active) { + + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); + + if (mlx5_netdev_send_event(ibdev, ndev, upper, ib_ndev)) { struct ib_event ibev = { }; enum ib_port_state port_state; if (get_port_state(&ibdev->ib_dev, port_num, &port_state)) - goto done; + goto put_ndev; if (roce->last_port_state == port_state) - goto done; + goto put_ndev; roce->last_port_state = port_state; ibev.device = &ibdev->ib_dev; @@ -229,7 +287,7 @@ static int mlx5_netdev_event(struct notifier_block *this, else if (port_state == IB_PORT_ACTIVE) ibev.event = IB_EVENT_PORT_ACTIVE; else - goto done; + goto put_ndev; ibev.element.port_num = port_num; ib_dispatch_event(&ibev); @@ -240,39 +298,13 @@ static int mlx5_netdev_event(struct notifier_block *this, default: break; } +put_ndev: + dev_put(ib_ndev); done: mlx5_ib_put_native_port_mdev(ibdev, port_num); return NOTIFY_DONE; } -static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, - u32 port_num) -{ - struct mlx5_ib_dev *ibdev = to_mdev(device); - struct net_device *ndev; - struct mlx5_core_dev *mdev; - - mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL); - if (!mdev) - return NULL; - - ndev = mlx5_lag_get_roce_netdev(mdev); - if (ndev) - goto out; - - /* Ensure ndev does not disappear before we invoke dev_hold() - */ - read_lock(&ibdev->port[port_num - 1].roce.netdev_lock); - ndev = ibdev->port[port_num - 1].roce.netdev; - if (ndev) - dev_hold(ndev); - read_unlock(&ibdev->port[port_num - 1].roce.netdev_lock); - -out: - mlx5_ib_put_native_port_mdev(ibdev, port_num); - return ndev; -} - struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev, u32 ib_port_num, u32 *native_port_num) @@ -283,6 +315,14 @@ struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev, struct mlx5_ib_multiport_info *mpi; struct mlx5_ib_port *port; + if (ibdev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) { + if (native_port_num) + *native_port_num = smi_to_native_portnum(ibdev, + ib_port_num); + return ibdev->mdev; + + } + if (!mlx5_core_mp_enabled(ibdev->mdev) || ll != IB_LINK_LAYER_ETHERNET) { if (native_port_num) @@ -504,10 +544,10 @@ static int mlx5_query_port_roce(struct ib_device *device, u32 port_num, */ if (dev->is_rep) err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, - 1); + 1, 0); else err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, - mdev_port_num); + mdev_port_num, 0); if (err) goto out; ext = !!MLX5_GET_ETH_PROTO(ptys_reg, out, true, eth_proto_capability); @@ -539,11 +579,11 @@ static int mlx5_query_port_roce(struct ib_device *device, u32 port_num, if (!put_mdev) goto out; - ndev = mlx5_ib_get_netdev(device, port_num); + ndev = ib_device_get_netdev(device, port_num); if (!ndev) goto out; - if (dev->lag_active) { + if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) { rcu_read_lock(); upper = netdev_master_upper_dev_get_rcu(ndev); if (upper) { @@ -1146,6 +1186,14 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE; resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT; + + if (MLX5_CAP_GEN_2(mdev, dp_ordering_force) && + (MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_xrc) || + MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_dc) || + MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_rc) || + MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_ud) || + MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_uc))) + resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_OOO_DP; } if (offsetofend(typeof(resp), sw_parsing_caps) <= uhw_outlen) { @@ -1334,11 +1382,11 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port, struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_hca_vport_context *rep; + u8 vl_hw_cap, plane_index = 0; u16 max_mtu; u16 oper_mtu; int err; u16 ib_link_width_oper; - u8 vl_hw_cap; rep = kzalloc(sizeof(*rep), GFP_KERNEL); if (!rep) { @@ -1348,6 +1396,11 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port, /* props being zeroed by the caller, avoid zeroing it here */ + if (ibdev->type == RDMA_DEVICE_TYPE_SMI) { + plane_index = port; + port = smi_to_native_portnum(dev, port); + } + err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep); if (err) goto out; @@ -1358,7 +1411,14 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port, props->sm_sl = rep->sm_sl; props->state = rep->vport_state; props->phys_state = rep->port_physical_state; - props->port_cap_flags = rep->cap_mask1; + + props->port_cap_flags = rep->cap_mask1; + if (dev->num_plane) { + props->port_cap_flags |= IB_PORT_SM_DISABLED; + props->port_cap_flags &= ~IB_PORT_SM; + } else if (ibdev->type == RDMA_DEVICE_TYPE_SMI) + props->port_cap_flags &= ~IB_PORT_CM_SUP; + props->gid_tbl_len = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size)); props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg); props->pkey_tbl_len = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size)); @@ -1371,7 +1431,7 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port, props->port_cap_flags2 = rep->cap_mask2; err = mlx5_query_ib_port_oper(mdev, &ib_link_width_oper, - &props->active_speed, port); + &props->active_speed, port, plane_index); if (err) goto out; @@ -1811,7 +1871,7 @@ static int set_ucontext_resp(struct ib_ucontext *uctx, } resp->qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); - if (dev->wc_support) + if (mlx5_wc_support_get(dev->mdev)) resp->bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); resp->cache_line_size = cache_line_size(); @@ -2338,7 +2398,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm switch (command) { case MLX5_IB_MMAP_WC_PAGE: case MLX5_IB_MMAP_ALLOC_WC: - if (!dev->wc_support) + if (!mlx5_wc_support_get(dev->mdev)) return -EPERM; fallthrough; case MLX5_IB_MMAP_NC_PAGE: @@ -2777,6 +2837,23 @@ static int mlx5_ib_event_slave_port(struct notifier_block *nb, return NOTIFY_OK; } +static int mlx5_ib_get_plane_num(struct mlx5_core_dev *mdev, u8 *num_plane) +{ + struct mlx5_hca_vport_context vport_ctx; + int err; + + *num_plane = 0; + if (!MLX5_CAP_GEN(mdev, ib_virt) || !MLX5_CAP_GEN_2(mdev, multiplane)) + return 0; + + err = mlx5_query_hca_vport_context(mdev, 0, 1, 0, &vport_ctx); + if (err) + return err; + + *num_plane = vport_ctx.num_plane; + return 0; +} + static int set_has_smi_cap(struct mlx5_ib_dev *dev) { struct mlx5_hca_vport_context vport_ctx; @@ -2787,10 +2864,15 @@ static int set_has_smi_cap(struct mlx5_ib_dev *dev) return 0; for (port = 1; port <= dev->num_ports; port++) { - if (!MLX5_CAP_GEN(dev->mdev, ib_virt)) { + if (dev->num_plane) { + dev->port_caps[port - 1].has_smi = false; + continue; + } else if (!MLX5_CAP_GEN(dev->mdev, ib_virt) || + dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) { dev->port_caps[port - 1].has_smi = true; continue; } + err = mlx5_query_hca_vport_context(dev->mdev, 0, port, 0, &vport_ctx); if (err) { @@ -2824,37 +2906,72 @@ static u8 mlx5_get_umr_fence(u8 umr_fence_cap) } } -static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) +int mlx5_ib_dev_res_cq_init(struct mlx5_ib_dev *dev) { struct mlx5_ib_resources *devr = &dev->devr; - struct ib_srq_init_attr attr; - struct ib_device *ibdev; struct ib_cq_init_attr cq_attr = {.cqe = 1}; - int port; + struct ib_device *ibdev; + struct ib_pd *pd; + struct ib_cq *cq; int ret = 0; - ibdev = &dev->ib_dev; - if (!MLX5_CAP_GEN(dev->mdev, xrc)) - return -EOPNOTSUPP; + /* + * devr->c0 is set once, never changed until device unload. + * Avoid taking the mutex if initialization is already done. + */ + if (devr->c0) + return 0; - devr->p0 = ib_alloc_pd(ibdev, 0); - if (IS_ERR(devr->p0)) - return PTR_ERR(devr->p0); + mutex_lock(&devr->cq_lock); + if (devr->c0) + goto unlock; - devr->c0 = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr); - if (IS_ERR(devr->c0)) { - ret = PTR_ERR(devr->c0); - goto error1; + ibdev = &dev->ib_dev; + pd = ib_alloc_pd(ibdev, 0); + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); + mlx5_ib_err(dev, "Couldn't allocate PD for res init, err=%d\n", ret); + goto unlock; } - ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn0, 0); - if (ret) - goto error2; + cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + mlx5_ib_err(dev, "Couldn't create CQ for res init, err=%d\n", ret); + ib_dealloc_pd(pd); + goto unlock; + } - ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn1, 0); + devr->p0 = pd; + devr->c0 = cq; + +unlock: + mutex_unlock(&devr->cq_lock); + return ret; +} + +int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_resources *devr = &dev->devr; + struct ib_srq_init_attr attr; + struct ib_srq *s0, *s1; + int ret = 0; + + /* + * devr->s1 is set once, never changed until device unload. + * Avoid taking the mutex if initialization is already done. + */ + if (devr->s1) + return 0; + + mutex_lock(&devr->srq_lock); + if (devr->s1) + goto unlock; + + ret = mlx5_ib_dev_res_cq_init(dev); if (ret) - goto error3; + goto unlock; memset(&attr, 0, sizeof(attr)); attr.attr.max_sge = 1; @@ -2862,10 +2979,11 @@ static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) attr.srq_type = IB_SRQT_XRC; attr.ext.cq = devr->c0; - devr->s0 = ib_create_srq(devr->p0, &attr); - if (IS_ERR(devr->s0)) { - ret = PTR_ERR(devr->s0); - goto err_create; + s0 = ib_create_srq(devr->p0, &attr); + if (IS_ERR(s0)) { + ret = PTR_ERR(s0); + mlx5_ib_err(dev, "Couldn't create SRQ 0 for res init, err=%d\n", ret); + goto unlock; } memset(&attr, 0, sizeof(attr)); @@ -2873,51 +2991,116 @@ static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) attr.attr.max_wr = 1; attr.srq_type = IB_SRQT_BASIC; - devr->s1 = ib_create_srq(devr->p0, &attr); - if (IS_ERR(devr->s1)) { - ret = PTR_ERR(devr->s1); - goto error6; + s1 = ib_create_srq(devr->p0, &attr); + if (IS_ERR(s1)) { + ret = PTR_ERR(s1); + mlx5_ib_err(dev, "Couldn't create SRQ 1 for res init, err=%d\n", ret); + ib_destroy_srq(s0); } - for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) - INIT_WORK(&devr->ports[port].pkey_change_work, - pkey_change_handler); - - return 0; + devr->s0 = s0; + devr->s1 = s1; -error6: - ib_destroy_srq(devr->s0); -err_create: - mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0); -error3: - mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0); -error2: - ib_destroy_cq(devr->c0); -error1: - ib_dealloc_pd(devr->p0); +unlock: + mutex_unlock(&devr->srq_lock); return ret; } -static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev) +static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) { struct mlx5_ib_resources *devr = &dev->devr; - int port; + int ret; - /* - * Make sure no change P_Key work items are still executing. - * - * At this stage, the mlx5_ib_event should be unregistered - * and it ensures that no new works are added. - */ - for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) - cancel_work_sync(&devr->ports[port].pkey_change_work); + if (!MLX5_CAP_GEN(dev->mdev, xrc)) + return -EOPNOTSUPP; + + ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn0, 0); + if (ret) + return ret; + + ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn1, 0); + if (ret) { + mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0); + return ret; + } + + mutex_init(&devr->cq_lock); + mutex_init(&devr->srq_lock); + + return 0; +} + +static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_resources *devr = &dev->devr; - ib_destroy_srq(devr->s1); - ib_destroy_srq(devr->s0); + /* After s0/s1 init, they are not unset during the device lifetime. */ + if (devr->s1) { + ib_destroy_srq(devr->s1); + ib_destroy_srq(devr->s0); + } mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0); mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0); - ib_destroy_cq(devr->c0); - ib_dealloc_pd(devr->p0); + /* After p0/c0 init, they are not unset during the device lifetime. */ + if (devr->c0) { + ib_destroy_cq(devr->c0); + ib_dealloc_pd(devr->p0); + } + mutex_destroy(&devr->cq_lock); + mutex_destroy(&devr->srq_lock); +} + +static int +mlx5_ib_create_data_direct_resources(struct mlx5_ib_dev *dev) +{ + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_core_dev *mdev = dev->mdev; + void *mkc; + u32 mkey; + u32 pdn; + u32 *in; + int err; + + err = mlx5_core_alloc_pd(mdev, &pdn); + if (err) + return err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err; + } + + MLX5_SET(create_mkey_in, in, data_direct, 1); + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, rw, 1); + MLX5_SET(mkc, mkc, rr, 1); + MLX5_SET(mkc, mkc, a, 1); + MLX5_SET(mkc, mkc, pd, pdn); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + err = mlx5_core_create_mkey(mdev, &mkey, in, inlen); + kvfree(in); + if (err) + goto err; + + dev->ddr.mkey = mkey; + dev->ddr.pdn = pdn; + return 0; + +err: + mlx5_core_dealloc_pd(mdev, pdn); + return err; +} + +static void +mlx5_ib_free_data_direct_resources(struct mlx5_ib_dev *dev) +{ + mlx5_core_destroy_mkey(dev->mdev, dev->ddr.mkey); + mlx5_core_dealloc_pd(dev->mdev, dev->ddr.pdn); } static u32 get_core_cap_flags(struct ib_device *ibdev, @@ -2933,6 +3116,13 @@ static u32 get_core_cap_flags(struct ib_device *ibdev, if (rep->grh_required) ret |= RDMA_CORE_CAP_IB_GRH_REQUIRED; + if (dev->num_plane) + return ret | RDMA_CORE_CAP_PROT_IB | RDMA_CORE_CAP_IB_MAD | + RDMA_CORE_CAP_IB_CM | RDMA_CORE_CAP_IB_SA | + RDMA_CORE_CAP_AF_IB; + else if (ibdev->type == RDMA_DEVICE_TYPE_SMI) + return ret | RDMA_CORE_CAP_IB_MAD | RDMA_CORE_CAP_IB_SMI; + if (ll == IB_LINK_LAYER_INFINIBAND) return ret | RDMA_CORE_PORT_IBA_IB; @@ -2968,6 +3158,9 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u32 port_num, return err; if (ll == IB_LINK_LAYER_INFINIBAND) { + if (ibdev->type == RDMA_DEVICE_TYPE_SMI) + port_num = smi_to_native_portnum(dev, port_num); + err = mlx5_query_hca_vport_context(dev->mdev, 0, port_num, 0, &rep); if (err) @@ -3010,6 +3203,67 @@ static void get_dev_fw_str(struct ib_device *ibdev, char *str) fw_rev_sub(dev->mdev)); } +static int lag_event(struct notifier_block *nb, unsigned long event, void *data) +{ + struct mlx5_ib_dev *dev = container_of(nb, struct mlx5_ib_dev, + lag_events); + struct mlx5_core_dev *mdev = dev->mdev; + struct ib_device *ibdev = &dev->ib_dev; + struct net_device *old_ndev = NULL; + struct mlx5_ib_port *port; + struct net_device *ndev; + u32 portnum = 0; + int ret = 0; + int i; + + switch (event) { + case MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE: + ndev = data; + if (ndev) { + if (!mlx5_lag_is_roce(mdev)) { + // sriov lag + for (i = 0; i < dev->num_ports; i++) { + port = &dev->port[i]; + if (port->rep && port->rep->vport == + MLX5_VPORT_UPLINK) { + portnum = i; + break; + } + } + } + old_ndev = ib_device_get_netdev(ibdev, portnum + 1); + ret = ib_device_set_netdev(ibdev, ndev, portnum + 1); + if (ret) + goto out; + + if (old_ndev) + roce_del_all_netdev_gids(ibdev, portnum + 1, + old_ndev); + rdma_roce_rescan_port(ibdev, portnum + 1); + } + break; + default: + return NOTIFY_DONE; + } + +out: + dev_put(old_ndev); + return notifier_from_errno(ret); +} + +static void mlx5e_lag_event_register(struct mlx5_ib_dev *dev) +{ + dev->lag_events.notifier_call = lag_event; + blocking_notifier_chain_register(&dev->mdev->priv.lag_nh, + &dev->lag_events); +} + +static void mlx5e_lag_event_unregister(struct mlx5_ib_dev *dev) +{ + blocking_notifier_chain_unregister(&dev->mdev->priv.lag_nh, + &dev->lag_events); +} + static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) { struct mlx5_core_dev *mdev = dev->mdev; @@ -3031,6 +3285,7 @@ static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) goto err_destroy_vport_lag; } + mlx5e_lag_event_register(dev); dev->flow_db->lag_demux_ft = ft; dev->lag_ports = mlx5_lag_get_num_ports(mdev); dev->lag_active = true; @@ -3048,6 +3303,7 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev) if (dev->lag_active) { dev->lag_active = false; + mlx5e_lag_event_unregister(dev); mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft); dev->flow_db->lag_demux_ft = NULL; @@ -3306,6 +3562,41 @@ unbind: return false; } +static int mlx5_ib_data_direct_init(struct mlx5_ib_dev *dev) +{ + char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1] = {}; + int ret; + + if (!MLX5_CAP_GEN(dev->mdev, data_direct) || + !MLX5_CAP_GEN_2(dev->mdev, query_vuid)) + return 0; + + ret = mlx5_cmd_query_vuid(dev->mdev, true, vuid); + if (ret) + return ret; + + ret = mlx5_ib_create_data_direct_resources(dev); + if (ret) + return ret; + + INIT_LIST_HEAD(&dev->data_direct_mr_list); + ret = mlx5_data_direct_ib_reg(dev, vuid); + if (ret) + mlx5_ib_free_data_direct_resources(dev); + + return ret; +} + +static void mlx5_ib_data_direct_cleanup(struct mlx5_ib_dev *dev) +{ + if (!MLX5_CAP_GEN(dev->mdev, data_direct) || + !MLX5_CAP_GEN_2(dev->mdev, query_vuid)) + return; + + mlx5_data_direct_ib_unreg(dev); + mlx5_ib_free_data_direct_resources(dev); +} + static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) { u32 port_num = mlx5_core_native_port_num(dev->mdev) - 1; @@ -3352,7 +3643,8 @@ static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list, list) { if (dev->sys_image_guid == mpi->sys_image_guid && - (mlx5_core_native_port_num(mpi->mdev) - 1) == i) { + (mlx5_core_native_port_num(mpi->mdev) - 1) == i && + mlx5_core_same_coredev_type(dev->mdev, mpi->mdev)) { bound = mlx5_ib_bind_slave_port(dev, mpi); } @@ -3613,7 +3905,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_UAR_OBJ_ALLOC)( alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC) return -EOPNOTSUPP; - if (!to_mdev(c->ibucontext.device)->wc_support && + if (!mlx5_wc_support_get(to_mdev(c->ibucontext.device)->mdev) && alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF) return -EOPNOTSUPP; @@ -3682,14 +3974,24 @@ ADD_UVERBS_ATTRIBUTES_SIMPLE( dump_fill_mkey), UA_MANDATORY)); +ADD_UVERBS_ATTRIBUTES_SIMPLE( + mlx5_ib_reg_dmabuf_mr, + UVERBS_OBJECT_MR, + UVERBS_METHOD_REG_DMABUF_MR, + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS, + enum mlx5_ib_uapi_reg_dmabuf_flags, + UA_OPTIONAL)); + static const struct uapi_definition mlx5_ib_defs[] = { UAPI_DEF_CHAIN(mlx5_ib_devx_defs), UAPI_DEF_CHAIN(mlx5_ib_flow_defs), UAPI_DEF_CHAIN(mlx5_ib_qos_defs), UAPI_DEF_CHAIN(mlx5_ib_std_types_defs), UAPI_DEF_CHAIN(mlx5_ib_dm_defs), + UAPI_DEF_CHAIN(mlx5_ib_create_cq_defs), UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DEVICE, &mlx5_ib_query_context), + UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_MR, &mlx5_ib_reg_dmabuf_mr), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR, UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR), @@ -3698,6 +4000,7 @@ static const struct uapi_definition mlx5_ib_defs[] = { static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) { + mlx5_ib_data_direct_cleanup(dev); mlx5_ib_cleanup_multiport_master(dev); WARN_ON(!xa_empty(&dev->odp_mkeys)); mutex_destroy(&dev->cap_mask_mutex); @@ -3713,13 +4016,11 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; - dev->ib_dev.phys_port_cnt = dev->num_ports; dev->ib_dev.dev.parent = mdev->device; dev->ib_dev.lag_flags = RDMA_LAG_FLAGS_HASH_ALL_SLAVES; for (i = 0; i < dev->num_ports; i++) { spin_lock_init(&dev->port[i].mp.mpi_lock); - rwlock_init(&dev->port[i].roce.netdev_lock); dev->port[i].roce.dev = dev; dev->port[i].roce.native_port_num = i + 1; dev->port[i].roce.last_port_state = IB_PORT_DOWN; @@ -3751,6 +4052,7 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) dev->ib_dev.num_comp_vectors = mlx5_comp_vectors_max(mdev); mutex_init(&dev->cap_mask_mutex); + mutex_init(&dev->data_direct_lock); INIT_LIST_HEAD(&dev->qp_list); spin_lock_init(&dev->reset_flow_resource_lock); xa_init(&dev->odp_mkeys); @@ -3759,25 +4061,22 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) spin_lock_init(&dev->dm.lock); dev->dm.dev = mdev; + err = mlx5_ib_data_direct_init(dev); + if (err) + goto err_mp; + return 0; -err: - mlx5r_macsec_dealloc_gids(dev); err_mp: mlx5_ib_cleanup_multiport_master(dev); +err: + mlx5r_macsec_dealloc_gids(dev); return err; } -static int mlx5_ib_enable_driver(struct ib_device *dev) -{ - struct mlx5_ib_dev *mdev = to_mdev(dev); - int ret; - - ret = mlx5_ib_test_wc(mdev); - mlx5_ib_dbg(mdev, "Write-Combining %s", - mdev->wc_support ? "supported" : "not supported"); - - return ret; -} +static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent, + enum rdma_nl_dev_type type, + const char *name); +static void mlx5_ib_del_sub_dev(struct ib_device *sub_dev); static const struct ib_device_ops mlx5_ib_dev_ops = { .owner = THIS_MODULE, @@ -3785,6 +4084,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION, .add_gid = mlx5_ib_add_gid, + .add_sub_dev = mlx5_ib_add_sub_dev, .alloc_mr = mlx5_ib_alloc_mr, .alloc_mr_integrity = mlx5_ib_alloc_mr_integrity, .alloc_pd = mlx5_ib_alloc_pd, @@ -3799,6 +4099,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .dealloc_pd = mlx5_ib_dealloc_pd, .dealloc_ucontext = mlx5_ib_dealloc_ucontext, .del_gid = mlx5_ib_del_gid, + .del_sub_dev = mlx5_ib_del_sub_dev, .dereg_mr = mlx5_ib_dereg_mr, .destroy_ah = mlx5_ib_destroy_ah, .destroy_cq = mlx5_ib_destroy_cq, @@ -3809,7 +4110,6 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .drain_rq = mlx5_ib_drain_rq, .drain_sq = mlx5_ib_drain_sq, .device_group = &mlx5_attr_group, - .enable_driver = mlx5_ib_enable_driver, .get_dev_fw_str = get_dev_fw_str, .get_dma_mr = mlx5_ib_get_dma_mr, .get_link_layer = mlx5_ib_port_link_layer, @@ -3839,6 +4139,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .req_notify_cq = mlx5_ib_arm_cq, .rereg_user_mr = mlx5_ib_rereg_user_mr, .resize_cq = mlx5_ib_resize_cq, + .ufile_hw_cleanup = mlx5_ib_ufile_hw_cleanup, INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs), @@ -3985,7 +4286,6 @@ static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = { .create_wq = mlx5_ib_create_wq, .destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table, .destroy_wq = mlx5_ib_destroy_wq, - .get_netdev = mlx5_ib_get_netdev, .modify_wq = mlx5_ib_modify_wq, INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mlx5_ib_rwq_ind_table, @@ -4089,7 +4389,10 @@ static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) { const char *name; - if (!mlx5_lag_is_active(dev->mdev)) + if (dev->sub_dev_name) { + name = dev->sub_dev_name; + ib_mark_name_assigned_by_user(&dev->ib_dev); + } else if (!mlx5_lag_is_active(dev->mdev)) name = "mlx5_%d"; else name = "mlx5_bond_%d"; @@ -4100,6 +4403,7 @@ static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev) { mlx5_mkey_cache_cleanup(dev); mlx5r_umr_resource_cleanup(dev); + mlx5r_umr_cleanup(dev); } static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev) @@ -4111,7 +4415,7 @@ static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev) { int ret; - ret = mlx5r_umr_resource_init(dev); + ret = mlx5r_umr_init(dev); if (ret) return ret; @@ -4166,6 +4470,13 @@ static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev) static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev) { + struct mlx5_ib_resources *devr = &dev->devr; + int port; + + for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) + INIT_WORK(&devr->ports[port].pkey_change_work, + pkey_change_handler); + dev->mdev_events.notifier_call = mlx5_ib_event; mlx5_notifier_register(dev->mdev, &dev->mdev_events); @@ -4176,8 +4487,30 @@ static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev) static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev) { + struct mlx5_ib_resources *devr = &dev->devr; + int port; + mlx5r_macsec_event_unregister(dev); mlx5_notifier_unregister(dev->mdev, &dev->mdev_events); + + for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) + cancel_work_sync(&devr->ports[port].pkey_change_work); +} + +void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev, + struct mlx5_data_direct_dev *dev) +{ + mutex_lock(&ibdev->data_direct_lock); + ibdev->data_direct_dev = dev; + mutex_unlock(&ibdev->data_direct_lock); +} + +void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev) +{ + mutex_lock(&ibdev->data_direct_lock); + mlx5_ib_revoke_data_direct_mrs(ibdev); + ibdev->data_direct_dev = NULL; + mutex_unlock(&ibdev->data_direct_lock); } void __mlx5_ib_remove(struct mlx5_ib_dev *dev, @@ -4251,9 +4584,6 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, mlx5_ib_dev_res_init, mlx5_ib_dev_res_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, - mlx5_ib_stage_dev_notifier_init, - mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_ODP, mlx5_ib_odp_init_one, mlx5_ib_odp_cleanup_one), @@ -4278,6 +4608,9 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_IB_REG, mlx5_ib_stage_ib_reg_init, mlx5_ib_stage_ib_reg_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, + mlx5_ib_stage_dev_notifier_init, + mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, mlx5_ib_stage_post_ib_reg_umr_init, NULL), @@ -4314,9 +4647,6 @@ const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, mlx5_ib_dev_res_init, mlx5_ib_dev_res_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, - mlx5_ib_stage_dev_notifier_init, - mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_COUNTERS, mlx5_ib_counters_init, mlx5_ib_counters_cleanup), @@ -4338,6 +4668,9 @@ const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_IB_REG, mlx5_ib_stage_ib_reg_init, mlx5_ib_stage_ib_reg_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, + mlx5_ib_stage_dev_notifier_init, + mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, mlx5_ib_stage_post_ib_reg_umr_init, NULL), @@ -4349,6 +4682,90 @@ const struct mlx5_ib_profile raw_eth_profile = { NULL), }; +static const struct mlx5_ib_profile plane_profile = { + STAGE_CREATE(MLX5_IB_STAGE_INIT, + mlx5_ib_stage_init_init, + mlx5_ib_stage_init_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CAPS, + mlx5_ib_stage_caps_init, + mlx5_ib_stage_caps_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB, + mlx5_ib_stage_non_default_cb, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_QP, + mlx5_init_qp_table, + mlx5_cleanup_qp_table), + STAGE_CREATE(MLX5_IB_STAGE_SRQ, + mlx5_init_srq_table, + mlx5_cleanup_srq_table), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, + mlx5_ib_dev_res_init, + mlx5_ib_dev_res_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_BFREG, + mlx5_ib_stage_bfrag_init, + mlx5_ib_stage_bfrag_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_IB_REG, + mlx5_ib_stage_ib_reg_init, + mlx5_ib_stage_ib_reg_cleanup), +}; + +static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent, + enum rdma_nl_dev_type type, + const char *name) +{ + struct mlx5_ib_dev *mparent = to_mdev(parent), *mplane; + enum rdma_link_layer ll; + int ret; + + if (mparent->smi_dev) + return ERR_PTR(-EEXIST); + + ll = mlx5_port_type_cap_to_rdma_ll(MLX5_CAP_GEN(mparent->mdev, + port_type)); + if (type != RDMA_DEVICE_TYPE_SMI || !mparent->num_plane || + ll != IB_LINK_LAYER_INFINIBAND || + !MLX5_CAP_GEN_2(mparent->mdev, multiplane_qp_ud)) + return ERR_PTR(-EOPNOTSUPP); + + mplane = ib_alloc_device(mlx5_ib_dev, ib_dev); + if (!mplane) + return ERR_PTR(-ENOMEM); + + mplane->port = kcalloc(mparent->num_plane * mparent->num_ports, + sizeof(*mplane->port), GFP_KERNEL); + if (!mplane->port) { + ret = -ENOMEM; + goto fail_kcalloc; + } + + mplane->ib_dev.type = type; + mplane->mdev = mparent->mdev; + mplane->num_ports = mparent->num_plane; + mplane->sub_dev_name = name; + mplane->ib_dev.phys_port_cnt = mplane->num_ports; + + ret = __mlx5_ib_add(mplane, &plane_profile); + if (ret) + goto fail_ib_add; + + mparent->smi_dev = mplane; + return &mplane->ib_dev; + +fail_ib_add: + kfree(mplane->port); +fail_kcalloc: + ib_dealloc_device(&mplane->ib_dev); + return ERR_PTR(ret); +} + +static void mlx5_ib_del_sub_dev(struct ib_device *sub_dev) +{ + struct mlx5_ib_dev *mdev = to_mdev(sub_dev); + + to_mdev(sub_dev->parent)->smi_dev = NULL; + __mlx5_ib_remove(mdev, mdev->profile, MLX5_IB_STAGE_MAX); +} + static int mlx5r_mp_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { @@ -4373,7 +4790,8 @@ static int mlx5r_mp_probe(struct auxiliary_device *adev, mutex_lock(&mlx5_ib_multiport_mutex); list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) { - if (dev->sys_image_guid == mpi->sys_image_guid) + if (dev->sys_image_guid == mpi->sys_image_guid && + mlx5_core_same_coredev_type(dev->mdev, mpi->mdev)) bound = mlx5_ib_bind_slave_port(dev, mpi); if (bound) { @@ -4426,15 +4844,23 @@ static int mlx5r_probe(struct auxiliary_device *adev, dev = ib_alloc_device(mlx5_ib_dev, ib_dev); if (!dev) return -ENOMEM; + + if (ll == IB_LINK_LAYER_INFINIBAND) { + ret = mlx5_ib_get_plane_num(mdev, &dev->num_plane); + if (ret) + goto fail; + } + dev->port = kcalloc(num_ports, sizeof(*dev->port), GFP_KERNEL); if (!dev->port) { - ib_dealloc_device(&dev->ib_dev); - return -ENOMEM; + ret = -ENOMEM; + goto fail; } dev->mdev = mdev; dev->num_ports = num_ports; + dev->ib_dev.phys_port_cnt = num_ports; if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_get_roce_state(mdev)) profile = &raw_eth_profile; @@ -4442,14 +4868,17 @@ static int mlx5r_probe(struct auxiliary_device *adev, profile = &pf_profile; ret = __mlx5_ib_add(dev, profile); - if (ret) { - kfree(dev->port); - ib_dealloc_device(&dev->ib_dev); - return ret; - } + if (ret) + goto fail_ib_add; auxiliary_set_drvdata(adev, dev); return 0; + +fail_ib_add: + kfree(dev->port); +fail: + ib_dealloc_device(&dev->ib_dev); + return ret; } static void mlx5r_remove(struct auxiliary_device *adev) @@ -4509,17 +4938,23 @@ static int __init mlx5_ib_init(void) ret = mlx5r_rep_init(); if (ret) goto rep_err; + ret = mlx5_data_direct_driver_register(); + if (ret) + goto dd_err; ret = auxiliary_driver_register(&mlx5r_mp_driver); if (ret) goto mp_err; ret = auxiliary_driver_register(&mlx5r_driver); if (ret) goto drv_err; + return 0; drv_err: auxiliary_driver_unregister(&mlx5r_mp_driver); mp_err: + mlx5_data_direct_driver_unregister(); +dd_err: mlx5r_rep_cleanup(); rep_err: mlx5_ib_qp_event_cleanup(); @@ -4531,6 +4966,7 @@ qp_event_err: static void __exit mlx5_ib_cleanup(void) { + mlx5_data_direct_driver_unregister(); auxiliary_driver_unregister(&mlx5r_driver); auxiliary_driver_unregister(&mlx5r_mp_driver); mlx5r_rep_cleanup(); diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index 96ffbbaf0a73..af321f6ef7f5 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -32,7 +32,6 @@ #include <rdma/ib_umem_odp.h> #include "mlx5_ib.h" -#include <linux/jiffies.h> /* * Fill in a physical address list. ib_umem_num_dma_blocks() entries will be @@ -94,202 +93,3 @@ unsigned long __mlx5_umem_find_best_quantized_pgoff( return 0; return page_size; } - -#define WR_ID_BF 0xBF -#define WR_ID_END 0xBAD -#define TEST_WC_NUM_WQES 255 -#define TEST_WC_POLLING_MAX_TIME_JIFFIES msecs_to_jiffies(100) -static int post_send_nop(struct mlx5_ib_dev *dev, struct ib_qp *ibqp, u64 wr_id, - bool signaled) -{ - struct mlx5_ib_qp *qp = to_mqp(ibqp); - struct mlx5_wqe_ctrl_seg *ctrl; - struct mlx5_bf *bf = &qp->bf; - __be32 mmio_wqe[16] = {}; - unsigned long flags; - unsigned int idx; - int i; - - if (unlikely(dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)) - return -EIO; - - spin_lock_irqsave(&qp->sq.lock, flags); - - idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); - ctrl = mlx5_frag_buf_get_wqe(&qp->sq.fbc, idx); - - memset(ctrl, 0, sizeof(struct mlx5_wqe_ctrl_seg)); - ctrl->fm_ce_se = signaled ? MLX5_WQE_CTRL_CQ_UPDATE : 0; - ctrl->opmod_idx_opcode = - cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | MLX5_OPCODE_NOP); - ctrl->qpn_ds = cpu_to_be32((sizeof(struct mlx5_wqe_ctrl_seg) / 16) | - (qp->trans_qp.base.mqp.qpn << 8)); - - qp->sq.wrid[idx] = wr_id; - qp->sq.w_list[idx].opcode = MLX5_OPCODE_NOP; - qp->sq.wqe_head[idx] = qp->sq.head + 1; - qp->sq.cur_post += DIV_ROUND_UP(sizeof(struct mlx5_wqe_ctrl_seg), - MLX5_SEND_WQE_BB); - qp->sq.w_list[idx].next = qp->sq.cur_post; - qp->sq.head++; - - memcpy(mmio_wqe, ctrl, sizeof(*ctrl)); - ((struct mlx5_wqe_ctrl_seg *)&mmio_wqe)->fm_ce_se |= - MLX5_WQE_CTRL_CQ_UPDATE; - - /* Make sure that descriptors are written before - * updating doorbell record and ringing the doorbell - */ - wmb(); - - qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); - - /* Make sure doorbell record is visible to the HCA before - * we hit doorbell - */ - wmb(); - for (i = 0; i < 8; i++) - mlx5_write64(&mmio_wqe[i * 2], - bf->bfreg->map + bf->offset + i * 8); - io_stop_wc(); - - bf->offset ^= bf->buf_size; - - spin_unlock_irqrestore(&qp->sq.lock, flags); - - return 0; -} - -static int test_wc_poll_cq_result(struct mlx5_ib_dev *dev, struct ib_cq *cq) -{ - int ret; - struct ib_wc wc = {}; - unsigned long end = jiffies + TEST_WC_POLLING_MAX_TIME_JIFFIES; - - do { - ret = ib_poll_cq(cq, 1, &wc); - if (ret < 0 || wc.status) - return ret < 0 ? ret : -EINVAL; - if (ret) - break; - } while (!time_after(jiffies, end)); - - if (!ret) - return -ETIMEDOUT; - - if (wc.wr_id != WR_ID_BF) - ret = 0; - - return ret; -} - -static int test_wc_do_send(struct mlx5_ib_dev *dev, struct ib_qp *qp) -{ - int err, i; - - for (i = 0; i < TEST_WC_NUM_WQES; i++) { - err = post_send_nop(dev, qp, WR_ID_BF, false); - if (err) - return err; - } - - return post_send_nop(dev, qp, WR_ID_END, true); -} - -int mlx5_ib_test_wc(struct mlx5_ib_dev *dev) -{ - struct ib_cq_init_attr cq_attr = { .cqe = TEST_WC_NUM_WQES + 1 }; - int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type); - struct ib_qp_init_attr qp_init_attr = { - .cap = { .max_send_wr = TEST_WC_NUM_WQES }, - .qp_type = IB_QPT_UD, - .sq_sig_type = IB_SIGNAL_REQ_WR, - .create_flags = MLX5_IB_QP_CREATE_WC_TEST, - }; - struct ib_qp_attr qp_attr = { .port_num = 1 }; - struct ib_device *ibdev = &dev->ib_dev; - struct ib_qp *qp; - struct ib_cq *cq; - struct ib_pd *pd; - int ret; - - if (!MLX5_CAP_GEN(dev->mdev, bf)) - return 0; - - if (!dev->mdev->roce.roce_en && - port_type_cap == MLX5_CAP_PORT_TYPE_ETH) { - if (mlx5_core_is_pf(dev->mdev)) - dev->wc_support = arch_can_pci_mmap_wc(); - return 0; - } - - ret = mlx5_alloc_bfreg(dev->mdev, &dev->wc_bfreg, true, false); - if (ret) - goto print_err; - - if (!dev->wc_bfreg.wc) - goto out1; - - pd = ib_alloc_pd(ibdev, 0); - if (IS_ERR(pd)) { - ret = PTR_ERR(pd); - goto out1; - } - - cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr); - if (IS_ERR(cq)) { - ret = PTR_ERR(cq); - goto out2; - } - - qp_init_attr.recv_cq = cq; - qp_init_attr.send_cq = cq; - qp = ib_create_qp(pd, &qp_init_attr); - if (IS_ERR(qp)) { - ret = PTR_ERR(qp); - goto out3; - } - - qp_attr.qp_state = IB_QPS_INIT; - ret = ib_modify_qp(qp, &qp_attr, - IB_QP_STATE | IB_QP_PORT | IB_QP_PKEY_INDEX | - IB_QP_QKEY); - if (ret) - goto out4; - - qp_attr.qp_state = IB_QPS_RTR; - ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); - if (ret) - goto out4; - - qp_attr.qp_state = IB_QPS_RTS; - ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); - if (ret) - goto out4; - - ret = test_wc_do_send(dev, qp); - if (ret < 0) - goto out4; - - ret = test_wc_poll_cq_result(dev, cq); - if (ret > 0) { - dev->wc_support = true; - ret = 0; - } - -out4: - ib_destroy_qp(qp); -out3: - ib_destroy_cq(cq); -out2: - ib_dealloc_pd(pd); -out1: - mlx5_free_bfreg(dev->mdev, &dev->wc_bfreg); -print_err: - if (ret) - mlx5_ib_err( - dev, - "Error %d while trying to test write-combining support\n", - ret); - return ret; -} diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index bbe79b86c717..974a45c92fbb 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -63,17 +63,6 @@ __mlx5_log_page_size_to_bitmap(unsigned int log_pgsz_bits, return GENMASK(largest_pg_shift, pgsz_shift); } -/* - * For mkc users, instead of a page_offset the command has a start_iova which - * specifies both the page_offset and the on-the-wire IOVA - */ -#define mlx5_umem_find_best_pgsz(umem, typ, log_pgsz_fld, pgsz_shift, iova) \ - ib_umem_find_best_pgsz(umem, \ - __mlx5_log_page_size_to_bitmap( \ - __mlx5_bit_sz(typ, log_pgsz_fld), \ - pgsz_shift), \ - iova) - static __always_inline unsigned long __mlx5_page_offset_to_bitmask(unsigned int page_offset_bits, unsigned int offset_shift) @@ -115,6 +104,19 @@ unsigned long __mlx5_umem_find_best_quantized_pgoff( __mlx5_bit_sz(typ, page_offset_fld), 0, scale, \ page_offset_quantized) +static inline unsigned long +mlx5_umem_dmabuf_find_best_pgsz(struct ib_umem_dmabuf *umem_dmabuf) +{ + /* + * mkeys used for dmabuf are fixed at PAGE_SIZE because we must be able + * to hold any sgl after a move operation. Ideally the mkc page size + * could be changed at runtime to be optimal, but right now the driver + * cannot do that. + */ + return ib_umem_find_best_pgsz(&umem_dmabuf->umem, PAGE_SIZE, + umem_dmabuf->umem.iova); +} + enum { MLX5_IB_MMAP_OFFSET_START = 9, MLX5_IB_MMAP_OFFSET_END = 255, @@ -341,7 +343,6 @@ struct mlx5_ib_flow_db { * rely on the range reserved for that use in the ib_qp_create_flags enum. */ #define MLX5_IB_QP_CREATE_SQPN_QP1 IB_QP_CREATE_RESERVED_START -#define MLX5_IB_QP_CREATE_WC_TEST (IB_QP_CREATE_RESERVED_START << 1) struct wr_list { u16 opcode; @@ -520,6 +521,7 @@ struct mlx5_ib_qp { struct mlx5_bf bf; u8 has_rq:1; u8 is_rss:1; + u8 is_ooo_rq:1; /* only for user space QPs. For kernel * we have it from the bf object @@ -628,6 +630,8 @@ enum mlx5_mkey_type { MLX5_MKEY_MR = 1, MLX5_MKEY_MW, MLX5_MKEY_INDIRECT_DEVX, + MLX5_MKEY_NULL, + MLX5_MKEY_IMPLICIT_CHILD, }; struct mlx5r_cache_rb_key { @@ -643,9 +647,10 @@ struct mlx5_ib_mkey { unsigned int ndescs; struct wait_queue_head wait; refcount_t usecount; - /* User Mkey must hold either a rb_key or a cache_ent. */ + /* Cacheable user Mkey must hold either a rb_key or a cache_ent. */ struct mlx5r_cache_rb_key rb_key; struct mlx5_cache_ent *cache_ent; + u8 cacheable : 1; }; #define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) @@ -664,11 +669,19 @@ struct mlx5_ib_mkey { #define mlx5_update_odp_stats(mr, counter_name, value) \ atomic64_add(value, &((mr)->odp_stats.counter_name)) +#define mlx5_update_odp_stats_with_handled(mr, counter_name, value) \ + do { \ + mlx5_update_odp_stats(mr, counter_name, value); \ + atomic64_add(1, &((mr)->odp_stats.counter_name##_handled)); \ + } while (0) + struct mlx5_ib_mr { struct ib_mr ibmr; struct mlx5_ib_mkey mmkey; struct ib_umem *umem; + /* The mr is data direct related */ + u8 data_direct :1; union { /* Used only by kernel MRs (umem == NULL) */ @@ -706,6 +719,11 @@ struct mlx5_ib_mr { } odp_destroy; struct ib_odp_counters odp_stats; bool is_odp_implicit; + /* The affilated data direct crossed mr */ + struct mlx5_ib_mr *dd_crossed_mr; + struct list_head dd_node; + u8 revoked :1; + struct mlx5_ib_mkey null_mmkey; }; }; }; @@ -751,6 +769,8 @@ struct umr_common { */ struct mutex lock; unsigned int state; + /* Protects from repeat UMR QP creation */ + struct mutex init_lock; }; #define NUM_MKEYS_PER_PAGE \ @@ -781,6 +801,7 @@ struct mlx5_cache_ent { u8 is_tmp:1; u8 disabled:1; u8 fill_to_high_water:1; + u8 tmp_cleanup_scheduled:1; /* * - limit is the low water mark for stored mkeys, 2* limit is the @@ -812,7 +833,6 @@ struct mlx5_mkey_cache { struct mutex rb_lock; struct dentry *fs_root; unsigned long last_add; - struct delayed_work remove_ent_dwork; }; struct mlx5_ib_port_resources { @@ -820,13 +840,20 @@ struct mlx5_ib_port_resources { struct work_struct pkey_change_work; }; +struct mlx5_data_direct_resources { + u32 pdn; + u32 mkey; +}; + struct mlx5_ib_resources { struct ib_cq *c0; + struct mutex cq_lock; u32 xrcdn0; u32 xrcdn1; struct ib_pd *p0; struct ib_srq *s0; struct ib_srq *s1; + struct mutex srq_lock; struct mlx5_ib_port_resources ports[2]; }; @@ -868,8 +895,6 @@ struct mlx5_roce { /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL * netdev pointer */ - rwlock_t netdev_lock; - struct net_device *netdev; struct notifier_block nb; struct netdev_net_notifier nn; struct notifier_block mdev_nb; @@ -954,7 +979,6 @@ enum mlx5_ib_stages { MLX5_IB_STAGE_QP, MLX5_IB_STAGE_SRQ, MLX5_IB_STAGE_DEVICE_RESOURCES, - MLX5_IB_STAGE_DEVICE_NOTIFIER, MLX5_IB_STAGE_ODP, MLX5_IB_STAGE_COUNTERS, MLX5_IB_STAGE_CONG_DEBUGFS, @@ -963,6 +987,7 @@ enum mlx5_ib_stages { MLX5_IB_STAGE_PRE_IB_REG_UMR, MLX5_IB_STAGE_WHITELIST_UID, MLX5_IB_STAGE_IB_REG, + MLX5_IB_STAGE_DEVICE_NOTIFIER, MLX5_IB_STAGE_POST_IB_REG_UMR, MLX5_IB_STAGE_DELAY_DROP, MLX5_IB_STAGE_RESTRACK, @@ -1114,7 +1139,11 @@ struct mlx5_macsec { struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; + struct mlx5_data_direct_dev *data_direct_dev; + /* protect accessing data_direct_dev */ + struct mutex data_direct_lock; struct notifier_block mdev_events; + struct notifier_block lag_events; int num_ports; /* serialize update of capability mask */ @@ -1122,7 +1151,6 @@ struct mlx5_ib_dev { u8 ib_active:1; u8 is_rep:1; u8 lag_active:1; - u8 wc_support:1; u8 fill_delay; struct umr_common umrc; /* sync used page count stats @@ -1145,10 +1173,10 @@ struct mlx5_ib_dev { /* protect resources needed as part of reset flow */ spinlock_t reset_flow_resource_lock; struct list_head qp_list; + struct list_head data_direct_mr_list; /* Array with num_ports elements */ struct mlx5_ib_port *port; struct mlx5_sq_bfreg bfreg; - struct mlx5_sq_bfreg wc_bfreg; struct mlx5_sq_bfreg fp_bfreg; struct mlx5_ib_delay_drop delay_drop; const struct mlx5_ib_profile *profile; @@ -1170,10 +1198,15 @@ struct mlx5_ib_dev { u16 pkey_table_len; u8 lag_ports; struct mlx5_special_mkeys mkeys; + struct mlx5_data_direct_resources ddr; #ifdef CONFIG_MLX5_MACSEC struct mlx5_macsec macsec; #endif + + u8 num_plane; + struct mlx5_ib_dev *smi_dev; + const char *sub_dev_name; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) @@ -1272,6 +1305,8 @@ to_mmmap(struct rdma_user_mmap_entry *rdma_entry) struct mlx5_user_mmap_entry, rdma_entry); } +int mlx5_ib_dev_res_cq_init(struct mlx5_ib_dev *dev); +int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev); int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, struct mlx5_db *db); void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db); @@ -1311,7 +1346,7 @@ int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer, size_t buflen, size_t *bc); int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); @@ -1324,7 +1359,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int mlx5_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, u32 flags, @@ -1335,7 +1370,6 @@ int mlx5_ib_alloc_mw(struct ib_mw *mw, struct ib_udata *udata); int mlx5_ib_dealloc_mw(struct ib_mw *mw); struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, int access_flags); -void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr); void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr); struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, u64 length, u64 virt_addr, int access_flags, @@ -1377,7 +1411,6 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u32 port, struct ib_port_attr *props); void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas, u64 access_flags); -void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); int mlx5_ib_get_cqe_size(struct ib_cq *ibcq); int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev); void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev); @@ -1405,6 +1438,10 @@ int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, struct ib_dm_mr_attr *attr, struct uverbs_attr_bundle *attrs); +void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev, + struct mlx5_data_direct_dev *dev); +void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev); +void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); @@ -1512,6 +1549,7 @@ extern const struct uapi_definition mlx5_ib_devx_defs[]; extern const struct uapi_definition mlx5_ib_flow_defs[]; extern const struct uapi_definition mlx5_ib_qos_defs[]; extern const struct uapi_definition mlx5_ib_std_types_defs[]; +extern const struct uapi_definition mlx5_ib_create_cq_defs[]; static inline int is_qp1(enum ib_qp_type qp_type) { @@ -1612,8 +1650,6 @@ static inline void mlx5r_deref_wait_odp_mkey(struct mlx5_ib_mkey *mmkey) wait_event(mmkey->wait, refcount_read(&mmkey->usecount) == 0); } -int mlx5_ib_test_wc(struct mlx5_ib_dev *dev); - static inline bool mlx5_ib_lag_should_assign_affinity(struct mlx5_ib_dev *dev) { /* @@ -1680,4 +1716,26 @@ static inline bool mlx5_umem_needs_ats(struct mlx5_ib_dev *dev, int set_roce_addr(struct mlx5_ib_dev *dev, u32 port_num, unsigned int index, const union ib_gid *gid, const struct ib_gid_attr *attr); + +static inline u32 smi_to_native_portnum(struct mlx5_ib_dev *dev, u32 port) +{ + return (port - 1) / dev->num_ports + 1; +} + +/* + * For mkc users, instead of a page_offset the command has a start_iova which + * specifies both the page_offset and the on-the-wire IOVA + */ +static __always_inline unsigned long +mlx5_umem_mkc_find_best_pgsz(struct mlx5_ib_dev *dev, struct ib_umem *umem, + u64 iova) +{ + int page_size_bits = + MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5) ? 6 : 5; + unsigned long bitmap = + __mlx5_log_page_size_to_bitmap(page_size_bits, 0); + + return ib_umem_find_best_pgsz(umem, bitmap, iova); +} + #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index a8ee2ca1f4a1..753faa9ad06a 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -43,18 +43,22 @@ #include "dm.h" #include "mlx5_ib.h" #include "umr.h" +#include "data_direct.h" enum { MAX_PENDING_REG_MR = 8, }; +#define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4 #define MLX5_UMR_ALIGN 2048 static void create_mkey_callback(int status, struct mlx5_async_work *context); static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags, - unsigned int page_size, bool populate); + unsigned int page_size, bool populate, + int access_mode); +static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr); static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr, struct ib_pd *pd) @@ -211,9 +215,9 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context) spin_lock_irqsave(&ent->mkeys_queue.lock, flags); push_mkey_locked(ent, mkey_out->mkey); + ent->pending--; /* If we are doing fill_to_high_water then keep going. */ queue_adjust_cache_locked(ent); - ent->pending--; spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags); kfree(mkey_out); } @@ -246,6 +250,7 @@ static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc) MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3); MLX5_SET(mkc, mkc, access_mode_4_2, (ent->rb_key.access_mode >> 2) & 0x7); + MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats); MLX5_SET(mkc, mkc, translations_octword_size, get_mkc_octo_size(ent->rb_key.access_mode, @@ -526,6 +531,21 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) } } +static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent) +{ + u32 mkey; + + spin_lock_irq(&ent->mkeys_queue.lock); + while (ent->mkeys_queue.ci) { + mkey = pop_mkey_locked(ent); + spin_unlock_irq(&ent->mkeys_queue.lock); + mlx5_core_destroy_mkey(dev->mdev, mkey); + spin_lock_irq(&ent->mkeys_queue.lock); + } + ent->tmp_cleanup_scheduled = false; + spin_unlock_irq(&ent->mkeys_queue.lock); +} + static void __cache_work_func(struct mlx5_cache_ent *ent) { struct mlx5_ib_dev *dev = ent->dev; @@ -597,7 +617,11 @@ static void delayed_cache_work_func(struct work_struct *work) struct mlx5_cache_ent *ent; ent = container_of(work, struct mlx5_cache_ent, dwork.work); - __cache_work_func(ent); + /* temp entries are never filled, only cleaned */ + if (ent->is_tmp) + clean_keys(ent->dev, ent); + else + __cache_work_func(ent); } static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1, @@ -641,10 +665,8 @@ static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache, new = &((*new)->rb_left); if (cmp < 0) new = &((*new)->rb_right); - if (cmp == 0) { - mutex_unlock(&cache->rb_lock); + if (cmp == 0) return -EEXIST; - } } /* Add new node and rebalance tree. */ @@ -660,6 +682,7 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev, { struct rb_node *node = dev->cache.rb_root.rb_node; struct mlx5_cache_ent *cur, *smallest = NULL; + u64 ndescs_limit; int cmp; /* @@ -678,10 +701,18 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev, return cur; } + /* + * Limit the usage of mkeys larger than twice the required size while + * also allowing the usage of smallest cache entry for small MRs. + */ + ndescs_limit = max_t(u64, rb_key.ndescs * 2, + MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS); + return (smallest && smallest->rb_key.access_mode == rb_key.access_mode && smallest->rb_key.access_flags == rb_key.access_flags && - smallest->rb_key.ats == rb_key.ats) ? + smallest->rb_key.ats == rb_key.ats && + smallest->rb_key.ndescs <= ndescs_limit) ? smallest : NULL; } @@ -719,6 +750,8 @@ static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, } mr->mmkey.cache_ent = ent; mr->mmkey.type = MLX5_MKEY_MR; + mr->mmkey.rb_key = ent->rb_key; + mr->mmkey.cacheable = true; init_waitqueue_head(&mr->mmkey.wait); return mr; } @@ -764,21 +797,6 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, return _mlx5_mr_cache_alloc(dev, ent, access_flags); } -static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent) -{ - u32 mkey; - - cancel_delayed_work(&ent->dwork); - spin_lock_irq(&ent->mkeys_queue.lock); - while (ent->mkeys_queue.ci) { - mkey = pop_mkey_locked(ent); - spin_unlock_irq(&ent->mkeys_queue.lock); - mlx5_core_destroy_mkey(dev->mdev, mkey); - spin_lock_irq(&ent->mkeys_queue.lock); - } - spin_unlock_irq(&ent->mkeys_queue.lock); -} - static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) { if (!mlx5_debugfs_root || dev->is_rep) @@ -891,10 +909,6 @@ mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev, ent->limit = 0; mlx5_mkey_cache_debugfs_add_ent(dev, ent); - } else { - mod_delayed_work(ent->dev->cache.wq, - &ent->dev->cache.remove_ent_dwork, - msecs_to_jiffies(30 * 1000)); } return ent; @@ -905,35 +919,6 @@ mkeys_err: return ERR_PTR(ret); } -static void remove_ent_work_func(struct work_struct *work) -{ - struct mlx5_mkey_cache *cache; - struct mlx5_cache_ent *ent; - struct rb_node *cur; - - cache = container_of(work, struct mlx5_mkey_cache, - remove_ent_dwork.work); - mutex_lock(&cache->rb_lock); - cur = rb_last(&cache->rb_root); - while (cur) { - ent = rb_entry(cur, struct mlx5_cache_ent, node); - cur = rb_prev(cur); - mutex_unlock(&cache->rb_lock); - - spin_lock_irq(&ent->mkeys_queue.lock); - if (!ent->is_tmp) { - spin_unlock_irq(&ent->mkeys_queue.lock); - mutex_lock(&cache->rb_lock); - continue; - } - spin_unlock_irq(&ent->mkeys_queue.lock); - - clean_keys(ent->dev, ent); - mutex_lock(&cache->rb_lock); - } - mutex_unlock(&cache->rb_lock); -} - int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) { struct mlx5_mkey_cache *cache = &dev->cache; @@ -949,7 +934,6 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) mutex_init(&dev->slow_path_mutex); mutex_init(&dev->cache.rb_lock); dev->cache.rb_root = RB_ROOT; - INIT_DELAYED_WORK(&dev->cache.remove_ent_dwork, remove_ent_work_func); cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); if (!cache->wq) { mlx5_ib_warn(dev, "failed to create work queue\n"); @@ -961,7 +945,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) mlx5_mkey_cache_debugfs_init(dev); mutex_lock(&cache->rb_lock); for (i = 0; i <= mkey_cache_max_order(dev); i++) { - rb_key.ndescs = 1 << (i + 2); + rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i; ent = mlx5r_cache_create_ent_locked(dev, rb_key, true); if (IS_ERR(ent)) { ret = PTR_ERR(ent); @@ -1000,7 +984,6 @@ void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) return; mutex_lock(&dev->cache.rb_lock); - cancel_delayed_work(&dev->cache.remove_ent_dwork); for (node = rb_first(root); node; node = rb_next(node)) { ent = rb_entry(node, struct mlx5_cache_ent, node); spin_lock_irq(&ent->mkeys_queue.lock); @@ -1061,6 +1044,7 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) MLX5_SET(mkc, mkc, length64, 1); set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0, pd); + MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats)); err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); if (err) @@ -1125,12 +1109,10 @@ static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem, static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, struct ib_umem *umem, u64 iova, - int access_flags) + int access_flags, int access_mode) { - struct mlx5r_cache_rb_key rb_key = { - .access_mode = MLX5_MKC_ACCESS_MODE_MTT, - }; struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5r_cache_rb_key rb_key = {}; struct mlx5_cache_ent *ent; struct mlx5_ib_mr *mr; unsigned int page_size; @@ -1138,11 +1120,11 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, if (umem->is_dmabuf) page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova); else - page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, - 0, iova); + page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova); if (WARN_ON(!page_size)) return ERR_PTR(-EINVAL); + rb_key.access_mode = access_mode; rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size); rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags); rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags); @@ -1153,11 +1135,12 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, */ if (!ent) { mutex_lock(&dev->slow_path_mutex); - mr = reg_create(pd, umem, iova, access_flags, page_size, false); + mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode); mutex_unlock(&dev->slow_path_mutex); if (IS_ERR(mr)) return mr; mr->mmkey.rb_key = rb_key; + mr->mmkey.cacheable = true; return mr; } @@ -1173,13 +1156,71 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, return mr; } +static struct ib_mr * +reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags, + u32 crossed_lkey) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING; + struct mlx5_ib_mr *mr; + void *mkc; + int inlen; + u32 *in; + int err; + + if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey)) + return ERR_PTR(-EOPNOTSUPP); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_1; + } + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, crossing_target_vhca_id, + MLX5_CAP_GEN(dev->mdev, vhca_id)); + MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey); + MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); + + /* for this crossing mkey IOVA should be 0 and len should be IOVA + len */ + set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd); + MLX5_SET64(mkc, mkc, len, iova + length); + + MLX5_SET(mkc, mkc, free, 0); + MLX5_SET(mkc, mkc, umr_en, 0); + err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); + if (err) + goto err_2; + + mr->mmkey.type = MLX5_MKEY_MR; + set_mr_fields(dev, mr, length, access_flags, iova); + mr->ibmr.pd = pd; + kvfree(in); + mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key); + + return &mr->ibmr; +err_2: + kvfree(in); +err_1: + kfree(mr); + return ERR_PTR(err); +} + /* * If ibmr is NULL it will be allocated by reg_create. * Else, the given ibmr will be used. */ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags, - unsigned int page_size, bool populate) + unsigned int page_size, bool populate, + int access_mode) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr; @@ -1188,7 +1229,9 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, int inlen; u32 *in; int err; - bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); + bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) && + (access_mode == MLX5_MKC_ACCESS_MODE_MTT); + bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM); if (!page_size) return ERR_PTR(-EINVAL); @@ -1211,7 +1254,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, } pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); if (populate) { - if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) { + if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) { err = -EINVAL; goto err_2; } @@ -1227,14 +1270,22 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); set_mkc_access_pd_addr_fields(mkc, access_flags, iova, populate ? pd : dev->umrc.pd); + /* In case a data direct flow, overwrite the pdn field by its internal kernel PD */ + if (umem->is_dmabuf && ksm_mode) + MLX5_SET(mkc, mkc, pd, dev->ddr.pdn); + MLX5_SET(mkc, mkc, free, !populate); - MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); + MLX5_SET(mkc, mkc, access_mode_1_0, access_mode); MLX5_SET(mkc, mkc, umr_en, 1); MLX5_SET64(mkc, mkc, len, umem->length); MLX5_SET(mkc, mkc, bsf_octword_size, 0); - MLX5_SET(mkc, mkc, translations_octword_size, - get_octo_len(iova, umem->length, mr->page_shift)); + if (ksm_mode) + MLX5_SET(mkc, mkc, translations_octword_size, + get_octo_len(iova, umem->length, mr->page_shift) * 2); + else + MLX5_SET(mkc, mkc, translations_octword_size, + get_octo_len(iova, umem->length, mr->page_shift)); MLX5_SET(mkc, mkc, log_page_size, mr->page_shift); if (mlx5_umem_needs_ats(dev, umem, access_flags)) MLX5_SET(mkc, mkc, ma_translation_mode, 1); @@ -1371,13 +1422,15 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length); if (xlt_with_umr) { - mr = alloc_cacheable_mr(pd, umem, iova, access_flags); + mr = alloc_cacheable_mr(pd, umem, iova, access_flags, + MLX5_MKC_ACCESS_MODE_MTT); } else { - unsigned int page_size = mlx5_umem_find_best_pgsz( - umem, mkc, log_page_size, 0, iova); + unsigned int page_size = + mlx5_umem_mkc_find_best_pgsz(dev, umem, iova); mutex_lock(&dev->slow_path_mutex); - mr = reg_create(pd, umem, iova, access_flags, page_size, true); + mr = reg_create(pd, umem, iova, access_flags, page_size, + true, MLX5_MKC_ACCESS_MODE_MTT); mutex_unlock(&dev->slow_path_mutex); } if (IS_ERR(mr)) { @@ -1440,7 +1493,8 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, if (IS_ERR(odp)) return ERR_CAST(odp); - mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags); + mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags, + MLX5_MKC_ACCESS_MODE_MTT); if (IS_ERR(mr)) { ib_umem_release(&odp->umem); return ERR_CAST(mr); @@ -1468,6 +1522,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct ib_umem *umem; + int err; if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) return ERR_PTR(-EOPNOTSUPP); @@ -1475,6 +1530,10 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", start, iova, length, access_flags); + err = mlx5r_umr_resource_init(dev); + if (err) + return ERR_PTR(err); + if (access_flags & IB_ACCESS_ON_DEMAND) return create_user_odp_mr(pd, start, length, iova, access_flags, udata); @@ -1491,7 +1550,7 @@ static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); - if (!umem_dmabuf->sgt) + if (!umem_dmabuf->sgt || !mr) return; mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP); @@ -1503,31 +1562,31 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { .move_notify = mlx5_ib_dmabuf_invalidate_cb, }; -struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, - u64 length, u64 virt_addr, - int fd, int access_flags, - struct ib_udata *udata) +static struct ib_mr * +reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device, + u64 offset, u64 length, u64 virt_addr, + int fd, int access_flags, int access_mode) { + bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM); struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr = NULL; struct ib_umem_dmabuf *umem_dmabuf; int err; - if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || - !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) - return ERR_PTR(-EOPNOTSUPP); - - mlx5_ib_dbg(dev, - "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n", - offset, virt_addr, length, fd, access_flags); + err = mlx5r_umr_resource_init(dev); + if (err) + return ERR_PTR(err); - /* dmabuf requires xlt update via umr to work. */ - if (!mlx5r_umr_can_load_pas(dev, length)) - return ERR_PTR(-EINVAL); + if (!pinned_mode) + umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, + offset, length, fd, + access_flags, + &mlx5_ib_dmabuf_attach_ops); + else + umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev, + dma_device, offset, length, + fd, access_flags); - umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, - access_flags, - &mlx5_ib_dmabuf_attach_ops); if (IS_ERR(umem_dmabuf)) { mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n", PTR_ERR(umem_dmabuf)); @@ -1535,7 +1594,7 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, } mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr, - access_flags); + access_flags, access_mode); if (IS_ERR(mr)) { ib_umem_release(&umem_dmabuf->umem); return ERR_CAST(mr); @@ -1545,9 +1604,13 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages); umem_dmabuf->private = mr; - err = mlx5r_store_odp_mkey(dev, &mr->mmkey); - if (err) - goto err_dereg_mr; + if (!pinned_mode) { + err = mlx5r_store_odp_mkey(dev, &mr->mmkey); + if (err) + goto err_dereg_mr; + } else { + mr->data_direct = true; + } err = mlx5_ib_init_dmabuf_mr(mr); if (err) @@ -1555,10 +1618,101 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, return &mr->ibmr; err_dereg_mr: - mlx5_ib_dereg_mr(&mr->ibmr, NULL); + __mlx5_ib_dereg_mr(&mr->ibmr); return ERR_PTR(err); } +static struct ib_mr * +reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset, + u64 length, u64 virt_addr, + int fd, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_data_direct_dev *data_direct_dev; + struct ib_mr *crossing_mr; + struct ib_mr *crossed_mr; + int ret = 0; + + /* As of HW behaviour the IOVA must be page aligned in KSM mode */ + if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND)) + return ERR_PTR(-EOPNOTSUPP); + + mutex_lock(&dev->data_direct_lock); + data_direct_dev = dev->data_direct_dev; + if (!data_direct_dev) { + ret = -EINVAL; + goto end; + } + + /* The device's 'data direct mkey' was created without RO flags to + * simplify things and allow for a single mkey per device. + * Since RO is not a must, mask it out accordingly. + */ + access_flags &= ~IB_ACCESS_RELAXED_ORDERING; + crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev, + offset, length, virt_addr, fd, + access_flags, MLX5_MKC_ACCESS_MODE_KSM); + if (IS_ERR(crossed_mr)) { + ret = PTR_ERR(crossed_mr); + goto end; + } + + mutex_lock(&dev->slow_path_mutex); + crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags, + crossed_mr->lkey); + mutex_unlock(&dev->slow_path_mutex); + if (IS_ERR(crossing_mr)) { + __mlx5_ib_dereg_mr(crossed_mr); + ret = PTR_ERR(crossing_mr); + goto end; + } + + list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list); + to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr); + to_mmr(crossing_mr)->data_direct = true; +end: + mutex_unlock(&dev->data_direct_lock); + return ret ? ERR_PTR(ret) : crossing_mr; +} + +struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, + u64 length, u64 virt_addr, + int fd, int access_flags, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int mlx5_access_flags = 0; + int err; + + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || + !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) + return ERR_PTR(-EOPNOTSUPP); + + if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) { + err = uverbs_get_flags32(&mlx5_access_flags, attrs, + MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS, + MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT); + if (err) + return ERR_PTR(err); + } + + mlx5_ib_dbg(dev, + "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n", + offset, virt_addr, length, fd, access_flags, mlx5_access_flags); + + /* dmabuf requires xlt update via umr to work. */ + if (!mlx5r_umr_can_load_pas(dev, length)) + return ERR_PTR(-EINVAL); + + if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT) + return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr, + fd, access_flags); + + return reg_user_mr_dmabuf(pd, pd->device->dma_device, + offset, length, virt_addr, + fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT); +} + /* * True if the change in access flags can be done via UMR, only some access * flags can be updated. @@ -1570,7 +1724,8 @@ static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev, unsigned int diffs = current_access_flags ^ target_access_flags; if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING)) + IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING | + IB_ACCESS_REMOTE_ATOMIC)) return false; return mlx5r_umr_can_reconfig(dev, current_access_flags, target_access_flags); @@ -1589,8 +1744,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, if (!mlx5r_umr_can_load_pas(dev, new_umem->length)) return false; - *page_size = - mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova); + *page_size = mlx5_umem_mkc_find_best_pgsz(dev, new_umem, iova); if (WARN_ON(!*page_size)) return false; return (mr->mmkey.cache_ent->rb_key.ndescs) >= @@ -1653,7 +1807,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, struct mlx5_ib_mr *mr = to_mmr(ib_mr); int err; - if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct) return ERR_PTR(-EOPNOTSUPP); mlx5_ib_dbg( @@ -1781,7 +1935,8 @@ err: static void mlx5_free_priv_descs(struct mlx5_ib_mr *mr) { - if (!mr->umem && mr->descs) { + if (!mr->umem && !mr->data_direct && + mr->ibmr.type != IB_MR_TYPE_DM && mr->descs) { struct ib_device *device = mr->ibmr.device; int size = mr->max_descs * mr->desc_size; struct mlx5_ib_dev *dev = to_mdev(device); @@ -1835,7 +1990,86 @@ end: return ret; } -int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); + struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem); + int err; + + lockdep_assert_held(&dev->data_direct_lock); + mr->revoked = true; + err = mlx5r_umr_revoke_mr(mr); + if (WARN_ON(err)) + return err; + + ib_umem_dmabuf_revoke(umem_dmabuf); + return 0; +} + +void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_mr *mr, *next; + + lockdep_assert_held(&dev->data_direct_lock); + + list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) { + list_del(&mr->dd_node); + mlx5_ib_revoke_data_direct_mr(mr); + } +} + +static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); + struct mlx5_cache_ent *ent = mr->mmkey.cache_ent; + bool is_odp = is_odp_mr(mr); + bool is_odp_dma_buf = is_dmabuf_mr(mr) && + !to_ib_umem_dmabuf(mr->umem)->pinned; + int ret = 0; + + if (is_odp) + mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex); + + if (is_odp_dma_buf) + dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv, NULL); + + if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) { + ent = mr->mmkey.cache_ent; + /* upon storing to a clean temp entry - schedule its cleanup */ + spin_lock_irq(&ent->mkeys_queue.lock); + if (ent->is_tmp && !ent->tmp_cleanup_scheduled) { + mod_delayed_work(ent->dev->cache.wq, &ent->dwork, + msecs_to_jiffies(30 * 1000)); + ent->tmp_cleanup_scheduled = true; + } + spin_unlock_irq(&ent->mkeys_queue.lock); + goto out; + } + + if (ent) { + spin_lock_irq(&ent->mkeys_queue.lock); + ent->in_use--; + mr->mmkey.cache_ent = NULL; + spin_unlock_irq(&ent->mkeys_queue.lock); + } + ret = destroy_mkey(dev, mr); +out: + if (is_odp) { + if (!ret) + to_ib_umem_odp(mr->umem)->private = NULL; + mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex); + } + + if (is_odp_dma_buf) { + if (!ret) + to_ib_umem_dmabuf(mr->umem)->private = NULL; + dma_resv_unlock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv); + } + + return ret; +} + +static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr) { struct mlx5_ib_mr *mr = to_mmr(ibmr); struct mlx5_ib_dev *dev = to_mdev(ibmr->device); @@ -1880,16 +2114,9 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) } /* Stop DMA */ - if (mr->umem && mlx5r_umr_can_load_pas(dev, mr->umem->length)) - if (mlx5r_umr_revoke_mr(mr) || - cache_ent_find_and_store(dev, mr)) - mr->mmkey.cache_ent = NULL; - - if (!mr->mmkey.cache_ent) { - rc = destroy_mkey(to_mdev(mr->ibmr.device), mr); - if (rc) - return rc; - } + rc = mlx5_revoke_mr(mr); + if (rc) + return rc; if (mr->umem) { bool is_odp = is_odp_mr(mr); @@ -1909,9 +2136,40 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) return 0; } +static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev, + struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr; + int ret; + + ret = __mlx5_ib_dereg_mr(&mr->ibmr); + if (ret) + return ret; + + mutex_lock(&dev->data_direct_lock); + if (!dd_crossed_mr->revoked) + list_del(&dd_crossed_mr->dd_node); + + ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr); + mutex_unlock(&dev->data_direct_lock); + return ret; +} + +int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + + if (mr->data_direct) + return dereg_crossing_data_direct_mr(dev, mr); + + return __mlx5_ib_dereg_mr(ibmr); +} + static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, int access_mode, int page_shift) { + struct mlx5_ib_dev *dev = to_mdev(pd->device); void *mkc; mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); @@ -1924,6 +2182,9 @@ static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); MLX5_SET(mkc, mkc, umr_en, 1); MLX5_SET(mkc, mkc, log_page_size, page_shift); + if (access_mode == MLX5_MKC_ACCESS_MODE_PA || + access_mode == MLX5_MKC_ACCESS_MODE_MTT) + MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats)); } static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 4a04cbc5b78a..e77c9280c07e 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -45,7 +45,7 @@ /* Contains the details of a pagefault. */ struct mlx5_pagefault { u32 bytes_committed; - u32 token; + u64 token; u8 event_subtype; u8 type; union { @@ -74,6 +74,14 @@ struct mlx5_pagefault { u32 rdma_op_len; u64 rdma_va; } rdma; + struct { + u64 va; + u32 mkey; + u32 fault_byte_count; + u32 prefetch_before_byte_count; + u32 prefetch_after_byte_count; + u8 flags; + } memory; }; struct mlx5_ib_pf_eq *eq; @@ -99,13 +107,20 @@ static u64 mlx5_imr_ksm_entries; static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, struct mlx5_ib_mr *imr, int flags) { + struct mlx5_core_dev *dev = mr_to_mdev(imr)->mdev; struct mlx5_klm *end = pklm + nentries; + int step = MLX5_CAP_ODP(dev, mem_page_fault) ? MLX5_IMR_MTT_SIZE : 0; + __be32 key = MLX5_CAP_ODP(dev, mem_page_fault) ? + cpu_to_be32(imr->null_mmkey.key) : + mr_to_mdev(imr)->mkeys.null_mkey; + u64 va = + MLX5_CAP_ODP(dev, mem_page_fault) ? idx * MLX5_IMR_MTT_SIZE : 0; if (flags & MLX5_IB_UPD_XLT_ZAP) { - for (; pklm != end; pklm++, idx++) { + for (; pklm != end; pklm++, idx++, va += step) { pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); - pklm->key = mr_to_mdev(imr)->mkeys.null_mkey; - pklm->va = 0; + pklm->key = key; + pklm->va = cpu_to_be64(va); } return; } @@ -129,7 +144,7 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, */ lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex); - for (; pklm != end; pklm++, idx++) { + for (; pklm != end; pklm++, idx++, va += step) { struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx); pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); @@ -137,8 +152,8 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, pklm->key = cpu_to_be32(mtt->ibmr.lkey); pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE); } else { - pklm->key = mr_to_mdev(imr)->mkeys.null_mkey; - pklm->va = 0; + pklm->key = key; + pklm->va = cpu_to_be64(va); } } } @@ -213,10 +228,28 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; struct mlx5_ib_mr *imr = mr->parent; + /* + * If userspace is racing freeing the parent implicit ODP MR then we can + * loose the race with parent destruction. In this case + * mlx5_ib_free_odp_mr() will free everything in the implicit_children + * xarray so NOP is fine. This child MR cannot be destroyed here because + * we are under its umem_mutex. + */ if (!refcount_inc_not_zero(&imr->mmkey.usecount)) return; - xa_erase(&imr->implicit_children, idx); + xa_lock(&imr->implicit_children); + if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_KERNEL) != + mr) { + xa_unlock(&imr->implicit_children); + mlx5r_deref_odp_mkey(&imr->mmkey); + return; + } + + if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault)) + __xa_erase(&mr_to_mdev(mr)->odp_mkeys, + mlx5_base_mkey(mr->mmkey.key)); + xa_unlock(&imr->implicit_children); /* Freeing a MR is a sleeping operation, so bounce to a work queue */ INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work); @@ -250,6 +283,8 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, if (!umem_odp->npages) goto out; mr = umem_odp->private; + if (!mr) + goto out; start = max_t(u64, ib_umem_start(umem_odp), range->start); end = min_t(u64, ib_umem_end(umem_odp), range->end); @@ -295,7 +330,7 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, MLX5_IB_UPD_XLT_ZAP | MLX5_IB_UPD_XLT_ATOMIC); - mlx5_update_odp_stats(mr, invalidations, invalidations); + mlx5_update_odp_stats_with_handled(mr, invalidations, invalidations); /* * We are now sure that the device will not access the @@ -332,46 +367,46 @@ static void internal_fill_odp_caps(struct mlx5_ib_dev *dev) else dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT); - if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, ud_odp_caps.send)) caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND; - if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, ud_odp_caps.srq_receive)) caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.send)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.receive)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.write)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.read)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.atomic)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.srq_receive)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.send)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.receive)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.write)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.read)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.atomic)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.srq_receive)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) && @@ -388,13 +423,29 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ? pfault->wqe.wq_num : pfault->token; u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {}; + void *info; int err; MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME); - MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type); - MLX5_SET(page_fault_resume_in, in, token, pfault->token); - MLX5_SET(page_fault_resume_in, in, wq_number, wq_num); - MLX5_SET(page_fault_resume_in, in, error, !!error); + + if (pfault->event_subtype == MLX5_PFAULT_SUBTYPE_MEMORY) { + info = MLX5_ADDR_OF(page_fault_resume_in, in, + page_fault_info.mem_page_fault_info); + MLX5_SET(mem_page_fault_info, info, fault_token_31_0, + pfault->token & 0xffffffff); + MLX5_SET(mem_page_fault_info, info, fault_token_47_32, + (pfault->token >> 32) & 0xffff); + MLX5_SET(mem_page_fault_info, info, error, !!error); + } else { + info = MLX5_ADDR_OF(page_fault_resume_in, in, + page_fault_info.trans_page_fault_info); + MLX5_SET(trans_page_fault_info, info, page_fault_type, + pfault->type); + MLX5_SET(trans_page_fault_info, info, fault_token, + pfault->token); + MLX5_SET(trans_page_fault_info, info, wq_number, wq_num); + MLX5_SET(trans_page_fault_info, info, error, !!error); + } err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in); if (err) @@ -466,8 +517,18 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, refcount_inc(&ret->mmkey.usecount); goto out_lock; } - xa_unlock(&imr->implicit_children); + if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) { + ret = __xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), + &mr->mmkey, GFP_KERNEL); + if (xa_is_err(ret)) { + ret = ERR_PTR(xa_err(ret)); + __xa_erase(&imr->implicit_children, idx); + goto out_lock; + } + mr->mmkey.type = MLX5_MKEY_IMPLICIT_CHILD; + } + xa_unlock(&imr->implicit_children); mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr); return mr; @@ -478,6 +539,57 @@ out_mr: return ret; } +/* + * When using memory scheme ODP, implicit MRs can't use the reserved null mkey + * and each implicit MR needs to assign a private null mkey to get the page + * faults on. + * The null mkey is created with the properties to enable getting the page + * fault for every time it is accessed and having all relevant access flags. + */ +static int alloc_implicit_mr_null_mkey(struct mlx5_ib_dev *dev, + struct mlx5_ib_mr *imr, + struct mlx5_ib_pd *pd) +{ + size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + 64; + void *mkc; + u32 *in; + int err; + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(create_mkey_in, in, translations_octword_actual_size, 4); + MLX5_SET(create_mkey_in, in, pg_access, 1); + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, a, 1); + MLX5_SET(mkc, mkc, rw, 1); + MLX5_SET(mkc, mkc, rr, 1); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, free, 0); + MLX5_SET(mkc, mkc, umr_en, 0); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); + + MLX5_SET(mkc, mkc, translations_octword_size, 4); + MLX5_SET(mkc, mkc, log_page_size, 61); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, pd, pd->pdn); + MLX5_SET64(mkc, mkc, start_addr, 0); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + + err = mlx5_core_create_mkey(dev->mdev, &imr->null_mmkey.key, in, inlen); + if (err) + goto free_in; + + imr->null_mmkey.type = MLX5_MKEY_NULL; + +free_in: + kfree(in); + return err; +} + struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, int access_flags) { @@ -510,6 +622,16 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, imr->is_odp_implicit = true; xa_init(&imr->implicit_children); + if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) { + err = alloc_implicit_mr_null_mkey(dev, imr, pd); + if (err) + goto out_mr; + + err = mlx5r_store_odp_mkey(dev, &imr->null_mmkey); + if (err) + goto out_mr; + } + err = mlx5r_umr_update_xlt(imr, 0, mlx5_imr_ksm_entries, MLX5_KSM_PAGE_SHIFT, @@ -544,6 +666,14 @@ void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr) xa_erase(&mr->implicit_children, idx); mlx5_ib_dereg_mr(&mtt->ibmr, NULL); } + + if (mr->null_mmkey.key) { + xa_erase(&mr_to_mdev(mr)->odp_mkeys, + mlx5_base_mkey(mr->null_mmkey.key)); + + mlx5_core_destroy_mkey(mr_to_mdev(mr)->mdev, + mr->null_mmkey.key); + } } #define MLX5_PF_FLAGS_DOWNGRADE BIT(1) @@ -693,7 +823,7 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem); u32 xlt_flags = 0; int err; - unsigned int page_size; + unsigned long page_size; if (flags & MLX5_PF_FLAGS_ENABLE) xlt_flags |= MLX5_IB_UPD_XLT_ENABLE; @@ -705,14 +835,15 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, return err; } - page_size = mlx5_umem_find_best_pgsz(&umem_dmabuf->umem, mkc, - log_page_size, 0, - umem_dmabuf->umem.iova); - if (unlikely(page_size < PAGE_SIZE)) { + page_size = mlx5_umem_dmabuf_find_best_pgsz(umem_dmabuf); + if (!page_size) { ib_umem_dmabuf_unmap_pages(umem_dmabuf); err = -EINVAL; } else { - err = mlx5r_umr_update_mr_pas(mr, xlt_flags); + if (mr->data_direct) + err = mlx5r_umr_update_data_direct_ksm_pas(mr, xlt_flags); + else + err = mlx5r_umr_update_mr_pas(mr, xlt_flags); } dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); @@ -735,24 +866,31 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, * >0: Number of pages mapped */ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, - u32 *bytes_mapped, u32 flags) + u32 *bytes_mapped, u32 flags, bool permissive_fault) { struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); - if (unlikely(io_virt < mr->ibmr.iova)) + if (unlikely(io_virt < mr->ibmr.iova) && !permissive_fault) return -EFAULT; if (mr->umem->is_dmabuf) return pagefault_dmabuf_mr(mr, bcnt, bytes_mapped, flags); if (!odp->is_implicit_odp) { + u64 offset = io_virt < mr->ibmr.iova ? 0 : io_virt - mr->ibmr.iova; u64 user_va; - if (check_add_overflow(io_virt - mr->ibmr.iova, - (u64)odp->umem.address, &user_va)) + if (check_add_overflow(offset, (u64)odp->umem.address, + &user_va)) return -EFAULT; - if (unlikely(user_va >= ib_umem_end(odp) || - ib_umem_end(odp) - user_va < bcnt)) + + if (permissive_fault) { + if (user_va < ib_umem_start(odp)) + user_va = ib_umem_start(odp); + if ((user_va + bcnt) > ib_umem_end(odp)) + bcnt = ib_umem_end(odp) - user_va; + } else if (unlikely(user_va >= ib_umem_end(odp) || + ib_umem_end(odp) - user_va < bcnt)) return -EFAULT; return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped, flags); @@ -799,11 +937,31 @@ static bool mkey_is_eq(struct mlx5_ib_mkey *mmkey, u32 key) return mmkey->key == key; } +static struct mlx5_ib_mkey *find_odp_mkey(struct mlx5_ib_dev *dev, u32 key) +{ + struct mlx5_ib_mkey *mmkey; + + xa_lock(&dev->odp_mkeys); + mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key)); + if (!mmkey) { + mmkey = ERR_PTR(-ENOENT); + goto out; + } + if (!mkey_is_eq(mmkey, key)) { + mmkey = ERR_PTR(-EFAULT); + goto out; + } + refcount_inc(&mmkey->usecount); +out: + xa_unlock(&dev->odp_mkeys); + + return mmkey; +} + /* * Handle a single data segment in a page-fault WQE or RDMA region. * - * Returns number of OS pages retrieved on success. The caller may continue to - * the next data segment. + * Returns zero on success. The caller may continue to the next data segment. * Can return the following error codes: * -EAGAIN to designate a temporary error. The caller will abort handling the * page fault and resolve it. @@ -816,7 +974,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, u32 *bytes_committed, u32 *bytes_mapped) { - int npages = 0, ret, i, outlen, cur_outlen = 0, depth = 0; + int ret, i, outlen, cur_outlen = 0, depth = 0, pages_in_range; struct pf_frame *head = NULL, *frame; struct mlx5_ib_mkey *mmkey; struct mlx5_ib_mr *mr; @@ -826,32 +984,24 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, io_virt += *bytes_committed; bcnt -= *bytes_committed; - next_mr: - xa_lock(&dev->odp_mkeys); - mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key)); - if (!mmkey) { - xa_unlock(&dev->odp_mkeys); - mlx5_ib_dbg( - dev, - "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", - key); - if (bytes_mapped) - *bytes_mapped += bcnt; - /* - * The user could specify a SGL with multiple lkeys and only - * some of them are ODP. Treat the non-ODP ones as fully - * faulted. - */ - ret = 0; - goto end; - } - refcount_inc(&mmkey->usecount); - xa_unlock(&dev->odp_mkeys); - - if (!mkey_is_eq(mmkey, key)) { - mlx5_ib_dbg(dev, "failed to find mkey %x\n", key); - ret = -EFAULT; + mmkey = find_odp_mkey(dev, key); + if (IS_ERR(mmkey)) { + ret = PTR_ERR(mmkey); + if (ret == -ENOENT) { + mlx5_ib_dbg( + dev, + "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", + key); + if (bytes_mapped) + *bytes_mapped += bcnt; + /* + * The user could specify a SGL with multiple lkeys and + * only some of them are ODP. Treat the non-ODP ones as + * fully faulted. + */ + ret = 0; + } goto end; } @@ -859,13 +1009,20 @@ next_mr: case MLX5_MKEY_MR: mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); - ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0); + pages_in_range = (ALIGN(io_virt + bcnt, PAGE_SIZE) - + (io_virt & PAGE_MASK)) >> + PAGE_SHIFT; + ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0, false); if (ret < 0) goto end; - mlx5_update_odp_stats(mr, faults, ret); + mlx5_update_odp_stats_with_handled(mr, faults, ret); + + if (ret < pages_in_range) { + ret = -EFAULT; + goto end; + } - npages += ret; ret = 0; break; @@ -946,7 +1103,7 @@ next_mr: } end: - if (mmkey) + if (!IS_ERR(mmkey)) mlx5r_deref_odp_mkey(mmkey); while (head) { frame = head; @@ -956,7 +1113,7 @@ end: kfree(out); *bytes_committed = 0; - return ret ? ret : npages; + return ret; } /* @@ -975,8 +1132,7 @@ end: * the committed bytes). * @receive_queue: receive WQE end of sg list * - * Returns the number of pages loaded if positive, zero for an empty WQE, or a - * negative error code. + * Returns zero for success or a negative error code. */ static int pagefault_data_segments(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault, @@ -984,7 +1140,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, void *wqe_end, u32 *bytes_mapped, u32 *total_wqe_bytes, bool receive_queue) { - int ret = 0, npages = 0; + int ret = 0; u64 io_virt; __be32 key; u32 byte_count; @@ -1041,10 +1197,9 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, bytes_mapped); if (ret < 0) break; - npages += ret; } - return ret < 0 ? ret : npages; + return ret; } /* @@ -1268,7 +1423,7 @@ read_user: if (ret) mlx5_ib_err( dev, - "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n", + "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %llx\n", ret, wqe_index, pfault->token); resolve_page_fault: @@ -1280,12 +1435,6 @@ resolve_page_fault: free_page((unsigned long)wqe_start); } -static int pages_in_range(u64 address, u32 length) -{ - return (ALIGN(address + length, PAGE_SIZE) - - (address & PAGE_MASK)) >> PAGE_SHIFT; -} - static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault) { @@ -1324,16 +1473,16 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, if (ret == -EAGAIN) { /* We're racing with an invalidation, don't prefetch */ prefetch_activated = 0; - } else if (ret < 0 || pages_in_range(address, length) > ret) { + } else if (ret < 0) { mlx5_ib_page_fault_resume(dev, pfault, 1); if (ret != -ENOENT) - mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n", + mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%llx, type: 0x%x\n", ret, pfault->token, pfault->type); return; } mlx5_ib_page_fault_resume(dev, pfault, 0); - mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n", + mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%llx, type: 0x%x, prefetch_activated: %d\n", pfault->token, pfault->type, prefetch_activated); @@ -1349,12 +1498,80 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, prefetch_len, &bytes_committed, NULL); if (ret < 0 && ret != -EAGAIN) { - mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n", + mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%llx, address: 0x%.16llx, length = 0x%.16x\n", ret, pfault->token, address, prefetch_len); } } } +#define MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST BIT(7) +static void mlx5_ib_mr_memory_pfault_handler(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault) +{ + u64 prefetch_va = + pfault->memory.va - pfault->memory.prefetch_before_byte_count; + size_t prefetch_size = pfault->memory.prefetch_before_byte_count + + pfault->memory.fault_byte_count + + pfault->memory.prefetch_after_byte_count; + struct mlx5_ib_mkey *mmkey; + struct mlx5_ib_mr *mr, *child_mr; + int ret = 0; + + mmkey = find_odp_mkey(dev, pfault->memory.mkey); + if (IS_ERR(mmkey)) + goto err; + + switch (mmkey->type) { + case MLX5_MKEY_IMPLICIT_CHILD: + child_mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + mr = child_mr->parent; + break; + case MLX5_MKEY_NULL: + mr = container_of(mmkey, struct mlx5_ib_mr, null_mmkey); + break; + default: + mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + break; + } + + /* If prefetch fails, handle only demanded page fault */ + ret = pagefault_mr(mr, prefetch_va, prefetch_size, NULL, 0, true); + if (ret < 0) { + ret = pagefault_mr(mr, pfault->memory.va, + pfault->memory.fault_byte_count, NULL, 0, + true); + if (ret < 0) + goto err; + } + + mlx5_update_odp_stats_with_handled(mr, faults, ret); + mlx5r_deref_odp_mkey(mmkey); + + if (pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST) + mlx5_ib_page_fault_resume(dev, pfault, 0); + + mlx5_ib_dbg( + dev, + "PAGE FAULT completed %s. token 0x%llx, mkey: 0x%x, va: 0x%llx, byte_count: 0x%x\n", + pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST ? + "" : + "without resume cmd", + pfault->token, pfault->memory.mkey, pfault->memory.va, + pfault->memory.fault_byte_count); + + return; + +err: + if (!IS_ERR(mmkey)) + mlx5r_deref_odp_mkey(mmkey); + mlx5_ib_page_fault_resume(dev, pfault, 1); + mlx5_ib_dbg( + dev, + "PAGE FAULT error. token 0x%llx, mkey: 0x%x, va: 0x%llx, byte_count: 0x%x, err: %d\n", + pfault->token, pfault->memory.mkey, pfault->memory.va, + pfault->memory.fault_byte_count, ret); +} + static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault) { u8 event_subtype = pfault->event_subtype; @@ -1366,6 +1583,9 @@ static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfaul case MLX5_PFAULT_SUBTYPE_RDMA: mlx5_ib_mr_rdma_pfault_handler(dev, pfault); break; + case MLX5_PFAULT_SUBTYPE_MEMORY: + mlx5_ib_mr_memory_pfault_handler(dev, pfault); + break; default: mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n", event_subtype); @@ -1384,6 +1604,7 @@ static void mlx5_ib_eqe_pf_action(struct work_struct *work) mempool_free(pfault, eq->pool); } +#define MEMORY_SCHEME_PAGE_FAULT_GRANULARITY 4096 static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) { struct mlx5_eqe_page_fault *pf_eqe; @@ -1400,15 +1621,12 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) pf_eqe = &eqe->data.page_fault; pfault->event_subtype = eqe->sub_type; - pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed); - - mlx5_ib_dbg(eq->dev, - "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n", - eqe->sub_type, pfault->bytes_committed); switch (eqe->sub_type) { case MLX5_PFAULT_SUBTYPE_RDMA: /* RDMA based event */ + pfault->bytes_committed = + be32_to_cpu(pf_eqe->rdma.bytes_committed); pfault->type = be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24; pfault->token = @@ -1422,10 +1640,12 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) be32_to_cpu(pf_eqe->rdma.rdma_op_len); pfault->rdma.rdma_va = be64_to_cpu(pf_eqe->rdma.rdma_va); - mlx5_ib_dbg(eq->dev, - "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n", - pfault->type, pfault->token, - pfault->rdma.r_key); + mlx5_ib_dbg( + eq->dev, + "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x, type:0x%x, token: 0x%06llx, r_key: 0x%08x\n", + eqe->sub_type, pfault->bytes_committed, + pfault->type, pfault->token, + pfault->rdma.r_key); mlx5_ib_dbg(eq->dev, "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n", pfault->rdma.rdma_op_len, @@ -1434,6 +1654,8 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) case MLX5_PFAULT_SUBTYPE_WQE: /* WQE based event */ + pfault->bytes_committed = + be32_to_cpu(pf_eqe->wqe.bytes_committed); pfault->type = (be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7; pfault->token = @@ -1445,11 +1667,47 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) be16_to_cpu(pf_eqe->wqe.wqe_index); pfault->wqe.packet_size = be16_to_cpu(pf_eqe->wqe.packet_length); - mlx5_ib_dbg(eq->dev, - "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n", - pfault->type, pfault->token, - pfault->wqe.wq_num, - pfault->wqe.wqe_index); + mlx5_ib_dbg( + eq->dev, + "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x, type:0x%x, token: 0x%06llx, wq_num: 0x%06x, wqe_index: 0x%04x\n", + eqe->sub_type, pfault->bytes_committed, + pfault->type, pfault->token, pfault->wqe.wq_num, + pfault->wqe.wqe_index); + break; + + case MLX5_PFAULT_SUBTYPE_MEMORY: + /* Memory based event */ + pfault->bytes_committed = 0; + pfault->token = + be32_to_cpu(pf_eqe->memory.token31_0) | + ((u64)be16_to_cpu(pf_eqe->memory.token47_32) + << 32); + pfault->memory.va = be64_to_cpu(pf_eqe->memory.va); + pfault->memory.mkey = be32_to_cpu(pf_eqe->memory.mkey); + pfault->memory.fault_byte_count = (be32_to_cpu( + pf_eqe->memory.demand_fault_pages) >> 12) * + MEMORY_SCHEME_PAGE_FAULT_GRANULARITY; + pfault->memory.prefetch_before_byte_count = + be16_to_cpu( + pf_eqe->memory.pre_demand_fault_pages) * + MEMORY_SCHEME_PAGE_FAULT_GRANULARITY; + pfault->memory.prefetch_after_byte_count = + be16_to_cpu( + pf_eqe->memory.post_demand_fault_pages) * + MEMORY_SCHEME_PAGE_FAULT_GRANULARITY; + pfault->memory.flags = pf_eqe->memory.flags; + mlx5_ib_dbg( + eq->dev, + "PAGE_FAULT: subtype: 0x%02x, token: 0x%06llx, mkey: 0x%06x, fault_byte_count: 0x%06x, va: 0x%016llx, flags: 0x%02x\n", + eqe->sub_type, pfault->token, + pfault->memory.mkey, + pfault->memory.fault_byte_count, + pfault->memory.va, pfault->memory.flags); + mlx5_ib_dbg( + eq->dev, + "PAGE_FAULT: prefetch size: before: 0x%06x, after 0x%06x\n", + pfault->memory.prefetch_before_byte_count, + pfault->memory.prefetch_after_byte_count); break; default: @@ -1712,7 +1970,7 @@ static void mlx5_ib_prefetch_mr_work(struct work_struct *w) for (i = 0; i < work->num_sge; ++i) { ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt, work->frags[i].length, &bytes_mapped, - work->pf_flags); + work->pf_flags, false); if (ret <= 0) continue; mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret); @@ -1763,7 +2021,7 @@ static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, if (IS_ERR(mr)) return PTR_ERR(mr); ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length, - &bytes_mapped, pf_flags); + &bytes_mapped, pf_flags, false); if (ret < 0) { mlx5r_deref_odp_mkey(&mr->mmkey); return ret; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 8115ab107149..88724d15705d 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1107,8 +1107,6 @@ static int _create_kernel_qp(struct mlx5_ib_dev *dev, if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) qp->bf.bfreg = &dev->fp_bfreg; - else if (qp->flags & MLX5_IB_QP_CREATE_WC_TEST) - qp->bf.bfreg = &dev->wc_bfreg; else qp->bf.bfreg = &dev->bfreg; @@ -1962,7 +1960,7 @@ static int atomic_size_to_mode(int size_mask) } static int get_atomic_mode(struct mlx5_ib_dev *dev, - enum ib_qp_type qp_type) + struct mlx5_ib_qp *qp) { u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); u8 atomic = MLX5_CAP_GEN(dev->mdev, atomic); @@ -1972,7 +1970,7 @@ static int get_atomic_mode(struct mlx5_ib_dev *dev, if (!atomic) return -EOPNOTSUPP; - if (qp_type == MLX5_IB_QPT_DCT) + if (qp->type == MLX5_IB_QPT_DCT) atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); else atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); @@ -1986,6 +1984,10 @@ static int get_atomic_mode(struct mlx5_ib_dev *dev, atomic_operations & MLX5_ATOMIC_OPS_FETCH_ADD)) atomic_mode = MLX5_ATOMIC_MODE_IB_COMP; + /* OOO DP QPs do not support larger than 8-Bytes atomic operations */ + if (atomic_mode > MLX5_ATOMIC_MODE_8B && qp->is_ooo_rq) + atomic_mode = MLX5_ATOMIC_MODE_8B; + return atomic_mode; } @@ -2841,6 +2843,29 @@ static int check_valid_flow(struct mlx5_ib_dev *dev, struct ib_pd *pd, return 0; } +static bool get_dp_ooo_cap(struct mlx5_core_dev *mdev, enum ib_qp_type qp_type) +{ + if (!MLX5_CAP_GEN_2(mdev, dp_ordering_force)) + return false; + + switch (qp_type) { + case IB_QPT_RC: + return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_rc); + case IB_QPT_XRC_INI: + case IB_QPT_XRC_TGT: + return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_xrc); + case IB_QPT_UC: + return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_uc); + case IB_QPT_UD: + return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_ud); + case MLX5_IB_QPT_DCI: + case MLX5_IB_QPT_DCT: + return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_dc); + default: + return false; + } +} + static void process_vendor_flag(struct mlx5_ib_dev *dev, int *flags, int flag, bool cond, struct mlx5_ib_qp *qp) { @@ -2959,14 +2984,6 @@ static void process_create_flag(struct mlx5_ib_dev *dev, int *flags, int flag, return; } - if (flag == MLX5_IB_QP_CREATE_WC_TEST) { - /* - * Special case, if condition didn't meet, it won't be error, - * just different in-kernel flow. - */ - *flags &= ~MLX5_IB_QP_CREATE_WC_TEST; - return; - } mlx5_ib_dbg(dev, "Verbs create QP flag 0x%X is not supported\n", flag); } @@ -3027,8 +3044,6 @@ static int process_create_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, IB_QP_CREATE_PCI_WRITE_END_PADDING, MLX5_CAP_GEN(mdev, end_pad), qp); - process_create_flag(dev, &create_flags, MLX5_IB_QP_CREATE_WC_TEST, - qp_type != MLX5_IB_QPT_REG_UMR, qp); process_create_flag(dev, &create_flags, MLX5_IB_QP_CREATE_SQPN_QP1, true, qp); @@ -3097,7 +3112,6 @@ static int create_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, switch (qp->type) { case MLX5_IB_QPT_DCT: err = create_dct(dev, pd, qp, params); - rdma_restrack_no_track(&qp->ibqp.res); break; case MLX5_IB_QPT_DCI: err = create_dci(dev, pd, qp, params); @@ -3109,9 +3123,9 @@ static int create_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, err = mlx5_ib_create_gsi(pd, qp, params->attr); break; case MLX5_IB_QPT_HW_GSI: - case MLX5_IB_QPT_REG_UMR: rdma_restrack_no_track(&qp->ibqp.res); fallthrough; + case MLX5_IB_QPT_REG_UMR: default: if (params->udata) err = create_user_qp(dev, pd, qp, params); @@ -3247,6 +3261,10 @@ int mlx5_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, enum ib_qp_type type; int err; + err = mlx5_ib_dev_res_srq_init(dev); + if (err) + return err; + err = check_qp_type(dev, attr, &type); if (err) return err; @@ -3374,7 +3392,7 @@ static int set_qpc_atomic_flags(struct mlx5_ib_qp *qp, if (access_flags & IB_ACCESS_REMOTE_ATOMIC) { int atomic_mode; - atomic_mode = get_atomic_mode(dev, qp->type); + atomic_mode = get_atomic_mode(dev, qp); if (atomic_mode < 0) return -EOPNOTSUPP; @@ -3429,11 +3447,11 @@ static int ib_to_mlx5_rate_map(u8 rate) return 0; } -static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) +int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate) { u32 stat_rate_support; - if (rate == IB_RATE_PORT_CURRENT) + if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS) return 0; if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_800_GBPS) @@ -3578,7 +3596,7 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, sizeof(grh->dgid.raw)); } - err = ib_rate_to_mlx5(dev, rdma_ah_get_static_rate(ah)); + err = mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah)); if (err < 0) return err; MLX5_SET(ads, path, stat_rate, err); @@ -4226,7 +4244,12 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, /* todo implement counter_index functionality */ - if (is_sqp(qp->type)) + if (dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI && is_qp0(qp->type)) { + MLX5_SET(ads, pri_path, vhca_port_num, + smi_to_native_portnum(dev, qp->port)); + if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) + MLX5_SET(ads, pri_path, plane_index, qp->port); + } else if (is_sqp(qp->type)) MLX5_SET(ads, pri_path, vhca_port_num, qp->port); if (attr_mask & IB_QP_PORT) @@ -4272,14 +4295,14 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, MLX5_SET(qpc, qpc, retry_count, attr->retry_cnt); if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic) - MLX5_SET(qpc, qpc, log_sra_max, ilog2(attr->max_rd_atomic)); + MLX5_SET(qpc, qpc, log_sra_max, fls(attr->max_rd_atomic - 1)); if (attr_mask & IB_QP_SQ_PSN) MLX5_SET(qpc, qpc, next_send_psn, attr->sq_psn); if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && attr->max_dest_rd_atomic) MLX5_SET(qpc, qpc, log_rra_max, - ilog2(attr->max_dest_rd_atomic)); + fls(attr->max_dest_rd_atomic - 1)); if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) { err = set_qpc_atomic_flags(qp, attr, attr_mask, qpc); @@ -4320,6 +4343,11 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (qp->flags & MLX5_IB_QP_CREATE_SQPN_QP1) MLX5_SET(qpc, qpc, deth_sqpn, 1); + if (qp->is_ooo_rq && cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + MLX5_SET(qpc, qpc, dp_ordering_1, 1); + MLX5_SET(qpc, qpc, dp_ordering_force, 1); + } + mlx5_cur = to_mlx5_state(cur_state); mlx5_new = to_mlx5_state(new_state); @@ -4535,7 +4563,7 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) { int atomic_mode; - atomic_mode = get_atomic_mode(dev, MLX5_IB_QPT_DCT); + atomic_mode = get_atomic_mode(dev, qp); if (atomic_mode < 0) return -EOPNOTSUPP; @@ -4551,6 +4579,8 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, set_id = mlx5_ib_get_counters_id(dev, attr->port_num - 1); MLX5_SET(dctc, dctc, counter_set_id, set_id); + + qp->port = attr->port_num; } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { struct mlx5_ib_modify_qp_resp resp = {}; u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {}; @@ -4577,6 +4607,10 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit); if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) MLX5_SET(dctc, dctc, eth_prio, attr->ah_attr.sl & 0x7); + if (qp->is_ooo_rq) { + MLX5_SET(dctc, dctc, dp_ordering_1, 1); + MLX5_SET(dctc, dctc, dp_ordering_force, 1); + } err = mlx5_core_create_dct(dev, &qp->dct.mdct, qp->dct.in, MLX5_ST_SZ_BYTES(create_dct_in), out, @@ -4610,10 +4644,6 @@ static bool mlx5_ib_modify_qp_allowed(struct mlx5_ib_dev *dev, if (qp->type == IB_QPT_RAW_PACKET || qp->type == MLX5_IB_QPT_REG_UMR) return true; - /* Internal QP used for wc testing, with NOPs in wq */ - if (qp->flags & MLX5_IB_QP_CREATE_WC_TEST) - return true; - return false; } @@ -4684,11 +4714,16 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, min(udata->inlen, sizeof(ucmd)))) return -EFAULT; - if (ucmd.comp_mask || + if (ucmd.comp_mask & ~MLX5_IB_MODIFY_QP_OOO_DP || memchr_inv(&ucmd.burst_info.reserved, 0, sizeof(ucmd.burst_info.reserved))) return -EOPNOTSUPP; + if (ucmd.comp_mask & MLX5_IB_MODIFY_QP_OOO_DP) { + if (!get_dp_ooo_cap(dev->mdev, qp->type)) + return -EOPNOTSUPP; + qp->is_ooo_rq = 1; + } } if (qp->type == IB_QPT_GSI) @@ -5041,7 +5076,7 @@ static int mlx5_ib_dct_query_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *mqp, } if (qp_attr_mask & IB_QP_PORT) - qp_attr->port_num = MLX5_GET(dctc, dctc, port); + qp_attr->port_num = mqp->port; if (qp_attr_mask & IB_QP_MIN_RNR_TIMER) qp_attr->min_rnr_timer = MLX5_GET(dctc, dctc, min_rnr_nak); if (qp_attr_mask & IB_QP_AV) { diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h index b6ee7c3ee1ca..2530e7730635 100644 --- a/drivers/infiniband/hw/mlx5/qp.h +++ b/drivers/infiniband/hw/mlx5/qp.h @@ -56,4 +56,5 @@ int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn); int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter); int mlx5_ib_qp_event_init(void); void mlx5_ib_qp_event_cleanup(void); +int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate); #endif /* _MLX5_IB_QP_H */ diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c index d9cf6982d645..d3dcc272200a 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -249,7 +249,8 @@ int mlx5_qpc_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, if (err) goto err_cmd; - mlx5_debug_qp_add(dev->mdev, qp); + if (dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI) + mlx5_debug_qp_add(dev->mdev, qp); return 0; @@ -307,7 +308,8 @@ int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp) { u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; - mlx5_debug_qp_remove(dev->mdev, qp); + if (dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI) + mlx5_debug_qp_remove(dev->mdev, qp); destroy_resource_common(dev, qp); @@ -504,7 +506,9 @@ int mlx5_init_qp_table(struct mlx5_ib_dev *dev) spin_lock_init(&table->lock); INIT_RADIX_TREE(&table->tree, GFP_ATOMIC); xa_init(&table->dct_xa); - mlx5_qp_debugfs_init(dev->mdev); + + if (dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI) + mlx5_qp_debugfs_init(dev->mdev); table->nb.notifier_call = rsc_event_notifier; mlx5_notifier_register(dev->mdev, &table->nb); @@ -517,7 +521,8 @@ void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev) struct mlx5_qp_table *table = &dev->qp_table; mlx5_notifier_unregister(dev->mdev, &table->nb); - mlx5_qp_debugfs_cleanup(dev->mdev); + if (dev->ib_dev.type != RDMA_DEVICE_TYPE_SMI) + mlx5_qp_debugfs_cleanup(dev->mdev); } int mlx5_core_qp_query(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, diff --git a/drivers/infiniband/hw/mlx5/restrack.c b/drivers/infiniband/hw/mlx5/restrack.c index 4ac429e72004..67841922c7b8 100644 --- a/drivers/infiniband/hw/mlx5/restrack.c +++ b/drivers/infiniband/hw/mlx5/restrack.c @@ -96,9 +96,18 @@ static int fill_stat_mr_entry(struct sk_buff *msg, struct ib_mr *ibmr) atomic64_read(&mr->odp_stats.faults))) goto err_table; if (rdma_nl_stat_hwcounter_entry( + msg, "page_faults_handled", + atomic64_read(&mr->odp_stats.faults_handled))) + goto err_table; + if (rdma_nl_stat_hwcounter_entry( msg, "page_invalidations", atomic64_read(&mr->odp_stats.invalidations))) goto err_table; + if (rdma_nl_stat_hwcounter_entry( + msg, "page_invalidations_handled", + atomic64_read(&mr->odp_stats.invalidations_handled))) + goto err_table; + if (rdma_nl_stat_hwcounter_entry(msg, "page_prefetch", atomic64_read(&mr->odp_stats.prefetch))) goto err_table; @@ -156,6 +165,34 @@ static int fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ibcq) return fill_res_raw(msg, dev, MLX5_SGMT_TYPE_PRM_QUERY_CQ, cq->mcq.cqn); } +static int fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ibqp) +{ + struct mlx5_ib_qp *qp = to_mqp(ibqp); + int ret; + + if (qp->type < IB_QPT_DRIVER) + return 0; + + switch (qp->type) { + case MLX5_IB_QPT_REG_UMR: + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUBTYPE, + "REG_UMR"); + break; + case MLX5_IB_QPT_DCT: + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUBTYPE, "DCT"); + break; + case MLX5_IB_QPT_DCI: + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUBTYPE, "DCI"); + break; + default: + return 0; + } + if (ret) + return ret; + + return nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, IB_QPT_DRIVER); +} + static int fill_res_qp_entry_raw(struct sk_buff *msg, struct ib_qp *ibqp) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); @@ -168,6 +205,7 @@ static const struct ib_device_ops restrack_ops = { .fill_res_cq_entry_raw = fill_res_cq_entry_raw, .fill_res_mr_entry = fill_res_mr_entry, .fill_res_mr_entry_raw = fill_res_mr_entry_raw, + .fill_res_qp_entry = fill_res_qp_entry, .fill_res_qp_entry_raw = fill_res_qp_entry_raw, .fill_stat_mr_entry = fill_stat_mr_entry, }; diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index a056ea835da5..bcb6b324af50 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -199,20 +199,27 @@ int mlx5_ib_create_srq(struct ib_srq *ib_srq, int err; struct mlx5_srq_attr in = {}; __u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); + __u32 max_sge_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq) / + sizeof(struct mlx5_wqe_data_seg); if (init_attr->srq_type != IB_SRQT_BASIC && init_attr->srq_type != IB_SRQT_XRC && init_attr->srq_type != IB_SRQT_TM) return -EOPNOTSUPP; - /* Sanity check SRQ size before proceeding */ - if (init_attr->attr.max_wr >= max_srq_wqes) { - mlx5_ib_dbg(dev, "max_wr %d, cap %d\n", - init_attr->attr.max_wr, - max_srq_wqes); + /* Sanity check SRQ and sge size before proceeding */ + if (init_attr->attr.max_wr >= max_srq_wqes || + init_attr->attr.max_sge > max_sge_sz) { + mlx5_ib_dbg(dev, "max_wr %d,wr_cap %d,max_sge %d, sge_cap:%d\n", + init_attr->attr.max_wr, max_srq_wqes, + init_attr->attr.max_sge, max_sge_sz); return -EINVAL; } + err = mlx5_ib_dev_res_cq_init(dev); + if (err) + return err; + mutex_init(&srq->mutex); spin_lock_init(&srq->lock); srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1); diff --git a/drivers/infiniband/hw/mlx5/std_types.c b/drivers/infiniband/hw/mlx5/std_types.c index bbfcce3bdc84..bdb568411091 100644 --- a/drivers/infiniband/hw/mlx5/std_types.c +++ b/drivers/infiniband/hw/mlx5/std_types.c @@ -10,6 +10,7 @@ #include <linux/mlx5/eswitch.h> #include <linux/mlx5/vport.h> #include "mlx5_ib.h" +#include "data_direct.h" #define UVERBS_MODULE_NAME mlx5_ib #include <rdma/uverbs_named_ioctl.h> @@ -111,6 +112,23 @@ out: return err; } +static int fill_multiport_info(struct mlx5_ib_dev *dev, u32 port_num, + struct mlx5_ib_uapi_query_port *info) +{ + struct mlx5_core_dev *mdev; + + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, NULL); + if (!mdev) + return -EINVAL; + + info->vport_vhca_id = MLX5_CAP_GEN(mdev, vhca_id); + info->flags |= MLX5_IB_UAPI_QUERY_PORT_VPORT_VHCA_ID; + + mlx5_ib_put_native_port_mdev(dev, port_num); + + return 0; +} + static int fill_switchdev_info(struct mlx5_ib_dev *dev, u32 port_num, struct mlx5_ib_uapi_query_port *info) { @@ -177,12 +195,60 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_QUERY_PORT)( ret = fill_switchdev_info(dev, port_num, &info); if (ret) return ret; + } else if (mlx5_core_mp_enabled(dev->mdev)) { + ret = fill_multiport_info(dev, port_num, &info); + if (ret) + return ret; } return uverbs_copy_to_struct_or_zero(attrs, MLX5_IB_ATTR_QUERY_PORT, &info, sizeof(info)); } +static int UVERBS_HANDLER(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH)( + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_data_direct_dev *data_direct_dev; + struct mlx5_ib_ucontext *c; + struct mlx5_ib_dev *dev; + int out_len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH); + u32 dev_path_len; + char *dev_path; + int ret; + + c = to_mucontext(ib_uverbs_get_ucontext(attrs)); + if (IS_ERR(c)) + return PTR_ERR(c); + dev = to_mdev(c->ibucontext.device); + mutex_lock(&dev->data_direct_lock); + data_direct_dev = dev->data_direct_dev; + if (!data_direct_dev) { + ret = -ENODEV; + goto end; + } + + dev_path = kobject_get_path(&data_direct_dev->device->kobj, GFP_KERNEL); + if (!dev_path) { + ret = -ENOMEM; + goto end; + } + + dev_path_len = strlen(dev_path) + 1; + if (dev_path_len > out_len) { + ret = -ENOSPC; + goto end; + } + + ret = uverbs_copy_to(attrs, MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH, dev_path, + dev_path_len); + kfree(dev_path); + +end: + mutex_unlock(&dev->data_direct_lock); + return ret; +} + DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_QUERY_PORT, UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_QUERY_PORT_PORT_NUM, @@ -193,9 +259,17 @@ DECLARE_UVERBS_NAMED_METHOD( reg_c0), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH, + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH, + UVERBS_ATTR_MIN_SIZE(0), + UA_MANDATORY)); + ADD_UVERBS_METHODS(mlx5_ib_device, UVERBS_OBJECT_DEVICE, - &UVERBS_METHOD(MLX5_IB_METHOD_QUERY_PORT)); + &UVERBS_METHOD(MLX5_IB_METHOD_QUERY_PORT), + &UVERBS_METHOD(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH)); DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_PD_QUERY, diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index e76142f6fa88..793f3c5c4d01 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -135,22 +135,28 @@ static int mlx5r_umr_qp_rst2rts(struct mlx5_ib_dev *dev, struct ib_qp *qp) int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev) { struct ib_qp_init_attr init_attr = {}; - struct ib_pd *pd; struct ib_cq *cq; struct ib_qp *qp; - int ret; + int ret = 0; - pd = ib_alloc_pd(&dev->ib_dev, 0); - if (IS_ERR(pd)) { - mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); - return PTR_ERR(pd); - } + + /* + * UMR qp is set once, never changed until device unload. + * Avoid taking the mutex if initialization is already done. + */ + if (dev->umrc.qp) + return 0; + + mutex_lock(&dev->umrc.init_lock); + /* First user allocates the UMR resources. Skip if already allocated. */ + if (dev->umrc.qp) + goto unlock; cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ); if (IS_ERR(cq)) { mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); ret = PTR_ERR(cq); - goto destroy_pd; + goto unlock; } init_attr.send_cq = cq; @@ -160,7 +166,7 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev) init_attr.cap.max_send_sge = 1; init_attr.qp_type = MLX5_IB_QPT_REG_UMR; init_attr.port_num = 1; - qp = ib_create_qp(pd, &init_attr); + qp = ib_create_qp(dev->umrc.pd, &init_attr); if (IS_ERR(qp)) { mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n"); ret = PTR_ERR(qp); @@ -171,22 +177,22 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev) if (ret) goto destroy_qp; - dev->umrc.qp = qp; dev->umrc.cq = cq; - dev->umrc.pd = pd; sema_init(&dev->umrc.sem, MAX_UMR_WR); mutex_init(&dev->umrc.lock); dev->umrc.state = MLX5_UMR_STATE_ACTIVE; + dev->umrc.qp = qp; + mutex_unlock(&dev->umrc.init_lock); return 0; destroy_qp: ib_destroy_qp(qp); destroy_cq: ib_free_cq(cq); -destroy_pd: - ib_dealloc_pd(pd); +unlock: + mutex_unlock(&dev->umrc.init_lock); return ret; } @@ -194,36 +200,38 @@ void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev) { if (dev->umrc.state == MLX5_UMR_STATE_UNINIT) return; + mutex_destroy(&dev->umrc.lock); + /* After device init, UMR cp/qp are not unset during the lifetime. */ ib_destroy_qp(dev->umrc.qp); ib_free_cq(dev->umrc.cq); - ib_dealloc_pd(dev->umrc.pd); } -static int mlx5r_umr_recover(struct mlx5_ib_dev *dev) +int mlx5r_umr_init(struct mlx5_ib_dev *dev) { - struct umr_common *umrc = &dev->umrc; - struct ib_qp_attr attr; - int err; + struct ib_pd *pd; - attr.qp_state = IB_QPS_RESET; - err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE); - if (err) { - mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); - goto err; + pd = ib_alloc_pd(&dev->ib_dev, 0); + if (IS_ERR(pd)) { + mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); + return PTR_ERR(pd); } + dev->umrc.pd = pd; - err = mlx5r_umr_qp_rst2rts(dev, umrc->qp); - if (err) - goto err; + mutex_init(&dev->umrc.init_lock); - umrc->state = MLX5_UMR_STATE_ACTIVE; return 0; +} -err: - umrc->state = MLX5_UMR_STATE_ERR; - return err; +void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev) +{ + if (!dev->umrc.pd) + return; + + mutex_destroy(&dev->umrc.init_lock); + ib_dealloc_pd(dev->umrc.pd); } + static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe, struct mlx5r_umr_wqe *wqe, bool with_data) { @@ -270,6 +278,61 @@ out: return err; } +static int mlx5r_umr_recover(struct mlx5_ib_dev *dev, u32 mkey, + struct mlx5r_umr_context *umr_context, + struct mlx5r_umr_wqe *wqe, bool with_data) +{ + struct umr_common *umrc = &dev->umrc; + struct ib_qp_attr attr; + int err; + + mutex_lock(&umrc->lock); + /* Preventing any further WRs to be sent now */ + if (umrc->state != MLX5_UMR_STATE_RECOVER) { + mlx5_ib_warn(dev, "UMR recovery encountered an unexpected state=%d\n", + umrc->state); + umrc->state = MLX5_UMR_STATE_RECOVER; + } + mutex_unlock(&umrc->lock); + + /* Sending a final/barrier WR (the failed one) and wait for its completion. + * This will ensure that all the previous WRs got a completion before + * we set the QP state to RESET. + */ + err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context->cqe, wqe, + with_data); + if (err) { + mlx5_ib_warn(dev, "UMR recovery post send failed, err %d\n", err); + goto err; + } + + /* Since the QP is in an error state, it will only receive + * IB_WC_WR_FLUSH_ERR. However, as it serves only as a barrier + * we don't care about its status. + */ + wait_for_completion(&umr_context->done); + + attr.qp_state = IB_QPS_RESET; + err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE); + if (err) { + mlx5_ib_warn(dev, "Couldn't modify UMR QP to RESET, err=%d\n", err); + goto err; + } + + err = mlx5r_umr_qp_rst2rts(dev, umrc->qp); + if (err) { + mlx5_ib_warn(dev, "Couldn't modify UMR QP to RTS, err=%d\n", err); + goto err; + } + + umrc->state = MLX5_UMR_STATE_ACTIVE; + return 0; + +err: + umrc->state = MLX5_UMR_STATE_ERR; + return err; +} + static void mlx5r_umr_done(struct ib_cq *cq, struct ib_wc *wc) { struct mlx5_ib_umr_context *context = @@ -334,9 +397,7 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey, mlx5_ib_warn(dev, "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs, mkey = %u\n", umr_context.status, mkey); - mutex_lock(&umrc->lock); - err = mlx5r_umr_recover(dev); - mutex_unlock(&umrc->lock); + err = mlx5r_umr_recover(dev, mkey, &umr_context, wqe, with_data); if (err) mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n", err); @@ -603,44 +664,47 @@ static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev, wqe->data_seg.byte_count = cpu_to_be32(sg->length); } -/* - * Send the DMA list to the HW for a normal MR using UMR. - * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP - * flag may be used. - */ -int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) +static int +_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd) { + size_t ent_size = dd ? sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt); struct mlx5_ib_dev *dev = mr_to_mdev(mr); struct device *ddev = &dev->mdev->pdev->dev; struct mlx5r_umr_wqe wqe = {}; struct ib_block_iter biter; + struct mlx5_ksm *cur_ksm; struct mlx5_mtt *cur_mtt; size_t orig_sg_length; - struct mlx5_mtt *mtt; size_t final_size; + void *curr_entry; struct ib_sge sg; + void *entry; u64 offset = 0; int err = 0; - if (WARN_ON(mr->umem->is_odp)) - return -EINVAL; - - mtt = mlx5r_umr_create_xlt( - dev, &sg, ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift), - sizeof(*mtt), flags); - if (!mtt) + entry = mlx5r_umr_create_xlt(dev, &sg, + ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift), + ent_size, flags); + if (!entry) return -ENOMEM; orig_sg_length = sg.length; - mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg); mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr, mr->page_shift); + if (dd) { + /* Use the data direct internal kernel PD */ + MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn); + cur_ksm = entry; + } else { + cur_mtt = entry; + } + mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg); - cur_mtt = mtt; + curr_entry = entry; rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) { - if (cur_mtt == (void *)mtt + sg.length) { + if (curr_entry == entry + sg.length) { dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); @@ -652,23 +716,31 @@ int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) DMA_TO_DEVICE); offset += sg.length; mlx5r_umr_update_offset(&wqe.ctrl_seg, offset); - - cur_mtt = mtt; + if (dd) + cur_ksm = entry; + else + cur_mtt = entry; } - cur_mtt->ptag = - cpu_to_be64(rdma_block_iter_dma_address(&biter) | - MLX5_IB_MTT_PRESENT); - - if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) - cur_mtt->ptag = 0; - - cur_mtt++; + if (dd) { + cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter)); + cur_ksm->key = cpu_to_be32(dev->ddr.mkey); + cur_ksm++; + curr_entry = cur_ksm; + } else { + cur_mtt->ptag = + cpu_to_be64(rdma_block_iter_dma_address(&biter) | + MLX5_IB_MTT_PRESENT); + if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) + cur_mtt->ptag = 0; + cur_mtt++; + curr_entry = cur_mtt; + } } - final_size = (void *)cur_mtt - (void *)mtt; + final_size = curr_entry - entry; sg.length = ALIGN(final_size, MLX5_UMR_FLEX_ALIGNMENT); - memset(cur_mtt, 0, sg.length - final_size); + memset(curr_entry, 0, sg.length - final_size); mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags); dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); @@ -676,10 +748,32 @@ int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) err: sg.length = orig_sg_length; - mlx5r_umr_unmap_free_xlt(dev, mtt, &sg); + mlx5r_umr_unmap_free_xlt(dev, entry, &sg); return err; } +int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags) +{ + /* No invalidation flow is expected */ + if (WARN_ON(!mr->umem->is_dmabuf) || (flags & MLX5_IB_UPD_XLT_ZAP)) + return -EINVAL; + + return _mlx5r_umr_update_mr_pas(mr, flags, true); +} + +/* + * Send the DMA list to the HW for a normal MR using UMR. + * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP + * flag may be used. + */ +int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) +{ + if (WARN_ON(mr->umem->is_odp)) + return -EINVAL; + + return _mlx5r_umr_update_mr_pas(mr, flags, false); +} + static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) { return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled); diff --git a/drivers/infiniband/hw/mlx5/umr.h b/drivers/infiniband/hw/mlx5/umr.h index 3799bb758e49..4a02c9b5aad8 100644 --- a/drivers/infiniband/hw/mlx5/umr.h +++ b/drivers/infiniband/hw/mlx5/umr.h @@ -16,6 +16,9 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev); void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev); +int mlx5r_umr_init(struct mlx5_ib_dev *dev); +void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev); + static inline bool mlx5r_umr_can_load_pas(struct mlx5_ib_dev *dev, size_t length) { @@ -92,6 +95,7 @@ int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr); int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd, int access_flags); int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags); +int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags); int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, int page_shift, int flags); diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index e1325f2927d6..6a1e2e79ddc3 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -574,8 +574,9 @@ static int mthca_destroy_qp(struct ib_qp *qp, struct ib_udata *udata) static int mthca_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; struct mthca_create_cq ucmd; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c index 5f831e3bdbad..0834416cb3f8 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c @@ -46,7 +46,7 @@ static struct dentry *ocrdma_dbgfs_dir; -static int ocrdma_add_stat(char *start, char *pcur, +static noinline_for_stack int ocrdma_add_stat(char *start, char *pcur, char *name, u64 count) { char buff[128] = {0}; @@ -99,7 +99,7 @@ void ocrdma_release_stats_resources(struct ocrdma_dev *dev) kfree(mem->debugfs_mem); } -static char *ocrdma_resource_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_resource_stats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = @@ -216,7 +216,7 @@ static char *ocrdma_resource_stats(struct ocrdma_dev *dev) return stats; } -static char *ocrdma_rx_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_rx_stats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = @@ -284,7 +284,7 @@ static u64 ocrdma_sysfs_rcv_data(struct ocrdma_dev *dev) rx_stats->roce_frame_bytes_hi))/4; } -static char *ocrdma_tx_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_tx_stats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = @@ -358,7 +358,7 @@ static u64 ocrdma_sysfs_xmit_data(struct ocrdma_dev *dev) tx_stats->read_rsp_bytes_hi))/4; } -static char *ocrdma_wqe_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_wqe_stats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = @@ -391,7 +391,7 @@ static char *ocrdma_wqe_stats(struct ocrdma_dev *dev) return stats; } -static char *ocrdma_db_errstats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_db_errstats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = @@ -412,7 +412,7 @@ static char *ocrdma_db_errstats(struct ocrdma_dev *dev) return stats; } -static char *ocrdma_rxqp_errstats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_rxqp_errstats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = @@ -438,7 +438,7 @@ static char *ocrdma_rxqp_errstats(struct ocrdma_dev *dev) return stats; } -static char *ocrdma_txqp_errstats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_txqp_errstats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; struct ocrdma_rdma_stats_resp *rdma_stats = @@ -462,7 +462,7 @@ static char *ocrdma_txqp_errstats(struct ocrdma_dev *dev) return stats; } -static char *ocrdma_tx_dbg_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_tx_dbg_stats(struct ocrdma_dev *dev) { int i; char *pstats = dev->stats_mem.debugfs_mem; @@ -480,7 +480,7 @@ static char *ocrdma_tx_dbg_stats(struct ocrdma_dev *dev) return dev->stats_mem.debugfs_mem; } -static char *ocrdma_rx_dbg_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_rx_dbg_stats(struct ocrdma_dev *dev) { int i; char *pstats = dev->stats_mem.debugfs_mem; @@ -498,7 +498,7 @@ static char *ocrdma_rx_dbg_stats(struct ocrdma_dev *dev) return dev->stats_mem.debugfs_mem; } -static char *ocrdma_driver_dbg_stats(struct ocrdma_dev *dev) +static noinline_for_stack char *ocrdma_driver_dbg_stats(struct ocrdma_dev *dev) { char *stats = dev->stats_mem.debugfs_mem, *pcur; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index c849fdbd4c99..979de8f8df14 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -963,8 +963,9 @@ err: } int ocrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index f860b7fcef33..0644346d8d98 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -70,7 +70,7 @@ int ocrdma_alloc_pd(struct ib_pd *pd, struct ib_udata *udata); int ocrdma_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata); int ocrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int ocrdma_resize_cq(struct ib_cq *, int cqe, struct ib_udata *); int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/qedr/qedr_iw_cm.c b/drivers/infiniband/hw/qedr/qedr_iw_cm.c index a51fc6854984..259303b9907c 100644 --- a/drivers/infiniband/hw/qedr/qedr_iw_cm.c +++ b/drivers/infiniband/hw/qedr/qedr_iw_cm.c @@ -447,7 +447,8 @@ qedr_addr4_resolve(struct qedr_dev *dev, struct rtable *rt = NULL; int rc = 0; - rt = ip_route_output(&init_net, dst_ip, src_ip, 0, 0); + rt = ip_route_output(&init_net, dst_ip, src_ip, 0, 0, + RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) { DP_ERR(dev, "ip_route_output returned error\n"); return -EINVAL; diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index f118ce0a9a61..568a5b18803f 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -900,8 +900,9 @@ int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) } int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; struct qedr_ucontext *ctx = rdma_udata_to_drv_context( udata, struct qedr_ucontext, ibucontext); diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h index 081753df79ef..5731458abb06 100644 --- a/drivers/infiniband/hw/qedr/verbs.h +++ b/drivers/infiniband/hw/qedr/verbs.h @@ -52,7 +52,7 @@ int qedr_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata); int qedr_alloc_xrcd(struct ib_xrcd *ibxrcd, struct ib_udata *udata); int qedr_dealloc_xrcd(struct ib_xrcd *ibxrcd, struct ib_udata *udata); int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); int qedr_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *attrs, diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index 26c615772be3..8ee4edd7883c 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -1359,7 +1359,6 @@ static inline u32 qib_get_rcvhdrtail(const struct qib_ctxtdata *rcd) * sysfs interface. */ -extern const char ib_qib_version[]; extern const struct attribute_group qib_attr_group; extern const struct attribute_group *qib_attr_port_groups[]; diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c index bf3fa12fe935..4fcbef99e400 100644 --- a/drivers/infiniband/hw/qib/qib_driver.c +++ b/drivers/infiniband/hw/qib/qib_driver.c @@ -44,12 +44,6 @@ #include "qib.h" -/* - * The size has to be longer than this string, so we can append - * board/chip information to it in the init code. - */ -const char ib_qib_version[] = QIB_DRIVER_VERSION "\n"; - DEFINE_MUTEX(qib_mutex); /* general driver use */ unsigned qib_ibmtu; diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c index 455e966eeff3..b27791029fa9 100644 --- a/drivers/infiniband/hw/qib/qib_fs.c +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -439,6 +439,7 @@ static int remove_device_files(struct super_block *sb, return PTR_ERR(dir); } simple_recursive_removal(dir, NULL); + dput(dir); return 0; } diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c index 6af57067c32e..78dfe98ebcf7 100644 --- a/drivers/infiniband/hw/qib/qib_iba7220.c +++ b/drivers/infiniband/hw/qib/qib_iba7220.c @@ -3281,7 +3281,7 @@ static int qib_7220_intr_fallback(struct qib_devdata *dd) qib_free_irq(dd); dd->msi_lo = 0; - if (pci_alloc_irq_vectors(dd->pcidev, 1, 1, PCI_IRQ_LEGACY) < 0) + if (pci_alloc_irq_vectors(dd->pcidev, 1, 1, PCI_IRQ_INTX) < 0) qib_dev_err(dd, "Failed to enable INTx\n"); qib_setup_7220_interrupt(dd); return 1; diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index f93906d8fc09..9db29916e35a 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -3471,8 +3471,7 @@ try_intx: pci_irq_vector(dd->pcidev, msixnum), ret); qib_7322_free_irq(dd); - pci_alloc_irq_vectors(dd->pcidev, 1, 1, - PCI_IRQ_LEGACY); + pci_alloc_irq_vectors(dd->pcidev, 1, 1, PCI_IRQ_INTX); goto try_intx; } dd->cspec->msix_entries[msixnum].arg = arg; @@ -5143,7 +5142,7 @@ static int qib_7322_intr_fallback(struct qib_devdata *dd) qib_devinfo(dd->pcidev, "MSIx interrupt not detected, trying INTx interrupts\n"); qib_7322_free_irq(dd); - if (pci_alloc_irq_vectors(dd->pcidev, 1, 1, PCI_IRQ_LEGACY) < 0) + if (pci_alloc_irq_vectors(dd->pcidev, 1, 1, PCI_IRQ_INTX) < 0) qib_dev_err(dd, "Failed to enable INTx\n"); qib_setup_7322_interrupt(dd, 0); return 1; diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 33667becd52b..4100656fe9a3 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -581,12 +581,9 @@ static int qib_create_workqueues(struct qib_devdata *dd) for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; if (!ppd->qib_wq) { - char wq_name[8]; /* 3 + 2 + 1 + 1 + 1 */ - - snprintf(wq_name, sizeof(wq_name), "qib%d_%d", - dd->unit, pidx); - ppd->qib_wq = alloc_ordered_workqueue(wq_name, - WQ_MEM_RECLAIM); + ppd->qib_wq = alloc_ordered_workqueue("qib%d_%d", + WQ_MEM_RECLAIM, + dd->unit, pidx); if (!ppd->qib_wq) goto wq_error; } diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c index 47bf64ace05c..58c1d62d341b 100644 --- a/drivers/infiniband/hw/qib/qib_pcie.c +++ b/drivers/infiniband/hw/qib/qib_pcie.c @@ -210,7 +210,7 @@ int qib_pcie_params(struct qib_devdata *dd, u32 minw, u32 *nent) } if (dd->flags & QIB_HAS_INTX) - flags |= PCI_IRQ_LEGACY; + flags |= PCI_IRQ_INTX; maxvec = (nent && *nent) ? *nent : 1; nvec = pci_alloc_irq_vectors(dd->pcidev, 1, maxvec, flags); if (nvec < 0) diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c index 41c272980f91..805e37dc7621 100644 --- a/drivers/infiniband/hw/qib/qib_sysfs.c +++ b/drivers/infiniband/hw/qib/qib_sysfs.c @@ -214,8 +214,8 @@ static const struct attribute_group port_linkcontrol_group = { * Congestion control table size followed by table entries */ static ssize_t cc_table_bin_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, - loff_t pos, size_t count) + const struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) { struct qib_pportdata *ppd = qib_get_pportdata_kobj(kobj); int ret; @@ -241,7 +241,7 @@ static ssize_t cc_table_bin_read(struct file *filp, struct kobject *kobj, return count; } -static BIN_ATTR_RO(cc_table_bin, PAGE_SIZE); +static const BIN_ATTR_RO(cc_table_bin, PAGE_SIZE); /* * Congestion settings: port control, control map and an array of 16 @@ -249,8 +249,8 @@ static BIN_ATTR_RO(cc_table_bin, PAGE_SIZE); * trigger threshold and the minimum injection rate delay. */ static ssize_t cc_setting_bin_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, - loff_t pos, size_t count) + const struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) { struct qib_pportdata *ppd = qib_get_pportdata_kobj(kobj); int ret; @@ -274,16 +274,16 @@ static ssize_t cc_setting_bin_read(struct file *filp, struct kobject *kobj, return count; } -static BIN_ATTR_RO(cc_setting_bin, PAGE_SIZE); +static const BIN_ATTR_RO(cc_setting_bin, PAGE_SIZE); -static struct bin_attribute *port_ccmgta_attributes[] = { +static const struct bin_attribute *const port_ccmgta_attributes[] = { &bin_attr_cc_setting_bin, &bin_attr_cc_table_bin, NULL, }; static umode_t qib_ccmgta_is_bin_visible(struct kobject *kobj, - struct bin_attribute *attr, int n) + const struct bin_attribute *attr, int n) { struct qib_pportdata *ppd = qib_get_pportdata_kobj(kobj); @@ -295,7 +295,7 @@ static umode_t qib_ccmgta_is_bin_visible(struct kobject *kobj, static const struct attribute_group port_ccmgta_attribute_group = { .name = "CCMgtA", .is_bin_visible = qib_ccmgta_is_bin_visible, - .bin_attrs = port_ccmgta_attributes, + .bin_attrs_new = port_ccmgta_attributes, }; /* Start sl2vl */ @@ -585,13 +585,7 @@ static ssize_t hca_type_show(struct device *device, static DEVICE_ATTR_RO(hca_type); static DEVICE_ATTR(board_id, 0444, hca_type_show, NULL); -static ssize_t version_show(struct device *device, - struct device_attribute *attr, char *buf) -{ - /* The string printed here is already newline-terminated. */ - return sysfs_emit(buf, "%s", (char *)ib_qib_version); -} -static DEVICE_ATTR_RO(version); +static DEVICE_STRING_ATTR_RO(version, 0444, QIB_DRIVER_VERSION); static ssize_t boardversion_show(struct device *device, struct device_attribute *attr, char *buf) @@ -721,7 +715,7 @@ static struct attribute *qib_attributes[] = { &dev_attr_hw_rev.attr, &dev_attr_hca_type.attr, &dev_attr_board_id.attr, - &dev_attr_version.attr, + &dev_attr_version.attr.attr, &dev_attr_nctxts.attr, &dev_attr_nfreectxts.attr, &dev_attr_serial.attr, diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 0080f0be72fe..5fcb41970ad9 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1551,7 +1551,7 @@ int qib_register_ib_device(struct qib_devdata *dd) ibdev->dev.parent = &dd->pcidev->dev; snprintf(ibdev->node_desc, sizeof(ibdev->node_desc), - "Intel Infiniband HCA %s", init_utsname()->nodename); + "Intel Infiniband HCA %.42s", init_utsname()->nodename); /* * Fill in rvt info object. diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 07548fac1d8e..408fe1ba74b9 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -303,8 +303,6 @@ int qib_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid); -void qib_rc_rnr_retry(unsigned long arg); - void qib_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr); int qib_post_ud_send(struct rvt_qp *qp, const struct ib_send_wr *wr); @@ -312,8 +310,6 @@ int qib_post_ud_send(struct rvt_qp *qp, const struct ib_send_wr *wr); void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr, int has_grh, void *data, u32 tlen, struct rvt_qp *qp); -void mr_rcu_callback(struct rcu_head *list); - void qib_migrate_qp(struct rvt_qp *qp); int qib_ruc_check_hdr(struct qib_ibport *ibp, struct ib_header *hdr, diff --git a/drivers/infiniband/hw/usnic/usnic_abi.h b/drivers/infiniband/hw/usnic/usnic_abi.h index 7fe9502ce8d3..86a82a4da0aa 100644 --- a/drivers/infiniband/hw/usnic/usnic_abi.h +++ b/drivers/infiniband/hw/usnic/usnic_abi.h @@ -72,7 +72,7 @@ struct usnic_ib_create_qp_resp { u64 bar_bus_addr; u32 bar_len; /* - * WQ, RQ, CQ are explicity specified bc exposing a generic resources inteface + * WQ, RQ, CQ are explicitly specified bc exposing a generic resources inteface * expands the scope of ABI to many files. */ u32 wq_cnt; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index 13b654ddd3cc..4ddcd5860e0f 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -151,34 +151,6 @@ static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, ib_event.element.port_num = 1; ib_dispatch_event(&ib_event); break; - case NETDEV_UP: - case NETDEV_DOWN: - case NETDEV_CHANGE: - if (!us_ibdev->ufdev->link_up && - netif_carrier_ok(netdev)) { - usnic_fwd_carrier_up(us_ibdev->ufdev); - usnic_info("Link UP on %s\n", - dev_name(&us_ibdev->ib_dev.dev)); - ib_event.event = IB_EVENT_PORT_ACTIVE; - ib_event.device = &us_ibdev->ib_dev; - ib_event.element.port_num = 1; - ib_dispatch_event(&ib_event); - } else if (us_ibdev->ufdev->link_up && - !netif_carrier_ok(netdev)) { - usnic_fwd_carrier_down(us_ibdev->ufdev); - usnic_info("Link DOWN on %s\n", - dev_name(&us_ibdev->ib_dev.dev)); - usnic_ib_qp_grp_modify_active_to_err(us_ibdev); - ib_event.event = IB_EVENT_PORT_ERR; - ib_event.device = &us_ibdev->ib_dev; - ib_event.element.port_num = 1; - ib_dispatch_event(&ib_event); - } else { - usnic_dbg("Ignoring %s on %s\n", - netdev_cmd_to_name(event), - dev_name(&us_ibdev->ib_dev.dev)); - } - break; case NETDEV_CHANGEADDR: if (!memcmp(us_ibdev->ufdev->mac, netdev->dev_addr, sizeof(us_ibdev->ufdev->mac))) { @@ -218,6 +190,50 @@ static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, mutex_unlock(&us_ibdev->usdev_lock); } +static void usnic_ib_handle_port_event(struct ib_device *ibdev, + struct net_device *netdev, + unsigned long event) +{ + struct usnic_ib_dev *us_ibdev = + container_of(ibdev, struct usnic_ib_dev, ib_dev); + struct ib_event ib_event; + + mutex_lock(&us_ibdev->usdev_lock); + switch (event) { + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_CHANGE: + if (!us_ibdev->ufdev->link_up && + netif_carrier_ok(netdev)) { + usnic_fwd_carrier_up(us_ibdev->ufdev); + usnic_info("Link UP on %s\n", + dev_name(&us_ibdev->ib_dev.dev)); + ib_event.event = IB_EVENT_PORT_ACTIVE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } else if (us_ibdev->ufdev->link_up && + !netif_carrier_ok(netdev)) { + usnic_fwd_carrier_down(us_ibdev->ufdev); + usnic_info("Link DOWN on %s\n", + dev_name(&us_ibdev->ib_dev.dev)); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_PORT_ERR; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } else { + usnic_dbg("Ignoring %s on %s\n", + netdev_cmd_to_name(event), + dev_name(&us_ibdev->ib_dev.dev)); + } + break; + default: + break; + } + mutex_unlock(&us_ibdev->usdev_lock); +} + static int usnic_ib_netdevice_event(struct notifier_block *notifier, unsigned long event, void *ptr) { @@ -358,6 +374,7 @@ static const struct ib_device_ops usnic_dev_ops = { .query_port = usnic_ib_query_port, .query_qp = usnic_ib_query_qp, .reg_user_mr = usnic_ib_reg_mr, + .report_port_event = usnic_ib_handle_port_event, INIT_RDMA_OBJ_SIZE(ib_pd, usnic_ib_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_cq, usnic_ib_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_qp, usnic_ib_qp_grp, ibqp), diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 6289238cc5af..217af34e82b3 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -577,7 +577,7 @@ out_unlock: } int usnic_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { if (attr->flags) return -EOPNOTSUPP; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h index 6ca9ee0dddbe..53f53f2d53be 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -56,7 +56,7 @@ int usnic_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata); int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); int usnic_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 84e0f41e7dfa..f948b76f984d 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -443,11 +443,11 @@ struct usnic_uiom_pd *usnic_uiom_alloc_pd(struct device *dev) if (!pd) return ERR_PTR(-ENOMEM); - pd->domain = domain = iommu_domain_alloc(dev->bus); - if (!domain) { + pd->domain = domain = iommu_paging_domain_alloc(dev); + if (IS_ERR(domain)) { usnic_err("Failed to allocate IOMMU domain"); kfree(pd); - return ERR_PTR(-ENOMEM); + return ERR_CAST(domain); } iommu_set_fault_handler(pd->domain, usnic_uiom_dma_fault, NULL); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index 6aa40bd2fd52..b3df6eb9b8ef 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -94,13 +94,14 @@ int pvrdma_req_notify_cq(struct ib_cq *ibcq, * pvrdma_create_cq - create completion queue * @ibcq: Allocated CQ * @attr: completion queue attributes - * @udata: user data + * @attrs: bundle * * @return: 0 on success */ int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { + struct ib_udata *udata = &attrs->driver_udata; struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; struct pvrdma_dev *dev = to_vdev(ibdev); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index a5e88185171f..1664d1d7d969 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -143,6 +143,46 @@ static int pvrdma_port_immutable(struct ib_device *ibdev, u32 port_num, return 0; } +static void pvrdma_dispatch_event(struct pvrdma_dev *dev, int port, + enum ib_event_type event) +{ + struct ib_event ib_event; + + memset(&ib_event, 0, sizeof(ib_event)); + ib_event.device = &dev->ib_dev; + ib_event.element.port_num = port; + ib_event.event = event; + ib_dispatch_event(&ib_event); +} + +static void pvrdma_report_event_handle(struct ib_device *ibdev, + struct net_device *ndev, + unsigned long event) +{ + struct pvrdma_dev *dev = container_of(ibdev, struct pvrdma_dev, ib_dev); + + switch (event) { + case NETDEV_DOWN: + pvrdma_dispatch_event(dev, 1, IB_EVENT_PORT_ERR); + break; + case NETDEV_UP: + pvrdma_write_reg(dev, PVRDMA_REG_CTL, + PVRDMA_DEVICE_CTL_UNQUIESCE); + + mb(); + + if (pvrdma_read_reg(dev, PVRDMA_REG_ERR)) + dev_err(&dev->pdev->dev, + "failed to activate device during link up\n"); + else + pvrdma_dispatch_event(dev, 1, IB_EVENT_PORT_ACTIVE); + break; + + default: + break; + } +} + static const struct ib_device_ops pvrdma_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_VMW_PVRDMA, @@ -181,6 +221,7 @@ static const struct ib_device_ops pvrdma_dev_ops = { .query_qp = pvrdma_query_qp, .reg_user_mr = pvrdma_reg_user_mr, .req_notify_cq = pvrdma_req_notify_cq, + .report_port_event = pvrdma_report_event_handle, INIT_RDMA_OBJ_SIZE(ib_ah, pvrdma_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_cq, pvrdma_cq, ibcq), @@ -362,18 +403,6 @@ static void pvrdma_srq_event(struct pvrdma_dev *dev, u32 srqn, int type) } } -static void pvrdma_dispatch_event(struct pvrdma_dev *dev, int port, - enum ib_event_type event) -{ - struct ib_event ib_event; - - memset(&ib_event, 0, sizeof(ib_event)); - ib_event.device = &dev->ib_dev; - ib_event.element.port_num = port; - ib_event.event = event; - ib_dispatch_event(&ib_event); -} - static void pvrdma_dev_event(struct pvrdma_dev *dev, u8 port, int type) { if (port < 1 || port > dev->dsr->caps.phys_port_cnt) { @@ -531,7 +560,7 @@ static int pvrdma_alloc_intrs(struct pvrdma_dev *dev) PCI_IRQ_MSIX); if (ret < 0) { ret = pci_alloc_irq_vectors(pdev, 1, 1, - PCI_IRQ_MSI | PCI_IRQ_LEGACY); + PCI_IRQ_MSI | PCI_IRQ_INTX); if (ret < 0) return ret; } @@ -666,21 +695,8 @@ static void pvrdma_netdevice_event_handle(struct pvrdma_dev *dev, switch (event) { case NETDEV_REBOOT: - case NETDEV_DOWN: pvrdma_dispatch_event(dev, 1, IB_EVENT_PORT_ERR); break; - case NETDEV_UP: - pvrdma_write_reg(dev, PVRDMA_REG_CTL, - PVRDMA_DEVICE_CTL_UNQUIESCE); - - mb(); - - if (pvrdma_read_reg(dev, PVRDMA_REG_ERR)) - dev_err(&dev->pdev->dev, - "failed to activate device during link up\n"); - else - pvrdma_dispatch_event(dev, 1, IB_EVENT_PORT_ACTIVE); - break; case NETDEV_UNREGISTER: ib_device_set_netdev(&dev->ib_dev, NULL, 1); dev_put(dev->netdev); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h index 78807b23d831..4b9edc03d73d 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h @@ -375,7 +375,7 @@ struct ib_mr *pvrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, int pvrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int pvrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int pvrdma_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); |