Diffstat (limited to 'drivers')
253 files changed, 23632 insertions, 2840 deletions
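A minimal sketch (not part of this series) of the consumer side of the new RIMT glue added in drivers/acpi/riscv/rimt.c below: how a RISC-V IOMMU platform driver might register its fwnode with RIMT so that later rimt_iommu_configure_id() calls from the ACPI core can resolve client devices to it. The driver name and probe body are hypothetical, and the sketch assumes rimt_iommu_register() is declared in <linux/acpi_rimt.h>.

/*
 * Illustrative only: hypothetical platform IOMMU driver hooking into the
 * RIMT support introduced in this series.
 */
#include <linux/acpi_rimt.h>
#include <linux/module.h>
#include <linux/platform_device.h>

static int my_iommu_probe(struct platform_device *pdev)
{
	int ret;

	/* ... bring up the IOMMU hardware first ... */

	/*
	 * Tell the RIMT layer which fwnode backs this IOMMU. Until this
	 * registration happens, rimt_iommu_configure_id() for client
	 * devices mapped to this IOMMU returns -EPROBE_DEFER.
	 */
	ret = rimt_iommu_register(&pdev->dev);
	if (ret)
		return ret;

	return 0;
}

static struct platform_driver my_iommu_driver = {
	.probe = my_iommu_probe,
	.driver = {
		.name = "my-riscv-iommu",
	},
};
module_platform_driver(my_iommu_driver);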
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index b594780a57d7..2cdbd08b30e4 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -547,6 +547,10 @@ if ARM64 source "drivers/acpi/arm64/Kconfig" endif +if RISCV +source "drivers/acpi/riscv/Kconfig" +endif + config ACPI_PPTT bool diff --git a/drivers/acpi/riscv/Kconfig b/drivers/acpi/riscv/Kconfig new file mode 100644 index 000000000000..046296a18d00 --- /dev/null +++ b/drivers/acpi/riscv/Kconfig @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# ACPI Configuration for RISC-V +# + +config ACPI_RIMT + bool diff --git a/drivers/acpi/riscv/Makefile b/drivers/acpi/riscv/Makefile index a96fdf1e2cb8..1284a076fa88 100644 --- a/drivers/acpi/riscv/Makefile +++ b/drivers/acpi/riscv/Makefile @@ -2,3 +2,4 @@ obj-y += rhct.o init.o irq.o obj-$(CONFIG_ACPI_PROCESSOR_IDLE) += cpuidle.o obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o +obj-$(CONFIG_ACPI_RIMT) += rimt.o diff --git a/drivers/acpi/riscv/init.c b/drivers/acpi/riscv/init.c index 673e4d5dd752..7c00f7995e86 100644 --- a/drivers/acpi/riscv/init.c +++ b/drivers/acpi/riscv/init.c @@ -10,4 +10,6 @@ void __init acpi_arch_init(void) { riscv_acpi_init_gsi_mapping(); + if (IS_ENABLED(CONFIG_ACPI_RIMT)) + riscv_acpi_rimt_init(); } diff --git a/drivers/acpi/riscv/init.h b/drivers/acpi/riscv/init.h index 0b9a07e4031f..1680aa2aaf23 100644 --- a/drivers/acpi/riscv/init.h +++ b/drivers/acpi/riscv/init.h @@ -2,3 +2,4 @@ #include <linux/init.h> void __init riscv_acpi_init_gsi_mapping(void); +void __init riscv_acpi_rimt_init(void); diff --git a/drivers/acpi/riscv/rimt.c b/drivers/acpi/riscv/rimt.c new file mode 100644 index 000000000000..683fcfe35c31 --- /dev/null +++ b/drivers/acpi/riscv/rimt.c @@ -0,0 +1,520 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2024-2025, Ventana Micro Systems Inc + * Author: Sunil V L <sunilvl@ventanamicro.com> + * + */ + +#define pr_fmt(fmt) "ACPI: RIMT: " fmt + +#include <linux/acpi.h> +#include <linux/acpi_rimt.h> +#include <linux/iommu.h> +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/platform_device.h> +#include "init.h" + +struct rimt_fwnode { + struct list_head list; + struct acpi_rimt_node *rimt_node; + struct fwnode_handle *fwnode; +}; + +static LIST_HEAD(rimt_fwnode_list); +static DEFINE_SPINLOCK(rimt_fwnode_lock); + +#define RIMT_TYPE_MASK(type) (1 << (type)) +#define RIMT_IOMMU_TYPE BIT(0) + +/* Root pointer to the mapped RIMT table */ +static struct acpi_table_header *rimt_table; + +/** + * rimt_set_fwnode() - Create rimt_fwnode and use it to register + * iommu data in the rimt_fwnode_list + * + * @rimt_node: RIMT table node associated with the IOMMU + * @fwnode: fwnode associated with the RIMT node + * + * Returns: 0 on success + * <0 on failure + */ +static int rimt_set_fwnode(struct acpi_rimt_node *rimt_node, + struct fwnode_handle *fwnode) +{ + struct rimt_fwnode *np; + + np = kzalloc(sizeof(*np), GFP_ATOMIC); + + if (WARN_ON(!np)) + return -ENOMEM; + + INIT_LIST_HEAD(&np->list); + np->rimt_node = rimt_node; + np->fwnode = fwnode; + + spin_lock(&rimt_fwnode_lock); + list_add_tail(&np->list, &rimt_fwnode_list); + spin_unlock(&rimt_fwnode_lock); + + return 0; +} + +/** + * rimt_get_fwnode() - Retrieve fwnode associated with an RIMT node + * + * @node: RIMT table node to be looked-up + * + * Returns: fwnode_handle pointer on success, NULL on failure + */ +static struct fwnode_handle *rimt_get_fwnode(struct acpi_rimt_node *node) +{ + struct fwnode_handle *fwnode = NULL; + struct rimt_fwnode *curr; + + 
spin_lock(&rimt_fwnode_lock); + list_for_each_entry(curr, &rimt_fwnode_list, list) { + if (curr->rimt_node == node) { + fwnode = curr->fwnode; + break; + } + } + spin_unlock(&rimt_fwnode_lock); + + return fwnode; +} + +static acpi_status rimt_match_node_callback(struct acpi_rimt_node *node, + void *context) +{ + acpi_status status = AE_NOT_FOUND; + struct device *dev = context; + + if (node->type == ACPI_RIMT_NODE_TYPE_IOMMU) { + struct acpi_rimt_iommu *iommu_node = (struct acpi_rimt_iommu *)&node->node_data; + + if (dev_is_pci(dev)) { + struct pci_dev *pdev; + u16 bdf; + + pdev = to_pci_dev(dev); + bdf = PCI_DEVID(pdev->bus->number, pdev->devfn); + if ((pci_domain_nr(pdev->bus) == iommu_node->pcie_segment_number) && + bdf == iommu_node->pcie_bdf) { + status = AE_OK; + } else { + status = AE_NOT_FOUND; + } + } else { + struct platform_device *pdev = to_platform_device(dev); + struct resource *res; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (res && res->start == iommu_node->base_address) + status = AE_OK; + else + status = AE_NOT_FOUND; + } + } else if (node->type == ACPI_RIMT_NODE_TYPE_PCIE_ROOT_COMPLEX) { + struct acpi_rimt_pcie_rc *pci_rc; + struct pci_bus *bus; + + bus = to_pci_bus(dev); + pci_rc = (struct acpi_rimt_pcie_rc *)node->node_data; + + /* + * It is assumed that PCI segment numbers maps one-to-one + * with root complexes. Each segment number can represent only + * one root complex. + */ + status = pci_rc->pcie_segment_number == pci_domain_nr(bus) ? + AE_OK : AE_NOT_FOUND; + } else if (node->type == ACPI_RIMT_NODE_TYPE_PLAT_DEVICE) { + struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_rimt_platform_device *ncomp; + struct device *plat_dev = dev; + struct acpi_device *adev; + + /* + * Walk the device tree to find a device with an + * ACPI companion; there is no point in scanning + * RIMT for a device matching a platform device if + * the device does not have an ACPI companion to + * start with. + */ + do { + adev = ACPI_COMPANION(plat_dev); + if (adev) + break; + + plat_dev = plat_dev->parent; + } while (plat_dev); + + if (!adev) + return status; + + status = acpi_get_name(adev->handle, ACPI_FULL_PATHNAME, &buf); + if (ACPI_FAILURE(status)) { + dev_warn(plat_dev, "Can't get device full path name\n"); + return status; + } + + ncomp = (struct acpi_rimt_platform_device *)node->node_data; + status = !strcmp(ncomp->device_name, buf.pointer) ? 
+ AE_OK : AE_NOT_FOUND; + acpi_os_free(buf.pointer); + } + + return status; +} + +static struct acpi_rimt_node *rimt_scan_node(enum acpi_rimt_node_type type, + void *context) +{ + struct acpi_rimt_node *rimt_node, *rimt_end; + struct acpi_table_rimt *rimt; + int i; + + if (!rimt_table) + return NULL; + + /* Get the first RIMT node */ + rimt = (struct acpi_table_rimt *)rimt_table; + rimt_node = ACPI_ADD_PTR(struct acpi_rimt_node, rimt, + rimt->node_offset); + rimt_end = ACPI_ADD_PTR(struct acpi_rimt_node, rimt_table, + rimt_table->length); + + for (i = 0; i < rimt->num_nodes; i++) { + if (WARN_TAINT(rimt_node >= rimt_end, TAINT_FIRMWARE_WORKAROUND, + "RIMT node pointer overflows, bad table!\n")) + return NULL; + + if (rimt_node->type == type && + ACPI_SUCCESS(rimt_match_node_callback(rimt_node, context))) + return rimt_node; + + rimt_node = ACPI_ADD_PTR(struct acpi_rimt_node, rimt_node, + rimt_node->length); + } + + return NULL; +} + +static bool rimt_pcie_rc_supports_ats(struct acpi_rimt_node *node) +{ + struct acpi_rimt_pcie_rc *pci_rc; + + pci_rc = (struct acpi_rimt_pcie_rc *)node->node_data; + return pci_rc->flags & ACPI_RIMT_PCIE_ATS_SUPPORTED; +} + +static int rimt_iommu_xlate(struct device *dev, struct acpi_rimt_node *node, u32 deviceid) +{ + struct fwnode_handle *rimt_fwnode; + + if (!node) + return -ENODEV; + + rimt_fwnode = rimt_get_fwnode(node); + + /* + * The IOMMU drivers may not be probed yet. + * Defer the IOMMU configuration + */ + if (!rimt_fwnode) + return -EPROBE_DEFER; + + return acpi_iommu_fwspec_init(dev, deviceid, rimt_fwnode); +} + +struct rimt_pci_alias_info { + struct device *dev; + struct acpi_rimt_node *node; + const struct iommu_ops *ops; +}; + +static int rimt_id_map(struct acpi_rimt_id_mapping *map, u8 type, u32 rid_in, u32 *rid_out) +{ + if (rid_in < map->source_id_base || + (rid_in > map->source_id_base + map->num_ids)) + return -ENXIO; + + *rid_out = map->dest_id_base + (rid_in - map->source_id_base); + return 0; +} + +static struct acpi_rimt_node *rimt_node_get_id(struct acpi_rimt_node *node, + u32 *id_out, int index) +{ + struct acpi_rimt_platform_device *plat_node; + u32 id_mapping_offset, num_id_mapping; + struct acpi_rimt_pcie_rc *pci_node; + struct acpi_rimt_id_mapping *map; + struct acpi_rimt_node *parent; + + if (node->type == ACPI_RIMT_NODE_TYPE_PCIE_ROOT_COMPLEX) { + pci_node = (struct acpi_rimt_pcie_rc *)&node->node_data; + id_mapping_offset = pci_node->id_mapping_offset; + num_id_mapping = pci_node->num_id_mappings; + } else if (node->type == ACPI_RIMT_NODE_TYPE_PLAT_DEVICE) { + plat_node = (struct acpi_rimt_platform_device *)&node->node_data; + id_mapping_offset = plat_node->id_mapping_offset; + num_id_mapping = plat_node->num_id_mappings; + } else { + return NULL; + } + + if (!id_mapping_offset || !num_id_mapping || index >= num_id_mapping) + return NULL; + + map = ACPI_ADD_PTR(struct acpi_rimt_id_mapping, node, + id_mapping_offset + index * sizeof(*map)); + + /* Firmware bug! */ + if (!map->dest_offset) { + pr_err(FW_BUG "[node %p type %d] ID map has NULL parent reference\n", + node, node->type); + return NULL; + } + + parent = ACPI_ADD_PTR(struct acpi_rimt_node, rimt_table, map->dest_offset); + + if (node->type == ACPI_RIMT_NODE_TYPE_PLAT_DEVICE || + node->type == ACPI_RIMT_NODE_TYPE_PCIE_ROOT_COMPLEX) { + *id_out = map->dest_id_base; + return parent; + } + + return NULL; +} + +/* + * RISC-V supports IOMMU as a PCI device or a platform device. + * When it is a platform device, there should be a namespace device as + * well along with RIMT. 
To create the link between RIMT information and + * the platform device, the IOMMU driver should register itself with the + * RIMT module. This is true for PCI based IOMMU as well. + */ +int rimt_iommu_register(struct device *dev) +{ + struct fwnode_handle *rimt_fwnode; + struct acpi_rimt_node *node; + + node = rimt_scan_node(ACPI_RIMT_NODE_TYPE_IOMMU, dev); + if (!node) { + pr_err("Could not find IOMMU node in RIMT\n"); + return -ENODEV; + } + + if (dev_is_pci(dev)) { + rimt_fwnode = acpi_alloc_fwnode_static(); + if (!rimt_fwnode) + return -ENOMEM; + + rimt_fwnode->dev = dev; + if (!dev->fwnode) + dev->fwnode = rimt_fwnode; + + rimt_set_fwnode(node, rimt_fwnode); + } else { + rimt_set_fwnode(node, dev->fwnode); + } + + return 0; +} + +#ifdef CONFIG_IOMMU_API + +static struct acpi_rimt_node *rimt_node_map_id(struct acpi_rimt_node *node, + u32 id_in, u32 *id_out, + u8 type_mask) +{ + struct acpi_rimt_platform_device *plat_node; + u32 id_mapping_offset, num_id_mapping; + struct acpi_rimt_pcie_rc *pci_node; + u32 id = id_in; + + /* Parse the ID mapping tree to find specified node type */ + while (node) { + struct acpi_rimt_id_mapping *map; + int i, rc = 0; + u32 map_id = id; + + if (RIMT_TYPE_MASK(node->type) & type_mask) { + if (id_out) + *id_out = id; + return node; + } + + if (node->type == ACPI_RIMT_NODE_TYPE_PCIE_ROOT_COMPLEX) { + pci_node = (struct acpi_rimt_pcie_rc *)&node->node_data; + id_mapping_offset = pci_node->id_mapping_offset; + num_id_mapping = pci_node->num_id_mappings; + } else if (node->type == ACPI_RIMT_NODE_TYPE_PLAT_DEVICE) { + plat_node = (struct acpi_rimt_platform_device *)&node->node_data; + id_mapping_offset = plat_node->id_mapping_offset; + num_id_mapping = plat_node->num_id_mappings; + } else { + goto fail_map; + } + + if (!id_mapping_offset || !num_id_mapping) + goto fail_map; + + map = ACPI_ADD_PTR(struct acpi_rimt_id_mapping, node, + id_mapping_offset); + + /* Firmware bug! */ + if (!map->dest_offset) { + pr_err(FW_BUG "[node %p type %d] ID map has NULL parent reference\n", + node, node->type); + goto fail_map; + } + + /* Do the ID translation */ + for (i = 0; i < num_id_mapping; i++, map++) { + rc = rimt_id_map(map, node->type, map_id, &id); + if (!rc) + break; + } + + if (i == num_id_mapping) + goto fail_map; + + node = ACPI_ADD_PTR(struct acpi_rimt_node, rimt_table, + rc ? 
0 : map->dest_offset); + } + +fail_map: + /* Map input ID to output ID unchanged on mapping failure */ + if (id_out) + *id_out = id_in; + + return NULL; +} + +static struct acpi_rimt_node *rimt_node_map_platform_id(struct acpi_rimt_node *node, u32 *id_out, + u8 type_mask, int index) +{ + struct acpi_rimt_node *parent; + u32 id; + + parent = rimt_node_get_id(node, &id, index); + if (!parent) + return NULL; + + if (!(RIMT_TYPE_MASK(parent->type) & type_mask)) + parent = rimt_node_map_id(parent, id, id_out, type_mask); + else + if (id_out) + *id_out = id; + + return parent; +} + +static int rimt_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data) +{ + struct rimt_pci_alias_info *info = data; + struct acpi_rimt_node *parent; + u32 deviceid; + + parent = rimt_node_map_id(info->node, alias, &deviceid, RIMT_IOMMU_TYPE); + return rimt_iommu_xlate(info->dev, parent, deviceid); +} + +static int rimt_plat_iommu_map(struct device *dev, struct acpi_rimt_node *node) +{ + struct acpi_rimt_node *parent; + int err = -ENODEV, i = 0; + u32 deviceid = 0; + + do { + parent = rimt_node_map_platform_id(node, &deviceid, + RIMT_IOMMU_TYPE, + i++); + + if (parent) + err = rimt_iommu_xlate(dev, parent, deviceid); + } while (parent && !err); + + return err; +} + +static int rimt_plat_iommu_map_id(struct device *dev, + struct acpi_rimt_node *node, + const u32 *in_id) +{ + struct acpi_rimt_node *parent; + u32 deviceid; + + parent = rimt_node_map_id(node, *in_id, &deviceid, RIMT_IOMMU_TYPE); + if (parent) + return rimt_iommu_xlate(dev, parent, deviceid); + + return -ENODEV; +} + +/** + * rimt_iommu_configure_id - Set-up IOMMU configuration for a device. + * + * @dev: device to configure + * @id_in: optional input id const value pointer + * + * Returns: 0 on success, <0 on failure + */ +int rimt_iommu_configure_id(struct device *dev, const u32 *id_in) +{ + struct acpi_rimt_node *node; + int err = -ENODEV; + + if (dev_is_pci(dev)) { + struct iommu_fwspec *fwspec; + struct pci_bus *bus = to_pci_dev(dev)->bus; + struct rimt_pci_alias_info info = { .dev = dev }; + + node = rimt_scan_node(ACPI_RIMT_NODE_TYPE_PCIE_ROOT_COMPLEX, &bus->dev); + if (!node) + return -ENODEV; + + info.node = node; + err = pci_for_each_dma_alias(to_pci_dev(dev), + rimt_pci_iommu_init, &info); + + fwspec = dev_iommu_fwspec_get(dev); + if (fwspec && rimt_pcie_rc_supports_ats(node)) + fwspec->flags |= IOMMU_FWSPEC_PCI_RC_ATS; + } else { + node = rimt_scan_node(ACPI_RIMT_NODE_TYPE_PLAT_DEVICE, dev); + if (!node) + return -ENODEV; + + err = id_in ? rimt_plat_iommu_map_id(dev, node, id_in) : + rimt_plat_iommu_map(dev, node); + } + + return err; +} + +#endif + +void __init riscv_acpi_rimt_init(void) +{ + acpi_status status; + + /* rimt_table will be used at runtime after the rimt init, + * so we don't need to call acpi_put_table() to release + * the RIMT table mapping. 
+ */ + status = acpi_get_table(ACPI_SIG_RIMT, 0, &rimt_table); + if (ACPI_FAILURE(status)) { + if (status != AE_NOT_FOUND) { + const char *msg = acpi_format_exception(status); + + pr_err("Failed to get table, %s\n", msg); + } + + return; + } +} diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 880a544d73cd..065abe56f440 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -11,6 +11,7 @@ #include <linux/kernel.h> #include <linux/acpi.h> #include <linux/acpi_iort.h> +#include <linux/acpi_rimt.h> #include <linux/acpi_viot.h> #include <linux/iommu.h> #include <linux/signal.h> @@ -1631,7 +1632,10 @@ static int acpi_iommu_configure_id(struct device *dev, const u32 *id_in) err = iort_iommu_configure_id(dev, id_in); if (err && err != -EPROBE_DEFER) + err = rimt_iommu_configure_id(dev, id_in); + if (err && err != -EPROBE_DEFER) err = viot_iommu_configure(dev); + mutex_unlock(&iommu_probe_device_lock); return err; diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 65d6d0af140a..8dff5c2c40fd 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -28,6 +28,7 @@ #include <linux/fs_struct.h> #include <linux/psp.h> #include <linux/amd-iommu.h> +#include <linux/crash_dump.h> #include <asm/smp.h> #include <asm/cacheflush.h> @@ -1526,6 +1527,15 @@ static int _sev_platform_init_locked(struct sev_platform_init_args *args) if (!psp_master || !psp_master->sev_data) return -ENODEV; + /* + * Skip SNP/SEV initialization under a kdump kernel as SEV/SNP + * may already be initialized in the previous kernel. Since no + * SNP/SEV guests are run under a kdump kernel, there is no + * need to initialize SNP or SEV during kdump boot. + */ + if (is_kdump_kernel()) + return 0; + sev = psp_master->sev_data; if (sev->state == SEV_STATE_INIT) diff --git a/drivers/firmware/efi/efi-init.c b/drivers/firmware/efi/efi-init.c index a00e07b853f2..a65c2d5b9e7b 100644 --- a/drivers/firmware/efi/efi-init.c +++ b/drivers/firmware/efi/efi-init.c @@ -12,6 +12,7 @@ #include <linux/efi.h> #include <linux/fwnode.h> #include <linux/init.h> +#include <linux/kexec_handover.h> #include <linux/memblock.h> #include <linux/mm_types.h> #include <linux/of.h> @@ -164,12 +165,32 @@ static __init void reserve_regions(void) pr_info("Processing EFI memory map:\n"); /* - * Discard memblocks discovered so far: if there are any at this - * point, they originate from memory nodes in the DT, and UEFI - * uses its own memory map instead. + * Discard memblocks discovered so far except for KHO scratch + * regions. Most memblocks at this point originate from memory nodes + * in the DT and UEFI uses its own memory map instead. However, if + * KHO is enabled, scratch regions, which are good known memory + * must be preserved. */ memblock_dump_all(); - memblock_remove(0, PHYS_ADDR_MAX); + + if (is_kho_boot()) { + struct memblock_region *r; + + /* Remove all non-KHO regions */ + for_each_mem_region(r) { + if (!memblock_is_kho_scratch(r)) { + memblock_remove(r->base, r->size); + r--; + } + } + } else { + /* + * KHO is disabled. Discard memblocks discovered so far: + * if there are any at this point, they originate from memory + * nodes in the DT, and UEFI uses its own memory map instead. 
+ */ + memblock_remove(0, PHYS_ADDR_MAX); + } for_each_efi_memory_desc(md) { paddr = md->phys_addr; diff --git a/drivers/fwctl/mlx5/main.c b/drivers/fwctl/mlx5/main.c index f93aa0cecdb9..3dacccf7855c 100644 --- a/drivers/fwctl/mlx5/main.c +++ b/drivers/fwctl/mlx5/main.c @@ -58,6 +58,9 @@ enum { MLX5_CMD_OP_QUERY_DC_CNAK_TRACE = 0x716, MLX5_CMD_OP_QUERY_NVMF_BACKEND_CONTROLLER = 0x722, MLX5_CMD_OP_QUERY_NVMF_NAMESPACE_CONTEXT = 0x728, + MLX5_CMD_OP_QUERY_ADJACENT_FUNCTIONS_ID = 0x730, + MLX5_CMD_OP_DELEGATE_VHCA_MANAGEMENT = 0x731, + MLX5_CMD_OP_QUERY_DELEGATED_VHCA = 0x732, MLX5_CMD_OP_QUERY_BURST_SIZE = 0x813, MLX5_CMD_OP_QUERY_DIAGNOSTIC_PARAMS = 0x819, MLX5_CMD_OP_SET_DIAGNOSTIC_PARAMS = 0x820, @@ -188,6 +191,7 @@ static bool mlx5ctl_validate_rpc(const void *in, enum fwctl_rpc_scope scope) * filter commands manually for now. */ switch (opcode) { + case MLX5_CMD_OP_MODIFY_CONG_STATUS: case MLX5_CMD_OP_POSTPONE_CONNECTED_QP_TIMEOUT: case MLX5_CMD_OP_QUERY_ADAPTER: case MLX5_CMD_OP_QUERY_ESW_FUNCTIONS: @@ -196,6 +200,7 @@ static bool mlx5ctl_validate_rpc(const void *in, enum fwctl_rpc_scope scope) case MLX5_CMD_OP_QUERY_OTHER_HCA_CAP: case MLX5_CMD_OP_QUERY_ROCE_ADDRESS: case MLX5_CMD_OPCODE_QUERY_VUID: + case MLX5_CMD_OP_DELEGATE_VHCA_MANAGEMENT: /* * FW limits SET_HCA_CAP on the tools UID to only the other function * mode which is used for function pre-configuration @@ -281,6 +286,8 @@ static bool mlx5ctl_validate_rpc(const void *in, enum fwctl_rpc_scope scope) case MLX5_CMD_OP_QUERY_XRQ: case MLX5_CMD_OP_USER_QUERY_XRQ_DC_PARAMS_ENTRY: case MLX5_CMD_OP_USER_QUERY_XRQ_ERROR_PARAMS: + case MLX5_CMD_OP_QUERY_ADJACENT_FUNCTIONS_ID: + case MLX5_CMD_OP_QUERY_DELEGATED_VHCA: return scope >= FWCTL_RPC_DEBUG_READ_ONLY; case MLX5_CMD_OP_SET_DIAGNOSTIC_PARAMS: @@ -345,7 +352,7 @@ static void *mlx5ctl_fw_rpc(struct fwctl_uctx *uctx, enum fwctl_rpc_scope scope, */ if (ret && ret != -EREMOTEIO) { if (rpc_out != rpc_in) - kfree(rpc_out); + kvfree(rpc_out); return ERR_PTR(ret); } return rpc_out; diff --git a/drivers/fwctl/pds/main.c b/drivers/fwctl/pds/main.c index 9b9d1f6b5556..1809853f6353 100644 --- a/drivers/fwctl/pds/main.c +++ b/drivers/fwctl/pds/main.c @@ -6,6 +6,7 @@ #include <linux/pci.h> #include <linux/vmalloc.h> #include <linux/bitfield.h> +#include <linux/string.h> #include <uapi/fwctl/fwctl.h> #include <uapi/fwctl/pds.h> @@ -366,18 +367,10 @@ static void *pdsfc_fw_rpc(struct fwctl_uctx *uctx, enum fwctl_rpc_scope scope, return ERR_PTR(err); if (rpc->in.len > 0) { - in_payload = kzalloc(rpc->in.len, GFP_KERNEL); - if (!in_payload) { - dev_err(dev, "Failed to allocate in_payload\n"); - err = -ENOMEM; - goto err_out; - } - - if (copy_from_user(in_payload, u64_to_user_ptr(rpc->in.payload), - rpc->in.len)) { + in_payload = memdup_user(u64_to_user_ptr(rpc->in.payload), rpc->in.len); + if (IS_ERR(in_payload)) { dev_dbg(dev, "Failed to copy in_payload from user\n"); - err = -EFAULT; - goto err_in_payload; + return in_payload; } in_payload_dma_addr = dma_map_single(dev->parent, in_payload, @@ -453,7 +446,6 @@ err_out_payload: rpc->in.len, DMA_TO_DEVICE); err_in_payload: kfree(in_payload); -err_out: if (err) return ERR_PTR(err); @@ -481,7 +473,7 @@ static int pdsfc_probe(struct auxiliary_device *adev, pdsfc = fwctl_alloc_device(&padev->vf_pdev->dev, &pdsfc_ops, struct pdsfc_dev, fwctl); if (!pdsfc) - return dev_err_probe(dev, -ENOMEM, "Failed to allocate fwctl device struct\n"); + return -ENOMEM; pdsfc->padev = padev; err = pdsfc_identify(pdsfc); diff --git a/drivers/gpu/drm/i915/gem/i915_gemfs.c 
b/drivers/gpu/drm/i915/gem/i915_gemfs.c index a09e2eb47175..8f13ec4ff0d0 100644 --- a/drivers/gpu/drm/i915/gem/i915_gemfs.c +++ b/drivers/gpu/drm/i915/gem/i915_gemfs.c @@ -11,11 +11,6 @@ #include "i915_gemfs.h" #include "i915_utils.h" -static int add_param(struct fs_context *fc, const char *key, const char *val) -{ - return vfs_parse_fs_string(fc, key, val, strlen(val)); -} - void i915_gemfs_init(struct drm_i915_private *i915) { struct file_system_type *type; @@ -48,9 +43,9 @@ void i915_gemfs_init(struct drm_i915_private *i915) fc = fs_context_for_mount(type, SB_KERNMOUNT); if (IS_ERR(fc)) goto err; - ret = add_param(fc, "source", "tmpfs"); + ret = vfs_parse_fs_string(fc, "source", "tmpfs"); if (!ret) - ret = add_param(fc, "huge", "within_size"); + ret = vfs_parse_fs_string(fc, "huge", "within_size"); if (!ret) gemfs = fc_mount_longterm(fc); put_fs_context(fc); diff --git a/drivers/gpu/drm/v3d/v3d_gemfs.c b/drivers/gpu/drm/v3d/v3d_gemfs.c index 8ec6ed82b3d9..c1a30166c099 100644 --- a/drivers/gpu/drm/v3d/v3d_gemfs.c +++ b/drivers/gpu/drm/v3d/v3d_gemfs.c @@ -7,11 +7,6 @@ #include "v3d_drv.h" -static int add_param(struct fs_context *fc, const char *key, const char *val) -{ - return vfs_parse_fs_string(fc, key, val, strlen(val)); -} - void v3d_gemfs_init(struct v3d_dev *v3d) { struct file_system_type *type; @@ -38,9 +33,9 @@ void v3d_gemfs_init(struct v3d_dev *v3d) fc = fs_context_for_mount(type, SB_KERNMOUNT); if (IS_ERR(fc)) goto err; - ret = add_param(fc, "source", "tmpfs"); + ret = vfs_parse_fs_string(fc, "source", "tmpfs"); if (!ret) - ret = add_param(fc, "huge", "within_size"); + ret = vfs_parse_fs_string(fc, "huge", "within_size"); if (!ret) gemfs = fc_mount_longterm(fc); put_fs_context(fc); diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 3a394cd772f6..f0323f1d6f01 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -85,6 +85,7 @@ source "drivers/infiniband/hw/efa/Kconfig" source "drivers/infiniband/hw/erdma/Kconfig" source "drivers/infiniband/hw/hfi1/Kconfig" source "drivers/infiniband/hw/hns/Kconfig" +source "drivers/infiniband/hw/ionic/Kconfig" source "drivers/infiniband/hw/irdma/Kconfig" source "drivers/infiniband/hw/mana/Kconfig" source "drivers/infiniband/hw/mlx4/Kconfig" diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index be0743dac3ff..61596cda2b65 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -446,63 +446,41 @@ static int addr6_resolve(struct sockaddr *src_sock, } #endif +static bool is_dst_local(const struct dst_entry *dst) +{ + if (dst->ops->family == AF_INET) + return !!(dst_rtable(dst)->rt_type & RTN_LOCAL); + else if (dst->ops->family == AF_INET6) + return !!(dst_rt6_info(dst)->rt6i_flags & RTF_LOCAL); + else + return false; +} + static int addr_resolve_neigh(const struct dst_entry *dst, const struct sockaddr *dst_in, struct rdma_dev_addr *addr, - unsigned int ndev_flags, u32 seq) { - int ret = 0; - - if (ndev_flags & IFF_LOOPBACK) { + if (is_dst_local(dst)) { + /* When the destination is local entry, source and destination + * are same. Skip the neighbour lookup. 
+ */ memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); - } else { - if (!(ndev_flags & IFF_NOARP)) { - /* If the device doesn't do ARP internally */ - ret = fetch_ha(dst, addr, dst_in, seq); - } + return 0; } - return ret; -} - -static int copy_src_l2_addr(struct rdma_dev_addr *dev_addr, - const struct sockaddr *dst_in, - const struct dst_entry *dst, - const struct net_device *ndev) -{ - int ret = 0; - - if (dst->dev->flags & IFF_LOOPBACK) - ret = rdma_translate_ip(dst_in, dev_addr); - else - rdma_copy_src_l2_addr(dev_addr, dst->dev); - - /* - * If there's a gateway and type of device not ARPHRD_INFINIBAND, - * we're definitely in RoCE v2 (as RoCE v1 isn't routable) set the - * network type accordingly. - */ - if (has_gateway(dst, dst_in->sa_family) && - ndev->type != ARPHRD_INFINIBAND) - dev_addr->network = dst_in->sa_family == AF_INET ? - RDMA_NETWORK_IPV4 : - RDMA_NETWORK_IPV6; - else - dev_addr->network = RDMA_NETWORK_IB; - return ret; + return fetch_ha(dst, addr, dst_in, seq); } static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr, - unsigned int *ndev_flags, const struct sockaddr *dst_in, const struct dst_entry *dst) { struct net_device *ndev = READ_ONCE(dst->dev); - *ndev_flags = ndev->flags; /* A physical device must be the RDMA device to use */ - if (ndev->flags & IFF_LOOPBACK) { + if (is_dst_local(dst)) { + int ret; /* * RDMA (IB/RoCE, iWarp) doesn't run on lo interface or * loopback IP address. So if route is resolved to loopback @@ -512,9 +490,27 @@ static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr, ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev), dst_in); if (IS_ERR(ndev)) return -ENODEV; + ret = rdma_translate_ip(dst_in, dev_addr); + if (ret) + return ret; + } else { + rdma_copy_src_l2_addr(dev_addr, dst->dev); } - return copy_src_l2_addr(dev_addr, dst_in, dst, ndev); + /* + * If there's a gateway and type of device not ARPHRD_INFINIBAND, + * we're definitely in RoCE v2 (as RoCE v1 isn't routable) set the + * network type accordingly. + */ + if (has_gateway(dst, dst_in->sa_family) && + ndev->type != ARPHRD_INFINIBAND) + dev_addr->network = dst_in->sa_family == AF_INET ? + RDMA_NETWORK_IPV4 : + RDMA_NETWORK_IPV6; + else + dev_addr->network = RDMA_NETWORK_IB; + + return 0; } static int set_addr_netns_by_gid_rcu(struct rdma_dev_addr *addr) @@ -551,7 +547,6 @@ static int addr_resolve(struct sockaddr *src_in, u32 seq) { struct dst_entry *dst = NULL; - unsigned int ndev_flags = 0; struct rtable *rt = NULL; int ret; @@ -588,7 +583,7 @@ static int addr_resolve(struct sockaddr *src_in, rcu_read_unlock(); goto done; } - ret = rdma_set_src_addr_rcu(addr, &ndev_flags, dst_in, dst); + ret = rdma_set_src_addr_rcu(addr, dst_in, dst); rcu_read_unlock(); /* @@ -596,7 +591,7 @@ static int addr_resolve(struct sockaddr *src_in, * only if src addr translation didn't fail. 
*/ if (!ret && resolve_neigh) - ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq); + ret = addr_resolve_neigh(dst, dst_in, addr, seq); if (src_in->sa_family == AF_INET) ip_rt_put(rt); diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index 3bb46696731e..25a060a28301 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -110,8 +110,7 @@ void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh * agent = port_priv->agent[qpn]; ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num); if (IS_ERR(ah)) { - dev_err(&device->dev, "ib_create_ah_from_wc error %ld\n", - PTR_ERR(ah)); + dev_err(&device->dev, "ib_create_ah_from_wc error %pe\n", ah); return; } diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 92678e438ff4..01bede8ba105 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1049,8 +1049,8 @@ static noinline void cm_destroy_id_wait_timeout(struct ib_cm_id *cm_id, struct cm_id_private *cm_id_priv; cm_id_priv = container_of(cm_id, struct cm_id_private, id); - pr_err("%s: cm_id=%p timed out. state %d -> %d, refcnt=%d\n", __func__, - cm_id, old_state, cm_id->state, refcount_read(&cm_id_priv->refcount)); + pr_err_ratelimited("%s: cm_id=%p timed out. state %d -> %d, refcnt=%d\n", __func__, + cm_id, old_state, cm_id->state, refcount_read(&cm_id_priv->refcount)); } static void cm_destroy_id(struct ib_cm_id *cm_id, int err) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 9b471548e7ae..5b2d3ae3f9fc 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2076,6 +2076,7 @@ static void _destroy_id(struct rdma_id_private *id_priv, kfree(id_priv->id.route.path_rec); kfree(id_priv->id.route.path_rec_inbound); kfree(id_priv->id.route.path_rec_outbound); + kfree(id_priv->id.route.service_recs); put_net(id_priv->id.route.addr.dev_addr.net); kfree(id_priv); @@ -3382,13 +3383,18 @@ err1: int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) { struct rdma_id_private *id_priv; + enum rdma_cm_state state; int ret; if (!timeout_ms) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) + state = id_priv->state; + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, + RDMA_CM_ROUTE_QUERY) && + !cma_comp_exch(id_priv, RDMA_CM_ADDRINFO_RESOLVED, + RDMA_CM_ROUTE_QUERY)) return -EINVAL; cma_id_get(id_priv); @@ -3409,7 +3415,7 @@ int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) return 0; err: - cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); + cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, state); cma_id_put(id_priv); return ret; } @@ -5506,3 +5512,129 @@ static void __exit cma_cleanup(void) module_init(cma_init); module_exit(cma_cleanup); + +static void cma_query_ib_service_handler(int status, + struct sa_service_rec *recs, + unsigned int num_recs, void *context) +{ + struct cma_work *work = context; + struct rdma_id_private *id_priv = work->id; + struct sockaddr_ib *addr; + + if (status) + goto fail; + + if (!num_recs) { + status = -ENOENT; + goto fail; + } + + if (id_priv->id.route.service_recs) { + status = -EALREADY; + goto fail; + } + + id_priv->id.route.service_recs = + kmalloc_array(num_recs, sizeof(*recs), GFP_KERNEL); + if (!id_priv->id.route.service_recs) { + status = -ENOMEM; + goto fail; + } + + id_priv->id.route.num_service_recs = num_recs; + 
memcpy(id_priv->id.route.service_recs, recs, sizeof(*recs) * num_recs); + + addr = (struct sockaddr_ib *)&id_priv->id.route.addr.dst_addr; + addr->sib_family = AF_IB; + addr->sib_addr = *(struct ib_addr *)&recs->gid; + addr->sib_pkey = recs->pkey; + addr->sib_sid = recs->id; + rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, + (union ib_gid *)&addr->sib_addr); + ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, + ntohs(addr->sib_pkey)); + + queue_work(cma_wq, &work->work); + return; + +fail: + work->old_state = RDMA_CM_ADDRINFO_QUERY; + work->new_state = RDMA_CM_ADDR_BOUND; + work->event.event = RDMA_CM_EVENT_ADDRINFO_ERROR; + work->event.status = status; + pr_debug_ratelimited( + "RDMA CM: SERVICE_ERROR: failed to query service record. status %d\n", + status); + queue_work(cma_wq, &work->work); +} + +static int cma_resolve_ib_service(struct rdma_id_private *id_priv, + struct rdma_ucm_ib_service *ibs) +{ + struct sa_service_rec sr = {}; + ib_sa_comp_mask mask = 0; + struct cma_work *work; + + work = kzalloc(sizeof(*work), GFP_KERNEL); + if (!work) + return -ENOMEM; + + cma_id_get(id_priv); + + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ADDRINFO_QUERY; + work->new_state = RDMA_CM_ADDRINFO_RESOLVED; + work->event.event = RDMA_CM_EVENT_ADDRINFO_RESOLVED; + + if (ibs->flags & RDMA_USER_CM_IB_SERVICE_FLAG_ID) { + sr.id = cpu_to_be64(ibs->service_id); + mask |= IB_SA_SERVICE_REC_SERVICE_ID; + } + if (ibs->flags & RDMA_USER_CM_IB_SERVICE_FLAG_NAME) { + strscpy(sr.name, ibs->service_name, sizeof(sr.name)); + mask |= IB_SA_SERVICE_REC_SERVICE_NAME; + } + + id_priv->query_id = ib_sa_service_rec_get(&sa_client, + id_priv->id.device, + id_priv->id.port_num, + &sr, mask, + 2000, GFP_KERNEL, + cma_query_ib_service_handler, + work, &id_priv->query); + + if (id_priv->query_id < 0) { + cma_id_put(id_priv); + kfree(work); + return id_priv->query_id; + } + + return 0; +} + +int rdma_resolve_ib_service(struct rdma_cm_id *id, + struct rdma_ucm_ib_service *ibs) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!id_priv->cma_dev || + !cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDRINFO_QUERY)) + return -EINVAL; + + if (rdma_cap_ib_sa(id->device, id->port_num)) + ret = cma_resolve_ib_service(id_priv, ibs); + else + ret = -EOPNOTSUPP; + + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, RDMA_CM_ADDRINFO_QUERY, RDMA_CM_ADDR_BOUND); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_ib_service); diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h index b7354c94cf1b..c604b601f4d9 100644 --- a/drivers/infiniband/core/cma_priv.h +++ b/drivers/infiniband/core/cma_priv.h @@ -47,7 +47,9 @@ enum rdma_cm_state { RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN, RDMA_CM_DEVICE_REMOVAL, - RDMA_CM_DESTROYING + RDMA_CM_DESTROYING, + RDMA_CM_ADDRINFO_QUERY, + RDMA_CM_ADDRINFO_RESOLVED }; struct rdma_id_private { diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 3145cb34a1d2..b4f3c835844a 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1543,7 +1543,7 @@ static void __ib_unregister_device(struct ib_device *ib_dev) /* * We have a registration lock so that all the calls to unregister are - * fully fenced, once any unregister returns the device is truely + * fully fenced, once any unregister returns the device is truly * unregistered even if multiple callers are unregistering it at the * same time. 
This also interacts with the registration flow and * provides sane semantics if register and unregister are racing. diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 53571e6b3162..c23e9c847314 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -107,6 +107,8 @@ struct ib_sa_device { struct ib_sa_query { void (*callback)(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad); + void (*rmpp_callback)(struct ib_sa_query *sa_query, int status, + struct ib_mad_recv_wc *mad); void (*release)(struct ib_sa_query *); struct ib_sa_client *client; struct ib_sa_port *port; @@ -150,6 +152,13 @@ struct ib_sa_mcmember_query { struct ib_sa_query sa_query; }; +struct ib_sa_service_query { + void (*callback)(int status, struct sa_service_rec *rec, + unsigned int num_services, void *context); + void *context; + struct ib_sa_query sa_query; +}; + static LIST_HEAD(ib_nl_request_list); static DEFINE_SPINLOCK(ib_nl_request_lock); static atomic_t ib_nl_sa_request_seq; @@ -684,6 +693,58 @@ static const struct ib_field guidinfo_rec_table[] = { .size_bits = 512 }, }; +#define SERVICE_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct sa_service_rec, field), \ + .struct_size_bytes = sizeof_field(struct sa_service_rec, field), \ + .field_name = "sa_service_rec:" #field + +static const struct ib_field service_rec_table[] = { + { SERVICE_REC_FIELD(id), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 64 }, + { SERVICE_REC_FIELD(gid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(pkey), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 16 }, + { RESERVED, + .offset_words = 6, + .offset_bits = 16, + .size_bits = 16 }, + { SERVICE_REC_FIELD(lease), + .offset_words = 7, + .offset_bits = 0, + .size_bits = 32 }, + { SERVICE_REC_FIELD(key), + .offset_words = 8, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(name), + .offset_words = 12, + .offset_bits = 0, + .size_bits = 512 }, + { SERVICE_REC_FIELD(data_8), + .offset_words = 28, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(data_16), + .offset_words = 32, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(data_32), + .offset_words = 36, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(data_64), + .offset_words = 40, + .offset_bits = 0, + .size_bits = 128 }, +}; + #define RDMA_PRIMARY_PATH_MAX_REC_NUM 3 static inline void ib_sa_disable_local_svc(struct ib_sa_query *query) @@ -1013,6 +1074,8 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb, if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX) timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX; + spin_lock_irqsave(&ib_nl_request_lock, flags); + delta = timeout - sa_local_svc_timeout_ms; if (delta < 0) abs_delta = -delta; @@ -1020,7 +1083,6 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb, abs_delta = delta; if (delta != 0) { - spin_lock_irqsave(&ib_nl_request_lock, flags); sa_local_svc_timeout_ms = timeout; list_for_each_entry(query, &ib_nl_request_list, list) { if (delta < 0 && abs_delta > query->timeout) @@ -1038,9 +1100,10 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb, if (delay) mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, (unsigned long)delay); - spin_unlock_irqrestore(&ib_nl_request_lock, flags); } + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + settimeout_out: return 0; } @@ -1390,6 +1453,20 @@ void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute) } EXPORT_SYMBOL(ib_sa_pack_path); +void 
ib_sa_pack_service(struct sa_service_rec *rec, void *attribute) +{ + ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table), rec, + attribute); +} +EXPORT_SYMBOL(ib_sa_pack_service); + +void ib_sa_unpack_service(void *attribute, struct sa_service_rec *rec) +{ + ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table), attribute, + rec); +} +EXPORT_SYMBOL(ib_sa_unpack_service); + static bool ib_sa_opa_pathrecord_support(struct ib_sa_client *client, struct ib_sa_device *sa_dev, u32 port_num) @@ -1479,6 +1556,68 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, } } +#define IB_SA_DATA_OFFS 56 +#define IB_SERVICE_REC_SZ 176 + +static void ib_unpack_service_rmpp(struct sa_service_rec *rec, + struct ib_mad_recv_wc *mad_wc, + int num_services) +{ + unsigned int cp_sz, data_i, data_size, rec_i = 0, buf_i = 0; + struct ib_mad_recv_buf *mad_buf; + u8 buf[IB_SERVICE_REC_SZ]; + u8 *data; + + data_size = sizeof(((struct ib_sa_mad *) mad_buf->mad)->data); + + list_for_each_entry(mad_buf, &mad_wc->rmpp_list, list) { + data = ((struct ib_sa_mad *) mad_buf->mad)->data; + data_i = 0; + while (data_i < data_size && rec_i < num_services) { + cp_sz = min(IB_SERVICE_REC_SZ - buf_i, + data_size - data_i); + memcpy(buf + buf_i, data + data_i, cp_sz); + data_i += cp_sz; + buf_i += cp_sz; + if (buf_i == IB_SERVICE_REC_SZ) { + ib_sa_unpack_service(buf, rec + rec_i); + buf_i = 0; + rec_i++; + } + } + } +} + +static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query, int status, + struct ib_mad_recv_wc *mad_wc) +{ + struct ib_sa_service_query *query = + container_of(sa_query, struct ib_sa_service_query, sa_query); + struct sa_service_rec *rec; + int num_services; + + if (!mad_wc || !mad_wc->recv_buf.mad) { + query->callback(status, NULL, 0, query->context); + return; + } + + num_services = (mad_wc->mad_len - IB_SA_DATA_OFFS) / IB_SERVICE_REC_SZ; + if (!num_services) { + query->callback(-ENODATA, NULL, 0, query->context); + return; + } + + rec = kmalloc_array(num_services, sizeof(*rec), GFP_KERNEL); + if (!rec) { + query->callback(-ENOMEM, NULL, 0, query->context); + return; + } + + ib_unpack_service_rmpp(rec, mad_wc, num_services); + query->callback(status, rec, num_services, query->context); + kfree(rec); +} + static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) { struct ib_sa_path_query *query = @@ -1488,6 +1627,14 @@ static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) kfree(query); } +static void ib_sa_service_rec_release(struct ib_sa_query *sa_query) +{ + struct ib_sa_service_query *query = + container_of(sa_query, struct ib_sa_service_query, sa_query); + + kfree(query); +} + /** * ib_sa_path_rec_get - Start a Path get query * @client:SA client @@ -1618,6 +1765,101 @@ err1: } EXPORT_SYMBOL(ib_sa_path_rec_get); +/** + * ib_sa_service_rec_get - Start a Service get query + * @client: SA client + * @device: device to send query on + * @port_num: port number to send query on + * @rec: Service Record to send in query + * @comp_mask: component mask to send in query + * @timeout_ms: time to wait for response + * @gfp_mask: GFP mask to use for internal allocations + * @callback: function called when query completes, times out or is + * canceled + * @context: opaque user context passed to callback + * @sa_query: query context, used to cancel query + * + * Send a Service Record Get query to the SA to look up a path. 
The + * callback function will be called when the query completes (or + * fails); status is 0 for a successful response, -EINTR if the query + * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error + * occurred sending the query. The resp parameter of the callback is + * only valid if status is 0. + * + * If the return value of ib_sa_service_rec_get() is negative, it is an + * error code. Otherwise it is a query ID that can be used to cancel + * the query. + */ +int ib_sa_service_rec_get(struct ib_sa_client *client, + struct ib_device *device, u32 port_num, + struct sa_service_rec *rec, + ib_sa_comp_mask comp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct sa_service_rec *resp, + unsigned int num_services, + void *context), + void *context, struct ib_sa_query **sa_query) +{ + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_service_query *query; + struct ib_mad_agent *agent; + struct ib_sa_port *port; + struct ib_sa_mad *mad; + int ret; + + if (!sa_dev) + return -ENODEV; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + query = kzalloc(sizeof(*query), gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err1; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(&query->sa_query, agent); + + query->sa_query.rmpp_callback = callback ? ib_sa_service_rec_callback : + NULL; + query->sa_query.release = ib_sa_service_rec_release; + mad->mad_hdr.method = IB_MGMT_METHOD_GET_TABLE; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_SERVICE_REC); + mad->sa_hdr.comp_mask = comp_mask; + + ib_sa_pack_service(rec, mad->data); + + *sa_query = &query->sa_query; + query->sa_query.mad_buf->context[1] = rec; + + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + if (ret < 0) + goto err2; + + return ret; + +err2: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); +err1: + kfree(query); + return ret; +} +EXPORT_SYMBOL(ib_sa_service_rec_get); + static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) { @@ -1987,23 +2229,29 @@ static void send_handler(struct ib_mad_agent *agent, { struct ib_sa_query *query = mad_send_wc->send_buf->context[0]; unsigned long flags; + int status = 0; - if (query->callback) + if (query->callback || query->rmpp_callback) { switch (mad_send_wc->status) { case IB_WC_SUCCESS: /* No callback -- already got recv */ break; case IB_WC_RESP_TIMEOUT_ERR: - query->callback(query, -ETIMEDOUT, NULL); + status = -ETIMEDOUT; break; case IB_WC_WR_FLUSH_ERR: - query->callback(query, -EINTR, NULL); + status = -EINTR; break; default: - query->callback(query, -EIO, NULL); + status = -EIO; break; } + if (status) + query->callback ? 
query->callback(query, status, NULL) : + query->rmpp_callback(query, status, NULL); + } + xa_lock_irqsave(&queries, flags); __xa_erase(&queries, query->id); xa_unlock_irqrestore(&queries, flags); @@ -2019,17 +2267,25 @@ static void recv_handler(struct ib_mad_agent *mad_agent, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_sa_query *query; + struct ib_mad *mad; + if (!send_buf) return; query = send_buf->context[0]; - if (query->callback) { + mad = mad_recv_wc->recv_buf.mad; + + if (query->rmpp_callback) { + if (mad_recv_wc->wc->status == IB_WC_SUCCESS) + query->rmpp_callback(query, mad->mad_hdr.status ? + -EINVAL : 0, mad_recv_wc); + else + query->rmpp_callback(query, -EIO, NULL); + } else if (query->callback) { if (mad_recv_wc->wc->status == IB_WC_SUCCESS) - query->callback(query, - mad_recv_wc->recv_buf.mad->mad_hdr.status ? - -EINVAL : 0, - (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad); + query->callback(query, mad->mad_hdr.status ? + -EINVAL : 0, (struct ib_sa_mad *)mad); else query->callback(query, -EIO, NULL); } @@ -2181,8 +2437,9 @@ static int ib_sa_add_one(struct ib_device *device) sa_dev->port[i].agent = ib_register_mad_agent(device, i + s, IB_QPT_GSI, - NULL, 0, send_handler, - recv_handler, sa_dev, 0); + NULL, IB_MGMT_RMPP_VERSION, + send_handler, recv_handler, + sa_dev, 0); if (IS_ERR(sa_dev->port[i].agent)) { ret = PTR_ERR(sa_dev->port[i].agent); goto err; diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 6e700b974033..f86ece701db6 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -282,6 +282,10 @@ static struct ucma_event *ucma_create_uevent(struct ucma_context *ctx, } uevent->resp.event = event->event; uevent->resp.status = event->status; + + if (event->event == RDMA_CM_EVENT_ADDRINFO_RESOLVED) + goto out; + if (ctx->cm_id->qp_type == IB_QPT_UD) ucma_copy_ud_event(ctx->cm_id->device, &uevent->resp.param.ud, &event->param.ud); @@ -289,6 +293,7 @@ static struct ucma_event *ucma_create_uevent(struct ucma_context *ctx, ucma_copy_conn_event(&uevent->resp.param.conn, &event->param.conn); +out: uevent->resp.ece.vendor_id = event->ece.vendor_id; uevent->resp.ece.attr_mod = event->ece.attr_mod; return uevent; @@ -728,6 +733,28 @@ static ssize_t ucma_resolve_addr(struct ucma_file *file, return ret; } +static ssize_t ucma_resolve_ib_service(struct ucma_file *file, + const char __user *inbuf, int in_len, + int out_len) +{ + struct rdma_ucm_resolve_ib_service cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + mutex_lock(&ctx->mutex); + ret = rdma_resolve_ib_service(ctx->cm_id, &cmd.ibs); + mutex_unlock(&ctx->mutex); + ucma_put_ctx(ctx); + return ret; +} + static ssize_t ucma_resolve_route(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) @@ -994,6 +1021,43 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx, return ret; } +static ssize_t ucma_query_ib_service(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_ib_service_resp *resp; + int n, ret = 0; + + if (out_len < sizeof(struct rdma_ucm_query_ib_service_resp)) + return -ENOSPC; + + if (!ctx->cm_id->route.service_recs) + return -ENODATA; + + resp = kzalloc(out_len, GFP_KERNEL); + if (!resp) + return -ENOMEM; + + resp->num_service_recs = ctx->cm_id->route.num_service_recs; + + n = (out_len - sizeof(struct rdma_ucm_query_ib_service_resp)) / + sizeof(struct 
ib_user_service_rec); + + if (!n) + goto out; + + if (n > ctx->cm_id->route.num_service_recs) + n = ctx->cm_id->route.num_service_recs; + + memcpy(resp->recs, ctx->cm_id->route.service_recs, + sizeof(*resp->recs) * n); + if (copy_to_user(response, resp, struct_size(resp, recs, n))) + ret = -EFAULT; + +out: + kfree(resp); + return ret; +} + static ssize_t ucma_query(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) @@ -1022,6 +1086,9 @@ static ssize_t ucma_query(struct ucma_file *file, case RDMA_USER_CM_QUERY_GID: ret = ucma_query_gid(ctx, response, out_len); break; + case RDMA_USER_CM_QUERY_IB_SERVICE: + ret = ucma_query_ib_service(ctx, response, out_len); + break; default: ret = -ENOSYS; break; @@ -1678,6 +1745,55 @@ err_unlock: return ret; } +static ssize_t ucma_write_cm_event(struct ucma_file *file, + const char __user *inbuf, int in_len, + int out_len) +{ + struct rdma_ucm_write_cm_event cmd; + struct rdma_cm_event event = {}; + struct ucma_event *uevent; + struct ucma_context *ctx; + int ret = 0; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if ((cmd.event != RDMA_CM_EVENT_USER) && + (cmd.event != RDMA_CM_EVENT_INTERNAL)) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + event.event = cmd.event; + event.status = cmd.status; + event.param.arg = cmd.param.arg; + + uevent = kzalloc(sizeof(*uevent), GFP_KERNEL); + if (!uevent) { + ret = -ENOMEM; + goto out; + } + + uevent->ctx = ctx; + uevent->resp.uid = ctx->uid; + uevent->resp.id = ctx->id; + uevent->resp.event = event.event; + uevent->resp.status = event.status; + memcpy(uevent->resp.param.arg32, &event.param.arg, + sizeof(event.param.arg)); + + mutex_lock(&ctx->file->mut); + list_add_tail(&uevent->list, &ctx->file->event_list); + mutex_unlock(&ctx->file->mut); + wake_up_interruptible(&ctx->file->poll_wait); + +out: + ucma_put_ctx(ctx); + return ret; +} + static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) = { @@ -1703,7 +1819,9 @@ static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, [RDMA_USER_CM_CMD_QUERY] = ucma_query, [RDMA_USER_CM_CMD_BIND] = ucma_bind, [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr, - [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast + [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast, + [RDMA_USER_CM_CMD_RESOLVE_IB_SERVICE] = ucma_resolve_ib_service, + [RDMA_USER_CM_CMD_WRITE_CM_EVENT] = ucma_write_cm_event, }; static ssize_t ucma_write(struct file *filp, const char __user *buf, diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile index df61b2299ec0..b706dc0d0263 100644 --- a/drivers/infiniband/hw/Makefile +++ b/drivers/infiniband/hw/Makefile @@ -14,3 +14,4 @@ obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns/ obj-$(CONFIG_INFINIBAND_QEDR) += qedr/ obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re/ obj-$(CONFIG_INFINIBAND_ERDMA) += erdma/ +obj-$(CONFIG_INFINIBAND_IONIC) += ionic/ diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 6df5a2738c95..3485e495ac6a 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -172,9 +172,9 @@ struct bnxt_re_dev { struct list_head list; unsigned long flags; #define BNXT_RE_FLAG_NETDEV_REGISTERED 0 +#define BNXT_RE_FLAG_STATS_CTX3_ALLOC 1 #define BNXT_RE_FLAG_HAVE_L2_REF 3 #define BNXT_RE_FLAG_RCFW_CHANNEL_EN 4 -#define BNXT_RE_FLAG_QOS_WORK_REG 5 #define BNXT_RE_FLAG_RESOURCES_ALLOCATED 7 #define 
BNXT_RE_FLAG_RESOURCES_INITIALIZED 8 #define BNXT_RE_FLAG_ERR_DEVICE_DETACHED 17 @@ -187,9 +187,6 @@ struct bnxt_re_dev { int id; - struct delayed_work worker; - u8 cur_prio_map; - /* RCFW Channel */ struct bnxt_qplib_rcfw rcfw; @@ -227,6 +224,13 @@ struct bnxt_re_dev { struct workqueue_struct *dcb_wq; struct dentry *cc_config; struct bnxt_re_dbg_cc_config_params *cc_config_params; +#define BNXT_VPD_FLD_LEN 32 + char board_partno[BNXT_VPD_FLD_LEN]; + /* RoCE mirror */ + u16 mirror_vnic_id; + union ib_gid ugid; + u32 ugid_index; + u8 sniffer_flow_created : 1; }; #define to_bnxt_re_dev(ptr, member) \ @@ -243,6 +247,10 @@ int bnxt_re_assign_pma_port_counters(struct bnxt_re_dev *rdev, struct ib_mad *ou int bnxt_re_assign_pma_port_ext_counters(struct bnxt_re_dev *rdev, struct ib_mad *out_mad); +void bnxt_re_hwrm_free_vnic(struct bnxt_re_dev *rdev); +int bnxt_re_hwrm_alloc_vnic(struct bnxt_re_dev *rdev); +int bnxt_re_hwrm_cfg_vnic(struct bnxt_re_dev *rdev, u32 qp_id); + static inline struct device *rdev_to_dev(struct bnxt_re_dev *rdev) { if (rdev) @@ -276,4 +284,7 @@ static inline int bnxt_re_read_context_allowed(struct bnxt_re_dev *rdev) #define BNXT_RE_CONTEXT_TYPE_MRW_SIZE_P7 192 #define BNXT_RE_CONTEXT_TYPE_SRQ_SIZE_P7 192 +#define BNXT_RE_HWRM_CMD_TIMEOUT(rdev) \ + ((rdev)->chip_ctx->hwrm_cmd_max_timeout * 1000) + #endif diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.c b/drivers/infiniband/hw/bnxt_re/debugfs.c index e632f1661b92..be5e9b5ca2f0 100644 --- a/drivers/infiniband/hw/bnxt_re/debugfs.c +++ b/drivers/infiniband/hw/bnxt_re/debugfs.c @@ -8,6 +8,7 @@ #include <linux/debugfs.h> #include <linux/pci.h> +#include <linux/seq_file.h> #include <rdma/ib_addr.h> #include "bnxt_ulp.h" @@ -314,6 +315,40 @@ static const struct file_operations bnxt_re_cc_config_ops = { .write = bnxt_re_cc_config_set, }; +static int info_show(struct seq_file *m, void *unused) +{ + struct bnxt_re_dev *rdev = m->private; + struct bnxt_re_res_cntrs *res_s = &rdev->stats.res; + + seq_puts(m, "Info:\n"); + seq_printf(m, "Device Name\t\t: %s\n", dev_name(&rdev->ibdev.dev)); + seq_printf(m, "PD Watermark\t\t: %llu\n", res_s->pd_watermark); + seq_printf(m, "AH Watermark\t\t: %llu\n", res_s->ah_watermark); + seq_printf(m, "QP Watermark\t\t: %llu\n", res_s->qp_watermark); + seq_printf(m, "RC QP Watermark\t\t: %llu\n", res_s->rc_qp_watermark); + seq_printf(m, "UD QP Watermark\t\t: %llu\n", res_s->ud_qp_watermark); + seq_printf(m, "SRQ Watermark\t\t: %llu\n", res_s->srq_watermark); + seq_printf(m, "CQ Watermark\t\t: %llu\n", res_s->cq_watermark); + seq_printf(m, "MR Watermark\t\t: %llu\n", res_s->mr_watermark); + seq_printf(m, "MW Watermark\t\t: %llu\n", res_s->mw_watermark); + seq_printf(m, "CQ Resize Count\t\t: %d\n", atomic_read(&res_s->resize_count)); + if (rdev->pacing.dbr_pacing) { + seq_printf(m, "DB Pacing Reschedule\t: %llu\n", rdev->stats.pacing.resched); + seq_printf(m, "DB Pacing Complete\t: %llu\n", rdev->stats.pacing.complete); + seq_printf(m, "DB Pacing Alerts\t: %llu\n", rdev->stats.pacing.alerts); + seq_printf(m, "DB FIFO Register\t: 0x%x\n", + readl(rdev->en_dev->bar0 + rdev->pacing.dbr_db_fifo_reg_off)); + } + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(info); + +static void bnxt_re_debugfs_add_info(struct bnxt_re_dev *rdev) +{ + debugfs_create_file("info", 0400, rdev->dbg_root, rdev, &info_fops); +} + void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev) { struct pci_dev *pdev = rdev->en_dev->pdev; @@ -325,6 +360,8 @@ void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev) rdev->qp_debugfs = 
debugfs_create_dir("QPs", rdev->dbg_root); rdev->cc_config = debugfs_create_dir("cc_config", rdev->dbg_root); + bnxt_re_debugfs_add_info(rdev); + rdev->cc_config_params = kzalloc(sizeof(*cc_params), GFP_KERNEL); for (i = 0; i < BNXT_RE_CC_PARAM_GEN0; i++) { diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.c b/drivers/infiniband/hw/bnxt_re/hw_counters.c index 44bb082e0a60..651cf9d0e0c7 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.c +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.c @@ -51,25 +51,6 @@ #include "hw_counters.h" static const struct rdma_stat_desc bnxt_re_stat_descs[] = { - [BNXT_RE_ACTIVE_PD].name = "active_pds", - [BNXT_RE_ACTIVE_AH].name = "active_ahs", - [BNXT_RE_ACTIVE_QP].name = "active_qps", - [BNXT_RE_ACTIVE_RC_QP].name = "active_rc_qps", - [BNXT_RE_ACTIVE_UD_QP].name = "active_ud_qps", - [BNXT_RE_ACTIVE_SRQ].name = "active_srqs", - [BNXT_RE_ACTIVE_CQ].name = "active_cqs", - [BNXT_RE_ACTIVE_MR].name = "active_mrs", - [BNXT_RE_ACTIVE_MW].name = "active_mws", - [BNXT_RE_WATERMARK_PD].name = "watermark_pds", - [BNXT_RE_WATERMARK_AH].name = "watermark_ahs", - [BNXT_RE_WATERMARK_QP].name = "watermark_qps", - [BNXT_RE_WATERMARK_RC_QP].name = "watermark_rc_qps", - [BNXT_RE_WATERMARK_UD_QP].name = "watermark_ud_qps", - [BNXT_RE_WATERMARK_SRQ].name = "watermark_srqs", - [BNXT_RE_WATERMARK_CQ].name = "watermark_cqs", - [BNXT_RE_WATERMARK_MR].name = "watermark_mrs", - [BNXT_RE_WATERMARK_MW].name = "watermark_mws", - [BNXT_RE_RESIZE_CQ_CNT].name = "resize_cq_cnt", [BNXT_RE_RX_PKTS].name = "rx_pkts", [BNXT_RE_RX_BYTES].name = "rx_bytes", [BNXT_RE_TX_PKTS].name = "tx_pkts", @@ -79,22 +60,22 @@ static const struct rdma_stat_desc bnxt_re_stat_descs[] = { [BNXT_RE_TX_DISCARDS].name = "tx_roce_discards", [BNXT_RE_RX_ERRORS].name = "rx_roce_errors", [BNXT_RE_RX_DISCARDS].name = "rx_roce_discards", - [BNXT_RE_TO_RETRANSMITS].name = "to_retransmits", - [BNXT_RE_SEQ_ERR_NAKS_RCVD].name = "seq_err_naks_rcvd", - [BNXT_RE_MAX_RETRY_EXCEEDED].name = "max_retry_exceeded", - [BNXT_RE_RNR_NAKS_RCVD].name = "rnr_naks_rcvd", - [BNXT_RE_MISSING_RESP].name = "missing_resp", + [BNXT_RE_TO_RETRANSMITS].name = "local_ack_timeout_err", + [BNXT_RE_SEQ_ERR_NAKS_RCVD].name = "packet_seq_err", + [BNXT_RE_MAX_RETRY_EXCEEDED].name = "max_retry_exceeded", + [BNXT_RE_RNR_NAKS_RCVD].name = "rnr_nak_retry_err", + [BNXT_RE_MISSING_RESP].name = "implied_nak_seq_err", [BNXT_RE_UNRECOVERABLE_ERR].name = "unrecoverable_err", [BNXT_RE_BAD_RESP_ERR].name = "bad_resp_err", [BNXT_RE_LOCAL_QP_OP_ERR].name = "local_qp_op_err", [BNXT_RE_LOCAL_PROTECTION_ERR].name = "local_protection_err", [BNXT_RE_MEM_MGMT_OP_ERR].name = "mem_mgmt_op_err", - [BNXT_RE_REMOTE_INVALID_REQ_ERR].name = "remote_invalid_req_err", - [BNXT_RE_REMOTE_ACCESS_ERR].name = "remote_access_err", + [BNXT_RE_REMOTE_INVALID_REQ_ERR].name = "req_remote_invalid_request", + [BNXT_RE_REMOTE_ACCESS_ERR].name = "req_remote_access_errors", [BNXT_RE_REMOTE_OP_ERR].name = "remote_op_err", - [BNXT_RE_DUP_REQ].name = "dup_req", + [BNXT_RE_DUP_REQ].name = "duplicate_request", [BNXT_RE_RES_EXCEED_MAX].name = "res_exceed_max", - [BNXT_RE_RES_LENGTH_MISMATCH].name = "res_length_mismatch", + [BNXT_RE_RES_LENGTH_MISMATCH].name = "resp_local_length_error", [BNXT_RE_RES_EXCEEDS_WQE].name = "res_exceeds_wqe", [BNXT_RE_RES_OPCODE_ERR].name = "res_opcode_err", [BNXT_RE_RES_RX_INVALID_RKEY].name = "res_rx_invalid_rkey", @@ -118,7 +99,7 @@ static const struct rdma_stat_desc bnxt_re_stat_descs[] = { [BNXT_RE_RES_SRQ_LOAD_ERR].name = "res_srq_load_err", 
[BNXT_RE_RES_TX_PCI_ERR].name = "res_tx_pci_err", [BNXT_RE_RES_RX_PCI_ERR].name = "res_rx_pci_err", - [BNXT_RE_OUT_OF_SEQ_ERR].name = "oos_drop_count", + [BNXT_RE_OUT_OF_SEQ_ERR].name = "out_of_sequence", [BNXT_RE_TX_ATOMIC_REQ].name = "tx_atomic_req", [BNXT_RE_TX_READ_REQ].name = "tx_read_req", [BNXT_RE_TX_READ_RES].name = "tx_read_resp", @@ -126,23 +107,22 @@ static const struct rdma_stat_desc bnxt_re_stat_descs[] = { [BNXT_RE_TX_SEND_REQ].name = "tx_send_req", [BNXT_RE_TX_ROCE_PKTS].name = "tx_roce_only_pkts", [BNXT_RE_TX_ROCE_BYTES].name = "tx_roce_only_bytes", - [BNXT_RE_RX_ATOMIC_REQ].name = "rx_atomic_req", - [BNXT_RE_RX_READ_REQ].name = "rx_read_req", + [BNXT_RE_RX_ATOMIC_REQ].name = "rx_atomic_requests", + [BNXT_RE_RX_READ_REQ].name = "rx_read_requests", [BNXT_RE_RX_READ_RESP].name = "rx_read_resp", - [BNXT_RE_RX_WRITE_REQ].name = "rx_write_req", + [BNXT_RE_RX_WRITE_REQ].name = "rx_write_requests", [BNXT_RE_RX_SEND_REQ].name = "rx_send_req", [BNXT_RE_RX_ROCE_PKTS].name = "rx_roce_only_pkts", [BNXT_RE_RX_ROCE_BYTES].name = "rx_roce_only_bytes", [BNXT_RE_RX_ROCE_GOOD_PKTS].name = "rx_roce_good_pkts", [BNXT_RE_RX_ROCE_GOOD_BYTES].name = "rx_roce_good_bytes", - [BNXT_RE_OOB].name = "rx_out_of_buffer", - [BNXT_RE_TX_CNP].name = "tx_cnp_pkts", - [BNXT_RE_RX_CNP].name = "rx_cnp_pkts", - [BNXT_RE_RX_ECN].name = "rx_ecn_marked_pkts", - [BNXT_RE_PACING_RESCHED].name = "pacing_reschedule", - [BNXT_RE_PACING_CMPL].name = "pacing_complete", - [BNXT_RE_PACING_ALERT].name = "pacing_alerts", - [BNXT_RE_DB_FIFO_REG].name = "db_fifo_register", + [BNXT_RE_OOB].name = "out_of_buffer", + [BNXT_RE_TX_CNP].name = "np_cnp_pkts", + [BNXT_RE_RX_CNP].name = "rp_cnp_handled", + [BNXT_RE_RX_ECN].name = "np_ecn_marked_roce_packets", + [BNXT_RE_REQ_CQE_ERROR].name = "req_cqe_error", + [BNXT_RE_RESP_CQE_ERROR].name = "resp_cqe_error", + [BNXT_RE_RESP_REMOTE_ACCESS_ERRS].name = "resp_remote_access_errors", }; static void bnxt_re_copy_ext_stats(struct bnxt_re_dev *rdev, @@ -273,18 +253,20 @@ static void bnxt_re_copy_err_stats(struct bnxt_re_dev *rdev, err_s->res_rx_pci_err; stats->value[BNXT_RE_OUT_OF_SEQ_ERR] = err_s->res_oos_drop_count; -} - -static void bnxt_re_copy_db_pacing_stats(struct bnxt_re_dev *rdev, - struct rdma_hw_stats *stats) -{ - struct bnxt_re_db_pacing_stats *pacing_s = &rdev->stats.pacing; - - stats->value[BNXT_RE_PACING_RESCHED] = pacing_s->resched; - stats->value[BNXT_RE_PACING_CMPL] = pacing_s->complete; - stats->value[BNXT_RE_PACING_ALERT] = pacing_s->alerts; - stats->value[BNXT_RE_DB_FIFO_REG] = - readl(rdev->en_dev->bar0 + rdev->pacing.dbr_db_fifo_reg_off); + stats->value[BNXT_RE_REQ_CQE_ERROR] = + err_s->bad_resp_err + + err_s->local_qp_op_err + + err_s->local_protection_err + + err_s->mem_mgmt_op_err + + err_s->remote_invalid_req_err + + err_s->remote_access_err + + err_s->remote_op_err; + stats->value[BNXT_RE_RESP_CQE_ERROR] = + err_s->res_cmp_err + + err_s->res_cq_load_err; + stats->value[BNXT_RE_RESP_REMOTE_ACCESS_ERRS] = + err_s->res_rx_no_perm + + err_s->res_tx_no_perm; } int bnxt_re_assign_pma_port_ext_counters(struct bnxt_re_dev *rdev, struct ib_mad *out_mad) @@ -382,7 +364,6 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, u32 port, int index) { struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); - struct bnxt_re_res_cntrs *res_s = &rdev->stats.res; struct bnxt_qplib_roce_stats *err_s = NULL; struct ctx_hw_stats *hw_stats = NULL; int rc = 0; @@ -391,26 +372,6 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, if (!port || !stats) return -EINVAL; - 
stats->value[BNXT_RE_ACTIVE_QP] = atomic_read(&res_s->qp_count); - stats->value[BNXT_RE_ACTIVE_RC_QP] = atomic_read(&res_s->rc_qp_count); - stats->value[BNXT_RE_ACTIVE_UD_QP] = atomic_read(&res_s->ud_qp_count); - stats->value[BNXT_RE_ACTIVE_SRQ] = atomic_read(&res_s->srq_count); - stats->value[BNXT_RE_ACTIVE_CQ] = atomic_read(&res_s->cq_count); - stats->value[BNXT_RE_ACTIVE_MR] = atomic_read(&res_s->mr_count); - stats->value[BNXT_RE_ACTIVE_MW] = atomic_read(&res_s->mw_count); - stats->value[BNXT_RE_ACTIVE_PD] = atomic_read(&res_s->pd_count); - stats->value[BNXT_RE_ACTIVE_AH] = atomic_read(&res_s->ah_count); - stats->value[BNXT_RE_WATERMARK_QP] = res_s->qp_watermark; - stats->value[BNXT_RE_WATERMARK_RC_QP] = res_s->rc_qp_watermark; - stats->value[BNXT_RE_WATERMARK_UD_QP] = res_s->ud_qp_watermark; - stats->value[BNXT_RE_WATERMARK_SRQ] = res_s->srq_watermark; - stats->value[BNXT_RE_WATERMARK_CQ] = res_s->cq_watermark; - stats->value[BNXT_RE_WATERMARK_MR] = res_s->mr_watermark; - stats->value[BNXT_RE_WATERMARK_MW] = res_s->mw_watermark; - stats->value[BNXT_RE_WATERMARK_PD] = res_s->pd_watermark; - stats->value[BNXT_RE_WATERMARK_AH] = res_s->ah_watermark; - stats->value[BNXT_RE_RESIZE_CQ_CNT] = atomic_read(&res_s->resize_count); - if (hw_stats) { stats->value[BNXT_RE_RECOVERABLE_ERRORS] = le64_to_cpu(hw_stats->tx_bcast_pkts); @@ -449,8 +410,6 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, goto done; } } - if (rdev->pacing.dbr_pacing && bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) - bnxt_re_copy_db_pacing_stats(rdev, stats); } done: diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.h b/drivers/infiniband/hw/bnxt_re/hw_counters.h index e541b6f8ca9f..09d371d442aa 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.h +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.h @@ -41,25 +41,6 @@ #define __BNXT_RE_HW_STATS_H__ enum bnxt_re_hw_stats { - BNXT_RE_ACTIVE_PD, - BNXT_RE_ACTIVE_AH, - BNXT_RE_ACTIVE_QP, - BNXT_RE_ACTIVE_RC_QP, - BNXT_RE_ACTIVE_UD_QP, - BNXT_RE_ACTIVE_SRQ, - BNXT_RE_ACTIVE_CQ, - BNXT_RE_ACTIVE_MR, - BNXT_RE_ACTIVE_MW, - BNXT_RE_WATERMARK_PD, - BNXT_RE_WATERMARK_AH, - BNXT_RE_WATERMARK_QP, - BNXT_RE_WATERMARK_RC_QP, - BNXT_RE_WATERMARK_UD_QP, - BNXT_RE_WATERMARK_SRQ, - BNXT_RE_WATERMARK_CQ, - BNXT_RE_WATERMARK_MR, - BNXT_RE_WATERMARK_MW, - BNXT_RE_RESIZE_CQ_CNT, BNXT_RE_RX_PKTS, BNXT_RE_RX_BYTES, BNXT_RE_TX_PKTS, @@ -129,10 +110,9 @@ enum bnxt_re_hw_stats { BNXT_RE_TX_CNP, BNXT_RE_RX_CNP, BNXT_RE_RX_ECN, - BNXT_RE_PACING_RESCHED, - BNXT_RE_PACING_CMPL, - BNXT_RE_PACING_ALERT, - BNXT_RE_DB_FIFO_REG, + BNXT_RE_REQ_CQE_ERROR, + BNXT_RE_RESP_CQE_ERROR, + BNXT_RE_RESP_REMOTE_ACCESS_ERRS, BNXT_RE_NUM_EXT_COUNTERS }; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 260dc67b8b87..4dab5ca7362b 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -288,7 +288,9 @@ int bnxt_re_query_port(struct ib_device *ibdev, u32 port_num, } port_attr->max_mtu = IB_MTU_4096; port_attr->active_mtu = iboe_get_mtu(rdev->netdev->mtu); - port_attr->gid_tbl_len = dev_attr->max_sgid; + /* One GID is reserved for RawEth QP. Report one less */ + port_attr->gid_tbl_len = (rdev->rcfw.roce_mirror ? 
(dev_attr->max_sgid - 1) : + dev_attr->max_sgid); port_attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP | IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP; @@ -375,7 +377,7 @@ int bnxt_re_del_gid(const struct ib_gid_attr *attr, void **context) if (!ctx) return -EINVAL; - if (sgid_tbl && sgid_tbl->active) { + if (sgid_tbl->active) { if (ctx->idx >= sgid_tbl->max) return -EINVAL; gid_to_del = &sgid_tbl->tbl[ctx->idx].gid; @@ -429,7 +431,7 @@ int bnxt_re_add_gid(const struct ib_gid_attr *attr, void **context) rc = bnxt_qplib_add_sgid(sgid_tbl, (struct bnxt_qplib_gid *)&attr->gid, rdev->qplib_res.netdev->dev_addr, - vlan_id, true, &tbl_idx); + vlan_id, true, &tbl_idx, false, 0); if (rc == -EALREADY) { ctx_tbl = sgid_tbl->ctx; ctx_tbl[tbl_idx]->refcnt++; @@ -955,6 +957,20 @@ fail: return rc; } +static void bnxt_re_del_unique_gid(struct bnxt_re_dev *rdev) +{ + int rc; + + if (!rdev->rcfw.roce_mirror) + return; + + rc = bnxt_qplib_del_sgid(&rdev->qplib_res.sgid_tbl, + (struct bnxt_qplib_gid *)&rdev->ugid, + 0xFFFF, true); + if (rc) + dev_err(rdev_to_dev(rdev), "Failed to delete unique GID, rc: %d\n", rc); +} + /* Queue Pairs */ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) { @@ -994,6 +1010,9 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) else if (qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_UD) atomic_dec(&rdev->stats.res.ud_qp_count); + if (qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_RAW_ETHERTYPE) + bnxt_re_del_unique_gid(rdev); + ib_umem_release(qp->rumem); ib_umem_release(qp->sumem); @@ -1018,6 +1037,8 @@ static u8 __from_ib_qp_type(enum ib_qp_type type) return CMDQ_CREATE_QP_TYPE_RC; case IB_QPT_UD: return CMDQ_CREATE_QP_TYPE_UD; + case IB_QPT_RAW_PACKET: + return CMDQ_CREATE_QP_TYPE_RAW_ETHERTYPE; default: return IB_QPT_MAX; } @@ -1595,6 +1616,29 @@ static bool bnxt_re_test_qp_limits(struct bnxt_re_dev *rdev, return rc; } +static int bnxt_re_add_unique_gid(struct bnxt_re_dev *rdev) +{ + struct bnxt_qplib_ctx *hctx = &rdev->qplib_ctx; + struct bnxt_qplib_res *res = &rdev->qplib_res; + int rc; + + if (!rdev->rcfw.roce_mirror) + return 0; + + rdev->ugid.global.subnet_prefix = cpu_to_be64(0xfe8000000000abcdLL); + addrconf_ifid_eui48(&rdev->ugid.raw[8], rdev->netdev); + + rc = bnxt_qplib_add_sgid(&res->sgid_tbl, + (struct bnxt_qplib_gid *)&rdev->ugid, + rdev->qplib_res.netdev->dev_addr, + 0xFFFF, true, &rdev->ugid_index, true, + hctx->stats3.fw_id); + if (rc) + dev_err(rdev_to_dev(rdev), "Failed to add unique GID. rc = %d\n", rc); + + return rc; +} + int bnxt_re_create_qp(struct ib_qp *ib_qp, struct ib_qp_init_attr *qp_init_attr, struct ib_udata *udata) { @@ -1656,6 +1700,17 @@ int bnxt_re_create_qp(struct ib_qp *ib_qp, struct ib_qp_init_attr *qp_init_attr, } } + /* Support for RawEth QP is added to capture TCP pkt dump. 
+ * So unique SGID is used to avoid incorrect statistics on per + * function stats_ctx + */ + if (qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_RAW_ETHERTYPE) { + rc = bnxt_re_add_unique_gid(rdev); + if (rc) + goto qp_destroy; + qp->qplib_qp.ugid_index = rdev->ugid_index; + } + qp->ib_qp.qp_num = qp->qplib_qp.id; if (qp_init_attr->qp_type == IB_QPT_GSI) rdev->gsi_ctx.gsi_qp = qp; @@ -2301,7 +2356,7 @@ int bnxt_re_query_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, qp_attr->pkey_index = qplib_qp->pkey_index; qp_attr->qkey = qplib_qp->qkey; qp_attr->ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; - rdma_ah_set_grh(&qp_attr->ah_attr, NULL, qplib_qp->ah.flow_label, + rdma_ah_set_grh(&qp_attr->ah_attr, NULL, qplib_qp->udp_sport, qplib_qp->ah.host_sgid_index, qplib_qp->ah.hop_limit, qplib_qp->ah.traffic_class); @@ -3248,9 +3303,9 @@ int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) IB_ACCESS_LOCAL_WRITE); if (IS_ERR(cq->resize_umem)) { rc = PTR_ERR(cq->resize_umem); + ibdev_err(&rdev->ibdev, "%s: ib_umem_get failed! rc = %pe\n", + __func__, cq->resize_umem); cq->resize_umem = NULL; - ibdev_err(&rdev->ibdev, "%s: ib_umem_get failed! rc = %d\n", - __func__, rc); goto fail; } cq->resize_cqe = entries; @@ -4392,6 +4447,93 @@ void bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx) } } +static int bnxt_re_setup_vnic(struct bnxt_re_dev *rdev, struct bnxt_re_qp *qp) +{ + int rc; + + rc = bnxt_re_hwrm_alloc_vnic(rdev); + if (rc) + return rc; + + rc = bnxt_re_hwrm_cfg_vnic(rdev, qp->qplib_qp.id); + if (rc) + goto out_free_vnic; + + return 0; +out_free_vnic: + bnxt_re_hwrm_free_vnic(rdev); + return rc; +} + +struct ib_flow *bnxt_re_create_flow(struct ib_qp *ib_qp, + struct ib_flow_attr *attr, + struct ib_udata *udata) +{ + struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + struct bnxt_re_dev *rdev = qp->rdev; + struct bnxt_re_flow *flow; + int rc; + + if (attr->type != IB_FLOW_ATTR_SNIFFER || + !rdev->rcfw.roce_mirror) + return ERR_PTR(-EOPNOTSUPP); + + mutex_lock(&rdev->qp_lock); + if (rdev->sniffer_flow_created) { + ibdev_err(&rdev->ibdev, "RoCE Mirroring is already Configured\n"); + mutex_unlock(&rdev->qp_lock); + return ERR_PTR(-EBUSY); + } + + flow = kzalloc(sizeof(*flow), GFP_KERNEL); + if (!flow) { + mutex_unlock(&rdev->qp_lock); + return ERR_PTR(-ENOMEM); + } + + flow->rdev = rdev; + + rc = bnxt_re_setup_vnic(rdev, qp); + if (rc) + goto out_free_flow; + + rc = bnxt_qplib_create_flow(&rdev->qplib_res); + if (rc) + goto out_free_vnic; + + rdev->sniffer_flow_created = 1; + mutex_unlock(&rdev->qp_lock); + + return &flow->ib_flow; + +out_free_vnic: + bnxt_re_hwrm_free_vnic(rdev); +out_free_flow: + mutex_unlock(&rdev->qp_lock); + kfree(flow); + return ERR_PTR(rc); +} + +int bnxt_re_destroy_flow(struct ib_flow *flow_id) +{ + struct bnxt_re_flow *flow = + container_of(flow_id, struct bnxt_re_flow, ib_flow); + struct bnxt_re_dev *rdev = flow->rdev; + int rc; + + mutex_lock(&rdev->qp_lock); + rc = bnxt_qplib_destroy_flow(&rdev->qplib_res); + if (rc) + ibdev_dbg(&rdev->ibdev, "failed to destroy_flow rc = %d\n", rc); + rdev->sniffer_flow_created = 0; + + bnxt_re_hwrm_free_vnic(rdev); + mutex_unlock(&rdev->qp_lock); + kfree(flow); + + return rc; +} + static struct bnxt_re_cq *bnxt_re_search_for_cq(struct bnxt_re_dev *rdev, u32 cq_id) { struct bnxt_re_cq *cq = NULL, *tmp_cq; @@ -4604,7 +4746,7 @@ static int UVERBS_HANDLER(BNXT_RE_METHOD_ALLOC_PAGE)(struct uverbs_attr_bundle * return err; err = uverbs_copy_to(attrs, BNXT_RE_ALLOC_PAGE_DPI, - &dpi, sizeof(length)); + &dpi, 
sizeof(dpi)); if (err) return err; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index fe00ab691a51..76ba9ab04d5c 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -164,6 +164,11 @@ struct bnxt_re_user_mmap_entry { u8 mmap_flag; }; +struct bnxt_re_flow { + struct ib_flow ib_flow; + struct bnxt_re_dev *rdev; +}; + static inline u16 bnxt_re_get_swqe_size(int nsge) { return sizeof(struct sq_send_hdr) + nsge * sizeof(struct sq_sge); @@ -267,6 +272,11 @@ struct ib_mr *bnxt_re_reg_user_mr_dmabuf(struct ib_pd *ib_pd, u64 start, struct uverbs_attr_bundle *attrs); int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata); void bnxt_re_dealloc_ucontext(struct ib_ucontext *context); +struct ib_flow *bnxt_re_create_flow(struct ib_qp *ib_qp, + struct ib_flow_attr *attr, + struct ib_udata *udata); +int bnxt_re_destroy_flow(struct ib_flow *flow_id); + int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); void bnxt_re_mmap_free(struct rdma_user_mmap_entry *rdma_entry); diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index df7cf8d68e27..b13810572c2e 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -80,6 +80,7 @@ MODULE_LICENSE("Dual BSD/GPL"); static DEFINE_MUTEX(bnxt_re_mutex); static int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev); +static int bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev); static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len, u32 *offset); @@ -188,6 +189,10 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev) rdev->qplib_res.is_vf = BNXT_EN_VF(en_dev); rdev->qplib_res.en_dev = en_dev; + rc = bnxt_re_query_hwrm_intf_version(rdev); + if (rc) + goto free_dev_attr; + bnxt_re_set_drv_mode(rdev); bnxt_re_set_db_offset(rdev); @@ -540,6 +545,72 @@ static void bnxt_re_fill_fw_msg(struct bnxt_fw_msg *fw_msg, void *msg, fw_msg->timeout = timeout; } +void bnxt_re_hwrm_free_vnic(struct bnxt_re_dev *rdev) +{ + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct hwrm_vnic_free_input req = {}; + struct bnxt_fw_msg fw_msg = {}; + int rc; + + bnxt_re_init_hwrm_hdr((void *)&req, HWRM_VNIC_FREE); + + req.vnic_id = cpu_to_le32(rdev->mirror_vnic_id); + bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), NULL, + 0, BNXT_RE_HWRM_CMD_TIMEOUT(rdev)); + rc = bnxt_send_msg(en_dev, &fw_msg); + if (rc) + ibdev_dbg(&rdev->ibdev, + "Failed to free vnic, rc = %d\n", rc); +} + +int bnxt_re_hwrm_alloc_vnic(struct bnxt_re_dev *rdev) +{ + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct hwrm_vnic_alloc_output resp = {}; + struct hwrm_vnic_alloc_input req = {}; + struct bnxt_fw_msg fw_msg = {}; + int rc; + + bnxt_re_init_hwrm_hdr((void *)&req, HWRM_VNIC_ALLOC); + + req.vnic_id = cpu_to_le16(rdev->mirror_vnic_id); + req.flags = cpu_to_le32(VNIC_ALLOC_REQ_FLAGS_VNIC_ID_VALID); + bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), BNXT_RE_HWRM_CMD_TIMEOUT(rdev)); + rc = bnxt_send_msg(en_dev, &fw_msg); + if (rc) + ibdev_dbg(&rdev->ibdev, + "Failed to alloc vnic, rc = %d\n", rc); + + return rc; +} + +int bnxt_re_hwrm_cfg_vnic(struct bnxt_re_dev *rdev, u32 qp_id) +{ + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct hwrm_vnic_cfg_input req = {}; + struct bnxt_fw_msg fw_msg = {}; + int rc; + + bnxt_re_init_hwrm_hdr((void *)&req, HWRM_VNIC_CFG); + + req.flags = cpu_to_le32(VNIC_CFG_REQ_FLAGS_ROCE_ONLY_VNIC_MODE); + 
req.enables = cpu_to_le32(VNIC_CFG_REQ_ENABLES_RAW_QP_ID | + VNIC_CFG_REQ_ENABLES_MRU); + req.vnic_id = cpu_to_le16(rdev->mirror_vnic_id); + req.raw_qp_id = cpu_to_le32(qp_id); + req.mru = cpu_to_le16(rdev->netdev->mtu + VLAN_ETH_HLEN); + + bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), NULL, + 0, BNXT_RE_HWRM_CMD_TIMEOUT(rdev)); + rc = bnxt_send_msg(en_dev, &fw_msg); + if (rc) + ibdev_dbg(&rdev->ibdev, + "Failed to cfg vnic, rc = %d\n", rc); + + return rc; +} + /* Query device config using common hwrm */ static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len, u32 *offset) @@ -553,11 +624,12 @@ static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len, bnxt_re_init_hwrm_hdr((void *)&req, HWRM_FUNC_QCFG); req.fid = cpu_to_le16(0xffff); bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, - sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + sizeof(resp), BNXT_RE_HWRM_CMD_TIMEOUT(rdev)); rc = bnxt_send_msg(en_dev, &fw_msg); if (!rc) { *db_len = PAGE_ALIGN(le16_to_cpu(resp.l2_doorbell_bar_size_kb) * 1024); *offset = PAGE_ALIGN(le16_to_cpu(resp.legacy_l2_db_size_kb) * 1024); + rdev->mirror_vnic_id = le16_to_cpu(resp.mirror_vnic_id); } return rc; } @@ -577,7 +649,7 @@ int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev) bnxt_re_init_hwrm_hdr((void *)&req, HWRM_FUNC_QCAPS); req.fid = cpu_to_le16(0xffff); bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, - sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + sizeof(resp), BNXT_RE_HWRM_CMD_TIMEOUT(rdev)); rc = bnxt_send_msg(en_dev, &fw_msg); if (rc) @@ -587,6 +659,8 @@ int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev) flags_ext2 = le32_to_cpu(resp.flags_ext2); cctx->modes.dbr_pacing = flags_ext2 & FUNC_QCAPS_RESP_FLAGS_EXT2_DBR_PACING_EXT_SUPPORTED || flags_ext2 & FUNC_QCAPS_RESP_FLAGS_EXT2_DBR_PACING_V0_SUPPORTED; + cctx->modes.roce_mirror = !!(le32_to_cpu(resp.flags_ext3) & + FUNC_QCAPS_RESP_FLAGS_EXT3_MIRROR_ON_ROCE_SUPPORTED); return 0; } @@ -603,7 +677,7 @@ static int bnxt_re_hwrm_dbr_pacing_qcfg(struct bnxt_re_dev *rdev) cctx = rdev->chip_ctx; bnxt_re_init_hwrm_hdr((void *)&req, HWRM_FUNC_DBR_PACING_QCFG); bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, - sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + sizeof(resp), BNXT_RE_HWRM_CMD_TIMEOUT(rdev)); rc = bnxt_send_msg(en_dev, &fw_msg); if (rc) return rc; @@ -842,20 +916,12 @@ static void bnxt_re_deinitialize_dbr_pacing(struct bnxt_re_dev *rdev) static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id, int type) { - struct bnxt_en_dev *en_dev; + struct bnxt_en_dev *en_dev = rdev->en_dev; struct hwrm_ring_free_input req = {}; struct hwrm_ring_free_output resp; struct bnxt_fw_msg fw_msg = {}; int rc = -EINVAL; - if (!rdev) - return rc; - - en_dev = rdev->en_dev; - - if (!en_dev) - return rc; - if (test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags)) return 0; @@ -863,7 +929,7 @@ static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, req.ring_type = type; req.ring_id = cpu_to_le16(fw_ring_id); bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, - sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + sizeof(resp), BNXT_RE_HWRM_CMD_TIMEOUT(rdev)); rc = bnxt_send_msg(en_dev, &fw_msg); if (rc) ibdev_err(&rdev->ibdev, "Failed to free HW ring:%d :%#x", @@ -881,9 +947,6 @@ static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev, struct bnxt_fw_msg fw_msg = {}; int rc = -EINVAL; - if (!en_dev) - return rc; - bnxt_re_init_hwrm_hdr((void *)&req, HWRM_RING_ALLOC); req.enables = 0; req.page_tbl_addr = 
cpu_to_le64(ring_attr->dma_arr[0]); @@ -899,7 +962,7 @@ static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev, req.ring_type = ring_attr->type; req.int_mode = ring_attr->mode; bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, - sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + sizeof(resp), BNXT_RE_HWRM_CMD_TIMEOUT(rdev)); rc = bnxt_send_msg(en_dev, &fw_msg); if (!rc) *fw_ring_id = le16_to_cpu(resp.ring_id); @@ -916,16 +979,13 @@ static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev, struct bnxt_fw_msg fw_msg = {}; int rc = -EINVAL; - if (!en_dev) - return rc; - if (test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags)) return 0; bnxt_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_FREE); req.stat_ctx_id = cpu_to_le32(fw_stats_ctx_id); bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, - sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + sizeof(resp), BNXT_RE_HWRM_CMD_TIMEOUT(rdev)); rc = bnxt_send_msg(en_dev, &fw_msg); if (rc) ibdev_err(&rdev->ibdev, "Failed to free HW stats context %#x", @@ -935,8 +995,7 @@ static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev, } static int bnxt_re_net_stats_ctx_alloc(struct bnxt_re_dev *rdev, - dma_addr_t dma_map, - u32 *fw_stats_ctx_id) + struct bnxt_qplib_stats *stats) { struct bnxt_qplib_chip_ctx *chip_ctx = rdev->chip_ctx; struct hwrm_stat_ctx_alloc_output resp = {}; @@ -945,21 +1004,18 @@ static int bnxt_re_net_stats_ctx_alloc(struct bnxt_re_dev *rdev, struct bnxt_fw_msg fw_msg = {}; int rc = -EINVAL; - *fw_stats_ctx_id = INVALID_STATS_CTX_ID; - - if (!en_dev) - return rc; + stats->fw_id = INVALID_STATS_CTX_ID; bnxt_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_ALLOC); req.update_period_ms = cpu_to_le32(1000); - req.stats_dma_addr = cpu_to_le64(dma_map); + req.stats_dma_addr = cpu_to_le64(stats->dma_map); req.stats_dma_length = cpu_to_le16(chip_ctx->hw_stats_size); req.stat_ctx_flags = STAT_CTX_ALLOC_REQ_STAT_CTX_FLAGS_ROCE; bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, - sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + sizeof(resp), BNXT_RE_HWRM_CMD_TIMEOUT(rdev)); rc = bnxt_send_msg(en_dev, &fw_msg); if (!rc) - *fw_stats_ctx_id = le32_to_cpu(resp.stat_ctx_id); + stats->fw_id = le32_to_cpu(resp.stat_ctx_id); return rc; } @@ -975,7 +1031,7 @@ static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, struct bnxt_re_dev *rdev = rdma_device_to_drv_device(device, struct bnxt_re_dev, ibdev); - return sysfs_emit(buf, "0x%x\n", rdev->en_dev->pdev->vendor); + return sysfs_emit(buf, "0x%x\n", rdev->en_dev->pdev->revision); } static DEVICE_ATTR_RO(hw_rev); @@ -985,13 +1041,31 @@ static ssize_t hca_type_show(struct device *device, struct bnxt_re_dev *rdev = rdma_device_to_drv_device(device, struct bnxt_re_dev, ibdev); - return sysfs_emit(buf, "%s\n", rdev->ibdev.node_desc); + return sysfs_emit(buf, "0x%x\n", rdev->en_dev->pdev->device); } static DEVICE_ATTR_RO(hca_type); +static ssize_t board_id_show(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct bnxt_re_dev *rdev = rdma_device_to_drv_device(device, + struct bnxt_re_dev, ibdev); + char buffer[BNXT_VPD_FLD_LEN] = {}; + + if (!rdev->is_virtfn) + memcpy(buffer, rdev->board_partno, BNXT_VPD_FLD_LEN - 1); + else + scnprintf(buffer, BNXT_VPD_FLD_LEN, "0x%x-VF", + rdev->en_dev->pdev->device); + + return sysfs_emit(buf, "%s\n", buffer); +} +static DEVICE_ATTR_RO(board_id); + static struct attribute *bnxt_re_attributes[] = { &dev_attr_hw_rev.attr, &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, NULL }; 
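Aside (not part of the diff above or below): the new bnxt_re_create_flow()/bnxt_re_destroy_flow() handlers and the HWRM VNIC helpers shown earlier serve the standard verbs "sniffer" flow path. A minimal userspace sketch of how a capture tool would reach that path through stock libibverbs is shown here, purely as an assumption-laden illustration: device index 0, the queue sizes, and the omission of QP state transitions (INIT/RTR) and most error handling are arbitrary choices to keep the sketch short; build with -libverbs and CAP_NET_RAW.

/*
 * Illustrative sketch only -- not part of the patch. Attaches an
 * IBV_FLOW_ATTR_SNIFFER rule to a RAW_PACKET QP, which is what ends up
 * in the driver's create_flow handler (bnxt_re_create_flow() above).
 */
#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
	struct ibv_device **devs = ibv_get_device_list(NULL);
	struct ibv_context *ctx;
	struct ibv_pd *pd;
	struct ibv_cq *cq;
	struct ibv_qp *qp;
	struct ibv_flow *flow;
	struct ibv_flow_attr attr = {
		.type = IBV_FLOW_ATTR_SNIFFER,	/* mirror traffic to this QP */
		.size = sizeof(attr),
		.port = 1,
	};
	struct ibv_qp_init_attr qp_attr = {
		.qp_type = IBV_QPT_RAW_PACKET,	/* RawEth QP; needs CAP_NET_RAW */
		.cap = { .max_send_wr = 1, .max_recv_wr = 64,
			 .max_send_sge = 1, .max_recv_sge = 1 },
	};

	if (!devs || !devs[0])
		return 1;
	/* Error checks on the next four calls trimmed for brevity. */
	ctx = ibv_open_device(devs[0]);
	pd = ibv_alloc_pd(ctx);
	cq = ibv_create_cq(ctx, 64, NULL, NULL, 0);
	qp_attr.send_cq = qp_attr.recv_cq = cq;
	qp = ibv_create_qp(pd, &qp_attr);

	/* Kernel side: allocate/configure the mirror VNIC and issue
	 * CMDQ_BASE_OPCODE_ROCE_MIRROR_CFG, per the hunks above. */
	flow = ibv_create_flow(qp, &attr);
	if (!flow) {
		perror("ibv_create_flow");
		return 1;
	}

	/* ... post receives on qp and poll cq to see mirrored packets ... */

	ibv_destroy_flow(flow);		/* tears the mirror back down */
	ibv_destroy_qp(qp);
	ibv_destroy_cq(cq);
	ibv_dealloc_pd(pd);
	ibv_close_device(ctx);
	ibv_free_device_list(devs);
	return 0;
}

Whether the rule is accepted depends on the driver advertising the capability (here, rcfw.roce_mirror); a second attach attempt returns EBUSY, matching the single-sniffer check in bnxt_re_create_flow().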
@@ -1207,6 +1281,8 @@ static int bnxt_re_fill_res_srq_entry(struct sk_buff *msg, struct ib_srq *ib_srq goto err; if (rdma_nl_put_driver_u32_hex(msg, "max_sge", srq->qplib_srq.max_sge)) goto err; + if (rdma_nl_put_driver_u32_hex(msg, "srq_limit", srq->qplib_srq.threshold)) + goto err; nla_nest_end(msg, table_attr); return 0; @@ -1297,6 +1373,8 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .reg_user_mr_dmabuf = bnxt_re_reg_user_mr_dmabuf, .req_notify_cq = bnxt_re_req_notify_cq, .resize_cq = bnxt_re_resize_cq, + .create_flow = bnxt_re_create_flow, + .destroy_flow = bnxt_re_destroy_flow, INIT_RDMA_OBJ_SIZE(ib_ah, bnxt_re_ah, ib_ah), INIT_RDMA_OBJ_SIZE(ib_cq, bnxt_re_cq, ib_cq), INIT_RDMA_OBJ_SIZE(ib_pd, bnxt_re_pd, ib_pd), @@ -1323,8 +1401,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) /* ib device init */ ibdev->node_type = RDMA_NODE_IB_CA; - strscpy(ibdev->node_desc, BNXT_RE_DESC " HCA", - strlen(BNXT_RE_DESC) + 5); + strscpy(ibdev->node_desc, BNXT_RE_DESC " HCA"); ibdev->phys_port_cnt = 1; addrconf_addr_eui48((u8 *)&ibdev->node_guid, rdev->netdev->dev_addr); @@ -1850,81 +1927,6 @@ static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) mutex_unlock(&rdev->qp_lock); } -static int bnxt_re_update_gid(struct bnxt_re_dev *rdev) -{ - struct bnxt_qplib_sgid_tbl *sgid_tbl = &rdev->qplib_res.sgid_tbl; - struct bnxt_qplib_gid gid; - u16 gid_idx, index; - int rc = 0; - - if (!ib_device_try_get(&rdev->ibdev)) - return 0; - - for (index = 0; index < sgid_tbl->active; index++) { - gid_idx = sgid_tbl->hw_id[index]; - - if (!memcmp(&sgid_tbl->tbl[index], &bnxt_qplib_gid_zero, - sizeof(bnxt_qplib_gid_zero))) - continue; - /* need to modify the VLAN enable setting of non VLAN GID only - * as setting is done for VLAN GID while adding GID - */ - if (sgid_tbl->vlan[index]) - continue; - - memcpy(&gid, &sgid_tbl->tbl[index], sizeof(gid)); - - rc = bnxt_qplib_update_sgid(sgid_tbl, &gid, gid_idx, - rdev->qplib_res.netdev->dev_addr); - } - - ib_device_put(&rdev->ibdev); - return rc; -} - -static u32 bnxt_re_get_priority_mask(struct bnxt_re_dev *rdev) -{ - u32 prio_map = 0, tmp_map = 0; - struct net_device *netdev; - struct dcb_app app = {}; - - netdev = rdev->netdev; - - app.selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE; - app.protocol = ETH_P_IBOE; - tmp_map = dcb_ieee_getapp_mask(netdev, &app); - prio_map = tmp_map; - - app.selector = IEEE_8021QAZ_APP_SEL_DGRAM; - app.protocol = ROCE_V2_UDP_DPORT; - tmp_map = dcb_ieee_getapp_mask(netdev, &app); - prio_map |= tmp_map; - - return prio_map; -} - -static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev) -{ - u8 prio_map = 0; - - /* Get priority for roce */ - prio_map = bnxt_re_get_priority_mask(rdev); - - if (prio_map == rdev->cur_prio_map) - return 0; - rdev->cur_prio_map = prio_map; - /* Actual priorities are not programmed as they are already - * done by L2 driver; just enable or disable priority vlan tagging - */ - if ((prio_map == 0 && rdev->qplib_res.prio) || - (prio_map != 0 && !rdev->qplib_res.prio)) { - rdev->qplib_res.prio = prio_map; - bnxt_re_update_gid(rdev); - } - - return 0; -} - static void bnxt_re_net_unregister_async_event(struct bnxt_re_dev *rdev) { if (rdev->is_virtfn) @@ -1945,7 +1947,31 @@ static void bnxt_re_net_register_async_event(struct bnxt_re_dev *rdev) ASYNC_EVENT_CMPL_EVENT_ID_DCB_CONFIG_CHANGE); } -static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev) +static void bnxt_re_read_vpd_info(struct bnxt_re_dev *rdev) +{ + struct pci_dev *pdev = rdev->en_dev->pdev; + unsigned int vpd_size, kw_len; + int pos, 
size; + u8 *vpd_data; + + vpd_data = pci_vpd_alloc(pdev, &vpd_size); + if (IS_ERR(vpd_data)) { + pci_warn(pdev, "Unable to read VPD, err=%pe\n", vpd_data); + return; + } + + pos = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, + PCI_VPD_RO_KEYWORD_PARTNO, &kw_len); + if (pos < 0) + goto free; + + size = min_t(int, kw_len, BNXT_VPD_FLD_LEN - 1); + memcpy(rdev->board_partno, &vpd_data[pos], size); +free: + kfree(vpd_data); +} + +static int bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev) { struct bnxt_en_dev *en_dev = rdev->en_dev; struct hwrm_ver_get_output resp = {}; @@ -1964,7 +1990,7 @@ static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev) if (rc) { ibdev_err(&rdev->ibdev, "Failed to query HW version, rc = 0x%x", rc); - return; + return rc; } cctx = rdev->chip_ctx; @@ -1978,6 +2004,8 @@ static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev) if (!cctx->hwrm_cmd_max_timeout) cctx->hwrm_cmd_max_timeout = RCFW_FW_STALL_MAX_TIMEOUT; + + return 0; } static int bnxt_re_ib_init(struct bnxt_re_dev *rdev) @@ -2039,6 +2067,72 @@ static void bnxt_re_free_gid_ctx(struct bnxt_re_dev *rdev) } } +static int bnxt_re_get_stats_ctx(struct bnxt_re_dev *rdev) +{ + struct bnxt_qplib_ctx *hctx = &rdev->qplib_ctx; + struct bnxt_qplib_res *res = &rdev->qplib_res; + int rc; + + rc = bnxt_qplib_alloc_stats_ctx(res->pdev, res->cctx, &hctx->stats); + if (rc) + return rc; + + rc = bnxt_re_net_stats_ctx_alloc(rdev, &hctx->stats); + if (rc) + goto free_stat_mem; + + return 0; +free_stat_mem: + bnxt_qplib_free_stats_ctx(res->pdev, &hctx->stats); + + return rc; +} + +static int bnxt_re_get_stats3_ctx(struct bnxt_re_dev *rdev) +{ + struct bnxt_qplib_ctx *hctx = &rdev->qplib_ctx; + struct bnxt_qplib_res *res = &rdev->qplib_res; + int rc; + + if (!rdev->rcfw.roce_mirror) + return 0; + + rc = bnxt_qplib_alloc_stats_ctx(res->pdev, res->cctx, &hctx->stats3); + if (rc) + return rc; + + rc = bnxt_re_net_stats_ctx_alloc(rdev, &hctx->stats3); + if (rc) + goto free_stat_mem; + + return 0; +free_stat_mem: + bnxt_qplib_free_stats_ctx(res->pdev, &hctx->stats3); + + return rc; +} + +static void bnxt_re_put_stats3_ctx(struct bnxt_re_dev *rdev) +{ + struct bnxt_qplib_ctx *hctx = &rdev->qplib_ctx; + struct bnxt_qplib_res *res = &rdev->qplib_res; + + if (!rdev->rcfw.roce_mirror) + return; + + bnxt_re_net_stats_ctx_free(rdev, hctx->stats3.fw_id); + bnxt_qplib_free_stats_ctx(res->pdev, &hctx->stats3); +} + +static void bnxt_re_put_stats_ctx(struct bnxt_re_dev *rdev) +{ + struct bnxt_qplib_ctx *hctx = &rdev->qplib_ctx; + struct bnxt_qplib_res *res = &rdev->qplib_res; + + bnxt_re_net_stats_ctx_free(rdev, hctx->stats.fw_id); + bnxt_qplib_free_stats_ctx(res->pdev, &hctx->stats); +} + static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type) { u8 type; @@ -2049,8 +2143,7 @@ static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type) bnxt_re_net_unregister_async_event(rdev); bnxt_re_uninit_dcb_wq(rdev); - if (test_and_clear_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags)) - cancel_delayed_work_sync(&rdev->worker); + bnxt_re_put_stats3_ctx(rdev); bnxt_re_free_gid_ctx(rdev); if (test_and_clear_bit(BNXT_RE_FLAG_RESOURCES_INITIALIZED, @@ -2064,8 +2157,8 @@ static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type) if (rc) ibdev_warn(&rdev->ibdev, "Failed to deinitialize RCFW: %#x", rc); - bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id); - bnxt_qplib_free_ctx(&rdev->qplib_res, &rdev->qplib_ctx); + bnxt_re_put_stats_ctx(rdev); + bnxt_qplib_free_hwctx(&rdev->qplib_res, 
&rdev->qplib_ctx); bnxt_qplib_disable_rcfw_channel(&rdev->rcfw); type = bnxt_qplib_get_ring_type(rdev->chip_ctx); bnxt_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, type); @@ -2085,16 +2178,6 @@ static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type) } } -/* worker thread for polling periodic events. Now used for QoS programming*/ -static void bnxt_re_worker(struct work_struct *work) -{ - struct bnxt_re_dev *rdev = container_of(work, struct bnxt_re_dev, - worker.work); - - bnxt_re_setup_qos(rdev); - schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); -} - static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) { struct bnxt_re_ring_attr rattr = {}; @@ -2109,8 +2192,9 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) rc = bnxt_re_register_netdev(rdev); if (rc) { ibdev_err(&rdev->ibdev, - "Failed to register with netedev: %#x\n", rc); - return -EINVAL; + "Failed to register with Ethernet driver, rc %d\n", + rc); + return rc; } } set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); @@ -2148,8 +2232,6 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) /* Check whether VF or PF */ bnxt_re_get_sriov_func_type(rdev); - bnxt_re_query_hwrm_intf_version(rdev); - /* Establish RCFW Communication Channel to initialize the context * memory for the function and all child VFs */ @@ -2199,18 +2281,20 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) if (rc) goto disable_rcfw; + bnxt_qplib_query_version(&rdev->rcfw); bnxt_re_set_resource_limits(rdev); - rc = bnxt_qplib_alloc_ctx(&rdev->qplib_res, &rdev->qplib_ctx, 0, - bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)); - if (rc) { - ibdev_err(&rdev->ibdev, - "Failed to allocate QPLIB context: %#x\n", rc); - goto disable_rcfw; + if (!rdev->is_virtfn && + !bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) { + rc = bnxt_qplib_alloc_hwctx(&rdev->qplib_res, &rdev->qplib_ctx); + if (rc) { + ibdev_err(&rdev->ibdev, + "Failed to allocate hw context: %#x\n", rc); + goto disable_rcfw; + } } - rc = bnxt_re_net_stats_ctx_alloc(rdev, - rdev->qplib_ctx.stats.dma_map, - &rdev->qplib_ctx.stats.fw_id); + + rc = bnxt_re_get_stats_ctx(rdev); if (rc) { ibdev_err(&rdev->ibdev, "Failed to allocate stats context: %#x\n", rc); @@ -2249,15 +2333,6 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) if (rc) ibdev_warn(&rdev->ibdev, "Failed to query CC defaults\n"); - rc = bnxt_re_setup_qos(rdev); - if (rc) - ibdev_info(&rdev->ibdev, - "RoCE priority not yet configured\n"); - - INIT_DELAYED_WORK(&rdev->worker, bnxt_re_worker); - set_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags); - schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); - if (!(rdev->qplib_res.en_dev->flags & BNXT_EN_FLAG_ROCE_VF_RES_MGMT)) bnxt_re_vf_res_config(rdev); } @@ -2270,11 +2345,18 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) bnxt_re_init_dcb_wq(rdev); bnxt_re_net_register_async_event(rdev); + if (!rdev->is_virtfn) + bnxt_re_read_vpd_info(rdev); + + rc = bnxt_re_get_stats3_ctx(rdev); + if (rc) + goto fail; + return 0; free_sctx: bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id); free_ctx: - bnxt_qplib_free_ctx(&rdev->qplib_res, &rdev->qplib_ctx); + bnxt_qplib_free_hwctx(&rdev->qplib_res, &rdev->qplib_ctx); disable_rcfw: bnxt_qplib_disable_rcfw_channel(&rdev->rcfw); free_ring: diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index ee36b3d82cc0..ce90d3d834d4 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ 
b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1307,6 +1307,7 @@ static bool is_optimized_state_transition(struct bnxt_qplib_qp *qp) int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) { + struct bnxt_qplib_sgid_tbl *sgid_tbl = &res->sgid_tbl; struct bnxt_qplib_rcfw *rcfw = res->rcfw; struct creq_modify_qp_resp resp = {}; struct bnxt_qplib_cmdqmsg msg = {}; @@ -1358,9 +1359,14 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_FLOW_LABEL) req.flow_label = cpu_to_le32(qp->ah.flow_label); - if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_SGID_INDEX) - req.sgid_index = cpu_to_le16(res->sgid_tbl.hw_id - [qp->ah.sgid_index]); + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_SGID_INDEX) { + if (qp->type == CMDQ_CREATE_QP_TYPE_RAW_ETHERTYPE) + req.sgid_index = + cpu_to_le16(sgid_tbl->hw_id[qp->ugid_index]); + else + req.sgid_index = + cpu_to_le16(sgid_tbl->hw_id[qp->ah.sgid_index]); + } if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_HOP_LIMIT) req.hop_limit = qp->ah.hop_limit; @@ -1464,6 +1470,7 @@ int bnxt_qplib_query_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) qp->access = sb->access; qp->pkey_index = le16_to_cpu(sb->pkey); qp->qkey = le32_to_cpu(sb->qkey); + qp->udp_sport = le16_to_cpu(sb->udp_src_port); temp32[0] = le32_to_cpu(sb->dgid[0]); temp32[1] = le32_to_cpu(sb->dgid[1]); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index 4921a214c34c..b990d0c0ce1a 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -299,6 +299,7 @@ struct bnxt_qplib_qp { u8 smac[6]; u16 vlan_id; u16 port_id; + u16 udp_sport; u8 nw_type; struct bnxt_qplib_ah ah; @@ -344,6 +345,7 @@ struct bnxt_qplib_qp { u32 msn_tbl_sz; bool is_host_msn_tbl; u8 tos_dscp; + u32 ugid_index; }; #define BNXT_RE_MAX_MSG_SIZE 0x80000000 diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 804bc773b4ef..295a9610f3e6 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -186,7 +186,7 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) * wait for command completion. Maximum holding interval is 8 second. * * Returns: - * -ETIMEOUT if command is not completed in specific time interval. + * -ETIMEDOUT if command is not completed in specific time interval. * 0 if command is completed by firmware. */ static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) @@ -366,6 +366,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, wmb(); writel(cmdq_prod, cmdq->cmdq_mbox.prod); writel(RCFW_CMDQ_TRIG_VAL, cmdq->cmdq_mbox.db); + print_hex_dump_bytes("req: ", DUMP_PREFIX_OFFSET, msg->req, msg->req_sz); spin_unlock_bh(&hwq->lock); /* Return the CREQ response pointer */ return 0; @@ -381,7 +382,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, * This function can not be called from non-sleepable context. * * Returns: - * -ETIMEOUT if command is not completed in specific time interval. + * -ETIMEDOUT if command is not completed in specific time interval. * 0 if command is completed by firmware. 
*/ static int __poll_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) @@ -631,6 +632,7 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, int rc = 0; pdev = rcfw->pdev; + print_hex_dump_bytes("event: ", DUMP_PREFIX_OFFSET, qp_event, sizeof(*qp_event)); switch (qp_event->event) { case CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION: err_event = (struct creq_qp_error_notification *)qp_event; @@ -903,6 +905,10 @@ skip_ctx_setup: flags |= CMDQ_INITIALIZE_FW_FLAGS_OPTIMIZE_MODIFY_QP_SUPPORTED; if (rcfw->res->en_dev->flags & BNXT_EN_FLAG_ROCE_VF_RES_MGMT) flags |= CMDQ_INITIALIZE_FW_FLAGS_L2_VF_RESOURCE_MGMT; + if (bnxt_qplib_roce_mirror_supported(rcfw->res->cctx)) { + flags |= CMDQ_INITIALIZE_FW_FLAGS_MIRROR_ON_ROCE_SUPPORTED; + rcfw->roce_mirror = true; + } req.flags |= cpu_to_le16(flags); req.stat_ctx_id = cpu_to_le32(ctx->stats.fw_id); bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index ff873c5f1b25..988c89b4232e 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -236,6 +236,7 @@ struct bnxt_qplib_rcfw { atomic_t timeout_send; /* cached from chip cctx for quick reference in slow path */ u16 max_timeout; + bool roce_mirror; }; struct bnxt_qplib_cmdqmsg { diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index cc5c82d96839..875d7b52c06a 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -53,12 +53,6 @@ #include "qplib_sp.h" #include "qplib_rcfw.h" -static void bnxt_qplib_free_stats_ctx(struct pci_dev *pdev, - struct bnxt_qplib_stats *stats); -static int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev, - struct bnxt_qplib_chip_ctx *cctx, - struct bnxt_qplib_stats *stats); - /* PBL */ static void __free_pbl(struct bnxt_qplib_res *res, struct bnxt_qplib_pbl *pbl, bool is_umem) @@ -352,8 +346,8 @@ fail: } /* Context Tables */ -void bnxt_qplib_free_ctx(struct bnxt_qplib_res *res, - struct bnxt_qplib_ctx *ctx) +void bnxt_qplib_free_hwctx(struct bnxt_qplib_res *res, + struct bnxt_qplib_ctx *ctx) { int i; @@ -367,7 +361,6 @@ void bnxt_qplib_free_ctx(struct bnxt_qplib_res *res, /* restore original pde level before destroy */ ctx->tqm_ctx.pde.level = ctx->tqm_ctx.pde_level; bnxt_qplib_free_hwq(res, &ctx->tqm_ctx.pde); - bnxt_qplib_free_stats_ctx(res->pdev, &ctx->stats); } static int bnxt_qplib_alloc_tqm_rings(struct bnxt_qplib_res *res, @@ -466,7 +459,7 @@ fail: } /* - * Routine: bnxt_qplib_alloc_ctx + * Routine: bnxt_qplib_alloc_hwctx * Description: * Context tables are memories which are used by the chip fw. 
* The 6 tables defined are: @@ -486,17 +479,13 @@ fail: * Returns: * 0 if success, else -ERRORS */ -int bnxt_qplib_alloc_ctx(struct bnxt_qplib_res *res, - struct bnxt_qplib_ctx *ctx, - bool virt_fn, bool is_p5) +int bnxt_qplib_alloc_hwctx(struct bnxt_qplib_res *res, + struct bnxt_qplib_ctx *ctx) { struct bnxt_qplib_hwq_attr hwq_attr = {}; struct bnxt_qplib_sg_info sginfo = {}; int rc; - if (virt_fn || is_p5) - goto stats_alloc; - /* QPC Tables */ sginfo.pgsize = PAGE_SIZE; sginfo.pgshft = PAGE_SHIFT; @@ -542,16 +531,11 @@ int bnxt_qplib_alloc_ctx(struct bnxt_qplib_res *res, rc = bnxt_qplib_alloc_init_hwq(&ctx->tim_tbl, &hwq_attr); if (rc) goto fail; -stats_alloc: - /* Stats */ - rc = bnxt_qplib_alloc_stats_ctx(res->pdev, res->cctx, &ctx->stats); - if (rc) - goto fail; return 0; fail: - bnxt_qplib_free_ctx(res, ctx); + bnxt_qplib_free_hwctx(res, ctx); return rc; } @@ -832,8 +816,8 @@ static int bnxt_qplib_alloc_dpi_tbl(struct bnxt_qplib_res *res, } /* Stats */ -static void bnxt_qplib_free_stats_ctx(struct pci_dev *pdev, - struct bnxt_qplib_stats *stats) +void bnxt_qplib_free_stats_ctx(struct pci_dev *pdev, + struct bnxt_qplib_stats *stats) { if (stats->dma) { dma_free_coherent(&pdev->dev, stats->size, @@ -843,9 +827,9 @@ static void bnxt_qplib_free_stats_ctx(struct pci_dev *pdev, stats->fw_id = -1; } -static int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev, - struct bnxt_qplib_chip_ctx *cctx, - struct bnxt_qplib_stats *stats) +int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev, + struct bnxt_qplib_chip_ctx *cctx, + struct bnxt_qplib_stats *stats) { memset(stats, 0, sizeof(*stats)); stats->fw_id = -1; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 6a13927674b4..2ea3b7f232a3 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -65,6 +65,7 @@ struct bnxt_qplib_drv_modes { bool db_push; bool dbr_pacing; u32 toggle_bits; + u8 roce_mirror; }; enum bnxt_re_toggle_modes { @@ -303,6 +304,7 @@ struct bnxt_qplib_ctx { struct bnxt_qplib_hwq tim_tbl; struct bnxt_qplib_tqm_ctx tqm_ctx; struct bnxt_qplib_stats stats; + struct bnxt_qplib_stats stats3; struct bnxt_qplib_vf_res vf_res; }; @@ -432,15 +434,19 @@ void bnxt_qplib_cleanup_res(struct bnxt_qplib_res *res); int bnxt_qplib_init_res(struct bnxt_qplib_res *res); void bnxt_qplib_free_res(struct bnxt_qplib_res *res); int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct net_device *netdev); -void bnxt_qplib_free_ctx(struct bnxt_qplib_res *res, - struct bnxt_qplib_ctx *ctx); -int bnxt_qplib_alloc_ctx(struct bnxt_qplib_res *res, - struct bnxt_qplib_ctx *ctx, - bool virt_fn, bool is_p5); +void bnxt_qplib_free_hwctx(struct bnxt_qplib_res *res, + struct bnxt_qplib_ctx *ctx); +int bnxt_qplib_alloc_hwctx(struct bnxt_qplib_res *res, + struct bnxt_qplib_ctx *ctx); int bnxt_qplib_map_db_bar(struct bnxt_qplib_res *res); void bnxt_qplib_unmap_db_bar(struct bnxt_qplib_res *res); int bnxt_qplib_determine_atomics(struct pci_dev *dev); +int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev, + struct bnxt_qplib_chip_ctx *cctx, + struct bnxt_qplib_stats *stats); +void bnxt_qplib_free_stats_ctx(struct pci_dev *pdev, + struct bnxt_qplib_stats *stats); static inline void bnxt_qplib_hwq_incr_prod(struct bnxt_qplib_db_info *dbinfo, struct bnxt_qplib_hwq *hwq, u32 cnt) @@ -582,6 +588,11 @@ static inline u8 bnxt_qplib_dbr_pacing_en(struct bnxt_qplib_chip_ctx *cctx) return cctx->modes.dbr_pacing; } +static inline u8 bnxt_qplib_roce_mirror_supported(struct 
bnxt_qplib_chip_ctx *cctx) +{ + return cctx->modes.roce_mirror; +} + static inline bool _is_alloc_mr_unified(u16 dev_cap_flags) { return dev_cap_flags & CREQ_QUERY_FUNC_RESP_SB_MR_REGISTER_ALLOC; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 68981399598d..9ef581ed785c 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -66,14 +66,15 @@ static bool bnxt_qplib_is_atomic_cap(struct bnxt_qplib_rcfw *rcfw) return (pcie_ctl2 & PCI_EXP_DEVCTL2_ATOMIC_REQ); } -static void bnxt_qplib_query_version(struct bnxt_qplib_rcfw *rcfw, - char *fw_ver) +void bnxt_qplib_query_version(struct bnxt_qplib_rcfw *rcfw) { struct creq_query_version_resp resp = {}; struct bnxt_qplib_cmdqmsg msg = {}; struct cmdq_query_version req = {}; + struct bnxt_qplib_dev_attr *attr; int rc; + attr = rcfw->res->dattr; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_QUERY_VERSION, sizeof(req)); @@ -82,10 +83,10 @@ static void bnxt_qplib_query_version(struct bnxt_qplib_rcfw *rcfw, rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); if (rc) return; - fw_ver[0] = resp.fw_maj; - fw_ver[1] = resp.fw_minor; - fw_ver[2] = resp.fw_bld; - fw_ver[3] = resp.fw_rsvd; + attr->fw_ver[0] = resp.fw_maj; + attr->fw_ver[1] = resp.fw_minor; + attr->fw_ver[2] = resp.fw_bld; + attr->fw_ver[3] = resp.fw_rsvd; } int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw) @@ -179,8 +180,6 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw) if (_is_max_srq_ext_supported(attr->dev_cap_flags2)) attr->max_srq += le16_to_cpu(sb->max_srq_ext); - bnxt_qplib_query_version(rcfw, attr->fw_ver); - for (i = 0; i < MAX_TQM_ALLOC_REQ / 4; i++) { temp = le32_to_cpu(sb->tqm_alloc_reqs[i]); tqm_alloc = (u8 *)&temp; @@ -309,7 +308,8 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, struct bnxt_qplib_gid *gid, const u8 *smac, - u16 vlan_id, bool update, u32 *index) + u16 vlan_id, bool update, u32 *index, + bool is_ugid, u32 stats_ctx_id) { struct bnxt_qplib_res *res = to_bnxt_qplib(sgid_tbl, struct bnxt_qplib_res, @@ -374,6 +374,9 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, req.src_mac[1] = cpu_to_be16(((u16 *)smac)[1]); req.src_mac[2] = cpu_to_be16(((u16 *)smac)[2]); + req.stats_ctx = cpu_to_le16(CMDQ_ADD_GID_STATS_CTX_STATS_CTX_VALID | + (u16)stats_ctx_id); + bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0); rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); @@ -397,46 +400,6 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, return 0; } -int bnxt_qplib_update_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, - struct bnxt_qplib_gid *gid, u16 gid_idx, - const u8 *smac) -{ - struct bnxt_qplib_res *res = to_bnxt_qplib(sgid_tbl, - struct bnxt_qplib_res, - sgid_tbl); - struct bnxt_qplib_rcfw *rcfw = res->rcfw; - struct creq_modify_gid_resp resp = {}; - struct bnxt_qplib_cmdqmsg msg = {}; - struct cmdq_modify_gid req = {}; - int rc; - - bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, - CMDQ_BASE_OPCODE_MODIFY_GID, - sizeof(req)); - - req.gid[0] = cpu_to_be32(((u32 *)gid->data)[3]); - req.gid[1] = cpu_to_be32(((u32 *)gid->data)[2]); - req.gid[2] = cpu_to_be32(((u32 *)gid->data)[1]); - req.gid[3] = cpu_to_be32(((u32 *)gid->data)[0]); - if (res->prio) { - req.vlan |= cpu_to_le16 - (CMDQ_ADD_GID_VLAN_TPID_TPID_8100 | - CMDQ_ADD_GID_VLAN_VLAN_EN); - } - - /* MAC in network format */ - req.src_mac[0] = cpu_to_be16(((u16 
*)smac)[0]); - req.src_mac[1] = cpu_to_be16(((u16 *)smac)[1]); - req.src_mac[2] = cpu_to_be16(((u16 *)smac)[2]); - - req.gid_index = cpu_to_le16(gid_idx); - - bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), - sizeof(resp), 0); - rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); - return rc; -} - /* AH */ int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah, bool block) @@ -1143,3 +1106,40 @@ out: dma_free_coherent(&rcfw->pdev->dev, sbuf.size, sbuf.sb, sbuf.dma_addr); return rc; } + +int bnxt_qplib_create_flow(struct bnxt_qplib_res *res) +{ + struct creq_roce_mirror_cfg_resp resp = {}; + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_roce_mirror_cfg req = {}; + struct bnxt_qplib_cmdqmsg msg = {}; + + bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, + CMDQ_BASE_OPCODE_ROCE_MIRROR_CFG, + sizeof(req)); + + req.mirror_flags = (u8)CMDQ_ROCE_MIRROR_CFG_MIRROR_ENABLE; + + bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), + sizeof(resp), 0); + return bnxt_qplib_rcfw_send_message(rcfw, &msg); +} + +int bnxt_qplib_destroy_flow(struct bnxt_qplib_res *res) +{ + struct creq_roce_mirror_cfg_resp resp = {}; + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_roce_mirror_cfg req = {}; + struct bnxt_qplib_cmdqmsg msg = {}; + + bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, + CMDQ_BASE_OPCODE_ROCE_MIRROR_CFG, + sizeof(req)); + + req.mirror_flags &= ~((u8)CMDQ_ROCE_MIRROR_CFG_MIRROR_ENABLE); + + bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), + sizeof(resp), 0); + + return bnxt_qplib_rcfw_send_message(rcfw, &msg); +} diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 09faf4a1e849..147b5d9c0313 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -323,7 +323,8 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, struct bnxt_qplib_gid *gid, u16 vlan_id, bool update); int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, struct bnxt_qplib_gid *gid, const u8 *mac, u16 vlan_id, - bool update, u32 *index); + bool update, u32 *index, + bool is_ugid, u32 stats_ctx_id); int bnxt_qplib_update_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, struct bnxt_qplib_gid *gid, u16 gid_idx, const u8 *smac); @@ -358,6 +359,9 @@ int bnxt_qplib_read_context(struct bnxt_qplib_rcfw *rcfw, u8 type, u32 xid, u32 resp_size, void *resp_va); int bnxt_qplib_query_cc_param(struct bnxt_qplib_res *res, struct bnxt_qplib_cc_param *cc_param); +void bnxt_qplib_query_version(struct bnxt_qplib_rcfw *rcfw); +int bnxt_qplib_create_flow(struct bnxt_qplib_res *res); +int bnxt_qplib_destroy_flow(struct bnxt_qplib_res *res); #define BNXT_VAR_MAX_WQE 4352 #define BNXT_VAR_MAX_SLOT_ALIGN 256 diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index 024845f945ff..99ecd72e72e2 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -144,7 +144,8 @@ struct cmdq_base { #define CMDQ_BASE_OPCODE_MODIFY_CQ 0x90UL #define CMDQ_BASE_OPCODE_QUERY_QP_EXTEND 0x91UL #define CMDQ_BASE_OPCODE_QUERY_ROCE_STATS_EXT 0x92UL - #define CMDQ_BASE_OPCODE_LAST CMDQ_BASE_OPCODE_QUERY_ROCE_STATS_EXT + #define CMDQ_BASE_OPCODE_ROCE_MIRROR_CFG 0x99UL + #define CMDQ_BASE_OPCODE_LAST CMDQ_BASE_OPCODE_ROCE_MIRROR_CFG u8 cmd_size; __le16 flags; __le16 cookie; @@ -218,6 +219,7 @@ struct cmdq_initialize_fw { #define CMDQ_INITIALIZE_FW_FLAGS_HW_REQUESTER_RETX_SUPPORTED 0x2UL #define 
CMDQ_INITIALIZE_FW_FLAGS_OPTIMIZE_MODIFY_QP_SUPPORTED 0x8UL #define CMDQ_INITIALIZE_FW_FLAGS_L2_VF_RESOURCE_MGMT 0x10UL + #define CMDQ_INITIALIZE_FW_FLAGS_MIRROR_ON_ROCE_SUPPORTED 0x80UL __le16 cookie; u8 resp_size; u8 reserved8; @@ -788,7 +790,8 @@ struct creq_query_qp_resp_sb { #define CREQ_QUERY_QP_RESP_SB_ACCESS_REMOTE_ATOMIC 0x8UL __le16 pkey; __le32 qkey; - __le32 reserved32; + __le16 udp_src_port; + __le16 reserved16; __le32 dgid[4]; __le32 flow_label; __le16 sgid_index; @@ -2108,6 +2111,43 @@ struct creq_query_roce_stats_ext_resp_sb { __le64 dup_req; }; +/* cmdq_roce_mirror_cfg (size:192b/24B) */ +struct cmdq_roce_mirror_cfg { + u8 opcode; + #define CMDQ_ROCE_MIRROR_CFG_OPCODE_ROCE_MIRROR_CFG 0x99UL + #define CMDQ_ROCE_MIRROR_CFG_OPCODE_LAST \ + CMDQ_ROCE_MIRROR_CFG_OPCODE_ROCE_MIRROR_CFG + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + u8 mirror_flags; + #define CMDQ_ROCE_MIRROR_CFG_MIRROR_ENABLE 0x1UL + u8 rsvd[7]; +}; + +/* creq_roce_mirror_cfg_resp (size:128b/16B) */ +struct creq_roce_mirror_cfg_resp { + u8 type; + #define CREQ_ROCE_MIRROR_CFG_RESP_TYPE_MASK 0x3fUL + #define CREQ_ROCE_MIRROR_CFG_RESP_TYPE_SFT 0 + #define CREQ_ROCE_MIRROR_CFG_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_ROCE_MIRROR_CFG_RESP_TYPE_LAST \ + CREQ_ROCE_MIRROR_CFG_RESP_TYPE_QP_EVENT + u8 status; + __le16 cookie; + __le32 reserved32; + u8 v; + #define CREQ_ROCE_MIRROR_CFG_RESP_V 0x1UL + u8 event; + #define CREQ_ROCE_MIRROR_CFG_RESP_EVENT_ROCE_MIRROR_CFG 0x99UL + #define CREQ_ROCE_MIRROR_CFG_RESP_EVENT_LAST \ + CREQ_ROCE_MIRROR_CFG_RESP_EVENT_ROCE_MIRROR_CFG + u8 reserved48[6]; +}; + /* cmdq_query_func (size:128b/16B) */ struct cmdq_query_func { u8 opcode; diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index b67747ae6a68..d892f55febe2 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -1228,9 +1228,8 @@ static int c4iw_uld_state_change(void *handle, enum cxgb4_state new_state) if (!ctx->dev) { ctx->dev = c4iw_alloc(&ctx->lldi); if (IS_ERR(ctx->dev)) { - pr_err("%s: initialization failed: %ld\n", - pci_name(ctx->lldi.pdev), - PTR_ERR(ctx->dev)); + pr_err("%s: initialization failed: %pe\n", + pci_name(ctx->lldi.pdev), ctx->dev); ctx->dev = NULL; break; } diff --git a/drivers/infiniband/hw/efa/efa_com.c b/drivers/infiniband/hw/efa/efa_com.c index bafd210dd43e..0e979ca10d24 100644 --- a/drivers/infiniband/hw/efa/efa_com.c +++ b/drivers/infiniband/hw/efa/efa_com.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include "efa_com.h" @@ -30,6 +30,7 @@ struct efa_comp_ctx { struct efa_admin_acq_entry *user_cqe; u32 comp_size; enum efa_cmd_status status; + u16 cmd_id; u8 cmd_opcode; u8 occupied; }; @@ -333,6 +334,7 @@ static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queu comp_ctx->comp_size = comp_size_in_bytes; comp_ctx->user_cqe = comp; comp_ctx->cmd_opcode = cmd->aq_common_descriptor.opcode; + comp_ctx->cmd_id = cmd_id; reinit_completion(&comp_ctx->wait_event); @@ -557,17 +559,19 @@ static int efa_com_wait_and_process_admin_cq_interrupts(struct efa_comp_ctx *com if (comp_ctx->status == EFA_CMD_COMPLETED) ibdev_err_ratelimited( aq->efa_dev, - "The device sent a completion but the driver didn't receive any MSI-X interrupt for admin cmd %s(%d) status %d (ctx: 0x%p, sq producer: %d, sq consumer: %d, cq consumer: %d)\n", + "The device sent a completion but the driver didn't receive any MSI-X interrupt for admin cmd %s(%d) status %d (id: %d, sq producer: %d, sq consumer: %d, cq consumer: %d)\n", efa_com_cmd_str(comp_ctx->cmd_opcode), comp_ctx->cmd_opcode, comp_ctx->status, - comp_ctx, aq->sq.pc, aq->sq.cc, aq->cq.cc); + comp_ctx->cmd_id, aq->sq.pc, aq->sq.cc, + aq->cq.cc); else ibdev_err_ratelimited( aq->efa_dev, - "The device didn't send any completion for admin cmd %s(%d) status %d (ctx 0x%p, sq producer: %d, sq consumer: %d, cq consumer: %d)\n", + "The device didn't send any completion for admin cmd %s(%d) status %d (id: %d, sq producer: %d, sq consumer: %d, cq consumer: %d)\n", efa_com_cmd_str(comp_ctx->cmd_opcode), comp_ctx->cmd_opcode, comp_ctx->status, - comp_ctx, aq->sq.pc, aq->sq.cc, aq->cq.cc); + comp_ctx->cmd_id, aq->sq.pc, aq->sq.cc, + aq->cq.cc); clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); err = -ETIME; @@ -631,9 +635,9 @@ int efa_com_cmd_exec(struct efa_com_admin_queue *aq, if (IS_ERR(comp_ctx)) { ibdev_err_ratelimited( aq->efa_dev, - "Failed to submit command %s (opcode %u) err %ld\n", + "Failed to submit command %s (opcode %u) err %pe\n", efa_com_cmd_str(cmd->aq_common_descriptor.opcode), - cmd->aq_common_descriptor.opcode, PTR_ERR(comp_ctx)); + cmd->aq_common_descriptor.opcode, comp_ctx); up(&aq->avail_cmds); atomic64_inc(&aq->stats.cmd_err); diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 886923d5fe50..d9a12681f843 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1788,7 +1788,8 @@ struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, access_flags); if (IS_ERR(umem_dmabuf)) { err = PTR_ERR(umem_dmabuf); - ibdev_dbg(&dev->ibdev, "Failed to get dmabuf umem[%d]\n", err); + ibdev_dbg(&dev->ibdev, "Failed to get dmabuf umem[%pe]\n", + umem_dmabuf); goto err_free; } @@ -1832,7 +1833,8 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); ibdev_dbg(&dev->ibdev, - "Failed to pin and map user space memory[%d]\n", err); + "Failed to pin and map user space memory[%pe]\n", + mr->umem); goto err_free; } diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index fdeec33c71da..109a3f3de911 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -149,7 +149,7 @@ static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr) req.phy_addr[0] = mr->mem.mtt->buf_dma; mtt_level = ERDMA_MR_MTT_1LEVEL; } else { - req.phy_addr[0] = sg_dma_address(mr->mem.mtt->sglist); + req.phy_addr[0] = 
mr->mem.mtt->dma_addrs[0]; mtt_level = mr->mem.mtt->level; } } else if (mr->type != ERDMA_MR_TYPE_DMA) { @@ -626,18 +626,27 @@ err_free_mtt: return ERR_PTR(-ENOMEM); } -static void erdma_destroy_mtt_buf_sg(struct erdma_dev *dev, - struct erdma_mtt *mtt) +static void erdma_unmap_page_list(struct erdma_dev *dev, dma_addr_t *pg_dma, + u32 npages) { - dma_unmap_sg(&dev->pdev->dev, mtt->sglist, - DIV_ROUND_UP(mtt->size, PAGE_SIZE), DMA_TO_DEVICE); - vfree(mtt->sglist); + u32 i; + + for (i = 0; i < npages; i++) + dma_unmap_page(&dev->pdev->dev, pg_dma[i], PAGE_SIZE, + DMA_TO_DEVICE); +} + +static void erdma_destroy_mtt_buf_dma_addrs(struct erdma_dev *dev, + struct erdma_mtt *mtt) +{ + erdma_unmap_page_list(dev, mtt->dma_addrs, mtt->npages); + vfree(mtt->dma_addrs); } static void erdma_destroy_scatter_mtt(struct erdma_dev *dev, struct erdma_mtt *mtt) { - erdma_destroy_mtt_buf_sg(dev, mtt); + erdma_destroy_mtt_buf_dma_addrs(dev, mtt); vfree(mtt->buf); kfree(mtt); } @@ -645,50 +654,69 @@ static void erdma_destroy_scatter_mtt(struct erdma_dev *dev, static void erdma_init_middle_mtt(struct erdma_mtt *mtt, struct erdma_mtt *low_mtt) { - struct scatterlist *sg; - u32 idx = 0, i; + dma_addr_t *pg_addr = mtt->buf; + u32 i; - for_each_sg(low_mtt->sglist, sg, low_mtt->nsg, i) - mtt->buf[idx++] = sg_dma_address(sg); + for (i = 0; i < low_mtt->npages; i++) + pg_addr[i] = low_mtt->dma_addrs[i]; } -static int erdma_create_mtt_buf_sg(struct erdma_dev *dev, struct erdma_mtt *mtt) +static u32 vmalloc_to_dma_addrs(struct erdma_dev *dev, dma_addr_t **dma_addrs, + void *buf, u64 len) { - struct scatterlist *sglist; - void *buf = mtt->buf; - u32 npages, i, nsg; + dma_addr_t *pg_dma; struct page *pg; + u32 npages, i; + void *addr; - /* Failed if buf is not page aligned */ - if ((uintptr_t)buf & ~PAGE_MASK) - return -EINVAL; - - npages = DIV_ROUND_UP(mtt->size, PAGE_SIZE); - sglist = vzalloc(npages * sizeof(*sglist)); - if (!sglist) - return -ENOMEM; + npages = (PAGE_ALIGN((u64)buf + len) - PAGE_ALIGN_DOWN((u64)buf)) >> + PAGE_SHIFT; + pg_dma = vcalloc(npages, sizeof(*pg_dma)); + if (!pg_dma) + return 0; - sg_init_table(sglist, npages); + addr = buf; for (i = 0; i < npages; i++) { - pg = vmalloc_to_page(buf); + pg = vmalloc_to_page(addr); if (!pg) goto err; - sg_set_page(&sglist[i], pg, PAGE_SIZE, 0); - buf += PAGE_SIZE; + + pg_dma[i] = dma_map_page(&dev->pdev->dev, pg, 0, PAGE_SIZE, + DMA_TO_DEVICE); + if (dma_mapping_error(&dev->pdev->dev, pg_dma[i])) + goto err; + + addr += PAGE_SIZE; } - nsg = dma_map_sg(&dev->pdev->dev, sglist, npages, DMA_TO_DEVICE); - if (!nsg) - goto err; + *dma_addrs = pg_dma; - mtt->sglist = sglist; - mtt->nsg = nsg; + return npages; +err: + erdma_unmap_page_list(dev, pg_dma, i); + vfree(pg_dma); return 0; -err: - vfree(sglist); +} - return -ENOMEM; +static int erdma_create_mtt_buf_dma_addrs(struct erdma_dev *dev, + struct erdma_mtt *mtt) +{ + dma_addr_t *addrs; + u32 npages; + + /* Failed if buf is not page aligned */ + if ((uintptr_t)mtt->buf & ~PAGE_MASK) + return -EINVAL; + + npages = vmalloc_to_dma_addrs(dev, &addrs, mtt->buf, mtt->size); + if (!npages) + return -ENOMEM; + + mtt->dma_addrs = addrs; + mtt->npages = npages; + + return 0; } static struct erdma_mtt *erdma_create_scatter_mtt(struct erdma_dev *dev, @@ -707,12 +735,12 @@ static struct erdma_mtt *erdma_create_scatter_mtt(struct erdma_dev *dev, if (!mtt->buf) goto err_free_mtt; - ret = erdma_create_mtt_buf_sg(dev, mtt); + ret = erdma_create_mtt_buf_dma_addrs(dev, mtt); if (ret) goto err_free_mtt_buf; - ibdev_dbg(&dev->ibdev, 
"create scatter mtt, size:%lu, nsg:%u\n", - mtt->size, mtt->nsg); + ibdev_dbg(&dev->ibdev, "create scatter mtt, size:%lu, npages:%u\n", + mtt->size, mtt->npages); return mtt; @@ -746,8 +774,8 @@ static struct erdma_mtt *erdma_create_mtt(struct erdma_dev *dev, size_t size, level = 1; /* convergence the mtt table. */ - while (mtt->nsg != 1 && level <= 3) { - tmp_mtt = erdma_create_scatter_mtt(dev, MTT_SIZE(mtt->nsg)); + while (mtt->npages != 1 && level <= 3) { + tmp_mtt = erdma_create_scatter_mtt(dev, MTT_SIZE(mtt->npages)); if (IS_ERR(tmp_mtt)) { ret = PTR_ERR(tmp_mtt); goto err_free_mtt; @@ -765,7 +793,7 @@ static struct erdma_mtt *erdma_create_mtt(struct erdma_dev *dev, size_t size, mtt->level = level; ibdev_dbg(&dev->ibdev, "top mtt: level:%d, dma_addr 0x%llx\n", - mtt->level, mtt->sglist[0].dma_address); + mtt->level, mtt->dma_addrs[0]); return mtt; err_free_mtt: diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index ef411b81fbd7..7d8d3fe501d5 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -99,8 +99,8 @@ struct erdma_mtt { union { dma_addr_t buf_dma; struct { - struct scatterlist *sglist; - u32 nsg; + dma_addr_t *dma_addrs; + u32 npages; u32 level; }; }; diff --git a/drivers/infiniband/hw/hfi1/device.c b/drivers/infiniband/hw/hfi1/device.c index 4250d077b06f..a98a4175e53b 100644 --- a/drivers/infiniband/hw/hfi1/device.c +++ b/drivers/infiniband/hw/hfi1/device.c @@ -64,9 +64,9 @@ int hfi1_cdev_init(int minor, const char *name, if (IS_ERR(device)) { ret = PTR_ERR(device); + pr_err("Could not create device for minor %d, %s (err %pe)\n", + minor, name, device); device = NULL; - pr_err("Could not create device for minor %d, %s (err %d)\n", - minor, name, -ret); cdev_del(cdev); } done: diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 719b7c34e238..5cfa4f8fbf3d 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -990,7 +990,7 @@ ssize_t sdma_set_cpu_to_sde_map(struct sdma_engine *sde, const char *buf, } /* Clean up old mappings */ - for_each_cpu(cpu, cpu_online_mask) { + for_each_online_cpu(cpu) { struct sdma_rht_node *rht_node; /* Don't cleanup sdes that are set in the new mask */ diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index b72625283fcf..9b1aece1b080 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -498,8 +498,8 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, ntids, sizeof(*req->tids)); if (IS_ERR(tmp)) { ret = PTR_ERR(tmp); - SDMA_DBG(req, "Failed to copy %d TIDs (%d)", - ntids, ret); + SDMA_DBG(req, "Failed to copy %d TIDs (%pe)", ntids, + tmp); goto free_req; } req->tids = tmp; diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 0f037e545520..31cb8699e198 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -594,8 +594,8 @@ static int mtr_alloc_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, mtr->umem = ib_umem_get(ibdev, user_addr, total_size, buf_attr->user_access); if (IS_ERR(mtr->umem)) { - ibdev_err(ibdev, "failed to get umem, ret = %ld.\n", - PTR_ERR(mtr->umem)); + ibdev_err(ibdev, "failed to get umem, ret = %pe.\n", + mtr->umem); return -ENOMEM; } } else { @@ -605,8 +605,8 @@ static int mtr_alloc_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, 
!mtr_has_mtt(buf_attr) ? HNS_ROCE_BUF_DIRECT : 0); if (IS_ERR(mtr->kmem)) { - ibdev_err(ibdev, "failed to alloc kmem, ret = %ld.\n", - PTR_ERR(mtr->kmem)); + ibdev_err(ibdev, "failed to alloc kmem, ret = %pe.\n", + mtr->kmem); return PTR_ERR(mtr->kmem); } } diff --git a/drivers/infiniband/hw/ionic/Kconfig b/drivers/infiniband/hw/ionic/Kconfig new file mode 100644 index 000000000000..de6f10e9b6e9 --- /dev/null +++ b/drivers/infiniband/hw/ionic/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2018-2025, Advanced Micro Devices, Inc. + +config INFINIBAND_IONIC + tristate "AMD Pensando DSC RDMA/RoCE Support" + depends on NETDEVICES && ETHERNET && PCI && INET && IONIC + help + This enables RDMA/RoCE support for the AMD Pensando family of + Distributed Services Cards (DSCs). + + To learn more, visit our website at + <https://www.amd.com/en/products/accelerators/pensando.html>. + + To compile this driver as a module, choose M here. The module + will be called ionic_rdma. diff --git a/drivers/infiniband/hw/ionic/Makefile b/drivers/infiniband/hw/ionic/Makefile new file mode 100644 index 000000000000..957973742820 --- /dev/null +++ b/drivers/infiniband/hw/ionic/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-y := -I $(srctree)/drivers/net/ethernet/pensando/ionic + +obj-$(CONFIG_INFINIBAND_IONIC) += ionic_rdma.o + +ionic_rdma-y := \ + ionic_ibdev.o ionic_lif_cfg.o ionic_queue.o ionic_pgtbl.o ionic_admin.o \ + ionic_controlpath.o ionic_datapath.o ionic_hw_stats.o diff --git a/drivers/infiniband/hw/ionic/ionic_admin.c b/drivers/infiniband/hw/ionic/ionic_admin.c new file mode 100644 index 000000000000..2537aa55d12d --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_admin.c @@ -0,0 +1,1229 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. 
*/ + +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/printk.h> + +#include "ionic_fw.h" +#include "ionic_ibdev.h" + +#define IONIC_EQ_COUNT_MIN 4 +#define IONIC_AQ_COUNT_MIN 1 + +/* not a valid queue position or negative error status */ +#define IONIC_ADMIN_POSTED 0x10000 + +/* cpu can be held with irq disabled for COUNT * MS (for create/destroy_ah) */ +#define IONIC_ADMIN_BUSY_RETRY_COUNT 2000 +#define IONIC_ADMIN_BUSY_RETRY_MS 1 + +/* admin queue will be considered failed if a command takes longer */ +#define IONIC_ADMIN_TIMEOUT (HZ * 2) +#define IONIC_ADMIN_WARN (HZ / 8) + +/* will poll for admin cq to tolerate and report from missed event */ +#define IONIC_ADMIN_DELAY (HZ / 8) + +/* work queue for polling the event queue and admin cq */ +struct workqueue_struct *ionic_evt_workq; + +static void ionic_admin_timedout(struct ionic_aq *aq) +{ + struct ionic_ibdev *dev = aq->dev; + unsigned long irqflags; + u16 pos; + + spin_lock_irqsave(&aq->lock, irqflags); + if (ionic_queue_empty(&aq->q)) + goto out; + + /* Reset ALL adminq if any one times out */ + if (atomic_read(&aq->admin_state) < IONIC_ADMIN_KILLED) + queue_work(ionic_evt_workq, &dev->reset_work); + + ibdev_err(&dev->ibdev, "admin command timed out, aq %d after: %ums\n", + aq->aqid, (u32)jiffies_to_msecs(jiffies - aq->stamp)); + + pos = (aq->q.prod - 1) & aq->q.mask; + if (pos == aq->q.cons) + goto out; + + ibdev_warn(&dev->ibdev, "admin pos %u (last posted)\n", pos); + print_hex_dump(KERN_WARNING, "cmd ", DUMP_PREFIX_OFFSET, 16, 1, + ionic_queue_at(&aq->q, pos), + BIT(aq->q.stride_log2), true); + +out: + spin_unlock_irqrestore(&aq->lock, irqflags); +} + +static void ionic_admin_reset_dwork(struct ionic_ibdev *dev) +{ + if (atomic_read(&dev->admin_state) == IONIC_ADMIN_KILLED) + return; + + queue_delayed_work(ionic_evt_workq, &dev->admin_dwork, + IONIC_ADMIN_DELAY); +} + +static void ionic_admin_reset_wdog(struct ionic_aq *aq) +{ + if (atomic_read(&aq->admin_state) == IONIC_ADMIN_KILLED) + return; + + aq->stamp = jiffies; + ionic_admin_reset_dwork(aq->dev); +} + +static bool ionic_admin_next_cqe(struct ionic_ibdev *dev, struct ionic_cq *cq, + struct ionic_v1_cqe **cqe) +{ + struct ionic_v1_cqe *qcqe = ionic_queue_at_prod(&cq->q); + + if (unlikely(cq->color != ionic_v1_cqe_color(qcqe))) + return false; + + /* Prevent out-of-order reads of the CQE */ + dma_rmb(); + *cqe = qcqe; + + return true; +} + +static void ionic_admin_poll_locked(struct ionic_aq *aq) +{ + struct ionic_cq *cq = &aq->vcq->cq[0]; + struct ionic_admin_wr *wr, *wr_next; + struct ionic_ibdev *dev = aq->dev; + u32 wr_strides, avlbl_strides; + struct ionic_v1_cqe *cqe; + u32 qtf, qid; + u16 old_prod; + u8 type; + + lockdep_assert_held(&aq->lock); + + if (atomic_read(&aq->admin_state) == IONIC_ADMIN_KILLED) { + list_for_each_entry_safe(wr, wr_next, &aq->wr_prod, aq_ent) { + INIT_LIST_HEAD(&wr->aq_ent); + aq->q_wr[wr->status].wr = NULL; + wr->status = atomic_read(&aq->admin_state); + complete_all(&wr->work); + } + INIT_LIST_HEAD(&aq->wr_prod); + + list_for_each_entry_safe(wr, wr_next, &aq->wr_post, aq_ent) { + INIT_LIST_HEAD(&wr->aq_ent); + wr->status = atomic_read(&aq->admin_state); + complete_all(&wr->work); + } + INIT_LIST_HEAD(&aq->wr_post); + + return; + } + + old_prod = cq->q.prod; + + while (ionic_admin_next_cqe(dev, cq, &cqe)) { + qtf = ionic_v1_cqe_qtf(cqe); + qid = ionic_v1_cqe_qtf_qid(qtf); + type = ionic_v1_cqe_qtf_type(qtf); + + if (unlikely(type != IONIC_V1_CQE_TYPE_ADMIN)) { + ibdev_warn_ratelimited(&dev->ibdev, + "bad cqe type 
%u\n", type); + goto cq_next; + } + + if (unlikely(qid != aq->aqid)) { + ibdev_warn_ratelimited(&dev->ibdev, + "bad cqe qid %u\n", qid); + goto cq_next; + } + + if (unlikely(be16_to_cpu(cqe->admin.cmd_idx) != aq->q.cons)) { + ibdev_warn_ratelimited(&dev->ibdev, + "bad idx %u cons %u qid %u\n", + be16_to_cpu(cqe->admin.cmd_idx), + aq->q.cons, qid); + goto cq_next; + } + + if (unlikely(ionic_queue_empty(&aq->q))) { + ibdev_warn_ratelimited(&dev->ibdev, + "bad cqe for empty adminq\n"); + goto cq_next; + } + + wr = aq->q_wr[aq->q.cons].wr; + if (wr) { + aq->q_wr[aq->q.cons].wr = NULL; + list_del_init(&wr->aq_ent); + + wr->cqe = *cqe; + wr->status = atomic_read(&aq->admin_state); + complete_all(&wr->work); + } + + ionic_queue_consume_entries(&aq->q, + aq->q_wr[aq->q.cons].wqe_strides); + +cq_next: + ionic_queue_produce(&cq->q); + cq->color = ionic_color_wrap(cq->q.prod, cq->color); + } + + if (old_prod != cq->q.prod) { + ionic_admin_reset_wdog(aq); + cq->q.cons = cq->q.prod; + ionic_dbell_ring(dev->lif_cfg.dbpage, dev->lif_cfg.cq_qtype, + ionic_queue_dbell_val(&cq->q)); + queue_work(ionic_evt_workq, &aq->work); + } else if (!aq->armed) { + aq->armed = true; + cq->arm_any_prod = ionic_queue_next(&cq->q, cq->arm_any_prod); + ionic_dbell_ring(dev->lif_cfg.dbpage, dev->lif_cfg.cq_qtype, + cq->q.dbell | IONIC_CQ_RING_ARM | + cq->arm_any_prod); + queue_work(ionic_evt_workq, &aq->work); + } + + if (atomic_read(&aq->admin_state) != IONIC_ADMIN_ACTIVE) + return; + + old_prod = aq->q.prod; + + if (ionic_queue_empty(&aq->q) && !list_empty(&aq->wr_post)) + ionic_admin_reset_wdog(aq); + + if (list_empty(&aq->wr_post)) + return; + + do { + u8 *src; + int i, src_len; + size_t stride_len; + + wr = list_first_entry(&aq->wr_post, struct ionic_admin_wr, + aq_ent); + wr_strides = (le16_to_cpu(wr->wqe.len) + ADMIN_WQE_HDR_LEN + + (ADMIN_WQE_STRIDE - 1)) >> aq->q.stride_log2; + avlbl_strides = ionic_queue_length_remaining(&aq->q); + + if (wr_strides > avlbl_strides) + break; + + list_move(&wr->aq_ent, &aq->wr_prod); + wr->status = aq->q.prod; + aq->q_wr[aq->q.prod].wr = wr; + aq->q_wr[aq->q.prod].wqe_strides = wr_strides; + + src_len = le16_to_cpu(wr->wqe.len); + src = (uint8_t *)&wr->wqe.cmd; + + /* First stride */ + memcpy(ionic_queue_at_prod(&aq->q), &wr->wqe, + ADMIN_WQE_HDR_LEN); + stride_len = ADMIN_WQE_STRIDE - ADMIN_WQE_HDR_LEN; + if (stride_len > src_len) + stride_len = src_len; + memcpy(ionic_queue_at_prod(&aq->q) + ADMIN_WQE_HDR_LEN, + src, stride_len); + ibdev_dbg(&dev->ibdev, "post admin prod %u (%u strides)\n", + aq->q.prod, wr_strides); + print_hex_dump_debug("wqe ", DUMP_PREFIX_OFFSET, 16, 1, + ionic_queue_at_prod(&aq->q), + BIT(aq->q.stride_log2), true); + ionic_queue_produce(&aq->q); + + /* Remaining strides */ + for (i = stride_len; i < src_len; i += stride_len) { + stride_len = ADMIN_WQE_STRIDE; + + if (i + stride_len > src_len) + stride_len = src_len - i; + + memcpy(ionic_queue_at_prod(&aq->q), src + i, + stride_len); + print_hex_dump_debug("wqe ", DUMP_PREFIX_OFFSET, 16, 1, + ionic_queue_at_prod(&aq->q), + BIT(aq->q.stride_log2), true); + ionic_queue_produce(&aq->q); + } + } while (!list_empty(&aq->wr_post)); + + if (old_prod != aq->q.prod) + ionic_dbell_ring(dev->lif_cfg.dbpage, dev->lif_cfg.aq_qtype, + ionic_queue_dbell_val(&aq->q)); +} + +static void ionic_admin_dwork(struct work_struct *ws) +{ + struct ionic_ibdev *dev = + container_of(ws, struct ionic_ibdev, admin_dwork.work); + struct ionic_aq *aq, *bad_aq = NULL; + bool do_reschedule = false; + unsigned long irqflags; + bool do_reset = 
false; + u16 pos; + int i; + + for (i = 0; i < dev->lif_cfg.aq_count; i++) { + aq = dev->aq_vec[i]; + + spin_lock_irqsave(&aq->lock, irqflags); + + if (ionic_queue_empty(&aq->q)) + goto next_aq; + + /* Reschedule if any queue has outstanding work */ + do_reschedule = true; + + if (time_is_after_eq_jiffies(aq->stamp + IONIC_ADMIN_WARN)) + /* Warning threshold not met, nothing to do */ + goto next_aq; + + /* See if polling now makes some progress */ + pos = aq->q.cons; + ionic_admin_poll_locked(aq); + if (pos != aq->q.cons) { + ibdev_dbg(&dev->ibdev, + "missed event for acq %d\n", aq->cqid); + goto next_aq; + } + + if (time_is_after_eq_jiffies(aq->stamp + + IONIC_ADMIN_TIMEOUT)) { + /* Timeout threshold not met */ + ibdev_dbg(&dev->ibdev, "no progress after %ums\n", + (u32)jiffies_to_msecs(jiffies - aq->stamp)); + goto next_aq; + } + + /* Queue timed out */ + bad_aq = aq; + do_reset = true; +next_aq: + spin_unlock_irqrestore(&aq->lock, irqflags); + } + + if (do_reset) + /* Reset RDMA lif on a timeout */ + ionic_admin_timedout(bad_aq); + else if (do_reschedule) + /* Try to poll again later */ + ionic_admin_reset_dwork(dev); +} + +static void ionic_admin_work(struct work_struct *ws) +{ + struct ionic_aq *aq = container_of(ws, struct ionic_aq, work); + unsigned long irqflags; + + spin_lock_irqsave(&aq->lock, irqflags); + ionic_admin_poll_locked(aq); + spin_unlock_irqrestore(&aq->lock, irqflags); +} + +static void ionic_admin_post_aq(struct ionic_aq *aq, struct ionic_admin_wr *wr) +{ + unsigned long irqflags; + bool poll; + + wr->status = IONIC_ADMIN_POSTED; + wr->aq = aq; + + spin_lock_irqsave(&aq->lock, irqflags); + poll = list_empty(&aq->wr_post); + list_add(&wr->aq_ent, &aq->wr_post); + if (poll) + ionic_admin_poll_locked(aq); + spin_unlock_irqrestore(&aq->lock, irqflags); +} + +void ionic_admin_post(struct ionic_ibdev *dev, struct ionic_admin_wr *wr) +{ + int aq_idx; + + /* Use cpu id for the adminq selection */ + aq_idx = raw_smp_processor_id() % dev->lif_cfg.aq_count; + ionic_admin_post_aq(dev->aq_vec[aq_idx], wr); +} + +static void ionic_admin_cancel(struct ionic_admin_wr *wr) +{ + struct ionic_aq *aq = wr->aq; + unsigned long irqflags; + + spin_lock_irqsave(&aq->lock, irqflags); + + if (!list_empty(&wr->aq_ent)) { + list_del(&wr->aq_ent); + if (wr->status != IONIC_ADMIN_POSTED) + aq->q_wr[wr->status].wr = NULL; + } + + spin_unlock_irqrestore(&aq->lock, irqflags); +} + +static int ionic_admin_busy_wait(struct ionic_admin_wr *wr) +{ + struct ionic_aq *aq = wr->aq; + unsigned long irqflags; + int try_i; + + for (try_i = 0; try_i < IONIC_ADMIN_BUSY_RETRY_COUNT; ++try_i) { + if (completion_done(&wr->work)) + return 0; + + mdelay(IONIC_ADMIN_BUSY_RETRY_MS); + + spin_lock_irqsave(&aq->lock, irqflags); + ionic_admin_poll_locked(aq); + spin_unlock_irqrestore(&aq->lock, irqflags); + } + + /* + * we timed out. Initiate RDMA LIF reset and indicate + * error to caller. + */ + ionic_admin_timedout(aq); + return -ETIMEDOUT; +} + +int ionic_admin_wait(struct ionic_ibdev *dev, struct ionic_admin_wr *wr, + enum ionic_admin_flags flags) +{ + int rc, timo; + + if (flags & IONIC_ADMIN_F_BUSYWAIT) { + /* Spin */ + rc = ionic_admin_busy_wait(wr); + } else if (flags & IONIC_ADMIN_F_INTERRUPT) { + /* + * Interruptible sleep, 1s timeout + * This is used for commands which are safe for the caller + * to clean up without killing and resetting the adminq. 
+ */ + timo = wait_for_completion_interruptible_timeout(&wr->work, + HZ); + if (timo > 0) + rc = 0; + else if (timo == 0) + rc = -ETIMEDOUT; + else + rc = timo; + } else { + /* + * Uninterruptible sleep + * This is used for commands which are NOT safe for the + * caller to clean up. Cleanup must be handled by the + * adminq kill and reset process so that host memory is + * not corrupted by the device. + */ + wait_for_completion(&wr->work); + rc = 0; + } + + if (rc) { + ibdev_warn(&dev->ibdev, "wait status %d\n", rc); + ionic_admin_cancel(wr); + } else if (wr->status == IONIC_ADMIN_KILLED) { + ibdev_dbg(&dev->ibdev, "admin killed\n"); + + /* No error if admin already killed during teardown */ + rc = (flags & IONIC_ADMIN_F_TEARDOWN) ? 0 : -ENODEV; + } else if (ionic_v1_cqe_error(&wr->cqe)) { + ibdev_warn(&dev->ibdev, "opcode %u error %u\n", + wr->wqe.op, + be32_to_cpu(wr->cqe.status_length)); + rc = -EINVAL; + } + return rc; +} + +static int ionic_rdma_devcmd(struct ionic_ibdev *dev, + struct ionic_admin_ctx *admin) +{ + int rc; + + rc = ionic_adminq_post_wait(dev->lif_cfg.lif, admin); + if (rc) + return rc; + + return ionic_error_to_errno(admin->comp.comp.status); +} + +int ionic_rdma_reset_devcmd(struct ionic_ibdev *dev) +{ + struct ionic_admin_ctx admin = { + .work = COMPLETION_INITIALIZER_ONSTACK(admin.work), + .cmd.rdma_reset = { + .opcode = IONIC_CMD_RDMA_RESET_LIF, + .lif_index = cpu_to_le16(dev->lif_cfg.lif_index), + }, + }; + + return ionic_rdma_devcmd(dev, &admin); +} + +static int ionic_rdma_queue_devcmd(struct ionic_ibdev *dev, + struct ionic_queue *q, + u32 qid, u32 cid, u16 opcode) +{ + struct ionic_admin_ctx admin = { + .work = COMPLETION_INITIALIZER_ONSTACK(admin.work), + .cmd.rdma_queue = { + .opcode = opcode, + .lif_index = cpu_to_le16(dev->lif_cfg.lif_index), + .qid_ver = cpu_to_le32(qid), + .cid = cpu_to_le32(cid), + .dbid = cpu_to_le16(dev->lif_cfg.dbid), + .depth_log2 = q->depth_log2, + .stride_log2 = q->stride_log2, + .dma_addr = cpu_to_le64(q->dma), + }, + }; + + return ionic_rdma_devcmd(dev, &admin); +} + +static void ionic_rdma_admincq_comp(struct ib_cq *ibcq, void *cq_context) +{ + struct ionic_aq *aq = cq_context; + unsigned long irqflags; + + spin_lock_irqsave(&aq->lock, irqflags); + aq->armed = false; + if (atomic_read(&aq->admin_state) < IONIC_ADMIN_KILLED) + queue_work(ionic_evt_workq, &aq->work); + spin_unlock_irqrestore(&aq->lock, irqflags); +} + +static void ionic_rdma_admincq_event(struct ib_event *event, void *cq_context) +{ + struct ionic_aq *aq = cq_context; + + ibdev_err(&aq->dev->ibdev, "admincq event %d\n", event->event); +} + +static struct ionic_vcq *ionic_create_rdma_admincq(struct ionic_ibdev *dev, + int comp_vector) +{ + struct ib_cq_init_attr attr = { + .cqe = IONIC_AQ_DEPTH, + .comp_vector = comp_vector, + }; + struct ionic_tbl_buf buf = {}; + struct ionic_vcq *vcq; + struct ionic_cq *cq; + int rc; + + vcq = kzalloc(sizeof(*vcq), GFP_KERNEL); + if (!vcq) + return ERR_PTR(-ENOMEM); + + vcq->ibcq.device = &dev->ibdev; + vcq->ibcq.comp_handler = ionic_rdma_admincq_comp; + vcq->ibcq.event_handler = ionic_rdma_admincq_event; + atomic_set(&vcq->ibcq.usecnt, 0); + + vcq->udma_mask = 1; + cq = &vcq->cq[0]; + + rc = ionic_create_cq_common(vcq, &buf, &attr, NULL, NULL, + NULL, NULL, 0); + if (rc) + goto err_init; + + rc = ionic_rdma_queue_devcmd(dev, &cq->q, cq->cqid, cq->eqid, + IONIC_CMD_RDMA_CREATE_CQ); + if (rc) + goto err_cmd; + + return vcq; + +err_cmd: + ionic_destroy_cq_common(dev, cq); +err_init: + kfree(vcq); + + return ERR_PTR(rc); +} + 
+static struct ionic_aq *__ionic_create_rdma_adminq(struct ionic_ibdev *dev, + u32 aqid, u32 cqid) +{ + struct ionic_aq *aq; + int rc; + + aq = kzalloc(sizeof(*aq), GFP_KERNEL); + if (!aq) + return ERR_PTR(-ENOMEM); + + atomic_set(&aq->admin_state, IONIC_ADMIN_KILLED); + aq->dev = dev; + aq->aqid = aqid; + aq->cqid = cqid; + spin_lock_init(&aq->lock); + + rc = ionic_queue_init(&aq->q, dev->lif_cfg.hwdev, IONIC_EQ_DEPTH, + ADMIN_WQE_STRIDE); + if (rc) + goto err_q; + + ionic_queue_dbell_init(&aq->q, aq->aqid); + + aq->q_wr = kcalloc((u32)aq->q.mask + 1, sizeof(*aq->q_wr), GFP_KERNEL); + if (!aq->q_wr) { + rc = -ENOMEM; + goto err_wr; + } + + INIT_LIST_HEAD(&aq->wr_prod); + INIT_LIST_HEAD(&aq->wr_post); + + INIT_WORK(&aq->work, ionic_admin_work); + aq->armed = false; + + return aq; + +err_wr: + ionic_queue_destroy(&aq->q, dev->lif_cfg.hwdev); +err_q: + kfree(aq); + + return ERR_PTR(rc); +} + +static void __ionic_destroy_rdma_adminq(struct ionic_ibdev *dev, + struct ionic_aq *aq) +{ + kfree(aq->q_wr); + ionic_queue_destroy(&aq->q, dev->lif_cfg.hwdev); + kfree(aq); +} + +static struct ionic_aq *ionic_create_rdma_adminq(struct ionic_ibdev *dev, + u32 aqid, u32 cqid) +{ + struct ionic_aq *aq; + int rc; + + aq = __ionic_create_rdma_adminq(dev, aqid, cqid); + if (IS_ERR(aq)) + return aq; + + rc = ionic_rdma_queue_devcmd(dev, &aq->q, aq->aqid, aq->cqid, + IONIC_CMD_RDMA_CREATE_ADMINQ); + if (rc) + goto err_cmd; + + return aq; + +err_cmd: + __ionic_destroy_rdma_adminq(dev, aq); + + return ERR_PTR(rc); +} + +static void ionic_flush_qs(struct ionic_ibdev *dev) +{ + struct ionic_qp *qp, *qp_tmp; + struct ionic_cq *cq, *cq_tmp; + LIST_HEAD(flush_list); + unsigned long index; + + WARN_ON(!irqs_disabled()); + + /* Flush qp send and recv */ + xa_lock(&dev->qp_tbl); + xa_for_each(&dev->qp_tbl, index, qp) { + kref_get(&qp->qp_kref); + list_add_tail(&qp->ibkill_flush_ent, &flush_list); + } + xa_unlock(&dev->qp_tbl); + + list_for_each_entry_safe(qp, qp_tmp, &flush_list, ibkill_flush_ent) { + ionic_flush_qp(dev, qp); + kref_put(&qp->qp_kref, ionic_qp_complete); + list_del(&qp->ibkill_flush_ent); + } + + /* Notify completions */ + xa_lock(&dev->cq_tbl); + xa_for_each(&dev->cq_tbl, index, cq) { + kref_get(&cq->cq_kref); + list_add_tail(&cq->ibkill_flush_ent, &flush_list); + } + xa_unlock(&dev->cq_tbl); + + list_for_each_entry_safe(cq, cq_tmp, &flush_list, ibkill_flush_ent) { + ionic_notify_flush_cq(cq); + kref_put(&cq->cq_kref, ionic_cq_complete); + list_del(&cq->ibkill_flush_ent); + } +} + +static void ionic_kill_ibdev(struct ionic_ibdev *dev, bool fatal_path) +{ + unsigned long irqflags; + bool do_flush = false; + int i; + + /* Mark AQs for drain and flush the QPs while irq is disabled */ + local_irq_save(irqflags); + + /* Mark the admin queue, flushing at most once */ + for (i = 0; i < dev->lif_cfg.aq_count; i++) { + struct ionic_aq *aq = dev->aq_vec[i]; + + spin_lock(&aq->lock); + if (atomic_read(&aq->admin_state) != IONIC_ADMIN_KILLED) { + atomic_set(&aq->admin_state, IONIC_ADMIN_KILLED); + /* Flush incomplete admin commands */ + ionic_admin_poll_locked(aq); + do_flush = true; + } + spin_unlock(&aq->lock); + } + + if (do_flush) + ionic_flush_qs(dev); + + local_irq_restore(irqflags); + + /* Post a fatal event if requested */ + if (fatal_path) { + struct ib_event ev; + + ev.device = &dev->ibdev; + ev.element.port_num = 1; + ev.event = IB_EVENT_DEVICE_FATAL; + + ib_dispatch_event(&ev); + } + + atomic_set(&dev->admin_state, IONIC_ADMIN_KILLED); +} + +void ionic_kill_rdma_admin(struct ionic_ibdev *dev, bool 
fatal_path) +{ + enum ionic_admin_state old_state; + unsigned long irqflags = 0; + int i, rc; + + if (!dev->aq_vec) + return; + + /* + * Admin queues are transitioned from active to paused to killed state. + * When in paused state, no new commands are issued to the device, + * nor are any completed locally. After resetting the lif, it will be + * safe to resume the rdma admin queues in the killed state. Commands + * will not be issued to the device, but will complete locally with status + * IONIC_ADMIN_KILLED. Handling completion will ensure that creating or + * modifying resources fails, but destroying resources succeeds. + * If there was a failure resetting the lif using this strategy, + * then the state of the device is unknown. + */ + old_state = atomic_cmpxchg(&dev->admin_state, IONIC_ADMIN_ACTIVE, + IONIC_ADMIN_PAUSED); + if (old_state != IONIC_ADMIN_ACTIVE) + return; + + /* Pause all the AQs */ + local_irq_save(irqflags); + for (i = 0; i < dev->lif_cfg.aq_count; i++) { + struct ionic_aq *aq = dev->aq_vec[i]; + + spin_lock(&aq->lock); + /* pause rdma admin queues to reset lif */ + if (atomic_read(&aq->admin_state) == IONIC_ADMIN_ACTIVE) + atomic_set(&aq->admin_state, IONIC_ADMIN_PAUSED); + spin_unlock(&aq->lock); + } + local_irq_restore(irqflags); + + rc = ionic_rdma_reset_devcmd(dev); + if (unlikely(rc)) { + ibdev_err(&dev->ibdev, "failed to reset rdma %d\n", rc); + ionic_request_rdma_reset(dev->lif_cfg.lif); + } + + ionic_kill_ibdev(dev, fatal_path); +} + +static void ionic_reset_work(struct work_struct *ws) +{ + struct ionic_ibdev *dev = + container_of(ws, struct ionic_ibdev, reset_work); + + ionic_kill_rdma_admin(dev, true); +} + +static bool ionic_next_eqe(struct ionic_eq *eq, struct ionic_v1_eqe *eqe) +{ + struct ionic_v1_eqe *qeqe; + bool color; + + qeqe = ionic_queue_at_prod(&eq->q); + color = ionic_v1_eqe_color(qeqe); + + /* cons is color for eq */ + if (eq->q.cons != color) + return false; + + /* Prevent out-of-order reads of the EQE */ + dma_rmb(); + + ibdev_dbg(&eq->dev->ibdev, "poll eq prod %u\n", eq->q.prod); + print_hex_dump_debug("eqe ", DUMP_PREFIX_OFFSET, 16, 1, + qeqe, BIT(eq->q.stride_log2), true); + *eqe = *qeqe; + + return true; +} + +static void ionic_cq_event(struct ionic_ibdev *dev, u32 cqid, u8 code) +{ + unsigned long irqflags; + struct ib_event ibev; + struct ionic_cq *cq; + + xa_lock_irqsave(&dev->cq_tbl, irqflags); + cq = xa_load(&dev->cq_tbl, cqid); + if (cq) + kref_get(&cq->cq_kref); + xa_unlock_irqrestore(&dev->cq_tbl, irqflags); + + if (!cq) { + ibdev_dbg(&dev->ibdev, + "missing cqid %#x code %u\n", cqid, code); + return; + } + + switch (code) { + case IONIC_V1_EQE_CQ_NOTIFY: + if (cq->vcq->ibcq.comp_handler) + cq->vcq->ibcq.comp_handler(&cq->vcq->ibcq, + cq->vcq->ibcq.cq_context); + break; + + case IONIC_V1_EQE_CQ_ERR: + if (cq->vcq->ibcq.event_handler) { + ibev.event = IB_EVENT_CQ_ERR; + ibev.device = &dev->ibdev; + ibev.element.cq = &cq->vcq->ibcq; + + cq->vcq->ibcq.event_handler(&ibev, + cq->vcq->ibcq.cq_context); + } + break; + + default: + ibdev_dbg(&dev->ibdev, + "unrecognized cqid %#x code %u\n", cqid, code); + break; + } + + kref_put(&cq->cq_kref, ionic_cq_complete); +} + +static void ionic_qp_event(struct ionic_ibdev *dev, u32 qpid, u8 code) +{ + unsigned long irqflags; + struct ib_event ibev; + struct ionic_qp *qp; + + xa_lock_irqsave(&dev->qp_tbl, irqflags); + qp = xa_load(&dev->qp_tbl, qpid); + if (qp) + kref_get(&qp->qp_kref); + xa_unlock_irqrestore(&dev->qp_tbl, irqflags); + + if (!qp) { + ibdev_dbg(&dev->ibdev, + "missing qpid %#x 
code %u\n", qpid, code); + return; + } + + ibev.device = &dev->ibdev; + ibev.element.qp = &qp->ibqp; + + switch (code) { + case IONIC_V1_EQE_SQ_DRAIN: + ibev.event = IB_EVENT_SQ_DRAINED; + break; + + case IONIC_V1_EQE_QP_COMM_EST: + ibev.event = IB_EVENT_COMM_EST; + break; + + case IONIC_V1_EQE_QP_LAST_WQE: + ibev.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + + case IONIC_V1_EQE_QP_ERR: + ibev.event = IB_EVENT_QP_FATAL; + break; + + case IONIC_V1_EQE_QP_ERR_REQUEST: + ibev.event = IB_EVENT_QP_REQ_ERR; + break; + + case IONIC_V1_EQE_QP_ERR_ACCESS: + ibev.event = IB_EVENT_QP_ACCESS_ERR; + break; + + default: + ibdev_dbg(&dev->ibdev, + "unrecognized qpid %#x code %u\n", qpid, code); + goto out; + } + + if (qp->ibqp.event_handler) + qp->ibqp.event_handler(&ibev, qp->ibqp.qp_context); + +out: + kref_put(&qp->qp_kref, ionic_qp_complete); +} + +static u16 ionic_poll_eq(struct ionic_eq *eq, u16 budget) +{ + struct ionic_ibdev *dev = eq->dev; + struct ionic_v1_eqe eqe; + u16 npolled = 0; + u8 type, code; + u32 evt, qid; + + while (npolled < budget) { + if (!ionic_next_eqe(eq, &eqe)) + break; + + ionic_queue_produce(&eq->q); + + /* cons is color for eq */ + eq->q.cons = ionic_color_wrap(eq->q.prod, eq->q.cons); + + ++npolled; + + evt = ionic_v1_eqe_evt(&eqe); + type = ionic_v1_eqe_evt_type(evt); + code = ionic_v1_eqe_evt_code(evt); + qid = ionic_v1_eqe_evt_qid(evt); + + switch (type) { + case IONIC_V1_EQE_TYPE_CQ: + ionic_cq_event(dev, qid, code); + break; + + case IONIC_V1_EQE_TYPE_QP: + ionic_qp_event(dev, qid, code); + break; + + default: + ibdev_dbg(&dev->ibdev, + "unknown event %#x type %u\n", evt, type); + } + } + + return npolled; +} + +static void ionic_poll_eq_work(struct work_struct *work) +{ + struct ionic_eq *eq = container_of(work, struct ionic_eq, work); + u32 npolled; + + if (unlikely(!eq->enable) || WARN_ON(eq->armed)) + return; + + npolled = ionic_poll_eq(eq, IONIC_EQ_WORK_BUDGET); + if (npolled == IONIC_EQ_WORK_BUDGET) { + ionic_intr_credits(eq->dev->lif_cfg.intr_ctrl, eq->intr, + npolled, 0); + queue_work(ionic_evt_workq, &eq->work); + } else { + xchg(&eq->armed, 1); + ionic_intr_credits(eq->dev->lif_cfg.intr_ctrl, eq->intr, + 0, IONIC_INTR_CRED_UNMASK); + } +} + +static irqreturn_t ionic_poll_eq_isr(int irq, void *eqptr) +{ + struct ionic_eq *eq = eqptr; + int was_armed; + u32 npolled; + + was_armed = xchg(&eq->armed, 0); + + if (unlikely(!eq->enable) || !was_armed) + return IRQ_HANDLED; + + npolled = ionic_poll_eq(eq, IONIC_EQ_ISR_BUDGET); + if (npolled == IONIC_EQ_ISR_BUDGET) { + ionic_intr_credits(eq->dev->lif_cfg.intr_ctrl, eq->intr, + npolled, 0); + queue_work(ionic_evt_workq, &eq->work); + } else { + xchg(&eq->armed, 1); + ionic_intr_credits(eq->dev->lif_cfg.intr_ctrl, eq->intr, + 0, IONIC_INTR_CRED_UNMASK); + } + + return IRQ_HANDLED; +} + +static struct ionic_eq *ionic_create_eq(struct ionic_ibdev *dev, int eqid) +{ + struct ionic_intr_info intr_obj = { }; + struct ionic_eq *eq; + int rc; + + eq = kzalloc(sizeof(*eq), GFP_KERNEL); + if (!eq) + return ERR_PTR(-ENOMEM); + + eq->dev = dev; + + rc = ionic_queue_init(&eq->q, dev->lif_cfg.hwdev, IONIC_EQ_DEPTH, + sizeof(struct ionic_v1_eqe)); + if (rc) + goto err_q; + + eq->eqid = eqid; + + eq->armed = true; + eq->enable = false; + INIT_WORK(&eq->work, ionic_poll_eq_work); + + rc = ionic_intr_alloc(dev->lif_cfg.lif, &intr_obj); + if (rc < 0) + goto err_intr; + + eq->irq = intr_obj.vector; + eq->intr = intr_obj.index; + + ionic_queue_dbell_init(&eq->q, eq->eqid); + + /* cons is color for eq */ + eq->q.cons = true; + + 
snprintf(eq->name, sizeof(eq->name), "%s-%d-%d-eq", + "ionr", dev->lif_cfg.lif_index, eq->eqid); + + ionic_intr_mask(dev->lif_cfg.intr_ctrl, eq->intr, IONIC_INTR_MASK_SET); + ionic_intr_mask_assert(dev->lif_cfg.intr_ctrl, eq->intr, IONIC_INTR_MASK_SET); + ionic_intr_coal_init(dev->lif_cfg.intr_ctrl, eq->intr, 0); + ionic_intr_clean(dev->lif_cfg.intr_ctrl, eq->intr); + + eq->enable = true; + + rc = request_irq(eq->irq, ionic_poll_eq_isr, 0, eq->name, eq); + if (rc) + goto err_irq; + + rc = ionic_rdma_queue_devcmd(dev, &eq->q, eq->eqid, eq->intr, + IONIC_CMD_RDMA_CREATE_EQ); + if (rc) + goto err_cmd; + + ionic_intr_mask(dev->lif_cfg.intr_ctrl, eq->intr, IONIC_INTR_MASK_CLEAR); + + return eq; + +err_cmd: + eq->enable = false; + free_irq(eq->irq, eq); + flush_work(&eq->work); +err_irq: + ionic_intr_free(dev->lif_cfg.lif, eq->intr); +err_intr: + ionic_queue_destroy(&eq->q, dev->lif_cfg.hwdev); +err_q: + kfree(eq); + + return ERR_PTR(rc); +} + +static void ionic_destroy_eq(struct ionic_eq *eq) +{ + struct ionic_ibdev *dev = eq->dev; + + eq->enable = false; + free_irq(eq->irq, eq); + flush_work(&eq->work); + + ionic_intr_free(dev->lif_cfg.lif, eq->intr); + ionic_queue_destroy(&eq->q, dev->lif_cfg.hwdev); + kfree(eq); +} + +int ionic_create_rdma_admin(struct ionic_ibdev *dev) +{ + int eq_i = 0, aq_i = 0, rc = 0; + struct ionic_vcq *vcq; + struct ionic_aq *aq; + struct ionic_eq *eq; + + dev->eq_vec = NULL; + dev->aq_vec = NULL; + + INIT_WORK(&dev->reset_work, ionic_reset_work); + INIT_DELAYED_WORK(&dev->admin_dwork, ionic_admin_dwork); + atomic_set(&dev->admin_state, IONIC_ADMIN_KILLED); + + if (dev->lif_cfg.aq_count > IONIC_AQ_COUNT) { + ibdev_dbg(&dev->ibdev, "limiting adminq count to %d\n", + IONIC_AQ_COUNT); + dev->lif_cfg.aq_count = IONIC_AQ_COUNT; + } + + if (dev->lif_cfg.eq_count > IONIC_EQ_COUNT) { + dev_dbg(&dev->ibdev.dev, "limiting eventq count to %d\n", + IONIC_EQ_COUNT); + dev->lif_cfg.eq_count = IONIC_EQ_COUNT; + } + + /* need at least two eq and one aq */ + if (dev->lif_cfg.eq_count < IONIC_EQ_COUNT_MIN || + dev->lif_cfg.aq_count < IONIC_AQ_COUNT_MIN) { + rc = -EINVAL; + goto out; + } + + dev->eq_vec = kmalloc_array(dev->lif_cfg.eq_count, sizeof(*dev->eq_vec), + GFP_KERNEL); + if (!dev->eq_vec) { + rc = -ENOMEM; + goto out; + } + + for (eq_i = 0; eq_i < dev->lif_cfg.eq_count; ++eq_i) { + eq = ionic_create_eq(dev, eq_i + dev->lif_cfg.eq_base); + if (IS_ERR(eq)) { + rc = PTR_ERR(eq); + + if (eq_i < IONIC_EQ_COUNT_MIN) { + ibdev_err(&dev->ibdev, + "fail create eq %pe\n", eq); + goto out; + } + + /* ok, just fewer eq than device supports */ + ibdev_dbg(&dev->ibdev, "eq count %d want %d rc %pe\n", + eq_i, dev->lif_cfg.eq_count, eq); + + rc = 0; + break; + } + + dev->eq_vec[eq_i] = eq; + } + + dev->lif_cfg.eq_count = eq_i; + + dev->aq_vec = kmalloc_array(dev->lif_cfg.aq_count, sizeof(*dev->aq_vec), + GFP_KERNEL); + if (!dev->aq_vec) { + rc = -ENOMEM; + goto out; + } + + /* Create one CQ per AQ */ + for (aq_i = 0; aq_i < dev->lif_cfg.aq_count; ++aq_i) { + vcq = ionic_create_rdma_admincq(dev, aq_i % eq_i); + if (IS_ERR(vcq)) { + rc = PTR_ERR(vcq); + + if (!aq_i) { + ibdev_err(&dev->ibdev, + "failed to create acq %pe\n", vcq); + goto out; + } + + /* ok, just fewer adminq than device supports */ + ibdev_dbg(&dev->ibdev, "acq count %d want %d rc %pe\n", + aq_i, dev->lif_cfg.aq_count, vcq); + break; + } + + aq = ionic_create_rdma_adminq(dev, aq_i + dev->lif_cfg.aq_base, + vcq->cq[0].cqid); + if (IS_ERR(aq)) { + /* Clean up the dangling CQ */ + ionic_destroy_cq_common(dev, &vcq->cq[0]); + 
kfree(vcq); + + rc = PTR_ERR(aq); + + if (!aq_i) { + ibdev_err(&dev->ibdev, + "failed to create aq %pe\n", aq); + goto out; + } + + /* ok, just fewer adminq than device supports */ + ibdev_dbg(&dev->ibdev, "aq count %d want %d rc %pe\n", + aq_i, dev->lif_cfg.aq_count, aq); + break; + } + + vcq->ibcq.cq_context = aq; + aq->vcq = vcq; + + atomic_set(&aq->admin_state, IONIC_ADMIN_ACTIVE); + dev->aq_vec[aq_i] = aq; + } + + atomic_set(&dev->admin_state, IONIC_ADMIN_ACTIVE); +out: + dev->lif_cfg.eq_count = eq_i; + dev->lif_cfg.aq_count = aq_i; + + return rc; +} + +void ionic_destroy_rdma_admin(struct ionic_ibdev *dev) +{ + struct ionic_vcq *vcq; + struct ionic_aq *aq; + struct ionic_eq *eq; + + /* + * Killing the admin before destroy makes sure all admin and + * completions are flushed. admin_state = IONIC_ADMIN_KILLED + * stops queueing up further works. + */ + cancel_delayed_work_sync(&dev->admin_dwork); + cancel_work_sync(&dev->reset_work); + + if (dev->aq_vec) { + while (dev->lif_cfg.aq_count > 0) { + aq = dev->aq_vec[--dev->lif_cfg.aq_count]; + vcq = aq->vcq; + + cancel_work_sync(&aq->work); + + __ionic_destroy_rdma_adminq(dev, aq); + if (vcq) { + ionic_destroy_cq_common(dev, &vcq->cq[0]); + kfree(vcq); + } + } + + kfree(dev->aq_vec); + } + + if (dev->eq_vec) { + while (dev->lif_cfg.eq_count > 0) { + eq = dev->eq_vec[--dev->lif_cfg.eq_count]; + ionic_destroy_eq(eq); + } + + kfree(dev->eq_vec); + } +} diff --git a/drivers/infiniband/hw/ionic/ionic_controlpath.c b/drivers/infiniband/hw/ionic/ionic_controlpath.c new file mode 100644 index 000000000000..ea12d9b8e125 --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_controlpath.c @@ -0,0 +1,2679 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. */ + +#include <linux/module.h> +#include <linux/printk.h> +#include <rdma/ib_addr.h> +#include <rdma/ib_cache.h> +#include <rdma/ib_user_verbs.h> +#include <ionic_api.h> + +#include "ionic_fw.h" +#include "ionic_ibdev.h" + +#define ionic_set_ecn(tos) (((tos) | 2u) & ~1u) +#define ionic_clear_ecn(tos) ((tos) & ~3u) + +static int ionic_validate_qdesc(struct ionic_qdesc *q) +{ + if (!q->addr || !q->size || !q->mask || + !q->depth_log2 || !q->stride_log2) + return -EINVAL; + + if (q->addr & (PAGE_SIZE - 1)) + return -EINVAL; + + if (q->mask != BIT(q->depth_log2) - 1) + return -EINVAL; + + if (q->size < BIT_ULL(q->depth_log2 + q->stride_log2)) + return -EINVAL; + + return 0; +} + +static u32 ionic_get_eqid(struct ionic_ibdev *dev, u32 comp_vector, u8 udma_idx) +{ + /* EQ per vector per udma, and the first eqs reserved for async events. + * The rest of the vectors can be requested for completions. 
+ */ + u32 comp_vec_count = dev->lif_cfg.eq_count / dev->lif_cfg.udma_count - 1; + + return (comp_vector % comp_vec_count + 1) * dev->lif_cfg.udma_count + udma_idx; +} + +static int ionic_get_cqid(struct ionic_ibdev *dev, u32 *cqid, u8 udma_idx) +{ + unsigned int size, base, bound; + int rc; + + size = dev->lif_cfg.cq_count / dev->lif_cfg.udma_count; + base = size * udma_idx; + bound = base + size; + + rc = ionic_resid_get_shared(&dev->inuse_cqid, base, bound); + if (rc >= 0) { + /* cq_base is zero or a multiple of two queue groups */ + *cqid = dev->lif_cfg.cq_base + + ionic_bitid_to_qid(rc, dev->lif_cfg.udma_qgrp_shift, + dev->half_cqid_udma_shift); + + rc = 0; + } + + return rc; +} + +static void ionic_put_cqid(struct ionic_ibdev *dev, u32 cqid) +{ + u32 bitid = ionic_qid_to_bitid(cqid - dev->lif_cfg.cq_base, + dev->lif_cfg.udma_qgrp_shift, + dev->half_cqid_udma_shift); + + ionic_resid_put(&dev->inuse_cqid, bitid); +} + +int ionic_create_cq_common(struct ionic_vcq *vcq, + struct ionic_tbl_buf *buf, + const struct ib_cq_init_attr *attr, + struct ionic_ctx *ctx, + struct ib_udata *udata, + struct ionic_qdesc *req_cq, + __u32 *resp_cqid, + int udma_idx) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(vcq->ibcq.device); + struct ionic_cq *cq = &vcq->cq[udma_idx]; + void *entry; + int rc; + + cq->vcq = vcq; + + if (attr->cqe < 1 || attr->cqe + IONIC_CQ_GRACE > 0xffff) { + rc = -EINVAL; + goto err_args; + } + + rc = ionic_get_cqid(dev, &cq->cqid, udma_idx); + if (rc) + goto err_args; + + cq->eqid = ionic_get_eqid(dev, attr->comp_vector, udma_idx); + + spin_lock_init(&cq->lock); + INIT_LIST_HEAD(&cq->poll_sq); + INIT_LIST_HEAD(&cq->flush_sq); + INIT_LIST_HEAD(&cq->flush_rq); + + if (udata) { + rc = ionic_validate_qdesc(req_cq); + if (rc) + goto err_qdesc; + + cq->umem = ib_umem_get(&dev->ibdev, req_cq->addr, req_cq->size, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(cq->umem)) { + rc = PTR_ERR(cq->umem); + goto err_qdesc; + } + + cq->q.ptr = NULL; + cq->q.size = req_cq->size; + cq->q.mask = req_cq->mask; + cq->q.depth_log2 = req_cq->depth_log2; + cq->q.stride_log2 = req_cq->stride_log2; + + *resp_cqid = cq->cqid; + } else { + rc = ionic_queue_init(&cq->q, dev->lif_cfg.hwdev, + attr->cqe + IONIC_CQ_GRACE, + sizeof(struct ionic_v1_cqe)); + if (rc) + goto err_q_init; + + ionic_queue_dbell_init(&cq->q, cq->cqid); + cq->color = true; + cq->credit = cq->q.mask; + } + + rc = ionic_pgtbl_init(dev, buf, cq->umem, cq->q.dma, 1, PAGE_SIZE); + if (rc) + goto err_pgtbl_init; + + init_completion(&cq->cq_rel_comp); + kref_init(&cq->cq_kref); + + entry = xa_store_irq(&dev->cq_tbl, cq->cqid, cq, GFP_KERNEL); + if (entry) { + if (!xa_is_err(entry)) + rc = -EINVAL; + else + rc = xa_err(entry); + + goto err_xa; + } + + return 0; + +err_xa: + ionic_pgtbl_unbuf(dev, buf); +err_pgtbl_init: + if (!udata) + ionic_queue_destroy(&cq->q, dev->lif_cfg.hwdev); +err_q_init: + if (cq->umem) + ib_umem_release(cq->umem); +err_qdesc: + ionic_put_cqid(dev, cq->cqid); +err_args: + cq->vcq = NULL; + + return rc; +} + +void ionic_destroy_cq_common(struct ionic_ibdev *dev, struct ionic_cq *cq) +{ + if (!cq->vcq) + return; + + xa_erase_irq(&dev->cq_tbl, cq->cqid); + + kref_put(&cq->cq_kref, ionic_cq_complete); + wait_for_completion(&cq->cq_rel_comp); + + if (cq->umem) + ib_umem_release(cq->umem); + else + ionic_queue_destroy(&cq->q, dev->lif_cfg.hwdev); + + ionic_put_cqid(dev, cq->cqid); + + cq->vcq = NULL; +} + +static int ionic_validate_qdesc_zero(struct ionic_qdesc *q) +{ + if (q->addr || q->size || q->mask || q->depth_log2 || 
q->stride_log2) + return -EINVAL; + + return 0; +} + +static int ionic_get_pdid(struct ionic_ibdev *dev, u32 *pdid) +{ + int rc; + + rc = ionic_resid_get(&dev->inuse_pdid); + if (rc < 0) + return rc; + + *pdid = rc; + return 0; +} + +static int ionic_get_ahid(struct ionic_ibdev *dev, u32 *ahid) +{ + int rc; + + rc = ionic_resid_get(&dev->inuse_ahid); + if (rc < 0) + return rc; + + *ahid = rc; + return 0; +} + +static int ionic_get_mrid(struct ionic_ibdev *dev, u32 *mrid) +{ + int rc; + + /* wrap to 1, skip reserved lkey */ + rc = ionic_resid_get_shared(&dev->inuse_mrid, 1, + dev->inuse_mrid.inuse_size); + if (rc < 0) + return rc; + + *mrid = ionic_mrid(rc, dev->next_mrkey++); + return 0; +} + +static int ionic_get_gsi_qpid(struct ionic_ibdev *dev, u32 *qpid) +{ + int rc = 0; + + rc = ionic_resid_get_shared(&dev->inuse_qpid, IB_QPT_GSI, IB_QPT_GSI + 1); + if (rc < 0) + return rc; + + *qpid = IB_QPT_GSI; + return 0; +} + +static int ionic_get_qpid(struct ionic_ibdev *dev, u32 *qpid, + u8 *udma_idx, u8 udma_mask) +{ + unsigned int size, base, bound; + int udma_i, udma_x, udma_ix; + int rc = -EINVAL; + + udma_x = dev->next_qpid_udma_idx; + + dev->next_qpid_udma_idx ^= dev->lif_cfg.udma_count - 1; + + for (udma_i = 0; udma_i < dev->lif_cfg.udma_count; ++udma_i) { + udma_ix = udma_i ^ udma_x; + + if (!(udma_mask & BIT(udma_ix))) + continue; + + size = dev->lif_cfg.qp_count / dev->lif_cfg.udma_count; + base = size * udma_ix; + bound = base + size; + + /* skip reserved SMI and GSI qpids in group zero */ + if (!base) + base = 2; + + rc = ionic_resid_get_shared(&dev->inuse_qpid, base, bound); + if (rc >= 0) { + *qpid = ionic_bitid_to_qid(rc, + dev->lif_cfg.udma_qgrp_shift, + dev->half_qpid_udma_shift); + *udma_idx = udma_ix; + + rc = 0; + break; + } + } + + return rc; +} + +static int ionic_get_dbid(struct ionic_ibdev *dev, u32 *dbid, phys_addr_t *addr) +{ + int rc, dbpage_num; + + /* wrap to 1, skip kernel reserved */ + rc = ionic_resid_get_shared(&dev->inuse_dbid, 1, + dev->inuse_dbid.inuse_size); + if (rc < 0) + return rc; + + dbpage_num = (dev->lif_cfg.lif_hw_index * dev->lif_cfg.dbid_count) + rc; + *addr = dev->lif_cfg.db_phys + ((phys_addr_t)dbpage_num << PAGE_SHIFT); + + *dbid = rc; + + return 0; +} + +static void ionic_put_pdid(struct ionic_ibdev *dev, u32 pdid) +{ + ionic_resid_put(&dev->inuse_pdid, pdid); +} + +static void ionic_put_ahid(struct ionic_ibdev *dev, u32 ahid) +{ + ionic_resid_put(&dev->inuse_ahid, ahid); +} + +static void ionic_put_mrid(struct ionic_ibdev *dev, u32 mrid) +{ + ionic_resid_put(&dev->inuse_mrid, ionic_mrid_index(mrid)); +} + +static void ionic_put_qpid(struct ionic_ibdev *dev, u32 qpid) +{ + u32 bitid = ionic_qid_to_bitid(qpid, + dev->lif_cfg.udma_qgrp_shift, + dev->half_qpid_udma_shift); + + ionic_resid_put(&dev->inuse_qpid, bitid); +} + +static void ionic_put_dbid(struct ionic_ibdev *dev, u32 dbid) +{ + ionic_resid_put(&dev->inuse_dbid, dbid); +} + +static struct rdma_user_mmap_entry* +ionic_mmap_entry_insert(struct ionic_ctx *ctx, unsigned long size, + unsigned long pfn, u8 mmap_flags, u64 *offset) +{ + struct ionic_mmap_entry *entry; + int rc; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return NULL; + + entry->size = size; + entry->pfn = pfn; + entry->mmap_flags = mmap_flags; + + rc = rdma_user_mmap_entry_insert(&ctx->ibctx, &entry->rdma_entry, + entry->size); + if (rc) { + kfree(entry); + return NULL; + } + + if (offset) + *offset = rdma_user_mmap_get_offset(&entry->rdma_entry); + + return &entry->rdma_entry; +} + +int 
ionic_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibctx->device); + struct ionic_ctx *ctx = to_ionic_ctx(ibctx); + struct ionic_ctx_resp resp = {}; + struct ionic_ctx_req req; + phys_addr_t db_phys = 0; + int rc; + + rc = ib_copy_from_udata(&req, udata, sizeof(req)); + if (rc) + return rc; + + /* try to allocate dbid for user ctx */ + rc = ionic_get_dbid(dev, &ctx->dbid, &db_phys); + if (rc < 0) + return rc; + + ibdev_dbg(&dev->ibdev, "user space dbid %u\n", ctx->dbid); + + ctx->mmap_dbell = ionic_mmap_entry_insert(ctx, PAGE_SIZE, + PHYS_PFN(db_phys), 0, NULL); + if (!ctx->mmap_dbell) { + rc = -ENOMEM; + goto err_mmap_dbell; + } + + resp.page_shift = PAGE_SHIFT; + + resp.dbell_offset = db_phys & ~PAGE_MASK; + + resp.version = dev->lif_cfg.rdma_version; + resp.qp_opcodes = dev->lif_cfg.qp_opcodes; + resp.admin_opcodes = dev->lif_cfg.admin_opcodes; + + resp.sq_qtype = dev->lif_cfg.sq_qtype; + resp.rq_qtype = dev->lif_cfg.rq_qtype; + resp.cq_qtype = dev->lif_cfg.cq_qtype; + resp.admin_qtype = dev->lif_cfg.aq_qtype; + resp.max_stride = dev->lif_cfg.max_stride; + resp.max_spec = IONIC_SPEC_HIGH; + + resp.udma_count = dev->lif_cfg.udma_count; + resp.expdb_mask = dev->lif_cfg.expdb_mask; + + if (dev->lif_cfg.sq_expdb) + resp.expdb_qtypes |= IONIC_EXPDB_SQ; + if (dev->lif_cfg.rq_expdb) + resp.expdb_qtypes |= IONIC_EXPDB_RQ; + + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) + goto err_resp; + + return 0; + +err_resp: + rdma_user_mmap_entry_remove(ctx->mmap_dbell); +err_mmap_dbell: + ionic_put_dbid(dev, ctx->dbid); + + return rc; +} + +void ionic_dealloc_ucontext(struct ib_ucontext *ibctx) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibctx->device); + struct ionic_ctx *ctx = to_ionic_ctx(ibctx); + + rdma_user_mmap_entry_remove(ctx->mmap_dbell); + ionic_put_dbid(dev, ctx->dbid); +} + +int ionic_mmap(struct ib_ucontext *ibctx, struct vm_area_struct *vma) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibctx->device); + struct ionic_ctx *ctx = to_ionic_ctx(ibctx); + struct rdma_user_mmap_entry *rdma_entry; + struct ionic_mmap_entry *ionic_entry; + int rc = 0; + + rdma_entry = rdma_user_mmap_entry_get(&ctx->ibctx, vma); + if (!rdma_entry) { + ibdev_dbg(&dev->ibdev, "not found %#lx\n", + vma->vm_pgoff << PAGE_SHIFT); + return -EINVAL; + } + + ionic_entry = container_of(rdma_entry, struct ionic_mmap_entry, + rdma_entry); + + ibdev_dbg(&dev->ibdev, "writecombine? 
%d\n", + ionic_entry->mmap_flags & IONIC_MMAP_WC); + if (ionic_entry->mmap_flags & IONIC_MMAP_WC) + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + else + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + ibdev_dbg(&dev->ibdev, "remap st %#lx pf %#lx sz %#lx\n", + vma->vm_start, ionic_entry->pfn, ionic_entry->size); + rc = rdma_user_mmap_io(&ctx->ibctx, vma, ionic_entry->pfn, + ionic_entry->size, vma->vm_page_prot, + rdma_entry); + if (rc) + ibdev_dbg(&dev->ibdev, "remap failed %d\n", rc); + + rdma_user_mmap_entry_put(rdma_entry); + return rc; +} + +void ionic_mmap_free(struct rdma_user_mmap_entry *rdma_entry) +{ + struct ionic_mmap_entry *ionic_entry; + + ionic_entry = container_of(rdma_entry, struct ionic_mmap_entry, + rdma_entry); + kfree(ionic_entry); +} + +int ionic_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibpd->device); + struct ionic_pd *pd = to_ionic_pd(ibpd); + + return ionic_get_pdid(dev, &pd->pdid); +} + +int ionic_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibpd->device); + struct ionic_pd *pd = to_ionic_pd(ibpd); + + ionic_put_pdid(dev, pd->pdid); + + return 0; +} + +static int ionic_build_hdr(struct ionic_ibdev *dev, + struct ib_ud_header *hdr, + const struct rdma_ah_attr *attr, + u16 sport, bool want_ecn) +{ + const struct ib_global_route *grh; + enum rdma_network_type net; + u16 vlan; + int rc; + + if (attr->ah_flags != IB_AH_GRH) + return -EINVAL; + if (attr->type != RDMA_AH_ATTR_TYPE_ROCE) + return -EINVAL; + + grh = rdma_ah_read_grh(attr); + + rc = rdma_read_gid_l2_fields(grh->sgid_attr, &vlan, &hdr->eth.smac_h[0]); + if (rc) + return rc; + + net = rdma_gid_attr_network_type(grh->sgid_attr); + + rc = ib_ud_header_init(0, /* no payload */ + 0, /* no lrh */ + 1, /* yes eth */ + vlan != 0xffff, + 0, /* no grh */ + net == RDMA_NETWORK_IPV4 ? 
4 : 6, + 1, /* yes udp */ + 0, /* no imm */ + hdr); + if (rc) + return rc; + + ether_addr_copy(hdr->eth.dmac_h, attr->roce.dmac); + + if (net == RDMA_NETWORK_IPV4) { + hdr->eth.type = cpu_to_be16(ETH_P_IP); + hdr->ip4.frag_off = cpu_to_be16(0x4000); /* don't fragment */ + hdr->ip4.ttl = grh->hop_limit; + hdr->ip4.tot_len = cpu_to_be16(0xffff); + hdr->ip4.saddr = + *(const __be32 *)(grh->sgid_attr->gid.raw + 12); + hdr->ip4.daddr = *(const __be32 *)(grh->dgid.raw + 12); + + if (want_ecn) + hdr->ip4.tos = ionic_set_ecn(grh->traffic_class); + else + hdr->ip4.tos = ionic_clear_ecn(grh->traffic_class); + } else { + hdr->eth.type = cpu_to_be16(ETH_P_IPV6); + hdr->grh.flow_label = cpu_to_be32(grh->flow_label); + hdr->grh.hop_limit = grh->hop_limit; + hdr->grh.source_gid = grh->sgid_attr->gid; + hdr->grh.destination_gid = grh->dgid; + + if (want_ecn) + hdr->grh.traffic_class = + ionic_set_ecn(grh->traffic_class); + else + hdr->grh.traffic_class = + ionic_clear_ecn(grh->traffic_class); + } + + if (vlan != 0xffff) { + vlan |= rdma_ah_get_sl(attr) << VLAN_PRIO_SHIFT; + hdr->vlan.tag = cpu_to_be16(vlan); + hdr->vlan.type = hdr->eth.type; + hdr->eth.type = cpu_to_be16(ETH_P_8021Q); + } + + hdr->udp.sport = cpu_to_be16(sport); + hdr->udp.dport = cpu_to_be16(ROCE_V2_UDP_DPORT); + + return 0; +} + +static void ionic_set_ah_attr(struct ionic_ibdev *dev, + struct rdma_ah_attr *ah_attr, + struct ib_ud_header *hdr, + int sgid_index) +{ + u32 flow_label; + u16 vlan = 0; + u8 tos, ttl; + + if (hdr->vlan_present) + vlan = be16_to_cpu(hdr->vlan.tag); + + if (hdr->ipv4_present) { + flow_label = 0; + ttl = hdr->ip4.ttl; + tos = hdr->ip4.tos; + *(__be16 *)(hdr->grh.destination_gid.raw + 10) = cpu_to_be16(0xffff); + *(__be32 *)(hdr->grh.destination_gid.raw + 12) = hdr->ip4.daddr; + } else { + flow_label = be32_to_cpu(hdr->grh.flow_label); + ttl = hdr->grh.hop_limit; + tos = hdr->grh.traffic_class; + } + + memset(ah_attr, 0, sizeof(*ah_attr)); + ah_attr->type = RDMA_AH_ATTR_TYPE_ROCE; + if (hdr->eth_present) + ether_addr_copy(ah_attr->roce.dmac, hdr->eth.dmac_h); + rdma_ah_set_sl(ah_attr, vlan >> VLAN_PRIO_SHIFT); + rdma_ah_set_port_num(ah_attr, 1); + rdma_ah_set_grh(ah_attr, NULL, flow_label, sgid_index, ttl, tos); + rdma_ah_set_dgid_raw(ah_attr, &hdr->grh.destination_gid); +} + +static int ionic_create_ah_cmd(struct ionic_ibdev *dev, + struct ionic_ah *ah, + struct ionic_pd *pd, + struct rdma_ah_attr *attr, + u32 flags) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_CREATE_AH, + .len = cpu_to_le16(IONIC_ADMIN_CREATE_AH_IN_V1_LEN), + .cmd.create_ah = { + .pd_id = cpu_to_le32(pd->pdid), + .dbid_flags = cpu_to_le16(dev->lif_cfg.dbid), + .id_ver = cpu_to_le32(ah->ahid), + } + } + }; + enum ionic_admin_flags admin_flags = 0; + dma_addr_t hdr_dma = 0; + void *hdr_buf; + gfp_t gfp = GFP_ATOMIC; + int rc, hdr_len = 0; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_CREATE_AH) + return -EBADRQC; + + if (flags & RDMA_CREATE_AH_SLEEPABLE) + gfp = GFP_KERNEL; + else + admin_flags |= IONIC_ADMIN_F_BUSYWAIT; + + rc = ionic_build_hdr(dev, &ah->hdr, attr, IONIC_ROCE_UDP_SPORT, false); + if (rc) + return rc; + + if (ah->hdr.eth.type == cpu_to_be16(ETH_P_8021Q)) { + if (ah->hdr.vlan.type == cpu_to_be16(ETH_P_IP)) + wr.wqe.cmd.create_ah.csum_profile = + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_UDP; + else + wr.wqe.cmd.create_ah.csum_profile = + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV6_UDP; + } else { + if (ah->hdr.eth.type == cpu_to_be16(ETH_P_IP)) + 
wr.wqe.cmd.create_ah.csum_profile = + IONIC_TFP_CSUM_PROF_ETH_IPV4_UDP; + else + wr.wqe.cmd.create_ah.csum_profile = + IONIC_TFP_CSUM_PROF_ETH_IPV6_UDP; + } + + ah->sgid_index = rdma_ah_read_grh(attr)->sgid_index; + + hdr_buf = kmalloc(PAGE_SIZE, gfp); + if (!hdr_buf) + return -ENOMEM; + + hdr_len = ib_ud_header_pack(&ah->hdr, hdr_buf); + hdr_len -= IB_BTH_BYTES; + hdr_len -= IB_DETH_BYTES; + ibdev_dbg(&dev->ibdev, "roce packet header template\n"); + print_hex_dump_debug("hdr ", DUMP_PREFIX_OFFSET, 16, 1, + hdr_buf, hdr_len, true); + + hdr_dma = dma_map_single(dev->lif_cfg.hwdev, hdr_buf, hdr_len, + DMA_TO_DEVICE); + + rc = dma_mapping_error(dev->lif_cfg.hwdev, hdr_dma); + if (rc) + goto err_dma; + + wr.wqe.cmd.create_ah.dma_addr = cpu_to_le64(hdr_dma); + wr.wqe.cmd.create_ah.length = cpu_to_le32(hdr_len); + + ionic_admin_post(dev, &wr); + rc = ionic_admin_wait(dev, &wr, admin_flags); + + dma_unmap_single(dev->lif_cfg.hwdev, hdr_dma, hdr_len, + DMA_TO_DEVICE); +err_dma: + kfree(hdr_buf); + + return rc; +} + +static int ionic_destroy_ah_cmd(struct ionic_ibdev *dev, u32 ahid, u32 flags) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_DESTROY_AH, + .len = cpu_to_le16(IONIC_ADMIN_DESTROY_AH_IN_V1_LEN), + .cmd.destroy_ah = { + .ah_id = cpu_to_le32(ahid), + }, + } + }; + enum ionic_admin_flags admin_flags = IONIC_ADMIN_F_TEARDOWN; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_DESTROY_AH) + return -EBADRQC; + + if (!(flags & RDMA_CREATE_AH_SLEEPABLE)) + admin_flags |= IONIC_ADMIN_F_BUSYWAIT; + + ionic_admin_post(dev, &wr); + ionic_admin_wait(dev, &wr, admin_flags); + + /* No host-memory resource is associated with ah, so it is ok + * to "succeed" and complete this destroy ah on the host. 
+ */ + return 0; +} + +int ionic_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibah->device); + struct rdma_ah_attr *attr = init_attr->ah_attr; + struct ionic_pd *pd = to_ionic_pd(ibah->pd); + struct ionic_ah *ah = to_ionic_ah(ibah); + struct ionic_ah_resp resp = {}; + u32 flags = init_attr->flags; + int rc; + + rc = ionic_get_ahid(dev, &ah->ahid); + if (rc) + return rc; + + rc = ionic_create_ah_cmd(dev, ah, pd, attr, flags); + if (rc) + goto err_cmd; + + if (udata) { + resp.ahid = ah->ahid; + + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) + goto err_resp; + } + + return 0; + +err_resp: + ionic_destroy_ah_cmd(dev, ah->ahid, flags); +err_cmd: + ionic_put_ahid(dev, ah->ahid); + return rc; +} + +int ionic_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibah->device); + struct ionic_ah *ah = to_ionic_ah(ibah); + + ionic_set_ah_attr(dev, ah_attr, &ah->hdr, ah->sgid_index); + + return 0; +} + +int ionic_destroy_ah(struct ib_ah *ibah, u32 flags) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibah->device); + struct ionic_ah *ah = to_ionic_ah(ibah); + int rc; + + rc = ionic_destroy_ah_cmd(dev, ah->ahid, flags); + if (rc) + return rc; + + ionic_put_ahid(dev, ah->ahid); + + return 0; +} + +static int ionic_create_mr_cmd(struct ionic_ibdev *dev, + struct ionic_pd *pd, + struct ionic_mr *mr, + u64 addr, + u64 length) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_CREATE_MR, + .len = cpu_to_le16(IONIC_ADMIN_CREATE_MR_IN_V1_LEN), + .cmd.create_mr = { + .va = cpu_to_le64(addr), + .length = cpu_to_le64(length), + .pd_id = cpu_to_le32(pd->pdid), + .page_size_log2 = mr->buf.page_size_log2, + .tbl_index = cpu_to_le32(~0), + .map_count = cpu_to_le32(mr->buf.tbl_pages), + .dma_addr = ionic_pgtbl_dma(&mr->buf, addr), + .dbid_flags = cpu_to_le16(mr->flags), + .id_ver = cpu_to_le32(mr->mrid), + } + } + }; + int rc; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_CREATE_MR) + return -EBADRQC; + + ionic_admin_post(dev, &wr); + rc = ionic_admin_wait(dev, &wr, 0); + if (!rc) + mr->created = true; + + return rc; +} + +static int ionic_destroy_mr_cmd(struct ionic_ibdev *dev, u32 mrid) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_DESTROY_MR, + .len = cpu_to_le16(IONIC_ADMIN_DESTROY_MR_IN_V1_LEN), + .cmd.destroy_mr = { + .mr_id = cpu_to_le32(mrid), + }, + } + }; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_DESTROY_MR) + return -EBADRQC; + + ionic_admin_post(dev, &wr); + + return ionic_admin_wait(dev, &wr, IONIC_ADMIN_F_TEARDOWN); +} + +struct ib_mr *ionic_get_dma_mr(struct ib_pd *ibpd, int access) +{ + struct ionic_pd *pd = to_ionic_pd(ibpd); + struct ionic_mr *mr; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->ibmr.lkey = IONIC_DMA_LKEY; + mr->ibmr.rkey = IONIC_DMA_RKEY; + + if (pd) + pd->flags |= IONIC_QPF_PRIVILEGED; + + return &mr->ibmr; +} + +struct ib_mr *ionic_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 addr, int access, struct ib_dmah *dmah, + struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibpd->device); + struct ionic_pd *pd = to_ionic_pd(ibpd); + struct ionic_mr *mr; + unsigned long pg_sz; + int rc; + + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return 
ERR_PTR(-ENOMEM); + + rc = ionic_get_mrid(dev, &mr->mrid); + if (rc) + goto err_mrid; + + mr->ibmr.lkey = mr->mrid; + mr->ibmr.rkey = mr->mrid; + mr->ibmr.iova = addr; + mr->ibmr.length = length; + + mr->flags = IONIC_MRF_USER_MR | to_ionic_mr_flags(access); + + mr->umem = ib_umem_get(&dev->ibdev, start, length, access); + if (IS_ERR(mr->umem)) { + rc = PTR_ERR(mr->umem); + goto err_umem; + } + + pg_sz = ib_umem_find_best_pgsz(mr->umem, + dev->lif_cfg.page_size_supported, + addr); + if (!pg_sz) { + rc = -EINVAL; + goto err_pgtbl; + } + + rc = ionic_pgtbl_init(dev, &mr->buf, mr->umem, 0, 1, pg_sz); + if (rc) + goto err_pgtbl; + + rc = ionic_create_mr_cmd(dev, pd, mr, addr, length); + if (rc) + goto err_cmd; + + ionic_pgtbl_unbuf(dev, &mr->buf); + + return &mr->ibmr; + +err_cmd: + ionic_pgtbl_unbuf(dev, &mr->buf); +err_pgtbl: + ib_umem_release(mr->umem); +err_umem: + ionic_put_mrid(dev, mr->mrid); +err_mrid: + kfree(mr); + return ERR_PTR(rc); +} + +struct ib_mr *ionic_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 offset, + u64 length, u64 addr, int fd, int access, + struct ib_dmah *dmah, + struct uverbs_attr_bundle *attrs) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibpd->device); + struct ionic_pd *pd = to_ionic_pd(ibpd); + struct ib_umem_dmabuf *umem_dmabuf; + struct ionic_mr *mr; + u64 pg_sz; + int rc; + + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + rc = ionic_get_mrid(dev, &mr->mrid); + if (rc) + goto err_mrid; + + mr->ibmr.lkey = mr->mrid; + mr->ibmr.rkey = mr->mrid; + mr->ibmr.iova = addr; + mr->ibmr.length = length; + + mr->flags = IONIC_MRF_USER_MR | to_ionic_mr_flags(access); + + umem_dmabuf = ib_umem_dmabuf_get_pinned(&dev->ibdev, offset, length, + fd, access); + if (IS_ERR(umem_dmabuf)) { + rc = PTR_ERR(umem_dmabuf); + goto err_umem; + } + + mr->umem = &umem_dmabuf->umem; + + pg_sz = ib_umem_find_best_pgsz(mr->umem, + dev->lif_cfg.page_size_supported, + addr); + if (!pg_sz) { + rc = -EINVAL; + goto err_pgtbl; + } + + rc = ionic_pgtbl_init(dev, &mr->buf, mr->umem, 0, 1, pg_sz); + if (rc) + goto err_pgtbl; + + rc = ionic_create_mr_cmd(dev, pd, mr, addr, length); + if (rc) + goto err_cmd; + + ionic_pgtbl_unbuf(dev, &mr->buf); + + return &mr->ibmr; + +err_cmd: + ionic_pgtbl_unbuf(dev, &mr->buf); +err_pgtbl: + ib_umem_release(mr->umem); +err_umem: + ionic_put_mrid(dev, mr->mrid); +err_mrid: + kfree(mr); + return ERR_PTR(rc); +} + +int ionic_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibmr->device); + struct ionic_mr *mr = to_ionic_mr(ibmr); + int rc; + + if (!mr->ibmr.lkey) + goto out; + + if (mr->created) { + rc = ionic_destroy_mr_cmd(dev, mr->mrid); + if (rc) + return rc; + } + + ionic_pgtbl_unbuf(dev, &mr->buf); + + if (mr->umem) + ib_umem_release(mr->umem); + + ionic_put_mrid(dev, mr->mrid); + +out: + kfree(mr); + + return 0; +} + +struct ib_mr *ionic_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type type, + u32 max_sg) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibpd->device); + struct ionic_pd *pd = to_ionic_pd(ibpd); + struct ionic_mr *mr; + int rc; + + if (type != IB_MR_TYPE_MEM_REG) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + rc = ionic_get_mrid(dev, &mr->mrid); + if (rc) + goto err_mrid; + + mr->ibmr.lkey = mr->mrid; + mr->ibmr.rkey = mr->mrid; + + mr->flags = IONIC_MRF_PHYS_MR; + + rc = ionic_pgtbl_init(dev, &mr->buf, mr->umem, 0, max_sg, PAGE_SIZE); + if (rc) + goto err_pgtbl; 
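+	/* Pages are added later via ionic_map_mr_sg(); create with an empty table */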
+ + mr->buf.tbl_pages = 0; + + rc = ionic_create_mr_cmd(dev, pd, mr, 0, 0); + if (rc) + goto err_cmd; + + return &mr->ibmr; + +err_cmd: + ionic_pgtbl_unbuf(dev, &mr->buf); +err_pgtbl: + ionic_put_mrid(dev, mr->mrid); +err_mrid: + kfree(mr); + return ERR_PTR(rc); +} + +static int ionic_map_mr_page(struct ib_mr *ibmr, u64 dma) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibmr->device); + struct ionic_mr *mr = to_ionic_mr(ibmr); + + ibdev_dbg(&dev->ibdev, "dma %p\n", (void *)dma); + return ionic_pgtbl_page(&mr->buf, dma); +} + +int ionic_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibmr->device); + struct ionic_mr *mr = to_ionic_mr(ibmr); + int rc; + + /* mr must be allocated using ib_alloc_mr() */ + if (unlikely(!mr->buf.tbl_limit)) + return -EINVAL; + + mr->buf.tbl_pages = 0; + + if (mr->buf.tbl_buf) + dma_sync_single_for_cpu(dev->lif_cfg.hwdev, mr->buf.tbl_dma, + mr->buf.tbl_size, DMA_TO_DEVICE); + + ibdev_dbg(&dev->ibdev, "sg %p nent %d\n", sg, sg_nents); + rc = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, ionic_map_mr_page); + + mr->buf.page_size_log2 = order_base_2(ibmr->page_size); + + if (mr->buf.tbl_buf) + dma_sync_single_for_device(dev->lif_cfg.hwdev, mr->buf.tbl_dma, + mr->buf.tbl_size, DMA_TO_DEVICE); + + return rc; +} + +int ionic_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibmw->device); + struct ionic_pd *pd = to_ionic_pd(ibmw->pd); + struct ionic_mr *mr = to_ionic_mw(ibmw); + int rc; + + rc = ionic_get_mrid(dev, &mr->mrid); + if (rc) + return rc; + + mr->ibmw.rkey = mr->mrid; + + if (mr->ibmw.type == IB_MW_TYPE_1) + mr->flags = IONIC_MRF_MW_1; + else + mr->flags = IONIC_MRF_MW_2; + + rc = ionic_create_mr_cmd(dev, pd, mr, 0, 0); + if (rc) + goto err_cmd; + + return 0; + +err_cmd: + ionic_put_mrid(dev, mr->mrid); + return rc; +} + +int ionic_dealloc_mw(struct ib_mw *ibmw) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibmw->device); + struct ionic_mr *mr = to_ionic_mw(ibmw); + int rc; + + rc = ionic_destroy_mr_cmd(dev, mr->mrid); + if (rc) + return rc; + + ionic_put_mrid(dev, mr->mrid); + + return 0; +} + +static int ionic_create_cq_cmd(struct ionic_ibdev *dev, + struct ionic_ctx *ctx, + struct ionic_cq *cq, + struct ionic_tbl_buf *buf) +{ + const u16 dbid = ionic_ctx_dbid(dev, ctx); + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_CREATE_CQ, + .len = cpu_to_le16(IONIC_ADMIN_CREATE_CQ_IN_V1_LEN), + .cmd.create_cq = { + .eq_id = cpu_to_le32(cq->eqid), + .depth_log2 = cq->q.depth_log2, + .stride_log2 = cq->q.stride_log2, + .page_size_log2 = buf->page_size_log2, + .tbl_index = cpu_to_le32(~0), + .map_count = cpu_to_le32(buf->tbl_pages), + .dma_addr = ionic_pgtbl_dma(buf, 0), + .dbid_flags = cpu_to_le16(dbid), + .id_ver = cpu_to_le32(cq->cqid), + } + } + }; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_CREATE_CQ) + return -EBADRQC; + + ionic_admin_post(dev, &wr); + + return ionic_admin_wait(dev, &wr, 0); +} + +static int ionic_destroy_cq_cmd(struct ionic_ibdev *dev, u32 cqid) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_DESTROY_CQ, + .len = cpu_to_le16(IONIC_ADMIN_DESTROY_CQ_IN_V1_LEN), + .cmd.destroy_cq = { + .cq_id = cpu_to_le32(cqid), + }, + } + }; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_DESTROY_CQ) + return -EBADRQC; + + ionic_admin_post(dev, &wr); + + return 
ionic_admin_wait(dev, &wr, IONIC_ADMIN_F_TEARDOWN); +} + +int ionic_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct uverbs_attr_bundle *attrs) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibcq->device); + struct ib_udata *udata = &attrs->driver_udata; + struct ionic_ctx *ctx = + rdma_udata_to_drv_context(udata, struct ionic_ctx, ibctx); + struct ionic_vcq *vcq = to_ionic_vcq(ibcq); + struct ionic_tbl_buf buf = {}; + struct ionic_cq_resp resp; + struct ionic_cq_req req; + int udma_idx = 0, rc; + + if (udata) { + rc = ib_copy_from_udata(&req, udata, sizeof(req)); + if (rc) + return rc; + } + + vcq->udma_mask = BIT(dev->lif_cfg.udma_count) - 1; + + if (udata) + vcq->udma_mask &= req.udma_mask; + + if (!vcq->udma_mask) { + rc = -EINVAL; + goto err_init; + } + + for (; udma_idx < dev->lif_cfg.udma_count; ++udma_idx) { + if (!(vcq->udma_mask & BIT(udma_idx))) + continue; + + rc = ionic_create_cq_common(vcq, &buf, attr, ctx, udata, + &req.cq[udma_idx], + &resp.cqid[udma_idx], + udma_idx); + if (rc) + goto err_init; + + rc = ionic_create_cq_cmd(dev, ctx, &vcq->cq[udma_idx], &buf); + if (rc) + goto err_cmd; + + ionic_pgtbl_unbuf(dev, &buf); + } + + vcq->ibcq.cqe = attr->cqe; + + if (udata) { + resp.udma_mask = vcq->udma_mask; + + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) + goto err_resp; + } + + return 0; + +err_resp: + while (udma_idx) { + --udma_idx; + if (!(vcq->udma_mask & BIT(udma_idx))) + continue; + ionic_destroy_cq_cmd(dev, vcq->cq[udma_idx].cqid); +err_cmd: + ionic_pgtbl_unbuf(dev, &buf); + ionic_destroy_cq_common(dev, &vcq->cq[udma_idx]); +err_init: + ; + } + + return rc; +} + +int ionic_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibcq->device); + struct ionic_vcq *vcq = to_ionic_vcq(ibcq); + int udma_idx, rc_tmp, rc = 0; + + for (udma_idx = dev->lif_cfg.udma_count; udma_idx; ) { + --udma_idx; + + if (!(vcq->udma_mask & BIT(udma_idx))) + continue; + + rc_tmp = ionic_destroy_cq_cmd(dev, vcq->cq[udma_idx].cqid); + if (rc_tmp) { + if (!rc) + rc = rc_tmp; + + continue; + } + + ionic_destroy_cq_common(dev, &vcq->cq[udma_idx]); + } + + return rc; +} + +static bool pd_remote_privileged(struct ib_pd *pd) +{ + return pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY; +} + +static int ionic_create_qp_cmd(struct ionic_ibdev *dev, + struct ionic_pd *pd, + struct ionic_cq *send_cq, + struct ionic_cq *recv_cq, + struct ionic_qp *qp, + struct ionic_tbl_buf *sq_buf, + struct ionic_tbl_buf *rq_buf, + struct ib_qp_init_attr *attr) +{ + const u16 dbid = ionic_obj_dbid(dev, pd->ibpd.uobject); + const u32 flags = to_ionic_qp_flags(0, 0, + qp->sq_cmb & IONIC_CMB_ENABLE, + qp->rq_cmb & IONIC_CMB_ENABLE, + qp->sq_spec, qp->rq_spec, + pd->flags & IONIC_QPF_PRIVILEGED, + pd_remote_privileged(&pd->ibpd)); + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_CREATE_QP, + .len = cpu_to_le16(IONIC_ADMIN_CREATE_QP_IN_V1_LEN), + .cmd.create_qp = { + .pd_id = cpu_to_le32(pd->pdid), + .priv_flags = cpu_to_be32(flags), + .type_state = to_ionic_qp_type(attr->qp_type), + .dbid_flags = cpu_to_le16(dbid), + .id_ver = cpu_to_le32(qp->qpid), + } + } + }; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_CREATE_QP) + return -EBADRQC; + + if (qp->has_sq) { + wr.wqe.cmd.create_qp.sq_cq_id = cpu_to_le32(send_cq->cqid); + wr.wqe.cmd.create_qp.sq_depth_log2 = qp->sq.depth_log2; + wr.wqe.cmd.create_qp.sq_stride_log2 = qp->sq.stride_log2; + wr.wqe.cmd.create_qp.sq_page_size_log2 = 
sq_buf->page_size_log2; + wr.wqe.cmd.create_qp.sq_tbl_index_xrcd_id = cpu_to_le32(~0); + wr.wqe.cmd.create_qp.sq_map_count = + cpu_to_le32(sq_buf->tbl_pages); + wr.wqe.cmd.create_qp.sq_dma_addr = ionic_pgtbl_dma(sq_buf, 0); + } + + if (qp->has_rq) { + wr.wqe.cmd.create_qp.rq_cq_id = cpu_to_le32(recv_cq->cqid); + wr.wqe.cmd.create_qp.rq_depth_log2 = qp->rq.depth_log2; + wr.wqe.cmd.create_qp.rq_stride_log2 = qp->rq.stride_log2; + wr.wqe.cmd.create_qp.rq_page_size_log2 = rq_buf->page_size_log2; + wr.wqe.cmd.create_qp.rq_tbl_index_srq_id = cpu_to_le32(~0); + wr.wqe.cmd.create_qp.rq_map_count = + cpu_to_le32(rq_buf->tbl_pages); + wr.wqe.cmd.create_qp.rq_dma_addr = ionic_pgtbl_dma(rq_buf, 0); + } + + ionic_admin_post(dev, &wr); + + return ionic_admin_wait(dev, &wr, 0); +} + +static int ionic_modify_qp_cmd(struct ionic_ibdev *dev, + struct ionic_pd *pd, + struct ionic_qp *qp, + struct ib_qp_attr *attr, + int mask) +{ + const u32 flags = to_ionic_qp_flags(attr->qp_access_flags, + attr->en_sqd_async_notify, + qp->sq_cmb & IONIC_CMB_ENABLE, + qp->rq_cmb & IONIC_CMB_ENABLE, + qp->sq_spec, qp->rq_spec, + pd->flags & IONIC_QPF_PRIVILEGED, + pd_remote_privileged(qp->ibqp.pd)); + const u8 state = to_ionic_qp_modify_state(attr->qp_state, + attr->cur_qp_state); + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_MODIFY_QP, + .len = cpu_to_le16(IONIC_ADMIN_MODIFY_QP_IN_V1_LEN), + .cmd.mod_qp = { + .attr_mask = cpu_to_be32(mask), + .access_flags = cpu_to_be16(flags), + .rq_psn = cpu_to_le32(attr->rq_psn), + .sq_psn = cpu_to_le32(attr->sq_psn), + .rate_limit_kbps = + cpu_to_le32(attr->rate_limit), + .pmtu = (attr->path_mtu + 7), + .retry = (attr->retry_cnt | + (attr->rnr_retry << 4)), + .rnr_timer = attr->min_rnr_timer, + .retry_timeout = attr->timeout, + .type_state = state, + .id_ver = cpu_to_le32(qp->qpid), + } + } + }; + const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr); + void *hdr_buf = NULL; + dma_addr_t hdr_dma = 0; + int rc, hdr_len = 0; + u16 sport; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_MODIFY_QP) + return -EBADRQC; + + if ((mask & IB_QP_MAX_DEST_RD_ATOMIC) && attr->max_dest_rd_atomic) { + /* Note, round up/down was already done for allocating + * resources on the device. The allocation order is in cache + * line size. We can't use the order of the resource + * allocation to determine the order wqes here, because for + * queue length <= one cache line it is not distinct. + * + * Therefore, order wqes is computed again here. + * + * Account for hole and round up to the next order. 
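+		 * For example, max_dest_rd_atomic = 8 becomes
+		 * order_base_2(8 + 1) = 4 below.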
+ */ + wr.wqe.cmd.mod_qp.rsq_depth = + order_base_2(attr->max_dest_rd_atomic + 1); + wr.wqe.cmd.mod_qp.rsq_index = cpu_to_le32(~0); + } + + if ((mask & IB_QP_MAX_QP_RD_ATOMIC) && attr->max_rd_atomic) { + /* Account for hole and round down to the next order */ + wr.wqe.cmd.mod_qp.rrq_depth = + order_base_2(attr->max_rd_atomic + 2) - 1; + wr.wqe.cmd.mod_qp.rrq_index = cpu_to_le32(~0); + } + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) + wr.wqe.cmd.mod_qp.qkey_dest_qpn = + cpu_to_le32(attr->dest_qp_num); + else + wr.wqe.cmd.mod_qp.qkey_dest_qpn = cpu_to_le32(attr->qkey); + + if (mask & IB_QP_AV) { + if (!qp->hdr) + return -ENOMEM; + + sport = rdma_get_udp_sport(grh->flow_label, + qp->qpid, + attr->dest_qp_num); + + rc = ionic_build_hdr(dev, qp->hdr, &attr->ah_attr, sport, true); + if (rc) + return rc; + + qp->sgid_index = grh->sgid_index; + + hdr_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!hdr_buf) + return -ENOMEM; + + hdr_len = ib_ud_header_pack(qp->hdr, hdr_buf); + hdr_len -= IB_BTH_BYTES; + hdr_len -= IB_DETH_BYTES; + ibdev_dbg(&dev->ibdev, "roce packet header template\n"); + print_hex_dump_debug("hdr ", DUMP_PREFIX_OFFSET, 16, 1, + hdr_buf, hdr_len, true); + + hdr_dma = dma_map_single(dev->lif_cfg.hwdev, hdr_buf, hdr_len, + DMA_TO_DEVICE); + + rc = dma_mapping_error(dev->lif_cfg.hwdev, hdr_dma); + if (rc) + goto err_dma; + + if (qp->hdr->ipv4_present) { + wr.wqe.cmd.mod_qp.tfp_csum_profile = + qp->hdr->vlan_present ? + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_UDP : + IONIC_TFP_CSUM_PROF_ETH_IPV4_UDP; + } else { + wr.wqe.cmd.mod_qp.tfp_csum_profile = + qp->hdr->vlan_present ? + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV6_UDP : + IONIC_TFP_CSUM_PROF_ETH_IPV6_UDP; + } + + wr.wqe.cmd.mod_qp.ah_id_len = + cpu_to_le32(qp->ahid | (hdr_len << 24)); + wr.wqe.cmd.mod_qp.dma_addr = cpu_to_le64(hdr_dma); + + wr.wqe.cmd.mod_qp.en_pcp = attr->ah_attr.sl; + wr.wqe.cmd.mod_qp.ip_dscp = grh->traffic_class >> 2; + } + + ionic_admin_post(dev, &wr); + + rc = ionic_admin_wait(dev, &wr, 0); + + if (mask & IB_QP_AV) + dma_unmap_single(dev->lif_cfg.hwdev, hdr_dma, hdr_len, + DMA_TO_DEVICE); +err_dma: + if (mask & IB_QP_AV) + kfree(hdr_buf); + + return rc; +} + +static int ionic_query_qp_cmd(struct ionic_ibdev *dev, + struct ionic_qp *qp, + struct ib_qp_attr *attr, + int mask) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_QUERY_QP, + .len = cpu_to_le16(IONIC_ADMIN_QUERY_QP_IN_V1_LEN), + .cmd.query_qp = { + .id_ver = cpu_to_le32(qp->qpid), + }, + } + }; + struct ionic_v1_admin_query_qp_sq *query_sqbuf; + struct ionic_v1_admin_query_qp_rq *query_rqbuf; + dma_addr_t query_sqdma; + dma_addr_t query_rqdma; + dma_addr_t hdr_dma = 0; + void *hdr_buf = NULL; + int flags, rc; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_QUERY_QP) + return -EBADRQC; + + if (qp->has_sq) { + bool expdb = !!(qp->sq_cmb & IONIC_CMB_EXPDB); + + attr->cap.max_send_sge = + ionic_v1_send_wqe_max_sge(qp->sq.stride_log2, + qp->sq_spec, + expdb); + attr->cap.max_inline_data = + ionic_v1_send_wqe_max_data(qp->sq.stride_log2, expdb); + } + + if (qp->has_rq) { + attr->cap.max_recv_sge = + ionic_v1_recv_wqe_max_sge(qp->rq.stride_log2, + qp->rq_spec, + qp->rq_cmb & IONIC_CMB_EXPDB); + } + + query_sqbuf = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!query_sqbuf) + return -ENOMEM; + + query_rqbuf = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!query_rqbuf) { + rc = -ENOMEM; + goto err_rqbuf; + } + + query_sqdma = dma_map_single(dev->lif_cfg.hwdev, query_sqbuf, PAGE_SIZE, + 
DMA_FROM_DEVICE); + rc = dma_mapping_error(dev->lif_cfg.hwdev, query_sqdma); + if (rc) + goto err_sqdma; + + query_rqdma = dma_map_single(dev->lif_cfg.hwdev, query_rqbuf, PAGE_SIZE, + DMA_FROM_DEVICE); + rc = dma_mapping_error(dev->lif_cfg.hwdev, query_rqdma); + if (rc) + goto err_rqdma; + + if (mask & IB_QP_AV) { + hdr_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!hdr_buf) { + rc = -ENOMEM; + goto err_hdrbuf; + } + + hdr_dma = dma_map_single(dev->lif_cfg.hwdev, hdr_buf, + PAGE_SIZE, DMA_FROM_DEVICE); + rc = dma_mapping_error(dev->lif_cfg.hwdev, hdr_dma); + if (rc) + goto err_hdrdma; + } + + wr.wqe.cmd.query_qp.sq_dma_addr = cpu_to_le64(query_sqdma); + wr.wqe.cmd.query_qp.rq_dma_addr = cpu_to_le64(query_rqdma); + wr.wqe.cmd.query_qp.hdr_dma_addr = cpu_to_le64(hdr_dma); + wr.wqe.cmd.query_qp.ah_id = cpu_to_le32(qp->ahid); + + ionic_admin_post(dev, &wr); + + rc = ionic_admin_wait(dev, &wr, 0); + + if (rc) + goto err_hdrdma; + + flags = be16_to_cpu(query_sqbuf->access_perms_flags | + query_rqbuf->access_perms_flags); + + print_hex_dump_debug("sqbuf ", DUMP_PREFIX_OFFSET, 16, 1, + query_sqbuf, sizeof(*query_sqbuf), true); + print_hex_dump_debug("rqbuf ", DUMP_PREFIX_OFFSET, 16, 1, + query_rqbuf, sizeof(*query_rqbuf), true); + ibdev_dbg(&dev->ibdev, "query qp %u state_pmtu %#x flags %#x", + qp->qpid, query_rqbuf->state_pmtu, flags); + + attr->qp_state = from_ionic_qp_state(query_rqbuf->state_pmtu >> 4); + attr->cur_qp_state = attr->qp_state; + attr->path_mtu = (query_rqbuf->state_pmtu & 0xf) - 7; + attr->path_mig_state = IB_MIG_MIGRATED; + attr->qkey = be32_to_cpu(query_sqbuf->qkey_dest_qpn); + attr->rq_psn = be32_to_cpu(query_sqbuf->rq_psn); + attr->sq_psn = be32_to_cpu(query_rqbuf->sq_psn); + attr->dest_qp_num = attr->qkey; + attr->qp_access_flags = from_ionic_qp_flags(flags); + attr->pkey_index = 0; + attr->alt_pkey_index = 0; + attr->en_sqd_async_notify = !!(flags & IONIC_QPF_SQD_NOTIFY); + attr->sq_draining = !!(flags & IONIC_QPF_SQ_DRAINING); + attr->max_rd_atomic = BIT(query_rqbuf->rrq_depth) - 1; + attr->max_dest_rd_atomic = BIT(query_rqbuf->rsq_depth) - 1; + attr->min_rnr_timer = query_sqbuf->rnr_timer; + attr->port_num = 0; + attr->timeout = query_sqbuf->retry_timeout; + attr->retry_cnt = query_rqbuf->retry_rnrtry & 0xf; + attr->rnr_retry = query_rqbuf->retry_rnrtry >> 4; + attr->alt_port_num = 0; + attr->alt_timeout = 0; + attr->rate_limit = be32_to_cpu(query_sqbuf->rate_limit_kbps); + + if (mask & IB_QP_AV) + ionic_set_ah_attr(dev, &attr->ah_attr, + qp->hdr, qp->sgid_index); + +err_hdrdma: + if (mask & IB_QP_AV) { + dma_unmap_single(dev->lif_cfg.hwdev, hdr_dma, + PAGE_SIZE, DMA_FROM_DEVICE); + kfree(hdr_buf); + } +err_hdrbuf: + dma_unmap_single(dev->lif_cfg.hwdev, query_rqdma, sizeof(*query_rqbuf), + DMA_FROM_DEVICE); +err_rqdma: + dma_unmap_single(dev->lif_cfg.hwdev, query_sqdma, sizeof(*query_sqbuf), + DMA_FROM_DEVICE); +err_sqdma: + kfree(query_rqbuf); +err_rqbuf: + kfree(query_sqbuf); + + return rc; +} + +static int ionic_destroy_qp_cmd(struct ionic_ibdev *dev, u32 qpid) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_DESTROY_QP, + .len = cpu_to_le16(IONIC_ADMIN_DESTROY_QP_IN_V1_LEN), + .cmd.destroy_qp = { + .qp_id = cpu_to_le32(qpid), + }, + } + }; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_DESTROY_QP) + return -EBADRQC; + + ionic_admin_post(dev, &wr); + + return ionic_admin_wait(dev, &wr, IONIC_ADMIN_F_TEARDOWN); +} + +static bool ionic_expdb_wqe_size_supported(struct ionic_ibdev *dev, + uint32_t 
wqe_size) +{ + switch (wqe_size) { + case 64: return dev->lif_cfg.expdb_mask & IONIC_EXPDB_64; + case 128: return dev->lif_cfg.expdb_mask & IONIC_EXPDB_128; + case 256: return dev->lif_cfg.expdb_mask & IONIC_EXPDB_256; + case 512: return dev->lif_cfg.expdb_mask & IONIC_EXPDB_512; + } + + return false; +} + +static void ionic_qp_sq_init_cmb(struct ionic_ibdev *dev, + struct ionic_qp *qp, + struct ib_udata *udata, + int max_data) +{ + u8 expdb_stride_log2 = 0; + bool expdb; + int rc; + + if (!(qp->sq_cmb & IONIC_CMB_ENABLE)) + goto not_in_cmb; + + if (qp->sq_cmb & ~IONIC_CMB_SUPPORTED) { + if (qp->sq_cmb & IONIC_CMB_REQUIRE) + goto not_in_cmb; + + qp->sq_cmb &= IONIC_CMB_SUPPORTED; + } + + if ((qp->sq_cmb & IONIC_CMB_EXPDB) && !dev->lif_cfg.sq_expdb) { + if (qp->sq_cmb & IONIC_CMB_REQUIRE) + goto not_in_cmb; + + qp->sq_cmb &= ~IONIC_CMB_EXPDB; + } + + qp->sq_cmb_order = order_base_2(qp->sq.size / PAGE_SIZE); + + if (qp->sq_cmb_order >= IONIC_SQCMB_ORDER) + goto not_in_cmb; + + if (qp->sq_cmb & IONIC_CMB_EXPDB) + expdb_stride_log2 = qp->sq.stride_log2; + + rc = ionic_get_cmb(dev->lif_cfg.lif, &qp->sq_cmb_pgid, + &qp->sq_cmb_addr, qp->sq_cmb_order, + expdb_stride_log2, &expdb); + if (rc) + goto not_in_cmb; + + if ((qp->sq_cmb & IONIC_CMB_EXPDB) && !expdb) { + if (qp->sq_cmb & IONIC_CMB_REQUIRE) + goto err_map; + + qp->sq_cmb &= ~IONIC_CMB_EXPDB; + } + + return; + +err_map: + ionic_put_cmb(dev->lif_cfg.lif, qp->sq_cmb_pgid, qp->sq_cmb_order); +not_in_cmb: + if (qp->sq_cmb & IONIC_CMB_REQUIRE) + ibdev_dbg(&dev->ibdev, "could not place sq in cmb as required\n"); + + qp->sq_cmb = 0; + qp->sq_cmb_order = IONIC_RES_INVALID; + qp->sq_cmb_pgid = 0; + qp->sq_cmb_addr = 0; +} + +static void ionic_qp_sq_destroy_cmb(struct ionic_ibdev *dev, + struct ionic_ctx *ctx, + struct ionic_qp *qp) +{ + if (!(qp->sq_cmb & IONIC_CMB_ENABLE)) + return; + + if (ctx) + rdma_user_mmap_entry_remove(qp->mmap_sq_cmb); + + ionic_put_cmb(dev->lif_cfg.lif, qp->sq_cmb_pgid, qp->sq_cmb_order); +} + +static int ionic_qp_sq_init(struct ionic_ibdev *dev, struct ionic_ctx *ctx, + struct ionic_qp *qp, struct ionic_qdesc *sq, + struct ionic_tbl_buf *buf, int max_wr, int max_sge, + int max_data, int sq_spec, struct ib_udata *udata) +{ + u32 wqe_size; + int rc = 0; + + qp->sq_msn_prod = 0; + qp->sq_msn_cons = 0; + + if (!qp->has_sq) { + if (buf) { + buf->tbl_buf = NULL; + buf->tbl_limit = 0; + buf->tbl_pages = 0; + } + if (udata) + rc = ionic_validate_qdesc_zero(sq); + + return rc; + } + + rc = -EINVAL; + + if (max_wr < 0 || max_wr > 0xffff) + return rc; + + if (max_sge < 1) + return rc; + + if (max_sge > min(ionic_v1_send_wqe_max_sge(dev->lif_cfg.max_stride, 0, + qp->sq_cmb & + IONIC_CMB_EXPDB), + IONIC_SPEC_HIGH)) + return rc; + + if (max_data < 0) + return rc; + + if (max_data > ionic_v1_send_wqe_max_data(dev->lif_cfg.max_stride, + qp->sq_cmb & IONIC_CMB_EXPDB)) + return rc; + + if (udata) { + rc = ionic_validate_qdesc(sq); + if (rc) + return rc; + + qp->sq_spec = sq_spec; + + qp->sq.ptr = NULL; + qp->sq.size = sq->size; + qp->sq.mask = sq->mask; + qp->sq.depth_log2 = sq->depth_log2; + qp->sq.stride_log2 = sq->stride_log2; + + qp->sq_meta = NULL; + qp->sq_msn_idx = NULL; + + qp->sq_umem = ib_umem_get(&dev->ibdev, sq->addr, sq->size, 0); + if (IS_ERR(qp->sq_umem)) + return PTR_ERR(qp->sq_umem); + } else { + qp->sq_umem = NULL; + + qp->sq_spec = ionic_v1_use_spec_sge(max_sge, sq_spec); + if (sq_spec && !qp->sq_spec) + ibdev_dbg(&dev->ibdev, + "init sq: max_sge %u disables spec\n", + max_sge); + + if (qp->sq_cmb & IONIC_CMB_EXPDB) { + 
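+			/* Express doorbell requires a WQE size the device
+			 * supports; otherwise IONIC_CMB_EXPDB is cleared and
+			 * the regular doorbell path is used.
+			 */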
wqe_size = ionic_v1_send_wqe_min_size(max_sge, max_data, + qp->sq_spec, + true); + + if (!ionic_expdb_wqe_size_supported(dev, wqe_size)) + qp->sq_cmb &= ~IONIC_CMB_EXPDB; + } + + if (!(qp->sq_cmb & IONIC_CMB_EXPDB)) + wqe_size = ionic_v1_send_wqe_min_size(max_sge, max_data, + qp->sq_spec, + false); + + rc = ionic_queue_init(&qp->sq, dev->lif_cfg.hwdev, + max_wr, wqe_size); + if (rc) + return rc; + + ionic_queue_dbell_init(&qp->sq, qp->qpid); + + qp->sq_meta = kmalloc_array((u32)qp->sq.mask + 1, + sizeof(*qp->sq_meta), + GFP_KERNEL); + if (!qp->sq_meta) { + rc = -ENOMEM; + goto err_sq_meta; + } + + qp->sq_msn_idx = kmalloc_array((u32)qp->sq.mask + 1, + sizeof(*qp->sq_msn_idx), + GFP_KERNEL); + if (!qp->sq_msn_idx) { + rc = -ENOMEM; + goto err_sq_msn; + } + } + + ionic_qp_sq_init_cmb(dev, qp, udata, max_data); + + if (qp->sq_cmb & IONIC_CMB_ENABLE) + rc = ionic_pgtbl_init(dev, buf, NULL, + (u64)qp->sq_cmb_pgid << PAGE_SHIFT, + 1, PAGE_SIZE); + else + rc = ionic_pgtbl_init(dev, buf, + qp->sq_umem, qp->sq.dma, 1, PAGE_SIZE); + if (rc) + goto err_sq_tbl; + + return 0; + +err_sq_tbl: + ionic_qp_sq_destroy_cmb(dev, ctx, qp); + kfree(qp->sq_msn_idx); +err_sq_msn: + kfree(qp->sq_meta); +err_sq_meta: + if (qp->sq_umem) + ib_umem_release(qp->sq_umem); + else + ionic_queue_destroy(&qp->sq, dev->lif_cfg.hwdev); + return rc; +} + +static void ionic_qp_sq_destroy(struct ionic_ibdev *dev, + struct ionic_ctx *ctx, + struct ionic_qp *qp) +{ + if (!qp->has_sq) + return; + + ionic_qp_sq_destroy_cmb(dev, ctx, qp); + + kfree(qp->sq_msn_idx); + kfree(qp->sq_meta); + + if (qp->sq_umem) + ib_umem_release(qp->sq_umem); + else + ionic_queue_destroy(&qp->sq, dev->lif_cfg.hwdev); +} + +static void ionic_qp_rq_init_cmb(struct ionic_ibdev *dev, + struct ionic_qp *qp, + struct ib_udata *udata) +{ + u8 expdb_stride_log2 = 0; + bool expdb; + int rc; + + if (!(qp->rq_cmb & IONIC_CMB_ENABLE)) + goto not_in_cmb; + + if (qp->rq_cmb & ~IONIC_CMB_SUPPORTED) { + if (qp->rq_cmb & IONIC_CMB_REQUIRE) + goto not_in_cmb; + + qp->rq_cmb &= IONIC_CMB_SUPPORTED; + } + + if ((qp->rq_cmb & IONIC_CMB_EXPDB) && !dev->lif_cfg.rq_expdb) { + if (qp->rq_cmb & IONIC_CMB_REQUIRE) + goto not_in_cmb; + + qp->rq_cmb &= ~IONIC_CMB_EXPDB; + } + + qp->rq_cmb_order = order_base_2(qp->rq.size / PAGE_SIZE); + + if (qp->rq_cmb_order >= IONIC_RQCMB_ORDER) + goto not_in_cmb; + + if (qp->rq_cmb & IONIC_CMB_EXPDB) + expdb_stride_log2 = qp->rq.stride_log2; + + rc = ionic_get_cmb(dev->lif_cfg.lif, &qp->rq_cmb_pgid, + &qp->rq_cmb_addr, qp->rq_cmb_order, + expdb_stride_log2, &expdb); + if (rc) + goto not_in_cmb; + + if ((qp->rq_cmb & IONIC_CMB_EXPDB) && !expdb) { + if (qp->rq_cmb & IONIC_CMB_REQUIRE) + goto err_map; + + qp->rq_cmb &= ~IONIC_CMB_EXPDB; + } + + return; + +err_map: + ionic_put_cmb(dev->lif_cfg.lif, qp->rq_cmb_pgid, qp->rq_cmb_order); +not_in_cmb: + if (qp->rq_cmb & IONIC_CMB_REQUIRE) + ibdev_dbg(&dev->ibdev, "could not place rq in cmb as required\n"); + + qp->rq_cmb = 0; + qp->rq_cmb_order = IONIC_RES_INVALID; + qp->rq_cmb_pgid = 0; + qp->rq_cmb_addr = 0; +} + +static void ionic_qp_rq_destroy_cmb(struct ionic_ibdev *dev, + struct ionic_ctx *ctx, + struct ionic_qp *qp) +{ + if (!(qp->rq_cmb & IONIC_CMB_ENABLE)) + return; + + if (ctx) + rdma_user_mmap_entry_remove(qp->mmap_rq_cmb); + + ionic_put_cmb(dev->lif_cfg.lif, qp->rq_cmb_pgid, qp->rq_cmb_order); +} + +static int ionic_qp_rq_init(struct ionic_ibdev *dev, struct ionic_ctx *ctx, + struct ionic_qp *qp, struct ionic_qdesc *rq, + struct ionic_tbl_buf *buf, int max_wr, int max_sge, + int rq_spec, struct 
ib_udata *udata) +{ + int rc = 0, i; + u32 wqe_size; + + if (!qp->has_rq) { + if (buf) { + buf->tbl_buf = NULL; + buf->tbl_limit = 0; + buf->tbl_pages = 0; + } + if (udata) + rc = ionic_validate_qdesc_zero(rq); + + return rc; + } + + rc = -EINVAL; + + if (max_wr < 0 || max_wr > 0xffff) + return rc; + + if (max_sge < 1) + return rc; + + if (max_sge > min(ionic_v1_recv_wqe_max_sge(dev->lif_cfg.max_stride, 0, false), + IONIC_SPEC_HIGH)) + return rc; + + if (udata) { + rc = ionic_validate_qdesc(rq); + if (rc) + return rc; + + qp->rq_spec = rq_spec; + + qp->rq.ptr = NULL; + qp->rq.size = rq->size; + qp->rq.mask = rq->mask; + qp->rq.depth_log2 = rq->depth_log2; + qp->rq.stride_log2 = rq->stride_log2; + + qp->rq_meta = NULL; + + qp->rq_umem = ib_umem_get(&dev->ibdev, rq->addr, rq->size, 0); + if (IS_ERR(qp->rq_umem)) + return PTR_ERR(qp->rq_umem); + } else { + qp->rq_umem = NULL; + + qp->rq_spec = ionic_v1_use_spec_sge(max_sge, rq_spec); + if (rq_spec && !qp->rq_spec) + ibdev_dbg(&dev->ibdev, + "init rq: max_sge %u disables spec\n", + max_sge); + + if (qp->rq_cmb & IONIC_CMB_EXPDB) { + wqe_size = ionic_v1_recv_wqe_min_size(max_sge, + qp->rq_spec, + true); + + if (!ionic_expdb_wqe_size_supported(dev, wqe_size)) + qp->rq_cmb &= ~IONIC_CMB_EXPDB; + } + + if (!(qp->rq_cmb & IONIC_CMB_EXPDB)) + wqe_size = ionic_v1_recv_wqe_min_size(max_sge, + qp->rq_spec, + false); + + rc = ionic_queue_init(&qp->rq, dev->lif_cfg.hwdev, + max_wr, wqe_size); + if (rc) + return rc; + + ionic_queue_dbell_init(&qp->rq, qp->qpid); + + qp->rq_meta = kmalloc_array((u32)qp->rq.mask + 1, + sizeof(*qp->rq_meta), + GFP_KERNEL); + if (!qp->rq_meta) { + rc = -ENOMEM; + goto err_rq_meta; + } + + for (i = 0; i < qp->rq.mask; ++i) + qp->rq_meta[i].next = &qp->rq_meta[i + 1]; + qp->rq_meta[i].next = IONIC_META_LAST; + qp->rq_meta_head = &qp->rq_meta[0]; + } + + ionic_qp_rq_init_cmb(dev, qp, udata); + + if (qp->rq_cmb & IONIC_CMB_ENABLE) + rc = ionic_pgtbl_init(dev, buf, NULL, + (u64)qp->rq_cmb_pgid << PAGE_SHIFT, + 1, PAGE_SIZE); + else + rc = ionic_pgtbl_init(dev, buf, + qp->rq_umem, qp->rq.dma, 1, PAGE_SIZE); + if (rc) + goto err_rq_tbl; + + return 0; + +err_rq_tbl: + ionic_qp_rq_destroy_cmb(dev, ctx, qp); + kfree(qp->rq_meta); +err_rq_meta: + if (qp->rq_umem) + ib_umem_release(qp->rq_umem); + else + ionic_queue_destroy(&qp->rq, dev->lif_cfg.hwdev); + return rc; +} + +static void ionic_qp_rq_destroy(struct ionic_ibdev *dev, + struct ionic_ctx *ctx, + struct ionic_qp *qp) +{ + if (!qp->has_rq) + return; + + ionic_qp_rq_destroy_cmb(dev, ctx, qp); + + kfree(qp->rq_meta); + + if (qp->rq_umem) + ib_umem_release(qp->rq_umem); + else + ionic_queue_destroy(&qp->rq, dev->lif_cfg.hwdev); +} + +int ionic_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibqp->device); + struct ionic_tbl_buf sq_buf = {}, rq_buf = {}; + struct ionic_pd *pd = to_ionic_pd(ibqp->pd); + struct ionic_qp *qp = to_ionic_qp(ibqp); + struct ionic_ctx *ctx = + rdma_udata_to_drv_context(udata, struct ionic_ctx, ibctx); + struct ionic_qp_resp resp = {}; + struct ionic_qp_req req = {}; + struct ionic_cq *cq; + u8 udma_mask; + void *entry; + int rc; + + if (udata) { + rc = ib_copy_from_udata(&req, udata, sizeof(req)); + if (rc) + return rc; + } else { + req.sq_spec = IONIC_SPEC_HIGH; + req.rq_spec = IONIC_SPEC_HIGH; + } + + if (attr->qp_type == IB_QPT_SMI || attr->qp_type > IB_QPT_UD) + return -EOPNOTSUPP; + + qp->state = IB_QPS_RESET; + + INIT_LIST_HEAD(&qp->cq_poll_sq); + 
INIT_LIST_HEAD(&qp->cq_flush_sq); + INIT_LIST_HEAD(&qp->cq_flush_rq); + + spin_lock_init(&qp->sq_lock); + spin_lock_init(&qp->rq_lock); + + qp->has_sq = 1; + qp->has_rq = 1; + + if (attr->qp_type == IB_QPT_GSI) { + rc = ionic_get_gsi_qpid(dev, &qp->qpid); + } else { + udma_mask = BIT(dev->lif_cfg.udma_count) - 1; + + if (qp->has_sq) + udma_mask &= to_ionic_vcq(attr->send_cq)->udma_mask; + + if (qp->has_rq) + udma_mask &= to_ionic_vcq(attr->recv_cq)->udma_mask; + + if (udata && req.udma_mask) + udma_mask &= req.udma_mask; + + if (!udma_mask) + return -EINVAL; + + rc = ionic_get_qpid(dev, &qp->qpid, &qp->udma_idx, udma_mask); + } + if (rc) + return rc; + + qp->sig_all = attr->sq_sig_type == IB_SIGNAL_ALL_WR; + qp->has_ah = attr->qp_type == IB_QPT_RC; + + if (qp->has_ah) { + qp->hdr = kzalloc(sizeof(*qp->hdr), GFP_KERNEL); + if (!qp->hdr) { + rc = -ENOMEM; + goto err_ah_alloc; + } + + rc = ionic_get_ahid(dev, &qp->ahid); + if (rc) + goto err_ahid; + } + + if (udata) { + if (req.rq_cmb & IONIC_CMB_ENABLE) + qp->rq_cmb = req.rq_cmb; + + if (req.sq_cmb & IONIC_CMB_ENABLE) + qp->sq_cmb = req.sq_cmb; + } + + rc = ionic_qp_sq_init(dev, ctx, qp, &req.sq, &sq_buf, + attr->cap.max_send_wr, attr->cap.max_send_sge, + attr->cap.max_inline_data, req.sq_spec, udata); + if (rc) + goto err_sq; + + rc = ionic_qp_rq_init(dev, ctx, qp, &req.rq, &rq_buf, + attr->cap.max_recv_wr, attr->cap.max_recv_sge, + req.rq_spec, udata); + if (rc) + goto err_rq; + + rc = ionic_create_qp_cmd(dev, pd, + to_ionic_vcq_cq(attr->send_cq, qp->udma_idx), + to_ionic_vcq_cq(attr->recv_cq, qp->udma_idx), + qp, &sq_buf, &rq_buf, attr); + if (rc) + goto err_cmd; + + if (udata) { + resp.qpid = qp->qpid; + resp.udma_idx = qp->udma_idx; + + if (qp->sq_cmb & IONIC_CMB_ENABLE) { + bool wc; + + if ((qp->sq_cmb & (IONIC_CMB_WC | IONIC_CMB_UC)) == + (IONIC_CMB_WC | IONIC_CMB_UC)) { + ibdev_dbg(&dev->ibdev, + "Both sq_cmb flags IONIC_CMB_WC and IONIC_CMB_UC are set, using default driver mapping\n"); + qp->sq_cmb &= ~(IONIC_CMB_WC | IONIC_CMB_UC); + } + + wc = (qp->sq_cmb & (IONIC_CMB_WC | IONIC_CMB_UC)) + != IONIC_CMB_UC; + + /* let userspace know the mapping */ + if (wc) + qp->sq_cmb |= IONIC_CMB_WC; + else + qp->sq_cmb |= IONIC_CMB_UC; + + qp->mmap_sq_cmb = + ionic_mmap_entry_insert(ctx, + qp->sq.size, + PHYS_PFN(qp->sq_cmb_addr), + wc ? IONIC_MMAP_WC : 0, + &resp.sq_cmb_offset); + if (!qp->mmap_sq_cmb) { + rc = -ENOMEM; + goto err_mmap_sq; + } + + resp.sq_cmb = qp->sq_cmb; + } + + if (qp->rq_cmb & IONIC_CMB_ENABLE) { + bool wc; + + if ((qp->rq_cmb & (IONIC_CMB_WC | IONIC_CMB_UC)) == + (IONIC_CMB_WC | IONIC_CMB_UC)) { + ibdev_dbg(&dev->ibdev, + "Both rq_cmb flags IONIC_CMB_WC and IONIC_CMB_UC are set, using default driver mapping\n"); + qp->rq_cmb &= ~(IONIC_CMB_WC | IONIC_CMB_UC); + } + + if (qp->rq_cmb & IONIC_CMB_EXPDB) + wc = (qp->rq_cmb & (IONIC_CMB_WC | IONIC_CMB_UC)) + == IONIC_CMB_WC; + else + wc = (qp->rq_cmb & (IONIC_CMB_WC | IONIC_CMB_UC)) + != IONIC_CMB_UC; + + /* let userspace know the mapping */ + if (wc) + qp->rq_cmb |= IONIC_CMB_WC; + else + qp->rq_cmb |= IONIC_CMB_UC; + + qp->mmap_rq_cmb = + ionic_mmap_entry_insert(ctx, + qp->rq.size, + PHYS_PFN(qp->rq_cmb_addr), + wc ? 
IONIC_MMAP_WC : 0, + &resp.rq_cmb_offset); + if (!qp->mmap_rq_cmb) { + rc = -ENOMEM; + goto err_mmap_rq; + } + + resp.rq_cmb = qp->rq_cmb; + } + + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) + goto err_resp; + } + + ionic_pgtbl_unbuf(dev, &rq_buf); + ionic_pgtbl_unbuf(dev, &sq_buf); + + qp->ibqp.qp_num = qp->qpid; + + init_completion(&qp->qp_rel_comp); + kref_init(&qp->qp_kref); + + entry = xa_store_irq(&dev->qp_tbl, qp->qpid, qp, GFP_KERNEL); + if (entry) { + if (!xa_is_err(entry)) + rc = -EINVAL; + else + rc = xa_err(entry); + + goto err_resp; + } + + if (qp->has_sq) { + cq = to_ionic_vcq_cq(attr->send_cq, qp->udma_idx); + + attr->cap.max_send_wr = qp->sq.mask; + attr->cap.max_send_sge = + ionic_v1_send_wqe_max_sge(qp->sq.stride_log2, + qp->sq_spec, + qp->sq_cmb & IONIC_CMB_EXPDB); + attr->cap.max_inline_data = + ionic_v1_send_wqe_max_data(qp->sq.stride_log2, + qp->sq_cmb & + IONIC_CMB_EXPDB); + qp->sq_cqid = cq->cqid; + } + + if (qp->has_rq) { + cq = to_ionic_vcq_cq(attr->recv_cq, qp->udma_idx); + + attr->cap.max_recv_wr = qp->rq.mask; + attr->cap.max_recv_sge = + ionic_v1_recv_wqe_max_sge(qp->rq.stride_log2, + qp->rq_spec, + qp->rq_cmb & IONIC_CMB_EXPDB); + qp->rq_cqid = cq->cqid; + } + + return 0; + +err_resp: + if (udata && (qp->rq_cmb & IONIC_CMB_ENABLE)) + rdma_user_mmap_entry_remove(qp->mmap_rq_cmb); +err_mmap_rq: + if (udata && (qp->sq_cmb & IONIC_CMB_ENABLE)) + rdma_user_mmap_entry_remove(qp->mmap_sq_cmb); +err_mmap_sq: + ionic_destroy_qp_cmd(dev, qp->qpid); +err_cmd: + ionic_pgtbl_unbuf(dev, &rq_buf); + ionic_qp_rq_destroy(dev, ctx, qp); +err_rq: + ionic_pgtbl_unbuf(dev, &sq_buf); + ionic_qp_sq_destroy(dev, ctx, qp); +err_sq: + if (qp->has_ah) + ionic_put_ahid(dev, qp->ahid); +err_ahid: + kfree(qp->hdr); +err_ah_alloc: + ionic_put_qpid(dev, qp->qpid); + return rc; +} + +void ionic_notify_flush_cq(struct ionic_cq *cq) +{ + if (cq->flush && cq->vcq->ibcq.comp_handler) + cq->vcq->ibcq.comp_handler(&cq->vcq->ibcq, + cq->vcq->ibcq.cq_context); +} + +static void ionic_notify_qp_cqs(struct ionic_ibdev *dev, struct ionic_qp *qp) +{ + if (qp->ibqp.send_cq) + ionic_notify_flush_cq(to_ionic_vcq_cq(qp->ibqp.send_cq, + qp->udma_idx)); + if (qp->ibqp.recv_cq && qp->ibqp.recv_cq != qp->ibqp.send_cq) + ionic_notify_flush_cq(to_ionic_vcq_cq(qp->ibqp.recv_cq, + qp->udma_idx)); +} + +void ionic_flush_qp(struct ionic_ibdev *dev, struct ionic_qp *qp) +{ + unsigned long irqflags; + struct ionic_cq *cq; + + if (qp->ibqp.send_cq) { + cq = to_ionic_vcq_cq(qp->ibqp.send_cq, qp->udma_idx); + + /* Hold the CQ lock and QP sq_lock to set up flush */ + spin_lock_irqsave(&cq->lock, irqflags); + spin_lock(&qp->sq_lock); + qp->sq_flush = true; + if (!ionic_queue_empty(&qp->sq)) { + cq->flush = true; + list_move_tail(&qp->cq_flush_sq, &cq->flush_sq); + } + spin_unlock(&qp->sq_lock); + spin_unlock_irqrestore(&cq->lock, irqflags); + } + + if (qp->ibqp.recv_cq) { + cq = to_ionic_vcq_cq(qp->ibqp.recv_cq, qp->udma_idx); + + /* Hold the CQ lock and QP rq_lock to set up flush */ + spin_lock_irqsave(&cq->lock, irqflags); + spin_lock(&qp->rq_lock); + qp->rq_flush = true; + if (!ionic_queue_empty(&qp->rq)) { + cq->flush = true; + list_move_tail(&qp->cq_flush_rq, &cq->flush_rq); + } + spin_unlock(&qp->rq_lock); + spin_unlock_irqrestore(&cq->lock, irqflags); + } +} + +static void ionic_clean_cq(struct ionic_cq *cq, u32 qpid) +{ + struct ionic_v1_cqe *qcqe; + int prod, qtf, qid, type; + bool color; + + if (!cq->q.ptr) + return; + + color = cq->color; + prod = cq->q.prod; + qcqe = ionic_queue_at(&cq->q, 
prod); + + while (color == ionic_v1_cqe_color(qcqe)) { + qtf = ionic_v1_cqe_qtf(qcqe); + qid = ionic_v1_cqe_qtf_qid(qtf); + type = ionic_v1_cqe_qtf_type(qtf); + + if (qid == qpid && type != IONIC_V1_CQE_TYPE_ADMIN) + ionic_v1_cqe_clean(qcqe); + + prod = ionic_queue_next(&cq->q, prod); + qcqe = ionic_queue_at(&cq->q, prod); + color = ionic_color_wrap(prod, color); + } +} + +static void ionic_reset_qp(struct ionic_ibdev *dev, struct ionic_qp *qp) +{ + unsigned long irqflags; + struct ionic_cq *cq; + int i; + + local_irq_save(irqflags); + + if (qp->ibqp.send_cq) { + cq = to_ionic_vcq_cq(qp->ibqp.send_cq, qp->udma_idx); + spin_lock(&cq->lock); + ionic_clean_cq(cq, qp->qpid); + spin_unlock(&cq->lock); + } + + if (qp->ibqp.recv_cq) { + cq = to_ionic_vcq_cq(qp->ibqp.recv_cq, qp->udma_idx); + spin_lock(&cq->lock); + ionic_clean_cq(cq, qp->qpid); + spin_unlock(&cq->lock); + } + + if (qp->has_sq) { + spin_lock(&qp->sq_lock); + qp->sq_flush = false; + qp->sq_flush_rcvd = false; + qp->sq_msn_prod = 0; + qp->sq_msn_cons = 0; + qp->sq.prod = 0; + qp->sq.cons = 0; + spin_unlock(&qp->sq_lock); + } + + if (qp->has_rq) { + spin_lock(&qp->rq_lock); + qp->rq_flush = false; + qp->rq.prod = 0; + qp->rq.cons = 0; + if (qp->rq_meta) { + for (i = 0; i < qp->rq.mask; ++i) + qp->rq_meta[i].next = &qp->rq_meta[i + 1]; + qp->rq_meta[i].next = IONIC_META_LAST; + } + qp->rq_meta_head = &qp->rq_meta[0]; + spin_unlock(&qp->rq_lock); + } + + local_irq_restore(irqflags); +} + +static bool ionic_qp_cur_state_is_ok(enum ib_qp_state q_state, + enum ib_qp_state attr_state) +{ + if (q_state == attr_state) + return true; + + if (attr_state == IB_QPS_ERR) + return true; + + if (attr_state == IB_QPS_SQE) + return q_state == IB_QPS_RTS || q_state == IB_QPS_SQD; + + return false; +} + +static int ionic_check_modify_qp(struct ionic_qp *qp, struct ib_qp_attr *attr, + int mask) +{ + enum ib_qp_state cur_state = (mask & IB_QP_CUR_STATE) ? + attr->cur_qp_state : qp->state; + enum ib_qp_state next_state = (mask & IB_QP_STATE) ? 
+ attr->qp_state : cur_state; + + if ((mask & IB_QP_CUR_STATE) && + !ionic_qp_cur_state_is_ok(qp->state, attr->cur_qp_state)) + return -EINVAL; + + if (!ib_modify_qp_is_ok(cur_state, next_state, qp->ibqp.qp_type, mask)) + return -EINVAL; + + /* unprivileged qp not allowed privileged qkey */ + if ((mask & IB_QP_QKEY) && (attr->qkey & 0x80000000) && + qp->ibqp.uobject) + return -EPERM; + + return 0; +} + +int ionic_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, + struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibqp->device); + struct ionic_pd *pd = to_ionic_pd(ibqp->pd); + struct ionic_qp *qp = to_ionic_qp(ibqp); + int rc; + + rc = ionic_check_modify_qp(qp, attr, mask); + if (rc) + return rc; + + if (mask & IB_QP_CAP) + return -EINVAL; + + rc = ionic_modify_qp_cmd(dev, pd, qp, attr, mask); + if (rc) + return rc; + + if (mask & IB_QP_STATE) { + qp->state = attr->qp_state; + + if (attr->qp_state == IB_QPS_ERR) { + ionic_flush_qp(dev, qp); + ionic_notify_qp_cqs(dev, qp); + } else if (attr->qp_state == IB_QPS_RESET) { + ionic_reset_qp(dev, qp); + } + } + + return 0; +} + +int ionic_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int mask, struct ib_qp_init_attr *init_attr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibqp->device); + struct ionic_qp *qp = to_ionic_qp(ibqp); + int rc; + + memset(attr, 0, sizeof(*attr)); + memset(init_attr, 0, sizeof(*init_attr)); + + rc = ionic_query_qp_cmd(dev, qp, attr, mask); + if (rc) + return rc; + + if (qp->has_sq) + attr->cap.max_send_wr = qp->sq.mask; + + if (qp->has_rq) + attr->cap.max_recv_wr = qp->rq.mask; + + init_attr->event_handler = ibqp->event_handler; + init_attr->qp_context = ibqp->qp_context; + init_attr->send_cq = ibqp->send_cq; + init_attr->recv_cq = ibqp->recv_cq; + init_attr->srq = ibqp->srq; + init_attr->xrcd = ibqp->xrcd; + init_attr->cap = attr->cap; + init_attr->sq_sig_type = qp->sig_all ? 
+ IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + init_attr->qp_type = ibqp->qp_type; + init_attr->create_flags = 0; + init_attr->port_num = 0; + init_attr->rwq_ind_tbl = ibqp->rwq_ind_tbl; + init_attr->source_qpn = 0; + + return rc; +} + +int ionic_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) +{ + struct ionic_ctx *ctx = + rdma_udata_to_drv_context(udata, struct ionic_ctx, ibctx); + struct ionic_ibdev *dev = to_ionic_ibdev(ibqp->device); + struct ionic_qp *qp = to_ionic_qp(ibqp); + unsigned long irqflags; + struct ionic_cq *cq; + int rc; + + rc = ionic_destroy_qp_cmd(dev, qp->qpid); + if (rc) + return rc; + + xa_erase_irq(&dev->qp_tbl, qp->qpid); + + kref_put(&qp->qp_kref, ionic_qp_complete); + wait_for_completion(&qp->qp_rel_comp); + + if (qp->ibqp.send_cq) { + cq = to_ionic_vcq_cq(qp->ibqp.send_cq, qp->udma_idx); + spin_lock_irqsave(&cq->lock, irqflags); + ionic_clean_cq(cq, qp->qpid); + list_del(&qp->cq_poll_sq); + list_del(&qp->cq_flush_sq); + spin_unlock_irqrestore(&cq->lock, irqflags); + } + + if (qp->ibqp.recv_cq) { + cq = to_ionic_vcq_cq(qp->ibqp.recv_cq, qp->udma_idx); + spin_lock_irqsave(&cq->lock, irqflags); + ionic_clean_cq(cq, qp->qpid); + list_del(&qp->cq_flush_rq); + spin_unlock_irqrestore(&cq->lock, irqflags); + } + + ionic_qp_rq_destroy(dev, ctx, qp); + ionic_qp_sq_destroy(dev, ctx, qp); + if (qp->has_ah) { + ionic_put_ahid(dev, qp->ahid); + kfree(qp->hdr); + } + ionic_put_qpid(dev, qp->qpid); + + return 0; +} diff --git a/drivers/infiniband/hw/ionic/ionic_datapath.c b/drivers/infiniband/hw/ionic/ionic_datapath.c new file mode 100644 index 000000000000..aa2944887f23 --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_datapath.c @@ -0,0 +1,1399 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. */ + +#include <linux/module.h> +#include <linux/printk.h> +#include <rdma/ib_addr.h> +#include <rdma/ib_user_verbs.h> + +#include "ionic_fw.h" +#include "ionic_ibdev.h" + +#define IONIC_OP(version, opname) \ + ((version) < 2 ? 
IONIC_V1_OP_##opname : IONIC_V2_OP_##opname) + +static bool ionic_next_cqe(struct ionic_ibdev *dev, struct ionic_cq *cq, + struct ionic_v1_cqe **cqe) +{ + struct ionic_v1_cqe *qcqe = ionic_queue_at_prod(&cq->q); + + if (unlikely(cq->color != ionic_v1_cqe_color(qcqe))) + return false; + + /* Prevent out-of-order reads of the CQE */ + dma_rmb(); + + *cqe = qcqe; + + return true; +} + +static int ionic_flush_recv(struct ionic_qp *qp, struct ib_wc *wc) +{ + struct ionic_rq_meta *meta; + struct ionic_v1_wqe *wqe; + + if (!qp->rq_flush) + return 0; + + if (ionic_queue_empty(&qp->rq)) + return 0; + + wqe = ionic_queue_at_cons(&qp->rq); + + /* wqe_id must be a valid queue index */ + if (unlikely(wqe->base.wqe_id >> qp->rq.depth_log2)) { + ibdev_warn(qp->ibqp.device, + "flush qp %u recv index %llu invalid\n", + qp->qpid, (unsigned long long)wqe->base.wqe_id); + return -EIO; + } + + /* wqe_id must indicate a request that is outstanding */ + meta = &qp->rq_meta[wqe->base.wqe_id]; + if (unlikely(meta->next != IONIC_META_POSTED)) { + ibdev_warn(qp->ibqp.device, + "flush qp %u recv index %llu not posted\n", + qp->qpid, (unsigned long long)wqe->base.wqe_id); + return -EIO; + } + + ionic_queue_consume(&qp->rq); + + memset(wc, 0, sizeof(*wc)); + + wc->status = IB_WC_WR_FLUSH_ERR; + wc->wr_id = meta->wrid; + wc->qp = &qp->ibqp; + + meta->next = qp->rq_meta_head; + qp->rq_meta_head = meta; + + return 1; +} + +static int ionic_flush_recv_many(struct ionic_qp *qp, + struct ib_wc *wc, int nwc) +{ + int rc = 0, npolled = 0; + + while (npolled < nwc) { + rc = ionic_flush_recv(qp, wc + npolled); + if (rc <= 0) + break; + + npolled += rc; + } + + return npolled ?: rc; +} + +static int ionic_flush_send(struct ionic_qp *qp, struct ib_wc *wc) +{ + struct ionic_sq_meta *meta; + + if (!qp->sq_flush) + return 0; + + if (ionic_queue_empty(&qp->sq)) + return 0; + + meta = &qp->sq_meta[qp->sq.cons]; + + ionic_queue_consume(&qp->sq); + + memset(wc, 0, sizeof(*wc)); + + wc->status = IB_WC_WR_FLUSH_ERR; + wc->wr_id = meta->wrid; + wc->qp = &qp->ibqp; + + return 1; +} + +static int ionic_flush_send_many(struct ionic_qp *qp, + struct ib_wc *wc, int nwc) +{ + int rc = 0, npolled = 0; + + while (npolled < nwc) { + rc = ionic_flush_send(qp, wc + npolled); + if (rc <= 0) + break; + + npolled += rc; + } + + return npolled ?: rc; +} + +static int ionic_poll_recv(struct ionic_ibdev *dev, struct ionic_cq *cq, + struct ionic_qp *cqe_qp, struct ionic_v1_cqe *cqe, + struct ib_wc *wc) +{ + struct ionic_qp *qp = NULL; + struct ionic_rq_meta *meta; + u32 src_qpn, st_len; + u16 vlan_tag; + u8 op; + + if (cqe_qp->rq_flush) + return 0; + + qp = cqe_qp; + + st_len = be32_to_cpu(cqe->status_length); + + /* ignore wqe_id in case of flush error */ + if (ionic_v1_cqe_error(cqe) && st_len == IONIC_STS_WQE_FLUSHED_ERR) { + cqe_qp->rq_flush = true; + cq->flush = true; + list_move_tail(&qp->cq_flush_rq, &cq->flush_rq); + + /* posted recvs (if any) flushed by ionic_flush_recv */ + return 0; + } + + /* there had better be something in the recv queue to complete */ + if (ionic_queue_empty(&qp->rq)) { + ibdev_warn(&dev->ibdev, "qp %u is empty\n", qp->qpid); + return -EIO; + } + + /* wqe_id must be a valid queue index */ + if (unlikely(cqe->recv.wqe_id >> qp->rq.depth_log2)) { + ibdev_warn(&dev->ibdev, + "qp %u recv index %llu invalid\n", + qp->qpid, (unsigned long long)cqe->recv.wqe_id); + return -EIO; + } + + /* wqe_id must indicate a request that is outstanding */ + meta = &qp->rq_meta[cqe->recv.wqe_id]; + if (unlikely(meta->next != IONIC_META_POSTED)) { + 
ibdev_warn(&dev->ibdev, + "qp %u recv index %llu not posted\n", + qp->qpid, (unsigned long long)cqe->recv.wqe_id); + return -EIO; + } + + meta->next = qp->rq_meta_head; + qp->rq_meta_head = meta; + + memset(wc, 0, sizeof(*wc)); + + wc->wr_id = meta->wrid; + + wc->qp = &cqe_qp->ibqp; + + if (ionic_v1_cqe_error(cqe)) { + wc->vendor_err = st_len; + wc->status = ionic_to_ib_status(st_len); + + cqe_qp->rq_flush = true; + cq->flush = true; + list_move_tail(&qp->cq_flush_rq, &cq->flush_rq); + + ibdev_warn(&dev->ibdev, + "qp %d recv cqe with error\n", qp->qpid); + print_hex_dump(KERN_WARNING, "cqe ", DUMP_PREFIX_OFFSET, 16, 1, + cqe, BIT(cq->q.stride_log2), true); + goto out; + } + + wc->vendor_err = 0; + wc->status = IB_WC_SUCCESS; + + src_qpn = be32_to_cpu(cqe->recv.src_qpn_op); + op = src_qpn >> IONIC_V1_CQE_RECV_OP_SHIFT; + + src_qpn &= IONIC_V1_CQE_RECV_QPN_MASK; + op &= IONIC_V1_CQE_RECV_OP_MASK; + + wc->opcode = IB_WC_RECV; + switch (op) { + case IONIC_V1_CQE_RECV_OP_RDMA_IMM: + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags |= IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->recv.imm_data_rkey; /* be32 in wc */ + break; + case IONIC_V1_CQE_RECV_OP_SEND_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->recv.imm_data_rkey; /* be32 in wc */ + break; + case IONIC_V1_CQE_RECV_OP_SEND_INV: + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = be32_to_cpu(cqe->recv.imm_data_rkey); + break; + } + + wc->byte_len = st_len; + wc->src_qp = src_qpn; + + if (qp->ibqp.qp_type == IB_QPT_UD || + qp->ibqp.qp_type == IB_QPT_GSI) { + wc->wc_flags |= IB_WC_GRH | IB_WC_WITH_SMAC; + ether_addr_copy(wc->smac, cqe->recv.src_mac); + + wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE; + if (ionic_v1_cqe_recv_is_ipv4(cqe)) + wc->network_hdr_type = RDMA_NETWORK_IPV4; + else + wc->network_hdr_type = RDMA_NETWORK_IPV6; + + if (ionic_v1_cqe_recv_is_vlan(cqe)) + wc->wc_flags |= IB_WC_WITH_VLAN; + + /* vlan_tag in cqe will be valid from dpath even if no vlan */ + vlan_tag = be16_to_cpu(cqe->recv.vlan_tag); + wc->vlan_id = vlan_tag & 0xfff; /* 802.1q VID */ + wc->sl = vlan_tag >> VLAN_PRIO_SHIFT; /* 802.1q PCP */ + } + + wc->pkey_index = 0; + wc->port_num = 1; + +out: + ionic_queue_consume(&qp->rq); + + return 1; +} + +static bool ionic_peek_send(struct ionic_qp *qp) +{ + struct ionic_sq_meta *meta; + + if (qp->sq_flush) + return false; + + /* completed all send queue requests */ + if (ionic_queue_empty(&qp->sq)) + return false; + + meta = &qp->sq_meta[qp->sq.cons]; + + /* waiting for remote completion */ + if (meta->remote && meta->seq == qp->sq_msn_cons) + return false; + + /* waiting for local completion */ + if (!meta->remote && !meta->local_comp) + return false; + + return true; +} + +static int ionic_poll_send(struct ionic_ibdev *dev, struct ionic_cq *cq, + struct ionic_qp *qp, struct ib_wc *wc) +{ + struct ionic_sq_meta *meta; + + if (qp->sq_flush) + return 0; + + do { + /* completed all send queue requests */ + if (ionic_queue_empty(&qp->sq)) + goto out_empty; + + meta = &qp->sq_meta[qp->sq.cons]; + + /* waiting for remote completion */ + if (meta->remote && meta->seq == qp->sq_msn_cons) + goto out_empty; + + /* waiting for local completion */ + if (!meta->remote && !meta->local_comp) + goto out_empty; + + ionic_queue_consume(&qp->sq); + + /* produce wc only if signaled or error status */ + } while (!meta->signal && meta->ibsts == IB_WC_SUCCESS); + + memset(wc, 0, sizeof(*wc)); + + wc->status = meta->ibsts; + wc->wr_id = meta->wrid; + wc->qp = &qp->ibqp; + + if (meta->ibsts == IB_WC_SUCCESS) { 
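+		/* meta->len is the payload byte count on success; on error
+		 * it holds the status value reported as wc->vendor_err.
+		 */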
+ wc->byte_len = meta->len; + wc->opcode = meta->ibop; + } else { + wc->vendor_err = meta->len; + + qp->sq_flush = true; + cq->flush = true; + list_move_tail(&qp->cq_flush_sq, &cq->flush_sq); + } + + return 1; + +out_empty: + if (qp->sq_flush_rcvd) { + qp->sq_flush = true; + cq->flush = true; + list_move_tail(&qp->cq_flush_sq, &cq->flush_sq); + } + return 0; +} + +static int ionic_poll_send_many(struct ionic_ibdev *dev, struct ionic_cq *cq, + struct ionic_qp *qp, struct ib_wc *wc, int nwc) +{ + int rc = 0, npolled = 0; + + while (npolled < nwc) { + rc = ionic_poll_send(dev, cq, qp, wc + npolled); + if (rc <= 0) + break; + + npolled += rc; + } + + return npolled ?: rc; +} + +static int ionic_validate_cons(u16 prod, u16 cons, + u16 comp, u16 mask) +{ + if (((prod - cons) & mask) <= ((comp - cons) & mask)) + return -EIO; + + return 0; +} + +static int ionic_comp_msn(struct ionic_qp *qp, struct ionic_v1_cqe *cqe) +{ + struct ionic_sq_meta *meta; + u16 cqe_seq, cqe_idx; + int rc; + + if (qp->sq_flush) + return 0; + + cqe_seq = be32_to_cpu(cqe->send.msg_msn) & qp->sq.mask; + + rc = ionic_validate_cons(qp->sq_msn_prod, + qp->sq_msn_cons, + cqe_seq - 1, + qp->sq.mask); + if (rc) { + ibdev_warn(qp->ibqp.device, + "qp %u bad msn %#x seq %u for prod %u cons %u\n", + qp->qpid, be32_to_cpu(cqe->send.msg_msn), + cqe_seq, qp->sq_msn_prod, qp->sq_msn_cons); + return rc; + } + + qp->sq_msn_cons = cqe_seq; + + if (ionic_v1_cqe_error(cqe)) { + cqe_idx = qp->sq_msn_idx[(cqe_seq - 1) & qp->sq.mask]; + + meta = &qp->sq_meta[cqe_idx]; + meta->len = be32_to_cpu(cqe->status_length); + meta->ibsts = ionic_to_ib_status(meta->len); + + ibdev_warn(qp->ibqp.device, + "qp %d msn cqe with error\n", qp->qpid); + print_hex_dump(KERN_WARNING, "cqe ", DUMP_PREFIX_OFFSET, 16, 1, + cqe, sizeof(*cqe), true); + } + + return 0; +} + +static int ionic_comp_npg(struct ionic_qp *qp, struct ionic_v1_cqe *cqe) +{ + struct ionic_sq_meta *meta; + u16 cqe_idx; + u32 st_len; + + if (qp->sq_flush) + return 0; + + st_len = be32_to_cpu(cqe->status_length); + + if (ionic_v1_cqe_error(cqe) && st_len == IONIC_STS_WQE_FLUSHED_ERR) { + /* + * Flush cqe does not consume a wqe on the device, and maybe + * no such work request is posted. + * + * The driver should begin flushing after the last indicated + * normal or error completion. Here, only set a hint that the + * flush request was indicated. In poll_send, if nothing more + * can be polled normally, then begin flushing. 
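+	 * (See the sq_flush_rcvd check in ionic_poll_send().)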
+ */ + qp->sq_flush_rcvd = true; + return 0; + } + + cqe_idx = cqe->send.npg_wqe_id & qp->sq.mask; + meta = &qp->sq_meta[cqe_idx]; + meta->local_comp = true; + + if (ionic_v1_cqe_error(cqe)) { + meta->len = st_len; + meta->ibsts = ionic_to_ib_status(st_len); + meta->remote = false; + ibdev_warn(qp->ibqp.device, + "qp %d npg cqe with error\n", qp->qpid); + print_hex_dump(KERN_WARNING, "cqe ", DUMP_PREFIX_OFFSET, 16, 1, + cqe, sizeof(*cqe), true); + } + + return 0; +} + +static void ionic_reserve_sync_cq(struct ionic_ibdev *dev, struct ionic_cq *cq) +{ + if (!ionic_queue_empty(&cq->q)) { + cq->credit += ionic_queue_length(&cq->q); + cq->q.cons = cq->q.prod; + + ionic_dbell_ring(dev->lif_cfg.dbpage, dev->lif_cfg.cq_qtype, + ionic_queue_dbell_val(&cq->q)); + } +} + +static void ionic_reserve_cq(struct ionic_ibdev *dev, struct ionic_cq *cq, + int spend) +{ + cq->credit -= spend; + + if (cq->credit <= 0) + ionic_reserve_sync_cq(dev, cq); +} + +static int ionic_poll_vcq_cq(struct ionic_ibdev *dev, + struct ionic_cq *cq, + int nwc, struct ib_wc *wc) +{ + struct ionic_qp *qp, *qp_next; + struct ionic_v1_cqe *cqe; + int rc = 0, npolled = 0; + unsigned long irqflags; + u32 qtf, qid; + bool peek; + u8 type; + + if (nwc < 1) + return 0; + + spin_lock_irqsave(&cq->lock, irqflags); + + /* poll already indicated work completions for send queue */ + list_for_each_entry_safe(qp, qp_next, &cq->poll_sq, cq_poll_sq) { + if (npolled == nwc) + goto out; + + spin_lock(&qp->sq_lock); + rc = ionic_poll_send_many(dev, cq, qp, wc + npolled, + nwc - npolled); + spin_unlock(&qp->sq_lock); + + if (rc > 0) + npolled += rc; + + if (npolled < nwc) + list_del_init(&qp->cq_poll_sq); + } + + /* poll for more work completions */ + while (likely(ionic_next_cqe(dev, cq, &cqe))) { + if (npolled == nwc) + goto out; + + qtf = ionic_v1_cqe_qtf(cqe); + qid = ionic_v1_cqe_qtf_qid(qtf); + type = ionic_v1_cqe_qtf_type(qtf); + + /* + * Safe to access QP without additional reference here as, + * 1. We hold cq->lock throughout + * 2. ionic_destroy_qp() acquires the same cq->lock before cleanup + * 3. QP is removed from qp_tbl before any cleanup begins + * This ensures no concurrent access between polling and destruction. 
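+	 * (ionic_destroy_qp() erases the qpid from qp_tbl and only then
+	 * takes cq->lock to remove the qp from the poll/flush lists.)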
+ */ + qp = xa_load(&dev->qp_tbl, qid); + if (unlikely(!qp)) { + ibdev_dbg(&dev->ibdev, "missing qp for qid %u\n", qid); + goto cq_next; + } + + switch (type) { + case IONIC_V1_CQE_TYPE_RECV: + spin_lock(&qp->rq_lock); + rc = ionic_poll_recv(dev, cq, qp, cqe, wc + npolled); + spin_unlock(&qp->rq_lock); + + if (rc < 0) + goto out; + + npolled += rc; + + break; + + case IONIC_V1_CQE_TYPE_SEND_MSN: + spin_lock(&qp->sq_lock); + rc = ionic_comp_msn(qp, cqe); + if (!rc) { + rc = ionic_poll_send_many(dev, cq, qp, + wc + npolled, + nwc - npolled); + peek = ionic_peek_send(qp); + } + spin_unlock(&qp->sq_lock); + + if (rc < 0) + goto out; + + npolled += rc; + + if (peek) + list_move_tail(&qp->cq_poll_sq, &cq->poll_sq); + break; + + case IONIC_V1_CQE_TYPE_SEND_NPG: + spin_lock(&qp->sq_lock); + rc = ionic_comp_npg(qp, cqe); + if (!rc) { + rc = ionic_poll_send_many(dev, cq, qp, + wc + npolled, + nwc - npolled); + peek = ionic_peek_send(qp); + } + spin_unlock(&qp->sq_lock); + + if (rc < 0) + goto out; + + npolled += rc; + + if (peek) + list_move_tail(&qp->cq_poll_sq, &cq->poll_sq); + break; + + default: + ibdev_warn(&dev->ibdev, + "unexpected cqe type %u\n", type); + rc = -EIO; + goto out; + } + +cq_next: + ionic_queue_produce(&cq->q); + cq->color = ionic_color_wrap(cq->q.prod, cq->color); + } + + /* lastly, flush send and recv queues */ + if (likely(!cq->flush)) + goto out; + + cq->flush = false; + + list_for_each_entry_safe(qp, qp_next, &cq->flush_sq, cq_flush_sq) { + if (npolled == nwc) + goto out; + + spin_lock(&qp->sq_lock); + rc = ionic_flush_send_many(qp, wc + npolled, nwc - npolled); + spin_unlock(&qp->sq_lock); + + if (rc > 0) + npolled += rc; + + if (npolled < nwc) + list_del_init(&qp->cq_flush_sq); + else + cq->flush = true; + } + + list_for_each_entry_safe(qp, qp_next, &cq->flush_rq, cq_flush_rq) { + if (npolled == nwc) + goto out; + + spin_lock(&qp->rq_lock); + rc = ionic_flush_recv_many(qp, wc + npolled, nwc - npolled); + spin_unlock(&qp->rq_lock); + + if (rc > 0) + npolled += rc; + + if (npolled < nwc) + list_del_init(&qp->cq_flush_rq); + else + cq->flush = true; + } + +out: + /* in case credit was depleted (more work posted than cq depth) */ + if (cq->credit <= 0) + ionic_reserve_sync_cq(dev, cq); + + spin_unlock_irqrestore(&cq->lock, irqflags); + + return npolled ?: rc; +} + +int ionic_poll_cq(struct ib_cq *ibcq, int nwc, struct ib_wc *wc) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibcq->device); + struct ionic_vcq *vcq = to_ionic_vcq(ibcq); + int rc_tmp, rc = 0, npolled = 0; + int cq_i, cq_x, cq_ix; + + cq_x = vcq->poll_idx; + vcq->poll_idx ^= dev->lif_cfg.udma_count - 1; + + for (cq_i = 0; npolled < nwc && cq_i < dev->lif_cfg.udma_count; ++cq_i) { + cq_ix = cq_i ^ cq_x; + + if (!(vcq->udma_mask & BIT(cq_ix))) + continue; + + rc_tmp = ionic_poll_vcq_cq(dev, &vcq->cq[cq_ix], + nwc - npolled, + wc + npolled); + + if (rc_tmp >= 0) + npolled += rc_tmp; + else if (!rc) + rc = rc_tmp; + } + + return npolled ?: rc; +} + +static int ionic_req_notify_vcq_cq(struct ionic_ibdev *dev, struct ionic_cq *cq, + enum ib_cq_notify_flags flags) +{ + u64 dbell_val = cq->q.dbell; + + if (flags & IB_CQ_SOLICITED) { + cq->arm_sol_prod = ionic_queue_next(&cq->q, cq->arm_sol_prod); + dbell_val |= cq->arm_sol_prod | IONIC_CQ_RING_SOL; + } else { + cq->arm_any_prod = ionic_queue_next(&cq->q, cq->arm_any_prod); + dbell_val |= cq->arm_any_prod | IONIC_CQ_RING_ARM; + } + + ionic_reserve_sync_cq(dev, cq); + + ionic_dbell_ring(dev->lif_cfg.dbpage, dev->lif_cfg.cq_qtype, dbell_val); + + /* + * 
IB_CQ_REPORT_MISSED_EVENTS: + * + * The queue index in ring zero guarantees no missed events. + * + * Here, we check if the color bit in the next cqe is flipped. If it + * is flipped, then progress can be made by immediately polling the cq. + * Still, the cq will be armed, and an event will be generated. The cq + * may be empty when polled after the event, because the next poll + * after arming the cq can empty it. + */ + return (flags & IB_CQ_REPORT_MISSED_EVENTS) && + cq->color == ionic_v1_cqe_color(ionic_queue_at_prod(&cq->q)); +} + +int ionic_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibcq->device); + struct ionic_vcq *vcq = to_ionic_vcq(ibcq); + int rc = 0, cq_i; + + for (cq_i = 0; cq_i < dev->lif_cfg.udma_count; ++cq_i) { + if (!(vcq->udma_mask & BIT(cq_i))) + continue; + + if (ionic_req_notify_vcq_cq(dev, &vcq->cq[cq_i], flags)) + rc = 1; + } + + return rc; +} + +static s64 ionic_prep_inline(void *data, u32 max_data, + const struct ib_sge *ib_sgl, int num_sge) +{ + static const s64 bit_31 = 1u << 31; + s64 len = 0, sg_len; + int sg_i; + + for (sg_i = 0; sg_i < num_sge; ++sg_i) { + sg_len = ib_sgl[sg_i].length; + + /* sge length zero means 2GB */ + if (unlikely(sg_len == 0)) + sg_len = bit_31; + + /* greater than max inline data is invalid */ + if (unlikely(len + sg_len > max_data)) + return -EINVAL; + + memcpy(data + len, (void *)ib_sgl[sg_i].addr, sg_len); + + len += sg_len; + } + + return len; +} + +static s64 ionic_prep_pld(struct ionic_v1_wqe *wqe, + union ionic_v1_pld *pld, + int spec, u32 max_sge, + const struct ib_sge *ib_sgl, + int num_sge) +{ + static const s64 bit_31 = 1l << 31; + struct ionic_sge *sgl; + __be32 *spec32 = NULL; + __be16 *spec16 = NULL; + s64 len = 0, sg_len; + int sg_i = 0; + + if (unlikely(num_sge < 0 || (u32)num_sge > max_sge)) + return -EINVAL; + + if (spec && num_sge > IONIC_V1_SPEC_FIRST_SGE) { + sg_i = IONIC_V1_SPEC_FIRST_SGE; + + if (num_sge > 8) { + wqe->base.flags |= cpu_to_be16(IONIC_V1_FLAG_SPEC16); + spec16 = pld->spec16; + } else { + wqe->base.flags |= cpu_to_be16(IONIC_V1_FLAG_SPEC32); + spec32 = pld->spec32; + } + } + + sgl = &pld->sgl[sg_i]; + + for (sg_i = 0; sg_i < num_sge; ++sg_i) { + sg_len = ib_sgl[sg_i].length; + + /* sge length zero means 2GB */ + if (unlikely(sg_len == 0)) + sg_len = bit_31; + + /* greater than 2GB data is invalid */ + if (unlikely(len + sg_len > bit_31)) + return -EINVAL; + + sgl[sg_i].va = cpu_to_be64(ib_sgl[sg_i].addr); + sgl[sg_i].len = cpu_to_be32(sg_len); + sgl[sg_i].lkey = cpu_to_be32(ib_sgl[sg_i].lkey); + + if (spec32) { + spec32[sg_i] = sgl[sg_i].len; + } else if (spec16) { + if (unlikely(sg_len > U16_MAX)) + return -EINVAL; + spec16[sg_i] = cpu_to_be16(sg_len); + } + + len += sg_len; + } + + return len; +} + +static void ionic_prep_base(struct ionic_qp *qp, + const struct ib_send_wr *wr, + struct ionic_sq_meta *meta, + struct ionic_v1_wqe *wqe) +{ + meta->wrid = wr->wr_id; + meta->ibsts = IB_WC_SUCCESS; + meta->signal = false; + meta->local_comp = false; + + wqe->base.wqe_id = qp->sq.prod; + + if (wr->send_flags & IB_SEND_FENCE) + wqe->base.flags |= cpu_to_be16(IONIC_V1_FLAG_FENCE); + + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->base.flags |= cpu_to_be16(IONIC_V1_FLAG_SOL); + + if (qp->sig_all || wr->send_flags & IB_SEND_SIGNALED) { + wqe->base.flags |= cpu_to_be16(IONIC_V1_FLAG_SIG); + meta->signal = true; + } + + meta->seq = qp->sq_msn_prod; + meta->remote = + qp->ibqp.qp_type != IB_QPT_UD && + qp->ibqp.qp_type != IB_QPT_GSI && + 
!ionic_ibop_is_local(wr->opcode); + + if (meta->remote) { + qp->sq_msn_idx[meta->seq] = qp->sq.prod; + qp->sq_msn_prod = ionic_queue_next(&qp->sq, qp->sq_msn_prod); + } + + ionic_queue_produce(&qp->sq); +} + +static int ionic_prep_common(struct ionic_qp *qp, + const struct ib_send_wr *wr, + struct ionic_sq_meta *meta, + struct ionic_v1_wqe *wqe) +{ + s64 signed_len; + u32 mval; + + if (wr->send_flags & IB_SEND_INLINE) { + wqe->base.num_sge_key = 0; + wqe->base.flags |= cpu_to_be16(IONIC_V1_FLAG_INL); + mval = ionic_v1_send_wqe_max_data(qp->sq.stride_log2, false); + signed_len = ionic_prep_inline(wqe->common.pld.data, mval, + wr->sg_list, wr->num_sge); + } else { + wqe->base.num_sge_key = wr->num_sge; + mval = ionic_v1_send_wqe_max_sge(qp->sq.stride_log2, + qp->sq_spec, + false); + signed_len = ionic_prep_pld(wqe, &wqe->common.pld, + qp->sq_spec, mval, + wr->sg_list, wr->num_sge); + } + + if (unlikely(signed_len < 0)) + return signed_len; + + meta->len = signed_len; + wqe->common.length = cpu_to_be32(signed_len); + + ionic_prep_base(qp, wr, meta, wqe); + + return 0; +} + +static void ionic_prep_sq_wqe(struct ionic_qp *qp, void *wqe) +{ + memset(wqe, 0, 1u << qp->sq.stride_log2); +} + +static void ionic_prep_rq_wqe(struct ionic_qp *qp, void *wqe) +{ + memset(wqe, 0, 1u << qp->rq.stride_log2); +} + +static int ionic_prep_send(struct ionic_qp *qp, + const struct ib_send_wr *wr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(qp->ibqp.device); + struct ionic_sq_meta *meta; + struct ionic_v1_wqe *wqe; + + meta = &qp->sq_meta[qp->sq.prod]; + wqe = ionic_queue_at_prod(&qp->sq); + + ionic_prep_sq_wqe(qp, wqe); + + meta->ibop = IB_WC_SEND; + + switch (wr->opcode) { + case IB_WR_SEND: + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, SEND); + break; + case IB_WR_SEND_WITH_IMM: + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, SEND_IMM); + wqe->base.imm_data_key = wr->ex.imm_data; + break; + case IB_WR_SEND_WITH_INV: + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, SEND_INV); + wqe->base.imm_data_key = + cpu_to_be32(wr->ex.invalidate_rkey); + break; + default: + return -EINVAL; + } + + return ionic_prep_common(qp, wr, meta, wqe); +} + +static int ionic_prep_send_ud(struct ionic_qp *qp, + const struct ib_ud_wr *wr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(qp->ibqp.device); + struct ionic_sq_meta *meta; + struct ionic_v1_wqe *wqe; + struct ionic_ah *ah; + + if (unlikely(!wr->ah)) + return -EINVAL; + + ah = to_ionic_ah(wr->ah); + + meta = &qp->sq_meta[qp->sq.prod]; + wqe = ionic_queue_at_prod(&qp->sq); + + ionic_prep_sq_wqe(qp, wqe); + + wqe->common.send.ah_id = cpu_to_be32(ah->ahid); + wqe->common.send.dest_qpn = cpu_to_be32(wr->remote_qpn); + wqe->common.send.dest_qkey = cpu_to_be32(wr->remote_qkey); + + meta->ibop = IB_WC_SEND; + + switch (wr->wr.opcode) { + case IB_WR_SEND: + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, SEND); + break; + case IB_WR_SEND_WITH_IMM: + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, SEND_IMM); + wqe->base.imm_data_key = wr->wr.ex.imm_data; + break; + default: + return -EINVAL; + } + + return ionic_prep_common(qp, &wr->wr, meta, wqe); +} + +static int ionic_prep_rdma(struct ionic_qp *qp, + const struct ib_rdma_wr *wr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(qp->ibqp.device); + struct ionic_sq_meta *meta; + struct ionic_v1_wqe *wqe; + + meta = &qp->sq_meta[qp->sq.prod]; + wqe = ionic_queue_at_prod(&qp->sq); + + ionic_prep_sq_wqe(qp, wqe); + + meta->ibop = IB_WC_RDMA_WRITE; + + switch (wr->wr.opcode) { + case IB_WR_RDMA_READ: + if 
(wr->wr.send_flags & (IB_SEND_SOLICITED | IB_SEND_INLINE)) + return -EINVAL; + meta->ibop = IB_WC_RDMA_READ; + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, RDMA_READ); + break; + case IB_WR_RDMA_WRITE: + if (wr->wr.send_flags & IB_SEND_SOLICITED) + return -EINVAL; + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, RDMA_WRITE); + break; + case IB_WR_RDMA_WRITE_WITH_IMM: + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, RDMA_WRITE_IMM); + wqe->base.imm_data_key = wr->wr.ex.imm_data; + break; + default: + return -EINVAL; + } + + wqe->common.rdma.remote_va_high = cpu_to_be32(wr->remote_addr >> 32); + wqe->common.rdma.remote_va_low = cpu_to_be32(wr->remote_addr); + wqe->common.rdma.remote_rkey = cpu_to_be32(wr->rkey); + + return ionic_prep_common(qp, &wr->wr, meta, wqe); +} + +static int ionic_prep_atomic(struct ionic_qp *qp, + const struct ib_atomic_wr *wr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(qp->ibqp.device); + struct ionic_sq_meta *meta; + struct ionic_v1_wqe *wqe; + + if (wr->wr.num_sge != 1 || wr->wr.sg_list[0].length != 8) + return -EINVAL; + + if (wr->wr.send_flags & (IB_SEND_SOLICITED | IB_SEND_INLINE)) + return -EINVAL; + + meta = &qp->sq_meta[qp->sq.prod]; + wqe = ionic_queue_at_prod(&qp->sq); + + ionic_prep_sq_wqe(qp, wqe); + + meta->ibop = IB_WC_RDMA_WRITE; + + switch (wr->wr.opcode) { + case IB_WR_ATOMIC_CMP_AND_SWP: + meta->ibop = IB_WC_COMP_SWAP; + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, ATOMIC_CS); + wqe->atomic.swap_add_high = cpu_to_be32(wr->swap >> 32); + wqe->atomic.swap_add_low = cpu_to_be32(wr->swap); + wqe->atomic.compare_high = cpu_to_be32(wr->compare_add >> 32); + wqe->atomic.compare_low = cpu_to_be32(wr->compare_add); + break; + case IB_WR_ATOMIC_FETCH_AND_ADD: + meta->ibop = IB_WC_FETCH_ADD; + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, ATOMIC_FA); + wqe->atomic.swap_add_high = cpu_to_be32(wr->compare_add >> 32); + wqe->atomic.swap_add_low = cpu_to_be32(wr->compare_add); + break; + default: + return -EINVAL; + } + + wqe->atomic.remote_va_high = cpu_to_be32(wr->remote_addr >> 32); + wqe->atomic.remote_va_low = cpu_to_be32(wr->remote_addr); + wqe->atomic.remote_rkey = cpu_to_be32(wr->rkey); + + wqe->base.num_sge_key = 1; + wqe->atomic.sge.va = cpu_to_be64(wr->wr.sg_list[0].addr); + wqe->atomic.sge.len = cpu_to_be32(8); + wqe->atomic.sge.lkey = cpu_to_be32(wr->wr.sg_list[0].lkey); + + return ionic_prep_common(qp, &wr->wr, meta, wqe); +} + +static int ionic_prep_inv(struct ionic_qp *qp, + const struct ib_send_wr *wr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(qp->ibqp.device); + struct ionic_sq_meta *meta; + struct ionic_v1_wqe *wqe; + + if (wr->send_flags & (IB_SEND_SOLICITED | IB_SEND_INLINE)) + return -EINVAL; + + meta = &qp->sq_meta[qp->sq.prod]; + wqe = ionic_queue_at_prod(&qp->sq); + + ionic_prep_sq_wqe(qp, wqe); + + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, LOCAL_INV); + wqe->base.imm_data_key = cpu_to_be32(wr->ex.invalidate_rkey); + + meta->len = 0; + meta->ibop = IB_WC_LOCAL_INV; + + ionic_prep_base(qp, wr, meta, wqe); + + return 0; +} + +static int ionic_prep_reg(struct ionic_qp *qp, + const struct ib_reg_wr *wr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(qp->ibqp.device); + struct ionic_mr *mr = to_ionic_mr(wr->mr); + struct ionic_sq_meta *meta; + struct ionic_v1_wqe *wqe; + __le64 dma_addr; + int flags; + + if (wr->wr.send_flags & (IB_SEND_SOLICITED | IB_SEND_INLINE)) + return -EINVAL; + + /* must call ib_map_mr_sg before posting reg wr */ + if (!mr->buf.tbl_pages) + return -EINVAL; + + meta = 
&qp->sq_meta[qp->sq.prod]; + wqe = ionic_queue_at_prod(&qp->sq); + + ionic_prep_sq_wqe(qp, wqe); + + flags = to_ionic_mr_flags(wr->access); + + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, REG_MR); + wqe->base.num_sge_key = wr->key; + wqe->base.imm_data_key = cpu_to_be32(mr->ibmr.lkey); + wqe->reg_mr.va = cpu_to_be64(mr->ibmr.iova); + wqe->reg_mr.length = cpu_to_be64(mr->ibmr.length); + wqe->reg_mr.offset = ionic_pgtbl_off(&mr->buf, mr->ibmr.iova); + dma_addr = ionic_pgtbl_dma(&mr->buf, mr->ibmr.iova); + wqe->reg_mr.dma_addr = cpu_to_be64(le64_to_cpu(dma_addr)); + + wqe->reg_mr.map_count = cpu_to_be32(mr->buf.tbl_pages); + wqe->reg_mr.flags = cpu_to_be16(flags); + wqe->reg_mr.dir_size_log2 = 0; + wqe->reg_mr.page_size_log2 = order_base_2(mr->ibmr.page_size); + + meta->len = 0; + meta->ibop = IB_WC_REG_MR; + + ionic_prep_base(qp, &wr->wr, meta, wqe); + + return 0; +} + +static int ionic_prep_one_rc(struct ionic_qp *qp, + const struct ib_send_wr *wr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(qp->ibqp.device); + int rc = 0; + + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + case IB_WR_SEND_WITH_INV: + rc = ionic_prep_send(qp, wr); + break; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + rc = ionic_prep_rdma(qp, rdma_wr(wr)); + break; + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + rc = ionic_prep_atomic(qp, atomic_wr(wr)); + break; + case IB_WR_LOCAL_INV: + rc = ionic_prep_inv(qp, wr); + break; + case IB_WR_REG_MR: + rc = ionic_prep_reg(qp, reg_wr(wr)); + break; + default: + ibdev_dbg(&dev->ibdev, "invalid opcode %d\n", wr->opcode); + rc = -EINVAL; + } + + return rc; +} + +static int ionic_prep_one_ud(struct ionic_qp *qp, + const struct ib_send_wr *wr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(qp->ibqp.device); + int rc = 0; + + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + rc = ionic_prep_send_ud(qp, ud_wr(wr)); + break; + default: + ibdev_dbg(&dev->ibdev, "invalid opcode %d\n", wr->opcode); + rc = -EINVAL; + } + + return rc; +} + +static int ionic_prep_recv(struct ionic_qp *qp, + const struct ib_recv_wr *wr) +{ + struct ionic_rq_meta *meta; + struct ionic_v1_wqe *wqe; + s64 signed_len; + u32 mval; + + wqe = ionic_queue_at_prod(&qp->rq); + + /* if wqe is owned by device, caller can try posting again soon */ + if (wqe->base.flags & cpu_to_be16(IONIC_V1_FLAG_FENCE)) + return -EAGAIN; + + meta = qp->rq_meta_head; + if (unlikely(meta == IONIC_META_LAST) || + unlikely(meta == IONIC_META_POSTED)) + return -EIO; + + ionic_prep_rq_wqe(qp, wqe); + + mval = ionic_v1_recv_wqe_max_sge(qp->rq.stride_log2, qp->rq_spec, + false); + signed_len = ionic_prep_pld(wqe, &wqe->recv.pld, + qp->rq_spec, mval, + wr->sg_list, wr->num_sge); + if (signed_len < 0) + return signed_len; + + meta->wrid = wr->wr_id; + + wqe->base.wqe_id = meta - qp->rq_meta; + wqe->base.num_sge_key = wr->num_sge; + + /* total length for recv goes in base imm_data_key */ + wqe->base.imm_data_key = cpu_to_be32(signed_len); + + ionic_queue_produce(&qp->rq); + + qp->rq_meta_head = meta->next; + meta->next = IONIC_META_POSTED; + + return 0; +} + +static int ionic_post_send_common(struct ionic_ibdev *dev, + struct ionic_vcq *vcq, + struct ionic_cq *cq, + struct ionic_qp *qp, + const struct ib_send_wr *wr, + const struct ib_send_wr **bad) +{ + unsigned long irqflags; + bool notify = false; + int spend, rc = 0; + + if (!bad) + return -EINVAL; + + if (!qp->has_sq) { + *bad = wr; + return -EINVAL; + } + + if (qp->state < 
IB_QPS_RTS) { + *bad = wr; + return -EINVAL; + } + + spin_lock_irqsave(&qp->sq_lock, irqflags); + + while (wr) { + if (ionic_queue_full(&qp->sq)) { + ibdev_dbg(&dev->ibdev, "queue full"); + rc = -ENOMEM; + goto out; + } + + if (qp->ibqp.qp_type == IB_QPT_UD || + qp->ibqp.qp_type == IB_QPT_GSI) + rc = ionic_prep_one_ud(qp, wr); + else + rc = ionic_prep_one_rc(qp, wr); + if (rc) + goto out; + + wr = wr->next; + } + +out: + spin_unlock_irqrestore(&qp->sq_lock, irqflags); + + spin_lock_irqsave(&cq->lock, irqflags); + spin_lock(&qp->sq_lock); + + if (likely(qp->sq.prod != qp->sq_old_prod)) { + /* ring cq doorbell just in time */ + spend = (qp->sq.prod - qp->sq_old_prod) & qp->sq.mask; + ionic_reserve_cq(dev, cq, spend); + + qp->sq_old_prod = qp->sq.prod; + + ionic_dbell_ring(dev->lif_cfg.dbpage, dev->lif_cfg.sq_qtype, + ionic_queue_dbell_val(&qp->sq)); + } + + if (qp->sq_flush) { + notify = true; + cq->flush = true; + list_move_tail(&qp->cq_flush_sq, &cq->flush_sq); + } + + spin_unlock(&qp->sq_lock); + spin_unlock_irqrestore(&cq->lock, irqflags); + + if (notify && vcq->ibcq.comp_handler) + vcq->ibcq.comp_handler(&vcq->ibcq, vcq->ibcq.cq_context); + + *bad = wr; + return rc; +} + +static int ionic_post_recv_common(struct ionic_ibdev *dev, + struct ionic_vcq *vcq, + struct ionic_cq *cq, + struct ionic_qp *qp, + const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad) +{ + unsigned long irqflags; + bool notify = false; + int spend, rc = 0; + + if (!bad) + return -EINVAL; + + if (!qp->has_rq) { + *bad = wr; + return -EINVAL; + } + + if (qp->state < IB_QPS_INIT) { + *bad = wr; + return -EINVAL; + } + + spin_lock_irqsave(&qp->rq_lock, irqflags); + + while (wr) { + if (ionic_queue_full(&qp->rq)) { + ibdev_dbg(&dev->ibdev, "queue full"); + rc = -ENOMEM; + goto out; + } + + rc = ionic_prep_recv(qp, wr); + if (rc) + goto out; + + wr = wr->next; + } + +out: + if (!cq) { + spin_unlock_irqrestore(&qp->rq_lock, irqflags); + goto out_unlocked; + } + spin_unlock_irqrestore(&qp->rq_lock, irqflags); + + spin_lock_irqsave(&cq->lock, irqflags); + spin_lock(&qp->rq_lock); + + if (likely(qp->rq.prod != qp->rq_old_prod)) { + /* ring cq doorbell just in time */ + spend = (qp->rq.prod - qp->rq_old_prod) & qp->rq.mask; + ionic_reserve_cq(dev, cq, spend); + + qp->rq_old_prod = qp->rq.prod; + + ionic_dbell_ring(dev->lif_cfg.dbpage, dev->lif_cfg.rq_qtype, + ionic_queue_dbell_val(&qp->rq)); + } + + if (qp->rq_flush) { + notify = true; + cq->flush = true; + list_move_tail(&qp->cq_flush_rq, &cq->flush_rq); + } + + spin_unlock(&qp->rq_lock); + spin_unlock_irqrestore(&cq->lock, irqflags); + + if (notify && vcq->ibcq.comp_handler) + vcq->ibcq.comp_handler(&vcq->ibcq, vcq->ibcq.cq_context); + +out_unlocked: + *bad = wr; + return rc; +} + +int ionic_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibqp->device); + struct ionic_vcq *vcq = to_ionic_vcq(ibqp->send_cq); + struct ionic_qp *qp = to_ionic_qp(ibqp); + struct ionic_cq *cq = + to_ionic_vcq_cq(ibqp->send_cq, qp->udma_idx); + + return ionic_post_send_common(dev, vcq, cq, qp, wr, bad); +} + +int ionic_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibqp->device); + struct ionic_vcq *vcq = to_ionic_vcq(ibqp->recv_cq); + struct ionic_qp *qp = to_ionic_qp(ibqp); + struct ionic_cq *cq = + to_ionic_vcq_cq(ibqp->recv_cq, qp->udma_idx); + + return ionic_post_recv_common(dev, vcq, cq, qp, wr, bad); +} 
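The post_send/post_recv/poll_cq/req_notify_cq entry points above are only ever reached through the core verbs wrappers. As a rough, illustrative sketch (not part of this patch), the fragment below shows how a kernel consumer would drive that datapath: post one signaled RDMA write and busy-poll its completion. The function name, the qp/cq handles, and the laddr/lkey/raddr/rkey values are hypothetical placeholders, laddr is assumed to be already DMA-mapped and registered, and only <rdma/ib_verbs.h> is assumed.

/* Illustrative sketch only; not part of the patch above. */
static int example_rdma_write_and_poll(struct ib_qp *qp, struct ib_cq *cq,
				       u64 laddr, u32 lkey, u32 len,
				       u64 raddr, u32 rkey)
{
	struct ib_sge sge = {
		.addr = laddr,		/* assumed registered; lkey must match */
		.length = len,
		.lkey = lkey,
	};
	struct ib_rdma_wr wr = {
		.wr = {
			.wr_id = 1,
			.opcode = IB_WR_RDMA_WRITE,
			.send_flags = IB_SEND_SIGNALED,
			.sg_list = &sge,
			.num_sge = 1,
		},
		.remote_addr = raddr,
		.rkey = rkey,
	};
	const struct ib_send_wr *bad;
	struct ib_wc wc;
	int rc;

	rc = ib_post_send(qp, &wr.wr, &bad);	/* dispatches to ionic_post_send() */
	if (rc)
		return rc;

	/* Busy-poll the single signaled completion; real users would arm the CQ
	 * with ib_req_notify_cq() and wait for the completion event instead.
	 */
	do {
		rc = ib_poll_cq(cq, 1, &wc);	/* dispatches to ionic_poll_cq() */
	} while (rc == 0);

	if (rc < 0)
		return rc;

	return wc.status == IB_WC_SUCCESS ? 0 : -EIO;
}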
diff --git a/drivers/infiniband/hw/ionic/ionic_fw.h b/drivers/infiniband/hw/ionic/ionic_fw.h new file mode 100644 index 000000000000..adfbb89d856c --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_fw.h @@ -0,0 +1,1029 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. */ + +#ifndef _IONIC_FW_H_ +#define _IONIC_FW_H_ + +#include <linux/kernel.h> +#include <rdma/ib_verbs.h> + +/* common for ib spec */ + +#define IONIC_EXP_DBELL_SZ 8 + +enum ionic_mrid_bits { + IONIC_MRID_INDEX_SHIFT = 8, +}; + +static inline u32 ionic_mrid(u32 index, u8 key) +{ + return (index << IONIC_MRID_INDEX_SHIFT) | key; +} + +static inline u32 ionic_mrid_index(u32 lrkey) +{ + return lrkey >> IONIC_MRID_INDEX_SHIFT; +} + +/* common to all versions */ + +/* wqe scatter gather element */ +struct ionic_sge { + __be64 va; + __be32 len; + __be32 lkey; +}; + +/* admin queue mr type */ +enum ionic_mr_flags { + /* bits that determine mr access */ + IONIC_MRF_LOCAL_WRITE = BIT(0), + IONIC_MRF_REMOTE_WRITE = BIT(1), + IONIC_MRF_REMOTE_READ = BIT(2), + IONIC_MRF_REMOTE_ATOMIC = BIT(3), + IONIC_MRF_MW_BIND = BIT(4), + IONIC_MRF_ZERO_BASED = BIT(5), + IONIC_MRF_ON_DEMAND = BIT(6), + IONIC_MRF_PB = BIT(7), + IONIC_MRF_ACCESS_MASK = BIT(12) - 1, + + /* bits that determine mr type */ + IONIC_MRF_UKEY_EN = BIT(13), + IONIC_MRF_IS_MW = BIT(14), + IONIC_MRF_INV_EN = BIT(15), + + /* base flags combinations for mr types */ + IONIC_MRF_USER_MR = 0, + IONIC_MRF_PHYS_MR = (IONIC_MRF_UKEY_EN | + IONIC_MRF_INV_EN), + IONIC_MRF_MW_1 = (IONIC_MRF_UKEY_EN | + IONIC_MRF_IS_MW), + IONIC_MRF_MW_2 = (IONIC_MRF_UKEY_EN | + IONIC_MRF_IS_MW | + IONIC_MRF_INV_EN), +}; + +static inline int to_ionic_mr_flags(int access) +{ + int flags = 0; + + if (access & IB_ACCESS_LOCAL_WRITE) + flags |= IONIC_MRF_LOCAL_WRITE; + + if (access & IB_ACCESS_REMOTE_READ) + flags |= IONIC_MRF_REMOTE_READ; + + if (access & IB_ACCESS_REMOTE_WRITE) + flags |= IONIC_MRF_REMOTE_WRITE; + + if (access & IB_ACCESS_REMOTE_ATOMIC) + flags |= IONIC_MRF_REMOTE_ATOMIC; + + if (access & IB_ACCESS_MW_BIND) + flags |= IONIC_MRF_MW_BIND; + + if (access & IB_ZERO_BASED) + flags |= IONIC_MRF_ZERO_BASED; + + return flags; +} + +enum ionic_qp_flags { + /* bits that determine qp access */ + IONIC_QPF_REMOTE_WRITE = BIT(0), + IONIC_QPF_REMOTE_READ = BIT(1), + IONIC_QPF_REMOTE_ATOMIC = BIT(2), + + /* bits that determine other qp behavior */ + IONIC_QPF_SQ_PB = BIT(6), + IONIC_QPF_RQ_PB = BIT(7), + IONIC_QPF_SQ_SPEC = BIT(8), + IONIC_QPF_RQ_SPEC = BIT(9), + IONIC_QPF_REMOTE_PRIVILEGED = BIT(10), + IONIC_QPF_SQ_DRAINING = BIT(11), + IONIC_QPF_SQD_NOTIFY = BIT(12), + IONIC_QPF_SQ_CMB = BIT(13), + IONIC_QPF_RQ_CMB = BIT(14), + IONIC_QPF_PRIVILEGED = BIT(15), +}; + +static inline int from_ionic_qp_flags(int flags) +{ + int access_flags = 0; + + if (flags & IONIC_QPF_REMOTE_WRITE) + access_flags |= IB_ACCESS_REMOTE_WRITE; + + if (flags & IONIC_QPF_REMOTE_READ) + access_flags |= IB_ACCESS_REMOTE_READ; + + if (flags & IONIC_QPF_REMOTE_ATOMIC) + access_flags |= IB_ACCESS_REMOTE_ATOMIC; + + return access_flags; +} + +static inline int to_ionic_qp_flags(int access, bool sqd_notify, + bool sq_is_cmb, bool rq_is_cmb, + bool sq_spec, bool rq_spec, + bool privileged, bool remote_privileged) +{ + int flags = 0; + + if (access & IB_ACCESS_REMOTE_WRITE) + flags |= IONIC_QPF_REMOTE_WRITE; + + if (access & IB_ACCESS_REMOTE_READ) + flags |= IONIC_QPF_REMOTE_READ; + + if (access & IB_ACCESS_REMOTE_ATOMIC) + flags |= IONIC_QPF_REMOTE_ATOMIC; + + if (sqd_notify) + flags 
|= IONIC_QPF_SQD_NOTIFY; + + if (sq_is_cmb) + flags |= IONIC_QPF_SQ_CMB; + + if (rq_is_cmb) + flags |= IONIC_QPF_RQ_CMB; + + if (sq_spec) + flags |= IONIC_QPF_SQ_SPEC; + + if (rq_spec) + flags |= IONIC_QPF_RQ_SPEC; + + if (privileged) + flags |= IONIC_QPF_PRIVILEGED; + + if (remote_privileged) + flags |= IONIC_QPF_REMOTE_PRIVILEGED; + + return flags; +} + +/* cqe non-admin status indicated in status_length field when err bit is set */ +enum ionic_status { + IONIC_STS_OK, + IONIC_STS_LOCAL_LEN_ERR, + IONIC_STS_LOCAL_QP_OPER_ERR, + IONIC_STS_LOCAL_PROT_ERR, + IONIC_STS_WQE_FLUSHED_ERR, + IONIC_STS_MEM_MGMT_OPER_ERR, + IONIC_STS_BAD_RESP_ERR, + IONIC_STS_LOCAL_ACC_ERR, + IONIC_STS_REMOTE_INV_REQ_ERR, + IONIC_STS_REMOTE_ACC_ERR, + IONIC_STS_REMOTE_OPER_ERR, + IONIC_STS_RETRY_EXCEEDED, + IONIC_STS_RNR_RETRY_EXCEEDED, + IONIC_STS_XRC_VIO_ERR, + IONIC_STS_LOCAL_SGL_INV_ERR, +}; + +static inline int ionic_to_ib_status(int sts) +{ + switch (sts) { + case IONIC_STS_OK: + return IB_WC_SUCCESS; + case IONIC_STS_LOCAL_LEN_ERR: + return IB_WC_LOC_LEN_ERR; + case IONIC_STS_LOCAL_QP_OPER_ERR: + case IONIC_STS_LOCAL_SGL_INV_ERR: + return IB_WC_LOC_QP_OP_ERR; + case IONIC_STS_LOCAL_PROT_ERR: + return IB_WC_LOC_PROT_ERR; + case IONIC_STS_WQE_FLUSHED_ERR: + return IB_WC_WR_FLUSH_ERR; + case IONIC_STS_MEM_MGMT_OPER_ERR: + return IB_WC_MW_BIND_ERR; + case IONIC_STS_BAD_RESP_ERR: + return IB_WC_BAD_RESP_ERR; + case IONIC_STS_LOCAL_ACC_ERR: + return IB_WC_LOC_ACCESS_ERR; + case IONIC_STS_REMOTE_INV_REQ_ERR: + return IB_WC_REM_INV_REQ_ERR; + case IONIC_STS_REMOTE_ACC_ERR: + return IB_WC_REM_ACCESS_ERR; + case IONIC_STS_REMOTE_OPER_ERR: + return IB_WC_REM_OP_ERR; + case IONIC_STS_RETRY_EXCEEDED: + return IB_WC_RETRY_EXC_ERR; + case IONIC_STS_RNR_RETRY_EXCEEDED: + return IB_WC_RNR_RETRY_EXC_ERR; + case IONIC_STS_XRC_VIO_ERR: + default: + return IB_WC_GENERAL_ERR; + } +} + +/* admin queue qp type */ +enum ionic_qp_type { + IONIC_QPT_RC, + IONIC_QPT_UC, + IONIC_QPT_RD, + IONIC_QPT_UD, + IONIC_QPT_SRQ, + IONIC_QPT_XRC_INI, + IONIC_QPT_XRC_TGT, + IONIC_QPT_XRC_SRQ, +}; + +static inline int to_ionic_qp_type(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_GSI: + case IB_QPT_UD: + return IONIC_QPT_UD; + case IB_QPT_RC: + return IONIC_QPT_RC; + case IB_QPT_UC: + return IONIC_QPT_UC; + case IB_QPT_XRC_INI: + return IONIC_QPT_XRC_INI; + case IB_QPT_XRC_TGT: + return IONIC_QPT_XRC_TGT; + default: + return -EINVAL; + } +} + +/* admin queue qp state */ +enum ionic_qp_state { + IONIC_QPS_RESET, + IONIC_QPS_INIT, + IONIC_QPS_RTR, + IONIC_QPS_RTS, + IONIC_QPS_SQD, + IONIC_QPS_SQE, + IONIC_QPS_ERR, +}; + +static inline int from_ionic_qp_state(enum ionic_qp_state state) +{ + switch (state) { + case IONIC_QPS_RESET: + return IB_QPS_RESET; + case IONIC_QPS_INIT: + return IB_QPS_INIT; + case IONIC_QPS_RTR: + return IB_QPS_RTR; + case IONIC_QPS_RTS: + return IB_QPS_RTS; + case IONIC_QPS_SQD: + return IB_QPS_SQD; + case IONIC_QPS_SQE: + return IB_QPS_SQE; + case IONIC_QPS_ERR: + return IB_QPS_ERR; + default: + return -EINVAL; + } +} + +static inline int to_ionic_qp_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: + return IONIC_QPS_RESET; + case IB_QPS_INIT: + return IONIC_QPS_INIT; + case IB_QPS_RTR: + return IONIC_QPS_RTR; + case IB_QPS_RTS: + return IONIC_QPS_RTS; + case IB_QPS_SQD: + return IONIC_QPS_SQD; + case IB_QPS_SQE: + return IONIC_QPS_SQE; + case IB_QPS_ERR: + return IONIC_QPS_ERR; + default: + return 0; + } +} + +static inline int to_ionic_qp_modify_state(enum ib_qp_state to_state, + enum 
ib_qp_state from_state) +{ + return to_ionic_qp_state(to_state) | + (to_ionic_qp_state(from_state) << 4); +} + +/* fw abi v1 */ + +/* data payload part of v1 wqe */ +union ionic_v1_pld { + struct ionic_sge sgl[2]; + __be32 spec32[8]; + __be16 spec16[16]; + __u8 data[32]; +}; + +/* completion queue v1 cqe */ +struct ionic_v1_cqe { + union { + struct { + __be16 cmd_idx; + __u8 cmd_op; + __u8 rsvd[17]; + __le16 old_sq_cindex; + __le16 old_rq_cq_cindex; + } admin; + struct { + __u64 wqe_id; + __be32 src_qpn_op; + __u8 src_mac[6]; + __be16 vlan_tag; + __be32 imm_data_rkey; + } recv; + struct { + __u8 rsvd[4]; + __be32 msg_msn; + __u8 rsvd2[8]; + __u64 npg_wqe_id; + } send; + }; + __be32 status_length; + __be32 qid_type_flags; +}; + +/* bits for cqe recv */ +enum ionic_v1_cqe_src_qpn_bits { + IONIC_V1_CQE_RECV_QPN_MASK = 0xffffff, + IONIC_V1_CQE_RECV_OP_SHIFT = 24, + + /* MASK could be 0x3, but need 0x1f for makeshift values: + * OP_TYPE_RDMA_OPER_WITH_IMM, OP_TYPE_SEND_RCVD + */ + IONIC_V1_CQE_RECV_OP_MASK = 0x1f, + IONIC_V1_CQE_RECV_OP_SEND = 0, + IONIC_V1_CQE_RECV_OP_SEND_INV = 1, + IONIC_V1_CQE_RECV_OP_SEND_IMM = 2, + IONIC_V1_CQE_RECV_OP_RDMA_IMM = 3, + + IONIC_V1_CQE_RECV_IS_IPV4 = BIT(7 + IONIC_V1_CQE_RECV_OP_SHIFT), + IONIC_V1_CQE_RECV_IS_VLAN = BIT(6 + IONIC_V1_CQE_RECV_OP_SHIFT), +}; + +/* bits for cqe qid_type_flags */ +enum ionic_v1_cqe_qtf_bits { + IONIC_V1_CQE_COLOR = BIT(0), + IONIC_V1_CQE_ERROR = BIT(1), + IONIC_V1_CQE_TYPE_SHIFT = 5, + IONIC_V1_CQE_TYPE_MASK = 0x7, + IONIC_V1_CQE_QID_SHIFT = 8, + + IONIC_V1_CQE_TYPE_ADMIN = 0, + IONIC_V1_CQE_TYPE_RECV = 1, + IONIC_V1_CQE_TYPE_SEND_MSN = 2, + IONIC_V1_CQE_TYPE_SEND_NPG = 3, +}; + +static inline bool ionic_v1_cqe_color(struct ionic_v1_cqe *cqe) +{ + return cqe->qid_type_flags & cpu_to_be32(IONIC_V1_CQE_COLOR); +} + +static inline bool ionic_v1_cqe_error(struct ionic_v1_cqe *cqe) +{ + return cqe->qid_type_flags & cpu_to_be32(IONIC_V1_CQE_ERROR); +} + +static inline bool ionic_v1_cqe_recv_is_ipv4(struct ionic_v1_cqe *cqe) +{ + return cqe->recv.src_qpn_op & cpu_to_be32(IONIC_V1_CQE_RECV_IS_IPV4); +} + +static inline bool ionic_v1_cqe_recv_is_vlan(struct ionic_v1_cqe *cqe) +{ + return cqe->recv.src_qpn_op & cpu_to_be32(IONIC_V1_CQE_RECV_IS_VLAN); +} + +static inline void ionic_v1_cqe_clean(struct ionic_v1_cqe *cqe) +{ + cqe->qid_type_flags |= cpu_to_be32(~0u << IONIC_V1_CQE_QID_SHIFT); +} + +static inline u32 ionic_v1_cqe_qtf(struct ionic_v1_cqe *cqe) +{ + return be32_to_cpu(cqe->qid_type_flags); +} + +static inline u8 ionic_v1_cqe_qtf_type(u32 qtf) +{ + return (qtf >> IONIC_V1_CQE_TYPE_SHIFT) & IONIC_V1_CQE_TYPE_MASK; +} + +static inline u32 ionic_v1_cqe_qtf_qid(u32 qtf) +{ + return qtf >> IONIC_V1_CQE_QID_SHIFT; +} + +/* v1 base wqe header */ +struct ionic_v1_base_hdr { + __u64 wqe_id; + __u8 op; + __u8 num_sge_key; + __be16 flags; + __be32 imm_data_key; +}; + +/* v1 receive wqe body */ +struct ionic_v1_recv_bdy { + __u8 rsvd[16]; + union ionic_v1_pld pld; +}; + +/* v1 send/rdma wqe body (common, has sgl) */ +struct ionic_v1_common_bdy { + union { + struct { + __be32 ah_id; + __be32 dest_qpn; + __be32 dest_qkey; + } send; + struct { + __be32 remote_va_high; + __be32 remote_va_low; + __be32 remote_rkey; + } rdma; + }; + __be32 length; + union ionic_v1_pld pld; +}; + +/* v1 atomic wqe body */ +struct ionic_v1_atomic_bdy { + __be32 remote_va_high; + __be32 remote_va_low; + __be32 remote_rkey; + __be32 swap_add_high; + __be32 swap_add_low; + __be32 compare_high; + __be32 compare_low; + __u8 rsvd[4]; + struct ionic_sge sge; +}; + +/* v1 
reg mr wqe body */ +struct ionic_v1_reg_mr_bdy { + __be64 va; + __be64 length; + __be64 offset; + __be64 dma_addr; + __be32 map_count; + __be16 flags; + __u8 dir_size_log2; + __u8 page_size_log2; + __u8 rsvd[8]; +}; + +/* v1 bind mw wqe body */ +struct ionic_v1_bind_mw_bdy { + __be64 va; + __be64 length; + __be32 lkey; + __be16 flags; + __u8 rsvd[26]; +}; + +/* v1 send/recv wqe */ +struct ionic_v1_wqe { + struct ionic_v1_base_hdr base; + union { + struct ionic_v1_recv_bdy recv; + struct ionic_v1_common_bdy common; + struct ionic_v1_atomic_bdy atomic; + struct ionic_v1_reg_mr_bdy reg_mr; + struct ionic_v1_bind_mw_bdy bind_mw; + }; +}; + +/* queue pair v1 send opcodes */ +enum ionic_v1_op { + IONIC_V1_OP_SEND, + IONIC_V1_OP_SEND_INV, + IONIC_V1_OP_SEND_IMM, + IONIC_V1_OP_RDMA_READ, + IONIC_V1_OP_RDMA_WRITE, + IONIC_V1_OP_RDMA_WRITE_IMM, + IONIC_V1_OP_ATOMIC_CS, + IONIC_V1_OP_ATOMIC_FA, + IONIC_V1_OP_REG_MR, + IONIC_V1_OP_LOCAL_INV, + IONIC_V1_OP_BIND_MW, + + /* flags */ + IONIC_V1_FLAG_FENCE = BIT(0), + IONIC_V1_FLAG_SOL = BIT(1), + IONIC_V1_FLAG_INL = BIT(2), + IONIC_V1_FLAG_SIG = BIT(3), + + /* flags last four bits for sgl spec format */ + IONIC_V1_FLAG_SPEC32 = (1u << 12), + IONIC_V1_FLAG_SPEC16 = (2u << 12), + IONIC_V1_SPEC_FIRST_SGE = 2, +}; + +/* queue pair v2 send opcodes */ +enum ionic_v2_op { + IONIC_V2_OPSL_OUT = 0x20, + IONIC_V2_OPSL_IMM = 0x40, + IONIC_V2_OPSL_INV = 0x80, + + IONIC_V2_OP_SEND = 0x0 | IONIC_V2_OPSL_OUT, + IONIC_V2_OP_SEND_IMM = IONIC_V2_OP_SEND | IONIC_V2_OPSL_IMM, + IONIC_V2_OP_SEND_INV = IONIC_V2_OP_SEND | IONIC_V2_OPSL_INV, + + IONIC_V2_OP_RDMA_WRITE = 0x1 | IONIC_V2_OPSL_OUT, + IONIC_V2_OP_RDMA_WRITE_IMM = IONIC_V2_OP_RDMA_WRITE | IONIC_V2_OPSL_IMM, + + IONIC_V2_OP_RDMA_READ = 0x2, + + IONIC_V2_OP_ATOMIC_CS = 0x4, + IONIC_V2_OP_ATOMIC_FA = 0x5, + IONIC_V2_OP_REG_MR = 0x6, + IONIC_V2_OP_LOCAL_INV = 0x7, + IONIC_V2_OP_BIND_MW = 0x8, +}; + +static inline size_t ionic_v1_send_wqe_min_size(int min_sge, int min_data, + int spec, bool expdb) +{ + size_t sz_wqe, sz_sgl, sz_data; + + if (spec > IONIC_V1_SPEC_FIRST_SGE) + min_sge += IONIC_V1_SPEC_FIRST_SGE; + + if (expdb) { + min_sge += 1; + min_data += IONIC_EXP_DBELL_SZ; + } + + sz_wqe = sizeof(struct ionic_v1_wqe); + sz_sgl = offsetof(struct ionic_v1_wqe, common.pld.sgl[min_sge]); + sz_data = offsetof(struct ionic_v1_wqe, common.pld.data[min_data]); + + if (sz_sgl > sz_wqe) + sz_wqe = sz_sgl; + + if (sz_data > sz_wqe) + sz_wqe = sz_data; + + return sz_wqe; +} + +static inline int ionic_v1_send_wqe_max_sge(u8 stride_log2, int spec, + bool expdb) +{ + struct ionic_sge *sge = (void *)(1ull << stride_log2); + struct ionic_v1_wqe *wqe = (void *)0; + int num_sge = 0; + + if (expdb) + sge -= 1; + + if (spec > IONIC_V1_SPEC_FIRST_SGE) + num_sge = IONIC_V1_SPEC_FIRST_SGE; + + num_sge = sge - &wqe->common.pld.sgl[num_sge]; + + if (spec && num_sge > spec) + num_sge = spec; + + return num_sge; +} + +static inline int ionic_v1_send_wqe_max_data(u8 stride_log2, bool expdb) +{ + struct ionic_v1_wqe *wqe = (void *)0; + __u8 *data = (void *)(1ull << stride_log2); + + if (expdb) + data -= IONIC_EXP_DBELL_SZ; + + return data - wqe->common.pld.data; +} + +static inline size_t ionic_v1_recv_wqe_min_size(int min_sge, int spec, + bool expdb) +{ + size_t sz_wqe, sz_sgl; + + if (spec > IONIC_V1_SPEC_FIRST_SGE) + min_sge += IONIC_V1_SPEC_FIRST_SGE; + + if (expdb) + min_sge += 1; + + sz_wqe = sizeof(struct ionic_v1_wqe); + sz_sgl = offsetof(struct ionic_v1_wqe, recv.pld.sgl[min_sge]); + + if (sz_sgl > sz_wqe) + sz_wqe = sz_sgl; + + return 
sz_wqe; +} + +static inline int ionic_v1_recv_wqe_max_sge(u8 stride_log2, int spec, + bool expdb) +{ + struct ionic_sge *sge = (void *)(1ull << stride_log2); + struct ionic_v1_wqe *wqe = (void *)0; + int num_sge = 0; + + if (expdb) + sge -= 1; + + if (spec > IONIC_V1_SPEC_FIRST_SGE) + num_sge = IONIC_V1_SPEC_FIRST_SGE; + + num_sge = sge - &wqe->recv.pld.sgl[num_sge]; + + if (spec && num_sge > spec) + num_sge = spec; + + return num_sge; +} + +static inline int ionic_v1_use_spec_sge(int min_sge, int spec) +{ + if (!spec || min_sge > spec) + return 0; + + if (min_sge <= IONIC_V1_SPEC_FIRST_SGE) + return IONIC_V1_SPEC_FIRST_SGE; + + return spec; +} + +struct ionic_admin_stats_hdr { + __le64 dma_addr; + __le32 length; + __le32 id_ver; + __u8 type_state; +} __packed; + +#define IONIC_ADMIN_STATS_HDRS_IN_V1_LEN 17 +static_assert(sizeof(struct ionic_admin_stats_hdr) == + IONIC_ADMIN_STATS_HDRS_IN_V1_LEN); + +struct ionic_admin_create_ah { + __le64 dma_addr; + __le32 length; + __le32 pd_id; + __le32 id_ver; + __le16 dbid_flags; + __u8 csum_profile; + __u8 crypto; +} __packed; + +#define IONIC_ADMIN_CREATE_AH_IN_V1_LEN 24 +static_assert(sizeof(struct ionic_admin_create_ah) == + IONIC_ADMIN_CREATE_AH_IN_V1_LEN); + +struct ionic_admin_destroy_ah { + __le32 ah_id; +} __packed; + +#define IONIC_ADMIN_DESTROY_AH_IN_V1_LEN 4 +static_assert(sizeof(struct ionic_admin_destroy_ah) == + IONIC_ADMIN_DESTROY_AH_IN_V1_LEN); + +struct ionic_admin_query_ah { + __le64 dma_addr; +} __packed; + +#define IONIC_ADMIN_QUERY_AH_IN_V1_LEN 8 +static_assert(sizeof(struct ionic_admin_query_ah) == + IONIC_ADMIN_QUERY_AH_IN_V1_LEN); + +struct ionic_admin_create_mr { + __le64 va; + __le64 length; + __le32 pd_id; + __le32 id_ver; + __le32 tbl_index; + __le32 map_count; + __le64 dma_addr; + __le16 dbid_flags; + __u8 pt_type; + __u8 dir_size_log2; + __u8 page_size_log2; +} __packed; + +#define IONIC_ADMIN_CREATE_MR_IN_V1_LEN 45 +static_assert(sizeof(struct ionic_admin_create_mr) == + IONIC_ADMIN_CREATE_MR_IN_V1_LEN); + +struct ionic_admin_destroy_mr { + __le32 mr_id; +} __packed; + +#define IONIC_ADMIN_DESTROY_MR_IN_V1_LEN 4 +static_assert(sizeof(struct ionic_admin_destroy_mr) == + IONIC_ADMIN_DESTROY_MR_IN_V1_LEN); + +struct ionic_admin_create_cq { + __le32 eq_id; + __u8 depth_log2; + __u8 stride_log2; + __u8 dir_size_log2_rsvd; + __u8 page_size_log2; + __le32 cq_flags; + __le32 id_ver; + __le32 tbl_index; + __le32 map_count; + __le64 dma_addr; + __le16 dbid_flags; +} __packed; + +#define IONIC_ADMIN_CREATE_CQ_IN_V1_LEN 34 +static_assert(sizeof(struct ionic_admin_create_cq) == + IONIC_ADMIN_CREATE_CQ_IN_V1_LEN); + +struct ionic_admin_destroy_cq { + __le32 cq_id; +} __packed; + +#define IONIC_ADMIN_DESTROY_CQ_IN_V1_LEN 4 +static_assert(sizeof(struct ionic_admin_destroy_cq) == + IONIC_ADMIN_DESTROY_CQ_IN_V1_LEN); + +struct ionic_admin_create_qp { + __le32 pd_id; + __be32 priv_flags; + __le32 sq_cq_id; + __u8 sq_depth_log2; + __u8 sq_stride_log2; + __u8 sq_dir_size_log2_rsvd; + __u8 sq_page_size_log2; + __le32 sq_tbl_index_xrcd_id; + __le32 sq_map_count; + __le64 sq_dma_addr; + __le32 rq_cq_id; + __u8 rq_depth_log2; + __u8 rq_stride_log2; + __u8 rq_dir_size_log2_rsvd; + __u8 rq_page_size_log2; + __le32 rq_tbl_index_srq_id; + __le32 rq_map_count; + __le64 rq_dma_addr; + __le32 id_ver; + __le16 dbid_flags; + __u8 type_state; + __u8 rsvd; +} __packed; + +#define IONIC_ADMIN_CREATE_QP_IN_V1_LEN 64 +static_assert(sizeof(struct ionic_admin_create_qp) == + IONIC_ADMIN_CREATE_QP_IN_V1_LEN); + +struct ionic_admin_destroy_qp { + __le32 qp_id; 
+} __packed; + +#define IONIC_ADMIN_DESTROY_QP_IN_V1_LEN 4 +static_assert(sizeof(struct ionic_admin_destroy_qp) == + IONIC_ADMIN_DESTROY_QP_IN_V1_LEN); + +struct ionic_admin_mod_qp { + __be32 attr_mask; + __u8 dcqcn_profile; + __u8 tfp_csum_profile; + __be16 access_flags; + __le32 rq_psn; + __le32 sq_psn; + __le32 qkey_dest_qpn; + __le32 rate_limit_kbps; + __u8 pmtu; + __u8 retry; + __u8 rnr_timer; + __u8 retry_timeout; + __u8 rsq_depth; + __u8 rrq_depth; + __le16 pkey_id; + __le32 ah_id_len; + __u8 en_pcp; + __u8 ip_dscp; + __u8 rsvd2; + __u8 type_state; + union { + struct { + __le16 rsvd1; + }; + __le32 rrq_index; + }; + __le32 rsq_index; + __le64 dma_addr; + __le32 id_ver; +} __packed; + +#define IONIC_ADMIN_MODIFY_QP_IN_V1_LEN 60 +static_assert(sizeof(struct ionic_admin_mod_qp) == + IONIC_ADMIN_MODIFY_QP_IN_V1_LEN); + +struct ionic_admin_query_qp { + __le64 hdr_dma_addr; + __le64 sq_dma_addr; + __le64 rq_dma_addr; + __le32 ah_id; + __le32 id_ver; + __le16 dbid_flags; +} __packed; + +#define IONIC_ADMIN_QUERY_QP_IN_V1_LEN 34 +static_assert(sizeof(struct ionic_admin_query_qp) == + IONIC_ADMIN_QUERY_QP_IN_V1_LEN); + +#define ADMIN_WQE_STRIDE 64 +#define ADMIN_WQE_HDR_LEN 4 + +/* admin queue v1 wqe */ +struct ionic_v1_admin_wqe { + __u8 op; + __u8 rsvd; + __le16 len; + + union { + struct ionic_admin_stats_hdr stats; + struct ionic_admin_create_ah create_ah; + struct ionic_admin_destroy_ah destroy_ah; + struct ionic_admin_query_ah query_ah; + struct ionic_admin_create_mr create_mr; + struct ionic_admin_destroy_mr destroy_mr; + struct ionic_admin_create_cq create_cq; + struct ionic_admin_destroy_cq destroy_cq; + struct ionic_admin_create_qp create_qp; + struct ionic_admin_destroy_qp destroy_qp; + struct ionic_admin_mod_qp mod_qp; + struct ionic_admin_query_qp query_qp; + } cmd; +}; + +/* side data for query qp */ +struct ionic_v1_admin_query_qp_sq { + __u8 rnr_timer; + __u8 retry_timeout; + __be16 access_perms_flags; + __be16 rsvd; + __be16 pkey_id; + __be32 qkey_dest_qpn; + __be32 rate_limit_kbps; + __be32 rq_psn; +}; + +struct ionic_v1_admin_query_qp_rq { + __u8 state_pmtu; + __u8 retry_rnrtry; + __u8 rrq_depth; + __u8 rsq_depth; + __be32 sq_psn; + __be16 access_perms_flags; + __be16 rsvd; +}; + +/* admin queue v1 opcodes */ +enum ionic_v1_admin_op { + IONIC_V1_ADMIN_NOOP, + IONIC_V1_ADMIN_CREATE_CQ, + IONIC_V1_ADMIN_CREATE_QP, + IONIC_V1_ADMIN_CREATE_MR, + IONIC_V1_ADMIN_STATS_HDRS, + IONIC_V1_ADMIN_STATS_VALS, + IONIC_V1_ADMIN_DESTROY_MR, + IONIC_V1_ADMIN_RSVD_7, /* RESIZE_CQ */ + IONIC_V1_ADMIN_DESTROY_CQ, + IONIC_V1_ADMIN_MODIFY_QP, + IONIC_V1_ADMIN_QUERY_QP, + IONIC_V1_ADMIN_DESTROY_QP, + IONIC_V1_ADMIN_DEBUG, + IONIC_V1_ADMIN_CREATE_AH, + IONIC_V1_ADMIN_QUERY_AH, + IONIC_V1_ADMIN_MODIFY_DCQCN, + IONIC_V1_ADMIN_DESTROY_AH, + IONIC_V1_ADMIN_QP_STATS_HDRS, + IONIC_V1_ADMIN_QP_STATS_VALS, + IONIC_V1_ADMIN_OPCODES_MAX, +}; + +/* admin queue v1 cqe status */ +enum ionic_v1_admin_status { + IONIC_V1_ASTS_OK, + IONIC_V1_ASTS_BAD_CMD, + IONIC_V1_ASTS_BAD_INDEX, + IONIC_V1_ASTS_BAD_STATE, + IONIC_V1_ASTS_BAD_TYPE, + IONIC_V1_ASTS_BAD_ATTR, + IONIC_V1_ASTS_MSG_TOO_BIG, +}; + +/* event queue v1 eqe */ +struct ionic_v1_eqe { + __be32 evt; +}; + +/* bits for cqe queue_type_flags */ +enum ionic_v1_eqe_evt_bits { + IONIC_V1_EQE_COLOR = BIT(0), + IONIC_V1_EQE_TYPE_SHIFT = 1, + IONIC_V1_EQE_TYPE_MASK = 0x7, + IONIC_V1_EQE_CODE_SHIFT = 4, + IONIC_V1_EQE_CODE_MASK = 0xf, + IONIC_V1_EQE_QID_SHIFT = 8, + + /* cq events */ + IONIC_V1_EQE_TYPE_CQ = 0, + /* cq normal events */ + IONIC_V1_EQE_CQ_NOTIFY = 0, + 
/* cq error events */ + IONIC_V1_EQE_CQ_ERR = 8, + + /* qp and srq events */ + IONIC_V1_EQE_TYPE_QP = 1, + /* qp normal events */ + IONIC_V1_EQE_SRQ_LEVEL = 0, + IONIC_V1_EQE_SQ_DRAIN = 1, + IONIC_V1_EQE_QP_COMM_EST = 2, + IONIC_V1_EQE_QP_LAST_WQE = 3, + /* qp error events */ + IONIC_V1_EQE_QP_ERR = 8, + IONIC_V1_EQE_QP_ERR_REQUEST = 9, + IONIC_V1_EQE_QP_ERR_ACCESS = 10, +}; + +enum ionic_tfp_csum_profiles { + IONIC_TFP_CSUM_PROF_ETH_IPV4_UDP = 0, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_UDP = 1, + IONIC_TFP_CSUM_PROF_ETH_IPV6_UDP = 2, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV6_UDP = 3, + IONIC_TFP_CSUM_PROF_IPV4_UDP_VXLAN_ETH_QTAG_IPV4_UDP = 4, + IONIC_TFP_CSUM_PROF_IPV4_UDP_VXLAN_ETH_QTAG_IPV6_UDP = 5, + IONIC_TFP_CSUM_PROF_QTAG_IPV4_UDP_VXLAN_ETH_QTAG_IPV4_UDP = 6, + IONIC_TFP_CSUM_PROF_QTAG_IPV4_UDP_VXLAN_ETH_QTAG_IPV6_UDP = 7, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_UDP_ESP_IPV4_UDP = 8, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_ESP_UDP = 9, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_UDP_ESP_UDP = 10, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV6_ESP_UDP = 11, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_UDP_CSUM = 12, +}; + +static inline bool ionic_v1_eqe_color(struct ionic_v1_eqe *eqe) +{ + return eqe->evt & cpu_to_be32(IONIC_V1_EQE_COLOR); +} + +static inline u32 ionic_v1_eqe_evt(struct ionic_v1_eqe *eqe) +{ + return be32_to_cpu(eqe->evt); +} + +static inline u8 ionic_v1_eqe_evt_type(u32 evt) +{ + return (evt >> IONIC_V1_EQE_TYPE_SHIFT) & IONIC_V1_EQE_TYPE_MASK; +} + +static inline u8 ionic_v1_eqe_evt_code(u32 evt) +{ + return (evt >> IONIC_V1_EQE_CODE_SHIFT) & IONIC_V1_EQE_CODE_MASK; +} + +static inline u32 ionic_v1_eqe_evt_qid(u32 evt) +{ + return evt >> IONIC_V1_EQE_QID_SHIFT; +} + +enum ionic_v1_stat_bits { + IONIC_V1_STAT_TYPE_SHIFT = 28, + IONIC_V1_STAT_TYPE_NONE = 0, + IONIC_V1_STAT_TYPE_8 = 1, + IONIC_V1_STAT_TYPE_LE16 = 2, + IONIC_V1_STAT_TYPE_LE32 = 3, + IONIC_V1_STAT_TYPE_LE64 = 4, + IONIC_V1_STAT_TYPE_BE16 = 5, + IONIC_V1_STAT_TYPE_BE32 = 6, + IONIC_V1_STAT_TYPE_BE64 = 7, + IONIC_V1_STAT_OFF_MASK = BIT(IONIC_V1_STAT_TYPE_SHIFT) - 1, +}; + +struct ionic_v1_stat { + union { + __be32 be_type_off; + u32 type_off; + }; + char name[28]; +}; + +static inline int ionic_v1_stat_type(struct ionic_v1_stat *hdr) +{ + return hdr->type_off >> IONIC_V1_STAT_TYPE_SHIFT; +} + +static inline unsigned int ionic_v1_stat_off(struct ionic_v1_stat *hdr) +{ + return hdr->type_off & IONIC_V1_STAT_OFF_MASK; +} + +#endif /* _IONIC_FW_H_ */ diff --git a/drivers/infiniband/hw/ionic/ionic_hw_stats.c b/drivers/infiniband/hw/ionic/ionic_hw_stats.c new file mode 100644 index 000000000000..244a80dde08f --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_hw_stats.c @@ -0,0 +1,484 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. 
*/ + +#include <linux/dma-mapping.h> + +#include "ionic_fw.h" +#include "ionic_ibdev.h" + +static int ionic_v1_stat_normalize(struct ionic_v1_stat *hw_stats, + int hw_stats_count) +{ + int hw_stat_i; + + for (hw_stat_i = 0; hw_stat_i < hw_stats_count; ++hw_stat_i) { + struct ionic_v1_stat *stat = &hw_stats[hw_stat_i]; + + stat->type_off = be32_to_cpu(stat->be_type_off); + stat->name[sizeof(stat->name) - 1] = 0; + if (ionic_v1_stat_type(stat) == IONIC_V1_STAT_TYPE_NONE) + break; + } + + return hw_stat_i; +} + +static void ionic_fill_stats_desc(struct rdma_stat_desc *hw_stats_hdrs, + struct ionic_v1_stat *hw_stats, + int hw_stats_count) +{ + int hw_stat_i; + + for (hw_stat_i = 0; hw_stat_i < hw_stats_count; ++hw_stat_i) { + struct ionic_v1_stat *stat = &hw_stats[hw_stat_i]; + + hw_stats_hdrs[hw_stat_i].name = stat->name; + } +} + +static u64 ionic_v1_stat_val(struct ionic_v1_stat *stat, + void *vals_buf, size_t vals_len) +{ + unsigned int off = ionic_v1_stat_off(stat); + int type = ionic_v1_stat_type(stat); + +#define __ionic_v1_stat_validate(__type) \ + ((off + sizeof(__type) <= vals_len) && \ + (IS_ALIGNED(off, sizeof(__type)))) + + switch (type) { + case IONIC_V1_STAT_TYPE_8: + if (__ionic_v1_stat_validate(u8)) + return *(u8 *)(vals_buf + off); + break; + case IONIC_V1_STAT_TYPE_LE16: + if (__ionic_v1_stat_validate(__le16)) + return le16_to_cpu(*(__le16 *)(vals_buf + off)); + break; + case IONIC_V1_STAT_TYPE_LE32: + if (__ionic_v1_stat_validate(__le32)) + return le32_to_cpu(*(__le32 *)(vals_buf + off)); + break; + case IONIC_V1_STAT_TYPE_LE64: + if (__ionic_v1_stat_validate(__le64)) + return le64_to_cpu(*(__le64 *)(vals_buf + off)); + break; + case IONIC_V1_STAT_TYPE_BE16: + if (__ionic_v1_stat_validate(__be16)) + return be16_to_cpu(*(__be16 *)(vals_buf + off)); + break; + case IONIC_V1_STAT_TYPE_BE32: + if (__ionic_v1_stat_validate(__be32)) + return be32_to_cpu(*(__be32 *)(vals_buf + off)); + break; + case IONIC_V1_STAT_TYPE_BE64: + if (__ionic_v1_stat_validate(__be64)) + return be64_to_cpu(*(__be64 *)(vals_buf + off)); + break; + } + + return ~0ull; +#undef __ionic_v1_stat_validate +} + +static int ionic_hw_stats_cmd(struct ionic_ibdev *dev, + dma_addr_t dma, size_t len, int qid, int op) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = op, + .len = cpu_to_le16(IONIC_ADMIN_STATS_HDRS_IN_V1_LEN), + .cmd.stats = { + .dma_addr = cpu_to_le64(dma), + .length = cpu_to_le32(len), + .id_ver = cpu_to_le32(qid), + }, + } + }; + + if (dev->lif_cfg.admin_opcodes <= op) + return -EBADRQC; + + ionic_admin_post(dev, &wr); + + return ionic_admin_wait(dev, &wr, IONIC_ADMIN_F_INTERRUPT); +} + +static int ionic_init_hw_stats(struct ionic_ibdev *dev) +{ + dma_addr_t hw_stats_dma; + int rc, hw_stats_count; + + if (dev->hw_stats_hdrs) + return 0; + + dev->hw_stats_count = 0; + + /* buffer for current values from the device */ + dev->hw_stats_buf = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!dev->hw_stats_buf) { + rc = -ENOMEM; + goto err_buf; + } + + /* buffer for names, sizes, offsets of values */ + dev->hw_stats = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!dev->hw_stats) { + rc = -ENOMEM; + goto err_hw_stats; + } + + /* request the names, sizes, offsets */ + hw_stats_dma = dma_map_single(dev->lif_cfg.hwdev, dev->hw_stats, + PAGE_SIZE, DMA_FROM_DEVICE); + rc = dma_mapping_error(dev->lif_cfg.hwdev, hw_stats_dma); + if (rc) + goto err_dma; + + rc = ionic_hw_stats_cmd(dev, hw_stats_dma, PAGE_SIZE, 0, + IONIC_V1_ADMIN_STATS_HDRS); + if (rc) + goto err_cmd; + + 
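+	/* The stat headers were DMA'd into dev->hw_stats; unmap before parsing the buffer in place. */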
dma_unmap_single(dev->lif_cfg.hwdev, hw_stats_dma, PAGE_SIZE, DMA_FROM_DEVICE); + + /* normalize and count the number of hw_stats */ + hw_stats_count = + ionic_v1_stat_normalize(dev->hw_stats, + PAGE_SIZE / sizeof(*dev->hw_stats)); + if (!hw_stats_count) { + rc = -ENODATA; + goto err_dma; + } + + dev->hw_stats_count = hw_stats_count; + + /* alloc and init array of names, for alloc_hw_stats */ + dev->hw_stats_hdrs = kcalloc(hw_stats_count, + sizeof(*dev->hw_stats_hdrs), + GFP_KERNEL); + if (!dev->hw_stats_hdrs) { + rc = -ENOMEM; + goto err_dma; + } + + ionic_fill_stats_desc(dev->hw_stats_hdrs, dev->hw_stats, + hw_stats_count); + + return 0; + +err_cmd: + dma_unmap_single(dev->lif_cfg.hwdev, hw_stats_dma, PAGE_SIZE, DMA_FROM_DEVICE); +err_dma: + kfree(dev->hw_stats); +err_hw_stats: + kfree(dev->hw_stats_buf); +err_buf: + dev->hw_stats_count = 0; + dev->hw_stats = NULL; + dev->hw_stats_buf = NULL; + dev->hw_stats_hdrs = NULL; + return rc; +} + +static struct rdma_hw_stats *ionic_alloc_hw_stats(struct ib_device *ibdev, + u32 port) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibdev); + + if (port != 1) + return NULL; + + return rdma_alloc_hw_stats_struct(dev->hw_stats_hdrs, + dev->hw_stats_count, + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +static int ionic_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *hw_stats, + u32 port, int index) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibdev); + dma_addr_t hw_stats_dma; + int rc, hw_stat_i; + + if (port != 1) + return -EINVAL; + + hw_stats_dma = dma_map_single(dev->lif_cfg.hwdev, dev->hw_stats_buf, + PAGE_SIZE, DMA_FROM_DEVICE); + rc = dma_mapping_error(dev->lif_cfg.hwdev, hw_stats_dma); + if (rc) + goto err_dma; + + rc = ionic_hw_stats_cmd(dev, hw_stats_dma, PAGE_SIZE, + 0, IONIC_V1_ADMIN_STATS_VALS); + if (rc) + goto err_cmd; + + dma_unmap_single(dev->lif_cfg.hwdev, hw_stats_dma, + PAGE_SIZE, DMA_FROM_DEVICE); + + for (hw_stat_i = 0; hw_stat_i < dev->hw_stats_count; ++hw_stat_i) + hw_stats->value[hw_stat_i] = + ionic_v1_stat_val(&dev->hw_stats[hw_stat_i], + dev->hw_stats_buf, PAGE_SIZE); + + return hw_stat_i; + +err_cmd: + dma_unmap_single(dev->lif_cfg.hwdev, hw_stats_dma, + PAGE_SIZE, DMA_FROM_DEVICE); +err_dma: + return rc; +} + +static struct rdma_hw_stats * +ionic_counter_alloc_stats(struct rdma_counter *counter) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(counter->device); + struct ionic_counter *cntr; + int err; + + cntr = kzalloc(sizeof(*cntr), GFP_KERNEL); + if (!cntr) + return NULL; + + /* buffer for current values from the device */ + cntr->vals = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!cntr->vals) + goto err_vals; + + err = xa_alloc(&dev->counter_stats->xa_counters, &counter->id, + cntr, + XA_LIMIT(0, IONIC_MAX_QPID), + GFP_KERNEL); + if (err) + goto err_xa; + + INIT_LIST_HEAD(&cntr->qp_list); + + return rdma_alloc_hw_stats_struct(dev->counter_stats->stats_hdrs, + dev->counter_stats->queue_stats_count, + RDMA_HW_STATS_DEFAULT_LIFESPAN); +err_xa: + kfree(cntr->vals); +err_vals: + kfree(cntr); + + return NULL; +} + +static int ionic_counter_dealloc(struct rdma_counter *counter) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(counter->device); + struct ionic_counter *cntr; + + cntr = xa_erase(&dev->counter_stats->xa_counters, counter->id); + if (!cntr) + return -EINVAL; + + kfree(cntr->vals); + kfree(cntr); + + return 0; +} + +static int ionic_counter_bind_qp(struct rdma_counter *counter, + struct ib_qp *ibqp, + u32 port) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(counter->device); + struct ionic_qp *qp = to_ionic_qp(ibqp); + 
struct ionic_counter *cntr; + + cntr = xa_load(&dev->counter_stats->xa_counters, counter->id); + if (!cntr) + return -EINVAL; + + list_add_tail(&qp->qp_list_counter, &cntr->qp_list); + ibqp->counter = counter; + + return 0; +} + +static int ionic_counter_unbind_qp(struct ib_qp *ibqp, u32 port) +{ + struct ionic_qp *qp = to_ionic_qp(ibqp); + + if (ibqp->counter) { + list_del(&qp->qp_list_counter); + ibqp->counter = NULL; + } + + return 0; +} + +static int ionic_get_qp_stats(struct ib_device *ibdev, + struct rdma_hw_stats *hw_stats, + u32 counter_id) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibdev); + struct ionic_counter_stats *cs; + struct ionic_counter *cntr; + dma_addr_t hw_stats_dma; + struct ionic_qp *qp; + int rc, stat_i = 0; + + cs = dev->counter_stats; + cntr = xa_load(&cs->xa_counters, counter_id); + if (!cntr) + return -EINVAL; + + hw_stats_dma = dma_map_single(dev->lif_cfg.hwdev, cntr->vals, + PAGE_SIZE, DMA_FROM_DEVICE); + rc = dma_mapping_error(dev->lif_cfg.hwdev, hw_stats_dma); + if (rc) + return rc; + + memset(hw_stats->value, 0, sizeof(u64) * hw_stats->num_counters); + + list_for_each_entry(qp, &cntr->qp_list, qp_list_counter) { + rc = ionic_hw_stats_cmd(dev, hw_stats_dma, PAGE_SIZE, + qp->qpid, + IONIC_V1_ADMIN_QP_STATS_VALS); + if (rc) + goto err_cmd; + + for (stat_i = 0; stat_i < cs->queue_stats_count; ++stat_i) + hw_stats->value[stat_i] += + ionic_v1_stat_val(&cs->hdr[stat_i], + cntr->vals, + PAGE_SIZE); + } + + dma_unmap_single(dev->lif_cfg.hwdev, hw_stats_dma, PAGE_SIZE, DMA_FROM_DEVICE); + return stat_i; + +err_cmd: + dma_unmap_single(dev->lif_cfg.hwdev, hw_stats_dma, PAGE_SIZE, DMA_FROM_DEVICE); + + return rc; +} + +static int ionic_counter_update_stats(struct rdma_counter *counter) +{ + return ionic_get_qp_stats(counter->device, counter->stats, counter->id); +} + +static int ionic_alloc_counters(struct ionic_ibdev *dev) +{ + struct ionic_counter_stats *cs = dev->counter_stats; + int rc, hw_stats_count; + dma_addr_t hdr_dma; + + /* buffer for names, sizes, offsets of values */ + cs->hdr = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!cs->hdr) + return -ENOMEM; + + hdr_dma = dma_map_single(dev->lif_cfg.hwdev, cs->hdr, + PAGE_SIZE, DMA_FROM_DEVICE); + rc = dma_mapping_error(dev->lif_cfg.hwdev, hdr_dma); + if (rc) + goto err_dma; + + rc = ionic_hw_stats_cmd(dev, hdr_dma, PAGE_SIZE, 0, + IONIC_V1_ADMIN_QP_STATS_HDRS); + if (rc) + goto err_cmd; + + dma_unmap_single(dev->lif_cfg.hwdev, hdr_dma, PAGE_SIZE, DMA_FROM_DEVICE); + + /* normalize and count the number of hw_stats */ + hw_stats_count = ionic_v1_stat_normalize(cs->hdr, + PAGE_SIZE / sizeof(*cs->hdr)); + if (!hw_stats_count) { + rc = -ENODATA; + goto err_dma; + } + + cs->queue_stats_count = hw_stats_count; + + /* alloc and init array of names */ + cs->stats_hdrs = kcalloc(hw_stats_count, sizeof(*cs->stats_hdrs), + GFP_KERNEL); + if (!cs->stats_hdrs) { + rc = -ENOMEM; + goto err_dma; + } + + ionic_fill_stats_desc(cs->stats_hdrs, cs->hdr, hw_stats_count); + + return 0; + +err_cmd: + dma_unmap_single(dev->lif_cfg.hwdev, hdr_dma, PAGE_SIZE, DMA_FROM_DEVICE); +err_dma: + kfree(cs->hdr); + + return rc; +} + +static const struct ib_device_ops ionic_hw_stats_ops = { + .driver_id = RDMA_DRIVER_IONIC, + .alloc_hw_port_stats = ionic_alloc_hw_stats, + .get_hw_stats = ionic_get_hw_stats, +}; + +static const struct ib_device_ops ionic_counter_stats_ops = { + .counter_alloc_stats = ionic_counter_alloc_stats, + .counter_dealloc = ionic_counter_dealloc, + .counter_bind_qp = ionic_counter_bind_qp, + .counter_unbind_qp = 
ionic_counter_unbind_qp, + .counter_update_stats = ionic_counter_update_stats, +}; + +void ionic_stats_init(struct ionic_ibdev *dev) +{ + u16 stats_type = dev->lif_cfg.stats_type; + int rc; + + if (stats_type & IONIC_LIF_RDMA_STAT_GLOBAL) { + rc = ionic_init_hw_stats(dev); + if (rc) + ibdev_dbg(&dev->ibdev, "Failed to init hw stats\n"); + else + ib_set_device_ops(&dev->ibdev, &ionic_hw_stats_ops); + } + + if (stats_type & IONIC_LIF_RDMA_STAT_QP) { + dev->counter_stats = kzalloc(sizeof(*dev->counter_stats), + GFP_KERNEL); + if (!dev->counter_stats) + return; + + rc = ionic_alloc_counters(dev); + if (rc) { + ibdev_dbg(&dev->ibdev, "Failed to init counter stats\n"); + kfree(dev->counter_stats); + dev->counter_stats = NULL; + return; + } + + xa_init_flags(&dev->counter_stats->xa_counters, XA_FLAGS_ALLOC); + + ib_set_device_ops(&dev->ibdev, &ionic_counter_stats_ops); + } +} + +void ionic_stats_cleanup(struct ionic_ibdev *dev) +{ + if (dev->counter_stats) { + xa_destroy(&dev->counter_stats->xa_counters); + kfree(dev->counter_stats->hdr); + kfree(dev->counter_stats->stats_hdrs); + kfree(dev->counter_stats); + dev->counter_stats = NULL; + } + + kfree(dev->hw_stats); + kfree(dev->hw_stats_buf); + kfree(dev->hw_stats_hdrs); +} diff --git a/drivers/infiniband/hw/ionic/ionic_ibdev.c b/drivers/infiniband/hw/ionic/ionic_ibdev.c new file mode 100644 index 000000000000..164046d00e5d --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_ibdev.c @@ -0,0 +1,440 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. */ + +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/pci.h> +#include <linux/irq.h> +#include <net/addrconf.h> +#include <rdma/ib_addr.h> +#include <rdma/ib_mad.h> + +#include "ionic_ibdev.h" + +#define DRIVER_DESCRIPTION "AMD Pensando RoCE HCA driver" +#define DEVICE_DESCRIPTION "AMD Pensando RoCE HCA" + +MODULE_AUTHOR("Allen Hubbe <allen.hubbe@amd.com>"); +MODULE_DESCRIPTION(DRIVER_DESCRIPTION); +MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS("NET_IONIC"); + +static int ionic_query_device(struct ib_device *ibdev, + struct ib_device_attr *attr, + struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibdev); + struct net_device *ndev; + + ndev = ib_device_get_netdev(ibdev, 1); + addrconf_ifid_eui48((u8 *)&attr->sys_image_guid, ndev); + dev_put(ndev); + attr->max_mr_size = dev->lif_cfg.npts_per_lif * PAGE_SIZE / 2; + attr->page_size_cap = dev->lif_cfg.page_size_supported; + + attr->vendor_id = to_pci_dev(dev->lif_cfg.hwdev)->vendor; + attr->vendor_part_id = to_pci_dev(dev->lif_cfg.hwdev)->device; + + attr->hw_ver = ionic_lif_asic_rev(dev->lif_cfg.lif); + attr->fw_ver = 0; + attr->max_qp = dev->lif_cfg.qp_count; + attr->max_qp_wr = IONIC_MAX_DEPTH; + attr->device_cap_flags = + IB_DEVICE_MEM_WINDOW | + IB_DEVICE_MEM_MGT_EXTENSIONS | + IB_DEVICE_MEM_WINDOW_TYPE_2B | + 0; + attr->max_send_sge = + min(ionic_v1_send_wqe_max_sge(dev->lif_cfg.max_stride, 0, false), + IONIC_SPEC_HIGH); + attr->max_recv_sge = + min(ionic_v1_recv_wqe_max_sge(dev->lif_cfg.max_stride, 0, false), + IONIC_SPEC_HIGH); + attr->max_sge_rd = attr->max_send_sge; + attr->max_cq = dev->lif_cfg.cq_count / dev->lif_cfg.udma_count; + attr->max_cqe = IONIC_MAX_CQ_DEPTH - IONIC_CQ_GRACE; + attr->max_mr = dev->lif_cfg.nmrs_per_lif; + attr->max_pd = IONIC_MAX_PD; + attr->max_qp_rd_atom = IONIC_MAX_RD_ATOM; + attr->max_ee_rd_atom = 0; + attr->max_res_rd_atom = IONIC_MAX_RD_ATOM; + attr->max_qp_init_rd_atom = IONIC_MAX_RD_ATOM; + attr->max_ee_init_rd_atom = 0; + 
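+	/* RD/EE queue pairs are not supported, so the EE limits above remain zero. */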
attr->atomic_cap = IB_ATOMIC_GLOB; + attr->masked_atomic_cap = IB_ATOMIC_GLOB; + attr->max_mw = dev->lif_cfg.nmrs_per_lif; + attr->max_mcast_grp = 0; + attr->max_mcast_qp_attach = 0; + attr->max_ah = dev->lif_cfg.nahs_per_lif; + attr->max_fast_reg_page_list_len = dev->lif_cfg.npts_per_lif / 2; + attr->max_pkeys = IONIC_PKEY_TBL_LEN; + + return 0; +} + +static int ionic_query_port(struct ib_device *ibdev, u32 port, + struct ib_port_attr *attr) +{ + struct net_device *ndev; + + if (port != 1) + return -EINVAL; + + ndev = ib_device_get_netdev(ibdev, port); + + if (netif_running(ndev) && netif_carrier_ok(ndev)) { + attr->state = IB_PORT_ACTIVE; + attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; + } else if (netif_running(ndev)) { + attr->state = IB_PORT_DOWN; + attr->phys_state = IB_PORT_PHYS_STATE_POLLING; + } else { + attr->state = IB_PORT_DOWN; + attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; + } + + attr->max_mtu = iboe_get_mtu(ndev->max_mtu); + attr->active_mtu = min(attr->max_mtu, iboe_get_mtu(ndev->mtu)); + attr->gid_tbl_len = IONIC_GID_TBL_LEN; + attr->ip_gids = true; + attr->port_cap_flags = 0; + attr->max_msg_sz = 0x80000000; + attr->pkey_tbl_len = IONIC_PKEY_TBL_LEN; + attr->max_vl_num = 1; + attr->subnet_prefix = 0xfe80000000000000ull; + + dev_put(ndev); + + return ib_get_eth_speed(ibdev, port, + &attr->active_speed, + &attr->active_width); +} + +static enum rdma_link_layer ionic_get_link_layer(struct ib_device *ibdev, + u32 port) +{ + return IB_LINK_LAYER_ETHERNET; +} + +static int ionic_query_pkey(struct ib_device *ibdev, u32 port, u16 index, + u16 *pkey) +{ + if (port != 1) + return -EINVAL; + + if (index != 0) + return -EINVAL; + + *pkey = IB_DEFAULT_PKEY_FULL; + + return 0; +} + +static int ionic_modify_device(struct ib_device *ibdev, int mask, + struct ib_device_modify *attr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibdev); + + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (mask & IB_DEVICE_MODIFY_NODE_DESC) + memcpy(dev->ibdev.node_desc, attr->node_desc, + IB_DEVICE_NODE_DESC_MAX); + + return 0; +} + +static int ionic_get_port_immutable(struct ib_device *ibdev, u32 port, + struct ib_port_immutable *attr) +{ + if (port != 1) + return -EINVAL; + + attr->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + + attr->pkey_tbl_len = IONIC_PKEY_TBL_LEN; + attr->gid_tbl_len = IONIC_GID_TBL_LEN; + attr->max_mad_size = IB_MGMT_MAD_SIZE; + + return 0; +} + +static void ionic_get_dev_fw_str(struct ib_device *ibdev, char *str) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibdev); + + ionic_lif_fw_version(dev->lif_cfg.lif, str, IB_FW_VERSION_NAME_MAX); +} + +static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ionic_ibdev *dev = + rdma_device_to_drv_device(device, struct ionic_ibdev, ibdev); + + return sysfs_emit(buf, "0x%x\n", ionic_lif_asic_rev(dev->lif_cfg.lif)); +} +static DEVICE_ATTR_RO(hw_rev); + +static ssize_t hca_type_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ionic_ibdev *dev = + rdma_device_to_drv_device(device, struct ionic_ibdev, ibdev); + + return sysfs_emit(buf, "%s\n", dev->ibdev.node_desc); +} +static DEVICE_ATTR_RO(hca_type); + +static struct attribute *ionic_rdma_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + NULL +}; + +static const struct attribute_group ionic_rdma_attr_group = { + .attrs = ionic_rdma_attributes, +}; + +static void ionic_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ + /* + * Dummy define 
disassociate_ucontext so that it does not + * wait for user context before cleaning up hw resources. + */ +} + +static const struct ib_device_ops ionic_dev_ops = { + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_IONIC, + .uverbs_abi_ver = IONIC_ABI_VERSION, + + .alloc_ucontext = ionic_alloc_ucontext, + .dealloc_ucontext = ionic_dealloc_ucontext, + .mmap = ionic_mmap, + .mmap_free = ionic_mmap_free, + .alloc_pd = ionic_alloc_pd, + .dealloc_pd = ionic_dealloc_pd, + .create_ah = ionic_create_ah, + .query_ah = ionic_query_ah, + .destroy_ah = ionic_destroy_ah, + .create_user_ah = ionic_create_ah, + .get_dma_mr = ionic_get_dma_mr, + .reg_user_mr = ionic_reg_user_mr, + .reg_user_mr_dmabuf = ionic_reg_user_mr_dmabuf, + .dereg_mr = ionic_dereg_mr, + .alloc_mr = ionic_alloc_mr, + .map_mr_sg = ionic_map_mr_sg, + .alloc_mw = ionic_alloc_mw, + .dealloc_mw = ionic_dealloc_mw, + .create_cq = ionic_create_cq, + .destroy_cq = ionic_destroy_cq, + .create_qp = ionic_create_qp, + .modify_qp = ionic_modify_qp, + .query_qp = ionic_query_qp, + .destroy_qp = ionic_destroy_qp, + + .post_send = ionic_post_send, + .post_recv = ionic_post_recv, + .poll_cq = ionic_poll_cq, + .req_notify_cq = ionic_req_notify_cq, + + .query_device = ionic_query_device, + .query_port = ionic_query_port, + .get_link_layer = ionic_get_link_layer, + .query_pkey = ionic_query_pkey, + .modify_device = ionic_modify_device, + .get_port_immutable = ionic_get_port_immutable, + .get_dev_fw_str = ionic_get_dev_fw_str, + .device_group = &ionic_rdma_attr_group, + .disassociate_ucontext = ionic_disassociate_ucontext, + + INIT_RDMA_OBJ_SIZE(ib_ucontext, ionic_ctx, ibctx), + INIT_RDMA_OBJ_SIZE(ib_pd, ionic_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ah, ionic_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, ionic_vcq, ibcq), + INIT_RDMA_OBJ_SIZE(ib_qp, ionic_qp, ibqp), + INIT_RDMA_OBJ_SIZE(ib_mw, ionic_mr, ibmw), +}; + +static void ionic_init_resids(struct ionic_ibdev *dev) +{ + ionic_resid_init(&dev->inuse_cqid, dev->lif_cfg.cq_count); + dev->half_cqid_udma_shift = + order_base_2(dev->lif_cfg.cq_count / dev->lif_cfg.udma_count); + ionic_resid_init(&dev->inuse_pdid, IONIC_MAX_PD); + ionic_resid_init(&dev->inuse_ahid, dev->lif_cfg.nahs_per_lif); + ionic_resid_init(&dev->inuse_mrid, dev->lif_cfg.nmrs_per_lif); + /* skip reserved lkey */ + dev->next_mrkey = 1; + ionic_resid_init(&dev->inuse_qpid, dev->lif_cfg.qp_count); + /* skip reserved SMI and GSI qpids */ + dev->half_qpid_udma_shift = + order_base_2(dev->lif_cfg.qp_count / dev->lif_cfg.udma_count); + ionic_resid_init(&dev->inuse_dbid, dev->lif_cfg.dbid_count); +} + +static void ionic_destroy_resids(struct ionic_ibdev *dev) +{ + ionic_resid_destroy(&dev->inuse_cqid); + ionic_resid_destroy(&dev->inuse_pdid); + ionic_resid_destroy(&dev->inuse_ahid); + ionic_resid_destroy(&dev->inuse_mrid); + ionic_resid_destroy(&dev->inuse_qpid); + ionic_resid_destroy(&dev->inuse_dbid); +} + +static void ionic_destroy_ibdev(struct ionic_ibdev *dev) +{ + ionic_kill_rdma_admin(dev, false); + ib_unregister_device(&dev->ibdev); + ionic_stats_cleanup(dev); + ionic_destroy_rdma_admin(dev); + ionic_destroy_resids(dev); + WARN_ON(!xa_empty(&dev->qp_tbl)); + xa_destroy(&dev->qp_tbl); + WARN_ON(!xa_empty(&dev->cq_tbl)); + xa_destroy(&dev->cq_tbl); + ib_dealloc_device(&dev->ibdev); +} + +static struct ionic_ibdev *ionic_create_ibdev(struct ionic_aux_dev *ionic_adev) +{ + struct ib_device *ibdev; + struct ionic_ibdev *dev; + struct net_device *ndev; + int rc; + + dev = ib_alloc_device(ionic_ibdev, ibdev); + if (!dev) + return ERR_PTR(-EINVAL); + + 
ionic_fill_lif_cfg(ionic_adev->lif, &dev->lif_cfg); + + xa_init_flags(&dev->qp_tbl, GFP_ATOMIC); + xa_init_flags(&dev->cq_tbl, GFP_ATOMIC); + + ionic_init_resids(dev); + + rc = ionic_rdma_reset_devcmd(dev); + if (rc) + goto err_reset; + + rc = ionic_create_rdma_admin(dev); + if (rc) + goto err_admin; + + ibdev = &dev->ibdev; + ibdev->dev.parent = dev->lif_cfg.hwdev; + + strscpy(ibdev->name, "ionic_%d", IB_DEVICE_NAME_MAX); + strscpy(ibdev->node_desc, DEVICE_DESCRIPTION, IB_DEVICE_NODE_DESC_MAX); + + ibdev->node_type = RDMA_NODE_IB_CA; + ibdev->phys_port_cnt = 1; + + /* the first two eq are reserved for async events */ + ibdev->num_comp_vectors = dev->lif_cfg.eq_count - 2; + + ndev = ionic_lif_netdev(ionic_adev->lif); + addrconf_ifid_eui48((u8 *)&ibdev->node_guid, ndev); + rc = ib_device_set_netdev(ibdev, ndev, 1); + /* ionic_lif_netdev() returns ndev with refcount held */ + dev_put(ndev); + if (rc) + goto err_admin; + + ib_set_device_ops(&dev->ibdev, &ionic_dev_ops); + + ionic_stats_init(dev); + + rc = ib_register_device(ibdev, "ionic_%d", ibdev->dev.parent); + if (rc) + goto err_register; + + return dev; + +err_register: + ionic_stats_cleanup(dev); +err_admin: + ionic_kill_rdma_admin(dev, false); + ionic_destroy_rdma_admin(dev); +err_reset: + ionic_destroy_resids(dev); + xa_destroy(&dev->qp_tbl); + xa_destroy(&dev->cq_tbl); + ib_dealloc_device(&dev->ibdev); + + return ERR_PTR(rc); +} + +static int ionic_aux_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) +{ + struct ionic_aux_dev *ionic_adev; + struct ionic_ibdev *dev; + + ionic_adev = container_of(adev, struct ionic_aux_dev, adev); + dev = ionic_create_ibdev(ionic_adev); + if (IS_ERR(dev)) + return dev_err_probe(&adev->dev, PTR_ERR(dev), + "Failed to register ibdev\n"); + + auxiliary_set_drvdata(adev, dev); + ibdev_dbg(&dev->ibdev, "registered\n"); + + return 0; +} + +static void ionic_aux_remove(struct auxiliary_device *adev) +{ + struct ionic_ibdev *dev = auxiliary_get_drvdata(adev); + + dev_dbg(&adev->dev, "unregister ibdev\n"); + ionic_destroy_ibdev(dev); + dev_dbg(&adev->dev, "unregistered\n"); +} + +static const struct auxiliary_device_id ionic_aux_id_table[] = { + { .name = "ionic.rdma", }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary, ionic_aux_id_table); + +static struct auxiliary_driver ionic_aux_r_driver = { + .name = "rdma", + .probe = ionic_aux_probe, + .remove = ionic_aux_remove, + .id_table = ionic_aux_id_table, +}; + +static int __init ionic_mod_init(void) +{ + int rc; + + ionic_evt_workq = create_workqueue(KBUILD_MODNAME "-evt"); + if (!ionic_evt_workq) + return -ENOMEM; + + rc = auxiliary_driver_register(&ionic_aux_r_driver); + if (rc) + goto err_aux; + + return 0; + +err_aux: + destroy_workqueue(ionic_evt_workq); + + return rc; +} + +static void __exit ionic_mod_exit(void) +{ + auxiliary_driver_unregister(&ionic_aux_r_driver); + destroy_workqueue(ionic_evt_workq); +} + +module_init(ionic_mod_init); +module_exit(ionic_mod_exit); diff --git a/drivers/infiniband/hw/ionic/ionic_ibdev.h b/drivers/infiniband/hw/ionic/ionic_ibdev.h new file mode 100644 index 000000000000..82fda1e3cdb6 --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_ibdev.h @@ -0,0 +1,517 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. 
*/ + +#ifndef _IONIC_IBDEV_H_ +#define _IONIC_IBDEV_H_ + +#include <rdma/ib_umem.h> +#include <rdma/ib_verbs.h> +#include <rdma/ib_pack.h> +#include <rdma/uverbs_ioctl.h> + +#include <rdma/ionic-abi.h> +#include <ionic_api.h> +#include <ionic_regs.h> + +#include "ionic_fw.h" +#include "ionic_queue.h" +#include "ionic_res.h" + +#include "ionic_lif_cfg.h" + +/* Config knobs */ +#define IONIC_EQ_DEPTH 511 +#define IONIC_EQ_COUNT 32 +#define IONIC_AQ_DEPTH 63 +#define IONIC_AQ_COUNT 4 +#define IONIC_EQ_ISR_BUDGET 10 +#define IONIC_EQ_WORK_BUDGET 1000 +#define IONIC_MAX_RD_ATOM 16 +#define IONIC_PKEY_TBL_LEN 1 +#define IONIC_GID_TBL_LEN 256 + +#define IONIC_MAX_QPID 0xffffff +#define IONIC_SPEC_HIGH 8 +#define IONIC_MAX_PD 1024 +#define IONIC_SPEC_HIGH 8 +#define IONIC_SQCMB_ORDER 5 +#define IONIC_RQCMB_ORDER 0 + +#define IONIC_META_LAST ((void *)1ul) +#define IONIC_META_POSTED ((void *)2ul) + +#define IONIC_CQ_GRACE 100 + +#define IONIC_ROCE_UDP_SPORT 28272 +#define IONIC_DMA_LKEY 0 +#define IONIC_DMA_RKEY IONIC_DMA_LKEY + +#define IONIC_CMB_SUPPORTED \ + (IONIC_CMB_ENABLE | IONIC_CMB_REQUIRE | IONIC_CMB_EXPDB | \ + IONIC_CMB_WC | IONIC_CMB_UC) + +/* resource is not reserved on the device, indicated in tbl_order */ +#define IONIC_RES_INVALID -1 + +struct ionic_aq; +struct ionic_cq; +struct ionic_eq; +struct ionic_vcq; + +enum ionic_admin_state { + IONIC_ADMIN_ACTIVE, /* submitting admin commands to queue */ + IONIC_ADMIN_PAUSED, /* not submitting, but may complete normally */ + IONIC_ADMIN_KILLED, /* not submitting, locally completed */ +}; + +enum ionic_admin_flags { + IONIC_ADMIN_F_BUSYWAIT = BIT(0), /* Don't sleep */ + IONIC_ADMIN_F_TEARDOWN = BIT(1), /* In destroy path */ + IONIC_ADMIN_F_INTERRUPT = BIT(2), /* Interruptible w/timeout */ +}; + +enum ionic_mmap_flag { + IONIC_MMAP_WC = BIT(0), +}; + +struct ionic_mmap_entry { + struct rdma_user_mmap_entry rdma_entry; + unsigned long size; + unsigned long pfn; + u8 mmap_flags; +}; + +struct ionic_ibdev { + struct ib_device ibdev; + + struct ionic_lif_cfg lif_cfg; + + struct xarray qp_tbl; + struct xarray cq_tbl; + + struct ionic_resid_bits inuse_dbid; + struct ionic_resid_bits inuse_pdid; + struct ionic_resid_bits inuse_ahid; + struct ionic_resid_bits inuse_mrid; + struct ionic_resid_bits inuse_qpid; + struct ionic_resid_bits inuse_cqid; + + u8 half_cqid_udma_shift; + u8 half_qpid_udma_shift; + u8 next_qpid_udma_idx; + u8 next_mrkey; + + struct work_struct reset_work; + bool reset_posted; + u32 reset_cnt; + + struct delayed_work admin_dwork; + struct ionic_aq **aq_vec; + atomic_t admin_state; + + struct ionic_eq **eq_vec; + + struct ionic_v1_stat *hw_stats; + void *hw_stats_buf; + struct rdma_stat_desc *hw_stats_hdrs; + struct ionic_counter_stats *counter_stats; + int hw_stats_count; +}; + +struct ionic_eq { + struct ionic_ibdev *dev; + + u32 eqid; + u32 intr; + + struct ionic_queue q; + + int armed; + bool enable; + + struct work_struct work; + + int irq; + char name[32]; +}; + +struct ionic_admin_wr { + struct completion work; + struct list_head aq_ent; + struct ionic_v1_admin_wqe wqe; + struct ionic_v1_cqe cqe; + struct ionic_aq *aq; + int status; +}; + +struct ionic_admin_wr_q { + struct ionic_admin_wr *wr; + int wqe_strides; +}; + +struct ionic_aq { + struct ionic_ibdev *dev; + struct ionic_vcq *vcq; + + struct work_struct work; + + atomic_t admin_state; + unsigned long stamp; + bool armed; + + u32 aqid; + u32 cqid; + + spinlock_t lock; /* for posting */ + struct ionic_queue q; + struct ionic_admin_wr_q *q_wr; + struct list_head wr_prod; 
+ struct list_head wr_post; +}; + +struct ionic_ctx { + struct ib_ucontext ibctx; + u32 dbid; + struct rdma_user_mmap_entry *mmap_dbell; +}; + +struct ionic_tbl_buf { + u32 tbl_limit; + u32 tbl_pages; + size_t tbl_size; + __le64 *tbl_buf; + dma_addr_t tbl_dma; + u8 page_size_log2; +}; + +struct ionic_pd { + struct ib_pd ibpd; + + u32 pdid; + u32 flags; +}; + +struct ionic_cq { + struct ionic_vcq *vcq; + + u32 cqid; + u32 eqid; + + spinlock_t lock; /* for polling */ + struct list_head poll_sq; + bool flush; + struct list_head flush_sq; + struct list_head flush_rq; + struct list_head ibkill_flush_ent; + + struct ionic_queue q; + bool color; + int credit; + u16 arm_any_prod; + u16 arm_sol_prod; + + struct kref cq_kref; + struct completion cq_rel_comp; + + /* infrequently accessed, keep at end */ + struct ib_umem *umem; +}; + +struct ionic_vcq { + struct ib_cq ibcq; + struct ionic_cq cq[2]; + u8 udma_mask; + u8 poll_idx; +}; + +struct ionic_sq_meta { + u64 wrid; + u32 len; + u16 seq; + u8 ibop; + u8 ibsts; + u8 remote:1; + u8 signal:1; + u8 local_comp:1; +}; + +struct ionic_rq_meta { + struct ionic_rq_meta *next; + u64 wrid; +}; + +struct ionic_qp { + struct ib_qp ibqp; + enum ib_qp_state state; + + u32 qpid; + u32 ahid; + u32 sq_cqid; + u32 rq_cqid; + u8 udma_idx; + u8 has_ah:1; + u8 has_sq:1; + u8 has_rq:1; + u8 sig_all:1; + + struct list_head qp_list_counter; + + struct list_head cq_poll_sq; + struct list_head cq_flush_sq; + struct list_head cq_flush_rq; + struct list_head ibkill_flush_ent; + + spinlock_t sq_lock; /* for posting and polling */ + struct ionic_queue sq; + struct ionic_sq_meta *sq_meta; + u16 *sq_msn_idx; + int sq_spec; + u16 sq_old_prod; + u16 sq_msn_prod; + u16 sq_msn_cons; + u8 sq_cmb; + bool sq_flush; + bool sq_flush_rcvd; + + spinlock_t rq_lock; /* for posting and polling */ + struct ionic_queue rq; + struct ionic_rq_meta *rq_meta; + struct ionic_rq_meta *rq_meta_head; + int rq_spec; + u16 rq_old_prod; + u8 rq_cmb; + bool rq_flush; + + struct kref qp_kref; + struct completion qp_rel_comp; + + /* infrequently accessed, keep at end */ + int sgid_index; + int sq_cmb_order; + u32 sq_cmb_pgid; + phys_addr_t sq_cmb_addr; + struct rdma_user_mmap_entry *mmap_sq_cmb; + + struct ib_umem *sq_umem; + + int rq_cmb_order; + u32 rq_cmb_pgid; + phys_addr_t rq_cmb_addr; + struct rdma_user_mmap_entry *mmap_rq_cmb; + + struct ib_umem *rq_umem; + + int dcqcn_profile; + + struct ib_ud_header *hdr; +}; + +struct ionic_ah { + struct ib_ah ibah; + u32 ahid; + int sgid_index; + struct ib_ud_header hdr; +}; + +struct ionic_mr { + union { + struct ib_mr ibmr; + struct ib_mw ibmw; + }; + + u32 mrid; + int flags; + + struct ib_umem *umem; + struct ionic_tbl_buf buf; + bool created; +}; + +struct ionic_counter_stats { + int queue_stats_count; + struct ionic_v1_stat *hdr; + struct rdma_stat_desc *stats_hdrs; + struct xarray xa_counters; +}; + +struct ionic_counter { + void *vals; + struct list_head qp_list; +}; + +static inline struct ionic_ibdev *to_ionic_ibdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct ionic_ibdev, ibdev); +} + +static inline struct ionic_ctx *to_ionic_ctx(struct ib_ucontext *ibctx) +{ + return container_of(ibctx, struct ionic_ctx, ibctx); +} + +static inline struct ionic_ctx *to_ionic_ctx_uobj(struct ib_uobject *uobj) +{ + if (!uobj) + return NULL; + + if (!uobj->context) + return NULL; + + return to_ionic_ctx(uobj->context); +} + +static inline struct ionic_pd *to_ionic_pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct ionic_pd, ibpd); +} + +static 
inline struct ionic_mr *to_ionic_mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct ionic_mr, ibmr); +} + +static inline struct ionic_mr *to_ionic_mw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct ionic_mr, ibmw); +} + +static inline struct ionic_vcq *to_ionic_vcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct ionic_vcq, ibcq); +} + +static inline struct ionic_cq *to_ionic_vcq_cq(struct ib_cq *ibcq, + uint8_t udma_idx) +{ + return &to_ionic_vcq(ibcq)->cq[udma_idx]; +} + +static inline struct ionic_qp *to_ionic_qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct ionic_qp, ibqp); +} + +static inline struct ionic_ah *to_ionic_ah(struct ib_ah *ibah) +{ + return container_of(ibah, struct ionic_ah, ibah); +} + +static inline u32 ionic_ctx_dbid(struct ionic_ibdev *dev, + struct ionic_ctx *ctx) +{ + if (!ctx) + return dev->lif_cfg.dbid; + + return ctx->dbid; +} + +static inline u32 ionic_obj_dbid(struct ionic_ibdev *dev, + struct ib_uobject *uobj) +{ + return ionic_ctx_dbid(dev, to_ionic_ctx_uobj(uobj)); +} + +static inline bool ionic_ibop_is_local(enum ib_wr_opcode op) +{ + return op == IB_WR_LOCAL_INV || op == IB_WR_REG_MR; +} + +static inline void ionic_qp_complete(struct kref *kref) +{ + struct ionic_qp *qp = container_of(kref, struct ionic_qp, qp_kref); + + complete(&qp->qp_rel_comp); +} + +static inline void ionic_cq_complete(struct kref *kref) +{ + struct ionic_cq *cq = container_of(kref, struct ionic_cq, cq_kref); + + complete(&cq->cq_rel_comp); +} + +/* ionic_admin.c */ +extern struct workqueue_struct *ionic_evt_workq; +void ionic_admin_post(struct ionic_ibdev *dev, struct ionic_admin_wr *wr); +int ionic_admin_wait(struct ionic_ibdev *dev, struct ionic_admin_wr *wr, + enum ionic_admin_flags); + +int ionic_rdma_reset_devcmd(struct ionic_ibdev *dev); + +int ionic_create_rdma_admin(struct ionic_ibdev *dev); +void ionic_destroy_rdma_admin(struct ionic_ibdev *dev); +void ionic_kill_rdma_admin(struct ionic_ibdev *dev, bool fatal_path); + +/* ionic_controlpath.c */ +int ionic_create_cq_common(struct ionic_vcq *vcq, + struct ionic_tbl_buf *buf, + const struct ib_cq_init_attr *attr, + struct ionic_ctx *ctx, + struct ib_udata *udata, + struct ionic_qdesc *req_cq, + __u32 *resp_cqid, + int udma_idx); +void ionic_destroy_cq_common(struct ionic_ibdev *dev, struct ionic_cq *cq); +void ionic_flush_qp(struct ionic_ibdev *dev, struct ionic_qp *qp); +void ionic_notify_flush_cq(struct ionic_cq *cq); + +int ionic_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *udata); +void ionic_dealloc_ucontext(struct ib_ucontext *ibctx); +int ionic_mmap(struct ib_ucontext *ibctx, struct vm_area_struct *vma); +void ionic_mmap_free(struct rdma_user_mmap_entry *rdma_entry); +int ionic_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +int ionic_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +int ionic_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata); +int ionic_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); +int ionic_destroy_ah(struct ib_ah *ibah, u32 flags); +struct ib_mr *ionic_get_dma_mr(struct ib_pd *ibpd, int access); +struct ib_mr *ionic_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 addr, int access, struct ib_dmah *dmah, + struct ib_udata *udata); +struct ib_mr *ionic_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 offset, + u64 length, u64 addr, int fd, int access, + struct ib_dmah *dmah, + struct uverbs_attr_bundle *attrs); +int ionic_dereg_mr(struct ib_mr *ibmr, struct ib_udata 
*udata); +struct ib_mr *ionic_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type type, + u32 max_sg); +int ionic_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); +int ionic_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata); +int ionic_dealloc_mw(struct ib_mw *ibmw); +int ionic_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct uverbs_attr_bundle *attrs); +int ionic_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +int ionic_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, + struct ib_udata *udata); +int ionic_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, + struct ib_udata *udata); +int ionic_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, + struct ib_qp_init_attr *init_attr); +int ionic_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); + +/* ionic_datapath.c */ +int ionic_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad); +int ionic_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad); +int ionic_poll_cq(struct ib_cq *ibcq, int nwc, struct ib_wc *wc); +int ionic_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); + +/* ionic_hw_stats.c */ +void ionic_stats_init(struct ionic_ibdev *dev); +void ionic_stats_cleanup(struct ionic_ibdev *dev); + +/* ionic_pgtbl.c */ +__le64 ionic_pgtbl_dma(struct ionic_tbl_buf *buf, u64 va); +__be64 ionic_pgtbl_off(struct ionic_tbl_buf *buf, u64 va); +int ionic_pgtbl_page(struct ionic_tbl_buf *buf, u64 dma); +int ionic_pgtbl_init(struct ionic_ibdev *dev, + struct ionic_tbl_buf *buf, + struct ib_umem *umem, + dma_addr_t dma, + int limit, + u64 page_size); +void ionic_pgtbl_unbuf(struct ionic_ibdev *dev, struct ionic_tbl_buf *buf); +#endif /* _IONIC_IBDEV_H_ */ diff --git a/drivers/infiniband/hw/ionic/ionic_lif_cfg.c b/drivers/infiniband/hw/ionic/ionic_lif_cfg.c new file mode 100644 index 000000000000..f3cd281c3a2f --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_lif_cfg.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. 
*/ + +#include <linux/kernel.h> + +#include <ionic.h> +#include <ionic_lif.h> + +#include "ionic_lif_cfg.h" + +#define IONIC_MIN_RDMA_VERSION 0 +#define IONIC_MAX_RDMA_VERSION 2 + +static u8 ionic_get_expdb(struct ionic_lif *lif) +{ + u8 expdb_support = 0; + + if (lif->ionic->idev.phy_cmb_expdb64_pages) + expdb_support |= IONIC_EXPDB_64B_WQE; + if (lif->ionic->idev.phy_cmb_expdb128_pages) + expdb_support |= IONIC_EXPDB_128B_WQE; + if (lif->ionic->idev.phy_cmb_expdb256_pages) + expdb_support |= IONIC_EXPDB_256B_WQE; + if (lif->ionic->idev.phy_cmb_expdb512_pages) + expdb_support |= IONIC_EXPDB_512B_WQE; + + return expdb_support; +} + +void ionic_fill_lif_cfg(struct ionic_lif *lif, struct ionic_lif_cfg *cfg) +{ + union ionic_lif_identity *ident = &lif->ionic->ident.lif; + + cfg->lif = lif; + cfg->hwdev = &lif->ionic->pdev->dev; + cfg->lif_index = lif->index; + cfg->lif_hw_index = lif->hw_index; + + cfg->dbid = lif->kern_pid; + cfg->dbid_count = le32_to_cpu(lif->ionic->ident.dev.ndbpgs_per_lif); + cfg->dbpage = lif->kern_dbpage; + cfg->intr_ctrl = lif->ionic->idev.intr_ctrl; + + cfg->db_phys = lif->ionic->bars[IONIC_PCI_BAR_DBELL].bus_addr; + + if (IONIC_VERSION(ident->rdma.version, ident->rdma.minor_version) >= + IONIC_VERSION(2, 1)) + cfg->page_size_supported = + le64_to_cpu(ident->rdma.page_size_cap); + else + cfg->page_size_supported = IONIC_PAGE_SIZE_SUPPORTED; + + cfg->rdma_version = ident->rdma.version; + cfg->qp_opcodes = ident->rdma.qp_opcodes; + cfg->admin_opcodes = ident->rdma.admin_opcodes; + + cfg->stats_type = le16_to_cpu(ident->rdma.stats_type); + cfg->npts_per_lif = le32_to_cpu(ident->rdma.npts_per_lif); + cfg->nmrs_per_lif = le32_to_cpu(ident->rdma.nmrs_per_lif); + cfg->nahs_per_lif = le32_to_cpu(ident->rdma.nahs_per_lif); + + cfg->aq_base = le32_to_cpu(ident->rdma.aq_qtype.qid_base); + cfg->cq_base = le32_to_cpu(ident->rdma.cq_qtype.qid_base); + cfg->eq_base = le32_to_cpu(ident->rdma.eq_qtype.qid_base); + + /* + * ionic_create_rdma_admin() may reduce aq_count or eq_count if + * it is unable to allocate all that were requested. 
+ * aq_count is tunable; see ionic_aq_count + * eq_count is tunable; see ionic_eq_count + */ + cfg->aq_count = le32_to_cpu(ident->rdma.aq_qtype.qid_count); + cfg->eq_count = le32_to_cpu(ident->rdma.eq_qtype.qid_count); + cfg->cq_count = le32_to_cpu(ident->rdma.cq_qtype.qid_count); + cfg->qp_count = le32_to_cpu(ident->rdma.sq_qtype.qid_count); + cfg->dbid_count = le32_to_cpu(lif->ionic->ident.dev.ndbpgs_per_lif); + + cfg->aq_qtype = ident->rdma.aq_qtype.qtype; + cfg->sq_qtype = ident->rdma.sq_qtype.qtype; + cfg->rq_qtype = ident->rdma.rq_qtype.qtype; + cfg->cq_qtype = ident->rdma.cq_qtype.qtype; + cfg->eq_qtype = ident->rdma.eq_qtype.qtype; + cfg->udma_qgrp_shift = ident->rdma.udma_shift; + cfg->udma_count = 2; + + cfg->max_stride = ident->rdma.max_stride; + cfg->expdb_mask = ionic_get_expdb(lif); + + cfg->sq_expdb = + !!(lif->qtype_info[IONIC_QTYPE_TXQ].features & IONIC_QIDENT_F_EXPDB); + cfg->rq_expdb = + !!(lif->qtype_info[IONIC_QTYPE_RXQ].features & IONIC_QIDENT_F_EXPDB); +} + +struct net_device *ionic_lif_netdev(struct ionic_lif *lif) +{ + struct net_device *netdev = lif->netdev; + + dev_hold(netdev); + return netdev; +} + +void ionic_lif_fw_version(struct ionic_lif *lif, char *str, size_t len) +{ + strscpy(str, lif->ionic->idev.dev_info.fw_version, len); +} + +u8 ionic_lif_asic_rev(struct ionic_lif *lif) +{ + return lif->ionic->idev.dev_info.asic_rev; +} diff --git a/drivers/infiniband/hw/ionic/ionic_lif_cfg.h b/drivers/infiniband/hw/ionic/ionic_lif_cfg.h new file mode 100644 index 000000000000..20853429f623 --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_lif_cfg.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. */ + +#ifndef _IONIC_LIF_CFG_H_ + +#define IONIC_VERSION(a, b) (((a) << 16) + ((b) << 8)) +#define IONIC_PAGE_SIZE_SUPPORTED 0x40201000 /* 4kb, 2Mb, 1Gb */ + +#define IONIC_EXPDB_64B_WQE BIT(0) +#define IONIC_EXPDB_128B_WQE BIT(1) +#define IONIC_EXPDB_256B_WQE BIT(2) +#define IONIC_EXPDB_512B_WQE BIT(3) + +struct ionic_lif_cfg { + struct device *hwdev; + struct ionic_lif *lif; + + int lif_index; + int lif_hw_index; + + u32 dbid; + int dbid_count; + u64 __iomem *dbpage; + struct ionic_intr __iomem *intr_ctrl; + phys_addr_t db_phys; + + u64 page_size_supported; + u32 npts_per_lif; + u32 nmrs_per_lif; + u32 nahs_per_lif; + + u32 aq_base; + u32 cq_base; + u32 eq_base; + + int aq_count; + int eq_count; + int cq_count; + int qp_count; + + u16 stats_type; + u8 aq_qtype; + u8 sq_qtype; + u8 rq_qtype; + u8 cq_qtype; + u8 eq_qtype; + + u8 udma_count; + u8 udma_qgrp_shift; + + u8 rdma_version; + u8 qp_opcodes; + u8 admin_opcodes; + + u8 max_stride; + bool sq_expdb; + bool rq_expdb; + u8 expdb_mask; +}; + +void ionic_fill_lif_cfg(struct ionic_lif *lif, struct ionic_lif_cfg *cfg); +struct net_device *ionic_lif_netdev(struct ionic_lif *lif); +void ionic_lif_fw_version(struct ionic_lif *lif, char *str, size_t len); +u8 ionic_lif_asic_rev(struct ionic_lif *lif); + +#endif /* _IONIC_LIF_CFG_H_ */ diff --git a/drivers/infiniband/hw/ionic/ionic_pgtbl.c b/drivers/infiniband/hw/ionic/ionic_pgtbl.c new file mode 100644 index 000000000000..e74db73c9246 --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_pgtbl.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. 
*/ + +#include <linux/mman.h> +#include <linux/dma-mapping.h> + +#include "ionic_fw.h" +#include "ionic_ibdev.h" + +__le64 ionic_pgtbl_dma(struct ionic_tbl_buf *buf, u64 va) +{ + u64 pg_mask = BIT_ULL(buf->page_size_log2) - 1; + u64 dma; + + if (!buf->tbl_pages) + return cpu_to_le64(0); + + if (buf->tbl_pages > 1) + return cpu_to_le64(buf->tbl_dma); + + if (buf->tbl_buf) + dma = le64_to_cpu(buf->tbl_buf[0]); + else + dma = buf->tbl_dma; + + return cpu_to_le64(dma + (va & pg_mask)); +} + +__be64 ionic_pgtbl_off(struct ionic_tbl_buf *buf, u64 va) +{ + if (buf->tbl_pages > 1) { + u64 pg_mask = BIT_ULL(buf->page_size_log2) - 1; + + return cpu_to_be64(va & pg_mask); + } + + return 0; +} + +int ionic_pgtbl_page(struct ionic_tbl_buf *buf, u64 dma) +{ + if (unlikely(buf->tbl_pages == buf->tbl_limit)) + return -ENOMEM; + + if (buf->tbl_buf) + buf->tbl_buf[buf->tbl_pages] = cpu_to_le64(dma); + else + buf->tbl_dma = dma; + + ++buf->tbl_pages; + + return 0; +} + +static int ionic_tbl_buf_alloc(struct ionic_ibdev *dev, + struct ionic_tbl_buf *buf) +{ + int rc; + + buf->tbl_size = buf->tbl_limit * sizeof(*buf->tbl_buf); + buf->tbl_buf = kmalloc(buf->tbl_size, GFP_KERNEL); + if (!buf->tbl_buf) + return -ENOMEM; + + buf->tbl_dma = dma_map_single(dev->lif_cfg.hwdev, buf->tbl_buf, + buf->tbl_size, DMA_TO_DEVICE); + rc = dma_mapping_error(dev->lif_cfg.hwdev, buf->tbl_dma); + if (rc) { + kfree(buf->tbl_buf); + return rc; + } + + return 0; +} + +static int ionic_pgtbl_umem(struct ionic_tbl_buf *buf, struct ib_umem *umem) +{ + struct ib_block_iter biter; + u64 page_dma; + int rc; + + rdma_umem_for_each_dma_block(umem, &biter, BIT_ULL(buf->page_size_log2)) { + page_dma = rdma_block_iter_dma_address(&biter); + rc = ionic_pgtbl_page(buf, page_dma); + if (rc) + return rc; + } + + return 0; +} + +void ionic_pgtbl_unbuf(struct ionic_ibdev *dev, struct ionic_tbl_buf *buf) +{ + if (buf->tbl_buf) + dma_unmap_single(dev->lif_cfg.hwdev, buf->tbl_dma, + buf->tbl_size, DMA_TO_DEVICE); + + kfree(buf->tbl_buf); + memset(buf, 0, sizeof(*buf)); +} + +int ionic_pgtbl_init(struct ionic_ibdev *dev, + struct ionic_tbl_buf *buf, + struct ib_umem *umem, + dma_addr_t dma, + int limit, + u64 page_size) +{ + int rc; + + memset(buf, 0, sizeof(*buf)); + + if (umem) { + limit = ib_umem_num_dma_blocks(umem, page_size); + buf->page_size_log2 = order_base_2(page_size); + } + + if (limit < 1) + return -EINVAL; + + buf->tbl_limit = limit; + + /* skip pgtbl if contiguous / direct translation */ + if (limit > 1) { + rc = ionic_tbl_buf_alloc(dev, buf); + if (rc) + return rc; + } + + if (umem) + rc = ionic_pgtbl_umem(buf, umem); + else + rc = ionic_pgtbl_page(buf, dma); + + if (rc) + goto err_unbuf; + + return 0; + +err_unbuf: + ionic_pgtbl_unbuf(dev, buf); + return rc; +} diff --git a/drivers/infiniband/hw/ionic/ionic_queue.c b/drivers/infiniband/hw/ionic/ionic_queue.c new file mode 100644 index 000000000000..aa897ed2a412 --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_queue.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. 
*/ + +#include <linux/dma-mapping.h> + +#include "ionic_queue.h" + +int ionic_queue_init(struct ionic_queue *q, struct device *dma_dev, + int depth, size_t stride) +{ + if (depth < 0 || depth > 0xffff) + return -EINVAL; + + if (stride == 0 || stride > 0x10000) + return -EINVAL; + + if (depth == 0) + depth = 1; + + q->depth_log2 = order_base_2(depth + 1); + q->stride_log2 = order_base_2(stride); + + if (q->depth_log2 + q->stride_log2 < PAGE_SHIFT) + q->depth_log2 = PAGE_SHIFT - q->stride_log2; + + if (q->depth_log2 > 16 || q->stride_log2 > 16) + return -EINVAL; + + q->size = BIT_ULL(q->depth_log2 + q->stride_log2); + q->mask = BIT(q->depth_log2) - 1; + + q->ptr = dma_alloc_coherent(dma_dev, q->size, &q->dma, GFP_KERNEL); + if (!q->ptr) + return -ENOMEM; + + /* it will always be page aligned, but just to be sure... */ + if (!PAGE_ALIGNED(q->ptr)) { + dma_free_coherent(dma_dev, q->size, q->ptr, q->dma); + return -ENOMEM; + } + + q->prod = 0; + q->cons = 0; + q->dbell = 0; + + return 0; +} + +void ionic_queue_destroy(struct ionic_queue *q, struct device *dma_dev) +{ + dma_free_coherent(dma_dev, q->size, q->ptr, q->dma); +} diff --git a/drivers/infiniband/hw/ionic/ionic_queue.h b/drivers/infiniband/hw/ionic/ionic_queue.h new file mode 100644 index 000000000000..d18020d4cad5 --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_queue.h @@ -0,0 +1,234 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. */ + +#ifndef _IONIC_QUEUE_H_ +#define _IONIC_QUEUE_H_ + +#include <linux/io.h> +#include <ionic_regs.h> + +#define IONIC_MAX_DEPTH 0xffff +#define IONIC_MAX_CQ_DEPTH 0xffff +#define IONIC_CQ_RING_ARM IONIC_DBELL_RING_1 +#define IONIC_CQ_RING_SOL IONIC_DBELL_RING_2 + +/** + * struct ionic_queue - Ring buffer used between device and driver + * @size: Size of the buffer, in bytes + * @dma: Dma address of the buffer + * @ptr: Buffer virtual address + * @prod: Driver position in the queue + * @cons: Device position in the queue + * @mask: Capacity of the queue, subtracting the hole + * This value is equal to ((1 << depth_log2) - 1) + * @depth_log2: Log base two size depth of the queue + * @stride_log2: Log base two size of an element in the queue + * @dbell: Doorbell identifying bits + */ +struct ionic_queue { + size_t size; + dma_addr_t dma; + void *ptr; + u16 prod; + u16 cons; + u16 mask; + u8 depth_log2; + u8 stride_log2; + u64 dbell; +}; + +/** + * ionic_queue_init() - Initialize user space queue + * @q: Uninitialized queue structure + * @dma_dev: DMA device for mapping + * @depth: Depth of the queue + * @stride: Size of each element of the queue + * + * Return: status code + */ +int ionic_queue_init(struct ionic_queue *q, struct device *dma_dev, + int depth, size_t stride); + +/** + * ionic_queue_destroy() - Destroy user space queue + * @q: Queue structure + * @dma_dev: DMA device for mapping + * + * Return: status code + */ +void ionic_queue_destroy(struct ionic_queue *q, struct device *dma_dev); + +/** + * ionic_queue_empty() - Test if queue is empty + * @q: Queue structure + * + * This is only valid for to-device queues. + * + * Return: is empty + */ +static inline bool ionic_queue_empty(struct ionic_queue *q) +{ + return q->prod == q->cons; +} + +/** + * ionic_queue_length() - Get the current length of the queue + * @q: Queue structure + * + * This is only valid for to-device queues. 
+ * + * Return: length + */ +static inline u16 ionic_queue_length(struct ionic_queue *q) +{ + return (q->prod - q->cons) & q->mask; +} + +/** + * ionic_queue_length_remaining() - Get the remaining length of the queue + * @q: Queue structure + * + * This is only valid for to-device queues. + * + * Return: length remaining + */ +static inline u16 ionic_queue_length_remaining(struct ionic_queue *q) +{ + return q->mask - ionic_queue_length(q); +} + +/** + * ionic_queue_full() - Test if queue is full + * @q: Queue structure + * + * This is only valid for to-device queues. + * + * Return: is full + */ +static inline bool ionic_queue_full(struct ionic_queue *q) +{ + return q->mask == ionic_queue_length(q); +} + +/** + * ionic_color_wrap() - Flip the color if prod is wrapped + * @prod: Queue index just after advancing + * @color: Queue color just prior to advancing the index + * + * Return: color after advancing the index + */ +static inline bool ionic_color_wrap(u16 prod, bool color) +{ + /* logical xor color with (prod == 0) */ + return color != (prod == 0); +} + +/** + * ionic_queue_at() - Get the element at the given index + * @q: Queue structure + * @idx: Index in the queue + * + * The index must be within the bounds of the queue. It is not checked here. + * + * Return: pointer to element at index + */ +static inline void *ionic_queue_at(struct ionic_queue *q, u16 idx) +{ + return q->ptr + ((unsigned long)idx << q->stride_log2); +} + +/** + * ionic_queue_at_prod() - Get the element at the producer index + * @q: Queue structure + * + * Return: pointer to element at producer index + */ +static inline void *ionic_queue_at_prod(struct ionic_queue *q) +{ + return ionic_queue_at(q, q->prod); +} + +/** + * ionic_queue_at_cons() - Get the element at the consumer index + * @q: Queue structure + * + * Return: pointer to element at consumer index + */ +static inline void *ionic_queue_at_cons(struct ionic_queue *q) +{ + return ionic_queue_at(q, q->cons); +} + +/** + * ionic_queue_next() - Compute the next index + * @q: Queue structure + * @idx: Index + * + * Return: next index after idx + */ +static inline u16 ionic_queue_next(struct ionic_queue *q, u16 idx) +{ + return (idx + 1) & q->mask; +} + +/** + * ionic_queue_produce() - Increase the producer index + * @q: Queue structure + * + * Caller must ensure that the queue is not full. It is not checked here. + */ +static inline void ionic_queue_produce(struct ionic_queue *q) +{ + q->prod = ionic_queue_next(q, q->prod); +} + +/** + * ionic_queue_consume() - Increase the consumer index + * @q: Queue structure + * + * Caller must ensure that the queue is not empty. It is not checked here. + * + * This is only valid for to-device queues. + */ +static inline void ionic_queue_consume(struct ionic_queue *q) +{ + q->cons = ionic_queue_next(q, q->cons); +} + +/** + * ionic_queue_consume_entries() - Increase the consumer index by entries + * @q: Queue structure + * @entries: Number of entries to increment + * + * Caller must ensure that the queue is not empty. It is not checked here. + * + * This is only valid for to-device queues. 
+ */ +static inline void ionic_queue_consume_entries(struct ionic_queue *q, + u16 entries) +{ + q->cons = (q->cons + entries) & q->mask; +} + +/** + * ionic_queue_dbell_init() - Initialize doorbell bits for queue id + * @q: Queue structure + * @qid: Queue identifying number + */ +static inline void ionic_queue_dbell_init(struct ionic_queue *q, u32 qid) +{ + q->dbell = IONIC_DBELL_QID(qid); +} + +/** + * ionic_queue_dbell_val() - Get current doorbell update value + * @q: Queue structure + * + * Return: current doorbell update value + */ +static inline u64 ionic_queue_dbell_val(struct ionic_queue *q) +{ + return q->dbell | q->prod; +} + +#endif /* _IONIC_QUEUE_H_ */ diff --git a/drivers/infiniband/hw/ionic/ionic_res.h b/drivers/infiniband/hw/ionic/ionic_res.h new file mode 100644 index 000000000000..46c8c584bd9a --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_res.h @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. */ + +#ifndef _IONIC_RES_H_ +#define _IONIC_RES_H_ + +#include <linux/kernel.h> +#include <linux/idr.h> + +/** + * struct ionic_resid_bits - Number allocator based on IDA + * + * @inuse: IDA handle + * @inuse_size: Highest ID limit for IDA + */ +struct ionic_resid_bits { + struct ida inuse; + unsigned int inuse_size; +}; + +/** + * ionic_resid_init() - Initialize a resid allocator + * @resid: Uninitialized resid allocator + * @size: Capacity of the allocator + * + * Return: Zero on success, or negative error number + */ +static inline void ionic_resid_init(struct ionic_resid_bits *resid, + unsigned int size) +{ + resid->inuse_size = size; + ida_init(&resid->inuse); +} + +/** + * ionic_resid_destroy() - Destroy a resid allocator + * @resid: Resid allocator + */ +static inline void ionic_resid_destroy(struct ionic_resid_bits *resid) +{ + ida_destroy(&resid->inuse); +} + +/** + * ionic_resid_get_shared() - Allocate an available shared resource id + * @resid: Resid allocator + * @min: Smallest valid resource id + * @size: One after largest valid resource id + * + * Return: Resource id, or negative error number + */ +static inline int ionic_resid_get_shared(struct ionic_resid_bits *resid, + unsigned int min, + unsigned int size) +{ + return ida_alloc_range(&resid->inuse, min, size - 1, GFP_KERNEL); +} + +/** + * ionic_resid_get() - Allocate an available resource id + * @resid: Resid allocator + * + * Return: Resource id, or negative error number + */ +static inline int ionic_resid_get(struct ionic_resid_bits *resid) +{ + return ionic_resid_get_shared(resid, 0, resid->inuse_size); +} + +/** + * ionic_resid_put() - Free a resource id + * @resid: Resid allocator + * @id: Resource id + */ +static inline void ionic_resid_put(struct ionic_resid_bits *resid, int id) +{ + ida_free(&resid->inuse, id); +} + +/** + * ionic_bitid_to_qid() - Transform a resource bit index into a queue id + * @bitid: Bit index + * @qgrp_shift: Log2 number of queues per queue group + * @half_qid_shift: Log2 of half the total number of queues + * + * Return: Queue id + * + * Udma-constrained queues (QPs and CQs) are associated with their udma by + * queue group. Even queue groups are associated with udma0, and odd queue + * groups with udma1. + * + * For allocating queue ids, we want to arrange the bits into two halves, + * with the even queue groups of udma0 in the lower half of the bitset, + * and the odd queue groups of udma1 in the upper half of the bitset. 
+ * Then, one or two calls of find_next_zero_bit can examine all the bits + * for queues of an entire udma. + * + * For example, assuming eight queue groups with qgrp qids per group: + * + * bitid 0*qgrp..1*qgrp-1 : qid 0*qgrp..1*qgrp-1 + * bitid 1*qgrp..2*qgrp-1 : qid 2*qgrp..3*qgrp-1 + * bitid 2*qgrp..3*qgrp-1 : qid 4*qgrp..5*qgrp-1 + * bitid 3*qgrp..4*qgrp-1 : qid 6*qgrp..7*qgrp-1 + * bitid 4*qgrp..5*qgrp-1 : qid 1*qgrp..2*qgrp-1 + * bitid 5*qgrp..6*qgrp-1 : qid 3*qgrp..4*qgrp-1 + * bitid 6*qgrp..7*qgrp-1 : qid 5*qgrp..6*qgrp-1 + * bitid 7*qgrp..8*qgrp-1 : qid 7*qgrp..8*qgrp-1 + * + * There are three important ranges of bits in the qid. There is the udma + * bit "U" at qgrp_shift, which is the least significant bit of the group + * index, and determines which udma a queue is associated with. + * The bits of lesser significance we can call the idx bits "I", which are + * the index of the queue within the group. The bits of greater significance + * we can call the grp bits "G", which are other bits of the group index that + * do not determine the udma. Those bits are just rearranged in the bit index + * in the bitset. A bitid has the udma bit in the most significant place, + * then the grp bits, then the idx bits. + * + * bitid: 00000000000000 U GGG IIIIII + * qid: 00000000000000 GGG U IIIIII + * + * Transforming from bit index to qid, or from qid to bit index, can be + * accomplished by rearranging the bits by masking and shifting. + */ +static inline u32 ionic_bitid_to_qid(u32 bitid, u8 qgrp_shift, + u8 half_qid_shift) +{ + u32 udma_bit = + (bitid & BIT(half_qid_shift)) >> (half_qid_shift - qgrp_shift); + u32 grp_bits = (bitid & GENMASK(half_qid_shift - 1, qgrp_shift)) << 1; + u32 idx_bits = bitid & (BIT(qgrp_shift) - 1); + + return grp_bits | udma_bit | idx_bits; +} + +/** + * ionic_qid_to_bitid() - Transform a queue id into a resource bit index + * @qid: queue index + * @qgrp_shift: Log2 number of queues per queue group + * @half_qid_shift: Log2 of half the total number of queues + * + * Return: Resource bit index + * + * This is the inverse of ionic_bitid_to_qid(). + */ +static inline u32 ionic_qid_to_bitid(u32 qid, u8 qgrp_shift, u8 half_qid_shift) +{ + u32 udma_bit = (qid & BIT(qgrp_shift)) << (half_qid_shift - qgrp_shift); + u32 grp_bits = (qid & GENMASK(half_qid_shift, qgrp_shift + 1)) >> 1; + u32 idx_bits = qid & (BIT(qgrp_shift) - 1); + + return udma_bit | grp_bits | idx_bits; +} +#endif /* _IONIC_RES_H_ */ diff --git a/drivers/infiniband/hw/irdma/Kconfig b/drivers/infiniband/hw/irdma/Kconfig index 5f49a58590ed..0bd7e3fca1fb 100644 --- a/drivers/infiniband/hw/irdma/Kconfig +++ b/drivers/infiniband/hw/irdma/Kconfig @@ -4,10 +4,11 @@ config INFINIBAND_IRDMA depends on INET depends on IPV6 || !IPV6 depends on PCI - depends on ICE && I40E + depends on IDPF && ICE && I40E select GENERIC_ALLOCATOR select AUXILIARY_BUS select CRC32 help - This is an Intel(R) Ethernet Protocol Driver for RDMA driver - that support E810 (iWARP/RoCE) and X722 (iWARP) network devices. + This is an Intel(R) Ethernet Protocol Driver for RDMA that + supports IPU E2000 (RoCEv2), E810 (iWARP/RoCEv2) and X722 (iWARP) + network devices. 
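Aside: the bitid/qid transform documented in ionic_res.h above can be checked in isolation. The snippet below is a standalone userspace sketch, not part of the patch: BIT() and GENMASK() are re-declared locally so it builds outside the kernel, and the parameters (eight queue groups of 64 queues, i.e. qgrp_shift = 6 and half_qid_shift = 8) are illustrative assumptions rather than values read from the device. It mirrors the two helpers, reproduces the mapping table from the comment (for example bitid 1*qgrp -> qid 2*qgrp and bitid 4*qgrp -> qid 1*qgrp), and asserts that the qid-to-bitid transform inverts the bitid-to-qid transform for every bit index.

/*
 * Standalone sketch of the ionic bitid <-> qid mapping. Not driver code:
 * BIT()/GENMASK() are local userspace stand-ins, and the shifts below are
 * example values chosen for illustration only.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BIT(n)        (1u << (n))
#define GENMASK(h, l) (((~0u) << (l)) & (~0u >> (31 - (h))))

/* Same bit rearrangement as ionic_bitid_to_qid() in ionic_res.h */
static uint32_t bitid_to_qid(uint32_t bitid, uint8_t qgrp_shift, uint8_t half_qid_shift)
{
	uint32_t udma_bit = (bitid & BIT(half_qid_shift)) >> (half_qid_shift - qgrp_shift);
	uint32_t grp_bits = (bitid & GENMASK(half_qid_shift - 1, qgrp_shift)) << 1;
	uint32_t idx_bits = bitid & (BIT(qgrp_shift) - 1);

	return grp_bits | udma_bit | idx_bits;
}

/* Inverse transform, as in ionic_qid_to_bitid() */
static uint32_t qid_to_bitid(uint32_t qid, uint8_t qgrp_shift, uint8_t half_qid_shift)
{
	uint32_t udma_bit = (qid & BIT(qgrp_shift)) << (half_qid_shift - qgrp_shift);
	uint32_t grp_bits = (qid & GENMASK(half_qid_shift, qgrp_shift + 1)) >> 1;
	uint32_t idx_bits = qid & (BIT(qgrp_shift) - 1);

	return udma_bit | grp_bits | idx_bits;
}

int main(void)
{
	const uint8_t qgrp_shift = 6;     /* assumed: 64 queues per queue group */
	const uint8_t half_qid_shift = 8; /* assumed: 512 queues total, 256 per udma half */
	uint32_t bitid;

	/* Matches the mapping table in the ionic_res.h comment */
	printf("bitid   0 -> qid %u\n", bitid_to_qid(0, qgrp_shift, half_qid_shift));   /* 0: group 0, udma0 */
	printf("bitid  64 -> qid %u\n", bitid_to_qid(64, qgrp_shift, half_qid_shift));  /* 128: group 2, udma0 */
	printf("bitid 256 -> qid %u\n", bitid_to_qid(256, qgrp_shift, half_qid_shift)); /* 64: group 1, udma1 */

	/* The transform is a bit permutation, so the round trip is the identity */
	for (bitid = 0; bitid < 512; bitid++)
		assert(qid_to_bitid(bitid_to_qid(bitid, qgrp_shift, half_qid_shift),
				    qgrp_shift, half_qid_shift) == bitid);

	return 0;
}

Keeping each udma's queue groups contiguous in the bitset is what lets the allocator cover a whole udma with at most two find_next_zero_bit() passes, as the comment above notes; the qid layout, by contrast, keeps the hardware's interleaved group numbering.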
diff --git a/drivers/infiniband/hw/irdma/Makefile b/drivers/infiniband/hw/irdma/Makefile index 48c3854235a0..03ceb9e5475f 100644 --- a/drivers/infiniband/hw/irdma/Makefile +++ b/drivers/infiniband/hw/irdma/Makefile @@ -13,7 +13,10 @@ irdma-objs := cm.o \ hw.o \ i40iw_hw.o \ i40iw_if.o \ + ig3rdma_if.o\ + icrdma_if.o \ icrdma_hw.o \ + ig3rdma_hw.o\ main.o \ pble.o \ puda.o \ @@ -22,6 +25,7 @@ irdma-objs := cm.o \ uk.o \ utils.o \ verbs.o \ + virtchnl.o \ ws.o \ CFLAGS_trace.o = -I$(src) diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index 99a7f1a6c0b5..4ef1c29032f7 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -74,6 +74,14 @@ static void irdma_set_qos_info(struct irdma_sc_vsi *vsi, { u8 i; + if (vsi->dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + for (i = 0; i < IRDMA_MAX_USER_PRIORITY; i++) { + vsi->qos[i].qs_handle = vsi->dev->qos[i].qs_handle; + vsi->qos[i].valid = true; + } + + return; + } vsi->qos_rel_bw = l2p->vsi_rel_bw; vsi->qos_prio_type = l2p->vsi_prio_type; vsi->dscp_mode = l2p->dscp_mode; @@ -404,7 +412,8 @@ int irdma_sc_qp_init(struct irdma_sc_qp *qp, struct irdma_qp_init_info *info) pble_obj_cnt = info->pd->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt; if ((info->virtual_map && info->sq_pa >= pble_obj_cnt) || - (info->virtual_map && info->rq_pa >= pble_obj_cnt)) + (!info->qp_uk_init_info.srq_uk && + info->virtual_map && info->rq_pa >= pble_obj_cnt)) return -EINVAL; qp->llp_stream_handle = (void *)(-1); @@ -439,6 +448,208 @@ int irdma_sc_qp_init(struct irdma_sc_qp *qp, struct irdma_qp_init_info *info) } /** + * irdma_sc_srq_init - init sc_srq structure + * @srq: srq sc struct + * @info: parameters for srq init + */ +int irdma_sc_srq_init(struct irdma_sc_srq *srq, + struct irdma_srq_init_info *info) +{ + u32 srq_size_quanta; + int ret_code; + + ret_code = irdma_uk_srq_init(&srq->srq_uk, &info->srq_uk_init_info); + if (ret_code) + return ret_code; + + srq->dev = info->pd->dev; + srq->pd = info->pd; + srq->vsi = info->vsi; + srq->srq_pa = info->srq_pa; + srq->first_pm_pbl_idx = info->first_pm_pbl_idx; + srq->pasid = info->pasid; + srq->pasid_valid = info->pasid_valid; + srq->srq_limit = info->srq_limit; + srq->leaf_pbl_size = info->leaf_pbl_size; + srq->virtual_map = info->virtual_map; + srq->tph_en = info->tph_en; + srq->arm_limit_event = info->arm_limit_event; + srq->tph_val = info->tph_value; + srq->shadow_area_pa = info->shadow_area_pa; + + /* Smallest SRQ size is 256B i.e. 
8 quanta */ + srq_size_quanta = max((u32)IRDMA_SRQ_MIN_QUANTA, + srq->srq_uk.srq_size * + srq->srq_uk.wqe_size_multiplier); + srq->hw_srq_size = irdma_get_encoded_wqe_size(srq_size_quanta, + IRDMA_QUEUE_TYPE_SRQ); + + return 0; +} + +/** + * irdma_sc_srq_create - send srq create CQP WQE + * @srq: srq sc struct + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static int irdma_sc_srq_create(struct irdma_sc_srq *srq, u64 scratch, + bool post_sq) +{ + struct irdma_sc_cqp *cqp; + __le64 *wqe; + u64 hdr; + + cqp = srq->pd->dev->cqp; + if (srq->srq_uk.srq_id < cqp->dev->hw_attrs.min_hw_srq_id || + srq->srq_uk.srq_id > + (cqp->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_SRQ].max_cnt - 1)) + return -EINVAL; + + wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return -ENOMEM; + + set_64bit_val(wqe, 0, + FIELD_PREP(IRDMA_CQPSQ_SRQ_SRQ_LIMIT, srq->srq_limit) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_RQSIZE, srq->hw_srq_size) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_RQ_WQE_SIZE, srq->srq_uk.wqe_size)); + set_64bit_val(wqe, 8, (uintptr_t)srq); + set_64bit_val(wqe, 16, + FIELD_PREP(IRDMA_CQPSQ_SRQ_PD_ID, srq->pd->pd_id)); + set_64bit_val(wqe, 32, + FIELD_PREP(IRDMA_CQPSQ_SRQ_PHYSICAL_BUFFER_ADDR, + srq->srq_pa >> + IRDMA_CQPSQ_SRQ_PHYSICAL_BUFFER_ADDR_S)); + set_64bit_val(wqe, 40, + FIELD_PREP(IRDMA_CQPSQ_SRQ_DB_SHADOW_ADDR, + srq->shadow_area_pa >> + IRDMA_CQPSQ_SRQ_DB_SHADOW_ADDR_S)); + set_64bit_val(wqe, 48, + FIELD_PREP(IRDMA_CQPSQ_SRQ_FIRST_PM_PBL_IDX, + srq->first_pm_pbl_idx)); + + hdr = srq->srq_uk.srq_id | + FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_CREATE_SRQ) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_LEAF_PBL_SIZE, srq->leaf_pbl_size) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_VIRTMAP, srq->virtual_map) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_ARM_LIMIT_EVENT, + srq->arm_limit_event) | + FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); + + dma_wmb(); /* make sure WQE is written before valid bit is set */ + + set_64bit_val(wqe, 24, hdr); + + print_hex_dump_debug("WQE: SRQ_CREATE WQE", DUMP_PREFIX_OFFSET, 16, 8, + wqe, IRDMA_CQP_WQE_SIZE * 8, false); + if (post_sq) + irdma_sc_cqp_post_sq(cqp); + + return 0; +} + +/** + * irdma_sc_srq_modify - send modify_srq CQP WQE + * @srq: srq sc struct + * @info: parameters for srq modification + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static int irdma_sc_srq_modify(struct irdma_sc_srq *srq, + struct irdma_modify_srq_info *info, u64 scratch, + bool post_sq) +{ + struct irdma_sc_cqp *cqp; + __le64 *wqe; + u64 hdr; + + cqp = srq->dev->cqp; + if (srq->srq_uk.srq_id < cqp->dev->hw_attrs.min_hw_srq_id || + srq->srq_uk.srq_id > + (cqp->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_SRQ].max_cnt - 1)) + return -EINVAL; + + wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return -ENOMEM; + + set_64bit_val(wqe, 0, + FIELD_PREP(IRDMA_CQPSQ_SRQ_SRQ_LIMIT, info->srq_limit) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_RQSIZE, srq->hw_srq_size) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_RQ_WQE_SIZE, srq->srq_uk.wqe_size)); + set_64bit_val(wqe, 8, + FIELD_PREP(IRDMA_CQPSQ_SRQ_SRQCTX, srq->srq_uk.srq_id)); + set_64bit_val(wqe, 16, + FIELD_PREP(IRDMA_CQPSQ_SRQ_PD_ID, srq->pd->pd_id)); + set_64bit_val(wqe, 32, + FIELD_PREP(IRDMA_CQPSQ_SRQ_PHYSICAL_BUFFER_ADDR, + srq->srq_pa >> + IRDMA_CQPSQ_SRQ_PHYSICAL_BUFFER_ADDR_S)); + set_64bit_val(wqe, 40, + FIELD_PREP(IRDMA_CQPSQ_SRQ_DB_SHADOW_ADDR, + srq->shadow_area_pa >> + IRDMA_CQPSQ_SRQ_DB_SHADOW_ADDR_S)); + set_64bit_val(wqe, 48, + FIELD_PREP(IRDMA_CQPSQ_SRQ_FIRST_PM_PBL_IDX, + 
srq->first_pm_pbl_idx)); + + hdr = srq->srq_uk.srq_id | + FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_MODIFY_SRQ) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_LEAF_PBL_SIZE, srq->leaf_pbl_size) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_VIRTMAP, srq->virtual_map) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_ARM_LIMIT_EVENT, + info->arm_limit_event) | + FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); + dma_wmb(); /* make sure WQE is written before valid bit is set */ + + set_64bit_val(wqe, 24, hdr); + + print_hex_dump_debug("WQE: SRQ_MODIFY WQE", DUMP_PREFIX_OFFSET, 16, 8, + wqe, IRDMA_CQP_WQE_SIZE * 8, false); + if (post_sq) + irdma_sc_cqp_post_sq(cqp); + + return 0; +} + +/** + * irdma_sc_srq_destroy - send srq_destroy CQP WQE + * @srq: srq sc struct + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static int irdma_sc_srq_destroy(struct irdma_sc_srq *srq, u64 scratch, + bool post_sq) +{ + struct irdma_sc_cqp *cqp; + __le64 *wqe; + u64 hdr; + + cqp = srq->dev->cqp; + + wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return -ENOMEM; + + set_64bit_val(wqe, 8, (uintptr_t)srq); + + hdr = srq->srq_uk.srq_id | + FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_DESTROY_SRQ) | + FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); + dma_wmb(); /* make sure WQE is written before valid bit is set */ + + set_64bit_val(wqe, 24, hdr); + + print_hex_dump_debug("WQE: SRQ_DESTROY WQE", DUMP_PREFIX_OFFSET, 16, + 8, wqe, IRDMA_CQP_WQE_SIZE * 8, false); + if (post_sq) + irdma_sc_cqp_post_sq(cqp); + + return 0; +} + +/** * irdma_sc_qp_create - create qp * @qp: sc qp * @info: qp create info @@ -629,13 +840,14 @@ static u8 irdma_sc_get_encoded_ird_size(u16 ird_size) } /** - * irdma_sc_qp_setctx_roce - set qp's context + * irdma_sc_qp_setctx_roce_gen_2 - set qp's context * @qp: sc qp * @qp_ctx: context ptr * @info: ctx info */ -void irdma_sc_qp_setctx_roce(struct irdma_sc_qp *qp, __le64 *qp_ctx, - struct irdma_qp_host_ctx_info *info) +static void irdma_sc_qp_setctx_roce_gen_2(struct irdma_sc_qp *qp, + __le64 *qp_ctx, + struct irdma_qp_host_ctx_info *info) { struct irdma_roce_offload_info *roce_info; struct irdma_udp_offload_info *udp; @@ -753,6 +965,189 @@ void irdma_sc_qp_setctx_roce(struct irdma_sc_qp *qp, __le64 *qp_ctx, 8, qp_ctx, IRDMA_QP_CTX_SIZE, false); } +/** + * irdma_sc_get_encoded_ird_size_gen_3 - get encoded IRD size for GEN 3 + * @ird_size: IRD size + * The ird from the connection is rounded to a supported HW setting and then encoded + * for ird_size field of qp_ctx. Consumers are expected to provide valid ird size based + * on hardware attributes. IRD size defaults to a value of 4 in case of invalid input. + */ +static u8 irdma_sc_get_encoded_ird_size_gen_3(u16 ird_size) +{ + switch (ird_size ? 
+ roundup_pow_of_two(2 * ird_size) : 4) { + case 4096: + return IRDMA_IRD_HW_SIZE_4096_GEN3; + case 2048: + return IRDMA_IRD_HW_SIZE_2048_GEN3; + case 1024: + return IRDMA_IRD_HW_SIZE_1024_GEN3; + case 512: + return IRDMA_IRD_HW_SIZE_512_GEN3; + case 256: + return IRDMA_IRD_HW_SIZE_256_GEN3; + case 128: + return IRDMA_IRD_HW_SIZE_128_GEN3; + case 64: + return IRDMA_IRD_HW_SIZE_64_GEN3; + case 32: + return IRDMA_IRD_HW_SIZE_32_GEN3; + case 16: + return IRDMA_IRD_HW_SIZE_16_GEN3; + case 8: + return IRDMA_IRD_HW_SIZE_8_GEN3; + case 4: + default: + break; + } + + return IRDMA_IRD_HW_SIZE_4_GEN3; +} + +/** + * irdma_sc_qp_setctx_roce_gen_3 - set qp's context + * @qp: sc qp + * @qp_ctx: context ptr + * @info: ctx info + */ +static void irdma_sc_qp_setctx_roce_gen_3(struct irdma_sc_qp *qp, + __le64 *qp_ctx, + struct irdma_qp_host_ctx_info *info) +{ + struct irdma_roce_offload_info *roce_info = info->roce_info; + struct irdma_udp_offload_info *udp = info->udp_info; + u64 qw0, qw3, qw7 = 0, qw8 = 0; + u8 push_mode_en; + u32 push_idx; + + qp->user_pri = info->user_pri; + if (qp->push_idx == IRDMA_INVALID_PUSH_PAGE_INDEX) { + push_mode_en = 0; + push_idx = 0; + } else { + push_mode_en = 1; + push_idx = qp->push_idx; + } + + qw0 = FIELD_PREP(IRDMAQPC_RQWQESIZE, qp->qp_uk.rq_wqe_size) | + FIELD_PREP(IRDMAQPC_RCVTPHEN, qp->rcv_tph_en) | + FIELD_PREP(IRDMAQPC_XMITTPHEN, qp->xmit_tph_en) | + FIELD_PREP(IRDMAQPC_RQTPHEN, qp->rq_tph_en) | + FIELD_PREP(IRDMAQPC_SQTPHEN, qp->sq_tph_en) | + FIELD_PREP(IRDMAQPC_PPIDX, push_idx) | + FIELD_PREP(IRDMAQPC_PMENA, push_mode_en) | + FIELD_PREP(IRDMAQPC_DC_TCP_EN, roce_info->dctcp_en) | + FIELD_PREP(IRDMAQPC_ISQP1, roce_info->is_qp1) | + FIELD_PREP(IRDMAQPC_ROCE_TVER, roce_info->roce_tver) | + FIELD_PREP(IRDMAQPC_IPV4, udp->ipv4) | + FIELD_PREP(IRDMAQPC_USE_SRQ, !qp->qp_uk.srq_uk ? 
0 : 1) | + FIELD_PREP(IRDMAQPC_INSERTVLANTAG, udp->insert_vlan_tag); + set_64bit_val(qp_ctx, 0, qw0); + set_64bit_val(qp_ctx, 8, qp->sq_pa); + set_64bit_val(qp_ctx, 16, qp->rq_pa); + qw3 = FIELD_PREP(IRDMAQPC_RQSIZE, qp->hw_rq_size) | + FIELD_PREP(IRDMAQPC_SQSIZE, qp->hw_sq_size) | + FIELD_PREP(IRDMAQPC_TTL, udp->ttl) | + FIELD_PREP(IRDMAQPC_TOS, udp->tos) | + FIELD_PREP(IRDMAQPC_SRCPORTNUM, udp->src_port) | + FIELD_PREP(IRDMAQPC_DESTPORTNUM, udp->dst_port); + set_64bit_val(qp_ctx, 24, qw3); + set_64bit_val(qp_ctx, 32, + FIELD_PREP(IRDMAQPC_DESTIPADDR2, udp->dest_ip_addr[2]) | + FIELD_PREP(IRDMAQPC_DESTIPADDR3, udp->dest_ip_addr[3])); + set_64bit_val(qp_ctx, 40, + FIELD_PREP(IRDMAQPC_DESTIPADDR0, udp->dest_ip_addr[0]) | + FIELD_PREP(IRDMAQPC_DESTIPADDR1, udp->dest_ip_addr[1])); + set_64bit_val(qp_ctx, 48, + FIELD_PREP(IRDMAQPC_SNDMSS, udp->snd_mss) | + FIELD_PREP(IRDMAQPC_VLANTAG, udp->vlan_tag) | + FIELD_PREP(IRDMAQPC_ARPIDX, udp->arp_idx)); + qw7 = FIELD_PREP(IRDMAQPC_PKEY, roce_info->p_key) | + FIELD_PREP(IRDMAQPC_ACKCREDITS, roce_info->ack_credits) | + FIELD_PREP(IRDMAQPC_FLOWLABEL, udp->flow_label); + set_64bit_val(qp_ctx, 56, qw7); + qw8 = FIELD_PREP(IRDMAQPC_QKEY, roce_info->qkey) | + FIELD_PREP(IRDMAQPC_DESTQP, roce_info->dest_qp); + set_64bit_val(qp_ctx, 64, qw8); + set_64bit_val(qp_ctx, 80, + FIELD_PREP(IRDMAQPC_PSNNXT, udp->psn_nxt) | + FIELD_PREP(IRDMAQPC_LSN, udp->lsn)); + set_64bit_val(qp_ctx, 88, + FIELD_PREP(IRDMAQPC_EPSN, udp->epsn)); + set_64bit_val(qp_ctx, 96, + FIELD_PREP(IRDMAQPC_PSNMAX, udp->psn_max) | + FIELD_PREP(IRDMAQPC_PSNUNA, udp->psn_una)); + set_64bit_val(qp_ctx, 112, + FIELD_PREP(IRDMAQPC_CWNDROCE, udp->cwnd)); + set_64bit_val(qp_ctx, 128, + FIELD_PREP(IRDMAQPC_MINRNR_TIMER, udp->min_rnr_timer) | + FIELD_PREP(IRDMAQPC_RNRNAK_THRESH, udp->rnr_nak_thresh) | + FIELD_PREP(IRDMAQPC_REXMIT_THRESH, udp->rexmit_thresh) | + FIELD_PREP(IRDMAQPC_RNRNAK_TMR, udp->rnr_nak_tmr) | + FIELD_PREP(IRDMAQPC_RTOMIN, roce_info->rtomin)); + set_64bit_val(qp_ctx, 136, + FIELD_PREP(IRDMAQPC_TXCQNUM, info->send_cq_num) | + FIELD_PREP(IRDMAQPC_RXCQNUM, info->rcv_cq_num)); + set_64bit_val(qp_ctx, 152, + FIELD_PREP(IRDMAQPC_MACADDRESS, + ether_addr_to_u64(roce_info->mac_addr)) | + FIELD_PREP(IRDMAQPC_LOCALACKTIMEOUT, + roce_info->local_ack_timeout)); + set_64bit_val(qp_ctx, 160, + FIELD_PREP(IRDMAQPC_ORDSIZE_GEN3, roce_info->ord_size) | + FIELD_PREP(IRDMAQPC_IRDSIZE_GEN3, + irdma_sc_get_encoded_ird_size_gen_3(roce_info->ird_size)) | + FIELD_PREP(IRDMAQPC_WRRDRSPOK, roce_info->wr_rdresp_en) | + FIELD_PREP(IRDMAQPC_RDOK, roce_info->rd_en) | + FIELD_PREP(IRDMAQPC_USESTATSINSTANCE, + info->stats_idx_valid) | + FIELD_PREP(IRDMAQPC_BINDEN, roce_info->bind_en) | + FIELD_PREP(IRDMAQPC_FASTREGEN, roce_info->fast_reg_en) | + FIELD_PREP(IRDMAQPC_DCQCNENABLE, roce_info->dcqcn_en) | + FIELD_PREP(IRDMAQPC_RCVNOICRC, roce_info->rcv_no_icrc) | + FIELD_PREP(IRDMAQPC_FW_CC_ENABLE, + roce_info->fw_cc_enable) | + FIELD_PREP(IRDMAQPC_UDPRIVCQENABLE, + roce_info->udprivcq_en) | + FIELD_PREP(IRDMAQPC_PRIVEN, roce_info->priv_mode_en) | + FIELD_PREP(IRDMAQPC_REMOTE_ATOMIC_EN, + info->remote_atomics_en) | + FIELD_PREP(IRDMAQPC_TIMELYENABLE, roce_info->timely_en)); + set_64bit_val(qp_ctx, 168, + FIELD_PREP(IRDMAQPC_QPCOMPCTX, info->qp_compl_ctx)); + set_64bit_val(qp_ctx, 176, + FIELD_PREP(IRDMAQPC_SQTPHVAL, qp->sq_tph_val) | + FIELD_PREP(IRDMAQPC_RQTPHVAL, qp->rq_tph_val) | + FIELD_PREP(IRDMAQPC_QSHANDLE, qp->qs_handle)); + set_64bit_val(qp_ctx, 184, + FIELD_PREP(IRDMAQPC_LOCAL_IPADDR3, udp->local_ipaddr[3]) | + 
FIELD_PREP(IRDMAQPC_LOCAL_IPADDR2, udp->local_ipaddr[2])); + set_64bit_val(qp_ctx, 192, + FIELD_PREP(IRDMAQPC_LOCAL_IPADDR1, udp->local_ipaddr[1]) | + FIELD_PREP(IRDMAQPC_LOCAL_IPADDR0, udp->local_ipaddr[0])); + set_64bit_val(qp_ctx, 200, + FIELD_PREP(IRDMAQPC_THIGH, roce_info->t_high) | + FIELD_PREP(IRDMAQPC_SRQ_ID, + !qp->qp_uk.srq_uk ? + 0 : qp->qp_uk.srq_uk->srq_id) | + FIELD_PREP(IRDMAQPC_TLOW, roce_info->t_low)); + set_64bit_val(qp_ctx, 208, roce_info->pd_id | + FIELD_PREP(IRDMAQPC_STAT_INDEX_GEN3, info->stats_idx) | + FIELD_PREP(IRDMAQPC_PKT_LIMIT, qp->pkt_limit)); + + print_hex_dump_debug("WQE: QP_HOST ROCE CTX WQE", DUMP_PREFIX_OFFSET, + 16, 8, qp_ctx, IRDMA_QP_CTX_SIZE, false); +} + +void irdma_sc_qp_setctx_roce(struct irdma_sc_qp *qp, __le64 *qp_ctx, + struct irdma_qp_host_ctx_info *info) +{ + if (qp->dev->hw_attrs.uk_attrs.hw_rev == IRDMA_GEN_2) + irdma_sc_qp_setctx_roce_gen_2(qp, qp_ctx, info); + else + irdma_sc_qp_setctx_roce_gen_3(qp, qp_ctx, info); +} + /* irdma_sc_alloc_local_mac_entry - allocate a mac entry * @cqp: struct for cqp hw * @scratch: u64 saved to be used during cqp completion @@ -1080,7 +1475,8 @@ static int irdma_sc_alloc_stag(struct irdma_sc_dev *dev, FLD_LS_64(dev, info->pd_id, IRDMA_CQPSQ_STAG_PDID) | FIELD_PREP(IRDMA_CQPSQ_STAG_STAGLEN, info->total_len)); set_64bit_val(wqe, 16, - FIELD_PREP(IRDMA_CQPSQ_STAG_IDX, info->stag_idx)); + FIELD_PREP(IRDMA_CQPSQ_STAG_IDX, info->stag_idx) | + FIELD_PREP(IRDMA_CQPSQ_STAG_PDID_HI, info->pd_id >> 18)); set_64bit_val(wqe, 40, FIELD_PREP(IRDMA_CQPSQ_STAG_HMCFNIDX, info->hmc_fcn_index)); @@ -1096,6 +1492,8 @@ static int irdma_sc_alloc_stag(struct irdma_sc_dev *dev, FIELD_PREP(IRDMA_CQPSQ_STAG_REMACCENABLED, info->remote_access) | FIELD_PREP(IRDMA_CQPSQ_STAG_USEHMCFNIDX, info->use_hmc_fcn_index) | FIELD_PREP(IRDMA_CQPSQ_STAG_USEPFRID, info->use_pf_rid) | + FIELD_PREP(IRDMA_CQPSQ_STAG_REMOTE_ATOMIC_EN, + info->remote_atomics_en) | FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); dma_wmb(); /* make sure WQE is written before valid bit is set */ @@ -1165,6 +1563,7 @@ static int irdma_sc_mr_reg_non_shared(struct irdma_sc_dev *dev, FLD_LS_64(dev, info->pd_id, IRDMA_CQPSQ_STAG_PDID)); set_64bit_val(wqe, 16, FIELD_PREP(IRDMA_CQPSQ_STAG_KEY, info->stag_key) | + FIELD_PREP(IRDMA_CQPSQ_STAG_PDID_HI, info->pd_id >> 18) | FIELD_PREP(IRDMA_CQPSQ_STAG_IDX, info->stag_idx)); if (!info->chunk_size) { set_64bit_val(wqe, 32, info->reg_addr_pa); @@ -1187,6 +1586,8 @@ static int irdma_sc_mr_reg_non_shared(struct irdma_sc_dev *dev, FIELD_PREP(IRDMA_CQPSQ_STAG_VABASEDTO, addr_type) | FIELD_PREP(IRDMA_CQPSQ_STAG_USEHMCFNIDX, info->use_hmc_fcn_index) | FIELD_PREP(IRDMA_CQPSQ_STAG_USEPFRID, info->use_pf_rid) | + FIELD_PREP(IRDMA_CQPSQ_STAG_REMOTE_ATOMIC_EN, + info->remote_atomics_en) | FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); dma_wmb(); /* make sure WQE is written before valid bit is set */ @@ -1223,7 +1624,8 @@ static int irdma_sc_dealloc_stag(struct irdma_sc_dev *dev, set_64bit_val(wqe, 8, FLD_LS_64(dev, info->pd_id, IRDMA_CQPSQ_STAG_PDID)); set_64bit_val(wqe, 16, - FIELD_PREP(IRDMA_CQPSQ_STAG_IDX, info->stag_idx)); + FIELD_PREP(IRDMA_CQPSQ_STAG_IDX, info->stag_idx) | + FIELD_PREP(IRDMA_CQPSQ_STAG_PDID_HI, info->pd_id >> 18)); hdr = FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_DEALLOC_STAG) | FIELD_PREP(IRDMA_CQPSQ_STAG_MR, info->mr) | @@ -1263,7 +1665,8 @@ static int irdma_sc_mw_alloc(struct irdma_sc_dev *dev, set_64bit_val(wqe, 8, FLD_LS_64(dev, info->pd_id, IRDMA_CQPSQ_STAG_PDID)); set_64bit_val(wqe, 16, - 
FIELD_PREP(IRDMA_CQPSQ_STAG_IDX, info->mw_stag_index)); + FIELD_PREP(IRDMA_CQPSQ_STAG_IDX, info->mw_stag_index) | + FIELD_PREP(IRDMA_CQPSQ_STAG_PDID_HI, info->pd_id >> 18)); hdr = FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_ALLOC_STAG) | FIELD_PREP(IRDMA_CQPSQ_STAG_MWTYPE, info->mw_wide) | @@ -1343,6 +1746,7 @@ int irdma_sc_mr_fast_register(struct irdma_sc_qp *qp, FIELD_PREP(IRDMAQPSQ_READFENCE, info->read_fence) | FIELD_PREP(IRDMAQPSQ_LOCALFENCE, info->local_fence) | FIELD_PREP(IRDMAQPSQ_SIGCOMPL, info->signaled) | + FIELD_PREP(IRDMAQPSQ_REMOTE_ATOMICS_EN, info->remote_atomics_en) | FIELD_PREP(IRDMAQPSQ_VALID, qp->qp_uk.swqe_polarity); dma_wmb(); /* make sure WQE is written before valid bit is set */ @@ -1873,7 +2277,7 @@ void irdma_sc_vsi_init(struct irdma_sc_vsi *vsi, mutex_init(&vsi->qos[i].qos_mutex); INIT_LIST_HEAD(&vsi->qos[i].qplist); } - if (vsi->register_qset) { + if (vsi->dev->hw_attrs.uk_attrs.hw_rev == IRDMA_GEN_2) { vsi->dev->ws_add = irdma_ws_add; vsi->dev->ws_remove = irdma_ws_remove; vsi->dev->ws_reset = irdma_ws_reset; @@ -1888,7 +2292,7 @@ void irdma_sc_vsi_init(struct irdma_sc_vsi *vsi, * irdma_get_stats_idx - Return stats index * @vsi: pointer to the vsi */ -static u8 irdma_get_stats_idx(struct irdma_sc_vsi *vsi) +static u16 irdma_get_stats_idx(struct irdma_sc_vsi *vsi) { struct irdma_stats_inst_info stats_info = {}; struct irdma_sc_dev *dev = vsi->dev; @@ -1964,12 +2368,13 @@ int irdma_vsi_stats_init(struct irdma_sc_vsi *vsi, (void *)((uintptr_t)stats_buff_mem->va + IRDMA_GATHER_STATS_BUF_SIZE); - irdma_hw_stats_start_timer(vsi); + if (vsi->dev->hw_attrs.uk_attrs.hw_rev < IRDMA_GEN_3) + irdma_hw_stats_start_timer(vsi); /* when stat allocation is not required default to fcn_id. */ vsi->stats_idx = info->fcn_id; if (info->alloc_stats_inst) { - u8 stats_idx = irdma_get_stats_idx(vsi); + u16 stats_idx = irdma_get_stats_idx(vsi); if (stats_idx != IRDMA_INVALID_STATS_IDX) { vsi->stats_inst_alloc = true; @@ -1993,7 +2398,7 @@ void irdma_vsi_stats_free(struct irdma_sc_vsi *vsi) { struct irdma_stats_inst_info stats_info = {}; struct irdma_sc_dev *dev = vsi->dev; - u8 stats_idx = vsi->stats_idx; + u16 stats_idx = vsi->stats_idx; if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2) { if (vsi->stats_inst_alloc) { @@ -2009,7 +2414,9 @@ void irdma_vsi_stats_free(struct irdma_sc_vsi *vsi) if (!vsi->pestat) return; - irdma_hw_stats_stop_timer(vsi); + + if (dev->hw_attrs.uk_attrs.hw_rev < IRDMA_GEN_3) + irdma_hw_stats_stop_timer(vsi); dma_free_coherent(vsi->pestat->hw->device, vsi->pestat->gather_info.stats_buff_mem.size, vsi->pestat->gather_info.stats_buff_mem.va, @@ -2026,6 +2433,14 @@ u8 irdma_get_encoded_wqe_size(u32 wqsize, enum irdma_queue_type queue_type) { u8 encoded_size = 0; + if (queue_type == IRDMA_QUEUE_TYPE_SRQ) { + /* Smallest SRQ size is 256B (8 quanta) that gets + * encoded to 0. + */ + encoded_size = ilog2(wqsize) - 3; + + return encoded_size; + } /* cqp sq's hw coded value starts from 1 for size of 4 * while it starts from 0 for qp' wq's. 
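Several STAG WQE hunks above start carrying the upper protection-domain bits in a separate IRDMA_CQPSQ_STAG_PDID_HI field (info->pd_id >> 18) alongside the existing low field. A minimal sketch of splitting and recombining such an id; the 18-bit low width is an assumption taken from the shift in the hunk, not a documented register layout:

    #include <assert.h>
    #include <stdint.h>

    /* Assumed split: 18 low bits in the legacy field, the rest in the _HI field. */
    #define PDID_LOW_BITS 18
    #define PDID_LOW_MASK ((1u << PDID_LOW_BITS) - 1)

    struct split_pdid { uint32_t lo; uint32_t hi; };

    static struct split_pdid pdid_split(uint32_t pd_id)
    {
        return (struct split_pdid){ .lo = pd_id & PDID_LOW_MASK,
                                    .hi = pd_id >> PDID_LOW_BITS };
    }

    static uint32_t pdid_join(struct split_pdid s)
    {
        return (s.hi << PDID_LOW_BITS) | s.lo;
    }

    int main(void)
    {
        uint32_t pd_id = 0x3ffff + 5;        /* needs more than 18 bits */
        struct split_pdid s = pdid_split(pd_id);

        assert(pdid_join(s) == pd_id);       /* round trip is lossless */
        return 0;
    }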
*/ @@ -2259,6 +2674,12 @@ int irdma_sc_qp_flush_wqes(struct irdma_sc_qp *qp, info->ae_code | FIELD_PREP(IRDMA_CQPSQ_FWQE_AESOURCE, info->ae_src) : 0; set_64bit_val(wqe, 8, temp); + if (cqp->dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + set_64bit_val(wqe, 40, + FIELD_PREP(IRDMA_CQPSQ_FWQE_ERR_SQ_IDX, info->err_sq_idx)); + set_64bit_val(wqe, 48, + FIELD_PREP(IRDMA_CQPSQ_FWQE_ERR_RQ_IDX, info->err_rq_idx)); + } hdr = qp->qp_uk.qp_id | FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_FLUSH_WQES) | @@ -2267,6 +2688,9 @@ int irdma_sc_qp_flush_wqes(struct irdma_sc_qp *qp, FIELD_PREP(IRDMA_CQPSQ_FWQE_FLUSHSQ, flush_sq) | FIELD_PREP(IRDMA_CQPSQ_FWQE_FLUSHRQ, flush_rq) | FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); + if (cqp->dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) + hdr |= FIELD_PREP(IRDMA_CQPSQ_FWQE_ERR_SQ_IDX_VALID, info->err_sq_idx_valid) | + FIELD_PREP(IRDMA_CQPSQ_FWQE_ERR_RQ_IDX_VALID, info->err_rq_idx_valid); dma_wmb(); /* make sure WQE is written before valid bit is set */ set_64bit_val(wqe, 24, hdr); @@ -2562,6 +2986,9 @@ static int irdma_sc_cq_create(struct irdma_sc_cq *cq, u64 scratch, FIELD_PREP(IRDMA_CQPSQ_CQ_LPBLSIZE, cq->pbl_chunk_size) | FIELD_PREP(IRDMA_CQPSQ_CQ_CHKOVERFLOW, check_overflow) | FIELD_PREP(IRDMA_CQPSQ_CQ_VIRTMAP, cq->virtual_map) | + FIELD_PREP(IRDMA_CQPSQ_CQ_CQID_HIGH, cq->cq_uk.cq_id >> 22) | + FIELD_PREP(IRDMA_CQPSQ_CQ_CEQID_HIGH, + (cq->ceq_id_valid ? cq->ceq_id : 0) >> 10) | FIELD_PREP(IRDMA_CQPSQ_CQ_ENCEQEMASK, cq->ceqe_mask) | FIELD_PREP(IRDMA_CQPSQ_CQ_CEQIDVALID, cq->ceq_id_valid) | FIELD_PREP(IRDMA_CQPSQ_TPHEN, cq->tph_en) | @@ -2706,6 +3133,41 @@ static int irdma_sc_cq_modify(struct irdma_sc_cq *cq, } /** + * irdma_sc_get_decoded_ird_size_gen_3 - get decoded IRD size for GEN 3 + * @ird_enc: IRD encoding + * IRD size defaults to a value of 4 in case of invalid input. + */ +static u16 irdma_sc_get_decoded_ird_size_gen_3(u8 ird_enc) +{ + switch (ird_enc) { + case IRDMA_IRD_HW_SIZE_4096_GEN3: + return 4096; + case IRDMA_IRD_HW_SIZE_2048_GEN3: + return 2048; + case IRDMA_IRD_HW_SIZE_1024_GEN3: + return 1024; + case IRDMA_IRD_HW_SIZE_512_GEN3: + return 512; + case IRDMA_IRD_HW_SIZE_256_GEN3: + return 256; + case IRDMA_IRD_HW_SIZE_128_GEN3: + return 128; + case IRDMA_IRD_HW_SIZE_64_GEN3: + return 64; + case IRDMA_IRD_HW_SIZE_32_GEN3: + return 32; + case IRDMA_IRD_HW_SIZE_16_GEN3: + return 16; + case IRDMA_IRD_HW_SIZE_8_GEN3: + return 8; + case IRDMA_IRD_HW_SIZE_4_GEN3: + return 4; + default: + return 4; + } +} + +/** * irdma_check_cqp_progress - check cqp processing progress * @timeout: timeout info struct * @dev: sc device struct @@ -2738,6 +3200,89 @@ static inline void irdma_get_cqp_reg_info(struct irdma_sc_cqp *cqp, u32 *val, } /** + * irdma_sc_cqp_def_cmpl_ae_handler - remove completed requests from pending list + * @dev: sc device struct + * @info: AE entry info + * @first: true if this is the first call to this handler for given AEQE + * @scratch: (out) scratch entry pointer + * @sw_def_info: (in/out) SW ticket value for this AE + * + * In case of AE_DEF_CMPL event, this function should be called in a loop + * until it returns NULL-ptr via scratch. + * For each call, it looks for a matching CQP request on pending list, + * removes it from the list and returns the pointer to the associated scratch + * entry. + * If this is the first call to this function for given AEQE, sw_def_info + * value is not used to find matching requests. Instead, it is populated + * with the value from the first matching cqp_request on the list. 
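The GEN3 IRD table above decodes encodings 0 through 10 to sizes 4 through 4096, defaulting to 4 on invalid input, and the SRQ branch in irdma_get_encoded_wqe_size() encodes a queue of n quanta as ilog2(n) - 3 (8 quanta encodes to 0). A small sketch of both relationships; treating the IRD table as decoded = 4 << encoding is an observation about the values, not driver code:

    #include <assert.h>
    #include <stdint.h>

    /* ilog2() stand-in for the kernel helper (valid for v > 0). */
    static unsigned int ilog2_u32(uint32_t v)
    {
        return 31 - __builtin_clz(v);
    }

    /* GEN3 IRD sizes are powers of two from 4 to 4096; the switch table in the
     * patch is equivalent to decoded = 4 << encoding, with 4 as the fallback.
     */
    static uint16_t decode_ird_gen3(uint8_t enc)
    {
        return enc <= 10 ? (uint16_t)(4u << enc) : 4;
    }

    /* SRQ sizes: the smallest queue of 8 quanta encodes to 0, i.e. ilog2(n) - 3. */
    static uint8_t encode_srq_size(uint32_t quanta)
    {
        return (uint8_t)(ilog2_u32(quanta) - 3);
    }

    int main(void)
    {
        assert(decode_ird_gen3(0) == 4);
        assert(decode_ird_gen3(10) == 4096);
        assert(encode_srq_size(8) == 0);
        assert(encode_srq_size(256) == 5);
        return 0;
    }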
+ * For subsequent calls, ooo_op->sw_def_info need to match the value passed + * by a caller. + * + * Return: scratch entry pointer for cqp_request to be released or NULL + * if no matching request is found. + */ +void irdma_sc_cqp_def_cmpl_ae_handler(struct irdma_sc_dev *dev, + struct irdma_aeqe_info *info, + bool first, u64 *scratch, + u32 *sw_def_info) +{ + struct irdma_ooo_cqp_op *ooo_op; + unsigned long flags; + + *scratch = 0; + + spin_lock_irqsave(&dev->cqp->ooo_list_lock, flags); + list_for_each_entry(ooo_op, &dev->cqp->ooo_pnd, list_entry) { + if (ooo_op->deferred && + ((first && ooo_op->def_info == info->def_info) || + (!first && ooo_op->sw_def_info == *sw_def_info))) { + *sw_def_info = ooo_op->sw_def_info; + *scratch = ooo_op->scratch; + + list_move(&ooo_op->list_entry, &dev->cqp->ooo_avail); + atomic64_inc(&dev->cqp->completed_ops); + + break; + } + } + spin_unlock_irqrestore(&dev->cqp->ooo_list_lock, flags); + + if (first && !*scratch) + ibdev_dbg(to_ibdev(dev), + "AEQ: deferred completion with unknown ticket: def_info 0x%x\n", + info->def_info); +} + +/** + * irdma_sc_cqp_cleanup_handler - remove requests from pending list + * @dev: sc device struct + * + * This function should be called in a loop from irdma_cleanup_pending_cqp_op. + * For each call, it returns first CQP request on pending list, removes it + * from the list and returns the pointer to the associated scratch entry. + * + * Return: scratch entry pointer for cqp_request to be released or NULL + * if pending list is empty. + */ +u64 irdma_sc_cqp_cleanup_handler(struct irdma_sc_dev *dev) +{ + struct irdma_ooo_cqp_op *ooo_op; + u64 scratch = 0; + + list_for_each_entry(ooo_op, &dev->cqp->ooo_pnd, list_entry) { + scratch = ooo_op->scratch; + + list_del(&ooo_op->list_entry); + list_add(&ooo_op->list_entry, &dev->cqp->ooo_avail); + atomic64_inc(&dev->cqp->completed_ops); + + break; + } + + return scratch; +} + +/** * irdma_cqp_poll_registers - poll cqp registers * @cqp: struct for cqp hw * @tail: wqtail register value @@ -2794,7 +3339,10 @@ static u64 irdma_sc_decode_fpm_commit(struct irdma_sc_dev *dev, __le64 *buf, obj_info[rsrc_idx].cnt = (u32)FLD_RS_64(dev, temp, IRDMA_COMMIT_FPM_CQCNT); break; case IRDMA_HMC_IW_APBVT_ENTRY: - obj_info[rsrc_idx].cnt = 1; + if (dev->hw_attrs.uk_attrs.hw_rev <= IRDMA_GEN_2) + obj_info[rsrc_idx].cnt = 1; + else + obj_info[rsrc_idx].cnt = 0; break; default: obj_info[rsrc_idx].cnt = (u32)temp; @@ -2829,7 +3377,8 @@ irdma_sc_parse_fpm_commit_buf(struct irdma_sc_dev *dev, __le64 *buf, IRDMA_HMC_IW_QP); irdma_sc_decode_fpm_commit(dev, buf, 8, info, IRDMA_HMC_IW_CQ); - /* skiping RSRVD */ + irdma_sc_decode_fpm_commit(dev, buf, 16, info, + IRDMA_HMC_IW_SRQ); irdma_sc_decode_fpm_commit(dev, buf, 24, info, IRDMA_HMC_IW_HTE); irdma_sc_decode_fpm_commit(dev, buf, 32, info, @@ -2864,15 +3413,17 @@ irdma_sc_parse_fpm_commit_buf(struct irdma_sc_dev *dev, __le64 *buf, IRDMA_HMC_IW_HDR); irdma_sc_decode_fpm_commit(dev, buf, 152, info, IRDMA_HMC_IW_MD); - irdma_sc_decode_fpm_commit(dev, buf, 160, info, - IRDMA_HMC_IW_OOISC); - irdma_sc_decode_fpm_commit(dev, buf, 168, info, - IRDMA_HMC_IW_OOISCFFL); + if (dev->cqp->protocol_used == IRDMA_IWARP_PROTOCOL_ONLY) { + irdma_sc_decode_fpm_commit(dev, buf, 160, info, + IRDMA_HMC_IW_OOISC); + irdma_sc_decode_fpm_commit(dev, buf, 168, info, + IRDMA_HMC_IW_OOISCFFL); + } } /* searching for the last object in HMC to find the size of the HMC area. 
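The deferred-completion handler above matches entries on the pending list by ticket, hands back the saved scratch value, and recycles the matched node onto the available list. A simplified userspace sketch of that bookkeeping, without the driver's list_head plumbing or locking; the types and names below are invented for illustration:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct ooo_op {
        struct ooo_op *next;
        uint64_t scratch;
        uint32_t ticket;
        bool deferred;
    };

    /* Move the first pending entry matching 'ticket' to the free list and
     * return its scratch value; 0 means nothing matched. This mirrors the
     * idea of the handler, not its exact list API.
     */
    static uint64_t complete_by_ticket(struct ooo_op **pending,
                                       struct ooo_op **avail, uint32_t ticket)
    {
        for (struct ooo_op **pp = pending; *pp; pp = &(*pp)->next) {
            struct ooo_op *op = *pp;

            if (op->deferred && op->ticket == ticket) {
                *pp = op->next;        /* unlink from pending */
                op->next = *avail;     /* push onto available */
                *avail = op;
                return op->scratch;
            }
        }
        return 0;
    }

    int main(void)
    {
        struct ooo_op a = { .ticket = 7, .deferred = true, .scratch = 0x100 };
        struct ooo_op b = { .next = &a, .ticket = 9, .deferred = true, .scratch = 0x200 };
        struct ooo_op *pending = &b, *avail = NULL;

        printf("scratch for ticket 7: 0x%llx\n",
               (unsigned long long)complete_by_ticket(&pending, &avail, 7));
        return 0;
    }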
*/ for (i = IRDMA_HMC_IW_QP; i < IRDMA_HMC_IW_MAX; i++) { - if (info[i].base > max_base) { + if (info[i].base > max_base && info[i].cnt) { max_base = info[i].base; last_hmc_obj = i; } @@ -2927,6 +3478,7 @@ static int irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf, struct irdma_hmc_fpm_misc *hmc_fpm_misc) { struct irdma_hmc_obj_info *obj_info; + u8 ird_encoding; u64 temp; u32 size; u16 max_pe_sds; @@ -2935,7 +3487,19 @@ static int irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf, get_64bit_val(buf, 0, &temp); hmc_info->first_sd_index = (u16)FIELD_GET(IRDMA_QUERY_FPM_FIRST_PE_SD_INDEX, temp); - max_pe_sds = (u16)FIELD_GET(IRDMA_QUERY_FPM_MAX_PE_SDS, temp); + + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) + max_pe_sds = (u16)FIELD_GET(IRDMA_QUERY_FPM_MAX_PE_SDS_GEN3, temp); + else + max_pe_sds = (u16)FIELD_GET(IRDMA_QUERY_FPM_MAX_PE_SDS, temp); + + /* Reduce SD count for unprivleged functions by 1 to account for PBLE + * backing page rounding + */ + if (dev->hw_attrs.uk_attrs.hw_rev <= IRDMA_GEN_2 && + (hmc_info->hmc_fn_id >= dev->hw_attrs.first_hw_vf_fpm_id || + !dev->privileged)) + max_pe_sds--; hmc_fpm_misc->max_sds = max_pe_sds; hmc_info->sd_table.sd_cnt = max_pe_sds + hmc_info->first_sd_index; @@ -2949,11 +3513,17 @@ static int irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf, size = (u32)(temp >> 32); obj_info[IRDMA_HMC_IW_CQ].size = BIT_ULL(size); + irdma_sc_decode_fpm_query(buf, 24, obj_info, IRDMA_HMC_IW_SRQ); irdma_sc_decode_fpm_query(buf, 32, obj_info, IRDMA_HMC_IW_HTE); irdma_sc_decode_fpm_query(buf, 40, obj_info, IRDMA_HMC_IW_ARP); - obj_info[IRDMA_HMC_IW_APBVT_ENTRY].size = 8192; - obj_info[IRDMA_HMC_IW_APBVT_ENTRY].max_cnt = 1; + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + obj_info[IRDMA_HMC_IW_APBVT_ENTRY].size = 0; + obj_info[IRDMA_HMC_IW_APBVT_ENTRY].max_cnt = 0; + } else { + obj_info[IRDMA_HMC_IW_APBVT_ENTRY].size = 8192; + obj_info[IRDMA_HMC_IW_APBVT_ENTRY].max_cnt = 1; + } irdma_sc_decode_fpm_query(buf, 48, obj_info, IRDMA_HMC_IW_MR); irdma_sc_decode_fpm_query(buf, 56, obj_info, IRDMA_HMC_IW_XF); @@ -2962,7 +3532,7 @@ static int irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf, obj_info[IRDMA_HMC_IW_XFFL].max_cnt = (u32)temp; obj_info[IRDMA_HMC_IW_XFFL].size = 4; hmc_fpm_misc->xf_block_size = FIELD_GET(IRDMA_QUERY_FPM_XFBLOCKSIZE, temp); - if (!hmc_fpm_misc->xf_block_size) + if (obj_info[IRDMA_HMC_IW_XF].max_cnt && !hmc_fpm_misc->xf_block_size) return -EINVAL; irdma_sc_decode_fpm_query(buf, 72, obj_info, IRDMA_HMC_IW_Q1); @@ -2984,6 +3554,14 @@ static int irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf, hmc_fpm_misc->max_ceqs = FIELD_GET(IRDMA_QUERY_FPM_MAX_CEQS, temp); hmc_fpm_misc->ht_multiplier = FIELD_GET(IRDMA_QUERY_FPM_HTMULTIPLIER, temp); hmc_fpm_misc->timer_bucket = FIELD_GET(IRDMA_QUERY_FPM_TIMERBUCKET, temp); + if (FIELD_GET(IRDMA_MANAGE_RSRC_VER2, + dev->feature_info[IRDMA_FTN_FLAGS])) { + ird_encoding = (u8)FIELD_GET(IRDMA_QUERY_FPM_MAX_IRD, temp); + hmc_fpm_misc->ird = + irdma_sc_get_decoded_ird_size_gen_3(ird_encoding) / 2; + dev->hw_attrs.max_hw_ird = hmc_fpm_misc->ird; + dev->hw_attrs.max_hw_ord = hmc_fpm_misc->ird; + } if (dev->hw_attrs.uk_attrs.hw_rev == IRDMA_GEN_1) return 0; irdma_sc_decode_fpm_query(buf, 96, obj_info, IRDMA_HMC_IW_FSIMC); @@ -3000,15 +3578,25 @@ static int irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf, irdma_sc_decode_fpm_query(buf, 144, obj_info, IRDMA_HMC_IW_HDR); irdma_sc_decode_fpm_query(buf, 152, 
obj_info, IRDMA_HMC_IW_MD); - irdma_sc_decode_fpm_query(buf, 160, obj_info, IRDMA_HMC_IW_OOISC); - - get_64bit_val(buf, 168, &temp); - obj_info[IRDMA_HMC_IW_OOISCFFL].max_cnt = (u32)temp; - obj_info[IRDMA_HMC_IW_OOISCFFL].size = 4; - hmc_fpm_misc->ooiscf_block_size = FIELD_GET(IRDMA_QUERY_FPM_OOISCFBLOCKSIZE, temp); - if (!hmc_fpm_misc->ooiscf_block_size && - obj_info[IRDMA_HMC_IW_OOISCFFL].max_cnt) - return -EINVAL; + + if (dev->cqp->protocol_used == IRDMA_IWARP_PROTOCOL_ONLY) { + irdma_sc_decode_fpm_query(buf, 160, obj_info, IRDMA_HMC_IW_OOISC); + + get_64bit_val(buf, 168, &temp); + obj_info[IRDMA_HMC_IW_OOISCFFL].max_cnt = (u32)temp; + obj_info[IRDMA_HMC_IW_OOISCFFL].size = 4; + hmc_fpm_misc->ooiscf_block_size = FIELD_GET(IRDMA_QUERY_FPM_OOISCFBLOCKSIZE, temp); + if (!hmc_fpm_misc->ooiscf_block_size && + obj_info[IRDMA_HMC_IW_OOISCFFL].max_cnt) + return -EINVAL; + } + + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + get_64bit_val(buf, 176, &temp); + hmc_fpm_misc->loc_mem_pages = (u32)FIELD_GET(IRDMA_QUERY_FPM_LOC_MEM_PAGES, temp); + if (!hmc_fpm_misc->loc_mem_pages) + return -EINVAL; + } return 0; } @@ -3088,6 +3676,8 @@ exit: int irdma_sc_cqp_init(struct irdma_sc_cqp *cqp, struct irdma_cqp_init_info *info) { + struct irdma_ooo_cqp_op *ooo_op; + u32 num_ooo_ops; u8 hw_sq_size; if (info->sq_size > IRDMA_CQP_SW_SQSIZE_2048 || @@ -3118,17 +3708,43 @@ int irdma_sc_cqp_init(struct irdma_sc_cqp *cqp, cqp->rocev2_rto_policy = info->rocev2_rto_policy; cqp->protocol_used = info->protocol_used; memcpy(&cqp->dcqcn_params, &info->dcqcn_params, sizeof(cqp->dcqcn_params)); + if (cqp->dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + cqp->ooisc_blksize = info->ooisc_blksize; + cqp->rrsp_blksize = info->rrsp_blksize; + cqp->q1_blksize = info->q1_blksize; + cqp->xmit_blksize = info->xmit_blksize; + cqp->blksizes_valid = info->blksizes_valid; + cqp->ts_shift = info->ts_shift; + cqp->ts_override = info->ts_override; + cqp->en_fine_grained_timers = info->en_fine_grained_timers; + cqp->pe_en_vf_cnt = info->pe_en_vf_cnt; + cqp->ooo_op_array = info->ooo_op_array; + /* initialize the OOO lists */ + INIT_LIST_HEAD(&cqp->ooo_avail); + INIT_LIST_HEAD(&cqp->ooo_pnd); + if (cqp->ooo_op_array) { + /* Populate avail list entries */ + for (num_ooo_ops = 0, ooo_op = info->ooo_op_array; + num_ooo_ops < cqp->sq_size; + num_ooo_ops++, ooo_op++) + list_add(&ooo_op->list_entry, &cqp->ooo_avail); + } + } info->dev->cqp = cqp; IRDMA_RING_INIT(cqp->sq_ring, cqp->sq_size); + cqp->last_def_cmpl_ticket = 0; + cqp->sw_def_cmpl_ticket = 0; cqp->requested_ops = 0; atomic64_set(&cqp->completed_ops, 0); /* for the cqp commands backlog. 
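irdma_sc_cqp_init() above seeds the ooo_avail list by walking the caller-provided ooo_op_array and adding every element to the available list. The same free-list seeding idea in a standalone form, with a hypothetical node type:

    #include <stddef.h>

    struct node { struct node *next; };

    /* Thread every element of a preallocated array onto a singly linked
     * free list, the same shape as populating ooo_avail from ooo_op_array.
     */
    static struct node *seed_free_list(struct node *array, size_t n)
    {
        struct node *head = NULL;

        for (size_t i = 0; i < n; i++) {
            array[i].next = head;
            head = &array[i];
        }
        return head;
    }

    int main(void)
    {
        struct node pool[8];
        struct node *avail = seed_free_list(pool, 8);

        return avail ? 0 : 1;
    }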
*/ INIT_LIST_HEAD(&cqp->dev->cqp_cmd_head); writel(0, cqp->dev->hw_regs[IRDMA_CQPTAIL]); - writel(0, cqp->dev->hw_regs[IRDMA_CQPDB]); - writel(0, cqp->dev->hw_regs[IRDMA_CCQPSTATUS]); + if (cqp->dev->hw_attrs.uk_attrs.hw_rev <= IRDMA_GEN_2) { + writel(0, cqp->dev->hw_regs[IRDMA_CQPDB]); + writel(0, cqp->dev->hw_regs[IRDMA_CCQPSTATUS]); + } ibdev_dbg(to_ibdev(cqp->dev), "WQE: sq_size[%04d] hw_sq_size[%04d] sq_base[%p] sq_pa[%p] cqp[%p] polarity[x%04x]\n", @@ -3160,6 +3776,7 @@ int irdma_sc_cqp_create(struct irdma_sc_cqp *cqp, u16 *maj_err, u16 *min_err) return -ENOMEM; spin_lock_init(&cqp->dev->cqp_lock); + spin_lock_init(&cqp->ooo_list_lock); temp = FIELD_PREP(IRDMA_CQPHC_SQSIZE, cqp->hw_sq_size) | FIELD_PREP(IRDMA_CQPHC_SVER, cqp->struct_ver) | @@ -3171,12 +3788,29 @@ int irdma_sc_cqp_create(struct irdma_sc_cqp *cqp, u16 *maj_err, u16 *min_err) FIELD_PREP(IRDMA_CQPHC_PROTOCOL_USED, cqp->protocol_used); } + if (hw_rev >= IRDMA_GEN_3) + temp |= FIELD_PREP(IRDMA_CQPHC_EN_FINE_GRAINED_TIMERS, + cqp->en_fine_grained_timers); set_64bit_val(cqp->host_ctx, 0, temp); set_64bit_val(cqp->host_ctx, 8, cqp->sq_pa); temp = FIELD_PREP(IRDMA_CQPHC_ENABLED_VFS, cqp->ena_vf_count) | FIELD_PREP(IRDMA_CQPHC_HMC_PROFILE, cqp->hmc_profile); + + if (hw_rev >= IRDMA_GEN_3) + temp |= FIELD_PREP(IRDMA_CQPHC_OOISC_BLKSIZE, + cqp->ooisc_blksize) | + FIELD_PREP(IRDMA_CQPHC_RRSP_BLKSIZE, + cqp->rrsp_blksize) | + FIELD_PREP(IRDMA_CQPHC_Q1_BLKSIZE, cqp->q1_blksize) | + FIELD_PREP(IRDMA_CQPHC_XMIT_BLKSIZE, + cqp->xmit_blksize) | + FIELD_PREP(IRDMA_CQPHC_BLKSIZES_VALID, + cqp->blksizes_valid) | + FIELD_PREP(IRDMA_CQPHC_TIMESTAMP_OVERRIDE, + cqp->ts_override) | + FIELD_PREP(IRDMA_CQPHC_TS_SHIFT, cqp->ts_shift); set_64bit_val(cqp->host_ctx, 16, temp); set_64bit_val(cqp->host_ctx, 24, (uintptr_t)cqp); temp = FIELD_PREP(IRDMA_CQPHC_HW_MAJVER, cqp->hw_maj_ver) | @@ -3338,6 +3972,87 @@ void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq) } /** + * irdma_sc_process_def_cmpl - process deferred or pending completion + * @cqp: CQP sc struct + * @info: CQP CQE info + * @wqe_idx: CQP WQE descriptor index + * @def_info: deferred op ticket value or out-of-order completion id + * @def_cmpl: true for deferred completion, false for pending (RCA) + */ +static void irdma_sc_process_def_cmpl(struct irdma_sc_cqp *cqp, + struct irdma_ccq_cqe_info *info, + u32 wqe_idx, u32 def_info, bool def_cmpl) +{ + struct irdma_ooo_cqp_op *ooo_op; + unsigned long flags; + + /* Deferred and out-of-order completions share the same list of pending + * completions. Since the list can be also accessed from AE handler, + * it must be protected by a lock. + */ + spin_lock_irqsave(&cqp->ooo_list_lock, flags); + + /* For deferred completions bump up SW completion ticket value. */ + if (def_cmpl) { + cqp->last_def_cmpl_ticket = def_info; + cqp->sw_def_cmpl_ticket++; + } + if (!list_empty(&cqp->ooo_avail)) { + ooo_op = (struct irdma_ooo_cqp_op *) + list_entry(cqp->ooo_avail.next, + struct irdma_ooo_cqp_op, list_entry); + + list_del(&ooo_op->list_entry); + ooo_op->scratch = info->scratch; + ooo_op->def_info = def_info; + ooo_op->sw_def_info = cqp->sw_def_cmpl_ticket; + ooo_op->deferred = def_cmpl; + ooo_op->wqe_idx = wqe_idx; + /* Pending completions must be chronologically ordered, + * so adding at the end of list. 
+ */ + list_add_tail(&ooo_op->list_entry, &cqp->ooo_pnd); + } + spin_unlock_irqrestore(&cqp->ooo_list_lock, flags); + + info->pending = true; +} + +/** + * irdma_sc_process_ooo_cmpl - process out-of-order (final) completion + * @cqp: CQP sc struct + * @info: CQP CQE info + * @def_info: out-of-order completion id + */ +static void irdma_sc_process_ooo_cmpl(struct irdma_sc_cqp *cqp, + struct irdma_ccq_cqe_info *info, + u32 def_info) +{ + struct irdma_ooo_cqp_op *ooo_op_tmp; + struct irdma_ooo_cqp_op *ooo_op; + unsigned long flags; + + info->scratch = 0; + + spin_lock_irqsave(&cqp->ooo_list_lock, flags); + list_for_each_entry_safe(ooo_op, ooo_op_tmp, &cqp->ooo_pnd, + list_entry) { + if (!ooo_op->deferred && ooo_op->def_info == def_info) { + list_del(&ooo_op->list_entry); + info->scratch = ooo_op->scratch; + list_add(&ooo_op->list_entry, &cqp->ooo_avail); + break; + } + } + spin_unlock_irqrestore(&cqp->ooo_list_lock, flags); + + if (!info->scratch) + ibdev_dbg(to_ibdev(cqp->dev), + "CQP: DEBUG_FW_OOO out-of-order completion with unknown def_info = 0x%x\n", + def_info); +} + +/** * irdma_sc_ccq_get_cqe_info - get ccq's cq entry * @ccq: ccq sc struct * @info: completion q entry to return @@ -3345,6 +4060,10 @@ void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq) int irdma_sc_ccq_get_cqe_info(struct irdma_sc_cq *ccq, struct irdma_ccq_cqe_info *info) { + u32 def_info; + bool def_cmpl = false; + bool pend_cmpl = false; + bool ooo_final_cmpl = false; u64 qp_ctx, temp, temp1; __le64 *cqe; struct irdma_sc_cqp *cqp; @@ -3352,6 +4071,7 @@ int irdma_sc_ccq_get_cqe_info(struct irdma_sc_cq *ccq, u32 error; u8 polarity; int ret_code = 0; + unsigned long flags; if (ccq->cq_uk.avoid_mem_cflct) cqe = IRDMA_GET_CURRENT_EXTENDED_CQ_ELEM(&ccq->cq_uk); @@ -3383,6 +4103,25 @@ int irdma_sc_ccq_get_cqe_info(struct irdma_sc_cq *ccq, get_64bit_val(cqe, 16, &temp1); info->op_ret_val = (u32)FIELD_GET(IRDMA_CCQ_OPRETVAL, temp1); + if (cqp->dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + def_cmpl = info->maj_err_code == IRDMA_CQPSQ_MAJ_NO_ERROR && + info->min_err_code == IRDMA_CQPSQ_MIN_DEF_CMPL; + def_info = (u32)FIELD_GET(IRDMA_CCQ_DEFINFO, temp1); + + pend_cmpl = info->maj_err_code == IRDMA_CQPSQ_MAJ_NO_ERROR && + info->min_err_code == IRDMA_CQPSQ_MIN_OOO_CMPL; + + ooo_final_cmpl = (bool)FIELD_GET(IRDMA_OOO_CMPL, temp); + + if (def_cmpl || pend_cmpl || ooo_final_cmpl) { + if (ooo_final_cmpl) + irdma_sc_process_ooo_cmpl(cqp, info, def_info); + else + irdma_sc_process_def_cmpl(cqp, info, wqe_idx, + def_info, def_cmpl); + } + } + get_64bit_val(cqp->sq_base[wqe_idx].elem, 24, &temp1); info->op_code = (u8)FIELD_GET(IRDMA_CQPSQ_OPCODE, temp1); info->cqp = cqp; @@ -3399,7 +4138,16 @@ int irdma_sc_ccq_get_cqe_info(struct irdma_sc_cq *ccq, dma_wmb(); /* make sure shadow area is updated before moving tail */ - IRDMA_RING_MOVE_TAIL(cqp->sq_ring); + spin_lock_irqsave(&cqp->dev->cqp_lock, flags); + if (!ooo_final_cmpl) + IRDMA_RING_MOVE_TAIL(cqp->sq_ring); + spin_unlock_irqrestore(&cqp->dev->cqp_lock, flags); + + /* Do not increment completed_ops counter on pending or deferred + * completions. + */ + if (pend_cmpl || def_cmpl) + return ret_code; atomic64_inc(&cqp->completed_ops); return ret_code; @@ -3647,7 +4395,7 @@ int irdma_sc_ceq_init(struct irdma_sc_ceq *ceq, ceq->pbl_list = (ceq->virtual_map ? 
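For GEN3, irdma_sc_ccq_get_cqe_info() above distinguishes a deferred completion (no error plus the deferred minor code), a pending out-of-order completion, and a final out-of-order completion flagged in the CQE itself; pending and deferred completions return before completed_ops is incremented, while a final out-of-order completion skips the SQ tail move instead. A hedged sketch of that classification; the minor-code values below are placeholders, since their numeric definitions are not shown in this hunk:

    #include <stdbool.h>
    #include <stdint.h>

    enum cmpl_kind { CMPL_NORMAL, CMPL_DEFERRED, CMPL_PENDING, CMPL_OOO_FINAL };

    /* Placeholder codes; the driver compares against IRDMA_CQPSQ_MIN_DEF_CMPL
     * and IRDMA_CQPSQ_MIN_OOO_CMPL, whose values are not visible here.
     */
    #define MAJ_NO_ERROR 0
    #define MIN_DEF_CMPL 1
    #define MIN_OOO_CMPL 2

    static enum cmpl_kind classify(uint16_t maj, uint16_t min, bool ooo_final_bit)
    {
        if (ooo_final_bit)
            return CMPL_OOO_FINAL;     /* release the original scratch, keep SQ tail */
        if (maj == MAJ_NO_ERROR && min == MIN_DEF_CMPL)
            return CMPL_DEFERRED;      /* park on the pending list with a ticket */
        if (maj == MAJ_NO_ERROR && min == MIN_OOO_CMPL)
            return CMPL_PENDING;       /* real completion arrives later */
        return CMPL_NORMAL;
    }

    int main(void)
    {
        return classify(MAJ_NO_ERROR, MIN_DEF_CMPL, false) == CMPL_DEFERRED ? 0 : 1;
    }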
info->pbl_list : NULL); ceq->tph_en = info->tph_en; ceq->tph_val = info->tph_val; - ceq->vsi = info->vsi; + ceq->vsi_idx = info->vsi_idx; ceq->polarity = 1; IRDMA_RING_INIT(ceq->ceq_ring, ceq->elem_cnt); ceq->dev->ceq[info->ceq_id] = ceq; @@ -3680,13 +4428,16 @@ static int irdma_sc_ceq_create(struct irdma_sc_ceq *ceq, u64 scratch, (ceq->virtual_map ? ceq->first_pm_pbl_idx : 0)); set_64bit_val(wqe, 56, FIELD_PREP(IRDMA_CQPSQ_TPHVAL, ceq->tph_val) | - FIELD_PREP(IRDMA_CQPSQ_VSIIDX, ceq->vsi->vsi_idx)); + FIELD_PREP(IRDMA_CQPSQ_PASID, ceq->pasid) | + FIELD_PREP(IRDMA_CQPSQ_VSIIDX, ceq->vsi_idx)); hdr = FIELD_PREP(IRDMA_CQPSQ_CEQ_CEQID, ceq->ceq_id) | + FIELD_PREP(IRDMA_CQPSQ_CEQ_CEQID_HIGH, ceq->ceq_id >> 10) | FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_CREATE_CEQ) | FIELD_PREP(IRDMA_CQPSQ_CEQ_LPBLSIZE, ceq->pbl_chunk_size) | FIELD_PREP(IRDMA_CQPSQ_CEQ_VMAP, ceq->virtual_map) | FIELD_PREP(IRDMA_CQPSQ_CEQ_ITRNOEXPIRE, ceq->itr_no_expire) | FIELD_PREP(IRDMA_CQPSQ_TPHEN, ceq->tph_en) | + FIELD_PREP(IRDMA_CQPSQ_PASID_VALID, ceq->pasid_valid) | FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); dma_wmb(); /* make sure WQE is written before valid bit is set */ @@ -3741,7 +4492,7 @@ int irdma_sc_cceq_create(struct irdma_sc_ceq *ceq, u64 scratch) int ret_code; struct irdma_sc_dev *dev = ceq->dev; - dev->ccq->vsi = ceq->vsi; + dev->ccq->vsi_idx = ceq->vsi_idx; if (ceq->reg_cq) { ret_code = irdma_sc_add_cq_ctx(ceq, ceq->dev->ccq); if (ret_code) @@ -3774,11 +4525,14 @@ int irdma_sc_ceq_destroy(struct irdma_sc_ceq *ceq, u64 scratch, bool post_sq) set_64bit_val(wqe, 16, ceq->elem_cnt); set_64bit_val(wqe, 48, ceq->first_pm_pbl_idx); + set_64bit_val(wqe, 56, + FIELD_PREP(IRDMA_CQPSQ_PASID, ceq->pasid)); hdr = ceq->ceq_id | FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_DESTROY_CEQ) | FIELD_PREP(IRDMA_CQPSQ_CEQ_LPBLSIZE, ceq->pbl_chunk_size) | FIELD_PREP(IRDMA_CQPSQ_CEQ_VMAP, ceq->virtual_map) | FIELD_PREP(IRDMA_CQPSQ_TPHEN, ceq->tph_en) | + FIELD_PREP(IRDMA_CQPSQ_PASID_VALID, ceq->pasid_valid) | FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); dma_wmb(); /* make sure WQE is written before valid bit is set */ @@ -3942,10 +4696,13 @@ static int irdma_sc_aeq_create(struct irdma_sc_aeq *aeq, u64 scratch, (aeq->virtual_map ? 0 : aeq->aeq_elem_pa)); set_64bit_val(wqe, 48, (aeq->virtual_map ? 
aeq->first_pm_pbl_idx : 0)); + set_64bit_val(wqe, 56, + FIELD_PREP(IRDMA_CQPSQ_PASID, aeq->pasid)); hdr = FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_CREATE_AEQ) | FIELD_PREP(IRDMA_CQPSQ_AEQ_LPBLSIZE, aeq->pbl_chunk_size) | FIELD_PREP(IRDMA_CQPSQ_AEQ_VMAP, aeq->virtual_map) | + FIELD_PREP(IRDMA_CQPSQ_PASID_VALID, aeq->pasid_valid) | FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); dma_wmb(); /* make sure WQE is written before valid bit is set */ @@ -3974,7 +4731,8 @@ static int irdma_sc_aeq_destroy(struct irdma_sc_aeq *aeq, u64 scratch, u64 hdr; dev = aeq->dev; - writel(0, dev->hw_regs[IRDMA_PFINT_AEQCTL]); + if (dev->privileged) + writel(0, dev->hw_regs[IRDMA_PFINT_AEQCTL]); cqp = dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); @@ -3982,9 +4740,12 @@ static int irdma_sc_aeq_destroy(struct irdma_sc_aeq *aeq, u64 scratch, return -ENOMEM; set_64bit_val(wqe, 16, aeq->elem_cnt); set_64bit_val(wqe, 48, aeq->first_pm_pbl_idx); + set_64bit_val(wqe, 56, + FIELD_PREP(IRDMA_CQPSQ_PASID, aeq->pasid)); hdr = FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_DESTROY_AEQ) | FIELD_PREP(IRDMA_CQPSQ_AEQ_LPBLSIZE, aeq->pbl_chunk_size) | FIELD_PREP(IRDMA_CQPSQ_AEQ_VMAP, aeq->virtual_map) | + FIELD_PREP(IRDMA_CQPSQ_PASID_VALID, aeq->pasid_valid) | FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); dma_wmb(); /* make sure WQE is written before valid bit is set */ @@ -4025,18 +4786,39 @@ int irdma_sc_get_next_aeqe(struct irdma_sc_aeq *aeq, print_hex_dump_debug("WQE: AEQ_ENTRY WQE", DUMP_PREFIX_OFFSET, 16, 8, aeqe, 16, false); - ae_src = (u8)FIELD_GET(IRDMA_AEQE_AESRC, temp); - info->wqe_idx = (u16)FIELD_GET(IRDMA_AEQE_WQDESCIDX, temp); - info->qp_cq_id = (u32)FIELD_GET(IRDMA_AEQE_QPCQID_LOW, temp) | + if (aeq->dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + ae_src = (u8)FIELD_GET(IRDMA_AEQE_AESRC_GEN_3, temp); + info->wqe_idx = (u16)FIELD_GET(IRDMA_AEQE_WQDESCIDX_GEN_3, + temp); + info->qp_cq_id = (u32)FIELD_GET(IRDMA_AEQE_QPCQID_GEN_3, temp); + info->ae_id = (u16)FIELD_GET(IRDMA_AEQE_AECODE_GEN_3, temp); + info->tcp_state = (u8)FIELD_GET(IRDMA_AEQE_TCPSTATE_GEN_3, compl_ctx); + info->iwarp_state = (u8)FIELD_GET(IRDMA_AEQE_IWSTATE_GEN_3, temp); + info->q2_data_written = (u8)FIELD_GET(IRDMA_AEQE_Q2DATA_GEN_3, compl_ctx); + info->aeqe_overflow = (bool)FIELD_GET(IRDMA_AEQE_OVERFLOW_GEN_3, temp); + info->compl_ctx = FIELD_GET(IRDMA_AEQE_CMPL_CTXT, compl_ctx); + compl_ctx = FIELD_GET(IRDMA_AEQE_CMPL_CTXT, compl_ctx) << IRDMA_AEQE_CMPL_CTXT_S; + } else { + ae_src = (u8)FIELD_GET(IRDMA_AEQE_AESRC, temp); + info->wqe_idx = (u16)FIELD_GET(IRDMA_AEQE_WQDESCIDX, temp); + info->qp_cq_id = (u32)FIELD_GET(IRDMA_AEQE_QPCQID_LOW, temp) | ((u32)FIELD_GET(IRDMA_AEQE_QPCQID_HI, temp) << 18); - info->ae_id = (u16)FIELD_GET(IRDMA_AEQE_AECODE, temp); - info->tcp_state = (u8)FIELD_GET(IRDMA_AEQE_TCPSTATE, temp); - info->iwarp_state = (u8)FIELD_GET(IRDMA_AEQE_IWSTATE, temp); - info->q2_data_written = (u8)FIELD_GET(IRDMA_AEQE_Q2DATA, temp); - info->aeqe_overflow = (bool)FIELD_GET(IRDMA_AEQE_OVERFLOW, temp); + info->ae_id = (u16)FIELD_GET(IRDMA_AEQE_AECODE, temp); + info->tcp_state = (u8)FIELD_GET(IRDMA_AEQE_TCPSTATE, temp); + info->iwarp_state = (u8)FIELD_GET(IRDMA_AEQE_IWSTATE, temp); + info->q2_data_written = (u8)FIELD_GET(IRDMA_AEQE_Q2DATA, temp); + info->aeqe_overflow = (bool)FIELD_GET(IRDMA_AEQE_OVERFLOW, + temp); + } info->ae_src = ae_src; switch (info->ae_id) { + case IRDMA_AE_SRQ_LIMIT: + info->srq = true; + /* [63:6] from CMPL_CTXT, [5:0] from WQDESCIDX. 
*/ + info->compl_ctx = compl_ctx; + ae_src = IRDMA_AE_SOURCE_RSVD; + break; case IRDMA_AE_PRIV_OPERATION_DENIED: case IRDMA_AE_AMP_INVALIDATE_TYPE1_MW: case IRDMA_AE_AMP_MWBIND_ZERO_BASED_TYPE1_MW: @@ -4069,6 +4851,10 @@ int irdma_sc_get_next_aeqe(struct irdma_sc_aeq *aeq, case IRDMA_AE_LLP_RECEIVED_MPA_CRC_ERROR: case IRDMA_AE_LLP_SEGMENT_TOO_SMALL: case IRDMA_AE_LLP_TOO_MANY_RETRIES: + case IRDMA_AE_LLP_TOO_MANY_RNRS: + case IRDMA_AE_REMOTE_QP_CATASTROPHIC: + case IRDMA_AE_LOCAL_QP_CATASTROPHIC: + case IRDMA_AE_RCE_QP_CATASTROPHIC: case IRDMA_AE_LLP_DOUBT_REACHABILITY: case IRDMA_AE_LLP_CONNECTION_ESTABLISHED: case IRDMA_AE_RESET_SENT: @@ -4085,6 +4871,10 @@ int irdma_sc_get_next_aeqe(struct irdma_sc_aeq *aeq, info->compl_ctx = compl_ctx << 1; ae_src = IRDMA_AE_SOURCE_RSVD; break; + case IRDMA_AE_CQP_DEFERRED_COMPLETE: + info->def_info = info->wqe_idx; + ae_src = IRDMA_AE_SOURCE_RSVD; + break; case IRDMA_AE_ROCE_EMPTY_MCG: case IRDMA_AE_ROCE_BAD_MC_IP_ADDR: case IRDMA_AE_ROCE_BAD_MC_QPID: @@ -4110,6 +4900,7 @@ int irdma_sc_get_next_aeqe(struct irdma_sc_aeq *aeq, info->qp = true; info->rq = true; info->compl_ctx = compl_ctx; + info->err_rq_idx_valid = true; break; case IRDMA_AE_SOURCE_CQ: case IRDMA_AE_SOURCE_CQ_0110: @@ -4125,8 +4916,18 @@ int irdma_sc_get_next_aeqe(struct irdma_sc_aeq *aeq, info->compl_ctx = compl_ctx; break; case IRDMA_AE_SOURCE_IN_RR_WR: + info->qp = true; + if (aeq->dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) + info->err_rq_idx_valid = true; + info->compl_ctx = compl_ctx; + info->in_rdrsp_wr = true; + break; case IRDMA_AE_SOURCE_IN_RR_WR_1011: info->qp = true; + if (aeq->dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + info->sq = true; + info->err_rq_idx_valid = true; + } info->compl_ctx = compl_ctx; info->in_rdrsp_wr = true; break; @@ -4336,6 +5137,26 @@ int irdma_sc_init_iw_hmc(struct irdma_sc_dev *dev, u8 hmc_fn_id) } /** + * irdma_set_loc_mem() - set a local memory bit field + * @buf: ptr to a buffer where local memory gets enabled + */ +static void irdma_set_loc_mem(__le64 *buf) +{ + u64 loc_mem_en = BIT_ULL(ENABLE_LOC_MEM); + u32 offset; + u64 temp; + + for (offset = 0; offset < IRDMA_COMMIT_FPM_BUF_SIZE; + offset += sizeof(__le64)) { + if (offset == IRDMA_PBLE_COMMIT_OFFSET) + continue; + get_64bit_val(buf, offset, &temp); + if (temp) + set_64bit_val(buf, offset, temp | loc_mem_en); + } +} + +/** * irdma_sc_cfg_iw_fpm() - commits hmc obj cnt values using cqp * command and populates fpm base address in hmc_info * @dev : ptr to irdma_dev struct @@ -4356,7 +5177,7 @@ static int irdma_sc_cfg_iw_fpm(struct irdma_sc_dev *dev, u8 hmc_fn_id) set_64bit_val(buf, 0, (u64)obj_info[IRDMA_HMC_IW_QP].cnt); set_64bit_val(buf, 8, (u64)obj_info[IRDMA_HMC_IW_CQ].cnt); - set_64bit_val(buf, 16, (u64)0); /* RSRVD */ + set_64bit_val(buf, 16, (u64)obj_info[IRDMA_HMC_IW_SRQ].cnt); set_64bit_val(buf, 24, (u64)obj_info[IRDMA_HMC_IW_HTE].cnt); set_64bit_val(buf, 32, (u64)obj_info[IRDMA_HMC_IW_ARP].cnt); set_64bit_val(buf, 40, (u64)0); /* RSVD */ @@ -4383,7 +5204,9 @@ static int irdma_sc_cfg_iw_fpm(struct irdma_sc_dev *dev, u8 hmc_fn_id) (u64)obj_info[IRDMA_HMC_IW_OOISC].cnt); set_64bit_val(buf, 168, (u64)obj_info[IRDMA_HMC_IW_OOISCFFL].cnt); - + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3 && + dev->hmc_fpm_misc.loc_mem_pages) + irdma_set_loc_mem(buf); commit_fpm_mem.pa = dev->fpm_commit_buf_pa; commit_fpm_mem.va = dev->fpm_commit_buf; @@ -4592,6 +5415,7 @@ static bool irdma_cqp_ring_full(struct irdma_sc_cqp *cqp) static u32 irdma_est_sd(struct irdma_sc_dev *dev, struct 
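irdma_set_loc_mem() above walks the commit-FPM buffer in 8-byte steps and ORs bit 63 (ENABLE_LOC_MEM) into every populated entry except the PBLE slot at byte offset 112. A simplified version of that loop; only the skip offset and bit position come from the patch, the buffer length used here is an assumption:

    #include <stdint.h>

    #define ENABLE_LOC_MEM_BIT (1ULL << 63)
    #define COMMIT_BUF_QWORDS  24   /* assumed length for this sketch */
    #define PBLE_COMMIT_QWORD  14   /* IRDMA_PBLE_COMMIT_OFFSET (112) as a qword index */

    /* Tag every populated counter with the local-memory bit, skipping the
     * PBLE entry, which stays in host memory.
     */
    static void set_loc_mem(uint64_t *buf)
    {
        for (unsigned int i = 0; i < COMMIT_BUF_QWORDS; i++) {
            if (i == PBLE_COMMIT_QWORD)
                continue;
            if (buf[i])
                buf[i] |= ENABLE_LOC_MEM_BIT;
        }
    }

    int main(void)
    {
        uint64_t buf[COMMIT_BUF_QWORDS] = { [0] = 128, [2] = 64, [14] = 4096 };

        set_loc_mem(buf);
        return (buf[14] & ENABLE_LOC_MEM_BIT) ? 1 : 0;   /* PBLE entry stays clear */
    }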
irdma_hmc_info *hmc_info) { + struct irdma_hmc_obj_info *pble_info; int i; u64 size = 0; u64 sd; @@ -4600,12 +5424,22 @@ static u32 irdma_est_sd(struct irdma_sc_dev *dev, if (i != IRDMA_HMC_IW_PBLE) size += round_up(hmc_info->hmc_obj[i].cnt * hmc_info->hmc_obj[i].size, 512); - size += round_up(hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt * - hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].size, 512); + + pble_info = &hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE]; + if (dev->privileged) + size += round_up(pble_info->cnt * pble_info->size, 512); if (size & 0x1FFFFF) sd = (size >> 21) + 1; /* add 1 for remainder */ else sd = size >> 21; + if (!dev->privileged && !dev->hmc_fpm_misc.loc_mem_pages) { + /* 2MB alignment for VF PBLE HMC */ + size = pble_info->cnt * pble_info->size; + if (size & 0x1FFFFF) + sd += (size >> 21) + 1; /* add 1 for remainder */ + else + sd += size >> 21; + } if (sd > 0xFFFFFFFF) { ibdev_dbg(to_ibdev(dev), "HMC: sd overflow[%lld]\n", sd); sd = 0xFFFFFFFF - 1; @@ -4615,17 +5449,6 @@ static u32 irdma_est_sd(struct irdma_sc_dev *dev, } /** - * irdma_sc_query_rdma_features_done - poll cqp for query features done - * @cqp: struct for cqp hw - */ -static int irdma_sc_query_rdma_features_done(struct irdma_sc_cqp *cqp) -{ - return irdma_sc_poll_for_cqp_op_done(cqp, - IRDMA_CQP_OP_QUERY_RDMA_FEATURES, - NULL); -} - -/** * irdma_sc_query_rdma_features - query RDMA features and FW ver * @cqp: struct for cqp hw * @buf: buffer to hold query info @@ -4634,7 +5457,9 @@ static int irdma_sc_query_rdma_features_done(struct irdma_sc_cqp *cqp) static int irdma_sc_query_rdma_features(struct irdma_sc_cqp *cqp, struct irdma_dma_mem *buf, u64 scratch) { + u32 tail, val, error; __le64 *wqe; + int status; u64 temp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); @@ -4654,9 +5479,15 @@ static int irdma_sc_query_rdma_features(struct irdma_sc_cqp *cqp, print_hex_dump_debug("WQE: QUERY RDMA FEATURES", DUMP_PREFIX_OFFSET, 16, 8, wqe, IRDMA_CQP_WQE_SIZE * 8, false); + irdma_get_cqp_reg_info(cqp, &val, &tail, &error); + irdma_sc_cqp_post_sq(cqp); + status = irdma_cqp_poll_registers(cqp, tail, + cqp->dev->hw_attrs.max_done_count); + if (error || status) + status = -EINVAL; - return 0; + return status; } /** @@ -4678,8 +5509,6 @@ int irdma_get_rdma_features(struct irdma_sc_dev *dev) return -ENOMEM; ret_code = irdma_sc_query_rdma_features(dev->cqp, &feat_buf, 0); - if (!ret_code) - ret_code = irdma_sc_query_rdma_features_done(dev->cqp); if (ret_code) goto exit; @@ -4703,8 +5532,6 @@ int irdma_get_rdma_features(struct irdma_sc_dev *dev) return -ENOMEM; ret_code = irdma_sc_query_rdma_features(dev->cqp, &feat_buf, 0); - if (!ret_code) - ret_code = irdma_sc_query_rdma_features_done(dev->cqp); if (ret_code) goto exit; @@ -4731,6 +5558,10 @@ int irdma_get_rdma_features(struct irdma_sc_dev *dev) } dev->feature_info[feat_type] = temp; } + + if (dev->feature_info[IRDMA_FTN_FLAGS] & IRDMA_ATOMICS_ALLOWED_BIT) + dev->hw_attrs.uk_attrs.feature_flags |= IRDMA_FEATURE_ATOMIC_OPS; + exit: dma_free_coherent(dev->hw->device, feat_buf.size, feat_buf.va, feat_buf.pa); @@ -4786,22 +5617,354 @@ static void cfg_fpm_value_gen_2(struct irdma_sc_dev *dev, } /** + * irdma_get_rsrc_mem_config - configure resources if local memory or host + * @dev: sc device struct + * @is_mrte_loc_mem: if true, MR's to be in local memory because sd=loc pages + * + * Only mr can be configured host or local memory if qp's are in local memory. 
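irdma_est_sd() above rounds each object's footprint up to 512 bytes, sums the results, and converts bytes to 2 MB segment descriptors by shifting right 21 and adding one for any remainder, that is, a DIV_ROUND_UP by 2 MB. A worked sketch with made-up object counts and sizes:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define SD_SIZE (1ULL << 21)   /* one segment descriptor backs 2 MB */

    static uint64_t round_up_u64(uint64_t v, uint64_t a)
    {
        return (v + a - 1) / a * a;
    }

    /* Same arithmetic as the SD estimate: (size >> 21), plus one if there is
     * a remainder, i.e. a 2 MB DIV_ROUND_UP over the 512-byte-rounded total.
     */
    static uint64_t est_sd(const uint64_t *cnt, const uint64_t *objsize, int n)
    {
        uint64_t size = 0;

        for (int i = 0; i < n; i++)
            size += round_up_u64(cnt[i] * objsize[i], 512);

        return (size + SD_SIZE - 1) / SD_SIZE;
    }

    int main(void)
    {
        uint64_t cnt[]  = { 4096, 8192 };   /* made-up object counts */
        uint64_t size[] = { 1024, 256 };    /* made-up per-object byte sizes */

        printf("estimated SDs: %" PRIu64 "\n", est_sd(cnt, size, 2));  /* prints 3 */
        return 0;
    }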
+ * If qp is in local memory, then all resource object will be in local memory + * except mr which can be either host or local memory. The only exception + * is pble's which are always in host memory. + */ +static void irdma_get_rsrc_mem_config(struct irdma_sc_dev *dev, bool is_mrte_loc_mem) +{ + struct irdma_hmc_info *hmc_info = dev->hmc_info; + int i; + + for (i = IRDMA_HMC_IW_QP; i < IRDMA_HMC_IW_MAX; i++) + hmc_info->hmc_obj[i].mem_loc = IRDMA_LOC_MEM; + + if (dev->feature_info[IRDMA_OBJ_1] && !is_mrte_loc_mem) { + u8 mem_type; + + mem_type = (u8)FIELD_GET(IRDMA_MR_MEM_LOC, dev->feature_info[IRDMA_OBJ_1]); + + hmc_info->hmc_obj[IRDMA_HMC_IW_MR].mem_loc = + (mem_type & IRDMA_OBJ_LOC_MEM_BIT) ? + IRDMA_LOC_MEM : IRDMA_HOST_MEM; + } else { + hmc_info->hmc_obj[IRDMA_HMC_IW_MR].mem_loc = IRDMA_LOC_MEM; + } + + hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].mem_loc = IRDMA_HOST_MEM; + + ibdev_dbg(to_ibdev(dev), "HMC: INFO: mrte_mem_loc = %d pble = %d\n", + hmc_info->hmc_obj[IRDMA_HMC_IW_MR].mem_loc, + hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].mem_loc); +} + +/** + * irdma_cfg_sd_mem - allocate sd memory + * @dev: sc device struct + * @hmc_info: ptr to irdma_hmc_obj_info struct + */ +static int irdma_cfg_sd_mem(struct irdma_sc_dev *dev, + struct irdma_hmc_info *hmc_info) +{ + struct irdma_virt_mem virt_mem; + u32 mem_size; + + mem_size = sizeof(struct irdma_hmc_sd_entry) * hmc_info->sd_table.sd_cnt; + virt_mem.size = mem_size; + virt_mem.va = kzalloc(virt_mem.size, GFP_KERNEL); + if (!virt_mem.va) + return -ENOMEM; + hmc_info->sd_table.sd_entry = virt_mem.va; + + return 0; +} + +/** + * irdma_get_objs_pages - get number of 2M pages needed + * @dev: sc device struct + * @hmc_info: pointer to the HMC configuration information struct + * @mem_loc: pages for local or host memory + */ +static u32 irdma_get_objs_pages(struct irdma_sc_dev *dev, + struct irdma_hmc_info *hmc_info, + enum irdma_hmc_obj_mem mem_loc) +{ + u64 size = 0; + int i; + + for (i = IRDMA_HMC_IW_QP; i < IRDMA_HMC_IW_MAX; i++) { + if (hmc_info->hmc_obj[i].mem_loc == mem_loc) { + size += round_up(hmc_info->hmc_obj[i].cnt * + hmc_info->hmc_obj[i].size, 512); + } + } + + return DIV_ROUND_UP(size, IRDMA_HMC_PAGE_SIZE); +} + +/** + * irdma_set_host_hmc_rsrc_gen_3 - calculate host hmc resources for gen 3 + * @dev: sc device struct + */ +static void irdma_set_host_hmc_rsrc_gen_3(struct irdma_sc_dev *dev) +{ + struct irdma_hmc_fpm_misc *hmc_fpm_misc; + struct irdma_hmc_info *hmc_info; + enum irdma_hmc_obj_mem mrte_loc; + u32 mrwanted, pblewanted; + u32 avail_sds, mr_sds; + + hmc_info = dev->hmc_info; + hmc_fpm_misc = &dev->hmc_fpm_misc; + avail_sds = hmc_fpm_misc->max_sds; + mrte_loc = hmc_info->hmc_obj[IRDMA_HMC_IW_MR].mem_loc; + mrwanted = hmc_info->hmc_obj[IRDMA_HMC_IW_MR].cnt; + pblewanted = hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].max_cnt; + + if (mrte_loc == IRDMA_HOST_MEM && avail_sds > IRDMA_MIN_PBLE_PAGES) { + mr_sds = avail_sds - IRDMA_MIN_PBLE_PAGES; + mrwanted = min(mrwanted, mr_sds * MAX_MR_PER_SD); + hmc_info->hmc_obj[IRDMA_HMC_IW_MR].cnt = mrwanted; + avail_sds -= DIV_ROUND_UP(mrwanted, MAX_MR_PER_SD); + } + + if (FIELD_GET(IRDMA_MANAGE_RSRC_VER2, dev->feature_info[IRDMA_FTN_FLAGS]) && + pblewanted > avail_sds * MAX_PBLE_PER_SD) + ibdev_dbg(to_ibdev(dev), + "HMC: Warn: Resource version 2: pble wanted = 0x%x available = 0x%x\n", + pblewanted, avail_sds * MAX_PBLE_PER_SD); + + pblewanted = min(pblewanted, avail_sds * MAX_PBLE_PER_SD); + hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt = pblewanted; +} + +/** + * irdma_verify_commit_fpm_gen_3 - verify 
query fpm values + * @dev: sc device struct + * @max_pages: max local memory available + * @qpwanted: number of qp's wanted + */ +static int irdma_verify_commit_fpm_gen_3(struct irdma_sc_dev *dev, + u32 max_pages, + u32 qpwanted) +{ + struct irdma_hmc_fpm_misc *hmc_fpm_misc; + u32 rrf_cnt, xf_cnt, timer_cnt, pages_needed; + struct irdma_hmc_info *hmc_info; + u32 rrffl_cnt = 0; + u32 xffl_cnt = 0; + u32 q1fl_cnt; + + hmc_info = dev->hmc_info; + hmc_fpm_misc = &dev->hmc_fpm_misc; + + rrf_cnt = roundup_pow_of_two(IRDMA_RRF_MULTIPLIER * qpwanted); + + if (hmc_info->hmc_obj[IRDMA_HMC_IW_RRFFL].max_cnt) + rrffl_cnt = + hmc_info->hmc_obj[IRDMA_HMC_IW_RRF].cnt / + hmc_fpm_misc->rrf_block_size; + + xf_cnt = roundup_pow_of_two(IRDMA_XF_MULTIPLIER * qpwanted); + + if (xf_cnt) + xffl_cnt = xf_cnt / hmc_fpm_misc->xf_block_size; + + timer_cnt = (round_up(qpwanted, 512) / 512 + 1) * + hmc_fpm_misc->timer_bucket; + + q1fl_cnt = hmc_info->hmc_obj[IRDMA_HMC_IW_Q1].cnt / hmc_fpm_misc->q1_block_size; + + pages_needed = irdma_get_objs_pages(dev, hmc_info, IRDMA_LOC_MEM); + if (pages_needed > max_pages) { + ibdev_dbg(to_ibdev(dev), + "HMC: FAIL: SW counts rrf_cnt = %u rrffl_cnt = %u timer_cnt = %u", + rrf_cnt, rrffl_cnt, timer_cnt); + ibdev_dbg(to_ibdev(dev), + "HMC: FAIL: SW counts xf_cnt = %u xffl_cnt = %u q1fl_cnt = %u", + xf_cnt, xffl_cnt, q1fl_cnt); + + return -EINVAL; + } + + hmc_fpm_misc->max_sds -= pages_needed; + hmc_fpm_misc->loc_mem_pages -= pages_needed; + + return 0; +} + +/** + * irdma_set_loc_hmc_rsrc_gen_3 - calculate hmc resources for gen 3 + * @dev: sc device struct + * @max_pages: max local memory available + * @qpwanted: number of qp's wanted + */ +static int irdma_set_loc_hmc_rsrc_gen_3(struct irdma_sc_dev *dev, + u32 max_pages, + u32 qpwanted) +{ + struct irdma_hmc_fpm_misc *hmc_fpm_misc; + u32 rrf_cnt, xf_cnt, timer_cnt, pages_needed; + struct irdma_hmc_info *hmc_info; + u32 ird, ord; + + if (FIELD_GET(IRDMA_MANAGE_RSRC_VER2, dev->feature_info[IRDMA_FTN_FLAGS])) + return irdma_verify_commit_fpm_gen_3(dev, max_pages, qpwanted); + + hmc_info = dev->hmc_info; + hmc_fpm_misc = &dev->hmc_fpm_misc; + ird = dev->hw_attrs.max_hw_ird; + ord = dev->hw_attrs.max_hw_ord; + + hmc_info->hmc_obj[IRDMA_HMC_IW_HDR].cnt = qpwanted; + hmc_info->hmc_obj[IRDMA_HMC_IW_QP].cnt = qpwanted; + + hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].cnt = + min(hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].cnt, qpwanted * 2); + + hmc_info->hmc_obj[IRDMA_HMC_IW_FSIAV].cnt = + min(qpwanted * 8, hmc_info->hmc_obj[IRDMA_HMC_IW_FSIAV].max_cnt); + + rrf_cnt = roundup_pow_of_two(IRDMA_RRF_MULTIPLIER * qpwanted); + hmc_info->hmc_obj[IRDMA_HMC_IW_RRF].cnt = + min(hmc_info->hmc_obj[IRDMA_HMC_IW_RRF].max_cnt, rrf_cnt); + + if (hmc_info->hmc_obj[IRDMA_HMC_IW_RRFFL].max_cnt) + hmc_info->hmc_obj[IRDMA_HMC_IW_RRFFL].cnt = + hmc_info->hmc_obj[IRDMA_HMC_IW_RRF].cnt / + hmc_fpm_misc->rrf_block_size; + + xf_cnt = roundup_pow_of_two(IRDMA_XF_MULTIPLIER * qpwanted); + hmc_info->hmc_obj[IRDMA_HMC_IW_XF].cnt = + min(hmc_info->hmc_obj[IRDMA_HMC_IW_XF].max_cnt, xf_cnt); + hmc_info->hmc_obj[IRDMA_HMC_IW_XFFL].cnt = + xf_cnt / hmc_fpm_misc->xf_block_size; + + timer_cnt = (round_up(qpwanted, 512) / 512 + 1) * + hmc_fpm_misc->timer_bucket; + hmc_info->hmc_obj[IRDMA_HMC_IW_TIMER].cnt = + min(timer_cnt, hmc_info->hmc_obj[IRDMA_HMC_IW_TIMER].cnt); + + do { + hmc_info->hmc_obj[IRDMA_HMC_IW_Q1].cnt = roundup_pow_of_two(ird * 2 * qpwanted); + hmc_info->hmc_obj[IRDMA_HMC_IW_Q1FL].cnt = + hmc_info->hmc_obj[IRDMA_HMC_IW_Q1].cnt / hmc_fpm_misc->q1_block_size; + + pages_needed = 
irdma_get_objs_pages(dev, hmc_info, IRDMA_LOC_MEM); + if (pages_needed <= max_pages) + break; + + ird /= 2; + ord /= 2; + } while (ird >= IRDMA_MIN_IRD); + + if (ird < IRDMA_MIN_IRD) { + ibdev_dbg(to_ibdev(dev), "HMC: FAIL: IRD=%u Q1 CNT = %u\n", + ird, hmc_info->hmc_obj[IRDMA_HMC_IW_Q1].cnt); + return -EINVAL; + } + + dev->hw_attrs.max_hw_ird = ird; + dev->hw_attrs.max_hw_ord = ord; + hmc_fpm_misc->max_sds -= pages_needed; + + return 0; +} + +/** + * cfg_fpm_value_gen_3 - configure fpm for gen 3 + * @dev: sc device struct + * @hmc_info: ptr to irdma_hmc_obj_info struct + * @hmc_fpm_misc: ptr to fpm data + */ +static int cfg_fpm_value_gen_3(struct irdma_sc_dev *dev, + struct irdma_hmc_info *hmc_info, + struct irdma_hmc_fpm_misc *hmc_fpm_misc) +{ + enum irdma_hmc_obj_mem mrte_loc; + u32 mrwanted, qpwanted; + int i, ret_code = 0; + u32 loc_mem_pages; + bool is_mrte_loc_mem; + + loc_mem_pages = hmc_fpm_misc->loc_mem_pages; + is_mrte_loc_mem = hmc_fpm_misc->loc_mem_pages == hmc_fpm_misc->max_sds ? + true : false; + + irdma_get_rsrc_mem_config(dev, is_mrte_loc_mem); + mrte_loc = hmc_info->hmc_obj[IRDMA_HMC_IW_MR].mem_loc; + + if (is_mrte_loc_mem) + loc_mem_pages -= IRDMA_MIN_PBLE_PAGES; + + ibdev_dbg(to_ibdev(dev), + "HMC: mrte_loc %d loc_mem %u fpm max sds %u host_obj %d\n", + hmc_info->hmc_obj[IRDMA_HMC_IW_MR].mem_loc, + hmc_fpm_misc->loc_mem_pages, hmc_fpm_misc->max_sds, + is_mrte_loc_mem); + + mrwanted = hmc_info->hmc_obj[IRDMA_HMC_IW_MR].max_cnt; + qpwanted = hmc_info->hmc_obj[IRDMA_HMC_IW_QP].max_cnt; + hmc_info->hmc_obj[IRDMA_HMC_IW_HDR].cnt = qpwanted; + + hmc_info->hmc_obj[IRDMA_HMC_IW_OOISC].max_cnt = 0; + hmc_info->hmc_obj[IRDMA_HMC_IW_OOISCFFL].max_cnt = 0; + hmc_info->hmc_obj[IRDMA_HMC_IW_HTE].max_cnt = 0; + hmc_info->hmc_obj[IRDMA_HMC_IW_FSIMC].max_cnt = 0; + + if (!FIELD_GET(IRDMA_MANAGE_RSRC_VER2, dev->feature_info[IRDMA_FTN_FLAGS])) + hmc_info->hmc_obj[IRDMA_HMC_IW_FSIAV].max_cnt = + min(hmc_info->hmc_obj[IRDMA_HMC_IW_FSIAV].max_cnt, + (u32)IRDMA_FSIAV_CNT_MAX); + + for (i = IRDMA_HMC_IW_QP; i < IRDMA_HMC_IW_MAX; i++) + hmc_info->hmc_obj[i].cnt = hmc_info->hmc_obj[i].max_cnt; + + while (qpwanted >= IRDMA_MIN_QP_CNT) { + if (!irdma_set_loc_hmc_rsrc_gen_3(dev, loc_mem_pages, qpwanted)) + break; + + if (FIELD_GET(IRDMA_MANAGE_RSRC_VER2, dev->feature_info[IRDMA_FTN_FLAGS])) + return -EINVAL; + + qpwanted /= 2; + if (mrte_loc == IRDMA_LOC_MEM) { + mrwanted = qpwanted * IRDMA_MIN_MR_PER_QP; + hmc_info->hmc_obj[IRDMA_HMC_IW_MR].cnt = + min(hmc_info->hmc_obj[IRDMA_HMC_IW_MR].max_cnt, mrwanted); + } + } + + if (qpwanted < IRDMA_MIN_QP_CNT) { + ibdev_dbg(to_ibdev(dev), + "HMC: ERROR: could not allocate fpm resources\n"); + return -EINVAL; + } + + irdma_set_host_hmc_rsrc_gen_3(dev); + ret_code = irdma_sc_cfg_iw_fpm(dev, dev->hmc_fn_id); + if (ret_code) { + ibdev_dbg(to_ibdev(dev), + "HMC: cfg_iw_fpm returned error_code[x%08X]\n", + readl(dev->hw_regs[IRDMA_CQPERRCODES])); + + return ret_code; + } + + return irdma_cfg_sd_mem(dev, hmc_info); +} + +/** * irdma_cfg_fpm_val - configure HMC objects * @dev: sc device struct * @qp_count: desired qp count */ int irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) { - struct irdma_virt_mem virt_mem; - u32 i, mem_size; u32 qpwanted, mrwanted, pblewanted; - u32 powerof2, hte; + u32 powerof2, hte, i; u32 sd_needed; u32 sd_diff; u32 loop_count = 0; struct irdma_hmc_info *hmc_info; struct irdma_hmc_fpm_misc *hmc_fpm_misc; int ret_code = 0; + u32 max_sds; hmc_info = dev->hmc_info; hmc_fpm_misc = &dev->hmc_fpm_misc; @@ -4814,14 +5977,16 @@ int 
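cfg_fpm_value_gen_3() above starts from the maximum QP count and keeps halving qpwanted (scaling the MR budget with it when MRs live in local memory) until the per-object page estimate fits the available local-memory pages, giving up below the minimum QP count. A sketch of that fitting loop with a made-up cost model and a stand-in for IRDMA_MIN_QP_CNT, whose value is not shown here:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MIN_QP_CNT 128   /* stand-in for IRDMA_MIN_QP_CNT */

    /* Hypothetical cost model: true when 'qps' QPs (plus derived objects)
     * fit in 'max_pages' 2 MB pages. The driver recomputes the full HMC
     * object set on each attempt instead.
     */
    static bool fits(uint32_t qps, uint32_t max_pages)
    {
        return (uint64_t)qps * 3 / 1024 + 8 <= max_pages;
    }

    static int pick_qp_count(uint32_t wanted, uint32_t max_pages, uint32_t *out)
    {
        while (wanted >= MIN_QP_CNT) {
            if (fits(wanted, max_pages)) {
                *out = wanted;
                return 0;
            }
            wanted /= 2;     /* shrink the request and try again */
        }
        return -1;           /* could not satisfy even the minimum */
    }

    int main(void)
    {
        uint32_t qps;

        if (!pick_qp_count(131072, 64, &qps))
            printf("settled on %u QPs\n", qps);
        return 0;
    }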
irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) return ret_code; } + max_sds = hmc_fpm_misc->max_sds; + + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) + return cfg_fpm_value_gen_3(dev, hmc_info, hmc_fpm_misc); + for (i = IRDMA_HMC_IW_QP; i < IRDMA_HMC_IW_MAX; i++) hmc_info->hmc_obj[i].cnt = hmc_info->hmc_obj[i].max_cnt; sd_needed = irdma_est_sd(dev, hmc_info); - ibdev_dbg(to_ibdev(dev), - "HMC: FW max resources sd_needed[%08d] first_sd_index[%04d]\n", - sd_needed, hmc_info->first_sd_index); - ibdev_dbg(to_ibdev(dev), "HMC: sd count %d where max sd is %d\n", - hmc_info->sd_table.sd_cnt, hmc_fpm_misc->max_sds); + ibdev_dbg(to_ibdev(dev), "HMC: sd count %u where max sd is %u\n", + hmc_info->sd_table.sd_cnt, max_sds); qpwanted = min(qp_count, hmc_info->hmc_obj[IRDMA_HMC_IW_QP].max_cnt); @@ -4835,21 +6000,21 @@ int irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) pblewanted = hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].max_cnt; ibdev_dbg(to_ibdev(dev), - "HMC: req_qp=%d max_sd=%d, max_qp = %d, max_cq=%d, max_mr=%d, max_pble=%d, mc=%d, av=%d\n", - qp_count, hmc_fpm_misc->max_sds, + "HMC: req_qp=%d max_sd=%u, max_qp = %u, max_cq=%u, max_mr=%u, max_pble=%u, mc=%d, av=%u\n", + qp_count, max_sds, hmc_info->hmc_obj[IRDMA_HMC_IW_QP].max_cnt, hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].max_cnt, hmc_info->hmc_obj[IRDMA_HMC_IW_MR].max_cnt, hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].max_cnt, hmc_info->hmc_obj[IRDMA_HMC_IW_FSIMC].max_cnt, hmc_info->hmc_obj[IRDMA_HMC_IW_FSIAV].max_cnt); + hmc_info->hmc_obj[IRDMA_HMC_IW_FSIMC].cnt = hmc_info->hmc_obj[IRDMA_HMC_IW_FSIMC].max_cnt; hmc_info->hmc_obj[IRDMA_HMC_IW_FSIAV].cnt = hmc_info->hmc_obj[IRDMA_HMC_IW_FSIAV].max_cnt; hmc_info->hmc_obj[IRDMA_HMC_IW_ARP].cnt = hmc_info->hmc_obj[IRDMA_HMC_IW_ARP].max_cnt; - hmc_info->hmc_obj[IRDMA_HMC_IW_APBVT_ENTRY].cnt = 1; while (irdma_q1_cnt(dev, hmc_info, qpwanted) > hmc_info->hmc_obj[IRDMA_HMC_IW_Q1].max_cnt) @@ -4860,7 +6025,7 @@ int irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) hmc_info->hmc_obj[IRDMA_HMC_IW_QP].cnt = qpwanted; hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].cnt = min(2 * qpwanted, hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].cnt); - hmc_info->hmc_obj[IRDMA_HMC_IW_RESERVED].cnt = 0; /* Reserved */ + hmc_info->hmc_obj[IRDMA_HMC_IW_SRQ].cnt = 0; /* Reserved */ hmc_info->hmc_obj[IRDMA_HMC_IW_MR].cnt = mrwanted; hte = round_up(qpwanted + hmc_info->hmc_obj[IRDMA_HMC_IW_FSIMC].cnt, 512); @@ -4898,11 +6063,12 @@ int irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) if (!(loop_count % 2) && qpwanted > 128) { qpwanted /= 2; } else { - mrwanted /= 2; pblewanted /= 2; + mrwanted /= 2; } continue; } + if (dev->cqp->hmc_profile != IRDMA_HMC_PROFILE_FAVOR_VF && pblewanted > (512 * FPM_MULTIPLIER * sd_diff)) { pblewanted -= 256 * FPM_MULTIPLIER * sd_diff; @@ -4928,14 +6094,13 @@ int irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) if (sd_needed > hmc_fpm_misc->max_sds) { ibdev_dbg(to_ibdev(dev), - "HMC: cfg_fpm failed loop_cnt=%d, sd_needed=%d, max sd count %d\n", + "HMC: cfg_fpm failed loop_cnt=%u, sd_needed=%u, max sd count %u\n", loop_count, sd_needed, hmc_info->sd_table.sd_cnt); return -EINVAL; } - if (loop_count > 1 && sd_needed < hmc_fpm_misc->max_sds) { - pblewanted += (hmc_fpm_misc->max_sds - sd_needed) * 256 * - FPM_MULTIPLIER; + if (loop_count > 1 && sd_needed < max_sds) { + pblewanted += (max_sds - sd_needed) * 256 * FPM_MULTIPLIER; hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt = pblewanted; sd_needed = irdma_est_sd(dev, hmc_info); } @@ -4959,18 +6124,7 @@ int irdma_cfg_fpm_val(struct irdma_sc_dev *dev, 
u32 qp_count) return ret_code; } - mem_size = sizeof(struct irdma_hmc_sd_entry) * - (hmc_info->sd_table.sd_cnt + hmc_info->first_sd_index + 1); - virt_mem.size = mem_size; - virt_mem.va = kzalloc(virt_mem.size, GFP_KERNEL); - if (!virt_mem.va) { - ibdev_dbg(to_ibdev(dev), - "HMC: failed to allocate memory for sd_entry buffer\n"); - return -ENOMEM; - } - hmc_info->sd_table.sd_entry = virt_mem.va; - - return ret_code; + return irdma_cfg_sd_mem(dev, hmc_info); } /** @@ -5242,6 +6396,22 @@ static int irdma_exec_cqp_cmd(struct irdma_sc_dev *dev, &pcmdinfo->in.u.mc_modify.info, pcmdinfo->in.u.mc_modify.scratch); break; + case IRDMA_OP_SRQ_CREATE: + status = irdma_sc_srq_create(pcmdinfo->in.u.srq_create.srq, + pcmdinfo->in.u.srq_create.scratch, + pcmdinfo->post_sq); + break; + case IRDMA_OP_SRQ_MODIFY: + status = irdma_sc_srq_modify(pcmdinfo->in.u.srq_modify.srq, + &pcmdinfo->in.u.srq_modify.info, + pcmdinfo->in.u.srq_modify.scratch, + pcmdinfo->post_sq); + break; + case IRDMA_OP_SRQ_DESTROY: + status = irdma_sc_srq_destroy(pcmdinfo->in.u.srq_destroy.srq, + pcmdinfo->in.u.srq_destroy.scratch, + pcmdinfo->post_sq); + break; default: status = -EOPNOTSUPP; break; @@ -5314,14 +6484,26 @@ void irdma_cfg_aeq(struct irdma_sc_dev *dev, u32 idx, bool enable) */ void sc_vsi_update_stats(struct irdma_sc_vsi *vsi) { - struct irdma_gather_stats *gather_stats; - struct irdma_gather_stats *last_gather_stats; + struct irdma_dev_hw_stats *hw_stats = &vsi->pestat->hw_stats; + struct irdma_gather_stats *gather_stats = + vsi->pestat->gather_info.gather_stats_va; + struct irdma_gather_stats *last_gather_stats = + vsi->pestat->gather_info.last_gather_stats_va; + const struct irdma_hw_stat_map *map = vsi->dev->hw_stats_map; + u16 max_stat_idx = vsi->dev->hw_attrs.max_stat_idx; + u16 i; + + if (vsi->dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + for (i = 0; i < max_stat_idx; i++) { + u16 idx = map[i].byteoff / sizeof(u64); + + hw_stats->stats_val[i] = gather_stats->val[idx]; + } + return; + } - gather_stats = vsi->pestat->gather_info.gather_stats_va; - last_gather_stats = vsi->pestat->gather_info.last_gather_stats_va; - irdma_update_stats(&vsi->pestat->hw_stats, gather_stats, - last_gather_stats, vsi->dev->hw_stats_map, - vsi->dev->hw_attrs.max_stat_idx); + irdma_update_stats(hw_stats, gather_stats, last_gather_stats, + map, max_stat_idx); } /** @@ -5356,6 +6538,9 @@ static inline void irdma_sc_init_hw(struct irdma_sc_dev *dev) case IRDMA_GEN_2: icrdma_init_hw(dev); break; + case IRDMA_GEN_3: + ig3rdma_init_hw(dev); + break; } } @@ -5381,10 +6566,15 @@ int irdma_sc_dev_init(enum irdma_vers ver, struct irdma_sc_dev *dev, dev->fpm_commit_buf = info->fpm_commit_buf; dev->hw = info->hw; dev->hw->hw_addr = info->bar0; + dev->protocol_used = info->protocol_used; /* Setup the hardware limits, hmc may limit further */ dev->hw_attrs.min_hw_qp_id = IRDMA_MIN_IW_QP_ID; + dev->hw_attrs.min_hw_srq_id = IRDMA_MIN_IW_SRQ_ID; dev->hw_attrs.min_hw_aeq_size = IRDMA_MIN_AEQ_ENTRIES; - dev->hw_attrs.max_hw_aeq_size = IRDMA_MAX_AEQ_ENTRIES; + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) + dev->hw_attrs.max_hw_aeq_size = IRDMA_MAX_AEQ_ENTRIES_GEN_3; + else + dev->hw_attrs.max_hw_aeq_size = IRDMA_MAX_AEQ_ENTRIES; dev->hw_attrs.min_hw_ceq_size = IRDMA_MIN_CEQ_ENTRIES; dev->hw_attrs.max_hw_ceq_size = IRDMA_MAX_CEQ_ENTRIES; dev->hw_attrs.uk_attrs.min_hw_cq_size = IRDMA_MIN_CQ_SIZE; @@ -5409,21 +6599,39 @@ int irdma_sc_dev_init(enum irdma_vers ver, struct irdma_sc_dev *dev, dev->hw_attrs.max_sleep_count = IRDMA_SLEEP_COUNT; 
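On GEN3, sc_vsi_update_stats() above copies each statistic straight out of the gather buffer at map[i].byteoff / sizeof(u64), instead of going through the legacy delta-based irdma_update_stats() path. A small sketch of that offset-map copy; the map contents below are invented:

    #include <stdint.h>
    #include <stdio.h>

    struct stat_map { uint16_t byteoff; };

    /* GEN3-style copy: software stat i reads the 64-bit word at
     * map[i].byteoff within the hardware gather buffer.
     */
    static void copy_stats(uint64_t *out, const uint64_t *gather,
                           const struct stat_map *map, uint16_t n)
    {
        for (uint16_t i = 0; i < n; i++)
            out[i] = gather[map[i].byteoff / sizeof(uint64_t)];
    }

    int main(void)
    {
        uint64_t gather[4] = { 10, 20, 30, 40 };
        const struct stat_map map[2] = { { .byteoff = 24 }, { .byteoff = 8 } };
        uint64_t stats[2];

        copy_stats(stats, gather, map, 2);
        printf("%llu %llu\n", (unsigned long long)stats[0],
               (unsigned long long)stats[1]);   /* prints 40 20 */
        return 0;
    }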
dev->hw_attrs.max_cqp_compl_wait_time_ms = CQP_COMPL_WAIT_TIME_MS; - dev->hw_attrs.uk_attrs.hw_rev = ver; + if (!dev->privileged) { + ret_code = irdma_vchnl_req_get_hmc_fcn(dev); + if (ret_code) { + ibdev_dbg(to_ibdev(dev), + "DEV: Get HMC function ret = %d\n", + ret_code); + + return ret_code; + } + } + irdma_sc_init_hw(dev); - if (irdma_wait_pe_ready(dev)) - return -ETIMEDOUT; + if (dev->privileged) { + if (irdma_wait_pe_ready(dev)) + return -ETIMEDOUT; - val = readl(dev->hw_regs[IRDMA_GLPCI_LBARCTRL]); - db_size = (u8)FIELD_GET(IRDMA_GLPCI_LBARCTRL_PE_DB_SIZE, val); - if (db_size != IRDMA_PE_DB_SIZE_4M && db_size != IRDMA_PE_DB_SIZE_8M) { - ibdev_dbg(to_ibdev(dev), - "DEV: RDMA PE doorbell is not enabled in CSR val 0x%x db_size=%d\n", - val, db_size); - return -ENODEV; + val = readl(dev->hw_regs[IRDMA_GLPCI_LBARCTRL]); + db_size = (u8)FIELD_GET(IRDMA_GLPCI_LBARCTRL_PE_DB_SIZE, val); + if (db_size != IRDMA_PE_DB_SIZE_4M && + db_size != IRDMA_PE_DB_SIZE_8M) { + ibdev_dbg(to_ibdev(dev), + "DEV: RDMA PE doorbell is not enabled in CSR val 0x%x db_size=%d\n", + val, db_size); + return -ENODEV; + } + } else { + ret_code = irdma_vchnl_req_get_reg_layout(dev); + if (ret_code) + ibdev_dbg(to_ibdev(dev), + "DEV: Get Register layout failed ret = %d\n", + ret_code); } - dev->db_addr = dev->hw->hw_addr + (uintptr_t)dev->hw_regs[IRDMA_DB_ADDR_OFFSET]; return ret_code; } diff --git a/drivers/infiniband/hw/irdma/defs.h b/drivers/infiniband/hw/irdma/defs.h index 2cb4b96db721..983b22d7ae23 100644 --- a/drivers/infiniband/hw/irdma/defs.h +++ b/drivers/infiniband/hw/irdma/defs.h @@ -14,6 +14,18 @@ #define IRDMA_PE_DB_SIZE_4M 1 #define IRDMA_PE_DB_SIZE_8M 2 +#define IRDMA_IRD_HW_SIZE_4_GEN3 0 +#define IRDMA_IRD_HW_SIZE_8_GEN3 1 +#define IRDMA_IRD_HW_SIZE_16_GEN3 2 +#define IRDMA_IRD_HW_SIZE_32_GEN3 3 +#define IRDMA_IRD_HW_SIZE_64_GEN3 4 +#define IRDMA_IRD_HW_SIZE_128_GEN3 5 +#define IRDMA_IRD_HW_SIZE_256_GEN3 6 +#define IRDMA_IRD_HW_SIZE_512_GEN3 7 +#define IRDMA_IRD_HW_SIZE_1024_GEN3 8 +#define IRDMA_IRD_HW_SIZE_2048_GEN3 9 +#define IRDMA_IRD_HW_SIZE_4096_GEN3 10 + #define IRDMA_IRD_HW_SIZE_4 0 #define IRDMA_IRD_HW_SIZE_16 1 #define IRDMA_IRD_HW_SIZE_64 2 @@ -114,6 +126,13 @@ enum irdma_protocol_used { #define IRDMA_UPDATE_SD_BUFF_SIZE 128 #define IRDMA_FEATURE_BUF_SIZE (8 * IRDMA_MAX_FEATURES) +#define ENABLE_LOC_MEM 63 +#define IRDMA_ATOMICS_ALLOWED_BIT 1 +#define MAX_PBLE_PER_SD 0x40000 +#define MAX_PBLE_SD_PER_FCN 0x400 +#define MAX_MR_PER_SD 0x8000 +#define MAX_MR_SD_PER_FCN 0x80 +#define IRDMA_PBLE_COMMIT_OFFSET 112 #define IRDMA_MAX_QUANTA_PER_WR 8 #define IRDMA_QP_SW_MAX_WQ_QUANTA 32768 @@ -121,6 +140,10 @@ enum irdma_protocol_used { #define IRDMA_QP_SW_MAX_RQ_QUANTA 32768 #define IRDMA_MAX_QP_WRS(max_quanta_per_wr) \ ((IRDMA_QP_SW_MAX_WQ_QUANTA - IRDMA_SQ_RSVD) / (max_quanta_per_wr)) +#define IRDMA_SRQ_MIN_QUANTA 8 +#define IRDMA_SRQ_MAX_QUANTA 262144 +#define IRDMA_MAX_SRQ_WRS \ + ((IRDMA_SRQ_MAX_QUANTA - IRDMA_RQ_RSVD) / IRDMA_MAX_QUANTA_PER_WR) #define IRDMAQP_TERM_SEND_TERM_AND_FIN 0 #define IRDMAQP_TERM_SEND_TERM_ONLY 1 @@ -147,8 +170,13 @@ enum irdma_protocol_used { #define IRDMA_SQ_RSVD 258 #define IRDMA_RQ_RSVD 1 -#define IRDMA_FEATURE_RTS_AE 1ULL -#define IRDMA_FEATURE_CQ_RESIZE 2ULL +#define IRDMA_FEATURE_RTS_AE BIT_ULL(0) +#define IRDMA_FEATURE_CQ_RESIZE BIT_ULL(1) +#define IRDMA_FEATURE_64_BYTE_CQE BIT_ULL(5) +#define IRDMA_FEATURE_ATOMIC_OPS BIT_ULL(6) +#define IRDMA_FEATURE_SRQ BIT_ULL(7) +#define IRDMA_FEATURE_CQE_TIMESTAMPING BIT_ULL(8) + #define IRDMAQP_OP_RDMA_WRITE 0x00 #define 
IRDMAQP_OP_RDMA_READ 0x01 #define IRDMAQP_OP_RDMA_SEND 0x03 @@ -161,6 +189,8 @@ enum irdma_protocol_used { #define IRDMAQP_OP_RDMA_READ_LOC_INV 0x0b #define IRDMAQP_OP_NOP 0x0c #define IRDMAQP_OP_RDMA_WRITE_SOL 0x0d +#define IRDMAQP_OP_ATOMIC_FETCH_ADD 0x0f +#define IRDMAQP_OP_ATOMIC_COMPARE_SWAP_ADD 0x11 #define IRDMAQP_OP_GEN_RTS_AE 0x30 enum irdma_cqp_op_type { @@ -212,9 +242,12 @@ enum irdma_cqp_op_type { IRDMA_OP_ADD_LOCAL_MAC_ENTRY = 46, IRDMA_OP_DELETE_LOCAL_MAC_ENTRY = 47, IRDMA_OP_CQ_MODIFY = 48, + IRDMA_OP_SRQ_CREATE = 49, + IRDMA_OP_SRQ_MODIFY = 50, + IRDMA_OP_SRQ_DESTROY = 51, /* Must be last entry*/ - IRDMA_MAX_CQP_OPS = 49, + IRDMA_MAX_CQP_OPS = 52, }; /* CQP SQ WQES */ @@ -224,6 +257,9 @@ enum irdma_cqp_op_type { #define IRDMA_CQP_OP_CREATE_CQ 0x03 #define IRDMA_CQP_OP_MODIFY_CQ 0x04 #define IRDMA_CQP_OP_DESTROY_CQ 0x05 +#define IRDMA_CQP_OP_CREATE_SRQ 0x06 +#define IRDMA_CQP_OP_MODIFY_SRQ 0x07 +#define IRDMA_CQP_OP_DESTROY_SRQ 0x08 #define IRDMA_CQP_OP_ALLOC_STAG 0x09 #define IRDMA_CQP_OP_REG_MR 0x0a #define IRDMA_CQP_OP_QUERY_STAG 0x0b @@ -265,97 +301,6 @@ enum irdma_cqp_op_type { #define IRDMA_CQP_OP_GATHER_STATS 0x2e #define IRDMA_CQP_OP_UP_MAP 0x2f -/* Async Events codes */ -#define IRDMA_AE_AMP_UNALLOCATED_STAG 0x0102 -#define IRDMA_AE_AMP_INVALID_STAG 0x0103 -#define IRDMA_AE_AMP_BAD_QP 0x0104 -#define IRDMA_AE_AMP_BAD_PD 0x0105 -#define IRDMA_AE_AMP_BAD_STAG_KEY 0x0106 -#define IRDMA_AE_AMP_BAD_STAG_INDEX 0x0107 -#define IRDMA_AE_AMP_BOUNDS_VIOLATION 0x0108 -#define IRDMA_AE_AMP_RIGHTS_VIOLATION 0x0109 -#define IRDMA_AE_AMP_TO_WRAP 0x010a -#define IRDMA_AE_AMP_FASTREG_VALID_STAG 0x010c -#define IRDMA_AE_AMP_FASTREG_MW_STAG 0x010d -#define IRDMA_AE_AMP_FASTREG_INVALID_RIGHTS 0x010e -#define IRDMA_AE_AMP_FASTREG_INVALID_LENGTH 0x0110 -#define IRDMA_AE_AMP_INVALIDATE_SHARED 0x0111 -#define IRDMA_AE_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS 0x0112 -#define IRDMA_AE_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS 0x0113 -#define IRDMA_AE_AMP_MWBIND_VALID_STAG 0x0114 -#define IRDMA_AE_AMP_MWBIND_OF_MR_STAG 0x0115 -#define IRDMA_AE_AMP_MWBIND_TO_ZERO_BASED_STAG 0x0116 -#define IRDMA_AE_AMP_MWBIND_TO_MW_STAG 0x0117 -#define IRDMA_AE_AMP_MWBIND_INVALID_RIGHTS 0x0118 -#define IRDMA_AE_AMP_MWBIND_INVALID_BOUNDS 0x0119 -#define IRDMA_AE_AMP_MWBIND_TO_INVALID_PARENT 0x011a -#define IRDMA_AE_AMP_MWBIND_BIND_DISABLED 0x011b -#define IRDMA_AE_PRIV_OPERATION_DENIED 0x011c -#define IRDMA_AE_AMP_INVALIDATE_TYPE1_MW 0x011d -#define IRDMA_AE_AMP_MWBIND_ZERO_BASED_TYPE1_MW 0x011e -#define IRDMA_AE_AMP_FASTREG_INVALID_PBL_HPS_CFG 0x011f -#define IRDMA_AE_AMP_MWBIND_WRONG_TYPE 0x0120 -#define IRDMA_AE_AMP_FASTREG_PBLE_MISMATCH 0x0121 -#define IRDMA_AE_UDA_XMIT_DGRAM_TOO_LONG 0x0132 -#define IRDMA_AE_UDA_XMIT_BAD_PD 0x0133 -#define IRDMA_AE_UDA_XMIT_DGRAM_TOO_SHORT 0x0134 -#define IRDMA_AE_UDA_L4LEN_INVALID 0x0135 -#define IRDMA_AE_BAD_CLOSE 0x0201 -#define IRDMA_AE_RDMAP_ROE_BAD_LLP_CLOSE 0x0202 -#define IRDMA_AE_CQ_OPERATION_ERROR 0x0203 -#define IRDMA_AE_RDMA_READ_WHILE_ORD_ZERO 0x0205 -#define IRDMA_AE_STAG_ZERO_INVALID 0x0206 -#define IRDMA_AE_IB_RREQ_AND_Q1_FULL 0x0207 -#define IRDMA_AE_IB_INVALID_REQUEST 0x0208 -#define IRDMA_AE_WQE_UNEXPECTED_OPCODE 0x020a -#define IRDMA_AE_WQE_INVALID_PARAMETER 0x020b -#define IRDMA_AE_WQE_INVALID_FRAG_DATA 0x020c -#define IRDMA_AE_IB_REMOTE_ACCESS_ERROR 0x020d -#define IRDMA_AE_IB_REMOTE_OP_ERROR 0x020e -#define IRDMA_AE_WQE_LSMM_TOO_LONG 0x0220 -#define IRDMA_AE_INVALID_REQUEST 0x0223 -#define IRDMA_AE_DDP_INVALID_MSN_GAP_IN_MSN 0x0301 -#define 
IRDMA_AE_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER 0x0303 -#define IRDMA_AE_DDP_UBE_INVALID_DDP_VERSION 0x0304 -#define IRDMA_AE_DDP_UBE_INVALID_MO 0x0305 -#define IRDMA_AE_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE 0x0306 -#define IRDMA_AE_DDP_UBE_INVALID_QN 0x0307 -#define IRDMA_AE_DDP_NO_L_BIT 0x0308 -#define IRDMA_AE_RDMAP_ROE_INVALID_RDMAP_VERSION 0x0311 -#define IRDMA_AE_RDMAP_ROE_UNEXPECTED_OPCODE 0x0312 -#define IRDMA_AE_ROE_INVALID_RDMA_READ_REQUEST 0x0313 -#define IRDMA_AE_ROE_INVALID_RDMA_WRITE_OR_READ_RESP 0x0314 -#define IRDMA_AE_ROCE_RSP_LENGTH_ERROR 0x0316 -#define IRDMA_AE_ROCE_EMPTY_MCG 0x0380 -#define IRDMA_AE_ROCE_BAD_MC_IP_ADDR 0x0381 -#define IRDMA_AE_ROCE_BAD_MC_QPID 0x0382 -#define IRDMA_AE_MCG_QP_PROTOCOL_MISMATCH 0x0383 -#define IRDMA_AE_INVALID_ARP_ENTRY 0x0401 -#define IRDMA_AE_INVALID_TCP_OPTION_RCVD 0x0402 -#define IRDMA_AE_STALE_ARP_ENTRY 0x0403 -#define IRDMA_AE_INVALID_AH_ENTRY 0x0406 -#define IRDMA_AE_LLP_CLOSE_COMPLETE 0x0501 -#define IRDMA_AE_LLP_CONNECTION_RESET 0x0502 -#define IRDMA_AE_LLP_FIN_RECEIVED 0x0503 -#define IRDMA_AE_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH 0x0504 -#define IRDMA_AE_LLP_RECEIVED_MPA_CRC_ERROR 0x0505 -#define IRDMA_AE_LLP_SEGMENT_TOO_SMALL 0x0507 -#define IRDMA_AE_LLP_SYN_RECEIVED 0x0508 -#define IRDMA_AE_LLP_TERMINATE_RECEIVED 0x0509 -#define IRDMA_AE_LLP_TOO_MANY_RETRIES 0x050a -#define IRDMA_AE_LLP_TOO_MANY_KEEPALIVE_RETRIES 0x050b -#define IRDMA_AE_LLP_DOUBT_REACHABILITY 0x050c -#define IRDMA_AE_LLP_CONNECTION_ESTABLISHED 0x050e -#define IRDMA_AE_LLP_TOO_MANY_RNRS 0x050f -#define IRDMA_AE_RESOURCE_EXHAUSTION 0x0520 -#define IRDMA_AE_RESET_SENT 0x0601 -#define IRDMA_AE_TERMINATE_SENT 0x0602 -#define IRDMA_AE_RESET_NOT_SENT 0x0603 -#define IRDMA_AE_LCE_QP_CATASTROPHIC 0x0700 -#define IRDMA_AE_LCE_FUNCTION_CATASTROPHIC 0x0701 -#define IRDMA_AE_LCE_CQ_CATASTROPHIC 0x0702 -#define IRDMA_AE_QP_SUSPEND_COMPLETE 0x0900 - #define FLD_LS_64(dev, val, field) \ (((u64)(val) << (dev)->hw_shifts[field ## _S]) & (dev)->hw_masks[field ## _M]) #define FLD_RS_64(dev, val, field) \ @@ -393,9 +338,13 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_STATS_USE_INST BIT_ULL(61) #define IRDMA_CQPSQ_STATS_OP GENMASK_ULL(37, 32) #define IRDMA_CQPSQ_STATS_INST_INDEX GENMASK_ULL(6, 0) -#define IRDMA_CQPSQ_STATS_HMC_FCN_INDEX GENMASK_ULL(5, 0) +#define IRDMA_CQPSQ_STATS_HMC_FCN_INDEX GENMASK_ULL(15, 0) #define IRDMA_CQPSQ_WS_WQEVALID BIT_ULL(63) -#define IRDMA_CQPSQ_WS_NODEOP GENMASK_ULL(53, 52) +#define IRDMA_CQPSQ_WS_NODEOP GENMASK_ULL(55, 52) +#define IRDMA_SD_MAX GENMASK_ULL(15, 0) +#define IRDMA_MEM_MAX GENMASK_ULL(15, 0) +#define IRDMA_QP_MEM_LOC GENMASK_ULL(47, 44) +#define IRDMA_MR_MEM_LOC GENMASK_ULL(27, 24) #define IRDMA_CQPSQ_WS_ENABLENODE BIT_ULL(62) #define IRDMA_CQPSQ_WS_NODETYPE BIT_ULL(61) @@ -404,16 +353,16 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_WS_VMVFTYPE GENMASK_ULL(55, 54) #define IRDMA_CQPSQ_WS_VMVFNUM GENMASK_ULL(51, 42) #define IRDMA_CQPSQ_WS_OP GENMASK_ULL(37, 32) -#define IRDMA_CQPSQ_WS_PARENTID GENMASK_ULL(25, 16) -#define IRDMA_CQPSQ_WS_NODEID GENMASK_ULL(9, 0) -#define IRDMA_CQPSQ_WS_VSI GENMASK_ULL(57, 48) +#define IRDMA_CQPSQ_WS_PARENTID GENMASK_ULL(29, 16) +#define IRDMA_CQPSQ_WS_NODEID GENMASK_ULL(13, 0) +#define IRDMA_CQPSQ_WS_VSI GENMASK_ULL(63, 48) #define IRDMA_CQPSQ_WS_WEIGHT GENMASK_ULL(38, 32) #define IRDMA_CQPSQ_UP_WQEVALID BIT_ULL(63) #define IRDMA_CQPSQ_UP_USEVLAN BIT_ULL(62) #define IRDMA_CQPSQ_UP_USEOVERRIDE BIT_ULL(61) #define IRDMA_CQPSQ_UP_OP GENMASK_ULL(37, 32) -#define 
IRDMA_CQPSQ_UP_HMCFCNIDX GENMASK_ULL(5, 0) +#define IRDMA_CQPSQ_UP_HMCFCNIDX GENMASK_ULL(15, 0) #define IRDMA_CQPSQ_UP_CNPOVERRIDE GENMASK_ULL(37, 32) #define IRDMA_CQPSQ_QUERY_RDMA_FEATURES_WQEVALID BIT_ULL(63) #define IRDMA_CQPSQ_QUERY_RDMA_FEATURES_BUF_LEN GENMASK_ULL(31, 0) @@ -448,6 +397,16 @@ enum irdma_cqp_op_type { #define IRDMA_CQPHC_SVER GENMASK_ULL(31, 24) #define IRDMA_CQPHC_SQBASE GENMASK_ULL(63, 9) +#define IRDMA_CQPHC_TIMESTAMP_OVERRIDE BIT_ULL(5) +#define IRDMA_CQPHC_TS_SHIFT GENMASK_ULL(12, 8) +#define IRDMA_CQPHC_EN_FINE_GRAINED_TIMERS BIT_ULL(0) + +#define IRDMA_CQPHC_OOISC_BLKSIZE GENMASK_ULL(63, 60) +#define IRDMA_CQPHC_RRSP_BLKSIZE GENMASK_ULL(59, 56) +#define IRDMA_CQPHC_Q1_BLKSIZE GENMASK_ULL(55, 52) +#define IRDMA_CQPHC_XMIT_BLKSIZE GENMASK_ULL(51, 48) +#define IRDMA_CQPHC_BLKSIZES_VALID BIT_ULL(4) + #define IRDMA_CQPHC_QPCTX GENMASK_ULL(63, 0) #define IRDMA_QP_DBSA_HW_SQ_TAIL GENMASK_ULL(14, 0) #define IRDMA_CQ_DBSA_CQEIDX GENMASK_ULL(19, 0) @@ -461,6 +420,8 @@ enum irdma_cqp_op_type { #define IRDMA_CCQ_OPRETVAL GENMASK_ULL(31, 0) +#define IRDMA_CCQ_DEFINFO GENMASK_ULL(63, 32) + #define IRDMA_CQ_MINERR GENMASK_ULL(15, 0) #define IRDMA_CQ_MAJERR GENMASK_ULL(31, 16) #define IRDMA_CQ_WQEIDX GENMASK_ULL(46, 32) @@ -469,6 +430,7 @@ enum irdma_cqp_op_type { #define IRDMA_CQ_ERROR BIT_ULL(55) #define IRDMA_CQ_SQ BIT_ULL(62) +#define IRDMA_CQ_SRQ BIT_ULL(52) #define IRDMA_CQ_VALID BIT_ULL(63) #define IRDMA_CQ_IMMVALID BIT_ULL(62) #define IRDMA_CQ_UDSMACVALID BIT_ULL(61) @@ -476,8 +438,6 @@ enum irdma_cqp_op_type { #define IRDMA_CQ_UDSMAC GENMASK_ULL(47, 0) #define IRDMA_CQ_UDVLAN GENMASK_ULL(63, 48) -#define IRDMA_CQ_IMMDATA_S 0 -#define IRDMA_CQ_IMMDATA_M (0xffffffffffffffffULL << IRDMA_CQ_IMMVALID_S) #define IRDMA_CQ_IMMDATALOW32 GENMASK_ULL(31, 0) #define IRDMA_CQ_IMMDATAUP32 GENMASK_ULL(63, 32) #define IRDMACQ_PAYLDLEN GENMASK_ULL(31, 0) @@ -508,6 +468,17 @@ enum irdma_cqp_op_type { #define IRDMA_AEQE_Q2DATA GENMASK_ULL(62, 61) #define IRDMA_AEQE_VALID BIT_ULL(63) +#define IRDMA_AEQE_Q2DATA_GEN_3 GENMASK_ULL(5, 4) +#define IRDMA_AEQE_TCPSTATE_GEN_3 GENMASK_ULL(3, 0) +#define IRDMA_AEQE_QPCQID_GEN_3 GENMASK_ULL(24, 0) +#define IRDMA_AEQE_AECODE_GEN_3 GENMASK_ULL(61, 50) +#define IRDMA_AEQE_OVERFLOW_GEN_3 BIT_ULL(62) +#define IRDMA_AEQE_WQDESCIDX_GEN_3 GENMASK_ULL(49, 32) +#define IRDMA_AEQE_IWSTATE_GEN_3 GENMASK_ULL(31, 29) +#define IRDMA_AEQE_AESRC_GEN_3 GENMASK_ULL(28, 25) +#define IRDMA_AEQE_CMPL_CTXT_S 6 +#define IRDMA_AEQE_CMPL_CTXT GENMASK_ULL(63, 6) + #define IRDMA_UDA_QPSQ_NEXT_HDR GENMASK_ULL(23, 16) #define IRDMA_UDA_QPSQ_OPCODE GENMASK_ULL(37, 32) #define IRDMA_UDA_QPSQ_L4LEN GENMASK_ULL(45, 42) @@ -530,11 +501,14 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_WQEVALID BIT_ULL(63) #define IRDMA_CQPSQ_TPHVAL GENMASK_ULL(7, 0) -#define IRDMA_CQPSQ_VSIIDX GENMASK_ULL(17, 8) +#define IRDMA_CQPSQ_VSIIDX GENMASK_ULL(23, 8) #define IRDMA_CQPSQ_TPHEN BIT_ULL(60) #define IRDMA_CQPSQ_PBUFADDR IRDMA_CQPHC_QPCTX +#define IRDMA_CQPSQ_PASID GENMASK_ULL(51, 32) +#define IRDMA_CQPSQ_PASID_VALID BIT_ULL(62) + /* Create/Modify/Destroy QP */ #define IRDMA_CQPSQ_QP_NEWMSS GENMASK_ULL(45, 32) @@ -566,10 +540,30 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_QP_DBSHADOWADDR IRDMA_CQPHC_QPCTX +#define IRDMA_CQPSQ_SRQ_RQSIZE GENMASK_ULL(3, 0) +#define IRDMA_CQPSQ_SRQ_RQ_WQE_SIZE GENMASK_ULL(5, 4) +#define IRDMA_CQPSQ_SRQ_SRQ_LIMIT GENMASK_ULL(43, 32) +#define IRDMA_CQPSQ_SRQ_SRQCTX GENMASK_ULL(63, 6) +#define IRDMA_CQPSQ_SRQ_PD_ID GENMASK_ULL(39, 16) +#define 
IRDMA_CQPSQ_SRQ_SRQ_ID GENMASK_ULL(15, 0) +#define IRDMA_CQPSQ_SRQ_OP GENMASK_ULL(37, 32) +#define IRDMA_CQPSQ_SRQ_LEAF_PBL_SIZE GENMASK_ULL(45, 44) +#define IRDMA_CQPSQ_SRQ_VIRTMAP BIT_ULL(47) +#define IRDMA_CQPSQ_SRQ_TPH_EN BIT_ULL(60) +#define IRDMA_CQPSQ_SRQ_ARM_LIMIT_EVENT BIT_ULL(61) +#define IRDMA_CQPSQ_SRQ_FIRST_PM_PBL_IDX GENMASK_ULL(27, 0) +#define IRDMA_CQPSQ_SRQ_TPH_VALUE GENMASK_ULL(7, 0) +#define IRDMA_CQPSQ_SRQ_PHYSICAL_BUFFER_ADDR_S 8 +#define IRDMA_CQPSQ_SRQ_PHYSICAL_BUFFER_ADDR GENMASK_ULL(63, 8) +#define IRDMA_CQPSQ_SRQ_DB_SHADOW_ADDR_S 6 +#define IRDMA_CQPSQ_SRQ_DB_SHADOW_ADDR GENMASK_ULL(63, 6) + #define IRDMA_CQPSQ_CQ_CQSIZE GENMASK_ULL(20, 0) #define IRDMA_CQPSQ_CQ_CQCTX GENMASK_ULL(62, 0) #define IRDMA_CQPSQ_CQ_SHADOW_READ_THRESHOLD GENMASK(17, 0) +#define IRDMA_CQPSQ_CQ_CQID_HIGH GENMASK_ULL(52, 50) +#define IRDMA_CQPSQ_CQ_CEQID_HIGH GENMASK_ULL(59, 54) #define IRDMA_CQPSQ_CQ_OP GENMASK_ULL(37, 32) #define IRDMA_CQPSQ_CQ_CQRESIZE BIT_ULL(43) #define IRDMA_CQPSQ_CQ_LPBLSIZE GENMASK_ULL(45, 44) @@ -590,6 +584,7 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_STAG_MR BIT_ULL(43) #define IRDMA_CQPSQ_STAG_MWTYPE BIT_ULL(42) #define IRDMA_CQPSQ_STAG_MW1_BIND_DONT_VLDT_KEY BIT_ULL(58) +#define IRDMA_CQPSQ_STAG_PDID_HI GENMASK_ULL(59, 54) #define IRDMA_CQPSQ_STAG_LPBLSIZE IRDMA_CQPSQ_CQ_LPBLSIZE #define IRDMA_CQPSQ_STAG_HPAGESIZE GENMASK_ULL(47, 46) @@ -600,7 +595,8 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_STAG_USEPFRID BIT_ULL(61) #define IRDMA_CQPSQ_STAG_PBA IRDMA_CQPHC_QPCTX -#define IRDMA_CQPSQ_STAG_HMCFNIDX GENMASK_ULL(5, 0) +#define IRDMA_CQPSQ_STAG_HMCFNIDX GENMASK_ULL(15, 0) +#define IRDMA_CQPSQ_STAG_REMOTE_ATOMIC_EN BIT_ULL(61) #define IRDMA_CQPSQ_STAG_FIRSTPMPBLIDX GENMASK_ULL(27, 0) #define IRDMA_CQPSQ_QUERYSTAG_IDX IRDMA_CQPSQ_STAG_IDX @@ -628,11 +624,8 @@ enum irdma_cqp_op_type { /* Manage Push Page - MPP */ #define IRDMA_INVALID_PUSH_PAGE_INDEX_GEN_1 0xffff #define IRDMA_INVALID_PUSH_PAGE_INDEX 0xffffffff - -#define IRDMA_CQPSQ_MPP_QS_HANDLE GENMASK_ULL(9, 0) -#define IRDMA_CQPSQ_MPP_PPIDX GENMASK_ULL(9, 0) +#define IRDMA_CQPSQ_MPP_PPIDX GENMASK_ULL(31, 0) #define IRDMA_CQPSQ_MPP_PPTYPE GENMASK_ULL(61, 60) - #define IRDMA_CQPSQ_MPP_FREE_PAGE BIT_ULL(62) /* Upload Context - UCTX */ @@ -651,6 +644,8 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_CEQ_CEQSIZE GENMASK_ULL(21, 0) #define IRDMA_CQPSQ_CEQ_CEQID GENMASK_ULL(9, 0) +#define IRDMA_CQPSQ_CEQ_CEQID_HIGH GENMASK_ULL(15, 10) + #define IRDMA_CQPSQ_CEQ_LPBLSIZE IRDMA_CQPSQ_CQ_LPBLSIZE #define IRDMA_CQPSQ_CEQ_VMAP BIT_ULL(47) #define IRDMA_CQPSQ_CEQ_ITRNOEXPIRE BIT_ULL(46) @@ -660,10 +655,10 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_AEQ_VMAP BIT_ULL(47) #define IRDMA_CQPSQ_AEQ_FIRSTPMPBLIDX GENMASK_ULL(27, 0) -#define IRDMA_COMMIT_FPM_QPCNT GENMASK_ULL(18, 0) - +#define IRDMA_COMMIT_FPM_QPCNT GENMASK_ULL(20, 0) #define IRDMA_COMMIT_FPM_BASE_S 32 -#define IRDMA_CQPSQ_CFPM_HMCFNID GENMASK_ULL(5, 0) +#define IRDMA_CQPSQ_CFPM_HMCFNID GENMASK_ULL(15, 0) + #define IRDMA_CQPSQ_FWQE_AECODE GENMASK_ULL(15, 0) #define IRDMA_CQPSQ_FWQE_AESOURCE GENMASK_ULL(19, 16) #define IRDMA_CQPSQ_FWQE_RQMNERR GENMASK_ULL(15, 0) @@ -675,6 +670,10 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_FWQE_USERFLCODE BIT_ULL(60) #define IRDMA_CQPSQ_FWQE_FLUSHSQ BIT_ULL(61) #define IRDMA_CQPSQ_FWQE_FLUSHRQ BIT_ULL(62) +#define IRDMA_CQPSQ_FWQE_ERR_SQ_IDX_VALID BIT_ULL(42) +#define IRDMA_CQPSQ_FWQE_ERR_SQ_IDX GENMASK_ULL(49, 32) +#define IRDMA_CQPSQ_FWQE_ERR_RQ_IDX_VALID BIT_ULL(43) +#define IRDMA_CQPSQ_FWQE_ERR_RQ_IDX 
GENMASK_ULL(46, 32) #define IRDMA_CQPSQ_MAPT_PORT GENMASK_ULL(15, 0) #define IRDMA_CQPSQ_MAPT_ADDPORT BIT_ULL(62) #define IRDMA_CQPSQ_UPESD_SDCMD GENMASK_ULL(31, 0) @@ -693,9 +692,12 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_SUSPENDQP_QPID GENMASK_ULL(23, 0) #define IRDMA_CQPSQ_RESUMEQP_QSHANDLE GENMASK_ULL(31, 0) #define IRDMA_CQPSQ_RESUMEQP_QPID GENMASK(23, 0) +#define IRDMA_MANAGE_RSRC_VER2 BIT_ULL(2) #define IRDMA_CQPSQ_MIN_STAG_INVALID 0x0001 #define IRDMA_CQPSQ_MIN_SUSPEND_PND 0x0005 +#define IRDMA_CQPSQ_MIN_DEF_CMPL 0x0006 +#define IRDMA_CQPSQ_MIN_OOO_CMPL 0x0007 #define IRDMA_CQPSQ_MAJ_NO_ERROR 0x0000 #define IRDMA_CQPSQ_MAJ_OBJCACHE_ERROR 0xF000 @@ -712,6 +714,11 @@ enum irdma_cqp_op_type { #define IRDMAQPC_INSERTL2TAG2 BIT_ULL(11) #define IRDMAQPC_LIMIT GENMASK_ULL(13, 12) +#define IRDMAQPC_USE_SRQ BIT_ULL(10) +#define IRDMAQPC_SRQ_ID GENMASK_ULL(15, 0) +#define IRDMAQPC_PASID GENMASK_ULL(19, 0) +#define IRDMAQPC_PASID_VALID BIT_ULL(11) + #define IRDMAQPC_ECN_EN BIT_ULL(14) #define IRDMAQPC_DROPOOOSEG BIT_ULL(15) #define IRDMAQPC_DUPACK_THRESH GENMASK_ULL(18, 16) @@ -782,21 +789,31 @@ enum irdma_cqp_op_type { #define IRDMAQPC_CWNDROCE GENMASK_ULL(55, 32) #define IRDMAQPC_SNDWL1 GENMASK_ULL(31, 0) #define IRDMAQPC_SNDWL2 GENMASK_ULL(63, 32) -#define IRDMAQPC_ERR_RQ_IDX GENMASK_ULL(45, 32) +#define IRDMAQPC_MINRNR_TIMER GENMASK_ULL(4, 0) +#define IRDMAQPC_ERR_RQ_IDX GENMASK_ULL(46, 32) #define IRDMAQPC_RTOMIN GENMASK_ULL(63, 57) #define IRDMAQPC_MAXSNDWND GENMASK_ULL(31, 0) #define IRDMAQPC_REXMIT_THRESH GENMASK_ULL(53, 48) #define IRDMAQPC_RNRNAK_THRESH GENMASK_ULL(56, 54) -#define IRDMAQPC_TXCQNUM GENMASK_ULL(18, 0) -#define IRDMAQPC_RXCQNUM GENMASK_ULL(50, 32) +#define IRDMAQPC_TXCQNUM GENMASK_ULL(24, 0) +#define IRDMAQPC_RXCQNUM GENMASK_ULL(56, 32) #define IRDMAQPC_STAT_INDEX GENMASK_ULL(6, 0) #define IRDMAQPC_Q2ADDR GENMASK_ULL(63, 8) #define IRDMAQPC_LASTBYTESENT GENMASK_ULL(7, 0) #define IRDMAQPC_MACADDRESS GENMASK_ULL(63, 16) #define IRDMAQPC_ORDSIZE GENMASK_ULL(7, 0) +#define IRDMAQPC_LOCALACKTIMEOUT GENMASK_ULL(12, 8) +#define IRDMAQPC_RNRNAK_TMR GENMASK_ULL(4, 0) +#define IRDMAQPC_ORDSIZE_GEN3 GENMASK_ULL(10, 0) +#define IRDMAQPC_REMOTE_ATOMIC_EN BIT_ULL(18) +#define IRDMAQPC_STAT_INDEX_GEN3 GENMASK_ULL(47, 32) +#define IRDMAQPC_PKT_LIMIT GENMASK_ULL(55, 48) + #define IRDMAQPC_IRDSIZE GENMASK_ULL(18, 16) +#define IRDMAQPC_IRDSIZE_GEN3 GENMASK_ULL(17, 14) + #define IRDMAQPC_UDPRIVCQENABLE BIT_ULL(19) #define IRDMAQPC_WRRDRSPOK BIT_ULL(20) #define IRDMAQPC_RDOK BIT_ULL(21) @@ -833,6 +850,7 @@ enum irdma_cqp_op_type { #define IRDMA_FEATURE_INFO GENMASK_ULL(47, 0) #define IRDMA_FEATURE_CNT GENMASK_ULL(47, 32) #define IRDMA_FEATURE_TYPE GENMASK_ULL(63, 48) +#define IRDMA_FEATURE_RSRC_MAX GENMASK_ULL(31, 0) #define IRDMAQPSQ_OPCODE GENMASK_ULL(37, 32) #define IRDMAQPSQ_COPY_HOST_PBL BIT_ULL(43) @@ -856,7 +874,7 @@ enum irdma_cqp_op_type { #define IRDMAQPSQ_REMSTAGINV GENMASK_ULL(31, 0) #define IRDMAQPSQ_DESTQKEY GENMASK_ULL(31, 0) #define IRDMAQPSQ_DESTQPN GENMASK_ULL(55, 32) -#define IRDMAQPSQ_AHID GENMASK_ULL(16, 0) +#define IRDMAQPSQ_AHID GENMASK_ULL(24, 0) #define IRDMAQPSQ_INLINEDATAFLAG BIT_ULL(57) #define IRDMA_INLINE_VALID_S 7 @@ -869,6 +887,9 @@ enum irdma_cqp_op_type { #define IRDMAQPSQ_REMTO IRDMA_CQPHC_QPCTX +#define IRDMAQPSQ_STAG GENMASK_ULL(31, 0) +#define IRDMAQPSQ_REMOTE_STAG GENMASK_ULL(31, 0) + #define IRDMAQPSQ_STAGRIGHTS GENMASK_ULL(52, 48) #define IRDMAQPSQ_VABASEDTO BIT_ULL(53) #define IRDMAQPSQ_MEMWINDOWTYPE BIT_ULL(54) @@ -879,6 +900,8 @@ enum 
irdma_cqp_op_type { #define IRDMAQPSQ_BASEVA_TO_FBO IRDMA_CQPHC_QPCTX +#define IRDMAQPSQ_REMOTE_ATOMICS_EN BIT_ULL(55) + #define IRDMAQPSQ_LOCSTAG GENMASK_ULL(31, 0) #define IRDMAQPSQ_STAGKEY GENMASK_ULL(7, 0) @@ -903,11 +926,14 @@ enum irdma_cqp_op_type { #define IRDMAPFINT_OICR_PE_PUSH_M BIT(27) #define IRDMAPFINT_OICR_PE_CRITERR_M BIT(28) -#define IRDMA_QUERY_FPM_MAX_QPS GENMASK_ULL(18, 0) -#define IRDMA_QUERY_FPM_MAX_CQS GENMASK_ULL(19, 0) +#define IRDMA_QUERY_FPM_LOC_MEM_PAGES GENMASK_ULL(63, 32) +#define IRDMA_QUERY_FPM_MAX_QPS GENMASK_ULL(31, 0) +#define IRDMA_QUERY_FPM_MAX_CQS GENMASK_ULL(31, 0) #define IRDMA_QUERY_FPM_FIRST_PE_SD_INDEX GENMASK_ULL(13, 0) -#define IRDMA_QUERY_FPM_MAX_PE_SDS GENMASK_ULL(45, 32) +#define IRDMA_QUERY_FPM_MAX_PE_SDS GENMASK_ULL(44, 32) +#define IRDMA_QUERY_FPM_MAX_PE_SDS_GEN3 GENMASK_ULL(47, 32) #define IRDMA_QUERY_FPM_MAX_CEQS GENMASK_ULL(9, 0) +#define IRDMA_QUERY_FPM_MAX_IRD GENMASK_ULL(53, 50) #define IRDMA_QUERY_FPM_XFBLOCKSIZE GENMASK_ULL(63, 32) #define IRDMA_QUERY_FPM_Q1BLOCKSIZE GENMASK_ULL(63, 32) #define IRDMA_QUERY_FPM_HTMULTIPLIER GENMASK_ULL(19, 16) @@ -1103,7 +1129,7 @@ enum irdma_alignment { IRDMA_CEQ_ALIGNMENT = 0x100, IRDMA_CQ0_ALIGNMENT = 0x100, IRDMA_SD_BUF_ALIGNMENT = 0x80, - IRDMA_FEATURE_BUF_ALIGNMENT = 0x8, + IRDMA_FEATURE_BUF_ALIGNMENT = 0x10, }; enum icrdma_protocol_used { diff --git a/drivers/infiniband/hw/irdma/hmc.c b/drivers/infiniband/hw/irdma/hmc.c index ac58088a8e41..da18add141da 100644 --- a/drivers/infiniband/hw/irdma/hmc.c +++ b/drivers/infiniband/hw/irdma/hmc.c @@ -5,6 +5,7 @@ #include "defs.h" #include "type.h" #include "protos.h" +#include "virtchnl.h" /** * irdma_find_sd_index_limit - finds segment descriptor index limit @@ -228,6 +229,10 @@ int irdma_sc_create_hmc_obj(struct irdma_sc_dev *dev, bool pd_error = false; int ret_code = 0; + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3 && + dev->hmc_info->hmc_obj[info->rsrc_type].mem_loc == IRDMA_LOC_MEM) + return 0; + if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt) return -EINVAL; @@ -330,7 +335,7 @@ static int irdma_finish_del_sd_reg(struct irdma_sc_dev *dev, u32 i, sd_idx; struct irdma_dma_mem *mem; - if (!reset) + if (dev->privileged && !reset) ret_code = irdma_hmc_sd_grp(dev, info->hmc_info, info->hmc_info->sd_indexes[0], info->del_sd_cnt, false); @@ -376,6 +381,9 @@ int irdma_sc_del_hmc_obj(struct irdma_sc_dev *dev, u32 i, j; int ret_code = 0; + if (dev->hmc_info->hmc_obj[info->rsrc_type].mem_loc == IRDMA_LOC_MEM) + return 0; + if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt) { ibdev_dbg(to_ibdev(dev), "HMC: error start_idx[%04d] >= [type %04d].cnt[%04d]\n", @@ -589,7 +597,10 @@ int irdma_add_pd_table_entry(struct irdma_sc_dev *dev, pd_entry->sd_index = sd_idx; pd_entry->valid = true; pd_table->use_cnt++; - irdma_invalidate_pf_hmc_pd(dev, sd_idx, rel_pd_idx); + + if (hmc_info->hmc_fn_id < dev->hw_attrs.first_hw_vf_fpm_id && + dev->privileged) + irdma_invalidate_pf_hmc_pd(dev, sd_idx, rel_pd_idx); } pd_entry->bp.use_cnt++; @@ -640,7 +651,8 @@ int irdma_remove_pd_bp(struct irdma_sc_dev *dev, pd_addr = pd_table->pd_page_addr.va; pd_addr += rel_pd_idx; memset(pd_addr, 0, sizeof(u64)); - irdma_invalidate_pf_hmc_pd(dev, sd_idx, idx); + if (dev->privileged && dev->hmc_fn_id == hmc_info->hmc_fn_id) + irdma_invalidate_pf_hmc_pd(dev, sd_idx, idx); if (!pd_entry->rsrc_pg) { mem = &pd_entry->bp.addr; diff --git a/drivers/infiniband/hw/irdma/hmc.h b/drivers/infiniband/hw/irdma/hmc.h index 415f9e23bbf6..257a5d22aa96 100644 --- 
a/drivers/infiniband/hw/irdma/hmc.h +++ b/drivers/infiniband/hw/irdma/hmc.h @@ -16,11 +16,21 @@ #define IRDMA_HMC_PD_BP_BUF_ALIGNMENT 4096 #define IRDMA_FIRST_VF_FPM_ID 8 #define FPM_MULTIPLIER 1024 +#define IRDMA_OBJ_LOC_MEM_BIT 0x4 +#define IRDMA_XF_MULTIPLIER 16 +#define IRDMA_RRF_MULTIPLIER 8 +#define IRDMA_MIN_PBLE_PAGES 3 +#define IRDMA_HMC_PAGE_SIZE 2097152 +#define IRDMA_MIN_MR_PER_QP 4 +#define IRDMA_MIN_QP_CNT 64 +#define IRDMA_FSIAV_CNT_MAX 1048576 +#define IRDMA_MIN_IRD 8 +#define IRDMA_HMC_MIN_RRF 16 enum irdma_hmc_rsrc_type { IRDMA_HMC_IW_QP = 0, IRDMA_HMC_IW_CQ = 1, - IRDMA_HMC_IW_RESERVED = 2, + IRDMA_HMC_IW_SRQ = 2, IRDMA_HMC_IW_HTE = 3, IRDMA_HMC_IW_ARP = 4, IRDMA_HMC_IW_APBVT_ENTRY = 5, @@ -48,11 +58,17 @@ enum irdma_sd_entry_type { IRDMA_SD_TYPE_DIRECT = 2, }; +enum irdma_hmc_obj_mem { + IRDMA_HOST_MEM = 0, + IRDMA_LOC_MEM = 1, +}; + struct irdma_hmc_obj_info { u64 base; u32 max_cnt; u32 cnt; u64 size; + enum irdma_hmc_obj_mem mem_loc; }; struct irdma_hmc_bp { @@ -117,6 +133,7 @@ struct irdma_update_sds_info { struct irdma_ccq_cqe_info; struct irdma_hmc_fcn_info { u32 vf_id; + u8 protocol_used; u8 free_fcn; }; diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index 69ce1862eabe..7bad0e38786a 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -33,6 +33,7 @@ static struct irdma_rsrc_limits rsrc_limits_table[] = { static enum irdma_hmc_rsrc_type iw_hmc_obj_types[] = { IRDMA_HMC_IW_QP, IRDMA_HMC_IW_CQ, + IRDMA_HMC_IW_SRQ, IRDMA_HMC_IW_HTE, IRDMA_HMC_IW_ARP, IRDMA_HMC_IW_APBVT_ENTRY, @@ -134,75 +135,68 @@ static void irdma_process_ceq(struct irdma_pci_f *rf, struct irdma_ceq *ceq) static void irdma_set_flush_fields(struct irdma_sc_qp *qp, struct irdma_aeqe_info *info) { + struct qp_err_code qp_err; + qp->sq_flush_code = info->sq; qp->rq_flush_code = info->rq; - qp->event_type = IRDMA_QP_EVENT_CATASTROPHIC; - - switch (info->ae_id) { - case IRDMA_AE_AMP_BOUNDS_VIOLATION: - case IRDMA_AE_AMP_INVALID_STAG: - case IRDMA_AE_AMP_RIGHTS_VIOLATION: - case IRDMA_AE_AMP_UNALLOCATED_STAG: - case IRDMA_AE_AMP_BAD_PD: - case IRDMA_AE_AMP_BAD_QP: - case IRDMA_AE_AMP_BAD_STAG_KEY: - case IRDMA_AE_AMP_BAD_STAG_INDEX: - case IRDMA_AE_AMP_TO_WRAP: - case IRDMA_AE_PRIV_OPERATION_DENIED: - qp->flush_code = FLUSH_PROT_ERR; - qp->event_type = IRDMA_QP_EVENT_ACCESS_ERR; - break; - case IRDMA_AE_UDA_XMIT_BAD_PD: - case IRDMA_AE_WQE_UNEXPECTED_OPCODE: - qp->flush_code = FLUSH_LOC_QP_OP_ERR; - qp->event_type = IRDMA_QP_EVENT_CATASTROPHIC; - break; - case IRDMA_AE_UDA_XMIT_DGRAM_TOO_LONG: - case IRDMA_AE_UDA_XMIT_DGRAM_TOO_SHORT: - case IRDMA_AE_UDA_L4LEN_INVALID: - case IRDMA_AE_DDP_UBE_INVALID_MO: - case IRDMA_AE_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: - qp->flush_code = FLUSH_LOC_LEN_ERR; - qp->event_type = IRDMA_QP_EVENT_CATASTROPHIC; - break; - case IRDMA_AE_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS: - case IRDMA_AE_IB_REMOTE_ACCESS_ERROR: - qp->flush_code = FLUSH_REM_ACCESS_ERR; - qp->event_type = IRDMA_QP_EVENT_ACCESS_ERR; - break; - case IRDMA_AE_LLP_SEGMENT_TOO_SMALL: - case IRDMA_AE_LLP_RECEIVED_MPA_CRC_ERROR: - case IRDMA_AE_ROCE_RSP_LENGTH_ERROR: - case IRDMA_AE_IB_REMOTE_OP_ERROR: - qp->flush_code = FLUSH_REM_OP_ERR; - qp->event_type = IRDMA_QP_EVENT_CATASTROPHIC; - break; - case IRDMA_AE_LCE_QP_CATASTROPHIC: - qp->flush_code = FLUSH_FATAL_ERR; - qp->event_type = IRDMA_QP_EVENT_CATASTROPHIC; - break; - case IRDMA_AE_IB_RREQ_AND_Q1_FULL: - qp->flush_code = FLUSH_GENERAL_ERR; - break; - case 
IRDMA_AE_LLP_TOO_MANY_RETRIES: - qp->flush_code = FLUSH_RETRY_EXC_ERR; - qp->event_type = IRDMA_QP_EVENT_CATASTROPHIC; - break; - case IRDMA_AE_AMP_MWBIND_INVALID_RIGHTS: - case IRDMA_AE_AMP_MWBIND_BIND_DISABLED: - case IRDMA_AE_AMP_MWBIND_INVALID_BOUNDS: - case IRDMA_AE_AMP_MWBIND_VALID_STAG: - qp->flush_code = FLUSH_MW_BIND_ERR; - qp->event_type = IRDMA_QP_EVENT_ACCESS_ERR; - break; - case IRDMA_AE_IB_INVALID_REQUEST: - qp->flush_code = FLUSH_REM_INV_REQ_ERR; - qp->event_type = IRDMA_QP_EVENT_REQ_ERR; - break; - default: - qp->flush_code = FLUSH_GENERAL_ERR; - qp->event_type = IRDMA_QP_EVENT_CATASTROPHIC; - break; + if (qp->qp_uk.uk_attrs->hw_rev >= IRDMA_GEN_3) { + if (info->sq) { + qp->err_sq_idx_valid = true; + qp->err_sq_idx = info->wqe_idx; + } + if (info->rq) { + qp->err_rq_idx_valid = true; + qp->err_rq_idx = info->wqe_idx; + } + } + + qp_err = irdma_ae_to_qp_err_code(info->ae_id); + qp->flush_code = qp_err.flush_code; + qp->event_type = qp_err.event_type; +} + +/** + * irdma_complete_cqp_request - perform post-completion cleanup + * @cqp: device CQP + * @cqp_request: CQP request + * + * Mark CQP request as done, wake up waiting thread or invoke + * callback function and release/free CQP request. + */ +static void irdma_complete_cqp_request(struct irdma_cqp *cqp, + struct irdma_cqp_request *cqp_request) +{ + if (cqp_request->waiting) { + WRITE_ONCE(cqp_request->request_done, true); + wake_up(&cqp_request->waitq); + } else if (cqp_request->callback_fcn) { + cqp_request->callback_fcn(cqp_request); + } + irdma_put_cqp_request(cqp, cqp_request); +} + +/** + * irdma_process_ae_def_cmpl - handle IRDMA_AE_CQP_DEFERRED_COMPLETE event + * @rf: RDMA PCI function + * @info: AEQ entry info + */ +static void irdma_process_ae_def_cmpl(struct irdma_pci_f *rf, + struct irdma_aeqe_info *info) +{ + u32 sw_def_info; + u64 scratch; + + irdma_cqp_ce_handler(rf, &rf->ccq.sc_cq); + + irdma_sc_cqp_def_cmpl_ae_handler(&rf->sc_dev, info, true, + &scratch, &sw_def_info); + while (scratch) { + struct irdma_cqp_request *cqp_request = + (struct irdma_cqp_request *)(uintptr_t)scratch; + + irdma_complete_cqp_request(&rf->cqp, cqp_request); + irdma_sc_cqp_def_cmpl_ae_handler(&rf->sc_dev, info, false, + &scratch, &sw_def_info); } } @@ -223,6 +217,7 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) struct irdma_sc_qp *qp = NULL; struct irdma_qp_host_ctx_info *ctx_info = NULL; struct irdma_device *iwdev = rf->iwdev; + struct irdma_sc_srq *srq; unsigned long flags; u32 aeqcnt = 0; @@ -236,6 +231,13 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) if (ret) break; + if (info->aeqe_overflow) { + ibdev_err(&iwdev->ibdev, "AEQ has overflowed\n"); + rf->reset = true; + rf->gen_ops.request_reset(rf); + return; + } + aeqcnt++; ibdev_dbg(&iwdev->ibdev, "AEQ: ae_id = 0x%x bool qp=%d qp_id = %d tcp_state=%d iwarp_state=%d ae_src=%d\n", @@ -266,9 +268,12 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) if (info->ae_id != IRDMA_AE_QP_SUSPEND_COMPLETE) iwqp->last_aeq = info->ae_id; spin_unlock_irqrestore(&iwqp->lock, flags); - ctx_info = &iwqp->ctx_info; + } else if (info->srq) { + if (info->ae_id != IRDMA_AE_SRQ_LIMIT) + continue; } else { - if (info->ae_id != IRDMA_AE_CQ_OPERATION_ERROR) + if (info->ae_id != IRDMA_AE_CQ_OPERATION_ERROR && + info->ae_id != IRDMA_AE_CQP_DEFERRED_COMPLETE) continue; } @@ -363,6 +368,18 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) } irdma_cq_rem_ref(&iwcq->ibcq); break; + case IRDMA_AE_SRQ_LIMIT: + srq = (struct irdma_sc_srq *)(uintptr_t)info->compl_ctx; + 
irdma_srq_event(srq); + break; + case IRDMA_AE_SRQ_CATASTROPHIC_ERROR: + break; + case IRDMA_AE_CQP_DEFERRED_COMPLETE: + /* Remove completed CQP requests from pending list + * and notify about those CQP ops completion. + */ + irdma_process_ae_def_cmpl(rf, info); + break; case IRDMA_AE_RESET_NOT_SENT: case IRDMA_AE_LLP_DOUBT_REACHABILITY: case IRDMA_AE_RESOURCE_EXHAUSTION: @@ -389,13 +406,18 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) case IRDMA_AE_LCE_FUNCTION_CATASTROPHIC: case IRDMA_AE_LLP_TOO_MANY_RNRS: case IRDMA_AE_LCE_CQ_CATASTROPHIC: + case IRDMA_AE_REMOTE_QP_CATASTROPHIC: + case IRDMA_AE_LOCAL_QP_CATASTROPHIC: + case IRDMA_AE_RCE_QP_CATASTROPHIC: case IRDMA_AE_UDA_XMIT_DGRAM_TOO_LONG: default: ibdev_err(&iwdev->ibdev, "abnormal ae_id = 0x%x bool qp=%d qp_id = %d, ae_src=%d\n", info->ae_id, info->qp, info->qp_cq_id, info->ae_src); - if (rdma_protocol_roce(&iwdev->ibdev, 1)) { - ctx_info->roce_info->err_rq_idx_valid = info->rq; - if (info->rq) { + ctx_info = &iwqp->ctx_info; + if (rdma_protocol_roce(&iwqp->iwdev->ibdev, 1)) { + ctx_info->roce_info->err_rq_idx_valid = + ctx_info->srq_valid ? false : info->err_rq_idx_valid; + if (ctx_info->roce_info->err_rq_idx_valid) { ctx_info->roce_info->err_rq_idx = info->wqe_idx; irdma_sc_qp_setctx_roce(&iwqp->sc_qp, iwqp->host_ctx.va, ctx_info); @@ -599,6 +621,8 @@ static void irdma_destroy_cqp(struct irdma_pci_f *rf) dma_free_coherent(dev->hw->device, cqp->sq.size, cqp->sq.va, cqp->sq.pa); cqp->sq.va = NULL; + kfree(cqp->oop_op_array); + cqp->oop_op_array = NULL; kfree(cqp->scratch_array); cqp->scratch_array = NULL; kfree(cqp->cqp_requests); @@ -631,7 +655,9 @@ static void irdma_destroy_aeq(struct irdma_pci_f *rf) int status = -EBUSY; if (!rf->msix_shared) { - rf->sc_dev.irq_ops->irdma_cfg_aeq(&rf->sc_dev, rf->iw_msixtbl->idx, false); + if (rf->sc_dev.privileged) + rf->sc_dev.irq_ops->irdma_cfg_aeq(&rf->sc_dev, + rf->iw_msixtbl->idx, false); irdma_destroy_irq(rf, rf->iw_msixtbl, rf); } if (rf->reset) @@ -697,9 +723,10 @@ static void irdma_del_ceq_0(struct irdma_pci_f *rf) if (rf->msix_shared) { msix_vec = &rf->iw_msixtbl[0]; - rf->sc_dev.irq_ops->irdma_cfg_ceq(&rf->sc_dev, - msix_vec->ceq_id, - msix_vec->idx, false); + if (rf->sc_dev.privileged) + rf->sc_dev.irq_ops->irdma_cfg_ceq(&rf->sc_dev, + msix_vec->ceq_id, + msix_vec->idx, false); irdma_destroy_irq(rf, msix_vec, rf); } else { msix_vec = &rf->iw_msixtbl[1]; @@ -730,8 +757,10 @@ static void irdma_del_ceqs(struct irdma_pci_f *rf) msix_vec = &rf->iw_msixtbl[2]; for (i = 1; i < rf->ceqs_count; i++, msix_vec++, iwceq++) { - rf->sc_dev.irq_ops->irdma_cfg_ceq(&rf->sc_dev, msix_vec->ceq_id, - msix_vec->idx, false); + if (rf->sc_dev.privileged) + rf->sc_dev.irq_ops->irdma_cfg_ceq(&rf->sc_dev, + msix_vec->ceq_id, + msix_vec->idx, false); irdma_destroy_irq(rf, msix_vec, iwceq); irdma_cqp_ceq_cmd(&rf->sc_dev, &iwceq->sc_ceq, IRDMA_OP_CEQ_DESTROY); @@ -942,6 +971,13 @@ static int irdma_create_cqp(struct irdma_pci_f *rf) goto err_scratch; } + cqp->oop_op_array = kcalloc(sqsize, sizeof(*cqp->oop_op_array), + GFP_KERNEL); + if (!cqp->oop_op_array) { + status = -ENOMEM; + goto err_oop; + } + cqp_init_info.ooo_op_array = cqp->oop_op_array; dev->cqp = &cqp->sc_cqp; dev->cqp->dev = dev; cqp->sq.size = ALIGN(sizeof(struct irdma_cqp_sq_wqe) * sqsize, @@ -978,6 +1014,10 @@ static int irdma_create_cqp(struct irdma_pci_f *rf) case IRDMA_GEN_2: cqp_init_info.hw_maj_ver = IRDMA_CQPHC_HW_MAJVER_GEN_2; break; + case IRDMA_GEN_3: + cqp_init_info.hw_maj_ver = IRDMA_CQPHC_HW_MAJVER_GEN_3; + 
cqp_init_info.ts_override = 1; + break; } status = irdma_sc_cqp_init(dev->cqp, &cqp_init_info); if (status) { @@ -1012,6 +1052,9 @@ err_ctx: cqp->sq.va, cqp->sq.pa); cqp->sq.va = NULL; err_sq: + kfree(cqp->oop_op_array); + cqp->oop_op_array = NULL; +err_oop: kfree(cqp->scratch_array); cqp->scratch_array = NULL; err_scratch: @@ -1033,13 +1076,15 @@ static int irdma_create_ccq(struct irdma_pci_f *rf) struct irdma_sc_dev *dev = &rf->sc_dev; struct irdma_ccq_init_info info = {}; struct irdma_ccq *ccq = &rf->ccq; + int ccq_size; int status; dev->ccq = &ccq->sc_cq; dev->ccq->dev = dev; info.dev = dev; + ccq_size = (rf->rdma_ver >= IRDMA_GEN_3) ? IW_GEN_3_CCQ_SIZE : IW_CCQ_SIZE; ccq->shadow_area.size = sizeof(struct irdma_cq_shadow_area); - ccq->mem_cq.size = ALIGN(sizeof(struct irdma_cqe) * IW_CCQ_SIZE, + ccq->mem_cq.size = ALIGN(sizeof(struct irdma_cqe) * ccq_size, IRDMA_CQ0_ALIGNMENT); ccq->mem_cq.va = dma_alloc_coherent(dev->hw->device, ccq->mem_cq.size, &ccq->mem_cq.pa, GFP_KERNEL); @@ -1056,7 +1101,7 @@ static int irdma_create_ccq(struct irdma_pci_f *rf) /* populate the ccq init info */ info.cq_base = ccq->mem_cq.va; info.cq_pa = ccq->mem_cq.pa; - info.num_elem = IW_CCQ_SIZE; + info.num_elem = ccq_size; info.shadow_area = ccq->shadow_area.va; info.shadow_area_pa = ccq->shadow_area.pa; info.ceqe_mask = false; @@ -1140,9 +1185,13 @@ static int irdma_cfg_ceq_vector(struct irdma_pci_f *rf, struct irdma_ceq *iwceq, } msix_vec->ceq_id = ceq_id; - rf->sc_dev.irq_ops->irdma_cfg_ceq(&rf->sc_dev, ceq_id, msix_vec->idx, true); - - return 0; + if (rf->sc_dev.privileged) + rf->sc_dev.irq_ops->irdma_cfg_ceq(&rf->sc_dev, ceq_id, + msix_vec->idx, true); + else + status = irdma_vchnl_req_ceq_vec_map(&rf->sc_dev, ceq_id, + msix_vec->idx); + return status; } /** @@ -1155,7 +1204,7 @@ static int irdma_cfg_ceq_vector(struct irdma_pci_f *rf, struct irdma_ceq *iwceq, static int irdma_cfg_aeq_vector(struct irdma_pci_f *rf) { struct irdma_msix_vector *msix_vec = rf->iw_msixtbl; - u32 ret = 0; + int ret = 0; if (!rf->msix_shared) { snprintf(msix_vec->name, sizeof(msix_vec->name) - 1, @@ -1166,12 +1215,16 @@ static int irdma_cfg_aeq_vector(struct irdma_pci_f *rf) } if (ret) { ibdev_dbg(&rf->iwdev->ibdev, "ERR: aeq irq config fail\n"); - return -EINVAL; + return ret; } - rf->sc_dev.irq_ops->irdma_cfg_aeq(&rf->sc_dev, msix_vec->idx, true); + if (rf->sc_dev.privileged) + rf->sc_dev.irq_ops->irdma_cfg_aeq(&rf->sc_dev, msix_vec->idx, + true); + else + ret = irdma_vchnl_req_aeq_vec_map(&rf->sc_dev, msix_vec->idx); - return 0; + return ret; } /** @@ -1179,13 +1232,13 @@ static int irdma_cfg_aeq_vector(struct irdma_pci_f *rf) * @rf: RDMA PCI function * @iwceq: pointer to the ceq resources to be created * @ceq_id: the id number of the iwceq - * @vsi: SC vsi struct + * @vsi_idx: vsi idx * * Return 0, if the ceq and the resources associated with it * are successfully created, otherwise return error */ static int irdma_create_ceq(struct irdma_pci_f *rf, struct irdma_ceq *iwceq, - u32 ceq_id, struct irdma_sc_vsi *vsi) + u32 ceq_id, u16 vsi_idx) { int status; struct irdma_ceq_init_info info = {}; @@ -1209,7 +1262,7 @@ static int irdma_create_ceq(struct irdma_pci_f *rf, struct irdma_ceq *iwceq, info.elem_cnt = ceq_size; iwceq->sc_ceq.ceq_id = ceq_id; info.dev = dev; - info.vsi = vsi; + info.vsi_idx = vsi_idx; status = irdma_sc_ceq_init(&iwceq->sc_ceq, &info); if (!status) { if (dev->ceq_valid) @@ -1252,7 +1305,7 @@ static int irdma_setup_ceq_0(struct irdma_pci_f *rf) } iwceq = &rf->ceqlist[0]; - status = irdma_create_ceq(rf, iwceq, 
0, &rf->default_vsi); + status = irdma_create_ceq(rf, iwceq, 0, rf->default_vsi.vsi_idx); if (status) { ibdev_dbg(&rf->iwdev->ibdev, "ERR: create ceq status = %d\n", status); @@ -1287,13 +1340,13 @@ exit: /** * irdma_setup_ceqs - manage the device ceq's and their interrupt resources * @rf: RDMA PCI function - * @vsi: VSI structure for this CEQ + * @vsi_idx: vsi_idx for this CEQ * * Allocate a list for all device completion event queues * Create the ceq's and configure their msix interrupt vectors * Return 0, if ceqs are successfully set up, otherwise return error */ -static int irdma_setup_ceqs(struct irdma_pci_f *rf, struct irdma_sc_vsi *vsi) +static int irdma_setup_ceqs(struct irdma_pci_f *rf, u16 vsi_idx) { u32 i; u32 ceq_id; @@ -1306,7 +1359,7 @@ static int irdma_setup_ceqs(struct irdma_pci_f *rf, struct irdma_sc_vsi *vsi) i = (rf->msix_shared) ? 1 : 2; for (ceq_id = 1; i < num_ceqs; i++, ceq_id++) { iwceq = &rf->ceqlist[ceq_id]; - status = irdma_create_ceq(rf, iwceq, ceq_id, vsi); + status = irdma_create_ceq(rf, iwceq, ceq_id, vsi_idx); if (status) { ibdev_dbg(&rf->iwdev->ibdev, "ERR: create ceq status = %d\n", status); @@ -1387,7 +1440,10 @@ static int irdma_create_aeq(struct irdma_pci_f *rf) aeq_size = multiplier * hmc_info->hmc_obj[IRDMA_HMC_IW_QP].cnt + hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].cnt; aeq_size = min(aeq_size, dev->hw_attrs.max_hw_aeq_size); - + /* GEN_3 does not support virtual AEQ. Cap at max Kernel alloc size */ + if (rf->rdma_ver == IRDMA_GEN_3) + aeq_size = min(aeq_size, (u32)((PAGE_SIZE << MAX_PAGE_ORDER) / + sizeof(struct irdma_sc_aeqe))); aeq->mem.size = ALIGN(sizeof(struct irdma_sc_aeqe) * aeq_size, IRDMA_AEQ_ALIGNMENT); aeq->mem.va = dma_alloc_coherent(dev->hw->device, aeq->mem.size, @@ -1395,6 +1451,8 @@ static int irdma_create_aeq(struct irdma_pci_f *rf) GFP_KERNEL | __GFP_NOWARN); if (aeq->mem.va) goto skip_virt_aeq; + else if (rf->rdma_ver == IRDMA_GEN_3) + return -ENOMEM; /* physically mapped aeq failed. 
setup virtual aeq */ status = irdma_create_virt_aeq(rf, aeq_size); @@ -1569,6 +1627,8 @@ static void irdma_del_init_mem(struct irdma_pci_f *rf) { struct irdma_sc_dev *dev = &rf->sc_dev; + if (!rf->sc_dev.privileged) + irdma_vchnl_req_put_hmc_fcn(&rf->sc_dev); kfree(dev->hmc_info->sd_table.sd_entry); dev->hmc_info->sd_table.sd_entry = NULL; vfree(rf->mem_rsrc); @@ -1635,6 +1695,7 @@ static int irdma_initialize_dev(struct irdma_pci_f *rf) info.bar0 = rf->hw.hw_addr; info.hmc_fn_id = rf->pf_id; + info.protocol_used = rf->protocol_used; info.hw = &rf->hw; status = irdma_sc_dev_init(rf->rdma_ver, &rf->sc_dev, &info); if (status) @@ -1665,9 +1726,6 @@ void irdma_rt_deinit_hw(struct irdma_device *iwdev) irdma_del_local_mac_entry(iwdev->rf, (u8)iwdev->mac_ip_table_idx); fallthrough; - case AEQ_CREATED: - case PBLE_CHUNK_MEM: - case CEQS_CREATED: case IEQ_CREATED: if (!iwdev->roce_mode) irdma_puda_dele_rsrc(&iwdev->vsi, IRDMA_PUDA_RSRC_TYPE_IEQ, @@ -1740,7 +1798,9 @@ static void irdma_get_used_rsrc(struct irdma_device *iwdev) iwdev->rf->used_qps = find_first_zero_bit(iwdev->rf->allocated_qps, iwdev->rf->max_qp); iwdev->rf->used_cqs = find_first_zero_bit(iwdev->rf->allocated_cqs, - iwdev->rf->max_cq); + iwdev->rf->max_cq); + iwdev->rf->used_srqs = find_first_zero_bit(iwdev->rf->allocated_srqs, + iwdev->rf->max_srq); iwdev->rf->used_mrs = find_first_zero_bit(iwdev->rf->allocated_mrs, iwdev->rf->max_mr); } @@ -1750,13 +1810,17 @@ void irdma_ctrl_deinit_hw(struct irdma_pci_f *rf) enum init_completion_state state = rf->init_state; rf->init_state = INVALID_STATE; - if (rf->rsrc_created) { + + switch (state) { + case AEQ_CREATED: irdma_destroy_aeq(rf); + fallthrough; + case PBLE_CHUNK_MEM: irdma_destroy_pble_prm(rf->pble_rsrc); + fallthrough; + case CEQS_CREATED: irdma_del_ceqs(rf); - rf->rsrc_created = false; - } - switch (state) { + fallthrough; case CEQ0_CREATED: irdma_del_ceq_0(rf); fallthrough; @@ -1835,32 +1899,6 @@ int irdma_rt_init_hw(struct irdma_device *iwdev, break; iwdev->init_state = IEQ_CREATED; } - if (!rf->rsrc_created) { - status = irdma_setup_ceqs(rf, &iwdev->vsi); - if (status) - break; - - iwdev->init_state = CEQS_CREATED; - - status = irdma_hmc_init_pble(&rf->sc_dev, - rf->pble_rsrc); - if (status) { - irdma_del_ceqs(rf); - break; - } - - iwdev->init_state = PBLE_CHUNK_MEM; - - status = irdma_setup_aeq(rf); - if (status) { - irdma_destroy_pble_prm(rf->pble_rsrc); - irdma_del_ceqs(rf); - break; - } - iwdev->init_state = AEQ_CREATED; - rf->rsrc_created = true; - } - if (iwdev->rf->sc_dev.hw_attrs.uk_attrs.hw_rev == IRDMA_GEN_1) irdma_alloc_set_mac(iwdev); irdma_add_ip(iwdev); @@ -1907,6 +1945,13 @@ int irdma_ctrl_init_hw(struct irdma_pci_f *rf) break; rf->init_state = CQP_CREATED; + dev->feature_info[IRDMA_FEATURE_FW_INFO] = IRDMA_FW_VER_DEFAULT; + if (rf->rdma_ver != IRDMA_GEN_1) { + status = irdma_get_rdma_features(dev); + if (status) + break; + } + status = irdma_hmc_setup(rf); if (status) break; @@ -1922,13 +1967,6 @@ int irdma_ctrl_init_hw(struct irdma_pci_f *rf) break; rf->init_state = CCQ_CREATED; - dev->feature_info[IRDMA_FEATURE_FW_INFO] = IRDMA_FW_VER_DEFAULT; - if (rf->rdma_ver != IRDMA_GEN_1) { - status = irdma_get_rdma_features(dev); - if (status) - break; - } - status = irdma_setup_ceq_0(rf); if (status) break; @@ -1942,6 +1980,25 @@ int irdma_ctrl_init_hw(struct irdma_pci_f *rf) } INIT_WORK(&rf->cqp_cmpl_work, cqp_compl_worker); irdma_sc_ccq_arm(dev->ccq); + + status = irdma_setup_ceqs(rf, rf->iwdev ? 
rf->iwdev->vsi_num : 0); + if (status) + break; + + rf->init_state = CEQS_CREATED; + + status = irdma_hmc_init_pble(&rf->sc_dev, + rf->pble_rsrc); + if (status) + break; + + rf->init_state = PBLE_CHUNK_MEM; + + status = irdma_setup_aeq(rf); + if (status) + break; + rf->init_state = AEQ_CREATED; + return 0; } while (0); @@ -1960,7 +2017,8 @@ static void irdma_set_hw_rsrc(struct irdma_pci_f *rf) rf->allocated_qps = (void *)(rf->mem_rsrc + (sizeof(struct irdma_arp_entry) * rf->arp_table_size)); rf->allocated_cqs = &rf->allocated_qps[BITS_TO_LONGS(rf->max_qp)]; - rf->allocated_mrs = &rf->allocated_cqs[BITS_TO_LONGS(rf->max_cq)]; + rf->allocated_srqs = &rf->allocated_cqs[BITS_TO_LONGS(rf->max_cq)]; + rf->allocated_mrs = &rf->allocated_srqs[BITS_TO_LONGS(rf->max_srq)]; rf->allocated_pds = &rf->allocated_mrs[BITS_TO_LONGS(rf->max_mr)]; rf->allocated_ahs = &rf->allocated_pds[BITS_TO_LONGS(rf->max_pd)]; rf->allocated_mcgs = &rf->allocated_ahs[BITS_TO_LONGS(rf->max_ah)]; @@ -1988,12 +2046,14 @@ static u32 irdma_calc_mem_rsrc_size(struct irdma_pci_f *rf) rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_qp); rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_mr); rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_cq); + rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_srq); rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_pd); rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->arp_table_size); rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_ah); rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_mcg); rsrc_size += sizeof(struct irdma_qp **) * rf->max_qp; rsrc_size += sizeof(struct irdma_cq **) * rf->max_cq; + rsrc_size += sizeof(struct irdma_srq **) * rf->max_srq; return rsrc_size; } @@ -2021,6 +2081,7 @@ u32 irdma_initialize_hw_rsrc(struct irdma_pci_f *rf) rf->max_qp = rf->sc_dev.hmc_info->hmc_obj[IRDMA_HMC_IW_QP].cnt; rf->max_mr = rf->sc_dev.hmc_info->hmc_obj[IRDMA_HMC_IW_MR].cnt; rf->max_cq = rf->sc_dev.hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].cnt; + rf->max_srq = rf->sc_dev.hmc_info->hmc_obj[IRDMA_HMC_IW_SRQ].cnt; rf->max_pd = rf->sc_dev.hw_attrs.max_hw_pds; rf->arp_table_size = rf->sc_dev.hmc_info->hmc_obj[IRDMA_HMC_IW_ARP].cnt; rf->max_ah = rf->sc_dev.hmc_info->hmc_obj[IRDMA_HMC_IW_FSIAV].cnt; @@ -2040,6 +2101,7 @@ u32 irdma_initialize_hw_rsrc(struct irdma_pci_f *rf) set_bit(0, rf->allocated_mrs); set_bit(0, rf->allocated_qps); set_bit(0, rf->allocated_cqs); + set_bit(0, rf->allocated_srqs); set_bit(0, rf->allocated_pds); set_bit(0, rf->allocated_arps); set_bit(0, rf->allocated_ahs); @@ -2100,15 +2162,16 @@ void irdma_cqp_ce_handler(struct irdma_pci_f *rf, struct irdma_sc_cq *cq) cqp_request->compl_info.op_ret_val = info.op_ret_val; cqp_request->compl_info.error = info.error; - if (cqp_request->waiting) { - WRITE_ONCE(cqp_request->request_done, true); - wake_up(&cqp_request->waitq); - irdma_put_cqp_request(&rf->cqp, cqp_request); - } else { - if (cqp_request->callback_fcn) - cqp_request->callback_fcn(cqp_request); - irdma_put_cqp_request(&rf->cqp, cqp_request); - } + /* + * If this is deferred or pending completion, then mark + * CQP request as pending to not block the CQ, but don't + * release CQP request, as it is still on the OOO list. 
+ */ + if (info.pending) + cqp_request->pending = true; + else + irdma_complete_cqp_request(&rf->cqp, + cqp_request); } cqe_count++; @@ -2718,7 +2781,9 @@ void irdma_flush_wqes(struct irdma_qp *iwqp, u32 flush_mask) struct irdma_pci_f *rf = iwqp->iwdev->rf; u8 flush_code = iwqp->sc_qp.flush_code; - if (!(flush_mask & IRDMA_FLUSH_SQ) && !(flush_mask & IRDMA_FLUSH_RQ)) + if ((!(flush_mask & IRDMA_FLUSH_SQ) && + !(flush_mask & IRDMA_FLUSH_RQ)) || + ((flush_mask & IRDMA_REFLUSH) && rf->rdma_ver >= IRDMA_GEN_3)) return; /* Set flush info fields*/ @@ -2731,6 +2796,10 @@ void irdma_flush_wqes(struct irdma_qp *iwqp, u32 flush_mask) info.rq_major_code = IRDMA_FLUSH_MAJOR_ERR; info.rq_minor_code = FLUSH_GENERAL_ERR; info.userflushcode = true; + info.err_sq_idx_valid = iwqp->sc_qp.err_sq_idx_valid; + info.err_sq_idx = iwqp->sc_qp.err_sq_idx; + info.err_rq_idx_valid = iwqp->sc_qp.err_rq_idx_valid; + info.err_rq_idx = iwqp->sc_qp.err_rq_idx; if (flush_mask & IRDMA_REFLUSH) { if (info.sq) diff --git a/drivers/infiniband/hw/irdma/i40iw_hw.c b/drivers/infiniband/hw/irdma/i40iw_hw.c index ce61a27cb1f6..60c1f2b1811d 100644 --- a/drivers/infiniband/hw/irdma/i40iw_hw.c +++ b/drivers/infiniband/hw/irdma/i40iw_hw.c @@ -85,6 +85,7 @@ static u64 i40iw_masks[IRDMA_MAX_MASKS] = { I40E_CQPSQ_CQ_CEQID, I40E_CQPSQ_CQ_CQID, I40E_COMMIT_FPM_CQCNT, + I40E_CQPSQ_UPESD_HMCFNID, }; static u64 i40iw_shifts[IRDMA_MAX_SHIFTS] = { @@ -94,6 +95,7 @@ static u64 i40iw_shifts[IRDMA_MAX_SHIFTS] = { I40E_CQPSQ_CQ_CEQID_S, I40E_CQPSQ_CQ_CQID_S, I40E_COMMIT_FPM_CQCNT_S, + I40E_CQPSQ_UPESD_HMCFNID_S, }; /** diff --git a/drivers/infiniband/hw/irdma/i40iw_hw.h b/drivers/infiniband/hw/irdma/i40iw_hw.h index e1db84d8a62c..0095b327afcc 100644 --- a/drivers/infiniband/hw/irdma/i40iw_hw.h +++ b/drivers/infiniband/hw/irdma/i40iw_hw.h @@ -123,6 +123,8 @@ #define I40E_CQPSQ_CQ_CQID GENMASK_ULL(15, 0) #define I40E_COMMIT_FPM_CQCNT_S 0 #define I40E_COMMIT_FPM_CQCNT GENMASK_ULL(17, 0) +#define I40E_CQPSQ_UPESD_HMCFNID_S 0 +#define I40E_CQPSQ_UPESD_HMCFNID GENMASK_ULL(5, 0) #define I40E_VSIQF_CTL(_VSI) (0x0020D800 + ((_VSI) * 4)) diff --git a/drivers/infiniband/hw/irdma/i40iw_if.c b/drivers/infiniband/hw/irdma/i40iw_if.c index cc50a7070371..15e036ddaffb 100644 --- a/drivers/infiniband/hw/irdma/i40iw_if.c +++ b/drivers/infiniband/hw/irdma/i40iw_if.c @@ -75,6 +75,9 @@ static void i40iw_fill_device_info(struct irdma_device *iwdev, struct i40e_info struct irdma_pci_f *rf = iwdev->rf; rf->rdma_ver = IRDMA_GEN_1; + rf->sc_dev.hw = &rf->hw; + rf->sc_dev.hw_attrs.uk_attrs.hw_rev = IRDMA_GEN_1; + rf->sc_dev.privileged = true; rf->gen_ops.request_reset = i40iw_request_reset; rf->pcidev = cdev_info->pcidev; rf->pf_id = cdev_info->fid; diff --git a/drivers/infiniband/hw/irdma/icrdma_hw.c b/drivers/infiniband/hw/irdma/icrdma_hw.c index 941d3edffadb..32f26284a788 100644 --- a/drivers/infiniband/hw/irdma/icrdma_hw.c +++ b/drivers/infiniband/hw/irdma/icrdma_hw.c @@ -38,6 +38,7 @@ static u64 icrdma_masks[IRDMA_MAX_MASKS] = { ICRDMA_CQPSQ_CQ_CEQID, ICRDMA_CQPSQ_CQ_CQID, ICRDMA_COMMIT_FPM_CQCNT, + ICRDMA_CQPSQ_UPESD_HMCFNID, }; static u64 icrdma_shifts[IRDMA_MAX_SHIFTS] = { @@ -47,6 +48,7 @@ static u64 icrdma_shifts[IRDMA_MAX_SHIFTS] = { ICRDMA_CQPSQ_CQ_CEQID_S, ICRDMA_CQPSQ_CQ_CQID_S, ICRDMA_COMMIT_FPM_CQCNT_S, + ICRDMA_CQPSQ_UPESD_HMCFNID_S, }; /** @@ -194,6 +196,7 @@ void icrdma_init_hw(struct irdma_sc_dev *dev) dev->hw_attrs.max_hw_ord = ICRDMA_MAX_ORD_SIZE; dev->hw_attrs.max_stat_inst = ICRDMA_MAX_STATS_COUNT; dev->hw_attrs.max_stat_idx = 
IRDMA_HW_STAT_INDEX_MAX_GEN_2; + dev->hw_attrs.max_hw_device_pages = ICRDMA_MAX_PUSH_PAGE_COUNT; dev->hw_attrs.uk_attrs.min_hw_wq_size = ICRDMA_MIN_WQ_SIZE; dev->hw_attrs.uk_attrs.max_hw_sq_chunk = IRDMA_MAX_QUANTA_PER_WR; diff --git a/drivers/infiniband/hw/irdma/icrdma_hw.h b/drivers/infiniband/hw/irdma/icrdma_hw.h index 697b9572b5c6..d97944ab45da 100644 --- a/drivers/infiniband/hw/irdma/icrdma_hw.h +++ b/drivers/infiniband/hw/irdma/icrdma_hw.h @@ -58,14 +58,15 @@ #define ICRDMA_CQPSQ_CQ_CQID GENMASK_ULL(18, 0) #define ICRDMA_COMMIT_FPM_CQCNT_S 0 #define ICRDMA_COMMIT_FPM_CQCNT GENMASK_ULL(19, 0) - +#define ICRDMA_CQPSQ_UPESD_HMCFNID_S 0 +#define ICRDMA_CQPSQ_UPESD_HMCFNID GENMASK_ULL(5, 0) enum icrdma_device_caps_const { ICRDMA_MAX_STATS_COUNT = 128, ICRDMA_MAX_IRD_SIZE = 127, ICRDMA_MAX_ORD_SIZE = 255, ICRDMA_MIN_WQ_SIZE = 8 /* WQEs */, - + ICRDMA_MAX_PUSH_PAGE_COUNT = 256, }; void icrdma_init_hw(struct irdma_sc_dev *dev); diff --git a/drivers/infiniband/hw/irdma/icrdma_if.c b/drivers/infiniband/hw/irdma/icrdma_if.c new file mode 100644 index 000000000000..27b191f61caf --- /dev/null +++ b/drivers/infiniband/hw/irdma/icrdma_if.c @@ -0,0 +1,343 @@ +// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB +/* Copyright (c) 2015 - 2024 Intel Corporation */ + +#include "main.h" +#include <linux/net/intel/iidc_rdma_ice.h> + +static void icrdma_prep_tc_change(struct irdma_device *iwdev) +{ + iwdev->vsi.tc_change_pending = true; + irdma_sc_suspend_resume_qps(&iwdev->vsi, IRDMA_OP_SUSPEND); + + /* Wait for all qp's to suspend */ + wait_event_timeout(iwdev->suspend_wq, + !atomic_read(&iwdev->vsi.qp_suspend_reqs), + msecs_to_jiffies(IRDMA_EVENT_TIMEOUT_MS)); + irdma_ws_reset(&iwdev->vsi); +} + +static void icrdma_fill_qos_info(struct irdma_l2params *l2params, + struct iidc_rdma_qos_params *qos_info) +{ + int i; + + l2params->num_tc = qos_info->num_tc; + l2params->vsi_prio_type = qos_info->vport_priority_type; + l2params->vsi_rel_bw = qos_info->vport_relative_bw; + for (i = 0; i < l2params->num_tc; i++) { + l2params->tc_info[i].egress_virt_up = + qos_info->tc_info[i].egress_virt_up; + l2params->tc_info[i].ingress_virt_up = + qos_info->tc_info[i].ingress_virt_up; + l2params->tc_info[i].prio_type = qos_info->tc_info[i].prio_type; + l2params->tc_info[i].rel_bw = qos_info->tc_info[i].rel_bw; + l2params->tc_info[i].tc_ctx = qos_info->tc_info[i].tc_ctx; + } + for (i = 0; i < IIDC_MAX_USER_PRIORITY; i++) + l2params->up2tc[i] = qos_info->up2tc[i]; + if (qos_info->pfc_mode == IIDC_DSCP_PFC_MODE) { + l2params->dscp_mode = true; + memcpy(l2params->dscp_map, qos_info->dscp_map, sizeof(l2params->dscp_map)); + } +} + +static void icrdma_iidc_event_handler(struct iidc_rdma_core_dev_info *cdev_info, + struct iidc_rdma_event *event) +{ + struct irdma_device *iwdev = dev_get_drvdata(&cdev_info->adev->dev); + struct irdma_l2params l2params = {}; + + if (*event->type & BIT(IIDC_RDMA_EVENT_AFTER_MTU_CHANGE)) { + ibdev_dbg(&iwdev->ibdev, "CLNT: new MTU = %d\n", iwdev->netdev->mtu); + if (iwdev->vsi.mtu != iwdev->netdev->mtu) { + l2params.mtu = iwdev->netdev->mtu; + l2params.mtu_changed = true; + irdma_log_invalid_mtu(l2params.mtu, &iwdev->rf->sc_dev); + irdma_change_l2params(&iwdev->vsi, &l2params); + } + } else if (*event->type & BIT(IIDC_RDMA_EVENT_BEFORE_TC_CHANGE)) { + if (iwdev->vsi.tc_change_pending) + return; + + icrdma_prep_tc_change(iwdev); + } else if (*event->type & BIT(IIDC_RDMA_EVENT_AFTER_TC_CHANGE)) { + struct iidc_rdma_priv_dev_info *idc_priv = cdev_info->iidc_priv; + + if (!iwdev->vsi.tc_change_pending) + 
return; + + l2params.tc_changed = true; + ibdev_dbg(&iwdev->ibdev, "CLNT: TC Change\n"); + + icrdma_fill_qos_info(&l2params, &idc_priv->qos_info); + if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) + iwdev->dcb_vlan_mode = + l2params.num_tc > 1 && !l2params.dscp_mode; + irdma_change_l2params(&iwdev->vsi, &l2params); + } else if (*event->type & BIT(IIDC_RDMA_EVENT_CRIT_ERR)) { + ibdev_warn(&iwdev->ibdev, "ICE OICR event notification: oicr = 0x%08x\n", + event->reg); + if (event->reg & IRDMAPFINT_OICR_PE_CRITERR_M) { + u32 pe_criterr; + + pe_criterr = readl(iwdev->rf->sc_dev.hw_regs[IRDMA_GLPE_CRITERR]); +#define IRDMA_Q1_RESOURCE_ERR 0x0001024d + if (pe_criterr != IRDMA_Q1_RESOURCE_ERR) { + ibdev_err(&iwdev->ibdev, "critical PE Error, GLPE_CRITERR=0x%08x\n", + pe_criterr); + iwdev->rf->reset = true; + } else { + ibdev_warn(&iwdev->ibdev, "Q1 Resource Check\n"); + } + } + if (event->reg & IRDMAPFINT_OICR_HMC_ERR_M) { + ibdev_err(&iwdev->ibdev, "HMC Error\n"); + iwdev->rf->reset = true; + } + if (event->reg & IRDMAPFINT_OICR_PE_PUSH_M) { + ibdev_err(&iwdev->ibdev, "PE Push Error\n"); + iwdev->rf->reset = true; + } + if (iwdev->rf->reset) + iwdev->rf->gen_ops.request_reset(iwdev->rf); + } +} + +/** + * icrdma_lan_register_qset - Register qset with LAN driver + * @vsi: vsi structure + * @tc_node: Traffic class node + */ +static int icrdma_lan_register_qset(struct irdma_sc_vsi *vsi, + struct irdma_ws_node *tc_node) +{ + struct irdma_device *iwdev = vsi->back_vsi; + struct iidc_rdma_core_dev_info *cdev_info = iwdev->rf->cdev; + struct iidc_rdma_qset_params qset = {}; + int ret; + + qset.qs_handle = tc_node->qs_handle; + qset.tc = tc_node->traffic_class; + qset.vport_id = vsi->vsi_idx; + ret = ice_add_rdma_qset(cdev_info, &qset); + if (ret) { + ibdev_dbg(&iwdev->ibdev, "WS: LAN alloc_res for rdma qset failed.\n"); + return ret; + } + + tc_node->l2_sched_node_id = qset.teid; + vsi->qos[tc_node->user_pri].l2_sched_node_id = qset.teid; + + return 0; +} + +/** + * icrdma_lan_unregister_qset - Unregister qset with LAN driver + * @vsi: vsi structure + * @tc_node: Traffic class node + */ +static void icrdma_lan_unregister_qset(struct irdma_sc_vsi *vsi, + struct irdma_ws_node *tc_node) +{ + struct irdma_device *iwdev = vsi->back_vsi; + struct iidc_rdma_core_dev_info *cdev_info = iwdev->rf->cdev; + struct iidc_rdma_qset_params qset = {}; + + qset.qs_handle = tc_node->qs_handle; + qset.tc = tc_node->traffic_class; + qset.vport_id = vsi->vsi_idx; + qset.teid = tc_node->l2_sched_node_id; + + if (ice_del_rdma_qset(cdev_info, &qset)) + ibdev_dbg(&iwdev->ibdev, "WS: LAN free_res for rdma qset failed.\n"); +} + +/** + * icrdma_request_reset - Request a reset + * @rf: RDMA PCI function + */ +static void icrdma_request_reset(struct irdma_pci_f *rf) +{ + ibdev_warn(&rf->iwdev->ibdev, "Requesting a reset\n"); + ice_rdma_request_reset(rf->cdev, IIDC_FUNC_RESET); +} + +static int icrdma_init_interrupts(struct irdma_pci_f *rf, struct iidc_rdma_core_dev_info *cdev) +{ + int i; + + rf->msix_count = num_online_cpus() + IRDMA_NUM_AEQ_MSIX; + rf->msix_entries = kcalloc(rf->msix_count, sizeof(*rf->msix_entries), + GFP_KERNEL); + if (!rf->msix_entries) + return -ENOMEM; + + for (i = 0; i < rf->msix_count; i++) + if (ice_alloc_rdma_qvector(cdev, &rf->msix_entries[i])) + break; + + if (i < IRDMA_MIN_MSIX) { + while (--i >= 0) + ice_free_rdma_qvector(cdev, &rf->msix_entries[i]); + + kfree(rf->msix_entries); + return -ENOMEM; + } + + rf->msix_count = i; + + return 0; +} + +static void icrdma_deinit_interrupts(struct 
irdma_pci_f *rf, struct iidc_rdma_core_dev_info *cdev) +{ + int i; + + for (i = 0; i < rf->msix_count; i++) + ice_free_rdma_qvector(cdev, &rf->msix_entries[i]); + + kfree(rf->msix_entries); +} + +static void icrdma_fill_device_info(struct irdma_device *iwdev, + struct iidc_rdma_core_dev_info *cdev_info) +{ + struct iidc_rdma_priv_dev_info *idc_priv = cdev_info->iidc_priv; + struct irdma_pci_f *rf = iwdev->rf; + + rf->sc_dev.hw = &rf->hw; + rf->iwdev = iwdev; + rf->cdev = cdev_info; + rf->hw.hw_addr = idc_priv->hw_addr; + rf->pcidev = cdev_info->pdev; + rf->hw.device = &rf->pcidev->dev; + rf->pf_id = idc_priv->pf_id; + rf->rdma_ver = IRDMA_GEN_2; + rf->sc_dev.hw_attrs.uk_attrs.hw_rev = IRDMA_GEN_2; + rf->sc_dev.is_pf = true; + rf->sc_dev.privileged = true; + + rf->gen_ops.register_qset = icrdma_lan_register_qset; + rf->gen_ops.unregister_qset = icrdma_lan_unregister_qset; + + rf->default_vsi.vsi_idx = idc_priv->vport_id; + rf->protocol_used = + cdev_info->rdma_protocol == IIDC_RDMA_PROTOCOL_ROCEV2 ? + IRDMA_ROCE_PROTOCOL_ONLY : IRDMA_IWARP_PROTOCOL_ONLY; + rf->rsrc_profile = IRDMA_HMC_PROFILE_DEFAULT; + rf->rst_to = IRDMA_RST_TIMEOUT_HZ; + rf->gen_ops.request_reset = icrdma_request_reset; + rf->limits_sel = 7; + mutex_init(&rf->ah_tbl_lock); + + iwdev->netdev = idc_priv->netdev; + iwdev->vsi_num = idc_priv->vport_id; + iwdev->init_state = INITIAL_STATE; + iwdev->roce_cwnd = IRDMA_ROCE_CWND_DEFAULT; + iwdev->roce_ackcreds = IRDMA_ROCE_ACKCREDS_DEFAULT; + iwdev->rcv_wnd = IRDMA_CM_DEFAULT_RCV_WND_SCALED; + iwdev->rcv_wscale = IRDMA_CM_DEFAULT_RCV_WND_SCALE; + if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) + iwdev->roce_mode = true; +} + +static int icrdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_device_id *id) +{ + struct iidc_rdma_core_auxiliary_dev *iidc_adev; + struct iidc_rdma_core_dev_info *cdev_info; + struct iidc_rdma_priv_dev_info *idc_priv; + struct irdma_l2params l2params = {}; + struct irdma_device *iwdev; + struct irdma_pci_f *rf; + int err; + + iidc_adev = container_of(aux_dev, struct iidc_rdma_core_auxiliary_dev, adev); + cdev_info = iidc_adev->cdev_info; + idc_priv = cdev_info->iidc_priv; + + iwdev = ib_alloc_device(irdma_device, ibdev); + if (!iwdev) + return -ENOMEM; + iwdev->rf = kzalloc(sizeof(*rf), GFP_KERNEL); + if (!iwdev->rf) { + ib_dealloc_device(&iwdev->ibdev); + return -ENOMEM; + } + + icrdma_fill_device_info(iwdev, cdev_info); + rf = iwdev->rf; + + err = icrdma_init_interrupts(rf, cdev_info); + if (err) + goto err_init_interrupts; + + err = irdma_ctrl_init_hw(rf); + if (err) + goto err_ctrl_init; + + l2params.mtu = iwdev->netdev->mtu; + icrdma_fill_qos_info(&l2params, &idc_priv->qos_info); + if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) + iwdev->dcb_vlan_mode = l2params.num_tc > 1 && !l2params.dscp_mode; + + err = irdma_rt_init_hw(iwdev, &l2params); + if (err) + goto err_rt_init; + + err = irdma_ib_register_device(iwdev); + if (err) + goto err_ibreg; + + ice_rdma_update_vsi_filter(cdev_info, iwdev->vsi_num, true); + + ibdev_dbg(&iwdev->ibdev, "INIT: Gen2 PF[%d] device probe success\n", PCI_FUNC(rf->pcidev->devfn)); + auxiliary_set_drvdata(aux_dev, iwdev); + + return 0; + +err_ibreg: + irdma_rt_deinit_hw(iwdev); +err_rt_init: + irdma_ctrl_deinit_hw(rf); +err_ctrl_init: + icrdma_deinit_interrupts(rf, cdev_info); +err_init_interrupts: + kfree(iwdev->rf); + ib_dealloc_device(&iwdev->ibdev); + + return err; +} + +static void icrdma_remove(struct auxiliary_device *aux_dev) +{ + struct iidc_rdma_core_auxiliary_dev *idc_adev = 
+ container_of(aux_dev, struct iidc_rdma_core_auxiliary_dev, adev); + struct iidc_rdma_core_dev_info *cdev_info = idc_adev->cdev_info; + struct irdma_device *iwdev = auxiliary_get_drvdata(aux_dev); + u8 rdma_ver = iwdev->rf->rdma_ver; + + ice_rdma_update_vsi_filter(cdev_info, iwdev->vsi_num, false); + irdma_ib_unregister_device(iwdev); + icrdma_deinit_interrupts(iwdev->rf, cdev_info); + + pr_debug("INIT: Gen[%d] func[%d] device remove success\n", + rdma_ver, PCI_FUNC(cdev_info->pdev->devfn)); +} + +static const struct auxiliary_device_id icrdma_auxiliary_id_table[] = { + {.name = "ice.iwarp", }, + {.name = "ice.roce", }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary, icrdma_auxiliary_id_table); + +struct iidc_rdma_core_auxiliary_drv icrdma_core_auxiliary_drv = { + .adrv = { + .name = "gen_2", + .id_table = icrdma_auxiliary_id_table, + .probe = icrdma_probe, + .remove = icrdma_remove, + }, + .event_handler = icrdma_iidc_event_handler, +}; diff --git a/drivers/infiniband/hw/irdma/ig3rdma_hw.c b/drivers/infiniband/hw/irdma/ig3rdma_hw.c new file mode 100644 index 000000000000..2e8bb475e22a --- /dev/null +++ b/drivers/infiniband/hw/irdma/ig3rdma_hw.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB +/* Copyright (c) 2018 - 2024 Intel Corporation */ +#include "osdep.h" +#include "type.h" +#include "protos.h" +#include "ig3rdma_hw.h" + +/** + * ig3rdma_ena_irq - Enable interrupt + * @dev: pointer to the device structure + * @idx: vector index + */ +static void ig3rdma_ena_irq(struct irdma_sc_dev *dev, u32 idx) +{ + u32 val; + u32 int_stride = 1; /* one u32 per register */ + + if (dev->is_pf) + int_stride = 0x400; + else + idx--; /* VFs use DYN_CTL_N */ + + val = FIELD_PREP(IRDMA_GLINT_DYN_CTL_INTENA, 1) | + FIELD_PREP(IRDMA_GLINT_DYN_CTL_CLEARPBA, 1); + + writel(val, dev->hw_regs[IRDMA_GLINT_DYN_CTL] + (idx * int_stride)); +} + +/** + * ig3rdma_disable_irq - Disable interrupt + * @dev: pointer to the device structure + * @idx: vector index + */ +static void ig3rdma_disable_irq(struct irdma_sc_dev *dev, u32 idx) +{ + u32 int_stride = 1; /* one u32 per register */ + + if (dev->is_pf) + int_stride = 0x400; + else + idx--; /* VFs use DYN_CTL_N */ + + writel(0, dev->hw_regs[IRDMA_GLINT_DYN_CTL] + (idx * int_stride)); +} + +static const struct irdma_irq_ops ig3rdma_irq_ops = { + .irdma_dis_irq = ig3rdma_disable_irq, + .irdma_en_irq = ig3rdma_ena_irq, +}; + +static const struct irdma_hw_stat_map ig3rdma_hw_stat_map[] = { + [IRDMA_HW_STAT_INDEX_RXVLANERR] = { 0, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4RXOCTS] = { 8, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4RXPKTS] = { 16, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4RXDISCARD] = { 24, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4RXTRUNC] = { 32, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4RXFRAGS] = { 40, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4RXMCOCTS] = { 48, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4RXMCPKTS] = { 56, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6RXOCTS] = { 64, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6RXPKTS] = { 72, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6RXDISCARD] = { 80, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6RXTRUNC] = { 88, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6RXFRAGS] = { 96, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6RXMCOCTS] = { 104, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6RXMCPKTS] = { 112, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4TXOCTS] = { 120, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4TXPKTS] = { 128, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4TXFRAGS] = { 136, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4TXMCOCTS] = { 144, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4TXMCPKTS] = { 152, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6TXOCTS] = { 160, 0, 0 }, + 
[IRDMA_HW_STAT_INDEX_IP6TXPKTS] = { 168, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6TXFRAGS] = { 176, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6TXMCOCTS] = { 184, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6TXMCPKTS] = { 192, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4TXNOROUTE] = { 200, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6TXNOROUTE] = { 208, 0, 0 }, + [IRDMA_HW_STAT_INDEX_TCPRTXSEG] = { 216, 0, 0 }, + [IRDMA_HW_STAT_INDEX_TCPRXOPTERR] = { 224, 0, 0 }, + [IRDMA_HW_STAT_INDEX_TCPRXPROTOERR] = { 232, 0, 0 }, + [IRDMA_HW_STAT_INDEX_TCPTXSEG] = { 240, 0, 0 }, + [IRDMA_HW_STAT_INDEX_TCPRXSEGS] = { 248, 0, 0 }, + [IRDMA_HW_STAT_INDEX_UDPRXPKTS] = { 256, 0, 0 }, + [IRDMA_HW_STAT_INDEX_UDPTXPKTS] = { 264, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMARXWRS] = { 272, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMARXRDS] = { 280, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMARXSNDS] = { 288, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMATXWRS] = { 296, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMATXRDS] = { 304, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMATXSNDS] = { 312, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMAVBND] = { 320, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMAVINV] = { 328, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RXNPECNMARKEDPKTS] = { 336, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RXRPCNPHANDLED] = { 344, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RXRPCNPIGNORED] = { 352, 0, 0 }, + [IRDMA_HW_STAT_INDEX_TXNPCNPSENT] = { 360, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RNR_SENT] = { 368, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RNR_RCVD] = { 376, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMAORDLMTCNT] = { 384, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMAIRDLMTCNT] = { 392, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMARXATS] = { 408, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMATXATS] = { 416, 0, 0 }, + [IRDMA_HW_STAT_INDEX_NAKSEQERR] = { 424, 0, 0 }, + [IRDMA_HW_STAT_INDEX_NAKSEQERR_IMPLIED] = { 432, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RTO] = { 440, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RXOOOPKTS] = { 448, 0, 0 }, + [IRDMA_HW_STAT_INDEX_ICRCERR] = { 456, 0, 0 }, +}; + +void ig3rdma_init_hw(struct irdma_sc_dev *dev) +{ + dev->irq_ops = &ig3rdma_irq_ops; + dev->hw_stats_map = ig3rdma_hw_stat_map; + + dev->hw_attrs.uk_attrs.hw_rev = IRDMA_GEN_3; + dev->hw_attrs.uk_attrs.max_hw_wq_frags = IG3RDMA_MAX_WQ_FRAGMENT_COUNT; + dev->hw_attrs.uk_attrs.max_hw_read_sges = IG3RDMA_MAX_SGE_RD; + dev->hw_attrs.uk_attrs.max_hw_sq_chunk = IRDMA_MAX_QUANTA_PER_WR; + dev->hw_attrs.first_hw_vf_fpm_id = 0; + dev->hw_attrs.max_hw_vf_fpm_id = IG3_MAX_APFS + IG3_MAX_AVFS; + dev->hw_attrs.uk_attrs.feature_flags |= IRDMA_FEATURE_64_BYTE_CQE; + dev->hw_attrs.uk_attrs.feature_flags |= IRDMA_FEATURE_CQE_TIMESTAMPING; + + dev->hw_attrs.uk_attrs.feature_flags |= IRDMA_FEATURE_SRQ; + dev->hw_attrs.uk_attrs.feature_flags |= IRDMA_FEATURE_RTS_AE | + IRDMA_FEATURE_CQ_RESIZE; + dev->hw_attrs.page_size_cap = SZ_4K | SZ_2M | SZ_1G; + dev->hw_attrs.max_hw_ird = IG3RDMA_MAX_IRD_SIZE; + dev->hw_attrs.max_hw_ord = IG3RDMA_MAX_ORD_SIZE; + dev->hw_attrs.max_stat_inst = IG3RDMA_MAX_STATS_COUNT; + dev->hw_attrs.max_stat_idx = IRDMA_HW_STAT_INDEX_MAX_GEN_3; + dev->hw_attrs.uk_attrs.min_hw_wq_size = IG3RDMA_MIN_WQ_SIZE; + dev->hw_attrs.uk_attrs.max_hw_srq_quanta = IRDMA_SRQ_MAX_QUANTA; + dev->hw_attrs.uk_attrs.max_hw_inline = IG3RDMA_MAX_INLINE_DATA_SIZE; + dev->hw_attrs.max_hw_device_pages = + dev->is_pf ? 
IG3RDMA_MAX_PF_PUSH_PAGE_COUNT : IG3RDMA_MAX_VF_PUSH_PAGE_COUNT; +} + +static void __iomem *__ig3rdma_get_reg_addr(struct irdma_mmio_region *region, u64 reg_offset) +{ + if (reg_offset >= region->offset && + reg_offset < (region->offset + region->len)) { + reg_offset -= region->offset; + + return region->addr + reg_offset; + } + + return NULL; +} + +void __iomem *ig3rdma_get_reg_addr(struct irdma_hw *hw, u64 reg_offset) +{ + u8 __iomem *reg_addr; + int i; + + reg_addr = __ig3rdma_get_reg_addr(&hw->rdma_reg, reg_offset); + if (reg_addr) + return reg_addr; + + for (i = 0; i < hw->num_io_regions; i++) { + reg_addr = __ig3rdma_get_reg_addr(&hw->io_regs[i], reg_offset); + if (reg_addr) + return reg_addr; + } + + WARN_ON_ONCE(1); + + return NULL; +} diff --git a/drivers/infiniband/hw/irdma/ig3rdma_hw.h b/drivers/infiniband/hw/irdma/ig3rdma_hw.h new file mode 100644 index 000000000000..03d5f1188789 --- /dev/null +++ b/drivers/infiniband/hw/irdma/ig3rdma_hw.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB */ +/* Copyright (c) 2021 - 2024 Intel Corporation */ +#ifndef IG3RDMA_HW_H +#define IG3RDMA_HW_H + +#define IG3_MAX_APFS 1 +#define IG3_MAX_AVFS 0 + +#define IG3_PF_RDMA_REGION_OFFSET 0xBC00000 +#define IG3_PF_RDMA_REGION_LEN 0x401000 +#define IG3_VF_RDMA_REGION_OFFSET 0x8C00 +#define IG3_VF_RDMA_REGION_LEN 0x8400 + +enum ig3rdma_device_caps_const { + IG3RDMA_MAX_WQ_FRAGMENT_COUNT = 14, + IG3RDMA_MAX_SGE_RD = 14, + + IG3RDMA_MAX_STATS_COUNT = 128, + + IG3RDMA_MAX_IRD_SIZE = 64, + IG3RDMA_MAX_ORD_SIZE = 64, + IG3RDMA_MIN_WQ_SIZE = 16 /* WQEs */, + IG3RDMA_MAX_INLINE_DATA_SIZE = 216, + IG3RDMA_MAX_PF_PUSH_PAGE_COUNT = 8192, + IG3RDMA_MAX_VF_PUSH_PAGE_COUNT = 16, +}; + +void __iomem *ig3rdma_get_reg_addr(struct irdma_hw *hw, u64 reg_offset); +int ig3rdma_vchnl_send_sync(struct irdma_sc_dev *dev, u8 *msg, u16 len, + u8 *recv_msg, u16 *recv_len); + +#endif /* IG3RDMA_HW_H*/ diff --git a/drivers/infiniband/hw/irdma/ig3rdma_if.c b/drivers/infiniband/hw/irdma/ig3rdma_if.c new file mode 100644 index 000000000000..1bb42eb298ba --- /dev/null +++ b/drivers/infiniband/hw/irdma/ig3rdma_if.c @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB +/* Copyright (c) 2023 - 2024 Intel Corporation */ + +#include "main.h" +#include <linux/net/intel/iidc_rdma_idpf.h> +#include "ig3rdma_hw.h" + +static void ig3rdma_idc_core_event_handler(struct iidc_rdma_core_dev_info *cdev_info, + struct iidc_rdma_event *event) +{ + struct irdma_pci_f *rf = auxiliary_get_drvdata(cdev_info->adev); + + if (*event->type & BIT(IIDC_RDMA_EVENT_WARN_RESET)) { + rf->reset = true; + rf->sc_dev.vchnl_up = false; + } +} + +int ig3rdma_vchnl_send_sync(struct irdma_sc_dev *dev, u8 *msg, u16 len, + u8 *recv_msg, u16 *recv_len) +{ + struct iidc_rdma_core_dev_info *cdev_info = dev_to_rf(dev)->cdev; + int ret; + + ret = idpf_idc_rdma_vc_send_sync(cdev_info, msg, len, recv_msg, + recv_len); + if (ret == -ETIMEDOUT) { + ibdev_err(&(dev_to_rf(dev)->iwdev->ibdev), + "Virtual channel Req <-> Resp completion timeout\n"); + dev->vchnl_up = false; + } + + return ret; +} + +static int ig3rdma_vchnl_init(struct irdma_pci_f *rf, + struct iidc_rdma_core_dev_info *cdev_info, + u8 *rdma_ver) +{ + struct iidc_rdma_priv_dev_info *idc_priv = cdev_info->iidc_priv; + struct irdma_vchnl_init_info virt_info; + u8 gen = rf->rdma_ver; + int ret; + + rf->vchnl_wq = alloc_ordered_workqueue("irdma-virtchnl-wq", 0); + if (!rf->vchnl_wq) + return -ENOMEM; + + mutex_init(&rf->sc_dev.vchnl_mutex); + + virt_info.is_pf = !idc_priv->ftype; + 
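/*
+ * Seed hw_rev with the driver's starting generation; the level actually
+ * negotiated over the virtual channel is read back from
+ * sc_dev.hw_attrs.uk_attrs.hw_rev and returned through *rdma_ver.
+ */
+ 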
virt_info.hw_rev = gen; + virt_info.privileged = gen == IRDMA_GEN_2; + virt_info.vchnl_wq = rf->vchnl_wq; + ret = irdma_sc_vchnl_init(&rf->sc_dev, &virt_info); + if (ret) { + destroy_workqueue(rf->vchnl_wq); + return ret; + } + + *rdma_ver = rf->sc_dev.hw_attrs.uk_attrs.hw_rev; + + return 0; +} + +/** + * ig3rdma_request_reset - Request a reset + * @rf: RDMA PCI function + */ +static void ig3rdma_request_reset(struct irdma_pci_f *rf) +{ + ibdev_warn(&rf->iwdev->ibdev, "Requesting a reset\n"); + idpf_idc_request_reset(rf->cdev, IIDC_FUNC_RESET); +} + +static int ig3rdma_cfg_regions(struct irdma_hw *hw, + struct iidc_rdma_core_dev_info *cdev_info) +{ + struct iidc_rdma_priv_dev_info *idc_priv = cdev_info->iidc_priv; + struct pci_dev *pdev = cdev_info->pdev; + int i; + + switch (idc_priv->ftype) { + case IIDC_FUNCTION_TYPE_PF: + hw->rdma_reg.len = IG3_PF_RDMA_REGION_LEN; + hw->rdma_reg.offset = IG3_PF_RDMA_REGION_OFFSET; + break; + case IIDC_FUNCTION_TYPE_VF: + hw->rdma_reg.len = IG3_VF_RDMA_REGION_LEN; + hw->rdma_reg.offset = IG3_VF_RDMA_REGION_OFFSET; + break; + default: + return -ENODEV; + } + + hw->rdma_reg.addr = ioremap(pci_resource_start(pdev, 0) + hw->rdma_reg.offset, + hw->rdma_reg.len); + + if (!hw->rdma_reg.addr) + return -ENOMEM; + + hw->num_io_regions = le16_to_cpu(idc_priv->num_memory_regions); + hw->io_regs = kcalloc(hw->num_io_regions, + sizeof(struct irdma_mmio_region), GFP_KERNEL); + + if (!hw->io_regs) { + iounmap(hw->rdma_reg.addr); + return -ENOMEM; + } + + for (i = 0; i < hw->num_io_regions; i++) { + hw->io_regs[i].addr = + idc_priv->mapped_mem_regions[i].region_addr; + hw->io_regs[i].len = + le64_to_cpu(idc_priv->mapped_mem_regions[i].size); + hw->io_regs[i].offset = + le64_to_cpu(idc_priv->mapped_mem_regions[i].start_offset); + } + + return 0; +} + +static void ig3rdma_decfg_rf(struct irdma_pci_f *rf) +{ + struct irdma_hw *hw = &rf->hw; + + destroy_workqueue(rf->vchnl_wq); + kfree(hw->io_regs); + iounmap(hw->rdma_reg.addr); +} + +static int ig3rdma_cfg_rf(struct irdma_pci_f *rf, + struct iidc_rdma_core_dev_info *cdev_info) +{ + struct iidc_rdma_priv_dev_info *idc_priv = cdev_info->iidc_priv; + int err; + + rf->sc_dev.hw = &rf->hw; + rf->cdev = cdev_info; + rf->pcidev = cdev_info->pdev; + rf->hw.device = &rf->pcidev->dev; + rf->msix_count = idc_priv->msix_count; + rf->msix_entries = idc_priv->msix_entries; + + err = ig3rdma_vchnl_init(rf, cdev_info, &rf->rdma_ver); + if (err) + return err; + + err = ig3rdma_cfg_regions(&rf->hw, cdev_info); + if (err) { + destroy_workqueue(rf->vchnl_wq); + return err; + } + + rf->protocol_used = IRDMA_ROCE_PROTOCOL_ONLY; + rf->rsrc_profile = IRDMA_HMC_PROFILE_DEFAULT; + rf->rst_to = IRDMA_RST_TIMEOUT_HZ; + rf->gen_ops.request_reset = ig3rdma_request_reset; + rf->limits_sel = 7; + mutex_init(&rf->ah_tbl_lock); + + return 0; +} + +static int ig3rdma_core_probe(struct auxiliary_device *aux_dev, + const struct auxiliary_device_id *id) +{ + struct iidc_rdma_core_auxiliary_dev *idc_adev = + container_of(aux_dev, struct iidc_rdma_core_auxiliary_dev, adev); + struct iidc_rdma_core_dev_info *cdev_info = idc_adev->cdev_info; + struct irdma_pci_f *rf; + int err; + + rf = kzalloc(sizeof(*rf), GFP_KERNEL); + if (!rf) + return -ENOMEM; + + err = ig3rdma_cfg_rf(rf, cdev_info); + if (err) + goto err_cfg_rf; + + err = irdma_ctrl_init_hw(rf); + if (err) + goto err_ctrl_init; + + auxiliary_set_drvdata(aux_dev, rf); + + err = idpf_idc_vport_dev_ctrl(cdev_info, true); + if (err) + goto err_vport_ctrl; + + return 0; + +err_vport_ctrl: + 
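/* Error unwind: release resources in reverse order of allocation. */
+ 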
irdma_ctrl_deinit_hw(rf); +err_ctrl_init: + ig3rdma_decfg_rf(rf); +err_cfg_rf: + kfree(rf); + + return err; +} + +static void ig3rdma_core_remove(struct auxiliary_device *aux_dev) +{ + struct iidc_rdma_core_auxiliary_dev *idc_adev = + container_of(aux_dev, struct iidc_rdma_core_auxiliary_dev, adev); + struct iidc_rdma_core_dev_info *cdev_info = idc_adev->cdev_info; + struct irdma_pci_f *rf = auxiliary_get_drvdata(aux_dev); + + idpf_idc_vport_dev_ctrl(cdev_info, false); + irdma_ctrl_deinit_hw(rf); + ig3rdma_decfg_rf(rf); + kfree(rf); +} + +static const struct auxiliary_device_id ig3rdma_core_auxiliary_id_table[] = { + {.name = "idpf.8086.rdma.core", }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary, ig3rdma_core_auxiliary_id_table); + +struct iidc_rdma_core_auxiliary_drv ig3rdma_core_auxiliary_drv = { + .adrv = { + .name = "core", + .id_table = ig3rdma_core_auxiliary_id_table, + .probe = ig3rdma_core_probe, + .remove = ig3rdma_core_remove, + }, + .event_handler = ig3rdma_idc_core_event_handler, +}; diff --git a/drivers/infiniband/hw/irdma/irdma.h b/drivers/infiniband/hw/irdma/irdma.h index 20d2e7393e3d..ff938a01d70c 100644 --- a/drivers/infiniband/hw/irdma/irdma.h +++ b/drivers/infiniband/hw/irdma/irdma.h @@ -32,7 +32,16 @@ #define IRDMA_PFHMC_SDDATALOW_PMSDDATALOW GENMASK(31, 12) #define IRDMA_PFHMC_SDCMD_PMSDWR BIT(31) -#define IRDMA_INVALID_CQ_IDX 0xffffffff +#define IRDMA_INVALID_CQ_IDX 0xffffffff +#define IRDMA_Q_INVALID_IDX 0xffff + +enum irdma_dyn_idx_t { + IRDMA_IDX_ITR0 = 0, + IRDMA_IDX_ITR1 = 1, + IRDMA_IDX_ITR2 = 2, + IRDMA_IDX_NOITR = 3, +}; + enum irdma_registers { IRDMA_CQPTAIL, IRDMA_CQPDB, @@ -67,6 +76,7 @@ enum irdma_shifts { IRDMA_CQPSQ_CQ_CEQID_S, IRDMA_CQPSQ_CQ_CQID_S, IRDMA_COMMIT_FPM_CQCNT_S, + IRDMA_CQPSQ_UPESD_HMCFNID_S, IRDMA_MAX_SHIFTS, }; @@ -77,6 +87,7 @@ enum irdma_masks { IRDMA_CQPSQ_CQ_CEQID_M, IRDMA_CQPSQ_CQ_CQID_M, IRDMA_COMMIT_FPM_CQCNT_M, + IRDMA_CQPSQ_UPESD_HMCFNID_M, IRDMA_MAX_MASKS, /* Must be last entry */ }; @@ -92,7 +103,7 @@ struct irdma_mcast_grp_ctx_entry_info { struct irdma_mcast_grp_info { u8 dest_mac_addr[ETH_ALEN]; u16 vlan_id; - u8 hmc_fcn_id; + u16 hmc_fcn_id; bool ipv4_valid:1; bool vlan_valid:1; u16 mg_id; @@ -107,6 +118,9 @@ enum irdma_vers { IRDMA_GEN_RSVD, IRDMA_GEN_1, IRDMA_GEN_2, + IRDMA_GEN_3, + IRDMA_GEN_NEXT, + IRDMA_GEN_MAX = IRDMA_GEN_NEXT-1 }; struct irdma_uk_attrs { @@ -118,6 +132,7 @@ struct irdma_uk_attrs { u32 max_hw_wq_quanta; u32 min_hw_cq_size; u32 max_hw_cq_size; + u32 max_hw_srq_quanta; u16 max_hw_sq_chunk; u16 min_hw_wq_size; u8 hw_rev; @@ -147,10 +162,13 @@ struct irdma_hw_attrs { u32 max_done_count; u32 max_sleep_count; u32 max_cqp_compl_wait_time_ms; + u32 min_hw_srq_id; u16 max_stat_inst; u16 max_stat_idx; }; void i40iw_init_hw(struct irdma_sc_dev *dev); void icrdma_init_hw(struct irdma_sc_dev *dev); +void ig3rdma_init_hw(struct irdma_sc_dev *dev); +void __iomem *ig3rdma_get_reg_addr(struct irdma_hw *hw, u64 reg_offset); #endif /* IRDMA_H*/ diff --git a/drivers/infiniband/hw/irdma/main.c b/drivers/infiniband/hw/irdma/main.c index 1e840bbd619d..95957d52883d 100644 --- a/drivers/infiniband/hw/irdma/main.c +++ b/drivers/infiniband/hw/irdma/main.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* Copyright (c) 2015 - 2021 Intel Corporation */ #include "main.h" +#include <linux/net/intel/iidc_rdma_idpf.h> MODULE_ALIAS("i40iw"); MODULE_DESCRIPTION("Intel(R) Ethernet Protocol Driver for RDMA"); @@ -38,19 +39,7 @@ static void irdma_unregister_notifiers(void) 
unregister_netdevice_notifier(&irdma_netdevice_notifier); } -static void irdma_prep_tc_change(struct irdma_device *iwdev) -{ - iwdev->vsi.tc_change_pending = true; - irdma_sc_suspend_resume_qps(&iwdev->vsi, IRDMA_OP_SUSPEND); - - /* Wait for all qp's to suspend */ - wait_event_timeout(iwdev->suspend_wq, - !atomic_read(&iwdev->vsi.qp_suspend_reqs), - msecs_to_jiffies(IRDMA_EVENT_TIMEOUT_MS)); - irdma_ws_reset(&iwdev->vsi); -} - -static void irdma_log_invalid_mtu(u16 mtu, struct irdma_sc_dev *dev) +void irdma_log_invalid_mtu(u16 mtu, struct irdma_sc_dev *dev) { if (mtu < IRDMA_MIN_MTU_IPV4) ibdev_warn(to_ibdev(dev), "MTU setting [%d] too low for RDMA traffic. Minimum MTU is 576 for IPv4\n", mtu); @@ -58,35 +47,10 @@ static void irdma_log_invalid_mtu(u16 mtu, struct irdma_sc_dev *dev) ibdev_warn(to_ibdev(dev), "MTU setting [%d] too low for RDMA traffic. Minimum MTU is 1280 for IPv6\\n", mtu); } -static void irdma_fill_qos_info(struct irdma_l2params *l2params, - struct iidc_rdma_qos_params *qos_info) +static void ig3rdma_idc_vport_event_handler(struct iidc_rdma_vport_dev_info *cdev_info, + struct iidc_rdma_event *event) { - int i; - - l2params->num_tc = qos_info->num_tc; - l2params->vsi_prio_type = qos_info->vport_priority_type; - l2params->vsi_rel_bw = qos_info->vport_relative_bw; - for (i = 0; i < l2params->num_tc; i++) { - l2params->tc_info[i].egress_virt_up = - qos_info->tc_info[i].egress_virt_up; - l2params->tc_info[i].ingress_virt_up = - qos_info->tc_info[i].ingress_virt_up; - l2params->tc_info[i].prio_type = qos_info->tc_info[i].prio_type; - l2params->tc_info[i].rel_bw = qos_info->tc_info[i].rel_bw; - l2params->tc_info[i].tc_ctx = qos_info->tc_info[i].tc_ctx; - } - for (i = 0; i < IIDC_MAX_USER_PRIORITY; i++) - l2params->up2tc[i] = qos_info->up2tc[i]; - if (qos_info->pfc_mode == IIDC_DSCP_PFC_MODE) { - l2params->dscp_mode = true; - memcpy(l2params->dscp_map, qos_info->dscp_map, sizeof(l2params->dscp_map)); - } -} - -static void irdma_iidc_event_handler(struct iidc_rdma_core_dev_info *cdev_info, - struct iidc_rdma_event *event) -{ - struct irdma_device *iwdev = dev_get_drvdata(&cdev_info->adev->dev); + struct irdma_device *iwdev = auxiliary_get_drvdata(cdev_info->adev); struct irdma_l2params l2params = {}; if (*event->type & BIT(IIDC_RDMA_EVENT_AFTER_MTU_CHANGE)) { @@ -97,248 +61,39 @@ static void irdma_iidc_event_handler(struct iidc_rdma_core_dev_info *cdev_info, irdma_log_invalid_mtu(l2params.mtu, &iwdev->rf->sc_dev); irdma_change_l2params(&iwdev->vsi, &l2params); } - } else if (*event->type & BIT(IIDC_RDMA_EVENT_BEFORE_TC_CHANGE)) { - if (iwdev->vsi.tc_change_pending) - return; - - irdma_prep_tc_change(iwdev); - } else if (*event->type & BIT(IIDC_RDMA_EVENT_AFTER_TC_CHANGE)) { - struct iidc_rdma_priv_dev_info *iidc_priv = cdev_info->iidc_priv; - - if (!iwdev->vsi.tc_change_pending) - return; - - l2params.tc_changed = true; - ibdev_dbg(&iwdev->ibdev, "CLNT: TC Change\n"); - - irdma_fill_qos_info(&l2params, &iidc_priv->qos_info); - if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) - iwdev->dcb_vlan_mode = - l2params.num_tc > 1 && !l2params.dscp_mode; - irdma_change_l2params(&iwdev->vsi, &l2params); - } else if (*event->type & BIT(IIDC_RDMA_EVENT_CRIT_ERR)) { - ibdev_warn(&iwdev->ibdev, "ICE OICR event notification: oicr = 0x%08x\n", - event->reg); - if (event->reg & IRDMAPFINT_OICR_PE_CRITERR_M) { - u32 pe_criterr; - - pe_criterr = readl(iwdev->rf->sc_dev.hw_regs[IRDMA_GLPE_CRITERR]); -#define IRDMA_Q1_RESOURCE_ERR 0x0001024d - if (pe_criterr != IRDMA_Q1_RESOURCE_ERR) { - 
ibdev_err(&iwdev->ibdev, "critical PE Error, GLPE_CRITERR=0x%08x\n", - pe_criterr); - iwdev->rf->reset = true; - } else { - ibdev_warn(&iwdev->ibdev, "Q1 Resource Check\n"); - } - } - if (event->reg & IRDMAPFINT_OICR_HMC_ERR_M) { - ibdev_err(&iwdev->ibdev, "HMC Error\n"); - iwdev->rf->reset = true; - } - if (event->reg & IRDMAPFINT_OICR_PE_PUSH_M) { - ibdev_err(&iwdev->ibdev, "PE Push Error\n"); - iwdev->rf->reset = true; - } - if (iwdev->rf->reset) - iwdev->rf->gen_ops.request_reset(iwdev->rf); } } -/** - * irdma_request_reset - Request a reset - * @rf: RDMA PCI function - */ -static void irdma_request_reset(struct irdma_pci_f *rf) -{ - ibdev_warn(&rf->iwdev->ibdev, "Requesting a reset\n"); - ice_rdma_request_reset(rf->cdev, IIDC_FUNC_RESET); -} - -/** - * irdma_lan_register_qset - Register qset with LAN driver - * @vsi: vsi structure - * @tc_node: Traffic class node - */ -static int irdma_lan_register_qset(struct irdma_sc_vsi *vsi, - struct irdma_ws_node *tc_node) -{ - struct irdma_device *iwdev = vsi->back_vsi; - struct iidc_rdma_core_dev_info *cdev_info; - struct iidc_rdma_qset_params qset = {}; - int ret; - - cdev_info = iwdev->rf->cdev; - qset.qs_handle = tc_node->qs_handle; - qset.tc = tc_node->traffic_class; - qset.vport_id = vsi->vsi_idx; - ret = ice_add_rdma_qset(cdev_info, &qset); - if (ret) { - ibdev_dbg(&iwdev->ibdev, "WS: LAN alloc_res for rdma qset failed.\n"); - return ret; - } - - tc_node->l2_sched_node_id = qset.teid; - vsi->qos[tc_node->user_pri].l2_sched_node_id = qset.teid; - - return 0; -} - -/** - * irdma_lan_unregister_qset - Unregister qset with LAN driver - * @vsi: vsi structure - * @tc_node: Traffic class node - */ -static void irdma_lan_unregister_qset(struct irdma_sc_vsi *vsi, - struct irdma_ws_node *tc_node) +static int ig3rdma_vport_probe(struct auxiliary_device *aux_dev, + const struct auxiliary_device_id *id) { - struct irdma_device *iwdev = vsi->back_vsi; - struct iidc_rdma_core_dev_info *cdev_info; - struct iidc_rdma_qset_params qset = {}; - - cdev_info = iwdev->rf->cdev; - qset.qs_handle = tc_node->qs_handle; - qset.tc = tc_node->traffic_class; - qset.vport_id = vsi->vsi_idx; - qset.teid = tc_node->l2_sched_node_id; - - if (ice_del_rdma_qset(cdev_info, &qset)) - ibdev_dbg(&iwdev->ibdev, "WS: LAN free_res for rdma qset failed.\n"); -} - -static int irdma_init_interrupts(struct irdma_pci_f *rf, struct iidc_rdma_core_dev_info *cdev) -{ - int i; - - rf->msix_count = num_online_cpus() + IRDMA_NUM_AEQ_MSIX; - rf->msix_entries = kcalloc(rf->msix_count, sizeof(*rf->msix_entries), - GFP_KERNEL); - if (!rf->msix_entries) - return -ENOMEM; - - for (i = 0; i < rf->msix_count; i++) - if (ice_alloc_rdma_qvector(cdev, &rf->msix_entries[i])) - break; - - if (i < IRDMA_MIN_MSIX) { - while (--i >= 0) - ice_free_rdma_qvector(cdev, &rf->msix_entries[i]); + struct iidc_rdma_vport_auxiliary_dev *idc_adev = + container_of(aux_dev, struct iidc_rdma_vport_auxiliary_dev, adev); + struct auxiliary_device *aux_core_dev = idc_adev->vdev_info->core_adev; + struct irdma_pci_f *rf = auxiliary_get_drvdata(aux_core_dev); + struct irdma_l2params l2params = {}; + struct irdma_device *iwdev; + int err; - kfree(rf->msix_entries); + if (!rf) { + WARN_ON_ONCE(1); return -ENOMEM; } - - rf->msix_count = i; - - return 0; -} - -static void irdma_deinit_interrupts(struct irdma_pci_f *rf, struct iidc_rdma_core_dev_info *cdev) -{ - int i; - - for (i = 0; i < rf->msix_count; i++) - ice_free_rdma_qvector(cdev, &rf->msix_entries[i]); - - kfree(rf->msix_entries); -} - -static void irdma_remove(struct 
auxiliary_device *aux_dev) -{ - struct irdma_device *iwdev = auxiliary_get_drvdata(aux_dev); - struct iidc_rdma_core_auxiliary_dev *iidc_adev; - struct iidc_rdma_core_dev_info *cdev_info; - - iidc_adev = container_of(aux_dev, struct iidc_rdma_core_auxiliary_dev, adev); - cdev_info = iidc_adev->cdev_info; - - ice_rdma_update_vsi_filter(cdev_info, iwdev->vsi_num, false); - irdma_ib_unregister_device(iwdev); - irdma_deinit_interrupts(iwdev->rf, cdev_info); - - kfree(iwdev->rf); - - pr_debug("INIT: Gen2 PF[%d] device remove success\n", PCI_FUNC(cdev_info->pdev->devfn)); -} - -static void irdma_fill_device_info(struct irdma_device *iwdev, - struct iidc_rdma_core_dev_info *cdev_info) -{ - struct iidc_rdma_priv_dev_info *iidc_priv = cdev_info->iidc_priv; - struct irdma_pci_f *rf = iwdev->rf; - - rf->sc_dev.hw = &rf->hw; - rf->iwdev = iwdev; - rf->cdev = cdev_info; - rf->hw.hw_addr = iidc_priv->hw_addr; - rf->pcidev = cdev_info->pdev; - rf->hw.device = &rf->pcidev->dev; - rf->pf_id = iidc_priv->pf_id; - rf->gen_ops.register_qset = irdma_lan_register_qset; - rf->gen_ops.unregister_qset = irdma_lan_unregister_qset; - - rf->default_vsi.vsi_idx = iidc_priv->vport_id; - rf->protocol_used = - cdev_info->rdma_protocol == IIDC_RDMA_PROTOCOL_ROCEV2 ? - IRDMA_ROCE_PROTOCOL_ONLY : IRDMA_IWARP_PROTOCOL_ONLY; - rf->rdma_ver = IRDMA_GEN_2; - rf->rsrc_profile = IRDMA_HMC_PROFILE_DEFAULT; - rf->rst_to = IRDMA_RST_TIMEOUT_HZ; - rf->gen_ops.request_reset = irdma_request_reset; - rf->limits_sel = 7; - rf->iwdev = iwdev; - - mutex_init(&iwdev->ah_tbl_lock); - - iwdev->netdev = iidc_priv->netdev; - iwdev->vsi_num = iidc_priv->vport_id; + iwdev = ib_alloc_device(irdma_device, ibdev); + /* Fill iwdev info */ + iwdev->is_vport = true; + iwdev->rf = rf; + iwdev->vport_id = idc_adev->vdev_info->vport_id; + iwdev->netdev = idc_adev->vdev_info->netdev; iwdev->init_state = INITIAL_STATE; iwdev->roce_cwnd = IRDMA_ROCE_CWND_DEFAULT; iwdev->roce_ackcreds = IRDMA_ROCE_ACKCREDS_DEFAULT; iwdev->rcv_wnd = IRDMA_CM_DEFAULT_RCV_WND_SCALED; iwdev->rcv_wscale = IRDMA_CM_DEFAULT_RCV_WND_SCALE; - if (rf->protocol_used == IRDMA_ROCE_PROTOCOL_ONLY) - iwdev->roce_mode = true; -} - -static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_device_id *id) -{ - struct iidc_rdma_core_auxiliary_dev *iidc_adev; - struct iidc_rdma_core_dev_info *cdev_info; - struct iidc_rdma_priv_dev_info *iidc_priv; - struct irdma_l2params l2params = {}; - struct irdma_device *iwdev; - struct irdma_pci_f *rf; - int err; - - iidc_adev = container_of(aux_dev, struct iidc_rdma_core_auxiliary_dev, adev); - cdev_info = iidc_adev->cdev_info; - iidc_priv = cdev_info->iidc_priv; - - iwdev = ib_alloc_device(irdma_device, ibdev); - if (!iwdev) - return -ENOMEM; - iwdev->rf = kzalloc(sizeof(*rf), GFP_KERNEL); - if (!iwdev->rf) { - ib_dealloc_device(&iwdev->ibdev); - return -ENOMEM; - } - - irdma_fill_device_info(iwdev, cdev_info); - rf = iwdev->rf; - - err = irdma_init_interrupts(rf, cdev_info); - if (err) - goto err_init_interrupts; - - err = irdma_ctrl_init_hw(rf); - if (err) - goto err_ctrl_init; + iwdev->roce_mode = true; + iwdev->push_mode = false; l2params.mtu = iwdev->netdev->mtu; - irdma_fill_qos_info(&l2params, &iidc_priv->qos_info); - if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) - iwdev->dcb_vlan_mode = l2params.num_tc > 1 && !l2params.dscp_mode; err = irdma_rt_init_hw(iwdev, &l2params); if (err) @@ -348,43 +103,57 @@ static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_ if (err) goto err_ibreg; - 
ice_rdma_update_vsi_filter(cdev_info, iwdev->vsi_num, true); - - ibdev_dbg(&iwdev->ibdev, "INIT: Gen2 PF[%d] device probe success\n", PCI_FUNC(rf->pcidev->devfn)); auxiliary_set_drvdata(aux_dev, iwdev); - return 0; + ibdev_dbg(&iwdev->ibdev, + "INIT: Gen[%d] vport[%d] probe success. dev_name = %s, core_dev_name = %s, netdev=%s\n", + rf->rdma_ver, idc_adev->vdev_info->vport_id, + dev_name(&aux_dev->dev), + dev_name(&idc_adev->vdev_info->core_adev->dev), + netdev_name(idc_adev->vdev_info->netdev)); + return 0; err_ibreg: irdma_rt_deinit_hw(iwdev); err_rt_init: - irdma_ctrl_deinit_hw(rf); -err_ctrl_init: - irdma_deinit_interrupts(rf, cdev_info); -err_init_interrupts: - kfree(iwdev->rf); ib_dealloc_device(&iwdev->ibdev); return err; } -static const struct auxiliary_device_id irdma_auxiliary_id_table[] = { - {.name = "ice.iwarp", }, - {.name = "ice.roce", }, +static void ig3rdma_vport_remove(struct auxiliary_device *aux_dev) +{ + struct iidc_rdma_vport_auxiliary_dev *idc_adev = + container_of(aux_dev, struct iidc_rdma_vport_auxiliary_dev, adev); + struct irdma_device *iwdev = auxiliary_get_drvdata(aux_dev); + + ibdev_dbg(&iwdev->ibdev, + "INIT: Gen[%d] dev_name = %s, core_dev_name = %s, netdev=%s\n", + iwdev->rf->rdma_ver, dev_name(&aux_dev->dev), + dev_name(&idc_adev->vdev_info->core_adev->dev), + netdev_name(idc_adev->vdev_info->netdev)); + + irdma_ib_unregister_device(iwdev); +} + +static const struct auxiliary_device_id ig3rdma_vport_auxiliary_id_table[] = { + {.name = "idpf.8086.rdma.vdev", }, {}, }; -MODULE_DEVICE_TABLE(auxiliary, irdma_auxiliary_id_table); +MODULE_DEVICE_TABLE(auxiliary, ig3rdma_vport_auxiliary_id_table); -static struct iidc_rdma_core_auxiliary_drv irdma_auxiliary_drv = { +static struct iidc_rdma_vport_auxiliary_drv ig3rdma_vport_auxiliary_drv = { .adrv = { - .id_table = irdma_auxiliary_id_table, - .probe = irdma_probe, - .remove = irdma_remove, + .name = "vdev", + .id_table = ig3rdma_vport_auxiliary_id_table, + .probe = ig3rdma_vport_probe, + .remove = ig3rdma_vport_remove, }, - .event_handler = irdma_iidc_event_handler, + .event_handler = ig3rdma_idc_vport_event_handler, }; + static int __init irdma_init_module(void) { int ret; @@ -396,14 +165,34 @@ static int __init irdma_init_module(void) return ret; } - ret = auxiliary_driver_register(&irdma_auxiliary_drv.adrv); + ret = auxiliary_driver_register(&icrdma_core_auxiliary_drv.adrv); + if (ret) { + auxiliary_driver_unregister(&i40iw_auxiliary_drv); + pr_err("Failed icrdma(gen_2) auxiliary_driver_register() ret=%d\n", + ret); + return ret; + } + + ret = auxiliary_driver_register(&ig3rdma_core_auxiliary_drv.adrv); if (ret) { + auxiliary_driver_unregister(&icrdma_core_auxiliary_drv.adrv); auxiliary_driver_unregister(&i40iw_auxiliary_drv); - pr_err("Failed irdma auxiliary_driver_register() ret=%d\n", + pr_err("Failed ig3rdma(gen_3) core auxiliary_driver_register() ret=%d\n", ret); + return ret; } + ret = auxiliary_driver_register(&ig3rdma_vport_auxiliary_drv.adrv); + if (ret) { + auxiliary_driver_unregister(&ig3rdma_core_auxiliary_drv.adrv); + auxiliary_driver_unregister(&icrdma_core_auxiliary_drv.adrv); + auxiliary_driver_unregister(&i40iw_auxiliary_drv); + pr_err("Failed ig3rdma vport auxiliary_driver_register() ret=%d\n", + ret); + + return ret; + } irdma_register_notifiers(); return 0; @@ -412,8 +201,10 @@ static int __init irdma_init_module(void) static void __exit irdma_exit_module(void) { irdma_unregister_notifiers(); - auxiliary_driver_unregister(&irdma_auxiliary_drv.adrv); + 
auxiliary_driver_unregister(&icrdma_core_auxiliary_drv.adrv); auxiliary_driver_unregister(&i40iw_auxiliary_drv); + auxiliary_driver_unregister(&ig3rdma_core_auxiliary_drv.adrv); + auxiliary_driver_unregister(&ig3rdma_vport_auxiliary_drv.adrv); } module_init(irdma_init_module); diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index 674acc952168..886b30da188a 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -30,7 +30,6 @@ #endif #include <linux/auxiliary_bus.h> #include <linux/net/intel/iidc_rdma.h> -#include <linux/net/intel/iidc_rdma_ice.h> #include <rdma/ib_smi.h> #include <rdma/ib_verbs.h> #include <rdma/ib_pack.h> @@ -54,6 +53,8 @@ #include "puda.h" extern struct auxiliary_driver i40iw_auxiliary_drv; +extern struct iidc_rdma_core_auxiliary_drv icrdma_core_auxiliary_drv; +extern struct iidc_rdma_core_auxiliary_drv ig3rdma_core_auxiliary_drv; #define IRDMA_FW_VER_DEFAULT 2 #define IRDMA_HW_VER 2 @@ -65,7 +66,8 @@ extern struct auxiliary_driver i40iw_auxiliary_drv; #define IRDMA_MACIP_ADD 1 #define IRDMA_MACIP_DELETE 2 -#define IW_CCQ_SIZE (IRDMA_CQP_SW_SQSIZE_2048 + 1) +#define IW_GEN_3_CCQ_SIZE (2 * IRDMA_CQP_SW_SQSIZE_2048 + 2) +#define IW_CCQ_SIZE (IRDMA_CQP_SW_SQSIZE_2048 + 2) #define IW_CEQ_SIZE 2048 #define IW_AEQ_SIZE 2048 @@ -127,12 +129,12 @@ enum init_completion_state { HMC_OBJS_CREATED, HW_RSRC_INITIALIZED, CCQ_CREATED, - CEQ0_CREATED, /* Last state of probe */ - ILQ_CREATED, - IEQ_CREATED, + CEQ0_CREATED, CEQS_CREATED, PBLE_CHUNK_MEM, AEQ_CREATED, + ILQ_CREATED, + IEQ_CREATED, /* Last state of probe */ IP_ADDR_REGISTERED, /* Last state of open */ }; @@ -167,6 +169,7 @@ struct irdma_cqp_request { bool request_done; /* READ/WRITE_ONCE macros operate on it */ bool waiting:1; bool dynamic:1; + bool pending:1; }; struct irdma_cqp { @@ -179,6 +182,7 @@ struct irdma_cqp { struct irdma_dma_mem host_ctx; u64 *scratch_array; struct irdma_cqp_request *cqp_requests; + struct irdma_ooo_cqp_op *oop_op_array; struct list_head cqp_avail_reqs; struct list_head cqp_pending_reqs; }; @@ -257,6 +261,7 @@ struct irdma_pci_f { bool reset:1; bool rsrc_created:1; bool msix_shared:1; + bool hwqp1_rsvd:1; u8 rsrc_profile; u8 *hmc_info_mem; u8 *mem_rsrc; @@ -269,6 +274,8 @@ struct irdma_pci_f { u32 max_mr; u32 max_qp; u32 max_cq; + u32 max_srq; + u32 next_srq; u32 max_ah; u32 next_ah; u32 max_mcg; @@ -282,6 +289,7 @@ struct irdma_pci_f { u32 mr_stagmask; u32 used_pds; u32 used_cqs; + u32 used_srqs; u32 used_mrs; u32 used_qps; u32 arp_table_size; @@ -293,6 +301,7 @@ struct irdma_pci_f { unsigned long *allocated_ws_nodes; unsigned long *allocated_qps; unsigned long *allocated_cqs; + unsigned long *allocated_srqs; unsigned long *allocated_mrs; unsigned long *allocated_pds; unsigned long *allocated_mcgs; @@ -327,10 +336,13 @@ struct irdma_pci_f { wait_queue_head_t vchnl_waitq; struct workqueue_struct *cqp_cmpl_wq; struct work_struct cqp_cmpl_work; + struct workqueue_struct *vchnl_wq; struct irdma_sc_vsi default_vsi; void *back_fcn; struct irdma_gen_ops gen_ops; struct irdma_device *iwdev; + DECLARE_HASHTABLE(ah_hash_tbl, 8); + struct mutex ah_tbl_lock; /* protect AH hash table access */ }; struct irdma_device { @@ -340,8 +352,6 @@ struct irdma_device { struct workqueue_struct *cleanup_wq; struct irdma_sc_vsi vsi; struct irdma_cm_core cm_core; - DECLARE_HASHTABLE(ah_hash_tbl, 8); - struct mutex ah_tbl_lock; /* protect AH hash table access */ u32 roce_cwnd; u32 roce_ackcreds; u32 vendor_id; @@ -350,12 +360,14 @@ struct irdma_device 
{ u32 rcv_wnd; u16 mac_ip_table_idx; u16 vsi_num; + u16 vport_id; u8 rcv_wscale; u8 iw_status; bool roce_mode:1; bool roce_dcqcn_en:1; bool dcb_vlan_mode:1; bool iw_ooo:1; + bool is_vport:1; enum init_completion_state init_state; wait_queue_head_t suspend_wq; @@ -413,6 +425,11 @@ static inline struct irdma_pci_f *dev_to_rf(struct irdma_sc_dev *dev) return container_of(dev, struct irdma_pci_f, sc_dev); } +static inline struct irdma_srq *to_iwsrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct irdma_srq, ibsrq); +} + /** * irdma_alloc_resource - allocate a resource * @iwdev: device pointer @@ -508,7 +525,8 @@ int irdma_modify_qp_roce(struct ib_qp *ibqp, struct ib_qp_attr *attr, void irdma_cq_add_ref(struct ib_cq *ibcq); void irdma_cq_rem_ref(struct ib_cq *ibcq); void irdma_cq_wq_destroy(struct irdma_pci_f *rf, struct irdma_sc_cq *cq); - +void irdma_srq_event(struct irdma_sc_srq *srq); +void irdma_srq_wq_destroy(struct irdma_pci_f *rf, struct irdma_sc_srq *srq); void irdma_cleanup_pending_cqp_op(struct irdma_pci_f *rf); int irdma_hw_modify_qp(struct irdma_device *iwdev, struct irdma_qp *iwqp, struct irdma_modify_qp_info *info, bool wait); @@ -557,4 +575,5 @@ int irdma_netdevice_event(struct notifier_block *notifier, unsigned long event, void *ptr); void irdma_add_ip(struct irdma_device *iwdev); void cqp_compl_worker(struct work_struct *work); +void irdma_log_invalid_mtu(u16 mtu, struct irdma_sc_dev *dev); #endif /* IRDMA_MAIN_H */ diff --git a/drivers/infiniband/hw/irdma/pble.c b/drivers/infiniband/hw/irdma/pble.c index 37ce35cb10e7..3091f9345f12 100644 --- a/drivers/infiniband/hw/irdma/pble.c +++ b/drivers/infiniband/hw/irdma/pble.c @@ -193,8 +193,15 @@ static enum irdma_sd_entry_type irdma_get_type(struct irdma_sc_dev *dev, { enum irdma_sd_entry_type sd_entry_type; - sd_entry_type = !idx->rel_pd_idx && pages == IRDMA_HMC_PD_CNT_IN_SD ? - IRDMA_SD_TYPE_DIRECT : IRDMA_SD_TYPE_PAGED; + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) + sd_entry_type = (!idx->rel_pd_idx && + pages == IRDMA_HMC_PD_CNT_IN_SD) ? + IRDMA_SD_TYPE_DIRECT : IRDMA_SD_TYPE_PAGED; + else + sd_entry_type = (!idx->rel_pd_idx && + pages == IRDMA_HMC_PD_CNT_IN_SD && + dev->privileged) ? + IRDMA_SD_TYPE_DIRECT : IRDMA_SD_TYPE_PAGED; return sd_entry_type; } @@ -279,10 +286,11 @@ static int add_pble_prm(struct irdma_hmc_pble_rsrc *pble_rsrc) sd_reg_val = (sd_entry_type == IRDMA_SD_TYPE_PAGED) ? 
sd_entry->u.pd_table.pd_page_addr.pa : sd_entry->u.bp.addr.pa; - - if (!sd_entry->valid) { - ret_code = irdma_hmc_sd_one(dev, hmc_info->hmc_fn_id, sd_reg_val, - idx->sd_idx, sd_entry->entry_type, true); + if ((dev->privileged && !sd_entry->valid) || + dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + ret_code = irdma_hmc_sd_one(dev, hmc_info->hmc_fn_id, + sd_reg_val, idx->sd_idx, + sd_entry->entry_type, true); if (ret_code) goto error; } diff --git a/drivers/infiniband/hw/irdma/protos.h b/drivers/infiniband/hw/irdma/protos.h index c0c9441885d3..324cfbf21764 100644 --- a/drivers/infiniband/hw/irdma/protos.h +++ b/drivers/infiniband/hw/irdma/protos.h @@ -10,6 +10,7 @@ #define ALL_TC2PFC 0xff #define CQP_COMPL_WAIT_TIME_MS 10 #define CQP_TIMEOUT_THRESHOLD 500 +#define CQP_DEF_CMPL_TIMEOUT_THRESHOLD 2500 /* init operations */ int irdma_sc_dev_init(enum irdma_vers ver, struct irdma_sc_dev *dev, diff --git a/drivers/infiniband/hw/irdma/puda.h b/drivers/infiniband/hw/irdma/puda.h index 2fc638f2b143..d65041bee667 100644 --- a/drivers/infiniband/hw/irdma/puda.h +++ b/drivers/infiniband/hw/irdma/puda.h @@ -91,7 +91,7 @@ struct irdma_puda_rsrc_info { u32 rq_size; u32 tx_buf_cnt; /* total bufs allocated will be rq_size + tx_buf_cnt */ u16 buf_size; - u8 stats_idx; + u16 stats_idx; bool stats_idx_valid:1; int abi_ver; }; @@ -140,7 +140,7 @@ struct irdma_puda_rsrc { u64 crc_err; u64 pmode_count; u64 partials_handled; - u8 stats_idx; + u16 stats_idx; bool check_crc:1; bool stats_idx_valid:1; }; diff --git a/drivers/infiniband/hw/irdma/type.h b/drivers/infiniband/hw/irdma/type.h index 527c6da2c1ac..4ae77cdde9dc 100644 --- a/drivers/infiniband/hw/irdma/type.h +++ b/drivers/infiniband/hw/irdma/type.h @@ -8,6 +8,8 @@ #include "hmc.h" #include "uda.h" #include "ws.h" +#include "virtchnl.h" + #define IRDMA_DEBUG_ERR "ERR" #define IRDMA_DEBUG_INIT "INIT" #define IRDMA_DEBUG_DEV "DEV" @@ -95,12 +97,6 @@ enum irdma_term_mpa_errors { MPA_REQ_RSP = 0x04, }; -enum irdma_qp_event_type { - IRDMA_QP_EVENT_CATASTROPHIC, - IRDMA_QP_EVENT_ACCESS_ERR, - IRDMA_QP_EVENT_REQ_ERR, -}; - enum irdma_hw_stats_index { /* gen1 - 32-bit */ IRDMA_HW_STAT_INDEX_IP4RXDISCARD = 0, @@ -154,12 +150,46 @@ enum irdma_hw_stats_index { IRDMA_HW_STAT_INDEX_RXRPCNPIGNORED = 44, IRDMA_HW_STAT_INDEX_TXNPCNPSENT = 45, IRDMA_HW_STAT_INDEX_MAX_GEN_2 = 46, + + /* gen3 */ + IRDMA_HW_STAT_INDEX_RNR_SENT = 46, + IRDMA_HW_STAT_INDEX_RNR_RCVD = 47, + IRDMA_HW_STAT_INDEX_RDMAORDLMTCNT = 48, + IRDMA_HW_STAT_INDEX_RDMAIRDLMTCNT = 49, + IRDMA_HW_STAT_INDEX_RDMARXATS = 50, + IRDMA_HW_STAT_INDEX_RDMATXATS = 51, + IRDMA_HW_STAT_INDEX_NAKSEQERR = 52, + IRDMA_HW_STAT_INDEX_NAKSEQERR_IMPLIED = 53, + IRDMA_HW_STAT_INDEX_RTO = 54, + IRDMA_HW_STAT_INDEX_RXOOOPKTS = 55, + IRDMA_HW_STAT_INDEX_ICRCERR = 56, + + IRDMA_HW_STAT_INDEX_MAX_GEN_3 = 57, }; enum irdma_feature_type { IRDMA_FEATURE_FW_INFO = 0, IRDMA_HW_VERSION_INFO = 1, + IRDMA_QP_MAX_INCR = 2, + IRDMA_CQ_MAX_INCR = 3, + IRDMA_CEQ_MAX_INCR = 4, + IRDMA_SD_MAX_INCR = 5, + IRDMA_MR_MAX_INCR = 6, + IRDMA_Q1_MAX_INCR = 7, + IRDMA_AH_MAX_INCR = 8, + IRDMA_SRQ_MAX_INCR = 9, + IRDMA_TIMER_MAX_INCR = 10, + IRDMA_XF_MAX_INCR = 11, + IRDMA_RRF_MAX_INCR = 12, + IRDMA_PBLE_MAX_INCR = 13, + IRDMA_OBJ_1 = 22, + IRDMA_OBJ_2 = 23, + IRDMA_ENDPT_TRK = 24, + IRDMA_FTN_INLINE_MAX = 25, IRDMA_QSETS_MAX = 26, + IRDMA_ASO = 27, + IRDMA_FTN_FLAGS = 32, + IRDMA_FTN_NOP = 33, IRDMA_MAX_FEATURES, /* Must be last entry */ }; @@ -206,6 +236,7 @@ enum irdma_syn_rst_handling { enum irdma_queue_type { IRDMA_QUEUE_TYPE_SQ_RQ = 0, 
IRDMA_QUEUE_TYPE_CQP, + IRDMA_QUEUE_TYPE_SRQ, }; struct irdma_sc_dev; @@ -233,12 +264,22 @@ struct irdma_cqp_init_info { __le64 *host_ctx; u64 *scratch_array; u32 sq_size; + struct irdma_ooo_cqp_op *ooo_op_array; + u32 pe_en_vf_cnt; u16 hw_maj_ver; u16 hw_min_ver; u8 struct_ver; u8 hmc_profile; u8 ena_vf_count; u8 ceqs_per_vf; + u8 ooisc_blksize; + u8 rrsp_blksize; + u8 q1_blksize; + u8 xmit_blksize; + u8 ts_override; + u8 ts_shift; + u8 en_fine_grained_timers; + u8 blksizes_valid; bool en_datacenter_tcp:1; bool disable_packed:1; bool rocev2_rto_policy:1; @@ -310,9 +351,21 @@ struct irdma_vsi_pestat { spinlock_t lock; /* rdma stats lock */ }; +struct irdma_mmio_region { + u8 __iomem *addr; + resource_size_t len; + resource_size_t offset; +}; + struct irdma_hw { - u8 __iomem *hw_addr; - u8 __iomem *priv_hw_addr; + union { + u8 __iomem *hw_addr; + struct { + struct irdma_mmio_region rdma_reg; /* RDMA region */ + struct irdma_mmio_region *io_regs; /* Non-RDMA MMIO regions */ + u16 num_io_regions; /* Number of Non-RDMA MMIO regions */ + }; + }; struct device *device; struct irdma_hmc_info hmc; }; @@ -351,7 +404,21 @@ struct irdma_cqp_quanta { __le64 elem[IRDMA_CQP_WQE_SIZE]; }; +struct irdma_ooo_cqp_op { + struct list_head list_entry; + u64 scratch; + u32 def_info; + u32 sw_def_info; + u32 wqe_idx; + bool deferred:1; +}; + struct irdma_sc_cqp { + spinlock_t ooo_list_lock; /* protects list of pending completions */ + struct list_head ooo_avail; + struct list_head ooo_pnd; + u32 last_def_cmpl_ticket; + u32 sw_def_cmpl_ticket; u32 size; u64 sq_pa; u64 host_ctx_pa; @@ -367,8 +434,10 @@ struct irdma_sc_cqp { u64 *scratch_array; u64 requested_ops; atomic64_t completed_ops; + struct irdma_ooo_cqp_op *ooo_op_array; u32 cqp_id; u32 sq_size; + u32 pe_en_vf_cnt; u32 hw_sq_size; u16 hw_maj_ver; u16 hw_min_ver; @@ -378,6 +447,14 @@ struct irdma_sc_cqp { u8 ena_vf_count; u8 timeout_count; u8 ceqs_per_vf; + u8 ooisc_blksize; + u8 rrsp_blksize; + u8 q1_blksize; + u8 xmit_blksize; + u8 ts_override; + u8 ts_shift; + u8 en_fine_grained_timers; + u8 blksizes_valid; bool en_datacenter_tcp:1; bool disable_packed:1; bool rocev2_rto_policy:1; @@ -397,6 +474,8 @@ struct irdma_sc_aeq { u32 msix_idx; u8 polarity; bool virtual_map:1; + bool pasid_valid:1; + u32 pasid; }; struct irdma_sc_ceq { @@ -412,13 +491,15 @@ struct irdma_sc_ceq { u8 tph_val; u32 first_pm_pbl_idx; u8 polarity; - struct irdma_sc_vsi *vsi; + u16 vsi_idx; struct irdma_sc_cq **reg_cq; u32 reg_cq_size; spinlock_t req_cq_lock; /* protect access to reg_cq array */ bool virtual_map:1; bool tph_en:1; bool itr_no_expire:1; + bool pasid_valid:1; + u32 pasid; }; struct irdma_sc_cq { @@ -426,6 +507,7 @@ struct irdma_sc_cq { u64 cq_pa; u64 shadow_area_pa; struct irdma_sc_dev *dev; + u16 vsi_idx; struct irdma_sc_vsi *vsi; void *pbl_list; void *back_cq; @@ -477,8 +559,13 @@ struct irdma_sc_qp { bool virtual_map:1; bool flush_sq:1; bool flush_rq:1; + bool err_sq_idx_valid:1; + bool err_rq_idx_valid:1; + u32 err_sq_idx; + u32 err_rq_idx; bool sq_flush_code:1; bool rq_flush_code:1; + u32 pkt_limit; enum irdma_flush_opcode flush_code; enum irdma_qp_event_type event_type; u8 term_flags; @@ -489,13 +576,13 @@ struct irdma_sc_qp { struct irdma_stats_inst_info { bool use_hmc_fcn_index; u8 hmc_fn_id; - u8 stats_idx; + u16 stats_idx; }; struct irdma_up_info { u8 map[8]; u8 cnp_up_override; - u8 hmc_fcn_idx; + u16 hmc_fcn_idx; bool use_vlan:1; bool use_cnp_up_override:1; }; @@ -518,6 +605,8 @@ struct irdma_ws_node_info { struct irdma_hmc_fpm_misc { u32 max_ceqs; u32 max_sds; 
+ u32 loc_mem_pages; + u8 ird; u32 xf_block_size; u32 q1_block_size; u32 ht_multiplier; @@ -526,6 +615,7 @@ struct irdma_hmc_fpm_misc { u32 ooiscf_block_size; }; +#define IRDMA_VCHNL_MAX_MSG_SIZE 512 #define IRDMA_LEAF_DEFAULT_REL_BW 64 #define IRDMA_PARENT_DEFAULT_REL_BW 1 @@ -601,19 +691,28 @@ struct irdma_sc_dev { u64 cqp_cmd_stats[IRDMA_MAX_CQP_OPS]; struct irdma_hw_attrs hw_attrs; struct irdma_hmc_info *hmc_info; + struct irdma_vchnl_rdma_caps vc_caps; + u8 vc_recv_buf[IRDMA_VCHNL_MAX_MSG_SIZE]; + u16 vc_recv_len; struct irdma_sc_cqp *cqp; struct irdma_sc_aeq *aeq; struct irdma_sc_ceq *ceq[IRDMA_CEQ_MAX_COUNT]; struct irdma_sc_cq *ccq; const struct irdma_irq_ops *irq_ops; + struct irdma_qos qos[IRDMA_MAX_USER_PRIORITY]; struct irdma_hmc_fpm_misc hmc_fpm_misc; struct irdma_ws_node *ws_tree_root; struct mutex ws_mutex; /* ws tree mutex */ + u32 vchnl_ver; u16 num_vfs; - u8 hmc_fn_id; + u16 hmc_fn_id; u8 vf_id; + bool privileged:1; bool vchnl_up:1; bool ceq_valid:1; + bool is_pf:1; + u8 protocol_used; + struct mutex vchnl_mutex; /* mutex to synchronize RDMA virtual channel messages */ u8 pci_rev; int (*ws_add)(struct irdma_sc_vsi *vsi, u8 user_pri); void (*ws_remove)(struct irdma_sc_vsi *vsi, u8 user_pri); @@ -632,6 +731,51 @@ struct irdma_modify_cq_info { bool cq_resize:1; }; +struct irdma_srq_init_info { + struct irdma_sc_pd *pd; + struct irdma_sc_vsi *vsi; + u64 srq_pa; + u64 shadow_area_pa; + u32 first_pm_pbl_idx; + u32 pasid; + u32 srq_size; + u16 srq_limit; + u8 pasid_valid; + u8 wqe_size; + u8 leaf_pbl_size; + u8 virtual_map; + u8 tph_en; + u8 arm_limit_event; + u8 tph_value; + u8 pbl_chunk_size; + struct irdma_srq_uk_init_info srq_uk_init_info; +}; + +struct irdma_sc_srq { + struct irdma_sc_dev *dev; + struct irdma_sc_vsi *vsi; + struct irdma_sc_pd *pd; + struct irdma_srq_uk srq_uk; + void *back_srq; + u64 srq_pa; + u64 shadow_area_pa; + u32 first_pm_pbl_idx; + u32 pasid; + u32 hw_srq_size; + u16 srq_limit; + u8 pasid_valid; + u8 leaf_pbl_size; + u8 virtual_map; + u8 tph_en; + u8 arm_limit_event; + u8 tph_val; +}; + +struct irdma_modify_srq_info { + u16 srq_limit; + u8 arm_limit_event; +}; + struct irdma_create_qp_info { bool ord_valid:1; bool tcp_ctx_valid:1; @@ -671,7 +815,8 @@ struct irdma_ccq_cqe_info { u16 maj_err_code; u16 min_err_code; u8 op_code; - bool error; + bool error:1; + bool pending:1; }; struct irdma_dcb_app_info { @@ -720,7 +865,7 @@ struct irdma_vsi_init_info { struct irdma_vsi_stats_info { struct irdma_vsi_pestat *pestat; - u8 fcn_id; + u16 fcn_id; bool alloc_stats_inst; }; @@ -731,7 +876,8 @@ struct irdma_device_init_info { __le64 *fpm_commit_buf; struct irdma_hw *hw; void __iomem *bar0; - u8 hmc_fn_id; + enum irdma_protocol_used protocol_used; + u16 hmc_fn_id; }; struct irdma_ceq_init_info { @@ -746,8 +892,8 @@ struct irdma_ceq_init_info { bool itr_no_expire:1; u8 pbl_chunk_size; u8 tph_val; + u16 vsi_idx; u32 first_pm_pbl_idx; - struct irdma_sc_vsi *vsi; struct irdma_sc_cq **reg_cq; u32 reg_cq_idx; }; @@ -807,6 +953,8 @@ struct irdma_udp_offload_info { u32 cwnd; u8 rexmit_thresh; u8 rnr_nak_thresh; + u8 rnr_nak_tmr; + u8 min_rnr_timer; }; struct irdma_roce_offload_info { @@ -833,6 +981,7 @@ struct irdma_roce_offload_info { bool dctcp_en:1; bool fw_cc_enable:1; bool use_stats_inst:1; + u8 local_ack_timeout; u16 t_high; u16 t_low; u8 last_byte_sent; @@ -933,8 +1082,10 @@ struct irdma_qp_host_ctx_info { }; u32 send_cq_num; u32 rcv_cq_num; + u32 srq_id; u32 rem_endpoint_idx; - u8 stats_idx; + u16 stats_idx; + bool remote_atomics_en:1; bool srq_valid:1; bool 
tcp_info_valid:1; bool iwarp_info_valid:1; @@ -945,6 +1096,7 @@ struct irdma_qp_host_ctx_info { struct irdma_aeqe_info { u64 compl_ctx; u32 qp_cq_id; + u32 def_info; /* only valid for DEF_CMPL */ u16 ae_id; u16 wqe_idx; u8 tcp_state; @@ -953,9 +1105,11 @@ struct irdma_aeqe_info { bool cq:1; bool sq:1; bool rq:1; + bool srq:1; bool in_rdrsp_wr:1; bool out_rdrsp:1; bool aeqe_overflow:1; + bool err_rq_idx_valid:1; u8 q2_data_written; u8 ae_src; }; @@ -972,7 +1126,8 @@ struct irdma_allocate_stag_info { bool use_hmc_fcn_index:1; bool use_pf_rid:1; bool all_memory:1; - u8 hmc_fcn_index; + bool remote_atomics_en:1; + u16 hmc_fcn_index; }; struct irdma_mw_alloc_info { @@ -1000,6 +1155,7 @@ struct irdma_reg_ns_stag_info { u8 hmc_fcn_index; bool use_pf_rid:1; bool all_memory:1; + bool remote_atomics_en:1; }; struct irdma_fast_reg_stag_info { @@ -1023,6 +1179,7 @@ struct irdma_fast_reg_stag_info { u8 hmc_fcn_index; bool use_pf_rid:1; bool defer_flag:1; + bool remote_atomics_en:1; }; struct irdma_dealloc_stag_info { @@ -1130,6 +1287,8 @@ struct irdma_cqp_manage_push_page_info { }; struct irdma_qp_flush_info { + u32 err_sq_idx; + u32 err_rq_idx; u16 sq_minor_code; u16 sq_major_code; u16 rq_minor_code; @@ -1140,6 +1299,8 @@ struct irdma_qp_flush_info { bool rq:1; bool userflushcode:1; bool generate_ae:1; + bool err_sq_idx_valid:1; + bool err_rq_idx_valid:1; }; struct irdma_gen_ae_info { @@ -1189,6 +1350,11 @@ void irdma_sc_pd_init(struct irdma_sc_dev *dev, struct irdma_sc_pd *pd, u32 pd_i void irdma_cfg_aeq(struct irdma_sc_dev *dev, u32 idx, bool enable); void irdma_check_cqp_progress(struct irdma_cqp_timeout *cqp_timeout, struct irdma_sc_dev *dev); +void irdma_sc_cqp_def_cmpl_ae_handler(struct irdma_sc_dev *dev, + struct irdma_aeqe_info *info, + bool first, u64 *scratch, + u32 *sw_def_info); +u64 irdma_sc_cqp_cleanup_handler(struct irdma_sc_dev *dev); int irdma_sc_cqp_create(struct irdma_sc_cqp *cqp, u16 *maj_err, u16 *min_err); int irdma_sc_cqp_destroy(struct irdma_sc_cqp *cqp); int irdma_sc_cqp_init(struct irdma_sc_cqp *cqp, @@ -1224,6 +1390,8 @@ void irdma_sc_cq_resize(struct irdma_sc_cq *cq, struct irdma_modify_cq_info *inf int irdma_sc_static_hmc_pages_allocated(struct irdma_sc_cqp *cqp, u64 scratch, u8 hmc_fn_id, bool post_sq, bool poll_registers); +int irdma_sc_srq_init(struct irdma_sc_srq *srq, + struct irdma_srq_init_info *info); void sc_vsi_update_stats(struct irdma_sc_vsi *vsi); struct cqp_info { @@ -1467,6 +1635,23 @@ struct cqp_info { struct irdma_dma_mem query_buff_mem; u64 scratch; } query_rdma; + + struct { + struct irdma_sc_srq *srq; + u64 scratch; + } srq_create; + + struct { + struct irdma_sc_srq *srq; + struct irdma_modify_srq_info info; + u64 scratch; + } srq_modify; + + struct { + struct irdma_sc_srq *srq; + u64 scratch; + } srq_destroy; + } u; }; diff --git a/drivers/infiniband/hw/irdma/uda_d.h b/drivers/infiniband/hw/irdma/uda_d.h index 5a9e6eabf032..4fb4daa20722 100644 --- a/drivers/infiniband/hw/irdma/uda_d.h +++ b/drivers/infiniband/hw/irdma/uda_d.h @@ -78,8 +78,7 @@ #define IRDMA_UDAQPC_IPID GENMASK_ULL(47, 32) #define IRDMA_UDAQPC_SNDMSS GENMASK_ULL(29, 16) #define IRDMA_UDAQPC_VLANTAG GENMASK_ULL(15, 0) - -#define IRDMA_UDA_CQPSQ_MAV_PDINDEXHI GENMASK_ULL(21, 20) +#define IRDMA_UDA_CQPSQ_MAV_PDINDEXHI GENMASK_ULL(27, 20) #define IRDMA_UDA_CQPSQ_MAV_PDINDEXLO GENMASK_ULL(63, 48) #define IRDMA_UDA_CQPSQ_MAV_SRCMACADDRINDEX GENMASK_ULL(29, 24) #define IRDMA_UDA_CQPSQ_MAV_ARPINDEX GENMASK_ULL(63, 48) @@ -94,7 +93,7 @@ #define IRDMA_UDA_CQPSQ_MAV_OPCODE GENMASK_ULL(37, 32) 
#define IRDMA_UDA_CQPSQ_MAV_DOLOOPBACKK BIT_ULL(62) #define IRDMA_UDA_CQPSQ_MAV_IPV4VALID BIT_ULL(59) -#define IRDMA_UDA_CQPSQ_MAV_AVIDX GENMASK_ULL(16, 0) +#define IRDMA_UDA_CQPSQ_MAV_AVIDX GENMASK_ULL(23, 0) #define IRDMA_UDA_CQPSQ_MAV_INSERTVLANTAG BIT_ULL(60) #define IRDMA_UDA_MGCTX_VFFLAG BIT_ULL(29) #define IRDMA_UDA_MGCTX_DESTPORT GENMASK_ULL(47, 32) diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c index 38c54e59cc2e..ce1ae10c30fc 100644 --- a/drivers/infiniband/hw/irdma/uk.c +++ b/drivers/infiniband/hw/irdma/uk.c @@ -198,6 +198,26 @@ __le64 *irdma_qp_get_next_send_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx, return wqe; } +__le64 *irdma_srq_get_next_recv_wqe(struct irdma_srq_uk *srq, u32 *wqe_idx) +{ + int ret_code; + __le64 *wqe; + + if (IRDMA_RING_FULL_ERR(srq->srq_ring)) + return NULL; + + IRDMA_ATOMIC_RING_MOVE_HEAD(srq->srq_ring, *wqe_idx, ret_code); + if (ret_code) + return NULL; + + if (!*wqe_idx) + srq->srwqe_polarity = !srq->srwqe_polarity; + /* rq_wqe_size_multiplier is no of 32 byte quanta in one rq wqe */ + wqe = srq->srq_base[*wqe_idx * (srq->wqe_size_multiplier)].elem; + + return wqe; +} + /** * irdma_qp_get_next_recv_wqe - get next qp's rcv wqe * @qp: hw qp ptr @@ -318,6 +338,160 @@ int irdma_uk_rdma_write(struct irdma_qp_uk *qp, struct irdma_post_sq_info *info, } /** + * irdma_uk_atomic_fetch_add - atomic fetch and add operation + * @qp: hw qp ptr + * @info: post sq information + * @post_sq: flag to post sq + */ +int irdma_uk_atomic_fetch_add(struct irdma_qp_uk *qp, + struct irdma_post_sq_info *info, bool post_sq) +{ + struct irdma_atomic_fetch_add *op_info; + u32 total_size = 0; + u16 quanta = 2; + u32 wqe_idx; + __le64 *wqe; + u64 hdr; + + op_info = &info->op.atomic_fetch_add; + wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta, total_size, + info); + if (!wqe) + return -ENOMEM; + + set_64bit_val(wqe, 0, op_info->tagged_offset); + set_64bit_val(wqe, 8, + FIELD_PREP(IRDMAQPSQ_STAG, op_info->stag)); + set_64bit_val(wqe, 16, op_info->remote_tagged_offset); + + hdr = FIELD_PREP(IRDMAQPSQ_ADDFRAGCNT, 1) | + FIELD_PREP(IRDMAQPSQ_REMOTE_STAG, op_info->remote_stag) | + FIELD_PREP(IRDMAQPSQ_OPCODE, IRDMAQP_OP_ATOMIC_FETCH_ADD) | + FIELD_PREP(IRDMAQPSQ_READFENCE, info->read_fence) | + FIELD_PREP(IRDMAQPSQ_LOCALFENCE, info->local_fence) | + FIELD_PREP(IRDMAQPSQ_SIGCOMPL, info->signaled) | + FIELD_PREP(IRDMAQPSQ_VALID, qp->swqe_polarity); + + set_64bit_val(wqe, 32, op_info->fetch_add_data_bytes); + set_64bit_val(wqe, 40, 0); + set_64bit_val(wqe, 48, 0); + set_64bit_val(wqe, 56, + FIELD_PREP(IRDMAQPSQ_VALID, qp->swqe_polarity)); + + dma_wmb(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, 24, hdr); + + if (post_sq) + irdma_uk_qp_post_wr(qp); + + return 0; +} + +/** + * irdma_uk_atomic_compare_swap - atomic compare and swap operation + * @qp: hw qp ptr + * @info: post sq information + * @post_sq: flag to post sq + */ +int irdma_uk_atomic_compare_swap(struct irdma_qp_uk *qp, + struct irdma_post_sq_info *info, bool post_sq) +{ + struct irdma_atomic_compare_swap *op_info; + u32 total_size = 0; + u16 quanta = 2; + u32 wqe_idx; + __le64 *wqe; + u64 hdr; + + op_info = &info->op.atomic_compare_swap; + wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta, total_size, + info); + if (!wqe) + return -ENOMEM; + + set_64bit_val(wqe, 0, op_info->tagged_offset); + set_64bit_val(wqe, 8, + FIELD_PREP(IRDMAQPSQ_STAG, op_info->stag)); + set_64bit_val(wqe, 16, op_info->remote_tagged_offset); + + hdr = 
FIELD_PREP(IRDMAQPSQ_ADDFRAGCNT, 1) | + FIELD_PREP(IRDMAQPSQ_REMOTE_STAG, op_info->remote_stag) | + FIELD_PREP(IRDMAQPSQ_OPCODE, IRDMAQP_OP_ATOMIC_COMPARE_SWAP_ADD) | + FIELD_PREP(IRDMAQPSQ_READFENCE, info->read_fence) | + FIELD_PREP(IRDMAQPSQ_LOCALFENCE, info->local_fence) | + FIELD_PREP(IRDMAQPSQ_SIGCOMPL, info->signaled) | + FIELD_PREP(IRDMAQPSQ_VALID, qp->swqe_polarity); + + set_64bit_val(wqe, 32, op_info->swap_data_bytes); + set_64bit_val(wqe, 40, op_info->compare_data_bytes); + set_64bit_val(wqe, 48, 0); + set_64bit_val(wqe, 56, + FIELD_PREP(IRDMAQPSQ_VALID, qp->swqe_polarity)); + + dma_wmb(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, 24, hdr); + + if (post_sq) + irdma_uk_qp_post_wr(qp); + + return 0; +} + +/** + * irdma_uk_srq_post_receive - post a receive wqe to a shared rq + * @srq: shared rq ptr + * @info: post rq information + */ +int irdma_uk_srq_post_receive(struct irdma_srq_uk *srq, + struct irdma_post_rq_info *info) +{ + u32 wqe_idx, i, byte_off; + u32 addl_frag_cnt; + __le64 *wqe; + u64 hdr; + + if (srq->max_srq_frag_cnt < info->num_sges) + return -EINVAL; + + wqe = irdma_srq_get_next_recv_wqe(srq, &wqe_idx); + if (!wqe) + return -ENOMEM; + + addl_frag_cnt = info->num_sges > 1 ? info->num_sges - 1 : 0; + srq->wqe_ops.iw_set_fragment(wqe, 0, info->sg_list, + srq->srwqe_polarity); + + for (i = 1, byte_off = 32; i < info->num_sges; i++) { + srq->wqe_ops.iw_set_fragment(wqe, byte_off, &info->sg_list[i], + srq->srwqe_polarity); + byte_off += 16; + } + + /* if not an odd number set valid bit in next fragment */ + if (srq->uk_attrs->hw_rev >= IRDMA_GEN_2 && !(info->num_sges & 0x01) && + info->num_sges) { + srq->wqe_ops.iw_set_fragment(wqe, byte_off, NULL, + srq->srwqe_polarity); + if (srq->uk_attrs->hw_rev == IRDMA_GEN_2) + ++addl_frag_cnt; + } + + set_64bit_val(wqe, 16, (u64)info->wr_id); + hdr = FIELD_PREP(IRDMAQPSQ_ADDFRAGCNT, addl_frag_cnt) | + FIELD_PREP(IRDMAQPSQ_VALID, srq->srwqe_polarity); + + dma_wmb(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, 24, hdr); + + set_64bit_val(srq->shadow_area, 0, (wqe_idx + 1) % srq->srq_ring.size); + + return 0; +} + +/** * irdma_uk_rdma_read - rdma read command * @qp: hw qp ptr * @info: post sq information @@ -973,6 +1147,9 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, u64 comp_ctx, qword0, qword2, qword3; __le64 *cqe; struct irdma_qp_uk *qp; + struct irdma_srq_uk *srq; + struct qp_err_code qp_err; + u8 is_srq; struct irdma_ring *pring = NULL; u32 wqe_idx; int ret_code; @@ -1046,21 +1223,46 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, } info->q_type = (u8)FIELD_GET(IRDMA_CQ_SQ, qword3); + is_srq = (u8)FIELD_GET(IRDMA_CQ_SRQ, qword3); info->error = (bool)FIELD_GET(IRDMA_CQ_ERROR, qword3); info->ipv4 = (bool)FIELD_GET(IRDMACQ_IPV4, qword3); + get_64bit_val(cqe, 8, &comp_ctx); + if (is_srq) + get_64bit_val(cqe, 40, (u64 *)&qp); + else + qp = (struct irdma_qp_uk *)(unsigned long)comp_ctx; if (info->error) { info->major_err = FIELD_GET(IRDMA_CQ_MAJERR, qword3); info->minor_err = FIELD_GET(IRDMA_CQ_MINERR, qword3); - if (info->major_err == IRDMA_FLUSH_MAJOR_ERR) { - info->comp_status = IRDMA_COMPL_STATUS_FLUSHED; + switch (info->major_err) { + case IRDMA_SRQFLUSH_RSVD_MAJOR_ERR: + qp_err = irdma_ae_to_qp_err_code(info->minor_err); + info->minor_err = qp_err.flush_code; + fallthrough; + case IRDMA_FLUSH_MAJOR_ERR: /* Set the min error to standard flush error code for remaining cqes */ if (info->minor_err != FLUSH_GENERAL_ERR) { qword3 &= ~IRDMA_CQ_MINERR; 
qword3 |= FIELD_PREP(IRDMA_CQ_MINERR, FLUSH_GENERAL_ERR); set_64bit_val(cqe, 24, qword3); } - } else { - info->comp_status = IRDMA_COMPL_STATUS_UNKNOWN; + info->comp_status = IRDMA_COMPL_STATUS_FLUSHED; + break; + default: +#define IRDMA_CIE_SIGNATURE 0xE +#define IRDMA_CQMAJERR_HIGH_NIBBLE GENMASK(15, 12) + if (info->q_type == IRDMA_CQE_QTYPE_SQ && + qp->qp_type == IRDMA_QP_TYPE_ROCE_UD && + FIELD_GET(IRDMA_CQMAJERR_HIGH_NIBBLE, info->major_err) + == IRDMA_CIE_SIGNATURE) { + info->error = 0; + info->major_err = 0; + info->minor_err = 0; + info->comp_status = IRDMA_COMPL_STATUS_SUCCESS; + } else { + info->comp_status = IRDMA_COMPL_STATUS_UNKNOWN; + } + break; } } else { info->comp_status = IRDMA_COMPL_STATUS_SUCCESS; @@ -1069,7 +1271,6 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, get_64bit_val(cqe, 0, &qword0); get_64bit_val(cqe, 16, &qword2); - info->tcp_seq_num_rtt = (u32)FIELD_GET(IRDMACQ_TCPSEQNUMRTT, qword0); info->qp_id = (u32)FIELD_GET(IRDMACQ_QPID, qword2); info->ud_src_qpn = (u32)FIELD_GET(IRDMACQ_UDSRCQPN, qword2); @@ -1085,7 +1286,22 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, info->qp_handle = (irdma_qp_handle)(unsigned long)qp; info->op_type = (u8)FIELD_GET(IRDMACQ_OP, qword3); - if (info->q_type == IRDMA_CQE_QTYPE_RQ) { + if (info->q_type == IRDMA_CQE_QTYPE_RQ && is_srq) { + srq = qp->srq_uk; + + get_64bit_val(cqe, 8, &info->wr_id); + info->bytes_xfered = (u32)FIELD_GET(IRDMACQ_PAYLDLEN, qword0); + + if (qword3 & IRDMACQ_STAG) { + info->stag_invalid_set = true; + info->inv_stag = (u32)FIELD_GET(IRDMACQ_INVSTAG, + qword2); + } else { + info->stag_invalid_set = false; + } + IRDMA_RING_MOVE_TAIL(srq->srq_ring); + pring = &srq->srq_ring; + } else if (info->q_type == IRDMA_CQE_QTYPE_RQ && !is_srq) { u32 array_idx; array_idx = wqe_idx / qp->rq_wqe_size_multiplier; @@ -1180,9 +1396,15 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, ret_code = 0; exit: - if (!ret_code && info->comp_status == IRDMA_COMPL_STATUS_FLUSHED) + if (!ret_code && info->comp_status == IRDMA_COMPL_STATUS_FLUSHED) { if (pring && IRDMA_RING_MORE_WORK(*pring)) - move_cq_head = false; + /* Park CQ head during a flush to generate additional CQEs + * from SW for all unprocessed WQEs. For GEN3 and beyond + * FW will generate/flush these CQEs so move to the next CQE + */ + move_cq_head = qp->uk_attrs->hw_rev <= IRDMA_GEN_2 ? 
+ false : true; + } if (move_cq_head) { IRDMA_RING_MOVE_HEAD_NOCHECK(cq->cq_ring); @@ -1210,10 +1432,10 @@ exit: } /** - * irdma_qp_round_up - return round up qp wq depth + * irdma_round_up_wq - return round up qp wq depth * @wqdepth: wq depth in quanta to round up */ -static int irdma_qp_round_up(u32 wqdepth) +static int irdma_round_up_wq(u32 wqdepth) { int scount = 1; @@ -1268,7 +1490,7 @@ int irdma_get_sqdepth(struct irdma_uk_attrs *uk_attrs, u32 sq_size, u8 shift, { u32 min_size = (u32)uk_attrs->min_hw_wq_size << shift; - *sqdepth = irdma_qp_round_up((sq_size << shift) + IRDMA_SQ_RSVD); + *sqdepth = irdma_round_up_wq((sq_size << shift) + IRDMA_SQ_RSVD); if (*sqdepth < min_size) *sqdepth = min_size; @@ -1290,7 +1512,7 @@ int irdma_get_rqdepth(struct irdma_uk_attrs *uk_attrs, u32 rq_size, u8 shift, { u32 min_size = (u32)uk_attrs->min_hw_wq_size << shift; - *rqdepth = irdma_qp_round_up((rq_size << shift) + IRDMA_RQ_RSVD); + *rqdepth = irdma_round_up_wq((rq_size << shift) + IRDMA_RQ_RSVD); if (*rqdepth < min_size) *rqdepth = min_size; @@ -1300,6 +1522,26 @@ int irdma_get_rqdepth(struct irdma_uk_attrs *uk_attrs, u32 rq_size, u8 shift, return 0; } +/* + * irdma_get_srqdepth - get SRQ depth (quanta) + * @uk_attrs: qp HW attributes + * @srq_size: SRQ size + * @shift: shift which determines size of WQE + * @srqdepth: depth of SRQ + */ +int irdma_get_srqdepth(struct irdma_uk_attrs *uk_attrs, u32 srq_size, u8 shift, + u32 *srqdepth) +{ + *srqdepth = irdma_round_up_wq((srq_size << shift) + IRDMA_RQ_RSVD); + + if (*srqdepth < ((u32)uk_attrs->min_hw_wq_size << shift)) + *srqdepth = uk_attrs->min_hw_wq_size << shift; + else if (*srqdepth > uk_attrs->max_hw_srq_quanta) + return -EINVAL; + + return 0; +} + static const struct irdma_wqe_uk_ops iw_wqe_uk_ops = { .iw_copy_inline_data = irdma_copy_inline_data, .iw_inline_data_size_to_quanta = irdma_inline_data_size_to_quanta, @@ -1336,6 +1578,42 @@ static void irdma_setup_connection_wqes(struct irdma_qp_uk *qp, } /** + * irdma_uk_srq_init - initialize shared qp + * @srq: hw srq (user and kernel) + * @info: srq initialization info + * + * Initializes the vars used in both user and kernel mode. + * The size of the wqe depends on number of max fragments + * allowed. Then size of wqe * the number of wqes should be the + * amount of memory allocated for srq. 
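 * (For the kernel-mode path added later in this patch,
 * irdma_setup_kmode_srq() sizes that buffer as depth * IRDMA_QP_WQE_MIN_SIZE
 * bytes for the ring, with depth coming from irdma_get_srqdepth(), plus the
 * shadow area.)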
+ */ +int irdma_uk_srq_init(struct irdma_srq_uk *srq, + struct irdma_srq_uk_init_info *info) +{ + u8 rqshift; + + srq->uk_attrs = info->uk_attrs; + if (info->max_srq_frag_cnt > srq->uk_attrs->max_hw_wq_frags) + return -EINVAL; + + irdma_get_wqe_shift(srq->uk_attrs, info->max_srq_frag_cnt, 0, &rqshift); + srq->srq_caps = info->srq_caps; + srq->srq_base = info->srq; + srq->shadow_area = info->shadow_area; + srq->srq_id = info->srq_id; + srq->srwqe_polarity = 0; + srq->srq_size = info->srq_size; + srq->wqe_size = rqshift; + srq->max_srq_frag_cnt = min(srq->uk_attrs->max_hw_wq_frags, + ((u32)2 << rqshift) - 1); + IRDMA_RING_INIT(srq->srq_ring, srq->srq_size); + srq->wqe_size_multiplier = 1 << rqshift; + srq->wqe_ops = iw_wqe_uk_ops; + + return 0; +} + +/** * irdma_uk_calc_shift_wq - calculate WQE shift for both SQ and RQ * @ukinfo: qp initialization info * @sq_shift: Returns shift of SQ @@ -1461,6 +1739,7 @@ int irdma_uk_qp_init(struct irdma_qp_uk *qp, struct irdma_qp_uk_init_info *info) qp->wqe_ops = iw_wqe_uk_ops_gen_1; else qp->wqe_ops = iw_wqe_uk_ops; + qp->srq_uk = info->srq_uk; return ret_code; } diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h index 380e4a47aede..ab57f689827a 100644 --- a/drivers/infiniband/hw/irdma/user.h +++ b/drivers/infiniband/hw/irdma/user.h @@ -41,10 +41,114 @@ #define IRDMA_OP_TYPE_INV_STAG 0x0a #define IRDMA_OP_TYPE_RDMA_READ_INV_STAG 0x0b #define IRDMA_OP_TYPE_NOP 0x0c +#define IRDMA_OP_TYPE_ATOMIC_FETCH_AND_ADD 0x0f +#define IRDMA_OP_TYPE_ATOMIC_COMPARE_AND_SWAP 0x11 #define IRDMA_OP_TYPE_REC 0x3e #define IRDMA_OP_TYPE_REC_IMM 0x3f -#define IRDMA_FLUSH_MAJOR_ERR 1 +#define IRDMA_FLUSH_MAJOR_ERR 1 +#define IRDMA_SRQFLUSH_RSVD_MAJOR_ERR 0xfffe + +/* Async Events codes */ +#define IRDMA_AE_AMP_UNALLOCATED_STAG 0x0102 +#define IRDMA_AE_AMP_INVALID_STAG 0x0103 +#define IRDMA_AE_AMP_BAD_QP 0x0104 +#define IRDMA_AE_AMP_BAD_PD 0x0105 +#define IRDMA_AE_AMP_BAD_STAG_KEY 0x0106 +#define IRDMA_AE_AMP_BAD_STAG_INDEX 0x0107 +#define IRDMA_AE_AMP_BOUNDS_VIOLATION 0x0108 +#define IRDMA_AE_AMP_RIGHTS_VIOLATION 0x0109 +#define IRDMA_AE_AMP_TO_WRAP 0x010a +#define IRDMA_AE_AMP_FASTREG_VALID_STAG 0x010c +#define IRDMA_AE_AMP_FASTREG_MW_STAG 0x010d +#define IRDMA_AE_AMP_FASTREG_INVALID_RIGHTS 0x010e +#define IRDMA_AE_AMP_FASTREG_INVALID_LENGTH 0x0110 +#define IRDMA_AE_AMP_INVALIDATE_SHARED 0x0111 +#define IRDMA_AE_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS 0x0112 +#define IRDMA_AE_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS 0x0113 +#define IRDMA_AE_AMP_MWBIND_VALID_STAG 0x0114 +#define IRDMA_AE_AMP_MWBIND_OF_MR_STAG 0x0115 +#define IRDMA_AE_AMP_MWBIND_TO_ZERO_BASED_STAG 0x0116 +#define IRDMA_AE_AMP_MWBIND_TO_MW_STAG 0x0117 +#define IRDMA_AE_AMP_MWBIND_INVALID_RIGHTS 0x0118 +#define IRDMA_AE_AMP_MWBIND_INVALID_BOUNDS 0x0119 +#define IRDMA_AE_AMP_MWBIND_TO_INVALID_PARENT 0x011a +#define IRDMA_AE_AMP_MWBIND_BIND_DISABLED 0x011b +#define IRDMA_AE_PRIV_OPERATION_DENIED 0x011c +#define IRDMA_AE_AMP_INVALIDATE_TYPE1_MW 0x011d +#define IRDMA_AE_AMP_MWBIND_ZERO_BASED_TYPE1_MW 0x011e +#define IRDMA_AE_AMP_FASTREG_INVALID_PBL_HPS_CFG 0x011f +#define IRDMA_AE_AMP_MWBIND_WRONG_TYPE 0x0120 +#define IRDMA_AE_AMP_FASTREG_PBLE_MISMATCH 0x0121 +#define IRDMA_AE_UDA_XMIT_DGRAM_TOO_LONG 0x0132 +#define IRDMA_AE_UDA_XMIT_BAD_PD 0x0133 +#define IRDMA_AE_UDA_XMIT_DGRAM_TOO_SHORT 0x0134 +#define IRDMA_AE_UDA_L4LEN_INVALID 0x0135 +#define IRDMA_AE_BAD_CLOSE 0x0201 +#define IRDMA_AE_RDMAP_ROE_BAD_LLP_CLOSE 0x0202 +#define IRDMA_AE_CQ_OPERATION_ERROR 0x0203 +#define 
IRDMA_AE_RDMA_READ_WHILE_ORD_ZERO 0x0205 +#define IRDMA_AE_STAG_ZERO_INVALID 0x0206 +#define IRDMA_AE_IB_RREQ_AND_Q1_FULL 0x0207 +#define IRDMA_AE_IB_INVALID_REQUEST 0x0208 +#define IRDMA_AE_SRQ_LIMIT 0x0209 +#define IRDMA_AE_WQE_UNEXPECTED_OPCODE 0x020a +#define IRDMA_AE_WQE_INVALID_PARAMETER 0x020b +#define IRDMA_AE_WQE_INVALID_FRAG_DATA 0x020c +#define IRDMA_AE_IB_REMOTE_ACCESS_ERROR 0x020d +#define IRDMA_AE_IB_REMOTE_OP_ERROR 0x020e +#define IRDMA_AE_SRQ_CATASTROPHIC_ERROR 0x020f +#define IRDMA_AE_WQE_LSMM_TOO_LONG 0x0220 +#define IRDMA_AE_ATOMIC_ALIGNMENT 0x0221 +#define IRDMA_AE_ATOMIC_MASK 0x0222 +#define IRDMA_AE_INVALID_REQUEST 0x0223 +#define IRDMA_AE_PCIE_ATOMIC_DISABLE 0x0224 +#define IRDMA_AE_DDP_INVALID_MSN_GAP_IN_MSN 0x0301 +#define IRDMA_AE_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER 0x0303 +#define IRDMA_AE_DDP_UBE_INVALID_DDP_VERSION 0x0304 +#define IRDMA_AE_DDP_UBE_INVALID_MO 0x0305 +#define IRDMA_AE_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE 0x0306 +#define IRDMA_AE_DDP_UBE_INVALID_QN 0x0307 +#define IRDMA_AE_DDP_NO_L_BIT 0x0308 +#define IRDMA_AE_RDMAP_ROE_INVALID_RDMAP_VERSION 0x0311 +#define IRDMA_AE_RDMAP_ROE_UNEXPECTED_OPCODE 0x0312 +#define IRDMA_AE_ROE_INVALID_RDMA_READ_REQUEST 0x0313 +#define IRDMA_AE_ROE_INVALID_RDMA_WRITE_OR_READ_RESP 0x0314 +#define IRDMA_AE_ROCE_RSP_LENGTH_ERROR 0x0316 +#define IRDMA_AE_ROCE_EMPTY_MCG 0x0380 +#define IRDMA_AE_ROCE_BAD_MC_IP_ADDR 0x0381 +#define IRDMA_AE_ROCE_BAD_MC_QPID 0x0382 +#define IRDMA_AE_MCG_QP_PROTOCOL_MISMATCH 0x0383 +#define IRDMA_AE_INVALID_ARP_ENTRY 0x0401 +#define IRDMA_AE_INVALID_TCP_OPTION_RCVD 0x0402 +#define IRDMA_AE_STALE_ARP_ENTRY 0x0403 +#define IRDMA_AE_INVALID_AH_ENTRY 0x0406 +#define IRDMA_AE_LLP_CLOSE_COMPLETE 0x0501 +#define IRDMA_AE_LLP_CONNECTION_RESET 0x0502 +#define IRDMA_AE_LLP_FIN_RECEIVED 0x0503 +#define IRDMA_AE_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH 0x0504 +#define IRDMA_AE_LLP_RECEIVED_MPA_CRC_ERROR 0x0505 +#define IRDMA_AE_LLP_SEGMENT_TOO_SMALL 0x0507 +#define IRDMA_AE_LLP_SYN_RECEIVED 0x0508 +#define IRDMA_AE_LLP_TERMINATE_RECEIVED 0x0509 +#define IRDMA_AE_LLP_TOO_MANY_RETRIES 0x050a +#define IRDMA_AE_LLP_TOO_MANY_KEEPALIVE_RETRIES 0x050b +#define IRDMA_AE_LLP_DOUBT_REACHABILITY 0x050c +#define IRDMA_AE_LLP_CONNECTION_ESTABLISHED 0x050e +#define IRDMA_AE_LLP_TOO_MANY_RNRS 0x050f +#define IRDMA_AE_RESOURCE_EXHAUSTION 0x0520 +#define IRDMA_AE_RESET_SENT 0x0601 +#define IRDMA_AE_TERMINATE_SENT 0x0602 +#define IRDMA_AE_RESET_NOT_SENT 0x0603 +#define IRDMA_AE_LCE_QP_CATASTROPHIC 0x0700 +#define IRDMA_AE_LCE_FUNCTION_CATASTROPHIC 0x0701 +#define IRDMA_AE_LCE_CQ_CATASTROPHIC 0x0702 +#define IRDMA_AE_REMOTE_QP_CATASTROPHIC 0x0703 +#define IRDMA_AE_LOCAL_QP_CATASTROPHIC 0x0704 +#define IRDMA_AE_RCE_QP_CATASTROPHIC 0x0705 +#define IRDMA_AE_QP_SUSPEND_COMPLETE 0x0900 +#define IRDMA_AE_CQP_DEFERRED_COMPLETE 0x0901 +#define IRDMA_AE_ADAPTER_CATASTROPHIC 0x0B0B enum irdma_device_caps_const { IRDMA_WQE_SIZE = 4, @@ -55,11 +159,12 @@ enum irdma_device_caps_const { IRDMA_CEQE_SIZE = 1, IRDMA_CQP_CTX_SIZE = 8, IRDMA_SHADOW_AREA_SIZE = 8, - IRDMA_QUERY_FPM_BUF_SIZE = 176, - IRDMA_COMMIT_FPM_BUF_SIZE = 176, + IRDMA_QUERY_FPM_BUF_SIZE = 192, + IRDMA_COMMIT_FPM_BUF_SIZE = 192, IRDMA_GATHER_STATS_BUF_SIZE = 1024, IRDMA_MIN_IW_QP_ID = 0, IRDMA_MAX_IW_QP_ID = 262143, + IRDMA_MIN_IW_SRQ_ID = 0, IRDMA_MIN_CEQID = 0, IRDMA_MAX_CEQID = 1023, IRDMA_CEQ_MAX_COUNT = IRDMA_MAX_CEQID + 1, @@ -67,6 +172,7 @@ enum irdma_device_caps_const { IRDMA_MAX_CQID = 524287, IRDMA_MIN_AEQ_ENTRIES = 1, IRDMA_MAX_AEQ_ENTRIES 
= 524287, + IRDMA_MAX_AEQ_ENTRIES_GEN_3 = 262144, IRDMA_MIN_CEQ_ENTRIES = 1, IRDMA_MAX_CEQ_ENTRIES = 262143, IRDMA_MIN_CQ_SIZE = 1, @@ -105,6 +211,13 @@ enum irdma_flush_opcode { FLUSH_RETRY_EXC_ERR, FLUSH_MW_BIND_ERR, FLUSH_REM_INV_REQ_ERR, + FLUSH_RNR_RETRY_EXC_ERR, +}; + +enum irdma_qp_event_type { + IRDMA_QP_EVENT_CATASTROPHIC, + IRDMA_QP_EVENT_ACCESS_ERR, + IRDMA_QP_EVENT_REQ_ERR, }; enum irdma_cmpl_status { @@ -147,6 +260,8 @@ enum irdma_qp_caps { IRDMA_PUSH_MODE = 8, }; +struct irdma_srq_uk; +struct irdma_srq_uk_init_info; struct irdma_qp_uk; struct irdma_cq_uk; struct irdma_qp_uk_init_info; @@ -201,6 +316,24 @@ struct irdma_bind_window { bool ena_writes:1; irdma_stag mw_stag; bool mem_window_type_1:1; + bool remote_atomics_en:1; +}; + +struct irdma_atomic_fetch_add { + u64 tagged_offset; + u64 remote_tagged_offset; + u64 fetch_add_data_bytes; + u32 stag; + u32 remote_stag; +}; + +struct irdma_atomic_compare_swap { + u64 tagged_offset; + u64 remote_tagged_offset; + u64 swap_data_bytes; + u64 compare_data_bytes; + u32 stag; + u32 remote_stag; }; struct irdma_inv_local_stag { @@ -219,6 +352,7 @@ struct irdma_post_sq_info { bool report_rtt:1; bool udp_hdr:1; bool defer_flag:1; + bool remote_atomic_en:1; u32 imm_data; u32 stag_to_inv; union { @@ -227,6 +361,8 @@ struct irdma_post_sq_info { struct irdma_rdma_read rdma_read; struct irdma_bind_window bind_window; struct irdma_inv_local_stag inv_local_stag; + struct irdma_atomic_fetch_add atomic_fetch_add; + struct irdma_atomic_compare_swap atomic_compare_swap; } op; }; @@ -255,6 +391,15 @@ struct irdma_cq_poll_info { bool imm_valid:1; }; +struct qp_err_code { + enum irdma_flush_opcode flush_code; + enum irdma_qp_event_type event_type; +}; + +int irdma_uk_atomic_compare_swap(struct irdma_qp_uk *qp, + struct irdma_post_sq_info *info, bool post_sq); +int irdma_uk_atomic_fetch_add(struct irdma_qp_uk *qp, + struct irdma_post_sq_info *info, bool post_sq); int irdma_uk_inline_rdma_write(struct irdma_qp_uk *qp, struct irdma_post_sq_info *info, bool post_sq); int irdma_uk_inline_send(struct irdma_qp_uk *qp, @@ -300,6 +445,39 @@ int irdma_uk_calc_depth_shift_sq(struct irdma_qp_uk_init_info *ukinfo, u32 *sq_depth, u8 *sq_shift); int irdma_uk_calc_depth_shift_rq(struct irdma_qp_uk_init_info *ukinfo, u32 *rq_depth, u8 *rq_shift); +int irdma_uk_srq_init(struct irdma_srq_uk *srq, + struct irdma_srq_uk_init_info *info); +int irdma_uk_srq_post_receive(struct irdma_srq_uk *srq, + struct irdma_post_rq_info *info); + +struct irdma_srq_uk { + u32 srq_caps; + struct irdma_qp_quanta *srq_base; + struct irdma_uk_attrs *uk_attrs; + __le64 *shadow_area; + struct irdma_ring srq_ring; + struct irdma_ring initial_ring; + u32 srq_id; + u32 srq_size; + u32 max_srq_frag_cnt; + struct irdma_wqe_uk_ops wqe_ops; + u8 srwqe_polarity; + u8 wqe_size; + u8 wqe_size_multiplier; + u8 deferred_flag; +}; + +struct irdma_srq_uk_init_info { + struct irdma_qp_quanta *srq; + struct irdma_uk_attrs *uk_attrs; + __le64 *shadow_area; + u64 *srq_wrid_array; + u32 srq_id; + u32 srq_caps; + u32 srq_size; + u32 max_srq_frag_cnt; +}; + struct irdma_sq_uk_wr_trk_info { u64 wrid; u32 wr_len; @@ -344,6 +522,7 @@ struct irdma_qp_uk { bool destroy_pending:1; /* Indicates the QP is being destroyed */ void *back_qp; u8 dbg_rq_flushed; + struct irdma_srq_uk *srq_uk; u8 sq_flush_seen; u8 rq_flush_seen; }; @@ -383,6 +562,7 @@ struct irdma_qp_uk_init_info { u8 rq_shift; int abi_ver; bool legacy_mode; + struct irdma_srq_uk *srq_uk; }; struct irdma_cq_uk_init_info { @@ -398,6 +578,7 @@ struct 
irdma_cq_uk_init_info { __le64 *irdma_qp_get_next_send_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx, u16 quanta, u32 total_size, struct irdma_post_sq_info *info); +__le64 *irdma_srq_get_next_recv_wqe(struct irdma_srq_uk *srq, u32 *wqe_idx); __le64 *irdma_qp_get_next_recv_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx); void irdma_uk_clean_cq(void *q, struct irdma_cq_uk *cq); int irdma_nop(struct irdma_qp_uk *qp, u64 wr_id, bool signaled, bool post_sq); @@ -409,5 +590,85 @@ int irdma_get_sqdepth(struct irdma_uk_attrs *uk_attrs, u32 sq_size, u8 shift, u32 *wqdepth); int irdma_get_rqdepth(struct irdma_uk_attrs *uk_attrs, u32 rq_size, u8 shift, u32 *wqdepth); +int irdma_get_srqdepth(struct irdma_uk_attrs *uk_attrs, u32 srq_size, u8 shift, + u32 *srqdepth); void irdma_clr_wqes(struct irdma_qp_uk *qp, u32 qp_wqe_idx); + +static inline struct qp_err_code irdma_ae_to_qp_err_code(u16 ae_id) +{ + struct qp_err_code qp_err = {}; + + switch (ae_id) { + case IRDMA_AE_AMP_BOUNDS_VIOLATION: + case IRDMA_AE_AMP_INVALID_STAG: + case IRDMA_AE_AMP_RIGHTS_VIOLATION: + case IRDMA_AE_AMP_UNALLOCATED_STAG: + case IRDMA_AE_AMP_BAD_PD: + case IRDMA_AE_AMP_BAD_QP: + case IRDMA_AE_AMP_BAD_STAG_KEY: + case IRDMA_AE_AMP_BAD_STAG_INDEX: + case IRDMA_AE_AMP_TO_WRAP: + case IRDMA_AE_PRIV_OPERATION_DENIED: + qp_err.flush_code = FLUSH_PROT_ERR; + qp_err.event_type = IRDMA_QP_EVENT_ACCESS_ERR; + break; + case IRDMA_AE_UDA_XMIT_BAD_PD: + case IRDMA_AE_WQE_UNEXPECTED_OPCODE: + qp_err.flush_code = FLUSH_LOC_QP_OP_ERR; + qp_err.event_type = IRDMA_QP_EVENT_CATASTROPHIC; + break; + case IRDMA_AE_UDA_XMIT_DGRAM_TOO_SHORT: + case IRDMA_AE_UDA_XMIT_DGRAM_TOO_LONG: + case IRDMA_AE_UDA_L4LEN_INVALID: + case IRDMA_AE_DDP_UBE_INVALID_MO: + case IRDMA_AE_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: + qp_err.flush_code = FLUSH_LOC_LEN_ERR; + qp_err.event_type = IRDMA_QP_EVENT_CATASTROPHIC; + break; + case IRDMA_AE_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS: + case IRDMA_AE_IB_REMOTE_ACCESS_ERROR: + qp_err.flush_code = FLUSH_REM_ACCESS_ERR; + qp_err.event_type = IRDMA_QP_EVENT_ACCESS_ERR; + break; + case IRDMA_AE_AMP_MWBIND_INVALID_RIGHTS: + case IRDMA_AE_AMP_MWBIND_BIND_DISABLED: + case IRDMA_AE_AMP_MWBIND_INVALID_BOUNDS: + case IRDMA_AE_AMP_MWBIND_VALID_STAG: + qp_err.flush_code = FLUSH_MW_BIND_ERR; + qp_err.event_type = IRDMA_QP_EVENT_ACCESS_ERR; + break; + case IRDMA_AE_LLP_TOO_MANY_RETRIES: + qp_err.flush_code = FLUSH_RETRY_EXC_ERR; + qp_err.event_type = IRDMA_QP_EVENT_CATASTROPHIC; + break; + case IRDMA_AE_IB_INVALID_REQUEST: + qp_err.flush_code = FLUSH_REM_INV_REQ_ERR; + qp_err.event_type = IRDMA_QP_EVENT_REQ_ERR; + break; + case IRDMA_AE_LLP_SEGMENT_TOO_SMALL: + case IRDMA_AE_LLP_RECEIVED_MPA_CRC_ERROR: + case IRDMA_AE_ROCE_RSP_LENGTH_ERROR: + case IRDMA_AE_IB_REMOTE_OP_ERROR: + qp_err.flush_code = FLUSH_REM_OP_ERR; + qp_err.event_type = IRDMA_QP_EVENT_CATASTROPHIC; + break; + case IRDMA_AE_LLP_TOO_MANY_RNRS: + qp_err.flush_code = FLUSH_RNR_RETRY_EXC_ERR; + qp_err.event_type = IRDMA_QP_EVENT_CATASTROPHIC; + break; + case IRDMA_AE_LCE_QP_CATASTROPHIC: + case IRDMA_AE_REMOTE_QP_CATASTROPHIC: + case IRDMA_AE_LOCAL_QP_CATASTROPHIC: + case IRDMA_AE_RCE_QP_CATASTROPHIC: + qp_err.flush_code = FLUSH_FATAL_ERR; + qp_err.event_type = IRDMA_QP_EVENT_CATASTROPHIC; + break; + default: + qp_err.flush_code = FLUSH_GENERAL_ERR; + qp_err.event_type = IRDMA_QP_EVENT_CATASTROPHIC; + break; + } + + return qp_err; +} #endif /* IRDMA_USER_H */ diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index b510ef747399..8b94d87b0192 
100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -481,6 +481,7 @@ void irdma_free_cqp_request(struct irdma_cqp *cqp, WRITE_ONCE(cqp_request->request_done, false); cqp_request->callback_fcn = NULL; cqp_request->waiting = false; + cqp_request->pending = false; spin_lock_irqsave(&cqp->req_lock, flags); list_add_tail(&cqp_request->list, &cqp->cqp_avail_reqs); @@ -521,6 +522,22 @@ irdma_free_pending_cqp_request(struct irdma_cqp *cqp, } /** + * irdma_cleanup_deferred_cqp_ops - clean-up cqp with no completions + * @dev: sc_dev + * @cqp: cqp + */ +static void irdma_cleanup_deferred_cqp_ops(struct irdma_sc_dev *dev, + struct irdma_cqp *cqp) +{ + u64 scratch; + + /* process all CQP requests with deferred/pending completions */ + while ((scratch = irdma_sc_cqp_cleanup_handler(dev))) + irdma_free_pending_cqp_request(cqp, (struct irdma_cqp_request *) + (uintptr_t)scratch); +} + +/** * irdma_cleanup_pending_cqp_op - clean-up cqp with no * completions * @rf: RDMA PCI function @@ -533,6 +550,8 @@ void irdma_cleanup_pending_cqp_op(struct irdma_pci_f *rf) struct cqp_cmds_info *pcmdinfo = NULL; u32 i, pending_work, wqe_idx; + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) + irdma_cleanup_deferred_cqp_ops(dev, cqp); pending_work = IRDMA_RING_USED_QUANTA(cqp->sc_cqp.sq_ring); wqe_idx = IRDMA_RING_CURRENT_TAIL(cqp->sc_cqp.sq_ring); for (i = 0; i < pending_work; i++) { @@ -552,6 +571,26 @@ void irdma_cleanup_pending_cqp_op(struct irdma_pci_f *rf) } } +static int irdma_get_timeout_threshold(struct irdma_sc_dev *dev) +{ + u16 time_s = dev->vc_caps.cqp_timeout_s; + + if (!time_s) + return CQP_TIMEOUT_THRESHOLD; + + return time_s * 1000 / dev->hw_attrs.max_cqp_compl_wait_time_ms; +} + +static int irdma_get_def_timeout_threshold(struct irdma_sc_dev *dev) +{ + u16 time_s = dev->vc_caps.cqp_def_timeout_s; + + if (!time_s) + return CQP_DEF_CMPL_TIMEOUT_THRESHOLD; + + return time_s * 1000 / dev->hw_attrs.max_cqp_compl_wait_time_ms; +} + /** * irdma_wait_event - wait for completion * @rf: RDMA PCI function @@ -561,6 +600,7 @@ static int irdma_wait_event(struct irdma_pci_f *rf, struct irdma_cqp_request *cqp_request) { struct irdma_cqp_timeout cqp_timeout = {}; + int timeout_threshold = irdma_get_timeout_threshold(&rf->sc_dev); bool cqp_error = false; int err_code = 0; @@ -572,9 +612,17 @@ static int irdma_wait_event(struct irdma_pci_f *rf, msecs_to_jiffies(CQP_COMPL_WAIT_TIME_MS))) break; + if (cqp_request->pending) + /* There was a deferred or pending completion + * received for this CQP request, so we need + * to wait longer than usual. 
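 * The longer threshold comes from irdma_get_def_timeout_threshold()
 * above: vc_caps.cqp_def_timeout_s converted into a count of
 * max_cqp_compl_wait_time_ms polling intervals.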
+ */ + timeout_threshold = + irdma_get_def_timeout_threshold(&rf->sc_dev); + irdma_check_cqp_progress(&cqp_timeout, &rf->sc_dev); - if (cqp_timeout.count < CQP_TIMEOUT_THRESHOLD) + if (cqp_timeout.count < timeout_threshold) continue; if (!rf->reset) { @@ -649,6 +697,9 @@ static const char *const irdma_cqp_cmd_names[IRDMA_MAX_CQP_OPS] = { [IRDMA_OP_ADD_LOCAL_MAC_ENTRY] = "Add Local MAC Entry Cmd", [IRDMA_OP_DELETE_LOCAL_MAC_ENTRY] = "Delete Local MAC Entry Cmd", [IRDMA_OP_CQ_MODIFY] = "CQ Modify Cmd", + [IRDMA_OP_SRQ_CREATE] = "Create SRQ Cmd", + [IRDMA_OP_SRQ_MODIFY] = "Modify SRQ Cmd", + [IRDMA_OP_SRQ_DESTROY] = "Destroy SRQ Cmd", }; static const struct irdma_cqp_err_info irdma_noncrit_err_list[] = { @@ -1065,6 +1116,26 @@ static void irdma_dealloc_push_page(struct irdma_pci_f *rf, irdma_put_cqp_request(&rf->cqp, cqp_request); } +static void irdma_free_gsi_qp_rsrc(struct irdma_qp *iwqp, u32 qp_num) +{ + struct irdma_device *iwdev = iwqp->iwdev; + struct irdma_pci_f *rf = iwdev->rf; + unsigned long flags; + + if (rf->sc_dev.hw_attrs.uk_attrs.hw_rev < IRDMA_GEN_3) + return; + + irdma_vchnl_req_del_vport(&rf->sc_dev, iwdev->vport_id, qp_num); + + if (qp_num == 1) { + spin_lock_irqsave(&rf->rsrc_lock, flags); + rf->hwqp1_rsvd = false; + spin_unlock_irqrestore(&rf->rsrc_lock, flags); + } else if (qp_num > 2) { + irdma_free_rsrc(rf, rf->allocated_qps, qp_num); + } +} + /** * irdma_free_qp_rsrc - free up memory resources for qp * @iwqp: qp ptr (user or kernel) @@ -1073,7 +1144,7 @@ void irdma_free_qp_rsrc(struct irdma_qp *iwqp) { struct irdma_device *iwdev = iwqp->iwdev; struct irdma_pci_f *rf = iwdev->rf; - u32 qp_num = iwqp->ibqp.qp_num; + u32 qp_num = iwqp->sc_qp.qp_uk.qp_id; irdma_ieq_cleanup_qp(iwdev->vsi.ieq, &iwqp->sc_qp); irdma_dealloc_push_page(rf, &iwqp->sc_qp); @@ -1083,8 +1154,12 @@ void irdma_free_qp_rsrc(struct irdma_qp *iwqp) iwqp->sc_qp.user_pri); } - if (qp_num > 2) - irdma_free_rsrc(rf, rf->allocated_qps, qp_num); + if (iwqp->ibqp.qp_type == IB_QPT_GSI) { + irdma_free_gsi_qp_rsrc(iwqp, qp_num); + } else { + if (qp_num > 2) + irdma_free_rsrc(rf, rf->allocated_qps, qp_num); + } dma_free_coherent(rf->sc_dev.hw->device, iwqp->q2_ctx_mem.size, iwqp->q2_ctx_mem.va, iwqp->q2_ctx_mem.pa); iwqp->q2_ctx_mem.va = NULL; @@ -1096,6 +1171,30 @@ void irdma_free_qp_rsrc(struct irdma_qp *iwqp) } /** + * irdma_srq_wq_destroy - send srq destroy cqp + * @rf: RDMA PCI function + * @srq: hardware control srq + */ +void irdma_srq_wq_destroy(struct irdma_pci_f *rf, struct irdma_sc_srq *srq) +{ + struct irdma_cqp_request *cqp_request; + struct cqp_cmds_info *cqp_info; + + cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); + if (!cqp_request) + return; + + cqp_info = &cqp_request->info; + cqp_info->cqp_cmd = IRDMA_OP_SRQ_DESTROY; + cqp_info->post_sq = 1; + cqp_info->in.u.srq_destroy.srq = srq; + cqp_info->in.u.srq_destroy.scratch = (uintptr_t)cqp_request; + + irdma_handle_cqp_op(rf, cqp_request); + irdma_put_cqp_request(&rf->cqp, cqp_request); +} + +/** * irdma_cq_wq_destroy - send cq destroy cqp * @rf: RDMA PCI function * @cq: hardware control cq @@ -2266,7 +2365,10 @@ bool irdma_cq_empty(struct irdma_cq *iwcq) u8 polarity; ukcq = &iwcq->sc_cq.cq_uk; - cqe = IRDMA_GET_CURRENT_CQ_ELEM(ukcq); + if (ukcq->avoid_mem_cflct) + cqe = IRDMA_GET_CURRENT_EXTENDED_CQ_ELEM(ukcq); + else + cqe = IRDMA_GET_CURRENT_CQ_ELEM(ukcq); get_64bit_val(cqe, 24, &qword3); polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3); diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 
da5a41b275d8..76ce6137f2ba 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -41,7 +41,8 @@ static int irdma_query_device(struct ib_device *ibdev, props->max_cq = rf->max_cq - rf->used_cqs; props->max_cqe = rf->max_cqe - 1; props->max_mr = rf->max_mr - rf->used_mrs; - props->max_mw = props->max_mr; + if (hw_attrs->uk_attrs.hw_rev >= IRDMA_GEN_3) + props->max_mw = props->max_mr; props->max_pd = rf->max_pd - rf->used_pds; props->max_sge_rd = hw_attrs->uk_attrs.max_hw_read_sges; props->max_qp_rd_atom = hw_attrs->max_hw_ird; @@ -56,9 +57,21 @@ static int irdma_query_device(struct ib_device *ibdev, props->max_mcast_qp_attach = IRDMA_MAX_MGS_PER_CTX; props->max_total_mcast_qp_attach = rf->max_qp * IRDMA_MAX_MGS_PER_CTX; props->max_fast_reg_page_list_len = IRDMA_MAX_PAGES_PER_FMR; -#define HCA_CLOCK_TIMESTAMP_MASK 0x1ffff - if (hw_attrs->uk_attrs.hw_rev >= IRDMA_GEN_2) - props->timestamp_mask = HCA_CLOCK_TIMESTAMP_MASK; + props->max_srq = rf->max_srq - rf->used_srqs; + props->max_srq_wr = IRDMA_MAX_SRQ_WRS; + props->max_srq_sge = hw_attrs->uk_attrs.max_hw_wq_frags; + if (hw_attrs->uk_attrs.feature_flags & IRDMA_FEATURE_ATOMIC_OPS) + props->atomic_cap = IB_ATOMIC_HCA; + else + props->atomic_cap = IB_ATOMIC_NONE; + props->masked_atomic_cap = props->atomic_cap; + if (hw_attrs->uk_attrs.hw_rev >= IRDMA_GEN_3) { +#define HCA_CORE_CLOCK_KHZ 1000000UL + props->timestamp_mask = GENMASK(31, 0); + props->hca_core_clock = HCA_CORE_CLOCK_KHZ; + } + if (hw_attrs->uk_attrs.hw_rev >= IRDMA_GEN_3) + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B; return 0; } @@ -292,6 +305,10 @@ static int irdma_alloc_ucontext(struct ib_ucontext *uctx, ucontext->iwdev = iwdev; ucontext->abi_ver = req.userspace_ver; + if (!(req.comp_mask & IRDMA_SUPPORT_WQE_FORMAT_V2) && + uk_attrs->hw_rev >= IRDMA_GEN_3) + return -EOPNOTSUPP; + if (req.comp_mask & IRDMA_ALLOC_UCTX_USE_RAW_ATTR) ucontext->use_raw_attrs = true; @@ -332,6 +349,8 @@ static int irdma_alloc_ucontext(struct ib_ucontext *uctx, uresp.comp_mask |= IRDMA_ALLOC_UCTX_USE_RAW_ATTR; uresp.min_hw_wq_size = uk_attrs->min_hw_wq_size; uresp.comp_mask |= IRDMA_ALLOC_UCTX_MIN_HW_WQ_SIZE; + uresp.max_hw_srq_quanta = uk_attrs->max_hw_srq_quanta; + uresp.comp_mask |= IRDMA_ALLOC_UCTX_MAX_HW_SRQ_QUANTA; if (ib_copy_to_udata(udata, &uresp, min(sizeof(uresp), udata->outlen))) { rdma_user_mmap_entry_remove(ucontext->db_mmap_entry); @@ -343,6 +362,8 @@ static int irdma_alloc_ucontext(struct ib_ucontext *uctx, spin_lock_init(&ucontext->cq_reg_mem_list_lock); INIT_LIST_HEAD(&ucontext->qp_reg_mem_list); spin_lock_init(&ucontext->qp_reg_mem_list_lock); + INIT_LIST_HEAD(&ucontext->srq_reg_mem_list); + spin_lock_init(&ucontext->srq_reg_mem_list_lock); return 0; @@ -521,7 +542,7 @@ static int irdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) iwqp->sc_qp.qp_uk.destroy_pending = true; - if (iwqp->iwarp_state == IRDMA_QP_STATE_RTS) + if (iwqp->iwarp_state >= IRDMA_QP_STATE_IDLE) irdma_modify_qp_to_err(&iwqp->sc_qp); if (!iwqp->user_mode) @@ -541,6 +562,9 @@ static int irdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) irdma_cqp_qp_destroy_cmd(&iwdev->rf->sc_dev, &iwqp->sc_qp); irdma_remove_push_mmap_entries(iwqp); + + if (iwqp->sc_qp.qp_uk.qp_id == 1) + iwdev->rf->hwqp1_rsvd = false; irdma_free_qp_rsrc(iwqp); return 0; @@ -564,7 +588,11 @@ static void irdma_setup_virt_qp(struct irdma_device *iwdev, if (iwpbl->pbl_allocated) { init_info->virtual_map = true; init_info->sq_pa = qpmr->sq_pbl.idx; - init_info->rq_pa = 
qpmr->rq_pbl.idx; + /* Need to use contiguous buffer for RQ of QP + * in case it is associated with SRQ. + */ + init_info->rq_pa = init_info->qp_uk_init_info.srq_uk ? + qpmr->rq_pa : qpmr->rq_pbl.idx; } else { init_info->sq_pa = qpmr->sq_pbl.addr; init_info->rq_pa = qpmr->rq_pbl.addr; @@ -719,6 +747,7 @@ static int irdma_setup_kmode_qp(struct irdma_device *iwdev, info->rq_pa + (ukinfo->rq_depth * IRDMA_QP_WQE_MIN_SIZE); ukinfo->sq_size = ukinfo->sq_depth >> ukinfo->sq_shift; ukinfo->rq_size = ukinfo->rq_depth >> ukinfo->rq_shift; + ukinfo->qp_id = info->qp_uk_init_info.qp_id; iwqp->max_send_wr = (ukinfo->sq_depth - IRDMA_SQ_RSVD) >> ukinfo->sq_shift; iwqp->max_recv_wr = (ukinfo->rq_depth - IRDMA_RQ_RSVD) >> ukinfo->rq_shift; @@ -775,9 +804,12 @@ static void irdma_roce_fill_and_set_qpctx_info(struct irdma_qp *iwqp, roce_info = &iwqp->roce_info; ether_addr_copy(roce_info->mac_addr, iwdev->netdev->dev_addr); + if (iwqp->ibqp.qp_type == IB_QPT_GSI && iwqp->ibqp.qp_num != 1) + roce_info->is_qp1 = true; roce_info->rd_en = true; roce_info->wr_rdresp_en = true; - roce_info->bind_en = true; + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) + roce_info->bind_en = true; roce_info->dcqcn_en = false; roce_info->rtomin = 5; @@ -808,7 +840,6 @@ static void irdma_iw_fill_and_set_qpctx_info(struct irdma_qp *iwqp, ether_addr_copy(iwarp_info->mac_addr, iwdev->netdev->dev_addr); iwarp_info->rd_en = true; iwarp_info->wr_rdresp_en = true; - iwarp_info->bind_en = true; iwarp_info->ecn_en = true; iwarp_info->rtomin = 5; @@ -864,6 +895,47 @@ static void irdma_flush_worker(struct work_struct *work) irdma_generate_flush_completions(iwqp); } +static int irdma_setup_gsi_qp_rsrc(struct irdma_qp *iwqp, u32 *qp_num) +{ + struct irdma_device *iwdev = iwqp->iwdev; + struct irdma_pci_f *rf = iwdev->rf; + unsigned long flags; + int ret; + + if (rf->rdma_ver <= IRDMA_GEN_2) { + *qp_num = 1; + return 0; + } + + spin_lock_irqsave(&rf->rsrc_lock, flags); + if (!rf->hwqp1_rsvd) { + *qp_num = 1; + rf->hwqp1_rsvd = true; + spin_unlock_irqrestore(&rf->rsrc_lock, flags); + } else { + spin_unlock_irqrestore(&rf->rsrc_lock, flags); + ret = irdma_alloc_rsrc(rf, rf->allocated_qps, rf->max_qp, + qp_num, &rf->next_qp); + if (ret) + return ret; + } + + ret = irdma_vchnl_req_add_vport(&rf->sc_dev, iwdev->vport_id, *qp_num, + (&iwdev->vsi)->qos); + if (ret) { + if (*qp_num != 1) { + irdma_free_rsrc(rf, rf->allocated_qps, *qp_num); + } else { + spin_lock_irqsave(&rf->rsrc_lock, flags); + rf->hwqp1_rsvd = false; + spin_unlock_irqrestore(&rf->rsrc_lock, flags); + } + return ret; + } + + return 0; +} + /** * irdma_create_qp - create qp * @ibqp: ptr of qp @@ -889,6 +961,18 @@ static int irdma_create_qp(struct ib_qp *ibqp, struct irdma_uk_attrs *uk_attrs = &dev->hw_attrs.uk_attrs; struct irdma_qp_init_info init_info = {}; struct irdma_qp_host_ctx_info *ctx_info; + struct irdma_srq *iwsrq; + bool srq_valid = false; + u32 srq_id = 0; + + if (init_attr->srq) { + iwsrq = to_iwsrq(init_attr->srq); + srq_valid = true; + srq_id = iwsrq->srq_num; + init_attr->cap.max_recv_sge = uk_attrs->max_hw_wq_frags; + init_attr->cap.max_recv_wr = 4; + init_info.qp_uk_init_info.srq_uk = &iwsrq->sc_srq.srq_uk; + } err_code = irdma_validate_qp_attrs(init_attr, iwdev); if (err_code) @@ -925,16 +1009,20 @@ static int irdma_create_qp(struct ib_qp *ibqp, init_info.host_ctx = (__le64 *)(init_info.q2 + IRDMA_Q2_BUF_SIZE); init_info.host_ctx_pa = init_info.q2_pa + IRDMA_Q2_BUF_SIZE; - if (init_attr->qp_type == IB_QPT_GSI) - qp_num = 1; - else + if (init_attr->qp_type == 
IB_QPT_GSI) { + err_code = irdma_setup_gsi_qp_rsrc(iwqp, &qp_num); + if (err_code) + goto error; + iwqp->ibqp.qp_num = 1; + } else { err_code = irdma_alloc_rsrc(rf, rf->allocated_qps, rf->max_qp, &qp_num, &rf->next_qp); - if (err_code) - goto error; + if (err_code) + goto error; + iwqp->ibqp.qp_num = qp_num; + } iwqp->iwpd = iwpd; - iwqp->ibqp.qp_num = qp_num; qp = &iwqp->sc_qp; iwqp->iwscq = to_iwcq(init_attr->send_cq); iwqp->iwrcq = to_iwcq(init_attr->recv_cq); @@ -991,13 +1079,22 @@ static int irdma_create_qp(struct ib_qp *ibqp, } ctx_info = &iwqp->ctx_info; + ctx_info->srq_valid = srq_valid; + ctx_info->srq_id = srq_id; ctx_info->send_cq_num = iwqp->iwscq->sc_cq.cq_uk.cq_id; ctx_info->rcv_cq_num = iwqp->iwrcq->sc_cq.cq_uk.cq_id; - if (rdma_protocol_roce(&iwdev->ibdev, 1)) + if (rdma_protocol_roce(&iwdev->ibdev, 1)) { + if (dev->ws_add(&iwdev->vsi, 0)) { + irdma_cqp_qp_destroy_cmd(&rf->sc_dev, &iwqp->sc_qp); + err_code = -EINVAL; + goto error; + } + irdma_qp_add_qos(&iwqp->sc_qp); irdma_roce_fill_and_set_qpctx_info(iwqp, ctx_info); - else + } else { irdma_iw_fill_and_set_qpctx_info(iwqp, ctx_info); + } err_code = irdma_cqp_create_qp_cmd(iwqp); if (err_code) @@ -1009,16 +1106,6 @@ static int irdma_create_qp(struct ib_qp *ibqp, iwqp->sig_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR; rf->qp_table[qp_num] = iwqp; - if (rdma_protocol_roce(&iwdev->ibdev, 1)) { - if (dev->ws_add(&iwdev->vsi, 0)) { - irdma_cqp_qp_destroy_cmd(&rf->sc_dev, &iwqp->sc_qp); - err_code = -EINVAL; - goto error; - } - - irdma_qp_add_qos(&iwqp->sc_qp); - } - if (udata) { /* GEN_1 legacy support with libi40iw does not have expanded uresp struct */ if (udata->outlen < sizeof(uresp)) { @@ -1063,6 +1150,8 @@ static int irdma_get_ib_acc_flags(struct irdma_qp *iwqp) acc_flags |= IB_ACCESS_REMOTE_READ; if (iwqp->roce_info.bind_en) acc_flags |= IB_ACCESS_MW_BIND; + if (iwqp->ctx_info.remote_atomics_en) + acc_flags |= IB_ACCESS_REMOTE_ATOMIC; } else { if (iwqp->iwarp_info.wr_rdresp_en) { acc_flags |= IB_ACCESS_LOCAL_WRITE; @@ -1070,8 +1159,8 @@ static int irdma_get_ib_acc_flags(struct irdma_qp *iwqp) } if (iwqp->iwarp_info.rd_en) acc_flags |= IB_ACCESS_REMOTE_READ; - if (iwqp->iwarp_info.bind_en) - acc_flags |= IB_ACCESS_MW_BIND; + if (iwqp->ctx_info.remote_atomics_en) + acc_flags |= IB_ACCESS_REMOTE_ATOMIC; } return acc_flags; } @@ -1110,6 +1199,7 @@ static int irdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, attr->pkey_index = iwqp->roce_info.p_key; attr->retry_cnt = iwqp->udp_info.rexmit_thresh; attr->rnr_retry = iwqp->udp_info.rnr_nak_thresh; + attr->min_rnr_timer = iwqp->udp_info.min_rnr_timer; attr->max_rd_atomic = iwqp->roce_info.ord_size; attr->max_dest_rd_atomic = iwqp->roce_info.ird_size; } @@ -1118,6 +1208,7 @@ static int irdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, init_attr->qp_context = iwqp->ibqp.qp_context; init_attr->send_cq = iwqp->ibqp.send_cq; init_attr->recv_cq = iwqp->ibqp.recv_cq; + init_attr->srq = iwqp->ibqp.srq; init_attr->cap = attr->cap; return 0; @@ -1242,6 +1333,10 @@ int irdma_modify_qp_roce(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr_mask & IB_QP_RNR_RETRY) udp_info->rnr_nak_thresh = attr->rnr_retry; + if (attr_mask & IB_QP_MIN_RNR_TIMER && + dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) + udp_info->min_rnr_timer = attr->min_rnr_timer; + if (attr_mask & IB_QP_RETRY_CNT) udp_info->rexmit_thresh = attr->retry_cnt; @@ -1362,6 +1457,9 @@ int irdma_modify_qp_roce(struct ib_qp *ibqp, struct ib_qp_attr *attr, roce_info->wr_rdresp_en = true; if 
(attr->qp_access_flags & IB_ACCESS_REMOTE_READ) roce_info->rd_en = true; + if (dev->hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_ATOMIC_OPS) + if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) + ctx_info->remote_atomics_en = true; } wait_event(iwqp->mod_qp_waitq, !atomic_read(&iwqp->hw_mod_qp_pend)); @@ -1777,6 +1875,24 @@ exit: } /** + * irdma_srq_free_rsrc - free up resources for srq + * @rf: RDMA PCI function + * @iwsrq: srq ptr + */ +static void irdma_srq_free_rsrc(struct irdma_pci_f *rf, struct irdma_srq *iwsrq) +{ + struct irdma_sc_srq *srq = &iwsrq->sc_srq; + + if (!iwsrq->user_mode) { + dma_free_coherent(rf->sc_dev.hw->device, iwsrq->kmem.size, + iwsrq->kmem.va, iwsrq->kmem.pa); + iwsrq->kmem.va = NULL; + } + + irdma_free_rsrc(rf, rf->allocated_srqs, srq->srq_uk.srq_id); +} + +/** * irdma_cq_free_rsrc - free up resources for cq * @rf: RDMA PCI function * @iwcq: cq ptr @@ -1840,6 +1956,22 @@ static int irdma_process_resize_list(struct irdma_cq *iwcq, } /** + * irdma_destroy_srq - destroy srq + * @ibsrq: srq pointer + * @udata: user data + */ +static int irdma_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) +{ + struct irdma_device *iwdev = to_iwdev(ibsrq->device); + struct irdma_srq *iwsrq = to_iwsrq(ibsrq); + struct irdma_sc_srq *srq = &iwsrq->sc_srq; + + irdma_srq_wq_destroy(iwdev->rf, srq); + irdma_srq_free_rsrc(iwdev->rf, iwsrq); + return 0; +} + +/** * irdma_destroy_cq - destroy cq * @ib_cq: cq pointer * @udata: user data @@ -1914,8 +2046,13 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries, if (!iwcq->user_mode) { entries++; - if (rf->sc_dev.hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2) + + if (!iwcq->sc_cq.cq_uk.avoid_mem_cflct && + dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2) entries *= 2; + + if (entries & 1) + entries += 1; /* cq size must be an even number */ } info.cq_size = max(entries, 4); @@ -2022,10 +2159,297 @@ error: return ret; } +/** + * irdma_srq_event - event notification for srq limit + * @srq: shared srq struct + */ +void irdma_srq_event(struct irdma_sc_srq *srq) +{ + struct irdma_srq *iwsrq = container_of(srq, struct irdma_srq, sc_srq); + struct ib_srq *ibsrq = &iwsrq->ibsrq; + struct ib_event event; + + srq->srq_limit = 0; + + if (!ibsrq->event_handler) + return; + + event.device = ibsrq->device; + event.element.port_num = 1; + event.element.srq = ibsrq; + event.event = IB_EVENT_SRQ_LIMIT_REACHED; + ibsrq->event_handler(&event, ibsrq->srq_context); +} + +/** + * irdma_modify_srq - modify srq request + * @ibsrq: srq's pointer for modify + * @attr: access attributes + * @attr_mask: state mask + * @udata: user data + */ +static int irdma_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata) +{ + struct irdma_device *iwdev = to_iwdev(ibsrq->device); + struct irdma_srq *iwsrq = to_iwsrq(ibsrq); + struct irdma_cqp_request *cqp_request; + struct irdma_pci_f *rf = iwdev->rf; + struct irdma_modify_srq_info *info; + struct cqp_cmds_info *cqp_info; + int status; + + if (attr_mask & IB_SRQ_MAX_WR) + return -EINVAL; + + if (!(attr_mask & IB_SRQ_LIMIT)) + return 0; + + if (attr->srq_limit > iwsrq->sc_srq.srq_uk.srq_size) + return -EINVAL; + + /* Execute this cqp op synchronously, so we can update srq_limit + * upon successful completion. 
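 * The limit is clamped to the 12-bit field below (0xFFF) and
 * arm_limit_event is set, so the SRQ-limit asynchronous event can be
 * delivered; irdma_srq_event() above reports it to the consumer as
 * IB_EVENT_SRQ_LIMIT_REACHED.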
+ */ + cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); + if (!cqp_request) + return -ENOMEM; + + cqp_info = &cqp_request->info; + info = &cqp_info->in.u.srq_modify.info; + info->srq_limit = attr->srq_limit; + if (info->srq_limit > 0xFFF) + info->srq_limit = 0xFFF; + info->arm_limit_event = 1; + + cqp_info->cqp_cmd = IRDMA_OP_SRQ_MODIFY; + cqp_info->post_sq = 1; + cqp_info->in.u.srq_modify.srq = &iwsrq->sc_srq; + cqp_info->in.u.srq_modify.scratch = (uintptr_t)cqp_request; + status = irdma_handle_cqp_op(rf, cqp_request); + irdma_put_cqp_request(&rf->cqp, cqp_request); + if (status) + return status; + + iwsrq->sc_srq.srq_limit = info->srq_limit; + + return 0; +} + +static int irdma_setup_umode_srq(struct irdma_device *iwdev, + struct irdma_srq *iwsrq, + struct irdma_srq_init_info *info, + struct ib_udata *udata) +{ +#define IRDMA_CREATE_SRQ_MIN_REQ_LEN \ + offsetofend(struct irdma_create_srq_req, user_shadow_area) + struct irdma_create_srq_req req = {}; + struct irdma_ucontext *ucontext; + struct irdma_srq_mr *srqmr; + struct irdma_pbl *iwpbl; + unsigned long flags; + + iwsrq->user_mode = true; + ucontext = rdma_udata_to_drv_context(udata, struct irdma_ucontext, + ibucontext); + + if (udata->inlen < IRDMA_CREATE_SRQ_MIN_REQ_LEN) + return -EINVAL; + + if (ib_copy_from_udata(&req, udata, + min(sizeof(req), udata->inlen))) + return -EFAULT; + + spin_lock_irqsave(&ucontext->srq_reg_mem_list_lock, flags); + iwpbl = irdma_get_pbl((unsigned long)req.user_srq_buf, + &ucontext->srq_reg_mem_list); + spin_unlock_irqrestore(&ucontext->srq_reg_mem_list_lock, flags); + if (!iwpbl) + return -EPROTO; + + iwsrq->iwpbl = iwpbl; + srqmr = &iwpbl->srq_mr; + + if (iwpbl->pbl_allocated) { + info->virtual_map = true; + info->pbl_chunk_size = 1; + info->first_pm_pbl_idx = srqmr->srq_pbl.idx; + info->leaf_pbl_size = 1; + } else { + info->srq_pa = srqmr->srq_pbl.addr; + } + info->shadow_area_pa = srqmr->shadow; + + return 0; +} + +static int irdma_setup_kmode_srq(struct irdma_device *iwdev, + struct irdma_srq *iwsrq, + struct irdma_srq_init_info *info, u32 depth, + u8 shift) +{ + struct irdma_srq_uk_init_info *ukinfo = &info->srq_uk_init_info; + struct irdma_dma_mem *mem = &iwsrq->kmem; + u32 size, ring_size; + + ring_size = depth * IRDMA_QP_WQE_MIN_SIZE; + size = ring_size + (IRDMA_SHADOW_AREA_SIZE << 3); + + mem->size = ALIGN(size, 256); + mem->va = dma_alloc_coherent(iwdev->rf->hw.device, mem->size, + &mem->pa, GFP_KERNEL); + if (!mem->va) + return -ENOMEM; + + ukinfo->srq = mem->va; + ukinfo->srq_size = depth >> shift; + ukinfo->shadow_area = mem->va + ring_size; + + info->shadow_area_pa = info->srq_pa + ring_size; + info->srq_pa = mem->pa; + + return 0; +} + +/** + * irdma_create_srq - create srq + * @ibsrq: ib's srq pointer + * @initattrs: attributes for srq + * @udata: user data for create srq + */ +static int irdma_create_srq(struct ib_srq *ibsrq, + struct ib_srq_init_attr *initattrs, + struct ib_udata *udata) +{ + struct irdma_device *iwdev = to_iwdev(ibsrq->device); + struct ib_srq_attr *attr = &initattrs->attr; + struct irdma_pd *iwpd = to_iwpd(ibsrq->pd); + struct irdma_srq *iwsrq = to_iwsrq(ibsrq); + struct irdma_srq_uk_init_info *ukinfo; + struct irdma_cqp_request *cqp_request; + struct irdma_srq_init_info info = {}; + struct irdma_pci_f *rf = iwdev->rf; + struct irdma_uk_attrs *uk_attrs; + struct cqp_cmds_info *cqp_info; + int err_code = 0; + u32 depth; + u8 shift; + + uk_attrs = &rf->sc_dev.hw_attrs.uk_attrs; + ukinfo = &info.srq_uk_init_info; + + if (initattrs->srq_type != 
IB_SRQT_BASIC) + return -EOPNOTSUPP; + + if (!(uk_attrs->feature_flags & IRDMA_FEATURE_SRQ) || + attr->max_sge > uk_attrs->max_hw_wq_frags) + return -EINVAL; + + refcount_set(&iwsrq->refcnt, 1); + spin_lock_init(&iwsrq->lock); + err_code = irdma_alloc_rsrc(rf, rf->allocated_srqs, rf->max_srq, + &iwsrq->srq_num, &rf->next_srq); + if (err_code) + return err_code; + + ukinfo->max_srq_frag_cnt = attr->max_sge; + ukinfo->uk_attrs = uk_attrs; + ukinfo->srq_id = iwsrq->srq_num; + + irdma_get_wqe_shift(ukinfo->uk_attrs, ukinfo->max_srq_frag_cnt, 0, + &shift); + + err_code = irdma_get_srqdepth(ukinfo->uk_attrs, attr->max_wr, + shift, &depth); + if (err_code) + return err_code; + + /* Actual SRQ size in WRs for ring and HW */ + ukinfo->srq_size = depth >> shift; + + /* Max postable WRs to SRQ */ + iwsrq->max_wr = (depth - IRDMA_RQ_RSVD) >> shift; + attr->max_wr = iwsrq->max_wr; + + if (udata) + err_code = irdma_setup_umode_srq(iwdev, iwsrq, &info, udata); + else + err_code = irdma_setup_kmode_srq(iwdev, iwsrq, &info, depth, + shift); + + if (err_code) + goto free_rsrc; + + info.vsi = &iwdev->vsi; + info.pd = &iwpd->sc_pd; + + err_code = irdma_sc_srq_init(&iwsrq->sc_srq, &info); + if (err_code) + goto free_dmem; + + cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); + if (!cqp_request) { + err_code = -ENOMEM; + goto free_dmem; + } + + cqp_info = &cqp_request->info; + cqp_info->cqp_cmd = IRDMA_OP_SRQ_CREATE; + cqp_info->post_sq = 1; + cqp_info->in.u.srq_create.srq = &iwsrq->sc_srq; + cqp_info->in.u.srq_create.scratch = (uintptr_t)cqp_request; + err_code = irdma_handle_cqp_op(rf, cqp_request); + irdma_put_cqp_request(&rf->cqp, cqp_request); + if (err_code) + goto free_dmem; + + if (udata) { + struct irdma_create_srq_resp resp = {}; + + resp.srq_id = iwsrq->srq_num; + resp.srq_size = ukinfo->srq_size; + if (ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen))) { + err_code = -EPROTO; + goto srq_destroy; + } + } + + return 0; + +srq_destroy: + irdma_srq_wq_destroy(rf, &iwsrq->sc_srq); + +free_dmem: + if (!iwsrq->user_mode) + dma_free_coherent(rf->hw.device, iwsrq->kmem.size, + iwsrq->kmem.va, iwsrq->kmem.pa); +free_rsrc: + irdma_free_rsrc(rf, rf->allocated_srqs, iwsrq->srq_num); + return err_code; +} + +/** + * irdma_query_srq - get SRQ attributes + * @ibsrq: the SRQ to query + * @attr: the attributes of the SRQ + */ +static int irdma_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) +{ + struct irdma_srq *iwsrq = to_iwsrq(ibsrq); + + attr->max_wr = iwsrq->max_wr; + attr->max_sge = iwsrq->sc_srq.srq_uk.max_srq_frag_cnt; + attr->srq_limit = iwsrq->sc_srq.srq_limit; + + return 0; +} + static inline int cq_validate_flags(u32 flags, u8 hw_rev) { - /* GEN1 does not support CQ create flags */ - if (hw_rev == IRDMA_GEN_1) + /* GEN1/2 does not support CQ create flags */ + if (hw_rev <= IRDMA_GEN_2) return flags ? -EOPNOTSUPP : 0; return flags & ~IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION ? -EOPNOTSUPP : 0; @@ -2058,6 +2482,7 @@ static int irdma_create_cq(struct ib_cq *ibcq, unsigned long flags; int err_code; int entries = attr->cqe; + bool cqe_64byte_ena; err_code = cq_validate_flags(attr->flags, dev->hw_attrs.uk_attrs.hw_rev); if (err_code) @@ -2081,6 +2506,9 @@ static int irdma_create_cq(struct ib_cq *ibcq, info.dev = dev; ukinfo->cq_size = max(entries, 4); ukinfo->cq_id = cq_num; + cqe_64byte_ena = dev->hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_64_BYTE_CQE ? 
+ true : false; + ukinfo->avoid_mem_cflct = cqe_64byte_ena; iwcq->ibcq.cqe = info.cq_uk_init_info.cq_size; if (attr->comp_vector < rf->ceqs_count) info.ceq_id = attr->comp_vector; @@ -2116,8 +2544,6 @@ static int irdma_create_cq(struct ib_cq *ibcq, goto cq_free_rsrc; } - iwcq->iwpbl = iwpbl; - iwcq->cq_mem_size = 0; cqmr = &iwpbl->cq_mr; if (rf->sc_dev.hw_attrs.uk_attrs.feature_flags & @@ -2132,7 +2558,6 @@ static int irdma_create_cq(struct ib_cq *ibcq, err_code = -EPROTO; goto cq_free_rsrc; } - iwcq->iwpbl_shadow = iwpbl_shadow; cqmr_shadow = &iwpbl_shadow->cq_mr; info.shadow_area_pa = cqmr_shadow->cq_pbl.addr; cqmr->split = true; @@ -2156,11 +2581,18 @@ static int irdma_create_cq(struct ib_cq *ibcq, } entries++; - if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2) + if (!cqe_64byte_ena && dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2) entries *= 2; + + if (entries & 1) + entries += 1; /* cq size must be an even number */ + ukinfo->cq_size = entries; - rsize = info.cq_uk_init_info.cq_size * sizeof(struct irdma_cqe); + if (cqe_64byte_ena) + rsize = info.cq_uk_init_info.cq_size * sizeof(struct irdma_extended_cqe); + else + rsize = info.cq_uk_init_info.cq_size * sizeof(struct irdma_cqe); iwcq->kmem.size = ALIGN(round_up(rsize, 256), 256); iwcq->kmem.va = dma_alloc_coherent(dev->hw->device, iwcq->kmem.size, @@ -2240,8 +2672,9 @@ cq_free_rsrc: /** * irdma_get_mr_access - get hw MR access permissions from IB access flags * @access: IB access flags + * @hw_rev: Hardware version */ -static inline u16 irdma_get_mr_access(int access) +static inline u16 irdma_get_mr_access(int access, u8 hw_rev) { u16 hw_access = 0; @@ -2251,8 +2684,10 @@ static inline u16 irdma_get_mr_access(int access) IRDMA_ACCESS_FLAGS_REMOTEWRITE : 0; hw_access |= (access & IB_ACCESS_REMOTE_READ) ? IRDMA_ACCESS_FLAGS_REMOTEREAD : 0; - hw_access |= (access & IB_ACCESS_MW_BIND) ? - IRDMA_ACCESS_FLAGS_BIND_WINDOW : 0; + if (hw_rev >= IRDMA_GEN_3) { + hw_access |= (access & IB_ACCESS_MW_BIND) ? + IRDMA_ACCESS_FLAGS_BIND_WINDOW : 0; + } hw_access |= (access & IB_ZERO_BASED) ? IRDMA_ACCESS_FLAGS_ZERO_BASED : 0; hw_access |= IRDMA_ACCESS_FLAGS_LOCALREAD; @@ -2463,6 +2898,7 @@ static int irdma_handle_q_mem(struct irdma_device *iwdev, struct irdma_mr *iwmr = iwpbl->iwmr; struct irdma_qp_mr *qpmr = &iwpbl->qp_mr; struct irdma_cq_mr *cqmr = &iwpbl->cq_mr; + struct irdma_srq_mr *srqmr = &iwpbl->srq_mr; struct irdma_hmc_pble *hmc_p; u64 *arr = iwmr->pgaddrmem; u32 pg_size, total; @@ -2482,7 +2918,10 @@ static int irdma_handle_q_mem(struct irdma_device *iwdev, total = req->sq_pages + req->rq_pages; hmc_p = &qpmr->sq_pbl; qpmr->shadow = (dma_addr_t)arr[total]; - + /* Need to use physical address for RQ of QP + * in case it is associated with SRQ. 
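 * The RQ begins at the first page after the SQ pages
 * (arr[req->sq_pages]); irdma_setup_virt_qp() then uses this rq_pa
 * instead of the PBLE index whenever srq_uk is set for the QP.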
+ */ + qpmr->rq_pa = (dma_addr_t)arr[req->sq_pages]; if (lvl) { ret = irdma_check_mem_contiguous(arr, req->sq_pages, pg_size); @@ -2502,6 +2941,18 @@ static int irdma_handle_q_mem(struct irdma_device *iwdev, hmc_p->addr = arr[req->sq_pages]; } break; + case IRDMA_MEMREG_TYPE_SRQ: + hmc_p = &srqmr->srq_pbl; + srqmr->shadow = (dma_addr_t)arr[req->rq_pages]; + if (lvl) + ret = irdma_check_mem_contiguous(arr, req->rq_pages, + pg_size); + + if (!ret) + hmc_p->idx = palloc->level1.idx; + else + hmc_p->addr = arr[0]; + break; case IRDMA_MEMREG_TYPE_CQ: hmc_p = &cqmr->cq_pbl; @@ -2806,7 +3257,10 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr, stag_info->stag_idx = iwmr->stag >> IRDMA_CQPSQ_STAG_IDX_S; stag_info->stag_key = (u8)iwmr->stag; stag_info->total_len = iwmr->len; - stag_info->access_rights = irdma_get_mr_access(access); + stag_info->access_rights = irdma_get_mr_access(access, + iwdev->rf->sc_dev.hw_attrs.uk_attrs.hw_rev); + if (iwdev->rf->sc_dev.hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_ATOMIC_OPS) + stag_info->remote_atomics_en = (access & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0; stag_info->pd_id = iwpd->sc_pd.pd_id; stag_info->all_memory = pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY; if (stag_info->access_rights & IRDMA_ACCESS_FLAGS_ZERO_BASED) @@ -2972,6 +3426,37 @@ static int irdma_reg_user_mr_type_qp(struct irdma_mem_reg_req req, return 0; } +static int irdma_reg_user_mr_type_srq(struct irdma_mem_reg_req req, + struct ib_udata *udata, + struct irdma_mr *iwmr) +{ + struct irdma_device *iwdev = to_iwdev(iwmr->ibmr.device); + struct irdma_pbl *iwpbl = &iwmr->iwpbl; + struct irdma_ucontext *ucontext; + unsigned long flags; + u32 total; + int err; + u8 lvl; + + total = req.rq_pages + IRDMA_SHADOW_PGCNT; + if (total > iwmr->page_cnt) + return -EINVAL; + + lvl = req.rq_pages > 1 ? 
PBLE_LEVEL_1 : PBLE_LEVEL_0; + err = irdma_handle_q_mem(iwdev, &req, iwpbl, lvl); + if (err) + return err; + + ucontext = rdma_udata_to_drv_context(udata, struct irdma_ucontext, + ibucontext); + spin_lock_irqsave(&ucontext->srq_reg_mem_list_lock, flags); + list_add_tail(&iwpbl->list, &ucontext->srq_reg_mem_list); + iwpbl->on_list = true; + spin_unlock_irqrestore(&ucontext->srq_reg_mem_list_lock, flags); + + return 0; +} + static int irdma_reg_user_mr_type_cq(struct irdma_mem_reg_req req, struct ib_udata *udata, struct irdma_mr *iwmr) @@ -3063,6 +3548,12 @@ static struct ib_mr *irdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, goto error; break; + case IRDMA_MEMREG_TYPE_SRQ: + err = irdma_reg_user_mr_type_srq(req, udata, iwmr); + if (err) + goto error; + + break; case IRDMA_MEMREG_TYPE_CQ: err = irdma_reg_user_mr_type_cq(req, udata, iwmr); if (err) @@ -3106,9 +3597,9 @@ static struct ib_mr *irdma_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, umem_dmabuf = ib_umem_dmabuf_get_pinned(pd->device, start, len, fd, access); if (IS_ERR(umem_dmabuf)) { - err = PTR_ERR(umem_dmabuf); - ibdev_dbg(&iwdev->ibdev, "Failed to get dmabuf umem[%d]\n", err); - return ERR_PTR(err); + ibdev_dbg(&iwdev->ibdev, "Failed to get dmabuf umem[%pe]\n", + umem_dmabuf); + return ERR_CAST(umem_dmabuf); } iwmr = irdma_alloc_iwmr(&umem_dmabuf->umem, pd, virt, IRDMA_MEMREG_TYPE_MEM); @@ -3382,6 +3873,14 @@ static void irdma_del_memlist(struct irdma_mr *iwmr, } spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags); break; + case IRDMA_MEMREG_TYPE_SRQ: + spin_lock_irqsave(&ucontext->srq_reg_mem_list_lock, flags); + if (iwpbl->on_list) { + iwpbl->on_list = false; + list_del(&iwpbl->list); + } + spin_unlock_irqrestore(&ucontext->srq_reg_mem_list_lock, flags); + break; default: break; } @@ -3461,6 +3960,40 @@ static int irdma_post_send(struct ib_qp *ibqp, if (ib_wr->send_flags & IB_SEND_FENCE) info.read_fence = true; switch (ib_wr->opcode) { + case IB_WR_ATOMIC_CMP_AND_SWP: + if (unlikely(!(dev->hw_attrs.uk_attrs.feature_flags & + IRDMA_FEATURE_ATOMIC_OPS))) { + err = -EINVAL; + break; + } + info.op_type = IRDMA_OP_TYPE_ATOMIC_COMPARE_AND_SWAP; + info.op.atomic_compare_swap.tagged_offset = ib_wr->sg_list[0].addr; + info.op.atomic_compare_swap.remote_tagged_offset = + atomic_wr(ib_wr)->remote_addr; + info.op.atomic_compare_swap.swap_data_bytes = atomic_wr(ib_wr)->swap; + info.op.atomic_compare_swap.compare_data_bytes = + atomic_wr(ib_wr)->compare_add; + info.op.atomic_compare_swap.stag = ib_wr->sg_list[0].lkey; + info.op.atomic_compare_swap.remote_stag = atomic_wr(ib_wr)->rkey; + err = irdma_uk_atomic_compare_swap(ukqp, &info, false); + break; + case IB_WR_ATOMIC_FETCH_AND_ADD: + if (unlikely(!(dev->hw_attrs.uk_attrs.feature_flags & + IRDMA_FEATURE_ATOMIC_OPS))) { + err = -EINVAL; + break; + } + info.op_type = IRDMA_OP_TYPE_ATOMIC_FETCH_AND_ADD; + info.op.atomic_fetch_add.tagged_offset = ib_wr->sg_list[0].addr; + info.op.atomic_fetch_add.remote_tagged_offset = + atomic_wr(ib_wr)->remote_addr; + info.op.atomic_fetch_add.fetch_add_data_bytes = + atomic_wr(ib_wr)->compare_add; + info.op.atomic_fetch_add.stag = ib_wr->sg_list[0].lkey; + info.op.atomic_fetch_add.remote_stag = + atomic_wr(ib_wr)->rkey; + err = irdma_uk_atomic_fetch_add(ukqp, &info, false); + break; case IB_WR_SEND_WITH_IMM: if (ukqp->qp_caps & IRDMA_SEND_WITH_IMM) { info.imm_data_valid = true; @@ -3555,7 +4088,9 @@ static int irdma_post_send(struct ib_qp *ibqp, stag_info.signaled = info.signaled; stag_info.read_fence = info.read_fence; - 
stag_info.access_rights = irdma_get_mr_access(reg_wr(ib_wr)->access); + stag_info.access_rights = + irdma_get_mr_access(reg_wr(ib_wr)->access, + dev->hw_attrs.uk_attrs.hw_rev); stag_info.stag_key = reg_wr(ib_wr)->key & 0xff; stag_info.stag_idx = reg_wr(ib_wr)->key >> 8; stag_info.page_size = reg_wr(ib_wr)->mr->page_size; @@ -3594,6 +4129,48 @@ static int irdma_post_send(struct ib_qp *ibqp, mod_delayed_work(iwqp->iwdev->cleanup_wq, &iwqp->dwork_flush, msecs_to_jiffies(IRDMA_FLUSH_DELAY_MS)); } + + if (err) + *bad_wr = ib_wr; + + return err; +} + +/** + * irdma_post_srq_recv - post receive wr for kernel application + * @ibsrq: ib srq pointer + * @ib_wr: work request for receive + * @bad_wr: bad wr caused an error + */ +static int irdma_post_srq_recv(struct ib_srq *ibsrq, + const struct ib_recv_wr *ib_wr, + const struct ib_recv_wr **bad_wr) +{ + struct irdma_srq *iwsrq = to_iwsrq(ibsrq); + struct irdma_srq_uk *uksrq = &iwsrq->sc_srq.srq_uk; + struct irdma_post_rq_info post_recv = {}; + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&iwsrq->lock, flags); + while (ib_wr) { + if (ib_wr->num_sge > uksrq->max_srq_frag_cnt) { + err = -EINVAL; + goto out; + } + post_recv.num_sges = ib_wr->num_sge; + post_recv.wr_id = ib_wr->wr_id; + post_recv.sg_list = ib_wr->sg_list; + err = irdma_uk_srq_post_receive(uksrq, &post_recv); + if (err) + goto out; + + ib_wr = ib_wr->next; + } + +out: + spin_unlock_irqrestore(&iwsrq->lock, flags); + if (err) *bad_wr = ib_wr; @@ -3619,6 +4196,11 @@ static int irdma_post_recv(struct ib_qp *ibqp, iwqp = to_iwqp(ibqp); ukqp = &iwqp->sc_qp.qp_uk; + if (ukqp->srq_uk) { + *bad_wr = ib_wr; + return -EINVAL; + } + spin_lock_irqsave(&iwqp->lock, flags); while (ib_wr) { post_recv.num_sges = ib_wr->num_sge; @@ -3671,6 +4253,8 @@ static enum ib_wc_status irdma_flush_err_to_ib_wc_status(enum irdma_flush_opcode return IB_WC_MW_BIND_ERR; case FLUSH_REM_INV_REQ_ERR: return IB_WC_REM_INV_REQ_ERR; + case FLUSH_RNR_RETRY_EXC_ERR: + return IB_WC_RNR_RETRY_EXC_ERR; case FLUSH_FATAL_ERR: default: return IB_WC_FATAL_ERR; @@ -3727,8 +4311,12 @@ static void irdma_process_cqe(struct ib_wc *entry, if (cq_poll_info->q_type == IRDMA_CQE_QTYPE_SQ) { set_ib_wc_op_sq(cq_poll_info, entry); } else { - set_ib_wc_op_rq(cq_poll_info, entry, - qp->qp_uk.qp_caps & IRDMA_SEND_WITH_IMM); + if (qp->dev->hw_attrs.uk_attrs.hw_rev <= IRDMA_GEN_2) + set_ib_wc_op_rq(cq_poll_info, entry, + qp->qp_uk.qp_caps & IRDMA_SEND_WITH_IMM ? 
+ true : false); + else + set_ib_wc_op_rq_gen_3(cq_poll_info, entry); if (qp->qp_uk.qp_type != IRDMA_QP_TYPE_ROCE_UD && cq_poll_info->stag_invalid_set) { entry->ex.invalidate_rkey = cq_poll_info->inv_stag; @@ -3923,40 +4511,7 @@ static int irdma_req_notify_cq(struct ib_cq *ibcq, return ret; } -static int irdma_roce_port_immutable(struct ib_device *ibdev, u32 port_num, - struct ib_port_immutable *immutable) -{ - struct ib_port_attr attr; - int err; - - immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; - err = ib_query_port(ibdev, port_num, &attr); - if (err) - return err; - - immutable->max_mad_size = IB_MGMT_MAD_SIZE; - immutable->pkey_tbl_len = attr.pkey_tbl_len; - immutable->gid_tbl_len = attr.gid_tbl_len; - - return 0; -} - -static int irdma_iw_port_immutable(struct ib_device *ibdev, u32 port_num, - struct ib_port_immutable *immutable) -{ - struct ib_port_attr attr; - int err; - - immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; - err = ib_query_port(ibdev, port_num, &attr); - if (err) - return err; - immutable->gid_tbl_len = attr.gid_tbl_len; - - return 0; -} - -static const struct rdma_stat_desc irdma_hw_stat_names[] = { +static const struct rdma_stat_desc irdma_hw_stat_descs[] = { /* gen1 - 32-bit */ [IRDMA_HW_STAT_INDEX_IP4RXDISCARD].name = "ip4InDiscards", [IRDMA_HW_STAT_INDEX_IP4RXTRUNC].name = "ip4InTruncatedPkts", @@ -3964,9 +4519,6 @@ static const struct rdma_stat_desc irdma_hw_stat_names[] = { [IRDMA_HW_STAT_INDEX_IP6RXDISCARD].name = "ip6InDiscards", [IRDMA_HW_STAT_INDEX_IP6RXTRUNC].name = "ip6InTruncatedPkts", [IRDMA_HW_STAT_INDEX_IP6TXNOROUTE].name = "ip6OutNoRoutes", - [IRDMA_HW_STAT_INDEX_TCPRTXSEG].name = "tcpRetransSegs", - [IRDMA_HW_STAT_INDEX_TCPRXOPTERR].name = "tcpInOptErrors", - [IRDMA_HW_STAT_INDEX_TCPRXPROTOERR].name = "tcpInProtoErrors", [IRDMA_HW_STAT_INDEX_RXVLANERR].name = "rxVlanErrors", /* gen1 - 64-bit */ [IRDMA_HW_STAT_INDEX_IP4RXOCTS].name = "ip4InOctets", @@ -3985,16 +4537,14 @@ static const struct rdma_stat_desc irdma_hw_stat_names[] = { [IRDMA_HW_STAT_INDEX_IP6TXPKTS].name = "ip6OutPkts", [IRDMA_HW_STAT_INDEX_IP6TXFRAGS].name = "ip6OutSegRqd", [IRDMA_HW_STAT_INDEX_IP6TXMCPKTS].name = "ip6OutMcastPkts", - [IRDMA_HW_STAT_INDEX_TCPRXSEGS].name = "tcpInSegs", - [IRDMA_HW_STAT_INDEX_TCPTXSEG].name = "tcpOutSegs", - [IRDMA_HW_STAT_INDEX_RDMARXRDS].name = "iwInRdmaReads", - [IRDMA_HW_STAT_INDEX_RDMARXSNDS].name = "iwInRdmaSends", - [IRDMA_HW_STAT_INDEX_RDMARXWRS].name = "iwInRdmaWrites", - [IRDMA_HW_STAT_INDEX_RDMATXRDS].name = "iwOutRdmaReads", - [IRDMA_HW_STAT_INDEX_RDMATXSNDS].name = "iwOutRdmaSends", - [IRDMA_HW_STAT_INDEX_RDMATXWRS].name = "iwOutRdmaWrites", - [IRDMA_HW_STAT_INDEX_RDMAVBND].name = "iwRdmaBnd", - [IRDMA_HW_STAT_INDEX_RDMAVINV].name = "iwRdmaInv", + [IRDMA_HW_STAT_INDEX_RDMARXRDS].name = "InRdmaReads", + [IRDMA_HW_STAT_INDEX_RDMARXSNDS].name = "InRdmaSends", + [IRDMA_HW_STAT_INDEX_RDMARXWRS].name = "InRdmaWrites", + [IRDMA_HW_STAT_INDEX_RDMATXRDS].name = "OutRdmaReads", + [IRDMA_HW_STAT_INDEX_RDMATXSNDS].name = "OutRdmaSends", + [IRDMA_HW_STAT_INDEX_RDMATXWRS].name = "OutRdmaWrites", + [IRDMA_HW_STAT_INDEX_RDMAVBND].name = "RdmaBnd", + [IRDMA_HW_STAT_INDEX_RDMAVINV].name = "RdmaInv", /* gen2 - 32-bit */ [IRDMA_HW_STAT_INDEX_RXRPCNPHANDLED].name = "cnpHandled", @@ -4008,9 +4558,59 @@ static const struct rdma_stat_desc irdma_hw_stat_names[] = { [IRDMA_HW_STAT_INDEX_UDPRXPKTS].name = "RxUDP", [IRDMA_HW_STAT_INDEX_UDPTXPKTS].name = "TxUDP", [IRDMA_HW_STAT_INDEX_RXNPECNMARKEDPKTS].name = "RxECNMrkd", - + 
[IRDMA_HW_STAT_INDEX_TCPRTXSEG].name = "RetransSegs", + [IRDMA_HW_STAT_INDEX_TCPRXOPTERR].name = "InOptErrors", + [IRDMA_HW_STAT_INDEX_TCPRXPROTOERR].name = "InProtoErrors", + [IRDMA_HW_STAT_INDEX_TCPRXSEGS].name = "InSegs", + [IRDMA_HW_STAT_INDEX_TCPTXSEG].name = "OutSegs", + + /* gen3 */ + [IRDMA_HW_STAT_INDEX_RNR_SENT].name = "RNR sent", + [IRDMA_HW_STAT_INDEX_RNR_RCVD].name = "RNR received", + [IRDMA_HW_STAT_INDEX_RDMAORDLMTCNT].name = "ord limit count", + [IRDMA_HW_STAT_INDEX_RDMAIRDLMTCNT].name = "ird limit count", + [IRDMA_HW_STAT_INDEX_RDMARXATS].name = "Rx atomics", + [IRDMA_HW_STAT_INDEX_RDMATXATS].name = "Tx atomics", + [IRDMA_HW_STAT_INDEX_NAKSEQERR].name = "Nak Sequence Error", + [IRDMA_HW_STAT_INDEX_NAKSEQERR_IMPLIED].name = "Nak Sequence Error Implied", + [IRDMA_HW_STAT_INDEX_RTO].name = "RTO", + [IRDMA_HW_STAT_INDEX_RXOOOPKTS].name = "Rcvd Out of order packets", + [IRDMA_HW_STAT_INDEX_ICRCERR].name = "CRC errors", }; +static int irdma_roce_port_immutable(struct ib_device *ibdev, u32 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + + return 0; +} + +static int irdma_iw_port_immutable(struct ib_device *ibdev, u32 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + immutable->gid_tbl_len = attr.gid_tbl_len; + + return 0; +} + static void irdma_get_dev_fw_str(struct ib_device *dev, char *str) { struct irdma_device *iwdev = to_iwdev(dev); @@ -4034,7 +4634,7 @@ static struct rdma_hw_stats *irdma_alloc_hw_port_stats(struct ib_device *ibdev, int num_counters = dev->hw_attrs.max_stat_idx; unsigned long lifespan = RDMA_HW_STATS_DEFAULT_LIFESPAN; - return rdma_alloc_hw_stats_struct(irdma_hw_stat_names, num_counters, + return rdma_alloc_hw_stats_struct(irdma_hw_stat_descs, num_counters, lifespan); } @@ -4539,7 +5139,7 @@ static bool irdma_ah_exists(struct irdma_device *iwdev, new_ah->sc_ah.ah_info.dest_ip_addr[2] ^ new_ah->sc_ah.ah_info.dest_ip_addr[3]; - hash_for_each_possible(iwdev->ah_hash_tbl, ah, list, key) { + hash_for_each_possible(iwdev->rf->ah_hash_tbl, ah, list, key) { /* Set ah_valid and ah_id the same so memcmp can work */ new_ah->sc_ah.ah_info.ah_idx = ah->sc_ah.ah_info.ah_idx; new_ah->sc_ah.ah_info.ah_valid = ah->sc_ah.ah_info.ah_valid; @@ -4565,14 +5165,14 @@ static int irdma_destroy_ah(struct ib_ah *ibah, u32 ah_flags) struct irdma_ah *ah = to_iwah(ibah); if ((ah_flags & RDMA_DESTROY_AH_SLEEPABLE) && ah->parent_ah) { - mutex_lock(&iwdev->ah_tbl_lock); + mutex_lock(&iwdev->rf->ah_tbl_lock); if (!refcount_dec_and_test(&ah->parent_ah->refcnt)) { - mutex_unlock(&iwdev->ah_tbl_lock); + mutex_unlock(&iwdev->rf->ah_tbl_lock); return 0; } hash_del(&ah->parent_ah->list); kfree(ah->parent_ah); - mutex_unlock(&iwdev->ah_tbl_lock); + mutex_unlock(&iwdev->rf->ah_tbl_lock); } irdma_ah_cqp_op(iwdev->rf, &ah->sc_ah, IRDMA_OP_AH_DESTROY, @@ -4609,11 +5209,11 @@ static int irdma_create_user_ah(struct ib_ah *ibah, err = irdma_setup_ah(ibah, attr); if (err) return err; - mutex_lock(&iwdev->ah_tbl_lock); + mutex_lock(&iwdev->rf->ah_tbl_lock); if (!irdma_ah_exists(iwdev, ah)) { err = irdma_create_hw_ah(iwdev, 
ah, true); if (err) { - mutex_unlock(&iwdev->ah_tbl_lock); + mutex_unlock(&iwdev->rf->ah_tbl_lock); return err; } /* Add new AH to list */ @@ -4625,11 +5225,11 @@ static int irdma_create_user_ah(struct ib_ah *ibah, parent_ah->sc_ah.ah_info.dest_ip_addr[3]; ah->parent_ah = parent_ah; - hash_add(iwdev->ah_hash_tbl, &parent_ah->list, key); + hash_add(iwdev->rf->ah_hash_tbl, &parent_ah->list, key); refcount_set(&parent_ah->refcnt, 1); } } - mutex_unlock(&iwdev->ah_tbl_lock); + mutex_unlock(&iwdev->rf->ah_tbl_lock); uresp.ah_id = ah->sc_ah.ah_info.ah_idx; err = ib_copy_to_udata(udata, &uresp, min(sizeof(uresp), udata->outlen)); @@ -4691,6 +5291,20 @@ static enum rdma_link_layer irdma_get_link_layer(struct ib_device *ibdev, return IB_LINK_LAYER_ETHERNET; } +static const struct ib_device_ops irdma_gen1_dev_ops = { + .dealloc_driver = irdma_ib_dealloc_device, +}; + +static const struct ib_device_ops irdma_gen3_dev_ops = { + .alloc_mw = irdma_alloc_mw, + .create_srq = irdma_create_srq, + .dealloc_mw = irdma_dealloc_mw, + .destroy_srq = irdma_destroy_srq, + .modify_srq = irdma_modify_srq, + .post_srq_recv = irdma_post_srq_recv, + .query_srq = irdma_query_srq, +}; + static const struct ib_device_ops irdma_roce_dev_ops = { .attach_mcast = irdma_attach_mcast, .create_ah = irdma_create_ah, @@ -4725,7 +5339,6 @@ static const struct ib_device_ops irdma_dev_ops = { .alloc_hw_port_stats = irdma_alloc_hw_port_stats, .alloc_mr = irdma_alloc_mr, - .alloc_mw = irdma_alloc_mw, .alloc_pd = irdma_alloc_pd, .alloc_ucontext = irdma_alloc_ucontext, .create_cq = irdma_create_cq, @@ -4761,6 +5374,7 @@ static const struct ib_device_ops irdma_dev_ops = { INIT_RDMA_OBJ_SIZE(ib_cq, irdma_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_mw, irdma_mr, ibmw), INIT_RDMA_OBJ_SIZE(ib_qp, irdma_qp, ibqp), + INIT_RDMA_OBJ_SIZE(ib_srq, irdma_srq, ibsrq), }; /** @@ -4808,6 +5422,10 @@ static void irdma_init_rdma_device(struct irdma_device *iwdev) iwdev->ibdev.num_comp_vectors = iwdev->rf->ceqs_count; iwdev->ibdev.dev.parent = &pcidev->dev; ib_set_device_ops(&iwdev->ibdev, &irdma_dev_ops); + if (iwdev->rf->rdma_ver == IRDMA_GEN_1) + ib_set_device_ops(&iwdev->ibdev, &irdma_gen1_dev_ops); + if (iwdev->rf->rdma_ver >= IRDMA_GEN_3) + ib_set_device_ops(&iwdev->ibdev, &irdma_gen3_dev_ops); } /** @@ -4879,5 +5497,9 @@ void irdma_ib_dealloc_device(struct ib_device *ibdev) struct irdma_device *iwdev = to_iwdev(ibdev); irdma_rt_deinit_hw(iwdev); - irdma_ctrl_deinit_hw(iwdev->rf); + if (!iwdev->is_vport) { + irdma_ctrl_deinit_hw(iwdev->rf); + if (iwdev->rf->vchnl_wq) + destroy_workqueue(iwdev->rf->vchnl_wq); + } } diff --git a/drivers/infiniband/hw/irdma/verbs.h b/drivers/infiniband/hw/irdma/verbs.h index cfa140b36395..ed21c1b56e8e 100644 --- a/drivers/infiniband/hw/irdma/verbs.h +++ b/drivers/infiniband/hw/irdma/verbs.h @@ -8,6 +8,7 @@ #define IRDMA_PKEY_TBL_SZ 1 #define IRDMA_DEFAULT_PKEY 0xFFFF +#define IRDMA_SHADOW_PGCNT 1 struct irdma_ucontext { struct ib_ucontext ibucontext; @@ -17,6 +18,8 @@ struct irdma_ucontext { spinlock_t cq_reg_mem_list_lock; /* protect CQ memory list */ struct list_head qp_reg_mem_list; spinlock_t qp_reg_mem_list_lock; /* protect QP memory list */ + struct list_head srq_reg_mem_list; + spinlock_t srq_reg_mem_list_lock; /* protect SRQ memory list */ int abi_ver; u8 legacy_mode : 1; u8 use_raw_attrs : 1; @@ -65,10 +68,16 @@ struct irdma_cq_mr { bool split; }; +struct irdma_srq_mr { + struct irdma_hmc_pble srq_pbl; + dma_addr_t shadow; +}; + struct irdma_qp_mr { struct irdma_hmc_pble sq_pbl; struct irdma_hmc_pble rq_pbl; dma_addr_t 
shadow; + dma_addr_t rq_pa; struct page *sq_page; }; @@ -85,6 +94,7 @@ struct irdma_pbl { union { struct irdma_qp_mr qp_mr; struct irdma_cq_mr cq_mr; + struct irdma_srq_mr srq_mr; }; bool pbl_allocated:1; @@ -112,24 +122,33 @@ struct irdma_mr { struct irdma_pbl iwpbl; }; +struct irdma_srq { + struct ib_srq ibsrq; + struct irdma_sc_srq sc_srq __aligned(64); + struct irdma_dma_mem kmem; + u64 *srq_wrid_mem; + refcount_t refcnt; + spinlock_t lock; /* for poll srq */ + struct irdma_pbl *iwpbl; + struct irdma_sge *sg_list; + u16 srq_head; + u32 srq_num; + u32 max_wr; + bool user_mode:1; +}; + struct irdma_cq { struct ib_cq ibcq; struct irdma_sc_cq sc_cq; - u16 cq_head; - u16 cq_size; u16 cq_num; bool user_mode; atomic_t armed; enum irdma_cmpl_notify last_notify; - u32 polled_cmpls; - u32 cq_mem_size; struct irdma_dma_mem kmem; struct irdma_dma_mem kmem_shadow; struct completion free_cq; refcount_t refcnt; spinlock_t lock; /* for poll cq */ - struct irdma_pbl *iwpbl; - struct irdma_pbl *iwpbl_shadow; struct list_head resize_list; struct irdma_cq_poll_info cur_cqe; struct list_head cmpl_generated; @@ -259,6 +278,12 @@ static inline void set_ib_wc_op_sq(struct irdma_cq_poll_info *cq_poll_info, case IRDMA_OP_TYPE_FAST_REG_NSMR: entry->opcode = IB_WC_REG_MR; break; + case IRDMA_OP_TYPE_ATOMIC_COMPARE_AND_SWAP: + entry->opcode = IB_WC_COMP_SWAP; + break; + case IRDMA_OP_TYPE_ATOMIC_FETCH_AND_ADD: + entry->opcode = IB_WC_FETCH_ADD; + break; case IRDMA_OP_TYPE_INV_STAG: entry->opcode = IB_WC_LOCAL_INV; break; @@ -267,6 +292,19 @@ static inline void set_ib_wc_op_sq(struct irdma_cq_poll_info *cq_poll_info, } } +static inline void set_ib_wc_op_rq_gen_3(struct irdma_cq_poll_info *info, + struct ib_wc *entry) +{ + switch (info->op_type) { + case IRDMA_OP_TYPE_RDMA_WRITE: + case IRDMA_OP_TYPE_RDMA_WRITE_SOL: + entry->opcode = IB_WC_RECV_RDMA_WITH_IMM; + break; + default: + entry->opcode = IB_WC_RECV; + } +} + static inline void set_ib_wc_op_rq(struct irdma_cq_poll_info *cq_poll_info, struct ib_wc *entry, bool send_imm_support) { diff --git a/drivers/infiniband/hw/irdma/virtchnl.c b/drivers/infiniband/hw/irdma/virtchnl.c new file mode 100644 index 000000000000..16ad27247527 --- /dev/null +++ b/drivers/infiniband/hw/irdma/virtchnl.c @@ -0,0 +1,618 @@ +// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB +/* Copyright (c) 2015 - 2024 Intel Corporation */ + +#include "osdep.h" +#include "hmc.h" +#include "defs.h" +#include "type.h" +#include "protos.h" +#include "virtchnl.h" +#include "ws.h" +#include "i40iw_hw.h" +#include "ig3rdma_hw.h" + +struct vchnl_reg_map_elem { + u16 reg_id; + u16 reg_idx; + bool pg_rel; +}; + +struct vchnl_regfld_map_elem { + u16 regfld_id; + u16 regfld_idx; +}; + +static struct vchnl_reg_map_elem vchnl_reg_map[] = { + {IRDMA_VCHNL_REG_ID_CQPTAIL, IRDMA_CQPTAIL, false}, + {IRDMA_VCHNL_REG_ID_CQPDB, IRDMA_CQPDB, false}, + {IRDMA_VCHNL_REG_ID_CCQPSTATUS, IRDMA_CCQPSTATUS, false}, + {IRDMA_VCHNL_REG_ID_CCQPHIGH, IRDMA_CCQPHIGH, false}, + {IRDMA_VCHNL_REG_ID_CCQPLOW, IRDMA_CCQPLOW, false}, + {IRDMA_VCHNL_REG_ID_CQARM, IRDMA_CQARM, false}, + {IRDMA_VCHNL_REG_ID_CQACK, IRDMA_CQACK, false}, + {IRDMA_VCHNL_REG_ID_AEQALLOC, IRDMA_AEQALLOC, false}, + {IRDMA_VCHNL_REG_ID_CQPERRCODES, IRDMA_CQPERRCODES, false}, + {IRDMA_VCHNL_REG_ID_WQEALLOC, IRDMA_WQEALLOC, false}, + {IRDMA_VCHNL_REG_ID_DB_ADDR_OFFSET, IRDMA_DB_ADDR_OFFSET, false }, + {IRDMA_VCHNL_REG_ID_DYN_CTL, IRDMA_GLINT_DYN_CTL, false }, + {IRDMA_VCHNL_REG_INV_ID, IRDMA_VCHNL_REG_INV_ID, false } +}; + +static struct vchnl_regfld_map_elem 
vchnl_regfld_map[] = { + {IRDMA_VCHNL_REGFLD_ID_CCQPSTATUS_CQP_OP_ERR, IRDMA_CCQPSTATUS_CCQP_ERR_M}, + {IRDMA_VCHNL_REGFLD_ID_CCQPSTATUS_CCQP_DONE, IRDMA_CCQPSTATUS_CCQP_DONE_M}, + {IRDMA_VCHNL_REGFLD_ID_CQPSQ_STAG_PDID, IRDMA_CQPSQ_STAG_PDID_M}, + {IRDMA_VCHNL_REGFLD_ID_CQPSQ_CQ_CEQID, IRDMA_CQPSQ_CQ_CEQID_M}, + {IRDMA_VCHNL_REGFLD_ID_CQPSQ_CQ_CQID, IRDMA_CQPSQ_CQ_CQID_M}, + {IRDMA_VCHNL_REGFLD_ID_COMMIT_FPM_CQCNT, IRDMA_COMMIT_FPM_CQCNT_M}, + {IRDMA_VCHNL_REGFLD_ID_UPESD_HMCN_ID, IRDMA_CQPSQ_UPESD_HMCFNID_M}, + {IRDMA_VCHNL_REGFLD_INV_ID, IRDMA_VCHNL_REGFLD_INV_ID} +}; + +#define IRDMA_VCHNL_REG_COUNT ARRAY_SIZE(vchnl_reg_map) +#define IRDMA_VCHNL_REGFLD_COUNT ARRAY_SIZE(vchnl_regfld_map) +#define IRDMA_VCHNL_REGFLD_BUF_SIZE \ + (IRDMA_VCHNL_REG_COUNT * sizeof(struct irdma_vchnl_reg_info) + \ + IRDMA_VCHNL_REGFLD_COUNT * sizeof(struct irdma_vchnl_reg_field_info)) +#define IRDMA_REGMAP_RESP_BUF_SIZE (IRDMA_VCHNL_RESP_MIN_SIZE + IRDMA_VCHNL_REGFLD_BUF_SIZE) + +/** + * irdma_sc_vchnl_init - Initialize dev virtchannel and get hw_rev + * @dev: dev structure to update + * @info: virtchannel info parameters to fill into the dev structure + */ +int irdma_sc_vchnl_init(struct irdma_sc_dev *dev, + struct irdma_vchnl_init_info *info) +{ + dev->vchnl_up = true; + dev->privileged = info->privileged; + dev->is_pf = info->is_pf; + dev->hw_attrs.uk_attrs.hw_rev = info->hw_rev; + + if (!dev->privileged) { + int ret = irdma_vchnl_req_get_ver(dev, IRDMA_VCHNL_CHNL_VER_MAX, + &dev->vchnl_ver); + + ibdev_dbg(to_ibdev(dev), + "DEV: Get Channel version ret = %d, version is %u\n", + ret, dev->vchnl_ver); + + if (ret) + return ret; + + ret = irdma_vchnl_req_get_caps(dev); + if (ret) + return ret; + + dev->hw_attrs.uk_attrs.hw_rev = dev->vc_caps.hw_rev; + } + + return 0; +} + +/** + * irdma_vchnl_req_verify_resp - Verify requested response size + * @vchnl_req: vchnl message requested + * @resp_len: response length sent from vchnl peer + */ +static int irdma_vchnl_req_verify_resp(struct irdma_vchnl_req *vchnl_req, + u16 resp_len) +{ + switch (vchnl_req->vchnl_msg->op_code) { + case IRDMA_VCHNL_OP_GET_VER: + case IRDMA_VCHNL_OP_GET_HMC_FCN: + case IRDMA_VCHNL_OP_PUT_HMC_FCN: + if (resp_len != vchnl_req->parm_len) + return -EBADMSG; + break; + case IRDMA_VCHNL_OP_GET_RDMA_CAPS: + if (resp_len < IRDMA_VCHNL_OP_GET_RDMA_CAPS_MIN_SIZE) + return -EBADMSG; + break; + case IRDMA_VCHNL_OP_GET_REG_LAYOUT: + case IRDMA_VCHNL_OP_QUEUE_VECTOR_MAP: + case IRDMA_VCHNL_OP_QUEUE_VECTOR_UNMAP: + case IRDMA_VCHNL_OP_ADD_VPORT: + case IRDMA_VCHNL_OP_DEL_VPORT: + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static void irdma_free_vchnl_req_msg(struct irdma_vchnl_req *vchnl_req) +{ + kfree(vchnl_req->vchnl_msg); +} + +static int irdma_alloc_vchnl_req_msg(struct irdma_vchnl_req *vchnl_req, + struct irdma_vchnl_req_init_info *info) +{ + struct irdma_vchnl_op_buf *vchnl_msg; + + vchnl_msg = kzalloc(IRDMA_VCHNL_MAX_MSG_SIZE, GFP_KERNEL); + + if (!vchnl_msg) + return -ENOMEM; + + vchnl_msg->op_ctx = (uintptr_t)vchnl_req; + vchnl_msg->buf_len = sizeof(*vchnl_msg) + info->req_parm_len; + if (info->req_parm_len) + memcpy(vchnl_msg->buf, info->req_parm, info->req_parm_len); + vchnl_msg->op_code = info->op_code; + vchnl_msg->op_ver = info->op_ver; + + vchnl_req->vchnl_msg = vchnl_msg; + vchnl_req->parm = info->resp_parm; + vchnl_req->parm_len = info->resp_parm_len; + + return 0; +} + +static int irdma_vchnl_req_send_sync(struct irdma_sc_dev *dev, + struct irdma_vchnl_req_init_info *info) +{ + u16 resp_len = 
sizeof(dev->vc_recv_buf); + struct irdma_vchnl_req vchnl_req = {}; + u16 msg_len; + u8 *msg; + int ret; + + ret = irdma_alloc_vchnl_req_msg(&vchnl_req, info); + if (ret) + return ret; + + msg_len = vchnl_req.vchnl_msg->buf_len; + msg = (u8 *)vchnl_req.vchnl_msg; + + mutex_lock(&dev->vchnl_mutex); + ret = ig3rdma_vchnl_send_sync(dev, msg, msg_len, dev->vc_recv_buf, + &resp_len); + dev->vc_recv_len = resp_len; + if (ret) + goto exit; + + ret = irdma_vchnl_req_get_resp(dev, &vchnl_req); +exit: + mutex_unlock(&dev->vchnl_mutex); + ibdev_dbg(to_ibdev(dev), + "VIRT: virtual channel send %s caller: %pS ret=%d op=%u op_ver=%u req_len=%u parm_len=%u resp_len=%u\n", + !ret ? "SUCCEEDS" : "FAILS", __builtin_return_address(0), + ret, vchnl_req.vchnl_msg->op_code, + vchnl_req.vchnl_msg->op_ver, vchnl_req.vchnl_msg->buf_len, + vchnl_req.parm_len, vchnl_req.resp_len); + irdma_free_vchnl_req_msg(&vchnl_req); + + return ret; +} + +/** + * irdma_vchnl_req_get_reg_layout - Get Register Layout + * @dev: RDMA device pointer + */ +int irdma_vchnl_req_get_reg_layout(struct irdma_sc_dev *dev) +{ + u16 reg_idx, reg_id, tmp_reg_id, regfld_idx, regfld_id, tmp_regfld_id; + struct irdma_vchnl_reg_field_info *regfld_array = NULL; + u8 resp_buffer[IRDMA_REGMAP_RESP_BUF_SIZE] = {}; + struct vchnl_regfld_map_elem *regfld_map_array; + struct irdma_vchnl_req_init_info info = {}; + struct vchnl_reg_map_elem *reg_map_array; + struct irdma_vchnl_reg_info *reg_array; + u8 num_bits, shift_cnt; + u16 buf_len = 0; + u64 bitmask; + u32 rindex; + int ret; + + if (!dev->vchnl_up) + return -EBUSY; + + info.op_code = IRDMA_VCHNL_OP_GET_REG_LAYOUT; + info.op_ver = IRDMA_VCHNL_OP_GET_REG_LAYOUT_V0; + info.resp_parm = resp_buffer; + info.resp_parm_len = sizeof(resp_buffer); + + ret = irdma_vchnl_req_send_sync(dev, &info); + + if (ret) + return ret; + + /* parse the response buffer and update reg info*/ + /* Parse registers till invalid */ + /* Parse register fields till invalid */ + reg_array = (struct irdma_vchnl_reg_info *)resp_buffer; + for (rindex = 0; rindex < IRDMA_VCHNL_REG_COUNT; rindex++) { + buf_len += sizeof(struct irdma_vchnl_reg_info); + if (buf_len >= sizeof(resp_buffer)) + return -ENOMEM; + + regfld_array = + (struct irdma_vchnl_reg_field_info *)®_array[rindex + 1]; + reg_id = reg_array[rindex].reg_id; + if (reg_id == IRDMA_VCHNL_REG_INV_ID) + break; + + reg_id &= ~IRDMA_VCHNL_REG_PAGE_REL; + if (reg_id >= IRDMA_VCHNL_REG_COUNT) + return -EINVAL; + + /* search regmap for register index in hw_regs.*/ + reg_map_array = vchnl_reg_map; + do { + tmp_reg_id = reg_map_array->reg_id; + if (tmp_reg_id == reg_id) + break; + + reg_map_array++; + } while (tmp_reg_id != IRDMA_VCHNL_REG_INV_ID); + if (tmp_reg_id != reg_id) + continue; + + reg_idx = reg_map_array->reg_idx; + + /* Page relative, DB Offset do not need bar offset */ + if (reg_idx == IRDMA_DB_ADDR_OFFSET || + (reg_array[rindex].reg_id & IRDMA_VCHNL_REG_PAGE_REL)) { + dev->hw_regs[reg_idx] = + (u32 __iomem *)(uintptr_t)reg_array[rindex].reg_offset; + continue; + } + + /* Update the local HW struct */ + dev->hw_regs[reg_idx] = ig3rdma_get_reg_addr(dev->hw, + reg_array[rindex].reg_offset); + if (!dev->hw_regs[reg_idx]) + return -EINVAL; + } + + if (!regfld_array) + return -ENOMEM; + + /* set up doorbell variables using mapped DB page */ + dev->wqe_alloc_db = dev->hw_regs[IRDMA_WQEALLOC]; + dev->cq_arm_db = dev->hw_regs[IRDMA_CQARM]; + dev->aeq_alloc_db = dev->hw_regs[IRDMA_AEQALLOC]; + dev->cqp_db = dev->hw_regs[IRDMA_CQPDB]; + dev->cq_ack_db = dev->hw_regs[IRDMA_CQACK]; + + for 
(rindex = 0; rindex < IRDMA_VCHNL_REGFLD_COUNT; rindex++) { + buf_len += sizeof(struct irdma_vchnl_reg_field_info); + if ((buf_len - 1) > sizeof(resp_buffer)) + break; + + if (regfld_array[rindex].fld_id == IRDMA_VCHNL_REGFLD_INV_ID) + break; + + regfld_id = regfld_array[rindex].fld_id; + regfld_map_array = vchnl_regfld_map; + do { + tmp_regfld_id = regfld_map_array->regfld_id; + if (tmp_regfld_id == regfld_id) + break; + + regfld_map_array++; + } while (tmp_regfld_id != IRDMA_VCHNL_REGFLD_INV_ID); + + if (tmp_regfld_id != regfld_id) + continue; + + regfld_idx = regfld_map_array->regfld_idx; + + num_bits = regfld_array[rindex].fld_bits; + shift_cnt = regfld_array[rindex].fld_shift; + if ((num_bits + shift_cnt > 64) || !num_bits) { + ibdev_dbg(to_ibdev(dev), + "ERR: Invalid field mask id %d bits %d shift %d", + regfld_id, num_bits, shift_cnt); + + continue; + } + + bitmask = (1ULL << num_bits) - 1; + dev->hw_masks[regfld_idx] = bitmask << shift_cnt; + dev->hw_shifts[regfld_idx] = shift_cnt; + } + + return 0; +} + +int irdma_vchnl_req_add_vport(struct irdma_sc_dev *dev, u16 vport_id, + u32 qp1_id, struct irdma_qos *qos) +{ + struct irdma_vchnl_resp_vport_info resp_vport = { 0 }; + struct irdma_vchnl_req_vport_info req_vport = { 0 }; + struct irdma_vchnl_req_init_info info = { 0 }; + int ret, i; + + if (!dev->vchnl_up) + return -EBUSY; + + info.op_code = IRDMA_VCHNL_OP_ADD_VPORT; + info.op_ver = IRDMA_VCHNL_OP_ADD_VPORT_V0; + req_vport.vport_id = vport_id; + req_vport.qp1_id = qp1_id; + info.req_parm_len = sizeof(req_vport); + info.req_parm = &req_vport; + info.resp_parm = &resp_vport; + info.resp_parm_len = sizeof(resp_vport); + + ret = irdma_vchnl_req_send_sync(dev, &info); + if (ret) + return ret; + + for (i = 0; i < IRDMA_MAX_USER_PRIORITY; i++) { + qos[i].qs_handle = resp_vport.qs_handle[i]; + qos[i].valid = true; + } + + return 0; +} + +int irdma_vchnl_req_del_vport(struct irdma_sc_dev *dev, u16 vport_id, u32 qp1_id) +{ + struct irdma_vchnl_req_init_info info = { 0 }; + struct irdma_vchnl_req_vport_info req_vport = { 0 }; + + if (!dev->vchnl_up) + return -EBUSY; + + info.op_code = IRDMA_VCHNL_OP_DEL_VPORT; + info.op_ver = IRDMA_VCHNL_OP_DEL_VPORT_V0; + req_vport.vport_id = vport_id; + req_vport.qp1_id = qp1_id; + info.req_parm_len = sizeof(req_vport); + info.req_parm = &req_vport; + + return irdma_vchnl_req_send_sync(dev, &info); +} + +/** + * irdma_vchnl_req_aeq_vec_map - Map AEQ to vector on this function + * @dev: RDMA device pointer + * @v_idx: vector index + */ +int irdma_vchnl_req_aeq_vec_map(struct irdma_sc_dev *dev, u32 v_idx) +{ + struct irdma_vchnl_req_init_info info = {}; + struct irdma_vchnl_qvlist_info *qvl; + struct irdma_vchnl_qv_info *qv; + u16 qvl_size, num_vectors = 1; + int ret; + + if (!dev->vchnl_up) + return -EBUSY; + + qvl_size = struct_size(qvl, qv_info, num_vectors); + + qvl = kzalloc(qvl_size, GFP_KERNEL); + if (!qvl) + return -ENOMEM; + + qvl->num_vectors = 1; + qv = qvl->qv_info; + + qv->ceq_idx = IRDMA_Q_INVALID_IDX; + qv->v_idx = v_idx; + qv->itr_idx = IRDMA_IDX_ITR0; + + info.op_code = IRDMA_VCHNL_OP_QUEUE_VECTOR_MAP; + info.op_ver = IRDMA_VCHNL_OP_QUEUE_VECTOR_MAP_V0; + info.req_parm = qvl; + info.req_parm_len = qvl_size; + + ret = irdma_vchnl_req_send_sync(dev, &info); + kfree(qvl); + + return ret; +} + +/** + * irdma_vchnl_req_ceq_vec_map - Map CEQ to vector on this function + * @dev: RDMA device pointer + * @ceq_id: CEQ index + * @v_idx: vector index + */ +int irdma_vchnl_req_ceq_vec_map(struct irdma_sc_dev *dev, u16 ceq_id, u32 v_idx) +{ + struct 
irdma_vchnl_req_init_info info = {}; + struct irdma_vchnl_qvlist_info *qvl; + struct irdma_vchnl_qv_info *qv; + u16 qvl_size, num_vectors = 1; + int ret; + + if (!dev->vchnl_up) + return -EBUSY; + + qvl_size = struct_size(qvl, qv_info, num_vectors); + + qvl = kzalloc(qvl_size, GFP_KERNEL); + if (!qvl) + return -ENOMEM; + + qvl->num_vectors = num_vectors; + qv = qvl->qv_info; + + qv->aeq_idx = IRDMA_Q_INVALID_IDX; + qv->ceq_idx = ceq_id; + qv->v_idx = v_idx; + qv->itr_idx = IRDMA_IDX_ITR0; + + info.op_code = IRDMA_VCHNL_OP_QUEUE_VECTOR_MAP; + info.op_ver = IRDMA_VCHNL_OP_QUEUE_VECTOR_MAP_V0; + info.req_parm = qvl; + info.req_parm_len = qvl_size; + + ret = irdma_vchnl_req_send_sync(dev, &info); + kfree(qvl); + + return ret; +} + +/** + * irdma_vchnl_req_get_ver - Request Channel version + * @dev: RDMA device pointer + * @ver_req: Virtual channel version requested + * @ver_res: Virtual channel version response + */ +int irdma_vchnl_req_get_ver(struct irdma_sc_dev *dev, u16 ver_req, u32 *ver_res) +{ + struct irdma_vchnl_req_init_info info = {}; + int ret; + + if (!dev->vchnl_up) + return -EBUSY; + + info.op_code = IRDMA_VCHNL_OP_GET_VER; + info.op_ver = ver_req; + info.resp_parm = ver_res; + info.resp_parm_len = sizeof(*ver_res); + + ret = irdma_vchnl_req_send_sync(dev, &info); + if (ret) + return ret; + + if (*ver_res < IRDMA_VCHNL_CHNL_VER_MIN) { + ibdev_dbg(to_ibdev(dev), + "VIRT: %s unsupported vchnl version 0x%0x\n", + __func__, *ver_res); + return -EOPNOTSUPP; + } + + return 0; +} + +/** + * irdma_vchnl_req_get_hmc_fcn - Request VF HMC Function + * @dev: RDMA device pointer + */ +int irdma_vchnl_req_get_hmc_fcn(struct irdma_sc_dev *dev) +{ + struct irdma_vchnl_req_hmc_info req_hmc = {}; + struct irdma_vchnl_resp_hmc_info resp_hmc = {}; + struct irdma_vchnl_req_init_info info = {}; + int ret; + + if (!dev->vchnl_up) + return -EBUSY; + + info.op_code = IRDMA_VCHNL_OP_GET_HMC_FCN; + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + info.op_ver = IRDMA_VCHNL_OP_GET_HMC_FCN_V2; + req_hmc.protocol_used = dev->protocol_used; + info.req_parm_len = sizeof(req_hmc); + info.req_parm = &req_hmc; + info.resp_parm = &resp_hmc; + info.resp_parm_len = sizeof(resp_hmc); + } + + ret = irdma_vchnl_req_send_sync(dev, &info); + + if (ret) + return ret; + + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + int i; + + dev->hmc_fn_id = resp_hmc.hmc_func; + + for (i = 0; i < IRDMA_MAX_USER_PRIORITY; i++) { + dev->qos[i].qs_handle = resp_hmc.qs_handle[i]; + dev->qos[i].valid = true; + } + } + return 0; +} + +/** + * irdma_vchnl_req_put_hmc_fcn - Free VF HMC Function + * @dev: RDMA device pointer + */ +int irdma_vchnl_req_put_hmc_fcn(struct irdma_sc_dev *dev) +{ + struct irdma_vchnl_req_init_info info = {}; + + if (!dev->vchnl_up) + return -EBUSY; + + info.op_code = IRDMA_VCHNL_OP_PUT_HMC_FCN; + info.op_ver = IRDMA_VCHNL_OP_PUT_HMC_FCN_V0; + + return irdma_vchnl_req_send_sync(dev, &info); +} + +/** + * irdma_vchnl_req_get_caps - Request RDMA capabilities + * @dev: RDMA device pointer + */ +int irdma_vchnl_req_get_caps(struct irdma_sc_dev *dev) +{ + struct irdma_vchnl_req_init_info info = {}; + int ret; + + if (!dev->vchnl_up) + return -EBUSY; + + info.op_code = IRDMA_VCHNL_OP_GET_RDMA_CAPS; + info.op_ver = IRDMA_VCHNL_OP_GET_RDMA_CAPS_V0; + info.resp_parm = &dev->vc_caps; + info.resp_parm_len = sizeof(dev->vc_caps); + + ret = irdma_vchnl_req_send_sync(dev, &info); + + if (ret) + return ret; + + if (dev->vc_caps.hw_rev > IRDMA_GEN_MAX || + dev->vc_caps.hw_rev < IRDMA_GEN_2) { + ibdev_dbg(to_ibdev(dev), + 
"ERR: %s unsupported hw_rev version 0x%0x\n", + __func__, dev->vc_caps.hw_rev); + return -EOPNOTSUPP; + } + + return 0; +} + +/** + * irdma_vchnl_req_get_resp - Receive the inbound vchnl response. + * @dev: Dev pointer + * @vchnl_req: Vchannel request + */ +int irdma_vchnl_req_get_resp(struct irdma_sc_dev *dev, + struct irdma_vchnl_req *vchnl_req) +{ + struct irdma_vchnl_resp_buf *vchnl_msg_resp = + (struct irdma_vchnl_resp_buf *)dev->vc_recv_buf; + u16 resp_len; + int ret; + + if ((uintptr_t)vchnl_req != (uintptr_t)vchnl_msg_resp->op_ctx) { + ibdev_dbg(to_ibdev(dev), + "VIRT: error vchnl context value does not match\n"); + return -EBADMSG; + } + + resp_len = dev->vc_recv_len - sizeof(*vchnl_msg_resp); + resp_len = min(resp_len, vchnl_req->parm_len); + + ret = irdma_vchnl_req_verify_resp(vchnl_req, resp_len); + if (ret) + return ret; + + ret = (int)vchnl_msg_resp->op_ret; + if (ret) + return ret; + + vchnl_req->resp_len = 0; + if (vchnl_req->parm_len && vchnl_req->parm && resp_len) { + memcpy(vchnl_req->parm, vchnl_msg_resp->buf, resp_len); + vchnl_req->resp_len = resp_len; + ibdev_dbg(to_ibdev(dev), "VIRT: Got response, data size %u\n", + resp_len); + } + + return 0; +} diff --git a/drivers/infiniband/hw/irdma/virtchnl.h b/drivers/infiniband/hw/irdma/virtchnl.h new file mode 100644 index 000000000000..aa955a9125bd --- /dev/null +++ b/drivers/infiniband/hw/irdma/virtchnl.h @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB */ +/* Copyright (c) 2015 - 2024 Intel Corporation */ +#ifndef IRDMA_VIRTCHNL_H +#define IRDMA_VIRTCHNL_H + +#include "hmc.h" +#include "irdma.h" + +/* IRDMA_VCHNL_CHNL_VER_V0 is for legacy hw, no longer supported. */ +#define IRDMA_VCHNL_CHNL_VER_V2 2 +#define IRDMA_VCHNL_CHNL_VER_MIN IRDMA_VCHNL_CHNL_VER_V2 +#define IRDMA_VCHNL_CHNL_VER_MAX IRDMA_VCHNL_CHNL_VER_V2 +#define IRDMA_VCHNL_OP_GET_HMC_FCN_V0 0 +#define IRDMA_VCHNL_OP_GET_HMC_FCN_V1 1 +#define IRDMA_VCHNL_OP_GET_HMC_FCN_V2 2 +#define IRDMA_VCHNL_OP_PUT_HMC_FCN_V0 0 +#define IRDMA_VCHNL_OP_GET_REG_LAYOUT_V0 0 +#define IRDMA_VCHNL_OP_QUEUE_VECTOR_MAP_V0 0 +#define IRDMA_VCHNL_OP_QUEUE_VECTOR_UNMAP_V0 0 +#define IRDMA_VCHNL_OP_ADD_VPORT_V0 0 +#define IRDMA_VCHNL_OP_DEL_VPORT_V0 0 +#define IRDMA_VCHNL_OP_GET_RDMA_CAPS_V0 0 +#define IRDMA_VCHNL_OP_GET_RDMA_CAPS_MIN_SIZE 1 + +#define IRDMA_VCHNL_REG_ID_CQPTAIL 0 +#define IRDMA_VCHNL_REG_ID_CQPDB 1 +#define IRDMA_VCHNL_REG_ID_CCQPSTATUS 2 +#define IRDMA_VCHNL_REG_ID_CCQPHIGH 3 +#define IRDMA_VCHNL_REG_ID_CCQPLOW 4 +#define IRDMA_VCHNL_REG_ID_CQARM 5 +#define IRDMA_VCHNL_REG_ID_CQACK 6 +#define IRDMA_VCHNL_REG_ID_AEQALLOC 7 +#define IRDMA_VCHNL_REG_ID_CQPERRCODES 8 +#define IRDMA_VCHNL_REG_ID_WQEALLOC 9 +#define IRDMA_VCHNL_REG_ID_IPCONFIG0 10 +#define IRDMA_VCHNL_REG_ID_DB_ADDR_OFFSET 11 +#define IRDMA_VCHNL_REG_ID_DYN_CTL 12 +#define IRDMA_VCHNL_REG_ID_AEQITRMASK 13 +#define IRDMA_VCHNL_REG_ID_CEQITRMASK 14 +#define IRDMA_VCHNL_REG_INV_ID 0xFFFF +#define IRDMA_VCHNL_REG_PAGE_REL 0x8000 + +#define IRDMA_VCHNL_REGFLD_ID_CCQPSTATUS_CQP_OP_ERR 2 +#define IRDMA_VCHNL_REGFLD_ID_CCQPSTATUS_CCQP_DONE 5 +#define IRDMA_VCHNL_REGFLD_ID_CQPSQ_STAG_PDID 6 +#define IRDMA_VCHNL_REGFLD_ID_CQPSQ_CQ_CEQID 7 +#define IRDMA_VCHNL_REGFLD_ID_CQPSQ_CQ_CQID 8 +#define IRDMA_VCHNL_REGFLD_ID_COMMIT_FPM_CQCNT 9 +#define IRDMA_VCHNL_REGFLD_ID_UPESD_HMCN_ID 10 +#define IRDMA_VCHNL_REGFLD_INV_ID 0xFFFF + +#define IRDMA_VCHNL_RESP_MIN_SIZE (sizeof(struct irdma_vchnl_resp_buf)) + +enum irdma_vchnl_ops { + IRDMA_VCHNL_OP_GET_VER = 0, + IRDMA_VCHNL_OP_GET_HMC_FCN = 1, + 
IRDMA_VCHNL_OP_PUT_HMC_FCN = 2, + IRDMA_VCHNL_OP_GET_REG_LAYOUT = 11, + IRDMA_VCHNL_OP_GET_RDMA_CAPS = 13, + IRDMA_VCHNL_OP_QUEUE_VECTOR_MAP = 14, + IRDMA_VCHNL_OP_QUEUE_VECTOR_UNMAP = 15, + IRDMA_VCHNL_OP_ADD_VPORT = 16, + IRDMA_VCHNL_OP_DEL_VPORT = 17, +}; + +struct irdma_vchnl_req_hmc_info { + u8 protocol_used; + u8 disable_qos; +} __packed; + +struct irdma_vchnl_resp_hmc_info { + u16 hmc_func; + u16 qs_handle[IRDMA_MAX_USER_PRIORITY]; +} __packed; + +struct irdma_vchnl_qv_info { + u32 v_idx; + u16 ceq_idx; + u16 aeq_idx; + u8 itr_idx; +}; + +struct irdma_vchnl_qvlist_info { + u32 num_vectors; + struct irdma_vchnl_qv_info qv_info[]; +}; + +struct irdma_vchnl_req_vport_info { + u16 vport_id; + u32 qp1_id; +}; + +struct irdma_vchnl_resp_vport_info { + u16 qs_handle[IRDMA_MAX_USER_PRIORITY]; +}; + +struct irdma_vchnl_op_buf { + u16 op_code; + u16 op_ver; + u16 buf_len; + u16 rsvd; + u64 op_ctx; + u8 buf[]; +} __packed; + +struct irdma_vchnl_resp_buf { + u64 op_ctx; + u16 buf_len; + s16 op_ret; + u16 rsvd[2]; + u8 buf[]; +} __packed; + +struct irdma_vchnl_rdma_caps { + u8 hw_rev; + u16 cqp_timeout_s; + u16 cqp_def_timeout_s; + u16 max_hw_push_len; +} __packed; + +struct irdma_vchnl_init_info { + struct workqueue_struct *vchnl_wq; + enum irdma_vers hw_rev; + bool privileged; + bool is_pf; +}; + +struct irdma_vchnl_reg_info { + u32 reg_offset; + u16 field_cnt; + u16 reg_id; /* High bit of reg_id: bar or page relative */ +}; + +struct irdma_vchnl_reg_field_info { + u8 fld_shift; + u8 fld_bits; + u16 fld_id; +}; + +struct irdma_vchnl_req { + struct irdma_vchnl_op_buf *vchnl_msg; + void *parm; + u32 vf_id; + u16 parm_len; + u16 resp_len; +}; + +struct irdma_vchnl_req_init_info { + void *req_parm; + void *resp_parm; + u16 req_parm_len; + u16 resp_parm_len; + u16 op_code; + u16 op_ver; +} __packed; + +struct irdma_qos; + +int irdma_sc_vchnl_init(struct irdma_sc_dev *dev, + struct irdma_vchnl_init_info *info); +int irdma_vchnl_req_get_ver(struct irdma_sc_dev *dev, u16 ver_req, + u32 *ver_res); +int irdma_vchnl_req_get_hmc_fcn(struct irdma_sc_dev *dev); +int irdma_vchnl_req_put_hmc_fcn(struct irdma_sc_dev *dev); +int irdma_vchnl_req_get_caps(struct irdma_sc_dev *dev); +int irdma_vchnl_req_get_resp(struct irdma_sc_dev *dev, + struct irdma_vchnl_req *vc_req); +int irdma_vchnl_req_get_reg_layout(struct irdma_sc_dev *dev); +int irdma_vchnl_req_aeq_vec_map(struct irdma_sc_dev *dev, u32 v_idx); +int irdma_vchnl_req_ceq_vec_map(struct irdma_sc_dev *dev, u16 ceq_id, + u32 v_idx); +int irdma_vchnl_req_add_vport(struct irdma_sc_dev *dev, u16 vport_id, + u32 qp1_id, struct irdma_qos *qos); +int irdma_vchnl_req_del_vport(struct irdma_sc_dev *dev, u16 vport_id, + u32 qp1_id); +#endif /* IRDMA_VIRTCHNL_H */ diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index 28e154bbb50f..1becc8779123 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -291,6 +291,32 @@ out: return wc_index; } +void mana_drain_gsi_sqs(struct mana_ib_dev *mdev) +{ + struct mana_ib_qp *qp = mana_get_qp_ref(mdev, MANA_GSI_QPN, false); + struct ud_sq_shadow_wqe *shadow_wqe; + struct mana_ib_cq *cq; + unsigned long flags; + + if (!qp) + return; + + cq = container_of(qp->ibqp.send_cq, struct mana_ib_cq, ibcq); + + spin_lock_irqsave(&cq->cq_lock, flags); + while ((shadow_wqe = shadow_queue_get_next_to_complete(&qp->shadow_sq)) + != NULL) { + shadow_wqe->header.error_code = IB_WC_GENERAL_ERR; + shadow_queue_advance_next_to_complete(&qp->shadow_sq); + } + 
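/* All pending GSI SQ WQEs are now flagged with IB_WC_GENERAL_ERR; drop the CQ lock and notify the completion handler so the flushed completions can be reaped. */ +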
spin_unlock_irqrestore(&cq->cq_lock, flags); + + if (cq->ibcq.comp_handler) + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + + mana_put_qp_ref(qp); +} + int mana_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) { struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index fa60872f169f..bdeddb642b87 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -230,6 +230,9 @@ static void mana_ib_remove(struct auxiliary_device *adev) { struct mana_ib_dev *dev = dev_get_drvdata(&adev->dev); + if (mana_ib_is_rnic(dev)) + mana_drain_gsi_sqs(dev); + ib_unregister_device(&dev->ib_dev); dma_pool_destroy(dev->av_pool); if (mana_ib_is_rnic(dev)) { diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 6a2471f2e804..fac159f7128d 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -273,9 +273,8 @@ int mana_ib_create_queue(struct mana_ib_dev *mdev, u64 addr, u32 size, umem = ib_umem_get(&mdev->ib_dev, addr, size, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(umem)) { - err = PTR_ERR(umem); - ibdev_dbg(&mdev->ib_dev, "Failed to get umem, %d\n", err); - return err; + ibdev_dbg(&mdev->ib_dev, "Failed to get umem, %pe\n", umem); + return PTR_ERR(umem); } err = mana_ib_create_zero_offset_dma_region(mdev, umem, &queue->gdma_region); diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 5d31034ac7fb..9d36232ed880 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -43,6 +43,8 @@ */ #define MANA_AV_BUFFER_SIZE 64 +#define MANA_GSI_QPN (1) + struct mana_ib_adapter_caps { u32 max_sq_id; u32 max_rq_id; @@ -410,7 +412,7 @@ struct mana_ib_ah_attr { u8 traffic_class; u16 src_port; u16 dest_port; - u32 reserved; + u32 flow_label; }; struct mana_rnic_set_qp_state_req { @@ -427,8 +429,15 @@ struct mana_rnic_set_qp_state_req { u32 retry_cnt; u32 rnr_retry; u32 min_rnr_timer; - u32 reserved; + u32 rate_limit; struct mana_ib_ah_attr ah_attr; + u64 reserved1; + u32 qkey; + u32 qp_access_flags; + u8 local_ack_timeout; + u8 max_rd_atomic; + u16 reserved2; + u32 reserved3; }; /* HW Data */ struct mana_rnic_set_qp_state_resp { @@ -718,6 +727,7 @@ int mana_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, int mana_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr); +void mana_drain_gsi_sqs(struct mana_ib_dev *mdev); int mana_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mana_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c index 55701046ffba..3d0245a4c1ed 100644 --- a/drivers/infiniband/hw/mana/mr.c +++ b/drivers/infiniband/hw/mana/mr.c @@ -138,7 +138,8 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); ibdev_dbg(ibdev, - "Failed to get umem for register user-mr, %d\n", err); + "Failed to get umem for register user-mr, %pe\n", + mr->umem); goto err_free; } @@ -220,7 +221,8 @@ struct ib_mr *mana_ib_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 leng umem_dmabuf = ib_umem_dmabuf_get_pinned(ibdev, start, length, fd, access_flags); if (IS_ERR(umem_dmabuf)) { err = PTR_ERR(umem_dmabuf); - ibdev_dbg(ibdev, "Failed to get dmabuf umem, %d\n", err); + ibdev_dbg(ibdev, "Failed to 
get dmabuf umem, %pe\n", + umem_dmabuf); goto err_free; } diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index a6bf4d539e67..48c1f4977f21 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -735,6 +735,8 @@ static int mana_ib_gd_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int err; mana_gd_init_req_hdr(&req.hdr, MANA_IB_SET_QP_STATE, sizeof(req), sizeof(resp)); + + req.hdr.req.msg_version = GDMA_MESSAGE_V3; req.hdr.dev_id = mdev->gdma_dev->dev_id; req.adapter = mdev->adapter_handle; req.qp_handle = qp->qp_handle; @@ -748,6 +750,12 @@ static int mana_ib_gd_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, req.retry_cnt = attr->retry_cnt; req.rnr_retry = attr->rnr_retry; req.min_rnr_timer = attr->min_rnr_timer; + req.rate_limit = attr->rate_limit; + req.qkey = attr->qkey; + req.local_ack_timeout = attr->timeout; + req.qp_access_flags = attr->qp_access_flags; + req.max_rd_atomic = attr->max_rd_atomic; + if (attr_mask & IB_QP_AV) { ndev = mana_ib_get_netdev(&mdev->ib_dev, ibqp->port); if (!ndev) { @@ -774,6 +782,7 @@ static int mana_ib_gd_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, ibqp->qp_num, attr->dest_qp_num); req.ah_attr.traffic_class = attr->ah_attr.grh.traffic_class >> 2; req.ah_attr.hop_limit = attr->ah_attr.grh.hop_limit; + req.ah_attr.flow_label = attr->ah_attr.grh.flow_label; } err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index e6e132f10625..91c714f72099 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -1836,9 +1836,9 @@ static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx, tun_qp->qp = ib_create_qp(ctx->pd, &qp_init_attr.init_attr); if (IS_ERR(tun_qp->qp)) { ret = PTR_ERR(tun_qp->qp); + pr_err("Couldn't create %s QP (%pe)\n", + create_tun ? "tunnel" : "special", tun_qp->qp); tun_qp->qp = NULL; - pr_err("Couldn't create %s QP (%d)\n", - create_tun ? 
"tunnel" : "special", ret); return ret; } @@ -2017,14 +2017,14 @@ static int create_pv_resources(struct ib_device *ibdev, int slave, int port, NULL, ctx, &cq_attr); if (IS_ERR(ctx->cq)) { ret = PTR_ERR(ctx->cq); - pr_err("Couldn't create tunnel CQ (%d)\n", ret); + pr_err("Couldn't create tunnel CQ (%pe)\n", ctx->cq); goto err_buf; } ctx->pd = ib_alloc_pd(ctx->ib_dev, 0); if (IS_ERR(ctx->pd)) { ret = PTR_ERR(ctx->pd); - pr_err("Couldn't create tunnel PD (%d)\n", ret); + pr_err("Couldn't create tunnel PD (%pe)\n", ctx->pd); goto err_cq; } diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 50fd407103c7..f2887ae6390e 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1652,7 +1652,8 @@ int mlx4_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, sqp->roce_v2_gsi = ib_create_qp(pd, init_attr); if (IS_ERR(sqp->roce_v2_gsi)) { - pr_err("Failed to create GSI QP for RoCEv2 (%ld)\n", PTR_ERR(sqp->roce_v2_gsi)); + pr_err("Failed to create GSI QP for RoCEv2 (%pe)\n", + sqp->roce_v2_gsi); sqp->roce_v2_gsi = NULL; } else { to_mqp(sqp->roce_v2_gsi)->flags |= diff --git a/drivers/infiniband/hw/mlx5/data_direct.c b/drivers/infiniband/hw/mlx5/data_direct.c index b9ba84afaae2..b81ac5709b56 100644 --- a/drivers/infiniband/hw/mlx5/data_direct.c +++ b/drivers/infiniband/hw/mlx5/data_direct.c @@ -35,7 +35,7 @@ static int mlx5_data_direct_vpd_get_vuid(struct mlx5_data_direct_dev *dev) vpd_data = pci_vpd_alloc(pdev, &vpd_size); if (IS_ERR(vpd_data)) { - pci_err(pdev, "Unable to read VPD, err=%ld\n", PTR_ERR(vpd_data)); + pci_err(pdev, "Unable to read VPD, err=%pe\n", vpd_data); return PTR_ERR(vpd_data); } diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c index b804f2dd5628..d5487834ed25 100644 --- a/drivers/infiniband/hw/mlx5/gsi.c +++ b/drivers/infiniband/hw/mlx5/gsi.c @@ -131,8 +131,9 @@ int mlx5_ib_create_gsi(struct ib_pd *pd, struct mlx5_ib_qp *mqp, gsi->cq = ib_alloc_cq(pd->device, gsi, attr->cap.max_send_wr, 0, IB_POLL_SOFTIRQ); if (IS_ERR(gsi->cq)) { - mlx5_ib_warn(dev, "unable to create send CQ for GSI QP. error %ld\n", - PTR_ERR(gsi->cq)); + mlx5_ib_warn(dev, + "unable to create send CQ for GSI QP. error %pe\n", + gsi->cq); ret = PTR_ERR(gsi->cq); goto err_free_wrs; } @@ -147,8 +148,9 @@ int mlx5_ib_create_gsi(struct ib_pd *pd, struct mlx5_ib_qp *mqp, gsi->rx_qp = ib_create_qp(pd, &hw_init_attr); if (IS_ERR(gsi->rx_qp)) { - mlx5_ib_warn(dev, "unable to create hardware GSI QP. error %ld\n", - PTR_ERR(gsi->rx_qp)); + mlx5_ib_warn(dev, + "unable to create hardware GSI QP. 
error %pe\n", + gsi->rx_qp); ret = PTR_ERR(gsi->rx_qp); goto err_destroy_cq; } @@ -294,8 +296,9 @@ static void setup_qp(struct mlx5_ib_gsi_qp *gsi, u16 qp_index) qp = create_gsi_ud_qp(gsi); if (IS_ERR(qp)) { - mlx5_ib_warn(dev, "unable to create hardware UD QP for GSI: %ld\n", - PTR_ERR(qp)); + mlx5_ib_warn(dev, + "unable to create hardware UD QP for GSI: %pe\n", + qp); return; } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index d456e4fde3e1..fc1e86f6c409 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -13,6 +13,7 @@ #include <linux/dma-mapping.h> #include <linux/slab.h> #include <linux/bitmap.h> +#include <linux/log2.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> @@ -883,6 +884,51 @@ static void fill_esw_mgr_reg_c0(struct mlx5_core_dev *mdev, resp->reg_c0.mask = mlx5_eswitch_get_vport_metadata_mask(); } +/* + * Calculate maximum SQ overhead across all QP types. + * Other QP types (REG_UMR, UC, RC, UD/SMI/GSI, XRC_TGT) + * have smaller overhead than the types calculated below, + * so they are implicitly included. + */ +static u32 mlx5_ib_calc_max_sq_overhead(void) +{ + u32 max_overhead_xrc, overhead_ud_lso, a, b; + + /* XRC_INI */ + max_overhead_xrc = sizeof(struct mlx5_wqe_xrc_seg); + max_overhead_xrc += sizeof(struct mlx5_wqe_ctrl_seg); + a = sizeof(struct mlx5_wqe_atomic_seg) + + sizeof(struct mlx5_wqe_raddr_seg); + b = sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg) + + MLX5_IB_SQ_UMR_INLINE_THRESHOLD / MLX5_IB_UMR_OCTOWORD; + max_overhead_xrc += max(a, b); + + /* UD with LSO */ + overhead_ud_lso = sizeof(struct mlx5_wqe_ctrl_seg); + overhead_ud_lso += sizeof(struct mlx5_wqe_eth_pad); + overhead_ud_lso += sizeof(struct mlx5_wqe_eth_seg); + overhead_ud_lso += sizeof(struct mlx5_wqe_datagram_seg); + + return max(max_overhead_xrc, overhead_ud_lso); +} + +static u32 mlx5_ib_calc_max_qp_wr(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + u32 max_wqe_bb_units = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + u32 max_wqe_size; + /* max QP overhead + 1 SGE, no inline, no special features */ + max_wqe_size = mlx5_ib_calc_max_sq_overhead() + + sizeof(struct mlx5_wqe_data_seg); + + max_wqe_size = roundup_pow_of_two(max_wqe_size); + + max_wqe_size = ALIGN(max_wqe_size, MLX5_SEND_WQE_BB); + + return (max_wqe_bb_units * MLX5_SEND_WQE_BB) / max_wqe_size; +} + static int mlx5_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *uhw) @@ -1041,7 +1087,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->max_mr_size = ~0ull; props->page_size_cap = ~(min_page_size - 1); props->max_qp = 1 << MLX5_CAP_GEN(mdev, log_max_qp); - props->max_qp_wr = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + props->max_qp_wr = mlx5_ib_calc_max_qp_wr(dev); max_rq_sg = MLX5_CAP_GEN(mdev, max_wqe_sz_rq) / sizeof(struct mlx5_wqe_data_seg); max_sq_desc = min_t(int, MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512); @@ -1793,7 +1839,8 @@ static void deallocate_uars(struct mlx5_ib_dev *dev, } static int mlx5_ib_enable_lb_mp(struct mlx5_core_dev *master, - struct mlx5_core_dev *slave) + struct mlx5_core_dev *slave, + struct mlx5_ib_lb_state *lb_state) { int err; @@ -1805,6 +1852,7 @@ static int mlx5_ib_enable_lb_mp(struct mlx5_core_dev *master, if (err) goto out; + lb_state->force_enable = true; return 0; out: @@ -1813,16 +1861,22 @@ out: } static void mlx5_ib_disable_lb_mp(struct mlx5_core_dev *master, - struct mlx5_core_dev *slave) + 
struct mlx5_core_dev *slave, + struct mlx5_ib_lb_state *lb_state) { mlx5_nic_vport_update_local_lb(slave, false); mlx5_nic_vport_update_local_lb(master, false); + + lb_state->force_enable = false; } int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp) { int err = 0; + if (dev->lb.force_enable) + return 0; + mutex_lock(&dev->lb.mutex); if (td) dev->lb.user_td++; @@ -1844,6 +1898,9 @@ int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp) void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp) { + if (dev->lb.force_enable) + return; + mutex_lock(&dev->lb.mutex); if (td) dev->lb.user_td--; @@ -2994,14 +3051,16 @@ int mlx5_ib_dev_res_cq_init(struct mlx5_ib_dev *dev) pd = ib_alloc_pd(ibdev, 0); if (IS_ERR(pd)) { ret = PTR_ERR(pd); - mlx5_ib_err(dev, "Couldn't allocate PD for res init, err=%d\n", ret); + mlx5_ib_err(dev, "Couldn't allocate PD for res init, err=%pe\n", + pd); goto unlock; } cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr); if (IS_ERR(cq)) { ret = PTR_ERR(cq); - mlx5_ib_err(dev, "Couldn't create CQ for res init, err=%d\n", ret); + mlx5_ib_err(dev, "Couldn't create CQ for res init, err=%pe\n", + cq); ib_dealloc_pd(pd); goto unlock; } @@ -3045,7 +3104,9 @@ int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev) s0 = ib_create_srq(devr->p0, &attr); if (IS_ERR(s0)) { ret = PTR_ERR(s0); - mlx5_ib_err(dev, "Couldn't create SRQ 0 for res init, err=%d\n", ret); + mlx5_ib_err(dev, + "Couldn't create SRQ 0 for res init, err=%pe\n", + s0); goto unlock; } @@ -3057,7 +3118,9 @@ int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev) s1 = ib_create_srq(devr->p0, &attr); if (IS_ERR(s1)) { ret = PTR_ERR(s1); - mlx5_ib_err(dev, "Couldn't create SRQ 1 for res init, err=%d\n", ret); + mlx5_ib_err(dev, + "Couldn't create SRQ 1 for res init, err=%pe\n", + s1); ib_destroy_srq(s0); } @@ -3118,6 +3181,7 @@ mlx5_ib_create_data_direct_resources(struct mlx5_ib_dev *dev) { int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); struct mlx5_core_dev *mdev = dev->mdev; + bool ro_supp = false; void *mkc; u32 mkey; u32 pdn; @@ -3146,14 +3210,37 @@ mlx5_ib_create_data_direct_resources(struct mlx5_ib_dev *dev) MLX5_SET(mkc, mkc, length64, 1); MLX5_SET(mkc, mkc, qpn, 0xffffff); err = mlx5_core_create_mkey(mdev, &mkey, in, inlen); - kvfree(in); if (err) - goto err; + goto err_mkey; dev->ddr.mkey = mkey; dev->ddr.pdn = pdn; + + /* create another mkey with RO support */ + if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write)) { + MLX5_SET(mkc, mkc, relaxed_ordering_write, 1); + ro_supp = true; + } + + if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read)) { + MLX5_SET(mkc, mkc, relaxed_ordering_read, 1); + ro_supp = true; + } + + if (ro_supp) { + err = mlx5_core_create_mkey(mdev, &mkey, in, inlen); + /* RO is defined as best effort */ + if (!err) { + dev->ddr.mkey_ro = mkey; + dev->ddr.mkey_ro_valid = true; + } + } + + kvfree(in); return 0; +err_mkey: + kvfree(in); err: mlx5_core_dealloc_pd(mdev, pdn); return err; @@ -3162,6 +3249,10 @@ err: static void mlx5_ib_free_data_direct_resources(struct mlx5_ib_dev *dev) { + + if (dev->ddr.mkey_ro_valid) + mlx5_core_destroy_mkey(dev->mdev, dev->ddr.mkey_ro); + mlx5_core_destroy_mkey(dev->mdev, dev->ddr.mkey); mlx5_core_dealloc_pd(dev->mdev, dev->ddr.pdn); } @@ -3523,7 +3614,7 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev, lockdep_assert_held(&mlx5_ib_multiport_mutex); - mlx5_ib_disable_lb_mp(ibdev->mdev, mpi->mdev); + mlx5_ib_disable_lb_mp(ibdev->mdev, mpi->mdev, &ibdev->lb); mlx5_core_mp_event_replay(ibdev->mdev, 
MLX5_DRIVER_EVENT_AFFILIATION_REMOVED, @@ -3620,7 +3711,7 @@ static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev, MLX5_DRIVER_EVENT_AFFILIATION_DONE, &key); - err = mlx5_ib_enable_lb_mp(ibdev->mdev, mpi->mdev); + err = mlx5_ib_enable_lb_mp(ibdev->mdev, mpi->mdev, &ibdev->lb); if (err) goto unbind; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 7ffc7ee92cf0..09d82d5f95e3 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -854,6 +854,8 @@ struct mlx5_ib_port_resources { struct mlx5_data_direct_resources { u32 pdn; u32 mkey; + u32 mkey_ro; + u8 mkey_ro_valid :1; }; struct mlx5_ib_resources { @@ -1109,6 +1111,7 @@ struct mlx5_ib_lb_state { u32 user_td; int qps; bool enabled; + bool force_enable; }; struct mlx5_ib_pf_eq { @@ -1802,6 +1805,10 @@ mlx5_umem_mkc_find_best_pgsz(struct mlx5_ib_dev *dev, struct ib_umem *umem, bitmap = GENMASK_ULL(max_log_entity_size_cap, min_log_entity_size_cap); + /* In KSM mode HW requires IOVA and mkey's page size to be aligned */ + if (access_mode == MLX5_MKC_ACCESS_MODE_KSM && iova) + bitmap &= GENMASK_ULL(__ffs64(iova), 0); + return ib_umem_find_best_pgsz(umem, bitmap, iova); } diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 1317f2cb38a4..325fa04cbe8a 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1652,8 +1652,7 @@ reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device, fd, access_flags); if (IS_ERR(umem_dmabuf)) { - mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n", - PTR_ERR(umem_dmabuf)); + mlx5_ib_dbg(dev, "umem_dmabuf get failed (%pe)\n", umem_dmabuf); return ERR_CAST(umem_dmabuf); } @@ -1717,11 +1716,11 @@ reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset, goto end; } - /* The device's 'data direct mkey' was created without RO flags to - * simplify things and allow for a single mkey per device. - * Since RO is not a must, mask it out accordingly. + /* If no device's 'data direct mkey' with RO flags exists + * mask it out accordingly. */ - access_flags &= ~IB_ACCESS_RELAXED_ORDERING; + if (!dev->ddr.mkey_ro_valid) + access_flags &= ~IB_ACCESS_RELAXED_ORDERING; crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev, offset, length, virt_addr, fd, access_flags, MLX5_MKC_ACCESS_MODE_KSM, diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index 7ef35cddce81..4e562e0dd9e1 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -761,7 +761,11 @@ _mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd, if (dd) { cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter)); - cur_ksm->key = cpu_to_be32(dev->ddr.mkey); + if (mr->access_flags & IB_ACCESS_RELAXED_ORDERING && + dev->ddr.mkey_ro_valid) + cur_ksm->key = cpu_to_be32(dev->ddr.mkey_ro); + else + cur_ksm->key = cpu_to_be32(dev->ddr.mkey); if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) { cur_ksm->va = 0; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index e825e2ef7966..134a79eecfcb 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -492,7 +492,7 @@ static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, { u32 i, offset, max_scan, qpn; struct rvt_qpn_map *map; - u32 ret; + int ret; u32 max_qpn = exclude_prefix == RVT_AIP_QP_PREFIX ? 
RVT_AIP_QPN_MAX : RVT_QPN_MAX; @@ -510,7 +510,8 @@ static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, else qpt->flags |= n; spin_unlock(&qpt->lock); - goto bail; + + return ret; } qpn = qpt->last + qpt->incr; @@ -530,7 +531,8 @@ static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, if (!test_and_set_bit(offset, map->page)) { qpt->last = qpn; ret = qpn; - goto bail; + + return ret; } offset += qpt->incr; /* @@ -565,10 +567,7 @@ static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, qpn = mk_qpn(qpt, map, offset); } - ret = -ENOMEM; - -bail: - return ret; + return -ENOMEM; } /** diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index 6f8f353e9583..f522820b950c 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -132,8 +132,12 @@ static void do_task(struct rxe_task *task) * yield the cpu and reschedule the task */ if (!ret) { - task->state = TASK_STATE_IDLE; - resched = 1; + if (task->state != TASK_STATE_DRAINING) { + task->state = TASK_STATE_IDLE; + resched = 1; + } else { + cont = 1; + } goto exit; } diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index 35c3bde0d00a..efa2f097b582 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -769,7 +769,7 @@ int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, struct siw_wqe *wqe = tx_wqe(qp); unsigned long flags; - int rv = 0; + int rv = 0, imm_err = 0; if (wr && !rdma_is_kernel_res(&qp->base_qp.res)) { siw_dbg_qp(qp, "wr must be empty for user mapped sq\n"); @@ -955,9 +955,17 @@ int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, * Send directly if SQ processing is not in progress. * Eventual immediate errors (rv < 0) do not affect the involved * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ - * processing, if new work is already pending. But rv must be passed - * to caller. + * processing, if new work is already pending. But rv and pointer + * to failed work request must be passed to caller. */ + if (unlikely(rv < 0)) { + /* + * Immediate error + */ + siw_dbg_qp(qp, "Immediate error %d\n", rv); + imm_err = rv; + *bad_wr = wr; + } if (wqe->wr_status != SIW_WR_IDLE) { spin_unlock_irqrestore(&qp->sq_lock, flags); goto skip_direct_sending; @@ -982,15 +990,10 @@ skip_direct_sending: up_read(&qp->state_lock); - if (rv >= 0) - return 0; - /* - * Immediate error - */ - siw_dbg_qp(qp, "error %d\n", rv); + if (unlikely(imm_err)) + return imm_err; - *bad_wr = wr; - return rv; + return (rv >= 0) ? 0 : rv; } /* diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 7acafc5c0e09..5b4d76e97437 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -351,26 +351,27 @@ static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr, } /* - * Find the master net_device on top of the given net_device. + * Find the L2 master net_device on top of the given net_device. * @dev: base IPoIB net_device * - * Returns the master net_device with a reference held, or the same net_device - * if no master exists. + * Returns the L2 master net_device with reference held if the L2 master + * exists (such as bond netdevice), or returns same netdev with reference + * held when master does not exist or when L3 master (such as VRF netdev). 
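+ * The caller is expected to release the returned reference with dev_put().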
*/ static struct net_device *ipoib_get_master_net_dev(struct net_device *dev) { struct net_device *master; rcu_read_lock(); + master = netdev_master_upper_dev_get_rcu(dev); + if (!master || netif_is_l3_master(master)) + master = dev; + dev_hold(master); rcu_read_unlock(); - if (master) - return master; - - dev_hold(dev); - return dev; + return master; } struct ipoib_walk_data { @@ -522,7 +523,7 @@ static struct net_device *ipoib_get_net_dev_by_params( if (ret) return NULL; - /* See if we can find a unique device matching the L2 parameters */ + /* See if we can find a unique device matching the pkey and GID */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, NULL, &net_dev); @@ -535,7 +536,7 @@ static struct net_device *ipoib_get_net_dev_by_params( dev_put(net_dev); - /* Couldn't find a unique device with L2 parameters only. Use L3 + /* Couldn't find a unique device with pkey and GID only. Use L3 * address to uniquely match the net device */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, addr, &net_dev); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 5dfb4644446b..71269446353d 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -667,9 +667,9 @@ static int srpt_refresh_port(struct srpt_port *sport) srpt_mad_recv_handler, sport, 0); if (IS_ERR(mad_agent)) { - pr_err("%s-%d: MAD agent registration failed (%ld). Note: this is expected if SR-IOV is enabled.\n", + pr_err("%s-%d: MAD agent registration failed (%pe). Note: this is expected if SR-IOV is enabled.\n", dev_name(&sport->sdev->device->dev), sport->port, - PTR_ERR(mad_agent)); + mad_agent); sport->mad_agent = NULL; memset(&port_modify, 0, sizeof(port_modify)); port_modify.clr_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP; @@ -1865,8 +1865,8 @@ retry: IB_POLL_WORKQUEUE); if (IS_ERR(ch->cq)) { ret = PTR_ERR(ch->cq); - pr_err("failed to create CQ cqe= %d ret= %d\n", - ch->rq_size + sq_size, ret); + pr_err("failed to create CQ cqe= %d ret= %pe\n", + ch->rq_size + sq_size, ch->cq); goto out; } ch->cq_size = ch->rq_size + sq_size; @@ -3132,7 +3132,7 @@ static int srpt_alloc_srq(struct srpt_device *sdev) WARN_ON_ONCE(sdev->srq); srq = ib_create_srq(sdev->pd, &srq_attr); if (IS_ERR(srq)) { - pr_debug("ib_create_srq() failed: %ld\n", PTR_ERR(srq)); + pr_debug("ib_create_srq() failed: %pe\n", srq); return PTR_ERR(srq); } @@ -3236,8 +3236,7 @@ static int srpt_add_one(struct ib_device *device) if (rdma_port_get_link_layer(device, 1) == IB_LINK_LAYER_INFINIBAND) sdev->cm_id = ib_create_cm_id(device, srpt_cm_handler, sdev); if (IS_ERR(sdev->cm_id)) { - pr_info("ib_create_cm_id() failed: %ld\n", - PTR_ERR(sdev->cm_id)); + pr_info("ib_create_cm_id() failed: %pe\n", sdev->cm_id); ret = PTR_ERR(sdev->cm_id); sdev->cm_id = NULL; if (!rdma_cm_id) @@ -3687,8 +3686,7 @@ static struct rdma_cm_id *srpt_create_rdma_id(struct sockaddr *listen_addr) rdma_cm_id = rdma_create_id(&init_net, srpt_rdma_cm_handler, NULL, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(rdma_cm_id)) { - pr_err("RDMA/CM ID creation failed: %ld\n", - PTR_ERR(rdma_cm_id)); + pr_err("RDMA/CM ID creation failed: %pe\n", rdma_cm_id); goto out; } diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 95f63c5f6159..a698a2e7ce2a 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -792,6 +792,11 @@ struct amd_iommu { u32 flags; volatile u64 *cmd_sem; atomic64_t cmd_sem_val; + /* + * Track 
physical address to directly use it in build_completion_wait() + * and avoid adding any special checks and handling for kdump. + */ + u64 cmd_sem_paddr; #ifdef CONFIG_AMD_IOMMU_DEBUGFS /* DebugFS Info */ diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index ba9e582a8bbe..f2991c11867c 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -406,6 +406,9 @@ static void iommu_set_device_table(struct amd_iommu *iommu) BUG_ON(iommu->mmio_base == NULL); + if (is_kdump_kernel()) + return; + entry = iommu_virt_to_phys(dev_table); entry |= (dev_table_size >> 12) - 1; memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET, @@ -646,7 +649,10 @@ static inline int __init alloc_dev_table(struct amd_iommu_pci_seg *pci_seg) static inline void free_dev_table(struct amd_iommu_pci_seg *pci_seg) { - iommu_free_pages(pci_seg->dev_table); + if (is_kdump_kernel()) + memunmap((void *)pci_seg->dev_table); + else + iommu_free_pages(pci_seg->dev_table); pci_seg->dev_table = NULL; } @@ -710,6 +716,26 @@ static void __init free_alias_table(struct amd_iommu_pci_seg *pci_seg) pci_seg->alias_table = NULL; } +static inline void *iommu_memremap(unsigned long paddr, size_t size) +{ + phys_addr_t phys; + + if (!paddr) + return NULL; + + /* + * Obtain true physical address in kdump kernel when SME is enabled. + * Currently, previous kernel with SME enabled and kdump kernel + * with SME support disabled is not supported. + */ + phys = __sme_clr(paddr); + + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + return (__force void *)ioremap_encrypted(phys, size); + else + return memremap(phys, size, MEMREMAP_WB); +} + /* * Allocates the command buffer. This buffer is per AMD IOMMU. We can * write commands to that buffer later and the IOMMU will execute them @@ -795,11 +821,16 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu) BUG_ON(iommu->cmd_buf == NULL); - entry = iommu_virt_to_phys(iommu->cmd_buf); - entry |= MMIO_CMD_SIZE_512; - - memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, - &entry, sizeof(entry)); + if (!is_kdump_kernel()) { + /* + * Command buffer is re-used for kdump kernel and setting + * of MMIO register is not required. + */ + entry = iommu_virt_to_phys(iommu->cmd_buf); + entry |= MMIO_CMD_SIZE_512; + memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, + &entry, sizeof(entry)); + } amd_iommu_reset_cmd_buffer(iommu); } @@ -850,10 +881,15 @@ static void iommu_enable_event_buffer(struct amd_iommu *iommu) BUG_ON(iommu->evt_buf == NULL); - entry = iommu_virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; - - memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, - &entry, sizeof(entry)); + if (!is_kdump_kernel()) { + /* + * Event buffer is re-used for kdump kernel and setting + * of MMIO register is not required. 
+ */ + entry = iommu_virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; + memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, + &entry, sizeof(entry)); + } /* set head and tail to zero manually */ writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); @@ -942,8 +978,91 @@ err_out: static int __init alloc_cwwb_sem(struct amd_iommu *iommu) { iommu->cmd_sem = iommu_alloc_4k_pages(iommu, GFP_KERNEL, 1); + if (!iommu->cmd_sem) + return -ENOMEM; + iommu->cmd_sem_paddr = iommu_virt_to_phys((void *)iommu->cmd_sem); + return 0; +} + +static int __init remap_event_buffer(struct amd_iommu *iommu) +{ + u64 paddr; + + pr_info_once("Re-using event buffer from the previous kernel\n"); + paddr = readq(iommu->mmio_base + MMIO_EVT_BUF_OFFSET) & PM_ADDR_MASK; + iommu->evt_buf = iommu_memremap(paddr, EVT_BUFFER_SIZE); + + return iommu->evt_buf ? 0 : -ENOMEM; +} + +static int __init remap_command_buffer(struct amd_iommu *iommu) +{ + u64 paddr; + + pr_info_once("Re-using command buffer from the previous kernel\n"); + paddr = readq(iommu->mmio_base + MMIO_CMD_BUF_OFFSET) & PM_ADDR_MASK; + iommu->cmd_buf = iommu_memremap(paddr, CMD_BUFFER_SIZE); + + return iommu->cmd_buf ? 0 : -ENOMEM; +} + +static int __init remap_or_alloc_cwwb_sem(struct amd_iommu *iommu) +{ + u64 paddr; + + if (check_feature(FEATURE_SNP)) { + /* + * When SNP is enabled, the exclusion base register is used for the + * completion wait buffer (CWB) address. Read and re-use it. + */ + pr_info_once("Re-using CWB buffers from the previous kernel\n"); + paddr = readq(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET) & PM_ADDR_MASK; + iommu->cmd_sem = iommu_memremap(paddr, PAGE_SIZE); + if (!iommu->cmd_sem) + return -ENOMEM; + iommu->cmd_sem_paddr = paddr; + } else { + return alloc_cwwb_sem(iommu); + } + + return 0; +} + +static int __init alloc_iommu_buffers(struct amd_iommu *iommu) +{ + int ret; + + /* + * Reuse/Remap the previous kernel's allocated completion wait + * command and event buffers for kdump boot. + */ + if (is_kdump_kernel()) { + ret = remap_or_alloc_cwwb_sem(iommu); + if (ret) + return ret; + + ret = remap_command_buffer(iommu); + if (ret) + return ret; + + ret = remap_event_buffer(iommu); + if (ret) + return ret; + } else { + ret = alloc_cwwb_sem(iommu); + if (ret) + return ret; - return iommu->cmd_sem ? 
0 : -ENOMEM; + ret = alloc_command_buffer(iommu); + if (ret) + return ret; + + ret = alloc_event_buffer(iommu); + if (ret) + return ret; + } + + return 0; } static void __init free_cwwb_sem(struct amd_iommu *iommu) @@ -951,6 +1070,38 @@ static void __init free_cwwb_sem(struct amd_iommu *iommu) if (iommu->cmd_sem) iommu_free_pages((void *)iommu->cmd_sem); } +static void __init unmap_cwwb_sem(struct amd_iommu *iommu) +{ + if (iommu->cmd_sem) { + if (check_feature(FEATURE_SNP)) + memunmap((void *)iommu->cmd_sem); + else + iommu_free_pages((void *)iommu->cmd_sem); + } +} + +static void __init unmap_command_buffer(struct amd_iommu *iommu) +{ + memunmap((void *)iommu->cmd_buf); +} + +static void __init unmap_event_buffer(struct amd_iommu *iommu) +{ + memunmap(iommu->evt_buf); +} + +static void __init free_iommu_buffers(struct amd_iommu *iommu) +{ + if (is_kdump_kernel()) { + unmap_cwwb_sem(iommu); + unmap_command_buffer(iommu); + unmap_event_buffer(iommu); + } else { + free_cwwb_sem(iommu); + free_command_buffer(iommu); + free_event_buffer(iommu); + } +} static void iommu_enable_xt(struct amd_iommu *iommu) { @@ -982,15 +1133,12 @@ static void set_dte_bit(struct dev_table_entry *dte, u8 bit) dte->data[i] |= (1UL << _bit); } -static bool __copy_device_table(struct amd_iommu *iommu) +static bool __reuse_device_table(struct amd_iommu *iommu) { - u64 int_ctl, int_tab_len, entry = 0; struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; - struct dev_table_entry *old_devtb = NULL; - u32 lo, hi, devid, old_devtb_size; + u32 lo, hi, old_devtb_size; phys_addr_t old_devtb_phys; - u16 dom_id, dte_v, irq_v; - u64 tmp; + u64 entry; /* Each IOMMU use separate device table with the same size */ lo = readl(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET); @@ -1015,66 +1163,20 @@ static bool __copy_device_table(struct amd_iommu *iommu) pr_err("The address of old device table is above 4G, not trustworthy!\n"); return false; } - old_devtb = (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT) && is_kdump_kernel()) - ? (__force void *)ioremap_encrypted(old_devtb_phys, - pci_seg->dev_table_size) - : memremap(old_devtb_phys, pci_seg->dev_table_size, MEMREMAP_WB); - - if (!old_devtb) - return false; - pci_seg->old_dev_tbl_cpy = iommu_alloc_pages_sz( - GFP_KERNEL | GFP_DMA32, pci_seg->dev_table_size); + /* + * Re-use the previous kernel's device table for kdump. 
+ */ + pci_seg->old_dev_tbl_cpy = iommu_memremap(old_devtb_phys, pci_seg->dev_table_size); if (pci_seg->old_dev_tbl_cpy == NULL) { - pr_err("Failed to allocate memory for copying old device table!\n"); - memunmap(old_devtb); + pr_err("Failed to remap memory for reusing old device table!\n"); return false; } - for (devid = 0; devid <= pci_seg->last_bdf; ++devid) { - pci_seg->old_dev_tbl_cpy[devid] = old_devtb[devid]; - dom_id = old_devtb[devid].data[1] & DEV_DOMID_MASK; - dte_v = old_devtb[devid].data[0] & DTE_FLAG_V; - - if (dte_v && dom_id) { - pci_seg->old_dev_tbl_cpy[devid].data[0] = old_devtb[devid].data[0]; - pci_seg->old_dev_tbl_cpy[devid].data[1] = old_devtb[devid].data[1]; - /* Reserve the Domain IDs used by previous kernel */ - if (ida_alloc_range(&pdom_ids, dom_id, dom_id, GFP_ATOMIC) != dom_id) { - pr_err("Failed to reserve domain ID 0x%x\n", dom_id); - memunmap(old_devtb); - return false; - } - /* If gcr3 table existed, mask it out */ - if (old_devtb[devid].data[0] & DTE_FLAG_GV) { - tmp = (DTE_GCR3_30_15 | DTE_GCR3_51_31); - pci_seg->old_dev_tbl_cpy[devid].data[1] &= ~tmp; - tmp = (DTE_GCR3_14_12 | DTE_FLAG_GV); - pci_seg->old_dev_tbl_cpy[devid].data[0] &= ~tmp; - } - } - - irq_v = old_devtb[devid].data[2] & DTE_IRQ_REMAP_ENABLE; - int_ctl = old_devtb[devid].data[2] & DTE_IRQ_REMAP_INTCTL_MASK; - int_tab_len = old_devtb[devid].data[2] & DTE_INTTABLEN_MASK; - if (irq_v && (int_ctl || int_tab_len)) { - if ((int_ctl != DTE_IRQ_REMAP_INTCTL) || - (int_tab_len != DTE_INTTABLEN_512 && - int_tab_len != DTE_INTTABLEN_2K)) { - pr_err("Wrong old irq remapping flag: %#x\n", devid); - memunmap(old_devtb); - return false; - } - - pci_seg->old_dev_tbl_cpy[devid].data[2] = old_devtb[devid].data[2]; - } - } - memunmap(old_devtb); - return true; } -static bool copy_device_table(void) +static bool reuse_device_table(void) { struct amd_iommu *iommu; struct amd_iommu_pci_seg *pci_seg; @@ -1082,17 +1184,17 @@ static bool copy_device_table(void) if (!amd_iommu_pre_enabled) return false; - pr_warn("Translation is already enabled - trying to copy translation structures\n"); + pr_warn("Translation is already enabled - trying to reuse translation structures\n"); /* * All IOMMUs within PCI segment shares common device table. - * Hence copy device table only once per PCI segment. + * Hence reuse device table only once per PCI segment. */ for_each_pci_segment(pci_seg) { for_each_iommu(iommu) { if (pci_seg->id != iommu->pci_seg->id) continue; - if (!__copy_device_table(iommu)) + if (!__reuse_device_table(iommu)) return false; break; } @@ -1655,9 +1757,7 @@ static void __init free_sysfs(struct amd_iommu *iommu) static void __init free_iommu_one(struct amd_iommu *iommu) { free_sysfs(iommu); - free_cwwb_sem(iommu); - free_command_buffer(iommu); - free_event_buffer(iommu); + free_iommu_buffers(iommu); amd_iommu_free_ppr_log(iommu); free_ga_log(iommu); iommu_unmap_mmio_space(iommu); @@ -1821,14 +1921,9 @@ static int __init init_iommu_one_late(struct amd_iommu *iommu) { int ret; - if (alloc_cwwb_sem(iommu)) - return -ENOMEM; - - if (alloc_command_buffer(iommu)) - return -ENOMEM; - - if (alloc_event_buffer(iommu)) - return -ENOMEM; + ret = alloc_iommu_buffers(iommu); + if (ret) + return ret; iommu->int_enabled = false; @@ -2778,8 +2873,8 @@ static void early_enable_iommu(struct amd_iommu *iommu) * This function finally enables all IOMMUs found in the system after * they have been initialized. * - * Or if in kdump kernel and IOMMUs are all pre-enabled, try to copy - * the old content of device table entries. 
Not this case or copy failed, + * Or if in kdump kernel and IOMMUs are all pre-enabled, try to reuse + * the old content of device table entries. Not this case or reuse failed, * just continue as normal kernel does. */ static void early_enable_iommus(void) @@ -2787,18 +2882,25 @@ static void early_enable_iommus(void) struct amd_iommu *iommu; struct amd_iommu_pci_seg *pci_seg; - if (!copy_device_table()) { + if (!reuse_device_table()) { /* - * If come here because of failure in copying device table from old + * If come here because of failure in reusing device table from old * kernel with all IOMMUs enabled, print error message and try to * free allocated old_dev_tbl_cpy. */ - if (amd_iommu_pre_enabled) - pr_err("Failed to copy DEV table from previous kernel.\n"); + if (amd_iommu_pre_enabled) { + pr_err("Failed to reuse DEV table from previous kernel.\n"); + /* + * Bail out early if unable to remap/reuse DEV table from + * previous kernel if SNP enabled as IOMMU commands will + * time out without DEV table and cause kdump boot panic. + */ + BUG_ON(check_feature(FEATURE_SNP)); + } for_each_pci_segment(pci_seg) { if (pci_seg->old_dev_tbl_cpy != NULL) { - iommu_free_pages(pci_seg->old_dev_tbl_cpy); + memunmap((void *)pci_seg->old_dev_tbl_cpy); pci_seg->old_dev_tbl_cpy = NULL; } } @@ -2808,7 +2910,7 @@ static void early_enable_iommus(void) early_enable_iommu(iommu); } } else { - pr_info("Copied DEV table from previous kernel.\n"); + pr_info("Reused DEV table from previous kernel.\n"); for_each_pci_segment(pci_seg) { iommu_free_pages(pci_seg->dev_table); diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index eb348c63a8d0..2e1865daa1ce 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -14,6 +14,7 @@ #include <linux/pci-ats.h> #include <linux/bitmap.h> #include <linux/slab.h> +#include <linux/string_choices.h> #include <linux/debugfs.h> #include <linux/scatterlist.h> #include <linux/dma-map-ops.h> @@ -265,7 +266,7 @@ static inline int get_acpihid_device_id(struct device *dev, return -EINVAL; if (fw_bug) dev_err_once(dev, FW_BUG "No ACPI device matched UID, but %d device%s matched HID.\n", - hid_count, hid_count > 1 ? 
"s" : ""); + hid_count, str_plural(hid_count)); if (hid_count > 1) return -EINVAL; if (entry) @@ -1195,7 +1196,7 @@ static void build_completion_wait(struct iommu_cmd *cmd, struct amd_iommu *iommu, u64 data) { - u64 paddr = iommu_virt_to_phys((void *)iommu->cmd_sem); + u64 paddr = iommu->cmd_sem_paddr; memset(cmd, 0, sizeof(*cmd)); cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK; diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index 190f28d76615..95a4e62b8f63 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -122,6 +122,8 @@ #define DART_T8110_ERROR_ADDR_LO 0x170 #define DART_T8110_ERROR_ADDR_HI 0x174 +#define DART_T8110_ERROR_STREAMS 0x1c0 + #define DART_T8110_PROTECT 0x200 #define DART_T8110_UNPROTECT 0x204 #define DART_T8110_PROTECT_LOCK 0x208 @@ -133,6 +135,7 @@ #define DART_T8110_TCR 0x1000 #define DART_T8110_TCR_REMAP GENMASK(11, 8) #define DART_T8110_TCR_REMAP_EN BIT(7) +#define DART_T8110_TCR_FOUR_LEVEL BIT(3) #define DART_T8110_TCR_BYPASS_DAPF BIT(2) #define DART_T8110_TCR_BYPASS_DART BIT(1) #define DART_T8110_TCR_TRANSLATE_ENABLE BIT(0) @@ -166,22 +169,23 @@ struct apple_dart_hw { int max_sid_count; - u64 lock; - u64 lock_bit; + u32 lock; + u32 lock_bit; - u64 error; + u32 error; - u64 enable_streams; + u32 enable_streams; - u64 tcr; - u64 tcr_enabled; - u64 tcr_disabled; - u64 tcr_bypass; + u32 tcr; + u32 tcr_enabled; + u32 tcr_disabled; + u32 tcr_bypass; + u32 tcr_4level; - u64 ttbr; - u64 ttbr_valid; - u64 ttbr_addr_field_shift; - u64 ttbr_shift; + u32 ttbr; + u32 ttbr_valid; + u32 ttbr_addr_field_shift; + u32 ttbr_shift; int ttbr_count; }; @@ -217,6 +221,7 @@ struct apple_dart { u32 pgsize; u32 num_streams; u32 supports_bypass : 1; + u32 four_level : 1; struct iommu_group *sid2group[DART_MAX_STREAMS]; struct iommu_device iommu; @@ -305,13 +310,19 @@ static struct apple_dart_domain *to_dart_domain(struct iommu_domain *dom) } static void -apple_dart_hw_enable_translation(struct apple_dart_stream_map *stream_map) +apple_dart_hw_enable_translation(struct apple_dart_stream_map *stream_map, int levels) { struct apple_dart *dart = stream_map->dart; + u32 tcr = dart->hw->tcr_enabled; int sid; + if (levels == 4) + tcr |= dart->hw->tcr_4level; + + WARN_ON(levels != 3 && levels != 4); + WARN_ON(levels == 4 && !dart->four_level); for_each_set_bit(sid, stream_map->sidmap, dart->num_streams) - writel(dart->hw->tcr_enabled, dart->regs + DART_TCR(dart, sid)); + writel(tcr, dart->regs + DART_TCR(dart, sid)); } static void apple_dart_hw_disable_dma(struct apple_dart_stream_map *stream_map) @@ -569,7 +580,8 @@ apple_dart_setup_translation(struct apple_dart_domain *domain, for (; i < stream_map->dart->hw->ttbr_count; ++i) apple_dart_hw_clear_ttbr(stream_map, i); - apple_dart_hw_enable_translation(stream_map); + apple_dart_hw_enable_translation(stream_map, + pgtbl_cfg->apple_dart_cfg.n_levels); stream_map->dart->hw->invalidate_tlb(stream_map); } @@ -614,7 +626,7 @@ static int apple_dart_finalize_domain(struct apple_dart_domain *dart_domain, dart_domain->domain.pgsize_bitmap = pgtbl_cfg.pgsize_bitmap; dart_domain->domain.geometry.aperture_start = 0; dart_domain->domain.geometry.aperture_end = - (dma_addr_t)DMA_BIT_MASK(dart->ias); + (dma_addr_t)DMA_BIT_MASK(pgtbl_cfg.ias); dart_domain->domain.geometry.force_aperture = true; dart_domain->finalized = true; @@ -807,6 +819,8 @@ static int apple_dart_of_xlate(struct device *dev, if (cfg_dart) { if (cfg_dart->pgsize != dart->pgsize) return -EINVAL; + if (cfg_dart->ias != dart->ias) + return 
-EINVAL; } cfg->supports_bypass &= dart->supports_bypass; @@ -1077,6 +1091,9 @@ static irqreturn_t apple_dart_t8110_irq(int irq, void *dev) error, stream_idx, error_code, fault_name, addr); writel(error, dart->regs + DART_T8110_ERROR); + for (int i = 0; i < BITS_TO_U32(dart->num_streams); i++) + writel(U32_MAX, dart->regs + DART_T8110_ERROR_STREAMS + 4 * i); + return IRQ_HANDLED; } @@ -1137,6 +1154,7 @@ static int apple_dart_probe(struct platform_device *pdev) dart->ias = FIELD_GET(DART_T8110_PARAMS3_VA_WIDTH, dart_params[2]); dart->oas = FIELD_GET(DART_T8110_PARAMS3_PA_WIDTH, dart_params[2]); dart->num_streams = FIELD_GET(DART_T8110_PARAMS4_NUM_SIDS, dart_params[3]); + dart->four_level = dart->ias > 36; break; } @@ -1169,9 +1187,9 @@ static int apple_dart_probe(struct platform_device *pdev) dev_info( &pdev->dev, - "DART [pagesize %x, %d streams, bypass support: %d, bypass forced: %d] initialized\n", + "DART [pagesize %x, %d streams, bypass support: %d, bypass forced: %d, AS %d -> %d] initialized\n", dart->pgsize, dart->num_streams, dart->supports_bypass, - dart->pgsize > PAGE_SIZE); + dart->pgsize > PAGE_SIZE, dart->ias, dart->oas); return 0; err_sysfs_remove: @@ -1292,6 +1310,7 @@ static const struct apple_dart_hw apple_dart_hw_t8110 = { .tcr_enabled = DART_T8110_TCR_TRANSLATE_ENABLE, .tcr_disabled = 0, .tcr_bypass = DART_T8110_TCR_BYPASS_DAPF | DART_T8110_TCR_BYPASS_DART, + .tcr_4level = DART_T8110_TCR_FOUR_LEVEL, .ttbr = DART_T8110_TTBR, .ttbr_valid = DART_T8110_TTBR_VALID, diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index ea2ef53bd4fe..7944a3af4545 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -724,7 +724,12 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, struct device *dev static int dma_info_to_prot(enum dma_data_direction dir, bool coherent, unsigned long attrs) { - int prot = coherent ? IOMMU_CACHE : 0; + int prot; + + if (attrs & DMA_ATTR_MMIO) + prot = IOMMU_MMIO; + else + prot = coherent ? 
IOMMU_CACHE : 0; if (attrs & DMA_ATTR_PRIVILEGED) prot |= IOMMU_PRIV; @@ -1190,11 +1195,9 @@ static inline size_t iova_unaligned(struct iova_domain *iovad, phys_addr_t phys, return iova_offset(iovad, phys | size); } -dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) +dma_addr_t iommu_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs) { - phys_addr_t phys = page_to_phys(page) + offset; bool coherent = dev_is_dma_coherent(dev); int prot = dma_info_to_prot(dir, coherent, attrs); struct iommu_domain *domain = iommu_get_dma_domain(dev); @@ -1208,27 +1211,34 @@ dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, */ if (dev_use_swiotlb(dev, size, dir) && iova_unaligned(iovad, phys, size)) { + if (attrs & DMA_ATTR_MMIO) + return DMA_MAPPING_ERROR; + phys = iommu_dma_map_swiotlb(dev, phys, size, dir, attrs); if (phys == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; } - if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) arch_sync_dma_for_device(phys, size, dir); iova = __iommu_dma_map(dev, phys, size, prot, dma_mask); - if (iova == DMA_MAPPING_ERROR) + if (iova == DMA_MAPPING_ERROR && !(attrs & DMA_ATTR_MMIO)) swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); return iova; } -void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, +void iommu_dma_unmap_phys(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs) { - struct iommu_domain *domain = iommu_get_dma_domain(dev); phys_addr_t phys; - phys = iommu_iova_to_phys(domain, dma_handle); + if (attrs & DMA_ATTR_MMIO) { + __iommu_dma_unmap(dev, dma_handle, size); + return; + } + + phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); if (WARN_ON(!phys)) return; @@ -1341,7 +1351,7 @@ static void iommu_dma_unmap_sg_swiotlb(struct device *dev, struct scatterlist *s int i; for_each_sg(sg, s, nents, i) - iommu_dma_unmap_page(dev, sg_dma_address(s), + iommu_dma_unmap_phys(dev, sg_dma_address(s), sg_dma_len(s), dir, attrs); } @@ -1354,8 +1364,8 @@ static int iommu_dma_map_sg_swiotlb(struct device *dev, struct scatterlist *sg, sg_dma_mark_swiotlb(sg); for_each_sg(sg, s, nents, i) { - sg_dma_address(s) = iommu_dma_map_page(dev, sg_page(s), - s->offset, s->length, dir, attrs); + sg_dma_address(s) = iommu_dma_map_phys(dev, sg_phys(s), + s->length, dir, attrs); if (sg_dma_address(s) == DMA_MAPPING_ERROR) goto out_unmap; sg_dma_len(s) = s->length; @@ -1546,20 +1556,6 @@ void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, __iommu_dma_unmap(dev, start, end - start); } -dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - return __iommu_dma_map(dev, phys, size, - dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO, - dma_get_mask(dev)); -} - -void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - __iommu_dma_unmap(dev, handle, size); -} - static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr) { size_t alloc_size = PAGE_ALIGN(size); @@ -1838,12 +1834,13 @@ static int __dma_iova_link(struct device *dev, dma_addr_t addr, unsigned long attrs) { bool coherent = dev_is_dma_coherent(dev); + int prot = 
dma_info_to_prot(dir, coherent, attrs); - if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) arch_sync_dma_for_device(phys, size, dir); return iommu_map_nosync(iommu_get_dma_domain(dev), addr, phys, size, - dma_info_to_prot(dir, coherent, attrs), GFP_ATOMIC); + prot, GFP_ATOMIC); } static int iommu_dma_iova_bounce_and_link(struct device *dev, dma_addr_t addr, @@ -1949,9 +1946,13 @@ int dma_iova_link(struct device *dev, struct dma_iova_state *state, return -EIO; if (dev_use_swiotlb(dev, size, dir) && - iova_unaligned(iovad, phys, size)) + iova_unaligned(iovad, phys, size)) { + if (attrs & DMA_ATTR_MMIO) + return -EPERM; + return iommu_dma_iova_link_swiotlb(dev, state, phys, offset, size, dir, attrs); + } return __dma_iova_link(dev, state->addr + offset - iova_start_pad, phys - iova_start_pad, diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index affbf4a1558d..617fd81a80f0 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -62,8 +62,6 @@ static const struct iommu_regset iommu_regs_64[] = { IOMMU_REGSET_ENTRY(CAP), IOMMU_REGSET_ENTRY(ECAP), IOMMU_REGSET_ENTRY(RTADDR), - IOMMU_REGSET_ENTRY(CCMD), - IOMMU_REGSET_ENTRY(AFLOG), IOMMU_REGSET_ENTRY(PHMBASE), IOMMU_REGSET_ENTRY(PHMLIMIT), IOMMU_REGSET_ENTRY(IQH), @@ -435,8 +433,21 @@ static int domain_translation_struct_show(struct seq_file *m, } pgd &= VTD_PAGE_MASK; } else { /* legacy mode */ - pgd = context->lo & VTD_PAGE_MASK; - agaw = context->hi & 7; + u8 tt = (u8)(context->lo & GENMASK_ULL(3, 2)) >> 2; + + /* + * According to Translation Type(TT), + * get the page table pointer(SSPTPTR). + */ + switch (tt) { + case CONTEXT_TT_MULTI_LEVEL: + case CONTEXT_TT_DEV_IOTLB: + pgd = context->lo & VTD_PAGE_MASK; + agaw = context->hi & 7; + break; + default: + goto iommu_unlock; + } } seq_printf(m, "Device %04x:%02x:%02x.%x ", @@ -648,17 +659,11 @@ DEFINE_SHOW_ATTRIBUTE(ir_translation_struct); static void latency_show_one(struct seq_file *m, struct intel_iommu *iommu, struct dmar_drhd_unit *drhd) { - int ret; - seq_printf(m, "IOMMU: %s Register Base Address: %llx\n", iommu->name, drhd->reg_base_addr); - ret = dmar_latency_snapshot(iommu, debug_buf, DEBUG_BUFFER_SIZE); - if (ret < 0) - seq_puts(m, "Failed to get latency snapshot"); - else - seq_puts(m, debug_buf); - seq_puts(m, "\n"); + dmar_latency_snapshot(iommu, debug_buf, DEBUG_BUFFER_SIZE); + seq_printf(m, "%s\n", debug_buf); } static int latency_show(struct seq_file *m, void *v) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index dff2d895b8ab..e236c7ec221f 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3817,7 +3817,7 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) } if (info->ats_supported && ecap_prs(iommu->ecap) && - pci_pri_supported(pdev)) + ecap_pds(iommu->ecap) && pci_pri_supported(pdev)) info->pri_supported = 1; } } diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index d09b92871659..3056583d7f56 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -77,7 +77,6 @@ #define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data register */ #define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr register */ #define DMAR_FEUADDR_REG 0x44 /* Upper address register */ -#define DMAR_AFLOG_REG 0x58 /* Advanced Fault control */ #define DMAR_PMEN_REG 0x64 /* Enable Protected Memory Region */ #define DMAR_PLMBASE_REG 0x68 /* PMRR Low addr */ 
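Stepping back to the dma-iommu.c hunks above for a moment: the net effect of the new DMA_ATTR_MMIO handling is that an MMIO target is mapped with IOMMU_MMIO, never receives CPU cache maintenance, and is never bounced through swiotlb (unaligned MMIO requests fail instead), which is what allows the dedicated iommu_dma_map_resource()/iommu_dma_unmap_resource() wrappers to be removed. The helper below is a condensed, illustrative sketch of that flag selection, not the literal upstream function; the direction handling follows the long-standing dma_info_to_prot() convention, and the constants are the ones already visible in the hunks.

#include <linux/dma-mapping.h>
#include <linux/iommu.h>

/*
 * Sketch only: how the IOMMU protection bits are derived once
 * DMA_ATTR_MMIO is in the picture. MMIO targets get IOMMU_MMIO and no
 * cacheability hint; DMA_ATTR_PRIVILEGED still stacks IOMMU_PRIV on top.
 */
static int sketch_info_to_prot(enum dma_data_direction dir, bool coherent,
			       unsigned long attrs)
{
	int prot;

	if (attrs & DMA_ATTR_MMIO)
		prot = IOMMU_MMIO;
	else
		prot = coherent ? IOMMU_CACHE : 0;

	if (attrs & DMA_ATTR_PRIVILEGED)
		prot |= IOMMU_PRIV;

	switch (dir) {
	case DMA_BIDIRECTIONAL:
		return prot | IOMMU_READ | IOMMU_WRITE;
	case DMA_TO_DEVICE:
		return prot | IOMMU_READ;
	case DMA_FROM_DEVICE:
		return prot | IOMMU_WRITE;
	default:
		return 0;
	}
}

Callers that used to go through the resource-mapping entry points can reach the same behaviour via iommu_dma_map_phys() with DMA_ATTR_MMIO set, matching the signature introduced in the hunk above.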
#define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */ @@ -173,8 +172,6 @@ #define cap_pgsel_inv(c) (((c) >> 39) & 1) #define cap_super_page_val(c) (((c) >> 34) & 0xf) -#define cap_super_offset(c) (((find_first_bit(&cap_super_page_val(c), 4)) \ - * OFFSET_STRIDE) + 21) #define cap_fault_reg_offset(c) ((((c) >> 24) & 0x3ff) * 16) #define cap_max_fault_reg_offset(c) \ @@ -462,7 +459,6 @@ enum { #define QI_PGRP_PASID(pasid) (((u64)(pasid)) << 32) /* Page group response descriptor QW1 */ -#define QI_PGRP_LPIG(x) (((u64)(x)) << 2) #define QI_PGRP_IDX(idx) (((u64)(idx)) << 3) @@ -541,7 +537,8 @@ enum { #define pasid_supported(iommu) (sm_supported(iommu) && \ ecap_pasid((iommu)->ecap)) #define ssads_supported(iommu) (sm_supported(iommu) && \ - ecap_slads((iommu)->ecap)) + ecap_slads((iommu)->ecap) && \ + ecap_smpwc(iommu->ecap)) #define nested_supported(iommu) (sm_supported(iommu) && \ ecap_nest((iommu)->ecap)) diff --git a/drivers/iommu/intel/perf.c b/drivers/iommu/intel/perf.c index adc4de6bbd88..dceeadc3ee7c 100644 --- a/drivers/iommu/intel/perf.c +++ b/drivers/iommu/intel/perf.c @@ -113,7 +113,7 @@ static char *latency_type_names[] = { " svm_prq" }; -int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) +void dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) { struct latency_statistic *lstat = iommu->perf_statistic; unsigned long flags; @@ -122,7 +122,7 @@ int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) memset(str, 0, size); for (i = 0; i < COUNTS_NUM; i++) - bytes += snprintf(str + bytes, size - bytes, + bytes += scnprintf(str + bytes, size - bytes, "%s", latency_counter_names[i]); spin_lock_irqsave(&latency_lock, flags); @@ -130,7 +130,7 @@ int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) if (!dmar_latency_enabled(iommu, i)) continue; - bytes += snprintf(str + bytes, size - bytes, + bytes += scnprintf(str + bytes, size - bytes, "\n%s", latency_type_names[i]); for (j = 0; j < COUNTS_NUM; j++) { @@ -156,11 +156,9 @@ int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) break; } - bytes += snprintf(str + bytes, size - bytes, + bytes += scnprintf(str + bytes, size - bytes, "%12lld", val); } } spin_unlock_irqrestore(&latency_lock, flags); - - return bytes; } diff --git a/drivers/iommu/intel/perf.h b/drivers/iommu/intel/perf.h index df9a36942d64..1d4baad7e852 100644 --- a/drivers/iommu/intel/perf.h +++ b/drivers/iommu/intel/perf.h @@ -40,7 +40,7 @@ void dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type); bool dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type); void dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 latency); -int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size); +void dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size); #else static inline int dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type) @@ -64,9 +64,8 @@ dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 laten { } -static inline int +static inline void dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) { - return 0; } #endif /* CONFIG_DMAR_PERF */ diff --git a/drivers/iommu/intel/prq.c b/drivers/iommu/intel/prq.c index 52570e42a14c..ff63c228e6e1 100644 --- a/drivers/iommu/intel/prq.c +++ b/drivers/iommu/intel/prq.c @@ -151,8 +151,7 @@ static void handle_bad_prq_event(struct intel_iommu *iommu, QI_PGRP_PASID_P(req->pasid_present) 
| QI_PGRP_RESP_CODE(result) | QI_PGRP_RESP_TYPE; - desc.qw1 = QI_PGRP_IDX(req->prg_index) | - QI_PGRP_LPIG(req->lpig); + desc.qw1 = QI_PGRP_IDX(req->prg_index); qi_submit_sync(iommu, &desc, 1, 0); } @@ -379,19 +378,17 @@ void intel_iommu_page_response(struct device *dev, struct iopf_fault *evt, struct iommu_fault_page_request *prm; struct qi_desc desc; bool pasid_present; - bool last_page; u16 sid; prm = &evt->fault.prm; sid = PCI_DEVID(bus, devfn); pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) | QI_PGRP_PASID_P(pasid_present) | QI_PGRP_RESP_CODE(msg->code) | QI_PGRP_RESP_TYPE; - desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page); + desc.qw1 = QI_PGRP_IDX(prm->grpid); desc.qw2 = 0; desc.qw3 = 0; diff --git a/drivers/iommu/io-pgtable-dart.c b/drivers/iommu/io-pgtable-dart.c index 679bda104797..54d287cc0dd1 100644 --- a/drivers/iommu/io-pgtable-dart.c +++ b/drivers/iommu/io-pgtable-dart.c @@ -27,8 +27,9 @@ #define DART1_MAX_ADDR_BITS 36 -#define DART_MAX_TABLES 4 -#define DART_LEVELS 2 +#define DART_MAX_TABLE_BITS 2 +#define DART_MAX_TABLES BIT(DART_MAX_TABLE_BITS) +#define DART_MAX_LEVELS 4 /* Includes TTBR level */ /* Struct accessors */ #define io_pgtable_to_data(x) \ @@ -68,6 +69,7 @@ struct dart_io_pgtable { struct io_pgtable iop; + int levels; int tbl_bits; int bits_per_level; @@ -156,44 +158,45 @@ static dart_iopte dart_install_table(dart_iopte *table, return old; } -static int dart_get_table(struct dart_io_pgtable *data, unsigned long iova) +static int dart_get_index(struct dart_io_pgtable *data, unsigned long iova, int level) { - return (iova >> (3 * data->bits_per_level + ilog2(sizeof(dart_iopte)))) & - ((1 << data->tbl_bits) - 1); + return (iova >> (level * data->bits_per_level + ilog2(sizeof(dart_iopte)))) & + ((1 << data->bits_per_level) - 1); } -static int dart_get_l1_index(struct dart_io_pgtable *data, unsigned long iova) -{ - - return (iova >> (2 * data->bits_per_level + ilog2(sizeof(dart_iopte)))) & - ((1 << data->bits_per_level) - 1); -} - -static int dart_get_l2_index(struct dart_io_pgtable *data, unsigned long iova) +static int dart_get_last_index(struct dart_io_pgtable *data, unsigned long iova) { return (iova >> (data->bits_per_level + ilog2(sizeof(dart_iopte)))) & ((1 << data->bits_per_level) - 1); } -static dart_iopte *dart_get_l2(struct dart_io_pgtable *data, unsigned long iova) +static dart_iopte *dart_get_last(struct dart_io_pgtable *data, unsigned long iova) { dart_iopte pte, *ptep; - int tbl = dart_get_table(data, iova); + int level = data->levels; + int tbl = dart_get_index(data, iova, level); + + if (tbl >= (1 << data->tbl_bits)) + return NULL; ptep = data->pgd[tbl]; if (!ptep) return NULL; - ptep += dart_get_l1_index(data, iova); - pte = READ_ONCE(*ptep); + while (--level > 1) { + ptep += dart_get_index(data, iova, level); + pte = READ_ONCE(*ptep); - /* Valid entry? */ - if (!pte) - return NULL; + /* Valid entry? 
*/ + if (!pte) + return NULL; - /* Deref to get level 2 table */ - return iopte_deref(pte, data); + /* Deref to get next level table */ + ptep = iopte_deref(pte, data); + } + + return ptep; } static dart_iopte dart_prot_to_pte(struct dart_io_pgtable *data, @@ -230,6 +233,7 @@ static int dart_map_pages(struct io_pgtable_ops *ops, unsigned long iova, int ret = 0, tbl, num_entries, max_entries, map_idx_start; dart_iopte pte, *cptep, *ptep; dart_iopte prot; + int level = data->levels; if (WARN_ON(pgsize != cfg->pgsize_bitmap)) return -EINVAL; @@ -240,31 +244,36 @@ static int dart_map_pages(struct io_pgtable_ops *ops, unsigned long iova, if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) return -EINVAL; - tbl = dart_get_table(data, iova); + tbl = dart_get_index(data, iova, level); + + if (tbl >= (1 << data->tbl_bits)) + return -ENOMEM; ptep = data->pgd[tbl]; - ptep += dart_get_l1_index(data, iova); - pte = READ_ONCE(*ptep); + while (--level > 1) { + ptep += dart_get_index(data, iova, level); + pte = READ_ONCE(*ptep); - /* no L2 table present */ - if (!pte) { - cptep = iommu_alloc_pages_sz(gfp, tblsz); - if (!cptep) - return -ENOMEM; + /* no table present */ + if (!pte) { + cptep = iommu_alloc_pages_sz(gfp, tblsz); + if (!cptep) + return -ENOMEM; - pte = dart_install_table(cptep, ptep, 0, data); - if (pte) - iommu_free_pages(cptep); + pte = dart_install_table(cptep, ptep, 0, data); + if (pte) + iommu_free_pages(cptep); - /* L2 table is present (now) */ - pte = READ_ONCE(*ptep); - } + /* L2 table is present (now) */ + pte = READ_ONCE(*ptep); + } - ptep = iopte_deref(pte, data); + ptep = iopte_deref(pte, data); + } /* install a leaf entries into L2 table */ prot = dart_prot_to_pte(data, iommu_prot); - map_idx_start = dart_get_l2_index(data, iova); + map_idx_start = dart_get_last_index(data, iova); max_entries = DART_PTES_PER_TABLE(data) - map_idx_start; num_entries = min_t(int, pgcount, max_entries); ptep += map_idx_start; @@ -293,13 +302,13 @@ static size_t dart_unmap_pages(struct io_pgtable_ops *ops, unsigned long iova, if (WARN_ON(pgsize != cfg->pgsize_bitmap || !pgcount)) return 0; - ptep = dart_get_l2(data, iova); + ptep = dart_get_last(data, iova); /* Valid L2 IOPTE pointer? */ if (WARN_ON(!ptep)) return 0; - unmap_idx_start = dart_get_l2_index(data, iova); + unmap_idx_start = dart_get_last_index(data, iova); ptep += unmap_idx_start; max_entries = DART_PTES_PER_TABLE(data) - unmap_idx_start; @@ -330,13 +339,13 @@ static phys_addr_t dart_iova_to_phys(struct io_pgtable_ops *ops, struct dart_io_pgtable *data = io_pgtable_ops_to_data(ops); dart_iopte pte, *ptep; - ptep = dart_get_l2(data, iova); + ptep = dart_get_last(data, iova); /* Valid L2 IOPTE pointer? */ if (!ptep) return 0; - ptep += dart_get_l2_index(data, iova); + ptep += dart_get_last_index(data, iova); pte = READ_ONCE(*ptep); /* Found translation */ @@ -353,21 +362,37 @@ static struct dart_io_pgtable * dart_alloc_pgtable(struct io_pgtable_cfg *cfg) { struct dart_io_pgtable *data; - int tbl_bits, bits_per_level, va_bits, pg_shift; + int levels, max_tbl_bits, tbl_bits, bits_per_level, va_bits, pg_shift; + + /* + * Old 4K page DARTs can use up to 4 top-level tables. + * Newer ones only ever use a maximum of 1. 
+ */ + if (cfg->pgsize_bitmap == SZ_4K) + max_tbl_bits = DART_MAX_TABLE_BITS; + else + max_tbl_bits = 0; pg_shift = __ffs(cfg->pgsize_bitmap); bits_per_level = pg_shift - ilog2(sizeof(dart_iopte)); va_bits = cfg->ias - pg_shift; - tbl_bits = max_t(int, 0, va_bits - (bits_per_level * DART_LEVELS)); - if ((1 << tbl_bits) > DART_MAX_TABLES) + levels = max_t(int, 2, (va_bits - max_tbl_bits + bits_per_level - 1) / bits_per_level); + + if (levels > (DART_MAX_LEVELS - 1)) + return NULL; + + tbl_bits = max_t(int, 0, va_bits - (bits_per_level * levels)); + + if (tbl_bits > max_tbl_bits) return NULL; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return NULL; + data->levels = levels + 1; /* Table level counts as one level */ data->tbl_bits = tbl_bits; data->bits_per_level = bits_per_level; @@ -403,6 +428,7 @@ apple_dart_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) return NULL; cfg->apple_dart_cfg.n_ttbrs = 1 << data->tbl_bits; + cfg->apple_dart_cfg.n_levels = data->levels; for (i = 0; i < cfg->apple_dart_cfg.n_ttbrs; ++i) { data->pgd[i] = @@ -422,24 +448,31 @@ out_free_data: return NULL; } -static void apple_dart_free_pgtable(struct io_pgtable *iop) +static void apple_dart_free_pgtables(struct dart_io_pgtable *data, dart_iopte *ptep, int level) { - struct dart_io_pgtable *data = io_pgtable_to_data(iop); - dart_iopte *ptep, *end; - int i; + dart_iopte *end; + dart_iopte *start = ptep; - for (i = 0; i < (1 << data->tbl_bits) && data->pgd[i]; ++i) { - ptep = data->pgd[i]; + if (level > 1) { end = (void *)ptep + DART_GRANULE(data); while (ptep != end) { dart_iopte pte = *ptep++; if (pte) - iommu_free_pages(iopte_deref(pte, data)); + apple_dart_free_pgtables(data, iopte_deref(pte, data), level - 1); } - iommu_free_pages(data->pgd[i]); } + iommu_free_pages(start); +} + +static void apple_dart_free_pgtable(struct io_pgtable *iop) +{ + struct dart_io_pgtable *data = io_pgtable_to_data(iop); + int i; + + for (i = 0; i < (1 << data->tbl_bits) && data->pgd[i]; ++i) + apple_dart_free_pgtables(data, data->pgd[i], data->levels - 1); kfree(data); } diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index e236b932e766..c95394cd03a7 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -37,6 +37,8 @@ void iommu_device_unregister_bus(struct iommu_device *iommu, const struct bus_type *bus, struct notifier_block *nb); +int iommu_mock_device_add(struct device *dev, struct iommu_device *iommu); + struct iommu_attach_handle *iommu_attach_handle_get(struct iommu_group *group, ioasid_t pasid, unsigned int type); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 060ebe330ee1..59244c744eab 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -304,6 +304,7 @@ void iommu_device_unregister_bus(struct iommu_device *iommu, struct notifier_block *nb) { bus_unregister_notifier(bus, nb); + fwnode_remove_software_node(iommu->fwnode); iommu_device_unregister(iommu); } EXPORT_SYMBOL_GPL(iommu_device_unregister_bus); @@ -326,6 +327,12 @@ int iommu_device_register_bus(struct iommu_device *iommu, if (err) return err; + iommu->fwnode = fwnode_create_software_node(NULL, NULL); + if (IS_ERR(iommu->fwnode)) { + bus_unregister_notifier(bus, nb); + return PTR_ERR(iommu->fwnode); + } + spin_lock(&iommu_device_lock); list_add_tail(&iommu->list, &iommu_device_list); spin_unlock(&iommu_device_lock); @@ -335,9 +342,28 @@ int iommu_device_register_bus(struct iommu_device *iommu, iommu_device_unregister_bus(iommu, bus, nb); return err; } + WRITE_ONCE(iommu->ready, 
true); return 0; } EXPORT_SYMBOL_GPL(iommu_device_register_bus); + +int iommu_mock_device_add(struct device *dev, struct iommu_device *iommu) +{ + int rc; + + mutex_lock(&iommu_probe_device_lock); + rc = iommu_fwspec_init(dev, iommu->fwnode); + mutex_unlock(&iommu_probe_device_lock); + + if (rc) + return rc; + + rc = device_add(dev); + if (rc) + iommu_fwspec_free(dev); + return rc; +} +EXPORT_SYMBOL_GPL(iommu_mock_device_add); #endif static struct dev_iommu *dev_iommu_get(struct device *dev) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 61686603c769..de178827a078 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -1126,7 +1126,7 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags) goto err_put; } - rc = device_add(&mdev->dev); + rc = iommu_mock_device_add(&mdev->dev, &mock_iommu.iommu_dev); if (rc) goto err_put; return mdev; diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 6fb93927bdb9..5c6f5943f44b 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1303,8 +1303,8 @@ static int omap_iommu_map(struct iommu_domain *domain, unsigned long da, struct omap_iommu_device *iommu; struct omap_iommu *oiommu; struct iotlb_entry e; + int ret = -EINVAL; int omap_pgsz; - u32 ret = -EINVAL; int i; omap_pgsz = bytes_to_iopgsz(bytes); diff --git a/drivers/iommu/riscv/iommu-platform.c b/drivers/iommu/riscv/iommu-platform.c index 725e919b97ef..83a28c83f991 100644 --- a/drivers/iommu/riscv/iommu-platform.c +++ b/drivers/iommu/riscv/iommu-platform.c @@ -10,6 +10,8 @@ * Tomasz Jeznach <tjeznach@rivosinc.com> */ +#include <linux/acpi.h> +#include <linux/irqchip/riscv-imsic.h> #include <linux/kernel.h> #include <linux/msi.h> #include <linux/of_irq.h> @@ -46,6 +48,7 @@ static int riscv_iommu_platform_probe(struct platform_device *pdev) enum riscv_iommu_igs_settings igs; struct device *dev = &pdev->dev; struct riscv_iommu_device *iommu = NULL; + struct irq_domain *msi_domain; struct resource *res = NULL; int vec, ret; @@ -76,8 +79,13 @@ static int riscv_iommu_platform_probe(struct platform_device *pdev) switch (igs) { case RISCV_IOMMU_CAPABILITIES_IGS_BOTH: case RISCV_IOMMU_CAPABILITIES_IGS_MSI: - if (is_of_node(dev->fwnode)) + if (is_of_node(dev_fwnode(dev))) { of_msi_configure(dev, to_of_node(dev->fwnode)); + } else { + msi_domain = irq_find_matching_fwnode(imsic_acpi_get_fwnode(dev), + DOMAIN_BUS_PLATFORM_MSI); + dev_set_msi_domain(dev, msi_domain); + } if (!dev_get_msi_domain(dev)) { dev_warn(dev, "failed to find an MSI domain\n"); @@ -150,6 +158,12 @@ static const struct of_device_id riscv_iommu_of_match[] = { {}, }; +static const struct acpi_device_id riscv_iommu_acpi_match[] = { + { "RSCV0004", 0 }, + {} +}; +MODULE_DEVICE_TABLE(acpi, riscv_iommu_acpi_match); + static struct platform_driver riscv_iommu_platform_driver = { .probe = riscv_iommu_platform_probe, .remove = riscv_iommu_platform_remove, @@ -158,6 +172,7 @@ static struct platform_driver riscv_iommu_platform_driver = { .name = "riscv,iommu", .of_match_table = riscv_iommu_of_match, .suppress_bind_attrs = true, + .acpi_match_table = riscv_iommu_acpi_match, }, }; diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c index 0eae2f4bdc5e..ebb22979075d 100644 --- a/drivers/iommu/riscv/iommu.c +++ b/drivers/iommu/riscv/iommu.c @@ -12,6 +12,8 @@ #define pr_fmt(fmt) "riscv-iommu: " fmt +#include <linux/acpi.h> +#include <linux/acpi_rimt.h> #include <linux/compiler.h> #include <linux/crash_dump.h> 
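One note on the iommu core and iommufd selftest hunks just above: a bus-registered IOMMU now owns a software-node fwnode, and the mock device has to be bound to it through an iommu_fwspec before device_add() runs, much as firmware tables do for real devices. The sketch below strings the three pieces together (creation, per-device binding, teardown) using only the calls that appear in those hunks; error paths and the probe-lock detail are trimmed, and the function names are illustrative rather than upstream ones.

#include <linux/device.h>
#include <linux/err.h>
#include <linux/iommu.h>
#include <linux/property.h>

/* Registration side: give the bus-scoped IOMMU an anchor fwnode. */
static int sketch_register_fwnode(struct iommu_device *iommu)
{
	struct fwnode_handle *fwnode = fwnode_create_software_node(NULL, NULL);

	if (IS_ERR(fwnode))
		return PTR_ERR(fwnode);
	iommu->fwnode = fwnode;	/* consumed later by iommu_fwspec_init() */
	return 0;
}

/* Device side: bind the fwspec before the device becomes visible. */
static int sketch_add_mock_device(struct device *dev, struct iommu_device *iommu)
{
	int rc = iommu_fwspec_init(dev, iommu->fwnode);

	if (rc)
		return rc;

	rc = device_add(dev);
	if (rc)
		iommu_fwspec_free(dev);	/* undo the binding on failure */
	return rc;
}

/* Teardown side: the software node is dropped again on unregister. */
static void sketch_unregister_fwnode(struct iommu_device *iommu)
{
	fwnode_remove_software_node(iommu->fwnode);
}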
#include <linux/init.h> @@ -1650,6 +1652,14 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu) goto err_iodir_off; } + if (!acpi_disabled) { + rc = rimt_iommu_register(iommu->dev); + if (rc) { + dev_err_probe(iommu->dev, rc, "cannot register iommu with RIMT\n"); + goto err_remove_sysfs; + } + } + rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev); if (rc) { dev_err_probe(iommu->dev, rc, "cannot register iommu interface\n"); diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 07c19b2182ca..104aa5355090 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -688,4 +688,6 @@ config DM_AUDIT source "drivers/md/dm-vdo/Kconfig" +source "drivers/md/dm-pcache/Kconfig" + endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 5a51b3408b70..c338cc6fbe2e 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -73,6 +73,7 @@ obj-$(CONFIG_DM_RAID) += dm-raid.o obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o obj-$(CONFIG_DM_VERITY) += dm-verity.o obj-$(CONFIG_DM_VDO) += dm-vdo/ +obj-$(CONFIG_DM_PCACHE) += dm-pcache/ obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o obj-$(CONFIG_DM_EBS) += dm-ebs.o diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 8f3a23f4b168..e6d28be11c5c 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1337,7 +1337,7 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector, char *ptr; unsigned int len; - bio = bio_kmalloc(1, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN); + bio = bio_kmalloc(1, GFP_NOWAIT); if (!bio) { use_dmio(b, op, sector, n_sectors, offset, ioprio); return; @@ -1601,18 +1601,18 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client * dm-bufio is resistant to allocation failures (it just keeps * one buffer reserved in cases all the allocations fail). * So set flags to not try too hard: - * GFP_NOWAIT: don't wait; if we need to sleep we'll release our - * mutex and wait ourselves. + * GFP_NOWAIT: don't wait and don't print a warning in case of + * failure; if we need to sleep we'll release our mutex + * and wait ourselves. * __GFP_NORETRY: don't retry and rather return failure * __GFP_NOMEMALLOC: don't use emergency reserves - * __GFP_NOWARN: don't print a warning in case of failure * * For debugging, if we set the cache size to 1, no new buffers will * be allocated. 
*/ while (1) { if (dm_bufio_cache_size_latch != 1) { - b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC); if (b) return b; } diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index 2ed894155cab..7e1e8cc0e33a 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -590,7 +590,7 @@ static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned in nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u)); ht->hash_bits = __ffs(nr_buckets); - ht->buckets = vmalloc(array_size(nr_buckets, sizeof(*ht->buckets))); + ht->buckets = vmalloc_array(nr_buckets, sizeof(*ht->buckets)); if (!ht->buckets) return -ENOMEM; diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index c889332e533b..a3c9f74fe2dc 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -162,6 +162,7 @@ struct mapped_device { #define DMF_SUSPENDED_INTERNALLY 7 #define DMF_POST_SUSPENDING 8 #define DMF_EMULATE_ZONE_APPEND 9 +#define DMF_QUEUE_STOPPED 10 static inline sector_t dm_get_size(struct mapped_device *md) { @@ -291,6 +292,7 @@ struct dm_io { struct dm_io *next; struct dm_stats_aux stats_aux; blk_status_t status; + bool requeue_flush_with_data; atomic_t io_count; struct mapped_device *md; diff --git a/drivers/md/dm-ima.c b/drivers/md/dm-ima.c index 8b50c908c6f4..efb3cd4f9cd4 100644 --- a/drivers/md/dm-ima.c +++ b/drivers/md/dm-ima.c @@ -45,7 +45,7 @@ static void fix_separator_chars(char **buf) /* * Internal function to allocate memory for IMA measurements. */ -static void *dm_ima_alloc(size_t len, gfp_t flags, bool noio) +static void *dm_ima_alloc(size_t len, bool noio) { unsigned int noio_flag; void *ptr; @@ -53,7 +53,7 @@ static void *dm_ima_alloc(size_t len, gfp_t flags, bool noio) if (noio) noio_flag = memalloc_noio_save(); - ptr = kzalloc(len, flags); + ptr = kzalloc(len, GFP_KERNEL); if (noio) memalloc_noio_restore(noio_flag); @@ -68,13 +68,13 @@ static int dm_ima_alloc_and_copy_name_uuid(struct mapped_device *md, char **dev_ char **dev_uuid, bool noio) { int r; - *dev_name = dm_ima_alloc(DM_NAME_LEN*2, GFP_KERNEL, noio); + *dev_name = dm_ima_alloc(DM_NAME_LEN*2, noio); if (!(*dev_name)) { r = -ENOMEM; goto error; } - *dev_uuid = dm_ima_alloc(DM_UUID_LEN*2, GFP_KERNEL, noio); + *dev_uuid = dm_ima_alloc(DM_UUID_LEN*2, noio); if (!(*dev_uuid)) { r = -ENOMEM; goto error; @@ -109,7 +109,7 @@ static int dm_ima_alloc_and_copy_device_data(struct mapped_device *md, char **de if (r) return r; - *device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio); + *device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, noio); if (!(*device_data)) { r = -ENOMEM; goto error; @@ -153,14 +153,12 @@ static int dm_ima_alloc_and_copy_capacity_str(struct mapped_device *md, char **c capacity = get_capacity(md->disk); - *capacity_str = dm_ima_alloc(DM_IMA_DEVICE_CAPACITY_BUF_LEN, GFP_KERNEL, noio); + *capacity_str = dm_ima_alloc(DM_IMA_DEVICE_CAPACITY_BUF_LEN, noio); if (!(*capacity_str)) return -ENOMEM; - scnprintf(*capacity_str, DM_IMA_DEVICE_BUF_LEN, "current_device_capacity=%llu;", - capacity); - - return 0; + return scnprintf(*capacity_str, DM_IMA_DEVICE_BUF_LEN, "current_device_capacity=%llu;", + capacity); } /* @@ -195,15 +193,15 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl const size_t hash_alg_prefix_len = strlen(DM_IMA_TABLE_HASH_ALG) + 1; char table_load_event_name[] = "dm_table_load"; - ima_buf = 
dm_ima_alloc(DM_IMA_MEASUREMENT_BUF_LEN, GFP_KERNEL, noio); + ima_buf = dm_ima_alloc(DM_IMA_MEASUREMENT_BUF_LEN, noio); if (!ima_buf) return; - target_metadata_buf = dm_ima_alloc(DM_IMA_TARGET_METADATA_BUF_LEN, GFP_KERNEL, noio); + target_metadata_buf = dm_ima_alloc(DM_IMA_TARGET_METADATA_BUF_LEN, noio); if (!target_metadata_buf) goto error; - target_data_buf = dm_ima_alloc(DM_IMA_TARGET_DATA_BUF_LEN, GFP_KERNEL, noio); + target_data_buf = dm_ima_alloc(DM_IMA_TARGET_DATA_BUF_LEN, noio); if (!target_data_buf) goto error; @@ -218,7 +216,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl shash->tfm = tfm; digest_size = crypto_shash_digestsize(tfm); - digest = dm_ima_alloc(digest_size, GFP_KERNEL, noio); + digest = dm_ima_alloc(digest_size, noio); if (!digest) goto error; @@ -327,7 +325,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl if (r < 0) goto error; - digest_buf = dm_ima_alloc((digest_size*2) + hash_alg_prefix_len + 1, GFP_KERNEL, noio); + digest_buf = dm_ima_alloc((digest_size*2) + hash_alg_prefix_len + 1, noio); if (!digest_buf) goto error; @@ -371,18 +369,18 @@ void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap) { char *device_table_data, *dev_name = NULL, *dev_uuid = NULL, *capacity_str = NULL; char active[] = "active_table_hash="; - unsigned int active_len = strlen(active), capacity_len = 0; + unsigned int active_len = strlen(active); unsigned int l = 0; bool noio = true; bool nodata = true; - int r; + int capacity_len; - device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio); + device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, noio); if (!device_table_data) return; - r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); - if (r) + capacity_len = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); + if (capacity_len < 0) goto error; memcpy(device_table_data + l, DM_IMA_VERSION_STR, md->ima.dm_version_str_len); @@ -445,8 +443,7 @@ void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap) } if (nodata) { - r = dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio); - if (r) + if (dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio)) goto error; l = scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, @@ -454,7 +451,6 @@ void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap) DM_IMA_VERSION_STR, dev_name, dev_uuid); } - capacity_len = strlen(capacity_str); memcpy(device_table_data + l, capacity_str, capacity_len); l += capacity_len; @@ -483,18 +479,17 @@ void dm_ima_measure_on_device_remove(struct mapped_device *md, bool remove_all) unsigned int device_active_len = strlen(device_active_str); unsigned int device_inactive_len = strlen(device_inactive_str); unsigned int remove_all_len = strlen(remove_all_str); - unsigned int capacity_len = 0; unsigned int l = 0; bool noio = true; bool nodata = true; - int r; + int capacity_len; - device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN*2, GFP_KERNEL, noio); + device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN*2, noio); if (!device_table_data) goto exit; - r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); - if (r) { + capacity_len = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); + if (capacity_len < 0) { kfree(device_table_data); goto exit; } @@ -570,7 +565,6 @@ void dm_ima_measure_on_device_remove(struct mapped_device *md, bool remove_all) memcpy(device_table_data + l, remove_all ? 
"y;" : "n;", 2); l += 2; - capacity_len = strlen(capacity_str); memcpy(device_table_data + l, capacity_str, capacity_len); l += capacity_len; @@ -602,20 +596,20 @@ exit: */ void dm_ima_measure_on_table_clear(struct mapped_device *md, bool new_map) { - unsigned int l = 0, capacity_len = 0; + unsigned int l = 0; char *device_table_data = NULL, *dev_name = NULL, *dev_uuid = NULL, *capacity_str = NULL; char inactive_str[] = "inactive_table_hash="; unsigned int inactive_len = strlen(inactive_str); bool noio = true; bool nodata = true; - int r; + int capacity_len; - device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio); + device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, noio); if (!device_table_data) return; - r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); - if (r) + capacity_len = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); + if (capacity_len < 0) goto error1; memcpy(device_table_data + l, DM_IMA_VERSION_STR, md->ima.dm_version_str_len); @@ -650,7 +644,6 @@ void dm_ima_measure_on_table_clear(struct mapped_device *md, bool new_map) DM_IMA_VERSION_STR, dev_name, dev_uuid); } - capacity_len = strlen(capacity_str); memcpy(device_table_data + l, capacity_str, capacity_len); l += capacity_len; @@ -703,7 +696,7 @@ void dm_ima_measure_on_device_rename(struct mapped_device *md) char *old_device_data = NULL, *new_device_data = NULL, *combined_device_data = NULL; char *new_dev_name = NULL, *new_dev_uuid = NULL, *capacity_str = NULL; bool noio = true; - int r, len; + int len; if (dm_ima_alloc_and_copy_device_data(md, &new_device_data, md->ima.active_table.num_targets, noio)) @@ -712,12 +705,11 @@ void dm_ima_measure_on_device_rename(struct mapped_device *md) if (dm_ima_alloc_and_copy_name_uuid(md, &new_dev_name, &new_dev_uuid, noio)) goto error; - combined_device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN * 2, GFP_KERNEL, noio); + combined_device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN * 2, noio); if (!combined_device_data) goto error; - r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); - if (r) + if (dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio) < 0) goto error; old_device_data = md->ima.active_table.device_metadata; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index ab96b692e5a3..170bf67a2edd 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -219,10 +219,13 @@ struct dm_integrity_c { __u8 log2_blocks_per_bitmap_bit; unsigned char mode; + bool internal_hash; int failed; - struct crypto_shash *internal_hash; + struct crypto_shash *internal_shash; + struct crypto_ahash *internal_ahash; + unsigned int internal_hash_digestsize; struct dm_target *ti; @@ -277,6 +280,9 @@ struct dm_integrity_c { bool fix_hmac; bool legacy_recalculate; + mempool_t ahash_req_pool; + struct ahash_request *journal_ahash_req; + struct alg_spec internal_hash_alg; struct alg_spec journal_crypt_alg; struct alg_spec journal_mac_alg; @@ -326,6 +332,8 @@ struct dm_integrity_io { unsigned payload_len; bool integrity_payload_from_mempool; bool integrity_range_locked; + + struct ahash_request *ahash_req; }; struct journal_completion { @@ -352,6 +360,7 @@ struct bitmap_block_status { static struct kmem_cache *journal_io_cache; #define JOURNAL_IO_MEMPOOL 32 +#define AHASH_MEMPOOL 32 #ifdef DEBUG_PRINT #define DEBUG_print(x, ...) 
printk(KERN_DEBUG x, ##__VA_ARGS__)
@@ -1634,15 +1643,15 @@ static void integrity_end_io(struct bio *bio)
 	dec_in_flight(dio);
 }
 
-static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector,
-				      const char *data, char *result)
+static void integrity_sector_checksum_shash(struct dm_integrity_c *ic, sector_t sector,
+					    const char *data, unsigned offset, char *result)
 {
 	__le64 sector_le = cpu_to_le64(sector);
-	SHASH_DESC_ON_STACK(req, ic->internal_hash);
+	SHASH_DESC_ON_STACK(req, ic->internal_shash);
 	int r;
 	unsigned int digest_size;
 
-	req->tfm = ic->internal_hash;
+	req->tfm = ic->internal_shash;
 	r = crypto_shash_init(req);
 	if (unlikely(r < 0)) {
@@ -1664,7 +1673,7 @@ static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector
 		goto failed;
 	}
 
-	r = crypto_shash_update(req, data, ic->sectors_per_block << SECTOR_SHIFT);
+	r = crypto_shash_update(req, data + offset, ic->sectors_per_block << SECTOR_SHIFT);
 	if (unlikely(r < 0)) {
 		dm_integrity_io_error(ic, "crypto_shash_update", r);
 		goto failed;
@@ -1676,7 +1685,7 @@ static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector
 		goto failed;
 	}
 
-	digest_size = crypto_shash_digestsize(ic->internal_hash);
+	digest_size = ic->internal_hash_digestsize;
 	if (unlikely(digest_size < ic->tag_size))
 		memset(result + digest_size, 0, ic->tag_size - digest_size);
@@ -1687,6 +1696,104 @@ failed:
 	get_random_bytes(result, ic->tag_size);
 }
 
+static void integrity_sector_checksum_ahash(struct dm_integrity_c *ic, struct ahash_request **ahash_req,
+					    sector_t sector, struct page *page, unsigned offset, char *result)
+{
+	__le64 sector_le = cpu_to_le64(sector);
+	struct ahash_request *req;
+	DECLARE_CRYPTO_WAIT(wait);
+	struct scatterlist sg[3], *s = sg;
+	int r;
+	unsigned int digest_size;
+	unsigned int nbytes = 0;
+
+	might_sleep();
+
+	req = *ahash_req;
+	if (unlikely(!req)) {
+		req = mempool_alloc(&ic->ahash_req_pool, GFP_NOIO);
+		*ahash_req = req;
+	}
+
+	ahash_request_set_tfm(req, ic->internal_ahash);
+	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait);
+
+	if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) {
+		sg_init_table(sg, 3);
+		sg_set_buf(s, (const __u8 *)&ic->sb->salt, SALT_SIZE);
+		nbytes += SALT_SIZE;
+		s++;
+	} else {
+		sg_init_table(sg, 2);
+	}
+
+	if (likely(!is_vmalloc_addr(&sector_le))) {
+		sg_set_buf(s, &sector_le, sizeof(sector_le));
+	} else {
+		struct page *sec_page = vmalloc_to_page(&sector_le);
+		unsigned int sec_off = offset_in_page(&sector_le);
+		sg_set_page(s, sec_page, sizeof(sector_le), sec_off);
+	}
+	nbytes += sizeof(sector_le);
+	s++;
+
+	sg_set_page(s, page, ic->sectors_per_block << SECTOR_SHIFT, offset);
+	nbytes += ic->sectors_per_block << SECTOR_SHIFT;
+
+	ahash_request_set_crypt(req, sg, result, nbytes);
+
+	r = crypto_wait_req(crypto_ahash_digest(req), &wait);
+	if (unlikely(r)) {
+		dm_integrity_io_error(ic, "crypto_ahash_digest", r);
+		goto failed;
+	}
+
+	digest_size = ic->internal_hash_digestsize;
+	if (unlikely(digest_size < ic->tag_size))
+		memset(result + digest_size, 0, ic->tag_size - digest_size);
+
+	return;
+
+failed:
+	/* this shouldn't happen anyway, the hash functions have no reason to fail */
+	get_random_bytes(result, ic->tag_size);
+}
+
+static void integrity_sector_checksum(struct dm_integrity_c *ic, struct ahash_request **ahash_req,
+				      sector_t sector, const char *data, unsigned offset, char *result)
+{
+	if (likely(ic->internal_shash != NULL))
+		integrity_sector_checksum_shash(ic, sector, data, offset, result);
+	else
+
integrity_sector_checksum_ahash(ic, ahash_req, sector, (struct page *)data, offset, result); +} + +static void *integrity_kmap(struct dm_integrity_c *ic, struct page *p) +{ + if (likely(ic->internal_shash != NULL)) + return kmap_local_page(p); + else + return p; +} + +static void integrity_kunmap(struct dm_integrity_c *ic, const void *ptr) +{ + if (likely(ic->internal_shash != NULL)) + kunmap_local(ptr); +} + +static void *integrity_identity(struct dm_integrity_c *ic, void *data) +{ +#ifdef CONFIG_DEBUG_SG + BUG_ON(offset_in_page(data)); + BUG_ON(!virt_addr_valid(data)); +#endif + if (likely(ic->internal_shash != NULL)) + return data; + else + return virt_to_page(data); +} + static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checksum) { struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); @@ -1711,6 +1818,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks sector_t alignment; char *mem; char *buffer = page_to_virt(page); + unsigned int buffer_offset; int r; struct dm_io_request io_req; struct dm_io_region io_loc; @@ -1728,7 +1836,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks alignment &= -alignment; io_loc.sector = round_down(io_loc.sector, alignment); io_loc.count += sector - io_loc.sector; - buffer += (sector - io_loc.sector) << SECTOR_SHIFT; + buffer_offset = (sector - io_loc.sector) << SECTOR_SHIFT; io_loc.count = round_up(io_loc.count, alignment); r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT); @@ -1737,7 +1845,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks goto free_ret; } - integrity_sector_checksum(ic, logical_sector, buffer, checksum); + integrity_sector_checksum(ic, &dio->ahash_req, logical_sector, integrity_identity(ic, buffer), buffer_offset, checksum); r = dm_integrity_rw_tag(ic, checksum, &dio->metadata_block, &dio->metadata_offset, ic->tag_size, TAG_CMP); if (r) { @@ -1754,7 +1862,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks } mem = bvec_kmap_local(&bv); - memcpy(mem + pos, buffer, ic->sectors_per_block << SECTOR_SHIFT); + memcpy(mem + pos, buffer + buffer_offset, ic->sectors_per_block << SECTOR_SHIFT); kunmap_local(mem); pos += ic->sectors_per_block << SECTOR_SHIFT; @@ -1776,7 +1884,7 @@ static void integrity_metadata(struct work_struct *w) if (ic->internal_hash) { struct bvec_iter iter; struct bio_vec bv; - unsigned int digest_size = crypto_shash_digestsize(ic->internal_hash); + unsigned int digest_size = ic->internal_hash_digestsize; struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); char *checksums; unsigned int extra_space = unlikely(digest_size > ic->tag_size) ? 
digest_size - ic->tag_size : 0; @@ -1837,17 +1945,17 @@ static void integrity_metadata(struct work_struct *w) char *mem, *checksums_ptr; again: - mem = bvec_kmap_local(&bv_copy); + mem = integrity_kmap(ic, bv_copy.bv_page); pos = 0; checksums_ptr = checksums; do { - integrity_sector_checksum(ic, sector, mem + pos, checksums_ptr); + integrity_sector_checksum(ic, &dio->ahash_req, sector, mem, bv_copy.bv_offset + pos, checksums_ptr); checksums_ptr += ic->tag_size; sectors_to_process -= ic->sectors_per_block; pos += ic->sectors_per_block << SECTOR_SHIFT; sector += ic->sectors_per_block; } while (pos < bv_copy.bv_len && sectors_to_process && checksums != checksums_onstack); - kunmap_local(mem); + integrity_kunmap(ic, mem); r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset, checksums_ptr - checksums, dio->op == REQ_OP_READ ? TAG_CMP : TAG_WRITE); @@ -1949,6 +2057,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) dio->ic = ic; dio->bi_status = 0; dio->op = bio_op(bio); + dio->ahash_req = NULL; if (ic->mode == 'I') { bio->bi_iter.bi_sector = dm_target_offset(ic->ti, bio->bi_iter.bi_sector); @@ -2071,19 +2180,6 @@ retry_kmap: js++; mem_ptr += 1 << SECTOR_SHIFT; } while (++s < ic->sectors_per_block); -#ifdef INTERNAL_VERIFY - if (ic->internal_hash) { - char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; - - integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); - if (unlikely(crypto_memneq(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { - DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx", - logical_sector); - dm_audit_log_bio(DM_MSG_PREFIX, "journal-checksum", - bio, logical_sector, 0); - } - } -#endif } if (!ic->internal_hash) { @@ -2124,15 +2220,17 @@ retry_kmap: } while (++s < ic->sectors_per_block); if (ic->internal_hash) { - unsigned int digest_size = crypto_shash_digestsize(ic->internal_hash); + unsigned int digest_size = ic->internal_hash_digestsize; + void *js_page = integrity_identity(ic, (char *)js - offset_in_page(js)); + unsigned js_offset = offset_in_page(js); if (unlikely(digest_size > ic->tag_size)) { char checksums_onstack[HASH_MAX_DIGESTSIZE]; - integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack); + integrity_sector_checksum(ic, &dio->ahash_req, logical_sector, js_page, js_offset, checksums_onstack); memcpy(journal_entry_tag(ic, je), checksums_onstack, ic->tag_size); } else - integrity_sector_checksum(ic, logical_sector, (char *)js, journal_entry_tag(ic, je)); + integrity_sector_checksum(ic, &dio->ahash_req, logical_sector, js_page, js_offset, journal_entry_tag(ic, je)); } journal_entry_set_sector(je, logical_sector); @@ -2428,7 +2526,7 @@ retry: if (!dio->integrity_payload) { unsigned digest_size, extra_size; dio->payload_len = ic->tuple_size * (bio_sectors(bio) >> ic->sb->log2_sectors_per_block); - digest_size = crypto_shash_digestsize(ic->internal_hash); + digest_size = ic->internal_hash_digestsize; extra_size = unlikely(digest_size > ic->tag_size) ? 
digest_size - ic->tag_size : 0; dio->payload_len += extra_size; dio->integrity_payload = kmalloc(dio->payload_len, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); @@ -2505,11 +2603,11 @@ skip_spinlock: unsigned pos = 0; while (dio->bio_details.bi_iter.bi_size) { struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter); - const char *mem = bvec_kmap_local(&bv); + const char *mem = integrity_kmap(ic, bv.bv_page); if (ic->tag_size < ic->tuple_size) memset(dio->integrity_payload + pos + ic->tag_size, 0, ic->tuple_size - ic->tuple_size); - integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, dio->integrity_payload + pos); - kunmap_local(mem); + integrity_sector_checksum(ic, &dio->ahash_req, dio->bio_details.bi_iter.bi_sector, mem, bv.bv_offset, dio->integrity_payload + pos); + integrity_kunmap(ic, mem); pos += ic->tuple_size; bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT); } @@ -2588,8 +2686,8 @@ static void dm_integrity_inline_recheck(struct work_struct *w) } bio_put(outgoing_bio); - integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest); - if (unlikely(crypto_memneq(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { + integrity_sector_checksum(ic, &dio->ahash_req, dio->bio_details.bi_iter.bi_sector, integrity_identity(ic, outgoing_data), 0, digest); + if (unlikely(crypto_memneq(digest, dio->integrity_payload, min(ic->internal_hash_digestsize, ic->tag_size)))) { DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx", ic->dev->bdev, dio->bio_details.bi_iter.bi_sector); atomic64_inc(&ic->number_of_mismatches); @@ -2612,33 +2710,58 @@ static void dm_integrity_inline_recheck(struct work_struct *w) bio_endio(bio); } +static inline bool dm_integrity_check(struct dm_integrity_c *ic, struct dm_integrity_io *dio) +{ + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + unsigned pos = 0; + + while (dio->bio_details.bi_iter.bi_size) { + char digest[HASH_MAX_DIGESTSIZE]; + struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter); + char *mem = integrity_kmap(ic, bv.bv_page); + integrity_sector_checksum(ic, &dio->ahash_req, dio->bio_details.bi_iter.bi_sector, mem, bv.bv_offset, digest); + if (unlikely(crypto_memneq(digest, dio->integrity_payload + pos, + min(ic->internal_hash_digestsize, ic->tag_size)))) { + integrity_kunmap(ic, mem); + dm_integrity_free_payload(dio); + INIT_WORK(&dio->work, dm_integrity_inline_recheck); + queue_work(ic->offload_wq, &dio->work); + return false; + } + integrity_kunmap(ic, mem); + pos += ic->tuple_size; + bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT); + } + + return true; +} + +static void dm_integrity_inline_async_check(struct work_struct *w) +{ + struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work); + struct dm_integrity_c *ic = dio->ic; + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + + if (likely(dm_integrity_check(ic, dio))) + bio_endio(bio); +} + static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status) { struct dm_integrity_c *ic = ti->private; + struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); if (ic->mode == 'I') { - struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); - if (dio->op == REQ_OP_READ && likely(*status == BLK_STS_OK)) { - unsigned pos = 0; + if 
(dio->op == REQ_OP_READ && likely(*status == BLK_STS_OK) && likely(dio->bio_details.bi_iter.bi_size != 0)) { if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && unlikely(dio->integrity_range_locked)) - goto skip_check; - while (dio->bio_details.bi_iter.bi_size) { - char digest[HASH_MAX_DIGESTSIZE]; - struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter); - char *mem = bvec_kmap_local(&bv); - //memset(mem, 0xff, ic->sectors_per_block << SECTOR_SHIFT); - integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest); - if (unlikely(crypto_memneq(digest, dio->integrity_payload + pos, - min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { - kunmap_local(mem); - dm_integrity_free_payload(dio); - INIT_WORK(&dio->work, dm_integrity_inline_recheck); - queue_work(ic->offload_wq, &dio->work); + goto skip_check; + if (likely(ic->internal_shash != NULL)) { + if (unlikely(!dm_integrity_check(ic, dio))) return DM_ENDIO_INCOMPLETE; - } - kunmap_local(mem); - pos += ic->tuple_size; - bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT); + } else { + INIT_WORK(&dio->work, dm_integrity_inline_async_check); + queue_work(ic->offload_wq, &dio->work); + return DM_ENDIO_INCOMPLETE; } } skip_check: @@ -2646,6 +2769,8 @@ skip_check: if (unlikely(dio->integrity_range_locked)) remove_range(ic, &dio->range); } + if (unlikely(dio->ahash_req)) + mempool_free(dio->ahash_req, &ic->ahash_req_pool); return DM_ENDIO_DONE; } @@ -2902,9 +3027,12 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start #endif ic->internal_hash) { char test_tag[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + struct journal_sector *js = access_journal_data(ic, i, l); + void *js_page = integrity_identity(ic, (char *)js - offset_in_page(js)); + unsigned js_offset = offset_in_page(js); - integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block), - (char *)access_journal_data(ic, i, l), test_tag); + integrity_sector_checksum(ic, &ic->journal_ahash_req, sec + ((l - j) << ic->sb->log2_sectors_per_block), + js_page, js_offset, test_tag); if (unlikely(crypto_memneq(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) { dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ); dm_audit_log_target(DM_MSG_PREFIX, "integrity-replay-journal", ic->ti, 0); @@ -2987,6 +3115,7 @@ static void integrity_recalc(struct work_struct *w) size_t recalc_tags_size; u8 *recalc_buffer = NULL; u8 *recalc_tags = NULL; + struct ahash_request *ahash_req = NULL; struct dm_integrity_range range; struct dm_io_request io_req; struct dm_io_region io_loc; @@ -3001,7 +3130,7 @@ static void integrity_recalc(struct work_struct *w) unsigned recalc_sectors = RECALC_SECTORS; retry: - recalc_buffer = __vmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO); + recalc_buffer = kmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO | __GFP_NOWARN); if (!recalc_buffer) { oom: recalc_sectors >>= 1; @@ -3011,11 +3140,11 @@ oom: goto free_ret; } recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size; - if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size) - recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size; + if (ic->internal_hash_digestsize > ic->tag_size) + recalc_tags_size += ic->internal_hash_digestsize - ic->tag_size; recalc_tags = kvmalloc(recalc_tags_size, GFP_NOIO); if (!recalc_tags) { - vfree(recalc_buffer); + kfree(recalc_buffer); recalc_buffer = NULL; goto oom; } @@ 
-3081,7 +3210,7 @@ next_chunk: goto err; io_req.bi_opf = REQ_OP_READ; - io_req.mem.type = DM_IO_VMA; + io_req.mem.type = DM_IO_KMEM; io_req.mem.ptr.addr = recalc_buffer; io_req.notify.fn = NULL; io_req.client = ic->io; @@ -3097,7 +3226,10 @@ next_chunk: t = recalc_tags; for (i = 0; i < n_sectors; i += ic->sectors_per_block) { - integrity_sector_checksum(ic, logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t); + void *ptr = recalc_buffer + (i << SECTOR_SHIFT); + void *ptr_page = integrity_identity(ic, (char *)ptr - offset_in_page(ptr)); + unsigned ptr_offset = offset_in_page(ptr); + integrity_sector_checksum(ic, &ahash_req, logical_sector + i, ptr_page, ptr_offset, t); t += ic->tag_size; } @@ -3139,8 +3271,9 @@ unlock_ret: recalc_write_super(ic); free_ret: - vfree(recalc_buffer); + kfree(recalc_buffer); kvfree(recalc_tags); + mempool_free(ahash_req, &ic->ahash_req_pool); } static void integrity_recalc_inline(struct work_struct *w) @@ -3149,6 +3282,7 @@ static void integrity_recalc_inline(struct work_struct *w) size_t recalc_tags_size; u8 *recalc_buffer = NULL; u8 *recalc_tags = NULL; + struct ahash_request *ahash_req = NULL; struct dm_integrity_range range; struct bio *bio; struct bio_integrity_payload *bip; @@ -3171,8 +3305,8 @@ oom: } recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tuple_size; - if (crypto_shash_digestsize(ic->internal_hash) > ic->tuple_size) - recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tuple_size; + if (ic->internal_hash_digestsize > ic->tuple_size) + recalc_tags_size += ic->internal_hash_digestsize - ic->tuple_size; recalc_tags = kmalloc(recalc_tags_size, GFP_NOIO | __GFP_NOWARN); if (!recalc_tags) { kfree(recalc_buffer); @@ -3217,8 +3351,11 @@ next_chunk: t = recalc_tags; for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) { + void *ptr = recalc_buffer + (i << SECTOR_SHIFT); + void *ptr_page = integrity_identity(ic, (char *)ptr - offset_in_page(ptr)); + unsigned ptr_offset = offset_in_page(ptr); memset(t, 0, ic->tuple_size); - integrity_sector_checksum(ic, range.logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t); + integrity_sector_checksum(ic, &ahash_req, range.logical_sector + i, ptr_page, ptr_offset, t); t += ic->tuple_size; } @@ -3270,6 +3407,7 @@ unlock_ret: free_ret: kfree(recalc_buffer); kfree(recalc_tags); + mempool_free(ahash_req, &ic->ahash_req_pool); } static void bitmap_block_work(struct work_struct *w) @@ -4210,30 +4348,53 @@ nomem: return -ENOMEM; } -static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error, - char *error_alg, char *error_key) +static int get_mac(struct crypto_shash **shash, struct crypto_ahash **ahash, + struct alg_spec *a, char **error, char *error_alg, char *error_key) { int r; if (a->alg_string) { - *hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY); - if (IS_ERR(*hash)) { - *error = error_alg; - r = PTR_ERR(*hash); - *hash = NULL; - return r; - } - - if (a->key) { - r = crypto_shash_setkey(*hash, a->key, a->key_size); - if (r) { + if (shash) { + *shash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY); + if (IS_ERR(*shash)) { + *shash = NULL; + goto try_ahash; + } + if (a->key) { + r = crypto_shash_setkey(*shash, a->key, a->key_size); + if (r) { + *error = error_key; + return r; + } + } else if (crypto_shash_get_flags(*shash) & CRYPTO_TFM_NEED_KEY) { *error = error_key; + return -ENOKEY; + } + return 0; + } +try_ahash: + if (ahash) { + *ahash = crypto_alloc_ahash(a->alg_string, 0, 
CRYPTO_ALG_ALLOCATES_MEMORY); + if (IS_ERR(*ahash)) { + *error = error_alg; + r = PTR_ERR(*ahash); + *ahash = NULL; return r; } - } else if (crypto_shash_get_flags(*hash) & CRYPTO_TFM_NEED_KEY) { - *error = error_key; - return -ENOKEY; + if (a->key) { + r = crypto_ahash_setkey(*ahash, a->key, a->key_size); + if (r) { + *error = error_key; + return r; + } + } else if (crypto_ahash_get_flags(*ahash) & CRYPTO_TFM_NEED_KEY) { + *error = error_key; + return -ENOKEY; + } + return 0; } + *error = error_alg; + return -ENOENT; } return 0; @@ -4690,12 +4851,26 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv buffer_sectors = 1; ic->log2_buffer_sectors = min((int)__fls(buffer_sectors), 31 - SECTOR_SHIFT); - r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error, + r = get_mac(&ic->internal_shash, &ic->internal_ahash, &ic->internal_hash_alg, &ti->error, "Invalid internal hash", "Error setting internal hash key"); if (r) goto bad; + if (ic->internal_shash) { + ic->internal_hash = true; + ic->internal_hash_digestsize = crypto_shash_digestsize(ic->internal_shash); + } + if (ic->internal_ahash) { + ic->internal_hash = true; + ic->internal_hash_digestsize = crypto_ahash_digestsize(ic->internal_ahash); + r = mempool_init_kmalloc_pool(&ic->ahash_req_pool, AHASH_MEMPOOL, + sizeof(struct ahash_request) + crypto_ahash_reqsize(ic->internal_ahash)); + if (r) { + ti->error = "Cannot allocate mempool"; + goto bad; + } + } - r = get_mac(&ic->journal_mac, &ic->journal_mac_alg, &ti->error, + r = get_mac(&ic->journal_mac, NULL, &ic->journal_mac_alg, &ti->error, "Invalid journal mac", "Error setting journal mac key"); if (r) goto bad; @@ -4706,7 +4881,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv r = -EINVAL; goto bad; } - ic->tag_size = crypto_shash_digestsize(ic->internal_hash); + ic->tag_size = ic->internal_hash_digestsize; } if (ic->tag_size > MAX_TAG_SIZE) { ti->error = "Too big tag size"; @@ -5178,6 +5353,8 @@ static void dm_integrity_dtr(struct dm_target *ti) kvfree(ic->bbs); if (ic->bufio) dm_bufio_client_destroy(ic->bufio); + mempool_free(ic->journal_ahash_req, &ic->ahash_req_pool); + mempool_exit(&ic->ahash_req_pool); bioset_exit(&ic->recalc_bios); bioset_exit(&ic->recheck_bios); mempool_exit(&ic->recheck_pool); @@ -5215,8 +5392,10 @@ static void dm_integrity_dtr(struct dm_target *ti) if (ic->sb) free_pages_exact(ic->sb, SB_SECTORS << SECTOR_SHIFT); - if (ic->internal_hash) - crypto_free_shash(ic->internal_hash); + if (ic->internal_shash) + crypto_free_shash(ic->internal_shash); + if (ic->internal_ahash) + crypto_free_ahash(ic->internal_ahash); free_alg(&ic->internal_hash_alg); if (ic->journal_crypt) @@ -5233,7 +5412,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 13, 0}, + .version = {1, 14, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 679b07dee229..7bb7174f8f4f 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -414,7 +414,7 @@ static int log_super(struct log_writes_c *lc) } /* - * Super sector should be writen in-order, otherwise the + * Super sector should be written in-order, otherwise the * nr_entries could be rewritten incorrectly by an old bio. 
*/ wait_for_completion_io(&lc->super_done); diff --git a/drivers/md/dm-pcache/Kconfig b/drivers/md/dm-pcache/Kconfig new file mode 100644 index 000000000000..0e251eca892e --- /dev/null +++ b/drivers/md/dm-pcache/Kconfig @@ -0,0 +1,17 @@ +config DM_PCACHE + tristate "Persistent cache for Block Device (Experimental)" + depends on BLK_DEV_DM + depends on DEV_DAX + help + PCACHE provides a mechanism to use persistent memory (e.g., CXL persistent memory, + DAX-enabled devices) as a high-performance cache layer in front of + traditional block devices such as SSDs or HDDs. + + PCACHE is implemented as a kernel module that integrates with the block + layer and supports direct access (DAX) to persistent memory for low-latency, + byte-addressable caching. + + Note: This feature is experimental and should be tested thoroughly + before use in production environments. + + If unsure, say 'N'. diff --git a/drivers/md/dm-pcache/Makefile b/drivers/md/dm-pcache/Makefile new file mode 100644 index 000000000000..86776e4acad2 --- /dev/null +++ b/drivers/md/dm-pcache/Makefile @@ -0,0 +1,3 @@ +dm-pcache-y := dm_pcache.o cache_dev.o segment.o backing_dev.o cache.o cache_gc.o cache_writeback.o cache_segment.o cache_key.o cache_req.o + +obj-m += dm-pcache.o diff --git a/drivers/md/dm-pcache/backing_dev.c b/drivers/md/dm-pcache/backing_dev.c new file mode 100644 index 000000000000..7165fc0364bb --- /dev/null +++ b/drivers/md/dm-pcache/backing_dev.c @@ -0,0 +1,374 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/blkdev.h> + +#include "../dm-core.h" +#include "pcache_internal.h" +#include "cache_dev.h" +#include "backing_dev.h" +#include "cache.h" +#include "dm_pcache.h" + +static struct kmem_cache *backing_req_cache; +static struct kmem_cache *backing_bvec_cache; + +static void backing_dev_exit(struct pcache_backing_dev *backing_dev) +{ + mempool_exit(&backing_dev->req_pool); + mempool_exit(&backing_dev->bvec_pool); +} + +static void req_submit_fn(struct work_struct *work); +static void req_complete_fn(struct work_struct *work); +static int backing_dev_init(struct dm_pcache *pcache) +{ + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + int ret; + + ret = mempool_init_slab_pool(&backing_dev->req_pool, 128, backing_req_cache); + if (ret) + goto err; + + ret = mempool_init_slab_pool(&backing_dev->bvec_pool, 128, backing_bvec_cache); + if (ret) + goto req_pool_exit; + + INIT_LIST_HEAD(&backing_dev->submit_list); + INIT_LIST_HEAD(&backing_dev->complete_list); + spin_lock_init(&backing_dev->submit_lock); + spin_lock_init(&backing_dev->complete_lock); + INIT_WORK(&backing_dev->req_submit_work, req_submit_fn); + INIT_WORK(&backing_dev->req_complete_work, req_complete_fn); + atomic_set(&backing_dev->inflight_reqs, 0); + init_waitqueue_head(&backing_dev->inflight_wq); + + return 0; + +req_pool_exit: + mempool_exit(&backing_dev->req_pool); +err: + return ret; +} + +int backing_dev_start(struct dm_pcache *pcache) +{ + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + int ret; + + ret = backing_dev_init(pcache); + if (ret) + return ret; + + backing_dev->dev_size = bdev_nr_sectors(backing_dev->dm_dev->bdev); + + return 0; +} + +void backing_dev_stop(struct dm_pcache *pcache) +{ + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + + /* + * There should not be any new request comming, just wait + * inflight requests done. 
+ */ + wait_event(backing_dev->inflight_wq, + atomic_read(&backing_dev->inflight_reqs) == 0); + + flush_work(&backing_dev->req_submit_work); + flush_work(&backing_dev->req_complete_work); + + backing_dev_exit(backing_dev); +} + +/* pcache_backing_dev_req functions */ +void backing_dev_req_end(struct pcache_backing_dev_req *backing_req) +{ + struct pcache_backing_dev *backing_dev = backing_req->backing_dev; + + if (backing_req->end_req) + backing_req->end_req(backing_req, backing_req->ret); + + switch (backing_req->type) { + case BACKING_DEV_REQ_TYPE_REQ: + if (backing_req->req.upper_req) + pcache_req_put(backing_req->req.upper_req, backing_req->ret); + break; + case BACKING_DEV_REQ_TYPE_KMEM: + if (backing_req->kmem.bvecs != backing_req->kmem.inline_bvecs) + mempool_free(backing_req->kmem.bvecs, &backing_dev->bvec_pool); + break; + default: + BUG(); + } + + mempool_free(backing_req, &backing_dev->req_pool); + + if (atomic_dec_and_test(&backing_dev->inflight_reqs)) + wake_up(&backing_dev->inflight_wq); +} + +static void req_complete_fn(struct work_struct *work) +{ + struct pcache_backing_dev *backing_dev = container_of(work, struct pcache_backing_dev, req_complete_work); + struct pcache_backing_dev_req *backing_req; + LIST_HEAD(tmp_list); + + spin_lock_irq(&backing_dev->complete_lock); + list_splice_init(&backing_dev->complete_list, &tmp_list); + spin_unlock_irq(&backing_dev->complete_lock); + + while (!list_empty(&tmp_list)) { + backing_req = list_first_entry(&tmp_list, + struct pcache_backing_dev_req, node); + list_del_init(&backing_req->node); + backing_dev_req_end(backing_req); + } +} + +static void backing_dev_bio_end(struct bio *bio) +{ + struct pcache_backing_dev_req *backing_req = bio->bi_private; + struct pcache_backing_dev *backing_dev = backing_req->backing_dev; + unsigned long flags; + + backing_req->ret = blk_status_to_errno(bio->bi_status); + + spin_lock_irqsave(&backing_dev->complete_lock, flags); + list_move_tail(&backing_req->node, &backing_dev->complete_list); + queue_work(BACKING_DEV_TO_PCACHE(backing_dev)->task_wq, &backing_dev->req_complete_work); + spin_unlock_irqrestore(&backing_dev->complete_lock, flags); +} + +static void req_submit_fn(struct work_struct *work) +{ + struct pcache_backing_dev *backing_dev = container_of(work, struct pcache_backing_dev, req_submit_work); + struct pcache_backing_dev_req *backing_req; + LIST_HEAD(tmp_list); + + spin_lock(&backing_dev->submit_lock); + list_splice_init(&backing_dev->submit_list, &tmp_list); + spin_unlock(&backing_dev->submit_lock); + + while (!list_empty(&tmp_list)) { + backing_req = list_first_entry(&tmp_list, + struct pcache_backing_dev_req, node); + list_del_init(&backing_req->node); + submit_bio_noacct(&backing_req->bio); + } +} + +void backing_dev_req_submit(struct pcache_backing_dev_req *backing_req, bool direct) +{ + struct pcache_backing_dev *backing_dev = backing_req->backing_dev; + + if (direct) { + submit_bio_noacct(&backing_req->bio); + return; + } + + spin_lock(&backing_dev->submit_lock); + list_add_tail(&backing_req->node, &backing_dev->submit_list); + queue_work(BACKING_DEV_TO_PCACHE(backing_dev)->task_wq, &backing_dev->req_submit_work); + spin_unlock(&backing_dev->submit_lock); +} + +static void bio_map(struct bio *bio, void *base, size_t size) +{ + struct page *page; + unsigned int offset; + unsigned int len; + + if (!is_vmalloc_addr(base)) { + page = virt_to_page(base); + offset = offset_in_page(base); + + BUG_ON(!bio_add_page(bio, page, size, offset)); + return; + } + + flush_kernel_vmap_range(base, 
size); + while (size) { + page = vmalloc_to_page(base); + offset = offset_in_page(base); + len = min_t(size_t, PAGE_SIZE - offset, size); + + BUG_ON(!bio_add_page(bio, page, len, offset)); + size -= len; + base += len; + } +} + +static struct pcache_backing_dev_req *req_type_req_alloc(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts) +{ + struct pcache_request *pcache_req = opts->req.upper_req; + struct pcache_backing_dev_req *backing_req; + struct bio *orig = pcache_req->bio; + + backing_req = mempool_alloc(&backing_dev->req_pool, opts->gfp_mask); + if (!backing_req) + return NULL; + + memset(backing_req, 0, sizeof(struct pcache_backing_dev_req)); + + bio_init_clone(backing_dev->dm_dev->bdev, &backing_req->bio, orig, opts->gfp_mask); + + backing_req->type = BACKING_DEV_REQ_TYPE_REQ; + backing_req->backing_dev = backing_dev; + atomic_inc(&backing_dev->inflight_reqs); + + return backing_req; +} + +static struct pcache_backing_dev_req *kmem_type_req_alloc(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts) +{ + struct pcache_backing_dev_req *backing_req; + u32 n_vecs = bio_add_max_vecs(opts->kmem.data, opts->kmem.len); + + backing_req = mempool_alloc(&backing_dev->req_pool, opts->gfp_mask); + if (!backing_req) + return NULL; + + memset(backing_req, 0, sizeof(struct pcache_backing_dev_req)); + + if (n_vecs > BACKING_DEV_REQ_INLINE_BVECS) { + backing_req->kmem.bvecs = mempool_alloc(&backing_dev->bvec_pool, opts->gfp_mask); + if (!backing_req->kmem.bvecs) + goto free_backing_req; + } else { + backing_req->kmem.bvecs = backing_req->kmem.inline_bvecs; + } + + backing_req->kmem.n_vecs = n_vecs; + backing_req->type = BACKING_DEV_REQ_TYPE_KMEM; + backing_req->backing_dev = backing_dev; + atomic_inc(&backing_dev->inflight_reqs); + + return backing_req; + +free_backing_req: + mempool_free(backing_req, &backing_dev->req_pool); + return NULL; +} + +struct pcache_backing_dev_req *backing_dev_req_alloc(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts) +{ + if (opts->type == BACKING_DEV_REQ_TYPE_REQ) + return req_type_req_alloc(backing_dev, opts); + + if (opts->type == BACKING_DEV_REQ_TYPE_KMEM) + return kmem_type_req_alloc(backing_dev, opts); + + BUG(); +} + +static void req_type_req_init(struct pcache_backing_dev_req *backing_req, + struct pcache_backing_dev_req_opts *opts) +{ + struct pcache_request *pcache_req = opts->req.upper_req; + struct bio *clone; + u32 off = opts->req.req_off; + u32 len = opts->req.len; + + clone = &backing_req->bio; + BUG_ON(off & SECTOR_MASK); + BUG_ON(len & SECTOR_MASK); + bio_trim(clone, off >> SECTOR_SHIFT, len >> SECTOR_SHIFT); + + clone->bi_iter.bi_sector = (pcache_req->off + off) >> SECTOR_SHIFT; + clone->bi_private = backing_req; + clone->bi_end_io = backing_dev_bio_end; + + INIT_LIST_HEAD(&backing_req->node); + backing_req->end_req = opts->end_fn; + + pcache_req_get(pcache_req); + backing_req->req.upper_req = pcache_req; + backing_req->req.bio_off = off; +} + +static void kmem_type_req_init(struct pcache_backing_dev_req *backing_req, + struct pcache_backing_dev_req_opts *opts) +{ + struct pcache_backing_dev *backing_dev = backing_req->backing_dev; + struct bio *backing_bio; + + bio_init(&backing_req->bio, backing_dev->dm_dev->bdev, backing_req->kmem.bvecs, + backing_req->kmem.n_vecs, opts->kmem.opf); + + backing_bio = &backing_req->bio; + bio_map(backing_bio, opts->kmem.data, opts->kmem.len); + + backing_bio->bi_iter.bi_sector = (opts->kmem.backing_off) >> SECTOR_SHIFT; 
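/*
 * Illustrative usage sketch (not part of this patch): a dm-pcache caller
 * would drive the KMEM path initialized here roughly as below. "buf",
 * "wb_done" and the offset/length values are hypothetical; only the opts
 * fields and helpers defined in this file and backing_dev.h are assumed.
 */
#if 0
	struct pcache_backing_dev_req_opts opts = {
		.type     = BACKING_DEV_REQ_TYPE_KMEM,
		.gfp_mask = GFP_NOIO,
		.end_fn   = wb_done,            /* backing_req_end_fn_t: (req, ret) */
		.kmem = {
			.data        = buf,     /* kmalloc'ed or vmalloc'ed buffer; bio_map() handles both */
			.opf         = REQ_OP_WRITE,
			.len         = PAGE_SIZE,
			.backing_off = 0,       /* byte offset on the backing bdev */
		},
	};
	struct pcache_backing_dev_req *req;

	req = backing_dev_req_create(backing_dev, &opts);  /* alloc + init via this function */
	if (req)
		backing_dev_req_submit(req, false);        /* false: queued via req_submit_work */
#endif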
+ backing_bio->bi_private = backing_req; + backing_bio->bi_end_io = backing_dev_bio_end; + + INIT_LIST_HEAD(&backing_req->node); + backing_req->end_req = opts->end_fn; + backing_req->priv_data = opts->priv_data; +} + +void backing_dev_req_init(struct pcache_backing_dev_req *backing_req, + struct pcache_backing_dev_req_opts *opts) +{ + if (opts->type == BACKING_DEV_REQ_TYPE_REQ) + return req_type_req_init(backing_req, opts); + + if (opts->type == BACKING_DEV_REQ_TYPE_KMEM) + return kmem_type_req_init(backing_req, opts); + + BUG(); +} + +struct pcache_backing_dev_req *backing_dev_req_create(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts) +{ + struct pcache_backing_dev_req *backing_req; + + backing_req = backing_dev_req_alloc(backing_dev, opts); + if (!backing_req) + return NULL; + + backing_dev_req_init(backing_req, opts); + + return backing_req; +} + +void backing_dev_flush(struct pcache_backing_dev *backing_dev) +{ + blkdev_issue_flush(backing_dev->dm_dev->bdev); +} + +int pcache_backing_init(void) +{ + u32 max_bvecs = (PCACHE_CACHE_SUBTREE_SIZE >> PAGE_SHIFT) + 1; + int ret; + + backing_req_cache = KMEM_CACHE(pcache_backing_dev_req, 0); + if (!backing_req_cache) { + ret = -ENOMEM; + goto err; + } + + backing_bvec_cache = kmem_cache_create("pcache-bvec-slab", + max_bvecs * sizeof(struct bio_vec), + 0, 0, NULL); + if (!backing_bvec_cache) { + ret = -ENOMEM; + goto destroy_req_cache; + } + + return 0; +destroy_req_cache: + kmem_cache_destroy(backing_req_cache); +err: + return ret; +} + +void pcache_backing_exit(void) +{ + kmem_cache_destroy(backing_bvec_cache); + kmem_cache_destroy(backing_req_cache); +} diff --git a/drivers/md/dm-pcache/backing_dev.h b/drivers/md/dm-pcache/backing_dev.h new file mode 100644 index 000000000000..b371cba483b9 --- /dev/null +++ b/drivers/md/dm-pcache/backing_dev.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _BACKING_DEV_H +#define _BACKING_DEV_H + +#include <linux/device-mapper.h> + +#include "pcache_internal.h" + +struct pcache_backing_dev_req; +typedef void (*backing_req_end_fn_t)(struct pcache_backing_dev_req *backing_req, int ret); + +#define BACKING_DEV_REQ_TYPE_REQ 1 +#define BACKING_DEV_REQ_TYPE_KMEM 2 + +#define BACKING_DEV_REQ_INLINE_BVECS 4 + +struct pcache_request; +struct pcache_backing_dev_req { + u8 type; + struct bio bio; + struct pcache_backing_dev *backing_dev; + + void *priv_data; + backing_req_end_fn_t end_req; + + struct list_head node; + int ret; + + union { + struct { + struct pcache_request *upper_req; + u32 bio_off; + } req; + struct { + struct bio_vec inline_bvecs[BACKING_DEV_REQ_INLINE_BVECS]; + struct bio_vec *bvecs; + u32 n_vecs; + } kmem; + }; +}; + +struct pcache_backing_dev { + struct pcache_cache *cache; + + struct dm_dev *dm_dev; + mempool_t req_pool; + mempool_t bvec_pool; + + struct list_head submit_list; + spinlock_t submit_lock; + struct work_struct req_submit_work; + + struct list_head complete_list; + spinlock_t complete_lock; + struct work_struct req_complete_work; + + atomic_t inflight_reqs; + wait_queue_head_t inflight_wq; + + u64 dev_size; +}; + +struct dm_pcache; +int backing_dev_start(struct dm_pcache *pcache); +void backing_dev_stop(struct dm_pcache *pcache); + +struct pcache_backing_dev_req_opts { + u32 type; + union { + struct { + struct pcache_request *upper_req; + u32 req_off; + u32 len; + } req; + struct { + void *data; + blk_opf_t opf; + u32 len; + u64 backing_off; + } kmem; + }; + + gfp_t gfp_mask; + backing_req_end_fn_t end_fn; + void 
*priv_data; +}; + +static inline u32 backing_dev_req_coalesced_max_len(const void *data, u32 len) +{ + const void *p = data; + u32 done = 0, in_page, to_advance; + struct page *first_page, *next_page; + + if (!is_vmalloc_addr(data)) + return len; + + first_page = vmalloc_to_page(p); +advance: + in_page = PAGE_SIZE - offset_in_page(p); + to_advance = min_t(u32, in_page, len - done); + + done += to_advance; + p += to_advance; + + if (done == len) + return done; + + next_page = vmalloc_to_page(p); + if (zone_device_pages_have_same_pgmap(first_page, next_page)) + goto advance; + + return done; +} + +void backing_dev_req_submit(struct pcache_backing_dev_req *backing_req, bool direct); +void backing_dev_req_end(struct pcache_backing_dev_req *backing_req); +struct pcache_backing_dev_req *backing_dev_req_create(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts); +struct pcache_backing_dev_req *backing_dev_req_alloc(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts); +void backing_dev_req_init(struct pcache_backing_dev_req *backing_req, + struct pcache_backing_dev_req_opts *opts); +void backing_dev_flush(struct pcache_backing_dev *backing_dev); + +int pcache_backing_init(void); +void pcache_backing_exit(void); +#endif /* _BACKING_DEV_H */ diff --git a/drivers/md/dm-pcache/cache.c b/drivers/md/dm-pcache/cache.c new file mode 100644 index 000000000000..d8e92367d947 --- /dev/null +++ b/drivers/md/dm-pcache/cache.c @@ -0,0 +1,445 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/blk_types.h> + +#include "cache.h" +#include "cache_dev.h" +#include "backing_dev.h" +#include "dm_pcache.h" + +struct kmem_cache *key_cache; + +static inline struct pcache_cache_info *get_cache_info_addr(struct pcache_cache *cache) +{ + return cache->cache_info_addr + cache->info_index; +} + +static void cache_info_write(struct pcache_cache *cache) +{ + struct pcache_cache_info *cache_info = &cache->cache_info; + + cache_info->header.seq++; + cache_info->header.crc = pcache_meta_crc(&cache_info->header, + sizeof(struct pcache_cache_info)); + + memcpy_flushcache(get_cache_info_addr(cache), cache_info, + sizeof(struct pcache_cache_info)); + + cache->info_index = (cache->info_index + 1) % PCACHE_META_INDEX_MAX; +} + +static void cache_info_init_default(struct pcache_cache *cache); +static int cache_info_init(struct pcache_cache *cache, struct pcache_cache_options *opts) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_info *cache_info_addr; + + cache_info_addr = pcache_meta_find_latest(&cache->cache_info_addr->header, + sizeof(struct pcache_cache_info), + PCACHE_CACHE_INFO_SIZE, + &cache->cache_info); + if (IS_ERR(cache_info_addr)) + return PTR_ERR(cache_info_addr); + + if (cache_info_addr) { + if (opts->data_crc != + (cache->cache_info.flags & PCACHE_CACHE_FLAGS_DATA_CRC)) { + pcache_dev_err(pcache, "invalid option for data_crc: %s, expected: %s", + opts->data_crc ? "true" : "false", + cache->cache_info.flags & PCACHE_CACHE_FLAGS_DATA_CRC ? 
"true" : "false"); + return -EINVAL; + } + + return 0; + } + + /* init cache_info for new cache */ + cache_info_init_default(cache); + cache_mode_set(cache, opts->cache_mode); + if (opts->data_crc) + cache->cache_info.flags |= PCACHE_CACHE_FLAGS_DATA_CRC; + + return 0; +} + +static void cache_info_set_gc_percent(struct pcache_cache_info *cache_info, u8 percent) +{ + cache_info->flags &= ~PCACHE_CACHE_FLAGS_GC_PERCENT_MASK; + cache_info->flags |= FIELD_PREP(PCACHE_CACHE_FLAGS_GC_PERCENT_MASK, percent); +} + +int pcache_cache_set_gc_percent(struct pcache_cache *cache, u8 percent) +{ + if (percent > PCACHE_CACHE_GC_PERCENT_MAX || percent < PCACHE_CACHE_GC_PERCENT_MIN) + return -EINVAL; + + mutex_lock(&cache->cache_info_lock); + cache_info_set_gc_percent(&cache->cache_info, percent); + + cache_info_write(cache); + mutex_unlock(&cache->cache_info_lock); + + return 0; +} + +void cache_pos_encode(struct pcache_cache *cache, + struct pcache_cache_pos_onmedia *pos_onmedia_base, + struct pcache_cache_pos *pos, u64 seq, u32 *index) +{ + struct pcache_cache_pos_onmedia pos_onmedia; + struct pcache_cache_pos_onmedia *pos_onmedia_addr = pos_onmedia_base + *index; + + pos_onmedia.cache_seg_id = pos->cache_seg->cache_seg_id; + pos_onmedia.seg_off = pos->seg_off; + pos_onmedia.header.seq = seq; + pos_onmedia.header.crc = cache_pos_onmedia_crc(&pos_onmedia); + + memcpy_flushcache(pos_onmedia_addr, &pos_onmedia, sizeof(struct pcache_cache_pos_onmedia)); + pmem_wmb(); + + *index = (*index + 1) % PCACHE_META_INDEX_MAX; +} + +int cache_pos_decode(struct pcache_cache *cache, + struct pcache_cache_pos_onmedia *pos_onmedia, + struct pcache_cache_pos *pos, u64 *seq, u32 *index) +{ + struct pcache_cache_pos_onmedia latest, *latest_addr; + + latest_addr = pcache_meta_find_latest(&pos_onmedia->header, + sizeof(struct pcache_cache_pos_onmedia), + sizeof(struct pcache_cache_pos_onmedia), + &latest); + if (IS_ERR(latest_addr)) + return PTR_ERR(latest_addr); + + if (!latest_addr) + return -EIO; + + pos->cache_seg = &cache->segments[latest.cache_seg_id]; + pos->seg_off = latest.seg_off; + *seq = latest.header.seq; + *index = (latest_addr - pos_onmedia); + + return 0; +} + +static inline void cache_info_set_seg_id(struct pcache_cache *cache, u32 seg_id) +{ + cache->cache_info.seg_id = seg_id; +} + +static int cache_init(struct dm_pcache *pcache) +{ + struct pcache_cache *cache = &pcache->cache; + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + struct pcache_cache_dev *cache_dev = &pcache->cache_dev; + int ret; + + cache->segments = kvcalloc(cache_dev->seg_num, sizeof(struct pcache_cache_segment), GFP_KERNEL); + if (!cache->segments) { + ret = -ENOMEM; + goto err; + } + + cache->seg_map = kvcalloc(BITS_TO_LONGS(cache_dev->seg_num), sizeof(unsigned long), GFP_KERNEL); + if (!cache->seg_map) { + ret = -ENOMEM; + goto free_segments; + } + + cache->backing_dev = backing_dev; + cache->cache_dev = &pcache->cache_dev; + cache->n_segs = cache_dev->seg_num; + atomic_set(&cache->gc_errors, 0); + spin_lock_init(&cache->seg_map_lock); + spin_lock_init(&cache->key_head_lock); + + mutex_init(&cache->cache_info_lock); + mutex_init(&cache->key_tail_lock); + mutex_init(&cache->dirty_tail_lock); + mutex_init(&cache->writeback_lock); + + INIT_DELAYED_WORK(&cache->writeback_work, cache_writeback_fn); + INIT_DELAYED_WORK(&cache->gc_work, pcache_cache_gc_fn); + INIT_WORK(&cache->clean_work, clean_fn); + + return 0; + +free_segments: + kvfree(cache->segments); +err: + return ret; +} + +static void cache_exit(struct pcache_cache 
*cache) +{ + kvfree(cache->seg_map); + kvfree(cache->segments); +} + +static void cache_info_init_default(struct pcache_cache *cache) +{ + struct pcache_cache_info *cache_info = &cache->cache_info; + + cache_info->header.seq = 0; + cache_info->n_segs = cache->cache_dev->seg_num; + cache_info_set_gc_percent(cache_info, PCACHE_CACHE_GC_PERCENT_DEFAULT); +} + +static int cache_tail_init(struct pcache_cache *cache) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + bool new_cache = !(cache->cache_info.flags & PCACHE_CACHE_FLAGS_INIT_DONE); + + if (new_cache) { + __set_bit(0, cache->seg_map); + + cache->key_head.cache_seg = &cache->segments[0]; + cache->key_head.seg_off = 0; + cache_pos_copy(&cache->key_tail, &cache->key_head); + cache_pos_copy(&cache->dirty_tail, &cache->key_head); + + cache_encode_dirty_tail(cache); + cache_encode_key_tail(cache); + } else { + if (cache_decode_key_tail(cache) || cache_decode_dirty_tail(cache)) { + pcache_dev_err(pcache, "Corrupted key tail or dirty tail.\n"); + return -EIO; + } + } + + return 0; +} + +static int get_seg_id(struct pcache_cache *cache, + struct pcache_cache_segment *prev_cache_seg, + bool new_cache, u32 *seg_id) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_dev *cache_dev = cache->cache_dev; + int ret; + + if (new_cache) { + ret = cache_dev_get_empty_segment_id(cache_dev, seg_id); + if (ret) { + pcache_dev_err(pcache, "no available segment\n"); + goto err; + } + + if (prev_cache_seg) + cache_seg_set_next_seg(prev_cache_seg, *seg_id); + else + cache_info_set_seg_id(cache, *seg_id); + } else { + if (prev_cache_seg) { + struct pcache_segment_info *prev_seg_info; + + prev_seg_info = &prev_cache_seg->cache_seg_info; + if (!segment_info_has_next(prev_seg_info)) { + ret = -EFAULT; + goto err; + } + *seg_id = prev_cache_seg->cache_seg_info.next_seg; + } else { + *seg_id = cache->cache_info.seg_id; + } + } + return 0; +err: + return ret; +} + +static int cache_segs_init(struct pcache_cache *cache) +{ + struct pcache_cache_segment *prev_cache_seg = NULL; + struct pcache_cache_info *cache_info = &cache->cache_info; + bool new_cache = !(cache->cache_info.flags & PCACHE_CACHE_FLAGS_INIT_DONE); + u32 seg_id; + int ret; + u32 i; + + for (i = 0; i < cache_info->n_segs; i++) { + ret = get_seg_id(cache, prev_cache_seg, new_cache, &seg_id); + if (ret) + goto err; + + ret = cache_seg_init(cache, seg_id, i, new_cache); + if (ret) + goto err; + + prev_cache_seg = &cache->segments[i]; + } + return 0; +err: + return ret; +} + +static int cache_init_req_keys(struct pcache_cache *cache, u32 n_paral) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + u32 n_subtrees; + int ret; + u32 i, cpu; + + /* Calculate number of cache trees based on the device size */ + n_subtrees = DIV_ROUND_UP(cache->dev_size << SECTOR_SHIFT, PCACHE_CACHE_SUBTREE_SIZE); + ret = cache_tree_init(cache, &cache->req_key_tree, n_subtrees); + if (ret) + goto err; + + cache->n_ksets = n_paral; + cache->ksets = kvcalloc(cache->n_ksets, PCACHE_KSET_SIZE, GFP_KERNEL); + if (!cache->ksets) { + ret = -ENOMEM; + goto req_tree_exit; + } + + /* + * Initialize each kset with a spinlock and delayed work for flushing. + * Each kset is associated with one queue to ensure independent handling + * of cache keys across multiple queues, maximizing multiqueue concurrency. 
+ */ + for (i = 0; i < cache->n_ksets; i++) { + struct pcache_cache_kset *kset = get_kset(cache, i); + + kset->cache = cache; + spin_lock_init(&kset->kset_lock); + INIT_DELAYED_WORK(&kset->flush_work, kset_flush_fn); + } + + cache->data_heads = alloc_percpu(struct pcache_cache_data_head); + if (!cache->data_heads) { + ret = -ENOMEM; + goto free_kset; + } + + for_each_possible_cpu(cpu) { + struct pcache_cache_data_head *h = + per_cpu_ptr(cache->data_heads, cpu); + h->head_pos.cache_seg = NULL; + } + + /* + * Replay persisted cache keys using cache_replay. + * This function loads and replays cache keys from previously stored + * ksets, allowing the cache to restore its state after a restart. + */ + ret = cache_replay(cache); + if (ret) { + pcache_dev_err(pcache, "failed to replay keys\n"); + goto free_heads; + } + + return 0; + +free_heads: + free_percpu(cache->data_heads); +free_kset: + kvfree(cache->ksets); +req_tree_exit: + cache_tree_exit(&cache->req_key_tree); +err: + return ret; +} + +static void cache_destroy_req_keys(struct pcache_cache *cache) +{ + u32 i; + + for (i = 0; i < cache->n_ksets; i++) { + struct pcache_cache_kset *kset = get_kset(cache, i); + + cancel_delayed_work_sync(&kset->flush_work); + } + + free_percpu(cache->data_heads); + kvfree(cache->ksets); + cache_tree_exit(&cache->req_key_tree); +} + +int pcache_cache_start(struct dm_pcache *pcache) +{ + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + struct pcache_cache *cache = &pcache->cache; + struct pcache_cache_options *opts = &pcache->opts; + int ret; + + ret = cache_init(pcache); + if (ret) + return ret; + + cache->cache_info_addr = CACHE_DEV_CACHE_INFO(cache->cache_dev); + cache->cache_ctrl = CACHE_DEV_CACHE_CTRL(cache->cache_dev); + backing_dev->cache = cache; + cache->dev_size = backing_dev->dev_size; + + ret = cache_info_init(cache, opts); + if (ret) + goto cache_exit; + + ret = cache_segs_init(cache); + if (ret) + goto cache_exit; + + ret = cache_tail_init(cache); + if (ret) + goto cache_exit; + + ret = cache_init_req_keys(cache, num_online_cpus()); + if (ret) + goto cache_exit; + + ret = cache_writeback_init(cache); + if (ret) + goto destroy_keys; + + cache->cache_info.flags |= PCACHE_CACHE_FLAGS_INIT_DONE; + cache_info_write(cache); + queue_delayed_work(cache_get_wq(cache), &cache->gc_work, 0); + + return 0; + +destroy_keys: + cache_destroy_req_keys(cache); +cache_exit: + cache_exit(cache); + + return ret; +} + +void pcache_cache_stop(struct dm_pcache *pcache) +{ + struct pcache_cache *cache = &pcache->cache; + + cache_flush(cache); + + cancel_delayed_work_sync(&cache->gc_work); + flush_work(&cache->clean_work); + cache_writeback_exit(cache); + + if (cache->req_key_tree.n_subtrees) + cache_destroy_req_keys(cache); + + cache_exit(cache); +} + +struct workqueue_struct *cache_get_wq(struct pcache_cache *cache) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + + return pcache->task_wq; +} + +int pcache_cache_init(void) +{ + key_cache = KMEM_CACHE(pcache_cache_key, 0); + if (!key_cache) + return -ENOMEM; + + return 0; +} + +void pcache_cache_exit(void) +{ + kmem_cache_destroy(key_cache); +} diff --git a/drivers/md/dm-pcache/cache.h b/drivers/md/dm-pcache/cache.h new file mode 100644 index 000000000000..1136d86958c8 --- /dev/null +++ b/drivers/md/dm-pcache/cache.h @@ -0,0 +1,635 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _PCACHE_CACHE_H +#define _PCACHE_CACHE_H + +#include "segment.h" + +/* Garbage collection thresholds */ +#define PCACHE_CACHE_GC_PERCENT_MIN 0 /* Minimum GC 
percentage */ +#define PCACHE_CACHE_GC_PERCENT_MAX 90 /* Maximum GC percentage */ +#define PCACHE_CACHE_GC_PERCENT_DEFAULT 70 /* Default GC percentage */ + +#define PCACHE_CACHE_SUBTREE_SIZE (4 * PCACHE_MB) /* 4MB total tree size */ +#define PCACHE_CACHE_SUBTREE_SIZE_MASK 0x3FFFFF /* Mask for tree size */ +#define PCACHE_CACHE_SUBTREE_SIZE_SHIFT 22 /* Bit shift for tree size */ + +/* Maximum number of keys per key set */ +#define PCACHE_KSET_KEYS_MAX 128 +#define PCACHE_CACHE_SEGS_MAX (1024 * 1024) /* maximum cache size for each device is 16T */ +#define PCACHE_KSET_ONMEDIA_SIZE_MAX struct_size_t(struct pcache_cache_kset_onmedia, data, PCACHE_KSET_KEYS_MAX) +#define PCACHE_KSET_SIZE (sizeof(struct pcache_cache_kset) + sizeof(struct pcache_cache_key_onmedia) * PCACHE_KSET_KEYS_MAX) + +/* Maximum number of keys to clean in one round of clean_work */ +#define PCACHE_CLEAN_KEYS_MAX 10 + +/* Writeback and garbage collection intervals in jiffies */ +#define PCACHE_CACHE_WRITEBACK_INTERVAL (5 * HZ) +#define PCACHE_CACHE_GC_INTERVAL (5 * HZ) + +/* Macro to get the cache key structure from an rb_node pointer */ +#define CACHE_KEY(node) (container_of(node, struct pcache_cache_key, rb_node)) + +struct pcache_cache_pos_onmedia { + struct pcache_meta_header header; + __u32 cache_seg_id; + __u32 seg_off; +}; + +/* Offset and size definitions for cache segment control */ +#define PCACHE_CACHE_SEG_CTRL_OFF (PCACHE_SEG_INFO_SIZE * PCACHE_META_INDEX_MAX) +#define PCACHE_CACHE_SEG_CTRL_SIZE (4 * PCACHE_KB) + +struct pcache_cache_seg_gen { + struct pcache_meta_header header; + __u64 gen; +}; + +/* Control structure for cache segments */ +struct pcache_cache_seg_ctrl { + struct pcache_cache_seg_gen gen[PCACHE_META_INDEX_MAX]; + __u64 res[64]; +}; + +#define PCACHE_CACHE_FLAGS_DATA_CRC BIT(0) +#define PCACHE_CACHE_FLAGS_INIT_DONE BIT(1) + +#define PCACHE_CACHE_FLAGS_CACHE_MODE_MASK GENMASK(5, 2) +#define PCACHE_CACHE_MODE_WRITEBACK 0 +#define PCACHE_CACHE_MODE_WRITETHROUGH 1 +#define PCACHE_CACHE_MODE_WRITEAROUND 2 +#define PCACHE_CACHE_MODE_WRITEONLY 3 + +#define PCACHE_CACHE_FLAGS_GC_PERCENT_MASK GENMASK(12, 6) + +struct pcache_cache_info { + struct pcache_meta_header header; + __u32 seg_id; + __u32 n_segs; + __u32 flags; + __u32 reserved; +}; + +struct pcache_cache_pos { + struct pcache_cache_segment *cache_seg; + u32 seg_off; +}; + +struct pcache_cache_segment { + struct pcache_cache *cache; + u32 cache_seg_id; /* Index in cache->segments */ + struct pcache_segment segment; + atomic_t refs; + + struct pcache_segment_info cache_seg_info; + struct mutex info_lock; + u32 info_index; + + spinlock_t gen_lock; + u64 gen; + u64 gen_seq; + u32 gen_index; + + struct pcache_cache_seg_ctrl *cache_seg_ctrl; +}; + +/* rbtree for cache entries */ +struct pcache_cache_subtree { + struct rb_root root; + spinlock_t tree_lock; +}; + +struct pcache_cache_tree { + struct pcache_cache *cache; + u32 n_subtrees; + mempool_t key_pool; + struct pcache_cache_subtree *subtrees; +}; + +extern struct kmem_cache *key_cache; + +struct pcache_cache_key { + struct pcache_cache_tree *cache_tree; + struct pcache_cache_subtree *cache_subtree; + struct kref ref; + struct rb_node rb_node; + struct list_head list_node; + u64 off; + u32 len; + u32 flags; + struct pcache_cache_pos cache_pos; + u64 seg_gen; +}; + +#define PCACHE_CACHE_KEY_FLAGS_EMPTY BIT(0) +#define PCACHE_CACHE_KEY_FLAGS_CLEAN BIT(1) + +struct pcache_cache_key_onmedia { + __u64 off; + __u32 len; + __u32 flags; + __u32 cache_seg_id; + __u32 cache_seg_off; + __u64 seg_gen; + __u32 
data_crc; + __u32 reserved; +}; + +struct pcache_cache_kset_onmedia { + __u32 crc; + union { + __u32 key_num; + __u32 next_cache_seg_id; + }; + __u64 magic; + __u64 flags; + struct pcache_cache_key_onmedia data[]; +}; + +struct pcache_cache { + struct pcache_backing_dev *backing_dev; + struct pcache_cache_dev *cache_dev; + struct pcache_cache_ctrl *cache_ctrl; + u64 dev_size; + + struct pcache_cache_data_head __percpu *data_heads; + + spinlock_t key_head_lock; + struct pcache_cache_pos key_head; + u32 n_ksets; + struct pcache_cache_kset *ksets; + + struct mutex key_tail_lock; + struct pcache_cache_pos key_tail; + u64 key_tail_seq; + u32 key_tail_index; + + struct mutex dirty_tail_lock; + struct pcache_cache_pos dirty_tail; + u64 dirty_tail_seq; + u32 dirty_tail_index; + + struct pcache_cache_tree req_key_tree; + struct work_struct clean_work; + + struct mutex writeback_lock; + char wb_kset_onmedia_buf[PCACHE_KSET_ONMEDIA_SIZE_MAX]; + struct pcache_cache_tree writeback_key_tree; + struct delayed_work writeback_work; + struct { + atomic_t pending; + u32 advance; + int ret; + } writeback_ctx; + + char gc_kset_onmedia_buf[PCACHE_KSET_ONMEDIA_SIZE_MAX]; + struct delayed_work gc_work; + atomic_t gc_errors; + + struct mutex cache_info_lock; + struct pcache_cache_info cache_info; + struct pcache_cache_info *cache_info_addr; + u32 info_index; + + u32 n_segs; + unsigned long *seg_map; + u32 last_cache_seg; + bool cache_full; + spinlock_t seg_map_lock; + struct pcache_cache_segment *segments; +}; + +struct workqueue_struct *cache_get_wq(struct pcache_cache *cache); + +struct dm_pcache; +struct pcache_cache_options { + u32 cache_mode:4; + u32 data_crc:1; +}; +int pcache_cache_start(struct dm_pcache *pcache); +void pcache_cache_stop(struct dm_pcache *pcache); + +struct pcache_cache_ctrl { + /* Updated by gc_thread */ + struct pcache_cache_pos_onmedia key_tail_pos[PCACHE_META_INDEX_MAX]; + + /* Updated by writeback_thread */ + struct pcache_cache_pos_onmedia dirty_tail_pos[PCACHE_META_INDEX_MAX]; +}; + +struct pcache_cache_data_head { + struct pcache_cache_pos head_pos; +}; + +static inline u16 pcache_cache_get_gc_percent(struct pcache_cache *cache) +{ + return FIELD_GET(PCACHE_CACHE_FLAGS_GC_PERCENT_MASK, cache->cache_info.flags); +} + +int pcache_cache_set_gc_percent(struct pcache_cache *cache, u8 percent); + +/* cache key */ +struct pcache_cache_key *cache_key_alloc(struct pcache_cache_tree *cache_tree, gfp_t gfp_mask); +void cache_key_init(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key); +void cache_key_get(struct pcache_cache_key *key); +void cache_key_put(struct pcache_cache_key *key); +int cache_key_append(struct pcache_cache *cache, struct pcache_cache_key *key, bool force_close); +void cache_key_insert(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key, bool fixup); +int cache_key_decode(struct pcache_cache *cache, + struct pcache_cache_key_onmedia *key_onmedia, + struct pcache_cache_key *key); +void cache_pos_advance(struct pcache_cache_pos *pos, u32 len); + +#define PCACHE_KSET_FLAGS_LAST BIT(0) +#define PCACHE_KSET_MAGIC 0x676894a64e164f1aULL + +struct pcache_cache_kset { + struct pcache_cache *cache; + spinlock_t kset_lock; + struct delayed_work flush_work; + struct pcache_cache_kset_onmedia kset_onmedia; +}; + +extern struct pcache_cache_kset_onmedia pcache_empty_kset; + +#define SUBTREE_WALK_RET_OK 0 +#define SUBTREE_WALK_RET_ERR 1 +#define SUBTREE_WALK_RET_NEED_KEY 2 +#define SUBTREE_WALK_RET_NEED_REQ 3 +#define SUBTREE_WALK_RET_RESEARCH 4 + +struct 
pcache_cache_subtree_walk_ctx { + struct pcache_cache_tree *cache_tree; + struct rb_node *start_node; + struct pcache_request *pcache_req; + struct pcache_cache_key *key; + u32 req_done; + int ret; + + /* pre-allocated key and backing_dev_req */ + struct pcache_cache_key *pre_alloc_key; + struct pcache_backing_dev_req *pre_alloc_req; + + struct list_head *delete_key_list; + struct list_head *submit_req_list; + + /* + * |--------| key_tmp + * |====| key + */ + int (*before)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + /* + * |----------| key_tmp + * |=====| key + */ + int (*after)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + /* + * |----------------| key_tmp + * |===========| key + */ + int (*overlap_tail)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + /* + * |--------| key_tmp + * |==========| key + */ + int (*overlap_head)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + /* + * |----| key_tmp + * |==========| key + */ + int (*overlap_contain)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + /* + * |-----------| key_tmp + * |====| key + */ + int (*overlap_contained)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + int (*walk_finally)(struct pcache_cache_subtree_walk_ctx *ctx, int ret); + bool (*walk_done)(struct pcache_cache_subtree_walk_ctx *ctx); +}; + +int cache_subtree_walk(struct pcache_cache_subtree_walk_ctx *ctx); +struct rb_node *cache_subtree_search(struct pcache_cache_subtree *cache_subtree, struct pcache_cache_key *key, + struct rb_node **parentp, struct rb_node ***newp, + struct list_head *delete_key_list); +int cache_kset_close(struct pcache_cache *cache, struct pcache_cache_kset *kset); +void clean_fn(struct work_struct *work); +void kset_flush_fn(struct work_struct *work); +int cache_replay(struct pcache_cache *cache); +int cache_tree_init(struct pcache_cache *cache, struct pcache_cache_tree *cache_tree, u32 n_subtrees); +void cache_tree_clear(struct pcache_cache_tree *cache_tree); +void cache_tree_exit(struct pcache_cache_tree *cache_tree); + +/* cache segments */ +struct pcache_cache_segment *get_cache_segment(struct pcache_cache *cache); +int cache_seg_init(struct pcache_cache *cache, u32 seg_id, u32 cache_seg_id, + bool new_cache); +void cache_seg_get(struct pcache_cache_segment *cache_seg); +void cache_seg_put(struct pcache_cache_segment *cache_seg); +void cache_seg_set_next_seg(struct pcache_cache_segment *cache_seg, u32 seg_id); + +/* cache request*/ +int cache_flush(struct pcache_cache *cache); +void miss_read_end_work_fn(struct work_struct *work); +int pcache_cache_handle_req(struct pcache_cache *cache, struct pcache_request *pcache_req); + +/* gc */ +void pcache_cache_gc_fn(struct work_struct *work); + +/* writeback */ +void cache_writeback_exit(struct pcache_cache *cache); +int cache_writeback_init(struct pcache_cache *cache); +void cache_writeback_fn(struct work_struct *work); + +/* inline functions */ +static inline struct pcache_cache_subtree *get_subtree(struct pcache_cache_tree *cache_tree, u64 off) +{ + if (cache_tree->n_subtrees == 1) + return &cache_tree->subtrees[0]; + + return &cache_tree->subtrees[off >> PCACHE_CACHE_SUBTREE_SIZE_SHIFT]; +} + +static inline 
void *cache_pos_addr(struct pcache_cache_pos *pos) +{ + return (pos->cache_seg->segment.data + pos->seg_off); +} + +static inline void *get_key_head_addr(struct pcache_cache *cache) +{ + return cache_pos_addr(&cache->key_head); +} + +static inline u32 get_kset_id(struct pcache_cache *cache, u64 off) +{ + u32 kset_id; + + div_u64_rem(off >> PCACHE_CACHE_SUBTREE_SIZE_SHIFT, cache->n_ksets, &kset_id); + + return kset_id; +} + +static inline struct pcache_cache_kset *get_kset(struct pcache_cache *cache, u32 kset_id) +{ + return (void *)cache->ksets + PCACHE_KSET_SIZE * kset_id; +} + +static inline struct pcache_cache_data_head *get_data_head(struct pcache_cache *cache) +{ + return this_cpu_ptr(cache->data_heads); +} + +static inline bool cache_key_empty(struct pcache_cache_key *key) +{ + return key->flags & PCACHE_CACHE_KEY_FLAGS_EMPTY; +} + +static inline bool cache_key_clean(struct pcache_cache_key *key) +{ + return key->flags & PCACHE_CACHE_KEY_FLAGS_CLEAN; +} + +static inline void cache_pos_copy(struct pcache_cache_pos *dst, struct pcache_cache_pos *src) +{ + memcpy(dst, src, sizeof(struct pcache_cache_pos)); +} + +/** + * cache_seg_is_ctrl_seg - Checks if a cache segment is a cache ctrl segment. + * @cache_seg_id: ID of the cache segment. + * + * Returns true if the cache segment ID corresponds to a cache ctrl segment. + * + * Note: We extend the segment control of the first cache segment + * (cache segment ID 0) to serve as the cache control (pcache_cache_ctrl) + * for the entire PCACHE cache. This function determines whether the given + * cache segment is the one storing the pcache_cache_ctrl information. + */ +static inline bool cache_seg_is_ctrl_seg(u32 cache_seg_id) +{ + return (cache_seg_id == 0); +} + +/** + * cache_key_cutfront - Cuts a specified length from the front of a cache key. + * @key: Pointer to pcache_cache_key structure. + * @cut_len: Length to cut from the front. + * + * Advances the cache key position by cut_len and adjusts offset and length accordingly. + */ +static inline void cache_key_cutfront(struct pcache_cache_key *key, u32 cut_len) +{ + if (key->cache_pos.cache_seg) + cache_pos_advance(&key->cache_pos, cut_len); + + key->off += cut_len; + key->len -= cut_len; +} + +/** + * cache_key_cutback - Cuts a specified length from the back of a cache key. + * @key: Pointer to pcache_cache_key structure. + * @cut_len: Length to cut from the back. + * + * Reduces the length of the cache key by cut_len. + */ +static inline void cache_key_cutback(struct pcache_cache_key *key, u32 cut_len) +{ + key->len -= cut_len; +} + +static inline void cache_key_delete(struct pcache_cache_key *key) +{ + struct pcache_cache_subtree *cache_subtree; + + cache_subtree = key->cache_subtree; + BUG_ON(!cache_subtree); + + rb_erase(&key->rb_node, &cache_subtree->root); + key->flags = 0; + cache_key_put(key); +} + +static inline bool cache_data_crc_on(struct pcache_cache *cache) +{ + return (cache->cache_info.flags & PCACHE_CACHE_FLAGS_DATA_CRC); +} + +static inline u32 cache_mode_get(struct pcache_cache *cache) +{ + return FIELD_GET(PCACHE_CACHE_FLAGS_CACHE_MODE_MASK, cache->cache_info.flags); +} + +static inline void cache_mode_set(struct pcache_cache *cache, u32 cache_mode) +{ + cache->cache_info.flags &= ~PCACHE_CACHE_FLAGS_CACHE_MODE_MASK; + cache->cache_info.flags |= FIELD_PREP(PCACHE_CACHE_FLAGS_CACHE_MODE_MASK, cache_mode); +} + +/** + * cache_key_data_crc - Calculates CRC for data in a cache key. + * @key: Pointer to the pcache_cache_key structure. 
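+ *	The key's cache_pos and len must already describe data resident in a
+ *	cache segment; cache_key_encode() and cache_key_decode() use this
+ *	helper when cache_data_crc_on() is set.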
+ * + * Returns the CRC-32 checksum of the data within the cache key's position. + */ +static inline u32 cache_key_data_crc(struct pcache_cache_key *key) +{ + void *data; + + data = cache_pos_addr(&key->cache_pos); + + return crc32c(PCACHE_CRC_SEED, data, key->len); +} + +static inline u32 cache_kset_crc(struct pcache_cache_kset_onmedia *kset_onmedia) +{ + u32 crc_size; + + if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) + crc_size = sizeof(struct pcache_cache_kset_onmedia) - 4; + else + crc_size = struct_size(kset_onmedia, data, kset_onmedia->key_num) - 4; + + return crc32c(PCACHE_CRC_SEED, (void *)kset_onmedia + 4, crc_size); +} + +static inline u32 get_kset_onmedia_size(struct pcache_cache_kset_onmedia *kset_onmedia) +{ + return struct_size_t(struct pcache_cache_kset_onmedia, data, kset_onmedia->key_num); +} + +/** + * cache_seg_remain - Computes remaining space in a cache segment. + * @pos: Pointer to pcache_cache_pos structure. + * + * Returns the amount of remaining space in the segment data starting from + * the current position offset. + */ +static inline u32 cache_seg_remain(struct pcache_cache_pos *pos) +{ + struct pcache_cache_segment *cache_seg; + struct pcache_segment *segment; + u32 seg_remain; + + cache_seg = pos->cache_seg; + segment = &cache_seg->segment; + seg_remain = segment->data_size - pos->seg_off; + + return seg_remain; +} + +/** + * cache_key_invalid - Checks if a cache key is invalid. + * @key: Pointer to pcache_cache_key structure. + * + * Returns true if the cache key is invalid due to its generation being + * less than the generation of its segment; otherwise returns false. + * + * When the GC (garbage collection) thread identifies a segment + * as reclaimable, it increments the segment's generation (gen). However, + * it does not immediately remove all related cache keys. When accessing + * such a cache key, this function can be used to determine if the cache + * key has already become invalid. + */ +static inline bool cache_key_invalid(struct pcache_cache_key *key) +{ + if (cache_key_empty(key)) + return false; + + return (key->seg_gen < key->cache_pos.cache_seg->gen); +} + +/** + * cache_key_lstart - Retrieves the logical start offset of a cache key. + * @key: Pointer to pcache_cache_key structure. + * + * Returns the logical start offset for the cache key. + */ +static inline u64 cache_key_lstart(struct pcache_cache_key *key) +{ + return key->off; +} + +/** + * cache_key_lend - Retrieves the logical end offset of a cache key. + * @key: Pointer to pcache_cache_key structure. + * + * Returns the logical end offset for the cache key. + */ +static inline u64 cache_key_lend(struct pcache_cache_key *key) +{ + return key->off + key->len; +} + +static inline void cache_key_copy(struct pcache_cache_key *key_dst, struct pcache_cache_key *key_src) +{ + key_dst->off = key_src->off; + key_dst->len = key_src->len; + key_dst->seg_gen = key_src->seg_gen; + key_dst->cache_tree = key_src->cache_tree; + key_dst->cache_subtree = key_src->cache_subtree; + key_dst->flags = key_src->flags; + + cache_pos_copy(&key_dst->cache_pos, &key_src->cache_pos); +} + +/** + * cache_pos_onmedia_crc - Calculates the CRC for an on-media cache position. + * @pos_om: Pointer to pcache_cache_pos_onmedia structure. + * + * Calculates the CRC-32 checksum of the position, excluding the first 4 bytes. + * Returns the computed CRC value. 
+ */ +static inline u32 cache_pos_onmedia_crc(struct pcache_cache_pos_onmedia *pos_om) +{ + return pcache_meta_crc(&pos_om->header, sizeof(struct pcache_cache_pos_onmedia)); +} + +void cache_pos_encode(struct pcache_cache *cache, + struct pcache_cache_pos_onmedia *pos_onmedia, + struct pcache_cache_pos *pos, u64 seq, u32 *index); +int cache_pos_decode(struct pcache_cache *cache, + struct pcache_cache_pos_onmedia *pos_onmedia, + struct pcache_cache_pos *pos, u64 *seq, u32 *index); + +static inline void cache_encode_key_tail(struct pcache_cache *cache) +{ + cache_pos_encode(cache, cache->cache_ctrl->key_tail_pos, + &cache->key_tail, ++cache->key_tail_seq, + &cache->key_tail_index); +} + +static inline int cache_decode_key_tail(struct pcache_cache *cache) +{ + return cache_pos_decode(cache, cache->cache_ctrl->key_tail_pos, + &cache->key_tail, &cache->key_tail_seq, + &cache->key_tail_index); +} + +static inline void cache_encode_dirty_tail(struct pcache_cache *cache) +{ + cache_pos_encode(cache, cache->cache_ctrl->dirty_tail_pos, + &cache->dirty_tail, ++cache->dirty_tail_seq, + &cache->dirty_tail_index); +} + +static inline int cache_decode_dirty_tail(struct pcache_cache *cache) +{ + return cache_pos_decode(cache, cache->cache_ctrl->dirty_tail_pos, + &cache->dirty_tail, &cache->dirty_tail_seq, + &cache->dirty_tail_index); +} + +int pcache_cache_init(void); +void pcache_cache_exit(void); +#endif /* _PCACHE_CACHE_H */ diff --git a/drivers/md/dm-pcache/cache_dev.c b/drivers/md/dm-pcache/cache_dev.c new file mode 100644 index 000000000000..ece689e6ce59 --- /dev/null +++ b/drivers/md/dm-pcache/cache_dev.c @@ -0,0 +1,303 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/blkdev.h> +#include <linux/dax.h> +#include <linux/vmalloc.h> +#include <linux/parser.h> + +#include "cache_dev.h" +#include "backing_dev.h" +#include "cache.h" +#include "dm_pcache.h" + +static void cache_dev_dax_exit(struct pcache_cache_dev *cache_dev) +{ + if (cache_dev->use_vmap) + vunmap(cache_dev->mapping); +} + +static int build_vmap(struct dax_device *dax_dev, long total_pages, void **vaddr) +{ + struct page **pages; + long i = 0, chunk; + unsigned long pfn; + int ret; + + pages = vmalloc_array(total_pages, sizeof(struct page *)); + if (!pages) + return -ENOMEM; + + do { + chunk = dax_direct_access(dax_dev, i, total_pages - i, + DAX_ACCESS, NULL, &pfn); + if (chunk <= 0) { + ret = chunk ? 
chunk : -EINVAL; + goto out_free; + } + + if (!pfn_valid(pfn)) { + ret = -EOPNOTSUPP; + goto out_free; + } + + while (chunk-- && i < total_pages) { + pages[i++] = pfn_to_page(pfn); + pfn++; + if (!(i & 15)) + cond_resched(); + } + } while (i < total_pages); + + *vaddr = vmap(pages, total_pages, VM_MAP, PAGE_KERNEL); + if (!*vaddr) { + ret = -ENOMEM; + goto out_free; + } + + ret = 0; + +out_free: + vfree(pages); + return ret; +} + +static int cache_dev_dax_init(struct pcache_cache_dev *cache_dev) +{ + struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev); + struct dax_device *dax_dev; + long total_pages, mapped_pages; + u64 bdev_size; + void *vaddr; + int ret; + int id; + unsigned long pfn; + + dax_dev = cache_dev->dm_dev->dax_dev; + /* total size check */ + bdev_size = bdev_nr_bytes(cache_dev->dm_dev->bdev); + if (bdev_size < PCACHE_CACHE_DEV_SIZE_MIN) { + pcache_dev_err(pcache, "dax device is too small, required at least %llu", + PCACHE_CACHE_DEV_SIZE_MIN); + ret = -ENOSPC; + goto out; + } + + total_pages = bdev_size >> PAGE_SHIFT; + /* attempt: direct-map the whole range */ + id = dax_read_lock(); + mapped_pages = dax_direct_access(dax_dev, 0, total_pages, + DAX_ACCESS, &vaddr, &pfn); + if (mapped_pages < 0) { + pcache_dev_err(pcache, "dax_direct_access failed: %ld\n", mapped_pages); + ret = mapped_pages; + goto unlock; + } + + if (!pfn_valid(pfn)) { + ret = -EOPNOTSUPP; + goto unlock; + } + + if (mapped_pages == total_pages) { + /* success: contiguous direct mapping */ + cache_dev->mapping = vaddr; + } else { + /* need vmap fallback */ + ret = build_vmap(dax_dev, total_pages, &vaddr); + if (ret) { + pcache_dev_err(pcache, "vmap fallback failed: %d\n", ret); + goto unlock; + } + + cache_dev->mapping = vaddr; + cache_dev->use_vmap = true; + } + dax_read_unlock(id); + + return 0; +unlock: + dax_read_unlock(id); +out: + return ret; +} + +void cache_dev_zero_range(struct pcache_cache_dev *cache_dev, void *pos, u32 size) +{ + memset(pos, 0, size); + dax_flush(cache_dev->dm_dev->dax_dev, pos, size); +} + +static int sb_read(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb) +{ + struct pcache_sb *sb_addr = CACHE_DEV_SB(cache_dev); + + if (copy_mc_to_kernel(sb, sb_addr, sizeof(struct pcache_sb))) + return -EIO; + + return 0; +} + +static void sb_write(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb) +{ + struct pcache_sb *sb_addr = CACHE_DEV_SB(cache_dev); + + memcpy_flushcache(sb_addr, sb, sizeof(struct pcache_sb)); + pmem_wmb(); +} + +static int sb_init(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb) +{ + struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev); + u64 nr_segs; + u64 cache_dev_size; + u64 magic; + u32 flags = 0; + + magic = le64_to_cpu(sb->magic); + if (magic) + return -EEXIST; + + cache_dev_size = bdev_nr_bytes(file_bdev(cache_dev->dm_dev->bdev_file)); + if (cache_dev_size < PCACHE_CACHE_DEV_SIZE_MIN) { + pcache_dev_err(pcache, "dax device is too small, required at least %llu", + PCACHE_CACHE_DEV_SIZE_MIN); + return -ENOSPC; + } + + nr_segs = (cache_dev_size - PCACHE_SEGMENTS_OFF) / ((PCACHE_SEG_SIZE)); + +#if defined(__BYTE_ORDER) ? 
(__BIG_ENDIAN == __BYTE_ORDER) : defined(__BIG_ENDIAN) + flags |= PCACHE_SB_F_BIGENDIAN; +#endif + sb->flags = cpu_to_le32(flags); + sb->magic = cpu_to_le64(PCACHE_MAGIC); + sb->seg_num = cpu_to_le32(nr_segs); + sb->crc = cpu_to_le32(crc32c(PCACHE_CRC_SEED, (void *)(sb) + 4, sizeof(struct pcache_sb) - 4)); + + cache_dev_zero_range(cache_dev, CACHE_DEV_CACHE_INFO(cache_dev), + PCACHE_CACHE_INFO_SIZE * PCACHE_META_INDEX_MAX + + PCACHE_CACHE_CTRL_SIZE); + + return 0; +} + +static int sb_validate(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb) +{ + struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev); + u32 flags; + u32 crc; + + if (le64_to_cpu(sb->magic) != PCACHE_MAGIC) { + pcache_dev_err(pcache, "unexpected magic: %llx\n", + le64_to_cpu(sb->magic)); + return -EINVAL; + } + + crc = crc32c(PCACHE_CRC_SEED, (void *)(sb) + 4, sizeof(struct pcache_sb) - 4); + if (crc != le32_to_cpu(sb->crc)) { + pcache_dev_err(pcache, "corrupted sb: %u, expected: %u\n", crc, le32_to_cpu(sb->crc)); + return -EINVAL; + } + + flags = le32_to_cpu(sb->flags); +#if defined(__BYTE_ORDER) ? (__BIG_ENDIAN == __BYTE_ORDER) : defined(__BIG_ENDIAN) + if (!(flags & PCACHE_SB_F_BIGENDIAN)) { + pcache_dev_err(pcache, "cache_dev is not big endian\n"); + return -EINVAL; + } +#else + if (flags & PCACHE_SB_F_BIGENDIAN) { + pcache_dev_err(pcache, "cache_dev is big endian\n"); + return -EINVAL; + } +#endif + return 0; +} + +static int cache_dev_init(struct pcache_cache_dev *cache_dev, u32 seg_num) +{ + cache_dev->seg_num = seg_num; + cache_dev->seg_bitmap = kvcalloc(BITS_TO_LONGS(cache_dev->seg_num), sizeof(unsigned long), GFP_KERNEL); + if (!cache_dev->seg_bitmap) + return -ENOMEM; + + return 0; +} + +static void cache_dev_exit(struct pcache_cache_dev *cache_dev) +{ + kvfree(cache_dev->seg_bitmap); +} + +void cache_dev_stop(struct dm_pcache *pcache) +{ + struct pcache_cache_dev *cache_dev = &pcache->cache_dev; + + cache_dev_exit(cache_dev); + cache_dev_dax_exit(cache_dev); +} + +int cache_dev_start(struct dm_pcache *pcache) +{ + struct pcache_cache_dev *cache_dev = &pcache->cache_dev; + struct pcache_sb sb; + bool format = false; + int ret; + + mutex_init(&cache_dev->seg_lock); + + ret = cache_dev_dax_init(cache_dev); + if (ret) { + pcache_dev_err(pcache, "failed to init cache_dev %s via dax way: %d.", + cache_dev->dm_dev->name, ret); + goto err; + } + + ret = sb_read(cache_dev, &sb); + if (ret) + goto dax_release; + + if (le64_to_cpu(sb.magic) == 0) { + format = true; + ret = sb_init(cache_dev, &sb); + if (ret < 0) + goto dax_release; + } + + ret = sb_validate(cache_dev, &sb); + if (ret) + goto dax_release; + + cache_dev->sb_flags = le32_to_cpu(sb.flags); + ret = cache_dev_init(cache_dev, le32_to_cpu(sb.seg_num)); + if (ret) + goto dax_release; + + if (format) + sb_write(cache_dev, &sb); + + return 0; + +dax_release: + cache_dev_dax_exit(cache_dev); +err: + return ret; +} + +int cache_dev_get_empty_segment_id(struct pcache_cache_dev *cache_dev, u32 *seg_id) +{ + int ret; + + mutex_lock(&cache_dev->seg_lock); + *seg_id = find_next_zero_bit(cache_dev->seg_bitmap, cache_dev->seg_num, 0); + if (*seg_id == cache_dev->seg_num) { + ret = -ENOSPC; + goto unlock; + } + + __set_bit(*seg_id, cache_dev->seg_bitmap); + ret = 0; +unlock: + mutex_unlock(&cache_dev->seg_lock); + return ret; +} diff --git a/drivers/md/dm-pcache/cache_dev.h b/drivers/md/dm-pcache/cache_dev.h new file mode 100644 index 000000000000..6251eb4ebe96 --- /dev/null +++ b/drivers/md/dm-pcache/cache_dev.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: 
GPL-2.0-or-later */ +#ifndef _PCACHE_CACHE_DEV_H +#define _PCACHE_CACHE_DEV_H + +#include <linux/device.h> +#include <linux/device-mapper.h> + +#include "pcache_internal.h" + +#define PCACHE_MAGIC 0x65B05EFA96C596EFULL + +#define PCACHE_SB_OFF (4 * PCACHE_KB) +#define PCACHE_SB_SIZE (4 * PCACHE_KB) + +#define PCACHE_CACHE_INFO_OFF (PCACHE_SB_OFF + PCACHE_SB_SIZE) +#define PCACHE_CACHE_INFO_SIZE (4 * PCACHE_KB) + +#define PCACHE_CACHE_CTRL_OFF (PCACHE_CACHE_INFO_OFF + (PCACHE_CACHE_INFO_SIZE * PCACHE_META_INDEX_MAX)) +#define PCACHE_CACHE_CTRL_SIZE (4 * PCACHE_KB) + +#define PCACHE_SEGMENTS_OFF (PCACHE_CACHE_CTRL_OFF + PCACHE_CACHE_CTRL_SIZE) +#define PCACHE_SEG_INFO_SIZE (4 * PCACHE_KB) + +#define PCACHE_CACHE_DEV_SIZE_MIN (512 * PCACHE_MB) /* 512 MB */ +#define PCACHE_SEG_SIZE (16 * PCACHE_MB) /* Size of each PCACHE segment (16 MB) */ + +#define CACHE_DEV_SB(cache_dev) ((struct pcache_sb *)(cache_dev->mapping + PCACHE_SB_OFF)) +#define CACHE_DEV_CACHE_INFO(cache_dev) ((void *)cache_dev->mapping + PCACHE_CACHE_INFO_OFF) +#define CACHE_DEV_CACHE_CTRL(cache_dev) ((void *)cache_dev->mapping + PCACHE_CACHE_CTRL_OFF) +#define CACHE_DEV_SEGMENTS(cache_dev) ((void *)cache_dev->mapping + PCACHE_SEGMENTS_OFF) +#define CACHE_DEV_SEGMENT(cache_dev, id) ((void *)CACHE_DEV_SEGMENTS(cache_dev) + (u64)id * PCACHE_SEG_SIZE) + +/* + * PCACHE SB flags configured during formatting + * + * The PCACHE_SB_F_xxx flags define registration requirements based on cache_dev + * formatting. For a machine to register a cache_dev: + * - PCACHE_SB_F_BIGENDIAN: Requires a big-endian machine. + */ +#define PCACHE_SB_F_BIGENDIAN BIT(0) + +struct pcache_sb { + __le32 crc; + __le32 flags; + __le64 magic; + + __le32 seg_num; +}; + +struct pcache_cache_dev { + u32 sb_flags; + u32 seg_num; + void *mapping; + bool use_vmap; + + struct dm_dev *dm_dev; + + struct mutex seg_lock; + unsigned long *seg_bitmap; +}; + +struct dm_pcache; +int cache_dev_start(struct dm_pcache *pcache); +void cache_dev_stop(struct dm_pcache *pcache); + +void cache_dev_zero_range(struct pcache_cache_dev *cache_dev, void *pos, u32 size); + +int cache_dev_get_empty_segment_id(struct pcache_cache_dev *cache_dev, u32 *seg_id); + +#endif /* _PCACHE_CACHE_DEV_H */ diff --git a/drivers/md/dm-pcache/cache_gc.c b/drivers/md/dm-pcache/cache_gc.c new file mode 100644 index 000000000000..94f8b276a021 --- /dev/null +++ b/drivers/md/dm-pcache/cache_gc.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include "cache.h" +#include "backing_dev.h" +#include "cache_dev.h" +#include "dm_pcache.h" + +/** + * cache_key_gc - Releases the reference of a cache key segment. + * @cache: Pointer to the pcache_cache structure. + * @key: Pointer to the cache key to be garbage collected. + * + * This function decrements the reference count of the cache segment + * associated with the given key. If the reference count drops to zero, + * the segment may be invalidated and reused. 
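+ *
+ * The reference dropped here is the one taken for the key when its cache
+ * space was allocated (cache_data_alloc()) or when the key was replayed
+ * from media (kset_replay()); the segment can only be invalidated and
+ * reused once all such references have been dropped.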
+ */ +static void cache_key_gc(struct pcache_cache *cache, struct pcache_cache_key *key) +{ + cache_seg_put(key->cache_pos.cache_seg); +} + +static bool need_gc(struct pcache_cache *cache, struct pcache_cache_pos *dirty_tail, struct pcache_cache_pos *key_tail) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_kset_onmedia *kset_onmedia; + void *dirty_addr, *key_addr; + u32 segs_used, segs_gc_threshold, to_copy; + int ret; + + dirty_addr = cache_pos_addr(dirty_tail); + key_addr = cache_pos_addr(key_tail); + if (dirty_addr == key_addr) { + pcache_dev_debug(pcache, "key tail is equal to dirty tail: %u:%u\n", + dirty_tail->cache_seg->cache_seg_id, + dirty_tail->seg_off); + return false; + } + + kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->gc_kset_onmedia_buf; + + to_copy = min(PCACHE_KSET_ONMEDIA_SIZE_MAX, PCACHE_SEG_SIZE - key_tail->seg_off); + ret = copy_mc_to_kernel(kset_onmedia, key_addr, to_copy); + if (ret) { + pcache_dev_err(pcache, "error to read kset: %d", ret); + return false; + } + + /* Check if kset_onmedia is corrupted */ + if (kset_onmedia->magic != PCACHE_KSET_MAGIC) { + pcache_dev_debug(pcache, "gc error: magic is not as expected. key_tail: %u:%u magic: %llx, expected: %llx\n", + key_tail->cache_seg->cache_seg_id, key_tail->seg_off, + kset_onmedia->magic, PCACHE_KSET_MAGIC); + return false; + } + + /* Verify the CRC of the kset_onmedia */ + if (kset_onmedia->crc != cache_kset_crc(kset_onmedia)) { + pcache_dev_debug(pcache, "gc error: crc is not as expected. crc: %x, expected: %x\n", + cache_kset_crc(kset_onmedia), kset_onmedia->crc); + return false; + } + + segs_used = bitmap_weight(cache->seg_map, cache->n_segs); + segs_gc_threshold = cache->n_segs * pcache_cache_get_gc_percent(cache) / 100; + if (segs_used < segs_gc_threshold) { + pcache_dev_debug(pcache, "segs_used: %u, segs_gc_threshold: %u\n", segs_used, segs_gc_threshold); + return false; + } + + return true; +} + +/** + * last_kset_gc - Advances the garbage collection for the last kset. + * @cache: Pointer to the pcache_cache structure. + * @kset_onmedia: Pointer to the kset_onmedia structure for the last kset. 
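+ *
+ * A kset flagged PCACHE_KSET_FLAGS_LAST carries no keys; its union field
+ * holds next_cache_seg_id instead of key_num. Here gc moves key_tail to
+ * the start of that segment and clears the finished segment's bit in
+ * cache->seg_map.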
+ */ +static void last_kset_gc(struct pcache_cache *cache, struct pcache_cache_kset_onmedia *kset_onmedia) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_segment *cur_seg, *next_seg; + + cur_seg = cache->key_tail.cache_seg; + + next_seg = &cache->segments[kset_onmedia->next_cache_seg_id]; + + mutex_lock(&cache->key_tail_lock); + cache->key_tail.cache_seg = next_seg; + cache->key_tail.seg_off = 0; + cache_encode_key_tail(cache); + mutex_unlock(&cache->key_tail_lock); + + pcache_dev_debug(pcache, "gc advance kset seg: %u\n", cur_seg->cache_seg_id); + + spin_lock(&cache->seg_map_lock); + __clear_bit(cur_seg->cache_seg_id, cache->seg_map); + spin_unlock(&cache->seg_map_lock); +} + +void pcache_cache_gc_fn(struct work_struct *work) +{ + struct pcache_cache *cache = container_of(work, struct pcache_cache, gc_work.work); + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_pos dirty_tail, key_tail; + struct pcache_cache_kset_onmedia *kset_onmedia; + struct pcache_cache_key_onmedia *key_onmedia; + struct pcache_cache_key *key; + int ret; + int i; + + kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->gc_kset_onmedia_buf; + + while (true) { + if (pcache_is_stopping(pcache) || atomic_read(&cache->gc_errors)) + return; + + /* Get new tail positions */ + mutex_lock(&cache->dirty_tail_lock); + cache_pos_copy(&dirty_tail, &cache->dirty_tail); + mutex_unlock(&cache->dirty_tail_lock); + + mutex_lock(&cache->key_tail_lock); + cache_pos_copy(&key_tail, &cache->key_tail); + mutex_unlock(&cache->key_tail_lock); + + if (!need_gc(cache, &dirty_tail, &key_tail)) + break; + + if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) { + /* Don't move to the next segment if dirty_tail has not moved */ + if (dirty_tail.cache_seg == key_tail.cache_seg) + break; + + last_kset_gc(cache, kset_onmedia); + continue; + } + + for (i = 0; i < kset_onmedia->key_num; i++) { + struct pcache_cache_key key_tmp = { 0 }; + + key_onmedia = &kset_onmedia->data[i]; + + key = &key_tmp; + cache_key_init(&cache->req_key_tree, key); + + ret = cache_key_decode(cache, key_onmedia, key); + if (ret) { + /* return without re-arm gc work, and prevent future + * gc, because we can't retry the partial-gc-ed kset + */ + atomic_inc(&cache->gc_errors); + pcache_dev_err(pcache, "failed to decode cache key in gc\n"); + return; + } + + cache_key_gc(cache, key); + } + + pcache_dev_debug(pcache, "gc advance: %u:%u %u\n", + key_tail.cache_seg->cache_seg_id, + key_tail.seg_off, + get_kset_onmedia_size(kset_onmedia)); + + mutex_lock(&cache->key_tail_lock); + cache_pos_advance(&cache->key_tail, get_kset_onmedia_size(kset_onmedia)); + cache_encode_key_tail(cache); + mutex_unlock(&cache->key_tail_lock); + } + + queue_delayed_work(cache_get_wq(cache), &cache->gc_work, PCACHE_CACHE_GC_INTERVAL); +} diff --git a/drivers/md/dm-pcache/cache_key.c b/drivers/md/dm-pcache/cache_key.c new file mode 100644 index 000000000000..2b77e121f89b --- /dev/null +++ b/drivers/md/dm-pcache/cache_key.c @@ -0,0 +1,888 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include "cache.h" +#include "backing_dev.h" +#include "cache_dev.h" +#include "dm_pcache.h" + +struct pcache_cache_kset_onmedia pcache_empty_kset = { 0 }; + +void cache_key_init(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key) +{ + kref_init(&key->ref); + key->cache_tree = cache_tree; + INIT_LIST_HEAD(&key->list_node); + RB_CLEAR_NODE(&key->rb_node); +} + +struct pcache_cache_key *cache_key_alloc(struct pcache_cache_tree *cache_tree, gfp_t 
gfp_mask) +{ + struct pcache_cache_key *key; + + key = mempool_alloc(&cache_tree->key_pool, gfp_mask); + if (!key) + return NULL; + + memset(key, 0, sizeof(struct pcache_cache_key)); + cache_key_init(cache_tree, key); + + return key; +} + +/** + * cache_key_get - Increment the reference count of a cache key. + * @key: Pointer to the pcache_cache_key structure. + * + * This function increments the reference count of the specified cache key, + * ensuring that it is not freed while still in use. + */ +void cache_key_get(struct pcache_cache_key *key) +{ + kref_get(&key->ref); +} + +/** + * cache_key_destroy - Free a cache key structure when its reference count drops to zero. + * @ref: Pointer to the kref structure. + * + * This function is called when the reference count of the cache key reaches zero. + * It frees the allocated cache key back to the slab cache. + */ +static void cache_key_destroy(struct kref *ref) +{ + struct pcache_cache_key *key = container_of(ref, struct pcache_cache_key, ref); + struct pcache_cache_tree *cache_tree = key->cache_tree; + + mempool_free(key, &cache_tree->key_pool); +} + +void cache_key_put(struct pcache_cache_key *key) +{ + kref_put(&key->ref, cache_key_destroy); +} + +void cache_pos_advance(struct pcache_cache_pos *pos, u32 len) +{ + /* Ensure enough space remains in the current segment */ + BUG_ON(cache_seg_remain(pos) < len); + + pos->seg_off += len; +} + +static void cache_key_encode(struct pcache_cache *cache, + struct pcache_cache_key_onmedia *key_onmedia, + struct pcache_cache_key *key) +{ + key_onmedia->off = key->off; + key_onmedia->len = key->len; + + key_onmedia->cache_seg_id = key->cache_pos.cache_seg->cache_seg_id; + key_onmedia->cache_seg_off = key->cache_pos.seg_off; + + key_onmedia->seg_gen = key->seg_gen; + key_onmedia->flags = key->flags; + + if (cache_data_crc_on(cache)) + key_onmedia->data_crc = cache_key_data_crc(key); +} + +int cache_key_decode(struct pcache_cache *cache, + struct pcache_cache_key_onmedia *key_onmedia, + struct pcache_cache_key *key) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + + key->off = key_onmedia->off; + key->len = key_onmedia->len; + + key->cache_pos.cache_seg = &cache->segments[key_onmedia->cache_seg_id]; + key->cache_pos.seg_off = key_onmedia->cache_seg_off; + + key->seg_gen = key_onmedia->seg_gen; + key->flags = key_onmedia->flags; + + if (cache_data_crc_on(cache) && + key_onmedia->data_crc != cache_key_data_crc(key)) { + pcache_dev_err(pcache, "key: %llu:%u seg %u:%u data_crc error: %x, expected: %x\n", + key->off, key->len, key->cache_pos.cache_seg->cache_seg_id, + key->cache_pos.seg_off, cache_key_data_crc(key), key_onmedia->data_crc); + return -EIO; + } + + return 0; +} + +static void append_last_kset(struct pcache_cache *cache, u32 next_seg) +{ + struct pcache_cache_kset_onmedia kset_onmedia = { 0 }; + + kset_onmedia.flags |= PCACHE_KSET_FLAGS_LAST; + kset_onmedia.next_cache_seg_id = next_seg; + kset_onmedia.magic = PCACHE_KSET_MAGIC; + kset_onmedia.crc = cache_kset_crc(&kset_onmedia); + + memcpy_flushcache(get_key_head_addr(cache), &kset_onmedia, sizeof(struct pcache_cache_kset_onmedia)); + pmem_wmb(); + cache_pos_advance(&cache->key_head, sizeof(struct pcache_cache_kset_onmedia)); +} + +int cache_kset_close(struct pcache_cache *cache, struct pcache_cache_kset *kset) +{ + struct pcache_cache_kset_onmedia *kset_onmedia; + u32 kset_onmedia_size; + int ret; + + kset_onmedia = &kset->kset_onmedia; + + if (!kset_onmedia->key_num) + return 0; + + kset_onmedia_size = struct_size(kset_onmedia, data, 
kset_onmedia->key_num); + + spin_lock(&cache->key_head_lock); +again: + /* Reserve space for the last kset */ + if (cache_seg_remain(&cache->key_head) < kset_onmedia_size + sizeof(struct pcache_cache_kset_onmedia)) { + struct pcache_cache_segment *next_seg; + + next_seg = get_cache_segment(cache); + if (!next_seg) { + ret = -EBUSY; + goto out; + } + + /* clear outdated kset in next seg */ + memcpy_flushcache(next_seg->segment.data, &pcache_empty_kset, + sizeof(struct pcache_cache_kset_onmedia)); + append_last_kset(cache, next_seg->cache_seg_id); + cache->key_head.cache_seg = next_seg; + cache->key_head.seg_off = 0; + goto again; + } + + kset_onmedia->magic = PCACHE_KSET_MAGIC; + kset_onmedia->crc = cache_kset_crc(kset_onmedia); + + /* clear outdated kset after current kset */ + memcpy_flushcache(get_key_head_addr(cache) + kset_onmedia_size, &pcache_empty_kset, + sizeof(struct pcache_cache_kset_onmedia)); + /* write current kset into segment */ + memcpy_flushcache(get_key_head_addr(cache), kset_onmedia, kset_onmedia_size); + pmem_wmb(); + + /* reset kset_onmedia */ + memset(kset_onmedia, 0, sizeof(struct pcache_cache_kset_onmedia)); + cache_pos_advance(&cache->key_head, kset_onmedia_size); + + ret = 0; +out: + spin_unlock(&cache->key_head_lock); + + return ret; +} + +/** + * cache_key_append - Append a cache key to the related kset. + * @cache: Pointer to the pcache_cache structure. + * @key: Pointer to the cache key structure to append. + * @force_close: Need to close current kset if true. + * + * This function appends a cache key to the appropriate kset. If the kset + * is full, it closes the kset. If not, it queues a flush work to write + * the kset to media. + * + * Returns 0 on success, or a negative error code on failure. + */ +int cache_key_append(struct pcache_cache *cache, struct pcache_cache_key *key, bool force_close) +{ + struct pcache_cache_kset *kset; + struct pcache_cache_kset_onmedia *kset_onmedia; + struct pcache_cache_key_onmedia *key_onmedia; + u32 kset_id = get_kset_id(cache, key->off); + int ret = 0; + + kset = get_kset(cache, kset_id); + kset_onmedia = &kset->kset_onmedia; + + spin_lock(&kset->kset_lock); + key_onmedia = &kset_onmedia->data[kset_onmedia->key_num]; + cache_key_encode(cache, key_onmedia, key); + + /* Check if the current kset has reached the maximum number of keys */ + if (++kset_onmedia->key_num == PCACHE_KSET_KEYS_MAX || force_close) { + /* If full, close the kset */ + ret = cache_kset_close(cache, kset); + if (ret) { + kset_onmedia->key_num--; + goto out; + } + } else { + /* If not full, queue a delayed work to flush the kset */ + queue_delayed_work(cache_get_wq(cache), &kset->flush_work, 1 * HZ); + } +out: + spin_unlock(&kset->kset_lock); + + return ret; +} + +/** + * cache_subtree_walk - Traverse the cache tree. + * @ctx: Pointer to the context structure for traversal. + * + * This function traverses the cache tree starting from the specified node. + * It calls the appropriate callback functions based on the relationships + * between the keys in the cache tree. + * + * Returns 0 on success, or a negative error code on failure. + */ +int cache_subtree_walk(struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache_key *key_tmp, *key; + struct rb_node *node_tmp; + int ret = SUBTREE_WALK_RET_OK; + + key = ctx->key; + node_tmp = ctx->start_node; + + while (node_tmp) { + if (ctx->walk_done && ctx->walk_done(ctx)) + break; + + key_tmp = CACHE_KEY(node_tmp); + /* + * If key_tmp ends before the start of key, continue to the next node. 
+ * |----------| + * |=====| + */ + if (cache_key_lend(key_tmp) <= cache_key_lstart(key)) { + if (ctx->after) { + ret = ctx->after(key, key_tmp, ctx); + if (ret) + goto out; + } + goto next; + } + + /* + * If key_tmp starts after the end of key, stop traversing. + * |--------| + * |====| + */ + if (cache_key_lstart(key_tmp) >= cache_key_lend(key)) { + if (ctx->before) { + ret = ctx->before(key, key_tmp, ctx); + if (ret) + goto out; + } + break; + } + + /* Handle overlapping keys */ + if (cache_key_lstart(key_tmp) >= cache_key_lstart(key)) { + /* + * If key_tmp encompasses key. + * |----------------| key_tmp + * |===========| key + */ + if (cache_key_lend(key_tmp) >= cache_key_lend(key)) { + if (ctx->overlap_tail) { + ret = ctx->overlap_tail(key, key_tmp, ctx); + if (ret) + goto out; + } + break; + } + + /* + * If key_tmp is contained within key. + * |----| key_tmp + * |==========| key + */ + if (ctx->overlap_contain) { + ret = ctx->overlap_contain(key, key_tmp, ctx); + if (ret) + goto out; + } + + goto next; + } + + /* + * If key_tmp starts before key ends but ends after key. + * |-----------| key_tmp + * |====| key + */ + if (cache_key_lend(key_tmp) > cache_key_lend(key)) { + if (ctx->overlap_contained) { + ret = ctx->overlap_contained(key, key_tmp, ctx); + if (ret) + goto out; + } + break; + } + + /* + * If key_tmp starts before key and ends within key. + * |--------| key_tmp + * |==========| key + */ + if (ctx->overlap_head) { + ret = ctx->overlap_head(key, key_tmp, ctx); + if (ret) + goto out; + } +next: + node_tmp = rb_next(node_tmp); + } + +out: + if (ctx->walk_finally) + ret = ctx->walk_finally(ctx, ret); + + return ret; +} + +/** + * cache_subtree_search - Search for a key in the cache tree. + * @cache_subtree: Pointer to the cache tree structure. + * @key: Pointer to the cache key to search for. + * @parentp: Pointer to store the parent node of the found node. + * @newp: Pointer to store the location where the new node should be inserted. + * @delete_key_list: List to collect invalid keys for deletion. + * + * This function searches the cache tree for a specific key and returns + * the node that is the predecessor of the key, or first node if the key is + * less than all keys in the tree. If any invalid keys are found during + * the search, they are added to the delete_key_list for later cleanup. + * + * Returns a pointer to the previous node. 
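+ *
+ * Callers are expected to hold the subtree's tree_lock; keys collected in
+ * @delete_key_list are not removed here but by the caller, as done in
+ * cache_key_insert().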
+ */ +struct rb_node *cache_subtree_search(struct pcache_cache_subtree *cache_subtree, struct pcache_cache_key *key, + struct rb_node **parentp, struct rb_node ***newp, + struct list_head *delete_key_list) +{ + struct rb_node **new, *parent = NULL; + struct pcache_cache_key *key_tmp; + struct rb_node *prev_node = NULL; + + new = &(cache_subtree->root.rb_node); + while (*new) { + key_tmp = container_of(*new, struct pcache_cache_key, rb_node); + if (cache_key_invalid(key_tmp)) + list_add(&key_tmp->list_node, delete_key_list); + + parent = *new; + if (key_tmp->off >= key->off) { + new = &((*new)->rb_left); + } else { + prev_node = *new; + new = &((*new)->rb_right); + } + } + + if (!prev_node) + prev_node = rb_first(&cache_subtree->root); + + if (parentp) + *parentp = parent; + + if (newp) + *newp = new; + + return prev_node; +} + +static struct pcache_cache_key *get_pre_alloc_key(struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache_key *key; + + if (ctx->pre_alloc_key) { + key = ctx->pre_alloc_key; + ctx->pre_alloc_key = NULL; + + return key; + } + + return cache_key_alloc(ctx->cache_tree, GFP_NOWAIT); +} + +/** + * fixup_overlap_tail - Adjust the key when it overlaps at the tail. + * @key: Pointer to the new cache key being inserted. + * @key_tmp: Pointer to the existing key that overlaps. + * @ctx: Pointer to the context for walking the cache tree. + * + * This function modifies the existing key (key_tmp) when there is an + * overlap at the tail with the new key. If the modified key becomes + * empty, it is deleted. + */ +static int fixup_overlap_tail(struct pcache_cache_key *key, + struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + /* + * |----------------| key_tmp + * |===========| key + */ + BUG_ON(cache_key_empty(key)); + if (cache_key_empty(key_tmp)) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + cache_key_cutfront(key_tmp, cache_key_lend(key) - cache_key_lstart(key_tmp)); + if (key_tmp->len == 0) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + return SUBTREE_WALK_RET_OK; +} + +/** + * fixup_overlap_contain - Handle case where new key completely contains an existing key. + * @key: Pointer to the new cache key being inserted. + * @key_tmp: Pointer to the existing key that is being contained. + * @ctx: Pointer to the context for walking the cache tree. + * + * This function deletes the existing key (key_tmp) when the new key + * completely contains it. It returns SUBTREE_WALK_RET_RESEARCH to indicate that the + * tree structure may have changed, necessitating a re-insertion of + * the new key. + */ +static int fixup_overlap_contain(struct pcache_cache_key *key, + struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + /* + * |----| key_tmp + * |==========| key + */ + BUG_ON(cache_key_empty(key)); + cache_key_delete(key_tmp); + + return SUBTREE_WALK_RET_RESEARCH; +} + +/** + * fixup_overlap_contained - Handle overlap when a new key is contained in an existing key. + * @key: The new cache key being inserted. + * @key_tmp: The existing cache key that overlaps with the new key. + * @ctx: Context for the cache tree walk. + * + * This function adjusts the existing key if the new key is contained + * within it. If the existing key is empty, it indicates a placeholder key + * that was inserted during a miss read. This placeholder will later be + * updated with real data from the backing_dev, making it no longer an empty key. 
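+ *
+ * For a non-empty key_tmp the overlapped range is split in two: the part
+ * before key stays in key_tmp (cache_key_cutback()), and the part after
+ * key is re-inserted as a separate key (key_fixup).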
+ * + * If we delete key or insert a key, the structure of the entire cache tree may change, + * requiring a full research of the tree to find a new insertion point. + */ +static int fixup_overlap_contained(struct pcache_cache_key *key, + struct pcache_cache_key *key_tmp, struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache_tree *cache_tree = ctx->cache_tree; + + /* + * |-----------| key_tmp + * |====| key + */ + BUG_ON(cache_key_empty(key)); + if (cache_key_empty(key_tmp)) { + /* If key_tmp is empty, don't split it; + * it's a placeholder key for miss reads that will be updated later. + */ + cache_key_cutback(key_tmp, cache_key_lend(key_tmp) - cache_key_lstart(key)); + if (key_tmp->len == 0) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + } else { + struct pcache_cache_key *key_fixup; + bool need_research = false; + + key_fixup = get_pre_alloc_key(ctx); + if (!key_fixup) + return SUBTREE_WALK_RET_NEED_KEY; + + cache_key_copy(key_fixup, key_tmp); + + /* Split key_tmp based on the new key's range */ + cache_key_cutback(key_tmp, cache_key_lend(key_tmp) - cache_key_lstart(key)); + if (key_tmp->len == 0) { + cache_key_delete(key_tmp); + need_research = true; + } + + /* Create a new portion for key_fixup */ + cache_key_cutfront(key_fixup, cache_key_lend(key) - cache_key_lstart(key_tmp)); + if (key_fixup->len == 0) { + cache_key_put(key_fixup); + } else { + /* Insert the new key into the cache */ + cache_key_insert(cache_tree, key_fixup, false); + need_research = true; + } + + if (need_research) + return SUBTREE_WALK_RET_RESEARCH; + } + + return SUBTREE_WALK_RET_OK; +} + +/** + * fixup_overlap_head - Handle overlap when a new key overlaps with the head of an existing key. + * @key: The new cache key being inserted. + * @key_tmp: The existing cache key that overlaps with the new key. + * @ctx: Context for the cache tree walk. + * + * This function adjusts the existing key if the new key overlaps + * with the beginning of it. If the resulting key length is zero + * after the adjustment, the key is deleted. This indicates that + * the key no longer holds valid data and requires the tree to be + * re-researched for a new insertion point. + */ +static int fixup_overlap_head(struct pcache_cache_key *key, + struct pcache_cache_key *key_tmp, struct pcache_cache_subtree_walk_ctx *ctx) +{ + /* + * |--------| key_tmp + * |==========| key + */ + BUG_ON(cache_key_empty(key)); + /* Adjust key_tmp by cutting back based on the new key's start */ + cache_key_cutback(key_tmp, cache_key_lend(key_tmp) - cache_key_lstart(key)); + if (key_tmp->len == 0) { + /* If the adjusted key_tmp length is zero, delete it */ + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + return SUBTREE_WALK_RET_OK; +} + +/** + * cache_key_insert - Insert a new cache key into the cache tree. + * @cache_tree: Pointer to the cache_tree structure. + * @key: The cache key to insert. + * @fixup: Indicates if this is a new key being inserted. + * + * This function searches for the appropriate location to insert + * a new cache key into the cache tree. It handles key overlaps + * and ensures any invalid keys are removed before insertion. 
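+ *
+ * With @fixup set, overlapping keys already in the tree are trimmed or
+ * deleted through the fixup_overlap_*() callbacks before the new key is
+ * linked in; it is passed as false when re-inserting the split-off
+ * remainder in fixup_overlap_contained(). Callers hold the subtree's
+ * tree_lock, for example in kset_replay():
+ *
+ *	spin_lock(&cache_subtree->tree_lock);
+ *	cache_key_insert(&cache->req_key_tree, key, true);
+ *	spin_unlock(&cache_subtree->tree_lock);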
+ */ +void cache_key_insert(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key, bool fixup) +{ + struct pcache_cache *cache = cache_tree->cache; + struct pcache_cache_subtree_walk_ctx walk_ctx = { 0 }; + struct rb_node **new, *parent = NULL; + struct pcache_cache_subtree *cache_subtree; + struct pcache_cache_key *key_tmp = NULL, *key_next; + struct rb_node *prev_node = NULL; + LIST_HEAD(delete_key_list); + int ret; + + cache_subtree = get_subtree(cache_tree, key->off); + key->cache_subtree = cache_subtree; +search: + prev_node = cache_subtree_search(cache_subtree, key, &parent, &new, &delete_key_list); + if (!list_empty(&delete_key_list)) { + /* Remove invalid keys from the delete list */ + list_for_each_entry_safe(key_tmp, key_next, &delete_key_list, list_node) { + list_del_init(&key_tmp->list_node); + cache_key_delete(key_tmp); + } + goto search; + } + + if (fixup) { + /* Set up the context with the cache, start node, and new key */ + walk_ctx.cache_tree = cache_tree; + walk_ctx.start_node = prev_node; + walk_ctx.key = key; + + /* Assign overlap handling functions for different scenarios */ + walk_ctx.overlap_tail = fixup_overlap_tail; + walk_ctx.overlap_head = fixup_overlap_head; + walk_ctx.overlap_contain = fixup_overlap_contain; + walk_ctx.overlap_contained = fixup_overlap_contained; + + ret = cache_subtree_walk(&walk_ctx); + switch (ret) { + case SUBTREE_WALK_RET_OK: + break; + case SUBTREE_WALK_RET_RESEARCH: + goto search; + case SUBTREE_WALK_RET_NEED_KEY: + spin_unlock(&cache_subtree->tree_lock); + pcache_dev_debug(CACHE_TO_PCACHE(cache), "allocate pre_alloc_key with GFP_NOIO"); + walk_ctx.pre_alloc_key = cache_key_alloc(cache_tree, GFP_NOIO); + spin_lock(&cache_subtree->tree_lock); + goto search; + default: + BUG(); + } + } + + if (walk_ctx.pre_alloc_key) + cache_key_put(walk_ctx.pre_alloc_key); + + /* Link and insert the new key into the red-black tree */ + rb_link_node(&key->rb_node, parent, new); + rb_insert_color(&key->rb_node, &cache_subtree->root); +} + +/** + * clean_fn - Cleanup function to remove invalid keys from the cache tree. + * @work: Pointer to the work_struct associated with the cleanup. + * + * This function cleans up invalid keys from the cache tree in the background + * after a cache segment has been invalidated during cache garbage collection. + * It processes a maximum of PCACHE_CLEAN_KEYS_MAX keys per iteration and holds + * the tree lock to ensure thread safety. + */ +void clean_fn(struct work_struct *work) +{ + struct pcache_cache *cache = container_of(work, struct pcache_cache, clean_work); + struct pcache_cache_subtree *cache_subtree; + struct rb_node *node; + struct pcache_cache_key *key; + int i, count; + + for (i = 0; i < cache->req_key_tree.n_subtrees; i++) { + cache_subtree = &cache->req_key_tree.subtrees[i]; + +again: + if (pcache_is_stopping(CACHE_TO_PCACHE(cache))) + return; + + /* Delete up to PCACHE_CLEAN_KEYS_MAX keys in one iteration */ + count = 0; + spin_lock(&cache_subtree->tree_lock); + node = rb_first(&cache_subtree->root); + while (node) { + key = CACHE_KEY(node); + node = rb_next(node); + if (cache_key_invalid(key)) { + count++; + cache_key_delete(key); + } + + if (count >= PCACHE_CLEAN_KEYS_MAX) { + /* Unlock and pause before continuing cleanup */ + spin_unlock(&cache_subtree->tree_lock); + usleep_range(1000, 2000); + goto again; + } + } + spin_unlock(&cache_subtree->tree_lock); + } +} + +/* + * kset_flush_fn - Flush work for a cache kset. 
+ * + * This function is called when a kset flush work is queued from + * cache_key_append(). If the kset is full, it will be closed + * immediately. If not, the flush work will be queued for later closure. + * + * If cache_kset_close detects that a new segment is required to store + * the kset and there are no available segments, it will return an error. + * In this scenario, a retry will be attempted. + */ +void kset_flush_fn(struct work_struct *work) +{ + struct pcache_cache_kset *kset = container_of(work, struct pcache_cache_kset, flush_work.work); + struct pcache_cache *cache = kset->cache; + int ret; + + if (pcache_is_stopping(CACHE_TO_PCACHE(cache))) + return; + + spin_lock(&kset->kset_lock); + ret = cache_kset_close(cache, kset); + spin_unlock(&kset->kset_lock); + + if (ret) { + /* Failed to flush kset, schedule a retry. */ + queue_delayed_work(cache_get_wq(cache), &kset->flush_work, msecs_to_jiffies(100)); + } +} + +static int kset_replay(struct pcache_cache *cache, struct pcache_cache_kset_onmedia *kset_onmedia) +{ + struct pcache_cache_key_onmedia *key_onmedia; + struct pcache_cache_subtree *cache_subtree; + struct pcache_cache_key *key; + int ret; + int i; + + for (i = 0; i < kset_onmedia->key_num; i++) { + key_onmedia = &kset_onmedia->data[i]; + + key = cache_key_alloc(&cache->req_key_tree, GFP_NOIO); + ret = cache_key_decode(cache, key_onmedia, key); + if (ret) { + cache_key_put(key); + goto err; + } + + __set_bit(key->cache_pos.cache_seg->cache_seg_id, cache->seg_map); + + /* Check if the segment generation is valid for insertion. */ + if (key->seg_gen < key->cache_pos.cache_seg->gen) { + cache_key_put(key); + } else { + cache_subtree = get_subtree(&cache->req_key_tree, key->off); + spin_lock(&cache_subtree->tree_lock); + cache_key_insert(&cache->req_key_tree, key, true); + spin_unlock(&cache_subtree->tree_lock); + } + + cache_seg_get(key->cache_pos.cache_seg); + } + + return 0; +err: + return ret; +} + +int cache_replay(struct pcache_cache *cache) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_pos pos_tail; + struct pcache_cache_pos *pos; + struct pcache_cache_kset_onmedia *kset_onmedia; + u32 to_copy, count = 0; + int ret = 0; + + kset_onmedia = kzalloc(PCACHE_KSET_ONMEDIA_SIZE_MAX, GFP_KERNEL); + if (!kset_onmedia) + return -ENOMEM; + + cache_pos_copy(&pos_tail, &cache->key_tail); + pos = &pos_tail; + + /* + * In cache replaying stage, there is no other one will access + * cache->seg_map, so we can set bit here without cache->seg_map_lock. + */ + __set_bit(pos->cache_seg->cache_seg_id, cache->seg_map); + + while (true) { + to_copy = min(PCACHE_KSET_ONMEDIA_SIZE_MAX, PCACHE_SEG_SIZE - pos->seg_off); + ret = copy_mc_to_kernel(kset_onmedia, cache_pos_addr(pos), to_copy); + if (ret) { + ret = -EIO; + goto out; + } + + if (kset_onmedia->magic != PCACHE_KSET_MAGIC || + kset_onmedia->crc != cache_kset_crc(kset_onmedia)) { + break; + } + + /* Process the last kset and prepare for the next segment. */ + if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) { + struct pcache_cache_segment *next_seg; + + pcache_dev_debug(pcache, "last kset replay, next: %u\n", kset_onmedia->next_cache_seg_id); + + next_seg = &cache->segments[kset_onmedia->next_cache_seg_id]; + + pos->cache_seg = next_seg; + pos->seg_off = 0; + + __set_bit(pos->cache_seg->cache_seg_id, cache->seg_map); + continue; + } + + /* Replay the kset and check for errors. */ + ret = kset_replay(cache, kset_onmedia); + if (ret) + goto out; + + /* Advance the position after processing the kset. 
*/ + cache_pos_advance(pos, get_kset_onmedia_size(kset_onmedia)); + if (++count > 512) { + cond_resched(); + count = 0; + } + } + + /* Update the key_head position after replaying. */ + spin_lock(&cache->key_head_lock); + cache_pos_copy(&cache->key_head, pos); + spin_unlock(&cache->key_head_lock); +out: + kfree(kset_onmedia); + return ret; +} + +int cache_tree_init(struct pcache_cache *cache, struct pcache_cache_tree *cache_tree, u32 n_subtrees) +{ + int ret; + u32 i; + + cache_tree->cache = cache; + cache_tree->n_subtrees = n_subtrees; + + ret = mempool_init_slab_pool(&cache_tree->key_pool, 1024, key_cache); + if (ret) + goto err; + + /* + * Allocate and initialize the subtrees array. + * Each element is a cache tree structure that contains + * an RB tree root and a spinlock for protecting its contents. + */ + cache_tree->subtrees = kvcalloc(cache_tree->n_subtrees, sizeof(struct pcache_cache_subtree), GFP_KERNEL); + if (!cache_tree->subtrees) { + ret = -ENOMEM; + goto key_pool_exit; + } + + for (i = 0; i < cache_tree->n_subtrees; i++) { + struct pcache_cache_subtree *cache_subtree = &cache_tree->subtrees[i]; + + cache_subtree->root = RB_ROOT; + spin_lock_init(&cache_subtree->tree_lock); + } + + return 0; + +key_pool_exit: + mempool_exit(&cache_tree->key_pool); +err: + return ret; +} + +void cache_tree_clear(struct pcache_cache_tree *cache_tree) +{ + struct pcache_cache_subtree *cache_subtree; + struct rb_node *node; + struct pcache_cache_key *key; + u32 i; + + for (i = 0; i < cache_tree->n_subtrees; i++) { + cache_subtree = &cache_tree->subtrees[i]; + + spin_lock(&cache_subtree->tree_lock); + node = rb_first(&cache_subtree->root); + while (node) { + key = CACHE_KEY(node); + node = rb_next(node); + + cache_key_delete(key); + } + spin_unlock(&cache_subtree->tree_lock); + } +} + +void cache_tree_exit(struct pcache_cache_tree *cache_tree) +{ + cache_tree_clear(cache_tree); + kvfree(cache_tree->subtrees); + mempool_exit(&cache_tree->key_pool); +} diff --git a/drivers/md/dm-pcache/cache_req.c b/drivers/md/dm-pcache/cache_req.c new file mode 100644 index 000000000000..27f94c1fa968 --- /dev/null +++ b/drivers/md/dm-pcache/cache_req.c @@ -0,0 +1,836 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "cache.h" +#include "backing_dev.h" +#include "cache_dev.h" +#include "dm_pcache.h" + +static int cache_data_head_init(struct pcache_cache *cache) +{ + struct pcache_cache_segment *next_seg; + struct pcache_cache_data_head *data_head; + + data_head = get_data_head(cache); + next_seg = get_cache_segment(cache); + if (!next_seg) + return -EBUSY; + + cache_seg_get(next_seg); + data_head->head_pos.cache_seg = next_seg; + data_head->head_pos.seg_off = 0; + + return 0; +} + +/** + * cache_data_alloc - Allocate data for a cache key. + * @cache: Pointer to the cache structure. + * @key: Pointer to the cache key to allocate data for. + * + * This function tries to allocate space from the cache segment specified by the + * data head. If the remaining space in the segment is insufficient to allocate + * the requested length for the cache key, it will allocate whatever is available + * and adjust the key's length accordingly. This function does not allocate + * space that crosses segment boundaries. 
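+ *
+ * The data head is a per-CPU structure (see get_data_head()), so the whole
+ * allocation runs with preemption disabled.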
+ */ +static int cache_data_alloc(struct pcache_cache *cache, struct pcache_cache_key *key) +{ + struct pcache_cache_data_head *data_head; + struct pcache_cache_pos *head_pos; + struct pcache_cache_segment *cache_seg; + u32 seg_remain; + u32 allocated = 0, to_alloc; + int ret = 0; + + preempt_disable(); + data_head = get_data_head(cache); +again: + to_alloc = key->len - allocated; + if (!data_head->head_pos.cache_seg) { + seg_remain = 0; + } else { + cache_pos_copy(&key->cache_pos, &data_head->head_pos); + key->seg_gen = key->cache_pos.cache_seg->gen; + + head_pos = &data_head->head_pos; + cache_seg = head_pos->cache_seg; + seg_remain = cache_seg_remain(head_pos); + } + + if (seg_remain > to_alloc) { + /* If remaining space in segment is sufficient for the cache key, allocate it. */ + cache_pos_advance(head_pos, to_alloc); + allocated += to_alloc; + cache_seg_get(cache_seg); + } else if (seg_remain) { + /* If remaining space is not enough, allocate the remaining space and adjust the cache key length. */ + cache_pos_advance(head_pos, seg_remain); + key->len = seg_remain; + + /* Get for key: obtain a reference to the cache segment for the key. */ + cache_seg_get(cache_seg); + /* Put for head_pos->cache_seg: release the reference for the current head's segment. */ + cache_seg_put(head_pos->cache_seg); + head_pos->cache_seg = NULL; + } else { + /* Initialize a new data head if no segment is available. */ + ret = cache_data_head_init(cache); + if (ret) + goto out; + + goto again; + } + +out: + preempt_enable(); + + return ret; +} + +static int cache_copy_from_req_bio(struct pcache_cache *cache, struct pcache_cache_key *key, + struct pcache_request *pcache_req, u32 bio_off) +{ + struct pcache_cache_pos *pos = &key->cache_pos; + struct pcache_segment *segment; + + segment = &pos->cache_seg->segment; + + return segment_copy_from_bio(segment, pos->seg_off, key->len, pcache_req->bio, bio_off); +} + +static int cache_copy_to_req_bio(struct pcache_cache *cache, struct pcache_request *pcache_req, + u32 bio_off, u32 len, struct pcache_cache_pos *pos, u64 key_gen) +{ + struct pcache_cache_segment *cache_seg = pos->cache_seg; + struct pcache_segment *segment = &cache_seg->segment; + int ret; + + spin_lock(&cache_seg->gen_lock); + if (key_gen < cache_seg->gen) { + spin_unlock(&cache_seg->gen_lock); + return -EINVAL; + } + + ret = segment_copy_to_bio(segment, pos->seg_off, len, pcache_req->bio, bio_off); + spin_unlock(&cache_seg->gen_lock); + + return ret; +} + +/** + * miss_read_end_req - Handle the end of a miss read request. + * @backing_req: Pointer to the request structure. + * @read_ret: Return value of read. + * + * This function is called when a backing request to read data from + * the backing_dev is completed. If the key associated with the request + * is empty (a placeholder), it allocates cache space for the key, + * copies the data read from the bio into the cache, and updates + * the key's status. If the key has been overwritten by a write + * request during this process, it will be deleted from the cache + * tree and no further action will be taken. 
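+ *
+ * A key that was overwritten while the read was in flight has already been
+ * removed from the tree by the write path, which also cleared its flags;
+ * that is why cache_key_empty() is re-checked here under tree_lock before
+ * touching the cache.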
+ */ +static void miss_read_end_req(struct pcache_backing_dev_req *backing_req, int read_ret) +{ + void *priv_data = backing_req->priv_data; + struct pcache_request *pcache_req = backing_req->req.upper_req; + struct pcache_cache *cache = backing_req->backing_dev->cache; + int ret; + + if (priv_data) { + struct pcache_cache_key *key; + struct pcache_cache_subtree *cache_subtree; + + key = (struct pcache_cache_key *)priv_data; + cache_subtree = key->cache_subtree; + + /* if this key was deleted from cache_subtree by a write, key->flags should be cleared, + * so if cache_key_empty() return true, this key is still in cache_subtree + */ + spin_lock(&cache_subtree->tree_lock); + if (cache_key_empty(key)) { + /* Check if the backing request was successful. */ + if (read_ret) { + cache_key_delete(key); + goto unlock; + } + + /* Allocate cache space for the key and copy data from the backing_dev. */ + ret = cache_data_alloc(cache, key); + if (ret) { + cache_key_delete(key); + goto unlock; + } + + ret = cache_copy_from_req_bio(cache, key, pcache_req, backing_req->req.bio_off); + if (ret) { + cache_seg_put(key->cache_pos.cache_seg); + cache_key_delete(key); + goto unlock; + } + key->flags &= ~PCACHE_CACHE_KEY_FLAGS_EMPTY; + key->flags |= PCACHE_CACHE_KEY_FLAGS_CLEAN; + + /* Append the key to the cache. */ + ret = cache_key_append(cache, key, false); + if (ret) { + cache_seg_put(key->cache_pos.cache_seg); + cache_key_delete(key); + goto unlock; + } + } +unlock: + spin_unlock(&cache_subtree->tree_lock); + cache_key_put(key); + } +} + +/** + * submit_cache_miss_req - Submit a backing request when cache data is missing + * @cache: The cache context that manages cache operations + * @backing_req: The cache request containing information about the read request + * + * This function is used to handle cases where a cache read request cannot locate + * the required data in the cache. When such a miss occurs during `cache_subtree_walk`, + * it triggers a backing read request to fetch data from the backing storage. + * + * If `pcache_req->priv_data` is set, it points to a `pcache_cache_key`, representing + * a new cache key to be inserted into the cache. The function calls `cache_key_insert` + * to attempt adding the key. On insertion failure, it releases the key reference and + * clears `priv_data` to avoid further processing. 
+ */ +static void submit_cache_miss_req(struct pcache_cache *cache, struct pcache_backing_dev_req *backing_req) +{ + if (backing_req->priv_data) { + struct pcache_cache_key *key; + + /* Attempt to insert the key into the cache if priv_data is set */ + key = (struct pcache_cache_key *)backing_req->priv_data; + cache_key_insert(&cache->req_key_tree, key, true); + } + backing_dev_req_submit(backing_req, false); +} + +static void cache_miss_req_free(struct pcache_backing_dev_req *backing_req) +{ + struct pcache_cache_key *key; + + if (backing_req->priv_data) { + key = backing_req->priv_data; + backing_req->priv_data = NULL; + cache_key_put(key); /* for ->priv_data */ + cache_key_put(key); /* for init ref in alloc */ + } + + backing_dev_req_end(backing_req); +} + +static struct pcache_backing_dev_req *cache_miss_req_alloc(struct pcache_cache *cache, + struct pcache_request *parent, + gfp_t gfp_mask) +{ + struct pcache_backing_dev *backing_dev = cache->backing_dev; + struct pcache_backing_dev_req *backing_req; + struct pcache_cache_key *key = NULL; + struct pcache_backing_dev_req_opts req_opts = { 0 }; + + req_opts.type = BACKING_DEV_REQ_TYPE_REQ; + req_opts.gfp_mask = gfp_mask; + req_opts.req.upper_req = parent; + + backing_req = backing_dev_req_alloc(backing_dev, &req_opts); + if (!backing_req) + return NULL; + + key = cache_key_alloc(&cache->req_key_tree, gfp_mask); + if (!key) + goto free_backing_req; + + cache_key_get(key); + backing_req->priv_data = key; + + return backing_req; + +free_backing_req: + cache_miss_req_free(backing_req); + return NULL; +} + +static void cache_miss_req_init(struct pcache_cache *cache, + struct pcache_backing_dev_req *backing_req, + struct pcache_request *parent, + u32 off, u32 len, bool insert_key) +{ + struct pcache_cache_key *key; + struct pcache_backing_dev_req_opts req_opts = { 0 }; + + req_opts.type = BACKING_DEV_REQ_TYPE_REQ; + req_opts.req.upper_req = parent; + req_opts.req.req_off = off; + req_opts.req.len = len; + req_opts.end_fn = miss_read_end_req; + + backing_dev_req_init(backing_req, &req_opts); + + if (insert_key) { + key = backing_req->priv_data; + key->off = parent->off + off; + key->len = len; + key->flags |= PCACHE_CACHE_KEY_FLAGS_EMPTY; + } else { + key = backing_req->priv_data; + backing_req->priv_data = NULL; + cache_key_put(key); + cache_key_put(key); + } +} + +static struct pcache_backing_dev_req *get_pre_alloc_req(struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_request *pcache_req = ctx->pcache_req; + struct pcache_backing_dev_req *backing_req; + + if (ctx->pre_alloc_req) { + backing_req = ctx->pre_alloc_req; + ctx->pre_alloc_req = NULL; + + return backing_req; + } + + return cache_miss_req_alloc(cache, pcache_req, GFP_NOWAIT); +} + +/* + * In the process of walking the cache tree to locate cached data, this + * function handles the situation where the requested data range lies + * entirely before an existing cache node (`key_tmp`). This outcome + * signifies that the target data is absent from the cache (cache miss). + * + * To fulfill this portion of the read request, the function creates a + * backing request (`backing_req`) for the missing data range represented + * by `key`. It then appends this request to the submission list in the + * `ctx`, which will later be processed to retrieve the data from backing + * storage. 
After setting up the backing request, `req_done` in `ctx` is + * updated to reflect the length of the handled range, and the range + * in `key` is adjusted by trimming off the portion that is now handled. + * + * The scenario handled here: + * + * |--------| key_tmp (existing cached range) + * |====| key (requested range, preceding key_tmp) + * + * Since `key` is before `key_tmp`, it signifies that the requested data + * range is missing in the cache (cache miss) and needs retrieval from + * backing storage. + */ +static int read_before(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_backing_dev_req *backing_req; + struct pcache_cache *cache = ctx->cache_tree->cache; + + /* + * In this scenario, `key` represents a range that precedes `key_tmp`, + * meaning the requested data range is missing from the cache tree + * and must be retrieved from the backing_dev. + */ + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, key->len, true); + + list_add(&backing_req->node, ctx->submit_req_list); + ctx->req_done += key->len; + cache_key_cutfront(key, key->len); + + return SUBTREE_WALK_RET_OK; +} + +/* + * During cache_subtree_walk, this function manages a scenario where part of the + * requested data range overlaps with an existing cache node (`key_tmp`). + * + * |----------------| key_tmp (existing cached range) + * |===========| key (requested range, overlapping the tail of key_tmp) + */ +static int read_overlap_tail(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_backing_dev_req *backing_req; + u32 io_len; + int ret; + + /* + * Calculate the length of the non-overlapping portion of `key` + * before `key_tmp`, representing the data missing in the cache. + */ + io_len = cache_key_lstart(key_tmp) - cache_key_lstart(key); + if (io_len) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, true); + + list_add(&backing_req->node, ctx->submit_req_list); + ctx->req_done += io_len; + cache_key_cutfront(key, io_len); + } + + /* + * Handle the overlapping portion by calculating the length of + * the remaining data in `key` that coincides with `key_tmp`. 
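+ * For example (illustrative numbers): if key covers [8, 24) and key_tmp
+ * covers [16, 32), the missing head handled above is 16 - 8 = 8, and the
+ * overlapping length computed below is 24 - 16 = 8.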
+ */ + io_len = cache_key_lend(key) - cache_key_lstart(key_tmp); + if (cache_key_empty(key_tmp)) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, false); + submit_cache_miss_req(cache, backing_req); + } else { + ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done, + io_len, &key_tmp->cache_pos, key_tmp->seg_gen); + if (ret) { + if (ret == -EINVAL) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + ctx->ret = ret; + return SUBTREE_WALK_RET_ERR; + } + } + + ctx->req_done += io_len; + cache_key_cutfront(key, io_len); + + return SUBTREE_WALK_RET_OK; +} + +/* + * |----| key_tmp (existing cached range) + * |==========| key (requested range) + */ +static int read_overlap_contain(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_backing_dev_req *backing_req; + u32 io_len; + int ret; + + /* + * Calculate the non-overlapping part of `key` before `key_tmp` + * to identify the missing data length. + */ + io_len = cache_key_lstart(key_tmp) - cache_key_lstart(key); + if (io_len) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, true); + + list_add(&backing_req->node, ctx->submit_req_list); + + ctx->req_done += io_len; + cache_key_cutfront(key, io_len); + } + + /* + * Handle the overlapping portion between `key` and `key_tmp`. + */ + io_len = key_tmp->len; + if (cache_key_empty(key_tmp)) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, false); + submit_cache_miss_req(cache, backing_req); + } else { + ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done, + io_len, &key_tmp->cache_pos, key_tmp->seg_gen); + if (ret) { + if (ret == -EINVAL) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + ctx->ret = ret; + return SUBTREE_WALK_RET_ERR; + } + } + + ctx->req_done += io_len; + cache_key_cutfront(key, io_len); + + return SUBTREE_WALK_RET_OK; +} + +/* + * |-----------| key_tmp (existing cached range) + * |====| key (requested range, fully within key_tmp) + * + * If `key_tmp` contains valid cached data, this function copies the relevant + * portion to the request's bio. Otherwise, it sends a backing request to + * fetch the required data range. + */ +static int read_overlap_contained(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_backing_dev_req *backing_req; + struct pcache_cache_pos pos; + int ret; + + /* + * Check if `key_tmp` is empty, indicating a miss. If so, initiate + * a backing request to fetch the required data for `key`. 
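+ * Otherwise the data is served from the cache: the copy position below is
+ * key_tmp's position advanced by lstart(key) - lstart(key_tmp), i.e. the
+ * offset of the requested range within the cached range.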
+ */ + if (cache_key_empty(key_tmp)) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, key->len, false); + submit_cache_miss_req(cache, backing_req); + } else { + cache_pos_copy(&pos, &key_tmp->cache_pos); + cache_pos_advance(&pos, cache_key_lstart(key) - cache_key_lstart(key_tmp)); + + ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done, + key->len, &pos, key_tmp->seg_gen); + if (ret) { + if (ret == -EINVAL) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + ctx->ret = ret; + return SUBTREE_WALK_RET_ERR; + } + } + + ctx->req_done += key->len; + cache_key_cutfront(key, key->len); + + return SUBTREE_WALK_RET_OK; +} + +/* + * |--------| key_tmp (existing cached range) + * |==========| key (requested range, overlapping the head of key_tmp) + */ +static int read_overlap_head(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_backing_dev_req *backing_req; + struct pcache_cache_pos pos; + u32 io_len; + int ret; + + io_len = cache_key_lend(key_tmp) - cache_key_lstart(key); + + if (cache_key_empty(key_tmp)) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, false); + submit_cache_miss_req(cache, backing_req); + } else { + cache_pos_copy(&pos, &key_tmp->cache_pos); + cache_pos_advance(&pos, cache_key_lstart(key) - cache_key_lstart(key_tmp)); + + ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done, + io_len, &pos, key_tmp->seg_gen); + if (ret) { + if (ret == -EINVAL) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + ctx->ret = ret; + return SUBTREE_WALK_RET_ERR; + } + } + + ctx->req_done += io_len; + cache_key_cutfront(key, io_len); + + return SUBTREE_WALK_RET_OK; +} + +/** + * read_walk_finally - Finalizes the cache read tree walk by submitting any + * remaining backing requests + * @ctx: Context structure holding information about the cache, + * read request, and submission list + * @ret: the return value after this walk. + * + * This function is called at the end of the `cache_subtree_walk` during a + * cache read operation. It completes the walk by checking if any data + * requested by `key` was not found in the cache tree, and if so, it sends + * a backing request to retrieve that data. Then, it iterates through the + * submission list of backing requests created during the walk, removing + * each request from the list and submitting it. + * + * The scenario managed here includes: + * - Sending a backing request for the remaining length of `key` if it was + * not fulfilled by existing cache entries. + * - Iterating through `ctx->submit_req_list` to submit each backing request + * enqueued during the walk. + * + * This ensures all necessary backing requests for cache misses are submitted + * to the backing storage to retrieve any data that could not be found in + * the cache. 
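+ * Note: the backing requests already queued on `ctx->submit_req_list` are
+ * submitted first; the remaining tail of `key` is only handled when the walk
+ * itself completed successfully.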
+ */ +static int read_walk_finally(struct pcache_cache_subtree_walk_ctx *ctx, int ret) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_backing_dev_req *backing_req, *next_req; + struct pcache_cache_key *key = ctx->key; + + list_for_each_entry_safe(backing_req, next_req, ctx->submit_req_list, node) { + list_del_init(&backing_req->node); + submit_cache_miss_req(ctx->cache_tree->cache, backing_req); + } + + if (ret != SUBTREE_WALK_RET_OK) + return ret; + + if (key->len) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, key->len, true); + submit_cache_miss_req(cache, backing_req); + ctx->req_done += key->len; + } + + return SUBTREE_WALK_RET_OK; +} + +/* + * This function is used within `cache_subtree_walk` to determine whether the + * read operation has covered the requested data length. It compares the + * amount of data processed (`ctx->req_done`) with the total data length + * specified in the original request (`ctx->pcache_req->data_len`). + * + * If `req_done` meets or exceeds the required data length, the function + * returns `true`, indicating the walk is complete. Otherwise, it returns `false`, + * signaling that additional data processing is needed to fulfill the request. + */ +static bool read_walk_done(struct pcache_cache_subtree_walk_ctx *ctx) +{ + return (ctx->req_done >= ctx->pcache_req->data_len); +} + +/** + * cache_read - Process a read request by traversing the cache tree + * @cache: Cache structure holding cache trees and related configurations + * @pcache_req: Request structure with information about the data to read + * + * This function attempts to fulfill a read request by traversing the cache tree(s) + * to locate cached data for the requested range. If parts of the data are missing + * in the cache, backing requests are generated to retrieve the required segments. + * + * The function operates by initializing a key for the requested data range and + * preparing a context (`walk_ctx`) to manage the cache tree traversal. The context + * includes pointers to functions (e.g., `read_before`, `read_overlap_tail`) that handle + * specific conditions encountered during the traversal. The `walk_finally` and `walk_done` + * functions manage the end stages of the traversal, while the `delete_key_list` and + * `submit_req_list` lists track any keys to be deleted or requests to be submitted. + * + * The function first calculates the requested range and checks if it fits within the + * current cache tree (based on the tree's size limits). It then locks the cache tree + * and performs a search to locate any matching keys. If there are outdated keys, + * these are deleted, and the search is restarted to ensure accurate data retrieval. + * + * If the requested range spans multiple cache trees, the function moves on to the + * next tree once the current range has been processed. This continues until the + * entire requested data length has been handled. 
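+ * Returns 0 on success, or a negative errno propagated from the tree walk
+ * (for example, a failed copy of cached data into the request bio).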
+ */ +static int cache_read(struct pcache_cache *cache, struct pcache_request *pcache_req) +{ + struct pcache_cache_key key_data = { .off = pcache_req->off, .len = pcache_req->data_len }; + struct pcache_cache_subtree *cache_subtree; + struct pcache_cache_key *key_tmp = NULL, *key_next; + struct rb_node *prev_node = NULL; + struct pcache_cache_key *key = &key_data; + struct pcache_cache_subtree_walk_ctx walk_ctx = { 0 }; + struct pcache_backing_dev_req *backing_req, *next_req; + LIST_HEAD(delete_key_list); + LIST_HEAD(submit_req_list); + int ret; + + walk_ctx.cache_tree = &cache->req_key_tree; + walk_ctx.req_done = 0; + walk_ctx.pcache_req = pcache_req; + walk_ctx.before = read_before; + walk_ctx.overlap_tail = read_overlap_tail; + walk_ctx.overlap_head = read_overlap_head; + walk_ctx.overlap_contain = read_overlap_contain; + walk_ctx.overlap_contained = read_overlap_contained; + walk_ctx.walk_finally = read_walk_finally; + walk_ctx.walk_done = read_walk_done; + walk_ctx.delete_key_list = &delete_key_list; + walk_ctx.submit_req_list = &submit_req_list; + +next: + key->off = pcache_req->off + walk_ctx.req_done; + key->len = pcache_req->data_len - walk_ctx.req_done; + if (key->len > PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK)) + key->len = PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK); + + cache_subtree = get_subtree(&cache->req_key_tree, key->off); + spin_lock(&cache_subtree->tree_lock); +search: + prev_node = cache_subtree_search(cache_subtree, key, NULL, NULL, &delete_key_list); + if (!list_empty(&delete_key_list)) { + list_for_each_entry_safe(key_tmp, key_next, &delete_key_list, list_node) { + list_del_init(&key_tmp->list_node); + cache_key_delete(key_tmp); + } + goto search; + } + + walk_ctx.start_node = prev_node; + walk_ctx.key = key; + + ret = cache_subtree_walk(&walk_ctx); + if (ret == SUBTREE_WALK_RET_RESEARCH) + goto search; + spin_unlock(&cache_subtree->tree_lock); + + if (ret == SUBTREE_WALK_RET_ERR) { + ret = walk_ctx.ret; + goto out; + } + + if (ret == SUBTREE_WALK_RET_NEED_REQ) { + walk_ctx.pre_alloc_req = cache_miss_req_alloc(cache, pcache_req, GFP_NOIO); + pcache_dev_debug(CACHE_TO_PCACHE(cache), "allocate pre_alloc_req with GFP_NOIO"); + } + + if (walk_ctx.req_done < pcache_req->data_len) + goto next; + ret = 0; +out: + if (walk_ctx.pre_alloc_req) + cache_miss_req_free(walk_ctx.pre_alloc_req); + + list_for_each_entry_safe(backing_req, next_req, &submit_req_list, node) { + list_del_init(&backing_req->node); + backing_dev_req_end(backing_req); + } + + return ret; +} + +static int cache_write(struct pcache_cache *cache, struct pcache_request *pcache_req) +{ + struct pcache_cache_subtree *cache_subtree; + struct pcache_cache_key *key; + u64 offset = pcache_req->off; + u32 length = pcache_req->data_len; + u32 io_done = 0; + int ret; + + while (true) { + if (io_done >= length) + break; + + key = cache_key_alloc(&cache->req_key_tree, GFP_NOIO); + key->off = offset + io_done; + key->len = length - io_done; + if (key->len > PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK)) + key->len = PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK); + + ret = cache_data_alloc(cache, key); + if (ret) { + cache_key_put(key); + goto err; + } + + ret = cache_copy_from_req_bio(cache, key, pcache_req, io_done); + if (ret) { + cache_seg_put(key->cache_pos.cache_seg); + cache_key_put(key); + goto err; + } + + cache_subtree = get_subtree(&cache->req_key_tree, key->off); + 
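/*
+ * Insert the new key into the request tree and persist it to a kset,
+ * passing along the bio's REQ_FUA setting, while holding the subtree lock.
+ */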
spin_lock(&cache_subtree->tree_lock); + cache_key_insert(&cache->req_key_tree, key, true); + ret = cache_key_append(cache, key, pcache_req->bio->bi_opf & REQ_FUA); + if (ret) { + cache_seg_put(key->cache_pos.cache_seg); + cache_key_delete(key); + goto unlock; + } + + io_done += key->len; + spin_unlock(&cache_subtree->tree_lock); + } + + return 0; +unlock: + spin_unlock(&cache_subtree->tree_lock); +err: + return ret; +} + +/** + * cache_flush - Flush all ksets to persist any pending cache data + * @cache: Pointer to the cache structure + * + * This function iterates through all ksets associated with the provided `cache` + * and ensures that any data marked for persistence is written to media. For each + * kset, it acquires the kset lock, then invokes `cache_kset_close`, which handles + * the persistence logic for that kset. + * + * If `cache_kset_close` encounters an error, the function exits immediately with + * the respective error code, preventing the flush operation from proceeding to + * subsequent ksets. + */ +int cache_flush(struct pcache_cache *cache) +{ + struct pcache_cache_kset *kset; + int ret; + u32 i; + + for (i = 0; i < cache->n_ksets; i++) { + kset = get_kset(cache, i); + + spin_lock(&kset->kset_lock); + ret = cache_kset_close(cache, kset); + spin_unlock(&kset->kset_lock); + + if (ret) + return ret; + } + + return 0; +} + +int pcache_cache_handle_req(struct pcache_cache *cache, struct pcache_request *pcache_req) +{ + struct bio *bio = pcache_req->bio; + + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) + return cache_flush(cache); + + if (bio_data_dir(bio) == READ) + return cache_read(cache, pcache_req); + + return cache_write(cache, pcache_req); +} diff --git a/drivers/md/dm-pcache/cache_segment.c b/drivers/md/dm-pcache/cache_segment.c new file mode 100644 index 000000000000..f0b58980806e --- /dev/null +++ b/drivers/md/dm-pcache/cache_segment.c @@ -0,0 +1,305 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "cache_dev.h" +#include "cache.h" +#include "backing_dev.h" +#include "dm_pcache.h" + +static inline struct pcache_segment_info *get_seg_info_addr(struct pcache_cache_segment *cache_seg) +{ + struct pcache_segment_info *seg_info_addr; + u32 seg_id = cache_seg->segment.seg_id; + void *seg_addr; + + seg_addr = CACHE_DEV_SEGMENT(cache_seg->cache->cache_dev, seg_id); + seg_info_addr = seg_addr + PCACHE_SEG_INFO_SIZE * cache_seg->info_index; + + return seg_info_addr; +} + +static void cache_seg_info_write(struct pcache_cache_segment *cache_seg) +{ + struct pcache_segment_info *seg_info_addr; + struct pcache_segment_info *seg_info = &cache_seg->cache_seg_info; + + mutex_lock(&cache_seg->info_lock); + seg_info->header.seq++; + seg_info->header.crc = pcache_meta_crc(&seg_info->header, sizeof(struct pcache_segment_info)); + + seg_info_addr = get_seg_info_addr(cache_seg); + memcpy_flushcache(seg_info_addr, seg_info, sizeof(struct pcache_segment_info)); + pmem_wmb(); + + cache_seg->info_index = (cache_seg->info_index + 1) % PCACHE_META_INDEX_MAX; + mutex_unlock(&cache_seg->info_lock); +} + +static int cache_seg_info_load(struct pcache_cache_segment *cache_seg) +{ + struct pcache_segment_info *cache_seg_info_addr_base, *cache_seg_info_addr; + struct pcache_cache_dev *cache_dev = cache_seg->cache->cache_dev; + struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev); + u32 seg_id = cache_seg->segment.seg_id; + int ret = 0; + + cache_seg_info_addr_base = CACHE_DEV_SEGMENT(cache_dev, seg_id); + + mutex_lock(&cache_seg->info_lock); + cache_seg_info_addr = 
pcache_meta_find_latest(&cache_seg_info_addr_base->header, + sizeof(struct pcache_segment_info), + PCACHE_SEG_INFO_SIZE, + &cache_seg->cache_seg_info); + if (IS_ERR(cache_seg_info_addr)) { + ret = PTR_ERR(cache_seg_info_addr); + goto out; + } else if (!cache_seg_info_addr) { + ret = -EIO; + goto out; + } + cache_seg->info_index = cache_seg_info_addr - cache_seg_info_addr_base; +out: + mutex_unlock(&cache_seg->info_lock); + + if (ret) + pcache_dev_err(pcache, "can't read segment info of segment: %u, ret: %d\n", + cache_seg->segment.seg_id, ret); + return ret; +} + +static int cache_seg_ctrl_load(struct pcache_cache_segment *cache_seg) +{ + struct pcache_cache_seg_ctrl *cache_seg_ctrl = cache_seg->cache_seg_ctrl; + struct pcache_cache_seg_gen cache_seg_gen, *cache_seg_gen_addr; + int ret = 0; + + cache_seg_gen_addr = pcache_meta_find_latest(&cache_seg_ctrl->gen->header, + sizeof(struct pcache_cache_seg_gen), + sizeof(struct pcache_cache_seg_gen), + &cache_seg_gen); + if (IS_ERR(cache_seg_gen_addr)) { + ret = PTR_ERR(cache_seg_gen_addr); + goto out; + } + + if (!cache_seg_gen_addr) { + cache_seg->gen = 0; + cache_seg->gen_seq = 0; + cache_seg->gen_index = 0; + goto out; + } + + cache_seg->gen = cache_seg_gen.gen; + cache_seg->gen_seq = cache_seg_gen.header.seq; + cache_seg->gen_index = (cache_seg_gen_addr - cache_seg_ctrl->gen); +out: + + return ret; +} + +static inline struct pcache_cache_seg_gen *get_cache_seg_gen_addr(struct pcache_cache_segment *cache_seg) +{ + struct pcache_cache_seg_ctrl *cache_seg_ctrl = cache_seg->cache_seg_ctrl; + + return (cache_seg_ctrl->gen + cache_seg->gen_index); +} + +/* + * cache_seg_ctrl_write - write cache segment control information + * @seg: the cache segment to update + * + * This function writes the control information of a cache segment to media. + * + * Although this updates shared control data, we intentionally do not use + * any locking here. All accesses to control information are single-threaded: + * + * - All reads occur during the init phase, where no concurrent writes + * can happen. + * - Writes happen once during init and once when the last reference + * to the segment is dropped in cache_seg_put(). + * + * Both cases are guaranteed to be single-threaded, so there is no risk + * of concurrent read/write races. + */ +static void cache_seg_ctrl_write(struct pcache_cache_segment *cache_seg) +{ + struct pcache_cache_seg_gen cache_seg_gen; + + cache_seg_gen.gen = cache_seg->gen; + cache_seg_gen.header.seq = ++cache_seg->gen_seq; + cache_seg_gen.header.crc = pcache_meta_crc(&cache_seg_gen.header, + sizeof(struct pcache_cache_seg_gen)); + + memcpy_flushcache(get_cache_seg_gen_addr(cache_seg), &cache_seg_gen, sizeof(struct pcache_cache_seg_gen)); + pmem_wmb(); + + cache_seg->gen_index = (cache_seg->gen_index + 1) % PCACHE_META_INDEX_MAX; +} + +static void cache_seg_ctrl_init(struct pcache_cache_segment *cache_seg) +{ + cache_seg->gen = 0; + cache_seg->gen_seq = 0; + cache_seg->gen_index = 0; + cache_seg_ctrl_write(cache_seg); +} + +static int cache_seg_meta_load(struct pcache_cache_segment *cache_seg) +{ + int ret; + + ret = cache_seg_info_load(cache_seg); + if (ret) + goto err; + + ret = cache_seg_ctrl_load(cache_seg); + if (ret) + goto err; + + return 0; +err: + return ret; +} + +/** + * cache_seg_set_next_seg - Sets the ID of the next segment + * @cache_seg: Pointer to the cache segment structure. + * @seg_id: The segment ID to set as the next segment. 
+ * + * A pcache_cache allocates multiple cache segments, which are linked together + * through next_seg. When loading a pcache_cache, the first cache segment can + * be found using cache->seg_id, which allows access to all the cache segments. + */ +void cache_seg_set_next_seg(struct pcache_cache_segment *cache_seg, u32 seg_id) +{ + cache_seg->cache_seg_info.flags |= PCACHE_SEG_INFO_FLAGS_HAS_NEXT; + cache_seg->cache_seg_info.next_seg = seg_id; + cache_seg_info_write(cache_seg); +} + +int cache_seg_init(struct pcache_cache *cache, u32 seg_id, u32 cache_seg_id, + bool new_cache) +{ + struct pcache_cache_dev *cache_dev = cache->cache_dev; + struct pcache_cache_segment *cache_seg = &cache->segments[cache_seg_id]; + struct pcache_segment_init_options seg_options = { 0 }; + struct pcache_segment *segment = &cache_seg->segment; + int ret; + + cache_seg->cache = cache; + cache_seg->cache_seg_id = cache_seg_id; + spin_lock_init(&cache_seg->gen_lock); + atomic_set(&cache_seg->refs, 0); + mutex_init(&cache_seg->info_lock); + + /* init pcache_segment */ + seg_options.type = PCACHE_SEGMENT_TYPE_CACHE_DATA; + seg_options.data_off = PCACHE_CACHE_SEG_CTRL_OFF + PCACHE_CACHE_SEG_CTRL_SIZE; + seg_options.seg_id = seg_id; + seg_options.seg_info = &cache_seg->cache_seg_info; + pcache_segment_init(cache_dev, segment, &seg_options); + + cache_seg->cache_seg_ctrl = CACHE_DEV_SEGMENT(cache_dev, seg_id) + PCACHE_CACHE_SEG_CTRL_OFF; + + if (new_cache) { + cache_dev_zero_range(cache_dev, CACHE_DEV_SEGMENT(cache_dev, seg_id), + PCACHE_SEG_INFO_SIZE * PCACHE_META_INDEX_MAX + + PCACHE_CACHE_SEG_CTRL_SIZE); + + cache_seg_ctrl_init(cache_seg); + + cache_seg->info_index = 0; + cache_seg_info_write(cache_seg); + + /* clear outdated kset in segment */ + memcpy_flushcache(segment->data, &pcache_empty_kset, sizeof(struct pcache_cache_kset_onmedia)); + pmem_wmb(); + } else { + ret = cache_seg_meta_load(cache_seg); + if (ret) + goto err; + } + + return 0; +err: + return ret; +} + +/** + * get_cache_segment - Retrieves a free cache segment from the cache. + * @cache: Pointer to the cache structure. + * + * This function attempts to find a free cache segment that can be used. + * It locks the segment map and checks for the next available segment ID. + * If a free segment is found, it initializes it and returns a pointer to the + * cache segment structure. Returns NULL if no segments are available. 
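+ * The search starts from the ->last_cache_seg hint and wraps around to bit 0
+ * once before giving up, so free segments are handed out round-robin across
+ * the segment map.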
+ */ +struct pcache_cache_segment *get_cache_segment(struct pcache_cache *cache) +{ + struct pcache_cache_segment *cache_seg; + u32 seg_id; + + spin_lock(&cache->seg_map_lock); +again: + seg_id = find_next_zero_bit(cache->seg_map, cache->n_segs, cache->last_cache_seg); + if (seg_id == cache->n_segs) { + /* reset the hint of ->last_cache_seg and retry */ + if (cache->last_cache_seg) { + cache->last_cache_seg = 0; + goto again; + } + cache->cache_full = true; + spin_unlock(&cache->seg_map_lock); + return NULL; + } + + /* + * found an available cache_seg, mark it used in seg_map + * and update the search hint ->last_cache_seg + */ + __set_bit(seg_id, cache->seg_map); + cache->last_cache_seg = seg_id; + spin_unlock(&cache->seg_map_lock); + + cache_seg = &cache->segments[seg_id]; + cache_seg->cache_seg_id = seg_id; + + return cache_seg; +} + +static void cache_seg_gen_increase(struct pcache_cache_segment *cache_seg) +{ + spin_lock(&cache_seg->gen_lock); + cache_seg->gen++; + spin_unlock(&cache_seg->gen_lock); + + cache_seg_ctrl_write(cache_seg); +} + +void cache_seg_get(struct pcache_cache_segment *cache_seg) +{ + atomic_inc(&cache_seg->refs); +} + +static void cache_seg_invalidate(struct pcache_cache_segment *cache_seg) +{ + struct pcache_cache *cache; + + cache = cache_seg->cache; + cache_seg_gen_increase(cache_seg); + + spin_lock(&cache->seg_map_lock); + if (cache->cache_full) + cache->cache_full = false; + __clear_bit(cache_seg->cache_seg_id, cache->seg_map); + spin_unlock(&cache->seg_map_lock); + + pcache_defer_reqs_kick(CACHE_TO_PCACHE(cache)); + /* clean_work will clean the bad key in key_tree*/ + queue_work(cache_get_wq(cache), &cache->clean_work); +} + +void cache_seg_put(struct pcache_cache_segment *cache_seg) +{ + if (atomic_dec_and_test(&cache_seg->refs)) + cache_seg_invalidate(cache_seg); +} diff --git a/drivers/md/dm-pcache/cache_writeback.c b/drivers/md/dm-pcache/cache_writeback.c new file mode 100644 index 000000000000..87a82b3fe836 --- /dev/null +++ b/drivers/md/dm-pcache/cache_writeback.c @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/bio.h> + +#include "cache.h" +#include "backing_dev.h" +#include "cache_dev.h" +#include "dm_pcache.h" + +static void writeback_ctx_end(struct pcache_cache *cache, int ret) +{ + if (ret && !cache->writeback_ctx.ret) { + pcache_dev_err(CACHE_TO_PCACHE(cache), "writeback error: %d", ret); + cache->writeback_ctx.ret = ret; + } + + if (!atomic_dec_and_test(&cache->writeback_ctx.pending)) + return; + + if (!cache->writeback_ctx.ret) { + backing_dev_flush(cache->backing_dev); + + mutex_lock(&cache->dirty_tail_lock); + cache_pos_advance(&cache->dirty_tail, cache->writeback_ctx.advance); + cache_encode_dirty_tail(cache); + mutex_unlock(&cache->dirty_tail_lock); + } + queue_delayed_work(cache_get_wq(cache), &cache->writeback_work, 0); +} + +static void writeback_end_req(struct pcache_backing_dev_req *backing_req, int ret) +{ + struct pcache_cache *cache = backing_req->priv_data; + + mutex_lock(&cache->writeback_lock); + writeback_ctx_end(cache, ret); + mutex_unlock(&cache->writeback_lock); +} + +static inline bool is_cache_clean(struct pcache_cache *cache, struct pcache_cache_pos *dirty_tail) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_kset_onmedia *kset_onmedia; + u32 to_copy; + void *addr; + int ret; + + addr = cache_pos_addr(dirty_tail); + kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->wb_kset_onmedia_buf; + + to_copy = min(PCACHE_KSET_ONMEDIA_SIZE_MAX, PCACHE_SEG_SIZE - 
dirty_tail->seg_off); + ret = copy_mc_to_kernel(kset_onmedia, addr, to_copy); + if (ret) { + pcache_dev_err(pcache, "error to read kset: %d", ret); + return true; + } + + /* Check if the magic number matches the expected value */ + if (kset_onmedia->magic != PCACHE_KSET_MAGIC) { + pcache_dev_debug(pcache, "dirty_tail: %u:%u magic: %llx, not expected: %llx\n", + dirty_tail->cache_seg->cache_seg_id, dirty_tail->seg_off, + kset_onmedia->magic, PCACHE_KSET_MAGIC); + return true; + } + + /* Verify the CRC checksum for data integrity */ + if (kset_onmedia->crc != cache_kset_crc(kset_onmedia)) { + pcache_dev_debug(pcache, "dirty_tail: %u:%u crc: %x, not expected: %x\n", + dirty_tail->cache_seg->cache_seg_id, dirty_tail->seg_off, + cache_kset_crc(kset_onmedia), kset_onmedia->crc); + return true; + } + + return false; +} + +void cache_writeback_exit(struct pcache_cache *cache) +{ + cancel_delayed_work_sync(&cache->writeback_work); + backing_dev_flush(cache->backing_dev); + cache_tree_exit(&cache->writeback_key_tree); +} + +int cache_writeback_init(struct pcache_cache *cache) +{ + int ret; + + ret = cache_tree_init(cache, &cache->writeback_key_tree, 1); + if (ret) + goto err; + + atomic_set(&cache->writeback_ctx.pending, 0); + + /* Queue delayed work to start writeback handling */ + queue_delayed_work(cache_get_wq(cache), &cache->writeback_work, 0); + + return 0; +err: + return ret; +} + +static void cache_key_writeback(struct pcache_cache *cache, struct pcache_cache_key *key) +{ + struct pcache_backing_dev_req *writeback_req; + struct pcache_backing_dev_req_opts writeback_req_opts = { 0 }; + struct pcache_cache_pos *pos; + void *addr; + u32 seg_remain, req_len, done = 0; + + if (cache_key_clean(key)) + return; + + pos = &key->cache_pos; + + seg_remain = cache_seg_remain(pos); + BUG_ON(seg_remain < key->len); +next_req: + addr = cache_pos_addr(pos) + done; + req_len = backing_dev_req_coalesced_max_len(addr, key->len - done); + + writeback_req_opts.type = BACKING_DEV_REQ_TYPE_KMEM; + writeback_req_opts.gfp_mask = GFP_NOIO; + writeback_req_opts.end_fn = writeback_end_req; + writeback_req_opts.priv_data = cache; + + writeback_req_opts.kmem.data = addr; + writeback_req_opts.kmem.opf = REQ_OP_WRITE; + writeback_req_opts.kmem.len = req_len; + writeback_req_opts.kmem.backing_off = key->off + done; + + writeback_req = backing_dev_req_create(cache->backing_dev, &writeback_req_opts); + + atomic_inc(&cache->writeback_ctx.pending); + backing_dev_req_submit(writeback_req, true); + + done += req_len; + if (done < key->len) + goto next_req; +} + +static void cache_wb_tree_writeback(struct pcache_cache *cache, u32 advance) +{ + struct pcache_cache_tree *cache_tree = &cache->writeback_key_tree; + struct pcache_cache_subtree *cache_subtree; + struct rb_node *node; + struct pcache_cache_key *key; + u32 i; + + cache->writeback_ctx.ret = 0; + cache->writeback_ctx.advance = advance; + atomic_set(&cache->writeback_ctx.pending, 1); + + for (i = 0; i < cache_tree->n_subtrees; i++) { + cache_subtree = &cache_tree->subtrees[i]; + + node = rb_first(&cache_subtree->root); + while (node) { + key = CACHE_KEY(node); + node = rb_next(node); + + cache_key_writeback(cache, key); + cache_key_delete(key); + } + } + writeback_ctx_end(cache, 0); +} + +static int cache_kset_insert_tree(struct pcache_cache *cache, struct pcache_cache_kset_onmedia *kset_onmedia) +{ + struct pcache_cache_key_onmedia *key_onmedia; + struct pcache_cache_subtree *cache_subtree; + struct pcache_cache_key *key; + int ret; + u32 i; + + /* Iterate through all keys 
in the kset and write each back to storage */ + for (i = 0; i < kset_onmedia->key_num; i++) { + key_onmedia = &kset_onmedia->data[i]; + + key = cache_key_alloc(&cache->writeback_key_tree, GFP_NOIO); + ret = cache_key_decode(cache, key_onmedia, key); + if (ret) { + cache_key_put(key); + goto clear_tree; + } + + cache_subtree = get_subtree(&cache->writeback_key_tree, key->off); + spin_lock(&cache_subtree->tree_lock); + cache_key_insert(&cache->writeback_key_tree, key, true); + spin_unlock(&cache_subtree->tree_lock); + } + + return 0; +clear_tree: + cache_tree_clear(&cache->writeback_key_tree); + return ret; +} + +static void last_kset_writeback(struct pcache_cache *cache, + struct pcache_cache_kset_onmedia *last_kset_onmedia) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_segment *next_seg; + + pcache_dev_debug(pcache, "last kset, next: %u\n", last_kset_onmedia->next_cache_seg_id); + + next_seg = &cache->segments[last_kset_onmedia->next_cache_seg_id]; + + mutex_lock(&cache->dirty_tail_lock); + cache->dirty_tail.cache_seg = next_seg; + cache->dirty_tail.seg_off = 0; + cache_encode_dirty_tail(cache); + mutex_unlock(&cache->dirty_tail_lock); +} + +void cache_writeback_fn(struct work_struct *work) +{ + struct pcache_cache *cache = container_of(work, struct pcache_cache, writeback_work.work); + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_pos dirty_tail; + struct pcache_cache_kset_onmedia *kset_onmedia; + u32 delay; + int ret; + + mutex_lock(&cache->writeback_lock); + if (atomic_read(&cache->writeback_ctx.pending)) + goto unlock; + + if (pcache_is_stopping(pcache)) + goto unlock; + + kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->wb_kset_onmedia_buf; + + mutex_lock(&cache->dirty_tail_lock); + cache_pos_copy(&dirty_tail, &cache->dirty_tail); + mutex_unlock(&cache->dirty_tail_lock); + + if (is_cache_clean(cache, &dirty_tail)) { + delay = PCACHE_CACHE_WRITEBACK_INTERVAL; + goto queue_work; + } + + if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) { + last_kset_writeback(cache, kset_onmedia); + delay = 0; + goto queue_work; + } + + ret = cache_kset_insert_tree(cache, kset_onmedia); + if (ret) { + delay = PCACHE_CACHE_WRITEBACK_INTERVAL; + goto queue_work; + } + + cache_wb_tree_writeback(cache, get_kset_onmedia_size(kset_onmedia)); + delay = 0; +queue_work: + queue_delayed_work(cache_get_wq(cache), &cache->writeback_work, delay); +unlock: + mutex_unlock(&cache->writeback_lock); +} diff --git a/drivers/md/dm-pcache/dm_pcache.c b/drivers/md/dm-pcache/dm_pcache.c new file mode 100644 index 000000000000..e5f5936fa6f0 --- /dev/null +++ b/drivers/md/dm-pcache/dm_pcache.c @@ -0,0 +1,497 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/bio.h> + +#include "../dm-core.h" +#include "cache_dev.h" +#include "backing_dev.h" +#include "cache.h" +#include "dm_pcache.h" + +void pcache_defer_reqs_kick(struct dm_pcache *pcache) +{ + struct pcache_cache *cache = &pcache->cache; + + spin_lock(&cache->seg_map_lock); + if (!cache->cache_full) + queue_work(pcache->task_wq, &pcache->defered_req_work); + spin_unlock(&cache->seg_map_lock); +} + +static void defer_req(struct pcache_request *pcache_req) +{ + struct dm_pcache *pcache = pcache_req->pcache; + + BUG_ON(!list_empty(&pcache_req->list_node)); + + spin_lock(&pcache->defered_req_list_lock); + list_add(&pcache_req->list_node, &pcache->defered_req_list); + pcache_defer_reqs_kick(pcache); + spin_unlock(&pcache->defered_req_list_lock); 
+} + +static void defered_req_fn(struct work_struct *work) +{ + struct dm_pcache *pcache = container_of(work, struct dm_pcache, defered_req_work); + struct pcache_request *pcache_req; + LIST_HEAD(tmp_list); + int ret; + + if (pcache_is_stopping(pcache)) + return; + + spin_lock(&pcache->defered_req_list_lock); + list_splice_init(&pcache->defered_req_list, &tmp_list); + spin_unlock(&pcache->defered_req_list_lock); + + while (!list_empty(&tmp_list)) { + pcache_req = list_first_entry(&tmp_list, + struct pcache_request, list_node); + list_del_init(&pcache_req->list_node); + pcache_req->ret = 0; + ret = pcache_cache_handle_req(&pcache->cache, pcache_req); + if (ret == -EBUSY) + defer_req(pcache_req); + else + pcache_req_put(pcache_req, ret); + } +} + +void pcache_req_get(struct pcache_request *pcache_req) +{ + kref_get(&pcache_req->ref); +} + +static void end_req(struct kref *ref) +{ + struct pcache_request *pcache_req = container_of(ref, struct pcache_request, ref); + struct dm_pcache *pcache = pcache_req->pcache; + struct bio *bio = pcache_req->bio; + int ret = pcache_req->ret; + + if (ret == -EBUSY) { + pcache_req_get(pcache_req); + defer_req(pcache_req); + } else { + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); + + if (atomic_dec_and_test(&pcache->inflight_reqs)) + wake_up(&pcache->inflight_wq); + } +} + +void pcache_req_put(struct pcache_request *pcache_req, int ret) +{ + /* Set the return status if it is not already set */ + if (ret && !pcache_req->ret) + pcache_req->ret = ret; + + kref_put(&pcache_req->ref, end_req); +} + +static bool at_least_one_arg(struct dm_arg_set *as, char **error) +{ + if (!as->argc) { + *error = "Insufficient args"; + return false; + } + + return true; +} + +static int parse_cache_dev(struct dm_pcache *pcache, struct dm_arg_set *as, + char **error) +{ + int ret; + + if (!at_least_one_arg(as, error)) + return -EINVAL; + ret = dm_get_device(pcache->ti, dm_shift_arg(as), + BLK_OPEN_READ | BLK_OPEN_WRITE, + &pcache->cache_dev.dm_dev); + if (ret) { + *error = "Error opening cache device"; + return ret; + } + + return 0; +} + +static int parse_backing_dev(struct dm_pcache *pcache, struct dm_arg_set *as, + char **error) +{ + int ret; + + if (!at_least_one_arg(as, error)) + return -EINVAL; + + ret = dm_get_device(pcache->ti, dm_shift_arg(as), + BLK_OPEN_READ | BLK_OPEN_WRITE, + &pcache->backing_dev.dm_dev); + if (ret) { + *error = "Error opening backing device"; + return ret; + } + + return 0; +} + +static void pcache_init_opts(struct pcache_cache_options *opts) +{ + opts->cache_mode = PCACHE_CACHE_MODE_WRITEBACK; + opts->data_crc = false; +} + +static int parse_cache_opts(struct dm_pcache *pcache, struct dm_arg_set *as, + char **error) +{ + struct pcache_cache_options *opts = &pcache->opts; + static const struct dm_arg _args[] = { + {0, 4, "Invalid number of cache option arguments"}, + }; + unsigned int argc; + const char *arg; + int ret; + + pcache_init_opts(opts); + if (!as->argc) + return 0; + + ret = dm_read_arg_group(_args, as, &argc, error); + if (ret) + return -EINVAL; + + while (argc) { + arg = dm_shift_arg(as); + argc--; + + if (!strcmp(arg, "cache_mode")) { + arg = dm_shift_arg(as); + if (!strcmp(arg, "writeback")) { + opts->cache_mode = PCACHE_CACHE_MODE_WRITEBACK; + } else { + *error = "Invalid cache mode parameter"; + return -EINVAL; + } + argc--; + } else if (!strcmp(arg, "data_crc")) { + arg = dm_shift_arg(as); + if (!strcmp(arg, "true")) { + opts->data_crc = true; + } else if (!strcmp(arg, "false")) { + opts->data_crc = false; + } else { + 
*error = "Invalid data crc parameter"; + return -EINVAL; + } + argc--; + } else { + *error = "Unrecognised cache option requested"; + return -EINVAL; + } + } + + return 0; +} + +static int pcache_start(struct dm_pcache *pcache, char **error) +{ + int ret; + + ret = cache_dev_start(pcache); + if (ret) { + *error = "Failed to start cache dev"; + return ret; + } + + ret = backing_dev_start(pcache); + if (ret) { + *error = "Failed to start backing dev"; + goto stop_cache; + } + + ret = pcache_cache_start(pcache); + if (ret) { + *error = "Failed to start pcache"; + goto stop_backing; + } + + return 0; +stop_backing: + backing_dev_stop(pcache); +stop_cache: + cache_dev_stop(pcache); + + return ret; +} + +static void pcache_destroy_args(struct dm_pcache *pcache) +{ + if (pcache->cache_dev.dm_dev) + dm_put_device(pcache->ti, pcache->cache_dev.dm_dev); + if (pcache->backing_dev.dm_dev) + dm_put_device(pcache->ti, pcache->backing_dev.dm_dev); +} + +static int pcache_parse_args(struct dm_pcache *pcache, unsigned int argc, char **argv, + char **error) +{ + struct dm_arg_set as; + int ret; + + as.argc = argc; + as.argv = argv; + + /* + * Parse cache device + */ + ret = parse_cache_dev(pcache, &as, error); + if (ret) + return ret; + /* + * Parse backing device + */ + ret = parse_backing_dev(pcache, &as, error); + if (ret) + goto out; + /* + * Parse optional arguments + */ + ret = parse_cache_opts(pcache, &as, error); + if (ret) + goto out; + + return 0; +out: + pcache_destroy_args(pcache); + return ret; +} + +static int dm_pcache_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct mapped_device *md = ti->table->md; + struct dm_pcache *pcache; + int ret; + + if (md->map) { + ti->error = "Don't support table loading for live md"; + return -EOPNOTSUPP; + } + + /* Allocate memory for the cache structure */ + pcache = kzalloc(sizeof(struct dm_pcache), GFP_KERNEL); + if (!pcache) + return -ENOMEM; + + pcache->task_wq = alloc_workqueue("pcache-%s-wq", WQ_UNBOUND | WQ_MEM_RECLAIM, + 0, md->name); + if (!pcache->task_wq) { + ret = -ENOMEM; + goto free_pcache; + } + + spin_lock_init(&pcache->defered_req_list_lock); + INIT_LIST_HEAD(&pcache->defered_req_list); + INIT_WORK(&pcache->defered_req_work, defered_req_fn); + pcache->ti = ti; + + ret = pcache_parse_args(pcache, argc, argv, &ti->error); + if (ret) + goto destroy_wq; + + ret = pcache_start(pcache, &ti->error); + if (ret) + goto destroy_args; + + ti->num_flush_bios = 1; + ti->flush_supported = true; + ti->per_io_data_size = sizeof(struct pcache_request); + ti->private = pcache; + atomic_set(&pcache->inflight_reqs, 0); + atomic_set(&pcache->state, PCACHE_STATE_RUNNING); + init_waitqueue_head(&pcache->inflight_wq); + + return 0; +destroy_args: + pcache_destroy_args(pcache); +destroy_wq: + destroy_workqueue(pcache->task_wq); +free_pcache: + kfree(pcache); + + return ret; +} + +static void defer_req_stop(struct dm_pcache *pcache) +{ + struct pcache_request *pcache_req; + LIST_HEAD(tmp_list); + + flush_work(&pcache->defered_req_work); + + spin_lock(&pcache->defered_req_list_lock); + list_splice_init(&pcache->defered_req_list, &tmp_list); + spin_unlock(&pcache->defered_req_list_lock); + + while (!list_empty(&tmp_list)) { + pcache_req = list_first_entry(&tmp_list, + struct pcache_request, list_node); + list_del_init(&pcache_req->list_node); + pcache_req_put(pcache_req, -EIO); + } +} + +static void dm_pcache_dtr(struct dm_target *ti) +{ + struct dm_pcache *pcache; + + pcache = ti->private; + atomic_set(&pcache->state, PCACHE_STATE_STOPPING); + 
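/*
+ * Fail any still-deferred requests and wait for in-flight I/O to drain
+ * before the cache, backing device and cache device are stopped.
+ */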
defer_req_stop(pcache); + + wait_event(pcache->inflight_wq, + atomic_read(&pcache->inflight_reqs) == 0); + + pcache_cache_stop(pcache); + backing_dev_stop(pcache); + cache_dev_stop(pcache); + + pcache_destroy_args(pcache); + drain_workqueue(pcache->task_wq); + destroy_workqueue(pcache->task_wq); + + kfree(pcache); +} + +static int dm_pcache_map_bio(struct dm_target *ti, struct bio *bio) +{ + struct pcache_request *pcache_req = dm_per_bio_data(bio, sizeof(struct pcache_request)); + struct dm_pcache *pcache = ti->private; + int ret; + + pcache_req->pcache = pcache; + kref_init(&pcache_req->ref); + pcache_req->ret = 0; + pcache_req->bio = bio; + pcache_req->off = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; + pcache_req->data_len = bio->bi_iter.bi_size; + INIT_LIST_HEAD(&pcache_req->list_node); + atomic_inc(&pcache->inflight_reqs); + + ret = pcache_cache_handle_req(&pcache->cache, pcache_req); + if (ret == -EBUSY) + defer_req(pcache_req); + else + pcache_req_put(pcache_req, ret); + + return DM_MAPIO_SUBMITTED; +} + +static void dm_pcache_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, + unsigned int maxlen) +{ + struct dm_pcache *pcache = ti->private; + struct pcache_cache_dev *cache_dev = &pcache->cache_dev; + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + struct pcache_cache *cache = &pcache->cache; + unsigned int sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%x %u %u %u %u %x %u:%u %u:%u %u:%u", + cache_dev->sb_flags, + cache_dev->seg_num, + cache->n_segs, + bitmap_weight(cache->seg_map, cache->n_segs), + pcache_cache_get_gc_percent(cache), + cache->cache_info.flags, + cache->key_head.cache_seg->cache_seg_id, + cache->key_head.seg_off, + cache->dirty_tail.cache_seg->cache_seg_id, + cache->dirty_tail.seg_off, + cache->key_tail.cache_seg->cache_seg_id, + cache->key_tail.seg_off); + break; + case STATUSTYPE_TABLE: + DMEMIT("%s %s 4 cache_mode writeback crc %s", + cache_dev->dm_dev->name, + backing_dev->dm_dev->name, + cache_data_crc_on(cache) ? 
"true" : "false"); + break; + case STATUSTYPE_IMA: + *result = '\0'; + break; + } +} + +static int dm_pcache_message(struct dm_target *ti, unsigned int argc, + char **argv, char *result, unsigned int maxlen) +{ + struct dm_pcache *pcache = ti->private; + unsigned long val; + + if (argc != 2) + goto err; + + if (!strcasecmp(argv[0], "gc_percent")) { + if (kstrtoul(argv[1], 10, &val)) + goto err; + + return pcache_cache_set_gc_percent(&pcache->cache, val); + } +err: + return -EINVAL; +} + +static struct target_type dm_pcache_target = { + .name = "pcache", + .version = {0, 1, 0}, + .module = THIS_MODULE, + .features = DM_TARGET_SINGLETON, + .ctr = dm_pcache_ctr, + .dtr = dm_pcache_dtr, + .map = dm_pcache_map_bio, + .status = dm_pcache_status, + .message = dm_pcache_message, +}; + +static int __init dm_pcache_init(void) +{ + int ret; + + ret = pcache_backing_init(); + if (ret) + goto err; + + ret = pcache_cache_init(); + if (ret) + goto backing_exit; + + ret = dm_register_target(&dm_pcache_target); + if (ret) + goto cache_exit; + return 0; + +cache_exit: + pcache_cache_exit(); +backing_exit: + pcache_backing_exit(); +err: + return ret; +} +module_init(dm_pcache_init); + +static void __exit dm_pcache_exit(void) +{ + dm_unregister_target(&dm_pcache_target); + pcache_cache_exit(); + pcache_backing_exit(); +} +module_exit(dm_pcache_exit); + +MODULE_DESCRIPTION("dm-pcache Persistent Cache for block device"); +MODULE_AUTHOR("Dongsheng Yang <dongsheng.yang@linux.dev>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-pcache/dm_pcache.h b/drivers/md/dm-pcache/dm_pcache.h new file mode 100644 index 000000000000..b4e06be0c0b9 --- /dev/null +++ b/drivers/md/dm-pcache/dm_pcache.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _DM_PCACHE_H +#define _DM_PCACHE_H +#include <linux/device-mapper.h> + +#include "../dm-core.h" + +#define CACHE_DEV_TO_PCACHE(cache_dev) (container_of(cache_dev, struct dm_pcache, cache_dev)) +#define BACKING_DEV_TO_PCACHE(backing_dev) (container_of(backing_dev, struct dm_pcache, backing_dev)) +#define CACHE_TO_PCACHE(cache) (container_of(cache, struct dm_pcache, cache)) + +#define PCACHE_STATE_RUNNING 1 +#define PCACHE_STATE_STOPPING 2 + +struct pcache_cache_dev; +struct pcache_backing_dev; +struct pcache_cache; +struct pcache_cache_options; +struct dm_pcache { + struct dm_target *ti; + struct pcache_cache_dev cache_dev; + struct pcache_backing_dev backing_dev; + struct pcache_cache cache; + struct pcache_cache_options opts; + + spinlock_t defered_req_list_lock; + struct list_head defered_req_list; + struct workqueue_struct *task_wq; + + struct work_struct defered_req_work; + + atomic_t state; + atomic_t inflight_reqs; + wait_queue_head_t inflight_wq; +}; + +static inline bool pcache_is_stopping(struct dm_pcache *pcache) +{ + return (atomic_read(&pcache->state) == PCACHE_STATE_STOPPING); +} + +#define pcache_dev_err(pcache, fmt, ...) \ + pcache_err("%s " fmt, pcache->ti->table->md->name, ##__VA_ARGS__) +#define pcache_dev_info(pcache, fmt, ...) \ + pcache_info("%s " fmt, pcache->ti->table->md->name, ##__VA_ARGS__) +#define pcache_dev_debug(pcache, fmt, ...) 
\ + pcache_debug("%s " fmt, pcache->ti->table->md->name, ##__VA_ARGS__) + +struct pcache_request { + struct dm_pcache *pcache; + struct bio *bio; + + u64 off; + u32 data_len; + + struct kref ref; + int ret; + + struct list_head list_node; +}; + +void pcache_req_get(struct pcache_request *pcache_req); +void pcache_req_put(struct pcache_request *pcache_req, int ret); + +void pcache_defer_reqs_kick(struct dm_pcache *pcache); + +#endif /* _DM_PCACHE_H */ diff --git a/drivers/md/dm-pcache/pcache_internal.h b/drivers/md/dm-pcache/pcache_internal.h new file mode 100644 index 000000000000..d427e534727c --- /dev/null +++ b/drivers/md/dm-pcache/pcache_internal.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _PCACHE_INTERNAL_H +#define _PCACHE_INTERNAL_H + +#include <linux/delay.h> +#include <linux/crc32c.h> + +#define pcache_err(fmt, ...) \ + pr_err("dm-pcache: %s:%u " fmt, __func__, __LINE__, ##__VA_ARGS__) +#define pcache_info(fmt, ...) \ + pr_info("dm-pcache: %s:%u " fmt, __func__, __LINE__, ##__VA_ARGS__) +#define pcache_debug(fmt, ...) \ + pr_debug("dm-pcache: %s:%u " fmt, __func__, __LINE__, ##__VA_ARGS__) + +#define PCACHE_KB (1024ULL) +#define PCACHE_MB (1024 * PCACHE_KB) + +/* Maximum number of metadata indices */ +#define PCACHE_META_INDEX_MAX 2 + +#define PCACHE_CRC_SEED 0x3B15A +/* + * struct pcache_meta_header - PCACHE metadata header structure + * @crc: CRC checksum for validating metadata integrity. + * @seq: Sequence number to track metadata updates. + * @version: Metadata version. + * @res: Reserved space for future use. + */ +struct pcache_meta_header { + __u32 crc; + __u8 seq; + __u8 version; + __u16 res; +}; + +/* + * pcache_meta_crc - Calculate CRC for the given metadata header. + * @header: Pointer to the metadata header. + * @meta_size: Size of the metadata structure. + * + * Returns the CRC checksum calculated by excluding the CRC field itself. + */ +static inline u32 pcache_meta_crc(struct pcache_meta_header *header, u32 meta_size) +{ + return crc32c(PCACHE_CRC_SEED, (void *)header + 4, meta_size - 4); +} + +/* + * pcache_meta_seq_after - Check if a sequence number is more recent, accounting for overflow. + * @seq1: First sequence number. + * @seq2: Second sequence number. + * + * Determines if @seq1 is more recent than @seq2 by calculating the signed + * difference between them. This approach allows handling sequence number + * overflow correctly because the difference wraps naturally, and any value + * greater than zero indicates that @seq1 is "after" @seq2. This method + * assumes 8-bit unsigned sequence numbers, where the difference wraps + * around if seq1 overflows past seq2. + * + * Returns: + * - true if @seq1 is more recent than @seq2, indicating it comes "after" + * - false otherwise. + */ +static inline bool pcache_meta_seq_after(u8 seq1, u8 seq2) +{ + return (s8)(seq1 - seq2) > 0; +} + +/* + * pcache_meta_find_latest - Find the latest valid metadata. + * @header: Pointer to the metadata header. + * @meta_size: Size of each metadata block. + * + * Finds the latest valid metadata by checking sequence numbers. If a + * valid entry with the highest sequence number is found, its pointer + * is returned. Returns NULL if no valid metadata is found. 
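+ * On success the latest entry is also copied into @meta_ret; an uncorrectable
+ * memory error while reading the copies returns ERR_PTR(-EIO).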
+ */ +static inline void __must_check *pcache_meta_find_latest(struct pcache_meta_header *header, + u32 meta_size, u32 meta_max_size, + void *meta_ret) +{ + struct pcache_meta_header *meta, *latest = NULL; + u32 i, seq_latest = 0; + void *meta_addr; + + meta = meta_ret; + + for (i = 0; i < PCACHE_META_INDEX_MAX; i++) { + meta_addr = (void *)header + (i * meta_max_size); + if (copy_mc_to_kernel(meta, meta_addr, meta_size)) { + pcache_err("hardware memory error when copy meta"); + return ERR_PTR(-EIO); + } + + /* Skip if CRC check fails, which means corrupted */ + if (meta->crc != pcache_meta_crc(meta, meta_size)) + continue; + + /* Update latest if a more recent sequence is found */ + if (!latest || pcache_meta_seq_after(meta->seq, seq_latest)) { + seq_latest = meta->seq; + latest = (void *)header + (i * meta_max_size); + } + } + + if (!latest) + return NULL; + + if (copy_mc_to_kernel(meta_ret, latest, meta_size)) { + pcache_err("hardware memory error"); + return ERR_PTR(-EIO); + } + + return latest; +} + +#endif /* _PCACHE_INTERNAL_H */ diff --git a/drivers/md/dm-pcache/segment.c b/drivers/md/dm-pcache/segment.c new file mode 100644 index 000000000000..7e9818701445 --- /dev/null +++ b/drivers/md/dm-pcache/segment.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/dax.h> + +#include "pcache_internal.h" +#include "cache_dev.h" +#include "segment.h" + +int segment_copy_to_bio(struct pcache_segment *segment, + u32 data_off, u32 data_len, struct bio *bio, u32 bio_off) +{ + struct iov_iter iter; + size_t copied; + void *src; + + iov_iter_bvec(&iter, ITER_DEST, &bio->bi_io_vec[bio->bi_iter.bi_idx], + bio_segments(bio), bio->bi_iter.bi_size); + iter.iov_offset = bio->bi_iter.bi_bvec_done; + if (bio_off) + iov_iter_advance(&iter, bio_off); + + src = segment->data + data_off; + copied = _copy_mc_to_iter(src, data_len, &iter); + if (copied != data_len) + return -EIO; + + return 0; +} + +int segment_copy_from_bio(struct pcache_segment *segment, + u32 data_off, u32 data_len, struct bio *bio, u32 bio_off) +{ + struct iov_iter iter; + size_t copied; + void *dst; + + iov_iter_bvec(&iter, ITER_SOURCE, &bio->bi_io_vec[bio->bi_iter.bi_idx], + bio_segments(bio), bio->bi_iter.bi_size); + iter.iov_offset = bio->bi_iter.bi_bvec_done; + if (bio_off) + iov_iter_advance(&iter, bio_off); + + dst = segment->data + data_off; + copied = _copy_from_iter_flushcache(dst, data_len, &iter); + if (copied != data_len) + return -EIO; + pmem_wmb(); + + return 0; +} + +void pcache_segment_init(struct pcache_cache_dev *cache_dev, struct pcache_segment *segment, + struct pcache_segment_init_options *options) +{ + segment->seg_info = options->seg_info; + segment_info_set_type(segment->seg_info, options->type); + + segment->cache_dev = cache_dev; + segment->seg_id = options->seg_id; + segment->data_size = PCACHE_SEG_SIZE - options->data_off; + segment->data = CACHE_DEV_SEGMENT(cache_dev, options->seg_id) + options->data_off; +} diff --git a/drivers/md/dm-pcache/segment.h b/drivers/md/dm-pcache/segment.h new file mode 100644 index 000000000000..deca1ddcb02b --- /dev/null +++ b/drivers/md/dm-pcache/segment.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _PCACHE_SEGMENT_H +#define _PCACHE_SEGMENT_H + +#include <linux/bio.h> +#include <linux/bitfield.h> + +#include "pcache_internal.h" + +struct pcache_segment_info { + struct pcache_meta_header header; + __u32 flags; + __u32 next_seg; +}; + +#define PCACHE_SEG_INFO_FLAGS_HAS_NEXT BIT(0) + +#define PCACHE_SEG_INFO_FLAGS_TYPE_MASK 
GENMASK(4, 1) +#define PCACHE_SEGMENT_TYPE_CACHE_DATA 1 + +static inline bool segment_info_has_next(struct pcache_segment_info *seg_info) +{ + return (seg_info->flags & PCACHE_SEG_INFO_FLAGS_HAS_NEXT); +} + +static inline void segment_info_set_type(struct pcache_segment_info *seg_info, u8 type) +{ + seg_info->flags &= ~PCACHE_SEG_INFO_FLAGS_TYPE_MASK; + seg_info->flags |= FIELD_PREP(PCACHE_SEG_INFO_FLAGS_TYPE_MASK, type); +} + +static inline u8 segment_info_get_type(struct pcache_segment_info *seg_info) +{ + return FIELD_GET(PCACHE_SEG_INFO_FLAGS_TYPE_MASK, seg_info->flags); +} + +struct pcache_segment_pos { + struct pcache_segment *segment; /* Segment associated with the position */ + u32 off; /* Offset within the segment */ +}; + +struct pcache_segment_init_options { + u8 type; + u32 seg_id; + u32 data_off; + + struct pcache_segment_info *seg_info; +}; + +struct pcache_segment { + struct pcache_cache_dev *cache_dev; + + void *data; + u32 data_size; + u32 seg_id; + + struct pcache_segment_info *seg_info; +}; + +int segment_copy_to_bio(struct pcache_segment *segment, + u32 data_off, u32 data_len, struct bio *bio, u32 bio_off); +int segment_copy_from_bio(struct pcache_segment *segment, + u32 data_off, u32 data_len, struct bio *bio, u32 bio_off); + +static inline void segment_pos_advance(struct pcache_segment_pos *seg_pos, u32 len) +{ + BUG_ON(seg_pos->off + len > seg_pos->segment->data_size); + + seg_pos->off += len; +} + +void pcache_segment_init(struct pcache_cache_dev *cache_dev, struct pcache_segment *segment, + struct pcache_segment_init_options *options); +#endif /* _PCACHE_SEGMENT_H */ diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 0a1788fed68c..c6f7129e43d3 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3247,7 +3247,7 @@ size_check: rs_reset_inconclusive_reshape(rs); /* Start raid set read-only and assumed clean to change in raid_resume() */ - rs->md.ro = 1; + rs->md.ro = MD_RDONLY; rs->md.in_sync = 1; /* Has to be held on running the array */ @@ -3385,7 +3385,7 @@ static enum sync_state decipher_sync_action(struct mddev *mddev, unsigned long r /* The MD sync thread can be done with io or be interrupted but still be running */ if (!test_bit(MD_RECOVERY_DONE, &recovery) && (test_bit(MD_RECOVERY_RUNNING, &recovery) || - (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery)))) { + (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery)))) { if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) return st_reshape; @@ -3775,11 +3775,11 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, } else return -EINVAL; } - if (mddev->ro == 2) { + if (mddev->ro == MD_AUTO_READ) { /* A write to sync_action is enough to justify * canceling read-auto mode */ - mddev->ro = 0; + mddev->ro = MD_RDWR; if (!mddev->suspended) md_wakeup_thread(mddev->sync_thread); } @@ -3860,6 +3860,7 @@ static void raid_postsuspend(struct dm_target *ti) */ md_stop_writes(&rs->md); mddev_suspend(&rs->md, false); + rs->md.ro = MD_RDONLY; } } @@ -3972,7 +3973,7 @@ static void rs_update_sbs(struct raid_set *rs) int ro = mddev->ro; set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - mddev->ro = 0; + mddev->ro = MD_RDWR; md_update_sb(mddev, 1); mddev->ro = ro; } @@ -4131,7 +4132,7 @@ static void raid_resume(struct dm_target *ti) WARN_ON_ONCE(rcu_dereference_protected(mddev->sync_thread, lockdep_is_held(&mddev->reconfig_mutex))); clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags); - mddev->ro = 0; + mddev->ro = MD_RDWR; mddev->in_sync = 0; md_unfrozen_sync_thread(mddev); 
mddev_unlock_and_resume(mddev); diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index a4550975c27d..e9b47b659976 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -206,7 +206,7 @@ struct dm_region_hash *dm_region_hash_create( rh->shift = RH_HASH_SHIFT; rh->prime = RH_HASH_MULT; - rh->buckets = vmalloc(array_size(nr_buckets, sizeof(*rh->buckets))); + rh->buckets = vmalloc_array(nr_buckets, sizeof(*rh->buckets)); if (!rh->buckets) { DMERR("unable to allocate region hash bucket memory"); kfree(rh); diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index bb1a70b5a215..50a52ca50b34 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -114,8 +114,8 @@ static int alloc_region_table(struct dm_target *ti, unsigned int nr_paths) return -EINVAL; } - sctx->region_table = vmalloc(array_size(nr_slots, - sizeof(region_table_slot_t))); + sctx->region_table = vmalloc_array(nr_slots, + sizeof(region_table_slot_t)); if (!sctx->region_table) { ti->error = "Cannot allocate region table"; return -ENOMEM; diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 2af5a9514c05..8fede41adec0 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -263,7 +263,8 @@ static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, static struct target_type error_target = { .name = "error", .version = {1, 7, 0}, - .features = DM_TARGET_WILDCARD | DM_TARGET_ZONED_HM, + .features = DM_TARGET_WILDCARD | DM_TARGET_ZONED_HM | + DM_TARGET_PASSES_INTEGRITY, .ctr = io_err_ctr, .dtr = io_err_dtr, .map = io_err_map, diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 007bb93e5fca..c84149ba4e38 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3031,8 +3031,8 @@ static struct pool *pool_create(struct mapped_device *pool_md, } pool->cell_sort_array = - vmalloc(array_size(CELL_SORT_ARRAY_SIZE, - sizeof(*pool->cell_sort_array))); + vmalloc_array(CELL_SORT_ARRAY_SIZE, + sizeof(*pool->cell_sort_array)); if (!pool->cell_sort_array) { *error = "Error allocating cell sort array"; err_p = ERR_PTR(-ENOMEM); diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c index 810002747091..262e11581f2d 100644 --- a/drivers/md/dm-vdo/data-vio.c +++ b/drivers/md/dm-vdo/data-vio.c @@ -17,6 +17,7 @@ #include <linux/minmax.h> #include <linux/sched.h> #include <linux/spinlock.h> +#include <linux/string.h> #include <linux/wait.h> #include "logger.h" @@ -509,18 +510,6 @@ static void launch_data_vio(struct data_vio *data_vio, logical_block_number_t lb vdo_enqueue_completion(completion, VDO_DEFAULT_Q_MAP_BIO_PRIORITY); } -static bool is_zero_block(char *block) -{ - int i; - - for (i = 0; i < VDO_BLOCK_SIZE; i += sizeof(u64)) { - if (*((u64 *) &block[i])) - return false; - } - - return true; -} - static void copy_from_bio(struct bio *bio, char *data_ptr) { struct bio_vec biovec; @@ -572,7 +561,7 @@ static void launch_bio(struct vdo *vdo, struct data_vio *data_vio, struct bio *b * we acknowledge the bio. 
*/ copy_from_bio(bio, data_vio->vio.data); - data_vio->is_zero = is_zero_block(data_vio->vio.data); + data_vio->is_zero = mem_is_zero(data_vio->vio.data, VDO_BLOCK_SIZE); data_vio->write = true; } @@ -1459,7 +1448,7 @@ static void modify_for_partial_write(struct vdo_completion *completion) copy_from_bio(bio, data + data_vio->offset); } - data_vio->is_zero = is_zero_block(data); + data_vio->is_zero = mem_is_zero(data, VDO_BLOCK_SIZE); data_vio->read = false; launch_data_vio_logical_callback(data_vio, continue_data_vio_with_block_map_slot); diff --git a/drivers/md/dm-vdo/indexer/volume-index.c b/drivers/md/dm-vdo/indexer/volume-index.c index 12f954a0c532..afb062e1f1fb 100644 --- a/drivers/md/dm-vdo/indexer/volume-index.c +++ b/drivers/md/dm-vdo/indexer/volume-index.c @@ -836,7 +836,7 @@ static int start_restoring_volume_sub_index(struct volume_sub_index *sub_index, "%zu bytes decoded of %zu expected", offset, sizeof(buffer)); if (result != VDO_SUCCESS) - result = UDS_CORRUPT_DATA; + return UDS_CORRUPT_DATA; if (memcmp(header.magic, MAGIC_START_5, MAGIC_SIZE) != 0) { return vdo_log_warning_strerror(UDS_CORRUPT_DATA, @@ -928,7 +928,7 @@ static int start_restoring_volume_index(struct volume_index *volume_index, "%zu bytes decoded of %zu expected", offset, sizeof(buffer)); if (result != VDO_SUCCESS) - result = UDS_CORRUPT_DATA; + return UDS_CORRUPT_DATA; if (memcmp(header.magic, MAGIC_START_6, MAGIC_SIZE) != 0) return vdo_log_warning_strerror(UDS_CORRUPT_DATA, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7bd6fa05b00a..f5e5e59b232b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -490,18 +490,13 @@ u64 dm_start_time_ns_from_clone(struct bio *bio) } EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone); -static inline bool bio_is_flush_with_data(struct bio *bio) -{ - return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size); -} - static inline unsigned int dm_io_sectors(struct dm_io *io, struct bio *bio) { /* * If REQ_PREFLUSH set, don't account payload, it will be * submitted (and accounted) after this flush completes. */ - if (bio_is_flush_with_data(bio)) + if (io->requeue_flush_with_data) return 0; if (unlikely(dm_io_flagged(io, DM_IO_WAS_SPLIT))) return io->sectors; @@ -590,6 +585,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio, gfp_t g io = container_of(tio, struct dm_io, tio); io->magic = DM_IO_MAGIC; io->status = BLK_STS_OK; + io->requeue_flush_with_data = false; /* one ref is for submission, the other is for completion */ atomic_set(&io->io_count, 2); @@ -948,6 +944,7 @@ static void __dm_io_complete(struct dm_io *io, bool first_stage) struct mapped_device *md = io->md; blk_status_t io_error; bool requeued; + bool requeue_flush_with_data; requeued = dm_handle_requeue(io, first_stage); if (requeued && first_stage) @@ -964,6 +961,7 @@ static void __dm_io_complete(struct dm_io *io, bool first_stage) __dm_start_io_acct(io); dm_end_io_acct(io); } + requeue_flush_with_data = io->requeue_flush_with_data; free_io(io); smp_wmb(); this_cpu_dec(*md->pending_io); @@ -976,7 +974,7 @@ static void __dm_io_complete(struct dm_io *io, bool first_stage) if (requeued) return; - if (bio_is_flush_with_data(bio)) { + if (unlikely(requeue_flush_with_data)) { /* * Preflush done for flush with data, reissue * without REQ_PREFLUSH. 
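/*
 * A minimal, self-contained userspace model of the flush handling that the
 * dm.c hunks above and below implement: a REQ_PREFLUSH bio carrying a payload
 * is recorded via requeue_flush_with_data, only an empty flush is sent (and
 * accounted) first, and the payload is reissued without REQ_PREFLUSH once
 * that flush completes. All names here are illustrative stand-ins, not the
 * kernel API.
 */
#include <stdbool.h>
#include <stdio.h>

#define MODEL_REQ_PREFLUSH (1u << 0)	/* stand-in for the block-layer flag */

struct model_bio {
	unsigned int opf;	/* request flags */
	unsigned int size;	/* payload bytes */
};

struct model_io {
	struct model_bio *bio;
	bool requeue_flush_with_data;
};

/* Submission: a preflush with data becomes an empty flush now plus a deferred
 * data-only resubmission; the payload is not accounted at this point. */
static unsigned int model_submit(struct model_io *io)
{
	struct model_bio *bio = io->bio;

	if ((bio->opf & MODEL_REQ_PREFLUSH) && bio->size)
		io->requeue_flush_with_data = true;

	return io->requeue_flush_with_data ? 0 : bio->size;	/* accounted bytes */
}

/* Completion: once the empty flush is done, strip the flush flag and resubmit
 * the payload as an ordinary write. */
static void model_complete(struct model_io *io)
{
	if (io->requeue_flush_with_data) {
		io->bio->opf &= ~MODEL_REQ_PREFLUSH;
		printf("reissue %u bytes without PREFLUSH\n", io->bio->size);
	}
}

int main(void)
{
	struct model_bio bio = { .opf = MODEL_REQ_PREFLUSH, .size = 4096 };
	struct model_io io = { .bio = &bio };

	printf("accounted at submit: %u bytes\n", model_submit(&io));
	model_complete(&io);
	return 0;
}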
@@ -1996,12 +1994,30 @@ static void dm_split_and_process_bio(struct mapped_device *md, } init_clone_info(&ci, io, map, bio, is_abnormal); - if (bio->bi_opf & REQ_PREFLUSH) { + if (unlikely((bio->bi_opf & REQ_PREFLUSH) != 0)) { + /* + * The "flush_bypasses_map" is set on targets where it is safe + * to skip the map function and submit bios directly to the + * underlying block devices - currently, it is set for dm-linear + * and dm-stripe. + * + * If we have just one underlying device (i.e. there is one + * linear target or multiple linear targets pointing to the same + * device), we can send the flush with data directly to it. + */ + if (map->flush_bypasses_map) { + struct list_head *devices = dm_table_get_devices(map); + if (devices->next == devices->prev) + goto send_preflush_with_data; + } + if (bio->bi_iter.bi_size) + io->requeue_flush_with_data = true; __send_empty_flush(&ci); /* dm_io_complete submits any data associated with flush */ goto out; } +send_preflush_with_data: if (static_branch_unlikely(&zoned_enabled) && (bio_op(bio) == REQ_OP_ZONE_RESET_ALL)) { error = __send_zone_reset_all(&ci); @@ -2908,7 +2924,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, { bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; - int r; + int r = 0; lockdep_assert_held(&md->suspend_lock); @@ -2960,8 +2976,10 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, * Stop md->queue before flushing md->wq in case request-based * dm defers requests to md->wq from md->queue. */ - if (dm_request_based(md)) + if (map && dm_request_based(md)) { dm_stop_queue(md->queue); + set_bit(DMF_QUEUE_STOPPED, &md->flags); + } flush_workqueue(md->wq); @@ -2970,7 +2988,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, * We call dm_wait_for_completion to wait for all existing requests * to finish. */ - r = dm_wait_for_completion(md, task_state); + if (map) + r = dm_wait_for_completion(md, task_state); if (!r) set_bit(dmf_suspended_flag, &md->flags); @@ -2983,7 +3002,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, if (r < 0) { dm_queue_flush(md); - if (dm_request_based(md)) + if (test_and_clear_bit(DMF_QUEUE_STOPPED, &md->flags)) dm_start_queue(md->queue); unlock_fs(md); @@ -3067,7 +3086,7 @@ static int __dm_resume(struct mapped_device *md, struct dm_table *map) * so that mapping of targets can work correctly. * Request-based dm is queueing the deferred I/Os in its request_queue. 
*/ - if (dm_request_based(md)) + if (test_and_clear_bit(DMF_QUEUE_STOPPED, &md->flags)) dm_start_queue(md->queue); unlock_fs(md); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 1d0e0e7362bd..3fc33b1b4dfb 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9705,6 +9705,8 @@ static int __bnxt_hwrm_func_qcaps(struct bnxt *bp) flags_ext3 = le32_to_cpu(resp->flags_ext3); if (flags_ext3 & FUNC_QCAPS_RESP_FLAGS_EXT3_ROCE_VF_DYN_ALLOC_SUPPORT) bp->fw_cap |= BNXT_FW_CAP_ROCE_VF_DYN_ALLOC_SUPPORT; + if (flags_ext3 & FUNC_QCAPS_RESP_FLAGS_EXT3_MIRROR_ON_ROCE_SUPPORTED) + bp->fw_cap |= BNXT_FW_CAP_MIRROR_ON_ROCE; bp->tx_push_thresh = 0; if ((flags & FUNC_QCAPS_RESP_FLAGS_PUSH_MODE_SUPPORTED) && diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 06a4c2afdf8a..741b2d854789 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2514,6 +2514,7 @@ struct bnxt { #define BNXT_FW_CAP_VNIC_RE_FLUSH BIT_ULL(40) #define BNXT_FW_CAP_SW_MAX_RESOURCE_LIMITS BIT_ULL(41) #define BNXT_FW_CAP_NPAR_1_2 BIT_ULL(42) + #define BNXT_FW_CAP_MIRROR_ON_ROCE BIT_ULL(43) u32 fw_dbg_cap; @@ -2537,6 +2538,8 @@ struct bnxt { ((bp)->fw_cap & BNXT_FW_CAP_ROCE_VF_RESC_MGMT_SUPPORTED) #define BNXT_SW_RES_LMT(bp) \ ((bp)->fw_cap & BNXT_FW_CAP_SW_MAX_RESOURCE_LIMITS) +#define BNXT_MIRROR_ON_ROCE_CAP(bp) \ + ((bp)->fw_cap & BNXT_FW_CAP_MIRROR_ON_ROCE) u32 hwrm_spec_code; u16 hwrm_cmd_seq; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index 61cf201bb0dc..f8c2c72b382d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -100,6 +100,12 @@ void bnxt_set_dflt_ulp_stat_ctxs(struct bnxt *bp) if (BNXT_PF(bp) && !bp->pf.port_id && bp->port_count > 1) bp->edev->ulp_num_ctxs++; + + /* Reserve one additional stat_ctx when the device is capable + * of supporting port mirroring on RDMA device. + */ + if (BNXT_MIRROR_ON_ROCE_CAP(bp)) + bp->edev->ulp_num_ctxs++; } } diff --git a/drivers/net/ethernet/pensando/Kconfig b/drivers/net/ethernet/pensando/Kconfig index 01fe76786f77..c99758adf3ad 100644 --- a/drivers/net/ethernet/pensando/Kconfig +++ b/drivers/net/ethernet/pensando/Kconfig @@ -24,6 +24,7 @@ config IONIC select NET_DEVLINK select DIMLIB select PAGE_POOL + select AUXILIARY_BUS help This enables the support for the Pensando family of Ethernet adapters. 
More specific information on this driver can be diff --git a/drivers/net/ethernet/pensando/ionic/Makefile b/drivers/net/ethernet/pensando/ionic/Makefile index 4e7642a2d25f..a598972fef41 100644 --- a/drivers/net/ethernet/pensando/ionic/Makefile +++ b/drivers/net/ethernet/pensando/ionic/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_IONIC) := ionic.o ionic-y := ionic_main.o ionic_bus_pci.o ionic_devlink.o ionic_dev.o \ ionic_debugfs.o ionic_lif.o ionic_rx_filter.o ionic_ethtool.o \ - ionic_txrx.o ionic_stats.o ionic_fw.o + ionic_txrx.o ionic_stats.o ionic_fw.o ionic_aux.o ionic-$(CONFIG_PTP_1588_CLOCK) += ionic_phc.o diff --git a/drivers/net/ethernet/pensando/ionic/ionic.h b/drivers/net/ethernet/pensando/ionic/ionic.h index 04f00ea94230..85198e6a806e 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic.h +++ b/drivers/net/ethernet/pensando/ionic/ionic.h @@ -65,16 +65,9 @@ struct ionic { int watchdog_period; }; -struct ionic_admin_ctx { - struct completion work; - union ionic_adminq_cmd cmd; - union ionic_adminq_comp comp; -}; - int ionic_adminq_post(struct ionic_lif *lif, struct ionic_admin_ctx *ctx); int ionic_adminq_wait(struct ionic_lif *lif, struct ionic_admin_ctx *ctx, const int err, const bool do_msg); -int ionic_adminq_post_wait(struct ionic_lif *lif, struct ionic_admin_ctx *ctx); int ionic_adminq_post_wait_nomsg(struct ionic_lif *lif, struct ionic_admin_ctx *ctx); void ionic_adminq_netdev_err_print(struct ionic_lif *lif, u8 opcode, u8 status, int err); diff --git a/drivers/net/ethernet/pensando/ionic/ionic_api.h b/drivers/net/ethernet/pensando/ionic/ionic_api.h new file mode 100644 index 000000000000..bd88666836b8 --- /dev/null +++ b/drivers/net/ethernet/pensando/ionic/ionic_api.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. */ + +#ifndef _IONIC_API_H_ +#define _IONIC_API_H_ + +#include <linux/auxiliary_bus.h> +#include "ionic_if.h" +#include "ionic_regs.h" + +/** + * struct ionic_aux_dev - Auxiliary device information + * @lif: Logical interface + * @idx: Index identifier + * @adev: Auxiliary device + */ +struct ionic_aux_dev { + struct ionic_lif *lif; + int idx; + struct auxiliary_device adev; +}; + +/** + * struct ionic_admin_ctx - Admin command context + * @work: Work completion wait queue element + * @cmd: Admin command (64B) to be copied to the queue + * @comp: Admin completion (16B) copied from the queue + */ +struct ionic_admin_ctx { + struct completion work; + union ionic_adminq_cmd cmd; + union ionic_adminq_comp comp; +}; + +#define IONIC_INTR_INDEX_NOT_ASSIGNED -1 +#define IONIC_INTR_NAME_MAX_SZ 32 + +/** + * struct ionic_intr_info - Interrupt information + * @name: Name identifier + * @rearm_count: Interrupt rearm count + * @index: Interrupt index position + * @vector: Interrupt number + * @dim_coal_hw: Interrupt coalesce value in hardware units + * @affinity_mask: CPU affinity mask + * @aff_notify: context for notification of IRQ affinity changes + */ +struct ionic_intr_info { + char name[IONIC_INTR_NAME_MAX_SZ]; + u64 rearm_count; + unsigned int index; + unsigned int vector; + u32 dim_coal_hw; + cpumask_var_t *affinity_mask; + struct irq_affinity_notify aff_notify; +}; + +/** + * ionic_adminq_post_wait - Post an admin command and wait for response + * @lif: Logical interface + * @ctx: API admin command context + * + * Post the command to an admin queue in the ethernet driver. If this command + * succeeds, then the command has been posted, but that does not indicate a + * completion. 
If this command returns success, then the completion callback + * will eventually be called. + * + * Return: zero or negative error status + */ +int ionic_adminq_post_wait(struct ionic_lif *lif, struct ionic_admin_ctx *ctx); + +/** + * ionic_error_to_errno - Transform ionic_if errors to os errno + * @code: Ionic error number + * + * Return: Negative OS error number or zero + */ +int ionic_error_to_errno(enum ionic_status_code code); + +/** + * ionic_request_rdma_reset - request reset or disable the device or lif + * @lif: Logical interface + * + * The reset is triggered asynchronously. It will wait until reset request + * completes or times out. + */ +void ionic_request_rdma_reset(struct ionic_lif *lif); + +/** + * ionic_intr_alloc - Reserve a device interrupt + * @lif: Logical interface + * @intr: Reserved ionic interrupt structure + * + * Reserve an interrupt index and get irq number for that index. + * + * Return: zero or negative error status + */ +int ionic_intr_alloc(struct ionic_lif *lif, struct ionic_intr_info *intr); + +/** + * ionic_intr_free - Release a device interrupt index + * @lif: Logical interface + * @intr: Interrupt index + * + * Mark the interrupt index unused so that it can be reserved again. + */ +void ionic_intr_free(struct ionic_lif *lif, int intr); + +/** + * ionic_get_cmb - Reserve cmb pages + * @lif: Logical interface + * @pgid: First page index + * @pgaddr: First page bus addr (contiguous) + * @order: Log base two number of pages (PAGE_SIZE) + * @stride_log2: Size of stride to determine CMB pool + * @expdb: Will be set to true if this CMB region has expdb enabled + * + * Return: zero or negative error status + */ +int ionic_get_cmb(struct ionic_lif *lif, u32 *pgid, phys_addr_t *pgaddr, + int order, u8 stride_log2, bool *expdb); + +/** + * ionic_put_cmb - Release cmb pages + * @lif: Logical interface + * @pgid: First page index + * @order: Log base two number of pages (PAGE_SIZE) + */ +void ionic_put_cmb(struct ionic_lif *lif, u32 pgid, int order); + +#endif /* _IONIC_API_H_ */ diff --git a/drivers/net/ethernet/pensando/ionic/ionic_aux.c b/drivers/net/ethernet/pensando/ionic/ionic_aux.c new file mode 100644 index 000000000000..a2be338eb3e5 --- /dev/null +++ b/drivers/net/ethernet/pensando/ionic/ionic_aux.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. 
*/ + +#include <linux/kernel.h> +#include "ionic.h" +#include "ionic_lif.h" +#include "ionic_aux.h" + +static DEFINE_IDA(aux_ida); + +static void ionic_auxbus_release(struct device *dev) +{ + struct ionic_aux_dev *ionic_adev; + + ionic_adev = container_of(dev, struct ionic_aux_dev, adev.dev); + ida_free(&aux_ida, ionic_adev->adev.id); + kfree(ionic_adev); +} + +int ionic_auxbus_register(struct ionic_lif *lif) +{ + struct ionic_aux_dev *ionic_adev; + struct auxiliary_device *aux_dev; + int err, id; + + if (!(le64_to_cpu(lif->ionic->ident.lif.capabilities) & IONIC_LIF_CAP_RDMA)) + return 0; + + ionic_adev = kzalloc(sizeof(*ionic_adev), GFP_KERNEL); + if (!ionic_adev) + return -ENOMEM; + + aux_dev = &ionic_adev->adev; + + id = ida_alloc(&aux_ida, GFP_KERNEL); + if (id < 0) { + dev_err(lif->ionic->dev, "Failed to allocate aux id: %d\n", id); + kfree(ionic_adev); + return id; + } + + aux_dev->id = id; + aux_dev->name = "rdma"; + aux_dev->dev.parent = &lif->ionic->pdev->dev; + aux_dev->dev.release = ionic_auxbus_release; + ionic_adev->lif = lif; + err = auxiliary_device_init(aux_dev); + if (err) { + dev_err(lif->ionic->dev, "Failed to initialize %s aux device: %d\n", + aux_dev->name, err); + ida_free(&aux_ida, id); + kfree(ionic_adev); + return err; + } + + err = auxiliary_device_add(aux_dev); + if (err) { + dev_err(lif->ionic->dev, "Failed to add %s aux device: %d\n", + aux_dev->name, err); + auxiliary_device_uninit(aux_dev); + return err; + } + + lif->ionic_adev = ionic_adev; + return 0; +} + +void ionic_auxbus_unregister(struct ionic_lif *lif) +{ + mutex_lock(&lif->adev_lock); + if (!lif->ionic_adev) + goto out; + + auxiliary_device_delete(&lif->ionic_adev->adev); + auxiliary_device_uninit(&lif->ionic_adev->adev); + + lif->ionic_adev = NULL; +out: + mutex_unlock(&lif->adev_lock); +} + +void ionic_request_rdma_reset(struct ionic_lif *lif) +{ + struct ionic *ionic = lif->ionic; + int err; + + union ionic_dev_cmd cmd = { + .cmd.opcode = IONIC_CMD_RDMA_RESET_LIF, + .cmd.lif_index = cpu_to_le16(lif->index), + }; + + mutex_lock(&ionic->dev_cmd_lock); + + ionic_dev_cmd_go(&ionic->idev, &cmd); + err = ionic_dev_cmd_wait(ionic, DEVCMD_TIMEOUT); + + mutex_unlock(&ionic->dev_cmd_lock); + + if (err) + pr_warn("%s request_reset: error %d\n", __func__, err); +} +EXPORT_SYMBOL_NS(ionic_request_rdma_reset, "NET_IONIC"); diff --git a/drivers/net/ethernet/pensando/ionic/ionic_aux.h b/drivers/net/ethernet/pensando/ionic/ionic_aux.h new file mode 100644 index 000000000000..f5528a9f187d --- /dev/null +++ b/drivers/net/ethernet/pensando/ionic/ionic_aux.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. 
*/ + +#ifndef _IONIC_AUX_H_ +#define _IONIC_AUX_H_ + +int ionic_auxbus_register(struct ionic_lif *lif); +void ionic_auxbus_unregister(struct ionic_lif *lif); + +#endif /* _IONIC_AUX_H_ */ diff --git a/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c b/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c index 136bfa3516d0..70d86c5f52fb 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c @@ -9,6 +9,7 @@ #include "ionic.h" #include "ionic_bus.h" #include "ionic_lif.h" +#include "ionic_aux.h" #include "ionic_debugfs.h" /* Supported devices */ @@ -271,6 +272,8 @@ static int ionic_setup_one(struct ionic *ionic) } ionic_debugfs_add_ident(ionic); + ionic_map_cmb(ionic); + err = ionic_init(ionic); if (err) { dev_err(dev, "Cannot init device: %d, aborting\n", err); @@ -375,6 +378,8 @@ static int ionic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_out_deregister_devlink; } + ionic_auxbus_register(ionic->lif); + mod_timer(&ionic->watchdog_timer, round_jiffies(jiffies + ionic->watchdog_period)); ionic_queue_doorbell_check(ionic, IONIC_NAPI_DEADLINE); @@ -416,6 +421,7 @@ static void ionic_remove(struct pci_dev *pdev) if (ionic->lif->doorbell_wa) cancel_delayed_work_sync(&ionic->doorbell_check_dwork); + ionic_auxbus_unregister(ionic->lif); ionic_lif_unregister(ionic->lif); ionic_devlink_unregister(ionic); ionic_lif_deinit(ionic->lif); @@ -445,6 +451,7 @@ static void ionic_reset_prepare(struct pci_dev *pdev) timer_delete_sync(&ionic->watchdog_timer); cancel_work_sync(&lif->deferred.work); + ionic_auxbus_unregister(ionic->lif); mutex_lock(&lif->queue_lock); ionic_stop_queues_reconfig(lif); ionic_txrx_free(lif); diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.c b/drivers/net/ethernet/pensando/ionic/ionic_dev.c index 093c5358b6e8..ab27e9225c1e 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_dev.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.c @@ -199,13 +199,201 @@ void ionic_init_devinfo(struct ionic *ionic) dev_dbg(ionic->dev, "fw_version %s\n", idev->dev_info.fw_version); } +static void ionic_map_disc_cmb(struct ionic *ionic) +{ + struct ionic_identity *ident = &ionic->ident; + u32 length_reg0, length, offset, num_regions; + struct ionic_dev_bar *bar = ionic->bars; + struct ionic_dev *idev = &ionic->idev; + struct device *dev = ionic->dev; + int err, sz, i; + u64 end; + + mutex_lock(&ionic->dev_cmd_lock); + + ionic_dev_cmd_discover_cmb(idev); + err = ionic_dev_cmd_wait(ionic, DEVCMD_TIMEOUT); + if (!err) { + sz = min(sizeof(ident->cmb_layout), + sizeof(idev->dev_cmd_regs->data)); + memcpy_fromio(&ident->cmb_layout, + &idev->dev_cmd_regs->data, sz); + } + mutex_unlock(&ionic->dev_cmd_lock); + + if (err) { + dev_warn(dev, "Cannot discover CMB layout, disabling CMB\n"); + return; + } + + bar += 2; + + num_regions = le32_to_cpu(ident->cmb_layout.num_regions); + if (!num_regions || num_regions > IONIC_MAX_CMB_REGIONS) { + dev_warn(dev, "Invalid number of CMB entries (%d)\n", + num_regions); + return; + } + + dev_dbg(dev, "ionic_cmb_layout_identity num_regions %d flags %x:\n", + num_regions, ident->cmb_layout.flags); + + for (i = 0; i < num_regions; i++) { + offset = le32_to_cpu(ident->cmb_layout.region[i].offset); + length = le32_to_cpu(ident->cmb_layout.region[i].length); + end = offset + length; + + dev_dbg(dev, "CMB entry %d: bar_num %u cmb_type %u offset %x length %u\n", + i, ident->cmb_layout.region[i].bar_num, + ident->cmb_layout.region[i].cmb_type, + offset, length); + + if (end > 
(bar->len >> IONIC_CMB_SHIFT_64K)) { + dev_warn(dev, "Out of bounds CMB region %d offset %x length %u\n", + i, offset, length); + return; + } + } + + /* if first entry matches PCI config, expdb is not supported */ + if (ident->cmb_layout.region[0].bar_num == bar->res_index && + le32_to_cpu(ident->cmb_layout.region[0].length) == bar->len && + !ident->cmb_layout.region[0].offset) { + dev_warn(dev, "No CMB mapping discovered\n"); + return; + } + + /* process first entry for regular mapping */ + length_reg0 = le32_to_cpu(ident->cmb_layout.region[0].length); + if (!length_reg0) { + dev_warn(dev, "region len = 0. No CMB mapping discovered\n"); + return; + } + + /* Verify first entry size matches expected 8MB size (in 64KB pages) */ + if (length_reg0 != IONIC_BAR2_CMB_ENTRY_SIZE >> IONIC_CMB_SHIFT_64K) { + dev_warn(dev, "Unexpected CMB size in entry 0: %u pages\n", + length_reg0); + return; + } + + sz = BITS_TO_LONGS((length_reg0 << IONIC_CMB_SHIFT_64K) / + PAGE_SIZE) * sizeof(long); + idev->cmb_inuse = kzalloc(sz, GFP_KERNEL); + if (!idev->cmb_inuse) { + dev_warn(dev, "No memory for CMB, disabling\n"); + idev->phy_cmb_pages = 0; + idev->phy_cmb_expdb64_pages = 0; + idev->phy_cmb_expdb128_pages = 0; + idev->phy_cmb_expdb256_pages = 0; + idev->phy_cmb_expdb512_pages = 0; + idev->cmb_npages = 0; + return; + } + + for (i = 0; i < num_regions; i++) { + /* check this region matches first region length as to + * ease implementation + */ + if (le32_to_cpu(ident->cmb_layout.region[i].length) != + length_reg0) + continue; + + offset = le32_to_cpu(ident->cmb_layout.region[i].offset); + + switch (ident->cmb_layout.region[i].cmb_type) { + case IONIC_CMB_TYPE_DEVMEM: + idev->phy_cmb_pages = bar->bus_addr + offset; + idev->cmb_npages = + (length_reg0 << IONIC_CMB_SHIFT_64K) / PAGE_SIZE; + dev_dbg(dev, "regular cmb mapping: bar->bus_addr %pa region[%d].length %u\n", + &bar->bus_addr, i, length); + dev_dbg(dev, "idev->phy_cmb_pages %pad, idev->cmb_npages %u\n", + &idev->phy_cmb_pages, idev->cmb_npages); + break; + + case IONIC_CMB_TYPE_EXPDB64: + idev->phy_cmb_expdb64_pages = + bar->bus_addr + (offset << IONIC_CMB_SHIFT_64K); + dev_dbg(dev, "idev->phy_cmb_expdb64_pages %pad\n", + &idev->phy_cmb_expdb64_pages); + break; + + case IONIC_CMB_TYPE_EXPDB128: + idev->phy_cmb_expdb128_pages = + bar->bus_addr + (offset << IONIC_CMB_SHIFT_64K); + dev_dbg(dev, "idev->phy_cmb_expdb128_pages %pad\n", + &idev->phy_cmb_expdb128_pages); + break; + + case IONIC_CMB_TYPE_EXPDB256: + idev->phy_cmb_expdb256_pages = + bar->bus_addr + (offset << IONIC_CMB_SHIFT_64K); + dev_dbg(dev, "idev->phy_cmb_expdb256_pages %pad\n", + &idev->phy_cmb_expdb256_pages); + break; + + case IONIC_CMB_TYPE_EXPDB512: + idev->phy_cmb_expdb512_pages = + bar->bus_addr + (offset << IONIC_CMB_SHIFT_64K); + dev_dbg(dev, "idev->phy_cmb_expdb512_pages %pad\n", + &idev->phy_cmb_expdb512_pages); + break; + + default: + dev_warn(dev, "[%d] Invalid cmb_type (%d)\n", + i, ident->cmb_layout.region[i].cmb_type); + break; + } + } +} + +static void ionic_map_classic_cmb(struct ionic *ionic) +{ + struct ionic_dev_bar *bar = ionic->bars; + struct ionic_dev *idev = &ionic->idev; + struct device *dev = ionic->dev; + int sz; + + bar += 2; + /* classic CMB mapping */ + idev->phy_cmb_pages = bar->bus_addr; + idev->cmb_npages = bar->len / PAGE_SIZE; + dev_dbg(dev, "classic cmb mapping: bar->bus_addr %pa bar->len %lu\n", + &bar->bus_addr, bar->len); + dev_dbg(dev, "idev->phy_cmb_pages %pad, idev->cmb_npages %u\n", + &idev->phy_cmb_pages, idev->cmb_npages); + + sz = 
BITS_TO_LONGS(idev->cmb_npages) * sizeof(long); + idev->cmb_inuse = kzalloc(sz, GFP_KERNEL); + if (!idev->cmb_inuse) { + idev->phy_cmb_pages = 0; + idev->cmb_npages = 0; + } +} + +void ionic_map_cmb(struct ionic *ionic) +{ + struct pci_dev *pdev = ionic->pdev; + struct device *dev = ionic->dev; + + if (!(pci_resource_flags(pdev, 4) & IORESOURCE_MEM)) { + dev_dbg(dev, "No CMB, disabling\n"); + return; + } + + if (ionic->ident.dev.capabilities & cpu_to_le64(IONIC_DEV_CAP_DISC_CMB)) + ionic_map_disc_cmb(ionic); + else + ionic_map_classic_cmb(ionic); +} + int ionic_dev_setup(struct ionic *ionic) { struct ionic_dev_bar *bar = ionic->bars; unsigned int num_bars = ionic->num_bars; struct ionic_dev *idev = &ionic->idev; struct device *dev = ionic->dev; - int size; u32 sig; int err; @@ -255,16 +443,11 @@ int ionic_dev_setup(struct ionic *ionic) mutex_init(&idev->cmb_inuse_lock); if (num_bars < 3 || !ionic->bars[IONIC_PCI_BAR_CMB].len) { idev->cmb_inuse = NULL; + idev->phy_cmb_pages = 0; + idev->cmb_npages = 0; return 0; } - idev->phy_cmb_pages = bar->bus_addr; - idev->cmb_npages = bar->len / PAGE_SIZE; - size = BITS_TO_LONGS(idev->cmb_npages) * sizeof(long); - idev->cmb_inuse = kzalloc(size, GFP_KERNEL); - if (!idev->cmb_inuse) - dev_warn(dev, "No memory for CMB, disabling\n"); - return 0; } @@ -277,6 +460,11 @@ void ionic_dev_teardown(struct ionic *ionic) idev->phy_cmb_pages = 0; idev->cmb_npages = 0; + idev->phy_cmb_expdb64_pages = 0; + idev->phy_cmb_expdb128_pages = 0; + idev->phy_cmb_expdb256_pages = 0; + idev->phy_cmb_expdb512_pages = 0; + if (ionic->wq) { destroy_workqueue(ionic->wq); ionic->wq = NULL; @@ -698,28 +886,79 @@ void ionic_dev_cmd_adminq_init(struct ionic_dev *idev, struct ionic_qcq *qcq, ionic_dev_cmd_go(idev, &cmd); } +void ionic_dev_cmd_discover_cmb(struct ionic_dev *idev) +{ + union ionic_dev_cmd cmd = { + .discover_cmb.opcode = IONIC_CMD_DISCOVER_CMB, + }; + + ionic_dev_cmd_go(idev, &cmd); +} + int ionic_db_page_num(struct ionic_lif *lif, int pid) { return (lif->hw_index * lif->dbid_count) + pid; } -int ionic_get_cmb(struct ionic_lif *lif, u32 *pgid, phys_addr_t *pgaddr, int order) +int ionic_get_cmb(struct ionic_lif *lif, u32 *pgid, phys_addr_t *pgaddr, + int order, u8 stride_log2, bool *expdb) { struct ionic_dev *idev = &lif->ionic->idev; - int ret; + void __iomem *nonexpdb_pgptr; + phys_addr_t nonexpdb_pgaddr; + int i, idx; mutex_lock(&idev->cmb_inuse_lock); - ret = bitmap_find_free_region(idev->cmb_inuse, idev->cmb_npages, order); + idx = bitmap_find_free_region(idev->cmb_inuse, idev->cmb_npages, order); mutex_unlock(&idev->cmb_inuse_lock); - if (ret < 0) - return ret; + if (idx < 0) + return idx; + + *pgid = (u32)idx; + + if (idev->phy_cmb_expdb64_pages && + stride_log2 == IONIC_EXPDB_64B_WQE_LG2) { + *pgaddr = idev->phy_cmb_expdb64_pages + idx * PAGE_SIZE; + if (expdb) + *expdb = true; + } else if (idev->phy_cmb_expdb128_pages && + stride_log2 == IONIC_EXPDB_128B_WQE_LG2) { + *pgaddr = idev->phy_cmb_expdb128_pages + idx * PAGE_SIZE; + if (expdb) + *expdb = true; + } else if (idev->phy_cmb_expdb256_pages && + stride_log2 == IONIC_EXPDB_256B_WQE_LG2) { + *pgaddr = idev->phy_cmb_expdb256_pages + idx * PAGE_SIZE; + if (expdb) + *expdb = true; + } else if (idev->phy_cmb_expdb512_pages && + stride_log2 == IONIC_EXPDB_512B_WQE_LG2) { + *pgaddr = idev->phy_cmb_expdb512_pages + idx * PAGE_SIZE; + if (expdb) + *expdb = true; + } else { + *pgaddr = idev->phy_cmb_pages + idx * PAGE_SIZE; + if (expdb) + *expdb = false; + } - *pgid = ret; - *pgaddr = idev->phy_cmb_pages + ret * 
PAGE_SIZE; + /* clear the requested CMB region, 1 PAGE_SIZE ioremap at a time */ + nonexpdb_pgaddr = idev->phy_cmb_pages + idx * PAGE_SIZE; + for (i = 0; i < (1 << order); i++) { + nonexpdb_pgptr = + ioremap_wc(nonexpdb_pgaddr + i * PAGE_SIZE, PAGE_SIZE); + if (!nonexpdb_pgptr) { + ionic_put_cmb(lif, *pgid, order); + return -ENOMEM; + } + memset_io(nonexpdb_pgptr, 0, PAGE_SIZE); + iounmap(nonexpdb_pgptr); + } return 0; } +EXPORT_SYMBOL_NS(ionic_get_cmb, "NET_IONIC"); void ionic_put_cmb(struct ionic_lif *lif, u32 pgid, int order) { @@ -729,6 +968,7 @@ void ionic_put_cmb(struct ionic_lif *lif, u32 pgid, int order) bitmap_release_region(idev->cmb_inuse, pgid, order); mutex_unlock(&idev->cmb_inuse_lock); } +EXPORT_SYMBOL_NS(ionic_put_cmb, "NET_IONIC"); int ionic_cq_init(struct ionic_lif *lif, struct ionic_cq *cq, struct ionic_intr_info *intr, diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.h b/drivers/net/ethernet/pensando/ionic/ionic_dev.h index c8c710cfe70c..35566f97eaea 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_dev.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.h @@ -12,6 +12,7 @@ #include "ionic_if.h" #include "ionic_regs.h" +#include "ionic_api.h" #define IONIC_MAX_TX_DESC 8192 #define IONIC_MAX_RX_DESC 16384 @@ -34,6 +35,11 @@ #define IONIC_RX_MIN_DOORBELL_DEADLINE (HZ / 100) /* 10ms */ #define IONIC_RX_MAX_DOORBELL_DEADLINE (HZ * 4) /* 4s */ +#define IONIC_EXPDB_64B_WQE_LG2 6 +#define IONIC_EXPDB_128B_WQE_LG2 7 +#define IONIC_EXPDB_256B_WQE_LG2 8 +#define IONIC_EXPDB_512B_WQE_LG2 9 + struct ionic_dev_bar { void __iomem *vaddr; phys_addr_t bus_addr; @@ -170,6 +176,11 @@ struct ionic_dev { dma_addr_t phy_cmb_pages; u32 cmb_npages; + dma_addr_t phy_cmb_expdb64_pages; + dma_addr_t phy_cmb_expdb128_pages; + dma_addr_t phy_cmb_expdb256_pages; + dma_addr_t phy_cmb_expdb512_pages; + u32 port_info_sz; struct ionic_port_info *port_info; dma_addr_t port_info_pa; @@ -273,19 +284,6 @@ struct ionic_queue { char name[IONIC_QUEUE_NAME_MAX_SZ]; } ____cacheline_aligned_in_smp; -#define IONIC_INTR_INDEX_NOT_ASSIGNED -1 -#define IONIC_INTR_NAME_MAX_SZ 32 - -struct ionic_intr_info { - char name[IONIC_INTR_NAME_MAX_SZ]; - u64 rearm_count; - unsigned int index; - unsigned int vector; - u32 dim_coal_hw; - cpumask_var_t *affinity_mask; - struct irq_affinity_notify aff_notify; -}; - struct ionic_cq { struct ionic_lif *lif; struct ionic_queue *bound_q; @@ -363,8 +361,8 @@ void ionic_dev_cmd_adminq_init(struct ionic_dev *idev, struct ionic_qcq *qcq, int ionic_db_page_num(struct ionic_lif *lif, int pid); -int ionic_get_cmb(struct ionic_lif *lif, u32 *pgid, phys_addr_t *pgaddr, int order); -void ionic_put_cmb(struct ionic_lif *lif, u32 pgid, int order); +void ionic_dev_cmd_discover_cmb(struct ionic_dev *idev); +void ionic_map_cmb(struct ionic *ionic); int ionic_cq_init(struct ionic_lif *lif, struct ionic_cq *cq, struct ionic_intr_info *intr, diff --git a/drivers/net/ethernet/pensando/ionic/ionic_if.h b/drivers/net/ethernet/pensando/ionic/ionic_if.h index 9886cd66ce68..47559c909c8b 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_if.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_if.h @@ -56,6 +56,9 @@ enum ionic_cmd_opcode { IONIC_CMD_VF_SETATTR = 61, IONIC_CMD_VF_CTRL = 62, + /* CMB command */ + IONIC_CMD_DISCOVER_CMB = 80, + /* QoS commands */ IONIC_CMD_QOS_CLASS_IDENTIFY = 240, IONIC_CMD_QOS_CLASS_INIT = 241, @@ -269,9 +272,11 @@ union ionic_drv_identity { /** * enum ionic_dev_capability - Device capabilities * @IONIC_DEV_CAP_VF_CTRL: Device supports VF ctrl operations + * 
@IONIC_DEV_CAP_DISC_CMB: Device supports CMB discovery operations */ enum ionic_dev_capability { IONIC_DEV_CAP_VF_CTRL = BIT(0), + IONIC_DEV_CAP_DISC_CMB = BIT(1), }; /** @@ -395,6 +400,7 @@ enum ionic_logical_qtype { * @IONIC_Q_F_4X_DESC: Quadruple main descriptor size * @IONIC_Q_F_4X_CQ_DESC: Quadruple cq descriptor size * @IONIC_Q_F_4X_SG_DESC: Quadruple sg descriptor size + * @IONIC_QIDENT_F_EXPDB: Queue supports express doorbell */ enum ionic_q_feature { IONIC_QIDENT_F_CQ = BIT_ULL(0), @@ -407,6 +413,7 @@ enum ionic_q_feature { IONIC_Q_F_4X_DESC = BIT_ULL(7), IONIC_Q_F_4X_CQ_DESC = BIT_ULL(8), IONIC_Q_F_4X_SG_DESC = BIT_ULL(9), + IONIC_QIDENT_F_EXPDB = BIT_ULL(10), }; /** @@ -495,6 +502,16 @@ union ionic_lif_config { }; /** + * enum ionic_lif_rdma_cap_stats - LIF stat type + * @IONIC_LIF_RDMA_STAT_GLOBAL: Global stats + * @IONIC_LIF_RDMA_STAT_QP: Queue pair stats + */ +enum ionic_lif_rdma_cap_stats { + IONIC_LIF_RDMA_STAT_GLOBAL = BIT(0), + IONIC_LIF_RDMA_STAT_QP = BIT(1), +}; + +/** * struct ionic_lif_identity - LIF identity information (type-specific) * * @capabilities: LIF capabilities @@ -513,10 +530,10 @@ union ionic_lif_config { * @eth.config: LIF config struct with features, mtu, mac, q counts * * @rdma: RDMA identify structure - * @rdma.version: RDMA version of opcodes and queue descriptors + * @rdma.version: RDMA capability version * @rdma.qp_opcodes: Number of RDMA queue pair opcodes supported * @rdma.admin_opcodes: Number of RDMA admin opcodes supported - * @rdma.rsvd: reserved byte(s) + * @rdma.minor_version: RDMA capability minor version * @rdma.npts_per_lif: Page table size per LIF * @rdma.nmrs_per_lif: Number of memory regions per LIF * @rdma.nahs_per_lif: Number of address handles per LIF @@ -526,12 +543,17 @@ union ionic_lif_config { * @rdma.rrq_stride: Remote RQ work request stride * @rdma.rsq_stride: Remote SQ work request stride * @rdma.dcqcn_profiles: Number of DCQCN profiles - * @rdma.rsvd_dimensions: reserved byte(s) + * @rdma.udma_shift: Log2 number of queues per queue group + * @rdma.rsvd_dimensions: Reserved byte + * @rdma.page_size_cap: Supported page sizes * @rdma.aq_qtype: RDMA Admin Qtype * @rdma.sq_qtype: RDMA Send Qtype * @rdma.rq_qtype: RDMA Receive Qtype * @rdma.cq_qtype: RDMA Completion Qtype * @rdma.eq_qtype: RDMA Event Qtype + * @rdma.stats_type: Supported statistics type + * (enum ionic_lif_rdma_cap_stats) + * @rdma.rsvd1: Reserved byte(s) * @words: word access to struct contents */ union ionic_lif_identity { @@ -557,7 +579,7 @@ union ionic_lif_identity { u8 version; u8 qp_opcodes; u8 admin_opcodes; - u8 rsvd; + u8 minor_version; __le32 npts_per_lif; __le32 nmrs_per_lif; __le32 nahs_per_lif; @@ -567,12 +589,16 @@ union ionic_lif_identity { u8 rrq_stride; u8 rsq_stride; u8 dcqcn_profiles; - u8 rsvd_dimensions[10]; + u8 udma_shift; + u8 rsvd_dimensions; + __le64 page_size_cap; struct ionic_lif_logical_qtype aq_qtype; struct ionic_lif_logical_qtype sq_qtype; struct ionic_lif_logical_qtype rq_qtype; struct ionic_lif_logical_qtype cq_qtype; struct ionic_lif_logical_qtype eq_qtype; + __le16 stats_type; + u8 rsvd1[162]; } __packed rdma; } __packed; __le32 words[478]; @@ -2195,6 +2221,80 @@ struct ionic_vf_ctrl_comp { }; /** + * struct ionic_discover_cmb_cmd - CMB discovery command + * @opcode: Opcode for the command + * @rsvd: Reserved bytes + */ +struct ionic_discover_cmb_cmd { + u8 opcode; + u8 rsvd[63]; +}; + +/** + * struct ionic_discover_cmb_comp - CMB discover command completion. 
+ * @status: Status of the command (enum ionic_status_code) + * @rsvd: Reserved bytes + */ +struct ionic_discover_cmb_comp { + u8 status; + u8 rsvd[15]; +}; + +#define IONIC_MAX_CMB_REGIONS 16 +#define IONIC_CMB_SHIFT_64K 16 + +enum ionic_cmb_type { + IONIC_CMB_TYPE_DEVMEM = 0, + IONIC_CMB_TYPE_EXPDB64 = 1, + IONIC_CMB_TYPE_EXPDB128 = 2, + IONIC_CMB_TYPE_EXPDB256 = 3, + IONIC_CMB_TYPE_EXPDB512 = 4, +}; + +/** + * union ionic_cmb_region - Configuration for CMB region + * @bar_num: CMB mapping number from FW + * @cmb_type: Type of CMB this region describes (enum ionic_cmb_type) + * @rsvd: Reserved + * @offset: Offset within BAR in 64KB pages + * @length: Length of the CMB region + * @words: 32-bit words for direct access to the entire region + */ +union ionic_cmb_region { + struct { + u8 bar_num; + u8 cmb_type; + u8 rsvd[6]; + __le32 offset; + __le32 length; + } __packed; + __le32 words[4]; +}; + +/** + * union ionic_discover_cmb_identity - CMB layout identity structure + * @num_regions: Number of CMB regions, up to 16 + * @flags: Feature and capability bits (0 for express + * doorbell, 1 for 4K alignment indicator, + * 31-24 for version information) + * @region: CMB mappings region, entry 0 for regular + * mapping, entries 1-7 for WQE sizes 64, + * 128, 256, 512, 1024, 2048 and 4096 bytes + * @words: Full union buffer size + */ +union ionic_discover_cmb_identity { + struct { + __le32 num_regions; +#define IONIC_CMB_FLAG_EXPDB BIT(0) +#define IONIC_CMB_FLAG_4KALIGN BIT(1) +#define IONIC_CMB_FLAG_VERSION 0xff000000 + __le32 flags; + union ionic_cmb_region region[IONIC_MAX_CMB_REGIONS]; + }; + __le32 words[478]; +}; + +/** * struct ionic_qos_identify_cmd - QoS identify command * @opcode: opcode * @ver: Highest version of identify supported by driver @@ -3054,6 +3154,8 @@ union ionic_dev_cmd { struct ionic_vf_getattr_cmd vf_getattr; struct ionic_vf_ctrl_cmd vf_ctrl; + struct ionic_discover_cmb_cmd discover_cmb; + struct ionic_lif_identify_cmd lif_identify; struct ionic_lif_init_cmd lif_init; struct ionic_lif_reset_cmd lif_reset; @@ -3093,6 +3195,8 @@ union ionic_dev_cmd_comp { struct ionic_vf_getattr_comp vf_getattr; struct ionic_vf_ctrl_comp vf_ctrl; + struct ionic_discover_cmb_comp discover_cmb; + struct ionic_lif_identify_comp lif_identify; struct ionic_lif_init_comp lif_init; ionic_lif_reset_comp lif_reset; @@ -3234,6 +3338,9 @@ union ionic_adminq_comp { #define IONIC_BAR0_DEV_CMD_DATA_REGS_OFFSET 0x0c00 #define IONIC_BAR0_INTR_STATUS_OFFSET 0x1000 #define IONIC_BAR0_INTR_CTRL_OFFSET 0x2000 + +/* BAR2 */ +#define IONIC_BAR2_CMB_ENTRY_SIZE 0x800000 #define IONIC_DEV_CMD_DONE 0x00000001 #define IONIC_ASIC_TYPE_NONE 0 @@ -3287,6 +3394,7 @@ struct ionic_identity { union ionic_port_identity port; union ionic_qos_identity qos; union ionic_q_identity txq; + union ionic_discover_cmb_identity cmb_layout; }; #endif /* _IONIC_IF_H_ */ diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 48cb5d30b5f6..b28966ae50c2 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -19,6 +19,7 @@ #include "ionic_bus.h" #include "ionic_dev.h" #include "ionic_lif.h" +#include "ionic_aux.h" #include "ionic_txrx.h" #include "ionic_ethtool.h" #include "ionic_debugfs.h" @@ -243,29 +244,36 @@ static int ionic_request_irq(struct ionic_lif *lif, struct ionic_qcq *qcq) 0, intr->name, &qcq->napi); } -static int ionic_intr_alloc(struct ionic_lif *lif, struct ionic_intr_info *intr) +int 
ionic_intr_alloc(struct ionic_lif *lif, struct ionic_intr_info *intr) { struct ionic *ionic = lif->ionic; - int index; + int index, err; index = find_first_zero_bit(ionic->intrs, ionic->nintrs); - if (index == ionic->nintrs) { - netdev_warn(lif->netdev, "%s: no intr, index=%d nintrs=%d\n", - __func__, index, ionic->nintrs); + if (index == ionic->nintrs) return -ENOSPC; - } set_bit(index, ionic->intrs); ionic_intr_init(&ionic->idev, intr, index); + err = ionic_bus_get_irq(ionic, intr->index); + if (err < 0) { + clear_bit(index, ionic->intrs); + return err; + } + + intr->vector = err; + return 0; } +EXPORT_SYMBOL_NS(ionic_intr_alloc, "NET_IONIC"); -static void ionic_intr_free(struct ionic *ionic, int index) +void ionic_intr_free(struct ionic_lif *lif, int index) { - if (index != IONIC_INTR_INDEX_NOT_ASSIGNED && index < ionic->nintrs) - clear_bit(index, ionic->intrs); + if (index != IONIC_INTR_INDEX_NOT_ASSIGNED && index < lif->ionic->nintrs) + clear_bit(index, lif->ionic->intrs); } +EXPORT_SYMBOL_NS(ionic_intr_free, "NET_IONIC"); static void ionic_irq_aff_notify(struct irq_affinity_notify *notify, const cpumask_t *mask) @@ -400,7 +408,7 @@ static void ionic_qcq_intr_free(struct ionic_lif *lif, struct ionic_qcq *qcq) irq_set_affinity_hint(qcq->intr.vector, NULL); devm_free_irq(lif->ionic->dev, qcq->intr.vector, &qcq->napi); qcq->intr.vector = 0; - ionic_intr_free(lif->ionic, qcq->intr.index); + ionic_intr_free(lif, qcq->intr.index); qcq->intr.index = IONIC_INTR_INDEX_NOT_ASSIGNED; } @@ -510,13 +518,6 @@ static int ionic_alloc_qcq_interrupt(struct ionic_lif *lif, struct ionic_qcq *qc goto err_out; } - err = ionic_bus_get_irq(lif->ionic, qcq->intr.index); - if (err < 0) { - netdev_warn(lif->netdev, "no vector for %s: %d\n", - qcq->q.name, err); - goto err_out_free_intr; - } - qcq->intr.vector = err; ionic_intr_mask_assert(lif->ionic->idev.intr_ctrl, qcq->intr.index, IONIC_INTR_MASK_SET); @@ -545,7 +546,7 @@ static int ionic_alloc_qcq_interrupt(struct ionic_lif *lif, struct ionic_qcq *qc return 0; err_out_free_intr: - ionic_intr_free(lif->ionic, qcq->intr.index); + ionic_intr_free(lif, qcq->intr.index); err_out: return err; } @@ -672,7 +673,7 @@ static int ionic_qcq_alloc(struct ionic_lif *lif, unsigned int type, new->cmb_order = order_base_2(new->cmb_q_size / PAGE_SIZE); err = ionic_get_cmb(lif, &new->cmb_pgid, &new->cmb_q_base_pa, - new->cmb_order); + new->cmb_order, 0, NULL); if (err) { netdev_err(lif->netdev, "Cannot allocate queue order %d from cmb: err %d\n", @@ -740,7 +741,7 @@ err_out_free_q: err_out_free_irq: if (flags & IONIC_QCQ_F_INTR) { devm_free_irq(dev, new->intr.vector, &new->napi); - ionic_intr_free(lif->ionic, new->intr.index); + ionic_intr_free(lif, new->intr.index); } err_out_free_page_pool: page_pool_destroy(new->q.page_pool); @@ -3293,6 +3294,7 @@ int ionic_lif_alloc(struct ionic *ionic) mutex_init(&lif->queue_lock); mutex_init(&lif->config_lock); + mutex_init(&lif->adev_lock); spin_lock_init(&lif->adminq_lock); @@ -3349,6 +3351,7 @@ err_out_free_lif_info: lif->info = NULL; lif->info_pa = 0; err_out_free_mutex: + mutex_destroy(&lif->adev_lock); mutex_destroy(&lif->config_lock); mutex_destroy(&lif->queue_lock); err_out_free_netdev: @@ -3384,6 +3387,7 @@ static void ionic_lif_handle_fw_down(struct ionic_lif *lif) netif_device_detach(lif->netdev); + ionic_auxbus_unregister(ionic->lif); mutex_lock(&lif->queue_lock); if (test_bit(IONIC_LIF_F_UP, lif->state)) { dev_info(ionic->dev, "Surprise FW stop, stopping queues\n"); @@ -3446,6 +3450,8 @@ int ionic_restart_lif(struct 
ionic_lif *lif) netif_device_attach(lif->netdev); ionic_queue_doorbell_check(ionic, IONIC_NAPI_DEADLINE); + ionic_auxbus_register(ionic->lif); + return 0; err_txrx_free: @@ -3528,6 +3534,7 @@ void ionic_lif_free(struct ionic_lif *lif) mutex_destroy(&lif->config_lock); mutex_destroy(&lif->queue_lock); + mutex_destroy(&lif->adev_lock); /* free netdev & lif */ ionic_debugfs_del_lif(lif); diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.h b/drivers/net/ethernet/pensando/ionic/ionic_lif.h index e01756fb7fdd..43bdd0fb8733 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.h @@ -10,6 +10,7 @@ #include <linux/dim.h> #include <linux/pci.h> #include "ionic_rx_filter.h" +#include "ionic_api.h" #define IONIC_ADMINQ_LENGTH 16 /* must be a power of two */ #define IONIC_NOTIFYQ_LENGTH 64 /* must be a power of two */ @@ -225,6 +226,8 @@ struct ionic_lif { dma_addr_t info_pa; u32 info_sz; struct ionic_qtype_info qtype_info[IONIC_QTYPE_MAX]; + struct ionic_aux_dev *ionic_adev; + struct mutex adev_lock; /* lock for aux_dev actions */ u8 rss_hash_key[IONIC_RSS_HASH_KEY_SIZE]; u8 *rss_ind_tbl; diff --git a/drivers/net/ethernet/pensando/ionic/ionic_main.c b/drivers/net/ethernet/pensando/ionic/ionic_main.c index 0e60a6bef99a..14dc055be3e9 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_main.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_main.c @@ -72,7 +72,7 @@ static const char *ionic_error_to_str(enum ionic_status_code code) } } -static int ionic_error_to_errno(enum ionic_status_code code) +int ionic_error_to_errno(enum ionic_status_code code) { switch (code) { case IONIC_RC_SUCCESS: @@ -114,6 +114,7 @@ static int ionic_error_to_errno(enum ionic_status_code code) return -EIO; } } +EXPORT_SYMBOL_NS(ionic_error_to_errno, "NET_IONIC"); static const char *ionic_opcode_to_str(enum ionic_cmd_opcode opcode) { @@ -480,6 +481,7 @@ int ionic_adminq_post_wait(struct ionic_lif *lif, struct ionic_admin_ctx *ctx) { return __ionic_adminq_post_wait(lif, ctx, true); } +EXPORT_SYMBOL_NS(ionic_adminq_post_wait, "NET_IONIC"); int ionic_adminq_post_wait_nomsg(struct ionic_lif *lif, struct ionic_admin_ctx *ctx) { diff --git a/drivers/scsi/aic94xx/aic94xx_task.c b/drivers/scsi/aic94xx/aic94xx_task.c index 4bfd03724ad6..b26a468ddc98 100644 --- a/drivers/scsi/aic94xx/aic94xx_task.c +++ b/drivers/scsi/aic94xx/aic94xx_task.c @@ -488,7 +488,6 @@ static int asd_build_ssp_ascb(struct asd_ascb *ascb, struct sas_task *task, scb->ssp_task.conn_handle = cpu_to_le16( (u16)(unsigned long)dev->lldd_dev); scb->ssp_task.data_dir = data_dir_flags[task->data_dir]; - scb->ssp_task.retry_count = scb->ssp_task.retry_count; ascb->tasklet_complete = asd_task_tasklet_complete; diff --git a/drivers/scsi/bfa/bfa_core.c b/drivers/scsi/bfa/bfa_core.c index a99a101b95ef..2559df8baa05 100644 --- a/drivers/scsi/bfa/bfa_core.c +++ b/drivers/scsi/bfa/bfa_core.c @@ -1282,7 +1282,6 @@ bfa_iocfc_cfgrsp(struct bfa_s *bfa) struct bfi_iocfc_cfgrsp_s *cfgrsp = iocfc->cfgrsp; struct bfa_iocfc_fwcfg_s *fwcfg = &cfgrsp->fwcfg; - fwcfg->num_cqs = fwcfg->num_cqs; fwcfg->num_ioim_reqs = be16_to_cpu(fwcfg->num_ioim_reqs); fwcfg->num_fwtio_reqs = be16_to_cpu(fwcfg->num_fwtio_reqs); fwcfg->num_tskim_reqs = be16_to_cpu(fwcfg->num_tskim_reqs); diff --git a/drivers/scsi/csiostor/csio_wr.c b/drivers/scsi/csiostor/csio_wr.c index a516df019c22..010a1df37f15 100644 --- a/drivers/scsi/csiostor/csio_wr.c +++ b/drivers/scsi/csiostor/csio_wr.c @@ -960,7 +960,7 @@ csio_wr_copy_to_wrp(void *data_buf, struct 
csio_wr_pair *wrp, memcpy((uint8_t *) wrp->addr1 + wr_off, data_buf, nbytes); data_len -= nbytes; - /* Write the remaining data from the begining of circular buffer */ + /* Write the remaining data from the beginning of circular buffer */ if (data_len) { CSIO_DB_ASSERT(data_len <= wrp->size2); CSIO_DB_ASSERT(wrp->addr2 != NULL); @@ -1224,7 +1224,7 @@ csio_wr_process_iq(struct csio_hw *hw, struct csio_q *q, /* * We need to re-arm SGE interrupts in case we got a stray interrupt, - * especially in msix mode. With INTx, this may be a common occurence. + * especially in msix mode. With INTx, this may be a common occurrence. */ if (unlikely(!q->inc_idx)) { CSIO_INC_STATS(q, n_stray_comp); diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index d1a4cc69d408..30a9c6612651 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -876,7 +876,7 @@ static int hisi_sas_dev_found(struct domain_device *device) device->lldd_dev = sas_dev; hisi_hba->hw->setup_itct(hisi_hba, sas_dev); - if (parent_dev && dev_is_expander(parent_dev->dev_type)) { + if (dev_parent_is_expander(device)) { int phy_no; phy_no = sas_find_attached_phy_id(&parent_dev->ex_dev, device); diff --git a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c index 4431698a5d78..f3516a0611dd 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c @@ -925,7 +925,6 @@ static void setup_itct_v2_hw(struct hisi_hba *hisi_hba, struct device *dev = hisi_hba->dev; u64 qw0, device_id = sas_dev->device_id; struct hisi_sas_itct *itct = &hisi_hba->itct[device_id]; - struct domain_device *parent_dev = device->parent; struct asd_sas_port *sas_port = device->port; struct hisi_sas_port *port = to_hisi_sas_port(sas_port); u64 sas_addr; @@ -942,7 +941,7 @@ static void setup_itct_v2_hw(struct hisi_hba *hisi_hba, break; case SAS_SATA_DEV: case SAS_SATA_PENDING: - if (parent_dev && dev_is_expander(parent_dev->dev_type)) + if (dev_parent_is_expander(device)) qw0 = HISI_SAS_DEV_TYPE_STP << ITCT_HDR_DEV_TYPE_OFF; else qw0 = HISI_SAS_DEV_TYPE_SATA << ITCT_HDR_DEV_TYPE_OFF; @@ -2494,7 +2493,6 @@ static void prep_ata_v2_hw(struct hisi_hba *hisi_hba, { struct sas_task *task = slot->task; struct domain_device *device = task->dev; - struct domain_device *parent_dev = device->parent; struct hisi_sas_device *sas_dev = device->lldd_dev; struct hisi_sas_cmd_hdr *hdr = slot->cmd_hdr; struct asd_sas_port *sas_port = device->port; @@ -2509,7 +2507,7 @@ static void prep_ata_v2_hw(struct hisi_hba *hisi_hba, /* create header */ /* dw0 */ dw0 = port->id << CMD_HDR_PORT_OFF; - if (parent_dev && dev_is_expander(parent_dev->dev_type)) { + if (dev_parent_is_expander(device)) { dw0 |= 3 << CMD_HDR_CMD_OFF; } else { phy_id = device->phy->identify.phy_identifier; diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index 2f3d61abab3a..2f9e01717ef3 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -874,7 +874,6 @@ static void setup_itct_v3_hw(struct hisi_hba *hisi_hba, struct device *dev = hisi_hba->dev; u64 qw0, device_id = sas_dev->device_id; struct hisi_sas_itct *itct = &hisi_hba->itct[device_id]; - struct domain_device *parent_dev = device->parent; struct asd_sas_port *sas_port = device->port; struct hisi_sas_port *port = to_hisi_sas_port(sas_port); u64 sas_addr; @@ -891,7 +890,7 @@ static void setup_itct_v3_hw(struct hisi_hba *hisi_hba, break; case 
SAS_SATA_DEV: case SAS_SATA_PENDING: - if (parent_dev && dev_is_expander(parent_dev->dev_type)) + if (dev_parent_is_expander(device)) qw0 = HISI_SAS_DEV_TYPE_STP << ITCT_HDR_DEV_TYPE_OFF; else qw0 = HISI_SAS_DEV_TYPE_SATA << ITCT_HDR_DEV_TYPE_OFF; @@ -1476,7 +1475,6 @@ static void prep_ata_v3_hw(struct hisi_hba *hisi_hba, { struct sas_task *task = slot->task; struct domain_device *device = task->dev; - struct domain_device *parent_dev = device->parent; struct hisi_sas_device *sas_dev = device->lldd_dev; struct hisi_sas_cmd_hdr *hdr = slot->cmd_hdr; struct asd_sas_port *sas_port = device->port; @@ -1487,7 +1485,7 @@ static void prep_ata_v3_hw(struct hisi_hba *hisi_hba, u32 dw1 = 0, dw2 = 0; hdr->dw0 = cpu_to_le32(port->id << CMD_HDR_PORT_OFF); - if (parent_dev && dev_is_expander(parent_dev->dev_type)) { + if (dev_parent_is_expander(device)) { hdr->dw0 |= cpu_to_le32(3 << CMD_HDR_CMD_OFF); } else { phy_id = device->phy->identify.phy_identifier; diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c index c73a71ac3c29..3654b12c5d5a 100644 --- a/drivers/scsi/hpsa.c +++ b/drivers/scsi/hpsa.c @@ -2662,10 +2662,8 @@ static void complete_scsi_command(struct CommandList *cp) case CMD_TARGET_STATUS: cmd->result |= ei->ScsiStatus; /* copy the sense data */ - if (SCSI_SENSE_BUFFERSIZE < sizeof(ei->SenseInfo)) - sense_data_size = SCSI_SENSE_BUFFERSIZE; - else - sense_data_size = sizeof(ei->SenseInfo); + sense_data_size = min_t(unsigned long, SCSI_SENSE_BUFFERSIZE, + sizeof(ei->SenseInfo)); if (ei->SenseLen < sense_data_size) sense_data_size = ei->SenseLen; memcpy(cmd->sense_buffer, ei->SenseInfo, sense_data_size); @@ -3628,10 +3626,7 @@ static bool hpsa_vpd_page_supported(struct ctlr_info *h, if (rc != 0) goto exit_unsupported; pages = buf[3]; - if ((pages + HPSA_VPD_HEADER_SZ) <= 255) - bufsize = pages + HPSA_VPD_HEADER_SZ; - else - bufsize = 255; + bufsize = min(pages + HPSA_VPD_HEADER_SZ, 255); /* Get the whole VPD page list */ rc = hpsa_scsi_do_inquiry(h, scsi3addr, @@ -6407,18 +6402,14 @@ static int hpsa_passthru_ioctl(struct ctlr_info *h, return -EINVAL; } if (iocommand->buf_size > 0) { - buff = kmalloc(iocommand->buf_size, GFP_KERNEL); - if (buff == NULL) - return -ENOMEM; if (iocommand->Request.Type.Direction & XFER_WRITE) { - /* Copy the data into the buffer we created */ - if (copy_from_user(buff, iocommand->buf, - iocommand->buf_size)) { - rc = -EFAULT; - goto out_kfree; - } + buff = memdup_user(iocommand->buf, iocommand->buf_size); + if (IS_ERR(buff)) + return PTR_ERR(buff); } else { - memset(buff, 0, iocommand->buf_size); + buff = kzalloc(iocommand->buf_size, GFP_KERNEL); + if (!buff) + return -ENOMEM; } } c = cmd_alloc(h); @@ -6478,7 +6469,6 @@ static int hpsa_passthru_ioctl(struct ctlr_info *h, } out: cmd_free(h, c); -out_kfree: kfree(buff); return rc; } @@ -6522,18 +6512,21 @@ static int hpsa_big_passthru_ioctl(struct ctlr_info *h, while (left) { sz = (left > ioc->malloc_size) ? 
ioc->malloc_size : left; buff_size[sg_used] = sz; - buff[sg_used] = kmalloc(sz, GFP_KERNEL); - if (buff[sg_used] == NULL) { - status = -ENOMEM; - goto cleanup1; - } + if (ioc->Request.Type.Direction & XFER_WRITE) { - if (copy_from_user(buff[sg_used], data_ptr, sz)) { - status = -EFAULT; + buff[sg_used] = memdup_user(data_ptr, sz); + if (IS_ERR(buff[sg_used])) { + status = PTR_ERR(buff[sg_used]); goto cleanup1; } - } else - memset(buff[sg_used], 0, sz); + } else { + buff[sg_used] = kzalloc(sz, GFP_KERNEL); + if (!buff[sg_used]) { + status = -ENOMEM; + goto cleanup1; + } + } + left -= sz; data_ptr += sz; sg_used++; @@ -7632,8 +7625,8 @@ static void hpsa_free_cfgtables(struct ctlr_info *h) } /* Find and map CISS config table and transfer table -+ * several items must be unmapped (freed) later -+ * */ + * several items must be unmapped (freed) later + */ static int hpsa_find_cfgtables(struct ctlr_info *h) { u64 cfg_offset; diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index dd6754db7e4c..44214884deaf 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -4281,11 +4281,11 @@ static int ipr_alloc_dump(struct ipr_ioa_cfg *ioa_cfg) } if (ioa_cfg->sis64) - ioa_data = vmalloc(array_size(IPR_FMT3_MAX_NUM_DUMP_PAGES, - sizeof(__be32 *))); + ioa_data = vmalloc_array(IPR_FMT3_MAX_NUM_DUMP_PAGES, + sizeof(__be32 *)); else - ioa_data = vmalloc(array_size(IPR_FMT2_MAX_NUM_DUMP_PAGES, - sizeof(__be32 *))); + ioa_data = vmalloc_array(IPR_FMT2_MAX_NUM_DUMP_PAGES, + sizeof(__be32 *)); if (!ioa_data) { ipr_err("Dump memory allocation failed\n"); diff --git a/drivers/scsi/isci/remote_device.c b/drivers/scsi/isci/remote_device.c index 82deb6a83a8c..4c7462965ea1 100644 --- a/drivers/scsi/isci/remote_device.c +++ b/drivers/scsi/isci/remote_device.c @@ -1434,7 +1434,7 @@ static enum sci_status isci_remote_device_construct(struct isci_port *iport, struct domain_device *dev = idev->domain_dev; enum sci_status status; - if (dev->parent && dev_is_expander(dev->parent->dev_type)) + if (dev_parent_is_expander(dev)) status = sci_remote_device_ea_construct(iport, idev); else status = sci_remote_device_da_construct(iport, idev); diff --git a/drivers/scsi/libfc/fc_encode.h b/drivers/scsi/libfc/fc_encode.h index 02e31db31d68..e046091a549a 100644 --- a/drivers/scsi/libfc/fc_encode.h +++ b/drivers/scsi/libfc/fc_encode.h @@ -356,7 +356,7 @@ static inline int fc_ct_ms_fill(struct fc_lport *lport, put_unaligned_be16(len, &entry->len); snprintf((char *)&entry->value, FC_FDMI_HBA_ATTR_OSNAMEVERSION_LEN, - "%s v%s", + "%.62s v%.62s", init_utsname()->sysname, init_utsname()->release); diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index 869b5d4db44c..d953225f6cc2 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -1313,10 +1313,7 @@ static int sas_check_parent_topology(struct domain_device *child) int i; int res = 0; - if (!child->parent) - return 0; - - if (!dev_is_expander(child->parent->dev_type)) + if (!dev_parent_is_expander(child)) return 0; parent_ex = &child->parent->ex_dev; diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index fe4fb67eb50c..224edacf2d8e 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -1,7 +1,7 @@ /******************************************************************* * This file is part of the Emulex Linux Device Driver for * * Fibre Channel Host Bus Adapters. * - * Copyright (C) 2017-2024 Broadcom. All Rights Reserved. The term * + * Copyright (C) 2017-2025 Broadcom. 
All Rights Reserved. The term * * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. * * Copyright (C) 2004-2016 Emulex. All rights reserved. * * EMULEX and SLI are trademarks of Emulex. * @@ -661,15 +661,12 @@ struct lpfc_vport { uint32_t num_disc_nodes; /* in addition to hba_state */ uint32_t gidft_inp; /* cnt of outstanding GID_FTs */ - uint32_t fc_nlp_cnt; /* outstanding NODELIST requests */ uint32_t fc_rscn_id_cnt; /* count of RSCNs payloads in list */ uint32_t fc_rscn_flush; /* flag use of fc_rscn_id_list */ struct lpfc_dmabuf *fc_rscn_id_list[FC_MAX_HOLD_RSCN]; struct lpfc_name fc_nodename; /* fc nodename */ struct lpfc_name fc_portname; /* fc portname */ - struct lpfc_work_evt disc_timeout_evt; - struct timer_list fc_disctmo; /* Discovery rescue timer */ uint8_t fc_ns_retry; /* retries for fabric nameserver */ uint32_t fc_prli_sent; /* cntr for outstanding PRLIs */ @@ -744,12 +741,6 @@ struct lpfc_vport { struct lpfc_vmid_priority_info vmid_priority; #ifdef CONFIG_SCSI_LPFC_DEBUG_FS - struct dentry *debug_disc_trc; - struct dentry *debug_nodelist; - struct dentry *debug_nvmestat; - struct dentry *debug_scsistat; - struct dentry *debug_ioktime; - struct dentry *debug_hdwqstat; struct dentry *vport_debugfs_root; struct lpfc_debugfs_trc *disc_trc; atomic_t disc_trc_cnt; @@ -767,7 +758,6 @@ struct lpfc_vport { /* There is a single nvme instance per vport. */ struct nvme_fc_local_port *localport; uint8_t nvmei_support; /* driver supports NVME Initiator */ - uint32_t last_fcp_wqidx; uint32_t rcv_flogi_cnt; /* How many unsol FLOGIs ACK'd. */ }; @@ -1060,8 +1050,6 @@ struct lpfc_hba { struct lpfc_dmabuf hbqslimp; - uint16_t pci_cfg_value; - uint8_t fc_linkspeed; /* Link speed after last READ_LA */ uint32_t fc_eventTag; /* event tag for link attention */ @@ -1088,7 +1076,6 @@ struct lpfc_hba { struct lpfc_stats fc_stat; - struct lpfc_nodelist fc_fcpnodev; /* nodelist entry for no device */ uint32_t nport_event_cnt; /* timestamp for nlplist entry */ uint8_t wwnn[8]; @@ -1229,9 +1216,6 @@ struct lpfc_hba { uint32_t hbq_count; /* Count of configured HBQs */ struct hbq_s hbqs[LPFC_MAX_HBQS]; /* local copy of hbq indicies */ - atomic_t fcp_qidx; /* next FCP WQ (RR Policy) */ - atomic_t nvme_qidx; /* next NVME WQ (RR Policy) */ - phys_addr_t pci_bar0_map; /* Physical address for PCI BAR0 */ phys_addr_t pci_bar1_map; /* Physical address for PCI BAR1 */ phys_addr_t pci_bar2_map; /* Physical address for PCI BAR2 */ @@ -1348,30 +1332,9 @@ struct lpfc_hba { unsigned long last_ramp_down_time; #ifdef CONFIG_SCSI_LPFC_DEBUG_FS struct dentry *hba_debugfs_root; - atomic_t debugfs_vport_count; - struct dentry *debug_multixri_pools; - struct dentry *debug_hbqinfo; - struct dentry *debug_dumpHostSlim; - struct dentry *debug_dumpHBASlim; - struct dentry *debug_InjErrLBA; /* LBA to inject errors at */ - struct dentry *debug_InjErrNPortID; /* NPortID to inject errors at */ - struct dentry *debug_InjErrWWPN; /* WWPN to inject errors at */ - struct dentry *debug_writeGuard; /* inject write guard_tag errors */ - struct dentry *debug_writeApp; /* inject write app_tag errors */ - struct dentry *debug_writeRef; /* inject write ref_tag errors */ - struct dentry *debug_readGuard; /* inject read guard_tag errors */ - struct dentry *debug_readApp; /* inject read app_tag errors */ - struct dentry *debug_readRef; /* inject read ref_tag errors */ - - struct dentry *debug_nvmeio_trc; + unsigned int debugfs_vport_count; + struct lpfc_debugfs_nvmeio_trc *nvmeio_trc; - struct dentry *debug_hdwqinfo; -#ifdef 
LPFC_HDWQ_LOCK_STAT - struct dentry *debug_lockstat; -#endif - struct dentry *debug_cgn_buffer; - struct dentry *debug_rx_monitor; - struct dentry *debug_ras_log; atomic_t nvmeio_trc_cnt; uint32_t nvmeio_trc_size; uint32_t nvmeio_trc_output_idx; @@ -1388,19 +1351,10 @@ struct lpfc_hba { sector_t lpfc_injerr_lba; #define LPFC_INJERR_LBA_OFF (sector_t)(-1) - struct dentry *debug_slow_ring_trc; struct lpfc_debugfs_trc *slow_ring_trc; atomic_t slow_ring_trc_cnt; /* iDiag debugfs sub-directory */ struct dentry *idiag_root; - struct dentry *idiag_pci_cfg; - struct dentry *idiag_bar_acc; - struct dentry *idiag_que_info; - struct dentry *idiag_que_acc; - struct dentry *idiag_drb_acc; - struct dentry *idiag_ctl_acc; - struct dentry *idiag_mbx_acc; - struct dentry *idiag_ext_acc; uint8_t lpfc_idiag_last_eq; #endif uint16_t nvmeio_trc_on; diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index 7c4d7bb3a56f..92b5b2dbe847 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -2373,93 +2373,117 @@ out: static ssize_t lpfc_debugfs_dif_err_read(struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) + size_t nbytes, loff_t *ppos) { struct lpfc_hba *phba = file->private_data; int kind = debugfs_get_aux_num(file); - char cbuf[32]; - uint64_t tmp = 0; + char cbuf[32] = {0}; int cnt = 0; - if (kind == writeGuard) - cnt = scnprintf(cbuf, 32, "%u\n", phba->lpfc_injerr_wgrd_cnt); - else if (kind == writeApp) - cnt = scnprintf(cbuf, 32, "%u\n", phba->lpfc_injerr_wapp_cnt); - else if (kind == writeRef) - cnt = scnprintf(cbuf, 32, "%u\n", phba->lpfc_injerr_wref_cnt); - else if (kind == readGuard) - cnt = scnprintf(cbuf, 32, "%u\n", phba->lpfc_injerr_rgrd_cnt); - else if (kind == readApp) - cnt = scnprintf(cbuf, 32, "%u\n", phba->lpfc_injerr_rapp_cnt); - else if (kind == readRef) - cnt = scnprintf(cbuf, 32, "%u\n", phba->lpfc_injerr_rref_cnt); - else if (kind == InjErrNPortID) - cnt = scnprintf(cbuf, 32, "0x%06x\n", + switch (kind) { + case writeGuard: + cnt = scnprintf(cbuf, sizeof(cbuf), "%u\n", + phba->lpfc_injerr_wgrd_cnt); + break; + case writeApp: + cnt = scnprintf(cbuf, sizeof(cbuf), "%u\n", + phba->lpfc_injerr_wapp_cnt); + break; + case writeRef: + cnt = scnprintf(cbuf, sizeof(cbuf), "%u\n", + phba->lpfc_injerr_wref_cnt); + break; + case readGuard: + cnt = scnprintf(cbuf, sizeof(cbuf), "%u\n", + phba->lpfc_injerr_rgrd_cnt); + break; + case readApp: + cnt = scnprintf(cbuf, sizeof(cbuf), "%u\n", + phba->lpfc_injerr_rapp_cnt); + break; + case readRef: + cnt = scnprintf(cbuf, sizeof(cbuf), "%u\n", + phba->lpfc_injerr_rref_cnt); + break; + case InjErrNPortID: + cnt = scnprintf(cbuf, sizeof(cbuf), "0x%06x\n", phba->lpfc_injerr_nportid); - else if (kind == InjErrWWPN) { - memcpy(&tmp, &phba->lpfc_injerr_wwpn, sizeof(struct lpfc_name)); - tmp = cpu_to_be64(tmp); - cnt = scnprintf(cbuf, 32, "0x%016llx\n", tmp); - } else if (kind == InjErrLBA) { - if (phba->lpfc_injerr_lba == (sector_t)(-1)) - cnt = scnprintf(cbuf, 32, "off\n"); + break; + case InjErrWWPN: + cnt = scnprintf(cbuf, sizeof(cbuf), "0x%016llx\n", + be64_to_cpu(phba->lpfc_injerr_wwpn.u.wwn_be)); + break; + case InjErrLBA: + if (phba->lpfc_injerr_lba == LPFC_INJERR_LBA_OFF) + cnt = scnprintf(cbuf, sizeof(cbuf), "off\n"); else - cnt = scnprintf(cbuf, 32, "0x%llx\n", - (uint64_t) phba->lpfc_injerr_lba); - } else - lpfc_printf_log(phba, KERN_ERR, LOG_INIT, - "0547 Unknown debugfs error injection entry\n"); + cnt = scnprintf(cbuf, sizeof(cbuf), "0x%llx\n", + 
(uint64_t)phba->lpfc_injerr_lba); + break; + default: + lpfc_log_msg(phba, KERN_WARNING, LOG_INIT, + "0547 Unknown debugfs error injection entry\n"); + break; + } return simple_read_from_buffer(buf, nbytes, ppos, &cbuf, cnt); } static ssize_t lpfc_debugfs_dif_err_write(struct file *file, const char __user *buf, - size_t nbytes, loff_t *ppos) + size_t nbytes, loff_t *ppos) { struct lpfc_hba *phba = file->private_data; int kind = debugfs_get_aux_num(file); - char dstbuf[33]; - uint64_t tmp = 0; - int size; + char dstbuf[33] = {0}; + unsigned long long tmp; + unsigned long size; - memset(dstbuf, 0, 33); - size = (nbytes < 32) ? nbytes : 32; + size = (nbytes < (sizeof(dstbuf) - 1)) ? nbytes : (sizeof(dstbuf) - 1); if (copy_from_user(dstbuf, buf, size)) return -EFAULT; - if (kind == InjErrLBA) { - if ((dstbuf[0] == 'o') && (dstbuf[1] == 'f') && - (dstbuf[2] == 'f')) - tmp = (uint64_t)(-1); + if (kstrtoull(dstbuf, 0, &tmp)) { + if (kind != InjErrLBA || !strstr(dstbuf, "off")) + return -EINVAL; } - if ((tmp == 0) && (kstrtoull(dstbuf, 0, &tmp))) - return -EINVAL; - - if (kind == writeGuard) + switch (kind) { + case writeGuard: phba->lpfc_injerr_wgrd_cnt = (uint32_t)tmp; - else if (kind == writeApp) + break; + case writeApp: phba->lpfc_injerr_wapp_cnt = (uint32_t)tmp; - else if (kind == writeRef) + break; + case writeRef: phba->lpfc_injerr_wref_cnt = (uint32_t)tmp; - else if (kind == readGuard) + break; + case readGuard: phba->lpfc_injerr_rgrd_cnt = (uint32_t)tmp; - else if (kind == readApp) + break; + case readApp: phba->lpfc_injerr_rapp_cnt = (uint32_t)tmp; - else if (kind == readRef) + break; + case readRef: phba->lpfc_injerr_rref_cnt = (uint32_t)tmp; - else if (kind == InjErrLBA) - phba->lpfc_injerr_lba = (sector_t)tmp; - else if (kind == InjErrNPortID) + break; + case InjErrLBA: + if (strstr(dstbuf, "off")) + phba->lpfc_injerr_lba = LPFC_INJERR_LBA_OFF; + else + phba->lpfc_injerr_lba = (sector_t)tmp; + break; + case InjErrNPortID: phba->lpfc_injerr_nportid = (uint32_t)(tmp & Mask_DID); - else if (kind == InjErrWWPN) { - tmp = cpu_to_be64(tmp); - memcpy(&phba->lpfc_injerr_wwpn, &tmp, sizeof(struct lpfc_name)); - } else - lpfc_printf_log(phba, KERN_ERR, LOG_INIT, - "0548 Unknown debugfs error injection entry\n"); - + break; + case InjErrWWPN: + phba->lpfc_injerr_wwpn.u.wwn_be = cpu_to_be64(tmp); + break; + default: + lpfc_log_msg(phba, KERN_WARNING, LOG_INIT, + "0548 Unknown debugfs error injection entry\n"); + break; + } return nbytes; } @@ -5728,7 +5752,7 @@ static const struct file_operations lpfc_debugfs_op_slow_ring_trc = { }; static struct dentry *lpfc_debugfs_root = NULL; -static atomic_t lpfc_debugfs_hba_count; +static unsigned int lpfc_debugfs_hba_count; /* * File operations for the iDiag debugfs @@ -6050,7 +6074,12 @@ lpfc_debugfs_initialize(struct lpfc_vport *vport) /* Setup lpfc root directory */ if (!lpfc_debugfs_root) { lpfc_debugfs_root = debugfs_create_dir("lpfc", NULL); - atomic_set(&lpfc_debugfs_hba_count, 0); + lpfc_debugfs_hba_count = 0; + if (IS_ERR(lpfc_debugfs_root)) { + lpfc_vlog_msg(vport, KERN_WARNING, LOG_INIT, + "0527 Cannot create debugfs lpfc\n"); + return; + } } if (!lpfc_debugfs_start_time) lpfc_debugfs_start_time = jiffies; @@ -6061,150 +6090,96 @@ lpfc_debugfs_initialize(struct lpfc_vport *vport) pport_setup = true; phba->hba_debugfs_root = debugfs_create_dir(name, lpfc_debugfs_root); - atomic_inc(&lpfc_debugfs_hba_count); - atomic_set(&phba->debugfs_vport_count, 0); + phba->debugfs_vport_count = 0; + if (IS_ERR(phba->hba_debugfs_root)) { + lpfc_vlog_msg(vport, 
KERN_WARNING, LOG_INIT, + "0528 Cannot create debugfs %s\n", name); + return; + } + lpfc_debugfs_hba_count++; /* Multi-XRI pools */ - snprintf(name, sizeof(name), "multixripools"); - phba->debug_multixri_pools = - debugfs_create_file(name, S_IFREG | 0644, - phba->hba_debugfs_root, - phba, - &lpfc_debugfs_op_multixripools); - if (IS_ERR(phba->debug_multixri_pools)) { - lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT, - "0527 Cannot create debugfs multixripools\n"); - goto debug_failed; - } + debugfs_create_file("multixripools", 0644, + phba->hba_debugfs_root, phba, + &lpfc_debugfs_op_multixripools); /* Congestion Info Buffer */ - scnprintf(name, sizeof(name), "cgn_buffer"); - phba->debug_cgn_buffer = - debugfs_create_file(name, S_IFREG | 0644, - phba->hba_debugfs_root, - phba, &lpfc_cgn_buffer_op); - if (IS_ERR(phba->debug_cgn_buffer)) { - lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT, - "6527 Cannot create debugfs " - "cgn_buffer\n"); - goto debug_failed; - } + debugfs_create_file("cgn_buffer", 0644, phba->hba_debugfs_root, + phba, &lpfc_cgn_buffer_op); /* RX Monitor */ - scnprintf(name, sizeof(name), "rx_monitor"); - phba->debug_rx_monitor = - debugfs_create_file(name, S_IFREG | 0644, - phba->hba_debugfs_root, - phba, &lpfc_rx_monitor_op); - if (IS_ERR(phba->debug_rx_monitor)) { - lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT, - "6528 Cannot create debugfs " - "rx_monitor\n"); - goto debug_failed; - } + debugfs_create_file("rx_monitor", 0644, phba->hba_debugfs_root, + phba, &lpfc_rx_monitor_op); /* RAS log */ - snprintf(name, sizeof(name), "ras_log"); - phba->debug_ras_log = - debugfs_create_file(name, 0644, - phba->hba_debugfs_root, - phba, &lpfc_debugfs_ras_log); - if (IS_ERR(phba->debug_ras_log)) { - lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT, - "6148 Cannot create debugfs" - " ras_log\n"); - goto debug_failed; - } + debugfs_create_file("ras_log", 0644, phba->hba_debugfs_root, + phba, &lpfc_debugfs_ras_log); /* Setup hbqinfo */ - snprintf(name, sizeof(name), "hbqinfo"); - phba->debug_hbqinfo = - debugfs_create_file(name, S_IFREG | 0644, - phba->hba_debugfs_root, - phba, &lpfc_debugfs_op_hbqinfo); + debugfs_create_file("hbqinfo", 0644, phba->hba_debugfs_root, + phba, &lpfc_debugfs_op_hbqinfo); #ifdef LPFC_HDWQ_LOCK_STAT /* Setup lockstat */ - snprintf(name, sizeof(name), "lockstat"); - phba->debug_lockstat = - debugfs_create_file(name, S_IFREG | 0644, - phba->hba_debugfs_root, - phba, &lpfc_debugfs_op_lockstat); - if (IS_ERR(phba->debug_lockstat)) { - lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT, - "4610 Can't create debugfs lockstat\n"); - goto debug_failed; - } + debugfs_create_file("lockstat", 0644, phba->hba_debugfs_root, + phba, &lpfc_debugfs_op_lockstat); #endif - - /* Setup dumpHBASlim */ if (phba->sli_rev < LPFC_SLI_REV4) { - snprintf(name, sizeof(name), "dumpHBASlim"); - phba->debug_dumpHBASlim = - debugfs_create_file(name, - S_IFREG|S_IRUGO|S_IWUSR, - phba->hba_debugfs_root, - phba, &lpfc_debugfs_op_dumpHBASlim); - } else - phba->debug_dumpHBASlim = NULL; + /* Setup dumpHBASlim */ + debugfs_create_file("dumpHBASlim", 0644, + phba->hba_debugfs_root, phba, + &lpfc_debugfs_op_dumpHBASlim); + } - /* Setup dumpHostSlim */ if (phba->sli_rev < LPFC_SLI_REV4) { - snprintf(name, sizeof(name), "dumpHostSlim"); - phba->debug_dumpHostSlim = - debugfs_create_file(name, - S_IFREG|S_IRUGO|S_IWUSR, - phba->hba_debugfs_root, - phba, &lpfc_debugfs_op_dumpHostSlim); - } else - phba->debug_dumpHostSlim = NULL; + /* Setup dumpHostSlim */ + debugfs_create_file("dumpHostSlim", 0644, + phba->hba_debugfs_root, 
phba, + &lpfc_debugfs_op_dumpHostSlim); + } /* Setup DIF Error Injections */ - phba->debug_InjErrLBA = - debugfs_create_file_aux_num("InjErrLBA", 0644, - phba->hba_debugfs_root, - phba, InjErrLBA, &lpfc_debugfs_op_dif_err); + debugfs_create_file_aux_num("InjErrLBA", 0644, + phba->hba_debugfs_root, phba, + InjErrLBA, + &lpfc_debugfs_op_dif_err); phba->lpfc_injerr_lba = LPFC_INJERR_LBA_OFF; - phba->debug_InjErrNPortID = - debugfs_create_file_aux_num("InjErrNPortID", 0644, - phba->hba_debugfs_root, - phba, InjErrNPortID, &lpfc_debugfs_op_dif_err); - - phba->debug_InjErrWWPN = - debugfs_create_file_aux_num("InjErrWWPN", 0644, - phba->hba_debugfs_root, - phba, InjErrWWPN, &lpfc_debugfs_op_dif_err); - - phba->debug_writeGuard = - debugfs_create_file_aux_num("writeGuardInjErr", 0644, - phba->hba_debugfs_root, - phba, writeGuard, &lpfc_debugfs_op_dif_err); - - phba->debug_writeApp = - debugfs_create_file_aux_num("writeAppInjErr", 0644, - phba->hba_debugfs_root, - phba, writeApp, &lpfc_debugfs_op_dif_err); - - phba->debug_writeRef = - debugfs_create_file_aux_num("writeRefInjErr", 0644, - phba->hba_debugfs_root, - phba, writeRef, &lpfc_debugfs_op_dif_err); - - phba->debug_readGuard = - debugfs_create_file_aux_num("readGuardInjErr", 0644, - phba->hba_debugfs_root, - phba, readGuard, &lpfc_debugfs_op_dif_err); - - phba->debug_readApp = - debugfs_create_file_aux_num("readAppInjErr", 0644, - phba->hba_debugfs_root, - phba, readApp, &lpfc_debugfs_op_dif_err); - - phba->debug_readRef = - debugfs_create_file_aux_num("readRefInjErr", 0644, - phba->hba_debugfs_root, - phba, readRef, &lpfc_debugfs_op_dif_err); + debugfs_create_file_aux_num("InjErrNPortID", 0644, + phba->hba_debugfs_root, phba, + InjErrNPortID, + &lpfc_debugfs_op_dif_err); + + debugfs_create_file_aux_num("InjErrWWPN", 0644, + phba->hba_debugfs_root, phba, + InjErrWWPN, + &lpfc_debugfs_op_dif_err); + + debugfs_create_file_aux_num("writeGuardInjErr", 0644, + phba->hba_debugfs_root, phba, + writeGuard, + &lpfc_debugfs_op_dif_err); + + debugfs_create_file_aux_num("writeAppInjErr", 0644, + phba->hba_debugfs_root, phba, + writeApp, &lpfc_debugfs_op_dif_err); + + debugfs_create_file_aux_num("writeRefInjErr", 0644, + phba->hba_debugfs_root, phba, + writeRef, &lpfc_debugfs_op_dif_err); + + debugfs_create_file_aux_num("readGuardInjErr", 0644, + phba->hba_debugfs_root, phba, + readGuard, + &lpfc_debugfs_op_dif_err); + + debugfs_create_file_aux_num("readAppInjErr", 0644, + phba->hba_debugfs_root, phba, + readApp, &lpfc_debugfs_op_dif_err); + + debugfs_create_file_aux_num("readRefInjErr", 0644, + phba->hba_debugfs_root, phba, + readRef, &lpfc_debugfs_op_dif_err); /* Setup slow ring trace */ if (lpfc_debugfs_max_slow_ring_trc) { @@ -6224,11 +6199,9 @@ lpfc_debugfs_initialize(struct lpfc_vport *vport) } } - snprintf(name, sizeof(name), "slow_ring_trace"); - phba->debug_slow_ring_trc = - debugfs_create_file(name, S_IFREG|S_IRUGO|S_IWUSR, - phba->hba_debugfs_root, - phba, &lpfc_debugfs_op_slow_ring_trc); + debugfs_create_file("slow_ring_trace", 0644, + phba->hba_debugfs_root, phba, + &lpfc_debugfs_op_slow_ring_trc); if (!phba->slow_ring_trc) { phba->slow_ring_trc = kcalloc( lpfc_debugfs_max_slow_ring_trc, @@ -6238,16 +6211,13 @@ lpfc_debugfs_initialize(struct lpfc_vport *vport) lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT, "0416 Cannot create debugfs " "slow_ring buffer\n"); - goto debug_failed; + goto out; } atomic_set(&phba->slow_ring_trc_cnt, 0); } - snprintf(name, sizeof(name), "nvmeio_trc"); - phba->debug_nvmeio_trc = - debugfs_create_file(name, 0644, - 
phba->hba_debugfs_root, - phba, &lpfc_debugfs_op_nvmeio_trc); + debugfs_create_file("nvmeio_trc", 0644, phba->hba_debugfs_root, + phba, &lpfc_debugfs_op_nvmeio_trc); atomic_set(&phba->nvmeio_trc_cnt, 0); if (lpfc_debugfs_max_nvmeio_trc) { @@ -6293,7 +6263,12 @@ nvmeio_off: if (!vport->vport_debugfs_root) { vport->vport_debugfs_root = debugfs_create_dir(name, phba->hba_debugfs_root); - atomic_inc(&phba->debugfs_vport_count); + if (IS_ERR(vport->vport_debugfs_root)) { + lpfc_vlog_msg(vport, KERN_WARNING, LOG_INIT, + "0529 Cannot create debugfs %s\n", name); + return; + } + phba->debugfs_vport_count++; } if (lpfc_debugfs_max_disc_trc) { @@ -6320,54 +6295,27 @@ nvmeio_off: lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT, "0418 Cannot create debugfs disc trace " "buffer\n"); - goto debug_failed; + goto out; } atomic_set(&vport->disc_trc_cnt, 0); - snprintf(name, sizeof(name), "discovery_trace"); - vport->debug_disc_trc = - debugfs_create_file(name, S_IFREG|S_IRUGO|S_IWUSR, - vport->vport_debugfs_root, - vport, &lpfc_debugfs_op_disc_trc); - snprintf(name, sizeof(name), "nodelist"); - vport->debug_nodelist = - debugfs_create_file(name, S_IFREG|S_IRUGO|S_IWUSR, - vport->vport_debugfs_root, - vport, &lpfc_debugfs_op_nodelist); - - snprintf(name, sizeof(name), "nvmestat"); - vport->debug_nvmestat = - debugfs_create_file(name, 0644, - vport->vport_debugfs_root, - vport, &lpfc_debugfs_op_nvmestat); - - snprintf(name, sizeof(name), "scsistat"); - vport->debug_scsistat = - debugfs_create_file(name, 0644, - vport->vport_debugfs_root, - vport, &lpfc_debugfs_op_scsistat); - if (IS_ERR(vport->debug_scsistat)) { - lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT, - "4611 Cannot create debugfs scsistat\n"); - goto debug_failed; - } + debugfs_create_file("discovery_trace", 0644, vport->vport_debugfs_root, + vport, &lpfc_debugfs_op_disc_trc); - snprintf(name, sizeof(name), "ioktime"); - vport->debug_ioktime = - debugfs_create_file(name, 0644, - vport->vport_debugfs_root, - vport, &lpfc_debugfs_op_ioktime); - if (IS_ERR(vport->debug_ioktime)) { - lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT, - "0815 Cannot create debugfs ioktime\n"); - goto debug_failed; - } + debugfs_create_file("nodelist", 0644, vport->vport_debugfs_root, vport, + &lpfc_debugfs_op_nodelist); + + debugfs_create_file("nvmestat", 0644, vport->vport_debugfs_root, vport, + &lpfc_debugfs_op_nvmestat); - snprintf(name, sizeof(name), "hdwqstat"); - vport->debug_hdwqstat = - debugfs_create_file(name, 0644, - vport->vport_debugfs_root, - vport, &lpfc_debugfs_op_hdwqstat); + debugfs_create_file("scsistat", 0644, vport->vport_debugfs_root, vport, + &lpfc_debugfs_op_scsistat); + + debugfs_create_file("ioktime", 0644, vport->vport_debugfs_root, vport, + &lpfc_debugfs_op_ioktime); + + debugfs_create_file("hdwqstat", 0644, vport->vport_debugfs_root, vport, + &lpfc_debugfs_op_hdwqstat); /* * The following section is for additional directories/files for the @@ -6375,93 +6323,58 @@ nvmeio_off: */ if (!pport_setup) - goto debug_failed; + return; /* * iDiag debugfs root entry points for SLI4 device only */ if (phba->sli_rev < LPFC_SLI_REV4) - goto debug_failed; + return; - snprintf(name, sizeof(name), "iDiag"); if (!phba->idiag_root) { phba->idiag_root = - debugfs_create_dir(name, phba->hba_debugfs_root); + debugfs_create_dir("iDiag", phba->hba_debugfs_root); /* Initialize iDiag data structure */ memset(&idiag, 0, sizeof(idiag)); } /* iDiag read PCI config space */ - snprintf(name, sizeof(name), "pciCfg"); - if (!phba->idiag_pci_cfg) { - phba->idiag_pci_cfg = - 
debugfs_create_file(name, S_IFREG|S_IRUGO|S_IWUSR, - phba->idiag_root, phba, &lpfc_idiag_op_pciCfg); - idiag.offset.last_rd = 0; - } + debugfs_create_file("pciCfg", 0644, phba->idiag_root, phba, + &lpfc_idiag_op_pciCfg); + idiag.offset.last_rd = 0; /* iDiag PCI BAR access */ - snprintf(name, sizeof(name), "barAcc"); - if (!phba->idiag_bar_acc) { - phba->idiag_bar_acc = - debugfs_create_file(name, S_IFREG|S_IRUGO|S_IWUSR, - phba->idiag_root, phba, &lpfc_idiag_op_barAcc); - idiag.offset.last_rd = 0; - } + debugfs_create_file("barAcc", 0644, phba->idiag_root, phba, + &lpfc_idiag_op_barAcc); + idiag.offset.last_rd = 0; /* iDiag get PCI function queue information */ - snprintf(name, sizeof(name), "queInfo"); - if (!phba->idiag_que_info) { - phba->idiag_que_info = - debugfs_create_file(name, S_IFREG|S_IRUGO, - phba->idiag_root, phba, &lpfc_idiag_op_queInfo); - } + debugfs_create_file("queInfo", 0444, phba->idiag_root, phba, + &lpfc_idiag_op_queInfo); /* iDiag access PCI function queue */ - snprintf(name, sizeof(name), "queAcc"); - if (!phba->idiag_que_acc) { - phba->idiag_que_acc = - debugfs_create_file(name, S_IFREG|S_IRUGO|S_IWUSR, - phba->idiag_root, phba, &lpfc_idiag_op_queAcc); - } + debugfs_create_file("queAcc", 0644, phba->idiag_root, phba, + &lpfc_idiag_op_queAcc); /* iDiag access PCI function doorbell registers */ - snprintf(name, sizeof(name), "drbAcc"); - if (!phba->idiag_drb_acc) { - phba->idiag_drb_acc = - debugfs_create_file(name, S_IFREG|S_IRUGO|S_IWUSR, - phba->idiag_root, phba, &lpfc_idiag_op_drbAcc); - } + debugfs_create_file("drbAcc", 0644, phba->idiag_root, phba, + &lpfc_idiag_op_drbAcc); /* iDiag access PCI function control registers */ - snprintf(name, sizeof(name), "ctlAcc"); - if (!phba->idiag_ctl_acc) { - phba->idiag_ctl_acc = - debugfs_create_file(name, S_IFREG|S_IRUGO|S_IWUSR, - phba->idiag_root, phba, &lpfc_idiag_op_ctlAcc); - } + debugfs_create_file("ctlAcc", 0644, phba->idiag_root, phba, + &lpfc_idiag_op_ctlAcc); /* iDiag access mbox commands */ - snprintf(name, sizeof(name), "mbxAcc"); - if (!phba->idiag_mbx_acc) { - phba->idiag_mbx_acc = - debugfs_create_file(name, S_IFREG|S_IRUGO|S_IWUSR, - phba->idiag_root, phba, &lpfc_idiag_op_mbxAcc); - } + debugfs_create_file("mbxAcc", 0644, phba->idiag_root, phba, + &lpfc_idiag_op_mbxAcc); /* iDiag extents access commands */ if (phba->sli4_hba.extents_in_use) { - snprintf(name, sizeof(name), "extAcc"); - if (!phba->idiag_ext_acc) { - phba->idiag_ext_acc = - debugfs_create_file(name, - S_IFREG|S_IRUGO|S_IWUSR, - phba->idiag_root, phba, - &lpfc_idiag_op_extAcc); - } + debugfs_create_file("extAcc", 0644, phba->idiag_root, phba, + &lpfc_idiag_op_extAcc); } - -debug_failed: +out: + /* alloc'ed items are kfree'd in lpfc_debugfs_terminate */ return; #endif } @@ -6486,145 +6399,26 @@ lpfc_debugfs_terminate(struct lpfc_vport *vport) kfree(vport->disc_trc); vport->disc_trc = NULL; - debugfs_remove(vport->debug_disc_trc); /* discovery_trace */ - vport->debug_disc_trc = NULL; - - debugfs_remove(vport->debug_nodelist); /* nodelist */ - vport->debug_nodelist = NULL; - - debugfs_remove(vport->debug_nvmestat); /* nvmestat */ - vport->debug_nvmestat = NULL; - - debugfs_remove(vport->debug_scsistat); /* scsistat */ - vport->debug_scsistat = NULL; - - debugfs_remove(vport->debug_ioktime); /* ioktime */ - vport->debug_ioktime = NULL; - - debugfs_remove(vport->debug_hdwqstat); /* hdwqstat */ - vport->debug_hdwqstat = NULL; - if (vport->vport_debugfs_root) { debugfs_remove(vport->vport_debugfs_root); /* vportX */ vport->vport_debugfs_root = NULL; 
- atomic_dec(&phba->debugfs_vport_count); + phba->debugfs_vport_count--; } - if (atomic_read(&phba->debugfs_vport_count) == 0) { - - debugfs_remove(phba->debug_multixri_pools); /* multixripools*/ - phba->debug_multixri_pools = NULL; - - debugfs_remove(phba->debug_hbqinfo); /* hbqinfo */ - phba->debug_hbqinfo = NULL; - - debugfs_remove(phba->debug_cgn_buffer); - phba->debug_cgn_buffer = NULL; - - debugfs_remove(phba->debug_rx_monitor); - phba->debug_rx_monitor = NULL; - - debugfs_remove(phba->debug_ras_log); - phba->debug_ras_log = NULL; - -#ifdef LPFC_HDWQ_LOCK_STAT - debugfs_remove(phba->debug_lockstat); /* lockstat */ - phba->debug_lockstat = NULL; -#endif - debugfs_remove(phba->debug_dumpHBASlim); /* HBASlim */ - phba->debug_dumpHBASlim = NULL; - - debugfs_remove(phba->debug_dumpHostSlim); /* HostSlim */ - phba->debug_dumpHostSlim = NULL; - - debugfs_remove(phba->debug_InjErrLBA); /* InjErrLBA */ - phba->debug_InjErrLBA = NULL; - - debugfs_remove(phba->debug_InjErrNPortID); - phba->debug_InjErrNPortID = NULL; - - debugfs_remove(phba->debug_InjErrWWPN); /* InjErrWWPN */ - phba->debug_InjErrWWPN = NULL; - - debugfs_remove(phba->debug_writeGuard); /* writeGuard */ - phba->debug_writeGuard = NULL; - - debugfs_remove(phba->debug_writeApp); /* writeApp */ - phba->debug_writeApp = NULL; - - debugfs_remove(phba->debug_writeRef); /* writeRef */ - phba->debug_writeRef = NULL; - - debugfs_remove(phba->debug_readGuard); /* readGuard */ - phba->debug_readGuard = NULL; - - debugfs_remove(phba->debug_readApp); /* readApp */ - phba->debug_readApp = NULL; - - debugfs_remove(phba->debug_readRef); /* readRef */ - phba->debug_readRef = NULL; - + if (!phba->debugfs_vport_count) { kfree(phba->slow_ring_trc); phba->slow_ring_trc = NULL; - /* slow_ring_trace */ - debugfs_remove(phba->debug_slow_ring_trc); - phba->debug_slow_ring_trc = NULL; - - debugfs_remove(phba->debug_nvmeio_trc); - phba->debug_nvmeio_trc = NULL; - kfree(phba->nvmeio_trc); phba->nvmeio_trc = NULL; - /* - * iDiag release - */ - if (phba->sli_rev == LPFC_SLI_REV4) { - /* iDiag extAcc */ - debugfs_remove(phba->idiag_ext_acc); - phba->idiag_ext_acc = NULL; - - /* iDiag mbxAcc */ - debugfs_remove(phba->idiag_mbx_acc); - phba->idiag_mbx_acc = NULL; - - /* iDiag ctlAcc */ - debugfs_remove(phba->idiag_ctl_acc); - phba->idiag_ctl_acc = NULL; - - /* iDiag drbAcc */ - debugfs_remove(phba->idiag_drb_acc); - phba->idiag_drb_acc = NULL; - - /* iDiag queAcc */ - debugfs_remove(phba->idiag_que_acc); - phba->idiag_que_acc = NULL; - - /* iDiag queInfo */ - debugfs_remove(phba->idiag_que_info); - phba->idiag_que_info = NULL; - - /* iDiag barAcc */ - debugfs_remove(phba->idiag_bar_acc); - phba->idiag_bar_acc = NULL; - - /* iDiag pciCfg */ - debugfs_remove(phba->idiag_pci_cfg); - phba->idiag_pci_cfg = NULL; - - /* Finally remove the iDiag debugfs root */ - debugfs_remove(phba->idiag_root); - phba->idiag_root = NULL; - } - if (phba->hba_debugfs_root) { debugfs_remove(phba->hba_debugfs_root); /* fnX */ phba->hba_debugfs_root = NULL; - atomic_dec(&lpfc_debugfs_hba_count); + lpfc_debugfs_hba_count--; } - if (atomic_read(&lpfc_debugfs_hba_count) == 0) { + if (!lpfc_debugfs_hba_count) { debugfs_remove(lpfc_debugfs_root); /* lpfc */ lpfc_debugfs_root = NULL; } diff --git a/drivers/scsi/lpfc/lpfc_debugfs.h b/drivers/scsi/lpfc/lpfc_debugfs.h index f319f3af0400..a1464f8ac331 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.h +++ b/drivers/scsi/lpfc/lpfc_debugfs.h @@ -1,7 +1,7 @@ /******************************************************************* * This file is part of the 
Emulex Linux Device Driver for * * Fibre Channel Host Bus Adapters. * - * Copyright (C) 2017-2022 Broadcom. All Rights Reserved. The term * + * Copyright (C) 2017-2025 Broadcom. All Rights Reserved. The term * * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. * * Copyright (C) 2007-2011 Emulex. All rights reserved. * * EMULEX and SLI are trademarks of Emulex. * @@ -44,6 +44,9 @@ /* hbqinfo output buffer size */ #define LPFC_HBQINFO_SIZE 8192 +/* hdwqinfo output buffer size */ +#define LPFC_HDWQINFO_SIZE 8192 + /* nvmestat output buffer size */ #define LPFC_NVMESTAT_SIZE 8192 #define LPFC_IOKTIME_SIZE 8192 diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index fca81e0c7c2e..b71db7d7d747 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -3762,7 +3762,7 @@ lpfc_issue_els_rdf(struct lpfc_vport *vport, uint8_t retry) memset(prdf, 0, cmdsize); prdf->rdf.fpin_cmd = ELS_RDF; prdf->rdf.desc_len = cpu_to_be32(sizeof(struct lpfc_els_rdf_req) - - sizeof(struct fc_els_rdf)); + sizeof(struct fc_els_rdf_hdr)); prdf->reg_d1.reg_desc.desc_tag = cpu_to_be32(ELS_DTAG_FPIN_REGISTER); prdf->reg_d1.reg_desc.desc_len = cpu_to_be32( FC_TLV_DESC_LENGTH_FROM_SZ(prdf->reg_d1)); @@ -5339,12 +5339,12 @@ lpfc_cmpl_els_rsp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, ulp_status, ulp_word4, did); /* ELS response tag <ulpIoTag> completes */ lpfc_printf_vlog(vport, KERN_INFO, LOG_ELS, - "0110 ELS response tag x%x completes " + "0110 ELS response tag x%x completes fc_flag x%lx" "Data: x%x x%x x%x x%x x%lx x%x x%x x%x %p %p\n", - iotag, ulp_status, ulp_word4, tmo, + iotag, vport->fc_flag, ulp_status, ulp_word4, tmo, ndlp->nlp_DID, ndlp->nlp_flag, ndlp->nlp_state, ndlp->nlp_rpi, kref_read(&ndlp->kref), mbox, ndlp); - if (mbox) { + if (mbox && !test_bit(FC_PT2PT, &vport->fc_flag)) { if (ulp_status == 0 && test_bit(NLP_ACC_REGLOGIN, &ndlp->nlp_flag)) { if (!lpfc_unreg_rpi(vport, ndlp) && @@ -5403,6 +5403,10 @@ lpfc_cmpl_els_rsp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, } out_free_mbox: lpfc_mbox_rsrc_cleanup(phba, mbox, MBOX_THD_UNLOCKED); + } else if (mbox && test_bit(FC_PT2PT, &vport->fc_flag) && + test_bit(NLP_ACC_REGLOGIN, &ndlp->nlp_flag)) { + lpfc_mbx_cmpl_reg_login(phba, mbox); + clear_bit(NLP_ACC_REGLOGIN, &ndlp->nlp_flag); } out: if (ndlp && shost) { @@ -11259,6 +11263,11 @@ lpfc_cmpl_els_fdisc(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS, "0126 FDISC cmpl status: x%x/x%x)\n", ulp_status, ulp_word4); + + /* drop initial reference */ + if (!test_and_set_bit(NLP_DROPPED, &ndlp->nlp_flag)) + lpfc_nlp_put(ndlp); + goto fdisc_failed; } @@ -12008,7 +12017,11 @@ lpfc_sli4_els_xri_aborted(struct lpfc_hba *phba, sglq_entry->state = SGL_FREED; spin_unlock_irqrestore(&phba->sli4_hba.sgl_list_lock, iflag); - + lpfc_printf_log(phba, KERN_INFO, LOG_ELS | LOG_SLI | + LOG_DISCOVERY | LOG_NODE, + "0732 ELS XRI ABORT on Node: ndlp=x%px " + "xri=x%x\n", + ndlp, xri); if (ndlp) { lpfc_set_rrq_active(phba, ndlp, sglq_entry->sli4_lxritag, diff --git a/drivers/scsi/lpfc/lpfc_hw.h b/drivers/scsi/lpfc/lpfc_hw.h index 32298285ea5e..3bc0efa7453e 100644 --- a/drivers/scsi/lpfc/lpfc_hw.h +++ b/drivers/scsi/lpfc/lpfc_hw.h @@ -1,7 +1,7 @@ /******************************************************************* * This file is part of the Emulex Linux Device Driver for * * Fibre Channel Host Bus Adapters. * - * Copyright (C) 2017-2024 Broadcom. All Rights Reserved. The term * + * Copyright (C) 2017-2025 Broadcom. 
All Rights Reserved. The term * * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. * * Copyright (C) 2004-2016 Emulex. All rights reserved. * * EMULEX and SLI are trademarks of Emulex. * @@ -366,6 +366,7 @@ struct lpfc_name { } s; uint8_t wwn[8]; uint64_t name __packed __aligned(4); + __be64 wwn_be __packed __aligned(4); } u; }; diff --git a/drivers/scsi/lpfc/lpfc_hw4.h b/drivers/scsi/lpfc/lpfc_hw4.h index bc709786e6af..a7f7ed86d2b0 100644 --- a/drivers/scsi/lpfc/lpfc_hw4.h +++ b/drivers/scsi/lpfc/lpfc_hw4.h @@ -4909,18 +4909,18 @@ struct send_frame_wqe { #define ELS_RDF_REG_TAG_CNT 4 struct lpfc_els_rdf_reg_desc { - struct fc_df_desc_fpin_reg reg_desc; /* descriptor header */ + struct fc_df_desc_fpin_reg_hdr reg_desc; /* descriptor header */ __be32 desc_tags[ELS_RDF_REG_TAG_CNT]; /* tags in reg_desc */ }; struct lpfc_els_rdf_req { - struct fc_els_rdf rdf; /* hdr up to descriptors */ + struct fc_els_rdf_hdr rdf; /* hdr up to descriptors */ struct lpfc_els_rdf_reg_desc reg_d1; /* 1st descriptor */ }; struct lpfc_els_rdf_rsp { - struct fc_els_rdf_resp rdf_resp; /* hdr up to descriptors */ + struct fc_els_rdf_resp_hdr rdf_resp; /* hdr up to descriptors */ struct lpfc_els_rdf_reg_desc reg_d1; /* 1st descriptor */ }; diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 4081d2a358ee..0ca7429d86b8 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -3057,13 +3057,6 @@ lpfc_cleanup(struct lpfc_vport *vport) lpfc_vmid_vport_cleanup(vport); list_for_each_entry_safe(ndlp, next_ndlp, &vport->fc_nodes, nlp_listp) { - if (vport->port_type != LPFC_PHYSICAL_PORT && - ndlp->nlp_DID == Fabric_DID) { - /* Just free up ndlp with Fabric_DID for vports */ - lpfc_nlp_put(ndlp); - continue; - } - if (ndlp->nlp_DID == Fabric_Cntl_DID && ndlp->nlp_state == NLP_STE_UNUSED_NODE) { lpfc_nlp_put(ndlp); @@ -8300,10 +8293,7 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) phba->cfg_total_seg_cnt, phba->cfg_scsi_seg_cnt, phba->cfg_nvme_seg_cnt); - if (phba->cfg_sg_dma_buf_size < SLI4_PAGE_SIZE) - i = phba->cfg_sg_dma_buf_size; - else - i = SLI4_PAGE_SIZE; + i = min(phba->cfg_sg_dma_buf_size, SLI4_PAGE_SIZE); phba->lpfc_sg_dma_buf_pool = dma_pool_create("lpfc_sg_dma_buf_pool", diff --git a/drivers/scsi/lpfc/lpfc_nportdisc.c b/drivers/scsi/lpfc/lpfc_nportdisc.c index a596b80d03d4..1e5ef93e67e3 100644 --- a/drivers/scsi/lpfc/lpfc_nportdisc.c +++ b/drivers/scsi/lpfc/lpfc_nportdisc.c @@ -1,7 +1,7 @@ /******************************************************************* * This file is part of the Emulex Linux Device Driver for * * Fibre Channel Host Bus Adapters. * - * Copyright (C) 2017-2024 Broadcom. All Rights Reserved. The term * + * Copyright (C) 2017-2025 Broadcom. All Rights Reserved. The term * * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. * * Copyright (C) 2004-2016 Emulex. All rights reserved. * * EMULEX and SLI are trademarks of Emulex. * @@ -326,8 +326,14 @@ lpfc_defer_plogi_acc(struct lpfc_hba *phba, LPFC_MBOXQ_t *login_mbox) /* Now that REG_RPI completed successfully, * we can now proceed with sending the PLOGI ACC. 
*/ - rc = lpfc_els_rsp_acc(login_mbox->vport, ELS_CMD_PLOGI, - save_iocb, ndlp, NULL); + if (test_bit(FC_PT2PT, &ndlp->vport->fc_flag)) { + rc = lpfc_els_rsp_acc(login_mbox->vport, ELS_CMD_PLOGI, + save_iocb, ndlp, login_mbox); + } else { + rc = lpfc_els_rsp_acc(login_mbox->vport, ELS_CMD_PLOGI, + save_iocb, ndlp, NULL); + } + if (rc) { lpfc_printf_log(phba, KERN_ERR, LOG_TRACE_EVENT, "4576 PLOGI ACC fails pt2pt discovery: " @@ -335,9 +341,16 @@ lpfc_defer_plogi_acc(struct lpfc_hba *phba, LPFC_MBOXQ_t *login_mbox) } } - /* Now process the REG_RPI cmpl */ - lpfc_mbx_cmpl_reg_login(phba, login_mbox); - clear_bit(NLP_ACC_REGLOGIN, &ndlp->nlp_flag); + /* If this is a fabric topology, complete the reg_rpi and prli now. + * For Pt2Pt, the reg_rpi and PRLI are deferred until after the LS_ACC + * completes. This ensures, in Pt2Pt, that the PLOGI LS_ACC is sent + * before the PRLI. + */ + if (!test_bit(FC_PT2PT, &ndlp->vport->fc_flag)) { + /* Now process the REG_RPI cmpl */ + lpfc_mbx_cmpl_reg_login(phba, login_mbox); + clear_bit(NLP_ACC_REGLOGIN, &ndlp->nlp_flag); + } kfree(save_iocb); } diff --git a/drivers/scsi/lpfc/lpfc_nvme.c b/drivers/scsi/lpfc/lpfc_nvme.c index a6647dd360d1..e6f632521cff 100644 --- a/drivers/scsi/lpfc/lpfc_nvme.c +++ b/drivers/scsi/lpfc/lpfc_nvme.c @@ -1234,12 +1234,8 @@ lpfc_nvme_prep_io_cmd(struct lpfc_vport *vport, if ((phba->cfg_nvme_enable_fb) && test_bit(NLP_FIRSTBURST, &pnode->nlp_flag)) { req_len = lpfc_ncmd->nvmeCmd->payload_length; - if (req_len < pnode->nvme_fb_size) - wqe->fcp_iwrite.initial_xfer_len = - req_len; - else - wqe->fcp_iwrite.initial_xfer_len = - pnode->nvme_fb_size; + wqe->fcp_iwrite.initial_xfer_len = min(req_len, + pnode->nvme_fb_size); } else { wqe->fcp_iwrite.initial_xfer_len = 0; } diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c index 508ceeecf2d9..6d9d8c196936 100644 --- a/drivers/scsi/lpfc/lpfc_scsi.c +++ b/drivers/scsi/lpfc/lpfc_scsi.c @@ -5935,7 +5935,7 @@ lpfc_chk_tgt_mapped(struct lpfc_vport *vport, struct fc_rport *rport) /** * lpfc_reset_flush_io_context - * @vport: The virtual port (scsi_host) for the flush context - * @tgt_id: If aborting by Target contect - specifies the target id + * @tgt_id: If aborting by Target context - specifies the target id * @lun_id: If aborting by Lun context - specifies the lun id * @context: specifies the context level to flush at. 
* @@ -6109,8 +6109,14 @@ lpfc_target_reset_handler(struct scsi_cmnd *cmnd) pnode->nlp_fcp_info &= ~NLP_FCP_2_DEVICE; spin_unlock_irqrestore(&pnode->lock, flags); } - lpfc_reset_flush_io_context(vport, tgt_id, lun_id, - LPFC_CTX_TGT); + status = lpfc_reset_flush_io_context(vport, tgt_id, lun_id, + LPFC_CTX_TGT); + if (status != SUCCESS) { + lpfc_printf_vlog(vport, KERN_ERR, LOG_FCP, + "0726 Target Reset flush status x%x\n", + status); + return status; + } return FAST_IO_FAIL; } @@ -6202,7 +6208,7 @@ lpfc_host_reset_handler(struct scsi_cmnd *cmnd) int rc, ret = SUCCESS; lpfc_printf_vlog(vport, KERN_ERR, LOG_FCP, - "3172 SCSI layer issued Host Reset Data:\n"); + "3172 SCSI layer issued Host Reset\n"); lpfc_offline_prep(phba, LPFC_MBX_WAIT); lpfc_offline(phba); diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index a8fbdf7119d8..7ea7c4245c69 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -8820,7 +8820,7 @@ lpfc_sli4_hba_setup(struct lpfc_hba *phba) if (unlikely(rc)) { lpfc_printf_log(phba, KERN_ERR, LOG_TRACE_EVENT, "0381 Error %d during queue setup.\n", rc); - goto out_stop_timers; + goto out_destroy_queue; } /* Initialize the driver internal SLI layer lists. */ lpfc_sli4_setup(phba); @@ -9103,7 +9103,6 @@ out_free_iocblist: lpfc_free_iocb_list(phba); out_destroy_queue: lpfc_sli4_queue_destroy(phba); -out_stop_timers: lpfc_stop_hba_timers(phba); out_free_mbox: mempool_free(mboxq, phba->mbox_mem_pool); @@ -12439,19 +12438,11 @@ lpfc_sli_issue_abort_iotag(struct lpfc_hba *phba, struct lpfc_sli_ring *pring, } /* - * If we're unloading, don't abort iocb on the ELS ring, but change - * the callback so that nothing happens when it finishes. + * Always abort the outstanding WQE and set the IA bit correctly + * for the context. This is necessary for correctly removing + * outstanding ndlp reference counts when the CQE completes with + * the XB bit set. */ - if (test_bit(FC_UNLOADING, &vport->load_flag) && - pring->ringno == LPFC_ELS_RING) { - if (cmdiocb->cmd_flag & LPFC_IO_FABRIC) - cmdiocb->fabric_cmd_cmpl = lpfc_ignore_els_cmpl; - else - cmdiocb->cmd_cmpl = lpfc_ignore_els_cmpl; - return retval; - } - - /* issue ABTS for this IOCB based on iotag */ abtsiocbp = __lpfc_sli_get_iocbq(phba); if (abtsiocbp == NULL) return IOCB_NORESOURCE; @@ -21373,7 +21364,7 @@ lpfc_sli4_issue_wqe(struct lpfc_hba *phba, struct lpfc_sli4_hdw_queue *qp, struct lpfc_sglq *sglq; struct lpfc_sli_ring *pring; unsigned long iflags; - uint32_t ret = 0; + int ret = 0; /* NVME_LS and NVME_LS ABTS requests. */ if (pwqe->cmd_flag & LPFC_IO_NVME_LS) { diff --git a/drivers/scsi/lpfc/lpfc_version.h b/drivers/scsi/lpfc/lpfc_version.h index 9ee3a3a4ec4d..31c3c5abdca6 100644 --- a/drivers/scsi/lpfc/lpfc_version.h +++ b/drivers/scsi/lpfc/lpfc_version.h @@ -20,7 +20,7 @@ * included with this package. 
* *******************************************************************/ -#define LPFC_DRIVER_VERSION "14.4.0.10" +#define LPFC_DRIVER_VERSION "14.4.0.11" #define LPFC_DRIVER_NAME "lpfc" /* Used for SLI 2/3 */ diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h b/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h index 96401eb7e231..8c8bfbbdd34e 100644 --- a/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h +++ b/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h @@ -322,6 +322,9 @@ struct mpi3_man6_gpio_entry { #define MPI3_MAN6_GPIO_EXTINT_PARAM1_FLAGS_TRIGGER_MASK (0x01) #define MPI3_MAN6_GPIO_EXTINT_PARAM1_FLAGS_TRIGGER_EDGE (0x00) #define MPI3_MAN6_GPIO_EXTINT_PARAM1_FLAGS_TRIGGER_LEVEL (0x01) +#define MPI3_MAN6_GPIO_OVER_TEMP_PARAM1_LEVEL_WARNING (0x00) +#define MPI3_MAN6_GPIO_OVER_TEMP_PARAM1_LEVEL_CRITICAL (0x01) +#define MPI3_MAN6_GPIO_OVER_TEMP_PARAM1_LEVEL_FATAL (0x02) #define MPI3_MAN6_GPIO_PORT_GREEN_PARAM1_PHY_STATUS_ALL_UP (0x00) #define MPI3_MAN6_GPIO_PORT_GREEN_PARAM1_PHY_STATUS_ONE_OR_MORE_UP (0x01) #define MPI3_MAN6_GPIO_CABLE_MGMT_PARAM1_INTERFACE_MODULE_PRESENT (0x00) @@ -1250,6 +1253,37 @@ struct mpi3_io_unit_page17 { __le32 current_key[]; }; #define MPI3_IOUNIT17_PAGEVERSION (0x00) +struct mpi3_io_unit_page18 { + struct mpi3_config_page_header header; + u8 flags; + u8 poll_interval; + __le16 reserved0a; + __le32 reserved0c; +}; + +#define MPI3_IOUNIT18_PAGEVERSION (0x00) +#define MPI3_IOUNIT18_FLAGS_DIRECTATTACHED_ENABLE (0x01) +#define MPI3_IOUNIT18_POLLINTERVAL_DISABLE (0x00) +#ifndef MPI3_IOUNIT19_DEVICE_MAX +#define MPI3_IOUNIT19_DEVICE_MAX (1) +#endif +struct mpi3_iounit19_device { + __le16 temperature; + __le16 dev_handle; + __le16 persistent_id; + __le16 reserved06; +}; + +#define MPI3_IOUNIT19_DEVICE_TEMPERATURE_UNAVAILABLE (0x8000) +struct mpi3_io_unit_page19 { + struct mpi3_config_page_header header; + __le16 num_devices; + __le16 reserved0a; + __le32 reserved0c; + struct mpi3_iounit19_device device[MPI3_IOUNIT19_DEVICE_MAX]; +}; + +#define MPI3_IOUNIT19_PAGEVERSION (0x00) struct mpi3_ioc_page0 { struct mpi3_config_page_header header; __le32 reserved08; @@ -2356,7 +2390,9 @@ struct mpi3_device0_vd_format { __le16 io_throttle_group; __le16 io_throttle_group_low; __le16 io_throttle_group_high; - __le32 reserved0c; + u8 vd_abort_to; + u8 vd_reset_to; + __le16 reserved0e; }; #define MPI3_DEVICE0_VD_STATE_OFFLINE (0x00) #define MPI3_DEVICE0_VD_STATE_PARTIALLY_DEGRADED (0x01) diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_pci.h b/drivers/scsi/mpi3mr/mpi/mpi30_pci.h index 7c15e5851ce4..4eeb11c3c73e 100644 --- a/drivers/scsi/mpi3mr/mpi/mpi30_pci.h +++ b/drivers/scsi/mpi3mr/mpi/mpi30_pci.h @@ -9,9 +9,11 @@ #define MPI3_NVME_ENCAP_CMD_MAX (1) #endif #define MPI3_NVME_FLAGS_FORCE_ADMIN_ERR_REPLY_MASK (0x0002) +#define MPI3_NVME_FLAGS_FORCE_ADMIN_ERR_REPLY_SHIFT (1) #define MPI3_NVME_FLAGS_FORCE_ADMIN_ERR_REPLY_FAIL_ONLY (0x0000) #define MPI3_NVME_FLAGS_FORCE_ADMIN_ERR_REPLY_ALL (0x0002) #define MPI3_NVME_FLAGS_SUBMISSIONQ_MASK (0x0001) +#define MPI3_NVME_FLAGS_SUBMISSIONQ_SHIFT (0) #define MPI3_NVME_FLAGS_SUBMISSIONQ_IO (0x0000) #define MPI3_NVME_FLAGS_SUBMISSIONQ_ADMIN (0x0001) diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_sas.h b/drivers/scsi/mpi3mr/mpi/mpi30_sas.h index 4a93c67d335f..190b06508b00 100644 --- a/drivers/scsi/mpi3mr/mpi/mpi30_sas.h +++ b/drivers/scsi/mpi3mr/mpi/mpi30_sas.h @@ -11,6 +11,7 @@ #define MPI3_SAS_DEVICE_INFO_STP_INITIATOR (0x00000010) #define MPI3_SAS_DEVICE_INFO_SMP_INITIATOR (0x00000008) #define MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_MASK (0x00000007) +#define 
MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_SHIFT (0) #define MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_NO_DEVICE (0x00000000) #define MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_END_DEVICE (0x00000001) #define MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_EXPANDER (0x00000002) diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_transport.h b/drivers/scsi/mpi3mr/mpi/mpi30_transport.h index 5c522e2531c3..28ab2efb3baa 100644 --- a/drivers/scsi/mpi3mr/mpi/mpi30_transport.h +++ b/drivers/scsi/mpi3mr/mpi/mpi30_transport.h @@ -18,7 +18,7 @@ union mpi3_version_union { #define MPI3_VERSION_MAJOR (3) #define MPI3_VERSION_MINOR (0) -#define MPI3_VERSION_UNIT (35) +#define MPI3_VERSION_UNIT (37) #define MPI3_VERSION_DEV (0) #define MPI3_DEVHANDLE_INVALID (0xffff) struct mpi3_sysif_oper_queue_indexes { diff --git a/drivers/scsi/mpi3mr/mpi3mr.h b/drivers/scsi/mpi3mr/mpi3mr.h index 8d4ef49e04d1..6742684e2990 100644 --- a/drivers/scsi/mpi3mr/mpi3mr.h +++ b/drivers/scsi/mpi3mr/mpi3mr.h @@ -56,8 +56,8 @@ extern struct list_head mrioc_list; extern int prot_mask; extern atomic64_t event_counter; -#define MPI3MR_DRIVER_VERSION "8.14.0.5.50" -#define MPI3MR_DRIVER_RELDATE "27-June-2025" +#define MPI3MR_DRIVER_VERSION "8.15.0.5.50" +#define MPI3MR_DRIVER_RELDATE "12-August-2025" #define MPI3MR_DRIVER_NAME "mpi3mr" #define MPI3MR_DRIVER_LICENSE "GPL" @@ -697,6 +697,8 @@ struct tgt_dev_vd { u16 tg_id; u32 tg_high; u32 tg_low; + u8 abort_to; + u8 reset_to; struct mpi3mr_throttle_group_info *tg; }; @@ -738,6 +740,8 @@ enum mpi3mr_dev_state { * @wwid: World wide ID * @enclosure_logical_id: Enclosure logical identifier * @dev_spec: Device type specific information + * @abort_to: Timeout for abort TM + * @reset_to: Timeout for Target/LUN reset TM * @ref_count: Reference count * @state: device state */ diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c index 0152d31d430a..8fe6e0bf342e 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_fw.c +++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c @@ -2353,6 +2353,8 @@ static int mpi3mr_create_op_queues(struct mpi3mr_ioc *mrioc) { int retval = 0; u16 num_queues = 0, i = 0, msix_count_op_q = 1; + u32 ioc_status; + enum mpi3mr_iocstate ioc_state; num_queues = min_t(int, mrioc->facts.max_op_reply_q, mrioc->facts.max_op_req_q); @@ -2408,6 +2410,14 @@ static int mpi3mr_create_op_queues(struct mpi3mr_ioc *mrioc) retval = -1; goto out_failed; } + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + ioc_state = mpi3mr_get_iocstate(mrioc); + if ((ioc_status & MPI3_SYSIF_IOC_STATUS_RESET_HISTORY) || + ioc_state != MRIOC_STATE_READY) { + mpi3mr_print_fault_info(mrioc); + retval = -1; + goto out_failed; + } mrioc->num_op_reply_q = mrioc->num_op_req_q = i; ioc_info(mrioc, "successfully created %d operational queue pairs(default/polled) queue = (%d/%d)\n", @@ -5420,6 +5430,7 @@ int mpi3mr_soft_reset_handler(struct mpi3mr_ioc *mrioc, mpi3mr_reset_rc_name(reset_reason)); mrioc->device_refresh_on = 0; + scsi_block_requests(mrioc->shost); mrioc->reset_in_progress = 1; mrioc->stop_bsgs = 1; mrioc->prev_reset_result = -1; @@ -5528,6 +5539,7 @@ out: if (!retval) { mrioc->diagsave_timeout = 0; mrioc->reset_in_progress = 0; + scsi_unblock_requests(mrioc->shost); mrioc->pel_abort_requested = 0; if (mrioc->pel_enabled) { mrioc->pel_cmds.retry_count = 0; @@ -5552,6 +5564,7 @@ out: mrioc->device_refresh_on = 0; mrioc->unrecoverable = 1; mrioc->reset_in_progress = 0; + scsi_unblock_requests(mrioc->shost); mrioc->stop_bsgs = 0; retval = -1; mpi3mr_flush_cmds_for_unrecovered_controller(mrioc); diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c 
b/drivers/scsi/mpi3mr/mpi3mr_os.c index 3df52a3b435b..b88633e1efe2 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_os.c +++ b/drivers/scsi/mpi3mr/mpi3mr_os.c @@ -1308,6 +1308,12 @@ static void mpi3mr_update_tgtdev(struct mpi3mr_ioc *mrioc, if (vdinf->vd_state == MPI3_DEVICE0_VD_STATE_OFFLINE) tgtdev->is_hidden = 1; tgtdev->non_stl = 1; + tgtdev->dev_spec.vd_inf.reset_to = + max_t(u8, vdinf->vd_reset_to, + MPI3MR_INTADMCMD_TIMEOUT); + tgtdev->dev_spec.vd_inf.abort_to = + max_t(u8, vdinf->vd_abort_to, + MPI3MR_INTADMCMD_TIMEOUT); tgtdev->dev_spec.vd_inf.tg_id = vdinf_io_throttle_group; tgtdev->dev_spec.vd_inf.tg_high = le16_to_cpu(vdinf->io_throttle_group_high) * 2048; @@ -2049,8 +2055,8 @@ static void mpi3mr_fwevt_bh(struct mpi3mr_ioc *mrioc, if (!fwevt->process_evt) goto evt_ack; - dprint_event_bh(mrioc, "processing event(0x%02x) in the bottom half handler\n", - fwevt->event_id); + dprint_event_bh(mrioc, "processing event(0x%02x) -(0x%08x) in the bottom half handler\n", + fwevt->event_id, fwevt->evt_ctx); switch (fwevt->event_id) { case MPI3_EVENT_DEVICE_ADDED: @@ -2866,12 +2872,14 @@ static void mpi3mr_preparereset_evt_th(struct mpi3mr_ioc *mrioc, "prepare for reset event top half with rc=start\n"); if (mrioc->prepare_for_reset) return; + scsi_block_requests(mrioc->shost); mrioc->prepare_for_reset = 1; mrioc->prepare_for_reset_timeout_counter = 0; } else if (evtdata->reason_code == MPI3_EVENT_PREPARE_RESET_RC_ABORT) { dprint_event_th(mrioc, "prepare for reset top half with rc=abort\n"); mrioc->prepare_for_reset = 0; + scsi_unblock_requests(mrioc->shost); mrioc->prepare_for_reset_timeout_counter = 0; } if ((event_reply->msg_flags & MPI3_EVENT_NOTIFY_MSGFLAGS_ACK_MASK) @@ -3076,8 +3084,8 @@ void mpi3mr_os_handle_events(struct mpi3mr_ioc *mrioc, } if (process_evt_bh || ack_req) { dprint_event_th(mrioc, - "scheduling bottom half handler for event(0x%02x),ack_required=%d\n", - evt_type, ack_req); + "scheduling bottom half handler for event(0x%02x) - (0x%08x), ack_required=%d\n", + evt_type, le32_to_cpu(event_reply->event_context), ack_req); sz = event_reply->event_data_length * 4; fwevt = mpi3mr_alloc_fwevt(sz); if (!fwevt) { @@ -3915,11 +3923,13 @@ int mpi3mr_issue_tm(struct mpi3mr_ioc *mrioc, u8 tm_type, if (scsi_tgt_priv_data) atomic_inc(&scsi_tgt_priv_data->block_io); - if (tgtdev && (tgtdev->dev_type == MPI3_DEVICE_DEVFORM_PCIE)) { - if (cmd_priv && tgtdev->dev_spec.pcie_inf.abort_to) - timeout = tgtdev->dev_spec.pcie_inf.abort_to; - else if (!cmd_priv && tgtdev->dev_spec.pcie_inf.reset_to) - timeout = tgtdev->dev_spec.pcie_inf.reset_to; + if (tgtdev) { + if (tgtdev->dev_type == MPI3_DEVICE_DEVFORM_PCIE) + timeout = cmd_priv ? tgtdev->dev_spec.pcie_inf.abort_to + : tgtdev->dev_spec.pcie_inf.reset_to; + else if (tgtdev->dev_type == MPI3_DEVICE_DEVFORM_VD) + timeout = cmd_priv ? 
tgtdev->dev_spec.vd_inf.abort_to + : tgtdev->dev_spec.vd_inf.reset_to; } init_completion(&drv_cmd->done); diff --git a/drivers/scsi/mpi3mr/mpi3mr_transport.c b/drivers/scsi/mpi3mr/mpi3mr_transport.c index c8d6ced5640e..d70f002d6487 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_transport.c +++ b/drivers/scsi/mpi3mr/mpi3mr_transport.c @@ -413,9 +413,11 @@ static void mpi3mr_remove_device_by_sas_address(struct mpi3mr_ioc *mrioc, sas_address, hba_port); if (tgtdev) { if (!list_empty(&tgtdev->list)) { - list_del_init(&tgtdev->list); was_on_tgtdev_list = 1; - mpi3mr_tgtdev_put(tgtdev); + if (tgtdev->state == MPI3MR_DEV_REMOVE_HS_STARTED) { + list_del_init(&tgtdev->list); + mpi3mr_tgtdev_put(tgtdev); + } } } spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); @@ -2079,6 +2081,8 @@ int mpi3mr_expander_add(struct mpi3mr_ioc *mrioc, u16 handle) link_rate = (expander_pg1.negotiated_link_rate & MPI3_SAS_NEG_LINK_RATE_LOGICAL_MASK) >> MPI3_SAS_NEG_LINK_RATE_LOGICAL_SHIFT; + if (link_rate < MPI3_SAS_NEG_LINK_RATE_1_5) + link_rate = MPI3_SAS_NEG_LINK_RATE_1_5; mpi3mr_update_links(mrioc, sas_address_parent, handle, i, link_rate, hba_port); } @@ -2388,6 +2392,9 @@ int mpi3mr_report_tgtdev_to_sas_transport(struct mpi3mr_ioc *mrioc, link_rate = mpi3mr_get_sas_negotiated_logical_linkrate(mrioc, tgtdev); + if (link_rate < MPI3_SAS_NEG_LINK_RATE_1_5) + link_rate = MPI3_SAS_NEG_LINK_RATE_1_5; + mpi3mr_update_links(mrioc, sas_address_parent, tgtdev->dev_handle, parent_phy_number, link_rate, hba_port); diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index bd3efa5b46c7..0d652db8fe24 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -1420,7 +1420,13 @@ _base_display_reply_info(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index, if (ioc_status & MPI2_IOCSTATUS_FLAG_LOG_INFO_AVAILABLE) { loginfo = le32_to_cpu(mpi_reply->IOCLogInfo); - _base_sas_log_info(ioc, loginfo); + if (ioc->logging_level & MPT_DEBUG_REPLY) + _base_sas_log_info(ioc, loginfo); + else { + if (!((ioc_status & MPI2_IOCSTATUS_MASK) & + MPI2_IOCSTATUS_CONFIG_INVALID_PAGE)) + _base_sas_log_info(ioc, loginfo); + } } if (ioc_status || loginfo) { diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.h b/drivers/scsi/mpt3sas/mpt3sas_base.h index 939141cde3ca..e6a6f21d309b 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.h +++ b/drivers/scsi/mpt3sas/mpt3sas_base.h @@ -77,8 +77,8 @@ #define MPT3SAS_DRIVER_NAME "mpt3sas" #define MPT3SAS_AUTHOR "Avago Technologies <MPT-FusionLinux.pdl@avagotech.com>" #define MPT3SAS_DESCRIPTION "LSI MPT Fusion SAS 3.0 Device Driver" -#define MPT3SAS_DRIVER_VERSION "52.100.00.00" -#define MPT3SAS_MAJOR_VERSION 52 +#define MPT3SAS_DRIVER_VERSION "54.100.00.00" +#define MPT3SAS_MAJOR_VERSION 54 #define MPT3SAS_MINOR_VERSION 100 #define MPT3SAS_BUILD_VERSION 00 #define MPT3SAS_RELEASE_VERSION 00 diff --git a/drivers/scsi/mpt3sas/mpt3sas_transport.c b/drivers/scsi/mpt3sas/mpt3sas_transport.c index dc74ebc6405a..f3400d01cc2a 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_transport.c +++ b/drivers/scsi/mpt3sas/mpt3sas_transport.c @@ -166,6 +166,9 @@ _transport_convert_phy_link_rate(u8 link_rate) case MPI25_SAS_NEG_LINK_RATE_12_0: rc = SAS_LINK_RATE_12_0_GBPS; break; + case MPI26_SAS_NEG_LINK_RATE_22_5: + rc = SAS_LINK_RATE_22_5_GBPS; + break; case MPI2_SAS_NEG_LINK_RATE_PHY_DISABLED: rc = SAS_PHY_DISABLED; break; @@ -987,11 +990,9 @@ mpt3sas_transport_port_remove(struct MPT3SAS_ADAPTER *ioc, u64 sas_address, list_for_each_entry_safe(mpt3sas_phy, next_phy, 
&mpt3sas_port->phy_list, port_siblings) { if ((ioc->logging_level & MPT_DEBUG_TRANSPORT)) - dev_printk(KERN_INFO, &mpt3sas_port->port->dev, - "remove: sas_addr(0x%016llx), phy(%d)\n", - (unsigned long long) - mpt3sas_port->remote_identify.sas_address, - mpt3sas_phy->phy_id); + ioc_info(ioc, "remove: sas_addr(0x%016llx), phy(%d)\n", + (unsigned long long) mpt3sas_port->remote_identify.sas_address, + mpt3sas_phy->phy_id); mpt3sas_phy->phy_belongs_to_port = 0; if (!ioc->remove_host) sas_port_delete_phy(mpt3sas_port->port, diff --git a/drivers/scsi/mvsas/mv_sas.c b/drivers/scsi/mvsas/mv_sas.c index 15b3d9d55a4b..f2e7997d5b9d 100644 --- a/drivers/scsi/mvsas/mv_sas.c +++ b/drivers/scsi/mvsas/mv_sas.c @@ -1175,7 +1175,7 @@ static int mvs_dev_found_notify(struct domain_device *dev, int lock) mvi_device->dev_type = dev->dev_type; mvi_device->mvi_info = mvi; mvi_device->sas_device = dev; - if (parent_dev && dev_is_expander(parent_dev->dev_type)) { + if (dev_parent_is_expander(dev)) { int phy_id; phy_id = sas_find_attached_phy_id(&parent_dev->ex_dev, dev); diff --git a/drivers/scsi/myrs.c b/drivers/scsi/myrs.c index 95af3bb03834..a58abd796603 100644 --- a/drivers/scsi/myrs.c +++ b/drivers/scsi/myrs.c @@ -498,14 +498,14 @@ static bool myrs_enable_mmio_mbox(struct myrs_hba *cs, /* Temporary dma mapping, used only in the scope of this function */ mbox = dma_alloc_coherent(&pdev->dev, sizeof(union myrs_cmd_mbox), &mbox_addr, GFP_KERNEL); - if (dma_mapping_error(&pdev->dev, mbox_addr)) + if (!mbox) return false; /* These are the base addresses for the command memory mailbox array */ cs->cmd_mbox_size = MYRS_MAX_CMD_MBOX * sizeof(union myrs_cmd_mbox); cmd_mbox = dma_alloc_coherent(&pdev->dev, cs->cmd_mbox_size, &cs->cmd_mbox_addr, GFP_KERNEL); - if (dma_mapping_error(&pdev->dev, cs->cmd_mbox_addr)) { + if (!cmd_mbox) { dev_err(&pdev->dev, "Failed to map command mailbox\n"); goto out_free; } @@ -520,7 +520,7 @@ static bool myrs_enable_mmio_mbox(struct myrs_hba *cs, cs->stat_mbox_size = MYRS_MAX_STAT_MBOX * sizeof(struct myrs_stat_mbox); stat_mbox = dma_alloc_coherent(&pdev->dev, cs->stat_mbox_size, &cs->stat_mbox_addr, GFP_KERNEL); - if (dma_mapping_error(&pdev->dev, cs->stat_mbox_addr)) { + if (!stat_mbox) { dev_err(&pdev->dev, "Failed to map status mailbox\n"); goto out_free; } @@ -533,7 +533,7 @@ static bool myrs_enable_mmio_mbox(struct myrs_hba *cs, cs->fwstat_buf = dma_alloc_coherent(&pdev->dev, sizeof(struct myrs_fwstat), &cs->fwstat_addr, GFP_KERNEL); - if (dma_mapping_error(&pdev->dev, cs->fwstat_addr)) { + if (!cs->fwstat_buf) { dev_err(&pdev->dev, "Failed to map firmware health buffer\n"); cs->fwstat_buf = NULL; goto out_free; diff --git a/drivers/scsi/pm8001/pm8001_ctl.c b/drivers/scsi/pm8001/pm8001_ctl.c index 7618f9cc9986..cbfda8c04e95 100644 --- a/drivers/scsi/pm8001/pm8001_ctl.c +++ b/drivers/scsi/pm8001/pm8001_ctl.c @@ -534,23 +534,25 @@ static ssize_t pm8001_ctl_iop_log_show(struct device *cdev, char *str = buf; u32 read_size = pm8001_ha->main_cfg_tbl.pm80xx_tbl.event_log_size / 1024; - static u32 start, end, count; u32 max_read_times = 32; u32 max_count = (read_size * 1024) / (max_read_times * 4); u32 *temp = (u32 *)pm8001_ha->memoryMap.region[IOP].virt_ptr; - if ((count % max_count) == 0) { - start = 0; - end = max_read_times; - count = 0; + mutex_lock(&pm8001_ha->iop_log_lock); + + if ((pm8001_ha->iop_log_count % max_count) == 0) { + pm8001_ha->iop_log_start = 0; + pm8001_ha->iop_log_end = max_read_times; + pm8001_ha->iop_log_count = 0; } else { - start = end; - end = end + 
max_read_times; + pm8001_ha->iop_log_start = pm8001_ha->iop_log_end; + pm8001_ha->iop_log_end = pm8001_ha->iop_log_end + max_read_times; } - for (; start < end; start++) - str += sprintf(str, "%08x ", *(temp+start)); - count++; + for (; pm8001_ha->iop_log_start < pm8001_ha->iop_log_end; pm8001_ha->iop_log_start++) + str += sprintf(str, "%08x ", *(temp+pm8001_ha->iop_log_start)); + pm8001_ha->iop_log_count++; + mutex_unlock(&pm8001_ha->iop_log_lock); return str - buf; } static DEVICE_ATTR(iop_log, S_IRUGO, pm8001_ctl_iop_log_show, NULL); @@ -680,7 +682,7 @@ static int pm8001_set_nvmd(struct pm8001_hba_info *pm8001_ha) struct pm8001_ioctl_payload *payload; DECLARE_COMPLETION_ONSTACK(completion); u8 *ioctlbuffer; - u32 ret; + int ret; u32 length = 1024 * 5 + sizeof(*payload) - 1; if (pm8001_ha->fw_image->size > 4096) { diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c index 42a4eeac24c9..8005995a317c 100644 --- a/drivers/scsi/pm8001/pm8001_hwi.c +++ b/drivers/scsi/pm8001/pm8001_hwi.c @@ -2163,8 +2163,7 @@ mpi_sata_completion(struct pm8001_hba_info *pm8001_ha, void *piomb) /* Print sas address of IO failed device */ if ((status != IO_SUCCESS) && (status != IO_OVERFLOW) && (status != IO_UNDERFLOW)) { - if (!((t->dev->parent) && - (dev_is_expander(t->dev->parent->dev_type)))) { + if (!dev_parent_is_expander(t->dev)) { for (i = 0, j = 4; j <= 7 && i <= 3; i++, j++) sata_addr_low[i] = pm8001_ha->sas_addr[j]; for (i = 0, j = 0; j <= 3 && i <= 3; i++, j++) @@ -4168,7 +4167,6 @@ static int pm8001_chip_reg_dev_req(struct pm8001_hba_info *pm8001_ha, u16 firstBurstSize = 0; u16 ITNT = 2000; struct domain_device *dev = pm8001_dev->sas_device; - struct domain_device *parent_dev = dev->parent; struct pm8001_port *port = dev->port->lldd_port; memset(&payload, 0, sizeof(payload)); @@ -4186,10 +4184,9 @@ static int pm8001_chip_reg_dev_req(struct pm8001_hba_info *pm8001_ha, dev_is_expander(pm8001_dev->dev_type)) stp_sspsmp_sata = 0x01; /*ssp or smp*/ } - if (parent_dev && dev_is_expander(parent_dev->dev_type)) - phy_id = parent_dev->ex_dev.ex_phy->phy_id; - else - phy_id = pm8001_dev->attached_phy; + + phy_id = pm80xx_get_local_phy_id(dev); + opc = OPC_INB_REG_DEV; linkrate = (pm8001_dev->sas_device->linkrate < dev->port->linkrate) ? pm8001_dev->sas_device->linkrate : dev->port->linkrate; diff --git a/drivers/scsi/pm8001/pm8001_hwi.h b/drivers/scsi/pm8001/pm8001_hwi.h index fc2127dcb58d..f1ce8df082b0 100644 --- a/drivers/scsi/pm8001/pm8001_hwi.h +++ b/drivers/scsi/pm8001/pm8001_hwi.h @@ -339,8 +339,10 @@ struct ssp_completion_resp { __le32 status; __le32 param; __le32 ssptag_rescv_rescpad; + + /* Must be last --ends in a flexible-array member. 
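The pm8001 iop_log hunk being shown here replaces function-static cursors (start/end/count) with per-HBA fields serialized by a new mutex, so concurrent sysfs readers and multiple adapters no longer share hidden state. A simplified sketch of the pattern, using hypothetical structure and attribute names:

#include <linux/types.h>
#include <linux/mutex.h>

struct my_hba {
	struct mutex log_lock;
	u32 log_start, log_end, log_count;
};

static ssize_t log_show(struct my_hba *hba, char *buf, u32 chunk, u32 max_count)
{
	char *str = buf;

	mutex_lock(&hba->log_lock);	/* one reader at a time, per adapter */
	if ((hba->log_count % max_count) == 0) {
		hba->log_start = 0;
		hba->log_end = chunk;
		hba->log_count = 0;
	} else {
		hba->log_start = hba->log_end;
		hba->log_end += chunk;
	}
	/* ... advance str while emitting entries [log_start, log_end) ... */
	hba->log_count++;
	mutex_unlock(&hba->log_lock);

	return str - buf;
}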
*/ struct ssp_response_iu ssp_resp_iu; - __le32 residual_count; + /* __le32 residual_count; */ } __attribute__((packed, aligned(4))); diff --git a/drivers/scsi/pm8001/pm8001_init.c b/drivers/scsi/pm8001/pm8001_init.c index 599410bcdfea..8ff4b89ff81e 100644 --- a/drivers/scsi/pm8001/pm8001_init.c +++ b/drivers/scsi/pm8001/pm8001_init.c @@ -552,6 +552,7 @@ static struct pm8001_hba_info *pm8001_pci_alloc(struct pci_dev *pdev, pm8001_ha->id = pm8001_id++; pm8001_ha->logging_level = logging_level; pm8001_ha->non_fatal_count = 0; + mutex_init(&pm8001_ha->iop_log_lock); if (link_rate >= 1 && link_rate <= 15) pm8001_ha->link_rate = (link_rate << 8); else { diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c index f7067878b34f..6a8d35aea93a 100644 --- a/drivers/scsi/pm8001/pm8001_sas.c +++ b/drivers/scsi/pm8001/pm8001_sas.c @@ -130,6 +130,16 @@ static void pm80xx_get_tag_opcodes(struct sas_task *task, int *ata_op, } } +u32 pm80xx_get_local_phy_id(struct domain_device *dev) +{ + struct pm8001_device *pm8001_dev = dev->lldd_dev; + + if (dev_parent_is_expander(dev)) + return dev->parent->ex_dev.ex_phy->phy_id; + + return pm8001_dev->attached_phy; +} + void pm80xx_show_pending_commands(struct pm8001_hba_info *pm8001_ha, struct pm8001_device *target_pm8001_dev) { @@ -477,7 +487,7 @@ int pm8001_queue_command(struct sas_task *task, gfp_t gfp_flags) struct pm8001_device *pm8001_dev = dev->lldd_dev; bool internal_abort = sas_is_internal_abort(task); struct pm8001_hba_info *pm8001_ha; - struct pm8001_port *port = NULL; + struct pm8001_port *port; struct pm8001_ccb_info *ccb; unsigned long flags; u32 n_elem = 0; @@ -502,8 +512,7 @@ int pm8001_queue_command(struct sas_task *task, gfp_t gfp_flags) spin_lock_irqsave(&pm8001_ha->lock, flags); - pm8001_dev = dev->lldd_dev; - port = pm8001_ha->phy[pm8001_dev->attached_phy].port; + port = dev->port->lldd_port; if (!internal_abort && (DEV_IS_GONE(pm8001_dev) || !port || !port->port_attached)) { @@ -701,7 +710,7 @@ static int pm8001_dev_found_notify(struct domain_device *dev) dev->lldd_dev = pm8001_device; pm8001_device->dev_type = dev->dev_type; pm8001_device->dcompletion = &completion; - if (parent_dev && dev_is_expander(parent_dev->dev_type)) { + if (dev_parent_is_expander(dev)) { int phy_id; phy_id = sas_find_attached_phy_id(&parent_dev->ex_dev, dev); @@ -766,7 +775,16 @@ static void pm8001_dev_gone_notify(struct domain_device *dev) spin_lock_irqsave(&pm8001_ha->lock, flags); } PM8001_CHIP_DISP->dereg_dev_req(pm8001_ha, device_id); - pm8001_ha->phy[pm8001_dev->attached_phy].phy_attached = 0; + + /* + * The phy array only contains local phys. Thus, we cannot clear + * phy_attached for a device behind an expander. 
+ */ + if (!dev_parent_is_expander(dev)) { + u32 phy_id = pm80xx_get_local_phy_id(dev); + + pm8001_ha->phy[phy_id].phy_attached = 0; + } pm8001_free_dev(pm8001_dev); } else { pm8001_dbg(pm8001_ha, DISC, "Found dev has gone.\n"); @@ -1048,7 +1066,7 @@ int pm8001_abort_task(struct sas_task *task) struct pm8001_hba_info *pm8001_ha; struct pm8001_device *pm8001_dev; int rc = TMF_RESP_FUNC_FAILED, ret; - u32 phy_id, port_id; + u32 port_id; struct sas_task_slow slow_task; if (!task->lldd_task || !task->dev) @@ -1057,7 +1075,6 @@ int pm8001_abort_task(struct sas_task *task) dev = task->dev; pm8001_dev = dev->lldd_dev; pm8001_ha = pm8001_find_ha_by_dev(dev); - phy_id = pm8001_dev->attached_phy; if (PM8001_CHIP_DISP->fatal_errors(pm8001_ha)) { // If the controller is seeing fatal errors @@ -1089,7 +1106,8 @@ int pm8001_abort_task(struct sas_task *task) if (pm8001_ha->chip_id == chip_8006) { DECLARE_COMPLETION_ONSTACK(completion_reset); DECLARE_COMPLETION_ONSTACK(completion); - struct pm8001_phy *phy = pm8001_ha->phy + phy_id; + u32 phy_id = pm80xx_get_local_phy_id(dev); + struct pm8001_phy *phy = &pm8001_ha->phy[phy_id]; port_id = phy->port->port_id; /* 1. Set Device state as Recovery */ diff --git a/drivers/scsi/pm8001/pm8001_sas.h b/drivers/scsi/pm8001/pm8001_sas.h index 334485bb2c12..b63b6ffcaaf5 100644 --- a/drivers/scsi/pm8001/pm8001_sas.h +++ b/drivers/scsi/pm8001/pm8001_sas.h @@ -547,6 +547,10 @@ struct pm8001_hba_info { u32 ci_offset; u32 pi_offset; u32 max_memcnt; + u32 iop_log_start; + u32 iop_log_end; + u32 iop_log_count; + struct mutex iop_log_lock; }; struct pm8001_work { @@ -798,6 +802,7 @@ void pm8001_setds_completion(struct domain_device *dev); void pm8001_tmf_aborted(struct sas_task *task); void pm80xx_show_pending_commands(struct pm8001_hba_info *pm8001_ha, struct pm8001_device *dev); +u32 pm80xx_get_local_phy_id(struct domain_device *dev); #endif diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index c1bae995a412..31960b72c1e9 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -2340,8 +2340,7 @@ mpi_sata_completion(struct pm8001_hba_info *pm8001_ha, /* Print sas address of IO failed device */ if ((status != IO_SUCCESS) && (status != IO_OVERFLOW) && (status != IO_UNDERFLOW)) { - if (!((t->dev->parent) && - (dev_is_expander(t->dev->parent->dev_type)))) { + if (!dev_parent_is_expander(t->dev)) { for (i = 0, j = 4; i <= 3 && j <= 7; i++, j++) sata_addr_low[i] = pm8001_ha->sas_addr[j]; for (i = 0, j = 0; i <= 3 && j <= 3; i++, j++) @@ -4780,7 +4779,6 @@ static int pm80xx_chip_reg_dev_req(struct pm8001_hba_info *pm8001_ha, u16 firstBurstSize = 0; u16 ITNT = 2000; struct domain_device *dev = pm8001_dev->sas_device; - struct domain_device *parent_dev = dev->parent; struct pm8001_port *port = dev->port->lldd_port; memset(&payload, 0, sizeof(payload)); @@ -4799,10 +4797,8 @@ static int pm80xx_chip_reg_dev_req(struct pm8001_hba_info *pm8001_ha, dev_is_expander(pm8001_dev->dev_type)) stp_sspsmp_sata = 0x01; /*ssp or smp*/ } - if (parent_dev && dev_is_expander(parent_dev->dev_type)) - phy_id = parent_dev->ex_dev.ex_phy->phy_id; - else - phy_id = pm8001_dev->attached_phy; + + phy_id = pm80xx_get_local_phy_id(dev); opc = OPC_INB_REG_DEV; diff --git a/drivers/scsi/pm8001/pm80xx_hwi.h b/drivers/scsi/pm8001/pm80xx_hwi.h index eb8fd37b2066..d8a63b7fed6a 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.h +++ b/drivers/scsi/pm8001/pm80xx_hwi.h @@ -558,8 +558,10 @@ struct ssp_completion_resp { __le32 status; __le32 param; __le32 
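For reference, the helper that the surrounding hunks converge on is small enough to restate on its own. This mirrors the pm8001_sas.c addition above and assumes the driver's pm8001_device type plus the new dev_parent_is_expander() libsas helper:

/* Pick the local phy used to reach a device: devices behind an expander
 * are reached through the expander's attached phy, direct-attached
 * devices use their own attached_phy. */
u32 pm80xx_get_local_phy_id(struct domain_device *dev)
{
	struct pm8001_device *pm8001_dev = dev->lldd_dev;

	if (dev_parent_is_expander(dev))
		return dev->parent->ex_dev.ex_phy->phy_id;

	return pm8001_dev->attached_phy;
}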
ssptag_rescv_rescpad; + + /* Must be last --ends in a flexible-array member. */ struct ssp_response_iu ssp_resp_iu; - __le32 residual_count; + /* __le32 residual_count; */ } __attribute__((packed, aligned(4))); #define SSP_RESCV_BIT 0x00010000 diff --git a/drivers/scsi/qla2xxx/qla_bsg.c b/drivers/scsi/qla2xxx/qla_bsg.c index 10431a67d202..ccfc2d26dd37 100644 --- a/drivers/scsi/qla2xxx/qla_bsg.c +++ b/drivers/scsi/qla2xxx/qla_bsg.c @@ -3106,8 +3106,8 @@ static bool qla_bsg_found(struct qla_qpair *qpair, struct bsg_job *bsg_job) switch (rval) { case QLA_SUCCESS: /* Wait for the command completion. */ - ratov_j = ha->r_a_tov / 10 * 4 * 1000; - ratov_j = msecs_to_jiffies(ratov_j); + ratov_j = ha->r_a_tov / 10 * 4; + ratov_j = secs_to_jiffies(ratov_j); if (!wait_for_completion_timeout(&comp, ratov_j)) { ql_log(ql_log_info, vha, 0x7089, diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index cb95b7b12051..604e66bead1e 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -4890,9 +4890,7 @@ struct purex_item { struct purex_item *pkt); atomic_t in_use; uint16_t size; - struct { - uint8_t iocb[64]; - } iocb; + uint8_t iocb[] __counted_by(size); }; #include "qla_edif.h" @@ -5101,7 +5099,6 @@ typedef struct scsi_qla_host { struct list_head head; spinlock_t lock; } purex_list; - struct purex_item default_item; struct name_list_extended gnl; /* Count of active session/fcport */ @@ -5130,6 +5127,11 @@ typedef struct scsi_qla_host { #define DPORT_DIAG_IN_PROGRESS BIT_0 #define DPORT_DIAG_CHIP_RESET_IN_PROGRESS BIT_1 uint16_t dport_status; + + /* Must be last --ends in a flexible-array member. */ + TRAILING_OVERLAP(struct purex_item, default_item, iocb, + uint8_t __default_item_iocb[QLA_DEFAULT_PAYLOAD_SIZE]; + ); } scsi_qla_host_t; struct qla27xx_image_status { diff --git a/drivers/scsi/qla2xxx/qla_edif.c b/drivers/scsi/qla2xxx/qla_edif.c index 91bbd3b75bff..ccd4485087a1 100644 --- a/drivers/scsi/qla2xxx/qla_edif.c +++ b/drivers/scsi/qla2xxx/qla_edif.c @@ -1798,7 +1798,7 @@ retry: switch (rval) { case QLA_SUCCESS: break; - case EAGAIN: + case -EAGAIN: msleep(EDIF_MSLEEP_INTERVAL); cnt++; if (cnt < EDIF_RETRY_COUNT) @@ -3649,7 +3649,7 @@ retry: p->e.extra_rx_xchg_address, p->e.extra_control_flags, sp->handle, sp->remap.req.len, bsg_job); break; - case EAGAIN: + case -EAGAIN: msleep(EDIF_MSLEEP_INTERVAL); cnt++; if (cnt < EDIF_RETRY_COUNT) diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index be211ff22acb..6a2e1c7fd125 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -2059,11 +2059,11 @@ static void qla_marker_sp_done(srb_t *sp, int res) int cnt = 5; \ do { \ if (_chip_gen != sp->vha->hw->chip_reset || _login_gen != sp->fcport->login_gen) {\ - _rval = EINVAL; \ + _rval = -EINVAL; \ break; \ } \ _rval = qla2x00_start_sp(_sp); \ - if (_rval == EAGAIN) \ + if (_rval == -EAGAIN) \ msleep(1); \ else \ break; \ diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c index c4c6b5c6658c..4559b490614d 100644 --- a/drivers/scsi/qla2xxx/qla_isr.c +++ b/drivers/scsi/qla2xxx/qla_isr.c @@ -1077,17 +1077,17 @@ static struct purex_item * qla24xx_alloc_purex_item(scsi_qla_host_t *vha, uint16_t size) { struct purex_item *item = NULL; - uint8_t item_hdr_size = sizeof(*item); if (size > QLA_DEFAULT_PAYLOAD_SIZE) { - item = kzalloc(item_hdr_size + - (size - QLA_DEFAULT_PAYLOAD_SIZE), GFP_ATOMIC); + item = kzalloc(struct_size(item, iocb, size), GFP_ATOMIC); } else { if 
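The qla2xxx purex_item rework here converts the fixed 64-byte iocb into a flexible-array member annotated with __counted_by(size) and sizes allocations with struct_size(), so the compiler's bounds checking can see the real payload length. A self-contained sketch of that allocation idiom with a simplified structure:

#include <linux/types.h>
#include <linux/overflow.h>
#include <linux/slab.h>

struct pkt_item {
	u16 size;
	u8 iocb[] __counted_by(size);	/* trailing payload, length in ->size */
};

static struct pkt_item *pkt_item_alloc(u16 payload_len, gfp_t gfp)
{
	/* struct_size() = sizeof(*item) + payload_len, with overflow checking */
	struct pkt_item *item = kzalloc(struct_size(item, iocb, payload_len), gfp);

	if (item)
		item->size = payload_len;	/* must match the allocated count */

	return item;
}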
(atomic_inc_return(&vha->default_item.in_use) == 1) { item = &vha->default_item; goto initialize_purex_header; } else { - item = kzalloc(item_hdr_size, GFP_ATOMIC); + item = kzalloc( + struct_size(item, iocb, QLA_DEFAULT_PAYLOAD_SIZE), + GFP_ATOMIC); } } if (!item) { @@ -1127,17 +1127,16 @@ qla24xx_queue_purex_item(scsi_qla_host_t *vha, struct purex_item *pkt, * @vha: SCSI driver HA context * @pkt: ELS packet */ -static struct purex_item -*qla24xx_copy_std_pkt(struct scsi_qla_host *vha, void *pkt) +static struct purex_item * +qla24xx_copy_std_pkt(struct scsi_qla_host *vha, void *pkt) { struct purex_item *item; - item = qla24xx_alloc_purex_item(vha, - QLA_DEFAULT_PAYLOAD_SIZE); + item = qla24xx_alloc_purex_item(vha, QLA_DEFAULT_PAYLOAD_SIZE); if (!item) return item; - memcpy(&item->iocb, pkt, sizeof(item->iocb)); + memcpy(&item->iocb, pkt, QLA_DEFAULT_PAYLOAD_SIZE); return item; } diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c index 8ee2e337c9e1..065f9bcca26f 100644 --- a/drivers/scsi/qla2xxx/qla_nvme.c +++ b/drivers/scsi/qla2xxx/qla_nvme.c @@ -419,7 +419,7 @@ retry: switch (rval) { case QLA_SUCCESS: break; - case EAGAIN: + case -EAGAIN: msleep(PURLS_MSLEEP_INTERVAL); cnt++; if (cnt < PURLS_RETRY_COUNT) @@ -1308,7 +1308,7 @@ void qla2xxx_process_purls_iocb(void **pkt, struct rsp_que **rsp) ql_dbg(ql_dbg_unsol, vha, 0x2121, "PURLS OP[%01x] size %d xchg addr 0x%x portid %06x\n", - item->iocb.iocb[3], item->size, uctx->exchange_address, + item->iocb[3], item->size, uctx->exchange_address, fcport->d_id.b24); /* +48 0 1 2 3 4 5 6 7 8 9 A B C D E F * ----- ----------------------------------------------- diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index d4b484c0fd9d..98a5c105fdfd 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -1291,8 +1291,8 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd) "Abort command mbx cmd=%p, rval=%x.\n", cmd, rval); /* Wait for the command completion. */ - ratov_j = ha->r_a_tov/10 * 4 * 1000; - ratov_j = msecs_to_jiffies(ratov_j); + ratov_j = ha->r_a_tov / 10 * 4; + ratov_j = secs_to_jiffies(ratov_j); switch (rval) { case QLA_SUCCESS: if (!wait_for_completion_timeout(&comp, ratov_j)) { @@ -1806,8 +1806,8 @@ static void qla2x00_abort_srb(struct qla_qpair *qp, srb_t *sp, const int res, rval = ha->isp_ops->abort_command(sp); /* Wait for command completion. 
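The qla2xxx timeout hunks drop the hand-rolled "* 1000" plus msecs_to_jiffies() conversion in favour of secs_to_jiffies(), because r_a_tov / 10 * 4 is already a value in seconds (the division by 10 suggests r_a_tov is kept in 100 ms units). A small sketch of the equivalent conversion, treated as illustrative rather than driver code:

#include <linux/jiffies.h>

/* Wait for 4 * R_A_TOV, with r_a_tov presumed to be in 100 ms units. */
static unsigned long ratov_to_jiffies(unsigned int r_a_tov)
{
	unsigned int secs = r_a_tov / 10 * 4;

	return secs_to_jiffies(secs);	/* same as msecs_to_jiffies(secs * 1000) */
}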
*/ ret_cmd = false; - ratov_j = ha->r_a_tov/10 * 4 * 1000; - ratov_j = msecs_to_jiffies(ratov_j); + ratov_j = ha->r_a_tov / 10 * 4; + ratov_j = secs_to_jiffies(ratov_j); switch (rval) { case QLA_SUCCESS: if (wait_for_completion_timeout(&comp, ratov_j)) { @@ -6459,9 +6459,10 @@ dealloc: void qla24xx_free_purex_item(struct purex_item *item) { - if (item == &item->vha->default_item) + if (item == &item->vha->default_item) { memset(&item->vha->default_item, 0, sizeof(struct purex_item)); - else + memset(&item->vha->__default_item_iocb, 0, QLA_DEFAULT_PAYLOAD_SIZE); + } else kfree(item); } diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 353cb60e1abe..b2ab97be5db3 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -1155,14 +1155,9 @@ static ssize_t sdebug_error_write(struct file *file, const char __user *ubuf, struct sdebug_err_inject *inject; struct scsi_device *sdev = (struct scsi_device *)file->f_inode->i_private; - buf = kzalloc(count + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, ubuf, count)) { - kfree(buf); - return -EFAULT; - } + buf = memdup_user_nul(ubuf, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); if (buf[0] == '-') return sdebug_err_remove(sdev, buf, count); @@ -8805,8 +8800,8 @@ static int sdebug_add_store(void) /* Logical Block Provisioning */ if (scsi_debug_lbp()) { map_size = lba_to_map_index(sdebug_store_sectors - 1) + 1; - sip->map_storep = vmalloc(array_size(sizeof(long), - BITS_TO_LONGS(map_size))); + sip->map_storep = vcalloc(BITS_TO_LONGS(map_size), + sizeof(long)); pr_info("%lu provisioning blocks\n", map_size); @@ -8815,8 +8810,6 @@ static int sdebug_add_store(void) goto err; } - bitmap_zero(sip->map_storep, map_size); - /* Map first 1KB for partition table */ if (sdebug_num_parts) map_region(sip, 0, 2); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 00ad574ce61c..0252d3f6bed1 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -106,7 +106,7 @@ static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim, unsigned int mode); static void sd_config_write_same(struct scsi_disk *sdkp, struct queue_limits *lim); -static int sd_revalidate_disk(struct gendisk *); +static void sd_revalidate_disk(struct gendisk *); static void sd_unlock_native_capacity(struct gendisk *disk); static void sd_shutdown(struct device *); static void scsi_disk_release(struct device *cdev); @@ -3691,13 +3691,13 @@ static void sd_read_block_zero(struct scsi_disk *sdkp) * performs disk spin up, read_capacity, etc. * @disk: struct gendisk we care about **/ -static int sd_revalidate_disk(struct gendisk *disk) +static void sd_revalidate_disk(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; sector_t old_capacity = sdkp->capacity; - struct queue_limits lim; - unsigned char *buffer; + struct queue_limits *lim = NULL; + unsigned char *buffer = NULL; unsigned int dev_max; int err; @@ -3709,25 +3709,26 @@ static int sd_revalidate_disk(struct gendisk *disk) * of the other niceties. 
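scsi_debug's error-injection write handler above is switched to memdup_user_nul(), which allocates, copies from user space, and NUL-terminates in one call, returning an ERR_PTR on failure. A minimal sketch of a debugfs-style write handler using it; the handler name and parsing step are hypothetical:

#include <linux/string.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/fs.h>

static ssize_t example_write(struct file *file, const char __user *ubuf,
			     size_t count, loff_t *ppos)
{
	char *buf = memdup_user_nul(ubuf, count);	/* kmalloc + copy + '\0' */

	if (IS_ERR(buf))
		return PTR_ERR(buf);

	/* ... parse the NUL-terminated command held in buf ... */

	kfree(buf);
	return count;
}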
*/ if (!scsi_device_online(sdp)) - goto out; + return; + + lim = kmalloc(sizeof(*lim), GFP_KERNEL); + if (!lim) + return; buffer = kmalloc(SD_BUF_SIZE, GFP_KERNEL); - if (!buffer) { - sd_printk(KERN_WARNING, sdkp, "sd_revalidate_disk: Memory " - "allocation failure.\n"); + if (!buffer) goto out; - } sd_spinup_disk(sdkp); - lim = queue_limits_start_update(sdkp->disk->queue); + *lim = queue_limits_start_update(sdkp->disk->queue); /* * Without media there is no reason to ask; moreover, some devices * react badly if we do. */ if (sdkp->media_present) { - sd_read_capacity(sdkp, &lim, buffer); + sd_read_capacity(sdkp, lim, buffer); /* * Some USB/UAS devices return generic values for mode pages * until the media has been accessed. Trigger a READ operation @@ -3741,17 +3742,17 @@ static int sd_revalidate_disk(struct gendisk *disk) * cause this to be updated correctly and any device which * doesn't support it should be treated as rotational. */ - lim.features |= (BLK_FEAT_ROTATIONAL | BLK_FEAT_ADD_RANDOM); + lim->features |= (BLK_FEAT_ROTATIONAL | BLK_FEAT_ADD_RANDOM); if (scsi_device_supports_vpd(sdp)) { sd_read_block_provisioning(sdkp); - sd_read_block_limits(sdkp, &lim); + sd_read_block_limits(sdkp, lim); sd_read_block_limits_ext(sdkp); - sd_read_block_characteristics(sdkp, &lim); - sd_zbc_read_zones(sdkp, &lim, buffer); + sd_read_block_characteristics(sdkp, lim); + sd_zbc_read_zones(sdkp, lim, buffer); } - sd_config_discard(sdkp, &lim, sd_discard_mode(sdkp)); + sd_config_discard(sdkp, lim, sd_discard_mode(sdkp)); sd_print_capacity(sdkp, old_capacity); @@ -3761,47 +3762,46 @@ static int sd_revalidate_disk(struct gendisk *disk) sd_read_app_tag_own(sdkp, buffer); sd_read_write_same(sdkp, buffer); sd_read_security(sdkp, buffer); - sd_config_protection(sdkp, &lim); + sd_config_protection(sdkp, lim); } /* * We now have all cache related info, determine how we deal * with flush requests. */ - sd_set_flush_flag(sdkp, &lim); + sd_set_flush_flag(sdkp, lim); /* Initial block count limit based on CDB TRANSFER LENGTH field size. */ dev_max = sdp->use_16_for_rw ? SD_MAX_XFER_BLOCKS : SD_DEF_XFER_BLOCKS; /* Some devices report a maximum block count for READ/WRITE requests. */ dev_max = min_not_zero(dev_max, sdkp->max_xfer_blocks); - lim.max_dev_sectors = logical_to_sectors(sdp, dev_max); + lim->max_dev_sectors = logical_to_sectors(sdp, dev_max); if (sd_validate_min_xfer_size(sdkp)) - lim.io_min = logical_to_bytes(sdp, sdkp->min_xfer_blocks); + lim->io_min = logical_to_bytes(sdp, sdkp->min_xfer_blocks); else - lim.io_min = 0; + lim->io_min = 0; /* * Limit default to SCSI host optimal sector limit if set. There may be * an impact on performance for when the size of a request exceeds this * host limit. 
*/ - lim.io_opt = sdp->host->opt_sectors << SECTOR_SHIFT; + lim->io_opt = sdp->host->opt_sectors << SECTOR_SHIFT; if (sd_validate_opt_xfer_size(sdkp, dev_max)) { - lim.io_opt = min_not_zero(lim.io_opt, + lim->io_opt = min_not_zero(lim->io_opt, logical_to_bytes(sdp, sdkp->opt_xfer_blocks)); } sdkp->first_scan = 0; set_capacity_and_notify(disk, logical_to_sectors(sdp, sdkp->capacity)); - sd_config_write_same(sdkp, &lim); - kfree(buffer); + sd_config_write_same(sdkp, lim); - err = queue_limits_commit_update_frozen(sdkp->disk->queue, &lim); + err = queue_limits_commit_update_frozen(sdkp->disk->queue, lim); if (err) - return err; + goto out; /* * Query concurrent positioning ranges after @@ -3820,7 +3820,9 @@ static int sd_revalidate_disk(struct gendisk *disk) set_capacity_and_notify(disk, 0); out: - return 0; + kfree(buffer); + kfree(lim); + } /** diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c index 125944941601..03c97e60d36f 100644 --- a/drivers/scsi/smartpqi/smartpqi_init.c +++ b/drivers/scsi/smartpqi/smartpqi_init.c @@ -20,6 +20,7 @@ #include <linux/reboot.h> #include <linux/cciss_ioctl.h> #include <linux/crash_dump.h> +#include <linux/string.h> #include <scsi/scsi_host.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_device.h> @@ -6774,17 +6775,15 @@ static int pqi_passthru_ioctl(struct pqi_ctrl_info *ctrl_info, void __user *arg) } if (iocommand.buf_size > 0) { - kernel_buffer = kmalloc(iocommand.buf_size, GFP_KERNEL); - if (!kernel_buffer) - return -ENOMEM; if (iocommand.Request.Type.Direction & XFER_WRITE) { - if (copy_from_user(kernel_buffer, iocommand.buf, - iocommand.buf_size)) { - rc = -EFAULT; - goto out; - } + kernel_buffer = memdup_user(iocommand.buf, + iocommand.buf_size); + if (IS_ERR(kernel_buffer)) + return PTR_ERR(kernel_buffer); } else { - memset(kernel_buffer, 0, iocommand.buf_size); + kernel_buffer = kzalloc(iocommand.buf_size, GFP_KERNEL); + if (!kernel_buffer) + return -ENOMEM; } } diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index dc51ea352198..567f9cd29102 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1941,8 +1941,8 @@ static int storvsc_probe(struct hv_device *device, int num_present_cpus = num_present_cpus(); struct Scsi_Host *host; struct hv_host_device *host_dev; - bool dev_is_ide = ((dev_id->driver_data == IDE_GUID) ? true : false); - bool is_fc = ((dev_id->driver_data == SFC_GUID) ? 
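The sd_revalidate_disk() rework just concluded moves the sizeable struct queue_limits off the stack and funnels every exit through a single cleanup path, while the return type becomes void because the block layer ignores the result. A trimmed sketch of the allocate-work-free shape, with SD_BUF_SIZE taken from the driver and the probing body elided:

#include <linux/blkdev.h>
#include <linux/slab.h>

static void revalidate(struct gendisk *disk)
{
	struct queue_limits *lim;
	unsigned char *buffer;

	lim = kmalloc(sizeof(*lim), GFP_KERNEL);
	if (!lim)
		return;

	buffer = kmalloc(SD_BUF_SIZE, GFP_KERNEL);
	if (!buffer)
		goto out;

	/* ... probe the device and fill *lim ... */

out:
	kfree(buffer);	/* kfree(NULL) is a no-op, so one exit path suffices */
	kfree(lim);
}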
true : false); + bool dev_is_ide = dev_id->driver_data == IDE_GUID; + bool is_fc = dev_id->driver_data == SFC_GUID; int target = 0; struct storvsc_device *stor_device; int max_sub_channels = 0; diff --git a/drivers/target/iscsi/iscsi_target_configfs.c b/drivers/target/iscsi/iscsi_target_configfs.c index 88db94f382bb..efe8cdb20060 100644 --- a/drivers/target/iscsi/iscsi_target_configfs.c +++ b/drivers/target/iscsi/iscsi_target_configfs.c @@ -665,7 +665,7 @@ static ssize_t lio_target_nacl_cmdsn_depth_store(struct config_item *item, } acl_ci = &se_nacl->acl_group.cg_item; if (!acl_ci) { - pr_err("Unable to locatel acl_ci\n"); + pr_err("Unable to locate acl_ci\n"); return -EINVAL; } tpg_ci = &acl_ci->ci_parent->ci_group->cg_item; @@ -684,7 +684,7 @@ static ssize_t lio_target_nacl_cmdsn_depth_store(struct config_item *item, ret = core_tpg_set_initiator_node_queue_depth(se_nacl, cmdsn_depth); - pr_debug("LIO_Target_ConfigFS: %s/%s Set CmdSN Window: %u for" + pr_debug("LIO_Target_ConfigFS: %s/%s Set CmdSN Window: %u for " "InitiatorName: %s\n", config_item_name(wwn_ci), config_item_name(tpg_ci), cmdsn_depth, config_item_name(acl_ci)); @@ -1131,7 +1131,7 @@ static void lio_target_tiqn_deltpg(struct se_portal_group *se_tpg) /* End items for lio_target_tiqn_cit */ -/* Start LIO-Target TIQN struct contig_item lio_target_cit */ +/* Start LIO-Target TIQN struct config_item lio_target_cit */ static ssize_t lio_target_wwn_lio_version_show(struct config_item *item, char *page) diff --git a/drivers/target/iscsi/iscsi_target_tmr.c b/drivers/target/iscsi/iscsi_target_tmr.c index f60b156ede12..620de3910599 100644 --- a/drivers/target/iscsi/iscsi_target_tmr.c +++ b/drivers/target/iscsi/iscsi_target_tmr.c @@ -112,7 +112,8 @@ u8 iscsit_tmr_task_reassign( struct iscsi_tmr_req *tmr_req = cmd->tmr_req; struct se_tmr_req *se_tmr = cmd->se_cmd.se_tmr_req; struct iscsi_tm *hdr = (struct iscsi_tm *) buf; - u64 ret, ref_lun; + u64 ref_lun; + int ret; pr_debug("Got TASK_REASSIGN TMR ITT: 0x%08x," " RefTaskTag: 0x%08x, ExpDataSN: 0x%08x, CID: %hu\n", diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c index cc88aaa106da..c9bdd4140fd0 100644 --- a/drivers/ufs/core/ufs-mcq.c +++ b/drivers/ufs/core/ufs-mcq.c @@ -29,6 +29,10 @@ #define MCQ_ENTRY_SIZE_IN_DWORD 8 #define CQE_UCD_BA GENMASK_ULL(63, 7) +#define UFSHCD_ENABLE_MCQ_INTRS (UTP_TASK_REQ_COMPL |\ + UFSHCD_ERROR_MASK |\ + MCQ_CQ_EVENT_STATUS) + /* Max mcq register polling time in microseconds */ #define MCQ_POLL_US 500000 @@ -355,9 +359,16 @@ EXPORT_SYMBOL_GPL(ufshcd_mcq_poll_cqe_lock); void ufshcd_mcq_make_queues_operational(struct ufs_hba *hba) { struct ufs_hw_queue *hwq; + u32 intrs; u16 qsize; int i; + /* Enable required interrupts */ + intrs = UFSHCD_ENABLE_MCQ_INTRS; + if (hba->quirks & UFSHCD_QUIRK_MCQ_BROKEN_INTR) + intrs &= ~MCQ_CQ_EVENT_STATUS; + ufshcd_enable_intr(hba, intrs); + for (i = 0; i < hba->nr_hw_queues; i++) { hwq = &hba->uhq[i]; hwq->id = i; diff --git a/drivers/ufs/core/ufs-sysfs.c b/drivers/ufs/core/ufs-sysfs.c index 4bd7d491e3c5..0086816b27cd 100644 --- a/drivers/ufs/core/ufs-sysfs.c +++ b/drivers/ufs/core/ufs-sysfs.c @@ -512,6 +512,8 @@ static ssize_t pm_qos_enable_show(struct device *dev, { struct ufs_hba *hba = dev_get_drvdata(dev); + guard(mutex)(&hba->pm_qos_mutex); + return sysfs_emit(buf, "%d\n", hba->pm_qos_enabled); } diff --git a/drivers/ufs/core/ufs_trace.h b/drivers/ufs/core/ufs_trace.h index caa32e23ffa5..584c2b5c6ad9 100644 --- a/drivers/ufs/core/ufs_trace.h +++ b/drivers/ufs/core/ufs_trace.h @@ -11,6 +11,7 @@ 
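The ufs-sysfs hunk above relies on the cleanup.h scope guard: guard(mutex)(&hba->pm_qos_mutex) takes the lock and releases it automatically when the enclosing scope exits, removing unlock-on-every-return boilerplate. A hedged, self-contained sketch of the same idiom with made-up state:

#include <linux/cleanup.h>
#include <linux/mutex.h>
#include <linux/sysfs.h>

static DEFINE_MUTEX(state_lock);
static bool state_enabled;

static ssize_t state_show(char *buf)
{
	guard(mutex)(&state_lock);	/* unlocked automatically at return */

	return sysfs_emit(buf, "%d\n", state_enabled);
}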
#include <ufs/ufs.h> #include <linux/tracepoint.h> +#include "ufs_trace_types.h" #define str_opcode(opcode) \ __print_symbolic(opcode, \ diff --git a/drivers/ufs/core/ufs_trace_types.h b/drivers/ufs/core/ufs_trace_types.h new file mode 100644 index 000000000000..f2d5ad1d92b9 --- /dev/null +++ b/drivers/ufs/core/ufs_trace_types.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _UFS_TRACE_TYPES_H_ +#define _UFS_TRACE_TYPES_H_ + +enum ufs_trace_str_t { + UFS_CMD_SEND, + UFS_CMD_COMP, + UFS_DEV_COMP, + UFS_QUERY_SEND, + UFS_QUERY_COMP, + UFS_QUERY_ERR, + UFS_TM_SEND, + UFS_TM_COMP, + UFS_TM_ERR +}; + +enum ufs_trace_tsf_t { + UFS_TSF_CDB, + UFS_TSF_OSF, + UFS_TSF_TM_INPUT, + UFS_TSF_TM_OUTPUT +}; + +#endif /* _UFS_TRACE_TYPES_H_ */ diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 9a43102b2b21..d9632d7c5f01 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -45,11 +45,6 @@ UTP_TASK_REQ_COMPL |\ UFSHCD_ERROR_MASK) -#define UFSHCD_ENABLE_MCQ_INTRS (UTP_TASK_REQ_COMPL |\ - UFSHCD_ERROR_MASK |\ - MCQ_CQ_EVENT_STATUS) - - /* UIC command timeout, unit: ms */ enum { UIC_CMD_TIMEOUT_DEFAULT = 500, @@ -316,6 +311,9 @@ static const struct ufs_dev_quirk ufs_fixups[] = { { .wmanufacturerid = UFS_VENDOR_TOSHIBA, .model = "THGLF2G9D8KBADG", .quirk = UFS_DEVICE_QUIRK_PA_TACTIVATE }, + { .wmanufacturerid = UFS_VENDOR_TOSHIBA, + .model = "THGJFJT1E45BATP", + .quirk = UFS_DEVICE_QUIRK_NO_TIMESTAMP_SUPPORT }, {} }; @@ -369,7 +367,7 @@ EXPORT_SYMBOL_GPL(ufshcd_disable_irq); * @hba: per adapter instance * @intrs: interrupt bits */ -static void ufshcd_enable_intr(struct ufs_hba *hba, u32 intrs) +void ufshcd_enable_intr(struct ufs_hba *hba, u32 intrs) { u32 old_val = ufshcd_readl(hba, REG_INTERRUPT_ENABLE); u32 new_val = old_val | intrs; @@ -606,10 +604,12 @@ void ufshcd_print_tr(struct ufs_hba *hba, int tag, bool pr_prdt) lrbp = &hba->lrb[tag]; - dev_err(hba->dev, "UPIU[%d] - issue time %lld us\n", - tag, div_u64(lrbp->issue_time_stamp_local_clock, 1000)); - dev_err(hba->dev, "UPIU[%d] - complete time %lld us\n", - tag, div_u64(lrbp->compl_time_stamp_local_clock, 1000)); + if (hba->monitor.enabled) { + dev_err(hba->dev, "UPIU[%d] - issue time %lld us\n", tag, + div_u64(lrbp->issue_time_stamp_local_clock, 1000)); + dev_err(hba->dev, "UPIU[%d] - complete time %lld us\n", tag, + div_u64(lrbp->compl_time_stamp_local_clock, 1000)); + } dev_err(hba->dev, "UPIU[%d] - Transfer Request Descriptor phys@0x%llx\n", tag, (u64)lrbp->utrd_dma_addr); @@ -1045,6 +1045,7 @@ EXPORT_SYMBOL_GPL(ufshcd_is_hba_active); */ void ufshcd_pm_qos_init(struct ufs_hba *hba) { + guard(mutex)(&hba->pm_qos_mutex); if (hba->pm_qos_enabled) return; @@ -1061,6 +1062,8 @@ void ufshcd_pm_qos_init(struct ufs_hba *hba) */ void ufshcd_pm_qos_exit(struct ufs_hba *hba) { + guard(mutex)(&hba->pm_qos_mutex); + if (!hba->pm_qos_enabled) return; @@ -1075,6 +1078,8 @@ void ufshcd_pm_qos_exit(struct ufs_hba *hba) */ static void ufshcd_pm_qos_update(struct ufs_hba *hba, bool on) { + guard(mutex)(&hba->pm_qos_mutex); + if (!hba->pm_qos_enabled) return; @@ -2230,11 +2235,13 @@ static void ufshcd_exit_clk_gating(struct ufs_hba *hba) static void ufshcd_clk_scaling_start_busy(struct ufs_hba *hba) { bool queue_resume_work = false; - ktime_t curr_t = ktime_get(); + ktime_t curr_t; if (!ufshcd_is_clkscaling_supported(hba)) return; + curr_t = ktime_get(); + guard(spinlock_irqsave)(&hba->clk_scaling.lock); if (!hba->clk_scaling.active_reqs++) @@ -2354,10 +2361,12 @@ void ufshcd_send_command(struct ufs_hba 
*hba, unsigned int task_tag, struct ufshcd_lrb *lrbp = &hba->lrb[task_tag]; unsigned long flags; - lrbp->issue_time_stamp = ktime_get(); - lrbp->issue_time_stamp_local_clock = local_clock(); - lrbp->compl_time_stamp = ktime_set(0, 0); - lrbp->compl_time_stamp_local_clock = 0; + if (hba->monitor.enabled) { + lrbp->issue_time_stamp = ktime_get(); + lrbp->issue_time_stamp_local_clock = local_clock(); + lrbp->compl_time_stamp = ktime_set(0, 0); + lrbp->compl_time_stamp_local_clock = 0; + } ufshcd_add_command_trace(hba, task_tag, UFS_CMD_SEND); if (lrbp->cmd) ufshcd_clk_scaling_start_busy(hba); @@ -5622,8 +5631,10 @@ void ufshcd_compl_one_cqe(struct ufs_hba *hba, int task_tag, enum utp_ocs ocs; lrbp = &hba->lrb[task_tag]; - lrbp->compl_time_stamp = ktime_get(); - lrbp->compl_time_stamp_local_clock = local_clock(); + if (hba->monitor.enabled) { + lrbp->compl_time_stamp = ktime_get(); + lrbp->compl_time_stamp_local_clock = local_clock(); + } cmd = lrbp->cmd; if (cmd) { if (unlikely(ufshcd_should_inform_monitor(hba, lrbp))) @@ -6457,13 +6468,14 @@ void ufshcd_schedule_eh_work(struct ufs_hba *hba) } } -static void ufshcd_force_error_recovery(struct ufs_hba *hba) +void ufshcd_force_error_recovery(struct ufs_hba *hba) { spin_lock_irq(hba->host->host_lock); hba->force_reset = true; ufshcd_schedule_eh_work(hba); spin_unlock_irq(hba->host->host_lock); } +EXPORT_SYMBOL_GPL(ufshcd_force_error_recovery); static void ufshcd_clk_scaling_allow(struct ufs_hba *hba, bool allow) { @@ -8786,7 +8798,8 @@ static void ufshcd_set_timestamp_attr(struct ufs_hba *hba) struct ufs_dev_info *dev_info = &hba->dev_info; struct utp_upiu_query_v4_0 *upiu_data; - if (dev_info->wspecversion < 0x400) + if (dev_info->wspecversion < 0x400 || + hba->dev_quirks & UFS_DEVICE_QUIRK_NO_TIMESTAMP_SUPPORT) return; ufshcd_dev_man_lock(hba); @@ -8913,16 +8926,11 @@ err: static void ufshcd_config_mcq(struct ufs_hba *hba) { int ret; - u32 intrs; ret = ufshcd_mcq_vops_config_esi(hba); hba->mcq_esi_enabled = !ret; dev_info(hba->dev, "ESI %sconfigured\n", ret ? 
"is not " : ""); - intrs = UFSHCD_ENABLE_MCQ_INTRS; - if (hba->quirks & UFSHCD_QUIRK_MCQ_BROKEN_INTR) - intrs &= ~MCQ_CQ_EVENT_STATUS; - ufshcd_enable_intr(hba, intrs); ufshcd_mcq_make_queues_operational(hba); ufshcd_mcq_config_mac(hba, hba->nutrs); @@ -10756,6 +10764,10 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) mutex_init(&hba->ee_ctrl_mutex); mutex_init(&hba->wb_mutex); + + /* Initialize mutex for PM QoS request synchronization */ + mutex_init(&hba->pm_qos_mutex); + init_rwsem(&hba->clk_scaling_lock); ufshcd_init_clk_gating(hba); diff --git a/drivers/ufs/host/ufs-exynos.c b/drivers/ufs/host/ufs-exynos.c index f0adcd9dd553..70d195179eba 100644 --- a/drivers/ufs/host/ufs-exynos.c +++ b/drivers/ufs/host/ufs-exynos.c @@ -776,7 +776,7 @@ static void exynos_ufs_config_sync_pattern_mask(struct exynos_ufs *ufs, u32 mask, sync_len; enum { SYNC_LEN_G1 = 80 * 1000, /* 80us */ - SYNC_LEN_G2 = 40 * 1000, /* 44us */ + SYNC_LEN_G2 = 40 * 1000, /* 40us */ SYNC_LEN_G3 = 20 * 1000, /* 20us */ }; int i; @@ -1896,6 +1896,13 @@ static int fsd_ufs_pre_pwr_change(struct exynos_ufs *ufs, return 0; } +static int fsd_ufs_suspend(struct exynos_ufs *ufs) +{ + exynos_ufs_gate_clks(ufs); + hci_writel(ufs, 0, HCI_GPIO_OUT); + return 0; +} + static inline u32 get_mclk_period_unipro_18(struct exynos_ufs *ufs) { return (16 * 1000 * 1000000UL / ufs->mclk_rate); @@ -2162,6 +2169,7 @@ static const struct exynos_ufs_drv_data fsd_ufs_drvs = { .pre_link = fsd_ufs_pre_link, .post_link = fsd_ufs_post_link, .pre_pwr_change = fsd_ufs_pre_pwr_change, + .suspend = fsd_ufs_suspend, }; static const struct exynos_ufs_drv_data gs101_ufs_drvs = { diff --git a/drivers/ufs/host/ufs-mediatek.c b/drivers/ufs/host/ufs-mediatek.c index f902ce08c95a..758a393a9de1 100644 --- a/drivers/ufs/host/ufs-mediatek.c +++ b/drivers/ufs/host/ufs-mediatek.c @@ -29,6 +29,7 @@ #include "ufs-mediatek-sip.h" static int ufs_mtk_config_mcq(struct ufs_hba *hba, bool irq); +static void _ufs_mtk_clk_scale(struct ufs_hba *hba, bool scale_up); #define CREATE_TRACE_POINTS #include "ufs-mediatek-trace.h" @@ -415,7 +416,7 @@ static void ufs_mtk_dbg_sel(struct ufs_hba *hba) } } -static void ufs_mtk_wait_idle_state(struct ufs_hba *hba, +static int ufs_mtk_wait_idle_state(struct ufs_hba *hba, unsigned long retry_ms) { u64 timeout, time_checked; @@ -451,8 +452,12 @@ static void ufs_mtk_wait_idle_state(struct ufs_hba *hba, break; } while (time_checked < timeout); - if (wait_idle && sm != VS_HCE_BASE) + if (wait_idle && sm != VS_HCE_BASE) { dev_info(hba->dev, "wait idle tmo: 0x%x\n", val); + return -ETIMEDOUT; + } + + return 0; } static int ufs_mtk_wait_link_state(struct ufs_hba *hba, u32 state, @@ -798,8 +803,14 @@ static int ufs_mtk_setup_clocks(struct ufs_hba *hba, bool on, clk_pwr_off = true; } - if (clk_pwr_off) + if (clk_pwr_off) { ufs_mtk_pwr_ctrl(hba, false); + } else { + dev_warn(hba->dev, "Clock is not turned off, hba->ahit = 0x%x, AHIT = 0x%x\n", + hba->ahit, + ufshcd_readl(hba, + REG_AUTO_HIBERNATE_IDLE_TIMER)); + } ufs_mtk_mcq_disable_irq(hba); } else if (on && status == POST_CHANGE) { ufs_mtk_pwr_ctrl(hba, true); @@ -1018,7 +1029,7 @@ static int ufs_mtk_vreg_fix_vcc(struct ufs_hba *hba) struct arm_smccc_res res; int err, ver; - if (hba->vreg_info.vcc) + if (info->vcc) return 0; if (of_property_read_bool(np, "mediatek,ufs-vcc-by-num")) { @@ -1075,6 +1086,80 @@ static void ufs_mtk_vreg_fix_vccqx(struct ufs_hba *hba) } } +static void ufs_mtk_setup_clk_gating(struct ufs_hba *hba) +{ + unsigned long flags; + u32 ah_ms = 10; + 
u32 ah_scale, ah_timer; + u32 scale_us[] = {1, 10, 100, 1000, 10000, 100000}; + + if (ufshcd_is_clkgating_allowed(hba)) { + if (ufshcd_is_auto_hibern8_supported(hba) && hba->ahit) { + ah_scale = FIELD_GET(UFSHCI_AHIBERN8_SCALE_MASK, + hba->ahit); + ah_timer = FIELD_GET(UFSHCI_AHIBERN8_TIMER_MASK, + hba->ahit); + if (ah_scale <= 5) + ah_ms = ah_timer * scale_us[ah_scale] / 1000; + } + + spin_lock_irqsave(hba->host->host_lock, flags); + hba->clk_gating.delay_ms = max(ah_ms, 10U); + spin_unlock_irqrestore(hba->host->host_lock, flags); + } +} + +/* Convert microseconds to Auto-Hibernate Idle Timer register value */ +static u32 ufs_mtk_us_to_ahit(unsigned int timer) +{ + unsigned int scale; + + for (scale = 0; timer > UFSHCI_AHIBERN8_TIMER_MASK; ++scale) + timer /= UFSHCI_AHIBERN8_SCALE_FACTOR; + + return FIELD_PREP(UFSHCI_AHIBERN8_TIMER_MASK, timer) | + FIELD_PREP(UFSHCI_AHIBERN8_SCALE_MASK, scale); +} + +static void ufs_mtk_fix_ahit(struct ufs_hba *hba) +{ + unsigned int us; + + if (ufshcd_is_auto_hibern8_supported(hba)) { + switch (hba->dev_info.wmanufacturerid) { + case UFS_VENDOR_SAMSUNG: + /* configure auto-hibern8 timer to 3.5 ms */ + us = 3500; + break; + + case UFS_VENDOR_MICRON: + /* configure auto-hibern8 timer to 2 ms */ + us = 2000; + break; + + default: + /* configure auto-hibern8 timer to 1 ms */ + us = 1000; + break; + } + + hba->ahit = ufs_mtk_us_to_ahit(us); + } + + ufs_mtk_setup_clk_gating(hba); +} + +static void ufs_mtk_fix_clock_scaling(struct ufs_hba *hba) +{ + /* UFS version is below 4.0, clock scaling is not necessary */ + if ((hba->dev_info.wspecversion < 0x0400) && + ufs_mtk_is_clk_scale_ready(hba)) { + hba->caps &= ~UFSHCD_CAP_CLK_SCALING; + + _ufs_mtk_clk_scale(hba, false); + } +} + static void ufs_mtk_init_mcq_irq(struct ufs_hba *hba) { struct ufs_mtk_host *host = ufshcd_get_variant(hba); @@ -1240,6 +1325,10 @@ static bool ufs_mtk_pmc_via_fastauto(struct ufs_hba *hba, dev_req_params->gear_rx < UFS_HS_G4) return false; + if (dev_req_params->pwr_tx == SLOW_MODE || + dev_req_params->pwr_rx == SLOW_MODE) + return false; + return true; } @@ -1255,6 +1344,10 @@ static int ufs_mtk_pre_pwr_change(struct ufs_hba *hba, host_params.hs_rx_gear = UFS_HS_G5; host_params.hs_tx_gear = UFS_HS_G5; + if (dev_max_params->pwr_rx == SLOW_MODE || + dev_max_params->pwr_tx == SLOW_MODE) + host_params.desired_working_mode = UFS_PWM_MODE; + ret = ufshcd_negotiate_pwr_params(&host_params, dev_max_params, dev_req_params); if (ret) { pr_info("%s: failed to determine capabilities\n", @@ -1278,6 +1371,28 @@ static int ufs_mtk_pre_pwr_change(struct ufs_hba *hba, ufshcd_dme_set(hba, UIC_ARG_MIB(PA_TXHSADAPTTYPE), PA_NO_ADAPT); + if (!(hba->quirks & UFSHCD_QUIRK_SKIP_DEF_UNIPRO_TIMEOUT_SETTING)) { + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA0), + DL_FC0ProtectionTimeOutVal_Default); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA1), + DL_TC0ReplayTimeOutVal_Default); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA2), + DL_AFC0ReqTimeOutVal_Default); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA3), + DL_FC1ProtectionTimeOutVal_Default); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA4), + DL_TC1ReplayTimeOutVal_Default); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA5), + DL_AFC1ReqTimeOutVal_Default); + + ufshcd_dme_set(hba, UIC_ARG_MIB(DME_LocalFC0ProtectionTimeOutVal), + DL_FC0ProtectionTimeOutVal_Default); + ufshcd_dme_set(hba, UIC_ARG_MIB(DME_LocalTC0ReplayTimeOutVal), + DL_TC0ReplayTimeOutVal_Default); + ufshcd_dme_set(hba, 
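ufs_mtk_us_to_ahit() in the hunk above encodes a microsecond value into the Auto-Hibernate Idle Timer register by dividing by the scale factor until the timer fits its field; assuming the usual 10-bit timer field and a scale factor of 10, 3500 us becomes timer = 350, scale = 1 (units of 10 us). A standalone restatement of that loop, with the masks assumed to come from ufshci.h:

#include <linux/bitfield.h>
#include <ufs/ufshci.h>

/* Encode an idle time in microseconds into the AHIT register value. */
static u32 us_to_ahit(unsigned int timer)
{
	unsigned int scale;

	for (scale = 0; timer > UFSHCI_AHIBERN8_TIMER_MASK; ++scale)
		timer /= UFSHCI_AHIBERN8_SCALE_FACTOR;

	return FIELD_PREP(UFSHCI_AHIBERN8_TIMER_MASK, timer) |
	       FIELD_PREP(UFSHCI_AHIBERN8_SCALE_MASK, scale);
}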
UIC_ARG_MIB(DME_LocalAFC0ReqTimeOutVal), + DL_AFC0ReqTimeOutVal_Default); + } + ret = ufshcd_uic_change_pwr_mode(hba, FASTAUTO_MODE << 4 | FASTAUTO_MODE); @@ -1287,10 +1402,59 @@ static int ufs_mtk_pre_pwr_change(struct ufs_hba *hba, } } - if (host->hw_ver.major >= 3) { + /* if already configured to the requested pwr_mode, skip adapt */ + if (dev_req_params->gear_rx == hba->pwr_info.gear_rx && + dev_req_params->gear_tx == hba->pwr_info.gear_tx && + dev_req_params->lane_rx == hba->pwr_info.lane_rx && + dev_req_params->lane_tx == hba->pwr_info.lane_tx && + dev_req_params->pwr_rx == hba->pwr_info.pwr_rx && + dev_req_params->pwr_tx == hba->pwr_info.pwr_tx && + dev_req_params->hs_rate == hba->pwr_info.hs_rate) { + return ret; + } + + if (dev_req_params->pwr_rx == FAST_MODE || + dev_req_params->pwr_rx == FASTAUTO_MODE) { + if (host->hw_ver.major >= 3) { + ret = ufshcd_dme_configure_adapt(hba, + dev_req_params->gear_tx, + PA_INITIAL_ADAPT); + } else { + ret = ufshcd_dme_configure_adapt(hba, + dev_req_params->gear_tx, + PA_NO_ADAPT); + } + } else { ret = ufshcd_dme_configure_adapt(hba, - dev_req_params->gear_tx, - PA_INITIAL_ADAPT); + dev_req_params->gear_tx, + PA_NO_ADAPT); + } + + return ret; +} + +static int ufs_mtk_auto_hibern8_disable(struct ufs_hba *hba) +{ + int ret; + + /* disable auto-hibern8 */ + ufshcd_writel(hba, 0, REG_AUTO_HIBERNATE_IDLE_TIMER); + + /* wait host return to idle state when auto-hibern8 off */ + ret = ufs_mtk_wait_idle_state(hba, 5); + if (ret) + goto out; + + ret = ufs_mtk_wait_link_state(hba, VS_LINK_UP, 100); + +out: + if (ret) { + dev_warn(hba->dev, "exit h8 state fail, ret=%d\n", ret); + + ufshcd_force_error_recovery(hba); + + /* trigger error handler and break suspend */ + ret = -EBUSY; } return ret; @@ -1302,13 +1466,20 @@ static int ufs_mtk_pwr_change_notify(struct ufs_hba *hba, struct ufs_pa_layer_attr *dev_req_params) { int ret = 0; + static u32 reg; switch (stage) { case PRE_CHANGE: + if (ufshcd_is_auto_hibern8_supported(hba)) { + reg = ufshcd_readl(hba, REG_AUTO_HIBERNATE_IDLE_TIMER); + ufs_mtk_auto_hibern8_disable(hba); + } ret = ufs_mtk_pre_pwr_change(hba, dev_max_params, dev_req_params); break; case POST_CHANGE: + if (ufshcd_is_auto_hibern8_supported(hba)) + ufshcd_writel(hba, reg, REG_AUTO_HIBERNATE_IDLE_TIMER); break; default: ret = -EINVAL; @@ -1342,6 +1513,7 @@ static int ufs_mtk_pre_link(struct ufs_hba *hba) { int ret; u32 tmp; + struct ufs_mtk_host *host = ufshcd_get_variant(hba); ufs_mtk_get_controller_version(hba); @@ -1367,34 +1539,33 @@ static int ufs_mtk_pre_link(struct ufs_hba *hba) ret = ufshcd_dme_set(hba, UIC_ARG_MIB(VS_SAVEPOWERCONTROL), tmp); + /* Enable the 1144 functions setting */ + if (host->ip_ver == IP_VER_MT6989) { + ret = ufshcd_dme_get(hba, UIC_ARG_MIB(VS_DEBUGOMC), &tmp); + if (ret) + return ret; + + tmp |= 0x10; + ret = ufshcd_dme_set(hba, UIC_ARG_MIB(VS_DEBUGOMC), tmp); + } + return ret; } -static void ufs_mtk_setup_clk_gating(struct ufs_hba *hba) +static void ufs_mtk_post_link(struct ufs_hba *hba) { - u32 ah_ms; + struct ufs_mtk_host *host = ufshcd_get_variant(hba); + u32 tmp; - if (ufshcd_is_clkgating_allowed(hba)) { - if (ufshcd_is_auto_hibern8_supported(hba) && hba->ahit) - ah_ms = FIELD_GET(UFSHCI_AHIBERN8_TIMER_MASK, - hba->ahit); - else - ah_ms = 10; - ufshcd_clkgate_delay_set(hba->dev, ah_ms + 5); + /* fix device PA_INIT no adapt */ + if (host->ip_ver >= IP_VER_MT6899) { + ufshcd_dme_get(hba, UIC_ARG_MIB(VS_DEBUGOMC), &tmp); + tmp |= 0x100; + ufshcd_dme_set(hba, UIC_ARG_MIB(VS_DEBUGOMC), tmp); } -} -static void 
ufs_mtk_post_link(struct ufs_hba *hba) -{ /* enable unipro clock gating feature */ ufs_mtk_cfg_unipro_cg(hba, true); - - /* will be configured during probe hba */ - if (ufshcd_is_auto_hibern8_supported(hba)) - hba->ahit = FIELD_PREP(UFSHCI_AHIBERN8_TIMER_MASK, 10) | - FIELD_PREP(UFSHCI_AHIBERN8_SCALE_MASK, 3); - - ufs_mtk_setup_clk_gating(hba); } static int ufs_mtk_link_startup_notify(struct ufs_hba *hba, @@ -1421,11 +1592,11 @@ static int ufs_mtk_device_reset(struct ufs_hba *hba) { struct arm_smccc_res res; - /* disable hba before device reset */ - ufshcd_hba_stop(hba); - ufs_mtk_device_reset_ctrl(0, res); + /* disable hba in middle of device reset */ + ufshcd_hba_stop(hba); + /* * The reset signal is active low. UFS devices shall detect * more than or equal to 1us of positive or negative RST_n @@ -1462,7 +1633,11 @@ static int ufs_mtk_link_set_hpm(struct ufs_hba *hba) return err; /* Check link state to make sure exit h8 success */ - ufs_mtk_wait_idle_state(hba, 5); + err = ufs_mtk_wait_idle_state(hba, 5); + if (err) { + dev_warn(hba->dev, "wait idle fail, err=%d\n", err); + return err; + } err = ufs_mtk_wait_link_state(hba, VS_LINK_UP, 100); if (err) { dev_warn(hba->dev, "exit h8 state fail, err=%d\n", err); @@ -1507,6 +1682,9 @@ static void ufs_mtk_vccqx_set_lpm(struct ufs_hba *hba, bool lpm) { struct ufs_vreg *vccqx = NULL; + if (!hba->vreg_info.vccq && !hba->vreg_info.vccq2) + return; + if (hba->vreg_info.vccq) vccqx = hba->vreg_info.vccq; else @@ -1561,21 +1739,6 @@ static void ufs_mtk_dev_vreg_set_lpm(struct ufs_hba *hba, bool lpm) } } -static void ufs_mtk_auto_hibern8_disable(struct ufs_hba *hba) -{ - int ret; - - /* disable auto-hibern8 */ - ufshcd_writel(hba, 0, REG_AUTO_HIBERNATE_IDLE_TIMER); - - /* wait host return to idle state when auto-hibern8 off */ - ufs_mtk_wait_idle_state(hba, 5); - - ret = ufs_mtk_wait_link_state(hba, VS_LINK_UP, 100); - if (ret) - dev_warn(hba->dev, "exit h8 state fail, ret=%d\n", ret); -} - static int ufs_mtk_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op, enum ufs_notify_change_status status) { @@ -1584,7 +1747,7 @@ static int ufs_mtk_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op, if (status == PRE_CHANGE) { if (ufshcd_is_auto_hibern8_supported(hba)) - ufs_mtk_auto_hibern8_disable(hba); + return ufs_mtk_auto_hibern8_disable(hba); return 0; } @@ -1642,8 +1805,21 @@ static int ufs_mtk_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op) } return 0; + fail: - return ufshcd_link_recovery(hba); + /* + * Check if the platform (parent) device has resumed, and ensure that + * power, clock, and MTCMOS are all turned on. + */ + err = ufshcd_link_recovery(hba); + if (err) { + dev_err(hba->dev, "Device PM: req=%d, status:%d, err:%d\n", + hba->dev->power.request, + hba->dev->power.runtime_status, + hba->dev->power.runtime_error); + } + + return 0; /* Cannot return a failure, otherwise, the I/O will hang. 
*/ } static void ufs_mtk_dbg_register_dump(struct ufs_hba *hba) @@ -1726,6 +1902,8 @@ static void ufs_mtk_fixup_dev_quirks(struct ufs_hba *hba) ufs_mtk_vreg_fix_vcc(hba); ufs_mtk_vreg_fix_vccqx(hba); + ufs_mtk_fix_ahit(hba); + ufs_mtk_fix_clock_scaling(hba); } static void ufs_mtk_event_notify(struct ufs_hba *hba, @@ -2012,6 +2190,7 @@ static int ufs_mtk_config_mcq_irq(struct ufs_hba *hba) return ret; } } + host->is_mcq_intr_enabled = true; return 0; } @@ -2095,10 +2274,12 @@ static const struct ufs_hba_variant_ops ufs_hba_mtk_vops = { static int ufs_mtk_probe(struct platform_device *pdev) { int err; - struct device *dev = &pdev->dev; - struct device_node *reset_node; - struct platform_device *reset_pdev; + struct device *dev = &pdev->dev, *phy_dev = NULL; + struct device_node *reset_node, *phy_node = NULL; + struct platform_device *reset_pdev, *phy_pdev = NULL; struct device_link *link; + struct ufs_hba *hba; + struct ufs_mtk_host *host; reset_node = of_find_compatible_node(NULL, NULL, "ti,syscon-reset"); @@ -2125,13 +2306,51 @@ static int ufs_mtk_probe(struct platform_device *pdev) } skip_reset: + /* find phy node */ + phy_node = of_parse_phandle(dev->of_node, "phys", 0); + + if (phy_node) { + phy_pdev = of_find_device_by_node(phy_node); + if (!phy_pdev) + goto skip_phy; + phy_dev = &phy_pdev->dev; + + pm_runtime_set_active(phy_dev); + pm_runtime_enable(phy_dev); + pm_runtime_get_sync(phy_dev); + + put_device(phy_dev); + dev_info(dev, "phys node found\n"); + } else { + dev_notice(dev, "phys node not found\n"); + } + +skip_phy: /* perform generic probe */ err = ufshcd_pltfrm_init(pdev, &ufs_hba_mtk_vops); - -out: - if (err) + if (err) { dev_err(dev, "probe failed %d\n", err); + goto out; + } + + hba = platform_get_drvdata(pdev); + if (!hba) + goto out; + + if (phy_node && phy_dev) { + host = ufshcd_get_variant(hba); + host->phy_dev = phy_dev; + } + + /* + * Because the default power setting of VSx (the upper layer of + * VCCQ/VCCQ2) is HWLP, we need to prevent VCCQ/VCCQ2 from + * entering LPM. 
+ */ + ufs_mtk_dev_vreg_set_lpm(hba, false); +out: + of_node_put(phy_node); of_node_put(reset_node); return err; } @@ -2156,27 +2375,38 @@ static int ufs_mtk_system_suspend(struct device *dev) ret = ufshcd_system_suspend(dev); if (ret) - return ret; + goto out; + + if (pm_runtime_suspended(hba->dev)) + goto out; ufs_mtk_dev_vreg_set_lpm(hba, true); if (ufs_mtk_is_rtff_mtcmos(hba)) ufs_mtk_mtcmos_ctrl(false, res); - return 0; +out: + return ret; } static int ufs_mtk_system_resume(struct device *dev) { + int ret = 0; struct ufs_hba *hba = dev_get_drvdata(dev); struct arm_smccc_res res; - ufs_mtk_dev_vreg_set_lpm(hba, false); + if (pm_runtime_suspended(hba->dev)) + goto out; if (ufs_mtk_is_rtff_mtcmos(hba)) ufs_mtk_mtcmos_ctrl(true, res); - return ufshcd_system_resume(dev); + ufs_mtk_dev_vreg_set_lpm(hba, false); + +out: + ret = ufshcd_system_resume(dev); + + return ret; } #endif @@ -2184,6 +2414,7 @@ static int ufs_mtk_system_resume(struct device *dev) static int ufs_mtk_runtime_suspend(struct device *dev) { struct ufs_hba *hba = dev_get_drvdata(dev); + struct ufs_mtk_host *host = ufshcd_get_variant(hba); struct arm_smccc_res res; int ret = 0; @@ -2196,17 +2427,24 @@ static int ufs_mtk_runtime_suspend(struct device *dev) if (ufs_mtk_is_rtff_mtcmos(hba)) ufs_mtk_mtcmos_ctrl(false, res); + if (host->phy_dev) + pm_runtime_put_sync(host->phy_dev); + return 0; } static int ufs_mtk_runtime_resume(struct device *dev) { struct ufs_hba *hba = dev_get_drvdata(dev); + struct ufs_mtk_host *host = ufshcd_get_variant(hba); struct arm_smccc_res res; if (ufs_mtk_is_rtff_mtcmos(hba)) ufs_mtk_mtcmos_ctrl(true, res); + if (host->phy_dev) + pm_runtime_get_sync(host->phy_dev); + ufs_mtk_dev_vreg_set_lpm(hba, false); return ufshcd_runtime_resume(dev); diff --git a/drivers/ufs/host/ufs-mediatek.h b/drivers/ufs/host/ufs-mediatek.h index e46dc5fa209d..dfbf78bd8664 100644 --- a/drivers/ufs/host/ufs-mediatek.h +++ b/drivers/ufs/host/ufs-mediatek.h @@ -193,6 +193,7 @@ struct ufs_mtk_host { bool is_mcq_intr_enabled; int mcq_nr_intr; struct ufs_mtk_mcq_intr_info mcq_intr_info[UFSHCD_MAX_Q_NR]; + struct device *phy_dev; }; /* MTK delay of autosuspend: 500 ms */ diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c index 9574fdc2bb0f..3e83dc51d538 100644 --- a/drivers/ufs/host/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -38,6 +38,9 @@ #define DEEMPHASIS_3_5_dB 0x04 #define NO_DEEMPHASIS 0x0 +#define UFS_ICE_SYNC_RST_SEL BIT(3) +#define UFS_ICE_SYNC_RST_SW BIT(4) + enum { TSTBUS_UAWM, TSTBUS_UARM, @@ -494,12 +497,8 @@ static int ufs_qcom_power_up_sequence(struct ufs_hba *hba) * If the HS-G5 PHY gear is used, update host_params->hs_rate to Rate-A, * so that the subsequent power mode change shall stick to Rate-A. */ - if (host->hw_ver.major == 0x5) { - if (host->phy_gear == UFS_HS_G5) - host_params->hs_rate = PA_HS_MODE_A; - else - host_params->hs_rate = PA_HS_MODE_B; - } + if (host->hw_ver.major == 0x5 && host->phy_gear == UFS_HS_G5) + host_params->hs_rate = PA_HS_MODE_A; mode = host_params->hs_rate == PA_HS_MODE_B ? 
PHY_MODE_UFS_HS_B : PHY_MODE_UFS_HS_A; @@ -751,11 +750,29 @@ static int ufs_qcom_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op) { struct ufs_qcom_host *host = ufshcd_get_variant(hba); int err; + u32 reg_val; err = ufs_qcom_enable_lane_clks(host); if (err) return err; + if ((!ufs_qcom_is_link_active(hba)) && + host->hw_ver.major == 5 && + host->hw_ver.minor == 0 && + host->hw_ver.step == 0) { + ufshcd_writel(hba, UFS_ICE_SYNC_RST_SEL | UFS_ICE_SYNC_RST_SW, UFS_MEM_ICE_CFG); + reg_val = ufshcd_readl(hba, UFS_MEM_ICE_CFG); + reg_val &= ~(UFS_ICE_SYNC_RST_SEL | UFS_ICE_SYNC_RST_SW); + /* + * HW documentation doesn't recommend any delay between the + * reset set and clear. But we are enforcing an arbitrary delay + * to give flops enough time to settle in. + */ + usleep_range(50, 100); + ufshcd_writel(hba, reg_val, UFS_MEM_ICE_CFG); + ufshcd_readl(hba, UFS_MEM_ICE_CFG); + } + return ufs_qcom_ice_resume(host); } @@ -1096,6 +1113,18 @@ static void ufs_qcom_set_phy_gear(struct ufs_qcom_host *host) } } +static void ufs_qcom_parse_gear_limits(struct ufs_hba *hba) +{ + struct ufs_qcom_host *host = ufshcd_get_variant(hba); + struct ufs_host_params *host_params = &host->host_params; + u32 hs_gear_old = host_params->hs_tx_gear; + + ufshcd_parse_gear_limits(hba, host_params); + if (host_params->hs_tx_gear != hs_gear_old) { + host->phy_gear = host_params->hs_tx_gear; + } +} + static void ufs_qcom_set_host_params(struct ufs_hba *hba) { struct ufs_qcom_host *host = ufshcd_get_variant(hba); @@ -1162,6 +1191,13 @@ static int ufs_qcom_setup_clocks(struct ufs_hba *hba, bool on, case PRE_CHANGE: if (on) { ufs_qcom_icc_update_bw(host); + if (ufs_qcom_is_link_hibern8(hba)) { + err = ufs_qcom_enable_lane_clks(host); + if (err) { + dev_err(hba->dev, "enable lane clks failed, ret=%d\n", err); + return err; + } + } } else { if (!ufs_qcom_is_link_active(hba)) { /* disable device ref_clk */ @@ -1187,6 +1223,9 @@ static int ufs_qcom_setup_clocks(struct ufs_hba *hba, bool on, if (ufshcd_is_hs_mode(&hba->pwr_info)) ufs_qcom_dev_ref_clk_ctrl(host, true); } else { + if (ufs_qcom_is_link_hibern8(hba)) + ufs_qcom_disable_lane_clks(host); + ufs_qcom_icc_set_bw(host, ufs_qcom_bw_table[MODE_MIN][0][0].mem_bw, ufs_qcom_bw_table[MODE_MIN][0][0].cfg_bw); } @@ -1337,6 +1376,7 @@ static int ufs_qcom_init(struct ufs_hba *hba) ufs_qcom_advertise_quirks(hba); ufs_qcom_set_host_params(hba); ufs_qcom_set_phy_gear(host); + ufs_qcom_parse_gear_limits(hba); err = ufs_qcom_ice_init(host); if (err) @@ -1742,7 +1782,7 @@ static void ufs_qcom_dump_testbus(struct ufs_hba *hba) } static int ufs_qcom_dump_regs(struct ufs_hba *hba, size_t offset, size_t len, - const char *prefix, enum ufshcd_res id) + const char *prefix, void __iomem *base) { u32 *regs __free(kfree) = NULL; size_t pos; @@ -1755,7 +1795,7 @@ static int ufs_qcom_dump_regs(struct ufs_hba *hba, size_t offset, size_t len, return -ENOMEM; for (pos = 0; pos < len; pos += 4) - regs[pos / 4] = readl(hba->res[id].base + offset + pos); + regs[pos / 4] = readl(base + offset + pos); print_hex_dump(KERN_ERR, prefix, len > 4 ? 
DUMP_PREFIX_OFFSET : DUMP_PREFIX_NONE, @@ -1766,30 +1806,34 @@ static int ufs_qcom_dump_regs(struct ufs_hba *hba, size_t offset, size_t len, static void ufs_qcom_dump_mcq_hci_regs(struct ufs_hba *hba) { + struct ufshcd_mcq_opr_info_t *opr = &hba->mcq_opr[0]; + void __iomem *mcq_vs_base = hba->mcq_base + UFS_MEM_VS_BASE; + struct dump_info { + void __iomem *base; size_t offset; size_t len; const char *prefix; - enum ufshcd_res id; }; struct dump_info mcq_dumps[] = { - {0x0, 256 * 4, "MCQ HCI-0 ", RES_MCQ}, - {0x400, 256 * 4, "MCQ HCI-1 ", RES_MCQ}, - {0x0, 5 * 4, "MCQ VS-0 ", RES_MCQ_VS}, - {0x0, 256 * 4, "MCQ SQD-0 ", RES_MCQ_SQD}, - {0x400, 256 * 4, "MCQ SQD-1 ", RES_MCQ_SQD}, - {0x800, 256 * 4, "MCQ SQD-2 ", RES_MCQ_SQD}, - {0xc00, 256 * 4, "MCQ SQD-3 ", RES_MCQ_SQD}, - {0x1000, 256 * 4, "MCQ SQD-4 ", RES_MCQ_SQD}, - {0x1400, 256 * 4, "MCQ SQD-5 ", RES_MCQ_SQD}, - {0x1800, 256 * 4, "MCQ SQD-6 ", RES_MCQ_SQD}, - {0x1c00, 256 * 4, "MCQ SQD-7 ", RES_MCQ_SQD}, + {hba->mcq_base, 0x0, 256 * 4, "MCQ HCI-0 "}, + {hba->mcq_base, 0x400, 256 * 4, "MCQ HCI-1 "}, + {mcq_vs_base, 0x0, 5 * 4, "MCQ VS-0 "}, + {opr->base, 0x0, 256 * 4, "MCQ SQD-0 "}, + {opr->base, 0x400, 256 * 4, "MCQ SQD-1 "}, + {opr->base, 0x800, 256 * 4, "MCQ SQD-2 "}, + {opr->base, 0xc00, 256 * 4, "MCQ SQD-3 "}, + {opr->base, 0x1000, 256 * 4, "MCQ SQD-4 "}, + {opr->base, 0x1400, 256 * 4, "MCQ SQD-5 "}, + {opr->base, 0x1800, 256 * 4, "MCQ SQD-6 "}, + {opr->base, 0x1c00, 256 * 4, "MCQ SQD-7 "}, + }; for (int i = 0; i < ARRAY_SIZE(mcq_dumps); i++) { ufs_qcom_dump_regs(hba, mcq_dumps[i].offset, mcq_dumps[i].len, - mcq_dumps[i].prefix, mcq_dumps[i].id); + mcq_dumps[i].prefix, mcq_dumps[i].base); cond_resched(); } } @@ -1910,116 +1954,68 @@ static void ufs_qcom_config_scaling_param(struct ufs_hba *hba, hba->clk_scaling.suspend_on_no_request = true; } -/* Resources */ -static const struct ufshcd_res_info ufs_res_info[RES_MAX] = { - {.name = "ufs_mem",}, - {.name = "mcq",}, - /* Submission Queue DAO */ - {.name = "mcq_sqd",}, - /* Submission Queue Interrupt Status */ - {.name = "mcq_sqis",}, - /* Completion Queue DAO */ - {.name = "mcq_cqd",}, - /* Completion Queue Interrupt Status */ - {.name = "mcq_cqis",}, - /* MCQ vendor specific */ - {.name = "mcq_vs",}, -}; - static int ufs_qcom_mcq_config_resource(struct ufs_hba *hba) { struct platform_device *pdev = to_platform_device(hba->dev); - struct ufshcd_res_info *res; - struct resource *res_mem, *res_mcq; - int i, ret; - - memcpy(hba->res, ufs_res_info, sizeof(ufs_res_info)); - - for (i = 0; i < RES_MAX; i++) { - res = &hba->res[i]; - res->resource = platform_get_resource_byname(pdev, - IORESOURCE_MEM, - res->name); - if (!res->resource) { - dev_info(hba->dev, "Resource %s not provided\n", res->name); - if (i == RES_UFS) - return -ENODEV; - continue; - } else if (i == RES_UFS) { - res_mem = res->resource; - res->base = hba->mmio_base; - continue; - } + struct resource *res; - res->base = devm_ioremap_resource(hba->dev, res->resource); - if (IS_ERR(res->base)) { - dev_err(hba->dev, "Failed to map res %s, err=%d\n", - res->name, (int)PTR_ERR(res->base)); - ret = PTR_ERR(res->base); - res->base = NULL; - return ret; - } + /* Map the MCQ configuration region */ + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "mcq"); + if (!res) { + dev_err(hba->dev, "MCQ resource not found in device tree\n"); + return -ENODEV; } - /* MCQ resource provided in DT */ - res = &hba->res[RES_MCQ]; - /* Bail if MCQ resource is provided */ - if (res->base) - goto out; - - /* Explicitly allocate MCQ resource 
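The ufs-qcom MCQ resource rework in this region collapses the table-driven resource handling into a single named lookup plus devm_ioremap_resource(), which requests and maps the region and returns an ERR_PTR on failure. A minimal sketch of that lookup; the "mcq" resource name follows the hunk, the helper name is illustrative:

#include <linux/platform_device.h>
#include <linux/io.h>
#include <linux/err.h>

static void __iomem *map_mcq(struct platform_device *pdev)
{
	struct resource *res;

	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "mcq");
	if (!res)
		return ERR_PTR(-ENODEV);	/* region must be described in DT */

	/* devm_ioremap_resource() checks, reserves and maps the region */
	return devm_ioremap_resource(&pdev->dev, res);
}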
@@ -1910,116 +1954,68 @@ static void ufs_qcom_config_scaling_param(struct ufs_hba *hba,
         hba->clk_scaling.suspend_on_no_request = true;
 }
 
-/* Resources */
-static const struct ufshcd_res_info ufs_res_info[RES_MAX] = {
-        {.name = "ufs_mem",},
-        {.name = "mcq",},
-        /* Submission Queue DAO */
-        {.name = "mcq_sqd",},
-        /* Submission Queue Interrupt Status */
-        {.name = "mcq_sqis",},
-        /* Completion Queue DAO */
-        {.name = "mcq_cqd",},
-        /* Completion Queue Interrupt Status */
-        {.name = "mcq_cqis",},
-        /* MCQ vendor specific */
-        {.name = "mcq_vs",},
-};
-
 static int ufs_qcom_mcq_config_resource(struct ufs_hba *hba)
 {
         struct platform_device *pdev = to_platform_device(hba->dev);
-        struct ufshcd_res_info *res;
-        struct resource *res_mem, *res_mcq;
-        int i, ret;
-
-        memcpy(hba->res, ufs_res_info, sizeof(ufs_res_info));
-
-        for (i = 0; i < RES_MAX; i++) {
-                res = &hba->res[i];
-                res->resource = platform_get_resource_byname(pdev,
-                                                             IORESOURCE_MEM,
-                                                             res->name);
-                if (!res->resource) {
-                        dev_info(hba->dev, "Resource %s not provided\n", res->name);
-                        if (i == RES_UFS)
-                                return -ENODEV;
-                        continue;
-                } else if (i == RES_UFS) {
-                        res_mem = res->resource;
-                        res->base = hba->mmio_base;
-                        continue;
-                }
+        struct resource *res;
 
-                res->base = devm_ioremap_resource(hba->dev, res->resource);
-                if (IS_ERR(res->base)) {
-                        dev_err(hba->dev, "Failed to map res %s, err=%d\n",
-                                res->name, (int)PTR_ERR(res->base));
-                        ret = PTR_ERR(res->base);
-                        res->base = NULL;
-                        return ret;
-                }
+        /* Map the MCQ configuration region */
+        res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "mcq");
+        if (!res) {
+                dev_err(hba->dev, "MCQ resource not found in device tree\n");
+                return -ENODEV;
         }
 
-        /* MCQ resource provided in DT */
-        res = &hba->res[RES_MCQ];
-        /* Bail if MCQ resource is provided */
-        if (res->base)
-                goto out;
-
-        /* Explicitly allocate MCQ resource from ufs_mem */
-        res_mcq = devm_kzalloc(hba->dev, sizeof(*res_mcq), GFP_KERNEL);
-        if (!res_mcq)
-                return -ENOMEM;
-
-        res_mcq->start = res_mem->start +
-                         MCQ_SQATTR_OFFSET(hba->mcq_capabilities);
-        res_mcq->end = res_mcq->start + hba->nr_hw_queues * MCQ_QCFG_SIZE - 1;
-        res_mcq->flags = res_mem->flags;
-        res_mcq->name = "mcq";
-
-        ret = insert_resource(&iomem_resource, res_mcq);
-        if (ret) {
-                dev_err(hba->dev, "Failed to insert MCQ resource, err=%d\n",
-                        ret);
-                return ret;
-        }
-
-        res->base = devm_ioremap_resource(hba->dev, res_mcq);
-        if (IS_ERR(res->base)) {
-                dev_err(hba->dev, "MCQ registers mapping failed, err=%d\n",
-                        (int)PTR_ERR(res->base));
-                ret = PTR_ERR(res->base);
-                goto ioremap_err;
+        hba->mcq_base = devm_ioremap_resource(hba->dev, res);
+        if (IS_ERR(hba->mcq_base)) {
+                dev_err(hba->dev, "Failed to map MCQ region: %ld\n",
+                        PTR_ERR(hba->mcq_base));
+                return PTR_ERR(hba->mcq_base);
         }
 
-out:
-        hba->mcq_base = res->base;
         return 0;
-ioremap_err:
-        res->base = NULL;
-        remove_resource(res_mcq);
-        return ret;
 }
 
 static int ufs_qcom_op_runtime_config(struct ufs_hba *hba)
 {
-        struct ufshcd_res_info *mem_res, *sqdao_res;
         struct ufshcd_mcq_opr_info_t *opr;
         int i;
+        u32 doorbell_offsets[OPR_MAX];
 
-        mem_res = &hba->res[RES_UFS];
-        sqdao_res = &hba->res[RES_MCQ_SQD];
+        /*
+         * Configure doorbell address offsets in MCQ configuration registers.
+         * These values are offsets relative to mmio_base (UFS_HCI_BASE).
+         *
+         * Memory Layout:
+         * - mmio_base = UFS_HCI_BASE
+         * - mcq_base = MCQ_CONFIG_BASE = mmio_base + (UFS_QCOM_MCQCAP_QCFGPTR * 0x200)
+         * - Doorbell registers are at: mmio_base + (UFS_QCOM_MCQCAP_QCFGPTR * 0x200) +
+         *   UFS_QCOM_MCQ_SQD_OFFSET
+         * - Which is also: mcq_base + UFS_QCOM_MCQ_SQD_OFFSET
+         */
 
-        if (!mem_res->base || !sqdao_res->base)
-                return -EINVAL;
+        doorbell_offsets[OPR_SQD] = UFS_QCOM_SQD_ADDR_OFFSET;
+        doorbell_offsets[OPR_SQIS] = UFS_QCOM_SQIS_ADDR_OFFSET;
+        doorbell_offsets[OPR_CQD] = UFS_QCOM_CQD_ADDR_OFFSET;
+        doorbell_offsets[OPR_CQIS] = UFS_QCOM_CQIS_ADDR_OFFSET;
 
+        /*
+         * Configure MCQ operation registers.
+         *
+         * The doorbell registers are physically located within the MCQ region:
+         * - doorbell_physical_addr = mmio_base + doorbell_offset
+         * - doorbell_physical_addr = mcq_base + (doorbell_offset - MCQ_CONFIG_OFFSET)
+         */
         for (i = 0; i < OPR_MAX; i++) {
                 opr = &hba->mcq_opr[i];
-                opr->offset = sqdao_res->resource->start -
-                              mem_res->resource->start + 0x40 * i;
-                opr->stride = 0x100;
-                opr->base = sqdao_res->base + 0x40 * i;
+                opr->offset = doorbell_offsets[i];      /* Offset relative to mmio_base */
+                opr->stride = UFS_QCOM_MCQ_STRIDE;      /* 256 bytes between queues */
+
+                /*
+                 * Calculate the actual doorbell base address within MCQ region:
+                 * base = mcq_base + (doorbell_offset - MCQ_CONFIG_OFFSET)
+                 */
+                opr->base = hba->mcq_base + (opr->offset - UFS_QCOM_MCQ_CONFIG_OFFSET);
         }
 
         return 0;
@@ -2034,12 +2030,8 @@ static int ufs_qcom_get_hba_mac(struct ufs_hba *hba)
 
 static int ufs_qcom_get_outstanding_cqs(struct ufs_hba *hba, unsigned long *ocqs)
 {
-        struct ufshcd_res_info *mcq_vs_res = &hba->res[RES_MCQ_VS];
-
-        if (!mcq_vs_res->base)
-                return -EINVAL;
-
-        *ocqs = readl(mcq_vs_res->base + UFS_MEM_CQIS_VS);
+        /* Read from MCQ vendor-specific register in MCQ region */
+        *ocqs = readl(hba->mcq_base + UFS_MEM_CQIS_VS);
 
         return 0;
 }
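For readers less familiar with the platform-device APIs used above, the sketch below isolates the "look up a named MEM resource and map it" pattern that the reworked ufs_qcom_mcq_config_resource() relies on. It is an illustrative sketch only, not part of the patch; example_map_mcq() is a hypothetical helper name.

/*
 * Illustrative sketch, not from the patch: minimal named-resource mapping,
 * mirroring the simplified MCQ setup above. example_map_mcq() is hypothetical.
 */
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/io.h>
#include <linux/ioport.h>
#include <linux/platform_device.h>

static void __iomem *example_map_mcq(struct platform_device *pdev)
{
        struct resource *res;

        /* The name must match a reg-names entry of the device node. */
        res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "mcq");
        if (!res)
                return IOMEM_ERR_PTR(-ENODEV);

        /* Request and map the region; returns an ERR_PTR() on failure. */
        return devm_ioremap_resource(&pdev->dev, res);
}

Compared with the removed ufs_res_info[] loop, all bookkeeping (region request, unmap on driver detach, error reporting) is left to devm_ioremap_resource(), which is why the new function fits in a handful of lines.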
diff --git a/drivers/ufs/host/ufs-qcom.h b/drivers/ufs/host/ufs-qcom.h
index e0e129af7c16..380d02333d38 100644
--- a/drivers/ufs/host/ufs-qcom.h
+++ b/drivers/ufs/host/ufs-qcom.h
@@ -33,6 +33,28 @@
 #define DL_VS_CLK_CFG_MASK              GENMASK(9, 0)
 #define DME_VS_CORE_CLK_CTRL_DME_HW_CGC_EN      BIT(9)
 
+/* Qualcomm MCQ Configuration */
+#define UFS_QCOM_MCQCAP_QCFGPTR         224     /* 0xE0 in hex */
+#define UFS_QCOM_MCQ_CONFIG_OFFSET      (UFS_QCOM_MCQCAP_QCFGPTR * 0x200)       /* 0x1C000 */
+
+/* Doorbell offsets within MCQ region (relative to MCQ_CONFIG_BASE) */
+#define UFS_QCOM_MCQ_SQD_OFFSET         0x5000
+#define UFS_QCOM_MCQ_CQD_OFFSET         0x5080
+#define UFS_QCOM_MCQ_SQIS_OFFSET        0x5040
+#define UFS_QCOM_MCQ_CQIS_OFFSET        0x50C0
+#define UFS_QCOM_MCQ_STRIDE             0x100
+
+/* Calculated doorbell address offsets (relative to mmio_base) */
+#define UFS_QCOM_SQD_ADDR_OFFSET        (UFS_QCOM_MCQ_CONFIG_OFFSET + UFS_QCOM_MCQ_SQD_OFFSET)
+#define UFS_QCOM_CQD_ADDR_OFFSET        (UFS_QCOM_MCQ_CONFIG_OFFSET + UFS_QCOM_MCQ_CQD_OFFSET)
+#define UFS_QCOM_SQIS_ADDR_OFFSET       (UFS_QCOM_MCQ_CONFIG_OFFSET + UFS_QCOM_MCQ_SQIS_OFFSET)
+#define UFS_QCOM_CQIS_ADDR_OFFSET       (UFS_QCOM_MCQ_CONFIG_OFFSET + UFS_QCOM_MCQ_CQIS_OFFSET)
+#define REG_UFS_MCQ_STRIDE              UFS_QCOM_MCQ_STRIDE
+
+/* MCQ Vendor specific address offsets (relative to MCQ_CONFIG_BASE) */
+#define UFS_MEM_VS_BASE                 0x4000
+#define UFS_MEM_CQIS_VS                 0x4008
+
 /* QCOM UFS host controller vendor specific registers */
 enum {
         REG_UFS_SYS1CLK_1US             = 0xC0,
@@ -60,7 +82,7 @@ enum {
         UFS_AH8_CFG                     = 0xFC,
 
         UFS_RD_REG_MCQ                  = 0xD00,
-
+        UFS_MEM_ICE_CFG                 = 0x2600,
         REG_UFS_MEM_ICE_CONFIG          = 0x260C,
         REG_UFS_MEM_ICE_NUM_CORE        = 0x2664,
 
@@ -95,10 +117,6 @@ enum {
         REG_UFS_SW_H8_EXIT_CNT          = 0x2710,
 };
 
-enum {
-        UFS_MEM_CQIS_VS         = 0x8,
-};
-
 #define UFS_CNTLR_2_x_x_VEN_REGS_OFFSET(x)      (0x000 + x)
 #define UFS_CNTLR_3_x_x_VEN_REGS_OFFSET(x)      (0x400 + x)
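The new macros encode a fixed address map: the MCQ configuration region sits at UFS_QCOM_MCQCAP_QCFGPTR * 0x200 = 0x1C000 from mmio_base, and each doorbell offset is that base plus a per-register offset. The short, stand-alone C sketch below is an illustration only (plain user-space C, hypothetical main(), not code from the patch); it just evaluates that arithmetic for the SQ doorbell case.

/* Illustrative sketch, not from the patch: the address math behind the macros. */
#include <stdio.h>

#define UFS_QCOM_MCQCAP_QCFGPTR    224
#define UFS_QCOM_MCQ_CONFIG_OFFSET (UFS_QCOM_MCQCAP_QCFGPTR * 0x200)    /* 0x1C000 */
#define UFS_QCOM_MCQ_SQD_OFFSET    0x5000
#define UFS_QCOM_SQD_ADDR_OFFSET   (UFS_QCOM_MCQ_CONFIG_OFFSET + UFS_QCOM_MCQ_SQD_OFFSET)

int main(void)
{
        /* SQ doorbells relative to mmio_base: 0x1C000 + 0x5000 = 0x21000 */
        printf("SQD from mmio_base: 0x%x\n", UFS_QCOM_SQD_ADDR_OFFSET);

        /*
         * The same registers expressed relative to mcq_base, which is how
         * ufs_qcom_op_runtime_config() derives opr->base: back to 0x5000.
         */
        printf("SQD from mcq_base:  0x%x\n",
               UFS_QCOM_SQD_ADDR_OFFSET - UFS_QCOM_MCQ_CONFIG_OFFSET);
        return 0;
}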
diff --git a/drivers/ufs/host/ufshcd-pltfrm.c b/drivers/ufs/host/ufshcd-pltfrm.c
index ffe5d1d2b215..c2dafb583cf5 100644
--- a/drivers/ufs/host/ufshcd-pltfrm.c
+++ b/drivers/ufs/host/ufshcd-pltfrm.c
@@ -430,6 +430,39 @@ int ufshcd_negotiate_pwr_params(const struct ufs_host_params *host_params,
 }
 EXPORT_SYMBOL_GPL(ufshcd_negotiate_pwr_params);
 
+/**
+ * ufshcd_parse_gear_limits - Parse DT-based gear and rate limits for UFS
+ * @hba: Pointer to UFS host bus adapter instance
+ * @host_params: Pointer to UFS host parameters structure to be updated
+ *
+ * This function reads optional device tree properties to apply
+ * platform-specific constraints.
+ *
+ * "limit-hs-gear": Specifies the max HS gear.
+ * "limit-gear-rate": Specifies the max High-Speed rate.
+ */
+void ufshcd_parse_gear_limits(struct ufs_hba *hba, struct ufs_host_params *host_params)
+{
+        struct device_node *np = hba->dev->of_node;
+        u32 hs_gear;
+        const char *hs_rate;
+
+        if (!of_property_read_u32(np, "limit-hs-gear", &hs_gear)) {
+                host_params->hs_tx_gear = hs_gear;
+                host_params->hs_rx_gear = hs_gear;
+        }
+
+        if (!of_property_read_string(np, "limit-gear-rate", &hs_rate)) {
+                if (!strcmp(hs_rate, "rate-a"))
+                        host_params->hs_rate = PA_HS_MODE_A;
+                else if (!strcmp(hs_rate, "rate-b"))
+                        host_params->hs_rate = PA_HS_MODE_B;
+                else
+                        dev_warn(hba->dev, "Invalid rate: %s\n", hs_rate);
+        }
+}
+EXPORT_SYMBOL_GPL(ufshcd_parse_gear_limits);
+
 void ufshcd_init_host_params(struct ufs_host_params *host_params)
 {
         *host_params = (struct ufs_host_params){
diff --git a/drivers/ufs/host/ufshcd-pltfrm.h b/drivers/ufs/host/ufshcd-pltfrm.h
index 3017f8e8f93c..0a18a8aed94d 100644
--- a/drivers/ufs/host/ufshcd-pltfrm.h
+++ b/drivers/ufs/host/ufshcd-pltfrm.h
@@ -29,6 +29,7 @@ int ufshcd_negotiate_pwr_params(const struct ufs_host_params *host_params,
                                 const struct ufs_pa_layer_attr *dev_max,
                                 struct ufs_pa_layer_attr *agreed_pwr);
 void ufshcd_init_host_params(struct ufs_host_params *host_params);
+void ufshcd_parse_gear_limits(struct ufs_hba *hba, struct ufs_host_params *host_params);
 int ufshcd_pltfrm_init(struct platform_device *pdev,
                        const struct ufs_hba_variant_ops *vops);
 void ufshcd_pltfrm_remove(struct platform_device *pdev);
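As a minimal sketch of how a variant driver consumes the new helper (mirroring ufs_qcom_parse_gear_limits() earlier in this series), assuming a hypothetical example_apply_gear_limits() wrapper:

/*
 * Illustrative sketch, not part of the patch: combine the existing
 * ufshcd_init_host_params() defaults with the new DT-based limits.
 * example_apply_gear_limits() is a hypothetical name.
 */
#include <ufs/ufshcd.h>
#include "ufshcd-pltfrm.h"

static void example_apply_gear_limits(struct ufs_hba *hba,
                                      struct ufs_host_params *host_params)
{
        u32 default_tx_gear;

        ufshcd_init_host_params(host_params);           /* driver defaults first */
        default_tx_gear = host_params->hs_tx_gear;

        ufshcd_parse_gear_limits(hba, host_params);     /* optional DT limits on top */

        if (host_params->hs_tx_gear != default_tx_gear)
                dev_info(hba->dev, "HS TX gear limited to G%u by DT\n",
                         host_params->hs_tx_gear);
}

The agreed power mode is still negotiated later via ufshcd_negotiate_pwr_params(), so the DT properties only tighten the host-side caps; they cannot raise them above the driver defaults in a meaningful way.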
diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index 5940e2eb9231..96cc9b389246 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -279,14 +279,7 @@ static int fbcon_get_rotate(struct fb_info *info)
 
 static bool fbcon_skip_panic(struct fb_info *info)
 {
-/* panic_cpu is not exported, and can't be used if built as module. Use
- * oops_in_progress instead, but non-fatal oops won't be printed.
- */
-#if defined(MODULE)
-        return (info->skip_panic && unlikely(oops_in_progress));
-#else
-        return (info->skip_panic && unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID));
-#endif
+        return (info->skip_panic && unlikely(panic_in_progress()));
 }
 
 static inline bool fbcon_is_active(struct vc_data *vc, struct fb_info *info)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index f5062061c408..c147145a6593 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -378,7 +378,7 @@ static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist
          * is initialized by the hardware. Explicitly check/unpoison it
          * depending on the direction.
          */
-        kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, direction);
+        kmsan_handle_dma(sg_phys(sg), sg->length, direction);
         *addr = (dma_addr_t)sg_phys(sg);
         return 0;
 }
@@ -3157,7 +3157,7 @@ dma_addr_t virtqueue_dma_map_single_attrs(struct virtqueue *_vq, void *ptr,
         struct vring_virtqueue *vq = to_vvq(_vq);
 
         if (!vq->use_dma_api) {
-                kmsan_handle_dma(virt_to_page(ptr), offset_in_page(ptr), size, dir);
+                kmsan_handle_dma(virt_to_phys(ptr), size, dir);
                 return (dma_addr_t)virt_to_phys(ptr);
         }
 
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index da1a7d3d377c..dd7747a2de87 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -392,6 +392,25 @@ xen_swiotlb_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
         }
 }
 
+static dma_addr_t xen_swiotlb_direct_map_resource(struct device *dev,
+                                                  phys_addr_t paddr,
+                                                  size_t size,
+                                                  enum dma_data_direction dir,
+                                                  unsigned long attrs)
+{
+        dma_addr_t dma_addr = paddr;
+
+        if (unlikely(!dma_capable(dev, dma_addr, size, false))) {
+                dev_err_once(dev,
+                             "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
+                             &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
+                WARN_ON_ONCE(1);
+                return DMA_MAPPING_ERROR;
+        }
+
+        return dma_addr;
+}
+
 /*
  * Return whether the given device DMA address mask can be supported
  * properly. For example, if your device can only drive the low 24-bits
@@ -426,5 +445,5 @@ const struct dma_map_ops xen_swiotlb_dma_ops = {
         .alloc_pages_op = dma_common_alloc_pages,
         .free_pages = dma_common_free_pages,
         .max_mapping_size = swiotlb_max_mapping_size,
-        .map_resource = dma_direct_map_resource,
+        .map_resource = xen_swiotlb_direct_map_resource,
 };
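For context, the .map_resource callback replaced above is reached through dma_map_resource(). The hedged sketch below shows a hypothetical caller; example_map_device_fifo() and its parameters are illustrative and not taken from the patch.

/*
 * Illustrative sketch, not part of the patch: a consumer of the
 * .map_resource path served by xen_swiotlb_direct_map_resource().
 */
#include <linux/dma-mapping.h>
#include <linux/errno.h>

static int example_map_device_fifo(struct device *dev, phys_addr_t fifo_phys,
                                   size_t len, dma_addr_t *dma_out)
{
        dma_addr_t dma;

        /*
         * For MMIO targets this ends up in ops->map_resource; on Xen that is
         * now xen_swiotlb_direct_map_resource(), which uses the physical
         * address as the DMA address after checking it against the device's
         * DMA mask and bus limit.
         */
        dma = dma_map_resource(dev, fifo_phys, len, DMA_BIDIRECTIONAL, 0);
        if (dma_mapping_error(dev, dma))
                return -EIO;

        *dma_out = dma;
        return 0;
}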