Diffstat (limited to 'drivers/net/ethernet/microsoft/mana/gdma_main.c')
 -rw-r--r--  drivers/net/ethernet/microsoft/mana/gdma_main.c  977
 1 file changed, 834 insertions(+), 143 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index b144f2237748..efb4e412ec7e 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -1,12 +1,33 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* Copyright (c) 2021, Microsoft Corporation. */ +#include <linux/debugfs.h> #include <linux/module.h> #include <linux/pci.h> #include <linux/utsname.h> #include <linux/version.h> +#include <linux/msi.h> +#include <linux/irqdomain.h> +#include <linux/export.h> #include <net/mana/mana.h> +#include <net/mana/hw_channel.h> + +struct dentry *mana_debugfs_root; + +struct mana_dev_recovery { + struct list_head list; + struct pci_dev *pdev; + enum gdma_eqe_type type; +}; + +static struct mana_dev_recovery_work { + struct list_head dev_list; + struct delayed_work work; + + /* Lock for dev_list above */ + spinlock_t lock; +} mana_dev_recovery_work; static u32 mana_gd_r32(struct gdma_context *g, u64 offset) { @@ -28,6 +49,9 @@ static void mana_gd_init_pf_regs(struct pci_dev *pdev) gc->db_page_base = gc->bar0_va + mana_gd_r64(gc, GDMA_PF_REG_DB_PAGE_OFF); + gc->phys_db_page_base = gc->bar0_pa + + mana_gd_r64(gc, GDMA_PF_REG_DB_PAGE_OFF); + sriov_base_off = mana_gd_r64(gc, GDMA_SRIOV_REG_CFG_BASE_OFF); sriov_base_va = gc->bar0_va + sriov_base_off; @@ -60,6 +84,24 @@ static void mana_gd_init_registers(struct pci_dev *pdev) mana_gd_init_vf_regs(pdev); } +/* Suppress logging when we set timeout to zero */ +bool mana_need_log(struct gdma_context *gc, int err) +{ + struct hw_channel_context *hwc; + + if (err != -ETIMEDOUT) + return true; + + if (!gc) + return true; + + hwc = gc->hwc.driver_data; + if (hwc && hwc->hwc_timeout == 0) + return false; + + return true; +} + static int mana_gd_query_max_resources(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); @@ -77,8 +119,15 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev) return err ? err : -EPROTO; } - if (gc->num_msix_usable > resp.max_msix) - gc->num_msix_usable = resp.max_msix; + if (!pci_msix_can_alloc_dyn(pdev)) { + if (gc->num_msix_usable > resp.max_msix) + gc->num_msix_usable = resp.max_msix; + } else { + /* If dynamic allocation is enabled we have already allocated + * hwc msi + */ + gc->num_msix_usable = min(resp.max_msix, num_online_cpus() + 1); + } if (gc->num_msix_usable <= 1) return -ENOSPC; @@ -106,15 +155,35 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev) return 0; } +static int mana_gd_query_hwc_timeout(struct pci_dev *pdev, u32 *timeout_val) +{ + struct gdma_context *gc = pci_get_drvdata(pdev); + struct gdma_query_hwc_timeout_resp resp = {}; + struct gdma_query_hwc_timeout_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_QUERY_HWC_TIMEOUT, + sizeof(req), sizeof(resp)); + req.timeout_ms = *timeout_val; + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) + return err ? err : -EPROTO; + + *timeout_val = resp.timeout_ms; + + return 0; +} + static int mana_gd_detect_devices(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); struct gdma_list_devices_resp resp = {}; struct gdma_general_req req = {}; struct gdma_dev_id dev; - u32 i, max_num_devs; + int found_dev = 0; u16 dev_type; int err; + u32 i; mana_gd_init_req_hdr(&req.hdr, GDMA_LIST_DEVICES, sizeof(req), sizeof(resp)); @@ -126,12 +195,17 @@ static int mana_gd_detect_devices(struct pci_dev *pdev) return err ? 
err : -EPROTO; } - max_num_devs = min_t(u32, MAX_NUM_GDMA_DEVICES, resp.num_of_devs); - - for (i = 0; i < max_num_devs; i++) { + for (i = 0; i < GDMA_DEV_LIST_SIZE && + found_dev < resp.num_of_devs; i++) { dev = resp.devs[i]; dev_type = dev.type; + /* Skip empty devices */ + if (dev.as_uint32 == 0) + continue; + + found_dev++; + /* HWC is already detected in mana_hwc_create_channel(). */ if (dev_type == GDMA_DEVICE_HWC) continue; @@ -139,6 +213,9 @@ static int mana_gd_detect_devices(struct pci_dev *pdev) if (dev_type == GDMA_DEVICE_MANA) { gc->mana.gdma_context = gc; gc->mana.dev_id = dev; + } else if (dev_type == GDMA_DEVICE_MANA_IB) { + gc->mana_ib.dev_id = dev; + gc->mana_ib.gdma_context = gc; } } @@ -152,7 +229,7 @@ int mana_gd_send_request(struct gdma_context *gc, u32 req_len, const void *req, return mana_hwc_send_request(hwc, req_len, req, resp_len, resp); } -EXPORT_SYMBOL_NS(mana_gd_send_request, NET_MANA); +EXPORT_SYMBOL_NS(mana_gd_send_request, "NET_MANA"); int mana_gd_alloc_memory(struct gdma_context *gc, unsigned int length, struct gdma_mem_info *gmi) @@ -160,7 +237,7 @@ int mana_gd_alloc_memory(struct gdma_context *gc, unsigned int length, dma_addr_t dma_handle; void *buf; - if (length < PAGE_SIZE || !is_power_of_2(length)) + if (length < MANA_PAGE_SIZE || !is_power_of_2(length)) return -EINVAL; gmi->dev = gc->dev; @@ -235,8 +312,9 @@ static int mana_gd_disable_queue(struct gdma_queue *queue) err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); if (err || resp.hdr.status) { - dev_err(gc->dev, "Failed to disable queue: %d, 0x%x\n", err, - resp.hdr.status); + if (mana_need_log(gc, err)) + dev_err(gc->dev, "Failed to disable queue: %d, 0x%x\n", err, + resp.hdr.status); return err ? err : -EPROTO; } @@ -300,9 +378,13 @@ static void mana_gd_ring_doorbell(struct gdma_context *gc, u32 db_index, void mana_gd_wq_ring_doorbell(struct gdma_context *gc, struct gdma_queue *queue) { + /* Hardware Spec specifies that software client should set 0 for + * wqe_cnt for Receive Queues. This value is not used in Send Queues. 
+ */ mana_gd_ring_doorbell(gc, queue->gdma_dev->doorbell, queue->type, - queue->id, queue->head * GDMA_WQE_BU_SIZE, 1); + queue->id, queue->head * GDMA_WQE_BU_SIZE, 0); } +EXPORT_SYMBOL_NS(mana_gd_wq_ring_doorbell, "NET_MANA"); void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit) { @@ -315,12 +397,173 @@ void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit) mana_gd_ring_doorbell(gc, cq->gdma_dev->doorbell, cq->type, cq->id, head, arm_bit); } +EXPORT_SYMBOL_NS(mana_gd_ring_cq, "NET_MANA"); + +#define MANA_SERVICE_PERIOD 10 + +static void mana_serv_rescan(struct pci_dev *pdev) +{ + struct pci_bus *parent; + + pci_lock_rescan_remove(); + + parent = pdev->bus; + if (!parent) { + dev_err(&pdev->dev, "MANA service: no parent bus\n"); + goto out; + } + + pci_stop_and_remove_bus_device(pdev); + pci_rescan_bus(parent); + +out: + pci_unlock_rescan_remove(); +} + +static void mana_serv_fpga(struct pci_dev *pdev) +{ + struct pci_bus *bus, *parent; + + pci_lock_rescan_remove(); + + bus = pdev->bus; + if (!bus) { + dev_err(&pdev->dev, "MANA service: no bus\n"); + goto out; + } + + parent = bus->parent; + if (!parent) { + dev_err(&pdev->dev, "MANA service: no parent bus\n"); + goto out; + } + + pci_stop_and_remove_bus_device(bus->self); + + msleep(MANA_SERVICE_PERIOD * 1000); + + pci_rescan_bus(parent); + +out: + pci_unlock_rescan_remove(); +} + +static void mana_serv_reset(struct pci_dev *pdev) +{ + struct gdma_context *gc = pci_get_drvdata(pdev); + struct hw_channel_context *hwc; + int ret; + + if (!gc) { + /* Perform PCI rescan on device if GC is not set up */ + dev_err(&pdev->dev, "MANA service: GC not setup, rescanning\n"); + mana_serv_rescan(pdev); + return; + } + + hwc = gc->hwc.driver_data; + if (!hwc) { + dev_err(&pdev->dev, "MANA service: no HWC\n"); + goto out; + } + + /* HWC is not responding in this case, so don't wait */ + hwc->hwc_timeout = 0; + + dev_info(&pdev->dev, "MANA reset cycle start\n"); + + mana_gd_suspend(pdev, PMSG_SUSPEND); + + msleep(MANA_SERVICE_PERIOD * 1000); + + ret = mana_gd_resume(pdev); + if (ret == -ETIMEDOUT || ret == -EPROTO) { + /* Perform PCI rescan on device if we failed on HWC */ + dev_err(&pdev->dev, "MANA service: resume failed, rescanning\n"); + mana_serv_rescan(pdev); + goto out; + } + + if (ret) + dev_info(&pdev->dev, "MANA reset cycle failed err %d\n", ret); + else + dev_info(&pdev->dev, "MANA reset cycle completed\n"); + +out: + gc->in_service = false; +} + +struct mana_serv_work { + struct work_struct serv_work; + struct pci_dev *pdev; + enum gdma_eqe_type type; +}; + +static void mana_do_service(enum gdma_eqe_type type, struct pci_dev *pdev) +{ + switch (type) { + case GDMA_EQE_HWC_FPGA_RECONFIG: + mana_serv_fpga(pdev); + break; + + case GDMA_EQE_HWC_RESET_REQUEST: + mana_serv_reset(pdev); + break; + + default: + dev_err(&pdev->dev, "MANA service: unknown type %d\n", type); + break; + } +} + +static void mana_recovery_delayed_func(struct work_struct *w) +{ + struct mana_dev_recovery_work *work; + struct mana_dev_recovery *dev; + unsigned long flags; + + work = container_of(w, struct mana_dev_recovery_work, work.work); + + spin_lock_irqsave(&work->lock, flags); + + while (!list_empty(&work->dev_list)) { + dev = list_first_entry(&work->dev_list, + struct mana_dev_recovery, list); + list_del(&dev->list); + spin_unlock_irqrestore(&work->lock, flags); + + mana_do_service(dev->type, dev->pdev); + pci_dev_put(dev->pdev); + kfree(dev); + + spin_lock_irqsave(&work->lock, flags); + } + + spin_unlock_irqrestore(&work->lock, flags); +} + +static void 
mana_serv_func(struct work_struct *w) +{ + struct mana_serv_work *mns_wk; + struct pci_dev *pdev; + + mns_wk = container_of(w, struct mana_serv_work, serv_work); + pdev = mns_wk->pdev; + + if (pdev) + mana_do_service(mns_wk->type, pdev); + + pci_dev_put(pdev); + kfree(mns_wk); + module_put(THIS_MODULE); +} static void mana_gd_process_eqe(struct gdma_queue *eq) { u32 head = eq->head % (eq->queue_size / GDMA_EQE_SIZE); struct gdma_context *gc = eq->gdma_dev->gdma_context; struct gdma_eqe *eq_eqe_ptr = eq->queue_mem_ptr; + struct mana_serv_work *mns_wk; union gdma_eqe_info eqe_info; enum gdma_eqe_type type; struct gdma_event event; @@ -355,6 +598,9 @@ static void mana_gd_process_eqe(struct gdma_queue *eq) case GDMA_EQE_HWC_INIT_EQ_ID_DB: case GDMA_EQE_HWC_INIT_DATA: case GDMA_EQE_HWC_INIT_DONE: + case GDMA_EQE_HWC_SOC_SERVICE: + case GDMA_EQE_RNIC_QP_FATAL: + case GDMA_EQE_HWC_SOC_RECONFIG_DATA: if (!eq->eq.callback) break; @@ -363,6 +609,46 @@ static void mana_gd_process_eqe(struct gdma_queue *eq) eq->eq.callback(eq->eq.context, eq, &event); break; + case GDMA_EQE_HWC_FPGA_RECONFIG: + case GDMA_EQE_HWC_RESET_REQUEST: + dev_info(gc->dev, "Recv MANA service type:%d\n", type); + + if (!test_and_set_bit(GC_PROBE_SUCCEEDED, &gc->flags)) { + /* + * Device is in probe and we received a hardware reset + * event, the probe function will detect that the flag + * has changed and perform service procedure. + */ + dev_info(gc->dev, + "Service is to be processed in probe\n"); + break; + } + + if (gc->in_service) { + dev_info(gc->dev, "Already in service\n"); + break; + } + + if (!try_module_get(THIS_MODULE)) { + dev_info(gc->dev, "Module is unloading\n"); + break; + } + + mns_wk = kzalloc(sizeof(*mns_wk), GFP_ATOMIC); + if (!mns_wk) { + module_put(THIS_MODULE); + break; + } + + dev_info(gc->dev, "Start MANA service type:%d\n", type); + gc->in_service = true; + mns_wk->pdev = to_pci_dev(gc->dev); + mns_wk->type = type; + pci_dev_get(mns_wk->pdev); + INIT_WORK(&mns_wk->serv_work, mana_serv_func); + schedule_work(&mns_wk->serv_work); + break; + default: break; } @@ -392,8 +678,12 @@ static void mana_gd_process_eq_events(void *arg) old_bits = (eq->head / num_eqe - 1) & GDMA_EQE_OWNER_MASK; /* No more entries */ - if (owner_bits == old_bits) + if (owner_bits == old_bits) { + /* return here without ringing the doorbell */ + if (i == 0) + return; break; + } new_bits = (eq->head / num_eqe) & GDMA_EQE_OWNER_MASK; if (owner_bits != new_bits) { @@ -423,72 +713,66 @@ static int mana_gd_register_irq(struct gdma_queue *queue, struct gdma_dev *gd = queue->gdma_dev; struct gdma_irq_context *gic; struct gdma_context *gc; - struct gdma_resource *r; unsigned int msi_index; unsigned long flags; struct device *dev; int err = 0; gc = gd->gdma_context; - r = &gc->msix_resource; dev = gc->dev; + msi_index = spec->eq.msix_index; - spin_lock_irqsave(&r->lock, flags); - - msi_index = find_first_zero_bit(r->map, r->size); - if (msi_index >= r->size || msi_index >= gc->num_msix_usable) { + if (msi_index >= gc->num_msix_usable) { err = -ENOSPC; - } else { - bitmap_set(r->map, msi_index, 1); - queue->eq.msix_index = msi_index; - } - - spin_unlock_irqrestore(&r->lock, flags); - - if (err) { - dev_err(dev, "Register IRQ err:%d, msi:%u rsize:%u, nMSI:%u", - err, msi_index, r->size, gc->num_msix_usable); + dev_err(dev, "Register IRQ err:%d, msi:%u nMSI:%u", + err, msi_index, gc->num_msix_usable); return err; } - gic = &gc->irq_contexts[msi_index]; - - WARN_ON(gic->handler || gic->arg); - - gic->arg = queue; + queue->eq.msix_index = 
msi_index; + gic = xa_load(&gc->irq_contexts, msi_index); + if (WARN_ON(!gic)) + return -EINVAL; - gic->handler = mana_gd_process_eq_events; + spin_lock_irqsave(&gic->lock, flags); + list_add_rcu(&queue->entry, &gic->eq_list); + spin_unlock_irqrestore(&gic->lock, flags); return 0; } -static void mana_gd_deregiser_irq(struct gdma_queue *queue) +static void mana_gd_deregister_irq(struct gdma_queue *queue) { struct gdma_dev *gd = queue->gdma_dev; struct gdma_irq_context *gic; struct gdma_context *gc; - struct gdma_resource *r; unsigned int msix_index; unsigned long flags; + struct gdma_queue *eq; gc = gd->gdma_context; - r = &gc->msix_resource; /* At most num_online_cpus() + 1 interrupts are used. */ msix_index = queue->eq.msix_index; if (WARN_ON(msix_index >= gc->num_msix_usable)) return; - gic = &gc->irq_contexts[msix_index]; - gic->handler = NULL; - gic->arg = NULL; + gic = xa_load(&gc->irq_contexts, msix_index); + if (WARN_ON(!gic)) + return; - spin_lock_irqsave(&r->lock, flags); - bitmap_clear(r->map, msix_index, 1); - spin_unlock_irqrestore(&r->lock, flags); + spin_lock_irqsave(&gic->lock, flags); + list_for_each_entry_rcu(eq, &gic->eq_list, entry) { + if (queue == eq) { + list_del_rcu(&eq->entry); + break; + } + } + spin_unlock_irqrestore(&gic->lock, flags); queue->eq.msix_index = INVALID_PCI_MSIX_INDEX; + synchronize_rcu(); } int mana_gd_test_eq(struct gdma_context *gc, struct gdma_queue *eq) @@ -511,7 +795,8 @@ int mana_gd_test_eq(struct gdma_context *gc, struct gdma_queue *eq) err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); if (err) { - dev_err(dev, "test_eq failed: %d\n", err); + if (mana_need_log(gc, err)) + dev_err(dev, "test_eq failed: %d\n", err); goto out; } @@ -546,11 +831,11 @@ static void mana_gd_destroy_eq(struct gdma_context *gc, bool flush_evenets, if (flush_evenets) { err = mana_gd_test_eq(gc, queue); - if (err) + if (err && mana_need_log(gc, err)) dev_warn(gc->dev, "Failed to flush EQ: %d\n", err); } - mana_gd_deregiser_irq(queue); + mana_gd_deregister_irq(queue); if (queue->eq.disable_needed) mana_gd_disable_queue(queue); @@ -566,6 +851,7 @@ static int mana_gd_create_eq(struct gdma_dev *gd, int err; queue->eq.msix_index = INVALID_PCI_MSIX_INDEX; + queue->id = INVALID_QUEUE_ID; log2_num_entries = ilog2(queue->queue_size / GDMA_EQE_SIZE); @@ -643,8 +929,11 @@ int mana_gd_create_hwc_queue(struct gdma_dev *gd, gmi = &queue->mem_info; err = mana_gd_alloc_memory(gc, spec->queue_size, gmi); - if (err) + if (err) { + dev_err(gc->dev, "GDMA queue type: %d, size: %u, gdma memory allocation err: %d\n", + spec->type, spec->queue_size, err); goto free_q; + } queue->head = 0; queue->tail = 0; @@ -665,6 +954,8 @@ int mana_gd_create_hwc_queue(struct gdma_dev *gd, *queue_ptr = queue; return 0; out: + dev_err(gc->dev, "Failed to create queue type %d of size %u, err: %d\n", + spec->type, spec->queue_size, err); mana_gd_free_memory(gmi); free_q: kfree(queue); @@ -686,19 +977,20 @@ int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle) err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); if (err || resp.hdr.status) { - dev_err(gc->dev, "Failed to destroy DMA region: %d, 0x%x\n", - err, resp.hdr.status); + if (mana_need_log(gc, err)) + dev_err(gc->dev, "Failed to destroy DMA region: %d, 0x%x\n", + err, resp.hdr.status); return -EPROTO; } return 0; } -EXPORT_SYMBOL_NS(mana_gd_destroy_dma_region, NET_MANA); +EXPORT_SYMBOL_NS(mana_gd_destroy_dma_region, "NET_MANA"); static int mana_gd_create_dma_region(struct gdma_dev *gd, 
struct gdma_mem_info *gmi) { - unsigned int num_page = gmi->length / PAGE_SIZE; + unsigned int num_page = gmi->length / MANA_PAGE_SIZE; struct gdma_create_dma_region_req *req = NULL; struct gdma_create_dma_region_resp resp = {}; struct gdma_context *gc = gd->gdma_context; @@ -708,10 +1000,10 @@ static int mana_gd_create_dma_region(struct gdma_dev *gd, int err; int i; - if (length < PAGE_SIZE || !is_power_of_2(length)) + if (length < MANA_PAGE_SIZE || !is_power_of_2(length)) return -EINVAL; - if (offset_in_page(gmi->virt_addr) != 0) + if (!MANA_PAGE_ALIGNED(gmi->virt_addr)) return -EINVAL; hwc = gc->hwc.driver_data; @@ -732,7 +1024,7 @@ static int mana_gd_create_dma_region(struct gdma_dev *gd, req->page_addr_list_len = num_page; for (i = 0; i < num_page; i++) - req->page_addr_list[i] = gmi->dma_handle + i * PAGE_SIZE; + req->page_addr_list[i] = gmi->dma_handle + i * MANA_PAGE_SIZE; err = mana_gd_send_request(gc, req_msg_size, req, sizeof(resp), &resp); if (err) @@ -747,7 +1039,13 @@ static int mana_gd_create_dma_region(struct gdma_dev *gd, } gmi->dma_region_handle = resp.dma_region_handle; + dev_dbg(gc->dev, "Created DMA region handle 0x%llx\n", + gmi->dma_region_handle); out: + if (err) + dev_dbg(gc->dev, + "Failed to create DMA region of length: %u, page_type: %d, status: 0x%x, err: %d\n", + length, req->gdma_page_type, resp.hdr.status, err); kfree(req); return err; } @@ -770,8 +1068,11 @@ int mana_gd_create_mana_eq(struct gdma_dev *gd, gmi = &queue->mem_info; err = mana_gd_alloc_memory(gc, spec->queue_size, gmi); - if (err) + if (err) { + dev_err(gc->dev, "GDMA queue type: %d, size: %u, gdma memory allocation err: %d\n", + spec->type, spec->queue_size, err); goto free_q; + } err = mana_gd_create_dma_region(gd, gmi); if (err) @@ -792,11 +1093,14 @@ int mana_gd_create_mana_eq(struct gdma_dev *gd, *queue_ptr = queue; return 0; out: + dev_err(gc->dev, "Failed to create queue type %d of size: %u, err: %d\n", + spec->type, spec->queue_size, err); mana_gd_free_memory(gmi); free_q: kfree(queue); return err; } +EXPORT_SYMBOL_NS(mana_gd_create_mana_eq, "NET_MANA"); int mana_gd_create_mana_wq_cq(struct gdma_dev *gd, const struct gdma_queue_spec *spec, @@ -817,8 +1121,11 @@ int mana_gd_create_mana_wq_cq(struct gdma_dev *gd, gmi = &queue->mem_info; err = mana_gd_alloc_memory(gc, spec->queue_size, gmi); - if (err) + if (err) { + dev_err(gc->dev, "GDMA queue type: %d, size: %u, memory allocation err: %d\n", + spec->type, spec->queue_size, err); goto free_q; + } err = mana_gd_create_dma_region(gd, gmi); if (err) @@ -838,11 +1145,14 @@ int mana_gd_create_mana_wq_cq(struct gdma_dev *gd, *queue_ptr = queue; return 0; out: + dev_err(gc->dev, "Failed to create queue type %d of size: %u, err: %d\n", + spec->type, spec->queue_size, err); mana_gd_free_memory(gmi); free_q: kfree(queue); return err; } +EXPORT_SYMBOL_NS(mana_gd_create_mana_wq_cq, "NET_MANA"); void mana_gd_destroy_queue(struct gdma_context *gc, struct gdma_queue *queue) { @@ -873,14 +1183,17 @@ void mana_gd_destroy_queue(struct gdma_context *gc, struct gdma_queue *queue) mana_gd_free_memory(gmi); kfree(queue); } +EXPORT_SYMBOL_NS(mana_gd_destroy_queue, "NET_MANA"); int mana_gd_verify_vf_version(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); struct gdma_verify_ver_resp resp = {}; struct gdma_verify_ver_req req = {}; + struct hw_channel_context *hwc; int err; + hwc = gc->hwc.driver_data; mana_gd_init_req_hdr(&req.hdr, GDMA_VERIFY_VF_DRIVER_VERSION, sizeof(req), sizeof(resp)); @@ -907,7 +1220,15 @@ int 
mana_gd_verify_vf_version(struct pci_dev *pdev) err, resp.hdr.status); return err ? err : -EPROTO; } - + gc->pf_cap_flags1 = resp.pf_cap_flags1; + if (resp.pf_cap_flags1 & GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG) { + err = mana_gd_query_hwc_timeout(pdev, &hwc->hwc_timeout); + if (err) { + dev_err(gc->dev, "Failed to set the hwc timeout %d\n", err); + return err; + } + dev_dbg(gc->dev, "set the hwc timeout to %u\n", hwc->hwc_timeout); + } return 0; } @@ -958,8 +1279,9 @@ int mana_gd_deregister_device(struct gdma_dev *gd) err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); if (err || resp.hdr.status) { - dev_err(gc->dev, "Failed to deregister device: %d, 0x%x\n", - err, resp.hdr.status); + if (mana_need_log(gc, err)) + dev_err(gc->dev, "Failed to deregister device: %d, 0x%x\n", + err, resp.hdr.status); if (!err) err = -EPROTO; } @@ -1005,7 +1327,7 @@ static u32 mana_gd_write_client_oob(const struct gdma_wqe_request *wqe_req, header->inline_oob_size_div4 = client_oob_size / sizeof(u32); if (oob_in_sgl) { - WARN_ON_ONCE(!pad_data || wqe_req->num_sge < 2); + WARN_ON_ONCE(wqe_req->num_sge < 2); header->client_oob_in_sgl = 1; @@ -1061,7 +1383,6 @@ int mana_gd_post_work_request(struct gdma_queue *wq, struct gdma_posted_wqe_info *wqe_info) { u32 client_oob_size = wqe_req->inline_oob_size; - struct gdma_context *gc; u32 sgl_data_size; u32 max_wqe_size; u32 wqe_size; @@ -1091,11 +1412,8 @@ int mana_gd_post_work_request(struct gdma_queue *wq, if (wqe_size > max_wqe_size) return -EINVAL; - if (wq->monitor_avl_buf && wqe_size > mana_gd_wq_avail_space(wq)) { - gc = wq->gdma_dev->gdma_context; - dev_err(gc->dev, "unsuccessful flow control!\n"); + if (wq->monitor_avl_buf && wqe_size > mana_gd_wq_avail_space(wq)) return -ENOSPC; - } if (wqe_info) wqe_info->wqe_size_in_bu = wqe_size / GDMA_WQE_BU_SIZE; @@ -1112,6 +1430,7 @@ int mana_gd_post_work_request(struct gdma_queue *wq, return 0; } +EXPORT_SYMBOL_NS(mana_gd_post_work_request, "NET_MANA"); int mana_gd_post_and_ring(struct gdma_queue *queue, const struct gdma_wqe_request *wqe_req, @@ -1121,8 +1440,11 @@ int mana_gd_post_and_ring(struct gdma_queue *queue, int err; err = mana_gd_post_work_request(queue, wqe_req, wqe_info); - if (err) + if (err) { + dev_err(gc->dev, "Failed to post work req from queue type %d of size %u (err=%d)\n", + queue->type, queue->queue_size, err); return err; + } mana_gd_wq_ring_doorbell(gc, queue); @@ -1182,13 +1504,19 @@ int mana_gd_poll_cq(struct gdma_queue *cq, struct gdma_comp *comp, int num_cqe) return cqe_idx; } +EXPORT_SYMBOL_NS(mana_gd_poll_cq, "NET_MANA"); static irqreturn_t mana_gd_intr(int irq, void *arg) { struct gdma_irq_context *gic = arg; + struct list_head *eq_list = &gic->eq_list; + struct gdma_queue *eq; - if (gic->handler) - gic->handler(gic->arg); + rcu_read_lock(); + list_for_each_entry_rcu(eq, eq_list, entry) { + gic->handler(eq); + } + rcu_read_unlock(); return IRQ_HANDLED; } @@ -1212,52 +1540,189 @@ void mana_gd_free_res_map(struct gdma_resource *r) r->size = 0; } -static int mana_gd_setup_irqs(struct pci_dev *pdev) +/* + * Spread on CPUs with the following heuristics: + * + * 1. No more than one IRQ per CPU, if possible; + * 2. NUMA locality is the second priority; + * 3. Sibling dislocality is the last priority. 
+ * + * Let's consider this topology: + * + * Node 0 1 + * Core 0 1 2 3 + * CPU 0 1 2 3 4 5 6 7 + * + * The most performant IRQ distribution based on the above topology + * and heuristics may look like this: + * + * IRQ Nodes Cores CPUs + * 0 1 0 0-1 + * 1 1 1 2-3 + * 2 1 0 0-1 + * 3 1 1 2-3 + * 4 2 2 4-5 + * 5 2 3 6-7 + * 6 2 2 4-5 + * 7 2 3 6-7 + * + * The heuristics is implemented as follows. + * + * The outer for_each() loop resets the 'weight' to the actual number + * of CPUs in the hop. Then inner for_each() loop decrements it by the + * number of sibling groups (cores) while assigning first set of IRQs + * to each group. IRQs 0 and 1 above are distributed this way. + * + * Now, because NUMA locality is more important, we should walk the + * same set of siblings and assign 2nd set of IRQs (2 and 3), and it's + * implemented by the medium while() loop. We do like this unless the + * number of IRQs assigned on this hop will not become equal to number + * of CPUs in the hop (weight == 0). Then we switch to the next hop and + * do the same thing. + */ + +static int irq_setup(unsigned int *irqs, unsigned int len, int node, + bool skip_first_cpu) +{ + const struct cpumask *next, *prev = cpu_none_mask; + cpumask_var_t cpus __free(free_cpumask_var); + int cpu, weight; + + if (!alloc_cpumask_var(&cpus, GFP_KERNEL)) + return -ENOMEM; + + rcu_read_lock(); + for_each_numa_hop_mask(next, node) { + weight = cpumask_weight_andnot(next, prev); + while (weight > 0) { + cpumask_andnot(cpus, next, prev); + for_each_cpu(cpu, cpus) { + cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); + --weight; + + if (unlikely(skip_first_cpu)) { + skip_first_cpu = false; + continue; + } + + if (len-- == 0) + goto done; + + irq_set_affinity_and_hint(*irqs++, topology_sibling_cpumask(cpu)); + } + } + prev = next; + } +done: + rcu_read_unlock(); + return 0; +} + +static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec) { - unsigned int max_queues_per_port = num_online_cpus(); struct gdma_context *gc = pci_get_drvdata(pdev); struct gdma_irq_context *gic; - unsigned int max_irqs; - u16 *cpus; - cpumask_var_t req_mask; - int nvec, irq; - int err, i = 0, j; + bool skip_first_cpu = false; + int *irqs, irq, err, i; - if (max_queues_per_port > MANA_MAX_NUM_QUEUES) - max_queues_per_port = MANA_MAX_NUM_QUEUES; + irqs = kmalloc_array(nvec, sizeof(int), GFP_KERNEL); + if (!irqs) + return -ENOMEM; - /* Need 1 interrupt for the Hardware communication Channel (HWC) */ - max_irqs = max_queues_per_port + 1; + /* + * While processing the next pci irq vector, we start with index 1, + * as IRQ vector at index 0 is already processed for HWC. 
+ * However, the population of irqs array starts with index 0, to be + * further used in irq_setup() + */ + for (i = 1; i <= nvec; i++) { + gic = kzalloc(sizeof(*gic), GFP_KERNEL); + if (!gic) { + err = -ENOMEM; + goto free_irq; + } + gic->handler = mana_gd_process_eq_events; + INIT_LIST_HEAD(&gic->eq_list); + spin_lock_init(&gic->lock); + + snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s", + i - 1, pci_name(pdev)); + + /* one pci vector is already allocated for HWC */ + irqs[i - 1] = pci_irq_vector(pdev, i); + if (irqs[i - 1] < 0) { + err = irqs[i - 1]; + goto free_current_gic; + } - nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX); - if (nvec < 0) - return nvec; + err = request_irq(irqs[i - 1], mana_gd_intr, 0, gic->name, gic); + if (err) + goto free_current_gic; - gc->irq_contexts = kcalloc(nvec, sizeof(struct gdma_irq_context), - GFP_KERNEL); - if (!gc->irq_contexts) { - err = -ENOMEM; - goto free_irq_vector; + xa_store(&gc->irq_contexts, i, gic, GFP_KERNEL); } - if (!zalloc_cpumask_var(&req_mask, GFP_KERNEL)) { - err = -ENOMEM; + /* + * When calling irq_setup() for dynamically added IRQs, if number of + * CPUs is more than or equal to allocated MSI-X, we need to skip the + * first CPU sibling group since they are already affinitized to HWC IRQ + */ + cpus_read_lock(); + if (gc->num_msix_usable <= num_online_cpus()) + skip_first_cpu = true; + + err = irq_setup(irqs, nvec, gc->numa_node, skip_first_cpu); + if (err) { + cpus_read_unlock(); goto free_irq; } - cpus = kcalloc(nvec, sizeof(*cpus), GFP_KERNEL); - if (!cpus) { - err = -ENOMEM; - goto free_mask; + cpus_read_unlock(); + kfree(irqs); + return 0; + +free_current_gic: + kfree(gic); +free_irq: + for (i -= 1; i > 0; i--) { + irq = pci_irq_vector(pdev, i); + gic = xa_load(&gc->irq_contexts, i); + if (WARN_ON(!gic)) + continue; + + irq_update_affinity_hint(irq, NULL); + free_irq(irq, gic); + xa_erase(&gc->irq_contexts, i); + kfree(gic); } - for (i = 0; i < nvec; i++) - cpus[i] = cpumask_local_spread(i, gc->numa_node); + kfree(irqs); + return err; +} + +static int mana_gd_setup_irqs(struct pci_dev *pdev, int nvec) +{ + struct gdma_context *gc = pci_get_drvdata(pdev); + struct gdma_irq_context *gic; + int *irqs, *start_irqs, irq; + unsigned int cpu; + int err, i; + + irqs = kmalloc_array(nvec, sizeof(int), GFP_KERNEL); + if (!irqs) + return -ENOMEM; + + start_irqs = irqs; for (i = 0; i < nvec; i++) { - cpumask_set_cpu(cpus[i], req_mask); - gic = &gc->irq_contexts[i]; - gic->handler = NULL; - gic->arg = NULL; + gic = kzalloc(sizeof(*gic), GFP_KERNEL); + if (!gic) { + err = -ENOMEM; + goto free_irq; + } + + gic->handler = mana_gd_process_eq_events; + INIT_LIST_HEAD(&gic->eq_list); + spin_lock_init(&gic->lock); if (!i) snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_hwc@pci:%s", @@ -1266,47 +1731,128 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev) snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s", i - 1, pci_name(pdev)); - irq = pci_irq_vector(pdev, i); - if (irq < 0) { - err = irq; - goto free_mask; + irqs[i] = pci_irq_vector(pdev, i); + if (irqs[i] < 0) { + err = irqs[i]; + goto free_current_gic; } - err = request_irq(irq, mana_gd_intr, 0, gic->name, gic); + err = request_irq(irqs[i], mana_gd_intr, 0, gic->name, gic); if (err) - goto free_mask; - irq_set_affinity_and_hint(irq, req_mask); - cpumask_clear(req_mask); + goto free_current_gic; + + xa_store(&gc->irq_contexts, i, gic, GFP_KERNEL); } - free_cpumask_var(req_mask); - kfree(cpus); - err = mana_gd_alloc_res_map(nvec, &gc->msix_resource); - if (err) - goto 
free_irq; + /* If number of IRQ is one extra than number of online CPUs, + * then we need to assign IRQ0 (hwc irq) and IRQ1 to + * same CPU. + * Else we will use different CPUs for IRQ0 and IRQ1. + * Also we are using cpumask_local_spread instead of + * cpumask_first for the node, because the node can be + * mem only. + */ + cpus_read_lock(); + if (nvec > num_online_cpus()) { + cpu = cpumask_local_spread(0, gc->numa_node); + irq_set_affinity_and_hint(irqs[0], cpumask_of(cpu)); + irqs++; + nvec -= 1; + } - gc->max_num_msix = nvec; - gc->num_msix_usable = nvec; + err = irq_setup(irqs, nvec, gc->numa_node, false); + if (err) { + cpus_read_unlock(); + goto free_irq; + } + cpus_read_unlock(); + kfree(start_irqs); return 0; -free_mask: - free_cpumask_var(req_mask); - kfree(cpus); +free_current_gic: + kfree(gic); free_irq: - for (j = i - 1; j >= 0; j--) { - irq = pci_irq_vector(pdev, j); - gic = &gc->irq_contexts[j]; + for (i -= 1; i >= 0; i--) { + irq = pci_irq_vector(pdev, i); + gic = xa_load(&gc->irq_contexts, i); + if (WARN_ON(!gic)) + continue; + + irq_update_affinity_hint(irq, NULL); free_irq(irq, gic); + xa_erase(&gc->irq_contexts, i); + kfree(gic); } - kfree(gc->irq_contexts); - gc->irq_contexts = NULL; -free_irq_vector: - pci_free_irq_vectors(pdev); + kfree(start_irqs); return err; } +static int mana_gd_setup_hwc_irqs(struct pci_dev *pdev) +{ + struct gdma_context *gc = pci_get_drvdata(pdev); + unsigned int max_irqs, min_irqs; + int nvec, err; + + if (pci_msix_can_alloc_dyn(pdev)) { + max_irqs = 1; + min_irqs = 1; + } else { + /* Need 1 interrupt for HWC */ + max_irqs = min(num_online_cpus(), MANA_MAX_NUM_QUEUES) + 1; + min_irqs = 2; + } + + nvec = pci_alloc_irq_vectors(pdev, min_irqs, max_irqs, PCI_IRQ_MSIX); + if (nvec < 0) + return nvec; + + err = mana_gd_setup_irqs(pdev, nvec); + if (err) { + pci_free_irq_vectors(pdev); + return err; + } + + gc->num_msix_usable = nvec; + gc->max_num_msix = nvec; + + return 0; +} + +static int mana_gd_setup_remaining_irqs(struct pci_dev *pdev) +{ + struct gdma_context *gc = pci_get_drvdata(pdev); + struct msi_map irq_map; + int max_irqs, i, err; + + if (!pci_msix_can_alloc_dyn(pdev)) + /* remain irqs are already allocated with HWC IRQ */ + return 0; + + /* allocate only remaining IRQs*/ + max_irqs = gc->num_msix_usable - 1; + + for (i = 1; i <= max_irqs; i++) { + irq_map = pci_msix_alloc_irq_at(pdev, i, NULL); + if (!irq_map.virq) { + err = irq_map.index; + /* caller will handle cleaning up all allocated + * irqs, after HWC is destroyed + */ + return err; + } + } + + err = mana_gd_setup_dyn_irqs(pdev, max_irqs); + if (err) + return err; + + gc->max_num_msix = gc->max_num_msix + max_irqs; + + return 0; +} + static void mana_gd_remove_irqs(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); @@ -1316,23 +1862,26 @@ static void mana_gd_remove_irqs(struct pci_dev *pdev) if (gc->max_num_msix < 1) return; - mana_gd_free_res_map(&gc->msix_resource); - for (i = 0; i < gc->max_num_msix; i++) { irq = pci_irq_vector(pdev, i); if (irq < 0) continue; - gic = &gc->irq_contexts[i]; + gic = xa_load(&gc->irq_contexts, i); + if (WARN_ON(!gic)) + continue; + + /* Need to clear the hint before free_irq */ + irq_update_affinity_hint(irq, NULL); free_irq(irq, gic); + xa_erase(&gc->irq_contexts, i); + kfree(gic); } pci_free_irq_vectors(pdev); gc->max_num_msix = 0; gc->num_msix_usable = 0; - kfree(gc->irq_contexts); - gc->irq_contexts = NULL; } static int mana_gd_setup(struct pci_dev *pdev) @@ -1343,9 +1892,16 @@ static int mana_gd_setup(struct pci_dev 
*pdev) mana_gd_init_registers(pdev); mana_smc_init(&gc->shm_channel, gc->dev, gc->shm_base); - err = mana_gd_setup_irqs(pdev); - if (err) - return err; + gc->service_wq = alloc_ordered_workqueue("gdma_service_wq", 0); + if (!gc->service_wq) + return -ENOMEM; + + err = mana_gd_setup_hwc_irqs(pdev); + if (err) { + dev_err(gc->dev, "Failed to setup IRQs for HWC creation: %d\n", + err); + goto free_workqueue; + } err = mana_hwc_create_channel(gc); if (err) @@ -1359,16 +1915,26 @@ static int mana_gd_setup(struct pci_dev *pdev) if (err) goto destroy_hwc; + err = mana_gd_setup_remaining_irqs(pdev); + if (err) { + dev_err(gc->dev, "Failed to setup remaining IRQs: %d", err); + goto destroy_hwc; + } + err = mana_gd_detect_devices(pdev); if (err) goto destroy_hwc; + dev_dbg(&pdev->dev, "mana gdma setup successful\n"); return 0; destroy_hwc: mana_hwc_destroy_channel(gc); remove_irq: mana_gd_remove_irqs(pdev); +free_workqueue: + destroy_workqueue(gc->service_wq); + dev_err(&pdev->dev, "%s failed (error %d)\n", __func__, err); return err; } @@ -1379,6 +1945,9 @@ static void mana_gd_cleanup(struct pci_dev *pdev) mana_hwc_destroy_channel(gc); mana_gd_remove_irqs(pdev); + + destroy_workqueue(gc->service_wq); + dev_dbg(&pdev->dev, "mana gdma cleanup successful\n"); } static bool mana_is_pf(unsigned short dev_id) @@ -1397,8 +1966,10 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent) BUILD_BUG_ON(2 * MAX_PORTS_IN_MANA_DEV * GDMA_EQE_SIZE > EQ_SIZE); err = pci_enable_device(pdev); - if (err) + if (err) { + dev_err(&pdev->dev, "Failed to enable pci device (err=%d)\n", err); return -ENXIO; + } pci_set_master(pdev); @@ -1407,14 +1978,11 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto disable_dev; err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); - if (err) - goto release_region; - - err = dma_set_max_seg_size(&pdev->dev, UINT_MAX); if (err) { - dev_err(&pdev->dev, "Failed to set dma device segment size\n"); + dev_err(&pdev->dev, "DMA set mask failed: %d\n", err); goto release_region; } + dma_set_max_seg_size(&pdev->dev, UINT_MAX); err = -ENOMEM; gc = vzalloc(sizeof(*gc)); @@ -1433,6 +2001,13 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent) gc->is_pf = mana_is_pf(pdev->device); gc->bar0_va = bar0_va; gc->dev = &pdev->dev; + xa_init(&gc->irq_contexts); + + if (gc->is_pf) + gc->mana_pci_debugfs = debugfs_create_dir("0", mana_debugfs_root); + else + gc->mana_pci_debugfs = debugfs_create_dir(pci_slot_name(pdev->slot), + mana_debugfs_root); err = mana_gd_setup(pdev); if (err) @@ -1442,11 +2017,37 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) goto cleanup_gd; + err = mana_rdma_probe(&gc->mana_ib); + if (err) + goto cleanup_mana; + + /* + * If a hardware reset event has occurred over HWC during probe, + * rollback and perform hardware reset procedure. + */ + if (test_and_set_bit(GC_PROBE_SUCCEEDED, &gc->flags)) { + err = -EPROTO; + goto cleanup_mana_rdma; + } + return 0; +cleanup_mana_rdma: + mana_rdma_remove(&gc->mana_ib); +cleanup_mana: + mana_remove(&gc->mana, false); cleanup_gd: mana_gd_cleanup(pdev); unmap_bar: + /* + * at this point we know that the other debugfs child dir/files + * are either not yet created or are already cleaned up. + * The pci debugfs folder clean-up now, will only be cleaning up + * adapter-MTU file and apc->mana_pci_debugfs folder. 
+ */ + debugfs_remove_recursive(gc->mana_pci_debugfs); + gc->mana_pci_debugfs = NULL; + xa_destroy(&gc->irq_contexts); pci_iounmap(pdev, bar0_va); free_gc: pci_set_drvdata(pdev, NULL); @@ -1454,9 +2055,37 @@ free_gc: release_region: pci_release_regions(pdev); disable_dev: - pci_clear_master(pdev); pci_disable_device(pdev); dev_err(&pdev->dev, "gdma probe failed: err = %d\n", err); + + /* + * Hardware could be in recovery mode and the HWC returns TIMEDOUT or + * EPROTO from mana_gd_setup(), mana_probe() or mana_rdma_probe(), or + * we received a hardware reset event over HWC interrupt. In this case, + * perform the device recovery procedure after MANA_SERVICE_PERIOD + * seconds. + */ + if (err == -ETIMEDOUT || err == -EPROTO) { + struct mana_dev_recovery *dev; + unsigned long flags; + + dev_info(&pdev->dev, "Start MANA recovery mode\n"); + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return err; + + dev->pdev = pci_dev_get(pdev); + dev->type = GDMA_EQE_HWC_RESET_REQUEST; + + spin_lock_irqsave(&mana_dev_recovery_work.lock, flags); + list_add_tail(&dev->list, &mana_dev_recovery_work.dev_list); + spin_unlock_irqrestore(&mana_dev_recovery_work.lock, flags); + + schedule_delayed_work(&mana_dev_recovery_work.work, + secs_to_jiffies(MANA_SERVICE_PERIOD)); + } + return err; } @@ -1464,24 +2093,33 @@ static void mana_gd_remove(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); + mana_rdma_remove(&gc->mana_ib); mana_remove(&gc->mana, false); mana_gd_cleanup(pdev); + debugfs_remove_recursive(gc->mana_pci_debugfs); + + gc->mana_pci_debugfs = NULL; + + xa_destroy(&gc->irq_contexts); + pci_iounmap(pdev, gc->bar0_va); vfree(gc); pci_release_regions(pdev); - pci_clear_master(pdev); pci_disable_device(pdev); + + dev_dbg(&pdev->dev, "mana gdma remove successful\n"); } /* The 'state' parameter is not used. */ -static int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state) +int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state) { struct gdma_context *gc = pci_get_drvdata(pdev); + mana_rdma_remove(&gc->mana_ib); mana_remove(&gc->mana, true); mana_gd_cleanup(pdev); @@ -1493,7 +2131,7 @@ static int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state) * fail -- if this happens, it's safer to just report an error than try to undo * what has been done. 
*/ -static int mana_gd_resume(struct pci_dev *pdev) +int mana_gd_resume(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); int err; @@ -1506,6 +2144,10 @@ static int mana_gd_resume(struct pci_dev *pdev) if (err) return err; + err = mana_rdma_probe(&gc->mana_ib); + if (err) + return err; + return 0; } @@ -1516,10 +2158,15 @@ static void mana_gd_shutdown(struct pci_dev *pdev) dev_info(&pdev->dev, "Shutdown was called\n"); + mana_rdma_remove(&gc->mana_ib); mana_remove(&gc->mana, true); mana_gd_cleanup(pdev); + debugfs_remove_recursive(gc->mana_pci_debugfs); + + gc->mana_pci_debugfs = NULL; + pci_disable_device(pdev); } @@ -1539,7 +2186,51 @@ static struct pci_driver mana_driver = { .shutdown = mana_gd_shutdown, }; -module_pci_driver(mana_driver); +static int __init mana_driver_init(void) +{ + int err; + + INIT_LIST_HEAD(&mana_dev_recovery_work.dev_list); + spin_lock_init(&mana_dev_recovery_work.lock); + INIT_DELAYED_WORK(&mana_dev_recovery_work.work, mana_recovery_delayed_func); + + mana_debugfs_root = debugfs_create_dir("mana", NULL); + + err = pci_register_driver(&mana_driver); + if (err) { + debugfs_remove(mana_debugfs_root); + mana_debugfs_root = NULL; + } + + return err; +} + +static void __exit mana_driver_exit(void) +{ + struct mana_dev_recovery *dev; + unsigned long flags; + + disable_delayed_work_sync(&mana_dev_recovery_work.work); + + spin_lock_irqsave(&mana_dev_recovery_work.lock, flags); + while (!list_empty(&mana_dev_recovery_work.dev_list)) { + dev = list_first_entry(&mana_dev_recovery_work.dev_list, + struct mana_dev_recovery, list); + list_del(&dev->list); + pci_dev_put(dev->pdev); + kfree(dev); + } + spin_unlock_irqrestore(&mana_dev_recovery_work.lock, flags); + + pci_unregister_driver(&mana_driver); + + debugfs_remove(mana_debugfs_root); + + mana_debugfs_root = NULL; +} + +module_init(mana_driver_init); +module_exit(mana_driver_exit); MODULE_DEVICE_TABLE(pci, mana_id_table); |
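
Note on the IRQ spreading heuristic: the long comment above irq_setup() in this patch describes how IRQs are distributed across NUMA hops and sibling groups. The following stand-alone user-space sketch is not part of the patch; it merely replays that pass structure on the example topology given in the comment (2 nodes, 2 cores per node, 2 SMT siblings per core -- all assumed constants), using 0-based node/core numbering and omitting the skip_first_cpu handling. It prints the same IRQ/node/core/CPU assignment table as the comment's example.

    /*
     * Hypothetical sketch of the irq_setup() spreading heuristic.
     * Nodes, cores and CPUs are plain arrays instead of cpumasks.
     */
    #include <stdio.h>

    #define NR_NODES           2
    #define CORES_PER_NODE     2
    #define SIBLINGS_PER_CORE  2
    #define NR_IRQS            8

    int main(void)
    {
        int irq = 0;

        /* NUMA locality first: finish one node before moving to the next. */
        for (int node = 0; node < NR_NODES && irq < NR_IRQS; node++) {
            /* 'weight' = number of CPUs in this node, as in the kernel loop. */
            int weight = CORES_PER_NODE * SIBLINGS_PER_CORE;

            while (weight > 0 && irq < NR_IRQS) {
                /*
                 * One pass hands out one IRQ per core (sibling group), so no
                 * core receives a second IRQ until every core in the node has
                 * one. Passes repeat until weight reaches zero.
                 */
                for (int core = 0; core < CORES_PER_NODE && irq < NR_IRQS; core++) {
                    int first_cpu = (node * CORES_PER_NODE + core) * SIBLINGS_PER_CORE;

                    printf("IRQ %d -> node %d core %d cpus %d-%d\n",
                           irq, node, node * CORES_PER_NODE + core,
                           first_cpu, first_cpu + SIBLINGS_PER_CORE - 1);
                    irq++;
                    weight--;
                }
            }
        }
        return 0;
    }

For the 8-CPU example this prints IRQs 0-3 alternating between the two cores of node 0 and IRQs 4-7 alternating between the two cores of node 1, matching the table in the patch comment.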
