diff options
Diffstat (limited to 'drivers/infiniband/hw/hfi1/init.c')
| -rw-r--r-- | drivers/infiniband/hw/hfi1/init.c | 1030 |
1 files changed, 566 insertions, 464 deletions
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 4a11d4da4c92..e4aef102dac0 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1,59 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright(c) 2015-2017 Intel Corporation. - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * BSD LICENSE - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * + * Copyright(c) 2015 - 2020 Intel Corporation. + * Copyright(c) 2021 Cornelis Networks. */ #include <linux/pci.h> #include <linux/netdevice.h> #include <linux/vmalloc.h> #include <linux/delay.h> -#include <linux/idr.h> +#include <linux/xarray.h> #include <linux/module.h> #include <linux/printk.h> #include <linux/hrtimer.h> #include <linux/bitmap.h> +#include <linux/numa.h> #include <rdma/rdma_vt.h> #include "hfi.h" @@ -67,29 +27,30 @@ #include "aspm.h" #include "affinity.h" #include "vnic.h" +#include "exp_rcv.h" +#include "netdev.h" #undef pr_fmt #define pr_fmt(fmt) DRIVER_NAME ": " fmt -#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5 /* * min buffers we want to have per context, after driver */ #define HFI1_MIN_USER_CTXT_BUFCNT 7 -#define HFI1_MIN_HDRQ_EGRBUF_CNT 2 -#define HFI1_MAX_HDRQ_EGRBUF_CNT 16352 #define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */ #define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */ +#define NUM_IB_PORTS 1 + /* * Number of user receive contexts we are configured to use (to allow for more * pio buffers per ctxt, etc.) Zero means use one user context per CPU. */ int num_user_contexts = -1; -module_param_named(num_user_contexts, num_user_contexts, uint, S_IRUGO); +module_param_named(num_user_contexts, num_user_contexts, int, 0444); MODULE_PARM_DESC( - num_user_contexts, "Set max number of user contexts to use"); + num_user_contexts, "Set max number of user contexts to use (default: -1 will use the real (non-HT) CPU count)"); uint krcvqs[RXE_NUM_DATA_VL]; int krcvqsset; @@ -112,98 +73,236 @@ module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO); MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)"); static uint hfi1_hdrq_entsize = 32; -module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, S_IRUGO); -MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B (default), 32 - 128B"); +module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, 0444); +MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B, 32 - 128B (default)"); unsigned int user_credit_return_threshold = 33; /* default is 33% */ module_param(user_credit_return_threshold, uint, S_IRUGO); MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)"); -static inline u64 encode_rcv_header_entry_size(u16 size); +DEFINE_XARRAY_FLAGS(hfi1_dev_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ); -static struct idr hfi1_unit_table; -u32 hfi1_cpulist_count; -unsigned long *hfi1_cpulist; - -/* - * Common code for creating the receive context array. - */ -int hfi1_create_ctxts(struct hfi1_devdata *dd) +static int hfi1_create_kctxt(struct hfi1_devdata *dd, + struct hfi1_pportdata *ppd) { - unsigned i; + struct hfi1_ctxtdata *rcd; int ret; /* Control context has to be always 0 */ BUILD_BUG_ON(HFI1_CTRL_CTXT != 0); - dd->rcd = kzalloc_node(dd->num_rcv_contexts * sizeof(*dd->rcd), + ret = hfi1_create_ctxtdata(ppd, dd->node, &rcd); + if (ret < 0) { + dd_dev_err(dd, "Kernel receive context allocation failed\n"); + return ret; + } + + /* + * Set up the kernel context flags here and now because they use + * default values for all receive side memories. User contexts will + * be handled as they are created. + */ + rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) | + HFI1_CAP_KGET(NODROP_RHQ_FULL) | + HFI1_CAP_KGET(NODROP_EGR_FULL) | + HFI1_CAP_KGET(DMA_RTAIL); + + /* Control context must use DMA_RTAIL */ + if (rcd->ctxt == HFI1_CTRL_CTXT) + rcd->flags |= HFI1_CAP_DMA_RTAIL; + rcd->fast_handler = get_dma_rtail_setting(rcd) ? + handle_receive_interrupt_dma_rtail : + handle_receive_interrupt_nodma_rtail; + + hfi1_set_seq_cnt(rcd, 1); + + rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node); + if (!rcd->sc) { + dd_dev_err(dd, "Kernel send context allocation failed\n"); + return -ENOMEM; + } + hfi1_init_ctxt(rcd->sc); + + return 0; +} + +/* + * Create the receive context array and one or more kernel contexts + */ +int hfi1_create_kctxts(struct hfi1_devdata *dd) +{ + u16 i; + int ret; + + dd->rcd = kcalloc_node(dd->num_rcv_contexts, sizeof(*dd->rcd), GFP_KERNEL, dd->node); if (!dd->rcd) - goto nomem; + return -ENOMEM; - /* create one or more kernel contexts */ for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) { - struct hfi1_pportdata *ppd; - struct hfi1_ctxtdata *rcd; + ret = hfi1_create_kctxt(dd, dd->pport); + if (ret) + goto bail; + } - ppd = dd->pport + (i % dd->num_pports); + return 0; +bail: + for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) + hfi1_free_ctxt(dd->rcd[i]); - /* dd->rcd[i] gets assigned inside the callee */ - rcd = hfi1_create_ctxtdata(ppd, i, dd->node); - if (!rcd) { - dd_dev_err(dd, - "Unable to allocate kernel receive context, failing\n"); - goto nomem; - } - /* - * Set up the kernel context flags here and now because they - * use default values for all receive side memories. User - * contexts will be handled as they are created. - */ - rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) | - HFI1_CAP_KGET(NODROP_RHQ_FULL) | - HFI1_CAP_KGET(NODROP_EGR_FULL) | - HFI1_CAP_KGET(DMA_RTAIL); - - /* Control context must use DMA_RTAIL */ - if (rcd->ctxt == HFI1_CTRL_CTXT) - rcd->flags |= HFI1_CAP_DMA_RTAIL; - rcd->seq_cnt = 1; - - rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node); - if (!rcd->sc) { - dd_dev_err(dd, - "Unable to allocate kernel send context, failing\n"); - goto nomem; - } + /* All the contexts should be freed, free the array */ + kfree(dd->rcd); + dd->rcd = NULL; + return ret; +} + +/* + * Helper routines for the receive context reference count (rcd and uctxt). + */ +static void hfi1_rcd_init(struct hfi1_ctxtdata *rcd) +{ + kref_init(&rcd->kref); +} + +/** + * hfi1_rcd_free - When reference is zero clean up. + * @kref: pointer to an initialized rcd data structure + * + */ +static void hfi1_rcd_free(struct kref *kref) +{ + unsigned long flags; + struct hfi1_ctxtdata *rcd = + container_of(kref, struct hfi1_ctxtdata, kref); + + spin_lock_irqsave(&rcd->dd->uctxt_lock, flags); + rcd->dd->rcd[rcd->ctxt] = NULL; + spin_unlock_irqrestore(&rcd->dd->uctxt_lock, flags); + + hfi1_free_ctxtdata(rcd->dd, rcd); + + kfree(rcd); +} + +/** + * hfi1_rcd_put - decrement reference for rcd + * @rcd: pointer to an initialized rcd data structure + * + * Use this to put a reference after the init. + */ +int hfi1_rcd_put(struct hfi1_ctxtdata *rcd) +{ + if (rcd) + return kref_put(&rcd->kref, hfi1_rcd_free); - hfi1_init_ctxt(rcd->sc); + return 0; +} + +/** + * hfi1_rcd_get - increment reference for rcd + * @rcd: pointer to an initialized rcd data structure + * + * Use this to get a reference after the init. + * + * Return : reflect kref_get_unless_zero(), which returns non-zero on + * increment, otherwise 0. + */ +int hfi1_rcd_get(struct hfi1_ctxtdata *rcd) +{ + return kref_get_unless_zero(&rcd->kref); +} + +/** + * allocate_rcd_index - allocate an rcd index from the rcd array + * @dd: pointer to a valid devdata structure + * @rcd: rcd data structure to assign + * @index: pointer to index that is allocated + * + * Find an empty index in the rcd array, and assign the given rcd to it. + * If the array is full, we are EBUSY. + * + */ +static int allocate_rcd_index(struct hfi1_devdata *dd, + struct hfi1_ctxtdata *rcd, u16 *index) +{ + unsigned long flags; + u16 ctxt; + + spin_lock_irqsave(&dd->uctxt_lock, flags); + for (ctxt = 0; ctxt < dd->num_rcv_contexts; ctxt++) + if (!dd->rcd[ctxt]) + break; + + if (ctxt < dd->num_rcv_contexts) { + rcd->ctxt = ctxt; + dd->rcd[ctxt] = rcd; + hfi1_rcd_init(rcd); } + spin_unlock_irqrestore(&dd->uctxt_lock, flags); - /* - * Initialize aspm, to be done after gen3 transition and setting up - * contexts and before enabling interrupts - */ - aspm_init(dd); + if (ctxt >= dd->num_rcv_contexts) + return -EBUSY; + + *index = ctxt; return 0; -nomem: - ret = -ENOMEM; +} + +/** + * hfi1_rcd_get_by_index_safe - validate the ctxt index before accessing the + * array + * @dd: pointer to a valid devdata structure + * @ctxt: the index of an possilbe rcd + * + * This is a wrapper for hfi1_rcd_get_by_index() to validate that the given + * ctxt index is valid. + * + * The caller is responsible for making the _put(). + * + */ +struct hfi1_ctxtdata *hfi1_rcd_get_by_index_safe(struct hfi1_devdata *dd, + u16 ctxt) +{ + if (ctxt < dd->num_rcv_contexts) + return hfi1_rcd_get_by_index(dd, ctxt); - if (dd->rcd) { - for (i = 0; i < dd->num_rcv_contexts; ++i) - hfi1_free_ctxtdata(dd, dd->rcd[i]); + return NULL; +} + +/** + * hfi1_rcd_get_by_index - get by index + * @dd: pointer to a valid devdata structure + * @ctxt: the index of an possilbe rcd + * + * We need to protect access to the rcd array. If access is needed to + * one or more index, get the protecting spinlock and then increment the + * kref. + * + * The caller is responsible for making the _put(). + * + */ +struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt) +{ + unsigned long flags; + struct hfi1_ctxtdata *rcd = NULL; + + spin_lock_irqsave(&dd->uctxt_lock, flags); + if (dd->rcd[ctxt]) { + rcd = dd->rcd[ctxt]; + if (!hfi1_rcd_get(rcd)) + rcd = NULL; } - kfree(dd->rcd); - dd->rcd = NULL; - return ret; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + return rcd; } /* - * Common code for user and kernel context setup. + * Common code for user and kernel context create and setup. + * NOTE: the initial kref is done here (hf1_rcd_init()). */ -struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt, - int numa) +int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, + struct hfi1_ctxtdata **context) { struct hfi1_devdata *dd = ppd->dd; struct hfi1_ctxtdata *rcd; @@ -217,19 +316,33 @@ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt, rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, numa); if (rcd) { u32 rcvtids, max_entries; - - hfi1_cdbg(PROC, "setting up context %u\n", ctxt); + u16 ctxt; + int ret; + + ret = allocate_rcd_index(dd, rcd, &ctxt); + if (ret) { + *context = NULL; + kfree(rcd); + return ret; + } INIT_LIST_HEAD(&rcd->qp_wait_list); + hfi1_exp_tid_group_init(rcd); rcd->ppd = ppd; rcd->dd = dd; - __set_bit(0, rcd->in_use_ctxts); - rcd->ctxt = ctxt; - dd->rcd[ctxt] = rcd; rcd->numa_id = numa; rcd->rcv_array_groups = dd->rcv_entries.ngroups; + rcd->rhf_rcv_function_map = normal_rhf_rcv_functions; + rcd->slow_handler = handle_receive_interrupt; + rcd->do_interrupt = rcd->slow_handler; + rcd->msix_intr = CCE_NUM_MSIX_VECTORS; + + mutex_init(&rcd->exp_mutex); + spin_lock_init(&rcd->exp_lock); + INIT_LIST_HEAD(&rcd->flow_queue.queue_head); + INIT_LIST_HEAD(&rcd->rarr_queue.queue_head); - mutex_init(&rcd->exp_lock); + hfi1_cdbg(PROC, "setting up context %u", rcd->ctxt); /* * Calculate the context's RcvArray entry starting point. @@ -263,6 +376,8 @@ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt, rcd->rcvhdrq_cnt = rcvhdrcnt; rcd->rcvhdrqentsize = hfi1_hdrq_entsize; + rcd->rhf_offset = + rcd->rcvhdrqentsize - sizeof(u64) / sizeof(u32); /* * Simple Eager buffer allocation: we have already pre-allocated * the number of RcvArray entry groups. Each ctxtdata structure @@ -285,7 +400,7 @@ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt, rcd->egrbufs.count = MAX_EAGER_ENTRIES; } hfi1_cdbg(PROC, - "ctxt%u: max Eager buffer RcvArray entries: %u\n", + "ctxt%u: max Eager buffer RcvArray entries: %u", rcd->ctxt, rcd->egrbufs.count); /* @@ -296,15 +411,16 @@ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt, * The resulting value will be rounded down to the closest * multiple of dd->rcv_entries.group_size. */ - rcd->egrbufs.buffers = kzalloc_node( - rcd->egrbufs.count * sizeof(*rcd->egrbufs.buffers), - GFP_KERNEL, numa); + rcd->egrbufs.buffers = + kcalloc_node(rcd->egrbufs.count, + sizeof(*rcd->egrbufs.buffers), + GFP_KERNEL, numa); if (!rcd->egrbufs.buffers) goto bail; - rcd->egrbufs.rcvtids = kzalloc_node( - rcd->egrbufs.count * - sizeof(*rcd->egrbufs.rcvtids), - GFP_KERNEL, numa); + rcd->egrbufs.rcvtids = + kcalloc_node(rcd->egrbufs.count, + sizeof(*rcd->egrbufs.rcvtids), + GFP_KERNEL, numa); if (!rcd->egrbufs.rcvtids) goto bail; rcd->egrbufs.size = eager_buffer_size; @@ -316,7 +432,7 @@ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt, if (rcd->egrbufs.size < hfi1_max_mtu) { rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu); hfi1_cdbg(PROC, - "ctxt%u: eager bufs size too small. Adjusting to %zu\n", + "ctxt%u: eager bufs size too small. Adjusting to %u", rcd->ctxt, rcd->egrbufs.size); } rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE; @@ -327,32 +443,34 @@ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt, GFP_KERNEL, numa); if (!rcd->opstats) goto bail; + + /* Initialize TID flow generations for the context */ + hfi1_kern_init_ctxt_generations(rcd); } + + *context = rcd; + return 0; } - return rcd; + bail: - dd->rcd[ctxt] = NULL; - kfree(rcd->egrbufs.rcvtids); - kfree(rcd->egrbufs.buffers); - kfree(rcd); - return NULL; + *context = NULL; + hfi1_free_ctxt(rcd); + return -ENOMEM; } -/* - * Convert a receive header entry size that to the encoding used in the CSR. +/** + * hfi1_free_ctxt - free context + * @rcd: pointer to an initialized rcd data structure * - * Return a zero if the given size is invalid. + * This wrapper is the free function that matches hfi1_create_ctxtdata(). + * When a context is done being used (kernel or user), this function is called + * for the "final" put to match the kref init from hfi1_create_ctxtdata(). + * Other users of the context do a get/put sequence to make sure that the + * structure isn't removed while in use. */ -static inline u64 encode_rcv_header_entry_size(u16 size) +void hfi1_free_ctxt(struct hfi1_ctxtdata *rcd) { - /* there are only 3 valid receive header entry sizes */ - if (size == 2) - return 1; - if (size == 16) - return 2; - else if (size == 32) - return 4; - return 0; /* invalid */ + hfi1_rcd_put(rcd); } /* @@ -371,7 +489,7 @@ void set_link_ipg(struct hfi1_pportdata *ppd) u16 shift, mult; u64 src; u32 current_egress_rate; /* Mbits /sec */ - u32 max_pkt_time; + u64 max_pkt_time; /* * max_pkt_time is the maximum packet egress time in units * of the fabric clock period 1/(805 MHz). @@ -469,7 +587,7 @@ static enum hrtimer_restart cca_timer_fn(struct hrtimer *t) * Common code for initializing the physical port structure. */ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, - struct hfi1_devdata *dd, u8 hw_pidx, u8 port) + struct hfi1_devdata *dd, u8 hw_pidx, u32 port) { int i; uint default_pkey_idx; @@ -478,19 +596,21 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, ppd->dd = dd; ppd->hw_pidx = hw_pidx; ppd->port = port; /* IB port number, not index */ + ppd->prev_link_width = LINK_WIDTH_DEFAULT; + /* + * There are C_VL_COUNT number of PortVLXmitWait counters. + * Adding 1 to C_VL_COUNT to include the PortXmitWait counter. + */ + for (i = 0; i < C_VL_COUNT + 1; i++) { + ppd->port_vl_xmit_wait_last[i] = 0; + ppd->vl_xmit_flit_cnt[i] = 0; + } default_pkey_idx = 1; ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY; ppd->part_enforce |= HFI1_PART_ENFORCE_IN; - ppd->part_enforce |= HFI1_PART_ENFORCE_OUT; - - if (loopback) { - hfi1_early_err(&pdev->dev, - "Faking data partition 0x8001 in idx %u\n", - !default_pkey_idx); - ppd->pkeys[!default_pkey_idx] = 0x8001; - } + ppd->pkeys[0] = 0x8001; INIT_WORK(&ppd->link_vc_work, handle_verify_cap); INIT_WORK(&ppd->link_up_work, handle_link_up); @@ -515,12 +635,11 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, spin_lock_init(&ppd->cca_timer_lock); for (i = 0; i < OPA_MAX_SLS; i++) { - hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); ppd->cca_timer[i].ppd = ppd; ppd->cca_timer[i].sl = i; ppd->cca_timer[i].ccti = 0; - ppd->cca_timer[i].hrtimer.function = cca_timer_fn; + hrtimer_setup(&ppd->cca_timer[i].hrtimer, cca_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT; @@ -534,9 +653,7 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, return; bail: - - hfi1_early_err(&pdev->dev, - "Congestion Control Agent disabled for port %d\n", port); + dd_dev_err(dd, "Congestion Control Agent disabled for port %d\n", port); } /* @@ -559,16 +676,19 @@ static int loadtime_init(struct hfi1_devdata *dd) static int init_after_reset(struct hfi1_devdata *dd) { int i; - + struct hfi1_ctxtdata *rcd; /* * Ensure chip does no sends or receives, tail updates, or * pioavail updates while we re-initialize. This is mostly * for the driver data structures, not chip registers. */ - for (i = 0; i < dd->num_rcv_contexts; i++) + for (i = 0; i < dd->num_rcv_contexts; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS | - HFI1_RCVCTRL_INTRAVAIL_DIS | - HFI1_RCVCTRL_TAILUPD_DIS, i); + HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_TAILUPD_DIS, rcd); + hfi1_rcd_put(rcd); + } pio_send_control(dd, PSC_GLOBAL_DISABLE); for (i = 0; i < dd->num_send_contexts; i++) sc_disable(dd->send_contexts[i].sc); @@ -578,8 +698,9 @@ static int init_after_reset(struct hfi1_devdata *dd) static void enable_chip(struct hfi1_devdata *dd) { + struct hfi1_ctxtdata *rcd; u32 rcvmask; - u32 i; + u16 i; /* enable PIO send */ pio_send_control(dd, PSC_GLOBAL_ENABLE); @@ -589,17 +710,23 @@ static void enable_chip(struct hfi1_devdata *dd) * Other ctxts done as user opens and initializes them. */ for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (!rcd) + continue; rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB; - rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ? + rcvmask |= HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ? HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS; - if (!HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, MULTI_PKT_EGR)) + if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB; - if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_RHQ_FULL)) + if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_RHQ_FULL)) rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; - if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_EGR_FULL)) + if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL)) rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; - hfi1_rcvctrl(dd, rcvmask, i); - sc_enable(dd->rcd[i]->sc); + if (HFI1_CAP_IS_KSET(TID_RDMA)) + rcvmask |= HFI1_RCVCTRL_TIDFLOW_ENB; + hfi1_rcvctrl(dd, rcvmask, rcd); + sc_enable(rcd->sc); + hfi1_rcd_put(rcd); } } @@ -618,12 +745,27 @@ static int create_workqueues(struct hfi1_devdata *dd) ppd->hfi1_wq = alloc_workqueue( "hfi%d_%d", - WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE, + WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | + WQ_PERCPU, HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES, dd->unit, pidx); if (!ppd->hfi1_wq) goto wq_error; } + if (!ppd->link_wq) { + /* + * Make the link workqueue single-threaded to enforce + * serialization. + */ + ppd->link_wq = + alloc_workqueue( + "hfi_link_%d_%d", + WQ_SYSFS | WQ_MEM_RECLAIM | WQ_UNBOUND, + 1, /* max_active */ + dd->unit, pidx); + if (!ppd->link_wq) + goto wq_error; + } } return 0; wq_error: @@ -634,11 +776,55 @@ wq_error: destroy_workqueue(ppd->hfi1_wq); ppd->hfi1_wq = NULL; } + if (ppd->link_wq) { + destroy_workqueue(ppd->link_wq); + ppd->link_wq = NULL; + } } return -ENOMEM; } /** + * destroy_workqueues - destroy per port workqueues + * @dd: the hfi1_ib device + */ +static void destroy_workqueues(struct hfi1_devdata *dd) +{ + int pidx; + struct hfi1_pportdata *ppd; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + if (ppd->hfi1_wq) { + destroy_workqueue(ppd->hfi1_wq); + ppd->hfi1_wq = NULL; + } + if (ppd->link_wq) { + destroy_workqueue(ppd->link_wq); + ppd->link_wq = NULL; + } + } +} + +/** + * enable_general_intr() - Enable the IRQs that will be handled by the + * general interrupt handler. + * @dd: valid devdata + * + */ +static void enable_general_intr(struct hfi1_devdata *dd) +{ + set_intr_bits(dd, CCE_ERR_INT, MISC_ERR_INT, true); + set_intr_bits(dd, PIO_ERR_INT, TXE_ERR_INT, true); + set_intr_bits(dd, IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, true); + set_intr_bits(dd, PBC_INT, GPIO_ASSERT_INT, true); + set_intr_bits(dd, TCRIT_INT, TCRIT_INT, true); + set_intr_bits(dd, IS_DC_START, IS_DC_END, true); + set_intr_bits(dd, IS_SENDCREDIT_START, IS_SENDCREDIT_END, true); +} + +/** * hfi1_init - do the actual initialization sequence on the chip * @dd: the hfi1_ib device * @reinit: re-initializing, so don't allocate new memory @@ -656,28 +842,11 @@ wq_error: int hfi1_init(struct hfi1_devdata *dd, int reinit) { int ret = 0, pidx, lastfail = 0; - unsigned i, len; + unsigned long len; + u16 i; struct hfi1_ctxtdata *rcd; struct hfi1_pportdata *ppd; - /* Set up recv low level handlers */ - dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EXPECTED] = - kdeth_process_expected; - dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EAGER] = - kdeth_process_eager; - dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_IB] = process_receive_ib; - dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_ERROR] = - process_receive_error; - dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_BYPASS] = - process_receive_bypass; - dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID5] = - process_receive_invalid; - dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID6] = - process_receive_invalid; - dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID7] = - process_receive_invalid; - dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions; - /* Set up send low level handlers */ dd->process_pio_send = hfi1_verbs_send_pio; dd->process_dma_send = hfi1_verbs_send_dma; @@ -686,10 +855,10 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) if (is_ax(dd)) { atomic_set(&dd->drop_packet, DROP_PACKET_ON); - dd->do_drop = 1; + dd->do_drop = true; } else { atomic_set(&dd->drop_packet, DROP_PACKET_OFF); - dd->do_drop = 0; + dd->do_drop = false; } /* make sure the link is not "up" */ @@ -705,18 +874,6 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) if (ret) goto done; - /* allocate dummy tail memory for all receive contexts */ - dd->rcvhdrtail_dummy_kvaddr = dma_zalloc_coherent( - &dd->pcidev->dev, sizeof(u64), - &dd->rcvhdrtail_dummy_dma, - GFP_KERNEL); - - if (!dd->rcvhdrtail_dummy_kvaddr) { - dd_dev_err(dd, "cannot allocate dummy tail memory\n"); - ret = -ENOMEM; - goto done; - } - /* dd->rcd can be NULL if early initialization failed */ for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) { /* @@ -725,24 +882,26 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) * existing, and re-allocate. * Need to re-create rest of ctxt 0 ctxtdata as well. */ - rcd = dd->rcd[i]; + rcd = hfi1_rcd_get_by_index(dd, i); if (!rcd) continue; - rcd->do_interrupt = &handle_receive_interrupt; - lastfail = hfi1_create_rcvhdrq(dd, rcd); if (!lastfail) lastfail = hfi1_setup_eagerbufs(rcd); + if (!lastfail) + lastfail = hfi1_kern_exp_rcv_init(rcd, reinit); if (lastfail) { dd_dev_err(dd, "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); ret = lastfail; } + /* enable IRQ */ + hfi1_rcd_put(rcd); } /* Allocate enough memory for user event notification. */ - len = PAGE_ALIGN(dd->chip_rcv_contexts * HFI1_MAX_SHARED_CTXTS * + len = PAGE_ALIGN(chip_rcv_contexts(dd) * HFI1_MAX_SHARED_CTXTS * sizeof(*dd->events)); dd->events = vmalloc_user(len); if (!dd->events) @@ -754,9 +913,6 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) dd->status = vmalloc_user(PAGE_SIZE); if (!dd->status) dd_dev_err(dd, "Failed to allocate dev status page\n"); - else - dd->freezelen = PAGE_SIZE - (sizeof(*dd->status) - - sizeof(dd->status->freezemsg)); for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; if (dd->status) @@ -779,7 +935,8 @@ done: HFI1_STATUS_INITTED; if (!ret) { /* enable all interrupts from the chip */ - set_intr_state(dd, 1); + enable_general_intr(dd); + init_qsfp_int(dd); /* chip is OK for user apps; mark it as initialized */ for (pidx = 0; pidx < dd->num_pports; ++pidx) { @@ -811,21 +968,9 @@ done: return ret; } -static inline struct hfi1_devdata *__hfi1_lookup(int unit) -{ - return idr_find(&hfi1_unit_table, unit); -} - struct hfi1_devdata *hfi1_lookup(int unit) { - struct hfi1_devdata *dd; - unsigned long flags; - - spin_lock_irqsave(&hfi1_devs_lock, flags); - dd = __hfi1_lookup(unit); - spin_unlock_irqrestore(&hfi1_devs_lock, flags); - - return dd; + return xa_load(&hfi1_dev_table, unit); } /* @@ -839,8 +984,8 @@ static void stop_timers(struct hfi1_devdata *dd) for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; - if (ppd->led_override_timer.data) { - del_timer_sync(&ppd->led_override_timer); + if (ppd->led_override_timer.function) { + timer_delete_sync(&ppd->led_override_timer); atomic_set(&ppd->led_override_timer_active, 0); } } @@ -858,9 +1003,14 @@ static void stop_timers(struct hfi1_devdata *dd) static void shutdown_device(struct hfi1_devdata *dd) { struct hfi1_pportdata *ppd; + struct hfi1_ctxtdata *rcd; unsigned pidx; int i; + if (dd->flags & HFI1_SHUTDOWN) + return; + dd->flags |= HFI1_SHUTDOWN; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; @@ -871,17 +1021,20 @@ static void shutdown_device(struct hfi1_devdata *dd) } dd->flags &= ~HFI1_INITTED; - /* mask interrupts, but not errors */ - set_intr_state(dd, 0); + /* mask and clean up interrupts */ + set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false); + msix_clean_up_interrupts(dd); for (pidx = 0; pidx < dd->num_pports; ++pidx) { - ppd = dd->pport + pidx; - for (i = 0; i < dd->num_rcv_contexts; i++) + for (i = 0; i < dd->num_rcv_contexts; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS | - HFI1_RCVCTRL_CTXT_DIS | - HFI1_RCVCTRL_INTRAVAIL_DIS | - HFI1_RCVCTRL_PKEY_DIS | - HFI1_RCVCTRL_ONE_PKT_EGR_DIS, i); + HFI1_RCVCTRL_CTXT_DIS | + HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_PKEY_DIS | + HFI1_RCVCTRL_ONE_PKT_EGR_DIS, rcd); + hfi1_rcd_put(rcd); + } /* * Gracefully stop all sends allowing any in progress to * trickle out first. @@ -912,11 +1065,10 @@ static void shutdown_device(struct hfi1_devdata *dd) * We can't count on interrupts since we are stopping. */ hfi1_quiet_serdes(ppd); - - if (ppd->hfi1_wq) { - destroy_workqueue(ppd->hfi1_wq); - ppd->hfi1_wq = NULL; - } + if (ppd->hfi1_wq) + flush_workqueue(ppd->hfi1_wq); + if (ppd->link_wq) + flush_workqueue(ppd->link_wq); } sdma_exit(dd); } @@ -927,25 +1079,22 @@ static void shutdown_device(struct hfi1_devdata *dd) * @rcd: the ctxtdata structure * * free up any allocated data for a context - * This should not touch anything that would affect a simultaneous - * re-allocation of context data, because it is called after hfi1_mutex - * is released (and can be called from reinit as well). * It should never change any chip state, or global driver state. */ void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) { - unsigned e; + u32 e; if (!rcd) return; if (rcd->rcvhdrq) { - dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size, + dma_free_coherent(&dd->pcidev->dev, rcvhdrq_size(rcd), rcd->rcvhdrq, rcd->rcvhdrq_dma); rcd->rcvhdrq = NULL; - if (rcd->rcvhdrtail_kvaddr) { + if (hfi1_rcvhdrtail_kvaddr(rcd)) { dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, - (void *)rcd->rcvhdrtail_kvaddr, + (void *)hfi1_rcvhdrtail_kvaddr(rcd), rcd->rcvhdrqtailaddr_dma); rcd->rcvhdrtail_kvaddr = NULL; } @@ -953,28 +1102,37 @@ void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) /* all the RcvArray entries should have been cleared by now */ kfree(rcd->egrbufs.rcvtids); + rcd->egrbufs.rcvtids = NULL; for (e = 0; e < rcd->egrbufs.alloced; e++) { - if (rcd->egrbufs.buffers[e].dma) + if (rcd->egrbufs.buffers[e].addr) dma_free_coherent(&dd->pcidev->dev, rcd->egrbufs.buffers[e].len, rcd->egrbufs.buffers[e].addr, rcd->egrbufs.buffers[e].dma); } kfree(rcd->egrbufs.buffers); + rcd->egrbufs.alloced = 0; + rcd->egrbufs.buffers = NULL; sc_free(rcd->sc); + rcd->sc = NULL; + vfree(rcd->subctxt_uregbase); vfree(rcd->subctxt_rcvegrbuf); vfree(rcd->subctxt_rcvhdr_base); kfree(rcd->opstats); - kfree(rcd); + + rcd->subctxt_uregbase = NULL; + rcd->subctxt_rcvegrbuf = NULL; + rcd->subctxt_rcvhdr_base = NULL; + rcd->opstats = NULL; } /* * Release our hold on the shared asic data. If we are the last one, * return the structure to be finalized outside the lock. Must be - * holding hfi1_devs_lock. + * holding hfi1_dev_table lock. */ static struct hfi1_asic_data *release_asic_data(struct hfi1_devdata *dd) { @@ -998,48 +1156,57 @@ static void finalize_asic_data(struct hfi1_devdata *dd, kfree(ad); } -static void __hfi1_free_devdata(struct kobject *kobj) +/** + * hfi1_free_devdata - cleans up and frees per-unit data structure + * @dd: pointer to a valid devdata structure + * + * It cleans up and frees all data structures set up by + * by hfi1_alloc_devdata(). + */ +void hfi1_free_devdata(struct hfi1_devdata *dd) { - struct hfi1_devdata *dd = - container_of(kobj, struct hfi1_devdata, kobj); struct hfi1_asic_data *ad; unsigned long flags; - spin_lock_irqsave(&hfi1_devs_lock, flags); - idr_remove(&hfi1_unit_table, dd->unit); - list_del(&dd->list); + xa_lock_irqsave(&hfi1_dev_table, flags); + __xa_erase(&hfi1_dev_table, dd->unit); ad = release_asic_data(dd); - spin_unlock_irqrestore(&hfi1_devs_lock, flags); - if (ad) - finalize_asic_data(dd, ad); + xa_unlock_irqrestore(&hfi1_dev_table, flags); + + finalize_asic_data(dd, ad); free_platform_config(dd); rcu_barrier(); /* wait for rcu callbacks to complete */ free_percpu(dd->int_counter); free_percpu(dd->rcv_limit); free_percpu(dd->send_schedule); + free_percpu(dd->tx_opstats); + dd->int_counter = NULL; + dd->rcv_limit = NULL; + dd->send_schedule = NULL; + dd->tx_opstats = NULL; + kfree(dd->comp_vect); + dd->comp_vect = NULL; + if (dd->rcvhdrtail_dummy_kvaddr) + dma_free_coherent(&dd->pcidev->dev, sizeof(u64), + (void *)dd->rcvhdrtail_dummy_kvaddr, + dd->rcvhdrtail_dummy_dma); + dd->rcvhdrtail_dummy_kvaddr = NULL; + sdma_clean(dd, dd->num_sdma); rvt_dealloc_device(&dd->verbs_dev.rdi); } -static struct kobj_type hfi1_devdata_type = { - .release = __hfi1_free_devdata, -}; - -void hfi1_free_devdata(struct hfi1_devdata *dd) -{ - kobject_put(&dd->kobj); -} - -/* - * Allocate our primary per-unit data structure. Must be done via verbs - * allocator, because the verbs cleanup process both does cleanup and - * free of the data structure. - * "extra" is for chip-specific data. +/** + * hfi1_alloc_devdata - Allocate our primary per-unit data structure. + * @pdev: Valid PCI device + * @extra: How many bytes to alloc past the default * - * Use the idr mechanism to get a unit number for this unit. + * Must be done via verbs allocator, because the verbs cleanup process + * both does cleanup and free of the data structure. + * "extra" is for chip-specific data. */ -struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) +static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, + size_t extra) { - unsigned long flags; struct hfi1_devdata *dd; int ret, nports; @@ -1052,25 +1219,27 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) return ERR_PTR(-ENOMEM); dd->num_pports = nports; dd->pport = (struct hfi1_pportdata *)(dd + 1); + dd->pcidev = pdev; + pci_set_drvdata(pdev, dd); - INIT_LIST_HEAD(&dd->list); - idr_preload(GFP_KERNEL); - spin_lock_irqsave(&hfi1_devs_lock, flags); - - ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT); - if (ret >= 0) { - dd->unit = ret; - list_add(&dd->list, &hfi1_dev_list); - } - - spin_unlock_irqrestore(&hfi1_devs_lock, flags); - idr_preload_end(); - + ret = xa_alloc_irq(&hfi1_dev_table, &dd->unit, dd, xa_limit_32b, + GFP_KERNEL); if (ret < 0) { - hfi1_early_err(&pdev->dev, - "Could not allocate unit ID: error %d\n", -ret); + dev_err(&pdev->dev, + "Could not allocate unit ID: error %d\n", -ret); goto bail; } + rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit); + /* + * If the BIOS does not have the NUMA node information set, select + * NUMA 0 so we get consistent performance. + */ + dd->node = pcibus_to_node(pdev->bus); + if (dd->node == NUMA_NO_NODE) { + dd_dev_err(dd, "Invalid PCI NUMA node. Performance may be affected\n"); + dd->node = 0; + } + /* * Initialize all locks for the device. This needs to be as early as * possible so locks are usable. @@ -1087,50 +1256,52 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) spin_lock_init(&dd->pio_map_lock); mutex_init(&dd->dc8051_lock); init_waitqueue_head(&dd->event_queue); + spin_lock_init(&dd->irq_src_lock); dd->int_counter = alloc_percpu(u64); if (!dd->int_counter) { ret = -ENOMEM; - hfi1_early_err(&pdev->dev, - "Could not allocate per-cpu int_counter\n"); goto bail; } dd->rcv_limit = alloc_percpu(u64); if (!dd->rcv_limit) { ret = -ENOMEM; - hfi1_early_err(&pdev->dev, - "Could not allocate per-cpu rcv_limit\n"); goto bail; } dd->send_schedule = alloc_percpu(u64); if (!dd->send_schedule) { ret = -ENOMEM; - hfi1_early_err(&pdev->dev, - "Could not allocate per-cpu int_counter\n"); goto bail; } - if (!hfi1_cpulist_count) { - u32 count = num_online_cpus(); - - hfi1_cpulist = kcalloc(BITS_TO_LONGS(count), sizeof(long), - GFP_KERNEL); - if (hfi1_cpulist) - hfi1_cpulist_count = count; - else - hfi1_early_err( - &pdev->dev, - "Could not alloc cpulist info, cpu affinity might be wrong\n"); + dd->tx_opstats = alloc_percpu(struct hfi1_opcode_stats_perctx); + if (!dd->tx_opstats) { + ret = -ENOMEM; + goto bail; + } + + dd->comp_vect = kzalloc(sizeof(*dd->comp_vect), GFP_KERNEL); + if (!dd->comp_vect) { + ret = -ENOMEM; + goto bail; } - kobject_init(&dd->kobj, &hfi1_devdata_type); + + /* allocate dummy tail memory for all receive contexts */ + dd->rcvhdrtail_dummy_kvaddr = + dma_alloc_coherent(&dd->pcidev->dev, sizeof(u64), + &dd->rcvhdrtail_dummy_dma, GFP_KERNEL); + if (!dd->rcvhdrtail_dummy_kvaddr) { + ret = -ENOMEM; + goto bail; + } + + atomic_set(&dd->ipoib_rsm_usr_num, 0); return dd; bail: - if (!list_empty(&dd->list)) - list_del_init(&dd->list); - rvt_dealloc_device(&dd->verbs_dev.rdi); + hfi1_free_devdata(dd); return ERR_PTR(ret); } @@ -1169,8 +1340,9 @@ void hfi1_disable_after_error(struct hfi1_devdata *dd) static void remove_one(struct pci_dev *); static int init_one(struct pci_dev *, const struct pci_device_id *); +static void shutdown_one(struct pci_dev *); -#define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: " +#define DRIVER_LOAD_MSG "Cornelis " DRIVER_NAME " loaded: " #define PFX DRIVER_NAME ": " const struct pci_device_id hfi1_pci_tbl[] = { @@ -1185,6 +1357,7 @@ static struct pci_driver hfi1_pci_driver = { .name = DRIVER_NAME, .probe = init_one, .remove = remove_one, + .shutdown = shutdown_one, .id_table = hfi1_pci_tbl, .err_handler = &hfi1_pci_err_handler, }; @@ -1258,16 +1431,17 @@ static int __init hfi1_mod_init(void) /* sanitize link CRC options */ link_crc_mask &= SUPPORTED_CRCS; + ret = opfn_init(); + if (ret < 0) { + pr_err("Failed to allocate opfn_wq"); + goto bail_dev; + } + /* * These must be called before the driver is registered with * the PCI subsystem. */ - idr_init(&hfi1_unit_table); - hfi1_dbg_init(); - ret = hfi1_wss_init(); - if (ret < 0) - goto bail_wss; ret = pci_register_driver(&hfi1_pci_driver); if (ret < 0) { pr_err("Unable to register driver: error %d\n", -ret); @@ -1276,10 +1450,7 @@ static int __init hfi1_mod_init(void) goto bail; /* all OK */ bail_dev: - hfi1_wss_exit(); -bail_wss: hfi1_dbg_exit(); - idr_destroy(&hfi1_unit_table); dev_cleanup(); bail: return ret; @@ -1293,13 +1464,11 @@ module_init(hfi1_mod_init); static void __exit hfi1_mod_cleanup(void) { pci_unregister_driver(&hfi1_pci_driver); - node_affinity_destroy(); - hfi1_wss_exit(); + opfn_exit(); + node_affinity_destroy_all(); hfi1_dbg_exit(); - hfi1_cpulist_count = 0; - kfree(hfi1_cpulist); - idr_destroy(&hfi1_unit_table); + WARN_ON(!xa_empty(&hfi1_dev_table)); dispose_firmware(); /* asymmetric with obtain_firmware() */ dev_cleanup(); } @@ -1311,8 +1480,6 @@ static void cleanup_device_data(struct hfi1_devdata *dd) { int ctxt; int pidx; - struct hfi1_ctxtdata **tmp; - unsigned long flags; /* users can't do anything more with chip */ for (pidx = 0; pidx < dd->num_pports; ++pidx) { @@ -1340,32 +1507,19 @@ static void cleanup_device_data(struct hfi1_devdata *dd) /* * Free any resources still in use (usually just kernel contexts) * at unload; we do for ctxtcnt, because that's what we allocate. - * We acquire lock to be really paranoid that rcd isn't being - * accessed from some interrupt-related code (that should not happen, - * but best to be sure). */ - spin_lock_irqsave(&dd->uctxt_lock, flags); - tmp = dd->rcd; - dd->rcd = NULL; - spin_unlock_irqrestore(&dd->uctxt_lock, flags); + for (ctxt = 0; dd->rcd && ctxt < dd->num_rcv_contexts; ctxt++) { + struct hfi1_ctxtdata *rcd = dd->rcd[ctxt]; - if (dd->rcvhdrtail_dummy_kvaddr) { - dma_free_coherent(&dd->pcidev->dev, sizeof(u64), - (void *)dd->rcvhdrtail_dummy_kvaddr, - dd->rcvhdrtail_dummy_dma); - dd->rcvhdrtail_dummy_kvaddr = NULL; - } - - for (ctxt = 0; tmp && ctxt < dd->num_rcv_contexts; ctxt++) { - struct hfi1_ctxtdata *rcd = tmp[ctxt]; - - tmp[ctxt] = NULL; /* debugging paranoia */ if (rcd) { - hfi1_clear_tids(rcd); - hfi1_free_ctxtdata(dd, rcd); + hfi1_free_ctxt_rcv_groups(rcd); + hfi1_free_ctxt(rcd); } } - kfree(tmp); + + kfree(dd->rcd); + dd->rcd = NULL; + free_pio_map(dd); /* must follow rcv context free - need to remove rcv's hooks */ for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++) @@ -1387,6 +1541,8 @@ static void cleanup_device_data(struct hfi1_devdata *dd) static void postinit_cleanup(struct hfi1_devdata *dd) { hfi1_start_cleanup(dd); + hfi1_comp_vectors_clean_up(dd); + hfi1_dev_affinity_clean_up(dd); hfi1_pcie_ddcleanup(dd); hfi1_pcie_cleanup(dd->pcidev); @@ -1396,29 +1552,6 @@ static void postinit_cleanup(struct hfi1_devdata *dd) hfi1_free_devdata(dd); } -static int init_validate_rcvhdrcnt(struct device *dev, uint thecnt) -{ - if (thecnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) { - hfi1_early_err(dev, "Receive header queue count too small\n"); - return -EINVAL; - } - - if (thecnt > HFI1_MAX_HDRQ_EGRBUF_CNT) { - hfi1_early_err(dev, - "Receive header queue count cannot be greater than %u\n", - HFI1_MAX_HDRQ_EGRBUF_CNT); - return -EINVAL; - } - - if (thecnt % HDRQ_INCREMENT) { - hfi1_early_err(dev, "Receive header queue count %d must be divisible by %lu\n", - thecnt, HDRQ_INCREMENT); - return -EINVAL; - } - - return 0; -} - static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { int ret = 0, j, pidx, initfail; @@ -1431,22 +1564,29 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* Validate dev ids */ if (!(ent->device == PCI_DEVICE_ID_INTEL0 || ent->device == PCI_DEVICE_ID_INTEL1)) { - hfi1_early_err(&pdev->dev, - "Failing on unknown Intel deviceid 0x%x\n", - ent->device); + dev_err(&pdev->dev, "Failing on unknown Intel deviceid 0x%x\n", + ent->device); ret = -ENODEV; goto bail; } + /* Allocate the dd so we can get to work */ + dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS * + sizeof(struct hfi1_pportdata)); + if (IS_ERR(dd)) { + ret = PTR_ERR(dd); + goto bail; + } + /* Validate some global module parameters */ - ret = init_validate_rcvhdrcnt(&pdev->dev, rcvhdrcnt); + ret = hfi1_validate_rcvhdrcnt(dd, rcvhdrcnt); if (ret) goto bail; /* use the encoding function as a sanitization check */ if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) { - hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n", - hfi1_hdrq_entsize); + dd_dev_err(dd, "Invalid HdrQ Entry size %u\n", + hfi1_hdrq_entsize); ret = -EINVAL; goto bail; } @@ -1468,10 +1608,10 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) clamp_val(eager_buffer_size, MIN_EAGER_BUFFER * 8, MAX_EAGER_BUFFER_TOTAL); - hfi1_early_info(&pdev->dev, "Eager buffer size %u\n", - eager_buffer_size); + dd_dev_info(dd, "Eager buffer size %u\n", + eager_buffer_size); } else { - hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n"); + dd_dev_err(dd, "Invalid Eager buffer size of 0\n"); ret = -EINVAL; goto bail; } @@ -1479,7 +1619,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* restrict value of hfi1_rcvarr_split */ hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100); - ret = hfi1_pcie_init(pdev, ent); + ret = hfi1_pcie_init(dd); if (ret) goto bail; @@ -1487,12 +1627,9 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) * Do device-specific initialization, function table setup, dd * allocation, etc. */ - dd = hfi1_init_dd(pdev, ent); - - if (IS_ERR(dd)) { - ret = PTR_ERR(dd); + ret = hfi1_init_dd(dd); + if (ret) goto clean_bail; /* error already printed */ - } ret = create_workqueues(dd); if (ret) @@ -1501,9 +1638,6 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* do the generic initialization */ initfail = hfi1_init(dd, 0); - /* setup vnic */ - hfi1_vnic_setup(dd); - ret = hfi1_register_ib_device(dd); /* @@ -1523,6 +1657,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j); if (initfail || ret) { + msix_clean_up_interrupts(dd); stop_timers(dd); flush_workqueue(ib_wq); for (pidx = 0; pidx < dd->num_pports; ++pidx) { @@ -1532,12 +1667,15 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) destroy_workqueue(ppd->hfi1_wq); ppd->hfi1_wq = NULL; } + if (ppd->link_wq) { + destroy_workqueue(ppd->link_wq); + ppd->link_wq = NULL; + } } if (!j) hfi1_device_remove(dd); if (!ret) hfi1_unregister_ib_device(dd); - hfi1_vnic_cleanup(dd); postinit_cleanup(dd); if (initfail) ret = initfail; @@ -1560,7 +1698,7 @@ static void wait_for_clients(struct hfi1_devdata *dd) * Remove the device init value and complete the device if there is * no clients or wait for active clients to finish. */ - if (atomic_dec_and_test(&dd->user_refcount)) + if (refcount_dec_and_test(&dd->user_refcount)) complete(&dd->user_comp); wait_for_completion(&dd->user_comp); @@ -1582,14 +1720,15 @@ static void remove_one(struct pci_dev *pdev) /* unregister from IB core */ hfi1_unregister_ib_device(dd); - /* cleanup vnic */ - hfi1_vnic_cleanup(dd); + /* free netdev data */ + hfi1_free_rx(dd); /* * Disable the IB link, disable interrupts on the device, * clear dma engines, etc. */ shutdown_device(dd); + destroy_workqueues(dd); stop_timers(dd); @@ -1599,6 +1738,13 @@ static void remove_one(struct pci_dev *pdev) postinit_cleanup(dd); } +static void shutdown_one(struct pci_dev *pdev) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + + shutdown_device(dd); +} + /** * hfi1_create_rcvhdrq - create a receive header queue * @dd: the hfi1_ib device @@ -1611,27 +1757,13 @@ static void remove_one(struct pci_dev *pdev) int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) { unsigned amt; - u64 reg; if (!rcd->rcvhdrq) { - dma_addr_t dma_hdrqtail; - gfp_t gfp_flags; + amt = rcvhdrq_size(rcd); - /* - * rcvhdrqentsize is in DWs, so we have to convert to bytes - * (* sizeof(u32)). - */ - amt = PAGE_ALIGN(rcd->rcvhdrq_cnt * rcd->rcvhdrqentsize * - sizeof(u32)); - - if ((rcd->ctxt < dd->first_dyn_alloc_ctxt) || - (rcd->sc && (rcd->sc->type == SC_KERNEL))) - gfp_flags = GFP_KERNEL; - else - gfp_flags = GFP_USER; - rcd->rcvhdrq = dma_zalloc_coherent( - &dd->pcidev->dev, amt, &rcd->rcvhdrq_dma, - gfp_flags | __GFP_COMP); + rcd->rcvhdrq = dma_alloc_coherent(&dd->pcidev->dev, amt, + &rcd->rcvhdrq_dma, + GFP_KERNEL); if (!rcd->rcvhdrq) { dd_dev_err(dd, @@ -1640,41 +1772,19 @@ int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) goto bail; } - if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) { - rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent( - &dd->pcidev->dev, PAGE_SIZE, &dma_hdrqtail, - gfp_flags); + if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) || + HFI1_CAP_UGET_MASK(rcd->flags, DMA_RTAIL)) { + rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(&dd->pcidev->dev, + PAGE_SIZE, + &rcd->rcvhdrqtailaddr_dma, + GFP_KERNEL); if (!rcd->rcvhdrtail_kvaddr) goto bail_free; - rcd->rcvhdrqtailaddr_dma = dma_hdrqtail; } - - rcd->rcvhdrq_size = amt; } - /* - * These values are per-context: - * RcvHdrCnt - * RcvHdrEntSize - * RcvHdrSize - */ - reg = ((u64)(rcd->rcvhdrq_cnt >> HDRQ_SIZE_SHIFT) - & RCV_HDR_CNT_CNT_MASK) - << RCV_HDR_CNT_CNT_SHIFT; - write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_CNT, reg); - reg = (encode_rcv_header_entry_size(rcd->rcvhdrqentsize) - & RCV_HDR_ENT_SIZE_ENT_SIZE_MASK) - << RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT; - write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg); - reg = (dd->rcvhdrsize & RCV_HDR_SIZE_HDR_SIZE_MASK) - << RCV_HDR_SIZE_HDR_SIZE_SHIFT; - write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg); - /* - * Program dummy tail address for every receive context - * before enabling any receive context - */ - write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_TAIL_ADDR, - dd->rcvhdrtail_dummy_dma); + set_hdrq_regs(rcd->dd, rcd->ctxt, rcd->rcvhdrqentsize, + rcd->rcvhdrq_cnt); return 0; @@ -1690,7 +1800,8 @@ bail: } /** - * allocate eager buffers, both kernel and user contexts. + * hfi1_setup_eagerbufs - llocate eager buffers, both kernel and user + * contexts. * @rcd: the context we are setting up. * * Allocate the eager TID buffers and program them into hip. @@ -1701,21 +1812,12 @@ bail: int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd) { struct hfi1_devdata *dd = rcd->dd; - u32 max_entries, egrtop, alloced_bytes = 0, idx = 0; - gfp_t gfp_flags; - u16 order; + u32 max_entries, egrtop, alloced_bytes = 0; + u16 order, idx = 0; int ret = 0; u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu); /* - * GFP_USER, but without GFP_FS, so buffer cache can be - * coalesced (we hope); otherwise, even at order 4, - * heavy filesystem activity makes these fail, and we can - * use compound pages. - */ - gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP; - - /* * The minimum size of the eager buffers is a groups of MTU-sized * buffers. * The global eager_buffer_size parameter is checked against the @@ -1742,10 +1844,10 @@ int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd) while (alloced_bytes < rcd->egrbufs.size && rcd->egrbufs.alloced < rcd->egrbufs.count) { rcd->egrbufs.buffers[idx].addr = - dma_zalloc_coherent(&dd->pcidev->dev, - rcd->egrbufs.rcvtid_size, - &rcd->egrbufs.buffers[idx].dma, - gfp_flags); + dma_alloc_coherent(&dd->pcidev->dev, + rcd->egrbufs.rcvtid_size, + &rcd->egrbufs.buffers[idx].dma, + GFP_KERNEL); if (rcd->egrbufs.buffers[idx].addr) { rcd->egrbufs.buffers[idx].len = rcd->egrbufs.rcvtid_size; @@ -1816,7 +1918,7 @@ int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd) rcd->egrbufs.size = alloced_bytes; hfi1_cdbg(PROC, - "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n", + "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %uKB", rcd->ctxt, rcd->egrbufs.alloced, rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024); @@ -1839,13 +1941,13 @@ int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd) rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2; rcd->expected_base = rcd->eager_base + egrtop; - hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n", + hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u", rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count, rcd->eager_base, rcd->expected_base); if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) { hfi1_cdbg(PROC, - "ctxt%u: current Eager buffer size is invalid %u\n", + "ctxt%u: current Eager buffer size is invalid %u", rcd->ctxt, rcd->egrbufs.rcvtid_size); ret = -EINVAL; goto bail_rcvegrbuf_phys; |
