Diffstat (limited to 'drivers/net/ethernet/ibm/ibmvnic.c')
| -rw-r--r-- | drivers/net/ethernet/ibm/ibmvnic.c | 5256 |
1 file changed, 4084 insertions(+), 1172 deletions(-)
diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index a3e694679635..3808148c1fc7 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /**************************************************************************/ /* */ /* IBM System i and System p Virtual NIC Device Driver */ @@ -6,18 +7,6 @@ /* Thomas Falcon (tlfalcon@linux.vnet.ibm.com) */ /* John Allen (jallen@linux.vnet.ibm.com) */ /* */ -/* This program is free software; you can redistribute it and/or modify */ -/* it under the terms of the GNU General Public License as published by */ -/* the Free Software Foundation; either version 2 of the License, or */ -/* (at your option) any later version. */ -/* */ -/* This program is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ -/* GNU General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU General Public License */ -/* along with this program. */ /* */ /* This module contains the implementation of a virtual ethernet device */ /* for use with IBM i/p Series LPAR Linux. It utilizes the logical LAN */ @@ -59,10 +48,12 @@ #include <linux/mm.h> #include <linux/ethtool.h> #include <linux/proc_fs.h> +#include <linux/if_arp.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/irq.h> +#include <linux/irqdomain.h> #include <linux/kthread.h> #include <linux/seq_file.h> #include <linux/interrupt.h> @@ -70,11 +61,14 @@ #include <asm/hvcall.h> #include <linux/atomic.h> #include <asm/vio.h> +#include <asm/xive.h> #include <asm/iommu.h> #include <linux/uaccess.h> #include <asm/firmware.h> #include <linux/workqueue.h> #include <linux/if_vlan.h> +#include <linux/utsname.h> +#include <linux/cpu.h> #include "ibmvnic.h" @@ -87,14 +81,11 @@ MODULE_LICENSE("GPL"); MODULE_VERSION(IBMVNIC_DRIVER_VERSION); static int ibmvnic_version = IBMVNIC_INITIAL_VERSION; -static int ibmvnic_remove(struct vio_dev *); -static void release_sub_crqs(struct ibmvnic_adapter *); +static void release_sub_crqs(struct ibmvnic_adapter *, bool); static int ibmvnic_reset_crq(struct ibmvnic_adapter *); static int ibmvnic_send_crq_init(struct ibmvnic_adapter *); static int ibmvnic_reenable_crq_queue(struct ibmvnic_adapter *); static int ibmvnic_send_crq(struct ibmvnic_adapter *, union ibmvnic_crq *); -static int send_subcrq(struct ibmvnic_adapter *adapter, u64 remote_handle, - union sub_crq *sub_crq); static int send_subcrq_indirect(struct ibmvnic_adapter *, u64, u64, u64); static irqreturn_t ibmvnic_interrupt_rx(int irq, void *instance); static int enable_scrq_irq(struct ibmvnic_adapter *, @@ -106,14 +97,27 @@ static int pending_scrq(struct ibmvnic_adapter *, static union sub_crq *ibmvnic_next_scrq(struct ibmvnic_adapter *, struct ibmvnic_sub_crq_queue *); static int ibmvnic_poll(struct napi_struct *napi, int data); -static void send_map_query(struct ibmvnic_adapter *adapter); -static void send_request_map(struct ibmvnic_adapter *, dma_addr_t, __be32, u8); -static void send_request_unmap(struct ibmvnic_adapter *, u8); -static void send_login(struct ibmvnic_adapter *adapter); -static void send_cap_queries(struct ibmvnic_adapter *adapter); +static int reset_sub_crq_queues(struct ibmvnic_adapter *adapter); +static inline void reinit_init_done(struct ibmvnic_adapter *adapter); +static void 
send_query_map(struct ibmvnic_adapter *adapter); +static int send_request_map(struct ibmvnic_adapter *, dma_addr_t, u32, u8); +static int send_request_unmap(struct ibmvnic_adapter *, u8); +static int send_login(struct ibmvnic_adapter *adapter); +static void send_query_cap(struct ibmvnic_adapter *adapter); +static int init_sub_crqs(struct ibmvnic_adapter *); static int init_sub_crq_irqs(struct ibmvnic_adapter *adapter); -static int ibmvnic_init(struct ibmvnic_adapter *); +static int ibmvnic_reset_init(struct ibmvnic_adapter *, bool reset); static void release_crq_queue(struct ibmvnic_adapter *); +static int __ibmvnic_set_mac(struct net_device *, u8 *); +static int init_crq_queue(struct ibmvnic_adapter *adapter); +static int send_query_phys_parms(struct ibmvnic_adapter *adapter); +static void ibmvnic_tx_scrq_clean_buffer(struct ibmvnic_adapter *adapter, + struct ibmvnic_sub_crq_queue *tx_scrq); +static void free_long_term_buff(struct ibmvnic_adapter *adapter, + struct ibmvnic_long_term_buff *ltb); +static void ibmvnic_disable_irqs(struct ibmvnic_adapter *adapter); +static void flush_reset_queue(struct ibmvnic_adapter *adapter); +static void print_subcrq_error(struct device *dev, int rc, const char *func); struct ibmvnic_stat { char name[ETH_GSTRING_LEN]; @@ -122,7 +126,7 @@ struct ibmvnic_stat { #define IBMVNIC_STAT_OFF(stat) (offsetof(struct ibmvnic_adapter, stats) + \ offsetof(struct ibmvnic_statistics, stat)) -#define IBMVNIC_GET_STAT(a, off) (*((u64 *)(((unsigned long)(a)) + off))) +#define IBMVNIC_GET_STAT(a, off) (*((u64 *)(((unsigned long)(a)) + (off)))) static const struct ibmvnic_stat ibmvnic_stats[] = { {"rx_packets", IBMVNIC_STAT_OFF(rx_packets)}, @@ -149,6 +153,222 @@ static const struct ibmvnic_stat ibmvnic_stats[] = { {"internal_mac_rx_errors", IBMVNIC_STAT_OFF(internal_mac_rx_errors)}, }; +static int send_crq_init_complete(struct ibmvnic_adapter *adapter) +{ + union ibmvnic_crq crq; + + memset(&crq, 0, sizeof(crq)); + crq.generic.first = IBMVNIC_CRQ_INIT_CMD; + crq.generic.cmd = IBMVNIC_CRQ_INIT_COMPLETE; + + return ibmvnic_send_crq(adapter, &crq); +} + +static int send_version_xchg(struct ibmvnic_adapter *adapter) +{ + union ibmvnic_crq crq; + + memset(&crq, 0, sizeof(crq)); + crq.version_exchange.first = IBMVNIC_CRQ_CMD; + crq.version_exchange.cmd = VERSION_EXCHANGE; + crq.version_exchange.version = cpu_to_be16(ibmvnic_version); + + return ibmvnic_send_crq(adapter, &crq); +} + +static void ibmvnic_clean_queue_affinity(struct ibmvnic_adapter *adapter, + struct ibmvnic_sub_crq_queue *queue) +{ + if (!(queue && queue->irq)) + return; + + cpumask_clear(queue->affinity_mask); + + if (irq_set_affinity_and_hint(queue->irq, NULL)) + netdev_warn(adapter->netdev, + "%s: Clear affinity failed, queue addr = %p, IRQ = %d\n", + __func__, queue, queue->irq); +} + +static void ibmvnic_clean_affinity(struct ibmvnic_adapter *adapter) +{ + struct ibmvnic_sub_crq_queue **rxqs; + struct ibmvnic_sub_crq_queue **txqs; + int num_rxqs, num_txqs; + int i; + + rxqs = adapter->rx_scrq; + txqs = adapter->tx_scrq; + num_txqs = adapter->num_active_tx_scrqs; + num_rxqs = adapter->num_active_rx_scrqs; + + netdev_dbg(adapter->netdev, "%s: Cleaning irq affinity hints", __func__); + if (txqs) { + for (i = 0; i < num_txqs; i++) + ibmvnic_clean_queue_affinity(adapter, txqs[i]); + } + if (rxqs) { + for (i = 0; i < num_rxqs; i++) + ibmvnic_clean_queue_affinity(adapter, rxqs[i]); + } +} + +static int ibmvnic_set_queue_affinity(struct ibmvnic_sub_crq_queue *queue, + unsigned int *cpu, int *stragglers, + int stride) 
+{ + cpumask_var_t mask; + int i; + int rc = 0; + + if (!(queue && queue->irq)) + return rc; + + /* cpumask_var_t is either a pointer or array, allocation works here */ + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + /* while we have extra cpu give one extra to this irq */ + if (*stragglers) { + stride++; + (*stragglers)--; + } + /* atomic write is safer than writing bit by bit directly */ + for_each_online_cpu_wrap(i, *cpu) { + if (!stride--) { + /* For the next queue we start from the first + * unused CPU in this queue + */ + *cpu = i; + break; + } + cpumask_set_cpu(i, mask); + } + + /* set queue affinity mask */ + cpumask_copy(queue->affinity_mask, mask); + rc = irq_set_affinity_and_hint(queue->irq, queue->affinity_mask); + free_cpumask_var(mask); + + return rc; +} + +/* assumes cpu read lock is held */ +static void ibmvnic_set_affinity(struct ibmvnic_adapter *adapter) +{ + struct ibmvnic_sub_crq_queue **rxqs = adapter->rx_scrq; + struct ibmvnic_sub_crq_queue **txqs = adapter->tx_scrq; + struct ibmvnic_sub_crq_queue *queue; + int num_rxqs = adapter->num_active_rx_scrqs, i_rxqs = 0; + int num_txqs = adapter->num_active_tx_scrqs, i_txqs = 0; + int total_queues, stride, stragglers, i; + unsigned int num_cpu, cpu = 0; + bool is_rx_queue; + int rc = 0; + + netdev_dbg(adapter->netdev, "%s: Setting irq affinity hints", __func__); + if (!(adapter->rx_scrq && adapter->tx_scrq)) { + netdev_warn(adapter->netdev, + "%s: Set affinity failed, queues not allocated\n", + __func__); + return; + } + + total_queues = num_rxqs + num_txqs; + num_cpu = num_online_cpus(); + /* number of cpu's assigned per irq */ + stride = max_t(int, num_cpu / total_queues, 1); + /* number of leftover cpu's */ + stragglers = num_cpu >= total_queues ? num_cpu % total_queues : 0; + + for (i = 0; i < total_queues; i++) { + is_rx_queue = false; + /* balance core load by alternating rx and tx assignments + * ex: TX0 -> RX0 -> TX1 -> RX1 etc. 
+ */ + if ((i % 2 == 1 && i_rxqs < num_rxqs) || i_txqs == num_txqs) { + queue = rxqs[i_rxqs++]; + is_rx_queue = true; + } else { + queue = txqs[i_txqs++]; + } + + rc = ibmvnic_set_queue_affinity(queue, &cpu, &stragglers, + stride); + if (rc) + goto out; + + if (!queue || is_rx_queue) + continue; + + rc = __netif_set_xps_queue(adapter->netdev, + cpumask_bits(queue->affinity_mask), + i_txqs - 1, XPS_CPUS); + if (rc) + netdev_warn(adapter->netdev, "%s: Set XPS on queue %d failed, rc = %d.\n", + __func__, i_txqs - 1, rc); + } + +out: + if (rc) { + netdev_warn(adapter->netdev, + "%s: Set affinity failed, queue addr = %p, IRQ = %d, rc = %d.\n", + __func__, queue, queue->irq, rc); + ibmvnic_clean_affinity(adapter); + } +} + +static int ibmvnic_cpu_online(unsigned int cpu, struct hlist_node *node) +{ + struct ibmvnic_adapter *adapter; + + adapter = hlist_entry_safe(node, struct ibmvnic_adapter, node); + ibmvnic_set_affinity(adapter); + return 0; +} + +static int ibmvnic_cpu_dead(unsigned int cpu, struct hlist_node *node) +{ + struct ibmvnic_adapter *adapter; + + adapter = hlist_entry_safe(node, struct ibmvnic_adapter, node_dead); + ibmvnic_set_affinity(adapter); + return 0; +} + +static int ibmvnic_cpu_down_prep(unsigned int cpu, struct hlist_node *node) +{ + struct ibmvnic_adapter *adapter; + + adapter = hlist_entry_safe(node, struct ibmvnic_adapter, node); + ibmvnic_clean_affinity(adapter); + return 0; +} + +static enum cpuhp_state ibmvnic_online; + +static int ibmvnic_cpu_notif_add(struct ibmvnic_adapter *adapter) +{ + int ret; + + ret = cpuhp_state_add_instance_nocalls(ibmvnic_online, &adapter->node); + if (ret) + return ret; + ret = cpuhp_state_add_instance_nocalls(CPUHP_IBMVNIC_DEAD, + &adapter->node_dead); + if (!ret) + return ret; + cpuhp_state_remove_instance_nocalls(ibmvnic_online, &adapter->node); + return ret; +} + +static void ibmvnic_cpu_notif_remove(struct ibmvnic_adapter *adapter) +{ + cpuhp_state_remove_instance_nocalls(ibmvnic_online, &adapter->node); + cpuhp_state_remove_instance_nocalls(CPUHP_IBMVNIC_DEAD, + &adapter->node_dead); +} + static long h_reg_sub_crq(unsigned long unit_address, unsigned long token, unsigned long length, unsigned long *number, unsigned long *irq) @@ -163,33 +383,142 @@ static long h_reg_sub_crq(unsigned long unit_address, unsigned long token, return rc; } +/** + * ibmvnic_wait_for_completion - Check device state and wait for completion + * @adapter: private device data + * @comp_done: completion structure to wait for + * @timeout: time to wait in milliseconds + * + * Wait for a completion signal or until the timeout limit is reached + * while checking that the device is still active. + */ +static int ibmvnic_wait_for_completion(struct ibmvnic_adapter *adapter, + struct completion *comp_done, + unsigned long timeout) +{ + struct net_device *netdev; + unsigned long div_timeout; + u8 retry; + + netdev = adapter->netdev; + retry = 5; + div_timeout = msecs_to_jiffies(timeout / retry); + while (true) { + if (!adapter->crq.active) { + netdev_err(netdev, "Device down!\n"); + return -ENODEV; + } + if (!retry--) + break; + if (wait_for_completion_timeout(comp_done, div_timeout)) + return 0; + } + netdev_err(netdev, "Operation timed out.\n"); + return -ETIMEDOUT; +} + +/** + * reuse_ltb() - Check if a long term buffer can be reused + * @ltb: The long term buffer to be checked + * @size: The size of the long term buffer. + * + * An LTB can be reused unless its size has changed. + * + * Return: Return true if the LTB can be reused, false otherwise. 
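As a rough illustration of how the stride/stragglers arithmetic in ibmvnic_set_affinity() above spreads the online CPUs across the combined set of TX and RX sub-CRQ interrupts, here is a small standalone sketch; the CPU and queue counts are made-up example values, not anything read from a real adapter.

#include <stdio.h>

/* Same formulas as ibmvnic_set_affinity(): each queue gets 'stride'
 * CPUs and the first 'stragglers' queues get one extra.  Queues are
 * walked in the order TX0, RX0, TX1, RX1, ...
 */
static void show_cpu_spread(int num_cpu, int num_txqs, int num_rxqs)
{
	int total_queues = num_txqs + num_rxqs;
	int stride = num_cpu / total_queues > 1 ? num_cpu / total_queues : 1;
	int stragglers = num_cpu >= total_queues ? num_cpu % total_queues : 0;
	int i;

	for (i = 0; i < total_queues; i++) {
		int cpus = stride + (stragglers > 0 ? 1 : 0);

		if (stragglers > 0)
			stragglers--;
		printf("queue %2d -> %d CPU(s)\n", i, cpus);
	}
}

int main(void)
{
	/* e.g. 14 online CPUs over 4 TX + 4 RX queues: stride = 1,
	 * stragglers = 6, so six queues get 2 CPUs and two get 1
	 * (6 * 2 + 2 * 1 = 14).
	 */
	show_cpu_spread(14, 4, 4);
	return 0;
}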
+ */ +static bool reuse_ltb(struct ibmvnic_long_term_buff *ltb, int size) +{ + return (ltb->buff && ltb->size == size); +} + +/** + * alloc_long_term_buff() - Allocate a long term buffer (LTB) + * + * @adapter: ibmvnic adapter associated to the LTB + * @ltb: container object for the LTB + * @size: size of the LTB + * + * Allocate an LTB of the specified size and notify VIOS. + * + * If the given @ltb already has the correct size, reuse it. Otherwise if + * its non-NULL, free it. Then allocate a new one of the correct size. + * Notify the VIOS either way since we may now be working with a new VIOS. + * + * Allocating larger chunks of memory during resets, specially LPM or under + * low memory situations can cause resets to fail/timeout and for LPAR to + * lose connectivity. So hold onto the LTB even if we fail to communicate + * with the VIOS and reuse it on next open. Free LTB when adapter is closed. + * + * Return: 0 if we were able to allocate the LTB and notify the VIOS and + * a negative value otherwise. + */ static int alloc_long_term_buff(struct ibmvnic_adapter *adapter, struct ibmvnic_long_term_buff *ltb, int size) { struct device *dev = &adapter->vdev->dev; + u64 prev = 0; + int rc; + + if (!reuse_ltb(ltb, size)) { + dev_dbg(dev, + "LTB size changed from 0x%llx to 0x%x, reallocating\n", + ltb->size, size); + prev = ltb->size; + free_long_term_buff(adapter, ltb); + } - ltb->size = size; - ltb->buff = dma_alloc_coherent(dev, ltb->size, <b->addr, - GFP_KERNEL); + if (ltb->buff) { + dev_dbg(dev, "Reusing LTB [map %d, size 0x%llx]\n", + ltb->map_id, ltb->size); + } else { + ltb->buff = dma_alloc_coherent(dev, size, <b->addr, + GFP_KERNEL); + if (!ltb->buff) { + dev_err(dev, "Couldn't alloc long term buffer\n"); + return -ENOMEM; + } + ltb->size = size; - if (!ltb->buff) { - dev_err(dev, "Couldn't alloc long term buffer\n"); - return -ENOMEM; + ltb->map_id = find_first_zero_bit(adapter->map_ids, + MAX_MAP_ID); + bitmap_set(adapter->map_ids, ltb->map_id, 1); + + dev_dbg(dev, + "Allocated new LTB [map %d, size 0x%llx was 0x%llx]\n", + ltb->map_id, ltb->size, prev); } - ltb->map_id = adapter->map_id; - adapter->map_id++; - init_completion(&adapter->fw_done); - send_request_map(adapter, ltb->addr, - ltb->size, ltb->map_id); - wait_for_completion(&adapter->fw_done); + /* Ensure ltb is zeroed - specially when reusing it. */ + memset(ltb->buff, 0, ltb->size); + + mutex_lock(&adapter->fw_lock); + adapter->fw_done_rc = 0; + reinit_completion(&adapter->fw_done); + + rc = send_request_map(adapter, ltb->addr, ltb->size, ltb->map_id); + if (rc) { + dev_err(dev, "send_request_map failed, rc = %d\n", rc); + goto out; + } + + rc = ibmvnic_wait_for_completion(adapter, &adapter->fw_done, 10000); + if (rc) { + dev_err(dev, "LTB map request aborted or timed out, rc = %d\n", + rc); + goto out; + } if (adapter->fw_done_rc) { - dev_err(dev, "Couldn't map long term buffer,rc = %d\n", + dev_err(dev, "Couldn't map LTB, rc = %d\n", adapter->fw_done_rc); - return -1; + rc = -EIO; + goto out; } - return 0; + rc = 0; +out: + /* don't free LTB on communication error - see function header */ + mutex_unlock(&adapter->fw_lock); + return rc; } static void free_long_term_buff(struct ibmvnic_adapter *adapter, @@ -200,95 +529,319 @@ static void free_long_term_buff(struct ibmvnic_adapter *adapter, if (!ltb->buff) return; + /* VIOS automatically unmaps the long term buffer at remote + * end for the following resets: + * FAILOVER, MOBILITY, TIMEOUT. 
+ */ if (adapter->reset_reason != VNIC_RESET_FAILOVER && - adapter->reset_reason != VNIC_RESET_MOBILITY) + adapter->reset_reason != VNIC_RESET_MOBILITY && + adapter->reset_reason != VNIC_RESET_TIMEOUT) send_request_unmap(adapter, ltb->map_id); + dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr); + + ltb->buff = NULL; + /* mark this map_id free */ + bitmap_clear(adapter->map_ids, ltb->map_id, 1); + ltb->map_id = 0; } -static int reset_long_term_buff(struct ibmvnic_adapter *adapter, - struct ibmvnic_long_term_buff *ltb) +/** + * free_ltb_set - free the given set of long term buffers (LTBS) + * @adapter: The ibmvnic adapter containing this ltb set + * @ltb_set: The ltb_set to be freed + * + * Free the set of LTBs in the given set. + */ + +static void free_ltb_set(struct ibmvnic_adapter *adapter, + struct ibmvnic_ltb_set *ltb_set) { - memset(ltb->buff, 0, ltb->size); + int i; - init_completion(&adapter->fw_done); - send_request_map(adapter, ltb->addr, ltb->size, ltb->map_id); - wait_for_completion(&adapter->fw_done); + for (i = 0; i < ltb_set->num_ltbs; i++) + free_long_term_buff(adapter, <b_set->ltbs[i]); - if (adapter->fw_done_rc) { - dev_info(&adapter->vdev->dev, - "Reset failed, attempting to free and reallocate buffer\n"); - free_long_term_buff(adapter, ltb); - return alloc_long_term_buff(adapter, ltb, ltb->size); + kfree(ltb_set->ltbs); + ltb_set->ltbs = NULL; + ltb_set->num_ltbs = 0; +} + +/** + * alloc_ltb_set() - Allocate a set of long term buffers (LTBs) + * + * @adapter: ibmvnic adapter associated to the LTB + * @ltb_set: container object for the set of LTBs + * @num_buffs: Number of buffers in the LTB + * @buff_size: Size of each buffer in the LTB + * + * Allocate a set of LTBs to accommodate @num_buffs buffers of @buff_size + * each. We currently cap size each LTB to IBMVNIC_ONE_LTB_SIZE. If the + * new set of LTBs have fewer LTBs than the old set, free the excess LTBs. + * If new set needs more than in old set, allocate the remaining ones. + * Try and reuse as many LTBs as possible and avoid reallocation. + * + * Any changes to this allocation strategy must be reflected in + * map_rxpool_buff_to_ltb() and map_txpool_buff_to_ltb(). + */ +static int alloc_ltb_set(struct ibmvnic_adapter *adapter, + struct ibmvnic_ltb_set *ltb_set, int num_buffs, + int buff_size) +{ + struct device *dev = &adapter->vdev->dev; + struct ibmvnic_ltb_set old_set; + struct ibmvnic_ltb_set new_set; + int rem_size; + int tot_size; /* size of all ltbs */ + int ltb_size; /* size of one ltb */ + int nltbs; + int rc; + int n; + int i; + + dev_dbg(dev, "%s() num_buffs %d, buff_size %d\n", __func__, num_buffs, + buff_size); + + ltb_size = rounddown(IBMVNIC_ONE_LTB_SIZE, buff_size); + tot_size = num_buffs * buff_size; + + if (ltb_size > tot_size) + ltb_size = tot_size; + + nltbs = tot_size / ltb_size; + if (tot_size % ltb_size) + nltbs++; + + old_set = *ltb_set; + + if (old_set.num_ltbs == nltbs) { + new_set = old_set; + } else { + int tmp = nltbs * sizeof(struct ibmvnic_long_term_buff); + + new_set.ltbs = kzalloc(tmp, GFP_KERNEL); + if (!new_set.ltbs) + return -ENOMEM; + + new_set.num_ltbs = nltbs; + + /* Free any excess ltbs in old set */ + for (i = new_set.num_ltbs; i < old_set.num_ltbs; i++) + free_long_term_buff(adapter, &old_set.ltbs[i]); + + /* Copy remaining ltbs to new set. All LTBs except the + * last one are of the same size. alloc_long_term_buff() + * will realloc if the size changes. 
+ */ + n = min(old_set.num_ltbs, new_set.num_ltbs); + for (i = 0; i < n; i++) + new_set.ltbs[i] = old_set.ltbs[i]; + + /* Any additional ltbs in new set will have NULL ltbs for + * now and will be allocated in alloc_long_term_buff(). + */ + + /* We no longer need the old_set so free it. Note that we + * may have reused some ltbs from old set and freed excess + * ltbs above. So we only need to free the container now + * not the LTBs themselves. (i.e. dont free_ltb_set()!) + */ + kfree(old_set.ltbs); + old_set.ltbs = NULL; + old_set.num_ltbs = 0; + + /* Install the new set. If allocations fail below, we will + * retry later and know what size LTBs we need. + */ + *ltb_set = new_set; } + + i = 0; + rem_size = tot_size; + while (rem_size) { + if (ltb_size > rem_size) + ltb_size = rem_size; + + rem_size -= ltb_size; + + rc = alloc_long_term_buff(adapter, &new_set.ltbs[i], ltb_size); + if (rc) + goto out; + i++; + } + + WARN_ON(i != new_set.num_ltbs); + return 0; +out: + /* We may have allocated one/more LTBs before failing and we + * want to try and reuse on next reset. So don't free ltb set. + */ + return rc; +} + +/** + * map_rxpool_buf_to_ltb - Map given rxpool buffer to offset in an LTB. + * @rxpool: The receive buffer pool containing buffer + * @bufidx: Index of buffer in rxpool + * @ltbp: (Output) pointer to the long term buffer containing the buffer + * @offset: (Output) offset of buffer in the LTB from @ltbp + * + * Map the given buffer identified by [rxpool, bufidx] to an LTB in the + * pool and its corresponding offset. Assume for now that each LTB is of + * different size but could possibly be optimized based on the allocation + * strategy in alloc_ltb_set(). + */ +static void map_rxpool_buf_to_ltb(struct ibmvnic_rx_pool *rxpool, + unsigned int bufidx, + struct ibmvnic_long_term_buff **ltbp, + unsigned int *offset) +{ + struct ibmvnic_long_term_buff *ltb; + int nbufs; /* # of buffers in one ltb */ + int i; + + WARN_ON(bufidx >= rxpool->size); + + for (i = 0; i < rxpool->ltb_set.num_ltbs; i++) { + ltb = &rxpool->ltb_set.ltbs[i]; + nbufs = ltb->size / rxpool->buff_size; + if (bufidx < nbufs) + break; + bufidx -= nbufs; + } + + *ltbp = ltb; + *offset = bufidx * rxpool->buff_size; +} + +/** + * map_txpool_buf_to_ltb - Map given txpool buffer to offset in an LTB. + * @txpool: The transmit buffer pool containing buffer + * @bufidx: Index of buffer in txpool + * @ltbp: (Output) pointer to the long term buffer (LTB) containing the buffer + * @offset: (Output) offset of buffer in the LTB from @ltbp + * + * Map the given buffer identified by [txpool, bufidx] to an LTB in the + * pool and its corresponding offset. 
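To make the sizing in alloc_ltb_set() above concrete, the sketch below walks through the same arithmetic for one example pool; the 8 MB cap is only an assumed stand-in for IBMVNIC_ONE_LTB_SIZE, whose real value is defined in ibmvnic.h.

#include <stdio.h>

#define EXAMPLE_ONE_LTB_SIZE	(8 << 20)	/* assumption for the example */

static void show_ltb_split(int num_buffs, int buff_size)
{
	int tot_size = num_buffs * buff_size;
	/* rounddown() so every LTB holds a whole number of buffers */
	int ltb_size = EXAMPLE_ONE_LTB_SIZE - (EXAMPLE_ONE_LTB_SIZE % buff_size);
	int nltbs, rem_size, i;

	if (ltb_size > tot_size)
		ltb_size = tot_size;

	nltbs = tot_size / ltb_size + (tot_size % ltb_size ? 1 : 0);

	printf("%d buffers x %d bytes -> %d LTB(s)\n",
	       num_buffs, buff_size, nltbs);
	rem_size = tot_size;
	for (i = 0; i < nltbs; i++) {
		int this_ltb = ltb_size > rem_size ? rem_size : ltb_size;

		rem_size -= this_ltb;
		printf("  LTB %d: %d bytes (%d buffers)\n",
		       i, this_ltb, this_ltb / buff_size);
	}
}

int main(void)
{
	/* e.g. a 16384-buffer rx pool with 2 KB buffers: 32 MB total,
	 * split into four 8 MB LTBs of 4096 buffers each.
	 */
	show_ltb_split(16384, 2048);
	return 0;
}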
+ */ +static void map_txpool_buf_to_ltb(struct ibmvnic_tx_pool *txpool, + unsigned int bufidx, + struct ibmvnic_long_term_buff **ltbp, + unsigned int *offset) +{ + struct ibmvnic_long_term_buff *ltb; + int nbufs; /* # of buffers in one ltb */ + int i; + + WARN_ON_ONCE(bufidx >= txpool->num_buffers); + + for (i = 0; i < txpool->ltb_set.num_ltbs; i++) { + ltb = &txpool->ltb_set.ltbs[i]; + nbufs = ltb->size / txpool->buf_size; + if (bufidx < nbufs) + break; + bufidx -= nbufs; + } + + *ltbp = ltb; + *offset = bufidx * txpool->buf_size; } static void deactivate_rx_pools(struct ibmvnic_adapter *adapter) { int i; - for (i = 0; i < be32_to_cpu(adapter->login_rsp_buf->num_rxadd_subcrqs); - i++) + for (i = 0; i < adapter->num_active_rx_pools; i++) adapter->rx_pool[i].active = 0; } +static void ibmvnic_set_safe_max_ind_descs(struct ibmvnic_adapter *adapter) +{ + if (adapter->cur_max_ind_descs > IBMVNIC_SAFE_IND_DESC) { + netdev_info(adapter->netdev, + "set max ind descs from %u to safe limit %u\n", + adapter->cur_max_ind_descs, + IBMVNIC_SAFE_IND_DESC); + adapter->cur_max_ind_descs = IBMVNIC_SAFE_IND_DESC; + } +} + static void replenish_rx_pool(struct ibmvnic_adapter *adapter, struct ibmvnic_rx_pool *pool) { int count = pool->size - atomic_read(&pool->available); + u64 handle = adapter->rx_scrq[pool->index]->handle; struct device *dev = &adapter->vdev->dev; + struct ibmvnic_ind_xmit_queue *ind_bufp; + struct ibmvnic_sub_crq_queue *rx_scrq; + struct ibmvnic_long_term_buff *ltb; + union sub_crq *sub_crq; int buffers_added = 0; unsigned long lpar_rc; - union sub_crq sub_crq; struct sk_buff *skb; unsigned int offset; dma_addr_t dma_addr; unsigned char *dst; - u64 *handle_array; int shift = 0; - int index; + int bufidx; int i; if (!pool->active) return; - handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) + - be32_to_cpu(adapter->login_rsp_buf-> - off_rxadd_subcrqs)); + rx_scrq = adapter->rx_scrq[pool->index]; + ind_bufp = &rx_scrq->ind_buf; - for (i = 0; i < count; ++i) { - skb = alloc_skb(pool->buff_size, GFP_ATOMIC); + /* netdev_skb_alloc() could have failed after we saved a few skbs + * in the indir_buf and we would not have sent them to VIOS yet. + * To account for them, start the loop at ind_bufp->index rather + * than 0. If we pushed all the skbs to VIOS, ind_bufp->index will + * be 0. + */ + for (i = ind_bufp->index; i < count; ++i) { + bufidx = pool->free_map[pool->next_free]; + + /* We maybe reusing the skb from earlier resets. Allocate + * only if necessary. But since the LTB may have changed + * during reset (see init_rx_pools()), update LTB below + * even if reusing skb. 
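The following sketch shows, on example numbers, the walk that map_rxpool_buf_to_ltb() and map_txpool_buf_to_ltb() above perform to turn a flat pool buffer index into an (LTB, offset) pair; the per-LTB buffer counts and buffer size are assumptions carried over from the previous example.

#include <stdio.h>

int main(void)
{
	int ltb_bufs[] = { 4096, 4096, 4096, 4096 };	/* buffers per LTB */
	int buff_size = 2048;
	int bufidx = 9000;	/* flat index into the pool */
	int i;

	/* Subtract whole LTBs until the index falls inside one of them,
	 * which is exactly what the map_*_buf_to_ltb() helpers do.
	 */
	for (i = 0; bufidx >= ltb_bufs[i]; i++)
		bufidx -= ltb_bufs[i];

	/* 9000 = 4096 + 4096 + 808 -> LTB 2, byte offset 808 * 2048 */
	printf("LTB %d, offset %d\n", i, bufidx * buff_size);
	return 0;
}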
+ */ + skb = pool->rx_buff[bufidx].skb; if (!skb) { - dev_err(dev, "Couldn't replenish rx buff\n"); - adapter->replenish_no_mem++; - break; + skb = netdev_alloc_skb(adapter->netdev, + pool->buff_size); + if (!skb) { + dev_err(dev, "Couldn't replenish rx buff\n"); + adapter->replenish_no_mem++; + break; + } } - index = pool->free_map[pool->next_free]; - - if (pool->rx_buff[index].skb) - dev_err(dev, "Inconsistent free_map!\n"); + pool->free_map[pool->next_free] = IBMVNIC_INVALID_MAP; + pool->next_free = (pool->next_free + 1) % pool->size; /* Copy the skb to the long term mapped DMA buffer */ - offset = index * pool->buff_size; - dst = pool->long_term_buff.buff + offset; + map_rxpool_buf_to_ltb(pool, bufidx, <b, &offset); + dst = ltb->buff + offset; memset(dst, 0, pool->buff_size); - dma_addr = pool->long_term_buff.addr + offset; - pool->rx_buff[index].data = dst; - - pool->free_map[pool->next_free] = IBMVNIC_INVALID_MAP; - pool->rx_buff[index].dma = dma_addr; - pool->rx_buff[index].skb = skb; - pool->rx_buff[index].pool_index = pool->index; - pool->rx_buff[index].size = pool->buff_size; - - memset(&sub_crq, 0, sizeof(sub_crq)); - sub_crq.rx_add.first = IBMVNIC_CRQ_CMD; - sub_crq.rx_add.correlator = - cpu_to_be64((u64)&pool->rx_buff[index]); - sub_crq.rx_add.ioba = cpu_to_be32(dma_addr); - sub_crq.rx_add.map_id = pool->long_term_buff.map_id; + dma_addr = ltb->addr + offset; + + /* add the skb to an rx_buff in the pool */ + pool->rx_buff[bufidx].data = dst; + pool->rx_buff[bufidx].dma = dma_addr; + pool->rx_buff[bufidx].skb = skb; + pool->rx_buff[bufidx].pool_index = pool->index; + pool->rx_buff[bufidx].size = pool->buff_size; + + /* queue the rx_buff for the next send_subcrq_indirect */ + sub_crq = &ind_bufp->indir_arr[ind_bufp->index++]; + memset(sub_crq, 0, sizeof(*sub_crq)); + sub_crq->rx_add.first = IBMVNIC_CRQ_CMD; + sub_crq->rx_add.correlator = + cpu_to_be64((u64)&pool->rx_buff[bufidx]); + sub_crq->rx_add.ioba = cpu_to_be32(dma_addr); + sub_crq->rx_add.map_id = ltb->map_id; /* The length field of the sCRQ is defined to be 24 bits so the * buffer size needs to be left shifted by a byte before it is @@ -298,36 +851,57 @@ static void replenish_rx_pool(struct ibmvnic_adapter *adapter, #ifdef __LITTLE_ENDIAN__ shift = 8; #endif - sub_crq.rx_add.len = cpu_to_be32(pool->buff_size << shift); - - lpar_rc = send_subcrq(adapter, handle_array[pool->index], - &sub_crq); - if (lpar_rc != H_SUCCESS) - goto failure; - - buffers_added++; - adapter->replenish_add_buff_success++; - pool->next_free = (pool->next_free + 1) % pool->size; + sub_crq->rx_add.len = cpu_to_be32(pool->buff_size << shift); + + /* if send_subcrq_indirect queue is full, flush to VIOS */ + if (ind_bufp->index == adapter->cur_max_ind_descs || + i == count - 1) { + lpar_rc = + send_subcrq_indirect(adapter, handle, + (u64)ind_bufp->indir_dma, + (u64)ind_bufp->index); + if (lpar_rc != H_SUCCESS) + goto failure; + buffers_added += ind_bufp->index; + adapter->replenish_add_buff_success += ind_bufp->index; + ind_bufp->index = 0; + } } atomic_add(buffers_added, &pool->available); return; failure: - dev_info(dev, "replenish pools failure\n"); - pool->free_map[pool->next_free] = index; - pool->rx_buff[index].skb = NULL; - if (!dma_mapping_error(dev, dma_addr)) - dma_unmap_single(dev, dma_addr, pool->buff_size, - DMA_FROM_DEVICE); + if (lpar_rc != H_PARAMETER && lpar_rc != H_CLOSED) + dev_err_ratelimited(dev, "rx: replenish packet buffer failed\n"); - dev_kfree_skb_any(skb); - adapter->replenish_add_buff_failure++; - atomic_add(buffers_added, 
&pool->available); + /* Detect platform limit H_PARAMETER */ + if (lpar_rc == H_PARAMETER) + ibmvnic_set_safe_max_ind_descs(adapter); - if (lpar_rc == H_CLOSED) { + /* For all error case, temporarily drop only this batch + * Rely on TCP/IP retransmissions to retry and recover + */ + for (i = ind_bufp->index - 1; i >= 0; --i) { + struct ibmvnic_rx_buff *rx_buff; + + pool->next_free = pool->next_free == 0 ? + pool->size - 1 : pool->next_free - 1; + sub_crq = &ind_bufp->indir_arr[i]; + rx_buff = (struct ibmvnic_rx_buff *) + be64_to_cpu(sub_crq->rx_add.correlator); + bufidx = (int)(rx_buff - pool->rx_buff); + pool->free_map[pool->next_free] = bufidx; + dev_kfree_skb_any(pool->rx_buff[bufidx].skb); + pool->rx_buff[bufidx].skb = NULL; + } + adapter->replenish_add_buff_failure += ind_bufp->index; + atomic_add(buffers_added, &pool->available); + ind_bufp->index = 0; + if (lpar_rc == H_CLOSED || adapter->failover_pending) { /* Disable buffer pool replenishment and report carrier off if - * queue is closed. Firmware guarantees that a signal will - * be sent to the driver, triggering a reset. + * queue is closed or pending failover. + * Firmware guarantees that a signal will be sent to the + * driver, triggering a reset. */ deactivate_rx_pools(adapter); netif_carrier_off(adapter->netdev); @@ -339,11 +913,39 @@ static void replenish_pools(struct ibmvnic_adapter *adapter) int i; adapter->replenish_task_cycles++; - for (i = 0; i < be32_to_cpu(adapter->login_rsp_buf->num_rxadd_subcrqs); - i++) { + for (i = 0; i < adapter->num_active_rx_pools; i++) { if (adapter->rx_pool[i].active) replenish_rx_pool(adapter, &adapter->rx_pool[i]); } + + netdev_dbg(adapter->netdev, "Replenished %d pools\n", i); +} + +static void release_stats_buffers(struct ibmvnic_adapter *adapter) +{ + kfree(adapter->tx_stats_buffers); + kfree(adapter->rx_stats_buffers); + adapter->tx_stats_buffers = NULL; + adapter->rx_stats_buffers = NULL; +} + +static int init_stats_buffers(struct ibmvnic_adapter *adapter) +{ + adapter->tx_stats_buffers = + kcalloc(IBMVNIC_MAX_QUEUES, + sizeof(struct ibmvnic_tx_queue_stats), + GFP_KERNEL); + if (!adapter->tx_stats_buffers) + return -ENOMEM; + + adapter->rx_stats_buffers = + kcalloc(IBMVNIC_MAX_QUEUES, + sizeof(struct ibmvnic_rx_queue_stats), + GFP_KERNEL); + if (!adapter->rx_stats_buffers) + return -ENOMEM; + + return 0; } static void release_stats_token(struct ibmvnic_adapter *adapter) @@ -363,71 +965,52 @@ static int init_stats_token(struct ibmvnic_adapter *adapter) { struct device *dev = &adapter->vdev->dev; dma_addr_t stok; + int rc; stok = dma_map_single(dev, &adapter->stats, sizeof(struct ibmvnic_statistics), DMA_FROM_DEVICE); - if (dma_mapping_error(dev, stok)) { - dev_err(dev, "Couldn't map stats buffer\n"); - return -1; + rc = dma_mapping_error(dev, stok); + if (rc) { + dev_err(dev, "Couldn't map stats buffer, rc = %d\n", rc); + return rc; } adapter->stats_token = stok; + netdev_dbg(adapter->netdev, "Stats token initialized (%llx)\n", stok); return 0; } -static int reset_rx_pools(struct ibmvnic_adapter *adapter) -{ - struct ibmvnic_rx_pool *rx_pool; - int rx_scrqs; - int i, j, rc; - - rx_scrqs = be32_to_cpu(adapter->login_rsp_buf->num_rxadd_subcrqs); - for (i = 0; i < rx_scrqs; i++) { - rx_pool = &adapter->rx_pool[i]; - - rc = reset_long_term_buff(adapter, &rx_pool->long_term_buff); - if (rc) - return rc; - - for (j = 0; j < rx_pool->size; j++) - rx_pool->free_map[j] = j; - - memset(rx_pool->rx_buff, 0, - rx_pool->size * sizeof(struct ibmvnic_rx_buff)); - - atomic_set(&rx_pool->available, 0); 
- rx_pool->next_alloc = 0; - rx_pool->next_free = 0; - rx_pool->active = 1; - } - - return 0; -} - +/** + * release_rx_pools() - Release any rx pools attached to @adapter. + * @adapter: ibmvnic adapter + * + * Safe to call this multiple times - even if no pools are attached. + */ static void release_rx_pools(struct ibmvnic_adapter *adapter) { struct ibmvnic_rx_pool *rx_pool; - int rx_scrqs; int i, j; if (!adapter->rx_pool) return; - rx_scrqs = be32_to_cpu(adapter->login_rsp_buf->num_rxadd_subcrqs); - for (i = 0; i < rx_scrqs; i++) { + for (i = 0; i < adapter->num_active_rx_pools; i++) { rx_pool = &adapter->rx_pool[i]; + netdev_dbg(adapter->netdev, "Releasing rx_pool[%d]\n", i); + kfree(rx_pool->free_map); - free_long_term_buff(adapter, &rx_pool->long_term_buff); + + free_ltb_set(adapter, &rx_pool->ltb_set); if (!rx_pool->rx_buff) continue; for (j = 0; j < rx_pool->size; j++) { if (rx_pool->rx_buff[j].skb) { - dev_kfree_skb_any(rx_pool->rx_buff[i].skb); - rx_pool->rx_buff[i].skb = NULL; + dev_kfree_skb_any(rx_pool->rx_buff[j].skb); + rx_pool->rx_buff[j].skb = NULL; } } @@ -436,48 +1019,113 @@ static void release_rx_pools(struct ibmvnic_adapter *adapter) kfree(adapter->rx_pool); adapter->rx_pool = NULL; + adapter->num_active_rx_pools = 0; + adapter->prev_rx_pool_size = 0; +} + +/** + * reuse_rx_pools() - Check if the existing rx pools can be reused. + * @adapter: ibmvnic adapter + * + * Check if the existing rx pools in the adapter can be reused. The + * pools can be reused if the pool parameters (number of pools, + * number of buffers in the pool and size of each buffer) have not + * changed. + * + * NOTE: This assumes that all pools have the same number of buffers + * which is the case currently. If that changes, we must fix this. + * + * Return: true if the rx pools can be reused, false otherwise. + */ +static bool reuse_rx_pools(struct ibmvnic_adapter *adapter) +{ + u64 old_num_pools, new_num_pools; + u64 old_pool_size, new_pool_size; + u64 old_buff_size, new_buff_size; + + if (!adapter->rx_pool) + return false; + + old_num_pools = adapter->num_active_rx_pools; + new_num_pools = adapter->req_rx_queues; + + old_pool_size = adapter->prev_rx_pool_size; + new_pool_size = adapter->req_rx_add_entries_per_subcrq; + + old_buff_size = adapter->prev_rx_buf_sz; + new_buff_size = adapter->cur_rx_buf_sz; + + if (old_buff_size != new_buff_size || + old_num_pools != new_num_pools || + old_pool_size != new_pool_size) + return false; + + return true; } +/** + * init_rx_pools(): Initialize the set of receiver pools in the adapter. + * @netdev: net device associated with the vnic interface + * + * Initialize the set of receiver pools in the ibmvnic adapter associated + * with the net_device @netdev. If possible, reuse the existing rx pools. + * Otherwise free any existing pools and allocate a new set of pools + * before initializing them. + * + * Return: 0 on success and negative value on error. 
+ */ static int init_rx_pools(struct net_device *netdev) { struct ibmvnic_adapter *adapter = netdev_priv(netdev); struct device *dev = &adapter->vdev->dev; struct ibmvnic_rx_pool *rx_pool; - int rxadd_subcrqs; - u64 *size_array; - int i, j; + u64 num_pools; + u64 pool_size; /* # of buffers in one pool */ + u64 buff_size; + int i, j, rc; - rxadd_subcrqs = - be32_to_cpu(adapter->login_rsp_buf->num_rxadd_subcrqs); - size_array = (u64 *)((u8 *)(adapter->login_rsp_buf) + - be32_to_cpu(adapter->login_rsp_buf->off_rxadd_buff_size)); + pool_size = adapter->req_rx_add_entries_per_subcrq; + num_pools = adapter->req_rx_queues; + buff_size = adapter->cur_rx_buf_sz; + + if (reuse_rx_pools(adapter)) { + dev_dbg(dev, "Reusing rx pools\n"); + goto update_ltb; + } + + /* Allocate/populate the pools. */ + release_rx_pools(adapter); - adapter->rx_pool = kcalloc(rxadd_subcrqs, + adapter->rx_pool = kcalloc(num_pools, sizeof(struct ibmvnic_rx_pool), GFP_KERNEL); if (!adapter->rx_pool) { dev_err(dev, "Failed to allocate rx pools\n"); - return -1; + return -ENOMEM; } - for (i = 0; i < rxadd_subcrqs; i++) { + /* Set num_active_rx_pools early. If we fail below after partial + * allocation, release_rx_pools() will know how many to look for. + */ + adapter->num_active_rx_pools = num_pools; + + for (i = 0; i < num_pools; i++) { rx_pool = &adapter->rx_pool[i]; netdev_dbg(adapter->netdev, - "Initializing rx_pool %d, %lld buffs, %lld bytes each\n", - i, adapter->req_rx_add_entries_per_subcrq, - be64_to_cpu(size_array[i])); + "Initializing rx_pool[%d], %lld buffs, %lld bytes each\n", + i, pool_size, buff_size); - rx_pool->size = adapter->req_rx_add_entries_per_subcrq; + rx_pool->size = pool_size; rx_pool->index = i; - rx_pool->buff_size = be64_to_cpu(size_array[i]); - rx_pool->active = 1; + rx_pool->buff_size = ALIGN(buff_size, L1_CACHE_BYTES); rx_pool->free_map = kcalloc(rx_pool->size, sizeof(int), GFP_KERNEL); if (!rx_pool->free_map) { - release_rx_pools(adapter); - return -1; + dev_err(dev, "Couldn't alloc free_map %d\n", i); + rc = -ENOMEM; + goto out_release; } rx_pool->rx_buff = kcalloc(rx_pool->size, @@ -485,139 +1133,310 @@ static int init_rx_pools(struct net_device *netdev) GFP_KERNEL); if (!rx_pool->rx_buff) { dev_err(dev, "Couldn't alloc rx buffers\n"); - release_rx_pools(adapter); - return -1; + rc = -ENOMEM; + goto out_release; } + } - if (alloc_long_term_buff(adapter, &rx_pool->long_term_buff, - rx_pool->size * rx_pool->buff_size)) { - release_rx_pools(adapter); - return -1; - } + adapter->prev_rx_pool_size = pool_size; + adapter->prev_rx_buf_sz = adapter->cur_rx_buf_sz; + +update_ltb: + for (i = 0; i < num_pools; i++) { + rx_pool = &adapter->rx_pool[i]; + dev_dbg(dev, "Updating LTB for rx pool %d [%d, %d]\n", + i, rx_pool->size, rx_pool->buff_size); + + rc = alloc_ltb_set(adapter, &rx_pool->ltb_set, + rx_pool->size, rx_pool->buff_size); + if (rc) + goto out; + + for (j = 0; j < rx_pool->size; ++j) { + struct ibmvnic_rx_buff *rx_buff; - for (j = 0; j < rx_pool->size; ++j) rx_pool->free_map[j] = j; + /* NOTE: Don't clear rx_buff->skb here - will leak + * memory! replenish_rx_pool() will reuse skbs or + * allocate as necessary. 
+ */ + rx_buff = &rx_pool->rx_buff[j]; + rx_buff->dma = 0; + rx_buff->data = 0; + rx_buff->size = 0; + rx_buff->pool_index = 0; + } + + /* Mark pool "empty" so replenish_rx_pools() will + * update the LTB info for each buffer + */ atomic_set(&rx_pool->available, 0); rx_pool->next_alloc = 0; rx_pool->next_free = 0; + /* replenish_rx_pool() may have called deactivate_rx_pools() + * on failover. Ensure pool is active now. + */ + rx_pool->active = 1; } - return 0; +out_release: + release_rx_pools(adapter); +out: + /* We failed to allocate one or more LTBs or map them on the VIOS. + * Hold onto the pools and any LTBs that we did allocate/map. + */ + return rc; } -static int reset_tx_pools(struct ibmvnic_adapter *adapter) +static void release_vpd_data(struct ibmvnic_adapter *adapter) { - struct ibmvnic_tx_pool *tx_pool; - int tx_scrqs; - int i, j, rc; - - tx_scrqs = be32_to_cpu(adapter->login_rsp_buf->num_txsubm_subcrqs); - for (i = 0; i < tx_scrqs; i++) { - tx_pool = &adapter->tx_pool[i]; - - rc = reset_long_term_buff(adapter, &tx_pool->long_term_buff); - if (rc) - return rc; - - memset(tx_pool->tx_buff, 0, - adapter->req_tx_entries_per_subcrq * - sizeof(struct ibmvnic_tx_buff)); + if (!adapter->vpd) + return; - for (j = 0; j < adapter->req_tx_entries_per_subcrq; j++) - tx_pool->free_map[j] = j; + kfree(adapter->vpd->buff); + kfree(adapter->vpd); - tx_pool->consumer_index = 0; - tx_pool->producer_index = 0; - } + adapter->vpd = NULL; +} - return 0; +static void release_one_tx_pool(struct ibmvnic_adapter *adapter, + struct ibmvnic_tx_pool *tx_pool) +{ + kfree(tx_pool->tx_buff); + kfree(tx_pool->free_map); + free_ltb_set(adapter, &tx_pool->ltb_set); } +/** + * release_tx_pools() - Release any tx pools attached to @adapter. + * @adapter: ibmvnic adapter + * + * Safe to call this multiple times - even if no pools are attached. + */ static void release_tx_pools(struct ibmvnic_adapter *adapter) { - struct ibmvnic_tx_pool *tx_pool; - int i, tx_scrqs; + int i; + /* init_tx_pools() ensures that ->tx_pool and ->tso_pool are + * both NULL or both non-NULL. So we only need to check one. + */ if (!adapter->tx_pool) return; - tx_scrqs = be32_to_cpu(adapter->login_rsp_buf->num_txsubm_subcrqs); - for (i = 0; i < tx_scrqs; i++) { - tx_pool = &adapter->tx_pool[i]; - kfree(tx_pool->tx_buff); - free_long_term_buff(adapter, &tx_pool->long_term_buff); - kfree(tx_pool->free_map); + for (i = 0; i < adapter->num_active_tx_pools; i++) { + release_one_tx_pool(adapter, &adapter->tx_pool[i]); + release_one_tx_pool(adapter, &adapter->tso_pool[i]); } kfree(adapter->tx_pool); adapter->tx_pool = NULL; + kfree(adapter->tso_pool); + adapter->tso_pool = NULL; + adapter->num_active_tx_pools = 0; + adapter->prev_tx_pool_size = 0; } +static int init_one_tx_pool(struct net_device *netdev, + struct ibmvnic_tx_pool *tx_pool, + int pool_size, int buf_size) +{ + int i; + + tx_pool->tx_buff = kcalloc(pool_size, + sizeof(struct ibmvnic_tx_buff), + GFP_KERNEL); + if (!tx_pool->tx_buff) + return -ENOMEM; + + tx_pool->free_map = kcalloc(pool_size, sizeof(int), GFP_KERNEL); + if (!tx_pool->free_map) { + kfree(tx_pool->tx_buff); + tx_pool->tx_buff = NULL; + return -ENOMEM; + } + + for (i = 0; i < pool_size; i++) + tx_pool->free_map[i] = i; + + tx_pool->consumer_index = 0; + tx_pool->producer_index = 0; + tx_pool->num_buffers = pool_size; + tx_pool->buf_size = buf_size; + + return 0; +} + +/** + * reuse_tx_pools() - Check if the existing tx pools can be reused. 
+ * @adapter: ibmvnic adapter + * + * Check if the existing tx pools in the adapter can be reused. The + * pools can be reused if the pool parameters (number of pools, + * number of buffers in the pool and mtu) have not changed. + * + * NOTE: This assumes that all pools have the same number of buffers + * which is the case currently. If that changes, we must fix this. + * + * Return: true if the tx pools can be reused, false otherwise. + */ +static bool reuse_tx_pools(struct ibmvnic_adapter *adapter) +{ + u64 old_num_pools, new_num_pools; + u64 old_pool_size, new_pool_size; + u64 old_mtu, new_mtu; + + if (!adapter->tx_pool) + return false; + + old_num_pools = adapter->num_active_tx_pools; + new_num_pools = adapter->num_active_tx_scrqs; + old_pool_size = adapter->prev_tx_pool_size; + new_pool_size = adapter->req_tx_entries_per_subcrq; + old_mtu = adapter->prev_mtu; + new_mtu = adapter->req_mtu; + + if (old_mtu != new_mtu || + old_num_pools != new_num_pools || + old_pool_size != new_pool_size) + return false; + + return true; +} + +/** + * init_tx_pools(): Initialize the set of transmit pools in the adapter. + * @netdev: net device associated with the vnic interface + * + * Initialize the set of transmit pools in the ibmvnic adapter associated + * with the net_device @netdev. If possible, reuse the existing tx pools. + * Otherwise free any existing pools and allocate a new set of pools + * before initializing them. + * + * Return: 0 on success and negative value on error. + */ static int init_tx_pools(struct net_device *netdev) { struct ibmvnic_adapter *adapter = netdev_priv(netdev); struct device *dev = &adapter->vdev->dev; - struct ibmvnic_tx_pool *tx_pool; - int tx_subcrqs; - int i, j; + int num_pools; + u64 pool_size; /* # of buffers in pool */ + u64 buff_size; + int i, j, rc; + + num_pools = adapter->req_tx_queues; + + /* We must notify the VIOS about the LTB on all resets - but we only + * need to alloc/populate pools if either the number of buffers or + * size of each buffer in the pool has changed. + */ + if (reuse_tx_pools(adapter)) { + netdev_dbg(netdev, "Reusing tx pools\n"); + goto update_ltb; + } + + /* Allocate/populate the pools. */ + release_tx_pools(adapter); - tx_subcrqs = be32_to_cpu(adapter->login_rsp_buf->num_txsubm_subcrqs); - adapter->tx_pool = kcalloc(tx_subcrqs, + pool_size = adapter->req_tx_entries_per_subcrq; + num_pools = adapter->num_active_tx_scrqs; + + adapter->tx_pool = kcalloc(num_pools, sizeof(struct ibmvnic_tx_pool), GFP_KERNEL); if (!adapter->tx_pool) - return -1; + return -ENOMEM; - for (i = 0; i < tx_subcrqs; i++) { - tx_pool = &adapter->tx_pool[i]; - tx_pool->tx_buff = kcalloc(adapter->req_tx_entries_per_subcrq, - sizeof(struct ibmvnic_tx_buff), - GFP_KERNEL); - if (!tx_pool->tx_buff) { - dev_err(dev, "tx pool buffer allocation failed\n"); - release_tx_pools(adapter); - return -1; - } + adapter->tso_pool = kcalloc(num_pools, + sizeof(struct ibmvnic_tx_pool), GFP_KERNEL); + /* To simplify release_tx_pools() ensure that ->tx_pool and + * ->tso_pool are either both NULL or both non-NULL. + */ + if (!adapter->tso_pool) { + kfree(adapter->tx_pool); + adapter->tx_pool = NULL; + return -ENOMEM; + } - if (alloc_long_term_buff(adapter, &tx_pool->long_term_buff, - adapter->req_tx_entries_per_subcrq * - adapter->req_mtu)) { - release_tx_pools(adapter); - return -1; - } + /* Set num_active_tx_pools early. If we fail below after partial + * allocation, release_tx_pools() will know how many to look for. 
+ */ + adapter->num_active_tx_pools = num_pools; - tx_pool->free_map = kcalloc(adapter->req_tx_entries_per_subcrq, - sizeof(int), GFP_KERNEL); - if (!tx_pool->free_map) { - release_tx_pools(adapter); - return -1; - } + buff_size = adapter->req_mtu + VLAN_HLEN; + buff_size = ALIGN(buff_size, L1_CACHE_BYTES); - for (j = 0; j < adapter->req_tx_entries_per_subcrq; j++) - tx_pool->free_map[j] = j; + for (i = 0; i < num_pools; i++) { + dev_dbg(dev, "Init tx pool %d [%llu, %llu]\n", + i, adapter->req_tx_entries_per_subcrq, buff_size); + + rc = init_one_tx_pool(netdev, &adapter->tx_pool[i], + pool_size, buff_size); + if (rc) + goto out_release; + + rc = init_one_tx_pool(netdev, &adapter->tso_pool[i], + IBMVNIC_TSO_BUFS, + IBMVNIC_TSO_BUF_SZ); + if (rc) + goto out_release; + } + + adapter->prev_tx_pool_size = pool_size; + adapter->prev_mtu = adapter->req_mtu; + +update_ltb: + /* NOTE: All tx_pools have the same number of buffers (which is + * same as pool_size). All tso_pools have IBMVNIC_TSO_BUFS + * buffers (see calls init_one_tx_pool() for these). + * For consistency, we use tx_pool->num_buffers and + * tso_pool->num_buffers below. + */ + rc = -1; + for (i = 0; i < num_pools; i++) { + struct ibmvnic_tx_pool *tso_pool; + struct ibmvnic_tx_pool *tx_pool; + + tx_pool = &adapter->tx_pool[i]; + + dev_dbg(dev, "Updating LTB for tx pool %d [%d, %d]\n", + i, tx_pool->num_buffers, tx_pool->buf_size); + + rc = alloc_ltb_set(adapter, &tx_pool->ltb_set, + tx_pool->num_buffers, tx_pool->buf_size); + if (rc) + goto out; tx_pool->consumer_index = 0; tx_pool->producer_index = 0; - } - return 0; -} + for (j = 0; j < tx_pool->num_buffers; j++) + tx_pool->free_map[j] = j; -static void release_error_buffers(struct ibmvnic_adapter *adapter) -{ - struct device *dev = &adapter->vdev->dev; - struct ibmvnic_error_buff *error_buff, *tmp; - unsigned long flags; + tso_pool = &adapter->tso_pool[i]; + + dev_dbg(dev, "Updating LTB for tso pool %d [%d, %d]\n", + i, tso_pool->num_buffers, tso_pool->buf_size); - spin_lock_irqsave(&adapter->error_list_lock, flags); - list_for_each_entry_safe(error_buff, tmp, &adapter->errors, list) { - list_del(&error_buff->list); - dma_unmap_single(dev, error_buff->dma, error_buff->len, - DMA_FROM_DEVICE); - kfree(error_buff->buff); - kfree(error_buff); + rc = alloc_ltb_set(adapter, &tso_pool->ltb_set, + tso_pool->num_buffers, tso_pool->buf_size); + if (rc) + goto out; + + tso_pool->consumer_index = 0; + tso_pool->producer_index = 0; + + for (j = 0; j < tso_pool->num_buffers; j++) + tso_pool->free_map[j] = j; } - spin_unlock_irqrestore(&adapter->error_list_lock, flags); + + return 0; +out_release: + release_tx_pools(adapter); +out: + /* We failed to allocate one or more LTBs or map them on the VIOS. + * Hold onto the pools and any LTBs that we did allocate/map. 
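As a worked example of the tx buffer size computed in init_tx_pools() above (requested MTU plus VLAN header, rounded up to a cache line), here is a minimal sketch; 128 bytes is assumed for L1_CACHE_BYTES, the usual ppc64 value, and VLAN_HLEN is 4.

#include <stdio.h>

#define EXAMPLE_L1_CACHE_BYTES	128UL	/* assumed ppc64 cache line size */
#define EXAMPLE_VLAN_HLEN	4UL

int main(void)
{
	unsigned long req_mtu = 9000;	/* e.g. a jumbo-frame MTU */
	unsigned long buff_size = req_mtu + EXAMPLE_VLAN_HLEN;

	/* ALIGN(x, a): round x up to the next multiple of a */
	buff_size = (buff_size + EXAMPLE_L1_CACHE_BYTES - 1) &
		    ~(EXAMPLE_L1_CACHE_BYTES - 1);

	printf("tx buff_size = %lu\n", buff_size);	/* prints 9088 */
	return 0;
}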
+ */ + return rc; } static void ibmvnic_napi_enable(struct ibmvnic_adapter *adapter) @@ -640,71 +1459,257 @@ static void ibmvnic_napi_disable(struct ibmvnic_adapter *adapter) if (!adapter->napi_enabled) return; - for (i = 0; i < adapter->req_rx_queues; i++) + for (i = 0; i < adapter->req_rx_queues; i++) { + netdev_dbg(adapter->netdev, "Disabling napi[%d]\n", i); napi_disable(&adapter->napi[i]); + } adapter->napi_enabled = false; } +static int init_napi(struct ibmvnic_adapter *adapter) +{ + int i; + + adapter->napi = kcalloc(adapter->req_rx_queues, + sizeof(struct napi_struct), GFP_KERNEL); + if (!adapter->napi) + return -ENOMEM; + + for (i = 0; i < adapter->req_rx_queues; i++) { + netdev_dbg(adapter->netdev, "Adding napi[%d]\n", i); + netif_napi_add(adapter->netdev, &adapter->napi[i], + ibmvnic_poll); + } + + adapter->num_active_rx_napi = adapter->req_rx_queues; + return 0; +} + +static void release_napi(struct ibmvnic_adapter *adapter) +{ + int i; + + if (!adapter->napi) + return; + + for (i = 0; i < adapter->num_active_rx_napi; i++) { + netdev_dbg(adapter->netdev, "Releasing napi[%d]\n", i); + netif_napi_del(&adapter->napi[i]); + } + + kfree(adapter->napi); + adapter->napi = NULL; + adapter->num_active_rx_napi = 0; + adapter->napi_enabled = false; +} + +static const char *adapter_state_to_string(enum vnic_state state) +{ + switch (state) { + case VNIC_PROBING: + return "PROBING"; + case VNIC_PROBED: + return "PROBED"; + case VNIC_OPENING: + return "OPENING"; + case VNIC_OPEN: + return "OPEN"; + case VNIC_CLOSING: + return "CLOSING"; + case VNIC_CLOSED: + return "CLOSED"; + case VNIC_REMOVING: + return "REMOVING"; + case VNIC_REMOVED: + return "REMOVED"; + case VNIC_DOWN: + return "DOWN"; + } + return "UNKNOWN"; +} + static int ibmvnic_login(struct net_device *netdev) { + unsigned long flags, timeout = msecs_to_jiffies(20000); struct ibmvnic_adapter *adapter = netdev_priv(netdev); - unsigned long timeout = msecs_to_jiffies(30000); - struct device *dev = &adapter->vdev->dev; + int retry_count = 0; + int retries = 10; + bool retry; + int rc; do { - if (adapter->renegotiate) { - adapter->renegotiate = false; - release_sub_crqs(adapter); + retry = false; + if (retry_count > retries) { + netdev_warn(netdev, "Login attempts exceeded\n"); + return -EACCES; + } + + adapter->init_done_rc = 0; + reinit_completion(&adapter->init_done); + rc = send_login(adapter); + if (rc) + return rc; + + if (!wait_for_completion_timeout(&adapter->init_done, + timeout)) { + netdev_warn(netdev, "Login timed out\n"); + adapter->login_pending = false; + goto partial_reset; + } + if (adapter->init_done_rc == ABORTED) { + netdev_warn(netdev, "Login aborted, retrying...\n"); + retry = true; + adapter->init_done_rc = 0; + retry_count++; + /* FW or device may be busy, so + * wait a bit before retrying login + */ + msleep(500); + } else if (adapter->init_done_rc == PARTIALSUCCESS) { + retry_count++; + release_sub_crqs(adapter, 1); + + retry = true; + netdev_dbg(netdev, + "Received partial success, retrying...\n"); + adapter->init_done_rc = 0; reinit_completion(&adapter->init_done); - send_cap_queries(adapter); + send_query_cap(adapter); if (!wait_for_completion_timeout(&adapter->init_done, timeout)) { - dev_err(dev, "Capabilities query timeout\n"); - return -1; + netdev_warn(netdev, + "Capabilities query timed out\n"); + return -ETIMEDOUT; } - } - reinit_completion(&adapter->init_done); - send_login(adapter); - if (!wait_for_completion_timeout(&adapter->init_done, - timeout)) { - dev_err(dev, "Login timeout\n"); - return 
-1; + rc = init_sub_crqs(adapter); + if (rc) { + netdev_warn(netdev, + "SCRQ initialization failed\n"); + return rc; + } + + rc = init_sub_crq_irqs(adapter); + if (rc) { + netdev_warn(netdev, + "SCRQ irq initialization failed\n"); + return rc; + } + /* Default/timeout error handling, reset and start fresh */ + } else if (adapter->init_done_rc) { + netdev_warn(netdev, "Adapter login failed, init_done_rc = %d\n", + adapter->init_done_rc); + +partial_reset: + /* adapter login failed, so free any CRQs or sub-CRQs + * and register again before attempting to login again. + * If we don't do this then the VIOS may think that + * we are already logged in and reject any subsequent + * attempts + */ + netdev_warn(netdev, + "Freeing and re-registering CRQs before attempting to login again\n"); + retry = true; + adapter->init_done_rc = 0; + release_sub_crqs(adapter, true); + /* Much of this is similar logic as ibmvnic_probe(), + * we are essentially re-initializing communication + * with the server. We really should not run any + * resets/failovers here because this is already a form + * of reset and we do not want parallel resets occurring + */ + do { + reinit_init_done(adapter); + /* Clear any failovers we got in the previous + * pass since we are re-initializing the CRQ + */ + adapter->failover_pending = false; + release_crq_queue(adapter); + /* If we don't sleep here then we risk an + * unnecessary failover event from the VIOS. + * This is a known VIOS issue caused by a vnic + * device freeing and registering a CRQ too + * quickly. + */ + msleep(1500); + /* Avoid any resets, since we are currently + * resetting. + */ + spin_lock_irqsave(&adapter->rwi_lock, flags); + flush_reset_queue(adapter); + spin_unlock_irqrestore(&adapter->rwi_lock, + flags); + + rc = init_crq_queue(adapter); + if (rc) { + netdev_err(netdev, "login recovery: init CRQ failed %d\n", + rc); + return -EIO; + } + + rc = ibmvnic_reset_init(adapter, false); + if (rc) + netdev_err(netdev, "login recovery: Reset init failed %d\n", + rc); + /* IBMVNIC_CRQ_INIT will return EAGAIN if it + * fails, since ibmvnic_reset_init will free + * irq's in failure, we won't be able to receive + * new CRQs so we need to keep trying. probe() + * handles this similarly. 
+ */ + } while (rc == -EAGAIN && retry_count++ < retries); } - } while (adapter->renegotiate); + } while (retry); + + __ibmvnic_set_mac(netdev, adapter->mac_addr); + netdev_dbg(netdev, "[S:%s] Login succeeded\n", adapter_state_to_string(adapter->state)); return 0; } -static void release_resources(struct ibmvnic_adapter *adapter) +static void release_login_buffer(struct ibmvnic_adapter *adapter) { - int i; + if (!adapter->login_buf) + return; - release_tx_pools(adapter); - release_rx_pools(adapter); + dma_unmap_single(&adapter->vdev->dev, adapter->login_buf_token, + adapter->login_buf_sz, DMA_TO_DEVICE); + kfree(adapter->login_buf); + adapter->login_buf = NULL; +} - release_stats_token(adapter); - release_error_buffers(adapter); +static void release_login_rsp_buffer(struct ibmvnic_adapter *adapter) +{ + if (!adapter->login_rsp_buf) + return; - if (adapter->napi) { - for (i = 0; i < adapter->req_rx_queues; i++) { - if (&adapter->napi[i]) - netif_napi_del(&adapter->napi[i]); - } - } + dma_unmap_single(&adapter->vdev->dev, adapter->login_rsp_buf_token, + adapter->login_rsp_buf_sz, DMA_FROM_DEVICE); + kfree(adapter->login_rsp_buf); + adapter->login_rsp_buf = NULL; +} + +static void release_resources(struct ibmvnic_adapter *adapter) +{ + release_vpd_data(adapter); + + release_napi(adapter); + release_login_buffer(adapter); + release_login_rsp_buffer(adapter); } static int set_link_state(struct ibmvnic_adapter *adapter, u8 link_state) { struct net_device *netdev = adapter->netdev; - unsigned long timeout = msecs_to_jiffies(30000); + unsigned long timeout = msecs_to_jiffies(20000); union ibmvnic_crq crq; bool resend; int rc; - netdev_err(netdev, "setting link state %d\n", link_state); + netdev_dbg(netdev, "setting link state %d\n", link_state); + memset(&crq, 0, sizeof(crq)); crq.logical_link_state.first = IBMVNIC_CRQ_CMD; crq.logical_link_state.cmd = LOGICAL_LINK_STATE; @@ -723,13 +1728,17 @@ static int set_link_state(struct ibmvnic_adapter *adapter, u8 link_state) if (!wait_for_completion_timeout(&adapter->init_done, timeout)) { netdev_err(netdev, "timeout setting link state\n"); - return -1; + return -ETIMEDOUT; } - if (adapter->init_done_rc == 1) { + if (adapter->init_done_rc == PARTIALSUCCESS) { /* Partuial success, delay and re-send */ mdelay(1000); resend = true; + } else if (adapter->init_done_rc) { + netdev_warn(netdev, "Unable to set link state, rc=%d\n", + adapter->init_done_rc); + return adapter->init_done_rc; } } while (resend); @@ -741,6 +1750,9 @@ static int set_real_num_queues(struct net_device *netdev) struct ibmvnic_adapter *adapter = netdev_priv(netdev); int rc; + netdev_dbg(netdev, "Setting real tx/rx queues (%llx/%llx)\n", + adapter->req_tx_queues, adapter->req_rx_queues); + rc = netif_set_real_num_tx_queues(netdev, adapter->req_tx_queues); if (rc) { netdev_err(netdev, "failed to set the number of tx queues\n"); @@ -754,31 +1766,115 @@ static int set_real_num_queues(struct net_device *netdev) return rc; } +static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter) +{ + struct device *dev = &adapter->vdev->dev; + union ibmvnic_crq crq; + int len = 0; + int rc; + + if (adapter->vpd->buff) + len = adapter->vpd->len; + + mutex_lock(&adapter->fw_lock); + adapter->fw_done_rc = 0; + reinit_completion(&adapter->fw_done); + + crq.get_vpd_size.first = IBMVNIC_CRQ_CMD; + crq.get_vpd_size.cmd = GET_VPD_SIZE; + rc = ibmvnic_send_crq(adapter, &crq); + if (rc) { + mutex_unlock(&adapter->fw_lock); + return rc; + } + + rc = ibmvnic_wait_for_completion(adapter, &adapter->fw_done, 10000); + if 
(rc) { + dev_err(dev, "Could not retrieve VPD size, rc = %d\n", rc); + mutex_unlock(&adapter->fw_lock); + return rc; + } + mutex_unlock(&adapter->fw_lock); + + if (!adapter->vpd->len) + return -ENODATA; + + if (!adapter->vpd->buff) + adapter->vpd->buff = kzalloc(adapter->vpd->len, GFP_KERNEL); + else if (adapter->vpd->len != len) + adapter->vpd->buff = + krealloc(adapter->vpd->buff, + adapter->vpd->len, GFP_KERNEL); + + if (!adapter->vpd->buff) { + dev_err(dev, "Could allocate VPD buffer\n"); + return -ENOMEM; + } + + adapter->vpd->dma_addr = + dma_map_single(dev, adapter->vpd->buff, adapter->vpd->len, + DMA_FROM_DEVICE); + if (dma_mapping_error(dev, adapter->vpd->dma_addr)) { + dev_err(dev, "Could not map VPD buffer\n"); + kfree(adapter->vpd->buff); + adapter->vpd->buff = NULL; + return -ENOMEM; + } + + mutex_lock(&adapter->fw_lock); + adapter->fw_done_rc = 0; + reinit_completion(&adapter->fw_done); + + crq.get_vpd.first = IBMVNIC_CRQ_CMD; + crq.get_vpd.cmd = GET_VPD; + crq.get_vpd.ioba = cpu_to_be32(adapter->vpd->dma_addr); + crq.get_vpd.len = cpu_to_be32((u32)adapter->vpd->len); + rc = ibmvnic_send_crq(adapter, &crq); + if (rc) { + kfree(adapter->vpd->buff); + adapter->vpd->buff = NULL; + mutex_unlock(&adapter->fw_lock); + return rc; + } + + rc = ibmvnic_wait_for_completion(adapter, &adapter->fw_done, 10000); + if (rc) { + dev_err(dev, "Unable to retrieve VPD, rc = %d\n", rc); + kfree(adapter->vpd->buff); + adapter->vpd->buff = NULL; + mutex_unlock(&adapter->fw_lock); + return rc; + } + + mutex_unlock(&adapter->fw_lock); + return 0; +} + static int init_resources(struct ibmvnic_adapter *adapter) { struct net_device *netdev = adapter->netdev; - int i, rc; + int rc; rc = set_real_num_queues(netdev); if (rc) return rc; - rc = init_stats_token(adapter); - if (rc) - return rc; - - adapter->map_id = 1; - adapter->napi = kcalloc(adapter->req_rx_queues, - sizeof(struct napi_struct), GFP_KERNEL); - if (!adapter->napi) + adapter->vpd = kzalloc(sizeof(*adapter->vpd), GFP_KERNEL); + if (!adapter->vpd) return -ENOMEM; - for (i = 0; i < adapter->req_rx_queues; i++) { - netif_napi_add(netdev, &adapter->napi[i], ibmvnic_poll, - NAPI_POLL_WEIGHT); + /* Vital Product Data (VPD) */ + rc = ibmvnic_get_vpd(adapter); + if (rc) { + netdev_err(netdev, "failed to initialize Vital Product Data (VPD)\n"); + return rc; } - send_map_query(adapter); + rc = init_napi(adapter); + if (rc) + return rc; + + send_query_map(adapter); rc = init_rx_pools(netdev); if (rc) @@ -802,27 +1898,43 @@ static int __ibmvnic_open(struct net_device *netdev) * set the logical link state to up */ for (i = 0; i < adapter->req_rx_queues; i++) { + netdev_dbg(netdev, "Enabling rx_scrq[%d] irq\n", i); if (prev_state == VNIC_CLOSED) enable_irq(adapter->rx_scrq[i]->irq); - else - enable_scrq_irq(adapter, adapter->rx_scrq[i]); + enable_scrq_irq(adapter, adapter->rx_scrq[i]); } for (i = 0; i < adapter->req_tx_queues; i++) { + netdev_dbg(netdev, "Enabling tx_scrq[%d] irq\n", i); if (prev_state == VNIC_CLOSED) enable_irq(adapter->tx_scrq[i]->irq); - else - enable_scrq_irq(adapter, adapter->tx_scrq[i]); + enable_scrq_irq(adapter, adapter->tx_scrq[i]); + /* netdev_tx_reset_queue will reset dql stats. During NON_FATAL + * resets, don't reset the stats because there could be batched + * skb's waiting to be sent. If we reset dql stats, we risk + * num_completed being greater than num_queued. This will cause + * a BUG_ON in dql_completed(). 
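ibmvnic_get_vpd() above performs the same synchronous firmware round trip twice (size query, then the VPD fetch): take fw_lock, arm the fw_done completion, send the CRQ, wait with a timeout. A condensed sketch of that idiom; fw_request_sync() is a hypothetical wrapper, with send_request standing in for the specific ibmvnic_send_crq() call:

/* Hypothetical wrapper around the fw_lock + completion idiom used by
 * ibmvnic_get_vpd() (and later by __ibmvnic_set_mac()); relies on the
 * driver's existing ibmvnic_wait_for_completion() helper.
 */
static int fw_request_sync(struct ibmvnic_adapter *adapter,
                           int (*send_request)(struct ibmvnic_adapter *))
{
        int rc;

        mutex_lock(&adapter->fw_lock);
        adapter->fw_done_rc = 0;
        reinit_completion(&adapter->fw_done);

        rc = send_request(adapter);
        if (!rc)
                rc = ibmvnic_wait_for_completion(adapter, &adapter->fw_done,
                                                 10000);
        mutex_unlock(&adapter->fw_lock);

        return rc ? rc : adapter->fw_done_rc;
}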
+ */ + if (adapter->reset_reason != VNIC_RESET_NON_FATAL) + netdev_tx_reset_queue(netdev_get_tx_queue(netdev, i)); } rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_UP); if (rc) { - for (i = 0; i < adapter->req_rx_queues; i++) - napi_disable(&adapter->napi[i]); - release_resources(adapter); + ibmvnic_napi_disable(adapter); + ibmvnic_disable_irqs(adapter); return rc; } + adapter->tx_queues_active = true; + + /* Since queues were stopped until now, there shouldn't be any + * one in ibmvnic_complete_tx() or ibmvnic_xmit() so maybe we + * don't need the synchronize_rcu()? Leaving it for consistency + * with setting ->tx_queues_active = false. + */ + synchronize_rcu(); + netif_tx_start_all_queues(netdev); if (prev_state == VNIC_CLOSED) { @@ -839,102 +1951,188 @@ static int ibmvnic_open(struct net_device *netdev) struct ibmvnic_adapter *adapter = netdev_priv(netdev); int rc; - mutex_lock(&adapter->reset_lock); + ASSERT_RTNL(); + + /* If device failover is pending or we are about to reset, just set + * device state and return. Device operation will be handled by reset + * routine. + * + * It should be safe to overwrite the adapter->state here. Since + * we hold the rtnl, either the reset has not actually started or + * the rtnl got dropped during the set_link_state() in do_reset(). + * In the former case, no one else is changing the state (again we + * have the rtnl) and in the latter case, do_reset() will detect and + * honor our setting below. + */ + if (adapter->failover_pending || (test_bit(0, &adapter->resetting))) { + netdev_dbg(netdev, "[S:%s FOP:%d] Resetting, deferring open\n", + adapter_state_to_string(adapter->state), + adapter->failover_pending); + adapter->state = VNIC_OPEN; + rc = 0; + goto out; + } if (adapter->state != VNIC_CLOSED) { rc = ibmvnic_login(netdev); - if (rc) { - mutex_unlock(&adapter->reset_lock); - return rc; - } + if (rc) + goto out; rc = init_resources(adapter); if (rc) { netdev_err(netdev, "failed to initialize resources\n"); - release_resources(adapter); - mutex_unlock(&adapter->reset_lock); - return rc; + goto out; } } rc = __ibmvnic_open(netdev); - mutex_unlock(&adapter->reset_lock); + +out: + /* If open failed and there is a pending failover or in-progress reset, + * set device state and return. Device operation will be handled by + * reset routine. See also comments above regarding rtnl. 
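The comment above about skipping netdev_tx_reset_queue() during NON_FATAL resets can be made concrete with a toy model of the DQL counters (plain userspace C, not driver code): if the counters are zeroed while already-queued skbs are still in flight, their completions push num_completed past num_queued, which is the condition dql_completed() treats as a bug.

/* Toy model only: demonstrates why resetting the dql counters with
 * batched skbs still pending breaks num_completed <= num_queued.
 */
#include <assert.h>

struct toy_dql {
        unsigned int num_queued;
        unsigned int num_completed;
};

static void toy_queued(struct toy_dql *d, unsigned int n)
{
        d->num_queued += n;
}

static void toy_completed(struct toy_dql *d, unsigned int n)
{
        d->num_completed += n;
        /* stand-in for the BUG_ON-style check in dql_completed() */
        assert(d->num_completed <= d->num_queued);
}

int main(void)
{
        struct toy_dql d = { 0, 0 };

        toy_queued(&d, 4);                      /* batched skbs handed to the queue   */
        d.num_queued = d.num_completed = 0;     /* netdev_tx_reset_queue() analogue   */
        toy_completed(&d, 4);                   /* completions arrive: assert fires   */
        return 0;
}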
+ */ + if (rc && + (adapter->failover_pending || (test_bit(0, &adapter->resetting)))) { + adapter->state = VNIC_OPEN; + rc = 0; + } + + if (rc) { + release_resources(adapter); + release_rx_pools(adapter); + release_tx_pools(adapter); + } return rc; } -static void clean_tx_pools(struct ibmvnic_adapter *adapter) +static void clean_rx_pools(struct ibmvnic_adapter *adapter) { - struct ibmvnic_tx_pool *tx_pool; - u64 tx_entries; - int tx_scrqs; + struct ibmvnic_rx_pool *rx_pool; + struct ibmvnic_rx_buff *rx_buff; + u64 rx_entries; + int rx_scrqs; int i, j; - if (!adapter->tx_pool) + if (!adapter->rx_pool) return; - tx_scrqs = be32_to_cpu(adapter->login_rsp_buf->num_txsubm_subcrqs); - tx_entries = adapter->req_tx_entries_per_subcrq; + rx_scrqs = adapter->num_active_rx_pools; + rx_entries = adapter->req_rx_add_entries_per_subcrq; - /* Free any remaining skbs in the tx buffer pools */ - for (i = 0; i < tx_scrqs; i++) { - tx_pool = &adapter->tx_pool[i]; - if (!tx_pool) + /* Free any remaining skbs in the rx buffer pools */ + for (i = 0; i < rx_scrqs; i++) { + rx_pool = &adapter->rx_pool[i]; + if (!rx_pool || !rx_pool->rx_buff) continue; - for (j = 0; j < tx_entries; j++) { - if (tx_pool->tx_buff[j].skb) { - dev_kfree_skb_any(tx_pool->tx_buff[j].skb); - tx_pool->tx_buff[j].skb = NULL; + netdev_dbg(adapter->netdev, "Cleaning rx_pool[%d]\n", i); + for (j = 0; j < rx_entries; j++) { + rx_buff = &rx_pool->rx_buff[j]; + if (rx_buff && rx_buff->skb) { + dev_kfree_skb_any(rx_buff->skb); + rx_buff->skb = NULL; } } } } -static int __ibmvnic_close(struct net_device *netdev) +static void clean_one_tx_pool(struct ibmvnic_adapter *adapter, + struct ibmvnic_tx_pool *tx_pool) { - struct ibmvnic_adapter *adapter = netdev_priv(netdev); - int rc = 0; + struct ibmvnic_tx_buff *tx_buff; + u64 tx_entries; int i; - adapter->state = VNIC_CLOSING; + if (!tx_pool || !tx_pool->tx_buff) + return; - /* ensure that transmissions are stopped if called by do_reset */ - if (adapter->resetting) - netif_tx_disable(netdev); - else - netif_tx_stop_all_queues(netdev); + tx_entries = tx_pool->num_buffers; - ibmvnic_napi_disable(adapter); + for (i = 0; i < tx_entries; i++) { + tx_buff = &tx_pool->tx_buff[i]; + if (tx_buff && tx_buff->skb) { + dev_kfree_skb_any(tx_buff->skb); + tx_buff->skb = NULL; + } + } +} + +static void clean_tx_pools(struct ibmvnic_adapter *adapter) +{ + int tx_scrqs; + int i; + + if (!adapter->tx_pool || !adapter->tso_pool) + return; + + tx_scrqs = adapter->num_active_tx_pools; + + /* Free any remaining skbs in the tx buffer pools */ + for (i = 0; i < tx_scrqs; i++) { + netdev_dbg(adapter->netdev, "Cleaning tx_pool[%d]\n", i); + clean_one_tx_pool(adapter, &adapter->tx_pool[i]); + clean_one_tx_pool(adapter, &adapter->tso_pool[i]); + } +} + +static void ibmvnic_disable_irqs(struct ibmvnic_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + int i; if (adapter->tx_scrq) { for (i = 0; i < adapter->req_tx_queues; i++) - if (adapter->tx_scrq[i]->irq) + if (adapter->tx_scrq[i]->irq) { + netdev_dbg(netdev, + "Disabling tx_scrq[%d] irq\n", i); + disable_scrq_irq(adapter, adapter->tx_scrq[i]); disable_irq(adapter->tx_scrq[i]->irq); + } } - rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN); - if (rc) - return rc; - if (adapter->rx_scrq) { for (i = 0; i < adapter->req_rx_queues; i++) { - int retries = 10; - - while (pending_scrq(adapter, adapter->rx_scrq[i])) { - retries--; - mdelay(100); - - if (retries == 0) - break; - } - - if (adapter->rx_scrq[i]->irq) + if (adapter->rx_scrq[i]->irq) { + 
netdev_dbg(netdev, + "Disabling rx_scrq[%d] irq\n", i); + disable_scrq_irq(adapter, adapter->rx_scrq[i]); disable_irq(adapter->rx_scrq[i]->irq); + } } } +} - clean_tx_pools(adapter); +static void ibmvnic_cleanup(struct net_device *netdev) +{ + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + + /* ensure that transmissions are stopped if called by do_reset */ + + adapter->tx_queues_active = false; + + /* Ensure complete_tx() and ibmvnic_xmit() see ->tx_queues_active + * update so they don't restart a queue after we stop it below. + */ + synchronize_rcu(); + + if (test_bit(0, &adapter->resetting)) + netif_tx_disable(netdev); + else + netif_tx_stop_all_queues(netdev); + + ibmvnic_napi_disable(adapter); + ibmvnic_disable_irqs(adapter); +} + +static int __ibmvnic_close(struct net_device *netdev) +{ + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + int rc = 0; + + adapter->state = VNIC_CLOSING; + rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN); adapter->state = VNIC_CLOSED; return rc; } @@ -944,171 +2142,320 @@ static int ibmvnic_close(struct net_device *netdev) struct ibmvnic_adapter *adapter = netdev_priv(netdev); int rc; - mutex_lock(&adapter->reset_lock); + netdev_dbg(netdev, "[S:%s FOP:%d FRR:%d] Closing\n", + adapter_state_to_string(adapter->state), + adapter->failover_pending, + adapter->force_reset_recovery); + + /* If device failover is pending, just set device state and return. + * Device operation will be handled by reset routine. + */ + if (adapter->failover_pending) { + adapter->state = VNIC_CLOSED; + return 0; + } + rc = __ibmvnic_close(netdev); - mutex_unlock(&adapter->reset_lock); + ibmvnic_cleanup(netdev); + clean_rx_pools(adapter); + clean_tx_pools(adapter); return rc; } /** - * build_hdr_data - creates L2/L3/L4 header data buffer - * @hdr_field - bitfield determining needed headers - * @skb - socket buffer - * @hdr_len - array of header lengths - * @tot_len - total length of data + * get_hdr_lens - fills list of L2/L3/L4 hdr lens + * @hdr_field: bitfield determining needed headers + * @skb: socket buffer + * @hdr_len: array of header lengths to be filled * * Reads hdr_field to determine which headers are needed by firmware. * Builds a buffer containing these headers. Saves individual header * lengths and total buffer length to be used to build descriptors. 
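ibmvnic_cleanup() above quiesces the transmit path with a plain flag plus an RCU grace period rather than a lock: the hot paths read tx_queues_active under rcu_read_lock(), and the cleanup path flips the flag and waits out every such reader before stopping the queues. A trimmed sketch of the two sides, reduced to the synchronization skeleton only:

/* Trimmed sketch: writer side mirrors ibmvnic_cleanup(), reader side
 * mirrors the wake-up check in the tx completion path.
 */
static void tx_quiesce(struct ibmvnic_adapter *adapter)
{
        adapter->tx_queues_active = false;
        synchronize_rcu();              /* wait for readers that saw "true" */
        netif_tx_stop_all_queues(adapter->netdev);
}

static void tx_maybe_wake(struct ibmvnic_adapter *adapter, int queue_num)
{
        rcu_read_lock();
        if (adapter->tx_queues_active)
                netif_wake_subqueue(adapter->netdev, queue_num);
        rcu_read_unlock();
}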
+ * + * Return: total len of all headers */ -static int build_hdr_data(u8 hdr_field, struct sk_buff *skb, - int *hdr_len, u8 *hdr_data) +static int get_hdr_lens(u8 hdr_field, struct sk_buff *skb, + int *hdr_len) { int len = 0; - u8 *hdr; - hdr_len[0] = sizeof(struct ethhdr); + + if ((hdr_field >> 6) & 1) { + hdr_len[0] = skb_mac_header_len(skb); + len += hdr_len[0]; + } + + if ((hdr_field >> 5) & 1) { + hdr_len[1] = skb_network_header_len(skb); + len += hdr_len[1]; + } + + if (!((hdr_field >> 4) & 1)) + return len; if (skb->protocol == htons(ETH_P_IP)) { - hdr_len[1] = ip_hdr(skb)->ihl * 4; if (ip_hdr(skb)->protocol == IPPROTO_TCP) hdr_len[2] = tcp_hdrlen(skb); else if (ip_hdr(skb)->protocol == IPPROTO_UDP) hdr_len[2] = sizeof(struct udphdr); } else if (skb->protocol == htons(ETH_P_IPV6)) { - hdr_len[1] = sizeof(struct ipv6hdr); if (ipv6_hdr(skb)->nexthdr == IPPROTO_TCP) hdr_len[2] = tcp_hdrlen(skb); else if (ipv6_hdr(skb)->nexthdr == IPPROTO_UDP) hdr_len[2] = sizeof(struct udphdr); } - memset(hdr_data, 0, 120); - if ((hdr_field >> 6) & 1) { - hdr = skb_mac_header(skb); - memcpy(hdr_data, hdr, hdr_len[0]); - len += hdr_len[0]; - } - - if ((hdr_field >> 5) & 1) { - hdr = skb_network_header(skb); - memcpy(hdr_data + len, hdr, hdr_len[1]); - len += hdr_len[1]; - } - - if ((hdr_field >> 4) & 1) { - hdr = skb_transport_header(skb); - memcpy(hdr_data + len, hdr, hdr_len[2]); - len += hdr_len[2]; - } - return len; + return len + hdr_len[2]; } /** * create_hdr_descs - create header and header extension descriptors - * @hdr_field - bitfield determining needed headers - * @data - buffer containing header data - * @len - length of data buffer - * @hdr_len - array of individual header lengths - * @scrq_arr - descriptor array + * @hdr_field: bitfield determining needed headers + * @hdr_data: buffer containing header data + * @len: length of data buffer + * @hdr_len: array of individual header lengths + * @scrq_arr: descriptor array * * Creates header and, if needed, header extension descriptors and * places them in a descriptor array, scrq_arr + * + * Return: Number of header descs */ -static void create_hdr_descs(u8 hdr_field, u8 *hdr_data, int len, int *hdr_len, - union sub_crq *scrq_arr) +static int create_hdr_descs(u8 hdr_field, u8 *hdr_data, int len, int *hdr_len, + union sub_crq *scrq_arr) { - union sub_crq hdr_desc; + union sub_crq *hdr_desc; int tmp_len = len; + int num_descs = 0; u8 *data, *cur; int tmp; while (tmp_len > 0) { cur = hdr_data + len - tmp_len; - memset(&hdr_desc, 0, sizeof(hdr_desc)); - if (cur != hdr_data) { - data = hdr_desc.hdr_ext.data; + hdr_desc = &scrq_arr[num_descs]; + if (num_descs) { + data = hdr_desc->hdr_ext.data; tmp = tmp_len > 29 ? 29 : tmp_len; - hdr_desc.hdr_ext.first = IBMVNIC_CRQ_CMD; - hdr_desc.hdr_ext.type = IBMVNIC_HDR_EXT_DESC; - hdr_desc.hdr_ext.len = tmp; + hdr_desc->hdr_ext.first = IBMVNIC_CRQ_CMD; + hdr_desc->hdr_ext.type = IBMVNIC_HDR_EXT_DESC; + hdr_desc->hdr_ext.len = tmp; } else { - data = hdr_desc.hdr.data; + data = hdr_desc->hdr.data; tmp = tmp_len > 24 ? 
24 : tmp_len; - hdr_desc.hdr.first = IBMVNIC_CRQ_CMD; - hdr_desc.hdr.type = IBMVNIC_HDR_DESC; - hdr_desc.hdr.len = tmp; - hdr_desc.hdr.l2_len = (u8)hdr_len[0]; - hdr_desc.hdr.l3_len = cpu_to_be16((u16)hdr_len[1]); - hdr_desc.hdr.l4_len = (u8)hdr_len[2]; - hdr_desc.hdr.flag = hdr_field << 1; + hdr_desc->hdr.first = IBMVNIC_CRQ_CMD; + hdr_desc->hdr.type = IBMVNIC_HDR_DESC; + hdr_desc->hdr.len = tmp; + hdr_desc->hdr.l2_len = (u8)hdr_len[0]; + hdr_desc->hdr.l3_len = cpu_to_be16((u16)hdr_len[1]); + hdr_desc->hdr.l4_len = (u8)hdr_len[2]; + hdr_desc->hdr.flag = hdr_field << 1; } memcpy(data, cur, tmp); tmp_len -= tmp; - *scrq_arr = hdr_desc; - scrq_arr++; + num_descs++; } + + return num_descs; } /** * build_hdr_descs_arr - build a header descriptor array - * @skb - socket buffer - * @num_entries - number of descriptors to be sent - * @subcrq - first TX descriptor - * @hdr_field - bit field determining which headers will be sent + * @skb: tx socket buffer + * @indir_arr: indirect array + * @num_entries: number of descriptors to be sent + * @hdr_field: bit field determining which headers will be sent * * This function will build a TX descriptor array with applicable * L2/L3/L4 packet header descriptors to be sent by send_subcrq_indirect. */ -static void build_hdr_descs_arr(struct ibmvnic_tx_buff *txbuff, +static void build_hdr_descs_arr(struct sk_buff *skb, + union sub_crq *indir_arr, int *num_entries, u8 hdr_field) { int hdr_len[3] = {0, 0, 0}; - int tot_len, len; - u8 *hdr_data = txbuff->hdr_data; + int tot_len; - tot_len = build_hdr_data(hdr_field, txbuff->skb, hdr_len, - txbuff->hdr_data); - len = tot_len; - len -= 24; - if (len > 0) - num_entries += len % 29 ? len / 29 + 1 : len / 29; - create_hdr_descs(hdr_field, hdr_data, tot_len, hdr_len, - txbuff->indir_arr + 1); + tot_len = get_hdr_lens(hdr_field, skb, hdr_len); + *num_entries += create_hdr_descs(hdr_field, skb_mac_header(skb), + tot_len, hdr_len, indir_arr + 1); } -static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) +static int ibmvnic_xmit_workarounds(struct sk_buff *skb, + struct net_device *netdev) +{ + /* For some backing devices, mishandling of small packets + * can result in a loss of connection or TX stall. Device + * architects recommend that no packet should be smaller + * than the minimum MTU value provided to the driver, so + * pad any packets to that length + */ + if (skb->len < netdev->min_mtu) + return skb_put_padto(skb, netdev->min_mtu); + + return 0; +} + +static void ibmvnic_tx_scrq_clean_buffer(struct ibmvnic_adapter *adapter, + struct ibmvnic_sub_crq_queue *tx_scrq) +{ + struct ibmvnic_ind_xmit_queue *ind_bufp; + struct ibmvnic_tx_buff *tx_buff; + struct ibmvnic_tx_pool *tx_pool; + union sub_crq tx_scrq_entry; + int queue_num; + int entries; + int index; + int i; + + ind_bufp = &tx_scrq->ind_buf; + entries = (u64)ind_bufp->index; + queue_num = tx_scrq->pool_index; + + for (i = entries - 1; i >= 0; --i) { + tx_scrq_entry = ind_bufp->indir_arr[i]; + if (tx_scrq_entry.v1.type != IBMVNIC_TX_DESC) + continue; + index = be32_to_cpu(tx_scrq_entry.v1.correlator); + if (index & IBMVNIC_TSO_POOL_MASK) { + tx_pool = &adapter->tso_pool[queue_num]; + index &= ~IBMVNIC_TSO_POOL_MASK; + } else { + tx_pool = &adapter->tx_pool[queue_num]; + } + tx_pool->free_map[tx_pool->consumer_index] = index; + tx_pool->consumer_index = tx_pool->consumer_index == 0 ? 
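The 24/29-byte split implemented by create_hdr_descs() above determines how many descriptors a header-bearing skb costs (the array is only built at all when bit 7 of hdr_field is set, as checked later in ibmvnic_xmit()). A small standalone example in plain C that reproduces the arithmetic for an ordinary TCP/IPv4 header, assuming the 24-byte first descriptor and 29-byte extension payload sizes shown in the code:

/* Standalone arithmetic check for the descriptor split in
 * create_hdr_descs(): first descriptor carries up to 24 header bytes,
 * each extension descriptor up to 29.
 */
#include <stdio.h>

static int num_hdr_descs(int tot_len)
{
        int n = 0;

        while (tot_len > 0) {
                tot_len -= n ? 29 : 24;
                n++;
        }
        return n;
}

int main(void)
{
        /* Ethernet (14) + IPv4 (20) + TCP (20) = 54 header bytes */
        printf("descs needed: %d\n", num_hdr_descs(14 + 20 + 20)); /* 3 */
        return 0;
}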
+ tx_pool->num_buffers - 1 : + tx_pool->consumer_index - 1; + tx_buff = &tx_pool->tx_buff[index]; + adapter->tx_stats_buffers[queue_num].batched_packets--; + adapter->tx_stats_buffers[queue_num].bytes -= + tx_buff->skb->len; + dev_kfree_skb_any(tx_buff->skb); + tx_buff->skb = NULL; + adapter->netdev->stats.tx_dropped++; + } + + ind_bufp->index = 0; + + if (atomic_sub_return(entries, &tx_scrq->used) <= + (adapter->req_tx_entries_per_subcrq / 2) && + __netif_subqueue_stopped(adapter->netdev, queue_num)) { + rcu_read_lock(); + + if (adapter->tx_queues_active) { + netif_wake_subqueue(adapter->netdev, queue_num); + netdev_dbg(adapter->netdev, "Started queue %d\n", + queue_num); + } + + rcu_read_unlock(); + } +} + +static int send_subcrq_direct(struct ibmvnic_adapter *adapter, + u64 remote_handle, u64 *entry) +{ + unsigned int ua = adapter->vdev->unit_address; + struct device *dev = &adapter->vdev->dev; + int rc; + + /* Make sure the hypervisor sees the complete request */ + dma_wmb(); + rc = plpar_hcall_norets(H_SEND_SUB_CRQ, ua, + cpu_to_be64(remote_handle), + cpu_to_be64(entry[0]), cpu_to_be64(entry[1]), + cpu_to_be64(entry[2]), cpu_to_be64(entry[3])); + + if (rc) + print_subcrq_error(dev, rc, __func__); + + return rc; +} + +static int ibmvnic_tx_scrq_flush(struct ibmvnic_adapter *adapter, + struct ibmvnic_sub_crq_queue *tx_scrq, + bool indirect) +{ + struct ibmvnic_ind_xmit_queue *ind_bufp; + u64 dma_addr; + u64 entries; + u64 handle; + int rc; + + ind_bufp = &tx_scrq->ind_buf; + dma_addr = (u64)ind_bufp->indir_dma; + entries = (u64)ind_bufp->index; + handle = tx_scrq->handle; + + if (!entries) + return 0; + + if (indirect) + rc = send_subcrq_indirect(adapter, handle, dma_addr, entries); + else + rc = send_subcrq_direct(adapter, handle, + (u64 *)ind_bufp->indir_arr); + + if (rc) { + dev_err_ratelimited(&adapter->vdev->dev, + "tx_flush failed, rc=%u (%llu entries dma=%pad handle=%llx)\n", + rc, entries, &dma_addr, handle); + /* Detect platform limit H_PARAMETER */ + if (rc == H_PARAMETER) + ibmvnic_set_safe_max_ind_descs(adapter); + + /* For all error case, temporarily drop only this batch + * Rely on TCP/IP retransmissions to retry and recover + */ + ibmvnic_tx_scrq_clean_buffer(adapter, tx_scrq); + } else { + ind_bufp->index = 0; + } + return rc; +} + +static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) { struct ibmvnic_adapter *adapter = netdev_priv(netdev); + u32 cur_max_ind_descs = adapter->cur_max_ind_descs; int queue_num = skb_get_queue_mapping(skb); u8 *hdrs = (u8 *)&adapter->tx_rx_desc_req; struct device *dev = &adapter->vdev->dev; + struct ibmvnic_ind_xmit_queue *ind_bufp; struct ibmvnic_tx_buff *tx_buff = NULL; struct ibmvnic_sub_crq_queue *tx_scrq; + struct ibmvnic_long_term_buff *ltb; struct ibmvnic_tx_pool *tx_pool; unsigned int tx_send_failed = 0; + netdev_tx_t ret = NETDEV_TX_OK; unsigned int tx_map_failed = 0; + union sub_crq indir_arr[16]; unsigned int tx_dropped = 0; - unsigned int tx_packets = 0; + unsigned int tx_dpackets = 0; + unsigned int tx_bpackets = 0; unsigned int tx_bytes = 0; dma_addr_t data_dma_addr; struct netdev_queue *txq; unsigned long lpar_rc; + unsigned int skblen; union sub_crq tx_crq; unsigned int offset; + bool use_scrq_send_direct = false; int num_entries = 1; unsigned char *dst; - u64 *handle_array; - int index = 0; - int ret = 0; + int bufidx = 0; + u8 proto = 0; - if (adapter->resetting) { - if (!netif_subqueue_stopped(netdev, skb)) - netif_stop_subqueue(netdev, queue_num); + /* If a reset is in progress, drop the packet 
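The dma_wmb() in send_subcrq_direct() above exists so that everything the other side will read through DMA, in particular the frame data already copied into the long-term buffer, is visible before the H_SEND_SUB_CRQ handoff. A generic sketch of that ordering; ring_doorbell() is a hypothetical placeholder, not a driver or firmware function:

/* Sketch only: publish DMA-visible payload, then issue the doorbell.
 * ring_doorbell() is a placeholder for the hypervisor call, e.g.
 * plpar_hcall_norets(H_SEND_SUB_CRQ, ...).
 */
static void publish_then_notify(u8 *ltb_dst, const u8 *frame, size_t len)
{
        memcpy(ltb_dst, frame, len);    /* data the peer will DMA-read    */
        dma_wmb();                      /* order the copy before the kick */
        ring_doorbell();                /* hand the descriptor to the peer */
}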
since + * the scrqs may get torn down. Otherwise use the + * rcu to ensure reset waits for us to complete. + */ + rcu_read_lock(); + if (!adapter->tx_queues_active) { dev_kfree_skb_any(skb); tx_send_failed++; @@ -1117,30 +2464,99 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) goto out; } - tx_pool = &adapter->tx_pool[queue_num]; tx_scrq = adapter->tx_scrq[queue_num]; - txq = netdev_get_tx_queue(netdev, skb_get_queue_mapping(skb)); - handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) + - be32_to_cpu(adapter->login_rsp_buf->off_txsubm_subcrqs)); + txq = netdev_get_tx_queue(netdev, queue_num); + ind_bufp = &tx_scrq->ind_buf; + + if (ibmvnic_xmit_workarounds(skb, netdev)) { + tx_dropped++; + tx_send_failed++; + ret = NETDEV_TX_OK; + lpar_rc = ibmvnic_tx_scrq_flush(adapter, tx_scrq, true); + if (lpar_rc != H_SUCCESS) + goto tx_err; + goto out; + } + + if (skb_is_gso(skb)) + tx_pool = &adapter->tso_pool[queue_num]; + else + tx_pool = &adapter->tx_pool[queue_num]; + + bufidx = tx_pool->free_map[tx_pool->consumer_index]; + + if (bufidx == IBMVNIC_INVALID_MAP) { + dev_kfree_skb_any(skb); + tx_send_failed++; + tx_dropped++; + ret = NETDEV_TX_OK; + lpar_rc = ibmvnic_tx_scrq_flush(adapter, tx_scrq, true); + if (lpar_rc != H_SUCCESS) + goto tx_err; + goto out; + } + + tx_pool->free_map[tx_pool->consumer_index] = IBMVNIC_INVALID_MAP; - index = tx_pool->free_map[tx_pool->consumer_index]; - offset = index * adapter->req_mtu; - dst = tx_pool->long_term_buff.buff + offset; - memset(dst, 0, adapter->req_mtu); - skb_copy_from_linear_data(skb, dst, skb->len); - data_dma_addr = tx_pool->long_term_buff.addr + offset; + map_txpool_buf_to_ltb(tx_pool, bufidx, <b, &offset); + + dst = ltb->buff + offset; + memset(dst, 0, tx_pool->buf_size); + data_dma_addr = ltb->addr + offset; + + /* if we are going to send_subcrq_direct this then we need to + * update the checksum before copying the data into ltb. Essentially + * these packets force disable CSO so that we can guarantee that + * FW does not need header info and we can send direct. Also, vnic + * server must be able to xmit standard packets without header data + */ + if (*hdrs == 0 && !skb_is_gso(skb) && + !ind_bufp->index && !netdev_xmit_more()) { + use_scrq_send_direct = true; + if (skb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_help(skb)) + use_scrq_send_direct = false; + } + + if (skb_shinfo(skb)->nr_frags) { + int cur, i; + + /* Copy the head */ + skb_copy_from_linear_data(skb, dst, skb_headlen(skb)); + cur = skb_headlen(skb); + + /* Copy the frags */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + memcpy(dst + cur, skb_frag_address(frag), + skb_frag_size(frag)); + cur += skb_frag_size(frag); + } + } else { + skb_copy_from_linear_data(skb, dst, skb->len); + } tx_pool->consumer_index = - (tx_pool->consumer_index + 1) % - adapter->req_tx_entries_per_subcrq; + (tx_pool->consumer_index + 1) % tx_pool->num_buffers; + + tx_buff = &tx_pool->tx_buff[bufidx]; + + /* Sanity checks on our free map to make sure it points to an index + * that is not being occupied by another skb. If skb memory is + * not freed then we see congestion control kick in and halt tx. + */ + if (unlikely(tx_buff->skb)) { + dev_warn_ratelimited(dev, "TX free map points to untracked skb (%s %d idx=%d)\n", + skb_is_gso(skb) ? 
"tso_pool" : "tx_pool", + queue_num, bufidx); + dev_kfree_skb_any(tx_buff->skb); + } - tx_buff = &tx_pool->tx_buff[index]; tx_buff->skb = skb; - tx_buff->data_dma[0] = data_dma_addr; - tx_buff->data_len[0] = skb->len; - tx_buff->index = index; + tx_buff->index = bufidx; tx_buff->pool_index = queue_num; - tx_buff->last_frag = true; + skblen = skb->len; memset(&tx_crq, 0, sizeof(tx_crq)); tx_crq.v1.first = IBMVNIC_CRQ_CMD; @@ -1148,103 +2564,124 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) tx_crq.v1.n_crq_elem = 1; tx_crq.v1.n_sge = 1; tx_crq.v1.flags1 = IBMVNIC_TX_COMP_NEEDED; - tx_crq.v1.correlator = cpu_to_be32(index); - tx_crq.v1.dma_reg = cpu_to_be16(tx_pool->long_term_buff.map_id); + + if (skb_is_gso(skb)) + tx_crq.v1.correlator = + cpu_to_be32(bufidx | IBMVNIC_TSO_POOL_MASK); + else + tx_crq.v1.correlator = cpu_to_be32(bufidx); + tx_crq.v1.dma_reg = cpu_to_be16(ltb->map_id); tx_crq.v1.sge_len = cpu_to_be32(skb->len); tx_crq.v1.ioba = cpu_to_be64(data_dma_addr); - if (adapter->vlan_header_insertion) { + if (adapter->vlan_header_insertion && skb_vlan_tag_present(skb)) { tx_crq.v1.flags2 |= IBMVNIC_TX_VLAN_INSERT; tx_crq.v1.vlan_id = cpu_to_be16(skb->vlan_tci); } if (skb->protocol == htons(ETH_P_IP)) { - if (ip_hdr(skb)->version == 4) - tx_crq.v1.flags1 |= IBMVNIC_TX_PROT_IPV4; - else if (ip_hdr(skb)->version == 6) - tx_crq.v1.flags1 |= IBMVNIC_TX_PROT_IPV6; - - if (ip_hdr(skb)->protocol == IPPROTO_TCP) - tx_crq.v1.flags1 |= IBMVNIC_TX_PROT_TCP; - else if (ip_hdr(skb)->protocol != IPPROTO_TCP) - tx_crq.v1.flags1 |= IBMVNIC_TX_PROT_UDP; + tx_crq.v1.flags1 |= IBMVNIC_TX_PROT_IPV4; + proto = ip_hdr(skb)->protocol; + } else if (skb->protocol == htons(ETH_P_IPV6)) { + tx_crq.v1.flags1 |= IBMVNIC_TX_PROT_IPV6; + proto = ipv6_hdr(skb)->nexthdr; } + if (proto == IPPROTO_TCP) + tx_crq.v1.flags1 |= IBMVNIC_TX_PROT_TCP; + else if (proto == IPPROTO_UDP) + tx_crq.v1.flags1 |= IBMVNIC_TX_PROT_UDP; + if (skb->ip_summed == CHECKSUM_PARTIAL) { tx_crq.v1.flags1 |= IBMVNIC_TX_CHKSUM_OFFLOAD; hdrs += 2; } - /* determine if l2/3/4 headers are sent to firmware */ - if ((*hdrs >> 7) & 1 && - (skb->protocol == htons(ETH_P_IP) || - skb->protocol == htons(ETH_P_IPV6))) { - build_hdr_descs_arr(tx_buff, &num_entries, *hdrs); - tx_crq.v1.n_crq_elem = num_entries; - tx_buff->indir_arr[0] = tx_crq; - tx_buff->indir_dma = dma_map_single(dev, tx_buff->indir_arr, - sizeof(tx_buff->indir_arr), - DMA_TO_DEVICE); - if (dma_mapping_error(dev, tx_buff->indir_dma)) { - dev_kfree_skb_any(skb); - tx_buff->skb = NULL; - if (!firmware_has_feature(FW_FEATURE_CMO)) - dev_err(dev, "tx: unable to map descriptor array\n"); - tx_map_failed++; - tx_dropped++; - ret = NETDEV_TX_OK; - goto out; - } - lpar_rc = send_subcrq_indirect(adapter, handle_array[queue_num], - (u64)tx_buff->indir_dma, - (u64)num_entries); - } else { - lpar_rc = send_subcrq(adapter, handle_array[queue_num], - &tx_crq); + if (skb_is_gso(skb)) { + tx_crq.v1.flags1 |= IBMVNIC_TX_LSO; + tx_crq.v1.mss = cpu_to_be16(skb_shinfo(skb)->gso_size); + hdrs += 2; + } else if (use_scrq_send_direct) { + /* See above comment, CSO disabled with direct xmit */ + tx_crq.v1.flags1 &= ~(IBMVNIC_TX_CHKSUM_OFFLOAD); + ind_bufp->index = 1; + tx_buff->num_entries = 1; + netdev_tx_sent_queue(txq, skb->len); + ind_bufp->indir_arr[0] = tx_crq; + lpar_rc = ibmvnic_tx_scrq_flush(adapter, tx_scrq, false); + if (lpar_rc != H_SUCCESS) + goto tx_err; + + tx_dpackets++; + goto early_exit; } - if (lpar_rc != H_SUCCESS) { - dev_err(dev, "tx failed with code %ld\n", lpar_rc); 
- if (tx_pool->consumer_index == 0) - tx_pool->consumer_index = - adapter->req_tx_entries_per_subcrq - 1; - else - tx_pool->consumer_index--; + if ((*hdrs >> 7) & 1) + build_hdr_descs_arr(skb, indir_arr, &num_entries, *hdrs); - dev_kfree_skb_any(skb); - tx_buff->skb = NULL; + tx_crq.v1.n_crq_elem = num_entries; + tx_buff->num_entries = num_entries; + /* flush buffer if current entry can not fit */ + if (num_entries + ind_bufp->index > cur_max_ind_descs) { + lpar_rc = ibmvnic_tx_scrq_flush(adapter, tx_scrq, true); + if (lpar_rc != H_SUCCESS) + goto tx_flush_err; + } - if (lpar_rc == H_CLOSED) { - /* Disable TX and report carrier off if queue is closed. - * Firmware guarantees that a signal will be sent to the - * driver, triggering a reset or some other action. - */ - netif_tx_stop_all_queues(netdev); - netif_carrier_off(netdev); - } + indir_arr[0] = tx_crq; + memcpy(&ind_bufp->indir_arr[ind_bufp->index], &indir_arr[0], + num_entries * sizeof(struct ibmvnic_generic_scrq)); - tx_send_failed++; - tx_dropped++; - ret = NETDEV_TX_OK; - goto out; + ind_bufp->index += num_entries; + if (__netdev_tx_sent_queue(txq, skb->len, + netdev_xmit_more() && + ind_bufp->index < cur_max_ind_descs)) { + lpar_rc = ibmvnic_tx_scrq_flush(adapter, tx_scrq, true); + if (lpar_rc != H_SUCCESS) + goto tx_err; } - if (atomic_inc_return(&tx_scrq->used) + tx_bpackets++; + +early_exit: + if (atomic_add_return(num_entries, &tx_scrq->used) >= adapter->req_tx_entries_per_subcrq) { - netdev_info(netdev, "Stopping queue %d\n", queue_num); + netdev_dbg(netdev, "Stopping queue %d\n", queue_num); netif_stop_subqueue(netdev, queue_num); } - tx_packets++; - tx_bytes += skb->len; - txq->trans_start = jiffies; + tx_bytes += skblen; + txq_trans_cond_update(txq); ret = NETDEV_TX_OK; + goto out; +tx_flush_err: + dev_kfree_skb_any(skb); + tx_buff->skb = NULL; + tx_pool->consumer_index = tx_pool->consumer_index == 0 ? + tx_pool->num_buffers - 1 : + tx_pool->consumer_index - 1; + tx_dropped++; +tx_err: + if (lpar_rc != H_CLOSED && lpar_rc != H_PARAMETER) + dev_err_ratelimited(dev, "tx: send failed\n"); + + if (lpar_rc == H_CLOSED || adapter->failover_pending) { + /* Disable TX and report carrier off if queue is closed + * or pending failover. + * Firmware guarantees that a signal will be sent to the + * driver, triggering a reset or some other action. 
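The correlator written a few lines up in ibmvnic_xmit() doubles as a pool selector: TSO buffers get IBMVNIC_TSO_POOL_MASK OR-ed into their index, and the completion/cleanup paths (see ibmvnic_tx_scrq_clean_buffer() earlier) strip it again to find the right pool. A toy round trip of that encoding in plain C, with 0x80000000 used purely as an illustrative mask value rather than the driver's definition:

/* Toy encode/decode of an index tagged with a pool-selector bit. */
#include <assert.h>
#include <stdint.h>

#define TOY_TSO_POOL_MASK 0x80000000u   /* illustrative value only */

int main(void)
{
        uint32_t bufidx = 42, correlator;

        correlator = bufidx | TOY_TSO_POOL_MASK;          /* xmit: tag TSO buffer  */
        assert(correlator & TOY_TSO_POOL_MASK);           /* completion: TSO pool? */
        assert((correlator & ~TOY_TSO_POOL_MASK) == 42);  /* recover buffer index  */
        return 0;
}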
+ */ + netif_tx_stop_all_queues(netdev); + netif_carrier_off(netdev); + } out: - netdev->stats.tx_dropped += tx_dropped; - netdev->stats.tx_bytes += tx_bytes; - netdev->stats.tx_packets += tx_packets; + rcu_read_unlock(); adapter->tx_send_failed += tx_send_failed; adapter->tx_map_failed += tx_map_failed; + adapter->tx_stats_buffers[queue_num].batched_packets += tx_bpackets; + adapter->tx_stats_buffers[queue_num].direct_packets += tx_dpackets; + adapter->tx_stats_buffers[queue_num].bytes += tx_bytes; + adapter->tx_stats_buffers[queue_num].dropped_packets += tx_dropped; return ret; } @@ -1292,25 +2729,99 @@ static void ibmvnic_set_multi(struct net_device *netdev) } } -static int ibmvnic_set_mac(struct net_device *netdev, void *p) +static int __ibmvnic_set_mac(struct net_device *netdev, u8 *dev_addr) { struct ibmvnic_adapter *adapter = netdev_priv(netdev); - struct sockaddr *addr = p; union ibmvnic_crq crq; + int rc; - if (!is_valid_ether_addr(addr->sa_data)) - return -EADDRNOTAVAIL; + if (!is_valid_ether_addr(dev_addr)) { + rc = -EADDRNOTAVAIL; + goto err; + } memset(&crq, 0, sizeof(crq)); crq.change_mac_addr.first = IBMVNIC_CRQ_CMD; crq.change_mac_addr.cmd = CHANGE_MAC_ADDR; - ether_addr_copy(&crq.change_mac_addr.mac_addr[0], addr->sa_data); - ibmvnic_send_crq(adapter, &crq); + ether_addr_copy(&crq.change_mac_addr.mac_addr[0], dev_addr); + + mutex_lock(&adapter->fw_lock); + adapter->fw_done_rc = 0; + reinit_completion(&adapter->fw_done); + + rc = ibmvnic_send_crq(adapter, &crq); + if (rc) { + rc = -EIO; + mutex_unlock(&adapter->fw_lock); + goto err; + } + + rc = ibmvnic_wait_for_completion(adapter, &adapter->fw_done, 10000); /* netdev->dev_addr is changed in handle_change_mac_rsp function */ + if (rc || adapter->fw_done_rc) { + rc = -EIO; + mutex_unlock(&adapter->fw_lock); + goto err; + } + mutex_unlock(&adapter->fw_lock); return 0; +err: + ether_addr_copy(adapter->mac_addr, netdev->dev_addr); + return rc; } -/** +static int ibmvnic_set_mac(struct net_device *netdev, void *p) +{ + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + struct sockaddr *addr = p; + int rc; + + rc = 0; + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + ether_addr_copy(adapter->mac_addr, addr->sa_data); + if (adapter->state != VNIC_PROBED) + rc = __ibmvnic_set_mac(netdev, addr->sa_data); + + return rc; +} + +static const char *reset_reason_to_string(enum ibmvnic_reset_reason reason) +{ + switch (reason) { + case VNIC_RESET_FAILOVER: + return "FAILOVER"; + case VNIC_RESET_MOBILITY: + return "MOBILITY"; + case VNIC_RESET_FATAL: + return "FATAL"; + case VNIC_RESET_NON_FATAL: + return "NON_FATAL"; + case VNIC_RESET_TIMEOUT: + return "TIMEOUT"; + case VNIC_RESET_CHANGE_PARAM: + return "CHANGE_PARAM"; + case VNIC_RESET_PASSIVE_INIT: + return "PASSIVE_INIT"; + } + return "UNKNOWN"; +} + +/* + * Initialize the init_done completion and return code values. We + * can get a transport event just after registering the CRQ and the + * tasklet will use this to communicate the transport event. To ensure + * we don't miss the notification/error, initialize these _before_ + * regisering the CRQ. + */ +static inline void reinit_init_done(struct ibmvnic_adapter *adapter) +{ + reinit_completion(&adapter->init_done); + adapter->init_done_rc = 0; +} + +/* * do_reset returns zero if we are able to keep processing reset events, or * non-zero if we hit a fatal error and must halt. 
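The ordering called out in the reinit_init_done() comment above matters because the CRQ tasklet may deliver a transport event immediately after registration; if the completion were re-armed only afterwards, that event could be missed. A condensed sketch of the required order, with register_crq() as a hypothetical stand-in for the driver's CRQ (re)initialization call:

/* Sketch only: arm the completion and clear the rc before opening the
 * event source, never after.  register_crq() is a placeholder.
 */
static int probe_like_init(struct ibmvnic_adapter *adapter)
{
        reinit_completion(&adapter->init_done); /* arm first ...          */
        adapter->init_done_rc = 0;

        return register_crq(adapter);           /* ... then start the CRQ */
}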
*/ @@ -1318,20 +2829,97 @@ static int do_reset(struct ibmvnic_adapter *adapter, struct ibmvnic_rwi *rwi, u32 reset_state) { struct net_device *netdev = adapter->netdev; - int i, rc; + u64 old_num_rx_queues, old_num_tx_queues; + u64 old_num_rx_slots, old_num_tx_slots; + int rc; + + netdev_dbg(adapter->netdev, + "[S:%s FOP:%d] Reset reason: %s, reset_state: %s\n", + adapter_state_to_string(adapter->state), + adapter->failover_pending, + reset_reason_to_string(rwi->reset_reason), + adapter_state_to_string(reset_state)); - netif_carrier_off(netdev); adapter->reset_reason = rwi->reset_reason; + /* requestor of VNIC_RESET_CHANGE_PARAM already has the rtnl lock */ + if (!(adapter->reset_reason == VNIC_RESET_CHANGE_PARAM)) + rtnl_lock(); - if (rwi->reset_reason == VNIC_RESET_MOBILITY) { - rc = ibmvnic_reenable_crq_queue(adapter); - if (rc) - return 0; + /* Now that we have the rtnl lock, clear any pending failover. + * This will ensure ibmvnic_open() has either completed or will + * block until failover is complete. + */ + if (rwi->reset_reason == VNIC_RESET_FAILOVER) + adapter->failover_pending = false; + + /* read the state and check (again) after getting rtnl */ + reset_state = adapter->state; + + if (reset_state == VNIC_REMOVING || reset_state == VNIC_REMOVED) { + rc = -EBUSY; + goto out; } - rc = __ibmvnic_close(netdev); - if (rc) - return rc; + netif_carrier_off(netdev); + + old_num_rx_queues = adapter->req_rx_queues; + old_num_tx_queues = adapter->req_tx_queues; + old_num_rx_slots = adapter->req_rx_add_entries_per_subcrq; + old_num_tx_slots = adapter->req_tx_entries_per_subcrq; + + ibmvnic_cleanup(netdev); + + if (reset_state == VNIC_OPEN && + adapter->reset_reason != VNIC_RESET_MOBILITY && + adapter->reset_reason != VNIC_RESET_FAILOVER) { + if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM) { + rc = __ibmvnic_close(netdev); + if (rc) + goto out; + } else { + adapter->state = VNIC_CLOSING; + + /* Release the RTNL lock before link state change and + * re-acquire after the link state change to allow + * linkwatch_event to grab the RTNL lock and run during + * a reset. + */ + rtnl_unlock(); + rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN); + rtnl_lock(); + if (rc) + goto out; + + if (adapter->state == VNIC_OPEN) { + /* When we dropped rtnl, ibmvnic_open() got + * it and noticed that we are resetting and + * set the adapter state to OPEN. Update our + * new "target" state, and resume the reset + * from VNIC_CLOSING state. + */ + netdev_dbg(netdev, + "Open changed state from %s, updating.\n", + adapter_state_to_string(reset_state)); + reset_state = VNIC_OPEN; + adapter->state = VNIC_CLOSING; + } + + if (adapter->state != VNIC_CLOSING) { + /* If someone else changed the adapter state + * when we dropped the rtnl, fail the reset + */ + rc = -EAGAIN; + goto out; + } + adapter->state = VNIC_CLOSED; + } + } + + if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM) { + release_resources(adapter); + release_sub_crqs(adapter, 1); + release_crq_queue(adapter); + } if (adapter->reset_reason != VNIC_RESET_NON_FATAL) { /* remove the closed state so when we call open it appears @@ -1339,61 +2927,207 @@ static int do_reset(struct ibmvnic_adapter *adapter, */ adapter->state = VNIC_PROBED; - rc = ibmvnic_init(adapter); - if (rc) - return 0; + reinit_init_done(adapter); - /* If the adapter was in PROBE state prior to the reset, - * exit here. 
- */ - if (reset_state == VNIC_PROBED) - return 0; + if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM) { + rc = init_crq_queue(adapter); + } else if (adapter->reset_reason == VNIC_RESET_MOBILITY) { + rc = ibmvnic_reenable_crq_queue(adapter); + release_sub_crqs(adapter, 1); + } else { + rc = ibmvnic_reset_crq(adapter); + if (rc == H_CLOSED || rc == H_SUCCESS) { + rc = vio_enable_interrupts(adapter->vdev); + if (rc) + netdev_err(adapter->netdev, + "Reset failed to enable interrupts. rc=%d\n", + rc); + } + } - rc = ibmvnic_login(netdev); if (rc) { - adapter->state = VNIC_PROBED; - return 0; + netdev_err(adapter->netdev, + "Reset couldn't initialize crq. rc=%d\n", rc); + goto out; } - rc = reset_tx_pools(adapter); + rc = ibmvnic_reset_init(adapter, true); if (rc) - return rc; + goto out; + + /* If the adapter was in PROBE or DOWN state prior to the reset, + * exit here. + */ + if (reset_state == VNIC_PROBED || reset_state == VNIC_DOWN) { + rc = 0; + goto out; + } - rc = reset_rx_pools(adapter); + rc = ibmvnic_login(netdev); if (rc) - return rc; + goto out; - if (reset_state == VNIC_CLOSED) - return 0; + if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM) { + rc = init_resources(adapter); + if (rc) + goto out; + } else if (adapter->req_rx_queues != old_num_rx_queues || + adapter->req_tx_queues != old_num_tx_queues || + adapter->req_rx_add_entries_per_subcrq != + old_num_rx_slots || + adapter->req_tx_entries_per_subcrq != + old_num_tx_slots || + !adapter->rx_pool || + !adapter->tso_pool || + !adapter->tx_pool) { + release_napi(adapter); + release_vpd_data(adapter); + + rc = init_resources(adapter); + if (rc) + goto out; + + } else { + rc = init_tx_pools(netdev); + if (rc) { + netdev_dbg(netdev, + "init tx pools failed (%d)\n", + rc); + goto out; + } + + rc = init_rx_pools(netdev); + if (rc) { + netdev_dbg(netdev, + "init rx pools failed (%d)\n", + rc); + goto out; + } + } + ibmvnic_disable_irqs(adapter); + } + adapter->state = VNIC_CLOSED; + + if (reset_state == VNIC_CLOSED) { + rc = 0; + goto out; } rc = __ibmvnic_open(netdev); if (rc) { - if (list_empty(&adapter->rwi_list)) - adapter->state = VNIC_CLOSED; - else - adapter->state = reset_state; + rc = IBMVNIC_OPEN_FAILED; + goto out; + } - return 0; + /* refresh device's multicast list */ + ibmvnic_set_multi(netdev); + + if (adapter->reset_reason == VNIC_RESET_FAILOVER || + adapter->reset_reason == VNIC_RESET_MOBILITY) + __netdev_notify_peers(netdev); + + rc = 0; + +out: + /* restore the adapter state if reset failed */ + if (rc) + adapter->state = reset_state; + /* requestor of VNIC_RESET_CHANGE_PARAM should still hold the rtnl lock */ + if (!(adapter->reset_reason == VNIC_RESET_CHANGE_PARAM)) + rtnl_unlock(); + + netdev_dbg(adapter->netdev, "[S:%s FOP:%d] Reset done, rc %d\n", + adapter_state_to_string(adapter->state), + adapter->failover_pending, rc); + return rc; +} + +static int do_hard_reset(struct ibmvnic_adapter *adapter, + struct ibmvnic_rwi *rwi, u32 reset_state) +{ + struct net_device *netdev = adapter->netdev; + int rc; + + netdev_dbg(adapter->netdev, "Hard resetting driver (%s)\n", + reset_reason_to_string(rwi->reset_reason)); + + /* read the state and check (again) after getting rtnl */ + reset_state = adapter->state; + + if (reset_state == VNIC_REMOVING || reset_state == VNIC_REMOVED) { + rc = -EBUSY; + goto out; } - netif_carrier_on(netdev); + netif_carrier_off(netdev); + adapter->reset_reason = rwi->reset_reason; - /* kick napi */ - for (i = 0; i < adapter->req_rx_queues; i++) - napi_schedule(&adapter->napi[i]); + 
ibmvnic_cleanup(netdev); + release_resources(adapter); + release_sub_crqs(adapter, 0); + release_crq_queue(adapter); + + /* remove the closed state so when we call open it appears + * we are coming from the probed state. + */ + adapter->state = VNIC_PROBED; - if (adapter->reset_reason != VNIC_RESET_FAILOVER) - netdev_notify_peers(netdev); + reinit_init_done(adapter); - return 0; + rc = init_crq_queue(adapter); + if (rc) { + netdev_err(adapter->netdev, + "Couldn't initialize crq. rc=%d\n", rc); + goto out; + } + + rc = ibmvnic_reset_init(adapter, false); + if (rc) + goto out; + + /* If the adapter was in PROBE or DOWN state prior to the reset, + * exit here. + */ + if (reset_state == VNIC_PROBED || reset_state == VNIC_DOWN) + goto out; + + rc = ibmvnic_login(netdev); + if (rc) + goto out; + + rc = init_resources(adapter); + if (rc) + goto out; + + ibmvnic_disable_irqs(adapter); + adapter->state = VNIC_CLOSED; + + if (reset_state == VNIC_CLOSED) + goto out; + + rc = __ibmvnic_open(netdev); + if (rc) { + rc = IBMVNIC_OPEN_FAILED; + goto out; + } + + __netdev_notify_peers(netdev); +out: + /* restore adapter state if reset failed */ + if (rc) + adapter->state = reset_state; + netdev_dbg(adapter->netdev, "[S:%s FOP:%d] Hard reset done, rc %d\n", + adapter_state_to_string(adapter->state), + adapter->failover_pending, rc); + return rc; } static struct ibmvnic_rwi *get_next_rwi(struct ibmvnic_adapter *adapter) { struct ibmvnic_rwi *rwi; + unsigned long flags; - mutex_lock(&adapter->rwi_lock); + spin_lock_irqsave(&adapter->rwi_lock, flags); if (!list_empty(&adapter->rwi_list)) { rwi = list_first_entry(&adapter->rwi_list, struct ibmvnic_rwi, @@ -1403,103 +3137,382 @@ static struct ibmvnic_rwi *get_next_rwi(struct ibmvnic_adapter *adapter) rwi = NULL; } - mutex_unlock(&adapter->rwi_lock); + spin_unlock_irqrestore(&adapter->rwi_lock, flags); return rwi; } -static void free_all_rwi(struct ibmvnic_adapter *adapter) +/** + * do_passive_init - complete probing when partner device is detected. + * @adapter: ibmvnic_adapter struct + * + * If the ibmvnic device does not have a partner device to communicate with at boot + * and that partner device comes online at a later time, this function is called + * to complete the initialization process of ibmvnic device. + * Caller is expected to hold rtnl_lock(). + * + * Returns non-zero if sub-CRQs are not initialized properly leaving the device + * in the down state. + * Returns 0 upon success and the device is in PROBED state. 
+ */ + +static int do_passive_init(struct ibmvnic_adapter *adapter) { - struct ibmvnic_rwi *rwi; + unsigned long timeout = msecs_to_jiffies(30000); + struct net_device *netdev = adapter->netdev; + struct device *dev = &adapter->vdev->dev; + int rc; - rwi = get_next_rwi(adapter); - while (rwi) { - kfree(rwi); - rwi = get_next_rwi(adapter); + netdev_dbg(netdev, "Partner device found, probing.\n"); + + adapter->state = VNIC_PROBING; + reinit_completion(&adapter->init_done); + adapter->init_done_rc = 0; + adapter->crq.active = true; + + rc = send_crq_init_complete(adapter); + if (rc) + goto out; + + rc = send_version_xchg(adapter); + if (rc) + netdev_dbg(adapter->netdev, "send_version_xchg failed, rc=%d\n", rc); + + if (!wait_for_completion_timeout(&adapter->init_done, timeout)) { + dev_err(dev, "Initialization sequence timed out\n"); + rc = -ETIMEDOUT; + goto out; } + + rc = init_sub_crqs(adapter); + if (rc) { + dev_err(dev, "Initialization of sub crqs failed, rc=%d\n", rc); + goto out; + } + + rc = init_sub_crq_irqs(adapter); + if (rc) { + dev_err(dev, "Failed to initialize sub crq irqs\n, rc=%d", rc); + goto init_failed; + } + + netdev->mtu = adapter->req_mtu - ETH_HLEN; + netdev->min_mtu = adapter->min_mtu - ETH_HLEN; + netdev->max_mtu = adapter->max_mtu - ETH_HLEN; + + adapter->state = VNIC_PROBED; + netdev_dbg(netdev, "Probed successfully. Waiting for signal from partner device.\n"); + + return 0; + +init_failed: + release_sub_crqs(adapter, 1); +out: + adapter->state = VNIC_DOWN; + return rc; } static void __ibmvnic_reset(struct work_struct *work) { - struct ibmvnic_rwi *rwi; struct ibmvnic_adapter *adapter; - struct net_device *netdev; + unsigned int timeout = 5000; + struct ibmvnic_rwi *tmprwi; + bool saved_state = false; + struct ibmvnic_rwi *rwi; + unsigned long flags; + struct device *dev; + bool need_reset; + int num_fails = 0; u32 reset_state; - int rc; + int rc = 0; adapter = container_of(work, struct ibmvnic_adapter, ibmvnic_reset); - netdev = adapter->netdev; + dev = &adapter->vdev->dev; + + /* Wait for ibmvnic_probe() to complete. If probe is taking too long + * or if another reset is in progress, defer work for now. If probe + * eventually fails it will flush and terminate our work. + * + * Three possibilities here: + * 1. Adpater being removed - just return + * 2. Timed out on probe or another reset in progress - delay the work + * 3. Completed probe - perform any resets in queue + */ + if (adapter->state == VNIC_PROBING && + !wait_for_completion_timeout(&adapter->probe_done, timeout)) { + dev_err(dev, "Reset thread timed out on probe"); + queue_delayed_work(system_long_wq, + &adapter->ibmvnic_delayed_reset, + IBMVNIC_RESET_DELAY); + return; + } - mutex_lock(&adapter->reset_lock); - adapter->resetting = true; - reset_state = adapter->state; + /* adapter is done with probe (i.e state is never VNIC_PROBING now) */ + if (adapter->state == VNIC_REMOVING) + return; + + /* ->rwi_list is stable now (no one else is removing entries) */ + + /* ibmvnic_probe() may have purged the reset queue after we were + * scheduled to process a reset so there maybe no resets to process. + * Before setting the ->resetting bit though, we have to make sure + * that there is infact a reset to process. 
Otherwise we may race + * with ibmvnic_open() and end up leaving the vnic down: + * + * __ibmvnic_reset() ibmvnic_open() + * ----------------- -------------- + * + * set ->resetting bit + * find ->resetting bit is set + * set ->state to IBMVNIC_OPEN (i.e + * assume reset will open device) + * return + * find reset queue empty + * return + * + * Neither performed vnic login/open and vnic stays down + * + * If we hold the lock and conditionally set the bit, either we + * or ibmvnic_open() will complete the open. + */ + need_reset = false; + spin_lock(&adapter->rwi_lock); + if (!list_empty(&adapter->rwi_list)) { + if (test_and_set_bit_lock(0, &adapter->resetting)) { + queue_delayed_work(system_long_wq, + &adapter->ibmvnic_delayed_reset, + IBMVNIC_RESET_DELAY); + } else { + need_reset = true; + } + } + spin_unlock(&adapter->rwi_lock); + + if (!need_reset) + return; rwi = get_next_rwi(adapter); while (rwi) { - rc = do_reset(adapter, rwi, reset_state); - kfree(rwi); - if (rc) + spin_lock_irqsave(&adapter->state_lock, flags); + + if (adapter->state == VNIC_REMOVING || + adapter->state == VNIC_REMOVED) { + spin_unlock_irqrestore(&adapter->state_lock, flags); + kfree(rwi); + rc = EBUSY; break; + } + + if (!saved_state) { + reset_state = adapter->state; + saved_state = true; + } + spin_unlock_irqrestore(&adapter->state_lock, flags); + + if (rwi->reset_reason == VNIC_RESET_PASSIVE_INIT) { + rtnl_lock(); + rc = do_passive_init(adapter); + rtnl_unlock(); + if (!rc) + netif_carrier_on(adapter->netdev); + } else if (adapter->force_reset_recovery) { + /* Since we are doing a hard reset now, clear the + * failover_pending flag so we don't ignore any + * future MOBILITY or other resets. + */ + adapter->failover_pending = false; + + /* Transport event occurred during previous reset */ + if (adapter->wait_for_reset) { + /* Previous was CHANGE_PARAM; caller locked */ + adapter->force_reset_recovery = false; + rc = do_hard_reset(adapter, rwi, reset_state); + } else { + rtnl_lock(); + adapter->force_reset_recovery = false; + rc = do_hard_reset(adapter, rwi, reset_state); + rtnl_unlock(); + } + if (rc) + num_fails++; + else + num_fails = 0; + + /* If auto-priority-failover is enabled we can get + * back to back failovers during resets, resulting + * in at least two failed resets (from high-priority + * backing device to low-priority one and then back) + * If resets continue to fail beyond that, give the + * adapter some time to settle down before retrying. + */ + if (num_fails >= 3) { + netdev_dbg(adapter->netdev, + "[S:%s] Hard reset failed %d times, waiting 60 secs\n", + adapter_state_to_string(adapter->state), + num_fails); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(60 * HZ); + } + } else { + rc = do_reset(adapter, rwi, reset_state); + } + tmprwi = rwi; + adapter->last_reset_time = jiffies; + + if (rc) + netdev_dbg(adapter->netdev, "Reset failed, rc=%d\n", rc); rwi = get_next_rwi(adapter); + + /* + * If there are no resets queued and the previous reset failed, + * the adapter would be in an undefined state. So retry the + * previous reset as a hard reset. + * + * Else, free the previous rwi and, if there is another reset + * queued, process the new reset even if previous reset failed + * (the previous reset could have failed because of a fail + * over for instance, so process the fail over). 
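The race diagram above boils down to a single rule: the ->resetting bit may only be set after confirming, under rwi_lock, that there really is queued work. A condensed form of that claim step follows; the real code additionally re-queues itself as delayed work when another reset already holds the bit:

/* Condensed: both the emptiness check and the bit claim happen under
 * rwi_lock, so ibmvnic_open() can never observe the bit set while the
 * reset queue is in fact empty.
 */
static bool claim_reset_work(struct ibmvnic_adapter *adapter)
{
        bool claimed = false;

        spin_lock(&adapter->rwi_lock);
        if (!list_empty(&adapter->rwi_list) &&
            !test_and_set_bit_lock(0, &adapter->resetting))
                claimed = true;
        spin_unlock(&adapter->rwi_lock);

        return claimed;
}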
+ */ + if (!rwi && rc) + rwi = tmprwi; + else + kfree(tmprwi); + + if (rwi && (rwi->reset_reason == VNIC_RESET_FAILOVER || + rwi->reset_reason == VNIC_RESET_MOBILITY || rc)) + adapter->force_reset_recovery = true; } - if (rc) { - free_all_rwi(adapter); - mutex_unlock(&adapter->reset_lock); - return; + if (adapter->wait_for_reset) { + adapter->reset_done_rc = rc; + complete(&adapter->reset_done); } - adapter->resetting = false; - mutex_unlock(&adapter->reset_lock); + clear_bit_unlock(0, &adapter->resetting); + + netdev_dbg(adapter->netdev, + "[S:%s FRR:%d WFR:%d] Done processing resets\n", + adapter_state_to_string(adapter->state), + adapter->force_reset_recovery, + adapter->wait_for_reset); } -static void ibmvnic_reset(struct ibmvnic_adapter *adapter, - enum ibmvnic_reset_reason reason) +static void __ibmvnic_delayed_reset(struct work_struct *work) { - struct ibmvnic_rwi *rwi, *tmp; - struct net_device *netdev = adapter->netdev; - struct list_head *entry; + struct ibmvnic_adapter *adapter; - if (adapter->state == VNIC_REMOVING || - adapter->state == VNIC_REMOVED) { - netdev_dbg(netdev, "Adapter removing, skipping reset\n"); - return; - } + adapter = container_of(work, struct ibmvnic_adapter, + ibmvnic_delayed_reset.work); + __ibmvnic_reset(&adapter->ibmvnic_reset); +} - if (adapter->state == VNIC_PROBING) { - netdev_warn(netdev, "Adapter reset during probe\n"); - adapter->init_done_rc = EAGAIN; - return; +static void flush_reset_queue(struct ibmvnic_adapter *adapter) +{ + struct list_head *entry, *tmp_entry; + + if (!list_empty(&adapter->rwi_list)) { + list_for_each_safe(entry, tmp_entry, &adapter->rwi_list) { + list_del(entry); + kfree(list_entry(entry, struct ibmvnic_rwi, list)); + } } +} + +static int ibmvnic_reset(struct ibmvnic_adapter *adapter, + enum ibmvnic_reset_reason reason) +{ + struct net_device *netdev = adapter->netdev; + struct ibmvnic_rwi *rwi, *tmp; + unsigned long flags; + int ret; - mutex_lock(&adapter->rwi_lock); + spin_lock_irqsave(&adapter->rwi_lock, flags); - list_for_each(entry, &adapter->rwi_list) { - tmp = list_entry(entry, struct ibmvnic_rwi, list); + /* If failover is pending don't schedule any other reset. + * Instead let the failover complete. If there is already a + * a failover reset scheduled, we will detect and drop the + * duplicate reset when walking the ->rwi_list below. 
+ */ + if (adapter->state == VNIC_REMOVING || + adapter->state == VNIC_REMOVED || + (adapter->failover_pending && reason != VNIC_RESET_FAILOVER)) { + ret = EBUSY; + netdev_dbg(netdev, "Adapter removing or pending failover, skipping reset\n"); + goto err; + } + + list_for_each_entry(tmp, &adapter->rwi_list, list) { if (tmp->reset_reason == reason) { - netdev_err(netdev, "Matching reset found, skipping\n"); - mutex_unlock(&adapter->rwi_lock); - return; + netdev_dbg(netdev, "Skipping matching reset, reason=%s\n", + reset_reason_to_string(reason)); + ret = EBUSY; + goto err; } } - rwi = kzalloc(sizeof(*rwi), GFP_KERNEL); + rwi = kzalloc(sizeof(*rwi), GFP_ATOMIC); if (!rwi) { - mutex_unlock(&adapter->rwi_lock); - ibmvnic_close(netdev); - return; + ret = ENOMEM; + goto err; } + /* if we just received a transport event, + * flush reset queue and process this reset + */ + if (adapter->force_reset_recovery) + flush_reset_queue(adapter); rwi->reset_reason = reason; list_add_tail(&rwi->list, &adapter->rwi_list); - mutex_unlock(&adapter->rwi_lock); - schedule_work(&adapter->ibmvnic_reset); + netdev_dbg(adapter->netdev, "Scheduling reset (reason %s)\n", + reset_reason_to_string(reason)); + queue_work(system_long_wq, &adapter->ibmvnic_reset); + + ret = 0; +err: + /* ibmvnic_close() below can block, so drop the lock first */ + spin_unlock_irqrestore(&adapter->rwi_lock, flags); + + if (ret == ENOMEM) + ibmvnic_close(netdev); + + return -ret; } -static void ibmvnic_tx_timeout(struct net_device *dev) +static void ibmvnic_get_stats64(struct net_device *netdev, + struct rtnl_link_stats64 *stats) +{ + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + int i; + + for (i = 0; i < adapter->req_rx_queues; i++) { + stats->rx_packets += adapter->rx_stats_buffers[i].packets; + stats->rx_bytes += adapter->rx_stats_buffers[i].bytes; + } + + for (i = 0; i < adapter->req_tx_queues; i++) { + stats->tx_packets += adapter->tx_stats_buffers[i].batched_packets; + stats->tx_packets += adapter->tx_stats_buffers[i].direct_packets; + stats->tx_bytes += adapter->tx_stats_buffers[i].bytes; + stats->tx_dropped += adapter->tx_stats_buffers[i].dropped_packets; + } +} + +static void ibmvnic_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct ibmvnic_adapter *adapter = netdev_priv(dev); + if (test_bit(0, &adapter->resetting)) { + netdev_err(adapter->netdev, + "Adapter is resetting, skip timeout reset\n"); + return; + } + /* No queuing up reset until at least 5 seconds (default watchdog val) + * after last reset + */ + if (time_before(jiffies, (adapter->last_reset_time + dev->watchdog_timeo))) { + netdev_dbg(dev, "Not yet time to tx timeout.\n"); + return; + } ibmvnic_reset(adapter, VNIC_RESET_TIMEOUT); } @@ -1518,10 +3531,17 @@ static void remove_buff_from_pool(struct ibmvnic_adapter *adapter, static int ibmvnic_poll(struct napi_struct *napi, int budget) { - struct net_device *netdev = napi->dev; - struct ibmvnic_adapter *adapter = netdev_priv(netdev); - int scrq_num = (int)(napi - adapter->napi); - int frames_processed = 0; + struct ibmvnic_sub_crq_queue *rx_scrq; + struct ibmvnic_adapter *adapter; + struct net_device *netdev; + int frames_processed; + int scrq_num; + + netdev = napi->dev; + adapter = netdev_priv(netdev); + scrq_num = (int)(napi - adapter->napi); + frames_processed = 0; + rx_scrq = adapter->rx_scrq[scrq_num]; restart_poll: while (frames_processed < budget) { @@ -1532,21 +3552,28 @@ restart_poll: u16 offset; u8 flags = 0; - if (unlikely(adapter->resetting)) { - enable_scrq_irq(adapter, 
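ibmvnic_reset(), shown earlier in this hunk, also de-duplicates by reason before queueing work: a new request is dropped when an rwi with the same reset reason is already on the list, so a burst of identical events collapses into a single pass of the reset worker. A toy model of that collapse in plain userspace C, not driver code:

/* Toy de-duplicating enqueue; the driver adds at the tail to preserve
 * arrival order, this toy prepends for brevity.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_rwi { int reason; struct toy_rwi *next; };

static int enqueue_reset(struct toy_rwi **head, int reason)
{
        struct toy_rwi *it, *rwi;

        for (it = *head; it; it = it->next)
                if (it->reason == reason)
                        return -1;              /* duplicate: skip */

        rwi = calloc(1, sizeof(*rwi));
        if (!rwi)
                return -2;
        rwi->reason = reason;
        rwi->next = *head;
        *head = rwi;
        return 0;
}

int main(void)
{
        struct toy_rwi *head = NULL, *next;

        printf("%d %d %d\n", enqueue_reset(&head, 1),
               enqueue_reset(&head, 1), enqueue_reset(&head, 2));
        /* prints: 0 -1 0 */

        while (head) {
                next = head->next;
                free(head);
                head = next;
        }
        return 0;
}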
adapter->rx_scrq[scrq_num]); + if (unlikely(test_bit(0, &adapter->resetting) && + adapter->reset_reason != VNIC_RESET_NON_FATAL)) { + enable_scrq_irq(adapter, rx_scrq); napi_complete_done(napi, frames_processed); return frames_processed; } - if (!pending_scrq(adapter, adapter->rx_scrq[scrq_num])) + if (!pending_scrq(adapter, rx_scrq)) break; - next = ibmvnic_next_scrq(adapter, adapter->rx_scrq[scrq_num]); - rx_buff = - (struct ibmvnic_rx_buff *)be64_to_cpu(next-> - rx_comp.correlator); + next = ibmvnic_next_scrq(adapter, rx_scrq); + rx_buff = (struct ibmvnic_rx_buff *) + be64_to_cpu(next->rx_comp.correlator); /* do error checking */ if (next->rx_comp.rc) { - netdev_err(netdev, "rx error %x\n", next->rx_comp.rc); + netdev_dbg(netdev, "rx buffer returned with rc %x\n", + be16_to_cpu(next->rx_comp.rc)); + /* free the entry */ + next->rx_comp.first = 0; + dev_kfree_skb_any(rx_buff->skb); + remove_buff_from_pool(adapter, rx_buff); + continue; + } else if (!rx_buff->skb) { /* free the entry */ next->rx_comp.first = 0; remove_buff_from_pool(adapter, rx_buff); @@ -1557,6 +3584,8 @@ restart_poll: offset = be16_to_cpu(next->rx_comp.off_frame_data); flags = next->rx_comp.flags; skb = rx_buff->skb; + /* load long_term_buff before copying to skb */ + dma_rmb(); skb_copy_to_linear_data(skb, rx_buff->data + offset, length); @@ -1583,42 +3612,106 @@ restart_poll: length = skb->len; napi_gro_receive(napi, skb); /* send it up */ - netdev->stats.rx_packets++; - netdev->stats.rx_bytes += length; + adapter->rx_stats_buffers[scrq_num].packets++; + adapter->rx_stats_buffers[scrq_num].bytes += length; frames_processed++; } - if (adapter->state != VNIC_CLOSING) + if (adapter->state != VNIC_CLOSING && + (atomic_read(&adapter->rx_pool[scrq_num].available) < + adapter->req_rx_add_entries_per_subcrq / 2)) replenish_rx_pool(adapter, &adapter->rx_pool[scrq_num]); - if (frames_processed < budget) { - enable_scrq_irq(adapter, adapter->rx_scrq[scrq_num]); - napi_complete_done(napi, frames_processed); - if (pending_scrq(adapter, adapter->rx_scrq[scrq_num]) && - napi_reschedule(napi)) { - disable_scrq_irq(adapter, adapter->rx_scrq[scrq_num]); - goto restart_poll; + if (napi_complete_done(napi, frames_processed)) { + enable_scrq_irq(adapter, rx_scrq); + if (pending_scrq(adapter, rx_scrq)) { + if (napi_schedule(napi)) { + disable_scrq_irq(adapter, rx_scrq); + goto restart_poll; + } + } } } return frames_processed; } -#ifdef CONFIG_NET_POLL_CONTROLLER -static void ibmvnic_netpoll_controller(struct net_device *dev) +static int wait_for_reset(struct ibmvnic_adapter *adapter) { - struct ibmvnic_adapter *adapter = netdev_priv(dev); - int i; + int rc, ret; - replenish_pools(netdev_priv(dev)); - for (i = 0; i < adapter->req_rx_queues; i++) - ibmvnic_interrupt_rx(adapter->rx_scrq[i]->irq, - adapter->rx_scrq[i]); + adapter->fallback.mtu = adapter->req_mtu; + adapter->fallback.rx_queues = adapter->req_rx_queues; + adapter->fallback.tx_queues = adapter->req_tx_queues; + adapter->fallback.rx_entries = adapter->req_rx_add_entries_per_subcrq; + adapter->fallback.tx_entries = adapter->req_tx_entries_per_subcrq; + + reinit_completion(&adapter->reset_done); + adapter->wait_for_reset = true; + rc = ibmvnic_reset(adapter, VNIC_RESET_CHANGE_PARAM); + + if (rc) { + ret = rc; + goto out; + } + rc = ibmvnic_wait_for_completion(adapter, &adapter->reset_done, 60000); + if (rc) { + ret = -ENODEV; + goto out; + } + + ret = 0; + if (adapter->reset_done_rc) { + ret = -EIO; + adapter->desired.mtu = adapter->fallback.mtu; + adapter->desired.rx_queues = 
adapter->fallback.rx_queues; + adapter->desired.tx_queues = adapter->fallback.tx_queues; + adapter->desired.rx_entries = adapter->fallback.rx_entries; + adapter->desired.tx_entries = adapter->fallback.tx_entries; + + reinit_completion(&adapter->reset_done); + adapter->wait_for_reset = true; + rc = ibmvnic_reset(adapter, VNIC_RESET_CHANGE_PARAM); + if (rc) { + ret = rc; + goto out; + } + rc = ibmvnic_wait_for_completion(adapter, &adapter->reset_done, + 60000); + if (rc) { + ret = -ENODEV; + goto out; + } + } +out: + adapter->wait_for_reset = false; + + return ret; } -#endif static int ibmvnic_change_mtu(struct net_device *netdev, int new_mtu) { - return -EOPNOTSUPP; + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + + adapter->desired.mtu = new_mtu + ETH_HLEN; + + return wait_for_reset(adapter); +} + +static netdev_features_t ibmvnic_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + /* Some backing hardware adapters can not + * handle packets with a MSS less than 224 + * or with only one segment. + */ + if (skb_is_gso(skb)) { + if (skb_shinfo(skb)->gso_size < 224 || + skb_shinfo(skb)->gso_segs == 1) + features &= ~NETIF_F_GSO_MASK; + } + + return features; } static const struct net_device_ops ibmvnic_netdev_ops = { @@ -1628,11 +3721,10 @@ static const struct net_device_ops ibmvnic_netdev_ops = { .ndo_set_rx_mode = ibmvnic_set_multi, .ndo_set_mac_address = ibmvnic_set_mac, .ndo_validate_addr = eth_validate_addr, + .ndo_get_stats64 = ibmvnic_get_stats64, .ndo_tx_timeout = ibmvnic_tx_timeout, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = ibmvnic_netpoll_controller, -#endif .ndo_change_mtu = ibmvnic_change_mtu, + .ndo_features_check = ibmvnic_features_check, }; /* ethtool functions */ @@ -1640,31 +3732,32 @@ static const struct net_device_ops ibmvnic_netdev_ops = { static int ibmvnic_get_link_ksettings(struct net_device *netdev, struct ethtool_link_ksettings *cmd) { - u32 supported, advertising; + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + int rc; - supported = (SUPPORTED_1000baseT_Full | SUPPORTED_Autoneg | - SUPPORTED_FIBRE); - advertising = (ADVERTISED_1000baseT_Full | ADVERTISED_Autoneg | - ADVERTISED_FIBRE); - cmd->base.speed = SPEED_1000; - cmd->base.duplex = DUPLEX_FULL; + rc = send_query_phys_parms(adapter); + if (rc) { + adapter->speed = SPEED_UNKNOWN; + adapter->duplex = DUPLEX_UNKNOWN; + } + cmd->base.speed = adapter->speed; + cmd->base.duplex = adapter->duplex; cmd->base.port = PORT_FIBRE; cmd->base.phy_address = 0; cmd->base.autoneg = AUTONEG_ENABLE; - ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported, - supported); - ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.advertising, - advertising); - return 0; } -static void ibmvnic_get_drvinfo(struct net_device *dev, +static void ibmvnic_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *info) { - strlcpy(info->driver, ibmvnic_driver_name, sizeof(info->driver)); - strlcpy(info->version, IBMVNIC_DRIVER_VERSION, sizeof(info->version)); + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + + strscpy(info->driver, ibmvnic_driver_name, sizeof(info->driver)); + strscpy(info->version, IBMVNIC_DRIVER_VERSION, sizeof(info->version)); + strscpy(info->fw_version, adapter->fw_version, + sizeof(info->fw_version)); } static u32 ibmvnic_get_msglevel(struct net_device *netdev) @@ -1692,34 +3785,105 @@ static u32 ibmvnic_get_link(struct net_device *netdev) } static void ibmvnic_get_ringparam(struct net_device *netdev, - struct 
ethtool_ringparam *ring) + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) { - ring->rx_max_pending = 0; - ring->tx_max_pending = 0; + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + + ring->rx_max_pending = adapter->max_rx_add_entries_per_subcrq; + ring->tx_max_pending = adapter->max_tx_entries_per_subcrq; ring->rx_mini_max_pending = 0; ring->rx_jumbo_max_pending = 0; - ring->rx_pending = 0; - ring->tx_pending = 0; + ring->rx_pending = adapter->req_rx_add_entries_per_subcrq; + ring->tx_pending = adapter->req_tx_entries_per_subcrq; ring->rx_mini_pending = 0; ring->rx_jumbo_pending = 0; } +static int ibmvnic_set_ringparam(struct net_device *netdev, + struct ethtool_ringparam *ring, + struct kernel_ethtool_ringparam *kernel_ring, + struct netlink_ext_ack *extack) +{ + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + + if (ring->rx_pending > adapter->max_rx_add_entries_per_subcrq || + ring->tx_pending > adapter->max_tx_entries_per_subcrq) { + netdev_err(netdev, "Invalid request.\n"); + netdev_err(netdev, "Max tx buffers = %llu\n", + adapter->max_rx_add_entries_per_subcrq); + netdev_err(netdev, "Max rx buffers = %llu\n", + adapter->max_tx_entries_per_subcrq); + return -EINVAL; + } + + adapter->desired.rx_entries = ring->rx_pending; + adapter->desired.tx_entries = ring->tx_pending; + + return wait_for_reset(adapter); +} + +static void ibmvnic_get_channels(struct net_device *netdev, + struct ethtool_channels *channels) +{ + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + + channels->max_rx = adapter->max_rx_queues; + channels->max_tx = adapter->max_tx_queues; + channels->max_other = 0; + channels->max_combined = 0; + channels->rx_count = adapter->req_rx_queues; + channels->tx_count = adapter->req_tx_queues; + channels->other_count = 0; + channels->combined_count = 0; +} + +static int ibmvnic_set_channels(struct net_device *netdev, + struct ethtool_channels *channels) +{ + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + + adapter->desired.rx_queues = channels->rx_count; + adapter->desired.tx_queues = channels->tx_count; + + return wait_for_reset(adapter); +} + static void ibmvnic_get_strings(struct net_device *dev, u32 stringset, u8 *data) { + struct ibmvnic_adapter *adapter = netdev_priv(dev); int i; if (stringset != ETH_SS_STATS) return; - for (i = 0; i < ARRAY_SIZE(ibmvnic_stats); i++, data += ETH_GSTRING_LEN) - memcpy(data, ibmvnic_stats[i].name, ETH_GSTRING_LEN); + for (i = 0; i < ARRAY_SIZE(ibmvnic_stats); i++) + ethtool_puts(&data, ibmvnic_stats[i].name); + + for (i = 0; i < adapter->req_tx_queues; i++) { + ethtool_sprintf(&data, "tx%d_batched_packets", i); + ethtool_sprintf(&data, "tx%d_direct_packets", i); + ethtool_sprintf(&data, "tx%d_bytes", i); + ethtool_sprintf(&data, "tx%d_dropped_packets", i); + } + + for (i = 0; i < adapter->req_rx_queues; i++) { + ethtool_sprintf(&data, "rx%d_packets", i); + ethtool_sprintf(&data, "rx%d_bytes", i); + ethtool_sprintf(&data, "rx%d_interrupts", i); + } } static int ibmvnic_get_sset_count(struct net_device *dev, int sset) { + struct ibmvnic_adapter *adapter = netdev_priv(dev); + switch (sset) { case ETH_SS_STATS: - return ARRAY_SIZE(ibmvnic_stats); + return ARRAY_SIZE(ibmvnic_stats) + + adapter->req_tx_queues * NUM_TX_STATS + + adapter->req_rx_queues * NUM_RX_STATS; default: return -EOPNOTSUPP; } @@ -1730,7 +3894,8 @@ static void ibmvnic_get_ethtool_stats(struct net_device *dev, { struct ibmvnic_adapter *adapter = netdev_priv(dev); union ibmvnic_crq 
crq; - int i; + int i, j; + int rc; memset(&crq, 0, sizeof(crq)); crq.request_statistics.first = IBMVNIC_CRQ_CMD; @@ -1740,12 +3905,37 @@ static void ibmvnic_get_ethtool_stats(struct net_device *dev, cpu_to_be32(sizeof(struct ibmvnic_statistics)); /* Wait for data to be written */ - init_completion(&adapter->stats_done); - ibmvnic_send_crq(adapter, &crq); - wait_for_completion(&adapter->stats_done); + reinit_completion(&adapter->stats_done); + rc = ibmvnic_send_crq(adapter, &crq); + if (rc) + return; + rc = ibmvnic_wait_for_completion(adapter, &adapter->stats_done, 10000); + if (rc) + return; for (i = 0; i < ARRAY_SIZE(ibmvnic_stats); i++) - data[i] = IBMVNIC_GET_STAT(adapter, ibmvnic_stats[i].offset); + data[i] = be64_to_cpu(IBMVNIC_GET_STAT + (adapter, ibmvnic_stats[i].offset)); + + for (j = 0; j < adapter->req_tx_queues; j++) { + data[i] = adapter->tx_stats_buffers[j].batched_packets; + i++; + data[i] = adapter->tx_stats_buffers[j].direct_packets; + i++; + data[i] = adapter->tx_stats_buffers[j].bytes; + i++; + data[i] = adapter->tx_stats_buffers[j].dropped_packets; + i++; + } + + for (j = 0; j < adapter->req_rx_queues; j++) { + data[i] = adapter->rx_stats_buffers[j].packets; + i++; + data[i] = adapter->rx_stats_buffers[j].bytes; + i++; + data[i] = adapter->rx_stats_buffers[j].interrupts; + i++; + } } static const struct ethtool_ops ibmvnic_ethtool_ops = { @@ -1754,6 +3944,9 @@ static const struct ethtool_ops ibmvnic_ethtool_ops = { .set_msglevel = ibmvnic_set_msglevel, .get_link = ibmvnic_get_link, .get_ringparam = ibmvnic_get_ringparam, + .set_ringparam = ibmvnic_set_ringparam, + .get_channels = ibmvnic_get_channels, + .set_channels = ibmvnic_set_channels, .get_strings = ibmvnic_get_strings, .get_sset_count = ibmvnic_get_sset_count, .get_ethtool_stats = ibmvnic_get_ethtool_stats, @@ -1767,14 +3960,26 @@ static int reset_one_sub_crq_queue(struct ibmvnic_adapter *adapter, { int rc; + if (!scrq) { + netdev_dbg(adapter->netdev, "Invalid scrq reset.\n"); + return -EINVAL; + } + if (scrq->irq) { free_irq(scrq->irq, scrq); irq_dispose_mapping(scrq->irq); scrq->irq = 0; } - memset(scrq->msgs, 0, 4 * PAGE_SIZE); - scrq->cur = 0; + if (scrq->msgs) { + memset(scrq->msgs, 0, 4 * PAGE_SIZE); + atomic_set(&scrq->used, 0); + scrq->cur = 0; + scrq->ind_buf.index = 0; + } else { + netdev_dbg(adapter->netdev, "Invalid scrq reset\n"); + return -EINVAL; + } rc = h_reg_sub_crq(adapter->vdev->unit_address, scrq->msg_token, 4 * PAGE_SIZE, &scrq->crq_num, &scrq->hw_irq); @@ -1785,13 +3990,20 @@ static int reset_sub_crq_queues(struct ibmvnic_adapter *adapter) { int i, rc; + if (!adapter->tx_scrq || !adapter->rx_scrq) + return -EINVAL; + + ibmvnic_clean_affinity(adapter); + for (i = 0; i < adapter->req_tx_queues; i++) { + netdev_dbg(adapter->netdev, "Re-setting tx_scrq[%d]\n", i); rc = reset_one_sub_crq_queue(adapter, adapter->tx_scrq[i]); if (rc) return rc; } for (i = 0; i < adapter->req_rx_queues; i++) { + netdev_dbg(adapter->netdev, "Re-setting rx_scrq[%d]\n", i); rc = reset_one_sub_crq_queue(adapter, adapter->rx_scrq[i]); if (rc) return rc; @@ -1801,29 +4013,38 @@ static int reset_sub_crq_queues(struct ibmvnic_adapter *adapter) } static void release_sub_crq_queue(struct ibmvnic_adapter *adapter, - struct ibmvnic_sub_crq_queue *scrq) + struct ibmvnic_sub_crq_queue *scrq, + bool do_h_free) { struct device *dev = &adapter->vdev->dev; long rc; netdev_dbg(adapter->netdev, "Releasing sub-CRQ\n"); - /* Close the sub-crqs */ - do { - rc = plpar_hcall_norets(H_FREE_SUB_CRQ, - adapter->vdev->unit_address, - 
scrq->crq_num); - } while (rc == H_BUSY || H_IS_LONG_BUSY(rc)); + if (do_h_free) { + /* Close the sub-crqs */ + do { + rc = plpar_hcall_norets(H_FREE_SUB_CRQ, + adapter->vdev->unit_address, + scrq->crq_num); + } while (rc == H_BUSY || H_IS_LONG_BUSY(rc)); - if (rc) { - netdev_err(adapter->netdev, - "Failed to release sub-CRQ %16lx, rc = %ld\n", - scrq->crq_num, rc); + if (rc) { + netdev_err(adapter->netdev, + "Failed to release sub-CRQ %16lx, rc = %ld\n", + scrq->crq_num, rc); + } } + dma_free_coherent(dev, + IBMVNIC_IND_MAX_ARR_SZ, + scrq->ind_buf.indir_arr, + scrq->ind_buf.indir_dma); + dma_unmap_single(dev, scrq->msg_token, 4 * PAGE_SIZE, DMA_BIDIRECTIONAL); free_pages((unsigned long)scrq->msgs, 2); + free_cpumask_var(scrq->affinity_mask); kfree(scrq); } @@ -1844,6 +4065,8 @@ static struct ibmvnic_sub_crq_queue *init_sub_crq_queue(struct ibmvnic_adapter dev_warn(dev, "Couldn't allocate crq queue messages page\n"); goto zero_page_failed; } + if (!zalloc_cpumask_var(&scrq->affinity_mask, GFP_KERNEL)) + goto cpumask_alloc_failed; scrq->msg_token = dma_map_single(dev, scrq->msgs, 4 * PAGE_SIZE, DMA_BIDIRECTIONAL); @@ -1867,6 +4090,17 @@ static struct ibmvnic_sub_crq_queue *init_sub_crq_queue(struct ibmvnic_adapter scrq->adapter = adapter; scrq->size = 4 * PAGE_SIZE / sizeof(*scrq->msgs); + scrq->ind_buf.index = 0; + + scrq->ind_buf.indir_arr = + dma_alloc_coherent(dev, + IBMVNIC_IND_MAX_ARR_SZ, + &scrq->ind_buf.indir_dma, + GFP_KERNEL); + + if (!scrq->ind_buf.indir_arr) + goto indir_failed; + spin_lock_init(&scrq->lock); netdev_dbg(adapter->netdev, @@ -1875,10 +4109,18 @@ static struct ibmvnic_sub_crq_queue *init_sub_crq_queue(struct ibmvnic_adapter return scrq; +indir_failed: + do { + rc = plpar_hcall_norets(H_FREE_SUB_CRQ, + adapter->vdev->unit_address, + scrq->crq_num); + } while (rc == H_BUSY || rc == H_IS_LONG_BUSY(rc)); reg_failed: dma_unmap_single(dev, scrq->msg_token, 4 * PAGE_SIZE, DMA_BIDIRECTIONAL); map_failed: + free_cpumask_var(scrq->affinity_mask); +cpumask_alloc_failed: free_pages((unsigned long)scrq->msgs, 2); zero_page_failed: kfree(scrq); @@ -1886,15 +4128,19 @@ zero_page_failed: return NULL; } -static void release_sub_crqs(struct ibmvnic_adapter *adapter) +static void release_sub_crqs(struct ibmvnic_adapter *adapter, bool do_h_free) { int i; + ibmvnic_clean_affinity(adapter); if (adapter->tx_scrq) { - for (i = 0; i < adapter->req_tx_queues; i++) { + for (i = 0; i < adapter->num_active_tx_scrqs; i++) { if (!adapter->tx_scrq[i]) continue; + netdev_dbg(adapter->netdev, "Releasing tx_scrq[%d]\n", + i); + ibmvnic_tx_scrq_clean_buffer(adapter, adapter->tx_scrq[i]); if (adapter->tx_scrq[i]->irq) { free_irq(adapter->tx_scrq[i]->irq, adapter->tx_scrq[i]); @@ -1902,18 +4148,28 @@ static void release_sub_crqs(struct ibmvnic_adapter *adapter) adapter->tx_scrq[i]->irq = 0; } - release_sub_crq_queue(adapter, adapter->tx_scrq[i]); + release_sub_crq_queue(adapter, adapter->tx_scrq[i], + do_h_free); } kfree(adapter->tx_scrq); adapter->tx_scrq = NULL; + adapter->num_active_tx_scrqs = 0; } + /* Clean any remaining outstanding SKBs + * we freed the irq so we won't be hearing + * from them + */ + clean_tx_pools(adapter); + if (adapter->rx_scrq) { - for (i = 0; i < adapter->req_rx_queues; i++) { + for (i = 0; i < adapter->num_active_rx_scrqs; i++) { if (!adapter->rx_scrq[i]) continue; + netdev_dbg(adapter->netdev, "Releasing rx_scrq[%d]\n", + i); if (adapter->rx_scrq[i]->irq) { free_irq(adapter->rx_scrq[i]->irq, adapter->rx_scrq[i]); @@ -1921,11 +4177,13 @@ static void release_sub_crqs(struct 
ibmvnic_adapter *adapter) adapter->rx_scrq[i]->irq = 0; } - release_sub_crq_queue(adapter, adapter->rx_scrq[i]); + release_sub_crq_queue(adapter, adapter->rx_scrq[i], + do_h_free); } kfree(adapter->rx_scrq); adapter->rx_scrq = NULL; + adapter->num_active_rx_scrqs = 0; } } @@ -1943,6 +4201,30 @@ static int disable_scrq_irq(struct ibmvnic_adapter *adapter, return rc; } +/* We can not use the IRQ chip EOI handler because that has the + * unintended effect of changing the interrupt priority. + */ +static void ibmvnic_xics_eoi(struct device *dev, struct ibmvnic_sub_crq_queue *scrq) +{ + u64 val = 0xff000000 | scrq->hw_irq; + unsigned long rc; + + rc = plpar_hcall_norets(H_EOI, val); + if (rc) + dev_err(dev, "H_EOI FAILED irq 0x%llx. rc=%ld\n", val, rc); +} + +/* Due to a firmware bug, the hypervisor can send an interrupt to a + * transmit or receive queue just prior to a partition migration. + * Force an EOI after migration. + */ +static void ibmvnic_clear_pending_interrupt(struct device *dev, + struct ibmvnic_sub_crq_queue *scrq) +{ + if (!xive_enabled()) + ibmvnic_xics_eoi(dev, scrq); +} + static int enable_scrq_irq(struct ibmvnic_adapter *adapter, struct ibmvnic_sub_crq_queue *scrq) { @@ -1954,6 +4236,11 @@ static int enable_scrq_irq(struct ibmvnic_adapter *adapter, return 1; } + if (test_bit(0, &adapter->resetting) && + adapter->reset_reason == VNIC_RESET_MOBILITY) { + ibmvnic_clear_pending_interrupt(dev, scrq); + } + rc = plpar_hcall_norets(H_VIOCTL, adapter->vdev->unit_address, H_ENABLE_VIO_INTERRUPT, scrq->hw_irq, 0, 0); if (rc) @@ -1966,61 +4253,66 @@ static int ibmvnic_complete_tx(struct ibmvnic_adapter *adapter, struct ibmvnic_sub_crq_queue *scrq) { struct device *dev = &adapter->vdev->dev; + int num_packets = 0, total_bytes = 0; + struct ibmvnic_tx_pool *tx_pool; struct ibmvnic_tx_buff *txbuff; + struct netdev_queue *txq; union sub_crq *next; - int index; - int i, j; - u8 first; + int index, i; restart_loop: while (pending_scrq(adapter, scrq)) { unsigned int pool = scrq->pool_index; - + int num_entries = 0; next = ibmvnic_next_scrq(adapter, scrq); for (i = 0; i < next->tx_comp.num_comps; i++) { - if (next->tx_comp.rcs[i]) { - dev_err(dev, "tx error %x\n", - next->tx_comp.rcs[i]); - continue; - } index = be32_to_cpu(next->tx_comp.correlators[i]); - txbuff = &adapter->tx_pool[pool].tx_buff[index]; - - for (j = 0; j < IBMVNIC_MAX_FRAGS_PER_CRQ; j++) { - if (!txbuff->data_dma[j]) - continue; - - txbuff->data_dma[j] = 0; - } - /* if sub_crq was sent indirectly */ - first = txbuff->indir_arr[0].generic.first; - if (first == IBMVNIC_CRQ_CMD) { - dma_unmap_single(dev, txbuff->indir_dma, - sizeof(txbuff->indir_arr), - DMA_TO_DEVICE); + if (index & IBMVNIC_TSO_POOL_MASK) { + tx_pool = &adapter->tso_pool[pool]; + index &= ~IBMVNIC_TSO_POOL_MASK; + } else { + tx_pool = &adapter->tx_pool[pool]; } - if (txbuff->last_frag) { - dev_kfree_skb_any(txbuff->skb); + txbuff = &tx_pool->tx_buff[index]; + num_packets++; + num_entries += txbuff->num_entries; + if (txbuff->skb) { + total_bytes += txbuff->skb->len; + if (next->tx_comp.rcs[i]) { + dev_err(dev, "tx error %x\n", + next->tx_comp.rcs[i]); + dev_kfree_skb_irq(txbuff->skb); + } else { + dev_consume_skb_irq(txbuff->skb); + } txbuff->skb = NULL; + } else { + netdev_warn(adapter->netdev, + "TX completion received with NULL socket buffer\n"); } - - adapter->tx_pool[pool].free_map[adapter->tx_pool[pool]. 
- producer_index] = index; - adapter->tx_pool[pool].producer_index = - (adapter->tx_pool[pool].producer_index + 1) % - adapter->req_tx_entries_per_subcrq; + tx_pool->free_map[tx_pool->producer_index] = index; + tx_pool->producer_index = + (tx_pool->producer_index + 1) % + tx_pool->num_buffers; } /* remove tx_comp scrq*/ next->tx_comp.first = 0; - if (atomic_sub_return(next->tx_comp.num_comps, &scrq->used) <= + + if (atomic_sub_return(num_entries, &scrq->used) <= (adapter->req_tx_entries_per_subcrq / 2) && __netif_subqueue_stopped(adapter->netdev, scrq->pool_index)) { - netif_wake_subqueue(adapter->netdev, scrq->pool_index); - netdev_info(adapter->netdev, "Started queue %d\n", - scrq->pool_index); + rcu_read_lock(); + if (adapter->tx_queues_active) { + netif_wake_subqueue(adapter->netdev, + scrq->pool_index); + netdev_dbg(adapter->netdev, + "Started queue %d\n", + scrq->pool_index); + } + rcu_read_unlock(); } } @@ -2031,6 +4323,9 @@ restart_loop: goto restart_loop; } + txq = netdev_get_tx_queue(adapter->netdev, scrq->pool_index); + netdev_tx_completed_queue(txq, num_packets, total_bytes); + return 0; } @@ -2050,6 +4345,14 @@ static irqreturn_t ibmvnic_interrupt_rx(int irq, void *instance) struct ibmvnic_sub_crq_queue *scrq = instance; struct ibmvnic_adapter *adapter = scrq->adapter; + /* When booting a kdump kernel we can hit pending interrupts + * prior to completing driver initialization. + */ + if (unlikely(adapter->state != VNIC_OPEN)) + return IRQ_NONE; + + adapter->rx_stats_buffers[scrq->scrq_num].interrupts++; + if (napi_schedule_prep(&adapter->napi[scrq->scrq_num])) { disable_scrq_irq(adapter, scrq); __napi_schedule(&adapter->napi[scrq->scrq_num]); @@ -2066,6 +4369,8 @@ static int init_sub_crq_irqs(struct ibmvnic_adapter *adapter) int rc = 0; for (i = 0; i < adapter->req_tx_queues; i++) { + netdev_dbg(adapter->netdev, "Initializing tx_scrq[%d] irq\n", + i); scrq = adapter->tx_scrq[i]; scrq->irq = irq_create_mapping(NULL, scrq->hw_irq); @@ -2075,18 +4380,22 @@ static int init_sub_crq_irqs(struct ibmvnic_adapter *adapter) goto req_tx_irq_failed; } + snprintf(scrq->name, sizeof(scrq->name), "ibmvnic-%x-tx%d", + adapter->vdev->unit_address, i); rc = request_irq(scrq->irq, ibmvnic_interrupt_tx, - 0, "ibmvnic_tx", scrq); + 0, scrq->name, scrq); if (rc) { dev_err(dev, "Couldn't register tx irq 0x%x. rc=%d\n", scrq->irq, rc); irq_dispose_mapping(scrq->irq); - goto req_rx_irq_failed; + goto req_tx_irq_failed; } } for (i = 0; i < adapter->req_rx_queues; i++) { + netdev_dbg(adapter->netdev, "Initializing rx_scrq[%d] irq\n", + i); scrq = adapter->rx_scrq[i]; scrq->irq = irq_create_mapping(NULL, scrq->hw_irq); if (!scrq->irq) { @@ -2094,8 +4403,10 @@ static int init_sub_crq_irqs(struct ibmvnic_adapter *adapter) dev_err(dev, "Error mapping irq\n"); goto req_rx_irq_failed; } + snprintf(scrq->name, sizeof(scrq->name), "ibmvnic-%x-rx%d", + adapter->vdev->unit_address, i); rc = request_irq(scrq->irq, ibmvnic_interrupt_rx, - 0, "ibmvnic_rx", scrq); + 0, scrq->name, scrq); if (rc) { dev_err(dev, "Couldn't register rx irq 0x%x. 
rc=%d\n", scrq->irq, rc); @@ -2103,6 +4414,11 @@ static int init_sub_crq_irqs(struct ibmvnic_adapter *adapter) goto req_rx_irq_failed; } } + + cpus_read_lock(); + ibmvnic_set_affinity(adapter); + cpus_read_unlock(); + return rc; req_rx_irq_failed: @@ -2114,9 +4430,9 @@ req_rx_irq_failed: req_tx_irq_failed: for (j = 0; j < i; j++) { free_irq(adapter->tx_scrq[j]->irq, adapter->tx_scrq[j]); - irq_dispose_mapping(adapter->rx_scrq[j]->irq); + irq_dispose_mapping(adapter->tx_scrq[j]->irq); } - release_sub_crqs(adapter); + release_sub_crqs(adapter, 1); return rc; } @@ -2133,7 +4449,7 @@ static int init_sub_crqs(struct ibmvnic_adapter *adapter) allqueues = kcalloc(total_queues, sizeof(*allqueues), GFP_KERNEL); if (!allqueues) - return -1; + return -ENOMEM; for (i = 0; i < total_queues; i++) { allqueues[i] = init_sub_crq_queue(adapter); @@ -2178,6 +4494,7 @@ static int init_sub_crqs(struct ibmvnic_adapter *adapter) for (i = 0; i < adapter->req_tx_queues; i++) { adapter->tx_scrq[i] = allqueues[i]; adapter->tx_scrq[i]->pool_index = i; + adapter->num_active_tx_scrqs++; } adapter->rx_scrq = kcalloc(adapter->req_rx_queues, @@ -2188,6 +4505,7 @@ static int init_sub_crqs(struct ibmvnic_adapter *adapter) for (i = 0; i < adapter->req_rx_queues; i++) { adapter->rx_scrq[i] = allqueues[i + adapter->req_tx_queues]; adapter->rx_scrq[i]->scrq_num = i; + adapter->num_active_rx_scrqs++; } kfree(allqueues); @@ -2198,79 +4516,134 @@ rx_failed: adapter->tx_scrq = NULL; tx_failed: for (i = 0; i < registered_queues; i++) - release_sub_crq_queue(adapter, allqueues[i]); + release_sub_crq_queue(adapter, allqueues[i], 1); kfree(allqueues); - return -1; + return -ENOMEM; } -static void ibmvnic_send_req_caps(struct ibmvnic_adapter *adapter, int retry) +static void send_request_cap(struct ibmvnic_adapter *adapter, int retry) { struct device *dev = &adapter->vdev->dev; union ibmvnic_crq crq; + int max_entries; + int cap_reqs; + + /* We send out 6 or 7 REQUEST_CAPABILITY CRQs below (depending on + * the PROMISC flag). Initialize this count upfront. When the tasklet + * receives a response to all of these, it will send the next protocol + * message (QUERY_IP_OFFLOAD). + */ + if (!(adapter->netdev->flags & IFF_PROMISC) || + adapter->promisc_supported) + cap_reqs = 7; + else + cap_reqs = 6; if (!retry) { /* Sub-CRQ entries are 32 byte long */ int entries_page = 4 * PAGE_SIZE / (sizeof(u64) * 4); + atomic_set(&adapter->running_cap_crqs, cap_reqs); + if (adapter->min_tx_entries_per_subcrq > entries_page || adapter->min_rx_add_entries_per_subcrq > entries_page) { dev_err(dev, "Fatal, invalid entries per sub-crq\n"); return; } - /* Get the minimum between the queried max and the entries - * that fit in our PAGE_SIZE - */ - adapter->req_tx_entries_per_subcrq = - adapter->max_tx_entries_per_subcrq > entries_page ? - entries_page : adapter->max_tx_entries_per_subcrq; - adapter->req_rx_add_entries_per_subcrq = - adapter->max_rx_add_entries_per_subcrq > entries_page ? 
- entries_page : adapter->max_rx_add_entries_per_subcrq; - - adapter->req_tx_queues = adapter->opt_tx_comp_sub_queues; - adapter->req_rx_queues = adapter->opt_rx_comp_queues; - adapter->req_rx_add_queues = adapter->max_rx_add_queues; + if (adapter->desired.mtu) + adapter->req_mtu = adapter->desired.mtu; + else + adapter->req_mtu = adapter->netdev->mtu + ETH_HLEN; - adapter->req_mtu = adapter->netdev->mtu + ETH_HLEN; - } + if (!adapter->desired.tx_entries) + adapter->desired.tx_entries = + adapter->max_tx_entries_per_subcrq; + if (!adapter->desired.rx_entries) + adapter->desired.rx_entries = + adapter->max_rx_add_entries_per_subcrq; + + max_entries = IBMVNIC_LTB_SET_SIZE / + (adapter->req_mtu + IBMVNIC_BUFFER_HLEN); + + if ((adapter->req_mtu + IBMVNIC_BUFFER_HLEN) * + adapter->desired.tx_entries > IBMVNIC_LTB_SET_SIZE) { + adapter->desired.tx_entries = max_entries; + } + + if ((adapter->req_mtu + IBMVNIC_BUFFER_HLEN) * + adapter->desired.rx_entries > IBMVNIC_LTB_SET_SIZE) { + adapter->desired.rx_entries = max_entries; + } + + if (adapter->desired.tx_entries) + adapter->req_tx_entries_per_subcrq = + adapter->desired.tx_entries; + else + adapter->req_tx_entries_per_subcrq = + adapter->max_tx_entries_per_subcrq; + + if (adapter->desired.rx_entries) + adapter->req_rx_add_entries_per_subcrq = + adapter->desired.rx_entries; + else + adapter->req_rx_add_entries_per_subcrq = + adapter->max_rx_add_entries_per_subcrq; + if (adapter->desired.tx_queues) + adapter->req_tx_queues = + adapter->desired.tx_queues; + else + adapter->req_tx_queues = + adapter->opt_tx_comp_sub_queues; + + if (adapter->desired.rx_queues) + adapter->req_rx_queues = + adapter->desired.rx_queues; + else + adapter->req_rx_queues = + adapter->opt_rx_comp_queues; + + adapter->req_rx_add_queues = adapter->max_rx_add_queues; + } else { + atomic_add(cap_reqs, &adapter->running_cap_crqs); + } memset(&crq, 0, sizeof(crq)); crq.request_capability.first = IBMVNIC_CRQ_CMD; crq.request_capability.cmd = REQUEST_CAPABILITY; crq.request_capability.capability = cpu_to_be16(REQ_TX_QUEUES); crq.request_capability.number = cpu_to_be64(adapter->req_tx_queues); - atomic_inc(&adapter->running_cap_crqs); + cap_reqs--; ibmvnic_send_crq(adapter, &crq); crq.request_capability.capability = cpu_to_be16(REQ_RX_QUEUES); crq.request_capability.number = cpu_to_be64(adapter->req_rx_queues); - atomic_inc(&adapter->running_cap_crqs); + cap_reqs--; ibmvnic_send_crq(adapter, &crq); crq.request_capability.capability = cpu_to_be16(REQ_RX_ADD_QUEUES); crq.request_capability.number = cpu_to_be64(adapter->req_rx_add_queues); - atomic_inc(&adapter->running_cap_crqs); + cap_reqs--; ibmvnic_send_crq(adapter, &crq); crq.request_capability.capability = cpu_to_be16(REQ_TX_ENTRIES_PER_SUBCRQ); crq.request_capability.number = cpu_to_be64(adapter->req_tx_entries_per_subcrq); - atomic_inc(&adapter->running_cap_crqs); + cap_reqs--; ibmvnic_send_crq(adapter, &crq); crq.request_capability.capability = cpu_to_be16(REQ_RX_ADD_ENTRIES_PER_SUBCRQ); crq.request_capability.number = cpu_to_be64(adapter->req_rx_add_entries_per_subcrq); - atomic_inc(&adapter->running_cap_crqs); + cap_reqs--; ibmvnic_send_crq(adapter, &crq); crq.request_capability.capability = cpu_to_be16(REQ_MTU); crq.request_capability.number = cpu_to_be64(adapter->req_mtu); - atomic_inc(&adapter->running_cap_crqs); + cap_reqs--; ibmvnic_send_crq(adapter, &crq); if (adapter->netdev->flags & IFF_PROMISC) { @@ -2278,27 +4651,37 @@ static void ibmvnic_send_req_caps(struct ibmvnic_adapter *adapter, int retry) 
crq.request_capability.capability = cpu_to_be16(PROMISC_REQUESTED); crq.request_capability.number = cpu_to_be64(1); - atomic_inc(&adapter->running_cap_crqs); + cap_reqs--; ibmvnic_send_crq(adapter, &crq); } } else { crq.request_capability.capability = cpu_to_be16(PROMISC_REQUESTED); crq.request_capability.number = cpu_to_be64(0); - atomic_inc(&adapter->running_cap_crqs); + cap_reqs--; ibmvnic_send_crq(adapter, &crq); } + + /* Keep at end to catch any discrepancy between expected and actual + * CRQs sent. + */ + WARN_ON(cap_reqs != 0); } static int pending_scrq(struct ibmvnic_adapter *adapter, struct ibmvnic_sub_crq_queue *scrq) { union sub_crq *entry = &scrq->msgs[scrq->cur]; + int rc; - if (entry->generic.first & IBMVNIC_CRQ_CMD_RSP) - return 1; - else - return 0; + rc = !!(entry->generic.first & IBMVNIC_CRQ_CMD_RSP); + + /* Ensure that the SCRQ valid flag is loaded prior to loading the + * contents of the SCRQ descriptor + */ + dma_rmb(); + + return rc; } static union sub_crq *ibmvnic_next_scrq(struct ibmvnic_adapter *adapter, @@ -2317,6 +4700,11 @@ static union sub_crq *ibmvnic_next_scrq(struct ibmvnic_adapter *adapter, } spin_unlock_irqrestore(&scrq->lock, flags); + /* Ensure that the SCRQ valid flag is loaded prior to loading the + * contents of the SCRQ descriptor + */ + dma_rmb(); + return entry; } @@ -2336,39 +4724,23 @@ static union ibmvnic_crq *ibmvnic_next_crq(struct ibmvnic_adapter *adapter) return crq; } -static int send_subcrq(struct ibmvnic_adapter *adapter, u64 remote_handle, - union sub_crq *sub_crq) +static void print_subcrq_error(struct device *dev, int rc, const char *func) { - unsigned int ua = adapter->vdev->unit_address; - struct device *dev = &adapter->vdev->dev; - u64 *u64_crq = (u64 *)sub_crq; - int rc; - - netdev_dbg(adapter->netdev, - "Sending sCRQ %016lx: %016lx %016lx %016lx %016lx\n", - (unsigned long int)cpu_to_be64(remote_handle), - (unsigned long int)cpu_to_be64(u64_crq[0]), - (unsigned long int)cpu_to_be64(u64_crq[1]), - (unsigned long int)cpu_to_be64(u64_crq[2]), - (unsigned long int)cpu_to_be64(u64_crq[3])); - - /* Make sure the hypervisor sees the complete request */ - mb(); - - rc = plpar_hcall_norets(H_SEND_SUB_CRQ, ua, - cpu_to_be64(remote_handle), - cpu_to_be64(u64_crq[0]), - cpu_to_be64(u64_crq[1]), - cpu_to_be64(u64_crq[2]), - cpu_to_be64(u64_crq[3])); - - if (rc) { - if (rc == H_CLOSED) - dev_warn(dev, "CRQ Queue closed\n"); - dev_err(dev, "Send error (rc=%d)\n", rc); + switch (rc) { + case H_PARAMETER: + dev_warn_ratelimited(dev, + "%s failed: Send request is malformed or adapter failover pending. (rc=%d)\n", + func, rc); + break; + case H_CLOSED: + dev_warn_ratelimited(dev, + "%s failed: Backing queue closed. Adapter is down or failover pending. 
(rc=%d)\n", + func, rc); + break; + default: + dev_err_ratelimited(dev, "%s failed: (rc=%d)\n", func, rc); + break; } - - return rc; } static int send_subcrq_indirect(struct ibmvnic_adapter *adapter, @@ -2379,16 +4751,13 @@ static int send_subcrq_indirect(struct ibmvnic_adapter *adapter, int rc; /* Make sure the hypervisor sees the complete request */ - mb(); + dma_wmb(); rc = plpar_hcall_norets(H_SEND_SUB_CRQ_INDIRECT, ua, cpu_to_be64(remote_handle), ioba, num_entries); - if (rc) { - if (rc == H_CLOSED) - dev_warn(dev, "CRQ Queue closed\n"); - dev_err(dev, "Send (indirect) error (rc=%d)\n", rc); - } + if (rc) + print_subcrq_error(dev, rc, __func__); return rc; } @@ -2402,19 +4771,28 @@ static int ibmvnic_send_crq(struct ibmvnic_adapter *adapter, int rc; netdev_dbg(adapter->netdev, "Sending CRQ: %016lx %016lx\n", - (unsigned long int)cpu_to_be64(u64_crq[0]), - (unsigned long int)cpu_to_be64(u64_crq[1])); + (unsigned long)cpu_to_be64(u64_crq[0]), + (unsigned long)cpu_to_be64(u64_crq[1])); + + if (!adapter->crq.active && + crq->generic.first != IBMVNIC_CRQ_INIT_CMD) { + dev_warn(dev, "Invalid request detected while CRQ is inactive, possible device state change during reset\n"); + return -EINVAL; + } /* Make sure the hypervisor sees the complete request */ - mb(); + dma_wmb(); rc = plpar_hcall_norets(H_SEND_CRQ, ua, cpu_to_be64(u64_crq[0]), cpu_to_be64(u64_crq[1])); if (rc) { - if (rc == H_CLOSED) + if (rc == H_CLOSED) { dev_warn(dev, "CRQ Queue closed\n"); + /* do not reset, report the fail, wait for passive init from server */ + } + dev_warn(dev, "Send error (rc=%d)\n", rc); } @@ -2423,47 +4801,128 @@ static int ibmvnic_send_crq(struct ibmvnic_adapter *adapter, static int ibmvnic_send_crq_init(struct ibmvnic_adapter *adapter) { + struct device *dev = &adapter->vdev->dev; union ibmvnic_crq crq; + int retries = 100; + int rc; memset(&crq, 0, sizeof(crq)); crq.generic.first = IBMVNIC_CRQ_INIT_CMD; crq.generic.cmd = IBMVNIC_CRQ_INIT; netdev_dbg(adapter->netdev, "Sending CRQ init\n"); - return ibmvnic_send_crq(adapter, &crq); + do { + rc = ibmvnic_send_crq(adapter, &crq); + if (rc != H_CLOSED) + break; + retries--; + msleep(50); + + } while (retries > 0); + + if (rc) { + dev_err(dev, "Failed to send init request, rc = %d\n", rc); + return rc; + } + + return 0; } -static int send_version_xchg(struct ibmvnic_adapter *adapter) +struct vnic_login_client_data { + u8 type; + __be16 len; + char name[]; +} __packed; + +static int vnic_client_data_len(struct ibmvnic_adapter *adapter) { - union ibmvnic_crq crq; + int len; - memset(&crq, 0, sizeof(crq)); - crq.version_exchange.first = IBMVNIC_CRQ_CMD; - crq.version_exchange.cmd = VERSION_EXCHANGE; - crq.version_exchange.version = cpu_to_be16(ibmvnic_version); + /* Calculate the amount of buffer space needed for the + * vnic client data in the login buffer. There are four entries, + * OS name, LPAR name, device name, and a null last entry. 
+ */ + len = 4 * sizeof(struct vnic_login_client_data); + len += 6; /* "Linux" plus NULL */ + len += strlen(utsname()->nodename) + 1; + len += strlen(adapter->netdev->name) + 1; - return ibmvnic_send_crq(adapter, &crq); + return len; } -static void send_login(struct ibmvnic_adapter *adapter) +static void vnic_add_client_data(struct ibmvnic_adapter *adapter, + struct vnic_login_client_data *vlcd) +{ + const char *os_name = "Linux"; + int len; + + /* Type 1 - LPAR OS */ + vlcd->type = 1; + len = strlen(os_name) + 1; + vlcd->len = cpu_to_be16(len); + strscpy(vlcd->name, os_name, len); + vlcd = (struct vnic_login_client_data *)(vlcd->name + len); + + /* Type 2 - LPAR name */ + vlcd->type = 2; + len = strlen(utsname()->nodename) + 1; + vlcd->len = cpu_to_be16(len); + strscpy(vlcd->name, utsname()->nodename, len); + vlcd = (struct vnic_login_client_data *)(vlcd->name + len); + + /* Type 3 - device name */ + vlcd->type = 3; + len = strlen(adapter->netdev->name) + 1; + vlcd->len = cpu_to_be16(len); + strscpy(vlcd->name, adapter->netdev->name, len); +} + +static void ibmvnic_print_hex_dump(struct net_device *dev, void *buf, + size_t len) +{ + unsigned char hex_str[16 * 3]; + + for (size_t i = 0; i < len; i += 16) { + hex_dump_to_buffer((unsigned char *)buf + i, len - i, 16, 8, + hex_str, sizeof(hex_str), false); + netdev_dbg(dev, "%s\n", hex_str); + } +} + +static int send_login(struct ibmvnic_adapter *adapter) { struct ibmvnic_login_rsp_buffer *login_rsp_buffer; struct ibmvnic_login_buffer *login_buffer; struct device *dev = &adapter->vdev->dev; + struct vnic_login_client_data *vlcd; dma_addr_t rsp_buffer_token; dma_addr_t buffer_token; size_t rsp_buffer_size; union ibmvnic_crq crq; + int client_data_len; size_t buffer_size; __be64 *tx_list_p; __be64 *rx_list_p; + int rc; int i; + if (!adapter->tx_scrq || !adapter->rx_scrq) { + netdev_err(adapter->netdev, + "RX or TX queues are not allocated, device login failed\n"); + return -ENOMEM; + } + + release_login_buffer(adapter); + release_login_rsp_buffer(adapter); + + client_data_len = vnic_client_data_len(adapter); + buffer_size = sizeof(struct ibmvnic_login_buffer) + - sizeof(u64) * (adapter->req_tx_queues + adapter->req_rx_queues); + sizeof(u64) * (adapter->req_tx_queues + adapter->req_rx_queues) + + client_data_len; - login_buffer = kmalloc(buffer_size, GFP_ATOMIC); + login_buffer = kzalloc(buffer_size, GFP_ATOMIC); if (!login_buffer) goto buf_alloc_failed; @@ -2518,45 +4977,64 @@ static void send_login(struct ibmvnic_adapter *adapter) for (i = 0; i < adapter->req_tx_queues; i++) { if (adapter->tx_scrq[i]) { - tx_list_p[i] = cpu_to_be64(adapter->tx_scrq[i]-> - crq_num); + tx_list_p[i] = + cpu_to_be64(adapter->tx_scrq[i]->crq_num); } } for (i = 0; i < adapter->req_rx_queues; i++) { if (adapter->rx_scrq[i]) { - rx_list_p[i] = cpu_to_be64(adapter->rx_scrq[i]-> - crq_num); + rx_list_p[i] = + cpu_to_be64(adapter->rx_scrq[i]->crq_num); } } + /* Insert vNIC login client data */ + vlcd = (struct vnic_login_client_data *) + ((char *)rx_list_p + (sizeof(u64) * adapter->req_rx_queues)); + login_buffer->client_data_offset = + cpu_to_be32((char *)vlcd - (char *)login_buffer); + login_buffer->client_data_len = cpu_to_be32(client_data_len); + + vnic_add_client_data(adapter, vlcd); + netdev_dbg(adapter->netdev, "Login Buffer:\n"); - for (i = 0; i < (adapter->login_buf_sz - 1) / 8 + 1; i++) { - netdev_dbg(adapter->netdev, "%016lx\n", - ((unsigned long int *)(adapter->login_buf))[i]); - } + ibmvnic_print_hex_dump(adapter->netdev, adapter->login_buf, + 
adapter->login_buf_sz); memset(&crq, 0, sizeof(crq)); crq.login.first = IBMVNIC_CRQ_CMD; crq.login.cmd = LOGIN; crq.login.ioba = cpu_to_be32(buffer_token); crq.login.len = cpu_to_be32(buffer_size); - ibmvnic_send_crq(adapter, &crq); - return; + adapter->login_pending = true; + rc = ibmvnic_send_crq(adapter, &crq); + if (rc) { + adapter->login_pending = false; + netdev_err(adapter->netdev, "Failed to send login, rc=%d\n", rc); + goto buf_send_failed; + } + + return 0; +buf_send_failed: + dma_unmap_single(dev, rsp_buffer_token, rsp_buffer_size, + DMA_FROM_DEVICE); buf_rsp_map_failed: kfree(login_rsp_buffer); + adapter->login_rsp_buf = NULL; buf_rsp_alloc_failed: dma_unmap_single(dev, buffer_token, buffer_size, DMA_TO_DEVICE); buf_map_failed: kfree(login_buffer); + adapter->login_buf = NULL; buf_alloc_failed: - return; + return -ENOMEM; } -static void send_request_map(struct ibmvnic_adapter *adapter, dma_addr_t addr, - u32 len, u8 map_id) +static int send_request_map(struct ibmvnic_adapter *adapter, dma_addr_t addr, + u32 len, u8 map_id) { union ibmvnic_crq crq; @@ -2566,10 +5044,10 @@ static void send_request_map(struct ibmvnic_adapter *adapter, dma_addr_t addr, crq.request_map.map_id = map_id; crq.request_map.ioba = cpu_to_be32(addr); crq.request_map.len = cpu_to_be32(len); - ibmvnic_send_crq(adapter, &crq); + return ibmvnic_send_crq(adapter, &crq); } -static void send_request_unmap(struct ibmvnic_adapter *adapter, u8 map_id) +static int send_request_unmap(struct ibmvnic_adapter *adapter, u8 map_id) { union ibmvnic_crq crq; @@ -2577,10 +5055,10 @@ static void send_request_unmap(struct ibmvnic_adapter *adapter, u8 map_id) crq.request_unmap.first = IBMVNIC_CRQ_CMD; crq.request_unmap.cmd = REQUEST_UNMAP; crq.request_unmap.map_id = map_id; - ibmvnic_send_crq(adapter, &crq); + return ibmvnic_send_crq(adapter, &crq); } -static void send_map_query(struct ibmvnic_adapter *adapter) +static void send_query_map(struct ibmvnic_adapter *adapter) { union ibmvnic_crq crq; @@ -2591,209 +5069,234 @@ static void send_map_query(struct ibmvnic_adapter *adapter) } /* Send a series of CRQs requesting various capabilities of the VNIC server */ -static void send_cap_queries(struct ibmvnic_adapter *adapter) +static void send_query_cap(struct ibmvnic_adapter *adapter) { union ibmvnic_crq crq; + int cap_reqs; + + /* We send out 25 QUERY_CAPABILITY CRQs below. Initialize this count + * upfront. When the tasklet receives a response to all of these, it + * can send out the next protocol messaage (REQUEST_CAPABILITY). 
+ */ + cap_reqs = 25; + + atomic_set(&adapter->running_cap_crqs, cap_reqs); - atomic_set(&adapter->running_cap_crqs, 0); memset(&crq, 0, sizeof(crq)); crq.query_capability.first = IBMVNIC_CRQ_CMD; crq.query_capability.cmd = QUERY_CAPABILITY; crq.query_capability.capability = cpu_to_be16(MIN_TX_QUEUES); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MIN_RX_QUEUES); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MIN_RX_ADD_QUEUES); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MAX_TX_QUEUES); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MAX_RX_QUEUES); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MAX_RX_ADD_QUEUES); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MIN_TX_ENTRIES_PER_SUBCRQ); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MIN_RX_ADD_ENTRIES_PER_SUBCRQ); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MAX_TX_ENTRIES_PER_SUBCRQ); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MAX_RX_ADD_ENTRIES_PER_SUBCRQ); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(TCP_IP_OFFLOAD); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(PROMISC_SUPPORTED); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MIN_MTU); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MAX_MTU); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MAX_MULTICAST_FILTERS); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(VLAN_HEADER_INSERTION); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(RX_VLAN_HEADER_INSERTION); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(MAX_TX_SG_ENTRIES); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(RX_SG_SUPPORTED); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(OPT_TX_COMP_SUB_QUEUES); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(OPT_RX_COMP_QUEUES); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = 
cpu_to_be16(OPT_RX_BUFADD_Q_PER_RX_COMP_Q); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(OPT_TX_ENTRIES_PER_SUBCRQ); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(OPT_RXBA_ENTRIES_PER_SUBCRQ); - atomic_inc(&adapter->running_cap_crqs); ibmvnic_send_crq(adapter, &crq); + cap_reqs--; crq.query_capability.capability = cpu_to_be16(TX_RX_DESC_REQ); - atomic_inc(&adapter->running_cap_crqs); + ibmvnic_send_crq(adapter, &crq); + cap_reqs--; + + /* Keep at end to catch any discrepancy between expected and actual + * CRQs sent. + */ + WARN_ON(cap_reqs != 0); } -static void handle_query_ip_offload_rsp(struct ibmvnic_adapter *adapter) +static void send_query_ip_offload(struct ibmvnic_adapter *adapter) { + int buf_sz = sizeof(struct ibmvnic_query_ip_offload_buffer); struct device *dev = &adapter->vdev->dev; - struct ibmvnic_query_ip_offload_buffer *buf = &adapter->ip_offload_buf; union ibmvnic_crq crq; - int i; - dma_unmap_single(dev, adapter->ip_offload_tok, - sizeof(adapter->ip_offload_buf), DMA_FROM_DEVICE); + adapter->ip_offload_tok = + dma_map_single(dev, + &adapter->ip_offload_buf, + buf_sz, + DMA_FROM_DEVICE); - netdev_dbg(adapter->netdev, "Query IP Offload Buffer:\n"); - for (i = 0; i < (sizeof(adapter->ip_offload_buf) - 1) / 8 + 1; i++) - netdev_dbg(adapter->netdev, "%016lx\n", - ((unsigned long int *)(buf))[i]); + if (dma_mapping_error(dev, adapter->ip_offload_tok)) { + if (!firmware_has_feature(FW_FEATURE_CMO)) + dev_err(dev, "Couldn't map offload buffer\n"); + return; + } - netdev_dbg(adapter->netdev, "ipv4_chksum = %d\n", buf->ipv4_chksum); - netdev_dbg(adapter->netdev, "ipv6_chksum = %d\n", buf->ipv6_chksum); - netdev_dbg(adapter->netdev, "tcp_ipv4_chksum = %d\n", - buf->tcp_ipv4_chksum); - netdev_dbg(adapter->netdev, "tcp_ipv6_chksum = %d\n", - buf->tcp_ipv6_chksum); - netdev_dbg(adapter->netdev, "udp_ipv4_chksum = %d\n", - buf->udp_ipv4_chksum); - netdev_dbg(adapter->netdev, "udp_ipv6_chksum = %d\n", - buf->udp_ipv6_chksum); - netdev_dbg(adapter->netdev, "large_tx_ipv4 = %d\n", - buf->large_tx_ipv4); - netdev_dbg(adapter->netdev, "large_tx_ipv6 = %d\n", - buf->large_tx_ipv6); - netdev_dbg(adapter->netdev, "large_rx_ipv4 = %d\n", - buf->large_rx_ipv4); - netdev_dbg(adapter->netdev, "large_rx_ipv6 = %d\n", - buf->large_rx_ipv6); - netdev_dbg(adapter->netdev, "max_ipv4_hdr_sz = %d\n", - buf->max_ipv4_header_size); - netdev_dbg(adapter->netdev, "max_ipv6_hdr_sz = %d\n", - buf->max_ipv6_header_size); - netdev_dbg(adapter->netdev, "max_tcp_hdr_size = %d\n", - buf->max_tcp_header_size); - netdev_dbg(adapter->netdev, "max_udp_hdr_size = %d\n", - buf->max_udp_header_size); - netdev_dbg(adapter->netdev, "max_large_tx_size = %d\n", - buf->max_large_tx_size); - netdev_dbg(adapter->netdev, "max_large_rx_size = %d\n", - buf->max_large_rx_size); - netdev_dbg(adapter->netdev, "ipv6_ext_hdr = %d\n", - buf->ipv6_extension_header); - netdev_dbg(adapter->netdev, "tcp_pseudosum_req = %d\n", - buf->tcp_pseudosum_req); - netdev_dbg(adapter->netdev, "num_ipv6_ext_hd = %d\n", - buf->num_ipv6_ext_headers); - netdev_dbg(adapter->netdev, "off_ipv6_ext_hd = %d\n", - buf->off_ipv6_ext_headers); + memset(&crq, 0, sizeof(crq)); + crq.query_ip_offload.first = IBMVNIC_CRQ_CMD; + crq.query_ip_offload.cmd = QUERY_IP_OFFLOAD; + crq.query_ip_offload.len = cpu_to_be32(buf_sz); + crq.query_ip_offload.ioba = + cpu_to_be32(adapter->ip_offload_tok); + + 
ibmvnic_send_crq(adapter, &crq); +} + +static void send_control_ip_offload(struct ibmvnic_adapter *adapter) +{ + struct ibmvnic_control_ip_offload_buffer *ctrl_buf = &adapter->ip_offload_ctrl; + struct ibmvnic_query_ip_offload_buffer *buf = &adapter->ip_offload_buf; + struct device *dev = &adapter->vdev->dev; + netdev_features_t old_hw_features = 0; + union ibmvnic_crq crq; adapter->ip_offload_ctrl_tok = - dma_map_single(dev, &adapter->ip_offload_ctrl, - sizeof(adapter->ip_offload_ctrl), DMA_TO_DEVICE); + dma_map_single(dev, + ctrl_buf, + sizeof(adapter->ip_offload_ctrl), + DMA_TO_DEVICE); if (dma_mapping_error(dev, adapter->ip_offload_ctrl_tok)) { dev_err(dev, "Couldn't map ip offload control buffer\n"); return; } - adapter->ip_offload_ctrl.version = cpu_to_be32(INITIAL_VERSION_IOB); - adapter->ip_offload_ctrl.tcp_ipv4_chksum = buf->tcp_ipv4_chksum; - adapter->ip_offload_ctrl.udp_ipv4_chksum = buf->udp_ipv4_chksum; - adapter->ip_offload_ctrl.tcp_ipv6_chksum = buf->tcp_ipv6_chksum; - adapter->ip_offload_ctrl.udp_ipv6_chksum = buf->udp_ipv6_chksum; - - /* large_tx/rx disabled for now, additional features needed */ - adapter->ip_offload_ctrl.large_tx_ipv4 = 0; - adapter->ip_offload_ctrl.large_tx_ipv6 = 0; - adapter->ip_offload_ctrl.large_rx_ipv4 = 0; - adapter->ip_offload_ctrl.large_rx_ipv6 = 0; + ctrl_buf->len = cpu_to_be32(sizeof(adapter->ip_offload_ctrl)); + ctrl_buf->version = cpu_to_be32(INITIAL_VERSION_IOB); + ctrl_buf->ipv4_chksum = buf->ipv4_chksum; + ctrl_buf->ipv6_chksum = buf->ipv6_chksum; + ctrl_buf->tcp_ipv4_chksum = buf->tcp_ipv4_chksum; + ctrl_buf->udp_ipv4_chksum = buf->udp_ipv4_chksum; + ctrl_buf->tcp_ipv6_chksum = buf->tcp_ipv6_chksum; + ctrl_buf->udp_ipv6_chksum = buf->udp_ipv6_chksum; + ctrl_buf->large_tx_ipv4 = buf->large_tx_ipv4; + ctrl_buf->large_tx_ipv6 = buf->large_tx_ipv6; + + /* large_rx disabled for now, additional features needed */ + ctrl_buf->large_rx_ipv4 = 0; + ctrl_buf->large_rx_ipv6 = 0; + + if (adapter->state != VNIC_PROBING) { + old_hw_features = adapter->netdev->hw_features; + adapter->netdev->hw_features = 0; + } - adapter->netdev->features = NETIF_F_GSO; + adapter->netdev->hw_features = NETIF_F_SG | NETIF_F_GSO | NETIF_F_GRO; if (buf->tcp_ipv4_chksum || buf->udp_ipv4_chksum) - adapter->netdev->features |= NETIF_F_IP_CSUM; + adapter->netdev->hw_features |= NETIF_F_IP_CSUM; if (buf->tcp_ipv6_chksum || buf->udp_ipv6_chksum) - adapter->netdev->features |= NETIF_F_IPV6_CSUM; + adapter->netdev->hw_features |= NETIF_F_IPV6_CSUM; if ((adapter->netdev->features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) - adapter->netdev->features |= NETIF_F_RXCSUM; + adapter->netdev->hw_features |= NETIF_F_RXCSUM; + + if (buf->large_tx_ipv4) + adapter->netdev->hw_features |= NETIF_F_TSO; + if (buf->large_tx_ipv6) + adapter->netdev->hw_features |= NETIF_F_TSO6; + + if (adapter->state == VNIC_PROBING) { + adapter->netdev->features |= adapter->netdev->hw_features; + } else if (old_hw_features != adapter->netdev->hw_features) { + netdev_features_t tmp = 0; + + /* disable features no longer supported */ + adapter->netdev->features &= adapter->netdev->hw_features; + /* turn on features now supported if previously enabled */ + tmp = (old_hw_features ^ adapter->netdev->hw_features) & + adapter->netdev->hw_features; + adapter->netdev->features |= + tmp & adapter->netdev->wanted_features; + } memset(&crq, 0, sizeof(crq)); crq.control_ip_offload.first = IBMVNIC_CRQ_CMD; @@ -2804,132 +5307,161 @@ static void handle_query_ip_offload_rsp(struct ibmvnic_adapter *adapter) 
ibmvnic_send_crq(adapter, &crq); } -static void handle_error_info_rsp(union ibmvnic_crq *crq, - struct ibmvnic_adapter *adapter) +static void handle_vpd_size_rsp(union ibmvnic_crq *crq, + struct ibmvnic_adapter *adapter) { struct device *dev = &adapter->vdev->dev; - struct ibmvnic_error_buff *error_buff, *tmp; - unsigned long flags; - bool found = false; - int i; - - if (!crq->request_error_rsp.rc.code) { - dev_info(dev, "Request Error Rsp returned with rc=%x\n", - crq->request_error_rsp.rc.code); - return; - } - - spin_lock_irqsave(&adapter->error_list_lock, flags); - list_for_each_entry_safe(error_buff, tmp, &adapter->errors, list) - if (error_buff->error_id == crq->request_error_rsp.error_id) { - found = true; - list_del(&error_buff->list); - break; - } - spin_unlock_irqrestore(&adapter->error_list_lock, flags); - if (!found) { - dev_err(dev, "Couldn't find error id %x\n", - be32_to_cpu(crq->request_error_rsp.error_id)); + if (crq->get_vpd_size_rsp.rc.code) { + dev_err(dev, "Error retrieving VPD size, rc=%x\n", + crq->get_vpd_size_rsp.rc.code); + complete(&adapter->fw_done); return; } - dev_err(dev, "Detailed info for error id %x:", - be32_to_cpu(crq->request_error_rsp.error_id)); - - for (i = 0; i < error_buff->len; i++) { - pr_cont("%02x", (int)error_buff->buff[i]); - if (i % 8 == 7) - pr_cont(" "); - } - pr_cont("\n"); - - dma_unmap_single(dev, error_buff->dma, error_buff->len, - DMA_FROM_DEVICE); - kfree(error_buff->buff); - kfree(error_buff); + adapter->vpd->len = be64_to_cpu(crq->get_vpd_size_rsp.len); + complete(&adapter->fw_done); } -static void request_error_information(struct ibmvnic_adapter *adapter, - union ibmvnic_crq *err_crq) +static void handle_vpd_rsp(union ibmvnic_crq *crq, + struct ibmvnic_adapter *adapter) { struct device *dev = &adapter->vdev->dev; - struct net_device *netdev = adapter->netdev; - struct ibmvnic_error_buff *error_buff; - unsigned long timeout = msecs_to_jiffies(30000); - union ibmvnic_crq crq; - unsigned long flags; - int rc, detail_len; + unsigned char *substr = NULL; + u8 fw_level_len = 0; - error_buff = kmalloc(sizeof(*error_buff), GFP_ATOMIC); - if (!error_buff) - return; + memset(adapter->fw_version, 0, 32); - detail_len = be32_to_cpu(err_crq->error_indication.detail_error_sz); - error_buff->buff = kmalloc(detail_len, GFP_ATOMIC); - if (!error_buff->buff) { - kfree(error_buff); - return; + dma_unmap_single(dev, adapter->vpd->dma_addr, adapter->vpd->len, + DMA_FROM_DEVICE); + + if (crq->get_vpd_rsp.rc.code) { + dev_err(dev, "Error retrieving VPD from device, rc=%x\n", + crq->get_vpd_rsp.rc.code); + goto complete; } - error_buff->dma = dma_map_single(dev, error_buff->buff, detail_len, - DMA_FROM_DEVICE); - if (dma_mapping_error(dev, error_buff->dma)) { - netdev_err(netdev, "Couldn't map error buffer\n"); - kfree(error_buff->buff); - kfree(error_buff); - return; + /* get the position of the firmware version info + * located after the ASCII 'RM' substring in the buffer + */ + substr = strnstr(adapter->vpd->buff, "RM", adapter->vpd->len); + if (!substr) { + dev_info(dev, "Warning - No FW level has been provided in the VPD buffer by the VIOS Server\n"); + goto complete; } - error_buff->len = detail_len; - error_buff->error_id = err_crq->error_indication.error_id; + /* get length of firmware level ASCII substring */ + if ((substr + 2) < (adapter->vpd->buff + adapter->vpd->len)) { + fw_level_len = *(substr + 2); + } else { + dev_info(dev, "Length of FW substr extrapolated VDP buff\n"); + goto complete; + } - spin_lock_irqsave(&adapter->error_list_lock, 
flags); - list_add_tail(&error_buff->list, &adapter->errors); - spin_unlock_irqrestore(&adapter->error_list_lock, flags); + /* copy firmware version string from vpd into adapter */ + if ((substr + 3 + fw_level_len) < + (adapter->vpd->buff + adapter->vpd->len)) { + strscpy(adapter->fw_version, substr + 3, + sizeof(adapter->fw_version)); + } else { + dev_info(dev, "FW substr extrapolated VPD buff\n"); + } - memset(&crq, 0, sizeof(crq)); - crq.request_error_info.first = IBMVNIC_CRQ_CMD; - crq.request_error_info.cmd = REQUEST_ERROR_INFO; - crq.request_error_info.ioba = cpu_to_be32(error_buff->dma); - crq.request_error_info.len = cpu_to_be32(detail_len); - crq.request_error_info.error_id = err_crq->error_indication.error_id; +complete: + if (adapter->fw_version[0] == '\0') + strscpy((char *)adapter->fw_version, "N/A", sizeof(adapter->fw_version)); + complete(&adapter->fw_done); +} - rc = ibmvnic_send_crq(adapter, &crq); - if (rc) { - netdev_err(netdev, "failed to request error information\n"); - goto err_info_fail; - } +static void handle_query_ip_offload_rsp(struct ibmvnic_adapter *adapter) +{ + struct device *dev = &adapter->vdev->dev; + struct ibmvnic_query_ip_offload_buffer *buf = &adapter->ip_offload_buf; - if (!wait_for_completion_timeout(&adapter->init_done, timeout)) { - netdev_err(netdev, "timeout waiting for error information\n"); - goto err_info_fail; - } + dma_unmap_single(dev, adapter->ip_offload_tok, + sizeof(adapter->ip_offload_buf), DMA_FROM_DEVICE); - return; + netdev_dbg(adapter->netdev, "Query IP Offload Buffer:\n"); + ibmvnic_print_hex_dump(adapter->netdev, buf, + sizeof(adapter->ip_offload_buf)); + + netdev_dbg(adapter->netdev, "ipv4_chksum = %d\n", buf->ipv4_chksum); + netdev_dbg(adapter->netdev, "ipv6_chksum = %d\n", buf->ipv6_chksum); + netdev_dbg(adapter->netdev, "tcp_ipv4_chksum = %d\n", + buf->tcp_ipv4_chksum); + netdev_dbg(adapter->netdev, "tcp_ipv6_chksum = %d\n", + buf->tcp_ipv6_chksum); + netdev_dbg(adapter->netdev, "udp_ipv4_chksum = %d\n", + buf->udp_ipv4_chksum); + netdev_dbg(adapter->netdev, "udp_ipv6_chksum = %d\n", + buf->udp_ipv6_chksum); + netdev_dbg(adapter->netdev, "large_tx_ipv4 = %d\n", + buf->large_tx_ipv4); + netdev_dbg(adapter->netdev, "large_tx_ipv6 = %d\n", + buf->large_tx_ipv6); + netdev_dbg(adapter->netdev, "large_rx_ipv4 = %d\n", + buf->large_rx_ipv4); + netdev_dbg(adapter->netdev, "large_rx_ipv6 = %d\n", + buf->large_rx_ipv6); + netdev_dbg(adapter->netdev, "max_ipv4_hdr_sz = %d\n", + buf->max_ipv4_header_size); + netdev_dbg(adapter->netdev, "max_ipv6_hdr_sz = %d\n", + buf->max_ipv6_header_size); + netdev_dbg(adapter->netdev, "max_tcp_hdr_size = %d\n", + buf->max_tcp_header_size); + netdev_dbg(adapter->netdev, "max_udp_hdr_size = %d\n", + buf->max_udp_header_size); + netdev_dbg(adapter->netdev, "max_large_tx_size = %d\n", + buf->max_large_tx_size); + netdev_dbg(adapter->netdev, "max_large_rx_size = %d\n", + buf->max_large_rx_size); + netdev_dbg(adapter->netdev, "ipv6_ext_hdr = %d\n", + buf->ipv6_extension_header); + netdev_dbg(adapter->netdev, "tcp_pseudosum_req = %d\n", + buf->tcp_pseudosum_req); + netdev_dbg(adapter->netdev, "num_ipv6_ext_hd = %d\n", + buf->num_ipv6_ext_headers); + netdev_dbg(adapter->netdev, "off_ipv6_ext_hd = %d\n", + buf->off_ipv6_ext_headers); -err_info_fail: - spin_lock_irqsave(&adapter->error_list_lock, flags); - list_del(&error_buff->list); - spin_unlock_irqrestore(&adapter->error_list_lock, flags); + send_control_ip_offload(adapter); +} - kfree(error_buff->buff); - kfree(error_buff); +static const char 
*ibmvnic_fw_err_cause(u16 cause) +{ + switch (cause) { + case ADAPTER_PROBLEM: + return "adapter problem"; + case BUS_PROBLEM: + return "bus problem"; + case FW_PROBLEM: + return "firmware problem"; + case DD_PROBLEM: + return "device driver problem"; + case EEH_RECOVERY: + return "EEH recovery"; + case FW_UPDATED: + return "firmware updated"; + case LOW_MEMORY: + return "low Memory"; + default: + return "unknown"; + } } static void handle_error_indication(union ibmvnic_crq *crq, struct ibmvnic_adapter *adapter) { struct device *dev = &adapter->vdev->dev; + u16 cause; - dev_err(dev, "Firmware reports %serror id %x, cause %d\n", - crq->error_indication.flags - & IBMVNIC_FATAL_ERROR ? "FATAL " : "", - be32_to_cpu(crq->error_indication.error_id), - be16_to_cpu(crq->error_indication.error_cause)); + cause = be16_to_cpu(crq->error_indication.error_cause); - if (be32_to_cpu(crq->error_indication.error_id)) - request_error_information(adapter, crq); + dev_warn_ratelimited(dev, + "Firmware reports %serror, cause: %s. Starting recovery...\n", + crq->error_indication.flags + & IBMVNIC_FATAL_ERROR ? "FATAL " : "", + ibmvnic_fw_err_cause(cause)); if (crq->error_indication.flags & IBMVNIC_FATAL_ERROR) ibmvnic_reset(adapter, VNIC_RESET_FATAL); @@ -2937,8 +5469,8 @@ static void handle_error_indication(union ibmvnic_crq *crq, ibmvnic_reset(adapter, VNIC_RESET_NON_FATAL); } -static void handle_change_mac_rsp(union ibmvnic_crq *crq, - struct ibmvnic_adapter *adapter) +static int handle_change_mac_rsp(union ibmvnic_crq *crq, + struct ibmvnic_adapter *adapter) { struct net_device *netdev = adapter->netdev; struct device *dev = &adapter->vdev->dev; @@ -2947,10 +5479,17 @@ static void handle_change_mac_rsp(union ibmvnic_crq *crq, rc = crq->change_mac_addr_rsp.rc.code; if (rc) { dev_err(dev, "Error %ld in CHANGE_MAC_ADDR_RSP\n", rc); - return; + goto out; } - memcpy(netdev->dev_addr, &crq->change_mac_addr_rsp.mac_addr[0], - ETH_ALEN); + /* crq->change_mac_addr.mac_addr is the requested one + * crq->change_mac_addr_rsp.mac_addr is the returned valid one. + */ + eth_hw_addr_set(netdev, &crq->change_mac_addr_rsp.mac_addr[0]); + ether_addr_copy(adapter->mac_addr, + &crq->change_mac_addr_rsp.mac_addr[0]); +out: + complete(&adapter->fw_done); + return rc; } static void handle_request_cap_rsp(union ibmvnic_crq *crq, @@ -2961,6 +5500,8 @@ static void handle_request_cap_rsp(union ibmvnic_crq *crq, char *name; atomic_dec(&adapter->running_cap_crqs); + netdev_dbg(adapter->netdev, "Outstanding request-caps: %d\n", + atomic_read(&adapter->running_cap_crqs)); switch (be16_to_cpu(crq->request_capability_rsp.capability)) { case REQ_TX_QUEUES: req_value = &adapter->req_tx_queues; @@ -3002,11 +5543,20 @@ static void handle_request_cap_rsp(union ibmvnic_crq *crq, case PARTIALSUCCESS: dev_info(dev, "req=%lld, rsp=%ld in %s queue, retrying.\n", *req_value, - (long int)be64_to_cpu(crq->request_capability_rsp. - number), name); - release_sub_crqs(adapter); - *req_value = be64_to_cpu(crq->request_capability_rsp.number); - ibmvnic_send_req_caps(adapter, 1); + (long)be64_to_cpu(crq->request_capability_rsp.number), + name); + + if (be16_to_cpu(crq->request_capability_rsp.capability) == + REQ_MTU) { + pr_err("mtu of %llu is not supported. 
Reverting.\n", + *req_value); + *req_value = adapter->fallback.mtu; + } else { + *req_value = + be64_to_cpu(crq->request_capability_rsp.number); + } + + send_request_cap(adapter, 1); return; default: dev_err(dev, "Error %d in request cap rsp\n", @@ -3015,72 +5565,109 @@ static void handle_request_cap_rsp(union ibmvnic_crq *crq, } /* Done receiving requested capabilities, query IP offload support */ - if (atomic_read(&adapter->running_cap_crqs) == 0) { - union ibmvnic_crq newcrq; - int buf_sz = sizeof(struct ibmvnic_query_ip_offload_buffer); - struct ibmvnic_query_ip_offload_buffer *ip_offload_buf = - &adapter->ip_offload_buf; - - adapter->wait_capability = false; - adapter->ip_offload_tok = dma_map_single(dev, ip_offload_buf, - buf_sz, - DMA_FROM_DEVICE); - - if (dma_mapping_error(dev, adapter->ip_offload_tok)) { - if (!firmware_has_feature(FW_FEATURE_CMO)) - dev_err(dev, "Couldn't map offload buffer\n"); - return; - } - - memset(&newcrq, 0, sizeof(newcrq)); - newcrq.query_ip_offload.first = IBMVNIC_CRQ_CMD; - newcrq.query_ip_offload.cmd = QUERY_IP_OFFLOAD; - newcrq.query_ip_offload.len = cpu_to_be32(buf_sz); - newcrq.query_ip_offload.ioba = - cpu_to_be32(adapter->ip_offload_tok); - - ibmvnic_send_crq(adapter, &newcrq); - } + if (atomic_read(&adapter->running_cap_crqs) == 0) + send_query_ip_offload(adapter); } static int handle_login_rsp(union ibmvnic_crq *login_rsp_crq, struct ibmvnic_adapter *adapter) { struct device *dev = &adapter->vdev->dev; + struct net_device *netdev = adapter->netdev; struct ibmvnic_login_rsp_buffer *login_rsp = adapter->login_rsp_buf; struct ibmvnic_login_buffer *login = adapter->login_buf; + u64 *tx_handle_array; + u64 *rx_handle_array; + int num_tx_pools; + int num_rx_pools; + u64 *size_array; + u32 rsp_len; int i; - dma_unmap_single(dev, adapter->login_buf_token, adapter->login_buf_sz, - DMA_BIDIRECTIONAL); - dma_unmap_single(dev, adapter->login_rsp_buf_token, - adapter->login_rsp_buf_sz, DMA_BIDIRECTIONAL); + /* CHECK: Test/set of login_pending does not need to be atomic + * because only ibmvnic_tasklet tests/clears this. + */ + if (!adapter->login_pending) { + netdev_warn(netdev, "Ignoring unexpected login response\n"); + return 0; + } + adapter->login_pending = false; /* If the number of queues requested can't be allocated by the * server, the login response will return with code 1. We will need * to resend the login buffer with fewer queues requested. 
*/ if (login_rsp_crq->generic.rc.code) { - adapter->renegotiate = true; + adapter->init_done_rc = login_rsp_crq->generic.rc.code; complete(&adapter->init_done); return 0; } - netdev_dbg(adapter->netdev, "Login Response Buffer:\n"); - for (i = 0; i < (adapter->login_rsp_buf_sz - 1) / 8 + 1; i++) { - netdev_dbg(adapter->netdev, "%016lx\n", - ((unsigned long int *)(adapter->login_rsp_buf))[i]); + if (adapter->failover_pending) { + adapter->init_done_rc = -EAGAIN; + netdev_dbg(netdev, "Failover pending, ignoring login response\n"); + complete(&adapter->init_done); + /* login response buffer will be released on reset */ + return 0; } + netdev->mtu = adapter->req_mtu - ETH_HLEN; + + netdev_dbg(adapter->netdev, "Login Response Buffer:\n"); + ibmvnic_print_hex_dump(netdev, adapter->login_rsp_buf, + adapter->login_rsp_buf_sz); + /* Sanity checks */ if (login->num_txcomp_subcrqs != login_rsp->num_txsubm_subcrqs || (be32_to_cpu(login->num_rxcomp_subcrqs) * adapter->req_rx_add_queues != be32_to_cpu(login_rsp->num_rxadd_subcrqs))) { dev_err(dev, "FATAL: Inconsistent login and login rsp\n"); - ibmvnic_remove(adapter->vdev); + ibmvnic_reset(adapter, VNIC_RESET_FATAL); return -EIO; } + + rsp_len = be32_to_cpu(login_rsp->len); + if (be32_to_cpu(login->login_rsp_len) < rsp_len || + rsp_len <= be32_to_cpu(login_rsp->off_txsubm_subcrqs) || + rsp_len <= be32_to_cpu(login_rsp->off_rxadd_subcrqs) || + rsp_len <= be32_to_cpu(login_rsp->off_rxadd_buff_size) || + rsp_len <= be32_to_cpu(login_rsp->off_supp_tx_desc)) { + /* This can happen if a login request times out and there are + * 2 outstanding login requests sent, the LOGIN_RSP crq + * could have been for the older login request. So we are + * parsing the newer response buffer which may be incomplete + */ + dev_err(dev, "FATAL: Login rsp offsets/lengths invalid\n"); + ibmvnic_reset(adapter, VNIC_RESET_FATAL); + return -EIO; + } + + size_array = (u64 *)((u8 *)(adapter->login_rsp_buf) + + be32_to_cpu(adapter->login_rsp_buf->off_rxadd_buff_size)); + /* variable buffer sizes are not supported, so just read the + * first entry. 
+ */ + adapter->cur_rx_buf_sz = be64_to_cpu(size_array[0]); + + num_tx_pools = be32_to_cpu(adapter->login_rsp_buf->num_txsubm_subcrqs); + num_rx_pools = be32_to_cpu(adapter->login_rsp_buf->num_rxadd_subcrqs); + + tx_handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) + + be32_to_cpu(adapter->login_rsp_buf->off_txsubm_subcrqs)); + rx_handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) + + be32_to_cpu(adapter->login_rsp_buf->off_rxadd_subcrqs)); + + for (i = 0; i < num_tx_pools; i++) + adapter->tx_scrq[i]->handle = tx_handle_array[i]; + + for (i = 0; i < num_rx_pools; i++) + adapter->rx_scrq[i]->handle = rx_handle_array[i]; + + adapter->num_active_tx_scrqs = num_tx_pools; + adapter->num_active_rx_scrqs = num_rx_pools; + release_login_rsp_buffer(adapter); + release_login_buffer(adapter); complete(&adapter->init_done); return 0; @@ -3109,9 +5696,10 @@ static void handle_query_map_rsp(union ibmvnic_crq *crq, dev_err(dev, "Error %ld in QUERY_MAP_RSP\n", rc); return; } - netdev_dbg(netdev, "page_size = %d\ntot_pages = %d\nfree_pages = %d\n", - crq->query_map_rsp.page_size, crq->query_map_rsp.tot_pages, - crq->query_map_rsp.free_pages); + netdev_dbg(netdev, "page_size = %d\ntot_pages = %u\nfree_pages = %u\n", + crq->query_map_rsp.page_size, + __be32_to_cpu(crq->query_map_rsp.tot_pages), + __be32_to_cpu(crq->query_map_rsp.free_pages)); } static void handle_query_cap_rsp(union ibmvnic_crq *crq, @@ -3287,10 +5875,92 @@ static void handle_query_cap_rsp(union ibmvnic_crq *crq, } out: - if (atomic_read(&adapter->running_cap_crqs) == 0) { - adapter->wait_capability = false; - ibmvnic_send_req_caps(adapter, 0); + if (atomic_read(&adapter->running_cap_crqs) == 0) + send_request_cap(adapter, 0); +} + +static int send_query_phys_parms(struct ibmvnic_adapter *adapter) +{ + union ibmvnic_crq crq; + int rc; + + memset(&crq, 0, sizeof(crq)); + crq.query_phys_parms.first = IBMVNIC_CRQ_CMD; + crq.query_phys_parms.cmd = QUERY_PHYS_PARMS; + + mutex_lock(&adapter->fw_lock); + adapter->fw_done_rc = 0; + reinit_completion(&adapter->fw_done); + + rc = ibmvnic_send_crq(adapter, &crq); + if (rc) { + mutex_unlock(&adapter->fw_lock); + return rc; } + + rc = ibmvnic_wait_for_completion(adapter, &adapter->fw_done, 10000); + if (rc) { + mutex_unlock(&adapter->fw_lock); + return rc; + } + + mutex_unlock(&adapter->fw_lock); + return adapter->fw_done_rc ? 
-EIO : 0; +} + +static int handle_query_phys_parms_rsp(union ibmvnic_crq *crq, + struct ibmvnic_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + int rc; + __be32 rspeed = cpu_to_be32(crq->query_phys_parms_rsp.speed); + + rc = crq->query_phys_parms_rsp.rc.code; + if (rc) { + netdev_err(netdev, "Error %d in QUERY_PHYS_PARMS\n", rc); + return rc; + } + switch (rspeed) { + case IBMVNIC_10MBPS: + adapter->speed = SPEED_10; + break; + case IBMVNIC_100MBPS: + adapter->speed = SPEED_100; + break; + case IBMVNIC_1GBPS: + adapter->speed = SPEED_1000; + break; + case IBMVNIC_10GBPS: + adapter->speed = SPEED_10000; + break; + case IBMVNIC_25GBPS: + adapter->speed = SPEED_25000; + break; + case IBMVNIC_40GBPS: + adapter->speed = SPEED_40000; + break; + case IBMVNIC_50GBPS: + adapter->speed = SPEED_50000; + break; + case IBMVNIC_100GBPS: + adapter->speed = SPEED_100000; + break; + case IBMVNIC_200GBPS: + adapter->speed = SPEED_200000; + break; + default: + if (netif_carrier_ok(netdev)) + netdev_warn(netdev, "Unknown speed 0x%08x\n", rspeed); + adapter->speed = SPEED_UNKNOWN; + } + if (crq->query_phys_parms_rsp.flags1 & IBMVNIC_FULL_DUPLEX) + adapter->duplex = DUPLEX_FULL; + else if (crq->query_phys_parms_rsp.flags1 & IBMVNIC_HALF_DUPLEX) + adapter->duplex = DUPLEX_HALF; + else + adapter->duplex = DUPLEX_UNKNOWN; + + return rc; } static void ibmvnic_handle_crq(union ibmvnic_crq *crq, @@ -3303,18 +5973,50 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, long rc; netdev_dbg(netdev, "Handling CRQ: %016lx %016lx\n", - (unsigned long int)cpu_to_be64(u64_crq[0]), - (unsigned long int)cpu_to_be64(u64_crq[1])); + (unsigned long)cpu_to_be64(u64_crq[0]), + (unsigned long)cpu_to_be64(u64_crq[1])); switch (gen_crq->first) { case IBMVNIC_CRQ_INIT_RSP: switch (gen_crq->cmd) { case IBMVNIC_CRQ_INIT: dev_info(dev, "Partner initialized\n"); adapter->from_passive_init = true; - complete(&adapter->init_done); + /* Discard any stale login responses from prev reset. + * CHECK: should we clear even on INIT_COMPLETE? + */ + adapter->login_pending = false; + + if (adapter->state == VNIC_DOWN) + rc = ibmvnic_reset(adapter, VNIC_RESET_PASSIVE_INIT); + else + rc = ibmvnic_reset(adapter, VNIC_RESET_FAILOVER); + + if (rc && rc != -EBUSY) { + /* We were unable to schedule the failover + * reset either because the adapter was still + * probing (eg: during kexec) or we could not + * allocate memory. Clear the failover_pending + * flag since no one else will. We ignore + * EBUSY because it means either FAILOVER reset + * is already scheduled or the adapter is + * being removed. 
+ */ + netdev_err(netdev, + "Error %ld scheduling failover reset\n", + rc); + adapter->failover_pending = false; + } + + if (!completion_done(&adapter->init_done)) { + if (!adapter->init_done_rc) + adapter->init_done_rc = -EAGAIN; + complete(&adapter->init_done); + } + break; case IBMVNIC_CRQ_INIT_COMPLETE: dev_info(dev, "Partner initialization complete\n"); + adapter->crq.active = true; send_version_xchg(adapter); break; default: @@ -3323,12 +6025,31 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, return; case IBMVNIC_CRQ_XPORT_EVENT: netif_carrier_off(netdev); + adapter->crq.active = false; + /* terminate any thread waiting for a response + * from the device + */ + if (!completion_done(&adapter->fw_done)) { + adapter->fw_done_rc = -EIO; + complete(&adapter->fw_done); + } + + /* if we got here during crq-init, retry crq-init */ + if (!completion_done(&adapter->init_done)) { + adapter->init_done_rc = -EAGAIN; + complete(&adapter->init_done); + } + + if (!completion_done(&adapter->stats_done)) + complete(&adapter->stats_done); + if (test_bit(0, &adapter->resetting)) + adapter->force_reset_recovery = true; if (gen_crq->cmd == IBMVNIC_PARTITION_MIGRATED) { dev_info(dev, "Migrated, re-enabling adapter\n"); ibmvnic_reset(adapter, VNIC_RESET_MOBILITY); } else if (gen_crq->cmd == IBMVNIC_DEVICE_FAILOVER) { dev_info(dev, "Backing device failover detected\n"); - ibmvnic_reset(adapter, VNIC_RESET_FAILOVER); + adapter->failover_pending = true; } else { /* The adapter lost the connection */ dev_err(dev, "Virtual Adapter failed (rc=%d)\n", @@ -3351,13 +6072,11 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, dev_err(dev, "Error %ld in VERSION_EXCHG_RSP\n", rc); break; } - dev_info(dev, "Partner protocol version is %d\n", - crq->version_exchange_rsp.version); - if (be16_to_cpu(crq->version_exchange_rsp.version) < - ibmvnic_version) - ibmvnic_version = + ibmvnic_version = be16_to_cpu(crq->version_exchange_rsp.version); - send_cap_queries(adapter); + dev_info(dev, "Partner protocol version is %d\n", + ibmvnic_version); + send_query_cap(adapter); break; case QUERY_CAPABILITY_RSP: handle_query_cap_rsp(crq, adapter); @@ -3395,19 +6114,19 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, crq->link_state_indication.phys_link_state; adapter->logical_link_state = crq->link_state_indication.logical_link_state; + if (adapter->phys_link_state && adapter->logical_link_state) + netif_carrier_on(netdev); + else + netif_carrier_off(netdev); break; case CHANGE_MAC_ADDR_RSP: netdev_dbg(netdev, "Got MAC address change Response\n"); - handle_change_mac_rsp(crq, adapter); + adapter->fw_done_rc = handle_change_mac_rsp(crq, adapter); break; case ERROR_INDICATION: netdev_dbg(netdev, "Got Error Indication\n"); handle_error_indication(crq, adapter); break; - case REQUEST_ERROR_RSP: - netdev_dbg(netdev, "Got Error Detail Response\n"); - handle_error_info_rsp(crq, adapter); - break; case REQUEST_STATISTICS_RSP: netdev_dbg(netdev, "Got Statistics Response\n"); complete(&adapter->stats_done); @@ -3430,6 +6149,16 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, netdev_dbg(netdev, "Got Collect firmware trace Response\n"); complete(&adapter->fw_done); break; + case GET_VPD_SIZE_RSP: + handle_vpd_size_rsp(crq, adapter); + break; + case GET_VPD_RSP: + handle_vpd_rsp(crq, adapter); + break; + case QUERY_PHYS_PARMS_RSP: + adapter->fw_done_rc = handle_query_phys_parms_rsp(crq, adapter); + complete(&adapter->fw_done); + break; default: netdev_err(netdev, "Got an invalid cmd type 0x%02x\n", gen_crq->cmd); 
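
The QUERY_PHYS_PARMS path above illustrates the synchronous CRQ round trip this driver uses: the caller takes fw_lock, re-arms the fw_done completion, sends the CRQ, and sleeps until the CRQ tasklet stores the response code in fw_done_rc and completes fw_done. A minimal standalone sketch of that pattern follows; my_dev, my_send_cmd() and my_query() are hypothetical stand-ins for the adapter, ibmvnic_send_crq() and the caller, and a plain wait_for_completion_timeout() stands in for the driver's ibmvnic_wait_for_completion() helper.

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/mutex.h>

struct my_dev {
	struct mutex fw_lock;		/* serializes users of fw_done */
	struct completion fw_done;	/* completed from the CRQ tasklet */
	int fw_done_rc;			/* response code set by the handler */
};

static void my_dev_init(struct my_dev *d)
{
	mutex_init(&d->fw_lock);
	init_completion(&d->fw_done);
}

/* hypothetical transmit hook standing in for ibmvnic_send_crq() */
static int my_send_cmd(struct my_dev *d)
{
	/* hand the request off to firmware here */
	return 0;
}

/* caller side: issue a request and sleep until the response (or timeout) */
static int my_query(struct my_dev *d)
{
	int rc;

	mutex_lock(&d->fw_lock);
	d->fw_done_rc = 0;
	reinit_completion(&d->fw_done);

	rc = my_send_cmd(d);
	if (rc) {
		mutex_unlock(&d->fw_lock);
		return rc;
	}

	if (!wait_for_completion_timeout(&d->fw_done, msecs_to_jiffies(10000)))
		rc = -ETIMEDOUT;
	else
		rc = d->fw_done_rc ? -EIO : 0;

	mutex_unlock(&d->fw_lock);
	return rc;
}

/* response side: called from the CRQ tasklet when the reply CRQ arrives */
static void my_handle_rsp(struct my_dev *d, int rsp_code)
{
	d->fw_done_rc = rsp_code;
	complete(&d->fw_done);
}

The driver's own wait helper additionally watches for the CRQ going inactive so a dead transport does not pin the caller for the full timeout; the sketch collapses that into a single 10 s wait.
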
@@ -3444,33 +6173,27 @@ static irqreturn_t ibmvnic_interrupt(int irq, void *instance) return IRQ_HANDLED; } -static void ibmvnic_tasklet(void *data) +static void ibmvnic_tasklet(struct tasklet_struct *t) { - struct ibmvnic_adapter *adapter = data; + struct ibmvnic_adapter *adapter = from_tasklet(adapter, t, tasklet); struct ibmvnic_crq_queue *queue = &adapter->crq; union ibmvnic_crq *crq; unsigned long flags; - bool done = false; spin_lock_irqsave(&queue->lock, flags); - while (!done) { - /* Pull all the valid messages off the CRQ */ - while ((crq = ibmvnic_next_crq(adapter)) != NULL) { - ibmvnic_handle_crq(crq, adapter); - crq->generic.first = 0; - } - /* remain in tasklet until all - * capabilities responses are received + /* Pull all the valid messages off the CRQ */ + while ((crq = ibmvnic_next_crq(adapter)) != NULL) { + /* This barrier makes sure ibmvnic_next_crq()'s + * crq->generic.first & IBMVNIC_CRQ_CMD_RSP is loaded + * before ibmvnic_handle_crq()'s + * switch(gen_crq->first) and switch(gen_crq->cmd). */ - if (!adapter->wait_capability) - done = true; + dma_rmb(); + ibmvnic_handle_crq(crq, adapter); + crq->generic.first = 0; } - /* if capabilities CRQ's were sent in this tasklet, the following - * tasklet must wait until all responses are received - */ - if (atomic_read(&adapter->running_cap_crqs) != 0) - adapter->wait_capability = true; + spin_unlock_irqrestore(&queue->lock, flags); } @@ -3502,8 +6225,12 @@ static int ibmvnic_reset_crq(struct ibmvnic_adapter *adapter) } while (rc == H_BUSY || H_IS_LONG_BUSY(rc)); /* Clean out the queue */ + if (!crq->msgs) + return -EINVAL; + memset(crq->msgs, 0, PAGE_SIZE); crq->cur = 0; + crq->active = false; /* And re-open it again */ rc = plpar_hcall_norets(H_REG_CRQ, vdev->unit_address, @@ -3538,6 +6265,7 @@ static void release_crq_queue(struct ibmvnic_adapter *adapter) DMA_BIDIRECTIONAL); free_page((unsigned long)crq->msgs); crq->msgs = NULL; + crq->active = false; } static int init_crq_queue(struct ibmvnic_adapter *adapter) @@ -3579,12 +6307,12 @@ static int init_crq_queue(struct ibmvnic_adapter *adapter) retrc = 0; - tasklet_init(&adapter->tasklet, (void *)ibmvnic_tasklet, - (unsigned long)adapter); + tasklet_setup(&adapter->tasklet, (void *)ibmvnic_tasklet); netdev_dbg(adapter->netdev, "registering irq 0x%x\n", vdev->irq); - rc = request_irq(vdev->irq, ibmvnic_interrupt, 0, IBMVNIC_NAME, - adapter); + snprintf(crq->name, sizeof(crq->name), "ibmvnic-%x", + adapter->vdev->unit_address); + rc = request_irq(vdev->irq, ibmvnic_interrupt, 0, crq->name, adapter); if (rc) { dev_err(dev, "Couldn't register irq 0x%x. 
rc=%d\n", vdev->irq, rc); @@ -3600,6 +6328,9 @@ static int init_crq_queue(struct ibmvnic_adapter *adapter) crq->cur = 0; spin_lock_init(&crq->lock); + /* process any CRQs that were queued before we enabled interrupts */ + tasklet_schedule(&adapter->tasklet); + return retrc; req_irq_failed: @@ -3615,50 +6346,76 @@ map_failed: return retrc; } -static int ibmvnic_init(struct ibmvnic_adapter *adapter) +static int ibmvnic_reset_init(struct ibmvnic_adapter *adapter, bool reset) { struct device *dev = &adapter->vdev->dev; - unsigned long timeout = msecs_to_jiffies(30000); + unsigned long timeout = msecs_to_jiffies(20000); + u64 old_num_rx_queues = adapter->req_rx_queues; + u64 old_num_tx_queues = adapter->req_tx_queues; int rc; - if (adapter->resetting) { - rc = ibmvnic_reset_crq(adapter); - if (!rc) - rc = vio_enable_interrupts(adapter->vdev); - } else { - rc = init_crq_queue(adapter); - } + adapter->from_passive_init = false; + rc = ibmvnic_send_crq_init(adapter); if (rc) { - dev_err(dev, "Couldn't initialize crq. rc=%d\n", rc); + dev_err(dev, "Send crq init failed with error %d\n", rc); return rc; } - adapter->from_passive_init = false; - - init_completion(&adapter->init_done); - adapter->init_done_rc = 0; - ibmvnic_send_crq_init(adapter); if (!wait_for_completion_timeout(&adapter->init_done, timeout)) { dev_err(dev, "Initialization sequence timed out\n"); - return -1; + return -ETIMEDOUT; } if (adapter->init_done_rc) { release_crq_queue(adapter); + dev_err(dev, "CRQ-init failed, %d\n", adapter->init_done_rc); return adapter->init_done_rc; } if (adapter->from_passive_init) { adapter->state = VNIC_OPEN; adapter->from_passive_init = false; - return -1; + dev_err(dev, "CRQ-init failed, passive-init\n"); + return -EINVAL; } - if (adapter->resetting) - rc = reset_sub_crq_queues(adapter); - else + if (reset && + test_bit(0, &adapter->resetting) && !adapter->wait_for_reset && + adapter->reset_reason != VNIC_RESET_MOBILITY) { + if (adapter->req_rx_queues != old_num_rx_queues || + adapter->req_tx_queues != old_num_tx_queues) { + release_sub_crqs(adapter, 0); + rc = init_sub_crqs(adapter); + } else { + /* no need to reinitialize completely, but we do + * need to clean up transmits that were in flight + * when we processed the reset. Failure to do so + * will confound the upper layer, usually TCP, by + * creating the illusion of transmits that are + * awaiting completion. + */ + clean_tx_pools(adapter); + + rc = reset_sub_crq_queues(adapter); + } + } else { + if (adapter->reset_reason == VNIC_RESET_MOBILITY) { + /* After an LPM, reset the max number of indirect + * subcrq descriptors per H_SEND_SUB_CRQ_INDIRECT + * hcall to the default max (e.g POWER8 -> POWER10) + * + * If the new destination platform does not support + * the higher limit max (e.g. POWER10-> POWER8 LPM) + * H_PARAMETER will trigger automatic fallback to the + * safe minimum limit. 
+ */ + adapter->cur_max_ind_descs = IBMVNIC_MAX_IND_DESCS; + } + rc = init_sub_crqs(adapter); + } + if (rc) { dev_err(dev, "Initialization of sub crqs failed\n"); release_crq_queue(adapter); @@ -3681,6 +6438,8 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) struct ibmvnic_adapter *adapter; struct net_device *netdev; unsigned char *mac_addr_p; + unsigned long flags; + bool init_success; int rc; dev_dbg(&dev->dev, "entering ibmvnic_probe for UA 0x%x\n", @@ -3696,7 +6455,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) } netdev = alloc_etherdev_mq(sizeof(struct ibmvnic_adapter), - IBMVNIC_MAX_TX_QUEUES); + IBMVNIC_MAX_QUEUES); if (!netdev) return -ENOMEM; @@ -3705,75 +6464,202 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) dev_set_drvdata(&dev->dev, netdev); adapter->vdev = dev; adapter->netdev = netdev; + adapter->login_pending = false; + memset(&adapter->map_ids, 0, sizeof(adapter->map_ids)); + /* map_ids start at 1, so ensure map_id 0 is always "in-use" */ + bitmap_set(adapter->map_ids, 0, 1); ether_addr_copy(adapter->mac_addr, mac_addr_p); - ether_addr_copy(netdev->dev_addr, adapter->mac_addr); + eth_hw_addr_set(netdev, adapter->mac_addr); netdev->irq = dev->irq; netdev->netdev_ops = &ibmvnic_netdev_ops; netdev->ethtool_ops = &ibmvnic_ethtool_ops; SET_NETDEV_DEV(netdev, &dev->dev); - spin_lock_init(&adapter->stats_lock); - - INIT_LIST_HEAD(&adapter->errors); - spin_lock_init(&adapter->error_list_lock); - INIT_WORK(&adapter->ibmvnic_reset, __ibmvnic_reset); + INIT_DELAYED_WORK(&adapter->ibmvnic_delayed_reset, + __ibmvnic_delayed_reset); INIT_LIST_HEAD(&adapter->rwi_list); - mutex_init(&adapter->reset_lock); - mutex_init(&adapter->rwi_lock); - adapter->resetting = false; + spin_lock_init(&adapter->rwi_lock); + spin_lock_init(&adapter->state_lock); + mutex_init(&adapter->fw_lock); + init_completion(&adapter->probe_done); + init_completion(&adapter->init_done); + init_completion(&adapter->fw_done); + init_completion(&adapter->reset_done); + init_completion(&adapter->stats_done); + clear_bit(0, &adapter->resetting); + adapter->prev_rx_buf_sz = 0; + adapter->prev_mtu = 0; + init_success = false; do { - rc = ibmvnic_init(adapter); - if (rc && rc != EAGAIN) { - free_netdev(netdev); - return rc; + reinit_init_done(adapter); + + /* clear any failovers we got in the previous pass + * since we are reinitializing the CRQ + */ + adapter->failover_pending = false; + + /* If we had already initialized CRQ, we may have one or + * more resets queued already. Discard those and release + * the CRQ before initializing the CRQ again. + */ + release_crq_queue(adapter); + + /* Since we are still in PROBING state, __ibmvnic_reset() + * will not access the ->rwi_list and since we released CRQ, + * we won't get _new_ transport events. But there maybe an + * ongoing ibmvnic_reset() call. So serialize access to + * rwi_list. If we win the race, ibvmnic_reset() could add + * a reset after we purged but thats ok - we just may end + * up with an extra reset (i.e similar to having two or more + * resets in the queue at once). + * CHECK. + */ + spin_lock_irqsave(&adapter->rwi_lock, flags); + flush_reset_queue(adapter); + spin_unlock_irqrestore(&adapter->rwi_lock, flags); + + rc = init_crq_queue(adapter); + if (rc) { + dev_err(&dev->dev, "Couldn't initialize crq. 
rc=%d\n", + rc); + goto ibmvnic_init_fail; } - } while (rc == EAGAIN); - netdev->mtu = adapter->req_mtu - ETH_HLEN; + rc = ibmvnic_reset_init(adapter, false); + } while (rc == -EAGAIN); + + /* We are ignoring the error from ibmvnic_reset_init() assuming that the + * partner is not ready. CRQ is not active. When the partner becomes + * ready, we will do the passive init reset. + */ + + if (!rc) + init_success = true; + + rc = init_stats_buffers(adapter); + if (rc) + goto ibmvnic_init_fail; + + rc = init_stats_token(adapter); + if (rc) + goto ibmvnic_stats_fail; rc = device_create_file(&dev->dev, &dev_attr_failover); - if (rc) { - free_netdev(netdev); - return rc; + if (rc) + goto ibmvnic_dev_file_err; + + netif_carrier_off(netdev); + + if (init_success) { + adapter->state = VNIC_PROBED; + netdev->mtu = adapter->req_mtu - ETH_HLEN; + netdev->min_mtu = adapter->min_mtu - ETH_HLEN; + netdev->max_mtu = adapter->max_mtu - ETH_HLEN; + } else { + adapter->state = VNIC_DOWN; } + adapter->wait_for_reset = false; + adapter->last_reset_time = jiffies; + adapter->cur_max_ind_descs = IBMVNIC_MAX_IND_DESCS; + rc = register_netdev(netdev); if (rc) { dev_err(&dev->dev, "failed to register netdev rc=%d\n", rc); - device_remove_file(&dev->dev, &dev_attr_failover); - free_netdev(netdev); - return rc; + goto ibmvnic_register_fail; } dev_info(&dev->dev, "ibmvnic registered\n"); - adapter->state = VNIC_PROBED; + rc = ibmvnic_cpu_notif_add(adapter); + if (rc) { + netdev_err(netdev, "Registering cpu notifier failed\n"); + goto cpu_notif_add_failed; + } + + complete(&adapter->probe_done); + return 0; + +cpu_notif_add_failed: + unregister_netdev(netdev); + +ibmvnic_register_fail: + device_remove_file(&dev->dev, &dev_attr_failover); + +ibmvnic_dev_file_err: + release_stats_token(adapter); + +ibmvnic_stats_fail: + release_stats_buffers(adapter); + +ibmvnic_init_fail: + release_sub_crqs(adapter, 1); + release_crq_queue(adapter); + + /* cleanup worker thread after releasing CRQ so we don't get + * transport events (i.e new work items for the worker thread). + */ + adapter->state = VNIC_REMOVING; + complete(&adapter->probe_done); + flush_work(&adapter->ibmvnic_reset); + flush_delayed_work(&adapter->ibmvnic_delayed_reset); + + flush_reset_queue(adapter); + + mutex_destroy(&adapter->fw_lock); + free_netdev(netdev); + + return rc; } -static int ibmvnic_remove(struct vio_dev *dev) +static void ibmvnic_remove(struct vio_dev *dev) { struct net_device *netdev = dev_get_drvdata(&dev->dev); struct ibmvnic_adapter *adapter = netdev_priv(netdev); + unsigned long flags; + + spin_lock_irqsave(&adapter->state_lock, flags); + /* If ibmvnic_reset() is scheduling a reset, wait for it to + * finish. Then, set the state to REMOVING to prevent it from + * scheduling any more work and to have reset functions ignore + * any resets that have already been scheduled. Drop the lock + * after setting state, so __ibmvnic_reset() which is called + * from the flush_work() below, can make progress. 
+ */ + spin_lock(&adapter->rwi_lock); adapter->state = VNIC_REMOVING; - unregister_netdev(netdev); - mutex_lock(&adapter->reset_lock); + spin_unlock(&adapter->rwi_lock); + + spin_unlock_irqrestore(&adapter->state_lock, flags); + + ibmvnic_cpu_notif_remove(adapter); + + flush_work(&adapter->ibmvnic_reset); + flush_delayed_work(&adapter->ibmvnic_delayed_reset); + + rtnl_lock(); + unregister_netdevice(netdev); release_resources(adapter); - release_sub_crqs(adapter); + release_rx_pools(adapter); + release_tx_pools(adapter); + release_sub_crqs(adapter, 1); release_crq_queue(adapter); + release_stats_token(adapter); + release_stats_buffers(adapter); + adapter->state = VNIC_REMOVED; - mutex_unlock(&adapter->reset_lock); + rtnl_unlock(); + mutex_destroy(&adapter->fw_lock); device_remove_file(&dev->dev, &dev_attr_failover); free_netdev(netdev); dev_set_drvdata(&dev->dev, NULL); - - return 0; } static ssize_t failover_store(struct device *dev, struct device_attribute *attr, @@ -3793,7 +6679,7 @@ static ssize_t failover_store(struct device *dev, struct device_attribute *attr, if (rc) { netdev_err(netdev, "Couldn't retrieve session token, rc %ld\n", rc); - return -EINVAL; + goto last_resort; } session_token = (__be64)retbuf[0]; @@ -3802,15 +6688,21 @@ static ssize_t failover_store(struct device *dev, struct device_attribute *attr, rc = plpar_hcall_norets(H_VIOCTL, adapter->vdev->unit_address, H_SESSION_ERR_DETECTED, session_token, 0, 0); if (rc) { - netdev_err(netdev, "Client initiated failover failed, rc %ld\n", + netdev_err(netdev, + "H_VIOCTL initiated failover failed, rc %ld\n", rc); - return -EINVAL; + goto last_resort; } return count; -} -static DEVICE_ATTR(failover, 0200, NULL, failover_store); +last_resort: + netdev_dbg(netdev, "Trying to send CRQ_CMD, the last resort\n"); + ibmvnic_reset(adapter, VNIC_RESET_FAILOVER); + + return count; +} +static DEVICE_ATTR_WO(failover); static unsigned long ibmvnic_get_desired_dma(struct vio_dev *vdev) { @@ -3834,8 +6726,7 @@ static unsigned long ibmvnic_get_desired_dma(struct vio_dev *vdev) for (i = 0; i < adapter->req_tx_queues + adapter->req_rx_queues; i++) ret += 4 * PAGE_SIZE; /* the scrq message queue */ - for (i = 0; i < be32_to_cpu(adapter->login_rsp_buf->num_rxadd_subcrqs); - i++) + for (i = 0; i < adapter->num_active_rx_pools; i++) ret += adapter->rx_pool[i].size * IOMMU_PAGE_ALIGN(adapter->rx_pool[i].buff_size, tbl); @@ -3846,20 +6737,16 @@ static int ibmvnic_resume(struct device *dev) { struct net_device *netdev = dev_get_drvdata(dev); struct ibmvnic_adapter *adapter = netdev_priv(netdev); - int i; if (adapter->state != VNIC_OPEN) return 0; - /* kick the interrupt handlers just in case we lost an interrupt */ - for (i = 0; i < adapter->req_rx_queues; i++) - ibmvnic_interrupt_rx(adapter->rx_scrq[i]->irq, - adapter->rx_scrq[i]); + tasklet_schedule(&adapter->tasklet); return 0; } -static struct vio_device_id ibmvnic_device_table[] = { +static const struct vio_device_id ibmvnic_device_table[] = { {"network", "IBM,vnic"}, {"", "" } }; @@ -3881,15 +6768,40 @@ static struct vio_driver ibmvnic_driver = { /* module functions */ static int __init ibmvnic_module_init(void) { + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "net/ibmvnic:online", + ibmvnic_cpu_online, + ibmvnic_cpu_down_prep); + if (ret < 0) + goto out; + ibmvnic_online = ret; + ret = cpuhp_setup_state_multi(CPUHP_IBMVNIC_DEAD, "net/ibmvnic:dead", + NULL, ibmvnic_cpu_dead); + if (ret) + goto err_dead; + + ret = vio_register_driver(&ibmvnic_driver); + if (ret) + goto 
err_vio_register;
+
 	pr_info("%s: %s %s\n", ibmvnic_driver_name, ibmvnic_driver_string,
 		IBMVNIC_DRIVER_VERSION);
 
-	return vio_register_driver(&ibmvnic_driver);
+	return 0;
+err_vio_register:
+	cpuhp_remove_multi_state(CPUHP_IBMVNIC_DEAD);
+err_dead:
+	cpuhp_remove_multi_state(ibmvnic_online);
+out:
+	return ret;
 }
 
 static void __exit ibmvnic_module_exit(void)
 {
 	vio_unregister_driver(&ibmvnic_driver);
+	cpuhp_remove_multi_state(CPUHP_IBMVNIC_DEAD);
+	cpuhp_remove_multi_state(ibmvnic_online);
 }
 
 module_init(ibmvnic_module_init);
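
The new module init path above registers two multi-instance CPU hotplug states before the vio driver and unwinds them in reverse order on failure; the per-adapter hook-up happens later via ibmvnic_cpu_notif_add() in the probe path. A compact sketch of the same shape for a hypothetical module follows; unlike ibmvnic, which reserves a dedicated CPUHP_IBMVNIC_DEAD slot, the sketch allocates its dead-side state dynamically from CPUHP_BP_PREPARE_DYN, so both setup calls are checked for a negative return.

#include <linux/cpuhotplug.h>
#include <linux/module.h>

static enum cpuhp_state my_online_state;
static enum cpuhp_state my_dead_state;

static int my_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	/* e.g. recompute queue/IRQ affinity hints now that @cpu is up */
	return 0;
}

static int my_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
{
	/* undo per-CPU work before @cpu goes away */
	return 0;
}

static int my_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	/* final cleanup after @cpu is offline */
	return 0;
}

static int __init my_module_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "net/mydrv:online",
				      my_cpu_online, my_cpu_down_prep);
	if (ret < 0)
		return ret;
	my_online_state = ret;

	ret = cpuhp_setup_state_multi(CPUHP_BP_PREPARE_DYN, "net/mydrv:dead",
				      NULL, my_cpu_dead);
	if (ret < 0)
		goto err_dead;
	my_dead_state = ret;

	/* real driver registration (e.g. vio_register_driver()) goes here,
	 * unwinding both hotplug states if it fails
	 */
	return 0;

err_dead:
	cpuhp_remove_multi_state(my_online_state);
	return ret;
}

static void __exit my_module_exit(void)
{
	cpuhp_remove_multi_state(my_dead_state);
	cpuhp_remove_multi_state(my_online_state);
}

module_init(my_module_init);
module_exit(my_module_exit);
MODULE_LICENSE("GPL");
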

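
The ibmvnic_tasklet hunk earlier in this diff (the tasklet_init() to tasklet_setup() switch) follows the kernel-wide conversion in which the callback receives the tasklet_struct itself and recovers its container with from_tasklet(), instead of being handed the adapter pointer as an unsigned long cookie. A minimal sketch, assuming a hypothetical struct my_tasklet_dev that embeds the tasklet:

#include <linux/atomic.h>
#include <linux/interrupt.h>

struct my_tasklet_dev {
	struct tasklet_struct tasklet;
	atomic_t pending;	/* hypothetical work counter */
};

/* new-style callback: the tasklet itself is passed in */
static void my_tasklet_fn(struct tasklet_struct *t)
{
	struct my_tasklet_dev *d = from_tasklet(d, t, tasklet);

	/* drain queued work, as ibmvnic_tasklet() drains the CRQ */
	atomic_set(&d->pending, 0);
}

static void my_dev_setup(struct my_tasklet_dev *d)
{
	/* old: tasklet_init(&d->tasklet, fn, (unsigned long)d); */
	tasklet_setup(&d->tasklet, my_tasklet_fn);
}

Scheduling is unchanged: tasklet_schedule(&d->tasklet) still queues the callback, which is how init_crq_queue() above kicks processing of CRQs that arrived before the IRQ was registered.
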