Diffstat (limited to 'drivers/nvme/host/pci.c')
-rw-r--r--  drivers/nvme/host/pci.c | 606
1 file changed, 268 insertions(+), 338 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 488ad7dabeb8..f0f8027644bb 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -15,6 +15,7 @@
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
+#include <linux/kstrtox.h>
#include <linux/memremap.h>
#include <linux/mm.h>
#include <linux/module.h>
@@ -108,7 +109,7 @@ struct nvme_dev;
struct nvme_queue;
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
-static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
+static void nvme_delete_io_queues(struct nvme_dev *dev);
/*
* Represents an NVM Express device. Each nvme_dev is a PCI function.
@@ -130,7 +131,6 @@ struct nvme_dev {
u32 db_stride;
void __iomem *bar;
unsigned long bar_mapped_size;
- struct work_struct remove_work;
struct mutex shutdown_lock;
bool subsystem;
u64 cmb_size;
@@ -158,8 +158,6 @@ struct nvme_dev {
unsigned int nr_allocated_queues;
unsigned int nr_write_queues;
unsigned int nr_poll_queues;
-
- bool attrs_added;
};
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -241,10 +239,13 @@ static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
return dev->nr_allocated_queues * 8 * dev->db_stride;
}
-static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
+static void nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
{
unsigned int mem_size = nvme_dbbuf_size(dev);
+ if (!(dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP))
+ return;
+
if (dev->dbbuf_dbs) {
/*
* Clear the dbbuf memory so the driver doesn't observe stale
@@ -252,25 +253,27 @@ static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
*/
memset(dev->dbbuf_dbs, 0, mem_size);
memset(dev->dbbuf_eis, 0, mem_size);
- return 0;
+ return;
}
dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
&dev->dbbuf_dbs_dma_addr,
GFP_KERNEL);
if (!dev->dbbuf_dbs)
- return -ENOMEM;
+ goto fail;
dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
&dev->dbbuf_eis_dma_addr,
GFP_KERNEL);
- if (!dev->dbbuf_eis) {
- dma_free_coherent(dev->dev, mem_size,
- dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
- dev->dbbuf_dbs = NULL;
- return -ENOMEM;
- }
+ if (!dev->dbbuf_eis)
+ goto fail_free_dbbuf_dbs;
+ return;
- return 0;
+fail_free_dbbuf_dbs:
+ dma_free_coherent(dev->dev, mem_size, dev->dbbuf_dbs,
+ dev->dbbuf_dbs_dma_addr);
+ dev->dbbuf_dbs = NULL;
+fail:
+ dev_warn(dev->dev, "unable to allocate dma for dbbuf\n");
}
static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
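The hunk above turns nvme_dbbuf_dma_alloc() into a void function: the doorbell buffer is an optional feature, so a failed allocation now only warns and unwinds whatever was already set up. A minimal user-space sketch of that goto-unwind pattern, with illustrative names rather than kernel code:

#include <stdio.h>
#include <stdlib.h>

struct ctx { void *dbs; void *eis; };

static void feature_alloc(struct ctx *c, size_t size)
{
	c->dbs = malloc(size);
	if (!c->dbs)
		goto fail;
	c->eis = malloc(size);
	if (!c->eis)
		goto fail_free_dbs;
	return;

fail_free_dbs:
	free(c->dbs);
	c->dbs = NULL;
fail:
	fprintf(stderr, "optional feature unavailable, continuing without it\n");
}

int main(void)
{
	struct ctx c = { 0 };

	feature_alloc(&c, 4096);
	free(c.eis);
	free(c.dbs);
	return 0;
}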
@@ -392,18 +395,10 @@ static int nvme_pci_npages_sgl(void)
PAGE_SIZE);
}
-static size_t nvme_pci_iod_alloc_size(void)
-{
- size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());
-
- return sizeof(__le64 *) * npages +
- sizeof(struct scatterlist) * NVME_MAX_SEGS;
-}
-
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
- struct nvme_dev *dev = data;
+ struct nvme_dev *dev = to_nvme_dev(data);
struct nvme_queue *nvmeq = &dev->queues[0];
WARN_ON(hctx_idx != 0);
@@ -416,7 +411,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
- struct nvme_dev *dev = data;
+ struct nvme_dev *dev = to_nvme_dev(data);
struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
@@ -428,7 +423,7 @@ static int nvme_pci_init_request(struct blk_mq_tag_set *set,
struct request *req, unsigned int hctx_idx,
unsigned int numa_node)
{
- struct nvme_dev *dev = set->driver_data;
+ struct nvme_dev *dev = to_nvme_dev(set->driver_data);
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
nvme_req(req)->ctrl = &dev->ctrl;
@@ -447,7 +442,7 @@ static int queue_irq_offset(struct nvme_dev *dev)
static void nvme_pci_map_queues(struct blk_mq_tag_set *set)
{
- struct nvme_dev *dev = set->driver_data;
+ struct nvme_dev *dev = to_nvme_dev(set->driver_data);
int i, qoff, offset;
offset = queue_irq_offset(dev);
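The to_nvme_dev() conversions above reflect that set->driver_data now stores the embedded struct nvme_ctrl rather than the nvme_dev itself, and the PCI driver recovers its private structure with a container_of() downcast. A self-contained sketch of the idiom, using stand-in types:

#include <stddef.h>
#include <stdio.h>

struct ctrl { int state; };
struct dev { int nr_queues; struct ctrl ctrl; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static struct dev *to_dev(struct ctrl *c)
{
	return container_of(c, struct dev, ctrl);
}

int main(void)
{
	struct dev d = { .nr_queues = 4 };
	struct ctrl *handle = &d.ctrl;	/* what driver_data now stores */

	printf("nr_queues via downcast: %d\n", to_dev(handle)->nr_queues);
	return 0;
}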
@@ -914,7 +909,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
goto out_unmap_data;
}
- blk_mq_start_request(req);
+ nvme_start_request(req);
return BLK_STS_OK;
out_unmap_data:
nvme_unmap_data(dev, req);
@@ -1474,24 +1469,21 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
}
}
-/**
- * nvme_suspend_queue - put queue into suspended state
- * @nvmeq: queue to suspend
- */
-static int nvme_suspend_queue(struct nvme_queue *nvmeq)
+static void nvme_suspend_queue(struct nvme_dev *dev, unsigned int qid)
{
+ struct nvme_queue *nvmeq = &dev->queues[qid];
+
if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
- return 1;
+ return;
/* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
mb();
nvmeq->dev->online_queues--;
if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
- nvme_stop_admin_queue(&nvmeq->dev->ctrl);
+ nvme_quiesce_admin_queue(&nvmeq->dev->ctrl);
if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
- pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
- return 0;
+ pci_free_irq(to_pci_dev(dev->dev), nvmeq->cq_vector, nvmeq);
}
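nvme_suspend_queue() now takes the device and a queue id, but the synchronization it performs is unchanged: atomically clear the enabled flag, then issue a full barrier so the submission path observes the flag before the queue is torn down. A user-space analogue with C11 atomics, for illustration only:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool queue_enabled = true;

static void suspend_queue(void)
{
	/* test_and_clear_bit analogue: only the first caller proceeds */
	if (!atomic_exchange(&queue_enabled, false))
		return;
	/* full fence so the submit path sees the flag before teardown */
	atomic_thread_fence(memory_order_seq_cst);
	puts("queue torn down");
}

int main(void)
{
	suspend_queue();
	suspend_queue();	/* second call is a no-op */
	return 0;
}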
static void nvme_suspend_io_queues(struct nvme_dev *dev)
@@ -1499,19 +1491,7 @@ static void nvme_suspend_io_queues(struct nvme_dev *dev)
int i;
for (i = dev->ctrl.queue_count - 1; i > 0; i--)
- nvme_suspend_queue(&dev->queues[i]);
-}
-
-static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
-{
- struct nvme_queue *nvmeq = &dev->queues[0];
-
- if (shutdown)
- nvme_shutdown_ctrl(&dev->ctrl);
- else
- nvme_disable_ctrl(&dev->ctrl);
-
- nvme_poll_irqdisable(nvmeq);
+ nvme_suspend_queue(dev, i);
}
/*
@@ -1748,44 +1728,11 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev)
* user requests may be waiting on a stopped queue. Start the
* queue to flush these to completion.
*/
- nvme_start_admin_queue(&dev->ctrl);
- blk_mq_destroy_queue(dev->ctrl.admin_q);
- blk_mq_free_tag_set(&dev->admin_tagset);
+ nvme_unquiesce_admin_queue(&dev->ctrl);
+ nvme_remove_admin_tag_set(&dev->ctrl);
}
}
-static int nvme_pci_alloc_admin_tag_set(struct nvme_dev *dev)
-{
- struct blk_mq_tag_set *set = &dev->admin_tagset;
-
- set->ops = &nvme_mq_admin_ops;
- set->nr_hw_queues = 1;
-
- set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
- set->timeout = NVME_ADMIN_TIMEOUT;
- set->numa_node = dev->ctrl.numa_node;
- set->cmd_size = sizeof(struct nvme_iod);
- set->flags = BLK_MQ_F_NO_SCHED;
- set->driver_data = dev;
-
- if (blk_mq_alloc_tag_set(set))
- return -ENOMEM;
- dev->ctrl.admin_tagset = set;
-
- dev->ctrl.admin_q = blk_mq_init_queue(set);
- if (IS_ERR(dev->ctrl.admin_q)) {
- blk_mq_free_tag_set(set);
- dev->ctrl.admin_q = NULL;
- return -ENOMEM;
- }
- if (!blk_get_queue(dev->ctrl.admin_q)) {
- nvme_dev_remove_admin(dev);
- dev->ctrl.admin_q = NULL;
- return -ENODEV;
- }
- return 0;
-}
-
static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride);
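For context on db_bar_size(): each queue pair, the admin pair included, owns one submission and one completion doorbell, which is where the 8 bytes per pair scaled by the doorbell stride come from. A worked example assuming typical values (doorbells start at offset 0x1000 per the NVMe spec, stride 1 when CAP.DSTRD is 0):

#include <stdio.h>

int main(void)
{
	unsigned int reg_dbs = 0x1000;	/* NVME_REG_DBS: doorbell area offset */
	unsigned int stride = 1;	/* CAP.DSTRD == 0 -> 4-byte doorbells */
	unsigned int nr_io_queues = 64;

	/* +1 accounts for the admin queue pair */
	printf("doorbell BAR size: %#x\n",
	       reg_dbs + (nr_io_queues + 1) * 8 * stride);
	return 0;
}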
@@ -1829,7 +1776,14 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
(readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
- result = nvme_disable_ctrl(&dev->ctrl);
+ /*
+ * If the device has been passed off to us in an enabled state, just
+ * clear the enabled bit. The spec says we should set the 'shutdown
+ * notification bits', but doing so may cause the device to complete
+ * commands to the admin queue ... and we don't know what memory that
+ * might be pointing at!
+ */
+ result = nvme_disable_ctrl(&dev->ctrl, false);
if (result < 0)
return result;
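The new comment documents why probe disables the controller via the enable bit rather than a shutdown notification. As a reference, a stand-alone sketch of the two CC register manipulations being contrasted; bit positions follow the NVMe specification, and the helper is a local stand-in:

#include <stdint.h>
#include <stdio.h>

#define CC_EN		(1u << 0)
#define CC_SHN_NORMAL	(1u << 14)	/* CC.SHN = 01b, bits 15:14 */

static uint32_t ctrl_stop(uint32_t cc, int shutdown)
{
	if (shutdown)
		return cc | CC_SHN_NORMAL;	/* graceful shutdown notification */
	return cc & ~CC_EN;			/* just clear the enable bit */
}

int main(void)
{
	printf("reset path: %#x, shutdown path: %#x\n",
	       ctrl_stop(CC_EN, 0), ctrl_stop(CC_EN, 1));
	return 0;
}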
@@ -2112,6 +2066,9 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
u32 enable_bits = NVME_HOST_MEM_ENABLE;
int ret;
+ if (!dev->ctrl.hmpre)
+ return 0;
+
preferred = min(preferred, max);
if (min > max) {
dev_warn(dev->ctrl.device,
@@ -2192,7 +2149,7 @@ static ssize_t hmb_store(struct device *dev, struct device_attribute *attr,
bool new;
int ret;
- if (strtobool(buf, &new) < 0)
+ if (kstrtobool(buf, &new) < 0)
return -EINVAL;
if (new == ndev->hmb)
@@ -2240,11 +2197,17 @@ static struct attribute *nvme_pci_attrs[] = {
NULL,
};
-static const struct attribute_group nvme_pci_attr_group = {
+static const struct attribute_group nvme_pci_dev_attrs_group = {
.attrs = nvme_pci_attrs,
.is_visible = nvme_pci_attrs_are_visible,
};
+static const struct attribute_group *nvme_pci_dev_attr_groups[] = {
+ &nvme_dev_attrs_group,
+ &nvme_pci_dev_attrs_group,
+ NULL,
+};
+
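The sysfs groups are now handed to the core as a NULL-terminated array of attribute_group pointers, so no separate count travels with them. A minimal sketch of building and walking such a sentinel-terminated array, with stand-in types:

#include <stdio.h>

struct attribute_group { const char *name; };

static const struct attribute_group dev_group = { "nvme-core" };
static const struct attribute_group pci_group = { "nvme-pci" };

static const struct attribute_group *groups[] = {
	&dev_group,
	&pci_group,
	NULL,		/* sentinel: no length field needed */
};

int main(void)
{
	for (const struct attribute_group **g = groups; *g; g++)
		printf("would register group: %s\n", (*g)->name);
	return 0;
}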
/*
* nirqs is the number of interrupts available for write and read
* queues. The core already reserved an interrupt for the admin queue.
@@ -2319,12 +2282,6 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
}
-static void nvme_disable_io_queues(struct nvme_dev *dev)
-{
- if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq))
- __nvme_disable_io_queues(dev, nvme_admin_delete_cq);
-}
-
static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
{
/*
@@ -2432,7 +2389,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
if (dev->online_queues - 1 < dev->max_qid) {
nr_io_queues = dev->online_queues - 1;
- nvme_disable_io_queues(dev);
+ nvme_delete_io_queues(dev);
result = nvme_setup_io_queues_trylock(dev);
if (result)
return result;
@@ -2495,7 +2452,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
return 0;
}
-static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
+static bool __nvme_delete_io_queues(struct nvme_dev *dev, u8 opcode)
{
int nr_queues = dev->online_queues - 1, sent = 0;
unsigned long timeout;
@@ -2523,40 +2480,19 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
return true;
}
-static void nvme_pci_alloc_tag_set(struct nvme_dev *dev)
+static void nvme_delete_io_queues(struct nvme_dev *dev)
{
- struct blk_mq_tag_set * set = &dev->tagset;
- int ret;
+ if (__nvme_delete_io_queues(dev, nvme_admin_delete_sq))
+ __nvme_delete_io_queues(dev, nvme_admin_delete_cq);
+}
- set->ops = &nvme_mq_ops;
- set->nr_hw_queues = dev->online_queues - 1;
- set->nr_maps = 1;
- if (dev->io_queues[HCTX_TYPE_READ])
- set->nr_maps = 2;
+static unsigned int nvme_pci_nr_maps(struct nvme_dev *dev)
+{
if (dev->io_queues[HCTX_TYPE_POLL])
- set->nr_maps = 3;
- set->timeout = NVME_IO_TIMEOUT;
- set->numa_node = dev->ctrl.numa_node;
- set->queue_depth = min_t(unsigned, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
- set->cmd_size = sizeof(struct nvme_iod);
- set->flags = BLK_MQ_F_SHOULD_MERGE;
- set->driver_data = dev;
-
- /*
- * Some Apple controllers requires tags to be unique
- * across admin and IO queue, so reserve the first 32
- * tags of the IO queue.
- */
- if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
- set->reserved_tags = NVME_AQ_DEPTH;
-
- ret = blk_mq_alloc_tag_set(set);
- if (ret) {
- dev_warn(dev->ctrl.device,
- "IO queues tagset allocation failed %d\n", ret);
- return;
- }
- dev->ctrl.tagset = set;
+ return 3;
+ if (dev->io_queues[HCTX_TYPE_READ])
+ return 2;
+ return 1;
}
static void nvme_pci_update_nr_queues(struct nvme_dev *dev)
@@ -2647,7 +2583,8 @@ static int nvme_pci_enable(struct nvme_dev *dev)
pci_enable_pcie_error_reporting(pdev);
pci_save_state(pdev);
- return 0;
+
+ return nvme_pci_configure_admin_queue(dev);
disable:
pci_disable_device(pdev);
@@ -2661,57 +2598,53 @@ static void nvme_dev_unmap(struct nvme_dev *dev)
pci_release_mem_regions(to_pci_dev(dev->dev));
}
-static void nvme_pci_disable(struct nvme_dev *dev)
+static bool nvme_pci_ctrl_is_dead(struct nvme_dev *dev)
{
struct pci_dev *pdev = to_pci_dev(dev->dev);
+ u32 csts;
- pci_free_irq_vectors(pdev);
+ if (!pci_is_enabled(pdev) || !pci_device_is_present(pdev))
+ return true;
+ if (pdev->error_state != pci_channel_io_normal)
+ return true;
- if (pci_is_enabled(pdev)) {
- pci_disable_pcie_error_reporting(pdev);
- pci_disable_device(pdev);
- }
+ csts = readl(dev->bar + NVME_REG_CSTS);
+ return (csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY);
}
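nvme_pci_ctrl_is_dead() centralizes the CSTS check that nvme_dev_disable() used to do inline. A self-contained sketch of that register decode; the bit positions are as defined by the NVMe specification (CSTS.RDY bit 0, CSTS.CFS bit 1):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NVME_CSTS_RDY	(1u << 0)	/* controller ready */
#define NVME_CSTS_CFS	(1u << 1)	/* controller fatal status */

static bool ctrl_is_dead(uint32_t csts)
{
	return (csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY);
}

int main(void)
{
	printf("alive: %d, fatal: %d, never ready: %d\n",
	       ctrl_is_dead(NVME_CSTS_RDY),			/* 0 */
	       ctrl_is_dead(NVME_CSTS_RDY | NVME_CSTS_CFS),	/* 1 */
	       ctrl_is_dead(0));				/* 1 */
	return 0;
}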
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
- bool dead = true, freeze = false;
struct pci_dev *pdev = to_pci_dev(dev->dev);
+ bool dead;
mutex_lock(&dev->shutdown_lock);
- if (pci_is_enabled(pdev)) {
- u32 csts;
-
- if (pci_device_is_present(pdev))
- csts = readl(dev->bar + NVME_REG_CSTS);
- else
- csts = ~0;
-
- if (dev->ctrl.state == NVME_CTRL_LIVE ||
- dev->ctrl.state == NVME_CTRL_RESETTING) {
- freeze = true;
+ dead = nvme_pci_ctrl_is_dead(dev);
+ if (dev->ctrl.state == NVME_CTRL_LIVE ||
+ dev->ctrl.state == NVME_CTRL_RESETTING) {
+ if (pci_is_enabled(pdev))
nvme_start_freeze(&dev->ctrl);
- }
- dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
- pdev->error_state != pci_channel_io_normal);
+ /*
+ * Give the controller a chance to complete all entered requests
+ * if doing a safe shutdown.
+ */
+ if (!dead && shutdown)
+ nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
}
- /*
- * Give the controller a chance to complete all entered requests if
- * doing a safe shutdown.
- */
- if (!dead && shutdown && freeze)
- nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
-
- nvme_stop_queues(&dev->ctrl);
+ nvme_quiesce_io_queues(&dev->ctrl);
if (!dead && dev->ctrl.queue_count > 0) {
- nvme_disable_io_queues(dev);
- nvme_disable_admin_queue(dev, shutdown);
+ nvme_delete_io_queues(dev);
+ nvme_disable_ctrl(&dev->ctrl, shutdown);
+ nvme_poll_irqdisable(&dev->queues[0]);
}
nvme_suspend_io_queues(dev);
- nvme_suspend_queue(&dev->queues[0]);
- nvme_pci_disable(dev);
+ nvme_suspend_queue(dev, 0);
+ pci_free_irq_vectors(pdev);
+ if (pci_is_enabled(pdev)) {
+ pci_disable_pcie_error_reporting(pdev);
+ pci_disable_device(pdev);
+ }
nvme_reap_pending_cqes(dev);
nvme_cancel_tagset(&dev->ctrl);
@@ -2723,9 +2656,9 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
* deadlocking blk-mq hot-cpu notifier.
*/
if (shutdown) {
- nvme_start_queues(&dev->ctrl);
+ nvme_unquiesce_io_queues(&dev->ctrl);
if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
- nvme_start_admin_queue(&dev->ctrl);
+ nvme_unquiesce_admin_queue(&dev->ctrl);
}
mutex_unlock(&dev->shutdown_lock);
}
@@ -2762,42 +2695,40 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
dma_pool_destroy(dev->prp_small_pool);
}
+static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
+{
+ size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());
+ size_t alloc_size = sizeof(__le64 *) * npages +
+ sizeof(struct scatterlist) * NVME_MAX_SEGS;
+
+ WARN_ON_ONCE(alloc_size > PAGE_SIZE);
+ dev->iod_mempool = mempool_create_node(1,
+ mempool_kmalloc, mempool_kfree,
+ (void *)alloc_size, GFP_KERNEL,
+ dev_to_node(dev->dev));
+ if (!dev->iod_mempool)
+ return -ENOMEM;
+ return 0;
+}
+
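nvme_pci_alloc_iod_mempool() keeps the old sizing rule: one page-pointer list plus the scatterlist array has to fit in a single page, which the WARN_ON_ONCE asserts. A worked sketch of that arithmetic with stand-in values; the kernel derives the real ones from NVME_MAX_KB_SZ, NVME_MAX_SEGS, and sizeof(struct scatterlist):

#include <stdio.h>

int main(void)
{
	size_t page_size = 4096;
	size_t npages = 5;	/* max(PRP list pages, SGL pages), stand-in */
	size_t nsegs = 127;	/* max segments per command, stand-in */
	size_t sg_entry = 24;	/* scatterlist entry size, arch dependent */
	size_t alloc = sizeof(void *) * npages + sg_entry * nsegs;

	printf("iod mempool element: %zu bytes\n", alloc);
	if (alloc > page_size)	/* mirrors the WARN_ON_ONCE above */
		fprintf(stderr, "element would span pages\n");
	return 0;
}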
static void nvme_free_tagset(struct nvme_dev *dev)
{
if (dev->tagset.tags)
- blk_mq_free_tag_set(&dev->tagset);
+ nvme_remove_io_tag_set(&dev->ctrl);
dev->ctrl.tagset = NULL;
}
+/* pairs with nvme_pci_alloc_dev */
static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
struct nvme_dev *dev = to_nvme_dev(ctrl);
- nvme_dbbuf_dma_free(dev);
nvme_free_tagset(dev);
- if (dev->ctrl.admin_q)
- blk_put_queue(dev->ctrl.admin_q);
- free_opal_dev(dev->ctrl.opal_dev);
- mempool_destroy(dev->iod_mempool);
put_device(dev->dev);
kfree(dev->queues);
kfree(dev);
}
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
-{
- /*
- * Set state to deleting now to avoid blocking nvme_wait_reset(), which
- * may be holding this pci_dev's device lock.
- */
- nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
- nvme_get_ctrl(&dev->ctrl);
- nvme_dev_disable(dev, false);
- nvme_kill_queues(&dev->ctrl);
- if (!queue_work(nvme_wq, &dev->remove_work))
- nvme_put_ctrl(&dev->ctrl);
-}
-
static void nvme_reset_work(struct work_struct *work)
{
struct nvme_dev *dev =
@@ -2808,8 +2739,7 @@ static void nvme_reset_work(struct work_struct *work)
if (dev->ctrl.state != NVME_CTRL_RESETTING) {
dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
dev->ctrl.state);
- result = -ENODEV;
- goto out;
+ return;
}
/*
@@ -2824,34 +2754,7 @@ static void nvme_reset_work(struct work_struct *work)
result = nvme_pci_enable(dev);
if (result)
goto out_unlock;
-
- result = nvme_pci_configure_admin_queue(dev);
- if (result)
- goto out_unlock;
-
- if (!dev->ctrl.admin_q) {
- result = nvme_pci_alloc_admin_tag_set(dev);
- if (result)
- goto out_unlock;
- } else {
- nvme_start_admin_queue(&dev->ctrl);
- }
-
- dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1);
-
- /*
- * Limit the max command size to prevent iod->sg allocations going
- * over a single page.
- */
- dev->ctrl.max_hw_sectors = min_t(u32,
- NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9);
- dev->ctrl.max_segments = NVME_MAX_SEGS;
-
- /*
- * Don't limit the IOMMU merged segment size.
- */
- dma_set_max_seg_size(dev->dev, 0xffffffff);
-
+ nvme_unquiesce_admin_queue(&dev->ctrl);
mutex_unlock(&dev->shutdown_lock);
/*
@@ -2865,62 +2768,37 @@ static void nvme_reset_work(struct work_struct *work)
goto out;
}
- /*
- * We do not support an SGL for metadata (yet), so we are limited to a
- * single integrity segment for the separate metadata pointer.
- */
- dev->ctrl.max_integrity_segments = 1;
-
- result = nvme_init_ctrl_finish(&dev->ctrl);
+ result = nvme_init_ctrl_finish(&dev->ctrl, was_suspend);
if (result)
goto out;
- if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
- if (!dev->ctrl.opal_dev)
- dev->ctrl.opal_dev =
- init_opal_dev(&dev->ctrl, &nvme_sec_submit);
- else if (was_suspend)
- opal_unlock_from_suspend(dev->ctrl.opal_dev);
- } else {
- free_opal_dev(dev->ctrl.opal_dev);
- dev->ctrl.opal_dev = NULL;
- }
-
- if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
- result = nvme_dbbuf_dma_alloc(dev);
- if (result)
- dev_warn(dev->dev,
- "unable to allocate dma for dbbuf\n");
- }
+ nvme_dbbuf_dma_alloc(dev);
- if (dev->ctrl.hmpre) {
- result = nvme_setup_host_mem(dev);
- if (result < 0)
- goto out;
- }
+ result = nvme_setup_host_mem(dev);
+ if (result < 0)
+ goto out;
result = nvme_setup_io_queues(dev);
if (result)
goto out;
/*
- * Keep the controller around but remove all namespaces if we don't have
- * any working I/O queue.
+ * Freeze and update the number of I/O queues as those might have
+ * changed. If there are no I/O queues left after this reset, keep the
+ * controller around but remove all namespaces.
*/
- if (dev->online_queues < 2) {
- dev_warn(dev->ctrl.device, "IO queues not created\n");
- nvme_kill_queues(&dev->ctrl);
- nvme_remove_namespaces(&dev->ctrl);
- nvme_free_tagset(dev);
- } else {
- nvme_start_queues(&dev->ctrl);
+ if (dev->online_queues > 1) {
+ nvme_unquiesce_io_queues(&dev->ctrl);
nvme_wait_freeze(&dev->ctrl);
- if (!dev->ctrl.tagset)
- nvme_pci_alloc_tag_set(dev);
- else
- nvme_pci_update_nr_queues(dev);
+ nvme_pci_update_nr_queues(dev);
nvme_dbbuf_set(dev);
nvme_unfreeze(&dev->ctrl);
+ } else {
+ dev_warn(dev->ctrl.device, "IO queues lost\n");
+ nvme_mark_namespaces_dead(&dev->ctrl);
+ nvme_unquiesce_io_queues(&dev->ctrl);
+ nvme_remove_namespaces(&dev->ctrl);
+ nvme_free_tagset(dev);
}
/*
@@ -2934,30 +2812,22 @@ static void nvme_reset_work(struct work_struct *work)
goto out;
}
- if (!dev->attrs_added && !sysfs_create_group(&dev->ctrl.device->kobj,
- &nvme_pci_attr_group))
- dev->attrs_added = true;
-
nvme_start_ctrl(&dev->ctrl);
return;
out_unlock:
mutex_unlock(&dev->shutdown_lock);
out:
- if (result)
- dev_warn(dev->ctrl.device,
- "Removing after probe failure status: %d\n", result);
- nvme_remove_dead_ctrl(dev);
-}
-
-static void nvme_remove_dead_ctrl_work(struct work_struct *work)
-{
- struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
- struct pci_dev *pdev = to_pci_dev(dev->dev);
-
- if (pci_get_drvdata(pdev))
- device_release_driver(&pdev->dev);
- nvme_put_ctrl(&dev->ctrl);
+ /*
+ * Set state to deleting now to avoid blocking nvme_wait_reset(), which
+ * may be holding this pci_dev's device lock.
+ */
+ dev_warn(dev->ctrl.device, "Disabling device after reset failure: %d\n",
+ result);
+ nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
+ nvme_dev_disable(dev, true);
+ nvme_mark_namespaces_dead(&dev->ctrl);
+ nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
}
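The failure path above replaces the old remove_work with guarded state transitions: DELETING first so nvme_wait_reset() cannot block, then DEAD once the device is disabled. A simplified user-space model of compare-and-swap state changes; the kernel's nvme state machine additionally validates each transition against a table of legal moves, which this sketch omits:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum ctrl_state { LIVE, RESETTING, CONNECTING, DELETING, DEAD };

static _Atomic enum ctrl_state state = LIVE;

/* only succeed when moving from the expected source state */
static bool change_state(enum ctrl_state from, enum ctrl_state to)
{
	return atomic_compare_exchange_strong(&state, &from, to);
}

int main(void)
{
	printf("LIVE -> RESETTING: %d\n", change_state(LIVE, RESETTING));
	printf("LIVE -> DELETING:  %d\n", change_state(LIVE, DELETING));	/* fails */
	printf("RESETTING -> DELETING: %d\n", change_state(RESETTING, DELETING));
	return 0;
}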
static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
@@ -3010,6 +2880,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.name = "pcie",
.module = THIS_MODULE,
.flags = NVME_F_METADATA_SUPPORTED,
+ .dev_attr_groups = nvme_pci_dev_attr_groups,
.reg_read32 = nvme_pci_reg_read32,
.reg_write32 = nvme_pci_reg_write32,
.reg_read64 = nvme_pci_reg_read64,
@@ -3079,29 +2950,22 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
return 0;
}
-static void nvme_async_probe(void *data, async_cookie_t cookie)
+static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
+ const struct pci_device_id *id)
{
- struct nvme_dev *dev = data;
-
- flush_work(&dev->ctrl.reset_work);
- flush_work(&dev->ctrl.scan_work);
- nvme_put_ctrl(&dev->ctrl);
-}
-
-static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
-{
- int node, result = -ENOMEM;
- struct nvme_dev *dev;
unsigned long quirks = id->driver_data;
- size_t alloc_size;
+ int node = dev_to_node(&pdev->dev);
+ struct nvme_dev *dev;
+ int ret = -ENOMEM;
- node = dev_to_node(&pdev->dev);
if (node == NUMA_NO_NODE)
set_dev_node(&pdev->dev, first_memory_node);
dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
if (!dev)
- return -ENOMEM;
+ return NULL;
+ INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
+ mutex_init(&dev->shutdown_lock);
dev->nr_write_queues = write_queues;
dev->nr_poll_queues = poll_queues;
@@ -3109,25 +2973,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
dev->queues = kcalloc_node(dev->nr_allocated_queues,
sizeof(struct nvme_queue), GFP_KERNEL, node);
if (!dev->queues)
- goto free;
+ goto out_free_dev;
dev->dev = get_device(&pdev->dev);
- pci_set_drvdata(pdev, dev);
-
- result = nvme_dev_map(dev);
- if (result)
- goto put_pci;
-
- INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
- INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
- mutex_init(&dev->shutdown_lock);
-
- result = nvme_setup_prp_pools(dev);
- if (result)
- goto unmap;
quirks |= check_vendor_combination_bug(pdev);
-
if (!noacpi && acpi_storage_d3(&pdev->dev)) {
/*
* Some systems use a bios work around to ask for D3 on
@@ -3137,46 +2987,131 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
"platform quirk: setting simple suspend\n");
quirks |= NVME_QUIRK_SIMPLE_SUSPEND;
}
+ ret = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
+ quirks);
+ if (ret)
+ goto out_put_device;
+
+ dma_set_min_align_mask(&pdev->dev, NVME_CTRL_PAGE_SIZE - 1);
+ dma_set_max_seg_size(&pdev->dev, 0xffffffff);
/*
- * Double check that our mempool alloc size will cover the biggest
- * command we support.
+ * Limit the max command size to prevent iod->sg allocations going
+ * over a single page.
*/
- alloc_size = nvme_pci_iod_alloc_size();
- WARN_ON_ONCE(alloc_size > PAGE_SIZE);
+ dev->ctrl.max_hw_sectors = min_t(u32,
+ NVME_MAX_KB_SZ << 1, dma_max_mapping_size(&pdev->dev) >> 9);
+ dev->ctrl.max_segments = NVME_MAX_SEGS;
- dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
- mempool_kfree,
- (void *) alloc_size,
- GFP_KERNEL, node);
- if (!dev->iod_mempool) {
- result = -ENOMEM;
- goto release_pools;
- }
+ /*
+ * There is no support for SGLs for metadata (yet), so we are limited to
+ * a single integrity segment for the separate metadata pointer.
+ */
+ dev->ctrl.max_integrity_segments = 1;
+ return dev;
+
+out_put_device:
+ put_device(dev->dev);
+ kfree(dev->queues);
+out_free_dev:
+ kfree(dev);
+ return ERR_PTR(ret);
+}
- result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
- quirks);
+static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ struct nvme_dev *dev;
+ int result = -ENOMEM;
+
+ dev = nvme_pci_alloc_dev(pdev, id);
+ if (!dev)
+ return -ENOMEM;
+
+ result = nvme_dev_map(dev);
if (result)
- goto release_mempool;
+ goto out_uninit_ctrl;
+
+ result = nvme_setup_prp_pools(dev);
+ if (result)
+ goto out_dev_unmap;
+
+ result = nvme_pci_alloc_iod_mempool(dev);
+ if (result)
+ goto out_release_prp_pools;
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
- nvme_reset_ctrl(&dev->ctrl);
- async_schedule(nvme_async_probe, dev);
+ result = nvme_pci_enable(dev);
+ if (result)
+ goto out_release_iod_mempool;
+
+ result = nvme_alloc_admin_tag_set(&dev->ctrl, &dev->admin_tagset,
+ &nvme_mq_admin_ops, sizeof(struct nvme_iod));
+ if (result)
+ goto out_disable;
+
+ /*
+ * Mark the controller as connecting before sending admin commands to
+ * allow the timeout handler to do the right thing.
+ */
+ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
+ dev_warn(dev->ctrl.device,
+ "failed to mark controller CONNECTING\n");
+ result = -EBUSY;
+ goto out_disable;
+ }
+
+ result = nvme_init_ctrl_finish(&dev->ctrl, false);
+ if (result)
+ goto out_disable;
+
+ nvme_dbbuf_dma_alloc(dev);
+
+ result = nvme_setup_host_mem(dev);
+ if (result < 0)
+ goto out_disable;
+
+ result = nvme_setup_io_queues(dev);
+ if (result)
+ goto out_disable;
+ if (dev->online_queues > 1) {
+ nvme_alloc_io_tag_set(&dev->ctrl, &dev->tagset, &nvme_mq_ops,
+ nvme_pci_nr_maps(dev), sizeof(struct nvme_iod));
+ nvme_dbbuf_set(dev);
+ }
+
+ if (!dev->ctrl.tagset)
+ dev_warn(dev->ctrl.device, "IO queues not created\n");
+
+ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
+ dev_warn(dev->ctrl.device,
+ "failed to mark controller live state\n");
+ result = -ENODEV;
+ goto out_disable;
+ }
+
+ pci_set_drvdata(pdev, dev);
+
+ nvme_start_ctrl(&dev->ctrl);
+ nvme_put_ctrl(&dev->ctrl);
return 0;
- release_mempool:
+out_disable:
+ nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
+ nvme_dev_disable(dev, true);
+ nvme_free_host_mem(dev);
+ nvme_dev_remove_admin(dev);
+ nvme_dbbuf_dma_free(dev);
+ nvme_free_queues(dev, 0);
+out_release_iod_mempool:
mempool_destroy(dev->iod_mempool);
- release_pools:
+out_release_prp_pools:
nvme_release_prp_pools(dev);
- unmap:
+out_dev_unmap:
nvme_dev_unmap(dev);
- put_pci:
- put_device(dev->dev);
- free:
- kfree(dev->queues);
- kfree(dev);
+out_uninit_ctrl:
+ nvme_uninit_ctrl(&dev->ctrl);
return result;
}
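The rewritten nvme_probe() unwinds in strict reverse order of setup through its out_* labels, so every failure exits with exactly the resources acquired so far released. A compact, runnable sketch of that unwind-ladder idiom with stand-in step names, where one step is forced to fail to show the teardown order:

#include <stdio.h>
#include <string.h>

static int step(const char *name)
{
	printf("setup %s\n", name);
	return strcmp(name, "enable") == 0 ? -1 : 0;	/* force one failure */
}

static void undo(const char *name) { printf("undo  %s\n", name); }

static int probe(void)
{
	int ret;

	if ((ret = step("map")))
		goto out;
	if ((ret = step("pools")))
		goto out_unmap;
	if ((ret = step("enable")))
		goto out_release_pools;
	return 0;

out_release_pools:
	undo("pools");
out_unmap:
	undo("map");
out:
	return ret;
}

int main(void)
{
	return probe() ? 1 : 0;
}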
@@ -3208,13 +3143,6 @@ static void nvme_shutdown(struct pci_dev *pdev)
nvme_disable_prepare_reset(dev, true);
}
-static void nvme_remove_attrs(struct nvme_dev *dev)
-{
- if (dev->attrs_added)
- sysfs_remove_group(&dev->ctrl.device->kobj,
- &nvme_pci_attr_group);
-}
-
/*
* The driver's remove may be called on a device in a partially initialized
* state. This function must not have any dependencies on the device state in
@@ -3236,10 +3164,11 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_stop_ctrl(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl);
nvme_dev_disable(dev, true);
- nvme_remove_attrs(dev);
nvme_free_host_mem(dev);
nvme_dev_remove_admin(dev);
+ nvme_dbbuf_dma_free(dev);
nvme_free_queues(dev, 0);
+ mempool_destroy(dev->iod_mempool);
nvme_release_prp_pools(dev);
nvme_dev_unmap(dev);
nvme_uninit_ctrl(&dev->ctrl);
@@ -3576,11 +3505,12 @@ static struct pci_driver nvme_driver = {
.probe = nvme_probe,
.remove = nvme_remove,
.shutdown = nvme_shutdown,
-#ifdef CONFIG_PM_SLEEP
.driver = {
- .pm = &nvme_dev_pm_ops,
- },
+ .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+#ifdef CONFIG_PM_SLEEP
+ .pm = &nvme_dev_pm_ops,
#endif
+ },
.sriov_configure = pci_sriov_configure_simple,
.err_handler = &nvme_err_handler,
};