From 4d51abf9bcca01efa3afbe94d50cc2cda8095da8 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 15 Jun 2014 13:37:33 -0700 Subject: block: Use dma_zalloc_coherent Use the zeroing function instead of dma_alloc_coherent & memset(,0,) Signed-off-by: Joe Perches Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index e2bb8afbeae5..baee59530d6e 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1275,11 +1275,10 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, if (!nvmeq) return NULL; - nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth), - &nvmeq->cq_dma_addr, GFP_KERNEL); + nvmeq->cqes = dma_zalloc_coherent(dmadev, CQ_SIZE(depth), + &nvmeq->cq_dma_addr, GFP_KERNEL); if (!nvmeq->cqes) goto free_nvmeq; - memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth)); nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth), &nvmeq->sq_dma_addr, GFP_KERNEL); -- cgit From 6fccf9383b280d463a7dfe1e0d048aff8df8a25e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 18 Jun 2014 13:58:57 -0600 Subject: NVMe: Async event request Submits NVMe asynchronous event requests, one event up to the controller maximum or number of possible different event types (8), whichever is smaller. Events successfully returned by the controller are logged. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index baee59530d6e..42a62bbf4a11 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -207,6 +207,7 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, #define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) #define CMD_CTX_ABORT (0x318 + CMD_CTX_BASE) +#define CMD_CTX_ASYNC (0x31C + CMD_CTX_BASE) static void special_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) @@ -229,6 +230,17 @@ static void special_completion(struct nvme_queue *nvmeq, void *ctx, cqe->command_id, le16_to_cpup(&cqe->sq_id)); return; } + if (ctx == CMD_CTX_ASYNC) { + u32 result = le32_to_cpup(&cqe->result); + u16 status = le16_to_cpup(&cqe->status) >> 1; + + if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) + ++nvmeq->dev->event_limit; + if (status == NVME_SC_SUCCESS) + dev_warn(nvmeq->q_dmadev, + "async event result %08x\n", result); + return; + } dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); } @@ -1161,6 +1173,8 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) continue; if (info[cmdid].ctx == CMD_CTX_CANCELLED) continue; + if (timeout && info[cmdid].ctx == CMD_CTX_ASYNC) + continue; if (timeout && nvmeq->dev->initialized) { nvme_abort_cmd(cmdid, nvmeq); continue; @@ -1842,6 +1856,27 @@ static void nvme_resubmit_bios(struct nvme_queue *nvmeq) } } +static int nvme_submit_async_req(struct nvme_queue *nvmeq) +{ + struct nvme_command *c; + int cmdid; + + cmdid = alloc_cmdid(nvmeq, CMD_CTX_ASYNC, special_completion, 0); + if (cmdid < 0) + return cmdid; + + c = &nvmeq->sq_cmds[nvmeq->sq_tail]; + memset(c, 0, sizeof(*c)); + c->common.opcode = nvme_admin_async_event; + c->common.command_id = cmdid; + + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); + + return 0; +} + static int nvme_kthread(void *data) { struct nvme_dev *dev, *next; @@ -1875,6 +1910,12 @@ static int nvme_kthread(void *data) nvme_cancel_ios(nvmeq, true); nvme_resubmit_bios(nvmeq); nvme_resubmit_iods(nvmeq); + + while ((i == 0) && (dev->event_limit > 0)) { + if (nvme_submit_async_req(nvmeq)) + break; + dev->event_limit--; + } unlock: spin_unlock_irq(&nvmeq->q_lock); } @@ -2244,6 +2285,7 @@ static int nvme_dev_add(struct nvme_dev *dev) dev->oncs = le16_to_cpup(&ctrl->oncs); dev->abort_limit = ctrl->acl + 1; dev->vwc = ctrl->vwc; + dev->event_limit = min(ctrl->aerl + 1, 8); memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); -- cgit From 1d0906246095184d1624c643c2088152d330c40a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 23 Jun 2014 11:34:01 -0600 Subject: NVMe: Mismatched host/device page size support Adds support for devices with max page size smaller than the host's. In the case we encounter such a host/device combination, the driver will split a page into as many PRP entries as necessary for the device's page size capabilities. If the device's reported minimum page size is greater than the host's, the driver will not attempt to enable the device and return an error instead. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 59 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 19 deletions(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 42a62bbf4a11..e60bb0fec7e3 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -373,17 +373,17 @@ static __le64 **iod_list(struct nvme_iod *iod) * as it only leads to a small amount of wasted memory for the lifetime of * the I/O. */ -static int nvme_npages(unsigned size) +static int nvme_npages(unsigned size, struct nvme_dev *dev) { - unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE); - return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); + unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); + return DIV_ROUND_UP(8 * nprps, dev->page_size - 8); } static struct nvme_iod * -nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) +nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp) { struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + - sizeof(__le64 *) * nvme_npages(nbytes) + + sizeof(__le64 *) * nvme_npages(nbytes, dev) + sizeof(struct scatterlist) * nseg, gfp); if (iod) { @@ -400,7 +400,7 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) { - const int last_prp = PAGE_SIZE / 8 - 1; + const int last_prp = dev->page_size / 8 - 1; int i; __le64 **list = iod_list(iod); dma_addr_t prp_dma = iod->first_dma; @@ -491,26 +491,27 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, __le64 **list = iod_list(iod); dma_addr_t prp_dma; int nprps, i; + u32 page_size = dev->page_size; - length -= (PAGE_SIZE - offset); + length -= (page_size - offset); if (length <= 0) return total_len; - dma_len -= (PAGE_SIZE - offset); + dma_len -= (page_size - offset); if (dma_len) { - dma_addr += (PAGE_SIZE - offset); + dma_addr += (page_size - offset); } else { sg = sg_next(sg); dma_addr = sg_dma_address(sg); dma_len = sg_dma_len(sg); } - if (length <= PAGE_SIZE) { + if (length <= page_size) { iod->first_dma = dma_addr; return total_len; } - nprps = DIV_ROUND_UP(length, PAGE_SIZE); + nprps = DIV_ROUND_UP(length, page_size); if (nprps <= (256 / 8)) { pool = dev->prp_small_pool; iod->npages = 0; @@ -523,13 +524,13 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, if (!prp_list) { iod->first_dma = dma_addr; iod->npages = -1; - return (total_len - length) + PAGE_SIZE; + return (total_len - length) + page_size; } list[0] = prp_list; iod->first_dma = prp_dma; i = 0; for (;;) { - if (i == PAGE_SIZE / 8) { + if (i == page_size >> 3) { __le64 *old_prp_list = prp_list; prp_list = dma_pool_alloc(pool, gfp, &prp_dma); if (!prp_list) @@ -540,9 +541,9 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, i = 1; } prp_list[i++] = cpu_to_le64(dma_addr); - dma_len -= PAGE_SIZE; - dma_addr += PAGE_SIZE; - length -= PAGE_SIZE; + dma_len -= page_size; + dma_addr += page_size; + length -= page_size; if (length <= 0) break; if (dma_len > 0) @@ -749,7 +750,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, if ((bio->bi_rw & REQ_FLUSH) && psegs) return nvme_split_flush_data(nvmeq, bio); - iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC); + iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, ns->dev, GFP_ATOMIC); if (!iod) return -ENOMEM; @@ -1463,6 +1464,24 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) u32 aqa; u64 cap = readq(&dev->bar->cap); struct nvme_queue *nvmeq; + unsigned page_shift = PAGE_SHIFT; + unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12; + unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12; + + if (page_shift < dev_page_min) { + dev_err(&dev->pci_dev->dev, + "Minimum device page size (%u) too large for " + "host (%u)\n", 1 << dev_page_min, + 1 << page_shift); + return -ENODEV; + } + if (page_shift > dev_page_max) { + dev_info(&dev->pci_dev->dev, + "Device maximum page size (%u) smaller than " + "host (%u); enabling work-around\n", + 1 << dev_page_max, 1 << page_shift); + page_shift = dev_page_max; + } result = nvme_disable_ctrl(dev, cap); if (result < 0) @@ -1478,8 +1497,10 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) aqa = nvmeq->q_depth - 1; aqa |= aqa << 16; + dev->page_size = 1 << page_shift; + dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; - dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; + dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; @@ -1529,7 +1550,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, } err = -ENOMEM; - iod = nvme_alloc_iod(count, length, GFP_KERNEL); + iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL); if (!iod) goto put_pages; -- cgit From 01079522f94b3a1c87cfa375be3917d2da6e6363 Mon Sep 17 00:00:00 2001 From: Dan McLeran Date: Mon, 23 Jun 2014 08:24:36 -0600 Subject: NVMe: Change nvme_enable_ctrl to set EN and manage CC thru ctrl_config. Change the behavior of nvme_enable_ctrl to set EN. Clear CC.SH for both nvme_enable_ctrl and nvme_disable_ctrl. Remove reading of the CC register and manage the state in dev->ctrl_config. Signed-off-by: Dan McLeran [removed an unwanted write to CC] Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index e60bb0fec7e3..0a98de813a12 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1422,25 +1422,30 @@ static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) */ static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap) { - u32 cc = readl(&dev->bar->cc); + dev->ctrl_config &= ~NVME_CC_SHN_MASK; + dev->ctrl_config &= ~NVME_CC_ENABLE; + writel(dev->ctrl_config, &dev->bar->cc); - if (cc & NVME_CC_ENABLE) - writel(cc & ~NVME_CC_ENABLE, &dev->bar->cc); return nvme_wait_ready(dev, cap, false); } static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap) { + dev->ctrl_config &= ~NVME_CC_SHN_MASK; + dev->ctrl_config |= NVME_CC_ENABLE; + writel(dev->ctrl_config, &dev->bar->cc); + return nvme_wait_ready(dev, cap, true); } static int nvme_shutdown_ctrl(struct nvme_dev *dev) { unsigned long timeout; - u32 cc; - cc = (readl(&dev->bar->cc) & ~NVME_CC_SHN_MASK) | NVME_CC_SHN_NORMAL; - writel(cc, &dev->bar->cc); + dev->ctrl_config &= ~NVME_CC_SHN_MASK; + dev->ctrl_config |= NVME_CC_SHN_NORMAL; + + writel(dev->ctrl_config, &dev->bar->cc); timeout = 2 * HZ + jiffies; while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) != @@ -1499,7 +1504,7 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) dev->page_size = 1 << page_shift; - dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; + dev->ctrl_config = NVME_CC_CSS_NVM; dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; @@ -1507,7 +1512,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) writel(aqa, &dev->bar->aqa); writeq(nvmeq->sq_dma_addr, &dev->bar->asq); writeq(nvmeq->cq_dma_addr, &dev->bar->acq); - writel(dev->ctrl_config, &dev->bar->cc); result = nvme_enable_ctrl(dev, cap); if (result) -- cgit From badc34d4154de81b965629a5c3110e058eb8ca2b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 23 Jun 2014 14:25:35 -0600 Subject: NVMe: Handling devices incapable of I/O This is a minor refactor for handling devices that are incapable of IO. The driver previously used special error codes to know that IO queues are unavailable, but we have an online queue count now. This also fixes an issue where the driver successfully sets the queue count, but either is unable to allocate an IO queue or the device can't create one for some reason. If the driver can successfully enable the device and get responses to admin commands, the driver will bring up a character device for managment but not create block devices. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 0a98de813a12..f3103aa91d2e 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2172,7 +2172,7 @@ static int set_queue_count(struct nvme_dev *dev, int count) if (status > 0) { dev_err(&dev->pci_dev->dev, "Could not set queue count (%d)\n", status); - return -EBUSY; + return 0; } return min(result & 0xffff, result >> 16) + 1; } @@ -2214,7 +2214,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) nr_io_queues = num_possible_cpus(); result = set_queue_count(dev, nr_io_queues); - if (result < 0) + if (result <= 0) return result; if (result < nr_io_queues) nr_io_queues = result; @@ -2740,7 +2740,7 @@ static int nvme_dev_start(struct nvme_dev *dev) } result = nvme_setup_io_queues(dev); - if (result && result != -EBUSY) + if (result) goto disable; return result; @@ -2777,9 +2777,9 @@ static int nvme_dev_resume(struct nvme_dev *dev) int ret; ret = nvme_dev_start(dev); - if (ret && ret != -EBUSY) + if (ret) return ret; - if (ret == -EBUSY) { + if (dev->online_queues < 2) { spin_lock(&dev_list_lock); dev->reset_workfn = nvme_remove_disks; queue_work(nvme_workq, &dev->reset_work); @@ -2852,17 +2852,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) kref_init(&dev->kref); result = nvme_dev_start(dev); - if (result) { - if (result == -EBUSY) - goto create_cdev; + if (result) goto release_pools; - } - result = nvme_dev_add(dev); + if (dev->online_queues > 1) + result = nvme_dev_add(dev); if (result) goto shutdown; - create_cdev: scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance); dev->miscdev.minor = MISC_DYNAMIC_MINOR; dev->miscdev.parent = &pdev->dev; -- cgit From c81f49758a677e878df4e6a33f29d8ce401bf66d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 23 Jun 2014 15:24:53 -0600 Subject: NVMe: Use pci_stop_and_remove_bus_device_locked() Race conditions are theoretically possible between the NVMe PCI device removal and the generic PCI bus rescan and device removal that can be triggered via sysfs. To avoid those race conditions make the NVMe code use pci_stop_and_remove_bus_device_locked(). Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index f3103aa91d2e..48a712734b55 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2759,7 +2759,7 @@ static int nvme_remove_dead_ctrl(void *arg) struct pci_dev *pdev = dev->pci_dev; if (pci_get_drvdata(pdev)) - pci_stop_and_remove_bus_device(pdev); + pci_stop_and_remove_bus_device_locked(pdev); kref_put(&dev->kref, nvme_free_dev); return 0; } -- cgit From a67394790a9c52657273212647e350e36a292183 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 23 Jun 2014 16:03:21 -0600 Subject: NVMe: Whitespace fixes Fixing tabs inadvertently converted to spaces. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 48a712734b55..c5f379f08f50 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2892,12 +2892,12 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) { - struct nvme_dev *dev = pci_get_drvdata(pdev); + struct nvme_dev *dev = pci_get_drvdata(pdev); - if (prepare) - nvme_dev_shutdown(dev); - else - nvme_dev_resume(dev); + if (prepare) + nvme_dev_shutdown(dev); + else + nvme_dev_resume(dev); } static void nvme_shutdown(struct pci_dev *pdev) -- cgit From 7c1b24503873e6cfc1d31f2e55c35358fd438351 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 25 Jun 2014 11:18:12 -0600 Subject: NVMe: Skip orderly shutdown on failed devices Rather than skipping shutdown only for devices that have been removed, skip the orderly shutdown on failed devices to avoid the long timeout handling that inevitably happens when deleting queues on such a device. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index c5f379f08f50..ac3694083e89 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2567,11 +2567,14 @@ static void nvme_dev_list_remove(struct nvme_dev *dev) static void nvme_dev_shutdown(struct nvme_dev *dev) { int i; + u32 csts = -1; dev->initialized = 0; nvme_dev_list_remove(dev); - if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) { + if (dev->bar) + csts = readl(&dev->bar->csts); + if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { for (i = dev->queue_count - 1; i >= 0; i--) { struct nvme_queue *nvmeq = raw_nvmeq(dev, i); nvme_suspend_queue(nvmeq); -- cgit From 2484f40780b97df1b5eb09e78ce4efaa78b21875 Mon Sep 17 00:00:00 2001 From: Dan McLeran Date: Tue, 1 Jul 2014 09:33:32 -0600 Subject: NVMe: Add shutdown timeout as module parameter. The current implementation hard-codes the shutdown timeout to 2 seconds. Some devices take longer than this to complete a normal shutdown. Changing the shutdown timeout to a module parameter with a default timeout of 5 seconds. Signed-off-by: Dan McLeran Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index ac3694083e89..3ada636356e8 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -48,6 +48,7 @@ #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define ADMIN_TIMEOUT (admin_timeout * HZ) +#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) #define IOD_TIMEOUT (retry_time * HZ) static unsigned char admin_timeout = 60; @@ -62,6 +63,10 @@ static unsigned char retry_time = 30; module_param(retry_time, byte, 0644); MODULE_PARM_DESC(retry_time, "time in seconds to retry failed I/O"); +static unsigned char shutdown_timeout = 5; +module_param(shutdown_timeout, byte, 0644); +MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); + static int nvme_major; module_param(nvme_major, int, 0); @@ -1447,7 +1452,7 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev) writel(dev->ctrl_config, &dev->bar->cc); - timeout = 2 * HZ + jiffies; + timeout = SHUTDOWN_TIMEOUT + jiffies; while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) != NVME_CSTS_SHST_CMPLT) { msleep(100); -- cgit From f435c2825b4cc6453b9a1f91418cabbd6ba08cc0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 7 Jul 2014 09:14:42 -0600 Subject: NVMe: Call nvme_free_queue directly Rather than relying on call_rcu, this patch directly frees the nvme_queue's memory after ensuring no readers exist. Some arch specific dma_free_coherent implementations may not be called from a call_rcu's soft interrupt context, hence the change. Signed-off-by: Keith Busch Reported-by: Matthew Minter Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 3ada636356e8..d3a025fc1268 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -95,7 +95,7 @@ struct async_cmd_info { * commands and one for I/O commands). */ struct nvme_queue { - struct rcu_head r_head; + struct llist_node node; struct device *q_dmadev; struct nvme_dev *dev; char irqname[24]; /* nvme4294967295-65535\0 */ @@ -1192,10 +1192,8 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) } } -static void nvme_free_queue(struct rcu_head *r) +static void nvme_free_queue(struct nvme_queue *nvmeq) { - struct nvme_queue *nvmeq = container_of(r, struct nvme_queue, r_head); - spin_lock_irq(&nvmeq->q_lock); while (bio_list_peek(&nvmeq->sq_cong)) { struct bio *bio = bio_list_pop(&nvmeq->sq_cong); @@ -1225,14 +1223,21 @@ static void nvme_free_queue(struct rcu_head *r) static void nvme_free_queues(struct nvme_dev *dev, int lowest) { + LLIST_HEAD(q_list); + struct nvme_queue *nvmeq, *next; + struct llist_node *entry; int i; for (i = dev->queue_count - 1; i >= lowest; i--) { - struct nvme_queue *nvmeq = raw_nvmeq(dev, i); + nvmeq = raw_nvmeq(dev, i); rcu_assign_pointer(dev->queues[i], NULL); - call_rcu(&nvmeq->r_head, nvme_free_queue); + llist_add(&nvmeq->node, &q_list); dev->queue_count--; } + synchronize_rcu(); + entry = llist_del_all(&q_list); + llist_for_each_entry_safe(nvmeq, next, entry, node) + nvme_free_queue(nvmeq); } /** @@ -2929,7 +2934,6 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dev_remove(dev); nvme_dev_shutdown(dev); nvme_free_queues(dev, 0); - rcu_barrier(); nvme_release_instance(dev); nvme_release_prp_pools(dev); kref_put(&dev->kref, nvme_free_dev); -- cgit From 302c6727e5eb4d0be0e02d077b65feb3e73ea254 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 18 Jul 2014 11:40:20 -0600 Subject: NVMe: Fix filesystem sync deadlock on removal This changes the order of deleting the gendisks so it happens after the nvme IO queues are freed. If a device is removed while a filesystem has associated dirty data, the removal will wait on these to complete before proceeding from del_gendisk, which could have caused deadlock before. The implication of this is that an orderly removal of a responsive device won't necessarily wait for dirty data to be written, but we are not guaranteed the device is even going to respond at this point either. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index d3a025fc1268..48a1be4ab24c 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2781,8 +2781,8 @@ static void nvme_remove_disks(struct work_struct *ws) { struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); - nvme_dev_remove(dev); nvme_free_queues(dev, 1); + nvme_dev_remove(dev); } static int nvme_dev_resume(struct nvme_dev *dev) @@ -2931,9 +2931,9 @@ static void nvme_remove(struct pci_dev *pdev) flush_work(&dev->reset_work); flush_work(&dev->cpu_work); misc_deregister(&dev->miscdev); - nvme_dev_remove(dev); nvme_dev_shutdown(dev); nvme_free_queues(dev, 0); + nvme_dev_remove(dev); nvme_release_instance(dev); nvme_release_prp_pools(dev); kref_put(&dev->kref, nvme_free_dev); -- cgit From 5905535610fc6b11379a999ff45bfa39f0d605b6 Mon Sep 17 00:00:00 2001 From: Sam Bradshaw Date: Tue, 29 Jul 2014 13:31:36 -0700 Subject: NVMe: Correctly handle IOCTL_SUBMIT_IO when cpus > online queues nvme_submit_io_cmd() uses smp_processor_id() to pick an IO queue index. This patch fixes the case where there are more cpus from which the ioctl call can originate than online queues, which can happen when a device supports or was allocated fewer interrupt vectors than exist cpu cores. Thanks to Keith Busch for the implementation suggestion. Signed-off-by: Sam Bradshaw Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 48a1be4ab24c..5d44c9344522 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -984,8 +984,8 @@ int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd, u32 *result) { - return nvme_submit_sync_cmd(dev, smp_processor_id() + 1, cmd, result, - NVME_IO_TIMEOUT); + return nvme_submit_sync_cmd(dev, this_cpu_read(*dev->io_queue), cmd, + result, NVME_IO_TIMEOUT); } static int nvme_submit_admin_cmd_async(struct nvme_dev *dev, -- cgit From 062261be4e39e35bdf2fba16a4b2d8a432ae8281 Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Mon, 18 Aug 2014 17:41:01 +0300 Subject: nvme: Replace rcu_assign_pointer() with RCU_INIT_POINTER() The use of "rcu_assign_pointer()" is NULLing out the pointer. According to RCU_INIT_POINTER()'s block comment: "1. This use of RCU_INIT_POINTER() is NULLing out the pointer" it is better to use it instead of rcu_assign_pointer() because it has a smaller overhead. The following Coccinelle semantic patch was used: @@ @@ - rcu_assign_pointer + RCU_INIT_POINTER (..., NULL) Signed-off-by: Andreea-Cristina Bernat Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 5d44c9344522..c1e3c1a101b8 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1230,7 +1230,7 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) for (i = dev->queue_count - 1; i >= lowest; i--) { nvmeq = raw_nvmeq(dev, i); - rcu_assign_pointer(dev->queues[i], NULL); + RCU_INIT_POINTER(dev->queues[i], NULL); llist_add(&nvmeq->node, &q_list); dev->queue_count--; } -- cgit From a96d4f5c2da66a0c1537bfd8aaa868b69595476a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 19 Aug 2014 19:15:59 -0600 Subject: NVMe: Reference count pci device If an nvme device is removed but user space has an open reference, the nvme driver would have been holding an invalid reference to its pci device. You may get a general protection fault on x86 h/w when the driver uses that reference in dma_map_sg(), as is done in nvme_map_user_pages() from the IOCTL interface. This patch fixes the fault by taking a reference on the pci device and holding it even after device removal until all opens on the nvme device are closed. Signed-off-by: Keith Busch Reported-by: Nilesh Choudhury Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index c1e3c1a101b8..955b5699ff96 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2678,6 +2678,7 @@ static void nvme_free_dev(struct kref *kref) { struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); + pci_dev_put(dev->pci_dev); nvme_free_namespaces(dev); free_percpu(dev->io_queue); kfree(dev->queues); @@ -2853,11 +2854,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev->reset_workfn = nvme_reset_failed_dev; INIT_WORK(&dev->reset_work, nvme_reset_workfn); INIT_WORK(&dev->cpu_work, nvme_cpu_workfn); - dev->pci_dev = pdev; + dev->pci_dev = pci_dev_get(pdev); pci_set_drvdata(pdev, dev); result = nvme_set_instance(dev); if (result) - goto free; + goto put_pci; result = nvme_setup_prp_pools(dev); if (result) @@ -2895,6 +2896,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) nvme_release_prp_pools(dev); release: nvme_release_instance(dev); + put_pci: + pci_dev_put(dev->pci_dev); free: free_percpu(dev->io_queue); kfree(dev->queues); -- cgit From e179729a821c1b82375f5593533c027462e753b7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Aug 2014 13:55:38 -0600 Subject: NVMe: Remove duplicate compat SG_IO code We can return -ENOIOCTLCMD and the ioctl will be handled by fs/compat_ioctl.c instead. This removes a lot of duplicate code in the nvme driver. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 955b5699ff96..37f22b6bb009 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1807,11 +1807,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - struct nvme_ns *ns = bdev->bd_disk->private_data; - switch (cmd) { case SG_IO: - return nvme_sg_io32(ns, arg); + return -ENOIOCTLCMD; } return nvme_ioctl(bdev, mode, cmd, arg); } -- cgit From b4ff9c8ddb6f0cec99a53ab26a5aa2ed0162c472 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 29 Aug 2014 09:06:12 -0600 Subject: NVMe: Translate NVMe status to errno This returns a more appropriate error for the "capacity exceeded" status. In case other NVMe statuses have a better errno, this patch adds a convience function to translate an NVMe status code to an errno for IO commands, defaulting to the current -EIO. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 37f22b6bb009..3153692da288 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -450,6 +450,18 @@ static void nvme_end_io_acct(struct bio *bio, unsigned long start_time) } } +static int nvme_error_status(u16 status) +{ + switch (status & 0x7ff) { + case NVME_SC_SUCCESS: + return 0; + case NVME_SC_CAP_EXCEEDED: + return -ENOSPC; + default: + return -EIO; + } +} + static void bio_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { @@ -469,7 +481,7 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, wake_up(&nvmeq->sq_full); return; } - error = -EIO; + error = nvme_error_status(status); } if (iod->nents) { dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents, -- cgit From 7be50e93fbc281967589a04be5b1125539b0d0e2 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 10 Sep 2014 15:48:47 -0600 Subject: NVMe: Fix nvmeq waitqueue entry initialization We need to update the nvme queue's wait_queue_t entry during each initialization since the nvme_thread may be ended and restarted when the device is reset. If a device reset occurs during a large amount of buffered IO, it would take a lot longer to complete the outstanding requests due to the 1 second polling instead of waking up as completions occur. Fixes: b9afca3efb18a9b8392cb544a3e29e8b1168400c Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 3153692da288..16a886c2b73d 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1333,7 +1333,6 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->cq_head = 0; nvmeq->cq_phase = 1; init_waitqueue_head(&nvmeq->sq_full); - init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); bio_list_init(&nvmeq->sq_cong); INIT_LIST_HEAD(&nvmeq->iod_bio); nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; @@ -1373,6 +1372,8 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) struct nvme_dev *dev = nvmeq->dev; unsigned extra = nvme_queue_extra(nvmeq->q_depth); + spin_lock_irq(&nvmeq->q_lock); + init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); nvmeq->sq_tail = 0; nvmeq->cq_head = 0; nvmeq->cq_phase = 1; @@ -1382,6 +1383,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) nvme_cancel_ios(nvmeq, false); nvmeq->q_suspended = 0; dev->online_queues++; + spin_unlock_irq(&nvmeq->q_lock); } static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) @@ -1401,10 +1403,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) if (result < 0) goto release_sq; - spin_lock_irq(&nvmeq->q_lock); nvme_init_queue(nvmeq, qid); - spin_unlock_irq(&nvmeq->q_lock); - return result; release_sq: @@ -1543,9 +1542,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) if (result) return result; - spin_lock_irq(&nvmeq->q_lock); - nvme_init_queue(nvmeq, 0); - spin_unlock_irq(&nvmeq->q_lock); return result; } @@ -2762,6 +2758,7 @@ static int nvme_dev_start(struct nvme_dev *dev) result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; goto disable; } + nvme_init_queue(raw_nvmeq(dev, 0), 0); result = nvme_setup_io_queues(dev); if (result) -- cgit From 1b9dbf7fe02d85f70e8efc1c12c070206c0d5c5f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 10 Sep 2014 17:21:14 -0600 Subject: NVMe: Add revalidate_disk callback This adds a callback to revalidate the disk and change its block size and capacity if needed. Before, a user would have to remove + rescan an entire device if they changed the logical block size using an NVMe Format or other vendor specific command; now they can just run something that issues the BLKRRPART IOCTL, like # hdparm -z /dev/nvmeXnY This can also be used in response to the 1.2 Spec's Namespace Attribute Change asynchronous event. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 16a886c2b73d..93ee55ee3775 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1853,6 +1853,35 @@ static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo) return 0; } +static int nvme_revalidate_disk(struct gendisk *disk) +{ + struct nvme_ns *ns = disk->private_data; + struct nvme_dev *dev = ns->dev; + struct nvme_id_ns *id; + dma_addr_t dma_addr; + int lbaf; + + id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr, + GFP_KERNEL); + if (!id) { + dev_warn(&dev->pci_dev->dev, "%s: Memory alocation failure\n", + __func__); + return 0; + } + + if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) + goto free; + + lbaf = id->flbas & 0xf; + ns->lba_shift = id->lbaf[lbaf].ds; + + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); + set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + free: + dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr); + return 0; +} + static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, @@ -1860,6 +1889,7 @@ static const struct block_device_operations nvme_fops = { .open = nvme_open, .release = nvme_release, .getgeo = nvme_getgeo, + .revalidate_disk= nvme_revalidate_disk, }; static void nvme_resubmit_iods(struct nvme_queue *nvmeq) -- cgit From 7963e521811ed19396d47793cc77d87c643ab891 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 12 Sep 2014 16:07:20 -0600 Subject: NVMe: Passthrough IOCTL for IO commands The NVME_IOCTL_SUBMIT_IO only works for IO commands with block data transfers and isn't usable for other NVMe commands like flush, data set management, or any sort of vendor unique command. The NVME_IOCTL_ADMIN_CMD, however, can easily be modified to accept arbitrary IO commands in addition to arbitrary admin commands without breaking backward compatibility. This patch just adds a new IOCTL to distinguish if the driver should submit the command on an IO or Admin queue. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 93ee55ee3775..70be3a151eb7 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1732,10 +1732,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return status; } -static int nvme_user_admin_cmd(struct nvme_dev *dev, - struct nvme_admin_cmd __user *ucmd) +static int nvme_user_cmd(struct nvme_dev *dev, + struct nvme_passthru_cmd __user *ucmd, bool ioq) { - struct nvme_admin_cmd cmd; + struct nvme_passthru_cmd cmd; struct nvme_command c; int status, length; struct nvme_iod *uninitialized_var(iod); @@ -1774,6 +1774,9 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, ADMIN_TIMEOUT; if (length != cmd.data_len) status = -ENOMEM; + else if (ioq) + status = nvme_submit_sync_cmd(dev, this_cpu_read(*dev->io_queue), &c, + &cmd.result, timeout); else status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout); @@ -1799,7 +1802,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, force_successful_syscall_return(); return ns->ns_id; case NVME_IOCTL_ADMIN_CMD: - return nvme_user_admin_cmd(ns->dev, (void __user *)arg); + return nvme_user_cmd(ns->dev, (void __user *)arg, false); + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(ns->dev, (void __user *)arg, true); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); case SG_GET_VERSION_NUM: @@ -2743,7 +2748,9 @@ static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) struct nvme_dev *dev = f->private_data; switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_admin_cmd(dev, (void __user *)arg); + return nvme_user_cmd(dev, (void __user *)arg, false); + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(dev, (void __user *)arg, true); default: return -ENOTTY; } -- cgit From 387caa5a2b11ecb1abbbec8ef9d798f757ba3e5f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 22 Sep 2014 13:46:19 -0600 Subject: NVMe: Fix device probe waiting on kthread If we ever do parallel device probing, we need to wake up all processes waiting for nvme kthread to start, not just one. This is currently serialized so the bug is not reachable today, but fixing this anyway in the hopes we implement parallel or asynchronous probe in the future. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 70be3a151eb7..0196fce0ddf5 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2787,7 +2787,7 @@ static int nvme_dev_start(struct nvme_dev *dev) if (start_thread) { nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); - wake_up(&nvme_kthread_wait); + wake_up_all(&nvme_kthread_wait); } else wait_event_killable(nvme_kthread_wait, nvme_thread); -- cgit From 5940c8578fe720afbf4ef041bad0d72a101f1d88 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 4 Nov 2014 13:18:10 -0700 Subject: NVMe: Clear QUEUE_FLAG_STACKABLE The nvme namespace request_queue's flags are initialized to QUEUE_FLAG_DEFAULT, which currently sets QUEUE_FLAG_STACKABLE. The device-mapper indicates this flag means the block driver is requset based, though this driver is bio-based and problems will occur if an nvme namespace is used with a request based dm device. This patch clears the stackable flag. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 0196fce0ddf5..8fffc68c74eb 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2030,6 +2030,7 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, if (!ns->queue) goto out_free_ns; ns->queue->queue_flags = QUEUE_FLAG_DEFAULT; + queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, ns->queue); -- cgit From 9e60352cf83faaba57f99f6960b545687b8bbb20 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 3 Oct 2014 11:15:47 -0600 Subject: NVMe: Do not open disks that are being deleted It is possible the block layer will request to open a block device after the driver deleted it. Subsequent releases will cause a double free, or the disk's private_data is pointing to freed memory. This patch protects the driver's freed disks from being opened and accessed: the nvme namespaces are freed only when the device's refcount is 0, so at that moment there were no active openers and no more should be allowed, and it is safe to clear the disk's private_data that is about to be freed. Signed-off-by: Keith Busch Reported-by: Henry Chow Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 8fffc68c74eb..fb21d365efb5 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1832,11 +1832,18 @@ static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, static int nvme_open(struct block_device *bdev, fmode_t mode) { - struct nvme_ns *ns = bdev->bd_disk->private_data; - struct nvme_dev *dev = ns->dev; + int ret = 0; + struct nvme_ns *ns; - kref_get(&dev->kref); - return 0; + spin_lock(&dev_list_lock); + ns = bdev->bd_disk->private_data; + if (!ns) + ret = -ENXIO; + else if (!kref_get_unless_zero(&ns->dev->kref)) + ret = -ENXIO; + spin_unlock(&dev_list_lock); + + return ret; } static void nvme_free_dev(struct kref *kref); @@ -2711,6 +2718,11 @@ static void nvme_free_namespaces(struct nvme_dev *dev) list_for_each_entry_safe(ns, next, &dev->namespaces, list) { list_del(&ns->list); + + spin_lock(&dev_list_lock); + ns->disk->private_data = NULL; + spin_unlock(&dev_list_lock); + put_disk(ns->disk); kfree(ns); } -- cgit From 9dbbfab7d54109626031bf3bc476fb1804113970 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 6 Oct 2014 15:23:06 -0600 Subject: NVMe: Do not over allocate for discard requests Discard requests are often for very large ranges. The discard size is not representative of the data transfer size so we don't need to allocate for such a large prp list. This patch requests allocating only enough for the memory needed for the data transfer and saves a little over 8k of memory per max discard request. Signed-off-by: Keith Busch Reported-by: Paul Grabinar Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index fb21d365efb5..c70eff3673d0 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -763,11 +763,13 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, struct nvme_iod *iod; int psegs = bio_phys_segments(ns->queue, bio); int result; + unsigned size = !(bio->bi_rw & REQ_DISCARD) ? bio->bi_iter.bi_size : + sizeof(struct nvme_dsm_range); if ((bio->bi_rw & REQ_FLUSH) && psegs) return nvme_split_flush_data(nvmeq, bio); - iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, ns->dev, GFP_ATOMIC); + iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC); if (!iod) return -ENOMEM; -- cgit From a4aea5623d4a54682b6ff5c18196d7802f3e478f Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Tue, 4 Nov 2014 08:20:14 -0700 Subject: NVMe: Convert to blk-mq This converts the NVMe driver to a blk-mq request-based driver. The NVMe driver is currently bio-based and implements queue logic within itself. By using blk-mq, a lot of these responsibilities can be moved and simplified. The patch is divided into the following blocks: * Per-command data and cmdid have been moved into the struct request field. The cmdid_data can be retrieved using blk_mq_rq_to_pdu() and id maintenance are now handled by blk-mq through the rq->tag field. * The logic for splitting bio's has been moved into the blk-mq layer. The driver instead notifies the block layer about limited gap support in SG lists. * blk-mq handles timeouts and is reimplemented within nvme_timeout(). This both includes abort handling and command cancelation. * Assignment of nvme queues to CPUs are replaced with the blk-mq version. The current blk-mq strategy is to assign the number of mapped queues and CPUs to provide synergy, while the nvme driver assign as many nvme hw queues as possible. This can be implemented in blk-mq if needed. * NVMe queues are merged with the tags structure of blk-mq. * blk-mq takes care of setup/teardown of nvme queues and guards invalid accesses. Therefore, RCU-usage for nvme queues can be removed. * IO tracing and accounting are handled by blk-mq and therefore removed. * Queue suspension logic is replaced with the logic from the block layer. Contributions in this patch from: Sam Bradshaw Jens Axboe Keith Busch Robert Nelson Acked-by: Keith Busch Acked-by: Jens Axboe Updated for new ->queue_rq() prototype. Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 1366 ++++++++++++++++++--------------------------- 1 file changed, 557 insertions(+), 809 deletions(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index c70eff3673d0..39050a3d10fd 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -13,9 +13,9 @@ */ #include -#include #include #include +#include #include #include #include @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -42,9 +41,8 @@ #include #include -#include - #define NVME_Q_DEPTH 1024 +#define NVME_AQ_DEPTH 64 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define ADMIN_TIMEOUT (admin_timeout * HZ) @@ -81,10 +79,12 @@ static wait_queue_head_t nvme_kthread_wait; static struct notifier_block nvme_nb; static void nvme_reset_failed_dev(struct work_struct *ws); +static int nvme_process_cq(struct nvme_queue *nvmeq); struct async_cmd_info { struct kthread_work work; struct kthread_worker *worker; + struct request *req; u32 result; int status; void *ctx; @@ -104,10 +104,6 @@ struct nvme_queue { volatile struct nvme_completion *cqes; dma_addr_t sq_dma_addr; dma_addr_t cq_dma_addr; - wait_queue_head_t sq_full; - wait_queue_t sq_cong_wait; - struct bio_list sq_cong; - struct list_head iod_bio; u32 __iomem *q_db; u16 q_depth; u16 cq_vector; @@ -117,10 +113,8 @@ struct nvme_queue { u16 qid; u8 cq_phase; u8 cqe_seen; - u8 q_suspended; - cpumask_var_t cpu_mask; struct async_cmd_info cmdinfo; - unsigned long cmdid_data[]; + struct blk_mq_hw_ctx *hctx; }; /* @@ -148,62 +142,72 @@ typedef void (*nvme_completion_fn)(struct nvme_queue *, void *, struct nvme_cmd_info { nvme_completion_fn fn; void *ctx; - unsigned long timeout; int aborted; + struct nvme_queue *nvmeq; }; -static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) +static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) { - return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)]; + struct nvme_dev *dev = data; + struct nvme_queue *nvmeq = dev->queues[0]; + + WARN_ON(nvmeq->hctx); + nvmeq->hctx = hctx; + hctx->driver_data = nvmeq; + return 0; } -static unsigned nvme_queue_extra(int depth) +static int nvme_admin_init_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) { - return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info)); + struct nvme_dev *dev = data; + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = dev->queues[0]; + + BUG_ON(!nvmeq); + cmd->nvmeq = nvmeq; + return 0; } -/** - * alloc_cmdid() - Allocate a Command ID - * @nvmeq: The queue that will be used for this command - * @ctx: A pointer that will be passed to the handler - * @handler: The function to call on completion - * - * Allocate a Command ID for a queue. The data passed in will - * be passed to the completion handler. This is implemented by using - * the bottom two bits of the ctx pointer to store the handler ID. - * Passing in a pointer that's not 4-byte aligned will cause a BUG. - * We can change this if it becomes a problem. - * - * May be called with local interrupts disabled and the q_lock held, - * or with interrupts enabled and no locks held. - */ -static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, - nvme_completion_fn handler, unsigned timeout) +static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) { - int depth = nvmeq->q_depth - 1; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - int cmdid; + struct nvme_dev *dev = data; + struct nvme_queue *nvmeq = dev->queues[ + (hctx_idx % dev->queue_count) + 1]; - do { - cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth); - if (cmdid >= depth) - return -EBUSY; - } while (test_and_set_bit(cmdid, nvmeq->cmdid_data)); + if (!nvmeq->hctx) + nvmeq->hctx = hctx; + + /* nvmeq queues are shared between namespaces. We assume here that + * blk-mq map the tags so they match up with the nvme queue tags. */ + WARN_ON(nvmeq->hctx->tags != hctx->tags); - info[cmdid].fn = handler; - info[cmdid].ctx = ctx; - info[cmdid].timeout = jiffies + timeout; - info[cmdid].aborted = 0; - return cmdid; + hctx->driver_data = nvmeq; + return 0; } -static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, - nvme_completion_fn handler, unsigned timeout) +static int nvme_init_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) { - int cmdid; - wait_event_killable(nvmeq->sq_full, - (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0); - return (cmdid < 0) ? -EINTR : cmdid; + struct nvme_dev *dev = data; + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; + + BUG_ON(!nvmeq); + cmd->nvmeq = nvmeq; + return 0; +} + +static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx, + nvme_completion_fn handler) +{ + cmd->fn = handler; + cmd->ctx = ctx; + cmd->aborted = 0; } /* Special values must be less than 0x1000 */ @@ -211,18 +215,12 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, #define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) #define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) -#define CMD_CTX_ABORT (0x318 + CMD_CTX_BASE) -#define CMD_CTX_ASYNC (0x31C + CMD_CTX_BASE) static void special_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { if (ctx == CMD_CTX_CANCELLED) return; - if (ctx == CMD_CTX_ABORT) { - ++nvmeq->dev->abort_limit; - return; - } if (ctx == CMD_CTX_COMPLETED) { dev_warn(nvmeq->q_dmadev, "completed id %d twice on queue %d\n", @@ -235,110 +233,89 @@ static void special_completion(struct nvme_queue *nvmeq, void *ctx, cqe->command_id, le16_to_cpup(&cqe->sq_id)); return; } - if (ctx == CMD_CTX_ASYNC) { - u32 result = le32_to_cpup(&cqe->result); - u16 status = le16_to_cpup(&cqe->status) >> 1; - - if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) - ++nvmeq->dev->event_limit; - if (status == NVME_SC_SUCCESS) - dev_warn(nvmeq->q_dmadev, - "async event result %08x\n", result); - return; - } - dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); } -static void async_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct async_cmd_info *cmdinfo = ctx; - cmdinfo->result = le32_to_cpup(&cqe->result); - cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; - queue_kthread_work(cmdinfo->worker, &cmdinfo->work); -} - -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. - */ -static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid, - nvme_completion_fn *fn) +static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn) { void *ctx; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - if (cmdid >= nvmeq->q_depth || !info[cmdid].fn) { - if (fn) - *fn = special_completion; - return CMD_CTX_INVALID; - } if (fn) - *fn = info[cmdid].fn; - ctx = info[cmdid].ctx; - info[cmdid].fn = special_completion; - info[cmdid].ctx = CMD_CTX_COMPLETED; - clear_bit(cmdid, nvmeq->cmdid_data); - wake_up(&nvmeq->sq_full); + *fn = cmd->fn; + ctx = cmd->ctx; + cmd->fn = special_completion; + cmd->ctx = CMD_CTX_CANCELLED; return ctx; } -static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, - nvme_completion_fn *fn) +static void async_req_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) { - void *ctx; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - if (fn) - *fn = info[cmdid].fn; - ctx = info[cmdid].ctx; - info[cmdid].fn = special_completion; - info[cmdid].ctx = CMD_CTX_CANCELLED; - return ctx; -} + struct request *req = ctx; -static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid) -{ - return rcu_dereference_raw(dev->queues[qid]); + u32 result = le32_to_cpup(&cqe->result); + u16 status = le16_to_cpup(&cqe->status) >> 1; + + if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) + ++nvmeq->dev->event_limit; + if (status == NVME_SC_SUCCESS) + dev_warn(nvmeq->q_dmadev, + "async event result %08x\n", result); + + blk_put_request(req); } -static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU) +static void abort_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) { - struct nvme_queue *nvmeq; - unsigned queue_id = get_cpu_var(*dev->io_queue); + struct request *req = ctx; + + u16 status = le16_to_cpup(&cqe->status) >> 1; + u32 result = le32_to_cpup(&cqe->result); - rcu_read_lock(); - nvmeq = rcu_dereference(dev->queues[queue_id]); - if (nvmeq) - return nvmeq; + blk_put_request(req); - rcu_read_unlock(); - put_cpu_var(*dev->io_queue); - return NULL; + dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); + ++nvmeq->dev->abort_limit; } -static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) +static void async_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) { - rcu_read_unlock(); - put_cpu_var(nvmeq->dev->io_queue); + struct async_cmd_info *cmdinfo = ctx; + cmdinfo->result = le32_to_cpup(&cqe->result); + cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; + queue_kthread_work(cmdinfo->worker, &cmdinfo->work); + blk_put_request(cmdinfo->req); } -static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx) - __acquires(RCU) +static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq, + unsigned int tag) { - struct nvme_queue *nvmeq; - - rcu_read_lock(); - nvmeq = rcu_dereference(dev->queues[q_idx]); - if (nvmeq) - return nvmeq; + struct blk_mq_hw_ctx *hctx = nvmeq->hctx; + struct request *req = blk_mq_tag_to_rq(hctx->tags, tag); - rcu_read_unlock(); - return NULL; + return blk_mq_rq_to_pdu(req); } -static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) +/* + * Called with local interrupts disabled and the q_lock held. May not sleep. + */ +static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag, + nvme_completion_fn *fn) { - rcu_read_unlock(); + struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag); + void *ctx; + if (tag >= nvmeq->q_depth) { + *fn = special_completion; + return CMD_CTX_INVALID; + } + if (fn) + *fn = cmd->fn; + ctx = cmd->ctx; + cmd->fn = special_completion; + cmd->ctx = CMD_CTX_COMPLETED; + return ctx; } /** @@ -348,26 +325,29 @@ static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) * * Safe to use from interrupt context */ -static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +static int __nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) { - unsigned long flags; - u16 tail; - spin_lock_irqsave(&nvmeq->q_lock, flags); - if (nvmeq->q_suspended) { - spin_unlock_irqrestore(&nvmeq->q_lock, flags); - return -EBUSY; - } - tail = nvmeq->sq_tail; + u16 tail = nvmeq->sq_tail; + memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); if (++tail == nvmeq->q_depth) tail = 0; writel(tail, nvmeq->q_db); nvmeq->sq_tail = tail; - spin_unlock_irqrestore(&nvmeq->q_lock, flags); return 0; } +static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +{ + unsigned long flags; + int ret; + spin_lock_irqsave(&nvmeq->q_lock, flags); + ret = __nvme_submit_cmd(nvmeq, cmd); + spin_unlock_irqrestore(&nvmeq->q_lock, flags); + return ret; +} + static __le64 **iod_list(struct nvme_iod *iod) { return ((void *)iod) + iod->offset; @@ -397,7 +377,6 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp) iod->length = nbytes; iod->nents = 0; iod->first_dma = 0ULL; - iod->start_time = jiffies; } return iod; @@ -421,35 +400,6 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) kfree(iod); } -static void nvme_start_io_acct(struct bio *bio) -{ - struct gendisk *disk = bio->bi_bdev->bd_disk; - if (blk_queue_io_stat(disk->queue)) { - const int rw = bio_data_dir(bio); - int cpu = part_stat_lock(); - part_round_stats(cpu, &disk->part0); - part_stat_inc(cpu, &disk->part0, ios[rw]); - part_stat_add(cpu, &disk->part0, sectors[rw], - bio_sectors(bio)); - part_inc_in_flight(&disk->part0, rw); - part_stat_unlock(); - } -} - -static void nvme_end_io_acct(struct bio *bio, unsigned long start_time) -{ - struct gendisk *disk = bio->bi_bdev->bd_disk; - if (blk_queue_io_stat(disk->queue)) { - const int rw = bio_data_dir(bio); - unsigned long duration = jiffies - start_time; - int cpu = part_stat_lock(); - part_stat_add(cpu, &disk->part0, ticks[rw], duration); - part_round_stats(cpu, &disk->part0); - part_dec_in_flight(&disk->part0, rw); - part_stat_unlock(); - } -} - static int nvme_error_status(u16 status) { switch (status & 0x7ff) { @@ -462,36 +412,37 @@ static int nvme_error_status(u16 status) } } -static void bio_completion(struct nvme_queue *nvmeq, void *ctx, +static void req_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct nvme_iod *iod = ctx; - struct bio *bio = iod->private; + struct request *req = iod->private; + struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); + u16 status = le16_to_cpup(&cqe->status) >> 1; - int error = 0; if (unlikely(status)) { - if (!(status & NVME_SC_DNR || - bio->bi_rw & REQ_FAILFAST_MASK) && - (jiffies - iod->start_time) < IOD_TIMEOUT) { - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - list_add_tail(&iod->node, &nvmeq->iod_bio); - wake_up(&nvmeq->sq_full); + if (!(status & NVME_SC_DNR || blk_noretry_request(req)) + && (jiffies - req->start_time) < req->timeout) { + blk_mq_requeue_request(req); + blk_mq_kick_requeue_list(req->q); return; } - error = nvme_error_status(status); - } - if (iod->nents) { - dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents, - bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - nvme_end_io_acct(bio, iod->start_time); - } + req->errors = nvme_error_status(status); + } else + req->errors = 0; + + if (cmd_rq->aborted) + dev_warn(&nvmeq->dev->pci_dev->dev, + "completing aborted command with status:%04x\n", + status); + + if (iod->nents) + dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents, + rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); nvme_free_iod(nvmeq->dev, iod); - trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio, error); - bio_endio(bio, error); + blk_mq_complete_request(req); } /* length is in bytes. gfp flags indicates whether we may sleep. */ @@ -574,88 +525,25 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, return total_len; } -static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, - int len) -{ - struct bio *split = bio_split(bio, len >> 9, GFP_ATOMIC, NULL); - if (!split) - return -ENOMEM; - - trace_block_split(bdev_get_queue(bio->bi_bdev), bio, - split->bi_iter.bi_sector); - bio_chain(split, bio); - - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, split); - bio_list_add(&nvmeq->sq_cong, bio); - wake_up(&nvmeq->sq_full); - - return 0; -} - -/* NVMe scatterlists require no holes in the virtual address */ -#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ - (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) - -static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, - struct bio *bio, enum dma_data_direction dma_dir, int psegs) -{ - struct bio_vec bvec, bvprv; - struct bvec_iter iter; - struct scatterlist *sg = NULL; - int length = 0, nsegs = 0, split_len = bio->bi_iter.bi_size; - int first = 1; - - if (nvmeq->dev->stripe_size) - split_len = nvmeq->dev->stripe_size - - ((bio->bi_iter.bi_sector << 9) & - (nvmeq->dev->stripe_size - 1)); - - sg_init_table(iod->sg, psegs); - bio_for_each_segment(bvec, bio, iter) { - if (!first && BIOVEC_PHYS_MERGEABLE(&bvprv, &bvec)) { - sg->length += bvec.bv_len; - } else { - if (!first && BIOVEC_NOT_VIRT_MERGEABLE(&bvprv, &bvec)) - return nvme_split_and_submit(bio, nvmeq, - length); - - sg = sg ? sg + 1 : iod->sg; - sg_set_page(sg, bvec.bv_page, - bvec.bv_len, bvec.bv_offset); - nsegs++; - } - - if (split_len - length < bvec.bv_len) - return nvme_split_and_submit(bio, nvmeq, split_len); - length += bvec.bv_len; - bvprv = bvec; - first = 0; - } - iod->nents = nsegs; - sg_mark_end(sg); - if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0) - return -ENOMEM; - - BUG_ON(length != bio->bi_iter.bi_size); - return length; -} - -static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct bio *bio, struct nvme_iod *iod, int cmdid) +/* + * We reuse the small pool to allocate the 16-byte range here as it is not + * worth having a special pool for these or additional cases to handle freeing + * the iod. + */ +static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct request *req, struct nvme_iod *iod) { struct nvme_dsm_range *range = (struct nvme_dsm_range *)iod_list(iod)[0]; struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; range->cattr = cpu_to_le32(0); - range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift); - range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); + range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift); + range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); memset(cmnd, 0, sizeof(*cmnd)); cmnd->dsm.opcode = nvme_cmd_dsm; - cmnd->dsm.command_id = cmdid; + cmnd->dsm.command_id = req->tag; cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); cmnd->dsm.nr = 0; @@ -664,11 +552,9 @@ static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; writel(nvmeq->sq_tail, nvmeq->q_db); - - return 0; } -static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, +static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, int cmdid) { struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; @@ -681,49 +567,34 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; writel(nvmeq->sq_tail, nvmeq->q_db); - - return 0; } -static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) +static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, + struct nvme_ns *ns) { - struct bio *bio = iod->private; - struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; + struct request *req = iod->private; struct nvme_command *cmnd; - int cmdid; - u16 control; - u32 dsmgmt; + u16 control = 0; + u32 dsmgmt = 0; - cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT); - if (unlikely(cmdid < 0)) - return cmdid; - - if (bio->bi_rw & REQ_DISCARD) - return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid); - if (bio->bi_rw & REQ_FLUSH) - return nvme_submit_flush(nvmeq, ns, cmdid); - - control = 0; - if (bio->bi_rw & REQ_FUA) + if (req->cmd_flags & REQ_FUA) control |= NVME_RW_FUA; - if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD)) + if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) control |= NVME_RW_LR; - dsmgmt = 0; - if (bio->bi_rw & REQ_RAHEAD) + if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; memset(cmnd, 0, sizeof(*cmnd)); - cmnd->rw.opcode = bio_data_dir(bio) ? nvme_cmd_write : nvme_cmd_read; - cmnd->rw.command_id = cmdid; + cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); + cmnd->rw.command_id = req->tag; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); - cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); - cmnd->rw.length = - cpu_to_le16((bio->bi_iter.bi_size >> ns->lba_shift) - 1); + cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); @@ -734,47 +605,37 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) return 0; } -static int nvme_split_flush_data(struct nvme_queue *nvmeq, struct bio *bio) -{ - struct bio *split = bio_clone(bio, GFP_ATOMIC); - if (!split) - return -ENOMEM; - - split->bi_iter.bi_size = 0; - split->bi_phys_segments = 0; - bio->bi_rw &= ~REQ_FLUSH; - bio_chain(split, bio); - - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, split); - bio_list_add(&nvmeq->sq_cong, bio); - wake_up_process(nvme_thread); - - return 0; -} - -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. - */ -static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct bio *bio) +static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_queue *nvmeq = hctx->driver_data; + struct request *req = bd->rq; + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); struct nvme_iod *iod; - int psegs = bio_phys_segments(ns->queue, bio); - int result; - unsigned size = !(bio->bi_rw & REQ_DISCARD) ? bio->bi_iter.bi_size : + int psegs = req->nr_phys_segments; + int result = BLK_MQ_RQ_QUEUE_BUSY; + enum dma_data_direction dma_dir; + unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) : sizeof(struct nvme_dsm_range); - if ((bio->bi_rw & REQ_FLUSH) && psegs) - return nvme_split_flush_data(nvmeq, bio); + /* + * Requeued IO has already been prepped + */ + iod = req->special; + if (iod) + goto submit_iod; iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC); if (!iod) - return -ENOMEM; + return result; + + iod->private = req; + req->special = iod; - iod->private = bio; - if (bio->bi_rw & REQ_DISCARD) { + nvme_set_info(cmd, iod, req_completion); + + if (req->cmd_flags & REQ_DISCARD) { void *range; /* * We reuse the small pool to allocate the 16-byte range here @@ -784,33 +645,45 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, &iod->first_dma); - if (!range) { - result = -ENOMEM; - goto free_iod; - } + if (!range) + goto finish_cmd; iod_list(iod)[0] = (__le64 *)range; iod->npages = 0; } else if (psegs) { - result = nvme_map_bio(nvmeq, iod, bio, - bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, - psegs); - if (result <= 0) - goto free_iod; - if (nvme_setup_prps(nvmeq->dev, iod, result, GFP_ATOMIC) != - result) { - result = -ENOMEM; - goto free_iod; + dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; + + sg_init_table(iod->sg, psegs); + iod->nents = blk_rq_map_sg(req->q, req, iod->sg); + if (!iod->nents) { + result = BLK_MQ_RQ_QUEUE_ERROR; + goto finish_cmd; } - nvme_start_io_acct(bio); - } - if (unlikely(nvme_submit_iod(nvmeq, iod))) { - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - list_add_tail(&iod->node, &nvmeq->iod_bio); + + if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir)) + goto finish_cmd; + + if (blk_rq_bytes(req) != nvme_setup_prps(nvmeq->dev, iod, + blk_rq_bytes(req), GFP_ATOMIC)) + goto finish_cmd; } - return 0; - free_iod: + blk_mq_start_request(req); + + submit_iod: + spin_lock_irq(&nvmeq->q_lock); + if (req->cmd_flags & REQ_DISCARD) + nvme_submit_discard(nvmeq, ns, req, iod); + else if (req->cmd_flags & REQ_FLUSH) + nvme_submit_flush(nvmeq, ns, req->tag); + else + nvme_submit_iod(nvmeq, iod, ns); + + nvme_process_cq(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); + return BLK_MQ_RQ_QUEUE_OK; + + finish_cmd: + nvme_finish_cmd(nvmeq, req->tag, NULL); nvme_free_iod(nvmeq->dev, iod); return result; } @@ -833,8 +706,7 @@ static int nvme_process_cq(struct nvme_queue *nvmeq) head = 0; phase = !phase; } - - ctx = free_cmdid(nvmeq, cqe.command_id, &fn); + ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn); fn(nvmeq, ctx, &cqe); } @@ -855,29 +727,13 @@ static int nvme_process_cq(struct nvme_queue *nvmeq) return 1; } -static void nvme_make_request(struct request_queue *q, struct bio *bio) +/* Admin queue isn't initialized as a request queue. If at some point this + * happens anyway, make sure to notify the user */ +static int nvme_admin_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct nvme_ns *ns = q->queuedata; - struct nvme_queue *nvmeq = get_nvmeq(ns->dev); - int result = -EBUSY; - - if (!nvmeq) { - bio_endio(bio, -EIO); - return; - } - - spin_lock_irq(&nvmeq->q_lock); - if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong)) - result = nvme_submit_bio_queue(nvmeq, ns, bio); - if (unlikely(result)) { - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, bio); - } - - nvme_process_cq(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); - put_nvmeq(nvmeq); + WARN_ON_ONCE(1); + return BLK_MQ_RQ_QUEUE_ERROR; } static irqreturn_t nvme_irq(int irq, void *data) @@ -901,10 +757,11 @@ static irqreturn_t nvme_irq_check(int irq, void *data) return IRQ_WAKE_THREAD; } -static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) +static void nvme_abort_cmd_info(struct nvme_queue *nvmeq, struct nvme_cmd_info * + cmd_info) { spin_lock_irq(&nvmeq->q_lock); - cancel_cmdid(nvmeq, cmdid, NULL); + cancel_cmd_info(cmd_info, NULL); spin_unlock_irq(&nvmeq->q_lock); } @@ -927,45 +784,31 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, * Returns 0 on success. If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code */ -static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx, - struct nvme_command *cmd, +static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd, u32 *result, unsigned timeout) { - int cmdid, ret; + int ret; struct sync_cmd_info cmdinfo; - struct nvme_queue *nvmeq; - - nvmeq = lock_nvmeq(dev, q_idx); - if (!nvmeq) - return -ENODEV; + struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = cmd_rq->nvmeq; cmdinfo.task = current; cmdinfo.status = -EINTR; - cmdid = alloc_cmdid(nvmeq, &cmdinfo, sync_completion, timeout); - if (cmdid < 0) { - unlock_nvmeq(nvmeq); - return cmdid; - } - cmd->common.command_id = cmdid; + cmd->common.command_id = req->tag; + + nvme_set_info(cmd_rq, &cmdinfo, sync_completion); set_current_state(TASK_KILLABLE); ret = nvme_submit_cmd(nvmeq, cmd); if (ret) { - free_cmdid(nvmeq, cmdid, NULL); - unlock_nvmeq(nvmeq); + nvme_finish_cmd(nvmeq, req->tag, NULL); set_current_state(TASK_RUNNING); - return ret; } - unlock_nvmeq(nvmeq); schedule_timeout(timeout); if (cmdinfo.status == -EINTR) { - nvmeq = lock_nvmeq(dev, q_idx); - if (nvmeq) { - nvme_abort_command(nvmeq, cmdid); - unlock_nvmeq(nvmeq); - } + nvme_abort_cmd_info(nvmeq, blk_mq_rq_to_pdu(req)); return -EINTR; } @@ -975,59 +818,99 @@ static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx, return cmdinfo.status; } -static int nvme_submit_async_cmd(struct nvme_queue *nvmeq, +static int nvme_submit_async_admin_req(struct nvme_dev *dev) +{ + struct nvme_queue *nvmeq = dev->queues[0]; + struct nvme_command c; + struct nvme_cmd_info *cmd_info; + struct request *req; + + req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); + if (!req) + return -ENOMEM; + + cmd_info = blk_mq_rq_to_pdu(req); + nvme_set_info(cmd_info, req, async_req_completion); + + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_async_event; + c.common.command_id = req->tag; + + return __nvme_submit_cmd(nvmeq, &c); +} + +static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, struct nvme_command *cmd, struct async_cmd_info *cmdinfo, unsigned timeout) { - int cmdid; + struct nvme_queue *nvmeq = dev->queues[0]; + struct request *req; + struct nvme_cmd_info *cmd_rq; - cmdid = alloc_cmdid_killable(nvmeq, cmdinfo, async_completion, timeout); - if (cmdid < 0) - return cmdid; + req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); + if (!req) + return -ENOMEM; + + req->timeout = timeout; + cmd_rq = blk_mq_rq_to_pdu(req); + cmdinfo->req = req; + nvme_set_info(cmd_rq, cmdinfo, async_completion); cmdinfo->status = -EINTR; - cmd->common.command_id = cmdid; + + cmd->common.command_id = req->tag; + return nvme_submit_cmd(nvmeq, cmd); } -int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, - u32 *result) +int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, + u32 *result, unsigned timeout) { - return nvme_submit_sync_cmd(dev, 0, cmd, result, ADMIN_TIMEOUT); + int res; + struct request *req; + + req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); + if (!req) + return -ENOMEM; + res = nvme_submit_sync_cmd(req, cmd, result, timeout); + blk_put_request(req); + return res; } -int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd, +int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, u32 *result) { - return nvme_submit_sync_cmd(dev, this_cpu_read(*dev->io_queue), cmd, - result, NVME_IO_TIMEOUT); + return __nvme_submit_admin_cmd(dev, cmd, result, ADMIN_TIMEOUT); } -static int nvme_submit_admin_cmd_async(struct nvme_dev *dev, - struct nvme_command *cmd, struct async_cmd_info *cmdinfo) +int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_ns *ns, + struct nvme_command *cmd, u32 *result) { - return nvme_submit_async_cmd(raw_nvmeq(dev, 0), cmd, cmdinfo, - ADMIN_TIMEOUT); + int res; + struct request *req; + + req = blk_mq_alloc_request(ns->queue, WRITE, (GFP_KERNEL|__GFP_WAIT), + false); + if (!req) + return -ENOMEM; + res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT); + blk_put_request(req); + return res; } static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) { - int status; struct nvme_command c; memset(&c, 0, sizeof(c)); c.delete_queue.opcode = opcode; c.delete_queue.qid = cpu_to_le16(id); - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; + return nvme_submit_admin_cmd(dev, &c, NULL); } static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, struct nvme_queue *nvmeq) { - int status; struct nvme_command c; int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; @@ -1039,16 +922,12 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, c.create_cq.cq_flags = cpu_to_le16(flags); c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; + return nvme_submit_admin_cmd(dev, &c, NULL); } static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, struct nvme_queue *nvmeq) { - int status; struct nvme_command c; int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; @@ -1060,10 +939,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, c.create_sq.sq_flags = cpu_to_le16(flags); c.create_sq.cqid = cpu_to_le16(qid); - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; + return nvme_submit_admin_cmd(dev, &c, NULL); } static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) @@ -1119,28 +995,27 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, } /** - * nvme_abort_cmd - Attempt aborting a command - * @cmdid: Command id of a timed out IO - * @queue: The queue with timed out IO + * nvme_abort_req - Attempt aborting a request * * Schedule controller reset if the command was already aborted once before and * still hasn't been returned to the driver, or if this is the admin queue. */ -static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq) +static void nvme_abort_req(struct request *req) { - int a_cmdid; - struct nvme_command cmd; + struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = cmd_rq->nvmeq; struct nvme_dev *dev = nvmeq->dev; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - struct nvme_queue *adminq; + struct request *abort_req; + struct nvme_cmd_info *abort_cmd; + struct nvme_command cmd; - if (!nvmeq->qid || info[cmdid].aborted) { + if (!nvmeq->qid || cmd_rq->aborted) { if (work_busy(&dev->reset_work)) return; list_del_init(&dev->node); dev_warn(&dev->pci_dev->dev, - "I/O %d QID %d timeout, reset controller\n", cmdid, - nvmeq->qid); + "I/O %d QID %d timeout, reset controller\n", + req->tag, nvmeq->qid); dev->reset_workfn = nvme_reset_failed_dev; queue_work(nvme_workq, &dev->reset_work); return; @@ -1149,89 +1024,79 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq) if (!dev->abort_limit) return; - adminq = rcu_dereference(dev->queues[0]); - a_cmdid = alloc_cmdid(adminq, CMD_CTX_ABORT, special_completion, - ADMIN_TIMEOUT); - if (a_cmdid < 0) + abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, + false); + if (!abort_req) return; + abort_cmd = blk_mq_rq_to_pdu(abort_req); + nvme_set_info(abort_cmd, abort_req, abort_completion); + memset(&cmd, 0, sizeof(cmd)); cmd.abort.opcode = nvme_admin_abort_cmd; - cmd.abort.cid = cmdid; + cmd.abort.cid = req->tag; cmd.abort.sqid = cpu_to_le16(nvmeq->qid); - cmd.abort.command_id = a_cmdid; + cmd.abort.command_id = abort_req->tag; --dev->abort_limit; - info[cmdid].aborted = 1; - info[cmdid].timeout = jiffies + ADMIN_TIMEOUT; + cmd_rq->aborted = 1; - dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid, + dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag, nvmeq->qid); - nvme_submit_cmd(adminq, &cmd); + if (nvme_submit_cmd(dev->queues[0], &cmd) < 0) { + dev_warn(nvmeq->q_dmadev, + "Could not abort I/O %d QID %d", + req->tag, nvmeq->qid); + blk_put_request(req); + } } -/** - * nvme_cancel_ios - Cancel outstanding I/Os - * @queue: The queue to cancel I/Os on - * @timeout: True to only cancel I/Os which have timed out - */ -static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) +static void nvme_cancel_queue_ios(struct blk_mq_hw_ctx *hctx, + struct request *req, void *data, bool reserved) { - int depth = nvmeq->q_depth - 1; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - unsigned long now = jiffies; - int cmdid; + struct nvme_queue *nvmeq = data; + void *ctx; + nvme_completion_fn fn; + struct nvme_cmd_info *cmd; + static struct nvme_completion cqe = { + .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1), + }; - for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { - void *ctx; - nvme_completion_fn fn; - static struct nvme_completion cqe = { - .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1), - }; + cmd = blk_mq_rq_to_pdu(req); - if (timeout && !time_after(now, info[cmdid].timeout)) - continue; - if (info[cmdid].ctx == CMD_CTX_CANCELLED) - continue; - if (timeout && info[cmdid].ctx == CMD_CTX_ASYNC) - continue; - if (timeout && nvmeq->dev->initialized) { - nvme_abort_cmd(cmdid, nvmeq); - continue; - } - dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid, - nvmeq->qid); - ctx = cancel_cmdid(nvmeq, cmdid, &fn); - fn(nvmeq, ctx, &cqe); - } + if (cmd->ctx == CMD_CTX_CANCELLED) + return; + + dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", + req->tag, nvmeq->qid); + ctx = cancel_cmd_info(cmd, &fn); + fn(nvmeq, ctx, &cqe); } -static void nvme_free_queue(struct nvme_queue *nvmeq) +static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) { - spin_lock_irq(&nvmeq->q_lock); - while (bio_list_peek(&nvmeq->sq_cong)) { - struct bio *bio = bio_list_pop(&nvmeq->sq_cong); - bio_endio(bio, -EIO); - } - while (!list_empty(&nvmeq->iod_bio)) { - static struct nvme_completion cqe = { - .status = cpu_to_le16( - (NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1), - }; - struct nvme_iod *iod = list_first_entry(&nvmeq->iod_bio, - struct nvme_iod, - node); - list_del(&iod->node); - bio_completion(nvmeq, iod, &cqe); - } - spin_unlock_irq(&nvmeq->q_lock); + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = cmd->nvmeq; + + dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag, + nvmeq->qid); + if (nvmeq->dev->initialized) + nvme_abort_req(req); + + /* + * The aborted req will be completed on receiving the abort req. + * We enable the timer again. If hit twice, it'll cause a device reset, + * as the device then is in a faulty state. + */ + return BLK_EH_RESET_TIMER; +} +static void nvme_free_queue(struct nvme_queue *nvmeq) +{ dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), nvmeq->sq_cmds, nvmeq->sq_dma_addr); - if (nvmeq->qid) - free_cpumask_var(nvmeq->cpu_mask); kfree(nvmeq); } @@ -1243,10 +1108,10 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) int i; for (i = dev->queue_count - 1; i >= lowest; i--) { - nvmeq = raw_nvmeq(dev, i); - RCU_INIT_POINTER(dev->queues[i], NULL); + struct nvme_queue *nvmeq = dev->queues[i]; llist_add(&nvmeq->node, &q_list); dev->queue_count--; + dev->queues[i] = NULL; } synchronize_rcu(); entry = llist_del_all(&q_list); @@ -1257,19 +1122,12 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) /** * nvme_suspend_queue - put queue into suspended state * @nvmeq - queue to suspend - * - * Returns 1 if already suspended, 0 otherwise. */ static int nvme_suspend_queue(struct nvme_queue *nvmeq) { int vector = nvmeq->dev->entry[nvmeq->cq_vector].vector; spin_lock_irq(&nvmeq->q_lock); - if (nvmeq->q_suspended) { - spin_unlock_irq(&nvmeq->q_lock); - return 1; - } - nvmeq->q_suspended = 1; nvmeq->dev->online_queues--; spin_unlock_irq(&nvmeq->q_lock); @@ -1281,15 +1139,18 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) static void nvme_clear_queue(struct nvme_queue *nvmeq) { + struct blk_mq_hw_ctx *hctx = nvmeq->hctx; + spin_lock_irq(&nvmeq->q_lock); nvme_process_cq(nvmeq); - nvme_cancel_ios(nvmeq, false); + if (hctx && hctx->tags) + blk_mq_tag_busy_iter(hctx, nvme_cancel_queue_ios, nvmeq); spin_unlock_irq(&nvmeq->q_lock); } static void nvme_disable_queue(struct nvme_dev *dev, int qid) { - struct nvme_queue *nvmeq = raw_nvmeq(dev, qid); + struct nvme_queue *nvmeq = dev->queues[qid]; if (!nvmeq) return; @@ -1309,8 +1170,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth, int vector) { struct device *dmadev = &dev->pci_dev->dev; - unsigned extra = nvme_queue_extra(depth); - struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); + struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL); if (!nvmeq) return NULL; @@ -1324,9 +1184,6 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, if (!nvmeq->sq_cmds) goto free_cqdma; - if (qid && !zalloc_cpumask_var(&nvmeq->cpu_mask, GFP_KERNEL)) - goto free_sqdma; - nvmeq->q_dmadev = dmadev; nvmeq->dev = dev; snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", @@ -1334,22 +1191,15 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, spin_lock_init(&nvmeq->q_lock); nvmeq->cq_head = 0; nvmeq->cq_phase = 1; - init_waitqueue_head(&nvmeq->sq_full); - bio_list_init(&nvmeq->sq_cong); - INIT_LIST_HEAD(&nvmeq->iod_bio); nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; nvmeq->q_depth = depth; nvmeq->cq_vector = vector; nvmeq->qid = qid; - nvmeq->q_suspended = 1; dev->queue_count++; - rcu_assign_pointer(dev->queues[qid], nvmeq); + dev->queues[qid] = nvmeq; return nvmeq; - free_sqdma: - dma_free_coherent(dmadev, SQ_SIZE(depth), (void *)nvmeq->sq_cmds, - nvmeq->sq_dma_addr); free_cqdma: dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); @@ -1372,18 +1222,13 @@ static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) { struct nvme_dev *dev = nvmeq->dev; - unsigned extra = nvme_queue_extra(nvmeq->q_depth); spin_lock_irq(&nvmeq->q_lock); - init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); nvmeq->sq_tail = 0; nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; - memset(nvmeq->cmdid_data, 0, extra); memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); - nvme_cancel_ios(nvmeq, false); - nvmeq->q_suspended = 0; dev->online_queues++; spin_unlock_irq(&nvmeq->q_lock); } @@ -1486,6 +1331,52 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev) return 0; } +static struct blk_mq_ops nvme_mq_admin_ops = { + .queue_rq = nvme_admin_queue_rq, + .map_queue = blk_mq_map_queue, + .init_hctx = nvme_admin_init_hctx, + .init_request = nvme_admin_init_request, + .timeout = nvme_timeout, +}; + +static struct blk_mq_ops nvme_mq_ops = { + .queue_rq = nvme_queue_rq, + .map_queue = blk_mq_map_queue, + .init_hctx = nvme_init_hctx, + .init_request = nvme_init_request, + .timeout = nvme_timeout, +}; + +static int nvme_alloc_admin_tags(struct nvme_dev *dev) +{ + if (!dev->admin_q) { + dev->admin_tagset.ops = &nvme_mq_admin_ops; + dev->admin_tagset.nr_hw_queues = 1; + dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; + dev->admin_tagset.timeout = ADMIN_TIMEOUT; + dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev); + dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info); + dev->admin_tagset.driver_data = dev; + + if (blk_mq_alloc_tag_set(&dev->admin_tagset)) + return -ENOMEM; + + dev->admin_q = blk_mq_init_queue(&dev->admin_tagset); + if (!dev->admin_q) { + blk_mq_free_tag_set(&dev->admin_tagset); + return -ENOMEM; + } + } + + return 0; +} + +static void nvme_free_admin_tags(struct nvme_dev *dev) +{ + if (dev->admin_q) + blk_mq_free_tag_set(&dev->admin_tagset); +} + static int nvme_configure_admin_queue(struct nvme_dev *dev) { int result; @@ -1515,9 +1406,9 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) if (result < 0) return result; - nvmeq = raw_nvmeq(dev, 0); + nvmeq = dev->queues[0]; if (!nvmeq) { - nvmeq = nvme_alloc_queue(dev, 0, 64, 0); + nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, 0); if (!nvmeq) return -ENOMEM; } @@ -1538,13 +1429,23 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) result = nvme_enable_ctrl(dev, cap); if (result) - return result; + goto free_nvmeq; + + result = nvme_alloc_admin_tags(dev); + if (result) + goto free_nvmeq; result = queue_request_irq(dev, nvmeq, nvmeq->irqname); if (result) - return result; + goto free_tags; return result; + + free_tags: + nvme_free_admin_tags(dev); + free_nvmeq: + nvme_free_queues(dev, 0); + return result; } struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, @@ -1702,7 +1603,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) if (length != (io.nblocks + 1) << ns->lba_shift) status = -ENOMEM; else - status = nvme_submit_io_cmd(dev, &c, NULL); + status = nvme_submit_io_cmd(dev, ns, &c, NULL); if (meta_len) { if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) { @@ -1734,8 +1635,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return status; } -static int nvme_user_cmd(struct nvme_dev *dev, - struct nvme_passthru_cmd __user *ucmd, bool ioq) +static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, + struct nvme_passthru_cmd __user *ucmd) { struct nvme_passthru_cmd cmd; struct nvme_command c; @@ -1774,13 +1675,23 @@ static int nvme_user_cmd(struct nvme_dev *dev, timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) : ADMIN_TIMEOUT; + if (length != cmd.data_len) status = -ENOMEM; - else if (ioq) - status = nvme_submit_sync_cmd(dev, this_cpu_read(*dev->io_queue), &c, - &cmd.result, timeout); - else - status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout); + else if (ns) { + struct request *req; + + req = blk_mq_alloc_request(ns->queue, WRITE, + (GFP_KERNEL|__GFP_WAIT), false); + if (!req) + status = -ENOMEM; + else { + status = nvme_submit_sync_cmd(req, &c, &cmd.result, + timeout); + blk_put_request(req); + } + } else + status = __nvme_submit_admin_cmd(dev, &c, &cmd.result, timeout); if (cmd.data_len) { nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); @@ -1804,9 +1715,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, force_successful_syscall_return(); return ns->ns_id; case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ns->dev, (void __user *)arg, false); + return nvme_user_cmd(ns->dev, NULL, (void __user *)arg); case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(ns->dev, (void __user *)arg, true); + return nvme_user_cmd(ns->dev, ns, (void __user *)arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); case SG_GET_VERSION_NUM: @@ -1906,62 +1817,6 @@ static const struct block_device_operations nvme_fops = { .revalidate_disk= nvme_revalidate_disk, }; -static void nvme_resubmit_iods(struct nvme_queue *nvmeq) -{ - struct nvme_iod *iod, *next; - - list_for_each_entry_safe(iod, next, &nvmeq->iod_bio, node) { - if (unlikely(nvme_submit_iod(nvmeq, iod))) - break; - list_del(&iod->node); - if (bio_list_empty(&nvmeq->sq_cong) && - list_empty(&nvmeq->iod_bio)) - remove_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - } -} - -static void nvme_resubmit_bios(struct nvme_queue *nvmeq) -{ - while (bio_list_peek(&nvmeq->sq_cong)) { - struct bio *bio = bio_list_pop(&nvmeq->sq_cong); - struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; - - if (bio_list_empty(&nvmeq->sq_cong) && - list_empty(&nvmeq->iod_bio)) - remove_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - if (nvme_submit_bio_queue(nvmeq, ns, bio)) { - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - bio_list_add_head(&nvmeq->sq_cong, bio); - break; - } - } -} - -static int nvme_submit_async_req(struct nvme_queue *nvmeq) -{ - struct nvme_command *c; - int cmdid; - - cmdid = alloc_cmdid(nvmeq, CMD_CTX_ASYNC, special_completion, 0); - if (cmdid < 0) - return cmdid; - - c = &nvmeq->sq_cmds[nvmeq->sq_tail]; - memset(c, 0, sizeof(*c)); - c->common.opcode = nvme_admin_async_event; - c->common.command_id = cmdid; - - if (++nvmeq->sq_tail == nvmeq->q_depth) - nvmeq->sq_tail = 0; - writel(nvmeq->sq_tail, nvmeq->q_db); - - return 0; -} - static int nvme_kthread(void *data) { struct nvme_dev *dev, *next; @@ -1977,34 +1832,26 @@ static int nvme_kthread(void *data) continue; list_del_init(&dev->node); dev_warn(&dev->pci_dev->dev, - "Failed status, reset controller\n"); + "Failed status: %x, reset controller\n", + readl(&dev->bar->csts)); dev->reset_workfn = nvme_reset_failed_dev; queue_work(nvme_workq, &dev->reset_work); continue; } - rcu_read_lock(); for (i = 0; i < dev->queue_count; i++) { - struct nvme_queue *nvmeq = - rcu_dereference(dev->queues[i]); + struct nvme_queue *nvmeq = dev->queues[i]; if (!nvmeq) continue; spin_lock_irq(&nvmeq->q_lock); - if (nvmeq->q_suspended) - goto unlock; nvme_process_cq(nvmeq); - nvme_cancel_ios(nvmeq, true); - nvme_resubmit_bios(nvmeq); - nvme_resubmit_iods(nvmeq); while ((i == 0) && (dev->event_limit > 0)) { - if (nvme_submit_async_req(nvmeq)) + if (nvme_submit_async_admin_req(dev)) break; dev->event_limit--; } - unlock: spin_unlock_irq(&nvmeq->q_lock); } - rcu_read_unlock(); } spin_unlock(&dev_list_lock); schedule_timeout(round_jiffies_relative(HZ)); @@ -2027,29 +1874,29 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, { struct nvme_ns *ns; struct gendisk *disk; + int node = dev_to_node(&dev->pci_dev->dev); int lbaf; if (rt->attributes & NVME_LBART_ATTRIB_HIDE) return NULL; - ns = kzalloc(sizeof(*ns), GFP_KERNEL); + ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) return NULL; - ns->queue = blk_alloc_queue(GFP_KERNEL); + ns->queue = blk_mq_init_queue(&dev->tagset); if (!ns->queue) goto out_free_ns; - ns->queue->queue_flags = QUEUE_FLAG_DEFAULT; - queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); - queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, ns->queue); - blk_queue_make_request(ns->queue, nvme_make_request); + queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, ns->queue); + queue_flag_clear_unlocked(QUEUE_FLAG_IO_STAT, ns->queue); ns->dev = dev; ns->queue->queuedata = ns; - disk = alloc_disk(0); + disk = alloc_disk_node(0, node); if (!disk) goto out_free_queue; + ns->ns_id = nsid; ns->disk = disk; lbaf = id->flbas & 0xf; @@ -2058,6 +1905,8 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); if (dev->max_hw_sectors) blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); + if (dev->stripe_size) + blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9); if (dev->vwc & NVME_CTRL_VWC_PRESENT) blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); @@ -2083,143 +1932,19 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, return NULL; } -static int nvme_find_closest_node(int node) -{ - int n, val, min_val = INT_MAX, best_node = node; - - for_each_online_node(n) { - if (n == node) - continue; - val = node_distance(node, n); - if (val < min_val) { - min_val = val; - best_node = n; - } - } - return best_node; -} - -static void nvme_set_queue_cpus(cpumask_t *qmask, struct nvme_queue *nvmeq, - int count) -{ - int cpu; - for_each_cpu(cpu, qmask) { - if (cpumask_weight(nvmeq->cpu_mask) >= count) - break; - if (!cpumask_test_and_set_cpu(cpu, nvmeq->cpu_mask)) - *per_cpu_ptr(nvmeq->dev->io_queue, cpu) = nvmeq->qid; - } -} - -static void nvme_add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus, - const cpumask_t *new_mask, struct nvme_queue *nvmeq, int cpus_per_queue) -{ - int next_cpu; - for_each_cpu(next_cpu, new_mask) { - cpumask_or(mask, mask, get_cpu_mask(next_cpu)); - cpumask_or(mask, mask, topology_thread_cpumask(next_cpu)); - cpumask_and(mask, mask, unassigned_cpus); - nvme_set_queue_cpus(mask, nvmeq, cpus_per_queue); - } -} - static void nvme_create_io_queues(struct nvme_dev *dev) { - unsigned i, max; + unsigned i; - max = min(dev->max_qid, num_online_cpus()); - for (i = dev->queue_count; i <= max; i++) + for (i = dev->queue_count; i <= dev->max_qid; i++) if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1)) break; - max = min(dev->queue_count - 1, num_online_cpus()); - for (i = dev->online_queues; i <= max; i++) - if (nvme_create_queue(raw_nvmeq(dev, i), i)) + for (i = dev->online_queues; i <= dev->queue_count - 1; i++) + if (nvme_create_queue(dev->queues[i], i)) break; } -/* - * If there are fewer queues than online cpus, this will try to optimally - * assign a queue to multiple cpus by grouping cpus that are "close" together: - * thread siblings, core, socket, closest node, then whatever else is - * available. - */ -static void nvme_assign_io_queues(struct nvme_dev *dev) -{ - unsigned cpu, cpus_per_queue, queues, remainder, i; - cpumask_var_t unassigned_cpus; - - nvme_create_io_queues(dev); - - queues = min(dev->online_queues - 1, num_online_cpus()); - if (!queues) - return; - - cpus_per_queue = num_online_cpus() / queues; - remainder = queues - (num_online_cpus() - queues * cpus_per_queue); - - if (!alloc_cpumask_var(&unassigned_cpus, GFP_KERNEL)) - return; - - cpumask_copy(unassigned_cpus, cpu_online_mask); - cpu = cpumask_first(unassigned_cpus); - for (i = 1; i <= queues; i++) { - struct nvme_queue *nvmeq = lock_nvmeq(dev, i); - cpumask_t mask; - - cpumask_clear(nvmeq->cpu_mask); - if (!cpumask_weight(unassigned_cpus)) { - unlock_nvmeq(nvmeq); - break; - } - - mask = *get_cpu_mask(cpu); - nvme_set_queue_cpus(&mask, nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - topology_thread_cpumask(cpu), - nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - topology_core_cpumask(cpu), - nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - cpumask_of_node(cpu_to_node(cpu)), - nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - cpumask_of_node( - nvme_find_closest_node( - cpu_to_node(cpu))), - nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - unassigned_cpus, - nvmeq, cpus_per_queue); - - WARN(cpumask_weight(nvmeq->cpu_mask) != cpus_per_queue, - "nvme%d qid:%d mis-matched queue-to-cpu assignment\n", - dev->instance, i); - - irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, - nvmeq->cpu_mask); - cpumask_andnot(unassigned_cpus, unassigned_cpus, - nvmeq->cpu_mask); - cpu = cpumask_next(cpu, unassigned_cpus); - if (remainder && !--remainder) - cpus_per_queue++; - unlock_nvmeq(nvmeq); - } - WARN(cpumask_weight(unassigned_cpus), "nvme%d unassigned online cpus\n", - dev->instance); - i = 0; - cpumask_andnot(unassigned_cpus, cpu_possible_mask, cpu_online_mask); - for_each_cpu(cpu, unassigned_cpus) - *per_cpu_ptr(dev->io_queue, cpu) = (i++ % queues) + 1; - free_cpumask_var(unassigned_cpus); -} - static int set_queue_count(struct nvme_dev *dev, int count) { int status; @@ -2243,33 +1968,9 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); } -static void nvme_cpu_workfn(struct work_struct *work) -{ - struct nvme_dev *dev = container_of(work, struct nvme_dev, cpu_work); - if (dev->initialized) - nvme_assign_io_queues(dev); -} - -static int nvme_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - struct nvme_dev *dev; - - switch (action) { - case CPU_ONLINE: - case CPU_DEAD: - spin_lock(&dev_list_lock); - list_for_each_entry(dev, &dev_list, node) - schedule_work(&dev->cpu_work); - spin_unlock(&dev_list_lock); - break; - } - return NOTIFY_OK; -} - static int nvme_setup_io_queues(struct nvme_dev *dev) { - struct nvme_queue *adminq = raw_nvmeq(dev, 0); + struct nvme_queue *adminq = dev->queues[0]; struct pci_dev *pdev = dev->pci_dev; int result, i, vecs, nr_io_queues, size; @@ -2321,14 +2022,12 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) dev->max_qid = nr_io_queues; result = queue_request_irq(dev, adminq, adminq->irqname); - if (result) { - adminq->q_suspended = 1; + if (result) goto free_queues; - } /* Free previously allocated queues that are no longer usable */ nvme_free_queues(dev, nr_io_queues + 1); - nvme_assign_io_queues(dev); + nvme_create_io_queues(dev); return 0; @@ -2378,8 +2077,30 @@ static int nvme_dev_add(struct nvme_dev *dev) if (ctrl->mdts) dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); if ((pdev->vendor == PCI_VENDOR_ID_INTEL) && - (pdev->device == 0x0953) && ctrl->vs[3]) + (pdev->device == 0x0953) && ctrl->vs[3]) { + unsigned int max_hw_sectors; + dev->stripe_size = 1 << (ctrl->vs[3] + shift); + max_hw_sectors = dev->stripe_size >> (shift - 9); + if (dev->max_hw_sectors) { + dev->max_hw_sectors = min(max_hw_sectors, + dev->max_hw_sectors); + } else + dev->max_hw_sectors = max_hw_sectors; + } + + dev->tagset.ops = &nvme_mq_ops; + dev->tagset.nr_hw_queues = dev->online_queues - 1; + dev->tagset.timeout = NVME_IO_TIMEOUT; + dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev); + dev->tagset.queue_depth = + min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; + dev->tagset.cmd_size = sizeof(struct nvme_cmd_info); + dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; + dev->tagset.driver_data = dev; + + if (blk_mq_alloc_tag_set(&dev->tagset)) + goto out; id_ns = mem; for (i = 1; i <= nn; i++) { @@ -2529,7 +2250,8 @@ static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode, c.delete_queue.qid = cpu_to_le16(nvmeq->qid); init_kthread_work(&nvmeq->cmdinfo.work, fn); - return nvme_submit_admin_cmd_async(nvmeq->dev, &c, &nvmeq->cmdinfo); + return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo, + ADMIN_TIMEOUT); } static void nvme_del_cq_work_handler(struct kthread_work *work) @@ -2592,7 +2314,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev) atomic_set(&dq.refcount, 0); dq.worker = &worker; for (i = dev->queue_count - 1; i > 0; i--) { - struct nvme_queue *nvmeq = raw_nvmeq(dev, i); + struct nvme_queue *nvmeq = dev->queues[i]; if (nvme_suspend_queue(nvmeq)) continue; @@ -2637,7 +2359,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) csts = readl(&dev->bar->csts); if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { for (i = dev->queue_count - 1; i >= 0; i--) { - struct nvme_queue *nvmeq = raw_nvmeq(dev, i); + struct nvme_queue *nvmeq = dev->queues[i]; nvme_suspend_queue(nvmeq); nvme_clear_queue(nvmeq); } @@ -2649,6 +2371,12 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) nvme_dev_unmap(dev); } +static void nvme_dev_remove_admin(struct nvme_dev *dev) +{ + if (dev->admin_q && !blk_queue_dying(dev->admin_q)) + blk_cleanup_queue(dev->admin_q); +} + static void nvme_dev_remove(struct nvme_dev *dev) { struct nvme_ns *ns; @@ -2736,7 +2464,7 @@ static void nvme_free_dev(struct kref *kref) pci_dev_put(dev->pci_dev); nvme_free_namespaces(dev); - free_percpu(dev->io_queue); + blk_mq_free_tag_set(&dev->tagset); kfree(dev->queues); kfree(dev->entry); kfree(dev); @@ -2761,11 +2489,16 @@ static int nvme_dev_release(struct inode *inode, struct file *f) static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) { struct nvme_dev *dev = f->private_data; + struct nvme_ns *ns; + switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(dev, (void __user *)arg, false); + return nvme_user_cmd(dev, NULL, (void __user *)arg); case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(dev, (void __user *)arg, true); + if (list_empty(&dev->namespaces)) + return -ENOTTY; + ns = list_first_entry(&dev->namespaces, struct nvme_ns, list); + return nvme_user_cmd(dev, ns, (void __user *)arg); default: return -ENOTTY; } @@ -2779,6 +2512,22 @@ static const struct file_operations nvme_dev_fops = { .compat_ioctl = nvme_dev_ioctl, }; +static void nvme_set_irq_hints(struct nvme_dev *dev) +{ + struct nvme_queue *nvmeq; + int i; + + for (i = 0; i < dev->online_queues; i++) { + nvmeq = dev->queues[i]; + + if (!nvmeq->hctx) + continue; + + irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, + nvmeq->hctx->cpumask); + } +} + static int nvme_dev_start(struct nvme_dev *dev) { int result; @@ -2810,12 +2559,15 @@ static int nvme_dev_start(struct nvme_dev *dev) result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; goto disable; } - nvme_init_queue(raw_nvmeq(dev, 0), 0); + + nvme_init_queue(dev->queues[0], 0); result = nvme_setup_io_queues(dev); if (result) goto disable; + nvme_set_irq_hints(dev); + return result; disable: @@ -2866,7 +2618,7 @@ static void nvme_dev_reset(struct nvme_dev *dev) { nvme_dev_shutdown(dev); if (nvme_dev_resume(dev)) { - dev_err(&dev->pci_dev->dev, "Device failed to resume\n"); + dev_warn(&dev->pci_dev->dev, "Device failed to resume\n"); kref_get(&dev->kref); if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", dev->instance))) { @@ -2891,28 +2643,28 @@ static void nvme_reset_workfn(struct work_struct *work) static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - int result = -ENOMEM; + int node, result = -ENOMEM; struct nvme_dev *dev; - dev = kzalloc(sizeof(*dev), GFP_KERNEL); + node = dev_to_node(&pdev->dev); + if (node == NUMA_NO_NODE) + set_dev_node(&pdev->dev, 0); + + dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) return -ENOMEM; - dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry), - GFP_KERNEL); + dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry), + GFP_KERNEL, node); if (!dev->entry) goto free; - dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *), - GFP_KERNEL); + dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *), + GFP_KERNEL, node); if (!dev->queues) goto free; - dev->io_queue = alloc_percpu(unsigned short); - if (!dev->io_queue) - goto free; INIT_LIST_HEAD(&dev->namespaces); dev->reset_workfn = nvme_reset_failed_dev; INIT_WORK(&dev->reset_work, nvme_reset_workfn); - INIT_WORK(&dev->cpu_work, nvme_cpu_workfn); dev->pci_dev = pci_dev_get(pdev); pci_set_drvdata(pdev, dev); result = nvme_set_instance(dev); @@ -2942,11 +2694,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto remove; + nvme_set_irq_hints(dev); + dev->initialized = 1; return 0; remove: nvme_dev_remove(dev); + nvme_dev_remove_admin(dev); nvme_free_namespaces(dev); shutdown: nvme_dev_shutdown(dev); @@ -2958,7 +2713,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) put_pci: pci_dev_put(dev->pci_dev); free: - free_percpu(dev->io_queue); kfree(dev->queues); kfree(dev->entry); kfree(dev); @@ -2991,11 +2745,12 @@ static void nvme_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); flush_work(&dev->reset_work); - flush_work(&dev->cpu_work); misc_deregister(&dev->miscdev); + nvme_dev_remove(dev); nvme_dev_shutdown(dev); + nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); - nvme_dev_remove(dev); + nvme_free_admin_tags(dev); nvme_release_instance(dev); nvme_release_prp_pools(dev); kref_put(&dev->kref, nvme_free_dev); @@ -3079,18 +2834,11 @@ static int __init nvme_init(void) else if (result > 0) nvme_major = result; - nvme_nb.notifier_call = &nvme_cpu_notify; - result = register_hotcpu_notifier(&nvme_nb); - if (result) - goto unregister_blkdev; - result = pci_register_driver(&nvme_driver); if (result) - goto unregister_hotcpu; + goto unregister_blkdev; return 0; - unregister_hotcpu: - unregister_hotcpu_notifier(&nvme_nb); unregister_blkdev: unregister_blkdev(nvme_major, "nvme"); kill_workq: -- cgit From 9f173b33843552c48e0136e02e7362c8229c8e57 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 5 Nov 2014 23:39:09 +0300 Subject: NVMe: blk_mq_alloc_request() returns error pointers We recently converted this to blk_mq but the error checks have to be updated to check for IS_ERR() instead of NULL. Fixes: a4aea5623d4a ('NVMe: Convert to blk-mq') Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 39050a3d10fd..f7a87173e3f0 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -826,8 +826,8 @@ static int nvme_submit_async_admin_req(struct nvme_dev *dev) struct request *req; req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); cmd_info = blk_mq_rq_to_pdu(req); nvme_set_info(cmd_info, req, async_req_completion); @@ -848,8 +848,8 @@ static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, struct nvme_cmd_info *cmd_rq; req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); req->timeout = timeout; cmd_rq = blk_mq_rq_to_pdu(req); @@ -1026,7 +1026,7 @@ static void nvme_abort_req(struct request *req) abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, false); - if (!abort_req) + if (IS_ERR(abort_req)) return; abort_cmd = blk_mq_rq_to_pdu(abort_req); @@ -1884,7 +1884,7 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, if (!ns) return NULL; ns->queue = blk_mq_init_queue(&dev->tagset); - if (!ns->queue) + if (IS_ERR(ns->queue)) goto out_free_ns; queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); -- cgit From a64e6bb4db601b561f2d3a80dfc554ddfe73db74 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Wed, 5 Nov 2014 18:47:07 +0800 Subject: NVMe: __nvme_submit_admin_cmd() can be static drivers/block/nvme-core.c:865:5: sparse: symbol '__nvme_submit_admin_cmd' was not declared. Should it be static? Signed-off-by: Fengguang Wu Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index f7a87173e3f0..8393f91b2721 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -862,7 +862,7 @@ static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, return nvme_submit_cmd(nvmeq, cmd); } -int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, +static int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, u32 *result, unsigned timeout) { int res; -- cgit From 9d135bb8c2a0d2e54b84ebc1b7d41852614fead8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 17 Nov 2014 10:43:42 -0700 Subject: NVMe: replace blk_put_request() with blk_mq_free_request() No point in using blk_put_request(), since we know we are blk-mq. This only makes sense in core code where we could be dealing with either legacy or blk-mq drivers. Additionally, use blk_mq_free_hctx_request() for the request completion fast path, where we already know the mapping from request to hardware queue. Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 8393f91b2721..bbac17f29fe7 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -262,7 +262,7 @@ static void async_req_completion(struct nvme_queue *nvmeq, void *ctx, dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result); - blk_put_request(req); + blk_mq_free_hctx_request(nvmeq->hctx, req); } static void abort_completion(struct nvme_queue *nvmeq, void *ctx, @@ -273,7 +273,7 @@ static void abort_completion(struct nvme_queue *nvmeq, void *ctx, u16 status = le16_to_cpup(&cqe->status) >> 1; u32 result = le32_to_cpup(&cqe->result); - blk_put_request(req); + blk_mq_free_hctx_request(nvmeq->hctx, req); dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); ++nvmeq->dev->abort_limit; @@ -286,7 +286,7 @@ static void async_completion(struct nvme_queue *nvmeq, void *ctx, cmdinfo->result = le32_to_cpup(&cqe->result); cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; queue_kthread_work(cmdinfo->worker, &cmdinfo->work); - blk_put_request(cmdinfo->req); + blk_mq_free_hctx_request(nvmeq->hctx, cmdinfo->req); } static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq, @@ -872,7 +872,7 @@ static int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cm if (!req) return -ENOMEM; res = nvme_submit_sync_cmd(req, cmd, result, timeout); - blk_put_request(req); + blk_mq_free_request(req); return res; } @@ -893,7 +893,7 @@ int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_ns *ns, if (!req) return -ENOMEM; res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT); - blk_put_request(req); + blk_mq_free_request(req); return res; } @@ -1047,7 +1047,7 @@ static void nvme_abort_req(struct request *req) dev_warn(nvmeq->q_dmadev, "Could not abort I/O %d QID %d", req->tag, nvmeq->qid); - blk_put_request(req); + blk_mq_free_request(req); } } @@ -1688,7 +1688,7 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, else { status = nvme_submit_sync_cmd(req, &c, &cmd.result, timeout); - blk_put_request(req); + blk_mq_free_request(req); } } else status = __nvme_submit_admin_cmd(dev, &c, &cmd.result, timeout); -- cgit From 6dcc0cf6cb3120cedc0d4c12171894f3d6415981 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 18 Nov 2014 08:21:18 -0700 Subject: NVMe: nvme_submit_async_admin_req() must use atomic rq allocation We are called for async event notification issues, and the nvmeq lock is already held. If we fail the request allocation, we'll just retry next time. Reported-by: Julia Lawall Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index bbac17f29fe7..fb4b205317c6 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -825,7 +825,7 @@ static int nvme_submit_async_admin_req(struct nvme_dev *dev) struct nvme_cmd_info *cmd_info; struct request *req; - req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); + req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, false); if (IS_ERR(req)) return PTR_ERR(req); -- cgit From 139768895309c6c1d6913e909e9c9422f81a1640 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 18 Nov 2014 08:45:31 -0700 Subject: NVMe: enable IO stats by default Before the blk-mq conversion they were on by default, we should not change behavior there. Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index fb4b205317c6..677d7b9ff454 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1889,7 +1889,6 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, ns->queue); - queue_flag_clear_unlocked(QUEUE_FLAG_IO_STAT, ns->queue); ns->dev = dev; ns->queue->queuedata = ns; -- cgit From e32efbfc35c1b06f1bfe3e6d737acdd14d27baed Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 14 Nov 2014 09:49:26 -0700 Subject: NVMe: make setup work for devices that don't do INTx The setup/probe part currently relies on INTx being there and working, that's not always the case. For devices that don't advertise INTx, enable a single MSIx vector early on and disable it again before we ask for our full range of queue vecs. Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 677d7b9ff454..9310fe51382e 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1998,6 +1998,13 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) /* Deregister the admin queue's interrupt */ free_irq(dev->entry[0].vector, adminq); + /* + * If we enable msix early due to not intx, disable it again before + * setting up the full range we need. + */ + if (!pdev->irq) + pci_disable_msix(pdev); + for (i = 0; i < nr_io_queues; i++) dev->entry[i].entry = i; vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues); @@ -2150,10 +2157,22 @@ static int nvme_dev_map(struct nvme_dev *dev) dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); if (!dev->bar) goto disable; + if (readl(&dev->bar->csts) == -1) { result = -ENODEV; goto unmap; } + + /* + * Some devices don't advertse INTx interrupts, pre-enable a single + * MSIX vec for setup. We'll adjust this later. + */ + if (!pdev->irq) { + result = pci_enable_msix(pdev, dev->entry, 1); + if (result < 0) + goto unmap; + } + cap = readq(&dev->bar->cap); dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); dev->db_stride = 1 << NVME_CAP_STRIDE(cap); -- cgit From 2c30540b38d683d4c7f06d13a451f67d4362d7b1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 14 Nov 2014 09:47:32 -0700 Subject: NVMe: add ->exit_hctx() hook If we do teardown and setup of the queue and block related parts of the driver, then we should clear nvmeq->hctx once we kill the hardware queue. Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 9310fe51382e..ba278ae00705 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -171,6 +171,13 @@ static int nvme_admin_init_request(void *data, struct request *req, return 0; } +static void nvme_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + struct nvme_queue *nvmeq = hctx->driver_data; + + nvmeq->hctx = NULL; +} + static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { @@ -1335,6 +1342,7 @@ static struct blk_mq_ops nvme_mq_admin_ops = { .queue_rq = nvme_admin_queue_rq, .map_queue = blk_mq_map_queue, .init_hctx = nvme_admin_init_hctx, + .exit_hctx = nvme_exit_hctx, .init_request = nvme_admin_init_request, .timeout = nvme_timeout, }; @@ -1343,6 +1351,7 @@ static struct blk_mq_ops nvme_mq_ops = { .queue_rq = nvme_queue_rq, .map_queue = blk_mq_map_queue, .init_hctx = nvme_init_hctx, + .exit_hctx = nvme_exit_hctx, .init_request = nvme_init_request, .timeout = nvme_timeout, }; -- cgit From be7837e89d610046ae8dd28dc504df09261d9f91 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 14 Nov 2014 09:50:19 -0700 Subject: NVMe: fail pci initialization if the device doesn't have any BARs The PCI init of NVMe doesn't check for valid bars before proceeding to map and use BAR 0. If the device is hosed (or firmware is), then we should catch this case and give up early. This fixes a: [ 1662.035778] WARNING: CPU: 0 PID: 4 at arch/x86/mm/ioremap.c:63 __ioremap_check_ram+0xa7/0xc0() and later badness on such a device. Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index ba278ae00705..c1541652b7e0 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2156,6 +2156,9 @@ static int nvme_dev_map(struct nvme_dev *dev) dev->entry[0].vector = pdev->irq; pci_set_master(pdev); bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (!bars) + goto disable_pci; + if (pci_request_selected_regions(pdev, bars, "nvme")) goto disable_pci; -- cgit From c78b47136f7adebcf953f541c7d9c4c745a54e3f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 21 Nov 2014 15:16:32 -0700 Subject: NVMe: Update module version major number It's already near impossible to tell what bits someone is running based on a 'modinfo nvme', and I don't want to try guessing if someone is running blk-mq or bio-based. Let's make it obvious with the module version that the blk-mq conversion is a major change. Future bio-based versions can increment to 0.10 in a fork if revisions occur. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index c1541652b7e0..c7ea07ce9372 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2888,6 +2888,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox "); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.9"); +MODULE_VERSION("1.0"); module_init(nvme_init); module_exit(nvme_exit); -- cgit From 9af8785a38d4528d6675247f873b0f1ae29f3be8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 3 Dec 2014 17:07:13 -0700 Subject: NVMe: Fix command setup on IO retry On retry, the req->special is pointing to an already setup IOD, but we still need to setup the command context and callback, otherwise you'll see false twice completed errors and leak requests. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index c7ea07ce9372..bcbdf832b1b0 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -640,8 +640,6 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, iod->private = req; req->special = iod; - nvme_set_info(cmd, iod, req_completion); - if (req->cmd_flags & REQ_DISCARD) { void *range; /* @@ -677,6 +675,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(req); submit_iod: + nvme_set_info(cmd, iod, req_completion); spin_lock_irq(&nvmeq->q_lock); if (req->cmd_flags & REQ_DISCARD) nvme_submit_discard(nvmeq, ns, req, iod); -- cgit From c87fd5407e1aa1cfc6b393b03bf67010cf643dbe Mon Sep 17 00:00:00 2001 From: Sam Bradshaw Date: Wed, 10 Dec 2014 13:00:31 -0700 Subject: NVMe: fix freeing of wrong request in abort path We allocate 'abort_req', but free 'req' in case of an error submitting the IO. Signed-off-by: Sam Bradshaw Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index bcbdf832b1b0..cf9b8a8a24d3 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1053,7 +1053,7 @@ static void nvme_abort_req(struct request *req) dev_warn(nvmeq->q_dmadev, "Could not abort I/O %d QID %d", req->tag, nvmeq->qid); - blk_mq_free_request(req); + blk_mq_free_request(abort_req); } } -- cgit From 97fe383222cb8a01fb67a8823498ad2edcc20b35 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Dec 2014 13:02:44 -0700 Subject: NVMe: fix error return checking from blk_mq_alloc_request() We return an error pointer or the request, not NULL. Half the call paths got it right, the others didn't. Fix those up. Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index cf9b8a8a24d3..2cc2cee7a367 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -875,8 +875,8 @@ static int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cm struct request *req; req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); res = nvme_submit_sync_cmd(req, cmd, result, timeout); blk_mq_free_request(req); return res; @@ -896,8 +896,8 @@ int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_ns *ns, req = blk_mq_alloc_request(ns->queue, WRITE, (GFP_KERNEL|__GFP_WAIT), false); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT); blk_mq_free_request(req); return res; @@ -1691,8 +1691,8 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, req = blk_mq_alloc_request(ns->queue, WRITE, (GFP_KERNEL|__GFP_WAIT), false); - if (!req) - status = -ENOMEM; + if (IS_ERR(req)) + status = PTR_ERR(req); else { status = nvme_submit_sync_cmd(req, &c, &cmd.result, timeout); -- cgit From 285dffc9101244ac65c29672a1fb3fe614b52238 Mon Sep 17 00:00:00 2001 From: Indraneel M Date: Thu, 11 Dec 2014 08:24:18 -0700 Subject: NVMe: Fix FS mount issue (hot-remove followed by hot-add) After Hot-remove of a device with a mounted partition, when the device is hot-added again, the new node reappears as nvme0n1. Mounting this new node fails with the error: mount: mount /dev/nvme0n1p1 on /mnt failed: File exists. The old nodes's FS entries still exist and the kernel can't re-create procfs and sysfs entries for the new node with the same name. The patch fixes this issue. Acked-by: Keith Busch Signed-off-by: Indraneel M Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 2cc2cee7a367..95f2310255ce 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2493,6 +2493,7 @@ static void nvme_free_dev(struct kref *kref) pci_dev_put(dev->pci_dev); nvme_free_namespaces(dev); + nvme_release_instance(dev); blk_mq_free_tag_set(&dev->tagset); kfree(dev->queues); kfree(dev->entry); @@ -2780,7 +2781,6 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); nvme_free_admin_tags(dev); - nvme_release_instance(dev); nvme_release_prp_pools(dev); kref_put(&dev->kref, nvme_free_dev); } -- cgit From fe54303ee2be293c1c5c7a53a152453789cabc2f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 11 Dec 2014 13:58:39 -0700 Subject: NVMe: fix retry/error logic in nvme_queue_rq() The logic around retrying and erroring IO in nvme_queue_rq() is broken in a few ways: - If we fail allocating dma memory for a discard, we return retry. We have the 'iod' stored in ->special, but we free the 'iod'. - For a normal request, if we fail dma mapping of setting up prps, we have the same iod situation. Additionally, we haven't set the callback for the request yet, so we also potentially leak IOMMU resources. Get rid of the ->special 'iod' store. The retry is uncommon enough that it's not worth optimizing for or holding on to resources to attempt to speed it up. Additionally, it's usually best practice to free any request related resources when doing retries. Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 95f2310255ce..e92bdf4c68fc 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -621,24 +621,15 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); struct nvme_iod *iod; int psegs = req->nr_phys_segments; - int result = BLK_MQ_RQ_QUEUE_BUSY; enum dma_data_direction dma_dir; unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) : sizeof(struct nvme_dsm_range); - /* - * Requeued IO has already been prepped - */ - iod = req->special; - if (iod) - goto submit_iod; - iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC); if (!iod) - return result; + return BLK_MQ_RQ_QUEUE_BUSY; iod->private = req; - req->special = iod; if (req->cmd_flags & REQ_DISCARD) { void *range; @@ -651,7 +642,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, GFP_ATOMIC, &iod->first_dma); if (!range) - goto finish_cmd; + goto retry_cmd; iod_list(iod)[0] = (__le64 *)range; iod->npages = 0; } else if (psegs) { @@ -659,22 +650,22 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, sg_init_table(iod->sg, psegs); iod->nents = blk_rq_map_sg(req->q, req, iod->sg); - if (!iod->nents) { - result = BLK_MQ_RQ_QUEUE_ERROR; - goto finish_cmd; - } + if (!iod->nents) + goto error_cmd; if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir)) - goto finish_cmd; + goto retry_cmd; - if (blk_rq_bytes(req) != nvme_setup_prps(nvmeq->dev, iod, - blk_rq_bytes(req), GFP_ATOMIC)) - goto finish_cmd; + if (blk_rq_bytes(req) != + nvme_setup_prps(nvmeq->dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) { + dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, + iod->nents, dma_dir); + goto retry_cmd; + } } blk_mq_start_request(req); - submit_iod: nvme_set_info(cmd, iod, req_completion); spin_lock_irq(&nvmeq->q_lock); if (req->cmd_flags & REQ_DISCARD) @@ -688,10 +679,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, spin_unlock_irq(&nvmeq->q_lock); return BLK_MQ_RQ_QUEUE_OK; - finish_cmd: - nvme_finish_cmd(nvmeq, req->tag, NULL); + error_cmd: nvme_free_iod(nvmeq->dev, iod); - return result; + return BLK_MQ_RQ_QUEUE_ERROR; + retry_cmd: + nvme_free_iod(nvmeq->dev, iod); + return BLK_MQ_RQ_QUEUE_BUSY; } static int nvme_process_cq(struct nvme_queue *nvmeq) -- cgit From 849c6e7746e4f6317ace6aa7d2fcdcd844e99ddb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 12 Dec 2014 08:53:40 -0700 Subject: NVMe: fix race condition in nvme_submit_sync_cmd() If we have a race between the schedule timing out and the command completing, we could have the task issuing the command exit nvme_submit_sync_cmd() while the irq is running sync_completion(). If that happens, we could be corrupting memory, since the stack that held 'cmdinfo' is no longer valid. Fix this by always calling nvme_abort_cmd_info(). Once that call completes, we know that we have either run sync_completion() if the completion came in, or that we will never run it since we now have special_completion() as the command callback handler. Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'drivers/block/nvme-core.c') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index e92bdf4c68fc..b1d5d8797315 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -804,12 +804,19 @@ static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd, nvme_finish_cmd(nvmeq, req->tag, NULL); set_current_state(TASK_RUNNING); } - schedule_timeout(timeout); + ret = schedule_timeout(timeout); - if (cmdinfo.status == -EINTR) { - nvme_abort_cmd_info(nvmeq, blk_mq_rq_to_pdu(req)); + /* + * Ensure that sync_completion has either run, or that it will + * never run. + */ + nvme_abort_cmd_info(nvmeq, blk_mq_rq_to_pdu(req)); + + /* + * We never got the completion + */ + if (cmdinfo.status == -EINTR) return -EINTR; - } if (result) *result = cmdinfo.result; -- cgit