From dc52d783d68c07a68743d42ef2f9a6f9a6d80fb1 Mon Sep 17 00:00:00 2001
From: Annie Li
Date: Thu, 24 Aug 2017 17:25:59 -0400
Subject: xen-blkback: stop blkback thread of every queue in xen_blkif_disconnect
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In xen_blkif_disconnect, before checking inflight I/O, the following
code stops the blkback thread:

	if (ring->xenblkd) {
		kthread_stop(ring->xenblkd);
		wake_up(&ring->shutdown_wq);
	}

If there is inflight I/O in any non-last queue, blkback returns -EBUSY
directly, and the code above is never reached for the remaining queues,
so their threads are not stopped. When removing a vbd device under
heavy disk I/O load, queues with inflight I/O keep their blkback
threads running even though the corresponding vbd device or guest is
gone. This can cause problems: for example, if the backend device type
is file, loop devices and blkback threads linger forever after the
guest is destroyed, and the backing repositories cannot be unmounted
without rebooting dom0.

This patch gives the thread of every queue a chance to be stopped.
Without it, only the threads of the queues up to and including the
first busy one are stopped, while the blkback threads of the remaining
queues keep running. So stop all threads properly and return -EBUSY if
any queue has inflight I/O.

Signed-off-by: Annie Li
Reviewed-by: Herbert van den Bergh
Reviewed-by: Bhavesh Davda
Reviewed-by: Adnan Misherfi
Acked-by: Roger Pau Monné
Signed-off-by: Konrad Rzeszutek Wilk
---
 drivers/block/xen-blkback/xenbus.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 792da683e70d..2adb8599be93 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -244,6 +244,7 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif)
 {
 	struct pending_req *req, *n;
 	unsigned int j, r;
+	bool busy = false;
 
 	for (r = 0; r < blkif->nr_rings; r++) {
 		struct xen_blkif_ring *ring = &blkif->rings[r];
@@ -261,8 +262,10 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif)
 		 * don't have any discard_io or other_io requests. So, checking
 		 * for inflight IO is enough.
 		 */
-		if (atomic_read(&ring->inflight) > 0)
-			return -EBUSY;
+		if (atomic_read(&ring->inflight) > 0) {
+			busy = true;
+			continue;
+		}
 
 		if (ring->irq) {
 			unbind_from_irqhandler(ring->irq, ring);
@@ -300,6 +303,9 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif)
 		WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
 		ring->active = false;
 	}
+	if (busy)
+		return -EBUSY;
+
 	blkif->nr_ring_pages = 0;
 	/*
 	 * blkif->rings was allocated in connect_ring, so we should free it in
--
cgit

From b925a2dc165e5ec2330ca1256704faef8ed96913 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy
Date: Mon, 28 Aug 2017 12:52:27 +0300
Subject: nvme-rdma: default MR page size to 4k

Due to the various page sizes in the system (IOMMU/device/kernel), we
set the fabrics controller page size to 4k and the block layer
boundaries accordingly. On architectures that use a different kernel
page size we'll have a mismatch with the MR page size that may cause a
mapping error. Update the MR page size to correspond to the core ctrl
settings.
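To make the size arithmetic concrete, here is a stand-alone sketch
(illustrative only, not part of the patch; the max_fr_pages value and
the 64k PAGE_SHIFT are assumed for the example) showing how the old
PAGE_SHIFT-based formula inflates max_hw_sectors on a large-page
kernel, while the SZ_4K-based formula matches the 4k ctrl page size:

	#include <stdio.h>

	#define SECTOR_SHIFT	9	/* 512-byte block layer sectors */
	#define SZ_4K_SHIFT	12	/* ilog2(SZ_4K) */

	int main(void)
	{
		unsigned int max_fr_pages = 256;	/* assumed MR page capacity */
		unsigned int page_shift = 16;		/* PAGE_SHIFT on a 64k-page kernel */

		/* Old formula: scales with the kernel page size. */
		unsigned int old_limit = (max_fr_pages - 1) << (page_shift - SECTOR_SHIFT);
		/* Fixed formula: tied to the 4k controller page size. */
		unsigned int new_limit = (max_fr_pages - 1) << (SZ_4K_SHIFT - SECTOR_SHIFT);

		/* Prints old=32640 new=2040: the old limit is 16x too large
		 * for an MR built from 4k pages. */
		printf("old=%u new=%u\n", old_limit, new_limit);
		return 0;
	}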
Signed-off-by: Max Gurtovoy
Reviewed-by: Sagi Grimberg
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/rdma.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index da04df1af231..a03299d77922 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -920,7 +920,11 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
 	struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
 	int nr;
 
-	nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, PAGE_SIZE);
+	/*
+	 * Align the MR to a 4K page size to match the ctrl page size and
+	 * the block virtual boundary.
+	 */
+	nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K);
 	if (nr < count) {
 		if (nr < 0)
 			return nr;
@@ -1583,7 +1587,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl)
 		goto out_cleanup_queue;
 
 	ctrl->ctrl.max_hw_sectors =
-		(ctrl->max_fr_pages - 1) << (PAGE_SHIFT - 9);
+		(ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9);
 
 	error = nvme_init_identify(&ctrl->ctrl);
 	if (error)
--
cgit

From 4033f35d174af4804a79fd5731d9e6be976f9f28 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 28 Aug 2017 10:47:18 +0200
Subject: nvme-pci: use dma memory for the host memory buffer descriptors

The NVMe 1.3 specification says in section 5.21.1.13:

"After a successful completion of a Set Features enabling the host
memory buffer, the host shall not write to the associated host memory
region, buffer size, or descriptor list until the host memory buffer
has been disabled."

While this doesn't state that the descriptor list must remain
accessible to the device, it certainly implies it must remain readable
by the device. So switch to a dma coherent allocation for the
descriptor list just to be safe - it's not like its cost matters
compared to the actual memory buffers.
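The shape of the change, as a minimal sketch (names taken from the
patch; declarations and error handling trimmed): a streaming
dma_map_single() mapping is only guaranteed valid between map and
unmap, while a coherent allocation stays device-readable until it is
explicitly freed, which is what the spec wording requires here:

	/* Before: the descriptor list was mapped only around the Set
	 * Features command, then unmapped - but the device may keep
	 * reading it for as long as the HMB is enabled. */
	descs = kcalloc(max_entries, sizeof(*descs), GFP_KERNEL);
	dma_addr = dma_map_single(dev->dev, descs, len, DMA_TO_DEVICE);
	/* ... nvme_set_host_mem() ... */
	dma_unmap_single(dev->dev, dma_addr, len, DMA_TO_DEVICE);

	/* After: the list lives in coherent DMA memory for the whole
	 * lifetime of the HMB and is released in nvme_free_host_mem(). */
	descs = dma_zalloc_coherent(dev->dev, max_entries * sizeof(*descs),
			&descs_dma, GFP_KERNEL);
	/* ... nvme_set_host_mem(); keep descs/descs_dma until teardown ... */
	dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
			descs_dma);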
Signed-off-by: Christoph Hellwig
Reviewed-by: Keith Busch
Reviewed-by: Sagi Grimberg
Reviewed-by: Johannes Thumshirn
Fixes: 87ad72a59a38 ("nvme-pci: implement host memory buffer support")
---
 drivers/nvme/host/pci.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 925467b31a33..ea892e732268 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -109,6 +109,7 @@ struct nvme_dev {
 	/* host memory buffer support: */
 	u64 host_mem_size;
 	u32 nr_host_mem_descs;
+	dma_addr_t host_mem_descs_dma;
 	struct nvme_host_mem_buf_desc *host_mem_descs;
 	void **host_mem_desc_bufs;
 };
@@ -1565,16 +1566,10 @@ static inline void nvme_release_cmb(struct nvme_dev *dev)
 
 static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
 {
-	size_t len = dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs);
+	u64 dma_addr = dev->host_mem_descs_dma;
 	struct nvme_command c;
-	u64 dma_addr;
 	int ret;
 
-	dma_addr = dma_map_single(dev->dev, dev->host_mem_descs, len,
-			DMA_TO_DEVICE);
-	if (dma_mapping_error(dev->dev, dma_addr))
-		return -ENOMEM;
-
 	memset(&c, 0, sizeof(c));
 	c.features.opcode = nvme_admin_set_features;
 	c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
@@ -1591,7 +1586,6 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
 			"failed to set host mem (err %d, flags %#x).\n",
 			ret, bits);
 	}
-	dma_unmap_single(dev->dev, dma_addr, len, DMA_TO_DEVICE);
 	return ret;
 }
 
@@ -1609,7 +1603,9 @@ static void nvme_free_host_mem(struct nvme_dev *dev)
 	kfree(dev->host_mem_desc_bufs);
 	dev->host_mem_desc_bufs = NULL;
 
-	kfree(dev->host_mem_descs);
+	dma_free_coherent(dev->dev,
+			dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
+			dev->host_mem_descs, dev->host_mem_descs_dma);
 	dev->host_mem_descs = NULL;
 }
 
@@ -1617,6 +1613,7 @@ static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
 {
 	struct nvme_host_mem_buf_desc *descs;
 	u32 chunk_size, max_entries, len;
+	dma_addr_t descs_dma;
 	int i = 0;
 	void **bufs;
 	u64 size = 0, tmp;
@@ -1627,7 +1624,8 @@ retry:
 	tmp = (preferred + chunk_size - 1);
 	do_div(tmp, chunk_size);
 	max_entries = tmp;
-	descs = kcalloc(max_entries, sizeof(*descs), GFP_KERNEL);
+	descs = dma_zalloc_coherent(dev->dev, max_entries * sizeof(*descs),
+			&descs_dma, GFP_KERNEL);
 	if (!descs)
 		goto out;
 
@@ -1661,6 +1659,7 @@ retry:
 	dev->nr_host_mem_descs = i;
 	dev->host_mem_size = size;
 	dev->host_mem_descs = descs;
+	dev->host_mem_descs_dma = descs_dma;
 	dev->host_mem_desc_bufs = bufs;
 	return 0;
 
@@ -1674,7 +1673,8 @@ out_free_bufs:
 	kfree(bufs);
 
 out_free_descs:
-	kfree(descs);
+	dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
+			descs_dma);
 out:
 	/* try a smaller chunk size if we failed early */
 	if (chunk_size >= PAGE_SIZE * 2 && (i == 0 || size < min)) {
--
cgit

From 223694b9ae8bfba99f3528d49d07a740af6ff95a Mon Sep 17 00:00:00 2001
From: Changpeng Liu
Date: Thu, 31 Aug 2017 11:22:49 +0800
Subject: nvme: fix the definition of the doorbell buffer config support bit

The NVMe 1.3 specification defines the Optional Admin Command Support
feature flags: when bit 8 is set to '1', the controller supports the
Doorbell Buffer Config command. Bit 7 is used for the Virtualization
Management command.
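For context, a hedged sketch of how a driver consumes this flag (the
helper below is illustrative, not the driver's actual code; oacs is
assumed to be the host-endian Optional Admin Command Support word from
Identify Controller). With the old 1 << 7 definition this test matched
the Virtualization Management bit instead:

	#include <linux/nvme.h>
	#include <linux/types.h>

	/* True only when the controller advertises Doorbell Buffer Config
	 * support, i.e. OACS bit 8 per NVMe 1.3. */
	static bool nvme_supports_dbbuf(u16 oacs)
	{
		return oacs & NVME_CTRL_OACS_DBBUF_SUPP;
	}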
Signed-off-by: Changpeng Liu
Reviewed-by: Sagi Grimberg
Reviewed-by: Max Gurtovoy
Reviewed-by: Johannes Thumshirn
Signed-off-by: Christoph Hellwig
Fixes: f9f38e33 ("nvme: improve performance for virtual NVMe devices")
Cc: stable@vger.kernel.org
---
 include/linux/nvme.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 25d8225dbd04..8efff888bd9b 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -254,7 +254,7 @@ enum {
 	NVME_CTRL_VWC_PRESENT	= 1 << 0,
 	NVME_CTRL_OACS_SEC_SUPP = 1 << 0,
 	NVME_CTRL_OACS_DIRECTIVES = 1 << 5,
-	NVME_CTRL_OACS_DBBUF_SUPP = 1 << 7,
+	NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8,
 };
 
 struct nvme_lbaf {
--
cgit