diff options
Diffstat (limited to 'drivers/nvme/host/pci.c')
| -rw-r--r-- | drivers/nvme/host/pci.c | 1438 |
1 files changed, 997 insertions, 441 deletions
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2f57da12d983..0e4caeab739c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -7,8 +7,7 @@ #include <linux/acpi.h> #include <linux/async.h> #include <linux/blkdev.h> -#include <linux/blk-mq.h> -#include <linux/blk-mq-pci.h> +#include <linux/blk-mq-dma.h> #include <linux/blk-integrity.h> #include <linux/dmi.h> #include <linux/init.h> @@ -19,6 +18,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/mutex.h> +#include <linux/nodemask.h> #include <linux/once.h> #include <linux/pci.h> #include <linux/suspend.h> @@ -27,7 +27,6 @@ #include <linux/io-64-nonatomic-lo-hi.h> #include <linux/io-64-nonatomic-hi-lo.h> #include <linux/sed-opal.h> -#include <linux/pci-p2pdma.h> #include "trace.h" #include "nvme.h" @@ -35,15 +34,43 @@ #define SQ_SIZE(q) ((q)->q_depth << (q)->sqes) #define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion)) -#define SGES_PER_PAGE (NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc)) +/* Optimisation for I/Os between 4k and 128k */ +#define NVME_SMALL_POOL_SIZE 256 /* - * These can be higher, but we need to ensure that any command doesn't - * require an sg allocation that needs more than a page of data. + * Arbitrary upper bound. */ -#define NVME_MAX_KB_SZ 8192 -#define NVME_MAX_SEGS 128 -#define NVME_MAX_NR_ALLOCATIONS 5 +#define NVME_MAX_BYTES SZ_8M +#define NVME_MAX_NR_DESCRIPTORS 5 + +/* + * For data SGLs we support a single descriptors worth of SGL entries. + * For PRPs, segments don't matter at all. + */ +#define NVME_MAX_SEGS \ + (NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc)) + +/* + * For metadata SGLs, only the small descriptor is supported, and the first + * entry is the segment descriptor, which for the data pointer sits in the SQE. + */ +#define NVME_MAX_META_SEGS \ + ((NVME_SMALL_POOL_SIZE / sizeof(struct nvme_sgl_desc)) - 1) + +/* + * The last entry is used to link to the next descriptor. + */ +#define PRPS_PER_PAGE \ + (((NVME_CTRL_PAGE_SIZE / sizeof(__le64))) - 1) + +/* + * I/O could be non-aligned both at the beginning and end. + */ +#define MAX_PRP_RANGE \ + (NVME_MAX_BYTES + 2 * (NVME_CTRL_PAGE_SIZE - 1)) + +static_assert(MAX_PRP_RANGE / NVME_CTRL_PAGE_SIZE <= + (1 /* prp1 */ + NVME_MAX_NR_DESCRIPTORS * PRPS_PER_PAGE)); static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0444); @@ -81,7 +108,7 @@ static int io_queue_count_set(const char *val, const struct kernel_param *kp) int ret; ret = kstrtouint(val, 10, &n); - if (ret != 0 || n > num_possible_cpus()) + if (ret != 0 || n > blk_mq_num_possible_queues(0)) return -EINVAL; return param_set_uint(val, kp); } @@ -112,6 +139,11 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); static void nvme_delete_io_queues(struct nvme_dev *dev); static void nvme_update_attrs(struct nvme_dev *dev); +struct nvme_descriptor_pools { + struct dma_pool *large; + struct dma_pool *small; +}; + /* * Represents an NVM Express device. Each nvme_dev is a PCI function. */ @@ -121,8 +153,6 @@ struct nvme_dev { struct blk_mq_tag_set admin_tagset; u32 __iomem *dbs; struct device *dev; - struct dma_pool *prp_page_pool; - struct dma_pool *prp_small_pool; unsigned online_queues; unsigned max_qid; unsigned io_queues[HCTX_MAX_TYPES]; @@ -141,8 +171,8 @@ struct nvme_dev { struct nvme_ctrl ctrl; u32 last_ps; bool hmb; - - mempool_t *iod_mempool; + struct sg_table *hmb_sgt; + mempool_t *dmavec_mempool; /* shadow doorbell buffer support: */ __le32 *dbbuf_dbs; @@ -153,12 +183,14 @@ struct nvme_dev { /* host memory buffer support: */ u64 host_mem_size; u32 nr_host_mem_descs; + u32 host_mem_descs_size; dma_addr_t host_mem_descs_dma; struct nvme_host_mem_buf_desc *host_mem_descs; void **host_mem_desc_bufs; unsigned int nr_allocated_queues; unsigned int nr_write_queues; unsigned int nr_poll_queues; + struct nvme_descriptor_pools descriptor_pools[]; }; static int io_queue_depth_set(const char *val, const struct kernel_param *kp) @@ -188,6 +220,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) */ struct nvme_queue { struct nvme_dev *dev; + struct nvme_descriptor_pools descriptor_pools; spinlock_t sq_lock; void *sq_cmds; /* only used for poll queues: */ @@ -216,28 +249,57 @@ struct nvme_queue { struct completion delete_done; }; -union nvme_descriptor { - struct nvme_sgl_desc *sg_list; - __le64 *prp_list; +/* bits for iod->flags */ +enum nvme_iod_flags { + /* this command has been aborted by the timeout handler */ + IOD_ABORTED = 1U << 0, + + /* uses the small descriptor pool */ + IOD_SMALL_DESCRIPTOR = 1U << 1, + + /* single segment dma mapping */ + IOD_SINGLE_SEGMENT = 1U << 2, + + /* Data payload contains p2p memory */ + IOD_DATA_P2P = 1U << 3, + + /* Metadata contains p2p memory */ + IOD_META_P2P = 1U << 4, + + /* Data payload contains MMIO memory */ + IOD_DATA_MMIO = 1U << 5, + + /* Metadata contains MMIO memory */ + IOD_META_MMIO = 1U << 6, + + /* Metadata using non-coalesced MPTR */ + IOD_SINGLE_META_SEGMENT = 1U << 7, +}; + +struct nvme_dma_vec { + dma_addr_t addr; + unsigned int len; }; /* * The nvme_iod describes the data in an I/O. - * - * The sg pointer contains the list of PRP/SGL chunk allocations in addition - * to the actual struct scatterlist. */ struct nvme_iod { struct nvme_request req; struct nvme_command cmd; - bool aborted; - s8 nr_allocations; /* PRP list pool allocations. 0 means small - pool in use */ - unsigned int dma_len; /* length of single DMA segment mapping */ - dma_addr_t first_dma; + u8 flags; + u8 nr_descriptors; + + unsigned int total_len; + struct dma_iova_state dma_state; + void *descriptors[NVME_MAX_NR_DESCRIPTORS]; + struct nvme_dma_vec *dma_vecs; + unsigned int nr_dma_vecs; + dma_addr_t meta_dma; - struct sg_table sgt; - union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS]; + unsigned int meta_total_len; + struct dma_iova_state meta_dma_state; + struct nvme_sgl_desc *meta_descriptor; }; static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev) @@ -367,7 +429,7 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db, /* * Ensure that the doorbell is updated before reading the event * index from memory. The controller needs to provide similar - * ordering to ensure the envent index is updated before reading + * ordering to ensure the event index is updated before reading * the doorbell. */ mb(); @@ -380,42 +442,78 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db, return true; } -/* - * Will slightly overestimate the number of pages needed. This is OK - * as it only leads to a small amount of wasted memory for the lifetime of - * the I/O. - */ -static int nvme_pci_npages_prp(void) +static struct nvme_descriptor_pools * +nvme_setup_descriptor_pools(struct nvme_dev *dev, unsigned numa_node) { - unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE; - unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE); - return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8); + struct nvme_descriptor_pools *pools = &dev->descriptor_pools[numa_node]; + size_t small_align = NVME_SMALL_POOL_SIZE; + + if (pools->small) + return pools; /* already initialized */ + + pools->large = dma_pool_create_node("nvme descriptor page", dev->dev, + NVME_CTRL_PAGE_SIZE, NVME_CTRL_PAGE_SIZE, 0, numa_node); + if (!pools->large) + return ERR_PTR(-ENOMEM); + + if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512) + small_align = 512; + + pools->small = dma_pool_create_node("nvme descriptor small", dev->dev, + NVME_SMALL_POOL_SIZE, small_align, 0, numa_node); + if (!pools->small) { + dma_pool_destroy(pools->large); + pools->large = NULL; + return ERR_PTR(-ENOMEM); + } + + return pools; } -static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, - unsigned int hctx_idx) +static void nvme_release_descriptor_pools(struct nvme_dev *dev) { - struct nvme_dev *dev = to_nvme_dev(data); - struct nvme_queue *nvmeq = &dev->queues[0]; + unsigned i; - WARN_ON(hctx_idx != 0); - WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); + for (i = 0; i < nr_node_ids; i++) { + struct nvme_descriptor_pools *pools = &dev->descriptor_pools[i]; - hctx->driver_data = nvmeq; - return 0; + dma_pool_destroy(pools->large); + dma_pool_destroy(pools->small); + } } -static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, - unsigned int hctx_idx) +static int nvme_init_hctx_common(struct blk_mq_hw_ctx *hctx, void *data, + unsigned qid) { struct nvme_dev *dev = to_nvme_dev(data); - struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1]; + struct nvme_queue *nvmeq = &dev->queues[qid]; + struct nvme_descriptor_pools *pools; + struct blk_mq_tags *tags; + + tags = qid ? dev->tagset.tags[qid - 1] : dev->admin_tagset.tags[0]; + WARN_ON(tags != hctx->tags); + pools = nvme_setup_descriptor_pools(dev, hctx->numa_node); + if (IS_ERR(pools)) + return PTR_ERR(pools); - WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags); + nvmeq->descriptor_pools = *pools; hctx->driver_data = nvmeq; return 0; } +static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + WARN_ON(hctx_idx != 0); + return nvme_init_hctx_common(hctx, data, 0); +} + +static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + return nvme_init_hctx_common(hctx, data, hctx_idx + 1); +} + static int nvme_pci_init_request(struct blk_mq_tag_set *set, struct request *req, unsigned int hctx_idx, unsigned int numa_node) @@ -457,7 +555,7 @@ static void nvme_pci_map_queues(struct blk_mq_tag_set *set) */ map->queue_offset = qoff; if (i != HCTX_TYPE_POLL && offset) - blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset); + blk_mq_map_hw_queues(map, dev->dev, offset); else blk_mq_map_queues(map); qoff += map->nr_queues; @@ -504,173 +602,368 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx) spin_unlock(&nvmeq->sq_lock); } -static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req, - int nseg) +enum nvme_use_sgl { + SGL_UNSUPPORTED, + SGL_SUPPORTED, + SGL_FORCED, +}; + +static inline bool nvme_pci_metadata_use_sgls(struct request *req) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; - unsigned int avg_seg_size; - - avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg); + struct nvme_dev *dev = nvmeq->dev; - if (!nvme_ctrl_sgl_supported(&dev->ctrl)) - return false; - if (!nvmeq->qid) + if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl)) return false; - if (!sgl_threshold || avg_seg_size < sgl_threshold) - return false; - return true; + return req->nr_integrity_segments > 1 || + nvme_req(req)->flags & NVME_REQ_USERCMD; } -static void nvme_free_prps(struct nvme_dev *dev, struct request *req) +static inline enum nvme_use_sgl nvme_pci_use_sgls(struct nvme_dev *dev, + struct request *req) { + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + + if (nvmeq->qid && nvme_ctrl_sgl_supported(&dev->ctrl)) { + /* + * When the controller is capable of using SGL, there are + * several conditions that we force to use it: + * + * 1. A request containing page gaps within the controller's + * mask can not use the PRP format. + * + * 2. User commands use SGL because that lets the device + * validate the requested transfer lengths. + * + * 3. Multiple integrity segments must use SGL as that's the + * only way to describe such a command in NVMe. + */ + if (req_phys_gap_mask(req) & (NVME_CTRL_PAGE_SIZE - 1) || + nvme_req(req)->flags & NVME_REQ_USERCMD || + req->nr_integrity_segments > 1) + return SGL_FORCED; + return SGL_SUPPORTED; + } + + return SGL_UNSUPPORTED; +} + +static unsigned int nvme_pci_avg_seg_size(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + unsigned int nseg; + + if (blk_rq_dma_map_coalesce(&iod->dma_state)) + nseg = 1; + else + nseg = blk_rq_nr_phys_segments(req); + return DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg); +} + +static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq, + struct nvme_iod *iod) +{ + if (iod->flags & IOD_SMALL_DESCRIPTOR) + return nvmeq->descriptor_pools.small; + return nvmeq->descriptor_pools.large; +} + +static inline bool nvme_pci_cmd_use_meta_sgl(struct nvme_command *cmd) +{ + return (cmd->common.flags & NVME_CMD_SGL_ALL) == NVME_CMD_SGL_METASEG; +} + +static inline bool nvme_pci_cmd_use_sgl(struct nvme_command *cmd) +{ + return cmd->common.flags & + (NVME_CMD_SGL_METABUF | NVME_CMD_SGL_METASEG); +} + +static inline dma_addr_t nvme_pci_first_desc_dma_addr(struct nvme_command *cmd) +{ + if (nvme_pci_cmd_use_sgl(cmd)) + return le64_to_cpu(cmd->common.dptr.sgl.addr); + return le64_to_cpu(cmd->common.dptr.prp2); +} + +static void nvme_free_descriptors(struct request *req) +{ + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - dma_addr_t dma_addr = iod->first_dma; + dma_addr_t dma_addr = nvme_pci_first_desc_dma_addr(&iod->cmd); int i; - for (i = 0; i < iod->nr_allocations; i++) { - __le64 *prp_list = iod->list[i].prp_list; + if (iod->nr_descriptors == 1) { + dma_pool_free(nvme_dma_pool(nvmeq, iod), iod->descriptors[0], + dma_addr); + return; + } + + for (i = 0; i < iod->nr_descriptors; i++) { + __le64 *prp_list = iod->descriptors[i]; dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]); - dma_pool_free(dev->prp_page_pool, prp_list, dma_addr); + dma_pool_free(nvmeq->descriptor_pools.large, prp_list, + dma_addr); dma_addr = next_dma_addr; } } -static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) +static void nvme_free_prps(struct request *req, unsigned int attrs) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + unsigned int i; + + for (i = 0; i < iod->nr_dma_vecs; i++) + dma_unmap_phys(nvmeq->dev->dev, iod->dma_vecs[i].addr, + iod->dma_vecs[i].len, rq_dma_dir(req), attrs); + mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool); +} + +static void nvme_free_sgls(struct request *req, struct nvme_sgl_desc *sge, + struct nvme_sgl_desc *sg_list, unsigned int attrs) +{ + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + enum dma_data_direction dir = rq_dma_dir(req); + unsigned int len = le32_to_cpu(sge->length); + struct device *dma_dev = nvmeq->dev->dev; + unsigned int i; + + if (sge->type == (NVME_SGL_FMT_DATA_DESC << 4)) { + dma_unmap_phys(dma_dev, le64_to_cpu(sge->addr), len, dir, + attrs); + return; + } + + for (i = 0; i < len / sizeof(*sg_list); i++) + dma_unmap_phys(dma_dev, le64_to_cpu(sg_list[i].addr), + le32_to_cpu(sg_list[i].length), dir, attrs); +} + +static void nvme_unmap_metadata(struct request *req) { + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + enum pci_p2pdma_map_type map = PCI_P2PDMA_MAP_NONE; + enum dma_data_direction dir = rq_dma_dir(req); struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct device *dma_dev = nvmeq->dev->dev; + struct nvme_sgl_desc *sge = iod->meta_descriptor; + unsigned int attrs = 0; - if (iod->dma_len) { - dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len, + if (iod->flags & IOD_SINGLE_META_SEGMENT) { + dma_unmap_page(dma_dev, iod->meta_dma, + rq_integrity_vec(req).bv_len, rq_dma_dir(req)); return; } - WARN_ON_ONCE(!iod->sgt.nents); + if (iod->flags & IOD_META_P2P) + map = PCI_P2PDMA_MAP_BUS_ADDR; + else if (iod->flags & IOD_META_MMIO) { + map = PCI_P2PDMA_MAP_THRU_HOST_BRIDGE; + attrs |= DMA_ATTR_MMIO; + } - dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); + if (!blk_rq_dma_unmap(req, dma_dev, &iod->meta_dma_state, + iod->meta_total_len, map)) { + if (nvme_pci_cmd_use_meta_sgl(&iod->cmd)) + nvme_free_sgls(req, sge, &sge[1], attrs); + else + dma_unmap_phys(dma_dev, iod->meta_dma, + iod->meta_total_len, dir, attrs); + } - if (iod->nr_allocations == 0) - dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list, - iod->first_dma); - else if (iod->nr_allocations == 1) - dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list, - iod->first_dma); - else - nvme_free_prps(dev, req); - mempool_free(iod->sgt.sgl, dev->iod_mempool); + if (iod->meta_descriptor) + dma_pool_free(nvmeq->descriptor_pools.small, + iod->meta_descriptor, iod->meta_dma); } -static void nvme_print_sgl(struct scatterlist *sgl, int nents) +static void nvme_unmap_data(struct request *req) { - int i; - struct scatterlist *sg; + enum pci_p2pdma_map_type map = PCI_P2PDMA_MAP_NONE; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct device *dma_dev = nvmeq->dev->dev; + unsigned int attrs = 0; + + if (iod->flags & IOD_SINGLE_SEGMENT) { + static_assert(offsetof(union nvme_data_ptr, prp1) == + offsetof(union nvme_data_ptr, sgl.addr)); + dma_unmap_page(dma_dev, le64_to_cpu(iod->cmd.common.dptr.prp1), + iod->total_len, rq_dma_dir(req)); + return; + } + + if (iod->flags & IOD_DATA_P2P) + map = PCI_P2PDMA_MAP_BUS_ADDR; + else if (iod->flags & IOD_DATA_MMIO) { + map = PCI_P2PDMA_MAP_THRU_HOST_BRIDGE; + attrs |= DMA_ATTR_MMIO; + } - for_each_sg(sgl, sg, nents, i) { - dma_addr_t phys = sg_phys(sg); - pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d " - "dma_address:%pad dma_length:%d\n", - i, &phys, sg->offset, sg->length, &sg_dma_address(sg), - sg_dma_len(sg)); + if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len, + map)) { + if (nvme_pci_cmd_use_sgl(&iod->cmd)) + nvme_free_sgls(req, iod->descriptors[0], + &iod->cmd.common.dptr.sgl, attrs); + else + nvme_free_prps(req, attrs); } + + if (iod->nr_descriptors) + nvme_free_descriptors(req); } -static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, - struct request *req, struct nvme_rw_command *cmnd) +static bool nvme_pci_prp_iter_next(struct request *req, struct device *dma_dev, + struct blk_dma_iter *iter) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct dma_pool *pool; - int length = blk_rq_payload_bytes(req); - struct scatterlist *sg = iod->sgt.sgl; - int dma_len = sg_dma_len(sg); - u64 dma_addr = sg_dma_address(sg); - int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1); + + if (iter->len) + return true; + if (!blk_rq_dma_map_iter_next(req, dma_dev, &iod->dma_state, iter)) + return false; + if (!dma_use_iova(&iod->dma_state) && dma_need_unmap(dma_dev)) { + iod->dma_vecs[iod->nr_dma_vecs].addr = iter->addr; + iod->dma_vecs[iod->nr_dma_vecs].len = iter->len; + iod->nr_dma_vecs++; + } + return true; +} + +static blk_status_t nvme_pci_setup_data_prp(struct request *req, + struct blk_dma_iter *iter) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + unsigned int length = blk_rq_payload_bytes(req); + dma_addr_t prp1_dma, prp2_dma = 0; + unsigned int prp_len, i; __le64 *prp_list; - dma_addr_t prp_dma; - int nprps, i; - length -= (NVME_CTRL_PAGE_SIZE - offset); - if (length <= 0) { - iod->first_dma = 0; - goto done; + if (!dma_use_iova(&iod->dma_state) && dma_need_unmap(nvmeq->dev->dev)) { + iod->dma_vecs = mempool_alloc(nvmeq->dev->dmavec_mempool, + GFP_ATOMIC); + if (!iod->dma_vecs) + return BLK_STS_RESOURCE; + iod->dma_vecs[0].addr = iter->addr; + iod->dma_vecs[0].len = iter->len; + iod->nr_dma_vecs = 1; } - dma_len -= (NVME_CTRL_PAGE_SIZE - offset); - if (dma_len) { - dma_addr += (NVME_CTRL_PAGE_SIZE - offset); - } else { - sg = sg_next(sg); - dma_addr = sg_dma_address(sg); - dma_len = sg_dma_len(sg); + /* + * PRP1 always points to the start of the DMA transfers. + * + * This is the only PRP (except for the list entries) that could be + * non-aligned. + */ + prp1_dma = iter->addr; + prp_len = min(length, NVME_CTRL_PAGE_SIZE - + (iter->addr & (NVME_CTRL_PAGE_SIZE - 1))); + iod->total_len += prp_len; + iter->addr += prp_len; + iter->len -= prp_len; + length -= prp_len; + if (!length) + goto done; + + if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) { + if (WARN_ON_ONCE(!iter->status)) + goto bad_sgl; + goto done; } + /* + * PRP2 is usually a list, but can point to data if all data to be + * transferred fits into PRP1 + PRP2: + */ if (length <= NVME_CTRL_PAGE_SIZE) { - iod->first_dma = dma_addr; + prp2_dma = iter->addr; + iod->total_len += length; goto done; } - nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); - if (nprps <= (256 / 8)) { - pool = dev->prp_small_pool; - iod->nr_allocations = 0; - } else { - pool = dev->prp_page_pool; - iod->nr_allocations = 1; - } + if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) <= + NVME_SMALL_POOL_SIZE / sizeof(__le64)) + iod->flags |= IOD_SMALL_DESCRIPTOR; - prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); + prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC, + &prp2_dma); if (!prp_list) { - iod->nr_allocations = -1; - return BLK_STS_RESOURCE; + iter->status = BLK_STS_RESOURCE; + goto done; } - iod->list[0].prp_list = prp_list; - iod->first_dma = prp_dma; + iod->descriptors[iod->nr_descriptors++] = prp_list; + i = 0; for (;;) { + prp_list[i++] = cpu_to_le64(iter->addr); + prp_len = min(length, NVME_CTRL_PAGE_SIZE); + if (WARN_ON_ONCE(iter->len < prp_len)) + goto bad_sgl; + + iod->total_len += prp_len; + iter->addr += prp_len; + iter->len -= prp_len; + length -= prp_len; + if (!length) + break; + + if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) { + if (WARN_ON_ONCE(!iter->status)) + goto bad_sgl; + goto done; + } + + /* + * If we've filled the entire descriptor, allocate a new that is + * pointed to be the last entry in the previous PRP list. To + * accommodate for that move the last actual entry to the new + * descriptor. + */ if (i == NVME_CTRL_PAGE_SIZE >> 3) { __le64 *old_prp_list = prp_list; - prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); - if (!prp_list) - goto free_prps; - iod->list[iod->nr_allocations++].prp_list = prp_list; + dma_addr_t prp_list_dma; + + prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large, + GFP_ATOMIC, &prp_list_dma); + if (!prp_list) { + iter->status = BLK_STS_RESOURCE; + goto done; + } + iod->descriptors[iod->nr_descriptors++] = prp_list; + prp_list[0] = old_prp_list[i - 1]; - old_prp_list[i - 1] = cpu_to_le64(prp_dma); + old_prp_list[i - 1] = cpu_to_le64(prp_list_dma); i = 1; } - prp_list[i++] = cpu_to_le64(dma_addr); - dma_len -= NVME_CTRL_PAGE_SIZE; - dma_addr += NVME_CTRL_PAGE_SIZE; - length -= NVME_CTRL_PAGE_SIZE; - if (length <= 0) - break; - if (dma_len > 0) - continue; - if (unlikely(dma_len < 0)) - goto bad_sgl; - sg = sg_next(sg); - dma_addr = sg_dma_address(sg); - dma_len = sg_dma_len(sg); } + done: - cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sgt.sgl)); - cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); - return BLK_STS_OK; -free_prps: - nvme_free_prps(dev, req); - return BLK_STS_RESOURCE; + /* + * nvme_unmap_data uses the DPT field in the SQE to tear down the + * mapping, so initialize it even for failures. + */ + iod->cmd.common.dptr.prp1 = cpu_to_le64(prp1_dma); + iod->cmd.common.dptr.prp2 = cpu_to_le64(prp2_dma); + if (unlikely(iter->status)) + nvme_unmap_data(req); + return iter->status; + bad_sgl: - WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents), - "Invalid SGL for payload:%d nents:%d\n", - blk_rq_payload_bytes(req), iod->sgt.nents); + dev_err_once(nvmeq->dev->dev, + "Incorrectly formed request for payload:%d nents:%d\n", + blk_rq_payload_bytes(req), blk_rq_nr_phys_segments(req)); return BLK_STS_IOERR; } static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge, - struct scatterlist *sg) + struct blk_dma_iter *iter) { - sge->addr = cpu_to_le64(sg_dma_address(sg)); - sge->length = cpu_to_le32(sg_dma_len(sg)); + sge->addr = cpu_to_le64(iter->addr); + sge->length = cpu_to_le32(iter->len); sge->type = NVME_SGL_FMT_DATA_DESC << 4; } @@ -682,179 +975,263 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; } -static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, - struct request *req, struct nvme_rw_command *cmd) +static blk_status_t nvme_pci_setup_data_sgl(struct request *req, + struct blk_dma_iter *iter) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct dma_pool *pool; + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + unsigned int entries = blk_rq_nr_phys_segments(req); struct nvme_sgl_desc *sg_list; - struct scatterlist *sg = iod->sgt.sgl; - unsigned int entries = iod->sgt.nents; dma_addr_t sgl_dma; - int i = 0; + unsigned int mapped = 0; - /* setting the transfer type as SGL */ - cmd->flags = NVME_CMD_SGL_METABUF; + /* set the transfer type as SGL */ + iod->cmd.common.flags = NVME_CMD_SGL_METABUF; - if (entries == 1) { - nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg); + if (entries == 1 || blk_rq_dma_map_coalesce(&iod->dma_state)) { + nvme_pci_sgl_set_data(&iod->cmd.common.dptr.sgl, iter); + iod->total_len += iter->len; return BLK_STS_OK; } - if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { - pool = dev->prp_small_pool; - iod->nr_allocations = 0; - } else { - pool = dev->prp_page_pool; - iod->nr_allocations = 1; - } + if (entries <= NVME_SMALL_POOL_SIZE / sizeof(*sg_list)) + iod->flags |= IOD_SMALL_DESCRIPTOR; - sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); - if (!sg_list) { - iod->nr_allocations = -1; + sg_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC, + &sgl_dma); + if (!sg_list) return BLK_STS_RESOURCE; - } - - iod->list[0].sg_list = sg_list; - iod->first_dma = sgl_dma; + iod->descriptors[iod->nr_descriptors++] = sg_list; - nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); do { - nvme_pci_sgl_set_data(&sg_list[i++], sg); - sg = sg_next(sg); - } while (--entries > 0); + if (WARN_ON_ONCE(mapped == entries)) { + iter->status = BLK_STS_IOERR; + break; + } + nvme_pci_sgl_set_data(&sg_list[mapped++], iter); + iod->total_len += iter->len; + } while (blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, &iod->dma_state, + iter)); - return BLK_STS_OK; + nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped); + if (unlikely(iter->status)) + nvme_unmap_data(req); + return iter->status; } -static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev, - struct request *req, struct nvme_rw_command *cmnd, - struct bio_vec *bv) +static blk_status_t nvme_pci_setup_data_simple(struct request *req, + enum nvme_use_sgl use_sgl) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1); - unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset; - - iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0); - if (dma_mapping_error(dev->dev, iod->first_dma)) + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct bio_vec bv = req_bvec(req); + unsigned int prp1_offset = bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1); + bool prp_possible = prp1_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2; + dma_addr_t dma_addr; + + if (!use_sgl && !prp_possible) + return BLK_STS_AGAIN; + if (is_pci_p2pdma_page(bv.bv_page)) + return BLK_STS_AGAIN; + + dma_addr = dma_map_bvec(nvmeq->dev->dev, &bv, rq_dma_dir(req), 0); + if (dma_mapping_error(nvmeq->dev->dev, dma_addr)) return BLK_STS_RESOURCE; - iod->dma_len = bv->bv_len; + iod->total_len = bv.bv_len; + iod->flags |= IOD_SINGLE_SEGMENT; + + if (use_sgl == SGL_FORCED || !prp_possible) { + iod->cmd.common.flags = NVME_CMD_SGL_METABUF; + iod->cmd.common.dptr.sgl.addr = cpu_to_le64(dma_addr); + iod->cmd.common.dptr.sgl.length = cpu_to_le32(bv.bv_len); + iod->cmd.common.dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4; + } else { + unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - prp1_offset; + + iod->cmd.common.dptr.prp1 = cpu_to_le64(dma_addr); + iod->cmd.common.dptr.prp2 = 0; + if (bv.bv_len > first_prp_len) + iod->cmd.common.dptr.prp2 = + cpu_to_le64(dma_addr + first_prp_len); + } - cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma); - if (bv->bv_len > first_prp_len) - cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len); - else - cmnd->dptr.prp2 = 0; return BLK_STS_OK; } -static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev, - struct request *req, struct nvme_rw_command *cmnd, - struct bio_vec *bv) +static blk_status_t nvme_map_data(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct nvme_dev *dev = nvmeq->dev; + enum nvme_use_sgl use_sgl = nvme_pci_use_sgls(dev, req); + struct blk_dma_iter iter; + blk_status_t ret; - iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0); - if (dma_mapping_error(dev->dev, iod->first_dma)) + /* + * Try to skip the DMA iterator for single segment requests, as that + * significantly improves performances for small I/O sizes. + */ + if (blk_rq_nr_phys_segments(req) == 1) { + ret = nvme_pci_setup_data_simple(req, use_sgl); + if (ret != BLK_STS_AGAIN) + return ret; + } + + if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter)) + return iter.status; + + switch (iter.p2pdma.map) { + case PCI_P2PDMA_MAP_BUS_ADDR: + iod->flags |= IOD_DATA_P2P; + break; + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + iod->flags |= IOD_DATA_MMIO; + break; + case PCI_P2PDMA_MAP_NONE: + break; + default: return BLK_STS_RESOURCE; - iod->dma_len = bv->bv_len; + } - cmnd->flags = NVME_CMD_SGL_METABUF; - cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma); - cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len); - cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4; - return BLK_STS_OK; + if (use_sgl == SGL_FORCED || + (use_sgl == SGL_SUPPORTED && + (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold))) + return nvme_pci_setup_data_sgl(req, &iter); + return nvme_pci_setup_data_prp(req, &iter); } -static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, - struct nvme_command *cmnd) +static blk_status_t nvme_pci_setup_meta_iter(struct request *req) { + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + unsigned int entries = req->nr_integrity_segments; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - blk_status_t ret = BLK_STS_RESOURCE; - int rc; + struct nvme_dev *dev = nvmeq->dev; + struct nvme_sgl_desc *sg_list; + struct blk_dma_iter iter; + dma_addr_t sgl_dma; + int i = 0; - if (blk_rq_nr_phys_segments(req) == 1) { - struct nvme_queue *nvmeq = req->mq_hctx->driver_data; - struct bio_vec bv = req_bvec(req); - - if (!is_pci_p2pdma_page(bv.bv_page)) { - if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) - return nvme_setup_prp_simple(dev, req, - &cmnd->rw, &bv); - - if (nvmeq->qid && sgl_threshold && - nvme_ctrl_sgl_supported(&dev->ctrl)) - return nvme_setup_sgl_simple(dev, req, - &cmnd->rw, &bv); - } + if (!blk_rq_integrity_dma_map_iter_start(req, dev->dev, + &iod->meta_dma_state, &iter)) + return iter.status; + + switch (iter.p2pdma.map) { + case PCI_P2PDMA_MAP_BUS_ADDR: + iod->flags |= IOD_META_P2P; + break; + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + iod->flags |= IOD_META_MMIO; + break; + case PCI_P2PDMA_MAP_NONE: + break; + default: + return BLK_STS_RESOURCE; + } + + if (blk_rq_dma_map_coalesce(&iod->meta_dma_state)) + entries = 1; + + /* + * The NVMe MPTR descriptor has an implicit length that the host and + * device must agree on to avoid data/memory corruption. We trust the + * kernel allocated correctly based on the format's parameters, so use + * the more efficient MPTR to avoid extra dma pool allocations for the + * SGL indirection. + * + * But for user commands, we don't necessarily know what they do, so + * the driver can't validate the metadata buffer size. The SGL + * descriptor provides an explicit length, so we're relying on that + * mechanism to catch any misunderstandings between the application and + * device. + * + * P2P DMA also needs to use the blk_dma_iter method, so mptr setup + * leverages this routine when that happens. + */ + if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl) || + (entries == 1 && !(nvme_req(req)->flags & NVME_REQ_USERCMD))) { + iod->cmd.common.metadata = cpu_to_le64(iter.addr); + iod->meta_total_len = iter.len; + iod->meta_dma = iter.addr; + iod->meta_descriptor = NULL; + return BLK_STS_OK; } - iod->dma_len = 0; - iod->sgt.sgl = mempool_alloc(dev->iod_mempool, GFP_ATOMIC); - if (!iod->sgt.sgl) + sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC, + &sgl_dma); + if (!sg_list) return BLK_STS_RESOURCE; - sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req)); - iod->sgt.orig_nents = blk_rq_map_sg(req->q, req, iod->sgt.sgl); - if (!iod->sgt.orig_nents) - goto out_free_sg; - rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), - DMA_ATTR_NO_WARN); - if (rc) { - if (rc == -EREMOTEIO) - ret = BLK_STS_TARGET; - goto out_free_sg; + iod->meta_descriptor = sg_list; + iod->meta_dma = sgl_dma; + iod->cmd.common.flags = NVME_CMD_SGL_METASEG; + iod->cmd.common.metadata = cpu_to_le64(sgl_dma); + if (entries == 1) { + iod->meta_total_len = iter.len; + nvme_pci_sgl_set_data(sg_list, &iter); + return BLK_STS_OK; } - if (nvme_pci_use_sgls(dev, req, iod->sgt.nents)) - ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); - else - ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); - if (ret != BLK_STS_OK) - goto out_unmap_sg; - return BLK_STS_OK; + sgl_dma += sizeof(*sg_list); + do { + nvme_pci_sgl_set_data(&sg_list[++i], &iter); + iod->meta_total_len += iter.len; + } while (blk_rq_integrity_dma_map_iter_next(req, dev->dev, &iter)); -out_unmap_sg: - dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); -out_free_sg: - mempool_free(iod->sgt.sgl, dev->iod_mempool); - return ret; + nvme_pci_sgl_set_seg(sg_list, sgl_dma, i); + if (unlikely(iter.status)) + nvme_unmap_metadata(req); + return iter.status; } -static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req, - struct nvme_command *cmnd) +static blk_status_t nvme_pci_setup_meta_mptr(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct bio_vec bv = rq_integrity_vec(req); + + if (is_pci_p2pdma_page(bv.bv_page)) + return nvme_pci_setup_meta_iter(req); - iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req), - rq_dma_dir(req), 0); - if (dma_mapping_error(dev->dev, iod->meta_dma)) + iod->meta_dma = dma_map_bvec(nvmeq->dev->dev, &bv, rq_dma_dir(req), 0); + if (dma_mapping_error(nvmeq->dev->dev, iod->meta_dma)) return BLK_STS_IOERR; - cmnd->rw.metadata = cpu_to_le64(iod->meta_dma); + iod->cmd.common.metadata = cpu_to_le64(iod->meta_dma); + iod->flags |= IOD_SINGLE_META_SEGMENT; return BLK_STS_OK; } -static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) +static blk_status_t nvme_map_metadata(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + + if ((iod->cmd.common.flags & NVME_CMD_SGL_METABUF) && + nvme_pci_metadata_use_sgls(req)) + return nvme_pci_setup_meta_iter(req); + return nvme_pci_setup_meta_mptr(req); +} + +static blk_status_t nvme_prep_rq(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); blk_status_t ret; - iod->aborted = false; - iod->nr_allocations = -1; - iod->sgt.nents = 0; + iod->flags = 0; + iod->nr_descriptors = 0; + iod->total_len = 0; + iod->meta_total_len = 0; ret = nvme_setup_cmd(req->q->queuedata, req); if (ret) return ret; if (blk_rq_nr_phys_segments(req)) { - ret = nvme_map_data(dev, req, &iod->cmd); + ret = nvme_map_data(req); if (ret) goto out_free_cmd; } if (blk_integrity_rq(req)) { - ret = nvme_map_metadata(dev, req, &iod->cmd); + ret = nvme_map_metadata(req); if (ret) goto out_unmap_data; } @@ -862,15 +1239,13 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) nvme_start_request(req); return BLK_STS_OK; out_unmap_data: - nvme_unmap_data(dev, req); + if (blk_rq_nr_phys_segments(req)) + nvme_unmap_data(req); out_free_cmd: nvme_cleanup_cmd(req); return ret; } -/* - * NOTE: ns is NULL when called on the admin queue. - */ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -890,7 +1265,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (unlikely(!nvme_check_ready(&dev->ctrl, req, true))) return nvme_fail_nonready_command(&dev->ctrl, req); - ret = nvme_prep_rq(dev, req); + ret = nvme_prep_rq(req); if (unlikely(ret)) return ret; spin_lock(&nvmeq->sq_lock); @@ -900,11 +1275,15 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } -static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist) +static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct rq_list *rqlist) { + struct request *req; + + if (rq_list_empty(rqlist)) + return; + spin_lock(&nvmeq->sq_lock); - while (!rq_list_empty(*rqlist)) { - struct request *req = rq_list_pop(rqlist); + while ((req = rq_list_pop(rqlist))) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); nvme_sq_copy_cmd(nvmeq, &iod->cmd); @@ -924,54 +1303,38 @@ static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req) if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true))) return false; - req->mq_hctx->tags->rqs[req->tag] = req; - return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK; + return nvme_prep_rq(req) == BLK_STS_OK; } -static void nvme_queue_rqs(struct request **rqlist) +static void nvme_queue_rqs(struct rq_list *rqlist) { - struct request *req, *next, *prev = NULL; - struct request *requeue_list = NULL; - - rq_list_for_each_safe(rqlist, req, next) { - struct nvme_queue *nvmeq = req->mq_hctx->driver_data; - - if (!nvme_prep_rq_batch(nvmeq, req)) { - /* detach 'req' and add to remainder list */ - rq_list_move(rqlist, &requeue_list, req, prev); + struct rq_list submit_list = { }; + struct rq_list requeue_list = { }; + struct nvme_queue *nvmeq = NULL; + struct request *req; - req = prev; - if (!req) - continue; - } + while ((req = rq_list_pop(rqlist))) { + if (nvmeq && nvmeq != req->mq_hctx->driver_data) + nvme_submit_cmds(nvmeq, &submit_list); + nvmeq = req->mq_hctx->driver_data; - if (!next || req->mq_hctx != next->mq_hctx) { - /* detach rest of list, and submit */ - req->rq_next = NULL; - nvme_submit_cmds(nvmeq, rqlist); - *rqlist = next; - prev = NULL; - } else - prev = req; + if (nvme_prep_rq_batch(nvmeq, req)) + rq_list_add_tail(&submit_list, req); + else + rq_list_add_tail(&requeue_list, req); } + if (nvmeq) + nvme_submit_cmds(nvmeq, &submit_list); *rqlist = requeue_list; } static __always_inline void nvme_pci_unmap_rq(struct request *req) { - struct nvme_queue *nvmeq = req->mq_hctx->driver_data; - struct nvme_dev *dev = nvmeq->dev; - - if (blk_integrity_rq(req)) { - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - - dma_unmap_page(dev->dev, iod->meta_dma, - rq_integrity_vec(req)->bv_len, rq_dma_dir(req)); - } - + if (blk_integrity_rq(req)) + nvme_unmap_metadata(req); if (blk_rq_nr_phys_segments(req)) - nvme_unmap_data(dev, req); + nvme_unmap_data(req); } static void nvme_pci_complete_rq(struct request *req) @@ -1038,8 +1401,9 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail); if (!nvme_try_complete_req(req, cqe->status, cqe->result) && - !blk_mq_add_to_batch(req, iob, nvme_req(req)->status, - nvme_pci_complete_batch)) + !blk_mq_add_to_batch(req, iob, + nvme_req(req)->status != NVME_SC_SUCCESS, + nvme_pci_complete_batch)) nvme_pci_complete_rq(req); } @@ -1055,13 +1419,13 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) } } -static inline int nvme_poll_cq(struct nvme_queue *nvmeq, - struct io_comp_batch *iob) +static inline bool nvme_poll_cq(struct nvme_queue *nvmeq, + struct io_comp_batch *iob) { - int found = 0; + bool found = false; while (nvme_cqe_pending(nvmeq)) { - found++; + found = true; /* * load-load control dependency between phase and the rest of * the cqe requires a full read memory barrier @@ -1082,7 +1446,7 @@ static irqreturn_t nvme_irq(int irq, void *data) DEFINE_IO_COMP_BATCH(iob); if (nvme_poll_cq(nvmeq, &iob)) { - if (!rq_list_empty(iob.req_list)) + if (!rq_list_empty(&iob.req_list)) nvme_pci_complete_batch(&iob); return IRQ_HANDLED; } @@ -1109,7 +1473,9 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq) WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags)); disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); + spin_lock(&nvmeq->cq_poll_lock); nvme_poll_cq(nvmeq, NULL); + spin_unlock(&nvmeq->cq_poll_lock); enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); } @@ -1143,6 +1509,41 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) spin_unlock(&nvmeq->sq_lock); } +static int nvme_pci_subsystem_reset(struct nvme_ctrl *ctrl) +{ + struct nvme_dev *dev = to_nvme_dev(ctrl); + int ret = 0; + + /* + * Taking the shutdown_lock ensures the BAR mapping is not being + * altered by reset_work. Holding this lock before the RESETTING state + * change, if successful, also ensures nvme_remove won't be able to + * proceed to iounmap until we're done. + */ + mutex_lock(&dev->shutdown_lock); + if (!dev->bar_mapped_size) { + ret = -ENODEV; + goto unlock; + } + + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) { + ret = -EBUSY; + goto unlock; + } + + writel(NVME_SUBSYS_RESET, dev->bar + NVME_REG_NSSR); + nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE); + + /* + * Read controller status to flush the previous write and trigger a + * pcie read error. + */ + readl(dev->bar + NVME_REG_CSTS); +unlock: + mutex_unlock(&dev->shutdown_lock); + return ret; +} + static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) { struct nvme_command c = { }; @@ -1234,7 +1635,7 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); /* If there is a reset/reinit ongoing, we shouldn't reset again. */ - switch (dev->ctrl.state) { + switch (nvme_ctrl_state(&dev->ctrl)) { case NVME_CTRL_RESETTING: case NVME_CTRL_CONNECTING: return false; @@ -1274,7 +1675,7 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) dev_warn(dev->ctrl.device, "Does your device have a faulty power saving mode enabled?\n"); dev_warn(dev->ctrl.device, - "Try \"nvme_core.default_ps_max_latency_us=0 pcie_aspm=off\" and report a bug\n"); + "Try \"nvme_core.default_ps_max_latency_us=0 pcie_aspm=off pcie_port_pm=off\" and report a bug\n"); } static enum blk_eh_timer_return nvme_timeout(struct request *req) @@ -1284,13 +1685,28 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) struct nvme_dev *dev = nvmeq->dev; struct request *abort_req; struct nvme_command cmd = { }; + struct pci_dev *pdev = to_pci_dev(dev->dev); u32 csts = readl(dev->bar + NVME_REG_CSTS); + u8 opcode; + + /* + * Shutdown the device immediately if we see it is disconnected. This + * unblocks PCIe error handling if the nvme driver is waiting in + * error_resume for a device that has been removed. We can't unbind the + * driver while the driver's error callback is waiting to complete, so + * we're relying on a timeout to break that deadlock if a removal + * occurs while reset work is running. + */ + if (pci_dev_is_disconnected(pdev)) + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + if (nvme_state_terminal(&dev->ctrl)) + goto disable; /* If PCI error recovery process is happening, we cannot reset or * the recovery mechanism will surely fail. */ mb(); - if (pci_channel_offline(to_pci_dev(dev->dev))) + if (pci_channel_offline(pdev)) return BLK_EH_RESET_TIMER; /* @@ -1311,8 +1727,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT) { dev_warn(dev->ctrl.device, - "I/O %d QID %d timeout, completion polled\n", - req->tag, nvmeq->qid); + "I/O tag %d (%04x) QID %d timeout, completion polled\n", + req->tag, nvme_cid(req), nvmeq->qid); return BLK_EH_DONE; } @@ -1322,14 +1738,14 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) * cancellation error. All outstanding requests are completed on * shutdown, so we return BLK_EH_DONE. */ - switch (dev->ctrl.state) { + switch (nvme_ctrl_state(&dev->ctrl)) { case NVME_CTRL_CONNECTING: nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); fallthrough; case NVME_CTRL_DELETING: dev_warn_ratelimited(dev->ctrl.device, - "I/O %d QID %d timeout, disable controller\n", - req->tag, nvmeq->qid); + "I/O tag %d (%04x) QID %d timeout, disable controller\n", + req->tag, nvme_cid(req), nvmeq->qid); nvme_req(req)->flags |= NVME_REQ_CANCELLED; nvme_dev_disable(dev, true); return BLK_EH_DONE; @@ -1344,10 +1760,12 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) * command was already aborted once before and still hasn't been * returned to the driver, or if this is the admin queue. */ - if (!nvmeq->qid || iod->aborted) { + opcode = nvme_req(req)->cmd->common.opcode; + if (!nvmeq->qid || (iod->flags & IOD_ABORTED)) { dev_warn(dev->ctrl.device, - "I/O %d QID %d timeout, reset controller\n", - req->tag, nvmeq->qid); + "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n", + req->tag, nvme_cid(req), opcode, + nvme_opcode_str(nvmeq->qid, opcode), nvmeq->qid); nvme_req(req)->flags |= NVME_REQ_CANCELLED; goto disable; } @@ -1356,17 +1774,17 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) atomic_inc(&dev->ctrl.abort_limit); return BLK_EH_RESET_TIMER; } - iod->aborted = true; + iod->flags |= IOD_ABORTED; cmd.abort.opcode = nvme_admin_abort_cmd; cmd.abort.cid = nvme_cid(req); cmd.abort.sqid = cpu_to_le16(nvmeq->qid); dev_warn(nvmeq->dev->ctrl.device, - "I/O %d (%s) QID %d timeout, aborting\n", - req->tag, - nvme_get_opcode_str(nvme_req(req)->cmd->common.opcode), - nvmeq->qid); + "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, aborting req_op:%s(%u) size:%u\n", + req->tag, nvme_cid(req), opcode, nvme_get_opcode_str(opcode), + nvmeq->qid, blk_op_str(req_op(req)), req_op(req), + blk_rq_bytes(req)); abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd), BLK_MQ_REQ_NOWAIT); @@ -1388,8 +1806,11 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) return BLK_EH_RESET_TIMER; disable: - if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) { + if (nvme_state_terminal(&dev->ctrl)) + nvme_dev_disable(dev, true); return BLK_EH_DONE; + } nvme_dev_disable(dev, false); if (nvme_try_sched_reset(&dev->ctrl)) @@ -1594,7 +2015,7 @@ static int nvme_setup_io_queues_trylock(struct nvme_dev *dev) /* * Controller is in wrong state, fail early. */ - if (dev->ctrl.state != NVME_CTRL_CONNECTING) { + if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_CONNECTING) { mutex_unlock(&dev->shutdown_lock); return -ENODEV; } @@ -1738,8 +2159,28 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) * might be pointing at! */ result = nvme_disable_ctrl(&dev->ctrl, false); - if (result < 0) - return result; + if (result < 0) { + struct pci_dev *pdev = to_pci_dev(dev->dev); + + /* + * The NVMe Controller Reset method did not get an expected + * CSTS.RDY transition, so something with the device appears to + * be stuck. Use the lower level and bigger hammer PCIe + * Function Level Reset to attempt restoring the device to its + * initial state, and try again. + */ + result = pcie_reset_flr(pdev, false); + if (result < 0) + return result; + + pci_restore_state(pdev); + result = nvme_disable_ctrl(&dev->ctrl, false); + if (result < 0) + return result; + + dev_info(dev->ctrl.device, + "controller reset completed after pcie flr\n"); + } result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); if (result) @@ -1847,6 +2288,18 @@ static void nvme_map_cmb(struct nvme_dev *dev) return; /* + * Controllers may support a CMB size larger than their BAR, for + * example, due to being behind a bridge. Reduce the CMB to the + * reported size of the BAR + */ + size = min(size, bar_size - offset); + + if (!IS_ALIGNED(size, memremap_compat_align()) || + !IS_ALIGNED(pci_resource_start(pdev, bar), + memremap_compat_align())) + return; + + /* * Tell the controller about the host side address mapping the CMB, * and enable CMB decoding for the NVMe 1.4+ scheme: */ @@ -1856,17 +2309,10 @@ static void nvme_map_cmb(struct nvme_dev *dev) dev->bar + NVME_REG_CMBMSC); } - /* - * Controllers may support a CMB size larger than their BAR, - * for example, due to being behind a bridge. Reduce the CMB to - * the reported size of the BAR - */ - if (size > bar_size - offset) - size = bar_size - offset; - if (pci_p2pdma_add_resource(pdev, bar, size, offset)) { dev_warn(dev->ctrl.device, "failed to register the CMB\n"); + hi_lo_writeq(0, dev->bar + NVME_REG_CMBMSC); return; } @@ -1876,8 +2322,6 @@ static void nvme_map_cmb(struct nvme_dev *dev) if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) == (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) pci_p2pmem_publish(pdev, true); - - nvme_update_attrs(dev); } static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) @@ -1906,7 +2350,7 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) return ret; } -static void nvme_free_host_mem(struct nvme_dev *dev) +static void nvme_free_host_mem_multi(struct nvme_dev *dev) { int i; @@ -1921,18 +2365,54 @@ static void nvme_free_host_mem(struct nvme_dev *dev) kfree(dev->host_mem_desc_bufs); dev->host_mem_desc_bufs = NULL; - dma_free_coherent(dev->dev, - dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs), +} + +static void nvme_free_host_mem(struct nvme_dev *dev) +{ + if (dev->hmb_sgt) + dma_free_noncontiguous(dev->dev, dev->host_mem_size, + dev->hmb_sgt, DMA_BIDIRECTIONAL); + else + nvme_free_host_mem_multi(dev); + + dma_free_coherent(dev->dev, dev->host_mem_descs_size, dev->host_mem_descs, dev->host_mem_descs_dma); dev->host_mem_descs = NULL; + dev->host_mem_descs_size = 0; dev->nr_host_mem_descs = 0; } -static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, +static int nvme_alloc_host_mem_single(struct nvme_dev *dev, u64 size) +{ + dev->hmb_sgt = dma_alloc_noncontiguous(dev->dev, size, + DMA_BIDIRECTIONAL, GFP_KERNEL, 0); + if (!dev->hmb_sgt) + return -ENOMEM; + + dev->host_mem_descs = dma_alloc_coherent(dev->dev, + sizeof(*dev->host_mem_descs), &dev->host_mem_descs_dma, + GFP_KERNEL); + if (!dev->host_mem_descs) { + dma_free_noncontiguous(dev->dev, size, dev->hmb_sgt, + DMA_BIDIRECTIONAL); + dev->hmb_sgt = NULL; + return -ENOMEM; + } + dev->host_mem_size = size; + dev->host_mem_descs_size = sizeof(*dev->host_mem_descs); + dev->nr_host_mem_descs = 1; + + dev->host_mem_descs[0].addr = + cpu_to_le64(dev->hmb_sgt->sgl->dma_address); + dev->host_mem_descs[0].size = cpu_to_le32(size / NVME_CTRL_PAGE_SIZE); + return 0; +} + +static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred, u32 chunk_size) { struct nvme_host_mem_buf_desc *descs; - u32 max_entries, len; + u32 max_entries, len, descs_size; dma_addr_t descs_dma; int i = 0; void **bufs; @@ -1945,8 +2425,9 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries) max_entries = dev->ctrl.hmmaxd; - descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs), - &descs_dma, GFP_KERNEL); + descs_size = max_entries * sizeof(*descs); + descs = dma_alloc_coherent(dev->dev, descs_size, &descs_dma, + GFP_KERNEL); if (!descs) goto out; @@ -1975,22 +2456,14 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, dev->host_mem_size = size; dev->host_mem_descs = descs; dev->host_mem_descs_dma = descs_dma; + dev->host_mem_descs_size = descs_size; dev->host_mem_desc_bufs = bufs; return 0; out_free_bufs: - while (--i >= 0) { - size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE; - - dma_free_attrs(dev->dev, size, bufs[i], - le64_to_cpu(descs[i].addr), - DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); - } - kfree(bufs); out_free_descs: - dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs, - descs_dma); + dma_free_coherent(dev->dev, descs_size, descs, descs_dma); out: dev->host_mem_descs = NULL; return -ENOMEM; @@ -1998,13 +2471,23 @@ out: static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) { + unsigned long dma_merge_boundary = dma_get_merge_boundary(dev->dev); u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); u64 chunk_size; + /* + * If there is an IOMMU that can merge pages, try a virtually + * non-contiguous allocation for a single segment first. + */ + if (dma_merge_boundary && (PAGE_SIZE & dma_merge_boundary) == 0) { + if (!nvme_alloc_host_mem_single(dev, preferred)) + return 0; + } + /* start big and work our way down */ for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) { - if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) { + if (!nvme_alloc_host_mem_multi(dev, preferred, chunk_size)) { if (!min || dev->host_mem_size >= min) return 0; nvme_free_host_mem(dev); @@ -2052,8 +2535,10 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) } dev_info(dev->ctrl.device, - "allocated %lld MiB host memory buffer.\n", - dev->host_mem_size >> ilog2(SZ_1M)); + "allocated %lld MiB host memory buffer (%u segment%s).\n", + dev->host_mem_size >> ilog2(SZ_1M), + dev->nr_host_mem_descs, + str_plural(dev->nr_host_mem_descs)); } ret = nvme_set_host_mem(dev, enable_bits); @@ -2067,7 +2552,7 @@ static ssize_t cmb_show(struct device *dev, struct device_attribute *attr, { struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); - return sysfs_emit(buf, "cmbloc : x%08x\ncmbsz : x%08x\n", + return sysfs_emit(buf, "cmbloc : 0x%08x\ncmbsz : 0x%08x\n", ndev->cmbloc, ndev->cmbsz); } static DEVICE_ATTR_RO(cmb); @@ -2216,6 +2701,7 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) .priv = dev, }; unsigned int irq_queues, poll_queues; + unsigned int flags = PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY; /* * Poll queues don't need interrupts, but we need at least one I/O queue @@ -2239,8 +2725,10 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) irq_queues = 1; if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)) irq_queues += (nr_io_queues - poll_queues); - return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, - PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); + if (dev->ctrl.quirks & NVME_QUIRK_BROKEN_MSI) + flags &= ~PCI_IRQ_MSI; + return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, flags, + &affd); } static unsigned int nvme_max_io_queues(struct nvme_dev *dev) @@ -2251,7 +2739,8 @@ static unsigned int nvme_max_io_queues(struct nvme_dev *dev) */ if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) return 1; - return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues; + return blk_mq_num_possible_queues(0) + dev->nr_write_queues + + dev->nr_poll_queues; } static int nvme_setup_io_queues(struct nvme_dev *dev) @@ -2458,17 +2947,36 @@ static unsigned int nvme_pci_nr_maps(struct nvme_dev *dev) return 1; } -static void nvme_pci_update_nr_queues(struct nvme_dev *dev) +static bool nvme_pci_update_nr_queues(struct nvme_dev *dev) { + if (!dev->ctrl.tagset) { + nvme_alloc_io_tag_set(&dev->ctrl, &dev->tagset, &nvme_mq_ops, + nvme_pci_nr_maps(dev), sizeof(struct nvme_iod)); + return true; + } + + /* Give up if we are racing with nvme_dev_disable() */ + if (!mutex_trylock(&dev->shutdown_lock)) + return false; + + /* Check if nvme_dev_disable() has been executed already */ + if (!dev->online_queues) { + mutex_unlock(&dev->shutdown_lock); + return false; + } + blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1); /* free previously allocated queues that are no longer usable */ nvme_free_queues(dev, dev->online_queues); + mutex_unlock(&dev->shutdown_lock); + return true; } static int nvme_pci_enable(struct nvme_dev *dev) { int result = -ENOMEM; struct pci_dev *pdev = to_pci_dev(dev->dev); + unsigned int flags = PCI_IRQ_ALL_TYPES; if (pci_enable_device_mem(pdev)) return result; @@ -2476,6 +2984,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) pci_set_master(pdev); if (readl(dev->bar + NVME_REG_CSTS) == -1) { + dev_dbg(dev->ctrl.device, "reading CSTS register failed\n"); result = -ENODEV; goto disable; } @@ -2485,7 +2994,9 @@ static int nvme_pci_enable(struct nvme_dev *dev) * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll * adjust this later. */ - result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES); + if (dev->ctrl.quirks & NVME_QUIRK_BROKEN_MSI) + flags &= ~PCI_IRQ_MSI; + result = pci_alloc_irq_vectors(pdev, 1, 1, flags); if (result < 0) goto disable; @@ -2506,15 +3017,8 @@ static int nvme_pci_enable(struct nvme_dev *dev) else dev->io_sqes = NVME_NVM_IOSQES; - /* - * Temporary fix for the Apple controller found in the MacBook8,1 and - * some MacBook7,1 to avoid controller resets and data loss. - */ - if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) { + if (dev->ctrl.quirks & NVME_QUIRK_QDEPTH_ONE) { dev->q_depth = 2; - dev_warn(dev->ctrl.device, "detected Apple NVMe controller, " - "set queue depth=%u to work around controller resets\n", - dev->q_depth); } else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG && (pdev->device == 0xa821 || pdev->device == 0xa822) && NVME_CAP_MQES(dev->ctrl.cap) == 0) { @@ -2574,13 +3078,13 @@ static bool nvme_pci_ctrl_is_dead(struct nvme_dev *dev) static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { + enum nvme_ctrl_state state = nvme_ctrl_state(&dev->ctrl); struct pci_dev *pdev = to_pci_dev(dev->dev); bool dead; mutex_lock(&dev->shutdown_lock); dead = nvme_pci_ctrl_is_dead(dev); - if (dev->ctrl.state == NVME_CTRL_LIVE || - dev->ctrl.state == NVME_CTRL_RESETTING) { + if (state == NVME_CTRL_LIVE || state == NVME_CTRL_RESETTING) { if (pci_is_enabled(pdev)) nvme_start_freeze(&dev->ctrl); /* @@ -2629,39 +3133,15 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) return 0; } -static int nvme_setup_prp_pools(struct nvme_dev *dev) -{ - dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, - NVME_CTRL_PAGE_SIZE, - NVME_CTRL_PAGE_SIZE, 0); - if (!dev->prp_page_pool) - return -ENOMEM; - - /* Optimisation for I/Os between 4k and 128k */ - dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev, - 256, 256, 0); - if (!dev->prp_small_pool) { - dma_pool_destroy(dev->prp_page_pool); - return -ENOMEM; - } - return 0; -} - -static void nvme_release_prp_pools(struct nvme_dev *dev) -{ - dma_pool_destroy(dev->prp_page_pool); - dma_pool_destroy(dev->prp_small_pool); -} - static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) { - size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS; + size_t alloc_size = sizeof(struct nvme_dma_vec) * NVME_MAX_SEGS; - dev->iod_mempool = mempool_create_node(1, + dev->dmavec_mempool = mempool_create_node(1, mempool_kmalloc, mempool_kfree, (void *)alloc_size, GFP_KERNEL, dev_to_node(dev->dev)); - if (!dev->iod_mempool) + if (!dev->dmavec_mempool) return -ENOMEM; return 0; } @@ -2691,7 +3171,7 @@ static void nvme_reset_work(struct work_struct *work) bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); int result; - if (dev->ctrl.state != NVME_CTRL_RESETTING) { + if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_RESETTING) { dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n", dev->ctrl.state); result = -ENODEV; @@ -2728,26 +3208,34 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; + if (nvme_ctrl_meta_sgl_supported(&dev->ctrl)) + dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS; + else + dev->ctrl.max_integrity_segments = 1; + nvme_dbbuf_dma_alloc(dev); result = nvme_setup_host_mem(dev); if (result < 0) goto out; + nvme_update_attrs(dev); + result = nvme_setup_io_queues(dev); if (result) goto out; /* - * Freeze and update the number of I/O queues as thos might have + * Freeze and update the number of I/O queues as those might have * changed. If there are no I/O queues left after this reset, keep the * controller around but remove all namespaces. */ if (dev->online_queues > 1) { + nvme_dbbuf_set(dev); nvme_unquiesce_io_queues(&dev->ctrl); nvme_wait_freeze(&dev->ctrl); - nvme_pci_update_nr_queues(dev); - nvme_dbbuf_set(dev); + if (!nvme_pci_update_nr_queues(dev)) + goto out; nvme_unfreeze(&dev->ctrl); } else { dev_warn(dev->ctrl.device, "IO queues lost\n"); @@ -2834,6 +3322,14 @@ static bool nvme_pci_supports_pci_p2pdma(struct nvme_ctrl *ctrl) return dma_pci_p2pdma_supported(dev->dev); } +static unsigned long nvme_pci_get_virt_boundary(struct nvme_ctrl *ctrl, + bool is_admin) +{ + if (!nvme_ctrl_sgl_supported(ctrl) || is_admin) + return NVME_CTRL_PAGE_SIZE - 1; + return 0; +} + static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .name = "pcie", .module = THIS_MODULE, @@ -2844,9 +3340,11 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .reg_read64 = nvme_pci_reg_read64, .free_ctrl = nvme_pci_free_ctrl, .submit_async_event = nvme_pci_submit_async_event, + .subsystem_reset = nvme_pci_subsystem_reset, .get_address = nvme_pci_get_address, .print_device_info = nvme_pci_print_device_info, .supports_pci_p2pdma = nvme_pci_supports_pci_p2pdma, + .get_virt_boundary = nvme_pci_get_virt_boundary, }; static int nvme_dev_map(struct nvme_dev *dev) @@ -2903,8 +3401,42 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) && dmi_match(DMI_BOARD_NAME, "LNVNB161216")) return NVME_QUIRK_SIMPLE_SUSPEND; + } else if (pdev->vendor == 0x2646 && (pdev->device == 0x2263 || + pdev->device == 0x500f)) { + /* + * Exclude some Kingston NV1 and A2000 devices from + * NVME_QUIRK_SIMPLE_SUSPEND. Do a full suspend to save a + * lot of energy with s2idle sleep on some TUXEDO platforms. + */ + if (dmi_match(DMI_BOARD_NAME, "NS5X_NS7XAU") || + dmi_match(DMI_BOARD_NAME, "NS5x_7xAU") || + dmi_match(DMI_BOARD_NAME, "NS5x_7xPU") || + dmi_match(DMI_BOARD_NAME, "PH4PRX1_PH6PRX1")) + return NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND; + } else if (pdev->vendor == 0x144d && pdev->device == 0xa80d) { + /* + * Exclude Samsung 990 Evo from NVME_QUIRK_SIMPLE_SUSPEND + * because of high power consumption (> 2 Watt) in s2idle + * sleep. Only some boards with Intel CPU are affected. + * (Note for testing: Samsung 990 Evo Plus has same PCI ID) + */ + if (dmi_match(DMI_BOARD_NAME, "DN50Z-140HC-YD") || + dmi_match(DMI_BOARD_NAME, "GMxPXxx") || + dmi_match(DMI_BOARD_NAME, "GXxMRXx") || + dmi_match(DMI_BOARD_NAME, "NS5X_NS7XAU") || + dmi_match(DMI_BOARD_NAME, "PH4PG31") || + dmi_match(DMI_BOARD_NAME, "PH4PRX1_PH6PRX1") || + dmi_match(DMI_BOARD_NAME, "PH6PG01_PH6PG71")) + return NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND; } + /* + * NVMe SSD drops off the PCIe bus after system idle + * for 10 hours on a Lenovo N60z board. + */ + if (dmi_match(DMI_BOARD_NAME, "LXKT-ZXEG-N6")) + return NVME_QUIRK_NO_APST; + return 0; } @@ -2916,10 +3448,8 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, struct nvme_dev *dev; int ret = -ENOMEM; - if (node == NUMA_NO_NODE) - set_dev_node(&pdev->dev, first_memory_node); - - dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); + dev = kzalloc_node(struct_size(dev, descriptor_pools, nr_node_ids), + GFP_KERNEL, node); if (!dev) return ERR_PTR(-ENOMEM); INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); @@ -2936,7 +3466,9 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, dev->dev = get_device(&pdev->dev); quirks |= check_vendor_combination_bug(pdev); - if (!noacpi && acpi_storage_d3(&pdev->dev)) { + if (!noacpi && + !(quirks & NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND) && + acpi_storage_d3(&pdev->dev)) { /* * Some systems use a bios work around to ask for D3 on * platforms that support kernel managed suspend. @@ -2962,13 +3494,9 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, * over a single page. */ dev->ctrl.max_hw_sectors = min_t(u32, - NVME_MAX_KB_SZ << 1, dma_opt_mapping_size(&pdev->dev) >> 9); + NVME_MAX_BYTES >> SECTOR_SHIFT, + dma_opt_mapping_size(&pdev->dev) >> 9); dev->ctrl.max_segments = NVME_MAX_SEGS; - - /* - * There is no support for SGLs for metadata (yet), so we are limited to - * a single integrity segment for the separate metadata pointer. - */ dev->ctrl.max_integrity_segments = 1; return dev; @@ -2989,17 +3517,17 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (IS_ERR(dev)) return PTR_ERR(dev); - result = nvme_dev_map(dev); + result = nvme_add_ctrl(&dev->ctrl); if (result) - goto out_uninit_ctrl; + goto out_put_ctrl; - result = nvme_setup_prp_pools(dev); + result = nvme_dev_map(dev); if (result) - goto out_dev_unmap; + goto out_uninit_ctrl; result = nvme_pci_alloc_iod_mempool(dev); if (result) - goto out_release_prp_pools; + goto out_dev_unmap; dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); @@ -3027,12 +3555,19 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto out_disable; + if (nvme_ctrl_meta_sgl_supported(&dev->ctrl)) + dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS; + else + dev->ctrl.max_integrity_segments = 1; + nvme_dbbuf_dma_alloc(dev); result = nvme_setup_host_mem(dev); if (result < 0) goto out_disable; + nvme_update_attrs(dev); + result = nvme_setup_io_queues(dev); if (result) goto out_disable; @@ -3068,14 +3603,14 @@ out_disable: nvme_dbbuf_dma_free(dev); nvme_free_queues(dev, 0); out_release_iod_mempool: - mempool_destroy(dev->iod_mempool); -out_release_prp_pools: - nvme_release_prp_pools(dev); + mempool_destroy(dev->dmavec_mempool); out_dev_unmap: nvme_dev_unmap(dev); out_uninit_ctrl: nvme_uninit_ctrl(&dev->ctrl); +out_put_ctrl: nvme_put_ctrl(&dev->ctrl); + dev_err_probe(&pdev->dev, result, "probe failed\n"); return result; } @@ -3132,8 +3667,8 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dev_remove_admin(dev); nvme_dbbuf_dma_free(dev); nvme_free_queues(dev, 0); - mempool_destroy(dev->iod_mempool); - nvme_release_prp_pools(dev); + mempool_destroy(dev->dmavec_mempool); + nvme_release_descriptor_pools(dev); nvme_dev_unmap(dev); nvme_uninit_ctrl(&dev->ctrl); } @@ -3196,7 +3731,7 @@ static int nvme_suspend(struct device *dev) nvme_wait_freeze(ctrl); nvme_sync_queues(ctrl); - if (ctrl->state != NVME_CTRL_LIVE) + if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) goto unfreeze; /* @@ -3302,7 +3837,7 @@ static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev) dev_info(dev->ctrl.device, "restart after slot reset\n"); pci_restore_state(pdev); - if (!nvme_try_sched_reset(&dev->ctrl)) + if (nvme_try_sched_reset(&dev->ctrl)) nvme_unquiesce_io_queues(&dev->ctrl); return PCI_ERS_RESULT_RECOVERED; } @@ -3331,11 +3866,10 @@ static const struct pci_device_id nvme_id_table[] = { NVME_QUIRK_DEALLOCATE_ZEROES, }, { PCI_VDEVICE(INTEL, 0x0a54), /* Intel P4500/P4600 */ .driver_data = NVME_QUIRK_STRIPE_SIZE | - NVME_QUIRK_DEALLOCATE_ZEROES | - NVME_QUIRK_IGNORE_DEV_SUBNQN, }, + NVME_QUIRK_IGNORE_DEV_SUBNQN | + NVME_QUIRK_BOGUS_NID, }, { PCI_VDEVICE(INTEL, 0x0a55), /* Dell Express Flash P4600 */ - .driver_data = NVME_QUIRK_STRIPE_SIZE | - NVME_QUIRK_DEALLOCATE_ZEROES, }, + .driver_data = NVME_QUIRK_STRIPE_SIZE, }, { PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS | NVME_QUIRK_MEDIUM_PRIO_SQ | @@ -3349,6 +3883,14 @@ static const struct pci_device_id nvme_id_table[] = { NVME_QUIRK_BOGUS_NID, }, { PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1217, 0x8760), /* O2 Micro 64GB Steam Deck */ + .driver_data = NVME_QUIRK_DMAPOOL_ALIGN_512, }, + { PCI_DEVICE(0x126f, 0x1001), /* Silicon Motion generic */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS | + NVME_QUIRK_IGNORE_DEV_SUBNQN, }, + { PCI_DEVICE(0x126f, 0x2262), /* Silicon Motion generic */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS | + NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x126f, 0x2263), /* Silicon Motion unidentified */ .driver_data = NVME_QUIRK_NO_NS_DESC_LIST | NVME_QUIRK_BOGUS_NID, }, @@ -3367,6 +3909,11 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY | NVME_QUIRK_DISABLE_WRITE_ZEROES| NVME_QUIRK_IGNORE_DEV_SUBNQN, }, + { PCI_DEVICE(0x15b7, 0x5008), /* Sandisk SN530 */ + .driver_data = NVME_QUIRK_BROKEN_MSI }, + { PCI_DEVICE(0x15b7, 0x5009), /* Sandisk SN550 */ + .driver_data = NVME_QUIRK_BROKEN_MSI | + NVME_QUIRK_NO_DEEPEST_PS }, { PCI_DEVICE(0x1987, 0x5012), /* Phison E12 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1987, 0x5016), /* Phison E16 */ @@ -3397,6 +3944,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x1c5c, 0x174a), /* SK Hynix P31 SSD */ .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1c5c, 0x1D59), /* SK Hynix BC901 */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x15b7, 0x2001), /* Sandisk Skyhawk */ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x1d97, 0x2263), /* SPCC */ @@ -3442,12 +3991,16 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1cc1, 0x5350), /* ADATA XPG GAMMIX S50 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1dbe, 0x5216), /* Acer/INNOGRIT FA100/5216 NVMe SSD */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1dbe, 0x5236), /* ADATA XPG GAMMIX S70 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1e49, 0x0021), /* ZHITAI TiPro5000 NVMe SSD */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, { PCI_DEVICE(0x1e49, 0x0041), /* ZHITAI TiPro7000 NVMe SSD */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, + { PCI_DEVICE(0x025e, 0xf1ac), /* SOLIDIGM P44 pro SSDPFKKW020X7 */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, { PCI_DEVICE(0xc0a9, 0x540a), /* Crucial P2 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1d97, 0x2263), /* Lexar NM610 */ @@ -3476,7 +4029,12 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd02), .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001), - .driver_data = NVME_QUIRK_SINGLE_VECTOR }, + /* + * Fix for the Apple controller found in the MacBook8,1 and + * some MacBook7,1 to avoid controller resets and data loss. + */ + .driver_data = NVME_QUIRK_SINGLE_VECTOR | + NVME_QUIRK_QDEPTH_ONE }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005), .driver_data = NVME_QUIRK_SINGLE_VECTOR | @@ -3511,9 +4069,6 @@ static int __init nvme_init(void) BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); - BUILD_BUG_ON(NVME_MAX_SEGS > SGES_PER_PAGE); - BUILD_BUG_ON(sizeof(struct scatterlist) * NVME_MAX_SEGS > PAGE_SIZE); - BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_ALLOCATIONS); return pci_register_driver(&nvme_driver); } @@ -3527,5 +4082,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>"); MODULE_LICENSE("GPL"); MODULE_VERSION("1.0"); +MODULE_DESCRIPTION("NVMe host PCIe transport driver"); module_init(nvme_init); module_exit(nvme_exit); |
