Diffstat (limited to 'drivers/nvme')
-rw-r--r--  drivers/nvme/host/apple.c          |   4
-rw-r--r--  drivers/nvme/host/constants.c      |   4
-rw-r--r--  drivers/nvme/host/core.c           | 141
-rw-r--r--  drivers/nvme/host/fc.c             |  10
-rw-r--r--  drivers/nvme/host/multipath.c      |  10
-rw-r--r--  drivers/nvme/host/nvme.h           |   5
-rw-r--r--  drivers/nvme/host/pci.c            | 646
-rw-r--r--  drivers/nvme/host/rdma.c           |   2
-rw-r--r--  drivers/nvme/host/tcp.c            |  11
-rw-r--r--  drivers/nvme/target/core.c         |   2
-rw-r--r--  drivers/nvme/target/io-cmd-bdev.c  |   6
-rw-r--r--  drivers/nvme/target/nvmet.h        |   2
-rw-r--r--  drivers/nvme/target/passthru.c     |   4
-rw-r--r--  drivers/nvme/target/pci-epf.c      |  25
-rw-r--r--  drivers/nvme/target/tcp.c          |   4
-rw-r--r--  drivers/nvme/target/zns.c          |   2
16 files changed, 519 insertions(+), 359 deletions(-)
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index b1fddfa33ab9..1286c31320e6 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -301,8 +301,8 @@ static void apple_nvme_submit_cmd(struct apple_nvme_queue *q, memcpy(&q->sqes[tag], cmd, sizeof(*cmd)); /* - * This lock here doesn't make much sense at a first glace but - * removing it will result in occasional missed completetion + * This lock here doesn't make much sense at a first glance but + * removing it will result in occasional missed completion * interrupts even though the commands still appear on the CQ. * It's unclear why this happens but our best guess is that * there is a bug in the firmware triggered when a new command diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c index 1a0058be5821..dc90df9e13a2 100644 --- a/drivers/nvme/host/constants.c +++ b/drivers/nvme/host/constants.c @@ -133,7 +133,7 @@ static const char * const nvme_statuses[] = { [NVME_SC_NS_NOT_ATTACHED] = "Namespace Not Attached", [NVME_SC_THIN_PROV_NOT_SUPP] = "Thin Provisioning Not Supported", [NVME_SC_CTRL_LIST_INVALID] = "Controller List Invalid", - [NVME_SC_SELT_TEST_IN_PROGRESS] = "Device Self-test In Progress", + [NVME_SC_SELF_TEST_IN_PROGRESS] = "Device Self-test In Progress", [NVME_SC_BP_WRITE_PROHIBITED] = "Boot Partition Write Prohibited", [NVME_SC_CTRL_ID_INVALID] = "Invalid Controller Identifier", [NVME_SC_SEC_CTRL_STATE_INVALID] = "Invalid Secondary Controller State", @@ -145,7 +145,7 @@ static const char * const nvme_statuses[] = { [NVME_SC_BAD_ATTRIBUTES] = "Conflicting Attributes", [NVME_SC_INVALID_PI] = "Invalid Protection Information", [NVME_SC_READ_ONLY] = "Attempted Write to Read Only Range", - [NVME_SC_CMD_SIZE_LIM_EXCEEDED ] = "Command Size Limits Exceeded", + [NVME_SC_CMD_SIZE_LIM_EXCEEDED] = "Command Size Limits Exceeded", [NVME_SC_ZONE_BOUNDARY_ERROR] = "Zoned Boundary Error", [NVME_SC_ZONE_FULL] = "Zone Is Full", [NVME_SC_ZONE_READ_ONLY] = "Zone Is Read Only", diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 92697f98c601..9d988f4cb87a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -381,12 +381,12 @@ static void nvme_log_err_passthru(struct request *req) nr->status & NVME_SC_MASK, /* Status Code */ nr->status & NVME_STATUS_MORE ? "MORE " : "", nr->status & NVME_STATUS_DNR ? 
"DNR " : "", - nr->cmd->common.cdw10, - nr->cmd->common.cdw11, - nr->cmd->common.cdw12, - nr->cmd->common.cdw13, - nr->cmd->common.cdw14, - nr->cmd->common.cdw14); + le32_to_cpu(nr->cmd->common.cdw10), + le32_to_cpu(nr->cmd->common.cdw11), + le32_to_cpu(nr->cmd->common.cdw12), + le32_to_cpu(nr->cmd->common.cdw13), + le32_to_cpu(nr->cmd->common.cdw14), + le32_to_cpu(nr->cmd->common.cdw15)); } enum nvme_disposition { @@ -764,6 +764,10 @@ blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl, !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) && !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) return BLK_STS_RESOURCE; + + if (!(rq->rq_flags & RQF_DONTPREP)) + nvme_clear_nvme_request(rq); + return nvme_host_path_error(rq); } EXPORT_SYMBOL_GPL(nvme_fail_nonready_command); @@ -1866,8 +1870,11 @@ static bool nvme_init_integrity(struct nvme_ns_head *head, break; } - bi->tuple_size = head->ms; - bi->pi_offset = info->pi_offset; + bi->metadata_size = head->ms; + if (bi->csum_type) { + bi->pi_tuple_size = head->pi_size; + bi->pi_offset = info->pi_offset; + } return true; } @@ -2015,21 +2022,41 @@ static void nvme_configure_metadata(struct nvme_ctrl *ctrl, } -static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns, - struct nvme_id_ns *id, struct queue_limits *lim, - u32 bs, u32 atomic_bs) +static u32 nvme_configure_atomic_write(struct nvme_ns *ns, + struct nvme_id_ns *id, struct queue_limits *lim, u32 bs) { - unsigned int boundary = 0; + u32 atomic_bs, boundary = 0; - if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) { - if (le16_to_cpu(id->nabspf)) + /* + * We do not support an offset for the atomic boundaries. + */ + if (id->nabo) + return bs; + + if ((id->nsfeat & NVME_NS_FEAT_ATOMICS) && id->nawupf) { + /* + * Use the per-namespace atomic write unit when available. + */ + atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; + if (id->nabspf) boundary = (le16_to_cpu(id->nabspf) + 1) * bs; + } else { + /* + * Use the controller wide atomic write unit. This sucks + * because the limit is defined in terms of logical blocks while + * namespaces can have different formats, and because there is + * no clear language in the specification prohibiting different + * values for different controllers in the subsystem. + */ + atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; } + lim->atomic_write_hw_max = atomic_bs; lim->atomic_write_hw_boundary = boundary; lim->atomic_write_hw_unit_min = bs; lim->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs); lim->features |= BLK_FEAT_ATOMIC_WRITES; + return atomic_bs; } static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl) @@ -2067,34 +2094,8 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, valid = false; } - atomic_bs = phys_bs = bs; - if (id->nabo == 0) { - /* - * Bit 1 indicates whether NAWUPF is defined for this namespace - * and whether it should be used instead of AWUPF. If NAWUPF == - * 0 then AWUPF must be used instead. - */ - if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) - atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; - else - atomic_bs = (1 + ns->ctrl->awupf) * bs; - - /* - * Set subsystem atomic bs. - */ - if (ns->ctrl->subsys->atomic_bs) { - if (atomic_bs != ns->ctrl->subsys->atomic_bs) { - dev_err_ratelimited(ns->ctrl->device, - "%s: Inconsistent Atomic Write Size, Namespace will not be added: Subsystem=%d bytes, Controller/Namespace=%d bytes\n", - ns->disk ? 
ns->disk->disk_name : "?", - ns->ctrl->subsys->atomic_bs, - atomic_bs); - } - } else - ns->ctrl->subsys->atomic_bs = atomic_bs; - - nvme_update_atomic_write_disk_info(ns, id, lim, bs, atomic_bs); - } + phys_bs = bs; + atomic_bs = nvme_configure_atomic_write(ns, id, lim, bs); if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { /* NPWG = Namespace Preferred Write Granularity */ @@ -2382,16 +2383,6 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, if (!nvme_update_disk_info(ns, id, &lim)) capacity = 0; - /* - * Validate the max atomic write size fits within the subsystem's - * atomic write capabilities. - */ - if (lim.atomic_write_hw_max > ns->ctrl->subsys->atomic_bs) { - blk_mq_unfreeze_queue(ns->disk->queue, memflags); - ret = -ENXIO; - goto out; - } - nvme_config_discard(ns, &lim); if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && ns->head->ids.csi == NVME_CSI_ZNS) @@ -2420,22 +2411,24 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, else lim.write_stream_granularity = 0; - ret = queue_limits_commit_update(ns->disk->queue, &lim); - if (ret) { - blk_mq_unfreeze_queue(ns->disk->queue, memflags); - goto out; - } - - set_capacity_and_notify(ns->disk, capacity); - /* * Only set the DEAC bit if the device guarantees that reads from * deallocated data return zeroes. While the DEAC bit does not * require that, it must be a no-op if reads from deallocated data * do not return zeroes. */ - if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3))) + if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3))) { ns->head->features |= NVME_NS_DEAC; + lim.max_hw_wzeroes_unmap_sectors = lim.max_write_zeroes_sectors; + } + + ret = queue_limits_commit_update(ns->disk->queue, &lim); + if (ret) { + blk_mq_unfreeze_queue(ns->disk->queue, memflags); + goto out; + } + + set_capacity_and_notify(ns->disk, capacity); set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info)); set_bit(NVME_NS_READY, &ns->flags); blk_mq_unfreeze_queue(ns->disk->queue, memflags); @@ -3215,6 +3208,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) memcpy(subsys->model, id->mn, sizeof(subsys->model)); subsys->vendor_id = le16_to_cpu(id->vid); subsys->cmic = id->cmic; + subsys->awupf = le16_to_cpu(id->awupf); /* Versions prior to 1.4 don't necessarily report a valid type */ if (id->cntrltype == NVME_CTRL_DISC || @@ -3647,7 +3641,6 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) dev_pm_qos_expose_latency_tolerance(ctrl->device); else if (!ctrl->apst_enabled && prev_apst_enabled) dev_pm_qos_hide_latency_tolerance(ctrl->device); - ctrl->awupf = le16_to_cpu(id->awupf); out_free: kfree(id); return ret; @@ -4036,6 +4029,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info) list_add_tail_rcu(&ns->siblings, &head->list); ns->head = head; mutex_unlock(&ctrl->subsys->lock); + +#ifdef CONFIG_NVME_MULTIPATH + cancel_delayed_work(&head->remove_work); +#endif return 0; out_put_ns_head: @@ -4080,7 +4077,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns) return; } } - list_add(&ns->list, &ns->ctrl->namespaces); + list_add_rcu(&ns->list, &ns->ctrl->namespaces); } static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) @@ -4089,6 +4086,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) struct nvme_ns *ns; struct gendisk *disk; int node = ctrl->numa_node; + bool last_path = false; ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) @@ -4181,9 +4179,22 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info 
*info) out_unlink_ns: mutex_lock(&ctrl->subsys->lock); list_del_rcu(&ns->siblings); - if (list_empty(&ns->head->list)) + if (list_empty(&ns->head->list)) { list_del_init(&ns->head->entry); + /* + * If multipath is not configured, we still create a namespace + * head (nshead), but head->disk is not initialized in that + * case. As a result, only a single reference to nshead is held + * (via kref_init()) when it is created. Therefore, ensure that + * we do not release the reference to nshead twice if head->disk + * is not present. + */ + if (ns->head->disk) + last_path = true; + } mutex_unlock(&ctrl->subsys->lock); + if (last_path) + nvme_put_ns_head(ns->head); nvme_put_ns_head(ns->head); out_cleanup_disk: put_disk(disk); @@ -4289,7 +4300,7 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid) } /* - * If available try to use the Command Set Idependent Identify Namespace + * If available try to use the Command Set Independent Identify Namespace * data structure to find all the generic information that is needed to * set up a namespace. If not fall back to the legacy version. */ diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 014b387f1e8b..08a5ea3e9383 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -899,7 +899,7 @@ EXPORT_SYMBOL_GPL(nvme_fc_set_remoteport_devloss); * may crash. * * As such: - * Wrapper all the dma routines and check the dev pointer. + * Wrap all the dma routines and check the dev pointer. * * If simple mappings (return just a dma address, we'll noop them, * returning a dma address of 0. @@ -1955,8 +1955,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) } /* - * For the linux implementation, if we have an unsucceesful - * status, they blk-mq layer can typically be called with the + * For the linux implementation, if we have an unsuccessful + * status, the blk-mq layer can typically be called with the * non-zero status and the content of the cqe isn't important. */ if (status) @@ -2429,7 +2429,7 @@ static bool nvme_fc_terminate_exchange(struct request *req, void *data) /* * This routine runs through all outstanding commands on the association - * and aborts them. This routine is typically be called by the + * and aborts them. This routine is typically called by the * delete_association routine. It is also called due to an error during * reconnect. In that scenario, it is most likely a command that initializes * the controller, including fabric Connect commands on io queues, that @@ -2622,7 +2622,7 @@ nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq, * as part of the exchange. The CQE is the last thing for the io, * which is transferred (explicitly or implicitly) with the RSP IU * sent on the exchange. After the CQE is received, the FC exchange is - * terminaed and the Exchange may be used on a different io. + * terminated and the Exchange may be used on a different io. * * The transport to LLDD api has the transport making a request for a * new fcp io request to the LLDD. 
The LLDD then allocates a FC exchange diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index e040e467f9fa..3da980dc60d9 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -690,8 +690,8 @@ static void nvme_remove_head(struct nvme_ns_head *head) nvme_cdev_del(&head->cdev, &head->cdev_device); synchronize_srcu(&head->srcu); del_gendisk(head->disk); - nvme_put_ns_head(head); } + nvme_put_ns_head(head); } static void nvme_remove_head_work(struct work_struct *work) @@ -1200,7 +1200,8 @@ void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head) */ srcu_idx = srcu_read_lock(&head->srcu); - list_for_each_entry_rcu(ns, &head->list, siblings) { + list_for_each_entry_srcu(ns, &head->list, siblings, + srcu_read_lock_held(&head->srcu)) { /* * Ensure that ns path disk node is already added otherwise we * may get invalid kobj name for target @@ -1291,6 +1292,9 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) { bool remove = false; + if (!head->disk) + return; + mutex_lock(&head->subsys->lock); /* * We are called when all paths have been removed, and at that point @@ -1311,7 +1315,7 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) */ if (!try_module_get(THIS_MODULE)) goto out; - queue_delayed_work(nvme_wq, &head->remove_work, + mod_delayed_work(nvme_wq, &head->remove_work, head->delayed_removal_secs * HZ); } else { list_del_init(&head->entry); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a468cdc5b5cb..cfd2b5b90b91 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -69,7 +69,7 @@ enum nvme_quirks { NVME_QUIRK_IDENTIFY_CNS = (1 << 1), /* - * The controller deterministically returns O's on reads to + * The controller deterministically returns 0's on reads to * logical blocks that deallocate was called on. */ NVME_QUIRK_DEALLOCATE_ZEROES = (1 << 2), @@ -410,7 +410,6 @@ struct nvme_ctrl { enum nvme_ctrl_type cntrltype; enum nvme_dctype dctype; - u16 awupf; /* 0's based value. */ }; static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl) @@ -443,11 +442,11 @@ struct nvme_subsystem { u8 cmic; enum nvme_subsys_type subtype; u16 vendor_id; + u16 awupf; /* 0's based value. */ struct ida ns_ida; #ifdef CONFIG_NVME_MULTIPATH enum nvme_iopolicy iopolicy; #endif - u32 atomic_bs; }; /* diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8ff12e415cb5..071efec25346 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -7,7 +7,7 @@ #include <linux/acpi.h> #include <linux/async.h> #include <linux/blkdev.h> -#include <linux/blk-mq.h> +#include <linux/blk-mq-dma.h> #include <linux/blk-integrity.h> #include <linux/dmi.h> #include <linux/init.h> @@ -27,7 +27,6 @@ #include <linux/io-64-nonatomic-lo-hi.h> #include <linux/io-64-nonatomic-hi-lo.h> #include <linux/sed-opal.h> -#include <linux/pci-p2pdma.h> #include "trace.h" #include "nvme.h" @@ -39,20 +38,17 @@ #define NVME_SMALL_POOL_SIZE 256 /* - * These can be higher, but we need to ensure that any command doesn't - * require an sg allocation that needs more than a page of data. + * Arbitrary upper bound. */ -#define NVME_MAX_KB_SZ 8192 +#define NVME_MAX_BYTES SZ_8M #define NVME_MAX_NR_DESCRIPTORS 5 /* - * For data SGLs we support a single descriptors worth of SGL entries, but for - * now we also limit it to avoid an allocation larger than PAGE_SIZE for the - * scatterlist. + * For data SGLs we support a single descriptors worth of SGL entries. + * For PRPs, segments don't matter at all. 
*/ #define NVME_MAX_SEGS \ - min(NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc), \ - (PAGE_SIZE / sizeof(struct scatterlist))) + (NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc)) /* * For metadata SGLs, only the small descriptor is supported, and the first @@ -61,6 +57,21 @@ #define NVME_MAX_META_SEGS \ ((NVME_SMALL_POOL_SIZE / sizeof(struct nvme_sgl_desc)) - 1) +/* + * The last entry is used to link to the next descriptor. + */ +#define PRPS_PER_PAGE \ + (((NVME_CTRL_PAGE_SIZE / sizeof(__le64))) - 1) + +/* + * I/O could be non-aligned both at the beginning and end. + */ +#define MAX_PRP_RANGE \ + (NVME_MAX_BYTES + 2 * (NVME_CTRL_PAGE_SIZE - 1)) + +static_assert(MAX_PRP_RANGE / NVME_CTRL_PAGE_SIZE <= + (1 /* prp1 */ + NVME_MAX_NR_DESCRIPTORS * PRPS_PER_PAGE)); + static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0444); @@ -97,7 +108,7 @@ static int io_queue_count_set(const char *val, const struct kernel_param *kp) int ret; ret = kstrtouint(val, 10, &n); - if (ret != 0 || n > num_possible_cpus()) + if (ret != 0 || n > blk_mq_num_possible_queues(0)) return -EINVAL; return param_set_uint(val, kp); } @@ -162,7 +173,7 @@ struct nvme_dev { bool hmb; struct sg_table *hmb_sgt; - mempool_t *iod_mempool; + mempool_t *dmavec_mempool; mempool_t *iod_meta_mempool; /* shadow doorbell buffer support: */ @@ -246,7 +257,15 @@ enum nvme_iod_flags { IOD_ABORTED = 1U << 0, /* uses the small descriptor pool */ - IOD_SMALL_DESCRIPTOR = 1U << 1, + IOD_SMALL_DESCRIPTOR = 1U << 1, + + /* single segment dma mapping */ + IOD_SINGLE_SEGMENT = 1U << 2, +}; + +struct nvme_dma_vec { + dma_addr_t addr; + unsigned int len; }; /* @@ -257,13 +276,16 @@ struct nvme_iod { struct nvme_command cmd; u8 flags; u8 nr_descriptors; - unsigned int dma_len; /* length of single DMA segment mapping */ - dma_addr_t first_dma; + + unsigned int total_len; + struct dma_iova_state dma_state; + void *descriptors[NVME_MAX_NR_DESCRIPTORS]; + struct nvme_dma_vec *dma_vecs; + unsigned int nr_dma_vecs; + dma_addr_t meta_dma; - struct sg_table sgt; struct sg_table meta_sgt; struct nvme_sgl_desc *meta_descriptor; - void *descriptors[NVME_MAX_NR_DESCRIPTORS]; }; static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev) @@ -406,18 +428,6 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db, return true; } -/* - * Will slightly overestimate the number of pages needed. This is OK - * as it only leads to a small amount of wasted memory for the lifetime of - * the I/O. 
- */ -static __always_inline int nvme_pci_npages_prp(void) -{ - unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE; - unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE); - return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8); -} - static struct nvme_descriptor_pools * nvme_setup_descriptor_pools(struct nvme_dev *dev, unsigned numa_node) { @@ -578,32 +588,49 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx) spin_unlock(&nvmeq->sq_lock); } -static inline bool nvme_pci_metadata_use_sgls(struct nvme_dev *dev, - struct request *req) +enum nvme_use_sgl { + SGL_UNSUPPORTED, + SGL_SUPPORTED, + SGL_FORCED, +}; + +static inline bool nvme_pci_metadata_use_sgls(struct request *req) { + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct nvme_dev *dev = nvmeq->dev; + if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl)) return false; return req->nr_integrity_segments > 1 || nvme_req(req)->flags & NVME_REQ_USERCMD; } -static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req, - int nseg) +static inline enum nvme_use_sgl nvme_pci_use_sgls(struct nvme_dev *dev, + struct request *req) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; - unsigned int avg_seg_size; - avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg); + if (nvmeq->qid && nvme_ctrl_sgl_supported(&dev->ctrl)) { + if (nvme_req(req)->flags & NVME_REQ_USERCMD) + return SGL_FORCED; + if (req->nr_integrity_segments > 1) + return SGL_FORCED; + return SGL_SUPPORTED; + } - if (!nvme_ctrl_sgl_supported(&dev->ctrl)) - return false; - if (!nvmeq->qid) - return false; - if (nvme_pci_metadata_use_sgls(dev, req)) - return true; - if (!sgl_threshold || avg_seg_size < sgl_threshold) - return nvme_req(req)->flags & NVME_REQ_USERCMD; - return true; + return SGL_UNSUPPORTED; +} + +static unsigned int nvme_pci_avg_seg_size(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + unsigned int nseg; + + if (blk_rq_dma_map_coalesce(&iod->dma_state)) + nseg = 1; + else + nseg = blk_rq_nr_phys_segments(req); + return DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg); } static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq, @@ -614,11 +641,25 @@ static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq, return nvmeq->descriptor_pools.large; } -static void nvme_free_descriptors(struct nvme_queue *nvmeq, struct request *req) +static inline bool nvme_pci_cmd_use_sgl(struct nvme_command *cmd) +{ + return cmd->common.flags & + (NVME_CMD_SGL_METABUF | NVME_CMD_SGL_METASEG); +} + +static inline dma_addr_t nvme_pci_first_desc_dma_addr(struct nvme_command *cmd) { + if (nvme_pci_cmd_use_sgl(cmd)) + return le64_to_cpu(cmd->common.dptr.sgl.addr); + return le64_to_cpu(cmd->common.dptr.prp2); +} + +static void nvme_free_descriptors(struct request *req) +{ + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - dma_addr_t dma_addr = iod->first_dma; + dma_addr_t dma_addr = nvme_pci_first_desc_dma_addr(&iod->cmd); int i; if (iod->nr_descriptors == 1) { @@ -637,68 +678,130 @@ static void nvme_free_descriptors(struct nvme_queue *nvmeq, struct request *req) } } -static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_queue *nvmeq, - struct request *req) +static void nvme_free_prps(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + unsigned int i; + + for (i = 0; i < 
iod->nr_dma_vecs; i++) + dma_unmap_page(nvmeq->dev->dev, iod->dma_vecs[i].addr, + iod->dma_vecs[i].len, rq_dma_dir(req)); + mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool); +} + +static void nvme_free_sgls(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct device *dma_dev = nvmeq->dev->dev; + dma_addr_t sqe_dma_addr = le64_to_cpu(iod->cmd.common.dptr.sgl.addr); + unsigned int sqe_dma_len = le32_to_cpu(iod->cmd.common.dptr.sgl.length); + struct nvme_sgl_desc *sg_list = iod->descriptors[0]; + enum dma_data_direction dir = rq_dma_dir(req); + + if (iod->nr_descriptors) { + unsigned int nr_entries = sqe_dma_len / sizeof(*sg_list), i; + + for (i = 0; i < nr_entries; i++) + dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr), + le32_to_cpu(sg_list[i].length), dir); + } else { + dma_unmap_page(dma_dev, sqe_dma_addr, sqe_dma_len, dir); + } +} + +static void nvme_unmap_data(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct device *dma_dev = nvmeq->dev->dev; - if (iod->dma_len) { - dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len, - rq_dma_dir(req)); + if (iod->flags & IOD_SINGLE_SEGMENT) { + static_assert(offsetof(union nvme_data_ptr, prp1) == + offsetof(union nvme_data_ptr, sgl.addr)); + dma_unmap_page(dma_dev, le64_to_cpu(iod->cmd.common.dptr.prp1), + iod->total_len, rq_dma_dir(req)); return; } - WARN_ON_ONCE(!iod->sgt.nents); + if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) { + if (nvme_pci_cmd_use_sgl(&iod->cmd)) + nvme_free_sgls(req); + else + nvme_free_prps(req); + } - dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); - nvme_free_descriptors(nvmeq, req); - mempool_free(iod->sgt.sgl, dev->iod_mempool); + if (iod->nr_descriptors) + nvme_free_descriptors(req); } -static void nvme_print_sgl(struct scatterlist *sgl, int nents) +static bool nvme_pci_prp_iter_next(struct request *req, struct device *dma_dev, + struct blk_dma_iter *iter) { - int i; - struct scatterlist *sg; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - for_each_sg(sgl, sg, nents, i) { - dma_addr_t phys = sg_phys(sg); - pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d " - "dma_address:%pad dma_length:%d\n", - i, &phys, sg->offset, sg->length, &sg_dma_address(sg), - sg_dma_len(sg)); + if (iter->len) + return true; + if (!blk_rq_dma_map_iter_next(req, dma_dev, &iod->dma_state, iter)) + return false; + if (!dma_use_iova(&iod->dma_state) && dma_need_unmap(dma_dev)) { + iod->dma_vecs[iod->nr_dma_vecs].addr = iter->addr; + iod->dma_vecs[iod->nr_dma_vecs].len = iter->len; + iod->nr_dma_vecs++; } + return true; } -static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq, - struct request *req, struct nvme_rw_command *cmnd) +static blk_status_t nvme_pci_setup_data_prp(struct request *req, + struct blk_dma_iter *iter) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - int length = blk_rq_payload_bytes(req); - struct scatterlist *sg = iod->sgt.sgl; - int dma_len = sg_dma_len(sg); - u64 dma_addr = sg_dma_address(sg); - int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + unsigned int length = blk_rq_payload_bytes(req); + dma_addr_t prp1_dma, prp2_dma = 0; + unsigned int prp_len, i; __le64 *prp_list; - dma_addr_t prp_dma; - int i; - length -= (NVME_CTRL_PAGE_SIZE - offset); - if (length <= 0) { - iod->first_dma = 0; - goto done; + if (!dma_use_iova(&iod->dma_state) && 
dma_need_unmap(nvmeq->dev->dev)) { + iod->dma_vecs = mempool_alloc(nvmeq->dev->dmavec_mempool, + GFP_ATOMIC); + if (!iod->dma_vecs) + return BLK_STS_RESOURCE; + iod->dma_vecs[0].addr = iter->addr; + iod->dma_vecs[0].len = iter->len; + iod->nr_dma_vecs = 1; } - dma_len -= (NVME_CTRL_PAGE_SIZE - offset); - if (dma_len) { - dma_addr += (NVME_CTRL_PAGE_SIZE - offset); - } else { - sg = sg_next(sg); - dma_addr = sg_dma_address(sg); - dma_len = sg_dma_len(sg); + /* + * PRP1 always points to the start of the DMA transfers. + * + * This is the only PRP (except for the list entries) that could be + * non-aligned. + */ + prp1_dma = iter->addr; + prp_len = min(length, NVME_CTRL_PAGE_SIZE - + (iter->addr & (NVME_CTRL_PAGE_SIZE - 1))); + iod->total_len += prp_len; + iter->addr += prp_len; + iter->len -= prp_len; + length -= prp_len; + if (!length) + goto done; + + if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) { + if (WARN_ON_ONCE(!iter->status)) + goto bad_sgl; + goto done; } + /* + * PRP2 is usually a list, but can point to data if all data to be + * transferred fits into PRP1 + PRP2: + */ if (length <= NVME_CTRL_PAGE_SIZE) { - iod->first_dma = dma_addr; + prp2_dma = iter->addr; + iod->total_len += length; goto done; } @@ -707,58 +810,80 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq, iod->flags |= IOD_SMALL_DESCRIPTOR; prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC, - &prp_dma); - if (!prp_list) - return BLK_STS_RESOURCE; + &prp2_dma); + if (!prp_list) { + iter->status = BLK_STS_RESOURCE; + goto done; + } iod->descriptors[iod->nr_descriptors++] = prp_list; - iod->first_dma = prp_dma; + i = 0; for (;;) { + prp_list[i++] = cpu_to_le64(iter->addr); + prp_len = min(length, NVME_CTRL_PAGE_SIZE); + if (WARN_ON_ONCE(iter->len < prp_len)) + goto bad_sgl; + + iod->total_len += prp_len; + iter->addr += prp_len; + iter->len -= prp_len; + length -= prp_len; + if (!length) + break; + + if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) { + if (WARN_ON_ONCE(!iter->status)) + goto bad_sgl; + goto done; + } + + /* + * If we've filled the entire descriptor, allocate a new that is + * pointed to be the last entry in the previous PRP list. To + * accommodate for that move the last actual entry to the new + * descriptor. + */ if (i == NVME_CTRL_PAGE_SIZE >> 3) { __le64 *old_prp_list = prp_list; + dma_addr_t prp_list_dma; prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large, - GFP_ATOMIC, &prp_dma); - if (!prp_list) - goto free_prps; + GFP_ATOMIC, &prp_list_dma); + if (!prp_list) { + iter->status = BLK_STS_RESOURCE; + goto done; + } iod->descriptors[iod->nr_descriptors++] = prp_list; + prp_list[0] = old_prp_list[i - 1]; - old_prp_list[i - 1] = cpu_to_le64(prp_dma); + old_prp_list[i - 1] = cpu_to_le64(prp_list_dma); i = 1; } - prp_list[i++] = cpu_to_le64(dma_addr); - dma_len -= NVME_CTRL_PAGE_SIZE; - dma_addr += NVME_CTRL_PAGE_SIZE; - length -= NVME_CTRL_PAGE_SIZE; - if (length <= 0) - break; - if (dma_len > 0) - continue; - if (unlikely(dma_len < 0)) - goto bad_sgl; - sg = sg_next(sg); - dma_addr = sg_dma_address(sg); - dma_len = sg_dma_len(sg); } + done: - cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sgt.sgl)); - cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); - return BLK_STS_OK; -free_prps: - nvme_free_descriptors(nvmeq, req); - return BLK_STS_RESOURCE; + /* + * nvme_unmap_data uses the DPT field in the SQE to tear down the + * mapping, so initialize it even for failures. 
+ */ + iod->cmd.common.dptr.prp1 = cpu_to_le64(prp1_dma); + iod->cmd.common.dptr.prp2 = cpu_to_le64(prp2_dma); + if (unlikely(iter->status)) + nvme_unmap_data(req); + return iter->status; + bad_sgl: - WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents), - "Invalid SGL for payload:%d nents:%d\n", - blk_rq_payload_bytes(req), iod->sgt.nents); + dev_err_once(nvmeq->dev->dev, + "Incorrectly formed request for payload:%d nents:%d\n", + blk_rq_payload_bytes(req), blk_rq_nr_phys_segments(req)); return BLK_STS_IOERR; } static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge, - struct scatterlist *sg) + struct blk_dma_iter *iter) { - sge->addr = cpu_to_le64(sg_dma_address(sg)); - sge->length = cpu_to_le32(sg_dma_len(sg)); + sge->addr = cpu_to_le64(iter->addr); + sge->length = cpu_to_le32(iter->len); sge->type = NVME_SGL_FMT_DATA_DESC << 4; } @@ -770,21 +895,22 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; } -static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq, - struct request *req, struct nvme_rw_command *cmd) +static blk_status_t nvme_pci_setup_data_sgl(struct request *req, + struct blk_dma_iter *iter) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + unsigned int entries = blk_rq_nr_phys_segments(req); struct nvme_sgl_desc *sg_list; - struct scatterlist *sg = iod->sgt.sgl; - unsigned int entries = iod->sgt.nents; dma_addr_t sgl_dma; - int i = 0; + unsigned int mapped = 0; - /* setting the transfer type as SGL */ - cmd->flags = NVME_CMD_SGL_METABUF; + /* set the transfer type as SGL */ + iod->cmd.common.flags = NVME_CMD_SGL_METABUF; - if (entries == 1) { - nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg); + if (entries == 1 || blk_rq_dma_map_coalesce(&iod->dma_state)) { + nvme_pci_sgl_set_data(&iod->cmd.common.dptr.sgl, iter); + iod->total_len += iter->len; return BLK_STS_OK; } @@ -796,119 +922,104 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq, if (!sg_list) return BLK_STS_RESOURCE; iod->descriptors[iod->nr_descriptors++] = sg_list; - iod->first_dma = sgl_dma; - nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); do { - nvme_pci_sgl_set_data(&sg_list[i++], sg); - sg = sg_next(sg); - } while (--entries > 0); + if (WARN_ON_ONCE(mapped == entries)) { + iter->status = BLK_STS_IOERR; + break; + } + nvme_pci_sgl_set_data(&sg_list[mapped++], iter); + iod->total_len += iter->len; + } while (blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, &iod->dma_state, + iter)); - return BLK_STS_OK; + nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped); + if (unlikely(iter->status)) + nvme_free_sgls(req); + return iter->status; } -static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev, - struct request *req, struct nvme_rw_command *cmnd, - struct bio_vec *bv) +static blk_status_t nvme_pci_setup_data_simple(struct request *req, + enum nvme_use_sgl use_sgl) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1); - unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset; - - iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0); - if (dma_mapping_error(dev->dev, iod->first_dma)) + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct bio_vec bv = req_bvec(req); + unsigned int prp1_offset = bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1); + bool prp_possible = prp1_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2; + dma_addr_t dma_addr; + + if (!use_sgl && !prp_possible) + 
return BLK_STS_AGAIN; + if (is_pci_p2pdma_page(bv.bv_page)) + return BLK_STS_AGAIN; + + dma_addr = dma_map_bvec(nvmeq->dev->dev, &bv, rq_dma_dir(req), 0); + if (dma_mapping_error(nvmeq->dev->dev, dma_addr)) return BLK_STS_RESOURCE; - iod->dma_len = bv->bv_len; - - cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma); - if (bv->bv_len > first_prp_len) - cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len); - else - cmnd->dptr.prp2 = 0; - return BLK_STS_OK; -} - -static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev, - struct request *req, struct nvme_rw_command *cmnd, - struct bio_vec *bv) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + iod->total_len = bv.bv_len; + iod->flags |= IOD_SINGLE_SEGMENT; + + if (use_sgl == SGL_FORCED || !prp_possible) { + iod->cmd.common.flags = NVME_CMD_SGL_METABUF; + iod->cmd.common.dptr.sgl.addr = cpu_to_le64(dma_addr); + iod->cmd.common.dptr.sgl.length = cpu_to_le32(bv.bv_len); + iod->cmd.common.dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4; + } else { + unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - prp1_offset; - iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0); - if (dma_mapping_error(dev->dev, iod->first_dma)) - return BLK_STS_RESOURCE; - iod->dma_len = bv->bv_len; + iod->cmd.common.dptr.prp1 = cpu_to_le64(dma_addr); + iod->cmd.common.dptr.prp2 = 0; + if (bv.bv_len > first_prp_len) + iod->cmd.common.dptr.prp2 = + cpu_to_le64(dma_addr + first_prp_len); + } - cmnd->flags = NVME_CMD_SGL_METABUF; - cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma); - cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len); - cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4; return BLK_STS_OK; } -static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, - struct nvme_command *cmnd) +static blk_status_t nvme_map_data(struct request *req) { - struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - blk_status_t ret = BLK_STS_RESOURCE; - int rc; + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct nvme_dev *dev = nvmeq->dev; + enum nvme_use_sgl use_sgl = nvme_pci_use_sgls(dev, req); + struct blk_dma_iter iter; + blk_status_t ret; + /* + * Try to skip the DMA iterator for single segment requests, as that + * significantly improves performances for small I/O sizes. 
+ */ if (blk_rq_nr_phys_segments(req) == 1) { - struct bio_vec bv = req_bvec(req); - - if (!is_pci_p2pdma_page(bv.bv_page)) { - if (!nvme_pci_metadata_use_sgls(dev, req) && - (bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) + - bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) - return nvme_setup_prp_simple(dev, req, - &cmnd->rw, &bv); - - if (nvmeq->qid && sgl_threshold && - nvme_ctrl_sgl_supported(&dev->ctrl)) - return nvme_setup_sgl_simple(dev, req, - &cmnd->rw, &bv); - } + ret = nvme_pci_setup_data_simple(req, use_sgl); + if (ret != BLK_STS_AGAIN) + return ret; } - iod->dma_len = 0; - iod->sgt.sgl = mempool_alloc(dev->iod_mempool, GFP_ATOMIC); - if (!iod->sgt.sgl) - return BLK_STS_RESOURCE; - sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req)); - iod->sgt.orig_nents = blk_rq_map_sg(req, iod->sgt.sgl); - if (!iod->sgt.orig_nents) - goto out_free_sg; - - rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), - DMA_ATTR_NO_WARN); - if (rc) { - if (rc == -EREMOTEIO) - ret = BLK_STS_TARGET; - goto out_free_sg; - } + if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter)) + return iter.status; - if (nvme_pci_use_sgls(dev, req, iod->sgt.nents)) - ret = nvme_pci_setup_sgls(nvmeq, req, &cmnd->rw); - else - ret = nvme_pci_setup_prps(nvmeq, req, &cmnd->rw); - if (ret != BLK_STS_OK) - goto out_unmap_sg; - return BLK_STS_OK; + if (use_sgl == SGL_FORCED || + (use_sgl == SGL_SUPPORTED && + (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold))) + return nvme_pci_setup_data_sgl(req, &iter); + return nvme_pci_setup_data_prp(req, &iter); +} -out_unmap_sg: - dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); -out_free_sg: - mempool_free(iod->sgt.sgl, dev->iod_mempool); - return ret; +static void nvme_pci_sgl_set_data_sg(struct nvme_sgl_desc *sge, + struct scatterlist *sg) +{ + sge->addr = cpu_to_le64(sg_dma_address(sg)); + sge->length = cpu_to_le32(sg_dma_len(sg)); + sge->type = NVME_SGL_FMT_DATA_DESC << 4; } -static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev, - struct request *req) +static blk_status_t nvme_pci_setup_meta_sgls(struct request *req) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct nvme_dev *dev = nvmeq->dev; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_rw_command *cmnd = &iod->cmd.rw; struct nvme_sgl_desc *sg_list; struct scatterlist *sgl, *sg; unsigned int entries; @@ -939,19 +1050,19 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev, iod->meta_descriptor = sg_list; iod->meta_dma = sgl_dma; - cmnd->flags = NVME_CMD_SGL_METASEG; - cmnd->metadata = cpu_to_le64(sgl_dma); + iod->cmd.common.flags = NVME_CMD_SGL_METASEG; + iod->cmd.common.metadata = cpu_to_le64(sgl_dma); sgl = iod->meta_sgt.sgl; if (entries == 1) { - nvme_pci_sgl_set_data(sg_list, sgl); + nvme_pci_sgl_set_data_sg(sg_list, sgl); return BLK_STS_OK; } sgl_dma += sizeof(*sg_list); nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries); for_each_sg(sgl, sg, entries, i) - nvme_pci_sgl_set_data(&sg_list[i + 1], sg); + nvme_pci_sgl_set_data_sg(&sg_list[i + 1], sg); return BLK_STS_OK; @@ -962,38 +1073,37 @@ out_free_sg: return BLK_STS_RESOURCE; } -static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev, - struct request *req) +static blk_status_t nvme_pci_setup_meta_mptr(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct bio_vec bv = rq_integrity_vec(req); - struct nvme_command *cmnd = &iod->cmd; - iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0); - 
if (dma_mapping_error(dev->dev, iod->meta_dma)) + iod->meta_dma = dma_map_bvec(nvmeq->dev->dev, &bv, rq_dma_dir(req), 0); + if (dma_mapping_error(nvmeq->dev->dev, iod->meta_dma)) return BLK_STS_IOERR; - cmnd->rw.metadata = cpu_to_le64(iod->meta_dma); + iod->cmd.common.metadata = cpu_to_le64(iod->meta_dma); return BLK_STS_OK; } -static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req) +static blk_status_t nvme_map_metadata(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); if ((iod->cmd.common.flags & NVME_CMD_SGL_METABUF) && - nvme_pci_metadata_use_sgls(dev, req)) - return nvme_pci_setup_meta_sgls(dev, req); - return nvme_pci_setup_meta_mptr(dev, req); + nvme_pci_metadata_use_sgls(req)) + return nvme_pci_setup_meta_sgls(req); + return nvme_pci_setup_meta_mptr(req); } -static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) +static blk_status_t nvme_prep_rq(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); blk_status_t ret; iod->flags = 0; iod->nr_descriptors = 0; - iod->sgt.nents = 0; + iod->total_len = 0; iod->meta_sgt.nents = 0; ret = nvme_setup_cmd(req->q->queuedata, req); @@ -1001,13 +1111,13 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) return ret; if (blk_rq_nr_phys_segments(req)) { - ret = nvme_map_data(dev, req, &iod->cmd); + ret = nvme_map_data(req); if (ret) goto out_free_cmd; } if (blk_integrity_rq(req)) { - ret = nvme_map_metadata(dev, req); + ret = nvme_map_metadata(req); if (ret) goto out_unmap_data; } @@ -1016,7 +1126,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) return BLK_STS_OK; out_unmap_data: if (blk_rq_nr_phys_segments(req)) - nvme_unmap_data(dev, req->mq_hctx->driver_data, req); + nvme_unmap_data(req); out_free_cmd: nvme_cleanup_cmd(req); return ret; @@ -1041,7 +1151,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (unlikely(!nvme_check_ready(&dev->ctrl, req, true))) return nvme_fail_nonready_command(&dev->ctrl, req); - ret = nvme_prep_rq(dev, req); + ret = nvme_prep_rq(req); if (unlikely(ret)) return ret; spin_lock(&nvmeq->sq_lock); @@ -1079,7 +1189,7 @@ static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req) if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true))) return false; - return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK; + return nvme_prep_rq(req) == BLK_STS_OK; } static void nvme_queue_rqs(struct rq_list *rqlist) @@ -1105,11 +1215,11 @@ static void nvme_queue_rqs(struct rq_list *rqlist) *rqlist = requeue_list; } -static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev, - struct nvme_queue *nvmeq, - struct request *req) +static __always_inline void nvme_unmap_metadata(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct nvme_dev *dev = nvmeq->dev; if (!iod->meta_sgt.nents) { dma_unmap_page(dev->dev, iod->meta_dma, @@ -1126,14 +1236,10 @@ static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev, static __always_inline void nvme_pci_unmap_rq(struct request *req) { - struct nvme_queue *nvmeq = req->mq_hctx->driver_data; - struct nvme_dev *dev = nvmeq->dev; - if (blk_integrity_rq(req)) - nvme_unmap_metadata(dev, nvmeq, req); - + nvme_unmap_metadata(req); if (blk_rq_nr_phys_segments(req)) - nvme_unmap_data(dev, nvmeq, req); + nvme_unmap_data(req); } static void nvme_pci_complete_rq(struct request *req) @@ -1958,8 +2064,28 @@ static int 
nvme_pci_configure_admin_queue(struct nvme_dev *dev) * might be pointing at! */ result = nvme_disable_ctrl(&dev->ctrl, false); - if (result < 0) - return result; + if (result < 0) { + struct pci_dev *pdev = to_pci_dev(dev->dev); + + /* + * The NVMe Controller Reset method did not get an expected + * CSTS.RDY transition, so something with the device appears to + * be stuck. Use the lower level and bigger hammer PCIe + * Function Level Reset to attempt restoring the device to its + * initial state, and try again. + */ + result = pcie_reset_flr(pdev, false); + if (result < 0) + return result; + + pci_restore_state(pdev); + result = nvme_disable_ctrl(&dev->ctrl, false); + if (result < 0) + return result; + + dev_info(dev->ctrl.device, + "controller reset completed after pcie flr\n"); + } result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); if (result) @@ -2101,8 +2227,6 @@ static void nvme_map_cmb(struct nvme_dev *dev) if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) == (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) pci_p2pmem_publish(pdev, true); - - nvme_update_attrs(dev); } static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) @@ -2333,7 +2457,7 @@ static ssize_t cmb_show(struct device *dev, struct device_attribute *attr, { struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); - return sysfs_emit(buf, "cmbloc : x%08x\ncmbsz : x%08x\n", + return sysfs_emit(buf, "cmbloc : 0x%08x\ncmbsz : 0x%08x\n", ndev->cmbloc, ndev->cmbsz); } static DEVICE_ATTR_RO(cmb); @@ -2520,7 +2644,8 @@ static unsigned int nvme_max_io_queues(struct nvme_dev *dev) */ if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) return 1; - return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues; + return blk_mq_num_possible_queues(0) + dev->nr_write_queues + + dev->nr_poll_queues; } static int nvme_setup_io_queues(struct nvme_dev *dev) @@ -2915,13 +3040,13 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) { size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1); - size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS; + size_t alloc_size = sizeof(struct nvme_dma_vec) * NVME_MAX_SEGS; - dev->iod_mempool = mempool_create_node(1, + dev->dmavec_mempool = mempool_create_node(1, mempool_kmalloc, mempool_kfree, (void *)alloc_size, GFP_KERNEL, dev_to_node(dev->dev)); - if (!dev->iod_mempool) + if (!dev->dmavec_mempool) return -ENOMEM; dev->iod_meta_mempool = mempool_create_node(1, @@ -2930,10 +3055,9 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) dev_to_node(dev->dev)); if (!dev->iod_meta_mempool) goto free; - return 0; free: - mempool_destroy(dev->iod_mempool); + mempool_destroy(dev->dmavec_mempool); return -ENOMEM; } @@ -3010,6 +3134,8 @@ static void nvme_reset_work(struct work_struct *work) if (result < 0) goto out; + nvme_update_attrs(dev); + result = nvme_setup_io_queues(dev); if (result) goto out; @@ -3272,7 +3398,8 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, * over a single page. 
*/ dev->ctrl.max_hw_sectors = min_t(u32, - NVME_MAX_KB_SZ << 1, dma_opt_mapping_size(&pdev->dev) >> 9); + NVME_MAX_BYTES >> SECTOR_SHIFT, + dma_opt_mapping_size(&pdev->dev) >> 9); dev->ctrl.max_segments = NVME_MAX_SEGS; dev->ctrl.max_integrity_segments = 1; return dev; @@ -3343,6 +3470,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result < 0) goto out_disable; + nvme_update_attrs(dev); + result = nvme_setup_io_queues(dev); if (result) goto out_disable; @@ -3378,7 +3507,7 @@ out_disable: nvme_dbbuf_dma_free(dev); nvme_free_queues(dev, 0); out_release_iod_mempool: - mempool_destroy(dev->iod_mempool); + mempool_destroy(dev->dmavec_mempool); mempool_destroy(dev->iod_meta_mempool); out_dev_unmap: nvme_dev_unmap(dev); @@ -3442,7 +3571,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dev_remove_admin(dev); nvme_dbbuf_dma_free(dev); nvme_free_queues(dev, 0); - mempool_destroy(dev->iod_mempool); + mempool_destroy(dev->dmavec_mempool); mempool_destroy(dev->iod_meta_mempool); nvme_release_descriptor_pools(dev); nvme_dev_unmap(dev); @@ -3845,7 +3974,6 @@ static int __init nvme_init(void) BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); - BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_DESCRIPTORS); return pci_register_driver(&nvme_driver); } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 9bd3646568d0..190a4cfa8a5e 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -877,7 +877,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) /* * Only start IO queues for which we have allocated the tagset - * and limitted it to the available queues. On reconnects, the + * and limited it to the available queues. On reconnects, the * queue number might have changed. 
*/ nr_queues = min(ctrl->tag_set.nr_hw_queues + 1, ctrl->ctrl.queue_count); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index d924008c3949..9233f088fac8 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1745,9 +1745,14 @@ static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl, qid, ret); tls_handshake_cancel(queue->sock->sk); } else { - dev_dbg(nctrl->device, - "queue %d: TLS handshake complete, error %d\n", - qid, queue->tls_err); + if (queue->tls_err) { + dev_err(nctrl->device, + "queue %d: TLS handshake complete, error %d\n", + qid, queue->tls_err); + } else { + dev_dbg(nctrl->device, + "queue %d: TLS handshake complete\n", qid); + } ret = queue->tls_err; } return ret; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 175c5b6d4dd5..884286f90688 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -581,8 +581,6 @@ int nvmet_ns_enable(struct nvmet_ns *ns) if (ns->enabled) goto out_unlock; - ret = -EMFILE; - ret = nvmet_bdev_ns_enable(ns); if (ret == -ENOTBLK) ret = nvmet_file_ns_enable(ns); diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index eba42df2f821..8d246b8ca604 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -46,6 +46,10 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) id->npda = id->npdg; /* NOWS = Namespace Optimal Write Size */ id->nows = to0based(bdev_io_opt(bdev) / bdev_logical_block_size(bdev)); + + /* Set WZDS and DRB if device supports unmapped write zeroes */ + if (bdev_write_zeroes_unmap_sectors(bdev)) + id->dlfeat = (1 << 3) | 0x1; } void nvmet_bdev_ns_disable(struct nvmet_ns *ns) @@ -65,7 +69,7 @@ static void nvmet_bdev_ns_enable_integrity(struct nvmet_ns *ns) return; if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC) { - ns->metadata_size = bi->tuple_size; + ns->metadata_size = bi->metadata_size; if (bi->flags & BLK_INTEGRITY_REF_TAG) ns->pi_type = NVME_NS_DPS_PI_TYPE1; else diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index df69a9dee71c..51df72f5e89b 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -867,6 +867,8 @@ static inline void nvmet_req_bio_put(struct nvmet_req *req, struct bio *bio) { if (bio != &req->b.inline_bio) bio_put(bio); + else + bio_uninit(bio); } #ifdef CONFIG_NVME_TARGET_TCP_TLS diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index b7515c53829b..3b4b0df8f879 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -106,7 +106,7 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) pctrl->max_hw_sectors); /* - * nvmet_passthru_map_sg is limitted to using a single bio so limit + * nvmet_passthru_map_sg is limited to using a single bio so limit * the mdts based on BIO_MAX_VECS as well */ max_hw_sectors = min_not_zero(BIO_MAX_VECS << PAGE_SECTORS_SHIFT, @@ -147,7 +147,7 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) * When passthru controller is setup using nvme-loop transport it will * export the passthru ctrl subsysnqn (PCIe NVMe ctrl) and will fail in * the nvme/host/core.c in the nvme_init_subsystem()->nvme_active_ctrl() - * code path with duplicate ctr subsynqn. In order to prevent that we + * code path with duplicate ctrl subsysnqn. In order to prevent that we * mask the passthru-ctrl subsysnqn with the target ctrl subsysnqn. 
*/ memcpy(id->subnqn, ctrl->subsysnqn, sizeof(id->subnqn)); diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c index a4295a5b8d28..2e78397a7373 100644 --- a/drivers/nvme/target/pci-epf.c +++ b/drivers/nvme/target/pci-epf.c @@ -1242,8 +1242,11 @@ static void nvmet_pci_epf_queue_response(struct nvmet_req *req) iod->status = le16_to_cpu(req->cqe->status) >> 1; - /* If we have no data to transfer, directly complete the command. */ - if (!iod->data_len || iod->dma_dir != DMA_TO_DEVICE) { + /* + * If the command failed or we have no data to transfer, complete the + * command immediately. + */ + if (iod->status || !iod->data_len || iod->dma_dir != DMA_TO_DEVICE) { nvmet_pci_epf_complete_iod(iod); return; } @@ -1604,8 +1607,13 @@ static void nvmet_pci_epf_exec_iod_work(struct work_struct *work) goto complete; } + /* + * If nvmet_req_init() fails (e.g., unsupported opcode) it will call + * __nvmet_req_complete() internally which will call + * nvmet_pci_epf_queue_response() and will complete the command directly. + */ if (!nvmet_req_init(req, &iod->sq->nvme_sq, &nvmet_pci_epf_fabrics_ops)) - goto complete; + return; iod->data_len = nvmet_req_transfer_len(req); if (iod->data_len) { @@ -1643,10 +1651,11 @@ static void nvmet_pci_epf_exec_iod_work(struct work_struct *work) wait_for_completion(&iod->done); - if (iod->status == NVME_SC_SUCCESS) { - WARN_ON_ONCE(!iod->data_len || iod->dma_dir != DMA_TO_DEVICE); - nvmet_pci_epf_transfer_iod_data(iod); - } + if (iod->status != NVME_SC_SUCCESS) + return; + + WARN_ON_ONCE(!iod->data_len || iod->dma_dir != DMA_TO_DEVICE); + nvmet_pci_epf_transfer_iod_data(iod); complete: nvmet_pci_epf_complete_iod(iod); @@ -1860,7 +1869,7 @@ static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl) ctrl->io_cqes = 1UL << nvmet_cc_iocqes(ctrl->cc); if (ctrl->io_cqes < sizeof(struct nvme_completion)) { dev_err(ctrl->dev, "Unsupported I/O CQES %zu (need %zu)\n", - ctrl->io_sqes, sizeof(struct nvme_completion)); + ctrl->io_cqes, sizeof(struct nvme_completion)); goto err; } diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 688033b88d38..470bf37e5a63 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1928,10 +1928,10 @@ static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, struct sock *sk = queue->sock->sk; /* Restore the default callbacks before starting upcall */ - read_lock_bh(&sk->sk_callback_lock); + write_lock_bh(&sk->sk_callback_lock); sk->sk_user_data = NULL; sk->sk_data_ready = port->data_ready; - read_unlock_bh(&sk->sk_callback_lock); + write_unlock_bh(&sk->sk_callback_lock); if (!nvmet_tcp_try_peek_pdu(queue)) { if (!nvmet_tcp_tls_handshake(queue)) return; diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 29a60fabfcc8..15a579cf528c 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -541,7 +541,7 @@ void nvmet_bdev_execute_zone_append(struct nvmet_req *req) struct bio *bio; int sg_cnt; - /* Request is completed on len mismatch in nvmet_check_transter_len() */ + /* Request is completed on len mismatch in nvmet_check_transfer_len() */ if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req))) return; |