Diffstat (limited to 'drivers/nvme')
 drivers/nvme/host/apple.c         |   4
 drivers/nvme/host/constants.c     |   4
 drivers/nvme/host/core.c          |  56
 drivers/nvme/host/fc.c            |  10
 drivers/nvme/host/nvme.h          |   2
 drivers/nvme/host/pci.c           | 640
 drivers/nvme/host/rdma.c          |   2
 drivers/nvme/host/tcp.c           |  11
 drivers/nvme/target/core.c        |   2
 drivers/nvme/target/io-cmd-bdev.c |   6
 drivers/nvme/target/passthru.c    |   4
 drivers/nvme/target/pci-epf.c     |  25
 drivers/nvme/target/tcp.c         |   4
 drivers/nvme/target/zns.c         |   2
 14 files changed, 457 insertions(+), 315 deletions(-)
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index b1fddfa33ab9..1286c31320e6 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -301,8 +301,8 @@ static void apple_nvme_submit_cmd(struct apple_nvme_queue *q,
memcpy(&q->sqes[tag], cmd, sizeof(*cmd));
/*
- * This lock here doesn't make much sense at a first glace but
- * removing it will result in occasional missed completetion
+ * This lock here doesn't make much sense at a first glance but
+ * removing it will result in occasional missed completion
* interrupts even though the commands still appear on the CQ.
* It's unclear why this happens but our best guess is that
* there is a bug in the firmware triggered when a new command
diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c
index 1a0058be5821..dc90df9e13a2 100644
--- a/drivers/nvme/host/constants.c
+++ b/drivers/nvme/host/constants.c
@@ -133,7 +133,7 @@ static const char * const nvme_statuses[] = {
[NVME_SC_NS_NOT_ATTACHED] = "Namespace Not Attached",
[NVME_SC_THIN_PROV_NOT_SUPP] = "Thin Provisioning Not Supported",
[NVME_SC_CTRL_LIST_INVALID] = "Controller List Invalid",
- [NVME_SC_SELT_TEST_IN_PROGRESS] = "Device Self-test In Progress",
+ [NVME_SC_SELF_TEST_IN_PROGRESS] = "Device Self-test In Progress",
[NVME_SC_BP_WRITE_PROHIBITED] = "Boot Partition Write Prohibited",
[NVME_SC_CTRL_ID_INVALID] = "Invalid Controller Identifier",
[NVME_SC_SEC_CTRL_STATE_INVALID] = "Invalid Secondary Controller State",
@@ -145,7 +145,7 @@ static const char * const nvme_statuses[] = {
[NVME_SC_BAD_ATTRIBUTES] = "Conflicting Attributes",
[NVME_SC_INVALID_PI] = "Invalid Protection Information",
[NVME_SC_READ_ONLY] = "Attempted Write to Read Only Range",
- [NVME_SC_CMD_SIZE_LIM_EXCEEDED ] = "Command Size Limits Exceeded",
+ [NVME_SC_CMD_SIZE_LIM_EXCEEDED] = "Command Size Limits Exceeded",
[NVME_SC_ZONE_BOUNDARY_ERROR] = "Zoned Boundary Error",
[NVME_SC_ZONE_FULL] = "Zone Is Full",
[NVME_SC_ZONE_READ_ONLY] = "Zone Is Read Only",
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7493e5aa984c..9d988f4cb87a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -381,12 +381,12 @@ static void nvme_log_err_passthru(struct request *req)
nr->status & NVME_SC_MASK, /* Status Code */
nr->status & NVME_STATUS_MORE ? "MORE " : "",
nr->status & NVME_STATUS_DNR ? "DNR " : "",
- nr->cmd->common.cdw10,
- nr->cmd->common.cdw11,
- nr->cmd->common.cdw12,
- nr->cmd->common.cdw13,
- nr->cmd->common.cdw14,
- nr->cmd->common.cdw15);
+ le32_to_cpu(nr->cmd->common.cdw10),
+ le32_to_cpu(nr->cmd->common.cdw11),
+ le32_to_cpu(nr->cmd->common.cdw12),
+ le32_to_cpu(nr->cmd->common.cdw13),
+ le32_to_cpu(nr->cmd->common.cdw14),
+ le32_to_cpu(nr->cmd->common.cdw15));
}
enum nvme_disposition {
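The cdwN members are __le32 wire-format values, so formatting them raw is only correct on little-endian hosts; le32_to_cpu() makes the logged values match what the controller actually consumes. A tiny sketch of the portable form (hypothetical helper, not from the patch):

	static void log_cdw10(const struct nvme_command *cmd)
	{
		/* cdw10 is __le32 on the wire; convert before formatting */
		pr_info("cdw10=0x%08x\n", le32_to_cpu(cmd->common.cdw10));
	}
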
@@ -764,6 +764,10 @@ blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
!test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
return BLK_STS_RESOURCE;
+
+ if (!(rq->rq_flags & RQF_DONTPREP))
+ nvme_clear_nvme_request(rq);
+
return nvme_host_path_error(rq);
}
EXPORT_SYMBOL_GPL(nvme_fail_nonready_command);
@@ -1866,8 +1870,11 @@ static bool nvme_init_integrity(struct nvme_ns_head *head,
break;
}
- bi->tuple_size = head->ms;
- bi->pi_offset = info->pi_offset;
+ bi->metadata_size = head->ms;
+ if (bi->csum_type) {
+ bi->pi_tuple_size = head->pi_size;
+ bi->pi_offset = info->pi_offset;
+ }
return true;
}
@@ -2404,22 +2411,24 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
else
lim.write_stream_granularity = 0;
- ret = queue_limits_commit_update(ns->disk->queue, &lim);
- if (ret) {
- blk_mq_unfreeze_queue(ns->disk->queue, memflags);
- goto out;
- }
-
- set_capacity_and_notify(ns->disk, capacity);
-
/*
* Only set the DEAC bit if the device guarantees that reads from
* deallocated data return zeroes. While the DEAC bit does not
* require that, it must be a no-op if reads from deallocated data
* do not return zeroes.
*/
- if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
+ if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3))) {
ns->head->features |= NVME_NS_DEAC;
+ lim.max_hw_wzeroes_unmap_sectors = lim.max_write_zeroes_sectors;
+ }
+
+ ret = queue_limits_commit_update(ns->disk->queue, &lim);
+ if (ret) {
+ blk_mq_unfreeze_queue(ns->disk->queue, memflags);
+ goto out;
+ }
+
+ set_capacity_and_notify(ns->disk, capacity);
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
set_bit(NVME_NS_READY, &ns->flags);
blk_mq_unfreeze_queue(ns->disk->queue, memflags);
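For reference, a hedged decode of the dlfeat test above (bit meanings per the NVMe Identify Namespace data structure; worth verifying against the spec revision in use):

	/*
	 * id->dlfeat & 0x7 == 0x1 : reads from deallocated blocks return
	 *                           all bytes zero
	 * id->dlfeat & (1 << 3)   : Write Zeroes supports the Deallocate
	 *                           (DEAC) bit
	 * Only when both hold may the driver set NVME_NS_DEAC and advertise
	 * unmapped Write Zeroes via max_hw_wzeroes_unmap_sectors.
	 */
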
@@ -3537,15 +3546,6 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
if (ret)
goto out_free;
}
-
- if (le16_to_cpu(id->awupf) != ctrl->subsys->awupf) {
- dev_err_ratelimited(ctrl->device,
- "inconsistent AWUPF, controller not added (%u/%u).\n",
- le16_to_cpu(id->awupf), ctrl->subsys->awupf);
- ret = -EINVAL;
- goto out_free;
- }
-
memcpy(ctrl->subsys->firmware_rev, id->fr,
sizeof(ctrl->subsys->firmware_rev));
@@ -4077,7 +4077,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
return;
}
}
- list_add(&ns->list, &ns->ctrl->namespaces);
+ list_add_rcu(&ns->list, &ns->ctrl->namespaces);
}
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
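The switch to list_add_rcu() pairs the writer with lockless readers; in the nvme core the namespace list is walked under SRCU, but the publish/subscribe pattern is the classic RCU one. A minimal reader sketch (plain RCU, helper name hypothetical):

	rcu_read_lock();
	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
		inspect(ns);	/* must not sleep inside the read section */
	rcu_read_unlock();
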
@@ -4300,7 +4300,7 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
}
/*
- * If available try to use the Command Set Idependent Identify Namespace
+ * If available try to use the Command Set Independent Identify Namespace
* data structure to find all the generic information that is needed to
* set up a namespace. If not fall back to the legacy version.
*/
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 014b387f1e8b..08a5ea3e9383 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -899,7 +899,7 @@ EXPORT_SYMBOL_GPL(nvme_fc_set_remoteport_devloss);
* may crash.
*
* As such:
- * Wrapper all the dma routines and check the dev pointer.
+ * Wrap all the dma routines and check the dev pointer.
*
* For simple mappings (those that return just a dma address), we noop
* them, returning a dma address of 0.
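A minimal sketch of such a guarded wrapper, assuming the pattern the comment describes (standard dma_map_single() semantics):

	static dma_addr_t fc_dma_map_single(struct device *dev, void *ptr,
			size_t size, enum dma_data_direction dir)
	{
		/* remoteport may be gone: without a dev, noop the mapping */
		return dev ? dma_map_single(dev, ptr, size, dir) :
			     (dma_addr_t)0L;
	}
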
@@ -1955,8 +1955,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
}
/*
- * For the linux implementation, if we have an unsucceesful
- * status, they blk-mq layer can typically be called with the
+ * For the linux implementation, if we have an unsuccessful
+ * status, the blk-mq layer can typically be called with the
* non-zero status and the content of the cqe isn't important.
*/
if (status)
@@ -2429,7 +2429,7 @@ static bool nvme_fc_terminate_exchange(struct request *req, void *data)
/*
* This routine runs through all outstanding commands on the association
- * and aborts them. This routine is typically be called by the
+ * and aborts them. This routine is typically called by the
* delete_association routine. It is also called due to an error during
* reconnect. In that scenario, it is most likely a command that initializes
* the controller, including fabric Connect commands on io queues, that
@@ -2622,7 +2622,7 @@ nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
* as part of the exchange. The CQE is the last thing for the io,
* which is transferred (explicitly or implicitly) with the RSP IU
* sent on the exchange. After the CQE is received, the FC exchange is
- * terminaed and the Exchange may be used on a different io.
+ * terminated and the Exchange may be used on a different io.
*
* The transport to LLDD api has the transport making a request for a
* new fcp io request to the LLDD. The LLDD then allocates a FC exchange
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 7df2ea21851f..cfd2b5b90b91 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -69,7 +69,7 @@ enum nvme_quirks {
NVME_QUIRK_IDENTIFY_CNS = (1 << 1),
/*
- * The controller deterministically returns O's on reads to
+ * The controller deterministically returns 0's on reads to
* logical blocks that deallocate was called on.
*/
NVME_QUIRK_DEALLOCATE_ZEROES = (1 << 2),
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 320aaa41ec39..071efec25346 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -7,7 +7,7 @@
#include <linux/acpi.h>
#include <linux/async.h>
#include <linux/blkdev.h>
-#include <linux/blk-mq.h>
+#include <linux/blk-mq-dma.h>
#include <linux/blk-integrity.h>
#include <linux/dmi.h>
#include <linux/init.h>
@@ -27,7 +27,6 @@
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/io-64-nonatomic-hi-lo.h>
#include <linux/sed-opal.h>
-#include <linux/pci-p2pdma.h>
#include "trace.h"
#include "nvme.h"
@@ -39,20 +38,17 @@
#define NVME_SMALL_POOL_SIZE 256
/*
- * These can be higher, but we need to ensure that any command doesn't
- * require an sg allocation that needs more than a page of data.
+ * Arbitrary upper bound.
*/
-#define NVME_MAX_KB_SZ 8192
+#define NVME_MAX_BYTES SZ_8M
#define NVME_MAX_NR_DESCRIPTORS 5
/*
- * For data SGLs we support a single descriptors worth of SGL entries, but for
- * now we also limit it to avoid an allocation larger than PAGE_SIZE for the
- * scatterlist.
+ * For data SGLs we support a single descriptor's worth of SGL entries.
+ * For PRPs, segments don't matter at all.
*/
#define NVME_MAX_SEGS \
- min(NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc), \
- (PAGE_SIZE / sizeof(struct scatterlist)))
+ (NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc))
/*
* For metadata SGLs, only the small descriptor is supported, and the first
@@ -61,6 +57,21 @@
#define NVME_MAX_META_SEGS \
((NVME_SMALL_POOL_SIZE / sizeof(struct nvme_sgl_desc)) - 1)
+/*
+ * The last entry is used to link to the next descriptor.
+ */
+#define PRPS_PER_PAGE \
+ (((NVME_CTRL_PAGE_SIZE / sizeof(__le64))) - 1)
+
+/*
+ * I/O could be non-aligned both at the beginning and end.
+ */
+#define MAX_PRP_RANGE \
+ (NVME_MAX_BYTES + 2 * (NVME_CTRL_PAGE_SIZE - 1))
+
+static_assert(MAX_PRP_RANGE / NVME_CTRL_PAGE_SIZE <=
+ (1 /* prp1 */ + NVME_MAX_NR_DESCRIPTORS * PRPS_PER_PAGE));
+
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0444);
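The static_assert above can be checked by hand; a worked example assuming NVME_CTRL_PAGE_SIZE == 4096:

	/*
	 * NVME_MAX_BYTES  = 8 MiB              = 8388608 bytes
	 * MAX_PRP_RANGE   = 8388608 + 2 * 4095 = 8396798 bytes
	 * PRPs required   = 8396798 / 4096     = 2049
	 * PRPS_PER_PAGE   = 4096 / 8 - 1       = 511
	 * PRPs available  = 1 + 5 * 511        = 2556  (>= 2049, holds)
	 */
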
@@ -97,7 +108,7 @@ static int io_queue_count_set(const char *val, const struct kernel_param *kp)
int ret;
ret = kstrtouint(val, 10, &n);
- if (ret != 0 || n > num_possible_cpus())
+ if (ret != 0 || n > blk_mq_num_possible_queues(0))
return -EINVAL;
return param_set_uint(val, kp);
}
@@ -162,7 +173,7 @@ struct nvme_dev {
bool hmb;
struct sg_table *hmb_sgt;
- mempool_t *iod_mempool;
+ mempool_t *dmavec_mempool;
mempool_t *iod_meta_mempool;
/* shadow doorbell buffer support: */
@@ -246,7 +257,15 @@ enum nvme_iod_flags {
IOD_ABORTED = 1U << 0,
/* uses the small descriptor pool */
- IOD_SMALL_DESCRIPTOR = 1U << 1,
+ IOD_SMALL_DESCRIPTOR = 1U << 1,
+
+ /* single segment dma mapping */
+ IOD_SINGLE_SEGMENT = 1U << 2,
+};
+
+struct nvme_dma_vec {
+ dma_addr_t addr;
+ unsigned int len;
};
/*
@@ -257,13 +276,16 @@ struct nvme_iod {
struct nvme_command cmd;
u8 flags;
u8 nr_descriptors;
- unsigned int dma_len; /* length of single DMA segment mapping */
- dma_addr_t first_dma;
+
+ unsigned int total_len;
+ struct dma_iova_state dma_state;
+ void *descriptors[NVME_MAX_NR_DESCRIPTORS];
+ struct nvme_dma_vec *dma_vecs;
+ unsigned int nr_dma_vecs;
+
dma_addr_t meta_dma;
- struct sg_table sgt;
struct sg_table meta_sgt;
struct nvme_sgl_desc *meta_descriptor;
- void *descriptors[NVME_MAX_NR_DESCRIPTORS];
};
static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
@@ -406,18 +428,6 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db,
return true;
}
-/*
- * Will slightly overestimate the number of pages needed. This is OK
- * as it only leads to a small amount of wasted memory for the lifetime of
- * the I/O.
- */
-static __always_inline int nvme_pci_npages_prp(void)
-{
- unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE;
- unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE);
- return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8);
-}
-
static struct nvme_descriptor_pools *
nvme_setup_descriptor_pools(struct nvme_dev *dev, unsigned numa_node)
{
@@ -578,32 +588,49 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
spin_unlock(&nvmeq->sq_lock);
}
-static inline bool nvme_pci_metadata_use_sgls(struct nvme_dev *dev,
- struct request *req)
+enum nvme_use_sgl {
+ SGL_UNSUPPORTED,
+ SGL_SUPPORTED,
+ SGL_FORCED,
+};
+
+static inline bool nvme_pci_metadata_use_sgls(struct request *req)
{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct nvme_dev *dev = nvmeq->dev;
+
if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl))
return false;
return req->nr_integrity_segments > 1 ||
nvme_req(req)->flags & NVME_REQ_USERCMD;
}
-static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
- int nseg)
+static inline enum nvme_use_sgl nvme_pci_use_sgls(struct nvme_dev *dev,
+ struct request *req)
{
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- unsigned int avg_seg_size;
- avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
+ if (nvmeq->qid && nvme_ctrl_sgl_supported(&dev->ctrl)) {
+ if (nvme_req(req)->flags & NVME_REQ_USERCMD)
+ return SGL_FORCED;
+ if (req->nr_integrity_segments > 1)
+ return SGL_FORCED;
+ return SGL_SUPPORTED;
+ }
- if (!nvme_ctrl_sgl_supported(&dev->ctrl))
- return false;
- if (!nvmeq->qid)
- return false;
- if (nvme_pci_metadata_use_sgls(dev, req))
- return true;
- if (!sgl_threshold || avg_seg_size < sgl_threshold)
- return nvme_req(req)->flags & NVME_REQ_USERCMD;
- return true;
+ return SGL_UNSUPPORTED;
+}
+
+static unsigned int nvme_pci_avg_seg_size(struct request *req)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ unsigned int nseg;
+
+ if (blk_rq_dma_map_coalesce(&iod->dma_state))
+ nseg = 1;
+ else
+ nseg = blk_rq_nr_phys_segments(req);
+ return DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
}
static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq,
@@ -614,11 +641,25 @@ static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq,
return nvmeq->descriptor_pools.large;
}
-static void nvme_free_descriptors(struct nvme_queue *nvmeq, struct request *req)
+static inline bool nvme_pci_cmd_use_sgl(struct nvme_command *cmd)
+{
+ return cmd->common.flags &
+ (NVME_CMD_SGL_METABUF | NVME_CMD_SGL_METASEG);
+}
+
+static inline dma_addr_t nvme_pci_first_desc_dma_addr(struct nvme_command *cmd)
{
+ if (nvme_pci_cmd_use_sgl(cmd))
+ return le64_to_cpu(cmd->common.dptr.sgl.addr);
+ return le64_to_cpu(cmd->common.dptr.prp2);
+}
+
+static void nvme_free_descriptors(struct request *req)
+{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- dma_addr_t dma_addr = iod->first_dma;
+ dma_addr_t dma_addr = nvme_pci_first_desc_dma_addr(&iod->cmd);
int i;
if (iod->nr_descriptors == 1) {
@@ -637,68 +678,130 @@ static void nvme_free_descriptors(struct nvme_queue *nvmeq, struct request *req)
}
}
-static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_queue *nvmeq,
- struct request *req)
+static void nvme_free_prps(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ unsigned int i;
- if (iod->dma_len) {
- dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
- rq_dma_dir(req));
+ for (i = 0; i < iod->nr_dma_vecs; i++)
+ dma_unmap_page(nvmeq->dev->dev, iod->dma_vecs[i].addr,
+ iod->dma_vecs[i].len, rq_dma_dir(req));
+ mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool);
+}
+
+static void nvme_free_sgls(struct request *req)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct device *dma_dev = nvmeq->dev->dev;
+ dma_addr_t sqe_dma_addr = le64_to_cpu(iod->cmd.common.dptr.sgl.addr);
+ unsigned int sqe_dma_len = le32_to_cpu(iod->cmd.common.dptr.sgl.length);
+ struct nvme_sgl_desc *sg_list = iod->descriptors[0];
+ enum dma_data_direction dir = rq_dma_dir(req);
+
+ if (iod->nr_descriptors) {
+ unsigned int nr_entries = sqe_dma_len / sizeof(*sg_list), i;
+
+ for (i = 0; i < nr_entries; i++)
+ dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr),
+ le32_to_cpu(sg_list[i].length), dir);
+ } else {
+ dma_unmap_page(dma_dev, sqe_dma_addr, sqe_dma_len, dir);
+ }
+}
+
+static void nvme_unmap_data(struct request *req)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct device *dma_dev = nvmeq->dev->dev;
+
+ if (iod->flags & IOD_SINGLE_SEGMENT) {
+ static_assert(offsetof(union nvme_data_ptr, prp1) ==
+ offsetof(union nvme_data_ptr, sgl.addr));
+ dma_unmap_page(dma_dev, le64_to_cpu(iod->cmd.common.dptr.prp1),
+ iod->total_len, rq_dma_dir(req));
return;
}
- WARN_ON_ONCE(!iod->sgt.nents);
+ if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) {
+ if (nvme_pci_cmd_use_sgl(&iod->cmd))
+ nvme_free_sgls(req);
+ else
+ nvme_free_prps(req);
+ }
- dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
- nvme_free_descriptors(nvmeq, req);
- mempool_free(iod->sgt.sgl, dev->iod_mempool);
+ if (iod->nr_descriptors)
+ nvme_free_descriptors(req);
}
-static void nvme_print_sgl(struct scatterlist *sgl, int nents)
+static bool nvme_pci_prp_iter_next(struct request *req, struct device *dma_dev,
+ struct blk_dma_iter *iter)
{
- int i;
- struct scatterlist *sg;
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- for_each_sg(sgl, sg, nents, i) {
- dma_addr_t phys = sg_phys(sg);
- pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
- "dma_address:%pad dma_length:%d\n",
- i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
- sg_dma_len(sg));
+ if (iter->len)
+ return true;
+ if (!blk_rq_dma_map_iter_next(req, dma_dev, &iod->dma_state, iter))
+ return false;
+ if (!dma_use_iova(&iod->dma_state) && dma_need_unmap(dma_dev)) {
+ iod->dma_vecs[iod->nr_dma_vecs].addr = iter->addr;
+ iod->dma_vecs[iod->nr_dma_vecs].len = iter->len;
+ iod->nr_dma_vecs++;
}
+ return true;
}
-static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq,
- struct request *req, struct nvme_rw_command *cmnd)
+static blk_status_t nvme_pci_setup_data_prp(struct request *req,
+ struct blk_dma_iter *iter)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- int length = blk_rq_payload_bytes(req);
- struct scatterlist *sg = iod->sgt.sgl;
- int dma_len = sg_dma_len(sg);
- u64 dma_addr = sg_dma_address(sg);
- int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ unsigned int length = blk_rq_payload_bytes(req);
+ dma_addr_t prp1_dma, prp2_dma = 0;
+ unsigned int prp_len, i;
__le64 *prp_list;
- dma_addr_t prp_dma;
- int i;
- length -= (NVME_CTRL_PAGE_SIZE - offset);
- if (length <= 0) {
- iod->first_dma = 0;
- goto done;
+ if (!dma_use_iova(&iod->dma_state) && dma_need_unmap(nvmeq->dev->dev)) {
+ iod->dma_vecs = mempool_alloc(nvmeq->dev->dmavec_mempool,
+ GFP_ATOMIC);
+ if (!iod->dma_vecs)
+ return BLK_STS_RESOURCE;
+ iod->dma_vecs[0].addr = iter->addr;
+ iod->dma_vecs[0].len = iter->len;
+ iod->nr_dma_vecs = 1;
}
- dma_len -= (NVME_CTRL_PAGE_SIZE - offset);
- if (dma_len) {
- dma_addr += (NVME_CTRL_PAGE_SIZE - offset);
- } else {
- sg = sg_next(sg);
- dma_addr = sg_dma_address(sg);
- dma_len = sg_dma_len(sg);
+ /*
+ * PRP1 always points to the start of the DMA transfers.
+ *
+ * This is the only PRP (except for the list entries) that could be
+ * non-aligned.
+ */
+ prp1_dma = iter->addr;
+ prp_len = min(length, NVME_CTRL_PAGE_SIZE -
+ (iter->addr & (NVME_CTRL_PAGE_SIZE - 1)));
+ iod->total_len += prp_len;
+ iter->addr += prp_len;
+ iter->len -= prp_len;
+ length -= prp_len;
+ if (!length)
+ goto done;
+
+ if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) {
+ if (WARN_ON_ONCE(!iter->status))
+ goto bad_sgl;
+ goto done;
}
+ /*
+ * PRP2 is usually a list, but can point to data if all data to be
+ * transferred fits into PRP1 + PRP2:
+ */
if (length <= NVME_CTRL_PAGE_SIZE) {
- iod->first_dma = dma_addr;
+ prp2_dma = iter->addr;
+ iod->total_len += length;
goto done;
}
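A concrete reading of the PRP1/PRP2 rules above, assuming 4 KiB controller pages:

	/*
	 * Hypothetical example:
	 *   8192-byte transfer, page-aligned start:
	 *     prp1 = addr, prp2 = addr + 4096  (no PRP list needed)
	 *   8192-byte transfer starting at addr % 4096 == 512:
	 *     spans 3 controller pages -> prp2 must point to a PRP list
	 */
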
@@ -707,58 +810,80 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq,
iod->flags |= IOD_SMALL_DESCRIPTOR;
prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
- &prp_dma);
- if (!prp_list)
- return BLK_STS_RESOURCE;
+ &prp2_dma);
+ if (!prp_list) {
+ iter->status = BLK_STS_RESOURCE;
+ goto done;
+ }
iod->descriptors[iod->nr_descriptors++] = prp_list;
- iod->first_dma = prp_dma;
+
i = 0;
for (;;) {
+ prp_list[i++] = cpu_to_le64(iter->addr);
+ prp_len = min(length, NVME_CTRL_PAGE_SIZE);
+ if (WARN_ON_ONCE(iter->len < prp_len))
+ goto bad_sgl;
+
+ iod->total_len += prp_len;
+ iter->addr += prp_len;
+ iter->len -= prp_len;
+ length -= prp_len;
+ if (!length)
+ break;
+
+ if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) {
+ if (WARN_ON_ONCE(!iter->status))
+ goto bad_sgl;
+ goto done;
+ }
+
+ /*
+ * If we've filled the entire descriptor, allocate a new one that
+ * is pointed to by the last entry in the previous PRP list. To
+ * accommodate that, move the last actual entry to the new
+ * descriptor.
+ */
if (i == NVME_CTRL_PAGE_SIZE >> 3) {
__le64 *old_prp_list = prp_list;
+ dma_addr_t prp_list_dma;
prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large,
- GFP_ATOMIC, &prp_dma);
- if (!prp_list)
- goto free_prps;
+ GFP_ATOMIC, &prp_list_dma);
+ if (!prp_list) {
+ iter->status = BLK_STS_RESOURCE;
+ goto done;
+ }
iod->descriptors[iod->nr_descriptors++] = prp_list;
+
prp_list[0] = old_prp_list[i - 1];
- old_prp_list[i - 1] = cpu_to_le64(prp_dma);
+ old_prp_list[i - 1] = cpu_to_le64(prp_list_dma);
i = 1;
}
- prp_list[i++] = cpu_to_le64(dma_addr);
- dma_len -= NVME_CTRL_PAGE_SIZE;
- dma_addr += NVME_CTRL_PAGE_SIZE;
- length -= NVME_CTRL_PAGE_SIZE;
- if (length <= 0)
- break;
- if (dma_len > 0)
- continue;
- if (unlikely(dma_len < 0))
- goto bad_sgl;
- sg = sg_next(sg);
- dma_addr = sg_dma_address(sg);
- dma_len = sg_dma_len(sg);
}
+
done:
- cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sgt.sgl));
- cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
- return BLK_STS_OK;
-free_prps:
- nvme_free_descriptors(nvmeq, req);
- return BLK_STS_RESOURCE;
+ /*
+ * nvme_unmap_data uses the DPTR field in the SQE to tear down the
+ * mapping, so initialize it even for failures.
+ */
+ iod->cmd.common.dptr.prp1 = cpu_to_le64(prp1_dma);
+ iod->cmd.common.dptr.prp2 = cpu_to_le64(prp2_dma);
+ if (unlikely(iter->status))
+ nvme_unmap_data(req);
+ return iter->status;
+
bad_sgl:
- WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents),
- "Invalid SGL for payload:%d nents:%d\n",
- blk_rq_payload_bytes(req), iod->sgt.nents);
+ dev_err_once(nvmeq->dev->dev,
+ "Incorrectly formed request for payload:%d nents:%d\n",
+ blk_rq_payload_bytes(req), blk_rq_nr_phys_segments(req));
return BLK_STS_IOERR;
}
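The chaining step above, spelled out for 4 KiB controller pages (512 slots per PRP list):

	/*
	 * When i reaches 512 the current list is full: a new list page is
	 * allocated, the data PRP that sat in slot 511 moves to slot 0 of
	 * the new page, and slot 511 is overwritten with the DMA address
	 * of the new page, forming the chain.
	 */
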
static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
- struct scatterlist *sg)
+ struct blk_dma_iter *iter)
{
- sge->addr = cpu_to_le64(sg_dma_address(sg));
- sge->length = cpu_to_le32(sg_dma_len(sg));
+ sge->addr = cpu_to_le64(iter->addr);
+ sge->length = cpu_to_le32(iter->len);
sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}
@@ -770,21 +895,22 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
}
-static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq,
- struct request *req, struct nvme_rw_command *cmd)
+static blk_status_t nvme_pci_setup_data_sgl(struct request *req,
+ struct blk_dma_iter *iter)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ unsigned int entries = blk_rq_nr_phys_segments(req);
struct nvme_sgl_desc *sg_list;
- struct scatterlist *sg = iod->sgt.sgl;
- unsigned int entries = iod->sgt.nents;
dma_addr_t sgl_dma;
- int i = 0;
+ unsigned int mapped = 0;
- /* setting the transfer type as SGL */
- cmd->flags = NVME_CMD_SGL_METABUF;
+ /* set the transfer type as SGL */
+ iod->cmd.common.flags = NVME_CMD_SGL_METABUF;
- if (entries == 1) {
- nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
+ if (entries == 1 || blk_rq_dma_map_coalesce(&iod->dma_state)) {
+ nvme_pci_sgl_set_data(&iod->cmd.common.dptr.sgl, iter);
+ iod->total_len += iter->len;
return BLK_STS_OK;
}
@@ -796,119 +922,104 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq,
if (!sg_list)
return BLK_STS_RESOURCE;
iod->descriptors[iod->nr_descriptors++] = sg_list;
- iod->first_dma = sgl_dma;
- nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
do {
- nvme_pci_sgl_set_data(&sg_list[i++], sg);
- sg = sg_next(sg);
- } while (--entries > 0);
+ if (WARN_ON_ONCE(mapped == entries)) {
+ iter->status = BLK_STS_IOERR;
+ break;
+ }
+ nvme_pci_sgl_set_data(&sg_list[mapped++], iter);
+ iod->total_len += iter->len;
+ } while (blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, &iod->dma_state,
+ iter));
- return BLK_STS_OK;
+ nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped);
+ if (unlikely(iter->status))
+ nvme_free_sgls(req);
+ return iter->status;
}
-static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
- struct request *req, struct nvme_rw_command *cmnd,
- struct bio_vec *bv)
+static blk_status_t nvme_pci_setup_data_simple(struct request *req,
+ enum nvme_use_sgl use_sgl)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
- unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
-
- iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
- if (dma_mapping_error(dev->dev, iod->first_dma))
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct bio_vec bv = req_bvec(req);
+ unsigned int prp1_offset = bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
+ bool prp_possible = prp1_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2;
+ dma_addr_t dma_addr;
+
+ if (!use_sgl && !prp_possible)
+ return BLK_STS_AGAIN;
+ if (is_pci_p2pdma_page(bv.bv_page))
+ return BLK_STS_AGAIN;
+
+ dma_addr = dma_map_bvec(nvmeq->dev->dev, &bv, rq_dma_dir(req), 0);
+ if (dma_mapping_error(nvmeq->dev->dev, dma_addr))
return BLK_STS_RESOURCE;
- iod->dma_len = bv->bv_len;
-
- cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
- if (bv->bv_len > first_prp_len)
- cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
- else
- cmnd->dptr.prp2 = 0;
- return BLK_STS_OK;
-}
-
-static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
- struct request *req, struct nvme_rw_command *cmnd,
- struct bio_vec *bv)
-{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ iod->total_len = bv.bv_len;
+ iod->flags |= IOD_SINGLE_SEGMENT;
+
+ if (use_sgl == SGL_FORCED || !prp_possible) {
+ iod->cmd.common.flags = NVME_CMD_SGL_METABUF;
+ iod->cmd.common.dptr.sgl.addr = cpu_to_le64(dma_addr);
+ iod->cmd.common.dptr.sgl.length = cpu_to_le32(bv.bv_len);
+ iod->cmd.common.dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
+ } else {
+ unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - prp1_offset;
- iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
- if (dma_mapping_error(dev->dev, iod->first_dma))
- return BLK_STS_RESOURCE;
- iod->dma_len = bv->bv_len;
+ iod->cmd.common.dptr.prp1 = cpu_to_le64(dma_addr);
+ iod->cmd.common.dptr.prp2 = 0;
+ if (bv.bv_len > first_prp_len)
+ iod->cmd.common.dptr.prp2 =
+ cpu_to_le64(dma_addr + first_prp_len);
+ }
- cmnd->flags = NVME_CMD_SGL_METABUF;
- cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
- cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
- cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
return BLK_STS_OK;
}
-static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
- struct nvme_command *cmnd)
+static blk_status_t nvme_map_data(struct request *req)
{
- struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- blk_status_t ret = BLK_STS_RESOURCE;
- int rc;
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct nvme_dev *dev = nvmeq->dev;
+ enum nvme_use_sgl use_sgl = nvme_pci_use_sgls(dev, req);
+ struct blk_dma_iter iter;
+ blk_status_t ret;
+ /*
+ * Try to skip the DMA iterator for single segment requests, as that
+ * significantly improves performance for small I/O sizes.
+ */
if (blk_rq_nr_phys_segments(req) == 1) {
- struct bio_vec bv = req_bvec(req);
-
- if (!is_pci_p2pdma_page(bv.bv_page)) {
- if (!nvme_pci_metadata_use_sgls(dev, req) &&
- (bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
- bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
- return nvme_setup_prp_simple(dev, req,
- &cmnd->rw, &bv);
-
- if (nvmeq->qid && sgl_threshold &&
- nvme_ctrl_sgl_supported(&dev->ctrl))
- return nvme_setup_sgl_simple(dev, req,
- &cmnd->rw, &bv);
- }
+ ret = nvme_pci_setup_data_simple(req, use_sgl);
+ if (ret != BLK_STS_AGAIN)
+ return ret;
}
- iod->dma_len = 0;
- iod->sgt.sgl = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
- if (!iod->sgt.sgl)
- return BLK_STS_RESOURCE;
- sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req));
- iod->sgt.orig_nents = blk_rq_map_sg(req, iod->sgt.sgl);
- if (!iod->sgt.orig_nents)
- goto out_free_sg;
+ if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
+ return iter.status;
- rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req),
- DMA_ATTR_NO_WARN);
- if (rc) {
- if (rc == -EREMOTEIO)
- ret = BLK_STS_TARGET;
- goto out_free_sg;
- }
-
- if (nvme_pci_use_sgls(dev, req, iod->sgt.nents))
- ret = nvme_pci_setup_sgls(nvmeq, req, &cmnd->rw);
- else
- ret = nvme_pci_setup_prps(nvmeq, req, &cmnd->rw);
- if (ret != BLK_STS_OK)
- goto out_unmap_sg;
- return BLK_STS_OK;
+ if (use_sgl == SGL_FORCED ||
+ (use_sgl == SGL_SUPPORTED &&
+ (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold)))
+ return nvme_pci_setup_data_sgl(req, &iter);
+ return nvme_pci_setup_data_prp(req, &iter);
+}
-out_unmap_sg:
- dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
-out_free_sg:
- mempool_free(iod->sgt.sgl, dev->iod_mempool);
- return ret;
+static void nvme_pci_sgl_set_data_sg(struct nvme_sgl_desc *sge,
+ struct scatterlist *sg)
+{
+ sge->addr = cpu_to_le64(sg_dma_address(sg));
+ sge->length = cpu_to_le32(sg_dma_len(sg));
+ sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}
-static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
- struct request *req)
+static blk_status_t nvme_pci_setup_meta_sgls(struct request *req)
{
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct nvme_dev *dev = nvmeq->dev;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct nvme_rw_command *cmnd = &iod->cmd.rw;
struct nvme_sgl_desc *sg_list;
struct scatterlist *sgl, *sg;
unsigned int entries;
@@ -939,19 +1050,19 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
iod->meta_descriptor = sg_list;
iod->meta_dma = sgl_dma;
- cmnd->flags = NVME_CMD_SGL_METASEG;
- cmnd->metadata = cpu_to_le64(sgl_dma);
+ iod->cmd.common.flags = NVME_CMD_SGL_METASEG;
+ iod->cmd.common.metadata = cpu_to_le64(sgl_dma);
sgl = iod->meta_sgt.sgl;
if (entries == 1) {
- nvme_pci_sgl_set_data(sg_list, sgl);
+ nvme_pci_sgl_set_data_sg(sg_list, sgl);
return BLK_STS_OK;
}
sgl_dma += sizeof(*sg_list);
nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries);
for_each_sg(sgl, sg, entries, i)
- nvme_pci_sgl_set_data(&sg_list[i + 1], sg);
+ nvme_pci_sgl_set_data_sg(&sg_list[i + 1], sg);
return BLK_STS_OK;
@@ -962,38 +1073,37 @@ out_free_sg:
return BLK_STS_RESOURCE;
}
-static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev,
- struct request *req)
+static blk_status_t nvme_pci_setup_meta_mptr(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct bio_vec bv = rq_integrity_vec(req);
- struct nvme_command *cmnd = &iod->cmd;
- iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
- if (dma_mapping_error(dev->dev, iod->meta_dma))
+ iod->meta_dma = dma_map_bvec(nvmeq->dev->dev, &bv, rq_dma_dir(req), 0);
+ if (dma_mapping_error(nvmeq->dev->dev, iod->meta_dma))
return BLK_STS_IOERR;
- cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
+ iod->cmd.common.metadata = cpu_to_le64(iod->meta_dma);
return BLK_STS_OK;
}
-static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req)
+static blk_status_t nvme_map_metadata(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
if ((iod->cmd.common.flags & NVME_CMD_SGL_METABUF) &&
- nvme_pci_metadata_use_sgls(dev, req))
- return nvme_pci_setup_meta_sgls(dev, req);
- return nvme_pci_setup_meta_mptr(dev, req);
+ nvme_pci_metadata_use_sgls(req))
+ return nvme_pci_setup_meta_sgls(req);
+ return nvme_pci_setup_meta_mptr(req);
}
-static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
+static blk_status_t nvme_prep_rq(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
blk_status_t ret;
iod->flags = 0;
iod->nr_descriptors = 0;
- iod->sgt.nents = 0;
+ iod->total_len = 0;
iod->meta_sgt.nents = 0;
ret = nvme_setup_cmd(req->q->queuedata, req);
@@ -1001,13 +1111,13 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
return ret;
if (blk_rq_nr_phys_segments(req)) {
- ret = nvme_map_data(dev, req, &iod->cmd);
+ ret = nvme_map_data(req);
if (ret)
goto out_free_cmd;
}
if (blk_integrity_rq(req)) {
- ret = nvme_map_metadata(dev, req);
+ ret = nvme_map_metadata(req);
if (ret)
goto out_unmap_data;
}
@@ -1016,7 +1126,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
return BLK_STS_OK;
out_unmap_data:
if (blk_rq_nr_phys_segments(req))
- nvme_unmap_data(dev, req->mq_hctx->driver_data, req);
+ nvme_unmap_data(req);
out_free_cmd:
nvme_cleanup_cmd(req);
return ret;
@@ -1041,7 +1151,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
if (unlikely(!nvme_check_ready(&dev->ctrl, req, true)))
return nvme_fail_nonready_command(&dev->ctrl, req);
- ret = nvme_prep_rq(dev, req);
+ ret = nvme_prep_rq(req);
if (unlikely(ret))
return ret;
spin_lock(&nvmeq->sq_lock);
@@ -1079,7 +1189,7 @@ static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
return false;
- return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
+ return nvme_prep_rq(req) == BLK_STS_OK;
}
static void nvme_queue_rqs(struct rq_list *rqlist)
@@ -1105,11 +1215,11 @@ static void nvme_queue_rqs(struct rq_list *rqlist)
*rqlist = requeue_list;
}
-static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
- struct nvme_queue *nvmeq,
- struct request *req)
+static __always_inline void nvme_unmap_metadata(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct nvme_dev *dev = nvmeq->dev;
if (!iod->meta_sgt.nents) {
dma_unmap_page(dev->dev, iod->meta_dma,
@@ -1126,14 +1236,10 @@ static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
static __always_inline void nvme_pci_unmap_rq(struct request *req)
{
- struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- struct nvme_dev *dev = nvmeq->dev;
-
if (blk_integrity_rq(req))
- nvme_unmap_metadata(dev, nvmeq, req);
-
+ nvme_unmap_metadata(req);
if (blk_rq_nr_phys_segments(req))
- nvme_unmap_data(dev, nvmeq, req);
+ nvme_unmap_data(req);
}
static void nvme_pci_complete_rq(struct request *req)
@@ -1958,8 +2064,28 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
* might be pointing at!
*/
result = nvme_disable_ctrl(&dev->ctrl, false);
- if (result < 0)
- return result;
+ if (result < 0) {
+ struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+ /*
+ * The NVMe Controller Reset method did not get an expected
+ * CSTS.RDY transition, so something with the device appears to
+ * be stuck. Use the lower level and bigger hammer PCIe
+ * Function Level Reset to attempt restoring the device to its
+ * initial state, and try again.
+ */
+ result = pcie_reset_flr(pdev, false);
+ if (result < 0)
+ return result;
+
+ pci_restore_state(pdev);
+ result = nvme_disable_ctrl(&dev->ctrl, false);
+ if (result < 0)
+ return result;
+
+ dev_info(dev->ctrl.device,
+ "controller reset completed after pcie flr\n");
+ }
result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
if (result)
@@ -2331,7 +2457,7 @@ static ssize_t cmb_show(struct device *dev, struct device_attribute *attr,
{
struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
- return sysfs_emit(buf, "cmbloc : x%08x\ncmbsz : x%08x\n",
+ return sysfs_emit(buf, "cmbloc : 0x%08x\ncmbsz : 0x%08x\n",
ndev->cmbloc, ndev->cmbsz);
}
static DEVICE_ATTR_RO(cmb);
@@ -2518,7 +2644,8 @@ static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
*/
if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
return 1;
- return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues;
+ return blk_mq_num_possible_queues(0) + dev->nr_write_queues +
+ dev->nr_poll_queues;
}
static int nvme_setup_io_queues(struct nvme_dev *dev)
@@ -2913,13 +3040,13 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
{
size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
- size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS;
+ size_t alloc_size = sizeof(struct nvme_dma_vec) * NVME_MAX_SEGS;
- dev->iod_mempool = mempool_create_node(1,
+ dev->dmavec_mempool = mempool_create_node(1,
mempool_kmalloc, mempool_kfree,
(void *)alloc_size, GFP_KERNEL,
dev_to_node(dev->dev));
- if (!dev->iod_mempool)
+ if (!dev->dmavec_mempool)
return -ENOMEM;
dev->iod_meta_mempool = mempool_create_node(1,
@@ -2928,10 +3055,9 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
dev_to_node(dev->dev));
if (!dev->iod_meta_mempool)
goto free;
-
return 0;
free:
- mempool_destroy(dev->iod_mempool);
+ mempool_destroy(dev->dmavec_mempool);
return -ENOMEM;
}
@@ -3272,7 +3398,8 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
* over a single page.
*/
dev->ctrl.max_hw_sectors = min_t(u32,
- NVME_MAX_KB_SZ << 1, dma_opt_mapping_size(&pdev->dev) >> 9);
+ NVME_MAX_BYTES >> SECTOR_SHIFT,
+ dma_opt_mapping_size(&pdev->dev) >> 9);
dev->ctrl.max_segments = NVME_MAX_SEGS;
dev->ctrl.max_integrity_segments = 1;
return dev;
@@ -3380,7 +3507,7 @@ out_disable:
nvme_dbbuf_dma_free(dev);
nvme_free_queues(dev, 0);
out_release_iod_mempool:
- mempool_destroy(dev->iod_mempool);
+ mempool_destroy(dev->dmavec_mempool);
mempool_destroy(dev->iod_meta_mempool);
out_dev_unmap:
nvme_dev_unmap(dev);
@@ -3444,7 +3571,7 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_dev_remove_admin(dev);
nvme_dbbuf_dma_free(dev);
nvme_free_queues(dev, 0);
- mempool_destroy(dev->iod_mempool);
+ mempool_destroy(dev->dmavec_mempool);
mempool_destroy(dev->iod_meta_mempool);
nvme_release_descriptor_pools(dev);
nvme_dev_unmap(dev);
@@ -3847,7 +3974,6 @@ static int __init nvme_init(void)
BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
- BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_DESCRIPTORS);
return pci_register_driver(&nvme_driver);
}
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 9bd3646568d0..190a4cfa8a5e 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -877,7 +877,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
/*
* Only start IO queues for which we have allocated the tagset
- * and limitted it to the available queues. On reconnects, the
+ * and limited it to the available queues. On reconnects, the
* queue number might have changed.
*/
nr_queues = min(ctrl->tag_set.nr_hw_queues + 1, ctrl->ctrl.queue_count);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index d924008c3949..9233f088fac8 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1745,9 +1745,14 @@ static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl,
qid, ret);
tls_handshake_cancel(queue->sock->sk);
} else {
- dev_dbg(nctrl->device,
- "queue %d: TLS handshake complete, error %d\n",
- qid, queue->tls_err);
+ if (queue->tls_err) {
+ dev_err(nctrl->device,
+ "queue %d: TLS handshake complete, error %d\n",
+ qid, queue->tls_err);
+ } else {
+ dev_dbg(nctrl->device,
+ "queue %d: TLS handshake complete\n", qid);
+ }
ret = queue->tls_err;
}
return ret;
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 175c5b6d4dd5..884286f90688 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -581,8 +581,6 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
if (ns->enabled)
goto out_unlock;
- ret = -EMFILE;
-
ret = nvmet_bdev_ns_enable(ns);
if (ret == -ENOTBLK)
ret = nvmet_file_ns_enable(ns);
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index eba42df2f821..8d246b8ca604 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -46,6 +46,10 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id)
id->npda = id->npdg;
/* NOWS = Namespace Optimal Write Size */
id->nows = to0based(bdev_io_opt(bdev) / bdev_logical_block_size(bdev));
+
+ /* Set WZDS and DRB if device supports unmapped write zeroes */
+ if (bdev_write_zeroes_unmap_sectors(bdev))
+ id->dlfeat = (1 << 3) | 0x1;
}
void nvmet_bdev_ns_disable(struct nvmet_ns *ns)
@@ -65,7 +69,7 @@ static void nvmet_bdev_ns_enable_integrity(struct nvmet_ns *ns)
return;
if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC) {
- ns->metadata_size = bi->tuple_size;
+ ns->metadata_size = bi->metadata_size;
if (bi->flags & BLK_INTEGRITY_REF_TAG)
ns->pi_type = NVME_NS_DPS_PI_TYPE1;
else
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index b7515c53829b..3b4b0df8f879 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -106,7 +106,7 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req)
pctrl->max_hw_sectors);
/*
- * nvmet_passthru_map_sg is limitted to using a single bio so limit
+ * nvmet_passthru_map_sg is limited to using a single bio so limit
* the mdts based on BIO_MAX_VECS as well
*/
max_hw_sectors = min_not_zero(BIO_MAX_VECS << PAGE_SECTORS_SHIFT,
@@ -147,7 +147,7 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req)
* When passthru controller is setup using nvme-loop transport it will
* export the passthru ctrl subsysnqn (PCIe NVMe ctrl) and will fail in
* the nvme/host/core.c in the nvme_init_subsystem()->nvme_active_ctrl()
- * code path with duplicate ctr subsynqn. In order to prevent that we
+ * code path with duplicate ctrl subsysnqn. In order to prevent that we
* mask the passthru-ctrl subsysnqn with the target ctrl subsysnqn.
*/
memcpy(id->subnqn, ctrl->subsysnqn, sizeof(id->subnqn));
diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c
index a4295a5b8d28..2e78397a7373 100644
--- a/drivers/nvme/target/pci-epf.c
+++ b/drivers/nvme/target/pci-epf.c
@@ -1242,8 +1242,11 @@ static void nvmet_pci_epf_queue_response(struct nvmet_req *req)
iod->status = le16_to_cpu(req->cqe->status) >> 1;
- /* If we have no data to transfer, directly complete the command. */
- if (!iod->data_len || iod->dma_dir != DMA_TO_DEVICE) {
+ /*
+ * If the command failed or we have no data to transfer, complete the
+ * command immediately.
+ */
+ if (iod->status || !iod->data_len || iod->dma_dir != DMA_TO_DEVICE) {
nvmet_pci_epf_complete_iod(iod);
return;
}
@@ -1604,8 +1607,13 @@ static void nvmet_pci_epf_exec_iod_work(struct work_struct *work)
goto complete;
}
+ /*
+ * If nvmet_req_init() fails (e.g., unsupported opcode) it will call
+ * __nvmet_req_complete() internally which will call
+ * nvmet_pci_epf_queue_response() and will complete the command directly.
+ */
if (!nvmet_req_init(req, &iod->sq->nvme_sq, &nvmet_pci_epf_fabrics_ops))
- goto complete;
+ return;
iod->data_len = nvmet_req_transfer_len(req);
if (iod->data_len) {
@@ -1643,10 +1651,11 @@ static void nvmet_pci_epf_exec_iod_work(struct work_struct *work)
wait_for_completion(&iod->done);
- if (iod->status == NVME_SC_SUCCESS) {
- WARN_ON_ONCE(!iod->data_len || iod->dma_dir != DMA_TO_DEVICE);
- nvmet_pci_epf_transfer_iod_data(iod);
- }
+ if (iod->status != NVME_SC_SUCCESS)
+ return;
+
+ WARN_ON_ONCE(!iod->data_len || iod->dma_dir != DMA_TO_DEVICE);
+ nvmet_pci_epf_transfer_iod_data(iod);
complete:
nvmet_pci_epf_complete_iod(iod);
@@ -1860,7 +1869,7 @@ static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
ctrl->io_cqes = 1UL << nvmet_cc_iocqes(ctrl->cc);
if (ctrl->io_cqes < sizeof(struct nvme_completion)) {
dev_err(ctrl->dev, "Unsupported I/O CQES %zu (need %zu)\n",
- ctrl->io_sqes, sizeof(struct nvme_completion));
+ ctrl->io_cqes, sizeof(struct nvme_completion));
goto err;
}
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 688033b88d38..470bf37e5a63 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1928,10 +1928,10 @@ static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
struct sock *sk = queue->sock->sk;
/* Restore the default callbacks before starting upcall */
- read_lock_bh(&sk->sk_callback_lock);
+ write_lock_bh(&sk->sk_callback_lock);
sk->sk_user_data = NULL;
sk->sk_data_ready = port->data_ready;
- read_unlock_bh(&sk->sk_callback_lock);
+ write_unlock_bh(&sk->sk_callback_lock);
if (!nvmet_tcp_try_peek_pdu(queue)) {
if (!nvmet_tcp_tls_handshake(queue))
return;
diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
index 29a60fabfcc8..15a579cf528c 100644
--- a/drivers/nvme/target/zns.c
+++ b/drivers/nvme/target/zns.c
@@ -541,7 +541,7 @@ void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
struct bio *bio;
int sg_cnt;
- /* Request is completed on len mismatch in nvmet_check_transter_len() */
+ /* Request is completed on len mismatch in nvmet_check_transfer_len() */
if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
return;