Diffstat (limited to 'drivers/nvme/host/pci.c')
-rw-r--r--  drivers/nvme/host/pci.c | 2263
1 file changed, 1422 insertions(+), 841 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d8585df2c2fd..0e4caeab739c 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -5,19 +5,20 @@
*/
#include <linux/acpi.h>
-#include <linux/aer.h>
#include <linux/async.h>
#include <linux/blkdev.h>
-#include <linux/blk-mq.h>
-#include <linux/blk-mq-pci.h>
+#include <linux/blk-mq-dma.h>
#include <linux/blk-integrity.h>
#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
+#include <linux/kstrtox.h>
+#include <linux/memremap.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/nodemask.h>
#include <linux/once.h>
#include <linux/pci.h>
#include <linux/suspend.h>
@@ -26,7 +27,6 @@
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/io-64-nonatomic-hi-lo.h>
#include <linux/sed-opal.h>
-#include <linux/pci-p2pdma.h>
#include "trace.h"
#include "nvme.h"
@@ -34,17 +34,46 @@
#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes)
#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion))
-#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc))
+/* Optimisation for I/Os between 4k and 128k */
+#define NVME_SMALL_POOL_SIZE 256
/*
- * These can be higher, but we need to ensure that any command doesn't
- * require an sg allocation that needs more than a page of data.
+ * Arbitrary upper bound.
*/
-#define NVME_MAX_KB_SZ 4096
-#define NVME_MAX_SEGS 127
+#define NVME_MAX_BYTES SZ_8M
+#define NVME_MAX_NR_DESCRIPTORS 5
+
+/*
+ * For data SGLs we support a single descriptor's worth of SGL entries.
+ * For PRPs, segments don't matter at all.
+ */
+#define NVME_MAX_SEGS \
+ (NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc))
+
+/*
+ * For metadata SGLs, only the small descriptor is supported, and the first
+ * entry is the segment descriptor (for the data SGL, that descriptor sits in the SQE instead).
+ */
+#define NVME_MAX_META_SEGS \
+ ((NVME_SMALL_POOL_SIZE / sizeof(struct nvme_sgl_desc)) - 1)
+
+/*
+ * The last entry is used to link to the next descriptor.
+ */
+#define PRPS_PER_PAGE \
+ (((NVME_CTRL_PAGE_SIZE / sizeof(__le64))) - 1)
+
+/*
+ * I/O could be non-aligned both at the beginning and end.
+ */
+#define MAX_PRP_RANGE \
+ (NVME_MAX_BYTES + 2 * (NVME_CTRL_PAGE_SIZE - 1))
+
+static_assert(MAX_PRP_RANGE / NVME_CTRL_PAGE_SIZE <=
+ (1 /* prp1 */ + NVME_MAX_NR_DESCRIPTORS * PRPS_PER_PAGE));
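/*
 * A worked check of the bound asserted above; an illustrative sketch that
 * assumes the common 4 KiB NVME_CTRL_PAGE_SIZE and SZ_8M == 8 MiB, neither
 * of which is spelled out in this hunk:
 *
 *   PRPS_PER_PAGE  = 4096 / 8 - 1       = 511
 *   MAX_PRP_RANGE  = 8 MiB + 2 * 4095   = 8,396,798 bytes
 *   PRPs needed    = 8,396,798 / 4096   = 2049
 *   PRPs available = 1 + 5 * 511        = 2556
 *
 * so NVME_MAX_NR_DESCRIPTORS == 5 chained PRP lists comfortably cover a
 * maximally misaligned NVME_MAX_BYTES transfer.
 */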
static int use_threaded_interrupts;
-module_param(use_threaded_interrupts, int, 0);
+module_param(use_threaded_interrupts, int, 0444);
static bool use_cmb_sqes = true;
module_param(use_cmb_sqes, bool, 0444);
@@ -79,7 +108,7 @@ static int io_queue_count_set(const char *val, const struct kernel_param *kp)
int ret;
ret = kstrtouint(val, 10, &n);
- if (ret != 0 || n > num_possible_cpus())
+ if (ret != 0 || n > blk_mq_num_possible_queues(0))
return -EINVAL;
return param_set_uint(val, kp);
}
@@ -107,7 +136,13 @@ struct nvme_dev;
struct nvme_queue;
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
-static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
+static void nvme_delete_io_queues(struct nvme_dev *dev);
+static void nvme_update_attrs(struct nvme_dev *dev);
+
+struct nvme_descriptor_pools {
+ struct dma_pool *large;
+ struct dma_pool *small;
+};
/*
* Represents an NVM Express device. Each nvme_dev is a PCI function.
@@ -118,8 +153,6 @@ struct nvme_dev {
struct blk_mq_tag_set admin_tagset;
u32 __iomem *dbs;
struct device *dev;
- struct dma_pool *prp_page_pool;
- struct dma_pool *prp_small_pool;
unsigned online_queues;
unsigned max_qid;
unsigned io_queues[HCTX_MAX_TYPES];
@@ -129,7 +162,6 @@ struct nvme_dev {
u32 db_stride;
void __iomem *bar;
unsigned long bar_mapped_size;
- struct work_struct remove_work;
struct mutex shutdown_lock;
bool subsystem;
u64 cmb_size;
@@ -139,26 +171,26 @@ struct nvme_dev {
struct nvme_ctrl ctrl;
u32 last_ps;
bool hmb;
-
- mempool_t *iod_mempool;
+ struct sg_table *hmb_sgt;
+ mempool_t *dmavec_mempool;
/* shadow doorbell buffer support: */
- u32 *dbbuf_dbs;
+ __le32 *dbbuf_dbs;
dma_addr_t dbbuf_dbs_dma_addr;
- u32 *dbbuf_eis;
+ __le32 *dbbuf_eis;
dma_addr_t dbbuf_eis_dma_addr;
/* host memory buffer support: */
u64 host_mem_size;
u32 nr_host_mem_descs;
+ u32 host_mem_descs_size;
dma_addr_t host_mem_descs_dma;
struct nvme_host_mem_buf_desc *host_mem_descs;
void **host_mem_desc_bufs;
unsigned int nr_allocated_queues;
unsigned int nr_write_queues;
unsigned int nr_poll_queues;
-
- bool attrs_added;
+ struct nvme_descriptor_pools descriptor_pools[];
};
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -188,6 +220,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
*/
struct nvme_queue {
struct nvme_dev *dev;
+ struct nvme_descriptor_pools descriptor_pools;
spinlock_t sq_lock;
void *sq_cmds;
/* only used for poll queues: */
@@ -209,31 +242,64 @@ struct nvme_queue {
#define NVMEQ_SQ_CMB 1
#define NVMEQ_DELETE_ERROR 2
#define NVMEQ_POLLED 3
- u32 *dbbuf_sq_db;
- u32 *dbbuf_cq_db;
- u32 *dbbuf_sq_ei;
- u32 *dbbuf_cq_ei;
+ __le32 *dbbuf_sq_db;
+ __le32 *dbbuf_cq_db;
+ __le32 *dbbuf_sq_ei;
+ __le32 *dbbuf_cq_ei;
struct completion delete_done;
};
+/* bits for iod->flags */
+enum nvme_iod_flags {
+ /* this command has been aborted by the timeout handler */
+ IOD_ABORTED = 1U << 0,
+
+ /* uses the small descriptor pool */
+ IOD_SMALL_DESCRIPTOR = 1U << 1,
+
+ /* single segment dma mapping */
+ IOD_SINGLE_SEGMENT = 1U << 2,
+
+ /* Data payload contains p2p memory */
+ IOD_DATA_P2P = 1U << 3,
+
+ /* Metadata contains p2p memory */
+ IOD_META_P2P = 1U << 4,
+
+ /* Data payload contains MMIO memory */
+ IOD_DATA_MMIO = 1U << 5,
+
+ /* Metadata contains MMIO memory */
+ IOD_META_MMIO = 1U << 6,
+
+ /* Metadata using non-coalesced MPTR */
+ IOD_SINGLE_META_SEGMENT = 1U << 7,
+};
+
+struct nvme_dma_vec {
+ dma_addr_t addr;
+ unsigned int len;
+};
+
/*
* The nvme_iod describes the data in an I/O.
- *
- * The sg pointer contains the list of PRP/SGL chunk allocations in addition
- * to the actual struct scatterlist.
*/
struct nvme_iod {
struct nvme_request req;
struct nvme_command cmd;
- struct nvme_queue *nvmeq;
- bool use_sgl;
- int aborted;
- int npages; /* In the PRP list. 0 means small pool in use */
- int nents; /* Used in scatterlist */
- dma_addr_t first_dma;
- unsigned int dma_len; /* length of single DMA segment mapping */
+ u8 flags;
+ u8 nr_descriptors;
+
+ unsigned int total_len;
+ struct dma_iova_state dma_state;
+ void *descriptors[NVME_MAX_NR_DESCRIPTORS];
+ struct nvme_dma_vec *dma_vecs;
+ unsigned int nr_dma_vecs;
+
dma_addr_t meta_dma;
- struct scatterlist *sg;
+ unsigned int meta_total_len;
+ struct dma_iova_state meta_dma_state;
+ struct nvme_sgl_desc *meta_descriptor;
};
static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
@@ -241,10 +307,13 @@ static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
return dev->nr_allocated_queues * 8 * dev->db_stride;
}
-static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
+static void nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
{
unsigned int mem_size = nvme_dbbuf_size(dev);
+ if (!(dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP))
+ return;
+
if (dev->dbbuf_dbs) {
/*
* Clear the dbbuf memory so the driver doesn't observe stale
@@ -252,25 +321,27 @@ static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
*/
memset(dev->dbbuf_dbs, 0, mem_size);
memset(dev->dbbuf_eis, 0, mem_size);
- return 0;
+ return;
}
dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
&dev->dbbuf_dbs_dma_addr,
GFP_KERNEL);
if (!dev->dbbuf_dbs)
- return -ENOMEM;
+ goto fail;
dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
&dev->dbbuf_eis_dma_addr,
GFP_KERNEL);
- if (!dev->dbbuf_eis) {
- dma_free_coherent(dev->dev, mem_size,
- dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
- dev->dbbuf_dbs = NULL;
- return -ENOMEM;
- }
+ if (!dev->dbbuf_eis)
+ goto fail_free_dbbuf_dbs;
+ return;
- return 0;
+fail_free_dbbuf_dbs:
+ dma_free_coherent(dev->dev, mem_size, dev->dbbuf_dbs,
+ dev->dbbuf_dbs_dma_addr);
+ dev->dbbuf_dbs = NULL;
+fail:
+ dev_warn(dev->dev, "unable to allocate dma for dbbuf\n");
}
static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
@@ -340,11 +411,11 @@ static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
}
/* Update dbbuf and return true if an MMIO is required */
-static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
- volatile u32 *dbbuf_ei)
+static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db,
+ volatile __le32 *dbbuf_ei)
{
if (dbbuf_db) {
- u16 old_value;
+ u16 old_value, event_idx;
/*
* Ensure that the queue is written before updating
@@ -352,90 +423,104 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
*/
wmb();
- old_value = *dbbuf_db;
- *dbbuf_db = value;
+ old_value = le32_to_cpu(*dbbuf_db);
+ *dbbuf_db = cpu_to_le32(value);
/*
* Ensure that the doorbell is updated before reading the event
* index from memory. The controller needs to provide similar
- * ordering to ensure the envent index is updated before reading
+ * ordering to ensure the event index is updated before reading
* the doorbell.
*/
mb();
- if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value))
+ event_idx = le32_to_cpu(*dbbuf_ei);
+ if (!nvme_dbbuf_need_event(event_idx, value, old_value))
return false;
}
return true;
}
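/*
 * The event-index test above relies on nvme_dbbuf_need_event(), whose
 * definition lies outside this hunk.  As a sketch (an assumption, not a
 * quote of the driver), it behaves like the classic virtio-style check and
 * returns true when event_idx falls in the half-open window [old, new_idx)
 * under 16-bit wrap-around:
 */
static inline bool nvme_dbbuf_need_event_sketch(u16 event_idx, u16 new_idx,
		u16 old)
{
	/* true iff old <= event_idx < new_idx, computed wrap-safely */
	return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
}
/*
 * Example: with old doorbell 10 and new doorbell 14, event_idx 12 still
 * requires an MMIO doorbell write, while event_idx 20 means the shadow
 * buffer update alone is sufficient.
 */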
-/*
- * Will slightly overestimate the number of pages needed. This is OK
- * as it only leads to a small amount of wasted memory for the lifetime of
- * the I/O.
- */
-static int nvme_pci_npages_prp(void)
+static struct nvme_descriptor_pools *
+nvme_setup_descriptor_pools(struct nvme_dev *dev, unsigned numa_node)
{
- unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE,
- NVME_CTRL_PAGE_SIZE);
- return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
+ struct nvme_descriptor_pools *pools = &dev->descriptor_pools[numa_node];
+ size_t small_align = NVME_SMALL_POOL_SIZE;
+
+ if (pools->small)
+ return pools; /* already initialized */
+
+ pools->large = dma_pool_create_node("nvme descriptor page", dev->dev,
+ NVME_CTRL_PAGE_SIZE, NVME_CTRL_PAGE_SIZE, 0, numa_node);
+ if (!pools->large)
+ return ERR_PTR(-ENOMEM);
+
+ if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512)
+ small_align = 512;
+
+ pools->small = dma_pool_create_node("nvme descriptor small", dev->dev,
+ NVME_SMALL_POOL_SIZE, small_align, 0, numa_node);
+ if (!pools->small) {
+ dma_pool_destroy(pools->large);
+ pools->large = NULL;
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return pools;
}
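/*
 * Rough capacity math for the two pools, assuming the common 4 KiB
 * NVME_CTRL_PAGE_SIZE (an assumption, not fixed by this hunk):
 *
 *   large element: 4096 bytes -> 512 PRP entries or 256 SGL descriptors
 *   small element:  256 bytes ->  32 PRP entries or  16 SGL descriptors
 *
 * 32 PRP entries map up to 32 * 4 KiB = 128 KiB, which is why the small
 * pool is described earlier as an optimisation for I/Os between 4k and
 * 128k.
 */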
-/*
- * Calculates the number of pages needed for the SGL segments. For example a 4k
- * page can accommodate 256 SGL descriptors.
- */
-static int nvme_pci_npages_sgl(void)
+static void nvme_release_descriptor_pools(struct nvme_dev *dev)
{
- return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc),
- PAGE_SIZE);
+ unsigned i;
+
+ for (i = 0; i < nr_node_ids; i++) {
+ struct nvme_descriptor_pools *pools = &dev->descriptor_pools[i];
+
+ dma_pool_destroy(pools->large);
+ dma_pool_destroy(pools->small);
+ }
}
-static size_t nvme_pci_iod_alloc_size(void)
+static int nvme_init_hctx_common(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned qid)
{
- size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());
+ struct nvme_dev *dev = to_nvme_dev(data);
+ struct nvme_queue *nvmeq = &dev->queues[qid];
+ struct nvme_descriptor_pools *pools;
+ struct blk_mq_tags *tags;
- return sizeof(__le64 *) * npages +
- sizeof(struct scatterlist) * NVME_MAX_SEGS;
+ tags = qid ? dev->tagset.tags[qid - 1] : dev->admin_tagset.tags[0];
+ WARN_ON(tags != hctx->tags);
+ pools = nvme_setup_descriptor_pools(dev, hctx->numa_node);
+ if (IS_ERR(pools))
+ return PTR_ERR(pools);
+
+ nvmeq->descriptor_pools = *pools;
+ hctx->driver_data = nvmeq;
+ return 0;
}
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
- struct nvme_dev *dev = data;
- struct nvme_queue *nvmeq = &dev->queues[0];
-
WARN_ON(hctx_idx != 0);
- WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
-
- hctx->driver_data = nvmeq;
- return 0;
+ return nvme_init_hctx_common(hctx, data, 0);
}
static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
- unsigned int hctx_idx)
+ unsigned int hctx_idx)
{
- struct nvme_dev *dev = data;
- struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
-
- WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
- hctx->driver_data = nvmeq;
- return 0;
+ return nvme_init_hctx_common(hctx, data, hctx_idx + 1);
}
-static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
- unsigned int hctx_idx, unsigned int numa_node)
+static int nvme_pci_init_request(struct blk_mq_tag_set *set,
+ struct request *req, unsigned int hctx_idx,
+ unsigned int numa_node)
{
- struct nvme_dev *dev = set->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
- struct nvme_queue *nvmeq = &dev->queues[queue_idx];
-
- BUG_ON(!nvmeq);
- iod->nvmeq = nvmeq;
- nvme_req(req)->ctrl = &dev->ctrl;
+ nvme_req(req)->ctrl = set->driver_data;
nvme_req(req)->cmd = &iod->cmd;
return 0;
}
@@ -449,9 +534,9 @@ static int queue_irq_offset(struct nvme_dev *dev)
return 0;
}
-static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
+static void nvme_pci_map_queues(struct blk_mq_tag_set *set)
{
- struct nvme_dev *dev = set->driver_data;
+ struct nvme_dev *dev = to_nvme_dev(set->driver_data);
int i, qoff, offset;
offset = queue_irq_offset(dev);
@@ -470,14 +555,12 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
*/
map->queue_offset = qoff;
if (i != HCTX_TYPE_POLL && offset)
- blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
+ blk_mq_map_hw_queues(map, dev->dev, offset);
else
blk_mq_map_queues(map);
qoff += map->nr_queues;
offset += map->nr_queues;
}
-
- return 0;
}
/*
@@ -519,206 +602,368 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
spin_unlock(&nvmeq->sq_lock);
}
-static void **nvme_pci_iod_list(struct request *req)
+enum nvme_use_sgl {
+ SGL_UNSUPPORTED,
+ SGL_SUPPORTED,
+ SGL_FORCED,
+};
+
+static inline bool nvme_pci_metadata_use_sgls(struct request *req)
+{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct nvme_dev *dev = nvmeq->dev;
+
+ if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl))
+ return false;
+ return req->nr_integrity_segments > 1 ||
+ nvme_req(req)->flags & NVME_REQ_USERCMD;
+}
+
+static inline enum nvme_use_sgl nvme_pci_use_sgls(struct nvme_dev *dev,
+ struct request *req)
{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+
+ if (nvmeq->qid && nvme_ctrl_sgl_supported(&dev->ctrl)) {
+ /*
+ * When the controller is capable of using SGL, there are
+ * several conditions that we force to use it:
+ *
+ * 1. A request containing page gaps within the controller's
+ * mask can not use the PRP format.
+ *
+ * 2. User commands use SGL because that lets the device
+ * validate the requested transfer lengths.
+ *
+ * 3. Multiple integrity segments must use SGL as that's the
+ * only way to describe such a command in NVMe.
+ */
+ if (req_phys_gap_mask(req) & (NVME_CTRL_PAGE_SIZE - 1) ||
+ nvme_req(req)->flags & NVME_REQ_USERCMD ||
+ req->nr_integrity_segments > 1)
+ return SGL_FORCED;
+ return SGL_SUPPORTED;
+ }
+
+ return SGL_UNSUPPORTED;
}
-static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
+static unsigned int nvme_pci_avg_seg_size(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- int nseg = blk_rq_nr_phys_segments(req);
- unsigned int avg_seg_size;
+ unsigned int nseg;
- avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
+ if (blk_rq_dma_map_coalesce(&iod->dma_state))
+ nseg = 1;
+ else
+ nseg = blk_rq_nr_phys_segments(req);
+ return DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
+}
- if (!nvme_ctrl_sgl_supported(&dev->ctrl))
- return false;
- if (!iod->nvmeq->qid)
- return false;
- if (!sgl_threshold || avg_seg_size < sgl_threshold)
- return false;
- return true;
+static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq,
+ struct nvme_iod *iod)
+{
+ if (iod->flags & IOD_SMALL_DESCRIPTOR)
+ return nvmeq->descriptor_pools.small;
+ return nvmeq->descriptor_pools.large;
}
-static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
+static inline bool nvme_pci_cmd_use_meta_sgl(struct nvme_command *cmd)
{
+ return (cmd->common.flags & NVME_CMD_SGL_ALL) == NVME_CMD_SGL_METASEG;
+}
+
+static inline bool nvme_pci_cmd_use_sgl(struct nvme_command *cmd)
+{
+ return cmd->common.flags &
+ (NVME_CMD_SGL_METABUF | NVME_CMD_SGL_METASEG);
+}
+
+static inline dma_addr_t nvme_pci_first_desc_dma_addr(struct nvme_command *cmd)
+{
+ if (nvme_pci_cmd_use_sgl(cmd))
+ return le64_to_cpu(cmd->common.dptr.sgl.addr);
+ return le64_to_cpu(cmd->common.dptr.prp2);
+}
+
+static void nvme_free_descriptors(struct request *req)
+{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- dma_addr_t dma_addr = iod->first_dma;
+ dma_addr_t dma_addr = nvme_pci_first_desc_dma_addr(&iod->cmd);
int i;
- for (i = 0; i < iod->npages; i++) {
- __le64 *prp_list = nvme_pci_iod_list(req)[i];
+ if (iod->nr_descriptors == 1) {
+ dma_pool_free(nvme_dma_pool(nvmeq, iod), iod->descriptors[0],
+ dma_addr);
+ return;
+ }
+
+ for (i = 0; i < iod->nr_descriptors; i++) {
+ __le64 *prp_list = iod->descriptors[i];
dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
- dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
+ dma_pool_free(nvmeq->descriptor_pools.large, prp_list,
+ dma_addr);
dma_addr = next_dma_addr;
}
}
-static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
+static void nvme_free_prps(struct request *req, unsigned int attrs)
{
- const int last_sg = SGES_PER_PAGE - 1;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- dma_addr_t dma_addr = iod->first_dma;
- int i;
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ unsigned int i;
- for (i = 0; i < iod->npages; i++) {
- struct nvme_sgl_desc *sg_list = nvme_pci_iod_list(req)[i];
- dma_addr_t next_dma_addr = le64_to_cpu((sg_list[last_sg]).addr);
+ for (i = 0; i < iod->nr_dma_vecs; i++)
+ dma_unmap_phys(nvmeq->dev->dev, iod->dma_vecs[i].addr,
+ iod->dma_vecs[i].len, rq_dma_dir(req), attrs);
+ mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool);
+}
- dma_pool_free(dev->prp_page_pool, sg_list, dma_addr);
- dma_addr = next_dma_addr;
+static void nvme_free_sgls(struct request *req, struct nvme_sgl_desc *sge,
+ struct nvme_sgl_desc *sg_list, unsigned int attrs)
+{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ enum dma_data_direction dir = rq_dma_dir(req);
+ unsigned int len = le32_to_cpu(sge->length);
+ struct device *dma_dev = nvmeq->dev->dev;
+ unsigned int i;
+
+ if (sge->type == (NVME_SGL_FMT_DATA_DESC << 4)) {
+ dma_unmap_phys(dma_dev, le64_to_cpu(sge->addr), len, dir,
+ attrs);
+ return;
}
+
+ for (i = 0; i < len / sizeof(*sg_list); i++)
+ dma_unmap_phys(dma_dev, le64_to_cpu(sg_list[i].addr),
+ le32_to_cpu(sg_list[i].length), dir, attrs);
}
-static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
+static void nvme_unmap_metadata(struct request *req)
{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ enum pci_p2pdma_map_type map = PCI_P2PDMA_MAP_NONE;
+ enum dma_data_direction dir = rq_dma_dir(req);
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct device *dma_dev = nvmeq->dev->dev;
+ struct nvme_sgl_desc *sge = iod->meta_descriptor;
+ unsigned int attrs = 0;
- if (is_pci_p2pdma_page(sg_page(iod->sg)))
- pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
- rq_dma_dir(req));
- else
- dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
+ if (iod->flags & IOD_SINGLE_META_SEGMENT) {
+ dma_unmap_page(dma_dev, iod->meta_dma,
+ rq_integrity_vec(req).bv_len,
+ rq_dma_dir(req));
+ return;
+ }
+
+ if (iod->flags & IOD_META_P2P)
+ map = PCI_P2PDMA_MAP_BUS_ADDR;
+ else if (iod->flags & IOD_META_MMIO) {
+ map = PCI_P2PDMA_MAP_THRU_HOST_BRIDGE;
+ attrs |= DMA_ATTR_MMIO;
+ }
+
+ if (!blk_rq_dma_unmap(req, dma_dev, &iod->meta_dma_state,
+ iod->meta_total_len, map)) {
+ if (nvme_pci_cmd_use_meta_sgl(&iod->cmd))
+ nvme_free_sgls(req, sge, &sge[1], attrs);
+ else
+ dma_unmap_phys(dma_dev, iod->meta_dma,
+ iod->meta_total_len, dir, attrs);
+ }
+
+ if (iod->meta_descriptor)
+ dma_pool_free(nvmeq->descriptor_pools.small,
+ iod->meta_descriptor, iod->meta_dma);
}
-static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
+static void nvme_unmap_data(struct request *req)
{
+ enum pci_p2pdma_map_type map = PCI_P2PDMA_MAP_NONE;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-
- if (iod->dma_len) {
- dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
- rq_dma_dir(req));
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct device *dma_dev = nvmeq->dev->dev;
+ unsigned int attrs = 0;
+
+ if (iod->flags & IOD_SINGLE_SEGMENT) {
+ static_assert(offsetof(union nvme_data_ptr, prp1) ==
+ offsetof(union nvme_data_ptr, sgl.addr));
+ dma_unmap_page(dma_dev, le64_to_cpu(iod->cmd.common.dptr.prp1),
+ iod->total_len, rq_dma_dir(req));
return;
}
- WARN_ON_ONCE(!iod->nents);
+ if (iod->flags & IOD_DATA_P2P)
+ map = PCI_P2PDMA_MAP_BUS_ADDR;
+ else if (iod->flags & IOD_DATA_MMIO) {
+ map = PCI_P2PDMA_MAP_THRU_HOST_BRIDGE;
+ attrs |= DMA_ATTR_MMIO;
+ }
- nvme_unmap_sg(dev, req);
- if (iod->npages == 0)
- dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
- iod->first_dma);
- else if (iod->use_sgl)
- nvme_free_sgls(dev, req);
- else
- nvme_free_prps(dev, req);
- mempool_free(iod->sg, dev->iod_mempool);
+ if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len,
+ map)) {
+ if (nvme_pci_cmd_use_sgl(&iod->cmd))
+ nvme_free_sgls(req, iod->descriptors[0],
+ &iod->cmd.common.dptr.sgl, attrs);
+ else
+ nvme_free_prps(req, attrs);
+ }
+
+ if (iod->nr_descriptors)
+ nvme_free_descriptors(req);
}
-static void nvme_print_sgl(struct scatterlist *sgl, int nents)
+static bool nvme_pci_prp_iter_next(struct request *req, struct device *dma_dev,
+ struct blk_dma_iter *iter)
{
- int i;
- struct scatterlist *sg;
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- for_each_sg(sgl, sg, nents, i) {
- dma_addr_t phys = sg_phys(sg);
- pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
- "dma_address:%pad dma_length:%d\n",
- i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
- sg_dma_len(sg));
+ if (iter->len)
+ return true;
+ if (!blk_rq_dma_map_iter_next(req, dma_dev, &iod->dma_state, iter))
+ return false;
+ if (!dma_use_iova(&iod->dma_state) && dma_need_unmap(dma_dev)) {
+ iod->dma_vecs[iod->nr_dma_vecs].addr = iter->addr;
+ iod->dma_vecs[iod->nr_dma_vecs].len = iter->len;
+ iod->nr_dma_vecs++;
}
+ return true;
}
-static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
- struct request *req, struct nvme_rw_command *cmnd)
+static blk_status_t nvme_pci_setup_data_prp(struct request *req,
+ struct blk_dma_iter *iter)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct dma_pool *pool;
- int length = blk_rq_payload_bytes(req);
- struct scatterlist *sg = iod->sg;
- int dma_len = sg_dma_len(sg);
- u64 dma_addr = sg_dma_address(sg);
- int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ unsigned int length = blk_rq_payload_bytes(req);
+ dma_addr_t prp1_dma, prp2_dma = 0;
+ unsigned int prp_len, i;
__le64 *prp_list;
- void **list = nvme_pci_iod_list(req);
- dma_addr_t prp_dma;
- int nprps, i;
- length -= (NVME_CTRL_PAGE_SIZE - offset);
- if (length <= 0) {
- iod->first_dma = 0;
- goto done;
+ if (!dma_use_iova(&iod->dma_state) && dma_need_unmap(nvmeq->dev->dev)) {
+ iod->dma_vecs = mempool_alloc(nvmeq->dev->dmavec_mempool,
+ GFP_ATOMIC);
+ if (!iod->dma_vecs)
+ return BLK_STS_RESOURCE;
+ iod->dma_vecs[0].addr = iter->addr;
+ iod->dma_vecs[0].len = iter->len;
+ iod->nr_dma_vecs = 1;
}
- dma_len -= (NVME_CTRL_PAGE_SIZE - offset);
- if (dma_len) {
- dma_addr += (NVME_CTRL_PAGE_SIZE - offset);
- } else {
- sg = sg_next(sg);
- dma_addr = sg_dma_address(sg);
- dma_len = sg_dma_len(sg);
+ /*
+ * PRP1 always points to the start of the DMA transfers.
+ *
+ * This is the only PRP (except for the list entries) that could be
+ * non-aligned.
+ */
+ prp1_dma = iter->addr;
+ prp_len = min(length, NVME_CTRL_PAGE_SIZE -
+ (iter->addr & (NVME_CTRL_PAGE_SIZE - 1)));
+ iod->total_len += prp_len;
+ iter->addr += prp_len;
+ iter->len -= prp_len;
+ length -= prp_len;
+ if (!length)
+ goto done;
+
+ if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) {
+ if (WARN_ON_ONCE(!iter->status))
+ goto bad_sgl;
+ goto done;
}
+ /*
+ * PRP2 is usually a list, but can point to data if all data to be
+ * transferred fits into PRP1 + PRP2:
+ */
if (length <= NVME_CTRL_PAGE_SIZE) {
- iod->first_dma = dma_addr;
+ prp2_dma = iter->addr;
+ iod->total_len += length;
goto done;
}
- nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
- if (nprps <= (256 / 8)) {
- pool = dev->prp_small_pool;
- iod->npages = 0;
- } else {
- pool = dev->prp_page_pool;
- iod->npages = 1;
- }
+ if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) <=
+ NVME_SMALL_POOL_SIZE / sizeof(__le64))
+ iod->flags |= IOD_SMALL_DESCRIPTOR;
- prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
+ prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
+ &prp2_dma);
if (!prp_list) {
- iod->first_dma = dma_addr;
- iod->npages = -1;
- return BLK_STS_RESOURCE;
+ iter->status = BLK_STS_RESOURCE;
+ goto done;
}
- list[0] = prp_list;
- iod->first_dma = prp_dma;
+ iod->descriptors[iod->nr_descriptors++] = prp_list;
+
i = 0;
for (;;) {
+ prp_list[i++] = cpu_to_le64(iter->addr);
+ prp_len = min(length, NVME_CTRL_PAGE_SIZE);
+ if (WARN_ON_ONCE(iter->len < prp_len))
+ goto bad_sgl;
+
+ iod->total_len += prp_len;
+ iter->addr += prp_len;
+ iter->len -= prp_len;
+ length -= prp_len;
+ if (!length)
+ break;
+
+ if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) {
+ if (WARN_ON_ONCE(!iter->status))
+ goto bad_sgl;
+ goto done;
+ }
+
+ /*
+ * If we've filled the entire descriptor, allocate a new one that is
+ * pointed to by the last entry in the previous PRP list. To
+ * accommodate that, move the last actual entry to the new
+ * descriptor.
+ */
if (i == NVME_CTRL_PAGE_SIZE >> 3) {
__le64 *old_prp_list = prp_list;
- prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
- if (!prp_list)
- goto free_prps;
- list[iod->npages++] = prp_list;
+ dma_addr_t prp_list_dma;
+
+ prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large,
+ GFP_ATOMIC, &prp_list_dma);
+ if (!prp_list) {
+ iter->status = BLK_STS_RESOURCE;
+ goto done;
+ }
+ iod->descriptors[iod->nr_descriptors++] = prp_list;
+
prp_list[0] = old_prp_list[i - 1];
- old_prp_list[i - 1] = cpu_to_le64(prp_dma);
+ old_prp_list[i - 1] = cpu_to_le64(prp_list_dma);
i = 1;
}
- prp_list[i++] = cpu_to_le64(dma_addr);
- dma_len -= NVME_CTRL_PAGE_SIZE;
- dma_addr += NVME_CTRL_PAGE_SIZE;
- length -= NVME_CTRL_PAGE_SIZE;
- if (length <= 0)
- break;
- if (dma_len > 0)
- continue;
- if (unlikely(dma_len < 0))
- goto bad_sgl;
- sg = sg_next(sg);
- dma_addr = sg_dma_address(sg);
- dma_len = sg_dma_len(sg);
}
+
done:
- cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
- cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
- return BLK_STS_OK;
-free_prps:
- nvme_free_prps(dev, req);
- return BLK_STS_RESOURCE;
+ /*
+ * nvme_unmap_data uses the DPTR field in the SQE to tear down the
+ * mapping, so initialize it even for failures.
+ */
+ iod->cmd.common.dptr.prp1 = cpu_to_le64(prp1_dma);
+ iod->cmd.common.dptr.prp2 = cpu_to_le64(prp2_dma);
+ if (unlikely(iter->status))
+ nvme_unmap_data(req);
+ return iter->status;
+
bad_sgl:
- WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents),
- "Invalid SGL for payload:%d nents:%d\n",
- blk_rq_payload_bytes(req), iod->nents);
+ dev_err_once(nvmeq->dev->dev,
+ "Incorrectly formed request for payload:%d nents:%d\n",
+ blk_rq_payload_bytes(req), blk_rq_nr_phys_segments(req));
return BLK_STS_IOERR;
}
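/*
 * Example of the data pointer layout produced above (a sketch assuming a
 * 4 KiB NVME_CTRL_PAGE_SIZE): a 16 KiB transfer that starts 512 bytes into
 * a controller page is described as
 *
 *   prp1 = start address            (covers the first 3584 bytes)
 *   prp2 = address of a PRP list    (small pool, 4 entries: three full
 *                                    pages plus the final 512 bytes)
 *
 * Had only one more controller page been needed (e.g. a 6 KiB transfer at
 * the same offset), prp2 would point directly at the data instead of at a
 * list.
 */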
static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
- struct scatterlist *sg)
+ struct blk_dma_iter *iter)
{
- sge->addr = cpu_to_le64(sg_dma_address(sg));
- sge->length = cpu_to_le32(sg_dma_len(sg));
+ sge->addr = cpu_to_le64(iter->addr);
+ sge->length = cpu_to_le32(iter->len);
sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}
@@ -726,220 +971,281 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
dma_addr_t dma_addr, int entries)
{
sge->addr = cpu_to_le64(dma_addr);
- if (entries < SGES_PER_PAGE) {
- sge->length = cpu_to_le32(entries * sizeof(*sge));
- sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
- } else {
- sge->length = cpu_to_le32(PAGE_SIZE);
- sge->type = NVME_SGL_FMT_SEG_DESC << 4;
- }
+ sge->length = cpu_to_le32(entries * sizeof(*sge));
+ sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
}
-static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
- struct request *req, struct nvme_rw_command *cmd, int entries)
+static blk_status_t nvme_pci_setup_data_sgl(struct request *req,
+ struct blk_dma_iter *iter)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct dma_pool *pool;
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ unsigned int entries = blk_rq_nr_phys_segments(req);
struct nvme_sgl_desc *sg_list;
- struct scatterlist *sg = iod->sg;
dma_addr_t sgl_dma;
- int i = 0;
+ unsigned int mapped = 0;
- /* setting the transfer type as SGL */
- cmd->flags = NVME_CMD_SGL_METABUF;
+ /* set the transfer type as SGL */
+ iod->cmd.common.flags = NVME_CMD_SGL_METABUF;
- if (entries == 1) {
- nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
+ if (entries == 1 || blk_rq_dma_map_coalesce(&iod->dma_state)) {
+ nvme_pci_sgl_set_data(&iod->cmd.common.dptr.sgl, iter);
+ iod->total_len += iter->len;
return BLK_STS_OK;
}
- if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
- pool = dev->prp_small_pool;
- iod->npages = 0;
- } else {
- pool = dev->prp_page_pool;
- iod->npages = 1;
- }
+ if (entries <= NVME_SMALL_POOL_SIZE / sizeof(*sg_list))
+ iod->flags |= IOD_SMALL_DESCRIPTOR;
- sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
- if (!sg_list) {
- iod->npages = -1;
+ sg_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
+ &sgl_dma);
+ if (!sg_list)
return BLK_STS_RESOURCE;
- }
-
- nvme_pci_iod_list(req)[0] = sg_list;
- iod->first_dma = sgl_dma;
-
- nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
+ iod->descriptors[iod->nr_descriptors++] = sg_list;
do {
- if (i == SGES_PER_PAGE) {
- struct nvme_sgl_desc *old_sg_desc = sg_list;
- struct nvme_sgl_desc *link = &old_sg_desc[i - 1];
-
- sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
- if (!sg_list)
- goto free_sgls;
-
- i = 0;
- nvme_pci_iod_list(req)[iod->npages++] = sg_list;
- sg_list[i++] = *link;
- nvme_pci_sgl_set_seg(link, sgl_dma, entries);
+ if (WARN_ON_ONCE(mapped == entries)) {
+ iter->status = BLK_STS_IOERR;
+ break;
}
+ nvme_pci_sgl_set_data(&sg_list[mapped++], iter);
+ iod->total_len += iter->len;
+ } while (blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, &iod->dma_state,
+ iter));
- nvme_pci_sgl_set_data(&sg_list[i++], sg);
- sg = sg_next(sg);
- } while (--entries > 0);
-
- return BLK_STS_OK;
-free_sgls:
- nvme_free_sgls(dev, req);
- return BLK_STS_RESOURCE;
+ nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped);
+ if (unlikely(iter->status))
+ nvme_unmap_data(req);
+ return iter->status;
}
-static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
- struct request *req, struct nvme_rw_command *cmnd,
- struct bio_vec *bv)
+static blk_status_t nvme_pci_setup_data_simple(struct request *req,
+ enum nvme_use_sgl use_sgl)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
- unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
-
- iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
- if (dma_mapping_error(dev->dev, iod->first_dma))
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct bio_vec bv = req_bvec(req);
+ unsigned int prp1_offset = bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
+ bool prp_possible = prp1_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2;
+ dma_addr_t dma_addr;
+
+ if (!use_sgl && !prp_possible)
+ return BLK_STS_AGAIN;
+ if (is_pci_p2pdma_page(bv.bv_page))
+ return BLK_STS_AGAIN;
+
+ dma_addr = dma_map_bvec(nvmeq->dev->dev, &bv, rq_dma_dir(req), 0);
+ if (dma_mapping_error(nvmeq->dev->dev, dma_addr))
return BLK_STS_RESOURCE;
- iod->dma_len = bv->bv_len;
+ iod->total_len = bv.bv_len;
+ iod->flags |= IOD_SINGLE_SEGMENT;
+
+ if (use_sgl == SGL_FORCED || !prp_possible) {
+ iod->cmd.common.flags = NVME_CMD_SGL_METABUF;
+ iod->cmd.common.dptr.sgl.addr = cpu_to_le64(dma_addr);
+ iod->cmd.common.dptr.sgl.length = cpu_to_le32(bv.bv_len);
+ iod->cmd.common.dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
+ } else {
+ unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - prp1_offset;
+
+ iod->cmd.common.dptr.prp1 = cpu_to_le64(dma_addr);
+ iod->cmd.common.dptr.prp2 = 0;
+ if (bv.bv_len > first_prp_len)
+ iod->cmd.common.dptr.prp2 =
+ cpu_to_le64(dma_addr + first_prp_len);
+ }
- cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
- if (bv->bv_len > first_prp_len)
- cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
return BLK_STS_OK;
}
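/*
 * Worked example for the PRP branch of the fast path above (a sketch
 * assuming a 4 KiB NVME_CTRL_PAGE_SIZE): a 6 KiB bio_vec starting 512
 * bytes into a controller page qualifies because 512 + 6144 <= 2 * 4096,
 * and is described as
 *
 *   prp1 = dma_addr           (first 3584 bytes)
 *   prp2 = dma_addr + 3584    (remaining 2560 bytes, page aligned)
 *
 * A single segment whose offset plus length exceeds two controller pages,
 * and that cannot use an SGL, returns BLK_STS_AGAIN here and falls back to
 * the full DMA iterator in nvme_map_data().
 */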
-static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
- struct request *req, struct nvme_rw_command *cmnd,
- struct bio_vec *bv)
+static blk_status_t nvme_map_data(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct nvme_dev *dev = nvmeq->dev;
+ enum nvme_use_sgl use_sgl = nvme_pci_use_sgls(dev, req);
+ struct blk_dma_iter iter;
+ blk_status_t ret;
+
+ /*
+ * Try to skip the DMA iterator for single segment requests, as that
+ * significantly improves performance for small I/O sizes.
+ */
+ if (blk_rq_nr_phys_segments(req) == 1) {
+ ret = nvme_pci_setup_data_simple(req, use_sgl);
+ if (ret != BLK_STS_AGAIN)
+ return ret;
+ }
+
+ if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
+ return iter.status;
- iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
- if (dma_mapping_error(dev->dev, iod->first_dma))
+ switch (iter.p2pdma.map) {
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ iod->flags |= IOD_DATA_P2P;
+ break;
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ iod->flags |= IOD_DATA_MMIO;
+ break;
+ case PCI_P2PDMA_MAP_NONE:
+ break;
+ default:
return BLK_STS_RESOURCE;
- iod->dma_len = bv->bv_len;
+ }
- cmnd->flags = NVME_CMD_SGL_METABUF;
- cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
- cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
- cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
- return BLK_STS_OK;
+ if (use_sgl == SGL_FORCED ||
+ (use_sgl == SGL_SUPPORTED &&
+ (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold)))
+ return nvme_pci_setup_data_sgl(req, &iter);
+ return nvme_pci_setup_data_prp(req, &iter);
}
-static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
- struct nvme_command *cmnd)
+static blk_status_t nvme_pci_setup_meta_iter(struct request *req)
{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ unsigned int entries = req->nr_integrity_segments;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- blk_status_t ret = BLK_STS_RESOURCE;
- int nr_mapped;
+ struct nvme_dev *dev = nvmeq->dev;
+ struct nvme_sgl_desc *sg_list;
+ struct blk_dma_iter iter;
+ dma_addr_t sgl_dma;
+ int i = 0;
- if (blk_rq_nr_phys_segments(req) == 1) {
- struct bio_vec bv = req_bvec(req);
+ if (!blk_rq_integrity_dma_map_iter_start(req, dev->dev,
+ &iod->meta_dma_state, &iter))
+ return iter.status;
+
+ switch (iter.p2pdma.map) {
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ iod->flags |= IOD_META_P2P;
+ break;
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ iod->flags |= IOD_META_MMIO;
+ break;
+ case PCI_P2PDMA_MAP_NONE:
+ break;
+ default:
+ return BLK_STS_RESOURCE;
+ }
- if (!is_pci_p2pdma_page(bv.bv_page)) {
- if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
- return nvme_setup_prp_simple(dev, req,
- &cmnd->rw, &bv);
+ if (blk_rq_dma_map_coalesce(&iod->meta_dma_state))
+ entries = 1;
- if (iod->nvmeq->qid && sgl_threshold &&
- nvme_ctrl_sgl_supported(&dev->ctrl))
- return nvme_setup_sgl_simple(dev, req,
- &cmnd->rw, &bv);
- }
+ /*
+ * The NVMe MPTR descriptor has an implicit length that the host and
+ * device must agree on to avoid data/memory corruption. We trust the
+ * kernel allocated correctly based on the format's parameters, so use
+ * the more efficient MPTR to avoid extra dma pool allocations for the
+ * SGL indirection.
+ *
+ * But for user commands, we don't necessarily know what they do, so
+ * the driver can't validate the metadata buffer size. The SGL
+ * descriptor provides an explicit length, so we're relying on that
+ * mechanism to catch any misunderstandings between the application and
+ * device.
+ *
+ * P2P DMA also needs to use the blk_dma_iter method, so mptr setup
+ * leverages this routine when that happens.
+ */
+ if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl) ||
+ (entries == 1 && !(nvme_req(req)->flags & NVME_REQ_USERCMD))) {
+ iod->cmd.common.metadata = cpu_to_le64(iter.addr);
+ iod->meta_total_len = iter.len;
+ iod->meta_dma = iter.addr;
+ iod->meta_descriptor = NULL;
+ return BLK_STS_OK;
}
- iod->dma_len = 0;
- iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
- if (!iod->sg)
+ sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC,
+ &sgl_dma);
+ if (!sg_list)
return BLK_STS_RESOURCE;
- sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
- iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
- if (!iod->nents)
- goto out_free_sg;
-
- if (is_pci_p2pdma_page(sg_page(iod->sg)))
- nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg,
- iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN);
- else
- nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
- rq_dma_dir(req), DMA_ATTR_NO_WARN);
- if (!nr_mapped)
- goto out_free_sg;
-
- iod->use_sgl = nvme_pci_use_sgls(dev, req);
- if (iod->use_sgl)
- ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
- else
- ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
- if (ret != BLK_STS_OK)
- goto out_unmap_sg;
- return BLK_STS_OK;
-out_unmap_sg:
- nvme_unmap_sg(dev, req);
-out_free_sg:
- mempool_free(iod->sg, dev->iod_mempool);
- return ret;
+ iod->meta_descriptor = sg_list;
+ iod->meta_dma = sgl_dma;
+ iod->cmd.common.flags = NVME_CMD_SGL_METASEG;
+ iod->cmd.common.metadata = cpu_to_le64(sgl_dma);
+ if (entries == 1) {
+ iod->meta_total_len = iter.len;
+ nvme_pci_sgl_set_data(sg_list, &iter);
+ return BLK_STS_OK;
+ }
+
+ sgl_dma += sizeof(*sg_list);
+ do {
+ nvme_pci_sgl_set_data(&sg_list[++i], &iter);
+ iod->meta_total_len += iter.len;
+ } while (blk_rq_integrity_dma_map_iter_next(req, dev->dev, &iter));
+
+ nvme_pci_sgl_set_seg(sg_list, sgl_dma, i);
+ if (unlikely(iter.status))
+ nvme_unmap_metadata(req);
+ return iter.status;
}
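/*
 * Illustration of the metadata mapping choice above (a sketch, not a
 * normative description): kernel-generated I/O with a single integrity
 * segment keeps the plain MPTR, and the controller derives the length from
 * the namespace format.  User passthrough commands and multi-segment
 * metadata instead get a small-pool SGL; with more than one segment it is
 * laid out as
 *
 *   cmd.common.metadata --> [ segment desc | data desc 1 | data desc 2 | ... ]
 *
 * where the leading entry describes the list that follows and every data
 * descriptor carries an explicit length the controller can verify.
 */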
-static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
- struct nvme_command *cmnd)
+static blk_status_t nvme_pci_setup_meta_mptr(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct bio_vec bv = rq_integrity_vec(req);
+
+ if (is_pci_p2pdma_page(bv.bv_page))
+ return nvme_pci_setup_meta_iter(req);
- iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
- rq_dma_dir(req), 0);
- if (dma_mapping_error(dev->dev, iod->meta_dma))
+ iod->meta_dma = dma_map_bvec(nvmeq->dev->dev, &bv, rq_dma_dir(req), 0);
+ if (dma_mapping_error(nvmeq->dev->dev, iod->meta_dma))
return BLK_STS_IOERR;
- cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
+ iod->cmd.common.metadata = cpu_to_le64(iod->meta_dma);
+ iod->flags |= IOD_SINGLE_META_SEGMENT;
return BLK_STS_OK;
}
-static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
+static blk_status_t nvme_map_metadata(struct request *req)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+ if ((iod->cmd.common.flags & NVME_CMD_SGL_METABUF) &&
+ nvme_pci_metadata_use_sgls(req))
+ return nvme_pci_setup_meta_iter(req);
+ return nvme_pci_setup_meta_mptr(req);
+}
+
+static blk_status_t nvme_prep_rq(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
blk_status_t ret;
- iod->aborted = 0;
- iod->npages = -1;
- iod->nents = 0;
+ iod->flags = 0;
+ iod->nr_descriptors = 0;
+ iod->total_len = 0;
+ iod->meta_total_len = 0;
ret = nvme_setup_cmd(req->q->queuedata, req);
if (ret)
return ret;
if (blk_rq_nr_phys_segments(req)) {
- ret = nvme_map_data(dev, req, &iod->cmd);
+ ret = nvme_map_data(req);
if (ret)
goto out_free_cmd;
}
if (blk_integrity_rq(req)) {
- ret = nvme_map_metadata(dev, req, &iod->cmd);
+ ret = nvme_map_metadata(req);
if (ret)
goto out_unmap_data;
}
- blk_mq_start_request(req);
+ nvme_start_request(req);
return BLK_STS_OK;
out_unmap_data:
- nvme_unmap_data(dev, req);
+ if (blk_rq_nr_phys_segments(req))
+ nvme_unmap_data(req);
out_free_cmd:
nvme_cleanup_cmd(req);
return ret;
}
-/*
- * NOTE: ns is NULL when called on the admin queue.
- */
static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
@@ -959,7 +1265,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
if (unlikely(!nvme_check_ready(&dev->ctrl, req, true)))
return nvme_fail_nonready_command(&dev->ctrl, req);
- ret = nvme_prep_rq(dev, req);
+ ret = nvme_prep_rq(req);
if (unlikely(ret))
return ret;
spin_lock(&nvmeq->sq_lock);
@@ -969,11 +1275,15 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_STS_OK;
}
-static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
+static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct rq_list *rqlist)
{
+ struct request *req;
+
+ if (rq_list_empty(rqlist))
+ return;
+
spin_lock(&nvmeq->sq_lock);
- while (!rq_list_empty(*rqlist)) {
- struct request *req = rq_list_pop(rqlist);
+ while ((req = rq_list_pop(rqlist))) {
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
nvme_sq_copy_cmd(nvmeq, &iod->cmd);
@@ -993,50 +1303,38 @@ static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
return false;
- req->mq_hctx->tags->rqs[req->tag] = req;
- return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
+ return nvme_prep_rq(req) == BLK_STS_OK;
}
-static void nvme_queue_rqs(struct request **rqlist)
+static void nvme_queue_rqs(struct rq_list *rqlist)
{
- struct request *req, *next, *prev = NULL;
- struct request *requeue_list = NULL;
-
- rq_list_for_each_safe(rqlist, req, next) {
- struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
-
- if (!nvme_prep_rq_batch(nvmeq, req)) {
- /* detach 'req' and add to remainder list */
- rq_list_move(rqlist, &requeue_list, req, prev);
+ struct rq_list submit_list = { };
+ struct rq_list requeue_list = { };
+ struct nvme_queue *nvmeq = NULL;
+ struct request *req;
- req = prev;
- if (!req)
- continue;
- }
+ while ((req = rq_list_pop(rqlist))) {
+ if (nvmeq && nvmeq != req->mq_hctx->driver_data)
+ nvme_submit_cmds(nvmeq, &submit_list);
+ nvmeq = req->mq_hctx->driver_data;
- if (!next || req->mq_hctx != next->mq_hctx) {
- /* detach rest of list, and submit */
- req->rq_next = NULL;
- nvme_submit_cmds(nvmeq, rqlist);
- *rqlist = next;
- prev = NULL;
- } else
- prev = req;
+ if (nvme_prep_rq_batch(nvmeq, req))
+ rq_list_add_tail(&submit_list, req);
+ else
+ rq_list_add_tail(&requeue_list, req);
}
+ if (nvmeq)
+ nvme_submit_cmds(nvmeq, &submit_list);
*rqlist = requeue_list;
}
static __always_inline void nvme_pci_unmap_rq(struct request *req)
{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct nvme_dev *dev = iod->nvmeq->dev;
-
if (blk_integrity_rq(req))
- dma_unmap_page(dev->dev, iod->meta_dma,
- rq_integrity_vec(req)->bv_len, rq_data_dir(req));
+ nvme_unmap_metadata(req);
if (blk_rq_nr_phys_segments(req))
- nvme_unmap_data(dev, req);
+ nvme_unmap_data(req);
}
static void nvme_pci_complete_rq(struct request *req)
@@ -1103,8 +1401,9 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
if (!nvme_try_complete_req(req, cqe->status, cqe->result) &&
- !blk_mq_add_to_batch(req, iob, nvme_req(req)->status,
- nvme_pci_complete_batch))
+ !blk_mq_add_to_batch(req, iob,
+ nvme_req(req)->status != NVME_SC_SUCCESS,
+ nvme_pci_complete_batch))
nvme_pci_complete_rq(req);
}
@@ -1120,13 +1419,13 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
}
}
-static inline int nvme_poll_cq(struct nvme_queue *nvmeq,
- struct io_comp_batch *iob)
+static inline bool nvme_poll_cq(struct nvme_queue *nvmeq,
+ struct io_comp_batch *iob)
{
- int found = 0;
+ bool found = false;
while (nvme_cqe_pending(nvmeq)) {
- found++;
+ found = true;
/*
* load-load control dependency between phase and the rest of
* the cqe requires a full read memory barrier
@@ -1147,7 +1446,7 @@ static irqreturn_t nvme_irq(int irq, void *data)
DEFINE_IO_COMP_BATCH(iob);
if (nvme_poll_cq(nvmeq, &iob)) {
- if (!rq_list_empty(iob.req_list))
+ if (!rq_list_empty(&iob.req_list))
nvme_pci_complete_batch(&iob);
return IRQ_HANDLED;
}
@@ -1174,7 +1473,9 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags));
disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
+ spin_lock(&nvmeq->cq_poll_lock);
nvme_poll_cq(nvmeq, NULL);
+ spin_unlock(&nvmeq->cq_poll_lock);
enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
}
@@ -1208,6 +1509,41 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
spin_unlock(&nvmeq->sq_lock);
}
+static int nvme_pci_subsystem_reset(struct nvme_ctrl *ctrl)
+{
+ struct nvme_dev *dev = to_nvme_dev(ctrl);
+ int ret = 0;
+
+ /*
+ * Taking the shutdown_lock ensures the BAR mapping is not being
+ * altered by reset_work. Holding this lock before the RESETTING state
+ * change, if successful, also ensures nvme_remove won't be able to
+ * proceed to iounmap until we're done.
+ */
+ mutex_lock(&dev->shutdown_lock);
+ if (!dev->bar_mapped_size) {
+ ret = -ENODEV;
+ goto unlock;
+ }
+
+ if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
+ ret = -EBUSY;
+ goto unlock;
+ }
+
+ writel(NVME_SUBSYS_RESET, dev->bar + NVME_REG_NSSR);
+ nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE);
+
+ /*
+ * Read controller status to flush the previous write and trigger a
+ * pcie read error.
+ */
+ readl(dev->bar + NVME_REG_CSTS);
+unlock:
+ mutex_unlock(&dev->shutdown_lock);
+ return ret;
+}
+
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
struct nvme_command c = { };
@@ -1280,15 +1616,15 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}
-static void abort_endio(struct request *req, blk_status_t error)
+static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error)
{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct nvme_queue *nvmeq = iod->nvmeq;
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
dev_warn(nvmeq->dev->ctrl.device,
"Abort status: 0x%x", nvme_req(req)->status);
atomic_inc(&nvmeq->dev->ctrl.abort_limit);
blk_mq_free_request(req);
+ return RQ_END_IO_NONE;
}
static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
@@ -1299,7 +1635,7 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
/* If there is a reset/reinit ongoing, we shouldn't reset again. */
- switch (dev->ctrl.state) {
+ switch (nvme_ctrl_state(&dev->ctrl)) {
case NVME_CTRL_RESETTING:
case NVME_CTRL_CONNECTING:
return false;
@@ -1332,22 +1668,45 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
dev_warn(dev->ctrl.device,
"controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
csts, result);
+
+ if (csts != ~0)
+ return;
+
+ dev_warn(dev->ctrl.device,
+ "Does your device have a faulty power saving mode enabled?\n");
+ dev_warn(dev->ctrl.device,
+ "Try \"nvme_core.default_ps_max_latency_us=0 pcie_aspm=off pcie_port_pm=off\" and report a bug\n");
}
-static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
+static enum blk_eh_timer_return nvme_timeout(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct nvme_queue *nvmeq = iod->nvmeq;
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct nvme_dev *dev = nvmeq->dev;
struct request *abort_req;
struct nvme_command cmd = { };
+ struct pci_dev *pdev = to_pci_dev(dev->dev);
u32 csts = readl(dev->bar + NVME_REG_CSTS);
+ u8 opcode;
+
+ /*
+ * Shut down the device immediately if we see it is disconnected. This
+ * unblocks PCIe error handling if the nvme driver is waiting in
+ * error_resume for a device that has been removed. We can't unbind the
+ * driver while the driver's error callback is waiting to complete, so
+ * we're relying on a timeout to break that deadlock if a removal
+ * occurs while reset work is running.
+ */
+ if (pci_dev_is_disconnected(pdev))
+ nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
+ if (nvme_state_terminal(&dev->ctrl))
+ goto disable;
/* If PCI error recovery process is happening, we cannot reset or
* the recovery mechanism will surely fail.
*/
mb();
- if (pci_channel_offline(to_pci_dev(dev->dev)))
+ if (pci_channel_offline(pdev))
return BLK_EH_RESET_TIMER;
/*
@@ -1355,9 +1714,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
*/
if (nvme_should_reset(dev, csts)) {
nvme_warn_reset(dev, csts);
- nvme_dev_disable(dev, false);
- nvme_reset_ctrl(&dev->ctrl);
- return BLK_EH_DONE;
+ goto disable;
}
/*
@@ -1368,10 +1725,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
else
nvme_poll_irqdisable(nvmeq);
- if (blk_mq_request_completed(req)) {
+ if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT) {
dev_warn(dev->ctrl.device,
- "I/O %d QID %d timeout, completion polled\n",
- req->tag, nvmeq->qid);
+ "I/O tag %d (%04x) QID %d timeout, completion polled\n",
+ req->tag, nvme_cid(req), nvmeq->qid);
return BLK_EH_DONE;
}
@@ -1381,14 +1738,14 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
* cancellation error. All outstanding requests are completed on
* shutdown, so we return BLK_EH_DONE.
*/
- switch (dev->ctrl.state) {
+ switch (nvme_ctrl_state(&dev->ctrl)) {
case NVME_CTRL_CONNECTING:
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
fallthrough;
case NVME_CTRL_DELETING:
dev_warn_ratelimited(dev->ctrl.device,
- "I/O %d QID %d timeout, disable controller\n",
- req->tag, nvmeq->qid);
+ "I/O tag %d (%04x) QID %d timeout, disable controller\n",
+ req->tag, nvme_cid(req), nvmeq->qid);
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
nvme_dev_disable(dev, true);
return BLK_EH_DONE;
@@ -1403,40 +1760,43 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
* command was already aborted once before and still hasn't been
* returned to the driver, or if this is the admin queue.
*/
- if (!nvmeq->qid || iod->aborted) {
+ opcode = nvme_req(req)->cmd->common.opcode;
+ if (!nvmeq->qid || (iod->flags & IOD_ABORTED)) {
dev_warn(dev->ctrl.device,
- "I/O %d QID %d timeout, reset controller\n",
- req->tag, nvmeq->qid);
+ "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
+ req->tag, nvme_cid(req), opcode,
+ nvme_opcode_str(nvmeq->qid, opcode), nvmeq->qid);
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
- nvme_dev_disable(dev, false);
- nvme_reset_ctrl(&dev->ctrl);
-
- return BLK_EH_DONE;
+ goto disable;
}
if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
atomic_inc(&dev->ctrl.abort_limit);
return BLK_EH_RESET_TIMER;
}
- iod->aborted = 1;
+ iod->flags |= IOD_ABORTED;
cmd.abort.opcode = nvme_admin_abort_cmd;
cmd.abort.cid = nvme_cid(req);
cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
dev_warn(nvmeq->dev->ctrl.device,
- "I/O %d QID %d timeout, aborting\n",
- req->tag, nvmeq->qid);
+ "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, aborting req_op:%s(%u) size:%u\n",
+ req->tag, nvme_cid(req), opcode, nvme_get_opcode_str(opcode),
+ nvmeq->qid, blk_op_str(req_op(req)), req_op(req),
+ blk_rq_bytes(req));
- abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
- BLK_MQ_REQ_NOWAIT);
+ abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd),
+ BLK_MQ_REQ_NOWAIT);
if (IS_ERR(abort_req)) {
atomic_inc(&dev->ctrl.abort_limit);
return BLK_EH_RESET_TIMER;
}
+ nvme_init_request(abort_req, &cmd);
+ abort_req->end_io = abort_endio;
abort_req->end_io_data = NULL;
- blk_execute_rq_nowait(abort_req, false, abort_endio);
+ blk_execute_rq_nowait(abort_req, false);
/*
* The aborted req will be completed on receiving the abort req.
@@ -1444,6 +1804,18 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
* as the device then is in a faulty state.
*/
return BLK_EH_RESET_TIMER;
+
+disable:
+ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
+ if (nvme_state_terminal(&dev->ctrl))
+ nvme_dev_disable(dev, true);
+ return BLK_EH_DONE;
+ }
+
+ nvme_dev_disable(dev, false);
+ if (nvme_try_sched_reset(&dev->ctrl))
+ nvme_unquiesce_io_queues(&dev->ctrl);
+ return BLK_EH_DONE;
}
static void nvme_free_queue(struct nvme_queue *nvmeq)
@@ -1472,24 +1844,21 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
}
}
-/**
- * nvme_suspend_queue - put queue into suspended state
- * @nvmeq: queue to suspend
- */
-static int nvme_suspend_queue(struct nvme_queue *nvmeq)
+static void nvme_suspend_queue(struct nvme_dev *dev, unsigned int qid)
{
+ struct nvme_queue *nvmeq = &dev->queues[qid];
+
if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
- return 1;
+ return;
/* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
mb();
nvmeq->dev->online_queues--;
if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
- nvme_stop_admin_queue(&nvmeq->dev->ctrl);
+ nvme_quiesce_admin_queue(&nvmeq->dev->ctrl);
if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
- pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
- return 0;
+ pci_free_irq(to_pci_dev(dev->dev), nvmeq->cq_vector, nvmeq);
}
static void nvme_suspend_io_queues(struct nvme_dev *dev)
@@ -1497,19 +1866,7 @@ static void nvme_suspend_io_queues(struct nvme_dev *dev)
int i;
for (i = dev->ctrl.queue_count - 1; i > 0; i--)
- nvme_suspend_queue(&dev->queues[i]);
-}
-
-static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
-{
- struct nvme_queue *nvmeq = &dev->queues[0];
-
- if (shutdown)
- nvme_shutdown_ctrl(&dev->ctrl);
- else
- nvme_disable_ctrl(&dev->ctrl);
-
- nvme_poll_irqdisable(nvmeq);
+ nvme_suspend_queue(dev, i);
}
/*
@@ -1658,7 +2015,7 @@ static int nvme_setup_io_queues_trylock(struct nvme_dev *dev)
/*
* Controller is in wrong state, fail early.
*/
- if (dev->ctrl.state != NVME_CTRL_CONNECTING) {
+ if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_CONNECTING) {
mutex_unlock(&dev->shutdown_lock);
return -ENODEV;
}
@@ -1722,7 +2079,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
.queue_rq = nvme_queue_rq,
.complete = nvme_pci_complete_rq,
.init_hctx = nvme_admin_init_hctx,
- .init_request = nvme_init_request,
+ .init_request = nvme_pci_init_request,
.timeout = nvme_timeout,
};
@@ -1732,7 +2089,7 @@ static const struct blk_mq_ops nvme_mq_ops = {
.complete = nvme_pci_complete_rq,
.commit_rqs = nvme_commit_rqs,
.init_hctx = nvme_init_hctx,
- .init_request = nvme_init_request,
+ .init_request = nvme_pci_init_request,
.map_queues = nvme_pci_map_queues,
.timeout = nvme_timeout,
.poll = nvme_poll,
@@ -1746,45 +2103,11 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev)
* user requests may be waiting on a stopped queue. Start the
* queue to flush these to completion.
*/
- nvme_start_admin_queue(&dev->ctrl);
- blk_cleanup_queue(dev->ctrl.admin_q);
- blk_mq_free_tag_set(&dev->admin_tagset);
+ nvme_unquiesce_admin_queue(&dev->ctrl);
+ nvme_remove_admin_tag_set(&dev->ctrl);
}
}
-static int nvme_alloc_admin_tags(struct nvme_dev *dev)
-{
- if (!dev->ctrl.admin_q) {
- dev->admin_tagset.ops = &nvme_mq_admin_ops;
- dev->admin_tagset.nr_hw_queues = 1;
-
- dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
- dev->admin_tagset.timeout = NVME_ADMIN_TIMEOUT;
- dev->admin_tagset.numa_node = dev->ctrl.numa_node;
- dev->admin_tagset.cmd_size = sizeof(struct nvme_iod);
- dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
- dev->admin_tagset.driver_data = dev;
-
- if (blk_mq_alloc_tag_set(&dev->admin_tagset))
- return -ENOMEM;
- dev->ctrl.admin_tagset = &dev->admin_tagset;
-
- dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
- if (IS_ERR(dev->ctrl.admin_q)) {
- blk_mq_free_tag_set(&dev->admin_tagset);
- return -ENOMEM;
- }
- if (!blk_get_queue(dev->ctrl.admin_q)) {
- nvme_dev_remove_admin(dev);
- dev->ctrl.admin_q = NULL;
- return -ENODEV;
- }
- } else
- nvme_start_admin_queue(&dev->ctrl);
-
- return 0;
-}
-
static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride);
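As a worked example with assumed values (not taken from this patch): the doorbell registers start at NVME_REG_DBS (offset 0x1000), and every queue needs a 4-byte submission and a 4-byte completion doorbell spaced by the stride, so with CAP.DSTRD = 0 (db_stride = 1) and 16 I/O queues alongside the admin queue the region size works out to

    NVME_REG_DBS + (16 + 1) * 8 * 1 = 0x1000 + 0x88 = 0x1088 bytes

which is the minimum amount of BAR that has to be mapped before those queues can be brought up.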
@@ -1828,9 +2151,36 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
(readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
- result = nvme_disable_ctrl(&dev->ctrl);
- if (result < 0)
- return result;
+ /*
+ * If the device has been passed off to us in an enabled state, just
+ * clear the enabled bit. The spec says we should set the 'shutdown
+ * notification bits', but doing so may cause the device to complete
+ * commands to the admin queue ... and we don't know what memory that
+ * might be pointing at!
+ */
+ result = nvme_disable_ctrl(&dev->ctrl, false);
+ if (result < 0) {
+ struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+ /*
+ * The NVMe Controller Reset method did not get an expected
+ * CSTS.RDY transition, so something with the device appears to
+ * be stuck. Use the lower level and bigger hammer PCIe
+ * Function Level Reset to attempt restoring the device to its
+ * initial state, and try again.
+ */
+ result = pcie_reset_flr(pdev, false);
+ if (result < 0)
+ return result;
+
+ pci_restore_state(pdev);
+ result = nvme_disable_ctrl(&dev->ctrl, false);
+ if (result < 0)
+ return result;
+
+ dev_info(dev->ctrl.device,
+ "controller reset completed after pcie flr\n");
+ }
result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
if (result)
@@ -1938,6 +2288,18 @@ static void nvme_map_cmb(struct nvme_dev *dev)
return;
/*
+ * Controllers may support a CMB size larger than their BAR, for
+ * example, due to being behind a bridge. Reduce the CMB to the
+ * reported size of the BAR.
+ */
+ size = min(size, bar_size - offset);
+
+ if (!IS_ALIGNED(size, memremap_compat_align()) ||
+ !IS_ALIGNED(pci_resource_start(pdev, bar),
+ memremap_compat_align()))
+ return;
+
+ /*
* Tell the controller about the host side address mapping the CMB,
* and enable CMB decoding for the NVMe 1.4+ scheme:
*/
@@ -1947,17 +2309,10 @@ static void nvme_map_cmb(struct nvme_dev *dev)
dev->bar + NVME_REG_CMBMSC);
}
- /*
- * Controllers may support a CMB size larger than their BAR,
- * for example, due to being behind a bridge. Reduce the CMB to
- * the reported size of the BAR
- */
- if (size > bar_size - offset)
- size = bar_size - offset;
-
if (pci_p2pdma_add_resource(pdev, bar, size, offset)) {
dev_warn(dev->ctrl.device,
"failed to register the CMB\n");
+ hi_lo_writeq(0, dev->bar + NVME_REG_CMBMSC);
return;
}
@@ -1995,7 +2350,7 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
return ret;
}
-static void nvme_free_host_mem(struct nvme_dev *dev)
+static void nvme_free_host_mem_multi(struct nvme_dev *dev)
{
int i;
@@ -2010,18 +2365,54 @@ static void nvme_free_host_mem(struct nvme_dev *dev)
kfree(dev->host_mem_desc_bufs);
dev->host_mem_desc_bufs = NULL;
- dma_free_coherent(dev->dev,
- dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
+}
+
+static void nvme_free_host_mem(struct nvme_dev *dev)
+{
+ if (dev->hmb_sgt)
+ dma_free_noncontiguous(dev->dev, dev->host_mem_size,
+ dev->hmb_sgt, DMA_BIDIRECTIONAL);
+ else
+ nvme_free_host_mem_multi(dev);
+
+ dma_free_coherent(dev->dev, dev->host_mem_descs_size,
dev->host_mem_descs, dev->host_mem_descs_dma);
dev->host_mem_descs = NULL;
+ dev->host_mem_descs_size = 0;
dev->nr_host_mem_descs = 0;
}
-static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
+static int nvme_alloc_host_mem_single(struct nvme_dev *dev, u64 size)
+{
+ dev->hmb_sgt = dma_alloc_noncontiguous(dev->dev, size,
+ DMA_BIDIRECTIONAL, GFP_KERNEL, 0);
+ if (!dev->hmb_sgt)
+ return -ENOMEM;
+
+ dev->host_mem_descs = dma_alloc_coherent(dev->dev,
+ sizeof(*dev->host_mem_descs), &dev->host_mem_descs_dma,
+ GFP_KERNEL);
+ if (!dev->host_mem_descs) {
+ dma_free_noncontiguous(dev->dev, size, dev->hmb_sgt,
+ DMA_BIDIRECTIONAL);
+ dev->hmb_sgt = NULL;
+ return -ENOMEM;
+ }
+ dev->host_mem_size = size;
+ dev->host_mem_descs_size = sizeof(*dev->host_mem_descs);
+ dev->nr_host_mem_descs = 1;
+
+ dev->host_mem_descs[0].addr =
+ cpu_to_le64(dev->hmb_sgt->sgl->dma_address);
+ dev->host_mem_descs[0].size = cpu_to_le32(size / NVME_CTRL_PAGE_SIZE);
+ return 0;
+}
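To make the single-descriptor encoding concrete (illustrative numbers only, assuming the usual 4 KiB NVME_CTRL_PAGE_SIZE): a 32 MiB host memory buffer allocated this way would be advertised to the controller as

    host_mem_descs[0].addr = hmb_sgt->sgl->dma_address
    host_mem_descs[0].size = SZ_32M / SZ_4K = 8192 controller pages

and relies on the merge-boundary check in nvme_alloc_host_mem() below having ensured that the non-contiguous allocation maps to a single DMA segment.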
+
+static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,
u32 chunk_size)
{
struct nvme_host_mem_buf_desc *descs;
- u32 max_entries, len;
+ u32 max_entries, len, descs_size;
dma_addr_t descs_dma;
int i = 0;
void **bufs;
@@ -2034,8 +2425,9 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
max_entries = dev->ctrl.hmmaxd;
- descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs),
- &descs_dma, GFP_KERNEL);
+ descs_size = max_entries * sizeof(*descs);
+ descs = dma_alloc_coherent(dev->dev, descs_size, &descs_dma,
+ GFP_KERNEL);
if (!descs)
goto out;
@@ -2064,22 +2456,14 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
dev->host_mem_size = size;
dev->host_mem_descs = descs;
dev->host_mem_descs_dma = descs_dma;
+ dev->host_mem_descs_size = descs_size;
dev->host_mem_desc_bufs = bufs;
return 0;
out_free_bufs:
- while (--i >= 0) {
- size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE;
-
- dma_free_attrs(dev->dev, size, bufs[i],
- le64_to_cpu(descs[i].addr),
- DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
- }
-
kfree(bufs);
out_free_descs:
- dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
- descs_dma);
+ dma_free_coherent(dev->dev, descs_size, descs, descs_dma);
out:
dev->host_mem_descs = NULL;
return -ENOMEM;
@@ -2087,13 +2471,23 @@ out:
static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
{
+ unsigned long dma_merge_boundary = dma_get_merge_boundary(dev->dev);
u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
u64 chunk_size;
+ /*
+ * If there is an IOMMU that can merge pages, try a virtually
+ * non-contiguous allocation for a single segment first.
+ */
+ if (dma_merge_boundary && (PAGE_SIZE & dma_merge_boundary) == 0) {
+ if (!nvme_alloc_host_mem_single(dev, preferred))
+ return 0;
+ }
+
/* start big and work our way down */
for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) {
- if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
+ if (!nvme_alloc_host_mem_multi(dev, preferred, chunk_size)) {
if (!min || dev->host_mem_size >= min)
return 0;
nvme_free_host_mem(dev);
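For illustration (values assumed, not from this patch): with min_chunk capped at a few MiB by PAGE_SIZE * MAX_ORDER_NR_PAGES and hmminds at, say, 32 KiB, each pass of the loop tries to assemble the preferred buffer size out of chunk_size-sized DMA allocations; whenever a pass cannot reach the controller's advertised minimum, the partial buffer is freed and the chunk size halved on the theory that smaller allocations are easier to satisfy, and -ENOMEM is returned only once the chunk size would fall below hmminds.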
@@ -2111,6 +2505,9 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
u32 enable_bits = NVME_HOST_MEM_ENABLE;
int ret;
+ if (!dev->ctrl.hmpre)
+ return 0;
+
preferred = min(preferred, max);
if (min > max) {
dev_warn(dev->ctrl.device,
@@ -2138,8 +2535,10 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
}
dev_info(dev->ctrl.device,
- "allocated %lld MiB host memory buffer.\n",
- dev->host_mem_size >> ilog2(SZ_1M));
+ "allocated %lld MiB host memory buffer (%u segment%s).\n",
+ dev->host_mem_size >> ilog2(SZ_1M),
+ dev->nr_host_mem_descs,
+ str_plural(dev->nr_host_mem_descs));
}
ret = nvme_set_host_mem(dev, enable_bits);
@@ -2153,7 +2552,7 @@ static ssize_t cmb_show(struct device *dev, struct device_attribute *attr,
{
struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
- return sysfs_emit(buf, "cmbloc : x%08x\ncmbsz : x%08x\n",
+ return sysfs_emit(buf, "cmbloc : 0x%08x\ncmbsz : 0x%08x\n",
ndev->cmbloc, ndev->cmbsz);
}
static DEVICE_ATTR_RO(cmb);
@@ -2191,7 +2590,7 @@ static ssize_t hmb_store(struct device *dev, struct device_attribute *attr,
bool new;
int ret;
- if (strtobool(buf, &new) < 0)
+ if (kstrtobool(buf, &new) < 0)
return -EINVAL;
if (new == ndev->hmb)
@@ -2239,11 +2638,22 @@ static struct attribute *nvme_pci_attrs[] = {
NULL,
};
-static const struct attribute_group nvme_pci_attr_group = {
+static const struct attribute_group nvme_pci_dev_attrs_group = {
.attrs = nvme_pci_attrs,
.is_visible = nvme_pci_attrs_are_visible,
};
+static const struct attribute_group *nvme_pci_dev_attr_groups[] = {
+ &nvme_dev_attrs_group,
+ &nvme_pci_dev_attrs_group,
+ NULL,
+};
+
+static void nvme_update_attrs(struct nvme_dev *dev)
+{
+ sysfs_update_group(&dev->ctrl.device->kobj, &nvme_pci_dev_attrs_group);
+}
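sysfs_update_group() re-runs the attribute group's is_visible() callback (nvme_pci_attrs_are_visible here), so entries such as the cmb and hmb attributes that depend on features only discovered during bring-up can appear once the CMB mapping and host memory buffer setup have run, which is presumably why both the probe and reset paths call nvme_update_attrs() right after nvme_setup_host_mem().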
+
/*
* nirqs is the number of interrupts available for write and read
* queues. The core already reserved an interrupt for the admin queue.
@@ -2291,6 +2701,7 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
.priv = dev,
};
unsigned int irq_queues, poll_queues;
+ unsigned int flags = PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY;
/*
* Poll queues don't need interrupts, but we need at least one I/O queue
@@ -2314,14 +2725,10 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
irq_queues = 1;
if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR))
irq_queues += (nr_io_queues - poll_queues);
- return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
- PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
-}
-
-static void nvme_disable_io_queues(struct nvme_dev *dev)
-{
- if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq))
- __nvme_disable_io_queues(dev, nvme_admin_delete_cq);
+ if (dev->ctrl.quirks & NVME_QUIRK_BROKEN_MSI)
+ flags &= ~PCI_IRQ_MSI;
+ return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, flags,
+ &affd);
}
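Note the effect of the quirk handling just above: starting from PCI_IRQ_ALL_TYPES (INTx, MSI and MSI-X) and clearing PCI_IRQ_MSI means a controller flagged with NVME_QUIRK_BROKEN_MSI, such as the Sandisk SN530/SN550 entries added to the ID table further down, still gets MSI-X or legacy line interrupts; only plain MSI is avoided.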
static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
@@ -2332,7 +2739,8 @@ static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
*/
if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
return 1;
- return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues;
+ return blk_mq_num_possible_queues(0) + dev->nr_write_queues +
+ dev->nr_poll_queues;
}
static int nvme_setup_io_queues(struct nvme_dev *dev)
@@ -2374,10 +2782,12 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
if (dev->cmb_use_sqes) {
result = nvme_cmb_qdepth(dev, nr_io_queues,
sizeof(struct nvme_command));
- if (result > 0)
+ if (result > 0) {
dev->q_depth = result;
- else
+ dev->ctrl.sqsize = result - 1;
+ } else {
dev->cmb_use_sqes = false;
+ }
}
do {
@@ -2431,7 +2841,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
if (dev->online_queues - 1 < dev->max_qid) {
nr_io_queues = dev->online_queues - 1;
- nvme_disable_io_queues(dev);
+ nvme_delete_io_queues(dev);
result = nvme_setup_io_queues_trylock(dev);
if (result)
return result;
@@ -2448,22 +2858,25 @@ out_unlock:
return result;
}
-static void nvme_del_queue_end(struct request *req, blk_status_t error)
+static enum rq_end_io_ret nvme_del_queue_end(struct request *req,
+ blk_status_t error)
{
struct nvme_queue *nvmeq = req->end_io_data;
blk_mq_free_request(req);
complete(&nvmeq->delete_done);
+ return RQ_END_IO_NONE;
}
-static void nvme_del_cq_end(struct request *req, blk_status_t error)
+static enum rq_end_io_ret nvme_del_cq_end(struct request *req,
+ blk_status_t error)
{
struct nvme_queue *nvmeq = req->end_io_data;
if (error)
set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
- nvme_del_queue_end(req, error);
+ return nvme_del_queue_end(req, error);
}
static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
@@ -2475,19 +2888,23 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
cmd.delete_queue.opcode = opcode;
cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);
- req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT);
+ req = blk_mq_alloc_request(q, nvme_req_op(&cmd), BLK_MQ_REQ_NOWAIT);
if (IS_ERR(req))
return PTR_ERR(req);
+ nvme_init_request(req, &cmd);
+ if (opcode == nvme_admin_delete_cq)
+ req->end_io = nvme_del_cq_end;
+ else
+ req->end_io = nvme_del_queue_end;
req->end_io_data = nvmeq;
init_completion(&nvmeq->delete_done);
- blk_execute_rq_nowait(req, false, opcode == nvme_admin_delete_cq ?
- nvme_del_cq_end : nvme_del_queue_end);
+ blk_execute_rq_nowait(req, false);
return 0;
}
-static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
+static bool __nvme_delete_io_queues(struct nvme_dev *dev, u8 opcode)
{
int nr_queues = dev->online_queues - 1, sent = 0;
unsigned long timeout;
@@ -2515,66 +2932,59 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
return true;
}
-static void nvme_dev_add(struct nvme_dev *dev)
+static void nvme_delete_io_queues(struct nvme_dev *dev)
{
- int ret;
+ if (__nvme_delete_io_queues(dev, nvme_admin_delete_sq))
+ __nvme_delete_io_queues(dev, nvme_admin_delete_cq);
+}
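The ordering follows the specification's rule that a submission queue must be deleted before the completion queue it posts to: all Delete I/O SQ commands are issued and waited for first, and the Delete I/O CQ pass only runs if every SQ deletion completed.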
- if (!dev->ctrl.tagset) {
- dev->tagset.ops = &nvme_mq_ops;
- dev->tagset.nr_hw_queues = dev->online_queues - 1;
- dev->tagset.nr_maps = 2; /* default + read */
- if (dev->io_queues[HCTX_TYPE_POLL])
- dev->tagset.nr_maps++;
- dev->tagset.timeout = NVME_IO_TIMEOUT;
- dev->tagset.numa_node = dev->ctrl.numa_node;
- dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth,
- BLK_MQ_MAX_DEPTH) - 1;
- dev->tagset.cmd_size = sizeof(struct nvme_iod);
- dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
- dev->tagset.driver_data = dev;
+static unsigned int nvme_pci_nr_maps(struct nvme_dev *dev)
+{
+ if (dev->io_queues[HCTX_TYPE_POLL])
+ return 3;
+ if (dev->io_queues[HCTX_TYPE_READ])
+ return 2;
+ return 1;
+}
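The value returned here is the nr_maps argument passed to nvme_alloc_io_tag_set(): one blk-mq queue map for HCTX_TYPE_DEFAULT, a second for HCTX_TYPE_READ when dedicated read queues are configured, and a third for HCTX_TYPE_POLL when poll queues exist. As an assumed example, loading the driver with nvme.poll_queues=2 leaves io_queues[HCTX_TYPE_POLL] non-zero, so three maps are created and the polled queues get their own hardware-context map.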
- /*
- * Some Apple controllers requires tags to be unique
- * across admin and IO queue, so reserve the first 32
- * tags of the IO queue.
- */
- if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
- dev->tagset.reserved_tags = NVME_AQ_DEPTH;
+static bool nvme_pci_update_nr_queues(struct nvme_dev *dev)
+{
+ if (!dev->ctrl.tagset) {
+ nvme_alloc_io_tag_set(&dev->ctrl, &dev->tagset, &nvme_mq_ops,
+ nvme_pci_nr_maps(dev), sizeof(struct nvme_iod));
+ return true;
+ }
- ret = blk_mq_alloc_tag_set(&dev->tagset);
- if (ret) {
- dev_warn(dev->ctrl.device,
- "IO queues tagset allocation failed %d\n", ret);
- return;
- }
- dev->ctrl.tagset = &dev->tagset;
- } else {
- blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
+ /* Give up if we are racing with nvme_dev_disable() */
+ if (!mutex_trylock(&dev->shutdown_lock))
+ return false;
- /* Free previously allocated queues that are no longer usable */
- nvme_free_queues(dev, dev->online_queues);
+ /* Check if nvme_dev_disable() has been executed already */
+ if (!dev->online_queues) {
+ mutex_unlock(&dev->shutdown_lock);
+ return false;
}
- nvme_dbbuf_set(dev);
+ blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
+ /* free previously allocated queues that are no longer usable */
+ nvme_free_queues(dev, dev->online_queues);
+ mutex_unlock(&dev->shutdown_lock);
+ return true;
}
static int nvme_pci_enable(struct nvme_dev *dev)
{
int result = -ENOMEM;
struct pci_dev *pdev = to_pci_dev(dev->dev);
- int dma_address_bits = 64;
+ unsigned int flags = PCI_IRQ_ALL_TYPES;
if (pci_enable_device_mem(pdev))
return result;
pci_set_master(pdev);
- if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48)
- dma_address_bits = 48;
- if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(dma_address_bits)))
- goto disable;
-
if (readl(dev->bar + NVME_REG_CSTS) == -1) {
+ dev_dbg(dev->ctrl.device, "reading CSTS register failed\n");
result = -ENODEV;
goto disable;
}
@@ -2584,15 +2994,16 @@ static int nvme_pci_enable(struct nvme_dev *dev)
* interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
* adjust this later.
*/
- result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
+ if (dev->ctrl.quirks & NVME_QUIRK_BROKEN_MSI)
+ flags &= ~PCI_IRQ_MSI;
+ result = pci_alloc_irq_vectors(pdev, 1, 1, flags);
if (result < 0)
- return result;
+ goto disable;
dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1,
io_queue_depth);
- dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
dev->dbs = dev->bar + 4096;
@@ -2606,15 +3017,8 @@ static int nvme_pci_enable(struct nvme_dev *dev)
else
dev->io_sqes = NVME_NVM_IOSQES;
- /*
- * Temporary fix for the Apple controller found in the MacBook8,1 and
- * some MacBook7,1 to avoid controller resets and data loss.
- */
- if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
+ if (dev->ctrl.quirks & NVME_QUIRK_QDEPTH_ONE) {
dev->q_depth = 2;
- dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
- "set queue depth=%u to work around controller resets\n",
- dev->q_depth);
} else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
(pdev->device == 0xa821 || pdev->device == 0xa822) &&
NVME_CAP_MQES(dev->ctrl.cap) == 0) {
@@ -2633,14 +3037,19 @@ static int nvme_pci_enable(struct nvme_dev *dev)
dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
dev->q_depth);
}
-
+ dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
nvme_map_cmb(dev);
- pci_enable_pcie_error_reporting(pdev);
pci_save_state(pdev);
- return 0;
+ result = nvme_pci_configure_admin_queue(dev);
+ if (result)
+ goto free_irq;
+ return result;
+
+ free_irq:
+ pci_free_irq_vectors(pdev);
disable:
pci_disable_device(pdev);
return result;
@@ -2653,58 +3062,55 @@ static void nvme_dev_unmap(struct nvme_dev *dev)
pci_release_mem_regions(to_pci_dev(dev->dev));
}
-static void nvme_pci_disable(struct nvme_dev *dev)
+static bool nvme_pci_ctrl_is_dead(struct nvme_dev *dev)
{
struct pci_dev *pdev = to_pci_dev(dev->dev);
+ u32 csts;
- pci_free_irq_vectors(pdev);
+ if (!pci_is_enabled(pdev) || !pci_device_is_present(pdev))
+ return true;
+ if (pdev->error_state != pci_channel_io_normal)
+ return true;
- if (pci_is_enabled(pdev)) {
- pci_disable_pcie_error_reporting(pdev);
- pci_disable_device(pdev);
- }
+ csts = readl(dev->bar + NVME_REG_CSTS);
+ return (csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY);
}
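These checks mirror the situations in which queued commands can never complete: the PCI function has vanished or sits in an error channel state, the controller reports a fatal status (CSTS.CFS), or it has already dropped CSTS.RDY. nvme_dev_disable() below uses the result to choose between an orderly teardown (delete the I/O queues, then disable or shut down the controller) and skipping straight to reaping and cancelling outstanding requests.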
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
- bool dead = true, freeze = false;
+ enum nvme_ctrl_state state = nvme_ctrl_state(&dev->ctrl);
struct pci_dev *pdev = to_pci_dev(dev->dev);
+ bool dead;
mutex_lock(&dev->shutdown_lock);
- if (pci_is_enabled(pdev)) {
- u32 csts = readl(dev->bar + NVME_REG_CSTS);
-
- if (dev->ctrl.state == NVME_CTRL_LIVE ||
- dev->ctrl.state == NVME_CTRL_RESETTING) {
- freeze = true;
+ dead = nvme_pci_ctrl_is_dead(dev);
+ if (state == NVME_CTRL_LIVE || state == NVME_CTRL_RESETTING) {
+ if (pci_is_enabled(pdev))
nvme_start_freeze(&dev->ctrl);
- }
- dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
- pdev->error_state != pci_channel_io_normal);
+ /*
+ * Give the controller a chance to complete all entered requests
+ * if doing a safe shutdown.
+ */
+ if (!dead && shutdown)
+ nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
}
- /*
- * Give the controller a chance to complete all entered requests if
- * doing a safe shutdown.
- */
- if (!dead && shutdown && freeze)
- nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
-
- nvme_stop_queues(&dev->ctrl);
+ nvme_quiesce_io_queues(&dev->ctrl);
if (!dead && dev->ctrl.queue_count > 0) {
- nvme_disable_io_queues(dev);
- nvme_disable_admin_queue(dev, shutdown);
+ nvme_delete_io_queues(dev);
+ nvme_disable_ctrl(&dev->ctrl, shutdown);
+ nvme_poll_irqdisable(&dev->queues[0]);
}
nvme_suspend_io_queues(dev);
- nvme_suspend_queue(&dev->queues[0]);
- nvme_pci_disable(dev);
+ nvme_suspend_queue(dev, 0);
+ pci_free_irq_vectors(pdev);
+ if (pci_is_enabled(pdev))
+ pci_disable_device(pdev);
nvme_reap_pending_cqes(dev);
- blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
- blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
- blk_mq_tagset_wait_completed_request(&dev->tagset);
- blk_mq_tagset_wait_completed_request(&dev->admin_tagset);
+ nvme_cancel_tagset(&dev->ctrl);
+ nvme_cancel_admin_tagset(&dev->ctrl);
/*
* The driver will not be starting up queues again if shutting down so
@@ -2712,9 +3118,9 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
* deadlocking blk-mq hot-cpu notifier.
*/
if (shutdown) {
- nvme_start_queues(&dev->ctrl);
+ nvme_unquiesce_io_queues(&dev->ctrl);
if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
- nvme_start_admin_queue(&dev->ctrl);
+ nvme_unquiesce_admin_queue(&dev->ctrl);
}
mutex_unlock(&dev->shutdown_lock);
}
@@ -2727,66 +3133,37 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
return 0;
}
-static int nvme_setup_prp_pools(struct nvme_dev *dev)
+static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
{
- dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
- NVME_CTRL_PAGE_SIZE,
- NVME_CTRL_PAGE_SIZE, 0);
- if (!dev->prp_page_pool)
- return -ENOMEM;
+ size_t alloc_size = sizeof(struct nvme_dma_vec) * NVME_MAX_SEGS;
- /* Optimisation for I/Os between 4k and 128k */
- dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
- 256, 256, 0);
- if (!dev->prp_small_pool) {
- dma_pool_destroy(dev->prp_page_pool);
+ dev->dmavec_mempool = mempool_create_node(1,
+ mempool_kmalloc, mempool_kfree,
+ (void *)alloc_size, GFP_KERNEL,
+ dev_to_node(dev->dev));
+ if (!dev->dmavec_mempool)
return -ENOMEM;
- }
return 0;
}
-static void nvme_release_prp_pools(struct nvme_dev *dev)
-{
- dma_pool_destroy(dev->prp_page_pool);
- dma_pool_destroy(dev->prp_small_pool);
-}
-
static void nvme_free_tagset(struct nvme_dev *dev)
{
if (dev->tagset.tags)
- blk_mq_free_tag_set(&dev->tagset);
+ nvme_remove_io_tag_set(&dev->ctrl);
dev->ctrl.tagset = NULL;
}
+/* pairs with nvme_pci_alloc_dev */
static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
struct nvme_dev *dev = to_nvme_dev(ctrl);
- nvme_dbbuf_dma_free(dev);
nvme_free_tagset(dev);
- if (dev->ctrl.admin_q)
- blk_put_queue(dev->ctrl.admin_q);
- free_opal_dev(dev->ctrl.opal_dev);
- mempool_destroy(dev->iod_mempool);
put_device(dev->dev);
kfree(dev->queues);
kfree(dev);
}
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
-{
- /*
- * Set state to deleting now to avoid blocking nvme_wait_reset(), which
- * may be holding this pci_dev's device lock.
- */
- nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
- nvme_get_ctrl(&dev->ctrl);
- nvme_dev_disable(dev, false);
- nvme_kill_queues(&dev->ctrl);
- if (!queue_work(nvme_wq, &dev->remove_work))
- nvme_put_ctrl(&dev->ctrl);
-}
-
static void nvme_reset_work(struct work_struct *work)
{
struct nvme_dev *dev =
@@ -2794,7 +3171,7 @@ static void nvme_reset_work(struct work_struct *work)
bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
int result;
- if (dev->ctrl.state != NVME_CTRL_RESETTING) {
+ if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_RESETTING) {
dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
dev->ctrl.state);
result = -ENODEV;
@@ -2813,29 +3190,7 @@ static void nvme_reset_work(struct work_struct *work)
result = nvme_pci_enable(dev);
if (result)
goto out_unlock;
-
- result = nvme_pci_configure_admin_queue(dev);
- if (result)
- goto out_unlock;
-
- result = nvme_alloc_admin_tags(dev);
- if (result)
- goto out_unlock;
-
- /*
- * Limit the max command size to prevent iod->sg allocations going
- * over a single page.
- */
- dev->ctrl.max_hw_sectors = min_t(u32,
- NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9);
- dev->ctrl.max_segments = NVME_MAX_SEGS;
-
- /*
- * Don't limit the IOMMU merged segment size.
- */
- dma_set_max_seg_size(dev->dev, 0xffffffff);
- dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1);
-
+ nvme_unquiesce_admin_queue(&dev->ctrl);
mutex_unlock(&dev->shutdown_lock);
/*
@@ -2849,58 +3204,45 @@ static void nvme_reset_work(struct work_struct *work)
goto out;
}
- /*
- * We do not support an SGL for metadata (yet), so we are limited to a
- * single integrity segment for the separate metadata pointer.
- */
- dev->ctrl.max_integrity_segments = 1;
-
- result = nvme_init_ctrl_finish(&dev->ctrl);
+ result = nvme_init_ctrl_finish(&dev->ctrl, was_suspend);
if (result)
goto out;
- if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
- if (!dev->ctrl.opal_dev)
- dev->ctrl.opal_dev =
- init_opal_dev(&dev->ctrl, &nvme_sec_submit);
- else if (was_suspend)
- opal_unlock_from_suspend(dev->ctrl.opal_dev);
- } else {
- free_opal_dev(dev->ctrl.opal_dev);
- dev->ctrl.opal_dev = NULL;
- }
+ if (nvme_ctrl_meta_sgl_supported(&dev->ctrl))
+ dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS;
+ else
+ dev->ctrl.max_integrity_segments = 1;
- if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
- result = nvme_dbbuf_dma_alloc(dev);
- if (result)
- dev_warn(dev->dev,
- "unable to allocate dma for dbbuf\n");
- }
+ nvme_dbbuf_dma_alloc(dev);
- if (dev->ctrl.hmpre) {
- result = nvme_setup_host_mem(dev);
- if (result < 0)
- goto out;
- }
+ result = nvme_setup_host_mem(dev);
+ if (result < 0)
+ goto out;
+
+ nvme_update_attrs(dev);
result = nvme_setup_io_queues(dev);
if (result)
goto out;
/*
- * Keep the controller around but remove all namespaces if we don't have
- * any working I/O queue.
+ * Freeze and update the number of I/O queues as those might have
+ * changed. If there are no I/O queues left after this reset, keep the
+ * controller around but remove all namespaces.
*/
- if (dev->online_queues < 2) {
- dev_warn(dev->ctrl.device, "IO queues not created\n");
- nvme_kill_queues(&dev->ctrl);
- nvme_remove_namespaces(&dev->ctrl);
- nvme_free_tagset(dev);
- } else {
- nvme_start_queues(&dev->ctrl);
+ if (dev->online_queues > 1) {
+ nvme_dbbuf_set(dev);
+ nvme_unquiesce_io_queues(&dev->ctrl);
nvme_wait_freeze(&dev->ctrl);
- nvme_dev_add(dev);
+ if (!nvme_pci_update_nr_queues(dev))
+ goto out;
nvme_unfreeze(&dev->ctrl);
+ } else {
+ dev_warn(dev->ctrl.device, "IO queues lost\n");
+ nvme_mark_namespaces_dead(&dev->ctrl);
+ nvme_unquiesce_io_queues(&dev->ctrl);
+ nvme_remove_namespaces(&dev->ctrl);
+ nvme_free_tagset(dev);
}
/*
@@ -2914,30 +3256,24 @@ static void nvme_reset_work(struct work_struct *work)
goto out;
}
- if (!dev->attrs_added && !sysfs_create_group(&dev->ctrl.device->kobj,
- &nvme_pci_attr_group))
- dev->attrs_added = true;
-
nvme_start_ctrl(&dev->ctrl);
return;
out_unlock:
mutex_unlock(&dev->shutdown_lock);
out:
- if (result)
- dev_warn(dev->ctrl.device,
- "Removing after probe failure status: %d\n", result);
- nvme_remove_dead_ctrl(dev);
-}
-
-static void nvme_remove_dead_ctrl_work(struct work_struct *work)
-{
- struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
- struct pci_dev *pdev = to_pci_dev(dev->dev);
-
- if (pci_get_drvdata(pdev))
- device_release_driver(&pdev->dev);
- nvme_put_ctrl(&dev->ctrl);
+ /*
+ * Set state to deleting now to avoid blocking nvme_wait_reset(), which
+ * may be holding this pci_dev's device lock.
+ */
+ dev_warn(dev->ctrl.device, "Disabling device after reset failure: %d\n",
+ result);
+ nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
+ nvme_dev_disable(dev, true);
+ nvme_sync_queues(&dev->ctrl);
+ nvme_mark_namespaces_dead(&dev->ctrl);
+ nvme_unquiesce_io_queues(&dev->ctrl);
+ nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
}
static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
@@ -2965,17 +3301,50 @@ static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
return snprintf(buf, size, "%s\n", dev_name(&pdev->dev));
}
+static void nvme_pci_print_device_info(struct nvme_ctrl *ctrl)
+{
+ struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev);
+ struct nvme_subsystem *subsys = ctrl->subsys;
+
+ dev_err(ctrl->device,
+ "VID:DID %04x:%04x model:%.*s firmware:%.*s\n",
+ pdev->vendor, pdev->device,
+ nvme_strlen(subsys->model, sizeof(subsys->model)),
+ subsys->model, nvme_strlen(subsys->firmware_rev,
+ sizeof(subsys->firmware_rev)),
+ subsys->firmware_rev);
+}
+
+static bool nvme_pci_supports_pci_p2pdma(struct nvme_ctrl *ctrl)
+{
+ struct nvme_dev *dev = to_nvme_dev(ctrl);
+
+ return dma_pci_p2pdma_supported(dev->dev);
+}
+
+static unsigned long nvme_pci_get_virt_boundary(struct nvme_ctrl *ctrl,
+ bool is_admin)
+{
+ if (!nvme_ctrl_sgl_supported(ctrl) || is_admin)
+ return NVME_CTRL_PAGE_SIZE - 1;
+ return 0;
+}
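The returned mask feeds the block layer's virt_boundary limit. With the PRP format every data pointer after the first must begin on a controller-page boundary, so reporting NVME_CTRL_PAGE_SIZE - 1 (0xfff for the usual 4 KiB controller page) makes blk-mq split any I/O whose pages would leave a gap inside a controller page; controllers that support data SGLs have no such restriction for I/O commands, hence a mask of 0, while admin commands always use PRPs and keep the restriction.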
+
static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.name = "pcie",
.module = THIS_MODULE,
- .flags = NVME_F_METADATA_SUPPORTED |
- NVME_F_PCI_P2PDMA,
+ .flags = NVME_F_METADATA_SUPPORTED,
+ .dev_attr_groups = nvme_pci_dev_attr_groups,
.reg_read32 = nvme_pci_reg_read32,
.reg_write32 = nvme_pci_reg_write32,
.reg_read64 = nvme_pci_reg_read64,
.free_ctrl = nvme_pci_free_ctrl,
.submit_async_event = nvme_pci_submit_async_event,
+ .subsystem_reset = nvme_pci_subsystem_reset,
.get_address = nvme_pci_get_address,
+ .print_device_info = nvme_pci_print_device_info,
+ .supports_pci_p2pdma = nvme_pci_supports_pci_p2pdma,
+ .get_virt_boundary = nvme_pci_get_virt_boundary,
};
static int nvme_dev_map(struct nvme_dev *dev)
@@ -3032,34 +3401,59 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) &&
dmi_match(DMI_BOARD_NAME, "LNVNB161216"))
return NVME_QUIRK_SIMPLE_SUSPEND;
+ } else if (pdev->vendor == 0x2646 && (pdev->device == 0x2263 ||
+ pdev->device == 0x500f)) {
+ /*
+ * Exclude some Kingston NV1 and A2000 devices from
+ * NVME_QUIRK_SIMPLE_SUSPEND; doing a full suspend instead saves a
+ * lot of energy during s2idle sleep on some TUXEDO platforms.
+ */
+ if (dmi_match(DMI_BOARD_NAME, "NS5X_NS7XAU") ||
+ dmi_match(DMI_BOARD_NAME, "NS5x_7xAU") ||
+ dmi_match(DMI_BOARD_NAME, "NS5x_7xPU") ||
+ dmi_match(DMI_BOARD_NAME, "PH4PRX1_PH6PRX1"))
+ return NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND;
+ } else if (pdev->vendor == 0x144d && pdev->device == 0xa80d) {
+ /*
+ * Exclude the Samsung 990 Evo from NVME_QUIRK_SIMPLE_SUSPEND
+ * because of its high power consumption (> 2 W) in s2idle
+ * sleep. Only some boards with an Intel CPU are affected.
+ * (Note for testing: the Samsung 990 Evo Plus has the same PCI ID.)
+ */
+ if (dmi_match(DMI_BOARD_NAME, "DN50Z-140HC-YD") ||
+ dmi_match(DMI_BOARD_NAME, "GMxPXxx") ||
+ dmi_match(DMI_BOARD_NAME, "GXxMRXx") ||
+ dmi_match(DMI_BOARD_NAME, "NS5X_NS7XAU") ||
+ dmi_match(DMI_BOARD_NAME, "PH4PG31") ||
+ dmi_match(DMI_BOARD_NAME, "PH4PRX1_PH6PRX1") ||
+ dmi_match(DMI_BOARD_NAME, "PH6PG01_PH6PG71"))
+ return NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND;
}
- return 0;
-}
-
-static void nvme_async_probe(void *data, async_cookie_t cookie)
-{
- struct nvme_dev *dev = data;
+ /*
+ * The NVMe SSD drops off the PCIe bus after the system has been
+ * idle for 10 hours on a Lenovo N60z board.
+ */
+ if (dmi_match(DMI_BOARD_NAME, "LXKT-ZXEG-N6"))
+ return NVME_QUIRK_NO_APST;
- flush_work(&dev->ctrl.reset_work);
- flush_work(&dev->ctrl.scan_work);
- nvme_put_ctrl(&dev->ctrl);
+ return 0;
}
-static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
+ const struct pci_device_id *id)
{
- int node, result = -ENOMEM;
- struct nvme_dev *dev;
unsigned long quirks = id->driver_data;
- size_t alloc_size;
-
- node = dev_to_node(&pdev->dev);
- if (node == NUMA_NO_NODE)
- set_dev_node(&pdev->dev, first_memory_node);
+ int node = dev_to_node(&pdev->dev);
+ struct nvme_dev *dev;
+ int ret = -ENOMEM;
- dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
+ dev = kzalloc_node(struct_size(dev, descriptor_pools, nr_node_ids),
+ GFP_KERNEL, node);
if (!dev)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
+ INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
+ mutex_init(&dev->shutdown_lock);
dev->nr_write_queues = write_queues;
dev->nr_poll_queues = poll_queues;
@@ -3067,26 +3461,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
dev->queues = kcalloc_node(dev->nr_allocated_queues,
sizeof(struct nvme_queue), GFP_KERNEL, node);
if (!dev->queues)
- goto free;
+ goto out_free_dev;
dev->dev = get_device(&pdev->dev);
- pci_set_drvdata(pdev, dev);
-
- result = nvme_dev_map(dev);
- if (result)
- goto put_pci;
-
- INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
- INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
- mutex_init(&dev->shutdown_lock);
-
- result = nvme_setup_prp_pools(dev);
- if (result)
- goto unmap;
quirks |= check_vendor_combination_bug(pdev);
-
- if (!noacpi && acpi_storage_d3(&pdev->dev)) {
+ if (!noacpi &&
+ !(quirks & NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND) &&
+ acpi_storage_d3(&pdev->dev)) {
/*
* Some systems use a BIOS workaround to ask for D3 on
* platforms that support kernel managed suspend.
@@ -3095,46 +3477,140 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
"platform quirk: setting simple suspend\n");
quirks |= NVME_QUIRK_SIMPLE_SUSPEND;
}
+ ret = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
+ quirks);
+ if (ret)
+ goto out_put_device;
+
+ if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48)
+ dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(48));
+ else
+ dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
+ dma_set_min_align_mask(&pdev->dev, NVME_CTRL_PAGE_SIZE - 1);
+ dma_set_max_seg_size(&pdev->dev, 0xffffffff);
/*
- * Double check that our mempool alloc size will cover the biggest
- * command we support.
+ * Limit the max command size to prevent iod->sg allocations going
+ * over a single page.
*/
- alloc_size = nvme_pci_iod_alloc_size();
- WARN_ON_ONCE(alloc_size > PAGE_SIZE);
+ dev->ctrl.max_hw_sectors = min_t(u32,
+ NVME_MAX_BYTES >> SECTOR_SHIFT,
+ dma_opt_mapping_size(&pdev->dev) >> 9);
+ dev->ctrl.max_segments = NVME_MAX_SEGS;
+ dev->ctrl.max_integrity_segments = 1;
+ return dev;
- dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
- mempool_kfree,
- (void *) alloc_size,
- GFP_KERNEL, node);
- if (!dev->iod_mempool) {
- result = -ENOMEM;
- goto release_pools;
- }
+out_put_device:
+ put_device(dev->dev);
+ kfree(dev->queues);
+out_free_dev:
+ kfree(dev);
+ return ERR_PTR(ret);
+}
+
+static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ struct nvme_dev *dev;
+ int result = -ENOMEM;
+
+ dev = nvme_pci_alloc_dev(pdev, id);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
- result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
- quirks);
+ result = nvme_add_ctrl(&dev->ctrl);
if (result)
- goto release_mempool;
+ goto out_put_ctrl;
+
+ result = nvme_dev_map(dev);
+ if (result)
+ goto out_uninit_ctrl;
+
+ result = nvme_pci_alloc_iod_mempool(dev);
+ if (result)
+ goto out_dev_unmap;
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
- nvme_reset_ctrl(&dev->ctrl);
- async_schedule(nvme_async_probe, dev);
+ result = nvme_pci_enable(dev);
+ if (result)
+ goto out_release_iod_mempool;
+
+ result = nvme_alloc_admin_tag_set(&dev->ctrl, &dev->admin_tagset,
+ &nvme_mq_admin_ops, sizeof(struct nvme_iod));
+ if (result)
+ goto out_disable;
+
+ /*
+ * Mark the controller as connecting before sending admin commands to
+ * allow the timeout handler to do the right thing.
+ */
+ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
+ dev_warn(dev->ctrl.device,
+ "failed to mark controller CONNECTING\n");
+ result = -EBUSY;
+ goto out_disable;
+ }
+
+ result = nvme_init_ctrl_finish(&dev->ctrl, false);
+ if (result)
+ goto out_disable;
+
+ if (nvme_ctrl_meta_sgl_supported(&dev->ctrl))
+ dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS;
+ else
+ dev->ctrl.max_integrity_segments = 1;
+ nvme_dbbuf_dma_alloc(dev);
+
+ result = nvme_setup_host_mem(dev);
+ if (result < 0)
+ goto out_disable;
+
+ nvme_update_attrs(dev);
+
+ result = nvme_setup_io_queues(dev);
+ if (result)
+ goto out_disable;
+
+ if (dev->online_queues > 1) {
+ nvme_alloc_io_tag_set(&dev->ctrl, &dev->tagset, &nvme_mq_ops,
+ nvme_pci_nr_maps(dev), sizeof(struct nvme_iod));
+ nvme_dbbuf_set(dev);
+ }
+
+ if (!dev->ctrl.tagset)
+ dev_warn(dev->ctrl.device, "IO queues not created\n");
+
+ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
+ dev_warn(dev->ctrl.device,
+ "failed to mark controller live state\n");
+ result = -ENODEV;
+ goto out_disable;
+ }
+
+ pci_set_drvdata(pdev, dev);
+
+ nvme_start_ctrl(&dev->ctrl);
+ nvme_put_ctrl(&dev->ctrl);
+ flush_work(&dev->ctrl.scan_work);
return 0;
- release_mempool:
- mempool_destroy(dev->iod_mempool);
- release_pools:
- nvme_release_prp_pools(dev);
- unmap:
+out_disable:
+ nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
+ nvme_dev_disable(dev, true);
+ nvme_free_host_mem(dev);
+ nvme_dev_remove_admin(dev);
+ nvme_dbbuf_dma_free(dev);
+ nvme_free_queues(dev, 0);
+out_release_iod_mempool:
+ mempool_destroy(dev->dmavec_mempool);
+out_dev_unmap:
nvme_dev_unmap(dev);
- put_pci:
- put_device(dev->dev);
- free:
- kfree(dev->queues);
- kfree(dev);
+out_uninit_ctrl:
+ nvme_uninit_ctrl(&dev->ctrl);
+out_put_ctrl:
+ nvme_put_ctrl(&dev->ctrl);
+ dev_err_probe(&pdev->dev, result, "probe failed\n");
return result;
}
@@ -3166,13 +3642,6 @@ static void nvme_shutdown(struct pci_dev *pdev)
nvme_disable_prepare_reset(dev, true);
}
-static void nvme_remove_attrs(struct nvme_dev *dev)
-{
- if (dev->attrs_added)
- sysfs_remove_group(&dev->ctrl.device->kobj,
- &nvme_pci_attr_group);
-}
-
/*
* The driver's remove may be called on a device in a partially initialized
* state. This function must not have any dependencies on the device state in
@@ -3194,11 +3663,12 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_stop_ctrl(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl);
nvme_dev_disable(dev, true);
- nvme_remove_attrs(dev);
nvme_free_host_mem(dev);
nvme_dev_remove_admin(dev);
+ nvme_dbbuf_dma_free(dev);
nvme_free_queues(dev, 0);
- nvme_release_prp_pools(dev);
+ mempool_destroy(dev->dmavec_mempool);
+ nvme_release_descriptor_pools(dev);
nvme_dev_unmap(dev);
nvme_uninit_ctrl(&dev->ctrl);
}
@@ -3261,7 +3731,7 @@ static int nvme_suspend(struct device *dev)
nvme_wait_freeze(ctrl);
nvme_sync_queues(ctrl);
- if (ctrl->state != NVME_CTRL_LIVE)
+ if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
goto unfreeze;
/*
@@ -3347,6 +3817,10 @@ static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
case pci_channel_io_frozen:
dev_warn(dev->ctrl.device,
"frozen state error detected, reset controller\n");
+ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
+ nvme_dev_disable(dev, true);
+ return PCI_ERS_RESULT_DISCONNECT;
+ }
nvme_dev_disable(dev, false);
return PCI_ERS_RESULT_NEED_RESET;
case pci_channel_io_perm_failure:
@@ -3363,7 +3837,8 @@ static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
dev_info(dev->ctrl.device, "restart after slot reset\n");
pci_restore_state(pdev);
- nvme_reset_ctrl(&dev->ctrl);
+ if (nvme_try_sched_reset(&dev->ctrl))
+ nvme_unquiesce_io_queues(&dev->ctrl);
return PCI_ERS_RESULT_RECOVERED;
}
@@ -3391,10 +3866,10 @@ static const struct pci_device_id nvme_id_table[] = {
NVME_QUIRK_DEALLOCATE_ZEROES, },
{ PCI_VDEVICE(INTEL, 0x0a54), /* Intel P4500/P4600 */
.driver_data = NVME_QUIRK_STRIPE_SIZE |
- NVME_QUIRK_DEALLOCATE_ZEROES, },
+ NVME_QUIRK_IGNORE_DEV_SUBNQN |
+ NVME_QUIRK_BOGUS_NID, },
{ PCI_VDEVICE(INTEL, 0x0a55), /* Dell Express Flash P4600 */
- .driver_data = NVME_QUIRK_STRIPE_SIZE |
- NVME_QUIRK_DEALLOCATE_ZEROES, },
+ .driver_data = NVME_QUIRK_STRIPE_SIZE, },
{ PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
NVME_QUIRK_MEDIUM_PRIO_SQ |
@@ -3404,9 +3879,21 @@ static const struct pci_device_id nvme_id_table[] = {
.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
{ PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */
.driver_data = NVME_QUIRK_IDENTIFY_CNS |
- NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+ NVME_QUIRK_DISABLE_WRITE_ZEROES |
+ NVME_QUIRK_BOGUS_NID, },
+ { PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1217, 0x8760), /* O2 Micro 64GB Steam Deck */
+ .driver_data = NVME_QUIRK_DMAPOOL_ALIGN_512, },
+ { PCI_DEVICE(0x126f, 0x1001), /* Silicon Motion generic */
+ .driver_data = NVME_QUIRK_NO_DEEPEST_PS |
+ NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+ { PCI_DEVICE(0x126f, 0x2262), /* Silicon Motion generic */
+ .driver_data = NVME_QUIRK_NO_DEEPEST_PS |
+ NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x126f, 0x2263), /* Silicon Motion unidentified */
- .driver_data = NVME_QUIRK_NO_NS_DESC_LIST, },
+ .driver_data = NVME_QUIRK_NO_NS_DESC_LIST |
+ NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1bb1, 0x0100), /* Seagate Nytro Flash Storage */
.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
NVME_QUIRK_NO_NS_DESC_LIST, },
@@ -3422,26 +3909,113 @@ static const struct pci_device_id nvme_id_table[] = {
.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
NVME_QUIRK_DISABLE_WRITE_ZEROES|
NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+ { PCI_DEVICE(0x15b7, 0x5008), /* Sandisk SN530 */
+ .driver_data = NVME_QUIRK_BROKEN_MSI },
+ { PCI_DEVICE(0x15b7, 0x5009), /* Sandisk SN550 */
+ .driver_data = NVME_QUIRK_BROKEN_MSI |
+ NVME_QUIRK_NO_DEEPEST_PS },
+ { PCI_DEVICE(0x1987, 0x5012), /* Phison E12 */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1987, 0x5016), /* Phison E16 */
- .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN |
+ NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1987, 0x5019), /* phison E19 */
+ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+ { PCI_DEVICE(0x1987, 0x5021), /* Phison E21 */
+ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE(0x1b4b, 0x1092), /* Lexar 256 GB SSD */
.driver_data = NVME_QUIRK_NO_NS_DESC_LIST |
NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+ { PCI_DEVICE(0x1cc1, 0x33f8), /* ADATA IM2P33F8ABR1 1 TB */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */
- .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN |
+ NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x10ec, 0x5763), /* ADATA SX6000PNP */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+ { PCI_DEVICE(0x1344, 0x5407), /* Micron Technology Inc NVMe SSD */
+ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN },
+ { PCI_DEVICE(0x1344, 0x6001), /* Micron Nitro NVMe */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1c5c, 0x1504), /* SK Hynix PC400 */
.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+ { PCI_DEVICE(0x1c5c, 0x174a), /* SK Hynix P31 SSD */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1c5c, 0x1D59), /* SK Hynix BC901 */
+ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE(0x15b7, 0x2001), /* Sandisk Skyhawk */
.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE(0x1d97, 0x2263), /* SPCC */
.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+ { PCI_DEVICE(0x144d, 0xa80b), /* Samsung PM9B1 256G and 512G */
+ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES |
+ NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x144d, 0xa809), /* Samsung MZALQ256HBJD 256G */
+ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+ { PCI_DEVICE(0x144d, 0xa802), /* Samsung SM953 */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1cc4, 0x6303), /* UMIS RPJTJ512MGE1QDY 512G */
+ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+ { PCI_DEVICE(0x1cc4, 0x6302), /* UMIS RPJTJ256MGE1QDY 256G */
+ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE(0x2646, 0x2262), /* KINGSTON SKC2000 NVMe SSD */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
{ PCI_DEVICE(0x2646, 0x2263), /* KINGSTON A2000 NVMe SSD */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
+ { PCI_DEVICE(0x2646, 0x5013), /* Kingston KC3000, Kingston FURY Renegade */
+ .driver_data = NVME_QUIRK_NO_SECONDARY_TEMP_THRESH, },
+ { PCI_DEVICE(0x2646, 0x5018), /* KINGSTON OM8SFP4xxxxP OS21012 NVMe SSD */
+ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+ { PCI_DEVICE(0x2646, 0x5016), /* KINGSTON OM3PGP4xxxxP OS21011 NVMe SSD */
+ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+ { PCI_DEVICE(0x2646, 0x501A), /* KINGSTON OM8PGP4xxxxP OS21005 NVMe SSD */
+ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+ { PCI_DEVICE(0x2646, 0x501B), /* KINGSTON OM8PGP4xxxxQ OS21005 NVMe SSD */
+ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+ { PCI_DEVICE(0x2646, 0x501E), /* KINGSTON OM3PGP4xxxxQ OS21011 NVMe SSD */
+ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+ { PCI_DEVICE(0x1f40, 0x1202), /* Netac Technologies Co. NV3000 NVMe SSD */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1f40, 0x5236), /* Netac Technologies Co. NV7000 NVMe SSD */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1e4B, 0x1001), /* MAXIO MAP1001 */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1e4B, 0x1002), /* MAXIO MAP1002 */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1e4B, 0x1202), /* MAXIO MAP1202 */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1e4B, 0x1602), /* MAXIO MAP1602 */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1cc1, 0x5350), /* ADATA XPG GAMMIX S50 */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1dbe, 0x5216), /* Acer/INNOGRIT FA100/5216 NVMe SSD */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1dbe, 0x5236), /* ADATA XPG GAMMIX S70 */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1e49, 0x0021), /* ZHITAI TiPro5000 NVMe SSD */
+ .driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
+ { PCI_DEVICE(0x1e49, 0x0041), /* ZHITAI TiPro7000 NVMe SSD */
+ .driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
+ { PCI_DEVICE(0x025e, 0xf1ac), /* SOLIDIGM P44 pro SSDPFKKW020X7 */
+ .driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
+ { PCI_DEVICE(0xc0a9, 0x540a), /* Crucial P2 */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1d97, 0x2263), /* Lexar NM610 */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1d97, 0x1d97), /* Lexar NM620 */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1d97, 0x2269), /* Lexar NM760 */
+ .driver_data = NVME_QUIRK_BOGUS_NID |
+ NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+ { PCI_DEVICE(0x10ec, 0x5763), /* TEAMGROUP T-FORCE CARDEA ZERO Z330 SSD */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1e4b, 0x1602), /* HS-SSD-FUTURE 2048G */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x10ec, 0x5765), /* TEAMGROUP MP33 2TB SSD */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061),
.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065),
@@ -3455,14 +4029,19 @@ static const struct pci_device_id nvme_id_table[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd02),
.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
- .driver_data = NVME_QUIRK_SINGLE_VECTOR },
+ /*
+ * Fix for the Apple controller found in the MacBook8,1 and
+ * some MacBook7,1 to avoid controller resets and data loss.
+ */
+ .driver_data = NVME_QUIRK_SINGLE_VECTOR |
+ NVME_QUIRK_QDEPTH_ONE },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
.driver_data = NVME_QUIRK_SINGLE_VECTOR |
NVME_QUIRK_128_BYTES_SQES |
NVME_QUIRK_SHARED_TAGS |
- NVME_QUIRK_SKIP_CID_GEN },
-
+ NVME_QUIRK_SKIP_CID_GEN |
+ NVME_QUIRK_IDENTIFY_CNS },
{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
{ 0, }
};
@@ -3474,11 +4053,12 @@ static struct pci_driver nvme_driver = {
.probe = nvme_probe,
.remove = nvme_remove,
.shutdown = nvme_shutdown,
-#ifdef CONFIG_PM_SLEEP
.driver = {
- .pm = &nvme_dev_pm_ops,
- },
+ .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+#ifdef CONFIG_PM_SLEEP
+ .pm = &nvme_dev_pm_ops,
#endif
+ },
.sriov_configure = pci_sriov_configure_simple,
.err_handler = &nvme_err_handler,
};
@@ -3502,5 +4082,6 @@ static void __exit nvme_exit(void)
MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
+MODULE_DESCRIPTION("NVMe host PCIe transport driver");
module_init(nvme_init);
module_exit(nvme_exit);