Diffstat (limited to 'drivers/nvme/host/pci.c')
 drivers/nvme/host/pci.c | 775 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 546 insertions(+), 229 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index c1d6357ec98a..8ff12e415cb5 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -8,7 +8,6 @@
#include <linux/async.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
-#include <linux/blk-mq-pci.h>
#include <linux/blk-integrity.h>
#include <linux/dmi.h>
#include <linux/init.h>
@@ -19,6 +18,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/nodemask.h>
#include <linux/once.h>
#include <linux/pci.h>
#include <linux/suspend.h>
@@ -35,15 +35,31 @@
#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes)
#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion))
-#define SGES_PER_PAGE (NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc))
+/* Optimisation for I/Os between 4k and 128k */
+#define NVME_SMALL_POOL_SIZE 256
/*
* These can be higher, but we need to ensure that any command doesn't
* require an sg allocation that needs more than a page of data.
*/
#define NVME_MAX_KB_SZ 8192
-#define NVME_MAX_SEGS 128
-#define NVME_MAX_NR_ALLOCATIONS 5
+#define NVME_MAX_NR_DESCRIPTORS 5
+
+/*
+ * For data SGLs we support a single descriptor's worth of SGL entries, but for
+ * now we also limit it to avoid an allocation larger than PAGE_SIZE for the
+ * scatterlist.
+ */
+#define NVME_MAX_SEGS \
+ min(NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc), \
+ (PAGE_SIZE / sizeof(struct scatterlist)))
+
+/*
+ * For metadata SGLs, only the small descriptor is supported, and the first
+ * entry is the segment descriptor, which for the data pointer sits in the SQE.
+ */
+#define NVME_MAX_META_SEGS \
+ ((NVME_SMALL_POOL_SIZE / sizeof(struct nvme_sgl_desc)) - 1)
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0444);
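
For reference, a worked example (not part of the patch) of what the new limits evaluate to on a common configuration, assuming NVME_CTRL_PAGE_SIZE and PAGE_SIZE are both 4096 and sizeof(struct scatterlist) is 32 bytes; the 16-byte struct nvme_sgl_desc size comes from the NVMe specification:

#include <stdio.h>

int main(void)
{
	unsigned long ctrl_page = 4096;	/* NVME_CTRL_PAGE_SIZE (assumed) */
	unsigned long page = 4096;	/* PAGE_SIZE (assumed) */
	unsigned long sgl_desc = 16;	/* sizeof(struct nvme_sgl_desc) */
	unsigned long sg_ent = 32;	/* sizeof(struct scatterlist), assumed */
	unsigned long small_pool = 256;	/* NVME_SMALL_POOL_SIZE */
	unsigned long max_segs, max_meta_segs;

	/* min(4096/16, 4096/32) = min(256, 128) = 128, same cap as before */
	max_segs = ctrl_page / sgl_desc < page / sg_ent ?
		   ctrl_page / sgl_desc : page / sg_ent;
	/* 256/16 - 1 = 15: one slot is spent on the segment descriptor */
	max_meta_segs = small_pool / sgl_desc - 1;

	printf("NVME_MAX_SEGS = %lu, NVME_MAX_META_SEGS = %lu\n",
	       max_segs, max_meta_segs);
	return 0;
}
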
@@ -112,6 +128,11 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
static void nvme_delete_io_queues(struct nvme_dev *dev);
static void nvme_update_attrs(struct nvme_dev *dev);
+struct nvme_descriptor_pools {
+ struct dma_pool *large;
+ struct dma_pool *small;
+};
+
/*
* Represents an NVM Express device. Each nvme_dev is a PCI function.
*/
@@ -121,8 +142,6 @@ struct nvme_dev {
struct blk_mq_tag_set admin_tagset;
u32 __iomem *dbs;
struct device *dev;
- struct dma_pool *prp_page_pool;
- struct dma_pool *prp_small_pool;
unsigned online_queues;
unsigned max_qid;
unsigned io_queues[HCTX_MAX_TYPES];
@@ -141,8 +160,10 @@ struct nvme_dev {
struct nvme_ctrl ctrl;
u32 last_ps;
bool hmb;
+ struct sg_table *hmb_sgt;
mempool_t *iod_mempool;
+ mempool_t *iod_meta_mempool;
/* shadow doorbell buffer support: */
__le32 *dbbuf_dbs;
@@ -153,12 +174,14 @@ struct nvme_dev {
/* host memory buffer support: */
u64 host_mem_size;
u32 nr_host_mem_descs;
+ u32 host_mem_descs_size;
dma_addr_t host_mem_descs_dma;
struct nvme_host_mem_buf_desc *host_mem_descs;
void **host_mem_desc_bufs;
unsigned int nr_allocated_queues;
unsigned int nr_write_queues;
unsigned int nr_poll_queues;
+ struct nvme_descriptor_pools descriptor_pools[];
};
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -188,6 +211,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
*/
struct nvme_queue {
struct nvme_dev *dev;
+ struct nvme_descriptor_pools descriptor_pools;
spinlock_t sq_lock;
void *sq_cmds;
/* only used for poll queues: */
@@ -216,28 +240,30 @@ struct nvme_queue {
struct completion delete_done;
};
-union nvme_descriptor {
- struct nvme_sgl_desc *sg_list;
- __le64 *prp_list;
+/* bits for iod->flags */
+enum nvme_iod_flags {
+ /* this command has been aborted by the timeout handler */
+ IOD_ABORTED = 1U << 0,
+
+ /* uses the small descriptor pool */
+ IOD_SMALL_DESCRIPTOR = 1U << 1,
};
/*
* The nvme_iod describes the data in an I/O.
- *
- * The sg pointer contains the list of PRP/SGL chunk allocations in addition
- * to the actual struct scatterlist.
*/
struct nvme_iod {
struct nvme_request req;
struct nvme_command cmd;
- bool aborted;
- s8 nr_allocations; /* PRP list pool allocations. 0 means small
- pool in use */
+ u8 flags;
+ u8 nr_descriptors;
unsigned int dma_len; /* length of single DMA segment mapping */
dma_addr_t first_dma;
dma_addr_t meta_dma;
struct sg_table sgt;
- union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS];
+ struct sg_table meta_sgt;
+ struct nvme_sgl_desc *meta_descriptor;
+ void *descriptors[NVME_MAX_NR_DESCRIPTORS];
};
static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
@@ -367,7 +393,7 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db,
/*
* Ensure that the doorbell is updated before reading the event
* index from memory. The controller needs to provide similar
- * ordering to ensure the envent index is updated before reading
+ * ordering to ensure the event index is updated before reading
* the doorbell.
*/
mb();
@@ -385,37 +411,85 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db,
* as it only leads to a small amount of wasted memory for the lifetime of
* the I/O.
*/
-static int nvme_pci_npages_prp(void)
+static __always_inline int nvme_pci_npages_prp(void)
{
unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE;
unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE);
return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8);
}
-static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
- unsigned int hctx_idx)
+static struct nvme_descriptor_pools *
+nvme_setup_descriptor_pools(struct nvme_dev *dev, unsigned numa_node)
{
- struct nvme_dev *dev = to_nvme_dev(data);
- struct nvme_queue *nvmeq = &dev->queues[0];
+ struct nvme_descriptor_pools *pools = &dev->descriptor_pools[numa_node];
+ size_t small_align = NVME_SMALL_POOL_SIZE;
- WARN_ON(hctx_idx != 0);
- WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
+ if (pools->small)
+ return pools; /* already initialized */
- hctx->driver_data = nvmeq;
- return 0;
+ pools->large = dma_pool_create_node("nvme descriptor page", dev->dev,
+ NVME_CTRL_PAGE_SIZE, NVME_CTRL_PAGE_SIZE, 0, numa_node);
+ if (!pools->large)
+ return ERR_PTR(-ENOMEM);
+
+ if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512)
+ small_align = 512;
+
+ pools->small = dma_pool_create_node("nvme descriptor small", dev->dev,
+ NVME_SMALL_POOL_SIZE, small_align, 0, numa_node);
+ if (!pools->small) {
+ dma_pool_destroy(pools->large);
+ pools->large = NULL;
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return pools;
}
-static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
- unsigned int hctx_idx)
+static void nvme_release_descriptor_pools(struct nvme_dev *dev)
+{
+ unsigned i;
+
+ for (i = 0; i < nr_node_ids; i++) {
+ struct nvme_descriptor_pools *pools = &dev->descriptor_pools[i];
+
+ dma_pool_destroy(pools->large);
+ dma_pool_destroy(pools->small);
+ }
+}
+
+static int nvme_init_hctx_common(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned qid)
{
struct nvme_dev *dev = to_nvme_dev(data);
- struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
+ struct nvme_queue *nvmeq = &dev->queues[qid];
+ struct nvme_descriptor_pools *pools;
+ struct blk_mq_tags *tags;
- WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
+ tags = qid ? dev->tagset.tags[qid - 1] : dev->admin_tagset.tags[0];
+ WARN_ON(tags != hctx->tags);
+ pools = nvme_setup_descriptor_pools(dev, hctx->numa_node);
+ if (IS_ERR(pools))
+ return PTR_ERR(pools);
+
+ nvmeq->descriptor_pools = *pools;
hctx->driver_data = nvmeq;
return 0;
}
+static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned int hctx_idx)
+{
+ WARN_ON(hctx_idx != 0);
+ return nvme_init_hctx_common(hctx, data, 0);
+}
+
+static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned int hctx_idx)
+{
+ return nvme_init_hctx_common(hctx, data, hctx_idx + 1);
+}
+
static int nvme_pci_init_request(struct blk_mq_tag_set *set,
struct request *req, unsigned int hctx_idx,
unsigned int numa_node)
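
For reference, a minimal sketch (not part of the patch) of the dma_pool lifecycle the per-node descriptor pools are built on; the helper name and sizes are illustrative, and dma_pool_create() is the plain, non-node-aware variant of the call used above:

#include <linux/device.h>
#include <linux/dmapool.h>

/* illustrative helper: one allocate/free round trip on a 4 KiB pool */
static int demo_descriptor_roundtrip(struct device *dev)
{
	struct dma_pool *pool;
	dma_addr_t dma;
	void *desc;

	/* 4 KiB blocks, 4 KiB aligned, no boundary restriction */
	pool = dma_pool_create("demo descriptor page", dev, 4096, 4096, 0);
	if (!pool)
		return -ENOMEM;

	desc = dma_pool_alloc(pool, GFP_KERNEL, &dma);
	if (desc)
		dma_pool_free(pool, desc, dma);

	dma_pool_destroy(pool);
	return desc ? 0 : -ENOMEM;
}
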
@@ -457,7 +531,7 @@ static void nvme_pci_map_queues(struct blk_mq_tag_set *set)
*/
map->queue_offset = qoff;
if (i != HCTX_TYPE_POLL && offset)
- blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
+ blk_mq_map_hw_queues(map, dev->dev, offset);
else
blk_mq_map_queues(map);
qoff += map->nr_queues;
@@ -504,6 +578,15 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
spin_unlock(&nvmeq->sq_lock);
}
+static inline bool nvme_pci_metadata_use_sgls(struct nvme_dev *dev,
+ struct request *req)
+{
+ if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl))
+ return false;
+ return req->nr_integrity_segments > 1 ||
+ nvme_req(req)->flags & NVME_REQ_USERCMD;
+}
+
static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
int nseg)
{
@@ -516,28 +599,46 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
return false;
if (!nvmeq->qid)
return false;
+ if (nvme_pci_metadata_use_sgls(dev, req))
+ return true;
if (!sgl_threshold || avg_seg_size < sgl_threshold)
- return false;
+ return nvme_req(req)->flags & NVME_REQ_USERCMD;
return true;
}
-static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
+static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq,
+ struct nvme_iod *iod)
+{
+ if (iod->flags & IOD_SMALL_DESCRIPTOR)
+ return nvmeq->descriptor_pools.small;
+ return nvmeq->descriptor_pools.large;
+}
+
+static void nvme_free_descriptors(struct nvme_queue *nvmeq, struct request *req)
{
const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
dma_addr_t dma_addr = iod->first_dma;
int i;
- for (i = 0; i < iod->nr_allocations; i++) {
- __le64 *prp_list = iod->list[i].prp_list;
+ if (iod->nr_descriptors == 1) {
+ dma_pool_free(nvme_dma_pool(nvmeq, iod), iod->descriptors[0],
+ dma_addr);
+ return;
+ }
+
+ for (i = 0; i < iod->nr_descriptors; i++) {
+ __le64 *prp_list = iod->descriptors[i];
dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
- dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
+ dma_pool_free(nvmeq->descriptor_pools.large, prp_list,
+ dma_addr);
dma_addr = next_dma_addr;
}
}
-static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
+static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_queue *nvmeq,
+ struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -550,15 +651,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
WARN_ON_ONCE(!iod->sgt.nents);
dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
-
- if (iod->nr_allocations == 0)
- dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list,
- iod->first_dma);
- else if (iod->nr_allocations == 1)
- dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list,
- iod->first_dma);
- else
- nvme_free_prps(dev, req);
+ nvme_free_descriptors(nvmeq, req);
mempool_free(iod->sgt.sgl, dev->iod_mempool);
}
@@ -576,11 +669,10 @@ static void nvme_print_sgl(struct scatterlist *sgl, int nents)
}
}
-static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
+static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq,
struct request *req, struct nvme_rw_command *cmnd)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct dma_pool *pool;
int length = blk_rq_payload_bytes(req);
struct scatterlist *sg = iod->sgt.sgl;
int dma_len = sg_dma_len(sg);
@@ -588,7 +680,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
__le64 *prp_list;
dma_addr_t prp_dma;
- int nprps, i;
+ int i;
length -= (NVME_CTRL_PAGE_SIZE - offset);
if (length <= 0) {
@@ -610,30 +702,26 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
goto done;
}
- nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
- if (nprps <= (256 / 8)) {
- pool = dev->prp_small_pool;
- iod->nr_allocations = 0;
- } else {
- pool = dev->prp_page_pool;
- iod->nr_allocations = 1;
- }
+ if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) <=
+ NVME_SMALL_POOL_SIZE / sizeof(__le64))
+ iod->flags |= IOD_SMALL_DESCRIPTOR;
- prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
- if (!prp_list) {
- iod->nr_allocations = -1;
+ prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
+ &prp_dma);
+ if (!prp_list)
return BLK_STS_RESOURCE;
- }
- iod->list[0].prp_list = prp_list;
+ iod->descriptors[iod->nr_descriptors++] = prp_list;
iod->first_dma = prp_dma;
i = 0;
for (;;) {
if (i == NVME_CTRL_PAGE_SIZE >> 3) {
__le64 *old_prp_list = prp_list;
- prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
+
+ prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large,
+ GFP_ATOMIC, &prp_dma);
if (!prp_list)
goto free_prps;
- iod->list[iod->nr_allocations++].prp_list = prp_list;
+ iod->descriptors[iod->nr_descriptors++] = prp_list;
prp_list[0] = old_prp_list[i - 1];
old_prp_list[i - 1] = cpu_to_le64(prp_dma);
i = 1;
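
For reference, the pool-selection threshold above worked through (not part of the patch), assuming the usual 4 KiB NVME_CTRL_PAGE_SIZE:

#include <stdio.h>

int main(void)
{
	unsigned long ctrl_page = 4096;	/* NVME_CTRL_PAGE_SIZE (assumed) */
	unsigned long small_pool = 256;	/* NVME_SMALL_POOL_SIZE */
	unsigned long prp_entry = 8;	/* sizeof(__le64) */

	/* 256/8 = 32 PRP entries fit in a small descriptor */
	unsigned long small_prps = small_pool / prp_entry;

	/* PRP1 maps the first page, so a small descriptor covers I/Os up to
	 * (32 + 1) * 4 KiB = 132 KiB - matching the "4k and 128k" comment */
	printf("small pool: %lu PRPs, up to %lu KiB per I/O\n",
	       small_prps, (small_prps + 1) * ctrl_page / 1024);
	/* a full-page list holds 512 entries; when chained, its last entry
	 * is rewritten to point at the next list (see the loop above) */
	printf("large pool: %lu PRPs per list\n", ctrl_page / prp_entry);
	return 0;
}
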
@@ -657,7 +745,7 @@ done:
cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
return BLK_STS_OK;
free_prps:
- nvme_free_prps(dev, req);
+ nvme_free_descriptors(nvmeq, req);
return BLK_STS_RESOURCE;
bad_sgl:
WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents),
@@ -682,11 +770,10 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
}
-static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
+static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq,
struct request *req, struct nvme_rw_command *cmd)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct dma_pool *pool;
struct nvme_sgl_desc *sg_list;
struct scatterlist *sg = iod->sgt.sgl;
unsigned int entries = iod->sgt.nents;
@@ -701,21 +788,14 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
return BLK_STS_OK;
}
- if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
- pool = dev->prp_small_pool;
- iod->nr_allocations = 0;
- } else {
- pool = dev->prp_page_pool;
- iod->nr_allocations = 1;
- }
+ if (entries <= NVME_SMALL_POOL_SIZE / sizeof(*sg_list))
+ iod->flags |= IOD_SMALL_DESCRIPTOR;
- sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
- if (!sg_list) {
- iod->nr_allocations = -1;
+ sg_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
+ &sgl_dma);
+ if (!sg_list)
return BLK_STS_RESOURCE;
- }
-
- iod->list[0].sg_list = sg_list;
+ iod->descriptors[iod->nr_descriptors++] = sg_list;
iod->first_dma = sgl_dma;
nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
@@ -769,16 +849,18 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct nvme_command *cmnd)
{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
blk_status_t ret = BLK_STS_RESOURCE;
int rc;
if (blk_rq_nr_phys_segments(req) == 1) {
- struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct bio_vec bv = req_bvec(req);
if (!is_pci_p2pdma_page(bv.bv_page)) {
- if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
+ if (!nvme_pci_metadata_use_sgls(dev, req) &&
+ (bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
+ bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
return nvme_setup_prp_simple(dev, req,
&cmnd->rw, &bv);
@@ -794,7 +876,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
if (!iod->sgt.sgl)
return BLK_STS_RESOURCE;
sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req));
- iod->sgt.orig_nents = blk_rq_map_sg(req->q, req, iod->sgt.sgl);
+ iod->sgt.orig_nents = blk_rq_map_sg(req, iod->sgt.sgl);
if (!iod->sgt.orig_nents)
goto out_free_sg;
@@ -807,9 +889,9 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
}
if (nvme_pci_use_sgls(dev, req, iod->sgt.nents))
- ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
+ ret = nvme_pci_setup_sgls(nvmeq, req, &cmnd->rw);
else
- ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
+ ret = nvme_pci_setup_prps(nvmeq, req, &cmnd->rw);
if (ret != BLK_STS_OK)
goto out_unmap_sg;
return BLK_STS_OK;
@@ -821,27 +903,98 @@ out_free_sg:
return ret;
}
-static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
- struct nvme_command *cmnd)
+static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
+ struct request *req)
+{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_rw_command *cmnd = &iod->cmd.rw;
+ struct nvme_sgl_desc *sg_list;
+ struct scatterlist *sgl, *sg;
+ unsigned int entries;
+ dma_addr_t sgl_dma;
+ int rc, i;
+
+ iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC);
+ if (!iod->meta_sgt.sgl)
+ return BLK_STS_RESOURCE;
+
+ sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments);
+ iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req,
+ iod->meta_sgt.sgl);
+ if (!iod->meta_sgt.orig_nents)
+ goto out_free_sg;
+
+ rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req),
+ DMA_ATTR_NO_WARN);
+ if (rc)
+ goto out_free_sg;
+
+ sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC,
+ &sgl_dma);
+ if (!sg_list)
+ goto out_unmap_sg;
+
+ entries = iod->meta_sgt.nents;
+ iod->meta_descriptor = sg_list;
+ iod->meta_dma = sgl_dma;
+
+ cmnd->flags = NVME_CMD_SGL_METASEG;
+ cmnd->metadata = cpu_to_le64(sgl_dma);
+
+ sgl = iod->meta_sgt.sgl;
+ if (entries == 1) {
+ nvme_pci_sgl_set_data(sg_list, sgl);
+ return BLK_STS_OK;
+ }
+
+ sgl_dma += sizeof(*sg_list);
+ nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries);
+ for_each_sg(sgl, sg, entries, i)
+ nvme_pci_sgl_set_data(&sg_list[i + 1], sg);
+
+ return BLK_STS_OK;
+
+out_unmap_sg:
+ dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
+out_free_sg:
+ mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
+ return BLK_STS_RESOURCE;
+}
+
+static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev,
+ struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct bio_vec bv = rq_integrity_vec(req);
+ struct nvme_command *cmnd = &iod->cmd;
- iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
- rq_dma_dir(req), 0);
+ iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
if (dma_mapping_error(dev->dev, iod->meta_dma))
return BLK_STS_IOERR;
cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
return BLK_STS_OK;
}
+static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+ if ((iod->cmd.common.flags & NVME_CMD_SGL_METABUF) &&
+ nvme_pci_metadata_use_sgls(dev, req))
+ return nvme_pci_setup_meta_sgls(dev, req);
+ return nvme_pci_setup_meta_mptr(dev, req);
+}
+
static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
blk_status_t ret;
- iod->aborted = false;
- iod->nr_allocations = -1;
+ iod->flags = 0;
+ iod->nr_descriptors = 0;
iod->sgt.nents = 0;
+ iod->meta_sgt.nents = 0;
ret = nvme_setup_cmd(req->q->queuedata, req);
if (ret)
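
For reference, the 16-byte SGL descriptor that both the data path and the new metadata path fill in, quoted from include/linux/nvme.h for context (not part of this patch):

#include <linux/types.h>

struct nvme_sgl_desc {
	__le64	addr;
	__le32	length;
	__u8	rsvd[3];
	__u8	type;	/* data descriptor or (last) segment descriptor */
};

When there is more than one integrity segment, the first descriptor in the small-pool allocation above is a segment descriptor pointing just past itself, followed by up to NVME_MAX_META_SEGS data descriptors; a single integrity segment is described directly with one data descriptor.
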
@@ -854,7 +1007,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
}
if (blk_integrity_rq(req)) {
- ret = nvme_map_metadata(dev, req, &iod->cmd);
+ ret = nvme_map_metadata(dev, req);
if (ret)
goto out_unmap_data;
}
@@ -862,15 +1015,13 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
nvme_start_request(req);
return BLK_STS_OK;
out_unmap_data:
- nvme_unmap_data(dev, req);
+ if (blk_rq_nr_phys_segments(req))
+ nvme_unmap_data(dev, req->mq_hctx->driver_data, req);
out_free_cmd:
nvme_cleanup_cmd(req);
return ret;
}
-/*
- * NOTE: ns is NULL when called on the admin queue.
- */
static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
@@ -900,11 +1051,15 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_STS_OK;
}
-static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
+static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct rq_list *rqlist)
{
+ struct request *req;
+
+ if (rq_list_empty(rqlist))
+ return;
+
spin_lock(&nvmeq->sq_lock);
- while (!rq_list_empty(*rqlist)) {
- struct request *req = rq_list_pop(rqlist);
+ while ((req = rq_list_pop(rqlist))) {
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
nvme_sq_copy_cmd(nvmeq, &iod->cmd);
@@ -927,34 +1082,46 @@ static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
}
-static void nvme_queue_rqs(struct request **rqlist)
+static void nvme_queue_rqs(struct rq_list *rqlist)
{
- struct request *req, *next, *prev = NULL;
- struct request *requeue_list = NULL;
+ struct rq_list submit_list = { };
+ struct rq_list requeue_list = { };
+ struct nvme_queue *nvmeq = NULL;
+ struct request *req;
- rq_list_for_each_safe(rqlist, req, next) {
- struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ while ((req = rq_list_pop(rqlist))) {
+ if (nvmeq && nvmeq != req->mq_hctx->driver_data)
+ nvme_submit_cmds(nvmeq, &submit_list);
+ nvmeq = req->mq_hctx->driver_data;
- if (!nvme_prep_rq_batch(nvmeq, req)) {
- /* detach 'req' and add to remainder list */
- rq_list_move(rqlist, &requeue_list, req, prev);
+ if (nvme_prep_rq_batch(nvmeq, req))
+ rq_list_add_tail(&submit_list, req);
+ else
+ rq_list_add_tail(&requeue_list, req);
+ }
- req = prev;
- if (!req)
- continue;
- }
+ if (nvmeq)
+ nvme_submit_cmds(nvmeq, &submit_list);
+ *rqlist = requeue_list;
+}
+
+static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
+ struct nvme_queue *nvmeq,
+ struct request *req)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- if (!next || req->mq_hctx != next->mq_hctx) {
- /* detach rest of list, and submit */
- req->rq_next = NULL;
- nvme_submit_cmds(nvmeq, rqlist);
- *rqlist = next;
- prev = NULL;
- } else
- prev = req;
+ if (!iod->meta_sgt.nents) {
+ dma_unmap_page(dev->dev, iod->meta_dma,
+ rq_integrity_vec(req).bv_len,
+ rq_dma_dir(req));
+ return;
}
- *rqlist = requeue_list;
+ dma_pool_free(nvmeq->descriptor_pools.small, iod->meta_descriptor,
+ iod->meta_dma);
+ dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
+ mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
}
static __always_inline void nvme_pci_unmap_rq(struct request *req)
@@ -962,15 +1129,11 @@ static __always_inline void nvme_pci_unmap_rq(struct request *req)
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct nvme_dev *dev = nvmeq->dev;
- if (blk_integrity_rq(req)) {
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-
- dma_unmap_page(dev->dev, iod->meta_dma,
- rq_integrity_vec(req)->bv_len, rq_dma_dir(req));
- }
+ if (blk_integrity_rq(req))
+ nvme_unmap_metadata(dev, nvmeq, req);
if (blk_rq_nr_phys_segments(req))
- nvme_unmap_data(dev, req);
+ nvme_unmap_data(dev, nvmeq, req);
}
static void nvme_pci_complete_rq(struct request *req)
@@ -1037,8 +1200,9 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
if (!nvme_try_complete_req(req, cqe->status, cqe->result) &&
- !blk_mq_add_to_batch(req, iob, nvme_req(req)->status,
- nvme_pci_complete_batch))
+ !blk_mq_add_to_batch(req, iob,
+ nvme_req(req)->status != NVME_SC_SUCCESS,
+ nvme_pci_complete_batch))
nvme_pci_complete_rq(req);
}
@@ -1054,13 +1218,13 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
}
}
-static inline int nvme_poll_cq(struct nvme_queue *nvmeq,
- struct io_comp_batch *iob)
+static inline bool nvme_poll_cq(struct nvme_queue *nvmeq,
+ struct io_comp_batch *iob)
{
- int found = 0;
+ bool found = false;
while (nvme_cqe_pending(nvmeq)) {
- found++;
+ found = true;
/*
* load-load control dependency between phase and the rest of
* the cqe requires a full read memory barrier
@@ -1081,7 +1245,7 @@ static irqreturn_t nvme_irq(int irq, void *data)
DEFINE_IO_COMP_BATCH(iob);
if (nvme_poll_cq(nvmeq, &iob)) {
- if (!rq_list_empty(iob.req_list))
+ if (!rq_list_empty(&iob.req_list))
nvme_pci_complete_batch(&iob);
return IRQ_HANDLED;
}
@@ -1108,7 +1272,9 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags));
disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
+ spin_lock(&nvmeq->cq_poll_lock);
nvme_poll_cq(nvmeq, NULL);
+ spin_unlock(&nvmeq->cq_poll_lock);
enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
}
@@ -1142,6 +1308,41 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
spin_unlock(&nvmeq->sq_lock);
}
+static int nvme_pci_subsystem_reset(struct nvme_ctrl *ctrl)
+{
+ struct nvme_dev *dev = to_nvme_dev(ctrl);
+ int ret = 0;
+
+ /*
+ * Taking the shutdown_lock ensures the BAR mapping is not being
+ * altered by reset_work. Holding this lock before the RESETTING state
+ * change, if successful, also ensures nvme_remove won't be able to
+ * proceed to iounmap until we're done.
+ */
+ mutex_lock(&dev->shutdown_lock);
+ if (!dev->bar_mapped_size) {
+ ret = -ENODEV;
+ goto unlock;
+ }
+
+ if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
+ ret = -EBUSY;
+ goto unlock;
+ }
+
+ writel(NVME_SUBSYS_RESET, dev->bar + NVME_REG_NSSR);
+ nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE);
+
+ /*
+ * Read controller status to flush the previous write and trigger a
+ * pcie read error.
+ */
+ readl(dev->bar + NVME_REG_CSTS);
+unlock:
+ mutex_unlock(&dev->shutdown_lock);
+ return ret;
+}
+
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
struct nvme_command c = { };
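
For reference (not part of the patch), the NVME_SUBSYS_RESET value written to the NSSR register above is the ASCII string "NVMe", as defined by the NVMe specification; a quick check:

#include <stdio.h>

int main(void)
{
	unsigned int nssr = 0x4E564D65;	/* NVME_SUBSYS_RESET */

	/* prints "NVMe": 0x4E='N', 0x56='V', 0x4D='M', 0x65='e' */
	printf("%c%c%c%c\n", nssr >> 24, (nssr >> 16) & 0xff,
	       (nssr >> 8) & 0xff, nssr & 0xff);
	return 0;
}
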
@@ -1273,7 +1474,7 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
dev_warn(dev->ctrl.device,
"Does your device have a faulty power saving mode enabled?\n");
dev_warn(dev->ctrl.device,
- "Try \"nvme_core.default_ps_max_latency_us=0 pcie_aspm=off\" and report a bug\n");
+ "Try \"nvme_core.default_ps_max_latency_us=0 pcie_aspm=off pcie_port_pm=off\" and report a bug\n");
}
static enum blk_eh_timer_return nvme_timeout(struct request *req)
@@ -1283,14 +1484,28 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
struct nvme_dev *dev = nvmeq->dev;
struct request *abort_req;
struct nvme_command cmd = { };
+ struct pci_dev *pdev = to_pci_dev(dev->dev);
u32 csts = readl(dev->bar + NVME_REG_CSTS);
u8 opcode;
+ /*
+ * Shutdown the device immediately if we see it is disconnected. This
+ * unblocks PCIe error handling if the nvme driver is waiting in
+ * error_resume for a device that has been removed. We can't unbind the
+ * driver while the driver's error callback is waiting to complete, so
+ * we're relying on a timeout to break that deadlock if a removal
+ * occurs while reset work is running.
+ */
+ if (pci_dev_is_disconnected(pdev))
+ nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
+ if (nvme_state_terminal(&dev->ctrl))
+ goto disable;
+
/* If PCI error recovery process is happening, we cannot reset or
* the recovery mechanism will surely fail.
*/
mb();
- if (pci_channel_offline(to_pci_dev(dev->dev)))
+ if (pci_channel_offline(pdev))
return BLK_EH_RESET_TIMER;
/*
@@ -1345,11 +1560,11 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
* returned to the driver, or if this is the admin queue.
*/
opcode = nvme_req(req)->cmd->common.opcode;
- if (!nvmeq->qid || iod->aborted) {
+ if (!nvmeq->qid || (iod->flags & IOD_ABORTED)) {
dev_warn(dev->ctrl.device,
"I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
req->tag, nvme_cid(req), opcode,
- nvme_opcode_str(nvmeq->qid, opcode, 0), nvmeq->qid);
+ nvme_opcode_str(nvmeq->qid, opcode), nvmeq->qid);
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
goto disable;
}
@@ -1358,7 +1573,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
atomic_inc(&dev->ctrl.abort_limit);
return BLK_EH_RESET_TIMER;
}
- iod->aborted = true;
+ iod->flags |= IOD_ABORTED;
cmd.abort.opcode = nvme_admin_abort_cmd;
cmd.abort.cid = nvme_cid(req);
@@ -1390,8 +1605,11 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
return BLK_EH_RESET_TIMER;
disable:
- if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING))
+ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
+ if (nvme_state_terminal(&dev->ctrl))
+ nvme_dev_disable(dev, true);
return BLK_EH_DONE;
+ }
nvme_dev_disable(dev, false);
if (nvme_try_sched_reset(&dev->ctrl))
@@ -1849,6 +2067,18 @@ static void nvme_map_cmb(struct nvme_dev *dev)
return;
/*
+ * Controllers may support a CMB size larger than their BAR, for
+ * example, due to being behind a bridge. Reduce the CMB to the
+ * reported size of the BAR
+ */
+ size = min(size, bar_size - offset);
+
+ if (!IS_ALIGNED(size, memremap_compat_align()) ||
+ !IS_ALIGNED(pci_resource_start(pdev, bar),
+ memremap_compat_align()))
+ return;
+
+ /*
* Tell the controller about the host side address mapping the CMB,
* and enable CMB decoding for the NVMe 1.4+ scheme:
*/
@@ -1858,17 +2088,10 @@ static void nvme_map_cmb(struct nvme_dev *dev)
dev->bar + NVME_REG_CMBMSC);
}
- /*
- * Controllers may support a CMB size larger than their BAR,
- * for example, due to being behind a bridge. Reduce the CMB to
- * the reported size of the BAR
- */
- if (size > bar_size - offset)
- size = bar_size - offset;
-
if (pci_p2pdma_add_resource(pdev, bar, size, offset)) {
dev_warn(dev->ctrl.device,
"failed to register the CMB\n");
+ hi_lo_writeq(0, dev->bar + NVME_REG_CMBMSC);
return;
}
@@ -1908,7 +2131,7 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
return ret;
}
-static void nvme_free_host_mem(struct nvme_dev *dev)
+static void nvme_free_host_mem_multi(struct nvme_dev *dev)
{
int i;
@@ -1923,18 +2146,54 @@ static void nvme_free_host_mem(struct nvme_dev *dev)
kfree(dev->host_mem_desc_bufs);
dev->host_mem_desc_bufs = NULL;
- dma_free_coherent(dev->dev,
- dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
+}
+
+static void nvme_free_host_mem(struct nvme_dev *dev)
+{
+ if (dev->hmb_sgt)
+ dma_free_noncontiguous(dev->dev, dev->host_mem_size,
+ dev->hmb_sgt, DMA_BIDIRECTIONAL);
+ else
+ nvme_free_host_mem_multi(dev);
+
+ dma_free_coherent(dev->dev, dev->host_mem_descs_size,
dev->host_mem_descs, dev->host_mem_descs_dma);
dev->host_mem_descs = NULL;
+ dev->host_mem_descs_size = 0;
dev->nr_host_mem_descs = 0;
}
-static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
+static int nvme_alloc_host_mem_single(struct nvme_dev *dev, u64 size)
+{
+ dev->hmb_sgt = dma_alloc_noncontiguous(dev->dev, size,
+ DMA_BIDIRECTIONAL, GFP_KERNEL, 0);
+ if (!dev->hmb_sgt)
+ return -ENOMEM;
+
+ dev->host_mem_descs = dma_alloc_coherent(dev->dev,
+ sizeof(*dev->host_mem_descs), &dev->host_mem_descs_dma,
+ GFP_KERNEL);
+ if (!dev->host_mem_descs) {
+ dma_free_noncontiguous(dev->dev, size, dev->hmb_sgt,
+ DMA_BIDIRECTIONAL);
+ dev->hmb_sgt = NULL;
+ return -ENOMEM;
+ }
+ dev->host_mem_size = size;
+ dev->host_mem_descs_size = sizeof(*dev->host_mem_descs);
+ dev->nr_host_mem_descs = 1;
+
+ dev->host_mem_descs[0].addr =
+ cpu_to_le64(dev->hmb_sgt->sgl->dma_address);
+ dev->host_mem_descs[0].size = cpu_to_le32(size / NVME_CTRL_PAGE_SIZE);
+ return 0;
+}
+
+static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,
u32 chunk_size)
{
struct nvme_host_mem_buf_desc *descs;
- u32 max_entries, len;
+ u32 max_entries, len, descs_size;
dma_addr_t descs_dma;
int i = 0;
void **bufs;
@@ -1947,8 +2206,9 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
max_entries = dev->ctrl.hmmaxd;
- descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs),
- &descs_dma, GFP_KERNEL);
+ descs_size = max_entries * sizeof(*descs);
+ descs = dma_alloc_coherent(dev->dev, descs_size, &descs_dma,
+ GFP_KERNEL);
if (!descs)
goto out;
@@ -1977,22 +2237,14 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
dev->host_mem_size = size;
dev->host_mem_descs = descs;
dev->host_mem_descs_dma = descs_dma;
+ dev->host_mem_descs_size = descs_size;
dev->host_mem_desc_bufs = bufs;
return 0;
out_free_bufs:
- while (--i >= 0) {
- size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE;
-
- dma_free_attrs(dev->dev, size, bufs[i],
- le64_to_cpu(descs[i].addr),
- DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
- }
-
kfree(bufs);
out_free_descs:
- dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
- descs_dma);
+ dma_free_coherent(dev->dev, descs_size, descs, descs_dma);
out:
dev->host_mem_descs = NULL;
return -ENOMEM;
@@ -2000,13 +2252,23 @@ out:
static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
{
+ unsigned long dma_merge_boundary = dma_get_merge_boundary(dev->dev);
u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
u64 chunk_size;
+ /*
+ * If there is an IOMMU that can merge pages, try a virtually
+ * non-contiguous allocation for a single segment first.
+ */
+ if (dma_merge_boundary && (PAGE_SIZE & dma_merge_boundary) == 0) {
+ if (!nvme_alloc_host_mem_single(dev, preferred))
+ return 0;
+ }
+
/* start big and work our way down */
for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) {
- if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
+ if (!nvme_alloc_host_mem_multi(dev, preferred, chunk_size)) {
if (!min || dev->host_mem_size >= min)
return 0;
nvme_free_host_mem(dev);
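
For reference, a minimal sketch (not part of the patch) of the dma_alloc_noncontiguous() round trip that the new single-segment HMB path builds on; the helper name is illustrative:

#include <linux/device.h>
#include <linux/dma-mapping.h>

/* illustrative: allocate a buffer the IOMMU can present as one DMA segment */
static int demo_hmb_single(struct device *dev, size_t size)
{
	struct sg_table *sgt;

	sgt = dma_alloc_noncontiguous(dev, size, DMA_BIDIRECTIONAL,
				      GFP_KERNEL, 0);
	if (!sgt)
		return -ENOMEM;

	/* with a merging IOMMU the whole allocation is a single segment,
	 * so one host memory buffer descriptor can describe all of it */
	dev_info(dev, "hmb sketch: %d segment(s), first dma %pad\n",
		 sgt->nents, &sgt->sgl->dma_address);

	dma_free_noncontiguous(dev, size, sgt, DMA_BIDIRECTIONAL);
	return 0;
}
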
@@ -2054,8 +2316,10 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
}
dev_info(dev->ctrl.device,
- "allocated %lld MiB host memory buffer.\n",
- dev->host_mem_size >> ilog2(SZ_1M));
+ "allocated %lld MiB host memory buffer (%u segment%s).\n",
+ dev->host_mem_size >> ilog2(SZ_1M),
+ dev->nr_host_mem_descs,
+ str_plural(dev->nr_host_mem_descs));
}
ret = nvme_set_host_mem(dev, enable_bits);
@@ -2218,6 +2482,7 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
.priv = dev,
};
unsigned int irq_queues, poll_queues;
+ unsigned int flags = PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY;
/*
* Poll queues don't need interrupts, but we need at least one I/O queue
@@ -2241,8 +2506,10 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
irq_queues = 1;
if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR))
irq_queues += (nr_io_queues - poll_queues);
- return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
- PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
+ if (dev->ctrl.quirks & NVME_QUIRK_BROKEN_MSI)
+ flags &= ~PCI_IRQ_MSI;
+ return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, flags,
+ &affd);
}
static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
@@ -2460,17 +2727,36 @@ static unsigned int nvme_pci_nr_maps(struct nvme_dev *dev)
return 1;
}
-static void nvme_pci_update_nr_queues(struct nvme_dev *dev)
+static bool nvme_pci_update_nr_queues(struct nvme_dev *dev)
{
+ if (!dev->ctrl.tagset) {
+ nvme_alloc_io_tag_set(&dev->ctrl, &dev->tagset, &nvme_mq_ops,
+ nvme_pci_nr_maps(dev), sizeof(struct nvme_iod));
+ return true;
+ }
+
+ /* Give up if we are racing with nvme_dev_disable() */
+ if (!mutex_trylock(&dev->shutdown_lock))
+ return false;
+
+ /* Check if nvme_dev_disable() has been executed already */
+ if (!dev->online_queues) {
+ mutex_unlock(&dev->shutdown_lock);
+ return false;
+ }
+
blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
/* free previously allocated queues that are no longer usable */
nvme_free_queues(dev, dev->online_queues);
+ mutex_unlock(&dev->shutdown_lock);
+ return true;
}
static int nvme_pci_enable(struct nvme_dev *dev)
{
int result = -ENOMEM;
struct pci_dev *pdev = to_pci_dev(dev->dev);
+ unsigned int flags = PCI_IRQ_ALL_TYPES;
if (pci_enable_device_mem(pdev))
return result;
@@ -2487,7 +2773,9 @@ static int nvme_pci_enable(struct nvme_dev *dev)
* interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
* adjust this later.
*/
- result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
+ if (dev->ctrl.quirks & NVME_QUIRK_BROKEN_MSI)
+ flags &= ~PCI_IRQ_MSI;
+ result = pci_alloc_irq_vectors(pdev, 1, 1, flags);
if (result < 0)
goto disable;
@@ -2508,15 +2796,8 @@ static int nvme_pci_enable(struct nvme_dev *dev)
else
dev->io_sqes = NVME_NVM_IOSQES;
- /*
- * Temporary fix for the Apple controller found in the MacBook8,1 and
- * some MacBook7,1 to avoid controller resets and data loss.
- */
- if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
+ if (dev->ctrl.quirks & NVME_QUIRK_QDEPTH_ONE) {
dev->q_depth = 2;
- dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
- "set queue depth=%u to work around controller resets\n",
- dev->q_depth);
} else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
(pdev->device == 0xa821 || pdev->device == 0xa822) &&
NVME_CAP_MQES(dev->ctrl.cap) == 0) {
@@ -2631,32 +2912,9 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
return 0;
}
-static int nvme_setup_prp_pools(struct nvme_dev *dev)
-{
- dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
- NVME_CTRL_PAGE_SIZE,
- NVME_CTRL_PAGE_SIZE, 0);
- if (!dev->prp_page_pool)
- return -ENOMEM;
-
- /* Optimisation for I/Os between 4k and 128k */
- dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
- 256, 256, 0);
- if (!dev->prp_small_pool) {
- dma_pool_destroy(dev->prp_page_pool);
- return -ENOMEM;
- }
- return 0;
-}
-
-static void nvme_release_prp_pools(struct nvme_dev *dev)
-{
- dma_pool_destroy(dev->prp_page_pool);
- dma_pool_destroy(dev->prp_small_pool);
-}
-
static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
{
+ size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS;
dev->iod_mempool = mempool_create_node(1,
@@ -2665,7 +2923,18 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
dev_to_node(dev->dev));
if (!dev->iod_mempool)
return -ENOMEM;
+
+ dev->iod_meta_mempool = mempool_create_node(1,
+ mempool_kmalloc, mempool_kfree,
+ (void *)meta_size, GFP_KERNEL,
+ dev_to_node(dev->dev));
+ if (!dev->iod_meta_mempool)
+ goto free;
+
return 0;
+free:
+ mempool_destroy(dev->iod_mempool);
+ return -ENOMEM;
}
static void nvme_free_tagset(struct nvme_dev *dev)
@@ -2730,6 +2999,11 @@ static void nvme_reset_work(struct work_struct *work)
if (result)
goto out;
+ if (nvme_ctrl_meta_sgl_supported(&dev->ctrl))
+ dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS;
+ else
+ dev->ctrl.max_integrity_segments = 1;
+
nvme_dbbuf_dma_alloc(dev);
result = nvme_setup_host_mem(dev);
@@ -2741,7 +3015,7 @@ static void nvme_reset_work(struct work_struct *work)
goto out;
/*
- * Freeze and update the number of I/O queues as thos might have
+ * Freeze and update the number of I/O queues as those might have
* changed. If there are no I/O queues left after this reset, keep the
* controller around but remove all namespaces.
*/
@@ -2749,7 +3023,8 @@ static void nvme_reset_work(struct work_struct *work)
nvme_dbbuf_set(dev);
nvme_unquiesce_io_queues(&dev->ctrl);
nvme_wait_freeze(&dev->ctrl);
- nvme_pci_update_nr_queues(dev);
+ if (!nvme_pci_update_nr_queues(dev))
+ goto out;
nvme_unfreeze(&dev->ctrl);
} else {
dev_warn(dev->ctrl.device, "IO queues lost\n");
@@ -2846,6 +3121,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.reg_read64 = nvme_pci_reg_read64,
.free_ctrl = nvme_pci_free_ctrl,
.submit_async_event = nvme_pci_submit_async_event,
+ .subsystem_reset = nvme_pci_subsystem_reset,
.get_address = nvme_pci_get_address,
.print_device_info = nvme_pci_print_device_info,
.supports_pci_p2pdma = nvme_pci_supports_pci_p2pdma,
@@ -2910,15 +3186,35 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
/*
* Exclude some Kingston NV1 and A2000 devices from
* NVME_QUIRK_SIMPLE_SUSPEND. Do a full suspend to save a
- * lot fo energy with s2idle sleep on some TUXEDO platforms.
+ * lot of energy with s2idle sleep on some TUXEDO platforms.
*/
if (dmi_match(DMI_BOARD_NAME, "NS5X_NS7XAU") ||
dmi_match(DMI_BOARD_NAME, "NS5x_7xAU") ||
dmi_match(DMI_BOARD_NAME, "NS5x_7xPU") ||
dmi_match(DMI_BOARD_NAME, "PH4PRX1_PH6PRX1"))
return NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND;
+ } else if (pdev->vendor == 0x144d && pdev->device == 0xa80d) {
+ /*
+ * Exclude Samsung 990 Evo from NVME_QUIRK_SIMPLE_SUSPEND
+ * because of high power consumption (> 2 Watt) in s2idle
+ * sleep. Only some boards with Intel CPU are affected.
+ */
+ if (dmi_match(DMI_BOARD_NAME, "DN50Z-140HC-YD") ||
+ dmi_match(DMI_BOARD_NAME, "GMxPXxx") ||
+ dmi_match(DMI_BOARD_NAME, "GXxMRXx") ||
+ dmi_match(DMI_BOARD_NAME, "PH4PG31") ||
+ dmi_match(DMI_BOARD_NAME, "PH4PRX1_PH6PRX1") ||
+ dmi_match(DMI_BOARD_NAME, "PH6PG01_PH6PG71"))
+ return NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND;
}
+ /*
+ * NVMe SSD drops off the PCIe bus after system idle
+ * for 10 hours on a Lenovo N60z board.
+ */
+ if (dmi_match(DMI_BOARD_NAME, "LXKT-ZXEG-N6"))
+ return NVME_QUIRK_NO_APST;
+
return 0;
}
@@ -2930,7 +3226,8 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
struct nvme_dev *dev;
int ret = -ENOMEM;
- dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
+ dev = kzalloc_node(struct_size(dev, descriptor_pools, nr_node_ids),
+ GFP_KERNEL, node);
if (!dev)
return ERR_PTR(-ENOMEM);
INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
@@ -2977,11 +3274,6 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
dev->ctrl.max_hw_sectors = min_t(u32,
NVME_MAX_KB_SZ << 1, dma_opt_mapping_size(&pdev->dev) >> 9);
dev->ctrl.max_segments = NVME_MAX_SEGS;
-
- /*
- * There is no support for SGLs for metadata (yet), so we are limited to
- * a single integrity segment for the separate metadata pointer.
- */
dev->ctrl.max_integrity_segments = 1;
return dev;
@@ -3002,17 +3294,17 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (IS_ERR(dev))
return PTR_ERR(dev);
- result = nvme_dev_map(dev);
+ result = nvme_add_ctrl(&dev->ctrl);
if (result)
- goto out_uninit_ctrl;
+ goto out_put_ctrl;
- result = nvme_setup_prp_pools(dev);
+ result = nvme_dev_map(dev);
if (result)
- goto out_dev_unmap;
+ goto out_uninit_ctrl;
result = nvme_pci_alloc_iod_mempool(dev);
if (result)
- goto out_release_prp_pools;
+ goto out_dev_unmap;
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
@@ -3040,6 +3332,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (result)
goto out_disable;
+ if (nvme_ctrl_meta_sgl_supported(&dev->ctrl))
+ dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS;
+ else
+ dev->ctrl.max_integrity_segments = 1;
+
nvme_dbbuf_dma_alloc(dev);
result = nvme_setup_host_mem(dev);
@@ -3082,12 +3379,12 @@ out_disable:
nvme_free_queues(dev, 0);
out_release_iod_mempool:
mempool_destroy(dev->iod_mempool);
-out_release_prp_pools:
- nvme_release_prp_pools(dev);
+ mempool_destroy(dev->iod_meta_mempool);
out_dev_unmap:
nvme_dev_unmap(dev);
out_uninit_ctrl:
nvme_uninit_ctrl(&dev->ctrl);
+out_put_ctrl:
nvme_put_ctrl(&dev->ctrl);
return result;
}
@@ -3146,7 +3443,8 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_dbbuf_dma_free(dev);
nvme_free_queues(dev, 0);
mempool_destroy(dev->iod_mempool);
- nvme_release_prp_pools(dev);
+ mempool_destroy(dev->iod_meta_mempool);
+ nvme_release_descriptor_pools(dev);
nvme_dev_unmap(dev);
nvme_uninit_ctrl(&dev->ctrl);
}
@@ -3315,7 +3613,7 @@ static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
dev_info(dev->ctrl.device, "restart after slot reset\n");
pci_restore_state(pdev);
- if (!nvme_try_sched_reset(&dev->ctrl))
+ if (nvme_try_sched_reset(&dev->ctrl))
nvme_unquiesce_io_queues(&dev->ctrl);
return PCI_ERS_RESULT_RECOVERED;
}
@@ -3344,12 +3642,10 @@ static const struct pci_device_id nvme_id_table[] = {
NVME_QUIRK_DEALLOCATE_ZEROES, },
{ PCI_VDEVICE(INTEL, 0x0a54), /* Intel P4500/P4600 */
.driver_data = NVME_QUIRK_STRIPE_SIZE |
- NVME_QUIRK_DEALLOCATE_ZEROES |
NVME_QUIRK_IGNORE_DEV_SUBNQN |
NVME_QUIRK_BOGUS_NID, },
{ PCI_VDEVICE(INTEL, 0x0a55), /* Dell Express Flash P4600 */
- .driver_data = NVME_QUIRK_STRIPE_SIZE |
- NVME_QUIRK_DEALLOCATE_ZEROES, },
+ .driver_data = NVME_QUIRK_STRIPE_SIZE, },
{ PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
NVME_QUIRK_MEDIUM_PRIO_SQ |
@@ -3363,6 +3659,14 @@ static const struct pci_device_id nvme_id_table[] = {
NVME_QUIRK_BOGUS_NID, },
{ PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */
.driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1217, 0x8760), /* O2 Micro 64GB Steam Deck */
+ .driver_data = NVME_QUIRK_DMAPOOL_ALIGN_512, },
+ { PCI_DEVICE(0x126f, 0x1001), /* Silicon Motion generic */
+ .driver_data = NVME_QUIRK_NO_DEEPEST_PS |
+ NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+ { PCI_DEVICE(0x126f, 0x2262), /* Silicon Motion generic */
+ .driver_data = NVME_QUIRK_NO_DEEPEST_PS |
+ NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x126f, 0x2263), /* Silicon Motion unidentified */
.driver_data = NVME_QUIRK_NO_NS_DESC_LIST |
NVME_QUIRK_BOGUS_NID, },
@@ -3381,6 +3685,11 @@ static const struct pci_device_id nvme_id_table[] = {
.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
NVME_QUIRK_DISABLE_WRITE_ZEROES|
NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+ { PCI_DEVICE(0x15b7, 0x5008), /* Sandisk SN530 */
+ .driver_data = NVME_QUIRK_BROKEN_MSI },
+ { PCI_DEVICE(0x15b7, 0x5009), /* Sandisk SN550 */
+ .driver_data = NVME_QUIRK_BROKEN_MSI |
+ NVME_QUIRK_NO_DEEPEST_PS },
{ PCI_DEVICE(0x1987, 0x5012), /* Phison E12 */
.driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1987, 0x5016), /* Phison E16 */
@@ -3458,12 +3767,16 @@ static const struct pci_device_id nvme_id_table[] = {
.driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1cc1, 0x5350), /* ADATA XPG GAMMIX S50 */
.driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1dbe, 0x5216), /* Acer/INNOGRIT FA100/5216 NVMe SSD */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1dbe, 0x5236), /* ADATA XPG GAMMIX S70 */
.driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1e49, 0x0021), /* ZHITAI TiPro5000 NVMe SSD */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
{ PCI_DEVICE(0x1e49, 0x0041), /* ZHITAI TiPro7000 NVMe SSD */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
+ { PCI_DEVICE(0x025e, 0xf1ac), /* SOLIDIGM P44 pro SSDPFKKW020X7 */
+ .driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
{ PCI_DEVICE(0xc0a9, 0x540a), /* Crucial P2 */
.driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1d97, 0x2263), /* Lexar NM610 */
@@ -3492,7 +3805,12 @@ static const struct pci_device_id nvme_id_table[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd02),
.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
- .driver_data = NVME_QUIRK_SINGLE_VECTOR },
+ /*
+ * Fix for the Apple controller found in the MacBook8,1 and
+ * some MacBook7,1 to avoid controller resets and data loss.
+ */
+ .driver_data = NVME_QUIRK_SINGLE_VECTOR |
+ NVME_QUIRK_QDEPTH_ONE },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
.driver_data = NVME_QUIRK_SINGLE_VECTOR |
@@ -3527,9 +3845,7 @@ static int __init nvme_init(void)
BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
- BUILD_BUG_ON(NVME_MAX_SEGS > SGES_PER_PAGE);
- BUILD_BUG_ON(sizeof(struct scatterlist) * NVME_MAX_SEGS > PAGE_SIZE);
- BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_ALLOCATIONS);
+ BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_DESCRIPTORS);
return pci_register_driver(&nvme_driver);
}
@@ -3543,5 +3859,6 @@ static void __exit nvme_exit(void)
MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
+MODULE_DESCRIPTION("NVMe host PCIe transport driver");
module_init(nvme_init);
module_exit(nvme_exit);