summaryrefslogtreecommitdiff
path: root/drivers/nvme/host/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/nvme/host/core.c')
-rw-r--r--drivers/nvme/host/core.c1170
1 files changed, 722 insertions, 448 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0a96362912ce..8359d0aa0e44 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4,6 +4,7 @@
* Copyright (c) 2011-2014, Intel Corporation.
*/
+#include <linux/async.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-integrity.h>
@@ -21,7 +22,7 @@
#include <linux/nvme_ioctl.h>
#include <linux/pm_qos.h>
#include <linux/ratelimit.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include "nvme.h"
#include "fabrics.h"
@@ -36,10 +37,13 @@ struct nvme_ns_info {
struct nvme_ns_ids ids;
u32 nsid;
__le32 anagrpid;
+ u8 pi_offset;
bool is_shared;
bool is_readonly;
bool is_ready;
bool is_removed;
+ bool is_rotational;
+ bool no_vwc;
};
unsigned int admin_timeout = 60;
@@ -90,6 +94,17 @@ MODULE_PARM_DESC(apst_secondary_latency_tol_us,
"secondary APST latency tolerance in us");
/*
+ * Older kernels didn't enable protection information if it was at an offset.
+ * Newer kernels do, so it breaks reads on the upgrade if such formats were
+ * used in prior kernels since the metadata written did not contain a valid
+ * checksum.
+ */
+static bool disable_pi_offsets = false;
+module_param(disable_pi_offsets, bool, 0444);
+MODULE_PARM_DESC(disable_pi_offsets,
+ "disable protection information if it has an offset");
+
+/*
* nvme_wq - hosts nvme related works that are not reset or delete
* nvme_reset_wq - hosts nvme reset works
* nvme_delete_wq - hosts nvme delete works
@@ -110,16 +125,25 @@ struct workqueue_struct *nvme_delete_wq;
EXPORT_SYMBOL_GPL(nvme_delete_wq);
static LIST_HEAD(nvme_subsystems);
-static DEFINE_MUTEX(nvme_subsystems_lock);
+DEFINE_MUTEX(nvme_subsystems_lock);
static DEFINE_IDA(nvme_instance_ida);
static dev_t nvme_ctrl_base_chr_devt;
-static struct class *nvme_class;
-static struct class *nvme_subsys_class;
+static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env);
+static const struct class nvme_class = {
+ .name = "nvme",
+ .dev_uevent = nvme_class_uevent,
+};
+
+static const struct class nvme_subsys_class = {
+ .name = "nvme-subsystem",
+};
static DEFINE_IDA(nvme_ns_chr_minor_ida);
static dev_t nvme_ns_chr_devt;
-static struct class *nvme_ns_chr_class;
+static const struct class nvme_ns_chr_class = {
+ .name = "nvme-generic",
+};
static void nvme_put_subsystem(struct nvme_subsystem *subsys);
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
@@ -252,7 +276,7 @@ void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
static blk_status_t nvme_error_status(u16 status)
{
- switch (status & 0x7ff) {
+ switch (status & NVME_SCT_SC_MASK) {
case NVME_SC_SUCCESS:
return BLK_STS_OK;
case NVME_SC_CAP_EXCEEDED:
@@ -298,7 +322,7 @@ static void nvme_retry_req(struct request *req)
u16 crd;
/* The mask and shift result must be <= 3 */
- crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
+ crd = (nvme_req(req)->status & NVME_STATUS_CRD) >> 11;
if (crd)
delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100;
@@ -320,10 +344,10 @@ static void nvme_log_error(struct request *req)
nvme_sect_to_lba(ns->head, blk_rq_pos(req)),
blk_rq_bytes(req) >> ns->head->lba_shift,
nvme_get_error_status_str(nr->status),
- nr->status >> 8 & 7, /* Status Code Type */
- nr->status & 0xff, /* Status Code */
- nr->status & NVME_SC_MORE ? "MORE " : "",
- nr->status & NVME_SC_DNR ? "DNR " : "");
+ NVME_SCT(nr->status), /* Status Code Type */
+ nr->status & NVME_SC_MASK, /* Status Code */
+ nr->status & NVME_STATUS_MORE ? "MORE " : "",
+ nr->status & NVME_STATUS_DNR ? "DNR " : "");
return;
}
@@ -332,10 +356,10 @@ static void nvme_log_error(struct request *req)
nvme_get_admin_opcode_str(nr->cmd->common.opcode),
nr->cmd->common.opcode,
nvme_get_error_status_str(nr->status),
- nr->status >> 8 & 7, /* Status Code Type */
- nr->status & 0xff, /* Status Code */
- nr->status & NVME_SC_MORE ? "MORE " : "",
- nr->status & NVME_SC_DNR ? "DNR " : "");
+ NVME_SCT(nr->status), /* Status Code Type */
+ nr->status & NVME_SC_MASK, /* Status Code */
+ nr->status & NVME_STATUS_MORE ? "MORE " : "",
+ nr->status & NVME_STATUS_DNR ? "DNR " : "");
}
static void nvme_log_err_passthru(struct request *req)
@@ -350,10 +374,10 @@ static void nvme_log_err_passthru(struct request *req)
nvme_get_admin_opcode_str(nr->cmd->common.opcode),
nr->cmd->common.opcode,
nvme_get_error_status_str(nr->status),
- nr->status >> 8 & 7, /* Status Code Type */
- nr->status & 0xff, /* Status Code */
- nr->status & NVME_SC_MORE ? "MORE " : "",
- nr->status & NVME_SC_DNR ? "DNR " : "",
+ NVME_SCT(nr->status), /* Status Code Type */
+ nr->status & NVME_SC_MASK, /* Status Code */
+ nr->status & NVME_STATUS_MORE ? "MORE " : "",
+ nr->status & NVME_STATUS_DNR ? "DNR " : "",
nr->cmd->common.cdw10,
nr->cmd->common.cdw11,
nr->cmd->common.cdw12,
@@ -374,14 +398,14 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
if (likely(nvme_req(req)->status == 0))
return COMPLETE;
- if ((nvme_req(req)->status & 0x7ff) == NVME_SC_AUTH_REQUIRED)
- return AUTHENTICATE;
-
if (blk_noretry_request(req) ||
- (nvme_req(req)->status & NVME_SC_DNR) ||
+ (nvme_req(req)->status & NVME_STATUS_DNR) ||
nvme_req(req)->retries >= nvme_max_retries)
return COMPLETE;
+ if ((nvme_req(req)->status & NVME_SCT_SC_MASK) == NVME_SC_AUTH_REQUIRED)
+ return AUTHENTICATE;
+
if (req->cmd_flags & REQ_NVME_MPATH) {
if (nvme_is_path_error(nvme_req(req)->status) ||
blk_queue_dying(req->q))
@@ -405,10 +429,8 @@ static inline void nvme_end_req_zoned(struct request *req)
}
}
-static inline void nvme_end_req(struct request *req)
+static inline void __nvme_end_req(struct request *req)
{
- blk_status_t status = nvme_error_status(nvme_req(req)->status);
-
if (unlikely(nvme_req(req)->status && !(req->rq_flags & RQF_QUIET))) {
if (blk_rq_is_passthrough(req))
nvme_log_err_passthru(req);
@@ -419,6 +441,13 @@ static inline void nvme_end_req(struct request *req)
nvme_trace_bio_complete(req);
if (req->cmd_flags & REQ_NVME_MPATH)
nvme_mpath_end_request(req);
+}
+
+void nvme_end_req(struct request *req)
+{
+ blk_status_t status = nvme_error_status(nvme_req(req)->status);
+
+ __nvme_end_req(req);
blk_mq_end_request(req, status);
}
@@ -467,7 +496,7 @@ void nvme_complete_batch_req(struct request *req)
{
trace_nvme_complete_rq(req);
nvme_cleanup_cmd(req);
- nvme_end_req_zoned(req);
+ __nvme_end_req(req);
}
EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
@@ -535,8 +564,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
switch (new_state) {
case NVME_CTRL_LIVE:
switch (old_state) {
- case NVME_CTRL_NEW:
- case NVME_CTRL_RESETTING:
case NVME_CTRL_CONNECTING:
changed = true;
fallthrough;
@@ -620,27 +647,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
/*
- * Returns true for sink states that can't ever transition back to live.
- */
-static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
-{
- switch (nvme_ctrl_state(ctrl)) {
- case NVME_CTRL_NEW:
- case NVME_CTRL_LIVE:
- case NVME_CTRL_RESETTING:
- case NVME_CTRL_CONNECTING:
- return false;
- case NVME_CTRL_DELETING:
- case NVME_CTRL_DELETING_NOIO:
- case NVME_CTRL_DEAD:
- return true;
- default:
- WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
- return true;
- }
-}
-
-/*
* Waits for the controller state to be resetting, or returns false if it is
* not possible to ever transition to that state.
*/
@@ -685,7 +691,7 @@ static void nvme_free_ns(struct kref *kref)
kfree(ns);
}
-static inline bool nvme_get_ns(struct nvme_ns *ns)
+bool nvme_get_ns(struct nvme_ns *ns)
{
return kref_get_unless_zero(&ns->kref);
}
@@ -694,7 +700,7 @@ void nvme_put_ns(struct nvme_ns *ns)
{
kref_put(&ns->kref, nvme_free_ns);
}
-EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
+EXPORT_SYMBOL_NS_GPL(nvme_put_ns, "NVME_TARGET_PASSTHRU");
static inline void nvme_clear_nvme_request(struct request *req)
{
@@ -877,6 +883,12 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
return BLK_STS_OK;
}
+static void nvme_set_app_tag(struct request *req, struct nvme_command *cmnd)
+{
+ cmnd->rw.lbat = cpu_to_le16(bio_integrity(req->bio)->app_tag);
+ cmnd->rw.lbatm = cpu_to_le16(0xffff);
+}
+
static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
struct request *req)
{
@@ -934,6 +946,36 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
return BLK_STS_OK;
}
+/*
+ * NVMe does not support a dedicated command to issue an atomic write. A write
+ * which does adhere to the device atomic limits will silently be executed
+ * non-atomically. The request issuer should ensure that the write is within
+ * the queue atomic writes limits, but just validate this in case it is not.
+ */
+static bool nvme_valid_atomic_write(struct request *req)
+{
+ struct request_queue *q = req->q;
+ u32 boundary_bytes = queue_atomic_write_boundary_bytes(q);
+
+ if (blk_rq_bytes(req) > queue_atomic_write_unit_max_bytes(q))
+ return false;
+
+ if (boundary_bytes) {
+ u64 mask = boundary_bytes - 1, imask = ~mask;
+ u64 start = blk_rq_pos(req) << SECTOR_SHIFT;
+ u64 end = start + blk_rq_bytes(req) - 1;
+
+ /* If greater then must be crossing a boundary */
+ if (blk_rq_bytes(req) > boundary_bytes)
+ return false;
+
+ if ((start & imask) != (end & imask))
+ return false;
+ }
+
+ return true;
+}
+
static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
struct request *req, struct nvme_command *cmnd,
enum nvme_opcode op)
@@ -949,6 +991,9 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
if (req->cmd_flags & REQ_RAHEAD)
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
+ if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req))
+ return BLK_STS_INVAL;
+
cmnd->rw.opcode = op;
cmnd->rw.flags = 0;
cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
@@ -960,8 +1005,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
cmnd->rw.length =
cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
cmnd->rw.reftag = 0;
- cmnd->rw.apptag = 0;
- cmnd->rw.appmask = 0;
+ cmnd->rw.lbat = 0;
+ cmnd->rw.lbatm = 0;
if (ns->head->ms) {
/*
@@ -976,18 +1021,17 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
control |= NVME_RW_PRINFO_PRACT;
}
- switch (ns->head->pi_type) {
- case NVME_NS_DPS_PI_TYPE3:
+ if (bio_integrity_flagged(req->bio, BIP_CHECK_GUARD))
control |= NVME_RW_PRINFO_PRCHK_GUARD;
- break;
- case NVME_NS_DPS_PI_TYPE1:
- case NVME_NS_DPS_PI_TYPE2:
- control |= NVME_RW_PRINFO_PRCHK_GUARD |
- NVME_RW_PRINFO_PRCHK_REF;
+ if (bio_integrity_flagged(req->bio, BIP_CHECK_REFTAG)) {
+ control |= NVME_RW_PRINFO_PRCHK_REF;
if (op == nvme_cmd_zone_append)
control |= NVME_RW_APPEND_PIREMAP;
nvme_set_ref_tag(ns, cmnd, req);
- break;
+ }
+ if (bio_integrity_flagged(req->bio, BIP_CHECK_APPTAG)) {
+ control |= NVME_RW_PRINFO_PRCHK_APP;
+ nvme_set_app_tag(req, cmnd);
}
}
@@ -1005,6 +1049,7 @@ void nvme_cleanup_cmd(struct request *req)
clear_bit_unlock(0, &ctrl->discard_page_busy);
else
kfree(bvec_virt(&req->special_vec));
+ req->rq_flags &= ~RQF_SPECIAL_PAYLOAD;
}
}
EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
@@ -1081,7 +1126,7 @@ int nvme_execute_rq(struct request *rq, bool at_head)
return nvme_req(rq)->status;
return blk_status_to_errno(status);
}
-EXPORT_SYMBOL_NS_GPL(nvme_execute_rq, NVME_TARGET_PASSTHRU);
+EXPORT_SYMBOL_NS_GPL(nvme_execute_rq, "NVME_TARGET_PASSTHRU");
/*
* Returns 0 on success. If the result is negative, it's a Linux error code;
@@ -1161,7 +1206,7 @@ u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
return effects;
}
-EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
+EXPORT_SYMBOL_NS_GPL(nvme_command_effects, "NVME_TARGET_PASSTHRU");
u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
{
@@ -1181,7 +1226,7 @@ u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
}
return effects;
}
-EXPORT_SYMBOL_NS_GPL(nvme_passthru_start, NVME_TARGET_PASSTHRU);
+EXPORT_SYMBOL_NS_GPL(nvme_passthru_start, "NVME_TARGET_PASSTHRU");
void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects,
struct nvme_command *cmd, int status)
@@ -1226,11 +1271,11 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects,
break;
}
}
-EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, NVME_TARGET_PASSTHRU);
+EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, "NVME_TARGET_PASSTHRU");
/*
* Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1:
- *
+ *
* The host should send Keep Alive commands at half of the Keep Alive Timeout
* accounting for transport roundtrip times [..].
*/
@@ -1267,10 +1312,9 @@ static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
blk_status_t status)
{
struct nvme_ctrl *ctrl = rq->end_io_data;
- unsigned long flags;
- bool startka = false;
unsigned long rtt = jiffies - (rq->deadline - rq->timeout);
unsigned long delay = nvme_keep_alive_work_period(ctrl);
+ enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
/*
* Subtract off the keepalive RTT so nvme_keep_alive_work runs
@@ -1295,12 +1339,7 @@ static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
ctrl->ka_last_check_time = jiffies;
ctrl->comp_seen = false;
- spin_lock_irqsave(&ctrl->lock, flags);
- if (ctrl->state == NVME_CTRL_LIVE ||
- ctrl->state == NVME_CTRL_CONNECTING)
- startka = true;
- spin_unlock_irqrestore(&ctrl->lock, flags);
- if (startka)
+ if (state == NVME_CTRL_LIVE || state == NVME_CTRL_CONNECTING)
queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
return RQ_END_IO_NONE;
}
@@ -1370,17 +1409,30 @@ static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
nvme_start_keep_alive(ctrl);
}
-/*
- * In NVMe 1.0 the CNS field was just a binary controller or namespace
- * flag, thus sending any new CNS opcodes has a big chance of not working.
- * Qemu unfortunately had that bug after reporting a 1.1 version compliance
- * (but not for any later version).
- */
-static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl)
+static bool nvme_id_cns_ok(struct nvme_ctrl *ctrl, u8 cns)
{
- if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)
- return ctrl->vs < NVME_VS(1, 2, 0);
- return ctrl->vs < NVME_VS(1, 1, 0);
+ /*
+ * The CNS field occupies a full byte starting with NVMe 1.2
+ */
+ if (ctrl->vs >= NVME_VS(1, 2, 0))
+ return true;
+
+ /*
+ * NVMe 1.1 expanded the CNS value to two bits, which means values
+ * larger than that could get truncated and treated as an incorrect
+ * value.
+ *
+ * Qemu implemented 1.0 behavior for controllers claiming 1.1
+ * compliance, so they need to be quirked here.
+ */
+ if (ctrl->vs >= NVME_VS(1, 1, 0) &&
+ !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS))
+ return cns <= 3;
+
+ /*
+ * NVMe 1.0 used a single bit for the CNS value.
+ */
+ return cns <= 1;
}
static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
@@ -1398,8 +1450,10 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
sizeof(struct nvme_id_ctrl));
- if (error)
+ if (error) {
kfree(*id);
+ *id = NULL;
+ }
return error;
}
@@ -1528,6 +1582,7 @@ int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
if (error) {
dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
kfree(*id);
+ *id = NULL;
}
return error;
}
@@ -1592,6 +1647,8 @@ static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
info->is_ready = id->nstat & NVME_NSTAT_NRDY;
+ info->is_rotational = id->nsfeat & NVME_NS_ROTATIONAL;
+ info->no_vwc = id->nsfeat & NVME_NS_VWC_NOT_PRESENT;
}
kfree(id);
return ret;
@@ -1641,7 +1698,13 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
&result);
- if (status < 0)
+
+ /*
+ * It's either a kernel error or the host observed a connection
+ * lost. In either case it's not possible communicate with the
+ * controller and thus enter the error code path.
+ */
+ if (status < 0 || status == NVME_SC_HOST_PATH_ERROR)
return status;
/*
@@ -1727,27 +1790,38 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return 0;
}
-#ifdef CONFIG_BLK_DEV_INTEGRITY
-static void nvme_init_integrity(struct gendisk *disk,
- struct nvme_ns_head *head, u32 max_integrity_segments)
+static bool nvme_init_integrity(struct nvme_ns_head *head,
+ struct queue_limits *lim, struct nvme_ns_info *info)
{
- struct blk_integrity integrity = { };
+ struct blk_integrity *bi = &lim->integrity;
+
+ memset(bi, 0, sizeof(*bi));
+
+ if (!head->ms)
+ return true;
+
+ /*
+ * PI can always be supported as we can ask the controller to simply
+ * insert/strip it, which is not possible for other kinds of metadata.
+ */
+ if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ||
+ !(head->features & NVME_NS_METADATA_SUPPORTED))
+ return nvme_ns_has_pi(head);
switch (head->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
switch (head->guard_type) {
case NVME_NVM_NS_16B_GUARD:
- integrity.profile = &t10_pi_type3_crc;
- integrity.tag_size = sizeof(u16) + sizeof(u32);
- integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
+ bi->csum_type = BLK_INTEGRITY_CSUM_CRC;
+ bi->tag_size = sizeof(u16) + sizeof(u32);
+ bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
break;
case NVME_NVM_NS_64B_GUARD:
- integrity.profile = &ext_pi_type3_crc64;
- integrity.tag_size = sizeof(u16) + 6;
- integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
+ bi->csum_type = BLK_INTEGRITY_CSUM_CRC64;
+ bi->tag_size = sizeof(u16) + 6;
+ bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
break;
default:
- integrity.profile = NULL;
break;
}
break;
@@ -1755,73 +1829,48 @@ static void nvme_init_integrity(struct gendisk *disk,
case NVME_NS_DPS_PI_TYPE2:
switch (head->guard_type) {
case NVME_NVM_NS_16B_GUARD:
- integrity.profile = &t10_pi_type1_crc;
- integrity.tag_size = sizeof(u16);
- integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
+ bi->csum_type = BLK_INTEGRITY_CSUM_CRC;
+ bi->tag_size = sizeof(u16);
+ bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE |
+ BLK_INTEGRITY_REF_TAG;
break;
case NVME_NVM_NS_64B_GUARD:
- integrity.profile = &ext_pi_type1_crc64;
- integrity.tag_size = sizeof(u16);
- integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
+ bi->csum_type = BLK_INTEGRITY_CSUM_CRC64;
+ bi->tag_size = sizeof(u16);
+ bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE |
+ BLK_INTEGRITY_REF_TAG;
break;
default:
- integrity.profile = NULL;
break;
}
break;
default:
- integrity.profile = NULL;
break;
}
- integrity.tuple_size = head->ms;
- blk_integrity_register(disk, &integrity);
- blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
-}
-#else
-static void nvme_init_integrity(struct gendisk *disk,
- struct nvme_ns_head *head, u32 max_integrity_segments)
-{
+ bi->tuple_size = head->ms;
+ bi->pi_offset = info->pi_offset;
+ return true;
}
-#endif /* CONFIG_BLK_DEV_INTEGRITY */
-static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
- struct nvme_ns_head *head)
+static void nvme_config_discard(struct nvme_ns *ns, struct queue_limits *lim)
{
- struct request_queue *queue = disk->queue;
- u32 max_discard_sectors;
-
- if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX)) {
- max_discard_sectors = nvme_lba_to_sect(head, ctrl->dmrsl);
- } else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
- max_discard_sectors = UINT_MAX;
- } else {
- blk_queue_max_discard_sectors(queue, 0);
- return;
- }
+ struct nvme_ctrl *ctrl = ns->ctrl;
- BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
- NVME_DSM_MAX_RANGES);
+ if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX))
+ lim->max_hw_discard_sectors =
+ nvme_lba_to_sect(ns->head, ctrl->dmrsl);
+ else if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
+ lim->max_hw_discard_sectors = UINT_MAX;
+ else
+ lim->max_hw_discard_sectors = 0;
- /*
- * If discard is already enabled, don't reset queue limits.
- *
- * This works around the fact that the block layer can't cope well with
- * updating the hardware limits when overridden through sysfs. This is
- * harmless because discard limits in NVMe are purely advisory.
- */
- if (queue->limits.max_discard_sectors)
- return;
+ lim->discard_granularity = lim->logical_block_size;
- blk_queue_max_discard_sectors(queue, max_discard_sectors);
if (ctrl->dmrl)
- blk_queue_max_discard_segments(queue, ctrl->dmrl);
+ lim->max_discard_segments = ctrl->dmrl;
else
- blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
- queue->limits.discard_granularity = queue_logical_block_size(queue);
-
- if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
- blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
+ lim->max_discard_segments = NVME_DSM_MAX_RANGES;
}
static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
@@ -1832,44 +1881,46 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
a->csi == b->csi;
}
-static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head,
- struct nvme_id_ns *id)
+static int nvme_identify_ns_nvm(struct nvme_ctrl *ctrl, unsigned int nsid,
+ struct nvme_id_ns_nvm **nvmp)
{
- bool first = id->dps & NVME_NS_DPS_PI_FIRST;
- unsigned lbaf = nvme_lbaf_index(id->flbas);
- struct nvme_command c = { };
+ struct nvme_command c = {
+ .identify.opcode = nvme_admin_identify,
+ .identify.nsid = cpu_to_le32(nsid),
+ .identify.cns = NVME_ID_CNS_CS_NS,
+ .identify.csi = NVME_CSI_NVM,
+ };
struct nvme_id_ns_nvm *nvm;
- int ret = 0;
- u32 elbaf;
-
- head->pi_size = 0;
- head->ms = le16_to_cpu(id->lbaf[lbaf].ms);
- if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
- head->pi_size = sizeof(struct t10_pi_tuple);
- head->guard_type = NVME_NVM_NS_16B_GUARD;
- goto set_pi;
- }
+ int ret;
nvm = kzalloc(sizeof(*nvm), GFP_KERNEL);
if (!nvm)
return -ENOMEM;
- c.identify.opcode = nvme_admin_identify;
- c.identify.nsid = cpu_to_le32(head->ns_id);
- c.identify.cns = NVME_ID_CNS_CS_NS;
- c.identify.csi = NVME_CSI_NVM;
-
ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm));
if (ret)
- goto free_data;
+ kfree(nvm);
+ else
+ *nvmp = nvm;
+ return ret;
+}
- elbaf = le32_to_cpu(nvm->elbaf[lbaf]);
+static void nvme_configure_pi_elbas(struct nvme_ns_head *head,
+ struct nvme_id_ns *id, struct nvme_id_ns_nvm *nvm)
+{
+ u32 elbaf = le32_to_cpu(nvm->elbaf[nvme_lbaf_index(id->flbas)]);
+ u8 guard_type;
/* no support for storage tag formats right now */
if (nvme_elbaf_sts(elbaf))
- goto free_data;
+ return;
+
+ guard_type = nvme_elbaf_guard_type(elbaf);
+ if ((nvm->pic & NVME_ID_NS_NVM_QPIFS) &&
+ guard_type == NVME_NVM_NS_QTYPE_GUARD)
+ guard_type = nvme_elbaf_qualified_guard_type(elbaf);
- head->guard_type = nvme_elbaf_guard_type(elbaf);
+ head->guard_type = guard_type;
switch (head->guard_type) {
case NVME_NVM_NS_64B_GUARD:
head->pi_size = sizeof(struct crc64_pi_tuple);
@@ -1880,30 +1931,34 @@ static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head,
default:
break;
}
-
-free_data:
- kfree(nvm);
-set_pi:
- if (head->pi_size && (first || head->ms == head->pi_size))
- head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
- else
- head->pi_type = 0;
-
- return ret;
}
-static int nvme_configure_metadata(struct nvme_ctrl *ctrl,
- struct nvme_ns_head *head, struct nvme_id_ns *id)
+static void nvme_configure_metadata(struct nvme_ctrl *ctrl,
+ struct nvme_ns_head *head, struct nvme_id_ns *id,
+ struct nvme_id_ns_nvm *nvm, struct nvme_ns_info *info)
{
- int ret;
-
- ret = nvme_init_ms(ctrl, head, id);
- if (ret)
- return ret;
-
head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
+ head->pi_type = 0;
+ head->pi_size = 0;
+ head->ms = le16_to_cpu(id->lbaf[nvme_lbaf_index(id->flbas)].ms);
if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
- return 0;
+ return;
+
+ if (nvm && (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
+ nvme_configure_pi_elbas(head, id, nvm);
+ } else {
+ head->pi_size = sizeof(struct t10_pi_tuple);
+ head->guard_type = NVME_NVM_NS_16B_GUARD;
+ }
+
+ if (head->pi_size && head->ms >= head->pi_size)
+ head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+ if (!(id->dps & NVME_NS_DPS_PI_FIRST)) {
+ if (disable_pi_offsets)
+ head->pi_type = 0;
+ else
+ info->pi_offset = head->ms - head->pi_size;
+ }
if (ctrl->ops->flags & NVME_F_FABRICS) {
/*
@@ -1912,7 +1967,7 @@ static int nvme_configure_metadata(struct nvme_ctrl *ctrl,
* remap the separate metadata buffer from the block layer.
*/
if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
- return 0;
+ return;
head->features |= NVME_NS_EXT_LBAS;
@@ -1939,46 +1994,61 @@ static int nvme_configure_metadata(struct nvme_ctrl *ctrl,
else
head->features |= NVME_NS_METADATA_SUPPORTED;
}
- return 0;
}
-static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
- struct request_queue *q)
-{
- bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT;
- if (ctrl->max_hw_sectors) {
- u32 max_segments =
- (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
+static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns,
+ struct nvme_id_ns *id, struct queue_limits *lim,
+ u32 bs, u32 atomic_bs)
+{
+ unsigned int boundary = 0;
- max_segments = min_not_zero(max_segments, ctrl->max_segments);
- blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
- blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
+ if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) {
+ if (le16_to_cpu(id->nabspf))
+ boundary = (le16_to_cpu(id->nabspf) + 1) * bs;
}
- blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
- blk_queue_dma_alignment(q, 3);
- blk_queue_write_cache(q, vwc, vwc);
+ lim->atomic_write_hw_max = atomic_bs;
+ lim->atomic_write_hw_boundary = boundary;
+ lim->atomic_write_hw_unit_min = bs;
+ lim->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs);
+ lim->features |= BLK_FEAT_ATOMIC_WRITES;
+}
+
+static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
+{
+ return ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> SECTOR_SHIFT) + 1;
+}
+
+static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl,
+ struct queue_limits *lim)
+{
+ lim->max_hw_sectors = ctrl->max_hw_sectors;
+ lim->max_segments = min_t(u32, USHRT_MAX,
+ min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments));
+ lim->max_integrity_segments = ctrl->max_integrity_segments;
+ lim->virt_boundary_mask = NVME_CTRL_PAGE_SIZE - 1;
+ lim->max_segment_size = UINT_MAX;
+ lim->dma_alignment = 3;
}
-static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
- struct nvme_ns_head *head, struct nvme_id_ns *id)
+static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
+ struct queue_limits *lim)
{
- sector_t capacity = nvme_lba_to_sect(head, le64_to_cpu(id->nsze));
+ struct nvme_ns_head *head = ns->head;
u32 bs = 1U << head->lba_shift;
u32 atomic_bs, phys_bs, io_opt = 0;
+ bool valid = true;
/*
* The block layer can't support LBA sizes larger than the page size
* or smaller than a sector size yet, so catch this early and don't
* allow block I/O.
*/
- if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) {
- capacity = 0;
+ if (blk_validate_block_size(bs)) {
bs = (1 << 9);
+ valid = false;
}
- blk_integrity_unregister(disk);
-
atomic_bs = phys_bs = bs;
if (id->nabo == 0) {
/*
@@ -1989,46 +2059,34 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
else
- atomic_bs = (1 + ctrl->subsys->awupf) * bs;
+ atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
+
+ nvme_update_atomic_write_disk_info(ns, id, lim, bs, atomic_bs);
}
if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
/* NPWG = Namespace Preferred Write Granularity */
phys_bs = bs * (1 + le16_to_cpu(id->npwg));
/* NOWS = Namespace Optimal Write Size */
- io_opt = bs * (1 + le16_to_cpu(id->nows));
+ if (id->nows)
+ io_opt = bs * (1 + le16_to_cpu(id->nows));
}
- blk_queue_logical_block_size(disk->queue, bs);
/*
* Linux filesystems assume writing a single physical block is
* an atomic operation. Hence limit the physical block size to the
* value of the Atomic Write Unit Power Fail parameter.
*/
- blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
- blk_queue_io_min(disk->queue, phys_bs);
- blk_queue_io_opt(disk->queue, io_opt);
-
- /*
- * Register a metadata profile for PI, or the plain non-integrity NVMe
- * metadata masquerading as Type 0 if supported, otherwise reject block
- * I/O to namespaces with metadata except when the namespace supports
- * PI, as it can strip/insert in that case.
- */
- if (head->ms) {
- if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
- (head->features & NVME_NS_METADATA_SUPPORTED))
- nvme_init_integrity(disk, head,
- ctrl->max_integrity_segments);
- else if (!nvme_ns_has_pi(head))
- capacity = 0;
- }
-
- set_capacity_and_notify(disk, capacity);
-
- nvme_config_discard(ctrl, disk, head);
- blk_queue_max_write_zeroes_sectors(disk->queue,
- ctrl->max_zeroes_sectors);
+ lim->logical_block_size = bs;
+ lim->physical_block_size = min(phys_bs, atomic_bs);
+ lim->io_min = phys_bs;
+ lim->io_opt = io_opt;
+ if ((ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) &&
+ (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM))
+ lim->max_write_zeroes_sectors = UINT_MAX;
+ else
+ lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors;
+ return valid;
}
static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
@@ -2042,7 +2100,8 @@ static inline bool nvme_first_scan(struct gendisk *disk)
return !disk_live(disk);
}
-static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
+static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id,
+ struct queue_limits *lim)
{
struct nvme_ctrl *ctrl = ns->ctrl;
u32 iob;
@@ -2070,38 +2129,39 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
return;
}
- blk_queue_chunk_sectors(ns->queue, iob);
+ lim->chunk_sectors = iob;
}
static int nvme_update_ns_info_generic(struct nvme_ns *ns,
struct nvme_ns_info *info)
{
- blk_mq_freeze_queue(ns->disk->queue);
- nvme_set_queue_limits(ns->ctrl, ns->queue);
- set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
- blk_mq_unfreeze_queue(ns->disk->queue);
+ struct queue_limits lim;
+ unsigned int memflags;
+ int ret;
- if (nvme_ns_head_multipath(ns->head)) {
- blk_mq_freeze_queue(ns->head->disk->queue);
- set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
- nvme_mpath_revalidate_paths(ns);
- blk_stack_limits(&ns->head->disk->queue->limits,
- &ns->queue->limits, 0);
- ns->head->disk->flags |= GENHD_FL_HIDDEN;
- blk_mq_unfreeze_queue(ns->head->disk->queue);
- }
+ lim = queue_limits_start_update(ns->disk->queue);
+ nvme_set_ctrl_limits(ns->ctrl, &lim);
- /* Hide the block-interface for these devices */
- ns->disk->flags |= GENHD_FL_HIDDEN;
- set_bit(NVME_NS_READY, &ns->flags);
+ memflags = blk_mq_freeze_queue(ns->disk->queue);
+ ret = queue_limits_commit_update(ns->disk->queue, &lim);
+ set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
+ blk_mq_unfreeze_queue(ns->disk->queue, memflags);
- return 0;
+ /* Hide the block-interface for these devices */
+ if (!ret)
+ ret = -ENODEV;
+ return ret;
}
static int nvme_update_ns_info_block(struct nvme_ns *ns,
struct nvme_ns_info *info)
{
+ struct queue_limits lim;
+ struct nvme_id_ns_nvm *nvm = NULL;
+ struct nvme_zone_info zi = {};
struct nvme_id_ns *id;
+ unsigned int memflags;
+ sector_t capacity;
unsigned lbaf;
int ret;
@@ -2112,31 +2172,64 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
if (id->ncap == 0) {
/* namespace not allocated or attached */
info->is_removed = true;
- ret = -ENODEV;
- goto error;
+ ret = -ENXIO;
+ goto out;
}
-
- blk_mq_freeze_queue(ns->disk->queue);
lbaf = nvme_lbaf_index(id->flbas);
+
+ if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) {
+ ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm);
+ if (ret < 0)
+ goto out;
+ }
+
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ ns->head->ids.csi == NVME_CSI_ZNS) {
+ ret = nvme_query_zone_info(ns, lbaf, &zi);
+ if (ret < 0)
+ goto out;
+ }
+
+ lim = queue_limits_start_update(ns->disk->queue);
+
+ memflags = blk_mq_freeze_queue(ns->disk->queue);
ns->head->lba_shift = id->lbaf[lbaf].ds;
ns->head->nuse = le64_to_cpu(id->nuse);
- nvme_set_queue_limits(ns->ctrl, ns->queue);
+ capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
+ nvme_set_ctrl_limits(ns->ctrl, &lim);
+ nvme_configure_metadata(ns->ctrl, ns->head, id, nvm, info);
+ nvme_set_chunk_sectors(ns, id, &lim);
+ if (!nvme_update_disk_info(ns, id, &lim))
+ capacity = 0;
+ nvme_config_discard(ns, &lim);
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ ns->head->ids.csi == NVME_CSI_ZNS)
+ nvme_update_zone_info(ns, &lim, &zi);
- ret = nvme_configure_metadata(ns->ctrl, ns->head, id);
- if (ret < 0) {
- blk_mq_unfreeze_queue(ns->disk->queue);
+ if ((ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT) && !info->no_vwc)
+ lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
+ else
+ lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA);
+
+ if (info->is_rotational)
+ lim.features |= BLK_FEAT_ROTATIONAL;
+
+ /*
+ * Register a metadata profile for PI, or the plain non-integrity NVMe
+ * metadata masquerading as Type 0 if supported, otherwise reject block
+ * I/O to namespaces with metadata except when the namespace supports
+ * PI, as it can strip/insert in that case.
+ */
+ if (!nvme_init_integrity(ns->head, &lim, info))
+ capacity = 0;
+
+ ret = queue_limits_commit_update(ns->disk->queue, &lim);
+ if (ret) {
+ blk_mq_unfreeze_queue(ns->disk->queue, memflags);
goto out;
}
- nvme_set_chunk_sectors(ns, id);
- nvme_update_disk_info(ns->ctrl, ns->disk, ns->head, id);
- if (ns->head->ids.csi == NVME_CSI_ZNS) {
- ret = nvme_update_zone_info(ns, lbaf);
- if (ret) {
- blk_mq_unfreeze_queue(ns->disk->queue);
- goto out;
- }
- }
+ set_capacity_and_notify(ns->disk, capacity);
/*
* Only set the DEAC bit if the device guarantees that reads from
@@ -2148,61 +2241,127 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
ns->head->features |= NVME_NS_DEAC;
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
set_bit(NVME_NS_READY, &ns->flags);
- blk_mq_unfreeze_queue(ns->disk->queue);
+ blk_mq_unfreeze_queue(ns->disk->queue, memflags);
if (blk_queue_is_zoned(ns->queue)) {
- ret = nvme_revalidate_zones(ns);
+ ret = blk_revalidate_disk_zones(ns->disk);
if (ret && !nvme_first_scan(ns->disk))
goto out;
}
- if (nvme_ns_head_multipath(ns->head)) {
- blk_mq_freeze_queue(ns->head->disk->queue);
- nvme_update_disk_info(ns->ctrl, ns->head->disk, ns->head, id);
- set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
- nvme_mpath_revalidate_paths(ns);
- blk_stack_limits(&ns->head->disk->queue->limits,
- &ns->queue->limits, 0);
- disk_update_readahead(ns->head->disk);
- blk_mq_unfreeze_queue(ns->head->disk->queue);
- }
-
ret = 0;
out:
- /*
- * If probing fails due an unsupported feature, hide the block device,
- * but still allow other access.
- */
- if (ret == -ENODEV) {
- ns->disk->flags |= GENHD_FL_HIDDEN;
- set_bit(NVME_NS_READY, &ns->flags);
- ret = 0;
- }
-
-error:
+ kfree(nvm);
kfree(id);
return ret;
}
static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
{
+ bool unsupported = false;
+ int ret;
+
switch (info->ids.csi) {
case NVME_CSI_ZNS:
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
dev_info(ns->ctrl->device,
"block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
info->nsid);
- return nvme_update_ns_info_generic(ns, info);
+ ret = nvme_update_ns_info_generic(ns, info);
+ break;
}
- return nvme_update_ns_info_block(ns, info);
+ ret = nvme_update_ns_info_block(ns, info);
+ break;
case NVME_CSI_NVM:
- return nvme_update_ns_info_block(ns, info);
+ ret = nvme_update_ns_info_block(ns, info);
+ break;
default:
dev_info(ns->ctrl->device,
"block device for nsid %u not supported (csi %u)\n",
info->nsid, info->ids.csi);
- return nvme_update_ns_info_generic(ns, info);
+ ret = nvme_update_ns_info_generic(ns, info);
+ break;
+ }
+
+ /*
+ * If probing fails due an unsupported feature, hide the block device,
+ * but still allow other access.
+ */
+ if (ret == -ENODEV) {
+ ns->disk->flags |= GENHD_FL_HIDDEN;
+ set_bit(NVME_NS_READY, &ns->flags);
+ unsupported = true;
+ ret = 0;
+ }
+
+ if (!ret && nvme_ns_head_multipath(ns->head)) {
+ struct queue_limits *ns_lim = &ns->disk->queue->limits;
+ struct queue_limits lim;
+ unsigned int memflags;
+
+ lim = queue_limits_start_update(ns->head->disk->queue);
+ memflags = blk_mq_freeze_queue(ns->head->disk->queue);
+ /*
+ * queue_limits mixes values that are the hardware limitations
+ * for bio splitting with what is the device configuration.
+ *
+ * For NVMe the device configuration can change after e.g. a
+ * Format command, and we really want to pick up the new format
+ * value here. But we must still stack the queue limits to the
+ * least common denominator for multipathing to split the bios
+ * properly.
+ *
+ * To work around this, we explicitly set the device
+ * configuration to those that we just queried, but only stack
+ * the splitting limits in to make sure we still obey possibly
+ * lower limitations of other controllers.
+ */
+ lim.logical_block_size = ns_lim->logical_block_size;
+ lim.physical_block_size = ns_lim->physical_block_size;
+ lim.io_min = ns_lim->io_min;
+ lim.io_opt = ns_lim->io_opt;
+ queue_limits_stack_bdev(&lim, ns->disk->part0, 0,
+ ns->head->disk->disk_name);
+ if (unsupported)
+ ns->head->disk->flags |= GENHD_FL_HIDDEN;
+ else
+ nvme_init_integrity(ns->head, &lim, info);
+ ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
+
+ set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));
+ set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
+ nvme_mpath_revalidate_paths(ns);
+
+ blk_mq_unfreeze_queue(ns->head->disk->queue, memflags);
+ }
+
+ return ret;
+}
+
+int nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[16],
+ enum blk_unique_id type)
+{
+ struct nvme_ns_ids *ids = &ns->head->ids;
+
+ if (type != BLK_UID_EUI64)
+ return -EINVAL;
+
+ if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) {
+ memcpy(id, &ids->nguid, sizeof(ids->nguid));
+ return sizeof(ids->nguid);
+ }
+ if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) {
+ memcpy(id, &ids->eui64, sizeof(ids->eui64));
+ return sizeof(ids->eui64);
}
+
+ return -EINVAL;
+}
+
+static int nvme_get_unique_id(struct gendisk *disk, u8 id[16],
+ enum blk_unique_id type)
+{
+ return nvme_ns_get_unique_id(disk->private_data, id, type);
}
#ifdef CONFIG_BLK_SED_OPAL
@@ -2260,6 +2419,7 @@ const struct block_device_operations nvme_bdev_ops = {
.open = nvme_open,
.release = nvme_release,
.getgeo = nvme_getgeo,
+ .get_unique_id = nvme_get_unique_id,
.report_zones = nvme_report_zones,
.pr_ops = &nvme_pr_ops,
};
@@ -2342,8 +2502,13 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
else
ctrl->ctrl_config = NVME_CC_CSS_NVM;
- if (ctrl->cap & NVME_CAP_CRMS_CRWMS && ctrl->cap & NVME_CAP_CRMS_CRIMS)
- ctrl->ctrl_config |= NVME_CC_CRIME;
+ /*
+ * Setting CRIME results in CSTS.RDY before the media is ready. This
+ * makes it possible for media related commands to return the error
+ * NVME_SC_ADMIN_COMMAND_MEDIA_NOT_READY. Until the driver is
+ * restructured to handle retries, disable CC.CRIME.
+ */
+ ctrl->ctrl_config &= ~NVME_CC_CRIME;
ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
@@ -2352,11 +2517,6 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
if (ret)
return ret;
- /* Flush write to device (required if transport is PCI) */
- ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CC, &ctrl->ctrl_config);
- if (ret)
- return ret;
-
/* CAP value may change after initial CC write */
ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
if (ret)
@@ -2378,10 +2538,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
* devices are known to get this wrong. Use the larger of the
* two values.
*/
- if (ctrl->ctrl_config & NVME_CC_CRIME)
- ready_timeout = NVME_CRTO_CRIMT(crto);
- else
- ready_timeout = NVME_CRTO_CRWMT(crto);
+ ready_timeout = NVME_CRTO_CRWMT(crto);
if (ready_timeout < timeout)
dev_warn_once(ctrl->device, "bad crto:%x cap:%llx\n",
@@ -2877,7 +3034,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
subsys->awupf = le16_to_cpu(id->awupf);
nvme_mpath_default_iopolicy(subsys);
- subsys->dev.class = nvme_subsys_class;
+ subsys->dev.class = &nvme_subsys_class;
subsys->dev.release = nvme_release_subsystem;
subsys->dev.groups = nvme_subsys_attrs_groups;
dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
@@ -2949,7 +3106,7 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
struct nvme_effects_log **log)
{
- struct nvme_effects_log *cel = xa_load(&ctrl->cels, csi);
+ struct nvme_effects_log *old, *cel = xa_load(&ctrl->cels, csi);
int ret;
if (cel)
@@ -2966,7 +3123,11 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
return ret;
}
- xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
+ old = xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
+ if (xa_is_err(old)) {
+ kfree(cel);
+ return xa_err(old);
+ }
out:
*log = cel;
return 0;
@@ -3000,7 +3161,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
ctrl->max_zeroes_sectors = 0;
if (ctrl->subsys->subtype != NVME_NQN_NVME ||
- nvme_ctrl_limited_cns(ctrl) ||
+ !nvme_id_cns_ok(ctrl, NVME_ID_CNS_CS_CTRL) ||
test_bit(NVME_CTRL_SKIP_ID_CNS_CS, &ctrl->flags))
return 0;
@@ -3028,6 +3189,25 @@ free_data:
return ret;
}
+static int nvme_init_effects_log(struct nvme_ctrl *ctrl,
+ u8 csi, struct nvme_effects_log **log)
+{
+ struct nvme_effects_log *effects, *old;
+
+ effects = kzalloc(sizeof(*effects), GFP_KERNEL);
+ if (!effects)
+ return -ENOMEM;
+
+ old = xa_store(&ctrl->cels, csi, effects, GFP_KERNEL);
+ if (xa_is_err(old)) {
+ kfree(effects);
+ return xa_err(old);
+ }
+
+ *log = effects;
+ return 0;
+}
+
static void nvme_init_known_nvm_effects(struct nvme_ctrl *ctrl)
{
struct nvme_effects_log *log = ctrl->effects;
@@ -3074,10 +3254,9 @@ static int nvme_init_effects(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
}
if (!ctrl->effects) {
- ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
- if (!ctrl->effects)
- return -ENOMEM;
- xa_store(&ctrl->cels, NVME_CSI_NVM, ctrl->effects, GFP_KERNEL);
+ ret = nvme_init_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
+ if (ret < 0)
+ return ret;
}
nvme_init_known_nvm_effects(ctrl);
@@ -3117,11 +3296,18 @@ static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ct
return -EINVAL;
}
+ if (!ctrl->maxcmd) {
+ dev_warn(ctrl->device,
+ "Firmware bug: maximum outstanding commands is 0\n");
+ ctrl->maxcmd = ctrl->sqsize + 1;
+ }
+
return 0;
}
static int nvme_init_identify(struct nvme_ctrl *ctrl)
{
+ struct queue_limits lim;
struct nvme_id_ctrl *id;
u32 max_hw_sectors;
bool prev_apst_enabled;
@@ -3188,7 +3374,12 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->max_hw_sectors =
min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
- nvme_set_queue_limits(ctrl, ctrl->admin_q);
+ lim = queue_limits_start_update(ctrl->admin_q);
+ nvme_set_ctrl_limits(ctrl, &lim);
+ ret = queue_limits_commit_update(ctrl->admin_q, &lim);
+ if (ret)
+ goto out_free;
+
ctrl->sgls = le32_to_cpu(id->sgls);
ctrl->kas = le16_to_cpu(id->kas);
ctrl->max_namespaces = le32_to_cpu(id->mnan);
@@ -3206,7 +3397,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
if (ctrl->shutdown_timeout != shutdown_timeout)
dev_info(ctrl->device,
- "Shutdown timeout set to %u seconds\n",
+ "D3 entry latency set to %u seconds\n",
ctrl->shutdown_timeout);
} else
ctrl->shutdown_timeout = shutdown_timeout;
@@ -3420,7 +3611,7 @@ int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device,
if (minor < 0)
return minor;
cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor);
- cdev_device->class = nvme_ns_chr_class;
+ cdev_device->class = &nvme_ns_chr_class;
cdev_device->release = nvme_cdev_rel;
device_initialize(cdev_device);
cdev_init(cdev, fops);
@@ -3493,6 +3684,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
head->ns_id = info->nsid;
head->ids = info->ids;
head->shared = info->is_shared;
+ head->rotational = info->is_rotational;
ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
kref_init(&head->ref);
@@ -3630,7 +3822,7 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
"Found shared namespace %d, but multipathing not supported.\n",
info->nsid);
dev_warn_once(ctrl->device,
- "Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0\n.");
+ "Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0.\n");
}
}
@@ -3649,9 +3841,11 @@ out_unlock:
struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns *ns, *ret = NULL;
+ int srcu_idx;
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list) {
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
+ list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
+ srcu_read_lock_held(&ctrl->srcu)) {
if (ns->head->ns_id == nsid) {
if (!nvme_get_ns(ns))
continue;
@@ -3661,10 +3855,10 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
if (ns->head->ns_id > nsid)
break;
}
- up_read(&ctrl->namespaces_rwsem);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
return ret;
}
-EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
+EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, "NVME_TARGET_PASSTHRU");
/*
* Add the namespace to the controller list while keeping the list ordered.
@@ -3675,7 +3869,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
if (tmp->head->ns_id < ns->head->ns_id) {
- list_add(&ns->list, &tmp->list);
+ list_add_rcu(&ns->list, &tmp->list);
return;
}
}
@@ -3684,6 +3878,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
{
+ struct queue_limits lim = { };
struct nvme_ns *ns;
struct gendisk *disk;
int node = ctrl->numa_node;
@@ -3692,7 +3887,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
if (!ns)
return;
- disk = blk_mq_alloc_disk(ctrl->tagset, ns);
+ if (ctrl->opts && ctrl->opts->data_digest)
+ lim.features |= BLK_FEAT_STABLE_WRITES;
+ if (ctrl->ops->supports_pci_p2pdma &&
+ ctrl->ops->supports_pci_p2pdma(ctrl))
+ lim.features |= BLK_FEAT_PCI_P2PDMA;
+
+ disk = blk_mq_alloc_disk(ctrl->tagset, &lim, ns);
if (IS_ERR(disk))
goto out_free_ns;
disk->fops = &nvme_bdev_ops;
@@ -3700,15 +3901,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
ns->disk = disk;
ns->queue = disk->queue;
-
- if (ctrl->opts && ctrl->opts->data_digest)
- blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);
-
- blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
- if (ctrl->ops->supports_pci_p2pdma &&
- ctrl->ops->supports_pci_p2pdma(ctrl))
- blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
-
ns->ctrl = ctrl;
kref_init(&ns->kref);
@@ -3741,17 +3933,18 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
if (nvme_update_ns_info(ns, info))
goto out_unlink_ns;
- down_write(&ctrl->namespaces_rwsem);
+ mutex_lock(&ctrl->namespaces_lock);
/*
* Ensure that no namespaces are added to the ctrl list after the queues
* are frozen, thereby avoiding a deadlock between scan and reset.
*/
if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
- up_write(&ctrl->namespaces_rwsem);
+ mutex_unlock(&ctrl->namespaces_lock);
goto out_unlink_ns;
}
nvme_ns_add_to_ctrl_list(ns);
- up_write(&ctrl->namespaces_rwsem);
+ mutex_unlock(&ctrl->namespaces_lock);
+ synchronize_srcu(&ctrl->srcu);
nvme_get_ctrl(ctrl);
if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
@@ -3774,9 +3967,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
out_cleanup_ns_from_list:
nvme_put_ctrl(ctrl);
- down_write(&ctrl->namespaces_rwsem);
- list_del_init(&ns->list);
- up_write(&ctrl->namespaces_rwsem);
+ mutex_lock(&ctrl->namespaces_lock);
+ list_del_rcu(&ns->list);
+ mutex_unlock(&ctrl->namespaces_lock);
+ synchronize_srcu(&ctrl->srcu);
out_unlink_ns:
mutex_lock(&ctrl->subsys->lock);
list_del_rcu(&ns->siblings);
@@ -3826,9 +4020,10 @@ static void nvme_ns_remove(struct nvme_ns *ns)
nvme_cdev_del(&ns->cdev, &ns->cdev_device);
del_gendisk(ns->disk);
- down_write(&ns->ctrl->namespaces_rwsem);
- list_del_init(&ns->list);
- up_write(&ns->ctrl->namespaces_rwsem);
+ mutex_lock(&ns->ctrl->namespaces_lock);
+ list_del_rcu(&ns->list);
+ mutex_unlock(&ns->ctrl->namespaces_lock);
+ synchronize_srcu(&ns->ctrl->srcu);
if (last_path)
nvme_mpath_shutdown_disk(ns->head);
@@ -3847,7 +4042,7 @@ static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
{
- int ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
+ int ret = NVME_SC_INVALID_NS | NVME_STATUS_DNR;
if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) {
dev_err(ns->ctrl->device,
@@ -3863,7 +4058,7 @@ out:
*
* TODO: we should probably schedule a delayed retry here.
*/
- if (ret > 0 && (ret & NVME_SC_DNR))
+ if (ret > 0 && (ret & NVME_STATUS_DNR))
nvme_ns_remove(ns);
}
@@ -3871,7 +4066,7 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns_info info = { .nsid = nsid };
struct nvme_ns *ns;
- int ret;
+ int ret = 1;
if (nvme_identify_ns_descs(ctrl, &info))
return;
@@ -3888,9 +4083,10 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
* set up a namespace. If not fall back to the legacy version.
*/
if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) ||
- (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS))
+ (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS) ||
+ ctrl->vs >= NVME_VS(2, 0, 0))
ret = nvme_ns_info_from_id_cs_indep(ctrl, &info);
- else
+ if (ret > 0)
ret = nvme_ns_info_from_identify(ctrl, &info);
if (info.is_removed)
@@ -3912,22 +4108,53 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
}
}
+/**
+ * struct async_scan_info - keeps track of controller & NSIDs to scan
+ * @ctrl: Controller on which namespaces are being scanned
+ * @next_nsid: Index of next NSID to scan in ns_list
+ * @ns_list: Pointer to list of NSIDs to scan
+ *
+ * Note: There is a single async_scan_info structure shared by all instances
+ * of nvme_scan_ns_async() scanning a given controller, so the atomic
+ * operations on next_nsid are critical to ensure each instance scans a unique
+ * NSID.
+ */
+struct async_scan_info {
+ struct nvme_ctrl *ctrl;
+ atomic_t next_nsid;
+ __le32 *ns_list;
+};
+
+static void nvme_scan_ns_async(void *data, async_cookie_t cookie)
+{
+ struct async_scan_info *scan_info = data;
+ int idx;
+ u32 nsid;
+
+ idx = (u32)atomic_fetch_inc(&scan_info->next_nsid);
+ nsid = le32_to_cpu(scan_info->ns_list[idx]);
+
+ nvme_scan_ns(scan_info->ctrl, nsid);
+}
+
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
unsigned nsid)
{
struct nvme_ns *ns, *next;
LIST_HEAD(rm_list);
- down_write(&ctrl->namespaces_rwsem);
+ mutex_lock(&ctrl->namespaces_lock);
list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
- if (ns->head->ns_id > nsid)
- list_move_tail(&ns->list, &rm_list);
+ if (ns->head->ns_id > nsid) {
+ list_del_rcu(&ns->list);
+ synchronize_srcu(&ctrl->srcu);
+ list_add_tail_rcu(&ns->list, &rm_list);
+ }
}
- up_write(&ctrl->namespaces_rwsem);
+ mutex_unlock(&ctrl->namespaces_lock);
list_for_each_entry_safe(ns, next, &rm_list, list)
nvme_ns_remove(ns);
-
}
static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
@@ -3936,11 +4163,15 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
__le32 *ns_list;
u32 prev = 0;
int ret = 0, i;
+ ASYNC_DOMAIN(domain);
+ struct async_scan_info scan_info;
ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
if (!ns_list)
return -ENOMEM;
+ scan_info.ctrl = ctrl;
+ scan_info.ns_list = ns_list;
for (;;) {
struct nvme_command cmd = {
.identify.opcode = nvme_admin_identify,
@@ -3956,19 +4187,23 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
goto free;
}
+ atomic_set(&scan_info.next_nsid, 0);
for (i = 0; i < nr_entries; i++) {
u32 nsid = le32_to_cpu(ns_list[i]);
if (!nsid) /* end of the list? */
goto out;
- nvme_scan_ns(ctrl, nsid);
+ async_schedule_domain(nvme_scan_ns_async, &scan_info,
+ &domain);
while (++prev < nsid)
nvme_ns_remove_by_nsid(ctrl, prev);
}
+ async_synchronize_full_domain(&domain);
}
out:
nvme_remove_invalid_namespaces(ctrl, prev);
free:
+ async_synchronize_full_domain(&domain);
kfree(ns_list);
return ret;
}
@@ -4044,7 +4279,7 @@ static void nvme_scan_work(struct work_struct *work)
}
mutex_lock(&ctrl->scan_lock);
- if (nvme_ctrl_limited_cns(ctrl)) {
+ if (!nvme_id_cns_ok(ctrl, NVME_ID_CNS_NS_ACTIVE_LIST)) {
nvme_scan_ns_sequential(ctrl);
} else {
/*
@@ -4053,7 +4288,7 @@ static void nvme_scan_work(struct work_struct *work)
* they report) but don't actually support it.
*/
ret = nvme_scan_ns_list(ctrl);
- if (ret > 0 && ret & NVME_SC_DNR)
+ if (ret > 0 && ret & NVME_STATUS_DNR)
nvme_scan_ns_sequential(ctrl);
}
mutex_unlock(&ctrl->scan_lock);
@@ -4097,9 +4332,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
/* this is a no-op when called from the controller reset handler */
nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
- down_write(&ctrl->namespaces_rwsem);
- list_splice_init(&ctrl->namespaces, &ns_list);
- up_write(&ctrl->namespaces_rwsem);
+ mutex_lock(&ctrl->namespaces_lock);
+ list_splice_init_rcu(&ctrl->namespaces, &ns_list, synchronize_rcu);
+ mutex_unlock(&ctrl->namespaces_lock);
+ synchronize_srcu(&ctrl->srcu);
list_for_each_entry_safe(ns, next, &ns_list, list)
nvme_ns_remove(ns);
@@ -4306,7 +4542,8 @@ static bool nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
static void nvme_handle_aer_persistent_error(struct nvme_ctrl *ctrl)
{
- dev_warn(ctrl->device, "resetting controller due to AER\n");
+ dev_warn(ctrl->device,
+ "resetting controller due to persistent internal error\n");
nvme_reset_ctrl(ctrl);
}
@@ -4353,15 +4590,16 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event);
int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int cmd_size)
{
+ struct queue_limits lim = {};
int ret;
memset(set, 0, sizeof(*set));
set->ops = ops;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
if (ctrl->ops->flags & NVME_F_FABRICS)
- set->reserved_tags = NVMF_RESERVED_TAGS;
+ /* Reserved for fabric connect and keep alive */
+ set->reserved_tags = 2;
set->numa_node = ctrl->numa_node;
- set->flags = BLK_MQ_F_NO_SCHED;
if (ctrl->ops->flags & NVME_F_BLOCKING)
set->flags |= BLK_MQ_F_BLOCKING;
set->cmd_size = cmd_size;
@@ -4372,14 +4610,14 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
if (ret)
return ret;
- ctrl->admin_q = blk_mq_init_queue(set);
+ ctrl->admin_q = blk_mq_alloc_queue(set, &lim, NULL);
if (IS_ERR(ctrl->admin_q)) {
ret = PTR_ERR(ctrl->admin_q);
goto out_free_tagset;
}
if (ctrl->ops->flags & NVME_F_FABRICS) {
- ctrl->fabrics_q = blk_mq_init_queue(set);
+ ctrl->fabrics_q = blk_mq_alloc_queue(set, NULL, NULL);
if (IS_ERR(ctrl->fabrics_q)) {
ret = PTR_ERR(ctrl->fabrics_q);
goto out_cleanup_admin_q;
@@ -4402,6 +4640,11 @@ EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set);
void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl)
{
+ /*
+ * As we're about to destroy the queue and free tagset
+ * we can not have keep-alive work running.
+ */
+ nvme_stop_keep_alive(ctrl);
blk_mq_destroy_queue(ctrl->admin_q);
blk_put_queue(ctrl->admin_q);
if (ctrl->ops->flags & NVME_F_FABRICS) {
@@ -4428,12 +4671,12 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS)
set->reserved_tags = NVME_AQ_DEPTH;
else if (ctrl->ops->flags & NVME_F_FABRICS)
- set->reserved_tags = NVMF_RESERVED_TAGS;
+ /* Reserved for fabric connect */
+ set->reserved_tags = 1;
set->numa_node = ctrl->numa_node;
- set->flags = BLK_MQ_F_SHOULD_MERGE;
if (ctrl->ops->flags & NVME_F_BLOCKING)
set->flags |= BLK_MQ_F_BLOCKING;
- set->cmd_size = cmd_size,
+ set->cmd_size = cmd_size;
set->driver_data = ctrl;
set->nr_hw_queues = ctrl->queue_count - 1;
set->timeout = NVME_IO_TIMEOUT;
@@ -4443,13 +4686,15 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
return ret;
if (ctrl->ops->flags & NVME_F_FABRICS) {
- ctrl->connect_q = blk_mq_init_queue(set);
+ struct queue_limits lim = {
+ .features = BLK_FEAT_SKIP_TAGSET_QUIESCE,
+ };
+
+ ctrl->connect_q = blk_mq_alloc_queue(set, &lim, NULL);
if (IS_ERR(ctrl->connect_q)) {
ret = PTR_ERR(ctrl->connect_q);
goto out_free_tag_set;
}
- blk_queue_flag_set(QUEUE_FLAG_SKIP_TAGSET_QUIESCE,
- ctrl->connect_q);
}
ctrl->tagset = set;
@@ -4476,7 +4721,6 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
{
nvme_mpath_stop(ctrl);
nvme_auth_stop(ctrl);
- nvme_stop_keep_alive(ctrl);
nvme_stop_failfast_work(ctrl);
flush_work(&ctrl->async_event_work);
cancel_work_sync(&ctrl->fw_act_work);
@@ -4512,6 +4756,7 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl);
void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
{
+ nvme_stop_keep_alive(ctrl);
nvme_hwmon_exit(ctrl);
nvme_fault_inject_fini(&ctrl->fault_inject);
dev_pm_qos_hide_latency_tolerance(ctrl->device);
@@ -4541,9 +4786,9 @@ static void nvme_free_ctrl(struct device *dev)
if (!subsys || ctrl->instance != subsys->instance)
ida_free(&nvme_instance_ida, ctrl->instance);
- key_put(ctrl->tls_key);
nvme_free_cels(ctrl);
nvme_mpath_uninit(ctrl);
+ cleanup_srcu_struct(&ctrl->srcu);
nvme_auth_stop(ctrl);
nvme_auth_free(ctrl);
__free_page(ctrl->discard_page);
@@ -4566,6 +4811,9 @@ static void nvme_free_ctrl(struct device *dev)
* Initialize a NVMe controller structures. This needs to be called during
* earliest initialization so that we have the initialized structured around
* during probing.
+ *
+ * On success, the caller must use the nvme_put_ctrl() to release this when
+ * needed, which also invokes the ops->free_ctrl() callback.
*/
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
const struct nvme_ctrl_ops *ops, unsigned long quirks)
@@ -4576,10 +4824,15 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
ctrl->passthru_err_log_enabled = false;
clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
spin_lock_init(&ctrl->lock);
+ mutex_init(&ctrl->namespaces_lock);
+
+ ret = init_srcu_struct(&ctrl->srcu);
+ if (ret)
+ return ret;
+
mutex_init(&ctrl->scan_lock);
INIT_LIST_HEAD(&ctrl->namespaces);
xa_init(&ctrl->cels);
- init_rwsem(&ctrl->namespaces_rwsem);
ctrl->dev = dev;
ctrl->ops = ops;
ctrl->quirks = quirks;
@@ -4609,11 +4862,17 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
goto out;
ctrl->instance = ret;
+ ret = nvme_auth_init_ctrl(ctrl);
+ if (ret)
+ goto out_release_instance;
+
+ nvme_mpath_init_ctrl(ctrl);
+
device_initialize(&ctrl->ctrl_device);
ctrl->device = &ctrl->ctrl_device;
ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
ctrl->instance);
- ctrl->device->class = nvme_class;
+ ctrl->device->class = &nvme_class;
ctrl->device->parent = ctrl->dev;
if (ops->dev_attr_groups)
ctrl->device->groups = ops->dev_attr_groups;
@@ -4621,16 +4880,36 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
ctrl->device->groups = nvme_dev_attr_groups;
ctrl->device->release = nvme_free_ctrl;
dev_set_drvdata(ctrl->device, ctrl);
+
+ return ret;
+
+out_release_instance:
+ ida_free(&nvme_instance_ida, ctrl->instance);
+out:
+ if (ctrl->discard_page)
+ __free_page(ctrl->discard_page);
+ cleanup_srcu_struct(&ctrl->srcu);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_init_ctrl);
+
+/*
+ * On success, returns with an elevated controller reference and caller must
+ * use nvme_uninit_ctrl() to properly free resources associated with the ctrl.
+ */
+int nvme_add_ctrl(struct nvme_ctrl *ctrl)
+{
+ int ret;
+
ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
if (ret)
- goto out_release_instance;
+ return ret;
- nvme_get_ctrl(ctrl);
cdev_init(&ctrl->cdev, &nvme_dev_fops);
- ctrl->cdev.owner = ops->module;
+ ctrl->cdev.owner = ctrl->ops->module;
ret = cdev_device_add(&ctrl->cdev, ctrl->device);
if (ret)
- goto out_free_name;
+ return ret;
/*
* Initialize latency tolerance controls. The sysfs files won't
@@ -4641,48 +4920,36 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
min(default_ps_max_latency_us, (unsigned long)S32_MAX));
nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
- nvme_mpath_init_ctrl(ctrl);
- ret = nvme_auth_init_ctrl(ctrl);
- if (ret)
- goto out_free_cdev;
+ nvme_get_ctrl(ctrl);
return 0;
-out_free_cdev:
- nvme_fault_inject_fini(&ctrl->fault_inject);
- dev_pm_qos_hide_latency_tolerance(ctrl->device);
- cdev_device_del(&ctrl->cdev, ctrl->device);
-out_free_name:
- nvme_put_ctrl(ctrl);
- kfree_const(ctrl->device->kobj.name);
-out_release_instance:
- ida_free(&nvme_instance_ida, ctrl->instance);
-out:
- if (ctrl->discard_page)
- __free_page(ctrl->discard_page);
- return ret;
}
-EXPORT_SYMBOL_GPL(nvme_init_ctrl);
+EXPORT_SYMBOL_GPL(nvme_add_ctrl);
/* let I/O to all namespaces fail in preparation for surprise removal */
void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
+ int srcu_idx;
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list)
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
+ list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
+ srcu_read_lock_held(&ctrl->srcu))
blk_mark_disk_dead(ns->disk);
- up_read(&ctrl->namespaces_rwsem);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);
void nvme_unfreeze(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
+ int srcu_idx;
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list)
- blk_mq_unfreeze_queue(ns->queue);
- up_read(&ctrl->namespaces_rwsem);
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
+ list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
+ srcu_read_lock_held(&ctrl->srcu))
+ blk_mq_unfreeze_queue_non_owner(ns->queue);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
}
EXPORT_SYMBOL_GPL(nvme_unfreeze);
@@ -4690,14 +4957,16 @@ EXPORT_SYMBOL_GPL(nvme_unfreeze);
int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
{
struct nvme_ns *ns;
+ int srcu_idx;
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list) {
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
+ list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
+ srcu_read_lock_held(&ctrl->srcu)) {
timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
if (timeout <= 0)
break;
}
- up_read(&ctrl->namespaces_rwsem);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
return timeout;
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
@@ -4705,23 +4974,32 @@ EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
void nvme_wait_freeze(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
+ int srcu_idx;
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list)
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
+ list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
+ srcu_read_lock_held(&ctrl->srcu))
blk_mq_freeze_queue_wait(ns->queue);
- up_read(&ctrl->namespaces_rwsem);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze);
void nvme_start_freeze(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
+ int srcu_idx;
set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list)
- blk_freeze_queue_start(ns->queue);
- up_read(&ctrl->namespaces_rwsem);
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
+ list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
+ srcu_read_lock_held(&ctrl->srcu))
+ /*
+ * Typical non_owner use case is from pci driver, in which
+ * start_freeze is called from timeout work function, but
+ * unfreeze is done in reset work context
+ */
+ blk_freeze_queue_start_non_owner(ns->queue);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);
@@ -4764,11 +5042,13 @@ EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
+ int srcu_idx;
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list)
+ srcu_idx = srcu_read_lock(&ctrl->srcu);
+ list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
+ srcu_read_lock_held(&ctrl->srcu))
blk_sync_queue(ns->queue);
- up_read(&ctrl->namespaces_rwsem);
+ srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
@@ -4786,7 +5066,7 @@ struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
return NULL;
return file->private_data;
}
-EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU);
+EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, "NVME_TARGET_PASSTHRU");
/*
* Check we didn't inadvertently grow the command structure sizes:
@@ -4814,6 +5094,8 @@ static inline void _nvme_check_size(void)
BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE);
BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
+ BUILD_BUG_ON(sizeof(struct nvme_endurance_group_log) != 512);
+ BUILD_BUG_ON(sizeof(struct nvme_rotational_media_log) != 512);
BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
BUILD_BUG_ON(sizeof(struct nvme_feat_host_behavior) != 512);
@@ -4822,22 +5104,20 @@ static inline void _nvme_check_size(void)
static int __init nvme_core_init(void)
{
+ unsigned int wq_flags = WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS;
int result = -ENOMEM;
_nvme_check_size();
- nvme_wq = alloc_workqueue("nvme-wq",
- WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+ nvme_wq = alloc_workqueue("nvme-wq", wq_flags, 0);
if (!nvme_wq)
goto out;
- nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
- WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+ nvme_reset_wq = alloc_workqueue("nvme-reset-wq", wq_flags, 0);
if (!nvme_reset_wq)
goto destroy_wq;
- nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
- WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+ nvme_delete_wq = alloc_workqueue("nvme-delete-wq", wq_flags, 0);
if (!nvme_delete_wq)
goto destroy_reset_wq;
@@ -4846,42 +5126,36 @@ static int __init nvme_core_init(void)
if (result < 0)
goto destroy_delete_wq;
- nvme_class = class_create("nvme");
- if (IS_ERR(nvme_class)) {
- result = PTR_ERR(nvme_class);
+ result = class_register(&nvme_class);
+ if (result)
goto unregister_chrdev;
- }
- nvme_class->dev_uevent = nvme_class_uevent;
- nvme_subsys_class = class_create("nvme-subsystem");
- if (IS_ERR(nvme_subsys_class)) {
- result = PTR_ERR(nvme_subsys_class);
+ result = class_register(&nvme_subsys_class);
+ if (result)
goto destroy_class;
- }
result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS,
"nvme-generic");
if (result < 0)
goto destroy_subsys_class;
- nvme_ns_chr_class = class_create("nvme-generic");
- if (IS_ERR(nvme_ns_chr_class)) {
- result = PTR_ERR(nvme_ns_chr_class);
+ result = class_register(&nvme_ns_chr_class);
+ if (result)
goto unregister_generic_ns;
- }
+
result = nvme_init_auth();
if (result)
goto destroy_ns_chr;
return 0;
destroy_ns_chr:
- class_destroy(nvme_ns_chr_class);
+ class_unregister(&nvme_ns_chr_class);
unregister_generic_ns:
unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
destroy_subsys_class:
- class_destroy(nvme_subsys_class);
+ class_unregister(&nvme_subsys_class);
destroy_class:
- class_destroy(nvme_class);
+ class_unregister(&nvme_class);
unregister_chrdev:
unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
destroy_delete_wq:
@@ -4897,9 +5171,9 @@ out:
static void __exit nvme_core_exit(void)
{
nvme_exit_auth();
- class_destroy(nvme_ns_chr_class);
- class_destroy(nvme_subsys_class);
- class_destroy(nvme_class);
+ class_unregister(&nvme_ns_chr_class);
+ class_unregister(&nvme_subsys_class);
+ class_unregister(&nvme_class);
unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
destroy_workqueue(nvme_delete_wq);