diff options
Diffstat (limited to 'drivers/nvme/host/core.c')
-rw-r--r-- | drivers/nvme/host/core.c | 158 |
1 files changed, 100 insertions, 58 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ba6508455e18..1a8d32a4a5c3 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -22,7 +22,7 @@ #include <linux/nvme_ioctl.h> #include <linux/pm_qos.h> #include <linux/ratelimit.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include "nvme.h" #include "fabrics.h" @@ -42,6 +42,8 @@ struct nvme_ns_info { bool is_readonly; bool is_ready; bool is_removed; + bool is_rotational; + bool no_vwc; }; unsigned int admin_timeout = 60; @@ -92,6 +94,17 @@ MODULE_PARM_DESC(apst_secondary_latency_tol_us, "secondary APST latency tolerance in us"); /* + * Older kernels didn't enable protection information if it was at an offset. + * Newer kernels do, so it breaks reads on the upgrade if such formats were + * used in prior kernels since the metadata written did not contain a valid + * checksum. + */ +static bool disable_pi_offsets = false; +module_param(disable_pi_offsets, bool, 0444); +MODULE_PARM_DESC(disable_pi_offsets, + "disable protection information if it has an offset"); + +/* * nvme_wq - hosts nvme related works that are not reset or delete * nvme_reset_wq - hosts nvme reset works * nvme_delete_wq - hosts nvme delete works @@ -1292,14 +1305,12 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl) queue_delayed_work(nvme_wq, &ctrl->ka_work, delay); } -static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq, - blk_status_t status) +static void nvme_keep_alive_finish(struct request *rq, + blk_status_t status, struct nvme_ctrl *ctrl) { - struct nvme_ctrl *ctrl = rq->end_io_data; - unsigned long flags; - bool startka = false; unsigned long rtt = jiffies - (rq->deadline - rq->timeout); unsigned long delay = nvme_keep_alive_work_period(ctrl); + enum nvme_ctrl_state state = nvme_ctrl_state(ctrl); /* * Subtract off the keepalive RTT so nvme_keep_alive_work runs @@ -1313,25 +1324,17 @@ static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq, delay = 0; } - blk_mq_free_request(rq); - if (status) { dev_err(ctrl->device, "failed nvme_keep_alive_end_io error=%d\n", status); - return RQ_END_IO_NONE; + return; } ctrl->ka_last_check_time = jiffies; ctrl->comp_seen = false; - spin_lock_irqsave(&ctrl->lock, flags); - if (ctrl->state == NVME_CTRL_LIVE || - ctrl->state == NVME_CTRL_CONNECTING) - startka = true; - spin_unlock_irqrestore(&ctrl->lock, flags); - if (startka) + if (state == NVME_CTRL_LIVE || state == NVME_CTRL_CONNECTING) queue_delayed_work(nvme_wq, &ctrl->ka_work, delay); - return RQ_END_IO_NONE; } static void nvme_keep_alive_work(struct work_struct *work) @@ -1340,6 +1343,7 @@ static void nvme_keep_alive_work(struct work_struct *work) struct nvme_ctrl, ka_work); bool comp_seen = ctrl->comp_seen; struct request *rq; + blk_status_t status; ctrl->ka_last_check_time = jiffies; @@ -1362,9 +1366,9 @@ static void nvme_keep_alive_work(struct work_struct *work) nvme_init_request(rq, &ctrl->ka_cmd); rq->timeout = ctrl->kato * HZ; - rq->end_io = nvme_keep_alive_end_io; - rq->end_io_data = ctrl; - blk_execute_rq_nowait(rq, false); + status = blk_execute_rq(rq, false); + nvme_keep_alive_finish(rq, status, ctrl); + blk_mq_free_request(rq); } static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) @@ -1399,17 +1403,30 @@ static void nvme_update_keep_alive(struct nvme_ctrl *ctrl, nvme_start_keep_alive(ctrl); } -/* - * In NVMe 1.0 the CNS field was just a binary controller or namespace - * flag, thus sending any new CNS opcodes has a big chance of not working. - * Qemu unfortunately had that bug after reporting a 1.1 version compliance - * (but not for any later version). - */ -static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl) +static bool nvme_id_cns_ok(struct nvme_ctrl *ctrl, u8 cns) { - if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS) - return ctrl->vs < NVME_VS(1, 2, 0); - return ctrl->vs < NVME_VS(1, 1, 0); + /* + * The CNS field occupies a full byte starting with NVMe 1.2 + */ + if (ctrl->vs >= NVME_VS(1, 2, 0)) + return true; + + /* + * NVMe 1.1 expanded the CNS value to two bits, which means values + * larger than that could get truncated and treated as an incorrect + * value. + * + * Qemu implemented 1.0 behavior for controllers claiming 1.1 + * compliance, so they need to be quirked here. + */ + if (ctrl->vs >= NVME_VS(1, 1, 0) && + !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) + return cns <= 3; + + /* + * NVMe 1.0 used a single bit for the CNS value. + */ + return cns <= 1; } static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) @@ -1624,6 +1641,8 @@ static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl, info->is_shared = id->nmic & NVME_NS_NMIC_SHARED; info->is_readonly = id->nsattr & NVME_NS_ATTR_RO; info->is_ready = id->nstat & NVME_NSTAT_NRDY; + info->is_rotational = id->nsfeat & NVME_NS_ROTATIONAL; + info->no_vwc = id->nsfeat & NVME_NS_VWC_NOT_PRESENT; } kfree(id); return ret; @@ -1922,8 +1941,12 @@ static void nvme_configure_metadata(struct nvme_ctrl *ctrl, if (head->pi_size && head->ms >= head->pi_size) head->pi_type = id->dps & NVME_NS_DPS_PI_MASK; - if (!(id->dps & NVME_NS_DPS_PI_FIRST)) - info->pi_offset = head->ms - head->pi_size; + if (!(id->dps & NVME_NS_DPS_PI_FIRST)) { + if (disable_pi_offsets) + head->pi_type = 0; + else + info->pi_offset = head->ms - head->pi_size; + } if (ctrl->ops->flags & NVME_F_FABRICS) { /* @@ -2166,11 +2189,14 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, ns->head->ids.csi == NVME_CSI_ZNS) nvme_update_zone_info(ns, &lim, &zi); - if (ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT) + if ((ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT) && !info->no_vwc) lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; else lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA); + if (info->is_rotational) + lim.features |= BLK_FEAT_ROTATIONAL; + /* * Register a metadata profile for PI, or the plain non-integrity NVMe * metadata masquerading as Type 0 if supported, otherwise reject block @@ -2458,8 +2484,13 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) else ctrl->ctrl_config = NVME_CC_CSS_NVM; - if (ctrl->cap & NVME_CAP_CRMS_CRWMS && ctrl->cap & NVME_CAP_CRMS_CRIMS) - ctrl->ctrl_config |= NVME_CC_CRIME; + /* + * Setting CRIME results in CSTS.RDY before the media is ready. This + * makes it possible for media related commands to return the error + * NVME_SC_ADMIN_COMMAND_MEDIA_NOT_READY. Until the driver is + * restructured to handle retries, disable CC.CRIME. + */ + ctrl->ctrl_config &= ~NVME_CC_CRIME; ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; @@ -2489,10 +2520,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) * devices are known to get this wrong. Use the larger of the * two values. */ - if (ctrl->ctrl_config & NVME_CC_CRIME) - ready_timeout = NVME_CRTO_CRIMT(crto); - else - ready_timeout = NVME_CRTO_CRWMT(crto); + ready_timeout = NVME_CRTO_CRWMT(crto); if (ready_timeout < timeout) dev_warn_once(ctrl->device, "bad crto:%x cap:%llx\n", @@ -3111,7 +3139,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) ctrl->max_zeroes_sectors = 0; if (ctrl->subsys->subtype != NVME_NQN_NVME || - nvme_ctrl_limited_cns(ctrl) || + !nvme_id_cns_ok(ctrl, NVME_ID_CNS_CS_CTRL) || test_bit(NVME_CTRL_SKIP_ID_CNS_CS, &ctrl->flags)) return 0; @@ -3615,6 +3643,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, head->ns_id = info->nsid; head->ids = info->ids; head->shared = info->is_shared; + head->rotational = info->is_rotational; ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1); ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE); kref_init(&head->ref); @@ -3774,7 +3803,8 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) int srcu_idx; srcu_idx = srcu_read_lock(&ctrl->srcu); - list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { + list_for_each_entry_srcu(ns, &ctrl->namespaces, list, + srcu_read_lock_held(&ctrl->srcu)) { if (ns->head->ns_id == nsid) { if (!nvme_get_ns(ns)) continue; @@ -3995,7 +4025,7 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns_info info = { .nsid = nsid }; struct nvme_ns *ns; - int ret; + int ret = 1; if (nvme_identify_ns_descs(ctrl, &info)) return; @@ -4012,9 +4042,10 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid) * set up a namespace. If not fall back to the legacy version. */ if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) || - (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS)) + (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS) || + ctrl->vs >= NVME_VS(2, 0, 0)) ret = nvme_ns_info_from_id_cs_indep(ctrl, &info); - else + if (ret > 0) ret = nvme_ns_info_from_identify(ctrl, &info); if (info.is_removed) @@ -4207,7 +4238,7 @@ static void nvme_scan_work(struct work_struct *work) } mutex_lock(&ctrl->scan_lock); - if (nvme_ctrl_limited_cns(ctrl)) { + if (!nvme_id_cns_ok(ctrl, NVME_ID_CNS_NS_ACTIVE_LIST)) { nvme_scan_ns_sequential(ctrl); } else { /* @@ -4858,7 +4889,8 @@ void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl) int srcu_idx; srcu_idx = srcu_read_lock(&ctrl->srcu); - list_for_each_entry_rcu(ns, &ctrl->namespaces, list) + list_for_each_entry_srcu(ns, &ctrl->namespaces, list, + srcu_read_lock_held(&ctrl->srcu)) blk_mark_disk_dead(ns->disk); srcu_read_unlock(&ctrl->srcu, srcu_idx); } @@ -4870,8 +4902,9 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl) int srcu_idx; srcu_idx = srcu_read_lock(&ctrl->srcu); - list_for_each_entry_rcu(ns, &ctrl->namespaces, list) - blk_mq_unfreeze_queue(ns->queue); + list_for_each_entry_srcu(ns, &ctrl->namespaces, list, + srcu_read_lock_held(&ctrl->srcu)) + blk_mq_unfreeze_queue_non_owner(ns->queue); srcu_read_unlock(&ctrl->srcu, srcu_idx); clear_bit(NVME_CTRL_FROZEN, &ctrl->flags); } @@ -4883,7 +4916,8 @@ int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) int srcu_idx; srcu_idx = srcu_read_lock(&ctrl->srcu); - list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { + list_for_each_entry_srcu(ns, &ctrl->namespaces, list, + srcu_read_lock_held(&ctrl->srcu)) { timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout); if (timeout <= 0) break; @@ -4899,7 +4933,8 @@ void nvme_wait_freeze(struct nvme_ctrl *ctrl) int srcu_idx; srcu_idx = srcu_read_lock(&ctrl->srcu); - list_for_each_entry_rcu(ns, &ctrl->namespaces, list) + list_for_each_entry_srcu(ns, &ctrl->namespaces, list, + srcu_read_lock_held(&ctrl->srcu)) blk_mq_freeze_queue_wait(ns->queue); srcu_read_unlock(&ctrl->srcu, srcu_idx); } @@ -4912,8 +4947,14 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl) set_bit(NVME_CTRL_FROZEN, &ctrl->flags); srcu_idx = srcu_read_lock(&ctrl->srcu); - list_for_each_entry_rcu(ns, &ctrl->namespaces, list) - blk_freeze_queue_start(ns->queue); + list_for_each_entry_srcu(ns, &ctrl->namespaces, list, + srcu_read_lock_held(&ctrl->srcu)) + /* + * Typical non_owner use case is from pci driver, in which + * start_freeze is called from timeout work function, but + * unfreeze is done in reset work context + */ + blk_freeze_queue_start_non_owner(ns->queue); srcu_read_unlock(&ctrl->srcu, srcu_idx); } EXPORT_SYMBOL_GPL(nvme_start_freeze); @@ -4960,7 +5001,8 @@ void nvme_sync_io_queues(struct nvme_ctrl *ctrl) int srcu_idx; srcu_idx = srcu_read_lock(&ctrl->srcu); - list_for_each_entry_rcu(ns, &ctrl->namespaces, list) + list_for_each_entry_srcu(ns, &ctrl->namespaces, list, + srcu_read_lock_held(&ctrl->srcu)) blk_sync_queue(ns->queue); srcu_read_unlock(&ctrl->srcu, srcu_idx); } @@ -5008,6 +5050,8 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); + BUILD_BUG_ON(sizeof(struct nvme_endurance_group_log) != 512); + BUILD_BUG_ON(sizeof(struct nvme_rotational_media_log) != 512); BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64); BUILD_BUG_ON(sizeof(struct nvme_feat_host_behavior) != 512); @@ -5016,22 +5060,20 @@ static inline void _nvme_check_size(void) static int __init nvme_core_init(void) { + unsigned int wq_flags = WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS; int result = -ENOMEM; _nvme_check_size(); - nvme_wq = alloc_workqueue("nvme-wq", - WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + nvme_wq = alloc_workqueue("nvme-wq", wq_flags, 0); if (!nvme_wq) goto out; - nvme_reset_wq = alloc_workqueue("nvme-reset-wq", - WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + nvme_reset_wq = alloc_workqueue("nvme-reset-wq", wq_flags, 0); if (!nvme_reset_wq) goto destroy_wq; - nvme_delete_wq = alloc_workqueue("nvme-delete-wq", - WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + nvme_delete_wq = alloc_workqueue("nvme-delete-wq", wq_flags, 0); if (!nvme_delete_wq) goto destroy_reset_wq; |