Diffstat (limited to 'drivers/nvme/target/core.c')
-rw-r--r--	drivers/nvme/target/core.c	1210
1 file changed, 869 insertions, 341 deletions
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 88d260f31835..cc88e5a28c8a 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -1,28 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Common code for the NVMe target.
  * Copyright (c) 2015-2016 HGST, a Western Digital Company.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/rculist.h>
 #include <linux/pci-p2pdma.h>
+#include <linux/scatterlist.h>
+
+#include <generated/utsrelease.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
 
 #include "nvmet.h"
+#include "debugfs.h"
 
+struct kmem_cache *nvmet_bvec_cache;
 struct workqueue_struct *buffered_io_wq;
+struct workqueue_struct *zbd_wq;
 static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
 static DEFINE_IDA(cntlid_ida);
 
+struct workqueue_struct *nvmet_wq;
+EXPORT_SYMBOL_GPL(nvmet_wq);
+
 /*
  * This read/write semaphore is used to synchronize access to configuration
  * information on a target system that will result in discovery log page
@@ -36,7 +40,7 @@ static DEFINE_IDA(cntlid_ida);
  * - the nvmet_transports array
  *
  * When updating any of those lists/structures write lock should be obtained,
- * while when reading (popolating discovery log page or checking host-subsystem
+ * while when reading (populating discovery log page or checking host-subsystem
  * link) read lock is obtained to allow concurrent reads.
  */
 DECLARE_RWSEM(nvmet_config_sem);
@@ -47,40 +51,36 @@ DECLARE_RWSEM(nvmet_ana_sem);
 
 inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno)
 {
-	u16 status;
-
 	switch (errno) {
+	case 0:
+		return NVME_SC_SUCCESS;
 	case -ENOSPC:
 		req->error_loc = offsetof(struct nvme_rw_command, length);
-		status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR;
-		break;
+		return NVME_SC_CAP_EXCEEDED | NVME_STATUS_DNR;
 	case -EREMOTEIO:
 		req->error_loc = offsetof(struct nvme_rw_command, slba);
-		status = NVME_SC_LBA_RANGE | NVME_SC_DNR;
-		break;
+		return NVME_SC_LBA_RANGE | NVME_STATUS_DNR;
 	case -EOPNOTSUPP:
 		req->error_loc = offsetof(struct nvme_common_command, opcode);
-		switch (req->cmd->common.opcode) {
-		case nvme_cmd_dsm:
-		case nvme_cmd_write_zeroes:
-			status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR;
-			break;
-		default:
-			status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
-		}
-		break;
+		return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
 	case -ENODATA:
 		req->error_loc = offsetof(struct nvme_rw_command, nsid);
-		status = NVME_SC_ACCESS_DENIED;
-		break;
+		return NVME_SC_ACCESS_DENIED;
 	case -EIO:
-		/* FALLTHRU */
+		fallthrough;
 	default:
 		req->error_loc = offsetof(struct nvme_common_command, opcode);
-		status = NVME_SC_INTERNAL | NVME_SC_DNR;
+		return NVME_SC_INTERNAL | NVME_STATUS_DNR;
 	}
+}
 
-	return status;
+u16 nvmet_report_invalid_opcode(struct nvmet_req *req)
+{
+	pr_debug("unhandled cmd %d on qid %d\n", req->cmd->common.opcode,
+		 req->sq->qid);
+
+	req->error_loc = offsetof(struct nvme_common_command, opcode);
+	return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
 }
 
 static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
@@ -91,7 +91,7 @@ u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
 {
 	if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
 		req->error_loc = offsetof(struct nvme_common_command, dptr);
-		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
+		return NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR;
 	}
 	return 0;
 }
@@ -100,7 +100,7 @@ u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len)
 {
 	if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
 		req->error_loc = offsetof(struct nvme_common_command, dptr);
-		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
+		return NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR;
 	}
 	return 0;
 }
@@ -109,20 +109,21 @@ u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len)
 {
 	if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) {
 		req->error_loc = offsetof(struct nvme_common_command, dptr);
-		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
+		return NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR;
 	}
 	return 0;
 }
 
-static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys)
+static u32 nvmet_max_nsid(struct nvmet_subsys *subsys)
 {
-	struct nvmet_ns *ns;
+	struct nvmet_ns *cur;
+	unsigned long idx;
+	u32 nsid = 0;
 
-	if (list_empty(&subsys->namespaces))
-		return 0;
+	nvmet_for_each_enabled_ns(&subsys->namespaces, idx, cur)
+		nsid = cur->nsid;
 
-	ns = list_last_entry(&subsys->namespaces, struct nvmet_ns, dev_link);
-	return ns->nsid;
+	return nsid;
 }
 
 static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
@@ -130,39 +131,29 @@ static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
 	return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16);
 }
 
-static void nvmet_async_events_free(struct nvmet_ctrl *ctrl)
+static void nvmet_async_events_failall(struct nvmet_ctrl *ctrl)
 {
 	struct nvmet_req *req;
 
-	while (1) {
-		mutex_lock(&ctrl->lock);
-		if (!ctrl->nr_async_event_cmds) {
-			mutex_unlock(&ctrl->lock);
-			return;
-		}
-
+	mutex_lock(&ctrl->lock);
+	while (ctrl->nr_async_event_cmds) {
 		req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
 		mutex_unlock(&ctrl->lock);
-		nvmet_req_complete(req, NVME_SC_INTERNAL | NVME_SC_DNR);
+		nvmet_req_complete(req, NVME_SC_INTERNAL | NVME_STATUS_DNR);
+		mutex_lock(&ctrl->lock);
 	}
+	mutex_unlock(&ctrl->lock);
 }
 
-static void nvmet_async_event_work(struct work_struct *work)
+static void nvmet_async_events_process(struct nvmet_ctrl *ctrl)
 {
-	struct nvmet_ctrl *ctrl =
-		container_of(work, struct nvmet_ctrl, async_event_work);
 	struct nvmet_async_event *aen;
 	struct nvmet_req *req;
 
-	while (1) {
-		mutex_lock(&ctrl->lock);
-		aen = list_first_entry_or_null(&ctrl->async_events,
-				struct nvmet_async_event, entry);
-		if (!aen || !ctrl->nr_async_event_cmds) {
-			mutex_unlock(&ctrl->lock);
-			return;
-		}
-
+	mutex_lock(&ctrl->lock);
+	while (ctrl->nr_async_event_cmds && !list_empty(&ctrl->async_events)) {
+		aen = list_first_entry(&ctrl->async_events,
+				       struct nvmet_async_event, entry);
 		req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
 		nvmet_set_result(req, nvmet_async_event_result(aen));
@@ -170,8 +161,31 @@ static void nvmet_async_event_work(struct work_struct *work)
 		kfree(aen);
 		mutex_unlock(&ctrl->lock);
+		trace_nvmet_async_event(ctrl, req->cqe->result.u32);
 		nvmet_req_complete(req, 0);
+		mutex_lock(&ctrl->lock);
 	}
+	mutex_unlock(&ctrl->lock);
+}
+
+static void nvmet_async_events_free(struct nvmet_ctrl *ctrl)
+{
+	struct nvmet_async_event *aen, *tmp;
+
+	mutex_lock(&ctrl->lock);
+	list_for_each_entry_safe(aen, tmp, &ctrl->async_events, entry) {
+		list_del(&aen->entry);
+		kfree(aen);
+	}
+	mutex_unlock(&ctrl->lock);
+}
+
+static void nvmet_async_event_work(struct work_struct *work)
+{
+	struct nvmet_ctrl *ctrl =
+		container_of(work, struct nvmet_ctrl, async_event_work);
+
+	nvmet_async_events_process(ctrl);
 }
 
 void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
@@ -191,7 +205,7 @@ void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
 	list_add_tail(&aen->entry, &ctrl->async_events);
 	mutex_unlock(&ctrl->lock);
 
-	schedule_work(&ctrl->async_event_work);
+	queue_work(nvmet_wq, &ctrl->async_event_work);
 }
 
 static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
@@ -222,11 +236,13 @@ void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
 {
 	struct nvmet_ctrl *ctrl;
 
+	lockdep_assert_held(&subsys->lock);
+
 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
 		nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid));
 		if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_NS_ATTR))
 			continue;
-		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
+		nvmet_add_async_event(ctrl, NVME_AER_NOTICE,
 				NVME_AER_NOTICE_NS_CHANGED,
 				NVME_LOG_CHANGED_NS);
 	}
@@ -243,7 +259,7 @@ void nvmet_send_ana_event(struct nvmet_subsys *subsys,
 			continue;
 		if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_ANA_CHANGE))
 			continue;
-		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
+		nvmet_add_async_event(ctrl, NVME_AER_NOTICE,
 				NVME_AER_NOTICE_ANA, NVME_LOG_ANA);
 	}
 	mutex_unlock(&subsys->lock);
@@ -282,6 +298,18 @@ void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops)
 }
 EXPORT_SYMBOL_GPL(nvmet_unregister_transport);
 
+void nvmet_port_del_ctrls(struct nvmet_port *port, struct nvmet_subsys *subsys)
+{
+	struct nvmet_ctrl *ctrl;
+
+	mutex_lock(&subsys->lock);
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+		if (ctrl->port == port)
+			ctrl->ops->delete_ctrl(ctrl);
+	}
+	mutex_unlock(&subsys->lock);
+}
+
 int nvmet_enable_port(struct nvmet_port *port)
 {
 	const struct nvmet_fabrics_ops *ops;
@@ -289,6 +317,9 @@ int nvmet_enable_port(struct nvmet_port *port)
 
 	lockdep_assert_held(&nvmet_config_sem);
 
+	if (port->disc_addr.trtype == NVMF_TRTYPE_MAX)
+		return -EINVAL;
+
 	ops = nvmet_transports[port->disc_addr.trtype];
 	if (!ops) {
 		up_write(&nvmet_config_sem);
@@ -305,18 +336,44 @@ int nvmet_enable_port(struct nvmet_port *port)
 	if (!try_module_get(ops->owner))
 		return -EINVAL;
 
-	ret = ops->add_port(port);
-	if (ret) {
-		module_put(ops->owner);
-		return ret;
+	/*
+	 * If the user requested PI support and the transport isn't pi capable,
+	 * don't enable the port.
+	 */
+	if (port->pi_enable && !(ops->flags & NVMF_METADATA_SUPPORTED)) {
+		pr_err("T10-PI is not supported by transport type %d\n",
+		       port->disc_addr.trtype);
+		ret = -EINVAL;
+		goto out_put;
 	}
 
+	ret = ops->add_port(port);
+	if (ret)
+		goto out_put;
+
 	/* If the transport didn't set inline_data_size, then disable it. */
 	if (port->inline_data_size < 0)
 		port->inline_data_size = 0;
 
+	/*
+	 * If the transport didn't set the max_queue_size properly, then clamp
+	 * it to the target limits. Also set default values in case the
+	 * transport didn't set it at all.
+	 */
+	if (port->max_queue_size < 0)
+		port->max_queue_size = NVMET_MAX_QUEUE_SIZE;
+	else
+		port->max_queue_size = clamp_t(int, port->max_queue_size,
+					       NVMET_MIN_QUEUE_SIZE,
+					       NVMET_MAX_QUEUE_SIZE);
+
 	port->enabled = true;
+	port->tr_ops = ops;
 	return 0;
+
+out_put:
+	module_put(ops->owner);
+	return ret;
 }
 
 void nvmet_disable_port(struct nvmet_port *port)
@@ -326,6 +383,7 @@ void nvmet_disable_port(struct nvmet_port *port)
 	lockdep_assert_held(&nvmet_config_sem);
 
 	port->enabled = false;
+	port->tr_ops = NULL;
 
 	ops = nvmet_transports[port->disc_addr.trtype];
 	ops->remove_port(port);
@@ -336,13 +394,13 @@ static void nvmet_keep_alive_timer(struct work_struct *work)
 {
 	struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work),
 			struct nvmet_ctrl, ka_work);
-	bool cmd_seen = ctrl->cmd_seen;
+	bool reset_tbkas = ctrl->reset_tbkas;
 
-	ctrl->cmd_seen = false;
-	if (cmd_seen) {
+	ctrl->reset_tbkas = false;
+	if (reset_tbkas) {
 		pr_debug("ctrl %d reschedule traffic based keep-alive timer\n",
 			ctrl->cntlid);
-		schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
+		queue_delayed_work(nvmet_wq, &ctrl->ka_work, ctrl->kato * HZ);
 		return;
 	}
 
@@ -352,46 +410,45 @@ static void nvmet_keep_alive_timer(struct work_struct *work)
 	nvmet_ctrl_fatal_error(ctrl);
 }
 
-static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
+void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
 {
+	if (unlikely(ctrl->kato == 0))
+		return;
+
 	pr_debug("ctrl %d start keep-alive timer for %d secs\n",
 		ctrl->cntlid, ctrl->kato);
 
-	INIT_DELAYED_WORK(&ctrl->ka_work, nvmet_keep_alive_timer);
-	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
+	queue_delayed_work(nvmet_wq, &ctrl->ka_work, ctrl->kato * HZ);
 }
 
-static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
+void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
 {
+	if (unlikely(ctrl->kato == 0))
+		return;
+
 	pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid);
 
 	cancel_delayed_work_sync(&ctrl->ka_work);
 }
 
-static struct nvmet_ns *__nvmet_find_namespace(struct nvmet_ctrl *ctrl,
-		__le32 nsid)
+u16 nvmet_req_find_ns(struct nvmet_req *req)
 {
-	struct nvmet_ns *ns;
+	u32 nsid = le32_to_cpu(req->cmd->common.nsid);
+	struct nvmet_subsys *subsys = nvmet_req_subsys(req);
 
-	list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
-		if (ns->nsid == le32_to_cpu(nsid))
-			return ns;
-	}
-
-	return NULL;
-}
-
-struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid)
-{
-	struct nvmet_ns *ns;
+	req->ns = xa_load(&subsys->namespaces, nsid);
+	if (unlikely(!req->ns || !req->ns->enabled)) {
+		req->error_loc = offsetof(struct nvme_common_command, nsid);
+		if (!req->ns) /* ns doesn't exist! */
+			return NVME_SC_INVALID_NS | NVME_STATUS_DNR;
 
-	rcu_read_lock();
-	ns = __nvmet_find_namespace(ctrl, nsid);
-	if (ns)
-		percpu_ref_get(&ns->ref);
-	rcu_read_unlock();
+		/* ns exists but it's disabled */
+		req->ns = NULL;
+		return NVME_SC_INTERNAL_PATH_ERROR;
+	}
 
-	return ns;
+	percpu_ref_get(&req->ns->ref);
+	return NVME_SC_SUCCESS;
 }
 
 static void nvmet_destroy_namespace(struct percpu_ref *ref)
@@ -425,7 +482,7 @@ static int nvmet_p2pmem_ns_enable(struct nvmet_ns *ns)
 		return -EINVAL;
 	}
 
-	if (!blk_queue_pci_p2pdma(ns->bdev->bd_queue)) {
+	if (!blk_queue_pci_p2pdma(ns->bdev->bd_disk->queue)) {
 		pr_err("peer-to-peer DMA is not supported by the driver of %s\n",
 		       ns->device_path);
 		return -EINVAL;
@@ -456,9 +513,6 @@ static int nvmet_p2pmem_ns_enable(struct nvmet_ns *ns)
 	return 0;
 }
 
-/*
- * Note: ctrl->subsys->lock should be held when calling this function
- */
 static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl,
 				    struct nvmet_ns *ns)
 {
@@ -466,6 +520,8 @@ static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl,
 	struct pci_dev *p2p_dev;
 	int ret;
 
+	lockdep_assert_held(&ctrl->subsys->lock);
+
 	if (!ctrl->p2p_client || !ns->use_p2pmem)
 		return;
 
@@ -495,6 +551,18 @@ static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl,
 		 ns->nsid);
 }
 
+bool nvmet_ns_revalidate(struct nvmet_ns *ns)
+{
+	loff_t oldsize = ns->size;
+
+	if (ns->bdev)
+		nvmet_bdev_ns_revalidate(ns);
+	else
+		nvmet_file_ns_revalidate(ns);
+
+	return oldsize != ns->size;
+}
+
 int nvmet_ns_enable(struct nvmet_ns *ns)
 {
 	struct nvmet_subsys *subsys = ns->subsys;
@@ -502,10 +570,13 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
 	int ret;
 
 	mutex_lock(&subsys->lock);
-	ret = -EMFILE;
-	if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
-		goto out_unlock;
 	ret = 0;
+
+	if (nvmet_is_passthru_subsys(subsys)) {
+		pr_info("cannot enable both passthru and regular namespaces for a single subsystem");
+		goto out_unlock;
+	}
+
 	if (ns->enabled)
 		goto out_unlock;
 
@@ -517,48 +588,34 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
 
 	ret = nvmet_p2pmem_ns_enable(ns);
 	if (ret)
-		goto out_unlock;
+		goto out_dev_disable;
 
 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
 		nvmet_p2pmem_ns_add_p2p(ctrl, ns);
 
-	ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace,
-				0, GFP_KERNEL);
-	if (ret)
-		goto out_dev_put;
-
-	if (ns->nsid > subsys->max_nsid)
-		subsys->max_nsid = ns->nsid;
-
-	/*
-	 * The namespaces list needs to be sorted to simplify the implementation
-	 * of the Identify Namepace List subcommand.
-	 */
-	if (list_empty(&subsys->namespaces)) {
-		list_add_tail_rcu(&ns->dev_link, &subsys->namespaces);
-	} else {
-		struct nvmet_ns *old;
-
-		list_for_each_entry_rcu(old, &subsys->namespaces, dev_link) {
-			BUG_ON(ns->nsid == old->nsid);
-			if (ns->nsid < old->nsid)
-				break;
-		}
-
-		list_add_tail_rcu(&ns->dev_link, &old->dev_link);
+	if (ns->pr.enable) {
+		ret = nvmet_pr_init_ns(ns);
+		if (ret)
+			goto out_dev_put;
 	}
-	subsys->nr_namespaces++;
+
+	if (percpu_ref_init(&ns->ref, nvmet_destroy_namespace, 0, GFP_KERNEL))
+		goto out_pr_exit;
 
 	nvmet_ns_changed(subsys, ns->nsid);
 	ns->enabled = true;
+	xa_set_mark(&subsys->namespaces, ns->nsid, NVMET_NS_ENABLED);
 	ret = 0;
 out_unlock:
 	mutex_unlock(&subsys->lock);
 	return ret;
+out_pr_exit:
+	if (ns->pr.enable)
+		nvmet_pr_exit_ns(ns);
out_dev_put:
 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
 		pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
-
+out_dev_disable:
 	nvmet_ns_dev_disable(ns);
 	goto out_unlock;
 }
@@ -573,9 +630,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
 		goto out_unlock;
 
 	ns->enabled = false;
-	list_del_rcu(&ns->dev_link);
-	if (ns->nsid == subsys->max_nsid)
-		subsys->max_nsid = nvmet_max_nsid(subsys);
+	xa_clear_mark(&subsys->namespaces, ns->nsid, NVMET_NS_ENABLED);
 
 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
 		pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
@@ -586,7 +641,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
 	 * Now that we removed the namespaces from the lookup list, we
 	 * can kill the per_cpu ref and wait for any remaining references
 	 * to be dropped, as well as a RCU grace period for anyone only
-	 * using the namepace under rcu_read_lock(). Note that we can't
+	 * using the namespace under rcu_read_lock(). Note that we can't
 	 * use call_rcu here as we need to ensure the namespaces have
 	 * been fully destroyed before unloading the module.
 	 */
@@ -595,9 +650,10 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
 	wait_for_completion(&ns->disable_done);
 	percpu_ref_exit(&ns->ref);
 
-	mutex_lock(&subsys->lock);
+	if (ns->pr.enable)
+		nvmet_pr_exit_ns(ns);
 
-	subsys->nr_namespaces--;
+	mutex_lock(&subsys->lock);
 	nvmet_ns_changed(subsys, ns->nsid);
 	nvmet_ns_dev_disable(ns);
 out_unlock:
@@ -606,8 +662,19 @@ out_unlock:
 
 void nvmet_ns_free(struct nvmet_ns *ns)
 {
+	struct nvmet_subsys *subsys = ns->subsys;
+
 	nvmet_ns_disable(ns);
 
+	mutex_lock(&subsys->lock);
+
+	xa_erase(&subsys->namespaces, ns->nsid);
+	if (ns->nsid == subsys->max_nsid)
+		subsys->max_nsid = nvmet_max_nsid(subsys);
+
+	subsys->nr_namespaces--;
+	mutex_unlock(&subsys->lock);
+
 	down_write(&nvmet_ana_sem);
 	nvmet_ana_group_enabled[ns->anagrpid]--;
 	up_write(&nvmet_ana_sem);
@@ -620,16 +687,30 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
 {
 	struct nvmet_ns *ns;
 
+	mutex_lock(&subsys->lock);
+
+	if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
+		goto out_unlock;
+
 	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
 	if (!ns)
-		return NULL;
+		goto out_unlock;
 
-	INIT_LIST_HEAD(&ns->dev_link);
 	init_completion(&ns->disable_done);
 
 	ns->nsid = nsid;
 	ns->subsys = subsys;
 
+	if (ns->nsid > subsys->max_nsid)
+		subsys->max_nsid = nsid;
+
+	if (xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL))
+		goto out_exit;
+
+	subsys->nr_namespaces++;
+
+	mutex_unlock(&subsys->lock);
+
 	down_write(&nvmet_ana_sem);
 	ns->anagrpid = NVMET_DEFAULT_ANA_GRPID;
 	nvmet_ana_group_enabled[ns->anagrpid]++;
@@ -637,8 +718,15 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
 
 	uuid_gen(&ns->uuid);
 	ns->buffered_io = false;
+	ns->csi = NVME_CSI_NVM;
 
 	return ns;
+out_exit:
+	subsys->max_nsid = nvmet_max_nsid(subsys);
+	kfree(ns);
+out_unlock:
+	mutex_unlock(&subsys->lock);
+	return NULL;
 }
 
 static void nvmet_update_sq_head(struct nvmet_req *req)
@@ -646,13 +734,12 @@ static void nvmet_update_sq_head(struct nvmet_req *req)
 	if (req->sq->size) {
 		u32 old_sqhd, new_sqhd;
 
+		old_sqhd = READ_ONCE(req->sq->sqhd);
 		do {
-			old_sqhd = req->sq->sqhd;
 			new_sqhd = (old_sqhd + 1) % req->sq->size;
-		} while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) !=
-			old_sqhd);
+		} while (!try_cmpxchg(&req->sq->sqhd, &old_sqhd, new_sqhd));
 	}
-	req->rsp->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF);
+	req->cqe->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF);
 }
 
 static void nvmet_set_error(struct nvmet_req *req, u16 status)
@@ -661,7 +748,7 @@ static void nvmet_set_error(struct nvmet_req *req, u16 status)
 	struct nvme_error_slot *new_error_slot;
 	unsigned long flags;
 
-	req->rsp->status = cpu_to_le16(status << 1);
+	req->cqe->status = cpu_to_le16(status << 1);
 
 	if (!ctrl || req->error_loc == NVMET_NO_ERROR_LOC)
 		return;
@@ -681,30 +768,60 @@ static void nvmet_set_error(struct nvmet_req *req, u16 status)
 	spin_unlock_irqrestore(&ctrl->error_lock, flags);
 
 	/* set the more bit for this request */
-	req->rsp->status |= cpu_to_le16(1 << 14);
+	req->cqe->status |= cpu_to_le16(1 << 14);
 }
 
 static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
 {
+	struct nvmet_ns *ns = req->ns;
+	struct nvmet_pr_per_ctrl_ref *pc_ref = req->pc_ref;
+
 	if (!req->sq->sqhd_disabled)
 		nvmet_update_sq_head(req);
-	req->rsp->sq_id = cpu_to_le16(req->sq->qid);
-	req->rsp->command_id = req->cmd->common.command_id;
+	req->cqe->sq_id = cpu_to_le16(req->sq->qid);
+	req->cqe->command_id = req->cmd->common.command_id;
 
 	if (unlikely(status))
 		nvmet_set_error(req, status);
-	if (req->ns)
-		nvmet_put_namespace(req->ns);
+
+	trace_nvmet_req_complete(req);
+
 	req->ops->queue_response(req);
+
+	if (pc_ref)
+		nvmet_pr_put_ns_pc_ref(pc_ref);
+	if (ns)
+		nvmet_put_namespace(ns);
 }
 
 void nvmet_req_complete(struct nvmet_req *req, u16 status)
 {
+	struct nvmet_sq *sq = req->sq;
+
 	__nvmet_req_complete(req, status);
-	percpu_ref_put(&req->sq->ref);
+	percpu_ref_put(&sq->ref);
 }
 EXPORT_SYMBOL_GPL(nvmet_req_complete);
 
+void nvmet_cq_init(struct nvmet_cq *cq)
+{
+	refcount_set(&cq->ref, 1);
+}
+EXPORT_SYMBOL_GPL(nvmet_cq_init);
+
+bool nvmet_cq_get(struct nvmet_cq *cq)
+{
+	return refcount_inc_not_zero(&cq->ref);
+}
+EXPORT_SYMBOL_GPL(nvmet_cq_get);
+
+void nvmet_cq_put(struct nvmet_cq *cq)
+{
+	if (refcount_dec_and_test(&cq->ref))
+		nvmet_cq_destroy(cq);
+}
+EXPORT_SYMBOL_GPL(nvmet_cq_put);
+
 void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
 		u16 qid, u16 size)
 {
@@ -714,6 +831,17 @@ void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
 	ctrl->cqs[qid] = cq;
 }
 
+void nvmet_cq_destroy(struct nvmet_cq *cq)
+{
+	struct nvmet_ctrl *ctrl = cq->ctrl;
+
+	if (ctrl) {
+		ctrl->cqs[cq->qid] = NULL;
+		nvmet_ctrl_put(cq->ctrl);
+		cq->ctrl = NULL;
+	}
+}
+
 void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
 		u16 qid, u16 size)
 {
@@ -731,21 +859,135 @@ static void nvmet_confirm_sq(struct percpu_ref *ref)
 	complete(&sq->confirm_done);
 }
 
+u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create)
+{
+	if (!ctrl->cqs)
+		return NVME_SC_INTERNAL | NVME_STATUS_DNR;
+
+	if (cqid > ctrl->subsys->max_qid)
+		return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
+
+	if ((create && ctrl->cqs[cqid]) || (!create && !ctrl->cqs[cqid]))
+		return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
+
+	return NVME_SC_SUCCESS;
+}
+
+u16 nvmet_check_io_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create)
+{
+	if (!cqid)
+		return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
+	return nvmet_check_cqid(ctrl, cqid, create);
+}
+
+bool nvmet_cq_in_use(struct nvmet_cq *cq)
+{
+	return refcount_read(&cq->ref) > 1;
+}
+EXPORT_SYMBOL_GPL(nvmet_cq_in_use);
+
+u16 nvmet_cq_create(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
+		    u16 qid, u16 size)
+{
+	u16 status;
+
+	status = nvmet_check_cqid(ctrl, qid, true);
+	if (status != NVME_SC_SUCCESS)
+		return status;
+
+	if (!kref_get_unless_zero(&ctrl->ref))
+		return NVME_SC_INTERNAL | NVME_STATUS_DNR;
+	cq->ctrl = ctrl;
+
+	nvmet_cq_init(cq);
+	nvmet_cq_setup(ctrl, cq, qid, size);
+
+	return NVME_SC_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(nvmet_cq_create);
+
+u16 nvmet_check_sqid(struct nvmet_ctrl *ctrl, u16 sqid,
+		     bool create)
+{
+	if (!ctrl->sqs)
+		return NVME_SC_INTERNAL | NVME_STATUS_DNR;
+
+	if (sqid > ctrl->subsys->max_qid)
+		return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
+
+	if ((create && ctrl->sqs[sqid]) ||
+	    (!create && !ctrl->sqs[sqid]))
+		return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
+
+	return NVME_SC_SUCCESS;
+}
+
+u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
+		    struct nvmet_cq *cq, u16 sqid, u16 size)
+{
+	u16 status;
+	int ret;
+
+	if (!kref_get_unless_zero(&ctrl->ref))
+		return NVME_SC_INTERNAL | NVME_STATUS_DNR;
+
+	status = nvmet_check_sqid(ctrl, sqid, true);
+	if (status != NVME_SC_SUCCESS)
+		return status;
+
+	ret = nvmet_sq_init(sq, cq);
+	if (ret) {
+		status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
+		goto ctrl_put;
+	}
+
+	nvmet_sq_setup(ctrl, sq, sqid, size);
+	sq->ctrl = ctrl;
+
+	return NVME_SC_SUCCESS;
+
+ctrl_put:
+	nvmet_ctrl_put(ctrl);
+	return status;
+}
+EXPORT_SYMBOL_GPL(nvmet_sq_create);
+
 void nvmet_sq_destroy(struct nvmet_sq *sq)
 {
+	struct nvmet_ctrl *ctrl = sq->ctrl;
+
 	/*
 	 * If this is the admin queue, complete all AERs so that our
 	 * queue doesn't have outstanding requests on it.
 	 */
-	if (sq->ctrl && sq->ctrl->sqs && sq->ctrl->sqs[0] == sq)
-		nvmet_async_events_free(sq->ctrl);
+	if (ctrl && ctrl->sqs && ctrl->sqs[0] == sq)
+		nvmet_async_events_failall(ctrl);
 	percpu_ref_kill_and_confirm(&sq->ref, nvmet_confirm_sq);
 	wait_for_completion(&sq->confirm_done);
 	wait_for_completion(&sq->free_done);
 	percpu_ref_exit(&sq->ref);
+	nvmet_auth_sq_free(sq);
+	nvmet_cq_put(sq->cq);
 
-	if (sq->ctrl) {
-		nvmet_ctrl_put(sq->ctrl);
+	/*
+	 * we must reference the ctrl again after waiting for inflight IO
	 * to complete. Because admin connect may have sneaked in after we
+	 * store sq->ctrl locally, but before we killed the percpu_ref. the
+	 * admin connect allocates and assigns sq->ctrl, which now needs a
+	 * final ref put, as this ctrl is going away.
+	 */
+	ctrl = sq->ctrl;
+
+	if (ctrl) {
+		/*
+		 * The teardown flow may take some time, and the host may not
+		 * send us keep-alive during this period, hence reset the
+		 * traffic based keep-alive timer so we don't trigger a
+		 * controller teardown as a result of a keep-alive expiration.
+		 */
+		ctrl->reset_tbkas = true;
+		sq->ctrl->sqs[sq->qid] = NULL;
+		nvmet_ctrl_put(ctrl);
 		sq->ctrl = NULL; /* allows reusing the queue later */
 	}
 }
@@ -758,17 +1000,23 @@ static void nvmet_sq_free(struct percpu_ref *ref)
 	complete(&sq->free_done);
 }
 
-int nvmet_sq_init(struct nvmet_sq *sq)
+int nvmet_sq_init(struct nvmet_sq *sq, struct nvmet_cq *cq)
 {
 	int ret;
 
+	if (!nvmet_cq_get(cq))
+		return -EINVAL;
+
 	ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL);
 	if (ret) {
 		pr_err("percpu_ref init failed!\n");
+		nvmet_cq_put(cq);
 		return ret;
 	}
 	init_completion(&sq->free_done);
 	init_completion(&sq->confirm_done);
+	nvmet_auth_sq_init(sq);
+	sq->cq = cq;
 
 	return 0;
 }
@@ -803,20 +1051,55 @@ static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req)
 	return 0;
 }
 
+static u32 nvmet_io_cmd_transfer_len(struct nvmet_req *req)
+{
+	struct nvme_command *cmd = req->cmd;
+	u32 metadata_len = 0;
+
+	if (nvme_is_fabrics(cmd))
+		return nvmet_fabrics_io_cmd_data_len(req);
+
+	if (!req->ns)
+		return 0;
+
+	switch (req->cmd->common.opcode) {
+	case nvme_cmd_read:
+	case nvme_cmd_write:
+	case nvme_cmd_zone_append:
+		if (req->sq->ctrl->pi_support && nvmet_ns_has_pi(req->ns))
+			metadata_len = nvmet_rw_metadata_len(req);
+		return nvmet_rw_data_len(req) + metadata_len;
+	case nvme_cmd_dsm:
+		return nvmet_dsm_len(req);
+	case nvme_cmd_zone_mgmt_recv:
+		return (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;
+	default:
+		return 0;
+	}
+}
+
 static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
 {
 	struct nvme_command *cmd = req->cmd;
 	u16 ret;
 
-	ret = nvmet_check_ctrl_status(req, cmd);
+	if (nvme_is_fabrics(cmd))
+		return nvmet_parse_fabrics_io_cmd(req);
+
+	if (unlikely(!nvmet_check_auth_status(req)))
+		return NVME_SC_AUTH_REQUIRED | NVME_STATUS_DNR;
+
+	ret = nvmet_check_ctrl_status(req);
+	if (unlikely(ret))
+		return ret;
+
+	if (nvmet_is_passthru_req(req))
+		return nvmet_parse_passthru_io_cmd(req);
+
+	ret = nvmet_req_find_ns(req);
 	if (unlikely(ret))
 		return ret;
 
-	req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
-	if (unlikely(!req->ns)) {
-		req->error_loc = offsetof(struct nvme_common_command, nsid);
-		return NVME_SC_INVALID_NS | NVME_SC_DNR;
-	}
 	ret = nvmet_check_ana_state(req->port, req->ns);
 	if (unlikely(ret)) {
 		req->error_loc = offsetof(struct nvme_common_command, nsid);
@@ -828,70 +1111,105 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
 		return ret;
 	}
 
-	if (req->ns->file)
-		return nvmet_file_parse_io_cmd(req);
-	else
-		return nvmet_bdev_parse_io_cmd(req);
+	if (req->ns->pr.enable) {
+		ret = nvmet_parse_pr_cmd(req);
+		if (!ret)
+			return ret;
+	}
+
+	switch (req->ns->csi) {
+	case NVME_CSI_NVM:
+		if (req->ns->file)
+			ret = nvmet_file_parse_io_cmd(req);
+		else
+			ret = nvmet_bdev_parse_io_cmd(req);
+		break;
+	case NVME_CSI_ZNS:
+		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
+			ret = nvmet_bdev_zns_parse_io_cmd(req);
+		else
+			ret = NVME_SC_INVALID_IO_CMD_SET;
+		break;
+	default:
+		ret = NVME_SC_INVALID_IO_CMD_SET;
+	}
+	if (ret)
+		return ret;
+
+	if (req->ns->pr.enable) {
+		ret = nvmet_pr_check_cmd_access(req);
+		if (ret)
+			return ret;
+
+		ret = nvmet_pr_get_ns_pc_ref(req);
+	}
+	return ret;
 }
 
-bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
-		struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops)
+bool nvmet_req_init(struct nvmet_req *req, struct nvmet_sq *sq,
+		const struct nvmet_fabrics_ops *ops)
 {
 	u8 flags = req->cmd->common.flags;
 	u16 status;
 
-	req->cq = cq;
+	req->cq = sq->cq;
 	req->sq = sq;
 	req->ops = ops;
 	req->sg = NULL;
+	req->metadata_sg = NULL;
 	req->sg_cnt = 0;
+	req->metadata_sg_cnt = 0;
 	req->transfer_len = 0;
-	req->rsp->status = 0;
-	req->rsp->sq_head = 0;
+	req->metadata_len = 0;
+	req->cqe->result.u64 = 0;
+	req->cqe->status = 0;
+	req->cqe->sq_head = 0;
 	req->ns = NULL;
 	req->error_loc = NVMET_NO_ERROR_LOC;
 	req->error_slba = 0;
+	req->pc_ref = NULL;
 
 	/* no support for fused commands yet */
 	if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
 		req->error_loc = offsetof(struct nvme_common_command, flags);
-		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
 		goto fail;
 	}
 
 	/*
	 * For fabrics, PSDT field shall describe metadata pointer (MPTR) that
 	 * contains an address of a single contiguous physical buffer that is
-	 * byte aligned.
+	 * byte aligned. For PCI controllers, this is optional so not enforced.
 	 */
 	if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) {
-		req->error_loc = offsetof(struct nvme_common_command, flags);
-		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
-		goto fail;
+		if (!req->sq->ctrl || !nvmet_is_pci_ctrl(req->sq->ctrl)) {
+			req->error_loc =
+				offsetof(struct nvme_common_command, flags);
+			status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+			goto fail;
+		}
 	}
 
 	if (unlikely(!req->sq->ctrl))
-		/* will return an error for any Non-connect command: */
+		/* will return an error for any non-connect command: */
 		status = nvmet_parse_connect_cmd(req);
 	else if (likely(req->sq->qid != 0))
 		status = nvmet_parse_io_cmd(req);
-	else if (req->cmd->common.opcode == nvme_fabrics_command)
-		status = nvmet_parse_fabrics_cmd(req);
-	else if (req->sq->ctrl->subsys->type == NVME_NQN_DISC)
-		status = nvmet_parse_discovery_cmd(req);
 	else
		status = nvmet_parse_admin_cmd(req);
 
 	if (status)
 		goto fail;
 
+	trace_nvmet_req_init(req, req->cmd);
+
 	if (unlikely(!percpu_ref_tryget_live(&sq->ref))) {
-		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
 		goto fail;
 	}
 
 	if (sq->ctrl)
-		sq->ctrl->cmd_seen = true;
+		sq->ctrl->reset_tbkas = true;
 
 	return true;
 
@@ -904,110 +1222,173 @@ EXPORT_SYMBOL_GPL(nvmet_req_init);
 void nvmet_req_uninit(struct nvmet_req *req)
 {
 	percpu_ref_put(&req->sq->ref);
+	if (req->pc_ref)
+		nvmet_pr_put_ns_pc_ref(req->pc_ref);
 	if (req->ns)
 		nvmet_put_namespace(req->ns);
 }
 EXPORT_SYMBOL_GPL(nvmet_req_uninit);
 
-void nvmet_req_execute(struct nvmet_req *req)
+size_t nvmet_req_transfer_len(struct nvmet_req *req)
 {
-	if (unlikely(req->data_len != req->transfer_len)) {
-		req->error_loc = offsetof(struct nvme_common_command, dptr);
-		nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
-	} else
-		req->execute(req);
+	if (likely(req->sq->qid != 0))
+		return nvmet_io_cmd_transfer_len(req);
+	if (unlikely(!req->sq->ctrl))
+		return nvmet_connect_cmd_data_len(req);
+	return nvmet_admin_cmd_data_len(req);
 }
-EXPORT_SYMBOL_GPL(nvmet_req_execute);
+EXPORT_SYMBOL_GPL(nvmet_req_transfer_len);
 
-int nvmet_req_alloc_sgl(struct nvmet_req *req)
+bool nvmet_check_transfer_len(struct nvmet_req *req, size_t len)
 {
-	struct pci_dev *p2p_dev = NULL;
+	if (unlikely(len != req->transfer_len)) {
+		u16 status;
 
-	if (IS_ENABLED(CONFIG_PCI_P2PDMA)) {
-		if (req->sq->ctrl && req->ns)
-			p2p_dev = radix_tree_lookup(&req->sq->ctrl->p2p_ns_map,
-						    req->ns->nsid);
-
-		req->p2p_dev = NULL;
-		if (req->sq->qid && p2p_dev) {
-			req->sg = pci_p2pmem_alloc_sgl(p2p_dev, &req->sg_cnt,
-						       req->transfer_len);
-			if (req->sg) {
-				req->p2p_dev = p2p_dev;
-				return 0;
-			}
-		}
-
-		/*
-		 * If no P2P memory was available we fallback to using
-		 * regular memory
-		 */
+		req->error_loc = offsetof(struct nvme_common_command, dptr);
+		if (req->cmd->common.flags & NVME_CMD_SGL_ALL)
+			status = NVME_SC_SGL_INVALID_DATA;
+		else
+			status = NVME_SC_INVALID_FIELD;
+		nvmet_req_complete(req, status | NVME_STATUS_DNR);
+		return false;
 	}
 
-	req->sg = sgl_alloc(req->transfer_len, GFP_KERNEL, &req->sg_cnt);
-	if (!req->sg)
-		return -ENOMEM;
-
-	return 0;
+	return true;
 }
-EXPORT_SYMBOL_GPL(nvmet_req_alloc_sgl);
+EXPORT_SYMBOL_GPL(nvmet_check_transfer_len);
 
-void nvmet_req_free_sgl(struct nvmet_req *req)
+bool nvmet_check_data_len_lte(struct nvmet_req *req, size_t data_len)
 {
-	if (req->p2p_dev)
-		pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
-	else
-		sgl_free(req->sg);
+	if (unlikely(data_len > req->transfer_len)) {
+		u16 status;
 
-	req->sg = NULL;
-	req->sg_cnt = 0;
-}
-EXPORT_SYMBOL_GPL(nvmet_req_free_sgl);
+		req->error_loc = offsetof(struct nvme_common_command, dptr);
+		if (req->cmd->common.flags & NVME_CMD_SGL_ALL)
+			status = NVME_SC_SGL_INVALID_DATA;
+		else
+			status = NVME_SC_INVALID_FIELD;
+		nvmet_req_complete(req, status | NVME_STATUS_DNR);
+		return false;
+	}
 
-static inline bool nvmet_cc_en(u32 cc)
-{
-	return (cc >> NVME_CC_EN_SHIFT) & 0x1;
+	return true;
 }
 
-static inline u8 nvmet_cc_css(u32 cc)
+static unsigned int nvmet_data_transfer_len(struct nvmet_req *req)
 {
-	return (cc >> NVME_CC_CSS_SHIFT) & 0x7;
+	return req->transfer_len - req->metadata_len;
 }
 
-static inline u8 nvmet_cc_mps(u32 cc)
+static int nvmet_req_alloc_p2pmem_sgls(struct pci_dev *p2p_dev,
+		struct nvmet_req *req)
 {
-	return (cc >> NVME_CC_MPS_SHIFT) & 0xf;
+	req->sg = pci_p2pmem_alloc_sgl(p2p_dev, &req->sg_cnt,
+			nvmet_data_transfer_len(req));
+	if (!req->sg)
+		goto out_err;
+
+	if (req->metadata_len) {
+		req->metadata_sg = pci_p2pmem_alloc_sgl(p2p_dev,
+				&req->metadata_sg_cnt, req->metadata_len);
+		if (!req->metadata_sg)
+			goto out_free_sg;
+	}
+
+	req->p2p_dev = p2p_dev;
+
+	return 0;
+out_free_sg:
+	pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
+out_err:
+	return -ENOMEM;
 }
 
-static inline u8 nvmet_cc_ams(u32 cc)
+static struct pci_dev *nvmet_req_find_p2p_dev(struct nvmet_req *req)
 {
-	return (cc >> NVME_CC_AMS_SHIFT) & 0x7;
+	if (!IS_ENABLED(CONFIG_PCI_P2PDMA) ||
+	    !req->sq->ctrl || !req->sq->qid || !req->ns)
+		return NULL;
+	return radix_tree_lookup(&req->sq->ctrl->p2p_ns_map, req->ns->nsid);
 }
 
-static inline u8 nvmet_cc_shn(u32 cc)
+int nvmet_req_alloc_sgls(struct nvmet_req *req)
 {
-	return (cc >> NVME_CC_SHN_SHIFT) & 0x3;
+	struct pci_dev *p2p_dev = nvmet_req_find_p2p_dev(req);
+
+	if (p2p_dev && !nvmet_req_alloc_p2pmem_sgls(p2p_dev, req))
+		return 0;
+
+	req->sg = sgl_alloc(nvmet_data_transfer_len(req), GFP_KERNEL,
+			    &req->sg_cnt);
+	if (unlikely(!req->sg))
+		goto out;
+
+	if (req->metadata_len) {
+		req->metadata_sg = sgl_alloc(req->metadata_len, GFP_KERNEL,
+					     &req->metadata_sg_cnt);
+		if (unlikely(!req->metadata_sg))
+			goto out_free;
+	}
+
+	return 0;
+out_free:
+	sgl_free(req->sg);
+out:
+	return -ENOMEM;
 }
+EXPORT_SYMBOL_GPL(nvmet_req_alloc_sgls);
 
-static inline u8 nvmet_cc_iosqes(u32 cc)
+void nvmet_req_free_sgls(struct nvmet_req *req)
 {
-	return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf;
+	if (req->p2p_dev) {
+		pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
+		if (req->metadata_sg)
+			pci_p2pmem_free_sgl(req->p2p_dev, req->metadata_sg);
+		req->p2p_dev = NULL;
+	} else {
+		sgl_free(req->sg);
+		if (req->metadata_sg)
+			sgl_free(req->metadata_sg);
+	}
+
+	req->sg = NULL;
+	req->metadata_sg = NULL;
+	req->sg_cnt = 0;
+	req->metadata_sg_cnt = 0;
 }
+EXPORT_SYMBOL_GPL(nvmet_req_free_sgls);
 
-static inline u8 nvmet_cc_iocqes(u32 cc)
+static inline bool nvmet_css_supported(u8 cc_css)
 {
-	return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf;
+	switch (cc_css << NVME_CC_CSS_SHIFT) {
+	case NVME_CC_CSS_NVM:
+	case NVME_CC_CSS_CSI:
+		return true;
+	default:
+		return false;
+	}
 }
 
 static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
 {
 	lockdep_assert_held(&ctrl->lock);
 
-	if (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES ||
-	    nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES ||
-	    nvmet_cc_mps(ctrl->cc) != 0 ||
+	/*
+	 * Only I/O controllers should verify iosqes,iocqes.
	 * Strictly speaking, the spec says a discovery controller
+	 * should verify iosqes,iocqes are zeroed, however that
+	 * would break backwards compatibility, so don't enforce it.
+	 */
+	if (!nvmet_is_disc_subsys(ctrl->subsys) &&
+	    (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES ||
+	     nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES)) {
+		ctrl->csts = NVME_CSTS_CFS;
+		return;
+	}
+
+	if (nvmet_cc_mps(ctrl->cc) != 0 ||
 	    nvmet_cc_ams(ctrl->cc) != 0 ||
-	    nvmet_cc_css(ctrl->cc) != 0) {
+	    !nvmet_css_supported(nvmet_cc_css(ctrl->cc))) {
 		ctrl->csts = NVME_CSTS_CFS;
 		return;
 	}
@@ -1020,7 +1401,8 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
 	 * in case a host died before it enabled the controller. Hence, simply
 	 * reset the keep alive timer when the controller is enabled.
 	 */
-	mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ);
+	if (ctrl->kato)
+		mod_delayed_work(nvmet_wq, &ctrl->ka_work, ctrl->kato * HZ);
 }
 
 static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl)
@@ -1052,30 +1434,40 @@ void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new)
 	ctrl->csts &= ~NVME_CSTS_SHST_CMPLT;
 	mutex_unlock(&ctrl->lock);
 }
+EXPORT_SYMBOL_GPL(nvmet_update_cc);
 
 static void nvmet_init_cap(struct nvmet_ctrl *ctrl)
 {
 	/* command sets supported: NVMe command set: */
 	ctrl->cap = (1ULL << 37);
+	/* Controller supports one or more I/O Command Sets */
+	ctrl->cap |= (1ULL << 43);
 	/* CC.EN timeout in 500msec units: */
 	ctrl->cap |= (15ULL << 24);
 	/* maximum queue entries supported: */
-	ctrl->cap |= NVMET_QUEUE_SIZE - 1;
+	if (ctrl->ops->get_max_queue_size)
+		ctrl->cap |= min_t(u16, ctrl->ops->get_max_queue_size(ctrl),
+				   ctrl->port->max_queue_size) - 1;
+	else
+		ctrl->cap |= ctrl->port->max_queue_size - 1;
+
+	if (nvmet_is_passthru_subsys(ctrl->subsys))
+		nvmet_passthrough_override_cap(ctrl);
 }
 
-u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid,
-		struct nvmet_req *req, struct nvmet_ctrl **ret)
+struct nvmet_ctrl *nvmet_ctrl_find_get(const char *subsysnqn,
+				       const char *hostnqn, u16 cntlid,
+				       struct nvmet_req *req)
 {
+	struct nvmet_ctrl *ctrl = NULL;
 	struct nvmet_subsys *subsys;
-	struct nvmet_ctrl *ctrl;
-	u16 status = 0;
 
	subsys = nvmet_find_get_subsys(req->port, subsysnqn);
 	if (!subsys) {
 		pr_warn("connect request for invalid subsystem %s!\n",
 			subsysnqn);
-		req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
-		return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
+		req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
+		goto out;
 	}
 
 	mutex_lock(&subsys->lock);
@@ -1088,34 +1480,40 @@ u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid,
 		if (!kref_get_unless_zero(&ctrl->ref))
 			continue;
 
-		*ret = ctrl;
-		goto out;
+		/* ctrl found */
+		goto found;
 		}
 	}
 
+	ctrl = NULL; /* ctrl not found */
 	pr_warn("could not find controller %d for subsys %s / host %s\n",
 		cntlid, subsysnqn, hostnqn);
-	req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid);
-	status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
+	req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid);
 
-out:
+found:
 	mutex_unlock(&subsys->lock);
 	nvmet_subsys_put(subsys);
-	return status;
+out:
+	return ctrl;
 }
 
-u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd)
+u16 nvmet_check_ctrl_status(struct nvmet_req *req)
 {
 	if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
 		pr_err("got cmd %d while CC.EN == 0 on qid = %d\n",
-		       cmd->common.opcode, req->sq->qid);
-		return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
+		       req->cmd->common.opcode, req->sq->qid);
+		return NVME_SC_CMD_SEQ_ERROR | NVME_STATUS_DNR;
 	}
 
 	if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
 		pr_err("got cmd %d while CSTS.RDY == 0 on qid = %d\n",
-		       cmd->common.opcode, req->sq->qid);
-		return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
+		       req->cmd->common.opcode, req->sq->qid);
+		return NVME_SC_CMD_SEQ_ERROR | NVME_STATUS_DNR;
+	}
+
+	if (unlikely(!nvmet_check_auth_status(req))) {
+		pr_warn("qid %d not authenticated\n", req->sq->qid);
+		return NVME_SC_AUTH_REQUIRED | NVME_STATUS_DNR;
 	}
 	return 0;
 }
@@ -1129,7 +1527,7 @@ bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn)
 	if (subsys->allow_any_host)
 		return true;
 
-	if (subsys->type == NVME_NQN_DISC) /* allow all access to disc subsys */
+	if (nvmet_is_disc_subsys(subsys)) /* allow all access to disc subsys */
 		return true;
 
 	list_for_each_entry(p, &subsys->hosts, entry) {
@@ -1140,85 +1538,102 @@ bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn)
 	return false;
 }
 
-/*
- * Note: ctrl->subsys->lock should be held when calling this function
- */
 static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl,
-		struct nvmet_req *req)
+		struct device *p2p_client)
 {
 	struct nvmet_ns *ns;
+	unsigned long idx;
 
-	if (!req->p2p_client)
+	lockdep_assert_held(&ctrl->subsys->lock);
+
+	if (!p2p_client)
 		return;
 
-	ctrl->p2p_client = get_device(req->p2p_client);
+	ctrl->p2p_client = get_device(p2p_client);
 
-	list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link)
+	nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns)
 		nvmet_p2pmem_ns_add_p2p(ctrl, ns);
 }
 
-/*
- * Note: ctrl->subsys->lock should be held when calling this function
- */
 static void nvmet_release_p2p_ns_map(struct nvmet_ctrl *ctrl)
 {
 	struct radix_tree_iter iter;
 	void __rcu **slot;
 
+	lockdep_assert_held(&ctrl->subsys->lock);
+
 	radix_tree_for_each_slot(slot, &ctrl->p2p_ns_map, &iter, 0)
 		pci_dev_put(radix_tree_deref_slot(slot));
 
 	put_device(ctrl->p2p_client);
 }
 
-u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
-		struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp)
+static void nvmet_fatal_error_handler(struct work_struct *work)
+{
+	struct nvmet_ctrl *ctrl =
+			container_of(work, struct nvmet_ctrl, fatal_err_work);
+
+	pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid);
+	ctrl->ops->delete_ctrl(ctrl);
+}
+
+struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
 {
 	struct nvmet_subsys *subsys;
 	struct nvmet_ctrl *ctrl;
+	u32 kato = args->kato;
+	u8 dhchap_status;
 	int ret;
-	u16 status;
 
-	status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
-	subsys = nvmet_find_get_subsys(req->port, subsysnqn);
+	args->status = NVME_SC_CONNECT_INVALID_PARAM | NVME_STATUS_DNR;
+	subsys = nvmet_find_get_subsys(args->port, args->subsysnqn);
 	if (!subsys) {
 		pr_warn("connect request for invalid subsystem %s!\n",
-			subsysnqn);
-		req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
-		goto out;
+			args->subsysnqn);
+		args->result = IPO_IATTR_CONNECT_DATA(subsysnqn);
+		args->error_loc = offsetof(struct nvme_common_command, dptr);
+		return NULL;
 	}
 
-	status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
 	down_read(&nvmet_config_sem);
-	if (!nvmet_host_allowed(subsys, hostnqn)) {
+	if (!nvmet_host_allowed(subsys, args->hostnqn)) {
 		pr_info("connect by host %s for subsystem %s not allowed\n",
-			hostnqn, subsysnqn);
-		req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn);
+			args->hostnqn, args->subsysnqn);
+		args->result = IPO_IATTR_CONNECT_DATA(hostnqn);
 		up_read(&nvmet_config_sem);
-		status = NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR;
+		args->status = NVME_SC_CONNECT_INVALID_HOST | NVME_STATUS_DNR;
+		args->error_loc = offsetof(struct nvme_common_command, dptr);
 		goto out_put_subsystem;
 	}
 	up_read(&nvmet_config_sem);
 
-	status = NVME_SC_INTERNAL;
+	args->status = NVME_SC_INTERNAL;
 	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
 	if (!ctrl)
 		goto out_put_subsystem;
 	mutex_init(&ctrl->lock);
 
-	nvmet_init_cap(ctrl);
+	ctrl->port = args->port;
+	ctrl->ops = args->ops;
 
-	ctrl->port = req->port;
+#ifdef CONFIG_NVME_TARGET_PASSTHRU
+	/* By default, set loop targets to clear IDS by default */
+	if (ctrl->port->disc_addr.trtype == NVMF_TRTYPE_LOOP)
+		subsys->clear_ids = 1;
+#endif
 
 	INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
 	INIT_LIST_HEAD(&ctrl->async_events);
 	INIT_RADIX_TREE(&ctrl->p2p_ns_map, GFP_KERNEL);
+	INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler);
+	INIT_DELAYED_WORK(&ctrl->ka_work, nvmet_keep_alive_timer);
 
-	memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE);
-	memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE);
+	memcpy(ctrl->hostnqn, args->hostnqn, NVMF_NQN_SIZE);
 
 	kref_init(&ctrl->ref);
 	ctrl->subsys = subsys;
+	ctrl->pi_support = ctrl->port->pi_enable && ctrl->subsys->pi_support;
+	nvmet_init_cap(ctrl);
 	WRITE_ONCE(ctrl->aen_enabled, NVMET_AEN_CFG_OPTIONAL);
 
 	ctrl->changed_ns_list = kmalloc_array(NVME_MAX_CHANGED_NAMESPACES,
@@ -1226,34 +1641,31 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 	if (!ctrl->changed_ns_list)
 		goto out_free_ctrl;
 
-	ctrl->cqs = kcalloc(subsys->max_qid + 1,
-			sizeof(struct nvmet_cq *),
-			GFP_KERNEL);
-	if (!ctrl->cqs)
-		goto out_free_changed_ns_list;
-
 	ctrl->sqs = kcalloc(subsys->max_qid + 1,
			sizeof(struct nvmet_sq *),
 			GFP_KERNEL);
 	if (!ctrl->sqs)
-		goto out_free_cqs;
+		goto out_free_changed_ns_list;
 
-	ret = ida_simple_get(&cntlid_ida,
-			     NVME_CNTLID_MIN, NVME_CNTLID_MAX,
+	ctrl->cqs = kcalloc(subsys->max_qid + 1, sizeof(struct nvmet_cq *),
+			    GFP_KERNEL);
+	if (!ctrl->cqs)
+		goto out_free_sqs;
+
+	ret = ida_alloc_range(&cntlid_ida,
+			     subsys->cntlid_min, subsys->cntlid_max,
 			     GFP_KERNEL);
 	if (ret < 0) {
-		status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR;
-		goto out_free_sqs;
+		args->status = NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR;
+		goto out_free_cqs;
 	}
 	ctrl->cntlid = ret;
 
-	ctrl->ops = req->ops;
-
 	/*
 	 * Discovery controllers may use some arbitrary high value
 	 * in order to cleanup stale discovery sessions
 	 */
-	if ((ctrl->subsys->type == NVME_NQN_DISC) && !kato)
+	if (nvmet_is_disc_subsys(ctrl->subsys) && !kato)
 		kato = NVMET_DISC_KATO_MS;
 
 	/* keep-alive timeout in seconds */
@@ -1265,26 +1677,58 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 	nvmet_start_keep_alive_timer(ctrl);
 
 	mutex_lock(&subsys->lock);
+	ret = nvmet_ctrl_init_pr(ctrl);
+	if (ret)
+		goto init_pr_fail;
 	list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
-	nvmet_setup_p2p_ns_map(ctrl, req);
+	nvmet_setup_p2p_ns_map(ctrl, args->p2p_client);
+	nvmet_debugfs_ctrl_setup(ctrl);
 	mutex_unlock(&subsys->lock);
 
-	*ctrlp = ctrl;
-	return 0;
+	if (args->hostid)
+		uuid_copy(&ctrl->hostid, args->hostid);
+
+	dhchap_status = nvmet_setup_auth(ctrl, args->sq);
+	if (dhchap_status) {
+		pr_err("Failed to setup authentication, dhchap status %u\n",
+		       dhchap_status);
+		nvmet_ctrl_put(ctrl);
+		if (dhchap_status == NVME_AUTH_DHCHAP_FAILURE_FAILED)
+			args->status =
+				NVME_SC_CONNECT_INVALID_HOST | NVME_STATUS_DNR;
+		else
+			args->status = NVME_SC_INTERNAL;
+		return NULL;
+	}
 
-out_free_sqs:
-	kfree(ctrl->sqs);
+	args->status = NVME_SC_SUCCESS;
+
+	pr_info("Created %s controller %d for subsystem %s for NQN %s%s%s%s.\n",
+		nvmet_is_disc_subsys(ctrl->subsys) ? "discovery" : "nvm",
+		ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn,
+		ctrl->pi_support ? " T10-PI is enabled" : "",
+		nvmet_has_auth(ctrl, args->sq) ? " with DH-HMAC-CHAP" : "",
+		nvmet_queue_tls_keyid(args->sq) ? ", TLS" : "");
+
+	return ctrl;
+
+init_pr_fail:
+	mutex_unlock(&subsys->lock);
+	nvmet_stop_keep_alive_timer(ctrl);
+	ida_free(&cntlid_ida, ctrl->cntlid);
 out_free_cqs:
 	kfree(ctrl->cqs);
+out_free_sqs:
+	kfree(ctrl->sqs);
 out_free_changed_ns_list:
 	kfree(ctrl->changed_ns_list);
 out_free_ctrl:
 	kfree(ctrl);
 out_put_subsystem:
 	nvmet_subsys_put(subsys);
-out:
-	return status;
+	return NULL;
 }
+EXPORT_SYMBOL_GPL(nvmet_alloc_ctrl);
 
 static void nvmet_ctrl_free(struct kref *ref)
 {
@@ -1292,6 +1736,7 @@ static void nvmet_ctrl_free(struct kref *ref)
 	struct nvmet_subsys *subsys = ctrl->subsys;
 
 	mutex_lock(&subsys->lock);
+	nvmet_ctrl_destroy_pr(ctrl);
 	nvmet_release_p2p_ns_map(ctrl);
 	list_del(&ctrl->subsys_entry);
 	mutex_unlock(&subsys->lock);
@@ -1301,8 +1746,13 @@ static void nvmet_ctrl_free(struct kref *ref)
 	flush_work(&ctrl->async_event_work);
 	cancel_work_sync(&ctrl->fatal_err_work);
 
-	ida_simple_remove(&cntlid_ida, ctrl->cntlid);
+	nvmet_destroy_auth(ctrl);
+
+	nvmet_debugfs_ctrl_free(ctrl);
 
+	ida_free(&cntlid_ida, ctrl->cntlid);
+
+	nvmet_async_events_free(ctrl);
 	kfree(ctrl->sqs);
 	kfree(ctrl->cqs);
 	kfree(ctrl->changed_ns_list);
@@ -1315,28 +1765,27 @@ void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)
 {
 	kref_put(&ctrl->ref, nvmet_ctrl_free);
 }
-
-static void nvmet_fatal_error_handler(struct work_struct *work)
-{
-	struct nvmet_ctrl *ctrl =
-			container_of(work, struct nvmet_ctrl, fatal_err_work);
-
-	pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid);
-	ctrl->ops->delete_ctrl(ctrl);
-}
+EXPORT_SYMBOL_GPL(nvmet_ctrl_put);
 
 void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl)
 {
 	mutex_lock(&ctrl->lock);
 	if (!(ctrl->csts & NVME_CSTS_CFS)) {
 		ctrl->csts |= NVME_CSTS_CFS;
-		INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler);
-		schedule_work(&ctrl->fatal_err_work);
+		queue_work(nvmet_wq, &ctrl->fatal_err_work);
 	}
 	mutex_unlock(&ctrl->lock);
 }
 EXPORT_SYMBOL_GPL(nvmet_ctrl_fatal_error);
 
+ssize_t nvmet_ctrl_host_traddr(struct nvmet_ctrl *ctrl,
+		char *traddr, size_t traddr_len)
+{
+	if (!ctrl->ops->host_traddr)
+		return -EOPNOTSUPP;
+	return ctrl->ops->host_traddr(ctrl, traddr, traddr_len);
+}
+
 static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
 		const char *subsysnqn)
 {
@@ -1352,6 +1801,13 @@ static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
 	}
 
 	down_read(&nvmet_config_sem);
+	if (!strncmp(nvmet_disc_subsys->subsysnqn, subsysnqn,
+		     NVMF_NQN_SIZE)) {
+		if (kref_get_unless_zero(&nvmet_disc_subsys->ref)) {
+			up_read(&nvmet_config_sem);
+			return nvmet_disc_subsys;
+		}
+	}
 	list_for_each_entry(p, &port->subsystems, entry) {
 		if (!strncmp(p->subsys->subsysnqn, subsysnqn,
 				NVMF_NQN_SIZE)) {
@@ -1369,43 +1825,76 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
 		enum nvme_subsys_type type)
 {
 	struct nvmet_subsys *subsys;
+	char serial[NVMET_SN_MAX_SIZE / 2];
+	int ret;
 
 	subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
 	if (!subsys)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
-	subsys->ver = NVME_VS(1, 3, 0); /* NVMe 1.3.0 */
+	subsys->ver = NVMET_DEFAULT_VS;
 	/* generate a random serial number as our controllers are ephemeral: */
-	get_random_bytes(&subsys->serial, sizeof(subsys->serial));
+	get_random_bytes(&serial, sizeof(serial));
+	bin2hex(subsys->serial, &serial, sizeof(serial));
+
+	subsys->model_number = kstrdup(NVMET_DEFAULT_CTRL_MODEL, GFP_KERNEL);
+	if (!subsys->model_number) {
+		ret = -ENOMEM;
+		goto free_subsys;
+	}
+
+	subsys->ieee_oui = 0;
+
+	subsys->firmware_rev = kstrndup(UTS_RELEASE, NVMET_FR_MAX_SIZE, GFP_KERNEL);
+	if (!subsys->firmware_rev) {
+		ret = -ENOMEM;
+		goto free_mn;
+	}
 
 	switch (type) {
	case NVME_NQN_NVME:
 		subsys->max_qid = NVMET_NR_QUEUES;
 		break;
 	case NVME_NQN_DISC:
+	case NVME_NQN_CURR:
 		subsys->max_qid = 0;
 		break;
 	default:
 		pr_err("%s: Unknown Subsystem type - %d\n", __func__, type);
-		kfree(subsys);
-		return NULL;
+		ret = -EINVAL;
+		goto free_fr;
 	}
 	subsys->type = type;
 	subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE,
 			GFP_KERNEL);
 	if (!subsys->subsysnqn) {
-		kfree(subsys);
-		return NULL;
+		ret = -ENOMEM;
		goto free_fr;
 	}
-
+	subsys->cntlid_min = NVME_CNTLID_MIN;
+	subsys->cntlid_max = NVME_CNTLID_MAX;
 	kref_init(&subsys->ref);
 
 	mutex_init(&subsys->lock);
-	INIT_LIST_HEAD(&subsys->namespaces);
+	xa_init(&subsys->namespaces);
 	INIT_LIST_HEAD(&subsys->ctrls);
 	INIT_LIST_HEAD(&subsys->hosts);
 
+	ret = nvmet_debugfs_subsys_setup(subsys);
+	if (ret)
+		goto free_subsysnqn;
+
 	return subsys;
+
+free_subsysnqn:
+	kfree(subsys->subsysnqn);
+free_fr:
+	kfree(subsys->firmware_rev);
+free_mn:
	kfree(subsys->model_number);
+free_subsys:
+	kfree(subsys);
+	return ERR_PTR(ret);
 }
 
 static void nvmet_subsys_free(struct kref *ref)
@@ -1413,9 +1902,18 @@ static void nvmet_subsys_free(struct kref *ref)
 	struct nvmet_subsys *subsys =
 		container_of(ref, struct nvmet_subsys, ref);
 
-	WARN_ON_ONCE(!list_empty(&subsys->namespaces));
+	WARN_ON_ONCE(!list_empty(&subsys->ctrls));
+	WARN_ON_ONCE(!list_empty(&subsys->hosts));
+	WARN_ON_ONCE(!xa_empty(&subsys->namespaces));
+
+	nvmet_debugfs_subsys_free(subsys);
+
+	xa_destroy(&subsys->namespaces);
+	nvmet_passthru_subsys_free(subsys);
 
 	kfree(subsys->subsysnqn);
+	kfree(subsys->model_number);
+	kfree(subsys->firmware_rev);
 	kfree(subsys);
 }
 
@@ -1436,31 +1934,56 @@ void nvmet_subsys_put(struct nvmet_subsys *subsys)
 
 static int __init nvmet_init(void)
 {
-	int error;
+	int error = -ENOMEM;
 
 	nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1;
 
+	nvmet_bvec_cache = kmem_cache_create("nvmet-bvec",
+			NVMET_MAX_MPOOL_BVEC * sizeof(struct bio_vec), 0,
+			SLAB_HWCACHE_ALIGN, NULL);
+	if (!nvmet_bvec_cache)
+		return -ENOMEM;
+
+	zbd_wq = alloc_workqueue("nvmet-zbd-wq", WQ_MEM_RECLAIM, 0);
+	if (!zbd_wq)
+		goto out_destroy_bvec_cache;
+
 	buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq",
 			WQ_MEM_RECLAIM, 0);
-	if (!buffered_io_wq) {
-		error = -ENOMEM;
-		goto out;
-	}
+	if (!buffered_io_wq)
+		goto out_free_zbd_work_queue;
+
+	nvmet_wq = alloc_workqueue("nvmet-wq",
+			WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_SYSFS, 0);
+	if (!nvmet_wq)
+		goto out_free_buffered_work_queue;
+
+	error = nvmet_init_debugfs();
+	if (error)
+		goto out_free_nvmet_work_queue;
 
 	error = nvmet_init_discovery();
 	if (error)
-		goto out_free_work_queue;
+		goto out_exit_debugfs;
 
 	error = nvmet_init_configfs();
 	if (error)
		goto out_exit_discovery;
+
 	return 0;
 
 out_exit_discovery:
 	nvmet_exit_discovery();
-out_free_work_queue:
+out_exit_debugfs:
+	nvmet_exit_debugfs();
+out_free_nvmet_work_queue:
+	destroy_workqueue(nvmet_wq);
+out_free_buffered_work_queue:
 	destroy_workqueue(buffered_io_wq);
-out:
+out_free_zbd_work_queue:
+	destroy_workqueue(zbd_wq);
+out_destroy_bvec_cache:
+	kmem_cache_destroy(nvmet_bvec_cache);
 	return error;
 }
 
@@ -1468,8 +1991,12 @@ static void __exit nvmet_exit(void)
 {
 	nvmet_exit_configfs();
 	nvmet_exit_discovery();
+	nvmet_exit_debugfs();
 	ida_destroy(&cntlid_ida);
+	destroy_workqueue(nvmet_wq);
 	destroy_workqueue(buffered_io_wq);
+	destroy_workqueue(zbd_wq);
+	kmem_cache_destroy(nvmet_bvec_cache);
 
 	BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
 	BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
@@ -1478,4 +2005,5 @@ static void __exit nvmet_exit(void)
 
 module_init(nvmet_init);
 module_exit(nvmet_exit);
 
+MODULE_DESCRIPTION("NVMe target core framework");
 MODULE_LICENSE("GPL v2");
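
Usage sketch (editorial note, not part of the commit): the hunks above export a refcounted queue-pair lifecycle — nvmet_cq_create(), nvmet_sq_create(), nvmet_sq_destroy() and nvmet_cq_put(). The fragment below strings those calls together the way a transport driver might; only the nvmet_* calls and their signatures come from the diff, while the my_transport_*() wrappers and their surrounding error handling are assumptions for illustration.

/*
 * Illustrative sketch only -- assumed transport-driver context around the
 * nvmet queue API introduced above.
 */
static u16 my_transport_install_queue(struct nvmet_ctrl *ctrl,
		struct nvmet_cq *cq, struct nvmet_sq *sq, u16 qid, u16 size)
{
	u16 status;

	/* Takes a controller reference and sets the CQ refcount to 1. */
	status = nvmet_cq_create(ctrl, cq, qid, size);
	if (status != NVME_SC_SUCCESS)
		return status;

	/* nvmet_sq_init(sq, cq) inside this helper pins the CQ. */
	status = nvmet_sq_create(ctrl, sq, cq, qid, size);
	if (status != NVME_SC_SUCCESS)
		nvmet_cq_put(cq);	/* drop the initial CQ reference */
	return status;
}

static void my_transport_remove_queue(struct nvmet_sq *sq, struct nvmet_cq *cq)
{
	/* Drains inflight requests and drops the SQ's hold on its CQ. */
	nvmet_sq_destroy(sq);
	/* Last reference frees the CQ (cf. nvmet_cq_in_use()). */
	nvmet_cq_put(cq);
}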
