Diffstat (limited to 'drivers/nvme')
-rw-r--r--  drivers/nvme/host/Kconfig          |   1
-rw-r--r--  drivers/nvme/host/core.c           | 183
-rw-r--r--  drivers/nvme/host/fabrics.c        |  13
-rw-r--r--  drivers/nvme/host/fc.c             |  13
-rw-r--r--  drivers/nvme/host/hwmon.c          |  14
-rw-r--r--  drivers/nvme/host/multipath.c      |  69
-rw-r--r--  drivers/nvme/host/nvme.h           |  41
-rw-r--r--  drivers/nvme/host/pci.c            |  40
-rw-r--r--  drivers/nvme/host/rdma.c           |  73
-rw-r--r--  drivers/nvme/host/tcp.c            |  93
-rw-r--r--  drivers/nvme/target/configfs.c     |   1
-rw-r--r--  drivers/nvme/target/core.c         |   8
-rw-r--r--  drivers/nvme/target/fc.c           |   4
-rw-r--r--  drivers/nvme/target/fcloop.c       |   2
-rw-r--r--  drivers/nvme/target/io-cmd-bdev.c  |   1
-rw-r--r--  drivers/nvme/target/loop.c         |   2
-rw-r--r--  drivers/nvme/target/passthru.c     |  27
-rw-r--r--  drivers/nvme/target/rdma.c         |   4
-rw-r--r--  drivers/nvme/target/tcp.c          |  10
19 files changed, 389 insertions, 210 deletions
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 3ed9786b88d8..a44d49d63968 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -73,6 +73,7 @@ config NVME_TCP
depends on INET
depends on BLK_DEV_NVME
select NVME_FABRICS
+ select CRYPTO
select CRYPTO_CRC32C
help
This provides support for the NVMe over Fabrics protocol using
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 88cff309d8e4..893e29624c16 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -241,17 +241,6 @@ static blk_status_t nvme_error_status(u16 status)
}
}
-static inline bool nvme_req_needs_retry(struct request *req)
-{
- if (blk_noretry_request(req))
- return false;
- if (nvme_req(req)->status & NVME_SC_DNR)
- return false;
- if (nvme_req(req)->retries >= nvme_max_retries)
- return false;
- return true;
-}
-
static void nvme_retry_req(struct request *req)
{
struct nvme_ns *ns = req->q->queuedata;
@@ -268,34 +257,67 @@ static void nvme_retry_req(struct request *req)
blk_mq_delay_kick_requeue_list(req->q, delay);
}
-void nvme_complete_rq(struct request *req)
+enum nvme_disposition {
+ COMPLETE,
+ RETRY,
+ FAILOVER,
+};
+
+static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
{
- blk_status_t status = nvme_error_status(nvme_req(req)->status);
+ if (likely(nvme_req(req)->status == 0))
+ return COMPLETE;
- trace_nvme_complete_rq(req);
+ if (blk_noretry_request(req) ||
+ (nvme_req(req)->status & NVME_SC_DNR) ||
+ nvme_req(req)->retries >= nvme_max_retries)
+ return COMPLETE;
- nvme_cleanup_cmd(req);
+ if (req->cmd_flags & REQ_NVME_MPATH) {
+ if (nvme_is_path_error(nvme_req(req)->status) ||
+ blk_queue_dying(req->q))
+ return FAILOVER;
+ } else {
+ if (blk_queue_dying(req->q))
+ return COMPLETE;
+ }
- if (nvme_req(req)->ctrl->kas)
- nvme_req(req)->ctrl->comp_seen = true;
+ return RETRY;
+}
- if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
- if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req))
- return;
+static inline void nvme_end_req(struct request *req)
+{
+ blk_status_t status = nvme_error_status(nvme_req(req)->status);
- if (!blk_queue_dying(req->q)) {
- nvme_retry_req(req);
- return;
- }
- } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
- req_op(req) == REQ_OP_ZONE_APPEND) {
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ req_op(req) == REQ_OP_ZONE_APPEND)
req->__sector = nvme_lba_to_sect(req->q->queuedata,
le64_to_cpu(nvme_req(req)->result.u64));
- }
nvme_trace_bio_complete(req, status);
blk_mq_end_request(req, status);
}
+
+void nvme_complete_rq(struct request *req)
+{
+ trace_nvme_complete_rq(req);
+ nvme_cleanup_cmd(req);
+
+ if (nvme_req(req)->ctrl->kas)
+ nvme_req(req)->ctrl->comp_seen = true;
+
+ switch (nvme_decide_disposition(req)) {
+ case COMPLETE:
+ nvme_end_req(req);
+ return;
+ case RETRY:
+ nvme_retry_req(req);
+ return;
+ case FAILOVER:
+ nvme_failover_req(req);
+ return;
+ }
+}
EXPORT_SYMBOL_GPL(nvme_complete_rq);
bool nvme_cancel_request(struct request *req, void *data, bool reserved)
@@ -330,7 +352,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
case NVME_CTRL_RESETTING:
case NVME_CTRL_CONNECTING:
changed = true;
- /* FALLTHRU */
+ fallthrough;
default:
break;
}
@@ -340,7 +362,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
case NVME_CTRL_NEW:
case NVME_CTRL_LIVE:
changed = true;
- /* FALLTHRU */
+ fallthrough;
default:
break;
}
@@ -350,7 +372,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
case NVME_CTRL_NEW:
case NVME_CTRL_RESETTING:
changed = true;
- /* FALLTHRU */
+ fallthrough;
default:
break;
}
@@ -361,7 +383,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
case NVME_CTRL_RESETTING:
case NVME_CTRL_CONNECTING:
changed = true;
- /* FALLTHRU */
+ fallthrough;
default:
break;
}
@@ -371,7 +393,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
case NVME_CTRL_DELETING:
case NVME_CTRL_DEAD:
changed = true;
- /* FALLTHRU */
+ fallthrough;
default:
break;
}
@@ -380,7 +402,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
switch (old_state) {
case NVME_CTRL_DELETING:
changed = true;
- /* FALLTHRU */
+ fallthrough;
default:
break;
}
@@ -2004,13 +2026,49 @@ static void nvme_update_disk_info(struct gendisk *disk,
blk_mq_unfreeze_queue(disk->queue);
}
+static inline bool nvme_first_scan(struct gendisk *disk)
+{
+ /* nvme_alloc_ns() scans the disk prior to adding it */
+ return !(disk->flags & GENHD_FL_UP);
+}
+
+static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
+{
+ struct nvme_ctrl *ctrl = ns->ctrl;
+ u32 iob;
+
+ if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
+ is_power_of_2(ctrl->max_hw_sectors))
+ iob = ctrl->max_hw_sectors;
+ else
+ iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
+
+ if (!iob)
+ return;
+
+ if (!is_power_of_2(iob)) {
+ if (nvme_first_scan(ns->disk))
+ pr_warn("%s: ignoring unaligned IO boundary:%u\n",
+ ns->disk->disk_name, iob);
+ return;
+ }
+
+ if (blk_queue_is_zoned(ns->disk->queue)) {
+ if (nvme_first_scan(ns->disk))
+ pr_warn("%s: ignoring zoned namespace IO boundary\n",
+ ns->disk->disk_name);
+ return;
+ }
+
+ blk_queue_chunk_sectors(ns->queue, iob);
+}
+
static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
{
unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
struct nvme_ns *ns = disk->private_data;
struct nvme_ctrl *ctrl = ns->ctrl;
int ret;
- u32 iob;
/*
* If identify namespace failed, use default 512 byte block size so
@@ -2038,12 +2096,6 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
return -ENODEV;
}
- if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
- is_power_of_2(ctrl->max_hw_sectors))
- iob = ctrl->max_hw_sectors;
- else
- iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
-
ns->features = 0;
ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
/* the PI implementation requires metadata equal t10 pi tuple size */
@@ -2075,8 +2127,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
}
}
- if (iob)
- blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(iob));
+ nvme_set_chunk_sectors(ns, id);
nvme_update_disk_info(disk, ns, id);
#ifdef CONFIG_NVME_MULTIPATH
if (ns->head->disk) {
@@ -2965,14 +3016,14 @@ static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi)
{
struct nvme_cel *cel, *ret = NULL;
- spin_lock(&ctrl->lock);
+ spin_lock_irq(&ctrl->lock);
list_for_each_entry(cel, &ctrl->cels, entry) {
if (cel->csi == csi) {
ret = cel;
break;
}
}
- spin_unlock(&ctrl->lock);
+ spin_unlock_irq(&ctrl->lock);
return ret;
}
@@ -2990,7 +3041,7 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
if (!cel)
return -ENOMEM;
- ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, csi,
+ ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi,
&cel->log, sizeof(cel->log), 0);
if (ret) {
kfree(cel);
@@ -2999,9 +3050,9 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
cel->csi = csi;
- spin_lock(&ctrl->lock);
+ spin_lock_irq(&ctrl->lock);
list_add_tail(&cel->entry, &ctrl->cels);
- spin_unlock(&ctrl->lock);
+ spin_unlock_irq(&ctrl->lock);
out:
*log = &cel->log;
return 0;
@@ -3185,8 +3236,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
if (ret < 0)
return ret;
- if (!ctrl->identified)
- nvme_hwmon_init(ctrl);
+ if (!ctrl->identified) {
+ ret = nvme_hwmon_init(ctrl);
+ if (ret < 0)
+ return ret;
+ }
ctrl->identified = true;
@@ -3210,10 +3264,26 @@ static int nvme_dev_open(struct inode *inode, struct file *file)
return -EWOULDBLOCK;
}
+ nvme_get_ctrl(ctrl);
+ if (!try_module_get(ctrl->ops->module)) {
+ nvme_put_ctrl(ctrl);
+ return -EINVAL;
+ }
+
file->private_data = ctrl;
return 0;
}
+static int nvme_dev_release(struct inode *inode, struct file *file)
+{
+ struct nvme_ctrl *ctrl =
+ container_of(inode->i_cdev, struct nvme_ctrl, cdev);
+
+ module_put(ctrl->ops->module);
+ nvme_put_ctrl(ctrl);
+ return 0;
+}
+
static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
{
struct nvme_ns *ns;
@@ -3276,6 +3346,7 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
static const struct file_operations nvme_dev_fops = {
.owner = THIS_MODULE,
.open = nvme_dev_open,
+ .release = nvme_dev_release,
.unlocked_ioctl = nvme_dev_ioctl,
.compat_ioctl = compat_ptr_ioctl,
};
@@ -3474,10 +3545,6 @@ static ssize_t nvme_sysfs_delete(struct device *dev,
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
- /* Can't delete non-created controllers */
- if (!ctrl->created)
- return -EBUSY;
-
if (device_remove_file_self(dev, attr))
nvme_delete_ctrl_sync(ctrl);
return count;
@@ -3654,6 +3721,10 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
return 0;
if (a == &dev_attr_hostid.attr && !ctrl->opts)
return 0;
+ if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts)
+ return 0;
+ if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts)
+ return 0;
return a->mode;
}
@@ -4348,7 +4419,6 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
nvme_queue_scan(ctrl);
nvme_start_queues(ctrl);
}
- ctrl->created = true;
}
EXPORT_SYMBOL_GPL(nvme_start_ctrl);
@@ -4368,7 +4438,7 @@ static void nvme_free_ctrl(struct device *dev)
struct nvme_subsystem *subsys = ctrl->subsys;
struct nvme_cel *cel, *next;
- if (subsys && ctrl->instance != subsys->instance)
+ if (!subsys || ctrl->instance != subsys->instance)
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
list_for_each_entry_safe(cel, next, &ctrl->cels, entry) {
@@ -4512,7 +4582,7 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_unfreeze);
-void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
+int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
{
struct nvme_ns *ns;
@@ -4523,6 +4593,7 @@ void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
break;
}
up_read(&ctrl->namespaces_rwsem);
+ return timeout;
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
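The core.c hunks above replace the old inline retry checks in nvme_complete_rq() with a three-way disposition (COMPLETE, RETRY, FAILOVER) plus a small nvme_end_req() helper. Below is a minimal userspace sketch of that decision logic, useful for reasoning about which completions retry versus fail over. The struct fake_req, the REQ_NVME_MPATH stand-in bit, and the helper names are simplified assumptions, and the NVME_SC_DNR value and path-status mask mirror include/linux/nvme.h to the best of my knowledge.

/*
 * Userspace model of nvme_decide_disposition() introduced above.
 * struct fake_req and REQ_NVME_MPATH are simplified stand-ins
 * (assumptions), not the kernel's types.
 */
#include <stdbool.h>
#include <stdio.h>

#define NVME_SC_DNR    0x4000       /* "do not retry" bit (assumed value) */
#define REQ_NVME_MPATH (1u << 0)    /* stand-in for the real request flag */

enum nvme_disposition { COMPLETE, RETRY, FAILOVER };

struct fake_req {
    unsigned short status;          /* NVMe status with the phase bit stripped */
    unsigned int cmd_flags;
    int retries, max_retries;
    bool noretry, queue_dying;
};

static bool is_path_error(unsigned short status)
{
    return (status & 0x700) == 0x300;   /* status code type == path related */
}

static enum nvme_disposition decide(const struct fake_req *req)
{
    if (req->status == 0)
        return COMPLETE;
    if (req->noretry || (req->status & NVME_SC_DNR) ||
        req->retries >= req->max_retries)
        return COMPLETE;
    if (req->cmd_flags & REQ_NVME_MPATH) {
        if (is_path_error(req->status) || req->queue_dying)
            return FAILOVER;
    } else if (req->queue_dying) {
        return COMPLETE;
    }
    return RETRY;
}

int main(void)
{
    struct fake_req r = {
        .status = 0x370,            /* host path error (assumed value) */
        .cmd_flags = REQ_NVME_MPATH,
        .max_retries = 5,
    };
    printf("disposition = %d (2 == FAILOVER)\n", decide(&r));
    return 0;
}

Built with a plain C compiler, the example reports FAILOVER for a multipath request that completed with a host path error, which is the case the new FAILOVER branch is meant to catch.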
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 4ec4829d6233..8575724734e0 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -565,10 +565,14 @@ bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
struct nvme_request *req = nvme_req(rq);
/*
- * If we are in some state of setup or teardown only allow
- * internally generated commands.
+ * Currently we have a problem sending passthru commands
+ * on the admin_q if the controller is not LIVE, because we can't
+ * make sure that they are going out after the admin connect,
+ * controller enable and/or other commands in the initialization
+ * sequence. Until the controller is LIVE, fail with
+ * BLK_STS_RESOURCE so that they will be rescheduled.
*/
- if (!blk_rq_is_passthrough(rq) || (req->flags & NVME_REQ_USERCMD))
+ if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD))
return false;
/*
@@ -576,9 +580,8 @@ bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
* which is required to set the queue live in the appropriate states.
*/
switch (ctrl->state) {
- case NVME_CTRL_NEW:
case NVME_CTRL_CONNECTING:
- if (nvme_is_fabrics(req->cmd) &&
+ if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
return true;
break;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index eae43bb444e0..e2e09e25c056 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2035,7 +2035,7 @@ done:
}
__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
- if (!nvme_end_request(rq, status, result))
+ if (!nvme_try_complete_req(rq, status, result))
nvme_fc_complete_rq(rq);
check_error:
@@ -2078,7 +2078,7 @@ __nvme_fc_init_request(struct nvme_fc_ctrl *ctrl,
if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.cmddma)) {
dev_err(ctrl->dev,
"FCP Op failed - cmdiu dma mapping failed.\n");
- ret = EFAULT;
+ ret = -EFAULT;
goto out_on_error;
}
@@ -2088,7 +2088,7 @@ __nvme_fc_init_request(struct nvme_fc_ctrl *ctrl,
if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.rspdma)) {
dev_err(ctrl->dev,
"FCP Op failed - rspiu dma mapping failed.\n");
- ret = EFAULT;
+ ret = -EFAULT;
}
atomic_set(&op->state, FCPOP_STATE_IDLE);
@@ -2160,6 +2160,7 @@ nvme_fc_term_aen_ops(struct nvme_fc_ctrl *ctrl)
struct nvme_fc_fcp_op *aen_op;
int i;
+ cancel_work_sync(&ctrl->ctrl.async_event_work);
aen_op = ctrl->aen_ops;
for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) {
__nvme_fc_exit_request(ctrl, aen_op);
@@ -3670,12 +3671,14 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts)
spin_lock_irqsave(&nvme_fc_lock, flags);
list_for_each_entry(lport, &nvme_fc_lport_list, port_list) {
if (lport->localport.node_name != laddr.nn ||
- lport->localport.port_name != laddr.pn)
+ lport->localport.port_name != laddr.pn ||
+ lport->localport.port_state != FC_OBJSTATE_ONLINE)
continue;
list_for_each_entry(rport, &lport->endp_list, endp_list) {
if (rport->remoteport.node_name != raddr.nn ||
- rport->remoteport.port_name != raddr.pn)
+ rport->remoteport.port_name != raddr.pn ||
+ rport->remoteport.port_state != FC_OBJSTATE_ONLINE)
continue;
/* if fail to get reference fall through. Will error */
diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c
index 412a6c97c0d8..552dbc04567b 100644
--- a/drivers/nvme/host/hwmon.c
+++ b/drivers/nvme/host/hwmon.c
@@ -59,12 +59,8 @@ static int nvme_set_temp_thresh(struct nvme_ctrl *ctrl, int sensor, bool under,
static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data)
{
- int ret;
-
- ret = nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
+ return nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
NVME_CSI_NVM, &data->log, sizeof(data->log), 0);
-
- return ret <= 0 ? ret : -EIO;
}
static int nvme_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
@@ -225,7 +221,7 @@ static const struct hwmon_chip_info nvme_hwmon_chip_info = {
.info = nvme_hwmon_info,
};
-void nvme_hwmon_init(struct nvme_ctrl *ctrl)
+int nvme_hwmon_init(struct nvme_ctrl *ctrl)
{
struct device *dev = ctrl->dev;
struct nvme_hwmon_data *data;
@@ -234,7 +230,7 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl)
data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
if (!data)
- return;
+ return 0;
data->ctrl = ctrl;
mutex_init(&data->read_lock);
@@ -244,7 +240,7 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl)
dev_warn(ctrl->device,
"Failed to read smart log (error %d)\n", err);
devm_kfree(dev, data);
- return;
+ return err;
}
hwmon = devm_hwmon_device_register_with_info(dev, "nvme", data,
@@ -254,4 +250,6 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl)
dev_warn(dev, "Failed to instantiate hwmon device\n");
devm_kfree(dev, data);
}
+
+ return 0;
}
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 3ded54d2c9c6..d4ba736c6c89 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -65,51 +65,30 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
}
}
-bool nvme_failover_req(struct request *req)
+void nvme_failover_req(struct request *req)
{
struct nvme_ns *ns = req->q->queuedata;
- u16 status = nvme_req(req)->status;
+ u16 status = nvme_req(req)->status & 0x7ff;
unsigned long flags;
- switch (status & 0x7ff) {
- case NVME_SC_ANA_TRANSITION:
- case NVME_SC_ANA_INACCESSIBLE:
- case NVME_SC_ANA_PERSISTENT_LOSS:
- /*
- * If we got back an ANA error we know the controller is alive,
- * but not ready to serve this namespaces. The spec suggests
- * we should update our general state here, but due to the fact
- * that the admin and I/O queues are not serialized that is
- * fundamentally racy. So instead just clear the current path,
- * mark the the path as pending and kick of a re-read of the ANA
- * log page ASAP.
- */
- nvme_mpath_clear_current_path(ns);
- if (ns->ctrl->ana_log_buf) {
- set_bit(NVME_NS_ANA_PENDING, &ns->flags);
- queue_work(nvme_wq, &ns->ctrl->ana_work);
- }
- break;
- case NVME_SC_HOST_PATH_ERROR:
- case NVME_SC_HOST_ABORTED_CMD:
- /*
- * Temporary transport disruption in talking to the controller.
- * Try to send on a new path.
- */
- nvme_mpath_clear_current_path(ns);
- break;
- default:
- /* This was a non-ANA error so follow the normal error path. */
- return false;
+ nvme_mpath_clear_current_path(ns);
+
+ /*
+ * If we got back an ANA error, we know the controller is alive but not
+ * ready to serve this namespace.  Kick off a re-read of the ANA
+ * information page, and just try any other available path for now.
+ */
+ if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
+ set_bit(NVME_NS_ANA_PENDING, &ns->flags);
+ queue_work(nvme_wq, &ns->ctrl->ana_work);
}
spin_lock_irqsave(&ns->head->requeue_lock, flags);
blk_steal_bios(&ns->head->requeue_list, req);
spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
- blk_mq_end_request(req, 0);
+ blk_mq_end_request(req, 0);
kblockd_schedule_work(&ns->head->requeue_work);
- return true;
}
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
@@ -233,7 +212,7 @@ static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
int node, struct nvme_ns *old)
{
- struct nvme_ns *ns, *found, *fallback = NULL;
+ struct nvme_ns *ns, *found = NULL;
if (list_is_singular(&head->list)) {
if (nvme_path_is_disabled(old))
@@ -252,18 +231,22 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
goto out;
}
if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
- fallback = ns;
+ found = ns;
}
- /* No optimized path found, re-check the current path */
+ /*
+ * The loop above skips the current path for round-robin semantics.
+ * Fall back to the current path if either:
+ * - no other optimized path found and current is optimized,
+ * - no other usable path found and current is usable.
+ */
if (!nvme_path_is_disabled(old) &&
- old->ana_state == NVME_ANA_OPTIMIZED) {
- found = old;
- goto out;
- }
- if (!fallback)
+ (old->ana_state == NVME_ANA_OPTIMIZED ||
+ (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
+ return old;
+
+ if (!found)
return NULL;
- found = fallback;
out:
rcu_assign_pointer(head->current_path[node], found);
return found;
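For the round-robin rework above: the old code kept a separate "fallback" pointer, while the new code reuses "found" and only returns the current path when it is optimized, or when nothing else usable was found and the current path is at least non-optimized. A compilable sketch of that rule, assuming made-up path names and a flattened ana/disabled state in place of the real struct nvme_ns list:

/* Userspace model of the reworked round-robin fallback above. */
#include <stdbool.h>
#include <stdio.h>

enum ana_state { OPTIMIZED, NONOPTIMIZED, INACCESSIBLE };

struct path {                       /* hypothetical flattened view of a path */
    const char *name;
    enum ana_state ana;
    bool disabled;
};

/* Scan the other paths starting after 'old' (wrapping), like nvme_next_ns(). */
static const struct path *rr_select(const struct path *paths, int n, int old)
{
    const struct path *found = NULL;
    int i;

    for (i = 1; i < n; i++) {
        const struct path *ns = &paths[(old + i) % n];

        if (ns->disabled)
            continue;
        if (ns->ana == OPTIMIZED)
            return ns;              /* another optimized path wins outright */
        if (ns->ana == NONOPTIMIZED)
            found = ns;             /* remember a usable fallback */
    }
    /*
     * Fall back to the current path only if it is optimized, or if it is
     * the only usable path left (nothing else found and it is usable).
     */
    if (!paths[old].disabled &&
        (paths[old].ana == OPTIMIZED ||
         (!found && paths[old].ana == NONOPTIMIZED)))
        return &paths[old];

    return found;                   /* may be NULL: no usable path at all */
}

int main(void)
{
    struct path p[] = {
        { "nvme0c0n1", NONOPTIMIZED, false },   /* current path */
        { "nvme0c1n1", INACCESSIBLE, false },
        { "nvme0c2n1", NONOPTIMIZED, false },
    };
    const struct path *sel = rr_select(p, 3, 0);

    printf("selected: %s\n", sel ? sel->name : "(none)");
    return 0;
}

With the sample paths above, the other non-optimized path is selected rather than staying on the current one, preserving the round-robin intent.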
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ebb8c3ed3885..2aaedfa43ed8 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -307,7 +307,6 @@ struct nvme_ctrl {
struct nvme_command ka_cmd;
struct work_struct fw_act_work;
unsigned long events;
- bool created;
#ifdef CONFIG_NVME_MULTIPATH
/* asymmetric namespace access: */
@@ -523,7 +522,31 @@ static inline u32 nvme_bytes_to_numd(size_t len)
return (len >> 2) - 1;
}
-static inline bool nvme_end_request(struct request *req, __le16 status,
+static inline bool nvme_is_ana_error(u16 status)
+{
+ switch (status & 0x7ff) {
+ case NVME_SC_ANA_TRANSITION:
+ case NVME_SC_ANA_INACCESSIBLE:
+ case NVME_SC_ANA_PERSISTENT_LOSS:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool nvme_is_path_error(u16 status)
+{
+ /* check for a status code type of 'path related status' */
+ return (status & 0x700) == 0x300;
+}
+
+/*
+ * Fill in the status and result information from the CQE, and then figure out
+ * if blk-mq will need to use IPI magic to complete the request, and if yes do
+ * so. If not let the caller complete the request without an indirect function
+ * call.
+ */
+static inline bool nvme_try_complete_req(struct request *req, __le16 status,
union nvme_result result)
{
struct nvme_request *rq = nvme_req(req);
@@ -581,7 +604,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl);
void nvme_sync_queues(struct nvme_ctrl *ctrl);
void nvme_unfreeze(struct nvme_ctrl *ctrl);
void nvme_wait_freeze(struct nvme_ctrl *ctrl);
-void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
+int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
void nvme_start_freeze(struct nvme_ctrl *ctrl);
#define NVME_QID_ANY -1
@@ -629,7 +652,7 @@ void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
struct nvme_ctrl *ctrl, int *flags);
-bool nvme_failover_req(struct request *req);
+void nvme_failover_req(struct request *req);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head);
void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
@@ -688,9 +711,8 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
}
-static inline bool nvme_failover_req(struct request *req)
+static inline void nvme_failover_req(struct request *req)
{
- return false;
}
static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
@@ -805,9 +827,12 @@ static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
}
#ifdef CONFIG_NVME_HWMON
-void nvme_hwmon_init(struct nvme_ctrl *ctrl);
+int nvme_hwmon_init(struct nvme_ctrl *ctrl);
#else
-static inline void nvme_hwmon_init(struct nvme_ctrl *ctrl) { }
+static inline int nvme_hwmon_init(struct nvme_ctrl *ctrl)
+{
+ return 0;
+}
#endif
u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
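The new nvme_is_path_error() helper above keys off the Status Code Type field of the (already right-shifted) status word: masking with 0x700 isolates SCT, and SCT 0x3 is the path-related group, which covers both the ANA codes and the host-side path errors. A quick standalone check of that mask; the numeric status values below are assumptions recalled from include/linux/nvme.h:

/* Standalone check of the SCT mask used by nvme_is_path_error() above. */
#include <stdio.h>

/* Status values assumed from include/linux/nvme.h. */
#define NVME_SC_ANA_TRANSITION   0x303   /* path related (SCT 0x3) */
#define NVME_SC_HOST_PATH_ERROR  0x370   /* path related (SCT 0x3) */
#define NVME_SC_WRITE_FAULT      0x280   /* media error   (SCT 0x2) */

static int is_path_error(unsigned short status)
{
    return (status & 0x700) == 0x300;    /* bits 10:8 hold the status code type */
}

int main(void)
{
    printf("ANA transition : %d\n", is_path_error(NVME_SC_ANA_TRANSITION));  /* 1 */
    printf("host path error: %d\n", is_path_error(NVME_SC_HOST_PATH_ERROR)); /* 1 */
    printf("write fault    : %d\n", is_path_error(NVME_SC_WRITE_FAULT));     /* 0 */
    return 0;
}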
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ba725ae47305..8984796db0c8 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -120,7 +120,7 @@ struct nvme_dev {
unsigned max_qid;
unsigned io_queues[HCTX_MAX_TYPES];
unsigned int num_vecs;
- u16 q_depth;
+ u32 q_depth;
int io_sqes;
u32 db_stride;
void __iomem *bar;
@@ -157,13 +157,13 @@ struct nvme_dev {
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
{
int ret;
- u16 n;
+ u32 n;
- ret = kstrtou16(val, 10, &n);
+ ret = kstrtou32(val, 10, &n);
if (ret != 0 || n < 2)
return -EINVAL;
- return param_set_ushort(val, kp);
+ return param_set_uint(val, kp);
}
static inline unsigned int sq_idx(unsigned int qid, u32 stride)
@@ -195,7 +195,7 @@ struct nvme_queue {
dma_addr_t sq_dma_addr;
dma_addr_t cq_dma_addr;
u32 __iomem *q_db;
- u16 q_depth;
+ u32 q_depth;
u16 cq_vector;
u16 sq_tail;
u16 cq_head;
@@ -940,13 +940,6 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
struct nvme_completion *cqe = &nvmeq->cqes[idx];
struct request *req;
- if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
- dev_warn(nvmeq->dev->ctrl.device,
- "invalid id %d completed on queue %d\n",
- cqe->command_id, le16_to_cpu(cqe->sq_id));
- return;
- }
-
/*
* AEN requests are special as they don't time out and can
* survive any kind of queue freeze and often don't respond to
@@ -960,8 +953,15 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
}
req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
+ if (unlikely(!req)) {
+ dev_warn(nvmeq->dev->ctrl.device,
+ "invalid id %d completed on queue %d\n",
+ cqe->command_id, le16_to_cpu(cqe->sq_id));
+ return;
+ }
+
trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
- if (!nvme_end_request(req, cqe->status, cqe->result))
+ if (!nvme_try_complete_req(req, cqe->status, cqe->result))
nvme_pci_complete_rq(req);
}
@@ -1244,13 +1244,13 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
switch (dev->ctrl.state) {
case NVME_CTRL_CONNECTING:
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
- /* fall through */
+ fallthrough;
case NVME_CTRL_DELETING:
dev_warn_ratelimited(dev->ctrl.device,
"I/O %d QID %d timeout, disable controller\n",
req->tag, nvmeq->qid);
- nvme_dev_disable(dev, true);
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
+ nvme_dev_disable(dev, true);
return BLK_EH_DONE;
case NVME_CTRL_RESETTING:
return BLK_EH_RESET_TIMER;
@@ -1267,10 +1267,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
dev_warn(dev->ctrl.device,
"I/O %d QID %d timeout, reset controller\n",
req->tag, nvmeq->qid);
+ nvme_req(req)->flags |= NVME_REQ_CANCELLED;
nvme_dev_disable(dev, false);
nvme_reset_ctrl(&dev->ctrl);
- nvme_req(req)->flags |= NVME_REQ_CANCELLED;
return BLK_EH_DONE;
}
@@ -2320,7 +2320,7 @@ static int nvme_pci_enable(struct nvme_dev *dev)
dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
- dev->q_depth = min_t(u16, NVME_CAP_MQES(dev->ctrl.cap) + 1,
+ dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1,
io_queue_depth);
dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
@@ -2460,7 +2460,8 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
- PAGE_SIZE, PAGE_SIZE, 0);
+ NVME_CTRL_PAGE_SIZE,
+ NVME_CTRL_PAGE_SIZE, 0);
if (!dev->prp_page_pool)
return -ENOMEM;
@@ -3152,7 +3153,8 @@ static const struct pci_device_id nvme_id_table[] = {
{ PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
NVME_QUIRK_MEDIUM_PRIO_SQ |
- NVME_QUIRK_NO_TEMP_THRESH_CHANGE },
+ NVME_QUIRK_NO_TEMP_THRESH_CHANGE |
+ NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_VDEVICE(INTEL, 0xf1a6), /* Intel 760p/Pro 7600p */
.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
{ PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 44c76ffbb264..9e378d0a0c01 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -122,6 +122,7 @@ struct nvme_rdma_ctrl {
struct sockaddr_storage src_addr;
struct nvme_ctrl ctrl;
+ struct mutex teardown_lock;
bool use_inline_data;
u32 io_queues[HCTX_MAX_TYPES];
};
@@ -834,6 +835,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
}
if (ctrl->async_event_sqe.data) {
+ cancel_work_sync(&ctrl->ctrl.async_event_work);
nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
sizeof(struct nvme_command), DMA_TO_DEVICE);
ctrl->async_event_sqe.data = NULL;
@@ -975,7 +977,15 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
if (!new) {
nvme_start_queues(&ctrl->ctrl);
- nvme_wait_freeze(&ctrl->ctrl);
+ if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
+ /*
+ * If we timed out waiting for freeze we are likely to
+ * be stuck. Fail the controller initialization just
+ * to be safe.
+ */
+ ret = -ENODEV;
+ goto out_wait_freeze_timed_out;
+ }
blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
ctrl->ctrl.queue_count - 1);
nvme_unfreeze(&ctrl->ctrl);
@@ -983,6 +993,9 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
return 0;
+out_wait_freeze_timed_out:
+ nvme_stop_queues(&ctrl->ctrl);
+ nvme_rdma_stop_io_queues(ctrl);
out_cleanup_connect_q:
if (new)
blk_cleanup_queue(ctrl->ctrl.connect_q);
@@ -997,6 +1010,7 @@ out_free_io_queues:
static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
bool remove)
{
+ mutex_lock(&ctrl->teardown_lock);
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_stop_queue(&ctrl->queues[0]);
if (ctrl->ctrl.admin_tagset) {
@@ -1007,11 +1021,13 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
if (remove)
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_destroy_admin_queue(ctrl, remove);
+ mutex_unlock(&ctrl->teardown_lock);
}
static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
bool remove)
{
+ mutex_lock(&ctrl->teardown_lock);
if (ctrl->ctrl.queue_count > 1) {
nvme_start_freeze(&ctrl->ctrl);
nvme_stop_queues(&ctrl->ctrl);
@@ -1025,6 +1041,7 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
nvme_start_queues(&ctrl->ctrl);
nvme_rdma_destroy_io_queues(ctrl, remove);
}
+ mutex_unlock(&ctrl->teardown_lock);
}
static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
@@ -1180,6 +1197,7 @@ static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
return;
+ dev_warn(ctrl->ctrl.device, "starting error recovery\n");
queue_work(nvme_reset_wq, &ctrl->err_work);
}
@@ -1189,7 +1207,7 @@ static void nvme_rdma_end_request(struct nvme_rdma_request *req)
if (!refcount_dec_and_test(&req->ref))
return;
- if (!nvme_end_request(rq, req->status, req->result))
+ if (!nvme_try_complete_req(rq, req->status, req->result))
nvme_rdma_complete_rq(rq);
}
@@ -1915,7 +1933,7 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
case RDMA_CM_EVENT_CONNECT_ERROR:
case RDMA_CM_EVENT_UNREACHABLE:
nvme_rdma_destroy_queue_ib(queue);
- /* fall through */
+ fallthrough;
case RDMA_CM_EVENT_ADDR_ERROR:
dev_dbg(queue->ctrl->ctrl.device,
"CM error event %d\n", ev->event);
@@ -1946,6 +1964,22 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
return 0;
}
+static void nvme_rdma_complete_timed_out(struct request *rq)
+{
+ struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
+ struct nvme_rdma_queue *queue = req->queue;
+ struct nvme_rdma_ctrl *ctrl = queue->ctrl;
+
+ /* fence other contexts that may complete the command */
+ mutex_lock(&ctrl->teardown_lock);
+ nvme_rdma_stop_queue(queue);
+ if (!blk_mq_request_completed(rq)) {
+ nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
+ blk_mq_complete_request(rq);
+ }
+ mutex_unlock(&ctrl->teardown_lock);
+}
+
static enum blk_eh_timer_return
nvme_rdma_timeout(struct request *rq, bool reserved)
{
@@ -1956,29 +1990,29 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
rq->tag, nvme_rdma_queue_idx(queue));
- /*
- * Restart the timer if a controller reset is already scheduled. Any
- * timed out commands would be handled before entering the connecting
- * state.
- */
- if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
- return BLK_EH_RESET_TIMER;
-
if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
/*
- * Teardown immediately if controller times out while starting
- * or we are already started error recovery. all outstanding
- * requests are completed on shutdown, so we return BLK_EH_DONE.
+ * If we are resetting, connecting or deleting, we should
+ * complete immediately because we may block the controller
+ * teardown or setup sequence:
+ * - ctrl disable/shutdown fabrics requests
+ * - connect requests
+ * - initialization admin requests
+ * - I/O requests that entered after unquiescing and
+ * the controller stopped responding
+ *
+ * All other requests should be cancelled by the error
+ * recovery work, so it's fine that we fail it here.
*/
- flush_work(&ctrl->err_work);
- nvme_rdma_teardown_io_queues(ctrl, false);
- nvme_rdma_teardown_admin_queue(ctrl, false);
+ nvme_rdma_complete_timed_out(rq);
return BLK_EH_DONE;
}
- dev_warn(ctrl->ctrl.device, "starting error recovery\n");
+ /*
+ * LIVE state should trigger the normal error recovery which will
+ * handle completing this request.
+ */
nvme_rdma_error_recovery(ctrl);
-
return BLK_EH_RESET_TIMER;
}
@@ -2278,6 +2312,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
return ERR_PTR(-ENOMEM);
ctrl->ctrl.opts = opts;
INIT_LIST_HEAD(&ctrl->list);
+ mutex_init(&ctrl->teardown_lock);
if (!(opts->mask & NVMF_OPT_TRSVCID)) {
opts->trsvcid =
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 62fbaecdc960..d6a3e1487354 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -124,6 +124,7 @@ struct nvme_tcp_ctrl {
struct sockaddr_storage src_addr;
struct nvme_ctrl ctrl;
+ struct mutex teardown_lock;
struct work_struct err_work;
struct delayed_work connect_work;
struct nvme_tcp_request async_req;
@@ -464,6 +465,7 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
return;
+ dev_warn(ctrl->device, "starting error recovery\n");
queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
}
@@ -481,7 +483,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
return -EINVAL;
}
- if (!nvme_end_request(rq, cqe->status, cqe->result))
+ if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
nvme_complete_rq(rq);
queue->nr_cqe++;
@@ -672,7 +674,7 @@ static inline void nvme_tcp_end_request(struct request *rq, u16 status)
{
union nvme_result res = {};
- if (!nvme_end_request(rq, cpu_to_le16(status << 1), res))
+ if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
nvme_complete_rq(rq);
}
@@ -866,7 +868,6 @@ static void nvme_tcp_state_change(struct sock *sk)
case TCP_LAST_ACK:
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
- /* fallthrough */
nvme_tcp_error_recovery(&queue->ctrl->ctrl);
break;
default:
@@ -912,12 +913,11 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
else
flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
- /* can't zcopy slab pages */
- if (unlikely(PageSlab(page))) {
- ret = sock_no_sendpage(queue->sock, page, offset, len,
+ if (sendpage_ok(page)) {
+ ret = kernel_sendpage(queue->sock, page, offset, len,
flags);
} else {
- ret = kernel_sendpage(queue->sock, page, offset, len,
+ ret = sock_no_sendpage(queue->sock, page, offset, len,
flags);
}
if (ret <= 0)
@@ -1527,7 +1527,6 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
return;
-
__nvme_tcp_stop_queue(queue);
}
@@ -1596,6 +1595,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
{
if (to_tcp_ctrl(ctrl)->async_req.pdu) {
+ cancel_work_sync(&ctrl->async_event_work);
nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
}
@@ -1782,7 +1782,15 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
if (!new) {
nvme_start_queues(ctrl);
- nvme_wait_freeze(ctrl);
+ if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
+ /*
+ * If we timed out waiting for freeze we are likely to
+ * be stuck. Fail the controller initialization just
+ * to be safe.
+ */
+ ret = -ENODEV;
+ goto out_wait_freeze_timed_out;
+ }
blk_mq_update_nr_hw_queues(ctrl->tagset,
ctrl->queue_count - 1);
nvme_unfreeze(ctrl);
@@ -1790,6 +1798,9 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
return 0;
+out_wait_freeze_timed_out:
+ nvme_stop_queues(ctrl);
+ nvme_tcp_stop_io_queues(ctrl);
out_cleanup_connect_q:
if (new)
blk_cleanup_queue(ctrl->connect_q);
@@ -1875,6 +1886,7 @@ out_free_queue:
static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
bool remove)
{
+ mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock);
blk_mq_quiesce_queue(ctrl->admin_q);
nvme_tcp_stop_queue(ctrl, 0);
if (ctrl->admin_tagset) {
@@ -1885,13 +1897,16 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
if (remove)
blk_mq_unquiesce_queue(ctrl->admin_q);
nvme_tcp_destroy_admin_queue(ctrl, remove);
+ mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock);
}
static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
bool remove)
{
+ mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock);
if (ctrl->queue_count <= 1)
- return;
+ goto out;
+ blk_mq_quiesce_queue(ctrl->admin_q);
nvme_start_freeze(ctrl);
nvme_stop_queues(ctrl);
nvme_tcp_stop_io_queues(ctrl);
@@ -1903,6 +1918,8 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
if (remove)
nvme_start_queues(ctrl);
nvme_tcp_destroy_io_queues(ctrl, remove);
+out:
+ mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock);
}
static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
@@ -2149,40 +2166,55 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
nvme_tcp_queue_request(&ctrl->async_req, true, true);
}
+static void nvme_tcp_complete_timed_out(struct request *rq)
+{
+ struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+ struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
+
+ /* fence other contexts that may complete the command */
+ mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock);
+ nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
+ if (!blk_mq_request_completed(rq)) {
+ nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
+ blk_mq_complete_request(rq);
+ }
+ mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock);
+}
+
static enum blk_eh_timer_return
nvme_tcp_timeout(struct request *rq, bool reserved)
{
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
- struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
+ struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
struct nvme_tcp_cmd_pdu *pdu = req->pdu;
- /*
- * Restart the timer if a controller reset is already scheduled. Any
- * timed out commands would be handled before entering the connecting
- * state.
- */
- if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
- return BLK_EH_RESET_TIMER;
-
- dev_warn(ctrl->ctrl.device,
+ dev_warn(ctrl->device,
"queue %d: timeout request %#x type %d\n",
nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);
- if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
+ if (ctrl->state != NVME_CTRL_LIVE) {
/*
- * Teardown immediately if controller times out while starting
- * or we are already started error recovery. all outstanding
- * requests are completed on shutdown, so we return BLK_EH_DONE.
+ * If we are resetting, connecting or deleting, we should
+ * complete immediately because we may block the controller
+ * teardown or setup sequence:
+ * - ctrl disable/shutdown fabrics requests
+ * - connect requests
+ * - initialization admin requests
+ * - I/O requests that entered after unquiescing and
+ * the controller stopped responding
+ *
+ * All other requests should be cancelled by the error
+ * recovery work, so it's fine that we fail it here.
*/
- flush_work(&ctrl->err_work);
- nvme_tcp_teardown_io_queues(&ctrl->ctrl, false);
- nvme_tcp_teardown_admin_queue(&ctrl->ctrl, false);
+ nvme_tcp_complete_timed_out(rq);
return BLK_EH_DONE;
}
- dev_warn(ctrl->ctrl.device, "starting error recovery\n");
- nvme_tcp_error_recovery(&ctrl->ctrl);
-
+ /*
+ * LIVE state should trigger the normal error recovery which will
+ * handle completing this request.
+ */
+ nvme_tcp_error_recovery(ctrl);
return BLK_EH_RESET_TIMER;
}
@@ -2423,6 +2455,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
nvme_tcp_reconnect_ctrl_work);
INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
+ mutex_init(&ctrl->teardown_lock);
if (!(opts->mask & NVMF_OPT_TRSVCID)) {
opts->trsvcid =
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 74b2b61c773b..37e1d7784e17 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -1136,6 +1136,7 @@ static ssize_t nvmet_subsys_attr_model_store(struct config_item *item,
up_write(&nvmet_config_sem);
kfree_rcu(new_model, rcuhead);
+ kfree(new_model_number);
return count;
}
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index b92f45f5cd5b..b7b63330b5ef 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -73,7 +73,7 @@ inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno)
status = NVME_SC_ACCESS_DENIED;
break;
case -EIO:
- /* FALLTHRU */
+ fallthrough;
default:
req->error_loc = offsetof(struct nvme_common_command, opcode);
status = NVME_SC_INTERNAL | NVME_SC_DNR;
@@ -397,6 +397,9 @@ static void nvmet_keep_alive_timer(struct work_struct *work)
static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
{
+ if (unlikely(ctrl->kato == 0))
+ return;
+
pr_debug("ctrl %d start keep-alive timer for %d secs\n",
ctrl->cntlid, ctrl->kato);
@@ -406,6 +409,9 @@ static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
{
+ if (unlikely(ctrl->kato == 0))
+ return;
+
pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid);
cancel_delayed_work_sync(&ctrl->ka_work);
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 55bafd56166a..e6861cc10e7d 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -2342,9 +2342,9 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
return;
if (fcpreq->fcp_error ||
fcpreq->transferred_length != fcpreq->transfer_length) {
- spin_lock(&fod->flock);
+ spin_lock_irqsave(&fod->flock, flags);
fod->abort = true;
- spin_unlock(&fod->flock);
+ spin_unlock_irqrestore(&fod->flock, flags);
nvmet_req_complete(&fod->req, NVME_SC_INTERNAL);
return;
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index c97e60b71bbc..3da067a8311e 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -812,7 +812,7 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport,
break;
/* Fall-Thru to RSP handling */
- /* FALLTHRU */
+ fallthrough;
case NVMET_FCOP_RSP:
if (fcpreq) {
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 3dd6f566a240..125dde3f410e 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -139,7 +139,6 @@ static u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts)
req->error_loc = offsetof(struct nvme_rw_command, nsid);
break;
case BLK_STS_IOERR:
- /* fallthru */
default:
status = NVME_SC_INTERNAL | NVME_SC_DNR;
req->error_loc = offsetof(struct nvme_common_command, opcode);
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 4884ef1e46a2..0d6008cf66a2 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -115,7 +115,7 @@ static void nvme_loop_queue_response(struct nvmet_req *req)
return;
}
- if (!nvme_end_request(rq, cqe->status, cqe->result))
+ if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
nvme_loop_complete_rq(rq);
}
}
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index 89d91dc999a6..dacfa7435d0b 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -165,7 +165,7 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w)
req->cqe->result = nvme_req(rq)->result;
nvmet_req_complete(req, status);
- blk_put_request(rq);
+ blk_mq_free_request(rq);
}
static void nvmet_passthru_req_done(struct request *rq,
@@ -175,7 +175,7 @@ static void nvmet_passthru_req_done(struct request *rq,
req->cqe->result = nvme_req(rq)->result;
nvmet_req_complete(req, nvme_req(rq)->status);
- blk_put_request(rq);
+ blk_mq_free_request(rq);
}
static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq)
@@ -230,7 +230,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req)
if (unlikely(!ns)) {
pr_err("failed to get passthru ns nsid:%u\n", nsid);
status = NVME_SC_INVALID_NS | NVME_SC_DNR;
- goto fail_out;
+ goto out;
}
q = ns->queue;
@@ -238,16 +238,15 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req)
rq = nvme_alloc_request(q, req->cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
if (IS_ERR(rq)) {
- rq = NULL;
status = NVME_SC_INTERNAL;
- goto fail_out;
+ goto out_put_ns;
}
if (req->sg_cnt) {
ret = nvmet_passthru_map_sg(req, rq);
if (unlikely(ret)) {
status = NVME_SC_INTERNAL;
- goto fail_out;
+ goto out_put_req;
}
}
@@ -274,11 +273,13 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req)
return;
-fail_out:
+out_put_req:
+ blk_mq_free_request(rq);
+out_put_ns:
if (ns)
nvme_put_ns(ns);
+out:
nvmet_req_complete(req, status);
- blk_put_request(rq);
}
/*
@@ -326,6 +327,10 @@ static u16 nvmet_setup_passthru_command(struct nvmet_req *req)
u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req)
{
+ /* Reject any commands with non-sgl flags set (i.e. fused commands) */
+ if (req->cmd->common.flags & ~NVME_CMD_SGL_ALL)
+ return NVME_SC_INVALID_FIELD;
+
switch (req->cmd->common.opcode) {
case nvme_cmd_resv_register:
case nvme_cmd_resv_report:
@@ -396,6 +401,10 @@ static u16 nvmet_passthru_get_set_features(struct nvmet_req *req)
u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req)
{
+ /* Reject any commands with non-sgl flags set (i.e. fused commands) */
+ if (req->cmd->common.flags & ~NVME_CMD_SGL_ALL)
+ return NVME_SC_INVALID_FIELD;
+
/*
* Passthru all vendor specific commands
*/
@@ -508,6 +517,7 @@ int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys)
subsys->ver = NVME_VS(1, 2, 1);
}
+ __module_get(subsys->passthru_ctrl->ops->module);
mutex_unlock(&subsys->lock);
return 0;
@@ -522,6 +532,7 @@ static void __nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys)
{
if (subsys->passthru_ctrl) {
xa_erase(&passthru_subsystems, subsys->passthru_ctrl->cntlid);
+ module_put(subsys->passthru_ctrl->ops->module);
nvme_put_ctrl(subsys->passthru_ctrl);
}
subsys->passthru_ctrl = NULL;
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 3ccb59260b4a..ae6620489457 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -1758,7 +1758,7 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
schedule_delayed_work(&port->repair_work, 0);
break;
}
- /* FALLTHROUGH */
+ fallthrough;
case RDMA_CM_EVENT_DISCONNECTED:
case RDMA_CM_EVENT_TIMEWAIT_EXIT:
nvmet_rdma_queue_disconnect(queue);
@@ -1769,7 +1769,7 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
case RDMA_CM_EVENT_REJECTED:
pr_debug("Connection rejected: %s\n",
rdma_reject_msg(cm_id, event->status));
- /* FALLTHROUGH */
+ fallthrough;
case RDMA_CM_EVENT_UNREACHABLE:
case RDMA_CM_EVENT_CONNECT_ERROR:
nvmet_rdma_queue_connect_fail(cm_id, queue);
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 9eda91162fe4..8e0d766d2722 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -160,6 +160,11 @@ static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
struct nvmet_tcp_cmd *cmd)
{
+ if (unlikely(!queue->nr_cmds)) {
+ /* We didn't allocate cmds yet, send 0xffff */
+ return USHRT_MAX;
+ }
+
return cmd - queue->cmds;
}
@@ -866,7 +871,10 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
struct nvme_tcp_data_pdu *data = &queue->pdu.data;
struct nvmet_tcp_cmd *cmd;
- cmd = &queue->cmds[data->ttag];
+ if (likely(queue->nr_cmds))
+ cmd = &queue->cmds[data->ttag];
+ else
+ cmd = &queue->connect;
if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
pr_err("ttag %u unexpected data offset %u (expected %u)\n",