diff options
Diffstat (limited to 'drivers/nvme')
-rw-r--r-- | drivers/nvme/host/Kconfig | 1 | ||||
-rw-r--r-- | drivers/nvme/host/core.c | 703 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.c | 13 | ||||
-rw-r--r-- | drivers/nvme/host/fc.c | 13 | ||||
-rw-r--r-- | drivers/nvme/host/hwmon.c | 14 | ||||
-rw-r--r-- | drivers/nvme/host/multipath.c | 79 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 67 | ||||
-rw-r--r-- | drivers/nvme/host/pci.c | 75 | ||||
-rw-r--r-- | drivers/nvme/host/rdma.c | 73 | ||||
-rw-r--r-- | drivers/nvme/host/tcp.c | 93 | ||||
-rw-r--r-- | drivers/nvme/host/zns.c | 57 | ||||
-rw-r--r-- | drivers/nvme/target/admin-cmd.c | 2 | ||||
-rw-r--r-- | drivers/nvme/target/configfs.c | 1 | ||||
-rw-r--r-- | drivers/nvme/target/core.c | 12 | ||||
-rw-r--r-- | drivers/nvme/target/fc.c | 6 | ||||
-rw-r--r-- | drivers/nvme/target/fcloop.c | 2 | ||||
-rw-r--r-- | drivers/nvme/target/io-cmd-bdev.c | 1 | ||||
-rw-r--r-- | drivers/nvme/target/loop.c | 6 | ||||
-rw-r--r-- | drivers/nvme/target/nvmet.h | 2 | ||||
-rw-r--r-- | drivers/nvme/target/passthru.c | 70 | ||||
-rw-r--r-- | drivers/nvme/target/rdma.c | 4 | ||||
-rw-r--r-- | drivers/nvme/target/tcp.c | 31 |
22 files changed, 715 insertions, 610 deletions
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 3ed9786b88d8..a44d49d63968 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -73,6 +73,7 @@ config NVME_TCP depends on INET depends on BLK_DEV_NVME select NVME_FABRICS + select CRYPTO select CRYPTO_CRC32C help This provides support for the NVMe over Fabrics protocol using diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 88cff309d8e4..56e2a22e8a02 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -89,26 +89,38 @@ static dev_t nvme_chr_devt; static struct class *nvme_class; static struct class *nvme_subsys_class; -static int _nvme_revalidate_disk(struct gendisk *disk); static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid); +static void nvme_update_bdev_size(struct gendisk *disk) +{ + struct block_device *bdev = bdget_disk(disk, 0); + + if (bdev) { + bd_set_nr_sectors(bdev, get_capacity(disk)); + bdput(bdev); + } +} + +/* + * Prepare a queue for teardown. + * + * This must forcibly unquiesce queues to avoid blocking dispatch, and only set + * the capacity to 0 after that to avoid blocking dispatchers that may be + * holding bd_butex. This will end buffered writers dirtying pages that can't + * be synced. + */ static void nvme_set_queue_dying(struct nvme_ns *ns) { - /* - * Revalidating a dead namespace sets capacity to 0. This will end - * buffered writers dirtying pages that can't be synced. - */ if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) return; + blk_set_queue_dying(ns->queue); - /* Forcibly unquiesce queues to avoid blocking dispatch */ blk_mq_unquiesce_queue(ns->queue); - /* - * Revalidate after unblocking dispatchers that may be holding bd_butex - */ - revalidate_disk(ns->disk); + + set_capacity(ns->disk, 0); + nvme_update_bdev_size(ns->disk); } static void nvme_queue_scan(struct nvme_ctrl *ctrl) @@ -241,17 +253,6 @@ static blk_status_t nvme_error_status(u16 status) } } -static inline bool nvme_req_needs_retry(struct request *req) -{ - if (blk_noretry_request(req)) - return false; - if (nvme_req(req)->status & NVME_SC_DNR) - return false; - if (nvme_req(req)->retries >= nvme_max_retries) - return false; - return true; -} - static void nvme_retry_req(struct request *req) { struct nvme_ns *ns = req->q->queuedata; @@ -268,34 +269,67 @@ static void nvme_retry_req(struct request *req) blk_mq_delay_kick_requeue_list(req->q, delay); } -void nvme_complete_rq(struct request *req) +enum nvme_disposition { + COMPLETE, + RETRY, + FAILOVER, +}; + +static inline enum nvme_disposition nvme_decide_disposition(struct request *req) { - blk_status_t status = nvme_error_status(nvme_req(req)->status); + if (likely(nvme_req(req)->status == 0)) + return COMPLETE; - trace_nvme_complete_rq(req); + if (blk_noretry_request(req) || + (nvme_req(req)->status & NVME_SC_DNR) || + nvme_req(req)->retries >= nvme_max_retries) + return COMPLETE; - nvme_cleanup_cmd(req); + if (req->cmd_flags & REQ_NVME_MPATH) { + if (nvme_is_path_error(nvme_req(req)->status) || + blk_queue_dying(req->q)) + return FAILOVER; + } else { + if (blk_queue_dying(req->q)) + return COMPLETE; + } - if (nvme_req(req)->ctrl->kas) - nvme_req(req)->ctrl->comp_seen = true; + return RETRY; +} - if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { - if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req)) - return; +static inline void nvme_end_req(struct request *req) +{ + blk_status_t status = nvme_error_status(nvme_req(req)->status); - if (!blk_queue_dying(req->q)) { - nvme_retry_req(req); - return; - } - } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && - req_op(req) == REQ_OP_ZONE_APPEND) { + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + req_op(req) == REQ_OP_ZONE_APPEND) req->__sector = nvme_lba_to_sect(req->q->queuedata, le64_to_cpu(nvme_req(req)->result.u64)); - } nvme_trace_bio_complete(req, status); blk_mq_end_request(req, status); } + +void nvme_complete_rq(struct request *req) +{ + trace_nvme_complete_rq(req); + nvme_cleanup_cmd(req); + + if (nvme_req(req)->ctrl->kas) + nvme_req(req)->ctrl->comp_seen = true; + + switch (nvme_decide_disposition(req)) { + case COMPLETE: + nvme_end_req(req); + return; + case RETRY: + nvme_retry_req(req); + return; + case FAILOVER: + nvme_failover_req(req); + return; + } +} EXPORT_SYMBOL_GPL(nvme_complete_rq); bool nvme_cancel_request(struct request *req, void *data, bool reserved) @@ -330,7 +364,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_RESETTING: case NVME_CTRL_CONNECTING: changed = true; - /* FALLTHRU */ + fallthrough; default: break; } @@ -340,7 +374,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_NEW: case NVME_CTRL_LIVE: changed = true; - /* FALLTHRU */ + fallthrough; default: break; } @@ -350,7 +384,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_NEW: case NVME_CTRL_RESETTING: changed = true; - /* FALLTHRU */ + fallthrough; default: break; } @@ -361,7 +395,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_RESETTING: case NVME_CTRL_CONNECTING: changed = true; - /* FALLTHRU */ + fallthrough; default: break; } @@ -371,7 +405,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_DELETING: case NVME_CTRL_DEAD: changed = true; - /* FALLTHRU */ + fallthrough; default: break; } @@ -380,7 +414,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (old_state) { case NVME_CTRL_DELETING: changed = true; - /* FALLTHRU */ + fallthrough; default: break; } @@ -933,10 +967,10 @@ static u32 nvme_known_admin_effects(u8 opcode) { switch (opcode) { case nvme_admin_format_nvm: - return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | + return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC | NVME_CMD_EFFECTS_CSE_MASK; case nvme_admin_sanitize_nvm: - return NVME_CMD_EFFECTS_CSE_MASK; + return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK; default: break; } @@ -974,7 +1008,7 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, * For simplicity, IO to all namespaces is quiesced even if the command * effects say only one namespace is affected. */ - if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { + if (effects & NVME_CMD_EFFECTS_CSE_MASK) { mutex_lock(&ctrl->scan_lock); mutex_lock(&ctrl->subsys->lock); nvme_mpath_start_freeze(ctrl->subsys); @@ -985,36 +1019,9 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return effects; } -static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects) -{ - struct nvme_ns *ns; - - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - if (_nvme_revalidate_disk(ns->disk)) - nvme_set_queue_dying(ns); - else if (blk_queue_is_zoned(ns->disk->queue)) { - /* - * IO commands are required to fully revalidate a zoned - * device. Force the command effects to trigger rescan - * work so report zones can run in a context with - * unfrozen IO queues. - */ - *effects |= NVME_CMD_EFFECTS_NCC; - } - up_read(&ctrl->namespaces_rwsem); -} - static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) { - /* - * Revalidate LBA changes prior to unfreezing. This is necessary to - * prevent memory corruption if a logical block size was changed by - * this command. - */ - if (effects & NVME_CMD_EFFECTS_LBCC) - nvme_update_formats(ctrl, &effects); - if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { + if (effects & NVME_CMD_EFFECTS_CSE_MASK) { nvme_unfreeze(ctrl); nvme_mpath_unfreeze(ctrl->subsys); mutex_unlock(&ctrl->subsys->lock); @@ -1274,6 +1281,8 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, int status, pos, len; void *data; + if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl)) + return 0; if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST) return 0; @@ -1317,19 +1326,8 @@ free_data: return status; } -static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list) -{ - struct nvme_command c = { }; - - c.identify.opcode = nvme_admin_identify; - c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST; - c.identify.nsid = cpu_to_le32(nsid); - return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, - NVME_IDENTIFY_DATA_SIZE); -} - -static int nvme_identify_ns(struct nvme_ctrl *ctrl, - unsigned nsid, struct nvme_id_ns **id) +static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid, + struct nvme_ns_ids *ids, struct nvme_id_ns **id) { struct nvme_command c = { }; int error; @@ -1346,9 +1344,24 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id)); if (error) { dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error); - kfree(*id); + goto out_free_id; } + error = -ENODEV; + if ((*id)->ncap == 0) /* namespace not allocated or attached */ + goto out_free_id; + + if (ctrl->vs >= NVME_VS(1, 1, 0) && + !memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) + memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64)); + if (ctrl->vs >= NVME_VS(1, 2, 0) && + !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid)); + + return 0; + +out_free_id: + kfree(*id); return error; } @@ -1870,20 +1883,6 @@ static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns) nvme_lba_to_sect(ns, max_blocks)); } -static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, - struct nvme_id_ns *id, struct nvme_ns_ids *ids) -{ - memset(ids, 0, sizeof(*ids)); - - if (ctrl->vs >= NVME_VS(1, 1, 0)) - memcpy(ids->eui64, id->eui64, sizeof(id->eui64)); - if (ctrl->vs >= NVME_VS(1, 2, 0)) - memcpy(ids->nguid, id->nguid, sizeof(id->nguid)); - if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl)) - return nvme_identify_ns_descs(ctrl, nsid, ids); - return 0; -} - static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) { return !uuid_is_null(&ids->uuid) || @@ -1924,6 +1923,68 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return 0; } +static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + + /* + * The PI implementation requires the metadata size to be equal to the + * t10 pi tuple size. + */ + ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); + if (ns->ms == sizeof(struct t10_pi_tuple)) + ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; + else + ns->pi_type = 0; + + ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS); + if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) + return 0; + if (ctrl->ops->flags & NVME_F_FABRICS) { + /* + * The NVMe over Fabrics specification only supports metadata as + * part of the extended data LBA. We rely on HCA/HBA support to + * remap the separate metadata buffer from the block layer. + */ + if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT))) + return -EINVAL; + if (ctrl->max_integrity_segments) + ns->features |= + (NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS); + } else { + /* + * For PCIe controllers, we can't easily remap the separate + * metadata buffer from the block layer and thus require a + * separate metadata buffer for block layer metadata/PI support. + * We allow extended LBAs for the passthrough interface, though. + */ + if (id->flbas & NVME_NS_FLBAS_META_EXT) + ns->features |= NVME_NS_EXT_LBAS; + else + ns->features |= NVME_NS_METADATA_SUPPORTED; + } + + return 0; +} + +static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, + struct request_queue *q) +{ + bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT; + + if (ctrl->max_hw_sectors) { + u32 max_segments = + (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1; + + max_segments = min_not_zero(max_segments, ctrl->max_segments); + blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); + blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); + } + blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); + blk_queue_dma_alignment(q, 7); + blk_queue_write_cache(q, vwc, vwc); +} + static void nvme_update_disk_info(struct gendisk *disk, struct nvme_ns *ns, struct nvme_id_ns *id) { @@ -1931,11 +1992,15 @@ static void nvme_update_disk_info(struct gendisk *disk, unsigned short bs = 1 << ns->lba_shift; u32 atomic_bs, phys_bs, io_opt = 0; + /* + * The block layer can't support LBA sizes larger than the page size + * yet, so catch this early and don't allow block I/O. + */ if (ns->lba_shift > PAGE_SHIFT) { - /* unsupported block size, set capacity to 0 later */ + capacity = 0; bs = (1 << 9); } - blk_mq_freeze_queue(disk->queue); + blk_integrity_unregister(disk); atomic_bs = phys_bs = bs; @@ -1970,13 +2035,6 @@ static void nvme_update_disk_info(struct gendisk *disk, blk_queue_io_opt(disk->queue, io_opt); /* - * The block layer can't support LBA sizes larger than the page size - * yet, so catch this early and don't allow block I/O. - */ - if (ns->lba_shift > PAGE_SHIFT) - capacity = 0; - - /* * Register a metadata profile for PI, or the plain non-integrity NVMe * metadata masquerading as Type 0 if supported, otherwise reject block * I/O to namespaces with metadata except when the namespace supports @@ -2000,162 +2058,88 @@ static void nvme_update_disk_info(struct gendisk *disk, set_disk_ro(disk, true); else set_disk_ro(disk, false); +} - blk_mq_unfreeze_queue(disk->queue); +static inline bool nvme_first_scan(struct gendisk *disk) +{ + /* nvme_alloc_ns() scans the disk prior to adding it */ + return !(disk->flags & GENHD_FL_UP); } -static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) +static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id) { - unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; - struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; - int ret; u32 iob; - /* - * If identify namespace failed, use default 512 byte block size so - * block layer can use before failing read/write for 0 capacity. - */ - ns->lba_shift = id->lbaf[lbaf].ds; - if (ns->lba_shift == 0) - ns->lba_shift = 9; - - switch (ns->head->ids.csi) { - case NVME_CSI_NVM: - break; - case NVME_CSI_ZNS: - ret = nvme_update_zone_info(disk, ns, lbaf); - if (ret) { - dev_warn(ctrl->device, - "failed to add zoned namespace:%u ret:%d\n", - ns->head->ns_id, ret); - return ret; - } - break; - default: - dev_warn(ctrl->device, "unknown csi:%u ns:%u\n", - ns->head->ids.csi, ns->head->ns_id); - return -ENODEV; - } - if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && is_power_of_2(ctrl->max_hw_sectors)) iob = ctrl->max_hw_sectors; else iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob)); - ns->features = 0; - ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); - /* the PI implementation requires metadata equal t10 pi tuple size */ - if (ns->ms == sizeof(struct t10_pi_tuple)) - ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; - else - ns->pi_type = 0; + if (!iob) + return; - if (ns->ms) { - /* - * For PCIe only the separate metadata pointer is supported, - * as the block layer supplies metadata in a separate bio_vec - * chain. For Fabrics, only metadata as part of extended data - * LBA is supported on the wire per the Fabrics specification, - * but the HBA/HCA will do the remapping from the separate - * metadata buffers for us. - */ - if (id->flbas & NVME_NS_FLBAS_META_EXT) { - ns->features |= NVME_NS_EXT_LBAS; - if ((ctrl->ops->flags & NVME_F_FABRICS) && - (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) && - ctrl->max_integrity_segments) - ns->features |= NVME_NS_METADATA_SUPPORTED; - } else { - if (WARN_ON_ONCE(ctrl->ops->flags & NVME_F_FABRICS)) - return -EINVAL; - if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) - ns->features |= NVME_NS_METADATA_SUPPORTED; - } + if (!is_power_of_2(iob)) { + if (nvme_first_scan(ns->disk)) + pr_warn("%s: ignoring unaligned IO boundary:%u\n", + ns->disk->disk_name, iob); + return; } - if (iob) - blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(iob)); - nvme_update_disk_info(disk, ns, id); -#ifdef CONFIG_NVME_MULTIPATH - if (ns->head->disk) { - nvme_update_disk_info(ns->head->disk, ns, id); - blk_stack_limits(&ns->head->disk->queue->limits, - &ns->queue->limits, 0); - nvme_mpath_update_disk_size(ns->head->disk); + if (blk_queue_is_zoned(ns->disk->queue)) { + if (nvme_first_scan(ns->disk)) + pr_warn("%s: ignoring zoned namespace IO boundary\n", + ns->disk->disk_name); + return; } -#endif - return 0; + + blk_queue_chunk_sectors(ns->queue, iob); } -static int _nvme_revalidate_disk(struct gendisk *disk) +static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) { - struct nvme_ns *ns = disk->private_data; - struct nvme_ctrl *ctrl = ns->ctrl; - struct nvme_id_ns *id; - struct nvme_ns_ids ids; - int ret = 0; - - if (test_bit(NVME_NS_DEAD, &ns->flags)) { - set_capacity(disk, 0); - return -ENODEV; - } + unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; + int ret; - ret = nvme_identify_ns(ctrl, ns->head->ns_id, &id); - if (ret) - goto out; + blk_mq_freeze_queue(ns->disk->queue); + ns->lba_shift = id->lbaf[lbaf].ds; + nvme_set_queue_limits(ns->ctrl, ns->queue); - if (id->ncap == 0) { - ret = -ENODEV; - goto free_id; + if (ns->head->ids.csi == NVME_CSI_ZNS) { + ret = nvme_update_zone_info(ns, lbaf); + if (ret) + goto out_unfreeze; } - ret = nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids); + ret = nvme_configure_metadata(ns, id); if (ret) - goto free_id; + goto out_unfreeze; + nvme_set_chunk_sectors(ns, id); + nvme_update_disk_info(ns->disk, ns, id); + blk_mq_unfreeze_queue(ns->disk->queue); - if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) { - dev_err(ctrl->device, - "identifiers changed for nsid %d\n", ns->head->ns_id); - ret = -ENODEV; - goto free_id; + if (blk_queue_is_zoned(ns->queue)) { + ret = nvme_revalidate_zones(ns); + if (ret) + return ret; } - ret = __nvme_revalidate_disk(disk, id); -free_id: - kfree(id); -out: - /* - * Only fail the function if we got a fatal error back from the - * device, otherwise ignore the error and just move on. - */ - if (ret == -ENOMEM || (ret > 0 && !(ret & NVME_SC_DNR))) - ret = 0; - else if (ret > 0) - ret = blk_status_to_errno(nvme_error_status(ret)); - return ret; -} - -static int nvme_revalidate_disk(struct gendisk *disk) -{ - int ret; - - ret = _nvme_revalidate_disk(disk); - if (ret) - return ret; - -#ifdef CONFIG_BLK_DEV_ZONED - if (blk_queue_is_zoned(disk->queue)) { - struct nvme_ns *ns = disk->private_data; - struct nvme_ctrl *ctrl = ns->ctrl; - - ret = blk_revalidate_disk_zones(disk, NULL); - if (!ret) - blk_queue_max_zone_append_sectors(disk->queue, - ctrl->max_zone_append); +#ifdef CONFIG_NVME_MULTIPATH + if (ns->head->disk) { + blk_mq_freeze_queue(ns->head->disk->queue); + nvme_update_disk_info(ns->head->disk, ns, id); + blk_stack_limits(&ns->head->disk->queue->limits, + &ns->queue->limits, 0); + blk_queue_update_readahead(ns->head->disk->queue); + nvme_update_bdev_size(ns->head->disk); + blk_mq_unfreeze_queue(ns->head->disk->queue); } #endif + return 0; + +out_unfreeze: + blk_mq_unfreeze_queue(ns->disk->queue); return ret; } @@ -2288,7 +2272,6 @@ static const struct block_device_operations nvme_fops = { .open = nvme_open, .release = nvme_release, .getgeo = nvme_getgeo, - .revalidate_disk= nvme_revalidate_disk, .report_zones = nvme_report_zones, .pr_ops = &nvme_pr_ops, }; @@ -2438,26 +2421,6 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); -static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, - struct request_queue *q) -{ - bool vwc = false; - - if (ctrl->max_hw_sectors) { - u32 max_segments = - (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1; - - max_segments = min_not_zero(max_segments, ctrl->max_segments); - blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); - blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); - } - blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); - blk_queue_dma_alignment(q, 7); - if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) - vwc = true; - blk_queue_write_cache(q, vwc, vwc); -} - static int nvme_configure_timestamp(struct nvme_ctrl *ctrl) { __le64 ts; @@ -2961,26 +2924,10 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); } -static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi) -{ - struct nvme_cel *cel, *ret = NULL; - - spin_lock(&ctrl->lock); - list_for_each_entry(cel, &ctrl->cels, entry) { - if (cel->csi == csi) { - ret = cel; - break; - } - } - spin_unlock(&ctrl->lock); - - return ret; -} - static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, struct nvme_effects_log **log) { - struct nvme_cel *cel = nvme_find_cel(ctrl, csi); + struct nvme_cel *cel = xa_load(&ctrl->cels, csi); int ret; if (cel) @@ -2990,7 +2937,7 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, if (!cel) return -ENOMEM; - ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, csi, + ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi, &cel->log, sizeof(cel->log), 0); if (ret) { kfree(cel); @@ -2998,10 +2945,7 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, } cel->csi = csi; - - spin_lock(&ctrl->lock); - list_add_tail(&cel->entry, &ctrl->cels); - spin_unlock(&ctrl->lock); + xa_store(&ctrl->cels, cel->csi, cel, GFP_KERNEL); out: *log = &cel->log; return 0; @@ -3185,8 +3129,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) if (ret < 0) return ret; - if (!ctrl->identified) - nvme_hwmon_init(ctrl); + if (!ctrl->identified) { + ret = nvme_hwmon_init(ctrl); + if (ret < 0) + return ret; + } ctrl->identified = true; @@ -3210,10 +3157,26 @@ static int nvme_dev_open(struct inode *inode, struct file *file) return -EWOULDBLOCK; } + nvme_get_ctrl(ctrl); + if (!try_module_get(ctrl->ops->module)) { + nvme_put_ctrl(ctrl); + return -EINVAL; + } + file->private_data = ctrl; return 0; } +static int nvme_dev_release(struct inode *inode, struct file *file) +{ + struct nvme_ctrl *ctrl = + container_of(inode->i_cdev, struct nvme_ctrl, cdev); + + module_put(ctrl->ops->module); + nvme_put_ctrl(ctrl); + return 0; +} + static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) { struct nvme_ns *ns; @@ -3276,6 +3239,7 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd, static const struct file_operations nvme_dev_fops = { .owner = THIS_MODULE, .open = nvme_dev_open, + .release = nvme_dev_release, .unlocked_ioctl = nvme_dev_ioctl, .compat_ioctl = compat_ptr_ioctl, }; @@ -3474,10 +3438,6 @@ static ssize_t nvme_sysfs_delete(struct device *dev, { struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - /* Can't delete non-created controllers */ - if (!ctrl->created) - return -EBUSY; - if (device_remove_file_self(dev, attr)) nvme_delete_ctrl_sync(ctrl); return count; @@ -3654,6 +3614,10 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, return 0; if (a == &dev_attr_hostid.attr && !ctrl->opts) return 0; + if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts) + return 0; + if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts) + return 0; return a->mode; } @@ -3762,25 +3726,16 @@ out: } static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, - struct nvme_id_ns *id) + struct nvme_ns_ids *ids, bool is_shared) { struct nvme_ctrl *ctrl = ns->ctrl; - bool is_shared = id->nmic & NVME_NS_NMIC_SHARED; struct nvme_ns_head *head = NULL; - struct nvme_ns_ids ids; int ret = 0; - ret = nvme_report_ns_ids(ctrl, nsid, id, &ids); - if (ret) { - if (ret < 0) - return ret; - return blk_status_to_errno(nvme_error_status(ret)); - } - mutex_lock(&ctrl->subsys->lock); head = nvme_find_ns_head(ctrl->subsys, nsid); if (!head) { - head = nvme_alloc_ns_head(ctrl, nsid, &ids); + head = nvme_alloc_ns_head(ctrl, nsid, ids); if (IS_ERR(head)) { ret = PTR_ERR(head); goto out_unlock; @@ -3793,7 +3748,7 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, "Duplicate unshared namespace %d\n", nsid); goto out_put_ns_head; } - if (!nvme_ns_ids_equal(&head->ids, &ids)) { + if (!nvme_ns_ids_equal(&head->ids, ids)) { dev_err(ctrl->device, "IDs don't match for shared namespace %d\n", nsid); @@ -3841,7 +3796,8 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) } EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU); -static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) +static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, + struct nvme_ns_ids *ids) { struct nvme_ns *ns; struct gendisk *disk; @@ -3849,17 +3805,19 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) char disk_name[DISK_NAME_LEN]; int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret; + if (nvme_identify_ns(ctrl, nsid, ids, &id)) + return; + ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) - return; + goto out_free_id; ns->queue = blk_mq_init_queue(ctrl->tagset); if (IS_ERR(ns->queue)) goto out_free_ns; if (ctrl->opts && ctrl->opts->data_digest) - ns->queue->backing_dev_info->capabilities - |= BDI_CAP_STABLE_WRITES; + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue); blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); if (ctrl->ops->flags & NVME_F_PCI_P2PDMA) @@ -3867,23 +3825,11 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) ns->queue->queuedata = ns; ns->ctrl = ctrl; - kref_init(&ns->kref); - ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ - - blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); - nvme_set_queue_limits(ctrl, ns->queue); - ret = nvme_identify_ns(ctrl, nsid, &id); + ret = nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED); if (ret) goto out_free_queue; - - if (id->ncap == 0) /* no namespace (legacy quirk) */ - goto out_free_id; - - ret = nvme_init_ns_head(ns, nsid, id); - if (ret) - goto out_free_id; nvme_set_disk_name(disk_name, ns, ctrl, &flags); disk = alloc_disk_node(0, node); @@ -3897,7 +3843,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); ns->disk = disk; - if (__nvme_revalidate_disk(disk, id)) + if (nvme_update_ns_info(ns, id)) goto out_put_disk; if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { @@ -3932,12 +3878,12 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) list_del_init(&ns->head->entry); mutex_unlock(&ctrl->subsys->lock); nvme_put_ns_head(ns->head); - out_free_id: - kfree(id); out_free_queue: blk_cleanup_queue(ns->queue); out_free_ns: kfree(ns); + out_free_id: + kfree(id); } static void nvme_ns_remove(struct nvme_ns *ns) @@ -3945,6 +3891,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) return; + set_capacity(ns->disk, 0); nvme_fault_inject_fini(&ns->fault_inject); mutex_lock(&ns->ctrl->subsys->lock); @@ -3982,17 +3929,75 @@ static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid) } } -static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) +static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids) +{ + struct nvme_id_ns *id; + int ret = -ENODEV; + + if (test_bit(NVME_NS_DEAD, &ns->flags)) + goto out; + + ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id); + if (ret) + goto out; + + ret = -ENODEV; + if (!nvme_ns_ids_equal(&ns->head->ids, ids)) { + dev_err(ns->ctrl->device, + "identifiers changed for nsid %d\n", ns->head->ns_id); + goto out_free_id; + } + + ret = nvme_update_ns_info(ns, id); + +out_free_id: + kfree(id); +out: + /* + * Only remove the namespace if we got a fatal error back from the + * device, otherwise ignore the error and just move on. + * + * TODO: we should probably schedule a delayed retry here. + */ + if (ret && ret != -ENOMEM && !(ret > 0 && !(ret & NVME_SC_DNR))) + nvme_ns_remove(ns); + else + revalidate_disk_size(ns->disk, true); +} + +static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) { + struct nvme_ns_ids ids = { }; struct nvme_ns *ns; + if (nvme_identify_ns_descs(ctrl, nsid, &ids)) + return; + ns = nvme_find_get_ns(ctrl, nsid); if (ns) { - if (revalidate_disk(ns->disk)) - nvme_ns_remove(ns); + nvme_validate_ns(ns, &ids); nvme_put_ns(ns); - } else - nvme_alloc_ns(ctrl, nsid); + return; + } + + switch (ids.csi) { + case NVME_CSI_NVM: + nvme_alloc_ns(ctrl, nsid, &ids); + break; + case NVME_CSI_ZNS: + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { + dev_warn(ctrl->device, + "nsid %u not supported without CONFIG_BLK_DEV_ZONED\n", + nsid); + break; + } + nvme_alloc_ns(ctrl, nsid, &ids); + break; + default: + dev_warn(ctrl->device, "unknown csi %u for nsid %u\n", + ids.csi, nsid); + break; + } } static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, @@ -4028,7 +4033,14 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) return -ENOMEM; for (;;) { - ret = nvme_identify_ns_list(ctrl, prev, ns_list); + struct nvme_command cmd = { + .identify.opcode = nvme_admin_identify, + .identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST, + .identify.nsid = cpu_to_le32(prev), + }; + + ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list, + NVME_IDENTIFY_DATA_SIZE); if (ret) goto free; @@ -4037,7 +4049,7 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) if (!nsid) /* end of the list? */ goto out; - nvme_validate_ns(ctrl, nsid); + nvme_validate_or_alloc_ns(ctrl, nsid); while (++prev < nsid) nvme_ns_remove_by_nsid(ctrl, prev); } @@ -4060,7 +4072,7 @@ static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl) kfree(id); for (i = 1; i <= nn; i++) - nvme_validate_ns(ctrl, i); + nvme_validate_or_alloc_ns(ctrl, i); nvme_remove_invalid_namespaces(ctrl, nn); } @@ -4348,7 +4360,6 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) nvme_queue_scan(ctrl); nvme_start_queues(ctrl); } - ctrl->created = true; } EXPORT_SYMBOL_GPL(nvme_start_ctrl); @@ -4366,15 +4377,11 @@ static void nvme_free_ctrl(struct device *dev) struct nvme_ctrl *ctrl = container_of(dev, struct nvme_ctrl, ctrl_device); struct nvme_subsystem *subsys = ctrl->subsys; - struct nvme_cel *cel, *next; - if (subsys && ctrl->instance != subsys->instance) + if (!subsys || ctrl->instance != subsys->instance) ida_simple_remove(&nvme_instance_ida, ctrl->instance); - list_for_each_entry_safe(cel, next, &ctrl->cels, entry) { - list_del(&cel->entry); - kfree(cel); - } + xa_destroy(&ctrl->cels); nvme_mpath_uninit(ctrl); __free_page(ctrl->discard_page); @@ -4406,7 +4413,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, spin_lock_init(&ctrl->lock); mutex_init(&ctrl->scan_lock); INIT_LIST_HEAD(&ctrl->namespaces); - INIT_LIST_HEAD(&ctrl->cels); + xa_init(&ctrl->cels); init_rwsem(&ctrl->namespaces_rwsem); ctrl->dev = dev; ctrl->ops = ops; @@ -4512,7 +4519,7 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_unfreeze); -void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) +int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) { struct nvme_ns *ns; @@ -4523,6 +4530,7 @@ void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) break; } up_read(&ctrl->namespaces_rwsem); + return timeout; } EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout); @@ -4585,28 +4593,13 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_sync_queues); -struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path) +struct nvme_ctrl *nvme_ctrl_from_file(struct file *file) { - struct nvme_ctrl *ctrl; - struct file *f; - - f = filp_open(path, O_RDWR, 0); - if (IS_ERR(f)) - return ERR_CAST(f); - - if (f->f_op != &nvme_dev_fops) { - ctrl = ERR_PTR(-EINVAL); - goto out_close; - } - - ctrl = f->private_data; - nvme_get_ctrl(ctrl); - -out_close: - filp_close(f, NULL); - return ctrl; + if (file->f_op != &nvme_dev_fops) + return NULL; + return file->private_data; } -EXPORT_SYMBOL_NS_GPL(nvme_ctrl_get_by_path, NVME_TARGET_PASSTHRU); +EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU); /* * Check we didn't inadvertently grow the command structure sizes: diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 4ec4829d6233..8575724734e0 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -565,10 +565,14 @@ bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, struct nvme_request *req = nvme_req(rq); /* - * If we are in some state of setup or teardown only allow - * internally generated commands. + * currently we have a problem sending passthru commands + * on the admin_q if the controller is not LIVE because we can't + * make sure that they are going out after the admin connect, + * controller enable and/or other commands in the initialization + * sequence. until the controller will be LIVE, fail with + * BLK_STS_RESOURCE so that they will be rescheduled. */ - if (!blk_rq_is_passthrough(rq) || (req->flags & NVME_REQ_USERCMD)) + if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD)) return false; /* @@ -576,9 +580,8 @@ bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, * which is require to set the queue live in the appropinquate states. */ switch (ctrl->state) { - case NVME_CTRL_NEW: case NVME_CTRL_CONNECTING: - if (nvme_is_fabrics(req->cmd) && + if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) && req->cmd->fabrics.fctype == nvme_fabrics_type_connect) return true; break; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index eae43bb444e0..e2e09e25c056 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2035,7 +2035,7 @@ done: } __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate); - if (!nvme_end_request(rq, status, result)) + if (!nvme_try_complete_req(rq, status, result)) nvme_fc_complete_rq(rq); check_error: @@ -2078,7 +2078,7 @@ __nvme_fc_init_request(struct nvme_fc_ctrl *ctrl, if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.cmddma)) { dev_err(ctrl->dev, "FCP Op failed - cmdiu dma mapping failed.\n"); - ret = EFAULT; + ret = -EFAULT; goto out_on_error; } @@ -2088,7 +2088,7 @@ __nvme_fc_init_request(struct nvme_fc_ctrl *ctrl, if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.rspdma)) { dev_err(ctrl->dev, "FCP Op failed - rspiu dma mapping failed.\n"); - ret = EFAULT; + ret = -EFAULT; } atomic_set(&op->state, FCPOP_STATE_IDLE); @@ -2160,6 +2160,7 @@ nvme_fc_term_aen_ops(struct nvme_fc_ctrl *ctrl) struct nvme_fc_fcp_op *aen_op; int i; + cancel_work_sync(&ctrl->ctrl.async_event_work); aen_op = ctrl->aen_ops; for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { __nvme_fc_exit_request(ctrl, aen_op); @@ -3670,12 +3671,14 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) spin_lock_irqsave(&nvme_fc_lock, flags); list_for_each_entry(lport, &nvme_fc_lport_list, port_list) { if (lport->localport.node_name != laddr.nn || - lport->localport.port_name != laddr.pn) + lport->localport.port_name != laddr.pn || + lport->localport.port_state != FC_OBJSTATE_ONLINE) continue; list_for_each_entry(rport, &lport->endp_list, endp_list) { if (rport->remoteport.node_name != raddr.nn || - rport->remoteport.port_name != raddr.pn) + rport->remoteport.port_name != raddr.pn || + rport->remoteport.port_state != FC_OBJSTATE_ONLINE) continue; /* if fail to get reference fall through. Will error */ diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c index 412a6c97c0d8..552dbc04567b 100644 --- a/drivers/nvme/host/hwmon.c +++ b/drivers/nvme/host/hwmon.c @@ -59,12 +59,8 @@ static int nvme_set_temp_thresh(struct nvme_ctrl *ctrl, int sensor, bool under, static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data) { - int ret; - - ret = nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0, + return nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0, NVME_CSI_NVM, &data->log, sizeof(data->log), 0); - - return ret <= 0 ? ret : -EIO; } static int nvme_hwmon_read(struct device *dev, enum hwmon_sensor_types type, @@ -225,7 +221,7 @@ static const struct hwmon_chip_info nvme_hwmon_chip_info = { .info = nvme_hwmon_info, }; -void nvme_hwmon_init(struct nvme_ctrl *ctrl) +int nvme_hwmon_init(struct nvme_ctrl *ctrl) { struct device *dev = ctrl->dev; struct nvme_hwmon_data *data; @@ -234,7 +230,7 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl) data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); if (!data) - return; + return 0; data->ctrl = ctrl; mutex_init(&data->read_lock); @@ -244,7 +240,7 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl) dev_warn(ctrl->device, "Failed to read smart log (error %d)\n", err); devm_kfree(dev, data); - return; + return err; } hwmon = devm_hwmon_device_register_with_info(dev, "nvme", data, @@ -254,4 +250,6 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl) dev_warn(dev, "Failed to instantiate hwmon device\n"); devm_kfree(dev, data); } + + return 0; } diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 3ded54d2c9c6..74896be40c17 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -65,51 +65,30 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, } } -bool nvme_failover_req(struct request *req) +void nvme_failover_req(struct request *req) { struct nvme_ns *ns = req->q->queuedata; - u16 status = nvme_req(req)->status; + u16 status = nvme_req(req)->status & 0x7ff; unsigned long flags; - switch (status & 0x7ff) { - case NVME_SC_ANA_TRANSITION: - case NVME_SC_ANA_INACCESSIBLE: - case NVME_SC_ANA_PERSISTENT_LOSS: - /* - * If we got back an ANA error we know the controller is alive, - * but not ready to serve this namespaces. The spec suggests - * we should update our general state here, but due to the fact - * that the admin and I/O queues are not serialized that is - * fundamentally racy. So instead just clear the current path, - * mark the the path as pending and kick of a re-read of the ANA - * log page ASAP. - */ - nvme_mpath_clear_current_path(ns); - if (ns->ctrl->ana_log_buf) { - set_bit(NVME_NS_ANA_PENDING, &ns->flags); - queue_work(nvme_wq, &ns->ctrl->ana_work); - } - break; - case NVME_SC_HOST_PATH_ERROR: - case NVME_SC_HOST_ABORTED_CMD: - /* - * Temporary transport disruption in talking to the controller. - * Try to send on a new path. - */ - nvme_mpath_clear_current_path(ns); - break; - default: - /* This was a non-ANA error so follow the normal error path. */ - return false; + nvme_mpath_clear_current_path(ns); + + /* + * If we got back an ANA error, we know the controller is alive but not + * ready to serve this namespace. Kick of a re-read of the ANA + * information page, and just try any other available path for now. + */ + if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) { + set_bit(NVME_NS_ANA_PENDING, &ns->flags); + queue_work(nvme_wq, &ns->ctrl->ana_work); } spin_lock_irqsave(&ns->head->requeue_lock, flags); blk_steal_bios(&ns->head->requeue_list, req); spin_unlock_irqrestore(&ns->head->requeue_lock, flags); - blk_mq_end_request(req, 0); + blk_mq_end_request(req, 0); kblockd_schedule_work(&ns->head->requeue_work); - return true; } void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) @@ -233,7 +212,7 @@ static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head, static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, int node, struct nvme_ns *old) { - struct nvme_ns *ns, *found, *fallback = NULL; + struct nvme_ns *ns, *found = NULL; if (list_is_singular(&head->list)) { if (nvme_path_is_disabled(old)) @@ -252,18 +231,22 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, goto out; } if (ns->ana_state == NVME_ANA_NONOPTIMIZED) - fallback = ns; + found = ns; } - /* No optimized path found, re-check the current path */ + /* + * The loop above skips the current path for round-robin semantics. + * Fall back to the current path if either: + * - no other optimized path found and current is optimized, + * - no other usable path found and current is usable. + */ if (!nvme_path_is_disabled(old) && - old->ana_state == NVME_ANA_OPTIMIZED) { - found = old; - goto out; - } - if (!fallback) + (old->ana_state == NVME_ANA_OPTIMIZED || + (!found && old->ana_state == NVME_ANA_NONOPTIMIZED))) + return old; + + if (!found) return NULL; - found = fallback; out: rcu_assign_pointer(head->current_path[node], found); return found; @@ -690,13 +673,9 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) nvme_mpath_set_live(ns); } - if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) { - struct gendisk *disk = ns->head->disk; - - if (disk) - disk->queue->backing_dev_info->capabilities |= - BDI_CAP_STABLE_WRITES; - } + if (blk_queue_stable_writes(ns->queue) && ns->head->disk) + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, + ns->head->disk->queue); } void nvme_mpath_remove_disk(struct nvme_ns_head *head) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ebb8c3ed3885..e7c88b40f5bb 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -300,14 +300,13 @@ struct nvme_ctrl { unsigned long quirks; struct nvme_id_power_state psd[32]; struct nvme_effects_log *effects; - struct list_head cels; + struct xarray cels; struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; struct nvme_command ka_cmd; struct work_struct fw_act_work; unsigned long events; - bool created; #ifdef CONFIG_NVME_MULTIPATH /* asymmetric namespace access: */ @@ -523,7 +522,31 @@ static inline u32 nvme_bytes_to_numd(size_t len) return (len >> 2) - 1; } -static inline bool nvme_end_request(struct request *req, __le16 status, +static inline bool nvme_is_ana_error(u16 status) +{ + switch (status & 0x7ff) { + case NVME_SC_ANA_TRANSITION: + case NVME_SC_ANA_INACCESSIBLE: + case NVME_SC_ANA_PERSISTENT_LOSS: + return true; + default: + return false; + } +} + +static inline bool nvme_is_path_error(u16 status) +{ + /* check for a status code type of 'path related status' */ + return (status & 0x700) == 0x300; +} + +/* + * Fill in the status and result information from the CQE, and then figure out + * if blk-mq will need to use IPI magic to complete the request, and if yes do + * so. If not let the caller complete the request without an indirect function + * call. + */ +static inline bool nvme_try_complete_req(struct request *req, __le16 status, union nvme_result result) { struct nvme_request *rq = nvme_req(req); @@ -581,7 +604,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl); void nvme_sync_queues(struct nvme_ctrl *ctrl); void nvme_unfreeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze(struct nvme_ctrl *ctrl); -void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); +int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); void nvme_start_freeze(struct nvme_ctrl *ctrl); #define NVME_QID_ANY -1 @@ -629,7 +652,7 @@ void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys); void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, struct nvme_ctrl *ctrl, int *flags); -bool nvme_failover_req(struct request *req); +void nvme_failover_req(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id); @@ -659,16 +682,6 @@ static inline void nvme_trace_bio_complete(struct request *req, trace_block_bio_complete(ns->head->disk->queue, req->bio); } -static inline void nvme_mpath_update_disk_size(struct gendisk *disk) -{ - struct block_device *bdev = bdget_disk(disk, 0); - - if (bdev) { - bd_set_size(bdev, get_capacity(disk) << SECTOR_SHIFT); - bdput(bdev); - } -} - extern struct device_attribute dev_attr_ana_grpid; extern struct device_attribute dev_attr_ana_state; extern struct device_attribute subsys_attr_iopolicy; @@ -688,9 +701,8 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); } -static inline bool nvme_failover_req(struct request *req) +static inline void nvme_failover_req(struct request *req) { - return false; } static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) { @@ -744,15 +756,11 @@ static inline void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys) static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) { } -static inline void nvme_mpath_update_disk_size(struct gendisk *disk) -{ -} #endif /* CONFIG_NVME_MULTIPATH */ +int nvme_revalidate_zones(struct nvme_ns *ns); #ifdef CONFIG_BLK_DEV_ZONED -int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns, - unsigned lbaf); - +int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf); int nvme_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); @@ -769,9 +777,7 @@ static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, return BLK_STS_NOTSUPP; } -static inline int nvme_update_zone_info(struct gendisk *disk, - struct nvme_ns *ns, - unsigned lbaf) +static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) { dev_warn(ns->ctrl->device, "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n"); @@ -805,15 +811,18 @@ static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) } #ifdef CONFIG_NVME_HWMON -void nvme_hwmon_init(struct nvme_ctrl *ctrl); +int nvme_hwmon_init(struct nvme_ctrl *ctrl); #else -static inline void nvme_hwmon_init(struct nvme_ctrl *ctrl) { } +static inline int nvme_hwmon_init(struct nvme_ctrl *ctrl) +{ + return 0; +} #endif u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode); void nvme_execute_passthru_rq(struct request *rq); -struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path); +struct nvme_ctrl *nvme_ctrl_from_file(struct file *file); struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid); void nvme_put_ns(struct nvme_ns *ns); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ba725ae47305..e5b02242f3ca 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -120,7 +120,7 @@ struct nvme_dev { unsigned max_qid; unsigned io_queues[HCTX_MAX_TYPES]; unsigned int num_vecs; - u16 q_depth; + u32 q_depth; int io_sqes; u32 db_stride; void __iomem *bar; @@ -157,13 +157,13 @@ struct nvme_dev { static int io_queue_depth_set(const char *val, const struct kernel_param *kp) { int ret; - u16 n; + u32 n; - ret = kstrtou16(val, 10, &n); + ret = kstrtou32(val, 10, &n); if (ret != 0 || n < 2) return -EINVAL; - return param_set_ushort(val, kp); + return param_set_uint(val, kp); } static inline unsigned int sq_idx(unsigned int qid, u32 stride) @@ -195,7 +195,7 @@ struct nvme_queue { dma_addr_t sq_dma_addr; dma_addr_t cq_dma_addr; u32 __iomem *q_db; - u16 q_depth; + u32 q_depth; u16 cq_vector; u16 sq_tail; u16 cq_head; @@ -940,13 +940,6 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) struct nvme_completion *cqe = &nvmeq->cqes[idx]; struct request *req; - if (unlikely(cqe->command_id >= nvmeq->q_depth)) { - dev_warn(nvmeq->dev->ctrl.device, - "invalid id %d completed on queue %d\n", - cqe->command_id, le16_to_cpu(cqe->sq_id)); - return; - } - /* * AEN requests are special as they don't time out and can * survive any kind of queue freeze and often don't respond to @@ -960,8 +953,15 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) } req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id); + if (unlikely(!req)) { + dev_warn(nvmeq->dev->ctrl.device, + "invalid id %d completed on queue %d\n", + cqe->command_id, le16_to_cpu(cqe->sq_id)); + return; + } + trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail); - if (!nvme_end_request(req, cqe->status, cqe->result)) + if (!nvme_try_complete_req(req, cqe->status, cqe->result)) nvme_pci_complete_rq(req); } @@ -1244,13 +1244,13 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) switch (dev->ctrl.state) { case NVME_CTRL_CONNECTING: nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); - /* fall through */ + fallthrough; case NVME_CTRL_DELETING: dev_warn_ratelimited(dev->ctrl.device, "I/O %d QID %d timeout, disable controller\n", req->tag, nvmeq->qid); - nvme_dev_disable(dev, true); nvme_req(req)->flags |= NVME_REQ_CANCELLED; + nvme_dev_disable(dev, true); return BLK_EH_DONE; case NVME_CTRL_RESETTING: return BLK_EH_RESET_TIMER; @@ -1267,10 +1267,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, reset controller\n", req->tag, nvmeq->qid); + nvme_req(req)->flags |= NVME_REQ_CANCELLED; nvme_dev_disable(dev, false); nvme_reset_ctrl(&dev->ctrl); - nvme_req(req)->flags |= NVME_REQ_CANCELLED; return BLK_EH_DONE; } @@ -2038,32 +2038,30 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) .calc_sets = nvme_calc_irq_sets, .priv = dev, }; - unsigned int irq_queues, this_p_queues; + unsigned int irq_queues, poll_queues; /* - * Poll queues don't need interrupts, but we need at least one IO - * queue left over for non-polled IO. + * Poll queues don't need interrupts, but we need at least one I/O queue + * left over for non-polled I/O. */ - this_p_queues = dev->nr_poll_queues; - if (this_p_queues >= nr_io_queues) { - this_p_queues = nr_io_queues - 1; - irq_queues = 1; - } else { - irq_queues = nr_io_queues - this_p_queues + 1; - } - dev->io_queues[HCTX_TYPE_POLL] = this_p_queues; + poll_queues = min(dev->nr_poll_queues, nr_io_queues - 1); + dev->io_queues[HCTX_TYPE_POLL] = poll_queues; - /* Initialize for the single interrupt case */ + /* + * Initialize for the single interrupt case, will be updated in + * nvme_calc_irq_sets(). + */ dev->io_queues[HCTX_TYPE_DEFAULT] = 1; dev->io_queues[HCTX_TYPE_READ] = 0; /* - * Some Apple controllers require all queues to use the - * first vector. + * We need interrupts for the admin queue and each non-polled I/O queue, + * but some Apple controllers require all queues to use the first + * vector. */ - if (dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR) - irq_queues = 1; - + irq_queues = 1; + if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)) + irq_queues += (nr_io_queues - poll_queues); return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); } @@ -2320,7 +2318,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); - dev->q_depth = min_t(u16, NVME_CAP_MQES(dev->ctrl.cap) + 1, + dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1, io_queue_depth); dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); @@ -2460,7 +2458,8 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) static int nvme_setup_prp_pools(struct nvme_dev *dev) { dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, - PAGE_SIZE, PAGE_SIZE, 0); + NVME_CTRL_PAGE_SIZE, + NVME_CTRL_PAGE_SIZE, 0); if (!dev->prp_page_pool) return -ENOMEM; @@ -3152,7 +3151,8 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS | NVME_QUIRK_MEDIUM_PRIO_SQ | - NVME_QUIRK_NO_TEMP_THRESH_CHANGE }, + NVME_QUIRK_NO_TEMP_THRESH_CHANGE | + NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_VDEVICE(INTEL, 0xf1a6), /* Intel 760p/Pro 7600p */ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ @@ -3185,7 +3185,6 @@ static const struct pci_device_id nvme_id_table[] = { NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE(0x1c5c, 0x1504), /* SK Hynix PC400 */ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, - { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001), .driver_data = NVME_QUIRK_SINGLE_VECTOR }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, @@ -3193,6 +3192,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_SINGLE_VECTOR | NVME_QUIRK_128_BYTES_SQES | NVME_QUIRK_SHARED_TAGS }, + + { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { 0, } }; MODULE_DEVICE_TABLE(pci, nvme_id_table); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 44c76ffbb264..9e378d0a0c01 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -122,6 +122,7 @@ struct nvme_rdma_ctrl { struct sockaddr_storage src_addr; struct nvme_ctrl ctrl; + struct mutex teardown_lock; bool use_inline_data; u32 io_queues[HCTX_MAX_TYPES]; }; @@ -834,6 +835,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); } if (ctrl->async_event_sqe.data) { + cancel_work_sync(&ctrl->ctrl.async_event_work); nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); ctrl->async_event_sqe.data = NULL; @@ -975,7 +977,15 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) if (!new) { nvme_start_queues(&ctrl->ctrl); - nvme_wait_freeze(&ctrl->ctrl); + if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) { + /* + * If we timed out waiting for freeze we are likely to + * be stuck. Fail the controller initialization just + * to be safe. + */ + ret = -ENODEV; + goto out_wait_freeze_timed_out; + } blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset, ctrl->ctrl.queue_count - 1); nvme_unfreeze(&ctrl->ctrl); @@ -983,6 +993,9 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) return 0; +out_wait_freeze_timed_out: + nvme_stop_queues(&ctrl->ctrl); + nvme_rdma_stop_io_queues(ctrl); out_cleanup_connect_q: if (new) blk_cleanup_queue(ctrl->ctrl.connect_q); @@ -997,6 +1010,7 @@ out_free_io_queues: static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, bool remove) { + mutex_lock(&ctrl->teardown_lock); blk_mq_quiesce_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); if (ctrl->ctrl.admin_tagset) { @@ -1007,11 +1021,13 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, if (remove) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_rdma_destroy_admin_queue(ctrl, remove); + mutex_unlock(&ctrl->teardown_lock); } static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, bool remove) { + mutex_lock(&ctrl->teardown_lock); if (ctrl->ctrl.queue_count > 1) { nvme_start_freeze(&ctrl->ctrl); nvme_stop_queues(&ctrl->ctrl); @@ -1025,6 +1041,7 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, nvme_start_queues(&ctrl->ctrl); nvme_rdma_destroy_io_queues(ctrl, remove); } + mutex_unlock(&ctrl->teardown_lock); } static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) @@ -1180,6 +1197,7 @@ static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl) if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) return; + dev_warn(ctrl->ctrl.device, "starting error recovery\n"); queue_work(nvme_reset_wq, &ctrl->err_work); } @@ -1189,7 +1207,7 @@ static void nvme_rdma_end_request(struct nvme_rdma_request *req) if (!refcount_dec_and_test(&req->ref)) return; - if (!nvme_end_request(rq, req->status, req->result)) + if (!nvme_try_complete_req(rq, req->status, req->result)) nvme_rdma_complete_rq(rq); } @@ -1915,7 +1933,7 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: nvme_rdma_destroy_queue_ib(queue); - /* fall through */ + fallthrough; case RDMA_CM_EVENT_ADDR_ERROR: dev_dbg(queue->ctrl->ctrl.device, "CM error event %d\n", ev->event); @@ -1946,6 +1964,22 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, return 0; } +static void nvme_rdma_complete_timed_out(struct request *rq) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = req->queue; + struct nvme_rdma_ctrl *ctrl = queue->ctrl; + + /* fence other contexts that may complete the command */ + mutex_lock(&ctrl->teardown_lock); + nvme_rdma_stop_queue(queue); + if (!blk_mq_request_completed(rq)) { + nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; + blk_mq_complete_request(rq); + } + mutex_unlock(&ctrl->teardown_lock); +} + static enum blk_eh_timer_return nvme_rdma_timeout(struct request *rq, bool reserved) { @@ -1956,29 +1990,29 @@ nvme_rdma_timeout(struct request *rq, bool reserved) dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n", rq->tag, nvme_rdma_queue_idx(queue)); - /* - * Restart the timer if a controller reset is already scheduled. Any - * timed out commands would be handled before entering the connecting - * state. - */ - if (ctrl->ctrl.state == NVME_CTRL_RESETTING) - return BLK_EH_RESET_TIMER; - if (ctrl->ctrl.state != NVME_CTRL_LIVE) { /* - * Teardown immediately if controller times out while starting - * or we are already started error recovery. all outstanding - * requests are completed on shutdown, so we return BLK_EH_DONE. + * If we are resetting, connecting or deleting we should + * complete immediately because we may block controller + * teardown or setup sequence + * - ctrl disable/shutdown fabrics requests + * - connect requests + * - initialization admin requests + * - I/O requests that entered after unquiescing and + * the controller stopped responding + * + * All other requests should be cancelled by the error + * recovery work, so it's fine that we fail it here. */ - flush_work(&ctrl->err_work); - nvme_rdma_teardown_io_queues(ctrl, false); - nvme_rdma_teardown_admin_queue(ctrl, false); + nvme_rdma_complete_timed_out(rq); return BLK_EH_DONE; } - dev_warn(ctrl->ctrl.device, "starting error recovery\n"); + /* + * LIVE state should trigger the normal error recovery which will + * handle completing this request. + */ nvme_rdma_error_recovery(ctrl); - return BLK_EH_RESET_TIMER; } @@ -2278,6 +2312,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, return ERR_PTR(-ENOMEM); ctrl->ctrl.opts = opts; INIT_LIST_HEAD(&ctrl->list); + mutex_init(&ctrl->teardown_lock); if (!(opts->mask & NVMF_OPT_TRSVCID)) { opts->trsvcid = diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 62fbaecdc960..d6a3e1487354 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -124,6 +124,7 @@ struct nvme_tcp_ctrl { struct sockaddr_storage src_addr; struct nvme_ctrl ctrl; + struct mutex teardown_lock; struct work_struct err_work; struct delayed_work connect_work; struct nvme_tcp_request async_req; @@ -464,6 +465,7 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl) if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) return; + dev_warn(ctrl->device, "starting error recovery\n"); queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work); } @@ -481,7 +483,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, return -EINVAL; } - if (!nvme_end_request(rq, cqe->status, cqe->result)) + if (!nvme_try_complete_req(rq, cqe->status, cqe->result)) nvme_complete_rq(rq); queue->nr_cqe++; @@ -672,7 +674,7 @@ static inline void nvme_tcp_end_request(struct request *rq, u16 status) { union nvme_result res = {}; - if (!nvme_end_request(rq, cpu_to_le16(status << 1), res)) + if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res)) nvme_complete_rq(rq); } @@ -866,7 +868,6 @@ static void nvme_tcp_state_change(struct sock *sk) case TCP_LAST_ACK: case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: - /* fallthrough */ nvme_tcp_error_recovery(&queue->ctrl->ctrl); break; default: @@ -912,12 +913,11 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) else flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; - /* can't zcopy slab pages */ - if (unlikely(PageSlab(page))) { - ret = sock_no_sendpage(queue->sock, page, offset, len, + if (sendpage_ok(page)) { + ret = kernel_sendpage(queue->sock, page, offset, len, flags); } else { - ret = kernel_sendpage(queue->sock, page, offset, len, + ret = sock_no_sendpage(queue->sock, page, offset, len, flags); } if (ret <= 0) @@ -1527,7 +1527,6 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags)) return; - __nvme_tcp_stop_queue(queue); } @@ -1596,6 +1595,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl, static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl) { if (to_tcp_ctrl(ctrl)->async_req.pdu) { + cancel_work_sync(&ctrl->async_event_work); nvme_tcp_free_async_req(to_tcp_ctrl(ctrl)); to_tcp_ctrl(ctrl)->async_req.pdu = NULL; } @@ -1782,7 +1782,15 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) if (!new) { nvme_start_queues(ctrl); - nvme_wait_freeze(ctrl); + if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) { + /* + * If we timed out waiting for freeze we are likely to + * be stuck. Fail the controller initialization just + * to be safe. + */ + ret = -ENODEV; + goto out_wait_freeze_timed_out; + } blk_mq_update_nr_hw_queues(ctrl->tagset, ctrl->queue_count - 1); nvme_unfreeze(ctrl); @@ -1790,6 +1798,9 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) return 0; +out_wait_freeze_timed_out: + nvme_stop_queues(ctrl); + nvme_tcp_stop_io_queues(ctrl); out_cleanup_connect_q: if (new) blk_cleanup_queue(ctrl->connect_q); @@ -1875,6 +1886,7 @@ out_free_queue: static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, bool remove) { + mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock); blk_mq_quiesce_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); if (ctrl->admin_tagset) { @@ -1885,13 +1897,16 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, if (remove) blk_mq_unquiesce_queue(ctrl->admin_q); nvme_tcp_destroy_admin_queue(ctrl, remove); + mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock); } static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, bool remove) { + mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock); if (ctrl->queue_count <= 1) - return; + goto out; + blk_mq_quiesce_queue(ctrl->admin_q); nvme_start_freeze(ctrl); nvme_stop_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); @@ -1903,6 +1918,8 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, if (remove) nvme_start_queues(ctrl); nvme_tcp_destroy_io_queues(ctrl, remove); +out: + mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock); } static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl) @@ -2149,40 +2166,55 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg) nvme_tcp_queue_request(&ctrl->async_req, true, true); } +static void nvme_tcp_complete_timed_out(struct request *rq) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; + + /* fence other contexts that may complete the command */ + mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock); + nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue)); + if (!blk_mq_request_completed(rq)) { + nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; + blk_mq_complete_request(rq); + } + mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock); +} + static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq, bool reserved) { struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); - struct nvme_tcp_ctrl *ctrl = req->queue->ctrl; + struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; struct nvme_tcp_cmd_pdu *pdu = req->pdu; - /* - * Restart the timer if a controller reset is already scheduled. Any - * timed out commands would be handled before entering the connecting - * state. - */ - if (ctrl->ctrl.state == NVME_CTRL_RESETTING) - return BLK_EH_RESET_TIMER; - - dev_warn(ctrl->ctrl.device, + dev_warn(ctrl->device, "queue %d: timeout request %#x type %d\n", nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type); - if (ctrl->ctrl.state != NVME_CTRL_LIVE) { + if (ctrl->state != NVME_CTRL_LIVE) { /* - * Teardown immediately if controller times out while starting - * or we are already started error recovery. all outstanding - * requests are completed on shutdown, so we return BLK_EH_DONE. + * If we are resetting, connecting or deleting we should + * complete immediately because we may block controller + * teardown or setup sequence + * - ctrl disable/shutdown fabrics requests + * - connect requests + * - initialization admin requests + * - I/O requests that entered after unquiescing and + * the controller stopped responding + * + * All other requests should be cancelled by the error + * recovery work, so it's fine that we fail it here. */ - flush_work(&ctrl->err_work); - nvme_tcp_teardown_io_queues(&ctrl->ctrl, false); - nvme_tcp_teardown_admin_queue(&ctrl->ctrl, false); + nvme_tcp_complete_timed_out(rq); return BLK_EH_DONE; } - dev_warn(ctrl->ctrl.device, "starting error recovery\n"); - nvme_tcp_error_recovery(&ctrl->ctrl); - + /* + * LIVE state should trigger the normal error recovery which will + * handle completing this request. + */ + nvme_tcp_error_recovery(ctrl); return BLK_EH_RESET_TIMER; } @@ -2423,6 +2455,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, nvme_tcp_reconnect_ctrl_work); INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work); + mutex_init(&ctrl->teardown_lock); if (!(opts->mask & NVMF_OPT_TRSVCID)) { opts->trsvcid = diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 57cfd78731fb..67e87e9f306f 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -7,6 +7,17 @@ #include <linux/vmalloc.h> #include "nvme.h" +int nvme_revalidate_zones(struct nvme_ns *ns) +{ + struct request_queue *q = ns->queue; + int ret; + + ret = blk_revalidate_disk_zones(ns->disk, NULL); + if (!ret) + blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append); + return ret; +} + static int nvme_set_max_append(struct nvme_ctrl *ctrl) { struct nvme_command c = { }; @@ -35,11 +46,10 @@ static int nvme_set_max_append(struct nvme_ctrl *ctrl) return 0; } -int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns, - unsigned lbaf) +int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) { struct nvme_effects_log *log = ns->head->effects; - struct request_queue *q = disk->queue; + struct request_queue *q = ns->queue; struct nvme_command c = { }; struct nvme_id_ns_zns *id; int status; @@ -133,28 +143,6 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns, return NULL; } -static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, - struct nvme_zone_report *report, - size_t buflen) -{ - struct nvme_command c = { }; - int ret; - - c.zmr.opcode = nvme_cmd_zone_mgmt_recv; - c.zmr.nsid = cpu_to_le32(ns->head->ns_id); - c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector)); - c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen)); - c.zmr.zra = NVME_ZRA_ZONE_REPORT; - c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL; - c.zmr.pr = NVME_REPORT_ZONE_PARTIAL; - - ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen); - if (ret) - return ret; - - return le64_to_cpu(report->nr_zones); -} - static int nvme_zone_parse_entry(struct nvme_ns *ns, struct nvme_zone_descriptor *entry, unsigned int idx, report_zones_cb cb, @@ -182,6 +170,7 @@ static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data) { struct nvme_zone_report *report; + struct nvme_command c = { }; int ret, zone_idx = 0; unsigned int nz, i; size_t buflen; @@ -190,14 +179,26 @@ static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, if (!report) return -ENOMEM; + c.zmr.opcode = nvme_cmd_zone_mgmt_recv; + c.zmr.nsid = cpu_to_le32(ns->head->ns_id); + c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen)); + c.zmr.zra = NVME_ZRA_ZONE_REPORT; + c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL; + c.zmr.pr = NVME_REPORT_ZONE_PARTIAL; + sector &= ~(ns->zsze - 1); while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) { memset(report, 0, buflen); - ret = __nvme_ns_report_zones(ns, sector, report, buflen); - if (ret < 0) + + c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector)); + ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen); + if (ret) { + if (ret > 0) + ret = -EIO; goto out_free; + } - nz = min_t(unsigned int, ret, nr_zones); + nz = min((unsigned int)le64_to_cpu(report->nr_zones), nr_zones); if (!nz) break; diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index e9fe91786bbb..dca34489a1dc 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -727,7 +727,9 @@ u16 nvmet_set_feat_kato(struct nvmet_req *req) { u32 val32 = le32_to_cpu(req->cmd->common.cdw11); + nvmet_stop_keep_alive_timer(req->sq->ctrl); req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000); + nvmet_start_keep_alive_timer(req->sq->ctrl); nvmet_set_result(req, req->sq->ctrl->kato); diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 74b2b61c773b..37e1d7784e17 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1136,6 +1136,7 @@ static ssize_t nvmet_subsys_attr_model_store(struct config_item *item, up_write(&nvmet_config_sem); kfree_rcu(new_model, rcuhead); + kfree(new_model_number); return count; } diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index b92f45f5cd5b..25d62d867563 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -73,7 +73,7 @@ inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno) status = NVME_SC_ACCESS_DENIED; break; case -EIO: - /* FALLTHRU */ + fallthrough; default: req->error_loc = offsetof(struct nvme_common_command, opcode); status = NVME_SC_INTERNAL | NVME_SC_DNR; @@ -395,8 +395,11 @@ static void nvmet_keep_alive_timer(struct work_struct *work) nvmet_ctrl_fatal_error(ctrl); } -static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl) +void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl) { + if (unlikely(ctrl->kato == 0)) + return; + pr_debug("ctrl %d start keep-alive timer for %d secs\n", ctrl->cntlid, ctrl->kato); @@ -404,8 +407,11 @@ static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl) schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); } -static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl) +void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl) { + if (unlikely(ctrl->kato == 0)) + return; + pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid); cancel_delayed_work_sync(&ctrl->ka_work); diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 55bafd56166a..cd4e73aa9807 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1019,7 +1019,7 @@ static void nvmet_fc_free_hostport(struct nvmet_fc_hostport *hostport) { /* if LLDD not implemented, leave as NULL */ - if (!hostport->hosthandle) + if (!hostport || !hostport->hosthandle) return; nvmet_fc_hostport_put(hostport); @@ -2342,9 +2342,9 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) return; if (fcpreq->fcp_error || fcpreq->transferred_length != fcpreq->transfer_length) { - spin_lock(&fod->flock); + spin_lock_irqsave(&fod->flock, flags); fod->abort = true; - spin_unlock(&fod->flock); + spin_unlock_irqrestore(&fod->flock, flags); nvmet_req_complete(&fod->req, NVME_SC_INTERNAL); return; diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index c97e60b71bbc..3da067a8311e 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -812,7 +812,7 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport, break; /* Fall-Thru to RSP handling */ - /* FALLTHRU */ + fallthrough; case NVMET_FCOP_RSP: if (fcpreq) { diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 3dd6f566a240..125dde3f410e 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -139,7 +139,6 @@ static u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts) req->error_loc = offsetof(struct nvme_rw_command, nsid); break; case BLK_STS_IOERR: - /* fallthru */ default: status = NVME_SC_INTERNAL | NVME_SC_DNR; req->error_loc = offsetof(struct nvme_common_command, opcode); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 4884ef1e46a2..f6d81239be21 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -115,7 +115,7 @@ static void nvme_loop_queue_response(struct nvmet_req *req) return; } - if (!nvme_end_request(rq, cqe->status, cqe->result)) + if (!nvme_try_complete_req(rq, cqe->status, cqe->result)) nvme_loop_complete_rq(rq); } } @@ -579,7 +579,7 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops, 0 /* no quirks, we're perfect! */); if (ret) - goto out_put_ctrl; + goto out; if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) WARN_ON_ONCE(1); @@ -635,8 +635,8 @@ out_free_queues: kfree(ctrl->queues); out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); -out_put_ctrl: nvme_put_ctrl(&ctrl->ctrl); +out: if (ret > 0) ret = -EIO; return ERR_PTR(ret); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 47ee3fb193bd..559a15ccc322 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -395,6 +395,8 @@ void nvmet_get_feat_async_event(struct nvmet_req *req); u16 nvmet_set_feat_kato(struct nvmet_req *req); u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask); void nvmet_execute_async_event(struct nvmet_req *req); +void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl); +void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl); u16 nvmet_parse_connect_cmd(struct nvmet_req *req); void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id); diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 89d91dc999a6..56c571052216 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -165,7 +165,7 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w) req->cqe->result = nvme_req(rq)->result; nvmet_req_complete(req, status); - blk_put_request(rq); + blk_mq_free_request(rq); } static void nvmet_passthru_req_done(struct request *rq, @@ -175,7 +175,7 @@ static void nvmet_passthru_req_done(struct request *rq, req->cqe->result = nvme_req(rq)->result; nvmet_req_complete(req, nvme_req(rq)->status); - blk_put_request(rq); + blk_mq_free_request(rq); } static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) @@ -230,7 +230,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) if (unlikely(!ns)) { pr_err("failed to get passthru ns nsid:%u\n", nsid); status = NVME_SC_INVALID_NS | NVME_SC_DNR; - goto fail_out; + goto out; } q = ns->queue; @@ -238,16 +238,15 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) rq = nvme_alloc_request(q, req->cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); if (IS_ERR(rq)) { - rq = NULL; status = NVME_SC_INTERNAL; - goto fail_out; + goto out_put_ns; } if (req->sg_cnt) { ret = nvmet_passthru_map_sg(req, rq); if (unlikely(ret)) { status = NVME_SC_INTERNAL; - goto fail_out; + goto out_put_req; } } @@ -274,11 +273,13 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) return; -fail_out: +out_put_req: + blk_mq_free_request(rq); +out_put_ns: if (ns) nvme_put_ns(ns); +out: nvmet_req_complete(req, status); - blk_put_request(rq); } /* @@ -326,6 +327,10 @@ static u16 nvmet_setup_passthru_command(struct nvmet_req *req) u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req) { + /* Reject any commands with non-sgl flags set (ie. fused commands) */ + if (req->cmd->common.flags & ~NVME_CMD_SGL_ALL) + return NVME_SC_INVALID_FIELD; + switch (req->cmd->common.opcode) { case nvme_cmd_resv_register: case nvme_cmd_resv_report: @@ -396,6 +401,10 @@ static u16 nvmet_passthru_get_set_features(struct nvmet_req *req) u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) { + /* Reject any commands with non-sgl flags set (ie. fused commands) */ + if (req->cmd->common.flags & ~NVME_CMD_SGL_ALL) + return NVME_SC_INVALID_FIELD; + /* * Passthru all vendor specific commands */ @@ -447,10 +456,26 @@ u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) req->execute = nvmet_passthru_execute_cmd; req->p.use_workqueue = true; return NVME_SC_SUCCESS; + case NVME_ID_CNS_CS_CTRL: + switch (req->cmd->identify.csi) { + case NVME_CSI_ZNS: + req->execute = nvmet_passthru_execute_cmd; + req->p.use_workqueue = true; + return NVME_SC_SUCCESS; + } + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; case NVME_ID_CNS_NS: req->execute = nvmet_passthru_execute_cmd; req->p.use_workqueue = true; return NVME_SC_SUCCESS; + case NVME_ID_CNS_CS_NS: + switch (req->cmd->identify.csi) { + case NVME_CSI_ZNS: + req->execute = nvmet_passthru_execute_cmd; + req->p.use_workqueue = true; + return NVME_SC_SUCCESS; + } + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; default: return nvmet_setup_passthru_command(req); } @@ -465,6 +490,7 @@ u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys) { struct nvme_ctrl *ctrl; + struct file *file; int ret = -EINVAL; void *old; @@ -479,24 +505,29 @@ int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys) goto out_unlock; } - ctrl = nvme_ctrl_get_by_path(subsys->passthru_ctrl_path); - if (IS_ERR(ctrl)) { - ret = PTR_ERR(ctrl); + file = filp_open(subsys->passthru_ctrl_path, O_RDWR, 0); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto out_unlock; + } + + ctrl = nvme_ctrl_from_file(file); + if (!ctrl) { pr_err("failed to open nvme controller %s\n", subsys->passthru_ctrl_path); - goto out_unlock; + goto out_put_file; } old = xa_cmpxchg(&passthru_subsystems, ctrl->cntlid, NULL, subsys, GFP_KERNEL); if (xa_is_err(old)) { ret = xa_err(old); - goto out_put_ctrl; + goto out_put_file; } if (old) - goto out_put_ctrl; + goto out_put_file; subsys->passthru_ctrl = ctrl; subsys->ver = ctrl->vs; @@ -507,12 +538,12 @@ int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys) NVME_TERTIARY(subsys->ver)); subsys->ver = NVME_VS(1, 2, 1); } + nvme_get_ctrl(ctrl); + __module_get(subsys->passthru_ctrl->ops->module); + ret = 0; - mutex_unlock(&subsys->lock); - return 0; - -out_put_ctrl: - nvme_put_ctrl(ctrl); +out_put_file: + filp_close(file, NULL); out_unlock: mutex_unlock(&subsys->lock); return ret; @@ -522,6 +553,7 @@ static void __nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys) { if (subsys->passthru_ctrl) { xa_erase(&passthru_subsystems, subsys->passthru_ctrl->cntlid); + module_put(subsys->passthru_ctrl->ops->module); nvme_put_ctrl(subsys->passthru_ctrl); } subsys->passthru_ctrl = NULL; diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 3ccb59260b4a..ae6620489457 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1758,7 +1758,7 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, schedule_delayed_work(&port->repair_work, 0); break; } - /* FALLTHROUGH */ + fallthrough; case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_TIMEWAIT_EXIT: nvmet_rdma_queue_disconnect(queue); @@ -1769,7 +1769,7 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_REJECTED: pr_debug("Connection rejected: %s\n", rdma_reject_msg(cm_id, event->status)); - /* FALLTHROUGH */ + fallthrough; case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_CONNECT_ERROR: nvmet_rdma_queue_connect_fail(cm_id, queue); diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 9eda91162fe4..dc1f0f647189 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -94,7 +94,6 @@ struct nvmet_tcp_queue { struct socket *sock; struct nvmet_tcp_port *port; struct work_struct io_work; - int cpu; struct nvmet_cq nvme_cq; struct nvmet_sq nvme_sq; @@ -144,7 +143,6 @@ struct nvmet_tcp_port { struct work_struct accept_work; struct nvmet_port *nport; struct sockaddr_storage addr; - int last_cpu; void (*data_ready)(struct sock *); }; @@ -160,6 +158,11 @@ static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd); static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue, struct nvmet_tcp_cmd *cmd) { + if (unlikely(!queue->nr_cmds)) { + /* We didn't allocate cmds yet, send 0xffff */ + return USHRT_MAX; + } + return cmd - queue->cmds; } @@ -214,6 +217,11 @@ static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd) list_add_tail(&cmd->entry, &cmd->queue->free_list); } +static inline int queue_cpu(struct nvmet_tcp_queue *queue) +{ + return queue->sock->sk->sk_incoming_cpu; +} + static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue) { return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0; @@ -501,7 +509,7 @@ static void nvmet_tcp_queue_response(struct nvmet_req *req) struct nvmet_tcp_queue *queue = cmd->queue; llist_add(&cmd->lentry, &queue->resp_list); - queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work); + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work); } static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd) @@ -866,7 +874,10 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) struct nvme_tcp_data_pdu *data = &queue->pdu.data; struct nvmet_tcp_cmd *cmd; - cmd = &queue->cmds[data->ttag]; + if (likely(queue->nr_cmds)) + cmd = &queue->cmds[data->ttag]; + else + cmd = &queue->connect; if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) { pr_err("ttag %u unexpected data offset %u (expected %u)\n", @@ -1215,7 +1226,7 @@ static void nvmet_tcp_io_work(struct work_struct *w) * We exahusted our budget, requeue our selves */ if (pending) - queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work); + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); } static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue, @@ -1375,7 +1386,7 @@ static void nvmet_tcp_data_ready(struct sock *sk) read_lock_bh(&sk->sk_callback_lock); queue = sk->sk_user_data; if (likely(queue)) - queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work); + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); read_unlock_bh(&sk->sk_callback_lock); } @@ -1395,7 +1406,7 @@ static void nvmet_tcp_write_space(struct sock *sk) if (sk_stream_is_writeable(sk)) { clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work); + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); } out: read_unlock_bh(&sk->sk_callback_lock); @@ -1504,9 +1515,6 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, if (ret) goto out_free_connect; - port->last_cpu = cpumask_next_wrap(port->last_cpu, - cpu_online_mask, -1, false); - queue->cpu = port->last_cpu; nvmet_prepare_receive_pdu(queue); mutex_lock(&nvmet_tcp_queue_mutex); @@ -1517,7 +1525,7 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, if (ret) goto out_destroy_sq; - queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work); + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); return 0; out_destroy_sq: @@ -1604,7 +1612,6 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) } port->nport = nport; - port->last_cpu = -1; INIT_WORK(&port->accept_work, nvmet_tcp_accept_work); if (port->nport->inline_data_size < 0) port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE; |