From aa306ab703e9452b1e25cc8e8f04b8df523d0bb8 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Wed, 24 Jul 2019 11:48:39 +0800
Subject: blk-mq: introduce blk_mq_request_completed()

NVMe needs this function to decide whether a request that is about to be
aborted has already been completed in the normal IO path. So introduce it.

Cc: Max Gurtovoy
Cc: Sagi Grimberg
Cc: Keith Busch
Cc: Christoph Hellwig
Reviewed-by: Sagi Grimberg
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f78d3287dd82..8bb5854a62f3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -665,6 +665,12 @@ int blk_mq_request_started(struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_mq_request_started);
 
+int blk_mq_request_completed(struct request *rq)
+{
+	return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
+}
+EXPORT_SYMBOL_GPL(blk_mq_request_completed);
+
 void blk_mq_start_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
-- cgit

From a87ccce0b5a06ee546931859fa62e10f1bce54f9 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Wed, 24 Jul 2019 11:48:43 +0800
Subject: blk-mq: remove blk_mq_complete_request_sync

blk_mq_tagset_wait_completed_request() has been introduced for waiting on
the completion fn of already completed requests, so it is no longer
necessary to use blk_mq_complete_request_sync().

Cc: Max Gurtovoy
Cc: Sagi Grimberg
Cc: Keith Busch
Cc: Christoph Hellwig
Reviewed-by: Sagi Grimberg
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8bb5854a62f3..6968de9d7402 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -652,13 +652,6 @@ bool blk_mq_complete_request(struct request *rq)
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
-void blk_mq_complete_request_sync(struct request *rq)
-{
-	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
-	rq->q->mq_ops->complete(rq);
-}
-EXPORT_SYMBOL_GPL(blk_mq_complete_request_sync);
-
 int blk_mq_request_started(struct request *rq)
 {
 	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
-- cgit

From 73d9c8d4c0017e21e1ff519474ceb1450484dc9a Mon Sep 17 00:00:00 2001
From: zhengbin
Date: Tue, 23 Jul 2019 22:10:42 +0800
Subject: blk-mq: Fix memory leak in blk_mq_init_allocated_queue error handling

If blk_mq_init_allocated_queue->elevator_init_mq fails, we need to release
the previously requested resources.
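For reference, the caller side (blk_mq_init_queue(), visible as context in a
later hunk in this series) handles the failure like this, so anything set up
along the way has to be unwound inside blk_mq_init_allocated_queue() itself
before the error pointer is returned; roughly:

	q = blk_mq_init_allocated_queue(set, uninit_q);
	if (IS_ERR(q))
		blk_cleanup_queue(uninit_q);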
Fixes: d34849913819 ("blk-mq-sched: allow setting of default IO scheduler")
Signed-off-by: zhengbin
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6968de9d7402..509f69fdfcf2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2846,6 +2846,8 @@ static unsigned int nr_hw_queues(struct blk_mq_tag_set *set)
 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
						  struct request_queue *q)
 {
+	int ret = -ENOMEM;
+
 	/* mark the queue as mq asap */
 	q->mq_ops = set->ops;
 
@@ -2907,17 +2909,18 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	blk_mq_map_swqueue(q);
 
 	if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
-		int ret;
-
 		ret = elevator_init_mq(q);
 		if (ret)
-			return ERR_PTR(ret);
+			goto err_tag_set;
 	}
 
 	return q;
 
+err_tag_set:
+	blk_mq_del_queue_tag_set(q);
 err_hctxs:
 	kfree(q->queue_hw_ctx);
+	q->nr_hw_queues = 0;
 err_sys_init:
 	blk_mq_sysfs_deinit(q);
 err_poll:
-- cgit

From c6ba933358f0d7a6a042b894dba20cc70396a6d3 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Tue, 27 Aug 2019 19:01:46 +0800
Subject: blk-mq: don't hold q->sysfs_lock in blk_mq_map_swqueue

blk_mq_map_swqueue() is called from blk_mq_init_allocated_queue() and
blk_mq_update_nr_hw_queues(). For the former caller, the kobject isn't
exposed to userspace yet. For the latter caller, the hctx sysfs entries and
debugfs are unregistered before updating nr_hw_queues.

On the other hand, commit 2f8f1336a48b ("blk-mq: always free hctx after
request queue is freed") moved the freeing of hctx into the queue's release
handler, so there is no race with the queue release path either.

So don't hold q->sysfs_lock in blk_mq_map_swqueue().

Cc: Christoph Hellwig
Cc: Hannes Reinecke
Cc: Greg KH
Cc: Mike Snitzer
Cc: Bart Van Assche
Reviewed-by: Bart Van Assche
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 509f69fdfcf2..cf768d0c2950 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2456,11 +2456,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	struct blk_mq_ctx *ctx;
 	struct blk_mq_tag_set *set = q->tag_set;
 
-	/*
-	 * Avoid others reading imcomplete hctx->cpumask through sysfs
-	 */
-	mutex_lock(&q->sysfs_lock);
-
 	queue_for_each_hw_ctx(q, hctx, i) {
 		cpumask_clear(hctx->cpumask);
 		hctx->nr_ctx = 0;
@@ -2521,8 +2516,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 						HCTX_TYPE_DEFAULT, i);
 	}
 
-	mutex_unlock(&q->sysfs_lock);
-
 	queue_for_each_hw_ctx(q, hctx, i) {
 		/*
 		 * If no software queues are mapped to this hardware queue,
-- cgit

From 6f816b4b746c2241540e537682d30d8e9997d674 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 28 Aug 2019 15:05:57 -0700
Subject: blk-mq: add optional request->alloc_time_ns

There are currently two start time timestamps - start_time_ns and
io_start_time_ns. The former marks the request allocation and the latter
the issue-to-device time. The planned io.weight controller needs to measure
the total time bios take to execute after they leave rq_qos, including the
time spent waiting for a request to become available, which can easily
dominate on saturated devices.

This patch adds request->alloc_time_ns, which records when the request
allocation attempt started.
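Purely as an illustration of the intent (this consumer is not part of the
patch, and the variable below is made up), the total wait before issue could
later be derived as something like:

	/* hypothetical consumer sketch, not in this patch */
	u64 wait_ns = rq->io_start_time_ns - rq->alloc_time_ns;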
As it isn't used for the usual stats, make it optional behind
CONFIG_BLK_RQ_ALLOC_TIME and QUEUE_FLAG_RQ_ALLOC_TIME, so that it can be
compiled out when there are no users, and so that even when compiled in it
is active only on queues which need it.

v2: s/pre_start_time/alloc_time/ and add CONFIG_BLK_RQ_ALLOC_TIME gating as
suggested by Jens.

Signed-off-by: Tejun Heo
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index cf768d0c2950..004411236034 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -291,7 +291,7 @@ static inline bool blk_mq_need_time_stamp(struct request *rq)
 }
 
 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
-		unsigned int tag, unsigned int op)
+		unsigned int tag, unsigned int op, u64 alloc_time_ns)
 {
 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 	struct request *rq = tags->static_rqs[tag];
@@ -325,6 +325,9 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->rq_disk = NULL;
 	rq->part = NULL;
+#ifdef CONFIG_BLK_RQ_ALLOC_TIME
+	rq->alloc_time_ns = alloc_time_ns;
+#endif
 	if (blk_mq_need_time_stamp(rq))
 		rq->start_time_ns = ktime_get_ns();
 	else
@@ -356,8 +359,14 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 	struct request *rq;
 	unsigned int tag;
 	bool clear_ctx_on_error = false;
+	u64 alloc_time_ns = 0;
 
 	blk_queue_enter_live(q);
+
+	/* alloc_time includes depth and tag waits */
+	if (blk_queue_rq_alloc_time(q))
+		alloc_time_ns = ktime_get_ns();
+
 	data->q = q;
 	if (likely(!data->ctx)) {
 		data->ctx = blk_mq_get_ctx(q);
@@ -393,7 +402,7 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 		return NULL;
 	}
 
-	rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags);
+	rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags, alloc_time_ns);
 	if (!op_is_flush(data->cmd_flags)) {
 		rq->elv.icq = NULL;
 		if (e && e->type->ops.prepare_request) {
-- cgit

From 61db437d1cc16c470cf6fccc04d34be9cf6e4e4b Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Thu, 5 Sep 2019 18:51:29 +0900
Subject: block: Cleanup elevator_init_mq() use

Instead of checking the BLK_MQ_F_NO_SCHED flag of a queue's tag_set before
calling elevator_init_mq() to make sure that the queue supports IO
scheduling, use the elevator.c function elv_support_iosched() in
elevator_init_mq(). This does not introduce any functional change but
ensures that elevator_init_mq() does the right thing based on the queue
settings.
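For reference, a rough sketch of the check elv_support_iosched() performs
(simplified; the real helper lives in block/elevator.c, not in the hunk
below):

	static bool elv_support_iosched(struct request_queue *q)
	{
		if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
			return false;
		return true;
	}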
Reviewed-by: Ming Lei
Reviewed-by: Johannes Thumshirn
Reviewed-by: Christoph Hellwig
Signed-off-by: Damien Le Moal
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 004411236034..c3bd5b48a5b1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2910,11 +2910,9 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	blk_mq_add_queue_tag_set(set, q);
 	blk_mq_map_swqueue(q);
 
-	if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
-		ret = elevator_init_mq(q);
-		if (ret)
-			goto err_tag_set;
-	}
+	ret = elevator_init_mq(q);
+	if (ret)
+		goto err_tag_set;
 
 	return q;
 
-- cgit

From 954b4a5ce4a806e7c284ce6b2659abdd03d0b6e2 Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Thu, 5 Sep 2019 18:51:30 +0900
Subject: block: Change elevator_init_mq() to always succeed

If the default elevator chosen is mq-deadline, elevator_init_mq() may
return an error if mq-deadline initialization fails, leading to
blk_mq_init_allocated_queue() returning an error, which in turn will cause
the block device initialization to fail and the device not to be exposed.

Instead of taking such an extreme measure, handle mq-deadline
initialization failures in the same manner as when mq-deadline is not
available (no module to load), that is, default to the "none" scheduler.
With this change, the return type of elevator_init_mq() can be changed to
void.

Reviewed-by: Johannes Thumshirn
Reviewed-by: Christoph Hellwig
Reviewed-by: Ming Lei
Signed-off-by: Damien Le Moal
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index c3bd5b48a5b1..d10a7ab4207a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2848,8 +2848,6 @@ static unsigned int nr_hw_queues(struct blk_mq_tag_set *set)
 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
						  struct request_queue *q)
 {
-	int ret = -ENOMEM;
-
 	/* mark the queue as mq asap */
 	q->mq_ops = set->ops;
 
@@ -2910,14 +2908,10 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	blk_mq_add_queue_tag_set(set, q);
 	blk_mq_map_swqueue(q);
 
-	ret = elevator_init_mq(q);
-	if (ret)
-		goto err_tag_set;
+	elevator_init_mq(q);
 
 	return q;
 
-err_tag_set:
-	blk_mq_del_queue_tag_set(q);
 err_hctxs:
 	kfree(q->queue_hw_ctx);
 	q->nr_hw_queues = 0;
-- cgit

From 737eb78e82d52d35df166d29af32bf61992de71d Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Thu, 5 Sep 2019 18:51:33 +0900
Subject: block: Delay default elevator initialization

When elevator_init_mq() is called from blk_mq_init_allocated_queue(), the
only information known about the device is the number of hardware queues,
as the block device scan by the device driver is not completed yet for most
drivers. The device type and elevator required features are not set yet,
which prevents correctly selecting the default elevator most suitable for
the device.

This currently affects all multi-queue zoned block devices, which default
to the "none" elevator instead of the required "mq-deadline" elevator.
These drives currently include host-managed SMR disks connected to a
smartpqi HBA and null_blk block devices with zoned mode enabled. Upcoming
NVMe Zoned Namespace devices will also be affected.

Fix this by adding the boolean elevator_init argument to
blk_mq_init_allocated_queue() to control the execution of
elevator_init_mq().
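The resulting prototype, as updated in the hunks below, is:

	struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
							  struct request_queue *q,
							  bool elevator_init);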
Two cases exist:

1) elevator_init = false is used for calls to blk_mq_init_allocated_queue()
   within blk_mq_init_queue(). In this case, a call to elevator_init_mq() is
   added to __device_add_disk(), resulting in the delayed initialization of
   the queue elevator after the device driver has finished probing the
   device information. This effectively allows elevator_init_mq() access to
   more information about the device.

2) elevator_init = true preserves the current behavior of initializing the
   elevator directly from blk_mq_init_allocated_queue(). This case is used
   for the special request-based DM devices, where the device gendisk is
   created before the queue initialization and the device information (e.g.
   queue limits) is already known when the queue initialization is executed.

Additionally, to make sure that the elevator initialization is never done
while requests are in flight (there should be none when the device driver
calls device_add_disk()), freeze and quiesce the device request queue
before calling blk_mq_init_sched() in elevator_init_mq().

Reviewed-by: Ming Lei
Signed-off-by: Damien Le Moal
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index d10a7ab4207a..3647776a0f6e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2695,7 +2695,11 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	if (!uninit_q)
 		return ERR_PTR(-ENOMEM);
 
-	q = blk_mq_init_allocated_queue(set, uninit_q);
+	/*
+	 * Initialize the queue without an elevator. device_add_disk() will do
+	 * the initialization.
+	 */
+	q = blk_mq_init_allocated_queue(set, uninit_q, false);
 	if (IS_ERR(q))
 		blk_cleanup_queue(uninit_q);
 
@@ -2846,7 +2850,8 @@ static unsigned int nr_hw_queues(struct blk_mq_tag_set *set)
 }
 
 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
-						  struct request_queue *q)
+						  struct request_queue *q,
+						  bool elevator_init)
 {
 	/* mark the queue as mq asap */
 	q->mq_ops = set->ops;
@@ -2908,7 +2913,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	blk_mq_add_queue_tag_set(set, q);
 	blk_mq_map_swqueue(q);
 
-	elevator_init_mq(q);
+	if (elevator_init)
+		elevator_init_mq(q);
 
 	return q;
 
-- cgit

From 3d24430694077313c75c6b89f618db09943621e4 Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Tue, 21 May 2019 15:59:03 +0800
Subject: block: make rq sector size accessible for block stats

Currently rq->data_len will be decreased by partial completion or zeroed
by completion, so when blk_stat_add() is invoked, data_len will be zero and
there will never be samples in poll_cb, because blk_mq_poll_stats_bkt()
returns -1 if data_len is zero.

We could move blk_stat_add() back to __blk_mq_complete_request(), but that
would defeat the purpose of calling ktime_get_ns() only once. Instead we
can reuse the throtl_size field, use it for both block stats and block
throttling, and adjust the logic in blk_mq_poll_stats_bkt() accordingly.
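Because the value is snapshotted when the request is started, it is not
affected by partial or full completion. The accessor used by the bucket
logic, blk_rq_stats_sectors(), is added in the header part of this patch
(not visible in this blk-mq.c-only view); roughly:

	static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
	{
		return rq->stats_sectors;
	}

As a worked example of the new bucket logic, a 4 KiB read gives
sectors = 8 and ilog2(8) = 3, so bucket = 0 + 2 * 3 = 6.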
Fixes: 4bc6339a583c ("block: move blk_stat_add() to __blk_mq_end_request()")
Tested-by: Pavel Begunkov
Signed-off-by: Hou Tao
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3647776a0f6e..d30fabb583fd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -44,12 +44,12 @@ static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
 
 static int blk_mq_poll_stats_bkt(const struct request *rq)
 {
-	int ddir, bytes, bucket;
+	int ddir, sectors, bucket;
 
 	ddir = rq_data_dir(rq);
-	bytes = blk_rq_bytes(rq);
+	sectors = blk_rq_stats_sectors(rq);
 
-	bucket = ddir + 2*(ilog2(bytes) - 9);
+	bucket = ddir + 2 * ilog2(sectors);
 
 	if (bucket < 0)
 		return -1;
@@ -333,6 +333,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	else
 		rq->start_time_ns = 0;
 	rq->io_start_time_ns = 0;
+	rq->stats_sectors = 0;
 	rq->nr_phys_segments = 0;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	rq->nr_integrity_segments = 0;
@@ -681,9 +682,7 @@ void blk_mq_start_request(struct request *rq)
 
 	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
 		rq->io_start_time_ns = ktime_get_ns();
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-		rq->throtl_size = blk_rq_sectors(rq);
-#endif
+		rq->stats_sectors = blk_rq_sectors(rq);
 		rq->rq_flags |= RQF_STATS;
 		rq_qos_issue(q, rq);
 	}
-- cgit

From 9a91b05bba58e5bd83034e69407d11641e8064e9 Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Tue, 21 May 2019 15:59:04 +0800
Subject: block: also check RQF_STATS in blk_mq_need_time_stamp()

In __blk_mq_end_request(), if the block stats need updating, we should
ensure that now holds a valid timestamp rather than 0, even when iostat is
disabled.

Signed-off-by: Hou Tao
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index d30fabb583fd..214ed0739aa5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -282,12 +282,12 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 EXPORT_SYMBOL(blk_mq_can_queue);
 
 /*
- * Only need start/end time stamping if we have stats enabled, or using
- * an IO scheduler.
+ * Only need start/end time stamping if we have iostat or
+ * blk stats enabled, or using an IO scheduler.
 */
 static inline bool blk_mq_need_time_stamp(struct request *rq)
 {
-	return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator;
+	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
 }
 
 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
-- cgit
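For context, a simplified sketch of the consumer this change guards,
__blk_mq_end_request() as of this series (shown only for illustration, not
part of the patch above):

	inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
	{
		u64 now = 0;

		if (blk_mq_need_time_stamp(rq))
			now = ktime_get_ns();

		if (rq->rq_flags & RQF_STATS) {
			blk_mq_poll_stats_start(rq->q);
			blk_stat_add(rq, now);	/* needs a real timestamp even if iostat is off */
		}
		/* ... accounting and completion follow ... */
	}

Without RQF_STATS in the check, a queue with stats enabled but iostat
disabled (and no elevator) would feed now = 0 into blk_stat_add().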