Diffstat (limited to 'block/blk-mq.c')
| -rw-r--r-- | block/blk-mq.c | 1228 |
1 file changed, 789 insertions, 439 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c index 3b4df8e5ac9e..bd8b11c472a2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -23,6 +23,7 @@ #include <linux/cache.h> #include <linux/sched/topology.h> #include <linux/sched/signal.h> +#include <linux/suspend.h> #include <linux/delay.h> #include <linux/crash_dump.h> #include <linux/prefetch.h> @@ -43,6 +44,7 @@ static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd); +static DEFINE_MUTEX(blk_mq_cpuhp_lock); static void blk_mq_insert_request(struct request *rq, blk_insert_t flags); static void blk_mq_request_bypass_insert(struct request *rq, @@ -88,11 +90,11 @@ struct mq_inflight { unsigned int inflight[2]; }; -static bool blk_mq_check_inflight(struct request *rq, void *priv) +static bool blk_mq_check_in_driver(struct request *rq, void *priv) { struct mq_inflight *mi = priv; - if (rq->part && blk_do_io_stat(rq) && + if (rq->rq_flags & RQF_IO_STAT && (!bdev_is_partition(mi->part) || rq->part == mi->part) && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) mi->inflight[rq_data_dir(rq)]++; @@ -100,29 +102,71 @@ static bool blk_mq_check_inflight(struct request *rq, void *priv) return true; } -unsigned int blk_mq_in_flight(struct request_queue *q, - struct block_device *part) +void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]) { struct mq_inflight mi = { .part = part }; - blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); + blk_mq_queue_tag_busy_iter(bdev_get_queue(part), blk_mq_check_in_driver, + &mi); + inflight[READ] = mi.inflight[READ]; + inflight[WRITE] = mi.inflight[WRITE]; +} + +#ifdef CONFIG_LOCKDEP +static bool blk_freeze_set_owner(struct request_queue *q, + struct task_struct *owner) +{ + if (!owner) + return false; + + if (!q->mq_freeze_depth) { + q->mq_freeze_owner = owner; + q->mq_freeze_owner_depth = 1; + q->mq_freeze_disk_dead = !q->disk || + test_bit(GD_DEAD, &q->disk->state) || + !blk_queue_registered(q); + q->mq_freeze_queue_dying = blk_queue_dying(q); + return true; + } - return mi.inflight[0] + mi.inflight[1]; + if (owner == q->mq_freeze_owner) + q->mq_freeze_owner_depth += 1; + return false; } -void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, - unsigned int inflight[2]) +/* verify the last unfreeze in owner context */ +static bool blk_unfreeze_check_owner(struct request_queue *q) { - struct mq_inflight mi = { .part = part }; + if (q->mq_freeze_owner != current) + return false; + if (--q->mq_freeze_owner_depth == 0) { + q->mq_freeze_owner = NULL; + return true; + } + return false; +} + +#else - blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); - inflight[0] = mi.inflight[0]; - inflight[1] = mi.inflight[1]; +static bool blk_freeze_set_owner(struct request_queue *q, + struct task_struct *owner) +{ + return false; } -void blk_freeze_queue_start(struct request_queue *q) +static bool blk_unfreeze_check_owner(struct request_queue *q) { + return false; +} +#endif + +bool __blk_freeze_queue_start(struct request_queue *q, + struct task_struct *owner) +{ + bool freeze; + mutex_lock(&q->mq_freeze_lock); + freeze = blk_freeze_set_owner(q, owner); if (++q->mq_freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); mutex_unlock(&q->mq_freeze_lock); @@ -131,6 +175,14 @@ void blk_freeze_queue_start(struct request_queue *q) } else { mutex_unlock(&q->mq_freeze_lock); } + + return freeze; +} + +void blk_freeze_queue_start(struct request_queue *q) +{ + if (__blk_freeze_queue_start(q, current)) + blk_freeze_acquire_lock(q); } 
EXPORT_SYMBOL_GPL(blk_freeze_queue_start); @@ -149,35 +201,17 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); -/* - * Guarantee no request is in use, so we can change any data structure of - * the queue afterward. - */ -void blk_freeze_queue(struct request_queue *q) +void blk_mq_freeze_queue_nomemsave(struct request_queue *q) { - /* - * In the !blk_mq case we are only calling this to kill the - * q_usage_counter, otherwise this increases the freeze depth - * and waits for it to return to zero. For this reason there is - * no blk_unfreeze_queue(), and blk_freeze_queue() is not - * exported to drivers as the only user for unfreeze is blk_mq. - */ blk_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } +EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_nomemsave); -void blk_mq_freeze_queue(struct request_queue *q) +bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) { - /* - * ...just an alias to keep freeze and unfreeze actions balanced - * in the blk_mq_* namespace - */ - blk_freeze_queue(q); -} -EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); + bool unfreeze; -void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) -{ mutex_lock(&q->mq_freeze_lock); if (force_atomic) q->q_usage_counter.data->force_atomic = true; @@ -187,14 +221,38 @@ void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) percpu_ref_resurrect(&q->q_usage_counter); wake_up_all(&q->mq_freeze_wq); } + unfreeze = blk_unfreeze_check_owner(q); mutex_unlock(&q->mq_freeze_lock); + + return unfreeze; +} + +void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q) +{ + if (__blk_mq_unfreeze_queue(q, false)) + blk_unfreeze_release_lock(q); } +EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_nomemrestore); -void blk_mq_unfreeze_queue(struct request_queue *q) +/* + * non_owner variant of blk_freeze_queue_start + * + * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen + * by the same task. This is fragile and should not be used if at all + * possible. 
+ */ +void blk_freeze_queue_start_non_owner(struct request_queue *q) +{ + __blk_freeze_queue_start(q, NULL); +} +EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner); + +/* non_owner variant of blk_mq_unfreeze_queue */ +void blk_mq_unfreeze_queue_non_owner(struct request_queue *q) { __blk_mq_unfreeze_queue(q, false); } -EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); +EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner); /* * FIXME: replace the scsi_internal_device_*block_nowait() calls in the @@ -283,8 +341,9 @@ void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set) if (!blk_queue_skip_tagset_quiesce(q)) blk_mq_quiesce_queue_nowait(q); } - blk_mq_wait_quiesce_done(set); mutex_unlock(&set->tag_list_lock); + + blk_mq_wait_quiesce_done(set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset); @@ -318,12 +377,12 @@ void blk_rq_init(struct request_queue *q, struct request *rq) INIT_LIST_HEAD(&rq->queuelist); rq->q = q; rq->__sector = (sector_t) -1; + rq->phys_gap_bit = 0; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = BLK_MQ_NO_TAG; rq->start_time_ns = blk_time_get_ns(); - rq->part = NULL; blk_crypto_rq_set_defaults(rq); } EXPORT_SYMBOL(blk_rq_init); @@ -331,19 +390,23 @@ EXPORT_SYMBOL(blk_rq_init); /* Set start and alloc time when the allocated request is actually used */ static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) { - if (blk_mq_need_time_stamp(rq)) - rq->start_time_ns = blk_time_get_ns(); - else - rq->start_time_ns = 0; - #ifdef CONFIG_BLK_RQ_ALLOC_TIME if (blk_queue_rq_alloc_time(rq->q)) - rq->alloc_time_ns = alloc_time_ns ?: rq->start_time_ns; + rq->alloc_time_ns = alloc_time_ns; else rq->alloc_time_ns = 0; #endif } +static inline void blk_mq_bio_issue_init(struct request_queue *q, + struct bio *bio) +{ +#ifdef CONFIG_BLK_CGROUP + if (test_bit(QUEUE_FLAG_BIO_ISSUE_TIME, &q->queue_flags)) + bio->issue_time_ns = blk_time_get_ns(); +#endif +} + static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, struct blk_mq_tags *tags, unsigned int tag) { @@ -359,8 +422,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, if (data->flags & BLK_MQ_REQ_PM) data->rq_flags |= RQF_PM; - if (blk_queue_io_stat(q)) - data->rq_flags |= RQF_IO_STAT; rq->rq_flags = data->rq_flags; if (data->rq_flags & RQF_SCHED_TAGS) { @@ -376,9 +437,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->io_start_time_ns = 0; rq->stats_sectors = 0; rq->nr_phys_segments = 0; -#if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; -#endif rq->end_io = NULL; rq->end_io_data = NULL; @@ -410,28 +469,33 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) unsigned long tag_mask; int i, nr = 0; - tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset); - if (unlikely(!tag_mask)) - return NULL; + do { + tag_mask = blk_mq_get_tags(data, data->nr_tags - nr, &tag_offset); + if (unlikely(!tag_mask)) { + if (nr == 0) + return NULL; + break; + } + tags = blk_mq_tags_from_data(data); + for (i = 0; tag_mask; i++) { + if (!(tag_mask & (1UL << i))) + continue; + tag = tag_offset + i; + prefetch(tags->static_rqs[tag]); + tag_mask &= ~(1UL << i); + rq = blk_mq_rq_ctx_init(data, tags, tag); + rq_list_add_head(data->cached_rqs, rq); + nr++; + } + } while (data->nr_tags > nr); - tags = blk_mq_tags_from_data(data); - for (i = 0; tag_mask; i++) { - if (!(tag_mask & (1UL << i))) - continue; - tag = tag_offset + i; - prefetch(tags->static_rqs[tag]); - tag_mask &= ~(1UL << i); - rq = 
blk_mq_rq_ctx_init(data, tags, tag); - rq_list_add(data->cached_rq, rq); - nr++; - } if (!(data->rq_flags & RQF_SCHED_TAGS)) blk_mq_add_active_requests(data->hctx, nr); /* caller already holds a reference, add for remainder */ percpu_ref_get_many(&data->q->q_usage_counter, nr - 1); data->nr_tags -= nr; - return rq_list_pop(data->cached_rq); + return rq_list_pop(data->cached_rqs); } static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) @@ -448,6 +512,10 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; +retry: + data->ctx = blk_mq_get_ctx(q); + data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx); + if (q->elevator) { /* * All requests use scheduler tags when an I/O scheduler is @@ -469,13 +537,9 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) if (ops->limit_depth) ops->limit_depth(data->cmd_flags, data); } - } - -retry: - data->ctx = blk_mq_get_ctx(q); - data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); - if (!(data->rq_flags & RQF_SCHED_TAGS)) + } else { blk_mq_tag_busy(data->hctx); + } if (data->flags & BLK_MQ_REQ_RESERVED) data->rq_flags |= RQF_RESV; @@ -526,9 +590,13 @@ static struct request *blk_mq_rq_cache_fill(struct request_queue *q, struct blk_mq_alloc_data data = { .q = q, .flags = flags, + .shallow_depth = 0, .cmd_flags = opf, + .rq_flags = 0, .nr_tags = plug->nr_ios, - .cached_rq = &plug->cached_rq, + .cached_rqs = &plug->cached_rqs, + .ctx = NULL, + .hctx = NULL }; struct request *rq; @@ -553,14 +621,14 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q, if (!plug) return NULL; - if (rq_list_empty(plug->cached_rq)) { + if (rq_list_empty(&plug->cached_rqs)) { if (plug->nr_ios == 1) return NULL; rq = blk_mq_rq_cache_fill(q, plug, opf, flags); if (!rq) return NULL; } else { - rq = rq_list_peek(&plug->cached_rq); + rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; @@ -569,8 +637,8 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q, if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) return NULL; - plug->cached_rq = rq_list_next(rq); - blk_mq_rq_time_init(rq, 0); + rq_list_pop(&plug->cached_rqs); + blk_mq_rq_time_init(rq, blk_time_get_ns()); } rq->cmd_flags = opf; @@ -588,8 +656,13 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, struct blk_mq_alloc_data data = { .q = q, .flags = flags, + .shallow_depth = 0, .cmd_flags = opf, + .rq_flags = 0, .nr_tags = 1, + .cached_rqs = NULL, + .ctx = NULL, + .hctx = NULL }; int ret; @@ -602,6 +675,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, goto out_queue_exit; } rq->__data_len = 0; + rq->phys_gap_bit = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; @@ -617,8 +691,13 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, struct blk_mq_alloc_data data = { .q = q, .flags = flags, + .shallow_depth = 0, .cmd_flags = opf, + .rq_flags = 0, .nr_tags = 1, + .cached_rqs = NULL, + .ctx = NULL, + .hctx = NULL }; u64 alloc_time_ns = 0; struct request *rq; @@ -652,7 +731,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, * If not tell the caller that it should skip this queue. 
*/ ret = -EXDEV; - data.hctx = xa_load(&q->hctx_table, hctx_idx); + data.hctx = q->queue_hw_ctx[hctx_idx]; if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit; cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); @@ -677,6 +756,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag); blk_mq_rq_time_init(rq, alloc_time_ns); rq->__data_len = 0; + rq->phys_gap_bit = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; @@ -746,7 +826,7 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug) { struct request *rq; - while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) + while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL) blk_mq_free_request(rq); } @@ -766,7 +846,7 @@ EXPORT_SYMBOL(blk_dump_rq_flags); static void blk_account_io_completion(struct request *req, unsigned int bytes) { - if (req->part && blk_do_io_stat(req)) { + if (req->rq_flags & RQF_IO_STAT) { const int sgrp = op_stat_group(req_op(req)); part_stat_lock(); @@ -786,7 +866,7 @@ static void blk_print_req_error(struct request *req, blk_status_t status) blk_op_str(req_op(req)), (__force u32)(req->cmd_flags & ~REQ_OP_MASK), req->nr_phys_segments, - IOPRIO_PRIO_CLASS(req->ioprio)); + IOPRIO_PRIO_CLASS(req_get_ioprio(req))); } /* @@ -804,10 +884,8 @@ static void blk_complete_request(struct request *req) if (!bio) return; -#ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ) - req->q->integrity.profile->complete_fn(req, total_bytes); -#endif + blk_integrity_complete(req, total_bytes); /* * Upper layers may call blk_crypto_evict_key() anytime after the last @@ -823,7 +901,8 @@ static void blk_complete_request(struct request *req) /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); - blk_zone_update_request_bio(req, bio); + if (blk_req_bio_is_zone_append(req, bio)) + blk_zone_append_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); @@ -875,11 +954,9 @@ bool blk_update_request(struct request *req, blk_status_t error, if (!req->bio) return false; -#ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && error == BLK_STS_OK) - req->q->integrity.profile->complete_fn(req, nr_bytes); -#endif + blk_integrity_complete(req, nr_bytes); /* * Upper layers may call blk_crypto_evict_key() anytime after the last @@ -924,7 +1001,8 @@ bool blk_update_request(struct request *req, blk_status_t error, /* Don't actually finish bio if it's part of flush sequence */ if (!bio->bi_iter.bi_size) { - blk_zone_update_request_bio(req, bio); + if (blk_req_bio_is_zone_append(req, bio)) + blk_zone_append_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); } @@ -988,8 +1066,7 @@ static inline void blk_account_io_done(struct request *req, u64 now) * normal IO on queueing nor completion. Accounting the * containing request is enough. */ - if (blk_do_io_stat(req) && req->part && - !(req->rq_flags & RQF_FLUSH_SEQ)) { + if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) { const int sgrp = op_stat_group(req_op(req)); part_stat_lock(); @@ -1002,28 +1079,63 @@ static inline void blk_account_io_done(struct request *req, u64 now) } } +static inline bool blk_rq_passthrough_stats(struct request *req) +{ + struct bio *bio = req->bio; + + if (!blk_queue_passthrough_stat(req->q)) + return false; + + /* Requests without a bio do not transfer data. 
*/ + if (!bio) + return false; + + /* + * Stats are accumulated in the bdev, so must have one attached to a + * bio to track stats. Most drivers do not set the bdev for passthrough + * requests, but nvme is one that will set it. + */ + if (!bio->bi_bdev) + return false; + + /* + * We don't know what a passthrough command does, but we know the + * payload size and data direction. Ensuring the size is aligned to the + * block size filters out most commands with payloads that don't + * represent sector access. + */ + if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1)) + return false; + return true; +} + static inline void blk_account_io_start(struct request *req) { trace_block_io_start(req); - if (blk_do_io_stat(req)) { - /* - * All non-passthrough requests are created from a bio with one - * exception: when a flush command that is part of a flush sequence - * generated by the state machine in blk-flush.c is cloned onto the - * lower device by dm-multipath we can get here without a bio. - */ - if (req->bio) - req->part = req->bio->bi_bdev; - else - req->part = req->q->disk->part0; + if (!blk_queue_io_stat(req->q)) + return; + if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req)) + return; - part_stat_lock(); - update_io_ticks(req->part, jiffies, false); - part_stat_local_inc(req->part, - in_flight[op_is_write(req_op(req))]); - part_stat_unlock(); - } + req->rq_flags |= RQF_IO_STAT; + req->start_time_ns = blk_time_get_ns(); + + /* + * All non-passthrough requests are created from a bio with one + * exception: when a flush command that is part of a flush sequence + * generated by the state machine in blk-flush.c is cloned onto the + * lower device by dm-multipath we can get here without a bio. + */ + if (req->bio) + req->part = req->bio->bi_bdev; + else + req->part = req->q->disk->part0; + + part_stat_lock(); + update_io_ticks(req->part, jiffies, false); + part_stat_local_inc(req->part, in_flight[op_is_write(req_op(req))]); + part_stat_unlock(); } static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) @@ -1132,7 +1244,7 @@ static void blk_complete_reqs(struct llist_head *list) rq->q->mq_ops->complete(rq); } -static __latent_entropy void blk_done_softirq(struct softirq_action *h) +static __latent_entropy void blk_done_softirq(void) { blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); } @@ -1264,10 +1376,9 @@ void blk_mq_start_request(struct request *rq) WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); rq->mq_hctx->tags->rqs[rq->tag] = rq; -#ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) - q->integrity.profile->prepare_fn(rq); -#endif + blk_integrity_prepare(rq); + if (rq->bio && rq->bio->bi_opf & REQ_POLLED) WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num); } @@ -1307,8 +1418,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) */ if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS)) plug->has_elevator = true; - rq->rq_next = NULL; - rq_list_add(&plug->mq_list, rq); + rq_list_add_tail(&plug->mq_list, rq); plug->rq_count++; } @@ -1460,19 +1570,17 @@ static void blk_mq_requeue_work(struct work_struct *work) while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); + list_del_init(&rq->queuelist); /* - * If RQF_DONTPREP ist set, the request has been started by the + * If RQF_DONTPREP is set, the request has been started by the * driver already and might have driver-specific data allocated * already. 
Insert it into the hctx dispatch list to avoid * block layer merges for the request. */ - if (rq->rq_flags & RQF_DONTPREP) { - list_del_init(&rq->queuelist); + if (rq->rq_flags & RQF_DONTPREP) blk_mq_request_bypass_insert(rq, 0); - } else { - list_del_init(&rq->queuelist); + else blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD); - } } while (!list_empty(&flush_list)) { @@ -1705,7 +1813,6 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); } -EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); struct dispatch_rq_data { struct blk_mq_hw_ctx *hctx; @@ -1997,7 +2104,7 @@ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued, * Returns true if we did some work AND can potentially do more. */ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, - unsigned int nr_budgets) + bool get_budget) { enum prep_dispatch prep; struct request_queue *q = hctx->queue; @@ -2019,7 +2126,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, rq = list_first_entry(list, struct request, queuelist); WARN_ON_ONCE(hctx != rq->mq_hctx); - prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets); + prep = blk_mq_prep_dispatch_rq(rq, get_budget); if (prep != PREP_DISPATCH_OK) break; @@ -2028,12 +2135,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, bd.rq = rq; bd.last = list_empty(list); - /* - * once the request is queued to lld, no need to cover the - * budget any more - */ - if (nr_budgets) - nr_budgets--; ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK: @@ -2067,7 +2168,11 @@ out: ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) || blk_mq_is_shared_tags(hctx->flags)); - if (nr_budgets) + /* + * If the caller allocated budgets, free the budgets of the + * requests that have not yet been passed to the block driver. + */ + if (!get_budget) blk_mq_release_budgets(q, list); spin_lock(&hctx->lock); @@ -2207,6 +2312,24 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); +static inline bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx) +{ + bool need_run; + + /* + * When queue is quiesced, we may be switching io scheduler, or + * updating nr_hw_queues, or other things, and we can't run queue + * any more, even blk_mq_hctx_has_pending() can't be called safely. + * + * And queue will be rerun in blk_mq_unquiesce_queue() if it is + * quiesced. + */ + __blk_mq_run_dispatch_ops(hctx->queue, false, + need_run = !blk_queue_quiesced(hctx->queue) && + blk_mq_hctx_has_pending(hctx)); + return need_run; +} + /** * blk_mq_run_hw_queue - Start to run a hardware queue. * @hctx: Pointer to the hardware queue to run. @@ -2227,20 +2350,23 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING); - /* - * When queue is quiesced, we may be switching io scheduler, or - * updating nr_hw_queues, or other things, and we can't run queue - * any more, even __blk_mq_hctx_has_pending() can't be called safely. - * - * And queue will be rerun in blk_mq_unquiesce_queue() if it is - * quiesced. 
- */ - __blk_mq_run_dispatch_ops(hctx->queue, false, - need_run = !blk_queue_quiesced(hctx->queue) && - blk_mq_hctx_has_pending(hctx)); + need_run = blk_mq_hw_queue_need_run(hctx); + if (!need_run) { + unsigned long flags; - if (!need_run) - return; + /* + * Synchronize with blk_mq_unquiesce_queue(), because we check + * if hw queue is quiesced locklessly above, we need the use + * ->queue_lock to make sure we see the up-to-date status to + * not miss rerunning the hw queue. + */ + spin_lock_irqsave(&hctx->queue->queue_lock, flags); + need_run = blk_mq_hw_queue_need_run(hctx); + spin_unlock_irqrestore(&hctx->queue->queue_lock, flags); + + if (!need_run) + return; + } if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { blk_mq_delay_run_hw_queue(hctx, 0); @@ -2397,6 +2523,12 @@ void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) return; clear_bit(BLK_MQ_S_STOPPED, &hctx->state); + /* + * Pairs with the smp_mb() in blk_mq_hctx_stopped() to order the + * clearing of BLK_MQ_S_STOPPED above and the checking of dispatch + * list in the subsequent routine. + */ + smp_mb__after_atomic(); blk_mq_run_hw_queue(hctx, async); } EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); @@ -2548,9 +2680,15 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, if (bio->bi_opf & REQ_RAHEAD) rq->cmd_flags |= REQ_FAILFAST_MASK; + rq->bio = rq->biotail = bio; rq->__sector = bio->bi_iter.bi_sector; - rq->write_hint = bio->bi_write_hint; - blk_rq_bio_prep(rq, bio, nr_segs); + rq->__data_len = bio->bi_iter.bi_size; + rq->phys_gap_bit = bio->bi_bvec_gap_bit; + + rq->nr_phys_segments = nr_segs; + if (bio_integrity(bio)) + rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, + bio); /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. 
*/ err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO); @@ -2624,6 +2762,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { blk_mq_insert_request(rq, 0); + blk_mq_run_hw_queue(hctx, false); return; } @@ -2654,6 +2793,7 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { blk_mq_insert_request(rq, 0); + blk_mq_run_hw_queue(hctx, false); return BLK_STS_OK; } @@ -2662,15 +2802,15 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) return __blk_mq_issue_directly(hctx, rq, last); } -static void blk_mq_plug_issue_direct(struct blk_plug *plug) +static void blk_mq_issue_direct(struct rq_list *rqs) { struct blk_mq_hw_ctx *hctx = NULL; struct request *rq; int queued = 0; blk_status_t ret = BLK_STS_OK; - while ((rq = rq_list_pop(&plug->mq_list))) { - bool last = rq_list_empty(plug->mq_list); + while ((rq = rq_list_pop(rqs))) { + bool last = rq_list_empty(rqs); if (hctx != rq->mq_hctx) { if (hctx) { @@ -2701,26 +2841,74 @@ out: blk_mq_commit_rqs(hctx, queued, false); } -static void __blk_mq_flush_plug_list(struct request_queue *q, - struct blk_plug *plug) +static void __blk_mq_flush_list(struct request_queue *q, struct rq_list *rqs) { if (blk_queue_quiesced(q)) return; - q->mq_ops->queue_rqs(&plug->mq_list); + q->mq_ops->queue_rqs(rqs); +} + +static unsigned blk_mq_extract_queue_requests(struct rq_list *rqs, + struct rq_list *queue_rqs) +{ + struct request *rq = rq_list_pop(rqs); + struct request_queue *this_q = rq->q; + struct request **prev = &rqs->head; + struct rq_list matched_rqs = {}; + struct request *last = NULL; + unsigned depth = 1; + + rq_list_add_tail(&matched_rqs, rq); + while ((rq = *prev)) { + if (rq->q == this_q) { + /* move rq from rqs to matched_rqs */ + *prev = rq->rq_next; + rq_list_add_tail(&matched_rqs, rq); + depth++; + } else { + /* leave rq in rqs */ + prev = &rq->rq_next; + last = rq; + } + } + + rqs->tail = last; + *queue_rqs = matched_rqs; + return depth; +} + +static void blk_mq_dispatch_queue_requests(struct rq_list *rqs, unsigned depth) +{ + struct request_queue *q = rq_list_peek(rqs)->q; + + trace_block_unplug(q, depth, true); + + /* + * Peek first request and see if we have a ->queue_rqs() hook. + * If we do, we can dispatch the whole list in one go. + * We already know at this point that all requests belong to the + * same queue, caller must ensure that's the case. 
+ */ + if (q->mq_ops->queue_rqs) { + blk_mq_run_dispatch_ops(q, __blk_mq_flush_list(q, rqs)); + if (rq_list_empty(rqs)) + return; + } + + blk_mq_run_dispatch_ops(q, blk_mq_issue_direct(rqs)); } -static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) +static void blk_mq_dispatch_list(struct rq_list *rqs, bool from_sched) { struct blk_mq_hw_ctx *this_hctx = NULL; struct blk_mq_ctx *this_ctx = NULL; - struct request *requeue_list = NULL; - struct request **requeue_lastp = &requeue_list; + struct rq_list requeue_list = {}; unsigned int depth = 0; bool is_passthrough = false; LIST_HEAD(list); do { - struct request *rq = rq_list_pop(&plug->mq_list); + struct request *rq = rq_list_pop(rqs); if (!this_hctx) { this_hctx = rq->mq_hctx; @@ -2728,14 +2916,14 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) is_passthrough = blk_rq_is_passthrough(rq); } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx || is_passthrough != blk_rq_is_passthrough(rq)) { - rq_list_add_tail(&requeue_lastp, rq); + rq_list_add_tail(&requeue_list, rq); continue; } - list_add(&rq->queuelist, &list); + list_add_tail(&rq->queuelist, &list); depth++; - } while (!rq_list_empty(plug->mq_list)); + } while (!rq_list_empty(rqs)); - plug->mq_list = requeue_list; + *rqs = requeue_list; trace_block_unplug(this_hctx->queue, depth, !from_sched); percpu_ref_get(&this_hctx->queue->q_usage_counter); @@ -2755,9 +2943,22 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) percpu_ref_put(&this_hctx->queue->q_usage_counter); } +static void blk_mq_dispatch_multiple_queue_requests(struct rq_list *rqs) +{ + do { + struct rq_list queue_rqs; + unsigned depth; + + depth = blk_mq_extract_queue_requests(rqs, &queue_rqs); + blk_mq_dispatch_queue_requests(&queue_rqs, depth); + while (!rq_list_empty(&queue_rqs)) + blk_mq_dispatch_list(&queue_rqs, false); + } while (!rq_list_empty(rqs)); +} + void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { - struct request *rq; + unsigned int depth; /* * We may have been called recursively midway through handling @@ -2768,36 +2969,23 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) */ if (plug->rq_count == 0) return; + depth = plug->rq_count; plug->rq_count = 0; - if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { - struct request_queue *q; - - rq = rq_list_peek(&plug->mq_list); - q = rq->q; - - /* - * Peek first request and see if we have a ->queue_rqs() hook. - * If we do, we can dispatch the whole plug list in one go. We - * already know at this point that all requests belong to the - * same queue, caller must ensure that's the case. 
- */ - if (q->mq_ops->queue_rqs) { - blk_mq_run_dispatch_ops(q, - __blk_mq_flush_plug_list(q, plug)); - if (rq_list_empty(plug->mq_list)) - return; + if (!plug->has_elevator && !from_schedule) { + if (plug->multiple_queues) { + blk_mq_dispatch_multiple_queue_requests(&plug->mq_list); + return; } - blk_mq_run_dispatch_ops(q, - blk_mq_plug_issue_direct(plug)); - if (rq_list_empty(plug->mq_list)) + blk_mq_dispatch_queue_requests(&plug->mq_list, depth); + if (rq_list_empty(&plug->mq_list)) return; } do { - blk_mq_dispatch_plug_list(plug, from_schedule); - } while (!rq_list_empty(plug->mq_list)); + blk_mq_dispatch_list(&plug->mq_list, from_schedule); + } while (!rq_list_empty(&plug->mq_list)); } static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, @@ -2847,13 +3035,18 @@ static bool blk_mq_attempt_bio_merge(struct request_queue *q, static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_plug *plug, - struct bio *bio, - unsigned int nsegs) + struct bio *bio) { struct blk_mq_alloc_data data = { .q = q, - .nr_tags = 1, + .flags = 0, + .shallow_depth = 0, .cmd_flags = bio->bi_opf, + .rq_flags = 0, + .nr_tags = 1, + .cached_rqs = NULL, + .ctx = NULL, + .hctx = NULL }; struct request *rq; @@ -2862,16 +3055,13 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; - data.cached_rq = &plug->cached_rq; + data.cached_rqs = &plug->cached_rqs; } rq = __blk_mq_alloc_requests(&data); - if (rq) - return rq; - rq_qos_cleanup(q, bio); - if (bio->bi_opf & REQ_NOWAIT) - bio_wouldblock_error(bio); - return NULL; + if (unlikely(!rq)) + rq_qos_cleanup(q, bio); + return rq; } /* @@ -2885,7 +3075,7 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, if (!plug) return NULL; - rq = rq_list_peek(&plug->cached_rq); + rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; if (type != rq->mq_hctx->type && @@ -2899,21 +3089,32 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, struct bio *bio) { - WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq); + if (rq_list_pop(&plug->cached_rqs) != rq) + WARN_ON_ONCE(1); /* * If any qos ->throttle() end up blocking, we will have flushed the * plug and hence killed the cached_rq list as well. Pop this entry * before we throttle. */ - plug->cached_rq = rq_list_next(rq); rq_qos_throttle(rq->q, bio); - blk_mq_rq_time_init(rq, 0); + blk_mq_rq_time_init(rq, blk_time_get_ns()); rq->cmd_flags = bio->bi_opf; INIT_LIST_HEAD(&rq->queuelist); } +static bool bio_unaligned(const struct bio *bio, struct request_queue *q) +{ + unsigned int bs_mask = queue_logical_block_size(q) - 1; + + /* .bi_sector of any zero sized bio need to be initialized */ + if ((bio->bi_iter.bi_size & bs_mask) || + ((bio->bi_iter.bi_sector << SECTOR_SHIFT) & bs_mask)) + return true; + return false; +} + /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. 
@@ -2933,7 +3134,7 @@ void blk_mq_submit_bio(struct bio *bio) struct blk_plug *plug = current->plug; const int is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; - unsigned int nr_segs = 1; + unsigned int nr_segs; struct request *rq; blk_status_t ret; @@ -2955,8 +3156,6 @@ void blk_mq_submit_bio(struct bio *bio) goto new_request; } - bio = blk_queue_bounce(bio, q); - /* * The cached request already holds a q_usage_counter reference and we * don't have to acquire a new one if we use it. @@ -2966,27 +3165,48 @@ void blk_mq_submit_bio(struct bio *bio) return; } - if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { - bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); - if (!bio) - goto queue_exit; + /* + * Device reconfiguration may change logical block size or reduce the + * number of poll queues, so the checks for alignment and poll support + * have to be done with queue usage counter held. + */ + if (unlikely(bio_unaligned(bio, q))) { + bio_io_error(bio); + goto queue_exit; } + + if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) { + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + goto queue_exit; + } + + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); + if (!bio) + goto queue_exit; + if (!bio_integrity_prep(bio)) goto queue_exit; + blk_mq_bio_issue_init(q, bio); if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; - if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs)) - goto queue_exit; + if (bio_needs_zone_write_plugging(bio)) { + if (blk_zone_plug_bio(bio, nr_segs)) + goto queue_exit; + } new_request: - if (!rq) { - rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); - if (unlikely(!rq)) - goto queue_exit; - } else { + if (rq) { blk_mq_use_cached_rq(rq, plug, bio); + } else { + rq = blk_mq_get_new_requests(q, plug, bio); + if (unlikely(!rq)) { + if (bio->bi_opf & REQ_NOWAIT) + bio_wouldblock_error(bio); + goto queue_exit; + } } trace_block_getrq(bio); @@ -3041,7 +3261,7 @@ queue_exit: blk_status_t blk_insert_cloned_request(struct request *rq) { struct request_queue *q = rq->q; - unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + unsigned int max_sectors = blk_queue_get_max_sectors(rq); unsigned int max_segments = blk_rq_get_max_segments(rq); blk_status_t ret; @@ -3138,19 +3358,21 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data) { - struct bio *bio, *bio_src; + struct bio *bio_src; if (!bs) bs = &fs_bio_set; __rq_for_each_bio(bio_src, rq_src) { - bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask, - bs); + struct bio *bio = bio_alloc_clone(rq->q->disk->part0, bio_src, + gfp_mask, bs); if (!bio) goto free_and_out; - if (bio_ctr && bio_ctr(bio, bio_src, data)) + if (bio_ctr && bio_ctr(bio, bio_src, data)) { + bio_put(bio); goto free_and_out; + } if (rq->bio) { rq->biotail->bi_next = bio; @@ -3158,7 +3380,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, } else { rq->bio = rq->biotail = bio; } - bio = NULL; } /* Copy attributes of the original request to the clone request. 
*/ @@ -3169,8 +3390,8 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, rq->special_vec = rq_src->special_vec; } rq->nr_phys_segments = rq_src->nr_phys_segments; - rq->ioprio = rq_src->ioprio; - rq->write_hint = rq_src->write_hint; + rq->nr_integrity_segments = rq_src->nr_integrity_segments; + rq->phys_gap_bit = rq_src->phys_gap_bit; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; @@ -3178,8 +3399,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, return 0; free_and_out: - if (bio) - bio_put(bio); blk_rq_unprep_clone(rq); return -ENOMEM; @@ -3218,7 +3437,6 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, struct blk_mq_tags *tags) { struct page *page; - unsigned long flags; /* * There is no need to clear mapping if driver tags is not initialized @@ -3242,22 +3460,12 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, } } } - - /* - * Wait until all pending iteration is done. - * - * Request reference is cleared and it is guaranteed to be observed - * after the ->lock is released. - */ - spin_lock_irqsave(&drv_tags->lock, flags); - spin_unlock_irqrestore(&drv_tags->lock, flags); } void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { struct blk_mq_tags *drv_tags; - struct page *page; if (list_empty(&tags->page_list)) return; @@ -3281,27 +3489,20 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } blk_mq_clear_rq_mapping(drv_tags, tags); - - while (!list_empty(&tags->page_list)) { - page = list_first_entry(&tags->page_list, struct page, lru); - list_del_init(&page->lru); - /* - * Remove kmemleak object previously allocated in - * blk_mq_alloc_rqs(). - */ - kmemleak_free(page_address(page)); - __free_pages(page, page->private); - } + /* + * Free request pages in SRCU callback, which is called from + * blk_mq_free_tags(). 
+ */ } -void blk_mq_free_rq_map(struct blk_mq_tags *tags) +void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) { kfree(tags->rqs); tags->rqs = NULL; kfree(tags->static_rqs); tags->static_rqs = NULL; - blk_mq_free_tags(tags); + blk_mq_free_tags(set, tags); } static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set, @@ -3342,8 +3543,7 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, if (node == NUMA_NO_NODE) node = set->numa_node; - tags = blk_mq_init_tags(nr_tags, reserved_tags, node, - BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); + tags = blk_mq_init_tags(nr_tags, reserved_tags, set->flags, node); if (!tags) return NULL; @@ -3364,7 +3564,7 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, err_free_rqs: kfree(tags->rqs); err_free_tags: - blk_mq_free_tags(tags); + blk_mq_free_tags(set, tags); return NULL; } @@ -3394,8 +3594,6 @@ static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, if (node == NUMA_NO_NODE) node = set->numa_node; - INIT_LIST_HEAD(&tags->page_list); - /* * rq_size is the size of the request plus driver payload, rounded * to the cacheline size @@ -3482,8 +3680,12 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) struct rq_iter_data data = { .hctx = hctx, }; + int srcu_idx; + srcu_idx = srcu_read_lock(&hctx->queue->tag_set->tags_srcu); blk_mq_all_tag_iter(tags, blk_mq_has_request, &data); + srcu_read_unlock(&hctx->queue->tag_set->tags_srcu, srcu_idx); + return data.has_rq; } @@ -3517,6 +3719,7 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) { struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online); + int ret = 0; if (blk_mq_hctx_has_online_cpu(hctx, cpu)) return 0; @@ -3537,12 +3740,24 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) * frozen and there are no requests. */ if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) { - while (blk_mq_hctx_has_requests(hctx)) + while (blk_mq_hctx_has_requests(hctx)) { + /* + * The wakeup capable IRQ handler of block device is + * not called during suspend. Skip the loop by checking + * pm_wakeup_pending to prevent the deadlock and improve + * suspend latency. 
+ */ + if (pm_wakeup_pending()) { + clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); + ret = -EBUSY; + break; + } msleep(5); + } percpu_ref_put(&hctx->queue->q_usage_counter); } - return 0; + return ret; } /* @@ -3608,13 +3823,91 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) return 0; } -static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) +static void __blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { - if (!(hctx->flags & BLK_MQ_F_STACKING)) + lockdep_assert_held(&blk_mq_cpuhp_lock); + + if (!(hctx->flags & BLK_MQ_F_STACKING) && + !hlist_unhashed(&hctx->cpuhp_online)) { cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, &hctx->cpuhp_online); - cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, - &hctx->cpuhp_dead); + INIT_HLIST_NODE(&hctx->cpuhp_online); + } + + if (!hlist_unhashed(&hctx->cpuhp_dead)) { + cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, + &hctx->cpuhp_dead); + INIT_HLIST_NODE(&hctx->cpuhp_dead); + } +} + +static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) +{ + mutex_lock(&blk_mq_cpuhp_lock); + __blk_mq_remove_cpuhp(hctx); + mutex_unlock(&blk_mq_cpuhp_lock); +} + +static void __blk_mq_add_cpuhp(struct blk_mq_hw_ctx *hctx) +{ + lockdep_assert_held(&blk_mq_cpuhp_lock); + + if (!(hctx->flags & BLK_MQ_F_STACKING) && + hlist_unhashed(&hctx->cpuhp_online)) + cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, + &hctx->cpuhp_online); + + if (hlist_unhashed(&hctx->cpuhp_dead)) + cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, + &hctx->cpuhp_dead); +} + +static void __blk_mq_remove_cpuhp_list(struct list_head *head) +{ + struct blk_mq_hw_ctx *hctx; + + lockdep_assert_held(&blk_mq_cpuhp_lock); + + list_for_each_entry(hctx, head, hctx_list) + __blk_mq_remove_cpuhp(hctx); +} + +/* + * Unregister cpuhp callbacks from exited hw queues + * + * Safe to call if this `request_queue` is live + */ +static void blk_mq_remove_hw_queues_cpuhp(struct request_queue *q) +{ + LIST_HEAD(hctx_list); + + spin_lock(&q->unused_hctx_lock); + list_splice_init(&q->unused_hctx_list, &hctx_list); + spin_unlock(&q->unused_hctx_lock); + + mutex_lock(&blk_mq_cpuhp_lock); + __blk_mq_remove_cpuhp_list(&hctx_list); + mutex_unlock(&blk_mq_cpuhp_lock); + + spin_lock(&q->unused_hctx_lock); + list_splice(&hctx_list, &q->unused_hctx_list); + spin_unlock(&q->unused_hctx_lock); +} + +/* + * Register cpuhp callbacks from all hw queues + * + * Safe to call if this `request_queue` is live + */ +static void blk_mq_add_hw_queues_cpuhp(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + unsigned long i; + + mutex_lock(&blk_mq_cpuhp_lock); + queue_for_each_hw_ctx(q, hctx, i) + __blk_mq_add_cpuhp(hctx); + mutex_unlock(&blk_mq_cpuhp_lock); } /* @@ -3625,7 +3918,6 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, unsigned int queue_depth, struct request *flush_rq) { int i; - unsigned long flags; /* The hw queue may not be mapped yet */ if (!tags) @@ -3635,15 +3927,14 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, for (i = 0; i < queue_depth; i++) cmpxchg(&tags->rqs[i], flush_rq, NULL); +} - /* - * Wait until all pending iteration is done. - * - * Request reference is cleared and it is guaranteed to be observed - * after the ->lock is released. 
- */ - spin_lock_irqsave(&tags->lock, flags); - spin_unlock_irqrestore(&tags->lock, flags); +static void blk_free_flush_queue_callback(struct rcu_head *head) +{ + struct blk_flush_queue *fq = + container_of(head, struct blk_flush_queue, rcu_head); + + blk_free_flush_queue(fq); } /* hctx->ctxs will be freed in queue's release handler */ @@ -3665,9 +3956,9 @@ static void blk_mq_exit_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); - blk_mq_remove_cpuhp(hctx); - - xa_erase(&q->hctx_table, hctx_idx); + call_srcu(&set->tags_srcu, &hctx->fq->rcu_head, + blk_free_flush_queue_callback); + hctx->fq = NULL; spin_lock(&q->unused_hctx_lock); list_add(&hctx->hctx_list, &q->unused_hctx_list); @@ -3683,6 +3974,7 @@ static void blk_mq_exit_hw_queues(struct request_queue *q, queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) break; + blk_mq_remove_cpuhp(hctx); blk_mq_exit_hctx(q, set, hctx, i); } } @@ -3691,36 +3983,33 @@ static int blk_mq_init_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) { - hctx->queue_num = hctx_idx; + gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; - if (!(hctx->flags & BLK_MQ_F_STACKING)) - cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, - &hctx->cpuhp_online); - cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); + hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); + if (!hctx->fq) + goto fail; + + hctx->queue_num = hctx_idx; hctx->tags = set->tags[hctx_idx]; if (set->ops->init_hctx && set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) - goto unregister_cpu_notifier; + goto fail_free_fq; if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, hctx->numa_node)) goto exit_hctx; - if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL)) - goto exit_flush_rq; - return 0; - exit_flush_rq: - if (set->ops->exit_request) - set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); - unregister_cpu_notifier: - blk_mq_remove_cpuhp(hctx); + fail_free_fq: + blk_free_flush_queue(hctx->fq); + hctx->fq = NULL; + fail: return -1; } @@ -3746,6 +4035,8 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); + INIT_HLIST_NODE(&hctx->cpuhp_dead); + INIT_HLIST_NODE(&hctx->cpuhp_online); hctx->queue = q; hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED; @@ -3769,16 +4060,10 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); INIT_LIST_HEAD(&hctx->dispatch_wait.entry); - hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); - if (!hctx->fq) - goto free_bitmap; - blk_mq_hctx_kobj_init(hctx); return hctx; - free_bitmap: - sbitmap_free(&hctx->ctx_map); free_ctxs: kfree(hctx->ctxs); free_cpumask: @@ -3832,7 +4117,7 @@ struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth); if (ret) { - blk_mq_free_rq_map(tags); + blk_mq_free_rq_map(set, tags); return NULL; } @@ -3860,7 +4145,7 @@ void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, { if (tags) { blk_mq_free_rqs(set, tags, hctx_idx); - blk_mq_free_rq_map(tags); + blk_mq_free_rq_map(set, tags); } } @@ -4010,13 +4295,14 @@ static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set, bool shared) { 
struct request_queue *q; + unsigned int memflags; lockdep_assert_held(&set->tag_list_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { - blk_mq_freeze_queue(q); + memflags = blk_mq_freeze_queue(q); queue_set_hctx_shared(q, shared); - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); } } @@ -4105,7 +4391,7 @@ void blk_mq_release(struct request_queue *q) kobject_put(&hctx->kobj); } - xa_destroy(&q->hctx_table); + kfree(q->queue_hw_ctx); /* * release .mq_kobj and sw queue's kobject now because @@ -4121,7 +4407,13 @@ struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, struct request_queue *q; int ret; - q = blk_alloc_queue(lim ? lim : &default_lim, set->numa_node); + if (!lim) + lim = &default_lim; + lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; + if (set->nr_maps > HCTX_TYPE_POLL) + lim->features |= BLK_FEAT_POLL; + + q = blk_alloc_queue(lim, set->numa_node); if (IS_ERR(q)) return q; q->queuedata = queuedata; @@ -4197,6 +4489,15 @@ struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, } EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue); +/* + * Only hctx removed from cpuhp list can be reused + */ +static bool blk_mq_hctx_is_reusable(struct blk_mq_hw_ctx *hctx) +{ + return hlist_unhashed(&hctx->cpuhp_online) && + hlist_unhashed(&hctx->cpuhp_dead); +} + static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( struct blk_mq_tag_set *set, struct request_queue *q, int hctx_idx, int node) @@ -4206,7 +4507,7 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( /* reuse dead hctx first */ spin_lock(&q->unused_hctx_lock); list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) { - if (tmp->numa_node == node) { + if (tmp->numa_node == node && blk_mq_hctx_is_reusable(tmp)) { hctx = tmp; break; } @@ -4231,31 +4532,52 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( return NULL; } -static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, - struct request_queue *q) +static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, + struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; - unsigned long i, j; + int i, j, end; + struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; + + if (q->nr_hw_queues < set->nr_hw_queues) { + struct blk_mq_hw_ctx **new_hctxs; + + new_hctxs = kcalloc_node(set->nr_hw_queues, + sizeof(*new_hctxs), GFP_KERNEL, + set->numa_node); + if (!new_hctxs) + return; + if (hctxs) + memcpy(new_hctxs, hctxs, q->nr_hw_queues * + sizeof(*hctxs)); + rcu_assign_pointer(q->queue_hw_ctx, new_hctxs); + /* + * Make sure reading the old queue_hw_ctx from other + * context concurrently won't trigger uaf. 
+ */ + synchronize_rcu_expedited(); + kfree(hctxs); + hctxs = new_hctxs; + } - /* protect against switching io scheduler */ - mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); - struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i); + struct blk_mq_hw_ctx *old_hctx = hctxs[i]; if (old_hctx) { old_node = old_hctx->numa_node; blk_mq_exit_hctx(q, set, old_hctx, i); } - if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) { + hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, node); + if (!hctxs[i]) { if (!old_hctx) break; pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", node, old_node); - hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node); - WARN_ON_ONCE(!hctx); + hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, + old_node); + WARN_ON_ONCE(!hctxs[i]); } } /* @@ -4264,25 +4586,33 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, */ if (i != set->nr_hw_queues) { j = q->nr_hw_queues; + end = i; } else { j = i; + end = q->nr_hw_queues; q->nr_hw_queues = set->nr_hw_queues; } - xa_for_each_start(&q->hctx_table, j, hctx, j) - blk_mq_exit_hctx(q, set, hctx, j); - mutex_unlock(&q->sysfs_lock); + for (; j < end; j++) { + struct blk_mq_hw_ctx *hctx = hctxs[j]; + + if (hctx) { + blk_mq_exit_hctx(q, set, hctx, j); + hctxs[j] = NULL; + } + } } -static void blk_mq_update_poll_flag(struct request_queue *q) +static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, + struct request_queue *q) { - struct blk_mq_tag_set *set = q->tag_set; + __blk_mq_realloc_hw_ctxs(set, q); - if (set->nr_maps > HCTX_TYPE_POLL && - set->map[HCTX_TYPE_POLL].nr_queues) - blk_queue_flag_set(QUEUE_FLAG_POLL, q); - else - blk_queue_flag_clear(QUEUE_FLAG_POLL, q); + /* unregister cpuhp callbacks for exited hctxs */ + blk_mq_remove_hw_queues_cpuhp(q); + + /* register cpuhp for new initialized hctxs */ + blk_mq_add_hw_queues_cpuhp(q); } int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, @@ -4291,6 +4621,12 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* mark the queue as mq asap */ q->mq_ops = set->ops; + /* + * ->tag_set has to be setup before initialize hctx, which cpuphp + * handler needs it for checking queue mapping + */ + q->tag_set = set; + if (blk_mq_alloc_ctxs(q)) goto err_exit; @@ -4300,8 +4636,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->unused_hctx_list); spin_lock_init(&q->unused_hctx_lock); - xa_init(&q->hctx_table); - blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; @@ -4309,10 +4643,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? 
set->timeout : 30 * HZ); - q->tag_set = set; - q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; - blk_mq_update_poll_flag(q); INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->flush_list); @@ -4322,8 +4653,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->nr_requests = set->queue_depth; blk_mq_init_cpu_queues(q, set->nr_hw_queues); - blk_mq_add_queue_tag_set(set, q); blk_mq_map_swqueue(q); + blk_mq_add_queue_tag_set(set, q); return 0; err_hctxs: @@ -4542,13 +4873,18 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (ret) goto out_free_srcu; } + ret = init_srcu_struct(&set->tags_srcu); + if (ret) + goto out_cleanup_srcu; + + init_rwsem(&set->update_nr_hwq_lock); ret = -ENOMEM; set->tags = kcalloc_node(set->nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) - goto out_cleanup_srcu; + goto out_cleanup_tags_srcu; for (i = 0; i < set->nr_maps; i++) { set->map[i].mq_map = kcalloc_node(nr_cpu_ids, @@ -4577,6 +4913,8 @@ out_free_mq_map: } kfree(set->tags); set->tags = NULL; +out_cleanup_tags_srcu: + cleanup_srcu_struct(&set->tags_srcu); out_cleanup_srcu: if (set->flags & BLK_MQ_F_BLOCKING) cleanup_srcu_struct(set->srcu); @@ -4622,6 +4960,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) kfree(set->tags); set->tags = NULL; + + srcu_barrier(&set->tags_srcu); + cleanup_srcu_struct(&set->tags_srcu); if (set->flags & BLK_MQ_F_BLOCKING) { cleanup_srcu_struct(set->srcu); kfree(set->srcu); @@ -4629,143 +4970,132 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_free_tag_set); -int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) +struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q, + struct elevator_tags *et, + unsigned int nr) { struct blk_mq_tag_set *set = q->tag_set; + struct elevator_tags *old_et = NULL; struct blk_mq_hw_ctx *hctx; - int ret; unsigned long i; - if (!set) - return -EINVAL; - - if (q->nr_requests == nr) - return 0; - - blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); - ret = 0; - queue_for_each_hw_ctx(q, hctx, i) { - if (!hctx->tags) - continue; + if (blk_mq_is_shared_tags(set->flags)) { /* - * If we're using an MQ scheduler, just update the scheduler - * queue depth. This is similar to what the old code would do. + * Shared tags, for sched tags, we allocate max initially hence + * tags can't grow, see blk_mq_alloc_sched_tags(). */ - if (hctx->sched_tags) { - ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, - nr, true); - } else { - ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, - false); + if (q->elevator) + blk_mq_tag_update_sched_shared_tags(q, nr); + else + blk_mq_tag_resize_shared_tags(set, nr); + } else if (!q->elevator) { + /* + * Non-shared hardware tags, nr is already checked from + * queue_requests_store() and tags can't grow. + */ + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->tags) + continue; + sbitmap_queue_resize(&hctx->tags->bitmap_tags, + nr - hctx->tags->nr_reserved_tags); } - if (ret) - break; - if (q->elevator && q->elevator->type->ops.depth_updated) - q->elevator->type->ops.depth_updated(hctx); - } - if (!ret) { - q->nr_requests = nr; - if (blk_mq_is_shared_tags(set->flags)) { - if (q->elevator) - blk_mq_tag_update_sched_shared_tags(q); - else - blk_mq_tag_resize_shared_tags(set, nr); + } else if (nr <= q->elevator->et->nr_requests) { + /* Non-shared sched tags, and tags don't grow. 
*/ + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->sched_tags) + continue; + sbitmap_queue_resize(&hctx->sched_tags->bitmap_tags, + nr - hctx->sched_tags->nr_reserved_tags); } + } else { + /* Non-shared sched tags, and tags grow */ + queue_for_each_hw_ctx(q, hctx, i) + hctx->sched_tags = et->tags[i]; + old_et = q->elevator->et; + q->elevator->et = et; } - blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); + q->nr_requests = nr; + if (q->elevator && q->elevator->type->ops.depth_updated) + q->elevator->type->ops.depth_updated(q); - return ret; + blk_mq_unquiesce_queue(q); + return old_et; } /* - * request_queue and elevator_type pair. - * It is just used by __blk_mq_update_nr_hw_queues to cache - * the elevator_type associated with a request_queue. + * Switch back to the elevator type stored in the xarray. */ -struct blk_mq_qe_pair { - struct list_head node; - struct request_queue *q; - struct elevator_type *type; -}; - -/* - * Cache the elevator_type in qe pair list and switch the - * io scheduler to 'none' - */ -static bool blk_mq_elv_switch_none(struct list_head *head, - struct request_queue *q) +static void blk_mq_elv_switch_back(struct request_queue *q, + struct xarray *elv_tbl) { - struct blk_mq_qe_pair *qe; - - qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); - if (!qe) - return false; - - /* q->elevator needs protection from ->sysfs_lock */ - mutex_lock(&q->sysfs_lock); + struct elv_change_ctx *ctx = xa_load(elv_tbl, q->id); - /* the check has to be done with holding sysfs_lock */ - if (!q->elevator) { - kfree(qe); - goto unlock; - } + if (WARN_ON_ONCE(!ctx)) + return; - INIT_LIST_HEAD(&qe->node); - qe->q = q; - qe->type = q->elevator->type; - /* keep a reference to the elevator module as we'll switch back */ - __elevator_get(qe->type); - list_add(&qe->node, head); - elevator_disable(q); -unlock: - mutex_unlock(&q->sysfs_lock); + /* The elv_update_nr_hw_queues unfreezes the queue. */ + elv_update_nr_hw_queues(q, ctx); - return true; + /* Drop the reference acquired in blk_mq_elv_switch_none. */ + if (ctx->type) + elevator_put(ctx->type); } -static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head, - struct request_queue *q) +/* + * Stores elevator name and type in ctx and set current elevator to none. + */ +static int blk_mq_elv_switch_none(struct request_queue *q, + struct xarray *elv_tbl) { - struct blk_mq_qe_pair *qe; + struct elv_change_ctx *ctx; - list_for_each_entry(qe, head, node) - if (qe->q == q) - return qe; + lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock); - return NULL; -} + /* + * Accessing q->elevator without holding q->elevator_lock is safe here + * because we're called from nr_hw_queue update which is protected by + * set->update_nr_hwq_lock in the writer context. So, scheduler update/ + * switch code (which acquires the same lock in the reader context) + * can't run concurrently. + */ + if (q->elevator) { + ctx = xa_load(elv_tbl, q->id); + if (WARN_ON_ONCE(!ctx)) + return -ENOENT; -static void blk_mq_elv_switch_back(struct list_head *head, - struct request_queue *q) -{ - struct blk_mq_qe_pair *qe; - struct elevator_type *t; + ctx->name = q->elevator->type->elevator_name; - qe = blk_lookup_qe_pair(head, q); - if (!qe) - return; - t = qe->type; - list_del(&qe->node); - kfree(qe); + /* + * Before we switch elevator to 'none', take a reference to + * the elevator module so that while nr_hw_queue update is + * running, no one can remove elevator module. 
We'd put the + * reference to elevator module later when we switch back + * elevator. + */ + __elevator_get(q->elevator->type); - mutex_lock(&q->sysfs_lock); - elevator_switch(q, t); - /* drop the reference acquired in blk_mq_elv_switch_none */ - elevator_put(t); - mutex_unlock(&q->sysfs_lock); + /* + * Store elevator type so that we can release the reference + * taken above later. + */ + ctx->type = q->elevator->type; + elevator_set_none(q); + } + return 0; } static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { struct request_queue *q; - LIST_HEAD(head); int prev_nr_hw_queues = set->nr_hw_queues; + unsigned int memflags; int i; + struct xarray elv_tbl; + bool queues_frozen = false; lockdep_assert_held(&set->tag_list_lock); @@ -4776,30 +5106,40 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues) return; - list_for_each_entry(q, &set->tag_list, tag_set_list) - blk_mq_freeze_queue(q); + memflags = memalloc_noio_save(); + + xa_init(&elv_tbl); + if (blk_mq_alloc_sched_ctx_batch(&elv_tbl, set) < 0) + goto out_free_ctx; + + if (blk_mq_alloc_sched_res_batch(&elv_tbl, set, nr_hw_queues) < 0) + goto out_free_ctx; + + list_for_each_entry(q, &set->tag_list, tag_set_list) { + blk_mq_debugfs_unregister_hctxs(q); + blk_mq_sysfs_unregister_hctxs(q); + } + /* * Switch IO scheduler to 'none', cleaning up the data associated * with the previous scheduler. We will switch back once we are done * updating the new sw to hw queue mappings. */ list_for_each_entry(q, &set->tag_list, tag_set_list) - if (!blk_mq_elv_switch_none(&head, q)) + if (blk_mq_elv_switch_none(q, &elv_tbl)) goto switch_back; - list_for_each_entry(q, &set->tag_list, tag_set_list) { - blk_mq_debugfs_unregister_hctxs(q); - blk_mq_sysfs_unregister_hctxs(q); - } - + list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_freeze_queue_nomemsave(q); + queues_frozen = true; if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) - goto reregister; + goto switch_back; fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { - blk_mq_realloc_hw_ctxs(set, q); - blk_mq_update_poll_flag(q); + __blk_mq_realloc_hw_ctxs(set, q); + if (q->nr_hw_queues != set->nr_hw_queues) { int i = prev_nr_hw_queues; @@ -4813,19 +5153,27 @@ fallback: } blk_mq_map_swqueue(q); } +switch_back: + /* The blk_mq_elv_switch_back unfreezes queue for us. */ + list_for_each_entry(q, &set->tag_list, tag_set_list) { + /* switch_back expects queue to be frozen */ + if (!queues_frozen) + blk_mq_freeze_queue_nomemsave(q); + blk_mq_elv_switch_back(q, &elv_tbl); + } -reregister: list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_sysfs_register_hctxs(q); blk_mq_debugfs_register_hctxs(q); - } -switch_back: - list_for_each_entry(q, &set->tag_list, tag_set_list) - blk_mq_elv_switch_back(&head, q); + blk_mq_remove_hw_queues_cpuhp(q); + blk_mq_add_hw_queues_cpuhp(q); + } - list_for_each_entry(q, &set->tag_list, tag_set_list) - blk_mq_unfreeze_queue(q); +out_free_ctx: + blk_mq_free_sched_ctx_batch(&elv_tbl); + xa_destroy(&elv_tbl); + memalloc_noio_restore(memflags); /* Free the excess tags when nr_hw_queues shrink. 
*/ for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++) @@ -4834,9 +5182,11 @@ switch_back: void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { + down_write(&set->update_nr_hwq_lock); mutex_lock(&set->tag_list_lock); __blk_mq_update_nr_hw_queues(set, nr_hw_queues); mutex_unlock(&set->tag_list_lock); + up_write(&set->update_nr_hwq_lock); } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); @@ -4870,9 +5220,9 @@ static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags) { - struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, cookie); - - return blk_hctx_poll(q, hctx, iob, flags); + if (!blk_mq_can_poll(q)) + return 0; + return blk_hctx_poll(q, q->queue_hw_ctx[cookie], iob, flags); } int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, |
