Diffstat (limited to 'block/blk-mq-sched.c')
-rw-r--r--	block/blk-mq-sched.c	319
1 files changed, 237 insertions, 82 deletions
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 451a2c1f1f32..e26898128a7e 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -59,19 +59,17 @@ static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
 		list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
 	struct request *rq;
 	LIST_HEAD(hctx_list);
-	unsigned int count = 0;
 
 	list_for_each_entry(rq, rq_list, queuelist) {
 		if (rq->mq_hctx != hctx) {
 			list_cut_before(&hctx_list, rq_list, &rq->queuelist);
 			goto dispatch;
 		}
-		count++;
 	}
 	list_splice_tail_init(rq_list, &hctx_list);
 
 dispatch:
-	return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
+	return blk_mq_dispatch_rq_list(hctx, &hctx_list, false);
 }
 
 #define BLK_MQ_BUDGET_DELAY	3		/* ms units */
@@ -167,7 +165,7 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 			dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
 		} while (!list_empty(&rq_list));
 	} else {
-		dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
+		dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, false);
 	}
 
 	if (busy)
@@ -261,7 +259,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 		/* round robin for fair dispatch */
 		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
 
-	} while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1));
+	} while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, false));
 
 	WRITE_ONCE(hctx->dispatch_from, ctx);
 	return ret;
@@ -298,7 +296,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	 */
 	if (!list_empty(&rq_list)) {
 		blk_mq_sched_mark_restart_hctx(hctx);
-		if (!blk_mq_dispatch_rq_list(hctx, &rq_list, 0))
+		if (!blk_mq_dispatch_rq_list(hctx, &rq_list, true))
 			return 0;
 		need_dispatch = true;
 	} else {
@@ -312,7 +310,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	if (need_dispatch)
 		return blk_mq_do_dispatch_ctx(hctx);
 	blk_mq_flush_busy_ctxs(hctx, &rq_list);
-	blk_mq_dispatch_rq_list(hctx, &rq_list, 0);
+	blk_mq_dispatch_rq_list(hctx, &rq_list, true);
 	return 0;
 }
 
@@ -349,10 +347,9 @@ bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
 	}
 
 	ctx = blk_mq_get_ctx(q);
-	hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
+	hctx = blk_mq_map_queue(bio->bi_opf, ctx);
 	type = hctx->type;
-	if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
-	    list_empty_careful(&ctx->rq_lists[type]))
+	if (list_empty_careful(&ctx->rq_lists[type]))
 		goto out_put;
 
 	/* default per sw-queue merge */
@@ -377,125 +374,290 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
 
-static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
-					  struct blk_mq_hw_ctx *hctx,
-					  unsigned int hctx_idx)
+/* called in queue's release handler, tagset has gone away */
+static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
 {
-	if (blk_mq_is_shared_tags(q->tag_set->flags)) {
-		hctx->sched_tags = q->sched_shared_tags;
-		return 0;
-	}
+	struct blk_mq_hw_ctx *hctx;
+	unsigned long i;
 
-	hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
-						    q->nr_requests);
+	queue_for_each_hw_ctx(q, hctx, i)
+		hctx->sched_tags = NULL;
 
-	if (!hctx->sched_tags)
-		return -ENOMEM;
-	return 0;
+	if (blk_mq_is_shared_tags(flags))
+		q->sched_shared_tags = NULL;
 }
 
-static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
+void blk_mq_sched_reg_debugfs(struct request_queue *q)
 {
-	blk_mq_free_rq_map(queue->sched_shared_tags);
-	queue->sched_shared_tags = NULL;
+	struct blk_mq_hw_ctx *hctx;
+	unsigned long i;
+
+	mutex_lock(&q->debugfs_mutex);
+	blk_mq_debugfs_register_sched(q);
+	queue_for_each_hw_ctx(q, hctx, i)
+		blk_mq_debugfs_register_sched_hctx(q, hctx);
+	mutex_unlock(&q->debugfs_mutex);
 }
 
-/* called in queue's release handler, tagset has gone away */
-static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
+void blk_mq_sched_unreg_debugfs(struct request_queue *q)
 {
 	struct blk_mq_hw_ctx *hctx;
 	unsigned long i;
 
-	queue_for_each_hw_ctx(q, hctx, i) {
-		if (hctx->sched_tags) {
-			if (!blk_mq_is_shared_tags(flags))
-				blk_mq_free_rq_map(hctx->sched_tags);
-			hctx->sched_tags = NULL;
+	mutex_lock(&q->debugfs_mutex);
+	queue_for_each_hw_ctx(q, hctx, i)
+		blk_mq_debugfs_unregister_sched_hctx(hctx);
+	blk_mq_debugfs_unregister_sched(q);
+	mutex_unlock(&q->debugfs_mutex);
+}
+
+void blk_mq_free_sched_tags(struct elevator_tags *et,
+		struct blk_mq_tag_set *set)
+{
+	unsigned long i;
+
+	/* Shared tags are stored at index 0 in @tags. */
+	if (blk_mq_is_shared_tags(set->flags))
+		blk_mq_free_map_and_rqs(set, et->tags[0], BLK_MQ_NO_HCTX_IDX);
+	else {
+		for (i = 0; i < et->nr_hw_queues; i++)
+			blk_mq_free_map_and_rqs(set, et->tags[i], i);
+	}
+
+	kfree(et);
+}
+
+void blk_mq_free_sched_res(struct elevator_resources *res,
+		struct elevator_type *type,
+		struct blk_mq_tag_set *set)
+{
+	if (res->et) {
+		blk_mq_free_sched_tags(res->et, set);
+		res->et = NULL;
+	}
+	if (res->data) {
+		blk_mq_free_sched_data(type, res->data);
+		res->data = NULL;
+	}
+}
+
+void blk_mq_free_sched_res_batch(struct xarray *elv_tbl,
+		struct blk_mq_tag_set *set)
+{
+	struct request_queue *q;
+	struct elv_change_ctx *ctx;
+
+	lockdep_assert_held_write(&set->update_nr_hwq_lock);
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		/*
+		 * Accessing q->elevator without holding q->elevator_lock is
+		 * safe because we're holding here set->update_nr_hwq_lock in
+		 * the writer context. So, scheduler update/switch code (which
+		 * acquires the same lock but in the reader context) can't run
+		 * concurrently.
+		 */
+		if (q->elevator) {
+			ctx = xa_load(elv_tbl, q->id);
+			if (!ctx) {
+				WARN_ON_ONCE(1);
+				continue;
+			}
+			blk_mq_free_sched_res(&ctx->res, ctx->type, set);
 		}
 	}
+}
 
-	if (blk_mq_is_shared_tags(flags))
-		blk_mq_exit_sched_shared_tags(q);
+void blk_mq_free_sched_ctx_batch(struct xarray *elv_tbl)
+{
+	unsigned long i;
+	struct elv_change_ctx *ctx;
+
+	xa_for_each(elv_tbl, i, ctx) {
+		xa_erase(elv_tbl, i);
+		kfree(ctx);
+	}
 }
 
-static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
+int blk_mq_alloc_sched_ctx_batch(struct xarray *elv_tbl,
+		struct blk_mq_tag_set *set)
 {
-	struct blk_mq_tag_set *set = queue->tag_set;
+	struct request_queue *q;
+	struct elv_change_ctx *ctx;
 
-	/*
-	 * Set initial depth at max so that we don't need to reallocate for
-	 * updating nr_requests.
-	 */
-	queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
-						BLK_MQ_NO_HCTX_IDX,
-						MAX_SCHED_RQ);
-	if (!queue->sched_shared_tags)
+	lockdep_assert_held_write(&set->update_nr_hwq_lock);
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		ctx = kzalloc(sizeof(struct elv_change_ctx), GFP_KERNEL);
+		if (!ctx)
+			return -ENOMEM;
+
+		if (xa_insert(elv_tbl, q->id, ctx, GFP_KERNEL)) {
+			kfree(ctx);
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
+struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
+		unsigned int nr_hw_queues, unsigned int nr_requests)
+{
+	unsigned int nr_tags;
+	int i;
+	struct elevator_tags *et;
+	gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
+
+	if (blk_mq_is_shared_tags(set->flags))
+		nr_tags = 1;
+	else
+		nr_tags = nr_hw_queues;
+
+	et = kmalloc(struct_size(et, tags, nr_tags), gfp);
+	if (!et)
+		return NULL;
+
+	et->nr_requests = nr_requests;
+	et->nr_hw_queues = nr_hw_queues;
+
+	if (blk_mq_is_shared_tags(set->flags)) {
+		/* Shared tags are stored at index 0 in @tags. */
+		et->tags[0] = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX,
+						MAX_SCHED_RQ);
+		if (!et->tags[0])
+			goto out;
+	} else {
+		for (i = 0; i < et->nr_hw_queues; i++) {
+			et->tags[i] = blk_mq_alloc_map_and_rqs(set, i,
+						et->nr_requests);
+			if (!et->tags[i])
+				goto out_unwind;
+		}
+	}
+
+	return et;
+out_unwind:
+	while (--i >= 0)
+		blk_mq_free_map_and_rqs(set, et->tags[i], i);
+out:
+	kfree(et);
+	return NULL;
+}
+
+int blk_mq_alloc_sched_res(struct request_queue *q,
+		struct elevator_type *type,
+		struct elevator_resources *res,
+		unsigned int nr_hw_queues)
+{
+	struct blk_mq_tag_set *set = q->tag_set;
+
+	res->et = blk_mq_alloc_sched_tags(set, nr_hw_queues,
+			blk_mq_default_nr_requests(set));
+	if (!res->et)
+		return -ENOMEM;
+
+	res->data = blk_mq_alloc_sched_data(q, type);
+	if (IS_ERR(res->data)) {
+		blk_mq_free_sched_tags(res->et, set);
 		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl,
+		struct blk_mq_tag_set *set, unsigned int nr_hw_queues)
+{
+	struct elv_change_ctx *ctx;
+	struct request_queue *q;
+	int ret = -ENOMEM;
 
-	blk_mq_tag_update_sched_shared_tags(queue);
+	lockdep_assert_held_write(&set->update_nr_hwq_lock);
 
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		/*
+		 * Accessing q->elevator without holding q->elevator_lock is
+		 * safe because we're holding here set->update_nr_hwq_lock in
+		 * the writer context. So, scheduler update/switch code (which
+		 * acquires the same lock but in the reader context) can't run
+		 * concurrently.
+		 */
+		if (q->elevator) {
+			ctx = xa_load(elv_tbl, q->id);
+			if (WARN_ON_ONCE(!ctx)) {
+				ret = -ENOENT;
+				goto out_unwind;
+			}
+
+			ret = blk_mq_alloc_sched_res(q, q->elevator->type,
+					&ctx->res, nr_hw_queues);
+			if (ret)
+				goto out_unwind;
+		}
+	}
 	return 0;
+
+out_unwind:
+	list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) {
+		if (q->elevator) {
+			ctx = xa_load(elv_tbl, q->id);
+			if (ctx)
+				blk_mq_free_sched_res(&ctx->res,
+						ctx->type, set);
+		}
+	}
+	return ret;
 }
 
 /* caller must have a reference to @e, will grab another one if successful */
-int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
+int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
+		struct elevator_resources *res)
 {
 	unsigned int flags = q->tag_set->flags;
+	struct elevator_tags *et = res->et;
 	struct blk_mq_hw_ctx *hctx;
 	struct elevator_queue *eq;
 	unsigned long i;
 	int ret;
 
-	/*
-	 * Default to double of smaller one between hw queue_depth and 128,
-	 * since we don't split into sync/async like the old code did.
-	 * Additionally, this is a per-hw queue depth.
-	 */
-	q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
-				   BLKDEV_DEFAULT_RQ);
+	eq = elevator_alloc(q, e, res);
+	if (!eq)
+		return -ENOMEM;
+
+	q->nr_requests = et->nr_requests;
 
 	if (blk_mq_is_shared_tags(flags)) {
-		ret = blk_mq_init_sched_shared_tags(q);
-		if (ret)
-			return ret;
+		/* Shared tags are stored at index 0 in @et->tags. */
+		q->sched_shared_tags = et->tags[0];
+		blk_mq_tag_update_sched_shared_tags(q, et->nr_requests);
 	}
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
-		if (ret)
-			goto err_free_map_and_rqs;
+		if (blk_mq_is_shared_tags(flags))
+			hctx->sched_tags = q->sched_shared_tags;
+		else
+			hctx->sched_tags = et->tags[i];
 	}
 
-	ret = e->ops.init_sched(q, e);
+	ret = e->ops.init_sched(q, eq);
 	if (ret)
-		goto err_free_map_and_rqs;
-
-	mutex_lock(&q->debugfs_mutex);
-	blk_mq_debugfs_register_sched(q);
-	mutex_unlock(&q->debugfs_mutex);
+		goto out;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (e->ops.init_hctx) {
 			ret = e->ops.init_hctx(hctx, i);
 			if (ret) {
-				eq = q->elevator;
-				blk_mq_sched_free_rqs(q);
 				blk_mq_exit_sched(q, eq);
 				kobject_put(&eq->kobj);
 				return ret;
 			}
 		}
-		mutex_lock(&q->debugfs_mutex);
-		blk_mq_debugfs_register_sched_hctx(q, hctx);
-		mutex_unlock(&q->debugfs_mutex);
 	}
-
 	return 0;
 
-err_free_map_and_rqs:
-	blk_mq_sched_free_rqs(q);
+out:
 	blk_mq_sched_tags_teardown(q, flags);
-
+	kobject_put(&eq->kobj);
 	q->elevator = NULL;
 	return ret;
 }
@@ -528,10 +690,6 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
 	unsigned int flags = 0;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		mutex_lock(&q->debugfs_mutex);
-		blk_mq_debugfs_unregister_sched_hctx(hctx);
-		mutex_unlock(&q->debugfs_mutex);
-
 		if (e->type->ops.exit_hctx && hctx->sched_data) {
 			e->type->ops.exit_hctx(hctx, i);
 			hctx->sched_data = NULL;
@@ -539,12 +697,9 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
 		flags = hctx->flags;
 	}
 
-	mutex_lock(&q->debugfs_mutex);
-	blk_mq_debugfs_unregister_sched(q);
-	mutex_unlock(&q->debugfs_mutex);
-
 	if (e->type->ops.exit_sched)
 		e->type->ops.exit_sched(e);
 	blk_mq_sched_tags_teardown(q, flags);
+	set_bit(ELEVATOR_FLAG_DYING, &q->elevator->flags);
 	q->elevator = NULL;
 }
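A large part of this diff replaces per-hctx scheduler tag allocation with a single elevator_tags structure that carries a flexible array of tag maps and is unwound as a unit when a later allocation fails (see blk_mq_alloc_sched_tags() above). The standalone C program below is only a userspace sketch of that allocate-then-unwind idiom, not kernel code; demo_tags, alloc_one_map and free_one_map are hypothetical stand-ins for elevator_tags and blk_mq_alloc/free_map_and_rqs().

#include <stdlib.h>
#include <stdio.h>

struct demo_tags {
	unsigned int nr_hw_queues;
	void *tags[];			/* flexible array, one slot per hw queue */
};

static void *alloc_one_map(unsigned int idx)
{
	/* Stand-in for a per-queue map allocation; fails at idx == 3 for the demo. */
	return (idx == 3) ? NULL : malloc(16);
}

static void free_one_map(void *map)
{
	free(map);			/* stand-in for freeing one map */
}

static struct demo_tags *alloc_tags(unsigned int nr_hw_queues)
{
	struct demo_tags *et;
	int i;

	et = malloc(sizeof(*et) + nr_hw_queues * sizeof(et->tags[0]));
	if (!et)
		return NULL;
	et->nr_hw_queues = nr_hw_queues;

	for (i = 0; i < (int)nr_hw_queues; i++) {
		et->tags[i] = alloc_one_map(i);
		if (!et->tags[i])
			goto out_unwind;
	}
	return et;

out_unwind:
	/* Free only the maps that were actually allocated, newest first. */
	while (--i >= 0)
		free_one_map(et->tags[i]);
	free(et);
	return NULL;
}

int main(void)
{
	struct demo_tags *et = alloc_tags(8);

	printf("alloc_tags(8): %s\n", et ? "ok" : "failed, fully unwound");
	if (et) {
		for (unsigned int i = 0; i < et->nr_hw_queues; i++)
			free_one_map(et->tags[i]);
		free(et);
	}
	return 0;
}

The reverse "while (--i >= 0)" loop is the same shape as the out_unwind label in blk_mq_alloc_sched_tags(): a failure at any index frees exactly the slots that were populated and leaks nothing.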

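The new batch helpers follow a two-phase pattern: blk_mq_alloc_sched_ctx_batch() and blk_mq_alloc_sched_res_batch() pre-allocate a per-queue context and its scheduler resources for every queue in the tag set before anything is committed, and blk_mq_free_sched_res_batch()/blk_mq_free_sched_ctx_batch() discard it all if the update is abandoned. The sketch below mirrors only that prepare-everything-then-commit-or-discard shape in userspace, with a plain array keyed by queue id standing in for the xarray; demo_ctx, ctx_tbl, prepare_all and discard_all are hypothetical names.

#include <stdlib.h>
#include <stdio.h>

#define NR_QUEUES 4

struct demo_ctx {
	void *res;			/* stand-in for per-queue scheduler resources */
};

/* Plain array keyed by queue id, standing in for an id-keyed lookup table. */
static struct demo_ctx *ctx_tbl[NR_QUEUES];

/* Phase 1: allocate a context plus its resources for every queue up front. */
static int prepare_all(void)
{
	for (int id = 0; id < NR_QUEUES; id++) {
		struct demo_ctx *ctx = calloc(1, sizeof(*ctx));

		if (!ctx || !(ctx->res = malloc(64))) {
			free(ctx);
			return -1;	/* caller discards whatever was prepared */
		}
		ctx_tbl[id] = ctx;
	}
	return 0;
}

/* Discard path: free the resources first, then the contexts themselves. */
static void discard_all(void)
{
	for (int id = 0; id < NR_QUEUES; id++) {
		if (ctx_tbl[id]) {
			free(ctx_tbl[id]->res);
			free(ctx_tbl[id]);
			ctx_tbl[id] = NULL;
		}
	}
}

int main(void)
{
	if (prepare_all()) {
		discard_all();
		printf("preparation failed, nothing committed\n");
		return 1;
	}
	printf("all %d queue contexts prepared before committing\n", NR_QUEUES);
	discard_all();		/* in the kernel this runs once the switch is done */
	return 0;
}

Doing all allocations before touching any queue is what lets the kernel side roll back with list_for_each_entry_continue_reverse() in blk_mq_alloc_sched_res_batch(): a failure part-way through leaves every queue in its original state.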