summaryrefslogtreecommitdiff
path: root/block/blk-mq-sched.c
diff options
context:
space:
mode:
Diffstat (limited to 'block/blk-mq-sched.c')
-rw-r--r--block/blk-mq-sched.c915
1 files changed, 517 insertions, 398 deletions
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 4ab69435708c..e26898128a7e 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* blk-mq scheduling framework
*
@@ -5,7 +6,7 @@
*/
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/blk-mq.h>
+#include <linux/list_sort.h>
#include <trace/events/block.h>
@@ -13,95 +14,261 @@
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
-#include "blk-mq-tag.h"
#include "blk-wbt.h"
-void blk_mq_sched_free_hctx_data(struct request_queue *q,
- void (*exit)(struct blk_mq_hw_ctx *))
+/*
+ * Mark a hardware queue as needing a restart.
+ */
+void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
- struct blk_mq_hw_ctx *hctx;
- int i;
+ if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+ return;
- queue_for_each_hw_ctx(q, hctx, i) {
- if (exit && hctx->sched_data)
- exit(hctx);
- kfree(hctx->sched_data);
- hctx->sched_data = NULL;
- }
+ set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}
-EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
+EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);
-void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio)
+void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
- struct request_queue *q = rq->q;
- struct io_context *ioc = rq_ioc(bio);
- struct io_cq *icq;
-
- spin_lock_irq(q->queue_lock);
- icq = ioc_lookup_icq(ioc, q);
- spin_unlock_irq(q->queue_lock);
-
- if (!icq) {
- icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
- if (!icq)
- return;
+ clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+
+ /*
+ * Order clearing SCHED_RESTART and list_empty_careful(&hctx->dispatch)
+ * in blk_mq_run_hw_queue(). Its pair is the barrier in
+ * blk_mq_dispatch_rq_list(). So dispatch code won't see SCHED_RESTART,
+ * meantime new request added to hctx->dispatch is missed to check in
+ * blk_mq_run_hw_queue().
+ */
+ smp_mb();
+
+ blk_mq_run_hw_queue(hctx, true);
+}
+
+static int sched_rq_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
+{
+ struct request *rqa = container_of(a, struct request, queuelist);
+ struct request *rqb = container_of(b, struct request, queuelist);
+
+ return rqa->mq_hctx > rqb->mq_hctx;
+}
+
+static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
+{
+ struct blk_mq_hw_ctx *hctx =
+ list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
+ struct request *rq;
+ LIST_HEAD(hctx_list);
+
+ list_for_each_entry(rq, rq_list, queuelist) {
+ if (rq->mq_hctx != hctx) {
+ list_cut_before(&hctx_list, rq_list, &rq->queuelist);
+ goto dispatch;
+ }
}
- get_io_context(icq->ioc);
- rq->elv.icq = icq;
+ list_splice_tail_init(rq_list, &hctx_list);
+
+dispatch:
+ return blk_mq_dispatch_rq_list(hctx, &hctx_list, false);
}
+#define BLK_MQ_BUDGET_DELAY 3 /* ms units */
+
/*
- * Mark a hardware queue as needing a restart. For shared queues, maintain
- * a count of how many hardware queues are marked for restart.
+ * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
+ * its queue by itself in its completion handler, so we don't need to
+ * restart queue if .get_budget() fails to get the budget.
+ *
+ * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
+ * be run again. This is necessary to avoid starving flushes.
*/
-static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
+static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
- if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
- return;
+ struct request_queue *q = hctx->queue;
+ struct elevator_queue *e = q->elevator;
+ bool multi_hctxs = false, run_queue = false;
+ bool dispatched = false, busy = false;
+ unsigned int max_dispatch;
+ LIST_HEAD(rq_list);
+ int count = 0;
- if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
- struct request_queue *q = hctx->queue;
+ if (hctx->dispatch_busy)
+ max_dispatch = 1;
+ else
+ max_dispatch = hctx->queue->nr_requests;
- if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
- atomic_inc(&q->shared_hctx_restart);
- } else
- set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+ do {
+ struct request *rq;
+ int budget_token;
+
+ if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
+ break;
+
+ if (!list_empty_careful(&hctx->dispatch)) {
+ busy = true;
+ break;
+ }
+
+ budget_token = blk_mq_get_dispatch_budget(q);
+ if (budget_token < 0)
+ break;
+
+ rq = e->type->ops.dispatch_request(hctx);
+ if (!rq) {
+ blk_mq_put_dispatch_budget(q, budget_token);
+ /*
+ * We're releasing without dispatching. Holding the
+ * budget could have blocked any "hctx"s with the
+ * same queue and if we didn't dispatch then there's
+ * no guarantee anyone will kick the queue. Kick it
+ * ourselves.
+ */
+ run_queue = true;
+ break;
+ }
+
+ blk_mq_set_rq_budget_token(rq, budget_token);
+
+ /*
+ * Now this rq owns the budget which has to be released
+ * if this rq won't be queued to driver via .queue_rq()
+ * in blk_mq_dispatch_rq_list().
+ */
+ list_add_tail(&rq->queuelist, &rq_list);
+ count++;
+ if (rq->mq_hctx != hctx)
+ multi_hctxs = true;
+
+ /*
+ * If we cannot get tag for the request, stop dequeueing
+ * requests from the IO scheduler. We are unlikely to be able
+ * to submit them anyway and it creates false impression for
+ * scheduling heuristics that the device can take more IO.
+ */
+ if (!blk_mq_get_driver_tag(rq))
+ break;
+ } while (count < max_dispatch);
+
+ if (!count) {
+ if (run_queue)
+ blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
+ } else if (multi_hctxs) {
+ /*
+ * Requests from different hctx may be dequeued from some
+ * schedulers, such as bfq and deadline.
+ *
+ * Sort the requests in the list according to their hctx,
+ * dispatch batching requests from same hctx at a time.
+ */
+ list_sort(NULL, &rq_list, sched_rq_cmp);
+ do {
+ dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
+ } while (!list_empty(&rq_list));
+ } else {
+ dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, false);
+ }
+
+ if (busy)
+ return -EAGAIN;
+ return !!dispatched;
}
-static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
+static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
- if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
- return false;
+ unsigned long end = jiffies + HZ;
+ int ret;
- if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
- struct request_queue *q = hctx->queue;
+ do {
+ ret = __blk_mq_do_dispatch_sched(hctx);
+ if (ret != 1)
+ break;
+ if (need_resched() || time_is_before_jiffies(end)) {
+ blk_mq_delay_run_hw_queue(hctx, 0);
+ break;
+ }
+ } while (1);
- if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
- atomic_dec(&q->shared_hctx_restart);
- } else
- clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+ return ret;
+}
- if (blk_mq_hctx_has_pending(hctx)) {
- blk_mq_run_hw_queue(hctx, true);
- return true;
- }
+static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_ctx *ctx)
+{
+ unsigned short idx = ctx->index_hw[hctx->type];
+
+ if (++idx == hctx->nr_ctx)
+ idx = 0;
- return false;
+ return hctx->ctxs[idx];
}
-void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+/*
+ * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
+ * its queue by itself in its completion handler, so we don't need to
+ * restart queue if .get_budget() fails to get the budget.
+ *
+ * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
+ * be run again. This is necessary to avoid starving flushes.
+ */
+static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
- struct elevator_queue *e = q->elevator;
- const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
- bool did_work = false;
LIST_HEAD(rq_list);
+ struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+ int ret = 0;
+ struct request *rq;
- /* RCU or SRCU read lock is needed before checking quiesced flag */
- if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
- return;
+ do {
+ int budget_token;
+
+ if (!list_empty_careful(&hctx->dispatch)) {
+ ret = -EAGAIN;
+ break;
+ }
+
+ if (!sbitmap_any_bit_set(&hctx->ctx_map))
+ break;
+
+ budget_token = blk_mq_get_dispatch_budget(q);
+ if (budget_token < 0)
+ break;
+
+ rq = blk_mq_dequeue_from_ctx(hctx, ctx);
+ if (!rq) {
+ blk_mq_put_dispatch_budget(q, budget_token);
+ /*
+ * We're releasing without dispatching. Holding the
+ * budget could have blocked any "hctx"s with the
+ * same queue and if we didn't dispatch then there's
+ * no guarantee anyone will kick the queue. Kick it
+ * ourselves.
+ */
+ blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
+ break;
+ }
+
+ blk_mq_set_rq_budget_token(rq, budget_token);
+
+ /*
+ * Now this rq owns the budget which has to be released
+ * if this rq won't be queued to driver via .queue_rq()
+ * in blk_mq_dispatch_rq_list().
+ */
+ list_add(&rq->queuelist, &rq_list);
+
+ /* round robin for fair dispatch */
+ ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
+
+ } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, false));
+
+ WRITE_ONCE(hctx->dispatch_from, ctx);
+ return ret;
+}
- hctx->run++;
+static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+{
+ bool need_dispatch = false;
+ LIST_HEAD(rq_list);
/*
* If we have previous entries on our dispatch list, grab them first for
@@ -122,465 +289,417 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
* scheduler, we can no longer merge or sort them. So it's best to
* leave them there for as long as we can. Mark the hw queue as
* needing a restart in that case.
+ *
+ * We want to dispatch from the scheduler if there was nothing
+ * on the dispatch list or we were able to dispatch from the
+ * dispatch list.
*/
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
- did_work = blk_mq_dispatch_rq_list(q, &rq_list);
- } else if (!has_sched_dispatch) {
- blk_mq_flush_busy_ctxs(hctx, &rq_list);
- blk_mq_dispatch_rq_list(q, &rq_list);
- }
-
- /*
- * We want to dispatch from the scheduler if we had no work left
- * on the dispatch list, OR if we did have work but weren't able
- * to make progress.
- */
- if (!did_work && has_sched_dispatch) {
- do {
- struct request *rq;
-
- rq = e->type->ops.mq.dispatch_request(hctx);
- if (!rq)
- break;
- list_add(&rq->queuelist, &rq_list);
- } while (blk_mq_dispatch_rq_list(q, &rq_list));
+ if (!blk_mq_dispatch_rq_list(hctx, &rq_list, true))
+ return 0;
+ need_dispatch = true;
+ } else {
+ need_dispatch = hctx->dispatch_busy;
}
-}
-bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
- struct request **merged_request)
-{
- struct request *rq;
+ if (hctx->queue->elevator)
+ return blk_mq_do_dispatch_sched(hctx);
- switch (elv_merge(q, &rq, bio)) {
- case ELEVATOR_BACK_MERGE:
- if (!blk_mq_sched_allow_merge(q, rq, bio))
- return false;
- if (!bio_attempt_back_merge(q, rq, bio))
- return false;
- *merged_request = attempt_back_merge(q, rq);
- if (!*merged_request)
- elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
- return true;
- case ELEVATOR_FRONT_MERGE:
- if (!blk_mq_sched_allow_merge(q, rq, bio))
- return false;
- if (!bio_attempt_front_merge(q, rq, bio))
- return false;
- *merged_request = attempt_front_merge(q, rq);
- if (!*merged_request)
- elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
- return true;
- default:
- return false;
- }
+ /* dequeue request one by one from sw queue if queue is busy */
+ if (need_dispatch)
+ return blk_mq_do_dispatch_ctx(hctx);
+ blk_mq_flush_busy_ctxs(hctx, &rq_list);
+ blk_mq_dispatch_rq_list(hctx, &rq_list, true);
+ return 0;
}
-EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
-/*
- * Reverse check our software queue for entries that we could potentially
- * merge with. Currently includes a hand-wavy stop count of 8, to not spend
- * too much time checking for merges.
- */
-static bool blk_mq_attempt_merge(struct request_queue *q,
- struct blk_mq_ctx *ctx, struct bio *bio)
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
- struct request *rq;
- int checked = 8;
-
- lockdep_assert_held(&ctx->lock);
-
- list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
- bool merged = false;
-
- if (!checked--)
- break;
-
- if (!blk_rq_merge_ok(rq, bio))
- continue;
+ struct request_queue *q = hctx->queue;
- switch (blk_try_merge(rq, bio)) {
- case ELEVATOR_BACK_MERGE:
- if (blk_mq_sched_allow_merge(q, rq, bio))
- merged = bio_attempt_back_merge(q, rq, bio);
- break;
- case ELEVATOR_FRONT_MERGE:
- if (blk_mq_sched_allow_merge(q, rq, bio))
- merged = bio_attempt_front_merge(q, rq, bio);
- break;
- case ELEVATOR_DISCARD_MERGE:
- merged = bio_attempt_discard_merge(q, rq, bio);
- break;
- default:
- continue;
- }
+ /* RCU or SRCU read lock is needed before checking quiesced flag */
+ if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
+ return;
- if (merged)
- ctx->rq_merged++;
- return merged;
+ /*
+ * A return of -EAGAIN is an indication that hctx->dispatch is not
+ * empty and we must run again in order to avoid starving flushes.
+ */
+ if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) {
+ if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN)
+ blk_mq_run_hw_queue(hctx, true);
}
-
- return false;
}
-bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
+ unsigned int nr_segs)
{
struct elevator_queue *e = q->elevator;
- struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+ struct blk_mq_ctx *ctx;
+ struct blk_mq_hw_ctx *hctx;
bool ret = false;
+ enum hctx_type type;
- if (e && e->type->ops.mq.bio_merge) {
- blk_mq_put_ctx(ctx);
- return e->type->ops.mq.bio_merge(hctx, bio);
+ if (e && e->type->ops.bio_merge) {
+ ret = e->type->ops.bio_merge(q, bio, nr_segs);
+ goto out_put;
}
- if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) {
- /* default per sw-queue merge */
- spin_lock(&ctx->lock);
- ret = blk_mq_attempt_merge(q, ctx, bio);
- spin_unlock(&ctx->lock);
- }
+ ctx = blk_mq_get_ctx(q);
+ hctx = blk_mq_map_queue(bio->bi_opf, ctx);
+ type = hctx->type;
+ if (list_empty_careful(&ctx->rq_lists[type]))
+ goto out_put;
- blk_mq_put_ctx(ctx);
+ /* default per sw-queue merge */
+ spin_lock(&ctx->lock);
+ /*
+ * Reverse check our software queue for entries that we could
+ * potentially merge with. Currently includes a hand-wavy stop
+ * count of 8, to not spend too much time checking for merges.
+ */
+ if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs))
+ ret = true;
+
+ spin_unlock(&ctx->lock);
+out_put:
return ret;
}
-bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
+ struct list_head *free)
{
- return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
+ return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
-void blk_mq_sched_request_inserted(struct request *rq)
+/* called in queue's release handler, tagset has gone away */
+static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
{
- trace_block_rq_insert(rq->q, rq);
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
-static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
- struct request *rq)
-{
- if (rq->tag == -1) {
- rq->rq_flags |= RQF_SORTED;
- return false;
- }
+ queue_for_each_hw_ctx(q, hctx, i)
+ hctx->sched_tags = NULL;
- /*
- * If we already have a real request tag, send directly to
- * the dispatch list.
- */
- spin_lock(&hctx->lock);
- list_add(&rq->queuelist, &hctx->dispatch);
- spin_unlock(&hctx->lock);
- return true;
+ if (blk_mq_is_shared_tags(flags))
+ q->sched_shared_tags = NULL;
}
-/**
- * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
- * @pos: loop cursor.
- * @skip: the list element that will not be examined. Iteration starts at
- * @skip->next.
- * @head: head of the list to examine. This list must have at least one
- * element, namely @skip.
- * @member: name of the list_head structure within typeof(*pos).
- */
-#define list_for_each_entry_rcu_rr(pos, skip, head, member) \
- for ((pos) = (skip); \
- (pos = (pos)->member.next != (head) ? list_entry_rcu( \
- (pos)->member.next, typeof(*pos), member) : \
- list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
- (pos) != (skip); )
-
-/*
- * Called after a driver tag has been freed to check whether a hctx needs to
- * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware
- * queues in a round-robin fashion if the tag set of @hctx is shared with other
- * hardware queues.
- */
-void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
+void blk_mq_sched_reg_debugfs(struct request_queue *q)
{
- struct blk_mq_tags *const tags = hctx->tags;
- struct blk_mq_tag_set *const set = hctx->queue->tag_set;
- struct request_queue *const queue = hctx->queue, *q;
- struct blk_mq_hw_ctx *hctx2;
- unsigned int i, j;
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
- if (set->flags & BLK_MQ_F_TAG_SHARED) {
- /*
- * If this is 0, then we know that no hardware queues
- * have RESTART marked. We're done.
- */
- if (!atomic_read(&queue->shared_hctx_restart))
- return;
-
- rcu_read_lock();
- list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
- tag_set_list) {
- queue_for_each_hw_ctx(q, hctx2, i)
- if (hctx2->tags == tags &&
- blk_mq_sched_restart_hctx(hctx2))
- goto done;
- }
- j = hctx->queue_num + 1;
- for (i = 0; i < queue->nr_hw_queues; i++, j++) {
- if (j == queue->nr_hw_queues)
- j = 0;
- hctx2 = queue->queue_hw_ctx[j];
- if (hctx2->tags == tags &&
- blk_mq_sched_restart_hctx(hctx2))
- break;
- }
-done:
- rcu_read_unlock();
- } else {
- blk_mq_sched_restart_hctx(hctx);
- }
+ mutex_lock(&q->debugfs_mutex);
+ blk_mq_debugfs_register_sched(q);
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_debugfs_register_sched_hctx(q, hctx);
+ mutex_unlock(&q->debugfs_mutex);
}
-/*
- * Add flush/fua to the queue. If we fail getting a driver tag, then
- * punt to the requeue list. Requeue will re-invoke us from a context
- * that's safe to block from.
- */
-static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
- struct request *rq, bool can_block)
+void blk_mq_sched_unreg_debugfs(struct request_queue *q)
{
- if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
- blk_insert_flush(rq);
- blk_mq_run_hw_queue(hctx, true);
- } else
- blk_mq_add_to_requeue_list(rq, false, true);
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
+
+ mutex_lock(&q->debugfs_mutex);
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_debugfs_unregister_sched_hctx(hctx);
+ blk_mq_debugfs_unregister_sched(q);
+ mutex_unlock(&q->debugfs_mutex);
}
-void blk_mq_sched_insert_request(struct request *rq, bool at_head,
- bool run_queue, bool async, bool can_block)
+void blk_mq_free_sched_tags(struct elevator_tags *et,
+ struct blk_mq_tag_set *set)
{
- struct request_queue *q = rq->q;
- struct elevator_queue *e = q->elevator;
- struct blk_mq_ctx *ctx = rq->mq_ctx;
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-
- if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) {
- blk_mq_sched_insert_flush(hctx, rq, can_block);
- return;
+ unsigned long i;
+
+ /* Shared tags are stored at index 0 in @tags. */
+ if (blk_mq_is_shared_tags(set->flags))
+ blk_mq_free_map_and_rqs(set, et->tags[0], BLK_MQ_NO_HCTX_IDX);
+ else {
+ for (i = 0; i < et->nr_hw_queues; i++)
+ blk_mq_free_map_and_rqs(set, et->tags[i], i);
}
- if (e && blk_mq_sched_bypass_insert(hctx, rq))
- goto run;
-
- if (e && e->type->ops.mq.insert_requests) {
- LIST_HEAD(list);
+ kfree(et);
+}
- list_add(&rq->queuelist, &list);
- e->type->ops.mq.insert_requests(hctx, &list, at_head);
- } else {
- spin_lock(&ctx->lock);
- __blk_mq_insert_request(hctx, rq, at_head);
- spin_unlock(&ctx->lock);
+void blk_mq_free_sched_res(struct elevator_resources *res,
+ struct elevator_type *type,
+ struct blk_mq_tag_set *set)
+{
+ if (res->et) {
+ blk_mq_free_sched_tags(res->et, set);
+ res->et = NULL;
+ }
+ if (res->data) {
+ blk_mq_free_sched_data(type, res->data);
+ res->data = NULL;
}
-
-run:
- if (run_queue)
- blk_mq_run_hw_queue(hctx, async);
}
-void blk_mq_sched_insert_requests(struct request_queue *q,
- struct blk_mq_ctx *ctx,
- struct list_head *list, bool run_queue_async)
+void blk_mq_free_sched_res_batch(struct xarray *elv_tbl,
+ struct blk_mq_tag_set *set)
{
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
- struct elevator_queue *e = hctx->queue->elevator;
+ struct request_queue *q;
+ struct elv_change_ctx *ctx;
- if (e) {
- struct request *rq, *next;
+ lockdep_assert_held_write(&set->update_nr_hwq_lock);
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
/*
- * We bypass requests that already have a driver tag assigned,
- * which should only be flushes. Flushes are only ever inserted
- * as single requests, so we shouldn't ever hit the
- * WARN_ON_ONCE() below (but let's handle it just in case).
+ * Accessing q->elevator without holding q->elevator_lock is
+ * safe because we're holding here set->update_nr_hwq_lock in
+ * the writer context. So, scheduler update/switch code (which
+ * acquires the same lock but in the reader context) can't run
+ * concurrently.
*/
- list_for_each_entry_safe(rq, next, list, queuelist) {
- if (WARN_ON_ONCE(rq->tag != -1)) {
- list_del_init(&rq->queuelist);
- blk_mq_sched_bypass_insert(hctx, rq);
+ if (q->elevator) {
+ ctx = xa_load(elv_tbl, q->id);
+ if (!ctx) {
+ WARN_ON_ONCE(1);
+ continue;
}
+ blk_mq_free_sched_res(&ctx->res, ctx->type, set);
}
}
-
- if (e && e->type->ops.mq.insert_requests)
- e->type->ops.mq.insert_requests(hctx, list, false);
- else
- blk_mq_insert_requests(hctx, ctx, list);
-
- blk_mq_run_hw_queue(hctx, run_queue_async);
}
-static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
- struct blk_mq_hw_ctx *hctx,
- unsigned int hctx_idx)
+void blk_mq_free_sched_ctx_batch(struct xarray *elv_tbl)
{
- if (hctx->sched_tags) {
- blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
- blk_mq_free_rq_map(hctx->sched_tags);
- hctx->sched_tags = NULL;
+ unsigned long i;
+ struct elv_change_ctx *ctx;
+
+ xa_for_each(elv_tbl, i, ctx) {
+ xa_erase(elv_tbl, i);
+ kfree(ctx);
}
}
-static int blk_mq_sched_alloc_tags(struct request_queue *q,
- struct blk_mq_hw_ctx *hctx,
- unsigned int hctx_idx)
+int blk_mq_alloc_sched_ctx_batch(struct xarray *elv_tbl,
+ struct blk_mq_tag_set *set)
{
- struct blk_mq_tag_set *set = q->tag_set;
- int ret;
+ struct request_queue *q;
+ struct elv_change_ctx *ctx;
- hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
- set->reserved_tags);
- if (!hctx->sched_tags)
- return -ENOMEM;
+ lockdep_assert_held_write(&set->update_nr_hwq_lock);
- ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
- if (ret)
- blk_mq_sched_free_tags(set, hctx, hctx_idx);
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ ctx = kzalloc(sizeof(struct elv_change_ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
- return ret;
+ if (xa_insert(elv_tbl, q->id, ctx, GFP_KERNEL)) {
+ kfree(ctx);
+ return -ENOMEM;
+ }
+ }
+ return 0;
}
-static void blk_mq_sched_tags_teardown(struct request_queue *q)
+struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
+ unsigned int nr_hw_queues, unsigned int nr_requests)
{
- struct blk_mq_tag_set *set = q->tag_set;
- struct blk_mq_hw_ctx *hctx;
+ unsigned int nr_tags;
int i;
+ struct elevator_tags *et;
+ gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
- queue_for_each_hw_ctx(q, hctx, i)
- blk_mq_sched_free_tags(set, hctx, i);
-}
-
-int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
- unsigned int hctx_idx)
-{
- struct elevator_queue *e = q->elevator;
- int ret;
+ if (blk_mq_is_shared_tags(set->flags))
+ nr_tags = 1;
+ else
+ nr_tags = nr_hw_queues;
- if (!e)
- return 0;
+ et = kmalloc(struct_size(et, tags, nr_tags), gfp);
+ if (!et)
+ return NULL;
- ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
- if (ret)
- return ret;
+ et->nr_requests = nr_requests;
+ et->nr_hw_queues = nr_hw_queues;
- if (e->type->ops.mq.init_hctx) {
- ret = e->type->ops.mq.init_hctx(hctx, hctx_idx);
- if (ret) {
- blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
- return ret;
+ if (blk_mq_is_shared_tags(set->flags)) {
+ /* Shared tags are stored at index 0 in @tags. */
+ et->tags[0] = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX,
+ MAX_SCHED_RQ);
+ if (!et->tags[0])
+ goto out;
+ } else {
+ for (i = 0; i < et->nr_hw_queues; i++) {
+ et->tags[i] = blk_mq_alloc_map_and_rqs(set, i,
+ et->nr_requests);
+ if (!et->tags[i])
+ goto out_unwind;
}
}
- blk_mq_debugfs_register_sched_hctx(q, hctx);
+ return et;
+out_unwind:
+ while (--i >= 0)
+ blk_mq_free_map_and_rqs(set, et->tags[i], i);
+out:
+ kfree(et);
+ return NULL;
+}
+
+int blk_mq_alloc_sched_res(struct request_queue *q,
+ struct elevator_type *type,
+ struct elevator_resources *res,
+ unsigned int nr_hw_queues)
+{
+ struct blk_mq_tag_set *set = q->tag_set;
+
+ res->et = blk_mq_alloc_sched_tags(set, nr_hw_queues,
+ blk_mq_default_nr_requests(set));
+ if (!res->et)
+ return -ENOMEM;
+
+ res->data = blk_mq_alloc_sched_data(q, type);
+ if (IS_ERR(res->data)) {
+ blk_mq_free_sched_tags(res->et, set);
+ return -ENOMEM;
+ }
return 0;
}
-void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
- unsigned int hctx_idx)
+int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl,
+ struct blk_mq_tag_set *set, unsigned int nr_hw_queues)
{
- struct elevator_queue *e = q->elevator;
+ struct elv_change_ctx *ctx;
+ struct request_queue *q;
+ int ret = -ENOMEM;
- if (!e)
- return;
+ lockdep_assert_held_write(&set->update_nr_hwq_lock);
- blk_mq_debugfs_unregister_sched_hctx(hctx);
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ /*
+ * Accessing q->elevator without holding q->elevator_lock is
+ * safe because we're holding here set->update_nr_hwq_lock in
+ * the writer context. So, scheduler update/switch code (which
+ * acquires the same lock but in the reader context) can't run
+ * concurrently.
+ */
+ if (q->elevator) {
+ ctx = xa_load(elv_tbl, q->id);
+ if (WARN_ON_ONCE(!ctx)) {
+ ret = -ENOENT;
+ goto out_unwind;
+ }
- if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
- e->type->ops.mq.exit_hctx(hctx, hctx_idx);
- hctx->sched_data = NULL;
+ ret = blk_mq_alloc_sched_res(q, q->elevator->type,
+ &ctx->res, nr_hw_queues);
+ if (ret)
+ goto out_unwind;
+ }
}
+ return 0;
- blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
+out_unwind:
+ list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) {
+ if (q->elevator) {
+ ctx = xa_load(elv_tbl, q->id);
+ if (ctx)
+ blk_mq_free_sched_res(&ctx->res,
+ ctx->type, set);
+ }
+ }
+ return ret;
}
-int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
+/* caller must have a reference to @e, will grab another one if successful */
+int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
+ struct elevator_resources *res)
{
+ unsigned int flags = q->tag_set->flags;
+ struct elevator_tags *et = res->et;
struct blk_mq_hw_ctx *hctx;
struct elevator_queue *eq;
- unsigned int i;
+ unsigned long i;
int ret;
- if (!e) {
- q->elevator = NULL;
- return 0;
- }
+ eq = elevator_alloc(q, e, res);
+ if (!eq)
+ return -ENOMEM;
- /*
- * Default to double of smaller one between hw queue_depth and 128,
- * since we don't split into sync/async like the old code did.
- * Additionally, this is a per-hw queue depth.
- */
- q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
- BLKDEV_MAX_RQ);
+ q->nr_requests = et->nr_requests;
+
+ if (blk_mq_is_shared_tags(flags)) {
+ /* Shared tags are stored at index 0 in @et->tags. */
+ q->sched_shared_tags = et->tags[0];
+ blk_mq_tag_update_sched_shared_tags(q, et->nr_requests);
+ }
queue_for_each_hw_ctx(q, hctx, i) {
- ret = blk_mq_sched_alloc_tags(q, hctx, i);
- if (ret)
- goto err;
+ if (blk_mq_is_shared_tags(flags))
+ hctx->sched_tags = q->sched_shared_tags;
+ else
+ hctx->sched_tags = et->tags[i];
}
- ret = e->ops.mq.init_sched(q, e);
+ ret = e->ops.init_sched(q, eq);
if (ret)
- goto err;
-
- blk_mq_debugfs_register_sched(q);
+ goto out;
queue_for_each_hw_ctx(q, hctx, i) {
- if (e->ops.mq.init_hctx) {
- ret = e->ops.mq.init_hctx(hctx, i);
+ if (e->ops.init_hctx) {
+ ret = e->ops.init_hctx(hctx, i);
if (ret) {
- eq = q->elevator;
blk_mq_exit_sched(q, eq);
kobject_put(&eq->kobj);
return ret;
}
}
- blk_mq_debugfs_register_sched_hctx(q, hctx);
}
-
return 0;
-err:
- blk_mq_sched_tags_teardown(q);
+out:
+ blk_mq_sched_tags_teardown(q, flags);
+ kobject_put(&eq->kobj);
q->elevator = NULL;
return ret;
}
-void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
+/*
+ * called in either blk_queue_cleanup or elevator_switch, tagset
+ * is required for freeing requests
+ */
+void blk_mq_sched_free_rqs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- unsigned int i;
+ unsigned long i;
- queue_for_each_hw_ctx(q, hctx, i) {
- blk_mq_debugfs_unregister_sched_hctx(hctx);
- if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
- e->type->ops.mq.exit_hctx(hctx, i);
- hctx->sched_data = NULL;
+ if (blk_mq_is_shared_tags(q->tag_set->flags)) {
+ blk_mq_free_rqs(q->tag_set, q->sched_shared_tags,
+ BLK_MQ_NO_HCTX_IDX);
+ } else {
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (hctx->sched_tags)
+ blk_mq_free_rqs(q->tag_set,
+ hctx->sched_tags, i);
}
}
- blk_mq_debugfs_unregister_sched(q);
- if (e->type->ops.mq.exit_sched)
- e->type->ops.mq.exit_sched(e);
- blk_mq_sched_tags_teardown(q);
- q->elevator = NULL;
}
-int blk_mq_sched_init(struct request_queue *q)
+void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
- int ret;
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
+ unsigned int flags = 0;
- mutex_lock(&q->sysfs_lock);
- ret = elevator_init(q, NULL);
- mutex_unlock(&q->sysfs_lock);
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (e->type->ops.exit_hctx && hctx->sched_data) {
+ e->type->ops.exit_hctx(hctx, i);
+ hctx->sched_data = NULL;
+ }
+ flags = hctx->flags;
+ }
- return ret;
+ if (e->type->ops.exit_sched)
+ e->type->ops.exit_sched(e);
+ blk_mq_sched_tags_teardown(q, flags);
+ set_bit(ELEVATOR_FLAG_DYING, &q->elevator->flags);
+ q->elevator = NULL;
}