summaryrefslogtreecommitdiff
path: root/block/blk-mq.c
diff options
context:
space:
mode:
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--block/blk-mq.c422
1 files changed, 249 insertions, 173 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 98a18609755e..b600463791ec 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -37,6 +37,7 @@
#include "blk-wbt.h"
#include "blk-mq-sched.h"
+static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
@@ -60,10 +61,10 @@ static int blk_mq_poll_stats_bkt(const struct request *rq)
/*
* Check if any of the ctx's have pending work in this hardware queue
*/
-bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
- return sbitmap_any_bit_set(&hctx->ctx_map) ||
- !list_empty_careful(&hctx->dispatch) ||
+ return !list_empty_careful(&hctx->dispatch) ||
+ sbitmap_any_bit_set(&hctx->ctx_map) ||
blk_mq_sched_has_work(hctx);
}
@@ -125,7 +126,8 @@ void blk_freeze_queue_start(struct request_queue *q)
freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
if (freeze_depth == 1) {
percpu_ref_kill(&q->q_usage_counter);
- blk_mq_run_hw_queues(q, false);
+ if (q->mq_ops)
+ blk_mq_run_hw_queues(q, false);
}
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -255,13 +257,6 @@ void blk_mq_wake_waiters(struct request_queue *q)
queue_for_each_hw_ctx(q, hctx, i)
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_wakeup_all(hctx->tags, true);
-
- /*
- * If we are called because the queue has now been marked as
- * dying, we need to ensure that processes currently waiting on
- * the queue are notified as well.
- */
- wake_up_all(&q->mq_freeze_wq);
}
bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
@@ -296,6 +291,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
rq->q = data->q;
rq->mq_ctx = data->ctx;
rq->cmd_flags = op;
+ if (data->flags & BLK_MQ_REQ_PREEMPT)
+ rq->rq_flags |= RQF_PREEMPT;
if (blk_queue_io_stat(data->q))
rq->rq_flags |= RQF_IO_STAT;
/* do not touch atomic flags, it needs atomic ops against the timer */
@@ -336,12 +333,14 @@ static struct request *blk_mq_get_request(struct request_queue *q,
struct elevator_queue *e = q->elevator;
struct request *rq;
unsigned int tag;
- struct blk_mq_ctx *local_ctx = NULL;
+ bool put_ctx_on_error = false;
blk_queue_enter_live(q);
data->q = q;
- if (likely(!data->ctx))
- data->ctx = local_ctx = blk_mq_get_ctx(q);
+ if (likely(!data->ctx)) {
+ data->ctx = blk_mq_get_ctx(q);
+ put_ctx_on_error = true;
+ }
if (likely(!data->hctx))
data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
if (op & REQ_NOWAIT)
@@ -360,8 +359,8 @@ static struct request *blk_mq_get_request(struct request_queue *q,
tag = blk_mq_get_tag(data);
if (tag == BLK_MQ_TAG_FAIL) {
- if (local_ctx) {
- blk_mq_put_ctx(local_ctx);
+ if (put_ctx_on_error) {
+ blk_mq_put_ctx(data->ctx);
data->ctx = NULL;
}
blk_queue_exit(q);
@@ -384,13 +383,13 @@ static struct request *blk_mq_get_request(struct request_queue *q,
}
struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
- unsigned int flags)
+ blk_mq_req_flags_t flags)
{
struct blk_mq_alloc_data alloc_data = { .flags = flags };
struct request *rq;
int ret;
- ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
+ ret = blk_queue_enter(q, flags);
if (ret)
return ERR_PTR(ret);
@@ -410,7 +409,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
EXPORT_SYMBOL(blk_mq_alloc_request);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
- unsigned int op, unsigned int flags, unsigned int hctx_idx)
+ unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
{
struct blk_mq_alloc_data alloc_data = { .flags = flags };
struct request *rq;
@@ -429,7 +428,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
if (hctx_idx >= q->nr_hw_queues)
return ERR_PTR(-EIO);
- ret = blk_queue_enter(q, true);
+ ret = blk_queue_enter(q, flags);
if (ret)
return ERR_PTR(ret);
@@ -476,8 +475,14 @@ void blk_mq_free_request(struct request *rq)
if (rq->rq_flags & RQF_MQ_INFLIGHT)
atomic_dec(&hctx->nr_active);
+ if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
+ laptop_io_completion(q->backing_dev_info);
+
wbt_done(q->rq_wb, &rq->issue_stat);
+ if (blk_rq_rl(rq))
+ blk_put_rl(blk_rq_rl(rq));
+
clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
if (rq->tag != -1)
@@ -593,22 +598,32 @@ void blk_mq_start_request(struct request *rq)
blk_add_timer(rq);
- /*
- * Ensure that ->deadline is visible before set the started
- * flag and clear the completed flag.
- */
- smp_mb__before_atomic();
+ WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags));
/*
* Mark us as started and clear complete. Complete might have been
* set if requeue raced with timeout, which then marked it as
* complete. So be sure to clear complete again when we start
* the request, otherwise we'll ignore the completion event.
+ *
+ * Ensure that ->deadline is visible before we set STARTED, such that
+ * blk_mq_check_expired() is guaranteed to observe our ->deadline when
+ * it observes STARTED.
*/
- if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
- set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
- if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
+ smp_wmb();
+ set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+ if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
+ /*
+ * Coherence order guarantees these consecutive stores to a
+ * single variable propagate in the specified order. Thus the
+ * clear_bit() is ordered _after_ the set bit. See
+ * blk_mq_check_expired().
+ *
+ * (the bits must be part of the same byte for this to be
+ * true).
+ */
clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+ }
if (q->dma_drain_size && blk_rq_bytes(rq)) {
/*
@@ -634,6 +649,8 @@ static void __blk_mq_requeue_request(struct request *rq)
{
struct request_queue *q = rq->q;
+ blk_mq_put_driver_tag(rq);
+
trace_block_rq_requeue(q, rq);
wbt_requeue(q->rq_wb, &rq->issue_stat);
blk_mq_sched_requeue_request(rq);
@@ -690,7 +707,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
/*
* We abuse this flag that is otherwise used by the I/O scheduler to
- * request head insertation from the workqueue.
+ * request head insertion from the workqueue.
*/
BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
@@ -778,11 +795,20 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv, bool reserved)
{
struct blk_mq_timeout_data *data = priv;
+ unsigned long deadline;
if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
return;
/*
+ * Ensures that if we see STARTED we must also see our
+ * up-to-date deadline, see blk_mq_start_request().
+ */
+ smp_rmb();
+
+ deadline = READ_ONCE(rq->deadline);
+
+ /*
* The rq being checked may have been freed and reallocated
* out already here, we avoid this race by checking rq->deadline
* and REQ_ATOM_COMPLETE flag together:
@@ -795,11 +821,20 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
* and clearing the flag in blk_mq_start_request(), so
* this rq won't be timed out too.
*/
- if (time_after_eq(jiffies, rq->deadline)) {
- if (!blk_mark_rq_complete(rq))
+ if (time_after_eq(jiffies, deadline)) {
+ if (!blk_mark_rq_complete(rq)) {
+ /*
+ * Again coherence order ensures that consecutive reads
+ * from the same variable must be in that order. This
+ * ensures that if we see COMPLETE clear, we must then
+ * see STARTED set and we'll ignore this timeout.
+ *
+ * (There's also the MB implied by the test_and_clear())
+ */
blk_mq_rq_timed_out(rq, reserved);
- } else if (!data->next_set || time_after(data->next, rq->deadline)) {
- data->next = rq->deadline;
+ }
+ } else if (!data->next_set || time_after(data->next, deadline)) {
+ data->next = deadline;
data->next_set = 1;
}
}
@@ -880,6 +915,45 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
}
EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
+struct dispatch_rq_data {
+ struct blk_mq_hw_ctx *hctx;
+ struct request *rq;
+};
+
+static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
+ void *data)
+{
+ struct dispatch_rq_data *dispatch_data = data;
+ struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
+ struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
+
+ spin_lock(&ctx->lock);
+ if (unlikely(!list_empty(&ctx->rq_list))) {
+ dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
+ list_del_init(&dispatch_data->rq->queuelist);
+ if (list_empty(&ctx->rq_list))
+ sbitmap_clear_bit(sb, bitnr);
+ }
+ spin_unlock(&ctx->lock);
+
+ return !dispatch_data->rq;
+}
+
+struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_ctx *start)
+{
+ unsigned off = start ? start->index_hw : 0;
+ struct dispatch_rq_data data = {
+ .hctx = hctx,
+ .rq = NULL,
+ };
+
+ __sbitmap_for_each_set(&hctx->ctx_map, off,
+ dispatch_rq_from_ctx, &data);
+
+ return data.rq;
+}
+
static inline unsigned int queued_to_index(unsigned int queued)
{
if (!queued)
@@ -920,109 +994,95 @@ done:
return rq->tag != -1;
}
-static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
- struct request *rq)
-{
- blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
- rq->tag = -1;
-
- if (rq->rq_flags & RQF_MQ_INFLIGHT) {
- rq->rq_flags &= ~RQF_MQ_INFLIGHT;
- atomic_dec(&hctx->nr_active);
- }
-}
-
-static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
- struct request *rq)
-{
- if (rq->tag == -1 || rq->internal_tag == -1)
- return;
-
- __blk_mq_put_driver_tag(hctx, rq);
-}
-
-static void blk_mq_put_driver_tag(struct request *rq)
+static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
+ int flags, void *key)
{
struct blk_mq_hw_ctx *hctx;
- if (rq->tag == -1 || rq->internal_tag == -1)
- return;
+ hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
- hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
- __blk_mq_put_driver_tag(hctx, rq);
+ list_del_init(&wait->entry);
+ blk_mq_run_hw_queue(hctx, true);
+ return 1;
}
/*
- * If we fail getting a driver tag because all the driver tags are already
- * assigned and on the dispatch list, BUT the first entry does not have a
- * tag, then we could deadlock. For that case, move entries with assigned
- * driver tags to the front, leaving the set of tagged requests in the
- * same order, and the untagged set in the same order.
+ * Mark us waiting for a tag. For shared tags, this involves hooking us into
+ * the tag wakeups. For non-shared tags, we can simply mark us nedeing a
+ * restart. For both caes, take care to check the condition again after
+ * marking us as waiting.
*/
-static bool reorder_tags_to_front(struct list_head *list)
-{
- struct request *rq, *tmp, *first = NULL;
-
- list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) {
- if (rq == first)
- break;
- if (rq->tag != -1) {
- list_move(&rq->queuelist, list);
- if (!first)
- first = rq;
- }
- }
-
- return first != NULL;
-}
-
-static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
- void *key)
+static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
+ struct request *rq)
{
- struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_hw_ctx *this_hctx = *hctx;
+ bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0;
+ struct sbq_wait_state *ws;
+ wait_queue_entry_t *wait;
+ bool ret;
- hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
+ if (!shared_tags) {
+ if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
+ set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
+ } else {
+ wait = &this_hctx->dispatch_wait;
+ if (!list_empty_careful(&wait->entry))
+ return false;
- list_del(&wait->entry);
- clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
- blk_mq_run_hw_queue(hctx, true);
- return 1;
-}
+ spin_lock(&this_hctx->lock);
+ if (!list_empty(&wait->entry)) {
+ spin_unlock(&this_hctx->lock);
+ return false;
+ }
-static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
-{
- struct sbq_wait_state *ws;
+ ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
+ add_wait_queue(&ws->wait, wait);
+ }
/*
- * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
- * The thread which wins the race to grab this bit adds the hardware
- * queue to the wait queue.
+ * It's possible that a tag was freed in the window between the
+ * allocation failure and adding the hardware queue to the wait
+ * queue.
*/
- if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
- test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
- return false;
+ ret = blk_mq_get_driver_tag(rq, hctx, false);
- init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
- ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
+ if (!shared_tags) {
+ /*
+ * Don't clear RESTART here, someone else could have set it.
+ * At most this will cost an extra queue run.
+ */
+ return ret;
+ } else {
+ if (!ret) {
+ spin_unlock(&this_hctx->lock);
+ return false;
+ }
- /*
- * As soon as this returns, it's no longer safe to fiddle with
- * hctx->dispatch_wait, since a completion can wake up the wait queue
- * and unlock the bit.
- */
- add_wait_queue(&ws->wait, &hctx->dispatch_wait);
- return true;
+ /*
+ * We got a tag, remove ourselves from the wait queue to ensure
+ * someone else gets the wakeup.
+ */
+ spin_lock_irq(&ws->wait.lock);
+ list_del_init(&wait->entry);
+ spin_unlock_irq(&ws->wait.lock);
+ spin_unlock(&this_hctx->lock);
+ return true;
+ }
}
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
+bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
+ bool got_budget)
{
struct blk_mq_hw_ctx *hctx;
- struct request *rq;
+ struct request *rq, *nxt;
+ bool no_tag = false;
int errors, queued;
if (list_empty(list))
return false;
+ WARN_ON(!list_is_singular(list) && got_budget);
+
/*
* Now process all the entries, sending them to the driver.
*/
@@ -1033,23 +1093,29 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
rq = list_first_entry(list, struct request, queuelist);
if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
- if (!queued && reorder_tags_to_front(list))
- continue;
-
/*
* The initial allocation attempt failed, so we need to
- * rerun the hardware queue when a tag is freed.
+ * rerun the hardware queue when a tag is freed. The
+ * waitqueue takes care of that. If the queue is run
+ * before we add this entry back on the dispatch list,
+ * we'll re-run it below.
*/
- if (!blk_mq_dispatch_wait_add(hctx))
+ if (!blk_mq_mark_tag_wait(&hctx, rq)) {
+ if (got_budget)
+ blk_mq_put_dispatch_budget(hctx);
+ /*
+ * For non-shared tags, the RESTART check
+ * will suffice.
+ */
+ if (hctx->flags & BLK_MQ_F_TAG_SHARED)
+ no_tag = true;
break;
+ }
+ }
- /*
- * It's possible that a tag was freed in the window
- * between the allocation failure and adding the
- * hardware queue to the wait queue.
- */
- if (!blk_mq_get_driver_tag(rq, &hctx, false))
- break;
+ if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
+ blk_mq_put_driver_tag(rq);
+ break;
}
list_del_init(&rq->queuelist);
@@ -1063,15 +1129,21 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
if (list_empty(list))
bd.last = true;
else {
- struct request *nxt;
-
nxt = list_first_entry(list, struct request, queuelist);
bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
}
ret = q->mq_ops->queue_rq(hctx, &bd);
if (ret == BLK_STS_RESOURCE) {
- blk_mq_put_driver_tag_hctx(hctx, rq);
+ /*
+ * If an I/O scheduler has been configured and we got a
+ * driver tag for the next request already, free it
+ * again.
+ */
+ if (!list_empty(list)) {
+ nxt = list_first_entry(list, struct request, queuelist);
+ blk_mq_put_driver_tag(nxt);
+ }
list_add(&rq->queuelist, list);
__blk_mq_requeue_request(rq);
break;
@@ -1093,13 +1165,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
* that is where we will continue on next queue run.
*/
if (!list_empty(list)) {
- /*
- * If an I/O scheduler has been configured and we got a driver
- * tag for the next request already, free it again.
- */
- rq = list_first_entry(list, struct request, queuelist);
- blk_mq_put_driver_tag(rq);
-
spin_lock(&hctx->lock);
list_splice_init(list, &hctx->dispatch);
spin_unlock(&hctx->lock);
@@ -1109,10 +1174,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
* it is no longer set that means that it was cleared by another
* thread and hence that a queue rerun is needed.
*
- * If TAG_WAITING is set that means that an I/O scheduler has
- * been configured and another thread is waiting for a driver
- * tag. To guarantee fairness, do not rerun this hardware queue
- * but let the other thread grab the driver tag.
+ * If 'no_tag' is set, that means that we failed getting
+ * a driver tag with an I/O scheduler attached. If our dispatch
+ * waitqueue is no longer active, ensure that we run the queue
+ * AFTER adding our entries back to the list.
*
* If no I/O scheduler has been configured it is possible that
* the hardware queue got stopped and restarted before requests
@@ -1124,8 +1189,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
* returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
* and dm-rq.
*/
- if (!blk_mq_sched_needs_restart(hctx) &&
- !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
+ if (!blk_mq_sched_needs_restart(hctx) ||
+ (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
blk_mq_run_hw_queue(hctx, true);
}
@@ -1218,9 +1283,14 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
-void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
+bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
- __blk_mq_delay_run_hw_queue(hctx, async, 0);
+ if (blk_mq_hctx_has_pending(hctx)) {
+ __blk_mq_delay_run_hw_queue(hctx, async, 0);
+ return true;
+ }
+
+ return false;
}
EXPORT_SYMBOL(blk_mq_run_hw_queue);
@@ -1230,8 +1300,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
int i;
queue_for_each_hw_ctx(q, hctx, i) {
- if (!blk_mq_hctx_has_pending(hctx) ||
- blk_mq_hctx_stopped(hctx))
+ if (blk_mq_hctx_stopped(hctx))
continue;
blk_mq_run_hw_queue(hctx, async);
@@ -1405,7 +1474,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
* Should only be used carefully, when the caller knows we want to
* bypass a potential IO scheduler on the target device.
*/
-void blk_mq_request_bypass_insert(struct request *rq)
+void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
{
struct blk_mq_ctx *ctx = rq->mq_ctx;
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
@@ -1414,7 +1483,8 @@ void blk_mq_request_bypass_insert(struct request *rq)
list_add_tail(&rq->queuelist, &hctx->dispatch);
spin_unlock(&hctx->lock);
- blk_mq_run_hw_queue(hctx, false);
+ if (run_queue)
+ blk_mq_run_hw_queue(hctx, false);
}
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
@@ -1501,13 +1571,9 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
{
blk_init_request_from_bio(rq, bio);
- blk_account_io_start(rq, true);
-}
+ blk_rq_set_rl(rq, blk_get_rl(rq->q, bio));
-static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
-{
- return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
- !blk_queue_nomerges(hctx->queue);
+ blk_account_io_start(rq, true);
}
static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx,
@@ -1552,6 +1618,11 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
if (!blk_mq_get_driver_tag(rq, NULL, false))
goto insert;
+ if (!blk_mq_get_dispatch_budget(hctx)) {
+ blk_mq_put_driver_tag(rq);
+ goto insert;
+ }
+
new_cookie = request_to_qc_t(hctx, rq);
/*
@@ -1641,13 +1712,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
if (unlikely(is_flush_fua)) {
blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
- if (q->elevator) {
- blk_mq_sched_insert_request(rq, false, true, true,
- true);
- } else {
- blk_insert_flush(rq);
- blk_mq_run_hw_queue(data.hctx, true);
- }
+
+ /* bypass scheduler for flush rq */
+ blk_insert_flush(rq);
+ blk_mq_run_hw_queue(data.hctx, true);
} else if (plug && q->nr_hw_queues == 1) {
struct request *last = NULL;
@@ -1990,6 +2058,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
hctx->nr_ctx = 0;
+ init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
+ INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
+
if (set->ops->init_hctx &&
set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
goto free_bitmap;
@@ -2229,8 +2300,11 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
mutex_lock(&set->tag_list_lock);
- /* Check to see if we're transitioning to shared (from 1 to 2 queues). */
- if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) {
+ /*
+ * Check to see if we're transitioning to shared (from 1 to 2 queues).
+ */
+ if (!list_empty(&set->tag_list) &&
+ !(set->flags & BLK_MQ_F_TAG_SHARED)) {
set->flags |= BLK_MQ_F_TAG_SHARED;
/* update existing queue */
blk_mq_update_tag_set_depth(set, true);
@@ -2404,6 +2478,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
spin_lock_init(&q->requeue_lock);
blk_queue_make_request(q, blk_mq_make_request);
+ if (q->mq_ops->poll)
+ q->poll_fn = blk_mq_poll;
/*
* Do this after blk_queue_make_request() overrides it...
@@ -2460,10 +2536,9 @@ static void blk_mq_queue_reinit(struct request_queue *q)
/*
* redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
- * we should change hctx numa_node according to new topology (this
- * involves free and re-allocate memory, worthy doing?)
+ * we should change hctx numa_node according to the new topology (this
+ * involves freeing and re-allocating memory, worth doing?)
*/
-
blk_mq_map_swqueue(q);
blk_mq_sysfs_register(q);
@@ -2552,6 +2627,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (!set->ops->queue_rq)
return -EINVAL;
+ if (!set->ops->get_budget ^ !set->ops->put_budget)
+ return -EINVAL;
+
if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
pr_info("blk-mq: reduced tag depth to %u\n",
BLK_MQ_MAX_DEPTH);
@@ -2642,8 +2720,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
* queue depth. This is similar to what the old code would do.
*/
if (!hctx->sched_tags) {
- ret = blk_mq_tag_update_depth(hctx, &hctx->tags,
- min(nr, set->queue_depth),
+ ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
false);
} else {
ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
@@ -2863,20 +2940,14 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
return false;
}
-bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
+static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
{
struct blk_mq_hw_ctx *hctx;
- struct blk_plug *plug;
struct request *rq;
- if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
- !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+ if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return false;
- plug = current->plug;
- if (plug)
- blk_flush_plug_list(plug, false);
-
hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
if (!blk_qc_t_is_internal(cookie))
rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
@@ -2894,10 +2965,15 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
return __blk_mq_poll(hctx, rq);
}
-EXPORT_SYMBOL_GPL(blk_mq_poll);
static int __init blk_mq_init(void)
{
+ /*
+ * See comment in block/blk.h rq_atomic_flags enum
+ */
+ BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) !=
+ (REQ_ATOM_COMPLETE / BITS_PER_BYTE));
+
cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
blk_mq_hctx_notify_dead);
return 0;