summaryrefslogtreecommitdiff
path: root/block/blk-mq.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-04-26 12:52:58 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-04-26 12:52:58 -0700
commit9dd6956b38923dc1b7b349ca1eee3c0bb1f0163a (patch)
treec70bb7d65a50a51686378b6113a8663e0e60d9b8 /block/blk-mq.c
parent5b9a7bb72fddbc5247f56ede55d485fab7abdf92 (diff)
parent55793ea54d77719a071b1ccc05a05056e3b5e009 (diff)
Merge tag 'for-6.4/block-2023-04-21' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe: - drbd patches, bringing us closer to unifying the out-of-tree version and the in tree one (Andreas, Christoph) - support for auto-quiesce for the s390 dasd driver (Stefan) - MD pull request via Song: - md/bitmap: Optimal last page size (Jon Derrick) - Various raid10 fixes (Yu Kuai, Li Nan) - md: add error_handlers for raid0 and linear (Mariusz Tkaczyk) - NVMe pull request via Christoph: - Drop redundant pci_enable_pcie_error_reporting (Bjorn Helgaas) - Validate nvmet module parameters (Chaitanya Kulkarni) - Fence TCP socket on receive error (Chris Leech) - Fix async event trace event (Keith Busch) - Minor cleanups (Chaitanya Kulkarni, zhenwei pi) - Fix and cleanup nvmet Identify handling (Damien Le Moal, Christoph Hellwig) - Fix double blk_mq_complete_request race in the timeout handler (Lei Yin) - Fix irq locking in nvme-fcloop (Ming Lei) - Remove queue mapping helper for rdma devices (Sagi Grimberg) - use structured request attribute checks for nbd (Jakub) - fix blk-crypto race conditions between keyslot management (Eric) - add sed-opal support for reading read locking range attributes (Ondrej) - make fault injection configurable for null_blk (Akinobu) - clean up the request insertion API (Christoph) - clean up the queue running API (Christoph) - blkg config helper cleanups (Tejun) - lazy init support for blk-iolatency (Tejun) - various fixes and tweaks to ublk (Ming) - remove hybrid polling. It hasn't really been useful since we got async polled IO support, and these days we don't support sync polled IO at all (Keith) - misc fixes, cleanups, improvements (Zhong, Ondrej, Colin, Chengming, Chaitanya, me) * tag 'for-6.4/block-2023-04-21' of git://git.kernel.dk/linux: (118 commits) nbd: fix incomplete validation of ioctl arg ublk: don't return 0 in case of any failure sed-opal: geometry feature reporting command null_blk: Always check queue mode setting from configfs block: ublk: switch to ioctl command encoding blk-mq: fix the blk_mq_add_to_requeue_list call in blk_kick_flush block, bfq: Fix division by zero error on zero wsum fault-inject: fix build error when FAULT_INJECTION_CONFIGFS=y and CONFIGFS_FS=m block: store bdev->bd_disk->fops->submit_bio state in bdev block: re-arrange the struct block_device fields for better layout md/raid5: remove unused working_disks variable md/raid10: don't call bio_start_io_acct twice for bio which experienced read error md/raid10: fix memleak of md thread md/raid10: fix memleak for 'conf->bio_split' md/raid10: fix leak of 'r10bio->remaining' for recovery md/raid10: don't BUG_ON() in raise_barrier() md: fix soft lockup in status_resync md: add error_handlers for raid0 and linear md: Use optimal I/O size for last bitmap page md: Fix types in sb writer ...
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--block/blk-mq.c665
1 files changed, 256 insertions, 409 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2831f78f86a0..f6dad0886a2f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -32,12 +32,10 @@
#include <trace/events/block.h>
-#include <linux/blk-mq.h>
#include <linux/t10-pi.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
-#include "blk-mq-tag.h"
#include "blk-pm.h"
#include "blk-stat.h"
#include "blk-mq-sched.h"
@@ -46,51 +44,19 @@
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
-static void blk_mq_poll_stats_start(struct request_queue *q);
-static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
-
-static int blk_mq_poll_stats_bkt(const struct request *rq)
-{
- int ddir, sectors, bucket;
-
- ddir = rq_data_dir(rq);
- sectors = blk_rq_stats_sectors(rq);
-
- bucket = ddir + 2 * ilog2(sectors);
-
- if (bucket < 0)
- return -1;
- else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
- return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
-
- return bucket;
-}
-
-#define BLK_QC_T_SHIFT 16
-#define BLK_QC_T_INTERNAL (1U << 31)
+static void blk_mq_insert_request(struct request *rq, blk_insert_t flags);
+static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
+ struct list_head *list);
static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
blk_qc_t qc)
{
- return xa_load(&q->hctx_table,
- (qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT);
-}
-
-static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
- blk_qc_t qc)
-{
- unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1);
-
- if (qc & BLK_QC_T_INTERNAL)
- return blk_mq_tag_to_rq(hctx->sched_tags, tag);
- return blk_mq_tag_to_rq(hctx->tags, tag);
+ return xa_load(&q->hctx_table, qc);
}
static inline blk_qc_t blk_rq_to_qc(struct request *rq)
{
- return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) |
- (rq->tag != -1 ?
- rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL));
+ return rq->mq_hctx->queue_num;
}
/*
@@ -840,6 +806,12 @@ static void blk_complete_request(struct request *req)
req->q->integrity.profile->complete_fn(req, total_bytes);
#endif
+ /*
+ * Upper layers may call blk_crypto_evict_key() anytime after the last
+ * bio_endio(). Therefore, the keyslot must be released before that.
+ */
+ blk_crypto_rq_put_keyslot(req);
+
blk_account_io_completion(req, total_bytes);
do {
@@ -905,6 +877,13 @@ bool blk_update_request(struct request *req, blk_status_t error,
req->q->integrity.profile->complete_fn(req, nr_bytes);
#endif
+ /*
+ * Upper layers may call blk_crypto_evict_key() anytime after the last
+ * bio_endio(). Therefore, the keyslot must be released before that.
+ */
+ if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req))
+ __blk_crypto_rq_put_keyslot(req);
+
if (unlikely(error && !blk_rq_is_passthrough(req) &&
!(req->rq_flags & RQF_QUIET)) &&
!test_bit(GD_DEAD, &req->q->disk->state)) {
@@ -976,17 +955,6 @@ bool blk_update_request(struct request *req, blk_status_t error,
}
EXPORT_SYMBOL_GPL(blk_update_request);
-static void __blk_account_io_done(struct request *req, u64 now)
-{
- const int sgrp = op_stat_group(req_op(req));
-
- part_stat_lock();
- update_io_ticks(req->part, jiffies, true);
- part_stat_inc(req->part, ios[sgrp]);
- part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
- part_stat_unlock();
-}
-
static inline void blk_account_io_done(struct request *req, u64 now)
{
/*
@@ -995,40 +963,41 @@ static inline void blk_account_io_done(struct request *req, u64 now)
* containing request is enough.
*/
if (blk_do_io_stat(req) && req->part &&
- !(req->rq_flags & RQF_FLUSH_SEQ))
- __blk_account_io_done(req, now);
-}
-
-static void __blk_account_io_start(struct request *rq)
-{
- /*
- * All non-passthrough requests are created from a bio with one
- * exception: when a flush command that is part of a flush sequence
- * generated by the state machine in blk-flush.c is cloned onto the
- * lower device by dm-multipath we can get here without a bio.
- */
- if (rq->bio)
- rq->part = rq->bio->bi_bdev;
- else
- rq->part = rq->q->disk->part0;
+ !(req->rq_flags & RQF_FLUSH_SEQ)) {
+ const int sgrp = op_stat_group(req_op(req));
- part_stat_lock();
- update_io_ticks(rq->part, jiffies, false);
- part_stat_unlock();
+ part_stat_lock();
+ update_io_ticks(req->part, jiffies, true);
+ part_stat_inc(req->part, ios[sgrp]);
+ part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
+ part_stat_unlock();
+ }
}
static inline void blk_account_io_start(struct request *req)
{
- if (blk_do_io_stat(req))
- __blk_account_io_start(req);
+ if (blk_do_io_stat(req)) {
+ /*
+ * All non-passthrough requests are created from a bio with one
+ * exception: when a flush command that is part of a flush sequence
+ * generated by the state machine in blk-flush.c is cloned onto the
+ * lower device by dm-multipath we can get here without a bio.
+ */
+ if (req->bio)
+ req->part = req->bio->bi_bdev;
+ else
+ req->part = req->q->disk->part0;
+
+ part_stat_lock();
+ update_io_ticks(req->part, jiffies, false);
+ part_stat_unlock();
+ }
}
static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
{
- if (rq->rq_flags & RQF_STATS) {
- blk_mq_poll_stats_start(rq->q);
+ if (rq->rq_flags & RQF_STATS)
blk_stat_add(rq, now);
- }
blk_mq_sched_completed_request(rq, now);
blk_account_io_done(rq, now);
@@ -1322,6 +1291,8 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
*/
void blk_execute_rq_nowait(struct request *rq, bool at_head)
{
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+
WARN_ON(irqs_disabled());
WARN_ON(!blk_rq_is_passthrough(rq));
@@ -1332,10 +1303,13 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head)
* device, directly accessing the plug instead of using blk_mq_plug()
* should not have any consequences.
*/
- if (current->plug)
+ if (current->plug && !at_head) {
blk_add_rq_to_plug(current->plug, rq);
- else
- blk_mq_sched_insert_request(rq, at_head, true, false);
+ return;
+ }
+
+ blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
+ blk_mq_run_hw_queue(hctx, false);
}
EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
@@ -1383,6 +1357,7 @@ static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
*/
blk_status_t blk_execute_rq(struct request *rq, bool at_head)
{
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
struct blk_rq_wait wait = {
.done = COMPLETION_INITIALIZER_ONSTACK(wait.done),
};
@@ -1394,7 +1369,8 @@ blk_status_t blk_execute_rq(struct request *rq, bool at_head)
rq->end_io = blk_end_sync_rq;
blk_account_io_start(rq);
- blk_mq_sched_insert_request(rq, at_head, true, false);
+ blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
+ blk_mq_run_hw_queue(hctx, false);
if (blk_rq_is_poll(rq)) {
blk_rq_poll_completion(rq, &wait.done);
@@ -1434,12 +1410,17 @@ static void __blk_mq_requeue_request(struct request *rq)
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
+ struct request_queue *q = rq->q;
+
__blk_mq_requeue_request(rq);
/* this request will be re-inserted to io scheduler queue */
blk_mq_sched_requeue_request(rq);
- blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
+ blk_mq_add_to_requeue_list(rq, BLK_MQ_INSERT_AT_HEAD);
+
+ if (kick_requeue_list)
+ blk_mq_kick_requeue_list(q);
}
EXPORT_SYMBOL(blk_mq_requeue_request);
@@ -1455,33 +1436,33 @@ static void blk_mq_requeue_work(struct work_struct *work)
spin_unlock_irq(&q->requeue_lock);
list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
- if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
- continue;
-
- rq->rq_flags &= ~RQF_SOFTBARRIER;
- list_del_init(&rq->queuelist);
/*
- * If RQF_DONTPREP, rq has contained some driver specific
- * data, so insert it to hctx dispatch list to avoid any
- * merge.
+ * If RQF_DONTPREP ist set, the request has been started by the
+ * driver already and might have driver-specific data allocated
+ * already. Insert it into the hctx dispatch list to avoid
+ * block layer merges for the request.
*/
- if (rq->rq_flags & RQF_DONTPREP)
- blk_mq_request_bypass_insert(rq, false, false);
- else
- blk_mq_sched_insert_request(rq, true, false, false);
+ if (rq->rq_flags & RQF_DONTPREP) {
+ rq->rq_flags &= ~RQF_SOFTBARRIER;
+ list_del_init(&rq->queuelist);
+ blk_mq_request_bypass_insert(rq, 0);
+ } else if (rq->rq_flags & RQF_SOFTBARRIER) {
+ rq->rq_flags &= ~RQF_SOFTBARRIER;
+ list_del_init(&rq->queuelist);
+ blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
+ }
}
while (!list_empty(&rq_list)) {
rq = list_entry(rq_list.next, struct request, queuelist);
list_del_init(&rq->queuelist);
- blk_mq_sched_insert_request(rq, false, false, false);
+ blk_mq_insert_request(rq, 0);
}
blk_mq_run_hw_queues(q, false);
}
-void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
- bool kick_requeue_list)
+void blk_mq_add_to_requeue_list(struct request *rq, blk_insert_t insert_flags)
{
struct request_queue *q = rq->q;
unsigned long flags;
@@ -1493,16 +1474,13 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
spin_lock_irqsave(&q->requeue_lock, flags);
- if (at_head) {
+ if (insert_flags & BLK_MQ_INSERT_AT_HEAD) {
rq->rq_flags |= RQF_SOFTBARRIER;
list_add(&rq->queuelist, &q->requeue_list);
} else {
list_add_tail(&rq->queuelist, &q->requeue_list);
}
spin_unlock_irqrestore(&q->requeue_lock, flags);
-
- if (kick_requeue_list)
- blk_mq_kick_requeue_list(q);
}
void blk_mq_kick_requeue_list(struct request_queue *q)
@@ -2158,24 +2136,6 @@ out:
return true;
}
-/**
- * __blk_mq_run_hw_queue - Run a hardware queue.
- * @hctx: Pointer to the hardware queue to run.
- *
- * Send pending requests to the hardware.
- */
-static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
-{
- /*
- * We can't run the queue inline with ints disabled. Ensure that
- * we catch bad users of this early.
- */
- WARN_ON_ONCE(in_interrupt());
-
- blk_mq_run_dispatch_ops(hctx->queue,
- blk_mq_sched_dispatch_requests(hctx));
-}
-
static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
{
int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);
@@ -2232,42 +2192,19 @@ select_cpu:
}
/**
- * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
+ * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
* @hctx: Pointer to the hardware queue to run.
- * @async: If we want to run the queue asynchronously.
* @msecs: Milliseconds of delay to wait before running the queue.
*
- * If !@async, try to run the queue now. Else, run the queue asynchronously and
- * with a delay of @msecs.
+ * Run a hardware queue asynchronously with a delay of @msecs.
*/
-static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
- unsigned long msecs)
+void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
if (unlikely(blk_mq_hctx_stopped(hctx)))
return;
-
- if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
- if (cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
- __blk_mq_run_hw_queue(hctx);
- return;
- }
- }
-
kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
msecs_to_jiffies(msecs));
}
-
-/**
- * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
- * @hctx: Pointer to the hardware queue to run.
- * @msecs: Milliseconds of delay to wait before running the queue.
- *
- * Run a hardware queue asynchronously with a delay of @msecs.
- */
-void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
-{
- __blk_mq_delay_run_hw_queue(hctx, true, msecs);
-}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
/**
@@ -2284,6 +2221,11 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
bool need_run;
/*
+ * We can't run the queue inline with interrupts disabled.
+ */
+ WARN_ON_ONCE(!async && in_interrupt());
+
+ /*
* When queue is quiesced, we may be switching io scheduler, or
* updating nr_hw_queues, or other things, and we can't run queue
* any more, even __blk_mq_hctx_has_pending() can't be called safely.
@@ -2295,8 +2237,17 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
need_run = !blk_queue_quiesced(hctx->queue) &&
blk_mq_hctx_has_pending(hctx));
- if (need_run)
- __blk_mq_delay_run_hw_queue(hctx, async, 0);
+ if (!need_run)
+ return;
+
+ if (async || (hctx->flags & BLK_MQ_F_BLOCKING) ||
+ !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
+ blk_mq_delay_run_hw_queue(hctx, 0);
+ return;
+ }
+
+ blk_mq_run_dispatch_ops(hctx->queue,
+ blk_mq_sched_dispatch_requests(hctx));
}
EXPORT_SYMBOL(blk_mq_run_hw_queue);
@@ -2461,80 +2412,52 @@ EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
static void blk_mq_run_work_fn(struct work_struct *work)
{
- struct blk_mq_hw_ctx *hctx;
-
- hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
-
- /*
- * If we are stopped, don't run the queue.
- */
- if (blk_mq_hctx_stopped(hctx))
- return;
-
- __blk_mq_run_hw_queue(hctx);
-}
-
-static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
- struct request *rq,
- bool at_head)
-{
- struct blk_mq_ctx *ctx = rq->mq_ctx;
- enum hctx_type type = hctx->type;
-
- lockdep_assert_held(&ctx->lock);
-
- trace_block_rq_insert(rq);
-
- if (at_head)
- list_add(&rq->queuelist, &ctx->rq_lists[type]);
- else
- list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
-}
-
-void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
- bool at_head)
-{
- struct blk_mq_ctx *ctx = rq->mq_ctx;
-
- lockdep_assert_held(&ctx->lock);
+ struct blk_mq_hw_ctx *hctx =
+ container_of(work, struct blk_mq_hw_ctx, run_work.work);
- __blk_mq_insert_req_list(hctx, rq, at_head);
- blk_mq_hctx_mark_pending(hctx, ctx);
+ blk_mq_run_dispatch_ops(hctx->queue,
+ blk_mq_sched_dispatch_requests(hctx));
}
/**
* blk_mq_request_bypass_insert - Insert a request at dispatch list.
* @rq: Pointer to request to be inserted.
- * @at_head: true if the request should be inserted at the head of the list.
- * @run_queue: If we should run the hardware queue after inserting the request.
+ * @flags: BLK_MQ_INSERT_*
*
* Should only be used carefully, when the caller knows we want to
* bypass a potential IO scheduler on the target device.
*/
-void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
- bool run_queue)
+void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags)
{
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
spin_lock(&hctx->lock);
- if (at_head)
+ if (flags & BLK_MQ_INSERT_AT_HEAD)
list_add(&rq->queuelist, &hctx->dispatch);
else
list_add_tail(&rq->queuelist, &hctx->dispatch);
spin_unlock(&hctx->lock);
-
- if (run_queue)
- blk_mq_run_hw_queue(hctx, false);
}
-void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
- struct list_head *list)
-
+static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_ctx *ctx, struct list_head *list,
+ bool run_queue_async)
{
struct request *rq;
enum hctx_type type = hctx->type;
/*
+ * Try to issue requests directly if the hw queue isn't busy to save an
+ * extra enqueue & dequeue to the sw queue.
+ */
+ if (!hctx->dispatch_busy && !run_queue_async) {
+ blk_mq_run_dispatch_ops(hctx->queue,
+ blk_mq_try_issue_list_directly(hctx, list));
+ if (list_empty(list))
+ goto out;
+ }
+
+ /*
* preemption doesn't flush plug list, so it's possible ctx->cpu is
* offline now
*/
@@ -2547,6 +2470,70 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
list_splice_tail_init(list, &ctx->rq_lists[type]);
blk_mq_hctx_mark_pending(hctx, ctx);
spin_unlock(&ctx->lock);
+out:
+ blk_mq_run_hw_queue(hctx, run_queue_async);
+}
+
+static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
+{
+ struct request_queue *q = rq->q;
+ struct blk_mq_ctx *ctx = rq->mq_ctx;
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+
+ if (blk_rq_is_passthrough(rq)) {
+ /*
+ * Passthrough request have to be added to hctx->dispatch
+ * directly. The device may be in a situation where it can't
+ * handle FS request, and always returns BLK_STS_RESOURCE for
+ * them, which gets them added to hctx->dispatch.
+ *
+ * If a passthrough request is required to unblock the queues,
+ * and it is added to the scheduler queue, there is no chance to
+ * dispatch it given we prioritize requests in hctx->dispatch.
+ */
+ blk_mq_request_bypass_insert(rq, flags);
+ } else if (rq->rq_flags & RQF_FLUSH_SEQ) {
+ /*
+ * Firstly normal IO request is inserted to scheduler queue or
+ * sw queue, meantime we add flush request to dispatch queue(
+ * hctx->dispatch) directly and there is at most one in-flight
+ * flush request for each hw queue, so it doesn't matter to add
+ * flush request to tail or front of the dispatch queue.
+ *
+ * Secondly in case of NCQ, flush request belongs to non-NCQ
+ * command, and queueing it will fail when there is any
+ * in-flight normal IO request(NCQ command). When adding flush
+ * rq to the front of hctx->dispatch, it is easier to introduce
+ * extra time to flush rq's latency because of S_SCHED_RESTART
+ * compared with adding to the tail of dispatch queue, then
+ * chance of flush merge is increased, and less flush requests
+ * will be issued to controller. It is observed that ~10% time
+ * is saved in blktests block/004 on disk attached to AHCI/NCQ
+ * drive when adding flush rq to the front of hctx->dispatch.
+ *
+ * Simply queue flush rq to the front of hctx->dispatch so that
+ * intensive flush workloads can benefit in case of NCQ HW.
+ */
+ blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD);
+ } else if (q->elevator) {
+ LIST_HEAD(list);
+
+ WARN_ON_ONCE(rq->tag != BLK_MQ_NO_TAG);
+
+ list_add(&rq->queuelist, &list);
+ q->elevator->type->ops.insert_requests(hctx, &list, flags);
+ } else {
+ trace_block_rq_insert(rq);
+
+ spin_lock(&ctx->lock);
+ if (flags & BLK_MQ_INSERT_AT_HEAD)
+ list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]);
+ else
+ list_add_tail(&rq->queuelist,
+ &ctx->rq_lists[hctx->type]);
+ blk_mq_hctx_mark_pending(hctx, ctx);
+ spin_unlock(&ctx->lock);
+ }
}
static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
@@ -2600,49 +2587,19 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
return ret;
}
-static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
- struct request *rq,
- bool bypass_insert, bool last)
+static bool blk_mq_get_budget_and_tag(struct request *rq)
{
- struct request_queue *q = rq->q;
- bool run_queue = true;
int budget_token;
- /*
- * RCU or SRCU read lock is needed before checking quiesced flag.
- *
- * When queue is stopped or quiesced, ignore 'bypass_insert' from
- * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
- * and avoid driver to try to dispatch again.
- */
- if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
- run_queue = false;
- bypass_insert = false;
- goto insert;
- }
-
- if ((rq->rq_flags & RQF_ELV) && !bypass_insert)
- goto insert;
-
- budget_token = blk_mq_get_dispatch_budget(q);
+ budget_token = blk_mq_get_dispatch_budget(rq->q);
if (budget_token < 0)
- goto insert;
-
+ return false;
blk_mq_set_rq_budget_token(rq, budget_token);
-
if (!blk_mq_get_driver_tag(rq)) {
- blk_mq_put_dispatch_budget(q, budget_token);
- goto insert;
+ blk_mq_put_dispatch_budget(rq->q, budget_token);
+ return false;
}
-
- return __blk_mq_issue_directly(hctx, rq, last);
-insert:
- if (bypass_insert)
- return BLK_STS_RESOURCE;
-
- blk_mq_sched_insert_request(rq, false, run_queue, false);
-
- return BLK_STS_OK;
+ return true;
}
/**
@@ -2658,18 +2615,46 @@ insert:
static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
- blk_status_t ret =
- __blk_mq_try_issue_directly(hctx, rq, false, true);
+ blk_status_t ret;
+
+ if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
+ blk_mq_insert_request(rq, 0);
+ return;
+ }
+
+ if ((rq->rq_flags & RQF_ELV) || !blk_mq_get_budget_and_tag(rq)) {
+ blk_mq_insert_request(rq, 0);
+ blk_mq_run_hw_queue(hctx, false);
+ return;
+ }
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
- blk_mq_request_bypass_insert(rq, false, true);
- else if (ret != BLK_STS_OK)
+ ret = __blk_mq_issue_directly(hctx, rq, true);
+ switch (ret) {
+ case BLK_STS_OK:
+ break;
+ case BLK_STS_RESOURCE:
+ case BLK_STS_DEV_RESOURCE:
+ blk_mq_request_bypass_insert(rq, 0);
+ blk_mq_run_hw_queue(hctx, false);
+ break;
+ default:
blk_mq_end_request(rq, ret);
+ break;
+ }
}
static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
{
- return __blk_mq_try_issue_directly(rq->mq_hctx, rq, true, last);
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+
+ if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
+ blk_mq_insert_request(rq, 0);
+ return BLK_STS_OK;
+ }
+
+ if (!blk_mq_get_budget_and_tag(rq))
+ return BLK_STS_RESOURCE;
+ return __blk_mq_issue_directly(hctx, rq, last);
}
static void blk_mq_plug_issue_direct(struct blk_plug *plug)
@@ -2697,7 +2682,8 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug)
break;
case BLK_STS_RESOURCE:
case BLK_STS_DEV_RESOURCE:
- blk_mq_request_bypass_insert(rq, false, true);
+ blk_mq_request_bypass_insert(rq, 0);
+ blk_mq_run_hw_queue(hctx, false);
goto out;
default:
blk_mq_end_request(rq, ret);
@@ -2743,7 +2729,16 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
plug->mq_list = requeue_list;
trace_block_unplug(this_hctx->queue, depth, !from_sched);
- blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, from_sched);
+
+ percpu_ref_get(&this_hctx->queue->q_usage_counter);
+ if (this_hctx->queue->elevator) {
+ this_hctx->queue->elevator->type->ops.insert_requests(this_hctx,
+ &list, 0);
+ blk_mq_run_hw_queue(this_hctx, from_sched);
+ } else {
+ blk_mq_insert_requests(this_hctx, this_ctx, &list, from_sched);
+ }
+ percpu_ref_put(&this_hctx->queue->q_usage_counter);
}
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
@@ -2789,7 +2784,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
} while (!rq_list_empty(plug->mq_list));
}
-void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
+static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
struct list_head *list)
{
int queued = 0;
@@ -2807,8 +2802,9 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
break;
case BLK_STS_RESOURCE:
case BLK_STS_DEV_RESOURCE:
- blk_mq_request_bypass_insert(rq, false,
- list_empty(list));
+ blk_mq_request_bypass_insert(rq, 0);
+ if (list_empty(list))
+ blk_mq_run_hw_queue(hctx, false);
goto out;
default:
blk_mq_end_request(rq, ret);
@@ -2934,6 +2930,7 @@ void blk_mq_submit_bio(struct bio *bio)
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
struct blk_plug *plug = blk_mq_plug(bio);
const int is_sync = op_is_sync(bio->bi_opf);
+ struct blk_mq_hw_ctx *hctx;
struct request *rq;
unsigned int nr_segs = 1;
blk_status_t ret;
@@ -2965,7 +2962,7 @@ void blk_mq_submit_bio(struct bio *bio)
blk_mq_bio_to_request(rq, bio, nr_segs);
- ret = blk_crypto_init_request(rq);
+ ret = blk_crypto_rq_get_keyslot(rq);
if (ret != BLK_STS_OK) {
bio->bi_status = ret;
bio_endio(bio);
@@ -2978,15 +2975,19 @@ void blk_mq_submit_bio(struct bio *bio)
return;
}
- if (plug)
+ if (plug) {
blk_add_rq_to_plug(plug, rq);
- else if ((rq->rq_flags & RQF_ELV) ||
- (rq->mq_hctx->dispatch_busy &&
- (q->nr_hw_queues == 1 || !is_sync)))
- blk_mq_sched_insert_request(rq, false, true, true);
- else
- blk_mq_run_dispatch_ops(rq->q,
- blk_mq_try_issue_directly(rq->mq_hctx, rq));
+ return;
+ }
+
+ hctx = rq->mq_hctx;
+ if ((rq->rq_flags & RQF_ELV) ||
+ (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) {
+ blk_mq_insert_request(rq, 0);
+ blk_mq_run_hw_queue(hctx, true);
+ } else {
+ blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq));
+ }
}
#ifdef CONFIG_BLK_MQ_STACKING
@@ -3034,8 +3035,9 @@ blk_status_t blk_insert_cloned_request(struct request *rq)
if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq)))
return BLK_STS_IOERR;
- if (blk_crypto_insert_cloned_request(rq))
- return BLK_STS_IOERR;
+ ret = blk_crypto_rq_get_keyslot(rq);
+ if (ret != BLK_STS_OK)
+ return ret;
blk_account_io_start(rq);
@@ -4206,14 +4208,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
/* mark the queue as mq asap */
q->mq_ops = set->ops;
- q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
- blk_mq_poll_stats_bkt,
- BLK_MQ_POLL_STATS_BKTS, q);
- if (!q->poll_cb)
- goto err_exit;
-
if (blk_mq_alloc_ctxs(q))
- goto err_poll;
+ goto err_exit;
/* init q->mq_kobj and sw queues' kobjects */
blk_mq_sysfs_init(q);
@@ -4241,11 +4237,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
q->nr_requests = set->queue_depth;
- /*
- * Default to classic polling
- */
- q->poll_nsec = BLK_MQ_POLL_CLASSIC;
-
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
blk_mq_add_queue_tag_set(set, q);
blk_mq_map_swqueue(q);
@@ -4253,9 +4244,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
err_hctxs:
blk_mq_release(q);
-err_poll:
- blk_stat_free_callback(q->poll_cb);
- q->poll_cb = NULL;
err_exit:
q->mq_ops = NULL;
return -ENOMEM;
@@ -4752,138 +4740,8 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
-/* Enable polling stats and return whether they were already enabled. */
-static bool blk_poll_stats_enable(struct request_queue *q)
-{
- if (q->poll_stat)
- return true;
-
- return blk_stats_alloc_enable(q);
-}
-
-static void blk_mq_poll_stats_start(struct request_queue *q)
-{
- /*
- * We don't arm the callback if polling stats are not enabled or the
- * callback is already active.
- */
- if (!q->poll_stat || blk_stat_is_active(q->poll_cb))
- return;
-
- blk_stat_activate_msecs(q->poll_cb, 100);
-}
-
-static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
-{
- struct request_queue *q = cb->data;
- int bucket;
-
- for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
- if (cb->stat[bucket].nr_samples)
- q->poll_stat[bucket] = cb->stat[bucket];
- }
-}
-
-static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
- struct request *rq)
-{
- unsigned long ret = 0;
- int bucket;
-
- /*
- * If stats collection isn't on, don't sleep but turn it on for
- * future users
- */
- if (!blk_poll_stats_enable(q))
- return 0;
-
- /*
- * As an optimistic guess, use half of the mean service time
- * for this type of request. We can (and should) make this smarter.
- * For instance, if the completion latencies are tight, we can
- * get closer than just half the mean. This is especially
- * important on devices where the completion latencies are longer
- * than ~10 usec. We do use the stats for the relevant IO size
- * if available which does lead to better estimates.
- */
- bucket = blk_mq_poll_stats_bkt(rq);
- if (bucket < 0)
- return ret;
-
- if (q->poll_stat[bucket].nr_samples)
- ret = (q->poll_stat[bucket].mean + 1) / 2;
-
- return ret;
-}
-
-static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc)
-{
- struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc);
- struct request *rq = blk_qc_to_rq(hctx, qc);
- struct hrtimer_sleeper hs;
- enum hrtimer_mode mode;
- unsigned int nsecs;
- ktime_t kt;
-
- /*
- * If a request has completed on queue that uses an I/O scheduler, we
- * won't get back a request from blk_qc_to_rq.
- */
- if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT))
- return false;
-
- /*
- * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
- *
- * 0: use half of prev avg
- * >0: use this specific value
- */
- if (q->poll_nsec > 0)
- nsecs = q->poll_nsec;
- else
- nsecs = blk_mq_poll_nsecs(q, rq);
-
- if (!nsecs)
- return false;
-
- rq->rq_flags |= RQF_MQ_POLL_SLEPT;
-
- /*
- * This will be replaced with the stats tracking code, using
- * 'avg_completion_time / 2' as the pre-sleep target.
- */
- kt = nsecs;
-
- mode = HRTIMER_MODE_REL;
- hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
- hrtimer_set_expires(&hs.timer, kt);
-
- do {
- if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
- break;
- set_current_state(TASK_UNINTERRUPTIBLE);
- hrtimer_sleeper_start_expires(&hs, mode);
- if (hs.task)
- io_schedule();
- hrtimer_cancel(&hs.timer);
- mode = HRTIMER_MODE_ABS;
- } while (hs.task && !signal_pending(current));
-
- __set_current_state(TASK_RUNNING);
- destroy_hrtimer_on_stack(&hs.timer);
-
- /*
- * If we sleep, have the caller restart the poll loop to reset the
- * state. Like for the other success return cases, the caller is
- * responsible for checking if the IO completed. If the IO isn't
- * complete, we'll get called again and will go straight to the busy
- * poll loop.
- */
- return true;
-}
-
-static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
- struct io_comp_batch *iob, unsigned int flags)
+int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
+ unsigned int flags)
{
struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie);
long state = get_current_state();
@@ -4910,17 +4768,6 @@ static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
return 0;
}
-int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
- unsigned int flags)
-{
- if (!(flags & BLK_POLL_NOSLEEP) &&
- q->poll_nsec != BLK_MQ_POLL_CLASSIC) {
- if (blk_mq_poll_hybrid(q, cookie))
- return 1;
- }
- return blk_mq_poll_classic(q, cookie, iob, flags);
-}
-
unsigned int blk_mq_rq_cpu(struct request *rq)
{
return rq->mq_ctx->cpu;