summaryrefslogtreecommitdiff
path: root/block/blk-mq.c
diff options
context:
space:
mode:
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--block/blk-mq.c1228
1 files changed, 789 insertions, 439 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3b4df8e5ac9e..bd8b11c472a2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -23,6 +23,7 @@
#include <linux/cache.h>
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
+#include <linux/suspend.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <linux/prefetch.h>
@@ -43,6 +44,7 @@
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd);
+static DEFINE_MUTEX(blk_mq_cpuhp_lock);
static void blk_mq_insert_request(struct request *rq, blk_insert_t flags);
static void blk_mq_request_bypass_insert(struct request *rq,
@@ -88,11 +90,11 @@ struct mq_inflight {
unsigned int inflight[2];
};
-static bool blk_mq_check_inflight(struct request *rq, void *priv)
+static bool blk_mq_check_in_driver(struct request *rq, void *priv)
{
struct mq_inflight *mi = priv;
- if (rq->part && blk_do_io_stat(rq) &&
+ if (rq->rq_flags & RQF_IO_STAT &&
(!bdev_is_partition(mi->part) || rq->part == mi->part) &&
blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
mi->inflight[rq_data_dir(rq)]++;
@@ -100,29 +102,71 @@ static bool blk_mq_check_inflight(struct request *rq, void *priv)
return true;
}
-unsigned int blk_mq_in_flight(struct request_queue *q,
- struct block_device *part)
+void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2])
{
struct mq_inflight mi = { .part = part };
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+ blk_mq_queue_tag_busy_iter(bdev_get_queue(part), blk_mq_check_in_driver,
+ &mi);
+ inflight[READ] = mi.inflight[READ];
+ inflight[WRITE] = mi.inflight[WRITE];
+}
+
+#ifdef CONFIG_LOCKDEP
+static bool blk_freeze_set_owner(struct request_queue *q,
+ struct task_struct *owner)
+{
+ if (!owner)
+ return false;
+
+ if (!q->mq_freeze_depth) {
+ q->mq_freeze_owner = owner;
+ q->mq_freeze_owner_depth = 1;
+ q->mq_freeze_disk_dead = !q->disk ||
+ test_bit(GD_DEAD, &q->disk->state) ||
+ !blk_queue_registered(q);
+ q->mq_freeze_queue_dying = blk_queue_dying(q);
+ return true;
+ }
- return mi.inflight[0] + mi.inflight[1];
+ if (owner == q->mq_freeze_owner)
+ q->mq_freeze_owner_depth += 1;
+ return false;
}
-void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
- unsigned int inflight[2])
+/* verify the last unfreeze in owner context */
+static bool blk_unfreeze_check_owner(struct request_queue *q)
{
- struct mq_inflight mi = { .part = part };
+ if (q->mq_freeze_owner != current)
+ return false;
+ if (--q->mq_freeze_owner_depth == 0) {
+ q->mq_freeze_owner = NULL;
+ return true;
+ }
+ return false;
+}
+
+#else
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
- inflight[0] = mi.inflight[0];
- inflight[1] = mi.inflight[1];
+static bool blk_freeze_set_owner(struct request_queue *q,
+ struct task_struct *owner)
+{
+ return false;
}
-void blk_freeze_queue_start(struct request_queue *q)
+static bool blk_unfreeze_check_owner(struct request_queue *q)
{
+ return false;
+}
+#endif
+
+bool __blk_freeze_queue_start(struct request_queue *q,
+ struct task_struct *owner)
+{
+ bool freeze;
+
mutex_lock(&q->mq_freeze_lock);
+ freeze = blk_freeze_set_owner(q, owner);
if (++q->mq_freeze_depth == 1) {
percpu_ref_kill(&q->q_usage_counter);
mutex_unlock(&q->mq_freeze_lock);
@@ -131,6 +175,14 @@ void blk_freeze_queue_start(struct request_queue *q)
} else {
mutex_unlock(&q->mq_freeze_lock);
}
+
+ return freeze;
+}
+
+void blk_freeze_queue_start(struct request_queue *q)
+{
+ if (__blk_freeze_queue_start(q, current))
+ blk_freeze_acquire_lock(q);
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -149,35 +201,17 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
-/*
- * Guarantee no request is in use, so we can change any data structure of
- * the queue afterward.
- */
-void blk_freeze_queue(struct request_queue *q)
+void blk_mq_freeze_queue_nomemsave(struct request_queue *q)
{
- /*
- * In the !blk_mq case we are only calling this to kill the
- * q_usage_counter, otherwise this increases the freeze depth
- * and waits for it to return to zero. For this reason there is
- * no blk_unfreeze_queue(), and blk_freeze_queue() is not
- * exported to drivers as the only user for unfreeze is blk_mq.
- */
blk_freeze_queue_start(q);
blk_mq_freeze_queue_wait(q);
}
+EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_nomemsave);
-void blk_mq_freeze_queue(struct request_queue *q)
+bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
{
- /*
- * ...just an alias to keep freeze and unfreeze actions balanced
- * in the blk_mq_* namespace
- */
- blk_freeze_queue(q);
-}
-EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
+ bool unfreeze;
-void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
-{
mutex_lock(&q->mq_freeze_lock);
if (force_atomic)
q->q_usage_counter.data->force_atomic = true;
@@ -187,14 +221,38 @@ void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
percpu_ref_resurrect(&q->q_usage_counter);
wake_up_all(&q->mq_freeze_wq);
}
+ unfreeze = blk_unfreeze_check_owner(q);
mutex_unlock(&q->mq_freeze_lock);
+
+ return unfreeze;
+}
+
+void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q)
+{
+ if (__blk_mq_unfreeze_queue(q, false))
+ blk_unfreeze_release_lock(q);
}
+EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_nomemrestore);
-void blk_mq_unfreeze_queue(struct request_queue *q)
+/*
+ * non_owner variant of blk_freeze_queue_start
+ *
+ * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen
+ * by the same task. This is fragile and should not be used if at all
+ * possible.
+ */
+void blk_freeze_queue_start_non_owner(struct request_queue *q)
+{
+ __blk_freeze_queue_start(q, NULL);
+}
+EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner);
+
+/* non_owner variant of blk_mq_unfreeze_queue */
+void blk_mq_unfreeze_queue_non_owner(struct request_queue *q)
{
__blk_mq_unfreeze_queue(q, false);
}
-EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
+EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner);
/*
* FIXME: replace the scsi_internal_device_*block_nowait() calls in the
@@ -283,8 +341,9 @@ void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
if (!blk_queue_skip_tagset_quiesce(q))
blk_mq_quiesce_queue_nowait(q);
}
- blk_mq_wait_quiesce_done(set);
mutex_unlock(&set->tag_list_lock);
+
+ blk_mq_wait_quiesce_done(set);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
@@ -318,12 +377,12 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
INIT_LIST_HEAD(&rq->queuelist);
rq->q = q;
rq->__sector = (sector_t) -1;
+ rq->phys_gap_bit = 0;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = BLK_MQ_NO_TAG;
rq->start_time_ns = blk_time_get_ns();
- rq->part = NULL;
blk_crypto_rq_set_defaults(rq);
}
EXPORT_SYMBOL(blk_rq_init);
@@ -331,19 +390,23 @@ EXPORT_SYMBOL(blk_rq_init);
/* Set start and alloc time when the allocated request is actually used */
static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
{
- if (blk_mq_need_time_stamp(rq))
- rq->start_time_ns = blk_time_get_ns();
- else
- rq->start_time_ns = 0;
-
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
if (blk_queue_rq_alloc_time(rq->q))
- rq->alloc_time_ns = alloc_time_ns ?: rq->start_time_ns;
+ rq->alloc_time_ns = alloc_time_ns;
else
rq->alloc_time_ns = 0;
#endif
}
+static inline void blk_mq_bio_issue_init(struct request_queue *q,
+ struct bio *bio)
+{
+#ifdef CONFIG_BLK_CGROUP
+ if (test_bit(QUEUE_FLAG_BIO_ISSUE_TIME, &q->queue_flags))
+ bio->issue_time_ns = blk_time_get_ns();
+#endif
+}
+
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
struct blk_mq_tags *tags, unsigned int tag)
{
@@ -359,8 +422,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
if (data->flags & BLK_MQ_REQ_PM)
data->rq_flags |= RQF_PM;
- if (blk_queue_io_stat(q))
- data->rq_flags |= RQF_IO_STAT;
rq->rq_flags = data->rq_flags;
if (data->rq_flags & RQF_SCHED_TAGS) {
@@ -376,9 +437,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
rq->io_start_time_ns = 0;
rq->stats_sectors = 0;
rq->nr_phys_segments = 0;
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
rq->nr_integrity_segments = 0;
-#endif
rq->end_io = NULL;
rq->end_io_data = NULL;
@@ -410,28 +469,33 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
unsigned long tag_mask;
int i, nr = 0;
- tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
- if (unlikely(!tag_mask))
- return NULL;
+ do {
+ tag_mask = blk_mq_get_tags(data, data->nr_tags - nr, &tag_offset);
+ if (unlikely(!tag_mask)) {
+ if (nr == 0)
+ return NULL;
+ break;
+ }
+ tags = blk_mq_tags_from_data(data);
+ for (i = 0; tag_mask; i++) {
+ if (!(tag_mask & (1UL << i)))
+ continue;
+ tag = tag_offset + i;
+ prefetch(tags->static_rqs[tag]);
+ tag_mask &= ~(1UL << i);
+ rq = blk_mq_rq_ctx_init(data, tags, tag);
+ rq_list_add_head(data->cached_rqs, rq);
+ nr++;
+ }
+ } while (data->nr_tags > nr);
- tags = blk_mq_tags_from_data(data);
- for (i = 0; tag_mask; i++) {
- if (!(tag_mask & (1UL << i)))
- continue;
- tag = tag_offset + i;
- prefetch(tags->static_rqs[tag]);
- tag_mask &= ~(1UL << i);
- rq = blk_mq_rq_ctx_init(data, tags, tag);
- rq_list_add(data->cached_rq, rq);
- nr++;
- }
if (!(data->rq_flags & RQF_SCHED_TAGS))
blk_mq_add_active_requests(data->hctx, nr);
/* caller already holds a reference, add for remainder */
percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
data->nr_tags -= nr;
- return rq_list_pop(data->cached_rq);
+ return rq_list_pop(data->cached_rqs);
}
static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
@@ -448,6 +512,10 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
if (data->cmd_flags & REQ_NOWAIT)
data->flags |= BLK_MQ_REQ_NOWAIT;
+retry:
+ data->ctx = blk_mq_get_ctx(q);
+ data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx);
+
if (q->elevator) {
/*
* All requests use scheduler tags when an I/O scheduler is
@@ -469,13 +537,9 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
if (ops->limit_depth)
ops->limit_depth(data->cmd_flags, data);
}
- }
-
-retry:
- data->ctx = blk_mq_get_ctx(q);
- data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
- if (!(data->rq_flags & RQF_SCHED_TAGS))
+ } else {
blk_mq_tag_busy(data->hctx);
+ }
if (data->flags & BLK_MQ_REQ_RESERVED)
data->rq_flags |= RQF_RESV;
@@ -526,9 +590,13 @@ static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
+ .shallow_depth = 0,
.cmd_flags = opf,
+ .rq_flags = 0,
.nr_tags = plug->nr_ios,
- .cached_rq = &plug->cached_rq,
+ .cached_rqs = &plug->cached_rqs,
+ .ctx = NULL,
+ .hctx = NULL
};
struct request *rq;
@@ -553,14 +621,14 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
if (!plug)
return NULL;
- if (rq_list_empty(plug->cached_rq)) {
+ if (rq_list_empty(&plug->cached_rqs)) {
if (plug->nr_ios == 1)
return NULL;
rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
if (!rq)
return NULL;
} else {
- rq = rq_list_peek(&plug->cached_rq);
+ rq = rq_list_peek(&plug->cached_rqs);
if (!rq || rq->q != q)
return NULL;
@@ -569,8 +637,8 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
return NULL;
- plug->cached_rq = rq_list_next(rq);
- blk_mq_rq_time_init(rq, 0);
+ rq_list_pop(&plug->cached_rqs);
+ blk_mq_rq_time_init(rq, blk_time_get_ns());
}
rq->cmd_flags = opf;
@@ -588,8 +656,13 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
+ .shallow_depth = 0,
.cmd_flags = opf,
+ .rq_flags = 0,
.nr_tags = 1,
+ .cached_rqs = NULL,
+ .ctx = NULL,
+ .hctx = NULL
};
int ret;
@@ -602,6 +675,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
goto out_queue_exit;
}
rq->__data_len = 0;
+ rq->phys_gap_bit = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
return rq;
@@ -617,8 +691,13 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
+ .shallow_depth = 0,
.cmd_flags = opf,
+ .rq_flags = 0,
.nr_tags = 1,
+ .cached_rqs = NULL,
+ .ctx = NULL,
+ .hctx = NULL
};
u64 alloc_time_ns = 0;
struct request *rq;
@@ -652,7 +731,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
* If not tell the caller that it should skip this queue.
*/
ret = -EXDEV;
- data.hctx = xa_load(&q->hctx_table, hctx_idx);
+ data.hctx = q->queue_hw_ctx[hctx_idx];
if (!blk_mq_hw_queue_mapped(data.hctx))
goto out_queue_exit;
cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
@@ -677,6 +756,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
blk_mq_rq_time_init(rq, alloc_time_ns);
rq->__data_len = 0;
+ rq->phys_gap_bit = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
return rq;
@@ -746,7 +826,7 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug)
{
struct request *rq;
- while ((rq = rq_list_pop(&plug->cached_rq)) != NULL)
+ while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL)
blk_mq_free_request(rq);
}
@@ -766,7 +846,7 @@ EXPORT_SYMBOL(blk_dump_rq_flags);
static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
- if (req->part && blk_do_io_stat(req)) {
+ if (req->rq_flags & RQF_IO_STAT) {
const int sgrp = op_stat_group(req_op(req));
part_stat_lock();
@@ -786,7 +866,7 @@ static void blk_print_req_error(struct request *req, blk_status_t status)
blk_op_str(req_op(req)),
(__force u32)(req->cmd_flags & ~REQ_OP_MASK),
req->nr_phys_segments,
- IOPRIO_PRIO_CLASS(req->ioprio));
+ IOPRIO_PRIO_CLASS(req_get_ioprio(req)));
}
/*
@@ -804,10 +884,8 @@ static void blk_complete_request(struct request *req)
if (!bio)
return;
-#ifdef CONFIG_BLK_DEV_INTEGRITY
if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ)
- req->q->integrity.profile->complete_fn(req, total_bytes);
-#endif
+ blk_integrity_complete(req, total_bytes);
/*
* Upper layers may call blk_crypto_evict_key() anytime after the last
@@ -823,7 +901,8 @@ static void blk_complete_request(struct request *req)
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
- blk_zone_update_request_bio(req, bio);
+ if (blk_req_bio_is_zone_append(req, bio))
+ blk_zone_append_update_request_bio(req, bio);
if (!is_flush)
bio_endio(bio);
@@ -875,11 +954,9 @@ bool blk_update_request(struct request *req, blk_status_t error,
if (!req->bio)
return false;
-#ifdef CONFIG_BLK_DEV_INTEGRITY
if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
error == BLK_STS_OK)
- req->q->integrity.profile->complete_fn(req, nr_bytes);
-#endif
+ blk_integrity_complete(req, nr_bytes);
/*
* Upper layers may call blk_crypto_evict_key() anytime after the last
@@ -924,7 +1001,8 @@ bool blk_update_request(struct request *req, blk_status_t error,
/* Don't actually finish bio if it's part of flush sequence */
if (!bio->bi_iter.bi_size) {
- blk_zone_update_request_bio(req, bio);
+ if (blk_req_bio_is_zone_append(req, bio))
+ blk_zone_append_update_request_bio(req, bio);
if (!is_flush)
bio_endio(bio);
}
@@ -988,8 +1066,7 @@ static inline void blk_account_io_done(struct request *req, u64 now)
* normal IO on queueing nor completion. Accounting the
* containing request is enough.
*/
- if (blk_do_io_stat(req) && req->part &&
- !(req->rq_flags & RQF_FLUSH_SEQ)) {
+ if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) {
const int sgrp = op_stat_group(req_op(req));
part_stat_lock();
@@ -1002,28 +1079,63 @@ static inline void blk_account_io_done(struct request *req, u64 now)
}
}
+static inline bool blk_rq_passthrough_stats(struct request *req)
+{
+ struct bio *bio = req->bio;
+
+ if (!blk_queue_passthrough_stat(req->q))
+ return false;
+
+ /* Requests without a bio do not transfer data. */
+ if (!bio)
+ return false;
+
+ /*
+ * Stats are accumulated in the bdev, so must have one attached to a
+ * bio to track stats. Most drivers do not set the bdev for passthrough
+ * requests, but nvme is one that will set it.
+ */
+ if (!bio->bi_bdev)
+ return false;
+
+ /*
+ * We don't know what a passthrough command does, but we know the
+ * payload size and data direction. Ensuring the size is aligned to the
+ * block size filters out most commands with payloads that don't
+ * represent sector access.
+ */
+ if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1))
+ return false;
+ return true;
+}
+
static inline void blk_account_io_start(struct request *req)
{
trace_block_io_start(req);
- if (blk_do_io_stat(req)) {
- /*
- * All non-passthrough requests are created from a bio with one
- * exception: when a flush command that is part of a flush sequence
- * generated by the state machine in blk-flush.c is cloned onto the
- * lower device by dm-multipath we can get here without a bio.
- */
- if (req->bio)
- req->part = req->bio->bi_bdev;
- else
- req->part = req->q->disk->part0;
+ if (!blk_queue_io_stat(req->q))
+ return;
+ if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req))
+ return;
- part_stat_lock();
- update_io_ticks(req->part, jiffies, false);
- part_stat_local_inc(req->part,
- in_flight[op_is_write(req_op(req))]);
- part_stat_unlock();
- }
+ req->rq_flags |= RQF_IO_STAT;
+ req->start_time_ns = blk_time_get_ns();
+
+ /*
+ * All non-passthrough requests are created from a bio with one
+ * exception: when a flush command that is part of a flush sequence
+ * generated by the state machine in blk-flush.c is cloned onto the
+ * lower device by dm-multipath we can get here without a bio.
+ */
+ if (req->bio)
+ req->part = req->bio->bi_bdev;
+ else
+ req->part = req->q->disk->part0;
+
+ part_stat_lock();
+ update_io_ticks(req->part, jiffies, false);
+ part_stat_local_inc(req->part, in_flight[op_is_write(req_op(req))]);
+ part_stat_unlock();
}
static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
@@ -1132,7 +1244,7 @@ static void blk_complete_reqs(struct llist_head *list)
rq->q->mq_ops->complete(rq);
}
-static __latent_entropy void blk_done_softirq(struct softirq_action *h)
+static __latent_entropy void blk_done_softirq(void)
{
blk_complete_reqs(this_cpu_ptr(&blk_cpu_done));
}
@@ -1264,10 +1376,9 @@ void blk_mq_start_request(struct request *rq)
WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
rq->mq_hctx->tags->rqs[rq->tag] = rq;
-#ifdef CONFIG_BLK_DEV_INTEGRITY
if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
- q->integrity.profile->prepare_fn(rq);
-#endif
+ blk_integrity_prepare(rq);
+
if (rq->bio && rq->bio->bi_opf & REQ_POLLED)
WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num);
}
@@ -1307,8 +1418,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
*/
if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS))
plug->has_elevator = true;
- rq->rq_next = NULL;
- rq_list_add(&plug->mq_list, rq);
+ rq_list_add_tail(&plug->mq_list, rq);
plug->rq_count++;
}
@@ -1460,19 +1570,17 @@ static void blk_mq_requeue_work(struct work_struct *work)
while (!list_empty(&rq_list)) {
rq = list_entry(rq_list.next, struct request, queuelist);
+ list_del_init(&rq->queuelist);
/*
- * If RQF_DONTPREP ist set, the request has been started by the
+ * If RQF_DONTPREP is set, the request has been started by the
* driver already and might have driver-specific data allocated
* already. Insert it into the hctx dispatch list to avoid
* block layer merges for the request.
*/
- if (rq->rq_flags & RQF_DONTPREP) {
- list_del_init(&rq->queuelist);
+ if (rq->rq_flags & RQF_DONTPREP)
blk_mq_request_bypass_insert(rq, 0);
- } else {
- list_del_init(&rq->queuelist);
+ else
blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
- }
}
while (!list_empty(&flush_list)) {
@@ -1705,7 +1813,6 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
}
-EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
struct dispatch_rq_data {
struct blk_mq_hw_ctx *hctx;
@@ -1997,7 +2104,7 @@ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued,
* Returns true if we did some work AND can potentially do more.
*/
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
- unsigned int nr_budgets)
+ bool get_budget)
{
enum prep_dispatch prep;
struct request_queue *q = hctx->queue;
@@ -2019,7 +2126,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
rq = list_first_entry(list, struct request, queuelist);
WARN_ON_ONCE(hctx != rq->mq_hctx);
- prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
+ prep = blk_mq_prep_dispatch_rq(rq, get_budget);
if (prep != PREP_DISPATCH_OK)
break;
@@ -2028,12 +2135,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
bd.rq = rq;
bd.last = list_empty(list);
- /*
- * once the request is queued to lld, no need to cover the
- * budget any more
- */
- if (nr_budgets)
- nr_budgets--;
ret = q->mq_ops->queue_rq(hctx, &bd);
switch (ret) {
case BLK_STS_OK:
@@ -2067,7 +2168,11 @@ out:
((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) ||
blk_mq_is_shared_tags(hctx->flags));
- if (nr_budgets)
+ /*
+ * If the caller allocated budgets, free the budgets of the
+ * requests that have not yet been passed to the block driver.
+ */
+ if (!get_budget)
blk_mq_release_budgets(q, list);
spin_lock(&hctx->lock);
@@ -2207,6 +2312,24 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
+static inline bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx)
+{
+ bool need_run;
+
+ /*
+ * When queue is quiesced, we may be switching io scheduler, or
+ * updating nr_hw_queues, or other things, and we can't run queue
+ * any more, even blk_mq_hctx_has_pending() can't be called safely.
+ *
+ * And queue will be rerun in blk_mq_unquiesce_queue() if it is
+ * quiesced.
+ */
+ __blk_mq_run_dispatch_ops(hctx->queue, false,
+ need_run = !blk_queue_quiesced(hctx->queue) &&
+ blk_mq_hctx_has_pending(hctx));
+ return need_run;
+}
+
/**
* blk_mq_run_hw_queue - Start to run a hardware queue.
* @hctx: Pointer to the hardware queue to run.
@@ -2227,20 +2350,23 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING);
- /*
- * When queue is quiesced, we may be switching io scheduler, or
- * updating nr_hw_queues, or other things, and we can't run queue
- * any more, even __blk_mq_hctx_has_pending() can't be called safely.
- *
- * And queue will be rerun in blk_mq_unquiesce_queue() if it is
- * quiesced.
- */
- __blk_mq_run_dispatch_ops(hctx->queue, false,
- need_run = !blk_queue_quiesced(hctx->queue) &&
- blk_mq_hctx_has_pending(hctx));
+ need_run = blk_mq_hw_queue_need_run(hctx);
+ if (!need_run) {
+ unsigned long flags;
- if (!need_run)
- return;
+ /*
+ * Synchronize with blk_mq_unquiesce_queue(), because we check
+ * if hw queue is quiesced locklessly above, we need the use
+ * ->queue_lock to make sure we see the up-to-date status to
+ * not miss rerunning the hw queue.
+ */
+ spin_lock_irqsave(&hctx->queue->queue_lock, flags);
+ need_run = blk_mq_hw_queue_need_run(hctx);
+ spin_unlock_irqrestore(&hctx->queue->queue_lock, flags);
+
+ if (!need_run)
+ return;
+ }
if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
blk_mq_delay_run_hw_queue(hctx, 0);
@@ -2397,6 +2523,12 @@ void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
return;
clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+ /*
+ * Pairs with the smp_mb() in blk_mq_hctx_stopped() to order the
+ * clearing of BLK_MQ_S_STOPPED above and the checking of dispatch
+ * list in the subsequent routine.
+ */
+ smp_mb__after_atomic();
blk_mq_run_hw_queue(hctx, async);
}
EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
@@ -2548,9 +2680,15 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
if (bio->bi_opf & REQ_RAHEAD)
rq->cmd_flags |= REQ_FAILFAST_MASK;
+ rq->bio = rq->biotail = bio;
rq->__sector = bio->bi_iter.bi_sector;
- rq->write_hint = bio->bi_write_hint;
- blk_rq_bio_prep(rq, bio, nr_segs);
+ rq->__data_len = bio->bi_iter.bi_size;
+ rq->phys_gap_bit = bio->bi_bvec_gap_bit;
+
+ rq->nr_phys_segments = nr_segs;
+ if (bio_integrity(bio))
+ rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q,
+ bio);
/* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
@@ -2624,6 +2762,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
blk_mq_insert_request(rq, 0);
+ blk_mq_run_hw_queue(hctx, false);
return;
}
@@ -2654,6 +2793,7 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
blk_mq_insert_request(rq, 0);
+ blk_mq_run_hw_queue(hctx, false);
return BLK_STS_OK;
}
@@ -2662,15 +2802,15 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
return __blk_mq_issue_directly(hctx, rq, last);
}
-static void blk_mq_plug_issue_direct(struct blk_plug *plug)
+static void blk_mq_issue_direct(struct rq_list *rqs)
{
struct blk_mq_hw_ctx *hctx = NULL;
struct request *rq;
int queued = 0;
blk_status_t ret = BLK_STS_OK;
- while ((rq = rq_list_pop(&plug->mq_list))) {
- bool last = rq_list_empty(plug->mq_list);
+ while ((rq = rq_list_pop(rqs))) {
+ bool last = rq_list_empty(rqs);
if (hctx != rq->mq_hctx) {
if (hctx) {
@@ -2701,26 +2841,74 @@ out:
blk_mq_commit_rqs(hctx, queued, false);
}
-static void __blk_mq_flush_plug_list(struct request_queue *q,
- struct blk_plug *plug)
+static void __blk_mq_flush_list(struct request_queue *q, struct rq_list *rqs)
{
if (blk_queue_quiesced(q))
return;
- q->mq_ops->queue_rqs(&plug->mq_list);
+ q->mq_ops->queue_rqs(rqs);
+}
+
+static unsigned blk_mq_extract_queue_requests(struct rq_list *rqs,
+ struct rq_list *queue_rqs)
+{
+ struct request *rq = rq_list_pop(rqs);
+ struct request_queue *this_q = rq->q;
+ struct request **prev = &rqs->head;
+ struct rq_list matched_rqs = {};
+ struct request *last = NULL;
+ unsigned depth = 1;
+
+ rq_list_add_tail(&matched_rqs, rq);
+ while ((rq = *prev)) {
+ if (rq->q == this_q) {
+ /* move rq from rqs to matched_rqs */
+ *prev = rq->rq_next;
+ rq_list_add_tail(&matched_rqs, rq);
+ depth++;
+ } else {
+ /* leave rq in rqs */
+ prev = &rq->rq_next;
+ last = rq;
+ }
+ }
+
+ rqs->tail = last;
+ *queue_rqs = matched_rqs;
+ return depth;
+}
+
+static void blk_mq_dispatch_queue_requests(struct rq_list *rqs, unsigned depth)
+{
+ struct request_queue *q = rq_list_peek(rqs)->q;
+
+ trace_block_unplug(q, depth, true);
+
+ /*
+ * Peek first request and see if we have a ->queue_rqs() hook.
+ * If we do, we can dispatch the whole list in one go.
+ * We already know at this point that all requests belong to the
+ * same queue, caller must ensure that's the case.
+ */
+ if (q->mq_ops->queue_rqs) {
+ blk_mq_run_dispatch_ops(q, __blk_mq_flush_list(q, rqs));
+ if (rq_list_empty(rqs))
+ return;
+ }
+
+ blk_mq_run_dispatch_ops(q, blk_mq_issue_direct(rqs));
}
-static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
+static void blk_mq_dispatch_list(struct rq_list *rqs, bool from_sched)
{
struct blk_mq_hw_ctx *this_hctx = NULL;
struct blk_mq_ctx *this_ctx = NULL;
- struct request *requeue_list = NULL;
- struct request **requeue_lastp = &requeue_list;
+ struct rq_list requeue_list = {};
unsigned int depth = 0;
bool is_passthrough = false;
LIST_HEAD(list);
do {
- struct request *rq = rq_list_pop(&plug->mq_list);
+ struct request *rq = rq_list_pop(rqs);
if (!this_hctx) {
this_hctx = rq->mq_hctx;
@@ -2728,14 +2916,14 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
is_passthrough = blk_rq_is_passthrough(rq);
} else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx ||
is_passthrough != blk_rq_is_passthrough(rq)) {
- rq_list_add_tail(&requeue_lastp, rq);
+ rq_list_add_tail(&requeue_list, rq);
continue;
}
- list_add(&rq->queuelist, &list);
+ list_add_tail(&rq->queuelist, &list);
depth++;
- } while (!rq_list_empty(plug->mq_list));
+ } while (!rq_list_empty(rqs));
- plug->mq_list = requeue_list;
+ *rqs = requeue_list;
trace_block_unplug(this_hctx->queue, depth, !from_sched);
percpu_ref_get(&this_hctx->queue->q_usage_counter);
@@ -2755,9 +2943,22 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
percpu_ref_put(&this_hctx->queue->q_usage_counter);
}
+static void blk_mq_dispatch_multiple_queue_requests(struct rq_list *rqs)
+{
+ do {
+ struct rq_list queue_rqs;
+ unsigned depth;
+
+ depth = blk_mq_extract_queue_requests(rqs, &queue_rqs);
+ blk_mq_dispatch_queue_requests(&queue_rqs, depth);
+ while (!rq_list_empty(&queue_rqs))
+ blk_mq_dispatch_list(&queue_rqs, false);
+ } while (!rq_list_empty(rqs));
+}
+
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
- struct request *rq;
+ unsigned int depth;
/*
* We may have been called recursively midway through handling
@@ -2768,36 +2969,23 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
*/
if (plug->rq_count == 0)
return;
+ depth = plug->rq_count;
plug->rq_count = 0;
- if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
- struct request_queue *q;
-
- rq = rq_list_peek(&plug->mq_list);
- q = rq->q;
-
- /*
- * Peek first request and see if we have a ->queue_rqs() hook.
- * If we do, we can dispatch the whole plug list in one go. We
- * already know at this point that all requests belong to the
- * same queue, caller must ensure that's the case.
- */
- if (q->mq_ops->queue_rqs) {
- blk_mq_run_dispatch_ops(q,
- __blk_mq_flush_plug_list(q, plug));
- if (rq_list_empty(plug->mq_list))
- return;
+ if (!plug->has_elevator && !from_schedule) {
+ if (plug->multiple_queues) {
+ blk_mq_dispatch_multiple_queue_requests(&plug->mq_list);
+ return;
}
- blk_mq_run_dispatch_ops(q,
- blk_mq_plug_issue_direct(plug));
- if (rq_list_empty(plug->mq_list))
+ blk_mq_dispatch_queue_requests(&plug->mq_list, depth);
+ if (rq_list_empty(&plug->mq_list))
return;
}
do {
- blk_mq_dispatch_plug_list(plug, from_schedule);
- } while (!rq_list_empty(plug->mq_list));
+ blk_mq_dispatch_list(&plug->mq_list, from_schedule);
+ } while (!rq_list_empty(&plug->mq_list));
}
static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
@@ -2847,13 +3035,18 @@ static bool blk_mq_attempt_bio_merge(struct request_queue *q,
static struct request *blk_mq_get_new_requests(struct request_queue *q,
struct blk_plug *plug,
- struct bio *bio,
- unsigned int nsegs)
+ struct bio *bio)
{
struct blk_mq_alloc_data data = {
.q = q,
- .nr_tags = 1,
+ .flags = 0,
+ .shallow_depth = 0,
.cmd_flags = bio->bi_opf,
+ .rq_flags = 0,
+ .nr_tags = 1,
+ .cached_rqs = NULL,
+ .ctx = NULL,
+ .hctx = NULL
};
struct request *rq;
@@ -2862,16 +3055,13 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
if (plug) {
data.nr_tags = plug->nr_ios;
plug->nr_ios = 1;
- data.cached_rq = &plug->cached_rq;
+ data.cached_rqs = &plug->cached_rqs;
}
rq = __blk_mq_alloc_requests(&data);
- if (rq)
- return rq;
- rq_qos_cleanup(q, bio);
- if (bio->bi_opf & REQ_NOWAIT)
- bio_wouldblock_error(bio);
- return NULL;
+ if (unlikely(!rq))
+ rq_qos_cleanup(q, bio);
+ return rq;
}
/*
@@ -2885,7 +3075,7 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
if (!plug)
return NULL;
- rq = rq_list_peek(&plug->cached_rq);
+ rq = rq_list_peek(&plug->cached_rqs);
if (!rq || rq->q != q)
return NULL;
if (type != rq->mq_hctx->type &&
@@ -2899,21 +3089,32 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
struct bio *bio)
{
- WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq);
+ if (rq_list_pop(&plug->cached_rqs) != rq)
+ WARN_ON_ONCE(1);
/*
* If any qos ->throttle() end up blocking, we will have flushed the
* plug and hence killed the cached_rq list as well. Pop this entry
* before we throttle.
*/
- plug->cached_rq = rq_list_next(rq);
rq_qos_throttle(rq->q, bio);
- blk_mq_rq_time_init(rq, 0);
+ blk_mq_rq_time_init(rq, blk_time_get_ns());
rq->cmd_flags = bio->bi_opf;
INIT_LIST_HEAD(&rq->queuelist);
}
+static bool bio_unaligned(const struct bio *bio, struct request_queue *q)
+{
+ unsigned int bs_mask = queue_logical_block_size(q) - 1;
+
+ /* .bi_sector of any zero sized bio need to be initialized */
+ if ((bio->bi_iter.bi_size & bs_mask) ||
+ ((bio->bi_iter.bi_sector << SECTOR_SHIFT) & bs_mask))
+ return true;
+ return false;
+}
+
/**
* blk_mq_submit_bio - Create and send a request to block device.
* @bio: Bio pointer.
@@ -2933,7 +3134,7 @@ void blk_mq_submit_bio(struct bio *bio)
struct blk_plug *plug = current->plug;
const int is_sync = op_is_sync(bio->bi_opf);
struct blk_mq_hw_ctx *hctx;
- unsigned int nr_segs = 1;
+ unsigned int nr_segs;
struct request *rq;
blk_status_t ret;
@@ -2955,8 +3156,6 @@ void blk_mq_submit_bio(struct bio *bio)
goto new_request;
}
- bio = blk_queue_bounce(bio, q);
-
/*
* The cached request already holds a q_usage_counter reference and we
* don't have to acquire a new one if we use it.
@@ -2966,27 +3165,48 @@ void blk_mq_submit_bio(struct bio *bio)
return;
}
- if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
- bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
- if (!bio)
- goto queue_exit;
+ /*
+ * Device reconfiguration may change logical block size or reduce the
+ * number of poll queues, so the checks for alignment and poll support
+ * have to be done with queue usage counter held.
+ */
+ if (unlikely(bio_unaligned(bio, q))) {
+ bio_io_error(bio);
+ goto queue_exit;
}
+
+ if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) {
+ bio->bi_status = BLK_STS_NOTSUPP;
+ bio_endio(bio);
+ goto queue_exit;
+ }
+
+ bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
+ if (!bio)
+ goto queue_exit;
+
if (!bio_integrity_prep(bio))
goto queue_exit;
+ blk_mq_bio_issue_init(q, bio);
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
goto queue_exit;
- if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
- goto queue_exit;
+ if (bio_needs_zone_write_plugging(bio)) {
+ if (blk_zone_plug_bio(bio, nr_segs))
+ goto queue_exit;
+ }
new_request:
- if (!rq) {
- rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
- if (unlikely(!rq))
- goto queue_exit;
- } else {
+ if (rq) {
blk_mq_use_cached_rq(rq, plug, bio);
+ } else {
+ rq = blk_mq_get_new_requests(q, plug, bio);
+ if (unlikely(!rq)) {
+ if (bio->bi_opf & REQ_NOWAIT)
+ bio_wouldblock_error(bio);
+ goto queue_exit;
+ }
}
trace_block_getrq(bio);
@@ -3041,7 +3261,7 @@ queue_exit:
blk_status_t blk_insert_cloned_request(struct request *rq)
{
struct request_queue *q = rq->q;
- unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
+ unsigned int max_sectors = blk_queue_get_max_sectors(rq);
unsigned int max_segments = blk_rq_get_max_segments(rq);
blk_status_t ret;
@@ -3138,19 +3358,21 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
int (*bio_ctr)(struct bio *, struct bio *, void *),
void *data)
{
- struct bio *bio, *bio_src;
+ struct bio *bio_src;
if (!bs)
bs = &fs_bio_set;
__rq_for_each_bio(bio_src, rq_src) {
- bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask,
- bs);
+ struct bio *bio = bio_alloc_clone(rq->q->disk->part0, bio_src,
+ gfp_mask, bs);
if (!bio)
goto free_and_out;
- if (bio_ctr && bio_ctr(bio, bio_src, data))
+ if (bio_ctr && bio_ctr(bio, bio_src, data)) {
+ bio_put(bio);
goto free_and_out;
+ }
if (rq->bio) {
rq->biotail->bi_next = bio;
@@ -3158,7 +3380,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
} else {
rq->bio = rq->biotail = bio;
}
- bio = NULL;
}
/* Copy attributes of the original request to the clone request. */
@@ -3169,8 +3390,8 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
rq->special_vec = rq_src->special_vec;
}
rq->nr_phys_segments = rq_src->nr_phys_segments;
- rq->ioprio = rq_src->ioprio;
- rq->write_hint = rq_src->write_hint;
+ rq->nr_integrity_segments = rq_src->nr_integrity_segments;
+ rq->phys_gap_bit = rq_src->phys_gap_bit;
if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
goto free_and_out;
@@ -3178,8 +3399,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
return 0;
free_and_out:
- if (bio)
- bio_put(bio);
blk_rq_unprep_clone(rq);
return -ENOMEM;
@@ -3218,7 +3437,6 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
struct blk_mq_tags *tags)
{
struct page *page;
- unsigned long flags;
/*
* There is no need to clear mapping if driver tags is not initialized
@@ -3242,22 +3460,12 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
}
}
}
-
- /*
- * Wait until all pending iteration is done.
- *
- * Request reference is cleared and it is guaranteed to be observed
- * after the ->lock is released.
- */
- spin_lock_irqsave(&drv_tags->lock, flags);
- spin_unlock_irqrestore(&drv_tags->lock, flags);
}
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx)
{
struct blk_mq_tags *drv_tags;
- struct page *page;
if (list_empty(&tags->page_list))
return;
@@ -3281,27 +3489,20 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
}
blk_mq_clear_rq_mapping(drv_tags, tags);
-
- while (!list_empty(&tags->page_list)) {
- page = list_first_entry(&tags->page_list, struct page, lru);
- list_del_init(&page->lru);
- /*
- * Remove kmemleak object previously allocated in
- * blk_mq_alloc_rqs().
- */
- kmemleak_free(page_address(page));
- __free_pages(page, page->private);
- }
+ /*
+ * Free request pages in SRCU callback, which is called from
+ * blk_mq_free_tags().
+ */
}
-void blk_mq_free_rq_map(struct blk_mq_tags *tags)
+void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags)
{
kfree(tags->rqs);
tags->rqs = NULL;
kfree(tags->static_rqs);
tags->static_rqs = NULL;
- blk_mq_free_tags(tags);
+ blk_mq_free_tags(set, tags);
}
static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set,
@@ -3342,8 +3543,7 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
if (node == NUMA_NO_NODE)
node = set->numa_node;
- tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
- BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
+ tags = blk_mq_init_tags(nr_tags, reserved_tags, set->flags, node);
if (!tags)
return NULL;
@@ -3364,7 +3564,7 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
err_free_rqs:
kfree(tags->rqs);
err_free_tags:
- blk_mq_free_tags(tags);
+ blk_mq_free_tags(set, tags);
return NULL;
}
@@ -3394,8 +3594,6 @@ static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set,
if (node == NUMA_NO_NODE)
node = set->numa_node;
- INIT_LIST_HEAD(&tags->page_list);
-
/*
* rq_size is the size of the request plus driver payload, rounded
* to the cacheline size
@@ -3482,8 +3680,12 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
struct rq_iter_data data = {
.hctx = hctx,
};
+ int srcu_idx;
+ srcu_idx = srcu_read_lock(&hctx->queue->tag_set->tags_srcu);
blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
+ srcu_read_unlock(&hctx->queue->tag_set->tags_srcu, srcu_idx);
+
return data.has_rq;
}
@@ -3517,6 +3719,7 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
{
struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
struct blk_mq_hw_ctx, cpuhp_online);
+ int ret = 0;
if (blk_mq_hctx_has_online_cpu(hctx, cpu))
return 0;
@@ -3537,12 +3740,24 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
* frozen and there are no requests.
*/
if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
- while (blk_mq_hctx_has_requests(hctx))
+ while (blk_mq_hctx_has_requests(hctx)) {
+ /*
+ * The wakeup capable IRQ handler of block device is
+ * not called during suspend. Skip the loop by checking
+ * pm_wakeup_pending to prevent the deadlock and improve
+ * suspend latency.
+ */
+ if (pm_wakeup_pending()) {
+ clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+ ret = -EBUSY;
+ break;
+ }
msleep(5);
+ }
percpu_ref_put(&hctx->queue->q_usage_counter);
}
- return 0;
+ return ret;
}
/*
@@ -3608,13 +3823,91 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
return 0;
}
-static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
+static void __blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
{
- if (!(hctx->flags & BLK_MQ_F_STACKING))
+ lockdep_assert_held(&blk_mq_cpuhp_lock);
+
+ if (!(hctx->flags & BLK_MQ_F_STACKING) &&
+ !hlist_unhashed(&hctx->cpuhp_online)) {
cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
&hctx->cpuhp_online);
- cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
- &hctx->cpuhp_dead);
+ INIT_HLIST_NODE(&hctx->cpuhp_online);
+ }
+
+ if (!hlist_unhashed(&hctx->cpuhp_dead)) {
+ cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
+ &hctx->cpuhp_dead);
+ INIT_HLIST_NODE(&hctx->cpuhp_dead);
+ }
+}
+
+static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
+{
+ mutex_lock(&blk_mq_cpuhp_lock);
+ __blk_mq_remove_cpuhp(hctx);
+ mutex_unlock(&blk_mq_cpuhp_lock);
+}
+
+static void __blk_mq_add_cpuhp(struct blk_mq_hw_ctx *hctx)
+{
+ lockdep_assert_held(&blk_mq_cpuhp_lock);
+
+ if (!(hctx->flags & BLK_MQ_F_STACKING) &&
+ hlist_unhashed(&hctx->cpuhp_online))
+ cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+ &hctx->cpuhp_online);
+
+ if (hlist_unhashed(&hctx->cpuhp_dead))
+ cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD,
+ &hctx->cpuhp_dead);
+}
+
+static void __blk_mq_remove_cpuhp_list(struct list_head *head)
+{
+ struct blk_mq_hw_ctx *hctx;
+
+ lockdep_assert_held(&blk_mq_cpuhp_lock);
+
+ list_for_each_entry(hctx, head, hctx_list)
+ __blk_mq_remove_cpuhp(hctx);
+}
+
+/*
+ * Unregister cpuhp callbacks from exited hw queues
+ *
+ * Safe to call if this `request_queue` is live
+ */
+static void blk_mq_remove_hw_queues_cpuhp(struct request_queue *q)
+{
+ LIST_HEAD(hctx_list);
+
+ spin_lock(&q->unused_hctx_lock);
+ list_splice_init(&q->unused_hctx_list, &hctx_list);
+ spin_unlock(&q->unused_hctx_lock);
+
+ mutex_lock(&blk_mq_cpuhp_lock);
+ __blk_mq_remove_cpuhp_list(&hctx_list);
+ mutex_unlock(&blk_mq_cpuhp_lock);
+
+ spin_lock(&q->unused_hctx_lock);
+ list_splice(&hctx_list, &q->unused_hctx_list);
+ spin_unlock(&q->unused_hctx_lock);
+}
+
+/*
+ * Register cpuhp callbacks from all hw queues
+ *
+ * Safe to call if this `request_queue` is live
+ */
+static void blk_mq_add_hw_queues_cpuhp(struct request_queue *q)
+{
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
+
+ mutex_lock(&blk_mq_cpuhp_lock);
+ queue_for_each_hw_ctx(q, hctx, i)
+ __blk_mq_add_cpuhp(hctx);
+ mutex_unlock(&blk_mq_cpuhp_lock);
}
/*
@@ -3625,7 +3918,6 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
unsigned int queue_depth, struct request *flush_rq)
{
int i;
- unsigned long flags;
/* The hw queue may not be mapped yet */
if (!tags)
@@ -3635,15 +3927,14 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
for (i = 0; i < queue_depth; i++)
cmpxchg(&tags->rqs[i], flush_rq, NULL);
+}
- /*
- * Wait until all pending iteration is done.
- *
- * Request reference is cleared and it is guaranteed to be observed
- * after the ->lock is released.
- */
- spin_lock_irqsave(&tags->lock, flags);
- spin_unlock_irqrestore(&tags->lock, flags);
+static void blk_free_flush_queue_callback(struct rcu_head *head)
+{
+ struct blk_flush_queue *fq =
+ container_of(head, struct blk_flush_queue, rcu_head);
+
+ blk_free_flush_queue(fq);
}
/* hctx->ctxs will be freed in queue's release handler */
@@ -3665,9 +3956,9 @@ static void blk_mq_exit_hctx(struct request_queue *q,
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
- blk_mq_remove_cpuhp(hctx);
-
- xa_erase(&q->hctx_table, hctx_idx);
+ call_srcu(&set->tags_srcu, &hctx->fq->rcu_head,
+ blk_free_flush_queue_callback);
+ hctx->fq = NULL;
spin_lock(&q->unused_hctx_lock);
list_add(&hctx->hctx_list, &q->unused_hctx_list);
@@ -3683,6 +3974,7 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
queue_for_each_hw_ctx(q, hctx, i) {
if (i == nr_queue)
break;
+ blk_mq_remove_cpuhp(hctx);
blk_mq_exit_hctx(q, set, hctx, i);
}
}
@@ -3691,36 +3983,33 @@ static int blk_mq_init_hctx(struct request_queue *q,
struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
{
- hctx->queue_num = hctx_idx;
+ gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
- if (!(hctx->flags & BLK_MQ_F_STACKING))
- cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
- &hctx->cpuhp_online);
- cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
+ hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
+ if (!hctx->fq)
+ goto fail;
+
+ hctx->queue_num = hctx_idx;
hctx->tags = set->tags[hctx_idx];
if (set->ops->init_hctx &&
set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
- goto unregister_cpu_notifier;
+ goto fail_free_fq;
if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
hctx->numa_node))
goto exit_hctx;
- if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
- goto exit_flush_rq;
-
return 0;
- exit_flush_rq:
- if (set->ops->exit_request)
- set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
exit_hctx:
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
- unregister_cpu_notifier:
- blk_mq_remove_cpuhp(hctx);
+ fail_free_fq:
+ blk_free_flush_queue(hctx->fq);
+ hctx->fq = NULL;
+ fail:
return -1;
}
@@ -3746,6 +4035,8 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
spin_lock_init(&hctx->lock);
INIT_LIST_HEAD(&hctx->dispatch);
+ INIT_HLIST_NODE(&hctx->cpuhp_dead);
+ INIT_HLIST_NODE(&hctx->cpuhp_online);
hctx->queue = q;
hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
@@ -3769,16 +4060,10 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
- hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
- if (!hctx->fq)
- goto free_bitmap;
-
blk_mq_hctx_kobj_init(hctx);
return hctx;
- free_bitmap:
- sbitmap_free(&hctx->ctx_map);
free_ctxs:
kfree(hctx->ctxs);
free_cpumask:
@@ -3832,7 +4117,7 @@ struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth);
if (ret) {
- blk_mq_free_rq_map(tags);
+ blk_mq_free_rq_map(set, tags);
return NULL;
}
@@ -3860,7 +4145,7 @@ void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
{
if (tags) {
blk_mq_free_rqs(set, tags, hctx_idx);
- blk_mq_free_rq_map(tags);
+ blk_mq_free_rq_map(set, tags);
}
}
@@ -4010,13 +4295,14 @@ static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
bool shared)
{
struct request_queue *q;
+ unsigned int memflags;
lockdep_assert_held(&set->tag_list_lock);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
- blk_mq_freeze_queue(q);
+ memflags = blk_mq_freeze_queue(q);
queue_set_hctx_shared(q, shared);
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
}
}
@@ -4105,7 +4391,7 @@ void blk_mq_release(struct request_queue *q)
kobject_put(&hctx->kobj);
}
- xa_destroy(&q->hctx_table);
+ kfree(q->queue_hw_ctx);
/*
* release .mq_kobj and sw queue's kobject now because
@@ -4121,7 +4407,13 @@ struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
struct request_queue *q;
int ret;
- q = blk_alloc_queue(lim ? lim : &default_lim, set->numa_node);
+ if (!lim)
+ lim = &default_lim;
+ lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
+ if (set->nr_maps > HCTX_TYPE_POLL)
+ lim->features |= BLK_FEAT_POLL;
+
+ q = blk_alloc_queue(lim, set->numa_node);
if (IS_ERR(q))
return q;
q->queuedata = queuedata;
@@ -4197,6 +4489,15 @@ struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
}
EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue);
+/*
+ * Only hctx removed from cpuhp list can be reused
+ */
+static bool blk_mq_hctx_is_reusable(struct blk_mq_hw_ctx *hctx)
+{
+ return hlist_unhashed(&hctx->cpuhp_online) &&
+ hlist_unhashed(&hctx->cpuhp_dead);
+}
+
static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
struct blk_mq_tag_set *set, struct request_queue *q,
int hctx_idx, int node)
@@ -4206,7 +4507,7 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
/* reuse dead hctx first */
spin_lock(&q->unused_hctx_lock);
list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
- if (tmp->numa_node == node) {
+ if (tmp->numa_node == node && blk_mq_hctx_is_reusable(tmp)) {
hctx = tmp;
break;
}
@@ -4231,31 +4532,52 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
return NULL;
}
-static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
- struct request_queue *q)
+static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
+ struct request_queue *q)
{
- struct blk_mq_hw_ctx *hctx;
- unsigned long i, j;
+ int i, j, end;
+ struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
+
+ if (q->nr_hw_queues < set->nr_hw_queues) {
+ struct blk_mq_hw_ctx **new_hctxs;
+
+ new_hctxs = kcalloc_node(set->nr_hw_queues,
+ sizeof(*new_hctxs), GFP_KERNEL,
+ set->numa_node);
+ if (!new_hctxs)
+ return;
+ if (hctxs)
+ memcpy(new_hctxs, hctxs, q->nr_hw_queues *
+ sizeof(*hctxs));
+ rcu_assign_pointer(q->queue_hw_ctx, new_hctxs);
+ /*
+ * Make sure reading the old queue_hw_ctx from other
+ * context concurrently won't trigger uaf.
+ */
+ synchronize_rcu_expedited();
+ kfree(hctxs);
+ hctxs = new_hctxs;
+ }
- /* protect against switching io scheduler */
- mutex_lock(&q->sysfs_lock);
for (i = 0; i < set->nr_hw_queues; i++) {
int old_node;
int node = blk_mq_get_hctx_node(set, i);
- struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i);
+ struct blk_mq_hw_ctx *old_hctx = hctxs[i];
if (old_hctx) {
old_node = old_hctx->numa_node;
blk_mq_exit_hctx(q, set, old_hctx, i);
}
- if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) {
+ hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, node);
+ if (!hctxs[i]) {
if (!old_hctx)
break;
pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
node, old_node);
- hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node);
- WARN_ON_ONCE(!hctx);
+ hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i,
+ old_node);
+ WARN_ON_ONCE(!hctxs[i]);
}
}
/*
@@ -4264,25 +4586,33 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
*/
if (i != set->nr_hw_queues) {
j = q->nr_hw_queues;
+ end = i;
} else {
j = i;
+ end = q->nr_hw_queues;
q->nr_hw_queues = set->nr_hw_queues;
}
- xa_for_each_start(&q->hctx_table, j, hctx, j)
- blk_mq_exit_hctx(q, set, hctx, j);
- mutex_unlock(&q->sysfs_lock);
+ for (; j < end; j++) {
+ struct blk_mq_hw_ctx *hctx = hctxs[j];
+
+ if (hctx) {
+ blk_mq_exit_hctx(q, set, hctx, j);
+ hctxs[j] = NULL;
+ }
+ }
}
-static void blk_mq_update_poll_flag(struct request_queue *q)
+static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
+ struct request_queue *q)
{
- struct blk_mq_tag_set *set = q->tag_set;
+ __blk_mq_realloc_hw_ctxs(set, q);
- if (set->nr_maps > HCTX_TYPE_POLL &&
- set->map[HCTX_TYPE_POLL].nr_queues)
- blk_queue_flag_set(QUEUE_FLAG_POLL, q);
- else
- blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
+ /* unregister cpuhp callbacks for exited hctxs */
+ blk_mq_remove_hw_queues_cpuhp(q);
+
+ /* register cpuhp for new initialized hctxs */
+ blk_mq_add_hw_queues_cpuhp(q);
}
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
@@ -4291,6 +4621,12 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
/* mark the queue as mq asap */
q->mq_ops = set->ops;
+ /*
+ * ->tag_set has to be setup before initialize hctx, which cpuphp
+ * handler needs it for checking queue mapping
+ */
+ q->tag_set = set;
+
if (blk_mq_alloc_ctxs(q))
goto err_exit;
@@ -4300,8 +4636,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
INIT_LIST_HEAD(&q->unused_hctx_list);
spin_lock_init(&q->unused_hctx_lock);
- xa_init(&q->hctx_table);
-
blk_mq_realloc_hw_ctxs(set, q);
if (!q->nr_hw_queues)
goto err_hctxs;
@@ -4309,10 +4643,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
- q->tag_set = set;
-
q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
- blk_mq_update_poll_flag(q);
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
INIT_LIST_HEAD(&q->flush_list);
@@ -4322,8 +4653,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
q->nr_requests = set->queue_depth;
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
- blk_mq_add_queue_tag_set(set, q);
blk_mq_map_swqueue(q);
+ blk_mq_add_queue_tag_set(set, q);
return 0;
err_hctxs:
@@ -4542,13 +4873,18 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (ret)
goto out_free_srcu;
}
+ ret = init_srcu_struct(&set->tags_srcu);
+ if (ret)
+ goto out_cleanup_srcu;
+
+ init_rwsem(&set->update_nr_hwq_lock);
ret = -ENOMEM;
set->tags = kcalloc_node(set->nr_hw_queues,
sizeof(struct blk_mq_tags *), GFP_KERNEL,
set->numa_node);
if (!set->tags)
- goto out_cleanup_srcu;
+ goto out_cleanup_tags_srcu;
for (i = 0; i < set->nr_maps; i++) {
set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
@@ -4577,6 +4913,8 @@ out_free_mq_map:
}
kfree(set->tags);
set->tags = NULL;
+out_cleanup_tags_srcu:
+ cleanup_srcu_struct(&set->tags_srcu);
out_cleanup_srcu:
if (set->flags & BLK_MQ_F_BLOCKING)
cleanup_srcu_struct(set->srcu);
@@ -4622,6 +4960,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
kfree(set->tags);
set->tags = NULL;
+
+ srcu_barrier(&set->tags_srcu);
+ cleanup_srcu_struct(&set->tags_srcu);
if (set->flags & BLK_MQ_F_BLOCKING) {
cleanup_srcu_struct(set->srcu);
kfree(set->srcu);
@@ -4629,143 +4970,132 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
}
EXPORT_SYMBOL(blk_mq_free_tag_set);
-int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
+struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q,
+ struct elevator_tags *et,
+ unsigned int nr)
{
struct blk_mq_tag_set *set = q->tag_set;
+ struct elevator_tags *old_et = NULL;
struct blk_mq_hw_ctx *hctx;
- int ret;
unsigned long i;
- if (!set)
- return -EINVAL;
-
- if (q->nr_requests == nr)
- return 0;
-
- blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
- ret = 0;
- queue_for_each_hw_ctx(q, hctx, i) {
- if (!hctx->tags)
- continue;
+ if (blk_mq_is_shared_tags(set->flags)) {
/*
- * If we're using an MQ scheduler, just update the scheduler
- * queue depth. This is similar to what the old code would do.
+ * Shared tags, for sched tags, we allocate max initially hence
+ * tags can't grow, see blk_mq_alloc_sched_tags().
*/
- if (hctx->sched_tags) {
- ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
- nr, true);
- } else {
- ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
- false);
+ if (q->elevator)
+ blk_mq_tag_update_sched_shared_tags(q, nr);
+ else
+ blk_mq_tag_resize_shared_tags(set, nr);
+ } else if (!q->elevator) {
+ /*
+ * Non-shared hardware tags, nr is already checked from
+ * queue_requests_store() and tags can't grow.
+ */
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (!hctx->tags)
+ continue;
+ sbitmap_queue_resize(&hctx->tags->bitmap_tags,
+ nr - hctx->tags->nr_reserved_tags);
}
- if (ret)
- break;
- if (q->elevator && q->elevator->type->ops.depth_updated)
- q->elevator->type->ops.depth_updated(hctx);
- }
- if (!ret) {
- q->nr_requests = nr;
- if (blk_mq_is_shared_tags(set->flags)) {
- if (q->elevator)
- blk_mq_tag_update_sched_shared_tags(q);
- else
- blk_mq_tag_resize_shared_tags(set, nr);
+ } else if (nr <= q->elevator->et->nr_requests) {
+ /* Non-shared sched tags, and tags don't grow. */
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (!hctx->sched_tags)
+ continue;
+ sbitmap_queue_resize(&hctx->sched_tags->bitmap_tags,
+ nr - hctx->sched_tags->nr_reserved_tags);
}
+ } else {
+ /* Non-shared sched tags, and tags grow */
+ queue_for_each_hw_ctx(q, hctx, i)
+ hctx->sched_tags = et->tags[i];
+ old_et = q->elevator->et;
+ q->elevator->et = et;
}
- blk_mq_unquiesce_queue(q);
- blk_mq_unfreeze_queue(q);
+ q->nr_requests = nr;
+ if (q->elevator && q->elevator->type->ops.depth_updated)
+ q->elevator->type->ops.depth_updated(q);
- return ret;
+ blk_mq_unquiesce_queue(q);
+ return old_et;
}
/*
- * request_queue and elevator_type pair.
- * It is just used by __blk_mq_update_nr_hw_queues to cache
- * the elevator_type associated with a request_queue.
+ * Switch back to the elevator type stored in the xarray.
*/
-struct blk_mq_qe_pair {
- struct list_head node;
- struct request_queue *q;
- struct elevator_type *type;
-};
-
-/*
- * Cache the elevator_type in qe pair list and switch the
- * io scheduler to 'none'
- */
-static bool blk_mq_elv_switch_none(struct list_head *head,
- struct request_queue *q)
+static void blk_mq_elv_switch_back(struct request_queue *q,
+ struct xarray *elv_tbl)
{
- struct blk_mq_qe_pair *qe;
-
- qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
- if (!qe)
- return false;
-
- /* q->elevator needs protection from ->sysfs_lock */
- mutex_lock(&q->sysfs_lock);
+ struct elv_change_ctx *ctx = xa_load(elv_tbl, q->id);
- /* the check has to be done with holding sysfs_lock */
- if (!q->elevator) {
- kfree(qe);
- goto unlock;
- }
+ if (WARN_ON_ONCE(!ctx))
+ return;
- INIT_LIST_HEAD(&qe->node);
- qe->q = q;
- qe->type = q->elevator->type;
- /* keep a reference to the elevator module as we'll switch back */
- __elevator_get(qe->type);
- list_add(&qe->node, head);
- elevator_disable(q);
-unlock:
- mutex_unlock(&q->sysfs_lock);
+ /* The elv_update_nr_hw_queues unfreezes the queue. */
+ elv_update_nr_hw_queues(q, ctx);
- return true;
+ /* Drop the reference acquired in blk_mq_elv_switch_none. */
+ if (ctx->type)
+ elevator_put(ctx->type);
}
-static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head,
- struct request_queue *q)
+/*
+ * Stores elevator name and type in ctx and set current elevator to none.
+ */
+static int blk_mq_elv_switch_none(struct request_queue *q,
+ struct xarray *elv_tbl)
{
- struct blk_mq_qe_pair *qe;
+ struct elv_change_ctx *ctx;
- list_for_each_entry(qe, head, node)
- if (qe->q == q)
- return qe;
+ lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock);
- return NULL;
-}
+ /*
+ * Accessing q->elevator without holding q->elevator_lock is safe here
+ * because we're called from nr_hw_queue update which is protected by
+ * set->update_nr_hwq_lock in the writer context. So, scheduler update/
+ * switch code (which acquires the same lock in the reader context)
+ * can't run concurrently.
+ */
+ if (q->elevator) {
+ ctx = xa_load(elv_tbl, q->id);
+ if (WARN_ON_ONCE(!ctx))
+ return -ENOENT;
-static void blk_mq_elv_switch_back(struct list_head *head,
- struct request_queue *q)
-{
- struct blk_mq_qe_pair *qe;
- struct elevator_type *t;
+ ctx->name = q->elevator->type->elevator_name;
- qe = blk_lookup_qe_pair(head, q);
- if (!qe)
- return;
- t = qe->type;
- list_del(&qe->node);
- kfree(qe);
+ /*
+ * Before we switch elevator to 'none', take a reference to
+ * the elevator module so that while nr_hw_queue update is
+ * running, no one can remove elevator module. We'd put the
+ * reference to elevator module later when we switch back
+ * elevator.
+ */
+ __elevator_get(q->elevator->type);
- mutex_lock(&q->sysfs_lock);
- elevator_switch(q, t);
- /* drop the reference acquired in blk_mq_elv_switch_none */
- elevator_put(t);
- mutex_unlock(&q->sysfs_lock);
+ /*
+ * Store elevator type so that we can release the reference
+ * taken above later.
+ */
+ ctx->type = q->elevator->type;
+ elevator_set_none(q);
+ }
+ return 0;
}
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int nr_hw_queues)
{
struct request_queue *q;
- LIST_HEAD(head);
int prev_nr_hw_queues = set->nr_hw_queues;
+ unsigned int memflags;
int i;
+ struct xarray elv_tbl;
+ bool queues_frozen = false;
lockdep_assert_held(&set->tag_list_lock);
@@ -4776,30 +5106,40 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
return;
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_freeze_queue(q);
+ memflags = memalloc_noio_save();
+
+ xa_init(&elv_tbl);
+ if (blk_mq_alloc_sched_ctx_batch(&elv_tbl, set) < 0)
+ goto out_free_ctx;
+
+ if (blk_mq_alloc_sched_res_batch(&elv_tbl, set, nr_hw_queues) < 0)
+ goto out_free_ctx;
+
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ blk_mq_debugfs_unregister_hctxs(q);
+ blk_mq_sysfs_unregister_hctxs(q);
+ }
+
/*
* Switch IO scheduler to 'none', cleaning up the data associated
* with the previous scheduler. We will switch back once we are done
* updating the new sw to hw queue mappings.
*/
list_for_each_entry(q, &set->tag_list, tag_set_list)
- if (!blk_mq_elv_switch_none(&head, q))
+ if (blk_mq_elv_switch_none(q, &elv_tbl))
goto switch_back;
- list_for_each_entry(q, &set->tag_list, tag_set_list) {
- blk_mq_debugfs_unregister_hctxs(q);
- blk_mq_sysfs_unregister_hctxs(q);
- }
-
+ list_for_each_entry(q, &set->tag_list, tag_set_list)
+ blk_mq_freeze_queue_nomemsave(q);
+ queues_frozen = true;
if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
- goto reregister;
+ goto switch_back;
fallback:
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
- blk_mq_realloc_hw_ctxs(set, q);
- blk_mq_update_poll_flag(q);
+ __blk_mq_realloc_hw_ctxs(set, q);
+
if (q->nr_hw_queues != set->nr_hw_queues) {
int i = prev_nr_hw_queues;
@@ -4813,19 +5153,27 @@ fallback:
}
blk_mq_map_swqueue(q);
}
+switch_back:
+ /* The blk_mq_elv_switch_back unfreezes queue for us. */
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ /* switch_back expects queue to be frozen */
+ if (!queues_frozen)
+ blk_mq_freeze_queue_nomemsave(q);
+ blk_mq_elv_switch_back(q, &elv_tbl);
+ }
-reregister:
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_sysfs_register_hctxs(q);
blk_mq_debugfs_register_hctxs(q);
- }
-switch_back:
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_elv_switch_back(&head, q);
+ blk_mq_remove_hw_queues_cpuhp(q);
+ blk_mq_add_hw_queues_cpuhp(q);
+ }
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_unfreeze_queue(q);
+out_free_ctx:
+ blk_mq_free_sched_ctx_batch(&elv_tbl);
+ xa_destroy(&elv_tbl);
+ memalloc_noio_restore(memflags);
/* Free the excess tags when nr_hw_queues shrink. */
for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++)
@@ -4834,9 +5182,11 @@ switch_back:
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
{
+ down_write(&set->update_nr_hwq_lock);
mutex_lock(&set->tag_list_lock);
__blk_mq_update_nr_hw_queues(set, nr_hw_queues);
mutex_unlock(&set->tag_list_lock);
+ up_write(&set->update_nr_hwq_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
@@ -4870,9 +5220,9 @@ static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie,
struct io_comp_batch *iob, unsigned int flags)
{
- struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, cookie);
-
- return blk_hctx_poll(q, hctx, iob, flags);
+ if (!blk_mq_can_poll(q))
+ return 0;
+ return blk_hctx_poll(q, q->queue_hw_ctx[cookie], iob, flags);
}
int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,