summaryrefslogtreecommitdiff
path: root/block/blk-mq.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-06-30 12:12:56 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-06-30 12:12:56 -0700
commitdf668a5fe461bb9d7e899c538acc7197746038f4 (patch)
tree315a71104f5cea7feeb56c9f2c768453408b72f7 /block/blk-mq.c
parentdf04fbe8680bfe07f3d7487eccff9f768bb02533 (diff)
parent2705dfb2094777e405e065105e307074af8965c1 (diff)
Merge tag 'for-5.14/block-2021-06-29' of git://git.kernel.dk/linux-block
Pull core block updates from Jens Axboe: - disk events cleanup (Christoph) - gendisk and request queue allocation simplifications (Christoph) - bdev_disk_changed cleanups (Christoph) - IO priority improvements (Bart) - Chained bio completion trace fix (Edward) - blk-wbt fixes (Jan) - blk-wbt enable/disable fix (Zhang) - Scheduler dispatch improvements (Jan, Ming) - Shared tagset scheduler improvements (John) - BFQ updates (Paolo, Luca, Pietro) - BFQ lock inversion fix (Jan) - Documentation improvements (Kir) - CLONE_IO block cgroup fix (Tejun) - Remove of ancient and deprecated block dump feature (zhangyi) - Discard merge fix (Ming) - Misc fixes or followup fixes (Colin, Damien, Dan, Long, Max, Thomas, Yang) * tag 'for-5.14/block-2021-06-29' of git://git.kernel.dk/linux-block: (129 commits) block: fix discard request merge block/mq-deadline: Remove a WARN_ON_ONCE() call blk-mq: update hctx->dispatch_busy in case of real scheduler blk: Fix lock inversion between ioc lock and bfqd lock bfq: Remove merged request already in bfq_requests_merged() block: pass a gendisk to bdev_disk_changed block: move bdev_disk_changed block: add the events* attributes to disk_attrs block: move the disk events code to a separate file block: fix trace completion for chained bio block/partitions/msdos: Fix typo inidicator -> indicator block, bfq: reset waker pointer with shared queues block, bfq: check waker only for queues with no in-flight I/O block, bfq: avoid delayed merge of async queues block, bfq: boost throughput by extending queue-merging times block, bfq: consider also creation time in delayed stable merge block, bfq: fix delayed stable merge check block, bfq: let also stably merged queues enjoy weight raising blk-wbt: make sure throttle is enabled properly blk-wbt: introduce a new disable state to prevent false positive by rwb_enabled() ...
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--block/blk-mq.c206
1 files changed, 139 insertions, 67 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e41edae97487..2e9fd0ec63d7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -909,6 +909,14 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
return false;
}
+void blk_mq_put_rq_ref(struct request *rq)
+{
+ if (is_flush_rq(rq, rq->mq_hctx))
+ rq->end_io(rq, 0);
+ else if (refcount_dec_and_test(&rq->ref))
+ __blk_mq_free_request(rq);
+}
+
static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv, bool reserved)
{
@@ -942,11 +950,7 @@ static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
if (blk_mq_req_expired(rq, next))
blk_mq_rq_timed_out(rq, reserved);
- if (is_flush_rq(rq, hctx))
- rq->end_io(rq, 0);
- else if (refcount_dec_and_test(&rq->ref))
- __blk_mq_free_request(rq);
-
+ blk_mq_put_rq_ref(rq);
return true;
}
@@ -1100,7 +1104,7 @@ static bool __blk_mq_get_driver_tag(struct request *rq)
return true;
}
-static bool blk_mq_get_driver_tag(struct request *rq)
+bool blk_mq_get_driver_tag(struct request *rq)
{
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
@@ -1220,9 +1224,6 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
{
unsigned int ewma;
- if (hctx->queue->elevator)
- return;
-
ewma = hctx->dispatch_busy;
if (!ewma && !busy)
@@ -2303,6 +2304,45 @@ queue_exit:
return BLK_QC_T_NONE;
}
+static size_t order_to_size(unsigned int order)
+{
+ return (size_t)PAGE_SIZE << order;
+}
+
+/* called before freeing request pool in @tags */
+static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
+ struct blk_mq_tags *tags, unsigned int hctx_idx)
+{
+ struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
+ struct page *page;
+ unsigned long flags;
+
+ list_for_each_entry(page, &tags->page_list, lru) {
+ unsigned long start = (unsigned long)page_address(page);
+ unsigned long end = start + order_to_size(page->private);
+ int i;
+
+ for (i = 0; i < set->queue_depth; i++) {
+ struct request *rq = drv_tags->rqs[i];
+ unsigned long rq_addr = (unsigned long)rq;
+
+ if (rq_addr >= start && rq_addr < end) {
+ WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
+ cmpxchg(&drv_tags->rqs[i], rq, NULL);
+ }
+ }
+ }
+
+ /*
+ * Wait until all pending iteration is done.
+ *
+ * Request reference is cleared and it is guaranteed to be observed
+ * after the ->lock is released.
+ */
+ spin_lock_irqsave(&drv_tags->lock, flags);
+ spin_unlock_irqrestore(&drv_tags->lock, flags);
+}
+
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx)
{
@@ -2321,6 +2361,8 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
}
}
+ blk_mq_clear_rq_mapping(set, tags, hctx_idx);
+
while (!list_empty(&tags->page_list)) {
page = list_first_entry(&tags->page_list, struct page, lru);
list_del_init(&page->lru);
@@ -2380,11 +2422,6 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
return tags;
}
-static size_t order_to_size(unsigned int order)
-{
- return (size_t)PAGE_SIZE << order;
-}
-
static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
unsigned int hctx_idx, int node)
{
@@ -2603,16 +2640,49 @@ static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
&hctx->cpuhp_dead);
}
+/*
+ * Before freeing hw queue, clearing the flush request reference in
+ * tags->rqs[] for avoiding potential UAF.
+ */
+static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
+ unsigned int queue_depth, struct request *flush_rq)
+{
+ int i;
+ unsigned long flags;
+
+ /* The hw queue may not be mapped yet */
+ if (!tags)
+ return;
+
+ WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0);
+
+ for (i = 0; i < queue_depth; i++)
+ cmpxchg(&tags->rqs[i], flush_rq, NULL);
+
+ /*
+ * Wait until all pending iteration is done.
+ *
+ * Request reference is cleared and it is guaranteed to be observed
+ * after the ->lock is released.
+ */
+ spin_lock_irqsave(&tags->lock, flags);
+ spin_unlock_irqrestore(&tags->lock, flags);
+}
+
/* hctx->ctxs will be freed in queue's release handler */
static void blk_mq_exit_hctx(struct request_queue *q,
struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
+ struct request *flush_rq = hctx->fq->flush_rq;
+
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_idle(hctx);
+ blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
+ set->queue_depth, flush_rq);
if (set->ops->exit_request)
- set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
+ set->ops->exit_request(set, flush_rq, hctx_idx);
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
@@ -3042,21 +3112,18 @@ void blk_mq_release(struct request_queue *q)
struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
void *queuedata)
{
- struct request_queue *uninit_q, *q;
+ struct request_queue *q;
+ int ret;
- uninit_q = blk_alloc_queue(set->numa_node);
- if (!uninit_q)
+ q = blk_alloc_queue(set->numa_node);
+ if (!q)
return ERR_PTR(-ENOMEM);
- uninit_q->queuedata = queuedata;
-
- /*
- * Initialize the queue without an elevator. device_add_disk() will do
- * the initialization.
- */
- q = blk_mq_init_allocated_queue(set, uninit_q, false);
- if (IS_ERR(q))
- blk_cleanup_queue(uninit_q);
-
+ q->queuedata = queuedata;
+ ret = blk_mq_init_allocated_queue(set, q);
+ if (ret) {
+ blk_cleanup_queue(q);
+ return ERR_PTR(ret);
+ }
return q;
}
EXPORT_SYMBOL_GPL(blk_mq_init_queue_data);
@@ -3067,39 +3134,24 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
}
EXPORT_SYMBOL(blk_mq_init_queue);
-/*
- * Helper for setting up a queue with mq ops, given queue depth, and
- * the passed in mq ops flags.
- */
-struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
- const struct blk_mq_ops *ops,
- unsigned int queue_depth,
- unsigned int set_flags)
+struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata)
{
struct request_queue *q;
- int ret;
+ struct gendisk *disk;
- memset(set, 0, sizeof(*set));
- set->ops = ops;
- set->nr_hw_queues = 1;
- set->nr_maps = 1;
- set->queue_depth = queue_depth;
- set->numa_node = NUMA_NO_NODE;
- set->flags = set_flags;
-
- ret = blk_mq_alloc_tag_set(set);
- if (ret)
- return ERR_PTR(ret);
+ q = blk_mq_init_queue_data(set, queuedata);
+ if (IS_ERR(q))
+ return ERR_CAST(q);
- q = blk_mq_init_queue(set);
- if (IS_ERR(q)) {
- blk_mq_free_tag_set(set);
- return q;
+ disk = __alloc_disk_node(0, set->numa_node);
+ if (!disk) {
+ blk_cleanup_queue(q);
+ return ERR_PTR(-ENOMEM);
}
-
- return q;
+ disk->queue = q;
+ return disk;
}
-EXPORT_SYMBOL(blk_mq_init_sq_queue);
+EXPORT_SYMBOL(__blk_mq_alloc_disk);
static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
struct blk_mq_tag_set *set, struct request_queue *q,
@@ -3212,9 +3264,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
mutex_unlock(&q->sysfs_lock);
}
-struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
- struct request_queue *q,
- bool elevator_init)
+int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
+ struct request_queue *q)
{
/* mark the queue as mq asap */
q->mq_ops = set->ops;
@@ -3264,11 +3315,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
blk_mq_add_queue_tag_set(set, q);
blk_mq_map_swqueue(q);
-
- if (elevator_init)
- elevator_init_mq(q);
-
- return q;
+ return 0;
err_hctxs:
kfree(q->queue_hw_ctx);
@@ -3279,7 +3326,7 @@ err_poll:
q->poll_cb = NULL;
err_exit:
q->mq_ops = NULL;
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
}
EXPORT_SYMBOL(blk_mq_init_allocated_queue);
@@ -3491,7 +3538,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (blk_mq_is_sbitmap_shared(set->flags)) {
atomic_set(&set->active_queues_shared_sbitmap, 0);
- if (blk_mq_init_shared_sbitmap(set, set->flags)) {
+ if (blk_mq_init_shared_sbitmap(set)) {
ret = -ENOMEM;
goto out_free_mq_rq_maps;
}
@@ -3516,6 +3563,22 @@ out_free_mq_map:
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);
+/* allocate and initialize a tagset for a simple single-queue device */
+int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
+ const struct blk_mq_ops *ops, unsigned int queue_depth,
+ unsigned int set_flags)
+{
+ memset(set, 0, sizeof(*set));
+ set->ops = ops;
+ set->nr_hw_queues = 1;
+ set->nr_maps = 1;
+ set->queue_depth = queue_depth;
+ set->numa_node = NUMA_NO_NODE;
+ set->flags = set_flags;
+ return blk_mq_alloc_tag_set(set);
+}
+EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set);
+
void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
int i, j;
@@ -3567,15 +3630,24 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
} else {
ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
nr, true);
+ if (blk_mq_is_sbitmap_shared(set->flags)) {
+ hctx->sched_tags->bitmap_tags =
+ &q->sched_bitmap_tags;
+ hctx->sched_tags->breserved_tags =
+ &q->sched_breserved_tags;
+ }
}
if (ret)
break;
if (q->elevator && q->elevator->type->ops.depth_updated)
q->elevator->type->ops.depth_updated(hctx);
}
-
- if (!ret)
+ if (!ret) {
q->nr_requests = nr;
+ if (q->elevator && blk_mq_is_sbitmap_shared(set->flags))
+ sbitmap_queue_resize(&q->sched_bitmap_tags,
+ nr - set->reserved_tags);
+ }
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);