summary | refs | log | tree | commit | diff
path: root/block/blk-mq.c
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2017-03-03 10:53:35 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2017-03-03 10:53:35 -0800
commit: e0d072250a54669dce876d8ade70e417356aae74 (patch)
tree: ecbb2fc170349231f3885749f07748779225805f /block/blk-mq.c
parent: 1827adb11ad26b2290dc9fe2aaf54976b2439865 (diff)
parent: 165a5e22fafb127ecb5914e12e8c32a1f0d3f820 (diff)
Merge branch 'for-linus' of git://git.kernel.dk/linux-block
Pull block layer fixes from Jens Axboe:
"A collection of fixes for this merge window, either fixes for existing
issues, or parts that were waiting for acks to come in. This pull
request contains:

- Allocation of nvme queues on the right node from Shaohua.

  This was ready long before the merge window, but waiting on an ack
  from Bjorn on the PCI bit. Now that we have that, the three patches
  can go in.

- Two fixes for blk-mq-sched with nvmeof, which uses hctx specific
  request allocations. This caused an oops. One part from Sagi, one
  part from Omar.

- A loop partition scan deadlock fix from Omar, fixing a regression in
  this merge window.

- A three-patch series from Keith, closing up a hole on clearing out
  requests on shutdown/resume.

- A stable fix for nbd from Josef, fixing a leak of sockets.

- Two fixes for a regression in this window from Jan, fixing a problem
  with one of his earlier patches dealing with queue vs bdi life times.

- A fix for a regression with virtio-blk, causing an IO stall if
  scheduling is used. From me.

- A fix for an io context lock ordering problem. From me"

* 'for-linus' of git://git.kernel.dk/linux-block:
  block: Move bdi_unregister() to del_gendisk()
  blk-mq: ensure that bd->last is always set correctly
  block: don't call ioc_exit_icq() with the queue lock held for blk-mq
  block: Initialize bd_bdi on inode initialization
  loop: fix LO_FLAGS_PARTSCAN hang
  nvme: Complete all stuck requests
  blk-mq: Provide freeze queue timeout
  blk-mq: Export blk_mq_freeze_queue_wait
  nbd: stop leaking sockets
  blk-mq: move update of tags->rqs to __blk_mq_alloc_request()
  blk-mq: kill blk_mq_set_alloc_data()
  blk-mq: make blk_mq_alloc_request_hctx() allocate a scheduler request
  blk-mq-sched: Allocate sched reserved tags as specified in the original queue tagset
  nvme: allocate nvme_queue in correct node
  PCI: add an API to get node from vector
  blk-mq: allocate blk_mq_tags and requests in correct node
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r-- block/blk-mq.c 120
1 files changed, 88 insertions, 32 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6f35b6fd4799..b2fd175e84d7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -77,10 +77,20 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
-static void blk_mq_freeze_queue_wait(struct request_queue *q)
+void blk_mq_freeze_queue_wait(struct request_queue *q)
{
wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
}
+EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
+
+int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
+ unsigned long timeout)
+{
+ return wait_event_timeout(q->mq_freeze_wq,
+ percpu_ref_is_zero(&q->q_usage_counter),
+ timeout);
+}
+EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
/*
* Guarantee no request is in use, so we can change any data structure of
@@ -236,6 +246,7 @@ struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
}
rq->tag = tag;
rq->internal_tag = -1;
+ data->hctx->tags->rqs[rq->tag] = rq;
}
blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
@@ -275,10 +286,9 @@ EXPORT_SYMBOL(blk_mq_alloc_request);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
unsigned int flags, unsigned int hctx_idx)
{
- struct blk_mq_hw_ctx *hctx;
- struct blk_mq_ctx *ctx;
+ struct blk_mq_alloc_data alloc_data = { .flags = flags };
struct request *rq;
- struct blk_mq_alloc_data alloc_data;
+ unsigned int cpu;
int ret;
/*
@@ -301,25 +311,23 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
* Check if the hardware context is actually mapped to anything.
* If not tell the caller that it should skip this queue.
*/
- hctx = q->queue_hw_ctx[hctx_idx];
- if (!blk_mq_hw_queue_mapped(hctx)) {
- ret = -EXDEV;
- goto out_queue_exit;
+ alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
+ if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
+ blk_queue_exit(q);
+ return ERR_PTR(-EXDEV);
}
- ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask));
+ cpu = cpumask_first(alloc_data.hctx->cpumask);
+ alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
- blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
- rq = __blk_mq_alloc_request(&alloc_data, rw);
- if (!rq) {
- ret = -EWOULDBLOCK;
- goto out_queue_exit;
- }
-
- return rq;
+ rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
-out_queue_exit:
+ blk_mq_put_ctx(alloc_data.ctx);
blk_queue_exit(q);
- return ERR_PTR(ret);
+
+ if (!rq)
+ return ERR_PTR(-EWOULDBLOCK);
+
+ return rq;
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
@@ -854,6 +862,9 @@ done:
return true;
}
+ if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
+ data.flags |= BLK_MQ_REQ_RESERVED;
+
rq->tag = blk_mq_get_tag(&data);
if (rq->tag >= 0) {
if (blk_mq_tag_busy(data.hctx)) {
@@ -867,12 +878,9 @@ done:
return false;
}
-static void blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
- struct request *rq)
+static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
+ struct request *rq)
{
- if (rq->tag == -1 || rq->internal_tag == -1)
- return;
-
blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
rq->tag = -1;
@@ -882,6 +890,26 @@ static void blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
}
}
+static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
+ struct request *rq)
+{
+ if (rq->tag == -1 || rq->internal_tag == -1)
+ return;
+
+ __blk_mq_put_driver_tag(hctx, rq);
+}
+
+static void blk_mq_put_driver_tag(struct request *rq)
+{
+ struct blk_mq_hw_ctx *hctx;
+
+ if (rq->tag == -1 || rq->internal_tag == -1)
+ return;
+
+ hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
+ __blk_mq_put_driver_tag(hctx, rq);
+}
+
/*
* If we fail getting a driver tag because all the driver tags are already
* assigned and on the dispatch list, BUT the first entry does not have a
@@ -991,7 +1019,19 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
bd.rq = rq;
bd.list = dptr;
- bd.last = list_empty(list);
+
+ /*
+ * Flag last if we have no more requests, or if we have more
+ * but can't assign a driver tag to it.
+ */
+ if (list_empty(list))
+ bd.last = true;
+ else {
+ struct request *nxt;
+
+ nxt = list_first_entry(list, struct request, queuelist);
+ bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
+ }
ret = q->mq_ops->queue_rq(hctx, &bd);
switch (ret) {
@@ -999,7 +1039,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
queued++;
break;
case BLK_MQ_RQ_QUEUE_BUSY:
- blk_mq_put_driver_tag(hctx, rq);
+ blk_mq_put_driver_tag_hctx(hctx, rq);
list_add(&rq->queuelist, list);
__blk_mq_requeue_request(rq);
break;
@@ -1029,6 +1069,13 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
* that is where we will continue on next queue run.
*/
if (!list_empty(list)) {
+ /*
+ * If we got a driver tag for the next request already,
+ * free it again.
+ */
+ rq = list_first_entry(list, struct request, queuelist);
+ blk_mq_put_driver_tag(rq);
+
spin_lock(&hctx->lock);
list_splice_init(list, &hctx->dispatch);
spin_unlock(&hctx->lock);
@@ -1715,16 +1762,20 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
unsigned int reserved_tags)
{
struct blk_mq_tags *tags;
+ int node;
- tags = blk_mq_init_tags(nr_tags, reserved_tags,
- set->numa_node,
+ node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
+ if (node == NUMA_NO_NODE)
+ node = set->numa_node;
+
+ tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
if (!tags)
return NULL;
tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
- set->numa_node);
+ node);
if (!tags->rqs) {
blk_mq_free_tags(tags);
return NULL;
@@ -1732,7 +1783,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
- set->numa_node);
+ node);
if (!tags->static_rqs) {
kfree(tags->rqs);
blk_mq_free_tags(tags);
@@ -1752,6 +1803,11 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
{
unsigned int i, j, entries_per_page, max_order = 4;
size_t rq_size, left;
+ int node;
+
+ node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
+ if (node == NUMA_NO_NODE)
+ node = set->numa_node;
INIT_LIST_HEAD(&tags->page_list);
@@ -1773,7 +1829,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
this_order--;
do {
- page = alloc_pages_node(set->numa_node,
+ page = alloc_pages_node(node,
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
this_order);
if (page)
@@ -1806,7 +1862,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
if (set->ops->init_request) {
if (set->ops->init_request(set->driver_data,
rq, hctx_idx, i,
- set->numa_node)) {
+ node)) {
tags->static_rqs[i] = NULL;
goto fail;
}