From 748dc0b65ec2b4b7b3dbd7befcc4a54fdcac7988 Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Wed, 10 Jan 2024 18:29:42 +0900
Subject: block: fix partial zone append completion handling in req_bio_endio()

Partial completions of zone append requests are not allowed, but if a
zone append completion indicates a number of completed bytes different
from the original BIO size, only the BIO status is set to error. This
leads to bio_advance() not setting the BIO size to 0, and thus to
bio_endio() not being called at the end of req_bio_endio().

Make sure a partially completed zone append is failed and completed
immediately by forcing the completed number of bytes (nbytes) to be
equal to the BIO size, thus ensuring that bio_endio() is called.

Fixes: 297db731847e ("block: fix req_bio_endio append error handling")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal
Reviewed-by: Christoph Hellwig
Reviewed-by: Johannes Thumshirn
Reviewed-by: Hannes Reinecke
Link: https://lore.kernel.org/r/20240110092942.442334-1-dlemoal@kernel.org
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index fb29ff5cc281..aa9a05fdd023 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -772,11 +772,16 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 		/*
 		 * Partial zone append completions cannot be supported as the
 		 * BIO fragments may end up not being written sequentially.
+		 * For such case, force the completed nbytes to be equal to
+		 * the BIO size so that bio_advance() sets the BIO remaining
+		 * size to 0 and we end up calling bio_endio() before returning.
 		 */
-		if (bio->bi_iter.bi_size != nbytes)
+		if (bio->bi_iter.bi_size != nbytes) {
 			bio->bi_status = BLK_STS_IOERR;
-		else
+			nbytes = bio->bi_iter.bi_size;
+		} else {
 			bio->bi_iter.bi_sector = rq->__sector;
+		}
 	}
 
 	bio_advance(bio, nbytes);
--
cgit
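The effect of this fix is easier to see outside the kernel. Below is a
minimal userspace C model of the completion logic above; all names and
the simplified flow are illustrative stand-ins, not the kernel's actual
types or functions:

/*
 * build: cc -o zone_append_model zone_append_model.c
 * Hypothetical userspace model -- not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

struct model_bio {
	unsigned int remaining;	/* stands in for bio->bi_iter.bi_size */
	bool error;		/* stands in for bio->bi_status != BLK_STS_OK */
	bool ended;		/* set once the end_io callback has run */
};

/* Models bio_endio(): the completion that must always happen. */
static void model_bio_endio(struct model_bio *bio)
{
	bio->ended = true;
}

/* Models the fixed req_bio_endio() path for a zone append request. */
static void model_req_bio_endio(struct model_bio *bio, unsigned int nbytes)
{
	if (bio->remaining != nbytes) {
		bio->error = true;
		/* The fix: complete the whole BIO, not just a part of it. */
		nbytes = bio->remaining;
	}
	bio->remaining -= nbytes;		/* models bio_advance() */
	if (bio->remaining == 0)
		model_bio_endio(bio);
}

int main(void)
{
	struct model_bio bio = { .remaining = 4096 };

	/* The device reports only 2048 of 4096 bytes completed. */
	model_req_bio_endio(&bio, 2048);
	printf("ended=%d error=%d remaining=%u\n",
	       bio.ended, bio.error, bio.remaining);
	return 0;
}

With the forced nbytes, this prints ended=1 error=1 remaining=0. Without
it, remaining would stay at 2048 and the completion callback would never
run, which is exactly the stall the patch fixes.
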
From 5266caaf5660529e3da53004b8b7174cab6374ed Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 12 Jan 2024 20:26:26 +0800
Subject: blk-mq: fix IO hang from sbitmap wakeup race

In blk_mq_mark_tag_wait(), __add_wait_queue() may be re-ordered with
the following blk_mq_get_driver_tag() when getting a driver tag fails.
Then waitqueue_active() in __sbitmap_queue_wake_up() may not observe
the waiter added by blk_mq_mark_tag_wait() and wakes up nothing, while
blk_mq_mark_tag_wait() cannot get a driver tag either. The result is an
IO hang.

This issue can be reproduced by running the following test in a loop;
an fio hang can be observed in under 30 minutes when running it on my
laptop test VM:

    modprobe -r scsi_debug
    modprobe scsi_debug delay=0 dev_size_mb=4096 max_queue=1 host_max_queue=1 submit_queues=4
    dev=`ls -d /sys/bus/pseudo/drivers/scsi_debug/adapter*/host*/target*/*/block/* | head -1 | xargs basename`
    fio --filename=/dev/"$dev" --direct=1 --rw=randrw --bs=4k --iodepth=1 \
        --runtime=100 --numjobs=40 --time_based --name=test \
        --ioengine=libaio

Fix the issue by adding an explicit barrier in blk_mq_mark_tag_wait(),
which is acceptable since this is the slow path taken when we have run
out of tags.

Cc: Jan Kara
Cc: Kemeng Shi
Reported-by: Changhui Zhong
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20240112122626.4181044-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index aa9a05fdd023..a4c54c5895a1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1852,6 +1852,22 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
 	wait->flags &= ~WQ_FLAG_EXCLUSIVE;
 	__add_wait_queue(wq, wait);
 
+	/*
+	 * Add one explicit barrier since blk_mq_get_driver_tag() may
+	 * not imply barrier in case of failure.
+	 *
+	 * Order adding us to wait queue and allocating driver tag.
+	 *
+	 * The pair is the one implied in sbitmap_queue_wake_up() which
+	 * orders clearing sbitmap tag bits and waitqueue_active() in
+	 * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless
+	 *
+	 * Otherwise, re-order of adding wait queue and getting driver tag
+	 * may cause __sbitmap_queue_wake_up() to wake up nothing because
+	 * the waitqueue_active() may not observe us in wait queue.
+	 */
+	smp_mb();
+
 	/*
 	 * It's possible that a tag was freed in the window between the
 	 * allocation failure and adding the hardware queue to the wait
--
cgit
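The race fixed above is the classic store-buffering pattern: each side
writes one flag and then reads the other side's flag. A userspace sketch
using C11 atomics follows; the names are illustrative, not the kernel
API, and the fences stand in for smp_mb() and the barrier implied in
sbitmap_queue_wake_up():

/*
 * build: cc -pthread -o sb_race_model sb_race_model.c
 * Hypothetical userspace sketch -- not the kernel API.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int on_waitqueue;	/* models "waiter is on the wait queue" */
static atomic_int tag_free;	/* models a freed sbitmap tag bit */
static int got_tag, saw_waiter;

/* Models blk_mq_mark_tag_wait(): enqueue, barrier, retry the tag. */
static void *waiter(void *arg)
{
	(void)arg;
	atomic_store_explicit(&on_waitqueue, 1, memory_order_relaxed);
	/* The added smp_mb(): order enqueue before the tag re-check. */
	atomic_thread_fence(memory_order_seq_cst);
	got_tag = atomic_load_explicit(&tag_free, memory_order_relaxed);
	return NULL;
}

/* Models the tag-free path: free the tag, barrier, check for waiters. */
static void *waker(void *arg)
{
	(void)arg;
	atomic_store_explicit(&tag_free, 1, memory_order_relaxed);
	/* Pairs with the barrier implied in sbitmap_queue_wake_up(). */
	atomic_thread_fence(memory_order_seq_cst);
	saw_waiter = atomic_load_explicit(&on_waitqueue, memory_order_relaxed);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, waiter, NULL);
	pthread_create(&b, NULL, waker, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	/* With both fences, got_tag == 0 && saw_waiter == 0 cannot happen. */
	printf("got_tag=%d saw_waiter=%d\n", got_tag, saw_waiter);
	return 0;
}

With both fences in place, at least one of got_tag and saw_waiter is 1
on every run, so someone always makes progress. Removing the fences
makes the 0/0 outcome possible on weakly ordered CPUs: the lost wakeup
that hangs IO.
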
From 309ce6741430b5a7b5e85cd1a7569647f8d9dfa6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 11 Jan 2024 14:57:04 +0100
Subject: blk-mq: rename blk_mq_can_use_cached_rq

blk_mq_can_use_cached_rq doesn't just check if we can use the request,
but also performs the work to actually use it. Remove the _can in the
name, and improve the comment describing the function.

Signed-off-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20240111135705.2155518-2-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a4c54c5895a1..f57b86d6de6a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2900,8 +2900,11 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
 	return NULL;
 }
 
-/* return true if this @rq can be used for @bio */
-static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug,
+/*
+ * Check if we can use the passed on request for submitting the passed in bio,
+ * and remove it from the request list if it can be used.
+ */
+static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
 		struct bio *bio)
 {
 	enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf);
@@ -2979,7 +2982,7 @@ void blk_mq_submit_bio(struct bio *bio)
 			return;
 		if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
 			return;
-		if (blk_mq_can_use_cached_rq(rq, plug, bio))
+		if (blk_mq_use_cached_rq(rq, plug, bio))
 			goto done;
 		percpu_ref_get(&q->q_usage_counter);
 	} else {
--
cgit

From 7b4f36cd22a65b750b4cb6ac14804fb7d6e6c67d Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 12 Jan 2024 09:12:20 -0700
Subject: block: ensure we hold a queue reference when using queue limits

q_usage_counter is the only thing preventing the queue limits from
changing under us in __bio_split_to_limits, but blk_mq_submit_bio
doesn't hold it while calling into it.

Move the splitting inside the region where we know we've got a queue
reference. Ideally this could still remain a shared section of code,
but let's keep the fix simple and defer any refactoring here to later.

Reported-by: Christoph Hellwig
Fixes: 900e08075202 ("block: move queue enter logic into blk_mq_submit_bio()")
Reviewed-by: Christoph Hellwig
Reviewed-by: Ming Lei
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f57b86d6de6a..e02c4b1af8c5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2964,12 +2964,6 @@ void blk_mq_submit_bio(struct bio *bio)
 	blk_status_t ret;
 
 	bio = blk_queue_bounce(bio, q);
-	if (bio_may_exceed_limits(bio, &q->limits)) {
-		bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
-		if (!bio)
-			return;
-	}
-
 	bio_set_ioprio(bio);
 
 	if (plug) {
@@ -2978,6 +2972,11 @@ void blk_mq_submit_bio(struct bio *bio)
 			rq = NULL;
 	}
 	if (rq) {
+		if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
+			bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
+			if (!bio)
+				return;
+		}
 		if (!bio_integrity_prep(bio))
 			return;
 		if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
@@ -2988,6 +2987,11 @@
 	} else {
 		if (unlikely(bio_queue_enter(bio)))
 			return;
+		if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
+			bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
+			if (!bio)
+				goto fail;
+		}
 		if (!bio_integrity_prep(bio))
 			goto fail;
 	}
--
cgit
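The invariant this last patch restores, that queue limits are only read
while a queue usage reference is held, can be sketched in userspace with
a reader-writer lock standing in for the reference count. Everything
below is an illustrative model under that assumption; the kernel uses
percpu_ref and blk_queue_enter(), not a rwlock:

/*
 * build: cc -pthread -o limits_model limits_model.c
 * Hypothetical userspace model -- not kernel code.
 */
#include <pthread.h>
#include <stdio.h>

struct model_queue {
	pthread_rwlock_t ref;		/* read-held = usage reference held */
	unsigned int max_sectors;	/* stands in for q->limits */
};

/* Models bio_queue_enter()/percpu_ref_get(): blocks limit updates. */
static void model_queue_enter(struct model_queue *q)
{
	pthread_rwlock_rdlock(&q->ref);
}

static void model_queue_exit(struct model_queue *q)
{
	pthread_rwlock_unlock(&q->ref);
}

/* Models __bio_split_to_limits(): only safe with the reference held. */
static unsigned int model_split_to_limits(struct model_queue *q,
					  unsigned int sectors)
{
	return sectors > q->max_sectors ? q->max_sectors : sectors;
}

int main(void)
{
	struct model_queue q = {
		.ref = PTHREAD_RWLOCK_INITIALIZER,
		.max_sectors = 128,
	};
	unsigned int first;

	model_queue_enter(&q);			/* take the reference first */
	first = model_split_to_limits(&q, 512);	/* ...then read the limits */
	model_queue_exit(&q);
	printf("first fragment: %u sectors\n", first);
	return 0;
}

In this model, a limits updater would take the write side of the lock,
so it can never run while a submitter is between model_queue_enter() and
model_queue_exit(), mirroring how splitting only inside the region that
holds q_usage_counter keeps q->limits stable.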