From 0df71650c051ab106c921de257f4b38e9e3dd251 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 24 Mar 2022 16:35:24 -0400 Subject: block: allow using the per-cpu bio cache from bio_alloc_bioset Replace the BIO_PERCPU_CACHE bio-internal flag with a REQ_ALLOC_CACHE one that can be passed to bio_alloc / bio_alloc_bioset, and implement the percpu cache allocation logic in a helper called from bio_alloc_bioset. This allows any bio_alloc_bioset user to use the percpu caches instead of having the functionality tied to struct kiocb. Signed-off-by: Mike Snitzer [hch: refactored a bit] Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220324203526.62306-2-snitzer@kernel.org Signed-off-by: Jens Axboe --- block/bio.c | 86 +++++++++++++++++++++++++++++------------------------------- block/blk.h | 3 +-- block/fops.c | 11 +++++--- 3 files changed, 50 insertions(+), 50 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 4259125e16ab..c67f47c6cc8c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -419,6 +419,28 @@ static void punt_bios_to_rescuer(struct bio_set *bs) queue_work(bs->rescue_workqueue, &bs->rescue_work); } +static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, + unsigned short nr_vecs, unsigned int opf, gfp_t gfp, + struct bio_set *bs) +{ + struct bio_alloc_cache *cache; + struct bio *bio; + + cache = per_cpu_ptr(bs->cache, get_cpu()); + if (!cache->free_list) { + put_cpu(); + return NULL; + } + bio = cache->free_list; + cache->free_list = bio->bi_next; + cache->nr--; + put_cpu(); + + bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs, opf); + bio->bi_pool = bs; + return bio; +} + /** * bio_alloc_bioset - allocate a bio for I/O * @bdev: block device to allocate the bio for (can be %NULL) @@ -451,6 +473,9 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad * for per bio allocations. * + * If REQ_ALLOC_CACHE is set, the final put of the bio MUST be done from process + * context, not hard/soft IRQ. + * * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, @@ -465,6 +490,21 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0)) return NULL; + if (opf & REQ_ALLOC_CACHE) { + if (bs->cache && nr_vecs <= BIO_INLINE_VECS) { + bio = bio_alloc_percpu_cache(bdev, nr_vecs, opf, + gfp_mask, bs); + if (bio) + return bio; + /* + * No cached bio available, bio returned below marked with + * REQ_ALLOC_CACHE to particpate in per-cpu alloc cache. + */ + } else { + opf &= ~REQ_ALLOC_CACHE; + } + } + /* * submit_bio_noacct() converts recursion to iteration; this means if * we're running beneath it, any bios we allocate and submit will not be @@ -711,7 +751,7 @@ void bio_put(struct bio *bio) return; } - if (bio_flagged(bio, BIO_PERCPU_CACHE)) { + if (bio->bi_opf & REQ_ALLOC_CACHE) { struct bio_alloc_cache *cache; bio_uninit(bio); @@ -1732,50 +1772,6 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src) } EXPORT_SYMBOL(bioset_init_from_src); -/** - * bio_alloc_kiocb - Allocate a bio from bio_set based on kiocb - * @kiocb: kiocb describing the IO - * @bdev: block device to allocate the bio for (can be %NULL) - * @nr_vecs: number of iovecs to pre-allocate - * @opf: operation and flags for bio - * @bs: bio_set to allocate from - * - * Description: - * Like @bio_alloc_bioset, but pass in the kiocb. The kiocb is only - * used to check if we should dip into the per-cpu bio_set allocation - * cache. The allocation uses GFP_KERNEL internally. On return, the - * bio is marked BIO_PERCPU_CACHEABLE, and the final put of the bio - * MUST be done from process context, not hard/soft IRQ. - * - */ -struct bio *bio_alloc_kiocb(struct kiocb *kiocb, struct block_device *bdev, - unsigned short nr_vecs, unsigned int opf, struct bio_set *bs) -{ - struct bio_alloc_cache *cache; - struct bio *bio; - - if (!(kiocb->ki_flags & IOCB_ALLOC_CACHE) || nr_vecs > BIO_INLINE_VECS) - return bio_alloc_bioset(bdev, nr_vecs, opf, GFP_KERNEL, bs); - - cache = per_cpu_ptr(bs->cache, get_cpu()); - if (cache->free_list) { - bio = cache->free_list; - cache->free_list = bio->bi_next; - cache->nr--; - put_cpu(); - bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL, - nr_vecs, opf); - bio->bi_pool = bs; - bio_set_flag(bio, BIO_PERCPU_CACHE); - return bio; - } - put_cpu(); - bio = bio_alloc_bioset(bdev, nr_vecs, opf, GFP_KERNEL, bs); - bio_set_flag(bio, BIO_PERCPU_CACHE); - return bio; -} -EXPORT_SYMBOL_GPL(bio_alloc_kiocb); - static int __init init_bio(void) { int i; diff --git a/block/blk.h b/block/blk.h index 8ccbc6e07636..0d33e4ca2c52 100644 --- a/block/blk.h +++ b/block/blk.h @@ -453,8 +453,7 @@ extern struct device_attribute dev_attr_events_poll_msecs; static inline void bio_clear_polled(struct bio *bio) { /* can't support alloc cache if we turn off polling */ - bio_clear_flag(bio, BIO_PERCPU_CACHE); - bio->bi_opf &= ~REQ_POLLED; + bio->bi_opf &= ~(REQ_POLLED | REQ_ALLOC_CACHE); } long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); diff --git a/block/fops.c b/block/fops.c index 9f2ecec406b0..ba5e7d5ff9a5 100644 --- a/block/fops.c +++ b/block/fops.c @@ -197,8 +197,10 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - bio = bio_alloc_kiocb(iocb, bdev, nr_pages, opf, &blkdev_dio_pool); - + if (iocb->ki_flags & IOCB_ALLOC_CACHE) + opf |= REQ_ALLOC_CACHE; + bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, + &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); atomic_set(&dio->ref, 1); /* @@ -320,7 +322,10 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - bio = bio_alloc_kiocb(iocb, bdev, nr_pages, opf, &blkdev_dio_pool); + if (iocb->ki_flags & IOCB_ALLOC_CACHE) + opf |= REQ_ALLOC_CACHE; + bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, + &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); dio->flags = 0; dio->iocb = iocb; -- cgit From b53f3dcd705e52a07bd0311870dbfb6842e88c91 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 24 Mar 2022 16:35:25 -0400 Subject: block: allow use of per-cpu bio alloc cache by block drivers Refine per-cpu bio alloc cache interfaces so that DM and other block drivers can properly create and use the cache: DM uses bioset_init_from_src() to do its final bioset initialization, so must update bioset_init_from_src() to set BIOSET_PERCPU_CACHE if %src bioset has a cache. Also move bio_clear_polled() to include/linux/bio.h to allow users outside of block core. Signed-off-by: Mike Snitzer Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220324203526.62306-3-snitzer@kernel.org Signed-off-by: Jens Axboe --- block/bio.c | 2 ++ block/blk.h | 6 ------ 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index c67f47c6cc8c..643a3a052c7e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1767,6 +1767,8 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src) flags |= BIOSET_NEED_BVECS; if (src->rescue_workqueue) flags |= BIOSET_NEED_RESCUER; + if (src->cache) + flags |= BIOSET_PERCPU_CACHE; return bioset_init(bs, src->bio_pool.min_nr, src->front_pad, flags); } diff --git a/block/blk.h b/block/blk.h index 0d33e4ca2c52..4ea5167dc339 100644 --- a/block/blk.h +++ b/block/blk.h @@ -450,12 +450,6 @@ extern struct device_attribute dev_attr_events; extern struct device_attribute dev_attr_events_async; extern struct device_attribute dev_attr_events_poll_msecs; -static inline void bio_clear_polled(struct bio *bio) -{ - /* can't support alloc cache if we turn off polling */ - bio->bi_opf &= ~(REQ_POLLED | REQ_ALLOC_CACHE); -} - long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); -- cgit From 066ff571011d8416e903d3d4f1f41e0b5eb91e1d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Apr 2022 08:12:27 +0200 Subject: block: turn bio_kmalloc into a simple kmalloc wrapper Remove the magic autofree semantics and require the callers to explicitly call bio_init to initialize the bio. This allows bio_free to catch accidental bio_put calls on bio_init()ed bios as well. Signed-off-by: Christoph Hellwig Acked-by: Coly Li Acked-by: Mike Snitzer Link: https://lore.kernel.org/r/20220406061228.410163-5-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 47 +++++++++++++++++---------------------------- block/blk-crypto-fallback.c | 14 ++++++++------ block/blk-map.c | 42 +++++++++++++++++++++++++--------------- 3 files changed, 53 insertions(+), 50 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 643a3a052c7e..212ccd5b5212 100644 --- a/block/bio.c +++ b/block/bio.c @@ -224,24 +224,13 @@ EXPORT_SYMBOL(bio_uninit); static void bio_free(struct bio *bio) { struct bio_set *bs = bio->bi_pool; - void *p; - - bio_uninit(bio); + void *p = bio; - if (bs) { - bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); + WARN_ON_ONCE(!bs); - /* - * If we have front padding, adjust the bio pointer before freeing - */ - p = bio; - p -= bs->front_pad; - - mempool_free(p, &bs->bio_pool); - } else { - /* Bio was allocated by bio_kmalloc() */ - kfree(bio); - } + bio_uninit(bio); + bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); + mempool_free(p - bs->front_pad, &bs->bio_pool); } /* @@ -568,28 +557,28 @@ err_free: EXPORT_SYMBOL(bio_alloc_bioset); /** - * bio_kmalloc - kmalloc a bio for I/O + * bio_kmalloc - kmalloc a bio + * @nr_vecs: number of bio_vecs to allocate * @gfp_mask: the GFP_* mask given to the slab allocator - * @nr_iovecs: number of iovecs to pre-allocate * - * Use kmalloc to allocate and initialize a bio. + * Use kmalloc to allocate a bio (including bvecs). The bio must be initialized + * using bio_init() before use. To free a bio returned from this function use + * kfree() after calling bio_uninit(). A bio returned from this function can + * be reused by calling bio_uninit() before calling bio_init() again. + * + * Note that unlike bio_alloc() or bio_alloc_bioset() allocations from this + * function are not backed by a mempool can can fail. Do not use this function + * for allocations in the file system I/O path. * * Returns: Pointer to new bio on success, NULL on failure. */ -struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs) +struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) { struct bio *bio; - if (nr_iovecs > UIO_MAXIOV) + if (nr_vecs > UIO_MAXIOV) return NULL; - - bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); - if (unlikely(!bio)) - return NULL; - bio_init(bio, NULL, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs, - 0); - bio->bi_pool = NULL; - return bio; + return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask); } EXPORT_SYMBOL(bio_kmalloc); diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 7c854584b52b..5d1aa5b1d30a 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -152,23 +152,25 @@ static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) src_bio->bi_status = enc_bio->bi_status; - bio_put(enc_bio); + bio_uninit(enc_bio); + kfree(enc_bio); bio_endio(src_bio); } static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) { + unsigned int nr_segs = bio_segments(bio_src); struct bvec_iter iter; struct bio_vec bv; struct bio *bio; - bio = bio_kmalloc(GFP_NOIO, bio_segments(bio_src)); + bio = bio_kmalloc(nr_segs, GFP_NOIO); if (!bio) return NULL; - bio->bi_bdev = bio_src->bi_bdev; + bio_init(bio, bio_src->bi_bdev, bio->bi_inline_vecs, nr_segs, + bio_src->bi_opf); if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); - bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; @@ -363,8 +365,8 @@ out_release_keyslot: blk_crypto_put_keyslot(slot); out_put_enc_bio: if (enc_bio) - bio_put(enc_bio); - + bio_uninit(enc_bio); + kfree(enc_bio); return ret; } diff --git a/block/blk-map.c b/block/blk-map.c index c7f71d83eff1..7ffde64f9019 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -152,10 +152,10 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, nr_pages = bio_max_segs(DIV_ROUND_UP(offset + len, PAGE_SIZE)); ret = -ENOMEM; - bio = bio_kmalloc(gfp_mask, nr_pages); + bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) goto out_bmd; - bio->bi_opf |= req_op(rq); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq)); if (map_data) { nr_pages = 1 << map_data->page_order; @@ -224,7 +224,8 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, cleanup: if (!map_data) bio_free_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); out_bmd: kfree(bmd); return ret; @@ -234,6 +235,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, gfp_t gfp_mask) { unsigned int max_sectors = queue_max_hw_sectors(rq->q); + unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS); struct bio *bio; int ret; int j; @@ -241,10 +243,10 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (!iov_iter_count(iter)) return -EINVAL; - bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_VECS)); + bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) return -ENOMEM; - bio->bi_opf |= req_op(rq); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); while (iov_iter_count(iter)) { struct page **pages; @@ -303,7 +305,8 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, out_unmap: bio_release_pages(bio, false); - bio_put(bio); + bio_uninit(bio); + kfree(bio); return ret; } @@ -323,7 +326,8 @@ static void bio_invalidate_vmalloc_pages(struct bio *bio) static void bio_map_kern_endio(struct bio *bio) { bio_invalidate_vmalloc_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); } /** @@ -348,9 +352,10 @@ static struct bio *bio_map_kern(struct request_queue *q, void *data, int offset, i; struct bio *bio; - bio = bio_kmalloc(gfp_mask, nr_pages); + bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0); if (is_vmalloc) { flush_kernel_vmap_range(data, len); @@ -374,7 +379,8 @@ static struct bio *bio_map_kern(struct request_queue *q, void *data, if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { /* we don't support partial mappings */ - bio_put(bio); + bio_uninit(bio); + kfree(bio); return ERR_PTR(-EINVAL); } @@ -390,7 +396,8 @@ static struct bio *bio_map_kern(struct request_queue *q, void *data, static void bio_copy_kern_endio(struct bio *bio) { bio_free_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); } static void bio_copy_kern_endio_read(struct bio *bio) @@ -435,9 +442,10 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, return ERR_PTR(-EINVAL); nr_pages = end - start; - bio = bio_kmalloc(gfp_mask, nr_pages); + bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0); while (len) { struct page *page; @@ -471,7 +479,8 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, cleanup: bio_free_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); return ERR_PTR(-ENOMEM); } @@ -602,7 +611,8 @@ int blk_rq_unmap_user(struct bio *bio) next_bio = bio; bio = bio->bi_next; - bio_put(next_bio); + bio_uninit(next_bio); + kfree(next_bio); } return ret; @@ -648,8 +658,10 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, bio->bi_opf |= req_op(rq); ret = blk_rq_append_bio(rq, bio); - if (unlikely(ret)) - bio_put(bio); + if (unlikely(ret)) { + bio_uninit(bio); + kfree(bio); + } return ret; } EXPORT_SYMBOL(blk_rq_map_kern); -- cgit From 70456e5210f40ffdb8f6d905acfdcec5bd5fad9e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Apr 2022 12:27:42 +0200 Subject: bfq: Avoid false marking of bic as stably merged bfq_setup_cooperator() can mark bic as stably merged even though it decides to not merge its bfqqs (when bfq_setup_merge() returns NULL). Make sure to mark bic as stably merged only if we are really going to merge bfqqs. CC: stable@vger.kernel.org Tested-by: "yukuai (C)" Fixes: 430a67f9d616 ("block, bfq: merge bursts of newly-created queues") Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220401102752.8599-1-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 2e0dd68a3cbe..6d122c28086e 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2895,9 +2895,12 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_queue *new_bfqq = bfq_setup_merge(bfqq, stable_merge_bfqq); - bic->stably_merged = true; - if (new_bfqq && new_bfqq->bic) - new_bfqq->bic->stably_merged = true; + if (new_bfqq) { + bic->stably_merged = true; + if (new_bfqq->bic) + new_bfqq->bic->stably_merged = + true; + } return new_bfqq; } else return NULL; -- cgit From c1cee4ab36acef271be9101590756ed0c0c374d9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Apr 2022 12:27:43 +0200 Subject: bfq: Avoid merging queues with different parents It can happen that the parent of a bfqq changes between the moment we decide two queues are worth to merge (and set bic->stable_merge_bfqq) and the moment bfq_setup_merge() is called. This can happen e.g. because the process submitted IO for a different cgroup and thus bfqq got reparented. It can even happen that the bfqq we are merging with has parent cgroup that is already offline and going to be destroyed in which case the merge can lead to use-after-free issues such as: BUG: KASAN: use-after-free in __bfq_deactivate_entity+0x9cb/0xa50 Read of size 8 at addr ffff88800693c0c0 by task runc:[2:INIT]/10544 CPU: 0 PID: 10544 Comm: runc:[2:INIT] Tainted: G E 5.15.2-0.g5fb85fd-default #1 openSUSE Tumbleweed (unreleased) f1f3b891c72369aebecd2e43e4641a6358867c70 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a-rebuilt.opensuse.org 04/01/2014 Call Trace: dump_stack_lvl+0x46/0x5a print_address_description.constprop.0+0x1f/0x140 ? __bfq_deactivate_entity+0x9cb/0xa50 kasan_report.cold+0x7f/0x11b ? __bfq_deactivate_entity+0x9cb/0xa50 __bfq_deactivate_entity+0x9cb/0xa50 ? update_curr+0x32f/0x5d0 bfq_deactivate_entity+0xa0/0x1d0 bfq_del_bfqq_busy+0x28a/0x420 ? resched_curr+0x116/0x1d0 ? bfq_requeue_bfqq+0x70/0x70 ? check_preempt_wakeup+0x52b/0xbc0 __bfq_bfqq_expire+0x1a2/0x270 bfq_bfqq_expire+0xd16/0x2160 ? try_to_wake_up+0x4ee/0x1260 ? bfq_end_wr_async_queues+0xe0/0xe0 ? _raw_write_unlock_bh+0x60/0x60 ? _raw_spin_lock_irq+0x81/0xe0 bfq_idle_slice_timer+0x109/0x280 ? bfq_dispatch_request+0x4870/0x4870 __hrtimer_run_queues+0x37d/0x700 ? enqueue_hrtimer+0x1b0/0x1b0 ? kvm_clock_get_cycles+0xd/0x10 ? ktime_get_update_offsets_now+0x6f/0x280 hrtimer_interrupt+0x2c8/0x740 Fix the problem by checking that the parent of the two bfqqs we are merging in bfq_setup_merge() is the same. Link: https://lore.kernel.org/linux-block/20211125172809.GC19572@quack2.suse.cz/ CC: stable@vger.kernel.org Fixes: 430a67f9d616 ("block, bfq: merge bursts of newly-created queues") Tested-by: "yukuai (C)" Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220401102752.8599-2-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 6d122c28086e..7d00b21ebe5d 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2758,6 +2758,14 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) if (process_refs == 0 || new_process_refs == 0) return NULL; + /* + * Make sure merged queues belong to the same parent. Parents could + * have changed since the time we decided the two queues are suitable + * for merging. + */ + if (new_bfqq->entity.parent != bfqq->entity.parent) + return NULL; + bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", new_bfqq->pid); -- cgit From 3bc5e683c67d94bd839a1da2e796c15847b51b69 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Apr 2022 12:27:44 +0200 Subject: bfq: Split shared queues on move between cgroups When bfqq is shared by multiple processes it can happen that one of the processes gets moved to a different cgroup (or just starts submitting IO for different cgroup). In case that happens we need to split the merged bfqq as otherwise we will have IO for multiple cgroups in one bfqq and we will just account IO time to wrong entities etc. Similarly if the bfqq is scheduled to merge with another bfqq but the merge didn't happen yet, cancel the merge as it need not be valid anymore. CC: stable@vger.kernel.org Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical scheduling and cgroups support") Tested-by: "yukuai (C)" Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220401102752.8599-3-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 36 +++++++++++++++++++++++++++++++++--- block/bfq-iosched.c | 2 +- block/bfq-iosched.h | 1 + 3 files changed, 35 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 420eda2589c0..9352f3cc2377 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -743,9 +743,39 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, } if (sync_bfqq) { - entity = &sync_bfqq->entity; - if (entity->sched_data != &bfqg->sched_data) - bfq_bfqq_move(bfqd, sync_bfqq, bfqg); + if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { + /* We are the only user of this bfqq, just move it */ + if (sync_bfqq->entity.sched_data != &bfqg->sched_data) + bfq_bfqq_move(bfqd, sync_bfqq, bfqg); + } else { + struct bfq_queue *bfqq; + + /* + * The queue was merged to a different queue. Check + * that the merge chain still belongs to the same + * cgroup. + */ + for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) + if (bfqq->entity.sched_data != + &bfqg->sched_data) + break; + if (bfqq) { + /* + * Some queue changed cgroup so the merge is + * not valid anymore. We cannot easily just + * cancel the merge (by clearing new_bfqq) as + * there may be other processes using this + * queue and holding refs to all queues below + * sync_bfqq->new_bfqq. Similarly if the merge + * already happened, we need to detach from + * bfqq now so that we cannot merge bio to a + * request from the old cgroup. + */ + bfq_put_cooperator(sync_bfqq); + bfq_release_process_ref(bfqd, sync_bfqq); + bic_set_bfqq(bic, NULL, 1); + } + } } return bfqg; diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 7d00b21ebe5d..89fe3f85eb3c 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5315,7 +5315,7 @@ static void bfq_put_stable_ref(struct bfq_queue *bfqq) bfq_put_queue(bfqq); } -static void bfq_put_cooperator(struct bfq_queue *bfqq) +void bfq_put_cooperator(struct bfq_queue *bfqq) { struct bfq_queue *__bfqq, *next; diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 3b83e3d1c2e5..a56763045d19 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -979,6 +979,7 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd, void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool compensate, enum bfqq_expiration reason); void bfq_put_queue(struct bfq_queue *bfqq); +void bfq_put_cooperator(struct bfq_queue *bfqq); void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_schedule_dispatch(struct bfq_data *bfqd); -- cgit From ea591cd4eb270393810e7be01feb8fde6a34fbbe Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Apr 2022 12:27:45 +0200 Subject: bfq: Update cgroup information before merging bio When the process is migrated to a different cgroup (or in case of writeback just starts submitting bios associated with a different cgroup) bfq_merge_bio() can operate with stale cgroup information in bic. Thus the bio can be merged to a request from a different cgroup or it can result in merging of bfqqs for different cgroups or bfqqs of already dead cgroups and causing possible use-after-free issues. Fix the problem by updating cgroup information in bfq_merge_bio(). CC: stable@vger.kernel.org Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical scheduling and cgroups support") Tested-by: "yukuai (C)" Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220401102752.8599-4-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 89fe3f85eb3c..1fc4d4628fba 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2457,10 +2457,17 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, spin_lock_irq(&bfqd->lock); - if (bic) + if (bic) { + /* + * Make sure cgroup info is uptodate for current process before + * considering the merge. + */ + bfq_bic_update_cgroup(bic, bio); + bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); - else + } else { bfqd->bio_bfqq = NULL; + } bfqd->bio_bic = bic; ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); -- cgit From fc84e1f941b91221092da5b3102ec82da24c5673 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Apr 2022 12:27:46 +0200 Subject: bfq: Drop pointless unlock-lock pair In bfq_insert_request() we unlock bfqd->lock only to call trace_block_rq_insert() and then lock bfqd->lock again. This is really pointless since tracing is disabled if we really care about performance and even if the tracepoint is enabled, it is a quick call. CC: stable@vger.kernel.org Tested-by: "yukuai (C)" Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220401102752.8599-5-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 1fc4d4628fba..19082e14f3c1 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6150,11 +6150,8 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, return; } - spin_unlock_irq(&bfqd->lock); - trace_block_rq_insert(rq); - spin_lock_irq(&bfqd->lock); bfqq = bfq_init_rq(rq); if (!bfqq || at_head) { if (at_head) -- cgit From 5f550ede5edf846ecc0067be1ba80514e6fe7f8e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Apr 2022 12:27:47 +0200 Subject: bfq: Remove pointless bfq_init_rq() calls We call bfq_init_rq() from request merging functions where requests we get should have already gone through bfq_init_rq() during insert and anyway we want to do anything only if the request is already tracked by BFQ. So replace calls to bfq_init_rq() with RQ_BFQQ() instead to simply skip requests untracked by BFQ. We move bfq_init_rq() call in bfq_insert_request() a bit earlier to cover request merging and thus can transfer FIFO position in case of a merge. CC: stable@vger.kernel.org Tested-by: "yukuai (C)" Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220401102752.8599-6-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 19082e14f3c1..d7cf930b47bb 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2497,8 +2497,6 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, return ELEVATOR_NO_MERGE; } -static struct bfq_queue *bfq_init_rq(struct request *rq); - static void bfq_request_merged(struct request_queue *q, struct request *req, enum elv_merge type) { @@ -2507,7 +2505,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, blk_rq_pos(req) < blk_rq_pos(container_of(rb_prev(&req->rb_node), struct request, rb_node))) { - struct bfq_queue *bfqq = bfq_init_rq(req); + struct bfq_queue *bfqq = RQ_BFQQ(req); struct bfq_data *bfqd; struct request *prev, *next_rq; @@ -2559,8 +2557,8 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, static void bfq_requests_merged(struct request_queue *q, struct request *rq, struct request *next) { - struct bfq_queue *bfqq = bfq_init_rq(rq), - *next_bfqq = bfq_init_rq(next); + struct bfq_queue *bfqq = RQ_BFQQ(rq), + *next_bfqq = RQ_BFQQ(next); if (!bfqq) goto remove; @@ -6129,6 +6127,8 @@ static inline void bfq_update_insert_stats(struct request_queue *q, unsigned int cmd_flags) {} #endif /* CONFIG_BFQ_CGROUP_DEBUG */ +static struct bfq_queue *bfq_init_rq(struct request *rq); + static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head) { @@ -6144,6 +6144,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bfqg_stats_update_legacy_io(q, rq); #endif spin_lock_irq(&bfqd->lock); + bfqq = bfq_init_rq(rq); if (blk_mq_sched_try_insert_merge(q, rq, &free)) { spin_unlock_irq(&bfqd->lock); blk_mq_free_requests(&free); @@ -6152,7 +6153,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, trace_block_rq_insert(rq); - bfqq = bfq_init_rq(rq); if (!bfqq || at_head) { if (at_head) list_add(&rq->queuelist, &bfqd->dispatch); -- cgit From 09f871868080c33992cd6a9b72a5ca49582578fa Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Apr 2022 12:27:48 +0200 Subject: bfq: Track whether bfq_group is still online Track whether bfq_group is still online. We cannot rely on blkcg_gq->online because that gets cleared only after all policies are offlined and we need something that gets updated already under bfqd->lock when we are cleaning up our bfq_group to be able to guarantee that when we see online bfq_group, it will stay online while we are holding bfqd->lock lock. CC: stable@vger.kernel.org Tested-by: "yukuai (C)" Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220401102752.8599-7-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 3 ++- block/bfq-iosched.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 9352f3cc2377..879380c2bc7e 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -557,6 +557,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) */ bfqg->bfqd = bfqd; bfqg->active_entities = 0; + bfqg->online = true; bfqg->rq_pos_tree = RB_ROOT; } @@ -603,7 +604,6 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, struct bfq_entity *entity; bfqg = bfq_lookup_bfqg(bfqd, blkcg); - if (unlikely(!bfqg)) return NULL; @@ -979,6 +979,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) put_async_queues: bfq_put_async_queues(bfqd, bfqg); + bfqg->online = false; spin_unlock_irqrestore(&bfqd->lock, flags); /* diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index a56763045d19..4664e2f3e828 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -928,6 +928,8 @@ struct bfq_group { /* reference counter (see comments in bfq_bic_update_cgroup) */ int ref; + /* Is bfq_group still online? */ + bool online; struct bfq_entity entity; struct bfq_sched_data sched_data; -- cgit From 4e54a2493e582361adc3bfbf06c7d50d19d18837 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Apr 2022 12:27:49 +0200 Subject: bfq: Get rid of __bio_blkcg() usage BFQ usage of __bio_blkcg() is a relict from the past. Furthermore if bio would not be associated with any blkcg, the usage of __bio_blkcg() in BFQ is prone to races with the task being migrated between cgroups as __bio_blkcg() calls at different places could return different blkcgs. Convert BFQ to the new situation where bio->bi_blkg is initialized in bio_set_dev() and thus practically always valid. This allows us to save blkcg_gq lookup and noticeably simplify the code. CC: stable@vger.kernel.org Fixes: 0fe061b9f03c ("blkcg: fix ref count issue with bio_blkcg() using task_css") Tested-by: "yukuai (C)" Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220401102752.8599-8-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 63 +++++++++++++++++++---------------------------------- block/bfq-iosched.c | 11 +--------- block/bfq-iosched.h | 3 +-- 3 files changed, 25 insertions(+), 52 deletions(-) (limited to 'block') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 879380c2bc7e..32d2c2a47480 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -586,27 +586,11 @@ static void bfq_group_set_parent(struct bfq_group *bfqg, entity->sched_data = &parent->sched_data; } -static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, - struct blkcg *blkcg) +static void bfq_link_bfqg(struct bfq_data *bfqd, struct bfq_group *bfqg) { - struct blkcg_gq *blkg; - - blkg = blkg_lookup(blkcg, bfqd->queue); - if (likely(blkg)) - return blkg_to_bfqg(blkg); - return NULL; -} - -struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, - struct blkcg *blkcg) -{ - struct bfq_group *bfqg, *parent; + struct bfq_group *parent; struct bfq_entity *entity; - bfqg = bfq_lookup_bfqg(bfqd, blkcg); - if (unlikely(!bfqg)) - return NULL; - /* * Update chain of bfq_groups as we might be handling a leaf group * which, along with some of its relatives, has not been hooked yet @@ -623,8 +607,15 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, bfq_group_set_parent(curr_bfqg, parent); } } +} - return bfqg; +struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + + if (!blkg) + return bfqd->root_group; + return blkg_to_bfqg(blkg); } /** @@ -714,25 +705,15 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, * Move bic to blkcg, assuming that bfqd->lock is held; which makes * sure that the reference to cgroup is valid across the call (see * comments in bfq_bic_update_cgroup on this issue) - * - * NOTE: an alternative approach might have been to store the current - * cgroup in bfqq and getting a reference to it, reducing the lookup - * time here, at the price of slightly more complex code. */ -static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - struct bfq_io_cq *bic, - struct blkcg *blkcg) +static void *__bfq_bic_change_cgroup(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct bfq_group *bfqg) { struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); - struct bfq_group *bfqg; struct bfq_entity *entity; - bfqg = bfq_find_set_group(bfqd, blkcg); - - if (unlikely(!bfqg)) - bfqg = bfqd->root_group; - if (async_bfqq) { entity = &async_bfqq->entity; @@ -784,20 +765,24 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) { struct bfq_data *bfqd = bic_to_bfqd(bic); - struct bfq_group *bfqg = NULL; + struct bfq_group *bfqg = bfq_bio_bfqg(bfqd, bio); uint64_t serial_nr; - rcu_read_lock(); - serial_nr = __bio_blkcg(bio)->css.serial_nr; + serial_nr = bfqg_to_blkg(bfqg)->blkcg->css.serial_nr; /* * Check whether blkcg has changed. The condition may trigger * spuriously on a newly created cic but there's no harm. */ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) - goto out; + return; - bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio)); + /* + * New cgroup for this process. Make sure it is linked to bfq internal + * cgroup hierarchy. + */ + bfq_link_bfqg(bfqd, bfqg); + __bfq_bic_change_cgroup(bfqd, bic, bfqg); /* * Update blkg_path for bfq_log_* functions. We cache this * path, and update it here, for the following @@ -850,8 +835,6 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) */ blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path)); bic->blkcg_serial_nr = serial_nr; -out: - rcu_read_unlock(); } /** @@ -1469,7 +1452,7 @@ void bfq_end_wr_async(struct bfq_data *bfqd) bfq_end_wr_async_queues(bfqd, bfqd->root_group); } -struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, struct blkcg *blkcg) +struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) { return bfqd->root_group; } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index d7cf930b47bb..e47c75f1fa0f 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5726,14 +5726,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bfq_queue *bfqq; struct bfq_group *bfqg; - rcu_read_lock(); - - bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio)); - if (!bfqg) { - bfqq = &bfqd->oom_bfqq; - goto out; - } - + bfqg = bfq_bio_bfqg(bfqd, bio); if (!is_sync) { async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ioprio); @@ -5779,8 +5772,6 @@ out: if (bfqq != &bfqd->oom_bfqq && is_sync && !respawn) bfqq = bfq_do_or_sched_stable_merge(bfqd, bfqq, bic); - - rcu_read_unlock(); return bfqq; } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 4664e2f3e828..978ef5d6fe6a 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -1009,8 +1009,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg); void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio); void bfq_end_wr_async(struct bfq_data *bfqd); -struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, - struct blkcg *blkcg); +struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio); struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); struct bfq_group *bfqq_group(struct bfq_queue *bfqq); struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node); -- cgit From 075a53b78b815301f8d3dd1ee2cd99554e34f0dd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Apr 2022 12:27:50 +0200 Subject: bfq: Make sure bfqg for which we are queueing requests is online Bios queued into BFQ IO scheduler can be associated with a cgroup that was already offlined. This may then cause insertion of this bfq_group into a service tree. But this bfq_group will get freed as soon as last bio associated with it is completed leading to use after free issues for service tree users. Fix the problem by making sure we always operate on online bfq_group. If the bfq_group associated with the bio is not online, we pick the first online parent. CC: stable@vger.kernel.org Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical scheduling and cgroups support") Tested-by: "yukuai (C)" Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220401102752.8599-9-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 32d2c2a47480..09574af83566 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -612,10 +612,19 @@ static void bfq_link_bfqg(struct bfq_data *bfqd, struct bfq_group *bfqg) struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) { struct blkcg_gq *blkg = bio->bi_blkg; + struct bfq_group *bfqg; - if (!blkg) - return bfqd->root_group; - return blkg_to_bfqg(blkg); + while (blkg) { + bfqg = blkg_to_bfqg(blkg); + if (bfqg->online) { + bio_associate_blkg_from_css(bio, &blkg->blkcg->css); + return bfqg; + } + blkg = blkg->parent; + } + bio_associate_blkg_from_css(bio, + &bfqg_to_blkg(bfqd->root_group)->blkcg->css); + return bfqd->root_group; } /** -- cgit From 10f0d2a517796b8f6dc04fb0cc3e49003ae6b0bc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:42 +0200 Subject: block: add a bdev_nonrot helper Add a helper to check the nonrot flag based on the block_device instead of having to poke into the block layer internal request_queue. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Acked-by: David Sterba [btrfs] Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220415045258.199825-12-hch@lst.de Signed-off-by: Jens Axboe --- block/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index f8703db99c73..19c32b763451 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -489,7 +489,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, queue_max_sectors(bdev_get_queue(bdev))); return put_ushort(argp, max_sectors); case BLKROTATIONAL: - return put_ushort(argp, !blk_queue_nonrot(bdev_get_queue(bdev))); + return put_ushort(argp, !bdev_nonrot(bdev)); case BLKRASET: case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) -- cgit From 64dcc7c2717395b7c83ffb10f040d3be795d03c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:47 +0200 Subject: block: use bdev_alignment_offset in part_alignment_offset_show Replace the open coded offset calculation with the proper helper. This is an ABI change in that the -1 for a misaligned partition is properly propagated, which can be considered a bug fix and matches what is done on the whole device. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220415045258.199825-17-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'block') diff --git a/block/partitions/core.c b/block/partitions/core.c index 2ef8dfa1e5c8..240b3fff521e 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -200,11 +200,7 @@ static ssize_t part_ro_show(struct device *dev, static ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct block_device *bdev = dev_to_bdev(dev); - - return sprintf(buf, "%u\n", - queue_limit_alignment_offset(&bdev_get_queue(bdev)->limits, - bdev->bd_start_sect)); + return sprintf(buf, "%u\n", bdev_alignment_offset(dev_to_bdev(dev))); } static ssize_t part_discard_alignment_show(struct device *dev, -- cgit From 640f2a23911b8388989547f89d055afbb910b88e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:48 +0200 Subject: block: use bdev_alignment_offset in disk_alignment_offset_show This does the same as the open coded variant except for an extra branch, and allows to remove queue_alignment_offset entirely. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220415045258.199825-18-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index b8b6759d670f..712031ce1907 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1010,7 +1010,7 @@ static ssize_t disk_alignment_offset_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); + return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t disk_discard_alignment_show(struct device *dev, -- cgit From 89098b075cb74a80083bc4ed6b71d0ee18b6898f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:49 +0200 Subject: block: move bdev_alignment_offset and queue_limit_alignment_offset out of line No need to inline these fairly larger helpers. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220415045258.199825-19-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index b83df3d2eebc..94410a13c0de 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -468,6 +468,16 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt) } EXPORT_SYMBOL(blk_queue_io_opt); +static int queue_limit_alignment_offset(struct queue_limits *lim, + sector_t sector) +{ + unsigned int granularity = max(lim->physical_block_size, lim->io_min); + unsigned int alignment = sector_div(sector, granularity >> SECTOR_SHIFT) + << SECTOR_SHIFT; + + return (granularity + lim->alignment_offset - alignment) % granularity; +} + static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lbs) { sectors = round_down(sectors, lbs >> SECTOR_SHIFT); @@ -901,3 +911,16 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) } } EXPORT_SYMBOL_GPL(blk_queue_set_zoned); + +int bdev_alignment_offset(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q->limits.misaligned) + return -1; + if (bdev_is_partition(bdev)) + return queue_limit_alignment_offset(&q->limits, + bdev->bd_start_sect); + return q->limits.alignment_offset; +} +EXPORT_SYMBOL_GPL(bdev_alignment_offset); -- cgit From 4e1462ffe8998749884d61f91be251a7a8719677 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:50 +0200 Subject: block: remove queue_discard_alignment Just use bdev_alignment_offset in disk_discard_alignment_show instead. That helpers is the same except for an always false branch that doesn't matter in this slow path. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220415045258.199825-20-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 712031ce1907..36532b931841 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1019,7 +1019,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); + return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t diskseq_show(struct device *dev, -- cgit From f0f975a4dde890bfe25ce17bf07a6495453988a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:51 +0200 Subject: block: use bdev_discard_alignment in part_discard_alignment_show Use the bdev based alignment helper instead of open coding it. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220415045258.199825-21-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'block') diff --git a/block/partitions/core.c b/block/partitions/core.c index 240b3fff521e..70dec1c78521 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -206,11 +206,7 @@ static ssize_t part_alignment_offset_show(struct device *dev, static ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct block_device *bdev = dev_to_bdev(dev); - - return sprintf(buf, "%u\n", - queue_limit_discard_alignment(&bdev_get_queue(bdev)->limits, - bdev->bd_start_sect)); + return sprintf(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev))); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); -- cgit From 5c4b4a5c6f11c869a57c6bd977143430bc9dc43d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:52 +0200 Subject: block: move {bdev,queue_limit}_discard_alignment out of line No need to inline these fairly larger helpers. Also fix the return value to be unsigned, just like the field in struct queue_limits. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220415045258.199825-22-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 94410a13c0de..fd83d674afd0 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -478,6 +478,30 @@ static int queue_limit_alignment_offset(struct queue_limits *lim, return (granularity + lim->alignment_offset - alignment) % granularity; } +static unsigned int queue_limit_discard_alignment(struct queue_limits *lim, + sector_t sector) +{ + unsigned int alignment, granularity, offset; + + if (!lim->max_discard_sectors) + return 0; + + /* Why are these in bytes, not sectors? */ + alignment = lim->discard_alignment >> SECTOR_SHIFT; + granularity = lim->discard_granularity >> SECTOR_SHIFT; + if (!granularity) + return 0; + + /* Offset of the partition start in 'granularity' sectors */ + offset = sector_div(sector, granularity); + + /* And why do we do this modulus *again* in blkdev_issue_discard()? */ + offset = (granularity + alignment - offset) % granularity; + + /* Turn it back into bytes, gaah */ + return offset << SECTOR_SHIFT; +} + static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lbs) { sectors = round_down(sectors, lbs >> SECTOR_SHIFT); @@ -924,3 +948,14 @@ int bdev_alignment_offset(struct block_device *bdev) return q->limits.alignment_offset; } EXPORT_SYMBOL_GPL(bdev_alignment_offset); + +unsigned int bdev_discard_alignment(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (bdev_is_partition(bdev)) + return queue_limit_discard_alignment(&q->limits, + bdev->bd_start_sect); + return q->limits.discard_alignment; +} +EXPORT_SYMBOL_GPL(bdev_discard_alignment); -- cgit From e3cc28ea28b5f8794db2aed24f8a0282ad2e85a2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:53 +0200 Subject: block: refactor discard bio size limiting Move all the logic to limit the discard bio size into a common helper so that it is better documented. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Acked-by: Coly Li Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220415045258.199825-23-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-lib.c | 59 ++++++++++++++++++++++++++++----------------------------- block/blk.h | 14 -------------- 2 files changed, 29 insertions(+), 44 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index 237d60d8b585..2ae32a722851 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -10,6 +10,32 @@ #include "blk.h" +static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector) +{ + unsigned int discard_granularity = + bdev_get_queue(bdev)->limits.discard_granularity; + sector_t granularity_aligned_sector; + + if (bdev_is_partition(bdev)) + sector += bdev->bd_start_sect; + + granularity_aligned_sector = + round_up(sector, discard_granularity >> SECTOR_SHIFT); + + /* + * Make sure subsequent bios start aligned to the discard granularity if + * it needs to be split. + */ + if (granularity_aligned_sector != sector) + return granularity_aligned_sector - sector; + + /* + * Align the bio size to the discard granularity to make splitting the bio + * at discard granularity boundaries easier in the driver if needed. + */ + return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT; +} + int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, int flags, struct bio **biop) @@ -17,7 +43,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; unsigned int op; - sector_t bs_mask, part_offset = 0; + sector_t bs_mask; if (bdev_read_only(bdev)) return -EPERM; @@ -48,36 +74,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (!nr_sects) return -EINVAL; - /* In case the discard request is in a partition */ - if (bdev_is_partition(bdev)) - part_offset = bdev->bd_start_sect; - while (nr_sects) { - sector_t granularity_aligned_lba, req_sects; - sector_t sector_mapped = sector + part_offset; - - granularity_aligned_lba = round_up(sector_mapped, - q->limits.discard_granularity >> SECTOR_SHIFT); - - /* - * Check whether the discard bio starts at a discard_granularity - * aligned LBA, - * - If no: set (granularity_aligned_lba - sector_mapped) to - * bi_size of the first split bio, then the second bio will - * start at a discard_granularity aligned LBA on the device. - * - If yes: use bio_aligned_discard_max_sectors() as the max - * possible bi_size of the first split bio. Then when this bio - * is split in device drive, the split ones are very probably - * to be aligned to discard_granularity of the device's queue. - */ - if (granularity_aligned_lba == sector_mapped) - req_sects = min_t(sector_t, nr_sects, - bio_aligned_discard_max_sectors(q)); - else - req_sects = min_t(sector_t, nr_sects, - granularity_aligned_lba - sector_mapped); - - WARN_ON_ONCE((req_sects << 9) > UINT_MAX); + sector_t req_sects = + min(nr_sects, bio_discard_limit(bdev, sector)); bio = blk_next_bio(bio, bdev, 0, op, gfp_mask); bio->bi_iter.bi_sector = sector; diff --git a/block/blk.h b/block/blk.h index 4ea5167dc339..434017701403 100644 --- a/block/blk.h +++ b/block/blk.h @@ -346,20 +346,6 @@ static inline unsigned int bio_allowed_max_sectors(struct request_queue *q) return round_down(UINT_MAX, queue_logical_block_size(q)) >> 9; } -/* - * The max bio size which is aligned to q->limits.discard_granularity. This - * is a hint to split large discard bio in generic block layer, then if device - * driver needs to split the discard bio into smaller ones, their bi_size can - * be very probably and easily aligned to discard_granularity of the device's - * queue. - */ -static inline unsigned int bio_aligned_discard_max_sectors( - struct request_queue *q) -{ - return round_down(UINT_MAX, q->limits.discard_granularity) >> - SECTOR_SHIFT; -} - /* * Internal io_context interface */ -- cgit From 70200574cc229f6ba038259e8142af2aa09e6976 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:55 +0200 Subject: block: remove QUEUE_FLAG_DISCARD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Just use a non-zero max_discard_sectors as an indicator for discard support, similar to what is done for write zeroes. The only places where needs special attention is the RAID5 driver, which must clear discard support for security reasons by default, even if the default stacking rules would allow for it. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Acked-by: Christoph Böhmwalder [drbd] Acked-by: Jan Höppner [s390] Acked-by: Coly Li [bcache] Acked-by: David Sterba [btrfs] Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220415045258.199825-25-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-lib.c | 2 +- block/blk-mq-debugfs.c | 1 - block/ioctl.c | 3 +-- 4 files changed, 3 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 937bb6b86331..b5c3a8049134 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -820,7 +820,7 @@ void submit_bio_noacct(struct bio *bio) switch (bio_op(bio)) { case REQ_OP_DISCARD: - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(bdev)) goto not_supported; break; case REQ_OP_SECURE_ERASE: diff --git a/block/blk-lib.c b/block/blk-lib.c index 2ae32a722851..8b4b66d3a9bf 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -53,7 +53,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, return -EOPNOTSUPP; op = REQ_OP_SECURE_ERASE; } else { - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; op = REQ_OP_DISCARD; } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index aa0349e9f083..fd111c500125 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -113,7 +113,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(FAIL_IO), QUEUE_FLAG_NAME(NONROT), QUEUE_FLAG_NAME(IO_STAT), - QUEUE_FLAG_NAME(DISCARD), QUEUE_FLAG_NAME(NOXMERGES), QUEUE_FLAG_NAME(ADD_RANDOM), QUEUE_FLAG_NAME(SECERASE), diff --git a/block/ioctl.c b/block/ioctl.c index 19c32b763451..eaee0efc0bea 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -87,14 +87,13 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, { uint64_t range[2]; uint64_t start, len; - struct request_queue *q = bdev_get_queue(bdev); struct inode *inode = bdev->bd_inode; int err; if (!(mode & FMODE_WRITE)) return -EBADF; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; if (copy_from_user(range, (void __user *)arg, sizeof(range))) -- cgit From 7b47ef52d0a2025fd1408a8a0990933b8e1e510f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:56 +0200 Subject: block: add a bdev_discard_granularity helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Abstract away implementation details from file systems by providing a block_device based helper to retrieve the discard granularity. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Acked-by: Christoph Böhmwalder [drbd] Acked-by: Ryusuke Konishi Acked-by: David Sterba [btrfs] Link: https://lore.kernel.org/r/20220415045258.199825-26-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-lib.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index 8b4b66d3a9bf..43aa4d7fe859 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -12,8 +12,7 @@ static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector) { - unsigned int discard_granularity = - bdev_get_queue(bdev)->limits.discard_granularity; + unsigned int discard_granularity = bdev_discard_granularity(bdev); sector_t granularity_aligned_sector; if (bdev_is_partition(bdev)) @@ -59,7 +58,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, } /* In case the discard granularity isn't set by buggy device driver */ - if (WARN_ON_ONCE(!q->limits.discard_granularity)) { + if (WARN_ON_ONCE(!bdev_discard_granularity(bdev))) { char dev_name[BDEVNAME_SIZE]; bdevname(bdev, dev_name); -- cgit From 44abff2c0b970ae3d310b97617525dc01f248d7c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:57 +0200 Subject: block: decouple REQ_OP_SECURE_ERASE from REQ_OP_DISCARD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Secure erase is a very different operation from discard in that it is a data integrity operation vs hint. Fully split the limits and helper infrastructure to make the separation more clear. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Acked-by: Christoph Böhmwalder [drbd] Acked-by: Ryusuke Konishi [nifs2] Acked-by: Jaegeuk Kim [f2fs] Acked-by: Coly Li [bcache] Acked-by: David Sterba [btrfs] Acked-by: Chao Yu Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220415045258.199825-27-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-lib.c | 64 +++++++++++++++++++++++++++++++++++--------------- block/blk-mq-debugfs.c | 1 - block/blk-settings.c | 16 ++++++++++++- block/fops.c | 2 +- block/ioctl.c | 43 ++++++++++++++++++++++++++------- 6 files changed, 97 insertions(+), 31 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index b5c3a8049134..ee18b6a699bd 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -824,7 +824,7 @@ void submit_bio_noacct(struct bio *bio) goto not_supported; break; case REQ_OP_SECURE_ERASE: - if (!blk_queue_secure_erase(q)) + if (!bdev_max_secure_erase_sectors(bdev)) goto not_supported; break; case REQ_OP_ZONE_APPEND: diff --git a/block/blk-lib.c b/block/blk-lib.c index 43aa4d7fe859..09b7e1200c0f 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -36,26 +36,15 @@ static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector) } int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, int flags, - struct bio **biop) + sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) { - struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; - unsigned int op; sector_t bs_mask; if (bdev_read_only(bdev)) return -EPERM; - - if (flags & BLKDEV_DISCARD_SECURE) { - if (!blk_queue_secure_erase(q)) - return -EOPNOTSUPP; - op = REQ_OP_SECURE_ERASE; - } else { - if (!bdev_max_discard_sectors(bdev)) - return -EOPNOTSUPP; - op = REQ_OP_DISCARD; - } + if (!bdev_max_discard_sectors(bdev)) + return -EOPNOTSUPP; /* In case the discard granularity isn't set by buggy device driver */ if (WARN_ON_ONCE(!bdev_discard_granularity(bdev))) { @@ -77,7 +66,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t req_sects = min(nr_sects, bio_discard_limit(bdev, sector)); - bio = blk_next_bio(bio, bdev, 0, op, gfp_mask); + bio = blk_next_bio(bio, bdev, 0, REQ_OP_DISCARD, gfp_mask); bio->bi_iter.bi_sector = sector; bio->bi_iter.bi_size = req_sects << 9; sector += req_sects; @@ -103,21 +92,19 @@ EXPORT_SYMBOL(__blkdev_issue_discard); * @sector: start sector * @nr_sects: number of sectors to discard * @gfp_mask: memory allocation flags (for bio_alloc) - * @flags: BLKDEV_DISCARD_* flags to control behaviour * * Description: * Issue a discard request for the sectors in question. */ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) + sector_t nr_sects, gfp_t gfp_mask) { struct bio *bio = NULL; struct blk_plug plug; int ret; blk_start_plug(&plug); - ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags, - &bio); + ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, &bio); if (!ret && bio) { ret = submit_bio_wait(bio); if (ret == -EOPNOTSUPP) @@ -314,3 +301,42 @@ retry: return ret; } EXPORT_SYMBOL(blkdev_issue_zeroout); + +int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp) +{ + sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + unsigned int max_sectors = bdev_max_secure_erase_sectors(bdev); + struct bio *bio = NULL; + struct blk_plug plug; + int ret = 0; + + if (max_sectors == 0) + return -EOPNOTSUPP; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + if (bdev_read_only(bdev)) + return -EPERM; + + blk_start_plug(&plug); + for (;;) { + unsigned int len = min_t(sector_t, nr_sects, max_sectors); + + bio = blk_next_bio(bio, bdev, 0, REQ_OP_SECURE_ERASE, gfp); + bio->bi_iter.bi_sector = sector; + bio->bi_iter.bi_size = len; + + sector += len << SECTOR_SHIFT; + nr_sects -= len << SECTOR_SHIFT; + if (!nr_sects) { + ret = submit_bio_wait(bio); + bio_put(bio); + break; + } + cond_resched(); + } + blk_finish_plug(&plug); + + return ret; +} +EXPORT_SYMBOL(blkdev_issue_secure_erase); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index fd111c500125..7e4136a60e1c 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -115,7 +115,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(IO_STAT), QUEUE_FLAG_NAME(NOXMERGES), QUEUE_FLAG_NAME(ADD_RANDOM), - QUEUE_FLAG_NAME(SECERASE), QUEUE_FLAG_NAME(SAME_FORCE), QUEUE_FLAG_NAME(DEAD), QUEUE_FLAG_NAME(INIT_DONE), diff --git a/block/blk-settings.c b/block/blk-settings.c index fd83d674afd0..6ccceb421ed2 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -46,6 +46,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_zone_append_sectors = 0; lim->max_discard_sectors = 0; lim->max_hw_discard_sectors = 0; + lim->max_secure_erase_sectors = 0; lim->discard_granularity = 0; lim->discard_alignment = 0; lim->discard_misaligned = 0; @@ -176,6 +177,18 @@ void blk_queue_max_discard_sectors(struct request_queue *q, } EXPORT_SYMBOL(blk_queue_max_discard_sectors); +/** + * blk_queue_max_secure_erase_sectors - set max sectors for a secure erase + * @q: the request queue for the device + * @max_sectors: maximum number of sectors to secure_erase + **/ +void blk_queue_max_secure_erase_sectors(struct request_queue *q, + unsigned int max_sectors) +{ + q->limits.max_secure_erase_sectors = max_sectors; +} +EXPORT_SYMBOL(blk_queue_max_secure_erase_sectors); + /** * blk_queue_max_write_zeroes_sectors - set max sectors for a single * write zeroes @@ -661,7 +674,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->discard_alignment = lcm_not_zero(t->discard_alignment, alignment) % t->discard_granularity; } - + t->max_secure_erase_sectors = min_not_zero(t->max_secure_erase_sectors, + b->max_secure_erase_sectors); t->zone_write_granularity = max(t->zone_write_granularity, b->zone_write_granularity); t->zoned = max(t->zoned, b->zoned); diff --git a/block/fops.c b/block/fops.c index ba5e7d5ff9a5..e3643362c244 100644 --- a/block/fops.c +++ b/block/fops.c @@ -677,7 +677,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, - len >> SECTOR_SHIFT, GFP_KERNEL, 0); + len >> SECTOR_SHIFT, GFP_KERNEL); break; default: error = -EOPNOTSUPP; diff --git a/block/ioctl.c b/block/ioctl.c index eaee0efc0bea..46949f1b0dba 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -83,7 +83,7 @@ static int compat_blkpg_ioctl(struct block_device *bdev, #endif static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, - unsigned long arg, unsigned long flags) + unsigned long arg) { uint64_t range[2]; uint64_t start, len; @@ -114,15 +114,43 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, err = truncate_bdev_range(bdev, mode, start, start + len - 1); if (err) goto fail; - - err = blkdev_issue_discard(bdev, start >> 9, len >> 9, - GFP_KERNEL, flags); - + err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); fail: filemap_invalidate_unlock(inode->i_mapping); return err; } +static int blk_ioctl_secure_erase(struct block_device *bdev, fmode_t mode, + void __user *argp) +{ + uint64_t start, len; + uint64_t range[2]; + int err; + + if (!(mode & FMODE_WRITE)) + return -EBADF; + if (!bdev_max_secure_erase_sectors(bdev)) + return -EOPNOTSUPP; + if (copy_from_user(range, argp, sizeof(range))) + return -EFAULT; + + start = range[0]; + len = range[1]; + if ((start & 511) || (len & 511)) + return -EINVAL; + if (start + len > bdev_nr_bytes(bdev)) + return -EINVAL; + + filemap_invalidate_lock(bdev->bd_inode->i_mapping); + err = truncate_bdev_range(bdev, mode, start, start + len - 1); + if (!err) + err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9, + GFP_KERNEL); + filemap_invalidate_unlock(bdev->bd_inode->i_mapping); + return err; +} + + static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, unsigned long arg) { @@ -450,10 +478,9 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, case BLKROSET: return blkdev_roset(bdev, mode, cmd, arg); case BLKDISCARD: - return blk_ioctl_discard(bdev, mode, arg, 0); + return blk_ioctl_discard(bdev, mode, arg); case BLKSECDISCARD: - return blk_ioctl_discard(bdev, mode, arg, - BLKDEV_DISCARD_SECURE); + return blk_ioctl_secure_erase(bdev, mode, argp); case BLKZEROOUT: return blk_ioctl_zeroout(bdev, mode, arg); case BLKGETDISKSEQ: -- cgit From 5f0614a55ecebdf55f1a17db0b5f6b787ed009f1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 17 Apr 2022 22:27:13 -0400 Subject: block: change exported IO accounting interface from gendisk to bdev Export IO accounting interfaces in terms of block_device now that gendisk has become more internal to block core. Rename __part_{start,end}_io_acct's first argument from part to bdev. Rename __part_{start,end}_io_acct to bdev_{start,end}_io_acct and export them. Remove disk_{start,end}_io_acct and update caller (zram) to use bdev_{start,end}_io_acct. DM can now be updated to use bdev_{start,end}_io_acct. Signed-off-by: Ming Lei Signed-off-by: Mike Snitzer Link: https://lore.kernel.org/r/20220418022733.56168-2-snitzer@kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 52 ++++++++++++++++++++-------------------------------- 1 file changed, 20 insertions(+), 32 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index ee18b6a699bd..f305cb66c72a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1022,21 +1022,22 @@ again: } } -static unsigned long __part_start_io_acct(struct block_device *part, - unsigned int sectors, unsigned int op, - unsigned long start_time) +unsigned long bdev_start_io_acct(struct block_device *bdev, + unsigned int sectors, unsigned int op, + unsigned long start_time) { const int sgrp = op_stat_group(op); part_stat_lock(); - update_io_ticks(part, start_time, false); - part_stat_inc(part, ios[sgrp]); - part_stat_add(part, sectors[sgrp], sectors); - part_stat_local_inc(part, in_flight[op_is_write(op)]); + update_io_ticks(bdev, start_time, false); + part_stat_inc(bdev, ios[sgrp]); + part_stat_add(bdev, sectors[sgrp], sectors); + part_stat_local_inc(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); return start_time; } +EXPORT_SYMBOL(bdev_start_io_acct); /** * bio_start_io_acct_time - start I/O accounting for bio based drivers @@ -1045,8 +1046,8 @@ static unsigned long __part_start_io_acct(struct block_device *part, */ void bio_start_io_acct_time(struct bio *bio, unsigned long start_time) { - __part_start_io_acct(bio->bi_bdev, bio_sectors(bio), - bio_op(bio), start_time); + bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio), + bio_op(bio), start_time); } EXPORT_SYMBOL_GPL(bio_start_io_acct_time); @@ -1058,46 +1059,33 @@ EXPORT_SYMBOL_GPL(bio_start_io_acct_time); */ unsigned long bio_start_io_acct(struct bio *bio) { - return __part_start_io_acct(bio->bi_bdev, bio_sectors(bio), - bio_op(bio), jiffies); + return bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio), + bio_op(bio), jiffies); } EXPORT_SYMBOL_GPL(bio_start_io_acct); -unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, - unsigned int op) -{ - return __part_start_io_acct(disk->part0, sectors, op, jiffies); -} -EXPORT_SYMBOL(disk_start_io_acct); - -static void __part_end_io_acct(struct block_device *part, unsigned int op, - unsigned long start_time) +void bdev_end_io_acct(struct block_device *bdev, unsigned int op, + unsigned long start_time) { const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies); unsigned long duration = now - start_time; part_stat_lock(); - update_io_ticks(part, now, true); - part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); - part_stat_local_dec(part, in_flight[op_is_write(op)]); + update_io_ticks(bdev, now, true); + part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration)); + part_stat_local_dec(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); } +EXPORT_SYMBOL(bdev_end_io_acct); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, - struct block_device *orig_bdev) + struct block_device *orig_bdev) { - __part_end_io_acct(orig_bdev, bio_op(bio), start_time); + bdev_end_io_acct(orig_bdev, bio_op(bio), start_time); } EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped); -void disk_end_io_acct(struct gendisk *disk, unsigned int op, - unsigned long start_time) -{ - __part_end_io_acct(disk->part0, op, start_time); -} -EXPORT_SYMBOL(disk_end_io_acct); - /** * blk_lld_busy - Check if underlying low-level drivers of a device are busy * @q : the queue of the device being checked -- cgit From 3de2e5f28cb133f1d9c1b2403079722d0e7b671e Mon Sep 17 00:00:00 2001 From: Michal Orzel Date: Sat, 23 Apr 2022 13:38:07 +0200 Subject: block/badblocks: Remove redundant assignments Get rid of redundant assignments to a variable sectors from functions badblocks_check and badblocks_clear. This variable, that is a function parameter, is being assigned a value that is never read until the end of function. Reported by clang-tidy [deadcode.DeadStores] Signed-off-by: Michal Orzel Link: https://lore.kernel.org/r/20220423113811.13335-1-michalorzel.eng@gmail.com Signed-off-by: Jens Axboe --- block/badblocks.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'block') diff --git a/block/badblocks.c b/block/badblocks.c index d39056630d9c..3afb550c0f7b 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -65,7 +65,6 @@ int badblocks_check(struct badblocks *bb, sector_t s, int sectors, s >>= bb->shift; target += (1<shift) - 1; target >>= bb->shift; - sectors = target - s; } /* 'target' is now the first block after the bad range */ @@ -345,7 +344,6 @@ int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) s += (1<shift) - 1; s >>= bb->shift; target >>= bb->shift; - sectors = target - s; } write_seqlock_irq(&bb->lock); -- cgit From 7ab89db979017255c2163880de5c63d8c9726aef Mon Sep 17 00:00:00 2001 From: Michal Orzel Date: Sat, 23 Apr 2022 13:38:08 +0200 Subject: block/blk-map: Remove redundant assignment Get rid of redundant assignment to a variable ret from function bio_map_user_iov as it is being assigned a value that is never read. It is being re-assigned in the first instruction after the while loop Reported by clang-tidy [deadcode.DeadStores] Signed-off-by: Michal Orzel Link: https://lore.kernel.org/r/20220423113811.13335-2-michalorzel.eng@gmail.com Signed-off-by: Jens Axboe --- block/blk-map.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index 7ffde64f9019..df8b066cd548 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -262,10 +262,9 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); - if (unlikely(offs & queue_dma_alignment(rq->q))) { - ret = -EINVAL; + if (unlikely(offs & queue_dma_alignment(rq->q))) j = 0; - } else { + else { for (j = 0; j < npages; j++) { struct page *page = pages[j]; unsigned int n = PAGE_SIZE - offs; -- cgit From 834726828b47c08e84df02f975e30c5c65bf316b Mon Sep 17 00:00:00 2001 From: Michal Orzel Date: Sat, 23 Apr 2022 13:38:09 +0200 Subject: block/partitions/acorn: Remove redundant assignments Get rid of redundant assignments to a variable slot from function adfspart_check_ADFS. It is being assigned a value that is never read as we are breaking out from the switch case and the function ends. Reported by clang-tidy [deadcode.DeadStores] Signed-off-by: Michal Orzel Link: https://lore.kernel.org/r/20220423113811.13335-3-michalorzel.eng@gmail.com Signed-off-by: Jens Axboe --- block/partitions/acorn.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c index 2c381c694c57..d2fc122d7426 100644 --- a/block/partitions/acorn.c +++ b/block/partitions/acorn.c @@ -282,13 +282,13 @@ int adfspart_check_ADFS(struct parsed_partitions *state) #ifdef CONFIG_ACORN_PARTITION_RISCIX case PARTITION_RISCIX_SCSI: case PARTITION_RISCIX_MFM: - slot = riscix_partition(state, start_sect, slot, + riscix_partition(state, start_sect, slot, nr_sects); break; #endif case PARTITION_LINUX: - slot = linux_partition(state, start_sect, slot, + linux_partition(state, start_sect, slot, nr_sects); break; } -- cgit From 87420fa94f6cdd2ae8aa3e1d8602bfa549794fac Mon Sep 17 00:00:00 2001 From: Michal Orzel Date: Sat, 23 Apr 2022 13:38:10 +0200 Subject: block/partitions/atari: Remove redundant assignment Get rid of redundant assignment to a variable part_fmt from function atari_partition. It is being assigned a value that is never read until the end of function. Reported by clang-tidy [deadcode.DeadStores] Signed-off-by: Michal Orzel Link: https://lore.kernel.org/r/20220423113811.13335-4-michalorzel.eng@gmail.com Signed-off-by: Jens Axboe --- block/partitions/atari.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/partitions/atari.c b/block/partitions/atari.c index da5994175416..9655c728262a 100644 --- a/block/partitions/atari.c +++ b/block/partitions/atari.c @@ -140,7 +140,6 @@ int atari_partition(struct parsed_partitions *state) /* accept only GEM,BGM,RAW,LNX,SWP partitions */ if (!((pi->flg & 1) && OK_id(pi->id))) continue; - part_fmt = 2; put_partition (state, slot, be32_to_cpu(pi->st), be32_to_cpu(pi->siz)); -- cgit From e233fe1aa02815f38588a5a965a197bbcabfb125 Mon Sep 17 00:00:00 2001 From: Michal Orzel Date: Sat, 23 Apr 2022 13:38:11 +0200 Subject: block/partitions/ldm: Remove redundant assignments Get rid of the following redundant assignments: - to a variable r_cols from function ldm_parse_cmp3 - to variables r_id1 and r_id2 from functions ldm_parse_dgr3 and ldm_parse_dgr4 - to a variable r_index from function ldm_parse_prt3 that end up in values not being read until the end of function. Reported by clang-tidy [deadcode.DeadStores] Signed-off-by: Michal Orzel Link: https://lore.kernel.org/r/20220423113811.13335-5-michalorzel.eng@gmail.com Signed-off-by: Jens Axboe --- block/partitions/ldm.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index 27f6c7d9c776..38e58960ae03 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -736,7 +736,6 @@ static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb) len = r_cols; } else { r_stripe = 0; - r_cols = 0; len = r_parent; } if (len < 0) @@ -783,11 +782,8 @@ static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb) r_id1 = ldm_relative (buffer, buflen, 0x24, r_diskid); r_id2 = ldm_relative (buffer, buflen, 0x24, r_id1); len = r_id2; - } else { - r_id1 = 0; - r_id2 = 0; + } else len = r_diskid; - } if (len < 0) return false; @@ -826,11 +822,8 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) r_id1 = ldm_relative (buffer, buflen, 0x44, r_name); r_id2 = ldm_relative (buffer, buflen, 0x44, r_id1); len = r_id2; - } else { - r_id1 = 0; - r_id2 = 0; + } else len = r_name; - } if (len < 0) return false; @@ -963,10 +956,8 @@ static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb) return false; } len = r_index; - } else { - r_index = 0; + } else len = r_diskid; - } if (len < 0) { ldm_error("len %d < 0", len); return false; -- cgit From 9650b453a3d4b1b8ed4ea8bcb9b40109608d1faf Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 20 Apr 2022 22:31:10 +0800 Subject: block: ignore RWF_HIPRI hint for sync dio So far bio is marked as REQ_POLLED if RWF_HIPRI/IOCB_HIPRI is passed from userspace sync io interface, then block layer tries to poll until the bio is completed. But the current implementation calls blk_io_schedule() if bio_poll() returns 0, and this way causes io hang or timeout easily. But looks no one reports this kind of issue, which should have been triggered in normal io poll sanity test or blktests block/007 as observed by Changhui, that means it is very likely that no one uses it or no one cares it. Also after io_uring is invented, io poll for sync dio becomes legacy interface. So ignore RWF_HIPRI hint for sync dio. CC: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Reported-by: Changhui Zhong Suggested-by: Christoph Hellwig Signed-off-by: Ming Lei Tested-by: Changhui Zhong Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220420143110.2679002-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/fops.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) (limited to 'block') diff --git a/block/fops.c b/block/fops.c index e3643362c244..b9b83030e0df 100644 --- a/block/fops.c +++ b/block/fops.c @@ -44,14 +44,6 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb) #define DIO_INLINE_BIO_VECS 4 -static void blkdev_bio_end_io_simple(struct bio *bio) -{ - struct task_struct *waiter = bio->bi_private; - - WRITE_ONCE(bio->bi_private, NULL); - blk_wake_io_task(waiter); -} - static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, unsigned int nr_pages) { @@ -83,8 +75,6 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb)); } bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; - bio.bi_private = current; - bio.bi_end_io = blkdev_bio_end_io_simple; bio.bi_ioprio = iocb->ki_ioprio; ret = bio_iov_iter_get_pages(&bio, iter); @@ -97,18 +87,8 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, if (iocb->ki_flags & IOCB_NOWAIT) bio.bi_opf |= REQ_NOWAIT; - if (iocb->ki_flags & IOCB_HIPRI) - bio_set_polled(&bio, iocb); - submit_bio(&bio); - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!READ_ONCE(bio.bi_private)) - break; - if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, NULL, 0)) - blk_io_schedule(); - } - __set_current_state(TASK_RUNNING); + submit_bio_wait(&bio); bio_release_pages(&bio, should_dirty); if (unlikely(bio.bi_status)) -- cgit From 2524a5783e7d49e7cd936f582485a2bb4567edd1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:09 +0200 Subject: blk-cgroup: remove __bio_blkcg Remove the unused and deprecated __bio_blkcg helper. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.h | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 47e1e38390c9..49e88fc9cc39 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -139,27 +139,6 @@ static inline struct cgroup_subsys_state *blkcg_css(void) return task_css(current, io_cgrp_id); } -/** - * __bio_blkcg - internal, inconsistent version to get blkcg - * - * DO NOT USE. - * This function is inconsistent and consequently is dangerous to use. The - * first part of the function returns a blkcg where a reference is owned by the - * bio. This means it does not need to be rcu protected as it cannot go away - * with the bio owning a reference to it. However, the latter potentially gets - * it from task_css(). This can race against task migration and the cgroup - * dying. It is also semantically different as it must be called rcu protected - * and is susceptible to failure when trying to get a reference to it. - * Therefore, it is not ok to assume that *_get() will always succeed on the - * blkcg returned here. - */ -static inline struct blkcg *__bio_blkcg(struct bio *bio) -{ - if (bio && bio->bi_blkg) - return bio->bi_blkg->blkcg; - return css_to_blkcg(blkcg_css()); -} - /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg * @return: true if this bio needs to be submitted with the root blkg context. @@ -471,8 +450,6 @@ static inline int blkcg_activate_policy(struct request_queue *q, static inline void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol) { } -static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; } - static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return NULL; } static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } -- cgit From db05628435aa761d30b4eae481a82befe7a8492a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:12 +0200 Subject: blk-cgroup: move blkcg_{get,set}_fc_appid out of line No need to have these helpers inline. Also remove the stubs and just use an IS_ENABLED for the get side (the set side already is only built conditionlly). Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-5-hch@lst.de Signed-off-by: Jens Axboe --- block/Makefile | 1 + block/blk-cgroup-fc-appid.c | 57 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 block/blk-cgroup-fc-appid.c (limited to 'block') diff --git a/block/Makefile b/block/Makefile index 3950ecbc5c26..4e01bb71ad6e 100644 --- a/block/Makefile +++ b/block/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o obj-$(CONFIG_BLK_CGROUP_RWSTAT) += blk-cgroup-rwstat.o +obj-$(CONFIG_BLK_CGROUP_FC_APPID) += blk-cgroup-fc-appid.o obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o diff --git a/block/blk-cgroup-fc-appid.c b/block/blk-cgroup-fc-appid.c new file mode 100644 index 000000000000..760a2e1878dd --- /dev/null +++ b/block/blk-cgroup-fc-appid.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "blk-cgroup.h" + +/** + * blkcg_set_fc_appid - set the fc_app_id field associted to blkcg + * @app_id: application identifier + * @cgrp_id: cgroup id + * @app_id_len: size of application identifier + */ +int blkcg_set_fc_appid(char *app_id, u64 cgrp_id, size_t app_id_len) +{ + struct cgroup *cgrp; + struct cgroup_subsys_state *css; + struct blkcg *blkcg; + int ret = 0; + + if (app_id_len > FC_APPID_LEN) + return -EINVAL; + + cgrp = cgroup_get_from_id(cgrp_id); + if (!cgrp) + return -ENOENT; + css = cgroup_get_e_css(cgrp, &io_cgrp_subsys); + if (!css) { + ret = -ENOENT; + goto out_cgrp_put; + } + blkcg = css_to_blkcg(css); + /* + * There is a slight race condition on setting the appid. + * Worst case an I/O may not find the right id. + * This is no different from the I/O we let pass while obtaining + * the vmid from the fabric. + * Adding the overhead of a lock is not necessary. + */ + strlcpy(blkcg->fc_app_id, app_id, app_id_len); + css_put(css); +out_cgrp_put: + cgroup_put(cgrp); + return ret; +} +EXPORT_SYMBOL_GPL(blkcg_set_fc_appid); + +/** + * blkcg_get_fc_appid - get the fc app identifier associated with a bio + * @bio: target bio + * + * On success return the fc_app_id, on failure return NULL + */ +char *blkcg_get_fc_appid(struct bio *bio) +{ + if (!bio->bi_blkg || bio->bi_blkg->blkcg->fc_app_id[0] == '\0') + return NULL; + return bio->bi_blkg->blkcg->fc_app_id; +} +EXPORT_SYMBOL_GPL(blkcg_get_fc_appid); -- cgit From 216889aad362b5b7e998a5371348b5e95d485dd1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:13 +0200 Subject: blk-cgroup: move blk_cgroup_congested out line There is no urgent need to inline this function, so move it out of line. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-6-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8dfe62786cd5..97266ebde975 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1950,6 +1950,26 @@ void blk_cgroup_bio_start(struct bio *bio) put_cpu(); } +bool blk_cgroup_congested(void) +{ + struct cgroup_subsys_state *css; + bool ret = false; + + rcu_read_lock(); + css = kthread_blkcg(); + if (!css) + css = task_css(current, io_cgrp_id); + while (css) { + if (atomic_read(&css->cgroup->congestion_count)) { + ret = true; + break; + } + css = css->parent; + } + rcu_read_unlock(); + return ret; +} + static int __init blkcg_init(void) { blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", -- cgit From 397c9f46ee4d99024c64954b007c1b5762d01cb4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:14 +0200 Subject: blk-cgroup: move blkcg_{pin,unpin}_online out of line Move these two functions out of line as there is no good reason to inline them. Also switch to passing a cgroup_subsys_state instead of doing the conversion in the caller to prepare for making the blkcg structure private to blk-cgroup. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 88 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 65 insertions(+), 23 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 97266ebde975..3af0d4a61955 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -155,6 +155,17 @@ static void blkg_async_bio_workfn(struct work_struct *work) blk_finish_plug(&plug); } +/** + * blkcg_parent - get the parent of a blkcg + * @blkcg: blkcg of interest + * + * Return the parent blkcg of @blkcg. Can be called anytime. + */ +static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) +{ + return css_to_blkcg(blkcg->css.parent); +} + /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with @@ -1015,25 +1026,6 @@ static struct cftype blkcg_legacy_files[] = { * This finally frees the blkcg. */ -/** - * blkcg_css_offline - cgroup css_offline callback - * @css: css of interest - * - * This function is called when @css is about to go away. Here the cgwbs are - * offlined first and only once writeback associated with the blkcg has - * finished do we start step 2 (see above). - */ -static void blkcg_css_offline(struct cgroup_subsys_state *css) -{ - struct blkcg *blkcg = css_to_blkcg(css); - - /* this prevents anyone from attaching or migrating to this blkcg */ - wb_blkcg_offline(blkcg); - - /* put the base online pin allowing step 2 to be triggered */ - blkcg_unpin_online(blkcg); -} - /** * blkcg_destroy_blkgs - responsible for shooting down blkgs * @blkcg: blkcg of interest @@ -1045,7 +1037,7 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) * * This is the blkcg counterpart of ioc_release_fn(). */ -void blkcg_destroy_blkgs(struct blkcg *blkcg) +static void blkcg_destroy_blkgs(struct blkcg *blkcg) { might_sleep(); @@ -1075,6 +1067,57 @@ void blkcg_destroy_blkgs(struct blkcg *blkcg) spin_unlock_irq(&blkcg->lock); } +/** + * blkcg_pin_online - pin online state + * @blkcg_css: blkcg of interest + * + * While pinned, a blkcg is kept online. This is primarily used to + * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline + * while an associated cgwb is still active. + */ +void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css) +{ + refcount_inc(&css_to_blkcg(blkcg_css)->online_pin); +} + +/** + * blkcg_unpin_online - unpin online state + * @blkcg_css: blkcg of interest + * + * This is primarily used to impedance-match blkg and cgwb lifetimes so + * that blkg doesn't go offline while an associated cgwb is still active. + * When this count goes to zero, all active cgwbs have finished so the + * blkcg can continue destruction by calling blkcg_destroy_blkgs(). + */ +void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css) +{ + struct blkcg *blkcg = css_to_blkcg(blkcg_css); + + do { + if (!refcount_dec_and_test(&blkcg->online_pin)) + break; + blkcg_destroy_blkgs(blkcg); + blkcg = blkcg_parent(blkcg); + } while (blkcg); +} + +/** + * blkcg_css_offline - cgroup css_offline callback + * @css: css of interest + * + * This function is called when @css is about to go away. Here the cgwbs are + * offlined first and only once writeback associated with the blkcg has + * finished do we start step 2 (see above). + */ +static void blkcg_css_offline(struct cgroup_subsys_state *css) +{ + /* this prevents anyone from attaching or migrating to this blkcg */ + wb_blkcg_offline(css_to_blkcg(css)); + + /* put the base online pin allowing step 2 to be triggered */ + blkcg_unpin_online(css); +} + static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); @@ -1163,8 +1206,7 @@ unlock: static int blkcg_css_online(struct cgroup_subsys_state *css) { - struct blkcg *blkcg = css_to_blkcg(css); - struct blkcg *parent = blkcg_parent(blkcg); + struct blkcg *parent = blkcg_parent(css_to_blkcg(css)); /* * blkcg_pin_online() is used to delay blkcg offline so that blkgs @@ -1172,7 +1214,7 @@ static int blkcg_css_online(struct cgroup_subsys_state *css) * parent so that offline always happens towards the root. */ if (parent) - blkcg_pin_online(parent); + blkcg_pin_online(css); return 0; } -- cgit From dec223c92a4688f6c9642d640cfe15a99d289dd4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:15 +0200 Subject: blk-cgroup: move struct blkcg to block/blk-cgroup.h There is no real need to expose the blkcg structure to the whole kernel. Move it to the private header an expose a helper to let the writeback code access the cgwb_list member. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-8-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 ++++++++- block/blk-cgroup.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 3af0d4a61955..bb52797c02bd 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1005,6 +1005,13 @@ static struct cftype blkcg_legacy_files[] = { { } /* terminate */ }; +#ifdef CONFIG_CGROUP_WRITEBACK +struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css) +{ + return &css_to_blkcg(css)->cgwb_list; +} +#endif + /* * blkcg destruction is a three-stage process. * @@ -1112,7 +1119,7 @@ void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css) static void blkcg_css_offline(struct cgroup_subsys_state *css) { /* this prevents anyone from attaching or migrating to this blkcg */ - wb_blkcg_offline(css_to_blkcg(css)); + wb_blkcg_offline(css); /* put the base online pin allowing step 2 to be triggered */ blkcg_unpin_online(css); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 49e88fc9cc39..b00fb1169e7c 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -17,10 +17,38 @@ #include #include +struct blkcg_gq; +struct blkg_policy_data; + + /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) #ifdef CONFIG_BLK_CGROUP +struct blkcg { + struct cgroup_subsys_state css; + spinlock_t lock; + refcount_t online_pin; + + struct radix_tree_root blkg_tree; + struct blkcg_gq __rcu *blkg_hint; + struct hlist_head blkg_list; + + struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; + + struct list_head all_blkcgs_node; +#ifdef CONFIG_BLK_CGROUP_FC_APPID + char fc_app_id[FC_APPID_LEN]; +#endif +#ifdef CONFIG_CGROUP_WRITEBACK + struct list_head cgwb_list; +#endif +}; + +static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct blkcg, css) : NULL; +} /* * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a -- cgit From f4a6a61cb6d40d9ae63e47743d33200f3efe3fe7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:16 +0200 Subject: blktrace: cleanup the __trace_note_message interface Pass the cgroup_subsys_state instead of a the blkg so that blktrace doesn't need to poke into blk-cgroup internals, and give the name a blk prefix as the current name is way too generic for a public interface. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-9-hch@lst.de Signed-off-by: Jens Axboe --- block/bfq-iosched.h | 4 ++-- block/blk-throttle.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 978ef5d6fe6a..b18d6c31c225 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -1102,13 +1102,13 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq); break; \ bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \ blk_add_cgroup_trace_msg((bfqd)->queue, \ - bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ + &bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css, \ "%s " fmt, pid_str, ##args); \ } while (0) #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ blk_add_cgroup_trace_msg((bfqd)->queue, \ - bfqg_to_blkg(bfqg)->blkcg, fmt, ##args); \ + &bfqg_to_blkg(bfqg)->blkcg->css, fmt, ##args); \ } while (0) #else /* CONFIG_BFQ_GROUP_IOSCHED */ diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 469c483719be..447e1b8722f7 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -227,7 +227,7 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) break; \ if ((__tg)) { \ blk_add_cgroup_trace_msg(__td->queue, \ - tg_to_blkg(__tg)->blkcg, "throtl " fmt, ##args);\ + &tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\ } else { \ blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ } \ -- cgit From bbb1ebe7a909db4de49777fb7676d5bf293f34c9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:17 +0200 Subject: blk-cgroup: replace bio_blkcg with bio_blkcg_css All callers of bio_blkcg actually want the CSS, so replace it with an interface that does return the CSS. This now allows to move struct blkcg_gq to block/blk-cgroup.h instead of exposing it in a public header. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-10-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 18 ++++++++++++++- block/blk-cgroup.h | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 81 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index bb52797c02bd..8e32cc494808 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -155,6 +155,22 @@ static void blkg_async_bio_workfn(struct work_struct *work) blk_finish_plug(&plug); } +/** + * bio_blkcg_css - return the blkcg CSS associated with a bio + * @bio: target bio + * + * This returns the CSS for the blkcg associated with a bio, or %NULL if not + * associated. Callers are expected to either handle %NULL or know association + * has been done prior to calling this. + */ +struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) +{ + if (!bio || !bio->bi_blkg) + return NULL; + return &bio->bi_blkg->blkcg->css; +} +EXPORT_SYMBOL_GPL(bio_blkcg_css); + /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest @@ -1938,7 +1954,7 @@ void bio_associate_blkg(struct bio *bio) rcu_read_lock(); if (bio->bi_blkg) - css = &bio_blkcg(bio)->css; + css = bio_blkcg_css(bio); else css = blkcg_css(); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index b00fb1169e7c..03405ddf2a7b 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -25,6 +25,64 @@ struct blkg_policy_data; #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) #ifdef CONFIG_BLK_CGROUP + +enum blkg_iostat_type { + BLKG_IOSTAT_READ, + BLKG_IOSTAT_WRITE, + BLKG_IOSTAT_DISCARD, + + BLKG_IOSTAT_NR, +}; + +struct blkg_iostat { + u64 bytes[BLKG_IOSTAT_NR]; + u64 ios[BLKG_IOSTAT_NR]; +}; + +struct blkg_iostat_set { + struct u64_stats_sync sync; + struct blkg_iostat cur; + struct blkg_iostat last; +}; + +/* association between a blk cgroup and a request queue */ +struct blkcg_gq { + /* Pointer to the associated request_queue */ + struct request_queue *q; + struct list_head q_node; + struct hlist_node blkcg_node; + struct blkcg *blkcg; + + /* all non-root blkcg_gq's are guaranteed to have access to parent */ + struct blkcg_gq *parent; + + /* reference count */ + struct percpu_ref refcnt; + + /* is this blkg online? protected by both blkcg and q locks */ + bool online; + + struct blkg_iostat_set __percpu *iostat_cpu; + struct blkg_iostat_set iostat; + + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; + + spinlock_t async_bio_lock; + struct bio_list async_bios; + union { + struct work_struct async_bio_work; + struct work_struct free_work; + }; + + atomic_t use_delay; + atomic64_t delay_nsec; + atomic64_t delay_start; + u64 last_delay; + int last_use; + + struct rcu_head rcu_head; +}; + struct blkcg { struct cgroup_subsys_state css; spinlock_t lock; @@ -173,9 +231,9 @@ static inline struct cgroup_subsys_state *blkcg_css(void) * * In order to avoid priority inversions we sometimes need to issue a bio as if * it were attached to the root blkg, and then backcharge to the actual owning - * blkg. The idea is we do bio_blkcg() to look up the actual context for the - * bio and attach the appropriate blkg to the bio. Then we call this helper and - * if it is true run with the root blkg for that queue and then do any + * blkg. The idea is we do bio_blkcg_css() to look up the actual context for + * the bio and attach the appropriate blkg to the bio. Then we call this helper + * and if it is true run with the root blkg for that queue and then do any * backcharging to the originating cgroup once the io is complete. */ static inline bool bio_issue_as_root_blkg(struct bio *bio) @@ -464,6 +522,9 @@ struct blkcg_policy_data { struct blkcg_policy { }; +struct blkcg { +}; + #ifdef CONFIG_BLOCK static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -- cgit From 7f20ba7c42fd899557cef7d001f48711c3066ba5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:18 +0200 Subject: blk-cgroup: remove pointless CONFIG_BLOCK ifdefs No need to make BLK_CGROUP stubs conditional on CONFIG_BLOCK as they can't be used without that. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-11-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 03405ddf2a7b..a948f4eb0bff 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -525,8 +525,6 @@ struct blkcg_policy { struct blkcg { }; -#ifdef CONFIG_BLOCK - static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) { return NULL; } @@ -554,7 +552,6 @@ static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { r #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) -#endif /* CONFIG_BLOCK */ #endif /* CONFIG_BLK_CGROUP */ #endif /* _BLK_CGROUP_PRIVATE_H */ -- cgit From c97ab271576dec2170e7b804cb05f7617b30fed9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:19 +0200 Subject: blk-cgroup: remove unneeded includes from Remove all the includes that aren't actually needed from and push them to the actual source files where needed. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-12-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index a948f4eb0bff..62ed8ed50b6e 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -15,6 +15,8 @@ */ #include +#include +#include #include struct blkcg_gq; -- cgit From bc5fee91f26d8d1428fb744e5ad04b1417a85197 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:20 +0200 Subject: blk-cgroup: move blkcg_css to blk-cgroup.c blkcg_css is only used in blk-cgroup.c, so move it there. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-13-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 17 +++++++++++++++++ block/blk-cgroup.h | 17 ----------------- 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8e32cc494808..8da00ddc1766 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -59,6 +59,23 @@ static struct workqueue_struct *blkcg_punt_bio_wq; #define BLKG_DESTROY_BATCH_SIZE 64 +/** + * blkcg_css - find the current css + * + * Find the css associated with either the kthread or the current task. + * This may return a dying css, so it is up to the caller to use tryget logic + * to confirm it is alive and well. + */ +static struct cgroup_subsys_state *blkcg_css(void) +{ + struct cgroup_subsys_state *css; + + css = kthread_blkcg(); + if (css) + return css; + return task_css(current, io_cgrp_id); +} + static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 62ed8ed50b6e..bb670e53a1de 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -210,23 +210,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx); void blkg_conf_finish(struct blkg_conf_ctx *ctx); -/** - * blkcg_css - find the current css - * - * Find the css associated with either the kthread or the current task. - * This may return a dying css, so it is up to the caller to use tryget logic - * to confirm it is alive and well. - */ -static inline struct cgroup_subsys_state *blkcg_css(void) -{ - struct cgroup_subsys_state *css; - - css = kthread_blkcg(); - if (css) - return css; - return task_css(current, io_cgrp_id); -} - /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg * @return: true if this bio needs to be submitted with the root blkg context. -- cgit From d200ca143ac6d0b4391b4e811e67e1a36461d501 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:21 +0200 Subject: blk-cgroup: cleanup blk_cgroup_congested Use blkcg_css instead of open coding it, and switch to a slightly more natural for loop. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-14-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8da00ddc1766..5684a8ce1f75 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -2038,15 +2038,11 @@ bool blk_cgroup_congested(void) bool ret = false; rcu_read_lock(); - css = kthread_blkcg(); - if (!css) - css = task_css(current, io_cgrp_id); - while (css) { + for (css = blkcg_css(); css; css = css->parent) { if (atomic_read(&css->cgroup->congestion_count)) { ret = true; break; } - css = css->parent; } rcu_read_unlock(); return ret; -- cgit From 82778259eb201870d6d4f95ca4162de60a682343 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:22 +0200 Subject: blk-cgroup: cleanup blkcg_maybe_throttle_current Use blkcg_css instead of opencoding it. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-15-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5684a8ce1f75..a91f8ae18b49 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1808,7 +1808,6 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) void blkcg_maybe_throttle_current(void) { struct request_queue *q = current->throttle_queue; - struct cgroup_subsys_state *css; struct blkcg *blkcg; struct blkcg_gq *blkg; bool use_memdelay = current->use_memdelay; @@ -1820,12 +1819,7 @@ void blkcg_maybe_throttle_current(void) current->use_memdelay = false; rcu_read_lock(); - css = kthread_blkcg(); - if (css) - blkcg = css_to_blkcg(css); - else - blkcg = css_to_blkcg(task_css(current, io_cgrp_id)); - + blkcg = css_to_blkcg(blkcg_css()); if (!blkcg) goto out; blkg = blkg_lookup(blkcg, q); -- cgit From 513616843d736fb7161b4460cdfe5aa825c5902c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 May 2022 07:29:49 -0700 Subject: block: remove superfluous calls to blkcg_bio_issue_init blkcg_bio_issue_init is called in submit_bio. There is no need to have extra calls that just get overriden in __bio_clone and the two places that copy and pasted from it. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220504142950.567582-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 1 - block/blk-crypto-fallback.c | 1 - block/bounce.c | 1 - 3 files changed, 3 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 212ccd5b5212..aa94195ffd02 100644 --- a/block/bio.c +++ b/block/bio.c @@ -768,7 +768,6 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) bio->bi_iter = bio_src->bi_iter; bio_clone_blkg_association(bio, bio_src); - blkcg_bio_issue_init(bio); if (bio_crypt_clone(bio, bio_src, gfp) < 0) return -ENOMEM; diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 5d1aa5b1d30a..621abd1b0e4d 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -179,7 +179,6 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) bio->bi_io_vec[bio->bi_vcnt++] = bv; bio_clone_blkg_association(bio, bio_src); - blkcg_bio_issue_init(bio); return bio; } diff --git a/block/bounce.c b/block/bounce.c index 467be46d0e65..8f7b6fe3b4db 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -191,7 +191,6 @@ static struct bio *bounce_clone_bio(struct bio *bio_src) goto err_put; bio_clone_blkg_association(bio, bio_src); - blkcg_bio_issue_init(bio); return bio; -- cgit From 7ecc56c62b27d93838ee67fc2c7a1c3c480aea04 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 May 2022 07:29:50 -0700 Subject: block: allow passing a NULL bdev to bio_alloc_clone/bio_init_clone Device mapper wants to allocate a bio before knowing the device it gets send to, so add explicit support for that. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220504142950.567582-3-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index aa94195ffd02..a3893d80dccc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -761,13 +761,15 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) bio_set_flag(bio, BIO_CLONED); if (bio_flagged(bio_src, BIO_THROTTLED)) bio_set_flag(bio, BIO_THROTTLED); - if (bio->bi_bdev == bio_src->bi_bdev && - bio_flagged(bio_src, BIO_REMAPPED)) - bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_iter = bio_src->bi_iter; - bio_clone_blkg_association(bio, bio_src); + if (bio->bi_bdev) { + if (bio->bi_bdev == bio_src->bi_bdev && + bio_flagged(bio_src, BIO_REMAPPED)) + bio_set_flag(bio, BIO_REMAPPED); + bio_clone_blkg_association(bio, bio_src); + } if (bio_crypt_clone(bio, bio_src, gfp) < 0) return -ENOMEM; -- cgit From 069adbac2cd85ae00252da6c5576cbf9b9d9ba6e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 May 2022 07:33:55 -0700 Subject: block: improve the error message from bio_check_eod Print the start sector and length separately instead of the combined value to help with debugging. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20220504143355.568660-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index f305cb66c72a..c3f1e46ddd43 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -592,10 +592,9 @@ static inline int bio_check_eod(struct bio *bio) (nr_sectors > maxsector || bio->bi_iter.bi_sector > maxsector - nr_sectors)) { pr_info_ratelimited("%s: attempt to access beyond end of device\n" - "%pg: rw=%d, want=%llu, limit=%llu\n", - current->comm, - bio->bi_bdev, bio->bi_opf, - bio_end_sector(bio), maxsector); + "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n", + current->comm, bio->bi_bdev, bio->bi_opf, + bio->bi_iter.bi_sector, nr_sectors, maxsector); return -EIO; } return 0; -- cgit From 2a371f7d5fa575010b915e325c5d20b9ad0d5d5a Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Tue, 10 May 2022 11:47:57 +0800 Subject: blk-iocost: combine local_stat and desc_stat to stat When we flush usage, wait, indebt stat in iocg_flush_stat(), we use local_stat and desc_stat, which has no point since the leaf iocg only has local_stat and the inner iocg only has desc_stat. Also we don't need to flush percpu abs_vusage for these inner iocgs. This patch combine local_stat and desc_stat to stat, only flush percpu abs_vusage for active leaf iocgs, then build inner walk list to propagate. Signed-off-by: Chengming Zhou Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220510034757.21761-1-zhouchengming@bytedance.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 71 ++++++++++++++++++++++++------------------------------ 1 file changed, 32 insertions(+), 39 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 70a0a3d680a3..bf83288a2646 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -533,8 +533,7 @@ struct ioc_gq { /* statistics */ struct iocg_pcpu_stat __percpu *pcpu_stat; - struct iocg_stat local_stat; - struct iocg_stat desc_stat; + struct iocg_stat stat; struct iocg_stat last_stat; u64 last_stat_abs_vusage; u64 usage_delta_us; @@ -1371,7 +1370,7 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) return true; } else { if (iocg->indelay_since) { - iocg->local_stat.indelay_us += now->now - iocg->indelay_since; + iocg->stat.indelay_us += now->now - iocg->indelay_since; iocg->indelay_since = 0; } iocg->delay = 0; @@ -1419,7 +1418,7 @@ static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay, /* if debt is paid in full, restore inuse */ if (!iocg->abs_vdebt) { - iocg->local_stat.indebt_us += now->now - iocg->indebt_since; + iocg->stat.indebt_us += now->now - iocg->indebt_since; iocg->indebt_since = 0; propagate_weights(iocg, iocg->active, iocg->last_inuse, @@ -1513,7 +1512,7 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, if (!waitqueue_active(&iocg->waitq)) { if (iocg->wait_since) { - iocg->local_stat.wait_us += now->now - iocg->wait_since; + iocg->stat.wait_us += now->now - iocg->wait_since; iocg->wait_since = 0; } return; @@ -1641,11 +1640,30 @@ static void iocg_build_inner_walk(struct ioc_gq *iocg, } } +/* propagate the deltas to the parent */ +static void iocg_flush_stat_upward(struct ioc_gq *iocg) +{ + if (iocg->level > 0) { + struct iocg_stat *parent_stat = + &iocg->ancestors[iocg->level - 1]->stat; + + parent_stat->usage_us += + iocg->stat.usage_us - iocg->last_stat.usage_us; + parent_stat->wait_us += + iocg->stat.wait_us - iocg->last_stat.wait_us; + parent_stat->indebt_us += + iocg->stat.indebt_us - iocg->last_stat.indebt_us; + parent_stat->indelay_us += + iocg->stat.indelay_us - iocg->last_stat.indelay_us; + } + + iocg->last_stat = iocg->stat; +} + /* collect per-cpu counters and propagate the deltas to the parent */ -static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) +static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; - struct iocg_stat new_stat; u64 abs_vusage = 0; u64 vusage_delta; int cpu; @@ -1661,34 +1679,9 @@ static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) iocg->last_stat_abs_vusage = abs_vusage; iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate); - iocg->local_stat.usage_us += iocg->usage_delta_us; - - /* propagate upwards */ - new_stat.usage_us = - iocg->local_stat.usage_us + iocg->desc_stat.usage_us; - new_stat.wait_us = - iocg->local_stat.wait_us + iocg->desc_stat.wait_us; - new_stat.indebt_us = - iocg->local_stat.indebt_us + iocg->desc_stat.indebt_us; - new_stat.indelay_us = - iocg->local_stat.indelay_us + iocg->desc_stat.indelay_us; - - /* propagate the deltas to the parent */ - if (iocg->level > 0) { - struct iocg_stat *parent_stat = - &iocg->ancestors[iocg->level - 1]->desc_stat; - - parent_stat->usage_us += - new_stat.usage_us - iocg->last_stat.usage_us; - parent_stat->wait_us += - new_stat.wait_us - iocg->last_stat.wait_us; - parent_stat->indebt_us += - new_stat.indebt_us - iocg->last_stat.indebt_us; - parent_stat->indelay_us += - new_stat.indelay_us - iocg->last_stat.indelay_us; - } + iocg->stat.usage_us += iocg->usage_delta_us; - iocg->last_stat = new_stat; + iocg_flush_stat_upward(iocg); } /* get stat counters ready for reading on all active iocgs */ @@ -1699,13 +1692,13 @@ static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now) /* flush leaves and build inner node walk list */ list_for_each_entry(iocg, target_iocgs, active_list) { - iocg_flush_stat_one(iocg, now); + iocg_flush_stat_leaf(iocg, now); iocg_build_inner_walk(iocg, &inner_walk); } /* keep flushing upwards by walking the inner list backwards */ list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) { - iocg_flush_stat_one(iocg, now); + iocg_flush_stat_upward(iocg); list_del_init(&iocg->walk_list); } } @@ -2152,16 +2145,16 @@ static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now) /* flush wait and indebt stat deltas */ if (iocg->wait_since) { - iocg->local_stat.wait_us += now->now - iocg->wait_since; + iocg->stat.wait_us += now->now - iocg->wait_since; iocg->wait_since = now->now; } if (iocg->indebt_since) { - iocg->local_stat.indebt_us += + iocg->stat.indebt_us += now->now - iocg->indebt_since; iocg->indebt_since = now->now; } if (iocg->indelay_since) { - iocg->local_stat.indelay_us += + iocg->stat.indelay_us += now->now - iocg->indelay_since; iocg->indelay_since = now->now; } -- cgit From a3e7689bfaae031177ebbb257d5a704b47236044 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 May 2022 08:36:54 +0200 Subject: block: cleanup the VM accounting in submit_bio submit_bio uses some extremely convoluted checks and confusing comments to only account REQ_OP_READ/REQ_OP_WRITE comments. Just switch to the plain obvious checks instead. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220516063654.2782792-1-hch@lst.de [axboe: fixup WRITE -> REQ_OP_WRITE] Signed-off-by: Jens Axboe --- block/blk-core.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index c3f1e46ddd43..23312df7d25d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -892,19 +892,11 @@ void submit_bio(struct bio *bio) if (blkcg_punt_bio_submit(bio)) return; - /* - * If it's a regular read/write or a barrier with data attached, - * go through the normal accounting stuff before submission. - */ - if (bio_has_data(bio)) { - unsigned int count = bio_sectors(bio); - - if (op_is_write(bio_op(bio))) { - count_vm_events(PGPGOUT, count); - } else { - task_io_account_read(bio->bi_iter.bi_size); - count_vm_events(PGPGIN, count); - } + if (bio_op(bio) == REQ_OP_READ) { + task_io_account_read(bio->bi_iter.bi_size); + count_vm_events(PGPGIN, bio_sectors(bio)); + } else if (bio_op(bio) == REQ_OP_WRITE) { + count_vm_events(PGPGOUT, bio_sectors(bio)); } /* -- cgit From 181490d5321806e537dc5386db5ea640b826bf78 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 13 May 2022 10:35:06 +0800 Subject: block, bfq: protect 'bfqd->queued' by 'bfqd->lock' If bfq_schedule_dispatch() is called from bfq_idle_slice_timer_body(), then 'bfqd->queued' is read without holding 'bfqd->lock'. This is wrong since it can be wrote concurrently. Fix the problem by holding 'bfqd->lock' in such case. Signed-off-by: Yu Kuai Reviewed-by: Jan Kara Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220513023507.2625717-2-yukuai3@huawei.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index e47c75f1fa0f..62180dda6bf9 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -456,6 +456,8 @@ static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) */ void bfq_schedule_dispatch(struct bfq_data *bfqd) { + lockdep_assert_held(&bfqd->lock); + if (bfqd->queued != 0) { bfq_log(bfqd, "schedule dispatch"); blk_mq_run_hw_queues(bfqd->queue, true); @@ -6892,8 +6894,8 @@ bfq_idle_slice_timer_body(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_bfqq_expire(bfqd, bfqq, true, reason); schedule_dispatch: - spin_unlock_irqrestore(&bfqd->lock, flags); bfq_schedule_dispatch(bfqd); + spin_unlock_irqrestore(&bfqd->lock, flags); } /* -- cgit From ddc25c86b466d2359b57bc7798f167baa1735a44 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 13 May 2022 10:35:07 +0800 Subject: block, bfq: make bfq_has_work() more accurate bfq_has_work() is using busy_queues currently, which is not accurate because bfq_queue is busy doesn't represent that it has requests. Since bfqd aready has a counter 'queued' to record how many requests are in bfq, use it instead of busy_queues. Noted that bfq_has_work() can be called with 'bfqd->lock' held, thus the lock can't be held in bfq_has_work() to protect 'bfqd->queued'. Signed-off-by: Yu Kuai Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220513023507.2625717-3-yukuai3@huawei.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 62180dda6bf9..92f0a829a804 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2204,7 +2204,11 @@ static void bfq_add_request(struct request *rq) bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); bfqq->queued[rq_is_sync(rq)]++; - bfqd->queued++; + /* + * Updating of 'bfqd->queued' is protected by 'bfqd->lock', however, it + * may be read without holding the lock in bfq_has_work(). + */ + WRITE_ONCE(bfqd->queued, bfqd->queued + 1); if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) { bfq_check_waker(bfqd, bfqq, now_ns); @@ -2396,7 +2400,11 @@ static void bfq_remove_request(struct request_queue *q, if (rq->queuelist.prev != &rq->queuelist) list_del_init(&rq->queuelist); bfqq->queued[sync]--; - bfqd->queued--; + /* + * Updating of 'bfqd->queued' is protected by 'bfqd->lock', however, it + * may be read without holding the lock in bfq_has_work(). + */ + WRITE_ONCE(bfqd->queued, bfqd->queued - 1); elv_rb_del(&bfqq->sort_list, rq); elv_rqhash_del(q, rq); @@ -5057,11 +5065,11 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; /* - * Avoiding lock: a race on bfqd->busy_queues should cause at + * Avoiding lock: a race on bfqd->queued should cause at * most a call to dispatch for nothing */ return !list_empty_careful(&bfqd->dispatch) || - bfq_tot_busy_queues(bfqd) > 0; + READ_ONCE(bfqd->queued); } static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) -- cgit From 3607849df47822151b05df440759e2dc70160755 Mon Sep 17 00:00:00 2001 From: Wolfgang Bumiller Date: Tue, 11 Jan 2022 09:31:59 +0100 Subject: blk-cgroup: always terminate io.stat lines With the removal of seq_get_buf in blkcg_print_one_stat, we cannot make adding the newline conditional on there being relevant stats because the name was already written out unconditionally. Otherwise we may end up with multiple device names in one line which is confusing and doesn't follow the nested-keyed file format. Signed-off-by: Wolfgang Bumiller Fixes: 252c651a4c85 ("blk-cgroup: stop using seq_get_buf") Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220111083159.42340-1-w.bumiller@proxmox.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 ++------- block/blk-cgroup.h | 2 +- block/blk-iocost.c | 5 ++--- block/blk-iolatency.c | 8 +++----- 4 files changed, 8 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a91f8ae18b49..204442fad33c 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -949,7 +949,6 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) { struct blkg_iostat_set *bis = &blkg->iostat; u64 rbytes, wbytes, rios, wios, dbytes, dios; - bool has_stats = false; const char *dname; unsigned seq; int i; @@ -975,14 +974,12 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) } while (u64_stats_fetch_retry(&bis->sync, seq)); if (rbytes || wbytes || rios || wios) { - has_stats = true; seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", rbytes, wbytes, rios, wios, dbytes, dios); } if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { - has_stats = true; seq_printf(s, " use_delay=%d delay_nsec=%llu", atomic_read(&blkg->use_delay), atomic64_read(&blkg->delay_nsec)); @@ -994,12 +991,10 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) if (!blkg->pd[i] || !pol->pd_stat_fn) continue; - if (pol->pd_stat_fn(blkg->pd[i], s)) - has_stats = true; + pol->pd_stat_fn(blkg->pd[i], s); } - if (has_stats) - seq_printf(s, "\n"); + seq_puts(s, "\n"); } static int blkcg_print_stat(struct seq_file *sf, void *v) diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index bb670e53a1de..d4de0a35e066 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -151,7 +151,7 @@ typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); -typedef bool (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, +typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, struct seq_file *s); struct blkcg_policy { diff --git a/block/blk-iocost.c b/block/blk-iocost.c index bf83288a2646..8132d49df37b 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2988,13 +2988,13 @@ static void ioc_pd_free(struct blkg_policy_data *pd) kfree(iocg); } -static bool ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) +static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct ioc_gq *iocg = pd_to_iocg(pd); struct ioc *ioc = iocg->ioc; if (!ioc->enabled) - return false; + return; if (iocg->level == 0) { unsigned vp10k = DIV64_U64_ROUND_CLOSEST( @@ -3010,7 +3010,6 @@ static bool ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) iocg->last_stat.wait_us, iocg->last_stat.indebt_us, iocg->last_stat.indelay_us); - return true; } static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 2f33932e72e3..5b676c7cf2b6 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -891,7 +891,7 @@ static int iolatency_print_limit(struct seq_file *sf, void *v) return 0; } -static bool iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) +static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) { struct latency_stat stat; int cpu; @@ -914,17 +914,16 @@ static bool iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) (unsigned long long)stat.ps.missed, (unsigned long long)stat.ps.total, iolat->rq_depth.max_depth); - return true; } -static bool iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) +static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct iolatency_grp *iolat = pd_to_lat(pd); unsigned long long avg_lat; unsigned long long cur_win; if (!blkcg_debug_stats) - return false; + return; if (iolat->ssd) return iolatency_ssd_stat(iolat, s); @@ -937,7 +936,6 @@ static bool iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) else seq_printf(s, " depth=%u avg_lat=%llu win=%llu", iolat->rq_depth.max_depth, avg_lat, cur_win); - return true; } static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, -- cgit From 77c570a1ea85ba4ab135c61a028420a6e9fe77f3 Mon Sep 17 00:00:00 2001 From: Fanjun Kong Date: Tue, 17 May 2022 01:39:30 +0800 Subject: blk-cgroup: Remove unnecessary rcu_read_lock/unlock() spin_lock_irq/spin_unlock_irq contains preempt_disable/enable(). Which can serve as RCU read-side critical region, so remove rcu_read_lock/unlock(). Signed-off-by: Fanjun Kong Reviewed-by: Muchun Song Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220516173930.159535-1-bh1scw@gmail.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 204442fad33c..0676cf7a41b5 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1278,14 +1278,13 @@ int blkcg_init_queue(struct request_queue *q) preloaded = !radix_tree_preload(GFP_KERNEL); /* Make sure the root blkg exists. */ - rcu_read_lock(); + /* spin_lock_irq can serve as RCU read-side critical section. */ spin_lock_irq(&q->queue_lock); blkg = blkg_create(&blkcg_root, q, new_blkg); if (IS_ERR(blkg)) goto err_unlock; q->root_blkg = blkg; spin_unlock_irq(&q->queue_lock); - rcu_read_unlock(); if (preloaded) radix_tree_preload_end(); @@ -1311,7 +1310,6 @@ err_destroy_all: return ret; err_unlock: spin_unlock_irq(&q->queue_lock); - rcu_read_unlock(); if (preloaded) radix_tree_preload_end(); return PTR_ERR(blkg); -- cgit From 5a011f889b4832aa80c2a872a5aade5c48d2756f Mon Sep 17 00:00:00 2001 From: Laibin Qiu Date: Tue, 1 Mar 2022 20:39:19 +0800 Subject: blk-throttle: Set BIO_THROTTLED when bio has been throttled 1.In current process, all bio will set the BIO_THROTTLED flag after __blk_throtl_bio(). 2.If bio needs to be throttled, it will start the timer and stop submit bio directly. Bio will submit in blk_throtl_dispatch_work_fn() when the timer expires.But in the current process, if bio is throttled. The BIO_THROTTLED will be set to bio after timer start. If the bio has been completed, it may cause use-after-free blow. BUG: KASAN: use-after-free in blk_throtl_bio+0x12f0/0x2c70 Read of size 2 at addr ffff88801b8902d4 by task fio/26380 dump_stack+0x9b/0xce print_address_description.constprop.6+0x3e/0x60 kasan_report.cold.9+0x22/0x3a blk_throtl_bio+0x12f0/0x2c70 submit_bio_checks+0x701/0x1550 submit_bio_noacct+0x83/0xc80 submit_bio+0xa7/0x330 mpage_readahead+0x380/0x500 read_pages+0x1c1/0xbf0 page_cache_ra_unbounded+0x471/0x6f0 do_page_cache_ra+0xda/0x110 ondemand_readahead+0x442/0xae0 page_cache_async_ra+0x210/0x300 generic_file_buffered_read+0x4d9/0x2130 generic_file_read_iter+0x315/0x490 blkdev_read_iter+0x113/0x1b0 aio_read+0x2ad/0x450 io_submit_one+0xc8e/0x1d60 __se_sys_io_submit+0x125/0x350 do_syscall_64+0x2d/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Allocated by task 26380: kasan_save_stack+0x19/0x40 __kasan_kmalloc.constprop.2+0xc1/0xd0 kmem_cache_alloc+0x146/0x440 mempool_alloc+0x125/0x2f0 bio_alloc_bioset+0x353/0x590 mpage_alloc+0x3b/0x240 do_mpage_readpage+0xddf/0x1ef0 mpage_readahead+0x264/0x500 read_pages+0x1c1/0xbf0 page_cache_ra_unbounded+0x471/0x6f0 do_page_cache_ra+0xda/0x110 ondemand_readahead+0x442/0xae0 page_cache_async_ra+0x210/0x300 generic_file_buffered_read+0x4d9/0x2130 generic_file_read_iter+0x315/0x490 blkdev_read_iter+0x113/0x1b0 aio_read+0x2ad/0x450 io_submit_one+0xc8e/0x1d60 __se_sys_io_submit+0x125/0x350 do_syscall_64+0x2d/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 0: kasan_save_stack+0x19/0x40 kasan_set_track+0x1c/0x30 kasan_set_free_info+0x1b/0x30 __kasan_slab_free+0x111/0x160 kmem_cache_free+0x94/0x460 mempool_free+0xd6/0x320 bio_free+0xe0/0x130 bio_put+0xab/0xe0 bio_endio+0x3a6/0x5d0 blk_update_request+0x590/0x1370 scsi_end_request+0x7d/0x400 scsi_io_completion+0x1aa/0xe50 scsi_softirq_done+0x11b/0x240 blk_mq_complete_request+0xd4/0x120 scsi_mq_done+0xf0/0x200 virtscsi_vq_done+0xbc/0x150 vring_interrupt+0x179/0x390 __handle_irq_event_percpu+0xf7/0x490 handle_irq_event_percpu+0x7b/0x160 handle_irq_event+0xcc/0x170 handle_edge_irq+0x215/0xb20 common_interrupt+0x60/0x120 asm_common_interrupt+0x1e/0x40 Fix this by move BIO_THROTTLED set into the queue_lock. Signed-off-by: Laibin Qiu Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20220301123919.2381579-1-qiulaibin@huawei.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 447e1b8722f7..139b2d7a99e2 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2189,13 +2189,14 @@ again: } out_unlock: - spin_unlock_irq(&q->queue_lock); bio_set_flag(bio, BIO_THROTTLED); #ifdef CONFIG_BLK_DEV_THROTTLING_LOW if (throttled || !td->track_bio_latency) bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY; #endif + spin_unlock_irq(&q->queue_lock); + rcu_read_unlock(); return throttled; } -- cgit From 1305e2c9d91a9f64c0eb5d4e1b5bc29930f3b834 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 18 May 2022 16:32:00 -0600 Subject: blk-cgroup: delete rcu_read_lock_held() WARN_ON_ONCE() A previous commit got rid of unnecessary rcu_read_lock() inside the IRQ disabling queue_lock, but this debug statement was left. It's now firing since we are indeed not inside a RCU read lock, but we don't need to be as we're still preempt safe. Get rid of the check, as we have a lockdep assert for holding the queue lock right after it anyway. Link: https://lore.kernel.org/linux-block/46253c48-81cb-0787-20ad-9133afdd9e21@samsung.com/ Reported-by: Marek Szyprowski Fixes: 77c570a1ea85 ("blk-cgroup: Remove unnecessary rcu_read_lock/unlock()") Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0676cf7a41b5..40161a3f68d0 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -298,7 +298,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_gq *blkg; int i, ret; - WARN_ON_ONCE(!rcu_read_lock_held()); lockdep_assert_held(&q->queue_lock); /* request_queue is dying, do not create/recreate a blkg */ -- cgit From f950667356ce90a41b446b726d4595a10cb65415 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 19 May 2022 12:52:29 +0200 Subject: bfq: Relax waker detection for shared queues Currently we look for waker only if current queue has no requests. This makes sense for bfq queues with a single process however for shared queues when there is a larger number of processes the condition that queue has no requests is difficult to meet because often at least one process has some request in flight although all the others are waiting for the waker to do the work and this harms throughput. Relax the "no queued request for bfq queue" condition to "the current task has no queued requests yet". For this, we also need to start tracking number of requests in flight for each task. This patch (together with the following one) restores the performance for dbench with 128 clients that regressed with commit c65e6fd460b4 ("bfq: Do not let waker requests skip proper accounting") because this commit makes requests of wakers properly enter BFQ queues and thus these queues become ineligible for the old waker detection logic. Dbench results: Vanilla 5.18-rc3 5.18-rc3 + revert 5.18-rc3 patched Mean 1237.36 ( 0.00%) 950.16 * 23.21%* 988.35 * 20.12%* Numbers are time to complete workload so lower is better. Fixes: c65e6fd460b4 ("bfq: Do not let waker requests skip proper accounting") Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20220519105235.31397-1-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 5 +++-- block/bfq-iosched.h | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 92f0a829a804..7545f589d8c3 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2129,7 +2129,6 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfqd->last_completed_rq_bfqq || bfqd->last_completed_rq_bfqq == bfqq || bfq_bfqq_has_short_ttime(bfqq) || - bfqq->dispatched > 0 || now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC || bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq) return; @@ -2210,7 +2209,7 @@ static void bfq_add_request(struct request *rq) */ WRITE_ONCE(bfqd->queued, bfqd->queued + 1); - if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) { + if (bfq_bfqq_sync(bfqq) && RQ_BIC(rq)->requests <= 1) { bfq_check_waker(bfqd, bfqq, now_ns); /* @@ -6573,6 +6572,7 @@ static void bfq_finish_requeue_request(struct request *rq) bfq_completed_request(bfqq, bfqd); } bfq_finish_requeue_request_body(bfqq); + RQ_BIC(rq)->requests--; spin_unlock_irqrestore(&bfqd->lock, flags); /* @@ -6806,6 +6806,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) bfqq_request_allocated(bfqq); bfqq->ref++; + bic->requests++; bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", rq, bfqq, bfqq->ref); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index b18d6c31c225..ca8177d7bf7c 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -468,6 +468,7 @@ struct bfq_io_cq { struct bfq_queue *stable_merge_bfqq; bool stably_merged; /* non splittable if true */ + unsigned int requests; /* Number of requests this process has in flight */ }; /** -- cgit From c5ac56bb6110e42e79d3106866658376b2e48ab9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 19 May 2022 12:52:30 +0200 Subject: bfq: Allow current waker to defend against a tentative one The code in bfq_check_waker() ignores wake up events from the current waker. This makes it more likely we select a new tentative waker although the current one is generating more wake up events. Treat current waker the same way as any other process and allow it to reset the waker detection logic. Fixes: 71217df39dc6 ("block, bfq: make waker-queue detection more robust") Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20220519105235.31397-2-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 7545f589d8c3..9e1d713de942 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2129,8 +2129,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfqd->last_completed_rq_bfqq || bfqd->last_completed_rq_bfqq == bfqq || bfq_bfqq_has_short_ttime(bfqq) || - now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC || - bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq) + now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC) return; /* -- cgit From e79cf8892e332b9dafc99aef02189a2897eced24 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 19 May 2022 12:52:31 +0200 Subject: bfq: Remove superfluous conversion from RQ_BIC() We store struct bfq_io_cq pointer in rq->elv.priv[0] in bfq_init_rq(). Thus a call to icq_to_bic() in RQ_BIC() is wrong. Luckily it does no harm currently because struct io_iq is the first one in struct bfq_io_cq. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20220519105235.31397-3-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 9e1d713de942..62de9f865610 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -374,7 +374,7 @@ static const unsigned long bfq_activation_stable_merging = 600; */ static const unsigned long bfq_late_stable_merging = 600; -#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) +#define RQ_BIC(rq) ((struct bfq_io_cq *)((rq)->elv.priv[0])) #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) -- cgit From a249ca7dfbce1eb82bcd3a5a6bb21daeade20469 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 19 May 2022 12:52:32 +0200 Subject: bfq: Remove bfq_requeue_request_body() The function has only a single caller and two lines. Just remove it since it is pointless and just harming readability. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20220519105235.31397-4-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 62de9f865610..038e075e05b8 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6368,12 +6368,6 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfq_schedule_dispatch(bfqd); } -static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) -{ - bfqq_request_freed(bfqq); - bfq_put_queue(bfqq); -} - /* * The processes associated with bfqq may happen to generate their * cumulative I/O at a lower rate than the rate at which the device @@ -6570,7 +6564,8 @@ static void bfq_finish_requeue_request(struct request *rq) bfq_completed_request(bfqq, bfqd); } - bfq_finish_requeue_request_body(bfqq); + bfqq_request_freed(bfqq); + bfq_put_queue(bfqq); RQ_BIC(rq)->requests--; spin_unlock_irqrestore(&bfqd->lock, flags); -- cgit From 2aaf516084184e4e6f80da01b2b3ed882fd20a79 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 21 May 2022 13:10:39 +0200 Subject: blk-mq: fix typo in comment Spelling mistake (triple letters) in comment. Detected with the help of Coccinelle. Signed-off-by: Julia Lawall Link: https://lore.kernel.org/r/20220521111145.81697-29-Julia.Lawall@inria.fr Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index c4370d276170..9ef2d3284752 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1083,7 +1083,7 @@ bool blk_mq_complete_request_remote(struct request *rq) WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); /* - * For a polled request, always complete locallly, it's pointless + * For a polled request, always complete locally, it's pointless * to redirect the completion. */ if (rq->cmd_flags & REQ_POLLED) -- cgit