From 309dca309fc39a9e3c31b916393b74bd174fd74e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jan 2021 11:02:34 +0100 Subject: block: store a block_device pointer in struct bio Replace the gendisk pointer in struct bio with a pointer to the newly improved struct block device. From that the gendisk can be trivially accessed with an extra indirection, but it also allows us to directly look up all information related to partition remapping. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/bio.c | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) (limited to 'block/bio.c') diff --git a/block/bio.c b/block/bio.c index 1f2cc1fbe283..0b70ade17da6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -607,16 +607,7 @@ void bio_truncate(struct bio *bio, unsigned new_size) */ void guard_bio_eod(struct bio *bio) { - sector_t maxsector; - struct block_device *part; - - rcu_read_lock(); - part = __disk_get_part(bio->bi_disk, bio->bi_partno); - if (part) - maxsector = bdev_nr_sectors(part); - else - maxsector = get_capacity(bio->bi_disk); - rcu_read_unlock(); + sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); if (!maxsector) return; @@ -676,11 +667,10 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio)); /* - * most users will be overriding ->bi_disk with a new target, + * most users will be overriding ->bi_bdev with a new target, * so we don't set nor calculate new physical/hw segment counts here */ - bio->bi_disk = bio_src->bi_disk; - bio->bi_partno = bio_src->bi_partno; + bio->bi_bdev = bio_src->bi_bdev; bio_set_flag(bio, BIO_CLONED); if (bio_flagged(bio_src, BIO_THROTTLED)) bio_set_flag(bio, BIO_THROTTLED); @@ -730,7 +720,7 @@ EXPORT_SYMBOL(bio_clone_fast); const char *bio_devname(struct bio *bio, char *buf) { - return disk_name(bio->bi_disk, bio->bi_partno, buf); + return bdevname(bio->bi_bdev, buf); } EXPORT_SYMBOL(bio_devname); @@ -1037,7 +1027,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) { unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; unsigned int max_append_sectors = queue_max_zone_append_sectors(q); struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; @@ -1145,7 +1135,8 @@ static void submit_bio_wait_endio(struct bio *bio) */ int submit_bio_wait(struct bio *bio) { - DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map); + DECLARE_COMPLETION_ONSTACK_MAP(done, + bio->bi_bdev->bd_disk->lockdep_map); unsigned long hang_check; bio->bi_private = &done; @@ -1422,8 +1413,8 @@ again: if (!bio_integrity_endio(bio)) return; - if (bio->bi_disk) - rq_qos_done_bio(bio->bi_disk->queue, bio); + if (bio->bi_bdev) + rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio); /* * Need to have a real endio function for chained bios, otherwise @@ -1438,8 +1429,8 @@ again: goto again; } - if (bio->bi_disk && bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_complete(bio->bi_disk->queue, bio); + if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } -- cgit From 49d1ec8573f74ff1e23df1d5092211de46baa236 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 11 Jan 2021 11:05:52 +0800 Subject: block: manage
bio slab cache by xarray Manage the bio slab cache in an xarray, using the slab cache size as the index and storing the 'struct bio_slab' instance in the array. This simplifies the code a lot and makes it more readable than before. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Pavel Begunkov Tested-by: Pavel Begunkov Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/bio.c | 116 +++++++++++++++++++++++++----------------------------------- 1 file changed, 49 insertions(+), 67 deletions(-) (limited to 'block/bio.c') diff --git a/block/bio.c b/block/bio.c index 0b70ade17da6..87bf16460e0e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -19,6 +19,7 @@ #include <linux/highmem.h> #include <linux/sched/sysctl.h> #include <linux/blk-crypto.h> +#include <linux/xarray.h> #include <trace/events/block.h> #include "blk.h" @@ -58,89 +59,80 @@ struct bio_slab { char name[8]; }; static DEFINE_MUTEX(bio_slab_lock); -static struct bio_slab *bio_slabs; -static unsigned int bio_slab_nr, bio_slab_max; +static DEFINE_XARRAY(bio_slabs); -static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) +static struct bio_slab *create_bio_slab(unsigned int size) { - unsigned int sz = sizeof(struct bio) + extra_size; - struct kmem_cache *slab = NULL; - struct bio_slab *bslab, *new_bio_slabs; - unsigned int new_bio_slab_max; - unsigned int i, entry = -1; + struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL); - mutex_lock(&bio_slab_lock); + if (!bslab) + return NULL; - i = 0; - while (i < bio_slab_nr) { - bslab = &bio_slabs[i]; + snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size); + bslab->slab = kmem_cache_create(bslab->name, size, + ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL); + if (!bslab->slab) + goto fail_alloc_slab; - if (!bslab->slab && entry == -1) - entry = i; - else if (bslab->slab_size == sz) { - slab = bslab->slab; - bslab->slab_ref++; - break; - } - i++; - } + bslab->slab_ref = 1; + bslab->slab_size = size; - if (slab) - goto out_unlock; - - if (bio_slab_nr == bio_slab_max && entry == -1) { - new_bio_slab_max = bio_slab_max << 1; - new_bio_slabs = krealloc(bio_slabs, - new_bio_slab_max * sizeof(struct bio_slab), - GFP_KERNEL); - if (!new_bio_slabs) - goto out_unlock; - bio_slab_max = new_bio_slab_max; - bio_slabs = new_bio_slabs; - } - if (entry == -1) - entry = bio_slab_nr++; + if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL))) + return bslab; - bslab = &bio_slabs[entry]; + kmem_cache_destroy(bslab->slab); - snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); - slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN, - SLAB_HWCACHE_ALIGN, NULL); - if (!slab) - goto out_unlock; +fail_alloc_slab: + kfree(bslab); + return NULL; +} - bslab->slab = slab; - bslab->slab_ref = 1; - bslab->slab_size = sz; -out_unlock: +static inline unsigned int bs_bio_slab_size(struct bio_set *bs) +{ + return bs->front_pad + sizeof(struct bio) + + BIO_INLINE_VECS * sizeof(struct bio_vec); +} + +static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs) +{ + unsigned int size = bs_bio_slab_size(bs); + struct bio_slab *bslab; + + mutex_lock(&bio_slab_lock); + bslab = xa_load(&bio_slabs, size); + if (bslab) + bslab->slab_ref++; + else + bslab = create_bio_slab(size); mutex_unlock(&bio_slab_lock); - return slab; + + if (bslab) + return bslab->slab; + return NULL; } static void bio_put_slab(struct bio_set *bs) { struct bio_slab *bslab = NULL; - unsigned int i; + unsigned int slab_size = bs_bio_slab_size(bs); mutex_lock(&bio_slab_lock); - for (i = 0; i < bio_slab_nr; i++) { - if (bs->bio_slab == bio_slabs[i].slab) { - bslab =
&bio_slabs[i]; - break; - } - } - + bslab = xa_load(&bio_slabs, slab_size); if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) goto out; + WARN_ON_ONCE(bslab->slab != bs->bio_slab); + WARN_ON(!bslab->slab_ref); if (--bslab->slab_ref) goto out; + xa_erase(&bio_slabs, slab_size); + kmem_cache_destroy(bslab->slab); - bslab->slab = NULL; + kfree(bslab); out: mutex_unlock(&bio_slab_lock); @@ -1570,15 +1562,13 @@ int bioset_init(struct bio_set *bs, unsigned int front_pad, int flags) { - unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); - bs->front_pad = front_pad; spin_lock_init(&bs->rescue_lock); bio_list_init(&bs->rescue_list); INIT_WORK(&bs->rescue_work, bio_alloc_rescue); - bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); + bs->bio_slab = bio_find_or_create_slab(bs); if (!bs->bio_slab) return -ENOMEM; @@ -1642,16 +1632,8 @@ static void __init biovec_init_slabs(void) static int __init init_bio(void) { - bio_slab_max = 2; - bio_slab_nr = 0; - bio_slabs = kcalloc(bio_slab_max, sizeof(struct bio_slab), - GFP_KERNEL); - BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET); - if (!bio_slabs) - panic("bio: can't allocate bios\n"); - bio_integrity_init(); biovec_init_slabs(); -- cgit From 9f180e315a93cde559ac1c9c4c5ce980aa681c1c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 11 Jan 2021 11:05:54 +0800 Subject: block: don't allocate inline bvecs if this bioset needn't bvecs The inline bvecs won't be used if the user doesn't need bvecs (signalled by not passing BIOSET_NEED_BVECS), so don't allocate them in this situation. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Pavel Begunkov Tested-by: Pavel Begunkov Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/bio.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'block/bio.c') diff --git a/block/bio.c b/block/bio.c index 87bf16460e0e..8ccda51dd831 100644 --- a/block/bio.c +++ b/block/bio.c @@ -89,8 +89,7 @@ fail_alloc_slab: static inline unsigned int bs_bio_slab_size(struct bio_set *bs) { - return bs->front_pad + sizeof(struct bio) + - BIO_INLINE_VECS * sizeof(struct bio_vec); + return bs->front_pad + sizeof(struct bio) + bs->back_pad; } static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs) @@ -1563,6 +1562,10 @@ int bioset_init(struct bio_set *bs, int flags) { bs->front_pad = front_pad; + if (flags & BIOSET_NEED_BVECS) + bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); + else + bs->back_pad = 0; spin_lock_init(&bs->rescue_lock); bio_list_init(&bs->rescue_list); -- cgit From baa2c7c97153b8064dbeeb99f2f72de3a75c90a7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 11 Jan 2021 11:05:55 +0800 Subject: block: set .bi_max_vecs as actual allocated vector number bvec_alloc() may allocate more bio vectors than requested, so set .bi_max_vecs to the actually allocated vector count instead of the requested number. This helps filesystems build bigger bios, because a new bio often won't be allocated until the current one becomes full.
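As a hedged illustration (not part of the patch: submit_pages() and its arguments are made up, while bio_alloc_bioset(), bio_add_page(), bio_set_dev() and submit_bio() are the real interfaces), a writeback-style loop now gets more pages into each bio automatically:

/*
 * Illustrative sketch only.  With this change, a bio allocated with a
 * request for 8 vecs reports bi_max_vecs == 16 (the biovec-16 slab),
 * so bio_add_page() keeps succeeding longer before a new bio is needed.
 */
static void submit_pages(struct block_device *bdev, sector_t sector,
			 struct page **pages, unsigned int nr_pages)
{
	struct bio *bio = NULL;
	unsigned int i;

	for (i = 0; i < nr_pages; i++) {
		if (bio && !bio_add_page(bio, pages[i], PAGE_SIZE, 0)) {
			/* current bio is full, send it off */
			submit_bio(bio);
			bio = NULL;
		}
		if (!bio) {
			/* GFP_NOIO allows reclaim, so this cannot fail */
			bio = bio_alloc_bioset(GFP_NOIO, 8, &fs_bio_set);
			bio_set_dev(bio, bdev);
			bio->bi_opf = REQ_OP_WRITE;
			bio->bi_iter.bi_sector = sector +
				((sector_t)i << (PAGE_SHIFT - 9));
			bio_add_page(bio, pages[i], PAGE_SIZE, 0);
		}
	}
	if (bio)
		submit_bio(bio);
}

Note that the loop always submits the full bio before allocating the next one, as required for allocations from a shared bio_set.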
Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Pavel Begunkov Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/bio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block/bio.c') diff --git a/block/bio.c b/block/bio.c index 8ccda51dd831..56a06f94fb63 100644 --- a/block/bio.c +++ b/block/bio.c @@ -505,12 +505,13 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, goto err_free; bio->bi_flags |= idx << BVEC_POOL_OFFSET; + bio->bi_max_vecs = bvec_nr_vecs(idx); } else if (nr_iovecs) { bvl = bio->bi_inline_vecs; + bio->bi_max_vecs = inline_vecs; } bio->bi_pool = bs; - bio->bi_max_vecs = nr_iovecs; bio->bi_io_vec = bvl; return bio; -- cgit From 0cf41e5e9bafc185490624c3e321c915885a91f3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 9 Jan 2021 16:02:59 +0000 Subject: block/psi: remove PSI annotations from direct IO Direct IO does not operate on the current working set of pages managed by the kernel, so it should not be accounted as memory stall to the PSI infrastructure. The block layer and iomap direct IO use bio_iov_iter_get_pages() to build bios, and they are the only users of it, so to avoid PSI tracking for them, clear out the BIO_WORKINGSET flag. Do the same for dio_bio_submit(), because fs/direct_io constructs bios by hand, directly calling bio_add_page(). Reported-by: Christoph Hellwig Suggested-by: Christoph Hellwig Suggested-by: Johannes Weiner Reviewed-by: Christoph Hellwig Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'block/bio.c') diff --git a/block/bio.c b/block/bio.c index 56a06f94fb63..1cd8a2e79048 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1081,6 +1081,9 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) * fit into the bio, or are requested in @iter, whatever is smaller. If * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. + * + * It's intended for direct IO, so doesn't do PSI tracking, the caller is + * responsible for setting BIO_WORKINGSET if necessary. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { @@ -1105,6 +1108,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) if (is_bvec) bio_set_flag(bio, BIO_NO_PAGE_REF); + + /* don't account direct I/O as memory stall */ + bio_clear_flag(bio, BIO_WORKINGSET); return bio->bi_vcnt ? 0 : ret; } EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); -- cgit From c42bca92be928ce7dece5fc04cf68d0e37ee6718 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 9 Jan 2021 16:03:03 +0000 Subject: bio: don't copy bvec for direct IO The block layer spends quite a while in blkdev_direct_IO() copying and initialising the bio's bvec. However, if we've already got a bvec in the input iterator it might be reused in some cases, i.e. when the new ITER_BVEC_FLAG_FIXED flag is set. Simple tests show considerable performance boost, and it also reduces memory footprint.
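A hedged sketch of a caller under the new model (blkdev_issue_from_bvec() is a made-up helper; the real users are blkdev_direct_IO() and iomap direct IO): because the bio now references the caller's bvec array instead of copying it, the array and its pages must stay alive until completion, which is trivially true below since the submission is synchronous.

/*
 * Illustrative sketch only: issue a synchronous direct write from an
 * existing bvec array without copying it into the bio.
 */
static int blkdev_issue_from_bvec(struct block_device *bdev, loff_t pos,
				  struct bio_vec *bvec, unsigned int nr_segs,
				  size_t bytes)
{
	struct iov_iter iter;
	struct bio *bio;
	int ret;

	iov_iter_bvec(&iter, WRITE, bvec, nr_segs, bytes);

	/* no bvecs need to be allocated, the iterator's are reused */
	bio = bio_alloc(GFP_KERNEL, 0);
	if (!bio)
		return -ENOMEM;
	bio_set_dev(bio, bdev);
	bio->bi_opf = REQ_OP_WRITE;
	bio->bi_iter.bi_sector = pos >> 9;

	ret = bio_iov_iter_get_pages(bio, &iter);
	if (!ret)
		ret = submit_bio_wait(bio);
	/* BIO_NO_PAGE_REF is set, so this does not drop page references */
	bio_put(bio);
	return ret;
}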
Suggested-by: Matthew Wilcox Reviewed-by: Christoph Hellwig Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 67 ++++++++++++++++++++++++++----------------------------------- 1 file changed, 29 insertions(+), 38 deletions(-) (limited to 'block/bio.c') diff --git a/block/bio.c b/block/bio.c index 1cd8a2e79048..99040a7e6656 100644 --- a/block/bio.c +++ b/block/bio.c @@ -942,21 +942,17 @@ void bio_release_pages(struct bio *bio, bool mark_dirty) } EXPORT_SYMBOL_GPL(bio_release_pages); -static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) +static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) { - const struct bio_vec *bv = iter->bvec; - unsigned int len; - size_t size; - - if (WARN_ON_ONCE(iter->iov_offset > bv->bv_len)) - return -EINVAL; - - len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count); - size = bio_add_page(bio, bv->bv_page, len, - bv->bv_offset + iter->iov_offset); - if (unlikely(size != len)) - return -EINVAL; - iov_iter_advance(iter, size); + WARN_ON_ONCE(BVEC_POOL_IDX(bio) != 0); + + bio->bi_vcnt = iter->nr_segs; + bio->bi_max_vecs = iter->nr_segs; + bio->bi_io_vec = (struct bio_vec *)iter->bvec; + bio->bi_iter.bi_bvec_done = iter->iov_offset; + bio->bi_iter.bi_size = iter->count; + + iov_iter_advance(iter, iter->count); return 0; } @@ -1070,12 +1066,12 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and * map them into the kernel. On IO completion, the caller should put those - * pages. If we're adding kernel pages, and the caller told us it's safe to - * do so, we just have to add the pages to the bio directly. We don't grab an - * extra reference to those pages (the user should already have that), and we - * don't put the page on IO completion. The caller needs to check if the bio is - * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be - * released. + * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided + * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs + * to ensure the bvecs and pages stay referenced until the submitted I/O is + * completed by a call to ->ki_complete() or returns with an error other than + * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF + * on IO completion. If it isn't, then pages should be released. * * The function tries, but does not guarantee, to pin as many pages as * fit into the bio, or are requested in @iter, whatever is smaller. 
If @@ -1087,27 +1083,22 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { - const bool is_bvec = iov_iter_is_bvec(iter); - int ret; - - if (WARN_ON_ONCE(bio->bi_vcnt)) - return -EINVAL; + int ret = 0; - do { - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - if (WARN_ON_ONCE(is_bvec)) - return -EINVAL; - ret = __bio_iov_append_get_pages(bio, iter); - } else { - if (is_bvec) - ret = __bio_iov_bvec_add_pages(bio, iter); + if (iov_iter_is_bvec(iter)) { + if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) + return -EINVAL; + bio_iov_bvec_set(bio, iter); + bio_set_flag(bio, BIO_NO_PAGE_REF); + return 0; + } else { + do { + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + ret = __bio_iov_append_get_pages(bio, iter); else ret = __bio_iov_iter_get_pages(bio, iter); - } - } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); - - if (is_bvec) - bio_set_flag(bio, BIO_NO_PAGE_REF); + } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); + } /* don't account direct I/O as memory stall */ bio_clear_flag(bio, BIO_WORKINGSET); -- cgit From 46bbf653a67a36989a55dbb894c8b94c5ecb2858 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:33:08 +0100 Subject: block: inherit BIO_REMAPPED when cloning bios Cloned bios can be used on the same device, in which case we need to inherit the BIO_REMAPPED flag to avoid a double partition remap. When the cloned bios are used on another device, bio_set_dev will clear the flag. Fixes: 309dca309fc3 ("block: store a block_device pointer in struct bio") Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block/bio.c') diff --git a/block/bio.c b/block/bio.c index 99040a7e6656..dfd7740a3230 100644 --- a/block/bio.c +++ b/block/bio.c @@ -666,6 +666,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) bio_set_flag(bio, BIO_CLONED); if (bio_flagged(bio_src, BIO_THROTTLED)) bio_set_flag(bio, BIO_THROTTLED); + if (bio_flagged(bio_src, BIO_REMAPPED)) + bio_set_flag(bio, BIO_REMAPPED); bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; -- cgit From 3175199ab0ac8c874ec25c6bf169f74888917435 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:34 +0100 Subject: block: split bio_kmalloc from bio_alloc_bioset bio_kmalloc shares almost no logic with the bio_set based fast path in bio_alloc_bioset. Split it into an entirely separate implementation. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/bio.c | 167 +++++++++++++++++++++++++++++++----------------------------- 1 file changed, 85 insertions(+), 82 deletions(-) (limited to 'block/bio.c') diff --git a/block/bio.c b/block/bio.c index dfd7740a3230..d4375619348c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -396,123 +396,101 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * @nr_iovecs: number of iovecs to pre-allocate * @bs: the bio_set to allocate from. * - * Description: - * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is - * backed by the @bs's mempool. + * Allocate a bio from the mempools in @bs. * - * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will - * always be able to allocate a bio. This is due to the mempool guarantees. 
- * To make this work, callers must never allocate more than 1 bio at a time - * from this pool. Callers that need to allocate more than 1 bio must always - * submit the previously allocated bio for IO before attempting to allocate - * a new one. Failure to do so can cause deadlocks under memory pressure. + * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to + * allocate a bio. This is due to the mempool guarantees. To make this work, + * callers must never allocate more than 1 bio at a time from the general pool. + * Callers that need to allocate more than 1 bio must always submit the + * previously allocated bio for IO before attempting to allocate a new one. + * Failure to do so can cause deadlocks under memory pressure. * - * Note that when running under submit_bio_noacct() (i.e. any block - * driver), bios are not submitted until after you return - see the code in - * submit_bio_noacct() that converts recursion into iteration, to prevent - * stack overflows. + * Note that when running under submit_bio_noacct() (i.e. any block driver), + * bios are not submitted until after you return - see the code in + * submit_bio_noacct() that converts recursion into iteration, to prevent + * stack overflows. * - * This would normally mean allocating multiple bios under - * submit_bio_noacct() would be susceptible to deadlocks, but we have - * deadlock avoidance code that resubmits any blocked bios from a rescuer - * thread. + * This would normally mean allocating multiple bios under submit_bio_noacct() + * would be susceptible to deadlocks, but we have + * deadlock avoidance code that resubmits any blocked bios from a rescuer + * thread. * - * However, we do not guarantee forward progress for allocations from other - * mempools. Doing multiple allocations from the same mempool under - * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad - * for per bio allocations. + * However, we do not guarantee forward progress for allocations from other + * mempools. Doing multiple allocations from the same mempool under + * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad + * for per bio allocations. * - * RETURNS: - * Pointer to new bio on success, NULL on failure. + * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; - unsigned front_pad; - unsigned inline_vecs; - struct bio_vec *bvl = NULL; struct bio *bio; void *p; - if (!bs) { - if (nr_iovecs > UIO_MAXIOV) - return NULL; - - p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); - front_pad = 0; - inline_vecs = nr_iovecs; - } else { - /* should not use nobvec bioset for nr_iovecs > 0 */ - if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && - nr_iovecs > 0)) - return NULL; - /* - * submit_bio_noacct() converts recursion to iteration; this - * means if we're running beneath it, any bios we allocate and - * submit will not be submitted (and thus freed) until after we - * return. - * - * This exposes us to a potential deadlock if we allocate - * multiple bios from the same bio_set() while running - * underneath submit_bio_noacct(). If we were to allocate - * multiple bios (say a stacking block driver that was splitting - * bios), we would deadlock if we exhausted the mempool's - * reserve. - * - * We solve this, and guarantee forward progress, with a rescuer - * workqueue per bio_set. 
If we go to allocate and there are - * bios on current->bio_list, we first try the allocation - * without __GFP_DIRECT_RECLAIM; if that fails, we punt those - * bios we would be blocking to the rescuer workqueue before - * we retry with the original gfp_flags. - */ - - if (current->bio_list && - (!bio_list_empty(¤t->bio_list[0]) || - !bio_list_empty(¤t->bio_list[1])) && - bs->rescue_workqueue) - gfp_mask &= ~__GFP_DIRECT_RECLAIM; + /* should not use nobvec bioset for nr_iovecs > 0 */ + if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_iovecs > 0)) + return NULL; + /* + * submit_bio_noacct() converts recursion to iteration; this means if + * we're running beneath it, any bios we allocate and submit will not be + * submitted (and thus freed) until after we return. + * + * This exposes us to a potential deadlock if we allocate multiple bios + * from the same bio_set() while running underneath submit_bio_noacct(). + * If we were to allocate multiple bios (say a stacking block driver + * that was splitting bios), we would deadlock if we exhausted the + * mempool's reserve. + * + * We solve this, and guarantee forward progress, with a rescuer + * workqueue per bio_set. If we go to allocate and there are bios on + * current->bio_list, we first try the allocation without + * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be + * blocking to the rescuer workqueue before we retry with the original + * gfp_flags. + */ + if (current->bio_list && + (!bio_list_empty(¤t->bio_list[0]) || + !bio_list_empty(¤t->bio_list[1])) && + bs->rescue_workqueue) + gfp_mask &= ~__GFP_DIRECT_RECLAIM; + + p = mempool_alloc(&bs->bio_pool, gfp_mask); + if (!p && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; p = mempool_alloc(&bs->bio_pool, gfp_mask); - if (!p && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - p = mempool_alloc(&bs->bio_pool, gfp_mask); - } - - front_pad = bs->front_pad; - inline_vecs = BIO_INLINE_VECS; } - if (unlikely(!p)) return NULL; - bio = p + front_pad; - bio_init(bio, NULL, 0); - - if (nr_iovecs > inline_vecs) { + bio = p + bs->front_pad; + if (nr_iovecs > BIO_INLINE_VECS) { unsigned long idx = 0; + struct bio_vec *bvl = NULL; bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, + &bs->bvec_pool); } if (unlikely(!bvl)) goto err_free; bio->bi_flags |= idx << BVEC_POOL_OFFSET; - bio->bi_max_vecs = bvec_nr_vecs(idx); + bio_init(bio, bvl, bvec_nr_vecs(idx)); } else if (nr_iovecs) { - bvl = bio->bi_inline_vecs; - bio->bi_max_vecs = inline_vecs; + bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS); + } else { + bio_init(bio, NULL, 0); } bio->bi_pool = bs; - bio->bi_io_vec = bvl; return bio; err_free: @@ -521,6 +499,31 @@ err_free: } EXPORT_SYMBOL(bio_alloc_bioset); +/** + * bio_kmalloc - kmalloc a bio for I/O + * @gfp_mask: the GFP_* mask given to the slab allocator + * @nr_iovecs: number of iovecs to pre-allocate + * + * Use kmalloc to allocate and initialize a bio. + * + * Returns: Pointer to new bio on success, NULL on failure. + */ +struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) +{ + struct bio *bio; + + if (nr_iovecs > UIO_MAXIOV) + return NULL; + + bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); + if (unlikely(!bio)) + return NULL; + bio_init(bio, nr_iovecs ? 
bio->bi_inline_vecs : NULL, nr_iovecs); + bio->bi_pool = NULL; + return bio; +} +EXPORT_SYMBOL(bio_kmalloc); + void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { unsigned long flags; -- cgit From 8358c28a5d44bf0223a55a2334086c3707bb4185 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 2 Feb 2021 23:54:10 +0800 Subject: block: fix memory leak of bvec bio_init() clears bio instance, so the bvec index has to be set after bio_init(), otherwise bio->bi_io_vec may be leaked. Fixes: 3175199ab0ac ("block: split bio_kmalloc from bio_alloc_bioset") Cc: Johannes Thumshirn Cc: Chaitanya Kulkarni Cc: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block/bio.c') diff --git a/block/bio.c b/block/bio.c index d4375619348c..757fee46cefc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -482,8 +482,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, if (unlikely(!bvl)) goto err_free; - bio->bi_flags |= idx << BVEC_POOL_OFFSET; bio_init(bio, bvl, bvec_nr_vecs(idx)); + bio->bi_flags |= idx << BVEC_POOL_OFFSET; } else if (nr_iovecs) { bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS); } else { -- cgit From dc0b8a57ad7b05036fcb19a5bf0319467597e67a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:19 +0100 Subject: block: reuse BIO_INLINE_VECS for integrity bvecs bvec_alloc always uses biovec_slabs, and thus always needs to use the same number of inline vecs. Share a single definition for the data and integrity bvecs. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'block/bio.c') diff --git a/block/bio.c b/block/bio.c index 757fee46cefc..cee2d310f02e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -25,12 +25,6 @@ #include "blk.h" #include "blk-rq-qos.h" -/* - * Test patch to inline a certain number of bi_io_vec's inside the bio - * itself, to shrink a bio data allocation from two mempool calls to one - */ -#define BIO_INLINE_VECS 4 - /* * if you change this list, also change bvec_alloc or things will * break badly! cannot be bigger than what you can fit into an -- cgit From 6ac0b71537e1c14e7532408fe4aae553aa314237 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:20 +0100 Subject: block: move struct biovec_slab to bio.c struct biovec_slab is only used inside of bio.c, so move it there. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'block/bio.c') diff --git a/block/bio.c b/block/bio.c index cee2d310f02e..2c359dadfdf6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -25,6 +25,12 @@ #include "blk.h" #include "blk-rq-qos.h" +struct biovec_slab { + int nr_vecs; + char *name; + struct kmem_cache *slab; +}; + /* * if you change this list, also change bvec_alloc or things will * break badly! cannot be bigger than what you can fit into an -- cgit From f2c3eb9bb0ef77517976f8be926a77a574da8fe3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:21 +0100 Subject: block: factor out a bvec_alloc_gfp helper Clean up bvec_alloc a little by factoring out a helper for the gfp_t manipulations. 
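A hedged sketch of the same first-try-then-mempool pattern applied to a hypothetical object cache (my_cache and my_pool are illustrative only; the gfp arithmetic mirrors the bvec_alloc_gfp() helper added in the patch below):

/*
 * Illustrative sketch: optimistic slab allocation with a mempool
 * fallback, as bvec_alloc() does after this change.
 */
static inline gfp_t first_try_gfp(gfp_t gfp)
{
	/* no reclaim, no IO recursion, no warnings - we have a fallback */
	return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) |
		__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
}

static void *my_alloc(struct kmem_cache *my_cache, mempool_t *my_pool,
		      gfp_t gfp)
{
	void *p = kmem_cache_alloc(my_cache, first_try_gfp(gfp));

	if (p || !(gfp & __GFP_DIRECT_RECLAIM))
		return p;
	/* the slab said no; fall back to the guaranteed mempool reserve */
	return mempool_alloc(my_pool, gfp);
}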
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'block/bio.c') diff --git a/block/bio.c b/block/bio.c index 2c359dadfdf6..c2152c4bf8a3 100644 --- a/block/bio.c +++ b/block/bio.c @@ -159,6 +159,16 @@ void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) } } +/* + * Make the first allocation restricted and don't dump info on allocation + * failures, since we'll fall back to the mempool in case of failure. + */ +static inline gfp_t bvec_alloc_gfp(gfp_t gfp) +{ + return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) | + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; +} + struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, mempool_t *pool) { @@ -199,20 +209,12 @@ fallback: bvl = mempool_alloc(pool, gfp_mask); } else { struct biovec_slab *bvs = bvec_slabs + *idx; - gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); - - /* - * Make this allocation restricted and don't dump info on - * allocation failures, since we'll fallback to the mempool - * in case of failure. - */ - __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; /* * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM * is set, retry with the 1-entry mempool */ - bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); + bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { *idx = BVEC_POOL_MAX; goto fallback; -- cgit From f007a3d66c5480c8dae3fa20a89a06861ef1f5db Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:22 +0100 Subject: block: streamline bvec_alloc Avoid the pointless goto by trying the slab allocation first and falling through to the mempool. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'block/bio.c') diff --git a/block/bio.c b/block/bio.c index c2152c4bf8a3..321b3479a154 100644 --- a/block/bio.c +++ b/block/bio.c @@ -172,8 +172,6 @@ static inline gfp_t bvec_alloc_gfp(gfp_t gfp) struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, mempool_t *pool) { - struct bio_vec *bvl; - /* * see comment near bvec_array define! */ @@ -201,28 +199,24 @@ struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, } /* - * idx now points to the pool we want to allocate from. only the - * 1-vec entry pool is mempool backed. + * Try a slab allocation first for all smaller allocations. If that + * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. + * The mempool is sized to handle up to BIO_MAX_PAGES entries. */ - if (*idx == BVEC_POOL_MAX) { -fallback: - bvl = mempool_alloc(pool, gfp_mask); - } else { + if (*idx < BVEC_POOL_MAX) { struct biovec_slab *bvs = bvec_slabs + *idx; + struct bio_vec *bvl; - /* - * Try a slab allocation. 
If this fails and __GFP_DIRECT_RECLAIM - * is set, retry with the 1-entry mempool - */ bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); - if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { - *idx = BVEC_POOL_MAX; - goto fallback; + if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) { + (*idx)++; + return bvl; } + *idx = BVEC_POOL_MAX; } (*idx)++; - return bvl; + return mempool_alloc(pool, gfp_mask); } void bio_uninit(struct bio *bio) -- cgit From de76fd893074ab2cea132c28ac9efd9d0434215e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:23 +0100 Subject: block: remove the 1 and 4 vec bvec_slabs entries All bios with up to 4 bvecs use the inline bvecs in the bio itself, so don't bother to define bvec_slabs entries for them. Also decruftify the bvec_slabs definition and initialization while we're at it. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 53 ++++++++++++++++------------------------------------- 1 file changed, 16 insertions(+), 37 deletions(-) (limited to 'block/bio.c') diff --git a/block/bio.c b/block/bio.c index 321b3479a154..ae241252ea14 100644 --- a/block/bio.c +++ b/block/bio.c @@ -25,23 +25,17 @@ #include "blk.h" #include "blk-rq-qos.h" -struct biovec_slab { +static struct biovec_slab { int nr_vecs; char *name; struct kmem_cache *slab; +} bvec_slabs[] __read_mostly = { + { .nr_vecs = 16, .name = "biovec-16" }, + { .nr_vecs = 64, .name = "biovec-64" }, + { .nr_vecs = 128, .name = "biovec-128" }, + { .nr_vecs = BIO_MAX_PAGES, .name = "biovec-max" }, }; -/* - * if you change this list, also change bvec_alloc or things will - * break badly! cannot be bigger than what you can fit into an - * unsigned short - */ -#define BV(x, n) { .nr_vecs = x, .name = "biovec-"#n } -static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = { - BV(1, 1), BV(4, 4), BV(16, 16), BV(64, 64), BV(128, 128), BV(BIO_MAX_PAGES, max), -}; -#undef BV - /* * fs_bio_set is the bio_set containing bio and iovec memory pools used by * IO code that does not need private memory pools. @@ -176,12 +170,7 @@ struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, * see comment near bvec_array define! */ switch (nr) { - case 1: - *idx = 0; - break; - case 2 ... 4: - *idx = 1; - break; + /* smaller bios use inline vecs */ case 5 ... 
16: *idx = 2; break; @@ -1613,31 +1602,21 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src) } EXPORT_SYMBOL(bioset_init_from_src); -static void __init biovec_init_slabs(void) +static int __init init_bio(void) { int i; - for (i = 0; i < BVEC_POOL_NR; i++) { - int size; - struct biovec_slab *bvs = bvec_slabs + i; - - if (bvs->nr_vecs <= BIO_INLINE_VECS) { - bvs->slab = NULL; - continue; - } - - size = bvs->nr_vecs * sizeof(struct bio_vec); - bvs->slab = kmem_cache_create(bvs->name, size, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - } -} - -static int __init init_bio(void) -{ BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET); bio_integrity_init(); - biovec_init_slabs(); + + for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { + struct biovec_slab *bvs = bvec_slabs + i; + + bvs->slab = kmem_cache_create(bvs->name, + bvs->nr_vecs * sizeof(struct bio_vec), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + } if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS)) panic("bio: can't allocate bios\n"); -- cgit From 0f2e6ab851ae146c468bc5151c302c6e2473f70a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:24 +0100 Subject: block: turn the nr_iovecs argument to bio_alloc* into an unsigned short The bi_max_vecs and bi_vcnt fields are defined as unsigned short, so don't allow passing larger values in. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block/bio.c') diff --git a/block/bio.c b/block/bio.c index ae241252ea14..3d28d4723f6f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -407,7 +407,7 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * * Returns: Pointer to new bio on success, NULL on failure. */ -struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, +struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs, struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; @@ -493,7 +493,7 @@ EXPORT_SYMBOL(bio_alloc_bioset); * * Returns: Pointer to new bio on success, NULL on failure. */ -struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) +struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs) { struct bio *bio; -- cgit From 86004515ed80c01d59ab54b5d048164750af3c4b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:25 +0100 Subject: block: remove a layer of indentation in bio_iov_iter_get_pages Remove a pointless layer of indentation after a return statement. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'block/bio.c') diff --git a/block/bio.c b/block/bio.c index 3d28d4723f6f..dd3b2a01c9bf 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1081,15 +1081,15 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) bio_iov_bvec_set(bio, iter); bio_set_flag(bio, BIO_NO_PAGE_REF); return 0; - } else { - do { - if (bio_op(bio) == REQ_OP_ZONE_APPEND) - ret = __bio_iov_append_get_pages(bio, iter); - else - ret = __bio_iov_iter_get_pages(bio, iter); - } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); } + do { + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + ret = __bio_iov_append_get_pages(bio, iter); + else + ret = __bio_iov_iter_get_pages(bio, iter); + } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); + /* don't account direct I/O as memory stall */ bio_clear_flag(bio, BIO_WORKINGSET); return bio->bi_vcnt ? 
0 : ret; -- cgit From ed97ce5e1daf26d456760443fc89dc14d2b677e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:26 +0100 Subject: block: set BIO_NO_PAGE_REF in bio_iov_bvec_set bio_iov_bvec_set assigns the foreign bvec, so setting the NO_PAGE_REF directly there seems like the best fit. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'block/bio.c') diff --git a/block/bio.c b/block/bio.c index dd3b2a01c9bf..f75320123827 100644 --- a/block/bio.c +++ b/block/bio.c @@ -941,6 +941,7 @@ static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_iter.bi_bvec_done = iter->iov_offset; bio->bi_iter.bi_size = iter->count; + bio_set_flag(bio, BIO_NO_PAGE_REF); iov_iter_advance(iter, iter->count); return 0; @@ -1078,9 +1079,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) if (iov_iter_is_bvec(iter)) { if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) return -EINVAL; - bio_iov_bvec_set(bio, iter); - bio_set_flag(bio, BIO_NO_PAGE_REF); - return 0; + return bio_iov_bvec_set(bio, iter); } do { -- cgit From 977be01273844626ddeef4a464b42b99418d76e6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:27 +0100 Subject: block: mark the bio as cloned in bio_iov_bvec_set bio_iov_bvec_set clones the bio_vecs from the iter, and thus should be treated like a cloned bio in every respect. That also includes not touching bi_max_vecs as that is a property of the bio allocation and not its current payload. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block/bio.c') diff --git a/block/bio.c b/block/bio.c index f75320123827..a36f955cd120 100644 --- a/block/bio.c +++ b/block/bio.c @@ -937,11 +937,11 @@ static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) WARN_ON_ONCE(BVEC_POOL_IDX(bio) != 0); bio->bi_vcnt = iter->nr_segs; - bio->bi_max_vecs = iter->nr_segs; bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_iter.bi_bvec_done = iter->iov_offset; bio->bi_iter.bi_size = iter->count; bio_set_flag(bio, BIO_NO_PAGE_REF); + bio_set_flag(bio, BIO_CLONED); iov_iter_advance(iter, iter->count); return 0; -- cgit From 7a800a20ae6329e803c5c646b20811a6ae9ca136 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:29 +0100 Subject: block: use bi_max_vecs to find the bvec pool Instead of encoding of the bvec pool using magic bio flags, just use a helper to find the pool based on the max_vecs value. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 104 +++++++++++++++++++++++++----------------------------------- 1 file changed, 43 insertions(+), 61 deletions(-) (limited to 'block/bio.c') diff --git a/block/bio.c b/block/bio.c index a36f955cd120..a0eabe2f8b07 100644 --- a/block/bio.c +++ b/block/bio.c @@ -36,6 +36,24 @@ static struct biovec_slab { { .nr_vecs = BIO_MAX_PAGES, .name = "biovec-max" }, }; +static struct biovec_slab *biovec_slab(unsigned short nr_vecs) +{ + switch (nr_vecs) { + /* smaller bios use inline vecs */ + case 5 ... 16: + return &bvec_slabs[0]; + case 17 ... 64: + return &bvec_slabs[1]; + case 65 ... 128: + return &bvec_slabs[2]; + case 129 ... 
BIO_MAX_PAGES: + return &bvec_slabs[3]; + default: + BUG(); + return NULL; + } +} + /* * fs_bio_set is the bio_set containing bio and iovec memory pools used by * IO code that does not need private memory pools. @@ -131,26 +149,14 @@ out: mutex_unlock(&bio_slab_lock); } -unsigned int bvec_nr_vecs(unsigned short idx) +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) { - return bvec_slabs[--idx].nr_vecs; -} + BIO_BUG_ON(nr_vecs > BIO_MAX_PAGES); -void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) -{ - if (!idx) - return; - idx--; - - BIO_BUG_ON(idx >= BVEC_POOL_NR); - - if (idx == BVEC_POOL_MAX) { + if (nr_vecs == BIO_MAX_PAGES) mempool_free(bv, pool); - } else { - struct biovec_slab *bvs = bvec_slabs + idx; - - kmem_cache_free(bvs->slab, bv); - } + else if (nr_vecs > BIO_INLINE_VECS) + kmem_cache_free(biovec_slab(nr_vecs)->slab, bv); } /* @@ -163,48 +169,34 @@ static inline gfp_t bvec_alloc_gfp(gfp_t gfp) __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; } -struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, - mempool_t *pool) +struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, + gfp_t gfp_mask) { + struct biovec_slab *bvs = biovec_slab(*nr_vecs); + + if (WARN_ON_ONCE(!bvs)) + return NULL; + /* - * see comment near bvec_array define! + * Upgrade the nr_vecs request to take full advantage of the allocation. + * We also rely on this in the bvec_free path. */ - switch (nr) { - /* smaller bios use inline vecs */ - case 5 ... 16: - *idx = 2; - break; - case 17 ... 64: - *idx = 3; - break; - case 65 ... 128: - *idx = 4; - break; - case 129 ... BIO_MAX_PAGES: - *idx = 5; - break; - default: - return NULL; - } + *nr_vecs = bvs->nr_vecs; /* * Try a slab allocation first for all smaller allocations. If that * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. * The mempool is sized to handle up to BIO_MAX_PAGES entries. 
*/ - if (*idx < BVEC_POOL_MAX) { - struct biovec_slab *bvs = bvec_slabs + *idx; + if (*nr_vecs < BIO_MAX_PAGES) { struct bio_vec *bvl; bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); - if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) { - (*idx)++; + if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) return bvl; - } - *idx = BVEC_POOL_MAX; + *nr_vecs = BIO_MAX_PAGES; } - (*idx)++; return mempool_alloc(pool, gfp_mask); } @@ -231,7 +223,7 @@ static void bio_free(struct bio *bio) bio_uninit(bio); if (bs) { - bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio)); + bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); /* * If we have front padding, adjust the bio pointer before freeing @@ -275,12 +267,8 @@ EXPORT_SYMBOL(bio_init); */ void bio_reset(struct bio *bio) { - unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); - bio_uninit(bio); - memset(bio, 0, BIO_RESET_BYTES); - bio->bi_flags = flags; atomic_set(&bio->__bi_remaining, 1); } EXPORT_SYMBOL(bio_reset); @@ -453,22 +441,18 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs, bio = p + bs->front_pad; if (nr_iovecs > BIO_INLINE_VECS) { - unsigned long idx = 0; struct bio_vec *bvl = NULL; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); + bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, - &bs->bvec_pool); + bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); } - if (unlikely(!bvl)) goto err_free; - bio_init(bio, bvl, bvec_nr_vecs(idx)); - bio->bi_flags |= idx << BVEC_POOL_OFFSET; + bio_init(bio, bvl, nr_iovecs); } else if (nr_iovecs) { bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS); } else { @@ -644,7 +628,7 @@ EXPORT_SYMBOL(bio_put); */ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) { - BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio)); + WARN_ON_ONCE(bio->bi_pool && bio->bi_max_vecs); /* * most users will be overriding ->bi_bdev with a new target, @@ -934,7 +918,7 @@ EXPORT_SYMBOL_GPL(bio_release_pages); static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) { - WARN_ON_ONCE(BVEC_POOL_IDX(bio) != 0); + WARN_ON_ONCE(bio->bi_max_vecs); bio->bi_vcnt = iter->nr_segs; bio->bi_io_vec = (struct bio_vec *)iter->bvec; @@ -1495,7 +1479,7 @@ EXPORT_SYMBOL_GPL(bio_trim); */ int biovec_init_pool(mempool_t *pool, int pool_entries) { - struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX; + struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1; return mempool_init_slab_pool(pool, pool_entries, bp->slab); } @@ -1605,8 +1589,6 @@ static int __init init_bio(void) { int i; - BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET); - bio_integrity_init(); for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { -- cgit
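Taken together, the series leaves bi_max_vecs as the single source of truth for the bvec allocation; no bio flags encode the pool any more. A hedged sketch of the resulting allocate/free round trip (example_alloc_big_bio() is illustrative, not from the patches):

/*
 * Illustrative sketch of the end state: the allocation rounds the
 * request up to a slab size and records it in bi_max_vecs, and
 * bio_put() finds the matching slab again via biovec_slab().
 */
static struct bio *example_alloc_big_bio(struct block_device *bdev,
					 sector_t sector)
{
	/* 100 vecs requested: bvec_alloc() upgrades this to biovec-128 */
	struct bio *bio = bio_alloc_bioset(GFP_KERNEL, 100, &fs_bio_set);

	if (!bio)
		return NULL;
	bio_set_dev(bio, bdev);
	bio->bi_iter.bi_sector = sector;
	/* bio->bi_max_vecs is now 128, not 100 */
	return bio;
}

bio_put() on the returned bio ends up in bvec_free(), which maps bi_max_vecs == 128 back to the biovec-128 slab through biovec_slab(bio->bi_max_vecs).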