Diffstat (limited to 'block/bio.c')
| -rw-r--r-- | block/bio.c | 1253 |
1 file changed, 756 insertions, 497 deletions
diff --git a/block/bio.c b/block/bio.c index 1fab762e079b..fa5ff36b443f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -4,7 +4,7 @@ */ #include <linux/mm.h> #include <linux/swap.h> -#include <linux/bio.h> +#include <linux/bio-integrity.h> #include <linux/blkdev.h> #include <linux/uio.h> #include <linux/iocontext.h> @@ -15,15 +15,24 @@ #include <linux/mempool.h> #include <linux/workqueue.h> #include <linux/cgroup.h> -#include <linux/blk-cgroup.h> #include <linux/highmem.h> -#include <linux/sched/sysctl.h> #include <linux/blk-crypto.h> #include <linux/xarray.h> #include <trace/events/block.h> #include "blk.h" #include "blk-rq-qos.h" +#include "blk-cgroup.h" + +#define ALLOC_CACHE_THRESHOLD 16 +#define ALLOC_CACHE_MAX 256 + +struct bio_alloc_cache { + struct bio *free_list; + struct bio *free_list_irq; + unsigned int nr; + unsigned int nr_irq; +}; static struct biovec_slab { int nr_vecs; @@ -68,7 +77,7 @@ struct bio_slab { struct kmem_cache *slab; unsigned int slab_ref; unsigned int slab_size; - char name[8]; + char name[12]; }; static DEFINE_MUTEX(bio_slab_lock); static DEFINE_XARRAY(bio_slabs); @@ -82,7 +91,8 @@ static struct bio_slab *create_bio_slab(unsigned int size) snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size); bslab->slab = kmem_cache_create(bslab->name, size, - ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL); + ARCH_KMALLOC_MINALIGN, + SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL); if (!bslab->slab) goto fail_alloc_slab; @@ -151,7 +161,7 @@ out: void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) { - BIO_BUG_ON(nr_vecs > BIO_MAX_VECS); + BUG_ON(nr_vecs > BIO_MAX_VECS); if (nr_vecs == BIO_MAX_VECS) mempool_free(bv, pool); @@ -218,24 +228,13 @@ EXPORT_SYMBOL(bio_uninit); static void bio_free(struct bio *bio) { struct bio_set *bs = bio->bi_pool; - void *p; - - bio_uninit(bio); + void *p = bio; - if (bs) { - bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); + WARN_ON_ONCE(!bs); - /* - * If we have front padding, adjust the bio pointer before freeing - */ - p = bio; - p -= bs->front_pad; - - mempool_free(p, &bs->bio_pool); - } else { - /* Bio was allocated by bio_kmalloc() */ - kfree(bio); - } + bio_uninit(bio); + bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); + mempool_free(p - bs->front_pad, &bs->bio_pool); } /* @@ -243,21 +242,56 @@ static void bio_free(struct bio *bio) * they must remember to pair any call to bio_init() with bio_uninit() * when IO has completed, or when the bio is released. 
*/ -void bio_init(struct bio *bio, struct bio_vec *table, - unsigned short max_vecs) +void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, + unsigned short max_vecs, blk_opf_t opf) { - memset(bio, 0, sizeof(*bio)); + bio->bi_next = NULL; + bio->bi_bdev = bdev; + bio->bi_opf = opf; + bio->bi_flags = 0; + bio->bi_ioprio = 0; + bio->bi_write_hint = 0; + bio->bi_write_stream = 0; + bio->bi_status = 0; + bio->bi_bvec_gap_bit = 0; + bio->bi_iter.bi_sector = 0; + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_idx = 0; + bio->bi_iter.bi_bvec_done = 0; + bio->bi_end_io = NULL; + bio->bi_private = NULL; +#ifdef CONFIG_BLK_CGROUP + bio->bi_blkg = NULL; + bio->issue_time_ns = 0; + if (bdev) + bio_associate_blkg(bio); +#ifdef CONFIG_BLK_CGROUP_IOCOST + bio->bi_iocost_cost = 0; +#endif +#endif +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + bio->bi_crypt_context = NULL; +#endif +#ifdef CONFIG_BLK_DEV_INTEGRITY + bio->bi_integrity = NULL; +#endif + bio->bi_vcnt = 0; + atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_cnt, 1); + bio->bi_cookie = BLK_QC_T_NONE; - bio->bi_io_vec = table; bio->bi_max_vecs = max_vecs; + bio->bi_io_vec = table; + bio->bi_pool = NULL; } EXPORT_SYMBOL(bio_init); /** * bio_reset - reinitialize a bio * @bio: bio to reset + * @bdev: block device to use the bio for + * @opf: operation and flags for bio * * Description: * After calling bio_reset(), @bio will be in the same state as a freshly @@ -265,11 +299,15 @@ EXPORT_SYMBOL(bio_init); * preserved are the ones that are initialized by bio_alloc_bioset(). See * comment in struct bio. */ -void bio_reset(struct bio *bio) +void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf) { bio_uninit(bio); memset(bio, 0, BIO_RESET_BYTES); atomic_set(&bio->__bi_remaining, 1); + bio->bi_bdev = bdev; + if (bio->bi_bdev) + bio_associate_blkg(bio); + bio->bi_opf = opf; } EXPORT_SYMBOL(bio_reset); @@ -309,6 +347,31 @@ void bio_chain(struct bio *bio, struct bio *parent) } EXPORT_SYMBOL(bio_chain); +/** + * bio_chain_and_submit - submit a bio after chaining it to another one + * @prev: bio to chain and submit + * @new: bio to chain to + * + * If @prev is non-NULL, chain it to @new and submit it. + * + * Return: @new. 
+ */ +struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new) +{ + if (prev) { + bio_chain(prev, new); + submit_bio(prev); + } + return new; +} + +struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, + unsigned int nr_pages, blk_opf_t opf, gfp_t gfp) +{ + return bio_chain_and_submit(bio, bio_alloc(bdev, nr_pages, opf, gfp)); +} +EXPORT_SYMBOL_GPL(blk_next_bio); + static void bio_alloc_rescue(struct work_struct *work) { struct bio_set *bs = container_of(work, struct bio_set, rescue_work); @@ -363,10 +426,57 @@ static void punt_bios_to_rescuer(struct bio_set *bs) queue_work(bs->rescue_workqueue, &bs->rescue_work); } +static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache) +{ + unsigned long flags; + + /* cache->free_list must be empty */ + if (WARN_ON_ONCE(cache->free_list)) + return; + + local_irq_save(flags); + cache->free_list = cache->free_list_irq; + cache->free_list_irq = NULL; + cache->nr += cache->nr_irq; + cache->nr_irq = 0; + local_irq_restore(flags); +} + +static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, + unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp, + struct bio_set *bs) +{ + struct bio_alloc_cache *cache; + struct bio *bio; + + cache = per_cpu_ptr(bs->cache, get_cpu()); + if (!cache->free_list) { + if (READ_ONCE(cache->nr_irq) >= ALLOC_CACHE_THRESHOLD) + bio_alloc_irq_cache_splice(cache); + if (!cache->free_list) { + put_cpu(); + return NULL; + } + } + bio = cache->free_list; + cache->free_list = bio->bi_next; + cache->nr--; + put_cpu(); + + if (nr_vecs) + bio_init_inline(bio, bdev, nr_vecs, opf); + else + bio_init(bio, bdev, NULL, nr_vecs, opf); + bio->bi_pool = bs; + return bio; +} + /** * bio_alloc_bioset - allocate a bio for I/O + * @bdev: block device to allocate the bio for (can be %NULL) + * @nr_vecs: number of bvecs to pre-allocate + * @opf: operation and flags for bio * @gfp_mask: the GFP_* mask given to the slab allocator - * @nr_iovecs: number of iovecs to pre-allocate * @bs: the bio_set to allocate from. * * Allocate a bio from the mempools in @bs. @@ -395,17 +505,31 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * * Returns: Pointer to new bio on success, NULL on failure. */ -struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs, +struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, + blk_opf_t opf, gfp_t gfp_mask, struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; struct bio *bio; void *p; - /* should not use nobvec bioset for nr_iovecs > 0 */ - if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_iovecs > 0)) + /* should not use nobvec bioset for nr_vecs > 0 */ + if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0)) return NULL; + if (bs->cache && nr_vecs <= BIO_INLINE_VECS) { + opf |= REQ_ALLOC_CACHE; + bio = bio_alloc_percpu_cache(bdev, nr_vecs, opf, + gfp_mask, bs); + if (bio) + return bio; + /* + * No cached bio available, bio returned below marked with + * REQ_ALLOC_CACHE to participate in per-cpu alloc cache. 
+ */ + } else + opf &= ~REQ_ALLOC_CACHE; + /* * submit_bio_noacct() converts recursion to iteration; this means if * we're running beneath it, any bios we allocate and submit will not be @@ -438,25 +562,27 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs, } if (unlikely(!p)) return NULL; + if (!mempool_is_saturated(&bs->bio_pool)) + opf &= ~REQ_ALLOC_CACHE; bio = p + bs->front_pad; - if (nr_iovecs > BIO_INLINE_VECS) { + if (nr_vecs > BIO_INLINE_VECS) { struct bio_vec *bvl = NULL; - bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); + bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; - bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); + bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); } if (unlikely(!bvl)) goto err_free; - bio_init(bio, bvl, nr_iovecs); - } else if (nr_iovecs) { - bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS); + bio_init(bio, bdev, bvl, nr_vecs, opf); + } else if (nr_vecs) { + bio_init_inline(bio, bdev, BIO_INLINE_VECS, opf); } else { - bio_init(bio, NULL, 0); + bio_init(bio, bdev, NULL, 0, opf); } bio->bi_pool = bs; @@ -469,44 +595,41 @@ err_free: EXPORT_SYMBOL(bio_alloc_bioset); /** - * bio_kmalloc - kmalloc a bio for I/O + * bio_kmalloc - kmalloc a bio + * @nr_vecs: number of bio_vecs to allocate * @gfp_mask: the GFP_* mask given to the slab allocator - * @nr_iovecs: number of iovecs to pre-allocate * - * Use kmalloc to allocate and initialize a bio. + * Use kmalloc to allocate a bio (including bvecs). The bio must be initialized + * using bio_init() before use. To free a bio returned from this function use + * kfree() after calling bio_uninit(). A bio returned from this function can + * be reused by calling bio_uninit() before calling bio_init() again. + * + * Note that unlike bio_alloc() or bio_alloc_bioset() allocations from this + * function are not backed by a mempool can fail. Do not use this function + * for allocations in the file system I/O path. * * Returns: Pointer to new bio on success, NULL on failure. */ -struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs) +struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) { struct bio *bio; - if (nr_iovecs > UIO_MAXIOV) - return NULL; - - bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); - if (unlikely(!bio)) + if (nr_vecs > BIO_MAX_INLINE_VECS) return NULL; - bio_init(bio, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs); - bio->bi_pool = NULL; - return bio; + return kmalloc(sizeof(*bio) + nr_vecs * sizeof(struct bio_vec), + gfp_mask); } EXPORT_SYMBOL(bio_kmalloc); -void zero_fill_bio(struct bio *bio) +void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { - unsigned long flags; struct bio_vec bv; struct bvec_iter iter; - bio_for_each_segment(bv, bio, iter) { - char *data = bvec_kmap_irq(&bv, &flags); - memset(data, 0, bv.bv_len); - flush_dcache_page(bv.bv_page); - bvec_kunmap_irq(data, &flags); - } + __bio_for_each_segment(bv, bio, iter, start) + memzero_bvec(&bv); } -EXPORT_SYMBOL(zero_fill_bio); +EXPORT_SYMBOL(zero_fill_bio_iter); /** * bio_truncate - truncate the bio to small size of @new_size @@ -518,7 +641,7 @@ EXPORT_SYMBOL(zero_fill_bio); * REQ_OP_READ, zero the truncated part. This function should only * be used for handling corner cases, such as bio eod. 
*/ -void bio_truncate(struct bio *bio, unsigned new_size) +static void bio_truncate(struct bio *bio, unsigned new_size) { struct bio_vec bv; struct bvec_iter iter; @@ -533,13 +656,14 @@ void bio_truncate(struct bio *bio, unsigned new_size) bio_for_each_segment(bv, bio, iter) { if (done + bv.bv_len > new_size) { - unsigned offset; + size_t offset; if (!truncated) offset = new_size - done; else offset = 0; - zero_user(bv.bv_page, offset, bv.bv_len - offset); + memzero_page(bv.bv_page, bv.bv_offset + offset, + bv.bv_len - offset); truncated = true; } done += bv.bv_len; @@ -591,6 +715,95 @@ void guard_bio_eod(struct bio *bio) bio_truncate(bio, maxsector << 9); } +static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache, + unsigned int nr) +{ + unsigned int i = 0; + struct bio *bio; + + while ((bio = cache->free_list) != NULL) { + cache->free_list = bio->bi_next; + cache->nr--; + bio_free(bio); + if (++i == nr) + break; + } + return i; +} + +static void bio_alloc_cache_prune(struct bio_alloc_cache *cache, + unsigned int nr) +{ + nr -= __bio_alloc_cache_prune(cache, nr); + if (!READ_ONCE(cache->free_list)) { + bio_alloc_irq_cache_splice(cache); + __bio_alloc_cache_prune(cache, nr); + } +} + +static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node) +{ + struct bio_set *bs; + + bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead); + if (bs->cache) { + struct bio_alloc_cache *cache = per_cpu_ptr(bs->cache, cpu); + + bio_alloc_cache_prune(cache, -1U); + } + return 0; +} + +static void bio_alloc_cache_destroy(struct bio_set *bs) +{ + int cpu; + + if (!bs->cache) + return; + + cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead); + for_each_possible_cpu(cpu) { + struct bio_alloc_cache *cache; + + cache = per_cpu_ptr(bs->cache, cpu); + bio_alloc_cache_prune(cache, -1U); + } + free_percpu(bs->cache); + bs->cache = NULL; +} + +static inline void bio_put_percpu_cache(struct bio *bio) +{ + struct bio_alloc_cache *cache; + + cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); + if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) + goto out_free; + + if (in_task()) { + bio_uninit(bio); + bio->bi_next = cache->free_list; + /* Not necessary but helps not to iopoll already freed bios */ + bio->bi_bdev = NULL; + cache->free_list = bio; + cache->nr++; + } else if (in_hardirq()) { + lockdep_assert_irqs_disabled(); + + bio_uninit(bio); + bio->bi_next = cache->free_list_irq; + cache->free_list_irq = bio; + cache->nr_irq++; + } else { + goto out_free; + } + put_cpu(); + return; +out_free: + put_cpu(); + bio_free(bio); +} + /** * bio_put - release a reference to a bio * @bio: bio to release reference to @@ -601,98 +814,116 @@ void guard_bio_eod(struct bio *bio) **/ void bio_put(struct bio *bio) { - if (!bio_flagged(bio, BIO_REFFED)) - bio_free(bio); - else { - BIO_BUG_ON(!atomic_read(&bio->__bi_cnt)); - - /* - * last put frees it - */ - if (atomic_dec_and_test(&bio->__bi_cnt)) - bio_free(bio); + if (unlikely(bio_flagged(bio, BIO_REFFED))) { + BUG_ON(!atomic_read(&bio->__bi_cnt)); + if (!atomic_dec_and_test(&bio->__bi_cnt)) + return; } + if (bio->bi_opf & REQ_ALLOC_CACHE) + bio_put_percpu_cache(bio); + else + bio_free(bio); } EXPORT_SYMBOL(bio_put); -/** - * __bio_clone_fast - clone a bio that shares the original bio's biovec - * @bio: destination bio - * @bio_src: bio to clone - * - * Clone a &bio. Caller will own the returned bio, but not - * the actual data it points to. Reference count of returned - * bio will be one. 
- * - * Caller must ensure that @bio_src is not freed before @bio. - */ -void __bio_clone_fast(struct bio *bio, struct bio *bio_src) +static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) { - WARN_ON_ONCE(bio->bi_pool && bio->bi_max_vecs); - - /* - * most users will be overriding ->bi_bdev with a new target, - * so we don't set nor calculate new physical/hw segment counts here - */ - bio->bi_bdev = bio_src->bi_bdev; bio_set_flag(bio, BIO_CLONED); - if (bio_flagged(bio_src, BIO_THROTTLED)) - bio_set_flag(bio, BIO_THROTTLED); - if (bio_flagged(bio_src, BIO_REMAPPED)) - bio_set_flag(bio, BIO_REMAPPED); - bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; + bio->bi_write_stream = bio_src->bi_write_stream; bio->bi_iter = bio_src->bi_iter; - bio->bi_io_vec = bio_src->bi_io_vec; - bio_clone_blkg_association(bio, bio_src); - blkcg_bio_issue_init(bio); + if (bio->bi_bdev) { + if (bio->bi_bdev == bio_src->bi_bdev && + bio_flagged(bio_src, BIO_REMAPPED)) + bio_set_flag(bio, BIO_REMAPPED); + bio_clone_blkg_association(bio, bio_src); + } + + if (bio_crypt_clone(bio, bio_src, gfp) < 0) + return -ENOMEM; + if (bio_integrity(bio_src) && + bio_integrity_clone(bio, bio_src, gfp) < 0) + return -ENOMEM; + return 0; } -EXPORT_SYMBOL(__bio_clone_fast); /** - * bio_clone_fast - clone a bio that shares the original bio's biovec - * @bio: bio to clone - * @gfp_mask: allocation priority - * @bs: bio_set to allocate from + * bio_alloc_clone - clone a bio that shares the original bio's biovec + * @bdev: block_device to clone onto + * @bio_src: bio to clone from + * @gfp: allocation priority + * @bs: bio_set to allocate from + * + * Allocate a new bio that is a clone of @bio_src. The caller owns the returned + * bio, but not the actual data it points to. * - * Like __bio_clone_fast, only also allocates the returned bio + * The caller must ensure that the return bio is not freed before @bio_src. */ -struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) +struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src, + gfp_t gfp, struct bio_set *bs) { - struct bio *b; + struct bio *bio; - b = bio_alloc_bioset(gfp_mask, 0, bs); - if (!b) + bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs); + if (!bio) return NULL; - __bio_clone_fast(b, bio); - - if (bio_crypt_clone(b, bio, gfp_mask) < 0) - goto err_put; + if (__bio_clone(bio, bio_src, gfp) < 0) { + bio_put(bio); + return NULL; + } + bio->bi_io_vec = bio_src->bi_io_vec; - if (bio_integrity(bio) && - bio_integrity_clone(b, bio, gfp_mask) < 0) - goto err_put; + return bio; +} +EXPORT_SYMBOL(bio_alloc_clone); - return b; +/** + * bio_init_clone - clone a bio that shares the original bio's biovec + * @bdev: block_device to clone onto + * @bio: bio to clone into + * @bio_src: bio to clone from + * @gfp: allocation priority + * + * Initialize a new bio in caller provided memory that is a clone of @bio_src. + * The caller owns the returned bio, but not the actual data it points to. + * + * The caller must ensure that @bio_src is not freed before @bio. 
+ */ +int bio_init_clone(struct block_device *bdev, struct bio *bio, + struct bio *bio_src, gfp_t gfp) +{ + int ret; -err_put: - bio_put(b); - return NULL; + bio_init(bio, bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf); + ret = __bio_clone(bio, bio_src, gfp); + if (ret) + bio_uninit(bio); + return ret; } -EXPORT_SYMBOL(bio_clone_fast); +EXPORT_SYMBOL(bio_init_clone); -const char *bio_devname(struct bio *bio, char *buf) +/** + * bio_full - check if the bio is full + * @bio: bio to check + * @len: length of one segment to be added + * + * Return true if @bio is full and one segment with @len bytes can't be + * added to the bio, otherwise return false + */ +static inline bool bio_full(struct bio *bio, unsigned len) { - return bdevname(bio->bi_bdev, buf); + if (bio->bi_vcnt >= bio->bi_max_vecs) + return true; + if (bio->bi_iter.bi_size > UINT_MAX - len) + return true; + return false; } -EXPORT_SYMBOL(bio_devname); -static inline bool page_is_mergeable(const struct bio_vec *bv, - struct page *page, unsigned int len, unsigned int off, - bool *same_page) +static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, + unsigned int len, unsigned int off) { size_t bv_end = bv->bv_offset + bv->bv_len; phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; @@ -703,280 +934,265 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, if (xen_domain() && !xen_biovec_phys_mergeable(bv, page)) return false; - *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); - if (*same_page) - return true; - return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE); + if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) { + if (IS_ENABLED(CONFIG_KMSAN)) + return false; + if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE) + return false; + } + + bv->bv_len += len; + return true; } /* * Try to merge a page into a segment, while obeying the hardware segment - * size limit. This is not for normal read/write bios, but for passthrough - * or Zone Append operations that we can't split. + * size limit. + * + * This is kept around for the integrity metadata, which is still tries + * to build the initial bio to the hardware limit and doesn't have proper + * helpers to split. Hopefully this will go away soon. 
*/ -static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio, - struct page *page, unsigned len, - unsigned offset, bool *same_page) +bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, + struct page *page, unsigned len, unsigned offset) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; unsigned long mask = queue_segment_boundary(q); - phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset; + phys_addr_t addr1 = bvec_phys(bv); phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; if ((addr1 | mask) != (addr2 | mask)) return false; - if (bv->bv_len + len > queue_max_segment_size(q)) + if (len > queue_max_segment_size(q) - bv->bv_len) return false; - return __bio_try_merge_page(bio, page, len, offset, same_page); + return bvec_try_merge_page(bv, page, len, offset); } /** - * bio_add_hw_page - attempt to add a page to a bio with hw constraints - * @q: the target queue + * __bio_add_page - add page(s) to a bio in a new segment * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * @max_sectors: maximum number of sectors that can be added - * @same_page: return if the segment has been merged inside the same page + * @page: start page to add + * @len: length of the data to add, may cross pages + * @off: offset of the data relative to @page, may cross pages * - * Add a page to a bio while respecting the hardware max_sectors, max_segment - * and gap limitations. + * Add the data at @page + @off to @bio as a new bvec. The caller must ensure + * that @bio has space for another bvec. */ -int bio_add_hw_page(struct request_queue *q, struct bio *bio, - struct page *page, unsigned int len, unsigned int offset, - unsigned int max_sectors, bool *same_page) +void __bio_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int off) { - struct bio_vec *bvec; - - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) - return 0; - - if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) - return 0; - - if (bio->bi_vcnt > 0) { - if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page)) - return len; - - /* - * If the queue doesn't support SG gaps and adding this segment - * would create a gap, disallow it. - */ - bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; - if (bvec_gap_to_prev(q, bvec, offset)) - return 0; - } - - if (bio_full(bio, len)) - return 0; + WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); + WARN_ON_ONCE(bio_full(bio, len)); - if (bio->bi_vcnt >= queue_max_segments(q)) - return 0; + if (is_pci_p2pdma_page(page)) + bio->bi_opf |= REQ_NOMERGE; - bvec = &bio->bi_io_vec[bio->bi_vcnt]; - bvec->bv_page = page; - bvec->bv_len = len; - bvec->bv_offset = offset; - bio->bi_vcnt++; + bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off); bio->bi_iter.bi_size += len; - return len; + bio->bi_vcnt++; } +EXPORT_SYMBOL_GPL(__bio_add_page); /** - * bio_add_pc_page - attempt to add page to passthrough bio - * @q: the target queue + * bio_add_virt_nofail - add data in the direct kernel mapping to a bio * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * - * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block device - * limitations. The target block device must allow bio's up to PAGE_SIZE, - * so it is always possible to add a single page to an empty bio. 
+ * @vaddr: data to add + * @len: length of the data to add, may cross pages * - * This should only be used by passthrough bios. + * Add the data at @vaddr to @bio. The caller must have ensure a segment + * is available for the added data. No merging into an existing segment + * will be performed. */ -int bio_add_pc_page(struct request_queue *q, struct bio *bio, - struct page *page, unsigned int len, unsigned int offset) +void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len) { - bool same_page = false; - return bio_add_hw_page(q, bio, page, len, offset, - queue_max_hw_sectors(q), &same_page); + __bio_add_page(bio, virt_to_page(vaddr), len, offset_in_page(vaddr)); } -EXPORT_SYMBOL(bio_add_pc_page); +EXPORT_SYMBOL_GPL(bio_add_virt_nofail); /** - * bio_add_zone_append_page - attempt to add page to zone-append bio - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * - * Attempt to add a page to the bio_vec maplist of a bio that will be submitted - * for a zone-append request. This can fail for a number of reasons, such as the - * bio being full or the target block device is not a zoned block device or - * other limitations of the target block device. The target block device must - * allow bio's up to PAGE_SIZE, so it is always possible to add a single page - * to an empty bio. + * bio_add_page - attempt to add page(s) to bio + * @bio: destination bio + * @page: start page to add + * @len: vec entry length, may cross pages + * @offset: vec entry offset relative to @page, may cross pages * - * Returns: number of bytes added to the bio, or 0 in case of a failure. + * Attempt to add page(s) to the bio_vec maplist. This will only fail + * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio. */ -int bio_add_zone_append_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) +int bio_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) { - struct request_queue *q = bio->bi_bdev->bd_disk->queue; - bool same_page = false; - - if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND)) + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return 0; + if (bio->bi_iter.bi_size > UINT_MAX - len) return 0; - if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) + if (bio->bi_vcnt > 0) { + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) + return 0; + + if (bvec_try_merge_page(bv, page, len, offset)) { + bio->bi_iter.bi_size += len; + return len; + } + } + + if (bio->bi_vcnt >= bio->bi_max_vecs) return 0; + __bio_add_page(bio, page, len, offset); + return len; +} +EXPORT_SYMBOL(bio_add_page); + +void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, + size_t off) +{ + unsigned long nr = off / PAGE_SIZE; - return bio_add_hw_page(q, bio, page, len, offset, - queue_max_zone_append_sectors(q), &same_page); + WARN_ON_ONCE(len > UINT_MAX); + __bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE); } -EXPORT_SYMBOL_GPL(bio_add_zone_append_page); +EXPORT_SYMBOL_GPL(bio_add_folio_nofail); /** - * __bio_try_merge_page - try appending data to an existing bvec. - * @bio: destination bio - * @page: start page to add - * @len: length of the data to add - * @off: offset of the data relative to @page - * @same_page: return if the segment has been merged inside the same page - * - * Try to add the data at @page + @off to the last bvec of @bio. 
This is a - * useful optimisation for file systems with a block size smaller than the - * page size. + * bio_add_folio - Attempt to add part of a folio to a bio. + * @bio: BIO to add to. + * @folio: Folio to add. + * @len: How many bytes from the folio to add. + * @off: First byte in this folio to add. * - * Warn if (@len, @off) crosses pages in case that @same_page is true. + * Filesystems that use folios can call this function instead of calling + * bio_add_page() for each page in the folio. If @off is bigger than + * PAGE_SIZE, this function can create a bio_vec that starts in a page + * after the bv_page. BIOs do not support folios that are 4GiB or larger. * - * Return %true on success or %false on failure. + * Return: Whether the addition was successful. */ -bool __bio_try_merge_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int off, bool *same_page) +bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, + size_t off) { - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) - return false; - - if (bio->bi_vcnt > 0) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + unsigned long nr = off / PAGE_SIZE; - if (page_is_mergeable(bv, page, len, off, same_page)) { - if (bio->bi_iter.bi_size > UINT_MAX - len) { - *same_page = false; - return false; - } - bv->bv_len += len; - bio->bi_iter.bi_size += len; - return true; - } - } - return false; + if (len > UINT_MAX) + return false; + return bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE) > 0; } -EXPORT_SYMBOL_GPL(__bio_try_merge_page); +EXPORT_SYMBOL(bio_add_folio); /** - * __bio_add_page - add page(s) to a bio in a new segment + * bio_add_vmalloc_chunk - add a vmalloc chunk to a bio * @bio: destination bio - * @page: start page to add - * @len: length of the data to add, may cross pages - * @off: offset of the data relative to @page, may cross pages + * @vaddr: vmalloc address to add + * @len: total length in bytes of the data to add * - * Add the data at @page + @off to @bio as a new bvec. The caller must ensure - * that @bio has space for another bvec. + * Add data starting at @vaddr to @bio and return how many bytes were added. + * This may be less than the amount originally asked. Returns 0 if no data + * could be added to @bio. + * + * This helper calls flush_kernel_vmap_range() for the range added. For reads + * the caller still needs to manually call invalidate_kernel_vmap_range() in + * the completion handler. 
*/ -void __bio_add_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int off) +unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; - - WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); - WARN_ON_ONCE(bio_full(bio, len)); - - bv->bv_page = page; - bv->bv_offset = off; - bv->bv_len = len; + unsigned int offset = offset_in_page(vaddr); - bio->bi_iter.bi_size += len; - bio->bi_vcnt++; - - if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page))) - bio_set_flag(bio, BIO_WORKINGSET); + len = min(len, PAGE_SIZE - offset); + if (bio_add_page(bio, vmalloc_to_page(vaddr), len, offset) < len) + return 0; + if (op_is_write(bio_op(bio))) + flush_kernel_vmap_range(vaddr, len); + return len; } -EXPORT_SYMBOL_GPL(__bio_add_page); +EXPORT_SYMBOL_GPL(bio_add_vmalloc_chunk); /** - * bio_add_page - attempt to add page(s) to bio - * @bio: destination bio - * @page: start page to add - * @len: vec entry length, may cross pages - * @offset: vec entry offset relative to @page, may cross pages + * bio_add_vmalloc - add a vmalloc region to a bio + * @bio: destination bio + * @vaddr: vmalloc address to add + * @len: total length in bytes of the data to add * - * Attempt to add page(s) to the bio_vec maplist. This will only fail - * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio. + * Add data starting at @vaddr to @bio. Return %true on success or %false if + * @bio does not have enough space for the payload. + * + * This helper calls flush_kernel_vmap_range() for the range added. For reads + * the caller still needs to manually call invalidate_kernel_vmap_range() in + * the completion handler. */ -int bio_add_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) +bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len) { - bool same_page = false; + do { + unsigned int added = bio_add_vmalloc_chunk(bio, vaddr, len); - if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { - if (bio_full(bio, len)) - return 0; - __bio_add_page(bio, page, len, offset); - } - return len; + if (!added) + return false; + vaddr += added; + len -= added; + } while (len); + + return true; } -EXPORT_SYMBOL(bio_add_page); +EXPORT_SYMBOL_GPL(bio_add_vmalloc); -void bio_release_pages(struct bio *bio, bool mark_dirty) +void __bio_release_pages(struct bio *bio, bool mark_dirty) { - struct bvec_iter_all iter_all; - struct bio_vec *bvec; + struct folio_iter fi; - if (bio_flagged(bio, BIO_NO_PAGE_REF)) - return; + bio_for_each_folio_all(fi, bio) { + size_t nr_pages; - bio_for_each_segment_all(bvec, bio, iter_all) { - if (mark_dirty && !PageCompound(bvec->bv_page)) - set_page_dirty_lock(bvec->bv_page); - put_page(bvec->bv_page); + if (mark_dirty) { + folio_lock(fi.folio); + folio_mark_dirty(fi.folio); + folio_unlock(fi.folio); + } + nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE - + fi.offset / PAGE_SIZE + 1; + unpin_user_folio(fi.folio, nr_pages); } } -EXPORT_SYMBOL_GPL(bio_release_pages); +EXPORT_SYMBOL_GPL(__bio_release_pages); -static void __bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) +void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter) { WARN_ON_ONCE(bio->bi_max_vecs); bio->bi_vcnt = iter->nr_segs; bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_iter.bi_bvec_done = iter->iov_offset; - bio->bi_iter.bi_size = iter->count; - bio_set_flag(bio, BIO_NO_PAGE_REF); + bio->bi_iter.bi_size = iov_iter_count(iter); bio_set_flag(bio, 
BIO_CLONED); } -static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) +static unsigned int get_contig_folio_len(unsigned int *num_pages, + struct page **pages, unsigned int i, + struct folio *folio, size_t left, + size_t offset) { - __bio_iov_bvec_set(bio, iter); - iov_iter_advance(iter, iter->count); - return 0; -} + size_t bytes = left; + size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes); + unsigned int j; -static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter) -{ - struct request_queue *q = bio->bi_bdev->bd_disk->queue; - struct iov_iter i = *iter; + /* + * We might COW a single page in the middle of + * a large folio, so we have to check that all + * pages belong to the same folio. + */ + bytes -= contig_sz; + for (j = i + 1; j < i + *num_pages; j++) { + size_t next = min_t(size_t, PAGE_SIZE, bytes); - iov_iter_truncate(&i, queue_max_zone_append_sectors(q) << 9); - __bio_iov_bvec_set(bio, &i); - iov_iter_advance(iter, i.count); - return 0; + if (page_folio(pages[j]) != folio || + pages[j] != pages[j - 1] + 1) { + break; + } + contig_sz += next; + bytes -= next; + } + *num_pages = j - i; + + return contig_sz; } #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) @@ -986,105 +1202,123 @@ static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter) * @bio: bio to add pages to * @iter: iov iterator describing the region to be mapped * - * Pins pages from *iter and appends them to @bio's bvec array. The - * pages will have to be released using put_page() when done. - * For multi-segment *iter, this function only adds pages from the - * next non-empty segment of the iov iterator. + * Extracts pages from *iter and appends them to @bio's bvec array. The pages + * will have to be cleaned up in the way indicated by the BIO_PAGE_PINNED flag. + * For a multi-segment *iter, this function only adds pages from the next + * non-empty segment of the iov iterator. */ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { + iov_iter_extraction_t extraction_flags = 0; unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; - bool same_page = false; - ssize_t size, left; - unsigned len, i; - size_t offset; + ssize_t size; + unsigned int num_pages, i = 0; + size_t offset, folio_offset, left, len; + int ret = 0; /* * Move page array up in the allocated memory for the bio vecs as far as * possible so that we can start filling biovecs from the beginning * without overwriting the temporary page array. - */ + */ BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); - size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); + if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue)) + extraction_flags |= ITER_ALLOW_P2PDMA; + + size = iov_iter_extract_pages(iter, &pages, + UINT_MAX - bio->bi_iter.bi_size, + nr_pages, extraction_flags, &offset); if (unlikely(size <= 0)) return size ? 
size : -EFAULT; - for (left = size, i = 0; left > 0; left -= len, i++) { + nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); + for (left = size, i = 0; left > 0; left -= len, i += num_pages) { struct page *page = pages[i]; + struct folio *folio = page_folio(page); + unsigned int old_vcnt = bio->bi_vcnt; + + folio_offset = ((size_t)folio_page_idx(folio, page) << + PAGE_SHIFT) + offset; + + len = min(folio_size(folio) - folio_offset, left); - len = min_t(size_t, PAGE_SIZE - offset, left); + num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); - if (__bio_try_merge_page(bio, page, len, offset, &same_page)) { - if (same_page) - put_page(page); - } else { - if (WARN_ON_ONCE(bio_full(bio, len))) - return -EINVAL; - __bio_add_page(bio, page, len, offset); + if (num_pages > 1) + len = get_contig_folio_len(&num_pages, pages, i, + folio, left, offset); + + if (!bio_add_folio(bio, folio, len, folio_offset)) { + WARN_ON_ONCE(1); + ret = -EINVAL; + goto out; + } + + if (bio_flagged(bio, BIO_PAGE_PINNED)) { + /* + * We're adding another fragment of a page that already + * was part of the last segment. Undo our pin as the + * page was pinned when an earlier fragment of it was + * added to the bio and __bio_release_pages expects a + * single pin per page. + */ + if (offset && bio->bi_vcnt == old_vcnt) + unpin_user_folio(folio, 1); } offset = 0; } - iov_iter_advance(iter, size); - return 0; + iov_iter_revert(iter, left); +out: + while (i < nr_pages) + bio_release_page(bio, pages[i++]); + + return ret; } -static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) +/* + * Aligns the bio size to the len_align_mask, releasing excessive bio vecs that + * __bio_iov_iter_get_pages may have inserted, and reverts the trimmed length + * for the next iteration. + */ +static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter, + unsigned len_align_mask) { - unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; - unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; - struct request_queue *q = bio->bi_bdev->bd_disk->queue; - unsigned int max_append_sectors = queue_max_zone_append_sectors(q); - struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; - struct page **pages = (struct page **)bv; - ssize_t size, left; - unsigned len, i; - size_t offset; - int ret = 0; + size_t nbytes = bio->bi_iter.bi_size & len_align_mask; - if (WARN_ON_ONCE(!max_append_sectors)) + if (!nbytes) return 0; - /* - * Move page array up in the allocated memory for the bio vecs as far as - * possible so that we can start filling biovecs from the beginning - * without overwriting the temporary page array. - */ - BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); - pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); - - size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); - if (unlikely(size <= 0)) - return size ? 
size : -EFAULT; - - for (left = size, i = 0; left > 0; left -= len, i++) { - struct page *page = pages[i]; - bool same_page = false; + iov_iter_revert(iter, nbytes); + bio->bi_iter.bi_size -= nbytes; + do { + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; - len = min_t(size_t, PAGE_SIZE - offset, left); - if (bio_add_hw_page(q, bio, page, len, offset, - max_append_sectors, &same_page) != len) { - ret = -EINVAL; + if (nbytes < bv->bv_len) { + bv->bv_len -= nbytes; break; } - if (same_page) - put_page(page); - offset = 0; - } - iov_iter_advance(iter, size - left); - return ret; + bio_release_page(bio, bv->bv_page); + bio->bi_vcnt--; + nbytes -= bv->bv_len; + } while (nbytes); + + if (!bio->bi_vcnt) + return -EFAULT; + return 0; } /** * bio_iov_iter_get_pages - add user or kernel pages to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be added + * @len_align_mask: the mask to align the total size to, 0 for any length * * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and @@ -1100,32 +1334,31 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) * fit into the bio, or are requested in @iter, whatever is smaller. If * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. - * - * It's intended for direct IO, so doesn't do PSI tracking, the caller is - * responsible for setting BIO_WORKINGSET if necessary. */ -int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, + unsigned len_align_mask) { int ret = 0; + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return -EIO; + if (iov_iter_is_bvec(iter)) { - if (bio_op(bio) == REQ_OP_ZONE_APPEND) - return bio_iov_bvec_set_append(bio, iter); - return bio_iov_bvec_set(bio, iter); + bio_iov_bvec_set(bio, iter); + iov_iter_advance(iter, bio->bi_iter.bi_size); + return 0; } + if (iov_iter_extract_will_pin(iter)) + bio_set_flag(bio, BIO_PAGE_PINNED); do { - if (bio_op(bio) == REQ_OP_ZONE_APPEND) - ret = __bio_iov_append_get_pages(bio, iter); - else - ret = __bio_iov_iter_get_pages(bio, iter); + ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); - /* don't account direct I/O as memory stall */ - bio_clear_flag(bio, BIO_WORKINGSET); - return bio->bi_vcnt ? 
0 : ret; + if (bio->bi_vcnt) + return bio_iov_iter_align_down(bio, iter, len_align_mask); + return ret; } -EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); static void submit_bio_wait_endio(struct bio *bio) { @@ -1147,38 +1380,68 @@ int submit_bio_wait(struct bio *bio) { DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_bdev->bd_disk->lockdep_map); - unsigned long hang_check; bio->bi_private = &done; bio->bi_end_io = submit_bio_wait_endio; bio->bi_opf |= REQ_SYNC; submit_bio(bio); - - /* Prevent hang_check timer from firing at us during very long I/O */ - hang_check = sysctl_hung_task_timeout_secs; - if (hang_check) - while (!wait_for_completion_io_timeout(&done, - hang_check * (HZ/2))) - ; - else - wait_for_completion_io(&done); + blk_wait_io(&done); return blk_status_to_errno(bio->bi_status); } EXPORT_SYMBOL(submit_bio_wait); /** - * bio_advance - increment/complete a bio by some number of bytes - * @bio: bio to advance - * @bytes: number of bytes to complete - * - * This updates bi_sector, bi_size and bi_idx; if the number of bytes to - * complete doesn't align with a bvec boundary, then bv_len and bv_offset will - * be updated on the last bvec as well. + * bdev_rw_virt - synchronously read into / write from kernel mapping + * @bdev: block device to access + * @sector: sector to access + * @data: data to read/write + * @len: length in byte to read/write + * @op: operation (e.g. REQ_OP_READ/REQ_OP_WRITE) * - * @bio will then represent the remaining, uncompleted portion of the io. + * Performs synchronous I/O to @bdev for @data/@len. @data must be in + * the kernel direct mapping and not a vmalloc address. */ -void bio_advance(struct bio *bio, unsigned bytes) +int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, + size_t len, enum req_op op) +{ + struct bio_vec bv; + struct bio bio; + int error; + + if (WARN_ON_ONCE(is_vmalloc_addr(data))) + return -EIO; + + bio_init(&bio, bdev, &bv, 1, op); + bio.bi_iter.bi_sector = sector; + bio_add_virt_nofail(&bio, data, len); + error = submit_bio_wait(&bio); + bio_uninit(&bio); + return error; +} +EXPORT_SYMBOL_GPL(bdev_rw_virt); + +static void bio_wait_end_io(struct bio *bio) +{ + complete(bio->bi_private); + bio_put(bio); +} + +/* + * bio_await_chain - ends @bio and waits for every chained bio to complete + */ +void bio_await_chain(struct bio *bio) +{ + DECLARE_COMPLETION_ONSTACK_MAP(done, + bio->bi_bdev->bd_disk->lockdep_map); + + bio->bi_private = &done; + bio->bi_end_io = bio_wait_end_io; + bio_endio(bio); + blk_wait_io(&done); +} + +void __bio_advance(struct bio *bio, unsigned bytes) { if (bio_integrity(bio)) bio_integrity_advance(bio, bytes); @@ -1186,32 +1449,22 @@ void bio_advance(struct bio *bio, unsigned bytes) bio_crypt_advance(bio, bytes); bio_advance_iter(bio, &bio->bi_iter, bytes); } -EXPORT_SYMBOL(bio_advance); +EXPORT_SYMBOL(__bio_advance); void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter) { - struct bio_vec src_bv, dst_bv; - void *src_p, *dst_p; - unsigned bytes; - while (src_iter->bi_size && dst_iter->bi_size) { - src_bv = bio_iter_iovec(src, *src_iter); - dst_bv = bio_iter_iovec(dst, *dst_iter); - - bytes = min(src_bv.bv_len, dst_bv.bv_len); - - src_p = kmap_atomic(src_bv.bv_page); - dst_p = kmap_atomic(dst_bv.bv_page); + struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); + struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); + unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); + void *src_buf = bvec_kmap_local(&src_bv); + void *dst_buf = 
bvec_kmap_local(&dst_bv); - memcpy(dst_p + dst_bv.bv_offset, - src_p + src_bv.bv_offset, - bytes); + memcpy(dst_buf, src_buf, bytes); - kunmap_atomic(dst_p); - kunmap_atomic(src_p); - - flush_dcache_page(dst_bv.bv_page); + kunmap_local(dst_buf); + kunmap_local(src_buf); bio_advance_iter_single(src, src_iter, bytes); bio_advance_iter_single(dst, dst_iter, bytes); @@ -1250,18 +1503,12 @@ EXPORT_SYMBOL(bio_free_pages); * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions * for performing direct-IO in BIOs. * - * The problem is that we cannot run set_page_dirty() from interrupt context + * The problem is that we cannot run folio_mark_dirty() from interrupt context * because the required locks are not interrupt-safe. So what we can do is to * mark the pages dirty _before_ performing IO. And in interrupt context, * check that the pages are still dirty. If so, fine. If not, redirty them * in process context. * - * We special-case compound pages here: normally this means reads into hugetlb - * pages. The logic in here doesn't really work right for compound pages - * because the VM does not uniformly chase down the head page in all cases. - * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't - * handle them at all. So we skip compound pages here at an early stage. - * * Note that this code is very hard to test under normal circumstances because * direct-io pins the pages with get_user_pages(). This makes * is_page_cache_freeable return false, and the VM will not clean the pages. @@ -1277,14 +1524,15 @@ EXPORT_SYMBOL(bio_free_pages); */ void bio_set_pages_dirty(struct bio *bio) { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bvec, bio, iter_all) { - if (!PageCompound(bvec->bv_page)) - set_page_dirty_lock(bvec->bv_page); + bio_for_each_folio_all(fi, bio) { + folio_lock(fi.folio); + folio_mark_dirty(fi.folio); + folio_unlock(fi.folio); } } +EXPORT_SYMBOL_GPL(bio_set_pages_dirty); /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. @@ -1293,8 +1541,8 @@ void bio_set_pages_dirty(struct bio *bio) * the BIO and re-dirty the pages in process context. * * It is expected that bio_check_pages_dirty() will wholly own the BIO from - * here on. It will run one put_page() against each page and will run one - * bio_put() against the BIO. + * here on. It will unpin each page and will run one bio_put() against the + * BIO. 
*/ static void bio_dirty_fn(struct work_struct *work); @@ -1325,12 +1573,11 @@ static void bio_dirty_fn(struct work_struct *work) void bio_check_pages_dirty(struct bio *bio) { - struct bio_vec *bvec; + struct folio_iter fi; unsigned long flags; - struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, iter_all) { - if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page)) + bio_for_each_folio_all(fi, bio) { + if (!folio_test_dirty(fi.folio)) goto defer; } @@ -1344,6 +1591,7 @@ defer: spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } +EXPORT_SYMBOL_GPL(bio_check_pages_dirty); static inline bool bio_remaining_done(struct bio *bio) { @@ -1385,11 +1633,12 @@ again: if (!bio_integrity_endio(bio)) return; - if (bio->bi_bdev) - rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio); + blk_zone_bio_endio(bio); + + rq_qos_done_bio(bio); if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio); + trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } @@ -1406,9 +1655,18 @@ again: goto again; } - blk_throtl_bio_endio(bio); - /* release cgroup info */ - bio_uninit(bio); +#ifdef CONFIG_BLK_CGROUP + /* + * Release cgroup info. We shouldn't have to do this here, but quite + * a few callers of bio_init fail to call bio_uninit, so we cover up + * for that here at least for now. + */ + if (bio->bi_blkg) { + blkg_put(bio->bi_blkg); + bio->bi_blkg = NULL; + } +#endif + if (bio->bi_end_io) bio->bi_end_io(bio); } @@ -1433,16 +1691,22 @@ struct bio *bio_split(struct bio *bio, int sectors, { struct bio *split; - BUG_ON(sectors <= 0); - BUG_ON(sectors >= bio_sectors(bio)); + if (WARN_ON_ONCE(sectors <= 0)) + return ERR_PTR(-EINVAL); + if (WARN_ON_ONCE(sectors >= bio_sectors(bio))) + return ERR_PTR(-EINVAL); /* Zone append commands cannot be split */ if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) - return NULL; + return ERR_PTR(-EINVAL); + + /* atomic writes cannot be split */ + if (bio->bi_opf & REQ_ATOMIC) + return ERR_PTR(-EINVAL); - split = bio_clone_fast(bio, gfp, bs); + split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs); if (!split) - return NULL; + return ERR_PTR(-ENOMEM); split->bi_iter.bi_size = sectors << 9; @@ -1463,12 +1727,19 @@ EXPORT_SYMBOL(bio_split); * @bio: bio to trim * @offset: number of sectors to trim from the front of @bio * @size: size we want to trim @bio to, in sectors + * + * This function is typically used for bios that are cloned and submitted + * to the underlying device in parts. */ -void bio_trim(struct bio *bio, int offset, int size) +void bio_trim(struct bio *bio, sector_t offset, sector_t size) { - /* 'bio' is a cloned bio which we need to trim to match - * the given offset and size. 
- */ + /* We should never trim an atomic write */ + if (WARN_ON_ONCE(bio->bi_opf & REQ_ATOMIC && size)) + return; + + if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS || + offset + size > bio_sectors(bio))) + return; size <<= 9; if (offset == 0 && size == bio->bi_iter.bi_size) @@ -1479,7 +1750,6 @@ void bio_trim(struct bio *bio, int offset, int size) if (bio_integrity(bio)) bio_integrity_trim(bio); - } EXPORT_SYMBOL_GPL(bio_trim); @@ -1502,6 +1772,7 @@ int biovec_init_pool(mempool_t *pool, int pool_entries) */ void bioset_exit(struct bio_set *bs) { + bio_alloc_cache_destroy(bs); if (bs->rescue_workqueue) destroy_workqueue(bs->rescue_workqueue); bs->rescue_workqueue = NULL; @@ -1509,7 +1780,6 @@ void bioset_exit(struct bio_set *bs) mempool_exit(&bs->bio_pool); mempool_exit(&bs->bvec_pool); - bioset_integrity_free(bs); if (bs->bio_slab) bio_put_slab(bs); bs->bio_slab = NULL; @@ -1532,9 +1802,9 @@ EXPORT_SYMBOL(bioset_exit); * Note that the bio must be embedded at the END of that structure always, * or things will break badly. * If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated - * for allocating iovecs. This pool is not needed e.g. for bio_clone_fast(). - * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used to - * dispatch queued requests when the mempool runs out of space. + * for allocating iovecs. This pool is not needed e.g. for bio_init_clone(). + * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used + * to dispatch queued requests when the mempool runs out of space. * */ int bioset_init(struct bio_set *bs, @@ -1563,12 +1833,18 @@ int bioset_init(struct bio_set *bs, biovec_init_pool(&bs->bvec_pool, pool_size)) goto bad; - if (!(flags & BIOSET_NEED_RESCUER)) - return 0; - - bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); - if (!bs->rescue_workqueue) - goto bad; + if (flags & BIOSET_NEED_RESCUER) { + bs->rescue_workqueue = alloc_workqueue("bioset", + WQ_MEM_RECLAIM, 0); + if (!bs->rescue_workqueue) + goto bad; + } + if (flags & BIOSET_PERCPU_CACHE) { + bs->cache = alloc_percpu(struct bio_alloc_cache); + if (!bs->cache) + goto bad; + cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead); + } return 0; bad: @@ -1577,29 +1853,11 @@ bad: } EXPORT_SYMBOL(bioset_init); -/* - * Initialize and setup a new bio_set, based on the settings from - * another bio_set. - */ -int bioset_init_from_src(struct bio_set *bs, struct bio_set *src) -{ - int flags; - - flags = 0; - if (src->bvec_pool.min_nr) - flags |= BIOSET_NEED_BVECS; - if (src->rescue_workqueue) - flags |= BIOSET_NEED_RESCUER; - - return bioset_init(bs, src->bio_pool.min_nr, src->front_pad, flags); -} -EXPORT_SYMBOL(bioset_init_from_src); - static int __init init_bio(void) { int i; - bio_integrity_init(); + BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags)); for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { struct biovec_slab *bvs = bvec_slabs + i; @@ -1609,11 +1867,12 @@ static int __init init_bio(void) SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); } - if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS)) - panic("bio: can't allocate bios\n"); + cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL, + bio_cpu_dead); - if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE)) - panic("bio: can't create integrity pool\n"); + if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, + BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE)) + panic("bio: can't allocate bios\n"); return 0; } |
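
The patch reworks bio_init() to take the target block device and the operation/flags up front, and its comment insists that every bio_init() is paired with bio_uninit(). A minimal on-stack sketch of that pattern, essentially what the new bdev_rw_virt() helper wraps; the bdev, sector and kmalloc'ed buffer are assumed to be supplied by the caller:

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Synchronously read one logical block into a direct-mapped (kmalloc'ed) buffer. */
static int my_read_one_block(struct block_device *bdev, sector_t sector, void *buf)
{
	struct bio_vec bv;	/* caller-provided single-entry bvec table */
	struct bio bio;
	int error;

	bio_init(&bio, bdev, &bv, 1, REQ_OP_READ);
	bio.bi_iter.bi_sector = sector;
	bio_add_virt_nofail(&bio, buf, bdev_logical_block_size(bdev));

	error = submit_bio_wait(&bio);
	bio_uninit(&bio);	/* always pair bio_init() with bio_uninit() */
	return error;
}

Callers that only need this exact pattern can call bdev_rw_virt() directly instead.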
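
bio_chain_and_submit() and blk_next_bio() capture the "submit the previous bio and continue with a fresh one" loop used by the discard/zero-out helpers, with every earlier bio chained to its successor so that waiting on the last bio waits for all of them. A sketch of zeroing a range in chunks, assuming the device supports REQ_OP_WRITE_ZEROES; the chunk size is an arbitrary illustration value:

#include <linux/bio.h>
#include <linux/blkdev.h>

#define MY_MAX_ZEROES_SECTORS	(1U << 15)	/* arbitrary per-bio limit for this sketch */

static int my_zero_range(struct block_device *bdev, sector_t sector, sector_t nr_sects)
{
	struct bio *bio = NULL;

	if (!nr_sects)
		return 0;

	while (nr_sects) {
		sector_t len = min_t(sector_t, nr_sects, MY_MAX_ZEROES_SECTORS);

		/* submits the previous bio (if any) and hands back a fresh one */
		bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, GFP_KERNEL);
		bio->bi_iter.bi_sector = sector;
		bio->bi_iter.bi_size = len << SECTOR_SHIFT;

		sector += len;
		nr_sects -= len;
	}

	/* chained predecessors only complete through the last bio */
	return submit_bio_wait(bio);
}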
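
bio_alloc_bioset() likewise gains the bdev and opf arguments, and biosets created with BIOSET_PERCPU_CACHE recycle small bios (up to BIO_INLINE_VECS vecs, flagged REQ_ALLOC_CACHE) through the new per-cpu alloc cache on bio_put(). A sketch of a driver-private bio_set using it; the my_* names are placeholders:

#include <linux/bio.h>

static struct bio_set my_bio_set;

static int my_driver_init(void)
{
	/* BIO_POOL_SIZE bios, no front padding, bvec pool plus the per-cpu cache */
	return bioset_init(&my_bio_set, BIO_POOL_SIZE, 0,
			   BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE);
}

static void my_read_end_io(struct bio *bio)
{
	/* ... check bio->bi_status and complete the driver request ... */
	bio_put(bio);	/* may return the bio to the per-cpu cache */
}

static void my_submit_page_read(struct block_device *bdev, sector_t sector,
				struct page *page)
{
	struct bio *bio;

	/* small allocations like this one can be served from the cache */
	bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_KERNEL, &my_bio_set);
	bio->bi_iter.bi_sector = sector;
	bio->bi_end_io = my_read_end_io;
	__bio_add_page(bio, page, PAGE_SIZE, 0);
	submit_bio(bio);
}

static void my_driver_exit(void)
{
	bioset_exit(&my_bio_set);	/* also tears down the per-cpu cache */
}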
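
bio_kmalloc() no longer returns a ready-to-use bio: per the new kernel-doc it only kmallocs the bio plus @nr_vecs bio_vecs, is not mempool backed, and the caller handles bio_init()/bio_uninit()/kfree() itself. A sketch of that lifecycle; using bio_init_inline() (seen above in the per-cpu cache path) to wire up the trailing vecs is my assumption, a caller-provided table with plain bio_init() works just as well:

#include <linux/bio.h>
#include <linux/slab.h>

static struct bio *my_bio_kmalloc(struct block_device *bdev,
				  unsigned short nr_vecs, blk_opf_t opf)
{
	struct bio *bio;

	bio = bio_kmalloc(nr_vecs, GFP_KERNEL);	/* may fail, not for the fs I/O path */
	if (!bio)
		return NULL;
	/* the bio_vecs sit directly behind the bio; hook them up before use */
	bio_init_inline(bio, bdev, nr_vecs, opf);
	return bio;
}

static void my_bio_kfree(struct bio *bio)
{
	bio_uninit(bio);	/* drop blkcg/crypto/integrity state first */
	kfree(bio);
}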
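
bio_alloc_clone() and bio_init_clone() replace bio_clone_fast()/__bio_clone_fast() and take the target block device directly; the clone shares the source's bvec table, so the source must outlive it. A stacking-driver style sketch that remaps an incoming bio onto a lower device; the lower bdev, sector offset and bio_set are assumed to come from the driver:

#include <linux/bio.h>

static void my_clone_end_io(struct bio *clone)
{
	struct bio *orig = clone->bi_private;

	orig->bi_status = clone->bi_status;
	bio_put(clone);
	bio_endio(orig);
}

static void my_remap_and_submit(struct bio *orig, struct block_device *lower_bdev,
				sector_t start_offset, struct bio_set *bs)
{
	struct bio *clone;

	clone = bio_alloc_clone(lower_bdev, orig, GFP_NOIO, bs);
	if (!clone) {
		bio_io_error(orig);
		return;
	}
	clone->bi_iter.bi_sector += start_offset;
	clone->bi_end_io = my_clone_end_io;
	clone->bi_private = orig;
	submit_bio_noacct(clone);
}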
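
bio_add_vmalloc() and bio_add_vmalloc_chunk() hide the page-by-page walk and the flush_kernel_vmap_range() call needed for vmalloc buffers; per the comment above, a read completion would still have to call invalidate_kernel_vmap_range() itself. A sketch of synchronously writing out a vmalloc'ed buffer, assuming it fits in BIO_MAX_VECS segments:

#include <linux/bio.h>
#include <linux/vmalloc.h>

static int my_write_vmalloc_buf(struct block_device *bdev, sector_t sector,
				void *buf, unsigned int len)
{
	/* worst case: one segment per page of the vmalloc region */
	unsigned int nr_vecs = DIV_ROUND_UP(offset_in_page(buf) + len, PAGE_SIZE);
	struct bio *bio;
	int error;

	if (nr_vecs > BIO_MAX_VECS)
		return -E2BIG;

	bio = bio_alloc(bdev, nr_vecs, REQ_OP_WRITE, GFP_KERNEL);
	bio->bi_iter.bi_sector = sector;
	if (!bio_add_vmalloc(bio, buf, len)) {
		bio_put(bio);
		return -EIO;
	}

	error = submit_bio_wait(bio);
	bio_put(bio);
	return error;
}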
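
bio_iov_iter_get_pages() now pins pages (BIO_PAGE_PINNED) via iov_iter_extract_pages() instead of taking page references, and grew a length-alignment mask argument (0 accepts any length). A direct-I/O flavoured sketch, assuming a user-backed iov_iter, calling the three-argument form exactly as defined above and sizing the bio with the bio_iov_vecs_to_alloc() helper from bio.h; the completion side uses bio_set_pages_dirty()/bio_check_pages_dirty(), which the patch now exports, so reads re-dirty the pinned pages safely:

#include <linux/bio.h>
#include <linux/uio.h>

static void my_dio_end_io(struct bio *bio)
{
	if (bio_data_dir(bio) == READ) {
		/* re-dirties the pinned pages, drops the pins and puts the bio */
		bio_check_pages_dirty(bio);
	} else {
		bio_release_pages(bio, false);
		bio_put(bio);
	}
}

static int my_dio_submit(struct block_device *bdev, struct iov_iter *iter,
			 loff_t pos, blk_opf_t opf)
{
	struct bio *bio;
	int ret;

	bio = bio_alloc(bdev, bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
			opf, GFP_KERNEL);
	bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
	bio->bi_end_io = my_dio_end_io;

	ret = bio_iov_iter_get_pages(bio, iter, 0);	/* 0: no extra length alignment */
	if (ret) {
		bio_put(bio);
		return ret;
	}

	if (bio_data_dir(bio) == READ)
		bio_set_pages_dirty(bio);
	submit_bio(bio);
	return 0;
}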
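
Finally, bio_split() now returns ERR_PTR() codes instead of NULL and additionally refuses atomic writes, so callers need IS_ERR() checks. A sketch in the style of the block layer's own splitting: peel off a front part that fits, requeue the remainder, and keep working on the front; the max_sectors limit is assumed to come from the caller:

#include <linux/bio.h>
#include <linux/err.h>

static struct bio *my_split_front(struct bio *bio, unsigned int max_sectors,
				  struct bio_set *bs)
{
	struct bio *split;

	if (bio_sectors(bio) <= max_sectors)
		return bio;	/* already small enough */

	split = bio_split(bio, max_sectors, GFP_NOIO, bs);
	if (IS_ERR(split))
		return split;	/* e.g. zone append or atomic write */

	/*
	 * @bio is now the remainder and only completes after @split does.
	 * Requeue it and continue with the front part.
	 */
	bio_chain(split, bio);
	submit_bio_noacct(bio);
	return split;
}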
