diff options
Diffstat (limited to 'fs/btrfs/bio.c')
-rw-r--r-- | fs/btrfs/bio.c | 262 |
1 files changed, 140 insertions, 122 deletions
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 477f350a8bd0..f7d8958b7327 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -29,7 +29,7 @@ struct btrfs_failed_bio { /* Is this a data path I/O that needs storage layer checksum and repair? */ static inline bool is_data_bbio(struct btrfs_bio *bbio) { - return bbio->inode && is_data_inode(&bbio->inode->vfs_inode); + return bbio->inode && is_data_inode(bbio->inode); } static bool bbio_has_ordered_extent(struct btrfs_bio *bbio) @@ -49,11 +49,12 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, bbio->end_io = end_io; bbio->private = private; atomic_set(&bbio->pending_ios, 1); + WRITE_ONCE(bbio->status, BLK_STS_OK); } /* * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for - * btrfs, and is used for all I/O submitted through btrfs_submit_bio. + * btrfs, and is used for all I/O submitted through btrfs_submit_bbio(). * * Just like the underlying bio_alloc_bioset it will not fail as it is backed by * a mempool. @@ -73,20 +74,16 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, struct btrfs_bio *orig_bbio, - u64 map_length, bool use_append) + u64 map_length) { struct btrfs_bio *bbio; struct bio *bio; - if (use_append) { - unsigned int nr_segs; + bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS, + &btrfs_clone_bioset); + if (IS_ERR(bio)) + return ERR_CAST(bio); - bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs, - &btrfs_clone_bioset, map_length); - } else { - bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, - GFP_NOFS, &btrfs_clone_bioset); - } bbio = btrfs_bio(bio); btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); bbio->inode = orig_bbio->inode; @@ -100,66 +97,41 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, return bbio; } -/* Free a bio that was never submitted to the underlying device. */ -static void btrfs_cleanup_bio(struct btrfs_bio *bbio) -{ - if (bbio_has_ordered_extent(bbio)) - btrfs_put_ordered_extent(bbio->ordered); - bio_put(&bbio->bio); -} - -static void __btrfs_bio_end_io(struct btrfs_bio *bbio) -{ - if (bbio_has_ordered_extent(bbio)) { - struct btrfs_ordered_extent *ordered = bbio->ordered; - - bbio->end_io(bbio); - btrfs_put_ordered_extent(ordered); - } else { - bbio->end_io(bbio); - } -} - void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { bbio->bio.bi_status = status; - __btrfs_bio_end_io(bbio); -} + if (bbio->bio.bi_pool == &btrfs_clone_bioset) { + struct btrfs_bio *orig_bbio = bbio->private; -static void btrfs_orig_write_end_io(struct bio *bio); + /* Free bio that was never submitted to the underlying device. */ + if (bbio_has_ordered_extent(bbio)) + btrfs_put_ordered_extent(bbio->ordered); + bio_put(&bbio->bio); + + bbio = orig_bbio; + } -static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, - struct btrfs_bio *orig_bbio) -{ /* - * For writes we tolerate nr_mirrors - 1 write failures, so we can't - * just blindly propagate a write failure here. Instead increment the - * error count in the original I/O context so that it is guaranteed to - * be larger than the error tolerance. + * At this point, bbio always points to the original btrfs_bio. Save + * the first error in it. */ - if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { - struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; - struct btrfs_io_context *orig_bioc = orig_stripe->bioc; + if (status != BLK_STS_OK) + cmpxchg(&bbio->status, BLK_STS_OK, status); - atomic_add(orig_bioc->max_errors, &orig_bioc->error); - } else { - orig_bbio->bio.bi_status = bbio->bio.bi_status; - } -} + if (atomic_dec_and_test(&bbio->pending_ios)) { + /* Load split bio's error which might be set above. */ + if (status == BLK_STS_OK) + bbio->bio.bi_status = READ_ONCE(bbio->status); -static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) -{ - if (bbio->bio.bi_pool == &btrfs_clone_bioset) { - struct btrfs_bio *orig_bbio = bbio->private; + if (bbio_has_ordered_extent(bbio)) { + struct btrfs_ordered_extent *ordered = bbio->ordered; - if (bbio->bio.bi_status) - btrfs_bbio_propagate_error(bbio, orig_bbio); - btrfs_cleanup_bio(bbio); - bbio = orig_bbio; + bbio->end_io(bbio); + btrfs_put_ordered_extent(ordered); + } else { + bbio->end_io(bbio); + } } - - if (atomic_dec_and_test(&bbio->pending_ios)) - __btrfs_bio_end_io(bbio); } static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) @@ -179,7 +151,7 @@ static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) static void btrfs_repair_done(struct btrfs_failed_bio *fbio) { if (atomic_dec_and_test(&fbio->repair_count)) { - btrfs_orig_bbio_end_io(fbio->bbio); + btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status); mempool_free(fbio, &btrfs_failed_bio_pool); } } @@ -211,7 +183,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, goto done; } - btrfs_submit_bio(repair_bbio, mirror); + btrfs_submit_bbio(repair_bbio, mirror); return; } @@ -220,7 +192,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, btrfs_repair_io_failure(fs_info, btrfs_ino(inode), repair_bbio->file_offset, fs_info->sectorsize, repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, - page_folio(bv->bv_page), bv->bv_offset, mirror); + bvec_phys(bv), mirror); } while (mirror != fbio->bbio->mirror_num); done: @@ -280,7 +252,7 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); - btrfs_submit_bio(repair_bbio, mirror); + btrfs_submit_bbio(repair_bbio, mirror); return fbio; } @@ -326,7 +298,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de if (fbio) btrfs_repair_done(fbio); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) @@ -360,7 +332,7 @@ static void btrfs_end_bio_work(struct work_struct *work) if (is_data_bbio(bbio)) btrfs_check_read_bio(bbio, bbio->bio.bi_private); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } static void btrfs_simple_end_io(struct bio *bio) @@ -378,9 +350,9 @@ static void btrfs_simple_end_io(struct bio *bio) INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) btrfs_record_physical_zoned(bbio); - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } } @@ -394,7 +366,7 @@ static void btrfs_raid56_end_io(struct bio *bio) if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) btrfs_check_read_bio(bbio, NULL); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); btrfs_put_bioc(bioc); } @@ -421,10 +393,10 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK; - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); btrfs_put_bioc(bioc); } @@ -435,7 +407,7 @@ static void btrfs_clone_write_end_io(struct bio *bio) if (bio->bi_status) { atomic_inc(&stripe->bioc->error); btrfs_log_dev_io_error(bio, stripe->dev); - } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + } else if (bio_is_zone_append(bio)) { stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; } @@ -473,6 +445,14 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), dev->devid, bio->bi_iter.bi_size); + /* + * Track reads if tracking is enabled; ignore I/O operations before the + * filesystem is fully initialized. + */ + if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info) + percpu_counter_add(&dev->fs_info->stats_read_blocks, + bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits); + if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT) blkcg_punt_bio_submit(bio); else @@ -502,8 +482,8 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); } -static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, - struct btrfs_io_stripe *smap, int mirror_num) +static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) { if (!bioc) { /* Single mirror read/write fast path. */ @@ -532,7 +512,7 @@ static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, } } -static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio) +static int btrfs_bio_csum(struct btrfs_bio *bbio) { if (bbio->bio.bi_opf & REQ_META) return btree_csum_one_bio(bbio); @@ -563,11 +543,11 @@ static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async = container_of(work, struct async_submit_bio, work); - blk_status_t ret; + int ret; ret = btrfs_bio_csum(async->bbio); if (ret) - async->bbio->bio.bi_status = ret; + async->bbio->bio.bi_status = errno_to_blk_status(ret); } /* @@ -593,7 +573,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) /* If an error occurred we just want to clean up the bio and move on. */ if (bio->bi_status) { - btrfs_orig_bbio_end_io(async->bbio); + btrfs_bio_end_io(async->bbio, bio->bi_status); return; } @@ -603,14 +583,14 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) * context. This changes nothing when cgroups aren't in use. */ bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT; - __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); + btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); } static bool should_async_write(struct btrfs_bio *bbio) { bool auto_csum_mode = true; -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices; enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); @@ -664,11 +644,29 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, return true; } +static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) +{ + unsigned int nr_segs; + int sector_offset; + + map_length = min(map_length, bbio->fs_info->max_zone_append_size); + sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits, + &nr_segs, map_length); + if (sector_offset) { + /* + * bio_split_rw_at() could split at a size smaller than our + * sectorsize and thus cause unaligned I/Os. Fix that by + * always rounding down to the nearest boundary. + */ + return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize); + } + return map_length; +} + static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = bbio->fs_info; - struct btrfs_bio *orig_bbio = bbio; struct bio *bio = &bbio->bio; u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 length = bio->bi_iter.bi_size; @@ -676,25 +674,37 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) bool use_append = btrfs_use_zone_append(bbio); struct btrfs_io_context *bioc = NULL; struct btrfs_io_stripe smap; - blk_status_t ret; - int error; + blk_status_t status; + int ret; - smap.is_scrub = !bbio->inode; + if (!bbio->inode || btrfs_is_data_reloc_root(inode->root)) + smap.rst_search_commit_root = true; + else + smap.rst_search_commit_root = false; btrfs_bio_counter_inc_blocked(fs_info); - error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num); - if (error) { - ret = errno_to_blk_status(error); - goto fail; + ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, + &bioc, &smap, &mirror_num); + if (ret) { + status = errno_to_blk_status(ret); + btrfs_bio_counter_dec(fs_info); + goto end_bbio; } map_length = min(map_length, length); if (use_append) - map_length = min(map_length, fs_info->max_zone_append_size); + map_length = btrfs_append_map_length(bbio, map_length); if (map_length < length) { - bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append); + struct btrfs_bio *split; + + split = btrfs_split_bio(fs_info, bbio, map_length); + if (IS_ERR(split)) { + status = errno_to_blk_status(PTR_ERR(split)); + btrfs_bio_counter_dec(fs_info); + goto end_bbio; + } + bbio = split; bio = &bbio->bio; } @@ -705,8 +715,9 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) { bbio->saved_iter = bio->bi_iter; ret = btrfs_lookup_bio_sums(bbio); - if (ret) - goto fail_put_bio; + status = errno_to_blk_status(ret); + if (status) + goto fail; } if (btrfs_op(bio) == BTRFS_MAP_WRITE) { @@ -715,8 +726,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) bio->bi_opf |= REQ_OP_ZONE_APPEND; } - if (is_data_bbio(bbio) && bioc && - btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) { + if (is_data_bbio(bbio) && bioc && bioc->use_rst) { /* * No locking for the list update, as we only add to * the list in the I/O submission path, and list @@ -732,37 +742,51 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) * point, so they are handled as part of the no-checksum case. */ if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) && - !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && + !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) && !btrfs_is_data_reloc_root(inode->root)) { if (should_async_write(bbio) && btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) goto done; ret = btrfs_bio_csum(bbio); - if (ret) - goto fail_put_bio; - } else if (use_append) { + status = errno_to_blk_status(ret); + if (status) + goto fail; + } else if (use_append || + (btrfs_is_zoned(fs_info) && inode && + inode->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_alloc_dummy_sum(bbio); - if (ret) - goto fail_put_bio; + status = errno_to_blk_status(ret); + if (status) + goto fail; } } - __btrfs_submit_bio(bio, bioc, &smap, mirror_num); + btrfs_submit_bio(bio, bioc, &smap, mirror_num); done: return map_length == length; -fail_put_bio: - if (map_length < length) - btrfs_cleanup_bio(bbio); fail: btrfs_bio_counter_dec(fs_info); - btrfs_bio_end_io(orig_bbio, ret); + /* + * We have split the original bbio, now we have to end both the current + * @bbio and remaining one, as the remaining one will never be submitted. + */ + if (map_length < length) { + struct btrfs_bio *remaining = bbio->private; + + ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset); + ASSERT(remaining); + + btrfs_bio_end_io(remaining, status); + } +end_bbio: + btrfs_bio_end_io(bbio, status); /* Do not submit another chunk */ return true; } -void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) +void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num) { /* If bbio->inode is not populated, its file_offset must be 0. */ ASSERT(bbio->inode || bbio->file_offset == 0); @@ -774,7 +798,7 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) /* * Submit a repair write. * - * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a + * This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a * RAID setup. Here we only want to write the one bad copy, so we do the * mapping ourselves and submit the bio directly. * @@ -782,8 +806,7 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) * freeing the bio. */ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct folio *folio, - unsigned int folio_offset, int mirror_num) + u64 length, u64 logical, phys_addr_t paddr, int mirror_num) { struct btrfs_io_stripe smap = { 0 }; struct bio_vec bvec; @@ -814,8 +837,7 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - ret = bio_add_folio(&bio, folio, length, folio_offset); - ASSERT(ret); + __bio_add_page(&bio, phys_to_page(paddr), length, offset_in_page(paddr)); ret = submit_bio_wait(&bio); if (ret) { /* try to remap that extent elsewhere? */ @@ -863,7 +885,7 @@ void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_ ASSERT(smap.dev == fs_info->dev_replace.srcdev); smap.dev = fs_info->dev_replace.tgtdev; } - __btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); + btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); return; fail: @@ -879,22 +901,18 @@ int __init btrfs_bioset_init(void) return -ENOMEM; if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE, offsetof(struct btrfs_bio, bio), 0)) - goto out_free_bioset; + goto out; if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE, offsetof(struct btrfs_bio, bio), BIOSET_NEED_BVECS)) - goto out_free_clone_bioset; + goto out; if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE, sizeof(struct btrfs_failed_bio))) - goto out_free_repair_bioset; + goto out; return 0; -out_free_repair_bioset: - bioset_exit(&btrfs_repair_bioset); -out_free_clone_bioset: - bioset_exit(&btrfs_clone_bioset); -out_free_bioset: - bioset_exit(&btrfs_bioset); +out: + btrfs_bioset_exit(); return -ENOMEM; } |