summaryrefslogtreecommitdiff
path: root/fs/btrfs/bio.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/bio.c')
-rw-r--r--fs/btrfs/bio.c262
1 files changed, 140 insertions, 122 deletions
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 477f350a8bd0..f7d8958b7327 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -29,7 +29,7 @@ struct btrfs_failed_bio {
/* Is this a data path I/O that needs storage layer checksum and repair? */
static inline bool is_data_bbio(struct btrfs_bio *bbio)
{
- return bbio->inode && is_data_inode(&bbio->inode->vfs_inode);
+ return bbio->inode && is_data_inode(bbio->inode);
}
static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
@@ -49,11 +49,12 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
bbio->end_io = end_io;
bbio->private = private;
atomic_set(&bbio->pending_ios, 1);
+ WRITE_ONCE(bbio->status, BLK_STS_OK);
}
/*
* Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
- * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
+ * btrfs, and is used for all I/O submitted through btrfs_submit_bbio().
*
* Just like the underlying bio_alloc_bioset it will not fail as it is backed by
* a mempool.
@@ -73,20 +74,16 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
struct btrfs_bio *orig_bbio,
- u64 map_length, bool use_append)
+ u64 map_length)
{
struct btrfs_bio *bbio;
struct bio *bio;
- if (use_append) {
- unsigned int nr_segs;
+ bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS,
+ &btrfs_clone_bioset);
+ if (IS_ERR(bio))
+ return ERR_CAST(bio);
- bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs,
- &btrfs_clone_bioset, map_length);
- } else {
- bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT,
- GFP_NOFS, &btrfs_clone_bioset);
- }
bbio = btrfs_bio(bio);
btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
bbio->inode = orig_bbio->inode;
@@ -100,66 +97,41 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
return bbio;
}
-/* Free a bio that was never submitted to the underlying device. */
-static void btrfs_cleanup_bio(struct btrfs_bio *bbio)
-{
- if (bbio_has_ordered_extent(bbio))
- btrfs_put_ordered_extent(bbio->ordered);
- bio_put(&bbio->bio);
-}
-
-static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
-{
- if (bbio_has_ordered_extent(bbio)) {
- struct btrfs_ordered_extent *ordered = bbio->ordered;
-
- bbio->end_io(bbio);
- btrfs_put_ordered_extent(ordered);
- } else {
- bbio->end_io(bbio);
- }
-}
-
void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
bbio->bio.bi_status = status;
- __btrfs_bio_end_io(bbio);
-}
+ if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
+ struct btrfs_bio *orig_bbio = bbio->private;
-static void btrfs_orig_write_end_io(struct bio *bio);
+ /* Free bio that was never submitted to the underlying device. */
+ if (bbio_has_ordered_extent(bbio))
+ btrfs_put_ordered_extent(bbio->ordered);
+ bio_put(&bbio->bio);
+
+ bbio = orig_bbio;
+ }
-static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
- struct btrfs_bio *orig_bbio)
-{
/*
- * For writes we tolerate nr_mirrors - 1 write failures, so we can't
- * just blindly propagate a write failure here. Instead increment the
- * error count in the original I/O context so that it is guaranteed to
- * be larger than the error tolerance.
+ * At this point, bbio always points to the original btrfs_bio. Save
+ * the first error in it.
*/
- if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) {
- struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private;
- struct btrfs_io_context *orig_bioc = orig_stripe->bioc;
+ if (status != BLK_STS_OK)
+ cmpxchg(&bbio->status, BLK_STS_OK, status);
- atomic_add(orig_bioc->max_errors, &orig_bioc->error);
- } else {
- orig_bbio->bio.bi_status = bbio->bio.bi_status;
- }
-}
+ if (atomic_dec_and_test(&bbio->pending_ios)) {
+ /* Load split bio's error which might be set above. */
+ if (status == BLK_STS_OK)
+ bbio->bio.bi_status = READ_ONCE(bbio->status);
-static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio)
-{
- if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
- struct btrfs_bio *orig_bbio = bbio->private;
+ if (bbio_has_ordered_extent(bbio)) {
+ struct btrfs_ordered_extent *ordered = bbio->ordered;
- if (bbio->bio.bi_status)
- btrfs_bbio_propagate_error(bbio, orig_bbio);
- btrfs_cleanup_bio(bbio);
- bbio = orig_bbio;
+ bbio->end_io(bbio);
+ btrfs_put_ordered_extent(ordered);
+ } else {
+ bbio->end_io(bbio);
+ }
}
-
- if (atomic_dec_and_test(&bbio->pending_ios))
- __btrfs_bio_end_io(bbio);
}
static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
@@ -179,7 +151,7 @@ static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
{
if (atomic_dec_and_test(&fbio->repair_count)) {
- btrfs_orig_bbio_end_io(fbio->bbio);
+ btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status);
mempool_free(fbio, &btrfs_failed_bio_pool);
}
}
@@ -211,7 +183,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
goto done;
}
- btrfs_submit_bio(repair_bbio, mirror);
+ btrfs_submit_bbio(repair_bbio, mirror);
return;
}
@@ -220,7 +192,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
repair_bbio->file_offset, fs_info->sectorsize,
repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
- page_folio(bv->bv_page), bv->bv_offset, mirror);
+ bvec_phys(bv), mirror);
} while (mirror != fbio->bbio->mirror_num);
done:
@@ -280,7 +252,7 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
- btrfs_submit_bio(repair_bbio, mirror);
+ btrfs_submit_bbio(repair_bbio, mirror);
return fbio;
}
@@ -326,7 +298,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
if (fbio)
btrfs_repair_done(fbio);
else
- btrfs_orig_bbio_end_io(bbio);
+ btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
@@ -360,7 +332,7 @@ static void btrfs_end_bio_work(struct work_struct *work)
if (is_data_bbio(bbio))
btrfs_check_read_bio(bbio, bbio->bio.bi_private);
else
- btrfs_orig_bbio_end_io(bbio);
+ btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
static void btrfs_simple_end_io(struct bio *bio)
@@ -378,9 +350,9 @@ static void btrfs_simple_end_io(struct bio *bio)
INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
} else {
- if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
+ if (bio_is_zone_append(bio) && !bio->bi_status)
btrfs_record_physical_zoned(bbio);
- btrfs_orig_bbio_end_io(bbio);
+ btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
}
@@ -394,7 +366,7 @@ static void btrfs_raid56_end_io(struct bio *bio)
if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
btrfs_check_read_bio(bbio, NULL);
else
- btrfs_orig_bbio_end_io(bbio);
+ btrfs_bio_end_io(bbio, bbio->bio.bi_status);
btrfs_put_bioc(bioc);
}
@@ -421,10 +393,10 @@ static void btrfs_orig_write_end_io(struct bio *bio)
else
bio->bi_status = BLK_STS_OK;
- if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
+ if (bio_is_zone_append(bio) && !bio->bi_status)
stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
- btrfs_orig_bbio_end_io(bbio);
+ btrfs_bio_end_io(bbio, bbio->bio.bi_status);
btrfs_put_bioc(bioc);
}
@@ -435,7 +407,7 @@ static void btrfs_clone_write_end_io(struct bio *bio)
if (bio->bi_status) {
atomic_inc(&stripe->bioc->error);
btrfs_log_dev_io_error(bio, stripe->dev);
- } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ } else if (bio_is_zone_append(bio)) {
stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
}
@@ -473,6 +445,14 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
dev->devid, bio->bi_iter.bi_size);
+ /*
+ * Track reads if tracking is enabled; ignore I/O operations before the
+ * filesystem is fully initialized.
+ */
+ if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info)
+ percpu_counter_add(&dev->fs_info->stats_read_blocks,
+ bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits);
+
if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
blkcg_punt_bio_submit(bio);
else
@@ -502,8 +482,8 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}
-static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
- struct btrfs_io_stripe *smap, int mirror_num)
+static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
+ struct btrfs_io_stripe *smap, int mirror_num)
{
if (!bioc) {
/* Single mirror read/write fast path. */
@@ -532,7 +512,7 @@ static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
}
}
-static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio)
+static int btrfs_bio_csum(struct btrfs_bio *bbio)
{
if (bbio->bio.bi_opf & REQ_META)
return btree_csum_one_bio(bbio);
@@ -563,11 +543,11 @@ static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async =
container_of(work, struct async_submit_bio, work);
- blk_status_t ret;
+ int ret;
ret = btrfs_bio_csum(async->bbio);
if (ret)
- async->bbio->bio.bi_status = ret;
+ async->bbio->bio.bi_status = errno_to_blk_status(ret);
}
/*
@@ -593,7 +573,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free)
/* If an error occurred we just want to clean up the bio and move on. */
if (bio->bi_status) {
- btrfs_orig_bbio_end_io(async->bbio);
+ btrfs_bio_end_io(async->bbio, bio->bi_status);
return;
}
@@ -603,14 +583,14 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free)
* context. This changes nothing when cgroups aren't in use.
*/
bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
- __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
+ btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}
static bool should_async_write(struct btrfs_bio *bbio)
{
bool auto_csum_mode = true;
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices;
enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode);
@@ -664,11 +644,29 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
return true;
}
+static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length)
+{
+ unsigned int nr_segs;
+ int sector_offset;
+
+ map_length = min(map_length, bbio->fs_info->max_zone_append_size);
+ sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits,
+ &nr_segs, map_length);
+ if (sector_offset) {
+ /*
+ * bio_split_rw_at() could split at a size smaller than our
+ * sectorsize and thus cause unaligned I/Os. Fix that by
+ * always rounding down to the nearest boundary.
+ */
+ return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize);
+ }
+ return map_length;
+}
+
static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = bbio->fs_info;
- struct btrfs_bio *orig_bbio = bbio;
struct bio *bio = &bbio->bio;
u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
u64 length = bio->bi_iter.bi_size;
@@ -676,25 +674,37 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
bool use_append = btrfs_use_zone_append(bbio);
struct btrfs_io_context *bioc = NULL;
struct btrfs_io_stripe smap;
- blk_status_t ret;
- int error;
+ blk_status_t status;
+ int ret;
- smap.is_scrub = !bbio->inode;
+ if (!bbio->inode || btrfs_is_data_reloc_root(inode->root))
+ smap.rst_search_commit_root = true;
+ else
+ smap.rst_search_commit_root = false;
btrfs_bio_counter_inc_blocked(fs_info);
- error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
- &bioc, &smap, &mirror_num);
- if (error) {
- ret = errno_to_blk_status(error);
- goto fail;
+ ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
+ &bioc, &smap, &mirror_num);
+ if (ret) {
+ status = errno_to_blk_status(ret);
+ btrfs_bio_counter_dec(fs_info);
+ goto end_bbio;
}
map_length = min(map_length, length);
if (use_append)
- map_length = min(map_length, fs_info->max_zone_append_size);
+ map_length = btrfs_append_map_length(bbio, map_length);
if (map_length < length) {
- bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append);
+ struct btrfs_bio *split;
+
+ split = btrfs_split_bio(fs_info, bbio, map_length);
+ if (IS_ERR(split)) {
+ status = errno_to_blk_status(PTR_ERR(split));
+ btrfs_bio_counter_dec(fs_info);
+ goto end_bbio;
+ }
+ bbio = split;
bio = &bbio->bio;
}
@@ -705,8 +715,9 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
bbio->saved_iter = bio->bi_iter;
ret = btrfs_lookup_bio_sums(bbio);
- if (ret)
- goto fail_put_bio;
+ status = errno_to_blk_status(ret);
+ if (status)
+ goto fail;
}
if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
@@ -715,8 +726,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
bio->bi_opf |= REQ_OP_ZONE_APPEND;
}
- if (is_data_bbio(bbio) && bioc &&
- btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
+ if (is_data_bbio(bbio) && bioc && bioc->use_rst) {
/*
* No locking for the list update, as we only add to
* the list in the I/O submission path, and list
@@ -732,37 +742,51 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
* point, so they are handled as part of the no-checksum case.
*/
if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
- !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
+ !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) &&
!btrfs_is_data_reloc_root(inode->root)) {
if (should_async_write(bbio) &&
btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
goto done;
ret = btrfs_bio_csum(bbio);
- if (ret)
- goto fail_put_bio;
- } else if (use_append) {
+ status = errno_to_blk_status(ret);
+ if (status)
+ goto fail;
+ } else if (use_append ||
+ (btrfs_is_zoned(fs_info) && inode &&
+ inode->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_alloc_dummy_sum(bbio);
- if (ret)
- goto fail_put_bio;
+ status = errno_to_blk_status(ret);
+ if (status)
+ goto fail;
}
}
- __btrfs_submit_bio(bio, bioc, &smap, mirror_num);
+ btrfs_submit_bio(bio, bioc, &smap, mirror_num);
done:
return map_length == length;
-fail_put_bio:
- if (map_length < length)
- btrfs_cleanup_bio(bbio);
fail:
btrfs_bio_counter_dec(fs_info);
- btrfs_bio_end_io(orig_bbio, ret);
+ /*
+ * We have split the original bbio, now we have to end both the current
+ * @bbio and remaining one, as the remaining one will never be submitted.
+ */
+ if (map_length < length) {
+ struct btrfs_bio *remaining = bbio->private;
+
+ ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset);
+ ASSERT(remaining);
+
+ btrfs_bio_end_io(remaining, status);
+ }
+end_bbio:
+ btrfs_bio_end_io(bbio, status);
/* Do not submit another chunk */
return true;
}
-void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
+void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
{
/* If bbio->inode is not populated, its file_offset must be 0. */
ASSERT(bbio->inode || bbio->file_offset == 0);
@@ -774,7 +798,7 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
/*
* Submit a repair write.
*
- * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a
+ * This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a
* RAID setup. Here we only want to write the one bad copy, so we do the
* mapping ourselves and submit the bio directly.
*
@@ -782,8 +806,7 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
* freeing the bio.
*/
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct folio *folio,
- unsigned int folio_offset, int mirror_num)
+ u64 length, u64 logical, phys_addr_t paddr, int mirror_num)
{
struct btrfs_io_stripe smap = { 0 };
struct bio_vec bvec;
@@ -814,8 +837,7 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
- ret = bio_add_folio(&bio, folio, length, folio_offset);
- ASSERT(ret);
+ __bio_add_page(&bio, phys_to_page(paddr), length, offset_in_page(paddr));
ret = submit_bio_wait(&bio);
if (ret) {
/* try to remap that extent elsewhere? */
@@ -863,7 +885,7 @@ void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_
ASSERT(smap.dev == fs_info->dev_replace.srcdev);
smap.dev = fs_info->dev_replace.tgtdev;
}
- __btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
+ btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
return;
fail:
@@ -879,22 +901,18 @@ int __init btrfs_bioset_init(void)
return -ENOMEM;
if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE,
offsetof(struct btrfs_bio, bio), 0))
- goto out_free_bioset;
+ goto out;
if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE,
offsetof(struct btrfs_bio, bio),
BIOSET_NEED_BVECS))
- goto out_free_clone_bioset;
+ goto out;
if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
sizeof(struct btrfs_failed_bio)))
- goto out_free_repair_bioset;
+ goto out;
return 0;
-out_free_repair_bioset:
- bioset_exit(&btrfs_repair_bioset);
-out_free_clone_bioset:
- bioset_exit(&btrfs_clone_bioset);
-out_free_bioset:
- bioset_exit(&btrfs_bioset);
+out:
+ btrfs_bioset_exit();
return -ENOMEM;
}