diff options
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r-- | fs/btrfs/disk-io.c | 529 |
1 files changed, 265 insertions, 264 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c843563914ca..f09db62e61a1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -17,7 +17,7 @@ #include <linux/error-injection.h> #include <linux/crc32c.h> #include <linux/sched/mm.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/hash.h> #include "ctree.h" #include "disk-io.h" @@ -29,7 +29,6 @@ #include "tree-log.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "rcu-string.h" #include "dev-replace.h" #include "raid56.h" #include "sysfs.h" @@ -193,7 +192,7 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, struct folio *folio = eb->folios[i]; u64 start = max_t(u64, eb->start, folio_pos(folio)); u64 end = min_t(u64, eb->start + eb->len, - folio_pos(folio) + folio_size(folio)); + folio_pos(folio) + eb->folio_size); u32 len = end - start; ret = btrfs_repair_io_failure(fs_info, 0, start, len, @@ -214,7 +213,7 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, * structure for details. */ int btrfs_read_extent_buffer(struct extent_buffer *eb, - struct btrfs_tree_parent_check *check) + const struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; int failed = 0; @@ -227,7 +226,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, while (1) { clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); - ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check); + ret = read_extent_buffer_pages(eb, mirror_num, check); if (!ret) break; @@ -359,7 +358,7 @@ static bool check_tree_block_fsid(struct extent_buffer *eb) /* Do basic extent buffer checks at read time */ int btrfs_validate_extent_buffer(struct extent_buffer *eb, - struct btrfs_tree_parent_check *check) + const struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start; @@ -368,6 +367,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, u8 result[BTRFS_CSUM_SIZE]; const u8 *header_csum; int ret = 0; + const bool ignore_csum = btrfs_test_opt(fs_info, IGNOREMETACSUMS); ASSERT(check); @@ -400,13 +400,16 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, if (memcmp(result, header_csum, csum_size) != 0) { btrfs_warn_rl(fs_info, -"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d", +"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d%s", eb->start, eb->read_mirror, CSUM_FMT_VALUE(csum_size, header_csum), CSUM_FMT_VALUE(csum_size, result), - btrfs_header_level(eb)); - ret = -EUCLEAN; - goto out; + btrfs_header_level(eb), + ignore_csum ? ", ignored" : ""); + if (!ignore_csum) { + ret = -EUCLEAN; + goto out; + } } if (found_level != check->level) { @@ -426,7 +429,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, goto out; } if (check->has_first_key) { - struct btrfs_key *expect_key = &check->first_key; + const struct btrfs_key *expect_key = &check->first_key; struct btrfs_key found_key; if (found_level) @@ -498,15 +501,15 @@ static int btree_migrate_folio(struct address_space *mapping, static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct btrfs_fs_info *fs_info; int ret; if (wbc->sync_mode == WB_SYNC_NONE) { + struct btrfs_fs_info *fs_info; if (wbc->for_kupdate) return 0; - fs_info = BTRFS_I(mapping->host)->root->fs_info; + fs_info = inode_to_fs_info(mapping->host); /* this is a bit racy, but that's ok */ ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes, BTRFS_DIRTY_METADATA_THRESH, @@ -522,18 +525,19 @@ static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags) if (folio_test_writeback(folio) || folio_test_dirty(folio)) return false; - return try_release_extent_buffer(&folio->page); + return try_release_extent_buffer(folio); } static void btree_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct extent_io_tree *tree; - tree = &BTRFS_I(folio->mapping->host)->io_tree; + + tree = &folio_to_inode(folio)->io_tree; extent_invalidate_folio(tree, folio, offset); btree_release_folio(folio, GFP_NOFS); if (folio_get_private(folio)) { - btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info, + btrfs_warn(folio_to_fs_info(folio), "folio private not zero on folio %llu", (unsigned long long)folio_pos(folio)); folio_detach_private(folio); @@ -544,7 +548,7 @@ static void btree_invalidate_folio(struct folio *folio, size_t offset, static bool btree_dirty_folio(struct address_space *mapping, struct folio *folio) { - struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); + struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); struct btrfs_subpage_info *spi = fs_info->subpage_info; struct btrfs_subpage *subpage; struct extent_buffer *eb; @@ -635,10 +639,6 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, free_extent_buffer_stale(buf); return ERR_PTR(ret); } - if (btrfs_check_eb_owner(buf, check->owner_root)) { - free_extent_buffer_stale(buf); - return ERR_PTR(-EUCLEAN); - } return buf; } @@ -646,7 +646,7 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { - bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + bool dummy = btrfs_is_testing(fs_info); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); @@ -658,13 +658,12 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->state = 0; RB_CLEAR_NODE(&root->rb_node); - root->last_trans = 0; + btrfs_set_root_last_trans(root, 0); root->free_objectid = 0; root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; - root->inode_tree = RB_ROOT; - /* GFP flags are compatible with XA_FLAGS_*. */ - xa_init_flags(&root->delayed_nodes, GFP_ATOMIC); + xa_init(&root->inodes); + xa_init(&root->delayed_nodes); btrfs_init_root_block_rsv(root); @@ -675,7 +674,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&root->ordered_extents); INIT_LIST_HEAD(&root->ordered_root); INIT_LIST_HEAD(&root->reloc_dirty_list); - spin_lock_init(&root->inode_lock); spin_lock_init(&root->delalloc_lock); spin_lock_init(&root->ordered_extent_lock); spin_lock_init(&root->accounting_lock); @@ -776,7 +774,7 @@ int btrfs_global_root_insert(struct btrfs_root *root) if (tmp) { ret = -EEXIST; btrfs_warn(fs_info, "global root %llu %llu already exists", - root->root_key.objectid, root->root_key.offset); + btrfs_root_id(root), root->root_key.offset); } return ret; } @@ -848,13 +846,6 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) return btrfs_global_root(fs_info, &key); } -struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) -{ - if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) - return fs_info->block_group_root; - return btrfs_extent_root(fs_info, 0); -} - struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid) { @@ -926,8 +917,7 @@ fail: return ERR_PTR(ret); } -static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static struct btrfs_root *alloc_log_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; @@ -975,7 +965,7 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, { struct btrfs_root *log_root; - log_root = alloc_log_tree(trans, fs_info); + log_root = alloc_log_tree(fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); @@ -1001,7 +991,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item; int ret; - log_root = alloc_log_tree(trans, fs_info); + log_root = alloc_log_tree(fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); @@ -1011,8 +1001,8 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, return ret; } - log_root->last_trans = trans->transid; - log_root->root_key.offset = root->root_key.objectid; + btrfs_set_root_last_trans(log_root, trans->transid); + log_root->root_key.offset = btrfs_root_id(root); inode_item = &log_root->root_item.inode; btrfs_set_stack_inode_generation(inode_item, 1); @@ -1034,7 +1024,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, struct btrfs_path *path, - struct btrfs_key *key) + const struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_tree_parent_check check = { 0 }; @@ -1076,15 +1066,15 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, * For real fs, and not log/reloc trees, root owner must * match its root node owner */ - if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && - root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - root->root_key.objectid != btrfs_header_owner(root->node)) { + if (!btrfs_is_testing(fs_info) && + btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && + btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && + btrfs_root_id(root) != btrfs_header_owner(root->node)) { btrfs_crit(fs_info, "root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu", - root->root_key.objectid, root->node->start, + btrfs_root_id(root), root->node->start, btrfs_header_owner(root->node), - root->root_key.objectid); + btrfs_root_id(root)); ret = -EUCLEAN; goto fail; } @@ -1096,7 +1086,7 @@ fail: } struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, - struct btrfs_key *key) + const struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_path *path; @@ -1121,9 +1111,9 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) btrfs_drew_lock_init(&root->snapshot_lock); - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && + if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && !btrfs_is_data_reloc_root(root) && - is_fstree(root->root_key.objectid)) { + is_fstree(btrfs_root_id(root))) { set_bit(BTRFS_ROOT_SHAREABLE, &root->state); btrfs_check_and_init_root_item(&root->root_item); } @@ -1132,7 +1122,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) * Don't assign anonymous block device to roots that are not exposed to * userspace, the id pool is limited to 1M */ - if (is_fstree(root->root_key.objectid) && + if (is_fstree(btrfs_root_id(root)) && btrfs_root_refs(&root->root_item) > 0) { if (!anon_dev) { ret = get_anon_bdev(&root->anon_dev); @@ -1219,7 +1209,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->fs_roots_radix_lock); ret = radix_tree_insert(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, + (unsigned long)btrfs_root_id(root), root); if (ret == 0) { btrfs_grab_root(root); @@ -1231,7 +1221,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, return ret; } -void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) +void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info) { #ifdef CONFIG_BTRFS_DEBUG struct btrfs_root *root; @@ -1244,6 +1234,7 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "leaked root %s refcount %d", btrfs_root_name(&root->root_key, buf), refcount_read(&root->refs)); + WARN_ON_ONCE(1); while (refcount_read(&root->refs) > 1) btrfs_put_root(root); btrfs_put_root(root); @@ -1265,9 +1256,15 @@ static void free_global_roots(struct btrfs_fs_info *fs_info) void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { + struct percpu_counter *em_counter = &fs_info->evictable_extent_maps; + + percpu_counter_destroy(&fs_info->stats_read_blocks); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); percpu_counter_destroy(&fs_info->ordered_bytes); + if (percpu_counter_initialized(em_counter)) + ASSERT(percpu_counter_sum_positive(em_counter) == 0); + percpu_counter_destroy(em_counter); percpu_counter_destroy(&fs_info->dev_replace.bio_counter); btrfs_free_csum_hash(fs_info); btrfs_free_stripe_hash_table(fs_info); @@ -1288,7 +1285,6 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); - kfree(fs_info->subpage_info); kvfree(fs_info); } @@ -1849,7 +1845,8 @@ void btrfs_put_root(struct btrfs_root *root) return; if (refcount_dec_and_test(&root->refs)) { - WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + if (WARN_ON(!xa_empty(&root->inodes))) + xa_destroy(&root->inodes); WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); if (root->anon_dev) free_anon_bdev(root->anon_dev); @@ -1923,7 +1920,7 @@ static int btrfs_init_btree_inode(struct super_block *sb) if (!inode) return -ENOMEM; - inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; + btrfs_set_inode_number(BTRFS_I(inode), BTRFS_BTREE_INODE_OBJECTID); set_nlink(inode, 1); /* * we set the i_size on the btree inode to the max possible int. @@ -1934,15 +1931,11 @@ static int btrfs_init_btree_inode(struct super_block *sb) inode->i_mapping->a_ops = &btree_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, IO_TREE_BTREE_INODE_IO); extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); - BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID; - BTRFS_I(inode)->location.type = 0; - BTRFS_I(inode)->location.offset = 0; set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); __insert_inode_hash(inode, hash); fs_info->btree_inode = inode; @@ -1966,7 +1959,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) fs_info->qgroup_seq = 1; fs_info->qgroup_ulist = NULL; fs_info->qgroup_rescan_running = false; - fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; + fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; mutex_init(&fs_info->qgroup_rescan_lock); } @@ -2141,7 +2134,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, /* If we have IGNOREDATACSUMS skip loading these roots. */ if (objectid == BTRFS_CSUM_TREE_OBJECTID && btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { - set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); + set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state); return 0; } @@ -2194,7 +2187,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, if (!found || ret) { if (objectid == BTRFS_CSUM_TREE_OBJECTID) - set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); + set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state); if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) ret = ret ? ret : -ENOENT; @@ -2239,7 +2232,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) struct btrfs_key location; int ret; - BUG_ON(!fs_info->tree_root); + ASSERT(fs_info->tree_root); ret = load_global_roots(tree_root); if (ret) @@ -2335,6 +2328,71 @@ out: return ret; } +static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *sb) +{ + unsigned int cur = 0; /* Offset inside the sys chunk array */ + /* + * At sb read time, fs_info is not fully initialized. Thus we have + * to use super block sectorsize, which should have been validated. + */ + const u32 sectorsize = btrfs_super_sectorsize(sb); + u32 sys_array_size = btrfs_super_sys_array_size(sb); + + if (sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + btrfs_err(fs_info, "system chunk array too big %u > %u", + sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); + return -EUCLEAN; + } + + while (cur < sys_array_size) { + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + struct btrfs_key key; + u64 type; + u16 num_stripes; + u32 len; + int ret; + + disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur); + len = sizeof(*disk_key); + + if (cur + len > sys_array_size) + goto short_read; + cur += len; + + btrfs_disk_key_to_cpu(&key, disk_key); + if (key.type != BTRFS_CHUNK_ITEM_KEY) { + btrfs_err(fs_info, + "unexpected item type %u in sys_array at offset %u", + key.type, cur); + return -EUCLEAN; + } + chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + if (cur + btrfs_chunk_item_size(num_stripes) > sys_array_size) + goto short_read; + type = btrfs_stack_chunk_type(chunk); + if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) { + btrfs_err(fs_info, + "invalid chunk type %llu in sys_array at offset %u", + type, cur); + return -EUCLEAN; + } + ret = btrfs_check_chunk_valid(fs_info, NULL, chunk, key.offset, + sectorsize); + if (ret < 0) + return ret; + cur += btrfs_chunk_item_size(num_stripes); + } + return 0; +short_read: + btrfs_err(fs_info, + "super block sys chunk array short read, cur=%u sys_array_size=%u", + cur, sys_array_size); + return -EUCLEAN; +} + /* * Real super block validation * NOTE: super csum type and incompat features will not be checked here. @@ -2345,21 +2403,29 @@ out: * 1, 2 2nd and 3rd backup copy * -1 skip bytenr check */ -int btrfs_validate_super(struct btrfs_fs_info *fs_info, - struct btrfs_super_block *sb, int mirror_num) +int btrfs_validate_super(const struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *sb, int mirror_num) { u64 nodesize = btrfs_super_nodesize(sb); u64 sectorsize = btrfs_super_sectorsize(sb); int ret = 0; + const bool ignore_flags = btrfs_test_opt(fs_info, IGNORESUPERFLAGS); if (btrfs_super_magic(sb) != BTRFS_MAGIC) { btrfs_err(fs_info, "no valid FS found"); ret = -EINVAL; } - if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) { - btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu", - btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); - ret = -EINVAL; + if ((btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)) { + if (!ignore_flags) { + btrfs_err(fs_info, + "unrecognized or unsupported super flag 0x%llx", + btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); + ret = -EINVAL; + } else { + btrfs_info(fs_info, + "unrecognized or unsupported super flags: 0x%llx, ignored", + btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); + } } if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "tree_root level too big: %d >= %d", @@ -2462,7 +2528,7 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || !btrfs_fs_incompat(fs_info, NO_HOLES))) { btrfs_err(fs_info, - "block-group-tree feature requires fres-space-tree and no-holes"); + "block-group-tree feature requires free-space-tree and no-holes"); ret = -EINVAL; } @@ -2495,6 +2561,8 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } + ret = validate_sys_chunk_array(fs_info, sb); + /* * Obvious sys_chunk_array corruptions, it must hold at least one key * and one chunk @@ -2583,7 +2651,7 @@ static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int lev struct btrfs_tree_parent_check check = { .level = level, .transid = gen, - .owner_root = root->root_key.objectid + .owner_root = btrfs_root_id(root) }; int ret = 0; @@ -2785,6 +2853,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) btrfs_init_scrub(fs_info); btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(fs_info); + btrfs_init_extent_map_shrinker_work(fs_info); rwlock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT_CACHED; @@ -2839,6 +2908,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block int ret; fs_info->sb = sb; + /* Temporary fixed values for block size until we read the superblock. */ sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); @@ -2846,10 +2916,18 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (ret) return ret; + ret = percpu_counter_init(&fs_info->evictable_extent_maps, 0, GFP_KERNEL); + if (ret) + return ret; + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) return ret; + ret = percpu_counter_init(&fs_info->stats_read_blocks, 0, GFP_KERNEL); + if (ret) + return ret; + fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); @@ -2870,6 +2948,8 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (sb_rdonly(sb)) set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state); + if (btrfs_test_opt(fs_info, IGNOREMETACSUMS)) + set_bit(BTRFS_FS_STATE_SKIP_META_CSUMS, &fs_info->fs_state); return btrfs_alloc_stripe_hash_table(fs_info); } @@ -2915,22 +2995,22 @@ static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { u64 root_objectid = 0; struct btrfs_root *gang[8]; - int i = 0; - int err = 0; - unsigned int ret = 0; + int ret = 0; while (1) { + unsigned int found; + spin_lock(&fs_info->fs_roots_radix_lock); - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + found = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, root_objectid, ARRAY_SIZE(gang)); - if (!ret) { + if (!found) { spin_unlock(&fs_info->fs_roots_radix_lock); break; } - root_objectid = gang[ret - 1]->root_key.objectid + 1; + root_objectid = btrfs_root_id(gang[found - 1]) + 1; - for (i = 0; i < ret; i++) { + for (int i = 0; i < found; i++) { /* Avoid to grab roots in dead_roots. */ if (btrfs_root_refs(&gang[i]->root_item) == 0) { gang[i] = NULL; @@ -2941,24 +3021,25 @@ static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->fs_roots_radix_lock); - for (i = 0; i < ret; i++) { + for (int i = 0; i < found; i++) { if (!gang[i]) continue; - root_objectid = gang[i]->root_key.objectid; - err = btrfs_orphan_cleanup(gang[i]); - if (err) - goto out; + root_objectid = btrfs_root_id(gang[i]); + /* + * Continue to release the remaining roots after the first + * error without cleanup and preserve the first error + * for the return. + */ + if (!ret) + ret = btrfs_orphan_cleanup(gang[i]); btrfs_put_root(gang[i]); } + if (ret) + break; + root_objectid++; } -out: - /* Release the uncleaned roots due to error. */ - for (; i < ret; i++) { - if (gang[i]) - btrfs_put_root(gang[i]); - } - return err; + return ret; } /* @@ -3191,8 +3272,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) return 0; } -int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, - char *options) +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices) { u32 sectorsize; u32 nodesize; @@ -3310,8 +3390,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); + fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; + fs_info->fs_devices->fs_info = fs_info; /* * Handle the space caching options appropriately now that we have the @@ -3334,20 +3416,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); - if (sectorsize < PAGE_SIZE) { - struct btrfs_subpage_info *subpage_info; - + if (sectorsize < PAGE_SIZE) btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); - subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); - if (!subpage_info) { - ret = -ENOMEM; - goto fail_alloc; - } - btrfs_init_subpage_info(subpage_info, sectorsize); - fs_info->subpage_info = subpage_info; - } ret = btrfs_init_workqueues(fs_info); if (ret) @@ -3356,6 +3428,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); + /* Update the values for the current filesystem. */ sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE); @@ -3615,28 +3688,25 @@ ALLOW_ERROR_INJECTION(open_ctree, ERRNO); static void btrfs_end_super_write(struct bio *bio) { struct btrfs_device *device = bio->bi_private; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - struct page *page; - - bio_for_each_segment_all(bvec, bio, iter_all) { - page = bvec->bv_page; + struct folio_iter fi; + bio_for_each_folio_all(fi, bio) { if (bio->bi_status) { btrfs_warn_rl_in_rcu(device->fs_info, - "lost page write due to IO error on %s (%d)", + "lost super block write due to IO error on %s (%d)", btrfs_dev_name(device), blk_status_to_errno(bio->bi_status)); - ClearPageUptodate(page); - SetPageError(page); btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS); - } else { - SetPageUptodate(page); + /* Ensure failure if the primary sb fails. */ + if (bio->bi_opf & REQ_FUA) + atomic_add(BTRFS_SUPER_PRIMARY_WRITE_ERROR, + &device->sb_write_errors); + else + atomic_inc(&device->sb_write_errors); } - - put_page(page); - unlock_page(page); + folio_unlock(fi.folio); + folio_put(fi.folio); } bio_put(bio); @@ -3648,7 +3718,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, struct btrfs_super_block *super; struct page *page; u64 bytenr, bytenr_orig; - struct address_space *mapping = bdev->bd_inode->i_mapping; + struct address_space *mapping = bdev->bd_mapping; int ret; bytenr_orig = btrfs_sb_offset(copy_num); @@ -3723,34 +3793,36 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) /* * Write superblock @sb to the @device. Do not wait for completion, all the - * pages we use for writing are locked. + * folios we use for writing are locked. * * Write @max_mirrors copies of the superblock, where 0 means default that fit * the expected device size at commit time. Note that max_mirrors must be * same for write and wait phases. * - * Return number of errors when page is not found or submission fails. + * Return number of errors when folio is not found or submission fails. */ static int write_dev_supers(struct btrfs_device *device, struct btrfs_super_block *sb, int max_mirrors) { struct btrfs_fs_info *fs_info = device->fs_info; - struct address_space *mapping = device->bdev->bd_inode->i_mapping; + struct address_space *mapping = device->bdev->bd_mapping; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); int i; - int errors = 0; int ret; u64 bytenr, bytenr_orig; + atomic_set(&device->sb_write_errors, 0); + if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; shash->tfm = fs_info->csum_shash; for (i = 0; i < max_mirrors; i++) { - struct page *page; + struct folio *folio; struct bio *bio; struct btrfs_super_block *disk_super; + size_t offset; bytenr_orig = btrfs_sb_offset(i); ret = btrfs_sb_log_location(device, i, WRITE, &bytenr); @@ -3760,7 +3832,7 @@ static int write_dev_supers(struct btrfs_device *device, btrfs_err(device->fs_info, "couldn't get super block location for mirror %d", i); - errors++; + atomic_inc(&device->sb_write_errors); continue; } if (bytenr + BTRFS_SUPER_INFO_SIZE >= @@ -3773,20 +3845,20 @@ static int write_dev_supers(struct btrfs_device *device, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, sb->csum); - page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, - GFP_NOFS); - if (!page) { + folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + GFP_NOFS); + if (IS_ERR(folio)) { btrfs_err(device->fs_info, "couldn't get super block page for bytenr %llu", bytenr); - errors++; + atomic_inc(&device->sb_write_errors); continue; } + ASSERT(folio_order(folio) == 0); - /* Bump the refcount for wait_dev_supers() */ - get_page(page); - - disk_super = page_address(page); + offset = offset_in_folio(folio, bytenr); + disk_super = folio_address(folio) + offset; memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE); /* @@ -3800,8 +3872,7 @@ static int write_dev_supers(struct btrfs_device *device, bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; bio->bi_private = device; bio->bi_end_io = btrfs_end_super_write; - __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE, - offset_in_page(bytenr)); + bio_add_folio_nofail(bio, folio, BTRFS_SUPER_INFO_SIZE, offset); /* * We FUA only the first super block. The others we allow to @@ -3813,17 +3884,17 @@ static int write_dev_supers(struct btrfs_device *device, submit_bio(bio); if (btrfs_advance_sb_log(device, i)) - errors++; + atomic_inc(&device->sb_write_errors); } - return errors < i ? 0 : -1; + return atomic_read(&device->sb_write_errors) < i ? 0 : -1; } /* * Wait for write completion of superblocks done by write_dev_supers, * @max_mirrors same for write and wait phases. * - * Return number of errors when page is not found or not marked up to - * date. + * Return -1 if primary super block write failed or when there were no super block + * copies written. Otherwise 0. */ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) { @@ -3837,7 +3908,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) max_mirrors = BTRFS_SUPER_MIRROR_MAX; for (i = 0; i < max_mirrors; i++) { - struct page *page; + struct folio *folio; ret = btrfs_sb_log_location(device, i, READ, &bytenr); if (ret == -ENOENT) { @@ -3852,30 +3923,21 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) device->commit_total_bytes) break; - page = find_get_page(device->bdev->bd_inode->i_mapping, - bytenr >> PAGE_SHIFT); - if (!page) { - errors++; - if (i == 0) - primary_failed = true; + folio = filemap_get_folio(device->bdev->bd_mapping, + bytenr >> PAGE_SHIFT); + /* If the folio has been removed, then we know it completed. */ + if (IS_ERR(folio)) continue; - } - /* Page is submitted locked and unlocked once the IO completes */ - wait_on_page_locked(page); - if (PageError(page)) { - errors++; - if (i == 0) - primary_failed = true; - } - - /* Drop our reference */ - put_page(page); + ASSERT(folio_order(folio) == 0); - /* Drop the reference from the writing run */ - put_page(page); + /* Folio will be unlocked once the write completes. */ + folio_wait_locked(folio); + folio_put(folio); } - /* log error, force error return */ + errors += atomic_read(&device->sb_write_errors); + if (errors >= BTRFS_SUPER_PRIMARY_WRITE_ERROR) + primary_failed = true; if (primary_failed) { btrfs_err(device->fs_info, "error writing primary super block to device %llu", device->devid); @@ -4136,7 +4198,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid); + (unsigned long)btrfs_root_id(root)); if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) drop_ref = true; spin_unlock(&fs_info->fs_roots_radix_lock); @@ -4155,9 +4217,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_commit_super(struct btrfs_fs_info *fs_info) { - struct btrfs_root *root = fs_info->tree_root; - struct btrfs_trans_handle *trans; - mutex_lock(&fs_info->cleaner_mutex); btrfs_run_delayed_iputs(fs_info); mutex_unlock(&fs_info->cleaner_mutex); @@ -4167,10 +4226,7 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info) down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - return btrfs_commit_transaction(trans); + return btrfs_commit_current_transaction(fs_info->tree_root); } static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) @@ -4179,9 +4235,6 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) struct btrfs_transaction *tmp; bool found = false; - if (list_empty(&fs_info->trans_list)) - return; - /* * This function is only called at the very end of close_ctree(), * thus no other running transaction, no need to take trans_lock. @@ -4203,7 +4256,7 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) btrfs_warn(fs_info, "transaction %llu (with %llu dirty metadata bytes) is not committed", trans->transid, dirty_bytes); - btrfs_cleanup_one_transaction(trans, fs_info); + btrfs_cleanup_one_transaction(trans); if (trans == fs_info->running_transaction) fs_info->running_transaction = NULL; @@ -4273,6 +4326,26 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_cleanup_defrag_inodes(fs_info); /* + * Wait for any fixup workers to complete. + * If we don't wait for them here and they are still running by the time + * we call kthread_stop() against the cleaner kthread further below, we + * get an use-after-free on the cleaner because the fixup worker adds an + * inode to the list of delayed iputs and then attempts to wakeup the + * cleaner kthread, which was already stopped and destroyed. We parked + * already the cleaner, but below we run all pending delayed iputs. + */ + btrfs_flush_workqueue(fs_info->fixup_workers); + /* + * Similar case here, we have to wait for delalloc workers before we + * proceed below and stop the cleaner kthread, otherwise we trigger a + * use-after-tree on the cleaner kthread task_struct when a delalloc + * worker running submit_compressed_extents() adds a delayed iput, which + * does a wake up on the cleaner kthread, which was already freed below + * when we call kthread_stop(). + */ + btrfs_flush_workqueue(fs_info->delalloc_workers); + + /* * After we parked the cleaner kthread, ordered extents may have * completed and created new delayed iputs. If one of the async reclaim * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we @@ -4300,6 +4373,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); + cancel_work_sync(&fs_info->em_shrinker_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4481,7 +4555,7 @@ static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info) for (i = 0; i < ret; i++) { if (!gang[i]) continue; - root_objectid = gang[i]->root_key.objectid; + root_objectid = btrfs_root_id(gang[i]); btrfs_free_log(NULL, gang[i]); btrfs_put_root(gang[i]); } @@ -4534,84 +4608,7 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) * extents that haven't had their dirty pages IO start writeout yet * actually get run and error out properly. */ - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); -} - -static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info) -{ - struct rb_node *node; - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - - delayed_refs = &trans->delayed_refs; - - spin_lock(&delayed_refs->lock); - if (atomic_read(&delayed_refs->num_entries) == 0) { - spin_unlock(&delayed_refs->lock); - btrfs_debug(fs_info, "delayed_refs has NO entry"); - return; - } - - while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { - struct btrfs_delayed_ref_head *head; - struct rb_node *n; - bool pin_bytes = false; - - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); - if (btrfs_delayed_ref_lock(delayed_refs, head)) - continue; - - spin_lock(&head->lock); - while ((n = rb_first_cached(&head->ref_tree)) != NULL) { - ref = rb_entry(n, struct btrfs_delayed_ref_node, - ref_node); - rb_erase_cached(&ref->ref_node, &head->ref_tree); - RB_CLEAR_NODE(&ref->ref_node); - if (!list_empty(&ref->add_list)) - list_del(&ref->add_list); - atomic_dec(&delayed_refs->num_entries); - btrfs_put_delayed_ref(ref); - btrfs_delayed_refs_rsv_release(fs_info, 1, 0); - } - if (head->must_insert_reserved) - pin_bytes = true; - btrfs_free_delayed_extent_op(head->extent_op); - btrfs_delete_ref_head(delayed_refs, head); - spin_unlock(&head->lock); - spin_unlock(&delayed_refs->lock); - mutex_unlock(&head->mutex); - - if (pin_bytes) { - struct btrfs_block_group *cache; - - cache = btrfs_lookup_block_group(fs_info, head->bytenr); - BUG_ON(!cache); - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->pinned += head->num_bytes; - btrfs_space_info_update_bytes_pinned(fs_info, - cache->space_info, head->num_bytes); - cache->reserved -= head->num_bytes; - cache->space_info->bytes_reserved -= head->num_bytes; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - btrfs_put_block_group(cache); - - btrfs_error_unpin_extent_range(fs_info, head->bytenr, - head->bytenr + head->num_bytes - 1); - } - btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); - btrfs_put_delayed_ref_head(head); - cond_resched(); - spin_lock(&delayed_refs->lock); - } - btrfs_qgroup_destroy_extent_records(trans); - - spin_unlock(&delayed_refs->lock); + btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); } static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) @@ -4626,7 +4623,7 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) struct inode *inode = NULL; btrfs_inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes); - __btrfs_del_delalloc_inode(root, btrfs_inode); + btrfs_del_delalloc_inode(btrfs_inode); spin_unlock(&root->delalloc_lock); /* @@ -4812,16 +4809,16 @@ static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info) btrfs_qgroup_free_meta_all_pertrans(root); radix_tree_tag_clear(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, + (unsigned long)btrfs_root_id(root), BTRFS_ROOT_TRANS_TAG); } } spin_unlock(&fs_info->fs_roots_radix_lock); } -void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, - struct btrfs_fs_info *fs_info) +void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans) { + struct btrfs_fs_info *fs_info = cur_trans->fs_info; struct btrfs_device *dev, *tmp; btrfs_cleanup_dirty_bgs(cur_trans, fs_info); @@ -4833,7 +4830,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, list_del_init(&dev->post_commit_list); } - btrfs_destroy_delayed_refs(cur_trans, fs_info); + btrfs_destroy_delayed_refs(cur_trans); cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&fs_info->transaction_blocked_wait); @@ -4841,14 +4838,10 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&fs_info->transaction_wait); - btrfs_destroy_delayed_inodes(fs_info); - btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages, EXTENT_DIRTY); btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); - btrfs_free_all_qgroup_pertrans(fs_info); - cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); } @@ -4883,7 +4876,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) } else { spin_unlock(&fs_info->trans_lock); } - btrfs_cleanup_one_transaction(t, fs_info); + btrfs_cleanup_one_transaction(t); spin_lock(&fs_info->trans_lock); if (t == fs_info->running_transaction) @@ -4901,6 +4894,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) btrfs_assert_delayed_root_empty(fs_info); btrfs_destroy_all_delalloc_inodes(fs_info); btrfs_drop_all_logs(fs_info); + btrfs_free_all_qgroup_pertrans(fs_info); mutex_unlock(&fs_info->transaction_kthread_mutex); return 0; @@ -4925,7 +4919,14 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) goto error; - BUG_ON(ret == 0); /* Corruption */ + if (ret == 0) { + /* + * Key with offset -1 found, there would have to exist a root + * with such id, but this is out of valid range. + */ + ret = -EUCLEAN; + goto error; + } if (path->slots[0] > 0) { slot = path->slots[0] - 1; l = path->nodes[0]; @@ -4949,7 +4950,7 @@ int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { btrfs_warn(root->fs_info, "the objectid of root %llu reaches its highest value", - root->root_key.objectid); + btrfs_root_id(root)); ret = -ENOSPC; goto out; } |