diff options
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r-- | fs/btrfs/disk-io.c | 805 |
1 files changed, 371 insertions, 434 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3df5477d48a8..1beb9458f622 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -17,7 +17,7 @@ #include <linux/error-injection.h> #include <linux/crc32c.h> #include <linux/sched/mm.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/hash.h> #include "ctree.h" #include "disk-io.h" @@ -182,22 +182,22 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); int ret = 0; if (sb_rdonly(fs_info->sb)) return -EROFS; - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; u64 start = max_t(u64, eb->start, folio_pos(folio)); u64 end = min_t(u64, eb->start + eb->len, folio_pos(folio) + eb->folio_size); u32 len = end - start; + phys_addr_t paddr = PFN_PHYS(folio_pfn(folio)) + + offset_in_folio(folio, start); - ret = btrfs_repair_io_failure(fs_info, 0, start, len, - start, folio, offset_in_folio(folio, start), - mirror_num); + ret = btrfs_repair_io_failure(fs_info, 0, start, len, start, + paddr, mirror_num); if (ret) break; } @@ -213,7 +213,7 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, * structure for details. */ int btrfs_read_extent_buffer(struct extent_buffer *eb, - struct btrfs_tree_parent_check *check) + const struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; int failed = 0; @@ -225,8 +225,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, ASSERT(check); while (1) { - clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); - ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check); + ret = read_extent_buffer_pages(eb, mirror_num, check); if (!ret) break; @@ -257,7 +256,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, /* * Checksum a dirty tree block before IO. */ -blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) +int btree_csum_one_bio(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; @@ -268,9 +267,9 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) /* Btree blocks are always contiguous on disk. */ if (WARN_ON_ONCE(bbio->file_offset != eb->start)) - return BLK_STS_IOERR; + return -EIO; if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len)) - return BLK_STS_IOERR; + return -EIO; /* * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't @@ -279,14 +278,13 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) */ if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) { memzero_extent_buffer(eb, 0, eb->len); - return BLK_STS_OK; + return 0; } if (WARN_ON_ONCE(found_start != eb->start)) - return BLK_STS_IOERR; - if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0], - eb->start, eb->len))) - return BLK_STS_IOERR; + return -EIO; + if (WARN_ON(!btrfs_meta_folio_test_uptodate(eb->folios[0], eb))) + return -EIO; ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, offsetof(struct btrfs_header, fsid), @@ -314,7 +312,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); - return BLK_STS_OK; + return 0; error: btrfs_print_tree(eb, 0); @@ -328,7 +326,7 @@ error: */ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) || btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID); - return errno_to_blk_status(ret); + return ret; } static bool check_tree_block_fsid(struct extent_buffer *eb) @@ -358,7 +356,7 @@ static bool check_tree_block_fsid(struct extent_buffer *eb) /* Do basic extent buffer checks at read time */ int btrfs_validate_extent_buffer(struct extent_buffer *eb, - struct btrfs_tree_parent_check *check) + const struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start; @@ -367,6 +365,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, u8 result[BTRFS_CSUM_SIZE]; const u8 *header_csum; int ret = 0; + const bool ignore_csum = btrfs_test_opt(fs_info, IGNOREMETACSUMS); ASSERT(check); @@ -399,13 +398,16 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, if (memcmp(result, header_csum, csum_size) != 0) { btrfs_warn_rl(fs_info, -"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d", +"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d%s", eb->start, eb->read_mirror, CSUM_FMT_VALUE(csum_size, header_csum), CSUM_FMT_VALUE(csum_size, result), - btrfs_header_level(eb)); - ret = -EUCLEAN; - goto out; + btrfs_header_level(eb), + ignore_csum ? ", ignored" : ""); + if (!ignore_csum) { + ret = -EUCLEAN; + goto out; + } } if (found_level != check->level) { @@ -425,7 +427,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, goto out; } if (check->has_first_key) { - struct btrfs_key *expect_key = &check->first_key; + const struct btrfs_key *expect_key = &check->first_key; struct btrfs_key found_key; if (found_level) @@ -450,15 +452,9 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, goto out; } - /* - * If this is a leaf block and it is corrupt, set the corrupt bit so - * that we don't try and read the other copies of this block, just - * return -EIO. - */ - if (found_level == 0 && btrfs_check_leaf(eb)) { - set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); + /* If this is a leaf block and it is corrupt, just return -EIO. */ + if (found_level == 0 && btrfs_check_leaf(eb)) ret = -EIO; - } if (found_level > 0 && btrfs_check_node(eb)) ret = -EIO; @@ -521,7 +517,7 @@ static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags) if (folio_test_writeback(folio) || folio_test_dirty(folio)) return false; - return try_release_extent_buffer(&folio->page); + return try_release_extent_buffer(folio); } static void btree_invalidate_folio(struct folio *folio, size_t offset, @@ -635,18 +631,19 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, free_extent_buffer_stale(buf); return ERR_PTR(ret); } - if (btrfs_check_eb_owner(buf, check->owner_root)) { - free_extent_buffer_stale(buf); - return ERR_PTR(-EUCLEAN); - } return buf; } -static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, - u64 objectid) +static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, + u64 objectid, gfp_t flags) { - bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + struct btrfs_root *root; + bool dummy = btrfs_is_testing(fs_info); + + root = kzalloc(sizeof(*root), flags); + if (!root) + return NULL; memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); @@ -658,13 +655,12 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->state = 0; RB_CLEAR_NODE(&root->rb_node); - root->last_trans = 0; + btrfs_set_root_last_trans(root, 0); root->free_objectid = 0; root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; - root->inode_tree = RB_ROOT; - /* GFP flags are compatible with XA_FLAGS_*. */ - xa_init_flags(&root->delayed_nodes, GFP_ATOMIC); + xa_init(&root->inodes); + xa_init(&root->delayed_nodes); btrfs_init_root_block_rsv(root); @@ -675,7 +671,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&root->ordered_extents); INIT_LIST_HEAD(&root->ordered_root); INIT_LIST_HEAD(&root->reloc_dirty_list); - spin_lock_init(&root->inode_lock); spin_lock_init(&root->delalloc_lock); spin_lock_init(&root->ordered_extent_lock); spin_lock_init(&root->accounting_lock); @@ -702,10 +697,10 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, btrfs_set_root_last_log_commit(root, 0); root->anon_dev = 0; if (!dummy) { - extent_io_tree_init(fs_info, &root->dirty_log_pages, - IO_TREE_ROOT_DIRTY_LOG_PAGES); - extent_io_tree_init(fs_info, &root->log_csum_range, - IO_TREE_LOG_CSUM_RANGE); + btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages, + IO_TREE_ROOT_DIRTY_LOG_PAGES); + btrfs_extent_io_tree_init(fs_info, &root->log_csum_range, + IO_TREE_LOG_CSUM_RANGE); } spin_lock_init(&root->root_item_lock); @@ -716,14 +711,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, list_add_tail(&root->leak_list, &fs_info->allocated_roots); spin_unlock(&fs_info->fs_roots_radix_lock); #endif -} -static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, - u64 objectid, gfp_t flags) -{ - struct btrfs_root *root = kzalloc(sizeof(*root), flags); - if (root) - __setup_root(root, fs_info, objectid); return root; } @@ -776,7 +764,7 @@ int btrfs_global_root_insert(struct btrfs_root *root) if (tmp) { ret = -EEXIST; btrfs_warn(fs_info, "global root %llu %llu already exists", - root->root_key.objectid, root->root_key.offset); + btrfs_root_id(root), root->root_key.offset); } return ret; } @@ -848,13 +836,6 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) return btrfs_global_root(fs_info, &key); } -struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) -{ - if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) - return fs_info->block_group_root; - return btrfs_extent_root(fs_info, 0); -} - struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid) { @@ -926,8 +907,7 @@ fail: return ERR_PTR(ret); } -static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static struct btrfs_root *alloc_log_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; @@ -975,7 +955,7 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, { struct btrfs_root *log_root; - log_root = alloc_log_tree(trans, fs_info); + log_root = alloc_log_tree(fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); @@ -1001,7 +981,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item; int ret; - log_root = alloc_log_tree(trans, fs_info); + log_root = alloc_log_tree(fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); @@ -1011,8 +991,8 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, return ret; } - log_root->last_trans = trans->transid; - log_root->root_key.offset = root->root_key.objectid; + btrfs_set_root_last_trans(log_root, trans->transid); + log_root->root_key.offset = btrfs_root_id(root); inode_item = &log_root->root_item.inode; btrfs_set_stack_inode_generation(inode_item, 1); @@ -1034,7 +1014,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, struct btrfs_path *path, - struct btrfs_key *key) + const struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_tree_parent_check check = { 0 }; @@ -1076,15 +1056,15 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, * For real fs, and not log/reloc trees, root owner must * match its root node owner */ - if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && - root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - root->root_key.objectid != btrfs_header_owner(root->node)) { + if (!btrfs_is_testing(fs_info) && + btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && + btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && + btrfs_root_id(root) != btrfs_header_owner(root->node)) { btrfs_crit(fs_info, "root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu", - root->root_key.objectid, root->node->start, + btrfs_root_id(root), root->node->start, btrfs_header_owner(root->node), - root->root_key.objectid); + btrfs_root_id(root)); ret = -EUCLEAN; goto fail; } @@ -1096,24 +1076,25 @@ fail: } struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, - struct btrfs_key *key) + const struct btrfs_key *key) { struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) return ERR_PTR(-ENOMEM); root = read_tree_root_path(tree_root, path, key); - btrfs_free_path(path); return root; } /* - * Initialize subvolume root in-memory structure + * Initialize subvolume root in-memory structure. * * @anon_dev: anonymous device to attach to the root, if zero, allocate new + * + * In case of failure the caller is responsible to call btrfs_free_fs_root() */ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) { @@ -1121,9 +1102,9 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) btrfs_drew_lock_init(&root->snapshot_lock); - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && + if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && !btrfs_is_data_reloc_root(root) && - is_fstree(root->root_key.objectid)) { + is_fstree(btrfs_root_id(root))) { set_bit(BTRFS_ROOT_SHAREABLE, &root->state); btrfs_check_and_init_root_item(&root->root_item); } @@ -1132,12 +1113,12 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) * Don't assign anonymous block device to roots that are not exposed to * userspace, the id pool is limited to 1M */ - if (is_fstree(root->root_key.objectid) && + if (is_fstree(btrfs_root_id(root)) && btrfs_root_refs(&root->root_item) > 0) { if (!anon_dev) { ret = get_anon_bdev(&root->anon_dev); if (ret) - goto fail; + return ret; } else { root->anon_dev = anon_dev; } @@ -1147,7 +1128,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) ret = btrfs_init_root_free_objectid(root); if (ret) { mutex_unlock(&root->objectid_mutex); - goto fail; + return ret; } ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); @@ -1155,9 +1136,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) mutex_unlock(&root->objectid_mutex); return 0; -fail: - /* The caller is responsible to call btrfs_free_fs_root */ - return ret; } static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, @@ -1219,7 +1197,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->fs_roots_radix_lock); ret = radix_tree_insert(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, + (unsigned long)btrfs_root_id(root), root); if (ret == 0) { btrfs_grab_root(root); @@ -1231,7 +1209,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, return ret; } -void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) +void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info) { #ifdef CONFIG_BTRFS_DEBUG struct btrfs_root *root; @@ -1266,9 +1244,15 @@ static void free_global_roots(struct btrfs_fs_info *fs_info) void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { + struct percpu_counter *em_counter = &fs_info->evictable_extent_maps; + + percpu_counter_destroy(&fs_info->stats_read_blocks); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); percpu_counter_destroy(&fs_info->ordered_bytes); + if (percpu_counter_initialized(em_counter)) + ASSERT(percpu_counter_sum_positive(em_counter) == 0); + percpu_counter_destroy(em_counter); percpu_counter_destroy(&fs_info->dev_replace.bio_counter); btrfs_free_csum_hash(fs_info); btrfs_free_stripe_hash_table(fs_info); @@ -1289,7 +1273,6 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); - kfree(fs_info->subpage_info); kvfree(fs_info); } @@ -1573,7 +1556,7 @@ static int transaction_kthread(void *arg) do { cannot_commit = false; - delay = msecs_to_jiffies(fs_info->commit_interval * 1000); + delay = secs_to_jiffies(fs_info->commit_interval); mutex_lock(&fs_info->transaction_kthread_mutex); spin_lock(&fs_info->trans_lock); @@ -1588,9 +1571,9 @@ static int transaction_kthread(void *arg) cur->state < TRANS_STATE_COMMIT_PREP && delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); - delay -= msecs_to_jiffies((delta - 1) * 1000); + delay -= secs_to_jiffies(delta - 1); delay = min(delay, - msecs_to_jiffies(fs_info->commit_interval * 1000)); + secs_to_jiffies(fs_info->commit_interval)); goto sleep; } transid = cur->transid; @@ -1850,7 +1833,8 @@ void btrfs_put_root(struct btrfs_root *root) return; if (refcount_dec_and_test(&root->refs)) { - WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + if (WARN_ON(!xa_empty(&root->inodes))) + xa_destroy(&root->inodes); WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); if (root->anon_dev) free_anon_bdev(root->anon_dev); @@ -1871,8 +1855,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) int i; while (!list_empty(&fs_info->dead_roots)) { - gang[0] = list_entry(fs_info->dead_roots.next, - struct btrfs_root, root_list); + gang[0] = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); list_del(&gang[0]->root_list); if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) @@ -1924,7 +1908,7 @@ static int btrfs_init_btree_inode(struct super_block *sb) if (!inode) return -ENOMEM; - inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; + btrfs_set_inode_number(BTRFS_I(inode), BTRFS_BTREE_INODE_OBJECTID); set_nlink(inode, 1); /* * we set the i_size on the btree inode to the max possible int. @@ -1935,15 +1919,11 @@ static int btrfs_init_btree_inode(struct super_block *sb) inode->i_mapping->a_ops = &btree_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); - extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, - IO_TREE_BTREE_INODE_IO); - extent_map_tree_init(&BTRFS_I(inode)->extent_tree); + btrfs_extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, + IO_TREE_BTREE_INODE_IO); + btrfs_extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); - BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID; - BTRFS_I(inode)->location.type = 0; - BTRFS_I(inode)->location.offset = 0; set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); __insert_inode_hash(inode, hash); fs_info->btree_inode = inode; @@ -1967,7 +1947,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) fs_info->qgroup_seq = 1; fs_info->qgroup_ulist = NULL; fs_info->qgroup_rescan_running = false; - fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; + fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; mutex_init(&fs_info->qgroup_rescan_lock); } @@ -2014,7 +1994,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan", ordered_flags); fs_info->discard_ctl.discard_workers = - alloc_ordered_workqueue("btrfs_discard", WQ_FREEZABLE); + alloc_ordered_workqueue("btrfs-discard", WQ_FREEZABLE); if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->flush_workers && @@ -2142,7 +2122,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, /* If we have IGNOREDATACSUMS skip loading these roots. */ if (objectid == BTRFS_CSUM_TREE_OBJECTID && btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { - set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); + set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state); return 0; } @@ -2195,7 +2175,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, if (!found || ret) { if (objectid == BTRFS_CSUM_TREE_OBJECTID) - set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); + set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state); if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) ret = ret ? ret : -ENOENT; @@ -2208,8 +2188,8 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, static int load_global_roots(struct btrfs_root *tree_root) { - struct btrfs_path *path; - int ret = 0; + BTRFS_PATH_AUTO_FREE(path); + int ret; path = btrfs_alloc_path(); if (!path) @@ -2218,18 +2198,17 @@ static int load_global_roots(struct btrfs_root *tree_root) ret = load_global_roots_objectid(tree_root, path, BTRFS_EXTENT_TREE_OBJECTID, "extent"); if (ret) - goto out; + return ret; ret = load_global_roots_objectid(tree_root, path, BTRFS_CSUM_TREE_OBJECTID, "csum"); if (ret) - goto out; + return ret; if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE)) - goto out; + return ret; ret = load_global_roots_objectid(tree_root, path, BTRFS_FREE_SPACE_TREE_OBJECTID, "free space"); -out: - btrfs_free_path(path); + return ret; } @@ -2336,6 +2315,71 @@ out: return ret; } +static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *sb) +{ + unsigned int cur = 0; /* Offset inside the sys chunk array */ + /* + * At sb read time, fs_info is not fully initialized. Thus we have + * to use super block sectorsize, which should have been validated. + */ + const u32 sectorsize = btrfs_super_sectorsize(sb); + u32 sys_array_size = btrfs_super_sys_array_size(sb); + + if (sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + btrfs_err(fs_info, "system chunk array too big %u > %u", + sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); + return -EUCLEAN; + } + + while (cur < sys_array_size) { + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + struct btrfs_key key; + u64 type; + u16 num_stripes; + u32 len; + int ret; + + disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur); + len = sizeof(*disk_key); + + if (cur + len > sys_array_size) + goto short_read; + cur += len; + + btrfs_disk_key_to_cpu(&key, disk_key); + if (key.type != BTRFS_CHUNK_ITEM_KEY) { + btrfs_err(fs_info, + "unexpected item type %u in sys_array at offset %u", + key.type, cur); + return -EUCLEAN; + } + chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + if (cur + btrfs_chunk_item_size(num_stripes) > sys_array_size) + goto short_read; + type = btrfs_stack_chunk_type(chunk); + if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) { + btrfs_err(fs_info, + "invalid chunk type %llu in sys_array at offset %u", + type, cur); + return -EUCLEAN; + } + ret = btrfs_check_chunk_valid(fs_info, NULL, chunk, key.offset, + sectorsize); + if (ret < 0) + return ret; + cur += btrfs_chunk_item_size(num_stripes); + } + return 0; +short_read: + btrfs_err(fs_info, + "super block sys chunk array short read, cur=%u sys_array_size=%u", + cur, sys_array_size); + return -EUCLEAN; +} + /* * Real super block validation * NOTE: super csum type and incompat features will not be checked here. @@ -2346,21 +2390,29 @@ out: * 1, 2 2nd and 3rd backup copy * -1 skip bytenr check */ -int btrfs_validate_super(struct btrfs_fs_info *fs_info, - struct btrfs_super_block *sb, int mirror_num) +int btrfs_validate_super(const struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *sb, int mirror_num) { u64 nodesize = btrfs_super_nodesize(sb); u64 sectorsize = btrfs_super_sectorsize(sb); int ret = 0; + const bool ignore_flags = btrfs_test_opt(fs_info, IGNORESUPERFLAGS); if (btrfs_super_magic(sb) != BTRFS_MAGIC) { btrfs_err(fs_info, "no valid FS found"); ret = -EINVAL; } - if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) { - btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu", - btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); - ret = -EINVAL; + if ((btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)) { + if (!ignore_flags) { + btrfs_err(fs_info, + "unrecognized or unsupported super flag 0x%llx", + btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); + ret = -EINVAL; + } else { + btrfs_info(fs_info, + "unrecognized or unsupported super flags: 0x%llx, ignored", + btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); + } } if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "tree_root level too big: %d >= %d", @@ -2382,21 +2434,27 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, * Check sectorsize and nodesize first, other check will need it. * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. */ - if (!is_power_of_2(sectorsize) || sectorsize < 4096 || + if (!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE || sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); ret = -EINVAL; } /* - * We only support at most two sectorsizes: 4K and PAGE_SIZE. + * We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE. + * + * For 4K page sized systems with non-debug builds, all 3 matches (4K). + * For 4K page sized systems with debug builds, there are two block sizes + * supported. (4K and 2K) * * We can support 16K sectorsize with 64K page size without problem, * but such sectorsize/pagesize combination doesn't make much sense. * 4K will be our future standard, PAGE_SIZE is supported from the very * beginning. */ - if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) { + if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && + sectorsize != PAGE_SIZE && + sectorsize != BTRFS_MIN_BLOCKSIZE)) { btrfs_err(fs_info, "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); @@ -2463,7 +2521,7 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || !btrfs_fs_incompat(fs_info, NO_HOLES))) { btrfs_err(fs_info, - "block-group-tree feature requires fres-space-tree and no-holes"); + "block-group-tree feature requires free-space-tree and no-holes"); ret = -EINVAL; } @@ -2496,6 +2554,11 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } + if (ret) + return ret; + + ret = validate_sys_chunk_array(fs_info, sb); + /* * Obvious sys_chunk_array corruptions, it must hold at least one key * and one chunk @@ -2584,7 +2647,7 @@ static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int lev struct btrfs_tree_parent_check check = { .level = level, .transid = gen, - .owner_root = root->root_key.objectid + .owner_root = btrfs_root_id(root) }; int ret = 0; @@ -2698,10 +2761,21 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) return ret; } +/* + * Lockdep gets confused between our buffer_tree which requires IRQ locking because + * we modify marks in the IRQ context, and our delayed inode xarray which doesn't + * have these requirements. Use a class key so lockdep doesn't get them mixed up. + */ +static struct lock_class_key buffer_xa_class; + void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); - INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); + + /* Use the same flags as mapping->i_pages. */ + xa_init_flags(&fs_info->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); + lockdep_set_class(&fs_info->buffer_tree.xa_lock, &buffer_xa_class); + INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); @@ -2713,7 +2787,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->super_lock); - spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); spin_lock_init(&fs_info->treelog_bg_lock); spin_lock_init(&fs_info->zone_active_bgs_lock); @@ -2758,6 +2831,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) BTRFS_BLOCK_RSV_GLOBAL); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); + btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG); btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, BTRFS_BLOCK_RSV_DELOPS); @@ -2786,12 +2860,13 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) btrfs_init_scrub(fs_info); btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(fs_info); + btrfs_init_extent_map_shrinker_work(fs_info); rwlock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT_CACHED; - extent_io_tree_init(fs_info, &fs_info->excluded_extents, - IO_TREE_FS_EXCLUDED_EXTENTS); + btrfs_extent_io_tree_init(fs_info, &fs_info->excluded_extents, + IO_TREE_FS_EXCLUDED_EXTENTS); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); @@ -2848,10 +2923,18 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (ret) return ret; + ret = percpu_counter_init(&fs_info->evictable_extent_maps, 0, GFP_KERNEL); + if (ret) + return ret; + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) return ret; + ret = percpu_counter_init(&fs_info->stats_read_blocks, 0, GFP_KERNEL); + if (ret) + return ret; + fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); @@ -2872,6 +2955,8 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (sb_rdonly(sb)) set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state); + if (btrfs_test_opt(fs_info, IGNOREMETACSUMS)) + set_bit(BTRFS_FS_STATE_SKIP_META_CSUMS, &fs_info->fs_state); return btrfs_alloc_stripe_hash_table(fs_info); } @@ -2917,22 +3002,22 @@ static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { u64 root_objectid = 0; struct btrfs_root *gang[8]; - int i = 0; - int err = 0; - unsigned int ret = 0; + int ret = 0; while (1) { + unsigned int found; + spin_lock(&fs_info->fs_roots_radix_lock); - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + found = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, root_objectid, ARRAY_SIZE(gang)); - if (!ret) { + if (!found) { spin_unlock(&fs_info->fs_roots_radix_lock); break; } - root_objectid = gang[ret - 1]->root_key.objectid + 1; + root_objectid = btrfs_root_id(gang[found - 1]) + 1; - for (i = 0; i < ret; i++) { + for (int i = 0; i < found; i++) { /* Avoid to grab roots in dead_roots. */ if (btrfs_root_refs(&gang[i]->root_item) == 0) { gang[i] = NULL; @@ -2943,24 +3028,25 @@ static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->fs_roots_radix_lock); - for (i = 0; i < ret; i++) { + for (int i = 0; i < found; i++) { if (!gang[i]) continue; - root_objectid = gang[i]->root_key.objectid; - err = btrfs_orphan_cleanup(gang[i]); - if (err) - goto out; + root_objectid = btrfs_root_id(gang[i]); + /* + * Continue to release the remaining roots after the first + * error without cleanup and preserve the first error + * for the return. + */ + if (!ret) + ret = btrfs_orphan_cleanup(gang[i]); btrfs_put_root(gang[i]); } + if (ret) + break; + root_objectid++; } -out: - /* Release the uncleaned roots due to error. */ - for (; i < ret; i++) { - if (gang[i]) - btrfs_put_root(gang[i]); - } - return err; + return ret; } /* @@ -3193,8 +3279,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) return 0; } -int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, - char *options) +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices) { u32 sectorsize; u32 nodesize; @@ -3233,7 +3318,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device /* * Read super block and check the signature bytes only */ - disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev); + disk_super = btrfs_read_disk_super(fs_devices->latest_dev->bdev, 0, false); if (IS_ERR(disk_super)) { ret = PTR_ERR(disk_super); goto fail_alloc; @@ -3314,6 +3399,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->sectorsize_bits = ilog2(sectorsize); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; + fs_info->fs_devices->fs_info = fs_info; /* * Handle the space caching options appropriately now that we have the @@ -3336,21 +3422,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); - if (sectorsize < PAGE_SIZE) { - struct btrfs_subpage_info *subpage_info; - - btrfs_warn(fs_info, - "read-write for sector size %u with page size %lu is experimental", - sectorsize, PAGE_SIZE); - subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); - if (!subpage_info) { - ret = -ENOMEM; - goto fail_alloc; - } - btrfs_init_subpage_info(subpage_info, sectorsize); - fs_info->subpage_info = subpage_info; - } - ret = btrfs_init_workqueues(fs_info); if (ret) goto fail_sb_buffer; @@ -3618,142 +3689,62 @@ ALLOW_ERROR_INJECTION(open_ctree, ERRNO); static void btrfs_end_super_write(struct bio *bio) { struct btrfs_device *device = bio->bi_private; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - struct page *page; - - bio_for_each_segment_all(bvec, bio, iter_all) { - page = bvec->bv_page; + struct folio_iter fi; + bio_for_each_folio_all(fi, bio) { if (bio->bi_status) { btrfs_warn_rl_in_rcu(device->fs_info, - "lost page write due to IO error on %s (%d)", + "lost super block write due to IO error on %s (%d)", btrfs_dev_name(device), blk_status_to_errno(bio->bi_status)); - ClearPageUptodate(page); - SetPageError(page); btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS); - } else { - SetPageUptodate(page); + /* Ensure failure if the primary sb fails. */ + if (bio->bi_opf & REQ_FUA) + atomic_add(BTRFS_SUPER_PRIMARY_WRITE_ERROR, + &device->sb_write_errors); + else + atomic_inc(&device->sb_write_errors); } - - put_page(page); - unlock_page(page); + folio_unlock(fi.folio); + folio_put(fi.folio); } bio_put(bio); } -struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, - int copy_num, bool drop_cache) -{ - struct btrfs_super_block *super; - struct page *page; - u64 bytenr, bytenr_orig; - struct address_space *mapping = bdev->bd_inode->i_mapping; - int ret; - - bytenr_orig = btrfs_sb_offset(copy_num); - ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr); - if (ret == -ENOENT) - return ERR_PTR(-EINVAL); - else if (ret) - return ERR_PTR(ret); - - if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev)) - return ERR_PTR(-EINVAL); - - if (drop_cache) { - /* This should only be called with the primary sb. */ - ASSERT(copy_num == 0); - - /* - * Drop the page of the primary superblock, so later read will - * always read from the device. - */ - invalidate_inode_pages2_range(mapping, - bytenr >> PAGE_SHIFT, - (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT); - } - - page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); - if (IS_ERR(page)) - return ERR_CAST(page); - - super = page_address(page); - if (btrfs_super_magic(super) != BTRFS_MAGIC) { - btrfs_release_disk_super(super); - return ERR_PTR(-ENODATA); - } - - if (btrfs_super_bytenr(super) != bytenr_orig) { - btrfs_release_disk_super(super); - return ERR_PTR(-EINVAL); - } - - return super; -} - - -struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) -{ - struct btrfs_super_block *super, *latest = NULL; - int i; - u64 transid = 0; - - /* we would like to check all the supers, but that would make - * a btrfs mount succeed after a mkfs from a different FS. - * So, we need to add a special mount option to scan for - * later supers, using BTRFS_SUPER_MIRROR_MAX instead - */ - for (i = 0; i < 1; i++) { - super = btrfs_read_dev_one_super(bdev, i, false); - if (IS_ERR(super)) - continue; - - if (!latest || btrfs_super_generation(super) > transid) { - if (latest) - btrfs_release_disk_super(super); - - latest = super; - transid = btrfs_super_generation(super); - } - } - - return super; -} - /* * Write superblock @sb to the @device. Do not wait for completion, all the - * pages we use for writing are locked. + * folios we use for writing are locked. * * Write @max_mirrors copies of the superblock, where 0 means default that fit * the expected device size at commit time. Note that max_mirrors must be * same for write and wait phases. * - * Return number of errors when page is not found or submission fails. + * Return number of errors when folio is not found or submission fails. */ static int write_dev_supers(struct btrfs_device *device, struct btrfs_super_block *sb, int max_mirrors) { struct btrfs_fs_info *fs_info = device->fs_info; - struct address_space *mapping = device->bdev->bd_inode->i_mapping; + struct address_space *mapping = device->bdev->bd_mapping; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); int i; - int errors = 0; int ret; u64 bytenr, bytenr_orig; + atomic_set(&device->sb_write_errors, 0); + if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; shash->tfm = fs_info->csum_shash; for (i = 0; i < max_mirrors; i++) { - struct page *page; + struct folio *folio; struct bio *bio; struct btrfs_super_block *disk_super; + size_t offset; bytenr_orig = btrfs_sb_offset(i); ret = btrfs_sb_log_location(device, i, WRITE, &bytenr); @@ -3761,9 +3752,9 @@ static int write_dev_supers(struct btrfs_device *device, continue; } else if (ret < 0) { btrfs_err(device->fs_info, - "couldn't get super block location for mirror %d", - i); - errors++; + "couldn't get super block location for mirror %d error %d", + i, ret); + atomic_inc(&device->sb_write_errors); continue; } if (bytenr + BTRFS_SUPER_INFO_SIZE >= @@ -3776,20 +3767,19 @@ static int write_dev_supers(struct btrfs_device *device, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, sb->csum); - page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, - GFP_NOFS); - if (!page) { + folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + GFP_NOFS); + if (IS_ERR(folio)) { btrfs_err(device->fs_info, - "couldn't get super block page for bytenr %llu", - bytenr); - errors++; + "couldn't get super block page for bytenr %llu error %ld", + bytenr, PTR_ERR(folio)); + atomic_inc(&device->sb_write_errors); continue; } - /* Bump the refcount for wait_dev_supers() */ - get_page(page); - - disk_super = page_address(page); + offset = offset_in_folio(folio, bytenr); + disk_super = folio_address(folio) + offset; memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE); /* @@ -3803,8 +3793,7 @@ static int write_dev_supers(struct btrfs_device *device, bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; bio->bi_private = device; bio->bi_end_io = btrfs_end_super_write; - __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE, - offset_in_page(bytenr)); + bio_add_folio_nofail(bio, folio, BTRFS_SUPER_INFO_SIZE, offset); /* * We FUA only the first super block. The others we allow to @@ -3816,17 +3805,17 @@ static int write_dev_supers(struct btrfs_device *device, submit_bio(bio); if (btrfs_advance_sb_log(device, i)) - errors++; + atomic_inc(&device->sb_write_errors); } - return errors < i ? 0 : -1; + return atomic_read(&device->sb_write_errors) < i ? 0 : -1; } /* * Wait for write completion of superblocks done by write_dev_supers, * @max_mirrors same for write and wait phases. * - * Return number of errors when page is not found or not marked up to - * date. + * Return -1 if primary super block write failed or when there were no super block + * copies written. Otherwise 0. */ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) { @@ -3840,7 +3829,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) max_mirrors = BTRFS_SUPER_MIRROR_MAX; for (i = 0; i < max_mirrors; i++) { - struct page *page; + struct folio *folio; ret = btrfs_sb_log_location(device, i, READ, &bytenr); if (ret == -ENOENT) { @@ -3855,30 +3844,20 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) device->commit_total_bytes) break; - page = find_get_page(device->bdev->bd_inode->i_mapping, - bytenr >> PAGE_SHIFT); - if (!page) { - errors++; - if (i == 0) - primary_failed = true; + folio = filemap_get_folio(device->bdev->bd_mapping, + bytenr >> PAGE_SHIFT); + /* If the folio has been removed, then we know it completed. */ + if (IS_ERR(folio)) continue; - } - /* Page is submitted locked and unlocked once the IO completes */ - wait_on_page_locked(page); - if (PageError(page)) { - errors++; - if (i == 0) - primary_failed = true; - } - /* Drop our reference */ - put_page(page); - - /* Drop the reference from the writing run */ - put_page(page); + /* Folio will be unlocked once the write completes. */ + folio_wait_locked(folio); + folio_put(folio); } - /* log error, force error return */ + errors += atomic_read(&device->sb_write_errors); + if (errors >= BTRFS_SUPER_PRIMARY_WRITE_ERROR) + primary_failed = true; if (primary_failed) { btrfs_err(device->fs_info, "error writing primary super block to device %llu", device->devid); @@ -4139,7 +4118,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid); + (unsigned long)btrfs_root_id(root)); if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) drop_ref = true; spin_unlock(&fs_info->fs_roots_radix_lock); @@ -4158,9 +4137,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_commit_super(struct btrfs_fs_info *fs_info) { - struct btrfs_root *root = fs_info->tree_root; - struct btrfs_trans_handle *trans; - mutex_lock(&fs_info->cleaner_mutex); btrfs_run_delayed_iputs(fs_info); mutex_unlock(&fs_info->cleaner_mutex); @@ -4170,10 +4146,7 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info) down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - return btrfs_commit_transaction(trans); + return btrfs_commit_current_transaction(fs_info->tree_root); } static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) @@ -4182,9 +4155,6 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) struct btrfs_transaction *tmp; bool found = false; - if (list_empty(&fs_info->trans_list)) - return; - /* * This function is only called at the very end of close_ctree(), * thus no other running transaction, no need to take trans_lock. @@ -4198,15 +4168,16 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) u64 found_end; found = true; - while (find_first_extent_bit(&trans->dirty_pages, cur, - &found_start, &found_end, EXTENT_DIRTY, &cached)) { + while (btrfs_find_first_extent_bit(&trans->dirty_pages, cur, + &found_start, &found_end, + EXTENT_DIRTY, &cached)) { dirty_bytes += found_end + 1 - found_start; cur = found_end + 1; } btrfs_warn(fs_info, "transaction %llu (with %llu dirty metadata bytes) is not committed", trans->transid, dirty_bytes); - btrfs_cleanup_one_transaction(trans, fs_info); + btrfs_cleanup_one_transaction(trans); if (trans == fs_info->running_transaction) fs_info->running_transaction = NULL; @@ -4276,6 +4247,59 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_cleanup_defrag_inodes(fs_info); /* + * Handle the error fs first, as it will flush and wait for all ordered + * extents. This will generate delayed iputs, thus we want to handle + * it first. + */ + if (unlikely(BTRFS_FS_ERROR(fs_info))) + btrfs_error_commit_super(fs_info); + + /* + * Wait for any fixup workers to complete. + * If we don't wait for them here and they are still running by the time + * we call kthread_stop() against the cleaner kthread further below, we + * get an use-after-free on the cleaner because the fixup worker adds an + * inode to the list of delayed iputs and then attempts to wakeup the + * cleaner kthread, which was already stopped and destroyed. We parked + * already the cleaner, but below we run all pending delayed iputs. + */ + btrfs_flush_workqueue(fs_info->fixup_workers); + /* + * Similar case here, we have to wait for delalloc workers before we + * proceed below and stop the cleaner kthread, otherwise we trigger a + * use-after-tree on the cleaner kthread task_struct when a delalloc + * worker running submit_compressed_extents() adds a delayed iput, which + * does a wake up on the cleaner kthread, which was already freed below + * when we call kthread_stop(). + */ + btrfs_flush_workqueue(fs_info->delalloc_workers); + + /* + * We can have ordered extents getting their last reference dropped from + * the fs_info->workers queue because for async writes for data bios we + * queue a work for that queue, at btrfs_wq_submit_bio(), that runs + * run_one_async_done() which calls btrfs_bio_end_io() in case the bio + * has an error, and that later function can do the final + * btrfs_put_ordered_extent() on the ordered extent attached to the bio, + * which adds a delayed iput for the inode. So we must flush the queue + * so that we don't have delayed iputs after committing the current + * transaction below and stopping the cleaner and transaction kthreads. + */ + btrfs_flush_workqueue(fs_info->workers); + + /* + * When finishing a compressed write bio we schedule a work queue item + * to finish an ordered extent - btrfs_finish_compressed_write_work() + * calls btrfs_finish_ordered_extent() which in turns does a call to + * btrfs_queue_ordered_fn(), and that queues the ordered extent + * completion either in the endio_write_workers work queue or in the + * fs_info->endio_freespace_worker work queue. We flush those queues + * below, so before we flush them we must flush this queue for the + * workers of compressed writes. + */ + flush_workqueue(fs_info->compressed_write_workers); + + /* * After we parked the cleaner kthread, ordered extents may have * completed and created new delayed iputs. If one of the async reclaim * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we @@ -4299,10 +4323,13 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) /* Ordered extents for free space inodes. */ btrfs_flush_workqueue(fs_info->endio_freespace_worker); btrfs_run_delayed_iputs(fs_info); + /* There should be no more workload to generate new delayed iputs. */ + set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state); cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); + cancel_work_sync(&fs_info->em_shrinker_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4332,9 +4359,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "commit super ret %d", ret); } - if (BTRFS_FS_ERROR(fs_info)) - btrfs_error_commit_super(fs_info); - kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); @@ -4342,7 +4366,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags); if (btrfs_check_quota_leak(fs_info)) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN("qgroup reserved space leaked"); btrfs_err(fs_info, "qgroup reserved space leaked"); } @@ -4457,10 +4481,6 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) /* cleanup FS via transaction */ btrfs_cleanup_transaction(fs_info); - mutex_lock(&fs_info->cleaner_mutex); - btrfs_run_delayed_iputs(fs_info); - mutex_unlock(&fs_info->cleaner_mutex); - down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); } @@ -4484,7 +4504,7 @@ static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info) for (i = 0; i < ret; i++) { if (!gang[i]) continue; - root_objectid = gang[i]->root_key.objectid; + root_objectid = btrfs_root_id(gang[i]); btrfs_free_log(NULL, gang[i]); btrfs_put_root(gang[i]); } @@ -4537,84 +4557,7 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) * extents that haven't had their dirty pages IO start writeout yet * actually get run and error out properly. */ - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); -} - -static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info) -{ - struct rb_node *node; - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - - delayed_refs = &trans->delayed_refs; - - spin_lock(&delayed_refs->lock); - if (atomic_read(&delayed_refs->num_entries) == 0) { - spin_unlock(&delayed_refs->lock); - btrfs_debug(fs_info, "delayed_refs has NO entry"); - return; - } - - while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { - struct btrfs_delayed_ref_head *head; - struct rb_node *n; - bool pin_bytes = false; - - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); - if (btrfs_delayed_ref_lock(delayed_refs, head)) - continue; - - spin_lock(&head->lock); - while ((n = rb_first_cached(&head->ref_tree)) != NULL) { - ref = rb_entry(n, struct btrfs_delayed_ref_node, - ref_node); - rb_erase_cached(&ref->ref_node, &head->ref_tree); - RB_CLEAR_NODE(&ref->ref_node); - if (!list_empty(&ref->add_list)) - list_del(&ref->add_list); - atomic_dec(&delayed_refs->num_entries); - btrfs_put_delayed_ref(ref); - btrfs_delayed_refs_rsv_release(fs_info, 1, 0); - } - if (head->must_insert_reserved) - pin_bytes = true; - btrfs_free_delayed_extent_op(head->extent_op); - btrfs_delete_ref_head(delayed_refs, head); - spin_unlock(&head->lock); - spin_unlock(&delayed_refs->lock); - mutex_unlock(&head->mutex); - - if (pin_bytes) { - struct btrfs_block_group *cache; - - cache = btrfs_lookup_block_group(fs_info, head->bytenr); - BUG_ON(!cache); - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->pinned += head->num_bytes; - btrfs_space_info_update_bytes_pinned(fs_info, - cache->space_info, head->num_bytes); - cache->reserved -= head->num_bytes; - cache->space_info->bytes_reserved -= head->num_bytes; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - btrfs_put_block_group(cache); - - btrfs_error_unpin_extent_range(fs_info, head->bytenr, - head->bytenr + head->num_bytes - 1); - } - btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); - btrfs_put_delayed_ref_head(head); - cond_resched(); - spin_lock(&delayed_refs->lock); - } - btrfs_qgroup_destroy_extent_records(trans); - - spin_unlock(&delayed_refs->lock); + btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); } static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) @@ -4680,9 +4623,9 @@ static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; - while (find_first_extent_bit(dirty_pages, start, &start, &end, - mark, NULL)) { - clear_extent_bits(dirty_pages, start, end, mark); + while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end, + mark, NULL)) { + btrfs_clear_extent_bits(dirty_pages, start, end, mark); while (start <= end) { eb = find_extent_buffer(fs_info, start); start += fs_info->nodesize; @@ -4715,14 +4658,14 @@ static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, * the same extent range. */ mutex_lock(&fs_info->unused_bg_unpin_mutex); - if (!find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state)) { + if (!btrfs_find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state)) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } - clear_extent_dirty(unpin, start, end, &cached_state); - free_extent_state(cached_state); + btrfs_clear_extent_dirty(unpin, start, end, &cached_state); + btrfs_free_extent_state(cached_state); btrfs_error_unpin_extent_range(fs_info, start, end); mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); @@ -4815,16 +4758,16 @@ static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info) btrfs_qgroup_free_meta_all_pertrans(root); radix_tree_tag_clear(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, + (unsigned long)btrfs_root_id(root), BTRFS_ROOT_TRANS_TAG); } } spin_unlock(&fs_info->fs_roots_radix_lock); } -void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, - struct btrfs_fs_info *fs_info) +void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans) { + struct btrfs_fs_info *fs_info = cur_trans->fs_info; struct btrfs_device *dev, *tmp; btrfs_cleanup_dirty_bgs(cur_trans, fs_info); @@ -4836,7 +4779,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, list_del_init(&dev->post_commit_list); } - btrfs_destroy_delayed_refs(cur_trans, fs_info); + btrfs_destroy_delayed_refs(cur_trans); cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&fs_info->transaction_blocked_wait); @@ -4844,14 +4787,10 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&fs_info->transaction_wait); - btrfs_destroy_delayed_inodes(fs_info); - btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages, EXTENT_DIRTY); btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); - btrfs_free_all_qgroup_pertrans(fs_info); - cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); } @@ -4886,7 +4825,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) } else { spin_unlock(&fs_info->trans_lock); } - btrfs_cleanup_one_transaction(t, fs_info); + btrfs_cleanup_one_transaction(t); spin_lock(&fs_info->trans_lock); if (t == fs_info->running_transaction) @@ -4904,6 +4843,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) btrfs_assert_delayed_root_empty(fs_info); btrfs_destroy_all_delalloc_inodes(fs_info); btrfs_drop_all_logs(fs_info); + btrfs_free_all_qgroup_pertrans(fs_info); mutex_unlock(&fs_info->transaction_kthread_mutex); return 0; @@ -4911,7 +4851,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) int btrfs_init_root_free_objectid(struct btrfs_root *root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; struct extent_buffer *l; struct btrfs_key search_key; @@ -4927,14 +4867,13 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) search_key.offset = (u64)-1; ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) - goto error; + return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. */ - ret = -EUCLEAN; - goto error; + return -EUCLEAN; } if (path->slots[0] > 0) { slot = path->slots[0] - 1; @@ -4945,10 +4884,8 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) } else { root->free_objectid = BTRFS_FIRST_FREE_OBJECTID; } - ret = 0; -error: - btrfs_free_path(path); - return ret; + + return 0; } int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) @@ -4959,7 +4896,7 @@ int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { btrfs_warn(root->fs_info, "the objectid of root %llu reaches its highest value", - root->root_key.objectid); + btrfs_root_id(root)); ret = -ENOSPC; goto out; } |