From 96dfcb46ffca4803627f7ae74085fbb2bae2acc5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:18 -0500 Subject: btrfs: push __setup_root into btrfs_alloc_root There's no reason to not init the root at alloc time, and with later patches it actually causes problems if we error out mounting the fs before the tree_root is init'ed because we expect it to have a valid ref count. Fix this by pushing __setup_root into btrfs_alloc_root. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c6c9a6a8e6c8..f1121d814e87 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1130,6 +1130,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + root->fs_info = fs_info; root->node = NULL; root->commit_root = NULL; root->state = 0; @@ -1198,11 +1199,11 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, } static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, - gfp_t flags) + u64 objectid, gfp_t flags) { struct btrfs_root *root = kzalloc(sizeof(*root), flags); if (root) - root->fs_info = fs_info; + __setup_root(root, fs_info, objectid); return root; } @@ -1215,12 +1216,11 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info) if (!fs_info) return ERR_PTR(-EINVAL); - root = btrfs_alloc_root(fs_info, GFP_KERNEL); + root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); /* We don't use the stripesize in selftest, set it as sectorsize */ - __setup_root(root, fs_info, BTRFS_ROOT_TREE_OBJECTID); root->alloc_bytenr = 0; return root; @@ -1244,12 +1244,11 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, * context to avoid deadlock if reclaim happens. */ nofs_flag = memalloc_nofs_save(); - root = btrfs_alloc_root(fs_info, GFP_KERNEL); + root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL); memalloc_nofs_restore(nofs_flag); if (!root) return ERR_PTR(-ENOMEM); - __setup_root(root, fs_info, objectid); root->root_key.objectid = objectid; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; @@ -1309,12 +1308,10 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root; struct extent_buffer *leaf; - root = btrfs_alloc_root(fs_info, GFP_NOFS); + root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS); if (!root) return ERR_PTR(-ENOMEM); - __setup_root(root, fs_info, BTRFS_TREE_LOG_OBJECTID); - root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; @@ -1401,14 +1398,12 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, if (!path) return ERR_PTR(-ENOMEM); - root = btrfs_alloc_root(fs_info, GFP_NOFS); + root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS); if (!root) { ret = -ENOMEM; goto alloc_fail; } - __setup_root(root, fs_info, key->objectid); - ret = btrfs_find_root(tree_root, key, path, &root->root_item, &root->root_key); if (ret) { @@ -2208,12 +2203,11 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, return -EIO; } - log_tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); + log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, + GFP_KERNEL); if (!log_tree_root) return -ENOMEM; - __setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); - log_tree_root->node = read_tree_block(fs_info, bytenr, fs_info->generation + 1, level, NULL); @@ -2645,8 +2639,12 @@ int __cold open_ctree(struct super_block *sb, int clear_free_space_tree = 0; int level; - tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); - chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); + tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, + GFP_KERNEL); + fs_info->tree_root = tree_root; + chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID, + GFP_KERNEL); + fs_info->chunk_root = chunk_root; if (!tree_root || !chunk_root) { err = -ENOMEM; goto fail; @@ -2824,8 +2822,6 @@ int __cold open_ctree(struct super_block *sb, goto fail_alloc; } - __setup_root(tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); - invalidate_bdev(fs_devices->latest_bdev); /* @@ -3021,8 +3017,6 @@ int __cold open_ctree(struct super_block *sb, generation = btrfs_super_chunk_root_generation(disk_super); level = btrfs_super_chunk_root_level(disk_super); - __setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); - chunk_root->node = read_tree_block(fs_info, btrfs_super_chunk_root(disk_super), generation, level, NULL); -- cgit From f39e457156f919b849eb2c12a7a4452a1f2be15b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:19 -0500 Subject: btrfs: move fs root init stuff into btrfs_init_fs_root We have a helper for reading fs roots that just reads the fs root off the disk and then sets REF_COWS and init's the inheritable flags. Move this into btrfs_init_fs_root so we can later get rid of this helper and consolidate all of the fs root reading into one helper. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f1121d814e87..eff6f0e45c3e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1445,12 +1445,6 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, root = btrfs_read_tree_root(tree_root, location); if (IS_ERR(root)) return root; - - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - set_bit(BTRFS_ROOT_REF_COWS, &root->state); - btrfs_check_and_init_root_item(&root->root_item); - } - return root; } @@ -1474,6 +1468,11 @@ int btrfs_init_fs_root(struct btrfs_root *root) } root->subv_writers = writers; + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + set_bit(BTRFS_ROOT_REF_COWS, &root->state); + btrfs_check_and_init_root_item(&root->root_item); + } + btrfs_init_free_ino_ctl(root); spin_lock_init(&root->ino_cache_lock); init_waitqueue_head(&root->ino_cache_wait); -- cgit From 62a2c73ebda3b0163832610c0e91f350a22908bd Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:21 -0500 Subject: btrfs: export and use btrfs_read_tree_root for tree-log Tree-log uses btrfs_read_fs_root to load its log, but this just calls btrfs_read_tree_root. We don't save the log roots in our root cache, so just export this helper and use it in the logging code. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index eff6f0e45c3e..9807167b6208 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1384,8 +1384,8 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, return 0; } -static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, - struct btrfs_key *key) +struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, + struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_fs_info *fs_info = tree_root->fs_info; -- cgit From 83db2aadb3bd3a117918434448e8c0e90229b73b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:23 -0500 Subject: btrfs: remove btrfs_read_fs_root, not used anymore All helpers should either be using btrfs_get_fs_root() or btrfs_read_tree_root(). Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9807167b6208..2e92a6bc64c0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1437,17 +1437,6 @@ alloc_fail: goto out; } -struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, - struct btrfs_key *location) -{ - struct btrfs_root *root; - - root = btrfs_read_tree_root(tree_root, location); - if (IS_ERR(root)) - return root; - return root; -} - int btrfs_init_fs_root(struct btrfs_root *root) { int ret; @@ -1568,7 +1557,7 @@ again: return root; } - root = btrfs_read_fs_root(fs_info->tree_root, location); + root = btrfs_read_tree_root(fs_info->tree_root, location); if (IS_ERR(root)) return root; -- cgit From 3619c94f073e4e96bef4cc15e70adbc36f3cb203 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:24 -0500 Subject: btrfs: open code btrfs_read_fs_root_no_name All this does is call btrfs_get_fs_root() with check_ref == true. Just use btrfs_get_fs_root() so we don't have a bunch of different helpers that do the same thing. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2e92a6bc64c0..f0a33e004d91 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3178,7 +3178,7 @@ int __cold open_ctree(struct super_block *sb, location.type = BTRFS_ROOT_ITEM_KEY; location.offset = 0; - fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); + fs_info->fs_root = btrfs_get_fs_root(fs_info, &location, true); if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); btrfs_warn(fs_info, "failed to read fs tree: %d", err); -- cgit From a98db0f304670dfd65375e6d0ea08793c174bf44 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:25 -0500 Subject: btrfs: make the fs root init functions static Now that the orphan cleanup stuff doesn't use this directly we can just make them static. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f0a33e004d91..67e7b0ccdea0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1437,7 +1437,7 @@ alloc_fail: goto out; } -int btrfs_init_fs_root(struct btrfs_root *root) +static int btrfs_init_fs_root(struct btrfs_root *root) { int ret; struct btrfs_subvolume_writers *writers; @@ -1488,8 +1488,8 @@ fail: return ret; } -struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, - u64 root_id) +static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id) { struct btrfs_root *root; -- cgit From af01d2e53f3343e9f22495e79f07b107374b05a1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:27 -0500 Subject: btrfs: hold a ref on fs roots while they're in the radix tree If the root is sitting in the radix tree, we should probably have a ref for the radix tree. Grab a ref on the root when we insert it, and drop it when it gets deleted. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 67e7b0ccdea0..43af9f9b8583 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1513,8 +1513,10 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, ret = radix_tree_insert(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, root); - if (ret == 0) + if (ret == 0) { + btrfs_grab_fs_root(root); set_bit(BTRFS_ROOT_IN_RADIX, &root->state); + } spin_unlock(&fs_info->fs_roots_radix_lock); radix_tree_preload_end(); @@ -3817,6 +3819,8 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid); + if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) + btrfs_put_fs_root(root); spin_unlock(&fs_info->fs_roots_radix_lock); if (btrfs_root_refs(&root->root_item) == 0) -- cgit From 0d4b0463011de06288d8ca80a873a97a7d99a948 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:53 -0500 Subject: btrfs: export and rename free_fs_info We're going to start freeing roots and doing other complicated things in free_fs_info, so we need to move it to disk-io.c and export it in order to use things lik btrfs_put_fs_root(). Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 43af9f9b8583..ce36a4b9c8a3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1523,6 +1523,24 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, return ret; } +void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) +{ + kfree(fs_info->balance_ctl); + kfree(fs_info->delayed_root); + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); + kfree(fs_info->chunk_root); + kfree(fs_info->dev_root); + kfree(fs_info->csum_root); + kfree(fs_info->quota_root); + kfree(fs_info->uuid_root); + kfree(fs_info->free_space_root); + kfree(fs_info->super_copy); + kfree(fs_info->super_for_commit); + kvfree(fs_info); +} + + struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_key *location, bool check_ref) -- cgit From 4c78e9f59632b5e635e51db9b68111288b38ff22 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:54 -0500 Subject: btrfs: hold a ref on the root in open_ctree We lookup the fs_root and put it in our fs_info directly, we should hold a ref on this root for the lifetime of the fs_info. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ce36a4b9c8a3..76b888728540 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1537,6 +1537,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->free_space_root); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); + btrfs_put_fs_root(fs_info->fs_root); kvfree(fs_info); } @@ -3206,6 +3207,13 @@ int __cold open_ctree(struct super_block *sb, goto fail_qgroup; } + if (!btrfs_grab_fs_root(fs_info->fs_root)) { + fs_info->fs_root = NULL; + err = -ENOENT; + btrfs_warn(fs_info, "failed to grab a ref on the fs tree"); + goto fail_qgroup; + } + if (sb_rdonly(sb)) return 0; -- cgit From 81f096edf0472864879e1e5a7f0fdf87ea90fe75 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:55 -0500 Subject: btrfs: use btrfs_put_fs_root to free roots always If we are going to track leaked roots we need to free them all the same way, so don't kfree() roots directly, use btrfs_put_fs_root. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 76b888728540..5be61619c71b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1297,7 +1297,7 @@ fail: free_extent_buffer(root->commit_root); free_extent_buffer(leaf); } - kfree(root); + btrfs_put_fs_root(root); return ERR_PTR(ret); } @@ -1328,7 +1328,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); if (IS_ERR(leaf)) { - kfree(root); + btrfs_put_fs_root(root); return ERR_CAST(leaf); } @@ -1431,7 +1431,7 @@ out: return root; find_fail: - kfree(root); + btrfs_put_fs_root(root); alloc_fail: root = ERR_PTR(ret); goto out; @@ -1527,17 +1527,17 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { kfree(fs_info->balance_ctl); kfree(fs_info->delayed_root); - kfree(fs_info->extent_root); - kfree(fs_info->tree_root); - kfree(fs_info->chunk_root); - kfree(fs_info->dev_root); - kfree(fs_info->csum_root); - kfree(fs_info->quota_root); - kfree(fs_info->uuid_root); - kfree(fs_info->free_space_root); + btrfs_put_fs_root(fs_info->extent_root); + btrfs_put_fs_root(fs_info->tree_root); + btrfs_put_fs_root(fs_info->chunk_root); + btrfs_put_fs_root(fs_info->dev_root); + btrfs_put_fs_root(fs_info->csum_root); + btrfs_put_fs_root(fs_info->quota_root); + btrfs_put_fs_root(fs_info->uuid_root); + btrfs_put_fs_root(fs_info->free_space_root); + btrfs_put_fs_root(fs_info->fs_root); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); - btrfs_put_fs_root(fs_info->fs_root); kvfree(fs_info); } @@ -2223,12 +2223,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); - kfree(log_tree_root); + btrfs_put_fs_root(log_tree_root); return ret; } else if (!extent_buffer_uptodate(log_tree_root->node)) { btrfs_err(fs_info, "failed to read log tree"); free_extent_buffer(log_tree_root->node); - kfree(log_tree_root); + btrfs_put_fs_root(log_tree_root); return -EIO; } /* returns with log_tree_root freed on success */ @@ -2237,7 +2237,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, btrfs_handle_fs_error(fs_info, ret, "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); - kfree(log_tree_root); + btrfs_put_fs_root(log_tree_root); return ret; } -- cgit From bc44d7c4b2b179c4b74fba208b9908e2ecbc1b4d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:56 -0500 Subject: btrfs: push btrfs_grab_fs_root into btrfs_get_fs_root Now that all callers of btrfs_get_fs_root are subsequently calling btrfs_grab_fs_root and handling dropping the ref when they are done appropriately, go ahead and push btrfs_grab_fs_root up into btrfs_get_fs_root. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5be61619c71b..a0c67200fb2a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1496,6 +1496,8 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->fs_roots_radix_lock); root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)root_id); + if (root) + root = btrfs_grab_fs_root(root); spin_unlock(&fs_info->fs_roots_radix_lock); return root; } @@ -1552,29 +1554,31 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, int ret; if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) - return fs_info->tree_root; + return btrfs_grab_fs_root(fs_info->tree_root); if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) - return fs_info->extent_root; + return btrfs_grab_fs_root(fs_info->extent_root); if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) - return fs_info->chunk_root; + return btrfs_grab_fs_root(fs_info->chunk_root); if (location->objectid == BTRFS_DEV_TREE_OBJECTID) - return fs_info->dev_root; + return btrfs_grab_fs_root(fs_info->dev_root); if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) - return fs_info->csum_root; + return btrfs_grab_fs_root(fs_info->csum_root); if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID) - return fs_info->quota_root ? fs_info->quota_root : - ERR_PTR(-ENOENT); + return btrfs_grab_fs_root(fs_info->quota_root) ? + fs_info->quota_root : ERR_PTR(-ENOENT); if (location->objectid == BTRFS_UUID_TREE_OBJECTID) - return fs_info->uuid_root ? fs_info->uuid_root : - ERR_PTR(-ENOENT); + return btrfs_grab_fs_root(fs_info->uuid_root) ? + fs_info->uuid_root : ERR_PTR(-ENOENT); if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) - return fs_info->free_space_root ? fs_info->free_space_root : - ERR_PTR(-ENOENT); + return btrfs_grab_fs_root(fs_info->free_space_root) ? + fs_info->free_space_root : ERR_PTR(-ENOENT); again: root = btrfs_lookup_fs_root(fs_info, location->objectid); if (root) { - if (check_ref && btrfs_root_refs(&root->root_item) == 0) + if (check_ref && btrfs_root_refs(&root->root_item) == 0) { + btrfs_put_fs_root(root); return ERR_PTR(-ENOENT); + } return root; } @@ -1607,8 +1611,18 @@ again: if (ret == 0) set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); + /* + * All roots have two refs on them at all times, one for the mounted fs, + * and one for being in the radix tree. This way we only free the root + * when we are unmounting or deleting the subvolume. We get one ref + * from __setup_root, one for inserting it into the radix tree, and then + * we have the third for returning it, and the caller will put it when + * it's done with the root. + */ + btrfs_grab_fs_root(root); ret = btrfs_insert_fs_root(fs_info, root); if (ret) { + btrfs_put_fs_root(root); if (ret == -EEXIST) { btrfs_free_fs_root(root); goto again; @@ -3207,13 +3221,6 @@ int __cold open_ctree(struct super_block *sb, goto fail_qgroup; } - if (!btrfs_grab_fs_root(fs_info->fs_root)) { - fs_info->fs_root = NULL; - err = -ENOENT; - btrfs_warn(fs_info, "failed to grab a ref on the fs tree"); - goto fail_qgroup; - } - if (sb_rdonly(sb)) return 0; -- cgit From 141386e1a5d641d6d89a7794f806a31945ec1e9f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:57 -0500 Subject: btrfs: free more things in btrfs_free_fs_info Things like the percpu_counters, the mapping_tree, and the csum hash can all be freed at btrfs_free_fs_info time, since the helpers all check if the structure has been initialized already. This significantly cleans up the error cases in open_ctree. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 60 ++++++++++++++++++++++-------------------------------- 1 file changed, 24 insertions(+), 36 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a0c67200fb2a..1f2661369738 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -98,6 +98,12 @@ void __cold btrfs_end_io_wq_exit(void) kmem_cache_destroy(btrfs_end_io_wq_cache); } +static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) +{ + if (fs_info->csum_shash) + crypto_free_shash(fs_info->csum_shash); +} + /* * async submit bios are used to offload expensive checksumming * onto the worker threads. They checksum file and metadata bios @@ -1527,6 +1533,13 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { + percpu_counter_destroy(&fs_info->dirty_metadata_bytes); + percpu_counter_destroy(&fs_info->delalloc_bytes); + percpu_counter_destroy(&fs_info->dio_bytes); + percpu_counter_destroy(&fs_info->dev_replace.bio_counter); + btrfs_free_csum_hash(fs_info); + btrfs_free_stripe_hash_table(fs_info); + btrfs_free_ref_cache(fs_info); kfree(fs_info->balance_ctl); kfree(fs_info->delayed_root); btrfs_put_fs_root(fs_info->extent_root); @@ -2207,11 +2220,6 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) return 0; } -static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) -{ - crypto_free_shash(fs_info->csum_shash); -} - static int btrfs_replay_log(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { @@ -2688,7 +2696,7 @@ int __cold open_ctree(struct super_block *sb, ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) { err = ret; - goto fail_dio_bytes; + goto fail_srcu; } fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); @@ -2696,14 +2704,14 @@ int __cold open_ctree(struct super_block *sb, ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); if (ret) { err = ret; - goto fail_dirty_metadata_bytes; + goto fail_srcu; } ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0, GFP_KERNEL); if (ret) { err = ret; - goto fail_delalloc_bytes; + goto fail_srcu; } INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); @@ -2770,7 +2778,7 @@ int __cold open_ctree(struct super_block *sb, fs_info->btree_inode = new_inode(sb); if (!fs_info->btree_inode) { err = -ENOMEM; - goto fail_bio_counter; + goto fail_srcu; } mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -2883,7 +2891,7 @@ int __cold open_ctree(struct super_block *sb, btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; brelse(bh); - goto fail_csum; + goto fail_alloc; } /* @@ -2920,11 +2928,11 @@ int __cold open_ctree(struct super_block *sb, if (ret) { btrfs_err(fs_info, "superblock contains fatal errors"); err = -EINVAL; - goto fail_csum; + goto fail_alloc; } if (!btrfs_super_root(disk_super)) - goto fail_csum; + goto fail_alloc; /* check FS state, whether FS is broken. */ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) @@ -2939,7 +2947,7 @@ int __cold open_ctree(struct super_block *sb, ret = btrfs_parse_options(fs_info, options, sb->s_flags); if (ret) { err = ret; - goto fail_csum; + goto fail_alloc; } features = btrfs_super_incompat_flags(disk_super) & @@ -2949,7 +2957,7 @@ int __cold open_ctree(struct super_block *sb, "cannot mount because of unsupported optional features (%llx)", features); err = -EINVAL; - goto fail_csum; + goto fail_alloc; } features = btrfs_super_incompat_flags(disk_super); @@ -2993,7 +3001,7 @@ int __cold open_ctree(struct super_block *sb, btrfs_err(fs_info, "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", nodesize, sectorsize); - goto fail_csum; + goto fail_alloc; } /* @@ -3009,7 +3017,7 @@ int __cold open_ctree(struct super_block *sb, "cannot mount read-write because of unsupported optional features (%llx)", features); err = -EINVAL; - goto fail_csum; + goto fail_alloc; } ret = btrfs_init_workqueues(fs_info, fs_devices); @@ -3346,25 +3354,14 @@ fail_tree_roots: fail_sb_buffer: btrfs_stop_all_workers(fs_info); btrfs_free_block_groups(fs_info); -fail_csum: - btrfs_free_csum_hash(fs_info); fail_alloc: fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); iput(fs_info->btree_inode); -fail_bio_counter: - percpu_counter_destroy(&fs_info->dev_replace.bio_counter); -fail_delalloc_bytes: - percpu_counter_destroy(&fs_info->delalloc_bytes); -fail_dirty_metadata_bytes: - percpu_counter_destroy(&fs_info->dirty_metadata_bytes); -fail_dio_bytes: - percpu_counter_destroy(&fs_info->dio_bytes); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: - btrfs_free_stripe_hash_table(fs_info); btrfs_close_devices(fs_info->fs_devices); return err; } @@ -4071,16 +4068,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_mapping_tree_free(&fs_info->mapping_tree); btrfs_close_devices(fs_info->fs_devices); - - percpu_counter_destroy(&fs_info->dirty_metadata_bytes); - percpu_counter_destroy(&fs_info->delalloc_bytes); - percpu_counter_destroy(&fs_info->dio_bytes); - percpu_counter_destroy(&fs_info->dev_replace.bio_counter); cleanup_srcu_struct(&fs_info->subvol_srcu); - - btrfs_free_csum_hash(fs_info); - btrfs_free_stripe_hash_table(fs_info); - btrfs_free_ref_cache(fs_info); } int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, -- cgit From ae18c37ad5a12a73d119dc1d7b30199574a41bf3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:58 -0500 Subject: btrfs: move fs_info init work into it's own helper function open_ctree mixes initialization of fs stuff and fs_info stuff, which makes it confusing when doing things like adding the root leak detection. Make a separate function that inits all the static structures inside of the fs_info needed for the fs to operate, and then call that before we start setting up the fs_info to be mounted. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 165 +++++++++++++++++++++++++++-------------------------- 1 file changed, 84 insertions(+), 81 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1f2661369738..0bc6bc2c10f0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2649,70 +2649,9 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) return ret; } -int __cold open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options) +static int init_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) { - u32 sectorsize; - u32 nodesize; - u32 stripesize; - u64 generation; - u64 features; - u16 csum_type; - struct btrfs_key location; - struct buffer_head *bh; - struct btrfs_super_block *disk_super; - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *tree_root; - struct btrfs_root *chunk_root; int ret; - int err = -EINVAL; - int clear_free_space_tree = 0; - int level; - - tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, - GFP_KERNEL); - fs_info->tree_root = tree_root; - chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID, - GFP_KERNEL); - fs_info->chunk_root = chunk_root; - if (!tree_root || !chunk_root) { - err = -ENOMEM; - goto fail; - } - - ret = init_srcu_struct(&fs_info->subvol_srcu); - if (ret) { - err = ret; - goto fail; - } - - ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL); - if (ret) { - err = ret; - goto fail_srcu; - } - - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); - if (ret) { - err = ret; - goto fail_srcu; - } - fs_info->dirty_metadata_batch = PAGE_SIZE * - (1 + ilog2(nr_cpu_ids)); - - ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); - if (ret) { - err = ret; - goto fail_srcu; - } - - ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0, - GFP_KERNEL); - if (ret) { - err = ret; - goto fail_srcu; - } INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); @@ -2775,21 +2714,6 @@ int __cold open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->ordered_roots); spin_lock_init(&fs_info->ordered_root_lock); - fs_info->btree_inode = new_inode(sb); - if (!fs_info->btree_inode) { - err = -ENOMEM; - goto fail_srcu; - } - mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); - - fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), - GFP_KERNEL); - if (!fs_info->delayed_root) { - err = -ENOMEM; - goto fail_iput; - } - btrfs_init_delayed_root(fs_info->delayed_root); - btrfs_init_scrub(fs_info); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY fs_info->check_integrity_print_mask = 0; @@ -2800,8 +2724,6 @@ int __cold open_ctree(struct super_block *sb, sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); - btrfs_init_btree_inode(fs_info); - spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; fs_info->first_logical_byte = (u64)-1; @@ -2847,12 +2769,94 @@ int __cold open_ctree(struct super_block *sb, fs_info->send_in_progress = 0; + ret = init_srcu_struct(&fs_info->subvol_srcu); + if (ret) + return ret; + + ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL); + if (ret) + goto fail; + + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); + if (ret) + goto fail; + + fs_info->dirty_metadata_batch = PAGE_SIZE * + (1 + ilog2(nr_cpu_ids)); + + ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); + if (ret) + goto fail; + + ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0, + GFP_KERNEL); + if (ret) + goto fail; + + fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), + GFP_KERNEL); + if (!fs_info->delayed_root) { + ret = -ENOMEM; + goto fail; + } + btrfs_init_delayed_root(fs_info->delayed_root); + ret = btrfs_alloc_stripe_hash_table(fs_info); + if (ret) + goto fail; + + return 0; +fail: + cleanup_srcu_struct(&fs_info->subvol_srcu); + return ret; +} + +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, + char *options) +{ + u32 sectorsize; + u32 nodesize; + u32 stripesize; + u64 generation; + u64 features; + u16 csum_type; + struct btrfs_key location; + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + int ret; + int err = -EINVAL; + int clear_free_space_tree = 0; + int level; + + ret = init_fs_info(fs_info, sb); if (ret) { err = ret; - goto fail_alloc; + goto fail; } + /* These need to be init'ed before we start creating inodes and such. */ + tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, + GFP_KERNEL); + fs_info->tree_root = tree_root; + chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID, + GFP_KERNEL); + fs_info->chunk_root = chunk_root; + if (!tree_root || !chunk_root) { + err = -ENOMEM; + goto fail_srcu; + } + + fs_info->btree_inode = new_inode(sb); + if (!fs_info->btree_inode) { + err = -ENOMEM; + goto fail_srcu; + } + mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); + btrfs_init_btree_inode(fs_info); + invalidate_bdev(fs_devices->latest_bdev); /* @@ -3355,7 +3359,6 @@ fail_sb_buffer: btrfs_stop_all_workers(fs_info); btrfs_free_block_groups(fs_info); fail_alloc: -fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); iput(fs_info->btree_inode); -- cgit From 8260edba67a2e6bd5e709d32188e23aa22cb4a38 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:32:59 -0500 Subject: btrfs: make the init of static elements in fs_info separate In adding things like eb leak checking and root leak checking there were a lot of weird corner cases that come from the fact that 1) We do not init the fs_info until we get to open_ctree time in the normal case and 2) The test infrastructure half-init's the fs_info for things that it needs. This makes it really annoying to make changes because you have to add init in two different places, have special cases for testing fs_info's that may not have certain things initialized, and cases for fs_info's that didn't make it to open_ctree and thus are not fully set up. Fix this by extracting out the non-allocating init of the fs info into it's own public function and use that to make sure we're all getting consistent views of an allocated fs_info. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0bc6bc2c10f0..3e4df0c6a663 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2649,10 +2649,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) return ret; } -static int init_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) +void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { - int ret; - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); @@ -2695,7 +2693,6 @@ static int init_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) atomic_set(&fs_info->reada_works_cnt, 0); atomic_set(&fs_info->nr_delayed_iputs, 0); atomic64_set(&fs_info->tree_mod_seq, 0); - fs_info->sb = sb; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; @@ -2721,9 +2718,6 @@ static int init_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); - sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; - sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); - spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; fs_info->first_logical_byte = (u64)-1; @@ -2768,6 +2762,15 @@ static int init_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) fs_info->swapfile_pins = RB_ROOT; fs_info->send_in_progress = 0; +} + +static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) +{ + int ret; + + fs_info->sb = sb; + sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; + sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); ret = init_srcu_struct(&fs_info->subvol_srcu); if (ret) @@ -2831,7 +2834,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device int clear_free_space_tree = 0; int level; - ret = init_fs_info(fs_info, sb); + ret = init_mount_fs_info(fs_info, sb); if (ret) { err = ret; goto fail; -- cgit From bd647ce385ec110fe7796267b6555873e48e44eb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:33:00 -0500 Subject: btrfs: add a leak check for roots Now that we're going to start relying on getting ref counting right for roots, add a list to track allocated roots and print out any roots that aren't freed up at free_fs_info time. Hide this behind CONFIG_BTRFS_DEBUG because this will just be used for developers to verify they aren't breaking things. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3e4df0c6a663..c2838a48327f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1202,6 +1202,12 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, spin_lock_init(&root->root_item_lock); btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); +#ifdef CONFIG_BTRFS_DEBUG + INIT_LIST_HEAD(&root->leak_list); + spin_lock(&fs_info->fs_roots_radix_lock); + list_add_tail(&root->leak_list, &fs_info->allocated_roots); + spin_unlock(&fs_info->fs_roots_radix_lock); +#endif } static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, @@ -1531,6 +1537,24 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, return ret; } +void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) +{ +#ifdef CONFIG_BTRFS_DEBUG + struct btrfs_root *root; + + while (!list_empty(&fs_info->allocated_roots)) { + root = list_first_entry(&fs_info->allocated_roots, + struct btrfs_root, leak_list); + btrfs_err(fs_info, "leaked root %llu-%llu refcount %d", + root->root_key.objectid, root->root_key.offset, + refcount_read(&root->refs)); + while (refcount_read(&root->refs) > 1) + btrfs_put_fs_root(root); + btrfs_put_fs_root(root); + } +#endif +} + void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { percpu_counter_destroy(&fs_info->dirty_metadata_bytes); @@ -1551,6 +1575,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_put_fs_root(fs_info->uuid_root); btrfs_put_fs_root(fs_info->free_space_root); btrfs_put_fs_root(fs_info->fs_root); + btrfs_check_leaked_roots(fs_info); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); kvfree(fs_info); @@ -2677,6 +2702,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_LIST_HEAD(&fs_info->unused_bgs); +#ifdef CONFIG_BTRFS_DEBUG + INIT_LIST_HEAD(&fs_info->allocated_roots); +#endif extent_map_tree_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); -- cgit From 0024652895e3479cd0d372f63b57d9581a0bdd38 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jan 2020 09:33:01 -0500 Subject: btrfs: rename btrfs_put_fs_root and btrfs_grab_fs_root We are now using these for all roots, rename them to btrfs_put_root() and btrfs_grab_root(); Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 78 +++++++++++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 39 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c2838a48327f..f859a28255dd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1309,7 +1309,7 @@ fail: free_extent_buffer(root->commit_root); free_extent_buffer(leaf); } - btrfs_put_fs_root(root); + btrfs_put_root(root); return ERR_PTR(ret); } @@ -1340,7 +1340,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); if (IS_ERR(leaf)) { - btrfs_put_fs_root(root); + btrfs_put_root(root); return ERR_CAST(leaf); } @@ -1443,7 +1443,7 @@ out: return root; find_fail: - btrfs_put_fs_root(root); + btrfs_put_root(root); alloc_fail: root = ERR_PTR(ret); goto out; @@ -1509,7 +1509,7 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)root_id); if (root) - root = btrfs_grab_fs_root(root); + root = btrfs_grab_root(root); spin_unlock(&fs_info->fs_roots_radix_lock); return root; } @@ -1528,7 +1528,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, (unsigned long)root->root_key.objectid, root); if (ret == 0) { - btrfs_grab_fs_root(root); + btrfs_grab_root(root); set_bit(BTRFS_ROOT_IN_RADIX, &root->state); } spin_unlock(&fs_info->fs_roots_radix_lock); @@ -1549,8 +1549,8 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) root->root_key.objectid, root->root_key.offset, refcount_read(&root->refs)); while (refcount_read(&root->refs) > 1) - btrfs_put_fs_root(root); - btrfs_put_fs_root(root); + btrfs_put_root(root); + btrfs_put_root(root); } #endif } @@ -1566,15 +1566,15 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_free_ref_cache(fs_info); kfree(fs_info->balance_ctl); kfree(fs_info->delayed_root); - btrfs_put_fs_root(fs_info->extent_root); - btrfs_put_fs_root(fs_info->tree_root); - btrfs_put_fs_root(fs_info->chunk_root); - btrfs_put_fs_root(fs_info->dev_root); - btrfs_put_fs_root(fs_info->csum_root); - btrfs_put_fs_root(fs_info->quota_root); - btrfs_put_fs_root(fs_info->uuid_root); - btrfs_put_fs_root(fs_info->free_space_root); - btrfs_put_fs_root(fs_info->fs_root); + btrfs_put_root(fs_info->extent_root); + btrfs_put_root(fs_info->tree_root); + btrfs_put_root(fs_info->chunk_root); + btrfs_put_root(fs_info->dev_root); + btrfs_put_root(fs_info->csum_root); + btrfs_put_root(fs_info->quota_root); + btrfs_put_root(fs_info->uuid_root); + btrfs_put_root(fs_info->free_space_root); + btrfs_put_root(fs_info->fs_root); btrfs_check_leaked_roots(fs_info); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); @@ -1592,29 +1592,29 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, int ret; if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) - return btrfs_grab_fs_root(fs_info->tree_root); + return btrfs_grab_root(fs_info->tree_root); if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) - return btrfs_grab_fs_root(fs_info->extent_root); + return btrfs_grab_root(fs_info->extent_root); if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) - return btrfs_grab_fs_root(fs_info->chunk_root); + return btrfs_grab_root(fs_info->chunk_root); if (location->objectid == BTRFS_DEV_TREE_OBJECTID) - return btrfs_grab_fs_root(fs_info->dev_root); + return btrfs_grab_root(fs_info->dev_root); if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) - return btrfs_grab_fs_root(fs_info->csum_root); + return btrfs_grab_root(fs_info->csum_root); if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID) - return btrfs_grab_fs_root(fs_info->quota_root) ? + return btrfs_grab_root(fs_info->quota_root) ? fs_info->quota_root : ERR_PTR(-ENOENT); if (location->objectid == BTRFS_UUID_TREE_OBJECTID) - return btrfs_grab_fs_root(fs_info->uuid_root) ? + return btrfs_grab_root(fs_info->uuid_root) ? fs_info->uuid_root : ERR_PTR(-ENOENT); if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) - return btrfs_grab_fs_root(fs_info->free_space_root) ? + return btrfs_grab_root(fs_info->free_space_root) ? fs_info->free_space_root : ERR_PTR(-ENOENT); again: root = btrfs_lookup_fs_root(fs_info, location->objectid); if (root) { if (check_ref && btrfs_root_refs(&root->root_item) == 0) { - btrfs_put_fs_root(root); + btrfs_put_root(root); return ERR_PTR(-ENOENT); } return root; @@ -1657,10 +1657,10 @@ again: * we have the third for returning it, and the caller will put it when * it's done with the root. */ - btrfs_grab_fs_root(root); + btrfs_grab_root(root); ret = btrfs_insert_fs_root(fs_info, root); if (ret) { - btrfs_put_fs_root(root); + btrfs_put_root(root); if (ret == -EEXIST) { btrfs_free_fs_root(root); goto again; @@ -2062,7 +2062,7 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) } else { free_extent_buffer(gang[0]->node); free_extent_buffer(gang[0]->commit_root); - btrfs_put_fs_root(gang[0]); + btrfs_put_root(gang[0]); } } @@ -2270,12 +2270,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); - btrfs_put_fs_root(log_tree_root); + btrfs_put_root(log_tree_root); return ret; } else if (!extent_buffer_uptodate(log_tree_root->node)) { btrfs_err(fs_info, "failed to read log tree"); free_extent_buffer(log_tree_root->node); - btrfs_put_fs_root(log_tree_root); + btrfs_put_root(log_tree_root); return -EIO; } /* returns with log_tree_root freed on success */ @@ -2284,7 +2284,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, btrfs_handle_fs_error(fs_info, ret, "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); - btrfs_put_fs_root(log_tree_root); + btrfs_put_root(log_tree_root); return ret; } @@ -3884,7 +3884,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, radix_tree_delete(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid); if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) - btrfs_put_fs_root(root); + btrfs_put_root(root); spin_unlock(&fs_info->fs_roots_radix_lock); if (btrfs_root_refs(&root->root_item) == 0) @@ -3895,7 +3895,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, if (root->reloc_root) { free_extent_buffer(root->reloc_root->node); free_extent_buffer(root->reloc_root->commit_root); - btrfs_put_fs_root(root->reloc_root); + btrfs_put_root(root->reloc_root); root->reloc_root = NULL; } } @@ -3919,7 +3919,7 @@ void btrfs_free_fs_root(struct btrfs_root *root) free_extent_buffer(root->commit_root); kfree(root->free_ino_ctl); kfree(root->free_ino_pinned); - btrfs_put_fs_root(root); + btrfs_put_root(root); } int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) @@ -3949,7 +3949,7 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) continue; } /* grab all the search result for later use */ - gang[i] = btrfs_grab_fs_root(gang[i]); + gang[i] = btrfs_grab_root(gang[i]); } srcu_read_unlock(&fs_info->subvol_srcu, index); @@ -3960,7 +3960,7 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) err = btrfs_orphan_cleanup(gang[i]); if (err) break; - btrfs_put_fs_root(gang[i]); + btrfs_put_root(gang[i]); } root_objectid++; } @@ -3968,7 +3968,7 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) /* release the uncleaned roots due to error */ for (; i < ret; i++) { if (gang[i]) - btrfs_put_fs_root(gang[i]); + btrfs_put_root(gang[i]); } return err; } @@ -4368,12 +4368,12 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) while (!list_empty(&splice)) { root = list_first_entry(&splice, struct btrfs_root, delalloc_root); - root = btrfs_grab_fs_root(root); + root = btrfs_grab_root(root); BUG_ON(!root); spin_unlock(&fs_info->delalloc_root_lock); btrfs_destroy_delalloc_inodes(root); - btrfs_put_fs_root(root); + btrfs_put_root(root); spin_lock(&fs_info->delalloc_root_lock); } -- cgit From 71ad38b44eaa758c70c34ca97a5c3848d6b67e03 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2020 19:09:35 +0100 Subject: btrfs: sink argument tree to extent_read_full_page The tree pointer can be safely read from the page's inode, use it and drop the redundant argument. Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f859a28255dd..49daf6c946df 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -978,9 +978,7 @@ static int btree_writepages(struct address_space *mapping, static int btree_readpage(struct file *file, struct page *page) { - struct extent_io_tree *tree; - tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btree_get_extent, 0); + return extent_read_full_page(page, btree_get_extent, 0); } static int btree_releasepage(struct page *page, gfp_t gfp_flags) -- cgit From f603bb94abbed5b5542cfb514ddfa3c5dda96bcc Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 20 Jan 2020 16:09:08 +0200 Subject: btrfs: Perform pinned cleanup directly in btrfs_destroy_delayed_refs Having btrfs_destroy_delayed_refs call btrfs_pin_extent is problematic for making pinned extents tracking per-transaction since btrfs_trans_handle cannot be passed to btrfs_pin_extent in this context. Additionally delayed refs heads pinned in btrfs_destroy_delayed_refs are going to be handled very closely, in btrfs_destroy_pinned_extent. To enable btrfs_pin_extent to take btrfs_trans_handle simply open code it in btrfs_destroy_delayed_refs and call btrfs_error_unpin_extent_range on the range. This enables us to do less work in btrfs_destroy_pinned_extent and leaves btrfs_pin_extent being called in contexts which have a valid btrfs_trans_handle. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 49daf6c946df..194c98a61095 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -42,6 +42,7 @@ #include "ref-verify.h" #include "block-group.h" #include "discard.h" +#include "space-info.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -4308,9 +4309,30 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); - if (pin_bytes) - btrfs_pin_extent(fs_info, head->bytenr, - head->num_bytes, 1); + if (pin_bytes) { + struct btrfs_block_group *cache; + + cache = btrfs_lookup_block_group(fs_info, head->bytenr); + BUG_ON(!cache); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned += head->num_bytes; + btrfs_space_info_update_bytes_pinned(fs_info, + cache->space_info, head->num_bytes); + cache->reserved -= head->num_bytes; + cache->space_info->bytes_reserved -= head->num_bytes; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + percpu_counter_add_batch( + &cache->space_info->total_bytes_pinned, + head->num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); + + btrfs_put_block_group(cache); + + btrfs_error_unpin_extent_range(fs_info, head->bytenr, + head->bytenr + head->num_bytes - 1); + } btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); btrfs_put_delayed_ref_head(head); cond_resched(); -- cgit From fe119a6eeb670585e29dbe3932e00ad29ae8f5f9 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 20 Jan 2020 16:09:18 +0200 Subject: btrfs: switch to per-transaction pinned extents This commit flips the switch to start tracking/processing pinned extents on a per-transaction basis. It mostly replaces all references from btrfs_fs_info::(pinned_extents|freed_extents[]) to btrfs_transaction::pinned_extents. Two notable modifications that warrant explicit mention are changing clean_pinned_extents to get a reference to the previously running transaction. The other one is removal of call to btrfs_destroy_pinned_extent since transactions are going to be cleaned in btrfs_cleanup_one_transaction. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 194c98a61095..e1e111c8b08b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2075,10 +2075,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) btrfs_drop_and_free_fs_root(fs_info, gang[i]); } - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) btrfs_free_log_root_tree(NULL, fs_info); - btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents); - } } static void btrfs_init_scrub(struct btrfs_fs_info *fs_info) @@ -2749,11 +2747,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->block_group_cache_tree = RB_ROOT; fs_info->first_logical_byte = (u64)-1; - extent_io_tree_init(fs_info, &fs_info->freed_extents[0], - IO_TREE_FS_INFO_FREED_EXTENTS0, NULL); - extent_io_tree_init(fs_info, &fs_info->freed_extents[1], - IO_TREE_FS_INFO_FREED_EXTENTS1, NULL); - fs_info->pinned_extents = &fs_info->freed_extents[0]; + extent_io_tree_init(fs_info, &fs_info->excluded_extents, + IO_TREE_FS_EXCLUDED_EXTENTS, NULL); set_bit(BTRFS_FS_BARRIER, &fs_info->flags); mutex_init(&fs_info->ordered_operations_mutex); @@ -4434,16 +4429,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, } static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, - struct extent_io_tree *pinned_extents) + struct extent_io_tree *unpin) { - struct extent_io_tree *unpin; u64 start; u64 end; int ret; - bool loop = true; - unpin = pinned_extents; -again: while (1) { struct extent_state *cached_state = NULL; @@ -4468,15 +4459,6 @@ again: cond_resched(); } - if (loop) { - if (unpin == &fs_info->freed_extents[0]) - unpin = &fs_info->freed_extents[1]; - else - unpin = &fs_info->freed_extents[0]; - loop = false; - goto again; - } - return 0; } @@ -4567,8 +4549,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages, EXTENT_DIRTY); - btrfs_destroy_pinned_extent(fs_info, - fs_info->pinned_extents); + btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); @@ -4620,7 +4601,6 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) btrfs_destroy_all_ordered_extents(fs_info); btrfs_destroy_delayed_inodes(fs_info); btrfs_assert_delayed_root_empty(fs_info); - btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents); btrfs_destroy_all_delalloc_inodes(fs_info); mutex_unlock(&fs_info->transaction_kthread_mutex); -- cgit From 8f32380d3f2904ad0e98574ae17f2018f6dd277e Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 14 Feb 2020 00:24:32 +0900 Subject: btrfs: use the page cache for super block reading Super-block reading in BTRFS is done using buffer_heads. Buffer_heads have some drawbacks, like not being able to propagate errors from the lower layers. Directly use the page cache for reading the super blocks from disk or invalidating an on-disk super block. We have to use the page cache so to avoid races between mkfs and udev. See also 6f60cbd3ae44 ("btrfs: access superblock via pagecache in scan_one_device"). This patch unwraps the buffer head API and does not change the way the super block is actually read. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 74 +++++++++++++++++++++++------------------------------- 1 file changed, 31 insertions(+), 43 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e1e111c8b08b..ac9554d71a46 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2846,7 +2846,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device u64 features; u16 csum_type; struct btrfs_key location; - struct buffer_head *bh; struct btrfs_super_block *disk_super; struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *tree_root; @@ -2887,9 +2886,9 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device /* * Read super block and check the signature bytes only */ - bh = btrfs_read_dev_super(fs_devices->latest_bdev); - if (IS_ERR(bh)) { - err = PTR_ERR(bh); + disk_super = btrfs_read_dev_super(fs_devices->latest_bdev); + if (IS_ERR(disk_super)) { + err = PTR_ERR(disk_super); goto fail_alloc; } @@ -2897,18 +2896,19 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * Verify the type first, if that or the the checksum value are * corrupted, we'll find out */ - csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data); + csum_type = btrfs_super_csum_type(disk_super); if (!btrfs_supported_super_csum(csum_type)) { btrfs_err(fs_info, "unsupported checksum algorithm: %u", csum_type); err = -EINVAL; - brelse(bh); + btrfs_release_disk_super(disk_super); goto fail_alloc; } ret = btrfs_init_csum_hash(fs_info, csum_type); if (ret) { err = ret; + btrfs_release_disk_super(disk_super); goto fail_alloc; } @@ -2916,10 +2916,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * We want to check superblock checksum, the type is stored inside. * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). */ - if (btrfs_check_super_csum(fs_info, bh->b_data)) { + if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) { btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; - brelse(bh); + btrfs_release_disk_super(disk_super); goto fail_alloc; } @@ -2928,8 +2928,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * following bytes up to INFO_SIZE, the checksum is calculated from * the whole block of INFO_SIZE */ - memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); - brelse(bh); + memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy)); + btrfs_release_disk_super(disk_super); disk_super = fs_info->super_copy; @@ -3416,45 +3416,38 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) put_bh(bh); } -int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, - struct buffer_head **bh_ret) +struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, + int copy_num) { - struct buffer_head *bh; struct btrfs_super_block *super; + struct page *page; u64 bytenr; + struct address_space *mapping = bdev->bd_inode->i_mapping; bytenr = btrfs_sb_offset(copy_num); if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) - return -EINVAL; + return ERR_PTR(-EINVAL); - bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE); - /* - * If we fail to read from the underlying devices, as of now - * the best option we have is to mark it EIO. - */ - if (!bh) - return -EIO; + page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); + if (IS_ERR(page)) + return ERR_CAST(page); - super = (struct btrfs_super_block *)bh->b_data; + super = page_address(page); if (btrfs_super_bytenr(super) != bytenr || btrfs_super_magic(super) != BTRFS_MAGIC) { - brelse(bh); - return -EINVAL; + btrfs_release_disk_super(super); + return ERR_PTR(-EINVAL); } - *bh_ret = bh; - return 0; + return super; } -struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) +struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) { - struct buffer_head *bh; - struct buffer_head *latest = NULL; - struct btrfs_super_block *super; + struct btrfs_super_block *super, *latest = NULL; int i; u64 transid = 0; - int ret = -EINVAL; /* we would like to check all the supers, but that would make * a btrfs mount succeed after a mkfs from a different FS. @@ -3462,25 +3455,20 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) * later supers, using BTRFS_SUPER_MIRROR_MAX instead */ for (i = 0; i < 1; i++) { - ret = btrfs_read_dev_one_super(bdev, i, &bh); - if (ret) + super = btrfs_read_dev_one_super(bdev, i); + if (IS_ERR(super)) continue; - super = (struct btrfs_super_block *)bh->b_data; - if (!latest || btrfs_super_generation(super) > transid) { - brelse(latest); - latest = bh; + if (latest) + btrfs_release_disk_super(super); + + latest = super; transid = btrfs_super_generation(super); - } else { - brelse(bh); } } - if (!latest) - return ERR_PTR(ret); - - return latest; + return super; } /* -- cgit From 314b6dd0eebfa99621202f2be0b5f76e01d849e1 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 14 Feb 2020 00:24:33 +0900 Subject: btrfs: use bios instead of buffer_heads from super block writeout Similar to the superblock read path, change the write path to using bios and pages instead of buffer_heads. This allows us to skip over the buffer_head code, for writing the superblock to disk. This is based on a patch originally authored by Nikolay Borisov. Co-developed-by: Nikolay Borisov Signed-off-by: Nikolay Borisov Reviewed-by: Nikolay Borisov Reviewed-by: Josef Bacik Reviewed-by: Christoph Hellwig Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 127 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 73 insertions(+), 54 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ac9554d71a46..756bf2ab64cd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -3395,25 +3394,34 @@ fail: } ALLOW_ERROR_INJECTION(open_ctree, ERRNO); -static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) +static void btrfs_end_super_write(struct bio *bio) { - if (uptodate) { - set_buffer_uptodate(bh); - } else { - struct btrfs_device *device = (struct btrfs_device *) - bh->b_private; - - btrfs_warn_rl_in_rcu(device->fs_info, - "lost page write due to IO error on %s", - rcu_str_deref(device->name)); - /* note, we don't set_buffer_write_io_error because we have - * our own ways of dealing with the IO errors - */ - clear_buffer_uptodate(bh); - btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS); + struct btrfs_device *device = bio->bi_private; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + struct page *page; + + bio_for_each_segment_all(bvec, bio, iter_all) { + page = bvec->bv_page; + + if (bio->bi_status) { + btrfs_warn_rl_in_rcu(device->fs_info, + "lost page write due to IO error on %s (%d)", + rcu_str_deref(device->name), + blk_status_to_errno(bio->bi_status)); + ClearPageUptodate(page); + SetPageError(page); + btrfs_dev_stat_inc_and_print(device, + BTRFS_DEV_STAT_WRITE_ERRS); + } else { + SetPageUptodate(page); + } + + put_page(page); + unlock_page(page); } - unlock_buffer(bh); - put_bh(bh); + + bio_put(bio); } struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, @@ -3473,25 +3481,23 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) /* * Write superblock @sb to the @device. Do not wait for completion, all the - * buffer heads we write are pinned. + * pages we use for writing are locked. * * Write @max_mirrors copies of the superblock, where 0 means default that fit * the expected device size at commit time. Note that max_mirrors must be * same for write and wait phases. * - * Return number of errors when buffer head is not found or submission fails. + * Return number of errors when page is not found or submission fails. */ static int write_dev_supers(struct btrfs_device *device, struct btrfs_super_block *sb, int max_mirrors) { struct btrfs_fs_info *fs_info = device->fs_info; + struct address_space *mapping = device->bdev->bd_inode->i_mapping; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - struct buffer_head *bh; int i; - int ret; int errors = 0; u64 bytenr; - int op_flags; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; @@ -3499,6 +3505,10 @@ static int write_dev_supers(struct btrfs_device *device, shash->tfm = fs_info->csum_shash; for (i = 0; i < max_mirrors; i++) { + struct page *page; + struct bio *bio; + struct btrfs_super_block *disk_super; + bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->commit_total_bytes) @@ -3511,37 +3521,45 @@ static int write_dev_supers(struct btrfs_device *device, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); crypto_shash_final(shash, sb->csum); - /* One reference for us, and we leave it for the caller */ - bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, - BTRFS_SUPER_INFO_SIZE); - if (!bh) { + page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, + GFP_NOFS); + if (!page) { btrfs_err(device->fs_info, - "couldn't get super buffer head for bytenr %llu", + "couldn't get super block page for bytenr %llu", bytenr); errors++; continue; } - memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); + /* Bump the refcount for wait_dev_supers() */ + get_page(page); - /* one reference for submit_bh */ - get_bh(bh); + disk_super = page_address(page); + memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE); - set_buffer_uptodate(bh); - lock_buffer(bh); - bh->b_end_io = btrfs_end_buffer_write_sync; - bh->b_private = device; + /* + * Directly use bios here instead of relying on the page cache + * to do I/O, so we don't lose the ability to do integrity + * checking. + */ + bio = bio_alloc(GFP_NOFS, 1); + bio_set_dev(bio, device->bdev); + bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; + bio->bi_private = device; + bio->bi_end_io = btrfs_end_super_write; + __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE, + offset_in_page(bytenr)); /* - * we fua the first super. The others we allow - * to go down lazy. + * We FUA only the first super block. The others we allow to + * go down lazy and there's a short window where the on-disk + * copies might still contain the older version. */ - op_flags = REQ_SYNC | REQ_META | REQ_PRIO; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO; if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) - op_flags |= REQ_FUA; - ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh); - if (ret) - errors++; + bio->bi_opf |= REQ_FUA; + + btrfsic_submit_bio(bio); } return errors < i ? 0 : -1; } @@ -3550,12 +3568,11 @@ static int write_dev_supers(struct btrfs_device *device, * Wait for write completion of superblocks done by write_dev_supers, * @max_mirrors same for write and wait phases. * - * Return number of errors when buffer head is not found or not marked up to + * Return number of errors when page is not found or not marked up to * date. */ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) { - struct buffer_head *bh; int i; int errors = 0; bool primary_failed = false; @@ -3565,32 +3582,34 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) max_mirrors = BTRFS_SUPER_MIRROR_MAX; for (i = 0; i < max_mirrors; i++) { + struct page *page; + bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->commit_total_bytes) break; - bh = __find_get_block(device->bdev, - bytenr / BTRFS_BDEV_BLOCKSIZE, - BTRFS_SUPER_INFO_SIZE); - if (!bh) { + page = find_get_page(device->bdev->bd_inode->i_mapping, + bytenr >> PAGE_SHIFT); + if (!page) { errors++; if (i == 0) primary_failed = true; continue; } - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { + /* Page is submitted locked and unlocked once the IO completes */ + wait_on_page_locked(page); + if (PageError(page)) { errors++; if (i == 0) primary_failed = true; } - /* drop our reference */ - brelse(bh); + /* Drop our reference */ + put_page(page); - /* drop the reference from the writing run */ - brelse(bh); + /* Drop the reference from the writing run */ + put_page(page); } /* log error, force error return */ -- cgit From 97f4dd09dad05943a9c4184ace47258ed09e8e74 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 18 Feb 2020 16:56:08 +0200 Subject: btrfs: make btrfs_check_uuid_tree private to disk-io.c It's used only during filesystem mount as such it can be made private to disk-io.c file. Also use the occasion to move btrfs_uuid_rescan_kthread as btrfs_check_uuid_tree is its sole caller. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 756bf2ab64cd..074b77041f89 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2835,6 +2835,41 @@ fail: return ret; } +static int btrfs_uuid_rescan_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; + int ret; + + /* + * 1st step is to iterate through the existing UUID tree and + * to delete all entries that contain outdated data. + * 2nd step is to add all missing entries to the UUID tree. + */ + ret = btrfs_uuid_tree_iterate(fs_info); + if (ret < 0) { + btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); + up(&fs_info->uuid_tree_rescan_sem); + return ret; + } + return btrfs_uuid_scan_kthread(data); +} + +static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) +{ + struct task_struct *task; + + down(&fs_info->uuid_tree_rescan_sem); + task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); + if (IS_ERR(task)) { + /* fs_info->update_uuid_tree_gen remains 0 in all error case */ + btrfs_warn(fs_info, "failed to start uuid_rescan task"); + up(&fs_info->uuid_tree_rescan_sem); + return PTR_ERR(task); + } + + return 0; +} + int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) { -- cgit From c94bec2c6190a5d13f59c0f36a660be5e2d85644 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2020 15:05:01 -0500 Subject: btrfs: bail out of uuid tree scanning if we're closing In doing my fsstress+EIO stress testing I started running into issues where umount would get stuck forever because the uuid checker was chewing through the thousands of subvolumes I had created. We shouldn't block umount on this, simply bail if we're unmounting the fs. We need to make sure we don't mark the UUID tree as ok, so we only set that bit if we made it through the whole rescan operation, but otherwise this is completely safe. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 074b77041f89..864ffc6c81c1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2847,7 +2847,9 @@ static int btrfs_uuid_rescan_kthread(void *data) */ ret = btrfs_uuid_tree_iterate(fs_info); if (ret < 0) { - btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); + if (ret != -EINTR) + btrfs_warn(fs_info, "iterating uuid_tree failed %d", + ret); up(&fs_info->uuid_tree_rescan_sem); return ret; } -- cgit From 75ec1db8717a8f0a9d9c8d033e542fdaa7b73898 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2020 15:22:06 -0500 Subject: btrfs: set update the uuid generation as soon as possible In my EIO stress testing I noticed I was getting forced to rescan the uuid tree pretty often, which was weird. This is because my error injection stuff would sometimes inject an error after log replay but before we loaded the UUID tree. If log replay committed the transaction it wouldn't have updated the uuid tree generation, but the tree was valid and didn't change, so there's no reason to not update the generation here. Fix this by setting the BTRFS_FS_UPDATE_UUID_TREE_GEN bit immediately after reading all the fs roots if the uuid tree generation matches the fs generation. Then any transaction commits that happen during mount won't screw up our uuid tree state, forcing us to do needless uuid rescans. Fixes: 70f801754728 ("Btrfs: check UUID tree during mount if required") CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 864ffc6c81c1..770d469e1d9c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3151,6 +3151,18 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (ret) goto fail_tree_roots; + /* + * If we have a uuid root and we're not being told to rescan we need to + * check the generation here so we can set the + * BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the + * transaction during a balance or the log replay without updating the + * uuid generation, and then if we crash we would rescan the uuid tree, + * even though it was perfectly fine. + */ + if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) && + fs_info->generation == btrfs_super_uuid_tree_generation(disk_super)) + set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); + ret = btrfs_verify_dev_extents(fs_info); if (ret) { btrfs_err(fs_info, @@ -3375,8 +3387,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device close_ctree(fs_info); return ret; } - } else { - set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); } set_bit(BTRFS_FS_OPEN, &fs_info->flags); -- cgit From dcc3eb9638c3c927f1597075e851d0a16300a876 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 30 Jan 2020 14:59:45 +0200 Subject: btrfs: convert snapshot/nocow exlcusion to drew lock This patch removes all haphazard code implementing nocow writers exclusion from pending snapshot creation and switches to using the drew lock to ensure this invariant still holds. 'Readers' are snapshot creators from create_snapshot and 'writers' are nocow writers from buffered write path or btrfs_setsize. This locking scheme allows for multiple snapshots to happen while any nocow writers are blocked, since writes to page cache in the nocow path will make snapshots inconsistent. So for performance reasons we'd like to have the ability to run multiple concurrent snapshots and also favors readers in this case. And in case there aren't pending snapshots (which will be the majority of the cases) we rely on the percpu's writers counter to avoid cacheline contention. The main gain from using the drew lock is it's now a lot easier to reason about the guarantees of the locking scheme and whether there is some silent breakage lurking. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 45 ++++++++++----------------------------------- 1 file changed, 10 insertions(+), 35 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 770d469e1d9c..06819c41e4f4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1104,32 +1104,6 @@ void btrfs_clean_tree_block(struct extent_buffer *buf) } } -static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) -{ - struct btrfs_subvolume_writers *writers; - int ret; - - writers = kmalloc(sizeof(*writers), GFP_NOFS); - if (!writers) - return ERR_PTR(-ENOMEM); - - ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS); - if (ret < 0) { - kfree(writers); - return ERR_PTR(ret); - } - - init_waitqueue_head(&writers->wait); - return writers; -} - -static void -btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers) -{ - percpu_counter_destroy(&writers->counter); - kfree(writers); -} - static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { @@ -1178,7 +1152,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); refcount_set(&root->refs, 1); - atomic_set(&root->will_be_snapshotted, 0); atomic_set(&root->snapshot_force_cow, 0); atomic_set(&root->nr_swapfiles, 0); root->log_transid = 0; @@ -1450,7 +1423,7 @@ alloc_fail: static int btrfs_init_fs_root(struct btrfs_root *root) { int ret; - struct btrfs_subvolume_writers *writers; + unsigned int nofs_flag; root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), @@ -1460,12 +1433,15 @@ static int btrfs_init_fs_root(struct btrfs_root *root) goto fail; } - writers = btrfs_alloc_subvolume_writers(); - if (IS_ERR(writers)) { - ret = PTR_ERR(writers); + /* + * We might be called under a transaction (e.g. indirect backref + * resolution) which could deadlock if it triggers memory reclaim + */ + nofs_flag = memalloc_nofs_save(); + ret = btrfs_drew_lock_init(&root->snapshot_lock); + memalloc_nofs_restore(nofs_flag); + if (ret) goto fail; - } - root->subv_writers = writers; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { set_bit(BTRFS_ROOT_REF_COWS, &root->state); @@ -3961,8 +3937,7 @@ void btrfs_free_fs_root(struct btrfs_root *root) WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); if (root->anon_dev) free_anon_bdev(root->anon_dev); - if (root->subv_writers) - btrfs_free_subvolume_writers(root->subv_writers); + btrfs_drew_lock_destroy(&root->snapshot_lock); free_extent_buffer(root->node); free_extent_buffer(root->commit_root); kfree(root->free_ino_ctl); -- cgit From 9a8658e33d8fd45879eebc44178b2a172e76bb47 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 20 Mar 2019 13:15:57 +0100 Subject: btrfs: open code trivial helper btrfs_header_fsid The helper btrfs_header_fsid follows naming convention of other struct accessors but does something compeletly different. As the offsetof calculation is clear in the context of extent buffer operations we can remove it. Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 06819c41e4f4..2e482657a1b4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -541,7 +541,8 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) return -EUCLEAN; ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, - btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0); + offsetof(struct btrfs_header, fsid), + BTRFS_FSID_SIZE) == 0); if (csum_tree_block(eb, result)) return -EINVAL; @@ -571,7 +572,8 @@ static int check_tree_block_fsid(struct extent_buffer *eb) u8 fsid[BTRFS_FSID_SIZE]; int ret = 1; - read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); + read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), + BTRFS_FSID_SIZE); while (fs_devices) { u8 *metadata_uuid; -- cgit From c4ac75419826c7afc997bbe4da07ad6f963da22f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 20 Mar 2019 13:17:13 +0100 Subject: btrfs: open code trivial helper btrfs_header_chunk_tree_uuid The helper btrfs_header_chunk_tree_uuid follows naming convention of other struct accessors but does something compeletly different. As the offsetof calculation is clear in the context of extent buffer operations we can remove it. Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2e482657a1b4..abd7a613c8e6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3106,7 +3106,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device chunk_root->commit_root = btrfs_root_node(chunk_root); read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, - btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE); + offsetof(struct btrfs_header, chunk_tree_uuid), + BTRFS_UUID_SIZE); ret = btrfs_read_chunk_tree(fs_info); if (ret) { -- cgit From b79ce3dddd3f1be7515452adfed633d13e45d806 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 28 Nov 2019 15:34:28 +0100 Subject: btrfs: adjust delayed refs message level The message seems to be for debugging and has little value for users. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index abd7a613c8e6..f109607ff40b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4302,7 +4302,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, spin_lock(&delayed_refs->lock); if (atomic_read(&delayed_refs->num_entries) == 0) { spin_unlock(&delayed_refs->lock); - btrfs_info(fs_info, "delayed_refs has NO entry"); + btrfs_debug(fs_info, "delayed_refs has NO entry"); return ret; } -- cgit From 15b6e8a83e910c71e98c368e51b581135539efdf Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 28 Nov 2019 16:15:04 +0100 Subject: btrfs: reduce pointer intdirections in btree_readpage_end_io_hook All we need to read is checksum size from fs_info superblock, and fs_info is provided by extent buffer so we can get rid of the wild pointer indirections from page/inode/root. Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f109607ff40b..a4493e5dac22 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -604,9 +604,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, u64 found_start; int found_level; struct extent_buffer *eb; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct btrfs_fs_info *fs_info = root->fs_info; - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + struct btrfs_fs_info *fs_info; + u16 csum_size; int ret = 0; u8 result[BTRFS_CSUM_SIZE]; int reads_done; @@ -615,6 +614,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, goto out; eb = (struct extent_buffer *)page->private; + fs_info = eb->fs_info; + csum_size = btrfs_super_csum_size(fs_info->super_copy); /* the pending IO might have been the only thing that kept this buffer * in memory. Make sure we have a ref for all this other checks -- cgit From 807fc790aa804102df75b6ac0c943a4d205fc4b5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 24 Feb 2020 17:37:51 +0200 Subject: btrfs: switch to use new generic UUID API There are new types and helpers that are supposed to be used in new code. As a preparation to get rid of legacy types and API functions do the conversion here. Reviewed-by: Christoph Hellwig Signed-off-by: Andy Shevchenko Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a4493e5dac22..f01f19c76cf5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1223,7 +1223,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_key key; unsigned int nofs_flag; int ret = 0; - uuid_le uuid = NULL_UUID_LE; /* * We're holding a transaction handle, so use a NOFS memory allocation @@ -1262,8 +1261,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_set_root_last_snapshot(&root->root_item, 0); btrfs_set_root_dirid(&root->root_item, 0); if (is_fstree(objectid)) - uuid_le_gen(&uuid); - memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE); + generate_random_guid(root->root_item.uuid); + else + export_guid(root->root_item.uuid, &guid_null); root->root_item.drop_level = 0; key.objectid = objectid; -- cgit From f0cc2cd70164efe8f75c5d99560f0f69969c72e4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 28 Feb 2020 13:04:36 +0000 Subject: Btrfs: fix crash during unmount due to race with delayed inode workers During unmount we can have a job from the delayed inode items work queue still running, that can lead to at least two bad things: 1) A crash, because the worker can try to create a transaction just after the fs roots were freed; 2) A transaction leak, because the worker can create a transaction before the fs roots are freed and just after we committed the last transaction and after we stopped the transaction kthread. A stack trace example of the crash: [79011.691214] kernel BUG at lib/radix-tree.c:982! [79011.692056] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI [79011.693180] CPU: 3 PID: 1394 Comm: kworker/u8:2 Tainted: G W 5.6.0-rc2-btrfs-next-54 #2 (...) [79011.696789] Workqueue: btrfs-delayed-meta btrfs_work_helper [btrfs] [79011.697904] RIP: 0010:radix_tree_tag_set+0xe7/0x170 (...) [79011.702014] RSP: 0018:ffffb3c84a317ca0 EFLAGS: 00010293 [79011.702949] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 [79011.704202] RDX: ffffb3c84a317cb0 RSI: ffffb3c84a317ca8 RDI: ffff8db3931340a0 [79011.705463] RBP: 0000000000000005 R08: 0000000000000005 R09: ffffffff974629d0 [79011.706756] R10: ffffb3c84a317bc0 R11: 0000000000000001 R12: ffff8db393134000 [79011.708010] R13: ffff8db3931340a0 R14: ffff8db393134068 R15: 0000000000000001 [79011.709270] FS: 0000000000000000(0000) GS:ffff8db3b6a00000(0000) knlGS:0000000000000000 [79011.710699] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [79011.711710] CR2: 00007f22c2a0a000 CR3: 0000000232ad4005 CR4: 00000000003606e0 [79011.712958] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [79011.714205] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [79011.715448] Call Trace: [79011.715925] record_root_in_trans+0x72/0xf0 [btrfs] [79011.716819] btrfs_record_root_in_trans+0x4b/0x70 [btrfs] [79011.717925] start_transaction+0xdd/0x5c0 [btrfs] [79011.718829] btrfs_async_run_delayed_root+0x17e/0x2b0 [btrfs] [79011.719915] btrfs_work_helper+0xaa/0x720 [btrfs] [79011.720773] process_one_work+0x26d/0x6a0 [79011.721497] worker_thread+0x4f/0x3e0 [79011.722153] ? process_one_work+0x6a0/0x6a0 [79011.722901] kthread+0x103/0x140 [79011.723481] ? kthread_create_worker_on_cpu+0x70/0x70 [79011.724379] ret_from_fork+0x3a/0x50 (...) The following diagram shows a sequence of steps that lead to the crash during ummount of the filesystem: CPU 1 CPU 2 CPU 3 btrfs_punch_hole() btrfs_btree_balance_dirty() btrfs_balance_delayed_items() --> sees fs_info->delayed_root->items with value 200, which is greater than BTRFS_DELAYED_BACKGROUND (128) and smaller than BTRFS_DELAYED_WRITEBACK (512) btrfs_wq_run_delayed_node() --> queues a job for fs_info->delayed_workers to run btrfs_async_run_delayed_root() btrfs_async_run_delayed_root() --> job queued by CPU 1 --> starts picking and running delayed nodes from the prepare_list list close_ctree() btrfs_delete_unused_bgs() btrfs_commit_super() btrfs_join_transaction() --> gets transaction N btrfs_commit_transaction(N) --> set transaction state to TRANTS_STATE_COMMIT_START btrfs_first_prepared_delayed_node() --> picks delayed node X through the prepared_list list btrfs_run_delayed_items() btrfs_first_delayed_node() --> also picks delayed node X but through the node_list list __btrfs_commit_inode_delayed_items() --> runs all delayed items from this node and drops the node's item count to 0 through call to btrfs_release_delayed_inode() --> finishes running any remaining delayed nodes --> finishes transaction commit --> stops cleaner and transaction threads btrfs_free_fs_roots() --> frees all roots and removes them from the radix tree fs_info->fs_roots_radix btrfs_join_transaction() start_transaction() btrfs_record_root_in_trans() record_root_in_trans() radix_tree_tag_set() --> crashes because the root is not in the radix tree anymore If the worker is able to call btrfs_join_transaction() before the unmount task frees the fs roots, we end up leaking a transaction and all its resources, since after the call to btrfs_commit_super() and stopping the transaction kthread, we don't expect to have any transaction open anymore. When this situation happens the worker has a delayed node that has no more items to run, since the task calling btrfs_run_delayed_items(), which is doing a transaction commit, picks the same node and runs all its items first. We can not wait for the worker to complete when running delayed items through btrfs_run_delayed_items(), because we call that function in several phases of a transaction commit, and that could cause a deadlock because the worker calls btrfs_join_transaction() and the task doing the transaction commit may have already set the transaction state to TRANS_STATE_COMMIT_DOING. Also it's not possible to get into a situation where only some of the items of a delayed node are added to the fs/subvolume tree in the current transaction and the remaining ones in the next transaction, because when running the items of a delayed inode we lock its mutex, effectively waiting for the worker if the worker is running the items of the delayed node already. Since this can only cause issues when unmounting a filesystem, fix it in a simple way by waiting for any jobs on the delayed workers queue before calling btrfs_commit_supper() at close_ctree(). This works because at this point no one can call btrfs_btree_balance_dirty() or btrfs_balance_delayed_items(), and if we end up waiting for any worker to complete, btrfs_commit_super() will commit the transaction created by the worker. CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f01f19c76cf5..483b54077d01 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4067,6 +4067,19 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) */ btrfs_delete_unused_bgs(fs_info); + /* + * There might be existing delayed inode workers still running + * and holding an empty delayed inode item. We must wait for + * them to complete first because they can create a transaction. + * This happens when someone calls btrfs_balance_delayed_items() + * and then a transaction commit runs the same delayed nodes + * before any delayed worker has done something with the nodes. + * We must wait for any worker here and not at transaction + * commit time since that could cause a deadlock. + * This is a very rare case. + */ + btrfs_flush_workqueue(fs_info->delayed_workers); + ret = btrfs_commit_super(fs_info); if (ret) btrfs_err(fs_info, "commit super ret %d", ret); -- cgit From e9be5a303d271faa42b44b606fbe4d7507a7f026 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Feb 2020 21:00:47 +0100 Subject: btrfs: simplify tree block checksumming loop Thw whole point of csum_tree_block is to iterate over all extent buffer pages and pass it to checksumming functions. The bytes where checksum is stored must be skipped, thus map_private_extent_buffer. This complicates further offset calculations. As the first page will be always present, checksum the relevant bytes unconditionally and then do a simple iteration over the remaining pages. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 483b54077d01..6f63cc476d1f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -259,38 +259,22 @@ out: static int csum_tree_block(struct extent_buffer *buf, u8 *result) { struct btrfs_fs_info *fs_info = buf->fs_info; + const int num_pages = fs_info->nodesize >> PAGE_SHIFT; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - unsigned long len; - unsigned long cur_len; - unsigned long offset = BTRFS_CSUM_SIZE; char *kaddr; - unsigned long map_start; - unsigned long map_len; - int err; + int i; shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); + kaddr = page_address(buf->pages[0]); + crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, + PAGE_SIZE - BTRFS_CSUM_SIZE); - len = buf->len - offset; - - while (len > 0) { - /* - * Note: we don't need to check for the err == 1 case here, as - * with the given combination of 'start = BTRFS_CSUM_SIZE (32)' - * and 'min_len = 32' and the currently implemented mapping - * algorithm we cannot cross a page boundary. - */ - err = map_private_extent_buffer(buf, offset, 32, - &kaddr, &map_start, &map_len); - if (WARN_ON(err)) - return err; - cur_len = min(len, map_len - (offset - map_start)); - crypto_shash_update(shash, kaddr + offset - map_start, cur_len); - len -= cur_len; - offset += cur_len; + for (i = 1; i < num_pages; i++) { + kaddr = page_address(buf->pages[i]); + crypto_shash_update(shash, kaddr, PAGE_SIZE); } memset(result, 0, BTRFS_CSUM_SIZE); - crypto_shash_final(shash, result); return 0; -- cgit From c67b38925b682bdd3419ff39688991b1dc7cdfb6 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Feb 2020 21:00:49 +0100 Subject: btrfs: return void from csum_tree_block Now that csum_tree_block is not returning any errors, we can make csum_tree_block return void and simplify callers. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6f63cc476d1f..6b00ddea0b48 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -253,10 +253,8 @@ out: /* * Compute the csum of a btree block and store the result to provided buffer. - * - * Returns error if the extent buffer cannot be mapped. */ -static int csum_tree_block(struct extent_buffer *buf, u8 *result) +static void csum_tree_block(struct extent_buffer *buf, u8 *result) { struct btrfs_fs_info *fs_info = buf->fs_info; const int num_pages = fs_info->nodesize >> PAGE_SHIFT; @@ -276,8 +274,6 @@ static int csum_tree_block(struct extent_buffer *buf, u8 *result) } memset(result, 0, BTRFS_CSUM_SIZE); crypto_shash_final(shash, result); - - return 0; } /* @@ -528,8 +524,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) offsetof(struct btrfs_header, fsid), BTRFS_FSID_SIZE) == 0); - if (csum_tree_block(eb, result)) - return -EINVAL; + csum_tree_block(eb, result); if (btrfs_header_level(eb)) ret = btrfs_check_node(eb); @@ -640,9 +635,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, found_level); - ret = csum_tree_block(eb, result); - if (ret) - goto err; + csum_tree_block(eb, result); if (memcmp_extent_buffer(eb, result, 0, csum_size)) { u32 val; -- cgit From 3fd6372758d91d8ba801e0733b17d082066a04ef Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2020 16:11:40 -0500 Subject: btrfs: make the extent buffer leak check per fs info I'm going to make the entire destruction of btrfs_root's controlled by their refcount, so it will be helpful to notice if we're leaking their eb's on umount. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6b00ddea0b48..56cad6a51a7d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1530,6 +1530,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_put_root(fs_info->free_space_root); btrfs_put_root(fs_info->fs_root); btrfs_check_leaked_roots(fs_info); + btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); kvfree(fs_info); @@ -2656,6 +2657,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->unused_bgs); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&fs_info->allocated_roots); + INIT_LIST_HEAD(&fs_info->allocated_ebs); + spin_lock_init(&fs_info->eb_leak_lock); #endif extent_map_tree_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv, -- cgit From 0e996e7fcf2e3a7a5d13b0ed5f9cebc551bd94ba Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2020 16:11:41 -0500 Subject: btrfs: move ino_cache_inode dropping out of btrfs_free_fs_root We are going to make root life be controlled soley by refcounting, and inodes will be one of the things that hold a ref on the root. This means we need to handle dropping the ino_cache_inode outside of the root freeing logic, so move it into btrfs_drop_and_free_fs_root() so it is cleaned up properly on unmount. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 56cad6a51a7d..37418d1d0820 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3912,12 +3912,15 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, __btrfs_remove_free_space_cache(root->free_ino_pinned); if (root->free_ino_ctl) __btrfs_remove_free_space_cache(root->free_ino_ctl); + if (root->ino_cache_inode) { + iput(root->ino_cache_inode); + root->ino_cache_inode = NULL; + } btrfs_free_fs_root(root); } void btrfs_free_fs_root(struct btrfs_root *root) { - iput(root->ino_cache_inode); WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); if (root->anon_dev) free_anon_bdev(root->anon_dev); -- cgit From 8c38938c7bb096313ad00c2bafa82af37636b0ec Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2020 16:11:42 -0500 Subject: btrfs: move the root freeing stuff into btrfs_put_root There are a few different ways to free roots, either you allocated them yourself and you just do free_extent_buffer(root->node); free_extent_buffer(root->commit_node); btrfs_put_root(root); Which is the pattern for log roots. Or for snapshots/subvolumes that are being dropped you simply call btrfs_free_fs_root() which does all the cleanup for you. Unify this all into btrfs_put_root(), so that we don't free up things associated with the root until the last reference is dropped. This makes the root freeing code much more significant. The only caveat is at close_ctree() time we have to free the extent buffers for all of our main roots (extent_root, chunk_root, etc) because we have to drop the btree_inode and we'll run into issues if we hold onto those nodes until ->kill_sb() time. This will be addressed in the future when we kill the btree_inode. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 63 +++++++++++++++++++++++++++--------------------------- 1 file changed, 32 insertions(+), 31 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 37418d1d0820..242c072d69a7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1255,11 +1255,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, return root; fail: - if (leaf) { + if (leaf) btrfs_tree_unlock(leaf); - free_extent_buffer(root->commit_root); - free_extent_buffer(leaf); - } btrfs_put_root(root); return ERR_PTR(ret); @@ -1382,10 +1379,10 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, generation, level, NULL); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); + root->node = NULL; goto find_fail; } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { ret = -EIO; - free_extent_buffer(root->node); goto find_fail; } root->commit_root = btrfs_root_node(root); @@ -1617,14 +1614,14 @@ again: if (ret) { btrfs_put_root(root); if (ret == -EEXIST) { - btrfs_free_fs_root(root); + btrfs_put_root(root); goto again; } goto fail; } return root; fail: - btrfs_free_fs_root(root); + btrfs_put_root(root); return ERR_PTR(ret); } @@ -1996,11 +1993,35 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) free_root_extent_buffers(info->csum_root); free_root_extent_buffers(info->quota_root); free_root_extent_buffers(info->uuid_root); + free_root_extent_buffers(info->fs_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); free_root_extent_buffers(info->free_space_root); } +void btrfs_put_root(struct btrfs_root *root) +{ + if (!root) + return; + + if (refcount_dec_and_test(&root->refs)) { + WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + if (root->anon_dev) + free_anon_bdev(root->anon_dev); + btrfs_drew_lock_destroy(&root->snapshot_lock); + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root->free_ino_ctl); + kfree(root->free_ino_pinned); +#ifdef CONFIG_BTRFS_DEBUG + spin_lock(&root->fs_info->fs_roots_radix_lock); + list_del_init(&root->leak_list); + spin_unlock(&root->fs_info->fs_roots_radix_lock); +#endif + kfree(root); + } +} + void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) { int ret; @@ -2012,13 +2033,10 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) struct btrfs_root, root_list); list_del(&gang[0]->root_list); - if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) { + if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) btrfs_drop_and_free_fs_root(fs_info, gang[0]); - } else { - free_extent_buffer(gang[0]->node); - free_extent_buffer(gang[0]->commit_root); + else btrfs_put_root(gang[0]); - } } while (1) { @@ -2223,11 +2241,11 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); + log_tree_root->node = NULL; btrfs_put_root(log_tree_root); return ret; } else if (!extent_buffer_uptodate(log_tree_root->node)) { btrfs_err(fs_info, "failed to read log tree"); - free_extent_buffer(log_tree_root->node); btrfs_put_root(log_tree_root); return -EIO; } @@ -2236,7 +2254,6 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, if (ret) { btrfs_handle_fs_error(fs_info, ret, "Failed to recover log tree"); - free_extent_buffer(log_tree_root->node); btrfs_put_root(log_tree_root); return ret; } @@ -3901,8 +3918,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { btrfs_free_log(NULL, root); if (root->reloc_root) { - free_extent_buffer(root->reloc_root->node); - free_extent_buffer(root->reloc_root->commit_root); btrfs_put_root(root->reloc_root); root->reloc_root = NULL; } @@ -3916,19 +3931,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, iput(root->ino_cache_inode); root->ino_cache_inode = NULL; } - btrfs_free_fs_root(root); -} - -void btrfs_free_fs_root(struct btrfs_root *root) -{ - WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); - if (root->anon_dev) - free_anon_bdev(root->anon_dev); - btrfs_drew_lock_destroy(&root->snapshot_lock); - free_extent_buffer(root->node); - free_extent_buffer(root->commit_root); - kfree(root->free_ino_ctl); - kfree(root->free_ino_pinned); btrfs_put_root(root); } @@ -4093,8 +4095,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_sysfs_remove_mounted(fs_info); btrfs_sysfs_remove_fsid(fs_info->fs_devices); - btrfs_free_fs_roots(fs_info); - btrfs_put_block_group_cache(fs_info); /* @@ -4106,6 +4106,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) clear_bit(BTRFS_FS_OPEN, &fs_info->flags); free_root_pointers(fs_info, true); + btrfs_free_fs_roots(fs_info); /* * We must free the block groups after dropping the fs_roots as we could -- cgit From 5c8fd99fec9d9265ed8fb732120a2ffd2bbab9b8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2020 16:11:43 -0500 Subject: btrfs: make inodes hold a ref on their roots If we make sure all the inodes have refs on their root we don't have to worry about the root disappearing while we have open inodes. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 242c072d69a7..5db76264b07f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2096,7 +2096,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops; - BTRFS_I(inode)->root = fs_info->tree_root; + BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key)); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); btrfs_insert_inode_hash(inode); -- cgit From dc9492c14c758639d7b2468d4ed3c77e785c1a35 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2020 16:11:44 -0500 Subject: btrfs: hold a ref on the root on the dead roots list At the point we add a root to the dead roots list we have no open inodes for that root, so we need to hold a ref on that root to keep it from disappearing. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5db76264b07f..831755a867ce 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2035,8 +2035,7 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) btrfs_drop_and_free_fs_root(fs_info, gang[0]); - else - btrfs_put_root(gang[0]); + btrfs_put_root(gang[0]); } while (1) { -- cgit From 4785e24fa5d23b05bc0f579e10aeb44c4a0a2a3d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2020 16:11:45 -0500 Subject: btrfs: don't take an extra root ref at allocation time Now that all the users of roots take references for them we can drop the extra root ref we've been taking. Before we had roots at 2 refs for the life of the file system, one for the radix tree, and one simply for existing. Now that we have proper ref accounting in all places that use roots we can drop this extra ref simply for existing as we no longer need it. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 831755a867ce..3a6d35e1edb7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1601,22 +1601,11 @@ again: if (ret == 0) set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); - /* - * All roots have two refs on them at all times, one for the mounted fs, - * and one for being in the radix tree. This way we only free the root - * when we are unmounting or deleting the subvolume. We get one ref - * from __setup_root, one for inserting it into the radix tree, and then - * we have the third for returning it, and the caller will put it when - * it's done with the root. - */ - btrfs_grab_root(root); ret = btrfs_insert_fs_root(fs_info, root); if (ret) { btrfs_put_root(root); - if (ret == -EEXIST) { - btrfs_put_root(root); + if (ret == -EEXIST) goto again; - } goto fail; } return root; @@ -3904,11 +3893,13 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { + bool drop_ref = false; + spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid); if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) - btrfs_put_root(root); + drop_ref = true; spin_unlock(&fs_info->fs_roots_radix_lock); if (btrfs_root_refs(&root->root_item) == 0) @@ -3930,7 +3921,8 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, iput(root->ino_cache_inode); root->ino_cache_inode = NULL; } - btrfs_put_root(root); + if (drop_ref) + btrfs_put_root(root); } int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) -- cgit From efc3453494af78180ac00d9dd8391fd52c4a921e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2020 16:11:46 -0500 Subject: btrfs: make btrfs_cleanup_fs_roots use the radix tree lock The radix root is primarily protected by the fs_roots_radix_lock, so use that to lookup and get a ref on all of our fs roots in btrfs_cleanup_fs_roots. The tree reference is taken in the protected section as before. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3a6d35e1edb7..4d3eba909664 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3932,15 +3932,14 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) int i = 0; int err = 0; unsigned int ret = 0; - int index; while (1) { - index = srcu_read_lock(&fs_info->subvol_srcu); + spin_lock(&fs_info->fs_roots_radix_lock); ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, root_objectid, ARRAY_SIZE(gang)); if (!ret) { - srcu_read_unlock(&fs_info->subvol_srcu, index); + spin_unlock(&fs_info->fs_roots_radix_lock); break; } root_objectid = gang[ret - 1]->root_key.objectid + 1; @@ -3954,7 +3953,7 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) /* grab all the search result for later use */ gang[i] = btrfs_grab_root(gang[i]); } - srcu_read_unlock(&fs_info->subvol_srcu, index); + spin_unlock(&fs_info->fs_roots_radix_lock); for (i = 0; i < ret; i++) { if (!gang[i]) -- cgit From c75e839414d3610e6487ae3145199c500d55f7f7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Feb 2020 16:11:47 -0500 Subject: btrfs: kill the subvol_srcu Now that we have proper root ref counting everywhere we can kill the subvol_srcu. * removal of fs_info::subvol_srcu reduces size of fs_info by 1176 bytes * the refcount_t used for the references checks for accidental 0->1 in cases where the root lifetime would not be properly protected * there's a leak detector for roots to catch unfreed roots at umount time * SRCU served us well over the years but is was not a proper synchronization mechanism for some cases Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 37 +++++++++---------------------------- 1 file changed, 9 insertions(+), 28 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4d3eba909664..a6cb5cbbdb9f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2757,46 +2757,33 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); - ret = init_srcu_struct(&fs_info->subvol_srcu); - if (ret) - return ret; - ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL); if (ret) - goto fail; + return ret; ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) - goto fail; + return ret; fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); if (ret) - goto fail; + return ret; ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0, GFP_KERNEL); if (ret) - goto fail; + return ret; fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), GFP_KERNEL); - if (!fs_info->delayed_root) { - ret = -ENOMEM; - goto fail; - } + if (!fs_info->delayed_root) + return -ENOMEM; btrfs_init_delayed_root(fs_info->delayed_root); - ret = btrfs_alloc_stripe_hash_table(fs_info); - if (ret) - goto fail; - - return 0; -fail: - cleanup_srcu_struct(&fs_info->subvol_srcu); - return ret; + return btrfs_alloc_stripe_hash_table(fs_info); } static int btrfs_uuid_rescan_kthread(void *data) @@ -2870,13 +2857,13 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->chunk_root = chunk_root; if (!tree_root || !chunk_root) { err = -ENOMEM; - goto fail_srcu; + goto fail; } fs_info->btree_inode = new_inode(sb); if (!fs_info->btree_inode) { err = -ENOMEM; - goto fail_srcu; + goto fail; } mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); btrfs_init_btree_inode(fs_info); @@ -3398,8 +3385,6 @@ fail_alloc: btrfs_mapping_tree_free(&fs_info->mapping_tree); iput(fs_info->btree_inode); -fail_srcu: - cleanup_srcu_struct(&fs_info->subvol_srcu); fail: btrfs_close_devices(fs_info->fs_devices); return err; @@ -3902,9 +3887,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, drop_ref = true; spin_unlock(&fs_info->fs_roots_radix_lock); - if (btrfs_root_refs(&root->root_item) == 0) - synchronize_srcu(&fs_info->subvol_srcu); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { btrfs_free_log(NULL, root); if (root->reloc_root) { @@ -4116,7 +4098,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_mapping_tree_free(&fs_info->mapping_tree); btrfs_close_devices(fs_info->fs_devices); - cleanup_srcu_struct(&fs_info->subvol_srcu); } int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, -- cgit