diff options
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r-- | fs/btrfs/disk-io.c | 542 |
1 files changed, 232 insertions, 310 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dfdab849037b..07b5e6f7df67 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -30,10 +30,11 @@ #include <linux/ratelimit.h> #include <linux/uuid.h> #include <linux/semaphore.h> +#include <linux/error-injection.h> +#include <linux/crc32c.h> #include <asm/unaligned.h> #include "ctree.h" #include "disk-io.h" -#include "hash.h" #include "transaction.h" #include "btrfs_inode.h" #include "volumes.h" @@ -50,6 +51,8 @@ #include "sysfs.h" #include "qgroup.h" #include "compression.h" +#include "tree-checker.h" +#include "ref-verify.h" #ifdef CONFIG_X86 #include <asm/cpufeature.h> @@ -59,7 +62,8 @@ BTRFS_HEADER_FLAG_RELOC |\ BTRFS_SUPER_FLAG_ERROR |\ BTRFS_SUPER_FLAG_SEEDING |\ - BTRFS_SUPER_FLAG_METADUMP) + BTRFS_SUPER_FLAG_METADUMP |\ + BTRFS_SUPER_FLAG_METADUMP_V2) static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); @@ -106,7 +110,7 @@ int __init btrfs_end_io_wq_init(void) return 0; } -void btrfs_end_io_wq_exit(void) +void __cold btrfs_end_io_wq_exit(void) { kmem_cache_destroy(btrfs_end_io_wq_cache); } @@ -120,8 +124,8 @@ struct async_submit_bio { void *private_data; struct btrfs_fs_info *fs_info; struct bio *bio; - extent_submit_bio_hook_t *submit_bio_start; - extent_submit_bio_hook_t *submit_bio_done; + extent_submit_bio_start_t *submit_bio_start; + extent_submit_bio_done_t *submit_bio_done; int mirror_num; unsigned long bio_flags; /* @@ -218,7 +222,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, * extents on the btree inode are pretty simple, there's one extent * that covers the entire device */ -static struct extent_map *btree_get_extent(struct btrfs_inode *inode, +struct extent_map *btree_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create) { @@ -266,7 +270,7 @@ out: u32 btrfs_csum_data(const char *data, u32 seed, size_t len) { - return btrfs_crc32c(seed, data, len); + return crc32c(seed, data, len); } void btrfs_csum_final(u32 crc, u8 *result) @@ -283,7 +287,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, int verify) { u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); - char *result = NULL; + char result[BTRFS_CSUM_SIZE]; unsigned long len; unsigned long cur_len; unsigned long offset = BTRFS_CSUM_SIZE; @@ -292,7 +296,6 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, unsigned long map_len; int err; u32 crc = ~(u32)0; - unsigned long inline_result; len = buf->len - offset; while (len > 0) { @@ -306,13 +309,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, len -= cur_len; offset += cur_len; } - if (csum_size > sizeof(inline_result)) { - result = kzalloc(csum_size, GFP_NOFS); - if (!result) - return -ENOMEM; - } else { - result = (char *)&inline_result; - } + memset(result, 0, BTRFS_CSUM_SIZE); btrfs_csum_final(crc, result); @@ -327,15 +324,12 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, "%s checksum verify failed on %llu wanted %X found %X level %d", fs_info->sb->s_id, buf->start, val, found, btrfs_header_level(buf)); - if (result != (char *)&inline_result) - kfree(result); return -EUCLEAN; } } else { write_extent_buffer(buf, result, 0, csum_size); } - if (result != (char *)&inline_result) - kfree(result); + return 0; } @@ -389,7 +383,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, clear_extent_buffer_uptodate(eb); out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state, GFP_NOFS); + &cached_state); if (need_lock) btrfs_tree_read_unlock_blocking(eb); return ret; @@ -409,8 +403,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, if (csum_type == BTRFS_CSUM_TYPE_CRC32) { u32 crc = ~(u32)0; - const int csum_size = sizeof(crc); - char result[csum_size]; + char result[sizeof(crc)]; /* * The super_block structure does not span the whole @@ -421,7 +414,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); btrfs_csum_final(crc, result); - if (memcmp(raw_disk_sb, result, csum_size)) + if (memcmp(raw_disk_sb, result, sizeof(result))) ret = 1; } @@ -434,13 +427,59 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, return ret; } +static int verify_level_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int level, + struct btrfs_key *first_key) +{ + int found_level; + struct btrfs_key found_key; + int ret; + + found_level = btrfs_header_level(eb); + if (found_level != level) { +#ifdef CONFIG_BTRFS_DEBUG + WARN_ON(1); + btrfs_err(fs_info, +"tree level mismatch detected, bytenr=%llu level expected=%u has=%u", + eb->start, level, found_level); +#endif + return -EIO; + } + + if (!first_key) + return 0; + + if (found_level) + btrfs_node_key_to_cpu(eb, &found_key, 0); + else + btrfs_item_key_to_cpu(eb, &found_key, 0); + ret = btrfs_comp_cpu_keys(first_key, &found_key); + +#ifdef CONFIG_BTRFS_DEBUG + if (ret) { + WARN_ON(1); + btrfs_err(fs_info, +"tree first key mismatch detected, bytenr=%llu key expected=(%llu, %u, %llu) has=(%llu, %u, %llu)", + eb->start, first_key->objectid, first_key->type, + first_key->offset, found_key.objectid, + found_key.type, found_key.offset); + } +#endif + return ret; +} + /* * helper to read a given tree block, doing retries as required when * the checksums don't match and we have alternate mirrors to try. + * + * @parent_transid: expected transid, skip check if 0 + * @level: expected level, mandatory check + * @first_key: expected key of first slot, skip check if NULL */ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, - u64 parent_transid) + u64 parent_transid, int level, + struct btrfs_key *first_key) { struct extent_io_tree *io_tree; int failed = 0; @@ -453,13 +492,16 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; while (1) { ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE, - btree_get_extent, mirror_num); + mirror_num); if (!ret) { - if (!verify_parent_transid(io_tree, eb, + if (verify_parent_transid(io_tree, eb, parent_transid, 0)) - break; - else ret = -EIO; + else if (verify_level_key(fs_info, eb, level, + first_key)) + ret = -EUCLEAN; + else + break; } /* @@ -467,7 +509,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, * there is no reason to read the other copies, they won't be * any less wrong. */ - if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) + if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags) || + ret == -EUCLEAN) break; num_copies = btrfs_num_copies(fs_info, @@ -543,146 +586,6 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, return ret; } -#define CORRUPT(reason, eb, root, slot) \ - btrfs_crit(root->fs_info, \ - "corrupt %s, %s: block=%llu, root=%llu, slot=%d", \ - btrfs_header_level(eb) == 0 ? "leaf" : "node", \ - reason, btrfs_header_bytenr(eb), root->objectid, slot) - -static noinline int check_leaf(struct btrfs_root *root, - struct extent_buffer *leaf) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_key key; - struct btrfs_key leaf_key; - u32 nritems = btrfs_header_nritems(leaf); - int slot; - - /* - * Extent buffers from a relocation tree have a owner field that - * corresponds to the subvolume tree they are based on. So just from an - * extent buffer alone we can not find out what is the id of the - * corresponding subvolume tree, so we can not figure out if the extent - * buffer corresponds to the root of the relocation tree or not. So skip - * this check for relocation trees. - */ - if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { - struct btrfs_root *check_root; - - key.objectid = btrfs_header_owner(leaf); - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - check_root = btrfs_get_fs_root(fs_info, &key, false); - /* - * The only reason we also check NULL here is that during - * open_ctree() some roots has not yet been set up. - */ - if (!IS_ERR_OR_NULL(check_root)) { - struct extent_buffer *eb; - - eb = btrfs_root_node(check_root); - /* if leaf is the root, then it's fine */ - if (leaf != eb) { - CORRUPT("non-root leaf's nritems is 0", - leaf, check_root, 0); - free_extent_buffer(eb); - return -EIO; - } - free_extent_buffer(eb); - } - return 0; - } - - if (nritems == 0) - return 0; - - /* Check the 0 item */ - if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != - BTRFS_LEAF_DATA_SIZE(fs_info)) { - CORRUPT("invalid item offset size pair", leaf, root, 0); - return -EIO; - } - - /* - * Check to make sure each items keys are in the correct order and their - * offsets make sense. We only have to loop through nritems-1 because - * we check the current slot against the next slot, which verifies the - * next slot's offset+size makes sense and that the current's slot - * offset is correct. - */ - for (slot = 0; slot < nritems - 1; slot++) { - btrfs_item_key_to_cpu(leaf, &leaf_key, slot); - btrfs_item_key_to_cpu(leaf, &key, slot + 1); - - /* Make sure the keys are in the right order */ - if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) { - CORRUPT("bad key order", leaf, root, slot); - return -EIO; - } - - /* - * Make sure the offset and ends are right, remember that the - * item data starts at the end of the leaf and grows towards the - * front. - */ - if (btrfs_item_offset_nr(leaf, slot) != - btrfs_item_end_nr(leaf, slot + 1)) { - CORRUPT("slot offset bad", leaf, root, slot); - return -EIO; - } - - /* - * Check to make sure that we don't point outside of the leaf, - * just in case all the items are consistent to each other, but - * all point outside of the leaf. - */ - if (btrfs_item_end_nr(leaf, slot) > - BTRFS_LEAF_DATA_SIZE(fs_info)) { - CORRUPT("slot end outside of leaf", leaf, root, slot); - return -EIO; - } - } - - return 0; -} - -static int check_node(struct btrfs_root *root, struct extent_buffer *node) -{ - unsigned long nr = btrfs_header_nritems(node); - struct btrfs_key key, next_key; - int slot; - u64 bytenr; - int ret = 0; - - if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) { - btrfs_crit(root->fs_info, - "corrupt node: block %llu root %llu nritems %lu", - node->start, root->objectid, nr); - return -EIO; - } - - for (slot = 0; slot < nr - 1; slot++) { - bytenr = btrfs_node_blockptr(node, slot); - btrfs_node_key_to_cpu(node, &key, slot); - btrfs_node_key_to_cpu(node, &next_key, slot + 1); - - if (!bytenr) { - CORRUPT("invalid item slot", node, root, slot); - ret = -EIO; - goto out; - } - - if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { - CORRUPT("bad key order", node, root, slot); - ret = -EIO; - goto out; - } - } -out: - return ret; -} - static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror) @@ -748,12 +651,12 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, * that we don't try and read the other copies of this block, just * return -EIO. */ - if (found_level == 0 && check_leaf(root, eb)) { + if (found_level == 0 && btrfs_check_leaf_full(fs_info, eb)) { set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; } - if (found_level > 0 && check_node(root, eb)) + if (found_level > 0 && btrfs_check_node(fs_info, eb)) ret = -EIO; if (!ret) @@ -856,14 +759,6 @@ blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, return 0; } -unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) -{ - unsigned long limit = min_t(unsigned long, - info->thread_pool_size, - info->fs_devices->open_devices); - return 256 * limit; -} - static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; @@ -871,7 +766,6 @@ static void run_one_async_start(struct btrfs_work *work) async = container_of(work, struct async_submit_bio, work); ret = async->submit_bio_start(async->private_data, async->bio, - async->mirror_num, async->bio_flags, async->bio_offset); if (ret) async->status = ret; @@ -879,22 +773,9 @@ static void run_one_async_start(struct btrfs_work *work) static void run_one_async_done(struct btrfs_work *work) { - struct btrfs_fs_info *fs_info; struct async_submit_bio *async; - int limit; async = container_of(work, struct async_submit_bio, work); - fs_info = async->fs_info; - - limit = btrfs_async_submit_limit(fs_info); - limit = limit * 2 / 3; - - /* - * atomic_dec_return implies a barrier for waitqueue_active - */ - if (atomic_dec_return(&fs_info->nr_async_submits) < limit && - waitqueue_active(&fs_info->async_submit_wait)) - wake_up(&fs_info->async_submit_wait); /* If an error occurred we just want to clean up the bio and move on */ if (async->status) { @@ -903,8 +784,7 @@ static void run_one_async_done(struct btrfs_work *work) return; } - async->submit_bio_done(async->private_data, async->bio, async->mirror_num, - async->bio_flags, async->bio_offset); + async->submit_bio_done(async->private_data, async->bio, async->mirror_num); } static void run_one_async_free(struct btrfs_work *work) @@ -918,8 +798,8 @@ static void run_one_async_free(struct btrfs_work *work) blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, void *private_data, - extent_submit_bio_hook_t *submit_bio_start, - extent_submit_bio_hook_t *submit_bio_done) + extent_submit_bio_start_t *submit_bio_start, + extent_submit_bio_done_t *submit_bio_done) { struct async_submit_bio *async; @@ -942,19 +822,10 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, async->status = 0; - atomic_inc(&fs_info->nr_async_submits); - if (op_is_sync(bio->bi_opf)) btrfs_set_work_high_priority(&async->work); btrfs_queue_work(fs_info->workers, &async->work); - - while (atomic_read(&fs_info->async_submit_draining) && - atomic_read(&fs_info->nr_async_submits)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) == 0)); - } - return 0; } @@ -975,8 +846,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) return errno_to_blk_status(ret); } -static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio, - int mirror_num, unsigned long bio_flags, +static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio, u64 bio_offset) { /* @@ -986,9 +856,8 @@ static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio return btree_csum_one_bio(bio); } -static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t btree_submit_bio_done(void *private_data, struct bio *bio, + int mirror_num) { struct inode *inode = private_data; blk_status_t ret; @@ -1005,9 +874,9 @@ static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio, return ret; } -static int check_async_write(unsigned long bio_flags) +static int check_async_write(struct btrfs_inode *bi) { - if (bio_flags & EXTENT_BIO_TREE_LOG) + if (atomic_read(&bi->sync_writers)) return 0; #ifdef CONFIG_X86 if (static_cpu_has(X86_FEATURE_XMM4_2)) @@ -1022,7 +891,7 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio, { struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int async = check_async_write(bio_flags); + int async = check_async_write(BTRFS_I(inode)); blk_status_t ret; if (bio_op(bio) != REQ_OP_WRITE) { @@ -1047,8 +916,8 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio, */ ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0, bio_offset, private_data, - __btree_submit_bio_start, - __btree_submit_bio_done); + btree_submit_bio_start, + btree_submit_bio_done); } if (ret) @@ -1172,7 +1041,7 @@ void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr) if (IS_ERR(buf)) return; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, WAIT_NONE, btree_get_extent, 0); + buf, WAIT_NONE, 0); free_extent_buffer(buf); } @@ -1191,7 +1060,7 @@ int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK, - btree_get_extent, mirror_num); + mirror_num); if (ret) { free_extent_buffer(buf); return ret; @@ -1230,8 +1099,17 @@ void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) buf->start, buf->start + buf->len - 1); } +/* + * Read tree block at logical address @bytenr and do variant basic but critical + * verification. + * + * @parent_transid: expected transid of this tree block, skip check if 0 + * @level: expected level, mandatory check + * @first_key: expected key in slot 0, skip check if NULL + */ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 parent_transid) + u64 parent_transid, int level, + struct btrfs_key *first_key) { struct extent_buffer *buf = NULL; int ret; @@ -1240,7 +1118,8 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, if (IS_ERR(buf)) return buf; - ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid); + ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid, + level, first_key); if (ret) { free_extent_buffer(buf); return ERR_PTR(ret); @@ -1276,7 +1155,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) if (!writers) return ERR_PTR(-ENOMEM); - ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL); + ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS); if (ret < 0) { kfree(writers); return ERR_PTR(ret); @@ -1328,6 +1207,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, spin_lock_init(&root->accounting_lock); spin_lock_init(&root->log_extents_lock[0]); spin_lock_init(&root->log_extents_lock[1]); + spin_lock_init(&root->qgroup_meta_rsv_lock); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); mutex_init(&root->ordered_extent_mutex); @@ -1344,7 +1224,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, atomic_set(&root->orphan_inodes, 0); refcount_set(&root->refs, 1); atomic_set(&root->will_be_snapshotted, 0); - atomic64_set(&root->qgroup_meta_rsv, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; @@ -1403,7 +1282,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root; struct btrfs_key key; int ret = 0; - uuid_le uuid; + uuid_le uuid = NULL_UUID_LE; root = btrfs_alloc_root(fs_info, GFP_KERNEL); if (!root) @@ -1444,7 +1323,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_set_root_used(&root->root_item, leaf->len); btrfs_set_root_last_snapshot(&root->root_item, 0); btrfs_set_root_dirid(&root->root_item, 0); - uuid_le_gen(&uuid); + if (is_fstree(objectid)) + uuid_le_gen(&uuid); memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE); root->root_item.drop_level = 0; @@ -1568,6 +1448,7 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, struct btrfs_path *path; u64 generation; int ret; + int level; path = btrfs_alloc_path(); if (!path) @@ -1590,9 +1471,10 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, } generation = btrfs_root_generation(&root->root_item); + level = btrfs_root_level(&root->root_item); root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item), - generation); + generation, level, NULL); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); goto find_fail; @@ -1975,12 +1857,10 @@ sleep: if (unlikely(test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))) btrfs_cleanup_transaction(fs_info); - set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop() && (!btrfs_transaction_blocked(fs_info) || cannot_commit)) - schedule_timeout(delay); - __set_current_state(TASK_RUNNING); + schedule_timeout_interruptible(delay); } while (!kthread_should_stop()); return 0; } @@ -2350,7 +2230,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { - int max_active = fs_info->thread_pool_size; + u32 max_active = fs_info->thread_pool_size; unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; fs_info->workers = @@ -2443,6 +2323,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, struct btrfs_root *log_tree_root; struct btrfs_super_block *disk_super = fs_info->super_copy; u64 bytenr = btrfs_super_log_root(disk_super); + int level = btrfs_super_log_root_level(disk_super); if (fs_devices->rw_devices == 0) { btrfs_warn(fs_info, "log replay required on RO media"); @@ -2456,7 +2337,8 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, __setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); log_tree_root->node = read_tree_block(fs_info, bytenr, - fs_info->generation + 1); + fs_info->generation + 1, + level, NULL); if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); @@ -2501,23 +2383,29 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) location.offset = 0; root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) - return PTR_ERR(root); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->extent_root = root; location.objectid = BTRFS_DEV_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) - return PTR_ERR(root); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->dev_root = root; btrfs_init_devices_late(fs_info); location.objectid = BTRFS_CSUM_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) - return PTR_ERR(root); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->csum_root = root; @@ -2534,7 +2422,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) if (IS_ERR(root)) { ret = PTR_ERR(root); if (ret != -ENOENT) - return ret; + goto out; } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->uuid_root = root; @@ -2543,13 +2431,19 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) - return PTR_ERR(root); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->free_space_root = root; } return 0; +out: + btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d", + location.objectid, ret); + return ret; } int open_ctree(struct super_block *sb, @@ -2571,8 +2465,8 @@ int open_ctree(struct super_block *sb, int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; - int max_active; int clear_free_space_tree = 0; + int level; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); @@ -2607,14 +2501,6 @@ int open_ctree(struct super_block *sb, goto fail_delalloc_bytes; } - fs_info->btree_inode = new_inode(sb); - if (!fs_info->btree_inode) { - err = -ENOMEM; - goto fail_bio_counter; - } - - mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); @@ -2622,6 +2508,8 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->delayed_iputs); INIT_LIST_HEAD(&fs_info->delalloc_roots); INIT_LIST_HEAD(&fs_info->caching_block_groups); + INIT_LIST_HEAD(&fs_info->pending_raid_kobjs); + spin_lock_init(&fs_info->pending_raid_kobjs_lock); spin_lock_init(&fs_info->delalloc_root_lock); spin_lock_init(&fs_info->trans_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); @@ -2647,17 +2535,12 @@ int open_ctree(struct super_block *sb, btrfs_mapping_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); - btrfs_init_block_rsv(&fs_info->delalloc_block_rsv, - BTRFS_BLOCK_RSV_DELALLOC); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, BTRFS_BLOCK_RSV_DELOPS); - atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); - atomic_set(&fs_info->async_submit_draining, 0); - atomic_set(&fs_info->nr_async_bios, 0); atomic_set(&fs_info->defrag_running, 0); atomic_set(&fs_info->qgroup_op_seq, 0); atomic_set(&fs_info->reada_works_cnt, 0); @@ -2673,12 +2556,21 @@ int open_ctree(struct super_block *sb, /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); spin_lock_init(&fs_info->reada_lock); + btrfs_init_ref_verify(fs_info); fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); INIT_LIST_HEAD(&fs_info->ordered_roots); spin_lock_init(&fs_info->ordered_root_lock); + + fs_info->btree_inode = new_inode(sb); + if (!fs_info->btree_inode) { + err = -ENOMEM; + goto fail_bio_counter; + } + mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); + fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), GFP_KERNEL); if (!fs_info->delayed_root) { @@ -2884,8 +2776,6 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - max_active = fs_info->thread_pool_size; - ret = btrfs_init_workqueues(fs_info, fs_devices); if (ret) { err = ret; @@ -2895,12 +2785,13 @@ int open_ctree(struct super_block *sb, sb->s_bdi->congested_fn = btrfs_congested_fn; sb->s_bdi->congested_data = fs_info; sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE; sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); + memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE); mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(fs_info); @@ -2911,12 +2802,13 @@ int open_ctree(struct super_block *sb, } generation = btrfs_super_chunk_root_generation(disk_super); + level = btrfs_super_chunk_root_level(disk_super); __setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); chunk_root->node = read_tree_block(fs_info, btrfs_super_chunk_root(disk_super), - generation); + generation, level, NULL); if (IS_ERR(chunk_root->node) || !extent_buffer_uptodate(chunk_root->node)) { btrfs_err(fs_info, "failed to read chunk root"); @@ -2938,10 +2830,10 @@ int open_ctree(struct super_block *sb, } /* - * keep the device that is marked to be the target device for the - * dev_replace procedure + * Keep the devid that is marked to be the target device for the + * device replace procedure */ - btrfs_close_extra_devices(fs_devices, 0); + btrfs_free_extra_devids(fs_devices, 0); if (!fs_devices->latest_bdev) { btrfs_err(fs_info, "failed to read devices"); @@ -2950,10 +2842,11 @@ int open_ctree(struct super_block *sb, retry_root_backup: generation = btrfs_super_generation(disk_super); + level = btrfs_super_root_level(disk_super); tree_root->node = read_tree_block(fs_info, btrfs_super_root(disk_super), - generation); + generation, level, NULL); if (IS_ERR(tree_root->node) || !extent_buffer_uptodate(tree_root->node)) { btrfs_warn(fs_info, "failed to read tree root"); @@ -3004,7 +2897,7 @@ retry_root_backup: goto fail_block_groups; } - btrfs_close_extra_devices(fs_devices, 1); + btrfs_free_extra_devids(fs_devices, 1); ret = btrfs_sysfs_add_fsid(fs_devices, NULL); if (ret) { @@ -3038,7 +2931,7 @@ retry_root_backup: goto fail_sysfs; } - if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info)) { + if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, "writeable mount is not allowed due to too many missing devices"); goto fail_sysfs; @@ -3083,6 +2976,9 @@ retry_root_backup: if (ret) goto fail_trans_kthread; + if (btrfs_build_ref_tree(fs_info)) + btrfs_err(fs_info, "couldn't build ref tree"); + /* do not make disk changes in broken FS or nologreplay is given */ if (btrfs_super_log_root(disk_super) != 0 && !btrfs_test_opt(fs_info, NOLOGREPLAY)) { @@ -3120,6 +3016,7 @@ retry_root_backup: fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); + btrfs_warn(fs_info, "failed to read fs tree: %d", err); goto fail_qgroup; } @@ -3283,6 +3180,7 @@ recovery_tree_root: goto fail_block_groups; goto retry_root_backup; } +ALLOW_ERROR_INJECTION(open_ctree, ERRNO); static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) { @@ -3391,6 +3289,7 @@ static int write_dev_supers(struct btrfs_device *device, int errors = 0; u32 crc; u64 bytenr; + int op_flags; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; @@ -3433,13 +3332,10 @@ static int write_dev_supers(struct btrfs_device *device, * we fua the first super. The others we allow * to go down lazy. */ - if (i == 0) { - ret = btrfsic_submit_bh(REQ_OP_WRITE, - REQ_SYNC | REQ_FUA | REQ_META | REQ_PRIO, bh); - } else { - ret = btrfsic_submit_bh(REQ_OP_WRITE, - REQ_SYNC | REQ_META | REQ_PRIO, bh); - } + op_flags = REQ_SYNC | REQ_META | REQ_PRIO; + if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) + op_flags |= REQ_FUA; + ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh); if (ret) errors++; } @@ -3458,6 +3354,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) struct buffer_head *bh; int i; int errors = 0; + bool primary_failed = false; u64 bytenr; if (max_mirrors == 0) @@ -3474,11 +3371,16 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) BTRFS_SUPER_INFO_SIZE); if (!bh) { errors++; + if (i == 0) + primary_failed = true; continue; } wait_on_buffer(bh); - if (!buffer_uptodate(bh)) + if (!buffer_uptodate(bh)) { errors++; + if (i == 0) + primary_failed = true; + } /* drop our reference */ brelse(bh); @@ -3487,6 +3389,13 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) brelse(bh); } + /* log error, force error return */ + if (primary_failed) { + btrfs_err(device->fs_info, "error writing primary super block to device %llu", + device->devid); + return -1; + } + return errors < i ? 0 : -1; } @@ -3519,7 +3428,7 @@ static void write_dev_flush(struct btrfs_device *device) bio->bi_private = &device->flush_wait; btrfsic_submit_bio(bio); - device->flush_bio_sent = 1; + set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); } /* @@ -3529,10 +3438,10 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device) { struct bio *bio = device->flush_bio; - if (!device->flush_bio_sent) + if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) return BLK_STS_OK; - device->flush_bio_sent = 0; + clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); wait_for_completion_io(&device->flush_wait); return bio->bi_status; @@ -3540,7 +3449,7 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device) static int check_barrier_error(struct btrfs_fs_info *fs_info) { - if (!btrfs_check_rw_degradable(fs_info)) + if (!btrfs_check_rw_degradable(fs_info, NULL)) return -EIO; return 0; } @@ -3556,14 +3465,16 @@ static int barrier_all_devices(struct btrfs_fs_info *info) int errors_wait = 0; blk_status_t ret; + lockdep_assert_held(&info->fs_devices->device_list_mutex); /* send down all the barriers */ head = &info->fs_devices->devices; - list_for_each_entry_rcu(dev, head, dev_list) { - if (dev->missing) + list_for_each_entry(dev, head, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) continue; if (!dev->bdev) continue; - if (!dev->in_fs_metadata || !dev->writeable) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; write_dev_flush(dev); @@ -3571,14 +3482,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info) } /* wait for all the barriers */ - list_for_each_entry_rcu(dev, head, dev_list) { - if (dev->missing) + list_for_each_entry(dev, head, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) continue; if (!dev->bdev) { errors_wait++; continue; } - if (!dev->in_fs_metadata || !dev->writeable) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; ret = wait_dev_flush(dev); @@ -3670,12 +3582,13 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) } } - list_for_each_entry_rcu(dev, head, dev_list) { + list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) { total_errors++; continue; } - if (!dev->in_fs_metadata || !dev->writeable) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; btrfs_set_stack_device_generation(dev_item, 0); @@ -3711,10 +3624,11 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) } total_errors = 0; - list_for_each_entry_rcu(dev, head, dev_list) { + list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) continue; - if (!dev->in_fs_metadata || !dev->writeable) + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; ret = wait_dev_supers(dev, max_mirrors); @@ -3948,6 +3862,7 @@ void close_ctree(struct btrfs_fs_info *fs_info) cleanup_srcu_struct(&fs_info->subvol_srcu); btrfs_free_stripe_hash_table(fs_info); + btrfs_free_ref_cache(fs_info); __btrfs_free_block_rsv(root->orphan_block_rsv); root->orphan_block_rsv = NULL; @@ -4007,7 +3922,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) buf->len, fs_info->dirty_metadata_batch); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { + /* + * Since btrfs_mark_buffer_dirty() can be called with item pointer set + * but item data not updated. + * So here we should only check item pointers, not item data. + */ + if (btrfs_header_level(buf) == 0 && + btrfs_check_leaf_relaxed(fs_info, buf)) { btrfs_print_leaf(buf); ASSERT(0); } @@ -4046,12 +3967,14 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info) __btrfs_btree_balance_dirty(fs_info, 0); } -int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, + struct btrfs_key *first_key) { struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; struct btrfs_fs_info *fs_info = root->fs_info; - return btree_read_extent_buffer_pages(fs_info, buf, parent_transid); + return btree_read_extent_buffer_pages(fs_info, buf, parent_transid, + level, first_key); } static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info) @@ -4065,9 +3988,11 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "no valid FS found"); ret = -EINVAL; } - if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) - btrfs_warn(fs_info, "unrecognized super flag: %llu", + if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) { + btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu", btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); + ret = -EINVAL; + } if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "tree_root level too big: %d >= %d", btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); @@ -4272,26 +4197,28 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, while ((node = rb_first(&delayed_refs->href_root)) != NULL) { struct btrfs_delayed_ref_head *head; - struct btrfs_delayed_ref_node *tmp; + struct rb_node *n; bool pin_bytes = false; head = rb_entry(node, struct btrfs_delayed_ref_head, href_node); if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); spin_lock(&delayed_refs->lock); continue; } spin_lock(&head->lock); - list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list, - list) { + while ((n = rb_first(&head->ref_tree)) != NULL) { + ref = rb_entry(n, struct btrfs_delayed_ref_node, + ref_node); ref->in_tree = 0; - list_del(&ref->list); + rb_erase(&ref->ref_node, &head->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); atomic_dec(&delayed_refs->num_entries); @@ -4304,16 +4231,16 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, if (head->processing == 0) delayed_refs->num_heads_ready--; atomic_dec(&delayed_refs->num_entries); - head->node.in_tree = 0; rb_erase(&head->href_node, &delayed_refs->href_root); + RB_CLEAR_NODE(&head->href_node); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); if (pin_bytes) - btrfs_pin_extent(fs_info, head->node.bytenr, - head->node.num_bytes, 1); - btrfs_put_delayed_ref(&head->node); + btrfs_pin_extent(fs_info, head->bytenr, + head->num_bytes, 1); + btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); } @@ -4466,11 +4393,6 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, cache = list_first_entry(&cur_trans->dirty_bgs, struct btrfs_block_group_cache, dirty_list); - if (!cache) { - btrfs_err(fs_info, "orphan block group dirty_bgs list"); - spin_unlock(&cur_trans->dirty_bgs_lock); - return; - } if (!list_empty(&cache->io_list)) { spin_unlock(&cur_trans->dirty_bgs_lock); @@ -4490,14 +4412,14 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, } spin_unlock(&cur_trans->dirty_bgs_lock); + /* + * Refer to the definition of io_bgs member for details why it's safe + * to use it without any locking + */ while (!list_empty(&cur_trans->io_bgs)) { cache = list_first_entry(&cur_trans->io_bgs, struct btrfs_block_group_cache, io_list); - if (!cache) { - btrfs_err(fs_info, "orphan block group on io_bgs list"); - return; - } list_del_init(&cache->io_list); spin_lock(&cache->lock); |