Diffstat (limited to 'fs/btrfs/ctree.c')
-rw-r--r--  fs/btrfs/ctree.c | 5384
1 file changed, 2397 insertions(+), 2987 deletions(-)
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5bf4c39e2ad6..a48b4befbee7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1,117 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007,2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> #include <linux/slab.h> #include <linux/rbtree.h> +#include <linux/mm.h> +#include <linux/error-injection.h> +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "print-tree.h" #include "locking.h" +#include "volumes.h" +#include "qgroup.h" +#include "tree-mod-log.h" +#include "tree-checker.h" +#include "fs.h" +#include "accessors.h" +#include "extent-tree.h" +#include "relocation.h" +#include "file-item.h" + +static struct kmem_cache *btrfs_path_cachep; static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); -static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *ins_key, - struct btrfs_path *path, int data_size, int extend); +static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *ins_key, struct btrfs_path *path, + int data_size, bool extend); static int push_node_left(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *dst, - struct extent_buffer *src, int empty); + struct extent_buffer *dst, + struct extent_buffer *src, bool empty); static int balance_node_right(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); -static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, - int level, int slot); -static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb); -static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); +/* + * The leaf data grows from end-to-front in the node. this returns the address + * of the start of the last item, which is the stop of the leaf data stack. + */ +static unsigned int leaf_data_end(const struct extent_buffer *leaf) +{ + u32 nr = btrfs_header_nritems(leaf); -struct btrfs_path *btrfs_alloc_path(void) + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); + return btrfs_item_offset(leaf, nr - 1); +} + +/* + * Move data in a @leaf (using memmove, safe for overlapping ranges). + * + * @leaf: leaf that we're doing a memmove on + * @dst_offset: item data offset we're moving to + * @src_offset: item data offset were' moving from + * @len: length of the data we're moving + * + * Wrapper around memmove_extent_buffer() that takes into account the header on + * the leaf. The btrfs_item offset's start directly after the header, so we + * have to adjust any offsets to account for the header in the leaf. This + * handles that math to simplify the callers. 
+ */ +static inline void memmove_leaf_data(const struct extent_buffer *leaf, + unsigned long dst_offset, + unsigned long src_offset, + unsigned long len) { - struct btrfs_path *path; - path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); - return path; + memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset, + btrfs_item_nr_offset(leaf, 0) + src_offset, len); } /* - * set all locked nodes in the path to blocking locks. This should - * be done before scheduling + * Copy item data from @src into @dst at the given @offset. + * + * @dst: destination leaf that we're copying into + * @src: source leaf that we're copying from + * @dst_offset: item data offset we're copying to + * @src_offset: item data offset were' copying from + * @len: length of the data we're copying + * + * Wrapper around copy_extent_buffer() that takes into account the header on + * the leaf. The btrfs_item offset's start directly after the header, so we + * have to adjust any offsets to account for the header in the leaf. This + * handles that math to simplify the callers. */ -noinline void btrfs_set_path_blocking(struct btrfs_path *p) +static inline void copy_leaf_data(const struct extent_buffer *dst, + const struct extent_buffer *src, + unsigned long dst_offset, + unsigned long src_offset, unsigned long len) { - int i; - for (i = 0; i < BTRFS_MAX_LEVEL; i++) { - if (!p->nodes[i] || !p->locks[i]) - continue; - btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]); - if (p->locks[i] == BTRFS_READ_LOCK) - p->locks[i] = BTRFS_READ_LOCK_BLOCKING; - else if (p->locks[i] == BTRFS_WRITE_LOCK) - p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING; - } + copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset, + btrfs_item_nr_offset(src, 0) + src_offset, len); } /* - * reset all the locked nodes in the patch to spinning locks. + * Move items in a @leaf (using memmove). + * + * @dst: destination leaf for the items + * @dst_item: the item nr we're copying into + * @src_item: the item nr we're copying from + * @nr_items: the number of items to copy * - * held is used to keep lockdep happy, when lockdep is enabled - * we set held to a blocking lock before we go around and - * retake all the spinlocks in the path. You can safely use NULL - * for held + * Wrapper around memmove_extent_buffer() that does the math to get the + * appropriate offsets into the leaf from the item numbers. */ -noinline void btrfs_clear_path_blocking(struct btrfs_path *p, - struct extent_buffer *held, int held_rw) +static inline void memmove_leaf_items(const struct extent_buffer *leaf, + int dst_item, int src_item, int nr_items) { - int i; + memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item), + btrfs_item_nr_offset(leaf, src_item), + nr_items * sizeof(struct btrfs_item)); +} -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* lockdep really cares that we take all of these spinlocks - * in the right order. If any of the locks in the path are not - * currently blocking, it is going to complain. So, make really - * really sure by forcing the path to blocking before we clear - * the path blocking. 
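The three wrappers introduced above exist only to fold the "item data offsets are relative to the end of the leaf header" adjustment into a single place. A free-standing model of that math, with an illustrative header layout (the real leaf header happens to be 101 bytes, but treat the struct below as a stand-in, not the on-disk format):

#include <stdint.h>
#include <string.h>

struct model_header { uint8_t bytes[101]; };    /* stand-in for the leaf header */

/*
 * Item data offsets are stored relative to the end of the header, so a
 * memmove() over the raw buffer has to add the header size back in --
 * which is what memmove_leaf_data() does via btrfs_item_nr_offset(leaf, 0).
 */
static void model_memmove_leaf_data(uint8_t *leaf_buf,
                                    unsigned long dst_offset,
                                    unsigned long src_offset,
                                    unsigned long len)
{
        const unsigned long hdr = sizeof(struct model_header);

        memmove(leaf_buf + hdr + dst_offset, leaf_buf + hdr + src_offset, len);
}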
- */ - if (held) { - btrfs_set_lock_blocking_rw(held, held_rw); - if (held_rw == BTRFS_WRITE_LOCK) - held_rw = BTRFS_WRITE_LOCK_BLOCKING; - else if (held_rw == BTRFS_READ_LOCK) - held_rw = BTRFS_READ_LOCK_BLOCKING; - } - btrfs_set_path_blocking(p); -#endif - - for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { - if (p->nodes[i] && p->locks[i]) { - btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]); - if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING) - p->locks[i] = BTRFS_WRITE_LOCK; - else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING) - p->locks[i] = BTRFS_READ_LOCK; - } - } +/* + * Copy items from @src into @dst at the given @offset. + * + * @dst: destination leaf for the items + * @src: source leaf for the items + * @dst_item: the item nr we're copying into + * @src_item: the item nr we're copying from + * @nr_items: the number of items to copy + * + * Wrapper around copy_extent_buffer() that does the math to get the + * appropriate offsets into the leaf from the item numbers. + */ +static inline void copy_leaf_items(const struct extent_buffer *dst, + const struct extent_buffer *src, + int dst_item, int src_item, int nr_items) +{ + copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item), + btrfs_item_nr_offset(src, src_item), + nr_items * sizeof(struct btrfs_item)); +} + +struct btrfs_path *btrfs_alloc_path(void) +{ + might_sleep(); -#ifdef CONFIG_DEBUG_LOCK_ALLOC - if (held) - btrfs_clear_lock_blocking_rw(held, held_rw); -#endif + return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); } /* this also releases the path */ @@ -166,11 +194,11 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) /* * RCU really hurts here, we could free up the root node because - * it was cow'ed but we may not get the new root node yet so do + * it was COWed but we may not get the new root node yet so do * the inc_not_zero dance and if it doesn't work then * synchronize_rcu and try again. */ - if (atomic_inc_not_zero(&eb->refs)) { + if (refcount_inc_not_zero(&eb->refs)) { rcu_read_unlock(); break; } @@ -180,56 +208,30 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) return eb; } -/* loop around taking references on and locking the root node of the - * tree until you end up with a lock on the root. A locked buffer - * is returned, with a reference held. +/* + * Cowonly root (not-shareable trees, everything not subvolume or reloc roots), + * just get put onto a simple dirty list. Transaction walks this list to make + * sure they get properly updated on disk. */ -struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) +static void add_root_to_dirty_list(struct btrfs_root *root) { - struct extent_buffer *eb; - - while (1) { - eb = btrfs_root_node(root); - btrfs_tree_lock(eb); - if (eb == root->node) - break; - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - } - return eb; -} + struct btrfs_fs_info *fs_info = root->fs_info; -/* loop around taking references on and locking the root node of the - * tree until you end up with a lock on the root. A locked buffer - * is returned, with a reference held. 
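The hunk above shows only the changed line of btrfs_root_node(); reassembled, the retry loop it sits in looks like this (a sketch of the RCU-plus-refcount pattern the comment describes):

struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        while (1) {
                rcu_read_lock();
                eb = rcu_dereference(root->node);
                /*
                 * A 0 -> 1 transition is not allowed: refcount zero means
                 * the buffer is already being freed. Back off, wait out
                 * the grace period, and retry against the new root.
                 */
                if (refcount_inc_not_zero(&eb->refs)) {
                        rcu_read_unlock();
                        break;
                }
                rcu_read_unlock();
                synchronize_rcu();
        }
        return eb;
}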
- */ -static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) -{ - struct extent_buffer *eb; - - while (1) { - eb = btrfs_root_node(root); - btrfs_tree_read_lock(eb); - if (eb == root->node) - break; - btrfs_tree_read_unlock(eb); - free_extent_buffer(eb); - } - return eb; -} + if (test_bit(BTRFS_ROOT_DIRTY, &root->state) || + !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state)) + return; -/* cowonly root (everything not a reference counted cow subvolume), just get - * put onto a simple dirty list. transaction.c walks this to make sure they - * get properly updated on disk. - */ -static void add_root_to_dirty_list(struct btrfs_root *root) -{ - spin_lock(&root->fs_info->trans_lock); - if (root->track_dirty && list_empty(&root->dirty_list)) { - list_add(&root->dirty_list, - &root->fs_info->dirty_cowonly_roots); + spin_lock(&fs_info->trans_lock); + if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { + /* Want the extent tree to be the last on the list */ + if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID) + list_move_tail(&root->dirty_list, + &fs_info->dirty_cowonly_roots); + else + list_move(&root->dirty_list, + &fs_info->dirty_cowonly_roots); } - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); } /* @@ -242,14 +244,17 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct extent_buffer *buf, struct extent_buffer **cow_ret, u64 new_root_objectid) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *cow; int ret = 0; int level; struct btrfs_disk_key disk_key; + u64 reloc_src_root = 0; - WARN_ON(root->ref_cows && trans->transid != - root->fs_info->running_transaction->transid); - WARN_ON(root->ref_cows && trans->transid != root->last_trans); + WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && + trans->transid != fs_info->running_transaction->transid); + WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && + trans->transid != btrfs_get_root_last_trans(root)); level = btrfs_header_level(buf); if (level == 0) @@ -257,13 +262,15 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_node_key(buf, &disk_key, 0); - cow = btrfs_alloc_free_block(trans, root, buf->len, 0, - new_root_objectid, &disk_key, level, - buf->start, 0); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + reloc_src_root = btrfs_header_owner(buf); + cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, + &disk_key, level, buf->start, 0, + reloc_src_root, BTRFS_NESTING_NEW_ROOT); if (IS_ERR(cow)) return PTR_ERR(cow); - copy_extent_buffer(cow, buf, 0, 0, cow->len); + copy_extent_buffer_full(cow, buf); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); @@ -274,603 +281,74 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, new_root_objectid); - write_extent_buffer(cow, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(cow), - BTRFS_FSID_SIZE); - - WARN_ON(btrfs_header_generation(buf) > trans->transid); - if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); - else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); + write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); - if (ret) + if (unlikely(btrfs_header_generation(buf) > trans->transid)) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); return ret; - - btrfs_mark_buffer_dirty(cow); - *cow_ret = cow; - 
return 0; -} - -enum mod_log_op { - MOD_LOG_KEY_REPLACE, - MOD_LOG_KEY_ADD, - MOD_LOG_KEY_REMOVE, - MOD_LOG_KEY_REMOVE_WHILE_FREEING, - MOD_LOG_KEY_REMOVE_WHILE_MOVING, - MOD_LOG_MOVE_KEYS, - MOD_LOG_ROOT_REPLACE, -}; - -struct tree_mod_move { - int dst_slot; - int nr_items; -}; - -struct tree_mod_root { - u64 logical; - u8 level; -}; - -struct tree_mod_elem { - struct rb_node node; - u64 index; /* shifted logical */ - u64 seq; - enum mod_log_op op; - - /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ - int slot; - - /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */ - u64 generation; - - /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */ - struct btrfs_disk_key key; - u64 blockptr; - - /* this is used for op == MOD_LOG_MOVE_KEYS */ - struct tree_mod_move move; - - /* this is used for op == MOD_LOG_ROOT_REPLACE */ - struct tree_mod_root old_root; -}; - -static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info) -{ - read_lock(&fs_info->tree_mod_log_lock); -} - -static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info) -{ - read_unlock(&fs_info->tree_mod_log_lock); -} - -static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info) -{ - write_lock(&fs_info->tree_mod_log_lock); -} - -static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info) -{ - write_unlock(&fs_info->tree_mod_log_lock); -} - -/* - * Increment the upper half of tree_mod_seq, set lower half zero. - * - * Must be called with fs_info->tree_mod_seq_lock held. - */ -static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info) -{ - u64 seq = atomic64_read(&fs_info->tree_mod_seq); - seq &= 0xffffffff00000000ull; - seq += 1ull << 32; - atomic64_set(&fs_info->tree_mod_seq, seq); - return seq; -} - -/* - * Increment the lower half of tree_mod_seq. - * - * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers - * are generated should not technically require a spin lock here. (Rationale: - * incrementing the minor while incrementing the major seq number is between its - * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it - * just returns a unique sequence number as usual.) We have decided to leave - * that requirement in here and rethink it once we notice it really imposes a - * problem on some workload. - */ -static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info) -{ - return atomic64_inc_return(&fs_info->tree_mod_seq); -} - -/* - * return the last minor in the previous major tree_mod_seq number - */ -u64 btrfs_tree_mod_seq_prev(u64 seq) -{ - return (seq & 0xffffffff00000000ull) - 1ull; -} - -/* - * This adds a new blocker to the tree mod log's blocker list if the @elem - * passed does not already have a sequence number set. So when a caller expects - * to record tree modifications, it should ensure to set elem->seq to zero - * before calling btrfs_get_tree_mod_seq. - * Returns a fresh, unused tree log modification sequence number, even if no new - * blocker was added. 
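For context on the API whose implementation is being deleted here (this commit is part of moving it into tree-mod-log.c): a reader that needs a stable view of the tree registers a blocker with a zeroed elem->seq, then drops it when done. A hedged sketch of that usage; the surrounding backref-walker caller is assumed, not shown in this diff:

struct seq_list elem = {};      /* elem.seq == 0: not registered yet */
u64 seq;

/* Register a blocker and get a fresh sequence number. */
seq = btrfs_get_tree_mod_seq(fs_info, &elem);

/*
 * ... resolve backrefs against the tree state at 'seq'; log entries with
 * seq >= elem.seq cannot be pruned while the blocker is live ...
 */

/* Drop the blocker; entries below the lowest remaining blocker get pruned. */
btrfs_put_tree_mod_seq(fs_info, &elem);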
- */ -u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem) -{ - u64 seq; - - tree_mod_log_write_lock(fs_info); - spin_lock(&fs_info->tree_mod_seq_lock); - if (!elem->seq) { - elem->seq = btrfs_inc_tree_mod_seq_major(fs_info); - list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); - } - seq = btrfs_inc_tree_mod_seq_minor(fs_info); - spin_unlock(&fs_info->tree_mod_seq_lock); - tree_mod_log_write_unlock(fs_info); - - return seq; -} - -void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem) -{ - struct rb_root *tm_root; - struct rb_node *node; - struct rb_node *next; - struct seq_list *cur_elem; - struct tree_mod_elem *tm; - u64 min_seq = (u64)-1; - u64 seq_putting = elem->seq; - - if (!seq_putting) - return; - - spin_lock(&fs_info->tree_mod_seq_lock); - list_del(&elem->list); - elem->seq = 0; - - list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { - if (cur_elem->seq < min_seq) { - if (seq_putting > cur_elem->seq) { - /* - * blocker with lower sequence number exists, we - * cannot remove anything from the log - */ - spin_unlock(&fs_info->tree_mod_seq_lock); - return; - } - min_seq = cur_elem->seq; - } - } - spin_unlock(&fs_info->tree_mod_seq_lock); - - /* - * anything that's lower than the lowest existing (read: blocked) - * sequence number can be removed from the tree. - */ - tree_mod_log_write_lock(fs_info); - tm_root = &fs_info->tree_mod_log; - for (node = rb_first(tm_root); node; node = next) { - next = rb_next(node); - tm = container_of(node, struct tree_mod_elem, node); - if (tm->seq > min_seq) - continue; - rb_erase(node, tm_root); - kfree(tm); } - tree_mod_log_write_unlock(fs_info); -} -/* - * key order of the log: - * index -> sequence - * - * the index is the shifted logical of the *new* root node for root replace - * operations, or the shifted logical of the affected block for all other - * operations. - */ -static noinline int -__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) -{ - struct rb_root *tm_root; - struct rb_node **new; - struct rb_node *parent = NULL; - struct tree_mod_elem *cur; - - BUG_ON(!tm || !tm->seq); - - tm_root = &fs_info->tree_mod_log; - new = &tm_root->rb_node; - while (*new) { - cur = container_of(*new, struct tree_mod_elem, node); - parent = *new; - if (cur->index < tm->index) - new = &((*new)->rb_left); - else if (cur->index > tm->index) - new = &((*new)->rb_right); - else if (cur->seq < tm->seq) - new = &((*new)->rb_left); - else if (cur->seq > tm->seq) - new = &((*new)->rb_right); - else { - kfree(tm); - return -EEXIST; - } + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_inc_ref(trans, root, cow, 1); + if (unlikely(ret)) + btrfs_abort_transaction(trans, ret); + } else { + ret = btrfs_inc_ref(trans, root, cow, 0); + if (unlikely(ret)) + btrfs_abort_transaction(trans, ret); } - - rb_link_node(&tm->node, parent, new); - rb_insert_color(&tm->node, tm_root); - return 0; -} - -/* - * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it - * returns zero with the tree_mod_log_lock acquired. The caller must hold - * this until all tree mod log insertions are recorded in the rb tree and then - * call tree_mod_log_write_unlock() to release. 
- */ -static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) { - smp_mb(); - if (list_empty(&(fs_info)->tree_mod_seq_list)) - return 1; - if (eb && btrfs_header_level(eb) == 0) - return 1; - - tree_mod_log_write_lock(fs_info); - if (list_empty(&fs_info->tree_mod_seq_list)) { - /* - * someone emptied the list while we were waiting for the lock. - * we must not add to the list when no blocker exists. - */ - tree_mod_log_write_unlock(fs_info); - return 1; + if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + return ret; } + btrfs_mark_buffer_dirty(trans, cow); + *cow_ret = cow; return 0; } /* - * This allocates memory and gets a tree modification sequence number. - * - * Returns <0 on error. - * Returns >0 (the added sequence number) on success. + * check if the tree block can be shared by multiple trees */ -static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, - struct tree_mod_elem **tm_ret) +bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans, + const struct btrfs_root *root, + const struct extent_buffer *buf) { - struct tree_mod_elem *tm; + const u64 buf_gen = btrfs_header_generation(buf); /* - * once we switch from spin locks to something different, we should - * honor the flags parameter here. + * Tree blocks not in shareable trees and tree roots are never shared. + * If a block was allocated after the last snapshot and the block was + * not allocated by tree relocation, we know the block is not shared. */ - tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC); - if (!tm) - return -ENOMEM; - - spin_lock(&fs_info->tree_mod_seq_lock); - tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info); - spin_unlock(&fs_info->tree_mod_seq_lock); - - return tm->seq; -} - -static inline int -__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) -{ - int ret; - struct tree_mod_elem *tm; - - ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret < 0) - return ret; - - tm->index = eb->start >> PAGE_CACHE_SHIFT; - if (op != MOD_LOG_KEY_ADD) { - btrfs_node_key(eb, &tm->key, slot); - tm->blockptr = btrfs_node_blockptr(eb, slot); - } - tm->op = op; - tm->slot = slot; - tm->generation = btrfs_node_ptr_generation(eb, slot); - - return __tree_mod_log_insert(fs_info, tm); -} - -static noinline int -tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) -{ - int ret; - if (tree_mod_dont_log(fs_info, eb)) - return 0; - - ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags); - - tree_mod_log_write_unlock(fs_info); - return ret; -} - -static noinline int -tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, - int slot, enum mod_log_op op) -{ - return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS); -} + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + return false; -static noinline int -tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, - enum mod_log_op op) -{ - return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS); -} + if (buf == root->node) + return false; -static noinline int -tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int dst_slot, int src_slot, - int nr_items, gfp_t flags) -{ - struct tree_mod_elem *tm; - int ret; - int i; + if (buf_gen > btrfs_root_last_snapshot(&root->root_item) && + !btrfs_header_flag(buf, 
BTRFS_HEADER_FLAG_RELOC)) + return false; - if (tree_mod_dont_log(fs_info, eb)) - return 0; + if (buf != root->commit_root) + return true; /* - * When we override something during the move, we log these removals. - * This can only happen when we move towards the beginning of the - * buffer, i.e. dst_slot < src_slot. + * An extent buffer that used to be the commit root may still be shared + * because the tree height may have increased and it became a child of a + * higher level root. This can happen when snapshotting a subvolume + * created in the current transaction. */ - for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { - ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot, - MOD_LOG_KEY_REMOVE_WHILE_MOVING); - BUG_ON(ret < 0); - } - - ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret < 0) - goto out; - - tm->index = eb->start >> PAGE_CACHE_SHIFT; - tm->slot = src_slot; - tm->move.dst_slot = dst_slot; - tm->move.nr_items = nr_items; - tm->op = MOD_LOG_MOVE_KEYS; - - ret = __tree_mod_log_insert(fs_info, tm); -out: - tree_mod_log_write_unlock(fs_info); - return ret; -} + if (buf_gen == trans->transid) + return true; -static inline void -__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) -{ - int i; - u32 nritems; - int ret; - - if (btrfs_header_level(eb) == 0) - return; - - nritems = btrfs_header_nritems(eb); - for (i = nritems - 1; i >= 0; i--) { - ret = tree_mod_log_insert_key_locked(fs_info, eb, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING); - BUG_ON(ret < 0); - } -} - -static noinline int -tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, - struct extent_buffer *old_root, - struct extent_buffer *new_root, gfp_t flags, - int log_removal) -{ - struct tree_mod_elem *tm; - int ret; - - if (tree_mod_dont_log(fs_info, NULL)) - return 0; - - if (log_removal) - __tree_mod_log_free_eb(fs_info, old_root); - - ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret < 0) - goto out; - - tm->index = new_root->start >> PAGE_CACHE_SHIFT; - tm->old_root.logical = old_root->start; - tm->old_root.level = btrfs_header_level(old_root); - tm->generation = btrfs_header_generation(old_root); - tm->op = MOD_LOG_ROOT_REPLACE; - - ret = __tree_mod_log_insert(fs_info, tm); -out: - tree_mod_log_write_unlock(fs_info); - return ret; -} - -static struct tree_mod_elem * -__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, - int smallest) -{ - struct rb_root *tm_root; - struct rb_node *node; - struct tree_mod_elem *cur = NULL; - struct tree_mod_elem *found = NULL; - u64 index = start >> PAGE_CACHE_SHIFT; - - tree_mod_log_read_lock(fs_info); - tm_root = &fs_info->tree_mod_log; - node = tm_root->rb_node; - while (node) { - cur = container_of(node, struct tree_mod_elem, node); - if (cur->index < index) { - node = node->rb_left; - } else if (cur->index > index) { - node = node->rb_right; - } else if (cur->seq < min_seq) { - node = node->rb_left; - } else if (!smallest) { - /* we want the node with the highest seq */ - if (found) - BUG_ON(found->seq > cur->seq); - found = cur; - node = node->rb_left; - } else if (cur->seq > min_seq) { - /* we want the node with the smallest seq */ - if (found) - BUG_ON(found->seq < cur->seq); - found = cur; - node = node->rb_right; - } else { - found = cur; - break; - } - } - tree_mod_log_read_unlock(fs_info); - - return found; -} - -/* - * this returns the element from the log with the smallest time sequence - * value that's in the log (the oldest log item). 
any element with a time - * sequence lower than min_seq will be ignored. - */ -static struct tree_mod_elem * -tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start, - u64 min_seq) -{ - return __tree_mod_log_search(fs_info, start, min_seq, 1); -} - -/* - * this returns the element from the log with the largest time sequence - * value that's in the log (the most recent log item). any element with - * a time sequence lower than min_seq will be ignored. - */ -static struct tree_mod_elem * -tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) -{ - return __tree_mod_log_search(fs_info, start, min_seq, 0); -} - -static noinline void -tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, - struct extent_buffer *src, unsigned long dst_offset, - unsigned long src_offset, int nr_items) -{ - int ret; - int i; - - if (tree_mod_dont_log(fs_info, NULL)) - return; - - if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) { - tree_mod_log_write_unlock(fs_info); - return; - } - - for (i = 0; i < nr_items; i++) { - ret = tree_mod_log_insert_key_locked(fs_info, src, - i + src_offset, - MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); - ret = tree_mod_log_insert_key_locked(fs_info, dst, - i + dst_offset, - MOD_LOG_KEY_ADD); - BUG_ON(ret < 0); - } - - tree_mod_log_write_unlock(fs_info); -} - -static inline void -tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, - int dst_offset, int src_offset, int nr_items) -{ - int ret; - ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset, - nr_items, GFP_NOFS); - BUG_ON(ret < 0); -} - -static noinline void -tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, int atomic) -{ - int ret; - - ret = tree_mod_log_insert_key_mask(fs_info, eb, slot, - MOD_LOG_KEY_REPLACE, - atomic ? GFP_ATOMIC : GFP_NOFS); - BUG_ON(ret < 0); -} - -static noinline void -tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) -{ - if (tree_mod_dont_log(fs_info, eb)) - return; - - __tree_mod_log_free_eb(fs_info, eb); - - tree_mod_log_write_unlock(fs_info); -} - -static noinline void -tree_mod_log_set_root_pointer(struct btrfs_root *root, - struct extent_buffer *new_root_node, - int log_removal) -{ - int ret; - ret = tree_mod_log_insert_root(root->fs_info, root->node, - new_root_node, GFP_NOFS, log_removal); - BUG_ON(ret < 0); -} - -/* - * check if the tree block can be shared by multiple trees - */ -int btrfs_block_can_be_shared(struct btrfs_root *root, - struct extent_buffer *buf) -{ - /* - * Tree blocks not in refernece counted trees and tree roots - * are never shared. If a block was allocated after the last - * snapshot and the block was not allocated by tree relocation, - * we know the block is not shared. 
- */ - if (root->ref_cows && - buf != root->node && buf != root->commit_root && - (btrfs_header_generation(buf) <= - btrfs_root_last_snapshot(&root->root_item) || - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) - return 1; -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (root->ref_cows && - btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) - return 1; -#endif - return 0; + return false; } static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, @@ -879,10 +357,10 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, struct extent_buffer *cow, int *last_ref) { + struct btrfs_fs_info *fs_info = root->fs_info; u64 refs; u64 owner; u64 flags; - u64 new_flags = 0; int ret; /* @@ -902,20 +380,24 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, * are only allowed for blocks use full backrefs. */ - if (btrfs_block_can_be_shared(root, buf)) { - ret = btrfs_lookup_extent_info(trans, root, buf->start, + if (btrfs_block_can_be_shared(trans, root, buf)) { + ret = btrfs_lookup_extent_info(trans, fs_info, buf->start, btrfs_header_level(buf), 1, - &refs, &flags); + &refs, &flags, NULL); if (ret) return ret; - if (refs == 0) { - ret = -EROFS; - btrfs_std_error(root->fs_info, ret); + if (unlikely(refs == 0)) { + btrfs_crit(fs_info, + "found 0 references for tree block at bytenr %llu level %d root %llu", + buf->start, btrfs_header_level(buf), + btrfs_root_id(root)); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); return ret; } } else { refs = 1; - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID || btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; else @@ -923,55 +405,59 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, } owner = btrfs_header_owner(buf); - BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID && - !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + if (unlikely(owner == BTRFS_TREE_RELOC_OBJECTID && + !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))) { + btrfs_crit(fs_info, +"found tree block at bytenr %llu level %d root %llu refs %llu flags %llx without full backref flag set", + buf->start, btrfs_header_level(buf), + btrfs_root_id(root), refs, flags); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } if (refs > 1) { - if ((owner == root->root_key.objectid || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && + if ((owner == btrfs_root_id(root) || + btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { - ret = btrfs_inc_ref(trans, root, buf, 1, 1); - BUG_ON(ret); /* -ENOMEM */ - - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_dec_ref(trans, root, buf, 0, 1); - BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_inc_ref(trans, root, cow, 1, 1); - BUG_ON(ret); /* -ENOMEM */ + ret = btrfs_inc_ref(trans, root, buf, 1); + if (ret) + return ret; + + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_dec_ref(trans, root, buf, 0); + if (ret) + return ret; + ret = btrfs_inc_ref(trans, root, cow, 1); + if (ret) + return ret; } - new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + ret = btrfs_set_disk_extent_flags(trans, buf, + BTRFS_BLOCK_FLAG_FULL_BACKREF); + if (ret) + return ret; } else { - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); else - ret = 
btrfs_inc_ref(trans, root, cow, 0, 1); - BUG_ON(ret); /* -ENOMEM */ - } - if (new_flags != 0) { - int level = btrfs_header_level(buf); - - ret = btrfs_set_disk_extent_flags(trans, root, - buf->start, - buf->len, - new_flags, level, 0); + ret = btrfs_inc_ref(trans, root, cow, 0); if (ret) return ret; } } else { if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); - BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_dec_ref(trans, root, buf, 1, 1); - BUG_ON(ret); /* -ENOMEM */ + ret = btrfs_inc_ref(trans, root, cow, 0); + if (ret) + return ret; + ret = btrfs_dec_ref(trans, root, buf, 1); + if (ret) + return ret; } - clean_tree_block(trans, root, buf); + btrfs_clear_buffer_dirty(trans, buf); *last_ref = 1; } return 0; @@ -989,28 +475,32 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, * bytes the allocator should try to find free next to the block it returns. * This is just a hint and may be ignored by the allocator. */ -static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf, - struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret, - u64 search_start, u64 empty_size) +int btrfs_force_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + u64 search_start, u64 empty_size, + enum btrfs_lock_nesting nest) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_disk_key disk_key; struct extent_buffer *cow; int level, ret; int last_ref = 0; int unlock_orig = 0; - u64 parent_start; + u64 parent_start = 0; + u64 reloc_src_root = 0; if (*cow_ret == buf) unlock_orig = 1; - btrfs_assert_tree_locked(buf); + btrfs_assert_tree_write_locked(buf); - WARN_ON(root->ref_cows && trans->transid != - root->fs_info->running_transaction->transid); - WARN_ON(root->ref_cows && trans->transid != root->last_trans); + WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && + trans->transid != fs_info->running_transaction->transid); + WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && + trans->transid != btrfs_get_root_last_trans(root)); level = btrfs_header_level(buf); @@ -1019,371 +509,116 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, else btrfs_node_key(buf, &disk_key, 0); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { if (parent) parent_start = parent->start; - else - parent_start = 0; - } else - parent_start = 0; - - cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, - root->root_key.objectid, &disk_key, - level, search_start, empty_size); + reloc_src_root = btrfs_header_owner(buf); + } + cow = btrfs_alloc_tree_block(trans, root, parent_start, + btrfs_root_id(root), &disk_key, level, + search_start, empty_size, reloc_src_root, nest); if (IS_ERR(cow)) return PTR_ERR(cow); /* cow is set to blocking by btrfs_init_new_buffer */ - copy_extent_buffer(cow, buf, 0, 0, cow->len); + copy_extent_buffer_full(cow, buf); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); btrfs_clear_header_flag(cow, 
BTRFS_HEADER_FLAG_WRITTEN | BTRFS_HEADER_FLAG_RELOC); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); else - btrfs_set_header_owner(cow, root->root_key.objectid); + btrfs_set_header_owner(cow, btrfs_root_id(root)); - write_extent_buffer(cow, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(cow), - BTRFS_FSID_SIZE); + write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - return ret; + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + goto error_unlock_cow; } - if (root->ref_cows) - btrfs_reloc_cow_block(trans, root, buf, cow); + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { + ret = btrfs_reloc_cow_block(trans, root, buf, cow); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + goto error_unlock_cow; + } + } if (buf == root->node) { WARN_ON(parent && parent != buf); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID || btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) parent_start = buf->start; - else - parent_start = 0; - extent_buffer_get(cow); - tree_mod_log_set_root_pointer(root, cow, 1); + ret = btrfs_tree_mod_log_insert_root(root->node, cow, true); + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + goto error_unlock_cow; + } + refcount_inc(&cow->refs); rcu_assign_pointer(root->node, cow); - btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref); + ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf, + parent_start, last_ref); free_extent_buffer(buf); add_root_to_dirty_list(root); + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + goto error_unlock_cow; + } } else { - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - parent_start = parent->start; - else - parent_start = 0; - WARN_ON(trans->transid != btrfs_header_generation(parent)); - tree_mod_log_insert_key(root->fs_info, parent, parent_slot, - MOD_LOG_KEY_REPLACE); + ret = btrfs_tree_mod_log_insert_key(parent, parent_slot, + BTRFS_MOD_LOG_KEY_REPLACE); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + goto error_unlock_cow; + } btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); - btrfs_mark_buffer_dirty(parent); - if (last_ref) - tree_mod_log_free_eb(root->fs_info, buf); - btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref); + btrfs_mark_buffer_dirty(trans, parent); + if (last_ref) { + ret = btrfs_tree_mod_log_free_eb(buf); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + goto error_unlock_cow; + } + } + ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf, + parent_start, last_ref); + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + goto error_unlock_cow; + } } + + trace_btrfs_cow_block(root, buf, cow); if (unlock_orig) btrfs_tree_unlock(buf); free_extent_buffer_stale(buf); - btrfs_mark_buffer_dirty(cow); + btrfs_mark_buffer_dirty(trans, cow); *cow_ret = cow; return 0; -} - -/* - * returns the logical address of the oldest predecessor of the given root. - * entries older than time_seq are ignored. 
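The rewind machinery removed below undoes logged operations newest-first: each log entry stores enough state to invert the original modification. A toy version of that idea over a plain array, illustrative only and not the btrfs structures:

struct toy_mod {
        int was_add;                    /* 1: logged an insertion, 0: a removal */
        int slot;
        int old_val;                    /* value to restore when undoing a removal */
        unsigned long long seq;
};

/* Undo entries newest-first until we reach state older than time_seq. */
static void toy_rewind(int *vals, unsigned int *nr,
                       const struct toy_mod *log, int n,
                       unsigned long long time_seq)
{
        int i;

        for (i = n - 1; i >= 0 && log[i].seq >= time_seq; i--) {
                if (log[i].was_add) {
                        (*nr)--;                            /* invert an add */
                } else {
                        vals[log[i].slot] = log[i].old_val; /* invert a remove */
                        (*nr)++;
                }
        }
}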
- */ -static struct tree_mod_elem * -__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb_root, u64 time_seq) -{ - struct tree_mod_elem *tm; - struct tree_mod_elem *found = NULL; - u64 root_logical = eb_root->start; - int looped = 0; - - if (!time_seq) - return 0; - - /* - * the very last operation that's logged for a root is the replacement - * operation (if it is replaced at all). this has the index of the *new* - * root, making it the very first operation that's logged for this root. - */ - while (1) { - tm = tree_mod_log_search_oldest(fs_info, root_logical, - time_seq); - if (!looped && !tm) - return 0; - /* - * if there are no tree operation for the oldest root, we simply - * return it. this should only happen if that (old) root is at - * level 0. - */ - if (!tm) - break; - - /* - * if there's an operation that's not a root replacement, we - * found the oldest version of our root. normally, we'll find a - * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here. - */ - if (tm->op != MOD_LOG_ROOT_REPLACE) - break; - - found = tm; - root_logical = tm->old_root.logical; - looped = 1; - } - - /* if there's no old root to return, return what we found instead */ - if (!found) - found = tm; - - return found; -} - -/* - * tm is a pointer to the first operation to rewind within eb. then, all - * previous operations will be rewinded (until we reach something older than - * time_seq). - */ -static void -__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, - u64 time_seq, struct tree_mod_elem *first_tm) -{ - u32 n; - struct rb_node *next; - struct tree_mod_elem *tm = first_tm; - unsigned long o_dst; - unsigned long o_src; - unsigned long p_size = sizeof(struct btrfs_key_ptr); - - n = btrfs_header_nritems(eb); - tree_mod_log_read_lock(fs_info); - while (tm && tm->seq >= time_seq) { - /* - * all the operations are recorded with the operator used for - * the modification. as we're going backwards, we do the - * opposite of each operation here. - */ - switch (tm->op) { - case MOD_LOG_KEY_REMOVE_WHILE_FREEING: - BUG_ON(tm->slot < n); - /* Fallthrough */ - case MOD_LOG_KEY_REMOVE_WHILE_MOVING: - case MOD_LOG_KEY_REMOVE: - btrfs_set_node_key(eb, &tm->key, tm->slot); - btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); - btrfs_set_node_ptr_generation(eb, tm->slot, - tm->generation); - n++; - break; - case MOD_LOG_KEY_REPLACE: - BUG_ON(tm->slot >= n); - btrfs_set_node_key(eb, &tm->key, tm->slot); - btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); - btrfs_set_node_ptr_generation(eb, tm->slot, - tm->generation); - break; - case MOD_LOG_KEY_ADD: - /* if a move operation is needed it's in the log */ - n--; - break; - case MOD_LOG_MOVE_KEYS: - o_dst = btrfs_node_key_ptr_offset(tm->slot); - o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); - memmove_extent_buffer(eb, o_dst, o_src, - tm->move.nr_items * p_size); - break; - case MOD_LOG_ROOT_REPLACE: - /* - * this operation is special. for roots, this must be - * handled explicitly before rewinding. - * for non-roots, this operation may exist if the node - * was a root: root A -> child B; then A gets empty and - * B is promoted to the new root. in the mod log, we'll - * have a root-replace operation for B, a tree block - * that is no root. we simply ignore that operation. 
- */ - break; - } - next = rb_next(&tm->node); - if (!next) - break; - tm = container_of(next, struct tree_mod_elem, node); - if (tm->index != first_tm->index) - break; - } - tree_mod_log_read_unlock(fs_info); - btrfs_set_header_nritems(eb, n); -} - -/* - * Called with eb read locked. If the buffer cannot be rewinded, the same buffer - * is returned. If rewind operations happen, a fresh buffer is returned. The - * returned buffer is always read-locked. If the returned buffer is not the - * input buffer, the lock on the input buffer is released and the input buffer - * is freed (its refcount is decremented). - */ -static struct extent_buffer * -tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, - u64 time_seq) -{ - struct extent_buffer *eb_rewin; - struct tree_mod_elem *tm; - - if (!time_seq) - return eb; - - if (btrfs_header_level(eb) == 0) - return eb; - - tm = tree_mod_log_search(fs_info, eb->start, time_seq); - if (!tm) - return eb; - - if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { - BUG_ON(tm->slot != 0); - eb_rewin = alloc_dummy_extent_buffer(eb->start, - fs_info->tree_root->nodesize); - BUG_ON(!eb_rewin); - btrfs_set_header_bytenr(eb_rewin, eb->start); - btrfs_set_header_backref_rev(eb_rewin, - btrfs_header_backref_rev(eb)); - btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); - btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); - } else { - eb_rewin = btrfs_clone_extent_buffer(eb); - BUG_ON(!eb_rewin); - } - - extent_buffer_get(eb_rewin); - btrfs_tree_read_unlock(eb); - free_extent_buffer(eb); - - extent_buffer_get(eb_rewin); - btrfs_tree_read_lock(eb_rewin); - __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); - WARN_ON(btrfs_header_nritems(eb_rewin) > - BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root)); - - return eb_rewin; -} - -/* - * get_old_root() rewinds the state of @root's root node to the given @time_seq - * value. If there are no changes, the current root->root_node is returned. If - * anything changed in between, there's a fresh buffer allocated on which the - * rewind operations are done. In any case, the returned buffer is read locked. - * Returns NULL on error (with no locks held). 
- */ -static inline struct extent_buffer * -get_old_root(struct btrfs_root *root, u64 time_seq) -{ - struct tree_mod_elem *tm; - struct extent_buffer *eb = NULL; - struct extent_buffer *eb_root; - struct extent_buffer *old; - struct tree_mod_root *old_root = NULL; - u64 old_generation = 0; - u64 logical; - u32 blocksize; - - eb_root = btrfs_read_lock_root_node(root); - tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq); - if (!tm) - return eb_root; - - if (tm->op == MOD_LOG_ROOT_REPLACE) { - old_root = &tm->old_root; - old_generation = tm->generation; - logical = old_root->logical; - } else { - logical = eb_root->start; - } - - tm = tree_mod_log_search(root->fs_info, logical, time_seq); - if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { - btrfs_tree_read_unlock(eb_root); - free_extent_buffer(eb_root); - blocksize = btrfs_level_size(root, old_root->level); - old = read_tree_block(root, logical, blocksize, 0); - if (!old || !extent_buffer_uptodate(old)) { - free_extent_buffer(old); - pr_warn("btrfs: failed to read tree block %llu from get_old_root\n", - logical); - WARN_ON(1); - } else { - eb = btrfs_clone_extent_buffer(old); - free_extent_buffer(old); - } - } else if (old_root) { - btrfs_tree_read_unlock(eb_root); - free_extent_buffer(eb_root); - eb = alloc_dummy_extent_buffer(logical, root->nodesize); - } else { - eb = btrfs_clone_extent_buffer(eb_root); - btrfs_tree_read_unlock(eb_root); - free_extent_buffer(eb_root); - } - - if (!eb) - return NULL; - extent_buffer_get(eb); - btrfs_tree_read_lock(eb); - if (old_root) { - btrfs_set_header_bytenr(eb, eb->start); - btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(eb, btrfs_header_owner(eb_root)); - btrfs_set_header_level(eb, old_root->level); - btrfs_set_header_generation(eb, old_generation); - } - if (tm) - __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm); - else - WARN_ON(btrfs_header_level(eb) != 0); - WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root)); - - return eb; -} -int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq) -{ - struct tree_mod_elem *tm; - int level; - struct extent_buffer *eb_root = btrfs_root_node(root); - - tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq); - if (tm && tm->op == MOD_LOG_ROOT_REPLACE) { - level = tm->old_root.level; - } else { - level = btrfs_header_level(eb_root); - } - free_extent_buffer(eb_root); - - return level; +error_unlock_cow: + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + return ret; } -static inline int should_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf) +static inline bool should_cow_block(const struct btrfs_trans_handle *trans, + const struct btrfs_root *root, + const struct extent_buffer *buf) { - /* ensure we can see the force_cow */ - smp_rmb(); + if (btrfs_is_testing(root->fs_info)) + return false; /* * We do not need to cow a block if @@ -1392,91 +627,93 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, * 3) the root is not forced COW. * * What is forced COW: - * when we create snapshot during commiting the transaction, - * after we've finished coping src root, we must COW the shared + * when we create snapshot during committing the transaction, + * after we've finished copying src root, we must COW the shared * block to ensure the metadata consistency. 
*/ - if (btrfs_header_generation(buf) == trans->transid && - !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && - !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && - !root->force_cow) - return 0; - return 1; + + if (btrfs_header_generation(buf) != trans->transid) + return true; + + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) + return true; + + /* Ensure we can see the FORCE_COW bit. */ + smp_mb__before_atomic(); + if (test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) + return true; + + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) + return false; + + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) + return true; + + return false; } /* - * cows a single block, see __btrfs_cow_block for the real work. - * This version of it has extra checks so that a block isn't cow'd more than + * COWs a single block, see btrfs_force_cow_block() for the real work. + * This version of it has extra checks so that a block isn't COWed more than * once per transaction, as long as it hasn't been written yet */ -noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, +int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret) + struct extent_buffer **cow_ret, + enum btrfs_lock_nesting nest) { + struct btrfs_fs_info *fs_info = root->fs_info; u64 search_start; - int ret; - if (trans->transaction != root->fs_info->running_transaction) - WARN(1, KERN_CRIT "trans %llu running %llu\n", - (unsigned long long)trans->transid, - (unsigned long long) - root->fs_info->running_transaction->transid); + if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, + "attempt to COW block %llu on root %llu that is being deleted", + buf->start, btrfs_root_id(root)); + return -EUCLEAN; + } - if (trans->transid != root->fs_info->generation) - WARN(1, KERN_CRIT "trans %llu running %llu\n", - (unsigned long long)trans->transid, - (unsigned long long)root->fs_info->generation); + /* + * COWing must happen through a running transaction, which always + * matches the current fs generation (it's a transaction with a state + * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs + * into error state to prevent the commit of any transaction. 
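Restating the rewritten should_cow_block() above as one boolean expression (equivalent logic, condensed for reading; the testing-filesystem check and the memory barrier before the FORCE_COW test are omitted here, and the kernel keeps the early returns for clarity):

static bool needs_cow(const struct btrfs_trans_handle *trans,
                      const struct btrfs_root *root,
                      const struct extent_buffer *buf)
{
        return btrfs_header_generation(buf) != trans->transid ||
               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) ||
               test_bit(BTRFS_ROOT_FORCE_COW, &root->state) ||
               (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
                btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC));
}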
+ */ + if (unlikely(trans->transaction != fs_info->running_transaction || + trans->transid != fs_info->generation)) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, +"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu", + buf->start, btrfs_root_id(root), trans->transid, + fs_info->running_transaction->transid, + fs_info->generation); + return -EUCLEAN; + } if (!should_cow_block(trans, root, buf)) { *cow_ret = buf; return 0; } - search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); - - if (parent) - btrfs_set_lock_blocking(parent); - btrfs_set_lock_blocking(buf); + search_start = round_down(buf->start, SZ_1G); - ret = __btrfs_cow_block(trans, root, buf, parent, - parent_slot, cow_ret, search_start, 0); - - trace_btrfs_cow_block(root, buf, *cow_ret); - - return ret; -} - -/* - * helper function for defrag to decide if two blocks pointed to by a - * node are actually close by - */ -static int close_blocks(u64 blocknr, u64 other, u32 blocksize) -{ - if (blocknr < other && other - (blocknr + blocksize) < 32768) - return 1; - if (blocknr > other && blocknr - (other + blocksize) < 32768) - return 1; - return 0; -} - -/* - * compare two keys in a memcmp fashion - */ -static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) -{ - struct btrfs_key k1; - - btrfs_disk_key_to_cpu(&k1, disk); - - return btrfs_comp_cpu_keys(&k1, k2); + /* + * Before CoWing this block for later modification, check if it's + * the subtree root and do the delayed subtree trace if needed. + * + * Also We don't care about the error, as it's handled internally. + */ + btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); + return btrfs_force_cow_block(trans, root, buf, parent, parent_slot, + cow_ret, search_start, 0, nest); } +ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); /* * same as comp_keys only with two btrfs_key's */ -int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) +int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2) { if (k1->objectid > k2->objectid) return 1; @@ -1494,183 +731,73 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) } /* - * this is used by the defrag code to go through all the - * leaves pointed to by a node and reallocate them so that - * disk order is close to key order - */ -int btrfs_realloc_node(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *parent, - int start_slot, u64 *last_ret, - struct btrfs_key *progress) -{ - struct extent_buffer *cur; - u64 blocknr; - u64 gen; - u64 search_start = *last_ret; - u64 last_block = 0; - u64 other; - u32 parent_nritems; - int end_slot; - int i; - int err = 0; - int parent_level; - int uptodate; - u32 blocksize; - int progress_passed = 0; - struct btrfs_disk_key disk_key; - - parent_level = btrfs_header_level(parent); - - WARN_ON(trans->transaction != root->fs_info->running_transaction); - WARN_ON(trans->transid != root->fs_info->generation); - - parent_nritems = btrfs_header_nritems(parent); - blocksize = btrfs_level_size(root, parent_level - 1); - end_slot = parent_nritems; - - if (parent_nritems == 1) - return 0; - - btrfs_set_lock_blocking(parent); - - for (i = start_slot; i < end_slot; i++) { - int close = 1; - - btrfs_node_key(parent, &disk_key, i); - if (!progress_passed && comp_keys(&disk_key, progress) < 0) - continue; - - progress_passed = 1; - blocknr = btrfs_node_blockptr(parent, i); - gen = 
btrfs_node_ptr_generation(parent, i); - if (last_block == 0) - last_block = blocknr; - - if (i > 0) { - other = btrfs_node_blockptr(parent, i - 1); - close = close_blocks(blocknr, other, blocksize); - } - if (!close && i < end_slot - 2) { - other = btrfs_node_blockptr(parent, i + 1); - close = close_blocks(blocknr, other, blocksize); - } - if (close) { - last_block = blocknr; - continue; - } - - cur = btrfs_find_tree_block(root, blocknr, blocksize); - if (cur) - uptodate = btrfs_buffer_uptodate(cur, gen, 0); - else - uptodate = 0; - if (!cur || !uptodate) { - if (!cur) { - cur = read_tree_block(root, blocknr, - blocksize, gen); - if (!cur || !extent_buffer_uptodate(cur)) { - free_extent_buffer(cur); - return -EIO; - } - } else if (!uptodate) { - err = btrfs_read_buffer(cur, gen); - if (err) { - free_extent_buffer(cur); - return err; - } - } - } - if (search_start == 0) - search_start = last_block; - - btrfs_tree_lock(cur); - btrfs_set_lock_blocking(cur); - err = __btrfs_cow_block(trans, root, cur, parent, i, - &cur, search_start, - min(16 * blocksize, - (end_slot - i) * blocksize)); - if (err) { - btrfs_tree_unlock(cur); - free_extent_buffer(cur); - break; - } - search_start = cur->start; - last_block = cur->start; - *last_ret = search_start; - btrfs_tree_unlock(cur); - free_extent_buffer(cur); - } - return err; -} - -/* - * The leaf data grows from end-to-front in the node. - * this returns the address of the start of the last item, - * which is the stop of the leaf data stack - */ -static inline unsigned int leaf_data_end(struct btrfs_root *root, - struct extent_buffer *leaf) -{ - u32 nr = btrfs_header_nritems(leaf); - if (nr == 0) - return BTRFS_LEAF_DATA_SIZE(root); - return btrfs_item_offset_nr(leaf, nr - 1); -} - - -/* - * search for key in the extent_buffer. The items start at offset p, - * and they are item_size apart. There are 'max' items in p. + * Search for a key in the given extent_buffer. + * + * The lower boundary for the search is specified by the slot number @first_slot. + * Use a value of 0 to search over the whole extent buffer. Works for both + * leaves and nodes. * - * the slot in the array is returned via slot, and it points to - * the place where you would insert key if it is not found in - * the array. + * The slot in the extent buffer is returned via @slot. If the key exists in the + * extent buffer, then @slot will point to the slot where the key is, otherwise + * it points to the slot where you would insert the key. * - * slot may point to max if the key is bigger than all of the keys + * Slot may point to the total number of items (i.e. one position beyond the last + * key) if the key is bigger than the last key in the extent buffer. */ -static noinline int generic_bin_search(struct extent_buffer *eb, - unsigned long p, - int item_size, struct btrfs_key *key, - int max, int *slot) +int btrfs_bin_search(const struct extent_buffer *eb, int first_slot, + const struct btrfs_key *key, int *slot) { - int low = 0; - int high = max; - int mid; + unsigned long p; + int item_size; + /* + * Use unsigned types for the low and high slots, so that we get a more + * efficient division in the search loop below. 
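The search loop completed below is a textbook binary search over fixed-size records, with unsigned slot indices so the midpoint division compiles to a shift. A free-standing version of the same loop (illustrative; the kernel code additionally handles keys that straddle folio boundaries):

#include <stdint.h>
#include <stddef.h>

/* Returns 0 with *slot = match, or 1 with *slot = insertion position. */
static int bin_search_records(const void *base, uint32_t nr, size_t rec_size,
                              const void *key,
                              int (*cmp)(const void *rec, const void *key),
                              int *slot)
{
        uint32_t low = 0;
        uint32_t high = nr;     /* unsigned, so /2 compiles to a plain shift */

        while (low < high) {
                uint32_t mid = (low + high) / 2;
                const uint8_t *rec = (const uint8_t *)base + mid * rec_size;
                int ret = cmp(rec, key);

                if (ret < 0)
                        low = mid + 1;          /* key is in the upper half */
                else if (ret > 0)
                        high = mid;             /* key is in the lower half */
                else {
                        *slot = mid;
                        return 0;
                }
        }
        *slot = low;
        return 1;
}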
+ */ + u32 low = first_slot; + u32 high = btrfs_header_nritems(eb); int ret; - struct btrfs_disk_key *tmp = NULL; - struct btrfs_disk_key unaligned; - unsigned long offset; - char *kaddr = NULL; - unsigned long map_start = 0; - unsigned long map_len = 0; - int err; + const int key_size = sizeof(struct btrfs_disk_key); + + if (unlikely(low > high)) { + btrfs_err(eb->fs_info, + "%s: low (%u) > high (%u) eb %llu owner %llu level %d", + __func__, low, high, eb->start, + btrfs_header_owner(eb), btrfs_header_level(eb)); + return -EINVAL; + } + + if (btrfs_header_level(eb) == 0) { + p = offsetof(struct btrfs_leaf, items); + item_size = sizeof(struct btrfs_item); + } else { + p = offsetof(struct btrfs_node, ptrs); + item_size = sizeof(struct btrfs_key_ptr); + } while (low < high) { + const int unit_size = eb->folio_size; + unsigned long oil; + unsigned long offset; + struct btrfs_disk_key *tmp; + struct btrfs_disk_key unaligned; + int mid; + mid = (low + high) / 2; offset = p + mid * item_size; + oil = get_eb_offset_in_folio(eb, offset); - if (!kaddr || offset < map_start || - (offset + sizeof(struct btrfs_disk_key)) > - map_start + map_len) { - - err = map_private_extent_buffer(eb, offset, - sizeof(struct btrfs_disk_key), - &kaddr, &map_start, &map_len); - - if (!err) { - tmp = (struct btrfs_disk_key *)(kaddr + offset - - map_start); - } else { - read_extent_buffer(eb, &unaligned, - offset, sizeof(unaligned)); - tmp = &unaligned; - } + if (oil + key_size <= unit_size) { + const unsigned long idx = get_eb_folio_index(eb, offset); + char *kaddr = folio_address(eb->folios[idx]); + oil = get_eb_offset_in_folio(eb, offset); + tmp = (struct btrfs_disk_key *)(kaddr + oil); } else { - tmp = (struct btrfs_disk_key *)(kaddr + offset - - map_start); + read_extent_buffer(eb, &unaligned, offset, key_size); + tmp = &unaligned; } - ret = comp_keys(tmp, key); + + ret = btrfs_comp_keys(tmp, key); if (ret < 0) low = mid + 1; @@ -1685,78 +812,125 @@ static noinline int generic_bin_search(struct extent_buffer *eb, return 1; } -/* - * simple bin_search frontend that does the right thing for - * leaves vs nodes - */ -static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, - int level, int *slot) -{ - if (level == 0) - return generic_bin_search(eb, - offsetof(struct btrfs_leaf, items), - sizeof(struct btrfs_item), - key, btrfs_header_nritems(eb), - slot); - else - return generic_bin_search(eb, - offsetof(struct btrfs_node, ptrs), - sizeof(struct btrfs_key_ptr), - key, btrfs_header_nritems(eb), - slot); -} - -int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, - int level, int *slot) -{ - return bin_search(eb, key, level, slot); -} - -static void root_add_used(struct btrfs_root *root, u32 size) +static void root_add_used_bytes(struct btrfs_root *root) { spin_lock(&root->accounting_lock); btrfs_set_root_used(&root->root_item, - btrfs_root_used(&root->root_item) + size); + btrfs_root_used(&root->root_item) + root->fs_info->nodesize); spin_unlock(&root->accounting_lock); } -static void root_sub_used(struct btrfs_root *root, u32 size) +static void root_sub_used_bytes(struct btrfs_root *root) { spin_lock(&root->accounting_lock); btrfs_set_root_used(&root->root_item, - btrfs_root_used(&root->root_item) - size); + btrfs_root_used(&root->root_item) - root->fs_info->nodesize); spin_unlock(&root->accounting_lock); } /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). - * NULL is returned on error. 
*/ -static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, - struct extent_buffer *parent, int slot) +struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, + int slot) { int level = btrfs_header_level(parent); + struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb; - if (slot < 0) - return NULL; - if (slot >= btrfs_header_nritems(parent)) - return NULL; + if (slot < 0 || slot >= btrfs_header_nritems(parent)) + return ERR_PTR(-ENOENT); + + ASSERT(level); - BUG_ON(level == 0); + check.level = level - 1; + check.transid = btrfs_node_ptr_generation(parent, slot); + check.owner_root = btrfs_header_owner(parent); + check.has_first_key = true; + btrfs_node_key_to_cpu(parent, &check.first_key, slot); - eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), - btrfs_level_size(root, level - 1), - btrfs_node_ptr_generation(parent, slot)); - if (eb && !extent_buffer_uptodate(eb)) { + eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot), + &check); + if (IS_ERR(eb)) + return eb; + if (unlikely(!extent_buffer_uptodate(eb))) { free_extent_buffer(eb); - eb = NULL; + return ERR_PTR(-EIO); } return eb; } /* + * Promote a child node to become the new tree root. + * + * @trans: Transaction handle + * @root: Tree root structure to update + * @path: Path holding nodes and locks + * @level: Level of the parent (old root) + * @parent: The parent (old root) with exactly one item + * + * This helper is called during rebalancing when the root node contains only + * a single item (nritems == 1). We can reduce the tree height by promoting + * that child to become the new root and freeing the old root node. The path + * locks and references are updated accordingly. + * + * Return: 0 on success, negative errno on failure. The transaction is aborted + * on critical errors. + */ +static int promote_child_to_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + int level, struct extent_buffer *parent) +{ + struct extent_buffer *child; + int ret; + + ASSERT(btrfs_header_nritems(parent) == 1); + + child = btrfs_read_node_slot(parent, 0); + if (IS_ERR(child)) + return PTR_ERR(child); + + btrfs_tree_lock(child); + ret = btrfs_cow_block(trans, root, child, parent, 0, &child, BTRFS_NESTING_COW); + if (ret) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + return ret; + } + + ret = btrfs_tree_mod_log_insert_root(root->node, child, true); + if (unlikely(ret < 0)) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + btrfs_abort_transaction(trans, ret); + return ret; + } + rcu_assign_pointer(root->node, child); + + add_root_to_dirty_list(root); + btrfs_tree_unlock(child); + + path->locks[level] = 0; + path->nodes[level] = NULL; + btrfs_clear_buffer_dirty(trans, parent); + btrfs_tree_unlock(parent); + /* Once for the path. */ + free_extent_buffer(parent); + + root_sub_used_bytes(root); + ret = btrfs_free_tree_block(trans, btrfs_root_id(root), parent, 0, 1); + /* Once for the root ptr. */ + free_extent_buffer_stale(parent); + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + return 0; +} + +/* * node level balancing, used to make sure nodes are in proper order for * item deletion. We balance from the top down, so we have to make sure * that a deletion won't leave a node completely empty later on. 
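The change of btrfs_read_node_slot() from returning NULL to returning ERR_PTR() values rewrites the contract for every caller: an out-of-range slot now yields -ENOENT and a failed or stale read yields -EIO, and in neither case is there a buffer reference to drop. A minimal sketch of the resulting caller pattern follows; walk_child() is a hypothetical illustration, not a helper in this file:

static int walk_child(struct extent_buffer *parent, int slot)
{
	struct extent_buffer *child;

	/* Returns -ENOENT for a bad slot, -EIO for a failed read. */
	child = btrfs_read_node_slot(parent, slot);
	if (IS_ERR(child))
		return PTR_ERR(child);	/* no reference taken on error */

	/* ... lock and use the child here ... */

	/* Drop the reference btrfs_read_node_slot() took for us. */
	free_extent_buffer(child);
	return 0;
}
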
@@ -1765,6 +939,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *right = NULL; struct extent_buffer *mid; struct extent_buffer *left = NULL; @@ -1775,13 +950,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, int orig_slot = path->slots[level]; u64 orig_ptr; - if (level == 0) - return 0; + ASSERT(level > 0); mid = path->nodes[level]; - WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK && - path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING); + WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK); WARN_ON(btrfs_header_generation(mid) != trans->transid); orig_ptr = btrfs_node_blockptr(mid, orig_slot); @@ -1796,78 +969,55 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, * by promoting the node below to a root */ if (!parent) { - struct extent_buffer *child; - if (btrfs_header_nritems(mid) != 1) return 0; - /* promote the child to a root */ - child = read_node_slot(root, mid, 0); - if (!child) { - ret = -EROFS; - btrfs_std_error(root->fs_info, ret); - goto enospc; - } - - btrfs_tree_lock(child); - btrfs_set_lock_blocking(child); - ret = btrfs_cow_block(trans, root, child, mid, 0, &child); - if (ret) { - btrfs_tree_unlock(child); - free_extent_buffer(child); - goto enospc; - } - - tree_mod_log_set_root_pointer(root, child, 1); - rcu_assign_pointer(root->node, child); - - add_root_to_dirty_list(root); - btrfs_tree_unlock(child); - - path->locks[level] = 0; - path->nodes[level] = NULL; - clean_tree_block(trans, root, mid); - btrfs_tree_unlock(mid); - /* once for the path */ - free_extent_buffer(mid); - - root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1); - /* once for the root ptr */ - free_extent_buffer_stale(mid); - return 0; + return promote_child_to_root(trans, root, path, level, mid); } if (btrfs_header_nritems(mid) > - BTRFS_NODEPTRS_PER_BLOCK(root) / 4) + BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4) return 0; - left = read_node_slot(root, parent, pslot - 1); - if (left) { - btrfs_tree_lock(left); - btrfs_set_lock_blocking(left); + if (pslot) { + left = btrfs_read_node_slot(parent, pslot - 1); + if (IS_ERR(left)) { + ret = PTR_ERR(left); + left = NULL; + goto out; + } + + btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT); wret = btrfs_cow_block(trans, root, left, - parent, pslot - 1, &left); + parent, pslot - 1, &left, + BTRFS_NESTING_LEFT_COW); if (wret) { ret = wret; - goto enospc; + goto out; } } - right = read_node_slot(root, parent, pslot + 1); - if (right) { - btrfs_tree_lock(right); - btrfs_set_lock_blocking(right); + + if (pslot + 1 < btrfs_header_nritems(parent)) { + right = btrfs_read_node_slot(parent, pslot + 1); + if (IS_ERR(right)) { + ret = PTR_ERR(right); + right = NULL; + goto out; + } + + btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT); wret = btrfs_cow_block(trans, root, right, - parent, pslot + 1, &right); + parent, pslot + 1, &right, + BTRFS_NESTING_RIGHT_COW); if (wret) { ret = wret; - goto enospc; + goto out; } } /* first, try to make some room in the middle buffer */ if (left) { orig_slot += btrfs_header_nritems(left); - wret = push_node_left(trans, root, left, mid, 1); + wret = push_node_left(trans, left, mid, 1); if (wret < 0) ret = wret; } @@ -1876,24 +1026,38 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, * then try to empty the right most buffer into the middle */ if (right) { - wret = push_node_left(trans, root, mid, right, 
1); + wret = push_node_left(trans, mid, right, 1); if (wret < 0 && wret != -ENOSPC) ret = wret; if (btrfs_header_nritems(right) == 0) { - clean_tree_block(trans, root, right); + btrfs_clear_buffer_dirty(trans, right); btrfs_tree_unlock(right); - del_ptr(root, path, level + 1, pslot + 1); - root_sub_used(root, right->len); - btrfs_free_tree_block(trans, root, right, 0, 1); + ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1); + if (ret < 0) { + free_extent_buffer_stale(right); + right = NULL; + goto out; + } + root_sub_used_bytes(root); + ret = btrfs_free_tree_block(trans, btrfs_root_id(root), + right, 0, 1); free_extent_buffer_stale(right); right = NULL; + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + goto out; + } } else { struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); - tree_mod_log_set_node_key(root->fs_info, parent, - pslot + 1, 0); + ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, + BTRFS_MOD_LOG_KEY_REPLACE); + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + goto out; + } btrfs_set_node_key(parent, &right_key, pslot + 1); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); } } if (btrfs_header_nritems(mid) == 1) { @@ -1906,49 +1070,67 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, * otherwise we would have pulled some pointers from the * right */ - if (!left) { - ret = -EROFS; - btrfs_std_error(root->fs_info, ret); - goto enospc; + if (unlikely(!left)) { + btrfs_crit(fs_info, +"missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu", + parent->start, btrfs_header_level(parent), + mid->start, btrfs_root_id(root)); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + goto out; } - wret = balance_node_right(trans, root, mid, left); + wret = balance_node_right(trans, mid, left); if (wret < 0) { ret = wret; - goto enospc; + goto out; } if (wret == 1) { - wret = push_node_left(trans, root, left, mid, 1); + wret = push_node_left(trans, left, mid, 1); if (wret < 0) ret = wret; } BUG_ON(wret == 1); } if (btrfs_header_nritems(mid) == 0) { - clean_tree_block(trans, root, mid); + btrfs_clear_buffer_dirty(trans, mid); btrfs_tree_unlock(mid); - del_ptr(root, path, level + 1, pslot); - root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1); + ret = btrfs_del_ptr(trans, root, path, level + 1, pslot); + if (ret < 0) { + free_extent_buffer_stale(mid); + mid = NULL; + goto out; + } + root_sub_used_bytes(root); + ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); free_extent_buffer_stale(mid); mid = NULL; + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + goto out; + } } else { /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); - tree_mod_log_set_node_key(root->fs_info, parent, - pslot, 0); + ret = btrfs_tree_mod_log_insert_key(parent, pslot, + BTRFS_MOD_LOG_KEY_REPLACE); + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + goto out; + } btrfs_set_node_key(parent, &mid_key, pslot); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); } /* update the path */ if (left) { if (btrfs_header_nritems(left) > orig_slot) { - extent_buffer_get(left); /* left was locked after cow */ path->nodes[level] = left; path->slots[level + 1] -= 1; path->slots[level] = orig_slot; + /* Left is now owned by path. 
*/ + left = NULL; if (mid) { btrfs_tree_unlock(mid); free_extent_buffer(mid); @@ -1962,14 +1144,13 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (orig_ptr != btrfs_node_blockptr(path->nodes[level], path->slots[level])) BUG(); -enospc: +out: if (right) { btrfs_tree_unlock(right); free_extent_buffer(right); } if (left) { - if (path->nodes[level] != left) - btrfs_tree_unlock(left); + btrfs_tree_unlock(left); free_extent_buffer(left); } return ret; @@ -1983,6 +1164,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *right = NULL; struct extent_buffer *mid; struct extent_buffer *left = NULL; @@ -2006,26 +1188,27 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (!parent) return 1; - left = read_node_slot(root, parent, pslot - 1); - /* first, try to make some room in the middle buffer */ - if (left) { + if (pslot) { u32 left_nr; - btrfs_tree_lock(left); - btrfs_set_lock_blocking(left); + left = btrfs_read_node_slot(parent, pslot - 1); + if (IS_ERR(left)) + return PTR_ERR(left); + + btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT); left_nr = btrfs_header_nritems(left); - if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { wret = 1; } else { ret = btrfs_cow_block(trans, root, left, parent, - pslot - 1, &left); + pslot - 1, &left, + BTRFS_NESTING_LEFT_COW); if (ret) wret = 1; else { - wret = push_node_left(trans, root, - left, mid, 0); + wret = push_node_left(trans, left, mid, 0); } } if (wret < 0) @@ -2034,10 +1217,16 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); - tree_mod_log_set_node_key(root->fs_info, parent, - pslot, 0); + ret = btrfs_tree_mod_log_insert_key(parent, pslot, + BTRFS_MOD_LOG_KEY_REPLACE); + if (unlikely(ret < 0)) { + btrfs_tree_unlock(left); + free_extent_buffer(left); + btrfs_abort_transaction(trans, ret); + return ret; + } btrfs_set_node_key(parent, &disk_key, pslot); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); if (btrfs_header_nritems(left) > orig_slot) { path->nodes[level] = left; path->slots[level + 1] -= 1; @@ -2056,29 +1245,30 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_tree_unlock(left); free_extent_buffer(left); } - right = read_node_slot(root, parent, pslot + 1); /* * then try to empty the right most buffer into the middle */ - if (right) { + if (pslot + 1 < btrfs_header_nritems(parent)) { u32 right_nr; - btrfs_tree_lock(right); - btrfs_set_lock_blocking(right); + right = btrfs_read_node_slot(parent, pslot + 1); + if (IS_ERR(right)) + return PTR_ERR(right); + + btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT); right_nr = btrfs_header_nritems(right); - if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { wret = 1; } else { ret = btrfs_cow_block(trans, root, right, parent, pslot + 1, - &right); + &right, BTRFS_NESTING_RIGHT_COW); if (ret) wret = 1; else { - wret = balance_node_right(trans, root, - right, mid); + wret = balance_node_right(trans, right, mid); } } if (wret < 0) @@ -2087,10 +1277,16 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; btrfs_node_key(right, &disk_key, 0); 
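Every node key update in the hunks above and below follows the same ordering discipline: record the replacement in the tree mod log first, overwrite the key second, mark the buffer dirty last, and abort the transaction if logging fails, since tree mod log readers could otherwise rewind through an unlogged change. A condensed sketch of that pattern, using a hypothetical replace_node_key() wrapper that does not exist in this file:

static int replace_node_key(struct btrfs_trans_handle *trans,
			    struct extent_buffer *eb, int slot,
			    const struct btrfs_disk_key *new_key)
{
	int ret;

	/* Log the key replacement before touching the node. */
	ret = btrfs_tree_mod_log_insert_key(eb, slot, BTRFS_MOD_LOG_KEY_REPLACE);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	btrfs_set_node_key(eb, new_key, slot);
	btrfs_mark_buffer_dirty(trans, eb);
	return 0;
}
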
- tree_mod_log_set_node_key(root->fs_info, parent, - pslot + 1, 0); + ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, + BTRFS_MOD_LOG_KEY_REPLACE); + if (unlikely(ret < 0)) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + btrfs_abort_transaction(trans, ret); + return ret; + } btrfs_set_node_key(parent, &disk_key, pslot + 1); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); if (btrfs_header_nritems(mid) <= orig_slot) { path->nodes[level] = right; @@ -2115,8 +1311,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, * readahead one full node of leaves, finding things that are close * to the block in 'slot', and triggering ra on them. */ -static void reada_for_search(struct btrfs_root *root, - struct btrfs_path *path, +static void reada_for_search(struct btrfs_fs_info *fs_info, + const struct btrfs_path *path, int level, int slot, u64 objectid) { struct extent_buffer *node; @@ -2125,14 +1321,12 @@ static void reada_for_search(struct btrfs_root *root, u64 search; u64 target; u64 nread = 0; - u64 gen; - int direction = path->reada; - struct extent_buffer *eb; + u64 nread_max; u32 nr; u32 blocksize; u32 nscan = 0; - if (level != 1) + if (level != 1 && path->reada != READA_FORWARD_ALWAYS) return; if (!path->nodes[level]) @@ -2140,12 +1334,30 @@ static void reada_for_search(struct btrfs_root *root, node = path->nodes[level]; + /* + * Since the time between visiting leaves is much shorter than the time + * between visiting nodes, limit read ahead of nodes to 1, to avoid too + * much IO at once (possibly random). + */ + if (path->reada == READA_FORWARD_ALWAYS) { + if (level > 1) + nread_max = node->fs_info->nodesize; + else + nread_max = SZ_128K; + } else { + nread_max = SZ_64K; + } + search = btrfs_node_blockptr(node, slot); - blocksize = btrfs_level_size(root, level - 1); - eb = btrfs_find_tree_block(root, search, blocksize); - if (eb) { - free_extent_buffer(eb); - return; + blocksize = fs_info->nodesize; + if (path->reada != READA_FORWARD_ALWAYS) { + struct extent_buffer *eb; + + eb = find_extent_buffer(fs_info, search); + if (eb) { + free_extent_buffer(eb); + return; + } } target = search; @@ -2154,44 +1366,39 @@ static void reada_for_search(struct btrfs_root *root, nr = slot; while (1) { - if (direction < 0) { + if (path->reada == READA_BACK) { if (nr == 0) break; nr--; - } else if (direction > 0) { + } else if (path->reada == READA_FORWARD || + path->reada == READA_FORWARD_ALWAYS) { nr++; if (nr >= nritems) break; } - if (path->reada < 0 && objectid) { + if (path->reada == READA_BACK && objectid) { btrfs_node_key(node, &disk_key, nr); if (btrfs_disk_key_objectid(&disk_key) != objectid) break; } search = btrfs_node_blockptr(node, nr); - if ((search <= target && target - search <= 65536) || + if (path->reada == READA_FORWARD_ALWAYS || + (search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { - gen = btrfs_node_ptr_generation(node, nr); - readahead_tree_block(root, search, blocksize, gen); + btrfs_readahead_node_child(node, nr); nread += blocksize; } nscan++; - if ((nread > 65536 || nscan > 32)) + if (nread > nread_max || nscan > 32) break; } } -static noinline void reada_for_balance(struct btrfs_root *root, - struct btrfs_path *path, int level) +static noinline void reada_for_balance(const struct btrfs_path *path, int level) { + struct extent_buffer *parent; int slot; int nritems; - struct extent_buffer *parent; - struct extent_buffer *eb; - u64 gen; - u64 block1 = 0; - u64 
block2 = 0; - int blocksize; parent = path->nodes[level + 1]; if (!parent) @@ -2199,34 +1406,11 @@ static noinline void reada_for_balance(struct btrfs_root *root, nritems = btrfs_header_nritems(parent); slot = path->slots[level + 1]; - blocksize = btrfs_level_size(root, level); - - if (slot > 0) { - block1 = btrfs_node_blockptr(parent, slot - 1); - gen = btrfs_node_ptr_generation(parent, slot - 1); - eb = btrfs_find_tree_block(root, block1, blocksize); - /* - * if we get -eagain from btrfs_buffer_uptodate, we - * don't want to return eagain here. That will loop - * forever - */ - if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) - block1 = 0; - free_extent_buffer(eb); - } - if (slot + 1 < nritems) { - block2 = btrfs_node_blockptr(parent, slot + 1); - gen = btrfs_node_ptr_generation(parent, slot + 1); - eb = btrfs_find_tree_block(root, block2, blocksize); - if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) - block2 = 0; - free_extent_buffer(eb); - } - if (block1) - readahead_tree_block(root, block1, blocksize, 0); - if (block2) - readahead_tree_block(root, block2, blocksize, 0); + if (slot > 0) + btrfs_readahead_node_child(parent, slot - 1); + if (slot + 1 < nritems) + btrfs_readahead_node_child(parent, slot + 1); } @@ -2249,33 +1433,34 @@ static noinline void unlock_up(struct btrfs_path *path, int level, { int i; int skip_level = level; - int no_skips = 0; - struct extent_buffer *t; + bool check_skip = true; for (i = level; i < BTRFS_MAX_LEVEL; i++) { if (!path->nodes[i]) break; if (!path->locks[i]) break; - if (!no_skips && path->slots[i] == 0) { - skip_level = i + 1; - continue; - } - if (!no_skips && path->keep_locks) { - u32 nritems; - t = path->nodes[i]; - nritems = btrfs_header_nritems(t); - if (nritems < 1 || path->slots[i] >= nritems - 1) { + + if (check_skip) { + if (path->slots[i] == 0) { skip_level = i + 1; continue; } + + if (path->keep_locks) { + u32 nritems; + + nritems = btrfs_header_nritems(path->nodes[i]); + if (nritems < 1 || path->slots[i] >= nritems - 1) { + skip_level = i + 1; + continue; + } + } } - if (skip_level < i && i >= lowest_unlock) - no_skips = 1; - t = path->nodes[i]; - if (i >= lowest_unlock && i > skip_level && path->locks[i]) { - btrfs_tree_unlock_rw(t, path->locks[i]); + if (i >= lowest_unlock && i > skip_level) { + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); + check_skip = false; path->locks[i] = 0; if (write_lock_level && i > min_write_lock_level && @@ -2287,112 +1472,160 @@ static noinline void unlock_up(struct btrfs_path *path, int level, } /* - * This releases any locks held in the path starting at level and - * going all the way up to the root. - * - * btrfs_search_slot will keep the lock held on higher nodes in a few - * corner cases, such as COW of the block at slot zero in the node. This - * ignores those rules, and it should only be called when there are no - * more updates to be done higher up in the tree. - */ -noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) -{ - int i; - - if (path->keep_locks) - return; - - for (i = level; i < BTRFS_MAX_LEVEL; i++) { - if (!path->nodes[i]) - continue; - if (!path->locks[i]) - continue; - btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); - path->locks[i] = 0; - } -} - -/* - * helper function for btrfs_search_slot. The goal is to find a block - * in cache without setting the path to blocking. If we find the block - * we return zero and the path is unchanged. + * Helper function for btrfs_search_slot() and other functions that do a search + * on a btree. 
The goal is to find a tree block in the cache (the radix tree at + * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read + * its pages from disk. * - * If we can't find the block, we set the path blocking and do some - * reada. -EAGAIN is returned and the search must be repeated. + * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the + * whole btree search, starting again from the current root node. */ static int -read_block_for_search(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *p, - struct extent_buffer **eb_ret, int level, int slot, - struct btrfs_key *key, u64 time_seq) +read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer **eb_ret, int slot, + const struct btrfs_key *key) { + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_tree_parent_check check = { 0 }; u64 blocknr; - u64 gen; - u32 blocksize; - struct extent_buffer *b = *eb_ret; - struct extent_buffer *tmp; - int ret; - - blocknr = btrfs_node_blockptr(b, slot); - gen = btrfs_node_ptr_generation(b, slot); - blocksize = btrfs_level_size(root, level - 1); + struct extent_buffer *tmp = NULL; + int ret = 0; + int ret2; + int parent_level; + bool read_tmp = false; + bool tmp_locked = false; + bool path_released = false; + + blocknr = btrfs_node_blockptr(*eb_ret, slot); + parent_level = btrfs_header_level(*eb_ret); + btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot); + check.has_first_key = true; + check.level = parent_level - 1; + check.transid = btrfs_node_ptr_generation(*eb_ret, slot); + check.owner_root = btrfs_root_id(root); - tmp = btrfs_find_tree_block(root, blocknr, blocksize); + /* + * If we need to read an extent buffer from disk and we are holding locks + * on upper level nodes, we unlock all the upper nodes before reading the + * extent buffer, and then return -EAGAIN to the caller as it needs to + * restart the search. We don't release the lock on the current level + * because we need to walk this node to figure out which blocks to read. + */ + tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { + if (p->reada == READA_FORWARD_ALWAYS) + reada_for_search(fs_info, p, parent_level, slot, key->objectid); + /* first we do an atomic uptodate check */ - if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { + if (btrfs_buffer_uptodate(tmp, check.transid, true) > 0) { + /* + * Do extra check for first_key, eb can be stale due to + * being cached, read from scrub, or have multiple + * parents (shared tree blocks). + */ + if (unlikely(btrfs_verify_level_key(tmp, &check))) { + ret = -EUCLEAN; + goto out; + } *eb_ret = tmp; - return 0; + tmp = NULL; + ret = 0; + goto out; } - /* the pages were up to date, but we failed - * the generation number check. Do a full - * read for the generation number that is correct. - * We must do this without dropping locks so - * we can trust our generation number - */ - btrfs_set_path_blocking(p); + if (p->nowait) { + ret = -EAGAIN; + goto out; + } + + if (!p->skip_locking) { + btrfs_unlock_up_safe(p, parent_level + 1); + btrfs_maybe_reset_lockdep_class(root, tmp); + tmp_locked = true; + btrfs_tree_read_lock(tmp); + btrfs_release_path(p); + ret = -EAGAIN; + path_released = true; + } + + /* Now we're allowed to do a blocking uptodate check. 
*/ + ret2 = btrfs_read_extent_buffer(tmp, &check); + if (ret2) { + ret = ret2; + goto out; + } - /* now we're allowed to do a blocking uptodate check */ - ret = btrfs_read_buffer(tmp, gen); - if (!ret) { + if (ret == 0) { + ASSERT(!tmp_locked); *eb_ret = tmp; - return 0; + tmp = NULL; } - free_extent_buffer(tmp); + goto out; + } else if (p->nowait) { + ret = -EAGAIN; + goto out; + } + + if (!p->skip_locking) { + btrfs_unlock_up_safe(p, parent_level + 1); + ret = -EAGAIN; + } + + if (p->reada != READA_NONE) + reada_for_search(fs_info, p, parent_level, slot, key->objectid); + + tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level); + if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + tmp = NULL; + goto out; + } + read_tmp = true; + + if (!p->skip_locking) { + ASSERT(ret == -EAGAIN); + btrfs_maybe_reset_lockdep_class(root, tmp); + tmp_locked = true; + btrfs_tree_read_lock(tmp); btrfs_release_path(p); - return -EIO; + path_released = true; + } + + /* Now we're allowed to do a blocking uptodate check. */ + ret2 = btrfs_read_extent_buffer(tmp, &check); + if (ret2) { + ret = ret2; + goto out; } /* - * reduce lock contention at high levels - * of the btree by dropping locks before - * we read. Don't release the lock on the current - * level because we need to walk this node to figure - * out which blocks to read. + * If the read above didn't mark this buffer up to date, + * it will never end up being up to date. Set ret to EIO now + * and give up so that our caller doesn't loop forever + * on our EAGAINs. */ - btrfs_unlock_up_safe(p, level + 1); - btrfs_set_path_blocking(p); - - free_extent_buffer(tmp); - if (p->reada) - reada_for_search(root, p, level, slot, key->objectid); - - btrfs_release_path(p); + if (unlikely(!extent_buffer_uptodate(tmp))) { + ret = -EIO; + goto out; + } - ret = -EAGAIN; - tmp = read_tree_block(root, blocknr, blocksize, 0); + if (ret == 0) { + ASSERT(!tmp_locked); + *eb_ret = tmp; + tmp = NULL; + } +out: if (tmp) { - /* - * If the read above didn't mark this buffer up to date, - * it will never end up being up to date. Set ret to EIO now - * and give up so that our caller doesn't loop forever - * on our EAGAINs. 
- */ - if (!btrfs_buffer_uptodate(tmp, 0, 0)) - ret = -EIO; - free_extent_buffer(tmp); + if (tmp_locked) + btrfs_tree_read_unlock(tmp); + if (read_tmp && ret && ret != -EAGAIN) + free_extent_buffer_stale(tmp); + else + free_extent_buffer(tmp); } + if (ret && !path_released) + btrfs_release_path(p); + return ret; } @@ -2411,94 +1644,412 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, struct extent_buffer *b, int level, int ins_len, int *write_lock_level) { - int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret = 0; + if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= - BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { - int sret; + BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) { if (*write_lock_level < level + 1) { *write_lock_level = level + 1; btrfs_release_path(p); - goto again; + return -EAGAIN; } - btrfs_set_path_blocking(p); - reada_for_balance(root, p, level); - sret = split_node(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL, 0); + reada_for_balance(p, level); + ret = split_node(trans, root, p, level); - BUG_ON(sret > 0); - if (sret) { - ret = sret; - goto done; - } b = p->nodes[level]; } else if (ins_len < 0 && btrfs_header_nritems(b) < - BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { - int sret; + BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) { if (*write_lock_level < level + 1) { *write_lock_level = level + 1; btrfs_release_path(p); - goto again; + return -EAGAIN; } - btrfs_set_path_blocking(p); - reada_for_balance(root, p, level); - sret = balance_level(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL, 0); + reada_for_balance(p, level); + ret = balance_level(trans, root, p, level); + if (ret) + return ret; - if (sret) { - ret = sret; - goto done; - } b = p->nodes[level]; if (!b) { btrfs_release_path(p); - goto again; + return -EAGAIN; } BUG_ON(btrfs_header_nritems(b) == 1); } + return ret; +} + +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, + u64 iobjectid, u64 ioff, u8 key_type, + struct btrfs_key *found_key) +{ + int ret; + struct btrfs_key key; + struct extent_buffer *eb; + + ASSERT(path); + ASSERT(found_key); + + key.type = key_type; + key.objectid = iobjectid; + key.offset = ioff; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + eb = path->nodes[0]; + if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_root, path); + if (ret) + return ret; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); + if (found_key->type != key.type || + found_key->objectid != key.objectid) + return 1; + return 0; +} + +static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, + struct btrfs_path *p, + int write_lock_level) +{ + struct extent_buffer *b; + int root_lock = 0; + int level = 0; + + if (p->search_commit_root) { + b = root->commit_root; + refcount_inc(&b->refs); + level = btrfs_header_level(b); + /* + * Ensure that all callers have set skip_locking when + * p->search_commit_root is true. + */ + ASSERT(p->skip_locking); + + goto out; + } + + if (p->skip_locking) { + b = btrfs_root_node(root); + level = btrfs_header_level(b); + goto out; + } + + /* We try very hard to do read locks on the root */ + root_lock = BTRFS_READ_LOCK; + + /* + * If the level is set to maximum, we can skip trying to get the read + * lock. 
+ */ + if (write_lock_level < BTRFS_MAX_LEVEL) { + /* + * We don't know the level of the root node until we actually + * have it read locked + */ + if (p->nowait) { + b = btrfs_try_read_lock_root_node(root); + if (IS_ERR(b)) + return b; + } else { + b = btrfs_read_lock_root_node(root); + } + level = btrfs_header_level(b); + if (level > write_lock_level) + goto out; + + /* Whoops, must trade for write lock */ + btrfs_tree_read_unlock(b); + free_extent_buffer(b); + } + + b = btrfs_lock_root_node(root); + root_lock = BTRFS_WRITE_LOCK; + + /* The level might have changed, check again */ + level = btrfs_header_level(b); + +out: + /* + * The root may have failed to write out at some point, and thus is no + * longer valid, return an error in this case. + */ + if (unlikely(!extent_buffer_uptodate(b))) { + if (root_lock) + btrfs_tree_unlock_rw(b, root_lock); + free_extent_buffer(b); + return ERR_PTR(-EIO); + } + + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = root_lock; + /* + * Callers are responsible for dropping b's references. + */ + return b; +} + +/* + * Replace the extent buffer at the lowest level of the path with a cloned + * version. The purpose is to be able to use it safely, after releasing the + * commit root semaphore, even if relocation is happening in parallel, the + * transaction used for relocation is committed and the extent buffer is + * reallocated in the next transaction. + * + * This is used in a context where the caller does not prevent transaction + * commits from happening, either by holding a transaction handle or holding + * some lock, while it's doing searches through a commit root. + * At the moment it's only used for send operations. + */ +static int finish_need_commit_sem_search(struct btrfs_path *path) +{ + const int i = path->lowest_level; + const int slot = path->slots[i]; + struct extent_buffer *lowest = path->nodes[i]; + struct extent_buffer *clone; + + ASSERT(path->need_commit_sem); + + if (!lowest) + return 0; + + lockdep_assert_held_read(&lowest->fs_info->commit_root_sem); + + clone = btrfs_clone_extent_buffer(lowest); + if (!clone) + return -ENOMEM; + + btrfs_release_path(path); + path->nodes[i] = clone; + path->slots[i] = slot; + + return 0; +} + +static inline int search_for_key_slot(const struct extent_buffer *eb, + int search_low_slot, + const struct btrfs_key *key, + int prev_cmp, + int *slot) +{ + /* + * If a previous call to btrfs_bin_search() on a parent node returned an + * exact match (prev_cmp == 0), we can safely assume the target key will + * always be at slot 0 on lower levels, since each key pointer + * (struct btrfs_key_ptr) refers to the lowest key accessible from the + * subtree it points to. Thus we can skip searching lower levels. + */ + if (prev_cmp == 0) { + *slot = 0; + return 0; + } + + return btrfs_bin_search(eb, search_low_slot, key, slot); +} + +static int search_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const struct btrfs_key *key, + struct btrfs_path *path, + int ins_len, + int prev_cmp) +{ + struct extent_buffer *leaf = path->nodes[0]; + int leaf_free_space = -1; + int search_low_slot = 0; + int ret; + bool do_bin_search = true; + + /* + * If we are doing an insertion, the leaf has enough free space and the + * destination slot for the key is not slot 0, then we can unlock our + * write lock on the parent, and any other upper nodes, before doing the + * binary search on the leaf (with search_for_key_slot()), allowing other + * tasks to lock the parent and any other upper nodes. 
+ */ + if (ins_len > 0) { + /* + * Cache the leaf free space, since we will need it later and it + * will not change until then. + */ + leaf_free_space = btrfs_leaf_free_space(leaf); + + /* + * !path->locks[1] means we have a single node tree, the leaf is + * the root of the tree. + */ + if (path->locks[1] && leaf_free_space >= ins_len) { + struct btrfs_disk_key first_key; + + ASSERT(btrfs_header_nritems(leaf) > 0); + btrfs_item_key(leaf, &first_key, 0); + + /* + * Doing the extra comparison with the first key is cheap, + * taking into account that the first key is very likely + * already in a cache line because it immediately follows + * the extent buffer's header and we have recently accessed + * the header's level field. + */ + ret = btrfs_comp_keys(&first_key, key); + if (ret < 0) { + /* + * The first key is smaller than the key we want + * to insert, so we are safe to unlock all upper + * nodes and we have to do the binary search. + * + * We do use btrfs_unlock_up_safe() and not + * unlock_up() because the latter does not unlock + * nodes with a slot of 0 - we can safely unlock + * any node even if its slot is 0 since in this + * case the key does not end up at slot 0 of the + * leaf and there's no need to split the leaf. + */ + btrfs_unlock_up_safe(path, 1); + search_low_slot = 1; + } else { + /* + * The first key is >= the key we want to + * insert, so we can skip the binary search as + * the target key will be at slot 0. + * + * We cannot unlock upper nodes when the key is + * less than the first key, because we will need + * to update the key at slot 0 of the parent node + * and possibly of other upper nodes too. + * If the key matches the first key, then we can + * unlock all the upper nodes, using + * btrfs_unlock_up_safe() instead of unlock_up() + * as stated above. + */ + if (ret == 0) + btrfs_unlock_up_safe(path, 1); + /* + * ret is already 0 or 1, matching the result of + * a btrfs_bin_search() call, so there is no need + * to adjust it. + */ + do_bin_search = false; + path->slots[0] = 0; + } + } + } + + if (do_bin_search) { + ret = search_for_key_slot(leaf, search_low_slot, key, + prev_cmp, &path->slots[0]); + if (ret < 0) + return ret; + } + + if (ins_len > 0) { + /* + * Item key already exists. In this case, if we are allowed to + * insert the item (for example, in dir_item case, item key + * collision is allowed), it will be merged with the original + * item. Only the item size grows, no new btrfs item will be + * added. If search_for_extension is not set, ins_len already + * accounts for the size of struct btrfs_item; deduct it here + * so the leaf space check will be correct. + */ + if (ret == 0 && !path->search_for_extension) { + ASSERT(ins_len >= sizeof(struct btrfs_item)); + ins_len -= sizeof(struct btrfs_item); + } + + ASSERT(leaf_free_space >= 0); + + if (leaf_free_space < ins_len) { + int ret2; + + ret2 = split_leaf(trans, root, key, path, ins_len, (ret == 0)); + ASSERT(ret2 <= 0); + if (WARN_ON(ret2 > 0)) + ret2 = -EUCLEAN; + if (ret2) + ret = ret2; + } + } -again: - ret = -EAGAIN; -done: return ret; } /* - * look for key in the tree. path is filled in with nodes along the way - * if key is found, we return zero and you can find the item in the leaf - * level of the path (level 0) + * Look for a key in a tree and perform necessary modifications to preserve + * tree invariants. 
+ * + * @trans: Handle of transaction, used when modifying the tree + * @p: Holds all btree nodes along the search path + * @root: The root node of the tree + * @key: The key we are looking for + * @ins_len: Indicates purpose of search: + * >0 for inserts, it's the size of the item inserted (*) + * <0 for deletions + * 0 for plain searches, not modifying the tree + * + * (*) If the size of the item inserted doesn't include + * sizeof(struct btrfs_item), then p->search_for_extension must + * be set. + * @cow: whether CoW operations should be performed. Must always be 1 + * when modifying the tree. * - * If the key isn't found, the path points to the slot where it should - * be inserted, and 1 is returned. If there are other errors during the - * search a negative error number is returned. + * If @ins_len > 0, nodes and leaves will be split as we walk down the tree. + * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible) * - * if ins_len > 0, nodes and leaves will be split as we walk down the - * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if - * possible) + * If @key is found, 0 is returned and you can find the item in the leaf level + * of the path (level 0) + * + * If @key isn't found, 1 is returned and the leaf level of the path (level 0) + * points to the slot where it should be inserted + * + * If an error is encountered while searching the tree, a negative error number + * is returned */ -int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *key, struct btrfs_path *p, int - ins_len, int cow) +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *key, struct btrfs_path *p, + int ins_len, int cow) { + struct btrfs_fs_info *fs_info; struct extent_buffer *b; int slot; int ret; - int err; int level; int lowest_unlock = 1; - int root_lock; /* everything at write_lock_level or lower must be write locked */ int write_lock_level = 0; u8 lowest_level = 0; int min_write_lock_level; + int prev_cmp; + + if (!root) + return -EINVAL; + + fs_info = root->fs_info; + might_sleep(); lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); WARN_ON(p->nodes[0] != NULL); + BUG_ON(!cow && ins_len); + + /* + * For now only allow nowait for read only operations. There's no + * strict reason why we can't, but we only need it for reads, so it's + * only implemented for reads. 
+ */ + ASSERT(!p->nowait || !cow); if (ins_len < 0) { lowest_unlock = 2; @@ -2524,56 +2075,33 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root min_write_lock_level = write_lock_level; -again: - /* - * we try very hard to do read locks on the root - */ - root_lock = BTRFS_READ_LOCK; - level = 0; - if (p->search_commit_root) { - /* - * the commit roots are read only - * so we always do read locks - */ - b = root->commit_root; - extent_buffer_get(b); - level = btrfs_header_level(b); - if (!p->skip_locking) - btrfs_tree_read_lock(b); - } else { - if (p->skip_locking) { - b = btrfs_root_node(root); - level = btrfs_header_level(b); + if (p->need_commit_sem) { + ASSERT(p->search_commit_root); + if (p->nowait) { + if (!down_read_trylock(&fs_info->commit_root_sem)) + return -EAGAIN; } else { - /* we don't know the level of the root node - * until we actually have it read locked - */ - b = btrfs_read_lock_root_node(root); - level = btrfs_header_level(b); - if (level <= write_lock_level) { - /* whoops, must trade for write lock */ - btrfs_tree_read_unlock(b); - free_extent_buffer(b); - b = btrfs_lock_root_node(root); - root_lock = BTRFS_WRITE_LOCK; - - /* the level might have changed, check again */ - level = btrfs_header_level(b); - } + down_read(&fs_info->commit_root_sem); } } - p->nodes[level] = b; - if (!p->skip_locking) - p->locks[level] = root_lock; + +again: + prev_cmp = -1; + b = btrfs_search_slot_get_root(root, p, write_lock_level); + if (IS_ERR(b)) { + ret = PTR_ERR(b); + goto done; + } while (b) { + int dec = 0; + int ret2; + level = btrfs_header_level(b); - /* - * setup the path here so we can release it under lock - * contention with the cow code - */ if (cow) { + bool last_level = (level == (BTRFS_MAX_LEVEL - 1)); + /* * if we don't really need to cow this block * then we don't want to set the path blocking, @@ -2582,8 +2110,6 @@ again: if (!should_cow_block(trans, root, b)) goto cow_done; - btrfs_set_path_blocking(p); - /* * must have write locks on this node and the * parent @@ -2597,19 +2123,21 @@ again: goto again; } - err = btrfs_cow_block(trans, root, b, - p->nodes[level + 1], - p->slots[level + 1], &b); - if (err) { - ret = err; + if (last_level) + ret2 = btrfs_cow_block(trans, root, b, NULL, 0, + &b, BTRFS_NESTING_COW); + else + ret2 = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], &b, + BTRFS_NESTING_COW); + if (ret2) { + ret = ret2; goto done; } } cow_done: - BUG_ON(!cow && ins_len); - p->nodes[level] = b; - btrfs_clear_path_blocking(p, NULL, 0); /* * we have a lock on b and as long as we aren't changing @@ -2617,127 +2145,120 @@ cow_done: * It is safe to drop the lock on our parent before we * go through the expensive btree search on b. * - * If cow is true, then we might be changing slot zero, - * which may require changing the parent. So, we can't - * drop the lock until after we know which slot we're - * operating on. + * If we're inserting or deleting (ins_len != 0), then we might + * be changing slot zero, which may require changing the parent. + * So, we can't drop the lock until after we know which slot + * we're operating on. 
*/ - if (!cow) - btrfs_unlock_up_safe(p, level + 1); - - ret = bin_search(b, key, level, &slot); + if (!ins_len && !p->keep_locks) { + int u = level + 1; - if (level != 0) { - int dec = 0; - if (ret && slot > 0) { - dec = 1; - slot -= 1; - } - p->slots[level] = slot; - err = setup_nodes_for_search(trans, root, p, b, level, - ins_len, &write_lock_level); - if (err == -EAGAIN) - goto again; - if (err) { - ret = err; - goto done; + if (u < BTRFS_MAX_LEVEL && p->locks[u]) { + btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); + p->locks[u] = 0; } - b = p->nodes[level]; - slot = p->slots[level]; + } - /* - * slot 0 is special, if we change the key - * we have to update the parent pointer - * which means we must have a write lock - * on the parent - */ - if (slot == 0 && cow && - write_lock_level < level + 1) { - write_lock_level = level + 1; - btrfs_release_path(p); - goto again; - } + if (level == 0) { + if (ins_len > 0) + ASSERT(write_lock_level >= 1); - unlock_up(p, level, lowest_unlock, - min_write_lock_level, &write_lock_level); + ret = search_leaf(trans, root, key, p, ins_len, prev_cmp); + if (!p->search_for_split) + unlock_up(p, level, lowest_unlock, + min_write_lock_level, NULL); + goto done; + } - if (level == lowest_level) { - if (dec) - p->slots[level]++; - goto done; - } + ret = search_for_key_slot(b, 0, key, prev_cmp, &slot); + if (ret < 0) + goto done; + prev_cmp = ret; - err = read_block_for_search(trans, root, p, - &b, level, slot, key, 0); - if (err == -EAGAIN) - goto again; - if (err) { - ret = err; - goto done; - } + if (ret && slot > 0) { + dec = 1; + slot--; + } + p->slots[level] = slot; + ret2 = setup_nodes_for_search(trans, root, p, b, level, ins_len, + &write_lock_level); + if (ret2 == -EAGAIN) + goto again; + if (ret2) { + ret = ret2; + goto done; + } + b = p->nodes[level]; + slot = p->slots[level]; - if (!p->skip_locking) { - level = btrfs_header_level(b); - if (level <= write_lock_level) { - err = btrfs_try_tree_write_lock(b); - if (!err) { - btrfs_set_path_blocking(p); - btrfs_tree_lock(b); - btrfs_clear_path_blocking(p, b, - BTRFS_WRITE_LOCK); - } - p->locks[level] = BTRFS_WRITE_LOCK; - } else { - err = btrfs_try_tree_read_lock(b); - if (!err) { - btrfs_set_path_blocking(p); - btrfs_tree_read_lock(b); - btrfs_clear_path_blocking(p, b, - BTRFS_READ_LOCK); - } - p->locks[level] = BTRFS_READ_LOCK; - } - p->nodes[level] = b; - } - } else { - p->slots[level] = slot; - if (ins_len > 0 && - btrfs_leaf_free_space(root, b) < ins_len) { - if (write_lock_level < 1) { - write_lock_level = 1; - btrfs_release_path(p); - goto again; - } + /* + * Slot 0 is special, if we change the key we have to update + * the parent pointer which means we must have a write lock on + * the parent + */ + if (slot == 0 && ins_len && write_lock_level < level + 1) { + write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } - btrfs_set_path_blocking(p); - err = split_leaf(trans, root, key, - p, ins_len, ret == 0); - btrfs_clear_path_blocking(p, NULL, 0); + unlock_up(p, level, lowest_unlock, min_write_lock_level, + &write_lock_level); - BUG_ON(err > 0); - if (err) { - ret = err; - goto done; + if (level == lowest_level) { + if (dec) + p->slots[level]++; + goto done; + } + + ret2 = read_block_for_search(root, p, &b, slot, key); + if (ret2 == -EAGAIN && !p->nowait) + goto again; + if (ret2) { + ret = ret2; + goto done; + } + + if (!p->skip_locking) { + level = btrfs_header_level(b); + + btrfs_maybe_reset_lockdep_class(root, b); + + if (level <= write_lock_level) { + btrfs_tree_lock(b); + 
p->locks[level] = BTRFS_WRITE_LOCK; + } else { + if (p->nowait) { + if (!btrfs_try_tree_read_lock(b)) { + free_extent_buffer(b); + ret = -EAGAIN; + goto done; + } + } else { + btrfs_tree_read_lock(b); } + p->locks[level] = BTRFS_READ_LOCK; } - if (!p->search_for_split) - unlock_up(p, level, lowest_unlock, - min_write_lock_level, &write_lock_level); - goto done; + p->nodes[level] = b; } } ret = 1; done: - /* - * we don't really know what they plan on doing with the path - * from here on, so for now just mark it as blocking - */ - if (!p->leave_spinning) - btrfs_set_path_blocking(p); - if (ret < 0) + if (ret < 0 && !p->skip_release_on_error) btrfs_release_path(p); + + if (p->need_commit_sem) { + int ret2; + + ret2 = finish_need_commit_sem_search(p); + up_read(&fs_info->commit_root_sem); + if (ret2) + ret = ret2; + } + return ret; } +ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO); /* * Like btrfs_search_slot, this looks for a key in the given tree. It uses the @@ -2750,19 +2271,20 @@ done: * The resulting path and return value will be set up as if we called * btrfs_search_slot at that point in time with ins_len and cow both set to 0. */ -int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, +int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, u64 time_seq) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *b; int slot; int ret; - int err; int level; int lowest_unlock = 1; u8 lowest_level = 0; lowest_level = p->lowest_level; WARN_ON(p->nodes[0] != NULL); + ASSERT(!p->nowait); if (p->search_commit_root) { BUG_ON(time_seq); @@ -2770,14 +2292,20 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, } again: - b = get_old_root(root, time_seq); + b = btrfs_get_old_root(root, time_seq); + if (unlikely(!b)) { + ret = -EIO; + goto done; + } level = btrfs_header_level(b); p->locks[level] = BTRFS_READ_LOCK; while (b) { + int dec = 0; + int ret2; + level = btrfs_header_level(b); p->nodes[level] = b; - btrfs_clear_path_blocking(p, NULL, 0); /* * we have a lock on b and as long as we aren't changing @@ -2787,53 +2315,49 @@ again: */ btrfs_unlock_up_safe(p, level + 1); - ret = bin_search(b, key, level, &slot); + ret = btrfs_bin_search(b, 0, key, &slot); + if (ret < 0) + goto done; - if (level != 0) { - int dec = 0; - if (ret && slot > 0) { - dec = 1; - slot -= 1; - } + if (level == 0) { p->slots[level] = slot; unlock_up(p, level, lowest_unlock, 0, NULL); + goto done; + } - if (level == lowest_level) { - if (dec) - p->slots[level]++; - goto done; - } + if (ret && slot > 0) { + dec = 1; + slot--; + } + p->slots[level] = slot; + unlock_up(p, level, lowest_unlock, 0, NULL); - err = read_block_for_search(NULL, root, p, &b, level, - slot, key, time_seq); - if (err == -EAGAIN) - goto again; - if (err) { - ret = err; - goto done; - } + if (level == lowest_level) { + if (dec) + p->slots[level]++; + goto done; + } - level = btrfs_header_level(b); - err = btrfs_try_tree_read_lock(b); - if (!err) { - btrfs_set_path_blocking(p); - btrfs_tree_read_lock(b); - btrfs_clear_path_blocking(p, b, - BTRFS_READ_LOCK); - } - b = tree_mod_log_rewind(root->fs_info, b, time_seq); - p->locks[level] = BTRFS_READ_LOCK; - p->nodes[level] = b; - } else { - p->slots[level] = slot; - unlock_up(p, level, lowest_unlock, 0, NULL); + ret2 = read_block_for_search(root, p, &b, slot, key); + if (ret2 == -EAGAIN && !p->nowait) + goto again; + if (ret2) { + ret = ret2; + goto done; + } + + level = btrfs_header_level(b); + 
btrfs_tree_read_lock(b); + b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq); + if (!b) { + ret = -ENOMEM; goto done; } + p->locks[level] = BTRFS_READ_LOCK; + p->nodes[level] = b; } ret = 1; done: - if (!p->leave_spinning) - btrfs_set_path_blocking(p); if (ret < 0) btrfs_release_path(p); @@ -2841,6 +2365,87 @@ done: } /* + * Search the tree again to find a leaf with smaller keys. + * Returns 0 if it found something. + * Returns 1 if there are no smaller keys. + * Returns < 0 on error. + * + * This may release the path, and so you may lose any locks held at the + * time you call it. + */ +static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + struct btrfs_key key; + struct btrfs_key orig_key; + struct btrfs_disk_key found_key; + int ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, 0); + orig_key = key; + + if (key.offset > 0) { + key.offset--; + } else if (key.type > 0) { + key.type--; + key.offset = (u64)-1; + } else if (key.objectid > 0) { + key.objectid--; + key.type = (u8)-1; + key.offset = (u64)-1; + } else { + return 1; + } + + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret <= 0) + return ret; + + /* + * Previous key not found. Even if we were at slot 0 of the leaf we had + * before releasing the path and calling btrfs_search_slot(), we now may + * be in a slot pointing to the same original key - this can happen if + * after we released the path, one or more items were moved from a + * sibling leaf into the front of the leaf we had due to an insertion + * (see push_leaf_right()). + * If we hit this case and our slot is > 0, just decrement the slot + * so that the caller does not process the same key again, which may or + * may not break the caller, depending on its logic. + */ + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); + ret = btrfs_comp_keys(&found_key, &orig_key); + if (ret == 0) { + if (path->slots[0] > 0) { + path->slots[0]--; + return 0; + } + /* + * At slot 0, same key as before, it means orig_key is + * the lowest, leftmost, key in the tree. We're done. + */ + return 1; + } + } + + btrfs_item_key(path->nodes[0], &found_key, 0); + ret = btrfs_comp_keys(&found_key, &key); + /* + * We might have had an item with the previous key in the tree right + * before we released our path. And after we released our path, that + * item might have been pushed to the first slot (0) of the leaf we + * were holding due to a tree balance. Alternatively, an item with the + * previous key can exist as the only element of a leaf (big fat item). + * Therefore account for these 2 cases, so that our callers (like + * btrfs_previous_item) don't miss an existing item with a key matching + * the previous key we computed above. + */ + if (ret <= 0) + return 0; + return 1; +} + +/* * helper to use instead of search slot if no exact match is needed but * instead the next or previous item should be returned. 
* When find_higher is true, the next higher item is returned, the next lower @@ -2853,8 +2458,9 @@ done: * < 0 on error */ int btrfs_search_slot_for_read(struct btrfs_root *root, - struct btrfs_key *key, struct btrfs_path *p, - int find_higher, int return_any) + const struct btrfs_key *key, + struct btrfs_path *p, int find_higher, + int return_any) { int ret; struct extent_buffer *leaf; @@ -2894,7 +2500,9 @@ again: if (ret < 0) return ret; if (!ret) { - p->slots[0] = btrfs_header_nritems(leaf) - 1; + leaf = p->nodes[0]; + if (p->slots[0] == btrfs_header_nritems(leaf)) + p->slots[0]--; return 0; } if (!return_any) @@ -2915,6 +2523,53 @@ again: } /* + * Execute search and call btrfs_previous_item to traverse backwards if the item + * was not found. + * + * Return 0 if found, 1 if not found and < 0 if error. + */ +int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path) +{ + int ret; + + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret > 0) + ret = btrfs_previous_item(root, path, key->objectid, key->type); + + if (ret == 0) + btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); + + return ret; +} + +/* + * Search for a valid slot for the given path. + * + * @root: The root node of the tree. + * @key: Will contain a valid item if found. + * @path: The starting point to validate the slot. + * + * Return: 0 if the item is valid + * 1 if not found + * <0 if error. + */ +int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path) +{ + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + int ret; + + ret = btrfs_next_leaf(root, path); + if (ret) + return ret; + } + + btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); + return 0; +} + +/* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. * This is used after shifting pointers to the left, so it stops @@ -2922,20 +2577,25 @@ again: * higher levels * */ -static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_disk_key *key, int level) +static void fixup_low_keys(struct btrfs_trans_handle *trans, + const struct btrfs_path *path, + const struct btrfs_disk_key *key, int level) { int i; struct extent_buffer *t; + int ret; for (i = level; i < BTRFS_MAX_LEVEL; i++) { int tslot = path->slots[i]; + if (!path->nodes[i]) break; t = path->nodes[i]; - tree_mod_log_set_node_key(root->fs_info, t, tslot, 1); + ret = btrfs_tree_mod_log_insert_key(t, tslot, + BTRFS_MOD_LOG_KEY_REPLACE); + BUG_ON(ret < 0); btrfs_set_node_key(t, key, tslot); - btrfs_mark_buffer_dirty(path->nodes[i]); + btrfs_mark_buffer_dirty(trans, path->nodes[i]); if (tslot != 0) break; } @@ -2947,9 +2607,11 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path, * This function isn't completely safe. 
It's the caller's responsibility to ensure * that the new key won't break the order -void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *new_key) +void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + const struct btrfs_path *path, + const struct btrfs_key *new_key) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_disk_key disk_key; struct extent_buffer *eb; int slot; @@ -2958,18 +2620,91 @@ void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, slot = path->slots[0]; if (slot > 0) { btrfs_item_key(eb, &disk_key, slot - 1); - BUG_ON(comp_keys(&disk_key, new_key) >= 0); + if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) { + btrfs_print_leaf(eb); + btrfs_crit(fs_info, + "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT, + slot, btrfs_disk_key_objectid(&disk_key), + btrfs_disk_key_type(&disk_key), + btrfs_disk_key_offset(&disk_key), + BTRFS_KEY_FMT_VALUE(new_key)); + BUG(); + } } if (slot < btrfs_header_nritems(eb) - 1) { btrfs_item_key(eb, &disk_key, slot + 1); - BUG_ON(comp_keys(&disk_key, new_key) <= 0); + if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) { + btrfs_print_leaf(eb); + btrfs_crit(fs_info, + "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT, + slot, btrfs_disk_key_objectid(&disk_key), + btrfs_disk_key_type(&disk_key), + btrfs_disk_key_offset(&disk_key), + BTRFS_KEY_FMT_VALUE(new_key)); + BUG(); + } } btrfs_cpu_key_to_disk(&disk_key, new_key); btrfs_set_item_key(eb, &disk_key, slot); - btrfs_mark_buffer_dirty(eb); + btrfs_mark_buffer_dirty(trans, eb); if (slot == 0) - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); +} + +/* + * Check key order of two sibling extent buffers. + * + * Return true if something is wrong. + * Return false if everything is fine. + * + * Tree-checker only works inside one tree block, thus the following + * corruption cannot be detected by tree-checker: + * + * Leaf @left | Leaf @right + * -------------------------------------------------------------- + * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 | + * + * Key f6 in leaf @left itself is valid, but not valid when the next + * key in leaf @right is 7. + * This can only be checked at tree block merge time. + * And since tree checker has ensured all key order in each tree block + * is correct, we only need to check the last key of @left and the first + * key of @right.
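The check itself reduces to a lexicographic comparison of the two boundary keys. A sketch of such a comparator, reusing the simplified struct key from the earlier sketch (illustrative only, not the kernel's btrfs_comp_cpu_keys()):

    /* Lexicographic order: objectid, then type, then offset. */
    static int key_cmp(const struct key *a, const struct key *b)
    {
            if (a->objectid != b->objectid)
                    return a->objectid < b->objectid ? -1 : 1;
            if (a->type != b->type)
                    return a->type < b->type ? -1 : 1;
            if (a->offset != b->offset)
                    return a->offset < b->offset ? -1 : 1;
            return 0;
    }

Siblings are consistent only when comparing left's last key against right's first key yields < 0; equality counts as corruption too, since keys are unique within a tree, which is why the code above trips on >= 0.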
+ */ +static bool check_sibling_keys(const struct extent_buffer *left, + const struct extent_buffer *right) +{ + struct btrfs_key left_last; + struct btrfs_key right_first; + int level = btrfs_header_level(left); + int nr_left = btrfs_header_nritems(left); + int nr_right = btrfs_header_nritems(right); + + /* No key to check in one of the tree blocks */ + if (!nr_left || !nr_right) + return false; + + if (level) { + btrfs_node_key_to_cpu(left, &left_last, nr_left - 1); + btrfs_node_key_to_cpu(right, &right_first, 0); + } else { + btrfs_item_key_to_cpu(left, &left_last, nr_left - 1); + btrfs_item_key_to_cpu(right, &right_first, 0); + } + + if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) { + btrfs_crit(left->fs_info, "left extent buffer:"); + btrfs_print_tree(left, false); + btrfs_crit(left->fs_info, "right extent buffer:"); + btrfs_print_tree(right, false); + btrfs_crit(left->fs_info, +"bad key order, sibling blocks, left last " BTRFS_KEY_FMT " right first " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&left_last), + BTRFS_KEY_FMT_VALUE(&right_first)); + return true; + } + return false; } /* @@ -2980,9 +2715,10 @@ void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, * error, and > 0 if there was no room in the left hand block. */ static int push_node_left(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *dst, - struct extent_buffer *src, int empty) + struct extent_buffer *dst, + struct extent_buffer *src, bool empty) { + struct btrfs_fs_info *fs_info = trans->fs_info; int push_items = 0; int src_nritems; int dst_nritems; @@ -2990,7 +2726,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, src_nritems = btrfs_header_nritems(src); dst_nritems = btrfs_header_nritems(dst); - push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; + push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; WARN_ON(btrfs_header_generation(src) != trans->transid); WARN_ON(btrfs_header_generation(dst) != trans->transid); @@ -3015,27 +2751,36 @@ static int push_node_left(struct btrfs_trans_handle *trans, } else push_items = min(src_nritems - 8, push_items); - tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, - push_items); + /* dst is the left eb, src is the middle eb */ + if (unlikely(check_sibling_keys(dst, src))) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } + ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + return ret; + } copy_extent_buffer(dst, src, - btrfs_node_key_ptr_offset(dst_nritems), - btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(dst, dst_nritems), + btrfs_node_key_ptr_offset(src, 0), push_items * sizeof(struct btrfs_key_ptr)); if (push_items < src_nritems) { /* - * don't call tree_mod_log_eb_move here, key removal was already - * fully logged by tree_mod_log_eb_copy above. + * btrfs_tree_mod_log_eb_copy handles logging the move, so we + * don't need to do an explicit tree mod log operation for it. 
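Stripped of the extent-buffer accessors and tree-mod logging, the pointer motion in push_node_left() is a copy into the destination's tail followed by a compaction of the source. A hedged userspace sketch with fixed-width entries standing in for struct btrfs_key_ptr (names invented):

    #include <stdint.h>
    #include <string.h>

    /* Append the first `push` entries of src to dst, then close the gap
     * in src: the same copy-then-compact pattern as push_node_left(). */
    static void push_left(uint64_t *dst, unsigned int *dst_n,
                          uint64_t *src, unsigned int *src_n,
                          unsigned int push)
    {
            memcpy(&dst[*dst_n], &src[0], push * sizeof(dst[0]));
            memmove(&src[0], &src[push],
                    (*src_n - push) * sizeof(src[0]));
            *dst_n += push;
            *src_n -= push;
    }

The memmove is the step the deleted comment refers to: in the kernel it needs no separate tree-mod-log entry because the eb_copy above already logged the key removal.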
*/ - memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(push_items), + memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0), + btrfs_node_key_ptr_offset(src, push_items), (src_nritems - push_items) * sizeof(struct btrfs_key_ptr)); } btrfs_set_header_nritems(src, src_nritems - push_items); btrfs_set_header_nritems(dst, dst_nritems + push_items); - btrfs_mark_buffer_dirty(src); - btrfs_mark_buffer_dirty(dst); + btrfs_mark_buffer_dirty(trans, src); + btrfs_mark_buffer_dirty(trans, dst); return ret; } @@ -3050,10 +2795,10 @@ static int push_node_left(struct btrfs_trans_handle *trans, * this will only push up to 1/2 the contents of the left node over */ static int balance_node_right(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *dst, struct extent_buffer *src) { + struct btrfs_fs_info *fs_info = trans->fs_info; int push_items = 0; int max_push; int src_nritems; @@ -3065,7 +2810,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, src_nritems = btrfs_header_nritems(src); dst_nritems = btrfs_header_nritems(dst); - push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; + push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; if (push_items <= 0) return 1; @@ -3080,24 +2825,38 @@ static int balance_node_right(struct btrfs_trans_handle *trans, if (max_push < push_items) push_items = max_push; - tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems); - memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), - btrfs_node_key_ptr_offset(0), + /* dst is the right eb, src is the middle eb */ + if (unlikely(check_sibling_keys(src, dst))) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } + + /* + * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't + * need to do an explicit tree mod log operation for it. 
+ */ + memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items), + btrfs_node_key_ptr_offset(dst, 0), (dst_nritems) * sizeof(struct btrfs_key_ptr)); - tree_mod_log_eb_copy(root->fs_info, dst, src, 0, - src_nritems - push_items, push_items); + ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, + push_items); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + return ret; + } copy_extent_buffer(dst, src, - btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(src_nritems - push_items), + btrfs_node_key_ptr_offset(dst, 0), + btrfs_node_key_ptr_offset(src, src_nritems - push_items), push_items * sizeof(struct btrfs_key_ptr)); btrfs_set_header_nritems(src, src_nritems - push_items); btrfs_set_header_nritems(dst, dst_nritems + push_items); - btrfs_mark_buffer_dirty(src); - btrfs_mark_buffer_dirty(dst); + btrfs_mark_buffer_dirty(trans, src); + btrfs_mark_buffer_dirty(trans, dst); return ret; } @@ -3118,6 +2877,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, struct extent_buffer *c; struct extent_buffer *old; struct btrfs_disk_key lower_key; + int ret; BUG_ON(path->nodes[level]); BUG_ON(path->nodes[level-1] != root->node); @@ -3128,30 +2888,15 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, else btrfs_node_key(lower, &lower_key, 0); - c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, - root->root_key.objectid, &lower_key, - level, root->node->start, 0); + c = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), + &lower_key, level, root->node->start, 0, + 0, BTRFS_NESTING_NEW_ROOT); if (IS_ERR(c)) return PTR_ERR(c); - root_add_used(root, root->nodesize); + root_add_used_bytes(root); - memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_nritems(c, 1); - btrfs_set_header_level(c, level); - btrfs_set_header_bytenr(c, c->start); - btrfs_set_header_generation(c, trans->transid); - btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(c, root->root_key.objectid); - - write_extent_buffer(c, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(c), - BTRFS_FSID_SIZE); - - write_extent_buffer(c, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(c), - BTRFS_UUID_SIZE); - btrfs_set_node_key(c, &lower_key, 0); btrfs_set_node_blockptr(c, 0, lower->start); lower_gen = btrfs_header_generation(lower); @@ -3159,17 +2904,28 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(c, 0, lower_gen); - btrfs_mark_buffer_dirty(c); + btrfs_mark_buffer_dirty(trans, c); old = root->node; - tree_mod_log_set_root_pointer(root, c, 0); + ret = btrfs_tree_mod_log_insert_root(root->node, c, false); + if (ret < 0) { + int ret2; + + btrfs_clear_buffer_dirty(trans, c); + ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); + if (unlikely(ret2 < 0)) + btrfs_abort_transaction(trans, ret2); + btrfs_tree_unlock(c); + free_extent_buffer(c); + return ret; + } rcu_assign_pointer(root->node, c); /* the super has an extra ref to root->node */ free_extent_buffer(old); add_root_to_dirty_list(root); - extent_buffer_get(c); + refcount_inc(&c->refs); path->nodes[level] = c; path->locks[level] = BTRFS_WRITE_LOCK; path->slots[level] = 0; @@ -3183,41 +2939,51 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, * slot and level indicate where you want the key to go, and * blocknr is the block the key points to. 
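insert_ptr() below is the classic shift-right-then-write pattern applied to the node's key-pointer array. An illustrative standalone equivalent (hypothetical helper, plain integers instead of key pointers):

    #include <stdint.h>
    #include <string.h>

    /* Insert val at position slot in a fixed-capacity array, shifting
     * the tail right, as insert_ptr() does with memmove_extent_buffer()
     * on the node's key pointers. Returns -1 when full or out of range. */
    static int array_insert(uint64_t *items, unsigned int *nritems,
                            unsigned int cap, unsigned int slot,
                            uint64_t val)
    {
            if (*nritems >= cap || slot > *nritems)
                    return -1;
            memmove(&items[slot + 1], &items[slot],
                    (*nritems - slot) * sizeof(items[0]));
            items[slot] = val;
            (*nritems)++;
            return 0;
    }

The two guard conditions correspond to the BUG_ON(slot > nritems) and BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(...)) checks in the kernel version.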
*/ -static void insert_ptr(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_disk_key *key, u64 bytenr, - int slot, int level) +static int insert_ptr(struct btrfs_trans_handle *trans, + const struct btrfs_path *path, + const struct btrfs_disk_key *key, u64 bytenr, + int slot, int level) { struct extent_buffer *lower; int nritems; int ret; BUG_ON(!path->nodes[level]); - btrfs_assert_tree_locked(path->nodes[level]); + btrfs_assert_tree_write_locked(path->nodes[level]); lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); BUG_ON(slot > nritems); - BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root)); + BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info)); if (slot != nritems) { - if (level) - tree_mod_log_eb_move(root->fs_info, lower, slot + 1, - slot, nritems - slot); + if (level) { + ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, + slot, nritems - slot); + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } memmove_extent_buffer(lower, - btrfs_node_key_ptr_offset(slot + 1), - btrfs_node_key_ptr_offset(slot), + btrfs_node_key_ptr_offset(lower, slot + 1), + btrfs_node_key_ptr_offset(lower, slot), (nritems - slot) * sizeof(struct btrfs_key_ptr)); } if (level) { - ret = tree_mod_log_insert_key(root->fs_info, lower, slot, - MOD_LOG_KEY_ADD); - BUG_ON(ret < 0); + ret = btrfs_tree_mod_log_insert_key(lower, slot, + BTRFS_MOD_LOG_KEY_ADD); + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + return ret; + } } btrfs_set_node_key(lower, key, slot); btrfs_set_node_blockptr(lower, slot, bytenr); WARN_ON(trans->transid == 0); btrfs_set_node_ptr_generation(lower, slot, trans->transid); btrfs_set_header_nritems(lower, nritems + 1); - btrfs_mark_buffer_dirty(lower); + btrfs_mark_buffer_dirty(trans, lower); + + return 0; } /* @@ -3233,6 +2999,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *c; struct extent_buffer *split; struct btrfs_disk_key disk_key; @@ -3249,9 +3016,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans, * tree mod log: We don't log the removal of the old root in * insert_new_root, because that root buffer will be kept as a * normal node. We are going to log removal of half of the - * elements below with tree_mod_log_eb_copy. We're holding a - * tree lock on the buffer, which is why we cannot race with - * other tree_mod_log users. + * elements below with btrfs_tree_mod_log_eb_copy(). We're + * holding a tree lock on the buffer, which is why we cannot + * race with other tree_mod_log users.
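The split performed below moves the upper half of a full node into a fresh sibling, keeping the same mid = (nritems + 1) / 2 choice as split_node(). A compact sketch with illustrative arrays instead of extent buffers:

    #include <stdint.h>
    #include <string.h>

    /* Split a full node at the midpoint, moving the upper half into a
     * fresh sibling, as split_node() does. Returns the split point. */
    static unsigned int node_split(uint64_t *c, unsigned int *c_n,
                                   uint64_t *split, unsigned int *split_n)
    {
            unsigned int mid = (*c_n + 1) / 2;

            memcpy(&split[0], &c[mid], (*c_n - mid) * sizeof(c[0]));
            *split_n = *c_n - mid;
            *c_n = mid;
            return mid;     /* slots >= mid now live in the new node */
    }

The returned split point matches the path fixup at the end of split_node(): a path slot >= mid is re-expressed relative to the new sibling.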
*/ ret = insert_new_root(trans, root, path, level + 1); if (ret) @@ -3260,7 +3027,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = push_nodes_for_insert(trans, root, path, level); c = path->nodes[level]; if (!ret && btrfs_header_nritems(c) < - BTRFS_NODEPTRS_PER_BLOCK(root) - 3) + BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) return 0; if (ret < 0) return ret; @@ -3270,41 +3037,39 @@ static noinline int split_node(struct btrfs_trans_handle *trans, mid = (c_nritems + 1) / 2; btrfs_node_key(c, &disk_key, mid); - split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, - root->root_key.objectid, - &disk_key, level, c->start, 0); + split = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), + &disk_key, level, c->start, 0, + 0, BTRFS_NESTING_SPLIT); if (IS_ERR(split)) return PTR_ERR(split); - root_add_used(root, root->nodesize); - - memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); - btrfs_set_header_level(split, btrfs_header_level(c)); - btrfs_set_header_bytenr(split, split->start); - btrfs_set_header_generation(split, trans->transid); - btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(split, root->root_key.objectid); - write_extent_buffer(split, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(split), - BTRFS_FSID_SIZE); - write_extent_buffer(split, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(split), - BTRFS_UUID_SIZE); - - tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); + root_add_used_bytes(root); + ASSERT(btrfs_header_level(c) == level); + + ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); + if (unlikely(ret)) { + btrfs_tree_unlock(split); + free_extent_buffer(split); + btrfs_abort_transaction(trans, ret); + return ret; + } copy_extent_buffer(split, c, - btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(mid), + btrfs_node_key_ptr_offset(split, 0), + btrfs_node_key_ptr_offset(c, mid), (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); btrfs_set_header_nritems(split, c_nritems - mid); btrfs_set_header_nritems(c, mid); - ret = 0; - btrfs_mark_buffer_dirty(c); - btrfs_mark_buffer_dirty(split); + btrfs_mark_buffer_dirty(trans, c); + btrfs_mark_buffer_dirty(trans, split); - insert_ptr(trans, root, path, &disk_key, split->start, - path->slots[level + 1] + 1, level + 1); + ret = insert_ptr(trans, path, &disk_key, split->start, + path->slots[level + 1] + 1, level + 1); + if (ret < 0) { + btrfs_tree_unlock(split); + free_extent_buffer(split); + return ret; + } if (path->slots[level] >= mid) { path->slots[level] -= mid; @@ -3316,7 +3081,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_tree_unlock(split); free_extent_buffer(split); } - return ret; + return 0; } /* @@ -3324,23 +3089,16 @@ static noinline int split_node(struct btrfs_trans_handle *trans, * and nr indicate which items in the leaf to check. 
This totals up the * space used both by the item structs and the item data */ -static int leaf_space_used(struct extent_buffer *l, int start, int nr) +static int leaf_space_used(const struct extent_buffer *l, int start, int nr) { - struct btrfs_item *start_item; - struct btrfs_item *end_item; - struct btrfs_map_token token; int data_len; int nritems = btrfs_header_nritems(l); int end = min(nritems, start + nr) - 1; if (!nr) return 0; - btrfs_init_map_token(&token); - start_item = btrfs_item_nr(l, start); - end_item = btrfs_item_nr(l, end); - data_len = btrfs_token_item_offset(l, start_item, &token) + - btrfs_token_item_size(l, start_item, &token); - data_len = data_len - btrfs_token_item_offset(l, end_item, &token); + data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start); + data_len = data_len - btrfs_item_offset(l, end); data_len += sizeof(struct btrfs_item) * nr; WARN_ON(data_len < 0); return data_len; @@ -3351,17 +3109,19 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr) * the start of the leaf data. IOW, how much room * the leaf has left for both items and data */ -noinline int btrfs_leaf_free_space(struct btrfs_root *root, - struct extent_buffer *leaf) +int btrfs_leaf_free_space(const struct extent_buffer *leaf) { + struct btrfs_fs_info *fs_info = leaf->fs_info; int nritems = btrfs_header_nritems(leaf); int ret; - ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); - if (ret < 0) { - printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " - "used %d nritems %d\n", - ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), - leaf_space_used(leaf, 0, nritems), nritems); + + ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems); + if (unlikely(ret < 0)) { + btrfs_crit(fs_info, + "leaf free space ret %d, leaf data size %lu, used %d nritems %d", + ret, + (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info), + leaf_space_used(leaf, 0, nritems), nritems); } return ret; } @@ -3371,29 +3131,25 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root, * right. 
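The arithmetic in leaf_space_used() relies on item data being laid out back-to-front: the first item's data ends at the block end and every later item's data sits at a lower offset. A sketch of that accounting with an invented item_hdr struct (the real btrfs_item also carries a key, omitted here):

    #include <stdint.h>

    /* Illustrative leaf model: item headers grow forward from the front,
     * item data grows backward from the end of the block. */
    struct item_hdr {
            uint32_t offset;        /* start of this item's data */
            uint32_t size;          /* length of this item's data */
    };

    /* Bytes consumed by items [start, start + nr) plus their headers,
     * mirroring leaf_space_used(). */
    static uint32_t space_used(const struct item_hdr *items,
                               uint32_t nritems, uint32_t start, uint32_t nr)
    {
            uint32_t end;
            uint32_t data_len;

            if (nr == 0)
                    return 0;
            end = (start + nr < nritems ? start + nr : nritems) - 1;
            /* items[start] owns the highest data bytes, items[end] the lowest */
            data_len = items[start].offset + items[start].size
                       - items[end].offset;
            return data_len + nr * (uint32_t)sizeof(struct item_hdr);
    }

Free space in a leaf is then just the data area size minus space_used() over all items, which is exactly what btrfs_leaf_free_space() computes and cross-checks with its crit message.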
We'll push up to and including min_slot, but no lower */ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - int data_size, int empty, + int data_size, bool empty, struct extent_buffer *right, int free_space, u32 left_nritems, u32 min_slot) { + struct btrfs_fs_info *fs_info = right->fs_info; struct extent_buffer *left = path->nodes[0]; struct extent_buffer *upper = path->nodes[1]; - struct btrfs_map_token token; struct btrfs_disk_key disk_key; int slot; u32 i; int push_space = 0; int push_items = 0; - struct btrfs_item *item; u32 nr; u32 right_nritems; u32 data_end; u32 this_item_size; - btrfs_init_map_token(&token); - if (empty) nr = 0; else @@ -3405,13 +3161,12 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, slot = path->slots[1]; i = left_nritems - 1; while (i >= nr) { - item = btrfs_item_nr(left, i); - if (!empty && push_items > 0) { if (path->slots[0] > i) break; if (path->slots[0] == i) { - int space = btrfs_leaf_free_space(root, left); + int space = btrfs_leaf_free_space(left); + if (space + push_space * 2 > free_space) break; } @@ -3420,12 +3175,13 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (path->slots[0] == i) push_space += data_size; - this_item_size = btrfs_item_size(left, item); - if (this_item_size + sizeof(*item) + push_space > free_space) + this_item_size = btrfs_item_size(left, i); + if (this_item_size + sizeof(struct btrfs_item) + + push_space > free_space) break; push_items++; - push_space += this_item_size + sizeof(*item); + push_space += this_item_size + sizeof(struct btrfs_item); if (i == 0) break; i--; @@ -3439,62 +3195,51 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, /* push left to right */ right_nritems = btrfs_header_nritems(right); - push_space = btrfs_item_end_nr(left, left_nritems - push_items); - push_space -= leaf_data_end(root, left); + push_space = btrfs_item_data_end(left, left_nritems - push_items); + push_space -= leaf_data_end(left); /* make room in the right data area */ - data_end = leaf_data_end(root, right); - memmove_extent_buffer(right, - btrfs_leaf_data(right) + data_end - push_space, - btrfs_leaf_data(right) + data_end, - BTRFS_LEAF_DATA_SIZE(root) - data_end); + data_end = leaf_data_end(right); + memmove_leaf_data(right, data_end - push_space, data_end, + BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); /* copy from the left data area */ - copy_extent_buffer(right, left, btrfs_leaf_data(right) + - BTRFS_LEAF_DATA_SIZE(root) - push_space, - btrfs_leaf_data(left) + leaf_data_end(root, left), - push_space); + copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, + leaf_data_end(left), push_space); - memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), - btrfs_item_nr_offset(0), - right_nritems * sizeof(struct btrfs_item)); + memmove_leaf_items(right, push_items, 0, right_nritems); /* copy the items from left to right */ - copy_extent_buffer(right, left, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(left_nritems - push_items), - push_items * sizeof(struct btrfs_item)); + copy_leaf_items(right, left, 0, left_nritems - push_items, push_items); /* update the item pointers */ right_nritems += push_items; btrfs_set_header_nritems(right, right_nritems); - push_space = BTRFS_LEAF_DATA_SIZE(root); + push_space = BTRFS_LEAF_DATA_SIZE(fs_info); for (i = 0; i < right_nritems; i++) { - item = btrfs_item_nr(right, i); - push_space -= btrfs_token_item_size(right, item, 
&token); - btrfs_set_token_item_offset(right, item, push_space, &token); + push_space -= btrfs_item_size(right, i); + btrfs_set_item_offset(right, i, push_space); } left_nritems -= push_items; btrfs_set_header_nritems(left, left_nritems); if (left_nritems) - btrfs_mark_buffer_dirty(left); + btrfs_mark_buffer_dirty(trans, left); else - clean_tree_block(trans, root, left); + btrfs_clear_buffer_dirty(trans, left); - btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(trans, right); btrfs_item_key(right, &disk_key, 0); btrfs_set_node_key(upper, &disk_key, slot + 1); - btrfs_mark_buffer_dirty(upper); + btrfs_mark_buffer_dirty(trans, upper); /* then fixup the leaf pointer in the path */ if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; - if (btrfs_header_nritems(path->nodes[0]) == 0) - clean_tree_block(trans, root, path->nodes[0]); - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); + btrfs_tree_unlock(left); + free_extent_buffer(left); path->nodes[0] = right; path->slots[1] += 1; } else { @@ -3522,7 +3267,7 @@ out_unlock: static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int min_data_size, int data_size, - int empty, u32 min_slot) + bool empty, u32 min_slot) { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *right; @@ -3540,35 +3285,49 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (slot >= btrfs_header_nritems(upper) - 1) return 1; - btrfs_assert_tree_locked(path->nodes[1]); + btrfs_assert_tree_write_locked(path->nodes[1]); - right = read_node_slot(root, upper, slot + 1); - if (right == NULL) - return 1; + right = btrfs_read_node_slot(upper, slot + 1); + if (IS_ERR(right)) + return PTR_ERR(right); - btrfs_tree_lock(right); - btrfs_set_lock_blocking(right); + btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT); - free_space = btrfs_leaf_free_space(root, right); + free_space = btrfs_leaf_free_space(right); if (free_space < data_size) goto out_unlock; - /* cow and double check */ ret = btrfs_cow_block(trans, root, right, upper, - slot + 1, &right); + slot + 1, &right, BTRFS_NESTING_RIGHT_COW); if (ret) goto out_unlock; - free_space = btrfs_leaf_free_space(root, right); - if (free_space < data_size) - goto out_unlock; - left_nritems = btrfs_header_nritems(left); if (left_nritems == 0) goto out_unlock; - return __push_leaf_right(trans, root, path, min_data_size, empty, - right, free_space, left_nritems, min_slot); + if (unlikely(check_sibling_keys(left, right))) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } + if (path->slots[0] == left_nritems && !empty) { + /* Key greater than all keys in the leaf, right neighbor has + * enough room for it and we're not emptying our leaf to delete + * it, therefore use right neighbor to insert the new item and + * no need to touch/dirty our left leaf. 
*/ + btrfs_tree_unlock(left); + free_extent_buffer(left); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1]++; + return 0; + } + + return __push_leaf_right(trans, path, min_data_size, empty, right, + free_space, left_nritems, min_slot); out_unlock: btrfs_tree_unlock(right); free_extent_buffer(right); @@ -3584,26 +3343,22 @@ out_unlock: * items */ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, int data_size, - int empty, struct extent_buffer *left, + bool empty, struct extent_buffer *left, int free_space, u32 right_nritems, u32 max_slot) { + struct btrfs_fs_info *fs_info = left->fs_info; struct btrfs_disk_key disk_key; struct extent_buffer *right = path->nodes[0]; int i; int push_space = 0; int push_items = 0; - struct btrfs_item *item; u32 old_left_nritems; u32 nr; int ret = 0; u32 this_item_size; u32 old_left_item_size; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); if (empty) nr = min(right_nritems, max_slot); @@ -3611,13 +3366,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, nr = min(right_nritems - 1, max_slot); for (i = 0; i < nr; i++) { - item = btrfs_item_nr(right, i); - if (!empty && push_items > 0) { if (path->slots[0] < i) break; if (path->slots[0] == i) { - int space = btrfs_leaf_free_space(root, right); + int space = btrfs_leaf_free_space(right); + if (space + push_space * 2 > free_space) break; } @@ -3626,94 +3380,84 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, if (path->slots[0] == i) push_space += data_size; - this_item_size = btrfs_item_size(right, item); - if (this_item_size + sizeof(*item) + push_space > free_space) + this_item_size = btrfs_item_size(right, i); + if (this_item_size + sizeof(struct btrfs_item) + push_space > + free_space) break; push_items++; - push_space += this_item_size + sizeof(*item); + push_space += this_item_size + sizeof(struct btrfs_item); } if (push_items == 0) { ret = 1; goto out; } - if (!empty && push_items == btrfs_header_nritems(right)) - WARN_ON(1); + WARN_ON(!empty && push_items == btrfs_header_nritems(right)); /* push data from right to left */ - copy_extent_buffer(left, right, - btrfs_item_nr_offset(btrfs_header_nritems(left)), - btrfs_item_nr_offset(0), - push_items * sizeof(struct btrfs_item)); - - push_space = BTRFS_LEAF_DATA_SIZE(root) - - btrfs_item_offset_nr(right, push_items - 1); - - copy_extent_buffer(left, right, btrfs_leaf_data(left) + - leaf_data_end(root, left) - push_space, - btrfs_leaf_data(right) + - btrfs_item_offset_nr(right, push_items - 1), - push_space); + copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items); + + push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - + btrfs_item_offset(right, push_items - 1); + + copy_leaf_data(left, right, leaf_data_end(left) - push_space, + btrfs_item_offset(right, push_items - 1), push_space); old_left_nritems = btrfs_header_nritems(left); BUG_ON(old_left_nritems <= 0); - old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); + old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1); for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { u32 ioff; - item = btrfs_item_nr(left, i); - - ioff = btrfs_token_item_offset(left, item, &token); - btrfs_set_token_item_offset(left, item, - ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size), - &token); + ioff = btrfs_item_offset(left, i); + btrfs_set_item_offset(left, i, + ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - 
old_left_item_size)); } btrfs_set_header_nritems(left, old_left_nritems + push_items); /* fixup right node */ - if (push_items > right_nritems) - WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, - right_nritems); + if (unlikely(push_items > right_nritems)) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + btrfs_crit(fs_info, "push items (%d) > right leaf items (%u)", + push_items, right_nritems); + goto out; + } if (push_items < right_nritems) { - push_space = btrfs_item_offset_nr(right, push_items - 1) - - leaf_data_end(root, right); - memmove_extent_buffer(right, btrfs_leaf_data(right) + - BTRFS_LEAF_DATA_SIZE(root) - push_space, - btrfs_leaf_data(right) + - leaf_data_end(root, right), push_space); - - memmove_extent_buffer(right, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(push_items), - (btrfs_header_nritems(right) - push_items) * - sizeof(struct btrfs_item)); + push_space = btrfs_item_offset(right, push_items - 1) - + leaf_data_end(right); + memmove_leaf_data(right, + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, + leaf_data_end(right), push_space); + + memmove_leaf_items(right, 0, push_items, + btrfs_header_nritems(right) - push_items); } + right_nritems -= push_items; btrfs_set_header_nritems(right, right_nritems); - push_space = BTRFS_LEAF_DATA_SIZE(root); + push_space = BTRFS_LEAF_DATA_SIZE(fs_info); for (i = 0; i < right_nritems; i++) { - item = btrfs_item_nr(right, i); - - push_space = push_space - btrfs_token_item_size(right, - item, &token); - btrfs_set_token_item_offset(right, item, push_space, &token); + push_space = push_space - btrfs_item_size(right, i); + btrfs_set_item_offset(right, i, push_space); } - btrfs_mark_buffer_dirty(left); + btrfs_mark_buffer_dirty(trans, left); if (right_nritems) - btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(trans, right); else - clean_tree_block(trans, root, right); + btrfs_clear_buffer_dirty(trans, right); btrfs_item_key(right, &disk_key, 0); - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { path->slots[0] += old_left_nritems; - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); + btrfs_tree_unlock(right); + free_extent_buffer(right); path->nodes[0] = left; path->slots[1] -= 1; } else { @@ -3758,24 +3502,23 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (right_nritems == 0) return 1; - btrfs_assert_tree_locked(path->nodes[1]); + btrfs_assert_tree_write_locked(path->nodes[1]); - left = read_node_slot(root, path->nodes[1], slot - 1); - if (left == NULL) - return 1; + left = btrfs_read_node_slot(path->nodes[1], slot - 1); + if (IS_ERR(left)) + return PTR_ERR(left); - btrfs_tree_lock(left); - btrfs_set_lock_blocking(left); + btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT); - free_space = btrfs_leaf_free_space(root, left); + free_space = btrfs_leaf_free_space(left); if (free_space < data_size) { ret = 1; goto out; } - /* cow and double check */ ret = btrfs_cow_block(trans, root, left, - path->nodes[1], slot - 1, &left); + path->nodes[1], slot - 1, &left, + BTRFS_NESTING_LEFT_COW); if (ret) { /* we hit -ENOSPC, but it isn't fatal here */ if (ret == -ENOSPC) @@ -3783,15 +3526,13 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - free_space = btrfs_leaf_free_space(root, left); - if (free_space < data_size) { - ret = 1; + if (unlikely(check_sibling_keys(left, right))) { + ret = -EUCLEAN; + 
btrfs_abort_transaction(trans, ret); goto out; } - - return __push_leaf_left(trans, root, path, min_data_size, - empty, left, free_space, right_nritems, - max_slot); + return __push_leaf_left(trans, path, min_data_size, empty, left, + free_space, right_nritems, max_slot); out: btrfs_tree_unlock(left); free_extent_buffer(left); @@ -3802,53 +3543,45 @@ out: * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. */ -static noinline void copy_for_split(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *l, - struct extent_buffer *right, - int slot, int mid, int nritems) +static noinline int copy_for_split(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct extent_buffer *l, + struct extent_buffer *right, + int slot, int mid, int nritems) { + struct btrfs_fs_info *fs_info = trans->fs_info; int data_copy_size; int rt_data_off; int i; + int ret; struct btrfs_disk_key disk_key; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); nritems = nritems - mid; btrfs_set_header_nritems(right, nritems); - data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); + data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); - copy_extent_buffer(right, l, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(mid), - nritems * sizeof(struct btrfs_item)); + copy_leaf_items(right, l, 0, mid, nritems); - copy_extent_buffer(right, l, - btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - - data_copy_size, btrfs_leaf_data(l) + - leaf_data_end(root, l), data_copy_size); + copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size, + leaf_data_end(l), data_copy_size); - rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - - btrfs_item_end_nr(l, mid); + rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); for (i = 0; i < nritems; i++) { - struct btrfs_item *item = btrfs_item_nr(right, i); u32 ioff; - ioff = btrfs_token_item_offset(right, item, &token); - btrfs_set_token_item_offset(right, item, - ioff + rt_data_off, &token); + ioff = btrfs_item_offset(right, i); + btrfs_set_item_offset(right, i, ioff + rt_data_off); } btrfs_set_header_nritems(l, mid); btrfs_item_key(right, &disk_key, 0); - insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1); + ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); + if (ret < 0) + return ret; - btrfs_mark_buffer_dirty(right); - btrfs_mark_buffer_dirty(l); + btrfs_mark_buffer_dirty(trans, right); + btrfs_mark_buffer_dirty(trans, l); BUG_ON(path->slots[0] != slot); if (mid <= slot) { @@ -3863,6 +3596,8 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, } BUG_ON(path->slots[0] < 0); + + return 0; } /* @@ -3884,14 +3619,17 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, int progress = 0; int slot; u32 nritems; + int space_needed = data_size; slot = path->slots[0]; + if (slot < btrfs_header_nritems(path->nodes[0])) + space_needed -= btrfs_leaf_free_space(path->nodes[0]); /* * try to push all the items after our slot into the * right leaf */ - ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot); + ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); if (ret < 0) return ret; @@ -3906,12 +3644,15 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, if (path->slots[0] == 0 || path->slots[0] == nritems) return 0; - if 
(btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) return 0; /* try to push all the items before our slot into the next leaf */ slot = path->slots[0]; - ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot); + space_needed = data_size; + if (slot > 0) + space_needed -= btrfs_leaf_free_space(path->nodes[0]); + ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); if (ret < 0) return ret; @@ -3931,9 +3672,9 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, */ static noinline int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_key *ins_key, + const struct btrfs_key *ins_key, struct btrfs_path *path, int data_size, - int extend) + bool extend) { struct btrfs_disk_key disk_key; struct extent_buffer *l; @@ -3941,6 +3682,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int mid; int slot; struct extent_buffer *right; + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int wret; int split; @@ -3949,26 +3691,34 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, l = path->nodes[0]; slot = path->slots[0]; - if (extend && data_size + btrfs_item_size_nr(l, slot) + - sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) + if (extend && data_size + btrfs_item_size(l, slot) + + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) return -EOVERFLOW; /* first try to make some room by pushing left and right */ if (data_size && path->nodes[1]) { - wret = push_leaf_right(trans, root, path, data_size, - data_size, 0, 0); + int space_needed = data_size; + + if (slot < btrfs_header_nritems(l)) + space_needed -= btrfs_leaf_free_space(l); + + wret = push_leaf_right(trans, root, path, space_needed, + space_needed, 0, 0); if (wret < 0) return wret; if (wret) { - wret = push_leaf_left(trans, root, path, data_size, - data_size, 0, (u32)-1); + space_needed = data_size; + if (slot > 0) + space_needed -= btrfs_leaf_free_space(l); + wret = push_leaf_left(trans, root, path, space_needed, + space_needed, 0, (u32)-1); if (wret < 0) return wret; } l = path->nodes[0]; /* did the pushes work? 
*/ - if (btrfs_leaf_free_space(root, l) >= data_size) + if (btrfs_leaf_free_space(l) >= data_size) return 0; } @@ -3987,14 +3737,14 @@ again: if (mid <= slot) { if (nritems == 1 || leaf_space_used(l, mid, nritems - mid) + data_size > - BTRFS_LEAF_DATA_SIZE(root)) { + BTRFS_LEAF_DATA_SIZE(fs_info)) { if (slot >= nritems) { split = 0; } else { mid = slot; if (mid != nritems && leaf_space_used(l, mid, nritems - mid) + - data_size > BTRFS_LEAF_DATA_SIZE(root)) { + data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { if (data_size && !tried_avoid_double) goto push_for_double; split = 2; } } } else { if (leaf_space_used(l, 0, mid) + data_size > - BTRFS_LEAF_DATA_SIZE(root)) { + BTRFS_LEAF_DATA_SIZE(fs_info)) { if (!extend && data_size && slot == 0) { split = 0; } else if ((extend || !data_size) && slot == 0) { @@ -4012,10 +3762,10 @@ again: mid = slot; if (mid != nritems && leaf_space_used(l, mid, nritems - mid) + - data_size > BTRFS_LEAF_DATA_SIZE(root)) { + data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { if (data_size && !tried_avoid_double) goto push_for_double; - split = 2 ; + split = 2; } } } @@ -4026,33 +3776,33 @@ again: else btrfs_item_key(l, &disk_key, mid); - right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - root->root_key.objectid, - &disk_key, 0, l->start, 0); + /* + * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double + * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES + * subclasses, which is 8 at the time of this patch, and we've maxed it + * out. In the future we could add a + * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just + * use BTRFS_NESTING_NEW_ROOT. + */ + right = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), + &disk_key, 0, l->start, 0, 0, + num_doubles ?
BTRFS_NESTING_NEW_ROOT : + BTRFS_NESTING_SPLIT); if (IS_ERR(right)) return PTR_ERR(right); - root_add_used(root, root->leafsize); - - memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); - btrfs_set_header_bytenr(right, right->start); - btrfs_set_header_generation(right, trans->transid); - btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(right, root->root_key.objectid); - btrfs_set_header_level(right, 0); - write_extent_buffer(right, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(right), - BTRFS_FSID_SIZE); - - write_extent_buffer(right, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(right), - BTRFS_UUID_SIZE); + root_add_used_bytes(root); if (split == 0) { if (mid <= slot) { btrfs_set_header_nritems(right, 0); - insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1); + ret = insert_ptr(trans, path, &disk_key, + right->start, path->slots[1] + 1, 1); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -4060,20 +3810,34 @@ again: path->slots[1] += 1; } else { btrfs_set_header_nritems(right, 0); - insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1], 1); + ret = insert_ptr(trans, path, &disk_key, + right->start, path->slots[1], 1); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; path->slots[0] = 0; if (path->slots[1] == 0) - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); } - btrfs_mark_buffer_dirty(right); + /* + * We create a new leaf 'right' for the required ins_len and + * we'll do btrfs_mark_buffer_dirty() on this leaf only after + * the new item's data has been copied into 'right'.
+ */ return ret; } - copy_for_split(trans, root, path, l, right, slot, mid, nritems); + ret = copy_for_split(trans, path, l, right, slot, mid, nritems); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } if (split == 2) { BUG_ON(num_doubles != 0); @@ -4086,7 +3850,7 @@ again: push_for_double: push_for_double_split(trans, root, path, data_size); tried_avoid_double = 1; - if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) return 0; goto again; } @@ -4106,12 +3870,13 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && + key.type != BTRFS_RAID_STRIPE_KEY && key.type != BTRFS_EXTENT_CSUM_KEY); - if (btrfs_leaf_free_space(root, leaf) >= ins_len) + if (btrfs_leaf_free_space(leaf) >= ins_len) return 0; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if (key.type == BTRFS_EXTENT_DATA_KEY) { fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -4119,21 +3884,23 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - path->keep_locks = 1; - path->search_for_split = 1; + path->keep_locks = true; + path->search_for_split = true; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - path->search_for_split = 0; + path->search_for_split = false; + if (ret > 0) + ret = -EAGAIN; if (ret < 0) goto err; ret = -EAGAIN; leaf = path->nodes[0]; - /* if our item isn't there or got smaller, return now */ - if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) + /* if our item isn't there, return now */ + if (item_size != btrfs_item_size(leaf, path->slots[0])) goto err; /* the leaf has changed, it now has room. return now */ - if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len) + if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len) goto err; if (key.type == BTRFS_EXTENT_DATA_KEY) { @@ -4143,29 +3910,25 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, goto err; } - btrfs_set_path_blocking(path); ret = split_leaf(trans, root, &key, path, ins_len, 1); if (ret) goto err; - path->keep_locks = 0; + path->keep_locks = false; btrfs_unlock_up_safe(path, 1); return 0; err: - path->keep_locks = 0; + path->keep_locks = false; return ret; } static noinline int split_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *new_key, + const struct btrfs_key *new_key, unsigned long split_offset) { struct extent_buffer *leaf; - struct btrfs_item *item; - struct btrfs_item *new_item; - int slot; + int orig_slot, slot; char *buf; u32 nritems; u32 item_size; @@ -4173,13 +3936,16 @@ static noinline int split_item(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; leaf = path->nodes[0]; - BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); - - btrfs_set_path_blocking(path); + /* + * Shouldn't happen because the caller must have previously called + * setup_leaf_for_split() to make room for the new item in the leaf. 
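The offset bookkeeping in split_item() is easiest to follow with concrete numbers. A small standalone program working through one example (all values invented for illustration):

    #include <stdint.h>
    #include <stdio.h>

    /* Worked example of split_item()'s offset math: leaf data grows
     * downward, so the first split_offset bytes keep the original key
     * but move up, while the tail becomes the new item at the old
     * offset. */
    int main(void)
    {
            uint32_t orig_offset = 100, item_size = 40, split_offset = 15;

            uint32_t new_off = orig_offset;                 /* new item */
            uint32_t new_size = item_size - split_offset;   /* 25 bytes */
            uint32_t old_off = orig_offset + item_size - split_offset;
            uint32_t old_size = split_offset;               /* 15 bytes */

            printf("new item: offset=%u size=%u\n",
                   (unsigned)new_off, (unsigned)new_size);
            printf("old item: offset=%u size=%u\n",
                   (unsigned)old_off, (unsigned)old_size);
            return 0;
    }

So the first split_offset bytes stay with the original key at the high end of the old data range, the remainder becomes the new item at the old offset, and the two stay contiguous in the leaf.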
+ */ + if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item))) + return -ENOSPC; - item = btrfs_item_nr(leaf, path->slots[0]); - orig_offset = btrfs_item_offset(leaf, item); - item_size = btrfs_item_size(leaf, item); + orig_slot = path->slots[0]; + orig_offset = btrfs_item_offset(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); buf = kmalloc(item_size, GFP_NOFS); if (!buf) @@ -4192,22 +3958,18 @@ static noinline int split_item(struct btrfs_trans_handle *trans, nritems = btrfs_header_nritems(leaf); if (slot != nritems) { /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), - btrfs_item_nr_offset(slot), - (nritems - slot) * sizeof(struct btrfs_item)); + memmove_leaf_items(leaf, slot + 1, slot, nritems - slot); } btrfs_cpu_key_to_disk(&disk_key, new_key); btrfs_set_item_key(leaf, &disk_key, slot); - new_item = btrfs_item_nr(leaf, slot); - - btrfs_set_item_offset(leaf, new_item, orig_offset); - btrfs_set_item_size(leaf, new_item, item_size - split_offset); + btrfs_set_item_offset(leaf, slot, orig_offset); + btrfs_set_item_size(leaf, slot, item_size - split_offset); - btrfs_set_item_offset(leaf, item, - orig_offset + item_size - split_offset); - btrfs_set_item_size(leaf, item, split_offset); + btrfs_set_item_offset(leaf, orig_slot, + orig_offset + item_size - split_offset); + btrfs_set_item_size(leaf, orig_slot, split_offset); btrfs_set_header_nritems(leaf, nritems + 1); @@ -4220,9 +3982,9 @@ static noinline int split_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, buf + split_offset, btrfs_item_ptr_offset(leaf, slot), item_size - split_offset); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); - BUG_ON(btrfs_leaf_free_space(root, leaf) < 0); + BUG_ON(btrfs_leaf_free_space(leaf) < 0); kfree(buf); return 0; } @@ -4245,7 +4007,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans, int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *new_key, + const struct btrfs_key *new_key, unsigned long split_offset) { int ret; @@ -4254,79 +4016,39 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = split_item(trans, root, path, new_key, split_offset); + ret = split_item(trans, path, new_key, split_offset); return ret; } /* - * This function duplicate a item, giving 'new_key' to the new item. - * It guarantees both items live in the same tree leaf and the new item - * is contiguous with the original item. - * - * This allows us to split file extent in place, keeping a lock on the - * leaf the entire time. - */ -int btrfs_duplicate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *new_key) -{ - struct extent_buffer *leaf; - int ret; - u32 item_size; - - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ret = setup_leaf_for_split(trans, root, path, - item_size + sizeof(struct btrfs_item)); - if (ret) - return ret; - - path->slots[0]++; - setup_items_for_insert(root, path, new_key, &item_size, - item_size, item_size + - sizeof(struct btrfs_item), 1); - leaf = path->nodes[0]; - memcpy_extent_buffer(leaf, - btrfs_item_ptr_offset(leaf, path->slots[0]), - btrfs_item_ptr_offset(leaf, path->slots[0] - 1), - item_size); - return 0; -} - -/* * make the item pointed to by the path smaller. 
new_size indicates * how small to make it, and from_end tells us if we just chop bytes * off the end of the item or if we shift the item to chop bytes off * the front. */ -void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, - u32 new_size, int from_end) +void btrfs_truncate_item(struct btrfs_trans_handle *trans, + const struct btrfs_path *path, u32 new_size, int from_end) { int slot; struct extent_buffer *leaf; - struct btrfs_item *item; u32 nritems; unsigned int data_end; unsigned int old_data_start; unsigned int old_size; unsigned int size_diff; int i; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); leaf = path->nodes[0]; slot = path->slots[0]; - old_size = btrfs_item_size_nr(leaf, slot); + old_size = btrfs_item_size(leaf, slot); if (old_size == new_size) return; nritems = btrfs_header_nritems(leaf); - data_end = leaf_data_end(root, leaf); + data_end = leaf_data_end(leaf); - old_data_start = btrfs_item_offset_nr(leaf, slot); + old_data_start = btrfs_item_offset(leaf, slot); size_diff = old_size - new_size; @@ -4339,18 +4061,15 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, /* first correct the data pointers */ for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(leaf, i); - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, item, - ioff + size_diff, &token); + ioff = btrfs_item_offset(leaf, i); + btrfs_set_item_offset(leaf, i, ioff + size_diff); } /* shift the data */ if (from_end) { - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end + size_diff, btrfs_leaf_data(leaf) + - data_end, old_data_start + new_size - data_end); + memmove_leaf_data(leaf, data_end + size_diff, data_end, + old_data_start + new_size - data_end); } else { struct btrfs_disk_key disk_key; u64 offset; @@ -4371,28 +4090,25 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, ptr = btrfs_item_ptr_offset(leaf, slot); memmove_extent_buffer(leaf, ptr, (unsigned long)fi, - offsetof(struct btrfs_file_extent_item, - disk_bytenr)); + BTRFS_FILE_EXTENT_INLINE_DATA_START); } } - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end + size_diff, btrfs_leaf_data(leaf) + - data_end, old_data_start - data_end); + memmove_leaf_data(leaf, data_end + size_diff, data_end, + old_data_start - data_end); offset = btrfs_disk_key_offset(&disk_key); btrfs_set_disk_key_offset(&disk_key, offset + size_diff); btrfs_set_item_key(leaf, &disk_key, slot); if (slot == 0) - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); } - item = btrfs_item_nr(leaf, slot); - btrfs_set_item_size(leaf, item, new_size); - btrfs_mark_buffer_dirty(leaf); + btrfs_set_item_size(leaf, slot, new_size); + btrfs_mark_buffer_dirty(trans, leaf); - if (btrfs_leaf_free_space(root, leaf) < 0) { - btrfs_print_leaf(root, leaf); + if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { + btrfs_print_leaf(leaf); BUG(); } } @@ -4400,39 +4116,35 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, /* * make the item pointed to by the path bigger, data_size is the added size. 
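Because leaf data grows downward, extending an item means every data offset from the target slot onward drops by data_size before the target's size grows. A sketch of just that bookkeeping, reusing the illustrative item_hdr from the earlier sketch (the real function also memmoves the affected data region, which is omitted here):

    /* Grow item `slot` by data_size in a downward-growing data area:
     * items from slot onward start data_size bytes lower, then the
     * target's size expands into the gap (mirrors btrfs_extend_item()). */
    static void extend_in_place(struct item_hdr *items, uint32_t nritems,
                                uint32_t slot, uint32_t data_size)
    {
            for (uint32_t i = slot; i < nritems; i++)
                    items[i].offset -= data_size;
            items[slot].size += data_size;
    }

btrfs_truncate_item() above is the exact inverse: offsets from the slot onward rise by size_diff and the item shrinks, with an extra key-offset fixup when bytes are chopped off the front instead of the end.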
*/ -void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, - u32 data_size) +void btrfs_extend_item(struct btrfs_trans_handle *trans, + const struct btrfs_path *path, u32 data_size) { int slot; struct extent_buffer *leaf; - struct btrfs_item *item; u32 nritems; unsigned int data_end; unsigned int old_data; unsigned int old_size; int i; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); - data_end = leaf_data_end(root, leaf); + data_end = leaf_data_end(leaf); - if (btrfs_leaf_free_space(root, leaf) < data_size) { - btrfs_print_leaf(root, leaf); + if (unlikely(btrfs_leaf_free_space(leaf) < data_size)) { + btrfs_print_leaf(leaf); BUG(); } slot = path->slots[0]; - old_data = btrfs_item_end_nr(leaf, slot); + old_data = btrfs_item_data_end(leaf, slot); BUG_ON(slot < 0); - if (slot >= nritems) { - btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d too large, nritems %d\n", - slot, nritems); - BUG_ON(1); + if (unlikely(slot >= nritems)) { + btrfs_print_leaf(leaf); + btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d", + slot, nritems); + BUG(); } /* @@ -4441,71 +4153,83 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, /* first correct the data pointers */ for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(leaf, i); - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, item, - ioff - data_size, &token); + ioff = btrfs_item_offset(leaf, i); + btrfs_set_item_offset(leaf, i, ioff - data_size); } /* shift the data */ - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end - data_size, btrfs_leaf_data(leaf) + - data_end, old_data - data_end); - - data_end = old_data; - old_size = btrfs_item_size_nr(leaf, slot); - item = btrfs_item_nr(leaf, slot); - btrfs_set_item_size(leaf, item, old_size + data_size); - btrfs_mark_buffer_dirty(leaf); - - if (btrfs_leaf_free_space(root, leaf) < 0) { - btrfs_print_leaf(root, leaf); + memmove_leaf_data(leaf, data_end - data_size, data_end, + old_data - data_end); + + old_size = btrfs_item_size(leaf, slot); + btrfs_set_item_size(leaf, slot, old_size + data_size); + btrfs_mark_buffer_dirty(trans, leaf); + + if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { + btrfs_print_leaf(leaf); BUG(); } } /* - * this is a helper for btrfs_insert_empty_items, the main goal here is - * to save stack depth by doing the bulk of the work in a function - * that doesn't call btrfs_search_slot + * Make space in the node before inserting one or more items. 
+ * + * @trans: transaction handle + * @root: root we are inserting items to + * @path: points to the leaf/slot where we are going to insert new items + * @batch: information about the batch of items to insert + * + * Main purpose is to save stack depth by doing the bulk of the work in a + * function that doesn't call btrfs_search_slot */ -void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr) +static void setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + const struct btrfs_item_batch *batch) { - struct btrfs_item *item; + struct btrfs_fs_info *fs_info = root->fs_info; int i; u32 nritems; unsigned int data_end; struct btrfs_disk_key disk_key; struct extent_buffer *leaf; int slot; - struct btrfs_map_token token; + u32 total_size; - btrfs_init_map_token(&token); + /* + * Before anything else, update keys in the parent and other ancestors + * if needed, then release the write locks on them, so that other tasks + * can use them while we modify the leaf. + */ + if (path->slots[0] == 0) { + btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]); + fixup_low_keys(trans, path, &disk_key, 1); + } + btrfs_unlock_up_safe(path, 1); leaf = path->nodes[0]; slot = path->slots[0]; nritems = btrfs_header_nritems(leaf); - data_end = leaf_data_end(root, leaf); + data_end = leaf_data_end(leaf); + total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); - if (btrfs_leaf_free_space(root, leaf) < total_size) { - btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "not enough freespace need %u have %d\n", - total_size, btrfs_leaf_free_space(root, leaf)); + if (unlikely(btrfs_leaf_free_space(leaf) < total_size)) { + btrfs_print_leaf(leaf); + btrfs_crit(fs_info, "not enough freespace need %u have %d", + total_size, btrfs_leaf_free_space(leaf)); BUG(); } if (slot != nritems) { - unsigned int old_data = btrfs_item_end_nr(leaf, slot); - - if (old_data < data_end) { - btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d old_data %d data_end %d\n", - slot, old_data, data_end); - BUG_ON(1); + unsigned int old_data = btrfs_item_data_end(leaf, slot); + + if (unlikely(old_data < data_end)) { + btrfs_print_leaf(leaf); + btrfs_crit(fs_info, + "item at slot %d with data offset %u beyond data end of leaf %u", + slot, old_data, data_end); + BUG(); } /* * item0..itemN ... dataN.offset..dataN.size .. 
data0.size @@ -4514,70 +4238,81 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, for (i = slot; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(leaf, i); - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, item, - ioff - total_data, &token); + ioff = btrfs_item_offset(leaf, i); + btrfs_set_item_offset(leaf, i, + ioff - batch->total_data_size); } /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), - btrfs_item_nr_offset(slot), - (nritems - slot) * sizeof(struct btrfs_item)); + memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot); /* shift the data */ - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end - total_data, btrfs_leaf_data(leaf) + - data_end, old_data - data_end); + memmove_leaf_data(leaf, data_end - batch->total_data_size, + data_end, old_data - data_end); data_end = old_data; } /* setup the item for the new data */ - for (i = 0; i < nr; i++) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + for (i = 0; i < batch->nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); btrfs_set_item_key(leaf, &disk_key, slot + i); - item = btrfs_item_nr(leaf, slot + i); - btrfs_set_token_item_offset(leaf, item, - data_end - data_size[i], &token); - data_end -= data_size[i]; - btrfs_set_token_item_size(leaf, item, data_size[i], &token); + data_end -= batch->data_sizes[i]; + btrfs_set_item_offset(leaf, slot + i, data_end); + btrfs_set_item_size(leaf, slot + i, batch->data_sizes[i]); } - btrfs_set_header_nritems(leaf, nritems + nr); - - if (slot == 0) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key); - fixup_low_keys(root, path, &disk_key, 1); - } - btrfs_unlock_up_safe(path, 1); - btrfs_mark_buffer_dirty(leaf); + btrfs_set_header_nritems(leaf, nritems + batch->nr); + btrfs_mark_buffer_dirty(trans, leaf); - if (btrfs_leaf_free_space(root, leaf) < 0) { - btrfs_print_leaf(root, leaf); + if (unlikely(btrfs_leaf_free_space(leaf) < 0)) { + btrfs_print_leaf(leaf); BUG(); } } /* + * Insert a new item into a leaf. + * + * @trans: Transaction handle. + * @root: The root of the btree. + * @path: A path pointing to the target leaf and slot. + * @key: The key of the new item. + * @data_size: The size of the data associated with the new key. + */ +void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *key, + u32 data_size) +{ + struct btrfs_item_batch batch; + + batch.keys = key; + batch.data_sizes = &data_size; + batch.total_data_size = data_size; + batch.nr = 1; + + setup_items_for_insert(trans, root, path, &batch); +} + +/* * Given a key and some data, insert items into the tree. * This does all the path init required, making room in the tree if needed. 
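A hedged sketch of caller-side usage for the batch interface, assuming kernel context with trans, root and path already set up (the keys and sizes are invented; error handling is elided):

    /* Hypothetical helper; everything except the btrfs_* names is
     * invented for illustration. Keys must be sorted and absent. */
    static int insert_two_items(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_path *path)
    {
            const struct btrfs_key keys[2] = {
                    { .objectid = 256, .type = BTRFS_INODE_REF_KEY, .offset = 0 },
                    { .objectid = 256, .type = BTRFS_INODE_REF_KEY, .offset = 1 },
            };
            const u32 sizes[2] = { 16, 32 };
            struct btrfs_item_batch batch = {
                    .keys = keys,
                    .data_sizes = sizes,
                    .total_data_size = 48, /* must equal the sum of sizes[] */
                    .nr = 2,
            };

            return btrfs_insert_empty_items(trans, root, path, &batch);
    }

btrfs_setup_item_for_insert() above is the degenerate nr == 1 case of the same structure, which is why it can be a thin wrapper.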
+ * + * Returns: 0 on success + * -EEXIST if the first key already exists + * < 0 on other errors */ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - int nr) + const struct btrfs_item_batch *batch) { int ret = 0; int slot; - int i; - u32 total_size = 0; - u32 total_data = 0; - - for (i = 0; i < nr; i++) - total_data += data_size[i]; + u32 total_size; - total_size = total_data + (nr * sizeof(struct btrfs_item)); - ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); + ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1); if (ret == 0) return -EEXIST; if (ret < 0) @@ -4586,8 +4321,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, slot = path->slots[0]; BUG_ON(slot < 0); - setup_items_for_insert(root, path, cpu_key, data_size, - total_data, total_size, nr); + setup_items_for_insert(trans, root, path, batch); return 0; } @@ -4595,12 +4329,12 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, * Given a key and some data, insert an item into the tree. * This does all the path init required, making room in the tree if needed. */ -int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *cpu_key, void *data, u32 - data_size) +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *cpu_key, void *data, + u32 data_size) { int ret = 0; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; unsigned long ptr; @@ -4612,20 +4346,55 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root leaf = path->nodes[0]; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); write_extent_buffer(leaf, data, ptr, data_size); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } - btrfs_free_path(path); return ret; } /* + * This function duplicates an item, giving 'new_key' to the new item. + * It guarantees both items live in the same tree leaf and the new item is + * contiguous with the original item. + * + * This allows us to split a file extent in place, keeping a lock on the leaf + * the entire time. + */ +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *new_key) +{ + struct extent_buffer *leaf; + int ret; + u32 item_size; + + leaf = path->nodes[0]; + item_size = btrfs_item_size(leaf, path->slots[0]); + ret = setup_leaf_for_split(trans, root, path, + item_size + sizeof(struct btrfs_item)); + if (ret) + return ret; + + path->slots[0]++; + btrfs_setup_item_for_insert(trans, root, path, new_key, item_size); + leaf = path->nodes[0]; + memcpy_extent_buffer(leaf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + btrfs_item_ptr_offset(leaf, path->slots[0] - 1), + item_size); + return 0; +} + +/* * delete the pointer from a given node. * * the tree should have been previously balanced so the deletion does not * empty a node. + * + * This is exported for use inside btrfs-progs, don't un-export it. 
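Since btrfs_duplicate_item() above guarantees the copy lands in the same leaf, directly after the original, a caller can split an item without ever dropping the leaf lock. A hedged sketch, with the new offset and the helper name purely illustrative:

/*
 * Sketch only: clone the item at path->slots[0] under the same
 * objectid/type but a new offset, as done when splitting file extents.
 */
static int dup_item_at_offset(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct btrfs_path *path,
			      u64 new_offset)
{
	struct btrfs_key new_key;
	int ret;

	btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
	new_key.offset = new_offset;

	ret = btrfs_duplicate_item(trans, root, path, &new_key);
	if (ret)
		return ret;

	/* path->slots[0] now points at the copy; slot - 1 is the original. */
	return 0;
}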
*/ -static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, - int level, int slot) +int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; @@ -4633,18 +4402,26 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { - if (level) - tree_mod_log_eb_move(root->fs_info, parent, slot, - slot + 1, nritems - slot - 1); + if (level) { + ret = btrfs_tree_mod_log_insert_move(parent, slot, + slot + 1, nritems - slot - 1); + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } memmove_extent_buffer(parent, - btrfs_node_key_ptr_offset(slot), - btrfs_node_key_ptr_offset(slot + 1), + btrfs_node_key_ptr_offset(parent, slot), + btrfs_node_key_ptr_offset(parent, slot + 1), sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); } else if (level) { - ret = tree_mod_log_insert_key(root->fs_info, parent, slot, - MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); + ret = btrfs_tree_mod_log_insert_key(parent, slot, + BTRFS_MOD_LOG_KEY_REMOVE); + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + return ret; + } } nritems--; @@ -4657,9 +4434,10 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_disk_key disk_key; btrfs_node_key(parent, &disk_key, 0); - fixup_low_keys(root, path, &disk_key, level + 1); + fixup_low_keys(trans, path, &disk_key, level + 1); } - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); + return 0; } /* @@ -4672,13 +4450,17 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, * The path must have already been setup for deleting the leaf, including * all the proper balancing. path->nodes[1] must be locked. */ -static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *leaf) +static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf) { + int ret; + WARN_ON(btrfs_header_generation(leaf) != trans->transid); - del_ptr(root, path, 1, path->slots[1]); + ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]); + if (ret < 0) + return ret; /* * btrfs_free_extent is expensive, we want to make sure we @@ -4686,11 +4468,15 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, */ btrfs_unlock_up_safe(path, 0); - root_sub_used(root, leaf->len); + root_sub_used_bytes(root); - extent_buffer_get(leaf); - btrfs_free_tree_block(trans, root, leaf, 0, 1); + refcount_inc(&leaf->refs); + ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); free_extent_buffer_stale(leaf); + if (ret < 0) + btrfs_abort_transaction(trans, ret); + + return ret; } /* * delete the item at the leaf level in path. 
If that empties @@ -4699,59 +4485,46 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; - struct btrfs_item *item; - int last_off; - int dsize = 0; int ret = 0; int wret; - int i; u32 nritems; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); leaf = path->nodes[0]; - last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); - - for (i = 0; i < nr; i++) - dsize += btrfs_item_size_nr(leaf, slot + i); - nritems = btrfs_header_nritems(leaf); if (slot + nr != nritems) { - int data_end = leaf_data_end(root, leaf); + const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1); + const int data_end = leaf_data_end(leaf); + u32 dsize = 0; + int i; + + for (i = 0; i < nr; i++) + dsize += btrfs_item_size(leaf, slot + i); - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end + dsize, - btrfs_leaf_data(leaf) + data_end, - last_off - data_end); + memmove_leaf_data(leaf, data_end + dsize, data_end, + last_off - data_end); for (i = slot + nr; i < nritems; i++) { u32 ioff; - item = btrfs_item_nr(leaf, i); - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, item, - ioff + dsize, &token); + ioff = btrfs_item_offset(leaf, i); + btrfs_set_item_offset(leaf, i, ioff + dsize); } - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), - btrfs_item_nr_offset(slot + nr), - sizeof(struct btrfs_item) * - (nritems - slot - nr)); + memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr); } btrfs_set_header_nritems(leaf, nritems - nr); nritems -= nr; /* delete the leaf if we've emptied it */ if (nritems == 0) { - if (leaf == root->node) { - btrfs_set_header_level(leaf, 0); - } else { - btrfs_set_path_blocking(path); - clean_tree_block(trans, root, leaf); - btrfs_del_leaf(trans, root, path, leaf); + if (leaf != root->node) { + btrfs_clear_buffer_dirty(trans, leaf); + ret = btrfs_del_leaf(trans, root, path, leaf); + if (ret < 0) + return ret; } } else { int used = leaf_space_used(leaf, 0, nritems); @@ -4759,37 +4532,63 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_item_key(leaf, &disk_key, 0); - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); } - /* delete the leaf if it is mostly empty */ - if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) { + /* + * Try to delete the leaf if it is mostly empty. We do this by + * trying to move all its items into its left and right neighbours. + * If we can't move all the items, then we don't delete it - it's + * not ideal, but future insertions might fill the leaf with more + * items, or items from other leaves might be moved later into our + * leaf due to deletions on those leaves. + */ + if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) { + u32 min_push_space; + /* push_leaf_left fixes the path. * make sure the path still points to our leaf - * for possible call to del_ptr below + * for possible call to btrfs_del_ptr below */ slot = path->slots[1]; - extent_buffer_get(leaf); - - btrfs_set_path_blocking(path); - wret = push_leaf_left(trans, root, path, 1, 1, - 1, (u32)-1); + refcount_inc(&leaf->refs); + /* + * We want to be able to at least push one item to the + * left neighbour leaf, and that's the first item. 
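To make the "mostly empty" threshold above concrete: with a 16K nodesize, BTRFS_LEAF_DATA_SIZE() is just under 16K, so a leaf enters this merge path once item headers plus data occupy less than roughly 5K. A sketch of an equivalent standalone check, assuming btrfs_leaf_free_space() accounting matches leaf_space_used(); the helper name is invented here:

/* Sketch only: mirrors the used < BTRFS_LEAF_DATA_SIZE / 3 test above. */
static bool leaf_is_merge_candidate(struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	int free_space = btrfs_leaf_free_space(leaf);
	u32 used;

	if (free_space < 0)
		return false;	/* corrupted accounting, don't touch it */
	used = BTRFS_LEAF_DATA_SIZE(fs_info) - free_space;
	return used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3;
}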
+ */ + min_push_space = sizeof(struct btrfs_item) + + btrfs_item_size(leaf, 0); + wret = push_leaf_left(trans, root, path, 0, + min_push_space, 1, (u32)-1); if (wret < 0 && wret != -ENOSPC) ret = wret; if (path->nodes[0] == leaf && btrfs_header_nritems(leaf)) { - wret = push_leaf_right(trans, root, path, 1, - 1, 1, 0); + /* + * If we were not able to push all items from our + * leaf to its left neighbour, then attempt to + * either push all the remaining items to the + * right neighbour or none. There's no advantage + * in pushing only some items, instead of all, as + * it's pointless to end up with a leaf having + * too few items while the neighbours can be full + * or nearly full. + */ + nritems = btrfs_header_nritems(leaf); + min_push_space = leaf_space_used(leaf, 0, nritems); + wret = push_leaf_right(trans, root, path, 0, + min_push_space, 1, 0); if (wret < 0 && wret != -ENOSPC) ret = wret; } if (btrfs_header_nritems(leaf) == 0) { path->slots[1] = slot; - btrfs_del_leaf(trans, root, path, leaf); + ret = btrfs_del_leaf(trans, root, path, leaf); free_extent_buffer(leaf); - ret = 0; + if (ret < 0) + return ret; } else { /* if we're still in the path, make sure * we're dirty. Otherwise, one of the @@ -4797,67 +4596,25 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, * dirtied this buffer */ if (path->nodes[0] == leaf) - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); free_extent_buffer(leaf); } } else { - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } } return ret; } /* - * search the tree again to find a leaf with lesser keys - * returns 0 if it found something or 1 if there are no lesser leaves. - * returns < 0 on io errors. - * - * This may release the path, and so you may lose any locks held at the - * time you call it. - */ -int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) -{ - struct btrfs_key key; - struct btrfs_disk_key found_key; - int ret; - - btrfs_item_key_to_cpu(path->nodes[0], &key, 0); - - if (key.offset > 0) - key.offset--; - else if (key.type > 0) - key.type--; - else if (key.objectid > 0) - key.objectid--; - else - return 1; - - btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - return ret; - btrfs_item_key(path->nodes[0], &found_key, 0); - ret = comp_keys(&found_key, &key); - if (ret < 0) - return 0; - return 1; -} - -/* * A helper function to walk down the tree starting at min_key, and looking - * for nodes or leaves that are have a minimum transaction id. + * for leaves that have a minimum transaction id. * This is used by the btree defrag code, and tree logging * * This does not cow, but it does stuff the starting key it finds back * into min_key, so you can call btrfs_search_slot with cow=1 on the * key and get a writable path. * - * This does lock as it descends, and path->keep_locks should be set - * to 1 by the caller. - * - * This honors path->lowest_level to prevent descent past a given level - * of the tree. - * * min_trans indicates the oldest transaction that you are interested * in walking through. Any nodes or leaves older than min_trans are * skipped over (without reading them). @@ -4866,19 +4623,20 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) * was nothing in the tree that matched the search criteria. 
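A hedged caller sketch for the slimmed-down btrfs_search_forward() (after this change it takes no max_key, requires lowest_level == 0, and manages keep_locks itself); the key-stepping rule at the bottom is illustrative, real callers such as defrag advance the key according to their own needs:

/* Sketch only: visit items reachable through nodes newer than min_trans. */
static int walk_items_newer_than(struct btrfs_root *root, u64 min_trans)
{
	struct btrfs_path *path;
	struct btrfs_key key = { .objectid = 0, .type = 0, .offset = 0 };
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	while (1) {
		ret = btrfs_search_forward(root, &key, path, min_trans);
		if (ret) {
			if (ret > 0)	/* nothing matched, not an error */
				ret = 0;
			break;
		}
		/* key now holds the found item's key; process the slot here. */

		btrfs_release_path(path);
		if (key.offset == (u64)-1)
			break;	/* simplistic stop; a real caller steps type/objectid too */
		key.offset++;
	}
	btrfs_free_path(path);
	return ret;
}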
*/ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, - struct btrfs_key *max_key, struct btrfs_path *path, u64 min_trans) { struct extent_buffer *cur; - struct btrfs_key found_key; int slot; int sret; u32 nritems; int level; int ret = 1; + const bool keep_locks = path->keep_locks; - WARN_ON(!path->keep_locks); + ASSERT(!path->nowait); + ASSERT(path->lowest_level == 0); + path->keep_locks = true; again: cur = btrfs_read_lock_root_node(root); level = btrfs_header_level(cur); @@ -4893,28 +4651,31 @@ again: while (1) { nritems = btrfs_header_nritems(cur); level = btrfs_header_level(cur); - sret = bin_search(cur, min_key, level, &slot); + sret = btrfs_bin_search(cur, 0, min_key, &slot); + if (sret < 0) { + ret = sret; + goto out; + } - /* at the lowest level, we're done, setup the path and exit */ - if (level == path->lowest_level) { + /* At level 0 we're done, setup the path and exit. */ + if (level == 0) { if (slot >= nritems) goto find_next_key; ret = 0; path->slots[level] = slot; - btrfs_item_key_to_cpu(cur, &found_key, slot); + /* Save our key for returning back. */ + btrfs_item_key_to_cpu(cur, min_key, slot); goto out; } if (sret && slot > 0) slot--; /* * check this node pointer against the min_trans parameters. - * If it is too old, old, skip to the next one. + * If it is too old, skip to the next one. */ while (slot < nritems) { - u64 blockptr; u64 gen; - blockptr = btrfs_node_blockptr(cur, slot); gen = btrfs_node_ptr_generation(cur, slot); if (gen < min_trans) { slot++; @@ -4927,9 +4688,8 @@ find_next_key: * we didn't find a candidate key in this node, walk forward * and find another one */ + path->slots[level] = slot; if (slot >= nritems) { - path->slots[level] = slot; - btrfs_set_path_blocking(path); sret = btrfs_find_next_key(root, path, min_key, level, min_trans); if (sret == 0) { @@ -4939,457 +4699,22 @@ find_next_key: goto out; } } - /* save our key for returning back */ - btrfs_node_key_to_cpu(cur, &found_key, slot); - path->slots[level] = slot; - if (level == path->lowest_level) { - ret = 0; - unlock_up(path, level, 1, 0, NULL); + cur = btrfs_read_node_slot(cur, slot); + if (IS_ERR(cur)) { + ret = PTR_ERR(cur); goto out; } - btrfs_set_path_blocking(path); - cur = read_node_slot(root, cur, slot); - BUG_ON(!cur); /* -ENOMEM */ btrfs_tree_read_lock(cur); path->locks[level - 1] = BTRFS_READ_LOCK; path->nodes[level - 1] = cur; unlock_up(path, level, 1, 0, NULL); - btrfs_clear_path_blocking(path, NULL, 0); } out: + path->keep_locks = keep_locks; if (ret == 0) - memcpy(min_key, &found_key, sizeof(found_key)); - btrfs_set_path_blocking(path); - return ret; -} - -static void tree_move_down(struct btrfs_root *root, - struct btrfs_path *path, - int *level, int root_level) -{ - BUG_ON(*level == 0); - path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], - path->slots[*level]); - path->slots[*level - 1] = 0; - (*level)--; -} - -static int tree_move_next_or_upnext(struct btrfs_root *root, - struct btrfs_path *path, - int *level, int root_level) -{ - int ret = 0; - int nritems; - nritems = btrfs_header_nritems(path->nodes[*level]); - - path->slots[*level]++; - - while (path->slots[*level] >= nritems) { - if (*level == root_level) - return -1; - - /* move upnext */ - path->slots[*level] = 0; - free_extent_buffer(path->nodes[*level]); - path->nodes[*level] = NULL; - (*level)++; - path->slots[*level]++; - - nritems = btrfs_header_nritems(path->nodes[*level]); - ret = 1; - } - return ret; -} - -/* - * Returns 1 if it had to move up and next. 
0 is returned if it moved only next - * or down. - */ -static int tree_advance(struct btrfs_root *root, - struct btrfs_path *path, - int *level, int root_level, - int allow_down, - struct btrfs_key *key) -{ - int ret; - - if (*level == 0 || !allow_down) { - ret = tree_move_next_or_upnext(root, path, level, root_level); - } else { - tree_move_down(root, path, level, root_level); - ret = 0; - } - if (ret >= 0) { - if (*level == 0) - btrfs_item_key_to_cpu(path->nodes[*level], key, - path->slots[*level]); - else - btrfs_node_key_to_cpu(path->nodes[*level], key, - path->slots[*level]); - } - return ret; -} - -static int tree_compare_item(struct btrfs_root *left_root, - struct btrfs_path *left_path, - struct btrfs_path *right_path, - char *tmp_buf) -{ - int cmp; - int len1, len2; - unsigned long off1, off2; - - len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); - len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); - if (len1 != len2) - return 1; - - off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); - off2 = btrfs_item_ptr_offset(right_path->nodes[0], - right_path->slots[0]); - - read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); - - cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); - if (cmp) - return 1; - return 0; -} - -#define ADVANCE 1 -#define ADVANCE_ONLY_NEXT -1 - -/* - * This function compares two trees and calls the provided callback for - * every changed/new/deleted item it finds. - * If shared tree blocks are encountered, whole subtrees are skipped, making - * the compare pretty fast on snapshotted subvolumes. - * - * This currently works on commit roots only. As commit roots are read only, - * we don't do any locking. The commit roots are protected with transactions. - * Transactions are ended and rejoined when a commit is tried in between. - * - * This function checks for modifications done to the trees while comparing. - * If it detects a change, it aborts immediately. 
- */ -int btrfs_compare_trees(struct btrfs_root *left_root, - struct btrfs_root *right_root, - btrfs_changed_cb_t changed_cb, void *ctx) -{ - int ret; - int cmp; - struct btrfs_trans_handle *trans = NULL; - struct btrfs_path *left_path = NULL; - struct btrfs_path *right_path = NULL; - struct btrfs_key left_key; - struct btrfs_key right_key; - char *tmp_buf = NULL; - int left_root_level; - int right_root_level; - int left_level; - int right_level; - int left_end_reached; - int right_end_reached; - int advance_left; - int advance_right; - u64 left_blockptr; - u64 right_blockptr; - u64 left_start_ctransid; - u64 right_start_ctransid; - u64 ctransid; - - left_path = btrfs_alloc_path(); - if (!left_path) { - ret = -ENOMEM; - goto out; - } - right_path = btrfs_alloc_path(); - if (!right_path) { - ret = -ENOMEM; - goto out; - } - - tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS); - if (!tmp_buf) { - ret = -ENOMEM; - goto out; - } - - left_path->search_commit_root = 1; - left_path->skip_locking = 1; - right_path->search_commit_root = 1; - right_path->skip_locking = 1; - - spin_lock(&left_root->root_item_lock); - left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); - spin_unlock(&left_root->root_item_lock); - - spin_lock(&right_root->root_item_lock); - right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); - spin_unlock(&right_root->root_item_lock); - - trans = btrfs_join_transaction(left_root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - - /* - * Strategy: Go to the first items of both trees. Then do - * - * If both trees are at level 0 - * Compare keys of current items - * If left < right treat left item as new, advance left tree - * and repeat - * If left > right treat right item as deleted, advance right tree - * and repeat - * If left == right do deep compare of items, treat as changed if - * needed, advance both trees and repeat - * If both trees are at the same level but not at level 0 - * Compare keys of current nodes/leafs - * If left < right advance left tree and repeat - * If left > right advance right tree and repeat - * If left == right compare blockptrs of the next nodes/leafs - * If they match advance both trees but stay at the same level - * and repeat - * If they don't match advance both trees while allowing to go - * deeper and repeat - * If tree levels are different - * Advance the tree that needs it and repeat - * - * Advancing a tree means: - * If we are at level 0, try to go to the next slot. If that's not - * possible, go one level up and repeat. Stop when we found a level - * where we could go to the next slot. We may at this point be on a - * node or a leaf. - * - * If we are not at level 0 and not on shared tree blocks, go one - * level deeper. - * - * If we are not at level 0 and on shared tree blocks, go one slot to - * the right if possible or go up and right. 
- */ - - left_level = btrfs_header_level(left_root->commit_root); - left_root_level = left_level; - left_path->nodes[left_level] = left_root->commit_root; - extent_buffer_get(left_path->nodes[left_level]); - - right_level = btrfs_header_level(right_root->commit_root); - right_root_level = right_level; - right_path->nodes[right_level] = right_root->commit_root; - extent_buffer_get(right_path->nodes[right_level]); - - if (left_level == 0) - btrfs_item_key_to_cpu(left_path->nodes[left_level], - &left_key, left_path->slots[left_level]); - else - btrfs_node_key_to_cpu(left_path->nodes[left_level], - &left_key, left_path->slots[left_level]); - if (right_level == 0) - btrfs_item_key_to_cpu(right_path->nodes[right_level], - &right_key, right_path->slots[right_level]); - else - btrfs_node_key_to_cpu(right_path->nodes[right_level], - &right_key, right_path->slots[right_level]); - - left_end_reached = right_end_reached = 0; - advance_left = advance_right = 0; - - while (1) { - /* - * We need to make sure the transaction does not get committed - * while we do anything on commit roots. This means, we need to - * join and leave transactions for every item that we process. - */ - if (trans && btrfs_should_end_transaction(trans, left_root)) { - btrfs_release_path(left_path); - btrfs_release_path(right_path); - - ret = btrfs_end_transaction(trans, left_root); - trans = NULL; - if (ret < 0) - goto out; - } - /* now rejoin the transaction */ - if (!trans) { - trans = btrfs_join_transaction(left_root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - - spin_lock(&left_root->root_item_lock); - ctransid = btrfs_root_ctransid(&left_root->root_item); - spin_unlock(&left_root->root_item_lock); - if (ctransid != left_start_ctransid) - left_start_ctransid = 0; - - spin_lock(&right_root->root_item_lock); - ctransid = btrfs_root_ctransid(&right_root->root_item); - spin_unlock(&right_root->root_item_lock); - if (ctransid != right_start_ctransid) - right_start_ctransid = 0; - - if (!left_start_ctransid || !right_start_ctransid) { - WARN(1, KERN_WARNING - "btrfs: btrfs_compare_tree detected " - "a change in one of the trees while " - "iterating. 
This is probably a " - "bug.\n"); - ret = -EIO; - goto out; - } - - /* - * the commit root may have changed, so start again - * where we stopped - */ - left_path->lowest_level = left_level; - right_path->lowest_level = right_level; - ret = btrfs_search_slot(NULL, left_root, - &left_key, left_path, 0, 0); - if (ret < 0) - goto out; - ret = btrfs_search_slot(NULL, right_root, - &right_key, right_path, 0, 0); - if (ret < 0) - goto out; - } - - if (advance_left && !left_end_reached) { - ret = tree_advance(left_root, left_path, &left_level, - left_root_level, - advance_left != ADVANCE_ONLY_NEXT, - &left_key); - if (ret < 0) - left_end_reached = ADVANCE; - advance_left = 0; - } - if (advance_right && !right_end_reached) { - ret = tree_advance(right_root, right_path, &right_level, - right_root_level, - advance_right != ADVANCE_ONLY_NEXT, - &right_key); - if (ret < 0) - right_end_reached = ADVANCE; - advance_right = 0; - } - - if (left_end_reached && right_end_reached) { - ret = 0; - goto out; - } else if (left_end_reached) { - if (right_level == 0) { - ret = changed_cb(left_root, right_root, - left_path, right_path, - &right_key, - BTRFS_COMPARE_TREE_DELETED, - ctx); - if (ret < 0) - goto out; - } - advance_right = ADVANCE; - continue; - } else if (right_end_reached) { - if (left_level == 0) { - ret = changed_cb(left_root, right_root, - left_path, right_path, - &left_key, - BTRFS_COMPARE_TREE_NEW, - ctx); - if (ret < 0) - goto out; - } - advance_left = ADVANCE; - continue; - } - - if (left_level == 0 && right_level == 0) { - cmp = btrfs_comp_cpu_keys(&left_key, &right_key); - if (cmp < 0) { - ret = changed_cb(left_root, right_root, - left_path, right_path, - &left_key, - BTRFS_COMPARE_TREE_NEW, - ctx); - if (ret < 0) - goto out; - advance_left = ADVANCE; - } else if (cmp > 0) { - ret = changed_cb(left_root, right_root, - left_path, right_path, - &right_key, - BTRFS_COMPARE_TREE_DELETED, - ctx); - if (ret < 0) - goto out; - advance_right = ADVANCE; - } else { - WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); - ret = tree_compare_item(left_root, left_path, - right_path, tmp_buf); - if (ret) { - WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); - ret = changed_cb(left_root, right_root, - left_path, right_path, - &left_key, - BTRFS_COMPARE_TREE_CHANGED, - ctx); - if (ret < 0) - goto out; - } - advance_left = ADVANCE; - advance_right = ADVANCE; - } - } else if (left_level == right_level) { - cmp = btrfs_comp_cpu_keys(&left_key, &right_key); - if (cmp < 0) { - advance_left = ADVANCE; - } else if (cmp > 0) { - advance_right = ADVANCE; - } else { - left_blockptr = btrfs_node_blockptr( - left_path->nodes[left_level], - left_path->slots[left_level]); - right_blockptr = btrfs_node_blockptr( - right_path->nodes[right_level], - right_path->slots[right_level]); - if (left_blockptr == right_blockptr) { - /* - * As we're on a shared block, don't - * allow to go deeper. 
- */ - advance_left = ADVANCE_ONLY_NEXT; - advance_right = ADVANCE_ONLY_NEXT; - } else { - advance_left = ADVANCE; - advance_right = ADVANCE; - } - } - } else if (left_level < right_level) { - advance_right = ADVANCE; - } else { - advance_left = ADVANCE; - } - } - -out: - btrfs_free_path(left_path); - btrfs_free_path(right_path); - kfree(tmp_buf); - - if (trans) { - if (!ret) - ret = btrfs_end_transaction(trans, left_root); - else - btrfs_end_transaction(trans, left_root); - } - + btrfs_unlock_up_safe(path, 1); return ret; } @@ -5401,7 +4726,7 @@ out: * 0 is returned if another key is found, < 0 if there are any errors * and 1 is returned if there are no higher keys in the tree * - * path->keep_locks should be set to 1 on the search made before + * path->keep_locks should be set to true on the search made before * calling this function. */ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, @@ -5410,7 +4735,7 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, int slot; struct extent_buffer *c; - WARN_ON(!path->keep_locks); + WARN_ON(!path->keep_locks && !path->skip_locking); while (level < BTRFS_MAX_LEVEL) { if (!path->nodes[level]) return 1; @@ -5426,7 +4751,7 @@ next: !path->nodes[level + 1]) return 1; - if (path->locks[level + 1]) { + if (path->locks[level + 1] || path->skip_locking) { level++; continue; } @@ -5469,16 +4794,6 @@ next: return 1; } -/* - * search the tree again to find a leaf with greater keys - * returns 0 if it found something or 1 if there are no greater leaves. - * returns < 0 on io errors. - */ -int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) -{ - return btrfs_next_old_leaf(root, path, 0); -} - int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq) { @@ -5486,11 +4801,19 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, int level; struct extent_buffer *c; struct extent_buffer *next; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; + bool need_commit_sem = false; u32 nritems; int ret; - int old_spinning = path->leave_spinning; - int next_rw_lock = 0; + int i; + + /* + * The nowait semantics are used only for write paths, where we don't + * use the tree mod log and sequence numbers. + */ + if (time_seq) + ASSERT(!path->nowait); nritems = btrfs_header_nritems(path->nodes[0]); if (nritems == 0) @@ -5500,33 +4823,51 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, again: level = 1; next = NULL; - next_rw_lock = 0; btrfs_release_path(path); - path->keep_locks = 1; - path->leave_spinning = 1; + path->keep_locks = true; - if (time_seq) + if (time_seq) { ret = btrfs_search_old_slot(root, &key, path, time_seq); - else + } else { + if (path->need_commit_sem) { + path->need_commit_sem = false; + need_commit_sem = true; + if (path->nowait) { + if (!down_read_trylock(&fs_info->commit_root_sem)) { + ret = -EAGAIN; + goto done; + } + } else { + down_read(&fs_info->commit_root_sem); + } + } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - path->keep_locks = 0; + } + path->keep_locks = false; if (ret < 0) - return ret; + goto done; nritems = btrfs_header_nritems(path->nodes[0]); /* - * by releasing the path above we dropped all our locks. A balance - * could have added more items next to the key that used to be - * at the very end of the block. So, check again here and - * advance the path if there are now more items available. + * By releasing the path above we dropped all our locks. 
A balance + * could have happened and + * + * 1. added more items after the previous last item + * 2. deleted the previous last item + * + * So, check again here and advance the path if there are now more + * items available. */ - if (nritems > 0 && path->slots[0] < nritems - 1) { - if (ret == 0) + if (nritems > 0 && path->slots[0] <= nritems - 1) { + if (ret == 0 && path->slots[0] != nritems - 1) { path->slots[0]++; - ret = 0; - goto done; + goto done; + } else if (ret > 0) { + ret = 0; + goto done; + } } while (level < BTRFS_MAX_LEVEL) { @@ -5546,16 +4887,24 @@ again: continue; } - if (next) { - btrfs_tree_unlock_rw(next, next_rw_lock); - free_extent_buffer(next); + + /* + * Our current level is where we're going to start from, and to + * make sure lockdep doesn't complain we need to drop our locks + * and nodes from 0 to our current level. + */ + for (i = 0; i < level; i++) { + if (path->locks[level]) { + btrfs_tree_read_unlock(path->nodes[i]); + path->locks[i] = 0; + } + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; } next = c; - next_rw_lock = path->locks[level]; - ret = read_block_for_search(NULL, root, path, &next, level, - slot, &key, 0); - if (ret == -EAGAIN) + ret = read_block_for_search(root, path, &next, slot, &key); + if (ret == -EAGAIN && !path->nowait) goto again; if (ret < 0) { @@ -5565,6 +4914,10 @@ again: if (!path->skip_locking) { ret = btrfs_try_tree_read_lock(next); + if (!ret && path->nowait) { + ret = -EAGAIN; + goto done; + } if (!ret && time_seq) { /* * If we don't get the lock, we may be racing @@ -5578,34 +4931,23 @@ again: cond_resched(); goto again; } - if (!ret) { - btrfs_set_path_blocking(path); + if (!ret) btrfs_tree_read_lock(next); - btrfs_clear_path_blocking(path, next, - BTRFS_READ_LOCK); - } - next_rw_lock = BTRFS_READ_LOCK; } break; } path->slots[level] = slot; while (1) { level--; - c = path->nodes[level]; - if (path->locks[level]) - btrfs_tree_unlock_rw(c, path->locks[level]); - - free_extent_buffer(c); path->nodes[level] = next; path->slots[level] = 0; if (!path->skip_locking) - path->locks[level] = next_rw_lock; + path->locks[level] = BTRFS_READ_LOCK; if (!level) break; - ret = read_block_for_search(NULL, root, path, &next, level, - 0, &key, 0); - if (ret == -EAGAIN) + ret = read_block_for_search(root, path, &next, 0, &key); + if (ret == -EAGAIN && !path->nowait) goto again; if (ret < 0) { @@ -5614,26 +4956,40 @@ again: } if (!path->skip_locking) { - ret = btrfs_try_tree_read_lock(next); - if (!ret) { - btrfs_set_path_blocking(path); + if (path->nowait) { + if (!btrfs_try_tree_read_lock(next)) { + ret = -EAGAIN; + goto done; + } + } else { btrfs_tree_read_lock(next); - btrfs_clear_path_blocking(path, next, - BTRFS_READ_LOCK); } - next_rw_lock = BTRFS_READ_LOCK; } } ret = 0; done: unlock_up(path, 0, 1, 0, NULL); - path->leave_spinning = old_spinning; - if (!old_spinning) - btrfs_set_path_blocking(path); + if (need_commit_sem) { + int ret2; + + path->need_commit_sem = true; + ret2 = finish_need_commit_sem_search(path); + up_read(&fs_info->commit_root_sem); + if (ret2) + ret = ret2; + } return ret; } +int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq) +{ + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) + return btrfs_next_old_leaf(root, path, time_seq); + return 0; +} + /* * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps * searching until it gets past min_objectid or finds an item of 'type' @@ -5651,7 +5007,6 @@ int btrfs_previous_item(struct 
btrfs_root *root, while (1) { if (path->slots[0] == 0) { - btrfs_set_path_blocking(path); ret = btrfs_prev_leaf(root, path); if (ret != 0) return ret; @@ -5676,3 +5031,58 @@ int btrfs_previous_item(struct btrfs_root *root, } return 1; } + +/* + * search in the extent tree to find a previous Metadata/Data extent item with + * min objectid. + * + * returns 0 if something is found, 1 if nothing was found and < 0 on error + */ +int btrfs_previous_extent_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid) +{ + struct btrfs_key found_key; + struct extent_buffer *leaf; + u32 nritems; + int ret; + + while (1) { + if (path->slots[0] == 0) { + ret = btrfs_prev_leaf(root, path); + if (ret != 0) + return ret; + } else { + path->slots[0]--; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (nritems == 0) + return 1; + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid < min_objectid) + break; + if (found_key.type == BTRFS_EXTENT_ITEM_KEY || + found_key.type == BTRFS_METADATA_ITEM_KEY) + return 0; + if (found_key.objectid == min_objectid && + found_key.type < BTRFS_EXTENT_ITEM_KEY) + break; + } + return 1; +} + +int __init btrfs_ctree_init(void) +{ + btrfs_path_cachep = KMEM_CACHE(btrfs_path, 0); + if (!btrfs_path_cachep) + return -ENOMEM; + return 0; +} + +void __cold btrfs_ctree_exit(void) +{ + kmem_cache_destroy(btrfs_path_cachep); +}
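Tying the iteration helpers together (btrfs_next_old_item() added above, btrfs_prev_leaf() underpinning btrfs_previous_item() and btrfs_previous_extent_item()), the canonical forward scan over a tree looks as follows; a minimal sketch using btrfs_next_old_leaf() with time_seq == 0, with the start key and per-item processing left abstract:

/* Sketch only: walk every item of a tree starting at *start. */
static int scan_tree_from(struct btrfs_root *root,
			  const struct btrfs_key *start)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, start, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
			ret = btrfs_next_old_leaf(root, path, 0);
			if (ret)	/* 1: no more leaves, < 0: error */
				break;
		}
		/* ... process (path->nodes[0], path->slots[0]) here ... */
		path->slots[0]++;
	}
	if (ret > 0)
		ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}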
