65 files changed, 4462 insertions, 3536 deletions
| diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9a0ff3384381..e738f6206ea5 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -11,7 +11,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \  	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \  	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \  	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ -	   block-rsv.o delalloc-space.o block-group.o discard.o +	   block-rsv.o delalloc-space.o block-group.o discard.o reflink.o  btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o  btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 1d32a07bb2d1..309516e6a968 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -395,3 +395,11 @@ void btrfs_set_work_high_priority(struct btrfs_work *work)  {  	set_bit(WORK_HIGH_PRIO_BIT, &work->flags);  } + +void btrfs_flush_workqueue(struct btrfs_workqueue *wq) +{ +	if (wq->high) +		flush_workqueue(wq->high->normal_wq); + +	flush_workqueue(wq->normal->normal_wq); +} diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index a4434301d84d..3204daa51b95 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -44,5 +44,6 @@ void btrfs_set_work_high_priority(struct btrfs_work *work);  struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work);  struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);  bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq); +void btrfs_flush_workqueue(struct btrfs_workqueue *wq);  #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index e5d85311d5d5..9c380e7edf62 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -347,33 +347,10 @@ static int add_prelim_ref(const struct btrfs_fs_info *fs_info,  		return -ENOMEM;  	ref->root_id = root_id; -	if (key) { +	if (key)  		ref->key_for_search = *key; -		/* -		 * We can often find data backrefs with an offset that is too -		 * large (>= LLONG_MAX, maximum allowed file offset) due to -		 * underflows when subtracting a file's offset with the data -		 * offset of its corresponding extent data item. This can -		 * happen for example in the clone ioctl. -		 * So if we detect such case we set the search key's offset to -		 * zero to make sure we will find the matching file extent item -		 * at add_all_parents(), otherwise we will miss it because the -		 * offset taken form the backref is much larger then the offset -		 * of the file extent item. This can make us scan a very large -		 * number of file extent items, but at least it will not make -		 * us miss any. -		 * This is an ugly workaround for a behaviour that should have -		 * never existed, but it does and a fix for the clone ioctl -		 * would touch a lot of places, cause backwards incompatibility -		 * and would not fix the problem for extents cloned with older -		 * kernels. 
-		 */ -		if (ref->key_for_search.type == BTRFS_EXTENT_DATA_KEY && -		    ref->key_for_search.offset >= LLONG_MAX) -			ref->key_for_search.offset = 0; -	} else { +	else  		memset(&ref->key_for_search, 0, sizeof(ref->key_for_search)); -	}  	ref->inode_list = NULL;  	ref->level = level; @@ -409,10 +386,36 @@ static int add_indirect_ref(const struct btrfs_fs_info *fs_info,  			      wanted_disk_byte, count, sc, gfp_mask);  } +static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr) +{ +	struct rb_node **p = &preftrees->direct.root.rb_root.rb_node; +	struct rb_node *parent = NULL; +	struct prelim_ref *ref = NULL; +	struct prelim_ref target = {0}; +	int result; + +	target.parent = bytenr; + +	while (*p) { +		parent = *p; +		ref = rb_entry(parent, struct prelim_ref, rbnode); +		result = prelim_ref_compare(ref, &target); + +		if (result < 0) +			p = &(*p)->rb_left; +		else if (result > 0) +			p = &(*p)->rb_right; +		else +			return 1; +	} +	return 0; +} +  static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, -			   struct ulist *parents, struct prelim_ref *ref, +			   struct ulist *parents, +			   struct preftrees *preftrees, struct prelim_ref *ref,  			   int level, u64 time_seq, const u64 *extent_item_pos, -			   u64 total_refs, bool ignore_offset) +			   bool ignore_offset)  {  	int ret = 0;  	int slot; @@ -424,6 +427,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,  	u64 disk_byte;  	u64 wanted_disk_byte = ref->wanted_disk_byte;  	u64 count = 0; +	u64 data_offset;  	if (level != 0) {  		eb = path->nodes[level]; @@ -434,18 +438,26 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,  	}  	/* -	 * We normally enter this function with the path already pointing to -	 * the first item to check. But sometimes, we may enter it with -	 * slot==nritems. In that case, go to the next leaf before we continue. +	 * 1. We normally enter this function with the path already pointing to +	 *    the first item to check. But sometimes, we may enter it with +	 *    slot == nritems. +	 * 2. We are searching for normal backref but bytenr of this leaf +	 *    matches shared data backref +	 * 3. The leaf owner is not equal to the root we are searching +	 * +	 * For these cases, go to the next leaf before we continue.  	 
*/ -	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { +	eb = path->nodes[0]; +	if (path->slots[0] >= btrfs_header_nritems(eb) || +	    is_shared_data_backref(preftrees, eb->start) || +	    ref->root_id != btrfs_header_owner(eb)) {  		if (time_seq == SEQ_LAST)  			ret = btrfs_next_leaf(root, path);  		else  			ret = btrfs_next_old_leaf(root, path, time_seq);  	} -	while (!ret && count < total_refs) { +	while (!ret && count < ref->count) {  		eb = path->nodes[0];  		slot = path->slots[0]; @@ -455,13 +467,31 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,  		    key.type != BTRFS_EXTENT_DATA_KEY)  			break; +		/* +		 * We are searching for normal backref but bytenr of this leaf +		 * matches shared data backref, OR +		 * the leaf owner is not equal to the root we are searching for +		 */ +		if (slot == 0 && +		    (is_shared_data_backref(preftrees, eb->start) || +		     ref->root_id != btrfs_header_owner(eb))) { +			if (time_seq == SEQ_LAST) +				ret = btrfs_next_leaf(root, path); +			else +				ret = btrfs_next_old_leaf(root, path, time_seq); +			continue; +		}  		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);  		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); +		data_offset = btrfs_file_extent_offset(eb, fi);  		if (disk_byte == wanted_disk_byte) {  			eie = NULL;  			old = NULL; -			count++; +			if (ref->key_for_search.offset == key.offset - data_offset) +				count++; +			else +				goto next;  			if (extent_item_pos) {  				ret = check_extent_in_eb(&key, eb, fi,  						*extent_item_pos, @@ -502,9 +532,9 @@ next:   */  static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,  				struct btrfs_path *path, u64 time_seq, +				struct preftrees *preftrees,  				struct prelim_ref *ref, struct ulist *parents, -				const u64 *extent_item_pos, u64 total_refs, -				bool ignore_offset) +				const u64 *extent_item_pos, bool ignore_offset)  {  	struct btrfs_root *root;  	struct btrfs_key root_key; @@ -512,23 +542,25 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,  	int ret = 0;  	int root_level;  	int level = ref->level; -	int index; +	struct btrfs_key search_key = ref->key_for_search;  	root_key.objectid = ref->root_id;  	root_key.type = BTRFS_ROOT_ITEM_KEY;  	root_key.offset = (u64)-1; -	index = srcu_read_lock(&fs_info->subvol_srcu); -  	root = btrfs_get_fs_root(fs_info, &root_key, false);  	if (IS_ERR(root)) { -		srcu_read_unlock(&fs_info->subvol_srcu, index);  		ret = PTR_ERR(root); +		goto out_free; +	} + +	if (!path->search_commit_root && +	    test_bit(BTRFS_ROOT_DELETING, &root->state)) { +		ret = -ENOENT;  		goto out;  	}  	if (btrfs_is_testing(fs_info)) { -		srcu_read_unlock(&fs_info->subvol_srcu, index);  		ret = -ENOENT;  		goto out;  	} @@ -540,21 +572,36 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,  	else  		root_level = btrfs_old_root_level(root, time_seq); -	if (root_level + 1 == level) { -		srcu_read_unlock(&fs_info->subvol_srcu, index); +	if (root_level + 1 == level)  		goto out; -	} +	/* +	 * We can often find data backrefs with an offset that is too large +	 * (>= LLONG_MAX, maximum allowed file offset) due to underflows when +	 * subtracting a file's offset with the data offset of its +	 * corresponding extent data item. This can happen for example in the +	 * clone ioctl. 
+	 * +	 * So if we detect such case we set the search key's offset to zero to +	 * make sure we will find the matching file extent item at +	 * add_all_parents(), otherwise we will miss it because the offset +	 * taken form the backref is much larger then the offset of the file +	 * extent item. This can make us scan a very large number of file +	 * extent items, but at least it will not make us miss any. +	 * +	 * This is an ugly workaround for a behaviour that should have never +	 * existed, but it does and a fix for the clone ioctl would touch a lot +	 * of places, cause backwards incompatibility and would not fix the +	 * problem for extents cloned with older kernels. +	 */ +	if (search_key.type == BTRFS_EXTENT_DATA_KEY && +	    search_key.offset >= LLONG_MAX) +		search_key.offset = 0;  	path->lowest_level = level;  	if (time_seq == SEQ_LAST) -		ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path, -					0, 0); +		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);  	else -		ret = btrfs_search_old_slot(root, &ref->key_for_search, path, -					    time_seq); - -	/* root node has been locked, we can release @subvol_srcu safely here */ -	srcu_read_unlock(&fs_info->subvol_srcu, index); +		ret = btrfs_search_old_slot(root, &search_key, path, time_seq);  	btrfs_debug(fs_info,  		"search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)", @@ -574,9 +621,11 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,  		eb = path->nodes[level];  	} -	ret = add_all_parents(root, path, parents, ref, level, time_seq, -			      extent_item_pos, total_refs, ignore_offset); +	ret = add_all_parents(root, path, parents, preftrees, ref, level, +			      time_seq, extent_item_pos, ignore_offset);  out: +	btrfs_put_root(root); +out_free:  	path->lowest_level = 0;  	btrfs_release_path(path);  	return ret; @@ -609,7 +658,7 @@ unode_aux_to_inode_list(struct ulist_node *node)  static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,  				 struct btrfs_path *path, u64 time_seq,  				 struct preftrees *preftrees, -				 const u64 *extent_item_pos, u64 total_refs, +				 const u64 *extent_item_pos,  				 struct share_check *sc, bool ignore_offset)  {  	int err; @@ -653,9 +702,9 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,  			ret = BACKREF_FOUND_SHARED;  			goto out;  		} -		err = resolve_indirect_ref(fs_info, path, time_seq, ref, -					   parents, extent_item_pos, -					   total_refs, ignore_offset); +		err = resolve_indirect_ref(fs_info, path, time_seq, preftrees, +					   ref, parents, extent_item_pos, +					   ignore_offset);  		/*  		 * we can only tolerate ENOENT,otherwise,we should catch error  		 * and return directly. 
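As context for the comment moved into resolve_indirect_ref() above, here is a small self-contained C program (hypothetical values, plain userspace C, not kernel code) showing how the clone ioctl's u64 subtraction can wrap the backref offset past LLONG_MAX, and the clamp the patch applies to the search key before btrfs_search_slot():

#include <stdio.h>
#include <stdint.h>
#include <limits.h>

int main(void)
{
	/*
	 * Hypothetical clone: the destination file offset is smaller than the
	 * data offset into the shared extent, so the u64 subtraction wraps.
	 */
	uint64_t file_offset = 0;		/* offset of the cloned range in the file */
	uint64_t extent_data_offset = 4096;	/* offset into the shared extent */
	uint64_t backref_offset = file_offset - extent_data_offset;	/* wraps to a huge value */
	uint64_t search_offset = backref_offset;

	/* Same clamp the patch applies to search_key.offset. */
	if (search_offset >= (uint64_t)LLONG_MAX)
		search_offset = 0;

	printf("backref offset %llu -> search offset %llu\n",
	       (unsigned long long)backref_offset,
	       (unsigned long long)search_offset);
	return 0;
}

With the clamped offset the search lands at the start of the inode's extent items, so add_all_parents() can still find the matching file extent item instead of missing it entirely.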
@@ -758,8 +807,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,   */  static int add_delayed_refs(const struct btrfs_fs_info *fs_info,  			    struct btrfs_delayed_ref_head *head, u64 seq, -			    struct preftrees *preftrees, u64 *total_refs, -			    struct share_check *sc) +			    struct preftrees *preftrees, struct share_check *sc)  {  	struct btrfs_delayed_ref_node *node;  	struct btrfs_delayed_extent_op *extent_op = head->extent_op; @@ -793,7 +841,6 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,  		default:  			BUG();  		} -		*total_refs += count;  		switch (node->type) {  		case BTRFS_TREE_BLOCK_REF_KEY: {  			/* NORMAL INDIRECT METADATA backref */ @@ -876,7 +923,7 @@ out:  static int add_inline_refs(const struct btrfs_fs_info *fs_info,  			   struct btrfs_path *path, u64 bytenr,  			   int *info_level, struct preftrees *preftrees, -			   u64 *total_refs, struct share_check *sc) +			   struct share_check *sc)  {  	int ret = 0;  	int slot; @@ -900,7 +947,6 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,  	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);  	flags = btrfs_extent_flags(leaf, ei); -	*total_refs += btrfs_extent_refs(leaf, ei);  	btrfs_item_key_to_cpu(leaf, &found_key, slot);  	ptr = (unsigned long)(ei + 1); @@ -1125,8 +1171,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,  	struct prelim_ref *ref;  	struct rb_node *node;  	struct extent_inode_elem *eie = NULL; -	/* total of both direct AND indirect refs! */ -	u64 total_refs = 0;  	struct preftrees preftrees = {  		.direct = PREFTREE_INIT,  		.indirect = PREFTREE_INIT, @@ -1195,7 +1239,7 @@ again:  			}  			spin_unlock(&delayed_refs->lock);  			ret = add_delayed_refs(fs_info, head, time_seq, -					       &preftrees, &total_refs, sc); +					       &preftrees, sc);  			mutex_unlock(&head->mutex);  			if (ret)  				goto out; @@ -1216,8 +1260,7 @@ again:  		    (key.type == BTRFS_EXTENT_ITEM_KEY ||  		     key.type == BTRFS_METADATA_ITEM_KEY)) {  			ret = add_inline_refs(fs_info, path, bytenr, -					      &info_level, &preftrees, -					      &total_refs, sc); +					      &info_level, &preftrees, sc);  			if (ret)  				goto out;  			ret = add_keyed_refs(fs_info, path, bytenr, info_level, @@ -1236,7 +1279,7 @@ again:  	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root));  	ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, -				    extent_item_pos, total_refs, sc, ignore_offset); +				    extent_item_pos, sc, ignore_offset);  	if (ret)  		goto out; @@ -1362,10 +1405,10 @@ static void free_leaf_list(struct ulist *blocks)   *   * returns 0 on success, <0 on error   */ -static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, -				struct btrfs_fs_info *fs_info, u64 bytenr, -				u64 time_seq, struct ulist **leafs, -				const u64 *extent_item_pos, bool ignore_offset) +int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, +			 struct btrfs_fs_info *fs_info, u64 bytenr, +			 u64 time_seq, struct ulist **leafs, +			 const u64 *extent_item_pos, bool ignore_offset)  {  	int ret; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 777f61dc081e..723d6da99114 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -40,6 +40,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,  int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); +int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, +			 struct btrfs_fs_info *fs_info, u64 bytenr, +			 u64 time_seq, struct ulist 
**leafs, +			 const u64 *extent_item_pos, bool ignore_offset);  int btrfs_find_all_roots(struct btrfs_trans_handle *trans,  			 struct btrfs_fs_info *fs_info, u64 bytenr,  			 u64 time_seq, struct ulist **roots, bool ignore_offset); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 7f09147872dc..786849fcc319 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -460,7 +460,7 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end  	int ret;  	while (start < end) { -		ret = find_first_extent_bit(info->pinned_extents, start, +		ret = find_first_extent_bit(&info->excluded_extents, start,  					    &extent_start, &extent_end,  					    EXTENT_DIRTY | EXTENT_UPTODATE,  					    NULL); @@ -1248,6 +1248,55 @@ out:  	return ret;  } +static bool clean_pinned_extents(struct btrfs_trans_handle *trans, +				 struct btrfs_block_group *bg) +{ +	struct btrfs_fs_info *fs_info = bg->fs_info; +	struct btrfs_transaction *prev_trans = NULL; +	const u64 start = bg->start; +	const u64 end = start + bg->length - 1; +	int ret; + +	spin_lock(&fs_info->trans_lock); +	if (trans->transaction->list.prev != &fs_info->trans_list) { +		prev_trans = list_last_entry(&trans->transaction->list, +					     struct btrfs_transaction, list); +		refcount_inc(&prev_trans->use_count); +	} +	spin_unlock(&fs_info->trans_lock); + +	/* +	 * Hold the unused_bg_unpin_mutex lock to avoid racing with +	 * btrfs_finish_extent_commit(). If we are at transaction N, another +	 * task might be running finish_extent_commit() for the previous +	 * transaction N - 1, and have seen a range belonging to the block +	 * group in pinned_extents before we were able to clear the whole block +	 * group range from pinned_extents. This means that task can lookup for +	 * the block group after we unpinned it from pinned_extents and removed +	 * it, leading to a BUG_ON() at unpin_extent_range(). +	 */ +	mutex_lock(&fs_info->unused_bg_unpin_mutex); +	if (prev_trans) { +		ret = clear_extent_bits(&prev_trans->pinned_extents, start, end, +					EXTENT_DIRTY); +		if (ret) +			goto err; +	} + +	ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end, +				EXTENT_DIRTY); +	if (ret) +		goto err; +	mutex_unlock(&fs_info->unused_bg_unpin_mutex); + +	return true; + +err: +	mutex_unlock(&fs_info->unused_bg_unpin_mutex); +	btrfs_dec_block_group_ro(bg); +	return false; +} +  /*   * Process the unused_bgs list and remove any that don't have any allocated   * space inside of them. @@ -1265,7 +1314,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)  	spin_lock(&fs_info->unused_bgs_lock);  	while (!list_empty(&fs_info->unused_bgs)) { -		u64 start, end;  		int trimming;  		block_group = list_first_entry(&fs_info->unused_bgs, @@ -1344,35 +1392,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)  		 * We could have pending pinned extents for this block group,  		 * just delete them, we don't care about them anymore.  		 */ -		start = block_group->start; -		end = start + block_group->length - 1; -		/* -		 * Hold the unused_bg_unpin_mutex lock to avoid racing with -		 * btrfs_finish_extent_commit(). If we are at transaction N, -		 * another task might be running finish_extent_commit() for the -		 * previous transaction N - 1, and have seen a range belonging -		 * to the block group in freed_extents[] before we were able to -		 * clear the whole block group range from freed_extents[]. 
This -		 * means that task can lookup for the block group after we -		 * unpinned it from freed_extents[] and removed it, leading to -		 * a BUG_ON() at btrfs_unpin_extent_range(). -		 */ -		mutex_lock(&fs_info->unused_bg_unpin_mutex); -		ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, -				  EXTENT_DIRTY); -		if (ret) { -			mutex_unlock(&fs_info->unused_bg_unpin_mutex); -			btrfs_dec_block_group_ro(block_group); +		if (!clean_pinned_extents(trans, block_group))  			goto end_trans; -		} -		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, -				  EXTENT_DIRTY); -		if (ret) { -			mutex_unlock(&fs_info->unused_bg_unpin_mutex); -			btrfs_dec_block_group_ro(block_group); -			goto end_trans; -		} -		mutex_unlock(&fs_info->unused_bg_unpin_mutex);  		/*  		 * At this point, the block_group is read only and should fail @@ -1987,6 +2008,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)  		btrfs_release_path(path);  	} +	rcu_read_lock();  	list_for_each_entry_rcu(space_info, &info->space_info, list) {  		if (!(btrfs_get_alloc_profile(info, space_info->flags) &  		      (BTRFS_BLOCK_GROUP_RAID10 | @@ -2007,6 +2029,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)  				list)  			inc_block_group_ro(cache, 1);  	} +	rcu_read_unlock();  	btrfs_init_global_block_rsv(info);  	ret = check_chunk_block_group_mappings(info); @@ -2345,7 +2368,7 @@ static int cache_save_setup(struct btrfs_block_group *block_group,  		return 0;  	} -	if (trans->aborted) +	if (TRANS_ABORTED(trans))  		return 0;  again:  	inode = lookup_free_space_inode(block_group, path); @@ -2881,7 +2904,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,  					&cache->space_info->total_bytes_pinned,  					num_bytes,  					BTRFS_TOTAL_BYTES_PINNED_BATCH); -			set_extent_dirty(info->pinned_extents, +			set_extent_dirty(&trans->transaction->pinned_extents,  					 bytenr, bytenr + num_bytes - 1,  					 GFP_NOFS | __GFP_NOFAIL);  		} diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index d07bd41a7c1e..27efec8f7c5b 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -6,6 +6,98 @@  #include "space-info.h"  #include "transaction.h" +/* + * HOW DO BLOCK RESERVES WORK + * + *   Think of block_rsv's as buckets for logically grouped metadata + *   reservations.  Each block_rsv has a ->size and a ->reserved.  ->size is + *   how large we want our block rsv to be, ->reserved is how much space is + *   currently reserved for this block reserve. + * + *   ->failfast exists for the truncate case, and is described below. + * + * NORMAL OPERATION + * + *   -> Reserve + *     Entrance: btrfs_block_rsv_add, btrfs_block_rsv_refill + * + *     We call into btrfs_reserve_metadata_bytes() with our bytes, which is + *     accounted for in space_info->bytes_may_use, and then add the bytes to + *     ->reserved, and ->size in the case of btrfs_block_rsv_add. + * + *     ->size is an over-estimation of how much we may use for a particular + *     operation. + * + *   -> Use + *     Entrance: btrfs_use_block_rsv + * + *     When we do a btrfs_alloc_tree_block() we call into btrfs_use_block_rsv() + *     to determine the appropriate block_rsv to use, and then verify that + *     ->reserved has enough space for our tree block allocation.  Once + *     successful we subtract fs_info->nodesize from ->reserved. 
+ * + *   -> Finish + *     Entrance: btrfs_block_rsv_release + * + *     We are finished with our operation, subtract our individual reservation + *     from ->size, and then subtract ->size from ->reserved and free up the + *     excess if there is any. + * + *     There is some logic here to refill the delayed refs rsv or the global rsv + *     as needed, otherwise the excess is subtracted from + *     space_info->bytes_may_use. + * + * TYPES OF BLOCK RESERVES + * + * BLOCK_RSV_TRANS, BLOCK_RSV_DELOPS, BLOCK_RSV_CHUNK + *   These behave normally, as described above, just within the confines of the + *   lifetime of their particular operation (transaction for the whole trans + *   handle lifetime, for example). + * + * BLOCK_RSV_GLOBAL + *   It is impossible to properly account for all the space that may be required + *   to make our extent tree updates.  This block reserve acts as an overflow + *   buffer in case our delayed refs reserve does not reserve enough space to + *   update the extent tree. + * + *   We can steal from this in some cases as well, notably on evict() or + *   truncate() in order to help users recover from ENOSPC conditions. + * + * BLOCK_RSV_DELALLOC + *   The individual item sizes are determined by the per-inode size + *   calculations, which are described with the delalloc code.  This is pretty + *   straightforward, it's just the calculation of ->size encodes a lot of + *   different items, and thus it gets used when updating inodes, inserting file + *   extents, and inserting checksums. + * + * BLOCK_RSV_DELREFS + *   We keep a running tally of how many delayed refs we have on the system. + *   We assume each one of these delayed refs are going to use a full + *   reservation.  We use the transaction items and pre-reserve space for every + *   operation, and use this reservation to refill any gap between ->size and + *   ->reserved that may exist. + * + *   From there it's straightforward, removing a delayed ref means we remove its + *   count from ->size and free up reservations as necessary.  Since this is + *   the most dynamic block reserve in the system, we will try to refill this + *   block reserve first with any excess returned by any other block reserve. + * + * BLOCK_RSV_EMPTY + *   This is the fallback block reserve to make us try to reserve space if we + *   don't have a specific bucket for this allocation.  It is mostly used for + *   updating the device tree and such, since that is a separate pool we're + *   content to just reserve space from the space_info on demand. + * + * BLOCK_RSV_TEMP + *   This is used by things like truncate and iput.  We will temporarily + *   allocate a block reserve, set it to some size, and then truncate bytes + *   until we have no space left.  With ->failfast set we'll simply return + *   ENOSPC from btrfs_use_block_rsv() to signal that we need to unwind and try + *   to make a new reservation.  This is because these operations are + *   unbounded, so we want to do as much work as we can, and then back off and + *   re-reserve. 
+ */ +  static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,  				    struct btrfs_block_rsv *block_rsv,  				    struct btrfs_block_rsv *dest, u64 num_bytes, @@ -111,7 +203,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,  {  	if (!rsv)  		return; -	btrfs_block_rsv_release(fs_info, rsv, (u64)-1); +	btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);  	kfree(rsv);  } @@ -178,9 +270,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,  	return ret;  } -u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, -			      struct btrfs_block_rsv *block_rsv, -			      u64 num_bytes, u64 *qgroup_to_release) +u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, +			    struct btrfs_block_rsv *block_rsv, u64 num_bytes, +			    u64 *qgroup_to_release)  {  	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;  	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; @@ -297,9 +389,9 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)  	if (block_rsv->reserved < block_rsv->size) {  		num_bytes = block_rsv->size - block_rsv->reserved; -		block_rsv->reserved += num_bytes;  		btrfs_space_info_update_bytes_may_use(fs_info, sinfo,  						      num_bytes); +		block_rsv->reserved = block_rsv->size;  	} else if (block_rsv->reserved > block_rsv->size) {  		num_bytes = block_rsv->reserved - block_rsv->size;  		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, @@ -344,7 +436,8 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)  void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)  { -	btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1); +	btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1, +				NULL);  	WARN_ON(fs_info->trans_block_rsv.size > 0);  	WARN_ON(fs_info->trans_block_rsv.reserved > 0);  	WARN_ON(fs_info->chunk_block_rsv.size > 0); diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index d1428bb73fc5..0b6ae5302837 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -73,7 +73,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,  			     int min_factor);  void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,  			       u64 num_bytes, bool update_size); -u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, +u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,  			      struct btrfs_block_rsv *block_rsv,  			      u64 num_bytes, u64 *qgroup_to_release);  void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info); @@ -82,20 +82,12 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info);  struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,  					    struct btrfs_root *root,  					    u32 blocksize); - -static inline void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, -					   struct btrfs_block_rsv *block_rsv, -					   u64 num_bytes) -{ -	__btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); -} -  static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,  					 struct btrfs_block_rsv *block_rsv,  					 u32 blocksize)  {  	btrfs_block_rsv_add_bytes(block_rsv, blocksize, false); -	btrfs_block_rsv_release(fs_info, block_rsv, 0); +	btrfs_block_rsv_release(fs_info, block_rsv, 0, NULL);  }  #endif /* BTRFS_BLOCK_RSV_H */ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 4e12a477d32e..27a1fefce508 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -60,6 +60,12 @@ struct btrfs_inode {  	 */  	struct 
extent_io_tree io_failure_tree; +	/* +	 * Keep track of where the inode has extent items mapped in order to +	 * make sure the i_size adjustments are accurate +	 */ +	struct extent_io_tree file_extent_tree; +  	/* held while logging the inode in tree-log.c */  	struct mutex log_mutex; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index a0ce69f2d27c..32e11a23b47f 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -77,7 +77,6 @@  #include <linux/sched.h>  #include <linux/slab.h> -#include <linux/buffer_head.h>  #include <linux/mutex.h>  #include <linux/genhd.h>  #include <linux/blkdev.h> @@ -152,11 +151,8 @@ struct btrfsic_block {  	struct list_head ref_to_list;	/* list */  	struct list_head ref_from_list;	/* list */  	struct btrfsic_block *next_in_same_bio; -	void *orig_bio_bh_private; -	union { -		bio_end_io_t *bio; -		bh_end_io_t *bh; -	} orig_bio_bh_end_io; +	void *orig_bio_private; +	bio_end_io_t *orig_bio_end_io;  	int submit_bio_bh_rw;  	u64 flush_gen; /* only valid if !never_written */  }; @@ -325,14 +321,12 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,  					  u64 dev_bytenr, char **mapped_datav,  					  unsigned int num_pages,  					  struct bio *bio, int *bio_is_patched, -					  struct buffer_head *bh,  					  int submit_bio_bh_rw);  static int btrfsic_process_written_superblock(  		struct btrfsic_state *state,  		struct btrfsic_block *const block,  		struct btrfs_super_block *const super_hdr);  static void btrfsic_bio_end_io(struct bio *bp); -static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);  static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,  					      const struct btrfsic_block *block,  					      int recursion_level); @@ -399,8 +393,8 @@ static void btrfsic_block_init(struct btrfsic_block *b)  	b->never_written = 0;  	b->mirror_num = 0;  	b->next_in_same_bio = NULL; -	b->orig_bio_bh_private = NULL; -	b->orig_bio_bh_end_io.bio = NULL; +	b->orig_bio_private = NULL; +	b->orig_bio_end_io = NULL;  	INIT_LIST_HEAD(&b->collision_resolving_node);  	INIT_LIST_HEAD(&b->all_blocks_node);  	INIT_LIST_HEAD(&b->ref_to_list); @@ -767,29 +761,31 @@ static int btrfsic_process_superblock_dev_mirror(  	struct btrfs_fs_info *fs_info = state->fs_info;  	struct btrfs_super_block *super_tmp;  	u64 dev_bytenr; -	struct buffer_head *bh;  	struct btrfsic_block *superblock_tmp;  	int pass;  	struct block_device *const superblock_bdev = device->bdev; +	struct page *page; +	struct address_space *mapping = superblock_bdev->bd_inode->i_mapping; +	int ret = 0;  	/* super block bytenr is always the unmapped device bytenr */  	dev_bytenr = btrfs_sb_offset(superblock_mirror_num);  	if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)  		return -1; -	bh = __bread(superblock_bdev, dev_bytenr / BTRFS_BDEV_BLOCKSIZE, -		     BTRFS_SUPER_INFO_SIZE); -	if (NULL == bh) + +	page = read_cache_page_gfp(mapping, dev_bytenr >> PAGE_SHIFT, GFP_NOFS); +	if (IS_ERR(page))  		return -1; -	super_tmp = (struct btrfs_super_block *) -	    (bh->b_data + (dev_bytenr & (BTRFS_BDEV_BLOCKSIZE - 1))); + +	super_tmp = page_address(page);  	if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||  	    btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||  	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||  	    btrfs_super_nodesize(super_tmp) != state->metablock_size ||  	    btrfs_super_sectorsize(super_tmp) != state->datablock_size) { -		brelse(bh); -		return 0; +		ret = 0; +		
goto out;  	}  	superblock_tmp = @@ -800,8 +796,8 @@ static int btrfsic_process_superblock_dev_mirror(  		superblock_tmp = btrfsic_block_alloc();  		if (NULL == superblock_tmp) {  			pr_info("btrfsic: error, kmalloc failed!\n"); -			brelse(bh); -			return -1; +			ret = -1; +			goto out;  		}  		/* for superblock, only the dev_bytenr makes sense */  		superblock_tmp->dev_bytenr = dev_bytenr; @@ -885,8 +881,8 @@ static int btrfsic_process_superblock_dev_mirror(  					      mirror_num)) {  				pr_info("btrfsic: btrfsic_map_block(bytenr @%llu, mirror %d) failed!\n",  				       next_bytenr, mirror_num); -				brelse(bh); -				return -1; +				ret = -1; +				goto out;  			}  			next_block = btrfsic_block_lookup_or_add( @@ -895,8 +891,8 @@ static int btrfsic_process_superblock_dev_mirror(  					mirror_num, NULL);  			if (NULL == next_block) {  				btrfsic_release_block_ctx(&tmp_next_block_ctx); -				brelse(bh); -				return -1; +				ret = -1; +				goto out;  			}  			next_block->disk_key = tmp_disk_key; @@ -907,16 +903,17 @@ static int btrfsic_process_superblock_dev_mirror(  					BTRFSIC_GENERATION_UNKNOWN);  			btrfsic_release_block_ctx(&tmp_next_block_ctx);  			if (NULL == l) { -				brelse(bh); -				return -1; +				ret = -1; +				goto out;  			}  		}  	}  	if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)  		btrfsic_dump_tree_sub(state, superblock_tmp, 0); -	brelse(bh); -	return 0; +out: +	put_page(page); +	return ret;  }  static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) @@ -1743,7 +1740,6 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,  					  u64 dev_bytenr, char **mapped_datav,  					  unsigned int num_pages,  					  struct bio *bio, int *bio_is_patched, -					  struct buffer_head *bh,  					  int submit_bio_bh_rw)  {  	int is_metadata; @@ -1902,9 +1898,9 @@ again:  				block->is_iodone = 0;  				BUG_ON(NULL == bio_is_patched);  				if (!*bio_is_patched) { -					block->orig_bio_bh_private = +					block->orig_bio_private =  					    bio->bi_private; -					block->orig_bio_bh_end_io.bio = +					block->orig_bio_end_io =  					    bio->bi_end_io;  					block->next_in_same_bio = NULL;  					bio->bi_private = block; @@ -1916,25 +1912,17 @@ again:  					    bio->bi_private;  					BUG_ON(NULL == chained_block); -					block->orig_bio_bh_private = -					    chained_block->orig_bio_bh_private; -					block->orig_bio_bh_end_io.bio = -					    chained_block->orig_bio_bh_end_io. 
-					    bio; +					block->orig_bio_private = +					    chained_block->orig_bio_private; +					block->orig_bio_end_io = +					    chained_block->orig_bio_end_io;  					block->next_in_same_bio = chained_block;  					bio->bi_private = block;  				} -			} else if (NULL != bh) { -				block->is_iodone = 0; -				block->orig_bio_bh_private = bh->b_private; -				block->orig_bio_bh_end_io.bh = bh->b_end_io; -				block->next_in_same_bio = NULL; -				bh->b_private = block; -				bh->b_end_io = btrfsic_bh_end_io;  			} else {  				block->is_iodone = 1; -				block->orig_bio_bh_private = NULL; -				block->orig_bio_bh_end_io.bio = NULL; +				block->orig_bio_private = NULL; +				block->orig_bio_end_io = NULL;  				block->next_in_same_bio = NULL;  			}  		} @@ -2042,8 +2030,8 @@ again:  			block->is_iodone = 0;  			BUG_ON(NULL == bio_is_patched);  			if (!*bio_is_patched) { -				block->orig_bio_bh_private = bio->bi_private; -				block->orig_bio_bh_end_io.bio = bio->bi_end_io; +				block->orig_bio_private = bio->bi_private; +				block->orig_bio_end_io = bio->bi_end_io;  				block->next_in_same_bio = NULL;  				bio->bi_private = block;  				bio->bi_end_io = btrfsic_bio_end_io; @@ -2054,24 +2042,17 @@ again:  				    bio->bi_private;  				BUG_ON(NULL == chained_block); -				block->orig_bio_bh_private = -				    chained_block->orig_bio_bh_private; -				block->orig_bio_bh_end_io.bio = -				    chained_block->orig_bio_bh_end_io.bio; +				block->orig_bio_private = +				    chained_block->orig_bio_private; +				block->orig_bio_end_io = +				    chained_block->orig_bio_end_io;  				block->next_in_same_bio = chained_block;  				bio->bi_private = block;  			} -		} else if (NULL != bh) { -			block->is_iodone = 0; -			block->orig_bio_bh_private = bh->b_private; -			block->orig_bio_bh_end_io.bh = bh->b_end_io; -			block->next_in_same_bio = NULL; -			bh->b_private = block; -			bh->b_end_io = btrfsic_bh_end_io;  		} else {  			block->is_iodone = 1; -			block->orig_bio_bh_private = NULL; -			block->orig_bio_bh_end_io.bio = NULL; +			block->orig_bio_private = NULL; +			block->orig_bio_end_io = NULL;  			block->next_in_same_bio = NULL;  		}  		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) @@ -2112,8 +2093,8 @@ static void btrfsic_bio_end_io(struct bio *bp)  		iodone_w_error = 1;  	BUG_ON(NULL == block); -	bp->bi_private = block->orig_bio_bh_private; -	bp->bi_end_io = block->orig_bio_bh_end_io.bio; +	bp->bi_private = block->orig_bio_private; +	bp->bi_end_io = block->orig_bio_end_io;  	do {  		struct btrfsic_block *next_block; @@ -2146,38 +2127,6 @@ static void btrfsic_bio_end_io(struct bio *bp)  	bp->bi_end_io(bp);  } -static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) -{ -	struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private; -	int iodone_w_error = !uptodate; -	struct btrfsic_dev_state *dev_state; - -	BUG_ON(NULL == block); -	dev_state = block->dev_state; -	if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) -		pr_info("bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n", -		       iodone_w_error, -		       btrfsic_get_block_type(dev_state->state, block), -		       block->logical_bytenr, block->dev_state->name, -		       block->dev_bytenr, block->mirror_num); - -	block->iodone_w_error = iodone_w_error; -	if (block->submit_bio_bh_rw & REQ_PREFLUSH) { -		dev_state->last_flush_gen++; -		if ((dev_state->state->print_mask & -		     BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) -			pr_info("bh_end_io() new %s flush_gen=%llu\n", -			       dev_state->name, dev_state->last_flush_gen); -	} -	
if (block->submit_bio_bh_rw & REQ_FUA) -		block->flush_gen = 0; /* FUA completed means block is on disk */ - -	bh->b_private = block->orig_bio_bh_private; -	bh->b_end_io = block->orig_bio_bh_end_io.bh; -	block->is_iodone = 1; /* for FLUSH, this releases the block */ -	bh->b_end_io(bh, uptodate); -} -  static int btrfsic_process_written_superblock(  		struct btrfsic_state *state,  		struct btrfsic_block *const superblock, @@ -2730,63 +2679,6 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev)  						  &btrfsic_dev_state_hashtable);  } -int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh) -{ -	struct btrfsic_dev_state *dev_state; - -	if (!btrfsic_is_initialized) -		return submit_bh(op, op_flags, bh); - -	mutex_lock(&btrfsic_mutex); -	/* since btrfsic_submit_bh() might also be called before -	 * btrfsic_mount(), this might return NULL */ -	dev_state = btrfsic_dev_state_lookup(bh->b_bdev->bd_dev); - -	/* Only called to write the superblock (incl. FLUSH/FUA) */ -	if (NULL != dev_state && -	    (op == REQ_OP_WRITE) && bh->b_size > 0) { -		u64 dev_bytenr; - -		dev_bytenr = BTRFS_BDEV_BLOCKSIZE * bh->b_blocknr; -		if (dev_state->state->print_mask & -		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) -			pr_info("submit_bh(op=0x%x,0x%x, blocknr=%llu (bytenr %llu), size=%zu, data=%p, bdev=%p)\n", -			       op, op_flags, (unsigned long long)bh->b_blocknr, -			       dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev); -		btrfsic_process_written_block(dev_state, dev_bytenr, -					      &bh->b_data, 1, NULL, -					      NULL, bh, op_flags); -	} else if (NULL != dev_state && (op_flags & REQ_PREFLUSH)) { -		if (dev_state->state->print_mask & -		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) -			pr_info("submit_bh(op=0x%x,0x%x FLUSH, bdev=%p)\n", -			       op, op_flags, bh->b_bdev); -		if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { -			if ((dev_state->state->print_mask & -			     (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | -			      BTRFSIC_PRINT_MASK_VERBOSE))) -				pr_info("btrfsic_submit_bh(%s) with FLUSH but dummy block already in use (ignored)!\n", -				       dev_state->name); -		} else { -			struct btrfsic_block *const block = -				&dev_state->dummy_block_for_bio_bh_flush; - -			block->is_iodone = 0; -			block->never_written = 0; -			block->iodone_w_error = 0; -			block->flush_gen = dev_state->last_flush_gen + 1; -			block->submit_bio_bh_rw = op_flags; -			block->orig_bio_bh_private = bh->b_private; -			block->orig_bio_bh_end_io.bh = bh->b_end_io; -			block->next_in_same_bio = NULL; -			bh->b_private = block; -			bh->b_end_io = btrfsic_bh_end_io; -		} -	} -	mutex_unlock(&btrfsic_mutex); -	return submit_bh(op, op_flags, bh); -} -  static void __btrfsic_submit_bio(struct bio *bio)  {  	struct btrfsic_dev_state *dev_state; @@ -2838,7 +2730,7 @@ static void __btrfsic_submit_bio(struct bio *bio)  		btrfsic_process_written_block(dev_state, dev_bytenr,  					      mapped_datav, segs,  					      bio, &bio_is_patched, -					      NULL, bio->bi_opf); +					      bio->bi_opf);  		bio_for_each_segment(bvec, bio, iter)  			kunmap(bvec.bv_page);  		kfree(mapped_datav); @@ -2862,8 +2754,8 @@ static void __btrfsic_submit_bio(struct bio *bio)  			block->iodone_w_error = 0;  			block->flush_gen = dev_state->last_flush_gen + 1;  			block->submit_bio_bh_rw = bio->bi_opf; -			block->orig_bio_bh_private = bio->bi_private; -			block->orig_bio_bh_end_io.bio = bio->bi_end_io; +			block->orig_bio_private = bio->bi_private; +			block->orig_bio_end_io = bio->bi_end_io;  			block->next_in_same_bio = NULL;  			
bio->bi_private = block;  			bio->bi_end_io = btrfsic_bio_end_io; diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h index 9bf4359cc44c..bcc730a06cb5 100644 --- a/fs/btrfs/check-integrity.h +++ b/fs/btrfs/check-integrity.h @@ -7,11 +7,9 @@  #define BTRFS_CHECK_INTEGRITY_H  #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY -int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh);  void btrfsic_submit_bio(struct bio *bio);  int btrfsic_submit_bio_wait(struct bio *bio);  #else -#define btrfsic_submit_bh submit_bh  #define btrfsic_submit_bio submit_bio  #define btrfsic_submit_bio_wait submit_bio_wait  #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f2ec1a9bae28..bfedbbe2311f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -31,8 +31,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,  static const struct btrfs_csums {  	u16		size; -	const char	*name; -	const char	*driver; +	const char	name[10]; +	const char	driver[12];  } btrfs_csums[] = {  	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },  	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, @@ -63,7 +63,8 @@ const char *btrfs_super_csum_name(u16 csum_type)  const char *btrfs_super_csum_driver(u16 csum_type)  {  	/* csum type is validated at mount time */ -	return btrfs_csums[csum_type].driver ?: +	return btrfs_csums[csum_type].driver[0] ? +		btrfs_csums[csum_type].driver :  		btrfs_csums[csum_type].name;  } @@ -143,44 +144,6 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)  	return eb;  } -/* loop around taking references on and locking the root node of the - * tree until you end up with a lock on the root.  A locked buffer - * is returned, with a reference held. - */ -struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) -{ -	struct extent_buffer *eb; - -	while (1) { -		eb = btrfs_root_node(root); -		btrfs_tree_lock(eb); -		if (eb == root->node) -			break; -		btrfs_tree_unlock(eb); -		free_extent_buffer(eb); -	} -	return eb; -} - -/* loop around taking references on and locking the root node of the - * tree until you end up with a lock on the root.  A locked buffer - * is returned, with a reference held. - */ -struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) -{ -	struct extent_buffer *eb; - -	while (1) { -		eb = btrfs_root_node(root); -		btrfs_tree_read_lock(eb); -		if (eb == root->node) -			break; -		btrfs_tree_read_unlock(eb); -		free_extent_buffer(eb); -	} -	return eb; -} -  /* cowonly root (everything not a reference counted cow subvolume), just get   * put onto a simple dirty list.  transaction.c walks this to make sure they   * get properly updated on disk. 
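The root-node locking helpers removed from ctree.c above loop until the node they locked is still the tree's root, because a concurrent COW can replace root->node between taking the reference and acquiring the lock. Below is a minimal single-threaded sketch of that retry pattern using hypothetical stand-in types and pthread locks; it illustrates the shape of the loop only, not the kernel implementation (the reference counting here is simplified and not atomic).

#include <pthread.h>

/* Hypothetical stand-ins for extent_buffer / btrfs_root and their helpers. */
struct node {
	pthread_mutex_t lock;
	int refs;
};

struct root {
	pthread_mutex_t root_lock;
	struct node *node;	/* may be swapped by a concurrent COW */
};

static struct node *grab_root_node(struct root *root)
{
	struct node *nd;

	pthread_mutex_lock(&root->root_lock);
	nd = root->node;
	nd->refs++;			/* analogous to btrfs_root_node() taking a ref */
	pthread_mutex_unlock(&root->root_lock);
	return nd;
}

static void put_node(struct node *nd)
{
	nd->refs--;			/* analogous to free_extent_buffer() */
}

/* Loop until we hold the lock on the node that is *currently* the root. */
static struct node *lock_root_node(struct root *root)
{
	struct node *nd;

	while (1) {
		nd = grab_root_node(root);
		pthread_mutex_lock(&nd->lock);		/* btrfs_tree_lock() */
		if (nd == root->node)
			return nd;			/* still the root: done */
		pthread_mutex_unlock(&nd->lock);	/* raced with a COW: retry */
		put_node(nd);
	}
}

int main(void)
{
	struct node n = { .lock = PTHREAD_MUTEX_INITIALIZER, .refs = 1 };
	struct root r = { .root_lock = PTHREAD_MUTEX_INITIALIZER, .node = &n };
	struct node *locked = lock_root_node(&r);

	pthread_mutex_unlock(&locked->lock);
	put_node(locked);
	return 0;
}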
@@ -341,7 +304,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,  	struct rb_root *tm_root;  	struct rb_node *node;  	struct rb_node *next; -	struct seq_list *cur_elem;  	struct tree_mod_elem *tm;  	u64 min_seq = (u64)-1;  	u64 seq_putting = elem->seq; @@ -353,18 +315,20 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,  	list_del(&elem->list);  	elem->seq = 0; -	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { -		if (cur_elem->seq < min_seq) { -			if (seq_putting > cur_elem->seq) { -				/* -				 * blocker with lower sequence number exists, we -				 * cannot remove anything from the log -				 */ -				write_unlock(&fs_info->tree_mod_log_lock); -				return; -			} -			min_seq = cur_elem->seq; +	if (!list_empty(&fs_info->tree_mod_seq_list)) { +		struct seq_list *first; + +		first = list_first_entry(&fs_info->tree_mod_seq_list, +					 struct seq_list, list); +		if (seq_putting > first->seq) { +			/* +			 * Blocker with lower sequence number exists, we +			 * cannot remove anything from the log. +			 */ +			write_unlock(&fs_info->tree_mod_log_lock); +			return;  		} +		min_seq = first->seq;  	}  	/* @@ -962,9 +926,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,  		if (new_flags != 0) {  			int level = btrfs_header_level(buf); -			ret = btrfs_set_disk_extent_flags(trans, -							  buf->start, -							  buf->len, +			ret = btrfs_set_disk_extent_flags(trans, buf,  							  new_flags, level, 0);  			if (ret)  				return ret; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 36df977b64d9..8aa7b9dac405 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -33,6 +33,7 @@  #include "extent_map.h"  #include "async-thread.h"  #include "block-rsv.h" +#include "locking.h"  struct btrfs_trans_handle;  struct btrfs_transaction; @@ -596,8 +597,8 @@ struct btrfs_fs_info {  	/* keep track of unallocated space */  	atomic64_t free_chunk_space; -	struct extent_io_tree freed_extents[2]; -	struct extent_io_tree *pinned_extents; +	/* Track ranges which are used by log trees blocks/logged data extents */ +	struct extent_io_tree excluded_extents;  	/* logical->physical extent mapping */  	struct extent_map_tree mapping_tree; @@ -696,7 +697,6 @@ struct btrfs_fs_info {  	struct rw_semaphore cleanup_work_sem;  	struct rw_semaphore subvol_sem; -	struct srcu_struct subvol_srcu;  	spinlock_t trans_lock;  	/* @@ -947,6 +947,10 @@ struct btrfs_fs_info {  #ifdef CONFIG_BTRFS_DEBUG  	struct kobject *debug_kobj;  	struct kobject *discard_debug_kobj; +	struct list_head allocated_roots; + +	spinlock_t eb_leak_lock; +	struct list_head allocated_ebs;  #endif  }; @@ -955,11 +959,6 @@ static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)  	return sb->s_fs_info;  } -struct btrfs_subvolume_writers { -	struct percpu_counter	counter; -	wait_queue_head_t	wait; -}; -  /*   * The state of btrfs root   */ @@ -1131,8 +1130,9 @@ struct btrfs_root {  	 * root_item_lock.  	 
*/  	int dedupe_in_progress; -	struct btrfs_subvolume_writers *subv_writers; -	atomic_t will_be_snapshotted; +	/* For exclusion of snapshot creation and nocow writes */ +	struct btrfs_drew_lock snapshot_lock; +  	atomic_t snapshot_force_cow;  	/* For qgroup metadata reserved space */ @@ -1149,6 +1149,10 @@ struct btrfs_root {  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS  	u64 alloc_bytenr;  #endif + +#ifdef CONFIG_BTRFS_DEBUG +	struct list_head leak_list; +#endif  };  struct btrfs_clone_extent_info { @@ -1971,16 +1975,6 @@ static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,  	btrfs_set_header_flags(eb, flags);  } -static inline unsigned long btrfs_header_fsid(void) -{ -	return offsetof(struct btrfs_header, fsid); -} - -static inline unsigned long btrfs_header_chunk_tree_uuid(const struct extent_buffer *eb) -{ -	return offsetof(struct btrfs_header, chunk_tree_uuid); -} -  static inline int btrfs_is_leaf(const struct extent_buffer *eb)  {  	return btrfs_header_level(eb) == 0; @@ -2458,9 +2452,9 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);  int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,  			     struct btrfs_fs_info *fs_info, u64 bytenr,  			     u64 offset, int metadata, u64 *refs, u64 *flags); -int btrfs_pin_extent(struct btrfs_fs_info *fs_info, -		     u64 bytenr, u64 num, int reserved); -int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, +		     int reserved); +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,  				    u64 bytenr, u64 num_bytes);  int btrfs_exclude_logged_extents(struct extent_buffer *eb);  int btrfs_cross_ref_exist(struct btrfs_root *root, @@ -2490,13 +2484,13 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,  		  struct extent_buffer *buf, int full_backref);  int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, -				u64 bytenr, u64 num_bytes, u64 flags, +				struct extent_buffer *eb, u64 flags,  				int level, int is_data);  int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);  int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,  			       u64 start, u64 len, int delalloc); -int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,  			      u64 len);  void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info);  int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); @@ -2665,9 +2659,8 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)  	return btrfs_next_old_item(root, p, 0);  }  int btrfs_leaf_free_space(struct extent_buffer *leaf); -int __must_check btrfs_drop_snapshot(struct btrfs_root *root, -				     struct btrfs_block_rsv *block_rsv, -				     int update_ref, int for_reloc); +int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, +				     int for_reloc);  int btrfs_drop_subtree(struct btrfs_trans_handle *trans,  			struct btrfs_root *root,  			struct extent_buffer *node, @@ -2695,23 +2688,6 @@ static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)  	return fs_info->sb->s_flags & SB_RDONLY || btrfs_fs_closing(fs_info);  } -static inline void free_fs_info(struct btrfs_fs_info *fs_info) -{ -	kfree(fs_info->balance_ctl); -	kfree(fs_info->delayed_root); 
-	kfree(fs_info->extent_root); -	kfree(fs_info->tree_root); -	kfree(fs_info->chunk_root); -	kfree(fs_info->dev_root); -	kfree(fs_info->csum_root); -	kfree(fs_info->quota_root); -	kfree(fs_info->uuid_root); -	kfree(fs_info->free_space_root); -	kfree(fs_info->super_copy); -	kfree(fs_info->super_for_commit); -	kvfree(fs_info); -} -  /* tree mod log functions from ctree.c */  u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,  			   struct seq_list *elem); @@ -2750,9 +2726,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,  			u64 subid);  int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,  			u64 subid); -int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, -			    int (*check_func)(struct btrfs_fs_info *, u8 *, u8, -					      u64)); +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info);  /* dir-item.c */  int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, @@ -2859,6 +2833,12 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,  				     struct btrfs_file_extent_item *fi,  				     const bool new_inline,  				     struct extent_map *em); +int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, +					u64 len); +int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, +				      u64 len); +void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size); +u64 btrfs_file_extent_end(const struct btrfs_path *path);  /* inode.c */  struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, @@ -2996,9 +2976,6 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,  		      size_t num_pages, loff_t pos, size_t write_bytes,  		      struct extent_state **cached);  int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); -loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, -			      struct file *file_out, loff_t pos_out, -			      loff_t len, unsigned int remap_flags);  /* tree-defrag.c */  int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, @@ -3008,6 +2985,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,  int btrfs_parse_options(struct btrfs_fs_info *info, char *options,  			unsigned long new_flags);  int btrfs_sync_fs(struct super_block *sb, int wait); +char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, +					  u64 subvol_objectid);  static inline __printf(2, 3) __cold  void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) @@ -3401,6 +3380,7 @@ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,  			      u64 *bytes_to_reserve);  int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,  			      struct btrfs_pending_snapshot *pending); +int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info);  /* scrub.c */  int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 4cdac4d834f5..1245739a3a6e 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -9,6 +9,108 @@  #include "qgroup.h"  #include "block-group.h" +/* + * HOW DOES THIS WORK + * + * There are two stages to data reservations, one for data and one for metadata + * to handle the new extents and checksums generated by writing data. 
+ * + * + * DATA RESERVATION + *   The general flow of the data reservation is as follows + * + *   -> Reserve + *     We call into btrfs_reserve_data_bytes() for the user request bytes that + *     they wish to write.  We make this reservation and add it to + *     space_info->bytes_may_use.  We set EXTENT_DELALLOC on the inode io_tree + *     for the range and carry on if this is buffered, or follow up trying to + *     make a real allocation if we are pre-allocating or doing O_DIRECT. + * + *   -> Use + *     At writepages()/prealloc/O_DIRECT time we will call into + *     btrfs_reserve_extent() for some part or all of this range of bytes.  We + *     will make the allocation and subtract space_info->bytes_may_use by the + *     original requested length and increase the space_info->bytes_reserved by + *     the allocated length.  This distinction is important because compression + *     may allocate a smaller on disk extent than we previously reserved. + * + *   -> Allocation + *     finish_ordered_io() will insert the new file extent item for this range, + *     and then add a delayed ref update for the extent tree.  Once that delayed + *     ref is written the extent size is subtracted from + *     space_info->bytes_reserved and added to space_info->bytes_used. + * + *   Error handling + * + *   -> By the reservation maker + *     This is the simplest case, we haven't completed our operation and we know + *     how much we reserved, we can simply call + *     btrfs_free_reserved_data_space*() and it will be removed from + *     space_info->bytes_may_use. + * + *   -> After the reservation has been made, but before cow_file_range() + *     This is specifically for the delalloc case.  You must clear + *     EXTENT_DELALLOC with the EXTENT_CLEAR_DATA_RESV bit, and the range will + *     be subtracted from space_info->bytes_may_use. + * + * METADATA RESERVATION + *   The general metadata reservation lifetimes are discussed elsewhere, this + *   will just focus on how it is used for delalloc space. + * + *   We keep track of two things on a per inode bases + * + *   ->outstanding_extents + *     This is the number of file extent items we'll need to handle all of the + *     outstanding DELALLOC space we have in this inode.  We limit the maximum + *     size of an extent, so a large contiguous dirty area may require more than + *     one outstanding_extent, which is why count_max_extents() is used to + *     determine how many outstanding_extents get added. + * + *   ->csum_bytes + *     This is essentially how many dirty bytes we have for this inode, so we + *     can calculate the number of checksum items we would have to add in order + *     to checksum our outstanding data. + * + *   We keep a per-inode block_rsv in order to make it easier to keep track of + *   our reservation.  We use btrfs_calculate_inode_block_rsv_size() to + *   calculate the current theoretical maximum reservation we would need for the + *   metadata for this inode.  We call this and then adjust our reservation as + *   necessary, either by attempting to reserve more space, or freeing up excess + *   space. + * + * OUTSTANDING_EXTENTS HANDLING + * + *  ->outstanding_extents is used for keeping track of how many extents we will + *  need to use for this inode, and it will fluctuate depending on where you are + *  in the life cycle of the dirty data.  
+ *  a completely clean inode, with a num_bytes < our maximum allowed extent size
+ *
+ *  -> reserve
+ *    ->outstanding_extents += 1 (current value is 1)
+ *
+ *  -> set_delalloc
+ *    ->outstanding_extents += 1 (current value is 2)
+ *
+ *  -> btrfs_delalloc_release_extents()
+ *    ->outstanding_extents -= 1 (current value is 1)
+ *
+ *    We must call this once we are done, as we hold our reservation for the
+ *    duration of our operation, and then assume set_delalloc will update the
+ *    counter appropriately.
+ *
+ *  -> add ordered extent
+ *    ->outstanding_extents += 1 (current value is 2)
+ *
+ *  -> btrfs_clear_delalloc_extent
+ *    ->outstanding_extents -= 1 (current value is 1)
+ *
+ *  -> finish_ordered_io/btrfs_remove_ordered_extent
+ *    ->outstanding_extents -= 1 (current value is 0)
+ *
+ *  Each stage is responsible for its own accounting of the extent, thus
+ *  making error handling and cleanup easier.
+ */
+
 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
 {
 	struct btrfs_root *root = inode->root;
@@ -228,8 +330,8 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
 	 * are releasing 0 bytes, and then we'll just get the reservation over
 	 * the size free'd.
 	 */
-	released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
-					     &qgroup_to_release);
+	released = btrfs_block_rsv_release(fs_info, block_rsv, 0,
+					   &qgroup_to_release);
 	if (released > 0)
 		trace_btrfs_space_reservation(fs_info, "delalloc",
 					      btrfs_ino(inode), released, 0);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index d3e15e1d4a91..bf1595a42a98 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -6,6 +6,7 @@
 #include <linux/slab.h>
 #include <linux/iversion.h>
+#include <linux/sched/mm.h>
 #include "misc.h"
 #include "delayed-inode.h"
 #include "disk-io.h"
@@ -595,8 +596,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
 	trace_btrfs_space_reservation(fs_info, "delayed_item",
 				      item->key.objectid, item->bytes_reserved,
 				      0);
-	btrfs_block_rsv_release(fs_info, rsv,
-				item->bytes_reserved);
+	btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL);
 }
 
 static int btrfs_delayed_inode_reserve_metadata(
@@ -677,8 +677,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
 	rsv = &fs_info->delayed_block_rsv;
 	trace_btrfs_space_reservation(fs_info, "delayed_inode",
 				      node->inode_id, node->bytes_reserved, 0);
-	btrfs_block_rsv_release(fs_info, rsv,
-				node->bytes_reserved);
+	btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved, NULL);
 	if (qgroup_free)
 		btrfs_qgroup_free_meta_prealloc(node->root,
 				node->bytes_reserved);
@@ -805,11 +804,14 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
 				     struct btrfs_delayed_item *delayed_item)
 {
 	struct extent_buffer *leaf;
+	unsigned int nofs_flag;
 	char *ptr;
 	int ret;
 
+	nofs_flag = memalloc_nofs_save();
 	ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
 				      delayed_item->data_len);
+	memalloc_nofs_restore(nofs_flag);
 	if (ret < 0 && ret != -EEXIST)
 		return ret;
 
@@ -937,6 +939,7 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
 				      struct btrfs_delayed_node *node)
 {
 	struct btrfs_delayed_item *curr, *prev;
+	unsigned int nofs_flag;
 	int ret = 0;
 
 do_again:
@@ -945,7 +948,9 @@
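(Editor's aside, not part of the patch: the memalloc_nofs_save()/memalloc_nofs_restore() pairs added around btrfs_insert_empty_item(), btrfs_search_slot() and btrfs_lookup_inode() in these delayed-inode.c hunks follow the usual kernel pattern for code that already holds a transaction handle. A minimal sketch of that pattern, with a hypothetical callee, is:)

#include <linux/sched/mm.h>

/* Hypothetical helper standing in for btrfs_search_slot() and friends. */
static int some_allocating_btree_op(void);

static int btree_op_under_transaction(void)
{
	unsigned int nofs_flag;
	int ret;

	/*
	 * Every allocation made between save and restore behaves as if it
	 * were GFP_NOFS, so direct reclaim cannot recurse back into the
	 * filesystem while the transaction handle is held.
	 */
	nofs_flag = memalloc_nofs_save();
	ret = some_allocating_btree_op();
	memalloc_nofs_restore(nofs_flag);

	return ret;
}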
do_again:  	if (!curr)  		goto delete_fail; +	nofs_flag = memalloc_nofs_save();  	ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1); +	memalloc_nofs_restore(nofs_flag);  	if (ret < 0)  		goto delete_fail;  	else if (ret > 0) { @@ -1012,6 +1017,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,  	struct btrfs_key key;  	struct btrfs_inode_item *inode_item;  	struct extent_buffer *leaf; +	unsigned int nofs_flag;  	int mod;  	int ret; @@ -1024,7 +1030,9 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,  	else  		mod = 1; +	nofs_flag = memalloc_nofs_save();  	ret = btrfs_lookup_inode(trans, root, path, &key, mod); +	memalloc_nofs_restore(nofs_flag);  	if (ret > 0) {  		btrfs_release_path(path);  		return -ENOENT; @@ -1075,7 +1083,10 @@ search:  	key.type = BTRFS_INODE_EXTREF_KEY;  	key.offset = -1; + +	nofs_flag = memalloc_nofs_save();  	ret = btrfs_search_slot(trans, root, &key, path, -1, 1); +	memalloc_nofs_restore(nofs_flag);  	if (ret < 0)  		goto err_out;  	ASSERT(ret); @@ -1139,7 +1150,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)  	int ret = 0;  	bool count = (nr > 0); -	if (trans->aborted) +	if (TRANS_ABORTED(trans))  		return -EIO;  	path = btrfs_alloc_path(); @@ -1760,6 +1771,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,  int btrfs_fill_inode(struct inode *inode, u32 *rdev)  { +	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;  	struct btrfs_delayed_node *delayed_node;  	struct btrfs_inode_item *inode_item; @@ -1779,6 +1791,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)  	i_uid_write(inode, btrfs_stack_inode_uid(inode_item));  	i_gid_write(inode, btrfs_stack_inode_gid(inode_item));  	btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item)); +	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, +			round_up(i_size_read(inode), fs_info->sectorsize));  	inode->i_mode = btrfs_stack_inode_mode(inode_item);  	set_nlink(inode, btrfs_stack_inode_nlink(inode_item));  	inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 74ae226ffaf0..ca96ef007d8f 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -70,7 +70,7 @@ struct btrfs_delayed_item {  	refcount_t refs;  	int ins_or_del;  	u32 data_len; -	char data[0]; +	char data[];  };  static inline void btrfs_init_delayed_root( diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index dfdb7d4f8406..353cc2994d10 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -82,8 +82,7 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)  	u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);  	u64 released = 0; -	released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, -					     NULL); +	released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);  	if (released)  		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",  					      0, released, 0); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2ca2a09d0e23..db93909b25e0 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -22,6 +22,46 @@  #include "dev-replace.h"  #include "sysfs.h" +/* + * Device replace overview + * + * [Objective] + * To copy all extents (both new and on-disk) from source device to target + * device, while still keeping the filesystem read-write. 
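(Editor's aside, not part of the patch: a minimal userspace analogy of the two mechanisms named in the [Method] section that follows. The function names are invented and the ordering between the copier and concurrent writes is ignored; the point is only that duplicating every new write keeps the source device fully usable if the replace is cancelled while the copier brings the target up to date.)

#include <errno.h>
#include <sys/types.h>
#include <unistd.h>

/* Write duplication: every new write hits both source and target. */
static int dup_write(int src_fd, int tgt_fd, const void *buf, size_t len,
		     off_t off)
{
	if (pwrite(src_fd, buf, len, off) != (ssize_t)len)
		return -errno;
	if (pwrite(tgt_fd, buf, len, off) != (ssize_t)len)
		return -errno;
	return 0;
}

/* Background copy of existing data, akin to scrub walking commit-root extents. */
static int copy_existing(int src_fd, int tgt_fd, off_t dev_size)
{
	char buf[64 * 1024];
	off_t off = 0;

	while (off < dev_size) {
		ssize_t n = pread(src_fd, buf, sizeof(buf), off);

		if (n < 0)
			return -errno;
		if (n == 0)
			break;
		if (pwrite(tgt_fd, buf, n, off) != n)
			return -errno;
		off += n;
	}
	return 0;
}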
+ *
+ * [Method]
+ * There are two main methods involved:
+ *
+ * - Write duplication
+ *
+ *   All new writes will be written to both target and source devices, so even
+ *   if replace gets canceled, the source device still contains up-to-date data.
+ *
+ *   Location:		handle_ops_on_dev_replace() from __btrfs_map_block()
+ *   Start:		btrfs_dev_replace_start()
+ *   End:		btrfs_dev_replace_finishing()
+ *   Content:		Latest data/metadata
+ *
+ * - Copy existing extents
+ *
+ *   This happens by re-using the scrub facility, as scrub also iterates through
+ *   existing extents from the commit root.
+ *
+ *   Location:		scrub_write_block_to_dev_replace() from
+ *   			scrub_block_complete()
+ *   Content:		Data/meta from commit root.
+ *
+ * Due to the content difference, we need to avoid nocow writes when dev-replace
+ * is happening.  This is done by marking the block group read-only and waiting
+ * for NOCOW writes.
+ *
+ * After replace is done, the finishing part is done by swapping the target and
+ * source devices.
+ *
+ *   Location:		btrfs_dev_replace_update_device_in_mapping_tree() from
+ *   			btrfs_dev_replace_finishing()
+ */
+
 static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 				       int scrub_ret);
 static void btrfs_dev_replace_update_device_in_mapping_tree(
@@ -472,7 +512,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 	atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
 	up_write(&dev_replace->rwsem);
 
-	ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
+	ret = btrfs_sysfs_add_devices_dir(tgt_device->fs_devices, tgt_device);
 	if (ret)
 		btrfs_err(fs_info, "kobj add dev failed %d", ret);
 
@@ -703,7 +743,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
 	/* replace the sysfs entry */
-	btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
+	btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device);
 	btrfs_sysfs_update_devid(tgt_device);
 
 	btrfs_rm_dev_replace_free_srcdev(src_device);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c6c9a6a8e6c8..a6cb5cbbdb9f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -7,7 +7,6 @@
 #include <linux/blkdev.h>
 #include <linux/radix-tree.h>
 #include <linux/writeback.h>
-#include <linux/buffer_head.h>
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
 #include <linux/slab.h>
@@ -42,6 +41,7 @@
 #include "ref-verify.h"
 #include "block-group.h"
 #include "discard.h"
+#include "space-info.h"
 
 #define BTRFS_SUPER_FLAG_SUPP	(BTRFS_HEADER_FLAG_WRITTEN |\
 				 BTRFS_HEADER_FLAG_RELOC |\
@@ -98,6 +98,12 @@ void __cold btrfs_end_io_wq_exit(void)
 	kmem_cache_destroy(btrfs_end_io_wq_cache);
 }
 
+static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
+{
+	if (fs_info->csum_shash)
+		crypto_free_shash(fs_info->csum_shash);
+}
+
 /*
  * async submit bios are used to offload expensive checksumming
  * onto the worker threads.  They checksum file and metadata bios
@@ -247,47 +253,27 @@ out:
 /*
  * Compute the csum of a btree block and store the result to provided buffer.
- *
- * Returns error if the extent buffer cannot be mapped.
*/ -static int csum_tree_block(struct extent_buffer *buf, u8 *result) +static void csum_tree_block(struct extent_buffer *buf, u8 *result)  {  	struct btrfs_fs_info *fs_info = buf->fs_info; +	const int num_pages = fs_info->nodesize >> PAGE_SHIFT;  	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); -	unsigned long len; -	unsigned long cur_len; -	unsigned long offset = BTRFS_CSUM_SIZE;  	char *kaddr; -	unsigned long map_start; -	unsigned long map_len; -	int err; +	int i;  	shash->tfm = fs_info->csum_shash;  	crypto_shash_init(shash); +	kaddr = page_address(buf->pages[0]); +	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, +			    PAGE_SIZE - BTRFS_CSUM_SIZE); -	len = buf->len - offset; - -	while (len > 0) { -		/* -		 * Note: we don't need to check for the err == 1 case here, as -		 * with the given combination of 'start = BTRFS_CSUM_SIZE (32)' -		 * and 'min_len = 32' and the currently implemented mapping -		 * algorithm we cannot cross a page boundary. -		 */ -		err = map_private_extent_buffer(buf, offset, 32, -					&kaddr, &map_start, &map_len); -		if (WARN_ON(err)) -			return err; -		cur_len = min(len, map_len - (offset - map_start)); -		crypto_shash_update(shash, kaddr + offset - map_start, cur_len); -		len -= cur_len; -		offset += cur_len; +	for (i = 1; i < num_pages; i++) { +		kaddr = page_address(buf->pages[i]); +		crypto_shash_update(shash, kaddr, PAGE_SIZE);  	}  	memset(result, 0, BTRFS_CSUM_SIZE); -  	crypto_shash_final(shash, result); - -	return 0;  }  /* @@ -535,10 +521,10 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)  		return -EUCLEAN;  	ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, -			btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0); +				    offsetof(struct btrfs_header, fsid), +				    BTRFS_FSID_SIZE) == 0); -	if (csum_tree_block(eb, result)) -		return -EINVAL; +	csum_tree_block(eb, result);  	if (btrfs_header_level(eb))  		ret = btrfs_check_node(eb); @@ -565,7 +551,8 @@ static int check_tree_block_fsid(struct extent_buffer *eb)  	u8 fsid[BTRFS_FSID_SIZE];  	int ret = 1; -	read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); +	read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), +			   BTRFS_FSID_SIZE);  	while (fs_devices) {  		u8 *metadata_uuid; @@ -596,9 +583,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,  	u64 found_start;  	int found_level;  	struct extent_buffer *eb; -	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; -	struct btrfs_fs_info *fs_info = root->fs_info; -	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); +	struct btrfs_fs_info *fs_info; +	u16 csum_size;  	int ret = 0;  	u8 result[BTRFS_CSUM_SIZE];  	int reads_done; @@ -607,6 +593,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,  		goto out;  	eb = (struct extent_buffer *)page->private; +	fs_info = eb->fs_info; +	csum_size = btrfs_super_csum_size(fs_info->super_copy);  	/* the pending IO might have been the only thing that kept this buffer  	 * in memory.  
Make sure we have a ref for all this other checks @@ -647,9 +635,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,  	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),  				       eb, found_level); -	ret = csum_tree_block(eb, result); -	if (ret) -		goto err; +	csum_tree_block(eb, result);  	if (memcmp_extent_buffer(eb, result, 0, csum_size)) {  		u32 val; @@ -972,9 +958,7 @@ static int btree_writepages(struct address_space *mapping,  static int btree_readpage(struct file *file, struct page *page)  { -	struct extent_io_tree *tree; -	tree = &BTRFS_I(page->mapping->host)->io_tree; -	return extent_read_full_page(tree, page, btree_get_extent, 0); +	return extent_read_full_page(page, btree_get_extent, 0);  }  static int btree_releasepage(struct page *page, gfp_t gfp_flags) @@ -1100,36 +1084,11 @@ void btrfs_clean_tree_block(struct extent_buffer *buf)  	}  } -static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) -{ -	struct btrfs_subvolume_writers *writers; -	int ret; - -	writers = kmalloc(sizeof(*writers), GFP_NOFS); -	if (!writers) -		return ERR_PTR(-ENOMEM); - -	ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS); -	if (ret < 0) { -		kfree(writers); -		return ERR_PTR(ret); -	} - -	init_waitqueue_head(&writers->wait); -	return writers; -} - -static void -btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers) -{ -	percpu_counter_destroy(&writers->counter); -	kfree(writers); -} -  static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,  			 u64 objectid)  {  	bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); +	root->fs_info = fs_info;  	root->node = NULL;  	root->commit_root = NULL;  	root->state = 0; @@ -1173,7 +1132,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,  	atomic_set(&root->log_writers, 0);  	atomic_set(&root->log_batch, 0);  	refcount_set(&root->refs, 1); -	atomic_set(&root->will_be_snapshotted, 0);  	atomic_set(&root->snapshot_force_cow, 0);  	atomic_set(&root->nr_swapfiles, 0);  	root->log_transid = 0; @@ -1195,14 +1153,20 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,  	spin_lock_init(&root->root_item_lock);  	btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); +#ifdef CONFIG_BTRFS_DEBUG +	INIT_LIST_HEAD(&root->leak_list); +	spin_lock(&fs_info->fs_roots_radix_lock); +	list_add_tail(&root->leak_list, &fs_info->allocated_roots); +	spin_unlock(&fs_info->fs_roots_radix_lock); +#endif  }  static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, -		gfp_t flags) +					   u64 objectid, gfp_t flags)  {  	struct btrfs_root *root = kzalloc(sizeof(*root), flags);  	if (root) -		root->fs_info = fs_info; +		__setup_root(root, fs_info, objectid);  	return root;  } @@ -1215,12 +1179,11 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)  	if (!fs_info)  		return ERR_PTR(-EINVAL); -	root = btrfs_alloc_root(fs_info, GFP_KERNEL); +	root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);  	if (!root)  		return ERR_PTR(-ENOMEM);  	/* We don't use the stripesize in selftest, set it as sectorsize */ -	__setup_root(root, fs_info, BTRFS_ROOT_TREE_OBJECTID);  	root->alloc_bytenr = 0;  	return root; @@ -1237,19 +1200,17 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,  	struct btrfs_key key;  	unsigned int nofs_flag;  	int ret = 0; -	uuid_le uuid = NULL_UUID_LE;  	/*  	 * We're holding a transaction handle, so use a NOFS memory 
allocation  	 * context to avoid deadlock if reclaim happens.  	 */  	nofs_flag = memalloc_nofs_save(); -	root = btrfs_alloc_root(fs_info, GFP_KERNEL); +	root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);  	memalloc_nofs_restore(nofs_flag);  	if (!root)  		return ERR_PTR(-ENOMEM); -	__setup_root(root, fs_info, objectid);  	root->root_key.objectid = objectid;  	root->root_key.type = BTRFS_ROOT_ITEM_KEY;  	root->root_key.offset = 0; @@ -1277,8 +1238,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,  	btrfs_set_root_last_snapshot(&root->root_item, 0);  	btrfs_set_root_dirid(&root->root_item, 0);  	if (is_fstree(objectid)) -		uuid_le_gen(&uuid); -	memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE); +		generate_random_guid(root->root_item.uuid); +	else +		export_guid(root->root_item.uuid, &guid_null);  	root->root_item.drop_level = 0;  	key.objectid = objectid; @@ -1293,12 +1255,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,  	return root;  fail: -	if (leaf) { +	if (leaf)  		btrfs_tree_unlock(leaf); -		free_extent_buffer(root->commit_root); -		free_extent_buffer(leaf); -	} -	kfree(root); +	btrfs_put_root(root);  	return ERR_PTR(ret);  } @@ -1309,12 +1268,10 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,  	struct btrfs_root *root;  	struct extent_buffer *leaf; -	root = btrfs_alloc_root(fs_info, GFP_NOFS); +	root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);  	if (!root)  		return ERR_PTR(-ENOMEM); -	__setup_root(root, fs_info, BTRFS_TREE_LOG_OBJECTID); -  	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;  	root->root_key.type = BTRFS_ROOT_ITEM_KEY;  	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; @@ -1331,7 +1288,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,  	leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,  			NULL, 0, 0, 0);  	if (IS_ERR(leaf)) { -		kfree(root); +		btrfs_put_root(root);  		return ERR_CAST(leaf);  	} @@ -1387,8 +1344,8 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,  	return 0;  } -static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, -					       struct btrfs_key *key) +struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, +					struct btrfs_key *key)  {  	struct btrfs_root *root;  	struct btrfs_fs_info *fs_info = tree_root->fs_info; @@ -1401,14 +1358,12 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,  	if (!path)  		return ERR_PTR(-ENOMEM); -	root = btrfs_alloc_root(fs_info, GFP_NOFS); +	root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);  	if (!root) {  		ret = -ENOMEM;  		goto alloc_fail;  	} -	__setup_root(root, fs_info, key->objectid); -  	ret = btrfs_find_root(tree_root, key, path,  			      &root->root_item, &root->root_key);  	if (ret) { @@ -1424,10 +1379,10 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,  				     generation, level, NULL);  	if (IS_ERR(root->node)) {  		ret = PTR_ERR(root->node); +		root->node = NULL;  		goto find_fail;  	} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {  		ret = -EIO; -		free_extent_buffer(root->node);  		goto find_fail;  	}  	root->commit_root = btrfs_root_node(root); @@ -1436,33 +1391,16 @@ out:  	return root;  find_fail: -	kfree(root); +	btrfs_put_root(root);  alloc_fail:  	root = ERR_PTR(ret);  	goto out;  } -struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, -				      struct btrfs_key *location) -{ -	struct 
btrfs_root *root; - -	root = btrfs_read_tree_root(tree_root, location); -	if (IS_ERR(root)) -		return root; - -	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { -		set_bit(BTRFS_ROOT_REF_COWS, &root->state); -		btrfs_check_and_init_root_item(&root->root_item); -	} - -	return root; -} - -int btrfs_init_fs_root(struct btrfs_root *root) +static int btrfs_init_fs_root(struct btrfs_root *root)  {  	int ret; -	struct btrfs_subvolume_writers *writers; +	unsigned int nofs_flag;  	root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);  	root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), @@ -1472,12 +1410,20 @@ int btrfs_init_fs_root(struct btrfs_root *root)  		goto fail;  	} -	writers = btrfs_alloc_subvolume_writers(); -	if (IS_ERR(writers)) { -		ret = PTR_ERR(writers); +	/* +	 * We might be called under a transaction (e.g. indirect backref +	 * resolution) which could deadlock if it triggers memory reclaim +	 */ +	nofs_flag = memalloc_nofs_save(); +	ret = btrfs_drew_lock_init(&root->snapshot_lock); +	memalloc_nofs_restore(nofs_flag); +	if (ret)  		goto fail; + +	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { +		set_bit(BTRFS_ROOT_REF_COWS, &root->state); +		btrfs_check_and_init_root_item(&root->root_item);  	} -	root->subv_writers = writers;  	btrfs_init_free_ino_ctl(root);  	spin_lock_init(&root->ino_cache_lock); @@ -1505,14 +1451,16 @@ fail:  	return ret;  } -struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, -					u64 root_id) +static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, +					       u64 root_id)  {  	struct btrfs_root *root;  	spin_lock(&fs_info->fs_roots_radix_lock);  	root = radix_tree_lookup(&fs_info->fs_roots_radix,  				 (unsigned long)root_id); +	if (root) +		root = btrfs_grab_root(root);  	spin_unlock(&fs_info->fs_roots_radix_lock);  	return root;  } @@ -1530,14 +1478,62 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,  	ret = radix_tree_insert(&fs_info->fs_roots_radix,  				(unsigned long)root->root_key.objectid,  				root); -	if (ret == 0) +	if (ret == 0) { +		btrfs_grab_root(root);  		set_bit(BTRFS_ROOT_IN_RADIX, &root->state); +	}  	spin_unlock(&fs_info->fs_roots_radix_lock);  	radix_tree_preload_end();  	return ret;  } +void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) +{ +#ifdef CONFIG_BTRFS_DEBUG +	struct btrfs_root *root; + +	while (!list_empty(&fs_info->allocated_roots)) { +		root = list_first_entry(&fs_info->allocated_roots, +					struct btrfs_root, leak_list); +		btrfs_err(fs_info, "leaked root %llu-%llu refcount %d", +			  root->root_key.objectid, root->root_key.offset, +			  refcount_read(&root->refs)); +		while (refcount_read(&root->refs) > 1) +			btrfs_put_root(root); +		btrfs_put_root(root); +	} +#endif +} + +void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) +{ +	percpu_counter_destroy(&fs_info->dirty_metadata_bytes); +	percpu_counter_destroy(&fs_info->delalloc_bytes); +	percpu_counter_destroy(&fs_info->dio_bytes); +	percpu_counter_destroy(&fs_info->dev_replace.bio_counter); +	btrfs_free_csum_hash(fs_info); +	btrfs_free_stripe_hash_table(fs_info); +	btrfs_free_ref_cache(fs_info); +	kfree(fs_info->balance_ctl); +	kfree(fs_info->delayed_root); +	btrfs_put_root(fs_info->extent_root); +	btrfs_put_root(fs_info->tree_root); +	btrfs_put_root(fs_info->chunk_root); +	btrfs_put_root(fs_info->dev_root); +	btrfs_put_root(fs_info->csum_root); +	btrfs_put_root(fs_info->quota_root); +	btrfs_put_root(fs_info->uuid_root); +	
btrfs_put_root(fs_info->free_space_root); +	btrfs_put_root(fs_info->fs_root); +	btrfs_check_leaked_roots(fs_info); +	btrfs_extent_buffer_leak_debug_check(fs_info); +	kfree(fs_info->super_copy); +	kfree(fs_info->super_for_commit); +	kvfree(fs_info); +} + +  struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,  				     struct btrfs_key *location,  				     bool check_ref) @@ -1548,33 +1544,35 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,  	int ret;  	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) -		return fs_info->tree_root; +		return btrfs_grab_root(fs_info->tree_root);  	if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) -		return fs_info->extent_root; +		return btrfs_grab_root(fs_info->extent_root);  	if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) -		return fs_info->chunk_root; +		return btrfs_grab_root(fs_info->chunk_root);  	if (location->objectid == BTRFS_DEV_TREE_OBJECTID) -		return fs_info->dev_root; +		return btrfs_grab_root(fs_info->dev_root);  	if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) -		return fs_info->csum_root; +		return btrfs_grab_root(fs_info->csum_root);  	if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID) -		return fs_info->quota_root ? fs_info->quota_root : -					     ERR_PTR(-ENOENT); +		return btrfs_grab_root(fs_info->quota_root) ? +			fs_info->quota_root : ERR_PTR(-ENOENT);  	if (location->objectid == BTRFS_UUID_TREE_OBJECTID) -		return fs_info->uuid_root ? fs_info->uuid_root : -					    ERR_PTR(-ENOENT); +		return btrfs_grab_root(fs_info->uuid_root) ? +			fs_info->uuid_root : ERR_PTR(-ENOENT);  	if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) -		return fs_info->free_space_root ? fs_info->free_space_root : -						  ERR_PTR(-ENOENT); +		return btrfs_grab_root(fs_info->free_space_root) ? 
+			fs_info->free_space_root : ERR_PTR(-ENOENT);  again:  	root = btrfs_lookup_fs_root(fs_info, location->objectid);  	if (root) { -		if (check_ref && btrfs_root_refs(&root->root_item) == 0) +		if (check_ref && btrfs_root_refs(&root->root_item) == 0) { +			btrfs_put_root(root);  			return ERR_PTR(-ENOENT); +		}  		return root;  	} -	root = btrfs_read_fs_root(fs_info->tree_root, location); +	root = btrfs_read_tree_root(fs_info->tree_root, location);  	if (IS_ERR(root))  		return root; @@ -1605,15 +1603,14 @@ again:  	ret = btrfs_insert_fs_root(fs_info, root);  	if (ret) { -		if (ret == -EEXIST) { -			btrfs_free_fs_root(root); +		btrfs_put_root(root); +		if (ret == -EEXIST)  			goto again; -		}  		goto fail;  	}  	return root;  fail: -	btrfs_free_fs_root(root); +	btrfs_put_root(root);  	return ERR_PTR(ret);  } @@ -1985,11 +1982,35 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)  	free_root_extent_buffers(info->csum_root);  	free_root_extent_buffers(info->quota_root);  	free_root_extent_buffers(info->uuid_root); +	free_root_extent_buffers(info->fs_root);  	if (free_chunk_root)  		free_root_extent_buffers(info->chunk_root);  	free_root_extent_buffers(info->free_space_root);  } +void btrfs_put_root(struct btrfs_root *root) +{ +	if (!root) +		return; + +	if (refcount_dec_and_test(&root->refs)) { +		WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); +		if (root->anon_dev) +			free_anon_bdev(root->anon_dev); +		btrfs_drew_lock_destroy(&root->snapshot_lock); +		free_extent_buffer(root->node); +		free_extent_buffer(root->commit_root); +		kfree(root->free_ino_ctl); +		kfree(root->free_ino_pinned); +#ifdef CONFIG_BTRFS_DEBUG +		spin_lock(&root->fs_info->fs_roots_radix_lock); +		list_del_init(&root->leak_list); +		spin_unlock(&root->fs_info->fs_roots_radix_lock); +#endif +		kfree(root); +	} +} +  void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)  {  	int ret; @@ -2001,13 +2022,9 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)  				     struct btrfs_root, root_list);  		list_del(&gang[0]->root_list); -		if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) { +		if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))  			btrfs_drop_and_free_fs_root(fs_info, gang[0]); -		} else { -			free_extent_buffer(gang[0]->node); -			free_extent_buffer(gang[0]->commit_root); -			btrfs_put_fs_root(gang[0]); -		} +		btrfs_put_root(gang[0]);  	}  	while (1) { @@ -2020,10 +2037,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)  			btrfs_drop_and_free_fs_root(fs_info, gang[i]);  	} -	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { +	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))  		btrfs_free_log_root_tree(NULL, fs_info); -		btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents); -	}  }  static void btrfs_init_scrub(struct btrfs_fs_info *fs_info) @@ -2069,7 +2084,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)  	BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops; -	BTRFS_I(inode)->root = fs_info->tree_root; +	BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);  	memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));  	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);  	btrfs_insert_inode_hash(inode); @@ -2189,11 +2204,6 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)  	return 0;  } -static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) -{ -	crypto_free_shash(fs_info->csum_shash); -} -  static int btrfs_replay_log(struct btrfs_fs_info *fs_info,  			    
struct btrfs_fs_devices *fs_devices)  { @@ -2208,24 +2218,23 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,  		return -EIO;  	} -	log_tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); +	log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, +					 GFP_KERNEL);  	if (!log_tree_root)  		return -ENOMEM; -	__setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); -  	log_tree_root->node = read_tree_block(fs_info, bytenr,  					      fs_info->generation + 1,  					      level, NULL);  	if (IS_ERR(log_tree_root->node)) {  		btrfs_warn(fs_info, "failed to read log tree");  		ret = PTR_ERR(log_tree_root->node); -		kfree(log_tree_root); +		log_tree_root->node = NULL; +		btrfs_put_root(log_tree_root);  		return ret;  	} else if (!extent_buffer_uptodate(log_tree_root->node)) {  		btrfs_err(fs_info, "failed to read log tree"); -		free_extent_buffer(log_tree_root->node); -		kfree(log_tree_root); +		btrfs_put_root(log_tree_root);  		return -EIO;  	}  	/* returns with log_tree_root freed on success */ @@ -2233,8 +2242,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,  	if (ret) {  		btrfs_handle_fs_error(fs_info, ret,  				      "Failed to recover log tree"); -		free_extent_buffer(log_tree_root->node); -		kfree(log_tree_root); +		btrfs_put_root(log_tree_root);  		return ret;  	} @@ -2624,67 +2632,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)  	return ret;  } -int __cold open_ctree(struct super_block *sb, -	       struct btrfs_fs_devices *fs_devices, -	       char *options) +void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)  { -	u32 sectorsize; -	u32 nodesize; -	u32 stripesize; -	u64 generation; -	u64 features; -	u16 csum_type; -	struct btrfs_key location; -	struct buffer_head *bh; -	struct btrfs_super_block *disk_super; -	struct btrfs_fs_info *fs_info = btrfs_sb(sb); -	struct btrfs_root *tree_root; -	struct btrfs_root *chunk_root; -	int ret; -	int err = -EINVAL; -	int clear_free_space_tree = 0; -	int level; - -	tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); -	chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); -	if (!tree_root || !chunk_root) { -		err = -ENOMEM; -		goto fail; -	} - -	ret = init_srcu_struct(&fs_info->subvol_srcu); -	if (ret) { -		err = ret; -		goto fail; -	} - -	ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL); -	if (ret) { -		err = ret; -		goto fail_srcu; -	} - -	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); -	if (ret) { -		err = ret; -		goto fail_dio_bytes; -	} -	fs_info->dirty_metadata_batch = PAGE_SIZE * -					(1 + ilog2(nr_cpu_ids)); - -	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); -	if (ret) { -		err = ret; -		goto fail_dirty_metadata_bytes; -	} - -	ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0, -			GFP_KERNEL); -	if (ret) { -		err = ret; -		goto fail_delalloc_bytes; -	} -  	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);  	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);  	INIT_LIST_HEAD(&fs_info->trans_list); @@ -2711,6 +2660,11 @@ int __cold open_ctree(struct super_block *sb,  	INIT_LIST_HEAD(&fs_info->space_info);  	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);  	INIT_LIST_HEAD(&fs_info->unused_bgs); +#ifdef CONFIG_BTRFS_DEBUG +	INIT_LIST_HEAD(&fs_info->allocated_roots); +	INIT_LIST_HEAD(&fs_info->allocated_ebs); +	spin_lock_init(&fs_info->eb_leak_lock); +#endif  	extent_map_tree_init(&fs_info->mapping_tree);  	btrfs_init_block_rsv(&fs_info->global_block_rsv,  			     
BTRFS_BLOCK_RSV_GLOBAL); @@ -2727,7 +2681,6 @@ int __cold open_ctree(struct super_block *sb,  	atomic_set(&fs_info->reada_works_cnt, 0);  	atomic_set(&fs_info->nr_delayed_iputs, 0);  	atomic64_set(&fs_info->tree_mod_seq, 0); -	fs_info->sb = sb;  	fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;  	fs_info->metadata_ratio = 0;  	fs_info->defrag_inodes = RB_ROOT; @@ -2746,21 +2699,6 @@ int __cold open_ctree(struct super_block *sb,  	INIT_LIST_HEAD(&fs_info->ordered_roots);  	spin_lock_init(&fs_info->ordered_root_lock); -	fs_info->btree_inode = new_inode(sb); -	if (!fs_info->btree_inode) { -		err = -ENOMEM; -		goto fail_bio_counter; -	} -	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); - -	fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), -					GFP_KERNEL); -	if (!fs_info->delayed_root) { -		err = -ENOMEM; -		goto fail_iput; -	} -	btrfs_init_delayed_root(fs_info->delayed_root); -  	btrfs_init_scrub(fs_info);  #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY  	fs_info->check_integrity_print_mask = 0; @@ -2768,20 +2706,12 @@ int __cold open_ctree(struct super_block *sb,  	btrfs_init_balance(fs_info);  	btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); -	sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; -	sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); - -	btrfs_init_btree_inode(fs_info); -  	spin_lock_init(&fs_info->block_group_cache_lock);  	fs_info->block_group_cache_tree = RB_ROOT;  	fs_info->first_logical_byte = (u64)-1; -	extent_io_tree_init(fs_info, &fs_info->freed_extents[0], -			    IO_TREE_FS_INFO_FREED_EXTENTS0, NULL); -	extent_io_tree_init(fs_info, &fs_info->freed_extents[1], -			    IO_TREE_FS_INFO_FREED_EXTENTS1, NULL); -	fs_info->pinned_extents = &fs_info->freed_extents[0]; +	extent_io_tree_init(fs_info, &fs_info->excluded_extents, +			    IO_TREE_FS_EXCLUDED_EXTENTS, NULL);  	set_bit(BTRFS_FS_BARRIER, &fs_info->flags);  	mutex_init(&fs_info->ordered_operations_mutex); @@ -2817,23 +2747,135 @@ int __cold open_ctree(struct super_block *sb,  	fs_info->swapfile_pins = RB_ROOT;  	fs_info->send_in_progress = 0; +} + +static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) +{ +	int ret; + +	fs_info->sb = sb; +	sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; +	sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); + +	ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL); +	if (ret) +		return ret; + +	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); +	if (ret) +		return ret; + +	fs_info->dirty_metadata_batch = PAGE_SIZE * +					(1 + ilog2(nr_cpu_ids)); + +	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); +	if (ret) +		return ret; + +	ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0, +			GFP_KERNEL); +	if (ret) +		return ret; -	ret = btrfs_alloc_stripe_hash_table(fs_info); +	fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), +					GFP_KERNEL); +	if (!fs_info->delayed_root) +		return -ENOMEM; +	btrfs_init_delayed_root(fs_info->delayed_root); + +	return btrfs_alloc_stripe_hash_table(fs_info); +} + +static int btrfs_uuid_rescan_kthread(void *data) +{ +	struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; +	int ret; + +	/* +	 * 1st step is to iterate through the existing UUID tree and +	 * to delete all entries that contain outdated data. +	 * 2nd step is to add all missing entries to the UUID tree. 
+	 */ +	ret = btrfs_uuid_tree_iterate(fs_info); +	if (ret < 0) { +		if (ret != -EINTR) +			btrfs_warn(fs_info, "iterating uuid_tree failed %d", +				   ret); +		up(&fs_info->uuid_tree_rescan_sem); +		return ret; +	} +	return btrfs_uuid_scan_kthread(data); +} + +static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) +{ +	struct task_struct *task; + +	down(&fs_info->uuid_tree_rescan_sem); +	task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); +	if (IS_ERR(task)) { +		/* fs_info->update_uuid_tree_gen remains 0 in all error case */ +		btrfs_warn(fs_info, "failed to start uuid_rescan task"); +		up(&fs_info->uuid_tree_rescan_sem); +		return PTR_ERR(task); +	} + +	return 0; +} + +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, +		      char *options) +{ +	u32 sectorsize; +	u32 nodesize; +	u32 stripesize; +	u64 generation; +	u64 features; +	u16 csum_type; +	struct btrfs_key location; +	struct btrfs_super_block *disk_super; +	struct btrfs_fs_info *fs_info = btrfs_sb(sb); +	struct btrfs_root *tree_root; +	struct btrfs_root *chunk_root; +	int ret; +	int err = -EINVAL; +	int clear_free_space_tree = 0; +	int level; + +	ret = init_mount_fs_info(fs_info, sb);  	if (ret) {  		err = ret; -		goto fail_alloc; +		goto fail;  	} -	__setup_root(tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); +	/* These need to be init'ed before we start creating inodes and such. */ +	tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, +				     GFP_KERNEL); +	fs_info->tree_root = tree_root; +	chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID, +				      GFP_KERNEL); +	fs_info->chunk_root = chunk_root; +	if (!tree_root || !chunk_root) { +		err = -ENOMEM; +		goto fail; +	} + +	fs_info->btree_inode = new_inode(sb); +	if (!fs_info->btree_inode) { +		err = -ENOMEM; +		goto fail; +	} +	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); +	btrfs_init_btree_inode(fs_info);  	invalidate_bdev(fs_devices->latest_bdev);  	/*  	 * Read super block and check the signature bytes only  	 */ -	bh = btrfs_read_dev_super(fs_devices->latest_bdev); -	if (IS_ERR(bh)) { -		err = PTR_ERR(bh); +	disk_super = btrfs_read_dev_super(fs_devices->latest_bdev); +	if (IS_ERR(disk_super)) { +		err = PTR_ERR(disk_super);  		goto fail_alloc;  	} @@ -2841,18 +2883,19 @@ int __cold open_ctree(struct super_block *sb,  	 * Verify the type first, if that or the the checksum value are  	 * corrupted, we'll find out  	 */ -	csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data); +	csum_type = btrfs_super_csum_type(disk_super);  	if (!btrfs_supported_super_csum(csum_type)) {  		btrfs_err(fs_info, "unsupported checksum algorithm: %u",  			  csum_type);  		err = -EINVAL; -		brelse(bh); +		btrfs_release_disk_super(disk_super);  		goto fail_alloc;  	}  	ret = btrfs_init_csum_hash(fs_info, csum_type);  	if (ret) {  		err = ret; +		btrfs_release_disk_super(disk_super);  		goto fail_alloc;  	} @@ -2860,11 +2903,11 @@ int __cold open_ctree(struct super_block *sb,  	 * We want to check superblock checksum, the type is stored inside.  	 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).  	 
*/ -	if (btrfs_check_super_csum(fs_info, bh->b_data)) { +	if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) {  		btrfs_err(fs_info, "superblock checksum mismatch");  		err = -EINVAL; -		brelse(bh); -		goto fail_csum; +		btrfs_release_disk_super(disk_super); +		goto fail_alloc;  	}  	/* @@ -2872,8 +2915,8 @@ int __cold open_ctree(struct super_block *sb,  	 * following bytes up to INFO_SIZE, the checksum is calculated from  	 * the whole block of INFO_SIZE  	 */ -	memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); -	brelse(bh); +	memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy)); +	btrfs_release_disk_super(disk_super);  	disk_super = fs_info->super_copy; @@ -2901,11 +2944,11 @@ int __cold open_ctree(struct super_block *sb,  	if (ret) {  		btrfs_err(fs_info, "superblock contains fatal errors");  		err = -EINVAL; -		goto fail_csum; +		goto fail_alloc;  	}  	if (!btrfs_super_root(disk_super)) -		goto fail_csum; +		goto fail_alloc;  	/* check FS state, whether FS is broken. */  	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) @@ -2920,7 +2963,7 @@ int __cold open_ctree(struct super_block *sb,  	ret = btrfs_parse_options(fs_info, options, sb->s_flags);  	if (ret) {  		err = ret; -		goto fail_csum; +		goto fail_alloc;  	}  	features = btrfs_super_incompat_flags(disk_super) & @@ -2930,7 +2973,7 @@ int __cold open_ctree(struct super_block *sb,  		    "cannot mount because of unsupported optional features (%llx)",  		    features);  		err = -EINVAL; -		goto fail_csum; +		goto fail_alloc;  	}  	features = btrfs_super_incompat_flags(disk_super); @@ -2974,7 +3017,7 @@ int __cold open_ctree(struct super_block *sb,  		btrfs_err(fs_info,  "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",  			nodesize, sectorsize); -		goto fail_csum; +		goto fail_alloc;  	}  	/* @@ -2990,7 +3033,7 @@ int __cold open_ctree(struct super_block *sb,  	"cannot mount read-write because of unsupported optional features (%llx)",  		       features);  		err = -EINVAL; -		goto fail_csum; +		goto fail_alloc;  	}  	ret = btrfs_init_workqueues(fs_info, fs_devices); @@ -3021,8 +3064,6 @@ int __cold open_ctree(struct super_block *sb,  	generation = btrfs_super_chunk_root_generation(disk_super);  	level = btrfs_super_chunk_root_level(disk_super); -	__setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); -  	chunk_root->node = read_tree_block(fs_info,  					   btrfs_super_chunk_root(disk_super),  					   generation, level, NULL); @@ -3038,7 +3079,8 @@ int __cold open_ctree(struct super_block *sb,  	chunk_root->commit_root = btrfs_root_node(chunk_root);  	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, -	   btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE); +			   offsetof(struct btrfs_header, chunk_tree_uuid), +			   BTRFS_UUID_SIZE);  	ret = btrfs_read_chunk_tree(fs_info);  	if (ret) { @@ -3061,6 +3103,18 @@ int __cold open_ctree(struct super_block *sb,  	if (ret)  		goto fail_tree_roots; +	/* +	 * If we have a uuid root and we're not being told to rescan we need to +	 * check the generation here so we can set the +	 * BTRFS_FS_UPDATE_UUID_TREE_GEN bit.  Otherwise we could commit the +	 * transaction during a balance or the log replay without updating the +	 * uuid generation, and then if we crash we would rescan the uuid tree, +	 * even though it was perfectly fine. 
+	 */ +	if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) && +	    fs_info->generation == btrfs_super_uuid_tree_generation(disk_super)) +		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); +  	ret = btrfs_verify_dev_extents(fs_info);  	if (ret) {  		btrfs_err(fs_info, @@ -3196,7 +3250,7 @@ int __cold open_ctree(struct super_block *sb,  	location.type = BTRFS_ROOT_ITEM_KEY;  	location.offset = 0; -	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); +	fs_info->fs_root = btrfs_get_fs_root(fs_info, &location, true);  	if (IS_ERR(fs_info->fs_root)) {  		err = PTR_ERR(fs_info->fs_root);  		btrfs_warn(fs_info, "failed to read fs tree: %d", err); @@ -3285,8 +3339,6 @@ int __cold open_ctree(struct super_block *sb,  			close_ctree(fs_info);  			return ret;  		} -	} else { -		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);  	}  	set_bit(BTRFS_FS_OPEN, &fs_info->flags); @@ -3329,90 +3381,78 @@ fail_tree_roots:  fail_sb_buffer:  	btrfs_stop_all_workers(fs_info);  	btrfs_free_block_groups(fs_info); -fail_csum: -	btrfs_free_csum_hash(fs_info);  fail_alloc: -fail_iput:  	btrfs_mapping_tree_free(&fs_info->mapping_tree);  	iput(fs_info->btree_inode); -fail_bio_counter: -	percpu_counter_destroy(&fs_info->dev_replace.bio_counter); -fail_delalloc_bytes: -	percpu_counter_destroy(&fs_info->delalloc_bytes); -fail_dirty_metadata_bytes: -	percpu_counter_destroy(&fs_info->dirty_metadata_bytes); -fail_dio_bytes: -	percpu_counter_destroy(&fs_info->dio_bytes); -fail_srcu: -	cleanup_srcu_struct(&fs_info->subvol_srcu);  fail: -	btrfs_free_stripe_hash_table(fs_info);  	btrfs_close_devices(fs_info->fs_devices);  	return err;  }  ALLOW_ERROR_INJECTION(open_ctree, ERRNO); -static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) +static void btrfs_end_super_write(struct bio *bio)  { -	if (uptodate) { -		set_buffer_uptodate(bh); -	} else { -		struct btrfs_device *device = (struct btrfs_device *) -			bh->b_private; - -		btrfs_warn_rl_in_rcu(device->fs_info, -				"lost page write due to IO error on %s", -					  rcu_str_deref(device->name)); -		/* note, we don't set_buffer_write_io_error because we have -		 * our own ways of dealing with the IO errors -		 */ -		clear_buffer_uptodate(bh); -		btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS); +	struct btrfs_device *device = bio->bi_private; +	struct bio_vec *bvec; +	struct bvec_iter_all iter_all; +	struct page *page; + +	bio_for_each_segment_all(bvec, bio, iter_all) { +		page = bvec->bv_page; + +		if (bio->bi_status) { +			btrfs_warn_rl_in_rcu(device->fs_info, +				"lost page write due to IO error on %s (%d)", +				rcu_str_deref(device->name), +				blk_status_to_errno(bio->bi_status)); +			ClearPageUptodate(page); +			SetPageError(page); +			btrfs_dev_stat_inc_and_print(device, +						     BTRFS_DEV_STAT_WRITE_ERRS); +		} else { +			SetPageUptodate(page); +		} + +		put_page(page); +		unlock_page(page);  	} -	unlock_buffer(bh); -	put_bh(bh); + +	bio_put(bio);  } -int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, -			struct buffer_head **bh_ret) +struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, +						   int copy_num)  { -	struct buffer_head *bh;  	struct btrfs_super_block *super; +	struct page *page;  	u64 bytenr; +	struct address_space *mapping = bdev->bd_inode->i_mapping;  	bytenr = btrfs_sb_offset(copy_num);  	if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) -		return -EINVAL; +		return ERR_PTR(-EINVAL); -	bh = __bread(bdev, 
bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE); -	/* -	 * If we fail to read from the underlying devices, as of now -	 * the best option we have is to mark it EIO. -	 */ -	if (!bh) -		return -EIO; +	page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); +	if (IS_ERR(page)) +		return ERR_CAST(page); -	super = (struct btrfs_super_block *)bh->b_data; +	super = page_address(page);  	if (btrfs_super_bytenr(super) != bytenr ||  		    btrfs_super_magic(super) != BTRFS_MAGIC) { -		brelse(bh); -		return -EINVAL; +		btrfs_release_disk_super(super); +		return ERR_PTR(-EINVAL);  	} -	*bh_ret = bh; -	return 0; +	return super;  } -struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) +struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)  { -	struct buffer_head *bh; -	struct buffer_head *latest = NULL; -	struct btrfs_super_block *super; +	struct btrfs_super_block *super, *latest = NULL;  	int i;  	u64 transid = 0; -	int ret = -EINVAL;  	/* we would like to check all the supers, but that would make  	 * a btrfs mount succeed after a mkfs from a different FS. @@ -3420,48 +3460,41 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)  	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead  	 */  	for (i = 0; i < 1; i++) { -		ret = btrfs_read_dev_one_super(bdev, i, &bh); -		if (ret) +		super = btrfs_read_dev_one_super(bdev, i); +		if (IS_ERR(super))  			continue; -		super = (struct btrfs_super_block *)bh->b_data; -  		if (!latest || btrfs_super_generation(super) > transid) { -			brelse(latest); -			latest = bh; +			if (latest) +				btrfs_release_disk_super(super); + +			latest = super;  			transid = btrfs_super_generation(super); -		} else { -			brelse(bh);  		}  	} -	if (!latest) -		return ERR_PTR(ret); - -	return latest; +	return super;  }  /*   * Write superblock @sb to the @device. Do not wait for completion, all the - * buffer heads we write are pinned. + * pages we use for writing are locked.   *   * Write @max_mirrors copies of the superblock, where 0 means default that fit   * the expected device size at commit time. Note that max_mirrors must be   * same for write and wait phases.   * - * Return number of errors when buffer head is not found or submission fails. + * Return number of errors when page is not found or submission fails.   
*/  static int write_dev_supers(struct btrfs_device *device,  			    struct btrfs_super_block *sb, int max_mirrors)  {  	struct btrfs_fs_info *fs_info = device->fs_info; +	struct address_space *mapping = device->bdev->bd_inode->i_mapping;  	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); -	struct buffer_head *bh;  	int i; -	int ret;  	int errors = 0;  	u64 bytenr; -	int op_flags;  	if (max_mirrors == 0)  		max_mirrors = BTRFS_SUPER_MIRROR_MAX; @@ -3469,6 +3502,10 @@ static int write_dev_supers(struct btrfs_device *device,  	shash->tfm = fs_info->csum_shash;  	for (i = 0; i < max_mirrors; i++) { +		struct page *page; +		struct bio *bio; +		struct btrfs_super_block *disk_super; +  		bytenr = btrfs_sb_offset(i);  		if (bytenr + BTRFS_SUPER_INFO_SIZE >=  		    device->commit_total_bytes) @@ -3481,37 +3518,45 @@ static int write_dev_supers(struct btrfs_device *device,  				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);  		crypto_shash_final(shash, sb->csum); -		/* One reference for us, and we leave it for the caller */ -		bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, -			      BTRFS_SUPER_INFO_SIZE); -		if (!bh) { +		page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, +					   GFP_NOFS); +		if (!page) {  			btrfs_err(device->fs_info, -			    "couldn't get super buffer head for bytenr %llu", +			    "couldn't get super block page for bytenr %llu",  			    bytenr);  			errors++;  			continue;  		} -		memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); +		/* Bump the refcount for wait_dev_supers() */ +		get_page(page); -		/* one reference for submit_bh */ -		get_bh(bh); +		disk_super = page_address(page); +		memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE); -		set_buffer_uptodate(bh); -		lock_buffer(bh); -		bh->b_end_io = btrfs_end_buffer_write_sync; -		bh->b_private = device; +		/* +		 * Directly use bios here instead of relying on the page cache +		 * to do I/O, so we don't lose the ability to do integrity +		 * checking. +		 */ +		bio = bio_alloc(GFP_NOFS, 1); +		bio_set_dev(bio, device->bdev); +		bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; +		bio->bi_private = device; +		bio->bi_end_io = btrfs_end_super_write; +		__bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE, +			       offset_in_page(bytenr));  		/* -		 * we fua the first super.  The others we allow -		 * to go down lazy. +		 * We FUA only the first super block.  The others we allow to +		 * go down lazy and there's a short window where the on-disk +		 * copies might still contain the older version.  		 */ -		op_flags = REQ_SYNC | REQ_META | REQ_PRIO; +		bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO;  		if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) -			op_flags |= REQ_FUA; -		ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh); -		if (ret) -			errors++; +			bio->bi_opf |= REQ_FUA; + +		btrfsic_submit_bio(bio);  	}  	return errors < i ? 0 : -1;  } @@ -3520,12 +3565,11 @@ static int write_dev_supers(struct btrfs_device *device,   * Wait for write completion of superblocks done by write_dev_supers,   * @max_mirrors same for write and wait phases.   * - * Return number of errors when buffer head is not found or not marked up to + * Return number of errors when page is not found or not marked up to   * date.   
*/  static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)  { -	struct buffer_head *bh;  	int i;  	int errors = 0;  	bool primary_failed = false; @@ -3535,32 +3579,34 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)  		max_mirrors = BTRFS_SUPER_MIRROR_MAX;  	for (i = 0; i < max_mirrors; i++) { +		struct page *page; +  		bytenr = btrfs_sb_offset(i);  		if (bytenr + BTRFS_SUPER_INFO_SIZE >=  		    device->commit_total_bytes)  			break; -		bh = __find_get_block(device->bdev, -				      bytenr / BTRFS_BDEV_BLOCKSIZE, -				      BTRFS_SUPER_INFO_SIZE); -		if (!bh) { +		page = find_get_page(device->bdev->bd_inode->i_mapping, +				     bytenr >> PAGE_SHIFT); +		if (!page) {  			errors++;  			if (i == 0)  				primary_failed = true;  			continue;  		} -		wait_on_buffer(bh); -		if (!buffer_uptodate(bh)) { +		/* Page is submitted locked and unlocked once the IO completes */ +		wait_on_page_locked(page); +		if (PageError(page)) {  			errors++;  			if (i == 0)  				primary_failed = true;  		} -		/* drop our reference */ -		brelse(bh); +		/* Drop our reference */ +		put_page(page); -		/* drop the reference from the writing run */ -		brelse(bh); +		/* Drop the reference from the writing run */ +		put_page(page);  	}  	/* log error, force error return */ @@ -3832,20 +3878,19 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)  void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,  				  struct btrfs_root *root)  { +	bool drop_ref = false; +  	spin_lock(&fs_info->fs_roots_radix_lock);  	radix_tree_delete(&fs_info->fs_roots_radix,  			  (unsigned long)root->root_key.objectid); +	if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) +		drop_ref = true;  	spin_unlock(&fs_info->fs_roots_radix_lock); -	if (btrfs_root_refs(&root->root_item) == 0) -		synchronize_srcu(&fs_info->subvol_srcu); -  	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {  		btrfs_free_log(NULL, root);  		if (root->reloc_root) { -			free_extent_buffer(root->reloc_root->node); -			free_extent_buffer(root->reloc_root->commit_root); -			btrfs_put_fs_root(root->reloc_root); +			btrfs_put_root(root->reloc_root);  			root->reloc_root = NULL;  		}  	} @@ -3854,22 +3899,12 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,  		__btrfs_remove_free_space_cache(root->free_ino_pinned);  	if (root->free_ino_ctl)  		__btrfs_remove_free_space_cache(root->free_ino_ctl); -	btrfs_free_fs_root(root); -} - -void btrfs_free_fs_root(struct btrfs_root *root) -{ -	iput(root->ino_cache_inode); -	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); -	if (root->anon_dev) -		free_anon_bdev(root->anon_dev); -	if (root->subv_writers) -		btrfs_free_subvolume_writers(root->subv_writers); -	free_extent_buffer(root->node); -	free_extent_buffer(root->commit_root); -	kfree(root->free_ino_ctl); -	kfree(root->free_ino_pinned); -	btrfs_put_fs_root(root); +	if (root->ino_cache_inode) { +		iput(root->ino_cache_inode); +		root->ino_cache_inode = NULL; +	} +	if (drop_ref) +		btrfs_put_root(root);  }  int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) @@ -3879,15 +3914,14 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)  	int i = 0;  	int err = 0;  	unsigned int ret = 0; -	int index;  	while (1) { -		index = srcu_read_lock(&fs_info->subvol_srcu); +		spin_lock(&fs_info->fs_roots_radix_lock);  		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,  					     (void **)gang, root_objectid,  					     ARRAY_SIZE(gang));  		if (!ret) { -			
srcu_read_unlock(&fs_info->subvol_srcu, index); +			spin_unlock(&fs_info->fs_roots_radix_lock);  			break;  		}  		root_objectid = gang[ret - 1]->root_key.objectid + 1; @@ -3899,9 +3933,9 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)  				continue;  			}  			/* grab all the search result for later use */ -			gang[i] = btrfs_grab_fs_root(gang[i]); +			gang[i] = btrfs_grab_root(gang[i]);  		} -		srcu_read_unlock(&fs_info->subvol_srcu, index); +		spin_unlock(&fs_info->fs_roots_radix_lock);  		for (i = 0; i < ret; i++) {  			if (!gang[i]) @@ -3910,7 +3944,7 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)  			err = btrfs_orphan_cleanup(gang[i]);  			if (err)  				break; -			btrfs_put_fs_root(gang[i]); +			btrfs_put_root(gang[i]);  		}  		root_objectid++;  	} @@ -3918,7 +3952,7 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)  	/* release the uncleaned roots due to error */  	for (; i < ret; i++) {  		if (gang[i]) -			btrfs_put_fs_root(gang[i]); +			btrfs_put_root(gang[i]);  	}  	return err;  } @@ -3990,6 +4024,19 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)  		 */  		btrfs_delete_unused_bgs(fs_info); +		/* +		 * There might be existing delayed inode workers still running +		 * and holding an empty delayed inode item. We must wait for +		 * them to complete first because they can create a transaction. +		 * This happens when someone calls btrfs_balance_delayed_items() +		 * and then a transaction commit runs the same delayed nodes +		 * before any delayed worker has done something with the nodes. +		 * We must wait for any worker here and not at transaction +		 * commit time since that could cause a deadlock. +		 * This is a very rare case. +		 */ +		btrfs_flush_workqueue(fs_info->delayed_workers); +  		ret = btrfs_commit_super(fs_info);  		if (ret)  			btrfs_err(fs_info, "commit super ret %d", ret); @@ -4020,8 +4067,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)  	btrfs_sysfs_remove_mounted(fs_info);  	btrfs_sysfs_remove_fsid(fs_info->fs_devices); -	btrfs_free_fs_roots(fs_info); -  	btrfs_put_block_group_cache(fs_info);  	/* @@ -4033,6 +4078,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)  	clear_bit(BTRFS_FS_OPEN, &fs_info->flags);  	free_root_pointers(fs_info, true); +	btrfs_free_fs_roots(fs_info);  	/*  	 * We must free the block groups after dropping the fs_roots as we could @@ -4052,16 +4098,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)  	btrfs_mapping_tree_free(&fs_info->mapping_tree);  	btrfs_close_devices(fs_info->fs_devices); - -	percpu_counter_destroy(&fs_info->dirty_metadata_bytes); -	percpu_counter_destroy(&fs_info->delalloc_bytes); -	percpu_counter_destroy(&fs_info->dio_bytes); -	percpu_counter_destroy(&fs_info->dev_replace.bio_counter); -	cleanup_srcu_struct(&fs_info->subvol_srcu); - -	btrfs_free_csum_hash(fs_info); -	btrfs_free_stripe_hash_table(fs_info); -	btrfs_free_ref_cache(fs_info);  }  int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, @@ -4235,7 +4271,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,  	spin_lock(&delayed_refs->lock);  	if (atomic_read(&delayed_refs->num_entries) == 0) {  		spin_unlock(&delayed_refs->lock); -		btrfs_info(fs_info, "delayed_refs has NO entry"); +		btrfs_debug(fs_info, "delayed_refs has NO entry");  		return ret;  	} @@ -4269,9 +4305,30 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,  		spin_unlock(&delayed_refs->lock);  		mutex_unlock(&head->mutex); -		if (pin_bytes) -			
btrfs_pin_extent(fs_info, head->bytenr, -					 head->num_bytes, 1); +		if (pin_bytes) { +			struct btrfs_block_group *cache; + +			cache = btrfs_lookup_block_group(fs_info, head->bytenr); +			BUG_ON(!cache); + +			spin_lock(&cache->space_info->lock); +			spin_lock(&cache->lock); +			cache->pinned += head->num_bytes; +			btrfs_space_info_update_bytes_pinned(fs_info, +				cache->space_info, head->num_bytes); +			cache->reserved -= head->num_bytes; +			cache->space_info->bytes_reserved -= head->num_bytes; +			spin_unlock(&cache->lock); +			spin_unlock(&cache->space_info->lock); +			percpu_counter_add_batch( +				&cache->space_info->total_bytes_pinned, +				head->num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); + +			btrfs_put_block_group(cache); + +			btrfs_error_unpin_extent_range(fs_info, head->bytenr, +				head->bytenr + head->num_bytes - 1); +		}  		btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);  		btrfs_put_delayed_ref_head(head);  		cond_resched(); @@ -4327,12 +4384,12 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)  	while (!list_empty(&splice)) {  		root = list_first_entry(&splice, struct btrfs_root,  					 delalloc_root); -		root = btrfs_grab_fs_root(root); +		root = btrfs_grab_root(root);  		BUG_ON(!root);  		spin_unlock(&fs_info->delalloc_root_lock);  		btrfs_destroy_delalloc_inodes(root); -		btrfs_put_fs_root(root); +		btrfs_put_root(root);  		spin_lock(&fs_info->delalloc_root_lock);  	} @@ -4373,16 +4430,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,  }  static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, -				       struct extent_io_tree *pinned_extents) +				       struct extent_io_tree *unpin)  { -	struct extent_io_tree *unpin;  	u64 start;  	u64 end;  	int ret; -	bool loop = true; -	unpin = pinned_extents; -again:  	while (1) {  		struct extent_state *cached_state = NULL; @@ -4407,15 +4460,6 @@ again:  		cond_resched();  	} -	if (loop) { -		if (unpin == &fs_info->freed_extents[0]) -			unpin = &fs_info->freed_extents[1]; -		else -			unpin = &fs_info->freed_extents[0]; -		loop = false; -		goto again; -	} -  	return 0;  } @@ -4506,8 +4550,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,  	btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,  				     EXTENT_DIRTY); -	btrfs_destroy_pinned_extent(fs_info, -				    fs_info->pinned_extents); +	btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);  	cur_trans->state =TRANS_STATE_COMPLETED;  	wake_up(&cur_trans->commit_wait); @@ -4559,7 +4602,6 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)  	btrfs_destroy_all_ordered_extents(fs_info);  	btrfs_destroy_delayed_inodes(fs_info);  	btrfs_assert_delayed_root_empty(fs_info); -	btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);  	btrfs_destroy_all_delalloc_inodes(fs_info);  	mutex_unlock(&fs_info->transaction_kthread_mutex); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 8c2d6cf1ce59..cd629113f61c 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -39,6 +39,8 @@ static inline u64 btrfs_sb_offset(int mirror)  struct btrfs_device;  struct btrfs_fs_devices; +void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info); +void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);  int btrfs_verify_level_key(struct extent_buffer *eb, int level,  			   struct btrfs_key *first_key, u64 parent_transid);  struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, @@ -54,15 +56,12 @@ int 
__cold open_ctree(struct super_block *sb,  	       char *options);  void __cold close_ctree(struct btrfs_fs_info *fs_info);  int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); -struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); -int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, -			struct buffer_head **bh_ret); +struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev); +struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, +						   int copy_num);  int btrfs_commit_super(struct btrfs_fs_info *fs_info); -struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, -				      struct btrfs_key *location); -int btrfs_init_fs_root(struct btrfs_root *root); -struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, -					u64 root_id); +struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, +					struct btrfs_key *key);  int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,  			 struct btrfs_root *root);  void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); @@ -70,19 +69,13 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);  struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,  				     struct btrfs_key *key,  				     bool check_ref); -static inline struct btrfs_root * -btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, -			   struct btrfs_key *location) -{ -	return btrfs_get_fs_root(fs_info, location, true); -} +void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);  int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);  void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);  void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);  void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,  				 struct btrfs_root *root); -void btrfs_free_fs_root(struct btrfs_root *root);  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS  struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); @@ -95,19 +88,16 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);   * If you want to ensure the whole tree is safe, you should use   * 	fs_info->subvol_srcu   */ -static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) +static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)  { +	if (!root) +		return NULL;  	if (refcount_inc_not_zero(&root->refs))  		return root;  	return NULL;  } -static inline void btrfs_put_fs_root(struct btrfs_root *root) -{ -	if (refcount_dec_and_test(&root->refs)) -		kfree(root); -} - +void btrfs_put_root(struct btrfs_root *root);  void btrfs_mark_buffer_dirty(struct extent_buffer *buf);  int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,  			  int atomic); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 72e312cae69d..2bb25d2dc44b 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -57,16 +57,14 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,  	return type;  } -static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, -				       u64 root_objectid, u32 generation, -				       int check_generation) +struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, +				u64 root_objectid, u32 generation, +				int check_generation)  {  	struct btrfs_fs_info *fs_info = btrfs_sb(sb);  	struct btrfs_root *root;  	struct inode *inode;  	struct btrfs_key key; -	int index; -	int err = 0;  	if (objectid < BTRFS_FIRST_FREE_OBJECTID)  		return 
ERR_PTR(-ESTALE); @@ -75,25 +73,18 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,  	key.type = BTRFS_ROOT_ITEM_KEY;  	key.offset = (u64)-1; -	index = srcu_read_lock(&fs_info->subvol_srcu); - -	root = btrfs_read_fs_root_no_name(fs_info, &key); -	if (IS_ERR(root)) { -		err = PTR_ERR(root); -		goto fail; -	} +	root = btrfs_get_fs_root(fs_info, &key, true); +	if (IS_ERR(root)) +		return ERR_CAST(root);  	key.objectid = objectid;  	key.type = BTRFS_INODE_ITEM_KEY;  	key.offset = 0;  	inode = btrfs_iget(sb, &key, root); -	if (IS_ERR(inode)) { -		err = PTR_ERR(inode); -		goto fail; -	} - -	srcu_read_unlock(&fs_info->subvol_srcu, index); +	btrfs_put_root(root); +	if (IS_ERR(inode)) +		return ERR_CAST(inode);  	if (check_generation && generation != inode->i_generation) {  		iput(inode); @@ -101,9 +92,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,  	}  	return d_obtain_alias(inode); -fail: -	srcu_read_unlock(&fs_info->subvol_srcu, index); -	return ERR_PTR(err);  }  static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, @@ -152,7 +140,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,  	return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);  } -static struct dentry *btrfs_get_parent(struct dentry *child) +struct dentry *btrfs_get_parent(struct dentry *child)  {  	struct inode *dir = d_inode(child);  	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index 57488ecd7d4e..f32f4113c976 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -18,4 +18,9 @@ struct btrfs_fid {  	u64 parent_root_objectid;  } __attribute__ ((packed)); +struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, +				u64 root_objectid, u32 generation, +				int check_generation); +struct dentry *btrfs_get_parent(struct dentry *child); +  #endif diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index a3febe746c79..b4a7bad3e82e 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -36,13 +36,14 @@ struct io_failure_record;  #define CHUNK_TRIMMED				EXTENT_DEFRAG  enum { -	IO_TREE_FS_INFO_FREED_EXTENTS0, -	IO_TREE_FS_INFO_FREED_EXTENTS1, +	IO_TREE_FS_PINNED_EXTENTS, +	IO_TREE_FS_EXCLUDED_EXTENTS,  	IO_TREE_INODE_IO,  	IO_TREE_INODE_IO_FAILURE,  	IO_TREE_RELOC_BLOCKS,  	IO_TREE_TRANS_DIRTY_PAGES,  	IO_TREE_ROOT_DIRTY_LOG_PAGES, +	IO_TREE_INODE_FILE_EXTENT,  	IO_TREE_SELFTEST,  }; @@ -222,6 +223,8 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,  			  struct extent_state **cached_state);  void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,  				 u64 *start_ret, u64 *end_ret, unsigned bits); +int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, +			       u64 *start_ret, u64 *end_ret, unsigned bits);  int extent_invalidatepage(struct extent_io_tree *tree,  			  struct page *page, unsigned long offset);  bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a7bc66121330..54a64d1e18c6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -64,10 +64,8 @@ int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,  			      u64 start, u64 num_bytes)  {  	u64 end = start + num_bytes - 1; -	set_extent_bits(&fs_info->freed_extents[0], -			start, end, EXTENT_UPTODATE); -	set_extent_bits(&fs_info->freed_extents[1], -			start, end, 
EXTENT_UPTODATE); +	set_extent_bits(&fs_info->excluded_extents, start, end, +			EXTENT_UPTODATE);  	return 0;  } @@ -79,10 +77,8 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache)  	start = cache->start;  	end = start + cache->length - 1; -	clear_extent_bits(&fs_info->freed_extents[0], -			  start, end, EXTENT_UPTODATE); -	clear_extent_bits(&fs_info->freed_extents[1], -			  start, end, EXTENT_UPTODATE); +	clear_extent_bits(&fs_info->excluded_extents, start, end, +			  EXTENT_UPTODATE);  }  static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) @@ -1193,24 +1189,6 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,  	return ret;  } -static int insert_extent_backref(struct btrfs_trans_handle *trans, -				 struct btrfs_path *path, -				 u64 bytenr, u64 parent, u64 root_objectid, -				 u64 owner, u64 offset, int refs_to_add) -{ -	int ret; -	if (owner < BTRFS_FIRST_FREE_OBJECTID) { -		BUG_ON(refs_to_add != 1); -		ret = insert_tree_block_ref(trans, path, bytenr, parent, -					    root_objectid); -	} else { -		ret = insert_extent_data_ref(trans, path, bytenr, parent, -					     root_objectid, owner, offset, -					     refs_to_add); -	} -	return ret; -} -  static int remove_extent_backref(struct btrfs_trans_handle *trans,  				 struct btrfs_path *path,  				 struct btrfs_extent_inline_ref *iref, @@ -1469,7 +1447,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,  	if (!path)  		return -ENOMEM; -	path->reada = READA_FORWARD;  	path->leave_spinning = 1;  	/* this will setup the path even if it fails to insert the back ref */  	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, @@ -1494,11 +1471,17 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,  	btrfs_mark_buffer_dirty(leaf);  	btrfs_release_path(path); -	path->reada = READA_FORWARD;  	path->leave_spinning = 1;  	/* now insert the actual backref */ -	ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid, -				    owner, offset, refs_to_add); +	if (owner < BTRFS_FIRST_FREE_OBJECTID) { +		BUG_ON(refs_to_add != 1); +		ret = insert_tree_block_ref(trans, path, bytenr, parent, +					    root_objectid); +	} else { +		ret = insert_extent_data_ref(trans, path, bytenr, parent, +					     root_objectid, owner, offset, +					     refs_to_add); +	}  	if (ret)  		btrfs_abort_transaction(trans, ret);  out: @@ -1583,7 +1566,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,  	int err = 0;  	int metadata = !extent_op->is_data; -	if (trans->aborted) +	if (TRANS_ABORTED(trans))  		return 0;  	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) @@ -1604,7 +1587,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,  	}  again: -	path->reada = READA_FORWARD;  	path->leave_spinning = 1;  	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);  	if (ret < 0) { @@ -1703,10 +1685,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,  {  	int ret = 0; -	if (trans->aborted) { +	if (TRANS_ABORTED(trans)) {  		if (insert_reserved) -			btrfs_pin_extent(trans->fs_info, node->bytenr, -					 node->num_bytes, 1); +			btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);  		return 0;  	} @@ -1721,8 +1702,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,  	else  		BUG();  	if (ret && insert_reserved) -		btrfs_pin_extent(trans->fs_info, node->bytenr, -				 node->num_bytes, 1); +		btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);  	return ret;  } @@ 
-1867,8 +1847,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,  	spin_unlock(&delayed_refs->lock);  	if (head->must_insert_reserved) { -		btrfs_pin_extent(fs_info, head->bytenr, -				 head->num_bytes, 1); +		btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);  		if (head->is_data) {  			ret = btrfs_del_csums(trans, fs_info->csum_root,  					      head->bytenr, head->num_bytes); @@ -2191,7 +2170,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,  	int run_all = count == (unsigned long)-1;  	/* We'll clean this up in btrfs_cleanup_transaction */ -	if (trans->aborted) +	if (TRANS_ABORTED(trans))  		return 0;  	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) @@ -2238,7 +2217,7 @@ out:  }  int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, -				u64 bytenr, u64 num_bytes, u64 flags, +				struct extent_buffer *eb, u64 flags,  				int level, int is_data)  {  	struct btrfs_delayed_extent_op *extent_op; @@ -2254,7 +2233,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,  	extent_op->is_data = is_data ? true : false;  	extent_op->level = level; -	ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); +	ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);  	if (ret)  		btrfs_free_delayed_extent_op(extent_op);  	return ret; @@ -2588,7 +2567,8 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)  	return bytenr;  } -static int pin_down_extent(struct btrfs_block_group *cache, +static int pin_down_extent(struct btrfs_trans_handle *trans, +			   struct btrfs_block_group *cache,  			   u64 bytenr, u64 num_bytes, int reserved)  {  	struct btrfs_fs_info *fs_info = cache->fs_info; @@ -2607,22 +2587,20 @@ static int pin_down_extent(struct btrfs_block_group *cache,  	percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,  		    num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); -	set_extent_dirty(fs_info->pinned_extents, bytenr, +	set_extent_dirty(&trans->transaction->pinned_extents, bytenr,  			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);  	return 0;  } -int btrfs_pin_extent(struct btrfs_fs_info *fs_info, +int btrfs_pin_extent(struct btrfs_trans_handle *trans,  		     u64 bytenr, u64 num_bytes, int reserved)  {  	struct btrfs_block_group *cache; -	ASSERT(fs_info->running_transaction); - -	cache = btrfs_lookup_block_group(fs_info, bytenr); +	cache = btrfs_lookup_block_group(trans->fs_info, bytenr);  	BUG_ON(!cache); /* Logic error */ -	pin_down_extent(cache, bytenr, num_bytes, reserved); +	pin_down_extent(trans, cache, bytenr, num_bytes, reserved);  	btrfs_put_block_group(cache);  	return 0; @@ -2631,13 +2609,15 @@ int btrfs_pin_extent(struct btrfs_fs_info *fs_info,  /*   * this function must be called within transaction   */ -int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,  				    u64 bytenr, u64 num_bytes)  {  	struct btrfs_block_group *cache;  	int ret; -	cache = btrfs_lookup_block_group(fs_info, bytenr); +	btrfs_add_excluded_extent(trans->fs_info, bytenr, num_bytes); + +	cache = btrfs_lookup_block_group(trans->fs_info, bytenr);  	if (!cache)  		return -EINVAL; @@ -2649,7 +2629,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,  	 */  	btrfs_cache_block_group(cache, 1); -	pin_down_extent(cache, bytenr, num_bytes, 0); +	pin_down_extent(trans, cache, bytenr, num_bytes, 0);  	/* remove us from the free space cache (if we're there at all) 
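Worth spelling out from the hunks above: pin_down_extent() and btrfs_pin_extent() now take the transaction handle and record pinned ranges in trans->transaction->pinned_extents rather than in a filesystem-global tree. A minimal userspace sketch of that ownership change follows; struct transaction, pin_extent() and finish_extent_commit() are hypothetical stand-ins, not btrfs interfaces.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins: each "transaction" owns the ranges it pinned. */
struct pinned_range {
	unsigned long long start;
	unsigned long long len;
	struct pinned_range *next;
};

struct transaction {
	struct pinned_range *pinned;	/* per-transaction list, not global state */
};

static int pin_extent(struct transaction *trans, unsigned long long start,
		      unsigned long long len)
{
	struct pinned_range *r = malloc(sizeof(*r));

	if (!r)
		return -1;
	r->start = start;
	r->len = len;
	r->next = trans->pinned;
	trans->pinned = r;
	return 0;
}

/* At "commit" time only this transaction's pins are walked and released. */
static void finish_extent_commit(struct transaction *trans)
{
	while (trans->pinned) {
		struct pinned_range *r = trans->pinned;

		trans->pinned = r->next;
		printf("unpin %llu+%llu\n", r->start, r->len);
		free(r);
	}
}

int main(void)
{
	struct transaction trans = { .pinned = NULL };

	pin_extent(&trans, 4096, 8192);
	pin_extent(&trans, 1 << 20, 4096);
	finish_extent_commit(&trans);
	return 0;
}

Once the pinned set belongs to the transaction, the later hunk that deletes the freed_extents[0]/freed_extents[1] swap in btrfs_prepare_extent_commit() follows naturally: there is no longer a global pair to alternate between.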
*/  	ret = btrfs_remove_free_space(cache, bytenr, num_bytes); @@ -2763,11 +2743,6 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)  		}  	} -	if (fs_info->pinned_extents == &fs_info->freed_extents[0]) -		fs_info->pinned_extents = &fs_info->freed_extents[1]; -	else -		fs_info->pinned_extents = &fs_info->freed_extents[0]; -  	up_write(&fs_info->commit_root_sem);  	btrfs_update_global_block_rsv(fs_info); @@ -2908,12 +2883,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)  	u64 end;  	int ret; -	if (fs_info->pinned_extents == &fs_info->freed_extents[0]) -		unpin = &fs_info->freed_extents[1]; -	else -		unpin = &fs_info->freed_extents[0]; +	unpin = &trans->transaction->pinned_extents; -	while (!trans->aborted) { +	while (!TRANS_ABORTED(trans)) {  		struct extent_state *cached_state = NULL;  		mutex_lock(&fs_info->unused_bg_unpin_mutex); @@ -2923,6 +2895,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)  			mutex_unlock(&fs_info->unused_bg_unpin_mutex);  			break;  		} +		if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) +			clear_extent_bits(&fs_info->excluded_extents, start, +					  end, EXTENT_UPTODATE);  		if (btrfs_test_opt(fs_info, DISCARD_SYNC))  			ret = btrfs_discard_extent(fs_info, start, @@ -2950,7 +2925,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)  		u64 trimmed = 0;  		ret = -EROFS; -		if (!trans->aborted) +		if (!TRANS_ABORTED(trans))  			ret = btrfs_discard_extent(fs_info,  						   block_group->start,  						   block_group->length, @@ -3000,7 +2975,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,  	if (!path)  		return -ENOMEM; -	path->reada = READA_FORWARD;  	path->leave_spinning = 1;  	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; @@ -3301,7 +3275,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,  		cache = btrfs_lookup_block_group(fs_info, buf->start);  		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { -			pin_down_extent(cache, buf->start, buf->len, 1); +			pin_down_extent(trans, cache, buf->start, buf->len, 1);  			btrfs_put_block_group(cache);  			goto out;  		} @@ -3345,7 +3319,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)  	    (ref->type == BTRFS_REF_DATA &&  	     ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {  		/* unlocks the pinned mutex */ -		btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1); +		btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);  		old_ref_mod = new_ref_mod = 0;  		ret = 0;  	} else if (ref->type == BTRFS_REF_METADATA) { @@ -3438,6 +3412,10 @@ btrfs_release_block_group(struct btrfs_block_group *cache,  	btrfs_put_block_group(cache);  } +enum btrfs_extent_allocation_policy { +	BTRFS_EXTENT_ALLOC_CLUSTERED, +}; +  /*   * Structure used internally for find_free_extent() function.  Wraps needed   * parameters. @@ -3454,6 +3432,8 @@ struct find_free_extent_ctl {  	/* For clustered allocation */  	u64 empty_cluster; +	struct btrfs_free_cluster *last_ptr; +	bool use_cluster;  	bool have_caching_bg;  	bool orig_have_caching_bg; @@ -3489,6 +3469,12 @@ struct find_free_extent_ctl {  	/* Found result */  	u64 found_offset; + +	/* Hint where to start looking for an empty space */ +	u64 hint_byte; + +	/* Allocation policy */ +	enum btrfs_extent_allocation_policy policy;  }; @@ -3501,11 +3487,11 @@ struct find_free_extent_ctl {   * Return 0 means we have found a location and set ffe_ctl->found_offset.   
*/  static int find_free_extent_clustered(struct btrfs_block_group *bg, -		struct btrfs_free_cluster *last_ptr, -		struct find_free_extent_ctl *ffe_ctl, -		struct btrfs_block_group **cluster_bg_ret) +				      struct find_free_extent_ctl *ffe_ctl, +				      struct btrfs_block_group **cluster_bg_ret)  {  	struct btrfs_block_group *cluster_bg; +	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;  	u64 aligned_cluster;  	u64 offset;  	int ret; @@ -3605,9 +3591,9 @@ refill_cluster:   * Return -EAGAIN to inform caller that we need to re-search this block group   */  static int find_free_extent_unclustered(struct btrfs_block_group *bg, -		struct btrfs_free_cluster *last_ptr, -		struct find_free_extent_ctl *ffe_ctl) +					struct find_free_extent_ctl *ffe_ctl)  { +	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;  	u64 offset;  	/* @@ -3663,16 +3649,101 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg,  	return 0;  } +static int do_allocation_clustered(struct btrfs_block_group *block_group, +				   struct find_free_extent_ctl *ffe_ctl, +				   struct btrfs_block_group **bg_ret) +{ +	int ret; + +	/* We want to try and use the cluster allocator, so lets look there */ +	if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) { +		ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret); +		if (ret >= 0 || ret == -EAGAIN) +			return ret; +		/* ret == -ENOENT case falls through */ +	} + +	return find_free_extent_unclustered(block_group, ffe_ctl); +} + +static int do_allocation(struct btrfs_block_group *block_group, +			 struct find_free_extent_ctl *ffe_ctl, +			 struct btrfs_block_group **bg_ret) +{ +	switch (ffe_ctl->policy) { +	case BTRFS_EXTENT_ALLOC_CLUSTERED: +		return do_allocation_clustered(block_group, ffe_ctl, bg_ret); +	default: +		BUG(); +	} +} + +static void release_block_group(struct btrfs_block_group *block_group, +				struct find_free_extent_ctl *ffe_ctl, +				int delalloc) +{ +	switch (ffe_ctl->policy) { +	case BTRFS_EXTENT_ALLOC_CLUSTERED: +		ffe_ctl->retry_clustered = false; +		ffe_ctl->retry_unclustered = false; +		break; +	default: +		BUG(); +	} + +	BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != +	       ffe_ctl->index); +	btrfs_release_block_group(block_group, delalloc); +} + +static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl, +				   struct btrfs_key *ins) +{ +	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; + +	if (!ffe_ctl->use_cluster && last_ptr) { +		spin_lock(&last_ptr->lock); +		last_ptr->window_start = ins->objectid; +		spin_unlock(&last_ptr->lock); +	} +} + +static void found_extent(struct find_free_extent_ctl *ffe_ctl, +			 struct btrfs_key *ins) +{ +	switch (ffe_ctl->policy) { +	case BTRFS_EXTENT_ALLOC_CLUSTERED: +		found_extent_clustered(ffe_ctl, ins); +		break; +	default: +		BUG(); +	} +} + +static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) +{ +	switch (ffe_ctl->policy) { +	case BTRFS_EXTENT_ALLOC_CLUSTERED: +		/* +		 * If we can't allocate a new chunk we've already looped through +		 * at least once, move on to the NO_EMPTY_SIZE case. +		 */ +		ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; +		return 0; +	default: +		BUG(); +	} +} +  /*   * Return >0 means caller needs to re-search for free extent   * Return 0 means we have the needed free extent.   * Return <0 means we failed to locate any free extent.   
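The allocator refactor above routes behaviour through ffe_ctl->policy: do_allocation(), release_block_group(), found_extent() and chunk_allocation_failed() each switch on the policy and delegate to a *_clustered() helper. A compact, self-contained illustration of that dispatch idiom (hypothetical names, not the btrfs types):

#include <stdio.h>

/* Hypothetical policy enum mirroring btrfs_extent_allocation_policy. */
enum alloc_policy {
	ALLOC_CLUSTERED,
};

struct alloc_ctl {
	enum alloc_policy policy;
	unsigned long long num_bytes;
};

static int do_allocation_clustered(struct alloc_ctl *ctl)
{
	/* Policy-specific work would live here. */
	printf("clustered allocation of %llu bytes\n", ctl->num_bytes);
	return 0;
}

/* Generic entry point: one switch per hook keeps policies isolated. */
static int do_allocation(struct alloc_ctl *ctl)
{
	switch (ctl->policy) {
	case ALLOC_CLUSTERED:
		return do_allocation_clustered(ctl);
	}
	/* Unknown policy would be a programming error. */
	return -1;
}

int main(void)
{
	struct alloc_ctl ctl = { .policy = ALLOC_CLUSTERED, .num_bytes = 65536 };

	return do_allocation(&ctl);
}

Adding another policy later only needs a new enum value and new switch arms; the shared find_free_extent() loop itself stays unchanged.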
*/  static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, -					struct btrfs_free_cluster *last_ptr,  					struct btrfs_key *ins,  					struct find_free_extent_ctl *ffe_ctl, -					int full_search, bool use_cluster) +					bool full_search)  {  	struct btrfs_root *root = fs_info->extent_root;  	int ret; @@ -3689,11 +3760,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,  		return 1;  	if (ins->objectid) { -		if (!use_cluster && last_ptr) { -			spin_lock(&last_ptr->lock); -			last_ptr->window_start = ins->objectid; -			spin_unlock(&last_ptr->lock); -		} +		found_extent(ffe_ctl, ins);  		return 0;  	} @@ -3739,16 +3806,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,  			ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,  						CHUNK_ALLOC_FORCE); -			/* -			 * If we can't allocate a new chunk we've already looped -			 * through at least once, move on to the NO_EMPTY_SIZE -			 * case. -			 */ -			if (ret == -ENOSPC) -				ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; -  			/* Do not bail out on ENOSPC since we can do more. */ -			if (ret < 0 && ret != -ENOSPC) +			if (ret == -ENOSPC) +				ret = chunk_allocation_failed(ffe_ctl); +			else if (ret < 0)  				btrfs_abort_transaction(trans, ret);  			else  				ret = 0; @@ -3759,6 +3820,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,  		}  		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { +			if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED) +				return -ENOSPC; +  			/*  			 * Don't loop again if we already have no empty_size and  			 * no empty_cluster. @@ -3774,6 +3838,71 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,  	return -ENOSPC;  } +static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, +					struct find_free_extent_ctl *ffe_ctl, +					struct btrfs_space_info *space_info, +					struct btrfs_key *ins) +{ +	/* +	 * If our free space is heavily fragmented we may not be able to make +	 * big contiguous allocations, so instead of doing the expensive search +	 * for free space, simply return ENOSPC with our max_extent_size so we +	 * can go ahead and search for a more manageable chunk. +	 * +	 * If our max_extent_size is large enough for our allocation simply +	 * disable clustering since we will likely not be able to find enough +	 * space to create a cluster and induce latency trying. +	 */ +	if (space_info->max_extent_size) { +		spin_lock(&space_info->lock); +		if (space_info->max_extent_size && +		    ffe_ctl->num_bytes > space_info->max_extent_size) { +			ins->offset = space_info->max_extent_size; +			spin_unlock(&space_info->lock); +			return -ENOSPC; +		} else if (space_info->max_extent_size) { +			ffe_ctl->use_cluster = false; +		} +		spin_unlock(&space_info->lock); +	} + +	ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info, +					       &ffe_ctl->empty_cluster); +	if (ffe_ctl->last_ptr) { +		struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; + +		spin_lock(&last_ptr->lock); +		if (last_ptr->block_group) +			ffe_ctl->hint_byte = last_ptr->window_start; +		if (last_ptr->fragmented) { +			/* +			 * We still set window_start so we can keep track of the +			 * last place we found an allocation to try and save +			 * some time. 
+			 */ +			ffe_ctl->hint_byte = last_ptr->window_start; +			ffe_ctl->use_cluster = false; +		} +		spin_unlock(&last_ptr->lock); +	} + +	return 0; +} + +static int prepare_allocation(struct btrfs_fs_info *fs_info, +			      struct find_free_extent_ctl *ffe_ctl, +			      struct btrfs_space_info *space_info, +			      struct btrfs_key *ins) +{ +	switch (ffe_ctl->policy) { +	case BTRFS_EXTENT_ALLOC_CLUSTERED: +		return prepare_allocation_clustered(fs_info, ffe_ctl, +						    space_info, ins); +	default: +		BUG(); +	} +} +  /*   * walks the btree of allocated extents and find a hole of a given size.   * The key ins is changed to record the hole: @@ -3801,16 +3930,14 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,   */  static noinline int find_free_extent(struct btrfs_fs_info *fs_info,  				u64 ram_bytes, u64 num_bytes, u64 empty_size, -				u64 hint_byte, struct btrfs_key *ins, +				u64 hint_byte_orig, struct btrfs_key *ins,  				u64 flags, int delalloc)  {  	int ret = 0;  	int cache_block_group_error = 0; -	struct btrfs_free_cluster *last_ptr = NULL;  	struct btrfs_block_group *block_group = NULL;  	struct find_free_extent_ctl ffe_ctl = {0};  	struct btrfs_space_info *space_info; -	bool use_cluster = true;  	bool full_search = false;  	WARN_ON(num_bytes < fs_info->sectorsize); @@ -3819,13 +3946,19 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,  	ffe_ctl.empty_size = empty_size;  	ffe_ctl.flags = flags;  	ffe_ctl.search_start = 0; -	ffe_ctl.retry_clustered = false; -	ffe_ctl.retry_unclustered = false;  	ffe_ctl.delalloc = delalloc;  	ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);  	ffe_ctl.have_caching_bg = false;  	ffe_ctl.orig_have_caching_bg = false;  	ffe_ctl.found_offset = 0; +	ffe_ctl.hint_byte = hint_byte_orig; +	ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED; + +	/* For clustered allocation */ +	ffe_ctl.retry_clustered = false; +	ffe_ctl.retry_unclustered = false; +	ffe_ctl.last_ptr = NULL; +	ffe_ctl.use_cluster = true;  	ins->type = BTRFS_EXTENT_ITEM_KEY;  	ins->objectid = 0; @@ -3839,51 +3972,14 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,  		return -ENOSPC;  	} -	/* -	 * If our free space is heavily fragmented we may not be able to make -	 * big contiguous allocations, so instead of doing the expensive search -	 * for free space, simply return ENOSPC with our max_extent_size so we -	 * can go ahead and search for a more manageable chunk. -	 * -	 * If our max_extent_size is large enough for our allocation simply -	 * disable clustering since we will likely not be able to find enough -	 * space to create a cluster and induce latency trying. -	 */ -	if (unlikely(space_info->max_extent_size)) { -		spin_lock(&space_info->lock); -		if (space_info->max_extent_size && -		    num_bytes > space_info->max_extent_size) { -			ins->offset = space_info->max_extent_size; -			spin_unlock(&space_info->lock); -			return -ENOSPC; -		} else if (space_info->max_extent_size) { -			use_cluster = false; -		} -		spin_unlock(&space_info->lock); -	} - -	last_ptr = fetch_cluster_info(fs_info, space_info, -				      &ffe_ctl.empty_cluster); -	if (last_ptr) { -		spin_lock(&last_ptr->lock); -		if (last_ptr->block_group) -			hint_byte = last_ptr->window_start; -		if (last_ptr->fragmented) { -			/* -			 * We still set window_start so we can keep track of the -			 * last place we found an allocation to try and save -			 * some time. 
-			 */ -			hint_byte = last_ptr->window_start; -			use_cluster = false; -		} -		spin_unlock(&last_ptr->lock); -	} +	ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins); +	if (ret < 0) +		return ret;  	ffe_ctl.search_start = max(ffe_ctl.search_start,  				   first_logical_byte(fs_info, 0)); -	ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte); -	if (ffe_ctl.search_start == hint_byte) { +	ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte); +	if (ffe_ctl.search_start == ffe_ctl.hint_byte) {  		block_group = btrfs_lookup_block_group(fs_info,  						       ffe_ctl.search_start);  		/* @@ -3924,6 +4020,8 @@ search:  	down_read(&space_info->groups_sem);  	list_for_each_entry(block_group,  			    &space_info->block_groups[ffe_ctl.index], list) { +		struct btrfs_block_group *bg_ret; +  		/* If the block group is read-only, we can skip it entirely. */  		if (unlikely(block_group->ro))  			continue; @@ -3984,39 +4082,20 @@ have_block_group:  		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))  			goto loop; -		/* -		 * Ok we want to try and use the cluster allocator, so -		 * lets look there -		 */ -		if (last_ptr && use_cluster) { -			struct btrfs_block_group *cluster_bg = NULL; - -			ret = find_free_extent_clustered(block_group, last_ptr, -							 &ffe_ctl, &cluster_bg); - -			if (ret == 0) { -				if (cluster_bg && cluster_bg != block_group) { -					btrfs_release_block_group(block_group, -								  delalloc); -					block_group = cluster_bg; -				} -				goto checks; -			} else if (ret == -EAGAIN) { -				goto have_block_group; -			} else if (ret > 0) { -				goto loop; +		bg_ret = NULL; +		ret = do_allocation(block_group, &ffe_ctl, &bg_ret); +		if (ret == 0) { +			if (bg_ret && bg_ret != block_group) { +				btrfs_release_block_group(block_group, delalloc); +				block_group = bg_ret;  			} -			/* ret == -ENOENT case falls through */ -		} - -		ret = find_free_extent_unclustered(block_group, last_ptr, -						   &ffe_ctl); -		if (ret == -EAGAIN) +		} else if (ret == -EAGAIN) {  			goto have_block_group; -		else if (ret > 0) +		} else if (ret > 0) {  			goto loop; -		/* ret == 0 case falls through */ -checks: +		} + +		/* Checks */  		ffe_ctl.search_start = round_up(ffe_ctl.found_offset,  					     fs_info->stripesize); @@ -4050,17 +4129,12 @@ checks:  		btrfs_release_block_group(block_group, delalloc);  		break;  loop: -		ffe_ctl.retry_clustered = false; -		ffe_ctl.retry_unclustered = false; -		BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != -		       ffe_ctl.index); -		btrfs_release_block_group(block_group, delalloc); +		release_block_group(block_group, &ffe_ctl, delalloc);  		cond_resched();  	}  	up_read(&space_info->groups_sem); -	ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl, -					   full_search, use_cluster); +	ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search);  	if (ret > 0)  		goto search; @@ -4189,18 +4263,20 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,  	return 0;  } -int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, +			      u64 len)  {  	struct btrfs_block_group *cache;  	int ret = 0; -	cache = btrfs_lookup_block_group(fs_info, start); +	cache = btrfs_lookup_block_group(trans->fs_info, start);  	if (!cache) { -		btrfs_err(fs_info, "unable to find block group for %llu", start); +		btrfs_err(trans->fs_info, "unable to find block group for %llu", +			  start);  		return 
-ENOSPC;  	} -	ret = pin_down_extent(cache, start, len, 1); +	ret = pin_down_extent(trans, cache, start, len, 1);  	btrfs_put_block_group(cache);  	return ret;  } @@ -4431,7 +4507,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,  	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,  					 offset, ins, 1);  	if (ret) -		btrfs_pin_extent(fs_info, ins->objectid, ins->offset, 1); +		btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);  	btrfs_put_block_group(block_group);  	return ret;  } @@ -4750,8 +4826,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,  		BUG_ON(ret); /* -ENOMEM */  		ret = btrfs_dec_ref(trans, root, eb, 0);  		BUG_ON(ret); /* -ENOMEM */ -		ret = btrfs_set_disk_extent_flags(trans, eb->start, -						  eb->len, flag, +		ret = btrfs_set_disk_extent_flags(trans, eb, flag,  						  btrfs_header_level(eb), 0);  		BUG_ON(ret); /* -ENOMEM */  		wc->flags[level] |= flag; @@ -5209,9 +5284,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,   *   * If called with for_reloc == 0, may exit early with -EAGAIN   */ -int btrfs_drop_snapshot(struct btrfs_root *root, -			 struct btrfs_block_rsv *block_rsv, int update_ref, -			 int for_reloc) +int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)  {  	struct btrfs_fs_info *fs_info = root->fs_info;  	struct btrfs_path *path; @@ -5250,9 +5323,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,  	if (err)  		goto out_end_trans; -	if (block_rsv) -		trans->block_rsv = block_rsv; -  	/*  	 * This will help us catch people modifying the fs tree while we're  	 * dropping it.  It is unsafe to mess with the fs tree while it's being @@ -5380,8 +5450,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,  				err = PTR_ERR(trans);  				goto out_free;  			} -			if (block_rsv) -				trans->block_rsv = block_rsv;  		}  	}  	btrfs_release_path(path); @@ -5413,13 +5481,10 @@ int btrfs_drop_snapshot(struct btrfs_root *root,  		}  	} -	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { +	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))  		btrfs_add_dropped_root(trans, root); -	} else { -		free_extent_buffer(root->node); -		free_extent_buffer(root->commit_root); -		btrfs_put_fs_root(root); -	} +	else +		btrfs_put_root(root);  	root_dropped = true;  out_end_trans:  	btrfs_end_transaction_throttle(trans); @@ -5749,47 +5814,3 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)  		return bg_ret;  	return dev_ret;  } - -/* - * btrfs_{start,end}_write_no_snapshotting() are similar to - * mnt_{want,drop}_write(), they are used to prevent some tasks from writing - * data into the page cache through nocow before the subvolume is snapshoted, - * but flush the data into disk after the snapshot creation, or to prevent - * operations while snapshotting is ongoing and that cause the snapshot to be - * inconsistent (writes followed by expanding truncates for example). - */ -void btrfs_end_write_no_snapshotting(struct btrfs_root *root) -{ -	percpu_counter_dec(&root->subv_writers->counter); -	cond_wake_up(&root->subv_writers->wait); -} - -int btrfs_start_write_no_snapshotting(struct btrfs_root *root) -{ -	if (atomic_read(&root->will_be_snapshotted)) -		return 0; - -	percpu_counter_inc(&root->subv_writers->counter); -	/* -	 * Make sure counter is updated before we check for snapshot creation. 
-	 */ -	smp_mb(); -	if (atomic_read(&root->will_be_snapshotted)) { -		btrfs_end_write_no_snapshotting(root); -		return 0; -	} -	return 1; -} - -void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) -{ -	while (true) { -		int ret; - -		ret = btrfs_start_write_no_snapshotting(root); -		if (ret) -			break; -		wait_var_event(&root->will_be_snapshotted, -			       !atomic_read(&root->will_be_snapshotted)); -	} -} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c0f202741e09..39e45b8a5031 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -35,42 +35,54 @@ static inline bool extent_state_in_tree(const struct extent_state *state)  }  #ifdef CONFIG_BTRFS_DEBUG -static LIST_HEAD(buffers);  static LIST_HEAD(states); -  static DEFINE_SPINLOCK(leak_lock); -static inline -void btrfs_leak_debug_add(struct list_head *new, struct list_head *head) +static inline void btrfs_leak_debug_add(spinlock_t *lock, +					struct list_head *new, +					struct list_head *head)  {  	unsigned long flags; -	spin_lock_irqsave(&leak_lock, flags); +	spin_lock_irqsave(lock, flags);  	list_add(new, head); -	spin_unlock_irqrestore(&leak_lock, flags); +	spin_unlock_irqrestore(lock, flags);  } -static inline -void btrfs_leak_debug_del(struct list_head *entry) +static inline void btrfs_leak_debug_del(spinlock_t *lock, +					struct list_head *entry)  {  	unsigned long flags; -	spin_lock_irqsave(&leak_lock, flags); +	spin_lock_irqsave(lock, flags);  	list_del(entry); -	spin_unlock_irqrestore(&leak_lock, flags); +	spin_unlock_irqrestore(lock, flags);  } -static inline void btrfs_extent_buffer_leak_debug_check(void) +void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)  {  	struct extent_buffer *eb; +	unsigned long flags; -	while (!list_empty(&buffers)) { -		eb = list_entry(buffers.next, struct extent_buffer, leak_list); -		pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n", -		       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags); +	/* +	 * If we didn't get into open_ctree our allocated_ebs will not be +	 * initialized, so just skip this. 
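In the extent_io.c hunks above, extent-buffer leak tracking moves off the file-scope list and global leak_lock onto per-filesystem state (fs_info->allocated_ebs under fs_info->eb_leak_lock), and btrfs_extent_buffer_leak_debug_check() now walks a single filesystem's list. A standalone model of that bookkeeping pattern, using a userspace mutex and hypothetical names:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical per-context leak tracking, modelled on allocated_ebs. */
struct tracked_buf {
	unsigned long long start;
	struct tracked_buf *next;
};

struct fs_context {
	pthread_mutex_t leak_lock;
	struct tracked_buf *allocated;	/* every live buffer sits on this list */
};

static struct tracked_buf *alloc_buf(struct fs_context *fs,
				     unsigned long long start)
{
	struct tracked_buf *buf = malloc(sizeof(*buf));

	if (!buf)
		return NULL;
	buf->start = start;
	pthread_mutex_lock(&fs->leak_lock);
	buf->next = fs->allocated;
	fs->allocated = buf;
	pthread_mutex_unlock(&fs->leak_lock);
	return buf;
}

static void free_buf(struct fs_context *fs, struct tracked_buf *buf)
{
	struct tracked_buf **p;

	pthread_mutex_lock(&fs->leak_lock);
	for (p = &fs->allocated; *p; p = &(*p)->next) {
		if (*p == buf) {
			*p = buf->next;
			break;
		}
	}
	pthread_mutex_unlock(&fs->leak_lock);
	free(buf);
}

/* At teardown, anything still on the list was leaked. */
static void leak_check(struct fs_context *fs)
{
	struct tracked_buf *buf;

	for (buf = fs->allocated; buf; buf = buf->next)
		fprintf(stderr, "buffer leak start %llu\n", buf->start);
}

int main(void)
{
	struct fs_context fs;
	struct tracked_buf *a;

	pthread_mutex_init(&fs.leak_lock, NULL);
	fs.allocated = NULL;

	a = alloc_buf(&fs, 4096);
	alloc_buf(&fs, 8192);	/* deliberately never freed */
	free_buf(&fs, a);
	leak_check(&fs);	/* reports the 8192 buffer */
	return 0;
}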
+	 */ +	if (!fs_info->allocated_ebs.next) +		return; + +	spin_lock_irqsave(&fs_info->eb_leak_lock, flags); +	while (!list_empty(&fs_info->allocated_ebs)) { +		eb = list_first_entry(&fs_info->allocated_ebs, +				      struct extent_buffer, leak_list); +		pr_err( +	"BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n", +		       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags, +		       btrfs_header_owner(eb));  		list_del(&eb->leak_list);  		kmem_cache_free(extent_buffer_cache, eb);  	} +	spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);  }  static inline void btrfs_extent_state_leak_debug_check(void) @@ -107,9 +119,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,  	}  }  #else -#define btrfs_leak_debug_add(new, head)	do {} while (0) -#define btrfs_leak_debug_del(entry)	do {} while (0) -#define btrfs_extent_buffer_leak_debug_check()	do {} while (0) +#define btrfs_leak_debug_add(lock, new, head)	do {} while (0) +#define btrfs_leak_debug_del(lock, entry)	do {} while (0)  #define btrfs_extent_state_leak_debug_check()	do {} while (0)  #define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)  #endif @@ -122,7 +133,6 @@ struct tree_entry {  struct extent_page_data {  	struct bio *bio; -	struct extent_io_tree *tree;  	/* tells writepage not to lock the state bits for this range  	 * it still does the unlocking  	 */ @@ -246,8 +256,6 @@ void __cold extent_state_cache_exit(void)  void __cold extent_io_exit(void)  { -	btrfs_extent_buffer_leak_debug_check(); -  	/*  	 * Make sure all delayed rcu free are flushed before we  	 * destroy caches. @@ -257,6 +265,15 @@ void __cold extent_io_exit(void)  	bioset_exit(&btrfs_bioset);  } +/* + * For the file_extent_tree, we want to hold the inode lock when we lookup and + * update the disk_i_size, but lockdep will complain because our io_tree we hold + * the tree lock and get the inode lock when setting delalloc.  These two things + * are unrelated, so make a class for the file_extent_tree so we don't get the + * two locking patterns mixed up. + */ +static struct lock_class_key file_extent_tree_class; +  void extent_io_tree_init(struct btrfs_fs_info *fs_info,  			 struct extent_io_tree *tree, unsigned int owner,  			 void *private_data) @@ -268,6 +285,8 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info,  	spin_lock_init(&tree->lock);  	tree->private_data = private_data;  	tree->owner = owner; +	if (owner == IO_TREE_INODE_FILE_EXTENT) +		lockdep_set_class(&tree->lock, &file_extent_tree_class);  }  void extent_io_tree_release(struct extent_io_tree *tree) @@ -314,7 +333,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)  	state->state = 0;  	state->failrec = NULL;  	RB_CLEAR_NODE(&state->rb_node); -	btrfs_leak_debug_add(&state->leak_list, &states); +	btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);  	refcount_set(&state->refs, 1);  	init_waitqueue_head(&state->wq);  	trace_alloc_extent_state(state, mask, _RET_IP_); @@ -327,7 +346,7 @@ void free_extent_state(struct extent_state *state)  		return;  	if (refcount_dec_and_test(&state->refs)) {  		WARN_ON(extent_state_in_tree(state)); -		btrfs_leak_debug_del(&state->leak_list); +		btrfs_leak_debug_del(&leak_lock, &state->leak_list);  		trace_free_extent_state(state, _RET_IP_);  		kmem_cache_free(extent_state_cache, state);  	} @@ -1053,6 +1072,16 @@ hit_next:  			goto out;  		} +		/* +		 * If this extent already has all the bits we want set, then +		 * skip it, not necessary to split it or do anything with it. 
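The fast path added above returns early when a state already carries every requested bit, avoiding a pointless split/merge cycle; the test is the standard mask-subset check. A tiny worked example (the EXTENT_* values below are made up purely to show the arithmetic):

#include <stdio.h>

#define EXTENT_DIRTY	0x01u	/* hypothetical values for illustration */
#define EXTENT_UPTODATE	0x02u
#define EXTENT_LOCKED	0x04u

/* True when every bit in 'bits' is already present in 'state'. */
static int has_all_bits(unsigned int state, unsigned int bits)
{
	return (state & bits) == bits;
}

int main(void)
{
	unsigned int state = EXTENT_DIRTY | EXTENT_UPTODATE;

	printf("%d\n", has_all_bits(state, EXTENT_DIRTY));			/* 1: subset */
	printf("%d\n", has_all_bits(state, EXTENT_DIRTY | EXTENT_LOCKED));	/* 0: LOCKED missing */
	return 0;
}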
+		 */ +		if ((state->state & bits) == bits) { +			start = state->end + 1; +			cache_state(state, cached_state); +			goto search_again; +		} +  		prealloc = alloc_extent_state_atomic(prealloc);  		BUG_ON(!prealloc);  		err = split_state(tree, state, prealloc, start); @@ -1568,6 +1597,43 @@ out:  }  /** + * find_contiguous_extent_bit: find a contiguous area of bits + * @tree - io tree to check + * @start - offset to start the search from + * @start_ret - the first offset we found with the bits set + * @end_ret - the final contiguous range of the bits that were set + * @bits - bits to look for + * + * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges + * to set bits appropriately, and then merge them again.  During this time it + * will drop the tree->lock, so use this helper if you want to find the actual + * contiguous area for given bits.  We will search to the first bit we find, and + * then walk down the tree until we find a non-contiguous area.  The area + * returned will be the full contiguous area with the bits set. + */ +int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, +			       u64 *start_ret, u64 *end_ret, unsigned bits) +{ +	struct extent_state *state; +	int ret = 1; + +	spin_lock(&tree->lock); +	state = find_first_extent_bit_state(tree, start, bits); +	if (state) { +		*start_ret = state->start; +		*end_ret = state->end; +		while ((state = next_state(state)) != NULL) { +			if (state->start > (*end_ret + 1)) +				break; +			*end_ret = state->end; +		} +		ret = 0; +	} +	spin_unlock(&tree->lock); +	return ret; +} + +/**   * find_first_clear_extent_bit - find the first range that has @bits not set.   * This range could start before @start.   * @@ -2926,7 +2992,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)  /*   * @opf:	bio REQ_OP_* and REQ_* flags as one value - * @tree:	tree so we can call our merge_bio hook   * @wbc:	optional writeback control for io accounting   * @page:	page to add to the bio   * @pg_offset:	offset of the new bio or to check whether we are adding @@ -2939,7 +3004,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)   * @prev_bio_flags:  flags of previous bio to see if we can merge the current one   * @bio_flags:	flags of the current bio to see if we can merge them   */ -static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, +static int submit_extent_page(unsigned int opf,  			      struct writeback_control *wbc,  			      struct page *page, u64 offset,  			      size_t size, unsigned long pg_offset, @@ -2954,6 +3019,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,  	struct bio *bio;  	size_t page_size = min_t(size_t, size, PAGE_SIZE);  	sector_t sector = offset >> 9; +	struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;  	ASSERT(bio_ret); @@ -3062,8 +3128,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,   * XXX JDM: This needs looking at to ensure proper page locking   * return 0 on success, otherwise return error   */ -static int __do_readpage(struct extent_io_tree *tree, -			 struct page *page, +static int __do_readpage(struct page *page,  			 get_extent_t *get_extent,  			 struct extent_map **em_cached,  			 struct bio **bio, int mirror_num, @@ -3086,6 +3151,7 @@ static int __do_readpage(struct extent_io_tree *tree,  	size_t disk_io_size;  	size_t blocksize = inode->i_sb->s_blocksize;  	unsigned long this_bio_flag = 0; +	struct extent_io_tree *tree 
= &BTRFS_I(inode)->io_tree;  	set_page_extent_mapped(page); @@ -3242,7 +3308,7 @@ static int __do_readpage(struct extent_io_tree *tree,  			continue;  		} -		ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL, +		ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,  					 page, offset, disk_io_size,  					 pg_offset, bio,  					 end_bio_extent_readpage, mirror_num, @@ -3269,8 +3335,7 @@ out:  	return ret;  } -static inline void contiguous_readpages(struct extent_io_tree *tree, -					     struct page *pages[], int nr_pages, +static inline void contiguous_readpages(struct page *pages[], int nr_pages,  					     u64 start, u64 end,  					     struct extent_map **em_cached,  					     struct bio **bio, @@ -3280,17 +3345,16 @@ static inline void contiguous_readpages(struct extent_io_tree *tree,  	struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);  	int index; -	btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); +	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);  	for (index = 0; index < nr_pages; index++) { -		__do_readpage(tree, pages[index], btrfs_get_extent, em_cached, +		__do_readpage(pages[index], btrfs_get_extent, em_cached,  				bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);  		put_page(pages[index]);  	}  } -static int __extent_read_full_page(struct extent_io_tree *tree, -				   struct page *page, +static int __extent_read_full_page(struct page *page,  				   get_extent_t *get_extent,  				   struct bio **bio, int mirror_num,  				   unsigned long *bio_flags, @@ -3301,21 +3365,21 @@ static int __extent_read_full_page(struct extent_io_tree *tree,  	u64 end = start + PAGE_SIZE - 1;  	int ret; -	btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); +	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); -	ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, +	ret = __do_readpage(page, get_extent, NULL, bio, mirror_num,  			    bio_flags, read_flags, NULL);  	return ret;  } -int extent_read_full_page(struct extent_io_tree *tree, struct page *page, -			    get_extent_t *get_extent, int mirror_num) +int extent_read_full_page(struct page *page, get_extent_t *get_extent, +			  int mirror_num)  {  	struct bio *bio = NULL;  	unsigned long bio_flags = 0;  	int ret; -	ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, +	ret = __extent_read_full_page(page, get_extent, &bio, mirror_num,  				      &bio_flags, 0);  	if (bio)  		ret = submit_one_bio(bio, mirror_num, bio_flags); @@ -3423,7 +3487,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,  				 unsigned long nr_written,  				 int *nr_ret)  { -	struct extent_io_tree *tree = epd->tree; +	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;  	u64 start = page_offset(page);  	u64 page_end = start + PAGE_SIZE - 1;  	u64 end; @@ -3509,7 +3573,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,  			       page->index, cur, end);  		} -		ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, +		ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,  					 page, offset, iosize, pg_offset,  					 &epd->bio,  					 end_bio_extent_writepage, @@ -3830,8 +3894,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,  			struct writeback_control *wbc,  			struct extent_page_data *epd)  { -	struct btrfs_fs_info *fs_info = eb->fs_info; -	struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;  	u64 offset = eb->start;  	u32 nritems;  	int i, num_pages; @@ 
-3864,7 +3926,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,  		clear_page_dirty_for_io(p);  		set_page_writeback(p); -		ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, +		ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,  					 p, offset, PAGE_SIZE, 0,  					 &epd->bio,  					 end_bio_extent_buffer_writepage, @@ -3897,14 +3959,13 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,  int btree_write_cache_pages(struct address_space *mapping,  				   struct writeback_control *wbc)  { -	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;  	struct extent_buffer *eb, *prev_eb = NULL;  	struct extent_page_data epd = {  		.bio = NULL, -		.tree = tree,  		.extent_locked = 0,  		.sync_io = wbc->sync_mode == WB_SYNC_ALL,  	}; +	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;  	int ret = 0;  	int done = 0;  	int nr_to_write_done = 0; @@ -4018,7 +4079,39 @@ retry:  		end_write_bio(&epd, ret);  		return ret;  	} -	ret = flush_write_bio(&epd); +	/* +	 * If something went wrong, don't allow any metadata write bio to be +	 * submitted. +	 * +	 * This would prevent use-after-free if we had dirty pages not +	 * cleaned up, which can still happen by fuzzed images. +	 * +	 * - Bad extent tree +	 *   Allowing existing tree block to be allocated for other trees. +	 * +	 * - Log tree operations +	 *   Exiting tree blocks get allocated to log tree, bumps its +	 *   generation, then get cleaned in tree re-balance. +	 *   Such tree block will not be written back, since it's clean, +	 *   thus no WRITTEN flag set. +	 *   And after log writes back, this tree block is not traced by +	 *   any dirty extent_io_tree. +	 * +	 * - Offending tree block gets re-dirtied from its original owner +	 *   Since it has bumped generation, no WRITTEN flag, it can be +	 *   reused without COWing. This tree block will not be traced +	 *   by btrfs_transaction::dirty_pages. +	 * +	 *   Now such dirty tree block will not be cleaned by any dirty +	 *   extent io tree. Thus we don't want to submit such wild eb +	 *   if the fs already has error. 
+	 */ +	if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { +		ret = flush_write_bio(&epd); +	} else { +		ret = -EUCLEAN; +		end_write_bio(&epd, ret); +	}  	return ret;  } @@ -4190,7 +4283,6 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc)  	int ret;  	struct extent_page_data epd = {  		.bio = NULL, -		.tree = &BTRFS_I(page->mapping->host)->io_tree,  		.extent_locked = 0,  		.sync_io = wbc->sync_mode == WB_SYNC_ALL,  	}; @@ -4212,14 +4304,12 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,  {  	int ret = 0;  	struct address_space *mapping = inode->i_mapping; -	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;  	struct page *page;  	unsigned long nr_pages = (end - start + PAGE_SIZE) >>  		PAGE_SHIFT;  	struct extent_page_data epd = {  		.bio = NULL, -		.tree = tree,  		.extent_locked = 1,  		.sync_io = mode == WB_SYNC_ALL,  	}; @@ -4263,7 +4353,6 @@ int extent_writepages(struct address_space *mapping,  	int ret = 0;  	struct extent_page_data epd = {  		.bio = NULL, -		.tree = &BTRFS_I(mapping->host)->io_tree,  		.extent_locked = 0,  		.sync_io = wbc->sync_mode == WB_SYNC_ALL,  	}; @@ -4285,7 +4374,6 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages,  	unsigned long bio_flags = 0;  	struct page *pagepool[16];  	struct extent_map *em_cached = NULL; -	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;  	int nr = 0;  	u64 prev_em_start = (u64)-1; @@ -4312,7 +4400,7 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages,  			ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); -			contiguous_readpages(tree, pagepool, nr, contig_start, +			contiguous_readpages(pagepool, nr, contig_start,  				     contig_end, &em_cached, &bio, &bio_flags,  				     &prev_em_start);  		} @@ -4796,7 +4884,6 @@ out_free_ulist:  static void __free_extent_buffer(struct extent_buffer *eb)  { -	btrfs_leak_debug_del(&eb->leak_list);  	kmem_cache_free(extent_buffer_cache, eb);  } @@ -4862,6 +4949,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)  static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)  {  	btrfs_release_extent_buffer_pages(eb); +	btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);  	__free_extent_buffer(eb);  } @@ -4883,7 +4971,8 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,  	init_waitqueue_head(&eb->write_lock_wq);  	init_waitqueue_head(&eb->read_lock_wq); -	btrfs_leak_debug_add(&eb->leak_list, &buffers); +	btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, +			     &fs_info->allocated_ebs);  	spin_lock_init(&eb->refs_lock);  	atomic_set(&eb->refs, 1); @@ -5230,6 +5319,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)  }  static int release_extent_buffer(struct extent_buffer *eb) +	__releases(&eb->refs_lock)  {  	lockdep_assert_held(&eb->refs_lock); @@ -5248,6 +5338,7 @@ static int release_extent_buffer(struct extent_buffer *eb)  			spin_unlock(&eb->refs_lock);  		} +		btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);  		/* Should be safe to release our pages at this point */  		btrfs_release_extent_buffer_pages(eb);  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -5405,7 +5496,6 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)  	unsigned long num_reads = 0;  	struct bio *bio = NULL;  	unsigned long bio_flags = 0; -	struct extent_io_tree *tree = &BTRFS_I(eb->fs_info->btree_inode)->io_tree;  	if 
(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))  		return 0; @@ -5453,7 +5543,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)  			}  			ClearPageError(page); -			err = __extent_read_full_page(tree, page, +			err = __extent_read_full_page(page,  						      btree_get_extent, &bio,  						      mirror_num, &bio_flags,  						      REQ_META); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5d205bbaafdc..2ed65bd0760e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -189,8 +189,8 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,  int try_release_extent_mapping(struct page *page, gfp_t mask);  int try_release_extent_buffer(struct page *page); -int extent_read_full_page(struct extent_io_tree *tree, struct page *page, -			  get_extent_t *get_extent, int mirror_num); +int extent_read_full_page(struct page *page, get_extent_t *get_extent, +			  int mirror_num);  int extent_write_full_page(struct page *page, struct writeback_control *wbc);  int extent_write_locked_range(struct inode *inode, u64 start, u64 end,  			      int mode); @@ -325,4 +325,11 @@ bool find_lock_delalloc_range(struct inode *inode,  #endif  struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,  					       u64 start); + +#ifdef CONFIG_BTRFS_DEBUG +void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info); +#else +#define btrfs_extent_buffer_leak_debug_check(fs_info)	do {} while (0) +#endif +  #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index c2f365662d55..b618ad5339ba 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -23,6 +23,97 @@  #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \  				       PAGE_SIZE)) +/** + * @inode - the inode we want to update the disk_i_size for + * @new_i_size - the i_size we want to set to, 0 if we use i_size + * + * With NO_HOLES set this simply sets the disk_is_size to whatever i_size_read() + * returns as it is perfectly fine with a file that has holes without hole file + * extent items. + * + * However without NO_HOLES we need to only return the area that is contiguous + * from the 0 offset of the file.  Otherwise we could end up adjust i_size up + * to an extent that has a gap in between. + * + * Finally new_i_size should only be set in the case of truncate where we're not + * ready to use i_size_read() as the limiter yet. + */ +void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size) +{ +	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; +	u64 start, end, i_size; +	int ret; + +	i_size = new_i_size ?: i_size_read(inode); +	if (btrfs_fs_incompat(fs_info, NO_HOLES)) { +		BTRFS_I(inode)->disk_i_size = i_size; +		return; +	} + +	spin_lock(&BTRFS_I(inode)->lock); +	ret = find_contiguous_extent_bit(&BTRFS_I(inode)->file_extent_tree, 0, +					 &start, &end, EXTENT_DIRTY); +	if (!ret && start == 0) +		i_size = min(i_size, end + 1); +	else +		i_size = 0; +	BTRFS_I(inode)->disk_i_size = i_size; +	spin_unlock(&BTRFS_I(inode)->lock); +} + +/** + * @inode - the inode we're modifying + * @start - the start file offset of the file extent we've inserted + * @len - the logical length of the file extent item + * + * Call when we are inserting a new file extent where there was none before. + * Does not need to call this in the case where we're replacing an existing file + * extent, however if not sure it's fine to call this multiple times. 
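To make btrfs_inode_safe_disk_i_size_write() above concrete: without NO_HOLES, disk_i_size may only advance over the run of recorded file extents that is contiguous from offset 0. With a 4096-byte sectorsize, extents at [0, 8192) and [16384, 20480) and an i_size of 20480, the contiguous run is [0, 8192), so disk_i_size becomes 8192. A minimal userspace model of that clamp, with a sorted range array standing in for the EXTENT_DIRTY bits in file_extent_tree (illustrative names only):

#include <stdio.h>

struct range {
	unsigned long long start;
	unsigned long long end;	/* inclusive, like the io-tree state ranges */
};

/*
 * Return the new disk_i_size: i_size limited to the run of ranges that is
 * contiguous from offset 0. 'ranges' must be sorted and non-overlapping.
 */
static unsigned long long safe_disk_i_size(const struct range *ranges, int nr,
					   unsigned long long i_size)
{
	unsigned long long end;
	int i;

	if (nr == 0 || ranges[0].start != 0)
		return 0;	/* nothing contiguous from offset 0 */

	end = ranges[0].end;
	for (i = 1; i < nr; i++) {
		if (ranges[i].start > end + 1)
			break;	/* hole: stop extending the contiguous run */
		end = ranges[i].end;
	}
	return i_size < end + 1 ? i_size : end + 1;
}

int main(void)
{
	/* [0, 8192) and [16384, 20480) expressed with inclusive end offsets */
	struct range ranges[] = { { 0, 8191 }, { 16384, 20479 } };

	printf("%llu\n", safe_disk_i_size(ranges, 2, 20480));	/* prints 8192 */
	return 0;
}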
+ * + * The start and len must match the file extent item, and thus must be sectorsize + * aligned. + */ +int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, +				      u64 len) +{ +	if (len == 0) +		return 0; + +	ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize)); + +	if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) +		return 0; +	return set_extent_bits(&inode->file_extent_tree, start, start + len - 1, +			       EXTENT_DIRTY); +} + +/** + * @inode - the inode we're modifying + * @start - the start file offset of the file extent we've inserted + * @len - the logical length of the file extent item + * + * Called when we drop a file extent, for example when we truncate.  Doesn't + * need to be called for cases where we're replacing a file extent, like when + * we've COWed a file extent. + * + * The start and len must match the file extent item, and thus must be sectorsize + * aligned. + */ +int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, +					u64 len) +{ +	if (len == 0) +		return 0; + +	ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) || +	       len == (u64)-1); + +	if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) +		return 0; +	return clear_extent_bit(&inode->file_extent_tree, start, +				start + len - 1, EXTENT_DIRTY, 0, 0, NULL); +} +  static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,  					u16 csum_size)  { @@ -949,18 +1040,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,  	btrfs_item_key_to_cpu(leaf, &key, slot);  	extent_start = key.offset; - -	if (type == BTRFS_FILE_EXTENT_REG || -	    type == BTRFS_FILE_EXTENT_PREALLOC) { -		extent_end = extent_start + -			btrfs_file_extent_num_bytes(leaf, fi); -	} else if (type == BTRFS_FILE_EXTENT_INLINE) { -		size_t size; -		size = btrfs_file_extent_ram_bytes(leaf, fi); -		extent_end = ALIGN(extent_start + size, -				   fs_info->sectorsize); -	} - +	extent_end = btrfs_file_extent_end(path);  	em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);  	if (type == BTRFS_FILE_EXTENT_REG ||  	    type == BTRFS_FILE_EXTENT_PREALLOC) { @@ -1007,3 +1087,30 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,  			  root->root_key.objectid);  	}  } + +/* + * Returns the end offset (non-inclusive) of the file extent item the given path + * points to. If it points to an inline extent, the returned offset is rounded + * up to the sector size.
+ */ +u64 btrfs_file_extent_end(const struct btrfs_path *path) +{ +	const struct extent_buffer *leaf = path->nodes[0]; +	const int slot = path->slots[0]; +	struct btrfs_file_extent_item *fi; +	struct btrfs_key key; +	u64 end; + +	btrfs_item_key_to_cpu(leaf, &key, slot); +	ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); +	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + +	if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { +		end = btrfs_file_extent_ram_bytes(leaf, fi); +		end = ALIGN(key.offset + end, leaf->fs_info->sectorsize); +	} else { +		end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); +	} + +	return end; +} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a16da274c9aa..8a144f9cb7ac 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -27,6 +27,7 @@  #include "qgroup.h"  #include "compression.h"  #include "delalloc-space.h" +#include "reflink.h"  static struct kmem_cache *btrfs_inode_defrag_cachep;  /* @@ -277,7 +278,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,  	struct btrfs_key key;  	struct btrfs_ioctl_defrag_range_args range;  	int num_defrag; -	int index;  	int ret;  	/* get the inode */ @@ -285,9 +285,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,  	key.type = BTRFS_ROOT_ITEM_KEY;  	key.offset = (u64)-1; -	index = srcu_read_lock(&fs_info->subvol_srcu); - -	inode_root = btrfs_read_fs_root_no_name(fs_info, &key); +	inode_root = btrfs_get_fs_root(fs_info, &key, true);  	if (IS_ERR(inode_root)) {  		ret = PTR_ERR(inode_root);  		goto cleanup; @@ -297,11 +295,11 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,  	key.type = BTRFS_INODE_ITEM_KEY;  	key.offset = 0;  	inode = btrfs_iget(fs_info->sb, &key, inode_root); +	btrfs_put_root(inode_root);  	if (IS_ERR(inode)) {  		ret = PTR_ERR(inode);  		goto cleanup;  	} -	srcu_read_unlock(&fs_info->subvol_srcu, index);  	/* do a chunk of defrag */  	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); @@ -337,7 +335,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,  	iput(inode);  	return 0;  cleanup: -	srcu_read_unlock(&fs_info->subvol_srcu, index);  	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);  	return ret;  } @@ -1552,15 +1549,14 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,  	u64 num_bytes;  	int ret; -	ret = btrfs_start_write_no_snapshotting(root); -	if (!ret) +	if (!btrfs_drew_try_write_lock(&root->snapshot_lock))  		return -EAGAIN;  	lockstart = round_down(pos, fs_info->sectorsize);  	lockend = round_up(pos + *write_bytes,  			   fs_info->sectorsize) - 1; -	btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart, +	btrfs_lock_and_flush_ordered_range(inode, lockstart,  					   lockend, NULL);  	num_bytes = lockend - lockstart + 1; @@ -1568,7 +1564,7 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,  			NULL, NULL, NULL);  	if (ret <= 0) {  		ret = 0; -		btrfs_end_write_no_snapshotting(root); +		btrfs_drew_write_unlock(&root->snapshot_lock);  	} else {  		*write_bytes = min_t(size_t, *write_bytes ,  				     num_bytes - pos + lockstart); @@ -1674,7 +1670,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,  						data_reserved, pos,  						write_bytes);  			else -				btrfs_end_write_no_snapshotting(root); +				btrfs_drew_write_unlock(&root->snapshot_lock);  			break;  		} @@ -1778,7 +1774,7 @@ again:  		release_bytes = 0;  		if (only_release_metadata) -			btrfs_end_write_no_snapshotting(root); 
+			btrfs_drew_write_unlock(&root->snapshot_lock);  		if (only_release_metadata && copied > 0) {  			lockstart = round_down(pos, @@ -1807,7 +1803,7 @@ again:  	if (release_bytes) {  		if (only_release_metadata) { -			btrfs_end_write_no_snapshotting(root); +			btrfs_drew_write_unlock(&root->snapshot_lock);  			btrfs_delalloc_release_metadata(BTRFS_I(inode),  					release_bytes, true);  		} else { @@ -2071,6 +2067,16 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  	btrfs_init_log_ctx(&ctx, inode);  	/* +	 * Set the range to full if the NO_HOLES feature is not enabled. +	 * This is to avoid missing file extent items representing holes after +	 * replaying the log. +	 */ +	if (!btrfs_fs_incompat(fs_info, NO_HOLES)) { +		start = 0; +		end = LLONG_MAX; +	} + +	/*  	 * We write the dirty pages in the range and wait until they complete  	 * out of the ->i_mutex. If so, we can flush the dirty pages by  	 * multi-task, and make the performance up.  See @@ -2092,19 +2098,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  	atomic_inc(&root->log_batch);  	/* -	 * If the inode needs a full sync, make sure we use a full range to -	 * avoid log tree corruption, due to hole detection racing with ordered -	 * extent completion for adjacent ranges, and assertion failures during -	 * hole detection. Do this while holding the inode lock, to avoid races -	 * with other tasks. -	 */ -	if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, -		     &BTRFS_I(inode)->runtime_flags)) { -		start = 0; -		end = LLONG_MAX; -	} - -	/*  	 * Before we acquired the inode's lock, someone may have dirtied more  	 * pages in the target range. We need to make sure that writeback for  	 * any such pages does not start while we are logging the inode, because @@ -2124,6 +2117,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  	 */  	ret = start_ordered_ops(inode, start, end);  	if (ret) { +		up_write(&BTRFS_I(inode)->dio_sem);  		inode_unlock(inode);  		goto out;  	} @@ -2486,6 +2480,11 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,  	btrfs_mark_buffer_dirty(leaf);  	btrfs_release_path(path); +	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), +			clone_info->file_offset, clone_len); +	if (ret) +		return ret; +  	/* If it's a hole, nothing more needs to be done. */  	if (clone_info->disk_offset == 0)  		return 0; @@ -2596,6 +2595,24 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,  				btrfs_abort_transaction(trans, ret);  				break;  			} +		} else if (!clone_info && cur_offset < drop_end) { +			/* +			 * We are past the i_size here, but since we didn't +			 * insert holes we need to clear the mapped area so we +			 * know to not set disk_i_size in this area until a new +			 * file extent is inserted here. +			 */ +			ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), +					cur_offset, drop_end - cur_offset); +			if (ret) { +				/* +				 * We couldn't clear our area, so we could +				 * presumably adjust up and corrupt the fs, so +				 * we need to abort. +				 */ +				btrfs_abort_transaction(trans, ret); +				break; +			}  		}  		if (clone_info && drop_end > clone_info->file_offset) { @@ -2686,6 +2703,15 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,  			btrfs_abort_transaction(trans, ret);  			goto out_trans;  		} +	} else if (!clone_info && cur_offset < drop_end) { +		/* See the comment in the loop above for the reasoning here. 
*/ +		ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), +					cur_offset, drop_end - cur_offset); +		if (ret) { +			btrfs_abort_transaction(trans, ret); +			goto out_trans; +		} +  	}  	if (clone_info) {  		ret = btrfs_insert_clone_extent(trans, inode, path, clone_info, @@ -2935,7 +2961,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode,  	inode->i_ctime = current_time(inode);  	i_size_write(inode, end); -	btrfs_ordered_update_i_size(inode, end, NULL); +	btrfs_inode_safe_disk_i_size_write(inode, 0);  	ret = btrfs_update_inode(trans, root, inode);  	ret2 = btrfs_end_transaction(trans); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0598fd3c6e3f..3613da065a73 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -371,10 +371,10 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)  	}  } -static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode, -				int uptodate) +static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)  {  	struct page *page; +	struct inode *inode = io_ctl->inode;  	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);  	int i; @@ -732,7 +732,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,  	readahead_cache(inode); -	ret = io_ctl_prepare_pages(&io_ctl, inode, 1); +	ret = io_ctl_prepare_pages(&io_ctl, true);  	if (ret)  		goto out; @@ -1067,6 +1067,7 @@ fail:  }  static noinline_for_stack int write_pinned_extent_entries( +			    struct btrfs_trans_handle *trans,  			    struct btrfs_block_group *block_group,  			    struct btrfs_io_ctl *io_ctl,  			    int *entries) @@ -1085,7 +1086,7 @@ static noinline_for_stack int write_pinned_extent_entries(  	 * We shouldn't have switched the pinned extents yet so this is the  	 * right one  	 */ -	unpin = block_group->fs_info->pinned_extents; +	unpin = &trans->transaction->pinned_extents;  	start = block_group->start; @@ -1190,7 +1191,7 @@ out:  		invalidate_inode_pages2(inode->i_mapping);  		BTRFS_I(inode)->generation = 0;  		if (block_group) { -#ifdef DEBUG +#ifdef CONFIG_BTRFS_DEBUG  			btrfs_err(root->fs_info,  				  "failed to write free space cache for block group %llu",  				  block_group->start); @@ -1291,7 +1292,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  	}  	/* Lock all pages first so we can lock the extent safely. */ -	ret = io_ctl_prepare_pages(io_ctl, inode, 0); +	ret = io_ctl_prepare_pages(io_ctl, false);  	if (ret)  		goto out_unlock; @@ -1317,7 +1318,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  	 * If this changes while we are working we'll get added back to  	 * the dirty list and redo it.  
No locking needed  	 */ -	ret = write_pinned_extent_entries(block_group, io_ctl, &entries); +	ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);  	if (ret)  		goto out_nospc_locked; @@ -1366,18 +1367,6 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,  	return 0; -out: -	io_ctl->inode = NULL; -	io_ctl_free(io_ctl); -	if (ret) { -		invalidate_inode_pages2(inode->i_mapping); -		BTRFS_I(inode)->generation = 0; -	} -	btrfs_update_inode(trans, root, inode); -	if (must_iput) -		iput(inode); -	return ret; -  out_nospc_locked:  	cleanup_bitmap_list(&bitmap_list);  	spin_unlock(&ctl->tree_lock); @@ -1390,7 +1379,17 @@ out_unlock:  	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))  		up_write(&block_group->data_rwsem); -	goto out; +out: +	io_ctl->inode = NULL; +	io_ctl_free(io_ctl); +	if (ret) { +		invalidate_inode_pages2(inode->i_mapping); +		BTRFS_I(inode)->generation = 0; +	} +	btrfs_update_inode(trans, root, inode); +	if (must_iput) +		iput(inode); +	return ret;  }  int btrfs_write_out_cache(struct btrfs_trans_handle *trans, @@ -1416,7 +1415,7 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,  	ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,  				block_group, &block_group->io_ctl, trans);  	if (ret) { -#ifdef DEBUG +#ifdef CONFIG_BTRFS_DEBUG  		btrfs_err(fs_info,  			  "failed to write free space cache for block group %llu",  			  block_group->start); @@ -4036,7 +4035,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,  		if (release_metadata)  			btrfs_delalloc_release_metadata(BTRFS_I(inode),  					inode->i_size, true); -#ifdef DEBUG +#ifdef CONFIG_BTRFS_DEBUG  		btrfs_err(fs_info,  			  "failed to write free ino cache for root %llu",  			  root->root_key.objectid); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 258cb3fae17a..8b1f5c8897b7 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1251,9 +1251,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)  	btrfs_free_tree_block(trans, free_space_root, free_space_root->node,  			      0, 1); -	free_extent_buffer(free_space_root->node); -	free_extent_buffer(free_space_root->commit_root); -	kfree(free_space_root); +	btrfs_put_root(free_space_root);  	return btrfs_commit_transaction(trans); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index d5c9c69d8263..6009e0e939b5 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -515,7 +515,7 @@ out_release:  	trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,  				      trans->bytes_reserved, 0);  	btrfs_block_rsv_release(fs_info, trans->block_rsv, -				trans->bytes_reserved); +				trans->bytes_reserved, NULL);  out:  	trans->block_rsv = rsv;  	trans->bytes_reserved = num_bytes; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d267eb5caa7b..320d1062068d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -28,6 +28,7 @@  #include <linux/magic.h>  #include <linux/iversion.h>  #include <linux/swap.h> +#include <linux/migrate.h>  #include <linux/sched/mm.h>  #include <asm/unaligned.h>  #include "misc.h" @@ -242,6 +243,15 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,  	btrfs_release_path(path);  	/* +	 * We align size to sectorsize for inline extents just for simplicity +	 * sake. 
+	 */ +	size = ALIGN(size, root->fs_info->sectorsize); +	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size); +	if (ret) +		goto fail; + +	/*  	 * we're an inline extent, so nobody can  	 * extend the file past i_size without locking  	 * a page we already have locked. @@ -2446,6 +2456,11 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,  	ins.offset = disk_num_bytes;  	ins.type = BTRFS_EXTENT_ITEM_KEY; +	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), file_pos, +						ram_bytes); +	if (ret) +		goto out; +  	/*  	 * Release the reserved range from inode dirty range map, as it is  	 * already moved into delayed_ref_head @@ -2536,7 +2551,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)  		 */  		btrfs_qgroup_free_data(inode, NULL, start,  				       ordered_extent->num_bytes); -		btrfs_ordered_update_i_size(inode, 0, ordered_extent); +		btrfs_inode_safe_disk_i_size_write(inode, 0);  		if (freespace_inode)  			trans = btrfs_join_transaction_spacecache(root);  		else @@ -2607,7 +2622,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)  		goto out;  	} -	btrfs_ordered_update_i_size(inode, 0, ordered_extent); +	btrfs_inode_safe_disk_i_size_write(inode, 0);  	ret = btrfs_update_inode_fallback(trans, root, inode);  	if (ret) { /* -ENOMEM or corruption */  		btrfs_abort_transaction(trans, ret); @@ -3187,6 +3202,8 @@ static int btrfs_read_locked_inode(struct inode *inode,  	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));  	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));  	btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); +	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, +			round_up(i_size_read(inode), fs_info->sectorsize));  	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);  	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); @@ -4158,6 +4175,8 @@ search_again:  	}  	while (1) { +		u64 clear_start = 0, clear_len = 0; +  		fi = NULL;  		leaf = path->nodes[0];  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); @@ -4208,6 +4227,8 @@ search_again:  		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {  			u64 num_dec; + +			clear_start = found_key.offset;  			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);  			if (!del_item) {  				u64 orig_num_bytes = @@ -4215,6 +4236,7 @@ search_again:  				extent_num_bytes = ALIGN(new_size -  						found_key.offset,  						fs_info->sectorsize); +				clear_start = ALIGN(new_size, fs_info->sectorsize);  				btrfs_set_file_extent_num_bytes(leaf, fi,  							 extent_num_bytes);  				num_dec = (orig_num_bytes - @@ -4240,6 +4262,7 @@ search_again:  						inode_sub_bytes(inode, num_dec);  				}  			} +			clear_len = num_dec;  		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {  			/*  			 * we can't truncate inline items that have had @@ -4261,12 +4284,33 @@ search_again:  				 */  				ret = NEED_TRUNCATE_BLOCK;  				break; +			} else { +				/* +				 * Inline extents are special, we just treat +				 * them as a full sector worth in the file +				 * extent tree just for simplicity sake. +				 */ +				clear_len = fs_info->sectorsize;  			}  			if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))  				inode_sub_bytes(inode, item_end + 1 - new_size);  		}  delete: +		/* +		 * We use btrfs_truncate_inode_items() to clean up log trees for +		 * multiple fsyncs, and in this case we don't want to clear the +		 * file extent range because it's just the log. 
+		 */ +		if (root == BTRFS_I(inode)->root) { +			ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), +						  clear_start, clear_len); +			if (ret) { +				btrfs_abort_transaction(trans, ret); +				break; +			} +		} +  		if (del_item)  			last_size = found_key.offset;  		else @@ -4368,7 +4412,7 @@ out:  		ASSERT(last_size >= new_size);  		if (!ret && last_size > new_size)  			last_size = new_size; -		btrfs_ordered_update_i_size(inode, last_size, NULL); +		btrfs_inode_safe_disk_i_size_write(inode, last_size);  		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,  				     (u64)-1, &cached_state);  	} @@ -4576,7 +4620,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)  	if (size <= hole_start)  		return 0; -	btrfs_lock_and_flush_ordered_range(io_tree, BTRFS_I(inode), hole_start, +	btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), hole_start,  					   block_end - 1, &cached_state);  	cur_offset = hole_start;  	while (1) { @@ -4589,14 +4633,21 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)  		}  		last_byte = min(extent_map_end(em), block_end);  		last_byte = ALIGN(last_byte, fs_info->sectorsize); +		hole_size = last_byte - cur_offset; +  		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {  			struct extent_map *hole_em; -			hole_size = last_byte - cur_offset;  			err = maybe_insert_hole(root, inode, cur_offset,  						hole_size);  			if (err)  				break; + +			err = btrfs_inode_set_file_extent_range(BTRFS_I(inode), +							cur_offset, hole_size); +			if (err) +				break; +  			btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,  						cur_offset + hole_size - 1, 0);  			hole_em = alloc_extent_map(); @@ -4628,6 +4679,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)  							hole_size - 1, 0);  			}  			free_extent_map(hole_em); +		} else { +			err = btrfs_inode_set_file_extent_range(BTRFS_I(inode), +							cur_offset, hole_size); +			if (err) +				break;  		}  next:  		free_extent_map(em); @@ -4671,24 +4727,24 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)  		 * truncation, it must capture all writes that happened before  		 * this truncation.  		 
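/*
 * [Editor's note] A hedged sketch, not part of the patch, of the new
 * snapshot_lock pairing used throughout this diff: NOCOW writers take the
 * write side (as in check_can_nocow() and btrfs_setsize()) while snapshot
 * creation takes the read side (as in create_snapshot()). The example_
 * function names are hypothetical.
 */
static int example_nocow_writer(struct btrfs_root *root)
{
	/* Bail out instead of blocking if a snapshot is being created. */
	if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
		return -EAGAIN;

	/* ... perform the NOCOW checks and the write ... */

	btrfs_drew_write_unlock(&root->snapshot_lock);
	return 0;
}

static void example_snapshot_creator(struct btrfs_root *root)
{
	/* Excludes NOCOW writers for the duration of snapshot creation. */
	btrfs_drew_read_lock(&root->snapshot_lock);

	/* ... flush delalloc and commit the snapshot ... */

	btrfs_drew_read_unlock(&root->snapshot_lock);
}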
*/ -		btrfs_wait_for_snapshot_creation(root); +		btrfs_drew_write_lock(&root->snapshot_lock);  		ret = btrfs_cont_expand(inode, oldsize, newsize);  		if (ret) { -			btrfs_end_write_no_snapshotting(root); +			btrfs_drew_write_unlock(&root->snapshot_lock);  			return ret;  		}  		trans = btrfs_start_transaction(root, 1);  		if (IS_ERR(trans)) { -			btrfs_end_write_no_snapshotting(root); +			btrfs_drew_write_unlock(&root->snapshot_lock);  			return PTR_ERR(trans);  		}  		i_size_write(inode, newsize); -		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); +		btrfs_inode_safe_disk_i_size_write(inode, 0);  		pagecache_isize_extended(inode, oldsize, newsize);  		ret = btrfs_update_inode(trans, root, inode); -		btrfs_end_write_no_snapshotting(root); +		btrfs_drew_write_unlock(&root->snapshot_lock);  		btrfs_end_transaction(trans);  	} else { @@ -5098,7 +5154,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,  	btrfs_release_path(path); -	new_root = btrfs_read_fs_root_no_name(fs_info, location); +	new_root = btrfs_get_fs_root(fs_info, location, true);  	if (IS_ERR(new_root)) {  		err = PTR_ERR(new_root);  		goto out; @@ -5179,7 +5235,8 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)  	inode->i_ino = args->location->objectid;  	memcpy(&BTRFS_I(inode)->location, args->location,  	       sizeof(*args->location)); -	BTRFS_I(inode)->root = args->root; +	BTRFS_I(inode)->root = btrfs_grab_root(args->root); +	BUG_ON(args->root && !BTRFS_I(inode)->root);  	return 0;  } @@ -5260,7 +5317,7 @@ static struct inode *new_simple_dir(struct super_block *s,  	if (!inode)  		return ERR_PTR(-ENOMEM); -	BTRFS_I(inode)->root = root; +	BTRFS_I(inode)->root = btrfs_grab_root(root);  	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));  	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); @@ -5307,7 +5364,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)  	struct btrfs_root *sub_root = root;  	struct btrfs_key location;  	u8 di_type = 0; -	int index;  	int ret = 0;  	if (dentry->d_name.len > BTRFS_NAME_LEN) @@ -5334,7 +5390,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)  		return inode;  	} -	index = srcu_read_lock(&fs_info->subvol_srcu);  	ret = fixup_tree_root_location(fs_info, dir, dentry,  				       &location, &sub_root);  	if (ret < 0) { @@ -5345,7 +5400,8 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)  	} else {  		inode = btrfs_iget(dir->i_sb, &location, sub_root);  	} -	srcu_read_unlock(&fs_info->subvol_srcu, index); +	if (root != sub_root) +		btrfs_put_root(sub_root);  	if (!IS_ERR(inode) && root != sub_root) {  		down_read(&fs_info->cleanup_work_sem); @@ -5826,7 +5882,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,  	 */  	BTRFS_I(inode)->index_cnt = 2;  	BTRFS_I(inode)->dir_index = *index; -	BTRFS_I(inode)->root = root; +	BTRFS_I(inode)->root = btrfs_grab_root(root);  	BTRFS_I(inode)->generation = trans->transid;  	inode->i_generation = BTRFS_I(inode)->generation; @@ -6463,6 +6519,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,  	extent_type = btrfs_file_extent_type(leaf, item);  	extent_start = found_key.offset; +	extent_end = btrfs_file_extent_end(path);  	if (extent_type == BTRFS_FILE_EXTENT_REG ||  	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {  		/* Only regular file could have regular/prealloc extent */ @@ -6473,18 +6530,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,  				   
btrfs_ino(inode));  			goto out;  		} -		extent_end = extent_start + -		       btrfs_file_extent_num_bytes(leaf, item); -  		trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,  						       extent_start);  	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { -		size_t size; - -		size = btrfs_file_extent_ram_bytes(leaf, item); -		extent_end = ALIGN(extent_start + size, -				   fs_info->sectorsize); -  		trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,  						      path->slots[0],  						      extent_start); @@ -8211,9 +8259,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,  int btrfs_readpage(struct file *file, struct page *page)  { -	struct extent_io_tree *tree; -	tree = &BTRFS_I(page->mapping->host)->io_tree; -	return extent_read_full_page(tree, page, btrfs_get_extent, 0); +	return extent_read_full_page(page, btrfs_get_extent, 0);  }  static int btrfs_writepage(struct page *page, struct writeback_control *wbc) @@ -8272,6 +8318,39 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)  	return __btrfs_releasepage(page, gfp_flags);  } +#ifdef CONFIG_MIGRATION +static int btrfs_migratepage(struct address_space *mapping, +			     struct page *newpage, struct page *page, +			     enum migrate_mode mode) +{ +	int ret; + +	ret = migrate_page_move_mapping(mapping, newpage, page, 0); +	if (ret != MIGRATEPAGE_SUCCESS) +		return ret; + +	if (page_has_private(page)) { +		ClearPagePrivate(page); +		get_page(newpage); +		set_page_private(newpage, page_private(page)); +		set_page_private(page, 0); +		put_page(page); +		SetPagePrivate(newpage); +	} + +	if (PagePrivate2(page)) { +		ClearPagePrivate2(page); +		SetPagePrivate2(newpage); +	} + +	if (mode != MIGRATE_SYNC_NO_COPY) +		migrate_page_copy(newpage, page); +	else +		migrate_page_states(newpage, page); +	return MIGRATEPAGE_SUCCESS; +} +#endif +  static void btrfs_invalidatepage(struct page *page, unsigned int offset,  				 unsigned int length)  { @@ -8647,7 +8726,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)  			break;  		} -		btrfs_block_rsv_release(fs_info, rsv, -1); +		btrfs_block_rsv_release(fs_info, rsv, -1, NULL);  		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,  					      rsv, min_size, false);  		BUG_ON(ret);	/* shouldn't happen */ @@ -8672,7 +8751,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)  			ret = PTR_ERR(trans);  			goto out;  		} -		btrfs_ordered_update_i_size(inode, inode->i_size, NULL); +		btrfs_inode_safe_disk_i_size_write(inode, 0);  	}  	if (trans) { @@ -8776,6 +8855,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)  	extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);  	extent_io_tree_init(fs_info, &ei->io_failure_tree,  			    IO_TREE_INODE_IO_FAILURE, inode); +	extent_io_tree_init(fs_info, &ei->file_extent_tree, +			    IO_TREE_INODE_FILE_EXTENT, inode);  	ei->io_tree.track_uptodate = true;  	ei->io_failure_tree.track_uptodate = true;  	atomic_set(&ei->sync_writers, 0); @@ -8842,6 +8923,8 @@ void btrfs_destroy_inode(struct inode *inode)  	btrfs_qgroup_check_reserved_leak(inode);  	inode_tree_del(inode);  	btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); +	btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1); +	btrfs_put_root(BTRFS_I(inode)->root);  }  int btrfs_drop_inode(struct inode *inode) @@ -9669,14 +9752,14 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)  	while (!list_empty(&splice) && nr) {  		root = 
list_first_entry(&splice, struct btrfs_root,  					delalloc_root); -		root = btrfs_grab_fs_root(root); +		root = btrfs_grab_root(root);  		BUG_ON(!root);  		list_move_tail(&root->delalloc_root,  			       &fs_info->delalloc_roots);  		spin_unlock(&fs_info->delalloc_root_lock);  		ret = start_delalloc_inodes(root, nr, false); -		btrfs_put_fs_root(root); +		btrfs_put_root(root);  		if (ret < 0)  			goto out; @@ -9938,7 +10021,7 @@ next:  			else  				i_size = cur_offset;  			i_size_write(inode, i_size); -			btrfs_ordered_update_i_size(inode, i_size, NULL); +			btrfs_inode_safe_disk_i_size_write(inode, 0);  		}  		ret = btrfs_update_inode(trans, root, inode); @@ -10474,6 +10557,9 @@ static const struct address_space_operations btrfs_aops = {  	.direct_IO	= btrfs_direct_IO,  	.invalidatepage = btrfs_invalidatepage,  	.releasepage	= btrfs_releasepage, +#ifdef CONFIG_MIGRATION +	.migratepage	= btrfs_migratepage, +#endif  	.set_page_dirty	= btrfs_set_page_dirty,  	.error_remove_page = generic_error_remove_page,  	.swap_activate	= btrfs_swap_activate, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4f4b13830b25..40b729dce91c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -28,6 +28,7 @@  #include <linux/iversion.h>  #include "ctree.h"  #include "disk-io.h" +#include "export.h"  #include "transaction.h"  #include "btrfs_inode.h"  #include "print-tree.h" @@ -86,10 +87,6 @@ struct btrfs_ioctl_send_args_32 {  			       struct btrfs_ioctl_send_args_32)  #endif -static int btrfs_clone(struct inode *src, struct inode *inode, -		       u64 off, u64 olen, u64 olen_aligned, u64 destoff, -		       int no_time_update); -  /* Mask out flags that are inappropriate for the given type of inode. */  static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,  		unsigned int flags) @@ -554,7 +551,6 @@ int __pure btrfs_is_empty_uuid(u8 *uuid)  static noinline int create_subvol(struct inode *dir,  				  struct dentry *dentry,  				  const char *name, int namelen, -				  u64 *async_transid,  				  struct btrfs_qgroup_inherit *inherit)  {  	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -573,7 +569,6 @@ static noinline int create_subvol(struct inode *dir,  	u64 objectid;  	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;  	u64 index = 0; -	uuid_le new_uuid;  	root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);  	if (!root_item) @@ -643,8 +638,7 @@ static noinline int create_subvol(struct inode *dir,  	btrfs_set_root_generation_v2(root_item,  			btrfs_root_generation(root_item)); -	uuid_le_gen(&new_uuid); -	memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); +	generate_random_guid(root_item->uuid);  	btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);  	btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);  	root_item->ctime = root_item->otime; @@ -666,7 +660,7 @@ static noinline int create_subvol(struct inode *dir,  		goto fail;  	key.offset = (u64)-1; -	new_root = btrfs_read_fs_root_no_name(fs_info, &key); +	new_root = btrfs_get_fs_root(fs_info, &key, true);  	if (IS_ERR(new_root)) {  		ret = PTR_ERR(new_root);  		btrfs_abort_transaction(trans, ret); @@ -676,6 +670,7 @@ static noinline int create_subvol(struct inode *dir,  	btrfs_record_root_in_trans(trans, new_root);  	ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); +	btrfs_put_root(new_root);  	if (ret) {  		/* We potentially lose an unused inode item here */  		btrfs_abort_transaction(trans, ret); @@ -727,14 +722,7 @@ fail:  	trans->bytes_reserved = 0;  	
btrfs_subvolume_release_metadata(fs_info, &block_rsv); -	if (async_transid) { -		*async_transid = trans->transid; -		err = btrfs_commit_transaction_async(trans, 1); -		if (err) -			err = btrfs_commit_transaction(trans); -	} else { -		err = btrfs_commit_transaction(trans); -	} +	err = btrfs_commit_transaction(trans);  	if (err && !ret)  		ret = err; @@ -752,8 +740,7 @@ fail_free:  }  static int create_snapshot(struct btrfs_root *root, struct inode *dir, -			   struct dentry *dentry, -			   u64 *async_transid, bool readonly, +			   struct dentry *dentry, bool readonly,  			   struct btrfs_qgroup_inherit *inherit)  {  	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -789,11 +776,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,  	 * possible. This is to avoid later writeback (running dealloc) to  	 * fallback to COW mode and unexpectedly fail with ENOSPC.  	 */ -	atomic_inc(&root->will_be_snapshotted); -	smp_mb__after_atomic(); -	/* wait for no snapshot writes */ -	wait_event(root->subv_writers->wait, -		   percpu_counter_sum(&root->subv_writers->counter) == 0); +	btrfs_drew_read_lock(&root->snapshot_lock);  	ret = btrfs_start_delalloc_snapshot(root);  	if (ret) @@ -841,14 +824,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,  	list_add(&pending_snapshot->list,  		 &trans->transaction->pending_snapshots);  	spin_unlock(&fs_info->trans_lock); -	if (async_transid) { -		*async_transid = trans->transid; -		ret = btrfs_commit_transaction_async(trans, 1); -		if (ret) -			ret = btrfs_commit_transaction(trans); -	} else { -		ret = btrfs_commit_transaction(trans); -	} + +	ret = btrfs_commit_transaction(trans);  	if (ret)  		goto fail; @@ -869,12 +846,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,  	d_instantiate(dentry, inode);  	ret = 0;  fail: +	btrfs_put_root(pending_snapshot->snap);  	btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);  dec_and_free:  	if (snapshot_force_cow)  		atomic_dec(&root->snapshot_force_cow); -	if (atomic_dec_and_test(&root->will_be_snapshotted)) -		wake_up_var(&root->will_be_snapshotted); +	btrfs_drew_read_unlock(&root->snapshot_lock); +  free_pending:  	kfree(pending_snapshot->root_item);  	btrfs_free_path(pending_snapshot->path); @@ -953,7 +931,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)  static noinline int btrfs_mksubvol(const struct path *parent,  				   const char *name, int namelen,  				   struct btrfs_root *snap_src, -				   u64 *async_transid, bool readonly, +				   bool readonly,  				   struct btrfs_qgroup_inherit *inherit)  {  	struct inode *dir = d_inode(parent->dentry); @@ -989,13 +967,11 @@ static noinline int btrfs_mksubvol(const struct path *parent,  	if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)  		goto out_up_read; -	if (snap_src) { -		error = create_snapshot(snap_src, dir, dentry, -					async_transid, readonly, inherit); -	} else { -		error = create_subvol(dir, dentry, name, namelen, -				      async_transid, inherit); -	} +	if (snap_src) +		error = create_snapshot(snap_src, dir, dentry, readonly, inherit); +	else +		error = create_subvol(dir, dentry, name, namelen, inherit); +  	if (!error)  		fsnotify_mkdir(dir, dentry);  out_up_read: @@ -1711,9 +1687,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,  	new_size = round_down(new_size, fs_info->sectorsize); -	btrfs_info_in_rcu(fs_info, "new size for %s is %llu", -			  rcu_str_deref(device->name), new_size); -  	if (new_size > 
old_size) {  		trans = btrfs_start_transaction(root, 0);  		if (IS_ERR(trans)) { @@ -1726,6 +1699,11 @@ static noinline int btrfs_ioctl_resize(struct file *file,  		ret = btrfs_shrink_device(device, new_size);  	} /* equal, nothing need to do */ +	if (ret == 0 && new_size != old_size) +		btrfs_info_in_rcu(fs_info, +			"resize device %s (devid %llu) from %llu to %llu", +			rcu_str_deref(device->name), device->devid, +			old_size, new_size);  out_free:  	kfree(vol_args);  out: @@ -1734,9 +1712,9 @@ out:  	return ret;  } -static noinline int btrfs_ioctl_snap_create_transid(struct file *file, +static noinline int __btrfs_ioctl_snap_create(struct file *file,  				const char *name, unsigned long fd, int subvol, -				u64 *transid, bool readonly, +				bool readonly,  				struct btrfs_qgroup_inherit *inherit)  {  	int namelen; @@ -1763,7 +1741,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,  	if (subvol) {  		ret = btrfs_mksubvol(&file->f_path, name, namelen, -				     NULL, transid, readonly, inherit); +				     NULL, readonly, inherit);  	} else {  		struct fd src = fdget(fd);  		struct inode *src_inode; @@ -1786,7 +1764,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,  		} else {  			ret = btrfs_mksubvol(&file->f_path, name, namelen,  					     BTRFS_I(src_inode)->root, -					     transid, readonly, inherit); +					     readonly, inherit);  		}  		fdput(src);  	} @@ -1810,9 +1788,8 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,  		return PTR_ERR(vol_args);  	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; -	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, -					      vol_args->fd, subvol, -					      NULL, false, NULL); +	ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd, +					subvol, false, NULL);  	kfree(vol_args);  	return ret; @@ -1823,8 +1800,6 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,  {  	struct btrfs_ioctl_vol_args_v2 *vol_args;  	int ret; -	u64 transid = 0; -	u64 *ptr = NULL;  	bool readonly = false;  	struct btrfs_qgroup_inherit *inherit = NULL; @@ -1836,22 +1811,11 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,  		return PTR_ERR(vol_args);  	vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; -	if (vol_args->flags & -	    ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY | -	      BTRFS_SUBVOL_QGROUP_INHERIT)) { +	if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) {  		ret = -EOPNOTSUPP;  		goto free_args;  	} -	if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) { -		struct inode *inode = file_inode(file); -		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - -		btrfs_warn(fs_info, -"SNAP_CREATE_V2 ioctl with CREATE_ASYNC is deprecated and will be removed in kernel 5.7"); - -		ptr = &transid; -	}  	if (vol_args->flags & BTRFS_SUBVOL_RDONLY)  		readonly = true;  	if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { @@ -1866,18 +1830,10 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,  		}  	} -	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, -					      vol_args->fd, subvol, ptr, -					      readonly, inherit); +	ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd, +					subvol, readonly, inherit);  	if (ret)  		goto free_inherit; - -	if (ptr && copy_to_user(arg + -				offsetof(struct btrfs_ioctl_vol_args_v2, -					transid), -				ptr, sizeof(*ptr))) -		ret = -EFAULT; -  free_inherit:  	kfree(inherit);  free_args: @@ -1936,11 +1892,6 @@ static noinline int btrfs_ioctl_subvol_setflags(struct 
file *file,  		goto out_drop_write;  	} -	if (flags & BTRFS_SUBVOL_CREATE_ASYNC) { -		ret = -EINVAL; -		goto out_drop_write; -	} -  	if (flags & ~BTRFS_SUBVOL_RDONLY) {  		ret = -EOPNOTSUPP;  		goto out_drop_write; @@ -2174,12 +2125,12 @@ static noinline int search_ioctl(struct inode *inode,  	if (sk->tree_id == 0) {  		/* search the root of the inode that was passed */ -		root = BTRFS_I(inode)->root; +		root = btrfs_grab_root(BTRFS_I(inode)->root);  	} else {  		key.objectid = sk->tree_id;  		key.type = BTRFS_ROOT_ITEM_KEY;  		key.offset = (u64)-1; -		root = btrfs_read_fs_root_no_name(info, &key); +		root = btrfs_get_fs_root(info, &key, true);  		if (IS_ERR(root)) {  			btrfs_free_path(path);  			return PTR_ERR(root); @@ -2208,6 +2159,7 @@ static noinline int search_ioctl(struct inode *inode,  		ret = 0;  err:  	sk->nr_items = num_found; +	btrfs_put_root(root);  	btrfs_free_path(path);  	return ret;  } @@ -2314,9 +2266,10 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,  	key.objectid = tree_id;  	key.type = BTRFS_ROOT_ITEM_KEY;  	key.offset = (u64)-1; -	root = btrfs_read_fs_root_no_name(info, &key); +	root = btrfs_get_fs_root(info, &key, true);  	if (IS_ERR(root)) {  		ret = PTR_ERR(root); +		root = NULL;  		goto out;  	} @@ -2367,6 +2320,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,  	name[total_len] = '\0';  	ret = 0;  out: +	btrfs_put_root(root);  	btrfs_free_path(path);  	return ret;  } @@ -2383,7 +2337,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,  	unsigned long item_len;  	struct btrfs_inode_ref *iref;  	struct btrfs_root_ref *rref; -	struct btrfs_root *root; +	struct btrfs_root *root = NULL;  	struct btrfs_path *path;  	struct btrfs_key key, key2;  	struct extent_buffer *leaf; @@ -2408,7 +2362,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,  		key.objectid = treeid;  		key.type = BTRFS_ROOT_ITEM_KEY;  		key.offset = (u64)-1; -		root = btrfs_read_fs_root_no_name(fs_info, &key); +		root = btrfs_get_fs_root(fs_info, &key, true);  		if (IS_ERR(root)) {  			ret = PTR_ERR(root);  			goto out; @@ -2420,15 +2374,15 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,  		while (1) {  			ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);  			if (ret < 0) { -				goto out; +				goto out_put;  			} else if (ret > 0) {  				ret = btrfs_previous_item(root, path, dirid,  							  BTRFS_INODE_REF_KEY);  				if (ret < 0) { -					goto out; +					goto out_put;  				} else if (ret > 0) {  					ret = -ENOENT; -					goto out; +					goto out_put;  				}  			} @@ -2442,7 +2396,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,  			total_len += len + 1;  			if (ptr < args->path) {  				ret = -ENAMETOOLONG; -				goto out; +				goto out_put;  			}  			*(ptr + len) = '/'; @@ -2453,10 +2407,10 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,  			ret = btrfs_previous_item(root, path, dirid,  						  BTRFS_INODE_ITEM_KEY);  			if (ret < 0) { -				goto out; +				goto out_put;  			} else if (ret > 0) {  				ret = -ENOENT; -				goto out; +				goto out_put;  			}  			leaf = path->nodes[0]; @@ -2464,26 +2418,26 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,  			btrfs_item_key_to_cpu(leaf, &key2, slot);  			if (key2.objectid != dirid) {  				ret = -ENOENT; -				goto out; +				goto out_put;  			}  			temp_inode = btrfs_iget(sb, &key2, root);  			if (IS_ERR(temp_inode)) {  				ret = PTR_ERR(temp_inode); -				goto out; +				goto out_put;  			}  			
ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC);  			iput(temp_inode);  			if (ret) {  				ret = -EACCES; -				goto out; +				goto out_put;  			}  			if (key.offset == upper_limit.objectid)  				break;  			if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) {  				ret = -EACCES; -				goto out; +				goto out_put;  			}  			btrfs_release_path(path); @@ -2494,15 +2448,16 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,  		memmove(args->path, ptr, total_len);  		args->path[total_len] = '\0'; +		btrfs_put_root(root); +		root = NULL;  		btrfs_release_path(path);  	}  	/* Get the bottom subvolume's name from ROOT_REF */ -	root = fs_info->tree_root;  	key.objectid = treeid;  	key.type = BTRFS_ROOT_REF_KEY;  	key.offset = args->treeid; -	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);  	if (ret < 0) {  		goto out;  	} else if (ret > 0) { @@ -2529,6 +2484,8 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,  	read_extent_buffer(leaf, args->name, item_off, item_len);  	args->name[item_len] = 0; +out_put: +	btrfs_put_root(root);  out:  	btrfs_free_path(path);  	return ret; @@ -2653,10 +2610,10 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)  	key.objectid = BTRFS_I(inode)->root->root_key.objectid;  	key.type = BTRFS_ROOT_ITEM_KEY;  	key.offset = (u64)-1; -	root = btrfs_read_fs_root_no_name(fs_info, &key); +	root = btrfs_get_fs_root(fs_info, &key, true);  	if (IS_ERR(root)) {  		ret = PTR_ERR(root); -		goto out; +		goto out_free;  	}  	root_item = &root->root_item; @@ -2689,16 +2646,14 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)  	if (key.objectid != BTRFS_FS_TREE_OBJECTID) {  		/* Search root tree for ROOT_BACKREF of this subvolume */ -		root = fs_info->tree_root; -  		key.type = BTRFS_ROOT_BACKREF_KEY;  		key.offset = 0; -		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);  		if (ret < 0) {  			goto out;  		} else if (path->slots[0] >=  			   btrfs_header_nritems(path->nodes[0])) { -			ret = btrfs_next_leaf(root, path); +			ret = btrfs_next_leaf(fs_info->tree_root, path);  			if (ret < 0) {  				goto out;  			} else if (ret > 0) { @@ -2733,6 +2688,8 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)  		ret = -EFAULT;  out: +	btrfs_put_root(root); +out_free:  	btrfs_free_path(path);  	kzfree(subvol_info);  	return ret; @@ -2836,7 +2793,8 @@ out:  }  static noinline int btrfs_ioctl_snap_destroy(struct file *file, -					     void __user *arg) +					     void __user *arg, +					     bool destroy_v2)  {  	struct dentry *parent = file->f_path.dentry;  	struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb); @@ -2845,34 +2803,120 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,  	struct inode *inode;  	struct btrfs_root *root = BTRFS_I(dir)->root;  	struct btrfs_root *dest = NULL; -	struct btrfs_ioctl_vol_args *vol_args; -	int namelen; +	struct btrfs_ioctl_vol_args *vol_args = NULL; +	struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL; +	char *subvol_name, *subvol_name_ptr = NULL; +	int subvol_namelen;  	int err = 0; +	bool destroy_parent = false; -	if (!S_ISDIR(dir->i_mode)) -		return -ENOTDIR; +	if (destroy_v2) { +		vol_args2 = memdup_user(arg, sizeof(*vol_args2)); +		if (IS_ERR(vol_args2)) +			return PTR_ERR(vol_args2); -	vol_args = memdup_user(arg, sizeof(*vol_args)); -	if (IS_ERR(vol_args)) -		return 
PTR_ERR(vol_args); +		if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) { +			err = -EOPNOTSUPP; +			goto out; +		} -	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; -	namelen = strlen(vol_args->name); -	if (strchr(vol_args->name, '/') || -	    strncmp(vol_args->name, "..", namelen) == 0) { -		err = -EINVAL; -		goto out; +		/* +		 * If SPEC_BY_ID is not set, we are looking for the subvolume by +		 * name, same as v1 currently does. +		 */ +		if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) { +			vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0; +			subvol_name = vol_args2->name; + +			err = mnt_want_write_file(file); +			if (err) +				goto out; +		} else { +			if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) { +				err = -EINVAL; +				goto out; +			} + +			err = mnt_want_write_file(file); +			if (err) +				goto out; + +			dentry = btrfs_get_dentry(fs_info->sb, +					BTRFS_FIRST_FREE_OBJECTID, +					vol_args2->subvolid, 0, 0); +			if (IS_ERR(dentry)) { +				err = PTR_ERR(dentry); +				goto out_drop_write; +			} + +			/* +			 * Change the default parent since the subvolume being +			 * deleted can be outside of the current mount point. +			 */ +			parent = btrfs_get_parent(dentry); + +			/* +			 * At this point dentry->d_name can point to '/' if the +			 * subvolume we want to destroy is outside of the +			 * current mount point, so we need to release the +			 * current dentry and execute the lookup to return a new +			 * one with ->d_name pointing to the +			 * <mount point>/subvol_name. +			 */ +			dput(dentry); +			if (IS_ERR(parent)) { +				err = PTR_ERR(parent); +				goto out_drop_write; +			} +			dir = d_inode(parent); + +			/* +			 * If v2 was used with SPEC_BY_ID, a new parent was +			 * allocated since the subvolume can be outside of the +			 * current mount point. Later on we need to release this +			 * new parent dentry.
+			 */ +			destroy_parent = true; + +			subvol_name_ptr = btrfs_get_subvol_name_from_objectid( +						fs_info, vol_args2->subvolid); +			if (IS_ERR(subvol_name_ptr)) { +				err = PTR_ERR(subvol_name_ptr); +				goto free_parent; +			} +			/* subvol_name_ptr is already NULL terminated */ +			subvol_name = (char *)kbasename(subvol_name_ptr); +		} +	} else { +		vol_args = memdup_user(arg, sizeof(*vol_args)); +		if (IS_ERR(vol_args)) +			return PTR_ERR(vol_args); + +		vol_args->name[BTRFS_PATH_NAME_MAX] = 0; +		subvol_name = vol_args->name; + +		err = mnt_want_write_file(file); +		if (err) +			goto out;  	} -	err = mnt_want_write_file(file); -	if (err) -		goto out; +	subvol_namelen = strlen(subvol_name); + +	if (strchr(subvol_name, '/') || +	    strncmp(subvol_name, "..", subvol_namelen) == 0) { +		err = -EINVAL; +		goto free_subvol_name; +	} +	if (!S_ISDIR(dir->i_mode)) { +		err = -ENOTDIR; +		goto free_subvol_name; +	}  	err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);  	if (err == -EINTR) -		goto out_drop_write; -	dentry = lookup_one_len(vol_args->name, parent, namelen); +		goto free_subvol_name; +	dentry = lookup_one_len(subvol_name, parent, subvol_namelen);  	if (IS_ERR(dentry)) {  		err = PTR_ERR(dentry);  		goto out_unlock_dir; @@ -2941,9 +2985,15 @@ out_dput:  	dput(dentry); out_unlock_dir:  	inode_unlock(dir); +free_subvol_name: +	kfree(subvol_name_ptr); +free_parent: +	if (destroy_parent) +		dput(parent); out_drop_write:  	mnt_drop_write_file(file); out: +	kfree(vol_args2);  	kfree(vol_args);  	return err;  } @@ -3069,8 +3119,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)  		goto err_drop;  	} -	/* Check for compatibility reject unknown flags */ -	if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) { +	if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {  		ret = -EOPNOTSUPP;  		goto out;  	} @@ -3220,733 +3269,6 @@ out:  	return ret;  } -static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, -				       struct inode *inode2, u64 loff2, u64 len) -{ -	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); -	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); -} - -static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, -				     struct inode *inode2, u64 loff2, u64 len) -{ -	if (inode1 < inode2) { -		swap(inode1, inode2); -		swap(loff1, loff2); -	} else if (inode1 == inode2 && loff2 < loff1) { -		swap(loff1, loff2); -	} -	lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); -	lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); -} - -static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, -				   struct inode *dst, u64 dst_loff) -{ -	const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; -	int ret; - -	/* -	 * Lock destination range to serialize with concurrent readpages() and -	 * source range to serialize with relocation.
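/*
 * [Editor's note] A user-space sketch, not part of the patch, of deleting a
 * subvolume by id via the SPEC_BY_ID path handled above. The ioctl name
 * BTRFS_IOC_SNAP_DESTROY_V2 and the uapi header are assumptions based on the
 * rest of this series; the function name is hypothetical.
 */
#include <sys/ioctl.h>
#include <linux/btrfs.h>

static int example_delete_subvol_by_id(int mnt_fd, __u64 subvolid)
{
	struct btrfs_ioctl_vol_args_v2 args = {};

	args.flags = BTRFS_SUBVOL_SPEC_BY_ID;
	/* Must be a real subvolume id, i.e. >= BTRFS_FIRST_FREE_OBJECTID. */
	args.subvolid = subvolid;

	return ioctl(mnt_fd, BTRFS_IOC_SNAP_DESTROY_V2, &args);
}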
-	 */ -	btrfs_double_extent_lock(src, loff, dst, dst_loff, len); -	ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); -	btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); - -	return ret; -} - -#define BTRFS_MAX_DEDUPE_LEN	SZ_16M - -static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, -			     struct inode *dst, u64 dst_loff) -{ -	int ret; -	u64 i, tail_len, chunk_count; -	struct btrfs_root *root_dst = BTRFS_I(dst)->root; - -	spin_lock(&root_dst->root_item_lock); -	if (root_dst->send_in_progress) { -		btrfs_warn_rl(root_dst->fs_info, -"cannot deduplicate to root %llu while send operations are using it (%d in progress)", -			      root_dst->root_key.objectid, -			      root_dst->send_in_progress); -		spin_unlock(&root_dst->root_item_lock); -		return -EAGAIN; -	} -	root_dst->dedupe_in_progress++; -	spin_unlock(&root_dst->root_item_lock); - -	tail_len = olen % BTRFS_MAX_DEDUPE_LEN; -	chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); - -	for (i = 0; i < chunk_count; i++) { -		ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, -					      dst, dst_loff); -		if (ret) -			goto out; - -		loff += BTRFS_MAX_DEDUPE_LEN; -		dst_loff += BTRFS_MAX_DEDUPE_LEN; -	} - -	if (tail_len > 0) -		ret = btrfs_extent_same_range(src, loff, tail_len, dst, -					      dst_loff); -out: -	spin_lock(&root_dst->root_item_lock); -	root_dst->dedupe_in_progress--; -	spin_unlock(&root_dst->root_item_lock); - -	return ret; -} - -static int clone_finish_inode_update(struct btrfs_trans_handle *trans, -				     struct inode *inode, -				     u64 endoff, -				     const u64 destoff, -				     const u64 olen, -				     int no_time_update) -{ -	struct btrfs_root *root = BTRFS_I(inode)->root; -	int ret; - -	inode_inc_iversion(inode); -	if (!no_time_update) -		inode->i_mtime = inode->i_ctime = current_time(inode); -	/* -	 * We round up to the block size at eof when determining which -	 * extents to clone above, but shouldn't round up the file size. -	 */ -	if (endoff > destoff + olen) -		endoff = destoff + olen; -	if (endoff > inode->i_size) -		btrfs_i_size_write(BTRFS_I(inode), endoff); - -	ret = btrfs_update_inode(trans, root, inode); -	if (ret) { -		btrfs_abort_transaction(trans, ret); -		btrfs_end_transaction(trans); -		goto out; -	} -	ret = btrfs_end_transaction(trans); -out: -	return ret; -} - -/* - * Make sure we do not end up inserting an inline extent into a file that has - * already other (non-inline) extents. If a file has an inline extent it can - * not have any other extents and the (single) inline extent must start at the - * file offset 0. Failing to respect these rules will lead to file corruption, - * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc - * - * We can have extents that have been already written to disk or we can have - * dirty ranges still in delalloc, in which case the extent maps and items are - * created only when we run delalloc, and the delalloc ranges might fall outside - * the range we are currently locking in the inode's io tree. So we check the - * inode's i_size because of that (i_size updates are done while holding the - * i_mutex, which we are holding here). - * We also check to see if the inode has a size not greater than "datal" but has - * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are - * protected against such concurrent fallocate calls by the i_mutex). 
- * - * If the file has no extents but a size greater than datal, do not allow the - * copy because we would need turn the inline extent into a non-inline one (even - * with NO_HOLES enabled). If we find our destination inode only has one inline - * extent, just overwrite it with the source inline extent if its size is less - * than the source extent's size, or we could copy the source inline extent's - * data into the destination inode's inline extent if the later is greater then - * the former. - */ -static int clone_copy_inline_extent(struct inode *dst, -				    struct btrfs_trans_handle *trans, -				    struct btrfs_path *path, -				    struct btrfs_key *new_key, -				    const u64 drop_start, -				    const u64 datal, -				    const u64 skip, -				    const u64 size, -				    char *inline_data) -{ -	struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb); -	struct btrfs_root *root = BTRFS_I(dst)->root; -	const u64 aligned_end = ALIGN(new_key->offset + datal, -				      fs_info->sectorsize); -	int ret; -	struct btrfs_key key; - -	if (new_key->offset > 0) -		return -EOPNOTSUPP; - -	key.objectid = btrfs_ino(BTRFS_I(dst)); -	key.type = BTRFS_EXTENT_DATA_KEY; -	key.offset = 0; -	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); -	if (ret < 0) { -		return ret; -	} else if (ret > 0) { -		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { -			ret = btrfs_next_leaf(root, path); -			if (ret < 0) -				return ret; -			else if (ret > 0) -				goto copy_inline_extent; -		} -		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); -		if (key.objectid == btrfs_ino(BTRFS_I(dst)) && -		    key.type == BTRFS_EXTENT_DATA_KEY) { -			ASSERT(key.offset > 0); -			return -EOPNOTSUPP; -		} -	} else if (i_size_read(dst) <= datal) { -		struct btrfs_file_extent_item *ei; -		u64 ext_len; - -		/* -		 * If the file size is <= datal, make sure there are no other -		 * extents following (can happen do to an fallocate call with -		 * the flag FALLOC_FL_KEEP_SIZE). -		 */ -		ei = btrfs_item_ptr(path->nodes[0], path->slots[0], -				    struct btrfs_file_extent_item); -		/* -		 * If it's an inline extent, it can not have other extents -		 * following it. -		 */ -		if (btrfs_file_extent_type(path->nodes[0], ei) == -		    BTRFS_FILE_EXTENT_INLINE) -			goto copy_inline_extent; - -		ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei); -		if (ext_len > aligned_end) -			return -EOPNOTSUPP; - -		ret = btrfs_next_item(root, path); -		if (ret < 0) { -			return ret; -		} else if (ret == 0) { -			btrfs_item_key_to_cpu(path->nodes[0], &key, -					      path->slots[0]); -			if (key.objectid == btrfs_ino(BTRFS_I(dst)) && -			    key.type == BTRFS_EXTENT_DATA_KEY) -				return -EOPNOTSUPP; -		} -	} - -copy_inline_extent: -	/* -	 * We have no extent items, or we have an extent at offset 0 which may -	 * or may not be inlined. All these cases are dealt the same way. -	 */ -	if (i_size_read(dst) > datal) { -		/* -		 * If the destination inode has an inline extent... -		 * This would require copying the data from the source inline -		 * extent into the beginning of the destination's inline extent. -		 * But this is really complex, both extents can be compressed -		 * or just one of them, which would require decompressing and -		 * re-compressing data (which could increase the new compressed -		 * size, not allowing the compressed data to fit anymore in an -		 * inline extent). -		 * So just don't support this case for now (it should be rare, -		 * we are not really saving space when cloning inline extents). 
-		 */ -		return -EOPNOTSUPP; -	} - -	btrfs_release_path(path); -	ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1); -	if (ret) -		return ret; -	ret = btrfs_insert_empty_item(trans, root, path, new_key, size); -	if (ret) -		return ret; - -	if (skip) { -		const u32 start = btrfs_file_extent_calc_inline_size(0); - -		memmove(inline_data + start, inline_data + start + skip, datal); -	} - -	write_extent_buffer(path->nodes[0], inline_data, -			    btrfs_item_ptr_offset(path->nodes[0], -						  path->slots[0]), -			    size); -	inode_add_bytes(dst, datal); -	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags); - -	return 0; -} - -/** - * btrfs_clone() - clone a range from inode file to another - * - * @src: Inode to clone from - * @inode: Inode to clone to - * @off: Offset within source to start clone from - * @olen: Original length, passed by user, of range to clone - * @olen_aligned: Block-aligned value of olen - * @destoff: Offset within @inode to start clone - * @no_time_update: Whether to update mtime/ctime on the target inode - */ -static int btrfs_clone(struct inode *src, struct inode *inode, -		       const u64 off, const u64 olen, const u64 olen_aligned, -		       const u64 destoff, int no_time_update) -{ -	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); -	struct btrfs_root *root = BTRFS_I(inode)->root; -	struct btrfs_path *path = NULL; -	struct extent_buffer *leaf; -	struct btrfs_trans_handle *trans; -	char *buf = NULL; -	struct btrfs_key key; -	u32 nritems; -	int slot; -	int ret; -	const u64 len = olen_aligned; -	u64 last_dest_end = destoff; - -	ret = -ENOMEM; -	buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); -	if (!buf) -		return ret; - -	path = btrfs_alloc_path(); -	if (!path) { -		kvfree(buf); -		return ret; -	} - -	path->reada = READA_FORWARD; -	/* clone data */ -	key.objectid = btrfs_ino(BTRFS_I(src)); -	key.type = BTRFS_EXTENT_DATA_KEY; -	key.offset = off; - -	while (1) { -		u64 next_key_min_offset = key.offset + 1; -		struct btrfs_file_extent_item *extent; -		int type; -		u32 size; -		struct btrfs_key new_key; -		u64 disko = 0, diskl = 0; -		u64 datao = 0, datal = 0; -		u8 comp; -		u64 drop_start; - -		/* -		 * note the key will change type as we walk through the -		 * tree. -		 */ -		path->leave_spinning = 1; -		ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, -				0, 0); -		if (ret < 0) -			goto out; -		/* -		 * First search, if no extent item that starts at offset off was -		 * found but the previous item is an extent item, it's possible -		 * it might overlap our target range, therefore process it. 
-		 */ -		if (key.offset == off && ret > 0 && path->slots[0] > 0) { -			btrfs_item_key_to_cpu(path->nodes[0], &key, -					      path->slots[0] - 1); -			if (key.type == BTRFS_EXTENT_DATA_KEY) -				path->slots[0]--; -		} - -		nritems = btrfs_header_nritems(path->nodes[0]); -process_slot: -		if (path->slots[0] >= nritems) { -			ret = btrfs_next_leaf(BTRFS_I(src)->root, path); -			if (ret < 0) -				goto out; -			if (ret > 0) -				break; -			nritems = btrfs_header_nritems(path->nodes[0]); -		} -		leaf = path->nodes[0]; -		slot = path->slots[0]; - -		btrfs_item_key_to_cpu(leaf, &key, slot); -		if (key.type > BTRFS_EXTENT_DATA_KEY || -		    key.objectid != btrfs_ino(BTRFS_I(src))) -			break; - -		ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); - -		extent = btrfs_item_ptr(leaf, slot, -					struct btrfs_file_extent_item); -		comp = btrfs_file_extent_compression(leaf, extent); -		type = btrfs_file_extent_type(leaf, extent); -		if (type == BTRFS_FILE_EXTENT_REG || -		    type == BTRFS_FILE_EXTENT_PREALLOC) { -			disko = btrfs_file_extent_disk_bytenr(leaf, extent); -			diskl = btrfs_file_extent_disk_num_bytes(leaf, extent); -			datao = btrfs_file_extent_offset(leaf, extent); -			datal = btrfs_file_extent_num_bytes(leaf, extent); -		} else if (type == BTRFS_FILE_EXTENT_INLINE) { -			/* Take upper bound, may be compressed */ -			datal = btrfs_file_extent_ram_bytes(leaf, extent); -		} - -		/* -		 * The first search might have left us at an extent item that -		 * ends before our target range's start, can happen if we have -		 * holes and NO_HOLES feature enabled. -		 */ -		if (key.offset + datal <= off) { -			path->slots[0]++; -			goto process_slot; -		} else if (key.offset >= off + len) { -			break; -		} -		next_key_min_offset = key.offset + datal; -		size = btrfs_item_size_nr(leaf, slot); -		read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), -				   size); - -		btrfs_release_path(path); -		path->leave_spinning = 0; - -		memcpy(&new_key, &key, sizeof(new_key)); -		new_key.objectid = btrfs_ino(BTRFS_I(inode)); -		if (off <= key.offset) -			new_key.offset = key.offset + destoff - off; -		else -			new_key.offset = destoff; - -		/* -		 * Deal with a hole that doesn't have an extent item that -		 * represents it (NO_HOLES feature enabled). -		 * This hole is either in the middle of the cloning range or at -		 * the beginning (fully overlaps it or partially overlaps it). 
-		 */ -		if (new_key.offset != last_dest_end) -			drop_start = last_dest_end; -		else -			drop_start = new_key.offset; - -		if (type == BTRFS_FILE_EXTENT_REG || -		    type == BTRFS_FILE_EXTENT_PREALLOC) { -			struct btrfs_clone_extent_info clone_info; - -			/* -			 *    a  | --- range to clone ---|  b -			 * | ------------- extent ------------- | -			 */ - -			/* Subtract range b */ -			if (key.offset + datal > off + len) -				datal = off + len - key.offset; - -			/* Subtract range a */ -			if (off > key.offset) { -				datao += off - key.offset; -				datal -= off - key.offset; -			} - -			clone_info.disk_offset = disko; -			clone_info.disk_len = diskl; -			clone_info.data_offset = datao; -			clone_info.data_len = datal; -			clone_info.file_offset = new_key.offset; -			clone_info.extent_buf = buf; -			clone_info.item_size = size; -			ret = btrfs_punch_hole_range(inode, path, -						     drop_start, -						     new_key.offset + datal - 1, -						     &clone_info, &trans); -			if (ret) -				goto out; -		} else if (type == BTRFS_FILE_EXTENT_INLINE) { -			u64 skip = 0; -			u64 trim = 0; - -			if (off > key.offset) { -				skip = off - key.offset; -				new_key.offset += skip; -			} - -			if (key.offset + datal > off + len) -				trim = key.offset + datal - (off + len); - -			if (comp && (skip || trim)) { -				ret = -EINVAL; -				goto out; -			} -			size -= skip + trim; -			datal -= skip + trim; - -			/* -			 * If our extent is inline, we know we will drop or -			 * adjust at most 1 extent item in the destination root. -			 * -			 * 1 - adjusting old extent (we may have to split it) -			 * 1 - add new extent -			 * 1 - inode update -			 */ -			trans = btrfs_start_transaction(root, 3); -			if (IS_ERR(trans)) { -				ret = PTR_ERR(trans); -				goto out; -			} - -			ret = clone_copy_inline_extent(inode, trans, path, -						       &new_key, drop_start, -						       datal, skip, size, buf); -			if (ret) { -				if (ret != -EOPNOTSUPP) -					btrfs_abort_transaction(trans, ret); -				btrfs_end_transaction(trans); -				goto out; -			} -		} - -		btrfs_release_path(path); - -		last_dest_end = ALIGN(new_key.offset + datal, -				      fs_info->sectorsize); -		ret = clone_finish_inode_update(trans, inode, last_dest_end, -						destoff, olen, no_time_update); -		if (ret) -			goto out; -		if (new_key.offset + datal >= destoff + len) -			break; - -		btrfs_release_path(path); -		key.offset = next_key_min_offset; - -		if (fatal_signal_pending(current)) { -			ret = -EINTR; -			goto out; -		} -	} -	ret = 0; - -	if (last_dest_end < destoff + len) { -		/* -		 * We have an implicit hole that fully or partially overlaps our -		 * cloning range at its end. This means that we either have the -		 * NO_HOLES feature enabled or the implicit hole happened due to -		 * mixing buffered and direct IO writes against this file. 
-		 */ -		btrfs_release_path(path); -		path->leave_spinning = 0; - -		ret = btrfs_punch_hole_range(inode, path, -					     last_dest_end, destoff + len - 1, -					     NULL, &trans); -		if (ret) -			goto out; - -		ret = clone_finish_inode_update(trans, inode, destoff + len, -						destoff, olen, no_time_update); -	} - -out: -	btrfs_free_path(path); -	kvfree(buf); -	return ret; -} - -static noinline int btrfs_clone_files(struct file *file, struct file *file_src, -					u64 off, u64 olen, u64 destoff) -{ -	struct inode *inode = file_inode(file); -	struct inode *src = file_inode(file_src); -	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); -	int ret; -	u64 len = olen; -	u64 bs = fs_info->sb->s_blocksize; - -	/* -	 * TODO: -	 * - split compressed inline extents.  annoying: we need to -	 *   decompress into destination's address_space (the file offset -	 *   may change, so source mapping won't do), then recompress (or -	 *   otherwise reinsert) a subrange. -	 * -	 * - split destination inode's inline extents.  The inline extents can -	 *   be either compressed or non-compressed. -	 */ - -	/* -	 * VFS's generic_remap_file_range_prep() protects us from cloning the -	 * eof block into the middle of a file, which would result in corruption -	 * if the file size is not blocksize aligned. So we don't need to check -	 * for that case here. -	 */ -	if (off + len == src->i_size) -		len = ALIGN(src->i_size, bs) - off; - -	if (destoff > inode->i_size) { -		const u64 wb_start = ALIGN_DOWN(inode->i_size, bs); - -		ret = btrfs_cont_expand(inode, inode->i_size, destoff); -		if (ret) -			return ret; -		/* -		 * We may have truncated the last block if the inode's size is -		 * not sector size aligned, so we need to wait for writeback to -		 * complete before proceeding further, otherwise we can race -		 * with cloning and attempt to increment a reference to an -		 * extent that no longer exists (writeback completed right after -		 * we found the previous extent covering eof and before we -		 * attempted to increment its reference count). -		 */ -		ret = btrfs_wait_ordered_range(inode, wb_start, -					       destoff - wb_start); -		if (ret) -			return ret; -	} - -	/* -	 * Lock destination range to serialize with concurrent readpages() and -	 * source range to serialize with relocation. -	 */ -	btrfs_double_extent_lock(src, off, inode, destoff, len); -	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); -	btrfs_double_extent_unlock(src, off, inode, destoff, len); -	/* -	 * Truncate page cache pages so that future reads will see the cloned -	 * data immediately and not the previous data. 
-	 */ -	truncate_inode_pages_range(&inode->i_data, -				round_down(destoff, PAGE_SIZE), -				round_up(destoff + len, PAGE_SIZE) - 1); - -	return ret; -} - -static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, -				       struct file *file_out, loff_t pos_out, -				       loff_t *len, unsigned int remap_flags) -{ -	struct inode *inode_in = file_inode(file_in); -	struct inode *inode_out = file_inode(file_out); -	u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; -	bool same_inode = inode_out == inode_in; -	u64 wb_len; -	int ret; - -	if (!(remap_flags & REMAP_FILE_DEDUP)) { -		struct btrfs_root *root_out = BTRFS_I(inode_out)->root; - -		if (btrfs_root_readonly(root_out)) -			return -EROFS; - -		if (file_in->f_path.mnt != file_out->f_path.mnt || -		    inode_in->i_sb != inode_out->i_sb) -			return -EXDEV; -	} - -	/* don't make the dst file partly checksummed */ -	if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != -	    (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { -		return -EINVAL; -	} - -	/* -	 * Now that the inodes are locked, we need to start writeback ourselves -	 * and can not rely on the writeback from the VFS's generic helper -	 * generic_remap_file_range_prep() because: -	 * -	 * 1) For compression we must call filemap_fdatawrite_range() range -	 *    twice (btrfs_fdatawrite_range() does it for us), and the generic -	 *    helper only calls it once; -	 * -	 * 2) filemap_fdatawrite_range(), called by the generic helper only -	 *    waits for the writeback to complete, i.e. for IO to be done, and -	 *    not for the ordered extents to complete. We need to wait for them -	 *    to complete so that new file extent items are in the fs tree. -	 */ -	if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) -		wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); -	else -		wb_len = ALIGN(*len, bs); - -	/* -	 * Since we don't lock ranges, wait for ongoing lockless dio writes (as -	 * any in progress could create its ordered extents after we wait for -	 * existing ordered extents below). -	 */ -	inode_dio_wait(inode_in); -	if (!same_inode) -		inode_dio_wait(inode_out); - -	/* -	 * Workaround to make sure NOCOW buffered write reach disk as NOCOW. -	 * -	 * Btrfs' back references do not have a block level granularity, they -	 * work at the whole extent level. -	 * NOCOW buffered write without data space reserved may not be able -	 * to fall back to CoW due to lack of data space, thus could cause -	 * data loss. -	 * -	 * Here we take a shortcut by flushing the whole inode, so that all -	 * nocow write should reach disk as nocow before we increase the -	 * reference of the extent. We could do better by only flushing NOCOW -	 * data, but that needs extra accounting. -	 * -	 * Also we don't need to check ASYNC_EXTENT, as async extent will be -	 * CoWed anyway, not affecting nocow part. 
-	 */ -	ret = filemap_flush(inode_in->i_mapping); -	if (ret < 0) -		return ret; - -	ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), -				       wb_len); -	if (ret < 0) -		return ret; -	ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), -				       wb_len); -	if (ret < 0) -		return ret; - -	return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, -					    len, remap_flags); -} - -loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, -		struct file *dst_file, loff_t destoff, loff_t len, -		unsigned int remap_flags) -{ -	struct inode *src_inode = file_inode(src_file); -	struct inode *dst_inode = file_inode(dst_file); -	bool same_inode = dst_inode == src_inode; -	int ret; - -	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) -		return -EINVAL; - -	if (same_inode) -		inode_lock(src_inode); -	else -		lock_two_nondirectories(src_inode, dst_inode); - -	ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, -					  &len, remap_flags); -	if (ret < 0 || len == 0) -		goto out_unlock; - -	if (remap_flags & REMAP_FILE_DEDUP) -		ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); -	else -		ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); - -out_unlock: -	if (same_inode) -		inode_unlock(src_inode); -	else -		unlock_two_nondirectories(src_inode, dst_inode); - -	return ret < 0 ? ret : len; -} -  static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)  {  	struct inode *inode = file_inode(file); @@ -3955,7 +3277,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)  	struct btrfs_root *new_root;  	struct btrfs_dir_item *di;  	struct btrfs_trans_handle *trans; -	struct btrfs_path *path; +	struct btrfs_path *path = NULL;  	struct btrfs_key location;  	struct btrfs_disk_key disk_key;  	u64 objectid = 0; @@ -3981,49 +3303,51 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)  	location.type = BTRFS_ROOT_ITEM_KEY;  	location.offset = (u64)-1; -	new_root = btrfs_read_fs_root_no_name(fs_info, &location); +	new_root = btrfs_get_fs_root(fs_info, &location, true);  	if (IS_ERR(new_root)) {  		ret = PTR_ERR(new_root);  		goto out;  	}  	if (!is_fstree(new_root->root_key.objectid)) {  		ret = -ENOENT; -		goto out; +		goto out_free;  	}  	path = btrfs_alloc_path();  	if (!path) {  		ret = -ENOMEM; -		goto out; +		goto out_free;  	}  	path->leave_spinning = 1;  	trans = btrfs_start_transaction(root, 1);  	if (IS_ERR(trans)) { -		btrfs_free_path(path);  		ret = PTR_ERR(trans); -		goto out; +		goto out_free;  	}  	dir_id = btrfs_super_root_dir(fs_info->super_copy);  	di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,  				   dir_id, "default", 7, 1);  	if (IS_ERR_OR_NULL(di)) { -		btrfs_free_path(path); +		btrfs_release_path(path);  		btrfs_end_transaction(trans);  		btrfs_err(fs_info,  			  "Umm, you don't have the default diritem, this isn't going to work");  		ret = -ENOENT; -		goto out; +		goto out_free;  	}  	btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);  	btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);  	btrfs_mark_buffer_dirty(path->nodes[0]); -	btrfs_free_path(path); +	btrfs_release_path(path);  	btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);  	btrfs_end_transaction(trans); +out_free: +	btrfs_put_root(new_root); +	btrfs_free_path(path);  out:  	mnt_drop_write_file(file);  	return ret; @@ -5465,7 +4789,9 @@ long btrfs_ioctl(struct file *file, unsigned int  	case BTRFS_IOC_SUBVOL_CREATE_V2:  		return 
btrfs_ioctl_snap_create_v2(file, argp, 1);  	case BTRFS_IOC_SNAP_DESTROY: -		return btrfs_ioctl_snap_destroy(file, argp); +		return btrfs_ioctl_snap_destroy(file, argp, false); +	case BTRFS_IOC_SNAP_DESTROY_V2: +		return btrfs_ioctl_snap_destroy(file, argp, true);  	case BTRFS_IOC_SUBVOL_GETFLAGS:  		return btrfs_ioctl_subvol_getflags(file, argp);  	case BTRFS_IOC_SUBVOL_SETFLAGS: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 571c4826c428..fb647d8cf527 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -523,3 +523,138 @@ void btrfs_unlock_up_safe(struct btrfs_path *path, int level)  		path->locks[i] = 0;  	}  } + +/* + * Loop around taking references on and locking the root node of the tree until + * we end up with a lock on the root node. + * + * Return: root extent buffer with write lock held + */ +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) +{ +	struct extent_buffer *eb; + +	while (1) { +		eb = btrfs_root_node(root); +		btrfs_tree_lock(eb); +		if (eb == root->node) +			break; +		btrfs_tree_unlock(eb); +		free_extent_buffer(eb); +	} +	return eb; +} + +/* + * Loop around taking references on and locking the root node of the tree until + * we end up with a lock on the root node. + * + * Return: root extent buffer with read lock held + */ +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +{ +	struct extent_buffer *eb; + +	while (1) { +		eb = btrfs_root_node(root); +		btrfs_tree_read_lock(eb); +		if (eb == root->node) +			break; +		btrfs_tree_read_unlock(eb); +		free_extent_buffer(eb); +	} +	return eb; +} + +/* + * DREW locks + * ========== + * + * DREW stands for double-reader-writer-exclusion lock. It's used in situation + * where you want to provide A-B exclusion but not AA or BB. + * + * Currently implementation gives more priority to reader. If a reader and a + * writer both race to acquire their respective sides of the lock the writer + * would yield its lock as soon as it detects a concurrent reader. Additionally + * if there are pending readers no new writers would be allowed to come in and + * acquire the lock. + */ + +int btrfs_drew_lock_init(struct btrfs_drew_lock *lock) +{ +	int ret; + +	ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL); +	if (ret) +		return ret; + +	atomic_set(&lock->readers, 0); +	init_waitqueue_head(&lock->pending_readers); +	init_waitqueue_head(&lock->pending_writers); + +	return 0; +} + +void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock) +{ +	percpu_counter_destroy(&lock->writers); +} + +/* Return true if acquisition is successful, false otherwise */ +bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock) +{ +	if (atomic_read(&lock->readers)) +		return false; + +	percpu_counter_inc(&lock->writers); + +	/* Ensure writers count is updated before we check for pending readers */ +	smp_mb(); +	if (atomic_read(&lock->readers)) { +		btrfs_drew_write_unlock(lock); +		return false; +	} + +	return true; +} + +void btrfs_drew_write_lock(struct btrfs_drew_lock *lock) +{ +	while (true) { +		if (btrfs_drew_try_write_lock(lock)) +			return; +		wait_event(lock->pending_writers, !atomic_read(&lock->readers)); +	} +} + +void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock) +{ +	percpu_counter_dec(&lock->writers); +	cond_wake_up(&lock->pending_readers); +} + +void btrfs_drew_read_lock(struct btrfs_drew_lock *lock) +{ +	atomic_inc(&lock->readers); + +	/* +	 * Ensure the pending reader count is perceieved BEFORE this reader +	 * goes to sleep in case of active writers. 
This guarantees new writers +	 * won't be allowed and that the current reader will be woken up when +	 * the last active writer finishes its jobs. +	 */ +	smp_mb__after_atomic(); + +	wait_event(lock->pending_readers, +		   percpu_counter_sum(&lock->writers) == 0); +} + +void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock) +{ +	/* +	 * atomic_dec_and_test implies a full barrier, so woken up writers +	 * are guaranteed to see the decrement +	 */ +	if (atomic_dec_and_test(&lock->readers)) +		wake_up(&lock->pending_writers); +} diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 21a285883e89..d715846c10b8 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -6,6 +6,9 @@  #ifndef BTRFS_LOCKING_H  #define BTRFS_LOCKING_H +#include <linux/atomic.h> +#include <linux/wait.h> +#include <linux/percpu_counter.h>  #include "extent_io.h"  #define BTRFS_WRITE_LOCK 1 @@ -13,6 +16,8 @@  #define BTRFS_WRITE_LOCK_BLOCKING 3  #define BTRFS_READ_LOCK_BLOCKING 4 +struct btrfs_path; +  void btrfs_tree_lock(struct extent_buffer *eb);  void btrfs_tree_unlock(struct extent_buffer *eb); @@ -48,4 +53,19 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)  		BUG();  } +struct btrfs_drew_lock { +	atomic_t readers; +	struct percpu_counter writers; +	wait_queue_head_t pending_writers; +	wait_queue_head_t pending_readers; +}; + +int btrfs_drew_lock_init(struct btrfs_drew_lock *lock); +void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock); +void btrfs_drew_write_lock(struct btrfs_drew_lock *lock); +bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock); +void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock); +void btrfs_drew_read_lock(struct btrfs_drew_lock *lock); +void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock); +  #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a65f189a5b94..e13b3d28c063 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -580,7 +580,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,  	while (!list_empty(&splice) && nr) {  		root = list_first_entry(&splice, struct btrfs_root,  					ordered_root); -		root = btrfs_grab_fs_root(root); +		root = btrfs_grab_root(root);  		BUG_ON(!root);  		list_move_tail(&root->ordered_root,  			       &fs_info->ordered_roots); @@ -588,7 +588,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,  		done = btrfs_wait_ordered_extents(root, nr,  						  range_start, range_len); -		btrfs_put_fs_root(root); +		btrfs_put_root(root);  		spin_lock(&fs_info->ordered_root_lock);  		if (nr != U64_MAX) { @@ -786,134 +786,6 @@ out:  }  /* - * After an extent is done, call this to conditionally update the on disk - * i_size.  i_size is updated to cover any fully written part of the file. 
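Editorial note (not part of the diff above): a minimal usage sketch for the DREW lock API added to fs/btrfs/locking.c and locking.h earlier in this patch. The embedding structure and the two operations are hypothetical; only the btrfs_drew_* calls come from the patch. The point of the primitive is A-B exclusion: any number of holders on one side may run together, but the two sides never overlap, and readers are given priority over writers.

struct my_object {
	struct btrfs_drew_lock op_lock;		/* hypothetical embedding */
};

static int my_object_init(struct my_object *obj)
{
	/* Allocates the per-CPU writer counter; may return -ENOMEM. */
	return btrfs_drew_lock_init(&obj->op_lock);
}

static void my_object_free(struct my_object *obj)
{
	btrfs_drew_lock_destroy(&obj->op_lock);
}

/* "A" side: several may run concurrently, never together with a "B" holder. */
static void do_operation_a(struct my_object *obj)
{
	btrfs_drew_read_lock(&obj->op_lock);
	/* ... work that must be excluded from any B operation ... */
	btrfs_drew_read_unlock(&obj->op_lock);
}

/* "B" side: btrfs_drew_try_write_lock() is the non-blocking variant. */
static void do_operation_b(struct my_object *obj)
{
	btrfs_drew_write_lock(&obj->op_lock);
	/* ... work that must be excluded from any A operation ... */
	btrfs_drew_write_unlock(&obj->op_lock);
}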
- */ -int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, -				struct btrfs_ordered_extent *ordered) -{ -	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; -	u64 disk_i_size; -	u64 new_i_size; -	u64 i_size = i_size_read(inode); -	struct rb_node *node; -	struct rb_node *prev = NULL; -	struct btrfs_ordered_extent *test; -	int ret = 1; -	u64 orig_offset = offset; - -	spin_lock_irq(&tree->lock); -	if (ordered) { -		offset = entry_end(ordered); -		if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) -			offset = min(offset, -				     ordered->file_offset + -				     ordered->truncated_len); -	} else { -		offset = ALIGN(offset, btrfs_inode_sectorsize(inode)); -	} -	disk_i_size = BTRFS_I(inode)->disk_i_size; - -	/* -	 * truncate file. -	 * If ordered is not NULL, then this is called from endio and -	 * disk_i_size will be updated by either truncate itself or any -	 * in-flight IOs which are inside the disk_i_size. -	 * -	 * Because btrfs_setsize() may set i_size with disk_i_size if truncate -	 * fails somehow, we need to make sure we have a precise disk_i_size by -	 * updating it as usual. -	 * -	 */ -	if (!ordered && disk_i_size > i_size) { -		BTRFS_I(inode)->disk_i_size = orig_offset; -		ret = 0; -		goto out; -	} - -	/* -	 * if the disk i_size is already at the inode->i_size, or -	 * this ordered extent is inside the disk i_size, we're done -	 */ -	if (disk_i_size == i_size) -		goto out; - -	/* -	 * We still need to update disk_i_size if outstanding_isize is greater -	 * than disk_i_size. -	 */ -	if (offset <= disk_i_size && -	    (!ordered || ordered->outstanding_isize <= disk_i_size)) -		goto out; - -	/* -	 * walk backward from this ordered extent to disk_i_size. -	 * if we find an ordered extent then we can't update disk i_size -	 * yet -	 */ -	if (ordered) { -		node = rb_prev(&ordered->rb_node); -	} else { -		prev = tree_search(tree, offset); -		/* -		 * we insert file extents without involving ordered struct, -		 * so there should be no ordered struct cover this offset -		 */ -		if (prev) { -			test = rb_entry(prev, struct btrfs_ordered_extent, -					rb_node); -			BUG_ON(offset_in_entry(test, offset)); -		} -		node = prev; -	} -	for (; node; node = rb_prev(node)) { -		test = rb_entry(node, struct btrfs_ordered_extent, rb_node); - -		/* We treat this entry as if it doesn't exist */ -		if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) -			continue; - -		if (entry_end(test) <= disk_i_size) -			break; -		if (test->file_offset >= i_size) -			break; - -		/* -		 * We don't update disk_i_size now, so record this undealt -		 * i_size. Or we will not know the real i_size. -		 */ -		if (test->outstanding_isize < offset) -			test->outstanding_isize = offset; -		if (ordered && -		    ordered->outstanding_isize > test->outstanding_isize) -			test->outstanding_isize = ordered->outstanding_isize; -		goto out; -	} -	new_i_size = min_t(u64, offset, i_size); - -	/* -	 * Some ordered extents may completed before the current one, and -	 * we hold the real i_size in ->outstanding_isize. 
-	 */ -	if (ordered && ordered->outstanding_isize > new_i_size) -		new_i_size = min_t(u64, ordered->outstanding_isize, i_size); -	BTRFS_I(inode)->disk_i_size = new_i_size; -	ret = 0; -out: -	/* -	 * We need to do this because we can't remove ordered extents until -	 * after the i_disk_size has been updated and then the inode has been -	 * updated to reflect the change, so we need to tell anybody who finds -	 * this ordered extent that we've already done all the real work, we -	 * just haven't completed all the other work. -	 */ -	if (ordered) -		set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags); -	spin_unlock_irq(&tree->lock); -	return ret; -} - -/*   * search the ordered extents for one corresponding to 'offset' and   * try to find a checksum.  This is used because we allow pages to   * be reclaimed before their checksum is actually put into the btree @@ -963,7 +835,6 @@ out:   * btrfs_flush_ordered_range - Lock the passed range and ensures all pending   * ordered extents in it are run to completion.   * - * @tree:         IO tree used for locking out other users of the range   * @inode:        Inode whose ordered tree is to be searched   * @start:        Beginning of range to flush   * @end:          Last byte of range to lock @@ -973,8 +844,7 @@ out:   * This function always returns with the given range locked, ensuring after it's   * called no order extent can be pending.   */ -void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, -					struct btrfs_inode *inode, u64 start, +void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,  					u64 end,  					struct extent_state **cached_state)  { @@ -986,7 +856,7 @@ void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,  		cachedp = cached_state;  	while (1) { -		lock_extent_bits(tree, start, end, cachedp); +		lock_extent_bits(&inode->io_tree, start, end, cachedp);  		ordered = btrfs_lookup_ordered_range(inode, start,  						     end - start + 1);  		if (!ordered) { @@ -999,7 +869,7 @@ void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,  				refcount_dec(&cache->refs);  			break;  		} -		unlock_extent_cached(tree, start, end, cachedp); +		unlock_extent_cached(&inode->io_tree, start, end, cachedp);  		btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);  		btrfs_put_ordered_extent(ordered);  	} diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 3beb4da4ab41..c01c9698250b 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -52,11 +52,6 @@ enum {  	BTRFS_ORDERED_DIRECT,  	/* We had an io error when writing this out */  	BTRFS_ORDERED_IOERR, -	/* -	 * indicates whether this ordered extent has done its due diligence in -	 * updating the isize -	 */ -	BTRFS_ORDERED_UPDATED_ISIZE,  	/* Set when we have to truncate an extent */  	BTRFS_ORDERED_TRUNCATED,  	/* Regular IO for COW */ @@ -182,16 +177,13 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(  		struct btrfs_inode *inode,  		u64 file_offset,  		u64 len); -int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, -				struct btrfs_ordered_extent *ordered);  int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,  			   u8 *sum, int len);  u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,  			       const u64 range_start, const u64 range_len);  void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,  			      const u64 range_start, const u64 range_len); -void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, 
-					struct btrfs_inode *inode, u64 start, +void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,  					u64 end,  					struct extent_state **cached_state);  int __init ordered_data_init(void); diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index deb59e7cfcac..ff1ff90e48b1 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -383,7 +383,7 @@ static int inherit_props(struct btrfs_trans_handle *trans,  		if (need_reserve) {  			btrfs_block_rsv_release(fs_info, trans->block_rsv, -					num_bytes); +					num_bytes, NULL);  			if (ret)  				return ret;  		} diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index ff1870ff3474..c3888fb367e7 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1030,6 +1030,7 @@ out_add_root:  	ret = qgroup_rescan_init(fs_info, 0, 1);  	if (!ret) {  	        qgroup_rescan_zero_tracking(fs_info); +		fs_info->qgroup_rescan_running = true;  	        btrfs_queue_work(fs_info->qgroup_rescan_workers,  	                         &fs_info->qgroup_rescan_work);  	} @@ -1037,11 +1038,8 @@ out_add_root:  out_free_path:  	btrfs_free_path(path);  out_free_root: -	if (ret) { -		free_extent_buffer(quota_root->node); -		free_extent_buffer(quota_root->commit_root); -		kfree(quota_root); -	} +	if (ret) +		btrfs_put_root(quota_root);  out:  	if (ret) {  		ulist_free(fs_info->qgroup_ulist); @@ -1104,9 +1102,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)  	btrfs_tree_unlock(quota_root->node);  	btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1); -	free_extent_buffer(quota_root->node); -	free_extent_buffer(quota_root->commit_root); -	kfree(quota_root); +	btrfs_put_root(quota_root);  end_trans:  	ret = btrfs_end_transaction(trans); @@ -3237,7 +3233,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,  	}  	mutex_lock(&fs_info->qgroup_rescan_lock); -	spin_lock(&fs_info->qgroup_lock);  	if (init_flags) {  		if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { @@ -3252,7 +3247,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,  		}  		if (ret) { -			spin_unlock(&fs_info->qgroup_lock);  			mutex_unlock(&fs_info->qgroup_rescan_lock);  			return ret;  		} @@ -3263,9 +3257,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,  		sizeof(fs_info->qgroup_rescan_progress));  	fs_info->qgroup_rescan_progress.objectid = progress_objectid;  	init_completion(&fs_info->qgroup_rescan_completion); -	fs_info->qgroup_rescan_running = true; - -	spin_unlock(&fs_info->qgroup_lock);  	mutex_unlock(&fs_info->qgroup_rescan_lock);  	btrfs_init_work(&fs_info->qgroup_rescan_work, @@ -3326,8 +3317,11 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)  	qgroup_rescan_zero_tracking(fs_info); +	mutex_lock(&fs_info->qgroup_rescan_lock); +	fs_info->qgroup_rescan_running = true;  	btrfs_queue_work(fs_info->qgroup_rescan_workers,  			 &fs_info->qgroup_rescan_work); +	mutex_unlock(&fs_info->qgroup_rescan_lock);  	return 0;  } @@ -3339,9 +3333,7 @@ int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,  	int ret = 0;  	mutex_lock(&fs_info->qgroup_rescan_lock); -	spin_lock(&fs_info->qgroup_lock);  	running = fs_info->qgroup_rescan_running; -	spin_unlock(&fs_info->qgroup_lock);  	mutex_unlock(&fs_info->qgroup_rescan_lock);  	if (!running) @@ -3363,9 +3355,13 @@ int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,  void  btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)  { -	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) +	if 
(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { +		mutex_lock(&fs_info->qgroup_rescan_lock); +		fs_info->qgroup_rescan_running = true;  		btrfs_queue_work(fs_info->qgroup_rescan_workers,  				 &fs_info->qgroup_rescan_work); +		mutex_unlock(&fs_info->qgroup_rescan_lock); +	}  }  /* diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index a8e53c8e7b01..c870ef70f817 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -206,7 +206,6 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)  	struct btrfs_stripe_hash *h;  	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;  	int i; -	int table_size;  	if (info->stripe_hash_table)  		return 0; @@ -218,8 +217,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)  	 * Try harder to allocate and fallback to vmalloc to lower the chance  	 * of a failing mount.  	 */ -	table_size = sizeof(*table) + sizeof(*h) * num_entries; -	table = kvzalloc(table_size, GFP_KERNEL); +	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);  	if (!table)  		return -ENOMEM; @@ -1196,22 +1194,19 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)  	int nr_data = rbio->nr_data;  	int stripe;  	int pagenr; -	int p_stripe = -1; -	int q_stripe = -1; +	bool has_qstripe;  	struct bio_list bio_list;  	struct bio *bio;  	int ret;  	bio_list_init(&bio_list); -	if (rbio->real_stripes - rbio->nr_data == 1) { -		p_stripe = rbio->real_stripes - 1; -	} else if (rbio->real_stripes - rbio->nr_data == 2) { -		p_stripe = rbio->real_stripes - 2; -		q_stripe = rbio->real_stripes - 1; -	} else { +	if (rbio->real_stripes - rbio->nr_data == 1) +		has_qstripe = false; +	else if (rbio->real_stripes - rbio->nr_data == 2) +		has_qstripe = true; +	else  		BUG(); -	}  	/* at this point we either have a full stripe,  	 * or we've read the full stripe from the drive. 
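Editorial note (not part of the diff above): the kvzalloc(struct_size(...)) conversion in raid56.c replaces an open-coded size computation, and the rcu_string change switches to a C99 flexible array member. Below is a minimal sketch of the same pattern on a hypothetical structure; struct_size() comes from <linux/overflow.h> and saturates at SIZE_MAX on overflow instead of wrapping, which is why it is preferred over sizeof(*t) + n * sizeof(t->items[0]).

#include <linux/mm.h>
#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/types.h>

struct item {
	u64 key;
	u64 value;
};

struct item_table {
	unsigned int nr_items;
	struct item items[];	/* flexible array member, like rcu_string::str */
};

static struct item_table *alloc_item_table(unsigned int n)
{
	struct item_table *t;

	/* sizeof(*t) + n * sizeof(t->items[0]), with overflow checking. */
	t = kvzalloc(struct_size(t, items, n), GFP_KERNEL);
	if (!t)
		return NULL;
	t->nr_items = n;
	return t;
}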
@@ -1255,7 +1250,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)  		SetPageUptodate(p);  		pointers[stripe++] = kmap(p); -		if (q_stripe != -1) { +		if (has_qstripe) {  			/*  			 * raid6, add the qstripe and call the @@ -2353,8 +2348,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,  	int nr_data = rbio->nr_data;  	int stripe;  	int pagenr; -	int p_stripe = -1; -	int q_stripe = -1; +	bool has_qstripe;  	struct page *p_page = NULL;  	struct page *q_page = NULL;  	struct bio_list bio_list; @@ -2364,14 +2358,12 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,  	bio_list_init(&bio_list); -	if (rbio->real_stripes - rbio->nr_data == 1) { -		p_stripe = rbio->real_stripes - 1; -	} else if (rbio->real_stripes - rbio->nr_data == 2) { -		p_stripe = rbio->real_stripes - 2; -		q_stripe = rbio->real_stripes - 1; -	} else { +	if (rbio->real_stripes - rbio->nr_data == 1) +		has_qstripe = false; +	else if (rbio->real_stripes - rbio->nr_data == 2) +		has_qstripe = true; +	else  		BUG(); -	}  	if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {  		is_replace = 1; @@ -2393,7 +2385,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,  		goto cleanup;  	SetPageUptodate(p_page); -	if (q_stripe != -1) { +	if (has_qstripe) {  		q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);  		if (!q_page) {  			__free_page(p_page); @@ -2416,8 +2408,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,  		/* then add the parity stripe */  		pointers[stripe++] = kmap(p_page); -		if (q_stripe != -1) { - +		if (has_qstripe) {  			/*  			 * raid6, add the qstripe and call the  			 * library function to fill in our p/q diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h index a97dc74a4d3d..5c1a617eb25d 100644 --- a/fs/btrfs/rcu-string.h +++ b/fs/btrfs/rcu-string.h @@ -8,7 +8,7 @@  struct rcu_string {  	struct rcu_head rcu; -	char str[0]; +	char str[];  };  static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask) diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 454a1015d026..7887317033c9 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -803,6 +803,15 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,  			kfree(ref);  			kfree(ra);  			goto out_unlock; +		} else if (be->num_refs == 0) { +			btrfs_err(fs_info, +		"trying to do action %d for a bytenr that has 0 total references", +				action); +			dump_block_entry(fs_info, be); +			dump_ref_action(fs_info, ra); +			kfree(ref); +			kfree(ra); +			goto out_unlock;  		}  		if (!parent) { diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c new file mode 100644 index 000000000000..d1973141d3bb --- /dev/null +++ b/fs/btrfs/reflink.c @@ -0,0 +1,804 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/blkdev.h> +#include <linux/iversion.h> +#include "compression.h" +#include "ctree.h" +#include "delalloc-space.h" +#include "reflink.h" +#include "transaction.h" + +#define BTRFS_MAX_DEDUPE_LEN	SZ_16M + +static int clone_finish_inode_update(struct btrfs_trans_handle *trans, +				     struct inode *inode, +				     u64 endoff, +				     const u64 destoff, +				     const u64 olen, +				     int no_time_update) +{ +	struct btrfs_root *root = BTRFS_I(inode)->root; +	int ret; + +	inode_inc_iversion(inode); +	if (!no_time_update) +		inode->i_mtime = inode->i_ctime = current_time(inode); +	/* +	 * We round up to the block size at eof when determining which +	 * extents to clone above, but shouldn't round up the file 
size. +	 */ +	if (endoff > destoff + olen) +		endoff = destoff + olen; +	if (endoff > inode->i_size) { +		i_size_write(inode, endoff); +		btrfs_inode_safe_disk_i_size_write(inode, 0); +	} + +	ret = btrfs_update_inode(trans, root, inode); +	if (ret) { +		btrfs_abort_transaction(trans, ret); +		btrfs_end_transaction(trans); +		goto out; +	} +	ret = btrfs_end_transaction(trans); +out: +	return ret; +} + +static int copy_inline_to_page(struct inode *inode, +			       const u64 file_offset, +			       char *inline_data, +			       const u64 size, +			       const u64 datal, +			       const u8 comp_type) +{ +	const u64 block_size = btrfs_inode_sectorsize(inode); +	const u64 range_end = file_offset + block_size - 1; +	const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); +	char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); +	struct extent_changeset *data_reserved = NULL; +	struct page *page = NULL; +	int ret; + +	ASSERT(IS_ALIGNED(file_offset, block_size)); + +	/* +	 * We have flushed and locked the ranges of the source and destination +	 * inodes, we also have locked the inodes, so we are safe to do a +	 * reservation here. Also we must not do the reservation while holding +	 * a transaction open, otherwise we would deadlock. +	 */ +	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset, +					   block_size); +	if (ret) +		goto out; + +	page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT, +				   btrfs_alloc_write_mask(inode->i_mapping)); +	if (!page) { +		ret = -ENOMEM; +		goto out_unlock; +	} + +	set_page_extent_mapped(page); +	clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end, +			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, +			 0, 0, NULL); +	ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); +	if (ret) +		goto out_unlock; + +	if (comp_type == BTRFS_COMPRESS_NONE) { +		char *map; + +		map = kmap(page); +		memcpy(map, data_start, datal); +		flush_dcache_page(page); +		kunmap(page); +	} else { +		ret = btrfs_decompress(comp_type, data_start, page, 0, +				       inline_size, datal); +		if (ret) +			goto out_unlock; +		flush_dcache_page(page); +	} + +	/* +	 * If our inline data is smaller then the block/page size, then the +	 * remaining of the block/page is equivalent to zeroes. We had something +	 * like the following done: +	 * +	 * $ xfs_io -f -c "pwrite -S 0xab 0 500" file +	 * $ sync  # (or fsync) +	 * $ xfs_io -c "falloc 0 4K" file +	 * $ xfs_io -c "pwrite -S 0xcd 4K 4K" +	 * +	 * So what's in the range [500, 4095] corresponds to zeroes. +	 */ +	if (datal < block_size) { +		char *map; + +		map = kmap(page); +		memset(map + datal, 0, block_size - datal); +		flush_dcache_page(page); +		kunmap(page); +	} + +	SetPageUptodate(page); +	ClearPageChecked(page); +	set_page_dirty(page); +out_unlock: +	if (page) { +		unlock_page(page); +		put_page(page); +	} +	if (ret) +		btrfs_delalloc_release_space(inode, data_reserved, file_offset, +					     block_size, true); +	btrfs_delalloc_release_extents(BTRFS_I(inode), block_size); +out: +	extent_changeset_free(data_reserved); + +	return ret; +} + +/* + * Deal with cloning of inline extents. We try to copy the inline extent from + * the source inode to destination inode when possible. When not possible we + * copy the inline extent's data into the respective page of the inode. 
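Editorial note (not part of the diff above): a worked example, with hypothetical numbers, of the end-of-file clamping done in clone_finish_inode_update(). The clone itself operates on block-aligned ranges, but the destination i_size must only grow to the length the caller asked for.

#include <linux/kernel.h>
#include <linux/types.h>

static u64 clamp_clone_eof(u64 destoff, u64 olen, u64 sectorsize)
{
	/*
	 * With sectorsize = 4096, destoff = 0 and olen = 7000, the last
	 * cloned extent ends at the block-aligned offset 8192 ...
	 */
	u64 endoff = round_up(destoff + olen, sectorsize);

	/* ... but i_size may only grow to 7000, the length requested. */
	if (endoff > destoff + olen)
		endoff = destoff + olen;

	return endoff;
}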
+ */ +static int clone_copy_inline_extent(struct inode *dst, +				    struct btrfs_path *path, +				    struct btrfs_key *new_key, +				    const u64 drop_start, +				    const u64 datal, +				    const u64 size, +				    const u8 comp_type, +				    char *inline_data, +				    struct btrfs_trans_handle **trans_out) +{ +	struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb); +	struct btrfs_root *root = BTRFS_I(dst)->root; +	const u64 aligned_end = ALIGN(new_key->offset + datal, +				      fs_info->sectorsize); +	struct btrfs_trans_handle *trans = NULL; +	int ret; +	struct btrfs_key key; + +	if (new_key->offset > 0) { +		ret = copy_inline_to_page(dst, new_key->offset, inline_data, +					  size, datal, comp_type); +		goto out; +	} + +	key.objectid = btrfs_ino(BTRFS_I(dst)); +	key.type = BTRFS_EXTENT_DATA_KEY; +	key.offset = 0; +	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +	if (ret < 0) { +		return ret; +	} else if (ret > 0) { +		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { +			ret = btrfs_next_leaf(root, path); +			if (ret < 0) +				return ret; +			else if (ret > 0) +				goto copy_inline_extent; +		} +		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); +		if (key.objectid == btrfs_ino(BTRFS_I(dst)) && +		    key.type == BTRFS_EXTENT_DATA_KEY) { +			/* +			 * There's an implicit hole at file offset 0, copy the +			 * inline extent's data to the page. +			 */ +			ASSERT(key.offset > 0); +			ret = copy_inline_to_page(dst, new_key->offset, +						  inline_data, size, datal, +						  comp_type); +			goto out; +		} +	} else if (i_size_read(dst) <= datal) { +		struct btrfs_file_extent_item *ei; + +		ei = btrfs_item_ptr(path->nodes[0], path->slots[0], +				    struct btrfs_file_extent_item); +		/* +		 * If it's an inline extent replace it with the source inline +		 * extent, otherwise copy the source inline extent data into +		 * the respective page at the destination inode. +		 */ +		if (btrfs_file_extent_type(path->nodes[0], ei) == +		    BTRFS_FILE_EXTENT_INLINE) +			goto copy_inline_extent; + +		ret = copy_inline_to_page(dst, new_key->offset, inline_data, +					  size, datal, comp_type); +		goto out; +	} + +copy_inline_extent: +	ret = 0; +	/* +	 * We have no extent items, or we have an extent at offset 0 which may +	 * or may not be inlined. All these cases are dealt the same way. +	 */ +	if (i_size_read(dst) > datal) { +		/* +		 * At the destination offset 0 we have either a hole, a regular +		 * extent or an inline extent larger then the one we want to +		 * clone. Deal with all these cases by copying the inline extent +		 * data into the respective page at the destination inode. +		 */ +		ret = copy_inline_to_page(dst, new_key->offset, inline_data, +					   size, datal, comp_type); +		goto out; +	} + +	btrfs_release_path(path); +	/* +	 * If we end up here it means were copy the inline extent into a leaf +	 * of the destination inode. We know we will drop or adjust at most one +	 * extent item in the destination root. 
+	 * +	 * 1 unit - adjusting old extent (we may have to split it) +	 * 1 unit - add new extent +	 * 1 unit - inode update +	 */ +	trans = btrfs_start_transaction(root, 3); +	if (IS_ERR(trans)) { +		ret = PTR_ERR(trans); +		trans = NULL; +		goto out; +	} +	ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1); +	if (ret) +		goto out; +	ret = btrfs_insert_empty_item(trans, root, path, new_key, size); +	if (ret) +		goto out; + +	write_extent_buffer(path->nodes[0], inline_data, +			    btrfs_item_ptr_offset(path->nodes[0], +						  path->slots[0]), +			    size); +	inode_add_bytes(dst, datal); +	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags); +out: +	if (!ret && !trans) { +		/* +		 * No transaction here means we copied the inline extent into a +		 * page of the destination inode. +		 * +		 * 1 unit to update inode item +		 */ +		trans = btrfs_start_transaction(root, 1); +		if (IS_ERR(trans)) { +			ret = PTR_ERR(trans); +			trans = NULL; +		} +	} +	if (ret && trans) { +		btrfs_abort_transaction(trans, ret); +		btrfs_end_transaction(trans); +	} +	if (!ret) +		*trans_out = trans; + +	return ret; +} + +/** + * btrfs_clone() - clone a range from inode file to another + * + * @src: Inode to clone from + * @inode: Inode to clone to + * @off: Offset within source to start clone from + * @olen: Original length, passed by user, of range to clone + * @olen_aligned: Block-aligned value of olen + * @destoff: Offset within @inode to start clone + * @no_time_update: Whether to update mtime/ctime on the target inode + */ +static int btrfs_clone(struct inode *src, struct inode *inode, +		       const u64 off, const u64 olen, const u64 olen_aligned, +		       const u64 destoff, int no_time_update) +{ +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +	struct btrfs_path *path = NULL; +	struct extent_buffer *leaf; +	struct btrfs_trans_handle *trans; +	char *buf = NULL; +	struct btrfs_key key; +	u32 nritems; +	int slot; +	int ret; +	const u64 len = olen_aligned; +	u64 last_dest_end = destoff; + +	ret = -ENOMEM; +	buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); +	if (!buf) +		return ret; + +	path = btrfs_alloc_path(); +	if (!path) { +		kvfree(buf); +		return ret; +	} + +	path->reada = READA_FORWARD; +	/* Clone data */ +	key.objectid = btrfs_ino(BTRFS_I(src)); +	key.type = BTRFS_EXTENT_DATA_KEY; +	key.offset = off; + +	while (1) { +		u64 next_key_min_offset = key.offset + 1; +		struct btrfs_file_extent_item *extent; +		int type; +		u32 size; +		struct btrfs_key new_key; +		u64 disko = 0, diskl = 0; +		u64 datao = 0, datal = 0; +		u8 comp; +		u64 drop_start; + +		/* Note the key will change type as we walk through the tree */ +		path->leave_spinning = 1; +		ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, +				0, 0); +		if (ret < 0) +			goto out; +		/* +		 * First search, if no extent item that starts at offset off was +		 * found but the previous item is an extent item, it's possible +		 * it might overlap our target range, therefore process it. 
+		 */ +		if (key.offset == off && ret > 0 && path->slots[0] > 0) { +			btrfs_item_key_to_cpu(path->nodes[0], &key, +					      path->slots[0] - 1); +			if (key.type == BTRFS_EXTENT_DATA_KEY) +				path->slots[0]--; +		} + +		nritems = btrfs_header_nritems(path->nodes[0]); +process_slot: +		if (path->slots[0] >= nritems) { +			ret = btrfs_next_leaf(BTRFS_I(src)->root, path); +			if (ret < 0) +				goto out; +			if (ret > 0) +				break; +			nritems = btrfs_header_nritems(path->nodes[0]); +		} +		leaf = path->nodes[0]; +		slot = path->slots[0]; + +		btrfs_item_key_to_cpu(leaf, &key, slot); +		if (key.type > BTRFS_EXTENT_DATA_KEY || +		    key.objectid != btrfs_ino(BTRFS_I(src))) +			break; + +		ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); + +		extent = btrfs_item_ptr(leaf, slot, +					struct btrfs_file_extent_item); +		comp = btrfs_file_extent_compression(leaf, extent); +		type = btrfs_file_extent_type(leaf, extent); +		if (type == BTRFS_FILE_EXTENT_REG || +		    type == BTRFS_FILE_EXTENT_PREALLOC) { +			disko = btrfs_file_extent_disk_bytenr(leaf, extent); +			diskl = btrfs_file_extent_disk_num_bytes(leaf, extent); +			datao = btrfs_file_extent_offset(leaf, extent); +			datal = btrfs_file_extent_num_bytes(leaf, extent); +		} else if (type == BTRFS_FILE_EXTENT_INLINE) { +			/* Take upper bound, may be compressed */ +			datal = btrfs_file_extent_ram_bytes(leaf, extent); +		} + +		/* +		 * The first search might have left us at an extent item that +		 * ends before our target range's start, can happen if we have +		 * holes and NO_HOLES feature enabled. +		 */ +		if (key.offset + datal <= off) { +			path->slots[0]++; +			goto process_slot; +		} else if (key.offset >= off + len) { +			break; +		} +		next_key_min_offset = key.offset + datal; +		size = btrfs_item_size_nr(leaf, slot); +		read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), +				   size); + +		btrfs_release_path(path); +		path->leave_spinning = 0; + +		memcpy(&new_key, &key, sizeof(new_key)); +		new_key.objectid = btrfs_ino(BTRFS_I(inode)); +		if (off <= key.offset) +			new_key.offset = key.offset + destoff - off; +		else +			new_key.offset = destoff; + +		/* +		 * Deal with a hole that doesn't have an extent item that +		 * represents it (NO_HOLES feature enabled). +		 * This hole is either in the middle of the cloning range or at +		 * the beginning (fully overlaps it or partially overlaps it). 
+		 */ +		if (new_key.offset != last_dest_end) +			drop_start = last_dest_end; +		else +			drop_start = new_key.offset; + +		if (type == BTRFS_FILE_EXTENT_REG || +		    type == BTRFS_FILE_EXTENT_PREALLOC) { +			struct btrfs_clone_extent_info clone_info; + +			/* +			 *    a  | --- range to clone ---|  b +			 * | ------------- extent ------------- | +			 */ + +			/* Subtract range b */ +			if (key.offset + datal > off + len) +				datal = off + len - key.offset; + +			/* Subtract range a */ +			if (off > key.offset) { +				datao += off - key.offset; +				datal -= off - key.offset; +			} + +			clone_info.disk_offset = disko; +			clone_info.disk_len = diskl; +			clone_info.data_offset = datao; +			clone_info.data_len = datal; +			clone_info.file_offset = new_key.offset; +			clone_info.extent_buf = buf; +			clone_info.item_size = size; +			ret = btrfs_punch_hole_range(inode, path, drop_start, +					new_key.offset + datal - 1, &clone_info, +					&trans); +			if (ret) +				goto out; +		} else if (type == BTRFS_FILE_EXTENT_INLINE) { +			/* +			 * Inline extents always have to start at file offset 0 +			 * and can never be bigger then the sector size. We can +			 * never clone only parts of an inline extent, since all +			 * reflink operations must start at a sector size aligned +			 * offset, and the length must be aligned too or end at +			 * the i_size (which implies the whole inlined data). +			 */ +			ASSERT(key.offset == 0); +			ASSERT(datal <= fs_info->sectorsize); +			if (key.offset != 0 || datal > fs_info->sectorsize) +				return -EUCLEAN; + +			ret = clone_copy_inline_extent(inode, path, &new_key, +						       drop_start, datal, size, +						       comp, buf, &trans); +			if (ret) +				goto out; +		} + +		btrfs_release_path(path); + +		last_dest_end = ALIGN(new_key.offset + datal, +				      fs_info->sectorsize); +		ret = clone_finish_inode_update(trans, inode, last_dest_end, +						destoff, olen, no_time_update); +		if (ret) +			goto out; +		if (new_key.offset + datal >= destoff + len) +			break; + +		btrfs_release_path(path); +		key.offset = next_key_min_offset; + +		if (fatal_signal_pending(current)) { +			ret = -EINTR; +			goto out; +		} +	} +	ret = 0; + +	if (last_dest_end < destoff + len) { +		/* +		 * We have an implicit hole that fully or partially overlaps our +		 * cloning range at its end. This means that we either have the +		 * NO_HOLES feature enabled or the implicit hole happened due to +		 * mixing buffered and direct IO writes against this file. 
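Editorial note (not part of the diff above): a worked example, with hypothetical numbers, of the clipping done just above for regular and preallocated extents. Suppose the source extent item sits at key.offset = 0 with datao = 0 and datal = 16384, and the request clones off = 4096, len = 8192 into destoff = 1048576. Because off > key.offset, the destination item is keyed at destoff = 1048576. "Subtract range b" trims datal to off + len - key.offset = 12288, and "Subtract range a" then advances datao to 4096 and reduces datal to 8192, so the destination file extent item references the same disk extent but covers exactly the middle 8 KiB that was requested.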
+		 */ +		btrfs_release_path(path); +		path->leave_spinning = 0; + +		ret = btrfs_punch_hole_range(inode, path, last_dest_end, +				destoff + len - 1, NULL, &trans); +		if (ret) +			goto out; + +		ret = clone_finish_inode_update(trans, inode, destoff + len, +						destoff, olen, no_time_update); +	} + +out: +	btrfs_free_path(path); +	kvfree(buf); +	return ret; +} + +static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, +				       struct inode *inode2, u64 loff2, u64 len) +{ +	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); +	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); +} + +static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, +				     struct inode *inode2, u64 loff2, u64 len) +{ +	if (inode1 < inode2) { +		swap(inode1, inode2); +		swap(loff1, loff2); +	} else if (inode1 == inode2 && loff2 < loff1) { +		swap(loff1, loff2); +	} +	lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); +	lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); +} + +static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, +				   struct inode *dst, u64 dst_loff) +{ +	const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; +	int ret; + +	/* +	 * Lock destination range to serialize with concurrent readpages() and +	 * source range to serialize with relocation. +	 */ +	btrfs_double_extent_lock(src, loff, dst, dst_loff, len); +	ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); +	btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + +	return ret; +} + +static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, +			     struct inode *dst, u64 dst_loff) +{ +	int ret; +	u64 i, tail_len, chunk_count; +	struct btrfs_root *root_dst = BTRFS_I(dst)->root; + +	spin_lock(&root_dst->root_item_lock); +	if (root_dst->send_in_progress) { +		btrfs_warn_rl(root_dst->fs_info, +"cannot deduplicate to root %llu while send operations are using it (%d in progress)", +			      root_dst->root_key.objectid, +			      root_dst->send_in_progress); +		spin_unlock(&root_dst->root_item_lock); +		return -EAGAIN; +	} +	root_dst->dedupe_in_progress++; +	spin_unlock(&root_dst->root_item_lock); + +	tail_len = olen % BTRFS_MAX_DEDUPE_LEN; +	chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); + +	for (i = 0; i < chunk_count; i++) { +		ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, +					      dst, dst_loff); +		if (ret) +			goto out; + +		loff += BTRFS_MAX_DEDUPE_LEN; +		dst_loff += BTRFS_MAX_DEDUPE_LEN; +	} + +	if (tail_len > 0) +		ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff); +out: +	spin_lock(&root_dst->root_item_lock); +	root_dst->dedupe_in_progress--; +	spin_unlock(&root_dst->root_item_lock); + +	return ret; +} + +static noinline int btrfs_clone_files(struct file *file, struct file *file_src, +					u64 off, u64 olen, u64 destoff) +{ +	struct inode *inode = file_inode(file); +	struct inode *src = file_inode(file_src); +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +	int ret; +	int wb_ret; +	u64 len = olen; +	u64 bs = fs_info->sb->s_blocksize; + +	/* +	 * VFS's generic_remap_file_range_prep() protects us from cloning the +	 * eof block into the middle of a file, which would result in corruption +	 * if the file size is not blocksize aligned. So we don't need to check +	 * for that case here. 
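Editorial note (not part of the diff above): btrfs_extent_same() just above splits a deduplication request into BTRFS_MAX_DEDUPE_LEN (16 MiB) pieces so that each btrfs_extent_same_range() call only locks and clones a bounded range. As a worked example with a hypothetical olen of 40 MiB (41943040 bytes): chunk_count = 2 and tail_len = 8 MiB, so the loop performs two 16 MiB passes, advancing loff and dst_loff by 16 MiB after each, and a final pass handles the remaining 8 MiB tail. The dedupe_in_progress counter bumped around the whole operation is the counterpart of the send_in_progress check above, so that send and deduplication refuse to run concurrently on the same root.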
+	 */ +	if (off + len == src->i_size) +		len = ALIGN(src->i_size, bs) - off; + +	if (destoff > inode->i_size) { +		const u64 wb_start = ALIGN_DOWN(inode->i_size, bs); + +		ret = btrfs_cont_expand(inode, inode->i_size, destoff); +		if (ret) +			return ret; +		/* +		 * We may have truncated the last block if the inode's size is +		 * not sector size aligned, so we need to wait for writeback to +		 * complete before proceeding further, otherwise we can race +		 * with cloning and attempt to increment a reference to an +		 * extent that no longer exists (writeback completed right after +		 * we found the previous extent covering eof and before we +		 * attempted to increment its reference count). +		 */ +		ret = btrfs_wait_ordered_range(inode, wb_start, +					       destoff - wb_start); +		if (ret) +			return ret; +	} + +	/* +	 * Lock destination range to serialize with concurrent readpages() and +	 * source range to serialize with relocation. +	 */ +	btrfs_double_extent_lock(src, off, inode, destoff, len); +	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); +	btrfs_double_extent_unlock(src, off, inode, destoff, len); + +	/* +	 * We may have copied an inline extent into a page of the destination +	 * range, so wait for writeback to complete before truncating pages +	 * from the page cache. This is a rare case. +	 */ +	wb_ret = btrfs_wait_ordered_range(inode, destoff, len); +	ret = ret ? ret : wb_ret; +	/* +	 * Truncate page cache pages so that future reads will see the cloned +	 * data immediately and not the previous data. +	 */ +	truncate_inode_pages_range(&inode->i_data, +				round_down(destoff, PAGE_SIZE), +				round_up(destoff + len, PAGE_SIZE) - 1); + +	return ret; +} + +static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, +				       struct file *file_out, loff_t pos_out, +				       loff_t *len, unsigned int remap_flags) +{ +	struct inode *inode_in = file_inode(file_in); +	struct inode *inode_out = file_inode(file_out); +	u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; +	bool same_inode = inode_out == inode_in; +	u64 wb_len; +	int ret; + +	if (!(remap_flags & REMAP_FILE_DEDUP)) { +		struct btrfs_root *root_out = BTRFS_I(inode_out)->root; + +		if (btrfs_root_readonly(root_out)) +			return -EROFS; + +		if (file_in->f_path.mnt != file_out->f_path.mnt || +		    inode_in->i_sb != inode_out->i_sb) +			return -EXDEV; +	} + +	/* Don't make the dst file partly checksummed */ +	if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != +	    (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { +		return -EINVAL; +	} + +	/* +	 * Now that the inodes are locked, we need to start writeback ourselves +	 * and can not rely on the writeback from the VFS's generic helper +	 * generic_remap_file_range_prep() because: +	 * +	 * 1) For compression we must call filemap_fdatawrite_range() range +	 *    twice (btrfs_fdatawrite_range() does it for us), and the generic +	 *    helper only calls it once; +	 * +	 * 2) filemap_fdatawrite_range(), called by the generic helper only +	 *    waits for the writeback to complete, i.e. for IO to be done, and +	 *    not for the ordered extents to complete. We need to wait for them +	 *    to complete so that new file extent items are in the fs tree. 
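Editorial note (not part of the diff above): btrfs_double_extent_lock(), used by btrfs_clone_files() above, avoids ABBA deadlocks between two reflinks running in opposite directions by always taking the two extent range locks in a fixed order (by inode address, and by offset when both ranges belong to the same inode). A minimal sketch of the same ordering idea on plain mutexes, with hypothetical names:

#include <linux/kernel.h>
#include <linux/mutex.h>

/* Lock two objects in a stable order so A->B and B->A callers cannot deadlock. */
static void lock_pair(struct mutex *a, struct mutex *b)
{
	if (a == b) {
		mutex_lock(a);			/* same object, take it once */
		return;
	}
	if (a > b)
		swap(a, b);			/* order by address */
	mutex_lock(a);
	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
}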
+	 */ +	if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) +		wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); +	else +		wb_len = ALIGN(*len, bs); + +	/* +	 * Since we don't lock ranges, wait for ongoing lockless dio writes (as +	 * any in progress could create its ordered extents after we wait for +	 * existing ordered extents below). +	 */ +	inode_dio_wait(inode_in); +	if (!same_inode) +		inode_dio_wait(inode_out); + +	/* +	 * Workaround to make sure NOCOW buffered write reach disk as NOCOW. +	 * +	 * Btrfs' back references do not have a block level granularity, they +	 * work at the whole extent level. +	 * NOCOW buffered write without data space reserved may not be able +	 * to fall back to CoW due to lack of data space, thus could cause +	 * data loss. +	 * +	 * Here we take a shortcut by flushing the whole inode, so that all +	 * nocow write should reach disk as nocow before we increase the +	 * reference of the extent. We could do better by only flushing NOCOW +	 * data, but that needs extra accounting. +	 * +	 * Also we don't need to check ASYNC_EXTENT, as async extent will be +	 * CoWed anyway, not affecting nocow part. +	 */ +	ret = filemap_flush(inode_in->i_mapping); +	if (ret < 0) +		return ret; + +	ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), +				       wb_len); +	if (ret < 0) +		return ret; +	ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), +				       wb_len); +	if (ret < 0) +		return ret; + +	return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, +					    len, remap_flags); +} + +loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, +		struct file *dst_file, loff_t destoff, loff_t len, +		unsigned int remap_flags) +{ +	struct inode *src_inode = file_inode(src_file); +	struct inode *dst_inode = file_inode(dst_file); +	bool same_inode = dst_inode == src_inode; +	int ret; + +	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) +		return -EINVAL; + +	if (same_inode) +		inode_lock(src_inode); +	else +		lock_two_nondirectories(src_inode, dst_inode); + +	ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, +					  &len, remap_flags); +	if (ret < 0 || len == 0) +		goto out_unlock; + +	if (remap_flags & REMAP_FILE_DEDUP) +		ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); +	else +		ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); + +out_unlock: +	if (same_inode) +		inode_unlock(src_inode); +	else +		unlock_two_nondirectories(src_inode, dst_inode); + +	return ret < 0 ? 
ret : len; +} diff --git a/fs/btrfs/reflink.h b/fs/btrfs/reflink.h new file mode 100644 index 000000000000..ecb309b4dad0 --- /dev/null +++ b/fs/btrfs/reflink.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_REFLINK_H +#define BTRFS_REFLINK_H + +#include <linux/fs.h> + +loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, +			      struct file *file_out, loff_t pos_out, +			      loff_t len, unsigned int remap_flags); + +#endif /* BTRFS_REFLINK_H */ diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 995d4b8b1cfd..f65595602aa8 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -9,6 +9,7 @@  #include <linux/blkdev.h>  #include <linux/rbtree.h>  #include <linux/slab.h> +#include <linux/error-injection.h>  #include "ctree.h"  #include "disk-io.h"  #include "transaction.h" @@ -22,6 +23,54 @@  #include "print-tree.h"  #include "delalloc-space.h"  #include "block-group.h" +#include "backref.h" + +/* + * Relocation overview + * + * [What does relocation do] + * + * The objective of relocation is to relocate all extents of the target block + * group to other block groups. + * This is utilized by resize (shrink only), profile converting, compacting + * space, or balance routine to spread chunks over devices. + * + * 		Before		|		After + * ------------------------------------------------------------------ + *  BG A: 10 data extents	| BG A: deleted + *  BG B:  2 data extents	| BG B: 10 data extents (2 old + 8 relocated) + *  BG C:  1 extents		| BG C:  3 data extents (1 old + 2 relocated) + * + * [How does relocation work] + * + * 1.   Mark the target block group read-only + *      New extents won't be allocated from the target block group. + * + * 2.1  Record each extent in the target block group + *      To build a proper map of extents to be relocated. + * + * 2.2  Build data reloc tree and reloc trees + *      Data reloc tree will contain an inode, recording all newly relocated + *      data extents. + *      There will be only one data reloc tree for one data block group. + * + *      Reloc tree will be a special snapshot of its source tree, containing + *      relocated tree blocks. + *      Each tree referring to a tree block in target block group will get its + *      reloc tree built. + * + * 2.3  Swap source tree with its corresponding reloc tree + *      Each involved tree only refers to new extents after swap. + * + * 3.   Cleanup reloc trees and data reloc tree. + *      As old extents in the target block group are still referenced by reloc + *      trees, we need to clean them up before really freeing the target block + *      group. + * + * The main complexity is in steps 2.2 and 2.3. + * + * The entry point of relocation is relocate_block_group() function. 
+ */  /*   * backref_node, mapping_node and tree_block start with this @@ -256,6 +305,7 @@ static void free_backref_node(struct backref_cache *cache,  {  	if (node) {  		cache->nr_nodes--; +		btrfs_put_root(node->root);  		kfree(node);  	}  } @@ -589,22 +639,7 @@ static struct btrfs_root *find_reloc_root(struct reloc_control *rc,  		root = (struct btrfs_root *)node->data;  	}  	spin_unlock(&rc->reloc_root_tree.lock); -	return root; -} - -static int is_cowonly_root(u64 root_objectid) -{ -	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || -	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID || -	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID || -	    root_objectid == BTRFS_DEV_TREE_OBJECTID || -	    root_objectid == BTRFS_TREE_LOG_OBJECTID || -	    root_objectid == BTRFS_CSUM_TREE_OBJECTID || -	    root_objectid == BTRFS_UUID_TREE_OBJECTID || -	    root_objectid == BTRFS_QUOTA_TREE_OBJECTID || -	    root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) -		return 1; -	return 0; +	return btrfs_grab_root(root);  }  static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, @@ -614,10 +649,7 @@ static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,  	key.objectid = root_objectid;  	key.type = BTRFS_ROOT_ITEM_KEY; -	if (is_cowonly_root(root_objectid)) -		key.offset = 0; -	else -		key.offset = (u64)-1; +	key.offset = (u64)-1;  	return btrfs_get_fs_root(fs_info, &key, false);  } @@ -711,8 +743,6 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,  		err = -ENOMEM;  		goto out;  	} -	path1->reada = READA_FORWARD; -	path2->reada = READA_FORWARD;  	node = alloc_backref_node(cache);  	if (!node) { @@ -899,10 +929,12 @@ again:  			/* tree root */  			ASSERT(btrfs_root_bytenr(&root->root_item) ==  			       cur->bytenr); -			if (should_ignore_root(root)) +			if (should_ignore_root(root)) { +				btrfs_put_root(root);  				list_add(&cur->list, &useless); -			else +			} else {  				cur->root = root; +			}  			break;  		} @@ -915,6 +947,7 @@ again:  		ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);  		path2->lowest_level = 0;  		if (ret < 0) { +			btrfs_put_root(root);  			err = ret;  			goto out;  		} @@ -930,6 +963,7 @@ again:  				  root->root_key.objectid,  				  node_key->objectid, node_key->type,  				  node_key->offset); +			btrfs_put_root(root);  			err = -ENOENT;  			goto out;  		} @@ -941,15 +975,18 @@ again:  			if (!path2->nodes[level]) {  				ASSERT(btrfs_root_bytenr(&root->root_item) ==  				       lower->bytenr); -				if (should_ignore_root(root)) +				if (should_ignore_root(root)) { +					btrfs_put_root(root);  					list_add(&lower->list, &useless); -				else +				} else {  					lower->root = root; +				}  				break;  			}  			edge = alloc_backref_edge(cache);  			if (!edge) { +				btrfs_put_root(root);  				err = -ENOMEM;  				goto out;  			} @@ -959,6 +996,7 @@ again:  			if (!rb_node) {  				upper = alloc_backref_node(cache);  				if (!upper) { +					btrfs_put_root(root);  					free_backref_edge(cache, edge);  					err = -ENOMEM;  					goto out; @@ -1006,8 +1044,10 @@ again:  			edge->node[LOWER] = lower;  			edge->node[UPPER] = upper; -			if (rb_node) +			if (rb_node) { +				btrfs_put_root(root);  				break; +			}  			lower = upper;  			upper = NULL;  		} @@ -1186,7 +1226,7 @@ out:  			free_backref_node(cache, lower);  		} -		free_backref_node(cache, node); +		remove_backref_node(cache, node);  		return ERR_PTR(err);  	}  	ASSERT(!node || !node->detached); @@ -1244,7 +1284,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,  	
new_node->level = node->level;  	new_node->lowest = node->lowest;  	new_node->checked = 1; -	new_node->root = dest; +	new_node->root = btrfs_grab_root(dest); +	ASSERT(new_node->root);  	if (!node->lowest) {  		list_for_each_entry(edge, &node->lower, list[UPPER]) { @@ -1298,7 +1339,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)  	if (!node)  		return -ENOMEM; -	node->bytenr = root->node->start; +	node->bytenr = root->commit_root->start;  	node->data = root;  	spin_lock(&rc->reloc_root_tree.lock); @@ -1325,14 +1366,16 @@ static void __del_reloc_root(struct btrfs_root *root)  	struct rb_node *rb_node;  	struct mapping_node *node = NULL;  	struct reloc_control *rc = fs_info->reloc_ctl; +	bool put_ref = false;  	if (rc && root->node) {  		spin_lock(&rc->reloc_root_tree.lock);  		rb_node = tree_search(&rc->reloc_root_tree.rb_root, -				      root->node->start); +				      root->commit_root->start);  		if (rb_node) {  			node = rb_entry(rb_node, struct mapping_node, rb_node);  			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); +			RB_CLEAR_NODE(&node->rb_node);  		}  		spin_unlock(&rc->reloc_root_tree.lock);  		if (!node) @@ -1340,9 +1383,22 @@ static void __del_reloc_root(struct btrfs_root *root)  		BUG_ON((struct btrfs_root *)node->data != root);  	} +	/* +	 * We only put the reloc root here if it's on the list.  There's a lot +	 * of places where the pattern is to splice the rc->reloc_roots, process +	 * the reloc roots, and then add the reloc root back onto +	 * rc->reloc_roots.  If we call __del_reloc_root while it's off of the +	 * list we don't want the reference being dropped, because the guy +	 * messing with the list is in charge of the reference. +	 */  	spin_lock(&fs_info->trans_lock); -	list_del_init(&root->root_list); +	if (!list_empty(&root->root_list)) { +		put_ref = true; +		list_del_init(&root->root_list); +	}  	spin_unlock(&fs_info->trans_lock); +	if (put_ref) +		btrfs_put_root(root);  	kfree(node);  } @@ -1350,7 +1406,7 @@ static void __del_reloc_root(struct btrfs_root *root)   * helper to update the 'address of tree root -> reloc tree'   * mapping   */ -static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) +static int __update_reloc_root(struct btrfs_root *root)  {  	struct btrfs_fs_info *fs_info = root->fs_info;  	struct rb_node *rb_node; @@ -1359,7 +1415,7 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)  	spin_lock(&rc->reloc_root_tree.lock);  	rb_node = tree_search(&rc->reloc_root_tree.rb_root, -			      root->node->start); +			      root->commit_root->start);  	if (rb_node) {  		node = rb_entry(rb_node, struct mapping_node, rb_node);  		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); @@ -1371,7 +1427,7 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)  	BUG_ON((struct btrfs_root *)node->data != root);  	spin_lock(&rc->reloc_root_tree.lock); -	node->bytenr = new_bytenr; +	node->bytenr = root->node->start;  	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,  			      node->bytenr, &node->rb_node);  	spin_unlock(&rc->reloc_root_tree.lock); @@ -1447,8 +1503,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,  	BUG_ON(ret);  	kfree(root_item); -	reloc_root = btrfs_read_fs_root(fs_info->tree_root, &root_key); +	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);  	BUG_ON(IS_ERR(reloc_root)); +	set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state);  	reloc_root->last_trans = trans->transid;  	return reloc_root;  } @@ -1456,6 
+1513,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,  /*   * create reloc tree for a given fs tree. reloc tree is just a   * snapshot of the fs tree with special root objectid. + * + * The reloc_root comes out of here with two references, one for + * root->reloc_root, and another for being on the rc->reloc_roots list.   */  int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,  			  struct btrfs_root *root) @@ -1467,6 +1527,10 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,  	int clear_rsv = 0;  	int ret; +	if (!rc || !rc->create_reloc_tree || +	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) +		return 0; +  	/*  	 * The subvolume has reloc tree but the swap is finished, no need to  	 * create/update the dead reloc tree @@ -1480,10 +1544,6 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,  		return 0;  	} -	if (!rc || !rc->create_reloc_tree || -	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) -		return 0; -  	if (!trans->reloc_reserved) {  		rsv = trans->block_rsv;  		trans->block_rsv = rc->block_rsv; @@ -1495,7 +1555,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,  	ret = __add_reloc_root(reloc_root);  	BUG_ON(ret < 0); -	root->reloc_root = reloc_root; +	root->reloc_root = btrfs_grab_root(reloc_root);  	return 0;  } @@ -1516,6 +1576,13 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,  	reloc_root = root->reloc_root;  	root_item = &reloc_root->root_item; +	/* +	 * We are probably ok here, but __del_reloc_root() will drop its ref of +	 * the root.  We have the ref for root->reloc_root, but just in case +	 * hold it while we update the reloc root. +	 */ +	btrfs_grab_root(reloc_root); +  	/* root->reloc_root will stay until current relocation finished */  	if (fs_info->reloc_ctl->merge_reloc_tree &&  	    btrfs_root_refs(root_item) == 0) { @@ -1529,6 +1596,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,  	}  	if (reloc_root->commit_root != reloc_root->node) { +		__update_reloc_root(reloc_root);  		btrfs_set_root_node(root_item, reloc_root->node);  		free_extent_buffer(reloc_root->commit_root);  		reloc_root->commit_root = btrfs_root_node(reloc_root); @@ -1537,7 +1605,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,  	ret = btrfs_update_root(trans, fs_info->tree_root,  				&reloc_root->root_key, root_item);  	BUG_ON(ret); - +	btrfs_put_root(reloc_root);  out:  	return 0;  } @@ -2211,7 +2279,7 @@ static void insert_dirty_subvol(struct btrfs_trans_handle *trans,  	btrfs_update_reloc_root(trans, root);  	if (list_empty(&root->reloc_dirty_list)) { -		btrfs_grab_fs_root(root); +		btrfs_grab_root(root);  		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);  	}  } @@ -2231,24 +2299,34 @@ static int clean_dirty_subvols(struct reloc_control *rc)  			list_del_init(&root->reloc_dirty_list);  			root->reloc_root = NULL; -			if (reloc_root) { - -				ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1); -				if (ret2 < 0 && !ret) -					ret = ret2; -			}  			/*  			 * Need barrier to ensure clear_bit() only happens after  			 * root->reloc_root = NULL. Pairs with have_reloc_root.  			 */  			smp_wmb();  			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); -			btrfs_put_fs_root(root); +			if (reloc_root) { +				/* +				 * btrfs_drop_snapshot drops our ref we hold for +				 * ->reloc_root.  If it fails however we must +				 * drop the ref ourselves. 
+				 */ +				ret2 = btrfs_drop_snapshot(reloc_root, 0, 1); +				if (ret2 < 0) { +					btrfs_put_root(reloc_root); +					if (!ret) +						ret = ret2; +				} +			} +			btrfs_put_root(root);  		} else {  			/* Orphan reloc tree, just clean it up */ -			ret2 = btrfs_drop_snapshot(root, NULL, 0, 1); -			if (ret2 < 0 && !ret) -				ret = ret2; +			ret2 = btrfs_drop_snapshot(root, 0, 1); +			if (ret2 < 0) { +				btrfs_put_root(root); +				if (!ret) +					ret = ret2; +			}  		}  	}  	return ret; @@ -2325,6 +2403,18 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,  			trans = NULL;  			goto out;  		} + +		/* +		 * At this point we no longer have a reloc_control, so we can't +		 * depend on btrfs_init_reloc_root to update our last_trans. +		 * +		 * But that's ok, we started the trans handle on our +		 * corresponding fs_root, which means it's been added to the +		 * dirty list.  At commit time we'll still call +		 * btrfs_update_reloc_root() and update our root item +		 * appropriately. +		 */ +		reloc_root->last_trans = trans->transid;  		trans->block_rsv = rc->block_rsv;  		replaced = 0; @@ -2435,7 +2525,7 @@ again:  	if (IS_ERR(trans)) {  		if (!err)  			btrfs_block_rsv_release(fs_info, rc->block_rsv, -						num_bytes); +						num_bytes, NULL);  		return PTR_ERR(trans);  	} @@ -2443,7 +2533,7 @@ again:  		if (num_bytes != rc->merging_rsv_size) {  			btrfs_end_transaction(trans);  			btrfs_block_rsv_release(fs_info, rc->block_rsv, -						num_bytes); +						num_bytes, NULL);  			goto again;  		}  	} @@ -2468,6 +2558,7 @@ again:  		btrfs_update_reloc_root(trans, root);  		list_add(&reloc_root->root_list, &reloc_roots); +		btrfs_put_root(root);  	}  	list_splice(&reloc_roots, &rc->reloc_roots); @@ -2488,10 +2579,6 @@ void free_reloc_roots(struct list_head *list)  		reloc_root = list_entry(list->next, struct btrfs_root,  					root_list);  		__del_reloc_root(reloc_root); -		free_extent_buffer(reloc_root->node); -		free_extent_buffer(reloc_root->commit_root); -		reloc_root->node = NULL; -		reloc_root->commit_root = NULL;  	}  } @@ -2529,6 +2616,7 @@ again:  			BUG_ON(root->reloc_root != reloc_root);  			ret = merge_reloc_root(rc, root); +			btrfs_put_root(root);  			if (ret) {  				if (list_empty(&reloc_root->root_list))  					list_add_tail(&reloc_root->root_list, @@ -2561,7 +2649,21 @@ out:  			free_reloc_roots(&reloc_roots);  	} -	BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); +	/* +	 * We used to have +	 * +	 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); +	 * +	 * here, but it's wrong.  If we fail to start the transaction in +	 * prepare_to_merge() we will have only 0 ref reloc roots, none of which +	 * have actually been removed from the reloc_root_tree rb tree.  This is +	 * fine because we're bailing here, and we hold a reference on the root +	 * for the list that holds it, so these roots will be cleaned up when we +	 * do the reloc_dirty_list afterwards.  Meanwhile the root->reloc_root +	 * will be cleaned up on unmount. +	 * +	 * The remaining nodes will be cleaned up by free_reloc_control. 
+	 */  }  static void free_block_list(struct rb_root *blocks) @@ -2580,6 +2682,7 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,  {  	struct btrfs_fs_info *fs_info = reloc_root->fs_info;  	struct btrfs_root *root; +	int ret;  	if (reloc_root->last_trans == trans->transid)  		return 0; @@ -2587,8 +2690,10 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,  	root = read_fs_root(fs_info, reloc_root->root_key.offset);  	BUG_ON(IS_ERR(root));  	BUG_ON(root->reloc_root != reloc_root); +	ret = btrfs_record_root_in_trans(trans, root); +	btrfs_put_root(root); -	return btrfs_record_root_in_trans(trans, root); +	return ret;  }  static noinline_for_stack @@ -2621,7 +2726,9 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,  			BUG_ON(next->new_bytenr);  			BUG_ON(!list_empty(&next->list));  			next->new_bytenr = root->node->start; -			next->root = root; +			btrfs_put_root(next->root); +			next->root = btrfs_grab_root(root); +			ASSERT(next->root);  			list_add_tail(&next->list,  				      &rc->backref_cache.changed);  			__mark_block_processed(rc, next); @@ -3040,7 +3147,6 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,  {  	struct extent_buffer *eb; -	BUG_ON(block->key_ready);  	eb = read_tree_block(fs_info, block->bytenr, block->key.offset,  			     block->level, NULL);  	if (IS_ERR(eb)) { @@ -3073,6 +3179,14 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,  	if (!node)  		return 0; +	/* +	 * If we fail here we want to drop our backref_node because we are going +	 * to start over and regenerate the tree for it. +	 */ +	ret = reserve_metadata_space(trans, rc, node); +	if (ret) +		goto out; +  	BUG_ON(node->processed);  	root = select_one_root(node);  	if (root == ERR_PTR(-ENOENT)) { @@ -3080,12 +3194,6 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,  		goto out;  	} -	if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { -		ret = reserve_metadata_space(trans, rc, node); -		if (ret) -			goto out; -	} -  	if (root) {  		if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {  			BUG_ON(node->new_bytenr); @@ -3093,7 +3201,9 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,  			btrfs_record_root_in_trans(trans, root);  			root = root->reloc_root;  			node->new_bytenr = root->node->start; -			node->root = root; +			btrfs_put_root(node->root); +			node->root = btrfs_grab_root(root); +			ASSERT(node->root);  			list_add_tail(&node->list, &rc->backref_cache.changed);  		} else {  			path->lowest_level = node->level; @@ -3161,9 +3271,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,  		ret = relocate_tree_block(trans, rc, node, &block->key,  					  path);  		if (ret < 0) { -			if (ret != -EAGAIN || &block->rb_node == rb_first(blocks)) -				err = ret; -			goto out; +			err = ret; +			break;  		}  	}  out: @@ -3264,6 +3373,15 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,  	return ret;  } +/* + * Allow error injection to test balance cancellation + */ +int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) +{ +	return atomic_read(&fs_info->balance_cancel_req); +} +ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); +  static int relocate_file_extent_cluster(struct inode *inode,  					struct file_extent_cluster *cluster)  { @@ -3385,6 +3503,10 @@ static int relocate_file_extent_cluster(struct inode *inode,  		btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);  		
balance_dirty_pages_ratelimited(inode->i_mapping);  		btrfs_throttle(fs_info); +		if (btrfs_should_cancel_balance(fs_info)) { +			ret = -ECANCELED; +			goto out; +		}  	}  	WARN_ON(nr != cluster->nr);  out: @@ -3556,31 +3678,6 @@ out:  	return ret;  } -/* - * helper to check if the block use full backrefs for pointers in it - */ -static int block_use_full_backref(struct reloc_control *rc, -				  struct extent_buffer *eb) -{ -	u64 flags; -	int ret; - -	if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) || -	    btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) -		return 1; - -	ret = btrfs_lookup_extent_info(NULL, rc->extent_root->fs_info, -				       eb->start, btrfs_header_level(eb), 1, -				       NULL, &flags); -	BUG_ON(ret); - -	if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) -		ret = 1; -	else -		ret = 0; -	return ret; -} -  static int delete_block_group_cache(struct btrfs_fs_info *fs_info,  				    struct btrfs_block_group *block_group,  				    struct inode *inode, @@ -3624,172 +3721,40 @@ out:  }  /* - * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY - * this function scans fs tree to find blocks reference the data extent + * Locate the free space cache EXTENT_DATA in root tree leaf and delete the + * cache inode, to avoid free space cache data extent blocking data relocation.   */ -static int find_data_references(struct reloc_control *rc, -				struct btrfs_key *extent_key, -				struct extent_buffer *leaf, -				struct btrfs_extent_data_ref *ref, -				struct rb_root *blocks) +static int delete_v1_space_cache(struct extent_buffer *leaf, +				 struct btrfs_block_group *block_group, +				 u64 data_bytenr)  { -	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; -	struct btrfs_path *path; -	struct tree_block *block; -	struct btrfs_root *root; -	struct btrfs_file_extent_item *fi; -	struct rb_node *rb_node; +	u64 space_cache_ino; +	struct btrfs_file_extent_item *ei;  	struct btrfs_key key; -	u64 ref_root; -	u64 ref_objectid; -	u64 ref_offset; -	u32 ref_count; -	u32 nritems; -	int err = 0; -	int added = 0; -	int counted; +	bool found = false; +	int i;  	int ret; -	ref_root = btrfs_extent_data_ref_root(leaf, ref); -	ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); -	ref_offset = btrfs_extent_data_ref_offset(leaf, ref); -	ref_count = btrfs_extent_data_ref_count(leaf, ref); - -	/* -	 * This is an extent belonging to the free space cache, lets just delete -	 * it and redo the search. 
-	 */ -	if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { -		ret = delete_block_group_cache(fs_info, rc->block_group, -					       NULL, ref_objectid); -		if (ret != -ENOENT) -			return ret; -		ret = 0; -	} - -	path = btrfs_alloc_path(); -	if (!path) -		return -ENOMEM; -	path->reada = READA_FORWARD; - -	root = read_fs_root(fs_info, ref_root); -	if (IS_ERR(root)) { -		err = PTR_ERR(root); -		goto out; -	} - -	key.objectid = ref_objectid; -	key.type = BTRFS_EXTENT_DATA_KEY; -	if (ref_offset > ((u64)-1 << 32)) -		key.offset = 0; -	else -		key.offset = ref_offset; - -	path->search_commit_root = 1; -	path->skip_locking = 1; -	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); -	if (ret < 0) { -		err = ret; -		goto out; -	} - -	leaf = path->nodes[0]; -	nritems = btrfs_header_nritems(leaf); -	/* -	 * the references in tree blocks that use full backrefs -	 * are not counted in -	 */ -	if (block_use_full_backref(rc, leaf)) -		counted = 0; -	else -		counted = 1; -	rb_node = tree_search(blocks, leaf->start); -	if (rb_node) { -		if (counted) -			added = 1; -		else -			path->slots[0] = nritems; -	} - -	while (ref_count > 0) { -		while (path->slots[0] >= nritems) { -			ret = btrfs_next_leaf(root, path); -			if (ret < 0) { -				err = ret; -				goto out; -			} -			if (WARN_ON(ret > 0)) -				goto out; - -			leaf = path->nodes[0]; -			nritems = btrfs_header_nritems(leaf); -			added = 0; - -			if (block_use_full_backref(rc, leaf)) -				counted = 0; -			else -				counted = 1; -			rb_node = tree_search(blocks, leaf->start); -			if (rb_node) { -				if (counted) -					added = 1; -				else -					path->slots[0] = nritems; -			} -		} +	if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) +		return 0; -		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); -		if (WARN_ON(key.objectid != ref_objectid || -		    key.type != BTRFS_EXTENT_DATA_KEY)) +	for (i = 0; i < btrfs_header_nritems(leaf); i++) { +		btrfs_item_key_to_cpu(leaf, &key, i); +		if (key.type != BTRFS_EXTENT_DATA_KEY) +			continue; +		ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); +		if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_REG && +		    btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) { +			found = true; +			space_cache_ino = key.objectid;  			break; - -		fi = btrfs_item_ptr(leaf, path->slots[0], -				    struct btrfs_file_extent_item); - -		if (btrfs_file_extent_type(leaf, fi) == -		    BTRFS_FILE_EXTENT_INLINE) -			goto next; - -		if (btrfs_file_extent_disk_bytenr(leaf, fi) != -		    extent_key->objectid) -			goto next; - -		key.offset -= btrfs_file_extent_offset(leaf, fi); -		if (key.offset != ref_offset) -			goto next; - -		if (counted) -			ref_count--; -		if (added) -			goto next; - -		if (!tree_block_processed(leaf->start, rc)) { -			block = kmalloc(sizeof(*block), GFP_NOFS); -			if (!block) { -				err = -ENOMEM; -				break; -			} -			block->bytenr = leaf->start; -			btrfs_item_key_to_cpu(leaf, &block->key, 0); -			block->level = 0; -			block->key_ready = 1; -			rb_node = tree_insert(blocks, block->bytenr, -					      &block->rb_node); -			if (rb_node) -				backref_tree_panic(rb_node, -EEXIST, -						   block->bytenr);  		} -		if (counted) -			added = 1; -		else -			path->slots[0] = nritems; -next: -		path->slots[0]++; -  	} -out: -	btrfs_free_path(path); -	return err; +	if (!found) +		return -ENOENT; +	ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, +					space_cache_ino); +	return ret;  }  /* @@ -3801,91 +3766,41 @@ int add_data_references(struct reloc_control *rc,  			struct btrfs_path 
*path,  			struct rb_root *blocks)  { -	struct btrfs_key key; -	struct extent_buffer *eb; -	struct btrfs_extent_data_ref *dref; -	struct btrfs_extent_inline_ref *iref; -	unsigned long ptr; -	unsigned long end; -	u32 blocksize = rc->extent_root->fs_info->nodesize; +	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; +	struct ulist *leaves = NULL; +	struct ulist_iterator leaf_uiter; +	struct ulist_node *ref_node = NULL; +	const u32 blocksize = fs_info->nodesize;  	int ret = 0; -	int err = 0; - -	eb = path->nodes[0]; -	ptr = btrfs_item_ptr_offset(eb, path->slots[0]); -	end = ptr + btrfs_item_size_nr(eb, path->slots[0]); -	ptr += sizeof(struct btrfs_extent_item); -	while (ptr < end) { -		iref = (struct btrfs_extent_inline_ref *)ptr; -		key.type = btrfs_get_extent_inline_ref_type(eb, iref, -							BTRFS_REF_TYPE_DATA); -		if (key.type == BTRFS_SHARED_DATA_REF_KEY) { -			key.offset = btrfs_extent_inline_ref_offset(eb, iref); -			ret = __add_tree_block(rc, key.offset, blocksize, -					       blocks); -		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { -			dref = (struct btrfs_extent_data_ref *)(&iref->offset); -			ret = find_data_references(rc, extent_key, -						   eb, dref, blocks); -		} else { -			ret = -EUCLEAN; -			btrfs_err(rc->extent_root->fs_info, -		     "extent %llu slot %d has an invalid inline ref type", -			     eb->start, path->slots[0]); -		} -		if (ret) { -			err = ret; -			goto out; -		} -		ptr += btrfs_extent_inline_ref_size(key.type); -	} -	WARN_ON(ptr > end); +	btrfs_release_path(path); +	ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid, +				   0, &leaves, NULL, true); +	if (ret < 0) +		return ret; -	while (1) { -		cond_resched(); -		eb = path->nodes[0]; -		if (path->slots[0] >= btrfs_header_nritems(eb)) { -			ret = btrfs_next_leaf(rc->extent_root, path); -			if (ret < 0) { -				err = ret; -				break; -			} -			if (ret > 0) -				break; -			eb = path->nodes[0]; -		} +	ULIST_ITER_INIT(&leaf_uiter); +	while ((ref_node = ulist_next(leaves, &leaf_uiter))) { +		struct extent_buffer *eb; -		btrfs_item_key_to_cpu(eb, &key, path->slots[0]); -		if (key.objectid != extent_key->objectid) +		eb = read_tree_block(fs_info, ref_node->val, 0, 0, NULL); +		if (IS_ERR(eb)) { +			ret = PTR_ERR(eb);  			break; - -		if (key.type == BTRFS_SHARED_DATA_REF_KEY) { -			ret = __add_tree_block(rc, key.offset, blocksize, -					       blocks); -		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { -			dref = btrfs_item_ptr(eb, path->slots[0], -					      struct btrfs_extent_data_ref); -			ret = find_data_references(rc, extent_key, -						   eb, dref, blocks); -		} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { -			btrfs_print_v0_err(eb->fs_info); -			btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); -			ret = -EINVAL; -		} else { -			ret = 0;  		} -		if (ret) { -			err = ret; +		ret = delete_v1_space_cache(eb, rc->block_group, +					    extent_key->objectid); +		free_extent_buffer(eb); +		if (ret < 0) +			break; +		ret = __add_tree_block(rc, ref_node->val, blocksize, blocks); +		if (ret < 0)  			break; -		} -		path->slots[0]++;  	} -out: -	btrfs_release_path(path); -	if (err) +	if (ret < 0)  		free_block_list(blocks); -	return err; +	ulist_free(leaves); +	return ret;  }  /* @@ -4137,12 +4052,6 @@ restart:  		if (!RB_EMPTY_ROOT(&blocks)) {  			ret = relocate_tree_blocks(trans, rc, &blocks);  			if (ret < 0) { -				/* -				 * if we fail to relocate tree blocks, force to update -				 * backref cache when committing transaction. 
-				 */ -				rc->backref_cache.last_trans = trans->transid - 1; -  				if (ret != -EAGAIN) {  					err = ret;  					break; @@ -4166,6 +4075,10 @@ restart:  				break;  			}  		} +		if (btrfs_should_cancel_balance(fs_info)) { +			err = -ECANCELED; +			break; +		}  	}  	if (trans && progress && err == -ENOSPC) {  		ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); @@ -4195,15 +4108,23 @@ restart:  	set_reloc_control(rc);  	backref_cache_cleanup(&rc->backref_cache); -	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1); +	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); +	/* +	 * Even in the case when the relocation is cancelled, we should all go +	 * through prepare_to_merge() and merge_reloc_roots(). +	 * +	 * For error (including cancelled balance), prepare_to_merge() will +	 * mark all reloc trees orphan, then queue them for cleanup in +	 * merge_reloc_roots() +	 */  	err = prepare_to_merge(rc, err);  	merge_reloc_roots(rc);  	rc->merge_reloc_tree = 0;  	unset_reloc_control(rc); -	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1); +	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);  	/* get rid of pinned extents */  	trans = btrfs_join_transaction(rc->extent_root); @@ -4212,10 +4133,10 @@ restart:  		goto out_free;  	}  	btrfs_commit_transaction(trans); +out_free:  	ret = clean_dirty_subvols(rc);  	if (ret < 0 && !err)  		err = ret; -out_free:  	btrfs_free_block_rsv(fs_info, rc->block_rsv);  	btrfs_free_path(path);  	return err; @@ -4271,8 +4192,10 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,  		return ERR_CAST(root);  	trans = btrfs_start_transaction(root, 6); -	if (IS_ERR(trans)) +	if (IS_ERR(trans)) { +		btrfs_put_root(root);  		return ERR_CAST(trans); +	}  	err = btrfs_find_free_objectid(root, &objectid);  	if (err) @@ -4290,6 +4213,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,  	err = btrfs_orphan_add(trans, BTRFS_I(inode));  out: +	btrfs_put_root(root);  	btrfs_end_transaction(trans);  	btrfs_btree_balance_dirty(fs_info);  	if (err) { @@ -4317,6 +4241,18 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)  	return rc;  } +static void free_reloc_control(struct reloc_control *rc) +{ +	struct mapping_node *node, *tmp; + +	free_reloc_roots(&rc->reloc_roots); +	rbtree_postorder_for_each_entry_safe(node, tmp, +			&rc->reloc_root_tree.rb_root, rb_node) +		kfree(node); + +	kfree(rc); +} +  /*   * Print the block group being relocated   */ @@ -4461,7 +4397,7 @@ out:  		btrfs_dec_block_group_ro(rc->block_group);  	iput(rc->data_inode);  	btrfs_put_block_group(rc->block_group); -	kfree(rc); +	free_reloc_control(rc);  	return err;  } @@ -4537,12 +4473,13 @@ int btrfs_recover_relocation(struct btrfs_root *root)  		    key.type != BTRFS_ROOT_ITEM_KEY)  			break; -		reloc_root = btrfs_read_fs_root(root, &key); +		reloc_root = btrfs_read_tree_root(root, &key);  		if (IS_ERR(reloc_root)) {  			err = PTR_ERR(reloc_root);  			goto out;  		} +		set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state);  		list_add(&reloc_root->root_list, &reloc_roots);  		if (btrfs_root_refs(&reloc_root->root_item) > 0) { @@ -4559,6 +4496,8 @@ int btrfs_recover_relocation(struct btrfs_root *root)  					err = ret;  					goto out;  				} +			} else { +				btrfs_put_root(fs_root);  			}  		} @@ -4584,9 +4523,8 @@ int btrfs_recover_relocation(struct btrfs_root *root)  	trans = btrfs_join_transaction(rc->extent_root);  	if (IS_ERR(trans)) { -		unset_reloc_control(rc);  		err = PTR_ERR(trans); -		goto 
out_free; +		goto out_unset;  	}  	rc->merge_reloc_tree = 1; @@ -4606,17 +4544,18 @@ int btrfs_recover_relocation(struct btrfs_root *root)  		if (IS_ERR(fs_root)) {  			err = PTR_ERR(fs_root);  			list_add_tail(&reloc_root->root_list, &reloc_roots); -			goto out_free; +			goto out_unset;  		}  		err = __add_reloc_root(reloc_root);  		BUG_ON(err < 0); /* -ENOMEM or logic error */ -		fs_root->reloc_root = reloc_root; +		fs_root->reloc_root = btrfs_grab_root(reloc_root); +		btrfs_put_root(fs_root);  	}  	err = btrfs_commit_transaction(trans);  	if (err) -		goto out_free; +		goto out_unset;  	merge_reloc_roots(rc); @@ -4625,15 +4564,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)  	trans = btrfs_join_transaction(rc->extent_root);  	if (IS_ERR(trans)) {  		err = PTR_ERR(trans); -		goto out_free; +		goto out_clean;  	}  	err = btrfs_commit_transaction(trans); - +out_clean:  	ret = clean_dirty_subvols(rc);  	if (ret < 0 && !err)  		err = ret; -out_free: -	kfree(rc); +out_unset: +	unset_reloc_control(rc); +	free_reloc_control(rc);  out:  	if (!list_empty(&reloc_roots))  		free_reloc_roots(&reloc_roots); @@ -4643,10 +4583,12 @@ out:  	if (err == 0) {  		/* cleanup orphan inode in data relocation tree */  		fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); -		if (IS_ERR(fs_root)) +		if (IS_ERR(fs_root)) {  			err = PTR_ERR(fs_root); -		else +		} else {  			err = btrfs_orphan_cleanup(fs_root); +			btrfs_put_root(fs_root); +		}  	}  	return err;  } @@ -4720,11 +4662,6 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,  	BUG_ON(rc->stage == UPDATE_DATA_PTRS &&  	       root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); -	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { -		if (buf == root->node) -			__update_reloc_root(root, cow->start); -	} -  	level = btrfs_header_level(buf);  	if (btrfs_header_generation(buf) <=  	    btrfs_root_last_snapshot(&root->root_item)) @@ -4795,6 +4732,10 @@ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,  /*   * called after snapshot is created. migrate block reservation   * and create reloc root for the newly created snapshot + * + * This is similar to btrfs_init_reloc_root(), we come out of here with two + * references held on the reloc_root, one for root->reloc_root and one for + * rc->reloc_roots.   
*/  int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,  			       struct btrfs_pending_snapshot *pending) @@ -4827,7 +4768,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,  	ret = __add_reloc_root(reloc_root);  	BUG_ON(ret < 0); -	new_root->reloc_root = reloc_root; +	new_root->reloc_root = btrfs_grab_root(reloc_root);  	if (rc->create_reloc_tree)  		ret = clone_backref_node(trans, rc, root, reloc_root); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 612411c74550..668f22844017 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -22,7 +22,6 @@  static void btrfs_read_root_item(struct extent_buffer *eb, int slot,  				struct btrfs_root_item *item)  { -	uuid_le uuid;  	u32 len;  	int need_reset = 0; @@ -44,8 +43,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,  			sizeof(*item) - offsetof(struct btrfs_root_item,  					generation_v2)); -		uuid_le_gen(&uuid); -		memcpy(item->uuid, uuid.b, BTRFS_UUID_SIZE); +		generate_random_guid(item->uuid);  	}  } @@ -255,25 +253,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)  		root_key.objectid = key.offset;  		key.offset++; -		/* -		 * The root might have been inserted already, as before we look -		 * for orphan roots, log replay might have happened, which -		 * triggers a transaction commit and qgroup accounting, which -		 * in turn reads and inserts fs roots while doing backref -		 * walking. -		 */ -		root = btrfs_lookup_fs_root(fs_info, root_key.objectid); -		if (root) { -			WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, -					  &root->state)); -			if (btrfs_root_refs(&root->root_item) == 0) { -				set_bit(BTRFS_ROOT_DEAD_TREE, &root->state); -				btrfs_add_dead_root(root); -			} -			continue; -		} - -		root = btrfs_read_fs_root(tree_root, &root_key); +		root = btrfs_get_fs_root(fs_info, &root_key, false);  		err = PTR_ERR_OR_ZERO(root);  		if (err && err != -ENOENT) {  			break; @@ -300,25 +280,12 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)  			continue;  		} -		err = btrfs_init_fs_root(root); -		if (err) { -			btrfs_free_fs_root(root); -			break; -		} - -		set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); - -		err = btrfs_insert_fs_root(fs_info, root); -		if (err) { -			BUG_ON(err == -EEXIST); -			btrfs_free_fs_root(root); -			break; -		} - +		WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state));  		if (btrfs_root_refs(&root->root_item) == 0) {  			set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);  			btrfs_add_dead_root(root);  		} +		btrfs_put_root(root);  	}  	btrfs_free_path(path); @@ -553,5 +520,5 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,  void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,  				      struct btrfs_block_rsv *rsv)  { -	btrfs_block_rsv_release(fs_info, rsv, (u64)-1); +	btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);  } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 61b37c56a7fb..adaf8ab694d5 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -149,7 +149,7 @@ struct scrub_parity {  	 */  	unsigned long		*ebitmap; -	unsigned long		bitmap[0]; +	unsigned long		bitmap[];  };  struct scrub_ctx { @@ -653,7 +653,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,  	root_key.objectid = root;  	root_key.type = BTRFS_ROOT_ITEM_KEY;  	root_key.offset = (u64)-1; -	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key); +	local_root = btrfs_get_fs_root(fs_info, &root_key, true);  	if (IS_ERR(local_root)) {  		ret = 
PTR_ERR(local_root);  		goto err; @@ -668,6 +668,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,  	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);  	if (ret) { +		btrfs_put_root(local_root);  		btrfs_release_path(swarn->path);  		goto err;  	} @@ -688,6 +689,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,  	ipath = init_ipath(4096, local_root, swarn->path);  	memalloc_nofs_restore(nofs_flag);  	if (IS_ERR(ipath)) { +		btrfs_put_root(local_root);  		ret = PTR_ERR(ipath);  		ipath = NULL;  		goto err; @@ -711,6 +713,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,  				  min(isize - offset, (u64)PAGE_SIZE), nlink,  				  (char *)(unsigned long)ipath->fspath->val[i]); +	btrfs_put_root(local_root);  	free_ipath(ipath);  	return 0; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a055b657cb85..c5f41bd86765 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5586,10 +5586,7 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset)  {  	struct btrfs_path *path;  	struct btrfs_root *root = sctx->send_root; -	struct btrfs_file_extent_item *fi;  	struct btrfs_key key; -	u64 extent_end; -	u8 type;  	int ret;  	path = alloc_path_for_send(); @@ -5609,18 +5606,7 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset)  	if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)  		goto out; -	fi = btrfs_item_ptr(path->nodes[0], path->slots[0], -			    struct btrfs_file_extent_item); -	type = btrfs_file_extent_type(path->nodes[0], fi); -	if (type == BTRFS_FILE_EXTENT_INLINE) { -		u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi); -		extent_end = ALIGN(key.offset + size, -				   sctx->send_root->fs_info->sectorsize); -	} else { -		extent_end = key.offset + -			btrfs_file_extent_num_bytes(path->nodes[0], fi); -	} -	sctx->cur_inode_last_extent = extent_end; +	sctx->cur_inode_last_extent = btrfs_file_extent_end(path);  out:  	btrfs_free_path(path);  	return ret; @@ -5674,16 +5660,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,  			break;  		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); -		if (btrfs_file_extent_type(leaf, fi) == -		    BTRFS_FILE_EXTENT_INLINE) { -			u64 size = btrfs_file_extent_ram_bytes(leaf, fi); - -			extent_end = ALIGN(key.offset + size, -					   root->fs_info->sectorsize); -		} else { -			extent_end = key.offset + -				btrfs_file_extent_num_bytes(leaf, fi); -		} +		extent_end = btrfs_file_extent_end(path);  		if (extent_end <= start)  			goto next;  		if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) { @@ -5704,9 +5681,6 @@ out:  static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,  			   struct btrfs_key *key)  { -	struct btrfs_file_extent_item *fi; -	u64 extent_end; -	u8 type;  	int ret = 0;  	if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) @@ -5718,18 +5692,6 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,  			return ret;  	} -	fi = btrfs_item_ptr(path->nodes[0], path->slots[0], -			    struct btrfs_file_extent_item); -	type = btrfs_file_extent_type(path->nodes[0], fi); -	if (type == BTRFS_FILE_EXTENT_INLINE) { -		u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi); -		extent_end = ALIGN(key->offset + size, -				   sctx->send_root->fs_info->sectorsize); -	} else { -		extent_end = key->offset + -			btrfs_file_extent_num_bytes(path->nodes[0], fi); -	} -  	if (path->slots[0] == 0 &&  	    sctx->cur_inode_last_extent < key->offset) {  		/* @@ 
-5755,7 +5717,7 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,  		else  			ret = 0;  	} -	sctx->cur_inode_last_extent = extent_end; +	sctx->cur_inode_last_extent = btrfs_file_extent_end(path);  	return ret;  } @@ -7066,7 +7028,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)  	int clone_sources_to_rollback = 0;  	unsigned alloc_size;  	int sort_clone_roots = 0; -	int index;  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; @@ -7193,11 +7154,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)  			key.type = BTRFS_ROOT_ITEM_KEY;  			key.offset = (u64)-1; -			index = srcu_read_lock(&fs_info->subvol_srcu); - -			clone_root = btrfs_read_fs_root_no_name(fs_info, &key); +			clone_root = btrfs_get_fs_root(fs_info, &key, true);  			if (IS_ERR(clone_root)) { -				srcu_read_unlock(&fs_info->subvol_srcu, index);  				ret = PTR_ERR(clone_root);  				goto out;  			} @@ -7205,20 +7163,19 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)  			if (!btrfs_root_readonly(clone_root) ||  			    btrfs_root_dead(clone_root)) {  				spin_unlock(&clone_root->root_item_lock); -				srcu_read_unlock(&fs_info->subvol_srcu, index); +				btrfs_put_root(clone_root);  				ret = -EPERM;  				goto out;  			}  			if (clone_root->dedupe_in_progress) {  				dedupe_in_progress_warn(clone_root);  				spin_unlock(&clone_root->root_item_lock); -				srcu_read_unlock(&fs_info->subvol_srcu, index); +				btrfs_put_root(clone_root);  				ret = -EAGAIN;  				goto out;  			}  			clone_root->send_in_progress++;  			spin_unlock(&clone_root->root_item_lock); -			srcu_read_unlock(&fs_info->subvol_srcu, index);  			sctx->clone_roots[i].root = clone_root;  			clone_sources_to_rollback = i + 1; @@ -7232,11 +7189,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)  		key.type = BTRFS_ROOT_ITEM_KEY;  		key.offset = (u64)-1; -		index = srcu_read_lock(&fs_info->subvol_srcu); - -		sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); +		sctx->parent_root = btrfs_get_fs_root(fs_info, &key, true);  		if (IS_ERR(sctx->parent_root)) { -			srcu_read_unlock(&fs_info->subvol_srcu, index);  			ret = PTR_ERR(sctx->parent_root);  			goto out;  		} @@ -7246,20 +7200,16 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)  		if (!btrfs_root_readonly(sctx->parent_root) ||  				btrfs_root_dead(sctx->parent_root)) {  			spin_unlock(&sctx->parent_root->root_item_lock); -			srcu_read_unlock(&fs_info->subvol_srcu, index);  			ret = -EPERM;  			goto out;  		}  		if (sctx->parent_root->dedupe_in_progress) {  			dedupe_in_progress_warn(sctx->parent_root);  			spin_unlock(&sctx->parent_root->root_item_lock); -			srcu_read_unlock(&fs_info->subvol_srcu, index);  			ret = -EAGAIN;  			goto out;  		}  		spin_unlock(&sctx->parent_root->root_item_lock); - -		srcu_read_unlock(&fs_info->subvol_srcu, index);  	}  	/* @@ -7267,7 +7217,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)  	 * is behind the current send position. This is checked while searching  	 * for possible clone sources.  	 
*/ -	sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root; +	sctx->clone_roots[sctx->clone_roots_cnt++].root = +		btrfs_grab_root(sctx->send_root);  	/* We do a bsearch later */  	sort(sctx->clone_roots, sctx->clone_roots_cnt, @@ -7352,18 +7303,24 @@ out:  	}  	if (sort_clone_roots) { -		for (i = 0; i < sctx->clone_roots_cnt; i++) +		for (i = 0; i < sctx->clone_roots_cnt; i++) {  			btrfs_root_dec_send_in_progress(  					sctx->clone_roots[i].root); +			btrfs_put_root(sctx->clone_roots[i].root); +		}  	} else { -		for (i = 0; sctx && i < clone_sources_to_rollback; i++) +		for (i = 0; sctx && i < clone_sources_to_rollback; i++) {  			btrfs_root_dec_send_in_progress(  					sctx->clone_roots[i].root); +			btrfs_put_root(sctx->clone_roots[i].root); +		}  		btrfs_root_dec_send_in_progress(send_root);  	} -	if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) +	if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) {  		btrfs_root_dec_send_in_progress(sctx->parent_root); +		btrfs_put_root(sctx->parent_root); +	}  	kvfree(clone_sources_tmp); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 01297c5b2666..8b0fe053a25d 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -10,6 +10,153 @@  #include "transaction.h"  #include "block-group.h" +/* + * HOW DOES SPACE RESERVATION WORK + * + * If you want to know about delalloc specifically, there is a separate comment + * for that with the delalloc code.  This comment is about how the whole system + * works generally. + * + * BASIC CONCEPTS + * + *   1) space_info.  This is the ultimate arbiter of how much space we can use. + *   There's a description of the bytes_ fields with the struct declaration, + *   refer to that for specifics on each field.  Suffice it to say that for + *   reservations we care about total_bytes - SUM(space_info->bytes_) when + *   determining if there is space to make an allocation.  There is a space_info + *   for METADATA, SYSTEM, and DATA areas. + * + *   2) block_rsv's.  These are basically buckets for every different type of + *   metadata reservation we have.  You can see the comment in the block_rsv + *   code on the rules for each type, but generally block_rsv->reserved is how + *   much space is accounted for in space_info->bytes_may_use. + * + *   3) btrfs_calc*_size.  These are the worst case calculations we used based + *   on the number of items we will want to modify.  We have one for changing + *   items, and one for inserting new items.  Generally we use these helpers to + *   determine the size of the block reserves, and then use the actual bytes + *   values to adjust the space_info counters. + * + * MAKING RESERVATIONS, THE NORMAL CASE + * + *   We call into either btrfs_reserve_data_bytes() or + *   btrfs_reserve_metadata_bytes(), depending on which we're looking for, with + *   num_bytes we want to reserve. 
+ * + *   ->reserve + *     space_info->bytes_may_reserve += num_bytes + * + *   ->extent allocation + *     Call btrfs_add_reserved_bytes() which does + *     space_info->bytes_may_reserve -= num_bytes + *     space_info->bytes_reserved += extent_bytes + * + *   ->insert reference + *     Call btrfs_update_block_group() which does + *     space_info->bytes_reserved -= extent_bytes + *     space_info->bytes_used += extent_bytes + * + * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority) + * + *   Assume we are unable to simply make the reservation because we do not have + *   enough space + * + *   -> __reserve_bytes + *     create a reserve_ticket with ->bytes set to our reservation, add it to + *     the tail of space_info->tickets, kick async flush thread + * + *   ->handle_reserve_ticket + *     wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set + *     on the ticket. + * + *   -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space + *     Flushes various things attempting to free up space. + * + *   -> btrfs_try_granting_tickets() + *     This is called by anything that either subtracts space from + *     space_info->bytes_may_use, ->bytes_pinned, etc., or adds to the + *     space_info->total_bytes.  This loops through the ->priority_tickets and + *     then the ->tickets list checking to see if the reservation can be + *     completed.  If it can, the space is added to space_info->bytes_may_use and + *     the ticket is woken up. + * + *   -> ticket wakeup + *     Check if ->bytes == 0; if it does, we got our reservation and we can carry + *     on; if not, return the appropriate error (ENOSPC, but can be EINTR if we + *     were interrupted). + * + * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY + * + *   Same as the above, except we add ourselves to the + *   space_info->priority_tickets, and we do not use ticket->wait; we simply + *   call flush_space() ourselves for the states that are safe for us to call + *   without deadlocking and hope for the best. + * + * THE FLUSHING STATES + * + *   Generally speaking we will have two cases for each state, a "nice" state + *   and an "ALL THE THINGS" state.  In btrfs we delay a lot of work in order to + *   reduce the locking overhead on the various trees, and even to keep from + *   doing any work at all in the case of delayed refs.  Each of these delayed + *   things, however, holds reservations, and so letting them run allows us to + *   reclaim space so we can make new reservations. + * + *   FLUSH_DELAYED_ITEMS + *     Every inode has a delayed item to update the inode.  Take a simple write + *     for example: we would update the inode item at write time to update the + *     mtime, and then again at finish_ordered_io() time in order to update the + *     isize or bytes.  We keep these delayed items to coalesce these operations + *     into a single operation done on demand.  These are an easy way to reclaim + *     metadata space. + * + *   FLUSH_DELALLOC + *     Look at the delalloc comment to get an idea of how much space is reserved + *     for delayed allocation.  We can reclaim some of this space simply by + *     running delalloc, but usually we need to wait for ordered extents to + *     reclaim the bulk of this space. + * + *   FLUSH_DELAYED_REFS + *     We have a block reserve for the outstanding delayed refs space, and every + *     delayed ref operation holds a reservation.  
Running these is a quick way + *     to reclaim space, but we want to hold this until the end because COW can + *     churn a lot and we can avoid making some extent tree modifications if we + *     are able to delay for as long as possible. + * + *   ALLOC_CHUNK + *     We will skip this the first time through space reservation, because of + *     overcommit and we don't want to have a lot of useless metadata space when + *     our worst case reservations will likely never come true. + * + *   RUN_DELAYED_IPUTS + *     If we're freeing inodes we're likely freeing checksums, file extent + *     items, and extent tree items.  Loads of space could be freed up by these + *     operations, however they won't be usable until the transaction commits. + * + *   COMMIT_TRANS + *     may_commit_transaction() is the ultimate arbiter on whether we commit the + *     transaction or not.  In order to avoid constantly churning we do all the + *     above flushing first and then commit the transaction as the last resort. + *     However, we need to take into account things like pinned space that would + *     be freed, plus any delayed work we may not have gotten rid of in the case + *     of metadata. + * + * OVERCOMMIT + * + *   Because we hold so many reservations for metadata we will allow you to + *   reserve more space than is currently free in the currently allocated + *   metadata space.  This only happens with metadata; data does not allow + *   overcommitting. + * + *   You can see the current logic for when we allow overcommit in + *   btrfs_can_overcommit(), but it only applies to unallocated space.  If there + *   is no unallocated space to be had, all reservations are kept within the + *   free space in the allocated metadata chunks. + * + *   Because of overcommitting, you generally want to use the + *   btrfs_can_overcommit() logic for metadata allocations, as it does the right + *   thing with or without extra unallocated space. + */ +  u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,  			  bool may_use_included)  { @@ -159,25 +306,19 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)  	return (global->size << 1);  } -int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, -			 struct btrfs_space_info *space_info, u64 bytes, -			 enum btrfs_reserve_flush_enum flush) +static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, +			  struct btrfs_space_info *space_info, +			  enum btrfs_reserve_flush_enum flush) +{  	u64 profile;  	u64 avail; -	u64 used;  	int factor; -	/* Don't overcommit when in mixed mode. 
*/ -	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) -		return 0; -  	if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)  		profile = btrfs_system_alloc_profile(fs_info);  	else  		profile = btrfs_metadata_alloc_profile(fs_info); -	used = btrfs_space_info_used(space_info, true);  	avail = atomic64_read(&fs_info->free_chunk_space);  	/* @@ -198,6 +339,22 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,  		avail >>= 3;  	else  		avail >>= 1; +	return avail; +} + +int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, +			 struct btrfs_space_info *space_info, u64 bytes, +			 enum btrfs_reserve_flush_enum flush) +{ +	u64 avail; +	u64 used; + +	/* Don't overcommit when in mixed mode */ +	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) +		return 0; + +	used = btrfs_space_info_used(space_info, true); +	avail = calc_available_free_space(fs_info, space_info, flush);  	if (used + bytes < space_info->total_bytes + avail)  		return 1; @@ -232,6 +389,8 @@ again:  							      space_info,  							      ticket->bytes);  			list_del_init(&ticket->list); +			ASSERT(space_info->reclaim_size >= ticket->bytes); +			space_info->reclaim_size -= ticket->bytes;  			ticket->bytes = 0;  			space_info->tickets_id++;  			wake_up(&ticket->wait); @@ -627,15 +786,26 @@ static inline u64  btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,  				 struct btrfs_space_info *space_info)  { -	struct reserve_ticket *ticket;  	u64 used; +	u64 avail;  	u64 expected; -	u64 to_reclaim = 0; +	u64 to_reclaim = space_info->reclaim_size; + +	lockdep_assert_held(&space_info->lock); + +	avail = calc_available_free_space(fs_info, space_info, +					  BTRFS_RESERVE_FLUSH_ALL); +	used = btrfs_space_info_used(space_info, true); + +	/* +	 * We may be flushing because suddenly we have less space than we had +	 * before, and now we're well over-committed based on our current free +	 * space.  If that's the case add in our overage so we make sure to put +	 * appropriate pressure on the flushing state machine. +	 */ +	if (space_info->total_bytes + avail < used) +		to_reclaim += used - (space_info->total_bytes + avail); -	list_for_each_entry(ticket, &space_info->tickets, list) -		to_reclaim += ticket->bytes; -	list_for_each_entry(ticket, &space_info->priority_tickets, list) -		to_reclaim += ticket->bytes;  	if (to_reclaim)  		return to_reclaim; @@ -1020,8 +1190,10 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,  	 * the list and we will do our own flushing further down.  	 */  	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { +		ASSERT(space_info->reclaim_size >= 0);  		ticket.bytes = orig_bytes;  		ticket.error = 0; +		space_info->reclaim_size += ticket.bytes;  		init_waitqueue_head(&ticket.wait);  		if (flush == BTRFS_RESERVE_FLUSH_ALL) {  			list_add_tail(&ticket.list, &space_info->tickets); diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 24514cd2c6c1..0a5001ef1481 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -54,6 +54,13 @@ struct btrfs_space_info {  	struct list_head ro_bgs;  	struct list_head priority_tickets;  	struct list_head tickets; + +	/* +	 * Size of space that needs to be reclaimed in order to satisfy pending +	 * tickets +	 */ +	u64 reclaim_size; +  	/*  	 * tickets_id just indicates the next ticket will be handled, so note  	 * it's not stored per ticket. 
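For readers following the space-info changes above: the new space_info->reclaim_size counter replaces walking both ticket lists when sizing a flush, and btrfs_calc_reclaim_metadata_size() now also adds any overage once the used space exceeds total_bytes plus the space we could still overcommit against. The sketch below is a standalone userspace model of that bookkeeping, not the kernel code; struct space_info here and the helper names are simplified stand-ins, and avail is passed in where the kernel would call calc_available_free_space().

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for struct btrfs_space_info. */
struct space_info {
	uint64_t total_bytes;   /* space in allocated chunks */
	uint64_t used;          /* used + reserved + pinned + may_use + readonly */
	uint64_t reclaim_size;  /* bytes still needed by queued tickets */
};

/* A ticket was queued: its bytes now need to be reclaimed. */
static void ticket_queued(struct space_info *si, uint64_t bytes)
{
	si->reclaim_size += bytes;
}

/* A ticket was granted (or failed): its bytes no longer count. */
static void ticket_done(struct space_info *si, uint64_t bytes)
{
	si->reclaim_size -= bytes;
}

/*
 * Model of the reworked btrfs_calc_reclaim_metadata_size() logic: start from
 * the queued ticket bytes and add the overage if we are overcommitted beyond
 * what the remaining unallocated space could still cover.
 */
static uint64_t calc_reclaim(const struct space_info *si, uint64_t avail)
{
	uint64_t to_reclaim = si->reclaim_size;

	if (si->total_bytes + avail < si->used)
		to_reclaim += si->used - (si->total_bytes + avail);
	return to_reclaim;
}

int main(void)
{
	struct space_info si = { .total_bytes = 1ULL << 30, .used = 1100ULL << 20 };

	ticket_queued(&si, 64ULL << 20);
	printf("reclaim target: %llu bytes\n",
	       (unsigned long long)calc_reclaim(&si, 0));
	ticket_done(&si, 64ULL << 20);
	return 0;
}

Keeping the queued bytes in one counter means the flusher does not have to re-walk the priority and normal ticket lists under space_info->lock every time it recomputes its target.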
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 67c63858812a..7932d8d07cff 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -244,7 +244,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,  {  	struct btrfs_fs_info *fs_info = trans->fs_info; -	trans->aborted = errno; +	WRITE_ONCE(trans->aborted, errno);  	/* Nothing used. The other threads that have joined this  	 * transaction may be able to continue. */  	if (!trans->dirty && list_empty(&trans->new_bgs)) { @@ -873,7 +873,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,  			break;  #endif  		case Opt_err: -			btrfs_info(info, "unrecognized mount option '%s'", p); +			btrfs_err(info, "unrecognized mount option '%s'", p);  			ret = -EINVAL;  			goto out;  		default: @@ -1024,11 +1024,11 @@ out:  	return error;  } -static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, -					   u64 subvol_objectid) +char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, +					  u64 subvol_objectid)  {  	struct btrfs_root *root = fs_info->tree_root; -	struct btrfs_root *fs_root; +	struct btrfs_root *fs_root = NULL;  	struct btrfs_root_ref *root_ref;  	struct btrfs_inode_ref *inode_ref;  	struct btrfs_key key; @@ -1096,9 +1096,10 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,  		key.objectid = subvol_objectid;  		key.type = BTRFS_ROOT_ITEM_KEY;  		key.offset = (u64)-1; -		fs_root = btrfs_read_fs_root_no_name(fs_info, &key); +		fs_root = btrfs_get_fs_root(fs_info, &key, true);  		if (IS_ERR(fs_root)) {  			ret = PTR_ERR(fs_root); +			fs_root = NULL;  			goto err;  		} @@ -1143,6 +1144,8 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,  			ptr[0] = '/';  			btrfs_release_path(path);  		} +		btrfs_put_root(fs_root); +		fs_root = NULL;  	}  	btrfs_free_path(path); @@ -1155,6 +1158,7 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,  	return name;  err: +	btrfs_put_root(fs_root);  	btrfs_free_path(path);  	kfree(name);  	return ERR_PTR(ret); @@ -1438,8 +1442,8 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,  				goto out;  			}  		} -		subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb), -							    subvol_objectid); +		subvol_name = btrfs_get_subvol_name_from_objectid( +					btrfs_sb(mnt->mnt_sb), subvol_objectid);  		if (IS_ERR(subvol_name)) {  			root = ERR_CAST(subvol_name);  			subvol_name = NULL; @@ -1518,14 +1522,17 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,  	/*  	 * Setup a dummy root and fs_info for test/set super.  This is because  	 * we don't actually fill this stuff out until open_ctree, but we need -	 * it for searching for existing supers, so this lets us do that and -	 * then open_ctree will properly initialize everything later. +	 * then open_ctree will properly initialize the file system specific +	 * settings later.  btrfs_init_fs_info initializes the static elements +	 * of the fs_info (locks and such) to make cleanup easier if we find a +	 * superblock with our given fs_devices later on at sget() time.  	 
*/  	fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);  	if (!fs_info) {  		error = -ENOMEM;  		goto error_sec_opts;  	} +	btrfs_init_fs_info(fs_info);  	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);  	fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); @@ -1571,7 +1578,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,  	if (s->s_root) {  		btrfs_close_devices(fs_devices); -		free_fs_info(fs_info); +		btrfs_free_fs_info(fs_info);  		if ((flags ^ s->s_flags) & SB_RDONLY)  			error = -EBUSY;  	} else { @@ -1594,7 +1601,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,  error_close_devices:  	btrfs_close_devices(fs_devices);  error_fs_info: -	free_fs_info(fs_info); +	btrfs_free_fs_info(fs_info);  error_sec_opts:  	security_free_mnt_opts(&new_sec_opts);  	return ERR_PTR(error); @@ -2170,7 +2177,7 @@ static void btrfs_kill_super(struct super_block *sb)  {  	struct btrfs_fs_info *fs_info = btrfs_sb(sb);  	kill_anon_super(sb); -	free_fs_info(fs_info); +	btrfs_free_fs_info(fs_info);  }  static struct file_system_type btrfs_fs_type = { @@ -2203,7 +2210,7 @@ static int btrfs_control_open(struct inode *inode, struct file *file)  }  /* - * used by btrfsctl to scan devices when no FS is mounted + * Used by /dev/btrfs-control for devices ioctls.   */  static long btrfs_control_ioctl(struct file *file, unsigned int cmd,  				unsigned long arg) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 3c10e78924d0..a39bff64ff24 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -155,7 +155,7 @@ static ssize_t btrfs_feature_attr_show(struct kobject *kobj,  	} else  		val = can_modify_feature(fa); -	return snprintf(buf, PAGE_SIZE, "%d\n", val); +	return scnprintf(buf, PAGE_SIZE, "%d\n", val);  }  static ssize_t btrfs_feature_attr_store(struct kobject *kobj, @@ -295,7 +295,7 @@ static const struct attribute_group btrfs_feature_attr_group = {  static ssize_t rmdir_subvol_show(struct kobject *kobj,  				 struct kobj_attribute *ka, char *buf)  { -	return snprintf(buf, PAGE_SIZE, "0\n"); +	return scnprintf(buf, PAGE_SIZE, "0\n");  }  BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show); @@ -310,12 +310,12 @@ static ssize_t supported_checksums_show(struct kobject *kobj,  		 * This "trick" only works as long as 'enum btrfs_csum_type' has  		 * no holes in it  		 */ -		ret += snprintf(buf + ret, PAGE_SIZE - ret, "%s%s", +		ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",  				(i == 0 ? 
"" : " "), btrfs_super_csum_name(i));  	} -	ret += snprintf(buf + ret, PAGE_SIZE - ret, "\n"); +	ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");  	return ret;  }  BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); @@ -350,7 +350,7 @@ static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj,  {  	struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); -	return snprintf(buf, PAGE_SIZE, "%lld\n", +	return scnprintf(buf, PAGE_SIZE, "%lld\n",  			atomic64_read(&fs_info->discard_ctl.discardable_bytes));  }  BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show); @@ -361,7 +361,7 @@ static ssize_t btrfs_discardable_extents_show(struct kobject *kobj,  {  	struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); -	return snprintf(buf, PAGE_SIZE, "%d\n", +	return scnprintf(buf, PAGE_SIZE, "%d\n",  			atomic_read(&fs_info->discard_ctl.discardable_extents));  }  BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show); @@ -372,7 +372,7 @@ static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj,  {  	struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); -	return snprintf(buf, PAGE_SIZE, "%lld\n", +	return scnprintf(buf, PAGE_SIZE, "%lld\n",  			fs_info->discard_ctl.discard_bitmap_bytes);  }  BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show); @@ -383,7 +383,7 @@ static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj,  {  	struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); -	return snprintf(buf, PAGE_SIZE, "%lld\n", +	return scnprintf(buf, PAGE_SIZE, "%lld\n",  		atomic64_read(&fs_info->discard_ctl.discard_bytes_saved));  }  BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show); @@ -394,7 +394,7 @@ static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj,  {  	struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); -	return snprintf(buf, PAGE_SIZE, "%lld\n", +	return scnprintf(buf, PAGE_SIZE, "%lld\n",  			fs_info->discard_ctl.discard_extent_bytes);  }  BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show); @@ -405,7 +405,7 @@ static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj,  {  	struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); -	return snprintf(buf, PAGE_SIZE, "%u\n", +	return scnprintf(buf, PAGE_SIZE, "%u\n",  			READ_ONCE(fs_info->discard_ctl.iops_limit));  } @@ -435,7 +435,7 @@ static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj,  {  	struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); -	return snprintf(buf, PAGE_SIZE, "%u\n", +	return scnprintf(buf, PAGE_SIZE, "%u\n",  			READ_ONCE(fs_info->discard_ctl.kbps_limit));  } @@ -465,7 +465,7 @@ static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj,  {  	struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); -	return snprintf(buf, PAGE_SIZE, "%llu\n", +	return scnprintf(buf, PAGE_SIZE, "%llu\n",  			READ_ONCE(fs_info->discard_ctl.max_discard_size));  } @@ -530,7 +530,7 @@ static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)  	val = *value_ptr;  	if (lock)  		spin_unlock(lock); -	return snprintf(buf, PAGE_SIZE, "%llu\n", val); +	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);  }  static ssize_t global_rsv_size_show(struct kobject *kobj, @@ -576,7 +576,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj,  			val += block_group->used;  	}  	up_read(&sinfo->groups_sem); -	return snprintf(buf, PAGE_SIZE, "%llu\n", val); +	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);  }  static struct 
attribute *raid_attrs[] = { @@ -613,7 +613,7 @@ static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,  {  	struct btrfs_space_info *sinfo = to_space_info(kobj);  	s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned); -	return snprintf(buf, PAGE_SIZE, "%lld\n", val); +	return scnprintf(buf, PAGE_SIZE, "%lld\n", val);  }  SPACE_INFO_ATTR(flags); @@ -670,7 +670,7 @@ static ssize_t btrfs_label_show(struct kobject *kobj,  	ssize_t ret;  	spin_lock(&fs_info->super_lock); -	ret = snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); +	ret = scnprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);  	spin_unlock(&fs_info->super_lock);  	return ret; @@ -718,7 +718,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,  {  	struct btrfs_fs_info *fs_info = to_fs_info(kobj); -	return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); +	return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);  }  BTRFS_ATTR(, nodesize, btrfs_nodesize_show); @@ -728,8 +728,8 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,  {  	struct btrfs_fs_info *fs_info = to_fs_info(kobj); -	return snprintf(buf, PAGE_SIZE, "%u\n", -			fs_info->super_copy->sectorsize); +	return scnprintf(buf, PAGE_SIZE, "%u\n", +			 fs_info->super_copy->sectorsize);  }  BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); @@ -739,8 +739,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,  {  	struct btrfs_fs_info *fs_info = to_fs_info(kobj); -	return snprintf(buf, PAGE_SIZE, "%u\n", -			fs_info->super_copy->sectorsize); +	return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);  }  BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); @@ -752,7 +751,7 @@ static ssize_t quota_override_show(struct kobject *kobj,  	int quota_override;  	quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); -	return snprintf(buf, PAGE_SIZE, "%d\n", quota_override); +	return scnprintf(buf, PAGE_SIZE, "%d\n", quota_override);  }  static ssize_t quota_override_store(struct kobject *kobj, @@ -790,7 +789,7 @@ static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj,  {  	struct btrfs_fs_info *fs_info = to_fs_info(kobj); -	return snprintf(buf, PAGE_SIZE, "%pU\n", +	return scnprintf(buf, PAGE_SIZE, "%pU\n",  			fs_info->fs_devices->metadata_uuid);  } @@ -802,7 +801,7 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj,  	struct btrfs_fs_info *fs_info = to_fs_info(kobj);  	u16 csum_type = btrfs_super_csum_type(fs_info->super_copy); -	return snprintf(buf, PAGE_SIZE, "%s (%s)\n", +	return scnprintf(buf, PAGE_SIZE, "%s (%s)\n",  			btrfs_super_csum_name(csum_type),  			crypto_shash_driver_name(fs_info->csum_shash));  } @@ -960,7 +959,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)  	addrm_unknown_feature_attrs(fs_info, false);  	sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group);  	sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs); -	btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL); +	btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL);  }  static const char * const btrfs_feature_set_names[FEAT_MAX] = { @@ -992,7 +991,7 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags)  			continue;  		name = btrfs_feature_attrs[set][i].kobj_attr.attr.name; -		len += snprintf(str + len, bufsize - len, "%s%s", +		len += scnprintf(str + len, bufsize - len, "%s%s",  				len ? 
"," : "", name);  	} @@ -1149,7 +1148,7 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,  /* when one_device is NULL, it removes all device links */ -int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices,  		struct btrfs_device *one_device)  {  	struct hd_struct *disk; @@ -1201,11 +1200,11 @@ static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj,  	val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); -	return snprintf(buf, PAGE_SIZE, "%d\n", val); +	return scnprintf(buf, PAGE_SIZE, "%d\n", val);  }  BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show); -static ssize_t btrfs_sysfs_missing_show(struct kobject *kobj, +static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj,  					struct kobj_attribute *a, char *buf)  {  	int val; @@ -1214,9 +1213,9 @@ static ssize_t btrfs_sysfs_missing_show(struct kobject *kobj,  	val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); -	return snprintf(buf, PAGE_SIZE, "%d\n", val); +	return scnprintf(buf, PAGE_SIZE, "%d\n", val);  } -BTRFS_ATTR(devid, missing, btrfs_sysfs_missing_show); +BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show);  static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,  					         struct kobj_attribute *a, @@ -1228,7 +1227,7 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,  	val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); -	return snprintf(buf, PAGE_SIZE, "%d\n", val); +	return scnprintf(buf, PAGE_SIZE, "%d\n", val);  }  BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show); @@ -1241,7 +1240,7 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,  	val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); -	return snprintf(buf, PAGE_SIZE, "%d\n", val); +	return scnprintf(buf, PAGE_SIZE, "%d\n", val);  }  BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); @@ -1269,7 +1268,7 @@ static struct kobj_type devid_ktype = {  	.release	= btrfs_release_devid_kobj,  }; -int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices,  				struct btrfs_device *one_device)  {  	int error = 0; @@ -1371,7 +1370,7 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs)  	if (!fs_devs->devices_kobj) {  		btrfs_err(fs_devs->fs_info,  			  "failed to init sysfs device interface"); -		kobject_put(&fs_devs->fsid_kobj); +		btrfs_sysfs_remove_fsid(fs_devs);  		return -ENOMEM;  	} @@ -1395,13 +1394,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)  	btrfs_set_fs_info_ptr(fs_info); -	error = btrfs_sysfs_add_device_link(fs_devs, NULL); +	error = btrfs_sysfs_add_devices_dir(fs_devs, NULL);  	if (error)  		return error;  	error = sysfs_create_files(fsid_kobj, btrfs_attrs);  	if (error) { -		btrfs_sysfs_rm_device_link(fs_devs, NULL); +		btrfs_sysfs_remove_devices_dir(fs_devs, NULL);  		return error;  	} diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index c68582add92e..718a26c97833 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -14,9 +14,9 @@ enum btrfs_feature_set {  char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);  const char * const btrfs_feature_set_name(enum btrfs_feature_set set); -int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices,  		struct btrfs_device *one_device); 
-int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices,                  struct btrfs_device *one_device);  int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs);  void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 84fb3fa940a6..999c14e5d0bd 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -120,6 +120,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)  		kfree(fs_info);  		return NULL;  	} +	INIT_LIST_HEAD(&fs_info->fs_devices->devices); +  	fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),  				      GFP_KERNEL);  	if (!fs_info->super_copy) { @@ -128,39 +130,10 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)  		return NULL;  	} +	btrfs_init_fs_info(fs_info); +  	fs_info->nodesize = nodesize;  	fs_info->sectorsize = sectorsize; - -	if (init_srcu_struct(&fs_info->subvol_srcu)) { -		kfree(fs_info->fs_devices); -		kfree(fs_info->super_copy); -		kfree(fs_info); -		return NULL; -	} - -	spin_lock_init(&fs_info->buffer_lock); -	spin_lock_init(&fs_info->qgroup_lock); -	spin_lock_init(&fs_info->super_lock); -	spin_lock_init(&fs_info->fs_roots_radix_lock); -	mutex_init(&fs_info->qgroup_ioctl_lock); -	mutex_init(&fs_info->qgroup_rescan_lock); -	rwlock_init(&fs_info->tree_mod_log_lock); -	fs_info->running_transaction = NULL; -	fs_info->qgroup_tree = RB_ROOT; -	fs_info->qgroup_ulist = NULL; -	atomic64_set(&fs_info->tree_mod_seq, 0); -	INIT_LIST_HEAD(&fs_info->dirty_qgroups); -	INIT_LIST_HEAD(&fs_info->dead_roots); -	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); -	INIT_LIST_HEAD(&fs_info->fs_devices->devices); -	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); -	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); -	extent_io_tree_init(fs_info, &fs_info->freed_extents[0], -			    IO_TREE_FS_INFO_FREED_EXTENTS0, NULL); -	extent_io_tree_init(fs_info, &fs_info->freed_extents[1], -			    IO_TREE_FS_INFO_FREED_EXTENTS1, NULL); -	extent_map_tree_init(&fs_info->mapping_tree); -	fs_info->pinned_extents = &fs_info->freed_extents[0];  	set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);  	test_mnt->mnt_sb->s_fs_info = fs_info; @@ -210,8 +183,9 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)  	}  	btrfs_free_qgroup_config(fs_info);  	btrfs_free_fs_roots(fs_info); -	cleanup_srcu_struct(&fs_info->subvol_srcu);  	kfree(fs_info->super_copy); +	btrfs_check_leaked_roots(fs_info); +	btrfs_extent_buffer_leak_debug_check(fs_info);  	kfree(fs_info->fs_devices);  	kfree(fs_info);  } @@ -223,11 +197,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root)  	/* Will be freed by btrfs_free_fs_roots */  	if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))  		return; -	if (root->node) { -		/* One for allocate_extent_buffer */ -		free_extent_buffer(root->node); -	} -	kfree(root); +	btrfs_put_root(root);  }  struct btrfs_block_group * diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index ac035a6fa003..ce1ca8e73c2d 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -507,6 +507,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)  		test_err("couldn't insert fs root %d", ret);  		goto out;  	} +	btrfs_put_root(tmp_root);  	tmp_root = btrfs_alloc_dummy_root(fs_info);  	if (IS_ERR(tmp_root)) { @@ -521,6 +522,7 @@ int btrfs_test_qgroups(u32 sectorsize, 
u32 nodesize)  		test_err("couldn't insert fs root %d", ret);  		goto out;  	} +	btrfs_put_root(tmp_root);  	test_msg("running qgroup tests");  	ret = test_no_shared_qgroup(root, sectorsize, nodesize); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index beb6c69cd1e5..8cede6eb9843 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -221,7 +221,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)  	WARN_ON_ONCE(!list_empty(&trans->new_bgs));  	btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, -				trans->chunk_bytes_reserved); +				trans->chunk_bytes_reserved, NULL);  	trans->chunk_bytes_reserved = 0;  } @@ -243,7 +243,7 @@ loop:  	cur_trans = fs_info->running_transaction;  	if (cur_trans) { -		if (cur_trans->aborted) { +		if (TRANS_ABORTED(cur_trans)) {  			spin_unlock(&fs_info->trans_lock);  			return cur_trans->aborted;  		} @@ -336,6 +336,8 @@ loop:  	list_add_tail(&cur_trans->list, &fs_info->trans_list);  	extent_io_tree_init(fs_info, &cur_trans->dirty_pages,  			IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode); +	extent_io_tree_init(fs_info, &cur_trans->pinned_extents, +			IO_TREE_FS_PINNED_EXTENTS, NULL);  	fs_info->generation++;  	cur_trans->transid = fs_info->generation;  	fs_info->running_transaction = cur_trans; @@ -459,7 +461,7 @@ static inline int is_transaction_blocked(struct btrfs_transaction *trans)  {  	return (trans->state >= TRANS_STATE_COMMIT_START &&  		trans->state < TRANS_STATE_UNBLOCKED && -		!trans->aborted); +		!TRANS_ABORTED(trans));  }  /* wait for commit against the current transaction to become unblocked @@ -478,7 +480,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info)  		wait_event(fs_info->transaction_wait,  			   cur_trans->state >= TRANS_STATE_UNBLOCKED || -			   cur_trans->aborted); +			   TRANS_ABORTED(cur_trans));  		btrfs_put_transaction(cur_trans);  	} else {  		spin_unlock(&fs_info->trans_lock); @@ -673,7 +675,7 @@ join_fail:  alloc_fail:  	if (num_bytes)  		btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv, -					num_bytes); +					num_bytes, NULL);  reserve_fail:  	btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved);  	return ERR_PTR(ret); @@ -896,7 +898,7 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)  	trace_btrfs_space_reservation(fs_info, "transaction",  				      trans->transid, trans->bytes_reserved, 0);  	btrfs_block_rsv_release(fs_info, trans->block_rsv, -				trans->bytes_reserved); +				trans->bytes_reserved, NULL);  	trans->bytes_reserved = 0;  } @@ -937,7 +939,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,  	if (throttle)  		btrfs_run_delayed_iputs(info); -	if (trans->aborted || +	if (TRANS_ABORTED(trans) ||  	    test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) {  		wake_up_process(info->transaction_kthread);  		err = -EIO; @@ -1262,8 +1264,10 @@ void btrfs_add_dead_root(struct btrfs_root *root)  	struct btrfs_fs_info *fs_info = root->fs_info;  	spin_lock(&fs_info->trans_lock); -	if (list_empty(&root->root_list)) +	if (list_empty(&root->root_list)) { +		btrfs_grab_root(root);  		list_add_tail(&root->root_list, &fs_info->dead_roots); +	}  	spin_unlock(&fs_info->trans_lock);  } @@ -1477,7 +1481,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  	u64 index = 0;  	u64 objectid;  	u64 root_flags; -	uuid_le new_uuid;  	ASSERT(pending->path);  	path = pending->path; @@ -1570,8 +1573,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, 
 	btrfs_set_root_generation_v2(new_root_item,  			trans->transid); -	uuid_le_gen(&new_uuid); -	memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); +	generate_random_guid(new_root_item->uuid);  	memcpy(new_root_item->parent_uuid, root->root_item.uuid,  			BTRFS_UUID_SIZE);  	if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) { @@ -1633,7 +1635,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  	}  	key.offset = (u64)-1; -	pending->snap = btrfs_read_fs_root_no_name(fs_info, &key); +	pending->snap = btrfs_get_fs_root(fs_info, &key, true);  	if (IS_ERR(pending->snap)) {  		ret = PTR_ERR(pending->snap);  		btrfs_abort_transaction(trans, ret); @@ -1682,7 +1684,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  		btrfs_abort_transaction(trans, ret);  		goto fail;  	} -	ret = btrfs_uuid_tree_add(trans, new_uuid.b, BTRFS_UUID_KEY_SUBVOL, +	ret = btrfs_uuid_tree_add(trans, new_root_item->uuid, +				  BTRFS_UUID_KEY_SUBVOL,  				  objectid);  	if (ret) {  		btrfs_abort_transaction(trans, ret); @@ -1794,7 +1797,8 @@ static void wait_current_trans_commit_start(struct btrfs_fs_info *fs_info,  					    struct btrfs_transaction *trans)  {  	wait_event(fs_info->transaction_blocked_wait, -		   trans->state >= TRANS_STATE_COMMIT_START || trans->aborted); +		   trans->state >= TRANS_STATE_COMMIT_START || +		   TRANS_ABORTED(trans));  }  /* @@ -1806,7 +1810,8 @@ static void wait_current_trans_commit_start_and_unblock(  					struct btrfs_transaction *trans)  {  	wait_event(fs_info->transaction_wait, -		   trans->state >= TRANS_STATE_UNBLOCKED || trans->aborted); +		   trans->state >= TRANS_STATE_UNBLOCKED || +		   TRANS_ABORTED(trans));  }  /* @@ -2026,7 +2031,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)  	trans->dirty = true;  	/* Stop the commit early if ->aborted is set */ -	if (unlikely(READ_ONCE(cur_trans->aborted))) { +	if (TRANS_ABORTED(cur_trans)) {  		ret = cur_trans->aborted;  		btrfs_end_transaction(trans);  		return ret; @@ -2100,7 +2105,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)  		wait_for_commit(cur_trans); -		if (unlikely(cur_trans->aborted)) +		if (TRANS_ABORTED(cur_trans))  			ret = cur_trans->aborted;  		btrfs_put_transaction(cur_trans); @@ -2119,7 +2124,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)  			spin_unlock(&fs_info->trans_lock);  			wait_for_commit(prev_trans); -			ret = prev_trans->aborted; +			ret = READ_ONCE(prev_trans->aborted);  			btrfs_put_transaction(prev_trans);  			if (ret) @@ -2173,8 +2178,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)  	wait_event(cur_trans->writer_wait,  		   atomic_read(&cur_trans->num_writers) == 1); -	/* ->aborted might be set after the previous check, so check it */ -	if (unlikely(READ_ONCE(cur_trans->aborted))) { +	if (TRANS_ABORTED(cur_trans)) {  		ret = cur_trans->aborted;  		goto scrub_continue;  	} @@ -2191,10 +2195,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)  	 * core function of the snapshot creation.  	 */  	ret = create_pending_snapshots(trans); -	if (ret) { -		mutex_unlock(&fs_info->reloc_mutex); -		goto scrub_continue; -	} +	if (ret) +		goto unlock_reloc;  	/*  	 * We insert the dir indexes of the snapshots and update the inode @@ -2207,16 +2209,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)  	 * the nodes and leaves.  	 
*/  	ret = btrfs_run_delayed_items(trans); -	if (ret) { -		mutex_unlock(&fs_info->reloc_mutex); -		goto scrub_continue; -	} +	if (ret) +		goto unlock_reloc;  	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); -	if (ret) { -		mutex_unlock(&fs_info->reloc_mutex); -		goto scrub_continue; -	} +	if (ret) +		goto unlock_reloc;  	/*  	 * make sure none of the code above managed to slip in a @@ -2242,11 +2240,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)  	mutex_lock(&fs_info->tree_log_mutex);  	ret = commit_fs_roots(trans); -	if (ret) { -		mutex_unlock(&fs_info->tree_log_mutex); -		mutex_unlock(&fs_info->reloc_mutex); -		goto scrub_continue; -	} +	if (ret) +		goto unlock_tree_log;  	/*  	 * Since the transaction is done, we can apply the pending changes @@ -2264,39 +2259,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)  	 * new delayed refs. Must handle them or qgroup can be wrong.  	 */  	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); -	if (ret) { -		mutex_unlock(&fs_info->tree_log_mutex); -		mutex_unlock(&fs_info->reloc_mutex); -		goto scrub_continue; -	} +	if (ret) +		goto unlock_tree_log;  	/*  	 * Since fs roots are all committed, we can get a quite accurate  	 * new_roots. So let's do quota accounting.  	 */  	ret = btrfs_qgroup_account_extents(trans); -	if (ret < 0) { -		mutex_unlock(&fs_info->tree_log_mutex); -		mutex_unlock(&fs_info->reloc_mutex); -		goto scrub_continue; -	} +	if (ret < 0) +		goto unlock_tree_log;  	ret = commit_cowonly_roots(trans); -	if (ret) { -		mutex_unlock(&fs_info->tree_log_mutex); -		mutex_unlock(&fs_info->reloc_mutex); -		goto scrub_continue; -	} +	if (ret) +		goto unlock_tree_log;  	/*  	 * The tasks which save the space cache and inode cache may also  	 * update ->aborted, check it.  	 */ -	if (unlikely(READ_ONCE(cur_trans->aborted))) { +	if (TRANS_ABORTED(cur_trans)) {  		ret = cur_trans->aborted; -		mutex_unlock(&fs_info->tree_log_mutex); -		mutex_unlock(&fs_info->reloc_mutex); -		goto scrub_continue; +		goto unlock_tree_log;  	}  	btrfs_prepare_extent_commit(fs_info); @@ -2343,6 +2327,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)  	if (ret) {  		btrfs_handle_fs_error(fs_info, ret,  				      "Error while writing out transaction"); +		/* +		 * reloc_mutex has been unlocked, tree_log_mutex is still held +		 * but we can't jump to unlock_tree_log causing double unlock +		 */  		mutex_unlock(&fs_info->tree_log_mutex);  		goto scrub_continue;  	} @@ -2391,6 +2379,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)  	return ret; +unlock_tree_log: +	mutex_unlock(&fs_info->tree_log_mutex); +unlock_reloc: +	mutex_unlock(&fs_info->reloc_mutex);  scrub_continue:  	btrfs_scrub_continue(fs_info);  cleanup_transaction: @@ -2434,13 +2426,18 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)  	btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid);  	btrfs_kill_all_delayed_nodes(root); +	if (root->ino_cache_inode) { +		iput(root->ino_cache_inode); +		root->ino_cache_inode = NULL; +	}  	if (btrfs_header_backref_rev(root->node) <  			BTRFS_MIXED_BACKREF_REV) -		ret = btrfs_drop_snapshot(root, NULL, 0, 0); +		ret = btrfs_drop_snapshot(root, 0, 0);  	else -		ret = btrfs_drop_snapshot(root, NULL, 1, 0); +		ret = btrfs_drop_snapshot(root, 1, 0); +	btrfs_put_root(root);  	return (ret < 0) ? 
0 : 1;  } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 49f7196368f5..31ae8d273065 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -71,6 +71,7 @@ struct btrfs_transaction {  	 */  	struct list_head io_bgs;  	struct list_head dropped_roots; +	struct extent_io_tree pinned_extents;  	/*  	 * we need to make sure block group deletion doesn't race with @@ -115,6 +116,10 @@ struct btrfs_trans_handle {  	struct btrfs_block_rsv *orig_rsv;  	refcount_t use_count;  	unsigned int type; +	/* +	 * Error code of transaction abort, set outside of locks and must use +	 * the READ_ONCE/WRITE_ONCE access +	 */  	short aborted;  	bool adding_csums;  	bool allocating_chunk; @@ -126,6 +131,14 @@ struct btrfs_trans_handle {  	struct list_head new_bgs;  }; +/* + * The abort status can be changed between calls and is not protected by locks. + * This accepts btrfs_transaction and btrfs_trans_handle as types. Once it's + * set to a non-zero value it does not change, so the macro should be in checks + * but is not necessary for further reads of the value. + */ +#define TRANS_ABORTED(trans)		(unlikely(READ_ONCE((trans)->aborted))) +  struct btrfs_pending_snapshot {  	struct dentry *dentry;  	struct inode *dir; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7dd7552f53a4..58c111474ba5 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -18,6 +18,8 @@  #include "compression.h"  #include "qgroup.h"  #include "inode-map.h" +#include "block-group.h" +#include "space-info.h"  /* magic values for the inode_only field in btrfs_log_inode:   * @@ -94,8 +96,8 @@ enum {  static int btrfs_log_inode(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root, struct btrfs_inode *inode,  			   int inode_only, -			   const loff_t start, -			   const loff_t end, +			   u64 start, +			   u64 end,  			   struct btrfs_log_ctx *ctx);  static int link_to_fixup_dir(struct btrfs_trans_handle *trans,  			     struct btrfs_root *root, @@ -311,7 +313,7 @@ static int process_one_buffer(struct btrfs_root *log,  	}  	if (wc->pin) -		ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start, +		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,  						      eb->len);  	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { @@ -830,6 +832,11 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,  			goto out;  	} +	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, +						extent_end - start); +	if (ret) +		goto out; +  	inode_add_bytes(inode, nbytes);  update_inode:  	ret = btrfs_update_inode(trans, root, inode); @@ -2659,18 +2666,39 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,  	return ret;  } +/* + * Correctly adjust the reserved bytes occupied by a log tree extent buffer + */ +static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) +{ +	struct btrfs_block_group *cache; + +	cache = btrfs_lookup_block_group(fs_info, start); +	if (!cache) { +		btrfs_err(fs_info, "unable to find block group for %llu", start); +		return; +	} + +	spin_lock(&cache->space_info->lock); +	spin_lock(&cache->lock); +	cache->reserved -= fs_info->nodesize; +	cache->space_info->bytes_reserved -= fs_info->nodesize; +	spin_unlock(&cache->lock); +	spin_unlock(&cache->space_info->lock); + +	btrfs_put_block_group(cache); +} +  static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,  				   struct btrfs_root *root,  				   struct btrfs_path *path, int *level,  				   struct walk_control *wc)  {  	struct 
btrfs_fs_info *fs_info = root->fs_info; -	u64 root_owner;  	u64 bytenr;  	u64 ptr_gen;  	struct extent_buffer *next;  	struct extent_buffer *cur; -	struct extent_buffer *parent;  	u32 blocksize;  	int ret = 0; @@ -2690,9 +2718,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,  		btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);  		blocksize = fs_info->nodesize; -		parent = path->nodes[*level]; -		root_owner = btrfs_header_owner(parent); -  		next = btrfs_find_create_tree_block(fs_info, bytenr);  		if (IS_ERR(next))  			return PTR_ERR(next); @@ -2720,18 +2745,16 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,  					btrfs_clean_tree_block(next);  					btrfs_wait_tree_block_writeback(next);  					btrfs_tree_unlock(next); +					ret = btrfs_pin_reserved_extent(trans, +							bytenr, blocksize); +					if (ret) { +						free_extent_buffer(next); +						return ret; +					}  				} else {  					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))  						clear_extent_buffer_dirty(next); -				} - -				WARN_ON(root_owner != -					BTRFS_TREE_LOG_OBJECTID); -				ret = btrfs_pin_reserved_extent(fs_info, -							bytenr, blocksize); -				if (ret) { -					free_extent_buffer(next); -					return ret; +					unaccount_log_buffer(fs_info, bytenr);  				}  			}  			free_extent_buffer(next); @@ -2762,7 +2785,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,  				 struct walk_control *wc)  {  	struct btrfs_fs_info *fs_info = root->fs_info; -	u64 root_owner;  	int i;  	int slot;  	int ret; @@ -2775,13 +2797,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,  			WARN_ON(*level == 0);  			return 0;  		} else { -			struct extent_buffer *parent; -			if (path->nodes[*level] == root->node) -				parent = path->nodes[*level]; -			else -				parent = path->nodes[*level + 1]; - -			root_owner = btrfs_header_owner(parent);  			ret = wc->process_func(root, path->nodes[*level], wc,  				 btrfs_header_generation(path->nodes[*level]),  				 *level); @@ -2799,17 +2814,18 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,  					btrfs_clean_tree_block(next);  					btrfs_wait_tree_block_writeback(next);  					btrfs_tree_unlock(next); +					ret = btrfs_pin_reserved_extent(trans, +						     path->nodes[*level]->start, +						     path->nodes[*level]->len); +					if (ret) +						return ret;  				} else {  					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))  						clear_extent_buffer_dirty(next); -				} -				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); -				ret = btrfs_pin_reserved_extent(fs_info, -						path->nodes[*level]->start, -						path->nodes[*level]->len); -				if (ret) -					return ret; +					unaccount_log_buffer(fs_info, +						path->nodes[*level]->start); +				}  			}  			free_extent_buffer(path->nodes[*level]);  			path->nodes[*level] = NULL; @@ -2880,15 +2896,15 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,  				btrfs_clean_tree_block(next);  				btrfs_wait_tree_block_writeback(next);  				btrfs_tree_unlock(next); +				ret = btrfs_pin_reserved_extent(trans, +						next->start, next->len); +				if (ret) +					goto out;  			} else {  				if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))  					clear_extent_buffer_dirty(next); +				unaccount_log_buffer(fs_info, next->start);  			} - -			ret = btrfs_pin_reserved_extent(fs_info, next->start, -							next->len); -			if (ret) -				goto out;  		}  	} @@ -3283,8 +3299,7 @@ static void 
free_log_tree(struct btrfs_trans_handle *trans,  	clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,  			  EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); -	free_extent_buffer(log->node); -	kfree(log); +	btrfs_put_root(log);  }  /* @@ -4518,13 +4533,15 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,  static int btrfs_log_holes(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root,  			   struct btrfs_inode *inode, -			   struct btrfs_path *path) +			   struct btrfs_path *path, +			   const u64 start, +			   const u64 end)  {  	struct btrfs_fs_info *fs_info = root->fs_info;  	struct btrfs_key key;  	const u64 ino = btrfs_ino(inode);  	const u64 i_size = i_size_read(&inode->vfs_inode); -	u64 prev_extent_end = 0; +	u64 prev_extent_end = start;  	int ret;  	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) @@ -4532,16 +4549,21 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,  	key.objectid = ino;  	key.type = BTRFS_EXTENT_DATA_KEY; -	key.offset = 0; +	key.offset = start;  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);  	if (ret < 0)  		return ret; +	if (ret > 0 && path->slots[0] > 0) { +		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); +		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) +			path->slots[0]--; +	} +  	while (true) { -		struct btrfs_file_extent_item *extent;  		struct extent_buffer *leaf = path->nodes[0]; -		u64 len; +		u64 extent_end;  		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {  			ret = btrfs_next_leaf(root, path); @@ -4558,9 +4580,18 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,  		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)  			break; +		extent_end = btrfs_file_extent_end(path); +		if (extent_end <= start) +			goto next_slot; +  		/* We have a hole, log it. 
*/  		if (prev_extent_end < key.offset) { -			const u64 hole_len = key.offset - prev_extent_end; +			u64 hole_len; + +			if (key.offset >= end) +				hole_len = end - prev_extent_end; +			else +				hole_len = key.offset - prev_extent_end;  			/*  			 * Release the path to avoid deadlocks with other code @@ -4590,27 +4621,20 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,  			leaf = path->nodes[0];  		} -		extent = btrfs_item_ptr(leaf, path->slots[0], -					struct btrfs_file_extent_item); -		if (btrfs_file_extent_type(leaf, extent) == -		    BTRFS_FILE_EXTENT_INLINE) { -			len = btrfs_file_extent_ram_bytes(leaf, extent); -			prev_extent_end = ALIGN(key.offset + len, -						fs_info->sectorsize); -		} else { -			len = btrfs_file_extent_num_bytes(leaf, extent); -			prev_extent_end = key.offset + len; -		} - +		prev_extent_end = min(extent_end, end); +		if (extent_end >= end) +			break; +next_slot:  		path->slots[0]++;  		cond_resched();  	} -	if (prev_extent_end < i_size) { +	if (prev_extent_end < end && prev_extent_end < i_size) {  		u64 hole_len;  		btrfs_release_path(path); -		hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); +		hole_len = min(ALIGN(i_size, fs_info->sectorsize), end); +		hole_len -= prev_extent_end;  		ret = btrfs_insert_file_extent(trans, root->log_root,  					       ino, prev_extent_end, 0, 0,  					       hole_len, 0, hole_len, @@ -4938,6 +4962,178 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,  	return ret;  } +static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, +				   struct btrfs_inode *inode, +				   struct btrfs_key *min_key, +				   const struct btrfs_key *max_key, +				   struct btrfs_path *path, +				   struct btrfs_path *dst_path, +				   const u64 logged_isize, +				   const bool recursive_logging, +				   const int inode_only, +				   const u64 start, +				   const u64 end, +				   struct btrfs_log_ctx *ctx, +				   bool *need_log_inode_item) +{ +	struct btrfs_root *root = inode->root; +	int ins_start_slot = 0; +	int ins_nr = 0; +	int ret; + +	/* +	 * We must make sure we don't copy extent items that are entirely out of +	 * the range [start, end - 1]. This is not just an optimization to avoid +	 * copying but also needed to avoid a corruption where we end up with +	 * file extent items in the log tree that have overlapping ranges - this +	 * can happen if we race with ordered extent completion for ranges that +	 * are outside our target range. For example we copy an extent item and +	 * when we move to the next leaf, that extent was trimmed and a new one +	 * covering a subrange of it, but with a higher key, was inserted - we +	 * would then copy this other extent too, resulting in a log tree with +	 * 2 extent items that represent overlapping ranges. +	 * +	 * We can copy the entire extents at the range bondaries however, even +	 * if they cover an area outside the target range. That's ok. 
+	 */ +	while (1) { +		ret = btrfs_search_forward(root, min_key, path, trans->transid); +		if (ret < 0) +			return ret; +		if (ret > 0) { +			ret = 0; +			break; +		} +again: +		/* Note, ins_nr might be > 0 here, cleanup outside the loop */ +		if (min_key->objectid != max_key->objectid) +			break; +		if (min_key->type > max_key->type) +			break; + +		if (min_key->type == BTRFS_INODE_ITEM_KEY) +			*need_log_inode_item = false; + +		if ((min_key->type == BTRFS_INODE_REF_KEY || +		     min_key->type == BTRFS_INODE_EXTREF_KEY) && +		    inode->generation == trans->transid && +		    !recursive_logging) { +			u64 other_ino = 0; +			u64 other_parent = 0; + +			ret = btrfs_check_ref_name_override(path->nodes[0], +					path->slots[0], min_key, inode, +					&other_ino, &other_parent); +			if (ret < 0) { +				return ret; +			} else if (ret > 0 && ctx && +				   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { +				if (ins_nr > 0) { +					ins_nr++; +				} else { +					ins_nr = 1; +					ins_start_slot = path->slots[0]; +				} +				ret = copy_items(trans, inode, dst_path, path, +						 ins_start_slot, ins_nr, +						 inode_only, logged_isize); +				if (ret < 0) +					return ret; +				ins_nr = 0; + +				ret = log_conflicting_inodes(trans, root, path, +						ctx, other_ino, other_parent); +				if (ret) +					return ret; +				btrfs_release_path(path); +				goto next_key; +			} +		} + +		/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ +		if (min_key->type == BTRFS_XATTR_ITEM_KEY) { +			if (ins_nr == 0) +				goto next_slot; +			ret = copy_items(trans, inode, dst_path, path, +					 ins_start_slot, +					 ins_nr, inode_only, logged_isize); +			if (ret < 0) +				return ret; +			ins_nr = 0; +			goto next_slot; +		} + +		if (min_key->type == BTRFS_EXTENT_DATA_KEY) { +			const u64 extent_end = btrfs_file_extent_end(path); + +			if (extent_end <= start) { +				if (ins_nr > 0) { +					ret = copy_items(trans, inode, dst_path, +							 path, ins_start_slot, +							 ins_nr, inode_only, +							 logged_isize); +					if (ret < 0) +						return ret; +					ins_nr = 0; +				} +				goto next_slot; +			} +			if (extent_end >= end) { +				ins_nr++; +				if (ins_nr == 1) +					ins_start_slot = path->slots[0]; +				break; +			} +		} + +		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { +			ins_nr++; +			goto next_slot; +		} else if (!ins_nr) { +			ins_start_slot = path->slots[0]; +			ins_nr = 1; +			goto next_slot; +		} + +		ret = copy_items(trans, inode, dst_path, path, ins_start_slot, +				 ins_nr, inode_only, logged_isize); +		if (ret < 0) +			return ret; +		ins_nr = 1; +		ins_start_slot = path->slots[0]; +next_slot: +		path->slots[0]++; +		if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { +			btrfs_item_key_to_cpu(path->nodes[0], min_key, +					      path->slots[0]); +			goto again; +		} +		if (ins_nr) { +			ret = copy_items(trans, inode, dst_path, path, +					 ins_start_slot, ins_nr, inode_only, +					 logged_isize); +			if (ret < 0) +				return ret; +			ins_nr = 0; +		} +		btrfs_release_path(path); +next_key: +		if (min_key->offset < (u64)-1) { +			min_key->offset++; +		} else if (min_key->type < max_key->type) { +			min_key->type++; +			min_key->offset = 0; +		} else { +			break; +		} +	} +	if (ins_nr) +		ret = copy_items(trans, inode, dst_path, path, ins_start_slot, +				 ins_nr, inode_only, logged_isize); + +	return ret; +} +  /* log a single inode in the tree log.   * At least one parent directory for this inode must exist in the tree   * or be logged already. 
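The long comment at the top of copy_inode_items_to_log() above reduces to a three-way decision per file extent item against the logged range [start, end): an item that ends at or before start is skipped, an item that reaches end is the last one copied, and everything else is copied with the walk continuing. A minimal standalone sketch of that decision follows; classify() and the sample numbers are made up for illustration, with extent_end standing in for what btrfs_file_extent_end() returns.

#include <stdint.h>
#include <stdio.h>

enum copy_action { SKIP_ITEM, COPY_AND_CONTINUE, COPY_AND_STOP };

/*
 * Decide what to do with one file extent item while logging the byte
 * range [start, end).  Mirrors the extent_end checks added in
 * copy_inode_items_to_log(): skipping avoids logging overlapping file
 * extent items for ranges this writer is not responsible for.
 */
static enum copy_action classify(uint64_t extent_end, uint64_t start, uint64_t end)
{
	if (extent_end <= start)
		return SKIP_ITEM;        /* entirely before the range */
	if (extent_end >= end)
		return COPY_AND_STOP;    /* reaches the range end, copy and stop */
	return COPY_AND_CONTINUE;        /* inside the range, keep walking */
}

int main(void)
{
	const uint64_t start = 4096, end = 65536;
	const uint64_t samples[] = { 4096, 8192, 131072 };
	const char *names[] = { "skip", "copy+continue", "copy+stop" };

	for (int i = 0; i < 3; i++)
		printf("extent_end=%6llu -> %s\n",
		       (unsigned long long)samples[i],
		       names[classify(samples[i], start, end)]);
	return 0;
}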
@@ -4955,8 +5151,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,  static int btrfs_log_inode(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root, struct btrfs_inode *inode,  			   int inode_only, -			   const loff_t start, -			   const loff_t end, +			   u64 start, +			   u64 end,  			   struct btrfs_log_ctx *ctx)  {  	struct btrfs_fs_info *fs_info = root->fs_info; @@ -4967,9 +5163,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,  	struct btrfs_root *log = root->log_root;  	int err = 0;  	int ret; -	int nritems; -	int ins_start_slot = 0; -	int ins_nr;  	bool fast_search = false;  	u64 ino = btrfs_ino(inode);  	struct extent_map_tree *em_tree = &inode->extent_tree; @@ -4987,6 +5180,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,  		return -ENOMEM;  	} +	start = ALIGN_DOWN(start, fs_info->sectorsize); +	end = ALIGN(end, fs_info->sectorsize); +  	min_key.objectid = ino;  	min_key.type = BTRFS_INODE_ITEM_KEY;  	min_key.offset = 0; @@ -5100,139 +5296,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,  		goto out_unlock;  	} -	while (1) { -		ins_nr = 0; -		ret = btrfs_search_forward(root, &min_key, -					   path, trans->transid); -		if (ret < 0) { -			err = ret; -			goto out_unlock; -		} -		if (ret != 0) -			break; -again: -		/* note, ins_nr might be > 0 here, cleanup outside the loop */ -		if (min_key.objectid != ino) -			break; -		if (min_key.type > max_key.type) -			break; - -		if (min_key.type == BTRFS_INODE_ITEM_KEY) -			need_log_inode_item = false; - -		if ((min_key.type == BTRFS_INODE_REF_KEY || -		     min_key.type == BTRFS_INODE_EXTREF_KEY) && -		    inode->generation == trans->transid && -		    !recursive_logging) { -			u64 other_ino = 0; -			u64 other_parent = 0; - -			ret = btrfs_check_ref_name_override(path->nodes[0], -					path->slots[0], &min_key, inode, -					&other_ino, &other_parent); -			if (ret < 0) { -				err = ret; -				goto out_unlock; -			} else if (ret > 0 && ctx && -				   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { -				if (ins_nr > 0) { -					ins_nr++; -				} else { -					ins_nr = 1; -					ins_start_slot = path->slots[0]; -				} -				ret = copy_items(trans, inode, dst_path, path, -						 ins_start_slot, -						 ins_nr, inode_only, -						 logged_isize); -				if (ret < 0) { -					err = ret; -					goto out_unlock; -				} -				ins_nr = 0; - -				err = log_conflicting_inodes(trans, root, path, -						ctx, other_ino, other_parent); -				if (err) -					goto out_unlock; -				btrfs_release_path(path); -				goto next_key; -			} -		} - -		/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ -		if (min_key.type == BTRFS_XATTR_ITEM_KEY) { -			if (ins_nr == 0) -				goto next_slot; -			ret = copy_items(trans, inode, dst_path, path, -					 ins_start_slot, -					 ins_nr, inode_only, logged_isize); -			if (ret < 0) { -				err = ret; -				goto out_unlock; -			} -			ins_nr = 0; -			goto next_slot; -		} - -		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { -			ins_nr++; -			goto next_slot; -		} else if (!ins_nr) { -			ins_start_slot = path->slots[0]; -			ins_nr = 1; -			goto next_slot; -		} - -		ret = copy_items(trans, inode, dst_path, path, -				 ins_start_slot, ins_nr, inode_only, -				 logged_isize); -		if (ret < 0) { -			err = ret; -			goto out_unlock; -		} -		ins_nr = 1; -		ins_start_slot = path->slots[0]; -next_slot: - -		nritems = btrfs_header_nritems(path->nodes[0]); -		path->slots[0]++; -		if (path->slots[0] < nritems) { -			btrfs_item_key_to_cpu(path->nodes[0], 
&min_key, -					      path->slots[0]); -			goto again; -		} -		if (ins_nr) { -			ret = copy_items(trans, inode, dst_path, path, -					 ins_start_slot, -					 ins_nr, inode_only, logged_isize); -			if (ret < 0) { -				err = ret; -				goto out_unlock; -			} -			ins_nr = 0; -		} -		btrfs_release_path(path); -next_key: -		if (min_key.offset < (u64)-1) { -			min_key.offset++; -		} else if (min_key.type < max_key.type) { -			min_key.type++; -			min_key.offset = 0; -		} else { -			break; -		} -	} -	if (ins_nr) { -		ret = copy_items(trans, inode, dst_path, path, -				 ins_start_slot, ins_nr, inode_only, -				 logged_isize); -		if (ret < 0) { -			err = ret; -			goto out_unlock; -		} -		ins_nr = 0; -	} +	err = copy_inode_items_to_log(trans, inode, &min_key, &max_key, +				      path, dst_path, logged_isize, +				      recursive_logging, inode_only, +				      start, end, ctx, &need_log_inode_item); +	if (err) +		goto out_unlock;  	btrfs_release_path(path);  	btrfs_release_path(dst_path); @@ -5243,7 +5312,7 @@ next_key:  	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {  		btrfs_release_path(path);  		btrfs_release_path(dst_path); -		err = btrfs_log_holes(trans, root, inode, path); +		err = btrfs_log_holes(trans, root, inode, path, start, end);  		if (err)  			goto out_unlock;  	} @@ -6145,7 +6214,7 @@ again:  		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)  			break; -		log = btrfs_read_fs_root(log_root_tree, &found_key); +		log = btrfs_read_tree_root(log_root_tree, &found_key);  		if (IS_ERR(log)) {  			ret = PTR_ERR(log);  			btrfs_handle_fs_error(fs_info, ret, @@ -6157,7 +6226,7 @@ again:  		tmp_key.type = BTRFS_ROOT_ITEM_KEY;  		tmp_key.offset = (u64)-1; -		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); +		wc.replay_dest = btrfs_get_fs_root(fs_info, &tmp_key, true);  		if (IS_ERR(wc.replay_dest)) {  			ret = PTR_ERR(wc.replay_dest); @@ -6173,12 +6242,10 @@ again:  			 * each subsequent pass.  			 
*/  			if (ret == -ENOENT) -				ret = btrfs_pin_extent_for_log_replay(fs_info, +				ret = btrfs_pin_extent_for_log_replay(trans,  							log->node->start,  							log->node->len); -			free_extent_buffer(log->node); -			free_extent_buffer(log->commit_root); -			kfree(log); +			btrfs_put_root(log);  			if (!ret)  				goto next; @@ -6214,9 +6281,8 @@ again:  		}  		wc.replay_dest->log_root = NULL; -		free_extent_buffer(log->node); -		free_extent_buffer(log->commit_root); -		kfree(log); +		btrfs_put_root(wc.replay_dest); +		btrfs_put_root(log);  		if (ret)  			goto error; @@ -6247,10 +6313,9 @@ next:  	if (ret)  		return ret; -	free_extent_buffer(log_root_tree->node);  	log_root_tree->log_root = NULL;  	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); -	kfree(log_root_tree); +	btrfs_put_root(log_root_tree);  	return 0;  error: diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 76b84f2397b1..76671a6bcb61 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -246,9 +246,53 @@ out:  	return ret;  } -int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, -			    int (*check_func)(struct btrfs_fs_info *, u8 *, u8, -					      u64)) +/* + * Check if there's an matching subvolume for given UUID + * + * Return: + * 0	check succeeded, the entry is not outdated + * > 0	if the check failed, the caller should remove the entry + * < 0	if an error occurred + */ +static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, +				       u8 *uuid, u8 type, u64 subvolid) +{ +	struct btrfs_key key; +	int ret = 0; +	struct btrfs_root *subvol_root; + +	if (type != BTRFS_UUID_KEY_SUBVOL && +	    type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) +		goto out; + +	key.objectid = subvolid; +	key.type = BTRFS_ROOT_ITEM_KEY; +	key.offset = (u64)-1; +	subvol_root = btrfs_get_fs_root(fs_info, &key, true); +	if (IS_ERR(subvol_root)) { +		ret = PTR_ERR(subvol_root); +		if (ret == -ENOENT) +			ret = 1; +		goto out; +	} + +	switch (type) { +	case BTRFS_UUID_KEY_SUBVOL: +		if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) +			ret = 1; +		break; +	case BTRFS_UUID_KEY_RECEIVED_SUBVOL: +		if (memcmp(uuid, subvol_root->root_item.received_uuid, +			   BTRFS_UUID_SIZE)) +			ret = 1; +		break; +	} +	btrfs_put_root(subvol_root); +out: +	return ret; +} + +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info)  {  	struct btrfs_root *root = fs_info->uuid_root;  	struct btrfs_key key; @@ -278,6 +322,10 @@ again_search_slot:  	}  	while (1) { +		if (btrfs_fs_closing(fs_info)) { +			ret = -EINTR; +			goto out; +		}  		cond_resched();  		leaf = path->nodes[0];  		slot = path->slots[0]; @@ -305,7 +353,8 @@ again_search_slot:  			read_extent_buffer(leaf, &subid_le, offset,  					   sizeof(subid_le));  			subid_cpu = le64_to_cpu(subid_le); -			ret = check_func(fs_info, uuid, key.type, subid_cpu); +			ret = btrfs_check_uuid_tree_entry(fs_info, uuid, +							  key.type, subid_cpu);  			if (ret < 0)  				goto out;  			if (ret > 0) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9cfc668f91f4..c1909e5f4506 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6,7 +6,6 @@  #include <linux/sched.h>  #include <linux/bio.h>  #include <linux/slab.h> -#include <linux/buffer_head.h>  #include <linux/blkdev.h>  #include <linux/ratelimit.h>  #include <linux/kthread.h> @@ -500,7 +499,7 @@ static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(  static int  btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,  		      int flush, struct block_device **bdev, -		      
struct buffer_head **bh) +		      struct btrfs_super_block **disk_super)  {  	int ret; @@ -519,9 +518,9 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,  		goto error;  	}  	invalidate_bdev(*bdev); -	*bh = btrfs_read_dev_super(*bdev); -	if (IS_ERR(*bh)) { -		ret = PTR_ERR(*bh); +	*disk_super = btrfs_read_dev_super(*bdev); +	if (IS_ERR(*disk_super)) { +		ret = PTR_ERR(*disk_super);  		blkdev_put(*bdev, flags);  		goto error;  	} @@ -530,7 +529,6 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,  error:  	*bdev = NULL; -	*bh = NULL;  	return ret;  } @@ -611,7 +609,6 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,  {  	struct request_queue *q;  	struct block_device *bdev; -	struct buffer_head *bh;  	struct btrfs_super_block *disk_super;  	u64 devid;  	int ret; @@ -622,17 +619,16 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,  		return -EINVAL;  	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, -				    &bdev, &bh); +				    &bdev, &disk_super);  	if (ret)  		return ret; -	disk_super = (struct btrfs_super_block *)bh->b_data;  	devid = btrfs_stack_device_id(&disk_super->dev_item);  	if (devid != device->devid) -		goto error_brelse; +		goto error_free_page;  	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) -		goto error_brelse; +		goto error_free_page;  	device->generation = btrfs_super_generation(disk_super); @@ -641,7 +637,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,  		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {  			pr_err(  		"BTRFS: Invalid seeding and uuid-changed device detected\n"); -			goto error_brelse; +			goto error_free_page;  		}  		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); @@ -667,12 +663,12 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,  		fs_devices->rw_devices++;  		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);  	} -	brelse(bh); +	btrfs_release_disk_super(disk_super);  	return 0; -error_brelse: -	brelse(bh); +error_free_page: +	btrfs_release_disk_super(disk_super);  	blkdev_put(bdev, flags);  	return -EINVAL; @@ -1209,6 +1205,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,  	fs_devices->opened = 1;  	fs_devices->latest_bdev = latest_dev->bdev;  	fs_devices->total_rw_bytes = 0; +	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;  out:  	return ret;  } @@ -1247,9 +1244,10 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,  	return ret;  } -static void btrfs_release_disk_super(struct page *page) +void btrfs_release_disk_super(struct btrfs_super_block *super)  { -	kunmap(page); +	struct page *page = virt_to_page(super); +  	put_page(page);  } @@ -1277,17 +1275,17 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,  	*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,  				   index, GFP_KERNEL); -	if (IS_ERR_OR_NULL(*page)) +	if (IS_ERR(*page))  		return 1; -	p = kmap(*page); +	p = page_address(*page);  	/* align our pointer to the offset of the super block */  	*disk_super = p + offset_in_page(bytenr);  	if (btrfs_super_bytenr(*disk_super) != bytenr ||  	    btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { -		btrfs_release_disk_super(*page); +		btrfs_release_disk_super(p);  		return 1;  	} @@ -1350,7 +1348,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,  			btrfs_free_stale_devices(path, device);  	} -	btrfs_release_disk_super(page); +	
btrfs_release_disk_super(disk_super);  error_bdev_put:  	blkdev_put(bdev, flags); @@ -1383,6 +1381,59 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,  	return false;  } +static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) +{ +	switch (device->fs_devices->chunk_alloc_policy) { +	case BTRFS_CHUNK_ALLOC_REGULAR: +		/* +		 * We don't want to overwrite the superblock on the drive nor +		 * any area used by the boot loader (grub for example), so we +		 * make sure to start at an offset of at least 1MB. +		 */ +		return max_t(u64, start, SZ_1M); +	default: +		BUG(); +	} +} + +/** + * dev_extent_hole_check - check if specified hole is suitable for allocation + * @device:	the device which we have the hole + * @hole_start: starting position of the hole + * @hole_size:	the size of the hole + * @num_bytes:	the size of the free space that we need + * + * This function may modify @hole_start and @hole_end to reflect the suitable + * position for allocation. Returns 1 if hole position is updated, 0 otherwise. + */ +static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, +				  u64 *hole_size, u64 num_bytes) +{ +	bool changed = false; +	u64 hole_end = *hole_start + *hole_size; + +	/* +	 * Check before we set max_hole_start, otherwise we could end up +	 * sending back this offset anyway. +	 */ +	if (contains_pending_extent(device, hole_start, *hole_size)) { +		if (hole_end >= *hole_start) +			*hole_size = hole_end - *hole_start; +		else +			*hole_size = 0; +		changed = true; +	} + +	switch (device->fs_devices->chunk_alloc_policy) { +	case BTRFS_CHUNK_ALLOC_REGULAR: +		/* No extra check */ +		break; +	default: +		BUG(); +	} + +	return changed; +}  /*   * find_free_dev_extent_start - find free space in the specified device @@ -1429,12 +1480,7 @@ static int find_free_dev_extent_start(struct btrfs_device *device,  	int slot;  	struct extent_buffer *l; -	/* -	 * We don't want to overwrite the superblock on the drive nor any area -	 * used by the boot loader (grub for example), so we make sure to start -	 * at an offset of at least 1MB. -	 */ -	search_start = max_t(u64, search_start, SZ_1M); +	search_start = dev_extent_search_start(device, search_start);  	path = btrfs_alloc_path();  	if (!path) @@ -1492,18 +1538,8 @@ again:  		if (key.offset > search_start) {  			hole_size = key.offset - search_start; - -			/* -			 * Have to check before we set max_hole_start, otherwise -			 * we could end up sending back this offset anyway. 
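As a self-contained illustration of the regular policy's search-start clamp introduced in dev_extent_search_start() above (plain user-space C, not kernel code; SZ_1M spelled out locally):

#include <stdint.h>
#include <stdio.h>

#define SZ_1M (1024ULL * 1024ULL)

/* Mirror of the BTRFS_CHUNK_ALLOC_REGULAR case: never hand out space
 * below 1 MiB, where the primary superblock and the boot loader live. */
static uint64_t regular_search_start(uint64_t start)
{
	return start > SZ_1M ? start : SZ_1M;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)regular_search_start(64 * 1024)); /* 1048576 */
	printf("%llu\n", (unsigned long long)regular_search_start(8 * SZ_1M)); /* 8388608 */
	return 0;
}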
-			 */ -			if (contains_pending_extent(device, &search_start, -						    hole_size)) { -				if (key.offset >= search_start) -					hole_size = key.offset - search_start; -				else -					hole_size = 0; -			} +			dev_extent_hole_check(device, &search_start, &hole_size, +					      num_bytes);  			if (hole_size > max_hole_size) {  				max_hole_start = search_start; @@ -1542,8 +1578,8 @@ next:  	 */  	if (search_end > search_start) {  		hole_size = search_end - search_start; - -		if (contains_pending_extent(device, &search_start, hole_size)) { +		if (dev_extent_hole_check(device, &search_start, &hole_size, +					  num_bytes)) {  			btrfs_release_path(path);  			goto again;  		} @@ -1949,6 +1985,46 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)  	return num_devices;  } +static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, +				      struct block_device *bdev, +				      const char *device_path) +{ +	struct btrfs_super_block *disk_super; +	int copy_num; + +	if (!bdev) +		return; + +	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { +		struct page *page; +		int ret; + +		disk_super = btrfs_read_dev_one_super(bdev, copy_num); +		if (IS_ERR(disk_super)) +			continue; + +		memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + +		page = virt_to_page(disk_super); +		set_page_dirty(page); +		lock_page(page); +		/* write_on_page() unlocks the page */ +		ret = write_one_page(page); +		if (ret) +			btrfs_warn(fs_info, +				"error clearing superblock number %d (%d)", +				copy_num, ret); +		btrfs_release_disk_super(disk_super); + +	} + +	/* Notify udev that device has changed */ +	btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + +	/* Update ctime/mtime for device path for libblkid */ +	update_dev_time(device_path); +} +  int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,  		u64 devid)  { @@ -2054,7 +2130,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,  	if (device->bdev) {  		cur_devices->open_devices--;  		/* remove sysfs entry */ -		btrfs_sysfs_rm_device_link(fs_devices, device); +		btrfs_sysfs_remove_devices_dir(fs_devices, device);  	}  	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; @@ -2067,7 +2143,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,  	 * supers and free the device.  	 */  	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) -		btrfs_scratch_superblocks(device->bdev, device->name->str); +		btrfs_scratch_superblocks(fs_info, device->bdev, +					  device->name->str);  	btrfs_close_bdev(device);  	synchronize_rcu(); @@ -2135,7 +2212,8 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)  	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {  		/* zero out the old super if it is writable */ -		btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); +		btrfs_scratch_superblocks(fs_info, srcdev->bdev, +					  srcdev->name->str);  	}  	btrfs_close_bdev(srcdev); @@ -2174,7 +2252,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)  	mutex_lock(&fs_devices->device_list_mutex); -	btrfs_sysfs_rm_device_link(fs_devices, tgtdev); +	btrfs_sysfs_remove_devices_dir(fs_devices, tgtdev);  	if (tgtdev->bdev)  		fs_devices->open_devices--; @@ -2194,7 +2272,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)  	 * is already out of device list, so we don't have to hold  	 * the device_list_mutex lock.  	 
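The new btrfs_scratch_superblocks() above writes the superblock copies back through the page cache instead of buffer heads. A short sketch of that writeback idiom, assuming the standard mm API; the helper name is made up for illustration:

	/* write_one_page() takes a locked, dirty page, submits the write,
	 * waits for it to complete and unlocks the page, so the caller only
	 * needs to dirty and lock the page first. */
	static int flush_super_page(struct page *page)	/* hypothetical helper */
	{
		set_page_dirty(page);
		lock_page(page);
		return write_one_page(page);	/* unlocks the page */
	}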
*/ -	btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); +	btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, +				  tgtdev->name->str);  	btrfs_close_bdev(tgtdev);  	synchronize_rcu(); @@ -2209,14 +2288,13 @@ static struct btrfs_device *btrfs_find_device_by_path(  	u64 devid;  	u8 *dev_uuid;  	struct block_device *bdev; -	struct buffer_head *bh;  	struct btrfs_device *device;  	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, -				    fs_info->bdev_holder, 0, &bdev, &bh); +				    fs_info->bdev_holder, 0, &bdev, &disk_super);  	if (ret)  		return ERR_PTR(ret); -	disk_super = (struct btrfs_super_block *)bh->b_data; +  	devid = btrfs_stack_device_id(&disk_super->dev_item);  	dev_uuid = disk_super->dev_item.uuid;  	if (btrfs_fs_incompat(fs_info, METADATA_UUID)) @@ -2226,7 +2304,7 @@ static struct btrfs_device *btrfs_find_device_by_path(  		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,  					   disk_super->fsid, true); -	brelse(bh); +	btrfs_release_disk_super(disk_super);  	if (!device)  		device = ERR_PTR(-ENOENT);  	blkdev_put(bdev, FMODE_READ); @@ -2522,7 +2600,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path  				    orig_super_num_devices + 1);  	/* add sysfs device entry */ -	btrfs_sysfs_add_device_link(fs_devices, device); +	btrfs_sysfs_add_devices_dir(fs_devices, device);  	/*  	 * we've got more storage, clear any full flags on the space @@ -2590,7 +2668,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path  	return ret;  error_sysfs: -	btrfs_sysfs_rm_device_link(fs_devices, device); +	btrfs_sysfs_remove_devices_dir(fs_devices, device);  	mutex_lock(&fs_info->fs_devices->device_list_mutex);  	mutex_lock(&fs_info->chunk_mutex);  	list_del_rcu(&device->dev_list); @@ -3723,13 +3801,25 @@ static inline int balance_need_close(struct btrfs_fs_info *fs_info)  		 atomic_read(&fs_info->balance_cancel_req) == 0);  } -/* Non-zero return value signifies invalidity */ -static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, -		u64 allowed) +/* + * Validate target profile against allowed profiles and return true if it's OK. + * Otherwise print the error message and return false. 
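A standalone illustration of the convert-target rule this helper centralizes: the target must be a valid profile and must not set any bit outside the allowed mask. The sketch simplifies "valid" to "exactly one bit set" and uses invented bit values, so it is only an approximation of the real check:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EX_RAID1	(1ULL << 0)	/* example bits, not the real flags */
#define EX_RAID1C3	(1ULL << 1)

static bool convert_target_ok(uint64_t target, uint64_t allowed)
{
	bool single_profile = target && !(target & (target - 1));

	return single_profile && (target & ~allowed) == 0;
}

int main(void)
{
	uint64_t allowed = EX_RAID1;	/* e.g. only two devices present */

	printf("%d\n", convert_target_ok(EX_RAID1, allowed));		/* 1 */
	printf("%d\n", convert_target_ok(EX_RAID1C3, allowed));	/* 0 */
	return 0;
}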
+ */ +static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, +		const struct btrfs_balance_args *bargs, +		u64 allowed, const char *type)  { -	return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) && -		(!alloc_profile_is_valid(bctl_arg->target, 1) || -		 (bctl_arg->target & ~allowed))); +	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) +		return true; + +	/* Profile is valid and does not have bits outside of the allowed set */ +	if (alloc_profile_is_valid(bargs->target, 1) && +	    (bargs->target & ~allowed) == 0) +		return true; + +	btrfs_err(fs_info, "balance: invalid convert %s profile %s", +			type, btrfs_bg_type_to_raid_name(bargs->target)); +	return false;  }  /* @@ -3904,7 +3994,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,  	if (btrfs_fs_closing(fs_info) ||  	    atomic_read(&fs_info->balance_pause_req) || -	    atomic_read(&fs_info->balance_cancel_req)) { +	    btrfs_should_cancel_balance(fs_info)) {  		ret = -EINVAL;  		goto out;  	} @@ -3945,24 +4035,9 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,  		if (num_devices >= btrfs_raid_array[i].devs_min)  			allowed |= btrfs_raid_array[i].bg_flag; -	if (validate_convert_profile(&bctl->data, allowed)) { -		btrfs_err(fs_info, -			  "balance: invalid convert data profile %s", -			  btrfs_bg_type_to_raid_name(bctl->data.target)); -		ret = -EINVAL; -		goto out; -	} -	if (validate_convert_profile(&bctl->meta, allowed)) { -		btrfs_err(fs_info, -			  "balance: invalid convert metadata profile %s", -			  btrfs_bg_type_to_raid_name(bctl->meta.target)); -		ret = -EINVAL; -		goto out; -	} -	if (validate_convert_profile(&bctl->sys, allowed)) { -		btrfs_err(fs_info, -			  "balance: invalid convert system profile %s", -			  btrfs_bg_type_to_raid_name(bctl->sys.target)); +	if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || +	    !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || +	    !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {  		ret = -EINVAL;  		goto out;  	} @@ -4274,7 +4349,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)  	return 0;  } -static int btrfs_uuid_scan_kthread(void *data) +int btrfs_uuid_scan_kthread(void *data)  {  	struct btrfs_fs_info *fs_info = data;  	struct btrfs_root *root = fs_info->tree_root; @@ -4286,6 +4361,7 @@ static int btrfs_uuid_scan_kthread(void *data)  	struct btrfs_root_item root_item;  	u32 item_size;  	struct btrfs_trans_handle *trans = NULL; +	bool closing = false;  	path = btrfs_alloc_path();  	if (!path) { @@ -4298,6 +4374,10 @@ static int btrfs_uuid_scan_kthread(void *data)  	key.offset = 0;  	while (1) { +		if (btrfs_fs_closing(fs_info)) { +			closing = true; +			break; +		}  		ret = btrfs_search_forward(root, &key, path,  				BTRFS_OLDEST_GENERATION);  		if (ret) { @@ -4397,76 +4477,12 @@ out:  		btrfs_end_transaction(trans);  	if (ret)  		btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); -	else +	else if (!closing)  		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);  	up(&fs_info->uuid_tree_rescan_sem);  	return 0;  } -/* - * Callback for btrfs_uuid_tree_iterate(). - * returns: - * 0	check succeeded, the entry is not outdated. - * < 0	if an error occurred. - * > 0	if the check failed, which means the caller shall remove the entry. 
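For context, a sketch of how an iterator is expected to consume this tri-state return once the check lives in uuid-tree.c; the removal helper named below is hypothetical and only stands in for whatever deletes the stale item:

	ret = btrfs_check_uuid_tree_entry(fs_info, uuid, key.type, subid_cpu);
	if (ret < 0)
		goto out;		/* hard error, stop iterating */
	if (ret > 0) {
		/* stale entry: drop it from the uuid tree (hypothetical helper) */
		ret = remove_uuid_tree_entry(fs_info, uuid, key.type, subid_cpu);
		if (ret)
			goto out;
	}
	/* ret == 0: entry still matches the subvolume, keep it */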
- */ -static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, -				       u8 *uuid, u8 type, u64 subid) -{ -	struct btrfs_key key; -	int ret = 0; -	struct btrfs_root *subvol_root; - -	if (type != BTRFS_UUID_KEY_SUBVOL && -	    type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) -		goto out; - -	key.objectid = subid; -	key.type = BTRFS_ROOT_ITEM_KEY; -	key.offset = (u64)-1; -	subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); -	if (IS_ERR(subvol_root)) { -		ret = PTR_ERR(subvol_root); -		if (ret == -ENOENT) -			ret = 1; -		goto out; -	} - -	switch (type) { -	case BTRFS_UUID_KEY_SUBVOL: -		if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) -			ret = 1; -		break; -	case BTRFS_UUID_KEY_RECEIVED_SUBVOL: -		if (memcmp(uuid, subvol_root->root_item.received_uuid, -			   BTRFS_UUID_SIZE)) -			ret = 1; -		break; -	} - -out: -	return ret; -} - -static int btrfs_uuid_rescan_kthread(void *data) -{ -	struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; -	int ret; - -	/* -	 * 1st step is to iterate through the existing UUID tree and -	 * to delete all entries that contain outdated data. -	 * 2nd step is to add all missing entries to the UUID tree. -	 */ -	ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); -	if (ret < 0) { -		btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); -		up(&fs_info->uuid_tree_rescan_sem); -		return ret; -	} -	return btrfs_uuid_scan_kthread(data); -} -  int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)  {  	struct btrfs_trans_handle *trans; @@ -4509,22 +4525,6 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)  	return 0;  } -int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) -{ -	struct task_struct *task; - -	down(&fs_info->uuid_tree_rescan_sem); -	task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); -	if (IS_ERR(task)) { -		/* fs_info->update_uuid_tree_gen remains 0 in all error case */ -		btrfs_warn(fs_info, "failed to start uuid_rescan task"); -		up(&fs_info->uuid_tree_rescan_sem); -		return PTR_ERR(task); -	} - -	return 0; -} -  /*   * shrinking a device means finding all of the device extents past   * the new size, and then following the back refs to the chunks. @@ -4777,96 +4777,111 @@ static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)  	btrfs_set_fs_incompat(info, RAID1C34);  } -static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, -			       u64 start, u64 type) -{ -	struct btrfs_fs_info *info = trans->fs_info; -	struct btrfs_fs_devices *fs_devices = info->fs_devices; -	struct btrfs_device *device; -	struct map_lookup *map = NULL; -	struct extent_map_tree *em_tree; -	struct extent_map *em; -	struct btrfs_device_info *devices_info = NULL; -	u64 total_avail; -	int num_stripes;	/* total number of stripes to allocate */ -	int data_stripes;	/* number of stripes that count for -				   block group size */ -	int sub_stripes;	/* sub_stripes info for map */ -	int dev_stripes;	/* stripes per dev */ -	int devs_max;		/* max devs to use */ -	int devs_min;		/* min devs needed */ -	int devs_increment;	/* ndevs has to be a multiple of this */ -	int ncopies;		/* how many copies to data has */ -	int nparity;		/* number of stripes worth of bytes to -				   store parity information */ -	int ret; +/* + * Structure used internally for __btrfs_alloc_chunk() function. + * Wraps needed parameters. 
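One consequence of the new control structure is dev_extent_min, the smallest device extent a profile can possibly use. A tiny standalone computation; BTRFS_STRIPE_LEN is 64K per volumes.h, and the dev_stripes values are the usual profile parameters, stated here as assumptions:

#include <stdio.h>

#define BTRFS_STRIPE_LEN (64 * 1024ULL)

int main(void)
{
	/* dev_extent_min = BTRFS_STRIPE_LEN * dev_stripes */
	printf("single: %llu\n", BTRFS_STRIPE_LEN * 1);	/* 65536  */
	printf("dup:    %llu\n", BTRFS_STRIPE_LEN * 2);	/* 131072 */
	return 0;
}

Devices whose total_avail falls below this value are skipped in gather_device_info() before any dev-extent search is attempted.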
+ */ +struct alloc_chunk_ctl { +	u64 start; +	u64 type; +	/* Total number of stripes to allocate */ +	int num_stripes; +	/* sub_stripes info for map */ +	int sub_stripes; +	/* Stripes per device */ +	int dev_stripes; +	/* Maximum number of devices to use */ +	int devs_max; +	/* Minimum number of devices to use */ +	int devs_min; +	/* ndevs has to be a multiple of this */ +	int devs_increment; +	/* Number of copies */ +	int ncopies; +	/* Number of stripes worth of bytes to store parity information */ +	int nparity;  	u64 max_stripe_size;  	u64 max_chunk_size; +	u64 dev_extent_min;  	u64 stripe_size;  	u64 chunk_size;  	int ndevs; -	int i; -	int j; -	int index; - -	BUG_ON(!alloc_profile_is_valid(type, 0)); - -	if (list_empty(&fs_devices->alloc_list)) { -		if (btrfs_test_opt(info, ENOSPC_DEBUG)) -			btrfs_debug(info, "%s: no writable device", __func__); -		return -ENOSPC; -	} - -	index = btrfs_bg_flags_to_raid_index(type); +}; -	sub_stripes = btrfs_raid_array[index].sub_stripes; -	dev_stripes = btrfs_raid_array[index].dev_stripes; -	devs_max = btrfs_raid_array[index].devs_max; -	if (!devs_max) -		devs_max = BTRFS_MAX_DEVS(info); -	devs_min = btrfs_raid_array[index].devs_min; -	devs_increment = btrfs_raid_array[index].devs_increment; -	ncopies = btrfs_raid_array[index].ncopies; -	nparity = btrfs_raid_array[index].nparity; +static void init_alloc_chunk_ctl_policy_regular( +				struct btrfs_fs_devices *fs_devices, +				struct alloc_chunk_ctl *ctl) +{ +	u64 type = ctl->type;  	if (type & BTRFS_BLOCK_GROUP_DATA) { -		max_stripe_size = SZ_1G; -		max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; +		ctl->max_stripe_size = SZ_1G; +		ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;  	} else if (type & BTRFS_BLOCK_GROUP_METADATA) { -		/* for larger filesystems, use larger metadata chunks */ +		/* For larger filesystems, use larger metadata chunks */  		if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) -			max_stripe_size = SZ_1G; +			ctl->max_stripe_size = SZ_1G;  		else -			max_stripe_size = SZ_256M; -		max_chunk_size = max_stripe_size; +			ctl->max_stripe_size = SZ_256M; +		ctl->max_chunk_size = ctl->max_stripe_size;  	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { -		max_stripe_size = SZ_32M; -		max_chunk_size = 2 * max_stripe_size; -		devs_max = min_t(int, devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); +		ctl->max_stripe_size = SZ_32M; +		ctl->max_chunk_size = 2 * ctl->max_stripe_size; +		ctl->devs_max = min_t(int, ctl->devs_max, +				      BTRFS_MAX_DEVS_SYS_CHUNK);  	} else { -		btrfs_err(info, "invalid chunk type 0x%llx requested", -		       type);  		BUG();  	}  	/* We don't want a chunk larger than 10% of writable space */ -	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), -			     max_chunk_size); +	ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), +				  ctl->max_chunk_size); +	ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; +} + +static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, +				 struct alloc_chunk_ctl *ctl) +{ +	int index = btrfs_bg_flags_to_raid_index(ctl->type); + +	ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; +	ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; +	ctl->devs_max = btrfs_raid_array[index].devs_max; +	if (!ctl->devs_max) +		ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); +	ctl->devs_min = btrfs_raid_array[index].devs_min; +	ctl->devs_increment = btrfs_raid_array[index].devs_increment; +	ctl->ncopies = btrfs_raid_array[index].ncopies; +	ctl->nparity = btrfs_raid_array[index].nparity; +	ctl->ndevs = 0; + +	
switch (fs_devices->chunk_alloc_policy) { +	case BTRFS_CHUNK_ALLOC_REGULAR: +		init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); +		break; +	default: +		BUG(); +	} +} -	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), -			       GFP_NOFS); -	if (!devices_info) -		return -ENOMEM; +static int gather_device_info(struct btrfs_fs_devices *fs_devices, +			      struct alloc_chunk_ctl *ctl, +			      struct btrfs_device_info *devices_info) +{ +	struct btrfs_fs_info *info = fs_devices->fs_info; +	struct btrfs_device *device; +	u64 total_avail; +	u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; +	int ret; +	int ndevs = 0; +	u64 max_avail; +	u64 dev_offset;  	/*  	 * in the first pass through the devices list, we gather information  	 * about the available holes on each device.  	 */ -	ndevs = 0;  	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { -		u64 max_avail; -		u64 dev_offset; -  		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {  			WARN(1, KERN_ERR  			       "BTRFS: read-only device in alloc_list\n"); @@ -4884,24 +4899,23 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  			total_avail = 0;  		/* If there is no space on this device, skip it. */ -		if (total_avail == 0) +		if (total_avail < ctl->dev_extent_min)  			continue; -		ret = find_free_dev_extent(device, -					   max_stripe_size * dev_stripes, -					   &dev_offset, &max_avail); +		ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, +					   &max_avail);  		if (ret && ret != -ENOSPC) -			goto error; +			return ret;  		if (ret == 0) -			max_avail = max_stripe_size * dev_stripes; +			max_avail = dev_extent_want; -		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) { +		if (max_avail < ctl->dev_extent_min) {  			if (btrfs_test_opt(info, ENOSPC_DEBUG))  				btrfs_debug(info, -			"%s: devid %llu has no free space, have=%llu want=%u", +			"%s: devid %llu has no free space, have=%llu want=%llu",  					    __func__, device->devid, max_avail, -					    BTRFS_STRIPE_LEN * dev_stripes); +					    ctl->dev_extent_min);  			continue;  		} @@ -4916,6 +4930,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  		devices_info[ndevs].dev = device;  		++ndevs;  	} +	ctl->ndevs = ndevs;  	/*  	 * now sort the devices by hole size / available space @@ -4923,23 +4938,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),  	     btrfs_cmp_device_info, NULL); -	/* -	 * Round down to number of usable stripes, devs_increment can be any -	 * number so we can't use round_down() -	 */ -	ndevs -= ndevs % devs_increment; - -	if (ndevs < devs_min) { -		ret = -ENOSPC; -		if (btrfs_test_opt(info, ENOSPC_DEBUG)) { -			btrfs_debug(info, -	"%s: not enough devices with free space: have=%d minimum required=%d", -				    __func__, ndevs, devs_min); -		} -		goto error; -	} +	return 0; +} -	ndevs = min(ndevs, devs_max); +static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, +				      struct btrfs_device_info *devices_info) +{ +	/* Number of stripes that count for block group size */ +	int data_stripes;  	/*  	 * The primary goal is to maximize the number of stripes, so use as @@ -4948,73 +4954,116 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  	 * The DUP profile stores more than one stripe per device, the  	 * max_avail is the total size so we have to adjust.  	 
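A standalone illustration of the data_stripes arithmetic used in decide_stripe_size_regular(); the profile parameters (ncopies, nparity) are the usual btrfs values and are assumptions for this example:

#include <stdio.h>

/* data_stripes = (num_stripes - nparity) / ncopies */
static int data_stripes(int num_stripes, int nparity, int ncopies)
{
	return (num_stripes - nparity) / ncopies;
}

int main(void)
{
	printf("raid1, 2 devs: %d\n", data_stripes(2, 0, 2));	/* 1 */
	printf("raid5, 4 devs: %d\n", data_stripes(4, 1, 1));	/* 3 */
	printf("raid6, 6 devs: %d\n", data_stripes(6, 2, 1));	/* 4 */
	return 0;
}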
*/ -	stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes); -	num_stripes = ndevs * dev_stripes; +	ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, +				   ctl->dev_stripes); +	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; -	/* -	 * this will have to be fixed for RAID1 and RAID10 over -	 * more drives -	 */ -	data_stripes = (num_stripes - nparity) / ncopies; +	/* This will have to be fixed for RAID1 and RAID10 over more drives */ +	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;  	/* -	 * Use the number of data stripes to figure out how big this chunk -	 * is really going to be in terms of logical address space, -	 * and compare that answer with the max chunk size. If it's higher, -	 * we try to reduce stripe_size. +	 * Use the number of data stripes to figure out how big this chunk is +	 * really going to be in terms of logical address space, and compare +	 * that answer with the max chunk size. If it's higher, we try to +	 * reduce stripe_size.  	 */ -	if (stripe_size * data_stripes > max_chunk_size) { +	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {  		/*  		 * Reduce stripe_size, round it up to a 16MB boundary again and  		 * then use it, unless it ends up being even bigger than the  		 * previous value we had already.  		 */ -		stripe_size = min(round_up(div_u64(max_chunk_size, -						   data_stripes), SZ_16M), -				  stripe_size); +		ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, +							data_stripes), SZ_16M), +				       ctl->stripe_size);  	} -	/* align to BTRFS_STRIPE_LEN */ -	stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN); +	/* Align to BTRFS_STRIPE_LEN */ +	ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); +	ctl->chunk_size = ctl->stripe_size * data_stripes; -	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); -	if (!map) { -		ret = -ENOMEM; -		goto error; +	return 0; +} + +static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, +			      struct alloc_chunk_ctl *ctl, +			      struct btrfs_device_info *devices_info) +{ +	struct btrfs_fs_info *info = fs_devices->fs_info; + +	/* +	 * Round down to number of usable stripes, devs_increment can be any +	 * number so we can't use round_down() that requires power of 2, while +	 * rounddown is safe. 
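A quick plain-C illustration of why rounddown() is used here: round_down() assumes a power-of-two alignment, while devs_increment can be any small integer. The macro below mirrors the kernel's rounddown() for non-negative values and is defined locally so the snippet stands alone:

#include <stdio.h>

#define rounddown(x, y) (((x) / (y)) * (y))

int main(void)
{
	printf("%d\n", rounddown(7, 3));	/* 6 */
	printf("%d\n", rounddown(5, 2));	/* 4 */
	return 0;
}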
+	 */ +	ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); + +	if (ctl->ndevs < ctl->devs_min) { +		if (btrfs_test_opt(info, ENOSPC_DEBUG)) { +			btrfs_debug(info, +	"%s: not enough devices with free space: have=%d minimum required=%d", +				    __func__, ctl->ndevs, ctl->devs_min); +		} +		return -ENOSPC;  	} -	map->num_stripes = num_stripes; -	for (i = 0; i < ndevs; ++i) { -		for (j = 0; j < dev_stripes; ++j) { -			int s = i * dev_stripes + j; +	ctl->ndevs = min(ctl->ndevs, ctl->devs_max); + +	switch (fs_devices->chunk_alloc_policy) { +	case BTRFS_CHUNK_ALLOC_REGULAR: +		return decide_stripe_size_regular(ctl, devices_info); +	default: +		BUG(); +	} +} + +static int create_chunk(struct btrfs_trans_handle *trans, +			struct alloc_chunk_ctl *ctl, +			struct btrfs_device_info *devices_info) +{ +	struct btrfs_fs_info *info = trans->fs_info; +	struct map_lookup *map = NULL; +	struct extent_map_tree *em_tree; +	struct extent_map *em; +	u64 start = ctl->start; +	u64 type = ctl->type; +	int ret; +	int i; +	int j; + +	map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); +	if (!map) +		return -ENOMEM; +	map->num_stripes = ctl->num_stripes; + +	for (i = 0; i < ctl->ndevs; ++i) { +		for (j = 0; j < ctl->dev_stripes; ++j) { +			int s = i * ctl->dev_stripes + j;  			map->stripes[s].dev = devices_info[i].dev;  			map->stripes[s].physical = devices_info[i].dev_offset + -						   j * stripe_size; +						   j * ctl->stripe_size;  		}  	}  	map->stripe_len = BTRFS_STRIPE_LEN;  	map->io_align = BTRFS_STRIPE_LEN;  	map->io_width = BTRFS_STRIPE_LEN;  	map->type = type; -	map->sub_stripes = sub_stripes; - -	chunk_size = stripe_size * data_stripes; +	map->sub_stripes = ctl->sub_stripes; -	trace_btrfs_chunk_alloc(info, map, start, chunk_size); +	trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);  	em = alloc_extent_map();  	if (!em) {  		kfree(map); -		ret = -ENOMEM; -		goto error; +		return -ENOMEM;  	}  	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);  	em->map_lookup = map;  	em->start = start; -	em->len = chunk_size; +	em->len = ctl->chunk_size;  	em->block_start = 0;  	em->block_len = em->len; -	em->orig_block_len = stripe_size; +	em->orig_block_len = ctl->stripe_size;  	em_tree = &info->mapping_tree;  	write_lock(&em_tree->lock); @@ -5022,30 +5071,31 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  	if (ret) {  		write_unlock(&em_tree->lock);  		free_extent_map(em); -		goto error; +		return ret;  	}  	write_unlock(&em_tree->lock); -	ret = btrfs_make_block_group(trans, 0, type, start, chunk_size); +	ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);  	if (ret)  		goto error_del_extent;  	for (i = 0; i < map->num_stripes; i++) {  		struct btrfs_device *dev = map->stripes[i].dev; -		btrfs_device_set_bytes_used(dev, dev->bytes_used + stripe_size); +		btrfs_device_set_bytes_used(dev, +					    dev->bytes_used + ctl->stripe_size);  		if (list_empty(&dev->post_commit_list))  			list_add_tail(&dev->post_commit_list,  				      &trans->transaction->dev_update_list);  	} -	atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); +	atomic64_sub(ctl->stripe_size * map->num_stripes, +		     &info->free_chunk_space);  	free_extent_map(em);  	check_raid56_incompat_flag(info, type);  	check_raid1c34_incompat_flag(info, type); -	kfree(devices_info);  	return 0;  error_del_extent: @@ -5057,11 +5107,68 @@ error_del_extent:  	free_extent_map(em);  	/* One for the tree reference */  	free_extent_map(em); -error: + +	return ret; +} + +int 
btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) +{ +	struct btrfs_fs_info *info = trans->fs_info; +	struct btrfs_fs_devices *fs_devices = info->fs_devices; +	struct btrfs_device_info *devices_info = NULL; +	struct alloc_chunk_ctl ctl; +	int ret; + +	lockdep_assert_held(&info->chunk_mutex); + +	if (!alloc_profile_is_valid(type, 0)) { +		ASSERT(0); +		return -EINVAL; +	} + +	if (list_empty(&fs_devices->alloc_list)) { +		if (btrfs_test_opt(info, ENOSPC_DEBUG)) +			btrfs_debug(info, "%s: no writable device", __func__); +		return -ENOSPC; +	} + +	if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { +		btrfs_err(info, "invalid chunk type 0x%llx requested", type); +		ASSERT(0); +		return -EINVAL; +	} + +	ctl.start = find_next_chunk(info); +	ctl.type = type; +	init_alloc_chunk_ctl(fs_devices, &ctl); + +	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), +			       GFP_NOFS); +	if (!devices_info) +		return -ENOMEM; + +	ret = gather_device_info(fs_devices, &ctl, devices_info); +	if (ret < 0) +		goto out; + +	ret = decide_stripe_size(fs_devices, &ctl, devices_info); +	if (ret < 0) +		goto out; + +	ret = create_chunk(trans, &ctl, devices_info); + +out:  	kfree(devices_info);  	return ret;  } +/* + * Chunk allocation falls into two parts. The first part does work + * that makes the new allocated chunk usable, but does not do any operation + * that modifies the chunk tree. The second part does the work that + * requires modifying the chunk tree. This division is important for the + * bootstrap process of adding storage to a seed btrfs. + */  int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,  			     u64 chunk_offset, u64 chunk_size)  { @@ -5160,39 +5267,19 @@ out:  	return ret;  } -/* - * Chunk allocation falls into two parts. The first part does work - * that makes the new allocated chunk usable, but does not do any operation - * that modifies the chunk tree. The second part does the work that - * requires modifying the chunk tree. This division is important for the - * bootstrap process of adding storage to a seed btrfs. 
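A rough sketch of how the two phases pair up for a caller; the surrounding variables and the exact call site are illustrative, not taken from the patch:

	/* Phase 1: pick devices, build the chunk mapping and block group in memory. */
	ret = btrfs_alloc_chunk(trans, btrfs_metadata_alloc_profile(fs_info));
	if (ret)
		return ret;

	/* Phase 2: later (e.g. when pending block groups are processed at
	 * transaction commit), persist the chunk item in the chunk tree. */
	ret = btrfs_finish_chunk_alloc(trans, chunk_offset, chunk_size);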
- */ -int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) -{ -	u64 chunk_offset; - -	lockdep_assert_held(&trans->fs_info->chunk_mutex); -	chunk_offset = find_next_chunk(trans->fs_info); -	return __btrfs_alloc_chunk(trans, chunk_offset, type); -} -  static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)  {  	struct btrfs_fs_info *fs_info = trans->fs_info; -	u64 chunk_offset; -	u64 sys_chunk_offset;  	u64 alloc_profile;  	int ret; -	chunk_offset = find_next_chunk(fs_info);  	alloc_profile = btrfs_metadata_alloc_profile(fs_info); -	ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile); +	ret = btrfs_alloc_chunk(trans, alloc_profile);  	if (ret)  		return ret; -	sys_chunk_offset = find_next_chunk(fs_info);  	alloc_profile = btrfs_system_alloc_profile(fs_info); -	ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile); +	ret = btrfs_alloc_chunk(trans, alloc_profile);  	return ret;  } @@ -5389,31 +5476,19 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,  	return preferred_mirror;  } -static inline int parity_smaller(u64 a, u64 b) -{ -	return a > b; -} -  /* Bubble-sort the stripe set to put the parity/syndrome stripes last */  static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)  { -	struct btrfs_bio_stripe s;  	int i; -	u64 l;  	int again = 1;  	while (again) {  		again = 0;  		for (i = 0; i < num_stripes - 1; i++) { -			if (parity_smaller(bbio->raid_map[i], -					   bbio->raid_map[i+1])) { -				s = bbio->stripes[i]; -				l = bbio->raid_map[i]; -				bbio->stripes[i] = bbio->stripes[i+1]; -				bbio->raid_map[i] = bbio->raid_map[i+1]; -				bbio->stripes[i+1] = s; -				bbio->raid_map[i+1] = l; - +			/* Swap if parity is on a smaller index */ +			if (bbio->raid_map[i] > bbio->raid_map[i + 1]) { +				swap(bbio->stripes[i], bbio->stripes[i + 1]); +				swap(bbio->raid_map[i], bbio->raid_map[i + 1]);  				again = 1;  			}  		} @@ -5914,10 +5989,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,  	struct btrfs_io_geometry geom;  	ASSERT(bbio_ret); - -	if (op == BTRFS_MAP_DISCARD) -		return __btrfs_map_block_for_discard(fs_info, logical, -						     length, bbio_ret); +	ASSERT(op != BTRFS_MAP_DISCARD);  	ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);  	if (ret < 0) @@ -6147,6 +6219,10 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,  		      u64 logical, u64 *length,  		      struct btrfs_bio **bbio_ret, int mirror_num)  { +	if (op == BTRFS_MAP_DISCARD) +		return __btrfs_map_block_for_discard(fs_info, logical, +						     length, bbio_ret); +  	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,  				 mirror_num, 0);  } @@ -6241,8 +6317,8 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,  	btrfs_debug_in_rcu(fs_info,  	"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",  		bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector, -		(u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid, -		bio->bi_iter.bi_size); +		(unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), +		dev->devid, bio->bi_iter.bi_size);  	bio_set_dev(bio, dev->bdev);  	btrfs_bio_counter_inc_noblocked(fs_info); @@ -7317,36 +7393,6 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,  	return 0;  } -void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path) -{ -	struct buffer_head *bh; -	struct btrfs_super_block *disk_super; -	int copy_num; - -	if (!bdev) -		return; - -	for (copy_num = 0; 
copy_num < BTRFS_SUPER_MIRROR_MAX; -		copy_num++) { - -		if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) -			continue; - -		disk_super = (struct btrfs_super_block *)bh->b_data; - -		memset(&disk_super->magic, 0, sizeof(disk_super->magic)); -		set_buffer_dirty(bh); -		sync_dirty_buffer(bh); -		brelse(bh); -	} - -	/* Notify udev that device has changed */ -	btrfs_kobject_uevent(bdev, KOBJ_CHANGE); - -	/* Update ctime/mtime for device path for libblkid */ -	update_dev_time(device_path); -} -  /*   * Update the size and bytes used for each device where it changed.  This is   * delayed since we would otherwise get errors while writing out the diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index f01552a0785e..f067b5934c46 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -17,8 +17,6 @@ extern struct mutex uuid_mutex;  #define BTRFS_STRIPE_LEN	SZ_64K -struct buffer_head; -  struct btrfs_io_geometry {  	/* remaining bytes before crossing a stripe */  	u64 len; @@ -209,6 +207,10 @@ BTRFS_DEVICE_GETSET_FUNCS(total_bytes);  BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes);  BTRFS_DEVICE_GETSET_FUNCS(bytes_used); +enum btrfs_chunk_allocation_policy { +	BTRFS_CHUNK_ALLOC_REGULAR, +}; +  struct btrfs_fs_devices {  	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */  	u8 metadata_uuid[BTRFS_FSID_SIZE]; @@ -260,6 +262,8 @@ struct btrfs_fs_devices {  	struct kobject *devices_kobj;  	struct kobject *devinfo_kobj;  	struct completion kobj_unregister; + +	enum btrfs_chunk_allocation_policy chunk_alloc_policy;  };  #define BTRFS_BIO_INLINE_CSUM_SIZE	64 @@ -461,7 +465,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info);  int btrfs_pause_balance(struct btrfs_fs_info *fs_info);  int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);  int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); -int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info); +int btrfs_uuid_scan_kthread(void *data);  int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset);  int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,  			 u64 *start, u64 *max_avail); @@ -474,7 +478,6 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans);  void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev);  void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev);  void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev); -void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path);  int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,  			   u64 logical, u64 len);  unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, @@ -484,6 +487,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,  int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);  struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,  				       u64 logical, u64 length); +void btrfs_release_disk_super(struct btrfs_super_block *super);  static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,  				      int index) diff --git a/include/linux/uuid.h b/include/linux/uuid.h index 0c631e2a73b6..d41b0d3e9474 100644 --- a/include/linux/uuid.h +++ b/include/linux/uuid.h @@ -43,6 +43,16 @@ static inline void guid_copy(guid_t *dst, const guid_t *src)  	memcpy(dst, src, sizeof(guid_t));  } +static inline void import_guid(guid_t *dst, const __u8 *src) +{ +	memcpy(dst, src, sizeof(guid_t)); +} + +static inline void export_guid(__u8 *dst, const guid_t *src) +{ +	memcpy(dst, src, sizeof(guid_t)); +} +  static inline 
bool guid_is_null(const guid_t *guid)  {  	return guid_equal(guid, &guid_null); @@ -58,12 +68,23 @@ static inline void uuid_copy(uuid_t *dst, const uuid_t *src)  	memcpy(dst, src, sizeof(uuid_t));  } +static inline void import_uuid(uuid_t *dst, const __u8 *src) +{ +	memcpy(dst, src, sizeof(uuid_t)); +} + +static inline void export_uuid(__u8 *dst, const uuid_t *src) +{ +	memcpy(dst, src, sizeof(uuid_t)); +} +  static inline bool uuid_is_null(const uuid_t *uuid)  {  	return uuid_equal(uuid, &uuid_null);  }  void generate_random_uuid(unsigned char uuid[16]); +void generate_random_guid(unsigned char guid[16]);  extern void guid_gen(guid_t *u);  extern void uuid_gen(uuid_t *u); @@ -77,7 +98,6 @@ int guid_parse(const char *uuid, guid_t *u);  int uuid_parse(const char *uuid, uuid_t *u);  /* backwards compatibility, don't use in new code */ -#define uuid_le_gen(u)		guid_gen(u)  #define uuid_le_to_bin(guid, u)	guid_parse(guid, u)  static inline int uuid_le_cmp(const guid_t u1, const guid_t u2) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 17088a112ed0..bcbc763b8814 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -81,13 +81,14 @@ TRACE_DEFINE_ENUM(COMMIT_TRANS);  #define show_extent_io_tree_owner(owner)				       \  	__print_symbolic(owner,						       \ -		{ IO_TREE_FS_INFO_FREED_EXTENTS0, "FREED_EXTENTS0" },	       \ -		{ IO_TREE_FS_INFO_FREED_EXTENTS1, "FREED_EXTENTS1" },	       \ +		{ IO_TREE_FS_PINNED_EXTENTS, 	  "PINNED_EXTENTS" },	       \ +		{ IO_TREE_FS_EXCLUDED_EXTENTS,	  "EXCLUDED_EXTENTS" },	       \  		{ IO_TREE_INODE_IO,		  "INODE_IO" },		       \  		{ IO_TREE_INODE_IO_FAILURE,	  "INODE_IO_FAILURE" },	       \  		{ IO_TREE_RELOC_BLOCKS,		  "RELOC_BLOCKS" },	       \  		{ IO_TREE_TRANS_DIRTY_PAGES,	  "TRANS_DIRTY_PAGES" },       \  		{ IO_TREE_ROOT_DIRTY_LOG_PAGES,	  "ROOT_DIRTY_LOG_PAGES" },    \ +		{ IO_TREE_INODE_FILE_EXTENT,	  "INODE_FILE_EXTENT" },       \  		{ IO_TREE_SELFTEST,		  "SELFTEST" })  #define BTRFS_GROUP_FLAGS	\ @@ -468,7 +469,6 @@ DEFINE_EVENT(  		{ (1 << BTRFS_ORDERED_PREALLOC), 	"PREALLOC" 	}, \  		{ (1 << BTRFS_ORDERED_DIRECT),	 	"DIRECT" 	}, \  		{ (1 << BTRFS_ORDERED_IOERR), 		"IOERR" 	}, \ -		{ (1 << BTRFS_ORDERED_UPDATED_ISIZE), 	"UPDATED_ISIZE"	}, \  		{ (1 << BTRFS_ORDERED_TRUNCATED), 	"TRUNCATED"	}) diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 7a8bc8b920f5..8134924cfc17 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -36,17 +36,24 @@ struct btrfs_ioctl_vol_args {  #define BTRFS_DEVICE_PATH_NAME_MAX	1024  #define BTRFS_SUBVOL_NAME_MAX 		4039 -#define BTRFS_SUBVOL_CREATE_ASYNC	(1ULL << 0) +/* + * Deprecated since 5.7: + * + * BTRFS_SUBVOL_CREATE_ASYNC	(1ULL << 0) + */ +  #define BTRFS_SUBVOL_RDONLY		(1ULL << 1)  #define BTRFS_SUBVOL_QGROUP_INHERIT	(1ULL << 2)  #define BTRFS_DEVICE_SPEC_BY_ID		(1ULL << 3) +#define BTRFS_SUBVOL_SPEC_BY_ID	(1ULL << 4) +  #define BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED		\ -			(BTRFS_SUBVOL_CREATE_ASYNC |	\ -			BTRFS_SUBVOL_RDONLY |		\ +			(BTRFS_SUBVOL_RDONLY |		\  			BTRFS_SUBVOL_QGROUP_INHERIT |	\ -			BTRFS_DEVICE_SPEC_BY_ID) +			BTRFS_DEVICE_SPEC_BY_ID |	\ +			BTRFS_SUBVOL_SPEC_BY_ID)  #define BTRFS_FSID_SIZE 16  #define BTRFS_UUID_SIZE 16 @@ -97,16 +104,29 @@ struct btrfs_ioctl_qgroup_limit_args {  };  /* - * flags for subvolumes + * Arguments for specification of subvolumes or devices, supporting by-name or + * by-id and flags   * - * Used by: - * struct btrfs_ioctl_vol_args_v2.flags + * The set of supported 
flags depends on the ioctl   *   * BTRFS_SUBVOL_RDONLY is also provided/consumed by the following ioctls:   * - BTRFS_IOC_SUBVOL_GETFLAGS   * - BTRFS_IOC_SUBVOL_SETFLAGS   */ +/* Supported flags for BTRFS_IOC_RM_DEV_V2 */ +#define BTRFS_DEVICE_REMOVE_ARGS_MASK					\ +	(BTRFS_DEVICE_SPEC_BY_ID) + +/* Supported flags for BTRFS_IOC_SNAP_CREATE_V2 and BTRFS_IOC_SUBVOL_CREATE_V2 */ +#define BTRFS_SUBVOL_CREATE_ARGS_MASK					\ +	 (BTRFS_SUBVOL_RDONLY |						\ +	 BTRFS_SUBVOL_QGROUP_INHERIT) + +/* Supported flags for BTRFS_IOC_SNAP_DESTROY_V2 */ +#define BTRFS_SUBVOL_DELETE_ARGS_MASK					\ +	(BTRFS_SUBVOL_SPEC_BY_ID) +  struct btrfs_ioctl_vol_args_v2 {  	__s64 fd;  	__u64 transid; @@ -121,6 +141,7 @@ struct btrfs_ioctl_vol_args_v2 {  	union {  		char name[BTRFS_SUBVOL_NAME_MAX + 1];  		__u64 devid; +		__u64 subvolid;  	};  }; @@ -949,5 +970,7 @@ enum btrfs_err_code {  				struct btrfs_ioctl_get_subvol_rootref_args)  #define BTRFS_IOC_INO_LOOKUP_USER _IOWR(BTRFS_IOCTL_MAGIC, 62, \  				struct btrfs_ioctl_ino_lookup_user_args) +#define BTRFS_IOC_SNAP_DESTROY_V2 _IOW(BTRFS_IOCTL_MAGIC, 63, \ +				struct btrfs_ioctl_vol_args_v2)  #endif /* _UAPI_LINUX_BTRFS_H */ diff --git a/lib/uuid.c b/lib/uuid.c index b6a1edb61d87..562d53977cab 100644 --- a/lib/uuid.c +++ b/lib/uuid.c @@ -40,6 +40,16 @@ void generate_random_uuid(unsigned char uuid[16])  }  EXPORT_SYMBOL(generate_random_uuid); +void generate_random_guid(unsigned char guid[16]) +{ +	get_random_bytes(guid, 16); +	/* Set GUID version to 4 --- truly random generation */ +	guid[7] = (guid[7] & 0x0F) | 0x40; +	/* Set the GUID variant to DCE */ +	guid[8] = (guid[8] & 0x3F) | 0x80; +} +EXPORT_SYMBOL(generate_random_guid); +  static void __uuid_gen_common(__u8 b[16])  {  	prandom_bytes(b, 16); | 
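A short usage sketch for the new UUID/GUID helpers added in include/linux/uuid.h and lib/uuid.c above; the raw buffer merely stands in for any 16-byte field read from or written to disk metadata:

	u8 raw[UUID_SIZE];		/* any 16-byte on-disk uuid field */
	guid_t guid;
	uuid_t uuid;

	generate_random_guid(raw);	/* random version-4 GUID as raw bytes */
	import_guid(&guid, raw);	/* raw bytes -> guid_t */
	export_guid(raw, &guid);	/* guid_t -> raw bytes */

	import_uuid(&uuid, raw);	/* same pattern for uuid_t */
	export_uuid(raw, &uuid);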
