Btrfs: fix ENOSPC errors, leading to transaction aborts, when cloning extents

When cloning extents (or deduplicating) we create a transaction with a space reservation that considers we will drop or update a single file extent item of the destination inode (that we modify a single leaf). That is fine for the vast majority of scenarios, however it might happen that we need to drop many file extent items, and adjust at most two file extent items, in the destination root, which can span multiple leaves. This will lead to either the call to btrfs_drop_extents() failing with ENOSPC or the subsequent calls to btrfs_insert_empty_item() or btrfs_update_inode() (called through clone_finish_inode_update()) failing with ENOSPC. Such a failure results in a transaction abort, leaving the filesystem in read-only mode. In order to fix this we need to follow the same approach as the hole punching code, where we create a local reservation with 1 unit and keep ending and starting transactions, after balancing the btree inode, when __btrfs_drop_extents() returns ENOSPC. So fix this by making the extent cloning code call the recently added btrfs_punch_hole_range() helper, which is what does the mentioned work for hole punching, and make sure that whenever we drop extent items in a transaction, we also add a replacing file extent item, to avoid corruption (a hole) if, after ending a transaction and before starting a new one, the old transaction gets committed and a power failure happens before we finish cloning. A test case for fstests follows soon. Reported-by: David Goodwin <david@codepoets.co.uk> Link: https://lore.kernel.org/linux-btrfs/a4a4cf31-9cf4-e52c-1f86-c62d336c9cd1@codepoets.co.uk/ Reported-by: Sam Tygier <sam@tygier.co.uk> Link: https://lore.kernel.org/linux-btrfs/82aace9f-a1e3-1f0b-055f-3ea75f7a41a0@tygier.co.uk/ Fixes: b6f3409b2197e8f ("Btrfs: reserve sufficient space for ioctl clone") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
author: Filipe Manana <fdmanana@suse.com> 2019-07-05 11:09:50 +0100
committer: David Sterba <dsterba@suse.com> 2019-09-09 14:58:58 +0200
commit: 690a5dbfc5131572910e6350d65d7b9d55439817 (patch)
tree: b0c45a23efc2fc41a5ace2b9f4b343a65e576166 /fs/btrfs/file.c
parent: 9cba40a693e69badb567d6ce0eaa0150f25c3d39 (diff)
1 files changed, 131 insertions, 15 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 16dc09736310..474ff1cac640 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2439,13 +2439,76 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
 	return 0;
 }
 
+static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
+				     struct inode *inode,
+				     struct btrfs_path *path,
+				     struct btrfs_clone_extent_info *clone_info,
+				     const u64 clone_len)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_file_extent_item *extent;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	int slot;
+	struct btrfs_ref ref = { 0 };
+	u64 ref_offset;
+	int ret;
+
+	if (clone_len == 0)
+		return 0;
+
+	if (clone_info->disk_offset == 0 &&
+	    btrfs_fs_incompat(fs_info, NO_HOLES))
+		return 0;
+
+	key.objectid = btrfs_ino(BTRFS_I(inode));
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = clone_info->file_offset;
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      clone_info->item_size);
+	if (ret)
+		return ret;
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+	write_extent_buffer(leaf, clone_info->extent_buf,
+			    btrfs_item_ptr_offset(leaf, slot),
+			    clone_info->item_size);
+	extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+	btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset);
+	btrfs_set_file_extent_num_bytes(leaf, extent, clone_len);
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(path);
+
+	/* If it's a hole, nothing more needs to be done. */
+	if (clone_info->disk_offset == 0)
+		return 0;
+
+	inode_add_bytes(inode, clone_len);
+	btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
+			       clone_info->disk_offset,
+			       clone_info->disk_len, 0);
+	ref_offset = clone_info->file_offset - clone_info->data_offset;
+	btrfs_init_data_ref(&ref, root->root_key.objectid,
+			    btrfs_ino(BTRFS_I(inode)), ref_offset);
+	ret = btrfs_inc_extent_ref(trans, &ref);
+
+	return ret;
+}
+
 /*
  * The respective range must have been previously locked, as well as the inode.
  * The end offset is inclusive (last byte of the range).
+ * @clone_info is NULL for fallocate's hole punching and non-NULL for extent
+ * cloning.
+ * When cloning, we don't want to end up in a state where we dropped extents
+ * without inserting a new one, so we must abort the transaction to avoid a
+ * corruption.
  */
-static int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
-				  const u64 start, const u64 end,
-				  struct btrfs_trans_handle **trans_out)
+int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
+			   const u64 start, const u64 end,
+			   struct btrfs_clone_extent_info *clone_info,
+			   struct btrfs_trans_handle **trans_out)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	u64 min_size = btrfs_calc_trans_metadata_size(fs_info, 1);
@@ -2473,9 +2536,14 @@ static int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
 	/*
 	 * 1 - update the inode
 	 * 1 - removing the extents in the range
-	 * 1 - adding the hole extent if no_holes isn't set
+	 * 1 - adding the hole extent if no_holes isn't set or if we are cloning
+	 *     an extent
 	 */
-	rsv_count = btrfs_fs_incompat(fs_info, NO_HOLES) ? 2 : 3;
+	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || clone_info)
+		rsv_count = 3;
+	else
+		rsv_count = 2;
+
 	trans = btrfs_start_transaction(root, rsv_count);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
@@ -2493,12 +2561,23 @@ static int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
 		ret = __btrfs_drop_extents(trans, root, inode, path,
 					   cur_offset, end + 1, &drop_end,
 					   1, 0, 0, NULL);
-		if (ret != -ENOSPC)
+		if (ret != -ENOSPC) {
+			/*
+			 * When cloning we want to avoid transaction aborts when
+			 * nothing was done and we are attempting to clone parts
+			 * of inline extents, in such cases -EOPNOTSUPP is
+			 * returned by __btrfs_drop_extents() without having
+			 * changed anything in the file.
+			 */
+			if (clone_info && ret && ret != -EOPNOTSUPP)
+				btrfs_abort_transaction(trans, ret);
 			break;
+		}
 
 		trans->block_rsv = &fs_info->trans_block_rsv;
 
-		if (cur_offset < drop_end && cur_offset < ino_size) {
+		if (!clone_info && cur_offset < drop_end &&
+		    cur_offset < ino_size) {
 			ret = fill_holes(trans, BTRFS_I(inode), path,
 					cur_offset, drop_end);
 			if (ret) {
@@ -2513,6 +2592,20 @@ static int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
 			}
 		}
 
+		if (clone_info) {
+			u64 clone_len = drop_end - cur_offset;
+
+			ret = btrfs_insert_clone_extent(trans, inode, path,
+							clone_info, clone_len);
+			if (ret) {
+				btrfs_abort_transaction(trans, ret);
+				break;
+			}
+			clone_info->data_len -= clone_len;
+			clone_info->data_offset += clone_len;
+			clone_info->file_offset += clone_len;
+		}
+
 		cur_offset = drop_end;
 
 		ret = btrfs_update_inode(trans, root, inode);
@@ -2534,15 +2627,29 @@ static int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
 		BUG_ON(ret);	/* shouldn't happen */
 		trans->block_rsv = rsv;
 
-		ret = find_first_non_hole(inode, &cur_offset, &len);
-		if (unlikely(ret < 0))
-			break;
-		if (ret && !len) {
-			ret = 0;
-			break;
+		if (!clone_info) {
+			ret = find_first_non_hole(inode, &cur_offset, &len);
+			if (unlikely(ret < 0))
+				break;
+			if (ret && !len) {
+				ret = 0;
+				break;
+			}
 		}
 	}
 
+	/*
+	 * If we were cloning, force the next fsync to be a full one since we
+	 * replaced (or just dropped in the case of cloning holes when
+	 * NO_HOLES is enabled) extents and extent maps.
+	 * This is for the sake of simplicity, and cloning into files larger
+	 * than 16Mb would force the full fsync anyway (when
+	 * try_release_extent_mapping() is called during page cache truncation).
+	 */
+	if (clone_info)
+		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			&BTRFS_I(inode)->runtime_flags);
+
 	if (ret)
 		goto out_trans;
 
@@ -2565,7 +2672,7 @@ static int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
 	 * (because it's useless) or if it represents a 0 bytes range (when
 	 * cur_offset == drop_end).
 	 */
-	if (cur_offset < ino_size && cur_offset < drop_end) {
+	if (!clone_info && cur_offset < ino_size && cur_offset < drop_end) {
 		ret = fill_holes(trans, BTRFS_I(inode), path,
 				cur_offset, drop_end);
 		if (ret) {
@@ -2574,6 +2681,14 @@ static int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
 			goto out_trans;
 		}
 	}
+	if (clone_info) {
+		ret = btrfs_insert_clone_extent(trans, inode, path, clone_info,
+						clone_info->data_len);
+		if (ret) {
+			btrfs_abort_transaction(trans, ret);
+			goto out_trans;
+		}
+	}
 
 out_trans:
 	if (!trans)
@@ -2710,7 +2825,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		goto out;
 	}
 
-	ret = btrfs_punch_hole_range(inode, path, lockstart, lockend, &trans);
+	ret = btrfs_punch_hole_range(inode, path, lockstart, lockend, NULL,
+				     &trans);
 	btrfs_free_path(path);
 	if (ret)
 		goto out;
author	Filipe Manana <fdmanana@suse.com>	2019-07-05 11:09:50 +0100
committer	David Sterba <dsterba@suse.com>	2019-09-09 14:58:58 +0200
commit	690a5dbfc5131572910e6350d65d7b9d55439817 (patch)
tree	b0c45a23efc2fc41a5ace2b9f4b343a65e576166 /fs/btrfs/file.c
parent	9cba40a693e69badb567d6ce0eaa0150f25c3d39 (diff)