summaryrefslogtreecommitdiff
path: root/fs/btrfs/reflink.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/reflink.c')
-rw-r--r--fs/btrfs/reflink.c485
1 files changed, 301 insertions, 184 deletions
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 040009d1cc31..b5fe95baf92e 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -1,12 +1,21 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/blkdev.h>
+#include <linux/fscrypt.h>
#include <linux/iversion.h>
-#include "compression.h"
#include "ctree.h"
+#include "fs.h"
+#include "messages.h"
+#include "compression.h"
#include "delalloc-space.h"
+#include "disk-io.h"
#include "reflink.h"
#include "transaction.h"
+#include "subpage.h"
+#include "accessors.h"
+#include "file-item.h"
+#include "file.h"
+#include "super.h"
#define BTRFS_MAX_DEDUPE_LEN SZ_16M
@@ -15,14 +24,14 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
u64 endoff,
const u64 destoff,
const u64 olen,
- int no_time_update)
+ bool no_time_update)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
inode_inc_iversion(inode);
- if (!no_time_update)
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ if (!no_time_update) {
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ }
/*
* We round up to the block size at eof when determining which
* extents to clone above, but shouldn't round up the file size.
@@ -31,33 +40,33 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
endoff = destoff + olen;
if (endoff > inode->i_size) {
i_size_write(inode, endoff);
- btrfs_inode_safe_disk_i_size_write(inode, 0);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
}
- ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
- goto out;
+ return ret;
}
- ret = btrfs_end_transaction(trans);
-out:
- return ret;
+ return btrfs_end_transaction(trans);
}
-static int copy_inline_to_page(struct inode *inode,
+static int copy_inline_to_page(struct btrfs_inode *inode,
const u64 file_offset,
char *inline_data,
const u64 size,
const u64 datal,
const u8 comp_type)
{
- const u64 block_size = btrfs_inode_sectorsize(inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u32 block_size = fs_info->sectorsize;
const u64 range_end = file_offset + block_size - 1;
const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
struct extent_changeset *data_reserved = NULL;
- struct page *page = NULL;
+ struct folio *folio = NULL;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
int ret;
ASSERT(IS_ALIGNED(file_offset, block_size));
@@ -73,34 +82,47 @@ static int copy_inline_to_page(struct inode *inode,
if (ret)
goto out;
- page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT,
- btrfs_alloc_write_mask(inode->i_mapping));
- if (!page) {
- ret = -ENOMEM;
+ folio = __filemap_get_folio(mapping, file_offset >> PAGE_SHIFT,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ btrfs_alloc_write_mask(mapping));
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
goto out_unlock;
}
- set_page_extent_mapped(page);
- clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, NULL);
+ ret = set_folio_extent_mapped(folio);
+ if (ret < 0)
+ goto out_unlock;
+
+ btrfs_clear_extent_bit(&inode->io_tree, file_offset, range_end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, NULL);
ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
if (ret)
goto out_unlock;
- if (comp_type == BTRFS_COMPRESS_NONE) {
- char *map;
+ /*
+ * After dirtying the page our caller will need to start a transaction,
+ * and if we are low on metadata free space, that can cause flushing of
+ * delalloc for all inodes in order to get metadata space released.
+ * However we are holding the range locked for the whole duration of
+ * the clone/dedupe operation, so we may deadlock if that happens and no
+ * other task releases enough space. So mark this inode as not being
+ * possible to flush to avoid such deadlock. We will clear that flag
+ * when we finish cloning all extents, since a transaction is started
+ * after finding each extent to clone.
+ */
+ set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
- map = kmap(page);
- memcpy(map, data_start, datal);
- flush_dcache_page(page);
- kunmap(page);
+ if (comp_type == BTRFS_COMPRESS_NONE) {
+ memcpy_to_folio(folio, offset_in_folio(folio, file_offset), data_start,
+ datal);
} else {
- ret = btrfs_decompress(comp_type, data_start, page, 0,
+ ret = btrfs_decompress(comp_type, data_start, folio,
+ offset_in_folio(folio, file_offset),
inline_size, datal);
if (ret)
goto out_unlock;
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
}
/*
@@ -115,27 +137,21 @@ static int copy_inline_to_page(struct inode *inode,
*
* So what's in the range [500, 4095] corresponds to zeroes.
*/
- if (datal < block_size) {
- char *map;
-
- map = kmap(page);
- memset(map + datal, 0, block_size - datal);
- flush_dcache_page(page);
- kunmap(page);
- }
+ if (datal < block_size)
+ folio_zero_range(folio, datal, block_size - datal);
- SetPageUptodate(page);
- ClearPageChecked(page);
- set_page_dirty(page);
+ btrfs_folio_set_uptodate(fs_info, folio, file_offset, block_size);
+ btrfs_folio_clear_checked(fs_info, folio, file_offset, block_size);
+ btrfs_folio_set_dirty(fs_info, folio, file_offset, block_size);
out_unlock:
- if (page) {
- unlock_page(page);
- put_page(page);
+ if (!IS_ERR(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
}
if (ret)
btrfs_delalloc_release_space(inode, data_reserved, file_offset,
block_size, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), block_size);
+ btrfs_delalloc_release_extents(inode, block_size);
out:
extent_changeset_free(data_reserved);
@@ -147,7 +163,7 @@ out:
* the source inode to destination inode when possible. When not possible we
* copy the inline extent's data into the respective page of the inode.
*/
-static int clone_copy_inline_extent(struct inode *dst,
+static int clone_copy_inline_extent(struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_key *new_key,
const u64 drop_start,
@@ -157,21 +173,22 @@ static int clone_copy_inline_extent(struct inode *dst,
char *inline_data,
struct btrfs_trans_handle **trans_out)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
- struct btrfs_root *root = BTRFS_I(dst)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
const u64 aligned_end = ALIGN(new_key->offset + datal,
fs_info->sectorsize);
struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_drop_extents_args drop_args = { 0 };
int ret;
struct btrfs_key key;
if (new_key->offset > 0) {
- ret = copy_inline_to_page(dst, new_key->offset, inline_data,
- size, datal, comp_type);
+ ret = copy_inline_to_page(inode, new_key->offset,
+ inline_data, size, datal, comp_type);
goto out;
}
- key.objectid = btrfs_ino(BTRFS_I(dst));
+ key.objectid = btrfs_ino(inode);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -186,19 +203,16 @@ static int clone_copy_inline_extent(struct inode *dst,
goto copy_inline_extent;
}
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
+ if (key.objectid == btrfs_ino(inode) &&
key.type == BTRFS_EXTENT_DATA_KEY) {
/*
* There's an implicit hole at file offset 0, copy the
* inline extent's data to the page.
*/
ASSERT(key.offset > 0);
- ret = copy_inline_to_page(dst, new_key->offset,
- inline_data, size, datal,
- comp_type);
- goto out;
+ goto copy_to_page;
}
- } else if (i_size_read(dst) <= datal) {
+ } else if (i_size_read(&inode->vfs_inode) <= datal) {
struct btrfs_file_extent_item *ei;
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -212,29 +226,28 @@ static int clone_copy_inline_extent(struct inode *dst,
BTRFS_FILE_EXTENT_INLINE)
goto copy_inline_extent;
- ret = copy_inline_to_page(dst, new_key->offset, inline_data,
- size, datal, comp_type);
- goto out;
+ goto copy_to_page;
}
copy_inline_extent:
- ret = 0;
/*
* We have no extent items, or we have an extent at offset 0 which may
* or may not be inlined. All these cases are dealt the same way.
*/
- if (i_size_read(dst) > datal) {
+ if (i_size_read(&inode->vfs_inode) > datal) {
/*
* At the destination offset 0 we have either a hole, a regular
* extent or an inline extent larger then the one we want to
* clone. Deal with all these cases by copying the inline extent
* data into the respective page at the destination inode.
*/
- ret = copy_inline_to_page(dst, new_key->offset, inline_data,
- size, datal, comp_type);
- goto out;
+ goto copy_to_page;
}
+ /*
+ * Release path before starting a new transaction so we don't hold locks
+ * that would confuse lockdep.
+ */
btrfs_release_path(path);
/*
* If we end up here it means were copy the inline extent into a leaf
@@ -251,20 +264,30 @@ copy_inline_extent:
trans = NULL;
goto out;
}
- ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
- if (ret)
+ drop_args.path = path;
+ drop_args.start = drop_start;
+ drop_args.end = aligned_end;
+ drop_args.drop_cache = true;
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
- if (ret)
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
write_extent_buffer(path->nodes[0], inline_data,
btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]),
size);
- inode_add_bytes(dst, datal);
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
- ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
+ btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found);
+ btrfs_set_inode_full_sync(inode);
+ ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end);
+ if (unlikely(ret))
+ btrfs_abort_transaction(trans, ret);
out:
if (!ret && !trans) {
/*
@@ -279,42 +302,56 @@ out:
trans = NULL;
}
}
- if (ret && trans) {
- btrfs_abort_transaction(trans, ret);
+ if (ret && trans)
btrfs_end_transaction(trans);
- }
if (!ret)
*trans_out = trans;
return ret;
+
+copy_to_page:
+ /*
+ * Release our path because we don't need it anymore and also because
+ * copy_inline_to_page() needs to reserve data and metadata, which may
+ * need to flush delalloc when we are low on available space and
+ * therefore cause a deadlock if writeback of an inline extent needs to
+ * write to the same leaf or an ordered extent completion needs to write
+ * to the same leaf.
+ */
+ btrfs_release_path(path);
+
+ ret = copy_inline_to_page(inode, new_key->offset,
+ inline_data, size, datal, comp_type);
+ goto out;
}
-/**
- * btrfs_clone() - clone a range from inode file to another
+/*
+ * Clone a range from inode file to another.
*
- * @src: Inode to clone from
- * @inode: Inode to clone to
- * @off: Offset within source to start clone from
- * @olen: Original length, passed by user, of range to clone
- * @olen_aligned: Block-aligned value of olen
- * @destoff: Offset within @inode to start clone
- * @no_time_update: Whether to update mtime/ctime on the target inode
+ * @src: Inode to clone from
+ * @inode: Inode to clone to
+ * @off: Offset within source to start clone from
+ * @olen: Original length, passed by user, of range to clone
+ * @olen_aligned: Block-aligned value of olen
+ * @destoff: Offset within @inode to start clone
+ * @no_time_update: Whether to update mtime/ctime on the target inode
*/
static int btrfs_clone(struct inode *src, struct inode *inode,
const u64 off, const u64 olen, const u64 olen_aligned,
- const u64 destoff, int no_time_update)
+ const u64 destoff, bool no_time_update)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_path *path = NULL;
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_trans_handle *trans;
- char *buf = NULL;
+ char AUTO_KVFREE(buf);
struct btrfs_key key;
u32 nritems;
int slot;
int ret;
const u64 len = olen_aligned;
u64 last_dest_end = destoff;
+ u64 prev_extent_end = off;
ret = -ENOMEM;
buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
@@ -322,10 +359,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
return ret;
path = btrfs_alloc_path();
- if (!path) {
- kvfree(buf);
+ if (!path)
return ret;
- }
path->reada = READA_FORWARD;
/* Clone data */
@@ -334,8 +369,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
key.offset = off;
while (1) {
- u64 next_key_min_offset = key.offset + 1;
struct btrfs_file_extent_item *extent;
+ u64 extent_gen;
int type;
u32 size;
struct btrfs_key new_key;
@@ -345,7 +380,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
u64 drop_start;
/* Note the key will change type as we walk through the tree */
- path->leave_spinning = 1;
ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
0, 0);
if (ret < 0)
@@ -384,6 +418,7 @@ process_slot:
extent = btrfs_item_ptr(leaf, slot,
struct btrfs_file_extent_item);
+ extent_gen = btrfs_file_extent_generation(leaf, extent);
comp = btrfs_file_extent_compression(leaf, extent);
type = btrfs_file_extent_type(leaf, extent);
if (type == BTRFS_FILE_EXTENT_REG ||
@@ -401,20 +436,26 @@ process_slot:
* The first search might have left us at an extent item that
* ends before our target range's start, can happen if we have
* holes and NO_HOLES feature enabled.
+ *
+ * Subsequent searches may leave us on a file range we have
+ * processed before - this happens due to a race with ordered
+ * extent completion for a file range that is outside our source
+ * range, but that range was part of a file extent item that
+ * also covered a leading part of our source range.
*/
- if (key.offset + datal <= off) {
+ if (key.offset + datal <= prev_extent_end) {
path->slots[0]++;
goto process_slot;
} else if (key.offset >= off + len) {
break;
}
- next_key_min_offset = key.offset + datal;
- size = btrfs_item_size_nr(leaf, slot);
+
+ prev_extent_end = key.offset + datal;
+ size = btrfs_item_size(leaf, slot);
read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
size);
btrfs_release_path(path);
- path->leave_spinning = 0;
memcpy(&new_key, &key, sizeof(new_key));
new_key.objectid = btrfs_ino(BTRFS_I(inode));
@@ -436,7 +477,7 @@ process_slot:
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
- struct btrfs_clone_extent_info clone_info;
+ struct btrfs_replace_extent_info clone_info;
/*
* a | --- range to clone ---| b
@@ -459,13 +500,15 @@ process_slot:
clone_info.data_len = datal;
clone_info.file_offset = new_key.offset;
clone_info.extent_buf = buf;
- clone_info.item_size = size;
- ret = btrfs_punch_hole_range(inode, path, drop_start,
- new_key.offset + datal - 1, &clone_info,
- &trans);
+ clone_info.is_new_extent = false;
+ clone_info.update_times = !no_time_update;
+ ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
+ drop_start, new_key.offset + datal - 1,
+ &clone_info, &trans);
if (ret)
goto out;
- } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ } else {
+ ASSERT(type == BTRFS_FILE_EXTENT_INLINE);
/*
* Inline extents always have to start at file offset 0
* and can never be bigger then the sector size. We can
@@ -476,10 +519,14 @@ process_slot:
*/
ASSERT(key.offset == 0);
ASSERT(datal <= fs_info->sectorsize);
- if (key.offset != 0 || datal > fs_info->sectorsize)
- return -EUCLEAN;
+ if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) ||
+ WARN_ON(key.offset != 0) ||
+ WARN_ON(datal > fs_info->sectorsize)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
- ret = clone_copy_inline_extent(inode, path, &new_key,
+ ret = clone_copy_inline_extent(BTRFS_I(inode), path, &new_key,
drop_start, datal, size,
comp, buf, &trans);
if (ret)
@@ -488,6 +535,24 @@ process_slot:
btrfs_release_path(path);
+ /*
+ * Whenever we share an extent we update the last_reflink_trans
+ * of each inode to the current transaction. This is needed to
+ * make sure fsync does not log multiple checksum items with
+ * overlapping ranges (because some extent items might refer
+ * only to sections of the original extent). For the destination
+ * inode we do this regardless of the generation of the extents
+ * or even if they are inline extents or explicit holes, to make
+ * sure a full fsync does not skip them. For the source inode,
+ * we only need to update last_reflink_trans in case it's a new
+ * extent that is not a hole or an inline extent, to deal with
+ * the checksums problem on fsync.
+ */
+ if (extent_gen == trans->transid && disko > 0)
+ BTRFS_I(src)->last_reflink_trans = trans->transid;
+
+ BTRFS_I(inode)->last_reflink_trans = trans->transid;
+
last_dest_end = ALIGN(new_key.offset + datal,
fs_info->sectorsize);
ret = clone_finish_inode_update(trans, inode, last_dest_end,
@@ -498,12 +563,14 @@ process_slot:
break;
btrfs_release_path(path);
- key.offset = next_key_min_offset;
+ key.offset = prev_extent_end;
if (fatal_signal_pending(current)) {
ret = -EINTR;
goto out;
}
+
+ cond_resched();
}
ret = 0;
@@ -515,10 +582,26 @@ process_slot:
* mixing buffered and direct IO writes against this file.
*/
btrfs_release_path(path);
- path->leave_spinning = 0;
- ret = btrfs_punch_hole_range(inode, path, last_dest_end,
- destoff + len - 1, NULL, &trans);
+ /*
+ * When using NO_HOLES and we are cloning a range that covers
+ * only a hole (no extents) into a range beyond the current
+ * i_size, punching a hole in the target range will not create
+ * an extent map defining a hole, because the range starts at or
+ * beyond current i_size. If the file previously had an i_size
+ * greater than the new i_size set by this clone operation, we
+ * need to make sure the next fsync is a full fsync, so that it
+ * detects and logs a hole covering a range from the current
+ * i_size to the new i_size. If the clone range covers extents,
+ * besides a hole, then we know the full sync flag was already
+ * set by previous calls to btrfs_replace_file_extents() that
+ * replaced file extent items.
+ */
+ if (last_dest_end >= i_size_read(inode))
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
+
+ ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
+ last_dest_end, destoff + len - 1, NULL, &trans);
if (ret)
goto out;
@@ -527,44 +610,46 @@ process_slot:
}
out:
- btrfs_free_path(path);
- kvfree(buf);
+ clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);
+
return ret;
}
-static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
+static void btrfs_double_mmap_lock(struct btrfs_inode *inode1, struct btrfs_inode *inode2)
{
- unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
- unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+ if (inode1 < inode2)
+ swap(inode1, inode2);
+ down_write(&inode1->i_mmap_lock);
+ down_write_nested(&inode2->i_mmap_lock, SINGLE_DEPTH_NESTING);
}
-static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
+static void btrfs_double_mmap_unlock(struct btrfs_inode *inode1, struct btrfs_inode *inode2)
{
- if (inode1 < inode2) {
- swap(inode1, inode2);
- swap(loff1, loff2);
- } else if (inode1 == inode2 && loff2 < loff1) {
- swap(loff1, loff2);
- }
- lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
- lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+ up_write(&inode1->i_mmap_lock);
+ up_write(&inode2->i_mmap_lock);
}
-static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
- struct inode *dst, u64 dst_loff)
+static int btrfs_extent_same_range(struct btrfs_inode *src, u64 loff, u64 len,
+ struct btrfs_inode *dst, u64 dst_loff)
{
- const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
+ const u64 end = dst_loff + len - 1;
+ struct extent_state *cached_state = NULL;
+ struct btrfs_fs_info *fs_info = src->root->fs_info;
+ const u64 bs = fs_info->sectorsize;
int ret;
/*
- * Lock destination range to serialize with concurrent readpages() and
- * source range to serialize with relocation.
+ * Lock destination range to serialize with concurrent readahead(), and
+ * we are safe from concurrency with relocation of source extents
+ * because we have already locked the inode's i_mmap_lock in exclusive
+ * mode.
*/
- btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
- ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
- btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+ btrfs_lock_extent(&dst->io_tree, dst_loff, end, &cached_state);
+ ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len,
+ ALIGN(len, bs), dst_loff, 1);
+ btrfs_unlock_extent(&dst->io_tree, dst_loff, end, &cached_state);
+
+ btrfs_btree_balance_dirty(fs_info);
return ret;
}
@@ -572,7 +657,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
struct inode *dst, u64 dst_loff)
{
- int ret;
+ int ret = 0;
u64 i, tail_len, chunk_count;
struct btrfs_root *root_dst = BTRFS_I(dst)->root;
@@ -580,7 +665,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
if (root_dst->send_in_progress) {
btrfs_warn_rl(root_dst->fs_info,
"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
- root_dst->root_key.objectid,
+ btrfs_root_id(root_dst),
root_dst->send_in_progress);
spin_unlock(&root_dst->root_item_lock);
return -EAGAIN;
@@ -592,8 +677,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
for (i = 0; i < chunk_count; i++) {
- ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
- dst, dst_loff);
+ ret = btrfs_extent_same_range(BTRFS_I(src), loff, BTRFS_MAX_DEDUPE_LEN,
+ BTRFS_I(dst), dst_loff);
if (ret)
goto out;
@@ -602,7 +687,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
}
if (tail_len > 0)
- ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
+ ret = btrfs_extent_same_range(BTRFS_I(src), loff, tail_len,
+ BTRFS_I(dst), dst_loff);
out:
spin_lock(&root_dst->root_item_lock);
root_dst->dedupe_in_progress--;
@@ -614,13 +700,15 @@ out:
static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
u64 off, u64 olen, u64 destoff)
{
+ struct extent_state *cached_state = NULL;
struct inode *inode = file_inode(file);
struct inode *src = file_inode(file_src);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
int ret;
int wb_ret;
u64 len = olen;
- u64 bs = fs_info->sb->s_blocksize;
+ u64 bs = fs_info->sectorsize;
+ u64 end;
/*
* VFS's generic_remap_file_range_prep() protects us from cloning the
@@ -634,7 +722,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
if (destoff > inode->i_size) {
const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
- ret = btrfs_cont_expand(inode, inode->i_size, destoff);
+ ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff);
if (ret)
return ret;
/*
@@ -646,26 +734,29 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
* we found the previous extent covering eof and before we
* attempted to increment its reference count).
*/
- ret = btrfs_wait_ordered_range(inode, wb_start,
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode), wb_start,
destoff - wb_start);
if (ret)
return ret;
}
/*
- * Lock destination range to serialize with concurrent readpages() and
- * source range to serialize with relocation.
+ * Lock destination range to serialize with concurrent readahead(), and
+ * we are safe from concurrency with relocation of source extents
+ * because we have already locked the inode's i_mmap_lock in exclusive
+ * mode.
*/
- btrfs_double_extent_lock(src, off, inode, destoff, len);
+ end = destoff + len - 1;
+ btrfs_lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
- btrfs_double_extent_unlock(src, off, inode, destoff, len);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
/*
* We may have copied an inline extent into a page of the destination
* range, so wait for writeback to complete before truncating pages
* from the page cache. This is a rare case.
*/
- wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
+ wb_ret = btrfs_wait_ordered_range(BTRFS_I(inode), destoff, len);
ret = ret ? ret : wb_ret;
/*
* Truncate page cache pages so that future reads will see the cloned
@@ -675,6 +766,8 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
round_down(destoff, PAGE_SIZE),
round_up(destoff + len, PAGE_SIZE) - 1);
+ btrfs_btree_balance_dirty(fs_info);
+
return ret;
}
@@ -682,27 +775,28 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *len, unsigned int remap_flags)
{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
- bool same_inode = inode_out == inode_in;
+ struct btrfs_inode *inode_in = BTRFS_I(file_inode(file_in));
+ struct btrfs_inode *inode_out = BTRFS_I(file_inode(file_out));
+ u64 bs = inode_out->root->fs_info->sectorsize;
u64 wb_len;
int ret;
if (!(remap_flags & REMAP_FILE_DEDUP)) {
- struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
+ struct btrfs_root *root_out = inode_out->root;
if (btrfs_root_readonly(root_out))
return -EROFS;
- if (file_in->f_path.mnt != file_out->f_path.mnt ||
- inode_in->i_sb != inode_out->i_sb)
- return -EXDEV;
+ ASSERT(inode_in->vfs_inode.i_sb == inode_out->vfs_inode.i_sb);
}
+ /* Can only reflink encrypted files if both files are encrypted. */
+ if (IS_ENCRYPTED(&inode_in->vfs_inode) != IS_ENCRYPTED(&inode_out->vfs_inode))
+ return -EINVAL;
+
/* Don't make the dst file partly checksummed */
- if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
+ if ((inode_in->flags & BTRFS_INODE_NODATASUM) !=
+ (inode_out->flags & BTRFS_INODE_NODATASUM)) {
return -EINVAL;
}
@@ -721,20 +815,11 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
* to complete so that new file extent items are in the fs tree.
*/
if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
- wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
+ wb_len = ALIGN(inode_in->vfs_inode.i_size, bs) - ALIGN_DOWN(pos_in, bs);
else
wb_len = ALIGN(*len, bs);
/*
- * Since we don't lock ranges, wait for ongoing lockless dio writes (as
- * any in progress could create its ordered extents after we wait for
- * existing ordered extents below).
- */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- /*
* Workaround to make sure NOCOW buffered write reach disk as NOCOW.
*
* Btrfs' back references do not have a block level granularity, they
@@ -751,16 +836,14 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
* Also we don't need to check ASYNC_EXTENT, as async extent will be
* CoWed anyway, not affecting nocow part.
*/
- ret = filemap_flush(inode_in->i_mapping);
+ ret = filemap_flush(inode_in->vfs_inode.i_mapping);
if (ret < 0)
return ret;
- ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
- wb_len);
+ ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), wb_len);
if (ret < 0)
return ret;
- ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
- wb_len);
+ ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), wb_len);
if (ret < 0)
return ret;
@@ -768,22 +851,37 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
len, remap_flags);
}
+static bool file_sync_write(const struct file *file)
+{
+ if (file->f_flags & (__O_SYNC | O_DSYNC))
+ return true;
+ if (IS_SYNC(file_inode(file)))
+ return true;
+
+ return false;
+}
+
loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff, loff_t len,
unsigned int remap_flags)
{
- struct inode *src_inode = file_inode(src_file);
- struct inode *dst_inode = file_inode(dst_file);
+ struct btrfs_inode *src_inode = BTRFS_I(file_inode(src_file));
+ struct btrfs_inode *dst_inode = BTRFS_I(file_inode(dst_file));
bool same_inode = dst_inode == src_inode;
int ret;
+ if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(src_file)))))
+ return -EIO;
+
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
- if (same_inode)
- inode_lock(src_inode);
- else
- lock_two_nondirectories(src_inode, dst_inode);
+ if (same_inode) {
+ btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
+ } else {
+ lock_two_nondirectories(&src_inode->vfs_inode, &dst_inode->vfs_inode);
+ btrfs_double_mmap_lock(src_inode, dst_inode);
+ }
ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
&len, remap_flags);
@@ -791,15 +889,34 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
goto out_unlock;
if (remap_flags & REMAP_FILE_DEDUP)
- ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
+ ret = btrfs_extent_same(&src_inode->vfs_inode, off, len,
+ &dst_inode->vfs_inode, destoff);
else
ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
out_unlock:
- if (same_inode)
- inode_unlock(src_inode);
- else
- unlock_two_nondirectories(src_inode, dst_inode);
+ if (same_inode) {
+ btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
+ } else {
+ btrfs_double_mmap_unlock(src_inode, dst_inode);
+ unlock_two_nondirectories(&src_inode->vfs_inode,
+ &dst_inode->vfs_inode);
+ }
+
+ /*
+ * If either the source or the destination file was opened with O_SYNC,
+ * O_DSYNC or has the S_SYNC attribute, fsync both the destination and
+ * source files/ranges, so that after a successful return (0) followed
+ * by a power failure results in the reflinked data to be readable from
+ * both files/ranges.
+ */
+ if (ret == 0 && len > 0 &&
+ (file_sync_write(src_file) || file_sync_write(dst_file))) {
+ ret = btrfs_sync_file(src_file, off, off + len - 1, 0);
+ if (ret == 0)
+ ret = btrfs_sync_file(dst_file, destoff,
+ destoff + len - 1, 0);
+ }
return ret < 0 ? ret : len;
}