Diffstat (limited to 'fs/btrfs/reflink.c')
| -rw-r--r-- | fs/btrfs/reflink.c | 485 |
1 file changed, 301 insertions(+), 184 deletions(-)
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 040009d1cc31..b5fe95baf92e 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -1,12 +1,21 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include <linux/blkdev.h>
+#include <linux/fscrypt.h>
 #include <linux/iversion.h>
-#include "compression.h"
 #include "ctree.h"
+#include "fs.h"
+#include "messages.h"
+#include "compression.h"
 #include "delalloc-space.h"
+#include "disk-io.h"
 #include "reflink.h"
 #include "transaction.h"
+#include "subpage.h"
+#include "accessors.h"
+#include "file-item.h"
+#include "file.h"
+#include "super.h"
 
 #define BTRFS_MAX_DEDUPE_LEN	SZ_16M
 
@@ -15,14 +24,14 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
 				     u64 endoff,
 				     const u64 destoff,
 				     const u64 olen,
-				     int no_time_update)
+				     bool no_time_update)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
 
 	inode_inc_iversion(inode);
-	if (!no_time_update)
-		inode->i_mtime = inode->i_ctime = current_time(inode);
+	if (!no_time_update) {
+		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+	}
 	/*
 	 * We round up to the block size at eof when determining which
 	 * extents to clone above, but shouldn't round up the file size.
@@ -31,33 +40,33 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
 	endoff = destoff + olen;
 	if (endoff > inode->i_size) {
 		i_size_write(inode, endoff);
-		btrfs_inode_safe_disk_i_size_write(inode, 0);
+		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
 	}
 
-	ret = btrfs_update_inode(trans, root, inode);
-	if (ret) {
+	ret = btrfs_update_inode(trans, BTRFS_I(inode));
+	if (unlikely(ret)) {
 		btrfs_abort_transaction(trans, ret);
 		btrfs_end_transaction(trans);
-		goto out;
+		return ret;
 	}
-	ret = btrfs_end_transaction(trans);
-out:
-	return ret;
+	return btrfs_end_transaction(trans);
 }
 
-static int copy_inline_to_page(struct inode *inode,
+static int copy_inline_to_page(struct btrfs_inode *inode,
 			       const u64 file_offset,
 			       char *inline_data,
 			       const u64 size,
 			       const u64 datal,
 			       const u8 comp_type)
 {
-	const u64 block_size = btrfs_inode_sectorsize(inode);
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	const u32 block_size = fs_info->sectorsize;
 	const u64 range_end = file_offset + block_size - 1;
 	const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
 	char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
 	struct extent_changeset *data_reserved = NULL;
-	struct page *page = NULL;
+	struct folio *folio = NULL;
+	struct address_space *mapping = inode->vfs_inode.i_mapping;
 	int ret;
 
 	ASSERT(IS_ALIGNED(file_offset, block_size));
@@ -73,34 +82,47 @@ static int copy_inline_to_page(struct inode *inode,
 	if (ret)
 		goto out;
 
-	page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT,
-				   btrfs_alloc_write_mask(inode->i_mapping));
-	if (!page) {
-		ret = -ENOMEM;
+	folio = __filemap_get_folio(mapping, file_offset >> PAGE_SHIFT,
+				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+				    btrfs_alloc_write_mask(mapping));
+	if (IS_ERR(folio)) {
+		ret = PTR_ERR(folio);
 		goto out_unlock;
 	}
 
-	set_page_extent_mapped(page);
-	clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end,
-			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
-			 0, 0, NULL);
+	ret = set_folio_extent_mapped(folio);
+	if (ret < 0)
+		goto out_unlock;
+
+	btrfs_clear_extent_bit(&inode->io_tree, file_offset, range_end,
+			       EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, NULL);
 	ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
 	if (ret)
 		goto out_unlock;
 
-	if (comp_type == BTRFS_COMPRESS_NONE) {
-		char *map;
+	/*
+	 * After dirtying the page our caller will need to start a transaction,
+	 * and if we are low on metadata free space, that can cause flushing of
+	 * delalloc for all inodes in order to get metadata space released.
+	 * However we are holding the range locked for the whole duration of
+	 * the clone/dedupe operation, so we may deadlock if that happens and no
+	 * other task releases enough space. So mark this inode as not being
+	 * possible to flush to avoid such deadlock. We will clear that flag
+	 * when we finish cloning all extents, since a transaction is started
+	 * after finding each extent to clone.
+	 */
+	set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
 
-		map = kmap(page);
-		memcpy(map, data_start, datal);
-		flush_dcache_page(page);
-		kunmap(page);
+	if (comp_type == BTRFS_COMPRESS_NONE) {
+		memcpy_to_folio(folio, offset_in_folio(folio, file_offset), data_start,
+				datal);
 	} else {
-		ret = btrfs_decompress(comp_type, data_start, page, 0,
+		ret = btrfs_decompress(comp_type, data_start, folio,
+				       offset_in_folio(folio, file_offset),
 				       inline_size, datal);
 		if (ret)
 			goto out_unlock;
-		flush_dcache_page(page);
+		flush_dcache_folio(folio);
 	}
 
@@ -115,27 +137,21 @@ static int copy_inline_to_page(struct inode *inode,
 	 *
 	 * So what's in the range [500, 4095] corresponds to zeroes.
 	 */
-	if (datal < block_size) {
-		char *map;
-
-		map = kmap(page);
-		memset(map + datal, 0, block_size - datal);
-		flush_dcache_page(page);
-		kunmap(page);
-	}
+	if (datal < block_size)
+		folio_zero_range(folio, datal, block_size - datal);
 
-	SetPageUptodate(page);
-	ClearPageChecked(page);
-	set_page_dirty(page);
+	btrfs_folio_set_uptodate(fs_info, folio, file_offset, block_size);
+	btrfs_folio_clear_checked(fs_info, folio, file_offset, block_size);
+	btrfs_folio_set_dirty(fs_info, folio, file_offset, block_size);
 
 out_unlock:
-	if (page) {
-		unlock_page(page);
-		put_page(page);
+	if (!IS_ERR(folio)) {
+		folio_unlock(folio);
+		folio_put(folio);
 	}
 	if (ret)
 		btrfs_delalloc_release_space(inode, data_reserved, file_offset,
 					     block_size, true);
-	btrfs_delalloc_release_extents(BTRFS_I(inode), block_size);
+	btrfs_delalloc_release_extents(inode, block_size);
 out:
 	extent_changeset_free(data_reserved);
 
@@ -147,7 +163,7 @@ out:
  * the source inode to destination inode when possible. When not possible we
  * copy the inline extent's data into the respective page of the inode.
  */
-static int clone_copy_inline_extent(struct inode *dst,
+static int clone_copy_inline_extent(struct btrfs_inode *inode,
 				    struct btrfs_path *path,
 				    struct btrfs_key *new_key,
 				    const u64 drop_start,
@@ -157,21 +173,22 @@ static int clone_copy_inline_extent(struct inode *dst,
 				    char *inline_data,
 				    struct btrfs_trans_handle **trans_out)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
-	struct btrfs_root *root = BTRFS_I(dst)->root;
+	struct btrfs_root *root = inode->root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	const u64 aligned_end = ALIGN(new_key->offset + datal,
 				      fs_info->sectorsize);
 	struct btrfs_trans_handle *trans = NULL;
+	struct btrfs_drop_extents_args drop_args = { 0 };
 	int ret;
 	struct btrfs_key key;
 
 	if (new_key->offset > 0) {
-		ret = copy_inline_to_page(dst, new_key->offset, inline_data,
-					  size, datal, comp_type);
+		ret = copy_inline_to_page(inode, new_key->offset,
+					  inline_data, size, datal, comp_type);
 		goto out;
 	}
 
-	key.objectid = btrfs_ino(BTRFS_I(dst));
+	key.objectid = btrfs_ino(inode);
 	key.type = BTRFS_EXTENT_DATA_KEY;
 	key.offset = 0;
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -186,19 +203,16 @@ static int clone_copy_inline_extent(struct inode *dst,
 			goto copy_inline_extent;
 		}
 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
-		if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
+		if (key.objectid == btrfs_ino(inode) &&
 		    key.type == BTRFS_EXTENT_DATA_KEY) {
 			/*
 			 * There's an implicit hole at file offset 0, copy the
 			 * inline extent's data to the page.
 			 */
 			ASSERT(key.offset > 0);
-			ret = copy_inline_to_page(dst, new_key->offset,
-						  inline_data, size, datal,
-						  comp_type);
-			goto out;
+			goto copy_to_page;
 		}
-	} else if (i_size_read(dst) <= datal) {
+	} else if (i_size_read(&inode->vfs_inode) <= datal) {
 		struct btrfs_file_extent_item *ei;
 
 		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -212,29 +226,28 @@ static int clone_copy_inline_extent(struct inode *dst,
 		    BTRFS_FILE_EXTENT_INLINE)
 			goto copy_inline_extent;
 
-		ret = copy_inline_to_page(dst, new_key->offset, inline_data,
-					  size, datal, comp_type);
-		goto out;
+		goto copy_to_page;
 	}
 
 copy_inline_extent:
-	ret = 0;
 	/*
 	 * We have no extent items, or we have an extent at offset 0 which may
 	 * or may not be inlined. All these cases are dealt the same way.
 	 */
-	if (i_size_read(dst) > datal) {
+	if (i_size_read(&inode->vfs_inode) > datal) {
 		/*
 		 * At the destination offset 0 we have either a hole, a regular
 		 * extent or an inline extent larger then the one we want to
 		 * clone. Deal with all these cases by copying the inline extent
 		 * data into the respective page at the destination inode.
 		 */
-		ret = copy_inline_to_page(dst, new_key->offset, inline_data,
-					  size, datal, comp_type);
-		goto out;
+		goto copy_to_page;
 	}
 
+	/*
+	 * Release path before starting a new transaction so we don't hold locks
+	 * that would confuse lockdep.
+	 */
 	btrfs_release_path(path);
 	/*
 	 * If we end up here it means were copy the inline extent into a leaf
@@ -251,20 +264,30 @@ copy_inline_extent:
 		trans = NULL;
 		goto out;
 	}
-	ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
-	if (ret)
+	drop_args.path = path;
+	drop_args.start = drop_start;
+	drop_args.end = aligned_end;
+	drop_args.drop_cache = true;
+	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
+	if (unlikely(ret)) {
+		btrfs_abort_transaction(trans, ret);
 		goto out;
+	}
 	ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
-	if (ret)
+	if (unlikely(ret)) {
+		btrfs_abort_transaction(trans, ret);
 		goto out;
+	}
 
 	write_extent_buffer(path->nodes[0], inline_data,
 			    btrfs_item_ptr_offset(path->nodes[0],
 						  path->slots[0]),
 			    size);
-	inode_add_bytes(dst, datal);
-	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
-	ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
+	btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found);
+	btrfs_set_inode_full_sync(inode);
+	ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end);
+	if (unlikely(ret))
+		btrfs_abort_transaction(trans, ret);
 out:
 	if (!ret && !trans) {
 		/*
@@ -279,42 +302,56 @@ out:
 			trans = NULL;
 		}
 	}
-	if (ret && trans) {
-		btrfs_abort_transaction(trans, ret);
+	if (ret && trans)
 		btrfs_end_transaction(trans);
-	}
 	if (!ret)
 		*trans_out = trans;
 
 	return ret;
+
+copy_to_page:
+	/*
+	 * Release our path because we don't need it anymore and also because
+	 * copy_inline_to_page() needs to reserve data and metadata, which may
+	 * need to flush delalloc when we are low on available space and
+	 * therefore cause a deadlock if writeback of an inline extent needs to
+	 * write to the same leaf or an ordered extent completion needs to write
+	 * to the same leaf.
+	 */
+	btrfs_release_path(path);
+
+	ret = copy_inline_to_page(inode, new_key->offset,
+				  inline_data, size, datal, comp_type);
+	goto out;
 }
 
-/**
- * btrfs_clone() - clone a range from inode file to another
+/*
+ * Clone a range from inode file to another.
  *
- * @src: Inode to clone from
- * @inode: Inode to clone to
- * @off: Offset within source to start clone from
- * @olen: Original length, passed by user, of range to clone
- * @olen_aligned: Block-aligned value of olen
- * @destoff: Offset within @inode to start clone
- * @no_time_update: Whether to update mtime/ctime on the target inode
+ * @src:            Inode to clone from
+ * @inode:          Inode to clone to
+ * @off:            Offset within source to start clone from
+ * @olen:           Original length, passed by user, of range to clone
+ * @olen_aligned:   Block-aligned value of olen
+ * @destoff:        Offset within @inode to start clone
+ * @no_time_update: Whether to update mtime/ctime on the target inode
  */
 static int btrfs_clone(struct inode *src, struct inode *inode,
 		       const u64 off, const u64 olen, const u64 olen_aligned,
-		       const u64 destoff, int no_time_update)
+		       const u64 destoff, bool no_time_update)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	struct btrfs_path *path = NULL;
+	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+	BTRFS_PATH_AUTO_FREE(path);
 	struct extent_buffer *leaf;
 	struct btrfs_trans_handle *trans;
-	char *buf = NULL;
+	char AUTO_KVFREE(buf);
 	struct btrfs_key key;
 	u32 nritems;
 	int slot;
 	int ret;
 	const u64 len = olen_aligned;
 	u64 last_dest_end = destoff;
+	u64 prev_extent_end = off;
 
 	ret = -ENOMEM;
 	buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
@@ -322,10 +359,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 		return ret;
 
 	path = btrfs_alloc_path();
-	if (!path) {
-		kvfree(buf);
+	if (!path)
 		return ret;
-	}
 
 	path->reada = READA_FORWARD;
 	/* Clone data */
@@ -334,8 +369,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 	key.offset = off;
 
 	while (1) {
-		u64 next_key_min_offset = key.offset + 1;
 		struct btrfs_file_extent_item *extent;
+		u64 extent_gen;
 		int type;
 		u32 size;
 		struct btrfs_key new_key;
@@ -345,7 +380,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 		u64 drop_start;
 
 		/* Note the key will change type as we walk through the tree */
-		path->leave_spinning = 1;
 		ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
 					0, 0);
 		if (ret < 0)
@@ -384,6 +418,7 @@ process_slot:
 
 		extent = btrfs_item_ptr(leaf, slot,
 					struct btrfs_file_extent_item);
+		extent_gen = btrfs_file_extent_generation(leaf, extent);
 		comp = btrfs_file_extent_compression(leaf, extent);
 		type = btrfs_file_extent_type(leaf, extent);
 		if (type == BTRFS_FILE_EXTENT_REG ||
@@ -401,20 +436,26 @@ process_slot:
 		 * The first search might have left us at an extent item that
 		 * ends before our target range's start, can happen if we have
 		 * holes and NO_HOLES feature enabled.
+		 *
+		 * Subsequent searches may leave us on a file range we have
+		 * processed before - this happens due to a race with ordered
+		 * extent completion for a file range that is outside our source
+		 * range, but that range was part of a file extent item that
+		 * also covered a leading part of our source range.
 		 */
-		if (key.offset + datal <= off) {
+		if (key.offset + datal <= prev_extent_end) {
 			path->slots[0]++;
 			goto process_slot;
 		} else if (key.offset >= off + len) {
 			break;
 		}
-		next_key_min_offset = key.offset + datal;
-		size = btrfs_item_size_nr(leaf, slot);
+
+		prev_extent_end = key.offset + datal;
+		size = btrfs_item_size(leaf, slot);
 		read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
 				   size);
 
 		btrfs_release_path(path);
-		path->leave_spinning = 0;
 
 		memcpy(&new_key, &key, sizeof(new_key));
 		new_key.objectid = btrfs_ino(BTRFS_I(inode));
@@ -436,7 +477,7 @@ process_slot:
 
 		if (type == BTRFS_FILE_EXTENT_REG ||
 		    type == BTRFS_FILE_EXTENT_PREALLOC) {
-			struct btrfs_clone_extent_info clone_info;
+			struct btrfs_replace_extent_info clone_info;
 
 			/*
 			 *    a  | --- range to clone ---|  b
@@ -459,13 +500,15 @@ process_slot:
 			clone_info.data_len = datal;
 			clone_info.file_offset = new_key.offset;
 			clone_info.extent_buf = buf;
-			clone_info.item_size = size;
-			ret = btrfs_punch_hole_range(inode, path, drop_start,
-					new_key.offset + datal - 1, &clone_info,
-					&trans);
+			clone_info.is_new_extent = false;
+			clone_info.update_times = !no_time_update;
+			ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
+					drop_start, new_key.offset + datal - 1,
+					&clone_info, &trans);
 			if (ret)
 				goto out;
-		} else if (type == BTRFS_FILE_EXTENT_INLINE) {
+		} else {
+			ASSERT(type == BTRFS_FILE_EXTENT_INLINE);
 			/*
 			 * Inline extents always have to start at file offset 0
 			 * and can never be bigger then the sector size. We can
@@ -476,10 +519,14 @@ process_slot:
 			 */
 			ASSERT(key.offset == 0);
 			ASSERT(datal <= fs_info->sectorsize);
-			if (key.offset != 0 || datal > fs_info->sectorsize)
-				return -EUCLEAN;
+			if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) ||
+			    WARN_ON(key.offset != 0) ||
+			    WARN_ON(datal > fs_info->sectorsize)) {
+				ret = -EUCLEAN;
+				goto out;
+			}
 
-			ret = clone_copy_inline_extent(inode, path, &new_key,
+			ret = clone_copy_inline_extent(BTRFS_I(inode), path, &new_key,
 						       drop_start, datal, size,
 						       comp, buf, &trans);
 			if (ret)
@@ -488,6 +535,24 @@ process_slot:
 
 		btrfs_release_path(path);
 
+		/*
+		 * Whenever we share an extent we update the last_reflink_trans
+		 * of each inode to the current transaction. This is needed to
+		 * make sure fsync does not log multiple checksum items with
+		 * overlapping ranges (because some extent items might refer
+		 * only to sections of the original extent). For the destination
+		 * inode we do this regardless of the generation of the extents
+		 * or even if they are inline extents or explicit holes, to make
+		 * sure a full fsync does not skip them. For the source inode,
+		 * we only need to update last_reflink_trans in case it's a new
+		 * extent that is not a hole or an inline extent, to deal with
+		 * the checksums problem on fsync.
+		 */
+		if (extent_gen == trans->transid && disko > 0)
+			BTRFS_I(src)->last_reflink_trans = trans->transid;
+
+		BTRFS_I(inode)->last_reflink_trans = trans->transid;
+
 		last_dest_end = ALIGN(new_key.offset + datal,
 				      fs_info->sectorsize);
 		ret = clone_finish_inode_update(trans, inode, last_dest_end,
@@ -498,12 +563,14 @@ process_slot:
 			break;
 
 		btrfs_release_path(path);
-		key.offset = next_key_min_offset;
+		key.offset = prev_extent_end;
 
 		if (fatal_signal_pending(current)) {
 			ret = -EINTR;
 			goto out;
 		}
+
+		cond_resched();
 	}
 	ret = 0;
 
@@ -515,10 +582,26 @@ process_slot:
 	 * mixing buffered and direct IO writes against this file.
 	 */
 	btrfs_release_path(path);
-	path->leave_spinning = 0;
 
-	ret = btrfs_punch_hole_range(inode, path, last_dest_end,
-			destoff + len - 1, NULL, &trans);
+	/*
+	 * When using NO_HOLES and we are cloning a range that covers
+	 * only a hole (no extents) into a range beyond the current
+	 * i_size, punching a hole in the target range will not create
+	 * an extent map defining a hole, because the range starts at or
+	 * beyond current i_size. If the file previously had an i_size
+	 * greater than the new i_size set by this clone operation, we
+	 * need to make sure the next fsync is a full fsync, so that it
+	 * detects and logs a hole covering a range from the current
+	 * i_size to the new i_size. If the clone range covers extents,
+	 * besides a hole, then we know the full sync flag was already
+	 * set by previous calls to btrfs_replace_file_extents() that
+	 * replaced file extent items.
+	 */
+	if (last_dest_end >= i_size_read(inode))
+		btrfs_set_inode_full_sync(BTRFS_I(inode));
+
+	ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
+			last_dest_end, destoff + len - 1, NULL, &trans);
 	if (ret)
 		goto out;
 
@@ -527,44 +610,46 @@ process_slot:
 	}
 
 out:
-	btrfs_free_path(path);
-	kvfree(buf);
+	clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);
+
 	return ret;
 }
 
-static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
-				       struct inode *inode2, u64 loff2, u64 len)
+static void btrfs_double_mmap_lock(struct btrfs_inode *inode1, struct btrfs_inode *inode2)
 {
-	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
-	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+	if (inode1 < inode2)
+		swap(inode1, inode2);
+	down_write(&inode1->i_mmap_lock);
+	down_write_nested(&inode2->i_mmap_lock, SINGLE_DEPTH_NESTING);
 }
 
-static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
-				     struct inode *inode2, u64 loff2, u64 len)
+static void btrfs_double_mmap_unlock(struct btrfs_inode *inode1, struct btrfs_inode *inode2)
 {
-	if (inode1 < inode2) {
-		swap(inode1, inode2);
-		swap(loff1, loff2);
-	} else if (inode1 == inode2 && loff2 < loff1) {
-		swap(loff1, loff2);
-	}
-	lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
-	lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+	up_write(&inode1->i_mmap_lock);
+	up_write(&inode2->i_mmap_lock);
 }
 
-static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
-				   struct inode *dst, u64 dst_loff)
+static int btrfs_extent_same_range(struct btrfs_inode *src, u64 loff, u64 len,
+				   struct btrfs_inode *dst, u64 dst_loff)
 {
-	const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
+	const u64 end = dst_loff + len - 1;
+	struct extent_state *cached_state = NULL;
+	struct btrfs_fs_info *fs_info = src->root->fs_info;
+	const u64 bs = fs_info->sectorsize;
 	int ret;
 
 	/*
-	 * Lock destination range to serialize with concurrent readpages() and
-	 * source range to serialize with relocation.
+	 * Lock destination range to serialize with concurrent readahead(), and
+	 * we are safe from concurrency with relocation of source extents
+	 * because we have already locked the inode's i_mmap_lock in exclusive
+	 * mode.
 	 */
-	btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
-	ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
-	btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+	btrfs_lock_extent(&dst->io_tree, dst_loff, end, &cached_state);
+	ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len,
+			  ALIGN(len, bs), dst_loff, 1);
+	btrfs_unlock_extent(&dst->io_tree, dst_loff, end, &cached_state);
+
+	btrfs_btree_balance_dirty(fs_info);
 
 	return ret;
 }
@@ -572,7 +657,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
 static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
 			     struct inode *dst, u64 dst_loff)
 {
-	int ret;
+	int ret = 0;
 	u64 i, tail_len, chunk_count;
 	struct btrfs_root *root_dst = BTRFS_I(dst)->root;
 
@@ -580,7 +665,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
 	if (root_dst->send_in_progress) {
 		btrfs_warn_rl(root_dst->fs_info,
 "cannot deduplicate to root %llu while send operations are using it (%d in progress)",
-			      root_dst->root_key.objectid,
+			      btrfs_root_id(root_dst),
 			      root_dst->send_in_progress);
 		spin_unlock(&root_dst->root_item_lock);
 		return -EAGAIN;
@@ -592,8 +677,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
 	chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
 
 	for (i = 0; i < chunk_count; i++) {
-		ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
-					      dst, dst_loff);
+		ret = btrfs_extent_same_range(BTRFS_I(src), loff, BTRFS_MAX_DEDUPE_LEN,
+					      BTRFS_I(dst), dst_loff);
 		if (ret)
 			goto out;
 
@@ -602,7 +687,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
 	}
 
 	if (tail_len > 0)
-		ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
+		ret = btrfs_extent_same_range(BTRFS_I(src), loff, tail_len,
+					      BTRFS_I(dst), dst_loff);
 out:
 	spin_lock(&root_dst->root_item_lock);
 	root_dst->dedupe_in_progress--;
@@ -614,13 +700,15 @@ out:
 static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
 				      u64 off, u64 olen, u64 destoff)
 {
+	struct extent_state *cached_state = NULL;
 	struct inode *inode = file_inode(file);
 	struct inode *src = file_inode(file_src);
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
 	int ret;
 	int wb_ret;
 	u64 len = olen;
-	u64 bs = fs_info->sb->s_blocksize;
+	u64 bs = fs_info->sectorsize;
+	u64 end;
 
 	/*
 	 * VFS's generic_remap_file_range_prep() protects us from cloning the
@@ -634,7 +722,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
 	if (destoff > inode->i_size) {
 		const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
 
-		ret = btrfs_cont_expand(inode, inode->i_size, destoff);
+		ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff);
 		if (ret)
 			return ret;
 		/*
@@ -646,26 +734,29 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
 		 * we found the previous extent covering eof and before we
 		 * attempted to increment its reference count).
 		 */
-		ret = btrfs_wait_ordered_range(inode, wb_start,
+		ret = btrfs_wait_ordered_range(BTRFS_I(inode), wb_start,
 					       destoff - wb_start);
 		if (ret)
 			return ret;
 	}
 
 	/*
-	 * Lock destination range to serialize with concurrent readpages() and
-	 * source range to serialize with relocation.
+	 * Lock destination range to serialize with concurrent readahead(), and
+	 * we are safe from concurrency with relocation of source extents
+	 * because we have already locked the inode's i_mmap_lock in exclusive
+	 * mode.
 	 */
-	btrfs_double_extent_lock(src, off, inode, destoff, len);
+	end = destoff + len - 1;
+	btrfs_lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
 	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
-	btrfs_double_extent_unlock(src, off, inode, destoff, len);
+	btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
 
 	/*
 	 * We may have copied an inline extent into a page of the destination
 	 * range, so wait for writeback to complete before truncating pages
 	 * from the page cache. This is a rare case.
 	 */
-	wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
+	wb_ret = btrfs_wait_ordered_range(BTRFS_I(inode), destoff, len);
 	ret = ret ? ret : wb_ret;
 	/*
 	 * Truncate page cache pages so that future reads will see the cloned
@@ -675,6 +766,8 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
 			   round_down(destoff, PAGE_SIZE),
 			   round_up(destoff + len, PAGE_SIZE) - 1);
 
+	btrfs_btree_balance_dirty(fs_info);
+
 	return ret;
 }
 
@@ -682,27 +775,28 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 				       struct file *file_out, loff_t pos_out,
 				       loff_t *len, unsigned int remap_flags)
 {
-	struct inode *inode_in = file_inode(file_in);
-	struct inode *inode_out = file_inode(file_out);
-	u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
-	bool same_inode = inode_out == inode_in;
+	struct btrfs_inode *inode_in = BTRFS_I(file_inode(file_in));
+	struct btrfs_inode *inode_out = BTRFS_I(file_inode(file_out));
+	u64 bs = inode_out->root->fs_info->sectorsize;
 	u64 wb_len;
 	int ret;
 
 	if (!(remap_flags & REMAP_FILE_DEDUP)) {
-		struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
+		struct btrfs_root *root_out = inode_out->root;
 
 		if (btrfs_root_readonly(root_out))
 			return -EROFS;
 
-		if (file_in->f_path.mnt != file_out->f_path.mnt ||
-		    inode_in->i_sb != inode_out->i_sb)
-			return -EXDEV;
+		ASSERT(inode_in->vfs_inode.i_sb == inode_out->vfs_inode.i_sb);
 	}
 
+	/* Can only reflink encrypted files if both files are encrypted. */
+	if (IS_ENCRYPTED(&inode_in->vfs_inode) != IS_ENCRYPTED(&inode_out->vfs_inode))
+		return -EINVAL;
+
 	/* Don't make the dst file partly checksummed */
-	if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
-	    (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
+	if ((inode_in->flags & BTRFS_INODE_NODATASUM) !=
+	    (inode_out->flags & BTRFS_INODE_NODATASUM)) {
 		return -EINVAL;
 	}
 
@@ -721,20 +815,11 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 	 * to complete so that new file extent items are in the fs tree.
 	 */
 	if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
-		wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
+		wb_len = ALIGN(inode_in->vfs_inode.i_size, bs) - ALIGN_DOWN(pos_in, bs);
 	else
 		wb_len = ALIGN(*len, bs);
 
 	/*
-	 * Since we don't lock ranges, wait for ongoing lockless dio writes (as
-	 * any in progress could create its ordered extents after we wait for
-	 * existing ordered extents below).
-	 */
-	inode_dio_wait(inode_in);
-	if (!same_inode)
-		inode_dio_wait(inode_out);
-
-	/*
 	 * Workaround to make sure NOCOW buffered write reach disk as NOCOW.
 	 *
 	 * Btrfs' back references do not have a block level granularity, they
@@ -751,16 +836,14 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 	 * Also we don't need to check ASYNC_EXTENT, as async extent will be
 	 * CoWed anyway, not affecting nocow part.
 	 */
-	ret = filemap_flush(inode_in->i_mapping);
+	ret = filemap_flush(inode_in->vfs_inode.i_mapping);
 	if (ret < 0)
 		return ret;
 
-	ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
-				       wb_len);
+	ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), wb_len);
 	if (ret < 0)
 		return ret;
-	ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
-				       wb_len);
+	ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), wb_len);
 	if (ret < 0)
 		return ret;
 
@@ -768,22 +851,37 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 					    len, remap_flags);
 }
 
+static bool file_sync_write(const struct file *file)
+{
+	if (file->f_flags & (__O_SYNC | O_DSYNC))
+		return true;
+	if (IS_SYNC(file_inode(file)))
+		return true;
+
+	return false;
+}
+
 loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
 		struct file *dst_file, loff_t destoff, loff_t len,
 		unsigned int remap_flags)
 {
-	struct inode *src_inode = file_inode(src_file);
-	struct inode *dst_inode = file_inode(dst_file);
+	struct btrfs_inode *src_inode = BTRFS_I(file_inode(src_file));
+	struct btrfs_inode *dst_inode = BTRFS_I(file_inode(dst_file));
 	bool same_inode = dst_inode == src_inode;
 	int ret;
 
+	if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(src_file)))))
+		return -EIO;
+
 	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
 		return -EINVAL;
 
-	if (same_inode)
-		inode_lock(src_inode);
-	else
-		lock_two_nondirectories(src_inode, dst_inode);
+	if (same_inode) {
+		btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
+	} else {
+		lock_two_nondirectories(&src_inode->vfs_inode, &dst_inode->vfs_inode);
+		btrfs_double_mmap_lock(src_inode, dst_inode);
+	}
 
 	ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
 					  &len, remap_flags);
@@ -791,15 +889,34 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
 		goto out_unlock;
 
 	if (remap_flags & REMAP_FILE_DEDUP)
-		ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
+		ret = btrfs_extent_same(&src_inode->vfs_inode, off, len,
+					&dst_inode->vfs_inode, destoff);
 	else
 		ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
 
 out_unlock:
-	if (same_inode)
-		inode_unlock(src_inode);
-	else
-		unlock_two_nondirectories(src_inode, dst_inode);
+	if (same_inode) {
+		btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
+	} else {
+		btrfs_double_mmap_unlock(src_inode, dst_inode);
+		unlock_two_nondirectories(&src_inode->vfs_inode,
+					  &dst_inode->vfs_inode);
+	}
+
+	/*
+	 * If either the source or the destination file was opened with O_SYNC,
+	 * O_DSYNC or has the S_SYNC attribute, fsync both the destination and
+	 * source files/ranges, so that after a successful return (0) followed
+	 * by a power failure results in the reflinked data to be readable from
+	 * both files/ranges.
	 */
+	if (ret == 0 && len > 0 &&
+	    (file_sync_write(src_file) || file_sync_write(dst_file))) {
+		ret = btrfs_sync_file(src_file, off, off + len - 1, 0);
+		if (ret == 0)
+			ret = btrfs_sync_file(dst_file, destoff,
+					      destoff + len - 1, 0);
+	}
 
 	return ret < 0 ? ret : len;
 }
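The btrfs_double_mmap_lock()/btrfs_double_mmap_unlock() pair introduced above avoids ABBA deadlocks in the classic way: whenever two locks must be held at once, every task acquires them in a canonical order derived from the objects' addresses (the kernel helper happens to take the higher-addressed inode first, and annotates the second acquisition with down_write_nested(..., SINGLE_DEPTH_NESTING) so lockdep accepts two locks of the same class). A minimal userspace sketch of the same pattern using pthreads; the struct and helper names here are made up for illustration, and pthreads has no lockdep-style nesting annotation:

#include <pthread.h>

struct object {
	pthread_mutex_t lock;
	/* ... payload ... */
};

/*
 * Always lock the lower-addressed object first, so any two threads
 * locking the same pair agree on the acquisition order and can never
 * deadlock against each other.
 */
static void double_lock(struct object *a, struct object *b)
{
	if (a > b) {
		struct object *tmp = a;

		a = b;
		b = tmp;
	}
	pthread_mutex_lock(&a->lock);
	if (a != b)
		pthread_mutex_lock(&b->lock);
}

static void double_unlock(struct object *a, struct object *b)
{
	/* Unlock order does not matter for correctness. */
	if (a != b)
		pthread_mutex_unlock(&b->lock);
	pthread_mutex_unlock(&a->lock);
}

Whether the lower or the higher address goes first is irrelevant, as long as every code path uses the same convention; this sketch takes the lower address first while the kernel helper takes the higher.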

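From userspace, all of the code in this diff sits behind the remap_file_range() hook: the FICLONE/FICLONERANGE ioctls and dedupe requests are routed by the VFS into btrfs_remap_file_range(). Below is a small, self-contained example that clones the first 1 MiB of one file into another via FICLONERANGE. The paths are hypothetical; both files must live on the same btrfs mount, the source is assumed to be at least 1 MiB long, and offsets/lengths must be block-size aligned unless the cloned range ends at EOF:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>		/* FICLONERANGE, struct file_clone_range */

int main(void)
{
	int src = open("/mnt/btrfs/src.dat", O_RDONLY);
	int dst = open("/mnt/btrfs/dst.dat", O_WRONLY | O_CREAT, 0644);

	if (src < 0 || dst < 0) {
		perror("open");
		return 1;
	}

	struct file_clone_range fcr = {
		.src_fd = src,		/* clone from this fd... */
		.src_offset = 0,
		.src_length = 1 << 20,	/* ...this many bytes... */
		.dest_offset = 0,	/* ...into dst at this offset */
	};

	/* The ioctl is issued on the destination file descriptor. */
	if (ioctl(dst, FICLONERANGE, &fcr) < 0) {
		perror("FICLONERANGE");
		return 1;
	}

	close(src);
	close(dst);
	return 0;
}

On success the two ranges share extents on disk. If either file was opened with O_SYNC/O_DSYNC (or has S_SYNC set), the file_sync_write() check added in this diff makes the kernel fsync both ranges before returning, so the reflinked data survives a power failure in both files.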